import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
%matplotlib inline
%precision 5
Read in the data for the example:
savings = pd.read_csv("data/savings.csv",index_col=0)
savings.head()
lmod = smf.ols(formula='sr ~ pop15 + pop75 + dpi + ddpi', data=savings).fit()
lmod.summary()
The simple residual-fitted plot:
plt.scatter(lmod.fittedvalues, lmod.resid)
plt.ylabel("Residuals")
plt.xlabel("Fitted values")
plt.axhline(0)
This plot is slightly better for checking for constant variance:
plt.scatter(lmod.fittedvalues, np.sqrt(abs(lmod.resid)))
plt.ylabel("sqrt |Residuals|")
plt.xlabel("Fitted values")
Check that there is no significant relationship:
ddf = pd.DataFrame({'x':lmod.fittedvalues,'y':np.sqrt(abs(lmod.resid))})
dmod = smf.ols('y ~ x',data=ddf).fit()
dmod.summary()
Simulate some residual-fitted plots:
n = 50
x = np.random.sample(n)
y = np.random.normal(size=n)
plt.scatter(x,y)
plt.title("No problem plot")
plt.axhline(0)
plt.scatter(x,y*x)
plt.title("Increasing variance")
plt.axhline(0)
plt.scatter(x,y*np.sqrt(x))
plt.title("Mildly increasing variance")
plt.axhline(0)
y = np.cos(2*x*np.pi) + np.random.normal(size=n)
plt.scatter(x,y)
plt.title("Lack of fit plot")
sx = np.sort(x)
plt.plot(sx,np.cos(2*sx*np.pi),c='red')
plt.axhline(0)
Plots of residuals against predictors:
plt.scatter(savings.pop15, lmod.resid)
plt.xlabel("%pop under 15")
plt.ylabel("Residuals")
plt.axhline(0)
plt.scatter(savings.pop75, lmod.resid)
plt.xlabel("%pop over 75")
plt.ylabel("Residuals")
plt.axhline(0)
Check for equality of the variances in the two groups seen in pop15
numres = lmod.resid[savings.pop15 > 35]
denres = lmod.resid[savings.pop15 < 35]
fstat = np.var(numres,ddof=1)/np.var(denres,ddof=1)
2*(1-sp.stats.f.cdf(fstat,len(numres)-1,len(denres)-1))
Do the residual fitted plot for the Galapagos data:
gala = pd.read_csv("data/gala.csv")
gmod = smf.ols(formula='Species ~ Area + Elevation + Nearest + Scruz + Adjacent', data=gala).fit()
plt.scatter(gmod.fittedvalues, gmod.resid)
plt.ylabel("Residuals")
plt.xlabel("Fitted values")
plt.axhline(0)
See how transformation improves the plot:
gmod = smf.ols(formula='np.sqrt(Species) ~ Area + Elevation + Nearest + Scruz + Adjacent', data=gala).fit()
plt.scatter(gmod.fittedvalues, gmod.resid)
plt.ylabel("Residuals")
plt.xlabel("Fitted values")
plt.axhline(0)
Check the QQ plot of the residuals. Plot is assigned to foo so it doesn't print twice.
foo = sm.qqplot(lmod.resid, line="q")
Histogram is not a good way to check:
plt.hist(lmod.resid)
plt.xlabel("Residuals")
Generate some random QQ plots from known distributions:
foo = sm.qqplot(np.random.normal(size=50),line="q")
foo = sm.qqplot(np.exp(np.random.normal(size=50)),line="q")
foo = sm.qqplot(np.random.standard_t(1,size=50),line="q")
foo = sm.qqplot(np.random.sample(size=50),line="q")
Do the Shapiro test - second value is the p-value
sp.stats.shapiro(lmod.resid)
globwarm = pd.read_csv("data/globwarm.csv")
globwarm.head()
lmod = smf.ols(formula='nhtemp ~ wusa + jasper + westgreen + chesapeake + tornetrask + urals + mongolia + tasman',
data=globwarm).fit()
lmod.summary()
plt.scatter(globwarm.year[lmod.resid.keys()], lmod.resid)
plt.axhline(0)
plt.scatter(lmod.resid.iloc[:-1],lmod.resid.iloc[1:])
plt.axhline(0,alpha=0.5)
plt.axvline(0,alpha=0.5)
This is the statistic. P-values do not seem to be available. 2 is considered null but less than 1 is a sign of a problem.
sm.stats.stattools.durbin_watson(lmod.resid)
lmod = smf.ols(formula='sr ~ pop15 + pop75 + dpi + ddpi', data=savings).fit()
diagv = lmod.get_influence()
hatv = pd.Series(diagv.hat_matrix_diag, savings.index)
hatv.head()
Make the half-normal plot:
n=50
ix = np.arange(1,n+1)
halfq = sp.stats.norm.ppf((n+ix)/(2*n+1)),
plt.scatter(halfq, np.sort(hatv))
Should tag these on the plot
hatv.sort_values().iloc[-5:]
One of the built-in plots. Not sure about residuals^2 as a diagnostic.
foo = sm.graphics.plot_leverage_resid2(lmod)
Would prefer R style plot here.
foo = sm.graphics.influence_plot(lmod)
Make a QQ plot of the Pearson residuals. These are not the same as the standardised residuals in R.
foo = sm.qqplot(lmod.resid_pearson)
Make the example plots with outliers. (Seems like this could be done more neatly)
np.random.seed(123)
testdata = pd.DataFrame({'x' : np.arange(1,11), 'y' : np.arange(1,11) + np.random.normal(size=10)})
p1 = pd.DataFrame({'x': [5.5], 'y':[12]})
alldata = testdata.append(p1,ignore_index=True)
marksize = np.ones(11)
marksize[10] = 3
plt.scatter(alldata.x, alldata.y, s= marksize*5)
slope, intercept = np.polyfit(testdata.x, testdata.y,1)
plt.plot(testdata.x, intercept + slope * testdata.x)
slope, intercept = np.polyfit(alldata.x, alldata.y,1)
plt.plot(alldata.x, intercept + slope * alldata.x, color='red')
p1 = pd.DataFrame({'x': [15], 'y':[15.1]})
alldata = testdata.append(p1,ignore_index=True)
plt.scatter(alldata.x, alldata.y, s= marksize*5)
slope, intercept = np.polyfit(testdata.x, testdata.y,1)
plt.plot(testdata.x, intercept + slope * testdata.x)
slope, intercept = np.polyfit(alldata.x, alldata.y,1)
plt.plot(alldata.x, intercept + slope * alldata.x, color='red')
p1 = pd.DataFrame({'x': [15], 'y':[5.1]})
alldata = testdata.append(p1,ignore_index=True)
plt.scatter(alldata.x, alldata.y, s= marksize*5)
slope, intercept = np.polyfit(testdata.x, testdata.y,1)
plt.plot(testdata.x, intercept + slope * testdata.x)
slope, intercept = np.polyfit(alldata.x, alldata.y,1)
plt.plot(alldata.x, intercept + slope * alldata.x, color='red')
Create the studentized residuals
stud = pd.Series(diagv.resid_studentized_external, savings.index)
(pd.Series.idxmax(abs(stud)), np.max(abs(stud)))
Find the Bonferroni-adjusted critical value:
sp.stats.t.ppf(0.05/(2*50),44)
Star example that has multiple outliers:
star = pd.read_csv("data/star.csv")
star.head()
Put the line on the plot:
lmod = smf.ols(formula='light ~ temp',data=star).fit()
plt.scatter(star.temp, star.light)
plt.plot([3.4, 4.7], [lmod.params[0] + lmod.params[1]*3.4, lmod.params[0] + lmod.params[1]*4.7])
Look for outliers:
stud = lmod.get_influence().resid_studentized_external
np.min(stud), np.max(stud)
Fit the line without the four points:
lmodr = smf.ols(formula='light ~ temp',data=star[star.temp > 3.6]).fit()
plt.scatter(star.temp, star.light)
plt.plot([3.4, 4.7], [lmod.params[0] + lmod.params[1]*3.4, lmod.params[0] + lmod.params[1]*4.7])
plt.plot([3.4, 4.7], [lmodr.params[0] + lmodr.params[1]*3.4, lmodr.params[0] + lmodr.params[1]*4.7],color='red')
lmod = smf.ols(formula='sr ~ pop15 + pop75 + dpi + ddpi', data=savings).fit()
diagv = lmod.get_influence()
cooks = pd.Series(diagv.cooks_distance[0], savings.index)
n=50
ix = np.arange(1,n+1)
halfq = sp.stats.norm.ppf((n+ix)/(2*n+1)),
plt.scatter(halfq, np.sort(cooks))
Would be best placed on the plot.
cooks.sort_values().iloc[-5:]
First column is the all data estimates and second is one without largest cook stat.
lmodi = smf.ols(formula='sr ~ pop15 + pop75 + dpi + ddpi', data=savings[cooks < 0.2]).fit()
pd.DataFrame({'with':lmod.params,'without':lmodi.params})
Would be better to autodetect extreme cases and label them. Note that dfbetas is the change in standard errors and not the change in coefficient as in R.
p15d = diagv.dfbetas[:,1]
plt.scatter(np.arange(1,51),p15d)
plt.axhline(0)
ix = 22
plt.annotate(savings.index[ix],(ix, p15d[ix]))
ix = 48
plt.annotate(savings.index[ix],(ix, p15d[ix]))
Partial regression plot
d = smf.ols(formula='sr ~ pop75 + dpi + ddpi', data=savings).fit().resid
m = smf.ols(formula='pop15 ~ pop75 + dpi + ddpi', data=savings).fit().resid
plt.scatter(m,d)
plt.xlabel("pop15 residuals")
plt.ylabel("sr residuals")
plt.plot([-10,8], [-10*lmod.params[1], 8*lmod.params[1]])
Partial residual plot
pr = lmod.resid + savings.pop15*lmod.params[1]
plt.scatter(savings.pop15, pr)
plt.xlabel("pop15")
plt.ylabel("partial residuals")
plt.plot([20,50], [20*lmod.params[1], 50*lmod.params[1]])
smf.ols(formula='sr ~ pop15 + pop75 + dpi + ddpi', data=savings[savings.pop15 > 35]).fit().summary()
smf.ols(formula='sr ~ pop15 + pop75 + dpi + ddpi', data=savings[savings.pop15 < 35]).fit().summary()
Plot the young and old groups with different colors:
savings['age'] = np.where(savings.pop15 > 35, 'red', 'blue')
plt.scatter(savings.ddpi, savings.sr, color=savings.age)
plt.xlabel("ddpi")
plt.ylabel("sr")
Would be nice to have a legend but matplotlib
does not make this easy. Fortunately, seaborn
makes this more convenient:
savings['age'] = np.where(savings.pop15 > 35, 'young', 'old')
sns.lmplot('ddpi','sr',data=savings, hue='age')
Can facet this also:
sns.lmplot('ddpi','sr',data=savings, col='age')
%load_ext version_information
%version_information pandas, numpy, matplotlib, seaborn, scipy, patsy, statsmodels