Load in the Python packages
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
Read in the data:
eco = pd.read_csv("data/eco.csv", index_col=0)
eco.head()
Plot the data:
plt.scatter(eco.usborn, eco.income)
plt.ylabel("Income")
plt.xlabel("Fraction US born")
plt.show()
lmod = sm.OLS(eco.income, sm.add_constant(eco.usborn)).fit()
lmod.summary()
plt.scatter(eco.usborn, eco.income)
plt.ylabel("Income")
plt.xlabel("Fraction US born")
plt.plot([0, 1.0], [lmod.params[0] + lmod.params[1]*0.0, lmod.params[0] + lmod.params[1]*1])
plt.show()
Read in and plot the data:
chredlin = pd.read_csv("data/chredlin.csv", index_col=0)
chredlin.head()
fig, ((ax1, ax2, ax3), (ax4, ax5, ax6)) = plt.subplots(nrows=2, ncols=3)
ax1.scatter(chredlin.race, chredlin.involact)
ax1.set(title="Race")
ax1.set_ylabel("Involact")
ax2.scatter(chredlin.fire, chredlin.involact)
ax2.set(title="Fire")
ax3.scatter(chredlin.theft, chredlin.involact)
ax3.set(title="Theft")
ax4.scatter(chredlin.age, chredlin.involact)
ax4.set(title="Age")
ax4.set_ylabel("Involact")
ax5.scatter(chredlin.income, chredlin.involact)
ax5.set(title="Income")
ax6.scatter(chredlin.side, chredlin.involact)
ax6.set(title="Side")
fig.tight_layout()
plt.show()
A more compact way of achieving a similar result:
import seaborn as sns
sns.pairplot(chredlin, x_vars=["race","fire","theft","age","income"], y_vars="involact")
plt.show()
lmod = sm.OLS(chredlin.involact, sm.add_constant(chredlin.race)).fit()
lmod.summary()
plt.subplot(1, 2, 1)
plt.scatter(chredlin.race, chredlin.fire)
plt.xlabel("Race")
plt.ylabel("Fire")
plt.subplot(1, 2, 2)
plt.scatter(chredlin.race, chredlin.theft)
plt.xlabel("Race")
plt.ylabel("Theft")
plt.tight_layout()
plt.show()
lmod = smf.ols(formula='involact ~ race + fire + theft + age + np.log(income)', data=chredlin).fit()
lmod.summary()
plt.scatter(lmod.fittedvalues, lmod.resid)
plt.ylabel("Residuals")
plt.xlabel("Fitted values")
plt.axhline(0)
plt.show()
Trick needed to get statsmodels plots not to appear twice:
fig=sm.qqplot(lmod.resid, line="q")
Make the partial residual plots
pr = lmod.resid + chredlin.race*lmod.params['race']
plt.scatter(chredlin.race, pr)
plt.xlabel("Race")
plt.ylabel("partial residuals")
xl,xu = [0, 100]
plt.plot([xl,xu], [xl*lmod.params['race'], xu*lmod.params['race']])
plt.show()
pr = lmod.resid + chredlin.fire*lmod.params['fire']
plt.scatter(chredlin.fire, pr)
plt.xlabel("Fire")
plt.ylabel("partial residuals")
xl,xu = [0, 40]
plt.plot([xl,xu], [xl*lmod.params['fire'], xu*lmod.params['fire']])
plt.show()
Generate all combinations
import itertools
inds = [1, 2, 3, 4]
clist = []
for i in range(0, len(inds)+1):
clist.extend(itertools.combinations(inds, i))
clist
X = chredlin.iloc[:,[0,1,2,3,5]].copy()
X.loc[:,'income'] = np.log(chredlin['income'])
betarace = []
pvals = []
for k in range(0, len(clist)):
lmod = sm.OLS(chredlin.involact, sm.add_constant(X.iloc[:,np.append(0,clist[k])])).fit()
betarace.append(lmod.params[1])
pvals.append(lmod.pvalues[1])
Construct list of model names
vlist = ['race']
varnames = np.array(['race','fire','theft','age','logincome'])
for k in range(1, len(clist)):
vlist.append('+'.join(varnames[np.append(0,clist[k])]))
vlist
pd.DataFrame({'beta':np.round(betarace,4), 'pvals':np.round(pvals,4)}, index=vlist)
Calculate the dfbetas - note these are not the same as leave out one coef changes as they have been scaled to standard error changes
lmod = smf.ols(formula='involact ~ race + fire + theft + age + np.log(income)', data=chredlin).fit()
diagv = lmod.get_influence()
min(diagv.dfbetas[:,1])
Since the t-statistic is 3.82, a reduction of 0.4 is not going make a difference to the conclusion.
plt.scatter(diagv.dfbetas[:,2], diagv.dfbetas[:,3])
plt.xlabel("Change in Fire")
plt.ylabel("Change in Theft")
plt.show()
sm.graphics.influence_plot(lmod)
chredlin.loc[[60607, 60610],:]
Leave out these two case. Could be a more elegant way?
ch45 = chredlin.loc[~chredlin.index.isin([60607, 60610]),:]
lmod45 = smf.ols(formula='involact ~ race + fire + np.log(income)', data=ch45).fit()
lmod45.summary()
lmods = smf.ols(formula='involact ~ race + fire + theft + age', data=chredlin.loc[chredlin.side == 's',:]).fit()
lmods.summary()
lmodn = smf.ols(formula='involact ~ race + fire + theft + age', data=chredlin.loc[chredlin.side == 'n',:]).fit()
lmodn.summary()
%load_ext version_information
%version_information pandas, numpy, matplotlib, seaborn, scipy, patsy, statsmodels