Import the packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from scipy import stats
Read in the data
sexab = pd.read_csv("data/sexab.csv", index_col=0)
sexab.head()
Construct something similar to the by
function summary from LMR.
lfuncs = ['min','median','max']
sexab.groupby('csa').agg({'cpa': lfuncs,'ptsd': lfuncs}).round(1)
sexab.boxplot('ptsd',by='csa')
plt.suptitle("")
plt.show()
sns.pairplot(x_vars="cpa", y_vars="ptsd", data=sexab, hue="csa",size=5)
plt.show()
stats.ttest_ind(sexab.ptsd[sexab.csa == 'Abused'], sexab.ptsd[sexab.csa == 'NotAbused'])
Define dummy variables and including an intercept does fit a model but the meaning of the coefficients and standard errors are problematic. Look out for the design matrix is singular warning at the end.
df1 = (sexab.csa == 'Abused').astype(int)
df2 = (sexab.csa == 'NotAbused').astype(int)
X = np.column_stack((df1,df2))
lmod = sm.OLS(sexab.ptsd,sm.add_constant(X)).fit()
lmod.summary()
Use only one of the dummy variables:
lmod = sm.OLS(sexab.ptsd,sm.add_constant(df2)).fit()
lmod.summary()
Or we can just not have an intercept term:
lmod = sm.OLS(sexab.ptsd,X).fit()
lmod.summary()
Using the formula method produces R-like results
lmod = smf.ols('ptsd ~ csa', sexab).fit()
lmod.summary()
Can view the design matrix: (picking out the first two rows of each level of csa).
import patsy
selcols = [0,1,45,46]
patsy.dmatrix('~ csa', sexab)[selcols,:]
Can check the types of variables:
sexab.csa.dtype, sexab.ptsd.dtype
pandas
has a way to create the dummy variables:
sac = pd.concat([sexab,pd.get_dummies(sexab.csa)],axis=1)
lmod = smf.ols('ptsd ~ Abused', sac).fit()
lmod.summary()
Alternatively, we can set the reference level to NotAbused:
lmod = smf.ols('ptsd ~ C(csa,Treatment(reference="NotAbused"))', sexab).fit()
lmod.summary()
lmod4 = smf.ols('ptsd ~ csa*cpa', sexab).fit()
lmod4.summary()
Take a look at a chunk from the design matrix:
patsy.dmatrix('~ csa*cpa', sexab)[40:50,]
Plot the fit by group - seem like a lot of work!
abused = (sexab.csa == "Abused")
plt.scatter(sexab.cpa[abused], sexab.ptsd[abused], marker='x',label="abused")
xl,xu = [-3, 9]
a, b = (lmod4.params[0], lmod4.params[2])
plt.plot([xl,xu], [a+xl*b,a+xu*b])
plt.scatter(sexab.cpa[~abused], sexab.ptsd[~abused], marker='o',label="not abused")
a, b = (lmod4.params[0]+lmod4.params[1], lmod4.params[2]+lmod4.params[3])
plt.plot([xl,xu], [a+xl*b,a+xu*b])
plt.legend()
plt.show()
Can produce essentially the same plot but the regression lines are fit independently to the groups. In this case, the fits will be identical.
sns.lmplot(x="cpa", y="ptsd", hue="csa", data=sexab, ci=None)
plt.show()
lmod3 = smf.ols('ptsd ~ csa+cpa', sexab).fit()
lmod3.summary()
No shortcut to producing the plot this time.
abused = (sexab.csa == "Abused")
plt.scatter(sexab.cpa[abused], sexab.ptsd[abused], marker='x',label="abused")
xl,xu = [-3, 9]
a, b = (lmod3.params[0], lmod3.params[2])
plt.plot([xl,xu], [a+xl*b,a+xu*b])
plt.scatter(sexab.cpa[~abused], sexab.ptsd[~abused], marker='o',label="not abused")
a, b = (lmod3.params[0]+lmod4.params[1], lmod3.params[2])
plt.plot([xl,xu], [a+xl*b,a+xu*b])
plt.legend()
plt.show()
Get the confidence intervals:
lmod3.conf_int()
sns.residplot(lmod3.fittedvalues, lmod3.resid, lowess=True)
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
plt.show()
lmod1 = smf.ols('ptsd ~ cpa', sexab).fit()
lmod1.summary()
ws = pd.read_csv("data/whiteside.csv", index_col=0)
ws.head()
sns.lmplot(x="Temp", y="Gas", col="Insul", data=ws)
plt.show()
lmod = smf.ols('Gas ~ Temp*Insul', ws).fit()
lmod.summary()
ws.Temp.mean()
ws['cTemp'] = ws.Temp - ws.Temp.mean()
lmod = smf.ols('Gas ~ cTemp*Insul', ws).fit()
lmod.summary()
ff = pd.read_csv("data/fruitfly.csv", index_col=0)
ff.head()
sns.pairplot(x_vars="thorax", y_vars="longevity", data=ff, hue="activity",size=5)
plt.show()
sns.lmplot(x="thorax", y="longevity", data=ff, col="activity",size=5)
plt.show()
Fit the model
lmod = smf.ols('longevity ~ thorax*activity', ff).fit()
lmod.summary()
Construct and show selected rows of the design matrix (one for each level of activity). DataFrame is just for pretty printing.
mm = patsy.dmatrix('~ thorax*activity', ff)
ii = (1, 25, 49, 75, 99)
pd.DataFrame(mm[ii,:],index=ii,columns=lmod.params.index)
sns.residplot(lmod.fittedvalues, lmod.resid, lowess=True)
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
plt.show()
sm.qqplot(lmod.resid, line="q")
sm.stats.anova_lm(lmod)
lmods = smf.ols('longevity ~ thorax+activity', ff).fit()
sm.stats.anova_lm(lmods,lmod)
lmod = smf.ols('np.log(longevity) ~ thorax+activity', ff).fit()
lmod.summary()
sns.residplot(lmod.fittedvalues, lmod.resid, lowess=True)
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
plt.show()
np.exp(lmod.params[1:5])
lmod = smf.ols('longevity ~ activity', ff).fit()
lmod.summary()
sm.stats.anova_lm(lmod)
lmod = smf.ols('np.log(longevity) ~ activity', ff).fit()
lmod.summary()
from patsy.contrasts import Treatment
levels = [1,2,3,4]
contrast = Treatment(reference=0).code_without_intercept(levels)
print(contrast.matrix)
from patsy.contrasts import Sum
contrast = Sum().code_without_intercept(levels)
print(contrast.matrix)
from patsy.contrasts import Helmert
contrast = Helmert().code_without_intercept(levels)
print(contrast.matrix)
lmod = smf.ols('ptsd ~ C(csa,Sum)', sexab).fit()
lmod.summary()
%load_ext version_information
%version_information pandas, numpy, matplotlib, seaborn, scipy, patsy, statsmodels