import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from scipy import stats
oatvar = pd.read_csv("data/oatvar.csv", index_col=0, dtype={'yield':'int','block':'category','variety':'category'})
oatvar.head()
Display the data as a table:
oatvar.pivot(index = 'variety', columns='block', values='yield')
Make plots of the data:
sns.boxplot(x="variety", y="yield", data=oatvar)
plt.show()
sns.boxplot(x="block", y="yield", data=oatvar)
plt.show()
sns.catplot(x='variety',y='yield',hue='block', data=oatvar, kind='point')
plt.show()
sns.catplot(x='block',y='yield',hue='variety', data=oatvar, kind='point')
plt.show()
The obvious way to fit the model fails, apparently because patsy
doesn't like the response to be called yield
.
try:
lmod = smf.ols('yield ~ variety + block', oatvar).fit()
lmod.summary()
except:
pass
Look at the design matrix:
import patsy
dm = patsy.dmatrix('~ variety + block', oatvar)
np.asarray(dm)
lmod = sm.OLS(oatvar['yield'], dm).fit()
lmod.summary()
Another solution is to create a response with a new name:
oatvar['y'] = oatvar['yield']
lmod = smf.ols('y ~ variety + block', oatvar).fit()
sm.stats.anova_lm(lmod).round(3)
Fit with only block
lmod = smf.ols('y ~ block', oatvar).fit()
sm.stats.anova_lm(lmod).round(3)
Leave out the first observation:
oat1 = oatvar.iloc[1:,]
lmod = smf.ols('y ~ variety + block', oat1).fit()
sm.stats.anova_lm(lmod).round(3)
Change the factor order:
lmod = smf.ols('y ~ block + variety', oat1).fit()
sm.stats.anova_lm(lmod).round(3)
Or use type III ANOVA which is like drop()
in R
sm.stats.anova_lm(lmod,typ=3)
lmod = smf.ols('y ~ variety + block', oatvar).fit()
sns.residplot(lmod.fittedvalues, lmod.resid)
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
plt.show()
res = np.sort(lmod.resid)
n = len(res)
qq = stats.norm.ppf(np.arange(1,n+1)/(n+1))
fig, ax = plt.subplots()
ax.scatter(qq, res)
plt.xlabel("Normal Quantiles")
plt.ylabel("Sorted Residuals")
plt.show()
Do the interaction test:
lmod.params
varcoefs = np.append([0.],lmod.params[1:8])
varcoefs = np.repeat(varcoefs,5)
blockcoefs = np.append([0.],lmod.params[8:12])
blockcoefs = np.tile(blockcoefs,8)
oatvar['crossp'] = varcoefs * blockcoefs
tmod = smf.ols("y ~ variety + block + crossp", oatvar).fit()
tmod.summary()
Relative efficiency
lmcrd = smf.ols("y ~ variety", oatvar).fit()
np.sqrt(lmcrd.scale)
np.sqrt(lmod.scale)
lmcrd.scale/lmod.scale
abrasion = pd.read_csv("data/abrasion.csv", index_col=0,
dtype={'run':'category','position':'category','material':'category','wear':'int'})
abrasion.head()
See the Latin square layout:
abrasion.pivot(index = 'run', columns='position', values='wear')
abrasion.pivot(index = 'run', columns='position', values='material')
sns.catplot(x='run',y='wear',hue='position', data=abrasion, kind='point',scale=0.5)
sns.catplot(x='run',y='wear',hue='material', data=abrasion, kind='point',scale=0.5)
lmod = smf.ols('wear ~ run + position + material', abrasion).fit()
sm.stats.anova_lm(lmod,typ=3).round(3)
%%capture --no-display
lmod.summary()
Do the Tukey computation:
from statsmodels.sandbox.stats.multicomp import get_tukeyQcrit
get_tukeyQcrit(4,6)*lmod.bse[1]/np.sqrt(2)
mcoefs = np.append([0],lmod.params[7:10])
nmats = ['A','B','C','D']
import itertools
dp = set(itertools.combinations(range(0,4),2))
dcoef = []
namdiff = []
for cp in dp:
dcoef.append(mcoefs[cp[0]] - mcoefs[cp[1]])
namdiff.append(nmats[cp[0]] + '-' + nmats[cp[1]])
thsd = pd.DataFrame({'Difference':dcoef},index=namdiff)
thsd["lb"] = thsd.Difference - get_tukeyQcrit(4,6) * lmod.bse[1]/np.sqrt(2)
thsd["ub"] = thsd.Difference + get_tukeyQcrit(4,6) * lmod.bse[1]/np.sqrt(2)
thsd.round(2)
Relative efficiency
lmodr = smf.ols('wear ~ material', abrasion).fit()
lmodr.scale/lmod.scale
rabbit = pd.read_csv("data/rabbit.csv", index_col=0,
dtype={'treat':'category','gain':'float','block':'category'})
rabbit.head()
Show the arrange of the BIB. I replace the NaNs with spaces for clarity.
pt = rabbit.pivot(index = 'treat', columns='block', values='gain')
pt.replace(np.nan," ", regex=True)
sns.swarmplot(x='block',y='gain',hue='treat',data=rabbit)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
sns.swarmplot(x='treat',y='gain',hue='block',data=rabbit)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()
lmod = smf.ols('gain ~ treat + block', rabbit).fit()
sm.stats.anova_lm(lmod,typ=3).round(3)
sns.residplot(lmod.fittedvalues, lmod.resid)
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
plt.show()
res = np.sort(lmod.resid)
n = len(res)
qq = stats.norm.ppf(np.arange(1,n+1)/(n+1))
fig, ax = plt.subplots()
ax.scatter(qq, res)
plt.xlabel("Normal Quantiles")
plt.ylabel("Sorted Residuals")
plt.show()
lmod.summary()
Tukey pairwise differences:
mcoefs = np.append([0],lmod.params[1:6])
nmats = [chr(i) for i in range(ord('a'),ord('f')+1)]
p = len(mcoefs)
dp = set(itertools.combinations(range(0,p),2))
dcoef = []
namdiff = []
for cp in dp:
dcoef.append(mcoefs[cp[0]] - mcoefs[cp[1]])
namdiff.append(nmats[cp[0]] + '-' + nmats[cp[1]])
thsd = pd.DataFrame({'Difference':dcoef},index=namdiff)
thsd["lb"] = thsd.Difference - get_tukeyQcrit(p,lmod.df_resid) * lmod.bse[1]/np.sqrt(2)
thsd["ub"] = thsd.Difference + get_tukeyQcrit(p,lmod.df_resid) * lmod.bse[1]/np.sqrt(2)
thsd.round(2)
Relative efficiency
lmodr = smf.ols('gain ~ treat', rabbit).fit()
lmodr.scale/lmod.scale
%load_ext version_information
%version_information pandas, numpy, matplotlib, seaborn, scipy, patsy, statsmodels