Load the packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from scipy import stats
Load in the data
composite = pd.read_csv("data/composite.csv", index_col=0)
composite
Create the interaction plots:
sns.catplot(x="laser", y="strength", hue="tape", data=composite, kind="point")
sns.catplot(x="tape", y="strength", hue="laser", data=composite, kind="point")
Fit a saturated model. Don't show the error messages (get complaints about small datasets)
%%capture --no-display
lmod = smf.ols("strength ~ tape * laser", composite).fit()
lmod.summary()
Fit main effects only model:
lmod = smf.ols("strength ~ tape + laser", composite).fit()
lmod.params
lasercoefs = np.tile([0, 6.57, 12.21],3)
lasercoefs
tapecoefs = np.repeat([1.48, 5.52, 0], 3)
tapecoefs
%%capture --no-display
composite['crossp'] = tapecoefs * lasercoefs
tmod = smf.ols("strength ~ tape + laser + crossp", composite).fit()
tmod.summary()
sm.stats.anova_lm(tmod)
Fix the order of the categories
cat_type = pd.api.types.CategoricalDtype(categories=['slow','medium','fast'],ordered=True)
composite['tape'] = composite.tape.astype(cat_type)
%%capture --no-display
from patsy.contrasts import Poly
lmod = smf.ols("strength ~ C(tape,Poly) + C(laser,Poly)", composite).fit()
lmod.summary()
Check out the design matrix
import patsy
dm = patsy.dmatrix('~ C(tape,Poly) + C(laser,Poly)', composite)
np.asarray(dm).round(2)
composite['Nlaser'] = np.tile([40,50,60],3)
composite['Ntape'] = np.repeat([6.42,13,27], 3)
%%capture --no-display
lmodn = smf.ols("strength ~ np.log(Ntape) + I(np.log(Ntape)**2) + Nlaser", composite).fit()
lmodn.summary()
Read in the data while making sure the predictors are treated as categorical variables:
pvc = pd.read_csv("data/pvc.csv", index_col=0, dtype={'psize':'float','operator':'category','resin':'category'})
pvc.head()
Compute the group means
pvc.groupby('operator')[['psize']].mean()
sns.catplot(x='resin',y='psize',hue='operator', data=pvc, kind="point",ci=None)
ax = sns.catplot(x='operator',y='psize',hue='resin', data=pvc, ci=None,scale=0.5,kind="point")
ax = sns.swarmplot(x='operator',y='psize',hue='resin', data=pvc)
ax.legend_.remove()
Create the ANOVA table
lmod = smf.ols('psize ~ operator*resin', pvc).fit()
sm.stats.anova_lm(lmod).round(3)
Now remove the interaction term:
lmod = smf.ols('psize ~ operator+resin', pvc).fit()
sm.stats.anova_lm(lmod).round(3)
Make the diagnostic plots:
fig = sm.qqplot(lmod.resid, line="q")
sns.residplot(lmod.fittedvalues, lmod.resid)
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
plt.show()
sns.swarmplot(pvc['operator'], lmod.resid)
plt.axhline(0)
plt.xlabel("Operator")
plt.ylabel("Residuals")
plt.show()
Test for equality of variance:
lmod = smf.ols('psize ~ operator*resin', pvc).fit()
ii = np.arange(0,48,2)
pvce = pvc.iloc[ii,:].copy()
pvce['res'] = np.sqrt(abs(lmod.resid.loc[ii+1]))
vmod = smf.ols('res ~ operator+resin', pvce).fit()
sm.stats.anova_lm(vmod,typ=3).round(3)
lmod = smf.ols('psize ~ operator+resin', pvc).fit()
lmod.summary()
Get the Tukey critical value:
from statsmodels.sandbox.stats.multicomp import get_tukeyQcrit
get_tukeyQcrit(3,38)
Get the first pairwise difference:
lmod.params[1] + np.array([-1,1]) * get_tukeyQcrit(3,38) * lmod.bse[1] / np.sqrt(2)
Read in the data:
warpbreaks = pd.read_csv("data/warpbreaks.csv", index_col=0, dtype={'breaks':'int','wool':'category','tension':'category'})
warpbreaks.head()
sns.boxplot(x="wool", y="breaks", data=warpbreaks)
plt.show()
sns.catplot(x='wool',y='breaks',hue='tension', data=warpbreaks, ci=None,kind="point")
warpbreaks['nwool'] = warpbreaks['wool'].cat.codes
ax = sns.catplot(x='wool',y='breaks',hue='tension', data=warpbreaks, ci=None,kind="point")
ax = sns.swarmplot(x='wool',y='breaks',hue='tension', data=warpbreaks)
ax.legend_.remove()
ax = sns.catplot(x='tension',y='breaks',hue='wool', data=warpbreaks, ci=None, kind="point")
ax = sns.swarmplot(x='tension',y='breaks',hue='wool', data=warpbreaks)
ax.legend_.remove()
lmod = smf.ols('breaks ~ wool * tension', warpbreaks).fit()
sns.regplot(lmod.fittedvalues, lmod.resid,scatter_kws={'alpha':0.3}, ci=None)
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
lmod = smf.ols('np.sqrt(breaks) ~ wool * tension', warpbreaks).fit()
sns.regplot(lmod.fittedvalues, lmod.resid,scatter_kws={'alpha':0.3}, ci=None)
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
sm.stats.anova_lm(lmod)
lmod.summary()
lmod = smf.ols('np.sqrt(breaks) ~ wool:tension-1', warpbreaks).fit()
lmod.summary()
Width of Tukey HSD bands is ( qt/sqrt(2) se sqrt(2))
get_tukeyQcrit(6,48) * lmod.bse[0]
Compute all the pairwise differences:
import itertools
dp = set(itertools.combinations(range(0,6),2))
dcoef = []
namdiff = []
for cp in dp:
dcoef.append(lmod.params[cp[0]] - lmod.params[cp[1]])
namdiff.append(lmod.params.index[cp[0]] + '-' + lmod.params.index[cp[1]])
thsd = pd.DataFrame({'Difference':dcoef},index=namdiff)
thsd["lb"] = thsd.Difference - get_tukeyQcrit(6,48) * lmod.bse[0]
thsd["ub"] = thsd.Difference + get_tukeyQcrit(6,48) * lmod.bse[0]
thsd
lmod.params.index
Read in the data
speedo = pd.read_csv("data/speedo.csv", index_col=0)
speedo
%%capture --no-display
lmod = smf.ols('y ~ h+d+l+b+j+f+n+a+i+e+m+c+k+g+o', speedo).fit()
lmod.summary()
Look at the design matrix:
dm = patsy.dmatrix('~ h+d+l+b+j+f+n+a+i+e+m+c+k+g+o', speedo)
np.asarray(dm)
Make a QQ plot of the coefficients:
ii = np.argsort(lmod.params[1:])
scoef = np.array(lmod.params[1:])[ii]
lcoef = (speedo.columns[:-1])[ii]
n = len(scoef)
qq = stats.norm.ppf(np.arange(1,n+1)/(n+1))
fig, ax = plt.subplots()
ax.scatter(qq, scoef,s=1)
for i in range(len(qq)):
ax.annotate(lcoef[i], (qq[i],scoef[i]))
plt.xlabel("Normal Quantiles")
plt.ylabel("Sorted coefficients")
plt.show()
n = len(scoef)
qq = stats.norm.ppf((n + np.arange(1,n+1))/(2*n+1))
acoef = np.abs(lmod.params[1:])
ii = np.argsort(acoef)
acoef = acoef[ii]
lcoef = (speedo.columns[:-1])[ii]
fig, ax = plt.subplots()
ax.scatter(qq, acoef,s=1)
for i in range(len(qq)):
ax.annotate(lcoef[i], (qq[i],acoef[i]))
plt.xlabel("Normal Half Quantiles")
plt.ylabel("Sorted absolute coefficients")
plt.show()
%load_ext version_information
%version_information pandas, numpy, matplotlib, seaborn, scipy, patsy, statsmodels