Read in the packages, data and exclude an unused variable.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
Read in the Galapagos data
gala = pd.read_csv("data/gala.csv", index_col=0)
gala.drop('Endemics', axis=1, inplace=True)
gala.head()
Fit the model:
lmod = smf.ols(formula='Species ~ Area + Elevation + Nearest + Scruz + Adjacent', data=gala).fit()
lmod.summary()
Total sum of squares and residual sum of squares
lmod.centered_tss, lmod.ssr
Degrees of freedom for the numerator and denominator of the F-test
lmod.df_model, lmod.df_resid
Mean square of numerator and denominator
lmod.mse_model, lmod.mse_resid
F-statistic reproduced
lmod.mse_model/ lmod.mse_resid
Compute p-value
1-sp.stats.f.cdf(lmod.fvalue, lmod.df_model, lmod.df_resid)
F-test for comparing two models. (Turn off the warnings about NaNs in the output - we don't care).
%%capture --no-display
lmods = smf.ols(formula='Species ~ Elevation + Nearest + Scruz + Adjacent', data=gala).fit()
sm.stats.anova_lm(lmods,lmod)
%%capture --no-display
lmods = smf.ols(formula='Species ~ Elevation + Nearest + Scruz', data=gala).fit()
sm.stats.anova_lm(lmods,lmod)
Use I() notation to test replacing by the sum of two variables
%%capture --no-display
lmods = smf.ols(formula='Species ~ I(Area+Adjacent) + Elevation + Nearest + Scruz', data=gala).fit()
sm.stats.anova_lm(lmods,lmod)
Need to use glm function to get offset functionality
lmod = smf.glm(formula='Species ~ Area + Elevation + Nearest + Scruz + Adjacent', data=gala).fit()
lmods = smf.glm(formula='Species ~ Area + Nearest + Scruz + Adjacent', offset=(0.5*gala['Elevation']), data=gala).fit()
fstat = (lmods.deviance-lmod.deviance)/(lmod.deviance/lmod.df_resid)
fstat
1-sp.stats.f.cdf(fstat, 1, lmod.df_resid)
Compute t-statistic and corresponding p-value
lmod = smf.ols(formula='Species ~ Area + Elevation + Nearest + Scruz + Adjacent', data=gala).fit()
tstat=(lmod.params['Elevation']-0.5)/lmod.bse['Elevation']
tstat, 2*sp.stats.t.cdf(tstat, lmod.df_resid)
tstat**2
Permutation of the response
lmod = smf.ols(formula='Species ~ Nearest + Scruz', data=gala).fit()
np.random.permutation(gala.Species)
Compute the f-statistic for a sample of permutations
fstats = np.zeros(4000)
for i in range(0,4000):
gala['ysamp'] = np.random.permutation(gala.Species)
lmodi = smf.ols(formula='ysamp ~ Nearest + Scruz', data=gala).fit()
fstats[i] = lmodi.fvalue
Proportion of permuted f-statistics that exceed the observed value
np.mean(fstats > lmod.fvalue)
Which is close to the observed p-value
lmod.f_pvalue
Do t-test with permutation
tstats = np.zeros(4000)
for i in range(0, 4000):
gala['ssamp'] = np.random.permutation(gala.Scruz)
lmodi = smf.ols(formula='Species ~ Nearest + ssamp', data=gala).fit()
tstats[i] = lmodi.tvalues[2]
Proportion of permuted t-statistcs which exceed the observed t-statistic in absolute value
np.mean(np.fabs(tstats) > np.fabs(lmod.tvalues[2]))
Very close to observed p-value:
lmod.pvalues[2]
lmod = smf.ols(formula='Species ~ Area + Elevation + Nearest + Scruz + Adjacent', data=gala).fit()
qt = np.array(sp.stats.t.interval(0.95,24))
lmod.params[1] + lmod.bse[1]*qt
lmod.conf_int()
Drawing the confidence ellipses appears to require some effort in coding
Bootstrap the residuals
breps = 4000
coefmat = np.empty((breps,6))
resids = lmod.resid
preds = lmod.predict()
for i in range(0,breps):
gala['ysamp'] = preds + np.random.choice(resids,30)
lmodi = smf.ols(formula='ysamp ~ Area + Elevation + Nearest + Scruz + Adjacent', data=gala).fit()
coefmat[i,:] = lmodi.params
Turn into pandas dataframe and add columns
coefmat = pd.DataFrame(coefmat, columns=("intercept", "area","elevation","nearest","Scruz","adjacent"))
Compute the quantiles by column for bootstrap confidence intervals
coefmat.quantile((0.025,0.975))
Plot the kernel density estimate
coefmat.area.plot.density()
xci = coefmat.area.quantile((0.025,0.975)).ravel()
plt.axvline(x=xci[0], linestyle='--')
plt.axvline(x=xci[1], linestyle='--')
plt.show()
%load_ext version_information
%version_information pandas, numpy, matplotlib, seaborn, scipy, patsy, statsmodels