Load the packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
Read in the data:
savings = pd.read_csv("data/savings.csv",index_col=0)
savings.head()
Fit the base model:
lmod = smf.ols(formula='sr ~ pop15 + pop75 + dpi + ddpi', data=savings).fit()
lmod.summary()
Construct the Box-Cox plot:
X = sm.add_constant(savings.iloc[:,1:])
n = savings.shape[0]
sumlogy = np.sum(np.log(savings.sr))
lam = np.linspace(0.5,1.5,100)
llk = np.empty(100)
for i in range(0, 100):
lmod = sm.OLS(sp.stats.boxcox(savings.sr,lam[i]),X).fit()
llk[i] = -(n/2)*np.log(lmod.ssr/n) + (lam[i]-1)*sumlogy
plt.plot(lam,llk)
maxi = llk.argmax()
plt.axvline(lam[maxi],linestyle = 'dashed')
cicut = max(llk) - sp.stats.chi2.ppf(0.95,1)/2
plt.axhline(cicut,linestyle = 'dashed')
rlam = lam[llk > cicut]
plt.axvline(rlam[0],linestyle = 'dashed')
plt.axvline(rlam[-1],linestyle = 'dashed')
plt.show()
Repeat on the Galapagos data:
gala = pd.read_csv("data/gala.csv", index_col = 0)
gala.drop('Endemics', axis=1, inplace=True)
X = sm.add_constant(gala.iloc[:,1:])
n = gala.shape[0]
sumlogy = np.sum(np.log(gala.Species))
lam = np.linspace(-0.25,0.75,100)
llk = np.empty(100)
for i in range(0, 100):
lmod = sm.OLS(sp.stats.boxcox(gala.Species,lam[i]),X).fit()
llk[i] = -(n/2)*np.log(lmod.ssr/n) + (lam[i]-1)*sumlogy
plt.plot(lam,llk)
maxi = llk.argmax()
plt.axvline(lam[maxi],linestyle = 'dashed')
cicut = max(llk) - sp.stats.chi2.ppf(0.95,1)/2
plt.axhline(cicut,linestyle = 'dashed')
rlam = lam[llk > cicut]
plt.axvline(rlam[0],linestyle = 'dashed')
plt.axvline(rlam[-1],linestyle = 'dashed')
plt.show()
Consider the logtrans family of transformations:
leafburn = pd.read_csv("data/leafburn.csv")
leafburn.head()
X = sm.add_constant(leafburn.iloc[:,:-1])
n = leafburn.shape[0]
alpha = np.linspace(-0.999,0,100)
llk = np.empty(100)
for i in range(0, 100):
lmod = sm.OLS(np.log(leafburn.burntime+alpha[i]),X).fit()
llk[i] = -(n/2)*np.log(lmod.ssr) - np.sum(np.log(leafburn.burntime + alpha[i]))
plt.plot(alpha,llk)
maxi = llk.argmax()
plt.axvline(alpha[maxi],linestyle = 'dashed')
cicut = max(llk) - sp.stats.chi2.ppf(0.95,1)/2
plt.axhline(cicut,linestyle = 'dashed')
ralp = alpha[llk > cicut]
plt.axvline(ralp[0],linestyle = 'dashed')
plt.axvline(ralp[-1],linestyle = 'dashed')
plt.show()
lmod1 = smf.ols(formula='sr ~ pop15', data=savings[savings.pop15 < 35]).fit()
lmod2 = smf.ols(formula='sr ~ pop15', data=savings[savings.pop15 > 35]).fit()
plt.scatter(savings.pop15, savings.sr)
plt.xlabel('Population under 15')
plt.ylabel('Savings rate')
plt.axvline(35,linestyle='dashed')
plt.plot([20,35],[lmod1.params[0]+lmod1.params[1]*20,lmod1.params[0]+lmod1.params[1]*35])
plt.plot([35,48],[lmod2.params[0]+lmod2.params[1]*35,lmod2.params[0]+lmod2.params[1]*48])
def lhs (x,c): return(np.where(x < c, c-x, 0))
def rhs (x,c): return(np.where(x < c, 0, x-c))
lmod = smf.ols(formula='sr ~ lhs(pop15,35) + rhs(pop15,35)', data=savings).fit()
x = np.arange(20,49)
py = lmod.params[0] + lmod.params[1]*lhs(x,35) + lmod.params[2]*rhs(x,35)
plt.plot(x,py,linestyle='dotted')
plt.show()
smf.ols(formula='sr ~ ddpi', data=savings).fit().summary()
smf.ols(formula='sr ~ ddpi + I(ddpi**2)', data=savings).fit().summary()
smf.ols(formula='sr ~ ddpi + I(ddpi**2) + I(ddpi**3)', data=savings).fit().summary()
savings['mddpi'] = savings.ddpi - 10
smf.ols(formula='sr ~ mddpi + I(mddpi**2)', data=savings).fit().summary()
Unfortunately, the following is not orthogonal. Seems to require add on functions.
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(4)
Xp = poly.fit_transform(savings.ddpi.values.reshape(-1,1))
sm.OLS(savings.sr,Xp).fit().summary()
Note that this does not match the R results (R is incorrect because of the orthogonal polynomial handling)
poly = PolynomialFeatures(2)
X = savings.iloc[:,[1,4]]
Xp = poly.fit_transform(X)
lmod = sm.OLS(savings.sr,Xp).fit()
pop15r = np.array(list(np.arange(20,51,3))*11)
ddpir = np.repeat(np.arange(0,21,2),11)
X = np.stack((pop15r, ddpir),axis=1)
Xp = poly.fit_transform(X)
pv = np.dot(Xp,lmod.params)
Xm = np.reshape(pop15r,(11,11))
Ym = np.reshape(ddpir,(11,11))
Zm = np.reshape(pv,(11,11))
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(Xm,Ym,Zm)
plt.show()
Generate the example data
def funky (x): return(np.sin(2*np.pi*x**3)**3)
x = np.linspace(0., 1., 101)
y = funky(x) + sp.stats.norm.rvs(0,0.1,101)
plt.scatter(x,y)
plt.plot(x, funky(x))
plt.show()
4th order polynomial fit
p = np.poly1d(np.polyfit(x,y,4))
plt.scatter(x,y)
plt.plot(x, funky(x))
plt.plot(x,p(x))
plt.show()
12th order polynomial fit
p = np.poly1d(np.polyfit(x,y,12))
plt.scatter(x,y)
plt.plot(x, funky(x))
plt.plot(x,p(x))
plt.show()
Regression splines (choice of knots cheats by putting more knots near the point of inflexion)
import patsy
df = pd.DataFrame(x,y)
knots = [0,0.2,0.4,0.5,0.6,0.7,0.8,0.85,0.9,1]
z = smf.ols(formula='y ~ 1 + patsy.bs(x,knots=knots)',data=df).fit()
plt.scatter(x,y)
plt.plot(x, z.fittedvalues)
plt.show()
Smoothing splines as in R apparently not available in Python without specialized libraries
GAM is apparently not available in Python but we can implement a poor man's version:
import patsy
gamod = smf.ols(formula = 'sr ~ patsy.bs(pop15,4) + patsy.bs(ddpi,4)', data=savings).fit()
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(savings.pop15,savings.ddpi,gamod.fittedvalues)
plt.show()
%load_ext version_information
%version_information pandas, numpy, matplotlib, seaborn, scipy, patsy, statsmodels