Load the packages:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
Load the data:
statedata = pd.read_csv("data/statedata.csv",index_col=0)
statedata.head()
Fit the base model:
lmod = smf.ols('LifeExp ~ Population + Income + Illiteracy + Murder + HSGrad + Frost + Area', data=statedata).fit()
lmod.summary()
Perform backward elimination using p-values, dropping the least significant predictor at each step:
lmod.pvalues.round(2)
lmod = smf.ols('LifeExp ~ Population + Income + Illiteracy + Murder + HSGrad + Frost', data=statedata).fit()
lmod.pvalues.round(2)
lmod = smf.ols('LifeExp ~ Population + Income + Murder + HSGrad + Frost', data=statedata).fit()
lmod.pvalues.round(2)
lmod = smf.ols('LifeExp ~ Population + Murder + HSGrad + Frost', data=statedata).fit()
lmod.pvalues.round(2)
lmod = smf.ols('LifeExp ~ Murder + HSGrad + Frost', data=statedata).fit()
lmod.summary()
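This manual process can be automated. A minimal sketch (the helper name backward_eliminate and the 0.05 cutoff are our choices, not from the book):
def backward_eliminate(data, response, predictors, alpha=0.05):
    # Repeatedly drop the predictor with the largest p-value until
    # all remaining p-values fall below alpha
    preds = list(predictors)
    while preds:
        fit = smf.ols(response + ' ~ ' + ' + '.join(preds), data=data).fit()
        pvals = fit.pvalues.drop('Intercept')
        if pvals.max() < alpha:
            return fit
        preds.remove(pvals.idxmax())
    return None

backward_eliminate(statedata, 'LifeExp',
    ['Population','Income','Illiteracy','Murder','HSGrad','Frost','Area']).model.formula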
Variables that were eliminated may still show some significance when fitted in a different subset:
lmod = smf.ols('LifeExp ~ Illiteracy + Murder + Frost', data=statedata).fit()
lmod.summary()
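One reason eliminated variables can still matter is that they may be strongly correlated with retained predictors, which then carry their information. A quick check of the correlations (our addition, not in the original):
statedata[['Illiteracy','Murder','HSGrad']].corr().round(2)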
We can use scikit-learn for feature selection, but this only makes sense if the data are scaled. Note this method is not in "Linear Models with R".
from sklearn.preprocessing import scale
scalstat = pd.DataFrame(scale(statedata), index=statedata.index, columns=statedata.columns)
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=False)
X = scalstat.drop('LifeExp',axis=1)
smlmod = sm.OLS(scalstat.LifeExp, X).fit()
smlmod.params
reg.fit(X, scalstat.LifeExp)
reg.coef_
Expect the intercept to be zero since it was not included in the fit.
reg.intercept_
Recursive feature elimination (RFE) judges the importance of the predictors by the size of their coefficients, removing the least important one at each step:
from sklearn.feature_selection import RFE
selector = RFE(reg, n_features_to_select=1, step=1)
selector = selector.fit(X, scalstat.LifeExp)
selector.ranking_
In order of importance (argsort converts the rankings into a column ordering):
X.columns[np.argsort(selector.ranking_)]
Use 10-fold cross-validation:
from sklearn.feature_selection import RFECV
selector = RFECV(reg, step=1, cv=10)
selector = selector.fit(X, scalstat.LifeExp)
selector.ranking_
Picks only 3 predictors.
X.columns[selector.support_]
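We can also inspect the mean cross-validation score for each subset size. This assumes scikit-learn 1.0 or later, where RFECV exposes cv_results_ (older versions had grid_scores_ instead):
cvscores = selector.cv_results_['mean_test_score']
plt.plot(np.arange(1, len(cvscores) + 1), cvscores)
plt.xlabel('Number of Predictors')
plt.ylabel('Mean CV Score')
plt.show()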
Compute the RSS for every subset of predictors.
import itertools
pcols = list(statedata.columns)
pcols.remove('LifeExp')
rss = np.empty(len(pcols) + 1)
rss[0] = np.sum((statedata.LifeExp - np.mean(statedata.LifeExp))**2)
selvar = ['Null']
for k in range(1, len(pcols) + 1):
    RSS = {}
    for variables in itertools.combinations(pcols, k):
        predictors = statedata.loc[:, list(variables)].copy()
        predictors['Intercept'] = 1
        res = sm.OLS(statedata.LifeExp, predictors).fit()
        RSS[variables] = res.ssr
    # Keep the best RSS and the variable set that achieves it for each size k
    rss[k] = min(RSS.values())
    selvar.append(min(RSS, key=RSS.get))
rss.round(3)
selvar
Compute the AIC, up to an additive constant, as n*log(RSS/n) + 2p, where n = 50 and p is the number of parameters in the best model of each size, and plot it against the number of predictors:
aic = 50 * np.log(rss/50) + np.arange(1,9)*2
plt.plot(np.arange(1,8), aic[1:])
plt.xlabel('Number of Predictors')
plt.ylabel('AIC')
plt.show()
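The minimum AIC identifies the chosen subset (a convenience lookup we add here, using the selvar list built above):
selvar[np.argmin(aic)]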
Compute the adjusted R-squared, 1 - (RSS/(n - p))/(TSS/(n - 1)), for the best model of each size:
adjr2 = 1 - (rss/(50 - np.arange(1,9)))/(rss[0]/49)
plt.plot(np.arange(1,8),adjr2[1:])
plt.xlabel('Number of Predictors')
plt.ylabel('Adjusted R^2')
plt.show()
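Similarly, the subset maximizing the adjusted R-squared (our addition):
selvar[adjr2.argmax()]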
Compute Mallows' Cp statistic, RSS/sigma^2 + 2p - n, with sigma^2 estimated from the full model:
lmod = smf.ols('LifeExp ~ Population + Income + Illiteracy + Murder + HSGrad + Frost + Area', data=statedata).fit()
cpstat = rss/lmod.scale + 2 * np.arange(1,9) - 50
plt.plot(np.arange(2,9),cpstat[1:])
plt.xlabel('Number of Parameters')
plt.ylabel('Cp statistic')
plt.plot([2,8],[2,8])
plt.show()
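Subsets whose Cp falls at or below the Cp = p line are the plausible candidates; we can list them (a small check we add here):
p = np.arange(1, 9)
[selvar[i] for i in range(len(cpstat)) if cpstat[i] <= p[i]]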
Calculate the hat values. Although the parameter estimates are the same, the hat values differ from those computed by R.
lmod = smf.ols('LifeExp ~ Population + Murder + HSGrad + Frost', data=statedata).fit()
diagv = lmod.get_influence()
hatv = pd.Series(diagv.hat_matrix_diag, statedata.index)
hatv.sort_values().iloc[-5:]
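Alaska stands out. As a check we add here, compare against the rule-of-thumb threshold of 2p/n, with p = 5 parameters and n = 50:
hatv[hatv > 2 * (lmod.df_model + 1) / 50]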
Leave out Alaska (AK), as in the LMR book.
state49 = statedata.drop(['AK'])
rss = np.empty(len(pcols) + 1)
rss[0] = np.sum((state49.LifeExp - np.mean(state49.LifeExp))**2)
selvar = ['Null']
for k in range(1, len(pcols) + 1):
    RSS = {}
    for variables in itertools.combinations(pcols, k):
        predictors = state49.loc[:, list(variables)].copy()
        predictors['Intercept'] = 1
        res = sm.OLS(state49.LifeExp, predictors).fit()
        RSS[variables] = res.ssr
    rss[k] = min(RSS.values())
    selvar.append(min(RSS, key=RSS.get))
adjr2 = 1 - (rss/(49 - np.arange(1,9)))/(rss[0]/48)
adjr2
selvar[adjr2.argmax()]
adjr2.argmax()
Scale the data and make a strip plot:
from sklearn.preprocessing import scale
scalstat = pd.DataFrame(scale(statedata), index=statedata.index, columns=statedata.columns)
scalstat.head()
import seaborn as sns
sns.set_style("whitegrid")
sns.stripplot(data=scalstat, jitter=True)
plt.show()
Do variable selection on the transformed model. We apply the log transformations to Population and Area directly so that the subset search uses the transformed predictors:
lmod = smf.ols('LifeExp ~ np.log(Population) + Income + Illiteracy + Murder + HSGrad + Frost + np.log(Area)', data=statedata).fit()
tstate = statedata.copy()
tstate['Population'] = np.log(tstate['Population'])
tstate['Area'] = np.log(tstate['Area'])
rss = np.empty(len(pcols) + 1)
rss[0] = np.sum((tstate.LifeExp - np.mean(tstate.LifeExp))**2)
selvar = ['Null']
for k in range(1, len(pcols) + 1):
    RSS = {}
    for variables in itertools.combinations(pcols, k):
        predictors = tstate.loc[:, list(variables)].copy()
        predictors['Intercept'] = 1
        res = sm.OLS(tstate.LifeExp, predictors).fit()
        RSS[variables] = res.ssr
    rss[k] = min(RSS.values())
    selvar.append(min(RSS, key=RSS.get))
adjr2 = 1 - (rss/(50 - np.arange(1,9)))/(rss[0]/49)
adjr2
selvar[adjr2.argmax()]
It would be worth writing a function for the all-subsets regression, since we have repeated essentially the same loop three times. A sketch follows.
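A minimal sketch (our own wrapper, with the hypothetical name bestsubsets; it reproduces the loop used above):
def bestsubsets(data, response, predictors):
    # Best-subset search: for each size k, record the smallest RSS
    # and the variable set that achieves it
    n_pred = len(predictors)
    rss = np.empty(n_pred + 1)
    rss[0] = np.sum((data[response] - data[response].mean())**2)
    selvar = ['Null']
    for k in range(1, n_pred + 1):
        RSS = {}
        for variables in itertools.combinations(predictors, k):
            Xk = data.loc[:, list(variables)].copy()
            Xk['Intercept'] = 1
            RSS[variables] = sm.OLS(data[response], Xk).fit().ssr
        rss[k] = min(RSS.values())
        selvar.append(min(RSS, key=RSS.get))
    return rss, selvar

rss, selvar = bestsubsets(statedata, 'LifeExp', pcols)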
%load_ext version_information
%version_information pandas, numpy, matplotlib, seaborn, scipy, patsy, statsmodels