Load the packages:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
%precision 7
Load in the data:
cars = pd.read_csv("data/cars.csv")
cars.head()
Set up the plot and the random seed
fig, ax = plt.subplots()
ax.scatter(cars.speed, cars.dist)
plt.xlabel("Speed")
plt.ylabel("Distance")
xr = np.array(ax.get_xlim())
np.random.seed(123)
Do the four fits with increasing amounts of error. Since we only need univariate regression np.polyfit is adequate.
est = np.polyfit(cars.speed, cars.dist, 1)
ax.plot(xr, est[1] + est[0] * xr)
est1 = np.polyfit(cars.speed + np.random.normal(size=50), cars.dist, 1)
ax.plot(xr, est1[1] + est1[0] * xr, c='red')
est2 = np.polyfit(cars.speed + np.random.normal(scale=2,size=50), cars.dist, 1)
ax.plot(xr, est2[1] + est2[0] * xr, c='green')
est5 = np.polyfit(cars.speed + np.random.normal(scale=5,size=50), cars.dist, 1)
ax.plot(xr, est5[1] + est5[0] * xr, c='cyan')
plt.show()
est, est1, est2, est5
vv = np.repeat(np.array([0.1, 0.2, 0.3, 0.4, 0.5]), [1000, 1000, 1000, 1000, 1000])
slopes = np.zeros(5000)
for i in range(5000):
slopes[i] = np.polyfit(cars.speed+np.random.normal(scale=np.sqrt(vv[i]), size=50), cars.dist, 1)[0]
betas = np.reshape(slopes, (5, 1000)).mean(axis=1)
betas = np.append(betas,est[0])
variances = np.array([0.6, 0.7, 0.8, 0.9, 1.0, 0.5])
gv = np.polyfit(variances, betas,1)
gv
plt.scatter(variances, betas)
xr = np.array([0,1])
plt.plot(xr, np.array(gv[1] + gv[0]*xr))
plt.plot([0], [gv[1]], marker='x', markersize=6)
plt.show()
Do not believe there is a simex
package for Python.
Read in the data and fit the model:
savings = pd.read_csv("data/savings.csv",index_col=0)
lmod = smf.ols(formula = 'sr ~ pop15 + pop75 + dpi + ddpi', data=savings).fit()
lmod.summary()
Rescale one of the predictors:
lmod = smf.ols(formula = 'sr ~ pop15 + pop75 + I(dpi/1000) + ddpi', data=savings).fit()
lmod.summary()
Standardize all the variables:
scsav = savings.apply(sp.stats.zscore)
lmod = smf.ols(formula = 'sr ~ pop15 + pop75 + dpi + ddpi', data=scsav).fit()
lmod.summary()
Set up data frame and make a plot of the estimates and confidence intervals:
edf = pd.concat([lmod.params, lmod.conf_int()],axis=1).iloc[1:,]
edf.columns = ['estimate','lb','ub']
npreds = edf.shape[0]
fig, ax = plt.subplots()
ax.scatter(edf.estimate,np.arange(npreds))
for i in range(npreds):
ax.plot([edf.lb[i], edf.ub[i]], [i, i])
ax.set_yticks(np.arange(npreds))
ax.set_yticklabels(edf.index)
ax.axvline(0)
plt.show()
Do Gelman's rescaling:
savings['age'] = np.where(savings.pop15 > 35, 0, 1)
savings['dpis'] = sp.stats.zscore(savings.dpi)/2
savings['ddpis'] = sp.stats.zscore(savings.ddpi)/2
smf.ols(formula='sr ~ age + dpis + ddpis', data=savings).fit().summary()
Read in the data:
seatpos = pd.read_csv("data/seatpos.csv")
seatpos.head()
Fit a model with all the predictors:
lmod = smf.ols(formula = 'hipcenter ~ Age+Weight+HtShoes+Ht+Seated+Arm+Thigh+Leg', data=seatpos).fit()
lmod.summary()
Look at the correlations of the predictors:
seatpos.iloc[:,:-1].corr().round(3)
Do the eigendecomposition. Gives similar but not identical results to R calculation.
X = seatpos.iloc[:,:-1]
XTX = np.dot(X.T,X)
evals, evecs = np.linalg.eig(XTX)
evals
Calculate the condition numbers;
evals[0]/evals
Compute the variance inflation factors:
from patsy import dmatrix
X = dmatrix("Age+Weight+HtShoes+Ht+Seated+Arm+Thigh+Leg", data=seatpos, return_type='dataframe')
lmod = sm.OLS(X['Age'],X.drop('Age',axis=1)).fit()
lmod.rsquared
1/(1-lmod.rsquared)
Get them all:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
pd.Series(vif, X.columns)
Change in coefficients due to perturbing the response
seatpos['hiperb'] = seatpos.hipcenter+np.random.normal(scale=10,size=38)
lmod = smf.ols(formula = 'hipcenter ~ Age+Weight+HtShoes+Ht+Seated+Arm+Thigh+Leg', data=seatpos).fit()
lmodp = smf.ols(formula = 'hiperb ~ Age+Weight+HtShoes+Ht+Seated+Arm+Thigh+Leg', data=seatpos).fit()
pd.DataFrame([lmod.params, lmodp.params])
Change in R-squared
lmod.rsquared, lmodp.rsquared
Correlations of the length variables
pd.DataFrame.corr(X.iloc[3:,3:]).round(3)
Recommended smaller model:
smf.ols(formula = 'hipcenter ~ Age+Weight+Ht', data=seatpos).fit().summary()
%load_ext version_information
%version_information pandas, numpy, matplotlib, seaborn, scipy, patsy, statsmodels