Load the modules.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import scipy.stats  # load the stats submodule so sp.stats is available for the t-test below
import statsmodels.api as sm
import statsmodels.formula.api as smf
Read in the data. Drop the unused variable.
gala = pd.read_csv("data/gala.csv", index_col=0)
gala.drop('Endemics', axis=1, inplace=True)
gala.head()
Fit the basic model with all five predictors.
lmod = smf.ols(formula='Species ~ Area + Elevation + Nearest + Scruz + Adjacent', data=gala).fit()
lmod.summary()
See how the Elevation coefficient changes when it is the only predictor:
smf.ols(formula='Species ~ Elevation', data=gala).fit().summary()
Set up a plot with the univariate regression line added.
plt.scatter(gala.Elevation, gala.Species)
plt.plot([0,1750],[11.3, 11.3+1750*0.2008])
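As a sketch, the hard-coded intercept 11.3 and slope 0.2008 above could instead be pulled straight from the univariate fit; unimod and xr are names introduced here for illustration.
unimod = smf.ols(formula='Species ~ Elevation', data=gala).fit()
xr = np.array([0, 1750])
# the same univariate line, drawn from the fitted coefficients rather than typed-in values
plt.plot(xr, unimod.params['Intercept'] + unimod.params['Elevation']*xr)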
Mean of all the predictors, with a leading 1 for the intercept term.
x0 = np.append(1,gala.iloc[:,1:].mean())
x0
Find the predicted response when Elevation is set first to its minimum and then to its maximum value, holding the other predictors at their means (position 2 of x0 is the Elevation slot).
x0[2] = np.min(gala.Elevation)
yl = np.dot(lmod.params, x0)
x0[2] = np.max(gala.Elevation)
yh = np.dot(lmod.params, x0)
[yl, yh]
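The same covariate-adjusted predictions can be obtained with lmod.predict on a two-row DataFrame of predictor values; a minimal sketch, where xnew is a name introduced for illustration.
xnew = gala.drop(columns='Species').mean().to_frame().T   # one row of predictor means
xnew = pd.concat([xnew, xnew], ignore_index=True)         # duplicate the row
xnew['Elevation'] = [gala.Elevation.min(), gala.Elevation.max()]
lmod.predict(xnew)   # should reproduce [yl, yh]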
Add this covariate-adjusted prediction to the plot and display it.
plt.scatter(gala.Elevation, gala.Species)
plt.plot([0,1750],[11.3, 11.3+1750*0.2008])
plt.plot([min(gala.Elevation),max(gala.Elevation)],[yl,yh])
Read in the New Hampshire voting data
newhamp = pd.read_csv("data/newhamp.csv")
Sum of votes for the candidates by voting method.
newhamp.groupby('votesys').agg({'Obama': 'sum', 'Clinton': 'sum'})
Set up a binary treatment variable: 1 where votesys is 'H' (hand-counted), 0 otherwise.
newhamp['trt'] = np.where(newhamp.votesys == 'H',1,0)
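As a quick sanity check (a sketch), cross-tabulate the voting system against the new indicator:
pd.crosstab(newhamp.votesys, newhamp.trt)   # the 'H' wards should all land in the trt=1 column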
Proportion voting for Obama by voting method. See that there is a significant difference of about 4.25 percentage points.
smf.ols(formula='pObama ~ trt',data=newhamp).fit().summary()
Adjust for the previous vote proportion for Dean. The treatment effect becomes insignificant.
smf.ols(formula='pObama ~ trt + Dean',data=newhamp).fit().summary()
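To see the adjusted treatment effect directly rather than scanning the summary table, one could refit and pull out the coefficient and its 95% confidence interval; a sketch, where amod is a name introduced for illustration.
amod = smf.ols(formula='pObama ~ trt + Dean', data=newhamp).fit()
amod.params['trt'], amod.conf_int().loc['trt']   # the interval should cover zero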
Check that the proportion voting for Dean differs by voting method.
smf.ols(formula='Dean ~ trt',data=newhamp).fit().summary()
Use the Match class from the match module.
from match import Match
newhamp.head()
Construct matched pairs, matching on the Dean proportion with a caliper of 0.01.
np.random.seed(100)
mp = Match(newhamp.votesys, newhamp.Dean, caliper=0.01)
str(mp)
Look at one matched pair. See that the Dean voting proportions are very similar.
newhamp.iloc[[51,12],]
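As a check (a sketch assuming the dict-like keys/values interface of Match used below), every matched pair should differ in the Dean proportion by less than the caliper:
dgap = np.abs(newhamp.Dean.loc[list(mp.keys())].to_numpy() -
              newhamp.Dean.loc[list(mp.values())].to_numpy())
dgap.max()   # should be below the 0.01 caliper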
mp.keys()
Draw line segments for matched pairs. This is much more complicated than the R code.
from matplotlib.collections import LineCollection
fig, ax = plt.subplots()
segs = np.zeros((len(mp), 2, 2), float)
segs[:,0,0] = newhamp.Dean.loc[mp.keys()]
segs[:,1,0] = newhamp.Dean.loc[mp.values()]
segs[:,0,1] = newhamp.pObama.loc[mp.keys()]
segs[:,1,1] = newhamp.pObama.loc[mp.values()]
plt.scatter(newhamp.Dean, newhamp.pObama, c=newhamp.trt, s=2)
line_segments = LineCollection(segs)
ax.add_collection(line_segments)
plt.show()
Extract and test the pairwise differences.
pdiff = newhamp.pObama.loc[mp.keys()].to_numpy() - newhamp.pObama.loc[mp.values()].to_numpy()
sp.stats.ttest_1samp(pdiff,0)
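As a sketch, the same comparison can be summarized by the mean within-pair difference and a rough normal-approximation 95% interval:
n = len(pdiff)
md = pdiff.mean()
se = pdiff.std(ddof=1) / np.sqrt(n)
md, (md - 1.96*se, md + 1.96*se)   # mean matched difference and its approximate 95% interval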
See that these differences do not vary according to the Dean voting proportion.
plt.scatter(newhamp.Dean.loc[mp.keys()], pdiff)
plt.axhline(0)
plt.xlabel('Proportion voted for Dean')
plt.ylabel('Digital vs. manual difference')
Construct a plot demonstrating that the difference goes away once covariate adjustment is applied, and that the adjustment is functionally similar to matching.
fig, ax = plt.subplots()
plt.scatter(newhamp.Dean, newhamp.pObama, c=newhamp.trt, s=2)
line_segments = LineCollection(segs)
ax.add_collection(line_segments)
plt.axhline(0.353)
plt.axhline(0.353+0.042,linestyle="dashed",c='red')
plt.plot([0.1, 0.6],[0.5229*0.1+0.221, 0.5229*0.6+0.221])
plt.plot([0.1, 0.6],[0.5229*0.1+0.216, 0.5229*0.6+0.216], linestyle="dashed")
plt.show()
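The hard-coded reference values above appear to be the coefficients of the two fits earlier; as a sketch, the same lines could be drawn from the fitted models directly, replacing the typed-in constants (umod and amod are names introduced for illustration).
umod = smf.ols(formula='pObama ~ trt', data=newhamp).fit()
amod = smf.ols(formula='pObama ~ trt + Dean', data=newhamp).fit()
xr = np.array([0.1, 0.6])
plt.axhline(umod.params['Intercept'])                                                    # unadjusted mean for trt=0
plt.axhline(umod.params['Intercept'] + umod.params['trt'], linestyle="dashed", c='red')  # unadjusted mean for trt=1 (hand-counted)
plt.plot(xr, amod.params['Intercept'] + amod.params['Dean']*xr)                          # Dean-adjusted fit, trt=0
plt.plot(xr, amod.params['Intercept'] + amod.params['trt'] + amod.params['Dean']*xr, linestyle="dashed")  # Dean-adjusted fit, trt=1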
%load_ext version_information
%version_information pandas, numpy, matplotlib, seaborn, scipy, patsy, statsmodels