import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
Read in a missing version of the Chicago Insurance data:
chmiss = pd.read_csv("data/chmiss.csv", index_col=0)
chmiss.head()
Describe does give observed count but can only tell the number of missing by subtraction
chmiss.describe().round(2)
Number of missing values per variable
chmiss.isna().sum(axis=0)
Number of missing values per case
chmiss.isna().sum(axis=1)
Make a plot of the missing data:
plt.imshow(~chmiss.isna(), aspect='auto')
plt.xlabel("variables")
plt.ylabel("cases")
plt.gray()
plt.show()
Full data model
chredlin = pd.read_csv("data/chredlin.csv", index_col=0)
lmod = smf.ols(formula='involact ~ race + fire + theft + age + np.log(income)', data=chredlin).fit()
lmod.summary()
Missing data model
lmodm = smf.ols(formula='involact ~ race + fire + theft + age + np.log(income)', data=chmiss).fit()
lmodm.summary()
Mean values for the variables
cmeans = chmiss.mean(axis=0)
cmeans
Fill in missing values with means
mchm = chmiss.copy()
mchm.race.fillna(cmeans['race'],inplace=True)
mchm.fire.fillna(cmeans['fire'],inplace=True)
mchm.theft.fillna(cmeans['theft'],inplace=True)
mchm.age.fillna(cmeans['age'],inplace=True)
mchm.income.fillna(cmeans['income'],inplace=True)
imod = smf.ols(formula='involact ~ race + fire + theft + age + np.log(income)', data=mchm).fit()
imod.summary()
lmodr = smf.ols(formula='race ~ fire + theft + age + np.log(income)', data=chmiss).fit()
mv = chmiss.race.isna()
lmodr.predict(chmiss)[mv]
chmiss.race.fillna(lmodr.predict(chmiss))[mv]
Use logit transformation to restrict to [0,1]
def logit(x): return(np.log(x/(1-x)))
def ilogit(x): return(np.exp(x)/(1+np.exp(x)))
ilogit(0)
lmodr = smf.ols(formula='logit(race/100) ~ fire + theft + age + np.log(income)', data=chmiss).fit()
(ilogit(lmodr.predict(chmiss))*100)[mv]
statsmodels
has Multiple Imputation with Chained Equations (MICE). Not the same as the R package demonstrated in R.
import statsmodels.imputation.mice as smi
imp = smi.MICEData(chmiss)
fm = 'involact ~ race + fire + theft + age + np.log(income)'
mmod = smi.MICE(fm, sm.OLS, imp)
results = mmod.fit(10, 10)
print(results.summary())
As might be expected, the inference is not as sharp as seen in the full data version with somewhat larger p-values.
%load_ext version_information
%version_information pandas, numpy, matplotlib, seaborn, scipy, patsy, statsmodels