Import standard libraries and read in data:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
fat = pd.read_csv("data/fat.csv")
fat.head()
Fit the fat prediction model and produce summary
lmod = smf.ols(formula=
'brozek ~ age + weight + height + neck + chest + abdom + hip + thigh + knee + ankle + biceps + forearm + wrist',
data=fat).fit()
lmod.summary()
Construct the predictor vector
x0 = fat.loc[:,("age","weight","height","neck","chest","abdom","hip","thigh","knee","ankle","biceps","forearm","wrist")].median()
x0 = np.append(1, x0.ravel())
x0
Compute the prediction
np.dot(x0, lmod.params)
Compute prediction using sm predict() function. Note how x0 is constructed with variable labels
x0 = fat.loc[:,("age","weight","height","neck","chest","abdom","hip","thigh","knee","ankle","biceps","forearm","wrist")].median()
x0 = sm.tools.add_constant(pd.DataFrame(x0).T)
lmod.predict(x0)
x0
We can get confidence and prediction intervals also:
p = lmod.get_prediction(x0)
p.summary_frame()
Load in the airline passenger data and plot
air = pd.read_csv("data/airpass.csv")
plt.plot(air['year'], air['pass'])
X = sm.tools.add_constant(air['year'])
y = np.log(air['pass'])
lmod = sm.OLS(y,X).fit()
Put the fitted line onto the plot:
plt.plot(air['year'], air['pass'])
plt.plot(air['year'],np.exp(lmod.predict()))
Construct the lagged variables and drop the missing values
air['lag1'] = np.log(air['pass']).shift(1)
air['lag12'] = np.log(air['pass']).shift(12)
air['lag13'] = np.log(air['pass']).shift(13)
airlag = air.dropna()
Fit the lagged model
X = sm.tools.add_constant(airlag.loc[:,('lag1','lag12','lag13')])
y = np.log(airlag['pass'])
lmod = sm.OLS(y,X).fit()
lmod.summary()
Show the fitted model on top of the data. First year of data is not predicted because of lagging.
plt.plot(air['year'], air['pass'])
plt.plot(airlag['year'],np.exp(lmod.predict()))
Find the appropriate lagged variables:
x0 = np.log(air['pass'].iloc[[-1,-12,-13]])
x0
Make the prediction:
x0 = pd.DataFrame([{"const":1,"lag1": 6.068426, "lag12": 6.033086, "lag13": 6.003887}])
x0
p = lmod.get_prediction(x0)
p.summary_frame()
%load_ext version_information
%version_information pandas, numpy, matplotlib, seaborn, scipy, patsy, statsmodels