{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chapter 2 - Estimation\n",
    "\n",
    "Load in the packages:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import scipy as sp\n",
    "import statsmodels.api as sm\n",
    "import statsmodels.formula.api as smf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.6 Example"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read in the Galapagos data and check the first few lines:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Species</th>\n",
       "      <th>Endemics</th>\n",
       "      <th>Area</th>\n",
       "      <th>Elevation</th>\n",
       "      <th>Nearest</th>\n",
       "      <th>Scruz</th>\n",
       "      <th>Adjacent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Baltra</th>\n",
       "      <td>58</td>\n",
       "      <td>23</td>\n",
       "      <td>25.09</td>\n",
       "      <td>346</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.6</td>\n",
       "      <td>1.84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bartolome</th>\n",
       "      <td>31</td>\n",
       "      <td>21</td>\n",
       "      <td>1.24</td>\n",
       "      <td>109</td>\n",
       "      <td>0.6</td>\n",
       "      <td>26.3</td>\n",
       "      <td>572.33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Caldwell</th>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>0.21</td>\n",
       "      <td>114</td>\n",
       "      <td>2.8</td>\n",
       "      <td>58.7</td>\n",
       "      <td>0.78</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Champion</th>\n",
       "      <td>25</td>\n",
       "      <td>9</td>\n",
       "      <td>0.10</td>\n",
       "      <td>46</td>\n",
       "      <td>1.9</td>\n",
       "      <td>47.4</td>\n",
       "      <td>0.18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Coamano</th>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>0.05</td>\n",
       "      <td>77</td>\n",
       "      <td>1.9</td>\n",
       "      <td>1.9</td>\n",
       "      <td>903.82</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           Species  Endemics   Area  Elevation  Nearest  Scruz  Adjacent\n",
       "Baltra          58        23  25.09        346      0.6    0.6      1.84\n",
       "Bartolome       31        21   1.24        109      0.6   26.3    572.33\n",
       "Caldwell         3         3   0.21        114      2.8   58.7      0.78\n",
       "Champion        25         9   0.10         46      1.9   47.4      0.18\n",
       "Coamano          2         1   0.05         77      1.9    1.9    903.82"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gala = pd.read_csv(\"data/gala.csv\", index_col=0)\n",
    "gala.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Drop the endemics variable permanently from the data frame:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Species</th>\n",
       "      <th>Area</th>\n",
       "      <th>Elevation</th>\n",
       "      <th>Nearest</th>\n",
       "      <th>Scruz</th>\n",
       "      <th>Adjacent</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Baltra</th>\n",
       "      <td>58</td>\n",
       "      <td>25.09</td>\n",
       "      <td>346</td>\n",
       "      <td>0.6</td>\n",
       "      <td>0.6</td>\n",
       "      <td>1.84</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Bartolome</th>\n",
       "      <td>31</td>\n",
       "      <td>1.24</td>\n",
       "      <td>109</td>\n",
       "      <td>0.6</td>\n",
       "      <td>26.3</td>\n",
       "      <td>572.33</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Caldwell</th>\n",
       "      <td>3</td>\n",
       "      <td>0.21</td>\n",
       "      <td>114</td>\n",
       "      <td>2.8</td>\n",
       "      <td>58.7</td>\n",
       "      <td>0.78</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Champion</th>\n",
       "      <td>25</td>\n",
       "      <td>0.10</td>\n",
       "      <td>46</td>\n",
       "      <td>1.9</td>\n",
       "      <td>47.4</td>\n",
       "      <td>0.18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Coamano</th>\n",
       "      <td>2</td>\n",
       "      <td>0.05</td>\n",
       "      <td>77</td>\n",
       "      <td>1.9</td>\n",
       "      <td>1.9</td>\n",
       "      <td>903.82</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           Species   Area  Elevation  Nearest  Scruz  Adjacent\n",
       "Baltra          58  25.09        346      0.6    0.6      1.84\n",
       "Bartolome       31   1.24        109      0.6   26.3    572.33\n",
       "Caldwell         3   0.21        114      2.8   58.7      0.78\n",
       "Champion        25   0.10         46      1.9   47.4      0.18\n",
       "Coamano          2   0.05         77      1.9    1.9    903.82"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gala.drop('Endemics', axis=1, inplace=True)\n",
    "gala.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fit a linear model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>         <td>Species</td>     <th>  R-squared:         </th> <td>   0.766</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th> <td>   0.717</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th> <td>   15.70</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Fri, 07 Sep 2018</td> <th>  Prob (F-statistic):</th> <td>6.84e-07</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>15:11:42</td>     <th>  Log-Likelihood:    </th> <td> -162.54</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td>    30</td>      <th>  AIC:               </th> <td>   337.1</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td>    24</td>      <th>  BIC:               </th> <td>   345.5</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     5</td>      <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "      <td></td>         <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Intercept</th> <td>    7.0682</td> <td>   19.154</td> <td>    0.369</td> <td> 0.715</td> <td>  -32.464</td> <td>   46.601</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Area</th>      <td>   -0.0239</td> <td>    0.022</td> <td>   -1.068</td> <td> 0.296</td> <td>   -0.070</td> <td>    0.022</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Elevation</th> <td>    0.3195</td> <td>    0.054</td> <td>    5.953</td> <td> 0.000</td> <td>    0.209</td> <td>    0.430</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Nearest</th>   <td>    0.0091</td> <td>    1.054</td> <td>    0.009</td> <td> 0.993</td> <td>   -2.166</td> <td>    2.185</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Scruz</th>     <td>   -0.2405</td> <td>    0.215</td> <td>   -1.117</td> <td> 0.275</td> <td>   -0.685</td> <td>    0.204</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Adjacent</th>  <td>   -0.0748</td> <td>    0.018</td> <td>   -4.226</td> <td> 0.000</td> <td>   -0.111</td> <td>   -0.038</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td>12.683</td> <th>  Durbin-Watson:     </th> <td>   2.476</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th> <td> 0.002</td> <th>  Jarque-Bera (JB):  </th> <td>  13.498</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>          <td> 1.136</td> <th>  Prob(JB):          </th> <td> 0.00117</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>      <td> 5.374</td> <th>  Cond. No.          </th> <td>1.90e+03</td>\n",
       "</tr>\n",
       "</table><br/><br/>Warnings:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.<br/>[2] The condition number is large, 1.9e+03. This might indicate that there are<br/>strong multicollinearity or other numerical problems."
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                Species   R-squared:                       0.766\n",
       "Model:                            OLS   Adj. R-squared:                  0.717\n",
       "Method:                 Least Squares   F-statistic:                     15.70\n",
       "Date:                Fri, 07 Sep 2018   Prob (F-statistic):           6.84e-07\n",
       "Time:                        15:11:42   Log-Likelihood:                -162.54\n",
       "No. Observations:                  30   AIC:                             337.1\n",
       "Df Residuals:                      24   BIC:                             345.5\n",
       "Df Model:                           5                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "Intercept      7.0682     19.154      0.369      0.715     -32.464      46.601\n",
       "Area          -0.0239      0.022     -1.068      0.296      -0.070       0.022\n",
       "Elevation      0.3195      0.054      5.953      0.000       0.209       0.430\n",
       "Nearest        0.0091      1.054      0.009      0.993      -2.166       2.185\n",
       "Scruz         -0.2405      0.215     -1.117      0.275      -0.685       0.204\n",
       "Adjacent      -0.0748      0.018     -4.226      0.000      -0.111      -0.038\n",
       "==============================================================================\n",
       "Omnibus:                       12.683   Durbin-Watson:                   2.476\n",
       "Prob(Omnibus):                  0.002   Jarque-Bera (JB):               13.498\n",
       "Skew:                           1.136   Prob(JB):                      0.00117\n",
       "Kurtosis:                       5.374   Cond. No.                     1.90e+03\n",
       "==============================================================================\n",
       "\n",
       "Warnings:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "[2] The condition number is large, 1.9e+03. This might indicate that there are\n",
       "strong multicollinearity or other numerical problems.\n",
       "\"\"\""
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lmod = smf.ols(formula='Species ~ Area + Elevation + Nearest + Scruz  + Adjacent', data=gala).fit()\n",
    "lmod.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The warnings can be ignored for now. The first is always assumed. The second can be a problem.\n",
    "\n",
    "Compute LS estimates using the formula:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 7.06822071, -0.02393834,  0.31946476,  0.00914396, -0.24052423,\n",
       "       -0.07480483])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X = gala.iloc[:,1:]\n",
    "X.insert(0,'intercept',1)\n",
    "XtXi = np.linalg.inv(X.T @ X)\n",
    "np.dot(XtXi @ X.T, gala['Species'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A somewhat more efficient way to do the calculation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 7.06822071, -0.02393834,  0.31946476,  0.00914396, -0.24052423,\n",
       "       -0.07480483])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.linalg.solve(X.T @ X, np.dot(X.T,gala['Species']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.9 QR decomposition"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Computation using the QR decomposition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([-466.84219318,  381.40557435,  256.25047255,    5.40764552,\n",
       "       -119.49834019,  257.69436853])"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "q, r = np.linalg.qr(X)\n",
    "f = np.dot(np.transpose(q), gala['Species'])\n",
    "f"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This function from scipy uses the fact that r is upper triangular. The np.linalg.solve does not."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([ 7.06822071, -0.02393834,  0.31946476,  0.00914396, -0.24052423,\n",
       "       -0.07480483])"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sp.linalg.solve_triangular(r, f)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.10 Identifiability"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Add a predictor which is a linear combination of other predictors. In contrast to R, the parameter is estimated. The second warning indicates that the design matrix is singular (as indeed it is)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<table class=\"simpletable\">\n",
       "<caption>OLS Regression Results</caption>\n",
       "<tr>\n",
       "  <th>Dep. Variable:</th>         <td>Species</td>     <th>  R-squared:         </th> <td>   0.766</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Model:</th>                   <td>OLS</td>       <th>  Adj. R-squared:    </th> <td>   0.717</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Method:</th>             <td>Least Squares</td>  <th>  F-statistic:       </th> <td>   15.70</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Date:</th>             <td>Fri, 07 Sep 2018</td> <th>  Prob (F-statistic):</th> <td>6.84e-07</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Time:</th>                 <td>15:11:42</td>     <th>  Log-Likelihood:    </th> <td> -162.54</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>No. Observations:</th>      <td>    30</td>      <th>  AIC:               </th> <td>   337.1</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Residuals:</th>          <td>    24</td>      <th>  BIC:               </th> <td>   345.5</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Df Model:</th>              <td>     5</td>      <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Covariance Type:</th>      <td>nonrobust</td>    <th>                     </th>     <td> </td>   \n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "      <td></td>         <th>coef</th>     <th>std err</th>      <th>t</th>      <th>P>|t|</th>  <th>[0.025</th>    <th>0.975]</th>  \n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Intercept</th> <td>    7.0682</td> <td>   19.154</td> <td>    0.369</td> <td> 0.715</td> <td>  -32.464</td> <td>   46.601</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Area</th>      <td>   -0.0409</td> <td>    0.018</td> <td>   -2.236</td> <td> 0.035</td> <td>   -0.079</td> <td>   -0.003</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Elevation</th> <td>    0.3195</td> <td>    0.054</td> <td>    5.953</td> <td> 0.000</td> <td>    0.209</td> <td>    0.430</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Nearest</th>   <td>    0.0091</td> <td>    1.054</td> <td>    0.009</td> <td> 0.993</td> <td>   -2.166</td> <td>    2.185</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Scruz</th>     <td>   -0.2405</td> <td>    0.215</td> <td>   -1.117</td> <td> 0.275</td> <td>   -0.685</td> <td>    0.204</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Adjacent</th>  <td>   -0.0578</td> <td>    0.016</td> <td>   -3.511</td> <td> 0.002</td> <td>   -0.092</td> <td>   -0.024</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Adiff</th>     <td>    0.0170</td> <td>    0.007</td> <td>    2.340</td> <td> 0.028</td> <td>    0.002</td> <td>    0.032</td>\n",
       "</tr>\n",
       "</table>\n",
       "<table class=\"simpletable\">\n",
       "<tr>\n",
       "  <th>Omnibus:</th>       <td>12.683</td> <th>  Durbin-Watson:     </th> <td>   2.476</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Prob(Omnibus):</th> <td> 0.002</td> <th>  Jarque-Bera (JB):  </th> <td>  13.498</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Skew:</th>          <td> 1.136</td> <th>  Prob(JB):          </th> <td> 0.00117</td>\n",
       "</tr>\n",
       "<tr>\n",
       "  <th>Kurtosis:</th>      <td> 5.374</td> <th>  Cond. No.          </th> <td>4.67e+15</td>\n",
       "</tr>\n",
       "</table><br/><br/>Warnings:<br/>[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.<br/>[2] The smallest eigenvalue is 2.45e-24. This might indicate that there are<br/>strong multicollinearity problems or that the design matrix is singular."
      ],
      "text/plain": [
       "<class 'statsmodels.iolib.summary.Summary'>\n",
       "\"\"\"\n",
       "                            OLS Regression Results                            \n",
       "==============================================================================\n",
       "Dep. Variable:                Species   R-squared:                       0.766\n",
       "Model:                            OLS   Adj. R-squared:                  0.717\n",
       "Method:                 Least Squares   F-statistic:                     15.70\n",
       "Date:                Fri, 07 Sep 2018   Prob (F-statistic):           6.84e-07\n",
       "Time:                        15:11:42   Log-Likelihood:                -162.54\n",
       "No. Observations:                  30   AIC:                             337.1\n",
       "Df Residuals:                      24   BIC:                             345.5\n",
       "Df Model:                           5                                         \n",
       "Covariance Type:            nonrobust                                         \n",
       "==============================================================================\n",
       "                 coef    std err          t      P>|t|      [0.025      0.975]\n",
       "------------------------------------------------------------------------------\n",
       "Intercept      7.0682     19.154      0.369      0.715     -32.464      46.601\n",
       "Area          -0.0409      0.018     -2.236      0.035      -0.079      -0.003\n",
       "Elevation      0.3195      0.054      5.953      0.000       0.209       0.430\n",
       "Nearest        0.0091      1.054      0.009      0.993      -2.166       2.185\n",
       "Scruz         -0.2405      0.215     -1.117      0.275      -0.685       0.204\n",
       "Adjacent      -0.0578      0.016     -3.511      0.002      -0.092      -0.024\n",
       "Adiff          0.0170      0.007      2.340      0.028       0.002       0.032\n",
       "==============================================================================\n",
       "Omnibus:                       12.683   Durbin-Watson:                   2.476\n",
       "Prob(Omnibus):                  0.002   Jarque-Bera (JB):               13.498\n",
       "Skew:                           1.136   Prob(JB):                      0.00117\n",
       "Kurtosis:                       5.374   Cond. No.                     4.67e+15\n",
       "==============================================================================\n",
       "\n",
       "Warnings:\n",
       "[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.\n",
       "[2] The smallest eigenvalue is 2.45e-24. This might indicate that there are\n",
       "strong multicollinearity problems or that the design matrix is singular.\n",
       "\"\"\""
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "gala['Adiff'] = gala['Area'] - gala['Adjacent']\n",
    "lmod = smf.ols(formula='Species ~ Area + Elevation + Nearest + Scruz  + Adjacent + Adiff', data=gala).fit()\n",
    "lmod.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2.11 Orthogonality"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Orthogonality example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>odor</th>\n",
       "      <th>temp</th>\n",
       "      <th>gas</th>\n",
       "      <th>pack</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>66</td>\n",
       "      <td>-1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>39</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>43</td>\n",
       "      <td>-1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>49</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>58</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>-1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   odor  temp  gas  pack\n",
       "0    66    -1   -1     0\n",
       "1    39     1   -1     0\n",
       "2    43    -1    1     0\n",
       "3    49     1    1     0\n",
       "4    58    -1    0    -1"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "odor = pd.read_csv(\"data/odor.csv\")\n",
    "odor.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Covariance of the predictors is diagonal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>temp</th>\n",
       "      <th>gas</th>\n",
       "      <th>pack</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>temp</th>\n",
       "      <td>0.571429</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gas</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.571429</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pack</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.571429</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          temp       gas      pack\n",
       "temp  0.571429  0.000000  0.000000\n",
       "gas   0.000000  0.571429  0.000000\n",
       "pack  0.000000  0.000000  0.571429"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "odor.iloc[:,1:].cov()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "LS estimates with all three predictors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Intercept    15.200\n",
       "temp        -12.125\n",
       "gas         -17.000\n",
       "pack        -21.375\n",
       "dtype: float64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lmod = smf.ols(formula='odor ~ temp + gas + pack', data=odor).fit()\n",
    "lmod.params"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Covariance of the parameter estimates"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Intercept</th>\n",
       "      <th>temp</th>\n",
       "      <th>gas</th>\n",
       "      <th>pack</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Intercept</th>\n",
       "      <td>86.46</td>\n",
       "      <td>-0.0</td>\n",
       "      <td>-0.0</td>\n",
       "      <td>-0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>temp</th>\n",
       "      <td>-0.00</td>\n",
       "      <td>162.1</td>\n",
       "      <td>-0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>gas</th>\n",
       "      <td>-0.00</td>\n",
       "      <td>-0.0</td>\n",
       "      <td>162.1</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pack</th>\n",
       "      <td>-0.00</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>162.1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           Intercept   temp    gas   pack\n",
       "Intercept      86.46   -0.0   -0.0   -0.0\n",
       "temp           -0.00  162.1   -0.0    0.0\n",
       "gas            -0.00   -0.0  162.1    0.0\n",
       "pack           -0.00    0.0    0.0  162.1"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.round(lmod.cov_params(),2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "see that estimates do not change when a predictor is dropped from the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Intercept    15.200\n",
       "gas         -17.000\n",
       "pack        -21.375\n",
       "dtype: float64"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lmod = smf.ols(formula='odor ~ gas + pack', data=odor).fit()\n",
    "lmod.params"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/json": {
       "Software versions": [
        {
         "module": "Python",
         "version": "3.7.0 64bit [Clang 4.0.1 (tags/RELEASE_401/final)]"
        },
        {
         "module": "IPython",
         "version": "6.5.0"
        },
        {
         "module": "OS",
         "version": "Darwin 17.7.0 x86_64 i386 64bit"
        },
        {
         "module": "pandas",
         "version": "0.23.4"
        },
        {
         "module": "numpy",
         "version": "1.15.1"
        },
        {
         "module": "matplotlib",
         "version": "2.2.3"
        },
        {
         "module": "seaborn",
         "version": "0.9.0"
        },
        {
         "module": "scipy",
         "version": "1.1.0"
        },
        {
         "module": "patsy",
         "version": "0.5.0"
        },
        {
         "module": "statsmodels",
         "version": "0.9.0"
        }
       ]
      },
      "text/html": [
       "<table><tr><th>Software</th><th>Version</th></tr><tr><td>Python</td><td>3.7.0 64bit [Clang 4.0.1 (tags/RELEASE_401/final)]</td></tr><tr><td>IPython</td><td>6.5.0</td></tr><tr><td>OS</td><td>Darwin 17.7.0 x86_64 i386 64bit</td></tr><tr><td>pandas</td><td>0.23.4</td></tr><tr><td>numpy</td><td>1.15.1</td></tr><tr><td>matplotlib</td><td>2.2.3</td></tr><tr><td>seaborn</td><td>0.9.0</td></tr><tr><td>scipy</td><td>1.1.0</td></tr><tr><td>patsy</td><td>0.5.0</td></tr><tr><td>statsmodels</td><td>0.9.0</td></tr><tr><td colspan='2'>Fri Sep 07 15:11:42 2018 BST</td></tr></table>"
      ],
      "text/latex": [
       "\\begin{tabular}{|l|l|}\\hline\n",
       "{\\bf Software} & {\\bf Version} \\\\ \\hline\\hline\n",
       "Python & 3.7.0 64bit [Clang 4.0.1 (tags/RELEASE\\_401/final)] \\\\ \\hline\n",
       "IPython & 6.5.0 \\\\ \\hline\n",
       "OS & Darwin 17.7.0 x86\\_64 i386 64bit \\\\ \\hline\n",
       "pandas & 0.23.4 \\\\ \\hline\n",
       "numpy & 1.15.1 \\\\ \\hline\n",
       "matplotlib & 2.2.3 \\\\ \\hline\n",
       "seaborn & 0.9.0 \\\\ \\hline\n",
       "scipy & 1.1.0 \\\\ \\hline\n",
       "patsy & 0.5.0 \\\\ \\hline\n",
       "statsmodels & 0.9.0 \\\\ \\hline\n",
       "\\hline \\multicolumn{2}{|l|}{Fri Sep 07 15:11:42 2018 BST} \\\\ \\hline\n",
       "\\end{tabular}\n"
      ],
      "text/plain": [
       "Software versions\n",
       "Python 3.7.0 64bit [Clang 4.0.1 (tags/RELEASE_401/final)]\n",
       "IPython 6.5.0\n",
       "OS Darwin 17.7.0 x86_64 i386 64bit\n",
       "pandas 0.23.4\n",
       "numpy 1.15.1\n",
       "matplotlib 2.2.3\n",
       "seaborn 0.9.0\n",
       "scipy 1.1.0\n",
       "patsy 0.5.0\n",
       "statsmodels 0.9.0\n",
       "Fri Sep 07 15:11:42 2018 BST"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "%load_ext version_information\n",
    "%version_information pandas, numpy, matplotlib, seaborn, scipy, patsy, statsmodels"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}