try:
import statsmodels.api as sm
except:
!pip install statsmodels==0.13.1
import statsmodels.api as sm
Collecting statsmodels==0.13.1
Downloading statsmodels-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
|████████████████████████████████| 9.8 MB 22.3 MB/s
Requirement already satisfied: numpy>=1.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.13.1) (1.19.5)
Collecting patsy>=0.5.2
Downloading patsy-0.5.2-py2.py3-none-any.whl (233 kB)
|████████████████████████████████| 233 kB 85.1 MB/s
Requirement already satisfied: scipy>=1.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.13.1) (1.7.2)
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.13.1) (1.2.5)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels==0.13.1) (1.16.0)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels==0.13.1) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels==0.13.1) (2021.3)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.2 statsmodels-0.13.1
WARNING: You are using pip version 20.1.1; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
pm25 = pd.read_csv('mich_pm25.csv')
pm25
ruca_ca = pd.read_csv('ruca2010revised.csv')
ruca_ca
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3173: DtypeWarning: Columns (0,3,4,5) have mixed types.Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
med_income_by_state = pd.read_csv('median_income_2011.csv')
med_income_by_state = med_income_by_state.head(51)
med_income_by_state['2011 Med Income'] = [int(i.replace(',', '')) for i in med_income_by_state['2011 Med Income']]
med_income_by_state['State'] = med_income_by_state['State'].str.strip()
mean_pm25_by_state = pd.read_csv("stateag_pm25_2011.csv")
mean_pm25_by_state = mean_pm25_by_state.drop(labels=['year', 'countyfips', 'ctfips', 'latitude', 'longitude', 'ds_pm_stdd'], axis=1)
chronic_disease = pd.read_csv("U.S._Chronic_Disease_Indicators__CDI_.csv")
chronic_disease
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3173: DtypeWarning: Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
ruca_ca.columns = ruca_ca.loc[0]
ruca_ca = ruca_ca.drop(0).reset_index()
ruca_ca = ruca_ca.drop(columns = 'index')
ruca_ca.to_csv('ruca2010.csv')
ruca_ca = ruca_ca[ruca_ca['Select State'] == 'MI']
ruca_ca
ruca_ca['State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)'] = ruca_ca['State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)'].astype(int)
pm25 = pm25.drop(columns=['statefips', 'countyfips', 'longitude', 'latitude'])
pm25 = pm25.drop(columns=['Unnamed: 0', 'year'])
merged_pm25 = pm25.merge(ruca_ca, how='inner', left_on='ctfips', right_on="State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)")
merged_pm25['urban'] = merged_pm25['Primary RUCA Code 2010'].astype(int) <= 6
merged_pm25['Primary RUCA Code 2010'] = merged_pm25['Primary RUCA Code 2010'].astype(int)
merged_grouped = merged_pm25.groupby('ctfips').agg(np.mean)
merged_grouped = merged_grouped[merged_grouped['Primary RUCA Code 2010'] < 99]
merged_pm25['Tract Population, 2010'] = merged_pm25['Tract Population, 2010'].str.replace(",","").astype(int)
merged_pm25['Population Density (per square mile), 2010'] = merged_pm25['Population Density (per square mile), 2010'].str.replace(",","").astype(float)
merged_grouped = merged_pm25.groupby('ctfips').agg(np.mean)
merged_grouped = merged_grouped[merged_grouped['Tract Population, 2010'] < 15000]
merged_grouped = merged_grouped[merged_grouped['Population Density (per square mile), 2010'] > 1000]
merged_grouped = merged_grouped[merged_grouped['Population Density (per square mile), 2010'] < 50000]
merged_2011 = merged_pm25[merged_pm25['date'].str.contains('2011')]
merged_2011.to_csv('ruca_merged_michigan.csv')
ruca_mi = pd.read_csv('ruca2010revised.csv')
ruca_mi.columns = ruca_mi.loc[0]
ruca_mi = ruca_mi.drop(0).reset_index()
ruca_mi = ruca_mi.drop(columns = 'index')
ruca_mi = ruca_mi[ruca_mi['Select State'] == 'MI']
ruca_mi_counts = ruca_mi['Primary RUCA Code 2010'].astype(int).value_counts(sort=False)
ruca_mi_counts = ruca_mi_counts.sort_index()
plt.bar(ruca_mi_counts.index.astype('str'), ruca_mi_counts.values)
plt.xlabel('Primary RUCA Code')
plt.ylabel('Count')
plt.title('Number of Census Tracts in Michigan with each RUCA Code');
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3173: DtypeWarning: Columns (0,3,4,5) have mixed types.Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
ruca_merged_mi = pd.read_csv('ruca_merged_michigan.csv')
ruca_merged_mi
pairplot_df = ruca_merged_mi.drop(columns=['Unnamed: 0','date', 'ctfips', 'ds_pm_stdd', 'State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)'])
merged_mi_2011 = ruca_merged_mi[ruca_merged_mi['date'].str.contains('2011')]
merged_mi_2011_grouped = merged_mi_2011.groupby('ctfips').agg(np.mean)
merged_mi_2011_grouped = merged_mi_2011_grouped[merged_mi_2011_grouped['Primary RUCA Code 2010'] < 99]
merged_mi_2011_grouped
pairplot_df = merged_mi_2011_grouped.drop(columns=['Unnamed: 0', 'ds_pm_stdd', 'State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)'])
sns.pairplot(pairplot_df.drop(['Primary RUCA Code 2010', 'Secondary RUCA Code, 2010 (see errata)'], axis=1))
non_par_df = pairplot_df.drop('Secondary RUCA Code, 2010 (see errata)', axis=1).dropna()
non_par_df['Primary RUCA Code 2010'] = non_par_df['Primary RUCA Code 2010'].astype(str)
X = non_par_df.drop(['ds_pm_pred'], axis=1)
y = non_par_df['ds_pm_pred']
X = pd.get_dummies(X)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=88)
X_train_2 = X_train.drop(['Primary RUCA Code 2010_1.0'], axis=1)
X_test_2 = X_test.drop(['Primary RUCA Code 2010_1.0'], axis=1)
X_train_3 = X_train_2.drop('Tract Population, 2010', axis=1)
X_test_3 = X_test_2.drop('Tract Population, 2010', axis=1)
import statsmodels.api as sm
import statsmodels.formula.api as smf
gaussian_model = sm.GLM(
y_train, sm.add_constant(X_train_2),
family=sm.families.Gaussian()
)
gaussian_results = gaussian_model.fit()
print(gaussian_results.summary())
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: ds_pm_pred No. Observations: 1934
Model: GLM Df Residuals: 1921
Model Family: Gaussian Df Model: 12
Link Function: identity Scale: 0.34379
Method: IRLS Log-Likelihood: -1705.2
Date: Tue, 14 Dec 2021 Deviance: 660.41
Time: 04:53:54 Pearson chi2: 660.
No. Iterations: 3 Pseudo R-squ. (CS): 0.8524
Covariance Type: nonrobust
==============================================================================================================
coef std err z P>|z| [0.025 0.975]
--------------------------------------------------------------------------------------------------------------
const 10.0672 0.041 246.339 0.000 9.987 10.147
Tract Population, 2010 -2.846e-06 8.77e-06 -0.325 0.745 -2e-05 1.43e-05
Land Area (square miles), 2010 -0.0036 0.000 -11.777 0.000 -0.004 -0.003
Population Density (per square mile), 2010 4.724e-05 4.79e-06 9.865 0.000 3.79e-05 5.66e-05
Primary RUCA Code 2010_10.0 -1.5332 0.066 -23.064 0.000 -1.663 -1.403
Primary RUCA Code 2010_2.0 -0.4046 0.047 -8.647 0.000 -0.496 -0.313
Primary RUCA Code 2010_3.0 -0.7555 0.112 -6.731 0.000 -0.976 -0.536
Primary RUCA Code 2010_4.0 -1.5419 0.064 -23.987 0.000 -1.668 -1.416
Primary RUCA Code 2010_5.0 -1.5024 0.085 -17.686 0.000 -1.669 -1.336
Primary RUCA Code 2010_6.0 -0.9337 0.150 -6.230 0.000 -1.227 -0.640
Primary RUCA Code 2010_7.0 -1.6591 0.083 -20.063 0.000 -1.821 -1.497
Primary RUCA Code 2010_8.0 -2.0929 0.136 -15.427 0.000 -2.359 -1.827
Primary RUCA Code 2010_9.0 -1.4821 0.166 -8.917 0.000 -1.808 -1.156
==============================================================================================================
import statsmodels.api as sm
import statsmodels.formula.api as smf
gaussian_model = sm.GLM(
y_train, sm.add_constant(X_train_3),
family=sm.families.Gaussian()
)
gaussian_results = gaussian_model.fit()
print(gaussian_results.summary())
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: ds_pm_pred No. Observations: 1934
Model: GLM Df Residuals: 1922
Model Family: Gaussian Df Model: 11
Link Function: identity Scale: 0.34363
Method: IRLS Log-Likelihood: -1705.3
Date: Tue, 14 Dec 2021 Deviance: 660.45
Time: 05:18:57 Pearson chi2: 660.
No. Iterations: 3 Pseudo R-squ. (CS): 0.8525
Covariance Type: nonrobust
==============================================================================================================
coef std err z P>|z| [0.025 0.975]
--------------------------------------------------------------------------------------------------------------
const 10.0565 0.024 419.678 0.000 10.010 10.103
Land Area (square miles), 2010 -0.0036 0.000 -11.821 0.000 -0.004 -0.003
Population Density (per square mile), 2010 4.739e-05 4.77e-06 9.939 0.000 3.8e-05 5.67e-05
Primary RUCA Code 2010_10.0 -1.5302 0.066 -23.249 0.000 -1.659 -1.401
Primary RUCA Code 2010_2.0 -0.4049 0.047 -8.656 0.000 -0.497 -0.313
Primary RUCA Code 2010_3.0 -0.7546 0.112 -6.727 0.000 -0.975 -0.535
Primary RUCA Code 2010_4.0 -1.5424 0.064 -24.006 0.000 -1.668 -1.416
Primary RUCA Code 2010_5.0 -1.5017 0.085 -17.688 0.000 -1.668 -1.335
Primary RUCA Code 2010_6.0 -0.9323 0.150 -6.225 0.000 -1.226 -0.639
Primary RUCA Code 2010_7.0 -1.6589 0.083 -20.065 0.000 -1.821 -1.497
Primary RUCA Code 2010_8.0 -2.0886 0.135 -15.471 0.000 -2.353 -1.824
Primary RUCA Code 2010_9.0 -1.4794 0.166 -8.914 0.000 -1.805 -1.154
==============================================================================================================
# Code from lecture
import numpy.random as rnd
def bootstrap_xy(X, y, fnc, w=None, B=1000, plot=True):
d = X.shape[1]
N = X.shape[0]
w_hat = fnc(