try:
import statsmodels.api as sm
except:
!pip install statsmodels==0.13.1
import statsmodels.api as sm
Collecting statsmodels==0.13.1
Downloading statsmodels-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
|████████████████████████████████| 9.8 MB 22.3 MB/s
Requirement already satisfied: numpy>=1.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.13.1) (1.19.5)
Collecting patsy>=0.5.2
Downloading patsy-0.5.2-py2.py3-none-any.whl (233 kB)
|████████████████████████████████| 233 kB 85.1 MB/s
Requirement already satisfied: scipy>=1.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.13.1) (1.7.2)
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.13.1) (1.2.5)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels==0.13.1) (1.16.0)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels==0.13.1) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels==0.13.1) (2021.3)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.2 statsmodels-0.13.1
WARNING: You are using pip version 20.1.1; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
pm25 = pd.read_csv('mich_pm25.csv')
pm25
ruca_ca = pd.read_csv('ruca2010revised.csv')
ruca_ca
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3173: DtypeWarning: Columns (0,3,4,5) have mixed types.Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
med_income_by_state = pd.read_csv('median_income_2011.csv')
med_income_by_state = med_income_by_state.head(51)
med_income_by_state['2011 Med Income'] = [int(i.replace(',', '')) for i in med_income_by_state['2011 Med Income']]
med_income_by_state['State'] = med_income_by_state['State'].str.strip()
mean_pm25_by_state = pd.read_csv("stateag_pm25_2011.csv")
mean_pm25_by_state = mean_pm25_by_state.drop(labels=['year', 'countyfips', 'ctfips', 'latitude', 'longitude', 'ds_pm_stdd'], axis=1)
chronic_disease = pd.read_csv("U.S._Chronic_Disease_Indicators__CDI_.csv")
chronic_disease
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3173: DtypeWarning: Columns (10) have mixed types.Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
ruca_ca.columns = ruca_ca.loc[0]
ruca_ca = ruca_ca.drop(0).reset_index()
ruca_ca = ruca_ca.drop(columns = 'index')
ruca_ca.to_csv('ruca2010.csv')
ruca_ca = ruca_ca[ruca_ca['Select State'] == 'MI']
ruca_ca
ruca_ca['State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)'] = ruca_ca['State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)'].astype(int)
pm25 = pm25.drop(columns=['statefips', 'countyfips', 'longitude', 'latitude'])
pm25 = pm25.drop(columns=['Unnamed: 0', 'year'])
merged_pm25 = pm25.merge(ruca_ca, how='inner', left_on='ctfips', right_on="State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)")
merged_pm25['urban'] = merged_pm25['Primary RUCA Code 2010'].astype(int) <= 6
merged_pm25['Primary RUCA Code 2010'] = merged_pm25['Primary RUCA Code 2010'].astype(int)
merged_grouped = merged_pm25.groupby('ctfips').agg(np.mean)
merged_grouped = merged_grouped[merged_grouped['Primary RUCA Code 2010'] < 99]
merged_pm25['Tract Population, 2010'] = merged_pm25['Tract Population, 2010'].str.replace(",","").astype(int)
merged_pm25['Population Density (per square mile), 2010'] = merged_pm25['Population Density (per square mile), 2010'].str.replace(",","").astype(float)
merged_grouped = merged_pm25.groupby('ctfips').agg(np.mean)
merged_grouped = merged_grouped[merged_grouped['Tract Population, 2010'] < 15000]
merged_grouped = merged_grouped[merged_grouped['Population Density (per square mile), 2010'] > 1000]
merged_grouped = merged_grouped[merged_grouped['Population Density (per square mile), 2010'] < 50000]
merged_2011 = merged_pm25[merged_pm25['date'].str.contains('2011')]
merged_2011.to_csv('ruca_merged_michigan.csv')
ruca_mi = pd.read_csv('ruca2010revised.csv')
ruca_mi.columns = ruca_mi.loc[0]
ruca_mi = ruca_mi.drop(0).reset_index()
ruca_mi = ruca_mi.drop(columns = 'index')
ruca_mi = ruca_mi[ruca_mi['Select State'] == 'MI']
ruca_mi_counts = ruca_mi['Primary RUCA Code 2010'].astype(int).value_counts(sort=False)
ruca_mi_counts = ruca_mi_counts.sort_index()
plt.bar(ruca_mi_counts.index.astype('str'), ruca_mi_counts.values)
plt.xlabel('Primary RUCA Code')
plt.ylabel('Count')
plt.title('Number of Census Tracts in Michigan with each RUCA Code');
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3173: DtypeWarning: Columns (0,3,4,5) have mixed types.Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
ruca_merged_mi = pd.read_csv('ruca_merged_michigan.csv')
ruca_merged_mi
pairplot_df = ruca_merged_mi.drop(columns=['Unnamed: 0','date', 'ctfips', 'ds_pm_stdd', 'State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)'])
merged_mi_2011 = ruca_merged_mi[ruca_merged_mi['date'].str.contains('2011')]
merged_mi_2011_grouped = merged_mi_2011.groupby('ctfips').agg(np.mean)
merged_mi_2011_grouped = merged_mi_2011_grouped[merged_mi_2011_grouped['Primary RUCA Code 2010'] < 99]
merged_mi_2011_grouped
pairplot_df = merged_mi_2011_grouped.drop(columns=['Unnamed: 0', 'ds_pm_stdd', 'State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)'])
sns.pairplot(pairplot_df.drop(['Primary RUCA Code 2010', 'Secondary RUCA Code, 2010 (see errata)'], axis=1))
non_par_df = pairplot_df.drop('Secondary RUCA Code, 2010 (see errata)', axis=1).dropna()
non_par_df['Primary RUCA Code 2010'] = non_par_df['Primary RUCA Code 2010'].astype(str)
X = non_par_df.drop(['ds_pm_pred'], axis=1)
y = non_par_df['ds_pm_pred']
X = pd.get_dummies(X)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=88)
X_train_2 = X_train.drop(['Primary RUCA Code 2010_1.0'], axis=1)
X_test_2 = X_test.drop(['Primary RUCA Code 2010_1.0'], axis=1)
X_train_3 = X_train_2.drop('Tract Population, 2010', axis=1)
X_test_3 = X_test_2.drop('Tract Population, 2010', axis=1)
import statsmodels.api as sm
import statsmodels.formula.api as smf
gaussian_model = sm.GLM(
y_train, sm.add_constant(X_train_2),
family=sm.families.Gaussian()
)
gaussian_results = gaussian_model.fit()
print(gaussian_results.summary())
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: ds_pm_pred No. Observations: 1934
Model: GLM Df Residuals: 1921
Model Family: Gaussian Df Model: 12
Link Function: identity Scale: 0.34379
Method: IRLS Log-Likelihood: -1705.2
Date: Tue, 14 Dec 2021 Deviance: 660.41
Time: 04:53:54 Pearson chi2: 660.
No. Iterations: 3 Pseudo R-squ. (CS): 0.8524
Covariance Type: nonrobust
==============================================================================================================
coef std err z P>|z| [0.025 0.975]
--------------------------------------------------------------------------------------------------------------
const 10.0672 0.041 246.339 0.000 9.987 10.147
Tract Population, 2010 -2.846e-06 8.77e-06 -0.325 0.745 -2e-05 1.43e-05
Land Area (square miles), 2010 -0.0036 0.000 -11.777 0.000 -0.004 -0.003
Population Density (per square mile), 2010 4.724e-05 4.79e-06 9.865 0.000 3.79e-05 5.66e-05
Primary RUCA Code 2010_10.0 -1.5332 0.066 -23.064 0.000 -1.663 -1.403
Primary RUCA Code 2010_2.0 -0.4046 0.047 -8.647 0.000 -0.496 -0.313
Primary RUCA Code 2010_3.0 -0.7555 0.112 -6.731 0.000 -0.976 -0.536
Primary RUCA Code 2010_4.0 -1.5419 0.064 -23.987 0.000 -1.668 -1.416
Primary RUCA Code 2010_5.0 -1.5024 0.085 -17.686 0.000 -1.669 -1.336
Primary RUCA Code 2010_6.0 -0.9337 0.150 -6.230 0.000 -1.227 -0.640
Primary RUCA Code 2010_7.0 -1.6591 0.083 -20.063 0.000 -1.821 -1.497
Primary RUCA Code 2010_8.0 -2.0929 0.136 -15.427 0.000 -2.359 -1.827
Primary RUCA Code 2010_9.0 -1.4821 0.166 -8.917 0.000 -1.808 -1.156
==============================================================================================================
import statsmodels.api as sm
import statsmodels.formula.api as smf
gaussian_model = sm.GLM(
y_train, sm.add_constant(X_train_3),
family=sm.families.Gaussian()
)
gaussian_results = gaussian_model.fit()
print(gaussian_results.summary())
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: ds_pm_pred No. Observations: 1934
Model: GLM Df Residuals: 1922
Model Family: Gaussian Df Model: 11
Link Function: identity Scale: 0.34363
Method: IRLS Log-Likelihood: -1705.3
Date: Tue, 14 Dec 2021 Deviance: 660.45
Time: 05:18:57 Pearson chi2: 660.
No. Iterations: 3 Pseudo R-squ. (CS): 0.8525
Covariance Type: nonrobust
==============================================================================================================
coef std err z P>|z| [0.025 0.975]
--------------------------------------------------------------------------------------------------------------
const 10.0565 0.024 419.678 0.000 10.010 10.103
Land Area (square miles), 2010 -0.0036 0.000 -11.821 0.000 -0.004 -0.003
Population Density (per square mile), 2010 4.739e-05 4.77e-06 9.939 0.000 3.8e-05 5.67e-05
Primary RUCA Code 2010_10.0 -1.5302 0.066 -23.249 0.000 -1.659 -1.401
Primary RUCA Code 2010_2.0 -0.4049 0.047 -8.656 0.000 -0.497 -0.313
Primary RUCA Code 2010_3.0 -0.7546 0.112 -6.727 0.000 -0.975 -0.535
Primary RUCA Code 2010_4.0 -1.5424 0.064 -24.006 0.000 -1.668 -1.416
Primary RUCA Code 2010_5.0 -1.5017 0.085 -17.688 0.000 -1.668 -1.335
Primary RUCA Code 2010_6.0 -0.9323 0.150 -6.225 0.000 -1.226 -0.639
Primary RUCA Code 2010_7.0 -1.6589 0.083 -20.065 0.000 -1.821 -1.497
Primary RUCA Code 2010_8.0 -2.0886 0.135 -15.471 0.000 -2.353 -1.824
Primary RUCA Code 2010_9.0 -1.4794 0.166 -8.914 0.000 -1.805 -1.154
==============================================================================================================
# Code from lecture
import numpy.random as rnd
def bootstrap_xy(X, y, fnc, w=None, B=1000, plot=True):
d = X.shape[1]
N = X.shape[0]
w_hat = fnc(X, y)
w_boot = np.zeros(shape=(B,d))
for b in range(B):
bootstrap_indices = rnd.choice(np.arange(N), N)
bootstrap_X = X.iloc[bootstrap_indices, :]
bootstrap_y = y.iloc[bootstrap_indices]
w_boot[b,:] = fnc(bootstrap_X, bootstrap_y)
if plot:
plt.scatter(w_boot[:,0], w_boot[:,1], c='b')
plt.scatter(w_hat[0], w_hat[1], c='r', marker='x', s=300)
if w:
plt.scatter(w[0], w[1], c='g', marker='x', s=300)
plt.show()
return w_boot
def lin_model(x, y):
model = sm.GLM(
y, x,
family=sm.families.Gaussian()
)
results = model.fit()
params = results.params
return params
w_gaussian_boot = bootstrap_xy(sm.add_constant(X_train_2.drop('Tract Population, 2010', axis=1)), y_train, lin_model, plot=False)
betas = w_gaussian_boot.std(axis = 0)
columns = sm.add_constant(X_train_2.drop('Tract Population, 2010', axis=1)).columns
for i in range(len(betas)):
col = columns[i]
beta = betas[i]
print(f"Bootstrap std error for {col}: {beta:.5f}")
Bootstrap std error for const: 0.04899
Bootstrap std error for Land Area (square miles), 2010: 0.00042
Bootstrap std error for Population Density (per square mile), 2010: 0.00001
Bootstrap std error for Primary RUCA Code 2010_10.0: 0.11134
Bootstrap std error for Primary RUCA Code 2010_2.0: 0.05017
Bootstrap std error for Primary RUCA Code 2010_3.0: 0.10514
Bootstrap std error for Primary RUCA Code 2010_4.0: 0.12249
Bootstrap std error for Primary RUCA Code 2010_5.0: 0.16628
Bootstrap std error for Primary RUCA Code 2010_6.0: 0.23740
Bootstrap std error for Primary RUCA Code 2010_7.0: 0.15764
Bootstrap std error for Primary RUCA Code 2010_8.0: 0.14866
Bootstrap std error for Primary RUCA Code 2010_9.0: 0.24617
from statsmodels.tools.eval_measures import rmse
gaussian_y_train = gaussian_results.predict(sm.add_constant(X_train_3))
gaussian_rmse_train = rmse(y_train, gaussian_y_train)
print('gaussian_rmse_train =', gaussian_rmse_train)
gaussian_y_test = gaussian_results.predict(sm.add_constant(X_test_3))
gaussian_mse_test = rmse(y_test, gaussian_y_test)
print('gaussian_rmse_test =', gaussian_rmse_test)
gaussian_rmse_train = 0.5843743438932951
gaussian_rmse_test = 0.5469140796656224
X_train = X_train.drop(['Tract Population, 2010'], axis=1)
X_test = X_test.drop(['Tract Population, 2010'], axis=1)
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model, tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
tree_regr = tree.DecisionTreeRegressor()
tree_regr.fit(X_train,y_train)
tree_train_y_pred = tree_regr.predict(X_train)
tree_train_rmse = mean_squared_error(y_train, tree_train_y_pred, squared=False)
print('Decision Tree train rmse = ', tree_train_rmse)
tree_train_mse = mean_squared_error(y_train, tree_train_y_pred)
print('Decision Tree train mse = ', tree_train_mse)
tree_train_r2_score = r2_score(y_train, tree_train_y_pred)
print('Decision Tree train r2_score = ', tree_train_r2_score)
tree_test_y_pred = tree_regr.predict(X_test)
tree_test_rmse = mean_squared_error(y_test, tree_test_y_pred, squared=False)
print('Decision Tree test rmse = ', tree_test_rmse)
tree_test_mse = mean_squared_error(y_test, tree_test_y_pred)
print('Decision Tree test mse = ', tree_test_mse)
tree_test_r2_score = r2_score(y_test, tree_test_y_pred)
print('Decision Tree test r2_score = ', tree_test_r2_score)
print('==================================================')
forest_regr = RandomForestRegressor()
forest_regr.fit(X_train, y_train)
forest_train_y_pred = forest_regr.predict(X_train)
forest_train_rmse = mean_squared_error(y_train, forest_train_y_pred, squared=False)
print('Random Forest train rmse = ', forest_train_rmse)
forest_train_mse = mean_squared_error(y_train, forest_train_y_pred)
print('Random Forest train mse = ', forest_train_mse)
forest_train_r2_score = r2_score(y_train, forest_train_y_pred)
print('Random Forest train r2_score = ', forest_train_r2_score)
forest_test_y_pred = forest_regr.predict(X_test)
forest_test_rmse = mean_squared_error(y_test, forest_test_y_pred, squared=False)
print('Random Forest test rmse = ', forest_test_rmse)
forest_test_mse = mean_squared_error(y_test, forest_test_y_pred)
print('Random Forest test mse = ', forest_test_mse)
forest_test_r2_score = r2_score(y_test, forest_test_y_pred)
print('Random Forest test r2_score = ', forest_test_r2_score)
Decision Tree train rmse = 0.0
Decision Tree train mse = 0.0
Decision Tree train r2_score = 1.0
Decision Tree test rmse = 0.7400620781521317
Decision Tree test mse = 0.5476918795188519
Decision Tree test r2_score = 0.40426237567621615
==================================================
Random Forest train rmse = 0.2156455612095654
Random Forest train mse = 0.04650300806938842
Random Forest train r2_score = 0.9534632581106105
Random Forest test rmse = 0.5239669349005022
Random Forest test mse = 0.27454134886902704
Random Forest test r2_score = 0.7013747746313781
med_income_by_state_eda = med_income_by_state.copy()
med_income_by_state_eda = med_income_by_state_eda.drop(['2010 Med Income', '2010 SE', '2011 SE'], axis=1)
med_income_by_state_eda['bins'] = np.digitize(med_income_by_state_eda['2011 Med Income'], range(10000, 70000, 10000))
med_income_by_state_eda['bins_str'] = [f"${i * 10}–{(i + 1) * 10}K" for i in med_income_by_state_eda['bins']]
income_bin_counts = med_income_by_state_eda['bins_str'].value_counts(sort=False)
plt.bar(income_bin_counts.index, income_bin_counts.values)
plt.xlabel('Income Bracket')
plt.ylabel('Count')
plt.title('Median Income in Different States');
copd = chronic_disease[chronic_disease['Topic'] == 'Chronic Obstructive Pulmonary Disease']
copd
copd_mortality = copd[copd['Question'] == 'Mortality with chronic obstructive pulmonary disease as underlying or contributing cause among adults aged >= 45 years']
copd_mortality_2011 = copd_mortality[(copd_mortality['YearStart'] == 2011) & (copd_mortality['YearEnd'] == 2011)]
copd_mortality_2011 = copd_mortality_2011[(copd_mortality_2011['DataValueUnit'] == 'cases per 100,000') & (copd_mortality_2011['DataValueType'] == 'Age-adjusted Rate')].dropna(subset=["DataValue"])
copd_mortality_2011
copd_mortality_2011_grouped = copd_mortality_2011.groupby('LocationDesc', as_index=False).agg(np.mean)
copd_mortality_2011_grouped
combined_copd_mortality_pm25 = mean_pm25_by_state.merge(copd_mortality_2011_grouped, how='inner', left_on='statefips', right_on='LocationID')
copd_mortality_pm25_income = combined_copd_mortality_pm25.merge(med_income_by_state, how='inner', left_on='LocationDesc', right_on='State')
copd_mortality_pm25_income = copd_mortality_pm25_income.drop(['Response', 'StratificationCategory2', 'Stratification2', 'StratificationCategory3','Stratification3', 'ResponseID', 'StratificationCategoryID2', 'StratificationID2','StratificationCategoryID3', 'StratificationID3', '2010 Med Income', '2010 SE', '2011 SE'], axis=1)
copd_mortality_pm25_income = copd_mortality_pm25_income.drop(['LocationID', 'LocationDesc'], axis=1)
copd_mortality_pm25_income_2011 = copd_mortality_pm25_income.drop(['YearStart', 'YearEnd'], axis=1)
copd_mortality_pm25_income_2011 = copd_mortality_pm25_income_2011.drop(['LowConfidenceLimit', 'HighConfidenceLimit'], axis=1)
copd_mortality_pm25_income_2011
copd_mortality_pm25_income_2011['elevated'] = copd_mortality_pm25_income_2011['ds_pm_pred'] > np.median(copd_mortality_pm25_income_2011['ds_pm_pred'])
sns.pairplot(copd_mortality_pm25_income_2011.drop('statefips', axis=1))
from sklearn.linear_model import LogisticRegression as LR
lr = LR(penalty='none', max_iter=200, random_state=0)
X = copd_mortality_pm25_income_2011['2011 Med Income'].values.reshape(-1, 1)
Y = copd_mortality_pm25_income_2011['DataValueAlt']
Z = copd_mortality_pm25_income_2011['elevated']
def estimate_treatment_effect(lr, X, Y, Z):
ex = lr.predict_proba(X)[:, 1]
return np.mean(Z * Y / ex) - np.mean((1 - Z) * Y / (1 - ex))
lr.fit(X, Z)
estimate_treatment_effect(lr, X, Y, Z)
estimate_treatment_effect(lr, X, Y, Z)