# Ensure statsmodels is available, then pull in the analysis stack.
# Bug fix: the original used a bare `except:` and the IPython-only
# `!pip install ...` shell magic, which is a syntax error in plain Python.
try:
    import statsmodels.api as sm
except ImportError:
    import subprocess
    import sys
    # Install the pinned version with the same interpreter running this script.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'statsmodels==0.13.1'])
    import statsmodels.api as sm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Raw inputs: daily PM2.5 for Michigan census tracts, RUCA codes, state
# median incomes, state-aggregated PM2.5, and CDC chronic-disease indicators.
pm25 = pd.read_csv('mich_pm25.csv')
pm25
ruca_ca = pd.read_csv('ruca2010revised.csv')
ruca_ca
# Keep the first 51 rows (50 states + DC) and clean the income columns:
# strip thousands separators, cast to int, and trim whitespace in names.
med_income_by_state = pd.read_csv('median_income_2011.csv').head(51)
med_income_by_state['2011 Med Income'] = (
    med_income_by_state['2011 Med Income'].str.replace(',', '', regex=False).astype(int)
)
med_income_by_state['State'] = med_income_by_state['State'].str.strip()
# State-level PM2.5 means, minus columns not needed for the state-level join.
mean_pm25_by_state = pd.read_csv("stateag_pm25_2011.csv").drop(
    columns=['year', 'countyfips', 'ctfips', 'latitude', 'longitude', 'ds_pm_stdd']
)
chronic_disease = pd.read_csv("U.S._Chronic_Disease_Indicators__CDI_.csv")
chronic_disease
# The first data row of the RUCA file holds the real header: promote it,
# drop that row, and renumber the index.
ruca_ca.columns = ruca_ca.loc[0]
ruca_ca = ruca_ca.drop(0).reset_index(drop=True)
ruca_ca.to_csv('ruca2010.csv')
# Restrict to Michigan tracts.
ruca_ca = ruca_ca[ruca_ca['Select State'] == 'MI']
ruca_ca
# The tract FIPS column must be integer so it can join against `ctfips`.
ruca_ca['State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)'] = ruca_ca['State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)'].astype(int)
# Drop location/bookkeeping columns from the PM2.5 table before joining.
pm25 = pm25.drop(columns=['statefips', 'countyfips', 'longitude', 'latitude', 'Unnamed: 0', 'year'])
merged_pm25 = pm25.merge(
    ruca_ca,
    how='inner',
    left_on='ctfips',
    right_on="State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)",
)
# Flag tracts with a primary RUCA code of 6 or less as "urban"
# (presumably codes 1-6 = metro/micropolitan — confirm against the codebook).
merged_pm25['Primary RUCA Code 2010'] = merged_pm25['Primary RUCA Code 2010'].astype(int)
merged_pm25['urban'] = merged_pm25['Primary RUCA Code 2010'] <= 6
# Tract-level means of the merged frame.
# NOTE(review): this first aggregation (and its RUCA < 99 filter) is
# recomputed and overwritten a few lines below, after the string columns
# are converted — the result here is effectively superseded.
merged_grouped = merged_pm25.groupby('ctfips').agg(np.mean)
# Primary RUCA code 99 marks not-coded tracts; drop them.
merged_grouped = merged_grouped[merged_grouped['Primary RUCA Code 2010'] < 99]
# Population and density are strings with thousands separators in the raw
# file; strip the commas and cast so they can be averaged.
merged_pm25['Tract Population, 2010'] = merged_pm25['Tract Population, 2010'].str.replace(",","").astype(int)
merged_pm25['Population Density (per square mile), 2010'] = merged_pm25['Population Density (per square mile), 2010'].str.replace(",","").astype(float)
# Re-aggregate now that the numeric columns are usable.
# NOTE(review): unlike the pass above, this one does not re-apply the
# RUCA < 99 filter — confirm whether that was intentional.
merged_grouped = merged_pm25.groupby('ctfips').agg(np.mean)
# Trim extreme tracts: population below 15k, density between 1k and 50k/sq mi.
merged_grouped = merged_grouped[merged_grouped['Tract Population, 2010'] < 15000]
merged_grouped = merged_grouped[merged_grouped['Population Density (per square mile), 2010'] > 1000]
merged_grouped = merged_grouped[merged_grouped['Population Density (per square mile), 2010'] < 50000]
# Keep only 2011 observations (string match on the date column) and persist
# the merged Michigan table for the modeling section below.
merged_2011 = merged_pm25[merged_pm25['date'].str.contains('2011')]
merged_2011.to_csv('ruca_merged_michigan.csv')
# Re-read the RUCA file, promote the embedded header row, and keep Michigan.
ruca_mi = pd.read_csv('ruca2010revised.csv')
ruca_mi.columns = ruca_mi.loc[0]
ruca_mi = ruca_mi.drop(0).reset_index(drop=True)
ruca_mi = ruca_mi[ruca_mi['Select State'] == 'MI']
# Tract counts per primary RUCA code, ordered by code value.
ruca_mi_counts = (
    ruca_mi['Primary RUCA Code 2010'].astype(int).value_counts(sort=False).sort_index()
)
plt.bar(ruca_mi_counts.index.astype('str'), ruca_mi_counts.values)
plt.xlabel('Primary RUCA Code')
plt.ylabel('Count')
plt.title('Number of Census Tracts in Michigan with each RUCA Code');
# Reload the merged Michigan table written above.
ruca_merged_mi = pd.read_csv('ruca_merged_michigan.csv')
ruca_merged_mi
# NOTE(review): this pairplot_df is overwritten below (from the grouped
# 2011 frame) before it is ever used — dead assignment.
pairplot_df = ruca_merged_mi.drop(columns=['Unnamed: 0','date', 'ctfips', 'ds_pm_stdd', 'State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)'])
# Restrict to 2011 rows and average each tract over the year.
merged_mi_2011 = ruca_merged_mi[ruca_merged_mi['date'].str.contains('2011')]
merged_mi_2011_grouped = merged_mi_2011.groupby('ctfips').agg(np.mean)
# Drop not-coded tracts (primary RUCA code 99).
merged_mi_2011_grouped = merged_mi_2011_grouped[merged_mi_2011_grouped['Primary RUCA Code 2010'] < 99]
merged_mi_2011_grouped
pairplot_df = merged_mi_2011_grouped.drop(columns=['Unnamed: 0', 'ds_pm_stdd', 'State-County-Tract FIPS Code (lookup by address at http://www.ffiec.gov/Geocode/)'])
# Pairwise scatter of the remaining numeric features (RUCA codes excluded).
sns.pairplot(pairplot_df.drop(['Primary RUCA Code 2010', 'Secondary RUCA Code, 2010 (see errata)'], axis=1))
# Build the modeling frame: drop the secondary code, remove rows with
# missing values, and treat the primary RUCA code as categorical.
non_par_df = pairplot_df.drop('Secondary RUCA Code, 2010 (see errata)', axis=1).dropna()
non_par_df['Primary RUCA Code 2010'] = non_par_df['Primary RUCA Code 2010'].astype(str)
# Target is the predicted PM2.5 value; everything else is a feature.
X = non_par_df.drop(['ds_pm_pred'], axis=1)
y = non_par_df['ds_pm_pred']
# One-hot encode the categorical RUCA column.
X = pd.get_dummies(X)
from sklearn.model_selection import train_test_split

# 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=88)
# Drop one RUCA dummy (presumably as the reference category — confirm),
# then also drop tract population for the reduced design.
X_train_2 = X_train.drop(columns=['Primary RUCA Code 2010_1.0'])
X_test_2 = X_test.drop(columns=['Primary RUCA Code 2010_1.0'])
X_train_3 = X_train_2.drop(columns=['Tract Population, 2010'])
X_test_3 = X_test_2.drop(columns=['Tract Population, 2010'])
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Gaussian GLM (equivalent to OLS) on the design that still includes
# tract population; add_constant supplies the intercept column.
gaussian_model = sm.GLM(y_train, sm.add_constant(X_train_2), family=sm.families.Gaussian())
gaussian_results = gaussian_model.fit()
print(gaussian_results.summary())
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Same Gaussian GLM on the reduced design (tract population removed);
# this re-binds gaussian_results, which the RMSE evaluation below uses.
gaussian_model = sm.GLM(y_train, sm.add_constant(X_train_3), family=sm.families.Gaussian())
gaussian_results = gaussian_model.fit()
print(gaussian_results.summary())
# Code from lecture
import numpy.random as rnd


def bootstrap_xy(X, y, fnc, w=None, B=1000, plot=True):
    """Bootstrap the estimator `fnc` over paired rows of (X, y).

    Parameters
    ----------
    X : pandas.DataFrame of shape (N, d) — design matrix (rows taken via .iloc).
    y : pandas.Series of length N — response, resampled with the same indices.
    fnc : callable (X, y) -> length-d coefficient vector.
    w : optional length-d true coefficient vector to mark on the plot.
    B : number of bootstrap resamples.
    plot : if True, scatter the first two coordinates of each resampled fit.

    Returns
    -------
    numpy.ndarray of shape (B, d); row b holds the estimate on resample b.
    """
    d = X.shape[1]
    N = X.shape[0]
    w_hat = fnc(X, y)
    w_boot = np.zeros(shape=(B, d))
    for b in range(B):
        # Sample N row indices with replacement.
        bootstrap_indices = rnd.choice(np.arange(N), N)
        bootstrap_X = X.iloc[bootstrap_indices, :]
        bootstrap_y = y.iloc[bootstrap_indices]
        w_boot[b, :] = fnc(bootstrap_X, bootstrap_y)
    if plot:
        plt.scatter(w_boot[:, 0], w_boot[:, 1], c='b')
        plt.scatter(w_hat[0], w_hat[1], c='r', marker='x', s=300)
        # Bug fix: `if w:` is False for w=None but also raises ValueError
        # when w is a numpy array; test against None explicitly.
        if w is not None:
            plt.scatter(w[0], w[1], c='g', marker='x', s=300)
        plt.show()
    return w_boot
def lin_model(x, y):
    """Fit a Gaussian GLM of y on x and return its coefficient vector."""
    fitted = sm.GLM(y, x, family=sm.families.Gaussian()).fit()
    return fitted.params
# Bootstrap the reduced design (constant + X_train_2 without tract
# population) and report the std deviation of each coefficient across
# resamples as its bootstrap standard error.
w_gaussian_boot = bootstrap_xy(
    sm.add_constant(X_train_2.drop('Tract Population, 2010', axis=1)),
    y_train,
    lin_model,
    plot=False,
)
betas = w_gaussian_boot.std(axis=0)
columns = sm.add_constant(X_train_2.drop('Tract Population, 2010', axis=1)).columns
for col, beta in zip(columns, betas):
    print(f"Bootstrap std error for {col}: {beta:.5f}")
from statsmodels.tools.eval_measures import rmse

# Training-set RMSE of the reduced Gaussian GLM (gaussian_results was fit
# on sm.add_constant(X_train_3)).
gaussian_y_train = gaussian_results.predict(sm.add_constant(X_train_3))
gaussian_rmse_train = rmse(y_train, gaussian_y_train)
print('gaussian_rmse_train =', gaussian_rmse_train)
# Test-set RMSE.
gaussian_y_test = gaussian_results.predict(sm.add_constant(X_test_3))
# Bug fix: this RMSE was assigned to `gaussian_mse_test` while the print
# referenced the undefined name `gaussian_rmse_test`, raising a NameError.
gaussian_rmse_test = rmse(y_test, gaussian_y_test)
print('gaussian_rmse_test =', gaussian_rmse_test)
# Tree-based models use the design without tract population.
X_train = X_train.drop(columns=['Tract Population, 2010'])
X_test = X_test.drop(columns=['Tract Population, 2010'])
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model, tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


def _split_metrics(y_true, y_pred):
    """Return (rmse, mse, r2) for one set of predictions."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse), mse, r2_score(y_true, y_pred)


# Single decision tree with default hyperparameters (unseeded).
tree_regr = tree.DecisionTreeRegressor()
tree_regr.fit(X_train, y_train)
tree_train_y_pred = tree_regr.predict(X_train)
tree_train_rmse, tree_train_mse, tree_train_r2_score = _split_metrics(y_train, tree_train_y_pred)
print('Decision Tree train rmse = ', tree_train_rmse)
print('Decision Tree train mse = ', tree_train_mse)
print('Decision Tree train r2_score = ', tree_train_r2_score)
tree_test_y_pred = tree_regr.predict(X_test)
tree_test_rmse, tree_test_mse, tree_test_r2_score = _split_metrics(y_test, tree_test_y_pred)
print('Decision Tree test rmse = ', tree_test_rmse)
print('Decision Tree test mse = ', tree_test_mse)
print('Decision Tree test r2_score = ', tree_test_r2_score)
print('==================================================')
# Random forest with default hyperparameters (unseeded).
forest_regr = RandomForestRegressor()
forest_regr.fit(X_train, y_train)
forest_train_y_pred = forest_regr.predict(X_train)
forest_train_rmse, forest_train_mse, forest_train_r2_score = _split_metrics(y_train, forest_train_y_pred)
print('Random Forest train rmse = ', forest_train_rmse)
print('Random Forest train mse = ', forest_train_mse)
print('Random Forest train r2_score = ', forest_train_r2_score)
forest_test_y_pred = forest_regr.predict(X_test)
forest_test_rmse, forest_test_mse, forest_test_r2_score = _split_metrics(y_test, forest_test_y_pred)
print('Random Forest test rmse = ', forest_test_rmse)
print('Random Forest test mse = ', forest_test_mse)
print('Random Forest test r2_score = ', forest_test_r2_score)
# EDA: bucket 2011 state median incomes into $10K-wide brackets.
med_income_by_state_eda = med_income_by_state.copy().drop(
    columns=['2010 Med Income', '2010 SE', '2011 SE']
)
# Edges at $10K..$60K; np.digitize returns the bracket index (0 = below $10K).
med_income_by_state_eda['bins'] = np.digitize(
    med_income_by_state_eda['2011 Med Income'], range(10000, 70000, 10000)
)
med_income_by_state_eda['bins_str'] = [
    f"${b * 10}–{(b + 1) * 10}K" for b in med_income_by_state_eda['bins']
]
income_bin_counts = med_income_by_state_eda['bins_str'].value_counts(sort=False)
plt.bar(income_bin_counts.index, income_bin_counts.values)
plt.xlabel('Income Bracket')
plt.ylabel('Count')
plt.title('Median Income in Different States');
# Narrow the CDC indicators to COPD mortality among adults >= 45, 2011 only.
copd = chronic_disease[chronic_disease['Topic'] == 'Chronic Obstructive Pulmonary Disease']
copd
copd_mortality = copd[copd['Question'] == 'Mortality with chronic obstructive pulmonary disease as underlying or contributing cause among adults aged >= 45 years']
copd_mortality_2011 = copd_mortality[(copd_mortality['YearStart'] == 2011) & (copd_mortality['YearEnd'] == 2011)]
# Keep one consistent measurement: age-adjusted rate per 100k, with a value.
copd_mortality_2011 = copd_mortality_2011[(copd_mortality_2011['DataValueUnit'] == 'cases per 100,000') & (copd_mortality_2011['DataValueType'] == 'Age-adjusted Rate')].dropna(subset=["DataValue"])
copd_mortality_2011
# Average across the remaining stratifications to get one row per state.
copd_mortality_2011_grouped = copd_mortality_2011.groupby('LocationDesc', as_index=False).agg(np.mean)
copd_mortality_2011_grouped
# Join state PM2.5 onto COPD mortality.
# NOTE(review): this matches `statefips` to `LocationID` — presumably the
# CDC LocationID equals the state FIPS code; verify against the codebook.
combined_copd_mortality_pm25 = mean_pm25_by_state.merge(copd_mortality_2011_grouped, how='inner', left_on='statefips', right_on='LocationID')
# Add state median income by state name.
copd_mortality_pm25_income = combined_copd_mortality_pm25.merge(med_income_by_state, how='inner', left_on='LocationDesc', right_on='State')
# Drop stratification/bookkeeping columns not used downstream.
copd_mortality_pm25_income = copd_mortality_pm25_income.drop(['Response', 'StratificationCategory2', 'Stratification2', 'StratificationCategory3','Stratification3', 'ResponseID', 'StratificationCategoryID2', 'StratificationID2','StratificationCategoryID3', 'StratificationID3', '2010 Med Income', '2010 SE', '2011 SE'], axis=1)
copd_mortality_pm25_income = copd_mortality_pm25_income.drop(['LocationID', 'LocationDesc'], axis=1)
copd_mortality_pm25_income_2011 = copd_mortality_pm25_income.drop(['YearStart', 'YearEnd'], axis=1)
copd_mortality_pm25_income_2011 = copd_mortality_pm25_income_2011.drop(['LowConfidenceLimit', 'HighConfidenceLimit'], axis=1)
copd_mortality_pm25_income_2011
# "Treatment" indicator: PM2.5 above the cross-state median.
copd_mortality_pm25_income_2011['elevated'] = copd_mortality_pm25_income_2011['ds_pm_pred'] > np.median(copd_mortality_pm25_income_2011['ds_pm_pred'])
sns.pairplot(copd_mortality_pm25_income_2011.drop('statefips', axis=1))
from sklearn.linear_model import LogisticRegression as LR

# Propensity model: unpenalized logistic regression of the "elevated PM2.5"
# indicator on state median income.
lr = LR(penalty='none', max_iter=200, random_state=0)
X = copd_mortality_pm25_income_2011['2011 Med Income'].to_numpy().reshape(-1, 1)
Y = copd_mortality_pm25_income_2011['DataValueAlt']
Z = copd_mortality_pm25_income_2011['elevated']
def estimate_treatment_effect(lr, X, Y, Z):
    """Inverse-propensity-weighted estimate of the average treatment effect.

    lr supplies propensity scores via predict_proba(X)[:, 1]; Z is the
    treatment indicator and Y the outcome.
    """
    propensity = lr.predict_proba(X)[:, 1]
    treated_term = np.mean(Z * Y / propensity)
    control_term = np.mean((1 - Z) * Y / (1 - propensity))
    return treated_term - control_term
# Fit the propensity model, then compute the IPW treatment-effect estimate.
lr.fit(X, Z)
# Bug fix: the estimate was computed twice back-to-back (a notebook
# artifact); one call suffices and the second was redundant.
estimate_treatment_effect(lr, X, Y, Z)