import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
# import geopandas as gpd
# from shapely.geometry import Point
# from shapely import wkt
# import geopandas
# try:
# from pymc3 import *
# import pymc3 as pm
# except:
# ! pip install pymc3
# from pymc3 import *
# import pymc3 as pm
# import arviz as az
from pymc3 import glm
# from scipy.stats import poisson, norm, gamma
%matplotlib inline
## Reading into UNFILTERED Chronic Disease Indicator dataset.
# low_memory=False: the file triggers a DtypeWarning (column 10 has mixed
# types) when read in chunks; reading it in one pass infers a single dtype.
us_cdi = pd.read_csv('us_cdi.csv', low_memory=False)
# (pasted notebook output) DtypeWarning from the read_csv above:
# /shared-libs/python3.7/py-core/lib/python3.7/site-packages/IPython/core/interactiveshell.py:3173: DtypeWarning: Columns (10) have mixed types. Specify dtype option on import or set low_memory=False.
#   interactivity=interactivity, compiler=compiler, result=result)
#all values in dropped col are null -> drop
# NOTE(review): the original had a bare `us_cdi[us_cdi['Response'].notnull()]`
# expression here whose result was discarded (a no-op); removed as dead code.
us_cdi = us_cdi.drop(['Response','StratificationCategory2','Stratification2','StratificationCategory3','Stratification3','StratificationCategoryID2','StratificationID2','StratificationCategoryID3','StratificationID3','ResponseID'], axis=1)
us_cdi.head()
# Restrict to age-adjusted mortality from total cardiovascular diseases,
# stratified by race/ethnicity, with a non-null data value.
df = us_cdi.loc[us_cdi['Topic'] == 'Cardiovascular Disease']
df = df.loc[(df['Question'] == 'Mortality from total cardiovascular diseases') & (df['Topic'] == 'Cardiovascular Disease')]
df = df.loc[(df['DataValueType'] == 'Age-adjusted Rate')]
df = df.loc[(df['StratificationCategory1'] == 'Race/Ethnicity')]
df = df[df['DataValue'].notna()]
# Drop identifier/metadata columns not used in the analysis.
df = df.drop(['YearEnd', 'LocationAbbr', 'DataSource', 'Topic', 'Question', 'DataValueUnit', 'DataValueAlt', 'DataValueType', 'DataValueFootnoteSymbol','DatavalueFootnote', 'LowConfidenceLimit', 'HighConfidenceLimit', 'StratificationCategory1','GeoLocation', 'LocationID', 'TopicID', 'QuestionID', 'DataValueTypeID', 'StratificationCategoryID1','StratificationID1'], axis=1)
# df.head()
#Diagnosed Diabetes; Total; Adults Aged 18+ Years; Age-Adjusted Percentage; U.S. States; 2008-2018
diabetes = pd.read_csv('DiabetesAtlasData.csv')
# Drop the last three rows (per the comment below, US Territories /
# trailing rows) — NOTE(review): confirm against the CSV layout.
diabetes_drop = diabetes.iloc[:-3, :]
diabetes_drop.isna().sum()
diabetes_drop.head()
## Insert Diabetes DF
diabetes_copy = diabetes_drop.copy() ## Copy diabetes rates without US Territories
# NOTE(review): this assignment is a no-op — .copy() already carried 'State' over.
diabetes_copy['State'] = diabetes_drop['State']
# NOTE(review): `df2` is not defined anywhere in this file — it presumably
# comes from a notebook cell (race-label cleanup of `df`) not included here.
df2_diabetes = df2.copy() ## Copy Original DF (with race labels updated)
df2_diabetes['Diabetes Rate'] = np.nan ## Insert Empty Column for Diabetes
def diabetes_match(row, lookup=None):
    """Return the diabetes rate for a row's state and year.

    Parameters: `row` must expose 'YearStart' and 'LocationDesc';
    `lookup` is a table with a 'State' column and one *string* column per
    year (defaults to the module-level `diabetes_copy`, so existing callers
    are unchanged).
    """
    if lookup is None:
        lookup = diabetes_copy
    correct_date = row['YearStart']
    correct_state = row['LocationDesc']
    # NOTE(review): substring match — e.g. 'Kansas' also matches 'Arkansas',
    # and .values[0] takes the first hit; confirm exact equality isn't needed.
    return lookup[lookup['State'].str.contains(correct_state)][str(correct_date)].values[0]
# Fill the 'Diabetes Rate' column by matching each row's state/year.
df2_diabetes['Diabetes Rate'] = df2_diabetes.apply(lambda row: diabetes_match(row), axis = 1)
df2_diabetes.head()
# Tobacco tax-burden data: keep only per-capita cigarette pack sales.
tobacco = pd.read_csv('The_Tax_Burden_on_Tobacco__1970-2019.csv', usecols=[ 1, 2, 5, 6, 7])
tobacco = tobacco[tobacco["SubMeasureDesc"] == "Cigarette Consumption (Pack Sales Per Capita)"]
# National total per year, used as the denominator for each state's share.
yearly_consumption = tobacco.groupby('Year')['Data_Value'].sum()
yearly_consumption = yearly_consumption.reset_index(name = "Year Sum")
combine = tobacco.merge(yearly_consumption, how='left', on='Year')
tobacco_consumption = combine
# Each state's share of the national per-capita pack sales for that year.
tobacco_consumption['Tobacco Consumption Based on US Total'] = (tobacco_consumption['Data_Value'] / tobacco_consumption['Year Sum'])
tobacco_consumption = tobacco_consumption.sort_values(['Year', 'LocationDesc'], ascending = True)
tobacco_consumption = tobacco_consumption.drop(columns =['Year Sum', 'Data_Value', 'SubMeasureDesc', 'MeasureDesc' ])
# Wide format: one row per state, one integer-labelled column per year.
tobacco_consumption = tobacco_consumption.pivot(index='LocationDesc', columns='Year', values='Tobacco Consumption Based on US Total').reset_index()
tobacco_consumption = tobacco_consumption.rename(columns={'LocationDesc':'State'})
tobacco_consumption = tobacco_consumption[['State', 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]]
# Convert the fractional shares to percentages.
tobacco_consumption.iloc[:,[1,2,3,4,5,6,7,8,9]] = tobacco_consumption.iloc[:,[1,2,3,4,5,6,7,8,9]]*100
tobacco_consumption.head()
## Insert Tobacco into DF
tobacco_copy = tobacco_consumption.copy() ## Copy tobacco consump. rate
# NOTE(review): no-op — .copy() already includes the 'State' column.
tobacco_copy['State'] = tobacco_consumption['State']
df3_tobacco = df2_diabetes.copy() ## Copy DF with added diabetes from before
df3_tobacco['Tobacco Consumption Rate'] = np.nan #Insert empty vals into col
def tobacco_match(row, lookup=None):
    """Return the tobacco-consumption share for a row's state and year.

    Parameters: `row` must expose 'YearStart' and 'LocationDesc';
    `lookup` is a wide table with a 'State' column and one *integer* column
    per year (defaults to the module-level `tobacco_copy`, so existing
    callers are unchanged).
    """
    if lookup is None:
        lookup = tobacco_copy
    correct_date = row['YearStart']
    correct_state = row['LocationDesc']
    # NOTE(review): substring match ('Kansas' also matches 'Arkansas') and
    # .values[0] takes the first hit; confirm exact equality isn't needed.
    return lookup[lookup['State'].str.contains(correct_state)][(correct_date)].values[0]
# Fill the tobacco column by matching each row's state/year.
df3_tobacco['Tobacco Consumption Rate'] = df3_tobacco.apply(lambda row: tobacco_match(row), axis = 1)
df3_tobacco.head()
#obesity rate by state 2011-2020
# NOTE(review): redundant re-import — pandas is already imported at the top.
import pandas as pd
obesity = pd.read_csv('ExportCSV.csv')
# Keep only year, state, and the obesity rate value.
obesity_clean = obesity[['YearStart','LocationDesc','Data_Value']]#,'Low_Confidence_Limit','High_Confidence_Limit']]
obesity_clean = obesity_clean.rename(columns={'Data_Value':'Obesity_Rate'})#,'Low_Confidence_Limit':'Obesity_Low_Conf_Limit','High_Confidence_Limit':'Obesity_High_Conf_Limit'})
obesity_clean2 = obesity_clean.copy()
# obesity_clean3 = obesity_clean2.copy()
# Wide format: one row per state, one integer-labelled column per year.
obesity_clean2 = obesity_clean2.pivot(index='LocationDesc',columns = 'YearStart', values = 'Obesity_Rate').reset_index()
obesity_clean2 = obesity_clean2.rename(columns={'LocationDesc':'State'})
obesity_clean2 = obesity_clean2[['State', 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]]
obesity_clean2.head()
# Insert Obesity into DF
# Extend the tobacco-augmented frame with an obesity-rate column (below).
df4_obesity = df3_tobacco.copy() ## Copy
def obesity_match(row, lookup=None):
    """Return the obesity rate for a row's state and year.

    Parameters: `row` must expose 'YearStart' and 'LocationDesc';
    `lookup` is a wide table with a 'State' column and one *integer* column
    per year (defaults to the module-level `obesity_clean2`).

    BUG FIX: the original referenced `df_obesity`, which is never defined in
    this file; the obesity lookup table built above is `obesity_clean2`.
    """
    if lookup is None:
        lookup = obesity_clean2
    correct_date = row['YearStart']
    correct_state = row['LocationDesc']
    # NOTE(review): substring match ('Kansas' also matches 'Arkansas') and
    # .values[0] takes the first hit; confirm exact equality isn't needed.
    return (lookup[lookup['State'].str.contains(correct_state)][correct_date].values[0])
# Fill the 'Obesity Rate' column by matching each row's state/year.
df4_obesity['Obesity Rate'] = df4_obesity.apply(lambda row: obesity_match(row), axis = 1)
df4_obesity.head()
#Cleaning Air Quality Data for Q1
# EPA daily PM2.5 (parameter 88101) files, one per year 2010-2018.
daily10 = pd.read_csv('daily_88101_2010.csv')
daily11 = pd.read_csv('daily_88101_2011.csv')
daily12 = pd.read_csv('daily_88101_2012.csv')
daily13 = pd.read_csv('daily_88101_2013.csv')
daily14 = pd.read_csv('daily_88101_2014.csv')
daily15 = pd.read_csv('daily_88101_2015.csv')
daily16 = pd.read_csv('daily_88101_2016.csv')
daily17 = pd.read_csv('daily_88101_2017.csv')
daily18 = pd.read_csv('daily_88101_2018.csv')
# Inspect which sample durations are present (informs the filter in
# get_year_average below).
daily18['Sample Duration'].value_counts()
#Cleaning pm2.5 data
#daily18[daily18['Sample Duration'] == '24 HOUR']
import datetime
def get_year_average(df):
    """Collapse one year of daily PM2.5 readings into per-state annual means.

    Keeps only the '24-HR BLK AVG' sample-duration rows, derives a 'Year'
    column from 'Date Local', and averages the numeric columns
    ('Arithmetic Mean', 'AQI', 'Year') grouped by state.
    Returns a DataFrame indexed by 'State Name'.
    """
    #State Name, POC, Parameter Name, Sample Duration, Pollutant Standard, Date Local , Arithmetic Mean, AQI
    # Filter and select in one .loc, then .copy() so the 'Year' assignment
    # below writes to an independent frame (avoids SettingWithCopyWarning).
    df = df.loc[df['Sample Duration'] == '24-HR BLK AVG',
                ['State Name', 'Parameter Name', 'Sample Duration', 'Pollutant Standard', 'Date Local', 'Arithmetic Mean', 'AQI']].copy()
    df['Year'] = pd.DatetimeIndex(df['Date Local']).year
    # numeric_only=True: old pandas silently dropped the string columns when
    # averaging; modern pandas raises a TypeError without it.
    df = df.groupby('State Name').mean(numeric_only=True)
    return df
# Per-state annual PM2.5 means for each year 2010-2018.
annual18 = get_year_average(daily18)
annual17 = get_year_average(daily17)
annual16 = get_year_average(daily16)
annual15 = get_year_average(daily15)
annual14 = get_year_average(daily14)
annual13 = get_year_average(daily13)
annual12 = get_year_average(daily12)
annual11 = get_year_average(daily11)
annual10 = get_year_average(daily10)
# Reads and creates data frame from population
pop_sex = pd.read_csv('pop_sex.csv')
pop_sex.head()
# NOTE(review): the header is taken from row index 1, but the data slice
# below starts at values[1:], so that same row is also kept as the first
# data row — confirm whether values[2:] was intended.
sex_header = pop_sex.iloc[1]
pop_sex_df = pd.DataFrame(pop_sex.values[1:], columns=sex_header)
# Keep only the 2010-2018 Male columns; drop totals and out-of-range years.
pop_sex_df = pop_sex_df.drop(['2008__Male', '2008__Female', '2009__Male', '2009__Female', '2019__Male', '2019__Female', '2019__Total', 'Footnotes'], axis = 1)
pop_sex_df = pop_sex_df.drop(['2008__Total', '2009__Total', '2010__Total', '2011__Total', '2012__Total', '2013__Total', '2014__Total', '2015__Total', '2016__Total', '2017__Total', '2018__Total'], axis = 1)
pop_sex_df = pop_sex_df.sort_values('Location').dropna()
pop_sex_df_male = pop_sex_df[['Location', '2010__Male', '2011__Male', '2012__Male', '2013__Male', '2014__Male', '2015__Male', '2016__Male', '2017__Male', '2018__Male']]
# Rename to 'State' plus integer year columns to match the other lookups.
pop_sex_df_male = pop_sex_df_male.rename(columns={'Location': 'State', '2010__Male': 2010, '2011__Male': 2011, '2012__Male':2012, '2013__Male':2013, '2014__Male':2014, '2015__Male':2015, '2016__Male':2016, '2017__Male':2017, '2018__Male':2018})
pop_sex_df_male
## Convert each year's male-share column to a numeric percentage and
## mean-impute values that failed to parse. One loop replaces the original
## 18 near-identical statements; columns are independent, so the per-column
## result is unchanged.
for _year in range(2010, 2019):
    _col = pd.to_numeric(pop_sex_df_male[_year], errors = 'coerce') * 100
    pop_sex_df_male[_year] = _col.fillna(_col.mean())
# Sanity check: no NaNs should remain.
pop_sex_df_male.isna().sum()
## Insert Sex into DF
# Lookup table of male-population percentage by state/year.
sex_copy = pop_sex_df_male.copy()
df4_sex = df4_obesity.copy()
df4_sex['Male Sex %'] = np.nan
def sex_match(row, lookup=None):
    """Return the male-population percentage for a row's state and year.

    Parameters: `row` must expose 'YearStart' and 'LocationDesc';
    `lookup` is a wide table with a 'State' column and one *integer* column
    per year (defaults to the module-level `sex_copy`, so existing callers
    are unchanged).
    """
    if lookup is None:
        lookup = sex_copy
    correct_date = row['YearStart']
    correct_state = row['LocationDesc']
    # NOTE(review): substring match ('Kansas' also matches 'Arkansas') and
    # .values[0] takes the first hit; confirm exact equality isn't needed.
    return (lookup[lookup['State'].str.contains(correct_state)][correct_date].values[0])
# Fill the 'Male Sex %' column by matching each row's state/year.
df4_sex['Male Sex %'] = df4_sex.apply(lambda row: sex_match(row), axis = 1)
df4_sex.head()
# Table 23 extracted from XLSX file: Medicare Per Enrollee (Dollars per Person) + Cleaning
health23 = pd.read_csv('Table 23 Medicare Per Enrollee.csv')
# Drop the trailing two rows and last two columns (export artifacts).
health23.drop(health23.tail(2).index,inplace=True)
health23 = health23.iloc[: , :-2]
new_header23 = health23.iloc[0] #grab the first row for the header
health23 = health23[1:] #take the data less the header row
health23.columns = new_header23 #set the header row as the df header
# Keep only the 2010+ year columns.
health23_yr = health23.drop(['1991', '1992','1993', '1994', '1995',
'1996', '1997', '1998', '1999', '2000',
'2001', '2002', '2003', '2004', '2005',
'2006', '2007', '2008', '2009'], axis = 1)
health23_yr = health23_yr.iloc[:, : -1] #removes avg growth column
health23_yr= health23_yr.rename(columns={'Region/state of residence':'State'})
health23_yr.head()
#Add Medicare Expenditure as new column to make new df
medicare = health23_yr.copy()
df4_medicare = df4_sex.copy()
df4_medicare['Medicare Expenditure'] = np.nan
# Strip '$' and ',' from the dollar figures and convert to float.
# FIX: raw string — '\$' in a plain literal is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning on modern Python); the pattern itself
# is unchanged.
medicare = medicare[medicare.columns[1:]].replace(r'[\$,]', '', regex=True).astype(float)
#estimates for 2015-2018 based on 1.7% growth rate annually in expenditures
medicare['2015'] = medicare['2014'] * 1.017
medicare['2016'] = medicare['2015'] * 1.017
medicare['2017'] = medicare['2016'] * 1.017
medicare['2018'] = medicare['2017'] * 1.017
# Re-attach the 'State' column dropped by the numeric conversion above.
medicare['State'] = health23_yr['State']
medicare
#match the year and state and grab the appropriate input
def match(row, lookup=None):
    """Return the Medicare per-enrollee expenditure for a row's state/year.

    Parameters: `row` must expose 'YearStart' and 'LocationDesc';
    `lookup` is a table with a 'State' column and one *string* column per
    year (defaults to the module-level `medicare`, so existing callers are
    unchanged).
    """
    if lookup is None:
        lookup = medicare
    correct_date = row['YearStart']
    correct_state = row['LocationDesc']
    # NOTE(review): substring match ('Kansas' also matches 'Arkansas') and
    # .values[0] takes the first hit; confirm exact equality isn't needed.
    return (lookup[lookup['State'].str.contains(correct_state)][str(correct_date)].values[0])
#apply function to get final column in medicare expenditure
# Fill the 'Medicare Expenditure' column by matching each row's state/year.
df4_medicare['Medicare Expenditure'] = df4_medicare.apply(lambda row: match(row), axis=1)
df4_medicare.head()
# Comparing GLM and non-parametric methods: DT/RF
from sklearn.model_selection import train_test_split
# NOTE(review): `cleaned_data` is not defined in this file — it presumably
# comes from a notebook cell not included here.
train, test = train_test_split(cleaned_data,test_size=.3, random_state = 4)
#IF a poisson distribution was good for mortality rates, then the mean and variance would be close
#since its not, poisson distribution is NOT a good fit for age-related mortality rates
np.mean(cleaned_data.DataValue), np.var(cleaned_data.DataValue)
# Frequentist GLM — Poisson family, race indicators only.
# astype(None) casts DataValue to the default (float64) dtype.
y = cleaned_data.DataValue.astype(None)
race_cols = [
    'Stratification_American Indian or Alaska Native',
    'Stratification_Asian or Pacific Islander',
    'Stratification_Black, non-Hispanic',
    'Stratification_Hispanic',
    'Stratification_White, non-Hispanic',
]
design = sm.add_constant(cleaned_data[race_cols])
poisson_model_race = sm.GLM(y, design, family=sm.families.Poisson())
poisson_results_race = poisson_model_race.fit()
print(poisson_results_race.summary())
# Again the same problem: tiny standard errors — the model is overconfident.
# Conclusion: a Poisson distribution is not a good fit for this data.
# We will revisit with location / medicaid expenditure etc. as features.
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: DataValue No. Observations: 1885
Model: GLM Df Residuals: 1880
Model Family: Poisson Df Model: 4
Link Function: Log Scale: 1.0000
Method: IRLS Log-Likelihood: -18045.
Date: Mon, 13 Dec 2021 Deviance: 22850.
Time: 16:26:07 Pearson chi2: 2.28e+04
No. Iterations: 100 Pseudo R-squ. (CS): 1.000
Covariance Type: nonrobust
===================================================================================================================
coef std err z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------------------------------------------
const 4.3467 0.001 2974.357 0.000 4.344 4.350
Stratification_American Indian or Alaska Native 0.9776 0.004 264.129 0.000 0.970 0.985
Stratification_Asian or Pacific Islander 0.5049 0.004 124.518 0.000 0.497 0.513
Stratification_Black, non-Hispanic 1.2827 0.003 444.475 0.000 1.277 1.288
Stratification_Hispanic 0.5340 0.004 138.829 0.000 0.526 0.542
Stratification_White, non-Hispanic 1.0475 0.003 355.788 0.000 1.042 1.053
===================================================================================================================
# Frequentist GLM — Poisson family: race + Medicare expenditure + year.
feature_cols = [
    'Stratification_American Indian or Alaska Native',
    'Stratification_Asian or Pacific Islander',
    'Stratification_Black, non-Hispanic',
    'Stratification_Hispanic',
    'Stratification_White, non-Hispanic',
    'Medicare Expenditure',
    'YearStart',
]
design = sm.add_constant(cleaned_data[feature_cols])
poisson_model_race_medicare_year = sm.GLM(y, design, family=sm.families.Poisson())
poisson_results_race_medicare_year = poisson_model_race_medicare_year.fit()
print(poisson_results_race_medicare_year.summary())
# Again: non-significant p-values; confidence intervals too wide this time;
# see log-likelihood & chi-square in the summary.
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: DataValue No. Observations: 1885
Model: GLM Df Residuals: 1878
Model Family: Poisson Df Model: 6
Link Function: Log Scale: 1.0000
Method: IRLS Log-Likelihood: -17884.
Date: Mon, 13 Dec 2021 Deviance: 22529.
Time: 16:26:13 Pearson chi2: 2.24e+04
No. Iterations: 4 Pseudo R-squ. (CS): 1.000
Covariance Type: nonrobust
===================================================================================================================
coef std err z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------------------------------------------
const 15.9118 1.131 14.063 0.000 13.694 18.130
Stratification_American Indian or Alaska Native 3.2878 0.226 14.518 0.000 2.844 3.732
Stratification_Asian or Pacific Islander 2.8197 0.226 12.459 0.000 2.376 3.263
Stratification_Black, non-Hispanic 3.5980 0.226 15.904 0.000 3.155 4.041
Stratification_Hispanic 2.8479 0.226 12.585 0.000 2.404 3.291
Stratification_White, non-Hispanic 3.3584 0.226 14.839 0.000 2.915 3.802
Medicare Expenditure -1.56e-05 1.44e-06 -10.851 0.000 -1.84e-05 -1.28e-05
YearStart -0.0068 0.001 -10.066 0.000 -0.008 -0.005
===================================================================================================================
# Frequentist GLM — Negative Binomial family: race + year.
feature_cols = [
    'Stratification_American Indian or Alaska Native',
    'Stratification_Asian or Pacific Islander',
    'Stratification_Black, non-Hispanic',
    'Stratification_Hispanic',
    'Stratification_White, non-Hispanic',
    'YearStart',
]
design = sm.add_constant(cleaned_data[feature_cols])
negbin_model_raceyear = sm.GLM(y, design, family=sm.families.NegativeBinomial())
negbin_results_raceyear = negbin_model_raceyear.fit()
print(negbin_results_raceyear.summary())
# Non-significant p-values, overly wide confidence intervals, high standard errors.
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: DataValue No. Observations: 1885
Model: GLM Df Residuals: 1879
Model Family: NegativeBinomial Df Model: 5
Link Function: Log Scale: 1.0000
Method: IRLS Log-Likelihood: -11728.
Date: Mon, 13 Dec 2021 Deviance: 126.36
Time: 16:26:31 Pearson chi2: 122.
No. Iterations: 5 Pseudo R-squ. (CS): 0.08798
Covariance Type: nonrobust
===================================================================================================================
coef std err z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------------------------------------------
const 20.4320 15.011 1.361 0.173 -8.989 49.853
Stratification_American Indian or Alaska Native 4.1949 3.003 1.397 0.162 -1.691 10.081
Stratification_Asian or Pacific Islander 3.7218 3.003 1.239 0.215 -2.163 9.607
Stratification_Black, non-Hispanic 4.4999 3.002 1.499 0.134 -1.385 10.384
Stratification_Hispanic 3.7512 3.002 1.249 0.212 -2.133 9.636
Stratification_White, non-Hispanic 4.2642 3.002 1.420 0.155 -1.620 10.148
YearStart -0.0096 0.009 -1.072 0.284 -0.027 0.008
===================================================================================================================
# Frequentist GLM — Negative Binomial family, race indicators only.
race_cols = [
    'Stratification_American Indian or Alaska Native',
    'Stratification_Asian or Pacific Islander',
    'Stratification_Black, non-Hispanic',
    'Stratification_Hispanic',
    'Stratification_White, non-Hispanic',
]
design = sm.add_constant(cleaned_data[race_cols])
negbin_model_race = sm.GLM(y, design, family=sm.families.NegativeBinomial())
negbin_results_race = negbin_model_race.fit()
print(negbin_results_race.summary())
# Log-likelihood looks okay (about -6 per observation; closer to 0 is better).
# Deviance / chi-square should be below n - p = 1885 - 5 = 1880; it is ~123,
# so this family fits much better.
# Confidence intervals & standard errors are a bit TOO small, so the model
# may still be overconfident.
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: DataValue No. Observations: 1885
Model: GLM Df Residuals: 1880
Model Family: NegativeBinomial Df Model: 4
Link Function: Log Scale: 1.0000
Method: IRLS Log-Likelihood: -11728.
Date: Sun, 12 Dec 2021 Deviance: 127.52
Time: 03:34:17 Pearson chi2: 123.
No. Iterations: 100 Pseudo R-squ. (CS): 0.08742
Covariance Type: nonrobust
===================================================================================================================
coef std err z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------------------------------------------
const 4.3467 0.020 222.861 0.000 4.308 4.385
Stratification_American Indian or Alaska Native 0.9776 0.053 18.535 0.000 0.874 1.081
Stratification_Asian or Pacific Islander 0.5049 0.047 10.726 0.000 0.413 0.597
Stratification_Black, non-Hispanic 1.2828 0.046 27.901 0.000 1.193 1.373
Stratification_Hispanic 0.5340 0.046 11.707 0.000 0.445 0.623
Stratification_White, non-Hispanic 1.0474 0.043 24.422 0.000 0.963 1.131
===================================================================================================================
# Frequentist GLM — Negative Binomial family: race + Medicare expenditure.
feature_cols = [
    'Stratification_American Indian or Alaska Native',
    'Stratification_Asian or Pacific Islander',
    'Stratification_Black, non-Hispanic',
    'Stratification_Hispanic',
    'Stratification_White, non-Hispanic',
    'Medicare Expenditure',
]
design = sm.add_constant(cleaned_data[feature_cols])
negbin_model_race_medicare = sm.GLM(y, design, family=sm.families.NegativeBinomial())
negbin_results_race_medicare = negbin_model_race_medicare.fit()
print(negbin_results_race_medicare.summary())
# Deviance / chi-square / log-likelihood look good, as before.
# Medicare expenditure adds very little — race matters far more than
# state differences in Medicare expenditure.
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: DataValue No. Observations: 1885
Model: GLM Df Residuals: 1879
Model Family: NegativeBinomial Df Model: 5
Link Function: Log Scale: 1.0000
Method: IRLS Log-Likelihood: -11727.
Date: Sun, 12 Dec 2021 Deviance: 125.35
Time: 03:34:17 Pearson chi2: 120.
No. Iterations: 6 Pseudo R-squ. (CS): 0.08847
Covariance Type: nonrobust
===================================================================================================================
coef std err z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------------------------------------------
const 4.5917 0.166 27.578 0.000 4.265 4.918
Stratification_American Indian or Alaska Native 1.0154 0.060 16.851 0.000 0.897 1.134
Stratification_Asian or Pacific Islander 0.5556 0.059 9.476 0.000 0.441 0.671
Stratification_Black, non-Hispanic 1.3388 0.058 22.946 0.000 1.224 1.453
Stratification_Hispanic 0.5858 0.057 10.263 0.000 0.474 0.698
Stratification_White, non-Hispanic 1.0960 0.053 20.707 0.000 0.992 1.200
Medicare Expenditure -2.814e-05 1.89e-05 -1.487 0.137 -6.52e-05 8.96e-06
===================================================================================================================
# Frequentist GLM — Negative Binomial: race + Medicare expenditure + year.
feature_cols = [
    'Stratification_American Indian or Alaska Native',
    'Stratification_Asian or Pacific Islander',
    'Stratification_Black, non-Hispanic',
    'Stratification_Hispanic',
    'Stratification_White, non-Hispanic',
    'Medicare Expenditure',
    'YearStart',
]
design = sm.add_constant(cleaned_data[feature_cols])
negbin_model_race_medicare_year = sm.GLM(y, design, family=sm.families.NegativeBinomial())
negbin_results_race_medicare_year = negbin_model_race_medicare_year.fit()
print(negbin_results_race_medicare_year.summary())
# Deviance / chi-square / log-likelihood look good, as before.
# Medicare expenditure and year each add very little, and the confidence
# intervals are TOO wide — this specification was not good.
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: DataValue No. Observations: 1885
Model: GLM Df Residuals: 1878
Model Family: NegativeBinomial Df Model: 6
Link Function: Log Scale: 1.0000
Method: IRLS Log-Likelihood: -11727.
Date: Sun, 12 Dec 2021 Deviance: 124.94
Time: 03:34:17 Pearson chi2: 120.
No. Iterations: 6 Pseudo R-squ. (CS): 0.08867
Covariance Type: nonrobust
===================================================================================================================
coef std err z P>|z| [0.025 0.975]
-------------------------------------------------------------------------------------------------------------------
const 14.7293 15.769 0.934 0.350 -16.178 45.636
Stratification_American Indian or Alaska Native 3.0447 3.156 0.965 0.335 -3.141 9.231
Stratification_Asian or Pacific Islander 2.5828 3.154 0.819 0.413 -3.599 8.764
Stratification_Black, non-Hispanic 3.3654 3.153 1.067 0.286 -2.815 9.545
Stratification_Hispanic 2.6130 3.154 0.829 0.407 -3.568 8.794
Stratification_White, non-Hispanic 3.1234 3.154 0.990 0.322 -3.059 9.306
Medicare Expenditure -2.406e-05 2e-05 -1.206 0.228 -6.32e-05 1.5e-05
YearStart -0.0061 0.009 -0.643 0.520 -0.025 0.012
===================================================================================================================
# Predicting mortality rate from the race indicator columns with a decision tree.
from sklearn.tree import DecisionTreeRegressor
race_tree_model = DecisionTreeRegressor()
race_feature_cols = [
    'Stratification_American Indian or Alaska Native',
    'Stratification_Asian or Pacific Islander',
    'Stratification_Black, non-Hispanic',
    'Stratification_Hispanic',
    'Stratification_White, non-Hispanic',
]
X_train = train[race_feature_cols]
X_test = test[race_feature_cols]
y_train = train['DataValue']
y_test = test['DataValue']
race_tree_model.fit(X_train, y_train)
train["race_tree_pred"] = race_tree_model.predict(X_train)
test["race_tree_pred"] = race_tree_model.predict(X_test)
# RMSE for the race-only predictor.
train_rmse = np.mean((train["race_tree_pred"] - train["DataValue"]) ** 2) ** 0.5
test_rmse = np.mean((test["race_tree_pred"] - test["DataValue"]) ** 2) ** 0.5
print("Training set error for decision tree:", train_rmse)
print("Test set error for decision tree: ", test_rmse)
Training set error for decision tree: 50.207348968537
Test set error for decision tree: 45.59776047690627
from sklearn.tree import plot_tree
# Visualize the fitted race-only decision tree.
plt.figure(figsize=(16, 4))
plot_tree(race_tree_model);
#predicting mortality rate based on medicare expenditure
from sklearn.tree import DecisionTreeRegressor
medicare_tree_model = DecisionTreeRegressor()
y_train = train['DataValue']
# BUG FIX: y_test was taken from `train` instead of `test`. (The RMSE below
# reads test["DataValue"] directly, so the printed numbers are unaffected.)
y_test = test['DataValue']
X_train = train[['Medicare Expenditure']]
X_test = test[['Medicare Expenditure']]
medicare_tree_model.fit(X_train,y_train)
train["medicare_tree_pred"] = medicare_tree_model.predict(X_train)
test["medicare_tree_pred"] = medicare_tree_model.predict(X_test)
#RMSE for expenditure predictor
train_rmse = np.mean((train["medicare_tree_pred"] - train["DataValue"]) ** 2) ** 0.5
test_rmse = np.mean((test["medicare_tree_pred"] - test["DataValue"]) ** 2) ** 0.5
print("Training set error for decision tree:", train_rmse)
print("Test set error for decision tree: ", test_rmse)
Training set error for decision tree: 66.00607556447662
Test set error for decision tree: 96.51548242376563
from sklearn.tree import plot_tree
# Visualize the full expenditure-only decision tree (very large).
plt.figure(figsize=(50, 50))
plot_tree(medicare_tree_model);
#closer look of nodes at the top
X_cols = ['Medicare Expenditure']
plt.figure(figsize=(12, 7))
plot_tree(medicare_tree_model, max_depth=2, fontsize=14, feature_names=X_cols);
#predicting mortality from race + expenditure
from sklearn.tree import DecisionTreeRegressor
medicare_race_tree_model = DecisionTreeRegressor()
y_train = train['DataValue']
# BUG FIX: y_test was taken from `train` instead of `test`. (The RMSE below
# reads test["DataValue"] directly, so the printed numbers are unaffected.)
y_test = test['DataValue']
X_train = train[['Stratification_American Indian or Alaska Native',
'Stratification_Asian or Pacific Islander',
'Stratification_Black, non-Hispanic', 'Stratification_Hispanic',
'Stratification_White, non-Hispanic','Medicare Expenditure']]
X_test = test[['Stratification_American Indian or Alaska Native',
'Stratification_Asian or Pacific Islander',
'Stratification_Black, non-Hispanic', 'Stratification_Hispanic',
'Stratification_White, non-Hispanic','Medicare Expenditure']]
medicare_race_tree_model.fit(X_train,y_train)
train["medicare_race_tree_pred"] = medicare_race_tree_model.predict(X_train)
test["medicare_race_tree_pred"] = medicare_race_tree_model.predict(X_test)
#RMSE for race + expenditure predictor
train_rmse = np.mean((train["medicare_race_tree_pred"] - train["DataValue"]) ** 2) ** 0.5
test_rmse = np.mean((test["medicare_race_tree_pred"] - test["DataValue"]) ** 2) ** 0.5
print("Training set error for decision tree:", train_rmse)
print("Test set error for decision tree: ", test_rmse)
Training set error for decision tree: 7.373723145400766
Test set error for decision tree: 60.44698820778606
from sklearn.tree import plot_tree
# Visualize the race + expenditure decision tree.
plt.figure(figsize=(30, 15))
plot_tree(medicare_race_tree_model);
#closer look at nodes
X_cols = ['Stratification_American Indian or Alaska Native',
'Stratification_Asian or Pacific Islander',
'Stratification_Black, non-Hispanic', 'Stratification_Hispanic',
'Stratification_White, non-Hispanic','Medicare Expenditure']
plt.figure(figsize=(20, 15))
plot_tree(medicare_race_tree_model, max_depth=7, fontsize=14, feature_names=X_cols);
# Random forest on the race indicator columns.
from sklearn.ensemble import RandomForestRegressor
race_forest_model = RandomForestRegressor(max_features = 1)
race_feature_cols = [
    'Stratification_American Indian or Alaska Native',
    'Stratification_Asian or Pacific Islander',
    'Stratification_Black, non-Hispanic',
    'Stratification_Hispanic',
    'Stratification_White, non-Hispanic',
]
X_train = train[race_feature_cols]
X_test = test[race_feature_cols]
y_train = train['DataValue']
y_test = test['DataValue']
race_forest_model.fit(X_train, y_train)
train["race_forest_pred"] = race_forest_model.predict(X_train)
test["race_forest_pred"] = race_forest_model.predict(X_test)
# RMSE for the race-only random forest.
train_rmse = np.mean((train["race_forest_pred"] - train["DataValue"]) ** 2) ** 0.5
test_rmse = np.mean((test["race_forest_pred"] - test["DataValue"]) ** 2) ** 0.5
print("Training set error for random forest:", train_rmse)
print("Test set error for random forest: ", test_rmse)
Training set error for random forest: 50.2078110225084
Test set error for random forest: 45.59711152421809
# Random forest on Medicare expenditure alone.
from sklearn.ensemble import RandomForestRegressor
medicare_forest_model = RandomForestRegressor(max_features = 1)
X_train = train[['Medicare Expenditure']]
X_test = test[['Medicare Expenditure']]
y_train = train['DataValue']
y_test = test['DataValue']
medicare_forest_model.fit(X_train, y_train)
train["medicare_forest_pred"] = medicare_forest_model.predict(X_train)
test["medicare_forest_pred"] = medicare_forest_model.predict(X_test)
# RMSE for the expenditure-only random forest.
train_rmse = np.mean((train["medicare_forest_pred"] - train["DataValue"]) ** 2) ** 0.5
test_rmse = np.mean((test["medicare_forest_pred"] - test["DataValue"]) ** 2) ** 0.5
print("Training set error for random forest:", train_rmse)
print("Test set error for random forest: ", test_rmse)
Training set error for random forest: 66.47391705702886
Test set error for random forest: 92.46639123567687
## OLS Functions
def fit_OLS_model(df, target_variable, explanatory_variables, intercept = True):
    """Fit an OLS regression of `target_variable` on `explanatory_variables`.

    When `intercept` is True a constant column is added to the design
    matrix. Returns the fitted statsmodels results object.
    """
    response = df[target_variable]
    design = df[explanatory_variables]
    if intercept:
        design = sm.add_constant(design)
    return sm.OLS(response, design).fit()
def compute_OLS_predictions(input_array, input_params):
    """Evaluate the fitted simple-regression line.

    `input_params[0]` is the intercept and `input_params[1]` the slope;
    returns intercept + slope * input_array (scalar or elementwise).
    """
    intercept, slope = input_params[0], input_params[1]
    return intercept + slope * input_array