import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics

# Reading data
cancer_df = pd.read_csv("U.S._Chronic_Disease_Indicators__Asthma.csv")
COPD_df = pd.read_csv("U.S._Chronic_Disease_Indicators__Chronic_Obstructive_Pulmonary_Disease.csv")
ozone_df = pd.read_csv("Daily_Census_Tract-Level_Ozone_Concentrations__2011-2014.csv")
pm2_5_df = pd.read_csv("Daily_Census_Tract-Level_PM2.5_Concentrations__2011-2014.csv")
fips_df = pd.read_csv("us-state-ansi-fips.csv") # https://gist.github.com/dantonnoriega/bf1acd2290e15b91e6710b6fd3be0a53#file-us-state-ansi-fips-csv
co2_df = pd.read_csv("state_energy-related_carbon_dioxide_emissions_by_year.csv") # https://www.eia.gov/environment/emissions/state/
print(sum(cancer_df.columns == COPD_df.columns), len(cancer_df.columns), len(COPD_df.columns))

cancer_df.head(1)

COPD_df.head(1)

# EDA

## EDA for Cancer and COPD

# Are there any rows in YearStart and YearEnd that are not the same
# Since there are 0 rows that are not the same lets just have one year column
print(sum(cancer_df['YearStart']!=cancer_df['YearEnd']))
print(sum(COPD_df['YearStart']!=COPD_df['YearEnd']))
cancer_df.rename({'YearStart':'Year'},axis=1,inplace=True)
cancer_df.drop(['YearEnd','LocationAbbr'],axis=1,inplace=True)
COPD_df.rename({'YearStart':'Year'},axis=1,inplace=True)
COPD_df.drop(['YearEnd','LocationAbbr'],axis=1,inplace=True)
# Drop columns with only Nulls
cancer_df.dropna(axis=1, how='all',inplace=True)
COPD_df.dropna(axis=1, how='all',inplace=True)

cancer_df.head(1)

COPD_df.head(1)

# Check for columns with the same value in all rows
# Outputs "Topic" and "TopicID" which we know is "Asthma" and "AST" repectably
for column in cancer_df.columns:
if len(cancer_df[column].unique()) == 1:
print(column)
for column in COPD_df.columns:
if len(COPD_df[column].unique()) == 1:
print(column)

# Choosing the non redundant columns and rows that represent the overall population
cancer_df2 = cancer_df.loc[cancer_df['StratificationCategory1'] == "Overall"].sort_values(['LocationDesc','Question','Year','StratificationCategory1'])[['Year', 'LocationDesc', 'Question', 'Topic', 'StratificationCategory1', 'Stratification1',
'DataValueUnit', 'DataValueType', 'DataValue', 'DataValueAlt',
'DataValueFootnoteSymbol', 'DatavalueFootnote', 'LowConfidenceLimit',
'HighConfidenceLimit']]
COPD_df2 = COPD_df.loc[COPD_df['StratificationCategory1'] == "Overall"].sort_values(['LocationDesc','Question','Year','StratificationCategory1'])[['Year', 'LocationDesc', 'Question', 'Topic', 'StratificationCategory1', 'Stratification1',
'DataValueUnit', 'DataValueType', 'DataValue', 'DataValueAlt',
'DataValueFootnoteSymbol', 'DatavalueFootnote', 'LowConfidenceLimit',
'HighConfidenceLimit']]
max(cancer_df['Year']), min(cancer_df['Year']), max(COPD_df['Year']), min(COPD_df['Year'])

# Only want the rows that show the Crude Prevalence: total number of cases or deaths divided by the total population
# Only wants the rows for Current asthma prevalence among adults aged >= 18 years
cancer_df3 = cancer_df2.loc[(cancer_df2['DataValueType']=="Crude Prevalence") & (cancer_df2['Question']=="Current asthma prevalence among adults aged >= 18 years")]
COPD_df3 = COPD_df2.loc[(COPD_df2['DataValueType']=="Crude Prevalence") & (COPD_df2['Question']=="Prevalence of chronic obstructive pulmonary disease among adults >= 18")]
COPD_df3.shape, cancer_df3.shape

cancer_df3.isna().sum()

# Drop NA values
COPD_df4 = COPD_df3.copy()
COPD_df4.drop(['DataValueFootnoteSymbol','DatavalueFootnote','HighConfidenceLimit','LowConfidenceLimit','DataValueAlt','Stratification1','StratificationCategory1','Question','DataValueUnit','DataValueType'], inplace=True, axis=1)
COPD_df4.dropna(inplace=True)
cancer_df4 = cancer_df3.copy()
cancer_df4.drop(['DataValueFootnoteSymbol','DatavalueFootnote','HighConfidenceLimit','LowConfidenceLimit','DataValueAlt','Stratification1','StratificationCategory1','Question','DataValueUnit','DataValueType'], inplace=True, axis=1)
cancer_df4.dropna(inplace=True)
COPD_df4.shape, cancer_df4.shape

# Now keep only years 2011 - 2014
COPD_df5 = COPD_df4.loc[COPD_df4['Year'].isin([2011,2012,2013,2014])]
cancer_df5 = cancer_df4.loc[cancer_df4['Year'].isin([2011,2012,2013,2014])]
COPD_df5.shape, cancer_df5.shape

# Need to merge these two together
cancer_and_COPD_df = cancer_df5.merge(COPD_df5, how='left', on=['Year','LocationDesc'])
print(cancer_and_COPD_df.shape)
cancer_and_COPD_df.head()

## EDA for pm2.5 and Ozone concentrations

ozone_df.head(1)

pm2_5_df.head(1)

fips_df.head(1)

pm2_5_df.shape, ozone_df.shape

They have the same columns and same number of rows so lets see if we can just join them!

# Checking for NA values
# We have none
ozone_df.isna().sum(), pm2_5_df.isna().sum()

# Merged the two dataframes to have pm2.5 and Ozone concentrations in one dataframe
pm2_5_ozone_df = pm2_5_df.merge(ozone_df, how='left', on=['year','statefips','countyfips'])
pm2_5_ozone_df.shape

pm2_5_ozone_df.head(2)

# Merging our pm2_5_ozone_df with the fips_df to get he state names
pm2_5_ozone_df2 = pm2_5_ozone_df.merge(fips_df, how="left", left_on="statefips", right_on=" st").drop([" st"," stusps"], axis=1)
pm2_5_ozone_df2 = pm2_5_ozone_df2[['year', 'stname', 'statefips', 'countyfips', 'ds_pm_pred', 'ds_o3_pred']]

pm2_5_ozone_df2 = pm2_5_ozone_df2.groupby(["year","stname"]).agg(np.mean).drop(["countyfips","statefips"],axis=1).reset_index()

pm2_5_ozone_df2.sort_values(['year','stname'], ascending=True, inplace=True)
pm2_5_ozone_df2.head()

### Merging both DataFrames

# Merge our pm2_5 and ozone dataframe with our cancer and copd dataframe
final_df = pm2_5_ozone_df2.merge(cancer_and_COPD_df, how='left', left_on=['year','stname'], right_on=['Year','LocationDesc']).drop(['Year','LocationDesc'],axis=1)
final_df.rename({"DataValue_x":"asthma_prevalence", "DataValue_y":"copd_prevalence"}, inplace=True, axis=1)
final_df.drop(["Topic_x","Topic_y"], axis=1, inplace=True)
print(final_df.shape)
final_df.head(3)

## EDA for co2 dataframe

# million metric tons of energy-related carbon dioxide
co2_df2 = co2_df.loc[0:50, ["State","2011","2012","2013","2014"]]
co2_df3 = pd.melt(co2_df2,
id_vars='State',
value_vars=list(co2_df2.columns[1:]), # list of years
var_name='year',
value_name='co2_levels')[['year','State','co2_levels']]
co2_df3['year'] = co2_df3['year'].astype(int)
co2_df3['co2_levels'] = co2_df3['co2_levels'].astype(float)
co2_df3.head(3)
# Merge final_df with co2_df3
# Don't run this cell two times because of this line of code!
final_df2 = final_df.merge(co2_df3, how='left', left_on=['year','stname'], right_on=['year','State']).drop(['State'],axis=1)

print(final_df2.shape)
final_df2.rename({"ds_pm_pred":"pm2.5_levels", "ds_o3_pred":"ozone_levels"}, axis=1,inplace=True)
final_df2.head(3)

# Visualizations

sns.set_style("whitegrid")
fig, axs = plt.subplots(3, 2, figsize=(15, 11))
sns.scatterplot(data=final_df2, x="asthma_prevalence", y="pm2.5_levels", ax=axs[0][0]);
sns.scatterplot(data=final_df2, x="asthma_prevalence", y="ozone_levels", ax=axs[0][1]);
sns.scatterplot(data=final_df2, x="copd_prevalence", y="pm2.5_levels", ax=axs[1][0]);
sns.scatterplot(data=final_df2, x="copd_prevalence", y="ozone_levels", ax=axs[1][1]);
sns.scatterplot(data=final_df2, x="asthma_prevalence", y="co2_levels", ax=axs[2][0]);
sns.scatterplot(data=final_df2, x="copd_prevalence", y="co2_levels", ax=axs[2][1]);
plt.show()

Figure 1: Plots showing the relationships between the pollutants and the chronic illness prevalence.

# Methods

We want to explore if having more pollution in a geographical area is associated with or causes a higher prevalence of chronic illness.

We want to create a baseline of average concentrations of PM2.5, CO2, and Ozone in a geographic area (States) and compare that to see if there is a positive correlation between different pollutants and chronic illness.

We will utilize regression models to estimate the association between each pollutant (PM2.5, Ozone, CO2) and the prevalence of the corresponding chronic illness (COPD and Asthma) in a geographical area (States) since we want to see if there is a positive correlation between different pollutants and chronic illness.

We will adjust the comparison values using techniques from class such as Bonferroni (Family-wise Error Rate), Benjamini-Hochberg (False Discovery Rate) and Benjamini–Yekutieli (False Discovery Rate) since our Multiple Hypothesis Tests may be dependent.

Bonferroni correction will help guarantee a Family-wise Error Rate (FWER) of 0.05. Meanwhile, Benjamini-Hochberg and Benjamini–Yekutieli will help guarantee a False Discovery Rate (FDR) of 0.05. The difference between Benjamini-Hochberg and Benjamini–Yekutieli is that Benjamini–Yekutieli controls the false discovery rate under arbitrary dependence assumptions which may be the case in our situation. Controlling FDR is more appropriate in our case since FWER, which is the probability of at least one false positive, can have a stricter threshold. We want some room to keep variables for our regression model. Additionally, since our hypothesis tests may not all be independent it is important for us to use specifically use Benjamini–Yekutieli rather than Benjamini-Hochberg.

Alternative Hypothesis:

Higher PM2.5 concentrations are associated with a higher prevalence of COPD.

Higher Ozone concentrations are associated with a higher prevalence of COPD.

Higher CO2 concentrations are associated with a higher prevalence of COPD

Higher PM2.5 concentrations are associated with a higher prevalence of Asthma.

Higher Ozone concentrations are associated with a higher prevalence of Asthma.

Higher CO2 concentrations are associated with a higher prevalence of Asthma.

import statsmodels.api as sm
from sklearn.model_selection import train_test_split

# Utilize regression models to estimate the association between each pollutan and the prevalence of the corresponding chronic illness
# Source: https://www.analyticsvidhya.com/blog/2019/09/everything-know-about-p-value-from-scratch-data-science/?utm_source=reading_list&utm_medium=https://www.analyticsvidhya.com/blog/2021/09/hypothesis-testing-in-machine-learning-everything-you-need-to-know/
X = final_df2[['pm2.5_levels','ozone_levels','co2_levels']]
Y1 = final_df2['asthma_prevalence']
Y2 = final_df2['copd_prevalence']
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, Y1, test_size=0.2, random_state=42)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, Y2, test_size=0.2, random_state=42)

# Regression model for Asthma
lr1 = sm.OLS(y_train1, sm.add_constant(X_train1)).fit()
print(lr1.summary())

# Regression model for COPD
lr2 = sm.OLS(y_train2, sm.add_constant(X_train2)).fit()
print(lr2.summary())

Figure 2: Initial Regression models for asthma and COPD with all three pollutants. These models were build on the training data set.

lr1.pvalues, lr2.pvalues

array_p_values = {}
asthma_pm2_5 = lr1.pvalues['pm2.5_levels']
print(asthma_pm2_5)
array_p_values["Asthma Prevalence and pm2.5"] = asthma_pm2_5

asthma_Ozone = lr1.pvalues['ozone_levels']
print(asthma_Ozone)
array_p_values["Asthma Prevalence and Ozone"] = asthma_Ozone

asthma_Co2 = lr1.pvalues['co2_levels']
print(asthma_Co2)
array_p_values["Asthma Prevalence and CO2"] = asthma_Co2

copd_pm2_5 = lr2.pvalues['pm2.5_levels']
print(copd_pm2_5)
array_p_values["COPD Prevalence and pm2.5"] = copd_pm2_5

copd_Ozone = lr2.pvalues['ozone_levels']
print(copd_Ozone)
array_p_values["COPD Prevalence and Ozone"] = copd_Ozone

copd_Co2 = lr2.pvalues['co2_levels']
print(copd_Co2)
array_p_values["COPD Prevalence and CO2"] = copd_Co2

# Basic
for key in array_p_values.keys():
if array_p_values[key] <= 0.05:
print(key)

# Bonferroni correction
for key in array_p_values.keys():
if array_p_values[key] <= 0.05/6:
print(key)

# Benjamini-Hochberg
k = 1
for key in sorted(array_p_values.items(), key=lambda x:x[1]):
if key[1] <= (k*0.05)/6:
print(key[0])
k+=1

## Benjamini–Yekutieli

Our Multiple Hypothesis Tests may be dependent

# Benjamini–Yekutieli
# The Benjamini–Yekutieli procedure controls the false discovery rate
# under arbitrary dependence assumptions
# Source: https://en.wikipedia.org/wiki/False_discovery_rate
def c(m):
sumation = 0
for i in np.arange(1,m+1):
sumation += (1/i)
return sumation
k = 1
for key in sorted(array_p_values.items(), key=lambda x:x[1]):
if key[1] <= (k*0.05)/(6*c(6)):
print(key[0])
k+=1

# Adjust the comparison values

Using the results above we remove parameters that didn't reject the null.

# Adjusted Regression model for Asthma
lr_asthma = sm.OLS(y_train1, sm.add_constant(X_train1[['co2_levels']])).fit()
print(lr_asthma.summary())

plt_x = np.arange(0,700)
plt_y = -0.0037*plt_x+9.5535
sns.lineplot(x=plt_x, y=plt_y, color='red');
plt.title("CO2 levels VS Asthma Prevalence")
sns.scatterplot(x=X_train1['co2_levels'], y=y_train1);

# Adjusted Regression model for COPD
lr_copd = sm.OLS(y_train2, sm.add_constant(X_train2[['pm2.5_levels']])).fit()
print(lr_copd.summary())

plt_x = np.arange(4,13)
plt_y = 0.4966*plt_x+2.5499
sns.lineplot(x=plt_x, y=plt_y, color='red');
plt.title("PM2.5 levels VS COPD Prevalence")
sns.scatterplot(x=X_train2['pm2.5_levels'], y=y_train2);

## Testing models

# Initial test predictions
asthma_initial_test_predictions = lr1.predict(sm.add_constant(X_test1))
copd_initial_test_predictions = lr2.predict(sm.add_constant(X_test2))
print(np.sqrt(metrics.mean_squared_error(y_test1, asthma_initial_test_predictions)))
print(np.sqrt(metrics.mean_squared_error(y_test2, copd_initial_test_predictions)))

# Adjusted test predictions
asthma_adjusted_test_predictions = lr_asthma.predict(sm.add_constant(X_test1[['co2_levels']]))
copd_adjusted_test_predictions = lr_copd.predict(sm.add_constant(X_test2[['pm2.5_levels']]))
print(np.sqrt(metrics.mean_squared_error(y_test1, asthma_adjusted_test_predictions)))
print(np.sqrt(metrics.mean_squared_error(y_test2, copd_adjusted_test_predictions)))

sns.set_style("whitegrid")
fig, axs = plt.subplots(2, 2, figsize=(15, 11))
sns.regplot(x=y_test1, y=asthma_initial_test_predictions, ax=axs[0][0]).set(title="Initial test asthma predictions vs actual");
sns.regplot(x=y_test1, y=asthma_adjusted_test_predictions, ax=axs[0][1]).set(title="Adjusted test asthma predictions vs actual");
sns.regplot(x=y_test2, y=copd_initial_test_predictions, ax=axs[1][0]).set(title="Initial test COPD predictions vs actual");
sns.regplot(x=y_test2, y=copd_adjusted_test_predictions, ax=axs[1][1]).set(title="Adjusted test COPD predictions vs actual");
plt.setp(axs[:, :], xlabel='Actual', ylabel="Prediction")
plt.show()

# Results

From our findings we see Bonferroni correction and Benjamini–Yekutieli both rejected (COPD Prevalence and pm2.5) and (Asthma Prevalence and CO2) leaning towards the alternative that pm2.5 has a significant effect on COPD and CO2 had a significant effect on Asthma. Meanwhile, Benjamini-Hochberg in addition to those two rejections also rejected (Asthma Prevalence and Ozone) leaning towards the alternative that Ozone had a significant effect on Asthma.

However, the regression model checked to see if the independent variables had a significant effect on the target variables. We want to check if the independent variables have a positive significant effect on the target variable meaning only pm2.5 has a positive significant effect on COPD since CO2 had a negative significant effect on Asthma.

Additionally, by only utilizing pm2.5 to predict COPD, our model preformed similarly to when using all three parameters (ozone, co2, pm2.5). With only pm2.5 the model obtained a Root Mean Squared Error of ~1.56. Meanwhile when using all three parameters the model obtained a Root Mean Squared Error of ~1.55.

In conclusion, out of our null hypothesis, we failed to reject all but “Higher PM2.5 concentrations are not associated with a higher prevalence of COPD” and “Higher CO2 concentrations are not associated with a higher prevalence of Asthma” utilizing regression models and Benjamini–Yekutieli correction. However, when run in our adjusted regression model we found out the CO2 coefficient for asthma was negative so therefore didn’t apply to our alternative hypothesis. In the end, we only rejected “Higher PM2.5 concentrations are not associated with a higher prevalence of COPD”.

# Discussion

The regression model checked to see if the independent variables had a significant effect on the target variables. We, however, wanted to check if the independent variables have a positive significant effect on the target variable meaning only pm2.5 has a positive significant effect on COPD since CO2 had a negative significant effect on Asthma.

For each individual test we can provide health officials with stats supporting or refuting the association of poor air quality with the prevalence of chronic illness. Since in the end we only rejected “Higher PM2.5 concentrations are not associated with a higher prevalence of COPD”, we can inform health officials on the possitive correlation between PM2.5 and Chronic Obstructive Pulmonary Disease.

A limitation we had with our analysis was that the datasets for PM2.5 and Ozone levels only had data from 2011 to 2014 so we were only able to utilize data for the rest of the datasets from the years 2011 to 2014.

If we had more data (other than the years 2011 to 2014) we would have tried to visualize the the correlation between the pollutants and chronic illnesses per year. Having more resent data would also provide better analysis to provide health officials with.