Research Question 1
Installing Libraries
#!/usr/bin/env python
# make sure to install these packages before running:
!pip install pandas
!pip install sodapy
!pip install us
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import us
Creating the Asthma Dataset
# Read the CDI asthma file, keep only the columns we need, and drop null rows.
asthma_data = pd.read_csv("U.S._Chronic_Disease_Indicators__Asthma.csv")
list(asthma_data.columns)
_unneeded_cols = [
    'YearEnd', 'LocationDesc', 'DataSource', 'Topic', 'Question', 'Response',
    'DataValueType', 'DataValueAlt', 'DataValueFootnoteSymbol',
    'DatavalueFootnote', 'LowConfidenceLimit', 'HighConfidenceLimit',
    'StratificationCategory1', 'Stratification1', 'StratificationCategory2',
    'Stratification2', 'StratificationCategory3', 'Stratification3',
    'ResponseID', 'LocationID', 'TopicID', 'QuestionID', 'DataValueTypeID',
    'StratificationCategoryID1', 'StratificationID1',
    'StratificationCategoryID2', 'StratificationID2',
    'StratificationCategoryID3', 'StratificationID3',
]
asthma_data = asthma_data.drop(columns=_unneeded_cols)
asthma_data = asthma_data.dropna()
# Final asthma dataset: only rows whose unit is cases per 10,000 people.
asthma_data_per_10K = asthma_data[asthma_data["DataValueUnit"] == "cases per 10,000"]
asthma_data_per_10K = asthma_data_per_10K.drop(columns=["DataValueUnit"])
asthma_data_per_10K
Creating the Ozone Dataset
import pandas as pd
from sodapy import Socrata

# Unauthenticated client -- only works with public datasets. 'None' stands in
# for an application token; no username or password supplied.
client = Socrata("data.cdc.gov", None)
# Example authenticated client (needed for non-public datasets):
# client = Socrata("data.cdc.gov", MyAppToken,
#                  username="user@example.com", password="AFakePassword")

# Pull the ozone predictions for 2011-2014 (one SoQL query per year, capped at
# 30,000 rows each) and stack the returned record lists into one DataFrame.
_records = []
for _year in ("2011", "2012", "2013", "2014"):
    _soql = f"SELECT year, statefips, ds_o3_pred WHERE year='{_year}' LIMIT 30000"
    _records += client.get("372p-dx3h", query=_soql)
ozone_data = pd.DataFrame.from_records(_records)
ozone_data
#Creating a function for transforming state fips into state abbreviations
def filter_fips(table):
    """Add a 'stateabbrev' column to `table` derived from its 'statefips' column.

    Each FIPS value is normalized to the two-digit, zero-padded string form
    that `us.states.lookup` expects (e.g. 1, "1", or "01" all become "01").
    Mutates `table` in place; returns None.
    """
    # str(int(...)).zfill(2) handles ints, "1", and already-padded "01" alike.
    # The previous "'0' + str(i)" padding double-padded inputs like "01"
    # (producing "001"), which breaks the state lookup.
    table["stateabbrev"] = [
        us.states.lookup(str(int(fips)).zfill(2)).abbr
        for fips in table["statefips"]
    ]
# Tag each ozone row with its state abbreviation, then keep only the three
# columns used downstream.
filter_fips(ozone_data)
ozone_data = ozone_data[['ds_o3_pred', 'stateabbrev', 'year']]
ozone_data
# Align the ozone column names with the asthma table, then inner-join the two
# tables on state abbreviation and year.
ozone_data = ozone_data.rename(
    columns={"year": "YearStart", "stateabbrev": "LocationAbbr"}
)
ozone_data["YearStart"] = ozone_data["YearStart"].astype(int)
q1_dataset = asthma_data_per_10K.merge(ozone_data, on=["LocationAbbr", "YearStart"])
q1_dataset
Creating a Treatment Column
# Treatment indicator: True where the predicted ozone concentration exceeds
# the overall mean ozone concentration.
float_ozone_pred = q1_dataset['ds_o3_pred'].astype(float)
mean_ozone_pred = float_ozone_pred.mean()
q1_dataset['treat'] = float_ozone_pred.gt(mean_ozone_pred)
q1_dataset
Importing Dew Point and Temperature
#Reading in dew point and temp data
# Downstream code reads the columns "State", "Average Dew Point", and
# "Average Temperature" from this file.
ave_dewpt_temp=pd.read_csv("State_ave_temp_and_dew_pt.csv")
ave_dewpt_temp
#convert state names to state abbreviations
#dictionary sourced from https://gist.github.com/rogerallen/1583593
# Maps full U.S. state and territory names to their two-letter USPS
# abbreviations; used below to build the "LocationAbbr" join key for the
# dew-point/temperature data.
us_state_to_abbrev = {
"Alabama": "AL",
"Alaska": "AK",
"Arizona": "AZ",
"Arkansas": "AR",
"California": "CA",
"Colorado": "CO",
"Connecticut": "CT",
"Delaware": "DE",
"Florida": "FL",
"Georgia": "GA",
"Hawaii": "HI",
"Idaho": "ID",
"Illinois": "IL",
"Indiana": "IN",
"Iowa": "IA",
"Kansas": "KS",
"Kentucky": "KY",
"Louisiana": "LA",
"Maine": "ME",
"Maryland": "MD",
"Massachusetts": "MA",
"Michigan": "MI",
"Minnesota": "MN",
"Mississippi": "MS",
"Missouri": "MO",
"Montana": "MT",
"Nebraska": "NE",
"Nevada": "NV",
"New Hampshire": "NH",
"New Jersey": "NJ",
"New Mexico": "NM",
"New York": "NY",
"North Carolina": "NC",
"North Dakota": "ND",
"Ohio": "OH",
"Oklahoma": "OK",
"Oregon": "OR",
"Pennsylvania": "PA",
"Rhode Island": "RI",
"South Carolina": "SC",
"South Dakota": "SD",
"Tennessee": "TN",
"Texas": "TX",
"Utah": "UT",
"Vermont": "VT",
"Virginia": "VA",
"Washington": "WA",
"West Virginia": "WV",
"Wisconsin": "WI",
"Wyoming": "WY",
"District of Columbia": "DC",
"American Samoa": "AS",
"Guam": "GU",
"Northern Mariana Islands": "MP",
"Puerto Rico": "PR",
"United States Minor Outlying Islands": "UM",
"U.S. Virgin Islands": "VI",
}
# Translate each full state name to its two-letter abbreviation; names missing
# from the mapping come through as None (same behavior as dict.get).
ave_dewpt_temp["LocationAbbr"] = [
    us_state_to_abbrev.get(state) for state in ave_dewpt_temp["State"]
]
ave_dewpt_temp
Joining the Confounding Variables with the Dataset
# Attach the per-state dew-point/temperature confounders to the asthma+ozone
# dataset (inner join on state abbreviation).
q1_dataset_ = q1_dataset.merge(ave_dewpt_temp, on=["LocationAbbr"])
q1_dataset_
q1_dataset_["LocationAbbr"].unique()
EDA For Ozone Concentration and Asthma
# Bar chart of asthma cases by state, on a reproducible 5,000-row sample.
# Labels fixed: the data kept above is "cases per 10,000", not per million
# (the original "#fix label" TODOs).
q1_dataset_temp = q1_dataset_.sample(5000, random_state = 1)
plt.bar(q1_dataset_temp["LocationAbbr"], q1_dataset_temp["DataValue"])
plt.title("Asthma cases per 10,000 for each State")
plt.xlabel("State")
plt.ylabel("Asthma cases per 10,000")
plt.plot()
# Ozone concentration by state, drawn from the same 5,000-row sample.
plt.bar(q1_dataset_temp["LocationAbbr"], q1_dataset_temp["ds_o3_pred"])
plt.title("Ozone Concentrations for each State")
plt.ylabel("Ozone Concentration")
plt.xlabel("State")
plt.plot()
# Scatter plot and KDE joint plot of asthma prevalence vs. ozone concentration.
# Labels fixed: the asthma metric kept above is cases per 10,000, not per
# million (the original "per million" wording was wrong).
q1_dataset_temp = q1_dataset_.sample(5000, random_state = 1)
plt.scatter(q1_dataset_temp["DataValue"], q1_dataset_temp["ds_o3_pred"])
plt.title("number of asthma cases per 10,000 vs. ozone concentration")
plt.xlabel("Asthma cases per 10,000")
plt.ylabel("Ozone Concentration")
plt.plot()
# ds_o3_pred arrives as strings from the API; KDE needs numeric values.
q1_dataset_temp["ds_o3_pred"] = q1_dataset_temp["ds_o3_pred"].astype(float)
sns.jointplot(data = q1_dataset_temp, x = "DataValue", y = "ds_o3_pred", kind = "kde", fill = True);
Inverse Propensity Weighting
#calculate propensity scores
#code partially from homework 4
from sklearn.linear_model import LogisticRegression as LR

# Unpenalized logistic regression of treatment on the two confounders.
# NOTE(review): penalty="none" was removed in scikit-learn 1.4; on modern
# versions this must be penalty=None -- confirm the pinned sklearn version.
lr = LR(penalty="none", max_iter=200, random_state=0)
X = q1_dataset_[["Average Dew Point", "Average Temperature"]]  # confounders
# Outcome is the asthma rate (DataValue). ds_o3_pred defines the treatment,
# so it must not be used as Y -- this resolves the "#fix Y" TODO below.
Y = q1_dataset_["DataValue"]  # outcome
Z = q1_dataset_["treat"]  # treatment
lr_model = lr.fit(X, Z)
sum(lr_model.predict(X))  # number of units predicted treated
Causal Effect
# Computing the IPW estimate of the Average Treatment Effect.
def estimate_treatment_effect(lr, X, Y, Z, low=0.1, high=0.9):
    """Inverse-propensity-weighted estimate of the ATE of Z on Y.

    Parameters
    ----------
    lr : fitted classifier exposing predict_proba
    X : covariates used to compute propensity scores
    Y : outcome values (numeric)
    Z : binary treatment indicator (0/1 or bool)
    low, high : propensity-score trimming bounds; units whose score falls
        outside [low, high] are dropped before averaging.  This implements
        the trimming the original comment asked for ("remove propensity
        scores < 0.1 and > 0.9") while keeping the old call signature.

    Returns
    -------
    float : IPW estimate E[Y|treated] - E[Y|control].
    """
    ex = lr.predict_proba(X)[:, 1]
    keep = (ex >= low) & (ex <= high)  # trim the non-overlap region
    ex = ex[keep]
    y = np.asarray(Y, dtype=float)[keep]
    z = np.asarray(Z, dtype=float)[keep]
    return np.mean(z * y / ex) - np.mean((1 - z) * y / (1 - ex))
# IPW estimate of the ATE over the full sample.
# NOTE(review): verify that Y is the asthma outcome (DataValue) rather than
# the ozone prediction before trusting this number (the original "fix Y" TODO).
estimate_treatment_effect(lr_model, X, Y.astype(float), Z) #fix Y in dataset later