Research Question 1
Installing Libraries
#!/usr/bin/env python
# make sure to install these packages before running:
!pip install pandas
!pip install sodapy
!pip install us
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import us
Creating the Asthma Dataset
# Read the CDI asthma file, keep only the columns we need, and drop null rows.
asthma_data = pd.read_csv("U.S._Chronic_Disease_Indicators__Asthma.csv")
list(asthma_data.columns)
_unneeded_cols = [
    'YearEnd', 'LocationDesc', 'DataSource', 'Topic', 'Question', 'Response',
    'DataValueType', 'DataValueAlt', 'DataValueFootnoteSymbol',
    'DatavalueFootnote', 'LowConfidenceLimit', 'HighConfidenceLimit',
    'StratificationCategory1', 'Stratification1', 'StratificationCategory2',
    'Stratification2', 'StratificationCategory3', 'Stratification3',
    'ResponseID', 'LocationID', 'TopicID', 'QuestionID', 'DataValueTypeID',
    'StratificationCategoryID1', 'StratificationID1',
    'StratificationCategoryID2', 'StratificationID2',
    'StratificationCategoryID3', 'StratificationID3',
]
asthma_data = asthma_data.drop(columns=_unneeded_cols)
asthma_data = asthma_data.dropna()
# Final asthma dataset: only rows whose unit is cases per 10,000 people.
asthma_data_per_10K = asthma_data[asthma_data["DataValueUnit"] == "cases per 10,000"]
asthma_data_per_10K = asthma_data_per_10K.drop(columns=["DataValueUnit"])
asthma_data_per_10K
Creating the Ozone Dataset
import pandas as pd
from sodapy import Socrata

# Unauthenticated client -- only works with public datasets. 'None' stands in
# for an application token; no username or password supplied.
client = Socrata("data.cdc.gov", None)
# Example authenticated client (needed for non-public datasets):
# client = Socrata("data.cdc.gov", MyAppToken,
#                  username="user@example.com", password="AFakePassword")

# Pull the ozone predictions for 2011-2014 (one SoQL query per year, capped at
# 30,000 rows each) and stack the returned record lists into one DataFrame.
_records = []
for _year in ("2011", "2012", "2013", "2014"):
    _soql = f"SELECT year, statefips, ds_o3_pred WHERE year='{_year}' LIMIT 30000"
    _records += client.get("372p-dx3h", query=_soql)
ozone_data = pd.DataFrame.from_records(_records)
ozone_data
#Creating a function for transforming state fips into state abbreviations
def filter_fips(table):
    """Add a 'stateabbrev' column to `table` derived from its 'statefips' column.

    Each FIPS value is normalized to the two-digit, zero-padded string form
    that `us.states.lookup` expects (e.g. 1, "1", or "01" all become "01").
    Mutates `table` in place; returns None.
    """
    # str(int(...)).zfill(2) handles ints, "1", and already-padded "01" alike.
    # The previous "'0' + str(i)" padding double-padded inputs like "01"
    # (producing "001"), which breaks the state lookup.
    table["stateabbrev"] = [
        us.states.lookup(str(int(fips)).zfill(2)).abbr
        for fips in table["statefips"]
    ]
# Tag each ozone row with its state abbreviation, then keep only the three
# columns used downstream.
filter_fips(ozone_data)
ozone_data = ozone_data[['ds_o3_pred', 'stateabbrev', 'year']]
ozone_data
# Align the ozone column names with the asthma table, then inner-join the two
# tables on state abbreviation and year.
ozone_data = ozone_data.rename(
    columns={"year": "YearStart", "stateabbrev": "LocationAbbr"}
)
ozone_data["YearStart"] = ozone_data["YearStart"].astype(int)
q1_dataset = asthma_data_per_10K.merge(ozone_data, on=["LocationAbbr", "YearStart"])
q1_dataset
Creating a Treatment Column
# Treatment indicator: True where the predicted ozone concentration exceeds
# the overall mean ozone concentration.
float_ozone_pred = q1_dataset['ds_o3_pred'].astype(float)
mean_ozone_pred = float_ozone_pred.mean()
q1_dataset['treat'] = float_ozone_pred.gt(mean_ozone_pred)
q1_dataset
Importing Dew Point and Temperature
#Reading in dew point and temp data
# Downstream code reads the columns "State", "Average Dew Point", and
# "Average Temperature" from this file.
ave_dewpt_temp=pd.read_csv("State_ave_temp_and_dew_pt.csv")
ave_dewpt_temp
#convert state names to state abbreviations
#dictionary sourced from https://gist.github.com/rogerallen/1583593
# Maps full U.S. state and territory names to their two-letter USPS
# abbreviations; used below to build the "LocationAbbr" join key for the
# dew-point/temperature data.
us_state_to_abbrev = {
"Alabama": "AL",
"Alaska": "AK",
"Arizona": "AZ",
"Arkansas": "AR",
"California": "CA",
"Colorado": "CO",
"Connecticut": "CT",
"Delaware": "DE",
"Florida": "FL",
"Georgia": "GA",
"Hawaii": "HI",
"Idaho": "ID",
"Illinois": "IL",
"Indiana": "IN",
"Iowa": "IA",
"Kansas": "KS",
"Kentucky": "KY",
"Louisiana": "LA",
"Maine": "ME",
"Maryland": "MD",
"Massachusetts": "MA",
"Michigan": "MI",
"Minnesota": "MN",
"Mississippi": "MS",
"Missouri": "MO",
"Montana": "MT",
"Nebraska": "NE",
"Nevada": "NV",
"New Hampshire": "NH",
"New Jersey": "NJ",
"New Mexico": "NM",
"New York": "NY",
"North Carolina": "NC",
"North Dakota": "ND",
"Ohio": "OH",
"Oklahoma": "OK",
"Oregon": "OR",
"Pennsylvania": "PA",
"Rhode Island": "RI",
"South Carolina": "SC",
"South Dakota": "SD",
"Tennessee": "TN",
"Texas": "TX",
"Utah": "UT",
"Vermont": "VT",
"Virginia": "VA",
"Washington": "WA",
"West Virginia": "WV",
"Wisconsin": "WI",
"Wyoming": "WY",
"District of Columbia": "DC",
"American Samoa": "AS",
"Guam": "GU",
"Northern Mariana Islands": "MP",
"Puerto Rico": "PR",
"United States Minor Outlying Islands": "UM",
"U.S. Virgin Islands": "VI",
}
# Translate each full state name to its two-letter abbreviation; names missing
# from the mapping come through as None (same behavior as dict.get).
ave_dewpt_temp["LocationAbbr"] = [
    us_state_to_abbrev.get(state) for state in ave_dewpt_temp["State"]
]
ave_dewpt_temp
Joining the Confounding Variables with the Dataset
# Attach the per-state dew-point/temperature confounders to the asthma+ozone
# dataset (inner join on state abbreviation).
q1_dataset_ = q1_dataset.merge(ave_dewpt_temp, on=["LocationAbbr"])
q1_dataset_
q1_dataset_["LocationAbbr"].unique()
EDA For Ozone Concentration and Asthma
# Bar chart of asthma cases by state, on a reproducible 5,000-row sample.
# Labels fixed: the data kept above is "cases per 10,000", not per million
# (the original "#fix label" TODOs).
q1_dataset_temp = q1_dataset_.sample(5000, random_state = 1)
plt.bar(q1_dataset_temp["LocationAbbr"], q1_dataset_temp["DataValue"])
plt.title("Asthma cases per 10,000 for each State")
plt.xlabel("State")
plt.ylabel("Asthma cases per 10,000")
plt.plot()
# Ozone concentration by state, drawn from the same 5,000-row sample.
plt.bar(q1_dataset_temp["LocationAbbr"], q1_dataset_temp["ds_o3_pred"])
plt.title("Ozone Concentrations for each State")
plt.ylabel("Ozone Concentration")
plt.xlabel("State")
plt.plot()
# Scatter plot and KDE joint plot of asthma prevalence vs. ozone concentration.
# Labels fixed: the asthma metric kept above is cases per 10,000, not per
# million (the original "per million" wording was wrong).
q1_dataset_temp = q1_dataset_.sample(5000, random_state = 1)
plt.scatter(q1_dataset_temp["DataValue"], q1_dataset_temp["ds_o3_pred"])
plt.title("number of asthma cases per 10,000 vs. ozone concentration")
plt.xlabel("Asthma cases per 10,000")
plt.ylabel("Ozone Concentration")
plt.plot()
# ds_o3_pred arrives as strings from the API; KDE needs numeric values.
q1_dataset_temp["ds_o3_pred"] = q1_dataset_temp["ds_o3_pred"].astype(float)
sns.jointplot(data = q1_dataset_temp, x = "DataValue", y = "ds_o3_pred", kind = "kde", fill = True);
Inverse Propensity Weighting
#calculate propensity scores
#code partially from homework 4
from sklearn.linear_model import LogisticRegression as LR

# Unpenalized logistic regression of treatment on the two confounders.
# NOTE(review): penalty="none" was removed in scikit-learn 1.4; on modern
# versions this must be penalty=None -- confirm the pinned sklearn version.
lr = LR(penalty="none", max_iter=200, random_state=0)
X = q1_dataset_[["Average Dew Point", "Average Temperature"]]  # confounders
# Outcome is the asthma rate (DataValue). ds_o3_pred defines the treatment,
# so it must not be used as Y -- this resolves the "#fix Y" TODO below.
Y = q1_dataset_["DataValue"]  # outcome
Z = q1_dataset_["treat"]  # treatment
lr_model = lr.fit(X, Z)
sum(lr_model.predict(X))  # number of units predicted treated
Causal Effect
# Computing the IPW estimate of the Average Treatment Effect.
def estimate_treatment_effect(lr, X, Y, Z, low=0.1, high=0.9):
    """Inverse-propensity-weighted estimate of the ATE of Z on Y.

    Parameters
    ----------
    lr : fitted classifier exposing predict_proba
    X : covariates used to compute propensity scores
    Y : outcome values (numeric)
    Z : binary treatment indicator (0/1 or bool)
    low, high : propensity-score trimming bounds; units whose score falls
        outside [low, high] are dropped before averaging.  This implements
        the trimming the original comment asked for ("remove propensity
        scores < 0.1 and > 0.9") while keeping the old call signature.

    Returns
    -------
    float : IPW estimate E[Y|treated] - E[Y|control].
    """
    ex = lr.predict_proba(X)[:, 1]
    keep = (ex >= low) & (ex <= high)  # trim the non-overlap region
    ex = ex[keep]
    y = np.asarray(Y, dtype=float)[keep]
    z = np.asarray(Z, dtype=float)[keep]
    return np.mean(z * y / ex) - np.mean((1 - z) * y / (1 - ex))
# IPW estimate of the ATE over the full sample.
# NOTE(review): verify that Y is the asthma outcome (DataValue) rather than
# the ozone prediction before trusting this number (the original "fix Y" TODO).
estimate_treatment_effect(lr_model, X, Y.astype(float), Z) #fix Y in dataset later