Installing Libraries
#!/usr/bin/env python
# Make sure these packages are installed before running:
!pip install pandas sodapy us
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import us
Creating the Asthma Dataset
asthma_data = pd.read_csv("U.S._Chronic_Disease_Indicators__Asthma.csv")
list(asthma_data.columns)
asthma_data = asthma_data.drop(columns=[
    'YearEnd', 'LocationDesc', 'DataSource', 'Topic', 'Question', 'Response',
    'DataValueType', 'DataValueAlt', 'DataValueFootnoteSymbol', 'DatavalueFootnote',
    'LowConfidenceLimit', 'HighConfidenceLimit',
    'StratificationCategory1', 'Stratification1',
    'StratificationCategory2', 'Stratification2',
    'StratificationCategory3', 'Stratification3',
    'ResponseID', 'LocationID', 'TopicID', 'QuestionID', 'DataValueTypeID',
    'StratificationCategoryID1', 'StratificationID1',
    'StratificationCategoryID2', 'StratificationID2',
    'StratificationCategoryID3', 'StratificationID3',
])
asthma_data = asthma_data.dropna()
# Final asthma dataset: keep only rows reported as cases per 10,000
asthma_data_per_10K = asthma_data[asthma_data["DataValueUnit"] == "cases per 10,000"]
asthma_data_per_10K
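A quick sanity check (a sketch, assuming the column names above) that the unit filter really left a single unit:
# Should print a single unit: "cases per 10,000"
print(asthma_data_per_10K["DataValueUnit"].unique())
print(len(asthma_data_per_10K), "rows retained")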
Creating the Ozone Dataset
from sodapy import Socrata

# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.cdc.gov", None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata("data.cdc.gov",
#                  MyAppToken,
#                  username="user@example.com",
#                  password="AFakePassword")

# First 2000 results, returned as JSON from the API and converted to a
# Python list of dictionaries by sodapy.
results = client.get("372p-dx3h", limit=2000)
# Convert to pandas DataFrame
ozone_data = pd.DataFrame.from_records(results)
ozone_data
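The SODA JSON API returns every field as a string, which is why year and ds_o3_pred are cast later; a quick check (a sketch):
# All columns come back as object (string) dtype from the JSON API
print(ozone_data.dtypes)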
def filter_fips(table):
    # Map each state FIPS code to its two-letter abbreviation
    abbrev = []
    for i in table["statefips"]:
        # us.states.lookup expects a zero-padded two-digit FIPS string
        fips = str(i).zfill(2)
        abbrev.append(us.states.lookup(fips).abbr)
    table["stateabbrev"] = abbrev
filter_fips(ozone_data)
ozone_data = ozone_data.filter(items=['ds_o3_pred', 'stateabbrev', 'year'])
ozone_data
# Rename the ozone columns to match the asthma table's join keys
ozone_data = ozone_data.rename(columns={"year": "YearStart", "stateabbrev": "LocationAbbr"})
ozone_data["YearStart"] = ozone_data["YearStart"].astype(int)
# Inner join on state abbreviation; note this pairs every asthma year with
# every ozone year within a state
q1_dataset = pd.merge(asthma_data_per_10K, ozone_data, on=["LocationAbbr"])
q1_dataset
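Joining on the year as well as the state would restrict each asthma record to the matching ozone year; a minimal sketch comparing row counts of the two strategies:
# Hedged sketch: state-only join above vs. a state-and-year join
both_keys = pd.merge(asthma_data_per_10K, ozone_data, on=["YearStart", "LocationAbbr"])
print("state only:", len(q1_dataset), "| state and year:", len(both_keys))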
Creating a Treatment Column
# Treatment: predicted ozone above the overall mean
float_ozone_pred = q1_dataset['ds_o3_pred'].astype(float)
mean_ozone_pred = float_ozone_pred.mean()
q1_dataset['treat'] = float_ozone_pred > mean_ozone_pred
q1_dataset
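A quick look at the treated/control split (a sketch):
# Rough balance check between treated (above-mean ozone) and control rows
print(q1_dataset["treat"].value_counts())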
Importing Dew Point and Temperature
ave_dewpt_temp = pd.read_csv("State_ave_temp_and_dew_pt.csv")
ave_dewpt_temp
#convert state names to state abbreviations
#dictionary sourced from https://gist.github.com/rogerallen/1583593
us_state_to_abbrev = {
"Alabama": "AL",
"Alaska": "AK",
"Arizona": "AZ",
"Arkansas": "AR",
"California": "CA",
"Colorado": "CO",
"Connecticut": "CT",
"Delaware": "DE",
"Florida": "FL",
"Georgia": "GA",
"Hawaii": "HI",
"Idaho": "ID",
"Illinois": "IL",
"Indiana": "IN",
"Iowa": "IA",
"Kansas": "KS",
"Kentucky": "KY",
"Louisiana": "LA",
"Maine": "ME",
"Maryland": "MD",
"Massachusetts": "MA",
"Michigan": "MI",
"Minnesota": "MN",
"Mississippi": "MS",
"Missouri": "MO",
"Montana": "MT",
"Nebraska": "NE",
"Nevada": "NV",
"New Hampshire": "NH",
"New Jersey": "NJ",
"New Mexico": "NM",
"New York": "NY",
"North Carolina": "NC",
"North Dakota": "ND",
"Ohio": "OH",
"Oklahoma": "OK",
"Oregon": "OR",
"Pennsylvania": "PA",
"Rhode Island": "RI",
"South Carolina": "SC",
"South Dakota": "SD",
"Tennessee": "TN",
"Texas": "TX",
"Utah": "UT",
"Vermont": "VT",
"Virginia": "VA",
"Washington": "WA",
"West Virginia": "WV",
"Wisconsin": "WI",
"Wyoming": "WY",
"District of Columbia": "DC",
"American Samoa": "AS",
"Guam": "GU",
"Northern Mariana Islands": "MP",
"Puerto Rico": "PR",
"United States Minor Outlying Islands": "UM",
"U.S. Virgin Islands": "VI",
}
stateabbr = []
for i in ave_dewpt_temp["State"]:
    stateabbr.append(us_state_to_abbrev.get(i))
ave_dewpt_temp["LocationAbbr"] = stateabbr
ave_dewpt_temp
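.get() returns None for any state name missing from the dictionary; a quick check (a sketch) that nothing went unmapped:
# Any state names that failed to map would show up here
unmapped = ave_dewpt_temp.loc[ave_dewpt_temp["LocationAbbr"].isna(), "State"]
print(unmapped.tolist())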
Joining the Confounding Variables with the Dataset
q1_dataset_ = pd.merge(q1_dataset, ave_dewpt_temp, on=["LocationAbbr"])
q1_dataset_
Inverse Propensity Weighting
# Calculate propensity scores
# Code partially from homework 4
from sklearn.linear_model import LogisticRegression as LR
lr = LR(penalty=None, max_iter=200, random_state=0)  # use penalty="none" on scikit-learn < 1.2
X = q1_dataset_[["Average Dew Point", "Average Temperature"]]
Y = q1_dataset_["ds_o3_pred"]  # outcome
Z = q1_dataset_["treat"]  # treatment
lr_model = lr.fit(X, Z)
sum(lr_model.predict(X))  # number of units predicted treated
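IPW relies on overlap between treated and control propensity scores, so it is worth inspecting their distribution before weighting; a minimal sketch using the seaborn import above:
# Hedged sketch: visualize overlap of estimated propensity scores by treatment arm
pscores = lr_model.predict_proba(X)[:, 1]
sns.histplot(x=pscores, hue=Z, bins=30)
plt.xlabel("Estimated propensity score")
plt.show()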
Causal Effect
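Both implementations below compute the standard IPW estimate of the average treatment effect, where $\hat{e}(X_i)$ is the fitted propensity score:
$$\hat{\tau}_{\text{IPW}} = \frac{1}{n}\sum_{i=1}^{n}\left[\frac{Z_i\,Y_i}{\hat{e}(X_i)} - \frac{(1-Z_i)\,Y_i}{1-\hat{e}(X_i)}\right]$$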
# Superseded by the vectorized version below
def estimate_treatment_effect(fitted_model, X, Y, Z):
    pscore = fitted_model.predict_proba(X)[:, 1]
    constant = 1 / len(X)
    first_term = (Z * Y) / pscore
    second_term = ((1 - Z) * Y) / (1 - pscore)
    sum_terms = sum(first_term - second_term)
    ATE = constant * sum_terms
    return ATE
estimate_treatment_effect(lr_model, X, Y.astype(float), Z)  # fix Y in dataset later
# Computing the IPW estimate of the Average Treatment Effect
# TODO: trim propensity scores below 0.1 or above 0.9 (not yet applied)
def estimate_treatment_effect(lr, X, Y, Z):
    ex = lr.predict_proba(X)[:, 1]
    return np.mean(Z * Y / ex) - np.mean((1 - Z) * Y / (1 - ex))
estimate_treatment_effect(lr_model, X, Y.astype(float), Z)  # fix Y in dataset later
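The trimming mentioned above is not yet applied; a minimal sketch of a trimmed variant, assuming the same fitted model (the [0.1, 0.9] cutoffs come from the comment above, and estimate_treatment_effect_trimmed is a hypothetical helper):
def estimate_treatment_effect_trimmed(lr, X, Y, Z, lo=0.1, hi=0.9):
    # Drop units with extreme propensity scores before weighting
    ex = lr.predict_proba(X)[:, 1]
    keep = (ex > lo) & (ex < hi)
    ex, Y, Z = ex[keep], Y[keep], Z[keep]
    return np.mean(Z * Y / ex) - np.mean((1 - Z) * Y / (1 - ex))
estimate_treatment_effect_trimmed(lr_model, X, Y.astype(float), Z)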