Installing Libraries
#!/usr/bin/env python
# make sure to install these packages before running:
!pip install pandas
!pip install sodapy
!pip install us
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import us
Creating the Asthma Dataset
asthma_data = pd.read_csv("U.S._Chronic_Disease_Indicators__Asthma.csv")
list(asthma_data.columns)
asthma_data = asthma_data.drop(columns=['YearEnd','LocationDesc', 'DataSource','Topic','Question','Response','DataValueType',
'DataValueAlt','DataValueFootnoteSymbol','DatavalueFootnote','LowConfidenceLimit','HighConfidenceLimit','StratificationCategory1',
'Stratification1','StratificationCategory2','Stratification2','StratificationCategory3','Stratification3','ResponseID',
'LocationID','TopicID','QuestionID','DataValueTypeID','StratificationCategoryID1','StratificationID1',
'StratificationCategoryID2','StratificationID2','StratificationCategoryID3','StratificationID3'])
asthma_data = asthma_data.dropna()
# final dataset for asthma: keep only rates reported as cases per 10,000
asthma_data_per_10K = asthma_data[asthma_data["DataValueUnit"] == "cases per 10,000"]
asthma_data_per_10K
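Before relying on the filter above, it can help to confirm which units actually appear in the data; a minimal check (a sketch, assuming the same asthma_data frame):
# list the distinct reporting units so the "cases per 10,000" filter is verifiably correct
asthma_data["DataValueUnit"].unique()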
Creating the Ozone Dataset
import pandas as pd
from sodapy import Socrata
# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("data.cdc.gov", None)
# Example authenticated client (needed for non-public datasets):
# client = Socrata("data.cdc.gov",
#                  MyAppToken,
#                  username="user@example.com",
#                  password="AFakePassword")
# First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("372p-dx3h", limit=2000)
# Convert to pandas DataFrame
ozone_data = pd.DataFrame.from_records(results)
ozone_data
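The call above fetches only the first 2000 rows. For larger extracts, sodapy's offset parameter can page through the full dataset (a sketch, assuming the same client and dataset id; the 2000-row page size is arbitrary):
# page through the dataset 2000 rows at a time until no rows remain
all_results = []
offset = 0
while True:
    page = client.get("372p-dx3h", limit=2000, offset=offset)
    if not page:
        break
    all_results.extend(page)
    offset += 2000
ozone_data_full = pd.DataFrame.from_records(all_results)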
def filter_fips(table):
    # map each state FIPS code to its two-letter abbreviation
    # (assumes statefips values arrive as unpadded strings like "6")
    abbrev = []
    for i in table["statefips"]:
        # zero-pad single-digit FIPS codes so us.states.lookup recognizes them
        if int(i) < 10:
            i = '0' + str(i)
        abbrev.append(us.states.lookup(str(i)).abbr)
    table["stateabbrev"] = abbrev
filter_fips(ozone_data)
ozone_data = ozone_data.filter(items=['ds_o3_pred', 'stateabbrev', 'year'])
ozone_data
# inner join the asthma and ozone tables
ozone_data = ozone_data.rename(columns={"year": "YearStart", "stateabbrev": "LocationAbbr"})
ozone_data["YearStart"] = ozone_data["YearStart"].astype(int)
# note: merging on ["YearStart", "LocationAbbr"] would also align rows by year;
# here the tables are matched on state alone
q1_dataset = pd.merge(asthma_data_per_10K, ozone_data, on=["LocationAbbr"])
q1_dataset
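Because the merge key is state alone, each asthma row for a state pairs with every ozone-year row for that state; a quick shape check makes the resulting row inflation visible (a sketch using the frames above):
# compare input and output row counts to see the many-to-many expansion
print(len(asthma_data_per_10K), len(ozone_data), len(q1_dataset))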
Creating a Treatment Column
# define treatment: observations whose predicted ozone exceeds the overall mean
float_ozone_pred = q1_dataset['ds_o3_pred'].astype(float)
mean_ozone_pred = float_ozone_pred.mean()
q1_dataset['treat'] = float_ozone_pred > mean_ozone_pred
q1_dataset
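A quick look at the treated/control balance produced by the mean-ozone threshold (a sketch; same frame as above):
# count treated (True) vs. control (False) observations
q1_dataset['treat'].value_counts()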
Importing Dew Point and Temperature
ave_dewpt_temp = pd.read_csv("State_ave_temp_and_dew_pt.csv")
ave_dewpt_temp
#convert state names to state abbreviations
#dictionary sourced from https://gist.github.com/rogerallen/1583593
us_state_to_abbrev = {
"Alabama": "AL",
"Alaska": "AK",
"Arizona": "AZ",
"Arkansas": "AR",
"California": "CA",
"Colorado": "CO",
"Connecticut": "CT",
"Delaware": "DE",
"Florida": "FL",
"Georgia": "GA",
"Hawaii": "HI",
"Idaho": "ID",
"Illinois": "IL",
"Indiana": "IN",
"Iowa": "IA",
"Kansas": "KS",
"Kentucky": "KY",
"Louisiana": "LA",
"Maine": "ME",
"Maryland": "MD",
"Massachusetts": "MA",
"Michigan": "MI",
"Minnesota": "MN",
"Mississippi": "MS",
"Missouri": "MO",
"Montana": "MT",
"Nebraska": "NE",
"Nevada": "NV",
"New Hampshire": "NH",
"New Jersey": "NJ",
"New Mexico": "NM",
"New York": "NY",
"North Carolina": "NC",
"North Dakota": "ND",
"Ohio": "OH",
"Oklahoma": "OK",
"Oregon": "OR",
"Pennsylvania": "PA",
"Rhode Island": "RI",
"South Carolina": "SC",
"South Dakota": "SD",
"Tennessee": "TN",
"Texas": "TX",
"Utah": "UT",
"Vermont": "VT",
"Virginia": "VA",
"Washington": "WA",
"West Virginia": "WV",
"Wisconsin": "WI",
"Wyoming": "WY",
"District of Columbia": "DC",
"American Samoa": "AS",
"Guam": "GU",
"Northern Mariana Islands": "MP",
"Puerto Rico": "PR",
"United States Minor Outlying Islands": "UM",
"U.S. Virgin Islands": "VI",
}
stateabbr = []
for i in ave_dewpt_temp["State"]:
    stateabbr.append(us_state_to_abbrev.get(i))
ave_dewpt_temp["LocationAbbr"] = stateabbr
ave_dewpt_temp
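Since dict.get returns None for any state name missing from the mapping, it is worth checking for unmapped rows (a sketch):
# rows whose State name did not map to an abbreviation, if any
ave_dewpt_temp[ave_dewpt_temp["LocationAbbr"].isna()]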
Joining the Confounding Variables with the Dataset
q1_dataset_ = pd.merge(q1_dataset, ave_dewpt_temp, on=["LocationAbbr"])
q1_dataset_
Inverse Propensity Weighting
# calculate propensity scores
# code partially from homework 4
from sklearn.linear_model import LogisticRegression as LR

lr = LR(penalty="none", max_iter=200, random_state=0)
X = q1_dataset_[["Average Dew Point", "Average Temperature"]]  # confounders
Y = q1_dataset_["ds_o3_pred"]  # outcome
Z = q1_dataset_["treat"]  # treatment
lr_model = lr.fit(X, Z)
# number of observations the fitted model classifies as treated
sum(lr_model.predict(X))
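Before weighting, it is standard to check overlap in the estimated propensity scores, since values near 0 or 1 produce extreme weights (a sketch; same fitted model as above):
# range of estimated propensity scores; values near 0 or 1 signal poor overlap
pscores = lr_model.predict_proba(X)[:, 1]
print(pscores.min(), pscores.max())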
Causal Effect
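Both implementations below compute the standard IPW estimate of the average treatment effect,

$$\hat{\tau}_{\text{IPW}} = \frac{1}{n}\sum_{i=1}^{n}\left[\frac{Z_i Y_i}{\hat{e}(X_i)} - \frac{(1 - Z_i)\,Y_i}{1 - \hat{e}(X_i)}\right],$$

where $\hat{e}(X_i)$ is the propensity score estimated by the logistic regression above.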
# don't use this one; superseded by the vectorized version below
def estimate_treatment_effect(fitted_model, X, Y, Z):
    pscore = fitted_model.predict_proba(X)[:, 1]  # estimated propensity scores
    constant = 1 / len(X)
    first_term = (Z * Y) / pscore                 # weighted treated outcomes
    second_term = ((1 - Z) * Y) / (1 - pscore)    # weighted control outcomes
    sum_terms = sum(first_term - second_term)
    ATE = constant * sum_terms
    return ATE

estimate_treatment_effect(lr_model, X, Y.astype(float), Z)  # fix Y in dataset later
# Computing the IPW estimate of the Average Treatment Effect
# (trimming of propensity scores below 0.1 or above 0.9 is noted but not applied here)
def estimate_treatment_effect(lr, X, Y, Z):
    ex = lr.predict_proba(X)[:, 1]  # estimated propensity scores
    return np.mean(Z * Y / ex) - np.mean((1 - Z) * Y / (1 - ex))

estimate_treatment_effect(lr_model, X, Y.astype(float), Z)  # fix Y in dataset later
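The trimming mentioned in the comment above can be applied by restricting the estimate to observations with propensity scores in [0.1, 0.9] (a sketch, assuming the same fitted model and arrays; the hypothetical estimate_treatment_effect_trimmed helper and its thresholds come from that comment, not from the original code):
# IPW estimate on the trimmed sample: drop rows with extreme propensity scores
def estimate_treatment_effect_trimmed(lr, X, Y, Z, lo=0.1, hi=0.9):
    ex = lr.predict_proba(X)[:, 1]
    keep = (ex > lo) & (ex < hi)   # trimming mask
    ex, Yk, Zk = ex[keep], Y[keep], Z[keep]
    return np.mean(Zk * Yk / ex) - np.mean((1 - Zk) * Yk / (1 - ex))

estimate_treatment_effect_trimmed(lr_model, X, Y.astype(float), Z)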