import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.read_csv('/work/income_data.csv')
df = df[['county', 'state', 'median household income']]
df.dropna(inplace=True)
df_cc = pd.read_csv('incidence_rate_and_redlining.csv')
df
df_merged = df_cc.merge(df, on=['county', 'state'])
greater_than_national_avg = df_merged['median household income'] >= 51749
df_merged['greater than national avg'] = list(greater_than_national_avg)
df_merged = df_merged[df_merged['incidence_rate'] > 0]
plt.plot(np.log(df_merged['median household income']), np.log(df_merged['incidence_rate']), 'ro')
plt.xlabel('log avg median household income')
plt.ylabel('log avg cervical cancer incidence rates')
plt.title('Cervical Cancer Incidence Rates vs Median Household Income (USD) for 2014-2018');
df_high = df_merged[df_merged['greater than national avg'] == True]
df_low = df_merged[df_merged['greater than national avg'] == False]
ir_mean_high = np.mean(df_high['incidence_rate'].astype(float))
ir_mean_low = np.mean(df_low['incidence_rate'].astype(float))
print('ir_mean_high', ir_mean_high)
print('ir_mean_low', ir_mean_low)
simulations = []
for i in range(10000):
shuffled = df_merged['incidence_rate'].sample(n = len(df_merged))
shuffled.reset_index(inplace=True, drop=True)
df_merged["shuffled_incidence_ratings"] = shuffled
df_high = df_merged[df_merged['greater than national avg'] == True]
df_low = df_merged[df_merged['greater than national avg'] == False]
test_stat = np.mean(df_low["shuffled_incidence_ratings"].astype(float)) - np.mean(df_high["shuffled_incidence_ratings"].astype(float))
simulations.append(test_stat)
simulations = np.array(simulations)
plt.hist(simulations);
plt.axvline(x=ir_mean_low - ir_mean_high, color ='red', label='observed value')
plt.legend()
plt.title('Testing Hypothesis 5');
t_stat = ir_mean_low - ir_mean_high
n = len(simulations)
p_value = (sum(simulations >= t_stat) / n) + (sum(simulations <= -t_stat) / n)
p_value
# refer to lab in the beginning of semester - naive p value testing
# two examples of correction methods - use both of them
# explain and calculate both of them
# but at the end pick one
# look at notes for both
mortality = pd.read_csv('/work/death - death.csv')
mortality = mortality[['state', 'county', 'age-adjusted death rate - deaths per 100,000']]
mortality.dropna(inplace=True)
mortality.replace('*', 0, inplace=True)
mortality['age-adjusted death rate - deaths per 100,000'] = mortality['age-adjusted death rate - deaths per 100,000'].astype(float)
mortality.rename(columns={'age-adjusted death rate - deaths per 100,000': 'death rate'}, inplace=True)
mortality = mortality[mortality['death rate'] > 0]
mortality
df_merged = df.merge(mortality, on=['state','county'])
greater_than_national_avg = df_merged['median household income'] >= 51749
df_merged['greater than national avg'] = list(greater_than_national_avg)
df_merged = df_merged[df_merged['death rate']>0]
plt.plot(np.log(df_merged['median household income']), np.log(df_merged['death rate']), 'ro')
plt.xlabel('log avg median household income')
plt.ylabel('log avg cervical cancer death rates')
plt.title('Cervical Cancer Death Rates vs Median Household Income (USD) for 2014-2018');
df_high = df_merged[df_merged['greater than national avg'] == True]
df_low = df_merged[df_merged['greater than national avg'] == False]
dr_mean_high = np.mean(df_high['death rate'].astype(float))
dr_mean_low = np.mean(df_low['death rate'].astype(float))
print('dr_mean_high', dr_mean_high)
print('dr_mean_low', dr_mean_low)
simulations = []
for i in range(10000):
shuffled = df_merged['death rate'].sample(n = len(df_merged))
shuffled.reset_index(inplace=True, drop=True)
df_merged["shuffled_death_ratings"] = shuffled
df_high = df_merged[df_merged['greater than national avg'] == True]
df_low = df_merged[df_merged['greater than national avg'] == False]
test_stat = np.mean(df_low["shuffled_death_ratings"].astype(float)) - np.mean(df_high["shuffled_death_ratings"].astype(float))
simulations.append(test_stat)
simulations = np.array(simulations)
plt.hist(simulations);
plt.axvline(x=dr_mean_low - dr_mean_high, color ='red');
t_stat = dr_mean_low - dr_mean_high
n = len(simulations)
p_value = (sum(simulations >= t_stat) / n) + (sum(simulations <= -t_stat) / n)
p_value