Data 102 Final Project - Hypothesis Testing

import pandas as pd import numpy as np import matplotlib.pyplot as plt

df = pd.read_csv('/work/income_data.csv') df = df[['county', 'state', 'median household income']] df.dropna(inplace=True) df_cc = pd.read_csv('incidence_rate_and_redlining.csv') df

df_merged = df_cc.merge(df, on=['county', 'state']) greater_than_national_avg = df_merged['median household income'] >= 51749 df_merged['greater than national avg'] = list(greater_than_national_avg) df_merged = df_merged[df_merged['incidence_rate'] > 0] plt.plot(np.log(df_merged['median household income']), np.log(df_merged['incidence_rate']), 'ro') plt.xlabel('log avg median household income') plt.ylabel('log avg cervical cancer incidence rates') plt.title('Cervical Cancer Incidence Rates vs Median Household Income (USD) for 2014-2018');

df_high = df_merged[df_merged['greater than national avg'] == True] df_low = df_merged[df_merged['greater than national avg'] == False] ir_mean_high = np.mean(df_high['incidence_rate'].astype(float)) ir_mean_low = np.mean(df_low['incidence_rate'].astype(float)) print('ir_mean_high', ir_mean_high) print('ir_mean_low', ir_mean_low)

simulations = [] for i in range(10000): shuffled = df_merged['incidence_rate'].sample(n = len(df_merged)) shuffled.reset_index(inplace=True, drop=True) df_merged["shuffled_incidence_ratings"] = shuffled df_high = df_merged[df_merged['greater than national avg'] == True] df_low = df_merged[df_merged['greater than national avg'] == False] test_stat = np.mean(df_low["shuffled_incidence_ratings"].astype(float)) - np.mean(df_high["shuffled_incidence_ratings"].astype(float)) simulations.append(test_stat)

simulations = np.array(simulations) plt.hist(simulations); plt.axvline(x=ir_mean_low - ir_mean_high, color ='red', label='observed value') plt.legend() plt.title('Testing Hypothesis 5');

t_stat = ir_mean_low - ir_mean_high n = len(simulations) p_value = (sum(simulations >= t_stat) / n) + (sum(simulations <= -t_stat) / n) p_value

# refer to lab in the beginning of semester - naive p value testing # two examples of correction methods - use both of them # explain and calculate both of them # but at the end pick one # look at notes for both

mortality = pd.read_csv('/work/death - death.csv') mortality = mortality[['state', 'county', 'age-adjusted death rate - deaths per 100,000']] mortality.dropna(inplace=True) mortality.replace('*', 0, inplace=True) mortality['age-adjusted death rate - deaths per 100,000'] = mortality['age-adjusted death rate - deaths per 100,000'].astype(float) mortality.rename(columns={'age-adjusted death rate - deaths per 100,000': 'death rate'}, inplace=True) mortality = mortality[mortality['death rate'] > 0] mortality

df_merged = df.merge(mortality, on=['state','county']) greater_than_national_avg = df_merged['median household income'] >= 51749 df_merged['greater than national avg'] = list(greater_than_national_avg) df_merged = df_merged[df_merged['death rate']>0]

plt.plot(np.log(df_merged['median household income']), np.log(df_merged['death rate']), 'ro') plt.xlabel('log avg median household income') plt.ylabel('log avg cervical cancer death rates') plt.title('Cervical Cancer Death Rates vs Median Household Income (USD) for 2014-2018');

df_high = df_merged[df_merged['greater than national avg'] == True] df_low = df_merged[df_merged['greater than national avg'] == False] dr_mean_high = np.mean(df_high['death rate'].astype(float)) dr_mean_low = np.mean(df_low['death rate'].astype(float)) print('dr_mean_high', dr_mean_high) print('dr_mean_low', dr_mean_low)

simulations = [] for i in range(10000): shuffled = df_merged['death rate'].sample(n = len(df_merged)) shuffled.reset_index(inplace=True, drop=True) df_merged["shuffled_death_ratings"] = shuffled df_high = df_merged[df_merged['greater than national avg'] == True] df_low = df_merged[df_merged['greater than national avg'] == False] test_stat = np.mean(df_low["shuffled_death_ratings"].astype(float)) - np.mean(df_high["shuffled_death_ratings"].astype(float)) simulations.append(test_stat)

simulations = np.array(simulations) plt.hist(simulations); plt.axvline(x=dr_mean_low - dr_mean_high, color ='red');

t_stat = dr_mean_low - dr_mean_high n = len(simulations) p_value = (sum(simulations >= t_stat) / n) + (sum(simulations <= -t_stat) / n) p_value