# EDA: Is handwashing really effective?

import requests from bs4 import BeautifulSoup

# Wikipedia page that is scraped for the monthly figures.
url = 'https://en.wikipedia.org/wiki/Historical_mortality_rates_of_puerperal_fever'

# Fetch the page. Without a timeout, requests waits indefinitely on a
# stalled connection.
page = requests.get(url, timeout=30)
# Stop on a 4xx/5xx answer instead of parsing an error page.
page.raise_for_status()

# Parse the returned HTML with the 'lxml' parser.
soup = BeautifulSoup(page.text, 'lxml')
soup.title

# First <table> whose class attribute is 'wikitable sortable'.
table = soup.find('table', class_='wikitable sortable')

# Collect the stripped text of every <th> cell found in the table.
table_header = table.find_all('th')
table_header_names = []
for header_cell in table_header:
    table_header_names.append(header_cell.text.strip())

# Drop the last header name: only the first 5 columns are of interest.
del table_header_names[-1]
table_header_names

# Every <tr> of the table except the first one, which is the header row.
table_content = table.find_all('tr')[1:]

# One key per header name, each starting with an empty-string placeholder
# that is later replaced by the column values.
table_dict = dict.fromkeys(table_header_names, '')
print(table_dict)

# One list per column; each receives one value per table row below.
year, month, births, deaths, rate = [], [], [], [], []


def appending_td_to_list(td, lst, index):
    """Append the first text node of cell ``td[index]`` to ``lst``.

    ``td`` is the list of <td> cells of one row. ``lst`` is modified in
    place and the function returns None.
    """
    first_text = td[index].find(text=True)
    lst.append(first_text)


for tr in table_content:
    # All <td> cells of the current row.
    td = tr.find_all('td')
    # The position of a list in this tuple is the cell index it reads.
    for position, column in enumerate((year, month, births, deaths, rate)):
        appending_td_to_list(td, column, position)

# Store each scraped column under its header name.
table_dict.update({
    'Year': year,
    'Month': month,
    'Births': births,
    'Deaths': deaths,
    'Rate (%)': rate,
})

import pandas as pd
import numpy as np

# Turn off pandas' chained-assignment warning (the default setting is 'warn').
pd.set_option('mode.chained_assignment', None)

# Turn the dictionary of columns into a DataFrame.
wiki_table = pd.DataFrame(data=table_dict)

# Look at 5 randomly chosen rows.
wiki_table.sample(n=5)

# Remove every '\n' from each scraped column.
for column_name in table_header_names:
    without_newlines = wiki_table[column_name].str.replace('\n', '')
    wiki_table[column_name] = without_newlines

wiki_table.sample(5)

# isnull() reports no missing cells here, because the scraped table spells
# its gaps as the literal string 'na' rather than as NaN.
print(wiki_table.isnull().sum())

# Columns in which the 'na' placeholder has to become a real NaN.
lst_name = ['Births', 'Deaths', 'Rate (%)']

# Replace on the frame itself and assign the result back. An in-place
# replace() on a `.loc` selection of a filtered copy only works when pandas
# happens to hand back a view, and it does nothing under Copy-on-Write.
# Assigning back also makes the concat + drop_duplicates round trip
# unnecessary.
wiki_table[lst_name] = wiki_table[lst_name].replace('na', np.nan)

# Rows that now hold a NaN in one of those columns, kept for inspection.
nan_but_not_nan = wiki_table[wiki_table[lst_name].isna().any(axis=1)]
nan_but_not_nan

# Parsing the 'Month' column directly, as in the commented-out call below,
# returns:
#   ValueError: time data 'Mar 1843' does not match format '%B %Y' (match)
# pd.to_datetime(wiki_table.Month, format='%B %Y')

# Show the rows whose 'Month' starts with 'Mar' before the fix.
starts_with_mar = wiki_table['Month'].str.match('Mar')
print(wiki_table[starts_with_mar])

# Spell out the abbreviated 'Mar 1843' so that the column can be parsed
# into datetimes.
wiki_table['Month'] = wiki_table['Month'].replace('Mar 1843', 'March 1843', regex=False)

print('\n')
# Same listing again, recomputed from the updated column.
print(wiki_table[wiki_table['Month'].str.match('Mar')])

# Parse the '<month name> <year>' strings into datetimes and keep them in
# a new 'dates' column.
parsed_months = pd.to_datetime(wiki_table['Month'], format='%B %Y')
wiki_table['dates'] = parsed_months
wiki_table.dtypes

# Use the 'dates' column as the index and order the rows by it.
wiki_table = wiki_table.set_index('dates').sort_index()
wiki_table.head()

# Forward-fill every NaN with the last valid value above it.
# `fillna(method='ffill')` is deprecated in recent pandas releases;
# `ffill()` is the equivalent dedicated method.
wiki_table = wiki_table.ffill()

# Check the result on the 'December 1841' row.
wiki_table[wiki_table.Month == 'December 1841']

# dtypes before the conversion
wiki_table.dtypes

# Convert 'Births' and 'Deaths' to integers. The conversion is applied per
# column (apply's default axis): with axis=1 pd.to_numeric ran once per row,
# so `downcast` could not settle on one dtype for a whole column.
# Values that cannot be parsed become NaN (errors='coerce').
to_number_cols = ['Births', 'Deaths']
wiki_table[to_number_cols] = wiki_table[to_number_cols].apply(
    pd.to_numeric, errors='coerce', downcast='signed'
)
wiki_table.dtypes

# Convert 'Rate (%)' to float. `astype(..., errors='ignore')` hands the
# column back unchanged when any value fails to parse, leaving strings in
# place without notice. pd.to_numeric with errors='coerce' matches the
# handling used for 'Births' and 'Deaths': unparseable values become NaN.
wiki_table['Rate (%)'] = pd.to_numeric(wiki_table['Rate (%)'], errors='coerce').astype('float')
wiki_table['Rate (%)'].dtype

import matplotlib.pyplot as plt

def timeseries_plot(ax, x, y, xlabel = 'some X label', ylabel = 'some Y label',
                    color = 'black', linestyle = 'solid', alpha = 1,
                    color_ticks = True):
    """Draw ``y`` against ``x`` as a line on ``ax`` and label both axes.

    Parameters
    ----------
    ax : object providing ``plot``, ``set_xlabel``, ``set_ylabel`` and
        ``tick_params`` (a Matplotlib axes in this file).
    x, y : data forwarded unchanged to ``ax.plot``.
    xlabel, ylabel : axis label texts.
    color : colour of the line; also used for the y label and the y ticks
        when ``color_ticks`` is true.
    linestyle, alpha : forwarded to ``ax.plot``.
    color_ticks : when true, colour the y label and y ticks like the line.

    Returns None.
    """
    # Pass the colour by keyword. As a third positional argument it is read
    # as a format string, which only works for string colours.
    ax.plot(x, y, color = color, linestyle = linestyle, alpha = alpha)
    ax.set_xlabel(xlabel)
    if color_ticks:
        ax.set_ylabel(ylabel, color = color)
        ax.tick_params('y', colors = color)
    else:
        ax.set_ylabel(ylabel)

fig, ax = plt.subplots(figsize=(11.7, 8.27))

# Left y axis: mortality rate over time, in blue.
timeseries_plot(ax, wiki_table.index, wiki_table['Rate (%)'],
                xlabel='Year', ylabel='Mortality rate (%)', color='blue')

# Right y axis on the same x axis: births over time, dashed red at half
# opacity.
ax2 = ax.twinx()
timeseries_plot(ax2, wiki_table.index, wiki_table['Births'],
                xlabel='Year', ylabel='Births', color='red',
                linestyle='--', alpha=0.5)

ax.set_title('Mortality rate 1841-1849 at Vienna General Hospital')
plt.show()

from datetime import datetime, timedelta

# Row(s) of the month in which the policy was instituted.
wiki_table.loc["May 1847"]

# Key dates, each parsed from '<month name> <year>' into a date object.
instituted_date, effective_impact_date, strict_control = (
    datetime.strptime(label, '%B %Y').date()
    for label in ('May 1847', 'June 1847', 'January 1848')
)

# Split the data at June 1847. Both slices are written with 'June 1847'
# as a bound, so that month is selected by each of them.
handwashing_month = 'June 1847'
start_handwashing = wiki_table.loc[handwashing_month]
pre_handwashing = wiki_table.loc[:handwashing_month]
post_handwashing = wiki_table.loc[handwashing_month:]

def get_rate_and_births(month):
    """Return ``(rate, births)`` of the first row whose 'Month' equals ``month``.

    The values are read from the 'Rate (%)' and 'Births' columns of the
    module-level ``wiki_table``. They are looked up by column label, not
    by position, so the result does not depend on the column order.

    Raises IndexError when no row matches ``month``.
    """
    matching_rows = wiki_table[wiki_table['Month'] == month]
    first_match = matching_rows.iloc[0]
    return first_match['Rate (%)'], first_match['Births']


# Rate (%) and Births on instituted_date: May 1847
rate_instituted_date, births_instituted_date = get_rate_and_births('May 1847')
# Rate (%) and Births on strict_control: January 1848
rate_strict_control, births_strict_control = get_rate_and_births('January 1848')

# Newer Matplotlib releases ship this style sheet as 'seaborn-v0_8'. Use
# that name when it is available and fall back to the old 'seaborn' name
# otherwise, so the cell runs on either.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

fig, ax2 = plt.subplots(2, 1, figsize=(15, 8))

# Top panel: Rate (%) before (red) and after (green) handwashing.
ax2[0].plot(pre_handwashing.index, pre_handwashing['Rate (%)'],
            label = 'Before handwashing', color = 'red')
ax2[0].plot(post_handwashing.index, post_handwashing['Rate (%)'],
            label = 'After handwashing', color = 'green')

# Bottom panel: Births before (red) and after (green) handwashing.
ax2[1].plot(pre_handwashing.index, pre_handwashing.Births,
            label = 'Before handwashing', color = 'red')
ax2[1].plot(post_handwashing.index, post_handwashing.Births,
            label = 'After handwashing', color = 'green')

# Axis labels and title.
ax2[0].set_ylabel('Proportion of deaths per births (%)')
ax2[1].set_ylabel('Births')
ax2[1].set_xlabel('Year')
ax2[0].set_title('Mortality rate & Births in 1841-1849 at Vienna General Hospital')

# Point an arrow at the month the policy was instituted in both panels.
ax2[0].annotate('Handwashing \n policy \n instituted',
                xy = (instituted_date, rate_instituted_date),
                xytext = (pd.to_datetime('1847-10-01'), 10),
                arrowprops = {'arrowstyle':'simple', 'color':'k'})
ax2[1].annotate('Handwashing \n policy \n instituted',
                xy = (instituted_date, births_instituted_date),
                xytext = (pd.to_datetime('1847-4-01'), 325),
                arrowprops = {'arrowstyle':'simple', 'color':'k'})

ax2[0].legend()
ax2[1].legend()
plt.show()

post_rate = post_handwashing['Rate (%)']
pre_rate = pre_handwashing['Rate (%)']
# Keep the two samples disjoint: when pre_handwashing and post_handwashing
# share their boundary month, count that month as "post" only. Otherwise it
# would enter both means and both bootstrap samples.
pre_rate = pre_rate[pre_rate.index < post_rate.index.min()]

diff_in_mean = post_rate.mean() - pre_rate.mean()
diff_in_std = post_rate.std() - pre_rate.std()

print(f'Mean of mortality rate pre-handwashing: {round(pre_rate.mean(), 2)}%')
print(f'Mean of mortality rate post-handwashing: {round(post_rate.mean(), 2)}%')
print(f'Difference in mean: {round(diff_in_mean, 2)}%')
print('\n')
# Series.std() is the standard deviation of the sample, not the standard
# error of its mean, so the printed labels say "standard deviation".
print(f'Standard deviation of mortality rate pre-handwashing: {round(pre_rate.std(), 2)}%')
print(f'Standard deviation of mortality rate post-handwashing: {round(post_rate.std(), 2)}%')
print(f'Difference in standard deviation: {round(diff_in_std, 2)}%')

def func_diff_bootstrap_replicate(pre_series, post_series, func):
    '''Generate one bootstrap replicate of ``func(post) - func(pre)``.

    Each pandas Series is resampled with replacement to its own length
    (``frac=1``), ``func`` is applied to both resamples, and the
    difference of the two results is returned.
    '''
    pre_replicate = pre_series.sample(frac=1, replace=True)
    post_replicate = post_series.sample(frac=1, replace=True)
    return func(post_replicate) - func(pre_replicate)


def conf_int(lst, conf=90, result='series'):
    '''Compute the central ``conf`` % interval of the values in ``lst``.

    Parameters
    ----------
    lst : sequence of numbers, for example bootstrap replicates.
    conf : confidence level in percent, strictly between 0 and 100.
        Any level is accepted, not only 90, 95, 97.5 and 99.
    result : 'series' returns a pandas Series indexed by the two quantile
        fractions; 'array' returns a numpy array with the two bounds.

    Raises
    ------
    ValueError
        If ``conf`` is not strictly between 0 and 100 or ``result`` is
        neither 'series' nor 'array'. These cases used to return None.
    '''
    if not 0 < conf < 100:
        raise ValueError(f'conf must be between 0 and 100 (exclusive), got {conf!r}')

    # Percentiles that cut off the same tail on each side, e.g. 5 and 95
    # for conf=90.
    lower_pct = (100 - conf) / 2
    upper_pct = (100 + conf) / 2

    if result == 'series':
        return pd.Series(lst).quantile([lower_pct / 100, upper_pct / 100])
    if result == 'array':
        return np.percentile(lst, [lower_pct, upper_pct])
    raise ValueError(f"result must be 'series' or 'array', got {result!r}")

# 2000 bootstrap replicates of the difference in mean mortality rate
# (post minus pre).
bs_replicates = [
    func_diff_bootstrap_replicate(pre_rate, post_rate, np.mean)
    for _ in range(2000)
]

# 90% interval of the replicates, returned as a pandas Series.
conf_interval = conf_int(bs_replicates, conf=90, result='series')
conf_interval

# 2000 bootstrap replicates of the difference in np.std of the mortality
# rate (post minus pre).
# NOTE(review): np.std presumably uses ddof=0 here, whereas Series.std()
# defaults to ddof=1 - confirm which of the two is intended.
bs_replicates_2 = [
    func_diff_bootstrap_replicate(pre_rate, post_rate, np.std)
    for _ in range(2000)
]

# 90% interval of the replicates; this rebinds `conf_interval`.
conf_interval = conf_int(bs_replicates_2, conf=90, result='series')
conf_interval

# The split point lies 31 days after `strict_control`.
one_month = timedelta(days=31)
effective_control_date = strict_control + one_month

# Split the post-handwashing data at that date; it is used as a bound of
# both slices.
pre_control = post_handwashing.loc[:effective_control_date]
post_control = post_handwashing.loc[effective_control_date:]

fig, ax3 = plt.subplots(2, 1, figsize=(15, 8))

# Top panel shows 'Rate (%)', bottom panel shows 'Births'; each is drawn in
# red before strict control and in green after it.
for panel, column in ((ax3[0], 'Rate (%)'), (ax3[1], 'Births')):
    panel.plot(pre_control.index, pre_control[column],
               color='red', label='Before strict control')
    panel.plot(post_control.index, post_control[column],
               color='green', label='After strict control')

# Title and axis labels.
ax3[0].set_title('Mortality rate & Births in 1847-1849 (post-handwashing) at Vienna General Hospital')
ax3[0].set_ylabel('Proportion of death per birth (%)')
ax3[1].set_ylabel('Births')
ax3[1].set_xlabel('Months')

# Point an arrow at the strict-control month in both panels.
arrow_style = {'arrowstyle':'simple', 'color':'k'}
ax3[0].annotate('Strict controls \n on negligent \n students',
                xy=(strict_control, rate_strict_control),
                xytext=(pd.to_datetime('1848-03-01'), 3),
                arrowprops=dict(arrow_style))
ax3[1].annotate('Strict controls \n on negligent \n students',
                xy=(strict_control, births_strict_control),
                xytext=(pd.to_datetime('1847-12-01'), 320),
                arrowprops=dict(arrow_style))

for panel in ax3:
    panel.legend()
plt.show()

# Post-handwashing rows whose 'Rate (%)' equals 0.
is_zero_rate = post_handwashing['Rate (%)'] == 0
zero_rate = post_handwashing.loc[is_zero_rate]
print(zero_rate)