EDA: Is handwashing really effective?

import requests from bs4 import BeautifulSoup

url = 'https://en.wikipedia.org/wiki/Historical_mortality_rates_of_puerperal_fever'

# Fetch the page. The timeout keeps the request from hanging forever, and
# raise_for_status() stops here on an HTTP error instead of letting an error
# page be parsed as if it were the article.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the returned HTML with the lxml parser.
soup = BeautifulSoup(page.text, 'lxml')
soup.title

# Locate the data table by its class attribute.
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # Fail here with a clear message rather than with an AttributeError on
    # None in the next statement.
    raise ValueError("no 'wikitable sortable' table found in the page")

# Header names: stripped text of every <th>; only the first 5 columns are
# of interest.
table_header = table.find_all('th')
table_header_names = [th.text.strip() for th in table_header][:5]
table_header_names

# Table contents: the list of <tr> elements, excluding the header row.
table_content = table.find_all('tr')[1:]

# Dictionary with one (initially empty) entry per column name.
table_dict = dict.fromkeys(table_header_names, '')
print(table_dict)

{'Year': '', 'Month': '', 'Births': '', 'Deaths': '', 'Rate (%)': ''}

# One list per column of interest; each row of the table appends one value.
year = []
month = []
births = []
deaths = []
rate = []


def appending_td_to_list(td, lst, index):
    """Append the first text node of cell ``td[index]`` to ``lst``.

    Parameters
    ----------
    td : sequence of tags
        The ``<td>`` cells of one table row (result of ``find_all('td')``).
    lst : list
        Column list that receives the value; modified in place.
    index : int
        Position of the cell within the row.

    Returns
    -------
    None
        The value is appended to ``lst``. ``None`` is appended when the
        cell contains no text node.
    """
    # ``string=`` is the current name of the argument formerly spelled ``text=``.
    lst.append(td[index].find(string=True))


for tr in table_content:
    # All <td> cells of this row.
    td = tr.find_all('td')
    if len(td) < 5:
        # Not a full data row (fewer than the 5 cells read below); skip it
        # instead of failing with an IndexError.
        continue
    appending_td_to_list(td, year, 0)
    appending_td_to_list(td, month, 1)
    appending_td_to_list(td, births, 2)
    appending_td_to_list(td, deaths, 3)
    appending_td_to_list(td, rate, 4)

# Attach each collected column list to its header name in the dictionary.
table_dict.update({
    'Year': year,
    'Month': month,
    'Births': births,
    'Deaths': deaths,
    'Rate (%)': rate,
})

import pandas as pd import numpy as np pd.options.mode.chained_assignment = None # default='warn'

# Build the DataFrame from the scraped columns.
wiki_table = pd.DataFrame(table_dict)
# Look at 5 randomly sampled rows.
wiki_table.sample(5)

# Strip the '\n' characters from every column, one column at a time.
for column in table_header_names:
    without_newlines = wiki_table[column].str.replace('\n', '')
    wiki_table[column] = without_newlines
wiki_table.sample(5)

# isnull() reports no missing values ...
print(wiki_table.isnull().sum())

# ... because missing data is stored as the literal string 'na'.
# Remember which rows carry the placeholder before it is replaced.
_na_rows = wiki_table.Births.str.match('na', na=False)

# Replace 'na' with a real NaN and assign the result back. Assigning back
# avoids the chained in-place replace on a selection (which may act on a
# copy) and the concat + drop_duplicates round trip, so row order and the
# index are left untouched.
lst_name = ['Births', 'Deaths', 'Rate (%)']
wiki_table[lst_name] = wiki_table[lst_name].replace('na', np.nan)

# The rows that held the placeholder, now containing NaN.
nan_but_not_nan = wiki_table[_na_rows]
nan_but_not_nan

Year        0
Month       0
Births      0
Deaths      0
Rate (%)    0
dtype: int64

# running this returns ValueError: time data 'Mar 1843' does not match format '%B %Y' (match) # pd.to_datetime(wiki_table.Month, format='%B %Y')

# Show every row whose Month starts with 'Mar'; one of them is abbreviated.
print(wiki_table.loc[wiki_table['Month'].str.match('Mar')])

# Spell out 'Mar 1843' as 'March 1843' so the column parses with '%B %Y'.
wiki_table['Month'] = wiki_table['Month'].replace(
    'Mar 1843', 'March 1843', regex=False
)

print('\n')
# Same selection again, to confirm the replacement.
print(wiki_table.loc[wiki_table['Month'].str.match('Mar')])

   Year       Month Births Deaths Rate (%)
2        March 1841    277     12      4.3
14       March 1842    264     27     10.2
26         Mar 1843    266     33     12.4
38       March 1844    276     47     17.0
50       March 1845    292     13      4.5
62       March 1846    311     48     15.4
74       March 1847    305     11      3.6
86       March 1848    276      0      0.0
98       March 1849    406     20      4.9


   Year       Month Births Deaths Rate (%)
2        March 1841    277     12      4.3
14       March 1842    264     27     10.2
26       March 1843    266     33     12.4
38       March 1844    276     47     17.0
50       March 1845    292     13      4.5
62       March 1846    311     48     15.4
74       March 1847    305     11      3.6
86       March 1848    276      0      0.0
98       March 1849    406     20      4.9

# Parse the 'Month' strings (full month name + year) into datetimes.
wiki_table['dates'] = pd.to_datetime(wiki_table['Month'], format='%B %Y')
wiki_table.dtypes

# Use the parsed dates as the index and put the rows in chronological order.
wiki_table = wiki_table.set_index('dates').sort_index()
wiki_table.head()

# Forward-fill NaN values from the previous row. DataFrame.ffill() replaces
# the deprecated fillna(method='ffill') spelling.
wiki_table.ffill(inplace=True)

# Verify that the NaN is indeed filled.
wiki_table[wiki_table.Month == 'December 1841']

# inspect
wiki_table.dtypes

# Convert 'Births' and 'Deaths' to integers. pd.to_numeric is applied per
# column (the default axis), so each column gets a single dtype, rather than
# being converted and downcast one row at a time.
to_number_cols = ['Births', 'Deaths']
wiki_table[to_number_cols] = wiki_table[to_number_cols].apply(
    pd.to_numeric, errors='coerce', downcast='signed'
)
wiki_table.dtypes

# Convert 'Rate (%)' to float. Unparseable values become NaN (the same
# policy as for the columns above) instead of the conversion being silently
# skipped and the column left as strings.
wiki_table['Rate (%)'] = pd.to_numeric(
    wiki_table['Rate (%)'], errors='coerce'
).astype('float')
wiki_table['Rate (%)'].dtype

import matplotlib.pyplot as plt

def timeseries_plot(ax, x, y, xlabel='some X label', ylabel='some Y label',
                    color='black', linestyle='solid', alpha=1,
                    color_ticks=True):
    """Draw ``y`` against ``x`` as a line on ``ax`` and label both axes.

    Parameters
    ----------
    ax : object
        Axes-like object providing ``plot``, ``set_xlabel``, ``set_ylabel``
        and ``tick_params``.
    x, y : sequence
        Data forwarded unchanged to ``ax.plot``.
    xlabel, ylabel : str
        Axis labels.
    color : color spec
        Line colour; also used for the y label and y ticks when
        ``color_ticks`` is true.
    linestyle : str
        Line style forwarded to ``ax.plot``.
    alpha : float
        Line transparency forwarded to ``ax.plot``.
    color_ticks : bool
        When true, the y label and the y tick marks are drawn in ``color``.
    """
    # Pass the colour by keyword: given positionally it is read as a format
    # string, which only works for plain colour names.
    ax.plot(x, y, color=color, linestyle=linestyle, alpha=alpha)
    ax.set_xlabel(xlabel)
    if color_ticks:
        ax.set_ylabel(ylabel, color=color)
        ax.tick_params('y', colors=color)
    else:
        ax.set_ylabel(ylabel)

fig, ax = plt.subplots(figsize=(11.7, 8.27))

# Mortality rate over time on the primary y-axis.
timeseries_plot(
    ax, wiki_table.index, wiki_table['Rate (%)'],
    xlabel='Year', ylabel='Mortality rate (%)', color='blue',
)

# Births over time on a second y-axis that shares the same x-axis.
ax2 = ax.twinx()
timeseries_plot(
    ax2, wiki_table.index, wiki_table['Births'],
    xlabel='Year', ylabel='Births', color='red', linestyle='--', alpha=0.5,
)

ax.set_title('Mortality rate 1841-1849 at Vienna General Hospital')
plt.show()

# Row(s) for the month in which the handwashing policy was instituted.
wiki_table.loc["May 1847"]

from datetime import datetime, timedelta

# Key dates, each parsed from 'Month Year' to a date on the 1st of that month.
instituted_date = datetime.strptime('May 1847', '%B %Y').date()
effective_impact_date = datetime.strptime('June 1847', '%B %Y').date()
strict_control = datetime.strptime('January 1848', '%B %Y').date()

# Row(s) of the first month counted as "after handwashing".
start_handwashing = wiki_table.loc['June 1847']

# Split into two disjoint periods at June 1847. Label slices such as
# .loc[:'June 1847'] and .loc['June 1847':] are inclusive at both ends, so
# June 1847 would be counted in both periods; boolean masks keep it only in
# the post-handwashing period.
_handwashing_start = pd.Timestamp(effective_impact_date)
pre_handwashing = wiki_table.loc[wiki_table.index < _handwashing_start]
post_handwashing = wiki_table.loc[wiki_table.index >= _handwashing_start]

def get_rate_and_births(month):
    """Return ``(rate, births)`` for the first row whose 'Month' equals ``month``.

    Parameters
    ----------
    month : str
        Value to look up in the 'Month' column of ``wiki_table``,
        e.g. ``'May 1847'``.

    Returns
    -------
    tuple
        The 'Rate (%)' and 'Births' values of the first matching row.

    Raises
    ------
    IndexError
        If no row of ``wiki_table`` has that month.
    """
    row = wiki_table[wiki_table.Month == month]
    if row.empty:
        raise IndexError(f'no row found for month {month!r}')
    # Read the columns by name so the result does not depend on column order.
    first = row.iloc[0]
    return first['Rate (%)'], first['Births']


# Rate (%) and Births on instituted_date: May 1847
rate_instituted_date, births_instituted_date = get_rate_and_births('May 1847')

# Rate (%) and Births on strict_control: January 1848
rate_strict_control, births_strict_control = get_rate_and_births('January 1848')

# The bundled 'seaborn' style is deprecated in favour of 'seaborn-v0_8';
# use the new name when this matplotlib provides it, else the old one.
_seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_seaborn_style)

fig, ax2 = plt.subplots(2, 1, figsize=(15, 8))

# Rate (%) before and after handwashing (top panel).
ax2[0].plot(pre_handwashing.index, pre_handwashing['Rate (%)'],
            label='Before handwashing', color='red')
ax2[0].plot(post_handwashing.index, post_handwashing['Rate (%)'],
            label='After handwashing', color='green')

# Births before and after handwashing (bottom panel).
ax2[1].plot(pre_handwashing.index, pre_handwashing.Births,
            label='Before handwashing', color='red')
ax2[1].plot(post_handwashing.index, post_handwashing.Births,
            label='After handwashing', color='green')

# Axis labels.
ax2[0].set_ylabel('Proportion of deaths per births (%)')
ax2[1].set_ylabel('Births')
ax2[1].set_xlabel('Year')

# Title.
ax2[0].set_title('Mortality rate & Births in 1841-1849 at Vienna General Hospital')

# Point at the month the policy was instituted, in both panels.
ax2[0].annotate('Handwashing \n policy \n instituted',
                xy=(instituted_date, rate_instituted_date),
                xytext=(pd.to_datetime('1847-10-01'), 10),
                arrowprops={'arrowstyle': 'simple', 'color': 'k'})
ax2[1].annotate('Handwashing \n policy \n instituted',
                xy=(instituted_date, births_instituted_date),
                xytext=(pd.to_datetime('1847-4-01'), 325),
                arrowprops={'arrowstyle': 'simple', 'color': 'k'})

# Legends.
ax2[0].legend()
ax2[1].legend()
plt.show()

# Monthly mortality rates before and after handwashing.
pre_rate = pre_handwashing['Rate (%)']
post_rate = post_handwashing['Rate (%)']

# Differences (post minus pre) in mean and in standard deviation.
diff_in_mean = post_rate.mean() - pre_rate.mean()
diff_in_std = post_rate.std() - pre_rate.std()

print(f'Mean of mortality rate pre-handwashing: {round(pre_rate.mean(), 2)}%')
print(f'Mean of mortality rate post-handwashing: {round(post_rate.mean(), 2)}%')
print(f'Difference in mean: {round(diff_in_mean, 2)}%')
print('\n')
# .std() is the standard deviation of the series, so the output is labelled
# as such (it was previously mislabelled "Standard error").
print(f'Standard deviation of mortality rate pre-handwashing: {round(pre_rate.std(), 2)}%')
print(f'Standard deviation of mortality rate post-handwashing: {round(post_rate.std(), 2)}%')
print(f'Difference in standard deviation: {round(diff_in_std, 2)}%')

Mean of mortality rate pre-handwashing: 10.55%
Mean of mortality rate post-handwashing: 2.11%
Difference in mean: -8.44%


Standard error of mortality rate pre-handwashing: 7.18%
Standard error of mortality rate post-handwashing: 1.52%
Difference in standard error: -5.66%

def func_diff_bootstrap_replicate(pre_series, post_series, func):
    """Return one bootstrap replicate of ``func(post) - func(pre)``.

    Each pandas Series is resampled with replacement to its own length
    (``sample(frac=1, replace=True)``) and ``func`` is applied to each
    resample.

    Parameters
    ----------
    pre_series, post_series : pandas.Series
        The two samples to compare.
    func : callable
        Statistic applied to each resample, e.g. ``np.mean``.

    Returns
    -------
    The difference ``func(post_resample) - func(pre_resample)``.
    """
    pre_replicate = pre_series.sample(frac=1, replace=True)
    post_replicate = post_series.sample(frac=1, replace=True)
    return func(post_replicate) - func(pre_replicate)


def conf_int(lst, conf=90, result='series'):
    """Return the central ``conf`` % percentile interval of ``lst``.

    Parameters
    ----------
    lst : sequence of numbers
        Values (e.g. bootstrap replicates) to summarise.
    conf : float
        Confidence level in percent, strictly between 0 and 100. The tails
        are symmetric: ``(100 - conf) / 2`` percent is cut from each side.
    result : {'series', 'array'}
        ``'series'`` returns a pandas Series indexed by the two quantile
        fractions; ``'array'`` returns a numpy array ``[lower, upper]``.

    Raises
    ------
    ValueError
        If ``conf`` is not in (0, 100) or ``result`` is not one of the two
        supported kinds (previously such calls silently returned ``None``).
    """
    if not 0 < conf < 100:
        raise ValueError(f'conf must be between 0 and 100 (exclusive), got {conf!r}')

    # Percentile bounds, e.g. conf=90 -> 5 and 95.
    lower_pct = (100 - conf) / 2
    upper_pct = 100 - lower_pct

    if result == 'series':
        return pd.Series(lst).quantile([lower_pct / 100, upper_pct / 100])
    if result == 'array':
        return np.percentile(lst, [lower_pct, upper_pct])
    raise ValueError(f"result must be 'series' or 'array', got {result!r}")

# 2000 bootstrap replicates of the difference in mean mortality rate
# (post minus pre), and their 90% interval.
bs_replicates = [
    func_diff_bootstrap_replicate(pre_rate, post_rate, np.mean)
    for _ in range(2000)
]
conf_interval = conf_int(bs_replicates, conf=90, result='series')
conf_interval

# Same procedure for the difference in np.std.
bs_replicates_2 = [
    func_diff_bootstrap_replicate(pre_rate, post_rate, np.std)
    for _ in range(2000)
]
conf_interval = conf_int(bs_replicates_2, conf=90, result='series')
conf_interval

# The strict controls are treated as effective 31 days after their start.
effective_control_date = strict_control + timedelta(days=31)

# Split the post-handwashing data into two disjoint periods at that date.
# Slicing [:date] and [date:] is inclusive at both ends, which would put the
# boundary month in both frames. Converting to a Timestamp also avoids
# indexing the datetime index with a plain datetime.date.
_control_cutoff = pd.Timestamp(effective_control_date)
pre_control = post_handwashing.loc[post_handwashing.index < _control_cutoff]
post_control = post_handwashing.loc[post_handwashing.index >= _control_cutoff]

fig, ax3 = plt.subplots(2, 1, figsize=(15, 8))

# Top panel: Rate (%); bottom panel: Births. Each shows the periods before
# (red) and after (green) the strict controls.
for panel, column in zip(ax3, ('Rate (%)', 'Births')):
    panel.plot(pre_control.index, pre_control[column],
               color='red', label='Before strict control')
    panel.plot(post_control.index, post_control[column],
               color='green', label='After strict control')

# Title and axis labels.
ax3[0].set_title('Mortality rate & Births in 1847-1849 (post-handwashing) at Vienna General Hospital')
ax3[0].set_ylabel('Proportion of death per birth (%)')
ax3[1].set_ylabel('Births')
ax3[1].set_xlabel('Months')

# Point at the start of the strict controls in both panels.
arrow_style = {'arrowstyle': 'simple', 'color': 'k'}
ax3[0].annotate('Strict controls \n on negligent \n students',
                xy=(strict_control, rate_strict_control),
                xytext=(pd.to_datetime('1848-03-01'), 3),
                arrowprops=dict(arrow_style))
ax3[1].annotate('Strict controls \n on negligent \n students',
                xy=(strict_control, births_strict_control),
                xytext=(pd.to_datetime('1847-12-01'), 320),
                arrowprops=dict(arrow_style))

ax3[0].legend()
ax3[1].legend()
plt.show()

# Post-handwashing months with a mortality rate of exactly zero.
zero_rate = post_handwashing.loc[post_handwashing['Rate (%)'] == 0]
print(zero_rate)

           Year        Month  Births  Deaths  Rate (%)
dates                                                 
1848-03-01        March 1848     276       0       0.0
1848-08-01       August 1848     261       0       0.0