import requests
from bs4 import BeautifulSoup
url = 'https://en.wikipedia.org/wiki/Historical_mortality_rates_of_puerperal_fever'

# Download the article. The timeout stops the script from hanging forever on
# a stalled connection, and raise_for_status() turns an HTTP error page into
# an immediate exception instead of letting it be parsed as if it were data.
page = requests.get(url, timeout=30)
page.raise_for_status()

# Parse the HTML with the lxml parser.
soup = BeautifulSoup(page.text, 'lxml')
soup.title  # notebook echo: shows the page title when run in a cell

# Locate the data table; fail with a clear message if the page layout changed.
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    raise ValueError(f"no 'wikitable sortable' table found at {url}")

# Header names, with the last column dropped (only the first 5 are used).
table_header = table.find_all('th')
table_header_names = [th.text.strip() for th in table_header]
table_header_names.pop()
table_header_names  # notebook echo

# All <tr> rows of the table, excluding the header row.
table_content = table.find_all('tr')
table_content = table_content[1:]

# Dictionary with one (initially empty) entry per column name; the column
# data is filled in once the rows have been read.
table_dict = {name: '' for name in table_header_names}
print(table_dict)
# Output: {'Year': '', 'Month': '', 'Births': '', 'Deaths': '', 'Rate (%)': ''}
# one independent, initially empty list per table column
year, month, births, deaths, rate = [], [], [], [], []
def appending_td_to_list(td, lst, index):
    """Append the first text node of one table cell to a column list.

    Args:
        td: sequence of cell elements for one row (the result of
            ``tr.find_all('td')``).
        lst: list that collects the values of one column; mutated in place.
        index: position within ``td`` of the cell to read.

    Returns:
        None. The function is called for its side effect on ``lst``.

    Raises:
        IndexError: if ``td`` has no element at ``index``.
    """
    # `string=True` matches the first text node; it is the current name of
    # the deprecated `text=True` argument.
    lst.append(td[index].find(string=True))
# Column lists in the same order as the cells of each table row.
column_lists = (year, month, births, deaths, rate)

for tr in table_content:
    # generates a list of <td> </td>
    td = tr.find_all('td')
    # Skip rows that do not carry all five data cells (for example a row made
    # only of <th> cells); indexing them would raise IndexError and leave the
    # column lists with unequal lengths.
    if len(td) < len(column_lists):
        continue
    for index, column in enumerate(column_lists):
        appending_td_to_list(td, column, index)

# pass in columns into dictionary
table_dict['Year'] = year
table_dict['Month'] = month
table_dict['Births'] = births
table_dict['Deaths'] = deaths
table_dict['Rate (%)'] = rate
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None # default='warn'
# from wiki table to DataFrame
wiki_table = pd.DataFrame(table_dict)
# inspect 5 random rows
wiki_table.sample(5)
# removing '\n' in all columns using str.replace()
for name in table_header_names:
    wiki_table[name] = wiki_table[name].str.replace('\n', '')
wiki_table.sample(5)
# no real NaN is reported here ...
print(wiki_table.isnull().sum())
# ... because missing values are stored as the literal string 'na'.
# Take an explicit copy of those rows so they can be modified safely.
nan_but_not_nan = wiki_table[wiki_table.Births.str.match('na')].copy()
# replace 'na' with NaN
lst_name = ['Births', 'Deaths', 'Rate (%)']
# Assign the replaced columns back. An in-place replace on the temporary
# returned by `.loc[:, name]` is not guaranteed to reach the frame, and never
# does when pandas copy-on-write is active.
nan_but_not_nan[lst_name] = nan_but_not_nan[lst_name].replace('na', np.nan)
# taking Set Union of wiki_table and nan_but_not_nan ---> concat + drop_duplicates
# (keep='last' keeps the corrected rows appended at the end and drops the
# original 'na' rows that share the same Month)
wiki_table = pd.concat([wiki_table, nan_but_not_nan], ignore_index=True)
wiki_table.drop_duplicates(subset=['Month'], keep='last', inplace=True)
wiki_table.tail()
# Output of print(wiki_table.isnull().sum()):
# Year 0
# Month 0
# Births 0
# Deaths 0
# Rate (%) 0
# dtype: int64
# Parsing the Month column directly fails:
#   pd.to_datetime(wiki_table.Month, format='%B %Y')
#   ValueError: time data 'Mar 1843' does not match format '%B %Y' (match)
# Show every row whose Month starts with 'Mar' to locate the abbreviated entry.
starts_with_mar = wiki_table['Month'].str.match('Mar')
print(wiki_table[starts_with_mar])
# Spell the abbreviated month out ('Mar 1843' -> 'March 1843') so that the
# whole column matches the '%B %Y' format.
wiki_table['Month'] = wiki_table['Month'].replace('Mar 1843', 'March 1843', regex=False)
print('\n')
# Show the same rows again to confirm the replacement.
print(wiki_table[wiki_table['Month'].str.match('Mar')])
# Output (before the replacement):
# Year Month Births Deaths Rate (%)
# 2 March 1841 277 12 4.3
# 14 March 1842 264 27 10.2
# 26 Mar 1843 266 33 12.4
# 38 March 1844 276 47 17.0
# 50 March 1845 292 13 4.5
# 62 March 1846 311 48 15.4
# 74 March 1847 305 11 3.6
# 86 March 1848 276 0 0.0
# 98 March 1849 406 20 4.9
# Output (after the replacement):
# Year Month Births Deaths Rate (%)
# 2 March 1841 277 12 4.3
# 14 March 1842 264 27 10.2
# 26 March 1843 266 33 12.4
# 38 March 1844 276 47 17.0
# 50 March 1845 292 13 4.5
# 62 March 1846 311 48 15.4
# 74 March 1847 305 11 3.6
# 86 March 1848 276 0 0.0
# 98 March 1849 406 20 4.9
# Parse the 'Month' strings (e.g. 'March 1843') into timestamps.
wiki_table['dates'] = pd.to_datetime(wiki_table.Month, format='%B %Y')
wiki_table.dtypes
# set dates as index and put the rows in chronological order
wiki_table.set_index('dates', inplace=True)
wiki_table.sort_index(inplace=True)
wiki_table.head()
# Forward-fill NaN with the previous row's values. `ffill()` replaces the
# deprecated `fillna(method='ffill')` spelling and does the same thing.
wiki_table.ffill(inplace=True)
# verify that NaN is indeed filled
wiki_table[wiki_table.Month == 'December 1841']
# inspect
wiki_table.dtypes
# convert 'Births', 'Deaths' to int type
to_number_cols = ['Births', 'Deaths']
wiki_table[to_number_cols] = wiki_table[to_number_cols].apply(pd.to_numeric, errors='coerce', downcast='signed', axis=1)
wiki_table.dtypes
# convert 'Rate (%)' to float type
# NOTE(review): errors='ignore' leaves the column unchanged if any value
# cannot be converted, so check the dtype echoed below.
wiki_table['Rate (%)'] = wiki_table['Rate (%)'].astype('float', errors='ignore')
wiki_table['Rate (%)'].dtype
import matplotlib.pyplot as plt
# useful function to plot a timeseries graph
def timeseries_plot(ax, x, y, xlabel='some X label', ylabel='some Y label',
                    color='black', linestyle='solid', alpha=1, color_ticks=True):
    """Draw one time series on ``ax`` and label its axes.

    Args:
        ax: axes-like object providing ``plot``, ``set_xlabel``,
            ``set_ylabel`` and ``tick_params``.
        x: x values (the time axis).
        y: y values.
        xlabel: label for the x axis.
        ylabel: label for the y axis.
        color: line colour; also used for the y label and y ticks when
            ``color_ticks`` is true.
        linestyle: line style forwarded to ``ax.plot``.
        alpha: line opacity forwarded to ``ax.plot``.
        color_ticks: when true, colour the y label and y tick marks like the
            line (useful to tell twin axes apart).
    """
    # Pass the colour by keyword: as a third positional argument it would be
    # interpreted as a format string, which rejects non-string colours such
    # as RGB tuples.
    ax.plot(x, y, color=color, linestyle=linestyle, alpha=alpha)
    ax.set_xlabel(xlabel)
    if color_ticks:
        ax.set_ylabel(ylabel, color=color)
        ax.tick_params('y', colors=color)
    else:
        ax.set_ylabel(ylabel)
# Overview figure: mortality rate and births over the whole period, drawn on
# two y axes that share one x axis.
fig, ax = plt.subplots(figsize=(11.7, 8.27))
# plot rate vs year
timeseries_plot(ax, wiki_table.index, wiki_table['Rate (%)'], 'Year', 'Mortality rate (%)', 'blue')
# plot Births vs year
# (twinx() creates a second y axis on the same x axis for the births curve)
ax2 = ax.twinx()
timeseries_plot(ax2, wiki_table.index, wiki_table.Births, 'Year', 'Births', 'red', linestyle = '--', alpha = 0.5)
ax.set_title('Mortality rate 1841-1849 at Vienna General Hospital')
plt.show()
# notebook echo: look up the row(s) for May 1847 via the datetime index
wiki_table.loc["May 1847"]
from datetime import datetime, timedelta
# Key dates as datetime.date objects; '%B %Y' carries no day, so each one
# falls on the first day of its month.
instituted_date = datetime.strptime('May 1847', '%B %Y').date()
effective_impact_date = datetime.strptime('June 1847', '%B %Y').date()
strict_control = datetime.strptime('January 1848', '%B %Y').date()
# Split the data into the periods before and after June 1847.
# NOTE(review): label-based slices include both end points, so the June 1847
# row appears to land in BOTH pre_handwashing and post_handwashing - confirm
# this overlap is intended before comparing statistics of the two periods.
start_handwashing = wiki_table.loc['June 1847']
pre_handwashing = wiki_table.loc[:'June 1847']
post_handwashing = wiki_table.loc['June 1847':]
def get_rate_and_births(month, table=None):
    """Return the mortality rate and number of births for one month.

    Args:
        month: value to look up in the 'Month' column, e.g. ``'May 1847'``.
        table: DataFrame with 'Month', 'Births' and 'Rate (%)' columns.
            Defaults to the module-level ``wiki_table``.

    Returns:
        Tuple ``(rate, births)`` taken from the first row whose 'Month'
        equals ``month``.

    Raises:
        IndexError: if no row matches ``month``.
    """
    if table is None:
        table = wiki_table
    row = table[table['Month'] == month]
    if row.empty:
        raise IndexError(f'no row found for month {month!r}')
    first = row.iloc[0]
    # Look the values up by column name; positional indices would silently
    # return the wrong field if the column order ever changed.
    return first['Rate (%)'], first['Births']
# get Rate (%) and Births on instituted_date: May 1847
rate_instituted_date, births_instituted_date = get_rate_and_births('May 1847')
# get Rate (%) and Births on strict_control: January 1848
rate_strict_control, births_strict_control = get_rate_and_births('January 1848')

# The seaborn style sheets were renamed to 'seaborn-v0_8*' and the old names
# later removed, so pick whichever name this Matplotlib version provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# two stacked panels: mortality rate on top, births below
fig, ax2 = plt.subplots(2, 1, figsize=(15, 8))
# plot Rate (%) before and after handwashing
ax2[0].plot(pre_handwashing.index, pre_handwashing['Rate (%)'], label = 'Before handwashing', color = 'red')
ax2[0].plot(post_handwashing.index, post_handwashing['Rate (%)'], label = 'After handwashing', color = 'green')
# plot Births before and after handwashing
ax2[1].plot(pre_handwashing.index, pre_handwashing.Births, label = 'Before handwashing', color = 'red')
ax2[1].plot(post_handwashing.index, post_handwashing.Births, label = 'After handwashing', color = 'green')
# set labels
ax2[0].set_ylabel('Proportion of deaths per births (%)')
ax2[1].set_ylabel('Births')
ax2[1].set_xlabel('Year')
# set title
ax2[0].set_title('Mortality rate & Births in 1841-1849 at Vienna General Hospital')
# annotate both plots: the arrow points at the May 1847 data point
ax2[0].annotate('Handwashing \n policy \n instituted',
                xy = (instituted_date, rate_instituted_date),
                xytext = (pd.to_datetime('1847-10-01'), 10),
                arrowprops = {'arrowstyle':'simple', 'color':'k'})
ax2[1].annotate('Handwashing \n policy \n instituted',
                xy = (instituted_date, births_instituted_date),
                xytext = (pd.to_datetime('1847-4-01'), 325),
                arrowprops = {'arrowstyle':'simple', 'color':'k'})
# set legend
ax2[0].legend()
ax2[1].legend()
plt.show()
# Summary statistics of the mortality rate before and after handwashing.
pre_rate = pre_handwashing['Rate (%)']
post_rate = post_handwashing['Rate (%)']
diff_in_mean = post_rate.mean() - pre_rate.mean()
diff_in_std = post_rate.std() - pre_rate.std()
print(f'Mean of mortality rate pre-handwashing: {round(pre_rate.mean(), 2)}%')
print(f'Mean of mortality rate post-handwashing: {round(post_rate.mean(), 2)}%')
print(f'Difference in mean: {round(diff_in_mean, 2)}%')
print('\n')
# Series.std() is the sample standard deviation, not the standard error of
# the mean, so the printed labels say "Standard deviation".
print(f'Standard deviation of mortality rate pre-handwashing: {round(pre_rate.std(), 2)}%')
print(f'Standard deviation of mortality rate post-handwashing: {round(post_rate.std(), 2)}%')
print(f'Difference in standard deviation: {round(diff_in_std, 2)}%')
# Output recorded from a previous run:
# Mean of mortality rate pre-handwashing: 10.55%
# Mean of mortality rate post-handwashing: 2.11%
# Difference in mean: -8.44%
# Standard deviation of mortality rate pre-handwashing: 7.18%
# Standard deviation of mortality rate post-handwashing: 1.52%
# Difference in standard deviation: -5.66%
def func_diff_bootstrap_replicate(pre_series, post_series, func, random_state=None):
    """Generate one bootstrap replicate of ``func(post) - func(pre)``.

    Each Series is resampled with replacement to its original length, then
    ``func`` is applied to both resamples and the difference is returned.

    Args:
        pre_series: pandas Series holding the "before" observations.
        post_series: pandas Series holding the "after" observations.
        func: callable that reduces a Series to a scalar, e.g. ``np.mean``.
        random_state: optional seed or ``numpy.random.Generator`` that makes
            the replicate reproducible. With the default ``None`` the
            resampling uses pandas' default random source.

    Returns:
        The scalar ``func(post_resample) - func(pre_resample)``.
    """
    # One generator shared by both draws: a given seed reproduces the pair of
    # resamples without making the two draws use identical random streams.
    rng = None if random_state is None else np.random.default_rng(random_state)
    pre_replicate = pre_series.sample(frac=1, replace=True, random_state=rng)
    post_replicate = post_series.sample(frac=1, replace=True, random_state=rng)
    return func(post_replicate) - func(pre_replicate)
def conf_int(lst, conf=90, result='series'):
    """Compute a two-sided percentile confidence interval.

    Args:
        lst: sequence of numeric values (e.g. bootstrap replicates).
        conf: confidence level in percent, strictly between 0 and 100.
            The interval spans the ``(100 - conf) / 2`` and
            ``100 - (100 - conf) / 2`` percentiles.
        result: ``'series'`` to get a pandas Series indexed by the two
            quantile fractions, ``'array'`` to get a numpy array.

    Returns:
        The lower and upper bounds, as a pandas Series or a numpy array.

    Raises:
        ValueError: if ``conf`` is outside (0, 100) or ``result`` is not
            ``'series'`` or ``'array'``.
    """
    if not 0 < conf < 100:
        raise ValueError(f'conf must be between 0 and 100 (exclusive), got {conf!r}')
    if result not in ('series', 'array'):
        raise ValueError(f"result must be 'series' or 'array', got {result!r}")

    # Percentage of the distribution cut off on each side of the interval.
    tail = (100 - conf) / 2
    lower, upper = tail, 100 - tail

    if result == 'series':
        # Series.quantile expects fractions in [0, 1] rather than percentages.
        return pd.Series(lst).quantile([lower / 100, upper / 100])
    return np.percentile(lst, [lower, upper])
# 2000 bootstrap replicates of the difference in mean mortality rate
# (post-handwashing minus pre-handwashing), then its 90% interval.
bs_replicates = [
    func_diff_bootstrap_replicate(pre_rate, post_rate, np.mean)
    for _ in range(2000)
]
conf_interval = conf_int(bs_replicates, conf=90, result='series')
conf_interval

# Same procedure for the difference in the spread of the mortality rate,
# measured with np.std.
bs_replicates_2 = [
    func_diff_bootstrap_replicate(pre_rate, post_rate, np.std)
    for _ in range(2000)
]
conf_interval = conf_int(bs_replicates_2, conf=90, result='series')
conf_interval
# Split the post-handwashing period at the point where the strict controls
# are taken to be in effect: 31 days after strict_control (set above from
# 'January 1848', i.e. 1 January 1848), which gives 1 February 1848.
effective_control_date = strict_control + timedelta(days=31)
# NOTE(review): both slices use effective_control_date as a label bound; if
# label slicing includes the bound on each side, the February 1848 row is in
# pre_control AND post_control - confirm this is intended.
pre_control = post_handwashing[:effective_control_date]
post_control = post_handwashing[effective_control_date:]
# Two stacked panels: mortality rate (top) and births (bottom), each drawn as
# a red "before" segment and a green "after" segment.
fig, ax3 = plt.subplots(2, 1, figsize=(15, 8))
ax3[0].plot(pre_control.index, pre_control['Rate (%)'], color = 'red', label='Before strict control')
ax3[0].plot(post_control.index, post_control['Rate (%)'], color = 'green', label='After strict control')
ax3[1].plot(pre_control.index, pre_control['Births'], color = 'red', label='Before strict control')
ax3[1].plot(post_control.index, post_control['Births'], color = 'green', label='After strict control')
# set labels
ax3[0].set_title('Mortality rate & Births in 1847-1849 (post-handwashing) at Vienna General Hospital')
ax3[0].set_ylabel('Proportion of death per birth (%)')
ax3[1].set_ylabel('Births')
ax3[1].set_xlabel('Months')
# Annotate both panels; each arrow points at the January 1848 data point
# (values obtained earlier through get_rate_and_births('January 1848')).
ax3[0].annotate('Strict controls \n on negligent \n students',
xy = (strict_control, rate_strict_control),
xytext = (pd.to_datetime('1848-03-01'), 3),
arrowprops = {'arrowstyle':'simple', 'color':'k'})
ax3[1].annotate('Strict controls \n on negligent \n students',
xy = (strict_control, births_strict_control),
xytext = (pd.to_datetime('1847-12-01'), 320),
arrowprops = {'arrowstyle':'simple', 'color':'k'})
ax3[0].legend()
ax3[1].legend()
plt.show()
# List the post-handwashing months whose recorded mortality rate is exactly 0.
zero_rate = post_handwashing[post_handwashing['Rate (%)'] == 0]
print(zero_rate)
# Output of print(zero_rate):
# Year Month Births Deaths Rate (%)
# dates
# 1848-03-01 March 1848 276 0 0.0
# 1848-08-01 August 1848 261 0 0.0