import requests
from bs4 import BeautifulSoup
# Wikipedia article that the mortality table is scraped from
url = 'https://en.wikipedia.org/wiki/Historical_mortality_rates_of_puerperal_fever'
# create get request; the timeout keeps the script from hanging on a stalled connection
page = requests.get(url, timeout=30)
# stop on an HTTP error instead of parsing an error page
page.raise_for_status()
# create soup object
soup = BeautifulSoup(page.text, 'lxml')
soup.title
# find the first table whose class attribute is 'wikitable sortable'
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # soup.find returns None when nothing matches; fail with a readable message
    raise ValueError(f"no 'wikitable sortable' table found at {url}")
# getting table headers
table_header = table.find_all('th')
# only interested in the first 5 columns
table_header_names = [th.text.strip() for th in table_header][:5]
table_header_names
# getting table contents
# generates a list of <tr> </tr>
table_content = table.find_all('tr')
# exclude the header row
table_content = table_content[1:]
# initialize dictionary keyed by header name; the values are filled in later
table_dict = {name: '' for name in table_header_names}
print(table_dict)
# initialize empty 'columns' lists
year = []
month = []
births = []
deaths = []
rate = []
def appending_td_to_list(td, lst, index):
    """Append the first text node of one table cell to a column list.

    Args:
        td: sequence of cell elements for one row; each element must offer
            a ``find`` method (the caller passes the result of
            ``tr.find_all('td')``).
        lst: list that receives the extracted value; mutated in place.
        index: position in ``td`` of the cell to read.

    Returns:
        None. The result is delivered by mutating ``lst``.

    Raises:
        IndexError: if ``td`` has no element at ``index``.
    """
    # `string=` is the current spelling of Beautiful Soup's old `text=`
    # argument; True matches the first text node inside the cell.
    lst.append(td[index].find(string=True))
for tr in table_content:
    # generates a list of <td> </td>
    td = tr.find_all('td')
    # A row with fewer than five <td> cells cannot supply every column.
    # Reading it would raise IndexError part-way through and leave the
    # column lists with unequal lengths, so skip it.
    if len(td) < 5:
        continue
    appending_td_to_list(td, year, 0)
    appending_td_to_list(td, month, 1)
    appending_td_to_list(td, births, 2)
    appending_td_to_list(td, deaths, 3)
    appending_td_to_list(td, rate, 4)
# pass in columns into dictionary
table_dict['Year'] = year
table_dict['Month'] = month
table_dict['Births'] = births
table_dict['Deaths'] = deaths
table_dict['Rate (%)'] = rate
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None # default='warn'
# from wiki table to DataFrame
wiki_table = pd.DataFrame(table_dict)
# inspect 5 random rows
wiki_table.sample(5)
# removing '\n' in all columns using str.replace()
for name in table_header_names:
    wiki_table[name] = wiki_table[name].str.replace('\n', '')
wiki_table.sample(5)
# isnull() finds nothing to count here when missing values are stored as the string 'na'
print(wiki_table.isnull().sum())
# rows whose Births value starts with 'na' (a NaN that doesn't look like a NaN)
nan_but_not_nan = wiki_table[wiki_table.Births.str.match('na')]
# replace 'na' with NaN directly on wiki_table.
# The earlier approach called replace(..., inplace=True) on a .loc slice of a
# filtered copy, which is not guaranteed to modify anything, and then merged
# the copy back with concat + drop_duplicates.
lst_name = ['Births', 'Deaths', 'Rate (%)']
wiki_table[lst_name] = wiki_table[lst_name].replace('na', np.nan)
# refresh the inspection frame so it shows the cleaned rows
nan_but_not_nan = wiki_table.loc[nan_but_not_nan.index]
wiki_table.tail()
# running this returns ValueError: time data 'Mar 1843' does not match format '%B %Y' (match)
# pd.to_datetime(wiki_table.Month, format='%B %Y')
# finding the culprit and fixing it
print(wiki_table[wiki_table.Month.str.match('Mar')])
# replace 'Mar 1843' with 'March 1843' to allow us to parse into datetime
wiki_table['Month'] = wiki_table.Month.replace('Mar 1843', 'March 1843', regex=False)
print('\n')
print(wiki_table[wiki_table.Month.str.match('Mar')])
wiki_table['dates'] = pd.to_datetime(wiki_table.Month, format='%B %Y')
wiki_table.dtypes
# set dates as index
wiki_table.set_index('dates', inplace=True)
wiki_table.sort_index(inplace=True)
wiki_table.head()
# fill NaN with the previous row's value; DataFrame.ffill replaces the
# deprecated fillna(method='ffill') spelling
wiki_table.ffill(inplace=True)
# verify that NaN is indeed filled
wiki_table[wiki_table.Month == 'December 1841']
# inspect
wiki_table.dtypes
# convert 'Births', 'Deaths' to int type
to_number_cols = ['Births', 'Deaths']
wiki_table[to_number_cols] = wiki_table[to_number_cols].apply(pd.to_numeric, errors='coerce', downcast='signed', axis=1)
wiki_table.dtypes
# convert 'Rate (%)' to float type
# NOTE(review): errors='ignore' leaves the column unconverted if any value is not numeric
wiki_table['Rate (%)'] = wiki_table['Rate (%)'].astype('float', errors='ignore')
wiki_table['Rate (%)'].dtype
import matplotlib.pyplot as plt
# useful function to plot a timeseries graph
def timeseries_plot(ax, x, y, xlabel = 'some X label', ylabel = 'some Y label', color = 'black', linestyle = 'solid', alpha = 1, color_ticks = True):
    """Draw one line on ``ax`` and label its axes.

    Args:
        ax: axes-like object providing ``plot``, ``set_xlabel``,
            ``set_ylabel`` and ``tick_params``.
        x: x values of the line.
        y: y values of the line.
        xlabel: text for the x-axis label.
        ylabel: text for the y-axis label.
        color: colour of the line and, when ``color_ticks`` is set, of the
            y label and y ticks.
        linestyle: line style forwarded to ``ax.plot``.
        alpha: opacity forwarded to ``ax.plot``.
        color_ticks: when truthy, draw the y label and y ticks in ``color``.

    Returns:
        None.
    """
    # Pass the colour by keyword. As a third positional argument it is
    # parsed as a format string, which only works for string colours and
    # breaks for e.g. an RGB tuple.
    ax.plot(x, y, color = color, linestyle = linestyle, alpha = alpha)
    ax.set_xlabel(xlabel)
    if color_ticks:
        ax.set_ylabel(ylabel, color = color)
        ax.tick_params('y', colors = color)
    else:
        ax.set_ylabel(ylabel)
# one figure with two y-axes sharing the date axis
fig, ax = plt.subplots(figsize=(11.7, 8.27))
# left axis: mortality rate over time
timeseries_plot(
    ax,
    wiki_table.index,
    wiki_table['Rate (%)'],
    xlabel='Year',
    ylabel='Mortality rate (%)',
    color='blue',
)
# right axis (twin of the left one): births over time, dashed and semi-transparent
ax2 = ax.twinx()
timeseries_plot(
    ax2,
    wiki_table.index,
    wiki_table.Births,
    xlabel='Year',
    ylabel='Births',
    color='red',
    linestyle='--',
    alpha=0.5,
)
ax.set_title('Mortality rate 1841-1849 at Vienna General Hospital')
plt.show()
# look up the row(s) for May 1847 through the date index
wiki_table.loc["May 1847"]
from datetime import datetime, timedelta
# key dates, each parsed from a 'Month Year' label into a datetime.date
# (strptime defaults the day to the 1st of the month)
instituted_date, effective_impact_date, strict_control = (
    datetime.strptime(label, '%B %Y').date()
    for label in ('May 1847', 'June 1847', 'January 1848')
)
# row(s) for June 1847
start_handwashing = wiki_table.loc['June 1847']
# split the table at June 1847
# NOTE(review): label slices include their end points, so the June 1847 row
# appears to land in both halves - confirm the overlap is intended
pre_handwashing = wiki_table.loc[:'June 1847']
post_handwashing = wiki_table.loc['June 1847':]
def get_rate_and_births(month):
    """Return the mortality rate and birth count recorded for one month.

    Reads the module-level ``wiki_table`` and uses the first row whose
    ``Month`` column equals ``month``.

    Args:
        month: month label exactly as stored in the ``Month`` column,
            e.g. ``'May 1847'``.

    Returns:
        Tuple ``(rate, births)`` taken from the ``'Rate (%)'`` and
        ``'Births'`` columns of the matching row.

    Raises:
        IndexError: if no row matches ``month``.
    """
    row = wiki_table[wiki_table.Month == month]
    # Select by column label rather than by position in row.values, so the
    # result stays correct if columns are added or reordered.
    rate = row['Rate (%)'].iloc[0]
    births = row['Births'].iloc[0]
    return rate, births
# get Rate (%) and Births on instituted_date: May 1847
rate_instituted_date, births_instituted_date = get_rate_and_births('May 1847')
# get Rate (%) and Births on strict_control: January 1848
rate_strict_control, births_strict_control = get_rate_and_births('January 1848')
# Newer Matplotlib releases ship the seaborn style sheets under the name
# 'seaborn-v0_8'; use that name when available and fall back to the legacy
# 'seaborn' name otherwise.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
fig, ax2 = plt.subplots(2, 1, figsize=(15, 8))
# plot Rate (%) before and after handwashing
ax2[0].plot(pre_handwashing.index, pre_handwashing['Rate (%)'], label = 'Before handwashing', color = 'red')
ax2[0].plot(post_handwashing.index, post_handwashing['Rate (%)'], label = 'After handwashing', color = 'green')
# plot Births before and after handwashing
ax2[1].plot(pre_handwashing.index, pre_handwashing.Births, label = 'Before handwashing', color = 'red')
ax2[1].plot(post_handwashing.index, post_handwashing.Births, label = 'After handwashing', color = 'green')
# set labels
ax2[0].set_ylabel('Proportion of deaths per births (%)')
ax2[1].set_ylabel('Births')
ax2[1].set_xlabel('Year')
# set title
ax2[0].set_title('Mortality rate & Births in 1841-1849 at Vienna General Hospital')
# annotate both plots: the arrow points at the May 1847 value, the text sits at xytext
ax2[0].annotate('Handwashing \n policy \n instituted',
                xy = (instituted_date, rate_instituted_date),
                xytext = (pd.to_datetime('1847-10-01'), 10),
                arrowprops = {'arrowstyle':'simple', 'color':'k'})
ax2[1].annotate('Handwashing \n policy \n instituted',
                xy = (instituted_date, births_instituted_date),
                xytext = (pd.to_datetime('1847-4-01'), 325),
                arrowprops = {'arrowstyle':'simple', 'color':'k'})
# set legend
ax2[0].legend()
ax2[1].legend()
plt.show()
# monthly mortality rates before and after handwashing
pre_rate = pre_handwashing['Rate (%)']
post_rate = post_handwashing['Rate (%)']
# post-minus-pre differences, so a negative value means a drop after handwashing
diff_in_mean = post_rate.mean() - pre_rate.mean()
diff_in_std = post_rate.std() - pre_rate.std()
print(f'Mean of mortality rate pre-handwashing: {round(pre_rate.mean(), 2)}%')
print(f'Mean of mortality rate post-handwashing: {round(post_rate.mean(), 2)}%')
print(f'Difference in mean: {round(diff_in_mean, 2)}%')
print('\n')
# Series.std() is the standard deviation, not the standard error of the
# mean, so the labels say "Standard deviation"
print(f'Standard deviation of mortality rate pre-handwashing: {round(pre_rate.std(), 2)}%')
print(f'Standard deviation of mortality rate post-handwashing: {round(post_rate.std(), 2)}%')
print(f'Difference in standard deviation: {round(diff_in_std, 2)}%')
def func_diff_bootstrap_replicate(pre_series, post_series, func, random_state=None):
    """Generate one bootstrap replicate of ``func(post) - func(pre)``.

    Each Series is resampled with replacement to its own length, ``func`` is
    applied to both resamples, and the post-minus-pre difference is returned.

    Args:
        pre_series: pandas Series of "before" observations.
        post_series: pandas Series of "after" observations.
        func: callable reducing a Series to a number (e.g. ``np.mean``).
        random_state: optional seed, ``np.random.RandomState`` or
            ``np.random.Generator`` for reproducible replicates. With the
            default ``None`` the resampling is left unseeded, as before.

    Returns:
        ``func(post_resample) - func(pre_resample)``.
    """
    if random_state is None or isinstance(
        random_state, (np.random.RandomState, np.random.Generator)
    ):
        rng = random_state
    else:
        # Build a single generator from the seed and share it between both
        # draws. Passing the same raw seed to each sample() call would make
        # the two resamples use identical random streams.
        rng = np.random.RandomState(random_state)
    pre_replicate = pre_series.sample(frac=1, replace=True, random_state=rng)
    post_replicate = post_series.sample(frac=1, replace=True, random_state=rng)
    return func(post_replicate) - func(pre_replicate)
def conf_int(lst, conf=90, result='series'):
    """Compute a two-sided percentile confidence interval.

    The interval leaves ``(100 - conf) / 2`` percent of the values in each
    tail, so ``conf=90`` yields the 5th and 95th percentiles.

    Args:
        lst: sequence of numeric values (e.g. bootstrap replicates).
        conf: confidence level in percent; any number with
            ``0 < conf <= 100``.
        result: ``'series'`` to get a pandas Series indexed by the quantile
            fractions, ``'array'`` to get a numpy array.

    Returns:
        The lower and upper bound, as a pandas Series or a numpy array.

    Raises:
        ValueError: if ``conf`` is outside ``(0, 100]`` or ``result`` is not
            ``'series'`` or ``'array'``. Earlier versions returned ``None``
            for such arguments.
    """
    if not 0 < conf <= 100:
        raise ValueError(f'conf must be in the range (0, 100], got {conf!r}')
    if result not in ('series', 'array'):
        raise ValueError(f"result must be 'series' or 'array', got {result!r}")
    # percentage of values cut off in each tail
    lower_pct = (100 - conf) / 2
    upper_pct = 100 - lower_pct
    if result == 'series':
        # Series.quantile expects fractions in [0, 1]
        return pd.Series(lst).quantile([lower_pct / 100, upper_pct / 100])
    # np.percentile expects percentages in [0, 100]
    return np.percentile(lst, [lower_pct, upper_pct])
# 2000 bootstrap replicates of the post-minus-pre difference in mean rate
bs_replicates = [
    func_diff_bootstrap_replicate(pre_rate, post_rate, np.mean)
    for _ in range(2000)
]
# 90% percentile interval of those replicates
conf_interval = conf_int(bs_replicates, conf=90, result='series')
conf_interval
# same procedure with np.std as the statistic
bs_replicates_2 = [
    func_diff_bootstrap_replicate(pre_rate, post_rate, np.std)
    for _ in range(2000)
]
conf_interval = conf_int(bs_replicates_2, conf=90, result='series')
conf_interval
# split the post-handwashing data 31 days after the strict_control date
effective_control_date = strict_control + timedelta(days=31)
# NOTE(review): both slices are bounded by the same date, so a row dated
# exactly effective_control_date would appear in both - confirm intended
pre_control = post_handwashing[:effective_control_date]
post_control = post_handwashing[effective_control_date:]
fig, ax3 = plt.subplots(2, 1, figsize=(15, 8))
# top panel: mortality rate, bottom panel: births; red = before, green = after
for panel, column_name in zip(ax3, ('Rate (%)', 'Births')):
    panel.plot(pre_control.index, pre_control[column_name],
               color='red', label='Before strict control')
    panel.plot(post_control.index, post_control[column_name],
               color='green', label='After strict control')
# set labels
ax3[0].set_title('Mortality rate & Births in 1847-1849 (post-handwashing) at Vienna General Hospital')
ax3[0].set_ylabel('Proportion of death per birth (%)')
ax3[1].set_ylabel('Births')
ax3[1].set_xlabel('Months')
# annotate both panels: the arrow points at the January 1848 value
ax3[0].annotate(
    'Strict controls \n on negligent \n students',
    xy=(strict_control, rate_strict_control),
    xytext=(pd.to_datetime('1848-03-01'), 3),
    arrowprops={'arrowstyle': 'simple', 'color': 'k'},
)
ax3[1].annotate(
    'Strict controls \n on negligent \n students',
    xy=(strict_control, births_strict_control),
    xytext=(pd.to_datetime('1847-12-01'), 320),
    arrowprops={'arrowstyle': 'simple', 'color': 'k'},
)
ax3[0].legend()
ax3[1].legend()
plt.show()
# post-handwashing months with a recorded mortality rate of exactly zero
zero_rate = post_handwashing[post_handwashing['Rate (%)'] == 0]
print(zero_rate)