#!pip install statsmodels==0.13.0
#!pip install nb_black==1.0.7
# Data Management
from dateutil import relativedelta as rd
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
pyo.init_notebook_mode()
import seaborn as sns
# Regression
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.graphics.api as smg
# auto code formatting
%load_ext nb_black
# Read and rename column country
cty_info = pd.read_csv('country/covid19countryinfo.csv').rename(columns={'country':'Country'})
# Filter observations with aggregate country-level information
# The column region for region-level observations is populated
cty_info = cty_info[cty_info.region.isnull()]
# Convert string data type to floating data type
# Remove comma from the fields
cty_info['healthexp'] = cty_info[~cty_info['healthexp'].isnull()]['healthexp'].str.replace(',','').astype('float')
cty_info['gdp2019'] = cty_info[~cty_info['gdp2019'].isnull()]['gdp2019'].str.replace(',','').astype('float')
# Convert to date objects with to_datetime method
gov_actions = ['quarantine', 'schools', 'gathering', 'nonessential', 'publicplace']
for gov_action in gov_actions:
cty_info[gov_action] = pd.to_datetime(cty_info[gov_action], format = '%m/%d/%Y')
# Filter columns of interest
# Note: feel free to explore other variables or datasets
cty_info = cty_info[['Country', 'avghumidity', 'avgtemp', 'fertility', 'medianage', 'urbanpop', 'quarantine', 'schools', \
'publicplace', 'gatheringlimit', 'gathering', 'nonessential', 'hospibed', 'smokers', \
'sex0', 'sex14', 'sex25', 'sex54', 'sex64', 'sex65plus', 'sexratio', 'lung', 'femalelung', \
'malelung', 'gdp2019', 'healthexp', 'healthperpop']]
# cty_info.describe()
cty_info.info()
#cty_info.head(20)
# Worldometer data
# ================
worldometer_data = pd.read_csv("corona-virus-report/worldometer_data.csv")
# Replace missing values '' with NAN and then 0
worldometer_data = worldometer_data.replace("", np.nan).fillna(0)
# Transform variables and round them up to the two decimal points
# Note that there are instances of division by zero issue when there are either zero total tests or total cases
worldometer_data["Case Positivity"] = round(
worldometer_data["TotalCases"] / worldometer_data["TotalTests"], 2
)
worldometer_data["Case Fatality"] = round(
worldometer_data["TotalDeaths"] / worldometer_data["TotalCases"], 2
)
# Resolve the division by zero issue by replacing infinity value with zero
worldometer_data[worldometer_data["Case Positivity"] == np.inf] = 0
worldometer_data[worldometer_data["Case Fatality"] == np.inf] = 0
# Place case positivity into three bins
worldometer_data["Case Positivity Bin"] = pd.qcut(
worldometer_data["Case Positivity"], q=3, labels=["low", "medium", "high"]
)
# Population Structure
worldometer_pop_struc = pd.read_csv("corona-virus-report/population_structure_by_age_per_contry.csv")
# Replace missing values with zeros
worldometer_pop_struc = worldometer_pop_struc.fillna(0)
# Merge datasets by common key country
worldometer_data = worldometer_data.merge(
worldometer_pop_struc, how="inner", left_on="Country/Region", right_on="Country"
)
worldometer_data = worldometer_data[worldometer_data["Country/Region"] != 0]
# Country information
worldometer_data = worldometer_data.merge(cty_info, how="left", on="Country")
# Replace space in variable names with '_'
worldometer_data.columns = worldometer_data.columns.str.replace(" ", "_")
worldometer_data.describe()
worldometer_data.info()
# Full data
# =========
full_table = pd.read_csv("corona-virus-report/full_grouped.csv")
full_table["Date"] = pd.to_datetime(full_table["Date"])
# Examine DataFrame (object type, shape, columns, dtypes)
full_table.info()
# type(full_table)
# full_table.shape
# full_table.columns
# full_table.dtypes
# Deep dive into the DataFrame
full_table.head()
# Grouped by day, country
# =======================
full_grouped = pd.read_csv("corona-virus-report/full_grouped.csv")
full_grouped["Date"] = pd.to_datetime(full_grouped["Date"])
# full_grouped.loc[full_grouped['Country/Region'] == 'US', 'Country/Region'] = 'USA'
full_grouped.head()
# Correct country names in worldometer to make them consistent with dataframe full_grouped column Country/Region before merging
worldometer_data["Country/Region"].replace(
{
"USA": "US",
"UAE": "United Arab Emirates",
"S. Korea": "South Korea",
"UK": "United Kingdom",
},
inplace=True,
)
# Draw population and country-level data
full_grouped = full_grouped.merge(
worldometer_data[["Country/Region", "Population"]], how="left", on="Country/Region"
)
full_grouped = full_grouped.merge(
cty_info, how="left", left_on="Country/Region", right_on="Country"
)
full_grouped["Confirmed per 1000"] = (
full_grouped["Confirmed"] / full_grouped["Population"] * 1000
)
# Backfill data
full_grouped = full_grouped.fillna(method="ffill")
# Create post-invention indicators
gov_actions = ["quarantine", "schools", "gathering", "nonessential", "publicplace"]
for gov_action in gov_actions:
full_grouped["post_" + gov_action] = (
full_grouped["Date"] >= full_grouped[gov_action]
)
full_grouped["day_rel_to_" + gov_action] = (
full_grouped["Date"] - full_grouped[gov_action]
).dt.days
# Create percent changes in covid19 outcomes
covid_outcomes = ["Confirmed", "Deaths", "Recovered", "Active", "Confirmed per 1000"]
for covid_outcome in covid_outcomes:
full_grouped["pct_change_" + covid_outcome] = full_grouped.groupby(
["Country/Region"]
)[covid_outcome].pct_change()
full_grouped[full_grouped["pct_change_" + covid_outcome] == np.inf] = 0
# Replace space in variable names with '_'
full_grouped.columns = full_grouped.columns.str.replace(" ", "_")
full_grouped.info()
# full_grouped.tail(20)
# print(full_grouped.iloc[0,0])
# full_grouped[full_grouped['quarantine'] != None]['Country/Region'].unique()
# full_grouped[full_grouped['Country/Region'] == 'Germany'][['quarantine','day_rel_to_quarantine']]
# list(full_grouped.columns.values)
# full_grouped.describe()
# Visualize the missingness isue in the dataset
sns.heatmap(cty_info.isnull(), cbar=False)
# Create a function to plot (reusing from the previous BootCamp)
def gt_n(n):
countries = full_grouped[full_grouped["Confirmed"] > n]["Country/Region"].unique()
temp = full_table[full_table["Country/Region"].isin(countries)]
temp = temp.groupby(["Country/Region", "Date"])["Confirmed"].sum().reset_index()
temp = temp[temp["Confirmed"] > n]
temp["Log Confirmed"] = np.log(1 + temp["Confirmed"])
# print(temp.head())
min_date = temp.groupby("Country/Region")["Date"].min().reset_index()
min_date.columns = ["Country/Region", "Min Date"]
# print(min_date.head())
from_nth_case = pd.merge(temp, min_date, on="Country/Region")
from_nth_case["Date"] = pd.to_datetime(from_nth_case["Date"])
from_nth_case["Min Date"] = pd.to_datetime(from_nth_case["Min Date"])
from_nth_case["N days"] = (
from_nth_case["Date"] - from_nth_case["Min Date"]
).dt.days
# print(from_nth_case.head())
fig = px.line(
from_nth_case,
x="N days",
y="Confirmed",
color="Country/Region",
title="N days from " + str(n) + " case",
height=600,
)
fig.show()
fig = px.line(
from_nth_case,
x="N days",
y="Log Confirmed",
color="Country/Region",
title="N days from " + str(n) + " case",
height=600,
)
fig.show()
def graph_cty_exceeding_cases(n, full_grouped_df, full_table_df):
"""
Function graphs the countries with more than n confirmed cases
Parameters:
n (int): The threshold minimum number of cases
fully_grouped_df (pandas.DataFrame): the data source used. From Kaggle's full_grouped data set
full_data_df (pandas.DataFrame): the data source used. From Kaggle's covid_19_clean_complete data set
Returns:
null
"""
# Data Preprocessing
countries = full_grouped_df[full_grouped_df["Confirmed"] > n][
"Country/Region"
].unique()
temp = full_table_df[full_table_df["Country/Region"].isin(countries)]
temp = temp.groupby(["Country/Region", "Date"])["Confirmed"].sum().reset_index()
temp = temp[temp["Confirmed"] > n]
temp["Log Confirmed"] = np.log(1 + temp["Confirmed"])
# print(temp.head())
min_date = temp.groupby("Country/Region")["Date"].min().reset_index()
min_date.columns = ["Country/Region", "Min Date"]
# print(min_date.head())
from_nth_case = pd.merge(temp, min_date, on="Country/Region")
from_nth_case["Date"] = pd.to_datetime(from_nth_case["Date"])
from_nth_case["Min Date"] = pd.to_datetime(from_nth_case["Min Date"])
from_nth_case["N days"] = (
from_nth_case["Date"] - from_nth_case["Min Date"]
).dt.days
# print(from_nth_case.head())
fig = px.line(
from_nth_case,
x="N days",
y="Confirmed",
color="Country/Region",
title="N days from " + str(n) + " case",
height=600,
)
fig.show()
fig = px.line(
from_nth_case,
x="N days",
y="Log Confirmed",
color="Country/Region",
title="N days from " + str(n) + " case",
height=600,
)
fig.show()
full_grouped.head()
Stringency_index = pd.read_csv("covid-stringency-index.csv")
Stringency_index["Date"] = pd.to_datetime(Stringency_index["Day"])
df = Stringency_index.sort_values(by=['Date','Entity'])
df.reset_index(inplace=True)
df["Entity"].replace({"United States": "US",},inplace=True)
df["Date"]=df["Date"].astype(object)
df.head()
n_full_grouped = full_grouped.merge(
df[["Date", "Entity", "stringency_index"]], how="inner", left_on=["Date", 'Country/Region'], right_on=["Date", "Entity"])
n_full_grouped.tail()
def plot_gov_action(covid_outcome, gov_action, full_grouped_df):
"""
Function plots the government action and outcome
Parameters:
covid_outcome (str): The outcome from covid
gov_action (str): The government action to be analysed
full_grouped_df (pandas.DataFrame): the data source used. From Kaggle's full_grouped data set
Returns:
null
"""
fig = px.scatter(
full_grouped_df[full_grouped_df[gov_action] != None],
x="day_rel_to_" + gov_action,
y=covid_outcome,
color="Country/Region",
title="N days from " + gov_action,
height=600,
)
fig.update_layout(yaxis=dict(range=[0, 10]))
fig.show()
gov_actions = ['quarantine', 'schools', 'gathering', 'nonessential', 'publicplace']
for i in gov_actions:
plot_gov_action('pct_change_Confirmed_per_1000', i, n_full_grouped)
fig = px.scatter(
n_full_grouped[n_full_grouped['stringency_index'] != None],
x=n_full_grouped['stringency_index'],
y='pct_change_Confirmed_per_1000',
color="Country/Region",
title='stringency_index',
height=600,
)
fig.update_layout(yaxis=dict(range=[0, 10]))
fig.show()
n_full_grouped['Confirmed_per_1000'].describe()
n_full_grouped['log_Confirmed_per_1000'] = np.log(n_full_grouped['Confirmed_per_1000']+1)
n_full_grouped['log_Confirmed_per_1000'].describe()
#Plot pairplot with countries organized by WHO regions
g = sns.pairplot(n_full_grouped[['log_Confirmed_per_1000', 'avghumidity', 'avgtemp', 'urbanpop', 'WHO_Region']], hue='WHO_Region')
#plt.matshow(full_grouped.corr())
#plt.show()
f, ax = plt.subplots(figsize=(10, 8))
corr = n_full_grouped.corr()
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Plot heatmap
sns.heatmap(corr, mask=mask, cmap=cmap, square=True, ax=ax)
n_full_grouped.columns
# 'post_schools', 'post_gathering', 'post_nonessential', 'post_publicplace',
# Create interaction term
n_full_grouped['quarXurbanpop'] = n_full_grouped['post_quarantine'] * n_full_grouped['urbanpop']
# OLS regression
y = n_full_grouped['log_Confirmed_per_1000']
X = n_full_grouped[['post_quarantine', 'avghumidity', 'avgtemp', 'urbanpop', 'quarXurbanpop','stringency_index']]
X = sm.add_constant(X)
ols_model=sm.OLS(y,X.astype(float), missing='drop')
result=ols_model.fit()
print(result.summary2())