Task
# Import modules for API calls
import requests
import io
import pandas as pd
import json
from datetime import datetime
# Import module for plotting
import seaborn as sns
## OWID COVID-19 Data (Taken From: https://github.com/owid/covid-19-data/tree/master/public/data)
url = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv'
download = requests.get(url).content
covid = pd.read_csv(io.StringIO(download.decode('utf-8')), parse_dates=['date'])
covid.tail()
!pip install statsmodels
# Data Management
from dateutil import relativedelta as rd
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
pyo.init_notebook_mode()
import seaborn as sns
# Regression
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.graphics.api as smg
covid.columns.values
covid.tail()
full_grouped = covid.copy()
# Forward-fill missing values
full_grouped = full_grouped.ffill()
# Create percent changes in covid19 outcomes
#covid_outcomes = ["confirmed", "deaths", "recovered", "active", "confirmed per 1000",]
covid_outcomes = ["new_cases", "new_deaths", "new_cases_per_million","stringency_index"]
for covid_outcome in covid_outcomes:
    full_grouped["pct_change_" + covid_outcome] = full_grouped.groupby(
        ["location"]
    )[covid_outcome].pct_change()
    # Replace infinite percent changes (division by zero) with zero in the affected column only
    full_grouped.loc[
        full_grouped["pct_change_" + covid_outcome] == np.inf,
        "pct_change_" + covid_outcome,
    ] = 0
# Replace space in variable names with '_'
full_grouped.columns = full_grouped.columns.str.replace(" ", "_")
full_grouped.tail()
full_grouped.info()
# Read and rename column country
# cty_info = pd.read_csv('../input/countryinfo/covid19countryinfo.csv').rename(columns={'country':'Country'})
# cty_info = pd.read_csv(local_path + 'covid19countryinfo.csv').rename(columns={'country':'Country'})
cty_info = pd.read_csv('/work/countryinfo/covid19countryinfo.csv').rename(columns={'country':'Country'})
# Keep only aggregate country-level observations:
# the region column is populated only for region-level rows
cty_info = cty_info[cty_info.region.isnull()]
# Strip thousands-separator commas and convert the string fields to floats
cty_info['healthexp'] = cty_info[~cty_info['healthexp'].isnull()]['healthexp'].str.replace(',', '').astype('float')
cty_info['gdp2019'] = cty_info[~cty_info['gdp2019'].isnull()]['gdp2019'].str.replace(',', '').astype('float')
# Convert to date objects with to_datetime method
gov_actions = ['quarantine', 'schools', 'gathering', 'nonessential', 'publicplace']
for gov_action in gov_actions:
cty_info[gov_action] = pd.to_datetime(cty_info[gov_action], format = '%m/%d/%Y')
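Intervention dates are sparsely recorded; a quick coverage check (a sketch using the gov_actions list above):
# Count countries with each intervention date recorded (sketch)
print(cty_info[gov_actions].notnull().sum())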
# Filter columns of interest
# Note: feel free to explore other variables or datasets
cty_info = cty_info[['Country', 'avghumidity', 'avgtemp', 'fertility', 'medianage', 'urbanpop', 'quarantine', 'schools', \
'publicplace', 'gatheringlimit', 'gathering', 'nonessential', 'hospibed', 'smokers', \
'sex0', 'sex14', 'sex25', 'sex54', 'sex64', 'sex65plus', 'sexratio', 'lung', 'femalelung', \
'malelung', 'gdp2019', 'healthexp', 'healthperpop']]
# cty_info.describe()
cty_info.info()
# Worldometer data
# ================
# worldometer_data = pd.read_csv('../input/corona-virus-report/worldometer_data.csv')
# worldometer_data = pd.read_csv(local_path + 'worldometer_data.csv')
worldometer_data = pd.read_csv("/work/corona-virus-report/worldometer_data.csv")
# Replace empty strings with NaN, then fill missing values with 0
worldometer_data = worldometer_data.replace("", np.nan).fillna(0)
# Compute case positivity and case fatality, rounded to two decimal places
# Note: division by zero occurs when TotalTests or TotalCases is zero
worldometer_data["Case Positivity"] = round(
worldometer_data["TotalCases"] / worldometer_data["TotalTests"], 2
)
worldometer_data["Case Fatality"] = round(
worldometer_data["TotalDeaths"] / worldometer_data["TotalCases"], 2
)
# Resolve division by zero by replacing infinite values with zero in the affected columns only
worldometer_data.loc[worldometer_data["Case Positivity"] == np.inf, "Case Positivity"] = 0
worldometer_data.loc[worldometer_data["Case Fatality"] == np.inf, "Case Fatality"] = 0
# Place case positivity into three bins
worldometer_data["Case Positivity Bin"] = pd.qcut(
worldometer_data["Case Positivity"], q=3, labels=["low", "medium", "high"]
)
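Since qcut builds equal-frequency bins, each bin should hold roughly a third of the countries; a quick check (sketch):
# Tertile bins from qcut should be roughly balanced (sketch)
print(worldometer_data["Case Positivity Bin"].value_counts())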
# Population Structure
# worldometer_pop_struc = pd.read_csv('../input/covid19-worldometer-snapshots-since-april-18/population_structure_by_age_per_contry.csv')
# worldometer_pop_struc = pd.read_csv(local_path + 'population_structure_by_age_per_contry.csv')
worldometer_pop_struc = pd.read_csv("/work/COVID-19 worldometer daily snapshots/population_structure_by_age_per_contry.csv")
# Replace missing values with zeros
worldometer_pop_struc = worldometer_pop_struc.fillna(0)
# Merge datasets by common key country
worldometer_data = worldometer_data.merge(
worldometer_pop_struc, how="inner", left_on="Country/Region", right_on="Country"
)
worldometer_data = worldometer_data[worldometer_data["Country/Region"] != 0]
# Country information
worldometer_data = worldometer_data.merge(cty_info, how="left", on="Country")
# Replace space in variable names with '_'
worldometer_data.columns = worldometer_data.columns.str.replace(" ", "_")
# Full data
# =========
# full_table = pd.read_csv('../input/corona-virus-report/covid_19_clean_complete.csv')
# full_table = pd.read_csv(local_path + 'covid_19_clean_complete.csv')
full_table = pd.read_csv("/work/corona-virus-report/covid_19_clean_complete.csv")
full_table["Date"] = pd.to_datetime(full_table["Date"])
# Grouped by day, country
# =======================
# full_grouped = pd.read_csv('../input/corona-virus-report/full_grouped.csv')
# full_grouped = pd.read_csv(local_path + 'full_grouped.csv')
full_grouped = pd.read_csv("/work/corona-virus-report/full_grouped.csv")
full_grouped["Date"] = pd.to_datetime(full_grouped["Date"])
# full_grouped.loc[full_grouped['Country/Region'] == 'US', 'Country/Region'] = 'USA'
full_grouped.head()
# Correct country names in worldometer to make them consistent with dataframe full_grouped column Country/Region before merging
worldometer_data["Country/Region"].replace(
{
"USA": "US",
"UAE": "United Arab Emirates",
"S. Korea": "South Korea",
"UK": "United Kingdom",
},
inplace=True,
)
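Merging on country names is fragile, so it helps to list names in full_grouped that still have no match in worldometer_data (a sketch; further replacements may be needed):
# Country names still unmatched after the corrections (sketch)
unmatched = set(full_grouped["Country/Region"]) - set(worldometer_data["Country/Region"])
print(sorted(unmatched))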
# Merge in population and country-level data
full_grouped = full_grouped.merge(
worldometer_data[["Country/Region", "Population"]], how="left", on="Country/Region"
)
full_grouped = full_grouped.merge(
cty_info, how="left", left_on="Country/Region", right_on="Country"
)
full_grouped["Confirmed per 1000"] = (
full_grouped["Confirmed"] / full_grouped["Population"] * 1000
)
# Forward-fill missing values
full_grouped = full_grouped.ffill()
# Create post-intervention indicators
gov_actions = ["quarantine", "schools", "gathering", "nonessential", "publicplace"]
for gov_action in gov_actions:
    full_grouped["post_" + gov_action] = (
        full_grouped["Date"] >= full_grouped[gov_action]
    )
    full_grouped["day_rel_to_" + gov_action] = (
        full_grouped["Date"] - full_grouped[gov_action]
    ).dt.days
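A spot-check of the new indicators for a single country (a sketch; assumes Italy appears in the merged data with a quarantine date):
# Spot-check post-intervention indicators for one country (sketch)
cols = ["Date", "post_quarantine", "day_rel_to_quarantine"]
print(full_grouped.loc[full_grouped["Country/Region"] == "Italy", cols].tail())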
# Create percent changes in covid19 outcomes
covid_outcomes = ["Confirmed", "Deaths", "Recovered", "Active", "Confirmed per 1000"]
for covid_outcome in covid_outcomes:
    full_grouped["pct_change_" + covid_outcome] = full_grouped.groupby(
        ["Country/Region"]
    )[covid_outcome].pct_change()
    # Replace infinite percent changes (division by zero) with zero in the affected column only
    full_grouped.loc[
        full_grouped["pct_change_" + covid_outcome] == np.inf,
        "pct_change_" + covid_outcome,
    ] = 0
# Replace space in variable names with '_'
full_grouped.columns = full_grouped.columns.str.replace(" ", "_")
full_grouped.info()
# Visualize the missingness issue in the dataset
sns.heatmap(cty_info.isnull(), cbar=False)
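The heatmap shows the pattern; a numeric summary (sketch) ranks columns by their share of missing values:
# Share of missing values per column, most-missing first (sketch)
print(cty_info.isnull().mean().sort_values(ascending=False).head(10))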
Define Function
# Create a function to plot (reusing from the previous BootCamp)
def gt_n(n):
countries = full_grouped[full_grouped["Confirmed"] > n]["Country/Region"].unique()
temp = full_table[full_table["Country/Region"].isin(countries)]
temp = temp.groupby(["Country/Region", "Date"])["Confirmed"].sum().reset_index()
temp = temp[temp["Confirmed"] > n]
temp["Log Confirmed"] = np.log(1 + temp["Confirmed"])
# print(temp.head())
min_date = temp.groupby("Country/Region")["Date"].min().reset_index()
min_date.columns = ["Country/Region", "Min Date"]
# print(min_date.head())
from_nth_case = pd.merge(temp, min_date, on="Country/Region")
from_nth_case["Date"] = pd.to_datetime(from_nth_case["Date"])
from_nth_case["Min Date"] = pd.to_datetime(from_nth_case["Min Date"])
from_nth_case["N days"] = (
from_nth_case["Date"] - from_nth_case["Min Date"]
).dt.days
# print(from_nth_case.head())
fig = px.line(
from_nth_case,
x="N days",
y="Confirmed",
color="Country/Region",
title="N days from " + str(n) + " case",
height=600,
)
fig.show()
fig = px.line(
from_nth_case,
x="N days",
y="Log Confirmed",
color="Country/Region",
title="N days from " + str(n) + " case",
height=600,
)
fig.show()
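For example, to plot every country that has exceeded 100,000 confirmed cases (threshold chosen for illustration):
# Example call
gt_n(100000)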
2.3a Edited
def graph_cty_exceeding_cases(n, full_grouped_df, full_table_df):
"""
Function graphs the countries with more than n confirmed cases
Parameters:
n (int): The threshold minimum number of cases
fully_grouped_df (pandas.DataFrame): the data source used. From Kaggle's full_grouped data set
full_data_df (pandas.DataFrame): the data source used. From Kaggle's covid_19_clean_complete data set
Returns:
null
"""
# Data Preprocessing
countries = full_grouped_df[full_grouped_df["Confirmed"] > n][
"Country/Region"
].unique()
temp = full_table_df[full_table_df["Country/Region"].isin(countries)]
temp = temp.groupby(["Country/Region", "Date"])["Confirmed"].sum().reset_index()
temp = temp[temp["Confirmed"] > n]
temp["Log Confirmed"] = np.log(1 + temp["Confirmed"])
# print(temp.head())
min_date = temp.groupby("Country/Region")["Date"].min().reset_index()
min_date.columns = ["Country/Region", "Min Date"]
# print(min_date.head())
from_nth_case = pd.merge(temp, min_date, on="Country/Region")
from_nth_case["Date"] = pd.to_datetime(from_nth_case["Date"])
from_nth_case["Min Date"] = pd.to_datetime(from_nth_case["Min Date"])
from_nth_case["N days"] = (
from_nth_case["Date"] - from_nth_case["Min Date"]
).dt.days
# print(from_nth_case.head())
fig = px.line(
from_nth_case,
x="N days",
y="Confirmed",
color="Country/Region",
title="N days from " + str(n) + " case",
height=600,
)
fig.show()
fig = px.line(
from_nth_case,
x="N days",
y="Log Confirmed",
color="Country/Region",
title="N days from " + str(n) + " case",
height=600,
)
fig.show()
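Usage mirrors gt_n above, with the data frames now passed explicitly:
# Example call with explicit data sources (threshold chosen for illustration)
graph_cty_exceeding_cases(100000, full_grouped, full_table)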
Q1: Do government actions matter?
def plot_gov_action(covid_outcome, gov_action, full_grouped_df):
"""
Function plots the government action and outcome
Parameters:
covid_outcome (str): The outcome from covid
gov_action (str): The government action to be analysed
full_grouped_df (pandas.DataFrame): the data source used. From Kaggle's full_grouped data set
Returns:
null
"""
fig = px.scatter(
full_grouped_df[full_grouped_df[gov_action] != None],
x="day_rel_to_" + gov_action,
y=covid_outcome,
color="Country/Region",
title="N days from " + gov_action,
height=600,
)
fig.update_layout(yaxis=dict(range=[0, 10]))
fig.show()
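An example call (a sketch; both columns come from the feature engineering above):
# Example: percent change in confirmed cases around quarantine start (sketch)
plot_gov_action("pct_change_Confirmed_per_1000", "quarantine", full_grouped)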
# perhaps test theory with:
# gov_actions = ['publicplace', 'gatheringlimit']
plt.figure(figsize=(16,9))
sns.set_style("dark")
sns.scatterplot(x = 'pct_change_Confirmed_per_1000', y = 'publicplace', data = full_grouped, hue = "WHO_Region")
plt.show()
sns.scatterplot(x = 'pct_change_Confirmed_per_1000', y = 'gatheringlimit', data = full_grouped, hue = "WHO_Region")
plt.show()
full_grouped_1 = covid.copy()
# Forward-fill missing values
full_grouped_1 = full_grouped_1.ffill()
# Create percent changes in covid19 outcomes
#covid_outcomes = ["confirmed", "deaths", "recovered", "active", "confirmed per 1000",]
covid_outcomes = ["new_cases", "new_deaths", "new_cases_per_million","stringency_index"]
for covid_outcome in covid_outcomes:
    full_grouped_1["pct_change_" + covid_outcome] = full_grouped_1.groupby(
        ["location"]
    )[covid_outcome].pct_change()
    # Replace infinite percent changes (division by zero) with zero in the affected column only
    full_grouped_1.loc[
        full_grouped_1["pct_change_" + covid_outcome] == np.inf,
        "pct_change_" + covid_outcome,
    ] = 0
# Replace space in variable names with '_'
full_grouped_1.columns = full_grouped_1.columns.str.replace(" ", "_")
full_grouped_1.info()
def plot_gov_action(covid_outcome, gov_action, full_grouped_df):
"""
Function plots the government action and outcome
Parameters:
covid_outcome (str): The outcome from covid
gov_action (str): The government action to be analysed
full_grouped_df (pandas.DataFrame): the data source used. From Kaggle's full_grouped data set
Returns:
null
"""
fig = px.scatter(
full_grouped_df[full_grouped_df[gov_action] != None],
x= gov_action,
y= covid_outcome,
color= "location",
title= gov_action,
height=600,
)
fig.update_layout(yaxis=dict(range=[0, 10]))
fig.show()
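An example call against the OWID frame (a sketch using columns created above):
# Example: stringency index vs. percent change in new cases per million (sketch)
plot_gov_action("pct_change_new_cases_per_million", "stringency_index", full_grouped_1)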
plt.figure(figsize=(16,9))
sns.set_style("dark")
sns.scatterplot(x = 'stringency_index', y = 'pct_change_new_cases_per_million', data = full_grouped_1, hue = "continent")
plt.show()
different_grouped = pd.read_csv("/work/corona-virus-report/full_grouped.csv")
full_grouped_1['location'].unique()
plt.figure(figsize=(16,9))
plt.grid()
plt.scatter(y = 'new_cases_per_million', x = 'stringency_index', data = full_grouped_1[full_grouped_1['location'] == "China"])
plt.scatter(y = 'new_cases_per_million', x = 'stringency_index', data = full_grouped_1[full_grouped_1['location'] == "United States"])
plt.scatter(y = 'new_cases_per_million', x = 'stringency_index', data = full_grouped_1[full_grouped_1['location'] == "United Kingdom"])
plt.scatter(y = 'new_cases_per_million', x = 'stringency_index', data = full_grouped_1[full_grouped_1['location'] == "India"])
plt.scatter(y = 'new_cases_per_million', x = 'stringency_index', data = full_grouped_1[full_grouped_1['location'] == "France"])
plt.legend(['China',"US","UK","India","France"])
plt.xlabel('Stringency')
plt.ylabel('New cases per million')
Q2: How much do government interventions matter?
full_grouped['Confirmed_per_1000'].describe()
full_grouped['log_Confirmed_per_1000'] = np.log(full_grouped['Confirmed_per_1000']+1)
full_grouped['log_Confirmed_per_1000'].describe()
full_grouped_1['log_new_cases_per_million'] = np.log(full_grouped_1['new_cases_per_million']+1)
full_grouped_1['log_new_cases_per_million'].describe()
#Plot pairplot with countries organized by continent
g = sns.pairplot(full_grouped_1[['log_new_cases_per_million', 'population_density', 'human_development_index','handwashing_facilities', 'median_age','continent']], hue='continent')
f, ax = plt.subplots(figsize=(10, 8))
corr = full_grouped_1.corr(numeric_only=True)  # restrict to numeric columns (required in newer pandas)
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Plot heatmap
sns.heatmap(corr, mask=mask, cmap=cmap, square=True, ax=ax)
# 'post_schools', 'post_gathering', 'post_nonessential', 'post_publicplace',
# OLS regression
y = full_grouped_1['log_new_cases_per_million']
X = full_grouped_1[['stringency_index','population_density', 'human_development_index','handwashing_facilities', 'median_age']]
X = sm.add_constant(X)
ols_model = sm.OLS(y, X.astype(float), missing='drop')
result = ols_model.fit()
print(result.summary2())
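Daily observations within a country are serially correlated, so conventional OLS standard errors are likely too small. One robustness option (a sketch, not part of the original analysis) is to cluster standard errors by location:
# Robustness sketch: cluster standard errors by country
# Drop missing rows first so the group labels align with the design matrix
reg_cols = ['log_new_cases_per_million', 'stringency_index', 'population_density',
            'human_development_index', 'handwashing_facilities', 'median_age']
reg_df = full_grouped_1[reg_cols + ['location']].dropna()
y_c = reg_df['log_new_cases_per_million']
X_c = sm.add_constant(reg_df[reg_cols[1:]])
result_clustered = sm.OLS(y_c, X_c.astype(float)).fit(
    cov_type='cluster', cov_kwds={'groups': reg_df['location']}
)
print(result_clustered.summary2())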
full_grouped_1['continent'].unique()
data = full_grouped_1[full_grouped_1['continent'] == "Europe"]
# OLS regression
y = data['log_new_cases_per_million']
X = data[['stringency_index','population_density', 'human_development_index','handwashing_facilities', 'median_age']]
X = sm.add_constant(X)
ols_model = sm.OLS(y, X.astype(float), missing='drop')
result = ols_model.fit()
print(result.summary2())
data = full_grouped_1[full_grouped_1['continent'] == "Asia"]
# OLS regression
y = data['log_new_cases_per_million']
X = data[['stringency_index','population_density', 'human_development_index','handwashing_facilities', 'median_age']]
X = sm.add_constant(X)
ols_model = sm.OLS(y, X.astype(float), missing='drop')
result = ols_model.fit()
print(result.summary2())
data = full_grouped_1[full_grouped_1['continent'] == "North America"]
# OLS regression
y = data['log_new_cases_per_million']
X = data[['stringency_index','population_density', 'human_development_index','handwashing_facilities', 'median_age']]
X = sm.add_constant(X)
ols_model = sm.OLS(y, X.astype(float), missing='drop')
result = ols_model.fit()
print(result.summary2())
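The three continent-level regressions repeat the same code; a small helper (a sketch using only the columns already defined above) removes the duplication:
# Sketch: wrap the repeated per-continent regression in a helper
def run_continent_ols(df, continent):
    subset = df[df['continent'] == continent]
    y = subset['log_new_cases_per_million']
    X = sm.add_constant(subset[['stringency_index', 'population_density',
                                'human_development_index',
                                'handwashing_facilities', 'median_age']])
    return sm.OLS(y, X.astype(float), missing='drop').fit()

for continent in ['Europe', 'Asia', 'North America']:
    print(run_continent_ols(full_grouped_1, continent).summary2())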