Task
# Import modules for API calls
import requests
import io
import pandas as pd
import requests
import json
from datetime import datetime
# Import module for plotting
import seaborn as sns
## JHU Vaccination Rates (Taken From: https://github.com/owid/covid-19-data/tree/master/public/data)
url = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv'
# Fetch the CSV over HTTPS. Fail fast on network stalls (timeout) and on
# non-2xx responses (raise_for_status) instead of silently feeding an HTML
# error page into read_csv.
response = requests.get(url, timeout=60)
response.raise_for_status()
download = response.content
covid = pd.read_csv(io.StringIO(download.decode('utf-8')), parse_dates=['date'])
covid.tail()
pip install statsmodels
Requirement already satisfied: statsmodels in /usr/local/lib/python3.7/site-packages (0.13.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/site-packages (from statsmodels) (0.5.2)
Requirement already satisfied: scipy>=1.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.7.1)
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.2.5)
Requirement already satisfied: numpy>=1.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.19.5)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2021.3)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2.8.2)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
Note: you may need to restart the kernel to use updated packages.
# Data Management
from dateutil import relativedelta as rd
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
pyo.init_notebook_mode()
import seaborn as sns
# Regression
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.graphics.api as smg
# Quick inspection of the OWID dataframe: list every column name, then show
# the most recent rows (notebook display of the last expression).
covid.columns.values
covid.tail()
full_grouped = covid.copy()
# Forward-fill missing values (the original comment said "backfill", but
# method="ffill" propagates values FORWARD).
# NOTE(review): the frame is not grouped by location first, so a fill can
# leak the last value of one country into the next — confirm intended.
full_grouped = full_grouped.ffill()
# Create percent changes in covid19 outcomes, computed within each location.
covid_outcomes = ["new_cases", "new_deaths", "new_cases_per_million", "stringency_index"]
for covid_outcome in covid_outcomes:
    pct_col = "pct_change_" + covid_outcome
    full_grouped[pct_col] = full_grouped.groupby("location")[covid_outcome].pct_change()
    # Division by zero in pct_change yields +/-inf. Zero out ONLY those
    # cells: the original `full_grouped[mask] = 0` assigned 0 to every
    # column of the matching rows, destroying those observations entirely.
    full_grouped[pct_col] = full_grouped[pct_col].replace([np.inf, -np.inf], 0)
# Replace space in variable names with '_'
full_grouped.columns = full_grouped.columns.str.replace(" ", "_")
full_grouped.tail()
full_grouped.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121539 entries, 0 to 121538
Data columns (total 69 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 iso_code 121539 non-null object
1 continent 121539 non-null object
2 location 121539 non-null object
3 date 121539 non-null object
4 total_cases 121539 non-null float64
5 new_cases 121539 non-null float64
6 new_cases_smoothed 121534 non-null float64
7 total_deaths 121516 non-null float64
8 new_deaths 121516 non-null float64
9 new_deaths_smoothed 121534 non-null float64
10 total_cases_per_million 121539 non-null float64
11 new_cases_per_million 121539 non-null float64
12 new_cases_smoothed_per_million 121534 non-null float64
13 total_deaths_per_million 121516 non-null float64
14 new_deaths_per_million 121516 non-null float64
15 new_deaths_smoothed_per_million 121534 non-null float64
16 reproduction_rate 121512 non-null float64
17 icu_patients 119776 non-null float64
18 icu_patients_per_million 119776 non-null float64
19 hosp_patients 115238 non-null float64
20 hosp_patients_per_million 115238 non-null float64
21 weekly_icu_admissions 98070 non-null float64
22 weekly_icu_admissions_per_million 98070 non-null float64
23 weekly_hosp_admissions 111768 non-null float64
24 weekly_hosp_admissions_per_million 111768 non-null float64
25 new_tests 120462 non-null float64
26 total_tests 120462 non-null float64
27 total_tests_per_thousand 120462 non-null float64
28 new_tests_per_thousand 120462 non-null float64
29 new_tests_smoothed 120455 non-null float64
30 new_tests_smoothed_per_thousand 120455 non-null float64
31 positive_rate 120444 non-null float64
32 tests_per_case 120444 non-null float64
33 tests_units 120462 non-null object
34 total_vaccinations 121250 non-null float64
35 people_vaccinated 121250 non-null float64
36 people_fully_vaccinated 121190 non-null float64
37 total_boosters 115932 non-null float64
38 new_vaccinations 121174 non-null float64
39 new_vaccinations_smoothed 121249 non-null float64
40 total_vaccinations_per_hundred 121250 non-null float64
41 people_vaccinated_per_hundred 121250 non-null float64
42 people_fully_vaccinated_per_hundred 121190 non-null float64
43 total_boosters_per_hundred 115932 non-null float64
44 new_vaccinations_smoothed_per_million 121249 non-null float64
45 stringency_index 121539 non-null float64
46 population 121539 non-null float64
47 population_density 121539 non-null float64
48 median_age 121539 non-null float64
49 aged_65_older 121539 non-null float64
50 aged_70_older 121539 non-null float64
51 gdp_per_capita 121539 non-null float64
52 extreme_poverty 120462 non-null float64
53 cardiovasc_death_rate 121539 non-null float64
54 diabetes_prevalence 121539 non-null float64
55 female_smokers 120462 non-null float64
56 male_smokers 120462 non-null float64
57 handwashing_facilities 121539 non-null float64
58 hospital_beds_per_thousand 121539 non-null float64
59 life_expectancy 121539 non-null float64
60 human_development_index 121539 non-null float64
61 excess_mortality_cumulative_absolute 120458 non-null float64
62 excess_mortality_cumulative 120458 non-null float64
63 excess_mortality 120458 non-null float64
64 excess_mortality_cumulative_per_million 120458 non-null float64
65 pct_change_new_cases 104879 non-null float64
66 pct_change_new_deaths 79463 non-null float64
67 pct_change_new_cases_per_million 86874 non-null float64
68 pct_change_stringency_index 102279 non-null float64
dtypes: float64(64), object(5)
memory usage: 64.0+ MB
# Read country info and rename the key column to 'Country' for later merges.
cty_info = pd.read_csv('/work/countryinfo/covid19countryinfo.csv').rename(columns={'country': 'Country'})
# Keep only aggregate country-level observations: region-level rows have
# the `region` column populated.
cty_info = cty_info[cty_info.region.isnull()]
# The monetary columns are strings with thousands separators (e.g. "1,234");
# strip the commas and convert to float. `.str` methods propagate NaN, so
# the original explicit null-filter (a chained-indexing pattern) is not needed.
for money_col in ['healthexp', 'gdp2019']:
    cty_info[money_col] = cty_info[money_col].str.replace(',', '').astype('float')
# Parse the government-action columns (date each intervention started).
gov_actions = ['quarantine', 'schools', 'gathering', 'nonessential', 'publicplace']
for gov_action in gov_actions:
    cty_info[gov_action] = pd.to_datetime(cty_info[gov_action], format='%m/%d/%Y')
# Filter columns of interest
# Note: feel free to explore other variables or datasets
cty_info = cty_info[['Country', 'avghumidity', 'avgtemp', 'fertility', 'medianage', 'urbanpop',
                     'quarantine', 'schools', 'publicplace', 'gatheringlimit', 'gathering',
                     'nonessential', 'hospibed', 'smokers', 'sex0', 'sex14', 'sex25', 'sex54',
                     'sex64', 'sex65plus', 'sexratio', 'lung', 'femalelung', 'malelung',
                     'gdp2019', 'healthexp', 'healthperpop']]
cty_info.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 175 entries, 0 to 192
Data columns (total 27 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Country 175 non-null object
1 avghumidity 145 non-null float64
2 avgtemp 145 non-null float64
3 fertility 174 non-null float64
4 medianage 175 non-null float64
5 urbanpop 175 non-null float64
6 quarantine 24 non-null datetime64[ns]
7 schools 26 non-null datetime64[ns]
8 publicplace 14 non-null datetime64[ns]
9 gatheringlimit 12 non-null float64
10 gathering 12 non-null datetime64[ns]
11 nonessential 4 non-null datetime64[ns]
12 hospibed 175 non-null float64
13 smokers 123 non-null float64
14 sex0 173 non-null float64
15 sex14 173 non-null float64
16 sex25 173 non-null float64
17 sex54 173 non-null float64
18 sex64 173 non-null float64
19 sex65plus 173 non-null float64
20 sexratio 172 non-null float64
21 lung 163 non-null float64
22 femalelung 163 non-null float64
23 malelung 163 non-null float64
24 gdp2019 174 non-null float64
25 healthexp 170 non-null float64
26 healthperpop 173 non-null float64
dtypes: datetime64[ns](5), float64(21), object(1)
memory usage: 38.3+ KB
# Worldometer data
# ================
worldometer_data = pd.read_csv("/work/corona-virus-report/worldometer_data.csv")
# Replace missing values '' with NAN and then 0
worldometer_data = worldometer_data.replace("", np.nan).fillna(0)
# Derived rates, rounded to two decimal points. Rows with zero TotalTests /
# TotalCases produce inf here; handled just below.
worldometer_data["Case Positivity"] = round(
    worldometer_data["TotalCases"] / worldometer_data["TotalTests"], 2
)
worldometer_data["Case Fatality"] = round(
    worldometer_data["TotalDeaths"] / worldometer_data["TotalCases"], 2
)
# Replace the division-by-zero infinities in the affected columns ONLY.
# (The original `worldometer_data[mask] = 0` assigned 0 to every column of
# the matching rows, wiping out the country name and all counts.)
worldometer_data["Case Positivity"] = worldometer_data["Case Positivity"].replace(np.inf, 0)
worldometer_data["Case Fatality"] = worldometer_data["Case Fatality"].replace(np.inf, 0)
# Place case positivity into three equal-frequency bins
worldometer_data["Case Positivity Bin"] = pd.qcut(
    worldometer_data["Case Positivity"], q=3, labels=["low", "medium", "high"]
)
# Population Structure
worldometer_pop_struc = pd.read_csv("/work/COVID-19 worldometer daily snapshots/population_structure_by_age_per_contry.csv")
# Replace missing values with zeros
worldometer_pop_struc = worldometer_pop_struc.fillna(0)
# Merge datasets by common key country
worldometer_data = worldometer_data.merge(
    worldometer_pop_struc, how="inner", left_on="Country/Region", right_on="Country"
)
# Drop rows whose country key is 0 (kept for compatibility with the old
# row-zeroing behaviour; a no-op once the inf fix above is in place).
worldometer_data = worldometer_data[worldometer_data["Country/Region"] != 0]
# Country information
worldometer_data = worldometer_data.merge(cty_info, how="left", on="Country")
# Replace space in variable names with '_'
worldometer_data.columns = worldometer_data.columns.str.replace(" ", "_")
# Full data
# =========
# Daily case counts per country/province (JHU-derived Kaggle export).
# full_table = pd.read_csv('../input/corona-virus-report/covid_19_clean_complete.csv')
# full_table = pd.read_csv(local_path + 'covid_19_clean_complete.csv')
full_table = pd.read_csv("/work/corona-virus-report/covid_19_clean_complete.csv")
# Parse the date strings once up front so later date arithmetic works.
full_table["Date"] = pd.to_datetime(full_table["Date"])
# Grouped by day, country
# =======================
# NOTE(review): this rebinds `full_grouped`, discarding the OWID-derived
# frame built earlier in the notebook — confirm intended.
# full_grouped = pd.read_csv('../input/corona-virus-report/full_grouped.csv')
# full_grouped = pd.read_csv(local_path + 'full_grouped.csv')
full_grouped = pd.read_csv("/work/corona-virus-report/full_grouped.csv")
full_grouped["Date"] = pd.to_datetime(full_grouped["Date"])
# full_grouped.loc[full_grouped['Country/Region'] == 'US', 'Country/Region'] = 'USA'
full_grouped.head()
# Correct country names in worldometer to make them consistent with dataframe
# full_grouped column Country/Region before merging.
worldometer_data["Country/Region"].replace(
    {
        "USA": "US",
        "UAE": "United Arab Emirates",
        "S. Korea": "South Korea",
        "UK": "United Kingdom",
    },
    inplace=True,
)
# Draw population and country-level data
full_grouped = full_grouped.merge(
    worldometer_data[["Country/Region", "Population"]], how="left", on="Country/Region"
)
full_grouped = full_grouped.merge(
    cty_info, how="left", left_on="Country/Region", right_on="Country"
)
full_grouped["Confirmed per 1000"] = (
    full_grouped["Confirmed"] / full_grouped["Population"] * 1000
)
# Forward-fill missing values (original comment said "backfill"; ffill
# propagates forward). NOTE(review): not grouped by country first, so values
# can leak across country boundaries — confirm intended.
full_grouped = full_grouped.ffill()
# Create post-intervention indicators and days relative to each action date.
gov_actions = ["quarantine", "schools", "gathering", "nonessential", "publicplace"]
for gov_action in gov_actions:
    full_grouped["post_" + gov_action] = (
        full_grouped["Date"] >= full_grouped[gov_action]
    )
    full_grouped["day_rel_to_" + gov_action] = (
        full_grouped["Date"] - full_grouped[gov_action]
    ).dt.days
# Create percent changes in covid19 outcomes, computed within each country.
covid_outcomes = ["Confirmed", "Deaths", "Recovered", "Active", "Confirmed per 1000"]
for covid_outcome in covid_outcomes:
    pct_col = "pct_change_" + covid_outcome
    full_grouped[pct_col] = full_grouped.groupby("Country/Region")[covid_outcome].pct_change()
    # Zero out ONLY the inf cells caused by division by zero; the original
    # `full_grouped[mask] = 0` assigned 0 across every column of those rows.
    full_grouped[pct_col] = full_grouped[pct_col].replace([np.inf, -np.inf], 0)
# Replace space in variable names with '_'
full_grouped.columns = full_grouped.columns.str.replace(" ", "_")
full_grouped.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 35156 entries, 0 to 35155
Data columns (total 54 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 35156 non-null object
1 Country/Region 35156 non-null object
2 Confirmed 35156 non-null int64
3 Deaths 35156 non-null int64
4 Recovered 35156 non-null int64
5 Active 35156 non-null int64
6 New_cases 35156 non-null int64
7 New_deaths 35156 non-null int64
8 New_recovered 35156 non-null int64
9 WHO_Region 35156 non-null object
10 Population 35156 non-null float64
11 Country 35156 non-null object
12 avghumidity 35156 non-null float64
13 avgtemp 35156 non-null float64
14 fertility 35156 non-null float64
15 medianage 35156 non-null float64
16 urbanpop 35156 non-null float64
17 quarantine 35150 non-null object
18 schools 35124 non-null object
19 publicplace 35148 non-null object
20 gatheringlimit 35148 non-null float64
21 gathering 35148 non-null object
22 nonessential 35095 non-null object
23 hospibed 35156 non-null float64
24 smokers 35155 non-null float64
25 sex0 35156 non-null float64
26 sex14 35156 non-null float64
27 sex25 35156 non-null float64
28 sex54 35156 non-null float64
29 sex64 35156 non-null float64
30 sex65plus 35156 non-null float64
31 sexratio 35156 non-null float64
32 lung 35156 non-null float64
33 femalelung 35156 non-null float64
34 malelung 35156 non-null float64
35 gdp2019 35156 non-null float64
36 healthexp 35156 non-null float64
37 healthperpop 35156 non-null float64
38 Confirmed_per_1000 35156 non-null float64
39 post_quarantine 35156 non-null object
40 day_rel_to_quarantine 35150 non-null float64
41 post_schools 35156 non-null object
42 day_rel_to_schools 35124 non-null float64
43 post_gathering 35156 non-null object
44 day_rel_to_gathering 35148 non-null float64
45 post_nonessential 35156 non-null object
46 day_rel_to_nonessential 35095 non-null float64
47 post_publicplace 35156 non-null object
48 day_rel_to_publicplace 35148 non-null float64
49 pct_change_Confirmed 27152 non-null float64
50 pct_change_Deaths 21438 non-null float64
51 pct_change_Recovered 23768 non-null float64
52 pct_change_Active 25665 non-null float64
53 pct_change_Confirmed_per_1000 26741 non-null float64
dtypes: float64(33), int64(7), object(14)
memory usage: 14.8+ MB
# Visualize the missingness issue in the dataset: each heatmap cell is
# colored by whether the corresponding cty_info value is null.
sns.heatmap(cty_info.isnull(), cbar=False)
Define Function
# Create a function to plot (reusing from the previous BootCamp)
def gt_n(n):
    """Plot confirmed (raw and log scale) case trajectories for every country
    exceeding `n` cumulative cases, aligned to the first day each country
    crossed that threshold.

    Reads the module-level `full_grouped` and `full_table` dataframes.
    """
    # Countries that ever exceed the threshold.
    over_threshold = full_grouped.loc[
        full_grouped["Confirmed"] > n, "Country/Region"
    ].unique()
    # Daily country totals, restricted to above-threshold days.
    daily = (
        full_table[full_table["Country/Region"].isin(over_threshold)]
        .groupby(["Country/Region", "Date"])["Confirmed"]
        .sum()
        .reset_index()
    )
    daily = daily[daily["Confirmed"] > n]
    daily["Log Confirmed"] = np.log(1 + daily["Confirmed"])
    # First date each country crossed the threshold.
    first_day = daily.groupby("Country/Region")["Date"].min().reset_index()
    first_day.columns = ["Country/Region", "Min Date"]
    aligned = pd.merge(daily, first_day, on="Country/Region")
    aligned["Date"] = pd.to_datetime(aligned["Date"])
    aligned["Min Date"] = pd.to_datetime(aligned["Min Date"])
    aligned["N days"] = (aligned["Date"] - aligned["Min Date"]).dt.days
    # Same chart twice: once on the raw scale, once on the log scale.
    for y_col in ("Confirmed", "Log Confirmed"):
        fig = px.line(
            aligned,
            x="N days",
            y=y_col,
            color="Country/Region",
            title="N days from " + str(n) + " case",
            height=600,
        )
        fig.show()
2.3a Edited: parameterized version of the plotting function above (takes the dataframes as arguments instead of reading globals)
def graph_cty_exceeding_cases(n, full_grouped_df, full_table_df):
    """
    Graph the countries with more than n confirmed cases, aligning each
    country's curve to the first day it crossed the threshold.

    Parameters:
        n (int): The threshold minimum number of cases
        full_grouped_df (pandas.DataFrame): the data source used. From Kaggle's full_grouped data set
        full_table_df (pandas.DataFrame): the data source used. From Kaggle's covid_19_clean_complete data set

    Returns:
        None
    """
    # Countries that ever exceed the threshold.
    over_threshold = full_grouped_df.loc[
        full_grouped_df["Confirmed"] > n, "Country/Region"
    ].unique()
    # Daily country totals, restricted to above-threshold days.
    daily = (
        full_table_df[full_table_df["Country/Region"].isin(over_threshold)]
        .groupby(["Country/Region", "Date"])["Confirmed"]
        .sum()
        .reset_index()
    )
    daily = daily[daily["Confirmed"] > n]
    daily["Log Confirmed"] = np.log(1 + daily["Confirmed"])
    # First date each country crossed the threshold.
    first_day = daily.groupby("Country/Region")["Date"].min().reset_index()
    first_day.columns = ["Country/Region", "Min Date"]
    aligned = pd.merge(daily, first_day, on="Country/Region")
    aligned["Date"] = pd.to_datetime(aligned["Date"])
    aligned["Min Date"] = pd.to_datetime(aligned["Min Date"])
    aligned["N days"] = (aligned["Date"] - aligned["Min Date"]).dt.days
    # Same chart twice: raw scale, then log scale.
    for y_col in ("Confirmed", "Log Confirmed"):
        fig = px.line(
            aligned,
            x="N days",
            y=y_col,
            color="Country/Region",
            title="N days from " + str(n) + " case",
            height=600,
        )
        fig.show()
Q1: Do government actions matter?
def plot_gov_action(covid_outcome, gov_action, full_grouped_df):
    """
    Plot a covid outcome against the number of days relative to a government
    action, one color per country.

    Parameters:
        covid_outcome (str): The outcome from covid (column name to plot on y)
        gov_action (str): The government action to be analysed
        full_grouped_df (pandas.DataFrame): the data source used. From Kaggle's full_grouped data set

    Returns:
        None
    """
    # BUG FIX: the original `df[gov_action] != None` compares each element to
    # None, which is True for EVERY row (including NaN/NaT), so the filter was
    # a no-op. notna() actually drops countries with no recorded action date.
    has_action = full_grouped_df[full_grouped_df[gov_action].notna()]
    fig = px.scatter(
        has_action,
        x="day_rel_to_" + gov_action,
        y=covid_outcome,
        color="Country/Region",
        title="N days from " + gov_action,
        height=600,
    )
    # Clamp the y-axis so extreme percent changes don't flatten the plot.
    fig.update_layout(yaxis=dict(range=[0, 10]))
    fig.show()
# perhaps test theory with:
# gov_actions = ['publicplace', 'gatheringlimit']
plt.figure(figsize=(16, 9))
sns.set_style("dark")
# One scatter per candidate intervention column, shown sequentially.
for action_col in ("publicplace", "gatheringlimit"):
    sns.scatterplot(
        x="pct_change_Confirmed_per_1000",
        y=action_col,
        data=full_grouped,
        hue="WHO_Region",
    )
    plt.show()
full_grouped_1 = covid.copy()
# Forward-fill missing values (the original comment said "backfill"; ffill
# propagates forward). NOTE(review): not grouped by location first, so fills
# can leak across country boundaries — confirm intended.
full_grouped_1 = full_grouped_1.ffill()
# Create percent changes in covid19 outcomes, computed within each location.
covid_outcomes = ["new_cases", "new_deaths", "new_cases_per_million", "stringency_index"]
for covid_outcome in covid_outcomes:
    pct_col = "pct_change_" + covid_outcome
    full_grouped_1[pct_col] = full_grouped_1.groupby("location")[covid_outcome].pct_change()
    # Zero out ONLY the inf cells from division by zero; the original
    # `full_grouped_1[mask] = 0` assigned 0 to every column of those rows.
    full_grouped_1[pct_col] = full_grouped_1[pct_col].replace([np.inf, -np.inf], 0)
# Replace space in variable names with '_'
full_grouped_1.columns = full_grouped_1.columns.str.replace(" ", "_")
full_grouped_1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121539 entries, 0 to 121538
Data columns (total 69 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 iso_code 121539 non-null object
1 continent 121539 non-null object
2 location 121539 non-null object
3 date 121539 non-null object
4 total_cases 121539 non-null float64
5 new_cases 121539 non-null float64
6 new_cases_smoothed 121534 non-null float64
7 total_deaths 121516 non-null float64
8 new_deaths 121516 non-null float64
9 new_deaths_smoothed 121534 non-null float64
10 total_cases_per_million 121539 non-null float64
11 new_cases_per_million 121539 non-null float64
12 new_cases_smoothed_per_million 121534 non-null float64
13 total_deaths_per_million 121516 non-null float64
14 new_deaths_per_million 121516 non-null float64
15 new_deaths_smoothed_per_million 121534 non-null float64
16 reproduction_rate 121512 non-null float64
17 icu_patients 119776 non-null float64
18 icu_patients_per_million 119776 non-null float64
19 hosp_patients 115238 non-null float64
20 hosp_patients_per_million 115238 non-null float64
21 weekly_icu_admissions 98070 non-null float64
22 weekly_icu_admissions_per_million 98070 non-null float64
23 weekly_hosp_admissions 111768 non-null float64
24 weekly_hosp_admissions_per_million 111768 non-null float64
25 new_tests 120462 non-null float64
26 total_tests 120462 non-null float64
27 total_tests_per_thousand 120462 non-null float64
28 new_tests_per_thousand 120462 non-null float64
29 new_tests_smoothed 120455 non-null float64
30 new_tests_smoothed_per_thousand 120455 non-null float64
31 positive_rate 120444 non-null float64
32 tests_per_case 120444 non-null float64
33 tests_units 120462 non-null object
34 total_vaccinations 121250 non-null float64
35 people_vaccinated 121250 non-null float64
36 people_fully_vaccinated 121190 non-null float64
37 total_boosters 115932 non-null float64
38 new_vaccinations 121174 non-null float64
39 new_vaccinations_smoothed 121249 non-null float64
40 total_vaccinations_per_hundred 121250 non-null float64
41 people_vaccinated_per_hundred 121250 non-null float64
42 people_fully_vaccinated_per_hundred 121190 non-null float64
43 total_boosters_per_hundred 115932 non-null float64
44 new_vaccinations_smoothed_per_million 121249 non-null float64
45 stringency_index 121539 non-null float64
46 population 121539 non-null float64
47 population_density 121539 non-null float64
48 median_age 121539 non-null float64
49 aged_65_older 121539 non-null float64
50 aged_70_older 121539 non-null float64
51 gdp_per_capita 121539 non-null float64
52 extreme_poverty 120462 non-null float64
53 cardiovasc_death_rate 121539 non-null float64
54 diabetes_prevalence 121539 non-null float64
55 female_smokers 120462 non-null float64
56 male_smokers 120462 non-null float64
57 handwashing_facilities 121539 non-null float64
58 hospital_beds_per_thousand 121539 non-null float64
59 life_expectancy 121539 non-null float64
60 human_development_index 121539 non-null float64
61 excess_mortality_cumulative_absolute 120458 non-null float64
62 excess_mortality_cumulative 120458 non-null float64
63 excess_mortality 120458 non-null float64
64 excess_mortality_cumulative_per_million 120458 non-null float64
65 pct_change_new_cases 104879 non-null float64
66 pct_change_new_deaths 79463 non-null float64
67 pct_change_new_cases_per_million 86874 non-null float64
68 pct_change_stringency_index 102279 non-null float64
dtypes: float64(64), object(5)
memory usage: 64.0+ MB
def plot_gov_action(covid_outcome, gov_action, full_grouped_df):
    """
    Plot a covid outcome against a government-action column (OWID variant:
    colors by `location` and uses the action column directly on the x-axis).

    Parameters:
        covid_outcome (str): The outcome from covid (column name to plot on y)
        gov_action (str): The government action column to be analysed
        full_grouped_df (pandas.DataFrame): the data source used (OWID-derived frame)

    Returns:
        None
    """
    # BUG FIX: the original `df[gov_action] != None` compares each element to
    # None, which is True for EVERY row (including NaN), so the filter was a
    # no-op. notna() actually drops rows with no value for the action column.
    has_action = full_grouped_df[full_grouped_df[gov_action].notna()]
    fig = px.scatter(
        has_action,
        x=gov_action,
        y=covid_outcome,
        color="location",
        title=gov_action,
        height=600,
    )
    # Clamp the y-axis so extreme percent changes don't flatten the plot.
    fig.update_layout(yaxis=dict(range=[0, 10]))
    fig.show()
# Overview: stringency vs. percent change in new cases, colored by continent.
plt.figure(figsize=(16, 9))
sns.set_style("dark")
sns.scatterplot(
    x="stringency_index",
    y="pct_change_new_cases_per_million",
    data=full_grouped_1,
    hue="continent",
)
plt.show()
different_grouped = pd.read_csv("/work/corona-virus-report/full_grouped.csv")
full_grouped_1["location"].unique()
# Drill-down: one scatter layer per highlighted country.
plt.figure(figsize=(16, 9))
plt.grid()
for highlight in ("China", "United States", "United Kingdom", "India", "France"):
    country_rows = full_grouped_1[full_grouped_1["location"] == highlight]
    plt.scatter(y="new_cases_per_million", x="stringency_index", data=country_rows)
plt.legend(['China', "US", "UK", "India", "France"])
plt.xlabel('Stringency')
plt.ylabel('New cases per million (smoothed)')
Q2: How much do government interventions matter?
# Log-transform the outcome variables (log(x + 1) to keep zeros finite).
# NOTE(review): negative values (possible after pct_change/ffill upstream)
# yield NaN here — this is the "invalid value encountered in log"
# RuntimeWarning seen in the output.
full_grouped['Confirmed_per_1000'].describe()
full_grouped['log_Confirmed_per_1000'] = np.log(full_grouped['Confirmed_per_1000']+1)
full_grouped['log_Confirmed_per_1000'].describe()
full_grouped_1['log_new_cases_per_million'] = np.log(full_grouped_1['new_cases_per_million']+1)
full_grouped_1['log_new_cases_per_million'].describe()
#Plot pairplot with countries organized by continent
g = sns.pairplot(full_grouped_1[['log_new_cases_per_million', 'population_density', 'human_development_index','handwashing_facilities', 'median_age','continent']], hue='continent')
f, ax = plt.subplots(figsize=(10, 8))
# Pairwise correlations of all numeric columns.
corr = full_grouped_1.corr()
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Plot heatmap
sns.heatmap(corr, mask=mask, cmap=cmap, square=True, ax=ax)
# NOTE(review): the next two lines duplicate the log transform computed
# above verbatim; the recomputation is idempotent and could be removed.
full_grouped_1['log_new_cases_per_million'] = np.log(full_grouped_1['new_cases_per_million']+1)
full_grouped_1['log_new_cases_per_million'].describe()
# 'post_schools', 'post_gathering', 'post_nonessential', 'post_publicplace',
# OLS regression: log new cases per million on stringency plus demographic
# controls, over the full (all-continent) sample.
y = full_grouped_1["log_new_cases_per_million"]
regressors = [
    "stringency_index",
    "population_density",
    "human_development_index",
    "handwashing_facilities",
    "median_age",
]
X = sm.add_constant(full_grouped_1[regressors])
# missing='drop' discards rows with NaN in y or X before fitting.
ols_model = sm.OLS(y, X.astype(float), missing="drop")
result = ols_model.fit()
print(result.summary2())
Results: Ordinary least squares
=============================================================================
Model: OLS Adj. R-squared: 0.328
Dependent Variable: log_new_cases_per_million AIC: 488489.1558
Date: 2021-10-06 11:54 BIC: 488547.4014
No. Observations: 121491 Log-Likelihood: -2.4424e+05
Df Model: 5 F-statistic: 1.188e+04
Df Residuals: 121485 Prob (F-statistic): 0.00
R-squared: 0.328 Scale: 3.2637
------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -0.3309 0.0122 -27.0240 0.0000 -0.3549 -0.3069
stringency_index 0.0166 0.0003 61.2277 0.0000 0.0160 0.0171
population_density -0.0000 0.0000 -0.7746 0.4386 -0.0000 0.0000
human_development_index 0.7163 0.0618 11.5938 0.0000 0.5952 0.8374
handwashing_facilities 0.0012 0.0002 6.6285 0.0000 0.0009 0.0016
median_age 0.0504 0.0012 41.7447 0.0000 0.0480 0.0527
-----------------------------------------------------------------------------
Omnibus: 10617.366 Durbin-Watson: 0.448
Prob(Omnibus): 0.000 Jarque-Bera (JB): 3289.761
Skew: 0.014 Prob(JB): 0.000
Kurtosis: 2.194 Condition No.: 20325
=============================================================================
* The condition number is large (2e+04). This might indicate
strong multicollinearity or other numerical problems.
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/arraylike.py:358: RuntimeWarning:
invalid value encountered in log
full_grouped_1['continent'].unique()
# Same OLS specification, restricted to the Europe subsample.
data = full_grouped_1[full_grouped_1["continent"] == "Europe"]
regressors = [
    "stringency_index",
    "population_density",
    "human_development_index",
    "handwashing_facilities",
    "median_age",
]
y = data["log_new_cases_per_million"]
X = sm.add_constant(data[regressors])
# missing='drop' discards rows with NaN in y or X before fitting.
ols_model = sm.OLS(y, X.astype(float), missing="drop")
result = ols_model.fit()
print(result.summary2())
Results: Ordinary least squares
=============================================================================
Model: OLS Adj. R-squared: 0.104
Dependent Variable: log_new_cases_per_million AIC: 105582.8902
Date: 2021-10-06 12:13 BIC: 105631.7445
No. Observations: 25397 Log-Likelihood: -52785.
Df Model: 5 F-statistic: 589.6
Df Residuals: 25391 Prob (F-statistic): 0.00
R-squared: 0.104 Scale: 3.7404
------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 1.5248 0.1182 12.8963 0.0000 1.2930 1.7565
stringency_index 0.0280 0.0007 41.2054 0.0000 0.0267 0.0294
population_density -0.0000 0.0000 -7.4845 0.0000 -0.0000 -0.0000
human_development_index -2.4203 0.1851 -13.0764 0.0000 -2.7831 -2.0575
handwashing_facilities 0.0021 0.0004 5.4076 0.0000 0.0013 0.0028
median_age 0.0659 0.0027 24.4941 0.0000 0.0606 0.0711
-----------------------------------------------------------------------------
Omnibus: 3424.104 Durbin-Watson: 0.334
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1149.610
Skew: -0.283 Prob(JB): 0.000
Kurtosis: 2.124 Condition No.: 46027
=============================================================================
* The condition number is large (5e+04). This might indicate
strong multicollinearity or other numerical problems.
# Same OLS specification, restricted to the Asia subsample.
data = full_grouped_1[full_grouped_1["continent"] == "Asia"]
regressors = [
    "stringency_index",
    "population_density",
    "human_development_index",
    "handwashing_facilities",
    "median_age",
]
y = data["log_new_cases_per_million"]
X = sm.add_constant(data[regressors])
# missing='drop' discards rows with NaN in y or X before fitting.
ols_model = sm.OLS(y, X.astype(float), missing="drop")
result = ols_model.fit()
print(result.summary2())
Results: Ordinary least squares
=============================================================================
Model: OLS Adj. R-squared: 0.187
Dependent Variable: log_new_cases_per_million AIC: 104818.5210
Date: 2021-10-06 12:14 BIC: 104867.5366
No. Observations: 26089 Log-Likelihood: -52403.
Df Model: 5 F-statistic: 1200.
Df Residuals: 26083 Prob (F-statistic): 0.00
R-squared: 0.187 Scale: 3.2531
------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -1.8546 0.0785 -23.6399 0.0000 -2.0084 -1.7008
stringency_index 0.0217 0.0006 37.8904 0.0000 0.0206 0.0228
population_density 0.0000 0.0000 3.5461 0.0004 0.0000 0.0000
human_development_index 7.9614 0.1436 55.4449 0.0000 7.6800 8.2429
handwashing_facilities -0.0001 0.0004 -0.3309 0.7407 -0.0010 0.0007
median_age -0.0864 0.0023 -37.1637 0.0000 -0.0910 -0.0819
-----------------------------------------------------------------------------
Omnibus: 2836.588 Durbin-Watson: 0.117
Prob(Omnibus): 0.000 Jarque-Bera (JB): 791.905
Skew: -0.029 Prob(JB): 0.000
Kurtosis: 2.148 Condition No.: 34229
=============================================================================
* The condition number is large (3e+04). This might indicate
strong multicollinearity or other numerical problems.
# Same OLS specification, restricted to the North America subsample.
data = full_grouped_1[full_grouped_1["continent"] == "North America"]
regressors = [
    "stringency_index",
    "population_density",
    "human_development_index",
    "handwashing_facilities",
    "median_age",
]
y = data["log_new_cases_per_million"]
X = sm.add_constant(data[regressors])
# missing='drop' discards rows with NaN in y or X before fitting.
ols_model = sm.OLS(y, X.astype(float), missing="drop")
result = ols_model.fit()
print(result.summary2())
Results: Ordinary least squares
============================================================================
Model: OLS Adj. R-squared: 0.198
Dependent Variable: log_new_cases_per_million AIC: 58726.4412
Date: 2021-10-06 12:14 BIC: 58771.6101
No. Observations: 13741 Log-Likelihood: -29357.
Df Model: 5 F-statistic: 680.8
Df Residuals: 13735 Prob (F-statistic): 0.00
R-squared: 0.199 Scale: 4.2017
----------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
----------------------------------------------------------------------------
const -3.2012 0.1849 -17.3122 0.0000 -3.5636 -2.8387
stringency_index 0.0281 0.0009 32.7124 0.0000 0.0264 0.0298
population_density -0.0007 0.0001 -9.3976 0.0000 -0.0009 -0.0006
human_development_index 2.3135 0.3330 6.9465 0.0000 1.6607 2.9663
handwashing_facilities 0.0044 0.0006 7.1896 0.0000 0.0032 0.0056
median_age 0.0795 0.0043 18.4455 0.0000 0.0710 0.0879
----------------------------------------------------------------------------
Omnibus: 3298.067 Durbin-Watson: 0.260
Prob(Omnibus): 0.000 Jarque-Bera (JB): 605.798
Skew: -0.104 Prob(JB): 0.000
Kurtosis: 1.992 Condition No.: 7262
============================================================================
* The condition number is large (7e+03). This might indicate
strong multicollinearity or other numerical problems.