Task
# Import modules for API calls
import requests
import io
import pandas as pd
import requests
import json
from datetime import datetime
# Import module for plotting
import seaborn as sns
## JHU Vaccination Rates (Taken From: https://github.com/owid/covid-19-data/tree/master/public/data)
url = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv'
# Fetch the CSV over HTTPS. Fail fast on network stalls (timeout) and on
# non-2xx responses (raise_for_status) instead of silently feeding an HTML
# error page into read_csv.
response = requests.get(url, timeout=60)
response.raise_for_status()
download = response.content
covid = pd.read_csv(io.StringIO(download.decode('utf-8')), parse_dates=['date'])
covid.tail()
pip install statsmodels
Requirement already satisfied: statsmodels in /usr/local/lib/python3.7/site-packages (0.13.0)
Requirement already satisfied: patsy>=0.5.2 in /usr/local/lib/python3.7/site-packages (from statsmodels) (0.5.2)
Requirement already satisfied: scipy>=1.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.7.1)
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.2.5)
Requirement already satisfied: numpy>=1.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.19.5)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2021.3)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2.8.2)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv
Note: you may need to restart the kernel to use updated packages.
# Data Management
from dateutil import relativedelta as rd
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
pyo.init_notebook_mode()
import seaborn as sns
# Regression
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.graphics.api as smg
# Quick inspection of the OWID dataframe: list every column name, then show
# the most recent rows (notebook display of the last expression).
covid.columns.values
covid.tail()
full_grouped = covid.copy()
# Forward-fill missing values (the original comment said "backfill", but
# method="ffill" propagates values FORWARD).
# NOTE(review): the frame is not grouped by location first, so a fill can
# leak the last value of one country into the next — confirm intended.
full_grouped = full_grouped.ffill()
# Create percent changes in covid19 outcomes, computed within each location.
covid_outcomes = ["new_cases", "new_deaths", "new_cases_per_million", "stringency_index"]
for covid_outcome in covid_outcomes:
    pct_col = "pct_change_" + covid_outcome
    full_grouped[pct_col] = full_grouped.groupby("location")[covid_outcome].pct_change()
    # Division by zero in pct_change yields +/-inf. Zero out ONLY those
    # cells: the original `full_grouped[mask] = 0` assigned 0 to every
    # column of the matching rows, destroying those observations entirely.
    full_grouped[pct_col] = full_grouped[pct_col].replace([np.inf, -np.inf], 0)
# Replace space in variable names with '_'
full_grouped.columns = full_grouped.columns.str.replace(" ", "_")
full_grouped.tail()
full_grouped.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121539 entries, 0 to 121538
Data columns (total 69 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 iso_code 121539 non-null object
1 continent 121539 non-null object
2 location 121539 non-null object
3 date 121539 non-null object
4 total_cases 121539 non-null float64
5 new_cases 121539 non-null float64
6 new_cases_smoothed 121534 non-null float64
7 total_deaths 121516 non-null float64
8 new_deaths 121516 non-null float64
9 new_deaths_smoothed 121534 non-null float64
10 total_cases_per_million 121539 non-null float64
11 new_cases_per_million 121539 non-null float64
12 new_cases_smoothed_per_million 121534 non-null float64
13 total_deaths_per_million 121516 non-null float64
14 new_deaths_per_million 121516 non-null float64
15 new_deaths_smoothed_per_million 121534 non-null float64
16 reproduction_rate 121512 non-null float64
17 icu_patients 119776 non-null float64
18 icu_patients_per_million 119776 non-null float64
19 hosp_patients 115238 non-null float64
20 hosp_patients_per_million 115238 non-null float64
21 weekly_icu_admissions 98070 non-null float64
22 weekly_icu_admissions_per_million 98070 non-null float64
23 weekly_hosp_admissions 111768 non-null float64
24 weekly_hosp_admissions_per_million 111768 non-null float64
25 new_tests 120462 non-null float64
26 total_tests 120462 non-null float64
27 total_tests_per_thousand 120462 non-null float64
28 new_tests_per_thousand 120462 non-null float64
29 new_tests_smoothed 120455 non-null float64
30 new_tests_smoothed_per_thousand 120455 non-null float64
31 positive_rate 120444 non-null float64
32 tests_per_case 120444 non-null float64
33 tests_units 120462 non-null object
34 total_vaccinations 121250 non-null float64
35 people_vaccinated 121250 non-null float64
36 people_fully_vaccinated 121190 non-null float64
37 total_boosters 115932 non-null float64
38 new_vaccinations 121174 non-null float64
39 new_vaccinations_smoothed 121249 non-null float64
40 total_vaccinations_per_hundred 121250 non-null float64
41 people_vaccinated_per_hundred 121250 non-null float64
42 people_fully_vaccinated_per_hundred 121190 non-null float64
43 total_boosters_per_hundred 115932 non-null float64
44 new_vaccinations_smoothed_per_million 121249 non-null float64
45 stringency_index 121539 non-null float64
46 population 121539 non-null float64
47 population_density 121539 non-null float64
48 median_age 121539 non-null float64
49 aged_65_older 121539 non-null float64
50 aged_70_older 121539 non-null float64
51 gdp_per_capita 121539 non-null float64
52 extreme_poverty 120462 non-null float64
53 cardiovasc_death_rate 121539 non-null float64
54 diabetes_prevalence 121539 non-null float64
55 female_smokers 120462 non-null float64
56 male_smokers 120462 non-null float64
57 handwashing_facilities 121539 non-null float64
58 hospital_beds_per_thousand 121539 non-null float64
59 life_expectancy 121539 non-null float64
60 human_development_index 121539 non-null float64
61 excess_mortality_cumulative_absolute 120458 non-null float64
62 excess_mortality_cumulative 120458 non-null float64
63 excess_mortality 120458 non-null float64
64 excess_mortality_cumulative_per_million 120458 non-null float64
65 pct_change_new_cases 104879 non-null float64
66 pct_change_new_deaths 79463 non-null float64
67 pct_change_new_cases_per_million 86874 non-null float64
68 pct_change_stringency_index 102279 non-null float64
dtypes: float64(64), object(5)
memory usage: 64.0+ MB
# Read country info and rename the key column to 'Country' for later merges.
cty_info = pd.read_csv('/work/countryinfo/covid19countryinfo.csv').rename(columns={'country': 'Country'})
# Keep only aggregate country-level observations: region-level rows have
# the `region` column populated.
cty_info = cty_info[cty_info.region.isnull()]
# The monetary columns are strings with thousands separators (e.g. "1,234");
# strip the commas and convert to float. `.str` methods propagate NaN, so
# the original explicit null-filter (a chained-indexing pattern) is not needed.
for money_col in ['healthexp', 'gdp2019']:
    cty_info[money_col] = cty_info[money_col].str.replace(',', '').astype('float')
# Parse the government-action columns (date each intervention started).
gov_actions = ['quarantine', 'schools', 'gathering', 'nonessential', 'publicplace']
for gov_action in gov_actions:
    cty_info[gov_action] = pd.to_datetime(cty_info[gov_action], format='%m/%d/%Y')
# Filter columns of interest
# Note: feel free to explore other variables or datasets
cty_info = cty_info[['Country', 'avghumidity', 'avgtemp', 'fertility', 'medianage', 'urbanpop',
                     'quarantine', 'schools', 'publicplace', 'gatheringlimit', 'gathering',
                     'nonessential', 'hospibed', 'smokers', 'sex0', 'sex14', 'sex25', 'sex54',
                     'sex64', 'sex65plus', 'sexratio', 'lung', 'femalelung', 'malelung',
                     'gdp2019', 'healthexp', 'healthperpop']]
cty_info.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 175 entries, 0 to 192
Data columns (total 27 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Country 175 non-null object
1 avghumidity 145 non-null float64
2 avgtemp 145 non-null float64
3 fertility 174 non-null float64
4 medianage 175 non-null float64
5 urbanpop 175 non-null float64
6 quarantine 24 non-null datetime64[ns]
7 schools 26 non-null datetime64[ns]
8 publicplace 14 non-null datetime64[ns]
9 gatheringlimit 12 non-null float64
10 gathering 12 non-null datetime64[ns]
11 nonessential 4 non-null datetime64[ns]
12 hospibed 175 non-null float64
13 smokers 123 non-null float64
14 sex0 173 non-null float64
15 sex14 173 non-null float64
16 sex25 173 non-null float64
17 sex54 173 non-null float64
18 sex64 173 non-null float64
19 sex65plus 173 non-null float64
20 sexratio 172 non-null float64
21 lung 163 non-null float64
22 femalelung 163 non-null float64
23 malelung 163 non-null float64
24 gdp2019 174 non-null float64
25 healthexp 170 non-null float64
26 healthperpop 173 non-null float64
dtypes: datetime64[ns](5), float64(21), object(1)
memory usage: 38.3+ KB
# Worldometer data
# ================
worldometer_data = pd.read_csv("/work/corona-virus-report/worldometer_data.csv")
# Replace missing values '' with NAN and then 0
worldometer_data = worldometer_data.replace("", np.nan).fillna(0)
# Derived rates, rounded to two decimal points. Rows with zero TotalTests /
# TotalCases produce inf here; handled just below.
worldometer_data["Case Positivity"] = round(
    worldometer_data["TotalCases"] / worldometer_data["TotalTests"], 2
)
worldometer_data["Case Fatality"] = round(
    worldometer_data["TotalDeaths"] / worldometer_data["TotalCases"], 2
)
# Replace the division-by-zero infinities in the affected columns ONLY.
# (The original `worldometer_data[mask] = 0` assigned 0 to every column of
# the matching rows, wiping out the country name and all counts.)
worldometer_data["Case Positivity"] = worldometer_data["Case Positivity"].replace(np.inf, 0)
worldometer_data["Case Fatality"] = worldometer_data["Case Fatality"].replace(np.inf, 0)
# Place case positivity into three equal-frequency bins
worldometer_data["Case Positivity Bin"] = pd.qcut(
    worldometer_data["Case Positivity"], q=3, labels=["low", "medium", "high"]
)
# Population Structure
worldometer_pop_struc = pd.read_csv("/work/COVID-19 worldometer daily snapshots/population_structure_by_age_per_contry.csv")
# Replace missing values with zeros
worldometer_pop_struc = worldometer_pop_struc.fillna(0)
# Merge datasets by common key country
worldometer_data = worldometer_data.merge(
    worldometer_pop_struc, how="inner", left_on="Country/Region", right_on="Country"
)
# Drop rows whose country key is 0 (kept for compatibility with the old
# row-zeroing behaviour; a no-op once the inf fix above is in place).
worldometer_data = worldometer_data[worldometer_data["Country/Region"] != 0]
# Country information
worldometer_data = worldometer_data.merge(cty_info, how="left", on="Country")
# Replace space in variable names with '_'
worldometer_data.columns = worldometer_data.columns.str.replace(" ", "_")
# Full data
# =========
# Daily case counts per country/province (JHU-derived Kaggle export).
# full_table = pd.read_csv('../input/corona-virus-report/covid_19_clean_complete.csv')
# full_table = pd.read_csv(local_path + 'covid_19_clean_complete.csv')
full_table = pd.read_csv("/work/corona-virus-report/covid_19_clean_complete.csv")
# Parse the date strings once up front so later date arithmetic works.
full_table["Date"] = pd.to_datetime(full_table["Date"])
# Grouped by day, country
# =======================
# NOTE(review): this rebinds `full_grouped`, discarding the OWID-derived
# frame built earlier in the notebook — confirm intended.
# full_grouped = pd.read_csv('../input/corona-virus-report/full_grouped.csv')
# full_grouped = pd.read_csv(local_path + 'full_grouped.csv')
full_grouped = pd.read_csv("/work/corona-virus-report/full_grouped.csv")
full_grouped["Date"] = pd.to_datetime(full_grouped["Date"])
# full_grouped.loc[full_grouped['Country/Region'] == 'US', 'Country/Region'] = 'USA'
full_grouped.head()
# Correct country names in worldometer to make them consistent with dataframe
# full_grouped column Country/Region before merging.
worldometer_data["Country/Region"].replace(
    {
        "USA": "US",
        "UAE": "United Arab Emirates",
        "S. Korea": "South Korea",
        "UK": "United Kingdom",
    },
    inplace=True,
)
# Draw population and country-level data
full_grouped = full_grouped.merge(
    worldometer_data[["Country/Region", "Population"]], how="left", on="Country/Region"
)
full_grouped = full_grouped.merge(
    cty_info, how="left", left_on="Country/Region", right_on="Country"
)
full_grouped["Confirmed per 1000"] = (
    full_grouped["Confirmed"] / full_grouped["Population"] * 1000
)
# Forward-fill missing values (original comment said "backfill"; ffill
# propagates forward). NOTE(review): not grouped by country first, so values
# can leak across country boundaries — confirm intended.
full_grouped = full_grouped.ffill()
# Create post-intervention indicators and days relative to each action date.
gov_actions = ["quarantine", "schools", "gathering", "nonessential", "publicplace"]
for gov_action in gov_actions:
    full_grouped["post_" + gov_action] = (
        full_grouped["Date"] >= full_grouped[gov_action]
    )
    full_grouped["day_rel_to_" + gov_action] = (
        full_grouped["Date"] - full_grouped[gov_action]
    ).dt.days
# Create percent changes in covid19 outcomes, computed within each country.
covid_outcomes = ["Confirmed", "Deaths", "Recovered", "Active", "Confirmed per 1000"]
for covid_outcome in covid_outcomes:
    pct_col = "pct_change_" + covid_outcome
    full_grouped[pct_col] = full_grouped.groupby("Country/Region")[covid_outcome].pct_change()
    # Zero out ONLY the inf cells caused by division by zero; the original
    # `full_grouped[mask] = 0` assigned 0 across every column of those rows.
    full_grouped[pct_col] = full_grouped[pct_col].replace([np.inf, -np.inf], 0)
# Replace space in variable names with '_'
full_grouped.columns = full_grouped.columns.str.replace(" ", "_")
full_grouped.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 35156 entries, 0 to 35155
Data columns (total 54 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Date 35156 non-null object
1 Country/Region 35156 non-null object
2 Confirmed 35156 non-null int64
3 Deaths 35156 non-null int64
4 Recovered 35156 non-null int64
5 Active 35156 non-null int64
6 New_cases 35156 non-null int64
7 New_deaths 35156 non-null int64
8 New_recovered 35156 non-null int64
9 WHO_Region 35156 non-null object
10 Population 35156 non-null float64
11 Country 35156 non-null object
12 avghumidity 35156 non-null float64
13 avgtemp 35156 non-null float64
14 fertility 35156 non-null float64
15 medianage 35156 non-null float64
16 urbanpop 35156 non-null float64
17 quarantine 35150 non-null object
18 schools 35124 non-null object
19 publicplace 35148 non-null object
20 gatheringlimit 35148 non-null float64
21 gathering 35148 non-null object
22 nonessential 35095 non-null object
23 hospibed 35156 non-null float64
24 smokers 35155 non-null float64
25 sex0 35156 non-null float64
26 sex14 35156 non-null float64
27 sex25 35156 non-null float64
28 sex54 35156 non-null float64
29 sex64 35156 non-null float64
30 sex65plus 35156 non-null float64
31 sexratio 35156 non-null float64
32 lung 35156 non-null float64
33 femalelung 35156 non-null float64
34 malelung 35156 non-null float64
35 gdp2019 35156 non-null float64
36 healthexp 35156 non-null float64
37 healthperpop 35156 non-null float64
38 Confirmed_per_1000 35156 non-null float64
39 post_quarantine 35156 non-null object
40 day_rel_to_quarantine 35150 non-null float64
41 post_schools 35156 non-null object
42 day_rel_to_schools 35124 non-null float64
43 post_gathering 35156 non-null object
44 day_rel_to_gathering 35148 non-null float64
45 post_nonessential 35156 non-null object
46 day_rel_to_nonessential 35095 non-null float64
47 post_publicplace 35156 non-null object
48 day_rel_to_publicplace 35148 non-null float64
49 pct_change_Confirmed 27152 non-null float64
50 pct_change_Deaths 21438 non-null float64
51 pct_change_Recovered 23768 non-null float64
52 pct_change_Active 25665 non-null float64
53 pct_change_Confirmed_per_1000 26741 non-null float64
dtypes: float64(33), int64(7), object(14)
memory usage: 14.8+ MB
# Visualize the missingness issue in the dataset: each heatmap cell is
# colored by whether the corresponding cty_info value is null.
sns.heatmap(cty_info.isnull(), cbar=False)
Define Function
# Create a function to plot (reusing from the previous BootCamp)
def gt_n(n):
    """Plot confirmed (raw and log scale) case trajectories for every country
    exceeding `n` cumulative cases, aligned to the first day each country
    crossed that threshold.

    Reads the module-level `full_grouped` and `full_table` dataframes.
    """
    # Countries that ever exceed the threshold.
    over_threshold = full_grouped.loc[
        full_grouped["Confirmed"] > n, "Country/Region"
    ].unique()
    # Daily country totals, restricted to above-threshold days.
    daily = (
        full_table[full_table["Country/Region"].isin(over_threshold)]
        .groupby(["Country/Region", "Date"])["Confirmed"]
        .sum()
        .reset_index()
    )
    daily = daily[daily["Confirmed"] > n]
    daily["Log Confirmed"] = np.log(1 + daily["Confirmed"])
    # First date each country crossed the threshold.
    first_day = daily.groupby("Country/Region")["Date"].min().reset_index()
    first_day.columns = ["Country/Region", "Min Date"]
    aligned = pd.merge(daily, first_day, on="Country/Region")
    aligned["Date"] = pd.to_datetime(aligned["Date"])
    aligned["Min Date"] = pd.to_datetime(aligned["Min Date"])
    aligned["N days"] = (aligned["Date"] - aligned["Min Date"]).dt.days
    # Same chart twice: once on the raw scale, once on the log scale.
    for y_col in ("Confirmed", "Log Confirmed"):
        fig = px.line(
            aligned,
            x="N days",
            y=y_col,
            color="Country/Region",
            title="N days from " + str(n) + " case",
            height=600,
        )
        fig.show()
2.3a Edited: parameterized version of the plotting function above (takes the dataframes as arguments instead of reading globals)
def graph_cty_exceeding_cases(n, full_grouped_df, full_table_df):
    """
    Graph the countries with more than n confirmed cases, aligning each
    country's curve to the first day it crossed the threshold.

    Parameters:
        n (int): The threshold minimum number of cases
        full_grouped_df (pandas.DataFrame): the data source used. From Kaggle's full_grouped data set
        full_table_df (pandas.DataFrame): the data source used. From Kaggle's covid_19_clean_complete data set

    Returns:
        None
    """
    # Countries that ever exceed the threshold.
    over_threshold = full_grouped_df.loc[
        full_grouped_df["Confirmed"] > n, "Country/Region"
    ].unique()
    # Daily country totals, restricted to above-threshold days.
    daily = (
        full_table_df[full_table_df["Country/Region"].isin(over_threshold)]
        .groupby(["Country/Region", "Date"])["Confirmed"]
        .sum()
        .reset_index()
    )
    daily = daily[daily["Confirmed"] > n]
    daily["Log Confirmed"] = np.log(1 + daily["Confirmed"])
    # First date each country crossed the threshold.
    first_day = daily.groupby("Country/Region")["Date"].min().reset_index()
    first_day.columns = ["Country/Region", "Min Date"]
    aligned = pd.merge(daily, first_day, on="Country/Region")
    aligned["Date"] = pd.to_datetime(aligned["Date"])
    aligned["Min Date"] = pd.to_datetime(aligned["Min Date"])
    aligned["N days"] = (aligned["Date"] - aligned["Min Date"]).dt.days
    # Same chart twice: raw scale, then log scale.
    for y_col in ("Confirmed", "Log Confirmed"):
        fig = px.line(
            aligned,
            x="N days",
            y=y_col,
            color="Country/Region",
            title="N days from " + str(n) + " case",
            height=600,
        )
        fig.show()
Q1: Do government actions matter?
def plot_gov_action(covid_outcome, gov_action, full_grouped_df):
    """
    Plot a covid outcome against the number of days relative to a government
    action, one color per country.

    Parameters:
        covid_outcome (str): The outcome from covid (column name to plot on y)
        gov_action (str): The government action to be analysed
        full_grouped_df (pandas.DataFrame): the data source used. From Kaggle's full_grouped data set

    Returns:
        None
    """
    # BUG FIX: the original `df[gov_action] != None` compares each element to
    # None, which is True for EVERY row (including NaN/NaT), so the filter was
    # a no-op. notna() actually drops countries with no recorded action date.
    has_action = full_grouped_df[full_grouped_df[gov_action].notna()]
    fig = px.scatter(
        has_action,
        x="day_rel_to_" + gov_action,
        y=covid_outcome,
        color="Country/Region",
        title="N days from " + gov_action,
        height=600,
    )
    # Clamp the y-axis so extreme percent changes don't flatten the plot.
    fig.update_layout(yaxis=dict(range=[0, 10]))
    fig.show()
# perhaps test theory with:
# gov_actions = ['publicplace', 'gatheringlimit']
plt.figure(figsize=(16, 9))
sns.set_style("dark")
# One scatter per candidate intervention column, shown sequentially.
for action_col in ("publicplace", "gatheringlimit"):
    sns.scatterplot(
        x="pct_change_Confirmed_per_1000",
        y=action_col,
        data=full_grouped,
        hue="WHO_Region",
    )
    plt.show()
full_grouped_1 = covid.copy()
# Forward-fill missing values (the original comment said "backfill"; ffill
# propagates forward). NOTE(review): not grouped by location first, so fills
# can leak across country boundaries — confirm intended.
full_grouped_1 = full_grouped_1.ffill()
# Create percent changes in covid19 outcomes, computed within each location.
covid_outcomes = ["new_cases", "new_deaths", "new_cases_per_million", "stringency_index"]
for covid_outcome in covid_outcomes:
    pct_col = "pct_change_" + covid_outcome
    full_grouped_1[pct_col] = full_grouped_1.groupby("location")[covid_outcome].pct_change()
    # Zero out ONLY the inf cells from division by zero; the original
    # `full_grouped_1[mask] = 0` assigned 0 to every column of those rows.
    full_grouped_1[pct_col] = full_grouped_1[pct_col].replace([np.inf, -np.inf], 0)
# Replace space in variable names with '_'
full_grouped_1.columns = full_grouped_1.columns.str.replace(" ", "_")
full_grouped_1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121539 entries, 0 to 121538
Data columns (total 69 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 iso_code 121539 non-null object
1 continent 121539 non-null object
2 location 121539 non-null object
3 date 121539 non-null object
4 total_cases 121539 non-null float64
5 new_cases 121539 non-null float64
6 new_cases_smoothed 121534 non-null float64
7 total_deaths 121516 non-null float64
8 new_deaths 121516 non-null float64
9 new_deaths_smoothed 121534 non-null float64
10 total_cases_per_million 121539 non-null float64
11 new_cases_per_million 121539 non-null float64
12 new_cases_smoothed_per_million 121534 non-null float64
13 total_deaths_per_million 121516 non-null float64
14 new_deaths_per_million 121516 non-null float64
15 new_deaths_smoothed_per_million 121534 non-null float64
16 reproduction_rate 121512 non-null float64
17 icu_patients 119776 non-null float64
18 icu_patients_per_million 119776 non-null float64
19 hosp_patients 115238 non-null float64
20 hosp_patients_per_million 115238 non-null float64
21 weekly_icu_admissions 98070 non-null float64
22 weekly_icu_admissions_per_million 98070 non-null float64
23 weekly_hosp_admissions 111768 non-null float64
24 weekly_hosp_admissions_per_million 111768 non-null float64
25 new_tests 120462 non-null float64
26 total_tests 120462 non-null float64
27 total_tests_per_thousand 120462 non-null float64
28 new_tests_per_thousand 120462 non-null float64
29 new_tests_smoothed 120455 non-null float64
30 new_tests_smoothed_per_thousand 120455 non-null float64
31 positive_rate 120444 non-null float64
32 tests_per_case 120444 non-null float64
33 tests_units 120462 non-null object
34 total_vaccinations 121250 non-null float64
35 people_vaccinated 121250 non-null float64
36 people_fully_vaccinated 121190 non-null float64
37 total_boosters 115932 non-null float64
38 new_vaccinations 121174 non-null float64
39 new_vaccinations_smoothed 121249 non-null float64
40 total_vaccinations_per_hundred 121250 non-null float64
41 people_vaccinated_per_hundred 121250 non-null float64
42 people_fully_vaccinated_per_hundred 121190 non-null float64
43 total_boosters_per_hundred 115932 non-null float64
44 new_vaccinations_smoothed_per_million 121249 non-null float64
45 stringency_index 121539 non-null float64
46 population 121539 non-null float64
47 population_density 121539 non-null float64
48 median_age 121539 non-null float64
49 aged_65_older 121539 non-null float64
50 aged_70_older 121539 non-null float64
51 gdp_per_capita 121539 non-null float64
52 extreme_poverty 120462 non-null float64
53 cardiovasc_death_rate 121539 non-null float64
54 diabetes_prevalence 121539 non-null float64
55 female_smokers 120462 non-null float64
56 male_smokers 120462 non-null float64
57 handwashing_facilities 121539 non-null float64
58 hospital_beds_per_thousand 121539 non-null float64
59 life_expectancy 121539 non-null float64
60 human_development_index 121539 non-null float64
61 excess_mortality_cumulative_absolute 120458 non-null float64
62 excess_mortality_cumulative 120458 non-null float64
63 excess_mortality 120458 non-null float64
64 excess_mortality_cumulative_per_million 120458 non-null float64
65 pct_change_new_cases 104879 non-null float64
66 pct_change_new_deaths 79463 non-null float64
67 pct_change_new_cases_per_million 86874 non-null float64
68 pct_change_stringency_index 102279 non-null float64
dtypes: float64(64), object(5)
memory usage: 64.0+ MB
def plot_gov_action(covid_outcome, gov_action, full_grouped_df):
    """
    Plot a covid outcome against a government-action column (OWID variant:
    colors by `location` and uses the action column directly on the x-axis).

    Parameters:
        covid_outcome (str): The outcome from covid (column name to plot on y)
        gov_action (str): The government action column to be analysed
        full_grouped_df (pandas.DataFrame): the data source used (OWID-derived frame)

    Returns:
        None
    """
    # BUG FIX: the original `df[gov_action] != None` compares each element to
    # None, which is True for EVERY row (including NaN), so the filter was a
    # no-op. notna() actually drops rows with no value for the action column.
    has_action = full_grouped_df[full_grouped_df[gov_action].notna()]
    fig = px.scatter(
        has_action,
        x=gov_action,
        y=covid_outcome,
        color="location",
        title=gov_action,
        height=600,
    )
    # Clamp the y-axis so extreme percent changes don't flatten the plot.
    fig.update_layout(yaxis=dict(range=[0, 10]))
    fig.show()
# Overview: stringency vs. percent change in new cases, colored by continent.
plt.figure(figsize=(16, 9))
sns.set_style("dark")
sns.scatterplot(
    x="stringency_index",
    y="pct_change_new_cases_per_million",
    data=full_grouped_1,
    hue="continent",
)
plt.show()
different_grouped = pd.read_csv("/work/corona-virus-report/full_grouped.csv")
full_grouped_1["location"].unique()
# Drill-down: one scatter layer per highlighted country.
plt.figure(figsize=(16, 9))
plt.grid()
for highlight in ("China", "United States", "United Kingdom", "India", "France"):
    country_rows = full_grouped_1[full_grouped_1["location"] == highlight]
    plt.scatter(y="new_cases_per_million", x="stringency_index", data=country_rows)
plt.legend(['China', "US", "UK", "India", "France"])
plt.xlabel('Stringency')
plt.ylabel('New cases per million (smoothed)')
Q2: How much do government interventions matter?
# Log-transform the outcome variables (log(x + 1) to keep zeros finite).
# NOTE(review): negative values (possible after pct_change/ffill upstream)
# yield NaN here — this is the "invalid value encountered in log"
# RuntimeWarning seen in the output.
full_grouped['Confirmed_per_1000'].describe()
full_grouped['log_Confirmed_per_1000'] = np.log(full_grouped['Confirmed_per_1000']+1)
full_grouped['log_Confirmed_per_1000'].describe()
full_grouped_1['log_new_cases_per_million'] = np.log(full_grouped_1['new_cases_per_million']+1)
full_grouped_1['log_new_cases_per_million'].describe()
#Plot pairplot with countries organized by continent
g = sns.pairplot(full_grouped_1[['log_new_cases_per_million', 'population_density', 'human_development_index','handwashing_facilities', 'median_age','continent']], hue='continent')
f, ax = plt.subplots(figsize=(10, 8))
# Pairwise correlations of all numeric columns.
corr = full_grouped_1.corr()
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Plot heatmap
sns.heatmap(corr, mask=mask, cmap=cmap, square=True, ax=ax)
# NOTE(review): the next two lines duplicate the log transform computed
# above verbatim; the recomputation is idempotent and could be removed.
full_grouped_1['log_new_cases_per_million'] = np.log(full_grouped_1['new_cases_per_million']+1)
full_grouped_1['log_new_cases_per_million'].describe()
# 'post_schools', 'post_gathering', 'post_nonessential', 'post_publicplace',
# OLS regression: log new cases per million on stringency plus demographic
# controls, over the full (all-continent) sample.
y = full_grouped_1["log_new_cases_per_million"]
regressors = [
    "stringency_index",
    "population_density",
    "human_development_index",
    "handwashing_facilities",
    "median_age",
]
X = sm.add_constant(full_grouped_1[regressors])
# missing='drop' discards rows with NaN in y or X before fitting.
ols_model = sm.OLS(y, X.astype(float), missing="drop")
result = ols_model.fit()
print(result.summary2())
Results: Ordinary least squares
=============================================================================
Model: OLS Adj. R-squared: 0.328
Dependent Variable: log_new_cases_per_million AIC: 488489.1558
Date: 2021-10-06 11:54 BIC: 488547.4014
No. Observations: 121491 Log-Likelihood: -2.4424e+05
Df Model: 5 F-statistic: 1.188e+04
Df Residuals: 121485 Prob (F-statistic): 0.00
R-squared: 0.328 Scale: 3.2637
------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -0.3309 0.0122 -27.0240 0.0000 -0.3549 -0.3069
stringency_index 0.0166 0.0003 61.2277 0.0000 0.0160 0.0171
population_density -0.0000 0.0000 -0.7746 0.4386 -0.0000 0.0000
human_development_index 0.7163 0.0618 11.5938 0.0000 0.5952 0.8374
handwashing_facilities 0.0012 0.0002 6.6285 0.0000 0.0009 0.0016
median_age 0.0504 0.0012 41.7447 0.0000 0.0480 0.0527
-----------------------------------------------------------------------------
Omnibus: 10617.366 Durbin-Watson: 0.448
Prob(Omnibus): 0.000 Jarque-Bera (JB): 3289.761
Skew: 0.014 Prob(JB): 0.000
Kurtosis: 2.194 Condition No.: 20325
=============================================================================
* The condition number is large (2e+04). This might indicate
strong multicollinearity or other numerical problems.
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/arraylike.py:358: RuntimeWarning:
invalid value encountered in log
full_grouped_1['continent'].unique()
# Same OLS specification, restricted to the Europe subsample.
data = full_grouped_1[full_grouped_1["continent"] == "Europe"]
regressors = [
    "stringency_index",
    "population_density",
    "human_development_index",
    "handwashing_facilities",
    "median_age",
]
y = data["log_new_cases_per_million"]
X = sm.add_constant(data[regressors])
# missing='drop' discards rows with NaN in y or X before fitting.
ols_model = sm.OLS(y, X.astype(float), missing="drop")
result = ols_model.fit()
print(result.summary2())
Results: Ordinary least squares
=============================================================================
Model: OLS Adj. R-squared: 0.104
Dependent Variable: log_new_cases_per_million AIC: 105582.8902
Date: 2021-10-06 12:13 BIC: 105631.7445
No. Observations: 25397 Log-Likelihood: -52785.
Df Model: 5 F-statistic: 589.6
Df Residuals: 25391 Prob (F-statistic): 0.00
R-squared: 0.104 Scale: 3.7404
------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 1.5248 0.1182 12.8963 0.0000 1.2930 1.7565
stringency_index 0.0280 0.0007 41.2054 0.0000 0.0267 0.0294
population_density -0.0000 0.0000 -7.4845 0.0000 -0.0000 -0.0000
human_development_index -2.4203 0.1851 -13.0764 0.0000 -2.7831 -2.0575
handwashing_facilities 0.0021 0.0004 5.4076 0.0000 0.0013 0.0028
median_age 0.0659 0.0027 24.4941 0.0000 0.0606 0.0711
-----------------------------------------------------------------------------
Omnibus: 3424.104 Durbin-Watson: 0.334
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1149.610
Skew: -0.283 Prob(JB): 0.000
Kurtosis: 2.124 Condition No.: 46027
=============================================================================
* The condition number is large (5e+04). This might indicate
strong multicollinearity or other numerical problems.
# Same OLS specification, restricted to the Asia subsample.
data = full_grouped_1[full_grouped_1["continent"] == "Asia"]
regressors = [
    "stringency_index",
    "population_density",
    "human_development_index",
    "handwashing_facilities",
    "median_age",
]
y = data["log_new_cases_per_million"]
X = sm.add_constant(data[regressors])
# missing='drop' discards rows with NaN in y or X before fitting.
ols_model = sm.OLS(y, X.astype(float), missing="drop")
result = ols_model.fit()
print(result.summary2())
Results: Ordinary least squares
=============================================================================
Model: OLS Adj. R-squared: 0.187
Dependent Variable: log_new_cases_per_million AIC: 104818.5210
Date: 2021-10-06 12:14 BIC: 104867.5366
No. Observations: 26089 Log-Likelihood: -52403.
Df Model: 5 F-statistic: 1200.
Df Residuals: 26083 Prob (F-statistic): 0.00
R-squared: 0.187 Scale: 3.2531
------------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -1.8546 0.0785 -23.6399 0.0000 -2.0084 -1.7008
stringency_index 0.0217 0.0006 37.8904 0.0000 0.0206 0.0228
population_density 0.0000 0.0000 3.5461 0.0004 0.0000 0.0000
human_development_index 7.9614 0.1436 55.4449 0.0000 7.6800 8.2429
handwashing_facilities -0.0001 0.0004 -0.3309 0.7407 -0.0010 0.0007
median_age -0.0864 0.0023 -37.1637 0.0000 -0.0910 -0.0819
-----------------------------------------------------------------------------
Omnibus: 2836.588 Durbin-Watson: 0.117
Prob(Omnibus): 0.000 Jarque-Bera (JB): 791.905
Skew: -0.029 Prob(JB): 0.000
Kurtosis: 2.148 Condition No.: 34229
=============================================================================
* The condition number is large (3e+04). This might indicate
strong multicollinearity or other numerical problems.
# Same OLS specification, restricted to the North America subsample.
data = full_grouped_1[full_grouped_1["continent"] == "North America"]
regressors = [
    "stringency_index",
    "population_density",
    "human_development_index",
    "handwashing_facilities",
    "median_age",
]
y = data["log_new_cases_per_million"]
X = sm.add_constant(data[regressors])
# missing='drop' discards rows with NaN in y or X before fitting.
ols_model = sm.OLS(y, X.astype(float), missing="drop")
result = ols_model.fit()
print(result.summary2())
Results: Ordinary least squares
============================================================================
Model: OLS Adj. R-squared: 0.198
Dependent Variable: log_new_cases_per_million AIC: 58726.4412
Date: 2021-10-06 12:14 BIC: 58771.6101
No. Observations: 13741 Log-Likelihood: -29357.
Df Model: 5 F-statistic: 680.8
Df Residuals: 13735 Prob (F-statistic): 0.00
R-squared: 0.199 Scale: 4.2017
----------------------------------------------------------------------------
Coef. Std.Err. t P>|t| [0.025 0.975]
----------------------------------------------------------------------------
const -3.2012 0.1849 -17.3122 0.0000 -3.5636 -2.8387
stringency_index 0.0281 0.0009 32.7124 0.0000 0.0264 0.0298
population_density -0.0007 0.0001 -9.3976 0.0000 -0.0009 -0.0006
human_development_index 2.3135 0.3330 6.9465 0.0000 1.6607 2.9663
handwashing_facilities 0.0044 0.0006 7.1896 0.0000 0.0032 0.0056
median_age 0.0795 0.0043 18.4455 0.0000 0.0710 0.0879
----------------------------------------------------------------------------
Omnibus: 3298.067 Durbin-Watson: 0.260
Prob(Omnibus): 0.000 Jarque-Bera (JB): 605.798
Skew: -0.104 Prob(JB): 0.000
Kurtosis: 1.992 Condition No.: 7262
============================================================================
* The condition number is large (7e+03). This might indicate
strong multicollinearity or other numerical problems.