Task
# Import modules for API calls
import requests
import io
import pandas as pd
import json
from datetime import datetime
# Import module for plotting
import seaborn as sns
## OWID COVID-19 Data (Taken From: https://github.com/owid/covid-19-data/tree/master/public/data)
url = 'https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv'
download = requests.get(url).content
covid = pd.read_csv(io.StringIO(download.decode('utf-8')), parse_dates=['date'])
covid.tail()
!pip install statsmodels
# Data Management
from dateutil import relativedelta as rd
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import plotly as py
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
pyo.init_notebook_mode()
import seaborn as sns
# Regression
import statsmodels.api as sm
from statsmodels.formula.api import ols
import statsmodels.graphics.api as smg
covid.columns.values
covid.tail()
full_grouped = covid.copy()
# Forward-fill missing values
full_grouped = full_grouped.ffill()
# Create percent changes in covid19 outcomes
#covid_outcomes = ["confirmed", "deaths", "recovered", "active", "confirmed per 1000",]
covid_outcomes = ["new_cases", "new_deaths", "new_cases_per_million","stringency_index"]
for covid_outcome in covid_outcomes:
    full_grouped["pct_change_" + covid_outcome] = full_grouped.groupby(
        ["location"]
    )[covid_outcome].pct_change()
    # Replace infinite percent changes (division by zero) with zero in the affected column only
    full_grouped.loc[
        full_grouped["pct_change_" + covid_outcome] == np.inf,
        "pct_change_" + covid_outcome,
    ] = 0
# Replace space in variable names with '_'
full_grouped.columns = full_grouped.columns.str.replace(" ", "_")
full_grouped.tail()
full_grouped.info()
# Read and rename column country
# cty_info = pd.read_csv('../input/countryinfo/covid19countryinfo.csv').rename(columns={'country':'Country'})
# cty_info = pd.read_csv(local_path + 'covid19countryinfo.csv').rename(columns={'country':'Country'})
cty_info = pd.read_csv('/work/countryinfo/covid19countryinfo.csv').rename(columns={'country':'Country'})
# Keep only aggregate country-level observations:
# the region column is populated only for region-level rows
cty_info = cty_info[cty_info.region.isnull()]
# Strip thousands-separator commas and convert the string fields to floats
cty_info['healthexp'] = cty_info[~cty_info['healthexp'].isnull()]['healthexp'].str.replace(',', '').astype('float')
cty_info['gdp2019'] = cty_info[~cty_info['gdp2019'].isnull()]['gdp2019'].str.replace(',', '').astype('float')
# Convert to date objects with to_datetime method
gov_actions = ['quarantine', 'schools', 'gathering', 'nonessential', 'publicplace']
for gov_action in gov_actions:
cty_info[gov_action] = pd.to_datetime(cty_info[gov_action], format = '%m/%d/%Y')
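Intervention dates are sparsely recorded; a quick coverage check (a sketch using the gov_actions list above):
# Count countries with each intervention date recorded (sketch)
print(cty_info[gov_actions].notnull().sum())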
# Filter columns of interest
# Note: feel free to explore other variables or datasets
cty_info = cty_info[['Country', 'avghumidity', 'avgtemp', 'fertility', 'medianage', 'urbanpop', 'quarantine', 'schools', \
'publicplace', 'gatheringlimit', 'gathering', 'nonessential', 'hospibed', 'smokers', \
'sex0', 'sex14', 'sex25', 'sex54', 'sex64', 'sex65plus', 'sexratio', 'lung', 'femalelung', \
'malelung', 'gdp2019', 'healthexp', 'healthperpop']]
# cty_info.describe()
cty_info.info()
# Worldometer data
# ================
# worldometer_data = pd.read_csv('../input/corona-virus-report/worldometer_data.csv')
# worldometer_data = pd.read_csv(local_path + 'worldometer_data.csv')
worldometer_data = pd.read_csv("/work/corona-virus-report/worldometer_data.csv")
# Replace empty strings with NaN, then fill missing values with 0
worldometer_data = worldometer_data.replace("", np.nan).fillna(0)
# Compute case positivity and case fatality, rounded to two decimal places
# Note: division by zero occurs when TotalTests or TotalCases is zero
worldometer_data["Case Positivity"] = round(
worldometer_data["TotalCases"] / worldometer_data["TotalTests"], 2
)
worldometer_data["Case Fatality"] = round(
worldometer_data["TotalDeaths"] / worldometer_data["TotalCases"], 2
)
# Resolve division by zero by replacing infinite values with zero in the affected columns only
worldometer_data.loc[worldometer_data["Case Positivity"] == np.inf, "Case Positivity"] = 0
worldometer_data.loc[worldometer_data["Case Fatality"] == np.inf, "Case Fatality"] = 0
# Place case positivity into three bins
worldometer_data["Case Positivity Bin"] = pd.qcut(
worldometer_data["Case Positivity"], q=3, labels=["low", "medium", "high"]
)
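Since qcut builds equal-frequency bins, each bin should hold roughly a third of the countries; a quick check (sketch):
# Tertile bins from qcut should be roughly balanced (sketch)
print(worldometer_data["Case Positivity Bin"].value_counts())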
# Population Structure
# worldometer_pop_struc = pd.read_csv('../input/covid19-worldometer-snapshots-since-april-18/population_structure_by_age_per_contry.csv')
# worldometer_pop_struc = pd.read_csv(local_path + 'population_structure_by_age_per_contry.csv')
worldometer_pop_struc = pd.read_csv("/work/COVID-19 worldometer daily snapshots/population_structure_by_age_per_contry.csv")
# Replace missing values with zeros
worldometer_pop_struc = worldometer_pop_struc.fillna(0)
# Merge datasets by common key country
worldometer_data = worldometer_data.merge(
worldometer_pop_struc, how="inner", left_on="Country/Region", right_on="Country"
)
worldometer_data = worldometer_data[worldometer_data["Country/Region"] != 0]
# Country information
worldometer_data = worldometer_data.merge(cty_info, how="left", on="Country")
# Replace space in variable names with '_'
worldometer_data.columns = worldometer_data.columns.str.replace(" ", "_")
# Full data
# =========
# full_table = pd.read_csv('../input/corona-virus-report/covid_19_clean_complete.csv')
# full_table = pd.read_csv(local_path + 'covid_19_clean_complete.csv')
full_table = pd.read_csv("/work/corona-virus-report/covid_19_clean_complete.csv")
full_table["Date"] = pd.to_datetime(full_table["Date"])
# Grouped by day, country
# =======================
# full_grouped = pd.read_csv('../input/corona-virus-report/full_grouped.csv')
# full_grouped = pd.read_csv(local_path + 'full_grouped.csv')
full_grouped = pd.read_csv("/work/corona-virus-report/full_grouped.csv")
full_grouped["Date"] = pd.to_datetime(full_grouped["Date"])
# full_grouped.loc[full_grouped['Country/Region'] == 'US', 'Country/Region'] = 'USA'
full_grouped.head()
# Correct country names in worldometer to make them consistent with dataframe full_grouped column Country/Region before merging
worldometer_data["Country/Region"].replace(
{
"USA": "US",
"UAE": "United Arab Emirates",
"S. Korea": "South Korea",
"UK": "United Kingdom",
},
inplace=True,
)
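Merging on country names is fragile, so it helps to list names in full_grouped that still have no match in worldometer_data (a sketch; further replacements may be needed):
# Country names still unmatched after the corrections (sketch)
unmatched = set(full_grouped["Country/Region"]) - set(worldometer_data["Country/Region"])
print(sorted(unmatched))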
# Merge in population and country-level data
full_grouped = full_grouped.merge(
worldometer_data[["Country/Region", "Population"]], how="left", on="Country/Region"
)
full_grouped = full_grouped.merge(
cty_info, how="left", left_on="Country/Region", right_on="Country"
)
full_grouped["Confirmed per 1000"] = (
full_grouped["Confirmed"] / full_grouped["Population"] * 1000
)
# Forward-fill missing values
full_grouped = full_grouped.ffill()
# Create post-intervention indicators
gov_actions = ["quarantine", "schools", "gathering", "nonessential", "publicplace"]
for gov_action in gov_actions:
    full_grouped["post_" + gov_action] = (
        full_grouped["Date"] >= full_grouped[gov_action]
    )
    full_grouped["day_rel_to_" + gov_action] = (
        full_grouped["Date"] - full_grouped[gov_action]
    ).dt.days
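A spot-check of the new indicators for a single country (a sketch; assumes Italy appears in the merged data with a quarantine date):
# Spot-check post-intervention indicators for one country (sketch)
cols = ["Date", "post_quarantine", "day_rel_to_quarantine"]
print(full_grouped.loc[full_grouped["Country/Region"] == "Italy", cols].tail())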
# Create percent changes in covid19 outcomes
covid_outcomes = ["Confirmed", "Deaths", "Recovered", "Active", "Confirmed per 1000"]
for covid_outcome in covid_outcomes:
    full_grouped["pct_change_" + covid_outcome] = full_grouped.groupby(
        ["Country/Region"]
    )[covid_outcome].pct_change()
    # Replace infinite percent changes (division by zero) with zero in the affected column only
    full_grouped.loc[
        full_grouped["pct_change_" + covid_outcome] == np.inf,
        "pct_change_" + covid_outcome,
    ] = 0
# Replace space in variable names with '_'
full_grouped.columns = full_grouped.columns.str.replace(" ", "_")
full_grouped.info()
# Visualize the missingness issue in the dataset
sns.heatmap(cty_info.isnull(), cbar=False)
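The heatmap shows the pattern; a numeric summary (sketch) ranks columns by their share of missing values:
# Share of missing values per column, most-missing first (sketch)
print(cty_info.isnull().mean().sort_values(ascending=False).head(10))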
Define Function
# Create a function to plot (reusing from the previous BootCamp)
def gt_n(n):
countries = full_grouped[full_grouped["Confirmed"] > n]["Country/Region"].unique()
temp = full_table[full_table["Country/Region"].isin(countries)]
temp = temp.groupby(["Country/Region", "Date"])["Confirmed"].sum().reset_index()
temp = temp[temp["Confirmed"] > n]
temp["Log Confirmed"] = np.log(1 + temp["Confirmed"])
# print(temp.head())
min_date = temp.groupby("Country/Region")["Date"].min().reset_index()
min_date.columns = ["Country/Region", "Min Date"]
# print(min_date.head())
from_nth_case = pd.merge(temp, min_date, on="Country/Region")
from_nth_case["Date"] = pd.to_datetime(from_nth_case["Date"])
from_nth_case["Min Date"] = pd.to_datetime(from_nth_case["Min Date"])
from_nth_case["N days"] = (
from_nth_case["Date"] - from_nth_case["Min Date"]
).dt.days
# print(from_nth_case.head())
fig = px.line(
from_nth_case,
x="N days",
y="Confirmed",
color="Country/Region",
title="N days from " + str(n) + " case",
height=600,
)
fig.show()
fig = px.line(
from_nth_case,
x="N days",
y="Log Confirmed",
color="Country/Region",
title="N days from " + str(n) + " case",
height=600,
)
fig.show()
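For example, to plot every country that has exceeded 100,000 confirmed cases (threshold chosen for illustration):
# Example call
gt_n(100000)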
2.3a Edited
def graph_cty_exceeding_cases(n, full_grouped_df, full_table_df):
"""
Function graphs the countries with more than n confirmed cases
Parameters:
n (int): The threshold minimum number of cases
fully_grouped_df (pandas.DataFrame): the data source used. From Kaggle's full_grouped data set
full_data_df (pandas.DataFrame): the data source used. From Kaggle's covid_19_clean_complete data set
Returns:
null
"""
# Data Preprocessing
countries = full_grouped_df[full_grouped_df["Confirmed"] > n][
"Country/Region"
].unique()
temp = full_table_df[full_table_df["Country/Region"].isin(countries)]
temp = temp.groupby(["Country/Region", "Date"])["Confirmed"].sum().reset_index()
temp = temp[temp["Confirmed"] > n]
temp["Log Confirmed"] = np.log(1 + temp["Confirmed"])
# print(temp.head())
min_date = temp.groupby("Country/Region")["Date"].min().reset_index()
min_date.columns = ["Country/Region", "Min Date"]
# print(min_date.head())
from_nth_case = pd.merge(temp, min_date, on="Country/Region")
from_nth_case["Date"] = pd.to_datetime(from_nth_case["Date"])
from_nth_case["Min Date"] = pd.to_datetime(from_nth_case["Min Date"])
from_nth_case["N days"] = (
from_nth_case["Date"] - from_nth_case["Min Date"]
).dt.days
# print(from_nth_case.head())
fig = px.line(
from_nth_case,
x="N days",
y="Confirmed",
color="Country/Region",
title="N days from " + str(n) + " case",
height=600,
)
fig.show()
fig = px.line(
from_nth_case,
x="N days",
y="Log Confirmed",
color="Country/Region",
title="N days from " + str(n) + " case",
height=600,
)
fig.show()
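Usage mirrors gt_n above, with the data frames now passed explicitly:
# Example call with explicit data sources (threshold chosen for illustration)
graph_cty_exceeding_cases(100000, full_grouped, full_table)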
Q1: Do government actions matter?
def plot_gov_action(covid_outcome, gov_action, full_grouped_df):
"""
Function plots the government action and outcome
Parameters:
covid_outcome (str): The outcome from covid
gov_action (str): The government action to be analysed
full_grouped_df (pandas.DataFrame): the data source used. From Kaggle's full_grouped data set
Returns:
null
"""
fig = px.scatter(
full_grouped_df[full_grouped_df[gov_action] != None],
x="day_rel_to_" + gov_action,
y=covid_outcome,
color="Country/Region",
title="N days from " + gov_action,
height=600,
)
fig.update_layout(yaxis=dict(range=[0, 10]))
fig.show()
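An example call (a sketch; both columns come from the feature engineering above):
# Example: percent change in confirmed cases around quarantine start (sketch)
plot_gov_action("pct_change_Confirmed_per_1000", "quarantine", full_grouped)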
# perhaps test theory with:
# gov_actions = ['publicplace', 'gatheringlimit']
plt.figure(figsize=(16,9))
sns.set_style("dark")
sns.scatterplot(x = 'pct_change_Confirmed_per_1000', y = 'publicplace', data = full_grouped, hue = "WHO_Region")
plt.show()
sns.scatterplot(x = 'pct_change_Confirmed_per_1000', y = 'gatheringlimit', data = full_grouped, hue = "WHO_Region")
plt.show()
full_grouped_1 = covid.copy()
# Forward-fill missing values
full_grouped_1 = full_grouped_1.ffill()
# Create percent changes in covid19 outcomes
#covid_outcomes = ["confirmed", "deaths", "recovered", "active", "confirmed per 1000",]
covid_outcomes = ["new_cases", "new_deaths", "new_cases_per_million","stringency_index"]
for covid_outcome in covid_outcomes:
    full_grouped_1["pct_change_" + covid_outcome] = full_grouped_1.groupby(
        ["location"]
    )[covid_outcome].pct_change()
    # Replace infinite percent changes (division by zero) with zero in the affected column only
    full_grouped_1.loc[
        full_grouped_1["pct_change_" + covid_outcome] == np.inf,
        "pct_change_" + covid_outcome,
    ] = 0
# Replace space in variable names with '_'
full_grouped_1.columns = full_grouped_1.columns.str.replace(" ", "_")
full_grouped_1.info()
def plot_gov_action(covid_outcome, gov_action, full_grouped_df):
"""
Function plots the government action and outcome
Parameters:
covid_outcome (str): The outcome from covid
gov_action (str): The government action to be analysed
full_grouped_df (pandas.DataFrame): the data source used. From Kaggle's full_grouped data set
Returns:
null
"""
fig = px.scatter(
full_grouped_df[full_grouped_df[gov_action] != None],
x= gov_action,
y= covid_outcome,
color= "location",
title= gov_action,
height=600,
)
fig.update_layout(yaxis=dict(range=[0, 10]))
fig.show()
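An example call against the OWID frame (a sketch using columns created above):
# Example: stringency index vs. percent change in new cases per million (sketch)
plot_gov_action("pct_change_new_cases_per_million", "stringency_index", full_grouped_1)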
plt.figure(figsize=(16,9))
sns.set_style("dark")
sns.scatterplot(x = 'stringency_index', y = 'pct_change_new_cases_per_million', data = full_grouped_1, hue = "continent")
plt.show()
different_grouped = pd.read_csv("/work/corona-virus-report/full_grouped.csv")
full_grouped_1['location'].unique()
plt.figure(figsize=(16,9))
plt.grid()
plt.scatter(y = 'new_cases_per_million', x = 'stringency_index', data = full_grouped_1[full_grouped_1['location'] == "China"])
plt.scatter(y = 'new_cases_per_million', x = 'stringency_index', data = full_grouped_1[full_grouped_1['location'] == "United States"])
plt.scatter(y = 'new_cases_per_million', x = 'stringency_index', data = full_grouped_1[full_grouped_1['location'] == "United Kingdom"])
plt.scatter(y = 'new_cases_per_million', x = 'stringency_index', data = full_grouped_1[full_grouped_1['location'] == "India"])
plt.scatter(y = 'new_cases_per_million', x = 'stringency_index', data = full_grouped_1[full_grouped_1['location'] == "France"])
plt.legend(['China',"US","UK","India","France"])
plt.xlabel('Stringency')
plt.ylabel('New cases per million')
Q2: How much do government interventions matter?
full_grouped['Confirmed_per_1000'].describe()
full_grouped['log_Confirmed_per_1000'] = np.log(full_grouped['Confirmed_per_1000']+1)
full_grouped['log_Confirmed_per_1000'].describe()
full_grouped_1['log_new_cases_per_million'] = np.log(full_grouped_1['new_cases_per_million']+1)
full_grouped_1['log_new_cases_per_million'].describe()
#Plot pairplot with countries organized by continent
g = sns.pairplot(full_grouped_1[['log_new_cases_per_million', 'population_density', 'human_development_index','handwashing_facilities', 'median_age','continent']], hue='continent')
f, ax = plt.subplots(figsize=(10, 8))
corr = full_grouped_1.corr(numeric_only=True)  # restrict to numeric columns (required in newer pandas)
# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)
# Plot heatmap
sns.heatmap(corr, mask=mask, cmap=cmap, square=True, ax=ax)
# 'post_schools', 'post_gathering', 'post_nonessential', 'post_publicplace',
# OLS regression
y = full_grouped_1['log_new_cases_per_million']
X = full_grouped_1[['stringency_index','population_density', 'human_development_index','handwashing_facilities', 'median_age']]
X = sm.add_constant(X)
ols_model = sm.OLS(y, X.astype(float), missing='drop')
result = ols_model.fit()
print(result.summary2())
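Daily observations within a country are serially correlated, so conventional OLS standard errors are likely too small. One robustness option (a sketch, not part of the original analysis) is to cluster standard errors by location:
# Robustness sketch: cluster standard errors by country
# Drop missing rows first so the group labels align with the design matrix
reg_cols = ['log_new_cases_per_million', 'stringency_index', 'population_density',
            'human_development_index', 'handwashing_facilities', 'median_age']
reg_df = full_grouped_1[reg_cols + ['location']].dropna()
y_c = reg_df['log_new_cases_per_million']
X_c = sm.add_constant(reg_df[reg_cols[1:]])
result_clustered = sm.OLS(y_c, X_c.astype(float)).fit(
    cov_type='cluster', cov_kwds={'groups': reg_df['location']}
)
print(result_clustered.summary2())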
full_grouped_1['continent'].unique()
data = full_grouped_1[full_grouped_1['continent'] == "Europe"]
# OLS regression
y = data['log_new_cases_per_million']
X = data[['stringency_index','population_density', 'human_development_index','handwashing_facilities', 'median_age']]
X = sm.add_constant(X)
ols_model = sm.OLS(y, X.astype(float), missing='drop')
result = ols_model.fit()
print(result.summary2())
data = full_grouped_1[full_grouped_1['continent'] == "Asia"]
# OLS regression
y = data['log_new_cases_per_million']
X = data[['stringency_index','population_density', 'human_development_index','handwashing_facilities', 'median_age']]
X = sm.add_constant(X)
ols_model = sm.OLS(y, X.astype(float), missing='drop')
result = ols_model.fit()
print(result.summary2())
data = full_grouped_1[full_grouped_1['continent'] == "North America"]
# OLS regression
y = data['log_new_cases_per_million']
X = data[['stringency_index','population_density', 'human_development_index','handwashing_facilities', 'median_age']]
X = sm.add_constant(X)
ols_model = sm.OLS(y, X.astype(float), missing='drop')
result = ols_model.fit()
print(result.summary2())
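The three continent-level regressions repeat the same code; a small helper (a sketch using only the columns already defined above) removes the duplication:
# Sketch: wrap the repeated per-continent regression in a helper
def run_continent_ols(df, continent):
    subset = df[df['continent'] == continent]
    y = subset['log_new_cases_per_million']
    X = sm.add_constant(subset[['stringency_index', 'population_density',
                                'human_development_index',
                                'handwashing_facilities', 'median_age']])
    return sm.OLS(y, X.astype(float), missing='drop').fit()

for continent in ['Europe', 'Asia', 'North America']:
    print(run_continent_ols(full_grouped_1, continent).summary2())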