Untitled Python Project

#!pip install statsmodels==0.13.0

#!pip install nb_black==1.0.7

# Data Management from dateutil import relativedelta as rd import pandas as pd import numpy as np # Visualization import matplotlib.pyplot as plt import plotly as py import plotly.express as px import plotly.graph_objects as go import plotly.offline as pyo pyo.init_notebook_mode() import seaborn as sns # Regression import statsmodels.api as sm from statsmodels.formula.api import ols import statsmodels.graphics.api as smg # auto code formatting %load_ext nb_black

# Read and rename column country cty_info = pd.read_csv('country/covid19countryinfo.csv').rename(columns={'country':'Country'}) # Filter observations with aggregate country-level information # The column region for region-level observations is populated cty_info = cty_info[cty_info.region.isnull()] # Convert string data type to floating data type # Remove comma from the fields cty_info['healthexp'] = cty_info[~cty_info['healthexp'].isnull()]['healthexp'].str.replace(',','').astype('float') cty_info['gdp2019'] = cty_info[~cty_info['gdp2019'].isnull()]['gdp2019'].str.replace(',','').astype('float') # Convert to date objects with to_datetime method gov_actions = ['quarantine', 'schools', 'gathering', 'nonessential', 'publicplace'] for gov_action in gov_actions: cty_info[gov_action] = pd.to_datetime(cty_info[gov_action], format = '%m/%d/%Y') # Filter columns of interest # Note: feel free to explore other variables or datasets cty_info = cty_info[['Country', 'avghumidity', 'avgtemp', 'fertility', 'medianage', 'urbanpop', 'quarantine', 'schools', \ 'publicplace', 'gatheringlimit', 'gathering', 'nonessential', 'hospibed', 'smokers', \ 'sex0', 'sex14', 'sex25', 'sex54', 'sex64', 'sex65plus', 'sexratio', 'lung', 'femalelung', \ 'malelung', 'gdp2019', 'healthexp', 'healthperpop']] # cty_info.describe() cty_info.info() #cty_info.head(20)

# Worldometer data # ================ worldometer_data = pd.read_csv("corona-virus-report/worldometer_data.csv") # Replace missing values '' with NAN and then 0 worldometer_data = worldometer_data.replace("", np.nan).fillna(0) # Transform variables and round them up to the two decimal points # Note that there are instances of division by zero issue when there are either zero total tests or total cases worldometer_data["Case Positivity"] = round( worldometer_data["TotalCases"] / worldometer_data["TotalTests"], 2 ) worldometer_data["Case Fatality"] = round( worldometer_data["TotalDeaths"] / worldometer_data["TotalCases"], 2 ) # Resolve the division by zero issue by replacing infinity value with zero worldometer_data[worldometer_data["Case Positivity"] == np.inf] = 0 worldometer_data[worldometer_data["Case Fatality"] == np.inf] = 0 # Place case positivity into three bins worldometer_data["Case Positivity Bin"] = pd.qcut( worldometer_data["Case Positivity"], q=3, labels=["low", "medium", "high"] ) # Population Structure worldometer_pop_struc = pd.read_csv("corona-virus-report/population_structure_by_age_per_contry.csv") # Replace missing values with zeros worldometer_pop_struc = worldometer_pop_struc.fillna(0) # Merge datasets by common key country worldometer_data = worldometer_data.merge( worldometer_pop_struc, how="inner", left_on="Country/Region", right_on="Country" ) worldometer_data = worldometer_data[worldometer_data["Country/Region"] != 0] # Country information worldometer_data = worldometer_data.merge(cty_info, how="left", on="Country") # Replace space in variable names with '_' worldometer_data.columns = worldometer_data.columns.str.replace(" ", "_") worldometer_data.describe() worldometer_data.info()

# Full data # ========= full_table = pd.read_csv("corona-virus-report/full_grouped.csv") full_table["Date"] = pd.to_datetime(full_table["Date"]) # Examine DataFrame (object type, shape, columns, dtypes) full_table.info() # type(full_table) # full_table.shape # full_table.columns # full_table.dtypes # Deep dive into the DataFrame full_table.head()

# Grouped by day, country # ======================= full_grouped = pd.read_csv("corona-virus-report/full_grouped.csv") full_grouped["Date"] = pd.to_datetime(full_grouped["Date"]) # full_grouped.loc[full_grouped['Country/Region'] == 'US', 'Country/Region'] = 'USA' full_grouped.head() # Correct country names in worldometer to make them consistent with dataframe full_grouped column Country/Region before merging worldometer_data["Country/Region"].replace( { "USA": "US", "UAE": "United Arab Emirates", "S. Korea": "South Korea", "UK": "United Kingdom", }, inplace=True, ) # Draw population and country-level data full_grouped = full_grouped.merge( worldometer_data[["Country/Region", "Population"]], how="left", on="Country/Region" ) full_grouped = full_grouped.merge( cty_info, how="left", left_on="Country/Region", right_on="Country" ) full_grouped["Confirmed per 1000"] = ( full_grouped["Confirmed"] / full_grouped["Population"] * 1000 ) # Backfill data full_grouped = full_grouped.fillna(method="ffill") # Create post-invention indicators gov_actions = ["quarantine", "schools", "gathering", "nonessential", "publicplace"] for gov_action in gov_actions: full_grouped["post_" + gov_action] = ( full_grouped["Date"] >= full_grouped[gov_action] ) full_grouped["day_rel_to_" + gov_action] = ( full_grouped["Date"] - full_grouped[gov_action] ).dt.days # Create percent changes in covid19 outcomes covid_outcomes = ["Confirmed", "Deaths", "Recovered", "Active", "Confirmed per 1000"] for covid_outcome in covid_outcomes: full_grouped["pct_change_" + covid_outcome] = full_grouped.groupby( ["Country/Region"] )[covid_outcome].pct_change() full_grouped[full_grouped["pct_change_" + covid_outcome] == np.inf] = 0 # Replace space in variable names with '_' full_grouped.columns = full_grouped.columns.str.replace(" ", "_") full_grouped.info() # full_grouped.tail(20) # print(full_grouped.iloc[0,0]) # full_grouped[full_grouped['quarantine'] != None]['Country/Region'].unique() # full_grouped[full_grouped['Country/Region'] == 'Germany'][['quarantine','day_rel_to_quarantine']] # list(full_grouped.columns.values) # full_grouped.describe()

# Visualize the missingness isue in the dataset sns.heatmap(cty_info.isnull(), cbar=False)

# Create a function to plot (reusing from the previous BootCamp) def gt_n(n): countries = full_grouped[full_grouped["Confirmed"] > n]["Country/Region"].unique() temp = full_table[full_table["Country/Region"].isin(countries)] temp = temp.groupby(["Country/Region", "Date"])["Confirmed"].sum().reset_index() temp = temp[temp["Confirmed"] > n] temp["Log Confirmed"] = np.log(1 + temp["Confirmed"]) # print(temp.head()) min_date = temp.groupby("Country/Region")["Date"].min().reset_index() min_date.columns = ["Country/Region", "Min Date"] # print(min_date.head()) from_nth_case = pd.merge(temp, min_date, on="Country/Region") from_nth_case["Date"] = pd.to_datetime(from_nth_case["Date"]) from_nth_case["Min Date"] = pd.to_datetime(from_nth_case["Min Date"]) from_nth_case["N days"] = ( from_nth_case["Date"] - from_nth_case["Min Date"] ).dt.days # print(from_nth_case.head()) fig = px.line( from_nth_case, x="N days", y="Confirmed", color="Country/Region", title="N days from " + str(n) + " case", height=600, ) fig.show() fig = px.line( from_nth_case, x="N days", y="Log Confirmed", color="Country/Region", title="N days from " + str(n) + " case", height=600, ) fig.show()

def graph_cty_exceeding_cases(n, full_grouped_df, full_table_df): """ Function graphs the countries with more than n confirmed cases Parameters: n (int): The threshold minimum number of cases fully_grouped_df (pandas.DataFrame): the data source used. From Kaggle's full_grouped data set full_data_df (pandas.DataFrame): the data source used. From Kaggle's covid_19_clean_complete data set Returns: null """ # Data Preprocessing countries = full_grouped_df[full_grouped_df["Confirmed"] > n][ "Country/Region" ].unique() temp = full_table_df[full_table_df["Country/Region"].isin(countries)] temp = temp.groupby(["Country/Region", "Date"])["Confirmed"].sum().reset_index() temp = temp[temp["Confirmed"] > n] temp["Log Confirmed"] = np.log(1 + temp["Confirmed"]) # print(temp.head()) min_date = temp.groupby("Country/Region")["Date"].min().reset_index() min_date.columns = ["Country/Region", "Min Date"] # print(min_date.head()) from_nth_case = pd.merge(temp, min_date, on="Country/Region") from_nth_case["Date"] = pd.to_datetime(from_nth_case["Date"]) from_nth_case["Min Date"] = pd.to_datetime(from_nth_case["Min Date"]) from_nth_case["N days"] = ( from_nth_case["Date"] - from_nth_case["Min Date"] ).dt.days # print(from_nth_case.head()) fig = px.line( from_nth_case, x="N days", y="Confirmed", color="Country/Region", title="N days from " + str(n) + " case", height=600, ) fig.show() fig = px.line( from_nth_case, x="N days", y="Log Confirmed", color="Country/Region", title="N days from " + str(n) + " case", height=600, ) fig.show()

full_grouped.head()

Stringency_index = pd.read_csv("covid-stringency-index.csv") Stringency_index["Date"] = pd.to_datetime(Stringency_index["Day"]) df = Stringency_index.sort_values(by=['Date','Entity']) df.reset_index(inplace=True) df["Entity"].replace({"United States": "US",},inplace=True) df["Date"]=df["Date"].astype(object) df.head()

n_full_grouped = full_grouped.merge( df[["Date", "Entity", "stringency_index"]], how="inner", left_on=["Date", 'Country/Region'], right_on=["Date", "Entity"])

n_full_grouped.tail()

def plot_gov_action(covid_outcome, gov_action, full_grouped_df): """ Function plots the government action and outcome Parameters: covid_outcome (str): The outcome from covid gov_action (str): The government action to be analysed full_grouped_df (pandas.DataFrame): the data source used. From Kaggle's full_grouped data set Returns: null """ fig = px.scatter( full_grouped_df[full_grouped_df[gov_action] != None], x="day_rel_to_" + gov_action, y=covid_outcome, color="Country/Region", title="N days from " + gov_action, height=600, ) fig.update_layout(yaxis=dict(range=[0, 10])) fig.show()

gov_actions = ['quarantine', 'schools', 'gathering', 'nonessential', 'publicplace'] for i in gov_actions: plot_gov_action('pct_change_Confirmed_per_1000', i, n_full_grouped)

fig = px.scatter( n_full_grouped[n_full_grouped['stringency_index'] != None], x=n_full_grouped['stringency_index'], y='pct_change_Confirmed_per_1000', color="Country/Region", title='stringency_index', height=600, ) fig.update_layout(yaxis=dict(range=[0, 10])) fig.show()

n_full_grouped['Confirmed_per_1000'].describe()

n_full_grouped['log_Confirmed_per_1000'] = np.log(n_full_grouped['Confirmed_per_1000']+1) n_full_grouped['log_Confirmed_per_1000'].describe()

#Plot pairplot with countries organized by WHO regions g = sns.pairplot(n_full_grouped[['log_Confirmed_per_1000', 'avghumidity', 'avgtemp', 'urbanpop', 'WHO_Region']], hue='WHO_Region')

#plt.matshow(full_grouped.corr()) #plt.show() f, ax = plt.subplots(figsize=(10, 8)) corr = n_full_grouped.corr() # Generate a mask for the upper triangle mask = np.triu(np.ones_like(corr, dtype=bool)) # Generate a custom diverging colormap cmap = sns.diverging_palette(230, 20, as_cmap=True) # Plot heatmap sns.heatmap(corr, mask=mask, cmap=cmap, square=True, ax=ax)

n_full_grouped.columns

# 'post_schools', 'post_gathering', 'post_nonessential', 'post_publicplace', # Create interaction term n_full_grouped['quarXurbanpop'] = n_full_grouped['post_quarantine'] * n_full_grouped['urbanpop'] # OLS regression y = n_full_grouped['log_Confirmed_per_1000'] X = n_full_grouped[['post_quarantine', 'avghumidity', 'avgtemp', 'urbanpop', 'quarXurbanpop','stringency_index']] X = sm.add_constant(X) ols_model=sm.OLS(y,X.astype(float), missing='drop') result=ols_model.fit() print(result.summary2())