Loading Data
! pip install covidcast
from datetime import date
import pandas as pd
import covidcast
from itertools import chain
ageusia_raw_search = pd.read_csv('ageusia_raw_search.csv')
anosmia_raw_search = pd.read_csv('anosmia_raw_search.csv')
change_outpatient_confirmed_percentage = pd.read_csv('change_outpatient_confirmed_percentage.csv')
change_outpatient_primary_percentage = pd.read_csv('change_outpatient_primary_percentage.csv')
doctor_visits = pd.read_csv('doctor_visits.csv')
ground_truth_cases = pd.read_csv('ground_truth_cases.csv')
smoothed_adj_covid19_from_claims = pd.read_csv("smoothed_adj_covid19_from_claims.csv")
smoothed_adj_covid19 = pd.read_csv("smoothed_adj_covid19.csv")
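These CSVs were exported ahead of time; the covidcast client imported above can also pull the same signals directly from the Delphi Epidata API. A minimal sketch follows, where the source/signal names are assumptions and should be checked against covidcast.metadata() before use.
# Hypothetical direct fetch of the anosmia search signal; the
# source/signal names here are assumptions, not verified against the API.
anosmia_fetched = covidcast.signal("google-symptoms", "anosmia_raw_search",
                                   date(2020, 2, 20), date(2020, 9, 27),
                                   geo_type="county")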
Data Cleaning
1. Signal Selection (cont.)
# Drop metadata columns that are not used as model features
meta_cols = ['issue', 'lag', 'missing_value', 'stderr', 'sample_size',
             'missing_stderr', 'missing_sample_size']
ground_truth_cases = ground_truth_cases.drop(meta_cols, axis=1)
ageusia_raw_search = ageusia_raw_search.drop(meta_cols, axis=1)
anosmia_raw_search = anosmia_raw_search.drop(meta_cols, axis=1)
change_outpatient_confirmed_percentage = change_outpatient_confirmed_percentage.drop(meta_cols, axis=1)
change_outpatient_primary_percentage = change_outpatient_primary_percentage.drop(meta_cols, axis=1)
doctor_visits = doctor_visits.drop(meta_cols, axis=1)
smoothed_adj_covid19_from_claims = smoothed_adj_covid19_from_claims.drop(meta_cols, axis=1)
smoothed_adj_covid19 = smoothed_adj_covid19.drop(meta_cols, axis=1)
# Data cover 02/20/2020-09/27/2020 at the county level
anosmia_raw_search[anosmia_raw_search["time_value"]<"2020-09-30"]
# smoothed_adj_covid19_from_claims[smoothed_adj_covid19_from_claims["time_value"]<"2020-09-30"]
print(len(ground_truth_cases))
print(len(anosmia_raw_search))
print(len(ageusia_raw_search))
print(len(change_outpatient_confirmed_percentage))
print(len(change_outpatient_primary_percentage))
print(len(doctor_visits))
print(len(smoothed_adj_covid19_from_claims))
print(len(smoothed_adj_covid19))
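The row counts differ because each signal covers a different set of counties and dates; the overlap that will survive a merge can be checked up front. A quick sketch, assuming every frame keeps the standard covidcast geo_value/time_value columns:
# Count the (county, date) pairs shared by all signals; a join on these
# keys is limited to this intersection.
frames = [ground_truth_cases, ageusia_raw_search, anosmia_raw_search,
          change_outpatient_confirmed_percentage,
          change_outpatient_primary_percentage, doctor_visits,
          smoothed_adj_covid19_from_claims, smoothed_adj_covid19]
keys = [set(zip(f["geo_value"], f["time_value"])) for f in frames]
print("shared (geo, date) pairs:", len(set.intersection(*keys)))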
2. Data Imputation
raw = pd.read_csv('train_raw.csv')
raw.columns
columns = ['Unnamed: 0',
'chng_smoothed_adj_outpatient_covid_0_issue',
'chng_smoothed_adj_outpatient_covid_0_lag',
'chng_smoothed_adj_outpatient_covid_0_missing_value',
'chng_smoothed_adj_outpatient_covid_0_missing_stderr',
'chng_smoothed_adj_outpatient_covid_0_missing_sample_size',
'chng_smoothed_adj_outpatient_covid_0_stderr',
'chng_smoothed_adj_outpatient_covid_0_sample_size',
'chng_smoothed_adj_outpatient_cli_1_issue',
'chng_smoothed_adj_outpatient_cli_1_lag',
'chng_smoothed_adj_outpatient_cli_1_missing_value',
'chng_smoothed_adj_outpatient_cli_1_missing_stderr',
'chng_smoothed_adj_outpatient_cli_1_missing_sample_size',
'chng_smoothed_adj_outpatient_cli_1_stderr',
'chng_smoothed_adj_outpatient_cli_1_sample_size',
'doctor-visits_smoothed_adj_cli_2_issue',
'doctor-visits_smoothed_adj_cli_2_lag',
'doctor-visits_smoothed_adj_cli_2_missing_value',
'doctor-visits_smoothed_adj_cli_2_missing_stderr',
'doctor-visits_smoothed_adj_cli_2_missing_sample_size',
'doctor-visits_smoothed_adj_cli_2_stderr',
'doctor-visits_smoothed_adj_cli_2_sample_size',
'hospital-admissions_smoothed_adj_covid19_from_claims_3_issue',
'hospital-admissions_smoothed_adj_covid19_from_claims_3_lag',
'hospital-admissions_smoothed_adj_covid19_from_claims_3_missing_value',
'hospital-admissions_smoothed_adj_covid19_from_claims_3_missing_stderr',
'hospital-admissions_smoothed_adj_covid19_from_claims_3_missing_sample_size',
'hospital-admissions_smoothed_adj_covid19_from_claims_3_stderr',
'hospital-admissions_smoothed_adj_covid19_from_claims_3_sample_size',
'hospital-admissions_smoothed_adj_covid19_4_issue',
'hospital-admissions_smoothed_adj_covid19_4_lag',
'hospital-admissions_smoothed_adj_covid19_4_missing_value',
'hospital-admissions_smoothed_adj_covid19_4_missing_stderr',
'hospital-admissions_smoothed_adj_covid19_4_missing_sample_size',
'hospital-admissions_smoothed_adj_covid19_4_stderr',
'hospital-admissions_smoothed_adj_covid19_4_sample_size',
'indicator-combination_confirmed_incidence_num_5_issue',
'indicator-combination_confirmed_incidence_num_5_lag',
'indicator-combination_confirmed_incidence_num_5_missing_value',
'indicator-combination_confirmed_incidence_num_5_missing_stderr',
'indicator-combination_confirmed_incidence_num_5_missing_sample_size',
'indicator-combination_confirmed_incidence_num_5_stderr',
'indicator-combination_confirmed_incidence_num_5_sample_size',
'geo_type']
raw = raw.drop(columns, axis=1)
raw[raw["time_value"]<="2020-09-30"]
# Check the missing-value rate within the study window
in_window = raw[raw["time_value"] < "2020-09-30"]
in_window.isna().sum() / len(in_window)
raw = in_window
raw
raw.columns[2:7]
# Fill NaNs in each signal column with that day's mean across counties
for i in raw.columns[2:7]:
    raw[i] = raw.groupby('time_value')[i].transform(lambda x: x.fillna(x.mean()))
# Drop remaining NaNs and confirm none are left
raw = raw.dropna()
raw.isna().sum()/len(raw)
raw
# final train_raw size
print(len(raw))
train_raw = raw
Model Training
1. Time Step Transform
from pandas import concat
from pandas import DataFrame
import numpy as np
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # put it all together
    agg = concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg
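A toy run makes the framing concrete (illustrative only, not part of the pipeline):
# Two variables observed over five days, framed with one lag step
toy = np.array([[1, 10], [2, 20], [3, 30], [4, 40], [5, 50]])
print(series_to_supervised(toy, n_in=1, n_out=1))
# Columns come out as var1(t-1), var2(t-1), var1(t), var2(t); the first
# row is dropped because it has no prior observation to lag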
train_raw.columns
train_raw
# sort the geo_value and time-value for shifting
to_shift = train_raw.sort_values(["geo_value","time_value"])
to_shift
# county = to_shift.geo_value
# cases = to_shift["indicator-combination_confirmed_incidence_num_5_value"]
# to_shift = to_shift.drop(["geo_value","indicator-combination_confirmed_incidence_num_5_value"],axis=1)
# Ordinal-encode the county identifier so it can be used numerically
from sklearn.preprocessing import OrdinalEncoder
encoder = OrdinalEncoder()
to_shift["geo_value"] = encoder.fit_transform(
    to_shift['geo_value'].values.reshape(-1, 1)).ravel()
values = to_shift.values
# frame with 7 days of lagged inputs plus the current day
data = series_to_supervised(values, 7, 1)
print(data)
data.head()
data.columns
# Drop geo_value/time_value at every lag, the lagged case counts, and
# geo_value at t; keep var2(t) (date, for sorting) and var8(t) (cases)
data.drop(data.columns[[0,1,7,8,9,15,16,17,23,24,25,31,32,33,39,40,41,47,48,49,55,56]], axis=1, inplace=True)
data
data["var8(t+1)"] = data["var8(t)"].shift(-1)
# drop first few rows because of shifting
data = data.dropna().drop("var8(t)",axis=1)
data
# sort chronologically by the current date, then drop it from the features
data = data.sort_values("var2(t)")
time_step = data.drop("var2(t)", axis=1)
time_step.head()
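The generated names hide which signal each var index refers to; a small lookup can be rebuilt from to_shift (this assumes series_to_supervised saw the columns in exactly this order, which it did above):
# Map var1..var8 back to the original column names fed into the framing
var_names = {"var%d" % (j + 1): col for j, col in enumerate(to_shift.columns)}
print(var_names)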
2. Train, Test, Validation Split
time_step.to_csv("train_final.csv")
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
X = time_step.iloc[:, :-1]
y = time_step.iloc[:, -1]
# time-series cross-validator, used for hyperparameter tuning below
tscv = TimeSeriesSplit(n_splits=5)
print(tscv)
# split into train and test sets; note the default shuffle=True mixes
# time periods, so pass shuffle=False for a strictly temporal holdout
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
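For reference, the expanding-window folds that tscv hands to GridSearchCV below can be inspected directly (a sketch; assumes X is already in chronological order, as arranged above):
# Each successive fold trains on a longer prefix and tests on the
# window immediately after it
for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
    print("fold %d: train %d rows, test %d rows" % (fold, len(train_idx), len(test_idx)))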
3. Interpretable Model
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
tree = DecisionTreeRegressor()
tree.fit(X_train,y_train)
y_pred = tree.predict(X_test)
print("R-Squared on train dataset = {}".format(tree.score(X_train,y_train)))
print("R-Squared on test dataset = {}".format(tree.score(X_test,y_test)))
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred, edgecolors=(0, 0, 0))
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.set_title('R^2: %.4f' % tree.score(X_test, y_test))
def reg_metrics(y_test, y_pred, X_train):
    from sklearn.metrics import mean_squared_error, r2_score
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    # scikit-learn has no adjusted R^2, so compute it manually
    n = y_pred.shape[0]
    k = X_train.shape[1]
    adj_r_sq = 1 - (1 - r2) * (n - 1) / (n - 1 - k)
    print('rmse:', rmse, 'r^2:', r2, 'adjusted r^2:', adj_r_sq)
reg_metrics(y_test, y_pred, X_train)
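A single shuffled holdout can be unlucky; cross_val_score with the time-series splitter gives a steadier read on the untuned tree. A minimal sketch:
# Per-fold RMSE under time-series cross-validation; the scorer returns
# negated MSE, so negate before taking the square root
cv_scores = cross_val_score(tree, X, y, cv=tscv, scoring='neg_mean_squared_error')
print('cv rmse per fold:', np.sqrt(-cv_scores))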
# hyperparameter tuning with GridSearchCV
from sklearn.model_selection import GridSearchCV
parameters = {"min_samples_split": [10, 20, 40],
"max_depth": [2, 6, 8],
"min_samples_leaf": [20, 40, 100],
"max_leaf_nodes": [5, 20, 100],
}
tuning_model = GridSearchCV(tree,param_grid=parameters,scoring='neg_mean_squared_error',cv=tscv)
tuning_model.fit(X_train, y_train)
# best hyperparameters
tuning_model.best_params_
tuned_hyper_model = DecisionTreeRegressor(max_depth=8, max_leaf_nodes=100, min_samples_leaf=100, min_samples_split=10)
tuned_hyper_model.fit(X_train,y_train)
# prediction
print("R-Squared on train dataset = {}".format(tuned_hyper_model.score(X_train,y_train)))
print("R-Squared on test dataset = {}".format(tuned_hyper_model.score(X_test,y_test)))
tuned_pred=tuned_hyper_model.predict(X_test)
fig, ax = plt.subplots()
ax.scatter(y_test, tuned_pred, edgecolors=(0, 0, 0))
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.set_title('R^2: %.4f' % tuned_hyper_model.score(X_test, y_test))
reg_metrics(y_test, tuned_pred, X_train)
# get importance
importance = tuned_hyper_model.feature_importances_
# summarize feature importance
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))
# plot feature importance
plt.bar([x for x in range(len(importance))], importance)
plt.title('Feature Importance')
plt.xlabel('Feature Number')
plt.ylabel('Score')
plt.show()
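Feature numbers are hard to interpret on their own; pairing the scores with column names shows which lagged signals drive the tree (column order is assumed to match X_train):
# Rank features by importance with their original column names attached
ranked = sorted(zip(X_train.columns, importance), key=lambda p: p[1], reverse=True)
for name, score in ranked[:10]:
    print('%s: %.5f' % (name, score))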
from matplotlib.pyplot import figure
figure(figsize=(10, 8), dpi=80)
# index the test samples; the shuffled split means they are not in time order
x = range(len(y_test))
# plot predicted vs. ground-truth case counts across the test samples
plt.plot(x, tuned_pred, label="Predicted Number of Cases")
plt.plot(x, y_test, label="Ground Truth Cases")
plt.xlabel('Test Sample', fontweight='bold', fontsize=15)
plt.ylabel('Cases', fontweight ='bold', fontsize = 15)
plt.legend(loc = 'upper right')
plt.show()
4. Complex Model
from math import sqrt
from matplotlib import pyplot
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
# train test data reshape for LSTM
train_X = np.array(X_train.values.reshape((X_train.shape[0], 1, X_train.shape[1])), dtype=np.float32)
test_X = np.array(X_test.values.reshape((X_test.shape[0], 1, X_test.shape[1])), dtype=np.float32)
train_y = np.array(y_train.values, dtype=np.float32)
test_y = np.array(y_test.values, dtype=np.float32)
# design network
model = Sequential()
model.add(LSTM(100, input_shape=(train_X.shape[1], train_X.shape[2])))
model.add(Dense(1))
model.compile(loss='mae', optimizer='adam')
# fit network
history = model.fit(train_X, train_y, epochs=100, batch_size=40, validation_data=(test_X, test_y), verbose=2, shuffle=False)
# plot history
pyplot.plot(history.history['loss'], label='Training')
pyplot.plot(history.history['val_loss'], label='Testing')
pyplot.title('Neural Network Model Loss')
pyplot.xlabel('Epoch')
pyplot.ylabel('Loss')
pyplot.legend()
pyplot.show()
# make a prediction
yhat = model.predict(test_X)
test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))
# calculate RMSE
rmse = sqrt(mean_squared_error(test_y, yhat))
print('Test RMSE: %.3f' % rmse)
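MinMaxScaler is imported above but never applied; LSTMs typically train more stably on inputs scaled to [0, 1]. A sketch of how scaling could be slotted in before the reshape (retraining on these arrays would change the reported loss and RMSE):
# Fit the scaler on the training features only, to avoid leaking test
# statistics, then reshape for the LSTM as above
scaler = MinMaxScaler()
train_X_scaled = scaler.fit_transform(X_train).reshape((X_train.shape[0], 1, X_train.shape[1]))
test_X_scaled = scaler.transform(X_test).reshape((X_test.shape[0], 1, X_test.shape[1]))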