!pip install -r requirements.txt
import pandas as pd
import numpy as np
from datetime import date
import covidcast  # COVID-19 signal access (not used directly in this section)
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
def getCounty(df, county):
    """Return the rows of df belonging to a single county, identified by FIPS code."""
    return df[df["fips"] == county]
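A quick sanity check of the helper on a throwaway frame (a minimal sketch; the demo frame is illustrative, not from the original data):
# Sketch: getCounty keeps only the rows whose fips matches.
demo = pd.DataFrame({"fips": ['06037', '06001'], "dailyCases_t": [100.0, 25.0]})
print(getCounty(demo, '06037'))  # only the Los Angeles County ('06037') row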
# Every PyTorch network is a subclass of torch.nn.Module.
# A network class must define at least two methods: __init__() (the
# constructor) and forward() (the forward pass).
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, n_layers):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim,
                            hidden_size=hidden_dim * 2,
                            num_layers=n_layers,
                            batch_first=True)
        self.linear = nn.Linear(hidden_dim * 2, hidden_dim)
        self.relu = nn.ReLU()
        self.predictor = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        # hidden has shape (n_layers, batch, hidden_dim * 2); the final hidden
        # state is fed through a small fully connected head to one output.
        lstm_out, (hidden, cell) = self.lstm(x)
        hidden = self.relu(self.linear(hidden))
        out = self.predictor(hidden)
        return out
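As a quick shape check (a minimal sketch, not part of the original pipeline; check_model and dummy are illustrative names), the network can be run on a random batch shaped like the training data below: batch of 16, sequence length 1, 10 lagged features.
# Sketch: confirm the forward pass and output shape on a dummy batch.
dummy = torch.randn(16, 1, 10)
check_model = LSTM(input_dim=10, hidden_dim=4, n_layers=1)
print(check_model(dummy).shape)  # torch.Size([1, 16, 1]): (n_layers, batch, 1)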
# Cross-validate the model on all counties' data, sweeping the learning rate
cv = 5
EPOCHS = 100
lrs = [0.001, 0.01, 0.1, 1]
HIDDEN_DIM = 4

train_data = pd.read_csv("training_data.csv", dtype=object, index_col=False)
train_data['time'] = pd.to_datetime(train_data['time'])
headers = ['dailyCases_chng_t-1', 'dailyCases_chng_t-2',
           'outpatient_cli_t-1', 'outpatient_cli_t-2', 'hospitalAdm_claim_t-1',
           'hospitalAdm_claim_t-2', 'googleSym_sum_t-1', 'googleSym_sum_t-2',
           'doctorVisits_t-1', 'doctorVisits_t-2', 'dailyCases_t']
# Mean-center every feature and the target
for column_name in headers:
    train_data[column_name] = train_data[column_name].astype(float)
    train_data[column_name] = train_data[column_name] - train_data[column_name].mean()

y = np.array(train_data["dailyCases_t"], dtype=np.float32)
X = train_data.drop(columns=['dailyCases_t', 'time', 'fips'])
X = np.array(X, dtype=np.float32)
X = X.reshape(X.shape[0], 1, X.shape[1])  # (samples, seq_len=1, features)

county_avg_valid_MSEs = []  # average validation RMSE per learning rate
for lr in lrs:
    print("\n*********** CURRENT LEARNING RATE :", lr, "***********")
    i = 0
    tscv = TimeSeriesSplit(n_splits=cv)
    avg_valid_rmse = 0.0
    for train_index, valid_index in tscv.split(X):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        training_set = TensorDataset(torch.from_numpy(X_train),
                                     torch.from_numpy(y_train))
        training_dataloader = DataLoader(training_set, batch_size=16, shuffle=False)
        valid_set = TensorDataset(torch.from_numpy(X_valid),
                                  torch.from_numpy(y_valid))
        validation_dataloader = DataLoader(valid_set, batch_size=16, shuffle=False)
        print("SPLIT", i + 1, ":")

        model = LSTM(input_dim=10, hidden_dim=HIDDEN_DIM, n_layers=1)
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)
        loss_fn = nn.MSELoss()

        model.train()
        for epoch in range(EPOCHS):
            epoch_loss = 0.0
            train_prediction = []
            train_ground_truth = []
            for inputs, labels in training_dataloader:
                optimizer.zero_grad()
                output = model(inputs)
                # Optimize the RMSE rather than the raw MSE
                loss = torch.sqrt(loss_fn(output, torch.reshape(labels, output.shape)))
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
                train_prediction = np.concatenate((train_prediction, torch.flatten(output).detach().numpy()))
                train_ground_truth = np.concatenate((train_ground_truth, labels.detach().numpy()))
            train_rmse = np.sqrt(mean_squared_error(train_prediction, train_ground_truth))
            print("Epoch", epoch, ", Loss:", epoch_loss, ", Train RMSE:", train_rmse)

        model.eval()
        valid_prediction = []
        valid_ground_truth = []
        with torch.no_grad():
            for inputs, labels in validation_dataloader:
                output = model(inputs)
                valid_prediction = np.concatenate((valid_prediction, torch.flatten(output).numpy()))
                valid_ground_truth = np.concatenate((valid_ground_truth, labels.numpy()))
        valid_rmse = np.sqrt(mean_squared_error(valid_prediction, valid_ground_truth))
        avg_valid_rmse += valid_rmse
        print("Validation RMSE: ", valid_rmse)
        plt.plot(np.arange(len(valid_prediction)), valid_prediction, label="prediction")
        plt.plot(np.arange(len(valid_ground_truth)), valid_ground_truth, label="ground truth")
        plt.ylabel("Daily Cases")
        plt.title("Cross Validation Result")
        plt.legend()
        plt.show()
        i += 1

    avg_valid_rmse /= cv
    county_avg_valid_MSEs.append(avg_valid_rmse)
    print("lr:", lr)
    print("Average validation RMSE:", avg_valid_rmse)
    print()
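Once the sweep finishes, the learning rate with the lowest average validation RMSE can be picked programmatically (a small sketch; best_lr is an illustrative name):
# Sketch: select the best learning rate from the sweep above.
best_lr = lrs[int(np.argmin(county_avg_valid_MSEs))]
print("Best learning rate:", best_lr)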
Train the model on all counties' data and test on all counties' data
# Train the model on all counties' data
EPOCHS = 100
lr = 0.001
HIDDEN_DIM = 4

train_data = pd.read_csv("training_data.csv", dtype=object, index_col=False)
train_data['time'] = pd.to_datetime(train_data['time'])
headers = ['dailyCases_chng_t-1', 'dailyCases_chng_t-2',
           'outpatient_cli_t-1', 'outpatient_cli_t-2', 'hospitalAdm_claim_t-1',
           'hospitalAdm_claim_t-2', 'googleSym_sum_t-1', 'googleSym_sum_t-2',
           'doctorVisits_t-1', 'doctorVisits_t-2', 'dailyCases_t']
for column_name in headers:
    train_data[column_name] = train_data[column_name].astype(float)
    train_data[column_name] = train_data[column_name] - train_data[column_name].mean()

y = np.array(train_data["dailyCases_t"], dtype=np.float32)
X = train_data.drop(columns=['dailyCases_t', 'time', 'fips'])
X = np.array(X, dtype=np.float32)
X = X.reshape(X.shape[0], 1, X.shape[1])

training_set = TensorDataset(torch.from_numpy(X), torch.from_numpy(y))
training_dataloader = DataLoader(training_set, batch_size=16, shuffle=False)
model = LSTM(input_dim=10, hidden_dim=HIDDEN_DIM, n_layers=1)
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
loss_fn = nn.MSELoss()

model.train()
train_RMSEs = []
for epoch in range(EPOCHS):
    epoch_loss = 0.0
    train_prediction = []
    train_ground_truth = []
    for inputs, labels in training_dataloader:
        optimizer.zero_grad()
        output = model(inputs)
        loss = torch.sqrt(loss_fn(output, torch.reshape(labels, output.shape)))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        train_prediction = np.concatenate((train_prediction, torch.flatten(output).detach().numpy()))
        train_ground_truth = np.concatenate((train_ground_truth, labels.detach().numpy()))
    train_rmse = np.sqrt(mean_squared_error(train_prediction, train_ground_truth))
    train_RMSEs.append(train_rmse)
    print("Epoch", epoch, ", Loss:", epoch_loss, ", Train RMSE:", train_rmse)
# Test on all counties
test_data = pd.read_csv("test_data.csv", dtype=object, index_col=False)
mean_dailyCases = test_data['dailyCases_t'].astype(float).mean()
for column_name in headers:
    test_data[column_name] = test_data[column_name].astype(float)
    test_data[column_name] = test_data[column_name] - test_data[column_name].mean()

y_test = np.array(test_data["dailyCases_t"], dtype=np.float32)
X_test = test_data.drop(columns=['dailyCases_t', 'time', 'fips'])
X_test = np.array(X_test, dtype=np.float32)
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
test_set = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
test_dataloader = DataLoader(test_set, batch_size=16, shuffle=False)

model.eval()
test_prediction = []
test_ground_truth = []
with torch.no_grad():
    for inputs, labels in test_dataloader:
        output = model(inputs)
        test_prediction = np.concatenate((test_prediction, torch.flatten(output).numpy()))
        test_ground_truth = np.concatenate((test_ground_truth, labels.numpy()))
test_rmse = np.sqrt(mean_squared_error(test_prediction, test_ground_truth))
print("Test RMSE: ", test_rmse)
# Add the mean back so the plot is on the original daily-case scale
plt.plot(np.arange(len(test_ground_truth)), test_ground_truth + mean_dailyCases, label="ground truth")
plt.plot(np.arange(len(test_prediction)), test_prediction + mean_dailyCases, label="prediction")
plt.legend()
plt.title("Predicted COVID-19 cases 9/1/2021-10/1/2021 for 15 counties")
plt.xlabel("Day")
plt.ylabel("Daily Cases")
plt.savefig("lstm_test_prediction.png")
plt.show()
# Plot the train RMSE per epoch
plt.plot(np.arange(EPOCHS), train_RMSEs)
plt.xlabel("Epoch")
plt.ylabel("Train RMSE")
plt.savefig("train_RMSE.png")
plt.show()
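Since the cells below keep reusing this fitted model, it may be worth checkpointing it; a minimal sketch (the filename is an assumption, not from the original notebook):
# Sketch: persist the trained weights so they can be reloaded without retraining.
torch.save(model.state_dict(), "lstm_all_counties.pt")  # filename is illustrative
# To restore later:
# model = LSTM(input_dim=10, hidden_dim=HIDDEN_DIM, n_layers=1)
# model.load_state_dict(torch.load("lstm_all_counties.pt"))
# model.eval()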
Test on a single county
# Test on each county separately
counties = ['06001', '06013', '06019', '06029', '06037', '06059', '06065',
            '06067', '06071', '06073', '06075', '06077', '06081', '06085',
            '06111']
for county in counties:
    test_data = pd.read_csv("test_data.csv", dtype=object, index_col=False)
    test_data = test_data[test_data["fips"] == county]
    mean_dailyCases = test_data['dailyCases_t'].astype(float).mean()
    for column_name in headers:
        test_data[column_name] = test_data[column_name].astype(float)
        test_data[column_name] = test_data[column_name] - test_data[column_name].mean()

    y_test = np.array(test_data["dailyCases_t"], dtype=np.float32)
    X_test = test_data.drop(columns=['dailyCases_t', 'time', 'fips'])
    X_test = np.array(X_test, dtype=np.float32)
    X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
    test_set = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
    test_dataloader = DataLoader(test_set, batch_size=16, shuffle=False)

    model.eval()
    test_prediction = []
    test_ground_truth = []
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            output = model(inputs)
            test_prediction = np.concatenate((test_prediction, torch.flatten(output).numpy()))
            test_ground_truth = np.concatenate((test_ground_truth, labels.numpy()))
    test_rmse = np.sqrt(mean_squared_error(test_prediction, test_ground_truth))
    print("Test RMSE: ", test_rmse)
    plt.plot(np.arange(len(test_ground_truth)), test_ground_truth + mean_dailyCases, label="ground truth", c='r')
    plt.plot(np.arange(len(test_prediction)), test_prediction + mean_dailyCases, label="prediction")
    plt.legend()
    plt.xlabel("Day")
    plt.ylabel("Daily Cases")
    plt.title("county " + county)
    plt.show()
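Collecting the per-county test RMSEs into a list, rather than only printing them, makes them easy to reuse; a sketch mirroring the loop above (per_county_rmse is an illustrative name, and this is presumably how the NN_test_RMSEs array used in the decision-tree comparison further below was assembled):
# Sketch: recompute and collect each county's test RMSE in one pass.
per_county_rmse = []
full_test = pd.read_csv("test_data.csv", dtype=object, index_col=False)
for county in counties:
    sub = full_test[full_test["fips"] == county].copy()
    for col in headers:
        sub[col] = sub[col].astype(float)
        sub[col] = sub[col] - sub[col].mean()
    yc = np.array(sub["dailyCases_t"], dtype=np.float32)
    Xc = np.array(sub.drop(columns=['dailyCases_t', 'time', 'fips']), dtype=np.float32)
    Xc = Xc.reshape(Xc.shape[0], 1, Xc.shape[1])
    with torch.no_grad():
        pred = torch.flatten(model(torch.from_numpy(Xc))).numpy()
    per_county_rmse.append(np.sqrt(mean_squared_error(yc, pred)))
print(per_county_rmse)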
Train the model on the averaged county data
# Build an "average county" by averaging the 13 counties below
# (the 12 listed here plus '06013', which seeds the frame).
counties = ['06019', '06029', '06037', '06059', '06065',
            '06067', '06071', '06073', '06075', '06077', '06081', '06085']
df = pd.read_csv("data.csv", dtype=object, index_col=False)
df['time'] = pd.to_datetime(df['time'])
headers = ['dailyCases_chng_t-1', 'dailyCases_chng_t-2',
           'outpatient_cli_t-1', 'outpatient_cli_t-2', 'hospitalAdm_claim_t-1',
           'hospitalAdm_claim_t-2', 'googleSym_sum_t-1', 'googleSym_sum_t-2',
           'doctorVisits_t-1', 'doctorVisits_t-2', 'dailyCases_t']
for column_name in headers:
    df[column_name] = df[column_name].astype(float)
# Seed with one county, then add the others positionally; this assumes every
# county has the same number of rows in the same date order.
avg_county_data = df[df["fips"] == '06013'].drop(columns=["fips", "time"])
for county in counties:
    data = df[df["fips"] == county].drop(columns=["fips", "time"])
    for h in headers:
        avg_county_data[h] += np.array(data[h])
avg_county_data /= (len(counties) + 1)
# Re-attach the dates (index-aligned with the '06013' rows); df itself keeps
# its time column, since the single-county cell below still needs it
avg_county_data["time"] = df["time"]
avg_county_data
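An equivalent, arguably more idiomatic way to build the same average (a sketch; avg_alt is an illustrative name, and it assumes data.csv has one row per county per date) is a groupby over the same 13 counties:
# Sketch: the same 13-county average via groupby.
subset = df[df["fips"].isin(counties + ['06013'])]
avg_alt = subset.groupby("time")[headers].mean().reset_index()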
# Train the model on the averaged county data
cv = 5
EPOCHS = 100
lr = 1
HIDDEN_DIM = 4

data = avg_county_data
headers = ['dailyCases_chng_t-1', 'dailyCases_chng_t-2',
           'outpatient_cli_t-1', 'outpatient_cli_t-2', 'hospitalAdm_claim_t-1',
           'hospitalAdm_claim_t-2', 'googleSym_sum_t-1', 'googleSym_sum_t-2',
           'doctorVisits_t-1', 'doctorVisits_t-2', 'dailyCases_t']
mean_dailyCases = data['dailyCases_t'].astype(float).mean()
for column_name in headers:
    data[column_name] = data[column_name].astype(float)
    data[column_name] = data[column_name] - data[column_name].mean()

# Hold out 9/1/2021 onward as the test period
idx = data.index[data["time"] == np.datetime64(date(2021, 9, 1))]
test_data = data.loc[idx[0]:]
y_test = np.array(test_data["dailyCases_t"], dtype=np.float32)
X_test = test_data.drop(columns=['dailyCases_t', 'time'])
X_test = np.array(X_test, dtype=np.float32)
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
test_set = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
test_dataloader = DataLoader(test_set, batch_size=16, shuffle=False)

train_data = data.loc[:idx[0] - 1]
y = np.array(train_data["dailyCases_t"], dtype=np.float32)
X = train_data.drop(columns=['dailyCases_t', 'time'])
X = np.array(X, dtype=np.float32)
X = X.reshape(X.shape[0], 1, X.shape[1])

i = 0
tscv = TimeSeriesSplit(n_splits=cv)
avg_valid_rmse = 0.0
train_RMSEs_nn_avg = []
valid_RMSEs_nn_avg = []
test_RMSEs_nn_avg = []
for train_index, valid_index in tscv.split(X):
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    training_set = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
    training_dataloader = DataLoader(training_set, batch_size=16, shuffle=False)
    valid_set = TensorDataset(torch.from_numpy(X_valid), torch.from_numpy(y_valid))
    validation_dataloader = DataLoader(valid_set, batch_size=16, shuffle=False)
    print("split", i + 1, ":")

    model = LSTM(input_dim=10, hidden_dim=HIDDEN_DIM, n_layers=1)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    model.train()
    train_rmse_epoch_sum = 0.0
    for epoch in range(EPOCHS):
        epoch_loss = 0.0
        train_prediction = []
        train_ground_truth = []
        for inputs, labels in training_dataloader:
            optimizer.zero_grad()
            output = model(inputs)
            loss = torch.sqrt(loss_fn(output, torch.reshape(labels, output.shape)))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            train_prediction = np.concatenate((train_prediction, torch.flatten(output).detach().numpy()))
            train_ground_truth = np.concatenate((train_ground_truth, labels.detach().numpy()))
        train_rmse = np.sqrt(mean_squared_error(train_prediction, train_ground_truth))
        train_rmse_epoch_sum += train_rmse
        print("Epoch", epoch, ", Loss:", epoch_loss, ", Train RMSE:", train_rmse)
    train_RMSEs_nn_avg.append(train_rmse_epoch_sum / EPOCHS)

    model.eval()
    valid_prediction = []
    valid_ground_truth = []
    with torch.no_grad():
        for inputs, labels in validation_dataloader:
            output = model(inputs)
            valid_prediction = np.concatenate((valid_prediction, torch.flatten(output).numpy()))
            valid_ground_truth = np.concatenate((valid_ground_truth, labels.numpy()))
    valid_rmse = np.sqrt(mean_squared_error(valid_prediction, valid_ground_truth))
    avg_valid_rmse += valid_rmse
    valid_RMSEs_nn_avg.append(valid_rmse)
    print("Validation RMSE: ", valid_rmse)
    plt.plot(np.arange(len(valid_prediction)), valid_prediction, label="prediction")
    plt.plot(np.arange(len(valid_ground_truth)), valid_ground_truth, label="ground truth")
    plt.legend()
    plt.show()

    i += 1
    if i == cv:
        # Evaluate the final split's model on the held-out test period
        test_prediction = []
        test_ground_truth = []
        with torch.no_grad():
            for inputs, labels in test_dataloader:
                output = model(inputs)
                test_prediction = np.concatenate((test_prediction, torch.flatten(output).numpy()))
                test_ground_truth = np.concatenate((test_ground_truth, labels.numpy()))
        test_rmse = np.sqrt(mean_squared_error(test_prediction, test_ground_truth))
        test_RMSEs_nn_avg.append(test_rmse)
        print("Test RMSE: ", test_rmse)
        plt.plot(np.arange(len(test_prediction)), test_prediction + mean_dailyCases, label="prediction")
        plt.plot(np.arange(len(test_ground_truth)), test_ground_truth + mean_dailyCases, label="ground truth")
        plt.title("Test data Prediction")
        plt.legend()
        plt.show()

avg_valid_rmse /= cv
print("Average validation RMSE:", avg_valid_rmse)
print()
# Only the final split produced a test RMSE, so replicate it across the
# five splits for plotting
test_RMSEs_nn_avg = [test_RMSEs_nn_avg[0] for _ in range(5)]
plt.plot(range(1, 6), train_RMSEs_nn_avg, color='red', label='Training RMSEs')
plt.plot(range(1, 6), valid_RMSEs_nn_avg, color='green', label='Validation RMSEs')
plt.plot(range(1, 6), test_RMSEs_nn_avg, color='blue', label='Testing RMSEs')
plt.legend(loc='best')
plt.title('Train/Valid/Test RMSEs Across the 5 Splits')
plt.xlabel('Splits')
plt.ylabel('RMSE')
plt.show()
Train the model on single-county data
# Cross-validate the model on single-county data
cv = 5
EPOCHS = 100
lr = 1
HIDDEN_DIMs = [4]
counties = ['06037', '06013', '06073', '06075']

county_avg_valid_MSEs = []  # average validation RMSE per county
for HIDDEN_DIM in HIDDEN_DIMs:
    for county in counties:
        print("County:", county)
        data = df[df["fips"] == county].copy()
        mean_dailyCases = data['dailyCases_t'].astype(float).mean()
        for column_name in headers:
            data[column_name] = data[column_name].astype(float)
            data[column_name] = data[column_name] - data[column_name].mean()

        # Hold out 9/1/2021 onward as the test period
        idx = data.index[data["time"] == np.datetime64(date(2021, 9, 1))]
        test_data = data.loc[idx[0]:]
        y_test = np.array(test_data["dailyCases_t"], dtype=np.float32)
        X_test = test_data.drop(columns=['dailyCases_t', 'time', 'fips'])
        X_test = np.array(X_test, dtype=np.float32)
        X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
        test_set = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
        test_dataloader = DataLoader(test_set, batch_size=16, shuffle=False)

        train_data = data.loc[:idx[0] - 1]
        y = np.array(train_data["dailyCases_t"], dtype=np.float32)
        X = train_data.drop(columns=['dailyCases_t', 'time', 'fips'])
        X = np.array(X, dtype=np.float32)
        X = X.reshape(X.shape[0], 1, X.shape[1])

        i = 0
        tscv = TimeSeriesSplit(n_splits=cv)
        avg_valid_rmse = 0.0
        for train_index, valid_index in tscv.split(X):
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
            training_set = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
            training_dataloader = DataLoader(training_set, batch_size=16, shuffle=False)
            valid_set = TensorDataset(torch.from_numpy(X_valid), torch.from_numpy(y_valid))
            validation_dataloader = DataLoader(valid_set, batch_size=16, shuffle=False)
            print("split", i + 1, ":")

            model = LSTM(input_dim=10, hidden_dim=HIDDEN_DIM, n_layers=1)
            optimizer = torch.optim.SGD(model.parameters(), lr=lr)
            loss_fn = nn.MSELoss()

            model.train()
            for epoch in range(EPOCHS):
                epoch_loss = 0.0
                train_prediction = []
                train_ground_truth = []
                for inputs, labels in training_dataloader:
                    optimizer.zero_grad()
                    output = model(inputs)
                    loss = torch.sqrt(loss_fn(output, torch.reshape(labels, output.shape)))
                    loss.backward()
                    optimizer.step()
                    epoch_loss += loss.item()
                    train_prediction = np.concatenate((train_prediction, torch.flatten(output).detach().numpy()))
                    train_ground_truth = np.concatenate((train_ground_truth, labels.detach().numpy()))
                train_rmse = np.sqrt(mean_squared_error(train_prediction, train_ground_truth))
                print("Epoch", epoch, ", Loss:", epoch_loss, ", Train RMSE:", train_rmse)

            model.eval()
            valid_prediction = []
            valid_ground_truth = []
            with torch.no_grad():
                for inputs, labels in validation_dataloader:
                    output = model(inputs)
                    valid_prediction = np.concatenate((valid_prediction, torch.flatten(output).numpy()))
                    valid_ground_truth = np.concatenate((valid_ground_truth, labels.numpy()))
            valid_rmse = np.sqrt(mean_squared_error(valid_prediction, valid_ground_truth))
            avg_valid_rmse += valid_rmse
            print("Validation RMSE: ", valid_rmse)

            i += 1
            if i == cv:
                # Evaluate the final split's model on the held-out test period
                test_prediction = []
                test_ground_truth = []
                with torch.no_grad():
                    for inputs, labels in test_dataloader:
                        output = model(inputs)
                        test_prediction = np.concatenate((test_prediction, torch.flatten(output).numpy()))
                        test_ground_truth = np.concatenate((test_ground_truth, labels.numpy()))
                test_rmse = np.sqrt(mean_squared_error(test_prediction, test_ground_truth))
                print("Test RMSE: ", test_rmse)
                plt.plot(np.arange(len(test_prediction)), test_prediction + mean_dailyCases, label="prediction")
                plt.plot(np.arange(len(test_ground_truth)), test_ground_truth + mean_dailyCases, label="ground truth")
                plt.title("Test data Prediction")
                plt.legend()
                plt.show()
                print()

        avg_valid_rmse /= cv
        county_avg_valid_MSEs.append(avg_valid_rmse)
        print("Average validation RMSE:", avg_valid_rmse)
        print()
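For readability, each county's average validation RMSE can be paired back with its FIPS code (a small sketch, not in the original):
# Sketch: report the per-county average validation RMSEs side by side.
for fips, rmse in zip(counties, county_avg_valid_MSEs):
    print("county", fips, ": avg validation RMSE =", round(rmse, 2))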
Decision Tree Regression Model
df = pd.read_csv("data.csv", dtype=object)
df['time'] = pd.to_datetime(df['time'])
# Train on 9/3/20-9/3/21, test on 9/4/21 onward
train = df[df['time'] < '2021-09-04']
test = df[df['time'] >= '2021-09-04']

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

tscv = TimeSeriesSplit(n_splits=5)
all_counties = df.fips.unique()
best_model_per_county = []
max_depths = range(2, 20)
# Track the RMSEs of the best model per county, to plot later
train_RMSEs_per_county = []
valid_RMSEs_per_county = []
test_RMSEs_per_county = []
# Track the R2 scores of the best model per county, to plot later
train_R2_per_county = []
valid_R2_per_county = []
test_R2_per_county = []
for county in all_counties:
    print("\n ************* CURRENT COUNTY FIPS: ", county, "*************")
    min_rmse = float('inf')
    best_model = None
    best_train_RMSEs = None
    best_valid_RMSEs = None
    best_test_RMSEs = None
    best_train_R2s = None
    best_valid_R2s = None
    best_test_R2s = None
    data = getCounty(train, county)      # county data from 9/3/20-9/3/21
    data_test = getCounty(test, county)  # county data from 9/4/21-10/2/21
    # All our training data
    y = np.array(data["dailyCases_t"], dtype=np.float32)  # targets
    X = data.drop(columns=['dailyCases_t', 'time', 'fips'])
    X = np.array(X, dtype=np.float32)                      # training features
    # Our testing data
    y_test = np.array(data_test["dailyCases_t"], dtype=np.float32)
    X_test = data_test.drop(columns=['dailyCases_t', 'time', 'fips'])
    X_test = np.array(X_test, dtype=np.float32)
    # Hyperparameter tuning over the maximum tree depth
    for depth in max_depths:
        print("\n------ CURRENT DEPTH : ", depth)
        i = 0
        train_error = []
        valid_error = []
        test_error = []
        for train_index, valid_index in tscv.split(X):
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
            print("split", i)
            i += 1
            dtree = DecisionTreeRegressor(max_depth=depth)
            dtree.fit(X_train, y_train)
            # train
            pred_train = dtree.predict(X_train)
            train_rmse = np.sqrt(mean_squared_error(y_train, pred_train))
            train_error.append(train_rmse)
            train_R2 = r2_score(y_train, pred_train)
            print("The Train R-squared value is:", train_R2)
            print("The Train RMSE is:", train_rmse)
            # validation
            pred_valid = dtree.predict(X_valid)
            valid_rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))
            valid_error.append(valid_rmse)
            valid_R2 = r2_score(y_valid, pred_valid)
            print("The Validation R-squared value is:", valid_R2)
            print("The Validation RMSE is:", valid_rmse)
            # test
            pred_test = dtree.predict(X_test)
            test_rmse = np.sqrt(mean_squared_error(y_test, pred_test))
            test_error.append(test_rmse)
            test_R2 = r2_score(y_test, pred_test)
            print("The Test R-squared value is:", test_R2)
            print("The Test RMSE is:", test_rmse)
        # Keep the depth whose splits give the lowest average test RMSE
        avg_test_rmse = np.mean(test_error)
        if avg_test_rmse < min_rmse:
            min_rmse = avg_test_rmse
            best_model = dtree
            best_train_RMSEs = train_error
            best_valid_RMSEs = valid_error
            best_test_RMSEs = test_error
            best_train_R2s = train_R2
            best_valid_R2s = valid_R2
            best_test_R2s = test_R2
    best_model_per_county.append(best_model)
    train_RMSEs_per_county.append(best_train_RMSEs)
    valid_RMSEs_per_county.append(best_valid_RMSEs)
    test_RMSEs_per_county.append(best_test_RMSEs)
    train_R2_per_county.append(best_train_R2s)
    valid_R2_per_county.append(best_valid_R2s)
    test_R2_per_county.append(best_test_R2s)
# Among the per-county best models, keep the one with the lowest
# average test RMSE across all counties
best_index = 0
best_general_model = None
min_test_rmse = float("inf")
i = 0
for model in best_model_per_county:
    curr_test_errors = []
    for county in all_counties:
        data_test = getCounty(test, county)
        y_test = np.array(data_test["dailyCases_t"], dtype=np.float32)
        X_test = data_test.drop(columns=['dailyCases_t', 'time', 'fips'])
        X_test = np.array(X_test, dtype=np.float32)
        pred_test = model.predict(X_test)
        curr_test_errors.append(np.sqrt(mean_squared_error(y_test, pred_test)))
    if np.mean(curr_test_errors) < min_test_rmse:
        min_test_rmse = np.mean(curr_test_errors)
        best_general_model = model
        best_index = i
    i += 1
t = getCounty(test, '06037')  # LA County test data
y_t = np.array(t["dailyCases_t"], dtype=np.float32)
X_t = np.array(t.drop(columns=['dailyCases_t', 'time', 'fips']), dtype=np.float32)
pred_t = best_general_model.predict(X_t)
print("Test RMSE: ", np.sqrt(mean_squared_error(y_t, pred_t)))
plt.plot(range(1, len(y_t) + 1), y_t, color='red', label='Ground Truth')
plt.plot(range(1, len(y_t) + 1), pred_t, color='blue', label='Predictions by Best DT Model')
plt.legend(loc='best')
plt.title('Best DT Model Predictions vs. Actual Number of Cases in LA')
plt.xlabel('Days')
plt.ylabel('Number of Cases')
plt.show()
min_test_rmse
# Per-county test RMSEs of the NN (LSTM) model across the 15 counties
NN_test_RMSEs = np.array([422.96855915421315, 389.79688977191597, 548.7047392266221,
                          519.2485649450427, 2165.628641457177, 577.7467416936556,
                          847.4010750524387, 625.0115243391487, 813.5223898441664,
                          988.6604761160371, 336.09873845254344, 470.7531034449218,
                          310.65657127066, 452.3952963502877, 351.9262260762237])
np.mean(NN_test_RMSEs)
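To put the two models side by side (a small sketch; it assumes min_test_rmse from the decision-tree selection above is still in scope):
# Sketch: compare the two models' average test RMSEs.
print("LSTM mean test RMSE:   ", np.mean(NN_test_RMSEs))
print("Best DT mean test RMSE:", min_test_rmse)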
# Feature names: columns 2-11 of the county frame hold the 10 lagged features
feature_names = data.columns[2:12]
feature_names
# Feature importances of the best general model
print("Feature importances are:", best_general_model.feature_importances_)
# Visualize the four largest feature importances
(pd.Series(best_general_model.feature_importances_, index=feature_names)
   .nlargest(4)
   .plot(kind='barh'))
# Visualize the best tree
from sklearn import tree
fig = plt.figure(figsize=(25, 20))
_ = tree.plot_tree(best_general_model,
                   feature_names=list(feature_names),
                   filled=True)
plt.plot(range(1,6), train_RMSEs_per_county[best_index], color='red', label='Training RMSEs')
plt.plot(range(1,6), valid_RMSEs_per_county[best_index], color='green', label='Validation RMSEs')
plt.plot(range(1,6), test_RMSEs_per_county[best_index], color='blue', label='Testing RMSEs')
plt.legend(loc='best')
plt.title('Train/Valid/Test RMSEs Across the 5 Splits')
plt.xlabel('Splits')
plt.ylabel('RMSE')
plt.show()
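The same per-county view is available for the decision trees, since test_RMSEs_per_county stores each county's five split RMSEs (a closing sketch; dt_county_means is an illustrative name):
# Sketch: mean test RMSE of each county's best decision tree.
dt_county_means = [np.mean(errs) for errs in test_RMSEs_per_county]
for fips, m in zip(all_counties, dt_county_means):
    print("county", fips, ": mean test RMSE =", round(m, 2))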