!pip install -r requirements.txt
import pandas as pd
import numpy as np
from datetime import date
import covidcast  # COVID-19 signal access (not used directly in this section)
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
def getCounty(df, county):
    """Return the rows of df belonging to a single county, identified by FIPS code."""
    return df[df["fips"] == county]
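A quick sanity check of the helper on a throwaway frame (a minimal sketch; the demo frame is illustrative, not from the original data):
# Sketch: getCounty keeps only the rows whose fips matches.
demo = pd.DataFrame({"fips": ['06037', '06001'], "dailyCases_t": [100.0, 25.0]})
print(getCounty(demo, '06037'))  # only the Los Angeles County ('06037') row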
# Every PyTorch network is a subclass of torch.nn.Module.
# A network class must define at least two methods: __init__() (the
# constructor) and forward() (the forward pass).
class LSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, n_layers):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_dim,
                            hidden_size=hidden_dim * 2,
                            num_layers=n_layers,
                            batch_first=True)
        self.linear = nn.Linear(hidden_dim * 2, hidden_dim)
        self.relu = nn.ReLU()
        self.predictor = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        # hidden has shape (n_layers, batch, hidden_dim * 2); the final hidden
        # state is fed through a small fully connected head to one output.
        lstm_out, (hidden, cell) = self.lstm(x)
        hidden = self.relu(self.linear(hidden))
        out = self.predictor(hidden)
        return out
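As a quick shape check (a minimal sketch, not part of the original pipeline; check_model and dummy are illustrative names), the network can be run on a random batch shaped like the training data below: batch of 16, sequence length 1, 10 lagged features.
# Sketch: confirm the forward pass and output shape on a dummy batch.
dummy = torch.randn(16, 1, 10)
check_model = LSTM(input_dim=10, hidden_dim=4, n_layers=1)
print(check_model(dummy).shape)  # torch.Size([1, 16, 1]): (n_layers, batch, 1)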
# Cross-validate the model on all counties' data, sweeping the learning rate
cv = 5
EPOCHS = 100
lrs = [0.001, 0.01, 0.1, 1]
HIDDEN_DIM = 4

train_data = pd.read_csv("training_data.csv", dtype=object, index_col=False)
train_data['time'] = pd.to_datetime(train_data['time'])
headers = ['dailyCases_chng_t-1', 'dailyCases_chng_t-2',
           'outpatient_cli_t-1', 'outpatient_cli_t-2', 'hospitalAdm_claim_t-1',
           'hospitalAdm_claim_t-2', 'googleSym_sum_t-1', 'googleSym_sum_t-2',
           'doctorVisits_t-1', 'doctorVisits_t-2', 'dailyCases_t']
# Mean-center every feature and the target
for column_name in headers:
    train_data[column_name] = train_data[column_name].astype(float)
    train_data[column_name] = train_data[column_name] - train_data[column_name].mean()

y = np.array(train_data["dailyCases_t"], dtype=np.float32)
X = train_data.drop(columns=['dailyCases_t', 'time', 'fips'])
X = np.array(X, dtype=np.float32)
X = X.reshape(X.shape[0], 1, X.shape[1])  # (samples, seq_len=1, features)

county_avg_valid_MSEs = []  # average validation RMSE per learning rate
for lr in lrs:
    print("\n*********** CURRENT LEARNING RATE :", lr, "***********")
    i = 0
    tscv = TimeSeriesSplit(n_splits=cv)
    avg_valid_rmse = 0.0
    for train_index, valid_index in tscv.split(X):
        X_train, X_valid = X[train_index], X[valid_index]
        y_train, y_valid = y[train_index], y[valid_index]
        training_set = TensorDataset(torch.from_numpy(X_train),
                                     torch.from_numpy(y_train))
        training_dataloader = DataLoader(training_set, batch_size=16, shuffle=False)
        valid_set = TensorDataset(torch.from_numpy(X_valid),
                                  torch.from_numpy(y_valid))
        validation_dataloader = DataLoader(valid_set, batch_size=16, shuffle=False)
        print("SPLIT", i + 1, ":")

        model = LSTM(input_dim=10, hidden_dim=HIDDEN_DIM, n_layers=1)
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)
        loss_fn = nn.MSELoss()

        model.train()
        for epoch in range(EPOCHS):
            epoch_loss = 0.0
            train_prediction = []
            train_ground_truth = []
            for inputs, labels in training_dataloader:
                optimizer.zero_grad()
                output = model(inputs)
                # Optimize the RMSE rather than the raw MSE
                loss = torch.sqrt(loss_fn(output, torch.reshape(labels, output.shape)))
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item()
                train_prediction = np.concatenate((train_prediction, torch.flatten(output).detach().numpy()))
                train_ground_truth = np.concatenate((train_ground_truth, labels.detach().numpy()))
            train_rmse = np.sqrt(mean_squared_error(train_prediction, train_ground_truth))
            print("Epoch", epoch, ", Loss:", epoch_loss, ", Train RMSE:", train_rmse)

        model.eval()
        valid_prediction = []
        valid_ground_truth = []
        with torch.no_grad():
            for inputs, labels in validation_dataloader:
                output = model(inputs)
                valid_prediction = np.concatenate((valid_prediction, torch.flatten(output).numpy()))
                valid_ground_truth = np.concatenate((valid_ground_truth, labels.numpy()))
        valid_rmse = np.sqrt(mean_squared_error(valid_prediction, valid_ground_truth))
        avg_valid_rmse += valid_rmse
        print("Validation RMSE: ", valid_rmse)
        plt.plot(np.arange(len(valid_prediction)), valid_prediction, label="prediction")
        plt.plot(np.arange(len(valid_ground_truth)), valid_ground_truth, label="ground truth")
        plt.ylabel("Daily Cases")
        plt.title("Cross Validation Result")
        plt.legend()
        plt.show()
        i += 1

    avg_valid_rmse /= cv
    county_avg_valid_MSEs.append(avg_valid_rmse)
    print("lr:", lr)
    print("Average validation RMSE:", avg_valid_rmse)
    print()
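Once the sweep finishes, the learning rate with the lowest average validation RMSE can be picked programmatically (a small sketch; best_lr is an illustrative name):
# Sketch: select the best learning rate from the sweep above.
best_lr = lrs[int(np.argmin(county_avg_valid_MSEs))]
print("Best learning rate:", best_lr)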
Train the model on all counties' data and test on all counties' data
# Train the model on all counties' data
EPOCHS = 100
lr = 0.001
HIDDEN_DIM = 4

train_data = pd.read_csv("training_data.csv", dtype=object, index_col=False)
train_data['time'] = pd.to_datetime(train_data['time'])
headers = ['dailyCases_chng_t-1', 'dailyCases_chng_t-2',
           'outpatient_cli_t-1', 'outpatient_cli_t-2', 'hospitalAdm_claim_t-1',
           'hospitalAdm_claim_t-2', 'googleSym_sum_t-1', 'googleSym_sum_t-2',
           'doctorVisits_t-1', 'doctorVisits_t-2', 'dailyCases_t']
for column_name in headers:
    train_data[column_name] = train_data[column_name].astype(float)
    train_data[column_name] = train_data[column_name] - train_data[column_name].mean()

y = np.array(train_data["dailyCases_t"], dtype=np.float32)
X = train_data.drop(columns=['dailyCases_t', 'time', 'fips'])
X = np.array(X, dtype=np.float32)
X = X.reshape(X.shape[0], 1, X.shape[1])

training_set = TensorDataset(torch.from_numpy(X), torch.from_numpy(y))
training_dataloader = DataLoader(training_set, batch_size=16, shuffle=False)
model = LSTM(input_dim=10, hidden_dim=HIDDEN_DIM, n_layers=1)
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
loss_fn = nn.MSELoss()

model.train()
train_RMSEs = []
for epoch in range(EPOCHS):
    epoch_loss = 0.0
    train_prediction = []
    train_ground_truth = []
    for inputs, labels in training_dataloader:
        optimizer.zero_grad()
        output = model(inputs)
        loss = torch.sqrt(loss_fn(output, torch.reshape(labels, output.shape)))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        train_prediction = np.concatenate((train_prediction, torch.flatten(output).detach().numpy()))
        train_ground_truth = np.concatenate((train_ground_truth, labels.detach().numpy()))
    train_rmse = np.sqrt(mean_squared_error(train_prediction, train_ground_truth))
    train_RMSEs.append(train_rmse)
    print("Epoch", epoch, ", Loss:", epoch_loss, ", Train RMSE:", train_rmse)
# Test on all counties
test_data = pd.read_csv("test_data.csv", dtype=object, index_col=False)
mean_dailyCases = test_data['dailyCases_t'].astype(float).mean()
for column_name in headers:
    test_data[column_name] = test_data[column_name].astype(float)
    test_data[column_name] = test_data[column_name] - test_data[column_name].mean()

y_test = np.array(test_data["dailyCases_t"], dtype=np.float32)
X_test = test_data.drop(columns=['dailyCases_t', 'time', 'fips'])
X_test = np.array(X_test, dtype=np.float32)
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
test_set = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
test_dataloader = DataLoader(test_set, batch_size=16, shuffle=False)

model.eval()
test_prediction = []
test_ground_truth = []
with torch.no_grad():
    for inputs, labels in test_dataloader:
        output = model(inputs)
        test_prediction = np.concatenate((test_prediction, torch.flatten(output).numpy()))
        test_ground_truth = np.concatenate((test_ground_truth, labels.numpy()))
test_rmse = np.sqrt(mean_squared_error(test_prediction, test_ground_truth))
print("Test RMSE: ", test_rmse)
# Add the mean back so the plot is on the original daily-case scale
plt.plot(np.arange(len(test_ground_truth)), test_ground_truth + mean_dailyCases, label="ground truth")
plt.plot(np.arange(len(test_prediction)), test_prediction + mean_dailyCases, label="prediction")
plt.legend()
plt.title("Predicted COVID-19 cases 9/1/2021-10/1/2021 for 15 counties")
plt.xlabel("Day")
plt.ylabel("Daily Cases")
plt.savefig("lstm_test_prediction.png")
plt.show()
# Plot the train RMSE per epoch
plt.plot(np.arange(EPOCHS), train_RMSEs)
plt.xlabel("Epoch")
plt.ylabel("Train RMSE")
plt.savefig("train_RMSE.png")
plt.show()
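Since the cells below keep reusing this fitted model, it may be worth checkpointing it; a minimal sketch (the filename is an assumption, not from the original notebook):
# Sketch: persist the trained weights so they can be reloaded without retraining.
torch.save(model.state_dict(), "lstm_all_counties.pt")  # filename is illustrative
# To restore later:
# model = LSTM(input_dim=10, hidden_dim=HIDDEN_DIM, n_layers=1)
# model.load_state_dict(torch.load("lstm_all_counties.pt"))
# model.eval()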
Test on a single county
# Test on each county separately
counties = ['06001', '06013', '06019', '06029', '06037', '06059', '06065',
            '06067', '06071', '06073', '06075', '06077', '06081', '06085',
            '06111']
for county in counties:
    test_data = pd.read_csv("test_data.csv", dtype=object, index_col=False)
    test_data = test_data[test_data["fips"] == county]
    mean_dailyCases = test_data['dailyCases_t'].astype(float).mean()
    for column_name in headers:
        test_data[column_name] = test_data[column_name].astype(float)
        test_data[column_name] = test_data[column_name] - test_data[column_name].mean()

    y_test = np.array(test_data["dailyCases_t"], dtype=np.float32)
    X_test = test_data.drop(columns=['dailyCases_t', 'time', 'fips'])
    X_test = np.array(X_test, dtype=np.float32)
    X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
    test_set = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
    test_dataloader = DataLoader(test_set, batch_size=16, shuffle=False)

    model.eval()
    test_prediction = []
    test_ground_truth = []
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            output = model(inputs)
            test_prediction = np.concatenate((test_prediction, torch.flatten(output).numpy()))
            test_ground_truth = np.concatenate((test_ground_truth, labels.numpy()))
    test_rmse = np.sqrt(mean_squared_error(test_prediction, test_ground_truth))
    print("Test RMSE: ", test_rmse)
    plt.plot(np.arange(len(test_ground_truth)), test_ground_truth + mean_dailyCases, label="ground truth", c='r')
    plt.plot(np.arange(len(test_prediction)), test_prediction + mean_dailyCases, label="prediction")
    plt.legend()
    plt.xlabel("Day")
    plt.ylabel("Daily Cases")
    plt.title("county " + county)
    plt.show()
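Collecting the per-county test RMSEs into a list, rather than only printing them, makes them easy to reuse; a sketch mirroring the loop above (per_county_rmse is an illustrative name, and this is presumably how the NN_test_RMSEs array used in the decision-tree comparison further below was assembled):
# Sketch: recompute and collect each county's test RMSE in one pass.
per_county_rmse = []
full_test = pd.read_csv("test_data.csv", dtype=object, index_col=False)
for county in counties:
    sub = full_test[full_test["fips"] == county].copy()
    for col in headers:
        sub[col] = sub[col].astype(float)
        sub[col] = sub[col] - sub[col].mean()
    yc = np.array(sub["dailyCases_t"], dtype=np.float32)
    Xc = np.array(sub.drop(columns=['dailyCases_t', 'time', 'fips']), dtype=np.float32)
    Xc = Xc.reshape(Xc.shape[0], 1, Xc.shape[1])
    with torch.no_grad():
        pred = torch.flatten(model(torch.from_numpy(Xc))).numpy()
    per_county_rmse.append(np.sqrt(mean_squared_error(yc, pred)))
print(per_county_rmse)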
Train the model on the averaged county data
# Build an "average county" by averaging the 13 counties below
# (the 12 listed here plus '06013', which seeds the frame).
counties = ['06019', '06029', '06037', '06059', '06065',
            '06067', '06071', '06073', '06075', '06077', '06081', '06085']
df = pd.read_csv("data.csv", dtype=object, index_col=False)
df['time'] = pd.to_datetime(df['time'])
headers = ['dailyCases_chng_t-1', 'dailyCases_chng_t-2',
           'outpatient_cli_t-1', 'outpatient_cli_t-2', 'hospitalAdm_claim_t-1',
           'hospitalAdm_claim_t-2', 'googleSym_sum_t-1', 'googleSym_sum_t-2',
           'doctorVisits_t-1', 'doctorVisits_t-2', 'dailyCases_t']
for column_name in headers:
    df[column_name] = df[column_name].astype(float)
# Seed with one county, then add the others positionally; this assumes every
# county has the same number of rows in the same date order.
avg_county_data = df[df["fips"] == '06013'].drop(columns=["fips", "time"])
for county in counties:
    data = df[df["fips"] == county].drop(columns=["fips", "time"])
    for h in headers:
        avg_county_data[h] += np.array(data[h])
avg_county_data /= (len(counties) + 1)
# Re-attach the dates (index-aligned with the '06013' rows); df itself keeps
# its time column, since the single-county cell below still needs it
avg_county_data["time"] = df["time"]
avg_county_data
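An equivalent, arguably more idiomatic way to build the same average (a sketch; avg_alt is an illustrative name, and it assumes data.csv has one row per county per date) is a groupby over the same 13 counties:
# Sketch: the same 13-county average via groupby.
subset = df[df["fips"].isin(counties + ['06013'])]
avg_alt = subset.groupby("time")[headers].mean().reset_index()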
# Train the model on the averaged county data
cv = 5
EPOCHS = 100
lr = 1
HIDDEN_DIM = 4

data = avg_county_data
headers = ['dailyCases_chng_t-1', 'dailyCases_chng_t-2',
           'outpatient_cli_t-1', 'outpatient_cli_t-2', 'hospitalAdm_claim_t-1',
           'hospitalAdm_claim_t-2', 'googleSym_sum_t-1', 'googleSym_sum_t-2',
           'doctorVisits_t-1', 'doctorVisits_t-2', 'dailyCases_t']
mean_dailyCases = data['dailyCases_t'].astype(float).mean()
for column_name in headers:
    data[column_name] = data[column_name].astype(float)
    data[column_name] = data[column_name] - data[column_name].mean()

# Hold out 9/1/2021 onward as the test period
idx = data.index[data["time"] == np.datetime64(date(2021, 9, 1))]
test_data = data.loc[idx[0]:]
y_test = np.array(test_data["dailyCases_t"], dtype=np.float32)
X_test = test_data.drop(columns=['dailyCases_t', 'time'])
X_test = np.array(X_test, dtype=np.float32)
X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
test_set = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
test_dataloader = DataLoader(test_set, batch_size=16, shuffle=False)

train_data = data.loc[:idx[0] - 1]
y = np.array(train_data["dailyCases_t"], dtype=np.float32)
X = train_data.drop(columns=['dailyCases_t', 'time'])
X = np.array(X, dtype=np.float32)
X = X.reshape(X.shape[0], 1, X.shape[1])

i = 0
tscv = TimeSeriesSplit(n_splits=cv)
avg_valid_rmse = 0.0
train_RMSEs_nn_avg = []
valid_RMSEs_nn_avg = []
test_RMSEs_nn_avg = []
for train_index, valid_index in tscv.split(X):
    X_train, X_valid = X[train_index], X[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    training_set = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
    training_dataloader = DataLoader(training_set, batch_size=16, shuffle=False)
    valid_set = TensorDataset(torch.from_numpy(X_valid), torch.from_numpy(y_valid))
    validation_dataloader = DataLoader(valid_set, batch_size=16, shuffle=False)
    print("split", i + 1, ":")

    model = LSTM(input_dim=10, hidden_dim=HIDDEN_DIM, n_layers=1)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()

    model.train()
    train_rmse_epoch_sum = 0.0
    for epoch in range(EPOCHS):
        epoch_loss = 0.0
        train_prediction = []
        train_ground_truth = []
        for inputs, labels in training_dataloader:
            optimizer.zero_grad()
            output = model(inputs)
            loss = torch.sqrt(loss_fn(output, torch.reshape(labels, output.shape)))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            train_prediction = np.concatenate((train_prediction, torch.flatten(output).detach().numpy()))
            train_ground_truth = np.concatenate((train_ground_truth, labels.detach().numpy()))
        train_rmse = np.sqrt(mean_squared_error(train_prediction, train_ground_truth))
        train_rmse_epoch_sum += train_rmse
        print("Epoch", epoch, ", Loss:", epoch_loss, ", Train RMSE:", train_rmse)
    train_RMSEs_nn_avg.append(train_rmse_epoch_sum / EPOCHS)

    model.eval()
    valid_prediction = []
    valid_ground_truth = []
    with torch.no_grad():
        for inputs, labels in validation_dataloader:
            output = model(inputs)
            valid_prediction = np.concatenate((valid_prediction, torch.flatten(output).numpy()))
            valid_ground_truth = np.concatenate((valid_ground_truth, labels.numpy()))
    valid_rmse = np.sqrt(mean_squared_error(valid_prediction, valid_ground_truth))
    avg_valid_rmse += valid_rmse
    valid_RMSEs_nn_avg.append(valid_rmse)
    print("Validation RMSE: ", valid_rmse)
    plt.plot(np.arange(len(valid_prediction)), valid_prediction, label="prediction")
    plt.plot(np.arange(len(valid_ground_truth)), valid_ground_truth, label="ground truth")
    plt.legend()
    plt.show()

    i += 1
    if i == cv:
        # Evaluate the final split's model on the held-out test period
        test_prediction = []
        test_ground_truth = []
        with torch.no_grad():
            for inputs, labels in test_dataloader:
                output = model(inputs)
                test_prediction = np.concatenate((test_prediction, torch.flatten(output).numpy()))
                test_ground_truth = np.concatenate((test_ground_truth, labels.numpy()))
        test_rmse = np.sqrt(mean_squared_error(test_prediction, test_ground_truth))
        test_RMSEs_nn_avg.append(test_rmse)
        print("Test RMSE: ", test_rmse)
        plt.plot(np.arange(len(test_prediction)), test_prediction + mean_dailyCases, label="prediction")
        plt.plot(np.arange(len(test_ground_truth)), test_ground_truth + mean_dailyCases, label="ground truth")
        plt.title("Test data Prediction")
        plt.legend()
        plt.show()

avg_valid_rmse /= cv
print("Average validation RMSE:", avg_valid_rmse)
print()
# Only the final split produced a test RMSE, so replicate it across the
# five splits for plotting
test_RMSEs_nn_avg = [test_RMSEs_nn_avg[0] for _ in range(5)]
plt.plot(range(1, 6), train_RMSEs_nn_avg, color='red', label='Training RMSEs')
plt.plot(range(1, 6), valid_RMSEs_nn_avg, color='green', label='Validation RMSEs')
plt.plot(range(1, 6), test_RMSEs_nn_avg, color='blue', label='Testing RMSEs')
plt.legend(loc='best')
plt.title('Train/Valid/Test RMSEs Across the 5 Splits')
plt.xlabel('Splits')
plt.ylabel('RMSE')
plt.show()
Train the model on single-county data
# Cross-validate the model on single-county data
cv = 5
EPOCHS = 100
lr = 1
HIDDEN_DIMs = [4]
counties = ['06037', '06013', '06073', '06075']

county_avg_valid_MSEs = []  # average validation RMSE per county
for HIDDEN_DIM in HIDDEN_DIMs:
    for county in counties:
        print("County:", county)
        data = df[df["fips"] == county].copy()
        mean_dailyCases = data['dailyCases_t'].astype(float).mean()
        for column_name in headers:
            data[column_name] = data[column_name].astype(float)
            data[column_name] = data[column_name] - data[column_name].mean()

        # Hold out 9/1/2021 onward as the test period
        idx = data.index[data["time"] == np.datetime64(date(2021, 9, 1))]
        test_data = data.loc[idx[0]:]
        y_test = np.array(test_data["dailyCases_t"], dtype=np.float32)
        X_test = test_data.drop(columns=['dailyCases_t', 'time', 'fips'])
        X_test = np.array(X_test, dtype=np.float32)
        X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])
        test_set = TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
        test_dataloader = DataLoader(test_set, batch_size=16, shuffle=False)

        train_data = data.loc[:idx[0] - 1]
        y = np.array(train_data["dailyCases_t"], dtype=np.float32)
        X = train_data.drop(columns=['dailyCases_t', 'time', 'fips'])
        X = np.array(X, dtype=np.float32)
        X = X.reshape(X.shape[0], 1, X.shape[1])

        i = 0
        tscv = TimeSeriesSplit(n_splits=cv)
        avg_valid_rmse = 0.0
        for train_index, valid_index in tscv.split(X):
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
            training_set = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
            training_dataloader = DataLoader(training_set, batch_size=16, shuffle=False)
            valid_set = TensorDataset(torch.from_numpy(X_valid), torch.from_numpy(y_valid))
            validation_dataloader = DataLoader(valid_set, batch_size=16, shuffle=False)
            print("split", i + 1, ":")

            model = LSTM(input_dim=10, hidden_dim=HIDDEN_DIM, n_layers=1)
            optimizer = torch.optim.SGD(model.parameters(), lr=lr)
            loss_fn = nn.MSELoss()

            model.train()
            for epoch in range(EPOCHS):
                epoch_loss = 0.0
                train_prediction = []
                train_ground_truth = []
                for inputs, labels in training_dataloader:
                    optimizer.zero_grad()
                    output = model(inputs)
                    loss = torch.sqrt(loss_fn(output, torch.reshape(labels, output.shape)))
                    loss.backward()
                    optimizer.step()
                    epoch_loss += loss.item()
                    train_prediction = np.concatenate((train_prediction, torch.flatten(output).detach().numpy()))
                    train_ground_truth = np.concatenate((train_ground_truth, labels.detach().numpy()))
                train_rmse = np.sqrt(mean_squared_error(train_prediction, train_ground_truth))
                print("Epoch", epoch, ", Loss:", epoch_loss, ", Train RMSE:", train_rmse)

            model.eval()
            valid_prediction = []
            valid_ground_truth = []
            with torch.no_grad():
                for inputs, labels in validation_dataloader:
                    output = model(inputs)
                    valid_prediction = np.concatenate((valid_prediction, torch.flatten(output).numpy()))
                    valid_ground_truth = np.concatenate((valid_ground_truth, labels.numpy()))
            valid_rmse = np.sqrt(mean_squared_error(valid_prediction, valid_ground_truth))
            avg_valid_rmse += valid_rmse
            print("Validation RMSE: ", valid_rmse)

            i += 1
            if i == cv:
                # Evaluate the final split's model on the held-out test period
                test_prediction = []
                test_ground_truth = []
                with torch.no_grad():
                    for inputs, labels in test_dataloader:
                        output = model(inputs)
                        test_prediction = np.concatenate((test_prediction, torch.flatten(output).numpy()))
                        test_ground_truth = np.concatenate((test_ground_truth, labels.numpy()))
                test_rmse = np.sqrt(mean_squared_error(test_prediction, test_ground_truth))
                print("Test RMSE: ", test_rmse)
                plt.plot(np.arange(len(test_prediction)), test_prediction + mean_dailyCases, label="prediction")
                plt.plot(np.arange(len(test_ground_truth)), test_ground_truth + mean_dailyCases, label="ground truth")
                plt.title("Test data Prediction")
                plt.legend()
                plt.show()
                print()

        avg_valid_rmse /= cv
        county_avg_valid_MSEs.append(avg_valid_rmse)
        print("Average validation RMSE:", avg_valid_rmse)
        print()
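For readability, each county's average validation RMSE can be paired back with its FIPS code (a small sketch, not in the original):
# Sketch: report the per-county average validation RMSEs side by side.
for fips, rmse in zip(counties, county_avg_valid_MSEs):
    print("county", fips, ": avg validation RMSE =", round(rmse, 2))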
Decision Tree Regression Model
df = pd.read_csv("data.csv", dtype=object)
df['time'] = pd.to_datetime(df['time'])
# Train on 9/3/20-9/3/21, test on 9/4/21 onward
train = df[df['time'] < '2021-09-04']
test = df[df['time'] >= '2021-09-04']

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

tscv = TimeSeriesSplit(n_splits=5)
all_counties = df.fips.unique()
best_model_per_county = []
max_depths = range(2, 20)
# Track the RMSEs of the best model per county, to plot later
train_RMSEs_per_county = []
valid_RMSEs_per_county = []
test_RMSEs_per_county = []
# Track the R2 scores of the best model per county, to plot later
train_R2_per_county = []
valid_R2_per_county = []
test_R2_per_county = []
for county in all_counties:
    print("\n ************* CURRENT COUNTY FIPS: ", county, "*************")
    min_rmse = float('inf')
    best_model = None
    best_train_RMSEs = None
    best_valid_RMSEs = None
    best_test_RMSEs = None
    best_train_R2s = None
    best_valid_R2s = None
    best_test_R2s = None
    data = getCounty(train, county)      # county data from 9/3/20-9/3/21
    data_test = getCounty(test, county)  # county data from 9/4/21-10/2/21
    # All our training data
    y = np.array(data["dailyCases_t"], dtype=np.float32)  # targets
    X = data.drop(columns=['dailyCases_t', 'time', 'fips'])
    X = np.array(X, dtype=np.float32)                      # training features
    # Our testing data
    y_test = np.array(data_test["dailyCases_t"], dtype=np.float32)
    X_test = data_test.drop(columns=['dailyCases_t', 'time', 'fips'])
    X_test = np.array(X_test, dtype=np.float32)
    # Hyperparameter tuning over the maximum tree depth
    for depth in max_depths:
        print("\n------ CURRENT DEPTH : ", depth)
        i = 0
        train_error = []
        valid_error = []
        test_error = []
        for train_index, valid_index in tscv.split(X):
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
            print("split", i)
            i += 1
            dtree = DecisionTreeRegressor(max_depth=depth)
            dtree.fit(X_train, y_train)
            # train
            pred_train = dtree.predict(X_train)
            train_rmse = np.sqrt(mean_squared_error(y_train, pred_train))
            train_error.append(train_rmse)
            train_R2 = r2_score(y_train, pred_train)
            print("The Train R-squared value is:", train_R2)
            print("The Train RMSE is:", train_rmse)
            # validation
            pred_valid = dtree.predict(X_valid)
            valid_rmse = np.sqrt(mean_squared_error(y_valid, pred_valid))
            valid_error.append(valid_rmse)
            valid_R2 = r2_score(y_valid, pred_valid)
            print("The Validation R-squared value is:", valid_R2)
            print("The Validation RMSE is:", valid_rmse)
            # test
            pred_test = dtree.predict(X_test)
            test_rmse = np.sqrt(mean_squared_error(y_test, pred_test))
            test_error.append(test_rmse)
            test_R2 = r2_score(y_test, pred_test)
            print("The Test R-squared value is:", test_R2)
            print("The Test RMSE is:", test_rmse)
        # Keep the depth whose splits give the lowest average test RMSE
        avg_test_rmse = np.mean(test_error)
        if avg_test_rmse < min_rmse:
            min_rmse = avg_test_rmse
            best_model = dtree
            best_train_RMSEs = train_error
            best_valid_RMSEs = valid_error
            best_test_RMSEs = test_error
            best_train_R2s = train_R2
            best_valid_R2s = valid_R2
            best_test_R2s = test_R2
    best_model_per_county.append(best_model)
    train_RMSEs_per_county.append(best_train_RMSEs)
    valid_RMSEs_per_county.append(best_valid_RMSEs)
    test_RMSEs_per_county.append(best_test_RMSEs)
    train_R2_per_county.append(best_train_R2s)
    valid_R2_per_county.append(best_valid_R2s)
    test_R2_per_county.append(best_test_R2s)
# Among the per-county best models, keep the one with the lowest
# average test RMSE across all counties
best_index = 0
best_general_model = None
min_test_rmse = float("inf")
i = 0
for model in best_model_per_county:
    curr_test_errors = []
    for county in all_counties:
        data_test = getCounty(test, county)
        y_test = np.array(data_test["dailyCases_t"], dtype=np.float32)
        X_test = data_test.drop(columns=['dailyCases_t', 'time', 'fips'])
        X_test = np.array(X_test, dtype=np.float32)
        pred_test = model.predict(X_test)
        curr_test_errors.append(np.sqrt(mean_squared_error(y_test, pred_test)))
    if np.mean(curr_test_errors) < min_test_rmse:
        min_test_rmse = np.mean(curr_test_errors)
        best_general_model = model
        best_index = i
    i += 1
t = getCounty(test, '06037')  # LA County test data
y_t = np.array(t["dailyCases_t"], dtype=np.float32)
X_t = np.array(t.drop(columns=['dailyCases_t', 'time', 'fips']), dtype=np.float32)
pred_t = best_general_model.predict(X_t)
print("Test RMSE: ", np.sqrt(mean_squared_error(y_t, pred_t)))
plt.plot(range(1, len(y_t) + 1), y_t, color='red', label='Ground Truth')
plt.plot(range(1, len(y_t) + 1), pred_t, color='blue', label='Predictions by Best DT Model')
plt.legend(loc='best')
plt.title('Best DT Model Predictions vs. Actual Number of Cases in LA')
plt.xlabel('Days')
plt.ylabel('Number of Cases')
plt.show()
min_test_rmse
# Per-county test RMSEs of the NN (LSTM) model across the 15 counties
NN_test_RMSEs = np.array([422.96855915421315, 389.79688977191597, 548.7047392266221,
                          519.2485649450427, 2165.628641457177, 577.7467416936556,
                          847.4010750524387, 625.0115243391487, 813.5223898441664,
                          988.6604761160371, 336.09873845254344, 470.7531034449218,
                          310.65657127066, 452.3952963502877, 351.9262260762237])
np.mean(NN_test_RMSEs)
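To put the two models side by side (a small sketch; it assumes min_test_rmse from the decision-tree selection above is still in scope):
# Sketch: compare the two models' average test RMSEs.
print("LSTM mean test RMSE:   ", np.mean(NN_test_RMSEs))
print("Best DT mean test RMSE:", min_test_rmse)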
# Feature names: columns 2-11 of the county frame hold the 10 lagged features
feature_names = data.columns[2:12]
feature_names
# Feature importances of the best general model
print("Feature importances are:", best_general_model.feature_importances_)
# Visualize the four largest feature importances
(pd.Series(best_general_model.feature_importances_, index=feature_names)
   .nlargest(4)
   .plot(kind='barh'))
# Visualize the best tree
from sklearn import tree
fig = plt.figure(figsize=(25, 20))
_ = tree.plot_tree(best_general_model,
                   feature_names=list(feature_names),
                   filled=True)
plt.plot(range(1,6), train_RMSEs_per_county[best_index], color='red', label='Training RMSEs')
plt.plot(range(1,6), valid_RMSEs_per_county[best_index], color='green', label='Validation RMSEs')
plt.plot(range(1,6), test_RMSEs_per_county[best_index], color='blue', label='Testing RMSEs')
plt.legend(loc='best')
plt.title('Train/Valid/Test RMSEs Across the 5 Splits')
plt.xlabel('Splits')
plt.ylabel('RMSE')
plt.show()
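The same per-county view is available for the decision trees, since test_RMSEs_per_county stores each county's five split RMSEs (a closing sketch; dt_county_means is an illustrative name):
# Sketch: mean test RMSE of each county's best decision tree.
dt_county_means = [np.mean(errs) for errs in test_RMSEs_per_county]
for fips, m in zip(all_counties, dt_county_means):
    print("county", fips, ": mean test RMSE =", round(m, 2))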