!pip install scikit-optimize
from tensorflow.keras import regularizers
from skopt import BayesSearchCV
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import TimeDistributed
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import ReLU
from tensorflow.keras.layers import PReLU
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras import Input
from tensorflow.keras.activations import softplus
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers.schedules import LearningRateSchedule
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from tensorflow.keras.callbacks import LearningRateScheduler
import math
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error
from tensorflow.keras.regularizers import l1
from tensorflow.keras.regularizers import l2
from tensorflow.keras.regularizers import l1_l2
from sklearn.model_selection import cross_val_score
def all_features():
#Reading in DF and Formatting
clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_all_features.csv', index_col=0)
#clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_HomeTime.csv', index_col=0)
clean_good_full_df['geo_value'] = clean_good_full_df['geo_value'].astype("float")
clean_good_full_df['geo_value_t_minus_1'] = clean_good_full_df['geo_value_t_minus_1'].astype("float")
t = clean_good_full_df.iloc[:,0:10].reset_index()
t_minus_1 = clean_good_full_df.iloc[:,10:20].reset_index()
t_plus_1 = clean_good_full_df.iloc[:,20].reset_index()
clean_good_full_df = t_minus_1.merge(t, on = ['index']).merge(t_plus_1, on = ['index']).drop(columns = ['index']).reset_index(drop = True)
clean_good_full_df = clean_good_full_df.sort_values(by = 'time_value').drop(columns = ['time_value','time_value_t_minus_1'])
clean_good_full_df.shape
global scaler
scaler = StandardScaler()
global full_df_scaled
global X_full
global y_full
global X_train
global X_test
global y_train
global y_test
global X_cv
global y_cv
global X_df_scaled_reshaped
global Y_df_scaled_reshaped
global X_test_df_scaled_reshaped
global Y_test_df_scaled_reshaped
global X_cv_df_scaled_reshaped
global Y_cv_df_scaled_reshaped
#Scaling and Reshaping
scaler = StandardScaler()
scaler = scaler.fit(clean_good_full_df)
full_df_scaled = scaler.transform(clean_good_full_df)
X_full = full_df_scaled[:,0:18]
y_full = full_df_scaled[:,18]
X_train = X_full[0:4256,:]
X_test = X_full[4256:,:]
y_train = y_full[0:4256]
y_test = y_full[4256:]
X_cv = X_train
y_cv = y_train
X_df_scaled_reshaped = X_train.reshape(4256,2,9)
Y_df_scaled_reshaped = y_train.reshape(4256,1)
X_test_df_scaled_reshaped = X_test.reshape(1065,2,9)
Y_test_df_scaled_reshaped = y_test.reshape(1065,1)
X_cv_df_scaled_reshaped = X_cv.reshape(4256,2,9)
Y_cv_df_scaled_reshaped = y_cv.reshape(4256,1)
def no_google():
#Reading in DF and Formatting
clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_GoogleDropped.csv', index_col=0)
#clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_HomeTime.csv', index_col=0)
clean_good_full_df['geo_value'] = clean_good_full_df['geo_value'].astype("float")
clean_good_full_df['geo_value_t_minus_1'] = clean_good_full_df['geo_value_t_minus_1'].astype("float")
t = clean_good_full_df.iloc[:,0:8].reset_index()
t_minus_1 = clean_good_full_df.iloc[:,8:16].reset_index()
t_plus_1 = clean_good_full_df.iloc[:,16].reset_index()
clean_good_full_df = t_minus_1.merge(t, on = ['index']).merge(t_plus_1, on = ['index']).drop(columns = ['index']).reset_index(drop = True)
clean_good_full_df = clean_good_full_df.sort_values(by = 'time_value').drop(columns = ['time_value','time_value_t_minus_1'])
clean_good_full_df.shape
global scaler
scaler = StandardScaler()
global full_df_scaled
global X_full
global y_full
global X_train
global X_test
global y_train
global y_test
global X_cv
global y_cv
global X_df_scaled_reshaped
global Y_df_scaled_reshaped
global X_test_df_scaled_reshaped
global Y_test_df_scaled_reshaped
global X_cv_df_scaled_reshaped
global Y_cv_df_scaled_reshaped
#Scaling and Reshaping
scaler = StandardScaler()
scaler = scaler.fit(clean_good_full_df)
full_df_scaled = scaler.transform(clean_good_full_df)
X_full = full_df_scaled[:,0:14]
y_full = full_df_scaled[:,14]
X_train = X_full[0:6398,:]
X_test = X_full[6398:,:]
y_train = y_full[0:6398]
y_test = y_full[6398:]
X_cv = X_train
y_cv = y_train
X_df_scaled_reshaped = X_train.reshape(6398,2,7)
Y_df_scaled_reshaped = y_train.reshape(6398,1)
X_test_df_scaled_reshaped = X_test.reshape(1600,2,7)
Y_test_df_scaled_reshaped = y_test.reshape(1600,1)
X_cv_df_scaled_reshaped = X_cv.reshape(6398,2,7)
Y_cv_df_scaled_reshaped = y_cv.reshape(6398,1)
def no_home():
#Reading in DF and Formatting
clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_HomeTimeDropped.csv', index_col=0)
#clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_HomeTime.csv', index_col=0)
clean_good_full_df['geo_value'] = clean_good_full_df['geo_value'].astype("float")
clean_good_full_df['geo_value_t_minus_1'] = clean_good_full_df['geo_value_t_minus_1'].astype("float")
t = clean_good_full_df.iloc[:,0:9].reset_index()
t_minus_1 = clean_good_full_df.iloc[:,9:18].reset_index()
t_plus_1 = clean_good_full_df.iloc[:,18].reset_index()
clean_good_full_df = t_minus_1.merge(t, on = ['index']).merge(t_plus_1, on = ['index']).drop(columns = ['index']).reset_index(drop = True)
clean_good_full_df = clean_good_full_df.sort_values(by = 'time_value').drop(columns = ['time_value','time_value_t_minus_1'])
clean_good_full_df.shape
global scaler
scaler = StandardScaler()
global full_df_scaled
global X_full
global y_full
global X_train
global X_test
global y_train
global y_test
global X_cv
global y_cv
global X_df_scaled_reshaped
global Y_df_scaled_reshaped
global X_test_df_scaled_reshaped
global Y_test_df_scaled_reshaped
global X_cv_df_scaled_reshaped
global Y_cv_df_scaled_reshaped
#Scaling and Reshaping
scaler = StandardScaler()
scaler = scaler.fit(clean_good_full_df)
full_df_scaled = scaler.transform(clean_good_full_df)
X_full = full_df_scaled[:,0:16]
y_full = full_df_scaled[:,16]
X_train = X_full[0:5948,:]
X_test = X_full[5948:,:]
y_train = y_full[0:5948]
y_test = y_full[5948:]
X_cv = X_train
y_cv = y_train
X_df_scaled_reshaped = X_train.reshape(5948,2,8)
Y_df_scaled_reshaped = y_train.reshape(5948,1)
X_test_df_scaled_reshaped = X_test.reshape(1488,2,8)
Y_test_df_scaled_reshaped = y_test.reshape(1488,1)
X_cv_df_scaled_reshaped = X_cv.reshape(5948,2,8)
Y_cv_df_scaled_reshaped = y_cv.reshape(5948,1)
def no_home_no_google():
#Reading in DF and Formatting
clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_HomeTimeDropped_GoogleDropped.csv', index_col=0)
#clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_HomeTime.csv', index_col=0)
clean_good_full_df['geo_value'] = clean_good_full_df['geo_value'].astype("float")
clean_good_full_df['geo_value_t_minus_1'] = clean_good_full_df['geo_value_t_minus_1'].astype("float")
t = clean_good_full_df.iloc[:,0:7].reset_index()
t_minus_1 = clean_good_full_df.iloc[:,7:14].reset_index()
t_plus_1 = clean_good_full_df.iloc[:,14].reset_index()
clean_good_full_df = t_minus_1.merge(t, on = ['index']).merge(t_plus_1, on = ['index']).drop(columns = ['index']).reset_index(drop = True)
clean_good_full_df = clean_good_full_df.sort_values(by = 'time_value').drop(columns = ['time_value','time_value_t_minus_1'])
clean_good_full_df.shape
global scaler
scaler = StandardScaler()
global full_df_scaled
global X_full
global y_full
global X_train
global X_test
global y_train
global y_test
global X_cv
global y_cv
global X_df_scaled_reshaped
global Y_df_scaled_reshaped
global X_test_df_scaled_reshaped
global Y_test_df_scaled_reshaped
global X_cv_df_scaled_reshaped
global Y_cv_df_scaled_reshaped
#Scaling and Reshaping
scaler = StandardScaler()
scaler = scaler.fit(clean_good_full_df)
full_df_scaled = scaler.transform(clean_good_full_df)
X_full = full_df_scaled[:,0:12]
y_full = full_df_scaled[:,12]
X_train = X_full[0:9048,:]
X_test = X_full[9048:,:]
y_train = y_full[0:9048]
y_test = y_full[9048:]
X_cv = X_train
y_cv = y_train
X_df_scaled_reshaped = X_train.reshape(9048,2,6)
Y_df_scaled_reshaped = y_train.reshape(9048,1)
X_test_df_scaled_reshaped = X_test.reshape(2263,2,6)
Y_test_df_scaled_reshaped = y_test.reshape(2263,1)
X_cv_df_scaled_reshaped = X_cv.reshape(9048,2,6)
Y_cv_df_scaled_reshaped = y_cv.reshape(9048,1)
def delta_noHome_noGoogle():
#Reading in DF and Formatting
clean_good_full_df = pd.read_csv('delta_full_df_GoogleDropped_HomeTimeDropped.csv', index_col=0)
#clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_HomeTime.csv', index_col=0)
clean_good_full_df['geo_value'] = clean_good_full_df['geo_value'].astype("float")
clean_good_full_df = clean_good_full_df.sort_values(by = 'time_value').drop(columns = ['time_value'])
clean_good_full_df.shape
global scaler
scaler = StandardScaler()
global full_df_scaled
global X_full
global y_full
global X_train
global X_test
global y_train
global y_test
global X_cv
global y_cv
scaler = StandardScaler()
scaler = scaler.fit(clean_good_full_df)
full_df_scaled = scaler.transform(clean_good_full_df)
X_full = full_df_scaled[:,0:11]
y_full = full_df_scaled[:,11]
X_train = X_full[0:9048,:]
X_test = X_full[9048:,:]
y_train = y_full[0:9048]
y_test = y_full[9048:]
X_cv = X_train
y_cv = y_train
def delta_noHome():
#Reading in DF and Formatting
clean_good_full_df = pd.read_csv('delta_full_df_HomeTimeDropped.csv', index_col=0)
#clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_HomeTime.csv', index_col=0)
clean_good_full_df['geo_value'] = clean_good_full_df['geo_value'].astype("float")
clean_good_full_df = clean_good_full_df.sort_values(by = 'time_value').drop(columns = ['time_value'])
clean_good_full_df.shape
global scaler
scaler = StandardScaler()
global full_df_scaled
global X_full
global y_full
global X_train
global X_test
global y_train
global y_test
global X_cv
global y_cv
scaler = StandardScaler()
scaler = scaler.fit(clean_good_full_df)
full_df_scaled = scaler.transform(clean_good_full_df)
X_full = full_df_scaled[:,0:15]
y_full = full_df_scaled[:,15]
X_train = X_full[0:5948,:]
X_test = X_full[5948:,:]
y_train = y_full[0:5948]
y_test = y_full[5948:]
X_cv = X_train
y_cv = y_train
def delta_noGoogle():
#Reading in DF and Formatting
clean_good_full_df = pd.read_csv('delta_full_df_GoogleDropped.csv', index_col=0)
#clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_HomeTime.csv', index_col=0)
clean_good_full_df['geo_value'] = clean_good_full_df['geo_value'].astype("float")
clean_good_full_df = clean_good_full_df.sort_values(by = 'time_value').drop(columns = ['time_value'])
clean_good_full_df.shape
global scaler
scaler = StandardScaler()
global full_df_scaled
global X_full
global y_full
global X_train
global X_test
global y_train
global y_test
global X_cv
global y_cv
scaler = StandardScaler()
scaler = scaler.fit(clean_good_full_df)
full_df_scaled = scaler.transform(clean_good_full_df)
X_full = full_df_scaled[:,0:13]
y_full = full_df_scaled[:,13]
X_train = X_full[0:6398,:]
X_test = X_full[6398:,:]
y_train = y_full[0:6398]
y_test = y_full[6398:]
X_cv = X_train
y_cv = y_train
def delta_allFeatures():
#Reading in DF and Formatting
clean_good_full_df = pd.read_csv('delta_full_df_all_features.csv', index_col=0)
#clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_HomeTime.csv', index_col=0)
clean_good_full_df['geo_value'] = clean_good_full_df['geo_value'].astype("float")
clean_good_full_df = clean_good_full_df.sort_values(by = 'time_value').drop(columns = ['time_value'])
clean_good_full_df.shape
global scaler
scaler = StandardScaler()
global full_df_scaled
global X_full
global y_full
global X_train
global X_test
global y_train
global y_test
global X_cv
global y_cv
scaler = StandardScaler()
scaler = scaler.fit(clean_good_full_df)
full_df_scaled = scaler.transform(clean_good_full_df)
X_full = full_df_scaled[:,0:17]
y_full = full_df_scaled[:,17]
X_train = X_full[0:4256,:]
X_test = X_full[4256:,:]
y_train = y_full[0:4256]
y_test = y_full[4256:]
X_cv = X_train
y_cv = y_train
def no_google_6timesteps():
#Reading in DF and Formatting
global clean_good_full_df
global time
clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_GoogleDropped_6timesteps.csv', index_col=0)
#clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_HomeTime.csv', index_col=0)
clean_good_full_df['geo_value'] = clean_good_full_df['geo_value'].astype("float")
clean_good_full_df['geo_value_t_minus_1'] = clean_good_full_df['geo_value_t_minus_1'].astype("float")
clean_good_full_df['geo_value_t_minus_2'] = clean_good_full_df['geo_value_t_minus_2'].astype("float")
clean_good_full_df['geo_value_t_minus_3'] = clean_good_full_df['geo_value_t_minus_3'].astype("float")
clean_good_full_df['geo_value_t_minus_4'] = clean_good_full_df['geo_value_t_minus_4'].astype("float")
clean_good_full_df['geo_value_t_minus_5'] = clean_good_full_df['geo_value_t_minus_5'].astype("float")
time = clean_good_full_df.sort_values(by = 'time_value').drop(columns = [
'time_value_t_minus_1','time_value_t_minus_2','time_value_t_minus_3','time_value_t_minus_4',
'time_value_t_minus_5'])
clean_good_full_df = clean_good_full_df.sort_values(by = 'time_value').drop(columns = ['time_value',
'time_value_t_minus_1','time_value_t_minus_2','time_value_t_minus_3','time_value_t_minus_4',
'time_value_t_minus_5'])
clean_good_full_df.shape
global scaler
scaler = StandardScaler()
global full_df_scaled
global X_full
global y_full
global X_train
global X_test
global y_train
global y_test
global X_cv
global y_cv
global X_df_scaled_reshaped
global Y_df_scaled_reshaped
global X_test_df_scaled_reshaped
global Y_test_df_scaled_reshaped
global X_cv_df_scaled_reshaped
global Y_cv_df_scaled_reshaped
#Scaling and Reshaping
scaler = StandardScaler()
scaler = scaler.fit(clean_good_full_df)
full_df_scaled = scaler.transform(clean_good_full_df)
X_full = full_df_scaled[:,0:42]
y_full = full_df_scaled[:,42]
X_train = X_full[0:6695,:]
X_test = X_full[6695:,:]
y_train = y_full[0:6695]
y_test = y_full[6695:]
X_cv = X_train
y_cv = y_train
X_df_scaled_reshaped = X_train.reshape(6695,6,7)
Y_df_scaled_reshaped = y_train.reshape(6695,1)
X_test_df_scaled_reshaped = X_test.reshape(1674,6,7)
Y_test_df_scaled_reshaped = y_test.reshape(1674,1)
X_cv_df_scaled_reshaped = X_cv.reshape(6695,6,7)
Y_cv_df_scaled_reshaped = y_cv.reshape(6695,1)
no_google_6timesteps()
def no_google_5timesteps():
#Reading in DF and Formatting
clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_GoogleDropped_5timesteps.csv', index_col=0)
#clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_HomeTime.csv', index_col=0)
clean_good_full_df['geo_value'] = clean_good_full_df['geo_value'].astype("float")
clean_good_full_df['geo_value_t_minus_1'] = clean_good_full_df['geo_value_t_minus_1'].astype("float")
clean_good_full_df['geo_value_t_minus_2'] = clean_good_full_df['geo_value_t_minus_2'].astype("float")
clean_good_full_df['geo_value_t_minus_3'] = clean_good_full_df['geo_value_t_minus_3'].astype("float")
clean_good_full_df['geo_value_t_minus_4'] = clean_good_full_df['geo_value_t_minus_4'].astype("float")
clean_good_full_df = clean_good_full_df.sort_values(by = 'time_value').drop(columns = ['time_value',
'time_value_t_minus_1','time_value_t_minus_2','time_value_t_minus_3','time_value_t_minus_4'])
clean_good_full_df.shape
global scaler
scaler = StandardScaler()
global full_df_scaled
global X_full
global y_full
global X_train
global X_test
global y_train
global y_test
global X_cv
global y_cv
global X_df_scaled_reshaped
global Y_df_scaled_reshaped
global X_test_df_scaled_reshaped
global Y_test_df_scaled_reshaped
global X_cv_df_scaled_reshaped
global Y_cv_df_scaled_reshaped
#Scaling and Reshaping
scaler = StandardScaler()
scaler = scaler.fit(clean_good_full_df)
full_df_scaled = scaler.transform(clean_good_full_df)
X_full = full_df_scaled[:,0:35]
y_full = full_df_scaled[:,35]
X_train = X_full[0:6695,:]
X_test = X_full[6695:,:]
y_train = y_full[0:6695]
y_test = y_full[6695:]
X_cv = X_train
y_cv = y_train
X_df_scaled_reshaped = X_train.reshape(6695,5,7)
Y_df_scaled_reshaped = y_train.reshape(6695,1)
X_test_df_scaled_reshaped = X_test.reshape(1795,5,7)
Y_test_df_scaled_reshaped = y_test.reshape(1795,1)
X_cv_df_scaled_reshaped = X_cv.reshape(6695,5,7)
Y_cv_df_scaled_reshaped = y_cv.reshape(6695,1)
no_google_5timesteps()
def no_google_4timesteps():
#Reading in DF and Formatting
clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_GoogleDropped_4timesteps.csv', index_col=0)
#clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_HomeTime.csv', index_col=0)
clean_good_full_df['geo_value'] = clean_good_full_df['geo_value'].astype("float")
clean_good_full_df['geo_value_t_minus_1'] = clean_good_full_df['geo_value_t_minus_1'].astype("float")
clean_good_full_df['geo_value_t_minus_2'] = clean_good_full_df['geo_value_t_minus_2'].astype("float")
clean_good_full_df['geo_value_t_minus_3'] = clean_good_full_df['geo_value_t_minus_3'].astype("float")
clean_good_full_df = clean_good_full_df.sort_values(by = 'time_value').drop(columns = ['time_value',
'time_value_t_minus_1','time_value_t_minus_2','time_value_t_minus_3'])
clean_good_full_df.shape
global scaler
scaler = StandardScaler()
global full_df_scaled
global X_full
global y_full
global X_train
global X_test
global y_train
global y_test
global X_cv
global y_cv
global X_df_scaled_reshaped
global Y_df_scaled_reshaped
global X_test_df_scaled_reshaped
global Y_test_df_scaled_reshaped
global X_cv_df_scaled_reshaped
global Y_cv_df_scaled_reshaped
#Scaling and Reshaping
scaler = StandardScaler()
scaler = scaler.fit(clean_good_full_df)
full_df_scaled = scaler.transform(clean_good_full_df)
X_full = full_df_scaled[:,0:28]
y_full = full_df_scaled[:,28]
X_train = X_full[0:6721,:]
X_test = X_full[6721:,:]
y_train = y_full[0:6721]
y_test = y_full[6721:]
X_cv = X_train
y_cv = y_train
X_df_scaled_reshaped = X_train.reshape(6721,4,7)
Y_df_scaled_reshaped = y_train.reshape(6721,1)
X_test_df_scaled_reshaped = X_test.reshape(1859,4,7)
Y_test_df_scaled_reshaped = y_test.reshape(1859,1)
X_cv_df_scaled_reshaped = X_cv.reshape(6721,4,7)
Y_cv_df_scaled_reshaped = y_cv.reshape(6721,1)
no_google_4timesteps()
def no_google_3timesteps():
global clean_good_full_df
global time
#Reading in DF and Formatting
clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_GoogleDropped_3timesteps.csv', index_col=0)
#clean_good_full_df = pd.read_csv('imputed_clean_good_full_df_HomeTime.csv', index_col=0)
clean_good_full_df['geo_value'] = clean_good_full_df['geo_value'].astype("float")
clean_good_full_df['geo_value_t_minus_1'] = clean_good_full_df['geo_value_t_minus_1'].astype("float")
clean_good_full_df['geo_value_t_minus_2'] = clean_good_full_df['geo_value_t_minus_2'].astype("float")
time = clean_good_full_df.sort_values(by = 'time_value').drop(columns = [
'time_value_t_minus_1','time_value_t_minus_2'])
clean_good_full_df = clean_good_full_df.sort_values(by = 'time_value').drop(columns = ['time_value', 'time_value_t_minus_1','time_value_t_minus_2'])
clean_good_full_df.shape
global scaler
scaler = StandardScaler()
global full_df_scaled
global X_full
global y_full
global X_train
global X_test
global y_train
global y_test
global X_cv
global y_cv
global X_df_scaled_reshaped
global Y_df_scaled_reshaped
global X_test_df_scaled_reshaped
global Y_test_df_scaled_reshaped
global X_cv_df_scaled_reshaped
global Y_cv_df_scaled_reshaped
#Scaling and Reshaping
scaler = StandardScaler()
scaler = scaler.fit(clean_good_full_df)
full_df_scaled = scaler.transform(clean_good_full_df)
X_full = full_df_scaled[:,0:21]
y_full = full_df_scaled[:,21]
X_train = X_full[0:6863,:]
X_test = X_full[6863:,:]
y_train = y_full[0:6863]
y_test = y_full[6863:]
X_cv = X_train
y_cv = y_train
global train_combined
# bootstrap-resample the training rows (sampled with replacement) before reshaping
train_combined = pd.DataFrame(np.append(X_train, y_train.reshape(6863,1), axis = 1)).sample(frac = 1,replace = True).to_numpy()
X_train = train_combined[:,0:21]
y_train = train_combined[:,21]
X_df_scaled_reshaped = X_train.reshape(6863,3,7)
Y_df_scaled_reshaped = y_train.reshape(6863,1)
X_test_df_scaled_reshaped = X_test.reshape(1747,3,7)
Y_test_df_scaled_reshaped = y_test.reshape(1747,1)
X_cv_df_scaled_reshaped = X_cv.reshape(6863,3,7)
Y_cv_df_scaled_reshaped = y_cv.reshape(6863,1)
no_google_3timesteps()
#Regular
all_features()
no_google()
no_home()
no_home_no_google()
#Delta
delta_allFeatures()
delta_noGoogle()
delta_noHome()
delta_noHome_noGoogle()
#Multi Time steps
no_google_6timesteps()
no_google_5timesteps()
no_google_4timesteps()
no_google_3timesteps()
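# The loader functions above all repeat the same steps: read a pre-lagged CSV, standardize every
# column, split the rows chronologically, and reshape the flat lag columns into
# (samples, timesteps, features) for the recurrent layers. The helper below is a minimal sketch
# of that pattern, not part of the original pipeline; the function name and arguments are
# hypothetical, and it assumes the CSV already has its time columns dropped, features first and
# the target in the last column, with rows sorted by time.
def prepare_dataset(path, n_train, n_timesteps, n_features):
    df = pd.read_csv(path, index_col=0)
    # fit and apply a StandardScaler to every column (mirrors the loaders above)
    scaled = StandardScaler().fit_transform(df)
    X, y = scaled[:, :-1], scaled[:, -1]
    # chronological train/test split on the first n_train rows
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]
    # recurrent layers expect 3-D input, so the lag columns become (samples, timesteps, features)
    X_train = X_train.reshape(len(X_train), n_timesteps, n_features)
    X_test = X_test.reshape(len(X_test), n_timesteps, n_features)
    return X_train, X_test, y_train.reshape(-1, 1), y_test.reshape(-1, 1)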
import warnings
from tensorflow.keras.callbacks import Callback
# Early-stopping callback: stop training once the monitored loss falls below a fixed threshold
class EarlyStoppingByLossVal(Callback):
def __init__(self, monitor='val_loss', value=0.00001, verbose=0):
super().__init__()
self.monitor = monitor
self.value = value
self.verbose = verbose
def on_epoch_end(self, epoch, logs=None):
logs = logs or {}
current = logs.get(self.monitor)
if current is None:
warnings.warn("Early stopping requires %s available!" % self.monitor, RuntimeWarning)
return
if current < self.value:
if self.verbose > 0:
print("Epoch %05d: early stopping threshold reached" % epoch)
self.model.stop_training = True
def step_decay(epoch):
initial_lrate = 0.01
drop = 0.5
epochs_drop = 9.0
lrate = initial_lrate * math.pow(drop, math.floor((1 + epoch) / epochs_drop))
return lrate
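# Worked values for the schedule above (illustration only): with initial_lrate = 0.01, drop = 0.5
# and epochs_drop = 9, the rate is 0.01 for epochs 0-7, 0.005 for epochs 8-16, 0.0025 for
# epochs 17-25, and keeps halving every 9 epochs.
#print([step_decay(e) for e in [0, 7, 8, 16, 17]])  # -> [0.01, 0.01, 0.005, 0.005, 0.0025]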
lrate = LearningRateScheduler(step_decay)
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=100)
callbacks_list = [lrate, EarlyStoppingByLossVal(monitor='val_loss', value=4.5*10**(-5), verbose=1)]
#LSTM Model
from tensorflow.keras.constraints import max_norm
def create_LSTMmodel(optimizer="sgd", dropout_rate=0.4,
activation='softplus', neurons1 = 96, init_mode='glorot_uniform' , learn_rate=0.01, momentum = 0.9):
model = Sequential()
model.add(LSTM(neurons1,activation = activation, kernel_initializer=init_mode,input_shape=(X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2]),kernel_constraint=max_norm(3), recurrent_constraint=max_norm(3), bias_constraint=max_norm(3)))
#model.add(Dropout(dropout_rate))
model.add(Dense(1,kernel_initializer=init_mode))
model.compile(optimizer = SGD(lr=0.01,momentum= .9, clipnorm=1.0,clipvalue=0.5), loss = 'mse')
return model
model = create_LSTMmodel()
model.summary()
#kernel_regularizer=regularizers.l2(l2=0.0001)
#Bidirectional LSTM Model
def create_Bidirectional_LSTMmodel(optimizer="sgd", dropout_rate=0.0,
activation='softplus', neurons1 = 128, init_mode='glorot_uniform', momentum = 0.9):
model = Sequential()
model.add(Bidirectional(LSTM(neurons1,activation = activation, kernel_initializer=init_mode),input_shape=(X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2])))
model.add(Dropout(dropout_rate))
model.add(Dense(1,kernel_initializer=init_mode))
model.compile(optimizer = SGD(lr=0.01,momentum= .9,clipnorm=1.0,clipvalue=0.5), loss = 'mse')
return model
model = create_Bidirectional_LSTMmodel()
model.summary()
#GRU Model
def create_GRUmodel(optimizer="sgd", dropout_rate=0.0, activation='softplus', neurons = 128, init_mode='glorot_uniform' , learn_rate=0.01, momentum = 0.9):
model = Sequential()
model.add(GRU(neurons, kernel_initializer=init_mode,activation= activation,input_shape = (X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2])))
model.add(Dropout(dropout_rate))
model.add(Dense(Y_df_scaled_reshaped.shape[1],kernel_initializer=init_mode))
model.compile(optimizer = SGD(lr=learn_rate, momentum=momentum,clipnorm=1.0,clipvalue=0.5), loss = 'mse')
return model
model = create_GRUmodel()
model.summary()
#Convolutional Model
def create_Convmodel(optimizer="sgd", dropout_rate=0.2,
activation='softplus', neurons1 = 256, neurons2 = 64,neurons3 = 8, init_mode='glorot_uniform' , learn_rate=0.01, momentum = 0.9):
model = Sequential()
model.add(Conv1D(filters=64, kernel_size=2, activation=activation, input_shape = (X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2])))
model.add(MaxPooling1D(pool_size=1))
model.add(Flatten())
model.add(Dense(50, activation= activation))
model.add(Dense(1,kernel_initializer=init_mode))
model.compile(optimizer = SGD(lr=learn_rate, momentum=momentum,clipnorm=1.0,clipvalue=0.5), loss = 'mse')
return model
model = create_Convmodel()
model.summary()
#RNN Model
def create_RNNmodel(optimizer="sgd", dropout_rate=0.0, activation='softplus', neurons = 128, init_mode='glorot_uniform' , learn_rate=0.01, momentum = 0.9):
model = Sequential()
model.add(SimpleRNN(neurons, kernel_initializer=init_mode,activation= activation,input_shape = (X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2])))
model.add(Dropout(dropout_rate))
model.add(Dense(1,kernel_initializer=init_mode))
model.compile(optimizer = SGD(lr=learn_rate, momentum=momentum,clipnorm=1.0,clipvalue=0.5), loss = 'mse')
return model
model = create_RNNmodel()
model.summary()
#Feed-Forward Model
def create_FFmodel(optimizer="sgd", dropout_rate=0.0, activation='relu', neurons = 128, init_mode='glorot_uniform' , learn_rate=0.01, momentum = 0.9):
model = Sequential()
model.add(Flatten(input_shape = (X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2])))
model.add(Dense(neurons, activation=activation,kernel_initializer=init_mode))
model.add(Dense(1,kernel_initializer=init_mode))
model.compile(optimizer = SGD(lr=learn_rate, momentum=momentum,clipnorm=1.0,clipvalue=0.5), loss = 'mse')
return model
model = create_FFmodel()
model.summary()
# multiLayer-LSTM Model
def create_Bidirectional_multiLayer_LSTMmodel(optimizer="sgd", dropout_rate=0.0,
activation='softplus', hidden_layers = 1, neurons1 = 128, neurons2 = 128,neurons3 = 128, init_mode='glorot_uniform' , learn_rate=0.01, momentum = 0.9):
model = Sequential()
if hidden_layers == 1:
model.add(Bidirectional(LSTM(neurons1,activation= activation, kernel_initializer=init_mode),input_shape=(X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2])))
if hidden_layers == 2:
model.add(Bidirectional(LSTM(neurons1, return_sequences=True,activation= activation, kernel_initializer=init_mode),input_shape=(X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2])))
model.add(Bidirectional(LSTM(neurons1, kernel_initializer=init_mode)))
if hidden_layers == 3:
model.add(Bidirectional(LSTM(neurons1, return_sequences=True,activation= activation, kernel_initializer=init_mode),input_shape=(X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2])))
model.add(Bidirectional(LSTM(neurons1, return_sequences=True, kernel_initializer=init_mode)))
model.add(Bidirectional(LSTM(neurons1, kernel_initializer=init_mode)))
if hidden_layers == 4:
model.add(Bidirectional(LSTM(neurons1, return_sequences=True,activation= activation, kernel_initializer=init_mode),input_shape=(X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2])))
model.add(Bidirectional(LSTM(neurons1, return_sequences=True, kernel_initializer=init_mode)))
model.add(Bidirectional(LSTM(neurons1, return_sequences=True, kernel_initializer=init_mode)))
model.add(Bidirectional(LSTM(neurons1, kernel_initializer=init_mode)))
if hidden_layers == 5:
model.add(Bidirectional(LSTM(neurons1, return_sequences=True,activation= activation, kernel_initializer=init_mode),input_shape=(X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2])))
model.add(Bidirectional(LSTM(neurons1, return_sequences=True, kernel_initializer=init_mode)))
model.add(Bidirectional(LSTM(neurons1, return_sequences=True, kernel_initializer=init_mode)))
model.add(Bidirectional(LSTM(neurons1, return_sequences=True, kernel_initializer=init_mode)))
model.add(Bidirectional(LSTM(neurons1, kernel_initializer=init_mode)))
#if hidden_layers = 10:
#if hidden_layers = 20:
model.add(Dropout(dropout_rate))
model.add(Dense(1,kernel_initializer=init_mode))
model.compile(optimizer = SGD(lr=0.01,momentum= momentum,clipnorm=1.0,clipvalue=0.5), loss = 'mse')
return model
model = create_Bidirectional_multiLayer_LSTMmodel()
model.summary()
# multiLayer-LSTM Model
def create_multiLayer_LSTMmodel(optimizer="sgd", dropout_rate=0.0,
activation='tanh', hidden_layers = 1, neurons1 = 256, neurons2 = 128,neurons3 = 128, init_mode='glorot_uniform' , learn_rate=0.01, momentum = 0.9):
model = Sequential()
if hidden_layers == 1:
model.add(LSTM(neurons1,activation= activation, kernel_initializer=init_mode,input_shape=(X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2])))
if hidden_layers == 2:
model.add(LSTM(neurons1, return_sequences=True,activation= activation, kernel_initializer=init_mode,input_shape=(X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2])))
model.add(LSTM(neurons1, kernel_initializer=init_mode))
if hidden_layers == 3:
model.add(LSTM(neurons1, return_sequences=True,activation= activation, kernel_initializer=init_mode,input_shape=(X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2])))
model.add(LSTM(neurons1, return_sequences=True, kernel_initializer=init_mode))
model.add(LSTM(neurons1, kernel_initializer=init_mode))
if hidden_layers == 4:
model.add(LSTM(neurons1, return_sequences=True,activation= activation, kernel_initializer=init_mode,input_shape=(X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2])))
model.add(LSTM(neurons1, return_sequences=True, kernel_initializer=init_mode))
model.add(LSTM(neurons1, return_sequences=True, kernel_initializer=init_mode))
model.add(LSTM(neurons1, kernel_initializer=init_mode))
if hidden_layers == 5:
model.add(LSTM(neurons1, return_sequences=True,activation= activation, kernel_initializer=init_mode,input_shape=(X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2])))
model.add(LSTM(neurons1, return_sequences=True, kernel_initializer=init_mode))
model.add(LSTM(neurons1, return_sequences=True, kernel_initializer=init_mode))
model.add(LSTM(neurons1, return_sequences=True, kernel_initializer=init_mode))
model.add(LSTM(neurons1, kernel_initializer=init_mode))
#if hidden_layers = 10:
#if hidden_layers = 20:
model.add(Dropout(dropout_rate))
model.add(Dense(1,kernel_initializer=init_mode))
model.compile(optimizer = SGD(lr=0.01,momentum= momentum,clipnorm=1.0,clipvalue=0.5), loss = 'mse')
return model
model = create_multiLayer_LSTMmodel()
model.summary()
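# The multi-layer builders above enumerate each depth explicitly; the same stacking can be
# written as a loop. The sketch below is illustrative only (the function name and defaults are
# hypothetical) and mirrors the originals: only the first LSTM layer gets the custom activation
# and input shape, and only non-final layers return sequences.
def stacked_lstm(hidden_layers=1, neurons1=256, activation='tanh', init_mode='glorot_uniform',
                 input_shape=(2, 7)):
    model = Sequential()
    for layer in range(hidden_layers):
        kwargs = {'input_shape': input_shape} if layer == 0 else {}
        model.add(LSTM(neurons1,
                       activation=activation if layer == 0 else 'tanh',
                       kernel_initializer=init_mode,
                       return_sequences=(layer < hidden_layers - 1),
                       **kwargs))
    model.add(Dense(1, kernel_initializer=init_mode))
    model.compile(optimizer=SGD(lr=0.01, momentum=0.9, clipnorm=1.0, clipvalue=0.5), loss='mse')
    return model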
def create_Bidirectional_LSTM_Convmodel(optimizer="sgd", dropout_rate=0.0,
activation='softplus', neurons1 = 9, neurons2 = 128,neurons3 = 128, init_mode='glorot_uniform' , learn_rate=0.01, momentum = 0.9):
model = Sequential()
model.add(Bidirectional(LSTM(neurons1, return_sequences=True,activation= activation, kernel_initializer=init_mode),input_shape=(X_df_scaled_reshaped.shape[1],X_df_scaled_reshaped.shape[2])))
model.add(Conv1D(filters=64, kernel_size=2, activation=activation))
model.add(MaxPooling1D(pool_size=1))
model.add(Flatten())
model.add(Dense(1,kernel_initializer=init_mode))
model.compile(optimizer = SGD(lr=learn_rate, momentum=momentum,clipnorm=1.0,clipvalue=0.5), loss = 'mse')
return model
model = create_Bidirectional_LSTM_Convmodel()
model.summary()
#Regular
all_features()
no_google()
no_home()
no_home_no_google()
#Delta
delta_allFeatures()
delta_noGoogle()
delta_noHome()
delta_noHome_noGoogle()
create_LSTMmodel()
create_Bidirectional_LSTMmodel()
create_GRUmodel()
create_Convmodel()
create_RNNmodel()
create_FFmodel()
create_Bidirectional_multiLayer_LSTMmodel()
create_multiLayer_LSTMmodel()
create_Bidirectional_LSTM_Convmodel()
#No Google
no_google()
tscv = TimeSeriesSplit(n_splits=5)
LSTM_model = KerasRegressor(build_fn=create_LSTMmodel, verbose=2, epochs = 40)
print(np.mean(cross_val_score(LSTM_model, X_cv_df_scaled_reshaped , Y_cv_df_scaled_reshaped , cv= tscv, n_jobs = -1, scoring = 'neg_mean_squared_error')))
#No Google
no_google()
tscv = TimeSeriesSplit(n_splits=5)
Bi_LSTM_model = KerasRegressor(build_fn=create_Bidirectional_LSTMmodel, verbose=2, epochs = 40)
print(np.mean(cross_val_score(Bi_LSTM_model, X_cv_df_scaled_reshaped , Y_cv_df_scaled_reshaped , cv= tscv, n_jobs = -1, scoring = 'neg_mean_squared_error')))
#All Features
all_features()
tscv = TimeSeriesSplit(n_splits=5)
Bi_LSTM_model = KerasRegressor(build_fn=create_Bidirectional_LSTMmodel, verbose=2, epochs = 40)
print(np.mean(cross_val_score(Bi_LSTM_model, X_cv_df_scaled_reshaped , Y_cv_df_scaled_reshaped , cv= tscv, n_jobs = -1, scoring = 'neg_mean_squared_error')))
#No Home
no_home()
tscv = TimeSeriesSplit(n_splits=5)
Bi_LSTM_model = KerasRegressor(build_fn=create_Bidirectional_LSTMmodel, verbose=2, epochs = 40)
print(np.mean(cross_val_score(Bi_LSTM_model, X_cv_df_scaled_reshaped , Y_cv_df_scaled_reshaped , cv= tscv, n_jobs = -1, scoring = 'neg_mean_squared_error')))
#No Home, No Google
no_home_no_google()
tscv = TimeSeriesSplit(n_splits=5)
Bi_LSTM_model = KerasRegressor(build_fn=create_Bidirectional_LSTMmodel, verbose=2, epochs = 40)
print(np.mean(cross_val_score(Bi_LSTM_model, X_cv_df_scaled_reshaped , Y_cv_df_scaled_reshaped , cv= tscv, n_jobs = -1, scoring = 'neg_mean_squared_error')))
from sklearn.model_selection import cross_val_score
# create dict to save all the cv scores
cv_scores = {}
cv_scores['1 Timesteps'] = []
cv_scores['2 Timesteps'] = []
cv_scores['3 Timesteps'] = []
cv_scores['4 Timesteps'] = []
cv_scores['5 Timesteps'] = []
# forecast_models list
tscv = TimeSeriesSplit(n_splits=5)
#LSTM_model = KerasRegressor(build_fn=create_LSTMmodel, verbose=2, epochs = 10) # test with 10 epochs
#Bi_LSTM_model = KerasRegressor(build_fn=create_Bi_Directional_LSTMmodel, verbose=2, epochs = 50)
#forecast_models = [LSTM_model,Bi_LSTM_model]
# for loop to run through models and epochs
for epoch in np.array([10,20,30,40,50, 60, 70, 80, 90, 100]):
epoch_num = int(epoch)
# LSTM Model
no_google()
LSTM_model = KerasRegressor(build_fn=create_LSTMmodel, verbose=0, epochs = epoch_num)
cv_scores['1 Timesteps'] += [np.mean(cross_val_score(LSTM_model, X_cv_df_scaled_reshaped, Y_cv_df_scaled_reshaped, cv= tscv))]
no_google_3timesteps()
LSTM_model = KerasRegressor(build_fn=create_LSTMmodel, verbose=0, epochs = epoch_num)
cv_scores['2 Timesteps'] += [np.mean(cross_val_score(LSTM_model, X_cv_df_scaled_reshaped, Y_cv_df_scaled_reshaped, cv= tscv))]
no_google_4timesteps()
LSTM_model = KerasRegressor(build_fn=create_LSTMmodel, verbose=0, epochs = epoch_num)
cv_scores['3 Timesteps'] += [np.mean(cross_val_score(LSTM_model, X_cv_df_scaled_reshaped, Y_cv_df_scaled_reshaped, cv= tscv))]
no_google_5timesteps()
LSTM_model = KerasRegressor(build_fn=create_LSTMmodel, verbose=0, epochs = epoch_num)
cv_scores['4 Timesteps'] += [np.mean(cross_val_score(LSTM_model, X_cv_df_scaled_reshaped, Y_cv_df_scaled_reshaped, cv= tscv))]
no_google_6timesteps()
LSTM_model = KerasRegressor(build_fn=create_LSTMmodel, verbose=0, epochs = epoch_num)
cv_scores['5 Timesteps'] += [np.mean(cross_val_score(LSTM_model, X_cv_df_scaled_reshaped, Y_cv_df_scaled_reshaped, cv= tscv))]
print('epoch '+ str(epoch) + ' done.')
cv_df = pd.DataFrame(cv_scores, index= ['10','20','30','40','50', '60', '70','80','90','100'])
cv_df.plot(marker="o")
plt.xlabel("Epoch")
plt.ylabel("Cross Validation Scores")
plt.title("Comparing cross validation scores for different timesteps included in training")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid()
plt.show()
from sklearn.model_selection import cross_val_score
tscv = TimeSeriesSplit(n_splits=5)
no_google_6timesteps()
#no_google()
# create dict to save all the cv scores
cv_scores = {}
cv_scores['LSTM'] = []
cv_scores['Bi LSTM'] = []
cv_scores['FF'] = []
cv_scores['Conv'] = []
cv_scores['RNN'] = []
cv_scores['GRU'] = []
# forecast_models list
tscv = TimeSeriesSplit(n_splits=5)
#LSTM_model = KerasRegressor(build_fn=create_LSTMmodel, verbose=2, epochs = 10) # test with 10 epochs
#Bi_LSTM_model = KerasRegressor(build_fn=create_Bi_Directional_LSTMmodel, verbose=2, epochs = 50)
#forecast_models = [LSTM_model,Bi_LSTM_model]
# for loop to run through models and epochs
for epoch in np.array([10,20,30,40,50,60,70, 80, 90, 100, 110, 120 , 130, 140, 150]):
epoch_num = int(epoch)
# LSTM Model
LSTM_model = KerasRegressor(build_fn=create_LSTMmodel, verbose=0, epochs = epoch_num)
cv_scores['LSTM'] += [np.mean(cross_val_score(LSTM_model, X_cv_df_scaled_reshaped, Y_cv_df_scaled_reshaped, cv= tscv))]
# Bi LSTM Model
Bi_LSTM_model = KerasRegressor(build_fn=create_Bidirectional_LSTMmodel, verbose=0, epochs = epoch_num)
cv_scores['Bi LSTM'] += [np.mean(cross_val_score(Bi_LSTM_model, X_cv_df_scaled_reshaped, Y_cv_df_scaled_reshaped, cv= tscv))]
# Multi Bi LSTM Model
#Bi_multi_LSTM_model = KerasRegressor(build_fn=create_Bidirectional_multiLayer_LSTMmodel, verbose=0, epochs = epoch_num)
#cv_scores['Bi Multi LSTM '+str(epoch)] = np.mean(cross_val_score(Bi_multi_LSTM_model, X_cv_df_scaled_reshaped, Y_cv_df_scaled_reshaped, cv= tscv))
# Multi LSTM Model
#Multi_LSTM_model = KerasRegressor(build_fn=create_multiLayer_LSTMmodel, verbose=0, epochs = epoch_num)
#cv_scores['Multi LSTM '+str(epoch)] = np.mean(cross_val_score(Multi_LSTM_model, X_cv_df_scaled_reshaped, Y_cv_df_scaled_reshaped, cv= tscv))
# FF Model
#FF_model = KerasRegressor(build_fn=create_FFmodel, verbose=0, epochs = epoch_num)
#cv_scores['FF'] += [np.mean(cross_val_score(FF_model, X_cv_df_scaled_reshaped, Y_cv_df_scaled_reshaped, cv= tscv))]
# Conv model
#Conv_model = KerasRegressor(build_fn=create_Convmodel, verbose=0, epochs = epoch_num)
#cv_scores['Conv'] += [np.mean(cross_val_score(Conv_model, X_cv_df_scaled_reshaped, Y_cv_df_scaled_reshaped, cv= tscv))]
# RNN model
#RNN_model = KerasRegressor(build_fn=create_RNNmodel, verbose=0, epochs = epoch_num)
#cv_scores['RNN'] += [np.mean(cross_val_score(RNN_model, X_cv_df_scaled_reshaped, Y_cv_df_scaled_reshaped, cv= tscv))]
# GRU model
#GRU_model = KerasRegressor(build_fn=create_GRUmodel, verbose=0, epochs = epoch_num)
#cv_scores['GRU'] += [np.mean(cross_val_score(GRU_model, X_cv_df_scaled_reshaped, Y_cv_df_scaled_reshaped, cv= tscv))]
print('epoch '+ str(epoch) + ' done.')
# visualization + explanation
# drop models that were skipped (empty score lists) and index by the epoch values actually run
cv_df = pd.DataFrame({k: v for k, v in cv_scores.items() if len(v) > 0}, index= ['10','20','30','40','50','60','70','80','90','100','110','120','130','140','150'])
cv_df.plot(marker="o")
plt.xlabel("Epoch")
plt.ylabel("Cross Validation Scores")
plt.title("Comparing cross validation scores for different epochs across models")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid()
plt.show()
cv_scores.pop('FF', None)
cv_scores.pop('Conv', None)
cv_scores.pop('RNN', None)
cv_scores.pop('GRU', None)
cv_df = pd.DataFrame(cv_scores, index= ['10','20','30','40','50', '60', '70', '80','90', '100', '110', '120' , '130', '140', '150'])
cv_df.iloc[:,:2].plot(marker="o")
plt.xlabel("Epoch")
plt.ylabel("Cross Validation Scores")
plt.title("Comparing cross validation scores for different epochs across models")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid()
plt.show()
model = KerasRegressor(build_fn= create_multiLayer_LSTMmodel, verbose=1, epochs = 30)
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
init_mode = ['uniform', 'lecun_uniform', 'normal', 'zero', 'glorot_normal', 'glorot_uniform', 'he_normal', 'he_uniform']
activation = ['softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
learn_rate = [0.001, 0.01, 0.1, 0.2, 0.3]
dropout_rate = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
batch_size = [2, 4, 8, 16, 32, 64, 128, 256]
neurons1 = [256, 512]
neurons2 = [8, 16, 32, 64, 128, 256]
neurons3 = [8, 16, 32, 64, 128, 256]
hidden_layers = [1,2 , 3 , 4 , 5]
epochs = [10, 30, 50, 70, 100]
momentum = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
tscv = TimeSeriesSplit(n_splits=5)
param_grid = dict(batch_size = batch_size, dropout_rate = dropout_rate, activation= activation, hidden_layers = hidden_layers, momentum = momentum, neurons1 = neurons1)
grid = RandomizedSearchCV(estimator=model, n_iter = 20 ,param_distributions=param_grid, cv=tscv,n_jobs=-1,verbose = 2, )
#grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=tscv)
#grid = BayesSearchCV(estimator=model, search_spaces=param_grid, n_jobs=-1, cv=tscv, n_iter = 5, verbose = 1)
grid_result = grid.fit( X_cv_df_scaled_reshaped , Y_cv_df_scaled_reshaped, verbose = 1)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
#means = grid_result.cv_results_['mean_test_score']
#stds = grid_result.cv_results_['std_test_score']
#params = grid_result.cv_results_['params']
#for mean, stdev, param in zip(means, stds, params):
#print("%f (%f) with: %r" % (mean, stdev, param))
# Train and Plot loss
no_google_3timesteps()
forecast_model = create_LSTMmodel()
history = forecast_model.fit(X_df_scaled_reshaped, Y_df_scaled_reshaped,validation_data=(X_test_df_scaled_reshaped, Y_test_df_scaled_reshaped), epochs = 100, batch_size = 2, verbose=1, callbacks=callbacks_list)#callbacks=callbacks_list
plt.plot(history.history['loss'][5:], label='Training loss')
plt.plot(history.history['val_loss'][5:], label = 'Validation loss')
plt.xlabel("Epoch")
plt.ylabel("MSE Scores")
plt.grid()
plt.legend()
# Train and save repeated LSTM fits to build an ensemble (each run saved as forecast_model<i>)
for i in np.arange(8,100):
no_google_3timesteps()
forecast_model = create_LSTMmodel()
history = forecast_model.fit(X_df_scaled_reshaped, Y_df_scaled_reshaped,validation_data=(X_test_df_scaled_reshaped, Y_test_df_scaled_reshaped), epochs = 100, batch_size = 2, verbose=1, callbacks=callbacks_list)#callbacks=callbacks_list
forecast_model.save('forecast_model'+str(i))
forecast_model = create_FFmodel()
history = forecast_model.fit(X_df_scaled_reshaped, Y_df_scaled_reshaped,validation_data=(X_test_df_scaled_reshaped, Y_test_df_scaled_reshaped), epochs = 100, batch_size = 8, verbose=1, callbacks=callbacks_list)
plt.plot(history.history['loss'][5:], label='Training loss')
plt.plot(history.history['val_loss'][5:], label = 'Validation loss')
plt.xlabel("Epoch")
plt.ylabel("MSE Scores")
plt.grid()
plt.legend()
import tensorflow as tf; print(tf.__version__)
from tensorflow import keras
def ensemble_load():
global forecast_ensemble
forecast_ensemble = list()
for i in np.arange(8):
reconstructed_model = keras.models.load_model("forecast_model"+str(i))
forecast_ensemble.append(reconstructed_model)
def ensemble_predict(ensemble, number_of_predictions):
predictions = np.zeros(number_of_predictions)
for model in np.arange(len(ensemble)):
forecast = ensemble[model].predict(X_test_df_scaled_reshaped)
full_scaled_df_forecast = np.append(X_test, forecast, axis = 1)
full_df_forecast = scaler.inverse_transform(full_scaled_df_forecast)
predictions = predictions + full_df_forecast[:,full_df_scaled.shape[1]-1]
predictions = predictions / len(ensemble)
return predictions
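# ensemble_predict averages the inverse-transformed case forecasts across the loaded models:
# if, say, two models predict 100 and 120 cases for the same county and date, the ensemble
# value for that row is (100 + 120) / 2 = 110.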
def ensemble_rmse(ensemble):
for model in np.arange(len(ensemble)):
forecast = ensemble[model].predict(X_test_df_scaled_reshaped)
full_scaled_df_forecast = np.append(X_test, forecast, axis = 1)
full_df_forecast = scaler.inverse_transform(full_scaled_df_forecast)
test_df_transformed = scaler.inverse_transform(full_df_scaled[int(6863):,:])
print('RMSE: ',np.sqrt(mean_squared_error(test_df_transformed[:,full_df_scaled.shape[1]-1], full_df_forecast[:,full_df_scaled.shape[1]-1])))
ensemble_load()
ensemble_rmse(forecast_ensemble)
test = ensemble_predict(forecast_ensemble, 1747)
from sklearn.metrics import median_absolute_error
from sklearn.metrics import max_error
from sklearn.metrics import r2_score
test_df_transformed = scaler.inverse_transform(full_df_scaled[int(6863):,:])
#full_df_forecast = full_df_forecast.clip(min=0)
print('RMSE: ',np.sqrt(mean_squared_error(test_df_transformed[:,full_df_scaled.shape[1]-1], test)))
print('Median Absolute Error: ',median_absolute_error(test_df_transformed[:,full_df_scaled.shape[1]-1], test))
print('Max Error: ',max_error(test_df_transformed[:,full_df_scaled.shape[1]-1], test))
print('R squared Score: ', r2_score(test_df_transformed[:,full_df_scaled.shape[1]-1], test))
forecast = forecast_model.predict(X_test_df_scaled_reshaped)
full_scaled_df_forecast = np.append(X_test, forecast, axis = 1)
full_df_forecast = scaler.inverse_transform(full_scaled_df_forecast)
test_df_transformed = scaler.inverse_transform(full_df_scaled[int(6863):,:])
#full_df_forecast = full_df_forecast.clip(min=0)
print('RMSE: ',np.sqrt(mean_squared_error(test_df_transformed[:,full_df_scaled.shape[1]-1], full_df_forecast[:,full_df_scaled.shape[1]-1])))
print('Scaled MSE: ' , mean_squared_error(full_df_scaled[int(6863):,full_df_scaled.shape[1]-1],
full_scaled_df_forecast[:,full_df_scaled.shape[1]-1]))
forecast_train = forecast_model.predict(X_df_scaled_reshaped)
full_scaled_df_forecast_train = np.append(X_train, forecast_train, axis = 1)
full_df_forecast_train = scaler.inverse_transform(full_scaled_df_forecast_train)
test_df_transformed_train = scaler.inverse_transform(train_combined[0:6863,:])
#full_df_forecast = full_df_forecast.clip(min=0)
print('RMSE: ',np.sqrt(mean_squared_error(test_df_transformed_train[:,full_df_scaled.shape[1]-1], full_df_forecast_train[:,full_df_scaled.shape[1]-1])))
print('Scaled MSE: ' , mean_squared_error(train_combined[0:6863,full_df_scaled.shape[1]-1],
full_scaled_df_forecast_train[:,full_df_scaled.shape[1]-1]))
forecast = forecast_model.predict(X_test)
full_scaled_df_forecast = np.append(X_test, forecast, axis = 1)
full_df_forecast = scaler.inverse_transform(full_scaled_df_forecast)
test_df_transformed = scaler.inverse_transform(full_df_scaled[int(full_df_scaled.shape[0]*.8):,:])
#full_df_forecast = full_df_forecast.clip(min=0)
print('RMSE: ',np.sqrt(mean_squared_error(test_df_transformed[:,full_df_scaled.shape[1]-1], full_df_forecast[:,full_df_scaled.shape[1]-1])))
print('Scaled MSE: ' , mean_squared_error(full_df_scaled[int(full_df_scaled.shape[0]*.8):,full_df_scaled.shape[1]-1],
full_scaled_df_forecast[:,full_df_scaled.shape[1]-1]))
time_values = time.iloc[6863:,:][time['geo_value_t_minus_2']==25].time_value
actual_covid_cases = clean_good_full_df.iloc[6863:,21][clean_good_full_df.iloc[6863:,[0,21]].geo_value_t_minus_2 == 25]
predicted_covid_cases = full_df_forecast[clean_good_full_df.iloc[6863:,[0,21]].geo_value_t_minus_2 == 25][:,full_df_scaled.shape[1]-1]
plt.figure(figsize=(13,6))
plt.plot(time_values, actual_covid_cases, label='Actual Covid Cases')
plt.plot(time_values,predicted_covid_cases, label='Predicted Covid Cases')
plt.title("Los Angeles County Covid Cases Forecast")
plt.xlabel("Time")
plt.ylabel("Number of Covid Cases")
plt.xticks(np.arange(0,77,10),time_values.to_numpy()[np.arange(0,77,10)])
plt.grid()
plt.legend()
# create empty dictionary
county_RMSE = {}
for county in clean_good_full_df.iloc[6863:,[0,21]].geo_value_t_minus_2.unique():
# Train
# filter to get predicted and actual covid cases for each county
predicted_covid_train = full_df_forecast_train[pd.DataFrame(test_df_transformed_train).iloc[0:6863,[0,21]][0] == county][:,full_df_scaled.shape[1]-1]
actual_covid_train = test_df_transformed_train[0:6863,21][pd.DataFrame(test_df_transformed_train).iloc[0:6863,[0,21]][0] == county]
# Test
# filter to get predicted and actual covid cases for each county
predicted_covid = full_df_forecast[clean_good_full_df.iloc[6863:,[0,21]].geo_value_t_minus_2 == county][:,full_df_scaled.shape[1]-1]
actual_covid = clean_good_full_df.iloc[6863:,21][clean_good_full_df.iloc[6863:,[0,21]].geo_value_t_minus_2 == county]
# calculate RMSE
RMSE_train = np.sqrt(np.mean((actual_covid_train - predicted_covid_train)**2))
RMSE_test = np.sqrt(np.mean((actual_covid - predicted_covid)**2))
# save RMSE
county_RMSE[county] = [RMSE_train,RMSE_test]
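# Optional sanity check (sketch): for the last county processed above, the test RMSE computed
# in the loop should agree with sklearn's helper.
#print(np.sqrt(mean_squared_error(actual_covid, predicted_covid)), county_RMSE[county][1])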
county_RMSE
# .rename(columns = {'6.0': 'Merced','22.0':'San Diego'})
# 6: minimum 56.95034971416714 - 6047 geovalue Merced County
# 22: maximum 2064.8157958559805 - 6073 geovalue San Diego
RMSE_df = pd.DataFrame.from_dict(county_RMSE, orient='index').sort_index()
# plot
plt.figure(figsize=(13,6))
plt.scatter(np.arange(1,25),RMSE_df[1], label='Testing RMSE')
plt.scatter(np.arange(1,25),RMSE_df[0], label='Training RMSE')
plt.title("County Training and Testing RMSE values")
plt.xlabel("County (Ascendingly ordered by amount of Covid Cases")
plt.ylabel("RMSE")
plt.xticks(np.arange(1,25),RMSE_df.index)
plt.grid()
plt.legend(loc='upper left')
# get predicted and actual values of merced
time_values_merced = time.iloc[6863:,:][time['geo_value_t_minus_2']==6].time_value
actual_covid_cases_merced = clean_good_full_df.iloc[6863:,21][clean_good_full_df.iloc[6863:,[0,21]].geo_value_t_minus_2 == 6]
predicted_covid_cases_merced = full_df_forecast[clean_good_full_df.iloc[6863:,[0,21]].geo_value_t_minus_2 == 6][:,full_df_scaled.shape[1]-1]
merced_len = len(time_values_merced)
# get predicted and actual values of san diego
time_values_sd = time.iloc[6863:,:][time['geo_value_t_minus_2']==22].time_value
actual_covid_cases_sd = clean_good_full_df.iloc[6863:,21][clean_good_full_df.iloc[6863:,[0,21]].geo_value_t_minus_2 == 22]
predicted_covid_cases_sd = full_df_forecast[clean_good_full_df.iloc[6863:,[0,21]].geo_value_t_minus_2 == 22][:,full_df_scaled.shape[1]-1]
sd_len = len(time_values_sd)
# plot
# figure 1
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12,5))
fig.suptitle('Comparing Test predictions for Merced and San Diego')
ax1.plot(time_values_merced, actual_covid_cases_merced, label='Actual Covid Cases')
ax1.plot(time_values_merced, predicted_covid_cases_merced, label='Predicted Covid Cases')
ax1.grid()
ax1.legend()
# figure 2
ax2.plot(time_values_sd, actual_covid_cases_sd, label='Actual Covid Cases')
ax2.plot(time_values_sd, predicted_covid_cases_sd, label='Predicted Covid Cases')
ax2.grid()
ax2.legend()
# labeling the plots
# figure 1
ax1.set_xlabel("Time")
ax1.set_ylabel("Number of Covid Cases in Merced")
plt.sca(ax1)
plt.xticks(np.arange(0,merced_len,17),time_values_merced.to_numpy()[np.arange(0,merced_len,17)])
# figure 2
ax2.set_xlabel("Time")
ax2.set_ylabel("Number of Covid Cases in San Diego")
plt.sca(ax2)
plt.xticks(np.arange(0,sd_len,17),time_values_sd.to_numpy()[np.arange(0,sd_len,17)])
plt.tight_layout(pad=2.0)