Applying Machine Learning Methods to Movie Ratings Data
By Mary Nwangwu
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, LeaveOneOut, LeavePOut, validation_curve, learning_curve, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve, auc, roc_curve
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.model_selection import ShuffleSplit
import warnings
warnings.filterwarnings('ignore')
# Load movie ratings data set
data = pd.read_csv('movieReplicationSet.csv')
# Create dataframe with only movie ratings
data_ratings = data.iloc[:,0:400]
# Check for any rows or columns that had ALL null values
participant_null = data_ratings[data_ratings.isna().all(axis=1)]
movie_null = (data_ratings.isna().all()).sum()
print("Participants with no ratings: ")
display(participant_null)
print("Movies with no ratings: " + str(movie_null))
# drop user with all null values
data_ratings = data_ratings.drop(896) #user 896
data_ratings = data_ratings.reset_index(drop=True)
display(data_ratings)
# Calculate the average ratings for each movie and for each participant to fill in null values
avgRatings_movies = np.array(data_ratings.mean(axis=0)) # mean across rows, return 400 values (400,)
avgRatings_participant = np.array(data_ratings.mean(axis=1)) # mean across columns, return 1096 values (1096,)
# print(avgRatings_movies.shape) #(400,)
# print(avgRatings_participant.shape) #(1096,)
# Create new dataframe where every element is the average rating of the corresponding movie and participant
avgRatings_movies_2 = np.vstack([avgRatings_movies]*1096) # stack avg movie rating array vertically, returns 1096 duplicated rows of the 400 avg movie ratings
avgRatings_participant_2 = np.transpose(np.vstack([avgRatings_participant]*400)) # stack avg participant rating array vertically (400 duplicated rows of the 1096 participant averages), then transpose
# print(np.isnan(avgRatings_movies_2).sum()) # sanity check on the (1096,400) grid: expect 0 nulls
# print(np.isnan(avgRatings_participant_2).sum()) # sanity check on the (1096,400) grid: expect 0 nulls
avgRatings_df = pd.DataFrame((avgRatings_movies_2 + avgRatings_participant_2) / 2)
# print(avgRatings_df.shape) #(1096,400)
data2 = data_ratings
# print(data2.shape) #(1096,400)
# display(avgRatings_df)
# display(data2)
# convert column headings in avgRatings_df to the same headings as in data2
movie_names = list(data2)
avgRatings_df.columns = movie_names
# print(avgRatings_df.shape) #(1096,400)
# overlay the two dataframes: nulls in the first are filled with the corresponding values from the second, yielding a fully filled df
movieRatings_fill = data2.combine_first(avgRatings_df)
# print(movieRatings_fill.shape) #(1096,400)
# display(movieRatings_fill)
movieRatings = movieRatings_fill
display(movieRatings)
# Create dictionary to obtain COD values for each movie combo
r2_max_dict = {}
# Iterate through all movie combinations, calculate COD
for i in movieRatings:
    r2_dict = {}
    y = movieRatings[i].to_numpy()
    for j in movieRatings:
        if j != i:
            x = movieRatings[j].to_numpy()
            reg = LinearRegression().fit(x.reshape(-1,1), y)
            y_hat = reg.predict(x.reshape(-1,1))
            r2 = r2_score(y, y_hat)
            # Add COD values to dictionary
            r2_dict[str(i) + ' vs ' + str(j)] = r2
    # Add the max COD value for movie i to a new dictionary
    r2_max_dict[max(r2_dict, key=r2_dict.get)] = max(r2_dict.values())
# Average COD of 400 Simple Linear Regression Models
avgCOD_SLR = sum(r2_max_dict.values()) / len(r2_max_dict)
print("The average COD of the 400 simple linear regression models is: " + str(avgCOD_SLR))
# Plot 400 COD values on histogram
f = plt.figure(figsize = (5,3), dpi = 200)
plt.title('CODs for 400 Simple Linear Regression Models', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.xlabel('COD Value', fontsize=15)
plt.vlines(avgCOD_SLR, ymin=0, ymax=25, colors='#AAFF32', label='Mean')
plt.legend()
plt.hist(r2_max_dict.values(), bins=50, color='palevioletred', edgecolor='black', linewidth=0.5);
# Sort the COD max dictionary
r2_max_dict_sorted = sorted(r2_max_dict.items(), key=lambda x: x[1])
# Select the 10 movies with the lowest CODs (hardest to predict) and the 10 with the highest (easiest to predict)
first_ten_items = r2_max_dict_sorted[:10]
last_ten_items = r2_max_dict_sorted[-10:]
# Convert lists back into dictionaries
moviesEasy={x[0]:x[1] for x in last_ten_items}
moviesHard={x[0]:x[1] for x in first_ten_items}
# Sort Dictionaries
moviesEasy_sorted = sorted(moviesEasy.items(), key=lambda x: x[1], reverse=True)
moviesHard_sorted = sorted(moviesHard.items(), key=lambda x: x[1], reverse=True)
# Convert lists back into dictionaries
moviesEasy={x[0]:x[1] for x in moviesEasy_sorted}
moviesHard={x[0]:x[1] for x in moviesHard_sorted}
# Initialize lists for the COD table
movie_name = []
COD = []
pred_movie_name = []
# Acquire keys and values from moviesEasy and moviesHard
for key, value in moviesEasy.items():
    name_split = key.split(' vs ', 1)
    movie_name.append(name_split[0])
    COD.append(value)
    pred_movie_name.append(name_split[1])
for key, value in moviesHard.items():
    name_split = key.split(' vs ', 1)
    movie_name.append(name_split[0])
    COD.append(value)
    pred_movie_name.append(name_split[1])
# Build the COD table as a dataframe
table_COD = pd.DataFrame({'Movie': movie_name, 'COD': COD, 'Predictor Movie': pred_movie_name})
display(table_COD)
# Create a new dataframe with the 20 movies and their best predicting movies, plus the 3 personality columns;
# each movie's model uses its best predicting movie and the personality columns
pers_df = data.iloc[:,474:477]
pers_df = pers_df.drop(896) # drop participant with no ratings here as well
pers_df = pers_df.reset_index(drop=True)
movie_df = movieRatings.loc[:,['Erik the Viking (1989)','I.Q. (1994)','The Lookout (2007)','Patton (1970)',
'The Bandit (1996)','Best Laid Plans (1999)','Congo (1995)','The Straight Story (1999)',
'The Final Conflict (1981)','Heavy Traffic (1973)','Grown Ups 2 (2013)','The Fast and the Furious (2001)',
'13 Going on 30 (2004)','Titanic (1997)','La La Land (2016)','The Cabin in the Woods (2012)',
'Clueless (1995)','Black Swan (2010)','Interstellar (2014)','Avatar (2009)',
'Ran (1985)','The Core (2003)','Terminator 3: Rise of the Machines (2003)',
"Can't Hardly Wait (1998)",'Cocktail (1988)','The Evil Dead (1981)','Escape from LA (1996)',
'Sorority Boys (2002)','Torque (2004)','Bad Boys (1995)']]
mulReg_df = pd.concat([movie_df, pers_df], axis=1) # merge columns, 1096 rows
# display(mulReg_df.shape)
# check for null values
# print(mulReg_df.isnull().sum())
mulReg_df = mulReg_df.dropna().reset_index(drop=True) # drop null value rows, 23 rows
# display(mulReg_df.shape)
# drop rows where personality columns were -1, 9 rows
mulReg_df = mulReg_df.loc[mulReg_df["Are you an only child? (1: Yes; 0: No; -1: Did not respond)"] != -1 ]
mulReg_df = mulReg_df.loc[mulReg_df["Movies are best enjoyed alone (1: Yes; 0: No; -1: Did not respond)"] != -1]
mulReg_df = mulReg_df.reset_index(drop=True)
# display(mulReg_df.shape)
# create dummy columns for gender identity
mulReg_df = pd.get_dummies(data=mulReg_df, columns=['Gender identity (1 = female; 2 = male; 3 = self-described)'], dummy_na=False)
mulReg_df = mulReg_df.drop(columns=['Gender identity (1 = female; 2 = male; 3 = self-described)_3.0'])
mulReg_df = mulReg_df.rename(columns={'Gender identity (1 = female; 2 = male; 3 = self-described)_1.0': 'Gender identity_Female',
'Gender identity (1 = female; 2 = male; 3 = self-described)_2.0': 'Gender identity_Male',
'Are you an only child? (1: Yes; 0: No; -1: Did not respond)': 'Are you an only child? (1: Yes; 0: No)',
'Movies are best enjoyed alone (1: Yes; 0: No; -1: Did not respond)': 'Movies are best enjoyed alone (1: Yes; 0: No)'})
display(mulReg_df) #1064 rows, 34 cols
# Demographic predictors shared across all 20 models
x1_f = mulReg_df['Gender identity_Female'].to_numpy()
x1_m = mulReg_df['Gender identity_Male'].to_numpy()
x2 = mulReg_df['Are you an only child? (1: Yes; 0: No)'].to_numpy()
x3 = mulReg_df['Movies are best enjoyed alone (1: Yes; 0: No)'].to_numpy()
# The 20 outcome movies paired with their best predicting movies (same order as table_COD)
movie_pairs = [
    ('Erik the Viking (1989)', 'I.Q. (1994)'),
    ('I.Q. (1994)', 'Erik the Viking (1989)'),
    ('The Lookout (2007)', 'Patton (1970)'),
    ('Patton (1970)', 'The Lookout (2007)'),
    ('The Bandit (1996)', 'Best Laid Plans (1999)'),
    ('Best Laid Plans (1999)', 'The Bandit (1996)'),
    ('Congo (1995)', 'The Straight Story (1999)'),
    ('The Straight Story (1999)', 'Congo (1995)'),
    ('The Final Conflict (1981)', 'The Lookout (2007)'),
    ('Heavy Traffic (1973)', 'Ran (1985)'),
    ('Grown Ups 2 (2013)', 'The Core (2003)'),
    ('The Fast and the Furious (2001)', 'Terminator 3: Rise of the Machines (2003)'),
    ('13 Going on 30 (2004)', "Can't Hardly Wait (1998)"),
    ('Titanic (1997)', 'Cocktail (1988)'),
    ('La La Land (2016)', 'The Lookout (2007)'),
    ('The Cabin in the Woods (2012)', 'The Evil Dead (1981)'),
    ('Clueless (1995)', 'Escape from LA (1996)'),
    ('Black Swan (2010)', 'Sorority Boys (2002)'),
    ('Interstellar (2014)', 'Torque (2004)'),
    ('Avatar (2009)', 'Bad Boys (1995)'),
]
new_r2_list = []
# For each movie, fit a multiple regression on its best predicting movie plus the gender dummies, only-child, and enjoyed-alone columns
for outcome_name, predictor_name in movie_pairs:
    y = mulReg_df[outcome_name].to_numpy()
    x = np.column_stack((mulReg_df[predictor_name].to_numpy(), x1_f, x1_m, x2, x3)) # predictors
    reg = LinearRegression().fit(x, y) # linear regression with predictors and outcome y
    y_hat = reg.predict(x)
    new_r2_list.append(r2_score(y, y_hat))
# Build the new COD table as a dataframe
table_COD2 = pd.DataFrame({'Movie': movie_name, 'New COD': new_r2_list, 'Predictor Movie': pred_movie_name})
# display(table_COD)
display(table_COD2)
# Scatter of new vs old R^2 for all 20 movies
plt.figure()
plt.scatter(np.array(COD), np.array(new_r2_list), color='white', edgecolor='black', s=100)
plt.title(r'New $\mathregular{R^{2}}$ vs Old $\mathregular{R^{2}}$', fontsize=20)
plt.ylabel(r'New $\mathregular{R^{2}}$', fontsize=15)
plt.xlabel(r'Old $\mathregular{R^{2}}$', fontsize=15)
plt.show()
# Zoom in on the 10 hardest-to-predict movies
plt.figure()
plt.scatter(np.array(COD), np.array(new_r2_list), color='white', edgecolor='red', s=100)
plt.xlim(0.05, 0.2)
plt.ylim(0.07, 0.2)
plt.title('Hardest to Predict Movies', fontsize=20)
plt.ylabel(r'New $\mathregular{R^{2}}$', fontsize=15)
plt.xlabel(r'Old $\mathregular{R^{2}}$', fontsize=15)
plt.show()
# Zoom in on the 10 easiest-to-predict movies
plt.figure()
plt.scatter(np.array(COD), np.array(new_r2_list), color='white', edgecolor='green', s=100)
plt.xlim(0.68, 0.74)
plt.ylim(0.68, 0.74)
plt.title('Easiest to Predict Movies', fontsize=20)
plt.ylabel(r'New $\mathregular{R^{2}}$', fontsize=15)
plt.xlabel(r'Old $\mathregular{R^{2}}$', fontsize=15)
plt.show()
# subtract new COD values from old to find differences
array1 = np.array(COD)
array2 = np.array(new_r2_list)
subtracted_array = np.subtract(array1, array2)
subtracted = list(subtracted_array)
print(subtracted)
# Pick 30 movies in the middle of the COD range
mid_30_movies = r2_max_dict_sorted[185:215] # note: this window includes index 196, 'The Evil Dead (1981)', already used as a predictor in Q2; it could be excluded via r2_max_dict_sorted[185:196] + r2_max_dict_sorted[197:216]
moviesMid = {x[0]: x[1] for x in mid_30_movies} # convert list back into a dictionary
moviesMid_name = [] # initialize list
for key, value in moviesMid.items(): # extract the outcome movie name from each 'movie vs predictor' key
    name_split = key.split(' vs ', 1)
    moviesMid_name.append(name_split[0])
# Pick 10 other movies from the lower-middle of the COD range from Q1
other_movies_10 = r2_max_dict_sorted[50:60]
movies10 = {x[0]: x[1] for x in other_movies_10} # convert list back into a dictionary
movies10_name = [] # initialize list
for key, value in movies10.items(): # extract the outcome movie name from each key
    name_split = key.split(' vs ', 1)
    movies10_name.append(name_split[0])
# Create df for the 30 outcome movies (keeping movieRatings column order)
movies_30 = movieRatings[[m for m in movieRatings.columns if m in moviesMid_name]]
# Create df for the 10 predictor movies (keeping movieRatings column order)
movies_10 = movieRatings[[m for m in movieRatings.columns if m in movies10_name]]
# prepare dataframes for regression model
outcomes = movies_30
predictors = movies_10
# display(outcomes)
# display(predictors)
# create alpha range
parameters = {'alpha': np.arange(0, 200, 0.1)}
# set up gridsearchCV
model = Ridge()
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=10) #cross-validation method for gridsearch
ridge_grid = GridSearchCV(model, parameters, scoring='neg_mean_squared_error', cv=cv)
# initialize lists
rmse_rigGrid = []
movie_name_rigGrid = []
betas_rigGrid = []
alpha_rigGrid = []
# loop through 30 movies
for i in outcomes:
    Y = np.array(outcomes[i])
    X = np.array(predictors)
    # split data set into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
    # use grid search to split the training data into train/validation sets and find the best alpha hyperparameter
    ridge_grid.fit(x_train, y_train)
    best_alpha = ridge_grid.best_estimator_.alpha
    print("The best parameters of Grid Search are as follows: {} with a score of {}".format(ridge_grid.best_params_, np.round(ridge_grid.best_score_, 3)))
    # use the best alpha for calculating coefficients and RMSE
    ridgeReg = Ridge(alpha=best_alpha) # best alpha found for this model
    ridgeReg.fit(x_train, y_train)
    y_pred = ridgeReg.predict(x_test)
    # append values to lists
    movie_name_rigGrid.append(i)
    rmse_rigGrid.append(mean_squared_error(y_test, y_pred, squared=False)) # squared=False returns RMSE rather than MSE
    betas_rigGrid.append(ridgeReg.coef_)
    alpha_rigGrid.append(best_alpha)
# Build the RMSE table as a dataframe
table_RMSE_rigGrid = pd.DataFrame({'Movie': movie_name_rigGrid, 'RMSE': rmse_rigGrid, 'alpha': alpha_rigGrid})
display(table_RMSE_rigGrid)
# Create dataframe of coefficients (rows: outcome movies; columns: predictor movies)
betas_df_rigGrid = pd.DataFrame(betas_rigGrid, columns=predictors.columns, index=outcomes.columns)
display(betas_df_rigGrid)
avg_alpha = (np.array(alpha_rigGrid)).mean()
print('Average Value of Most Suitable Alphas: ', avg_alpha)
# initialize lists
rmse_rigGrid_avg = []
movie_name_rigGrid_avg = []
betas_rigGrid_avg = []
# loop through 30 movies
for i in outcomes:
    Y = np.array(outcomes[i])
    X = np.array(predictors)
    # split data set into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
    # use the average of the best alphas for calculating coefficients and RMSE
    ridgeReg = Ridge(alpha=avg_alpha) # avg of the best alphas found across all models
    ridgeReg.fit(x_train, y_train)
    y_pred = ridgeReg.predict(x_test)
    # append values to lists
    movie_name_rigGrid_avg.append(i)
    rmse_rigGrid_avg.append(mean_squared_error(y_test, y_pred, squared=False)) # RMSE
    betas_rigGrid_avg.append(ridgeReg.coef_)
# Build the RMSE table as a dataframe
table_RMSE_rigGrid_avg = pd.DataFrame({'Movie': movie_name_rigGrid_avg, 'RMSE': rmse_rigGrid_avg})
display(table_RMSE_rigGrid_avg)
# Create dataframe of coefficients
betas_df_rigGrid_avg = pd.DataFrame(betas_rigGrid_avg, columns=predictors.columns, index=outcomes.columns)
display(betas_df_rigGrid_avg)
# create alpha range (start just above 0: alpha=0 is not supported by Lasso's coordinate descent)
parameters = {'alpha': np.arange(0.00001, 0.01, 0.00001)}
# set up gridsearchCV
model = Lasso()
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=10) #cross-validation method for gridsearch
lasso_grid = GridSearchCV(model, parameters, scoring='neg_mean_squared_error', cv=cv)
# initialize lists
rmse_lasGrid = []
movie_name_lasGrid = []
betas_lasGrid = []
alpha_lasGrid = []
# loop through 30 movies
for i in outcomes:
    Y = np.array(outcomes[i])
    X = np.array(predictors)
    # split data set into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
    # use grid search to split the training data into train/validation sets and find the best alpha hyperparameter
    lasso_grid.fit(x_train, y_train)
    best_alpha = lasso_grid.best_estimator_.alpha
    print("The best parameters of Grid Search are as follows: {} with a score of {}".format(lasso_grid.best_params_, np.round(lasso_grid.best_score_, 3)))
    # use the best alpha for calculating coefficients and RMSE
    lassoReg = Lasso(alpha=best_alpha) # best alpha found for this model
    lassoReg.fit(x_train, y_train)
    y_pred = lassoReg.predict(x_test)
    # append values to lists
    movie_name_lasGrid.append(i)
    rmse_lasGrid.append(mean_squared_error(y_test, y_pred, squared=False)) # RMSE
    betas_lasGrid.append(lassoReg.coef_)
    alpha_lasGrid.append(best_alpha)
# Build the RMSE table as a dataframe
table_RMSE_lasGrid = pd.DataFrame({'Movie': movie_name_lasGrid, 'RMSE': rmse_lasGrid, 'alpha': alpha_lasGrid})
display(table_RMSE_lasGrid)
# Create dataframe of coefficients
betas_df_lasGrid = pd.DataFrame(betas_lasGrid, columns=predictors.columns, index=outcomes.columns)
display(betas_df_lasGrid)
avg_alpha = (np.array(alpha_lasGrid)).mean()
print('Average Value of Most Suitable Alphas: ', avg_alpha)
# initialize lists
rmse_lasGrid_avg = []
movie_name_lasGrid_avg = []
betas_lasGrid_avg = []
# loop through 30 movies
for i in outcomes:
    Y = np.array(outcomes[i])
    X = np.array(predictors)
    # split data set into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
    # use the average of the best alphas for calculating coefficients and RMSE
    lassoReg = Lasso(alpha=avg_alpha) # avg of the best alphas found across all models
    lassoReg.fit(x_train, y_train)
    y_pred = lassoReg.predict(x_test)
    # append values to lists
    movie_name_lasGrid_avg.append(i)
    rmse_lasGrid_avg.append(mean_squared_error(y_test, y_pred, squared=False)) # RMSE
    betas_lasGrid_avg.append(lassoReg.coef_)
# Build the RMSE table as a dataframe
table_RMSE_lasGrid_avg = pd.DataFrame({'Movie': movie_name_lasGrid_avg, 'RMSE': rmse_lasGrid_avg})
display(table_RMSE_lasGrid_avg)
# Create dataframe of coefficients
betas_df_lasGrid_avg = pd.DataFrame(betas_lasGrid_avg, columns=predictors.columns, index=outcomes.columns)
display(betas_df_lasGrid_avg)
# subtract lasso RMSE from ridge RMSE to find differences
array1 = np.array(rmse_rigGrid_avg)
array2 = np.array(rmse_lasGrid_avg)
subtracted_array = np.subtract(array1, array2)
subtracted = list(subtracted_array)
print(subtracted) # per the original run, 16 of the 30 RMSE values increased when moving from ridge to lasso
# Compute average movie enjoyment for each user
enjoy_users = data_ratings
# Predictor Variable X
X = np.array(enjoy_users.mean(axis=1))
X = X.reshape(-1,1)
# print(X.shape)
# Compute average movie rating and sort movies by it
movies_avg_rating = enjoy_users.mean(axis=0) # get the avg of ratings for each movie
mar_append = pd.concat([enjoy_users, movies_avg_rating.to_frame().T], ignore_index=True) # append the averages as a final row
sorted_movies = mar_append.sort_values(mar_append.last_valid_index(), axis=1) # sort columns by average rating
sorted_movies = sorted_movies.iloc[:1096, 198:202] # 4 movies in the middle of the range; drop the averages row
# display(sorted_movies)
# find median value for each movie
movies_4 = movieRatings.loc[:,['Fahrenheit 9/11 (2004)','Happy Gilmore (1996)','Diamonds are Forever (1971)', 'Scream (1996)']]
movies_4_med = movies_4.median(axis=0) # get the median of ratings for each movie
# display(movies_4_med)
# Median-split each movie: ratings below the movie's median are coded 0, the rest 1
movies_4["Fahrenheit 9/11 (2004)"] = np.where(movies_4["Fahrenheit 9/11 (2004)"] < movies_4_med.iloc[0], 0, 1)
movies_4["Happy Gilmore (1996)"] = np.where(movies_4["Happy Gilmore (1996)"] < movies_4_med.iloc[1], 0, 1)
movies_4["Diamonds are Forever (1971)"] = np.where(movies_4["Diamonds are Forever (1971)"] < movies_4_med.iloc[2], 0, 1)
movies_4["Scream (1996)"] = np.where(movies_4["Scream (1996)"] < movies_4_med.iloc[3], 0, 1)
outcomes = movies_4
for i in outcomes:
    Y = np.array(outcomes[i])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
    print(i)
    logReg = LogisticRegression()
    logReg.fit(x_train, y_train)
    y_pred = logReg.predict(x_test)
    print("Model Coef: ", logReg.coef_)
    print("Accuracy of the model is: %.3f" % accuracy_score(y_test, y_pred))
    # use predicted probabilities rather than hard labels so the ROC curve traces out all thresholds
    y_prob = logReg.predict_proba(x_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    random_classifier = np.linspace(0.0, 1.0, 100)
    plt.figure(figsize=(12, 8))
    plt.plot(fpr, tpr, color="purple")
    plt.plot(random_classifier, random_classifier, 'r--')
    plt.xlabel("FPR", fontsize=25)
    plt.ylabel("TPR", fontsize=25)
    plt.title("ROC Curve: " + str(i), fontsize=30)
    plt.show()
    print("Area under ROC Curve (AUROC): ", auc(fpr, tpr))
    ## Confusion matrix for the predicted outputs
    conf_matrix_sk = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_sk, display_labels=logReg.classes_)
    disp.plot()
    plt.title("Confusion Matrix: " + str(i), fontsize=15)
    plt.grid()
    plt.show()
    print('\n \n')
# average enjoyment is a good predictor of whether or not users would enjoy these 4 movies
# Make df with 4 movies of interest
horror_movies_df = movieRatings.loc[:,['The Blair Witch Project (1999)', 'Ouija: Origin of Evil (2016)',
'Shutter Island (2010)', 'The Exorcist (1973)']]
# Make df with personality/sensation responses of interest
user_responses_df = data.loc[:,['I enjoy watching horror movies', 'I like to be surprised even if it startles or scares me',
'I enjoy haunted houses', 'Is emotionally stable/not easily upset', 'Can be cold and aloof',
'No risk - No fun', 'I had a sheltered upbringing', 'Has an active imagination', 'Remains calm in tense situations',
'Is depressed/Blue']]
user_responses_df = user_responses_df.drop(896) # drop null user
user_responses_df = user_responses_df.reset_index(drop=True)
lasso_horror_df = pd.concat([horror_movies_df, user_responses_df], axis=1) # merge columns, 1096 rows
# display(lasso_horror_df)
# check for null values
# print(lasso_horror_df.isnull().sum())
lasso_horror_df = lasso_horror_df.dropna().reset_index(drop=True) # drop null value rows, 20 rows
# display(lasso_horror_df)
# check responses for anything outside of normal range
#lasso_horror_df.loc[(lasso_horror_df["I enjoy haunted houses"] < 1.0) | (lasso_horror_df["I enjoy haunted houses"] > 5.0)]
# split data frames up into outcomes and predictors, now have same length row-wise
outcomes = lasso_horror_df.iloc[:,0:4].reset_index(drop=True)
predictors = lasso_horror_df.iloc[:,4:14].reset_index(drop=True)
display(outcomes)
display(predictors)
# multiple features, so check the predictors for multicollinearity
plt.figure(figsize=(16, 5))
sns.heatmap(predictors.corr(), annot=True)
plt.show()
# create alpha range (start just above 0: alpha=0 is not valid for Lasso)
parameters = {'alpha': np.arange(0.0001, 1, 0.0001)}
# set up gridsearchCV
model = Lasso()
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=10) #cross-validation method for gridsearch
ec_lasso_grid = GridSearchCV(model, parameters, scoring='neg_mean_squared_error', cv=cv)
# initialize lists
ec_rmse_lasGrid = []
ec_movie_name_lasGrid = []
ec_betas_lasGrid = []
ec_alpha_lasGrid = []
# loop through 4 movies
for i in outcomes:
    Y = np.array(outcomes[i])
    X = np.array(predictors)
    # split data set into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
    # use grid search to split the training data into train/validation sets and find the best alpha hyperparameter
    ec_lasso_grid.fit(x_train, y_train)
    best_alpha = ec_lasso_grid.best_estimator_.alpha
    print("The best parameters of Grid Search are as follows: {} with a score of {}".format(ec_lasso_grid.best_params_, np.round(ec_lasso_grid.best_score_, 3)))
    # use the best alpha for calculating coefficients and RMSE
    ec_lassoReg = Lasso(alpha=best_alpha) # best alpha found for this model
    ec_lassoReg.fit(x_train, y_train)
    y_pred = ec_lassoReg.predict(x_test)
    # append values to lists
    ec_movie_name_lasGrid.append(i)
    ec_rmse_lasGrid.append(mean_squared_error(y_test, y_pred, squared=False)) # RMSE
    ec_betas_lasGrid.append(ec_lassoReg.coef_) # was lassoReg.coef_, which referenced the earlier model
    ec_alpha_lasGrid.append(best_alpha)
# Build the RMSE table as a dataframe
ec_table_RMSE_lasGrid = pd.DataFrame({'Movie': ec_movie_name_lasGrid, 'RMSE': ec_rmse_lasGrid, 'alpha': ec_alpha_lasGrid})
display(ec_table_RMSE_lasGrid)
# Create dataframe of coefficients
ec_betas_df_lasGrid = pd.DataFrame(ec_betas_lasGrid, columns=predictors.columns, index=outcomes.columns)
display(ec_betas_df_lasGrid)