Applying Machine Learning Methods to Movie Ratings Data
By Mary Nwangwu
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, LeaveOneOut, LeavePOut, validation_curve, learning_curve, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve, auc, roc_curve
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.model_selection import ShuffleSplit
import warnings
warnings.filterwarnings('ignore')
# Load movie ratings data set
data = pd.read_csv('movieReplicationSet.csv')
# Create dataframe with only movie ratings
data_ratings = data.iloc[:,0:400]
# Check for any rows or columns that had ALL null values
participant_null = data_ratings[data_ratings.isna().all(axis=1)]
movie_null = (data_ratings.isna().all()).sum()
print("Participants with no ratings: ")
display(participant_null)
print("Movies with no ratings: " + str(movie_null))
# drop user with all null values
data_ratings = data_ratings.drop(896) #user 896
data_ratings = data_ratings.reset_index(drop=True)
display(data_ratings)
# Calculate the average ratings for each movie and for each participant to fill in null values
avgRatings_movies = np.array(data_ratings.mean(axis=0)) # mean across rows, return 400 values (400,)
avgRatings_participant = np.array(data_ratings.mean(axis=1)) # mean across columns, return 1096 values (1096,)
# print(avgRatings_movies.shape) #(400,)
# print(avgRatings_participant.shape) #(1096,)
# Create new dataframe where every element is the average rating of the corresponding movie and participant
avgRatings_movies_2 = np.vstack([avgRatings_movies]*1096) # stack avg movie rating array vertically, returns 1096 duplicated rows of the 400 avg movie ratings
avgRatings_participant_2 = np.transpose(np.vstack([avgRatings_participant]*400)) # stack avg participant rating array vertically (400 duplicated rows of the 1096 participant averages), then transpose
# print(np.isnan(avgRatings_movies_2).sum()) # sanity check on the (1096,400) grid: expect 0 nulls
# print(np.isnan(avgRatings_participant_2).sum()) # sanity check on the (1096,400) grid: expect 0 nulls
avgRatings_df = pd.DataFrame((avgRatings_movies_2 + avgRatings_participant_2) / 2)
# print(avgRatings_df.shape) #(1096,400)
data2 = data_ratings
# print(data2.shape) #(1096,400)
# display(avgRatings_df)
# display(data2)
# convert column headings in avgRatings_df to the same headings as in data2
movie_names = list(data2)
avgRatings_df.columns = movie_names
# print(avgRatings_df.shape) #(1096,400)
# overlay the two dataframes: nulls in the first are filled with the corresponding values from the second, yielding a fully filled df
movieRatings_fill = data2.combine_first(avgRatings_df)
# print(movieRatings_fill.shape) #(1096,400)
# display(movieRatings_fill)
movieRatings = movieRatings_fill
display(movieRatings)
# Create dictionary to obtain COD values for each movie combo
r2_max_dict = {}
# Iterate through all movie combinations, calculate COD
for i in movieRatings:
    r2_dict = {}
    y = movieRatings[i].to_numpy()
    for j in movieRatings:
        if j != i:
            x = movieRatings[j].to_numpy()
            reg = LinearRegression().fit(x.reshape(-1,1), y)
            y_hat = reg.predict(x.reshape(-1,1))
            r2 = r2_score(y, y_hat)
            # Add COD values to dictionary
            r2_dict[str(i) + ' vs ' + str(j)] = r2
    # Add the max COD value for movie i to a new dictionary
    r2_max_dict[max(r2_dict, key=r2_dict.get)] = max(r2_dict.values())
# Average COD of 400 Simple Linear Regression Models
avgCOD_SLR = sum(r2_max_dict.values()) / len(r2_max_dict)
print("The average COD of the 400 simple linear regression models is: " + str(avgCOD_SLR))
# Plot 400 COD values on histogram
f = plt.figure(figsize = (5,3), dpi = 200)
plt.title('CODs for 400 Simple Linear Regression Models', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.xlabel('COD Value', fontsize=15)
plt.vlines(avgCOD_SLR, ymin=0, ymax=25, colors='#AAFF32', label='Mean')
plt.legend()
plt.hist(r2_max_dict.values(), bins=50, color='palevioletred', edgecolor='black', linewidth=0.5);
# Sort the COD max dictionary
r2_max_dict_sorted = sorted(r2_max_dict.items(), key=lambda x: x[1])
# Select the 10 movies with the lowest CODs (hardest to predict) and the 10 with the highest (easiest to predict)
first_ten_items = r2_max_dict_sorted[:10]
last_ten_items = r2_max_dict_sorted[-10:]
# Convert lists back into dictionaries
moviesEasy={x[0]:x[1] for x in last_ten_items}
moviesHard={x[0]:x[1] for x in first_ten_items}
# Sort Dictionaries
moviesEasy_sorted = sorted(moviesEasy.items(), key=lambda x: x[1], reverse=True)
moviesHard_sorted = sorted(moviesHard.items(), key=lambda x: x[1], reverse=True)
# Convert lists back into dictionaries
moviesEasy={x[0]:x[1] for x in moviesEasy_sorted}
moviesHard={x[0]:x[1] for x in moviesHard_sorted}
# Initialize lists for the COD table
movie_name = []
COD = []
pred_movie_name = []
# Acquire keys and values from moviesEasy and moviesHard
for key, value in moviesEasy.items():
    name_split = key.split(' vs ', 1)
    movie_name.append(name_split[0])
    COD.append(value)
    pred_movie_name.append(name_split[1])
for key, value in moviesHard.items():
    name_split = key.split(' vs ', 1)
    movie_name.append(name_split[0])
    COD.append(value)
    pred_movie_name.append(name_split[1])
# Build the COD table as a dataframe
table_COD = pd.DataFrame({'Movie': movie_name, 'COD': COD, 'Predictor Movie': pred_movie_name})
display(table_COD)
# Create a new dataframe with the 20 movies and their best predicting movies, plus the 3 personality columns;
# each movie's model uses its best predicting movie and the personality columns
pers_df = data.iloc[:,474:477]
pers_df = pers_df.drop(896) # drop participant with no ratings here as well
pers_df = pers_df.reset_index(drop=True)
movie_df = movieRatings.loc[:,['Erik the Viking (1989)','I.Q. (1994)','The Lookout (2007)','Patton (1970)',
'The Bandit (1996)','Best Laid Plans (1999)','Congo (1995)','The Straight Story (1999)',
'The Final Conflict (1981)','Heavy Traffic (1973)','Grown Ups 2 (2013)','The Fast and the Furious (2001)',
'13 Going on 30 (2004)','Titanic (1997)','La La Land (2016)','The Cabin in the Woods (2012)',
'Clueless (1995)','Black Swan (2010)','Interstellar (2014)','Avatar (2009)',
'Ran (1985)','The Core (2003)','Terminator 3: Rise of the Machines (2003)',
"Can't Hardly Wait (1998)",'Cocktail (1988)','The Evil Dead (1981)','Escape from LA (1996)',
'Sorority Boys (2002)','Torque (2004)','Bad Boys (1995)']]
mulReg_df = pd.concat([movie_df, pers_df], axis=1) # merge columns, 1096 rows
# display(mulReg_df.shape)
# check for null values
# print(mulReg_df.isnull().sum())
mulReg_df = mulReg_df.dropna().reset_index(drop=True) # drop null value rows, 23 rows
# display(mulReg_df.shape)
# drop rows where personality columns were -1, 9 rows
mulReg_df = mulReg_df.loc[mulReg_df["Are you an only child? (1: Yes; 0: No; -1: Did not respond)"] != -1 ]
mulReg_df = mulReg_df.loc[mulReg_df["Movies are best enjoyed alone (1: Yes; 0: No; -1: Did not respond)"] != -1]
mulReg_df = mulReg_df.reset_index(drop=True)
# display(mulReg_df.shape)
# create dummy columns for gender identity
mulReg_df = pd.get_dummies(data=mulReg_df, columns=['Gender identity (1 = female; 2 = male; 3 = self-described)'], dummy_na=False)
mulReg_df = mulReg_df.drop(columns=['Gender identity (1 = female; 2 = male; 3 = self-described)_3.0'])
mulReg_df = mulReg_df.rename(columns={'Gender identity (1 = female; 2 = male; 3 = self-described)_1.0': 'Gender identity_Female',
'Gender identity (1 = female; 2 = male; 3 = self-described)_2.0': 'Gender identity_Male',
'Are you an only child? (1: Yes; 0: No; -1: Did not respond)': 'Are you an only child? (1: Yes; 0: No)',
'Movies are best enjoyed alone (1: Yes; 0: No; -1: Did not respond)': 'Movies are best enjoyed alone (1: Yes; 0: No)'})
display(mulReg_df) #1064 rows, 34 cols
# Demographic predictors shared across all 20 models
x1_f = mulReg_df['Gender identity_Female'].to_numpy()
x1_m = mulReg_df['Gender identity_Male'].to_numpy()
x2 = mulReg_df['Are you an only child? (1: Yes; 0: No)'].to_numpy()
x3 = mulReg_df['Movies are best enjoyed alone (1: Yes; 0: No)'].to_numpy()
# The 20 outcome movies paired with their best predicting movies (same order as table_COD)
movie_pairs = [
    ('Erik the Viking (1989)', 'I.Q. (1994)'),
    ('I.Q. (1994)', 'Erik the Viking (1989)'),
    ('The Lookout (2007)', 'Patton (1970)'),
    ('Patton (1970)', 'The Lookout (2007)'),
    ('The Bandit (1996)', 'Best Laid Plans (1999)'),
    ('Best Laid Plans (1999)', 'The Bandit (1996)'),
    ('Congo (1995)', 'The Straight Story (1999)'),
    ('The Straight Story (1999)', 'Congo (1995)'),
    ('The Final Conflict (1981)', 'The Lookout (2007)'),
    ('Heavy Traffic (1973)', 'Ran (1985)'),
    ('Grown Ups 2 (2013)', 'The Core (2003)'),
    ('The Fast and the Furious (2001)', 'Terminator 3: Rise of the Machines (2003)'),
    ('13 Going on 30 (2004)', "Can't Hardly Wait (1998)"),
    ('Titanic (1997)', 'Cocktail (1988)'),
    ('La La Land (2016)', 'The Lookout (2007)'),
    ('The Cabin in the Woods (2012)', 'The Evil Dead (1981)'),
    ('Clueless (1995)', 'Escape from LA (1996)'),
    ('Black Swan (2010)', 'Sorority Boys (2002)'),
    ('Interstellar (2014)', 'Torque (2004)'),
    ('Avatar (2009)', 'Bad Boys (1995)'),
]
new_r2_list = []
# For each movie, fit a multiple regression on its best predicting movie plus the gender dummies, only-child, and enjoyed-alone columns
for outcome_name, predictor_name in movie_pairs:
    y = mulReg_df[outcome_name].to_numpy()
    x = np.column_stack((mulReg_df[predictor_name].to_numpy(), x1_f, x1_m, x2, x3)) # predictors
    reg = LinearRegression().fit(x, y) # linear regression with predictors and outcome y
    y_hat = reg.predict(x)
    new_r2_list.append(r2_score(y, y_hat))
# Build the new COD table as a dataframe
table_COD2 = pd.DataFrame({'Movie': movie_name, 'New COD': new_r2_list, 'Predictor Movie': pred_movie_name})
# display(table_COD)
display(table_COD2)
# Scatter of new vs old R^2 for all 20 movies
plt.figure()
plt.scatter(np.array(COD), np.array(new_r2_list), color='white', edgecolor='black', s=100)
plt.title(r'New $\mathregular{R^{2}}$ vs Old $\mathregular{R^{2}}$', fontsize=20)
plt.ylabel(r'New $\mathregular{R^{2}}$', fontsize=15)
plt.xlabel(r'Old $\mathregular{R^{2}}$', fontsize=15)
plt.show()
# Zoom in on the 10 hardest-to-predict movies
plt.figure()
plt.scatter(np.array(COD), np.array(new_r2_list), color='white', edgecolor='red', s=100)
plt.xlim(0.05, 0.2)
plt.ylim(0.07, 0.2)
plt.title('Hardest to Predict Movies', fontsize=20)
plt.ylabel(r'New $\mathregular{R^{2}}$', fontsize=15)
plt.xlabel(r'Old $\mathregular{R^{2}}$', fontsize=15)
plt.show()
# Zoom in on the 10 easiest-to-predict movies
plt.figure()
plt.scatter(np.array(COD), np.array(new_r2_list), color='white', edgecolor='green', s=100)
plt.xlim(0.68, 0.74)
plt.ylim(0.68, 0.74)
plt.title('Easiest to Predict Movies', fontsize=20)
plt.ylabel(r'New $\mathregular{R^{2}}$', fontsize=15)
plt.xlabel(r'Old $\mathregular{R^{2}}$', fontsize=15)
plt.show()
# subtract new COD values from old to find differences
array1 = np.array(COD)
array2 = np.array(new_r2_list)
subtracted_array = np.subtract(array1, array2)
subtracted = list(subtracted_array)
print(subtracted)
# Pick 30 movies in the middle of the COD range
mid_30_movies = r2_max_dict_sorted[185:215] # note: this window includes index 196, 'The Evil Dead (1981)', already used as a predictor in Q2; it could be excluded via r2_max_dict_sorted[185:196] + r2_max_dict_sorted[197:216]
moviesMid = {x[0]: x[1] for x in mid_30_movies} # convert list back into a dictionary
moviesMid_name = [] # initialize list
for key, value in moviesMid.items(): # extract the outcome movie name from each 'movie vs predictor' key
    name_split = key.split(' vs ', 1)
    moviesMid_name.append(name_split[0])
# Pick 10 other movies from the lower-middle of the COD range from Q1
other_movies_10 = r2_max_dict_sorted[50:60]
movies10 = {x[0]: x[1] for x in other_movies_10} # convert list back into a dictionary
movies10_name = [] # initialize list
for key, value in movies10.items(): # extract the outcome movie name from each key
    name_split = key.split(' vs ', 1)
    movies10_name.append(name_split[0])
# Create df for the 30 outcome movies (keeping movieRatings column order)
movies_30 = movieRatings[[m for m in movieRatings.columns if m in moviesMid_name]]
# Create df for the 10 predictor movies (keeping movieRatings column order)
movies_10 = movieRatings[[m for m in movieRatings.columns if m in movies10_name]]
# prepare dataframes for regression model
outcomes = movies_30
predictors = movies_10
# display(outcomes)
# display(predictors)
# create alpha range
parameters = {'alpha': np.arange(0, 200, 0.1)}
# set up gridsearchCV
model = Ridge()
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=10) #cross-validation method for gridsearch
ridge_grid = GridSearchCV(model, parameters, scoring='neg_mean_squared_error', cv=cv)
# initialize lists
rmse_rigGrid = []
movie_name_rigGrid = []
betas_rigGrid = []
alpha_rigGrid = []
# loop through 30 movies
for i in outcomes:
    Y = np.array(outcomes[i])
    X = np.array(predictors)
    # split data set into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
    # use grid search to split the training data into train/validation sets and find the best alpha hyperparameter
    ridge_grid.fit(x_train, y_train)
    best_alpha = ridge_grid.best_estimator_.alpha
    print("The best parameters of Grid Search are as follows: {} with a score of {}".format(ridge_grid.best_params_, np.round(ridge_grid.best_score_, 3)))
    # use the best alpha for calculating coefficients and RMSE
    ridgeReg = Ridge(alpha=best_alpha) # best alpha found for this model
    ridgeReg.fit(x_train, y_train)
    y_pred = ridgeReg.predict(x_test)
    # append values to lists
    movie_name_rigGrid.append(i)
    rmse_rigGrid.append(mean_squared_error(y_test, y_pred, squared=False)) # squared=False returns RMSE rather than MSE
    betas_rigGrid.append(ridgeReg.coef_)
    alpha_rigGrid.append(best_alpha)
# Build the RMSE table as a dataframe
table_RMSE_rigGrid = pd.DataFrame({'Movie': movie_name_rigGrid, 'RMSE': rmse_rigGrid, 'alpha': alpha_rigGrid})
display(table_RMSE_rigGrid)
# Create dataframe of coefficients (rows: outcome movies; columns: predictor movies)
betas_df_rigGrid = pd.DataFrame(betas_rigGrid, columns=predictors.columns, index=outcomes.columns)
display(betas_df_rigGrid)
avg_alpha = (np.array(alpha_rigGrid)).mean()
print('Average Value of Most Suitable Alphas: ', avg_alpha)
# initialize lists
rmse_rigGrid_avg = []
movie_name_rigGrid_avg = []
betas_rigGrid_avg = []
# loop through 30 movies
for i in outcomes:
    Y = np.array(outcomes[i])
    X = np.array(predictors)
    # split data set into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
    # use the average of the best alphas for calculating coefficients and RMSE
    ridgeReg = Ridge(alpha=avg_alpha) # avg of the best alphas found across all models
    ridgeReg.fit(x_train, y_train)
    y_pred = ridgeReg.predict(x_test)
    # append values to lists
    movie_name_rigGrid_avg.append(i)
    rmse_rigGrid_avg.append(mean_squared_error(y_test, y_pred, squared=False)) # RMSE
    betas_rigGrid_avg.append(ridgeReg.coef_)
# Build the RMSE table as a dataframe
table_RMSE_rigGrid_avg = pd.DataFrame({'Movie': movie_name_rigGrid_avg, 'RMSE': rmse_rigGrid_avg})
display(table_RMSE_rigGrid_avg)
# Create dataframe of coefficients
betas_df_rigGrid_avg = pd.DataFrame(betas_rigGrid_avg, columns=predictors.columns, index=outcomes.columns)
display(betas_df_rigGrid_avg)
# create alpha range (start just above 0: alpha=0 is not supported by Lasso's coordinate descent)
parameters = {'alpha': np.arange(0.00001, 0.01, 0.00001)}
# set up gridsearchCV
model = Lasso()
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=10) #cross-validation method for gridsearch
lasso_grid = GridSearchCV(model, parameters, scoring='neg_mean_squared_error', cv=cv)
# initialize lists
rmse_lasGrid = []
movie_name_lasGrid = []
betas_lasGrid = []
alpha_lasGrid = []
# loop through 30 movies
for i in outcomes:
    Y = np.array(outcomes[i])
    X = np.array(predictors)
    # split data set into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
    # use grid search to split the training data into train/validation sets and find the best alpha hyperparameter
    lasso_grid.fit(x_train, y_train)
    best_alpha = lasso_grid.best_estimator_.alpha
    print("The best parameters of Grid Search are as follows: {} with a score of {}".format(lasso_grid.best_params_, np.round(lasso_grid.best_score_, 3)))
    # use the best alpha for calculating coefficients and RMSE
    lassoReg = Lasso(alpha=best_alpha) # best alpha found for this model
    lassoReg.fit(x_train, y_train)
    y_pred = lassoReg.predict(x_test)
    # append values to lists
    movie_name_lasGrid.append(i)
    rmse_lasGrid.append(mean_squared_error(y_test, y_pred, squared=False)) # RMSE
    betas_lasGrid.append(lassoReg.coef_)
    alpha_lasGrid.append(best_alpha)
# Build the RMSE table as a dataframe
table_RMSE_lasGrid = pd.DataFrame({'Movie': movie_name_lasGrid, 'RMSE': rmse_lasGrid, 'alpha': alpha_lasGrid})
display(table_RMSE_lasGrid)
# Create dataframe of coefficients
betas_df_lasGrid = pd.DataFrame(betas_lasGrid, columns=predictors.columns, index=outcomes.columns)
display(betas_df_lasGrid)
avg_alpha = (np.array(alpha_lasGrid)).mean()
print('Average Value of Most Suitable Alphas: ', avg_alpha)
# initialize lists
rmse_lasGrid_avg = []
movie_name_lasGrid_avg = []
betas_lasGrid_avg = []
# loop through 30 movies
for i in outcomes:
    Y = np.array(outcomes[i])
    X = np.array(predictors)
    # split data set into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
    # use the average of the best alphas for calculating coefficients and RMSE
    lassoReg = Lasso(alpha=avg_alpha) # avg of the best alphas found across all models
    lassoReg.fit(x_train, y_train)
    y_pred = lassoReg.predict(x_test)
    # append values to lists
    movie_name_lasGrid_avg.append(i)
    rmse_lasGrid_avg.append(mean_squared_error(y_test, y_pred, squared=False)) # RMSE
    betas_lasGrid_avg.append(lassoReg.coef_)
# Build the RMSE table as a dataframe
table_RMSE_lasGrid_avg = pd.DataFrame({'Movie': movie_name_lasGrid_avg, 'RMSE': rmse_lasGrid_avg})
display(table_RMSE_lasGrid_avg)
# Create dataframe of coefficients
betas_df_lasGrid_avg = pd.DataFrame(betas_lasGrid_avg, columns=predictors.columns, index=outcomes.columns)
display(betas_df_lasGrid_avg)
# subtract lasso RMSE from ridge RMSE to find differences
array1 = np.array(rmse_rigGrid_avg)
array2 = np.array(rmse_lasGrid_avg)
subtracted_array = np.subtract(array1, array2)
subtracted = list(subtracted_array)
print(subtracted) # per the original run, 16 of the 30 RMSE values increased when moving from ridge to lasso
# Compute average movie enjoyment for each user
enjoy_users = data_ratings
# Predictor Variable X
X = np.array(enjoy_users.mean(axis=1))
X = X.reshape(-1,1)
# print(X.shape)
# Compute average movie rating and sort movies by it
movies_avg_rating = enjoy_users.mean(axis=0) # get the avg of ratings for each movie
mar_append = pd.concat([enjoy_users, movies_avg_rating.to_frame().T], ignore_index=True) # append the averages as a final row
sorted_movies = mar_append.sort_values(mar_append.last_valid_index(), axis=1) # sort columns by average rating
sorted_movies = sorted_movies.iloc[:1096, 198:202] # 4 movies in the middle of the range; drop the averages row
# display(sorted_movies)
# find median value for each movie
movies_4 = movieRatings.loc[:,['Fahrenheit 9/11 (2004)','Happy Gilmore (1996)','Diamonds are Forever (1971)', 'Scream (1996)']]
movies_4_med = movies_4.median(axis=0) # get the median of ratings for each movie
# display(movies_4_med)
# Median-split each movie: ratings below the movie's median are coded 0, the rest 1
movies_4["Fahrenheit 9/11 (2004)"] = np.where(movies_4["Fahrenheit 9/11 (2004)"] < movies_4_med.iloc[0], 0, 1)
movies_4["Happy Gilmore (1996)"] = np.where(movies_4["Happy Gilmore (1996)"] < movies_4_med.iloc[1], 0, 1)
movies_4["Diamonds are Forever (1971)"] = np.where(movies_4["Diamonds are Forever (1971)"] < movies_4_med.iloc[2], 0, 1)
movies_4["Scream (1996)"] = np.where(movies_4["Scream (1996)"] < movies_4_med.iloc[3], 0, 1)
outcomes = movies_4
for i in outcomes:
    Y = np.array(outcomes[i])
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
    print(i)
    logReg = LogisticRegression()
    logReg.fit(x_train, y_train)
    y_pred = logReg.predict(x_test)
    print("Model Coef: ", logReg.coef_)
    print("Accuracy of the model is: %.3f" % accuracy_score(y_test, y_pred))
    # use predicted probabilities rather than hard labels so the ROC curve traces out all thresholds
    y_prob = logReg.predict_proba(x_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_prob)
    random_classifier = np.linspace(0.0, 1.0, 100)
    plt.figure(figsize=(12, 8))
    plt.plot(fpr, tpr, color="purple")
    plt.plot(random_classifier, random_classifier, 'r--')
    plt.xlabel("FPR", fontsize=25)
    plt.ylabel("TPR", fontsize=25)
    plt.title("ROC Curve: " + str(i), fontsize=30)
    plt.show()
    print("Area under ROC Curve (AUROC): ", auc(fpr, tpr))
    ## Confusion matrix for the predicted outputs
    conf_matrix_sk = confusion_matrix(y_test, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix_sk, display_labels=logReg.classes_)
    disp.plot()
    plt.title("Confusion Matrix: " + str(i), fontsize=15)
    plt.grid()
    plt.show()
    print('\n \n')
# average enjoyment is a good predictor of whether or not users would enjoy these 4 movies
# Make df with 4 movies of interest
horror_movies_df = movieRatings.loc[:,['The Blair Witch Project (1999)', 'Ouija: Origin of Evil (2016)',
'Shutter Island (2010)', 'The Exorcist (1973)']]
# Make df with personality/sensation responses of interest
user_responses_df = data.loc[:,['I enjoy watching horror movies', 'I like to be surprised even if it startles or scares me',
'I enjoy haunted houses', 'Is emotionally stable/not easily upset', 'Can be cold and aloof',
'No risk - No fun', 'I had a sheltered upbringing', 'Has an active imagination', 'Remains calm in tense situations',
'Is depressed/Blue']]
user_responses_df = user_responses_df.drop(896) # drop null user
user_responses_df = user_responses_df.reset_index(drop=True)
lasso_horror_df = pd.concat([horror_movies_df, user_responses_df], axis=1) # merge columns, 1096 rows
# display(lasso_horror_df)
# check for null values
# print(lasso_horror_df.isnull().sum())
lasso_horror_df = lasso_horror_df.dropna().reset_index(drop=True) # drop null value rows, 20 rows
# display(lasso_horror_df)
# check responses for anything outside of normal range
#lasso_horror_df.loc[(lasso_horror_df["I enjoy haunted houses"] < 1.0) | (lasso_horror_df["I enjoy haunted houses"] > 5.0)]
# split data frames up into outcomes and predictors, now have same length row-wise
outcomes = lasso_horror_df.iloc[:,0:4].reset_index(drop=True)
predictors = lasso_horror_df.iloc[:,4:14].reset_index(drop=True)
display(outcomes)
display(predictors)
# multiple features, so check the predictors for multicollinearity
plt.figure(figsize=(16, 5))
sns.heatmap(predictors.corr(), annot=True)
plt.show()
# create alpha range (start just above 0: alpha=0 is not valid for Lasso)
parameters = {'alpha': np.arange(0.0001, 1, 0.0001)}
# set up gridsearchCV
model = Lasso()
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=10) #cross-validation method for gridsearch
ec_lasso_grid = GridSearchCV(model, parameters, scoring='neg_mean_squared_error', cv=cv)
# initialize lists
ec_rmse_lasGrid = []
ec_movie_name_lasGrid = []
ec_betas_lasGrid = []
ec_alpha_lasGrid = []
# loop through 4 movies
for i in outcomes:
    Y = np.array(outcomes[i])
    X = np.array(predictors)
    # split data set into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=10)
    # use grid search to split the training data into train/validation sets and find the best alpha hyperparameter
    ec_lasso_grid.fit(x_train, y_train)
    best_alpha = ec_lasso_grid.best_estimator_.alpha
    print("The best parameters of Grid Search are as follows: {} with a score of {}".format(ec_lasso_grid.best_params_, np.round(ec_lasso_grid.best_score_, 3)))
    # use the best alpha for calculating coefficients and RMSE
    ec_lassoReg = Lasso(alpha=best_alpha) # best alpha found for this model
    ec_lassoReg.fit(x_train, y_train)
    y_pred = ec_lassoReg.predict(x_test)
    # append values to lists
    ec_movie_name_lasGrid.append(i)
    ec_rmse_lasGrid.append(mean_squared_error(y_test, y_pred, squared=False)) # RMSE
    ec_betas_lasGrid.append(ec_lassoReg.coef_) # was lassoReg.coef_, which referenced the earlier model
    ec_alpha_lasGrid.append(best_alpha)
# Build the RMSE table as a dataframe
ec_table_RMSE_lasGrid = pd.DataFrame({'Movie': ec_movie_name_lasGrid, 'RMSE': ec_rmse_lasGrid, 'alpha': ec_alpha_lasGrid})
display(ec_table_RMSE_lasGrid)
# Create dataframe of coefficients
ec_betas_df_lasGrid = pd.DataFrame(ec_betas_lasGrid, columns=predictors.columns, index=outcomes.columns)
display(ec_betas_df_lasGrid)