from collections import defaultdict
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import surprise as sp
# Load the ratings and movie metadata CSVs (MovieLens-style layout) and join
# them so each rating row also carries its movie's metadata.
# NOTE(review): `display` is an IPython/Jupyter builtin — this looks like
# exported notebook code; it will NameError in a plain Python interpreter.
ratings_set = pd.read_csv('ratings.csv')
display(ratings_set)
movies_set = pd.read_csv('movies.csv')
display(movies_set)
# Inner join: keeps only ratings whose movieId also appears in movies.csv.
ratings_movies_set = ratings_set.merge(movies_set, how='inner', on='movieId')
display(ratings_movies_set)
# Surprise needs a Reader declaring the rating scale.
# NOTE(review): MovieLens ratings start at 0.5, not 0 — confirm (0, 5) is the
# intended lower bound.
reader = sp.Reader(rating_scale=(0, 5))
dataset = sp.Dataset.load_from_df(ratings_set[['userId', 'movieId', 'rating']], reader)
# Load the best hyper-parameters from a JSON cache if it exists; otherwise
# tune KNNBasic with GridSearchCV and cache the winning params for next time.
best_params = {}
try:
    with open('knn_params_best.json', 'r') as f:
        best_params = json.load(f)
except FileNotFoundError:
    # No cache yet (e.g. first run): fall through to the grid search below.
    # The original code opened the file unconditionally and crashed here.
    pass

if not best_params:
    knn_params_grid = {
        'min_k': [2],
        'k': [10, 30, 40, 50],
        'verbose': [False],
        'sim_options': {
            'name': ['pearson', 'cosine'],
            'min_support': [10, 15, 20],
            'user_based': [True],
        },
    }
    gs = sp.model_selection.GridSearchCV(
        algo_class=sp.KNNBasic,
        param_grid=knn_params_grid,
        measures=['rmse'],
        cv=5,  # 5-fold cross-validation
        n_jobs=-1,  # use all cores
    )
    gs.fit(dataset)
    print("best rmse score:", gs.best_score['rmse'])
    print("Best params: ", gs.best_params['rmse'])
    # Keep the estimator configured with the best-RMSE parameters.
    knn = gs.best_estimator['rmse']
    # Write the best params to the JSON cache for later runs.
    with open('knn_params_best.json', 'w') as f:
        json.dump(gs.best_params['rmse'], f)
else:
    print("Best params: ", best_params)
    knn = sp.KNNBasic(k=best_params['k'], min_k=best_params['min_k'],
                      sim_options=best_params['sim_options'], verbose=False)

# Report cross-validated RMSE for the chosen configuration.
# (Fixed: the original passed `verbose=Fals`, a NameError.)
sp.model_selection.cross_validate(knn, dataset, measures=['rmse'], cv=5, verbose=False, n_jobs=-1)
knn.verbose = True
def predict_ratings(algo, user_id, movie_id):
    """Show `algo`'s rating estimate for (`user_id`, `movie_id`).

    If the user already rated the movie, the existing rating row is shown
    instead of predicting. Otherwise the user's and movie's rating history
    are displayed for context, followed by the model's estimate.

    Relies on the module-level ``ratings_movies_set`` DataFrame and on
    IPython's ``display`` (notebook environment).
    """
    check = ratings_movies_set.query(f"userId == {user_id} and movieId == {movie_id}")
    if not check.empty:
        # Nothing to predict: a ground-truth rating already exists.
        # (Fixed message grammar: "has already rate" -> "has already rated".)
        print(f"User {user_id} has already rated movie {movie_id}")
        display(check)
    else:
        # Show the data the neighbourhood model can draw on.
        display(f"User {user_id} ratings:", ratings_movies_set[ratings_movies_set['userId'] == user_id])
        display(f"Movie {movie_id} ratings:", ratings_movies_set[ratings_movies_set['movieId'] == movie_id])
        predict = algo.predict(uid=user_id, iid=movie_id)
        print(f"The estimation for user {user_id} and {movie_id} is: ", predict.est)
        print("More details:", predict)
# Train on the full dataset (no hold-out) before making ad-hoc predictions.
knn.fit(dataset.build_full_trainset())
# Spot-check predictions for two (user, movie) pairs.
predict_ratings(knn, 11, 293)
predict_ratings(knn, 100, 72998)
def get_top_n(predictions, n=10):
    """Return each user's `n` highest-estimated items.

    `predictions` is an iterable of Surprise-style prediction tuples
    `(uid, iid, true_rating, estimate, details)`. The result maps each
    uid to a list of `(iid, estimate)` pairs, sorted by estimate
    descending and truncated to `n` entries.
    """
    # Group (item, estimate) pairs per user.
    by_user = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        by_user[uid].append((iid, est))
    # Rank each user's items by estimated rating and keep the top n.
    return defaultdict(list, {
        uid: sorted(pairs, key=lambda pair: pair[1], reverse=True)[:n]
        for uid, pairs in by_user.items()
    })
train_set = dataset.build_full_trainset()
# Build a test set of every (user, item) pair absent from the training set,
# i.e. predict ratings for movies each user has not rated yet.
test_set = train_set.build_anti_testset()
knn.fit(train_set)
predictions = knn.test(test_set)
top_10 = get_top_n(predictions, n=10)

# Fixed: random.sample needs a sequence (dict_items raises TypeError on
# Python 3.11+), and the sample size must not exceed the number of users.
sample_top_10 = random.sample(list(top_10.items()), min(50, len(top_10)))

# Hoist the per-movie DataFrame filter into a single movieId -> title lookup.
title_by_id = movies_set.set_index('movieId')['title']
for counter, (uid, user_ratings) in enumerate(sample_top_10, start=1):
    print(f"{counter}. User {uid} top 10 recommendations are:")
    for iid, rating in user_ratings:
        # Fall back gracefully if a movieId is missing from movies.csv
        # (the original .item() would raise instead).
        movie_title = title_by_id.get(iid, '<unknown title>')
        print(f"{movie_title} (id:{iid}), rating: {rating}")
    print("\n")