import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import string
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from string import punctuation
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import itertools
from wordcloud import WordCloud
def join_json_values(arrayValues):
    """Flatten a JSON array of objects into a comma-separated string of names.

    Parameters
    ----------
    arrayValues : str
        JSON text encoding a list of objects, each with a 'name' key
        (TMDB's genres/keywords/production_* column format).

    Returns
    -------
    str
        The 'name' values joined with commas; '' for an empty array.

    Raises
    ------
    json.JSONDecodeError
        If `arrayValues` is not valid JSON.
    KeyError
        If an element lacks a 'name' key.
    """
    allValues = json.loads(arrayValues)
    # str.join avoids the original quadratic += loop and its trailing-comma
    # slice; it naturally yields '' for an empty list.
    return ','.join(singleValue['name'] for singleValue in allValues)
# Load the TMDB 5000 movies dataset, keeping only rows that actually have an
# overview (the overview text feeds the similarity scoring below).
movies_df = pd.read_csv("tmdb_5000_movies.csv").dropna(subset=['overview'])
movies_df.head()
movies_df.info()
from collections import Counter

# production_countries holds a JSON array of {"iso_3166_1": ..., "name": ...}
# objects per movie (same format join_json_values parses above). The original
# split the raw JSON text on ', ', which counted garbage fragments instead of
# countries — parse the JSON and count the actual country names.
list_country = [
    entry['name']
    for cell in movies_df.dropna(subset=['production_countries'])['production_countries']
    for entry in json.loads(cell)
]
# Counter.most_common(10) returns the ten most frequent directly.
df_movies_country_count = pd.DataFrame(Counter(list_country).most_common(10), columns=['Country', 'Count'])
from collections import Counter

# production_companies is a JSON array of {"name": ...} objects per movie
# (same format join_json_values parses above); splitting the raw JSON on ', '
# as the original did counts garbage fragments. Parse the JSON instead.
list_company = [
    entry['name']
    for cell in movies_df.dropna(subset=['production_companies'])['production_companies']
    for entry in json.loads(cell)
]
# Counter.most_common(10) returns the ten most frequent directly.
df_movies_company_count = pd.DataFrame(Counter(list_company).most_common(10), columns=['Company', 'Count'])
from collections import Counter

# genres is a JSON array of {"id": ..., "name": ...} objects per movie — the
# file itself applies join_json_values to this column later. Splitting the raw
# JSON on ', ' counted garbage fragments; parse the JSON and count real names.
list_genre = [
    entry['name']
    for cell in movies_df.dropna(subset=['genres'])['genres']
    for entry in json.loads(cell)
]
# Counter.most_common(10) returns the ten most frequent directly.
df_movies_genre_count = pd.DataFrame(Counter(list_genre).most_common(10), columns=['Genre', 'Count'])
# Horizontal bar chart: ten most frequent genres.
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
ax = sns.barplot(
    data=df_movies_genre_count,
    x='Count',
    y="Genre",
    orient='h',
    palette="Set2",
)
# Horizontal bar chart: ten most frequent production companies.
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
ax = sns.barplot(
    data=df_movies_company_count,
    x='Count',
    y="Company",
    orient='h',
    palette="Set2",
)
# Horizontal bar chart: ten most frequent production countries.
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
ax = sns.barplot(
    data=df_movies_country_count,
    x='Count',
    y="Country",
    orient='h',
    palette="Set2",
)
# Distribution of movies by original language.
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
ax = sns.countplot(data=movies_df, x="original_language", palette="Set2")
import datetime

# Derive a release-year column. pd.to_datetime(...).dt.year is the idiomatic
# accessor and propagates missing/NaT release dates as NaN, exactly like the
# old pd.DatetimeIndex(...).year form.
movies_df['year'] = pd.to_datetime(movies_df['release_date']).dt.year
# Top 15 release years by number of movies.
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
ax = sns.countplot(y='year', data=movies_df, palette="Set2",
                   order=movies_df['year'].value_counts().index[:15])
# Keep only the columns used for similarity scoring (genres, id, keywords,
# overview, title, plus the derived title_list); discard the rest.
unused_columns = [
    'budget', 'year', 'homepage', 'original_language', 'popularity',
    'original_title', 'production_companies', 'production_countries',
    'release_date', 'revenue', 'runtime', 'spoken_languages', 'status',
    'tagline', 'vote_count', 'vote_average',
]
movies_df.drop(columns=unused_columns, inplace=True)
movies_df.head()
# Normalise the text columns to lowercase. genres/keywords are JSON arrays of
# objects with a 'name' key, flattened to comma-separated strings first.
# (The original re-applied .str.lower() to genres/keywords a second time —
# a redundant no-op, removed here.)
movies_df['title_list'] = movies_df['title'].str.lower()
movies_df['overview'] = movies_df['overview'].str.lower()
movies_df['genres'] = movies_df['genres'].apply(join_json_values).str.lower()
movies_df['keywords'] = movies_df['keywords'].apply(join_json_values).str.lower()
movies_df.head()
# Split every text column into lists of word tokens.
for column in ('title_list', 'overview', 'genres', 'keywords'):
    movies_df[column] = movies_df[column].apply(word_tokenize)
movies_df.head()
# Drop English stopwords and single-character punctuation tokens from every
# tokenised column.
list_stopwords = set(stopwords.words('english') + list(punctuation))
for column in ('title_list', 'overview', 'genres', 'keywords'):
    movies_df[column] = movies_df[column].apply(
        lambda tokens: [tok for tok in tokens if tok not in list_stopwords]
    )
# Strip residual punctuation characters embedded inside overview tokens
# (e.g. "don't" -> "dont"), then drop tokens that became empty.
# Build the translation table once instead of re-creating it for every token,
# and use plain truthiness instead of len(...) > 0.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
movies_df['overview'] = movies_df['overview'].apply(
    lambda tokens: [tok.translate(_PUNCT_TABLE) for tok in tokens]
)
movies_df['overview'] = movies_df['overview'].apply(
    lambda tokens: [tok for tok in tokens if tok]
)
# Deduplicate the token list of each row. NOTE: list(set(...)) does not
# preserve token order, matching the original behaviour.
for column in ('title_list', 'genres', 'overview', 'keywords'):
    movies_df[column] = movies_df[column].apply(lambda tokens: list(set(tokens)))
movies_df.head()
from wordcloud import WordCloud

# Word cloud over every genre token in the corpus.
genre_tokens = itertools.chain.from_iterable(movies_df['genres'].tolist())
genre = ' '.join(genre_tokens)
plt.figure(figsize=(16, 12))
genre_cloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(genre)
plt.imshow(genre_cloud, interpolation="bilinear")
plt.axis("off")
plt.show()
# Word cloud over every overview token in the corpus.
overview_tokens = itertools.chain.from_iterable(movies_df['overview'].tolist())
overview = ' '.join(overview_tokens)
plt.figure(figsize=(16, 12))
overview_cloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(overview)
plt.imshow(overview_cloud, interpolation="bilinear")
plt.axis("off")
plt.show()
# Word cloud over every keyword token in the corpus.
keyword_tokens = itertools.chain.from_iterable(movies_df['keywords'].tolist())
keyword = ' '.join(keyword_tokens)
plt.figure(figsize=(16, 12))
keyword_cloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(keyword)
plt.imshow(keyword_cloud, interpolation="bilinear")
plt.axis("off")
plt.show()
import gensim
# Load the pre-trained GoogleNews word2vec embeddings (binary format, 300-dim
# vectors per the filename; the file is several GB and must exist in the
# working directory). `wv` provides key_to_index and n_similarity(), used by
# the vocabulary filtering and recommendation() below.
print('Loading KeyedVectors . . .')
wv = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)
print('Done loading KeyedVectors. . .')
# Restrict every token list to words the word2vec model knows, so the
# n_similarity calls below never see out-of-vocabulary words.
# Column indices after the drop above: 0=genres, 2=keywords, 3=overview,
# 5=title_list (4 is the display title) — NOTE(review): confirm against the
# DataFrame's actual column order.
movie_vocab_matrix = []
for row in movies_df.to_numpy():
    for idx in (2, 3, 0, 5):
        row[idx] = [word for word in row[idx] if word in wv.key_to_index]
    movie_vocab_matrix.append(row)
from tqdm import tqdm

def recommendation(title):
    """Return the 10 movies most similar to `title` as a DataFrame.

    Similarity is the word2vec n_similarity between the query movie and every
    movie in `movie_vocab_matrix`, computed over four token columns
    (matrix indices: 0=genres, 2=keywords, 3=overview, 5=title_list; 4 is the
    display title). Only candidates whose genre similarity exceeds 0.85 and
    whose title differs from the query are kept; the title score carries half
    weight. Columns: recommendation, title, score_title, score_category,
    score_keyword, score_description, final_score.
    """
    # Vocabulary-filter the token lists of the query movie's row(s).
    movie_matrix_title_vocab = []
    for row in movies_df[movies_df['title'] == title].to_numpy():
        for idx in (3, 2, 0, 5):
            row[idx] = [word for word in row[idx] if word in wv.key_to_index]
        movie_matrix_title_vocab.append(row)

    matrix_similarity = []
    # Iterating tqdm(...) already advances the bar; the original additionally
    # called pbar.update(), double-counting progress. Plain iteration fixes it.
    for list1 in tqdm(movie_vocab_matrix):
        for list2 in movie_matrix_title_vocab:
            # n_similarity cannot handle empty token lists — skip if any of the
            # compared lists is empty (now also guarding list2's genres, which
            # the original forgot to check).
            if not (list1[0] and list2[0] and list1[2] and list2[2]
                    and list1[3] and list2[3]):
                continue
            score_catg = wv.n_similarity(list1[0], list2[0])
            score_keyword = wv.n_similarity(list1[2], list2[2])
            score_desc = wv.n_similarity(list1[3], list2[3])
            # Title lists are often empty after stopword/vocab filtering;
            # fall back to 0 explicitly instead of the original bare `except:`
            # that silently swallowed every error. Title score gets half weight.
            if list1[5] and list2[5]:
                score_title = wv.n_similarity(list1[5], list2[5]) / 2
            else:
                score_title = 0
            # Keep strong genre matches that aren't the query movie itself
            # (logical `and`, not bitwise `&`, for boolean operands).
            if list1[4] != list2[4] and score_catg > 0.85:
                matrix_similarity.append(
                    [list1[4], list2[4], score_title, score_catg, score_keyword, score_desc]
                )

    movies_df_similarity = pd.DataFrame(
        matrix_similarity,
        columns=['recommendation', 'title', 'score_title', 'score_category',
                 'score_keyword', 'score_description'],
    )
    movies_df_similarity['final_score'] = (
        movies_df_similarity['score_title']
        + movies_df_similarity['score_category']
        + movies_df_similarity['score_keyword']
        + movies_df_similarity['score_description']
    )
    return movies_df_similarity.sort_values(
        by=['final_score', 'score_category', 'score_keyword',
            'score_description', 'score_title'],
        ascending=False,
    ).head(10)

recommendation('Avatar')