import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
import string
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from string import punctuation
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
import itertools
from wordcloud import WordCloud
def join_json_values(arrayValues):
    """Flatten a JSON array of ``{'name': ...}`` objects into a comma-separated string.

    Parameters
    ----------
    arrayValues : str
        JSON text encoding a list of objects, each carrying a ``'name'`` key
        (the format of the TMDB ``genres``/``keywords`` columns).

    Returns
    -------
    str
        The ``'name'`` values joined by commas; ``''`` for an empty array.
    """
    # str.join replaces the original quadratic +=-in-a-loop concatenation
    # and its trailing-comma trim ([:-1]); output is identical.
    return ','.join(item['name'] for item in json.loads(arrayValues))
# Load the TMDB 5000 movies dataset and drop rows with no plot overview,
# since the overview text feeds the similarity features built below.
movies_df = pd.read_csv("tmdb_5000_movies.csv")
movies_df.dropna(subset=['overview'], inplace=True)
movies_df.head()
[nltk_data] Downloading package punkt to
[nltk_data] /Users/j.okoroafor/nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data] /Users/j.okoroafor/nltk_data...
[nltk_data] Package stopwords is already up-to-date!
# Inspect the schema and per-column null counts of the loaded dataframe.
movies_df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4800 entries, 0 to 4802
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 budget 4800 non-null int64
1 genres 4800 non-null object
2 homepage 1712 non-null object
3 id 4800 non-null int64
4 keywords 4800 non-null object
5 original_language 4800 non-null object
6 original_title 4800 non-null object
7 overview 4800 non-null object
8 popularity 4800 non-null float64
9 production_companies 4800 non-null object
10 production_countries 4800 non-null object
11 release_date 4799 non-null object
12 revenue 4800 non-null int64
13 runtime 4800 non-null float64
14 spoken_languages 4800 non-null object
15 status 4800 non-null object
16 tagline 3959 non-null object
17 title 4800 non-null object
18 vote_average 4800 non-null float64
19 vote_count 4800 non-null int64
dtypes: float64(3), int64(4), object(13)
memory usage: 787.5+ KB
import itertools
from collections import Counter

def _top10_counts(frame, column, label):
    """Return a DataFrame of the 10 most frequent ', '-separated tokens in *column*."""
    # NOTE(review): these columns hold raw JSON text, so split(', ') yields
    # JSON fragments rather than clean names — kept as-is to preserve the
    # original behavior.
    tokens = itertools.chain(
        *(value.split(', ') for value in frame.dropna(subset=[column])[column].tolist()))
    return pd.DataFrame(Counter(tokens).most_common()[:10], columns=[label, 'Count'])

# The original repeated this pattern (with duplicate imports) three times.
df_movies_country_count = _top10_counts(movies_df, 'production_countries', 'Country')
df_movies_company_count = _top10_counts(movies_df, 'production_companies', 'Company')
df_movies_genre_count = _top10_counts(movies_df, 'genres', 'Genre')
# Horizontal bar charts of the top-10 genres, production companies and
# production countries, followed by a count plot of original languages.
for bar_frame, bar_label in (
        (df_movies_genre_count, "Genre"),
        (df_movies_company_count, "Company"),
        (df_movies_country_count, "Country")):
    plt.figure(figsize=(12, 10))
    sns.set(style="darkgrid")
    ax = sns.barplot(y=bar_label, x='Count', data=bar_frame, palette="Set2", orient='h')
plt.figure(figsize=(12, 10))
sns.set(style="darkgrid")
ax = sns.countplot(x="original_language", data=movies_df, palette="Set2")
import datetime
# Derive the release year from release_date (NaT-safe; rows with a missing
# date get NaN) for a per-year frequency plot.
movies_df['year']=pd.DatetimeIndex(movies_df['release_date']).year
plt.figure(figsize=(12,10))
sns.set(style="darkgrid")
# Show only the 15 most frequent release years.
ax = sns.countplot(y='year', data=movies_df, palette="Set2", order=movies_df['year'].value_counts().index[0:15])
# Keep only the text features used for similarity:
# genres, id, keywords, overview, title.
movies_df.drop(columns=['budget','year','homepage','original_language','popularity','original_title','production_companies','production_countries','release_date','revenue','runtime','spoken_languages','status','tagline','vote_count','vote_average'], inplace=True)
movies_df.head()
# Lower-case every text field. genres/keywords are JSON arrays of
# {'name': ...} objects, flattened to comma-separated strings first.
movies_df['title_list'] = movies_df['title'].str.lower()
movies_df['overview'] = movies_df['overview'].str.lower()
movies_df['genres'] = movies_df['genres'].apply(join_json_values).str.lower()
movies_df['keywords'] = movies_df['keywords'].apply(join_json_values).str.lower()
# (The original lower-cased genres/keywords a second time here — a no-op
# on already lower-cased strings — removed.)
movies_df.head()
# Tokenise each text column into a list of words with NLTK.
for text_column in ('title_list', 'overview', 'genres', 'keywords'):
    movies_df[text_column] = movies_df[text_column].apply(word_tokenize)
movies_df.head()
# Remove English stop words and standalone punctuation tokens from every
# text column.
list_stopwords = set(stopwords.words('english') + list(punctuation))
for text_column in ('title_list', 'overview', 'genres', 'keywords'):
    movies_df[text_column] = movies_df[text_column].apply(
        lambda tokens: [word for word in tokens if word not in list_stopwords])
# Strip punctuation characters embedded inside overview tokens, then drop
# tokens that became empty. The translation table is built once here — the
# original rebuilt it with str.maketrans for every word of every row.
_strip_punct = str.maketrans('', '', string.punctuation)
movies_df['overview'] = movies_df['overview'].apply(
    lambda tokens: [word.translate(_strip_punct) for word in tokens])
movies_df['overview'] = movies_df['overview'].apply(
    lambda tokens: [word for word in tokens if word])
# Deduplicate tokens per row; set() leaves the order arbitrary, as before.
for text_column in ('title_list', 'genres', 'overview', 'keywords'):
    movies_df[text_column] = movies_df[text_column].apply(lambda tokens: list(set(tokens)))
movies_df.head()
from wordcloud import WordCloud

def _show_wordcloud(tokens):
    """Render and display a 100-word word cloud from an iterable of tokens."""
    plt.figure(figsize=(16, 12))
    cloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(' '.join(tokens))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# One cloud per text column — the original copy-pasted this stanza three
# times for genres, overview and keywords (same order preserved here).
for _column in ('genres', 'overview', 'keywords'):
    _show_wordcloud(itertools.chain(*movies_df[_column].tolist()))
import gensim
# Load the pretrained GoogleNews word2vec embeddings (binary format).
print('Loading KeyedVectors . . .')
wv = gensim.models.KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)
print('Done loading KeyedVectors. . .')

# Restrict each movie's token lists to words present in the embedding
# vocabulary. Row layout (after the earlier column drop): 0=genres, 1=id,
# 2=keywords, 3=overview, 4=title, 5=title_list.
movie_vocab_matrix = []
for row in movies_df.to_numpy():
    for column_index in (2, 3, 0, 5):
        row[column_index] = [word for word in row[column_index] if word in wv.key_to_index]
    movie_vocab_matrix.append(row)
Loading KeyedVectors . . .
from tqdm import tqdm

def recommendation(title):
    """Return the 10 movies most similar to *title* as a scored DataFrame.

    Similarity is the sum of word2vec n_similarity scores over genres,
    keywords, overview and (half-weighted) title tokens, keeping only
    candidate pairs whose genre similarity exceeds 0.85.

    Row layout of a movies_df row (after the earlier column drop):
    0=genres, 1=id, 2=keywords, 3=overview, 4=title, 5=title_list.
    """
    # Filter the query movie's token lists down to the word2vec vocabulary,
    # mirroring the preprocessing applied to movie_vocab_matrix.
    movie_matrix_title_vocab = []
    for row in movies_df[movies_df['title'] == title].to_numpy():
        for idx in (3, 2, 0, 5):
            row[idx] = [word for word in row[idx] if word in wv.key_to_index]
        movie_matrix_title_vocab.append(row)

    matrix_similarity = []
    # Iterating tqdm(...) advances the bar once per item on its own; the
    # original also called pbar.update() inside the loop, double-counting
    # progress.
    for list1 in tqdm(movie_vocab_matrix):
        for list2 in movie_matrix_title_vocab:
            # n_similarity raises on empty token lists — skip such pairs.
            # (list2[0] was previously unguarded and could crash.)
            if not (list1[0] and list1[2] and list1[3]
                    and list2[0] and list2[2] and list2[3]):
                continue
            score_catg = wv.n_similarity(list1[0], list2[0])
            score_keyword = wv.n_similarity(list1[2], list2[2])
            score_desc = wv.n_similarity(list1[3], list2[3])
            try:
                # Title similarity is down-weighted by half.
                score_title = wv.n_similarity(list1[5], list2[5]) / 2
            except Exception:  # empty/OOV title tokens: no title signal
                score_title = 0
            # Keep distinct titles with high genre similarity only.
            if (list1[4] != list2[4]) and (score_catg > 0.85):
                matrix_similarity.append(
                    [list1[4], list2[4], score_title, score_catg, score_keyword, score_desc])

    movies_df_similarity = pd.DataFrame(
        matrix_similarity,
        columns=['recommendation', 'title', 'score_title', 'score_category',
                 'score_keyword', 'score_description'])
    movies_df_similarity['final_score'] = (
        movies_df_similarity['score_title']
        + movies_df_similarity['score_category']
        + movies_df_similarity['score_keyword']
        + movies_df_similarity['score_description'])
    return (movies_df_similarity
            .sort_values(by=['final_score', 'score_category', 'score_keyword',
                             'score_description', 'score_title'],
                         ascending=False)
            .head(10))
# Example query: top-10 recommendations for 'Avatar'.
recommendation('Avatar')