Importing the data:

import pandas as pd import seaborn as sns import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split import numpy as np

import os

# Walk the dataset directory tree and print the full path of every file in it.
for folder, _, file_names in os.walk('/datasets'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# NOTE(review): going by the file names, `movie` holds the credits table and
# `credits` holds the movies table. The rest of the notebook uses the names
# consistently with this assignment, so they are kept as-is.
movie = pd.read_csv('/datasets/movie-datasets/tmdb_5000_credits.csv')
credits = pd.read_csv('/datasets/movie-datasets/tmdb_5000_movies.csv')

# Preview the first three rows of each table, then list their column names.
for frame in (movie, credits):
    display(frame.head(3))
for frame in (movie, credits):
    display(frame.columns.tolist())

import json

# Decode the JSON-encoded cast of the row labelled 1 and print each entry's
# fields, with a blank separator after every entry.
jsonText = movie.cast[1]
y = json.loads(jsonText)
for record in y:
    for field, content in record.items():
        print(f'{field}: {content}')
    print('\n')

# Making a function that unpacks a json and produces the output
def unparse(jsonList):
    """Print the fields of every object in a JSON-encoded list.

    Args:
        jsonList: JSON text that decodes to a list of dicts.

    Each key/value pair is printed as ``key: value``; a blank separator is
    printed after each object. Nothing is returned.
    """
    for record in json.loads(jsonList):
        for field, content in record.items():
            print(f'{field}: {content}')
        print('\n')

# Print the decoded contents of every JSON-encoded column for the row labelled 1.
unparse(movie.crew[1])

for json_column in (
    'genres',
    'production_companies',
    'production_countries',
    'keywords',
    'spoken_languages',
):
    unparse(credits[json_column][1])

# Join the two tables on the movie title.
# NOTE(review): titles are presumably not guaranteed unique, so this join may
# pair unrelated rows; joining on movie_id/id would be stricter, but it would
# change the row count and order that the rest of the notebook works with.
mergeddf = pd.merge(movie, credits, on='title')
mergeddf.head(2)

# Show the merged table's dimensions and per-column dtypes.
for summary in (mergeddf.shape, mergeddf.dtypes):
    display(summary)

Dropping Columns

We only need to drop the ID column, since it is repeated. Apart from that, all the columns are equally important for classifying the movies and building keywords.

# Drop the `id` column (described in the notes as a repeated identifier).
df = mergeddf.drop(columns=['id'])
df.head()

# Extracting the columns
cols = ['movie_id', 'cast', 'genres', 'overview', 'release_date', 'runtime', 'title', 'crew']
df_filtered = df[cols]
df_filtered.head()

# Work on an independent copy so later in-place edits do not touch `df`.
movpre = df_filtered.copy()
movpre.columns

# `info` is a method: it must be called to print the column/dtype/non-null
# summary (the bare attribute only shows the bound-method repr).
movpre.info()

# Number of missing values per column.
movpre.isna().sum()

movpre.shape

# Checking the data where release date is missing
display(movpre[movpre.release_date.isnull()])

# Drop rows by condition instead of by a hard-coded row label, so the step
# keeps working if the input data or the merge order changes.
movpre.dropna(subset=['release_date'], inplace=True)
# Re-number the rows so index labels and row positions stay identical after
# the drop; later cells mix label-based and positional lookups.
movpre.reset_index(drop=True, inplace=True)

# Re-check the number of missing values per column.
movpre.isna().sum()

# retrieving the data which doesnot have run time
display(movpre[movpre.runtime.isnull()])

# Manually fill in the runtime of the two movies that lack one.
movpre.loc[movpre.title == 'Chiamatemi Francesco - Il Papa della gente', 'runtime'] = 98.0
movpre.loc[movpre.title == 'To Be Frank, Sinatra at 100', 'runtime'] = 81.0
# Also adding other missing values like overview
movpre.loc[movpre.title == 'Chiamatemi Francesco - Il Papa della gente', 'overview'] = 'The story of Pope Francis life'
movpre.loc[movpre.title == 'To Be Frank, Sinatra at 100', 'overview'] = 'The life of Frank Sinatra, as an actor and singer and the steps along the way that led him to become such an icon'
# Show both patched rows (the second display previously repeated the first
# title, so the Sinatra row was never shown).
display(movpre.loc[movpre.title == 'Chiamatemi Francesco - Il Papa della gente'])
display(movpre.loc[movpre.title == 'To Be Frank, Sinatra at 100'])

# Remaining missing values per column.
movpre.isna().sum()

# Rows that still have no overview.
movpre[movpre['overview'].isna()]

We fill in the missing overview for this movie.

# Give every row that still lacks an overview the Food Chains synopsis, then
# show the Food Chains row.
overview_missing = movpre['overview'].isna()
movpre.loc[overview_missing, 'overview'] = 'To protest their working conditions and poor wages, farmworkers in Immokalee, Florida, start a hunger strike outside the headquarters of Publix supermarkets'
movpre[movpre['title'] == 'Food Chains']

# Missing values per column after the fill.
movpre.isna().sum()

# Checking for duplicates
movpre.duplicated().sum()

Converting JSON to Lists

# Cast column: raw value of the first row.
movpre['cast'].iloc[0]

Let's make a function that parses the JSON and retrieves only the names of the top 4 cast members.

import json


def getcast(castJson):
    """Return the names of the first four entries of a JSON-encoded cast list.

    Args:
        castJson: JSON text that decodes to a list of dicts, each with a
            ``'name'`` key.

    Returns:
        A list with at most four names, in the order they appear in the JSON.
    """
    cast_entries = json.loads(castJson)
    return [entry['name'] for entry in cast_entries[:4]]


# Try the function on the first row.
cast2 = getcast(movpre.iloc[0].cast)
cast2

Great, now let's convert the cast column into a list of names:

# Replace each JSON cast string with its list of leading cast names.
movpre['cast'] = movpre['cast'].map(getcast)
movpre.head()

Let's convert the genres column into a list of genres.

# Genre column: raw value of the first row.
movpre['genres'].iloc[0]


# Function to generate genres list
def getGenre(obj):
    """Return the ``'name'`` of every entry in a JSON-encoded genre list.

    Args:
        obj: JSON text that decodes to a list of dicts with a ``'name'`` key.

    Returns:
        A list of genre names in their original order.
    """
    return [entry['name'] for entry in json.loads(obj)]


# Try the function on the first row.
genres = getGenre(movpre.iloc[0].genres)
genres

# Replace each JSON genre string with its list of genre names.
movpre['genres'] = movpre['genres'].map(getGenre)
movpre.head()

Let's do the same for the crew column, but retrieve only the director's name:

# Crew column: raw value of the first row.
movpre['crew'].iloc[0]


# Making a function
def getDirector(obj):
    """Return the names of all crew members whose job is ``'Director'``.

    Args:
        obj: JSON text that decodes to a list of dicts with ``'job'`` and
            ``'name'`` keys.

    Returns:
        A list of director names (empty when no entry has that job).
    """
    return [
        member['name']
        for member in json.loads(obj)
        if member['job'] == 'Director'
    ]


# Try the function on the first row.
director = getDirector(movpre.iloc[0].crew)
director

# Applying the function
movpre['crew'] = movpre['crew'].map(getDirector)
movpre.head()

Now we split the overview column into words and store each overview as a list of words.

# Overview column: raw value of the first row.
movpre['overview'].iloc[0]

# Making a function
def getOverview(string):
    """Split an overview into a list of words.

    Splits on any run of whitespace, so repeated spaces, tabs and newlines
    never produce empty-string tokens (splitting on a single ``' '`` did).
    Punctuation stays attached to the neighbouring word.

    Args:
        string: The overview text.

    Returns:
        A list of whitespace-separated tokens; empty for blank input.
    """
    return string.split()


getOverview('Hello there, this is a cool list.')

# Applying the function: each overview becomes a list of words.
movpre['overview'] = movpre['overview'].map(getOverview)
movpre.head()

movpre.head()

def getYear(obj):
    """Return the year part of a ``YYYY-MM-DD`` date as a one-element list.

    Args:
        obj: A date string such as ``'2002-10-10'``. Missing values (``None``,
            NaN or any other non-string) and empty strings are tolerated.

    Returns:
        ``[year]`` where ``year`` is the text before the first ``'-'``, or an
        empty list when no usable date string was given. A list is returned so
        the result can be concatenated with the other list-valued columns.
    """
    # A missing date arrives as a float NaN (or None), which has no .split().
    if not isinstance(obj, str) or not obj:
        return []
    return [obj.split('-')[0]]


getYear('2002-10-10')

# Applying the function: each release date becomes a one-element list holding the year.
movpre['release_date'] = movpre['release_date'].map(getYear)
movpre.head()

Making a new dataframe

# Concatening into one data column
# Every operand is a column of lists, so `+` concatenates the lists row by row.
movpre['tags'] = movpre['cast'] + movpre['genres'] + movpre['overview'] + movpre['release_date'] + movpre['crew']
movpre.head()

# Take an explicit copy: `newdf` is modified below, and assigning into a plain
# column selection of `movpre` triggers pandas' chained-assignment warning and
# leaves it ambiguous whether `movpre` is affected.
newdf = movpre[['movie_id', 'title', 'tags']].copy()
newdf.head()

newdf.tags[0]

# Lower-case every tag so matching is case-insensitive.
newdf['tags'] = newdf['tags'].apply(lambda x: [i.lower() for i in x])
newdf.head()

newdf.tags[0]

Machine Learning Imports

# Porter stemmer from NLTK; `ps` is the shared instance used by `stem` below.
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

Making function for stemming of words

# Stemming the words
def stem(text):
    """Return `text` with every whitespace-separated word replaced by its stem.

    Args:
        text: A string of words.

    Returns:
        The stemmed words (via the module-level ``ps`` stemmer) joined by
        single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())


# Quick checks of the stemmer on sample sentences.
stem('in this movie i am a batman')
stem('dance dances dancing dancer dancingball')

Tokenization and Stemming of Words

# Converting the list to string first: join each row's tags with single spaces.
newdf['tags'] = newdf['tags'].map(' '.join)
newdf.head()

Next, we stem the tags and then vectorize them, i.e. convert the words into numbers.

We extract the top 5000 words from the tags

# Stemming words: reduce every word in each tag string to its stem.
newdf['tags'] = newdf['tags'].map(stem)
newdf.head()

# Look at one randomly chosen stemmed tag string.
newdf['tags'].sample(1)

Vectorization of Words

We vectorize the tags using scikit-learn's CountVectorizer and convert the result into a NumPy array.

Most of the array will consist of 0s, because no single movie contains all 5000 words in its tags; each movie only uses a small selection of them.

from sklearn.feature_extraction.text import CountVectorizer

# Keep the 5000 most frequent terms and ignore English stop words.
cv = CountVectorizer(max_features=5000, stop_words='english')

# Dense count matrix: one row per movie, one column per vocabulary term.
vectors = cv.fit_transform(newdf['tags']).toarray()

display(vectors.shape)
display(vectors[0])
display(vectors[0].shape)
# `get_feature_names()` was removed from scikit-learn; use the `_out` variant
# for both calls and convert to a list to keep the original list-style display.
feature_names = cv.get_feature_names_out()
display(len(feature_names))
display(feature_names.tolist())

Calculating Similarity using cosine distance

Using cosine distances -- distance between two movies.

Cosine Distance: Distance between 2 vectors as an angle

Distance is inversely proportional to similarity -> high similarity, low distance

from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of `vectors`.
similarity = cosine_similarity(vectors)
for view in (similarity, similarity.shape):
    display(view)

Here, we have a matrix represented as an array of arrays, where each inner array holds the cosine similarity between a given movie and every movie in the dataset (a higher value means more similar). So the shape of the array is 4808x4808.

Let's look at the similarity scores between the first movie and all the other movies.

# Retrieving the first movie
newdf['title'].iloc[0]

# Its similarity score against every movie.
similarity[0]

def recommend(movie):
    """Display the five movies most similar to `movie`.

    Looks `movie` up by exact title in ``newdf``, ranks every other movie by
    its score in the ``similarity`` matrix, and displays the title, release
    date, runtime and crew of the top five from ``movpre``.

    Args:
        movie: Exact title of the movie to find recommendations for.

    Returns:
        None. Prints a message instead when the title is not in ``newdf``.
    """
    # Use the row *position* of the first match, not its index label:
    # `similarity` and `movpre.iloc` are positional, and labels stop matching
    # positions as soon as any row has been dropped from the frame.
    positions = (newdf['title'] == movie).to_numpy().nonzero()[0]
    if positions.size == 0:
        print(f"{movie} not present in the database")
        return

    movie_index = positions[0]
    distances = similarity[movie_index]
    ranked = sorted(enumerate(distances), reverse=True, key=lambda x: x[1])
    # Exclude the queried movie explicitly instead of assuming it is ranked
    # first; another row with an equal score could otherwise take slot 0 and
    # the movie would be recommended to itself.
    movies_list = [pair for pair in ranked if pair[0] != movie_index][:5]

    print("Top picks for you: \n")
    for i in movies_list:
        display(movpre.iloc[i[0]][['title', 'release_date', 'runtime', 'crew']])

# Try the recommender on two example titles.
for example_title in ('Avatar', 'Batman'):
    recommend(example_title)

Importing the data:

Dropping Columns