#import the relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# Load the raw catalogue from the CSV in the working directory.
data = pd.read_csv('netflix_titles.csv')
data
# Work on a copy so the frame as loaded stays untouched.
netflix = data.copy()
# Parse "date_added" and derive the year and month each title was added.
# Surrounding whitespace is stripped before parsing: a value such as
# " August 4, 2017" would not match the format pandas infers from the
# other rows and would make the conversion fail.
netflix["date_added"] = pd.to_datetime(netflix["date_added"].str.strip())
netflix["year_added"] = netflix["date_added"].dt.year
netflix["month_added"] = netflix["date_added"].dt.month
# Bar chart comparing the number of TV Shows and Movies in the catalogue.
type_palette = ["#000000", '#E50914']
type_axes = sns.countplot(x='type', data=netflix, palette=type_palette)
type_axes.set(title='COUNT OF TV SHOWS AND MOVIES ON NETFLIX')
# Yearly trend of Movies and TV Shows added to Netflix.
# Distinct "year_added" values in ascending order; np.sort places a missing
# year (NaN) last.
added_years = np.sort(netflix['year_added'].unique())
type_trend = pd.DataFrame({'Year': added_years})
# Count the titles of each type per year with boolean masks on the Series
# themselves, instead of passing a pandas object to np.argwhere.
is_movie = netflix['type'] == 'Movie'
is_tvshow = netflix['type'] == 'TV Show'
movie_count = [
    int(((netflix['year_added'] == year) & is_movie).sum())
    for year in added_years
]
tvshow_count = [
    int(((netflix['year_added'] == year) & is_tvshow).sum())
    for year in added_years
]
# Add the per-year counts to the dataframe.
type_trend['Tv show'] = tvshow_count
type_trend['movie'] = movie_count
# Drop the row whose year is missing so only real years reach the chart.
plot = type_trend.dropna()
plot
# Stacked area chart of the yearly counts.
plt.stackplot(plot['Year'], plot['Tv show'], plot['movie'], colors=["#000000", '#E50914'])
plt.legend(labels=['TV Show', 'Movies'], loc="upper left")
# Text property names are case-sensitive in current Matplotlib: "fontsize".
plt.title("YEARLY TREND OF MOVIES AND TV SHOWS ON NETFLIX", fontsize=15)
plt.xlabel("YEARS")
plt.ylabel("FREQUENCY")
# Five most frequent genres among TV Shows.
tv_genre = []
for kind, listed in zip(netflix["type"], netflix["listed_in"]):
    if kind != "TV Show":
        continue
    # A title may list several comma-separated genres; remove the blank
    # that follows each comma.
    tv_genre.extend(label.lstrip() for label in str(listed).split(","))
top_tv_genre = pd.Series(tv_genre)
top_tv_genre.value_counts().head(5)
# Five most frequent genres among Movies.
movie_genre = []
for kind, listed in zip(netflix["type"], netflix["listed_in"]):
    if kind != "Movie":
        continue
    # Split the comma-separated genre list and remove the blank that
    # follows each comma.
    movie_genre.extend(label.lstrip() for label in str(listed).split(","))
top_movie_genre = pd.Series(movie_genre)
top_movie_genre.value_counts().head(5)
# Top 10 actors by number of titles they appear in.
# Titles without cast information are skipped; converting the missing value
# with str() would otherwise add a literal "nan" actor to the ranking.
# Each name is stripped so that "X" and " X" are counted together.
actors = [
    name.strip()
    for cast_list in netflix["cast"].dropna()
    for name in str(cast_list).split(",")
    if name.strip()
]
top_actors = pd.Series(actors, dtype="object")
top_actors.value_counts().head(10)
# Top 10 directors by number of titles they directed.
# Titles without a director are skipped; converting the missing value with
# str() would otherwise add a literal "nan" director to the ranking.
# Each name is stripped so that "X" and " X" are counted together.
directors = [
    name.strip()
    for director_list in netflix["director"].dropna()
    for name in str(director_list).split(",")
    if name.strip()
]
top_directors = pd.Series(directors, dtype="object")
top_directors.value_counts().head(10)
# Number of years between a title's release and its addition to Netflix.
netflix["difference"] = netflix["year_added"] - netflix["release_year"]
# Density curve of that gap; missing values are left out of the plot.
release_gap = netflix["difference"].dropna()
gap_axes = sns.distplot(release_gap, color='#E50914')
gap_axes.set(
    title="DIFFERENCE BETWEEN CONTENT RELEASE AND INCLUSION ON NETFLIX",
    xlabel='DIFFERENCE',
    ylabel='DENSITY',
)
# Colours applied to the rating bars, in order.
netflix_color = [
    "#000000", "#161A1D", "#261C21", "#2B2D42", "#5F181B", "#86090B", "#D91C1F",
    "#E5383B", "#B1A7A6", "#D3D3D3", "#8D99AE", '#F5F3F4', '#FFFFFF',
]
# Horizontal bar chart with one bar per rating, sized by its number of titles.
rating_counts = netflix["rating"].value_counts()
plt.grid(False)
plt.barh(rating_counts.index, rating_counts.values, color=netflix_color)
plt.title("DISTRIBUTION OF RATINGS", fontsize=15)
plt.xlabel("COUNT")
plt.ylabel("RATING")
# Duration of every movie on the platform.
# Rows with a missing duration are skipped: a missing value is a float (NaN),
# not a string, and calling .split() on it would raise.
duration_movie = netflix.loc[netflix["type"] == "Movie", "duration"].dropna().tolist()
# The leading token of each entry (the text before the first blank) is parsed
# as an integer.
duration_movie_num = [int(str(entry).split(" ")[0]) for entry in duration_movie]
# Curve showing the distribution of movie duration on Netflix.
sns.distplot(duration_movie_num, color='#E50914').set(
    title="DISTRIBUTION OF MOVIE DURATION ON NETFLIX",
    xlabel='DURATION',
    ylabel='DENSITY',
)
# Duration of every TV Show on the platform.
# Rows with a missing duration are skipped: a missing value is a float (NaN),
# not a string, and calling .split() on it would raise.
duration_tv = netflix.loc[netflix["type"] == "TV Show", "duration"].dropna().tolist()
# The leading token of each entry (the text before the first blank) is parsed
# as an integer.
duration_tv_num = [int(str(entry).split(" ")[0]) for entry in duration_tv]
# Bar plot showing the distribution of TV Show duration on Netflix.
# The values are passed as x= explicitly; newer seaborn releases treat the
# first positional argument as `data`, not as the variable to count.
sns.countplot(
    x=duration_tv_num,
    palette=["#161A1D", "#5F181B", "#86090B", "#D91C1F", "#E5383B", "#B1A7A6", "#D3D3D3", "#8D99AE", '#F5F3F4', '#FFFFFF'],
).set(title="DISTRIBUTION OF TV SHOW DURATION ON NETFLIX", xlabel='DURATION', ylabel='COUNT')
# How frequently each country's content gets added on Netflix.
# Titles with no country are skipped so that a literal "nan" is not counted
# as a country. Names are stripped, and empty tokens (left by a leading or
# trailing comma) are ignored.
country = []
for listed in netflix["country"].dropna():
    for name in str(listed).split(","):
        name = name.strip()
        if name:
            country.append(name)
country_final = pd.Series(country, dtype="object")
country_final.value_counts()
# Load the file containing the country codes, used later to draw the map.
# NOTE(review): this is an absolute, machine-specific path, unlike the
# relative path used for the main dataset - confirm before running elsewhere.
country_codes = pd.read_csv('C:\\Users\\Dell\\Desktop\\wikipedia-iso-country-codes.csv')
country_codes = country_codes.rename(columns={'English short name lower case': 'Country'})
country_codes
# One row per country with the number of times it appears.
country_counts = country_final.value_counts()
country = pd.DataFrame({
    'Country': country_counts.index,
    'Frequency': country_counts.values,
})
# Attach the codes of the countries present in the counts; countries without
# a matching code keep their row (left join).
matching_codes = country_codes[country_codes['Country'].isin(country['Country'])]
location = pd.merge(country, matching_codes, how='left', on=['Country'])
#import relevant libraries for creating world map
import plotly.express as px
import plotly.graph_objects as go
# World map shaded by how many titles each country contributed.
choropleth_options = dict(
    locations="Alpha-3 code",
    color="Frequency",
    hover_name="Country",
    title="Production Location Frequency of the Netflix Contents",
    color_continuous_scale=px.colors.sequential.amp,
)
fig = px.choropleth(location, **choropleth_options)
fig.show()
#import relevant library for wordcloud
from wordcloud import WordCloud, STOPWORDS
# Build a single string of all descriptions.
# Descriptions are joined with a blank so the last word of one cannot fuse
# with the first word of the next; missing descriptions are skipped.
text = ' '.join(netflix['description'].dropna().astype(str))
# Create the word cloud of the descriptions.
wordcloud = WordCloud(
    background_color="black",
    width=1600,
    height=800,
    stopwords=set(STOPWORDS),
    colormap='inferno',
).generate(text)
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
#heatmap showing the correlation between different genres.
from sklearn.preprocessing import MultiLabelBinarizer
# One list of genre labels per title; blanks around the commas are removed
# before splitting.
df = pd.DataFrame()
df['genre'] = netflix['listed_in'].apply(lambda x: x.replace(' ,', ',').replace(', ', ',').split(','))
test = df['genre']
# Turn the genre lists into one 0/1 indicator column per genre.
mlb = MultiLabelBinarizer()
res = pd.DataFrame(mlb.fit_transform(test), columns=mlb.classes_, index=test.index)
corr = res.corr()
# Hide the upper triangle (diagonal included) so each pair is shown once.
# The builtin bool replaces the np.bool alias, which NumPy has removed.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
plt.figure(figsize=(16, 14))
sns.heatmap(corr, mask=mask, cmap="Reds", vmax=.5, vmin=-.5, center=0, square=True, linewidths=.7, cbar_kws={"shrink": 0.7})
plt.title("CORRELATION BETWEEN DIFFERENT GENRE", fontsize=16)
plt.show()
# Dataframe pairing each title's type with its release year.
content = netflix[['type', 'release_year']].rename(
    columns={'type': 'Type', 'release_year': 'Year'}
)
content
# Violin plot (with an inner box and all points) of release year per type,
# showing how old or new the content is.
type_colors = {'TV Show': "#000000", 'Movie': '#E50914'}
fig = px.violin(
    content,
    y="Year",
    x="Type",
    color="Type",
    box=True,
    points="all",
    hover_data=content.columns,
    title='FRESHNESS OF CONTENT ON NETFLIX',
    color_discrete_map=type_colors,
)
fig.show()
# Score each rating from 0 to 7; ratings absent from the table map to NaN.
rating_levels = {
    "TV-Y": 0, "UR": 0, "NR": 0,
    "TV-Y7": 1,
    "TV-Y7-FV": 2,
    "TV-G": 3, "G": 3,
    "TV-PG": 4, "PG": 4,
    "TV-14": 5, "PG-13": 5,
    "R": 6,
    "TV-MA": 7, "NC-17": 7,
}
netflix["rating_score"] = netflix["rating"].map(rating_levels)
# Reproducible random sample of 100 rating scores (fixed seed).
random_sample = netflix['rating_score'].sample(n=100, random_state=1)
# Import the relevant library for the z test.
import statsmodels.stats.weightstats
# One-sided z test of the sample mean against the value 2 ("larger").
statsmodels.stats.weightstats.ztest(random_sample, value=2, alternative="larger")
# Label each title "kids" when its rating score is 0, 1 or 2 and "adults"
# otherwise (this includes titles whose score is missing).
netflix["ratings_sensitivity"] = np.where(np.isin(netflix["rating_score"], [0, 1, 2]), "kids", "adults")
# Dataframe with one row per year added and the count of each label.
trend_years = np.sort(netflix['year_added'].unique())
rating_trend = pd.DataFrame({'Year': trend_years})
year_values = netflix['year_added'].to_numpy()
audience_values = netflix['ratings_sensitivity'].to_numpy()
kids_count = [
    int(np.count_nonzero((year_values == year) & (audience_values == 'kids')))
    for year in trend_years
]
adult_count = [
    int(np.count_nonzero((year_values == year) & (audience_values == 'adults')))
    for year in trend_years
]
rating_trend['Kids'] = kids_count
rating_trend['Adults'] = adult_count
# Remove rows containing missing values (such as a missing year).
plot = rating_trend.dropna()
# Line chart of both counts over the years.
plt.plot(plot['Year'], plot['Kids'], color='k')
plt.plot(plot['Year'], plot['Adults'], color='r')
plt.legend(labels=["Kids", "Adults"], loc="upper left")
plt.title("DISTRIBUTION OF RATINGS OVER THE YEAR")
plt.xlabel("YEARS")
plt.ylabel("FREQUENCY")