# EDA-Netflix: exploratory data analysis of the titles in netflix_titles.csv

# import the relevant libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# switch matplotlib over to seaborn's default styling for the plots below
sns.set()

# Read the raw titles table. The bare name on the next line only has a visible
# effect in an interactive session, where it echoes the frame.
data = pd.read_csv('netflix_titles.csv')
data

# Work on a copy so the frame loaded above is never modified.
netflix = data.copy()

# Parse "date_added" into datetimes, then derive the year and the month in
# which each title was added.
netflix["date_added"] = pd.to_datetime(netflix["date_added"])
netflix = netflix.assign(
    year_added=lambda frame: frame["date_added"].dt.year,
    month_added=lambda frame: frame["date_added"].dt.month,
)

# Bar chart with one bar per value of "type", showing how many titles fall
# under each (black and Netflix red bars).
type_count_ax = sns.countplot(
    data=netflix,
    x="type",
    palette=["#000000", '#E50914'],
)
type_count_ax.set(title='COUNT OF TV SHOWS AND MOVIES ON NETFLIX')

# Yearly trend of movies and TV shows added to Netflix.

# One row per year in chronological order. Titles without a "year_added" are
# left out: NaN never compares equal to anything, so a NaN row could only ever
# hold zero counts and would put a non-year on the x axis.
type_trend = pd.DataFrame(
    {'Year': np.sort(netflix['year_added'].dropna().unique())}
)

# Per-year frequencies for each content type.
is_movie = netflix['type'] == 'Movie'
is_tv_show = netflix['type'] == 'TV Show'
movie_count = [
    int(((netflix['year_added'] == year) & is_movie).sum())
    for year in type_trend['Year']
]
tvshow_count = [
    int(((netflix['year_added'] == year) & is_tv_show).sum())
    for year in type_trend['Year']
]

# Add the counts as columns next to the year.
type_trend['Tv show'] = tvshow_count
type_trend['movie'] = movie_count
plot = type_trend
plot

# Stacked area chart: TV shows at the bottom (black), movies on top (red).
plt.stackplot(
    plot['Year'],
    plot['Tv show'],
    plot['movie'],
    colors=["#000000", '#E50914'],
)
plt.legend(labels=['TV Show', 'Movies'], loc="upper left")
# The keyword must be lower-case `fontsize`; upper-case `FONTSIZE` is not a
# valid text property.
plt.title("YEARLY TREND OF MOVIES AND TV SHOWS ON NETFLIX", fontsize=15)
plt.xlabel("YEARS")
plt.ylabel("FREQUENCY")

# Top 5 TV show genres by number of appearances.
# Each "listed_in" entry is split on commas and every piece has its leading
# whitespace removed before it is tallied.
tv_genre = [
    genre.lstrip()
    for listing in netflix.loc[netflix["type"] == "TV Show", "listed_in"]
    for genre in str(listing).split(",")
]
top_tv_genre = pd.Series(tv_genre)
top_tv_genre.value_counts().head(5)

# Top 5 movie genres by number of appearances.
# Each "listed_in" entry is split on commas and every piece has its leading
# whitespace removed before it is tallied.
movie_genre = [
    genre.lstrip()
    for listing in netflix.loc[netflix["type"] == "Movie", "listed_in"]
    for genre in str(listing).split(",")
]
top_movie_genre = pd.Series(movie_genre)
top_movie_genre.value_counts().head(5)

# Top 10 actors by number of appearances.
# Titles with no cast are skipped; otherwise str() turns the missing value into
# the literal "nan", which then gets counted as if it were an actor. Names are
# stripped so that a name following a comma (" Jane Doe") is tallied together
# with the same name at the start of a list ("Jane Doe"). Empty pieces left by
# stray commas are ignored.
actors = [
    name.strip()
    for cast in netflix["cast"].dropna()
    for name in str(cast).split(",")
    if name.strip()
]
top_actors = pd.Series(actors)
top_actors.value_counts().head(10)

# Top 10 directors by number of appearances.
# Titles with no director are skipped; otherwise str() turns the missing value
# into the literal "nan", which then gets counted as if it were a director.
# Names are stripped so that a name following a comma (" Jane Doe") is tallied
# together with the same name at the start of a list ("Jane Doe"). Empty pieces
# left by stray commas are ignored.
directors = [
    name.strip()
    for credit in netflix["director"].dropna()
    for name in str(credit).split(",")
    if name.strip()
]
top_directors = pd.Series(directors)
top_directors.value_counts().head(10)

# Number of years between a title's release and the year it was added.
netflix["difference"] = netflix["year_added"].sub(netflix["release_year"])

# Density curve of that gap; rows where the gap is missing are dropped first.
# NOTE(review): seaborn has deprecated `distplot`; kept here so the figure is
# unchanged.
difference_ax = sns.distplot(netflix["difference"].dropna(), color='#E50914')
difference_ax.set(
    title="DIFFERENCE BETWEEN CONTENT RELEASE AND INCLUSION ON NETFLIX",
    xlabel='DIFFERENCE',
    ylabel='DENSITY',
)

# Colour palette used for the rating bars below.
netflix_color = [
    "#000000", "#161A1D", "#261C21", "#2B2D42", "#5F181B", "#86090B",
    "#D91C1F", "#E5383B", "#B1A7A6", "#D3D3D3", "#8D99AE", '#F5F3F4',
    '#FFFFFF',
]

# Horizontal bar chart of how often each content rating occurs.
rating_counts = netflix["rating"].value_counts()
plt.grid(False)
plt.barh(rating_counts.index, rating_counts.values, color=netflix_color)
plt.title("DISTRIBUTION OF RATINGS", fontsize=15)
plt.xlabel("COUNT")
plt.ylabel("RATING")

# Raw "duration" value of every title whose type is "Movie".
duration_movie = [
    length
    for kind, length in zip(netflix["type"], netflix["duration"])
    if kind == "Movie"
]
# Keep only the integer that precedes the first space of each duration.
# NOTE(review): assumes every movie has a string duration - a missing value
# would fail on `.split`; confirm against the data.
duration_movie_num = [int(length.split(" ")[0]) for length in duration_movie]

# Density curve of the movie durations.
movie_duration_ax = sns.distplot(duration_movie_num, color='#E50914')
movie_duration_ax.set(
    title="DISTRIBUTION OF MOVIE DURATION ON NETFLIX",
    xlabel='DURATION',
    ylabel='DENSITY',
)

# Raw "duration" value of every title whose type is "TV Show".
duration_tv = [
    length
    for kind, length in zip(netflix["type"], netflix["duration"])
    if kind == "TV Show"
]
# Keep only the integer that precedes the first space of each duration.
duration_tv_num = [int(length.split(" ")[0]) for length in duration_tv]

# Bar plot of how many TV shows have each duration value.
# The values are passed as `x=` explicitly: the first positional parameter of
# `countplot` is `data`, so a bare positional list is not read as the x
# variable.
tv_duration_ax = sns.countplot(
    x=duration_tv_num,
    palette=[
        "#161A1D", "#5F181B", "#86090B", "#D91C1F", "#E5383B",
        "#B1A7A6", "#D3D3D3", "#8D99AE", '#F5F3F4', '#FFFFFF',
    ],
)
tv_duration_ax.set(
    title="DISTRIBUTION OF TV SHOW DURATION ON NETFLIX",
    xlabel='DURATION',
    ylabel='COUNT',
)

# How often each country appears in the "country" column.
# Entries are split on commas and each piece loses its leading whitespace.
# Every entry goes through str(), so a missing value is tallied under the
# literal "nan".
country = [
    name.lstrip()
    for entry in netflix["country"]
    for name in str(entry).split(",")
]
country_final = pd.Series(country)
country_final.value_counts()

# Load the table of country codes that is later used to draw the map, and
# rename its country-name column to 'Country'.
# NOTE(review): the path is absolute and specific to one Windows machine.
country_codes = pd.read_csv(
    'C:\\Users\\Dell\\Desktop\\wikipedia-iso-country-codes.csv'
).rename(columns={'English short name lower case': 'Country'})
country_codes

# Two-column frame: each country name and how many times it was counted.
country_counts = country_final.value_counts()
country = pd.DataFrame(
    {'Country': country_counts.index, 'Frequency': country_counts.values}
)

# Attach the country codes. Only code rows whose country was counted are
# offered to the merge, and the left join keeps every counted country even
# when no code row matches it.
counted_codes = country_codes[country_codes['Country'].isin(country['Country'])]
location = country.merge(counted_codes, how='left', on=['Country'])

# plotly is used for the interactive world map
import plotly.express as px
import plotly.graph_objects as go

# World map coloured by how often each country was counted; countries are
# located through the "Alpha-3 code" column of `location`.
choropleth_options = dict(
    locations="Alpha-3 code",
    color="Frequency",
    hover_name="Country",
    title="Production Location Frequency of the Netflix Contents",
    color_continuous_scale=px.colors.sequential.amp,
)
fig = px.choropleth(location, **choropleth_options)
fig.show()

# library for the word cloud
from wordcloud import WordCloud, STOPWORDS

# Build one string from all descriptions. They are joined with a space so the
# last word of one description is not fused with the first word of the next,
# and missing descriptions are skipped instead of raising on concatenation.
text = " ".join(netflix['description'].dropna().astype(str))

# Word cloud of the descriptions, ignoring the library's stop words.
wordcloud = WordCloud(
    background_color="black",
    width=1600,
    height=800,
    stopwords=set(STOPWORDS),
    colormap='inferno',
).generate(text)
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# Heatmap of the correlation between genres.
from sklearn.preprocessing import MultiLabelBinarizer

# Turn each "listed_in" entry into a list of genre names, normalising the
# spacing around commas before splitting.
df = pd.DataFrame()
df['genre'] = netflix['listed_in'].apply(
    lambda x: x.replace(' ,', ',').replace(', ', ',').split(',')
)
test = df['genre']

# One 0/1 indicator column per genre, one row per title.
mlb = MultiLabelBinarizer()
res = pd.DataFrame(mlb.fit_transform(test), columns=mlb.classes_, index=test.index)
corr = res.corr()

# Hide the upper triangle (diagonal included), since the matrix is symmetric.
# The builtin `bool` is used as the dtype because the `np.bool` alias has been
# removed from NumPy.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

plt.figure(figsize=(16, 14))
sns.heatmap(
    corr,
    mask=mask,
    cmap="Reds",
    vmax=.5,
    vmin=-.5,
    center=0,
    square=True,
    linewidths=.7,
    cbar_kws={"shrink": 0.7},
)
plt.title("CORRELATION BETWEEN DIFFERENT GENRE", fontsize=16)
plt.show()

# Two-column frame: "Type" (from "type") and "Year" (from "release_year").
content = netflix[['type', 'release_year']].rename(
    columns={'type': 'Type', 'release_year': 'Year'}
)
content

# Violin plot, with an inner box plot and every point drawn, of the release
# year per content type - i.e. how old or new the catalogue is.
fig = px.violin(
    content,
    y="Year",
    x="Type",
    color="Type",
    box=True,
    points="all",
    hover_data=content.columns,
    title='FRESHNESS OF CONTENT ON NETFLIX',
    color_discrete_map={'TV Show': "#000000", 'Movie': '#E50914'},
)
fig.show()

# Score each rating from 0 to 7, youngest audience to most mature.
# Ratings are grouped by the score they receive; any rating not listed here
# maps to NaN.
ratings_by_score = {
    0: ("TV-Y", "UR", "NR"),
    1: ("TV-Y7",),
    2: ("TV-Y7-FV",),
    3: ("TV-G", "G"),
    4: ("TV-PG", "PG"),
    5: ("TV-14", "PG-13"),
    6: ("R",),
    7: ("TV-MA", "NC-17"),
}
score_by_rating = {
    rating: score
    for score, ratings in ratings_by_score.items()
    for rating in ratings
}
netflix["rating_score"] = netflix["rating"].map(score_by_rating)

# library providing the z test
import statsmodels.stats.weightstats

# Reproducible random sample of 100 rating scores (fixed seed).
# NOTE(review): the sample is drawn from the whole column, so it may contain
# NaN scores - confirm that is intended before relying on the statistic.
random_sample = netflix['rating_score'].sample(n=100, random_state=1)

# One-sample z test of the sample mean against 2, with the alternative that
# the mean is larger. The returned pair is only echoed in an interactive
# session.
z_test = statsmodels.stats.weightstats.ztest
z_test(random_sample, value=2, alternative="larger")

# Label every title "kids" when its rating score is 0, 1 or 2, and "adults"
# otherwise (titles without a score therefore fall under "adults").
netflix["ratings_sensitivity"] = np.where(
    netflix["rating_score"].isin([0, 1, 2]), "kids", "adults"
)

# One row per distinct "year_added" value, in sorted order.
years_added = np.sort(netflix['year_added'].unique())
rating_trend = pd.DataFrame({'Year': years_added})

# Count the kids and adults titles added in each year.
year_values = netflix['year_added'].to_numpy()
audience_values = netflix['ratings_sensitivity'].to_numpy()
kids_count = [
    int(np.count_nonzero((year_values == year) & (audience_values == 'kids')))
    for year in years_added
]
adult_count = [
    int(np.count_nonzero((year_values == year) & (audience_values == 'adults')))
    for year in years_added
]
rating_trend['Kids'] = kids_count
rating_trend['Adults'] = adult_count

# Drop the row whose year is missing before plotting.
plot = rating_trend.dropna()

# Line chart of both audiences over the years: kids in black, adults in red.
plt.plot(plot['Year'], plot['Kids'], color='k')
plt.plot(plot['Year'], plot['Adults'], color='r')
plt.legend(labels=["Kids", "Adults"], loc="upper left")
plt.title("DISTRIBUTION OF RATINGS OVER THE YEAR")
plt.xlabel("YEARS")
plt.ylabel("FREQUENCY")