Visualizing Netflix Data

import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from matplotlib import style style.use('fivethirtyeight')

df = pd.read_csv('/work/netflix_titles.csv') df.head()

#Calculating the missing data for i in df.columns: null_rate = df[i].isna().sum()/len(df) * 100 if null_rate > 0 : print("{} null rate: {}%".format(i,round(null_rate,2)))

df['country'] = df['country'].fillna(df['country'].mode()[0]) df['cast'].replace(np.nan,'No data',inplace=True) df['director'].replace(np.nan,'No data',inplace=True) df.dropna(inplace=True) df.drop_duplicates(inplace=True)

df['date_added'] = pd.to_datetime(df['date_added']) df['month_added'] = df['date_added'].dt.month df['month_name_added'] = df['date_added'].dt.month_name() df['year_added'] = df['date_added'].dt.year

df.head()

sns.palplot(['#221f1f', '#b20710', '#e50914','#f5f5f1']) plt.title("Netflix brand palette",loc='left',fontfamily='serif',fontsize=15,y=1.2) plt.show()

#Calculating the ratio x = df.groupby(['type'])['type'].count() y = len(df) r=((x/y)).round(2) mf_ratio = pd.DataFrame(r).T

mf_ratio

#Drawing the figure: fig, ax = plt.subplots(1,1,figsize=(6.5,2.5)) ax.barh(mf_ratio.index, mf_ratio['Movie'], color='#b20710', alpha=0.9, label='Male') ax.barh(mf_ratio.index, mf_ratio['TV Show'], left=mf_ratio['Movie'], color='#221f1f', alpha=0.9, label='Female') ax.set_xlim(0, 1) ax.set_xticks([]) ax.set_yticks([]) plt.show()

#Annotating the figure: fig, ax = plt.subplots(1,1,figsize=(6.5,2.5)) ax.barh(mf_ratio.index, mf_ratio['Movie'], color='#b20710', alpha=0.9, label='Male') ax.barh(mf_ratio.index, mf_ratio['TV Show'], left=mf_ratio['Movie'], color='#221f1f', alpha=0.9, label='Female') ax.set_xlim(0, 1) ax.set_xticks([]) ax.set_yticks([]) #annotating code starts here for i in mf_ratio.index: ax.annotate(f"{int(mf_ratio['Movie'][i]*100)}%", xy=(mf_ratio['Movie'][i]/2, i), va = 'center', ha='center',fontsize=40, fontweight='light', fontfamily='serif', color='white') ax.annotate("Movie", xy=(mf_ratio['Movie'][i]/2, -0.25), va = 'center', ha='center',fontsize=15, fontweight='light', fontfamily='serif', color='white') for i in mf_ratio.index: ax.annotate(f"{int(mf_ratio['TV Show'][i]*100)}%", xy=(mf_ratio['Movie'][i]+mf_ratio['TV Show'][i]/2,i), va = 'center', ha='center',fontsize=40, fontweight='light', fontfamily='serif', color='white') ax.annotate("TV Shows", xy=(mf_ratio['Movie'][i]+mf_ratio['TV Show'][i]/2, -0.25), va = 'center', ha='center',fontsize=15, fontweight='light', fontfamily='serif', color='white') plt.show()

#Adding text and removing legend & spines: fig, ax = plt.subplots(1,1,figsize=(6.5,2.5)) ax.barh(mf_ratio.index, mf_ratio['Movie'], color='#b20710', alpha=0.9, label='Male') ax.barh(mf_ratio.index, mf_ratio['TV Show'], left=mf_ratio['Movie'], color='#221f1f', alpha=0.9, label='Female') ax.set_xlim(0, 1) ax.set_xticks([]) ax.set_yticks([]) #Annontate code for i in mf_ratio.index: ax.annotate(f"{int(mf_ratio['Movie'][i]*100)}%", xy=(mf_ratio['Movie'][i]/2, i), va = 'center', ha='center',fontsize=40, fontweight='light', fontfamily='serif', color='white') ax.annotate("Movie", xy=(mf_ratio['Movie'][i]/2, -0.25), va = 'center', ha='center',fontsize=15, fontweight='light', fontfamily='serif', color='white') for i in mf_ratio.index: ax.annotate(f"{int(mf_ratio['TV Show'][i]*100)}%", xy=(mf_ratio['Movie'][i]+mf_ratio['TV Show'][i]/2,i), va = 'center', ha='center',fontsize=40, fontweight='light', fontfamily='serif', color='white') ax.annotate("TV Shows", xy=(mf_ratio['Movie'][i]+mf_ratio['TV Show'][i]/2, -0.25), va = 'center', ha='center',fontsize=15, fontweight='light', fontfamily='serif', color='white') #Adding text and removing spines and legend fig.text(0.125,1.0,'Movie & TV Show distribution',fontfamily='serif',fontsize=15,fontweight='bold') fig.text(0.125,0.90,'We see vastly more movies than TV shows on Netflix.',fontfamily='serif',fontsize=12,fontweight='light') for s in ['top','left','right','bottom']: ax.spines[s].set_visible(False) ax.legend().set_visible(False) plt.show()

#Initializing the timeline list: from datetime import datetime tl_dates = [ "1997nFounded", "1998nMail Services", "2003nGoes Public", "2007nStreming service", "2016nGoes Global", "2021nNetflix & Chill" ] tl_x = [1,2,4,5.3,8,9]

df.head()

#Drawing the figure : fig,ax = plt.subplots(figsize=(15,4),constrained_layout=True) ax.set_ylim(-2,1.5) ax.set_xlim(0,10) ax.axhline(0, xmin=0.1, xmax=0.9,c="#000000",zorder=1) ax.scatter(tl_x,np.zeros(len(tl_x)),s=120,c="#4a4a4a",zorder=2) ax.scatter(tl_x, np.zeros(len(tl_x)), s=30, c='#fafafa', zorder=3) for x, date in zip(tl_x, tl_dates): ax.text(x, -0.55, date, ha='center', fontfamily='serif', fontweight='bold', color='#4a4a4a',fontsize=12) for spine in ["left", "top", "right", "bottom"]: ax.spines[spine].set_visible(False) ax.set_xticks([]) ax.set_yticks([]) ax.set_title("Netflix through the years", fontweight="bold", fontfamily='serif', fontsize=16, color='#4a4a4a') plt.show()

df['country']

df['first_country'] = df['country'].apply(lambda x: x.split(",")[0]) df['first_country']

#Now we will replace some of the country names with their short form. df['first_country'].replace('United States', 'USA', inplace=True) df['first_country'].replace('United Kingdom', 'UK',inplace=True) df['first_country'].replace('South Korea', 'S. Korea',inplace=True)

#After that, we calculate the total occurrence of each country. df['count']=1 #helper column data = df.groupby('first_country')['count'].sum().sort_values(ascending=False)[:10]

data

#Drawing the figure color_map = ['#f5f5f1' for _ in range(10)] color_map[0] = color_map[1]= color_map[2] = '#b20710' fig,ax = plt.subplots(1,1,figsize=(14,7)) #Annotating the figure ax.bar(data.index,data,width=0.5,edgecolor='darkgray',linewidth=0.6,color=color_map) for i in data.index: ax.annotate(f"{data[i]}",xy=(i,data[i]+100),va='center',ha='center',fontweight='light',fontfamily='serif') for s in ['top','left','right']: ax.spines[s].set_visible(False) #Adding text fig.text(0.125,1,'Top 10 countries on Netflix',fontsize=15,fontweight='bold',fontfamily='serif') fig.text(0.125,0.95,'The three most frequent countries have been highlighted.',fontsize=16,fontweight='light',fontfamily='serif') fig.text(1.1, 1.01, 'Insight', fontsize=15, fontweight='bold', fontfamily='serif') fig.text(1.1, 0.67, ''' Here we see that US is major content producers for Netflix, and on second we have India after UK and so on. Netflix being a US Company, it makes sense that they major producers. ''' , fontsize=16, fontweight='light', fontfamily='serif') ax.grid(axis='y', linestyle='-', alpha=0.5)

!pip install wordcloud==1.8.1

#Importing necessary libraries: from wordcloud import WordCloud import random from PIL import Image import matplotlib #import matplotlib.cm as cm from matplotlib import cm

#Creating a word cloud and displaying it: paintcmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ['#221f1f', '#b20710']) text = str(list(df['title'])).replace(',', '').replace('[', '').replace("'", '').replace(']', '').replace('.', '') mask = np.array(Image.open('mask.png')) wordcloud = WordCloud(background_color = 'white', width = 500, height = 200,colormap='Dark2_r', max_words = 150, mask = mask).generate(text) plt.figure( figsize=(5,5)) plt.imshow(wordcloud, interpolation = 'bilinear') plt.axis('off') plt.tight_layout(pad=0) plt.show()