import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import style
# Apply the 'fivethirtyeight' matplotlib style to every figure created below.
plt.style.use('fivethirtyeight')

# Read the titles CSV into the working frame and peek at the first rows.
df = pd.read_csv('/work/netflix_titles.csv')
df.head()
# Calculating the missing data: percentage of null cells per column,
# printed only for the columns that actually contain nulls.
null_rates = df.isna().sum() / len(df) * 100
for column, rate in null_rates[null_rates > 0].items():
    print(f"{column} null rate: {round(rate, 2)}%")
# Clean the frame and derive calendar features from `date_added`.
#
# Missing countries are filled with the most frequent country; missing cast
# and director values get a "No data" placeholder.
df['country'] = df['country'].fillna(df['country'].mode()[0])
# Assign the filled column back instead of calling
# `df[col].replace(..., inplace=True)`: that chained in-place call operates on
# a temporary Series, emits a FutureWarning on pandas 2.x and leaves `df`
# unchanged under Copy-on-Write.
for placeholder_column in ('cast', 'director'):
    df[placeholder_column] = df[placeholder_column].fillna('No data')

# Any row still missing a value in another column is dropped, then duplicates.
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# NOTE(review): stripping is a no-op on clean data; it guards against stray
# leading/trailing spaces that would break strict format inference in
# `pd.to_datetime`. Assumes `date_added` holds strings — confirm.
df['date_added'] = pd.to_datetime(df['date_added'].str.strip())
added = df['date_added'].dt
df['month_added'] = added.month
df['month_name_added'] = added.month_name()
df['year_added'] = added.year
df.head()
# Preview the four colours used by the charts below as a swatch strip.
brand_palette = ['#221f1f', '#b20710', '#e50914', '#f5f5f1']
sns.palplot(brand_palette)
plt.title(
    "Netflix brand palette",
    loc='left',
    fontfamily='serif',
    fontsize=15,
    y=1.2,
)
plt.show()
# Calculating the ratio: share of rows per `type`, rounded to two decimals
# and laid out as a single-row frame with one column per type.
type_counts = df.groupby(['type'])['type'].count()
type_share = (type_counts / len(df)).round(2)
mf_ratio = type_share.to_frame().T
mf_ratio
# Drawing the figure: one horizontal bar split into the Movie share (left)
# and the TV Show share (stacked to its right).
fig, ax = plt.subplots(1, 1, figsize=(6.5, 2.5))
movie_share = mf_ratio['Movie']
tv_share = mf_ratio['TV Show']
# Labels name the plotted categories so any legend built from them is accurate.
ax.barh(mf_ratio.index, movie_share,
        color='#b20710', alpha=0.9, label='Movie')
ax.barh(mf_ratio.index, tv_share, left=movie_share,
        color='#221f1f', alpha=0.9, label='TV Show')
ax.set_xlim(0, 1)  # shares are fractions of the whole
ax.set_xticks([])
ax.set_yticks([])
plt.show()
# Annotating the figure: the same split bar, with each segment's percentage
# and category caption drawn inside it.
fig, ax = plt.subplots(1, 1, figsize=(6.5, 2.5))
# Labels name the plotted categories so any legend built from them is accurate.
ax.barh(mf_ratio.index, mf_ratio['Movie'],
        color='#b20710', alpha=0.9, label='Movie')
ax.barh(mf_ratio.index, mf_ratio['TV Show'], left=mf_ratio['Movie'],
        color='#221f1f', alpha=0.9, label='TV Show')
ax.set_xlim(0, 1)
ax.set_xticks([])
ax.set_yticks([])

# Annotating code starts here.
text_style = dict(va='center', ha='center', fontweight='light',
                  fontfamily='serif', color='white')
for row in mf_ratio.index:
    movie_share = mf_ratio.loc[row, 'Movie']
    tv_share = mf_ratio.loc[row, 'TV Show']
    # (caption, share, x coordinate of the segment's centre)
    segments = [
        ("Movie", movie_share, movie_share / 2),
        ("TV Shows", tv_share, movie_share + tv_share / 2),
    ]
    for caption, share, x_center in segments:
        # `.0%` rounds to the nearest percent. int(share * 100) truncates, so
        # 0.29 would be shown as "28%" (0.29 * 100 == 28.999999999999996).
        ax.annotate(f"{share:.0%}", xy=(x_center, row),
                    fontsize=40, **text_style)
        ax.annotate(caption, xy=(x_center, -0.25),
                    fontsize=15, **text_style)
plt.show()
# Adding text and removing legend & spines: final version of the split bar,
# with in-bar annotations, a title, a subtitle and no axes decoration.
fig, ax = plt.subplots(1, 1, figsize=(6.5, 2.5))
# Labels name the plotted categories so the (hidden) legend is accurate.
ax.barh(mf_ratio.index, mf_ratio['Movie'],
        color='#b20710', alpha=0.9, label='Movie')
ax.barh(mf_ratio.index, mf_ratio['TV Show'], left=mf_ratio['Movie'],
        color='#221f1f', alpha=0.9, label='TV Show')
ax.set_xlim(0, 1)
ax.set_xticks([])
ax.set_yticks([])

# Annotate code
text_style = dict(va='center', ha='center', fontweight='light',
                  fontfamily='serif', color='white')
for row in mf_ratio.index:
    movie_share = mf_ratio.loc[row, 'Movie']
    tv_share = mf_ratio.loc[row, 'TV Show']
    # (caption, share, x coordinate of the segment's centre)
    segments = [
        ("Movie", movie_share, movie_share / 2),
        ("TV Shows", tv_share, movie_share + tv_share / 2),
    ]
    for caption, share, x_center in segments:
        # `.0%` rounds to the nearest percent. int(share * 100) truncates, so
        # 0.29 would be shown as "28%" (0.29 * 100 == 28.999999999999996).
        ax.annotate(f"{share:.0%}", xy=(x_center, row),
                    fontsize=40, **text_style)
        ax.annotate(caption, xy=(x_center, -0.25),
                    fontsize=15, **text_style)

# Adding text and removing spines and legend
fig.text(0.125, 1.0, 'Movie & TV Show distribution',
         fontfamily='serif', fontsize=15, fontweight='bold')
fig.text(0.125, 0.90, 'We see vastly more movies than TV shows on Netflix.',
         fontfamily='serif', fontsize=12, fontweight='light')
for s in ['top', 'left', 'right', 'bottom']:
    ax.spines[s].set_visible(False)
ax.legend().set_visible(False)
plt.show()
#Initializing the timeline list:
from datetime import datetime
# Timeline captions and their x positions on the 0-10 axis used by the
# timeline figure. Each caption is "year\nevent": the newline puts the year
# on its own line above the event text.
tl_dates = [
    "1997\nFounded",
    "1998\nMail Services",
    "2003\nGoes Public",
    "2007\nStreaming service",
    "2016\nGoes Global",
    "2021\nNetflix & Chill",
]
# One x coordinate per caption, in the same order as `tl_dates`.
tl_x = [1, 2, 4, 5.3, 8, 9]
df.head()
# Drawing the figure: a horizontal timeline with one marker and one caption
# per entry of `tl_x` / `tl_dates`.
fig, ax = plt.subplots(figsize=(15, 4), constrained_layout=True)
ax.set_ylim(-2, 1.5)
ax.set_xlim(0, 10)

# Baseline spanning the middle 80% of the axes width.
ax.axhline(0, xmin=0.1, xmax=0.9, c="#000000", zorder=1)

# Each marker is a large dark dot with a small light dot drawn on top of it.
baseline_y = np.zeros(len(tl_x))
ax.scatter(tl_x, baseline_y, s=120, c="#4a4a4a", zorder=2)
ax.scatter(tl_x, np.zeros(len(tl_x)), s=30, c='#fafafa', zorder=3)

# Captions sit below the baseline, centred under their marker.
for marker_x, caption in zip(tl_x, tl_dates):
    ax.text(
        marker_x, -0.55, caption,
        ha='center',
        fontfamily='serif',
        fontweight='bold',
        color='#4a4a4a',
        fontsize=12,
    )

# Strip the frame and ticks so only the timeline itself remains.
for side in ("left", "top", "right", "bottom"):
    ax.spines[side].set_visible(False)
ax.set_xticks([])
ax.set_yticks([])

ax.set_title(
    "Netflix through the years",
    fontweight="bold",
    fontfamily='serif',
    fontsize=16,
    color='#4a4a4a',
)
plt.show()
df['country']
# `country` can list several comma-separated names; keep only the first one.
df['first_country'] = df['country'].str.split(',').str[0]
df['first_country']
# Now we replace some of the country names with their short form.
# The result is assigned back to the column: calling
# `df[col].replace(..., inplace=True)` mutates a temporary Series, which warns
# on pandas 2.x and does not change `df` under Copy-on-Write.
short_names = {
    'United States': 'USA',
    'United Kingdom': 'UK',
    'South Korea': 'S. Korea',
}
df['first_country'] = df['first_country'].replace(short_names)
# After that, we calculate the total occurrence of each country and keep the
# ten most frequent ones.
df['count'] = 1  # helper column
data = (
    df.groupby('first_country')['count']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)
data
# Drawing the figure: bar chart of the ten most frequent first countries,
# with the first three bars highlighted in red.
color_map = ['#f5f5f1' for _ in range(10)]
color_map[0] = color_map[1] = color_map[2] = '#b20710'
fig, ax = plt.subplots(1, 1, figsize=(14, 7))
ax.bar(data.index, data, width=0.5, edgecolor='darkgray', linewidth=0.6, color=color_map)

# Annotating the figure: write each total just above its bar.
for country, total in data.items():
    ax.annotate(f"{total}", xy=(country, total + 100), va='center', ha='center',
                fontweight='light', fontfamily='serif')
for s in ['top', 'left', 'right']:
    ax.spines[s].set_visible(False)

# Adding text: title, subtitle and an "Insight" panel placed to the right of
# the axes (x > 1 in figure coordinates).
fig.text(0.125, 1, 'Top 10 countries on Netflix', fontsize=15, fontweight='bold', fontfamily='serif')
fig.text(0.125, 0.95, 'The three most frequent countries have been highlighted.', fontsize=16, fontweight='light', fontfamily='serif')
fig.text(1.1, 1.01, 'Insight', fontsize=15, fontweight='bold', fontfamily='serif')
fig.text(1.1, 0.67, '''
Here we see that US is major content
producers for Netflix, and on second
we have India after UK and so on.
Netflix being a US Company, it makes
sense that they major producers.
'''
         , fontsize=16, fontweight='light', fontfamily='serif')
ax.grid(axis='y', linestyle='-', alpha=0.5)
# Render this figure on its own, like every other figure in the file;
# without this call it is only drawn when a later plt.show() runs.
plt.show()
# Install the pinned wordcloud release into the interpreter running this code.
# `!pip ...` is IPython-only syntax (a SyntaxError in plain Python), so pip is
# invoked through the current interpreter instead. check=False keeps the
# install best-effort: a failed install does not abort the run here.
import subprocess
import sys

subprocess.run(
    [sys.executable, "-m", "pip", "install", "wordcloud==1.8.1"],
    check=False,
)
#Importing necessary libraries:
from wordcloud import WordCloud
import random
from PIL import Image
import matplotlib
#import matplotlib.cm as cm
from matplotlib import cm
# Creating a word cloud from all titles and displaying it.
# NOTE(review): `paintcmap` is built but not passed to WordCloud below, which
# uses the 'Dark2_r' colormap — confirm which palette is intended.
paintcmap = matplotlib.colors.LinearSegmentedColormap.from_list("", ['#221f1f', '#b20710'])

# Join the raw titles with spaces and drop the same punctuation as before
# (, [ ] ' .). Building the text from str(list(...)) would also leak repr
# artefacts into the cloud input: double-quote delimiters around titles that
# contain an apostrophe, and backslash escapes for non-printable characters.
strip_chars = str.maketrans('', '', ",[]'.")
text = ' '.join(df['title'].astype(str)).translate(strip_chars)

# Load the mask image and close the file handle as soon as it is converted.
with Image.open('mask.png') as mask_image:
    mask = np.array(mask_image)

# NOTE(review): per the wordcloud docs, width/height are ignored when a mask
# is supplied; they are kept here unchanged.
wordcloud = WordCloud(
    background_color='white',
    width=500,
    height=200,
    colormap='Dark2_r',
    max_words=150,
    mask=mask,
).generate(text)

plt.figure(figsize=(5, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()