This is an exploratory data analysis (EDA) of the top IMDb TV shows.
import numpy as np
import pandas as pd
import os
# List every file under the input directory so the dataset path can be confirmed.
# NOTE(review): the original called os.walk on the CSV *file* itself
# ('/work/imdb_tvshows.csv'); os.walk only descends into directories, so that
# loop printed nothing. Walking the parent directory restores the intended output.
for dirname, _, filenames in os.walk('/work'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
Read the data after importing the necessary libraries.
# Load the IMDb TV-shows dataset into a DataFrame.
csv_path = "/work/imdb_tvshows.csv"
df = pd.read_csv(csv_path)
Data info
# First four rows — quick sanity check of columns and values.
df.head(4)
# Last four rows — confirms the file was read to the end.
df.tail(4)
This returns a random sample of rows.
# Random sample of rows for a quick eyeball check.
df.sample(4)
# Give the long duration column a short alias.
df.rename(columns = {'EpisodeDuration(in Minutes)':'runtime'}, inplace = True)
# (rows, columns)
df.shape
# Missing-value count per column.
df.isna().sum()
# Print the names of every column that has at least one missing value.
missing_cols = [c for c in df.columns if df[c].isna().sum() > 0]
print(missing_cols)
The features above have missing values. Imputation could be done, but since we aren't building a model, imputing those features would produce misleading insights.
# Find duplicates: count fully duplicated rows.
df.duplicated().sum()
# Drop rows with missing values (chosen here instead of imputing them).
df.dropna(inplace=True)
# Summary statistics for the numeric columns.
df.describe()
# Column dtypes and non-null counts after cleanup.
df.info()
Univariate Analysis
# --- Top 10 most-repeated show titles ---
import plotly.express as px

# Compute the title frequencies once (the original recomputed value_counts
# three times) and display them.
title_counts = df["Title"].value_counts()
title_counts
top_titles = title_counts.index[:10]
top_occurrence = title_counts.values[:10]
fig = px.bar(
    x=top_titles,
    y=top_occurrence,
    labels={"x": "Show Title", "y": "Repeated"},
    title="Shows With Same Titles",
    template="ggplot2",
    # Plotly cycles the color sequence, so one entry colors every bar green;
    # the original repeated 'green' len(df) times for no benefit.
    color_discrete_sequence=["green"],
)  # Top 10 shows
fig.show()
# Horizontal bar chart: how many shows share each runtime value.
runtime_counts = df["runtime"].value_counts()
fi = px.bar(
    y=runtime_counts.index,
    x=runtime_counts.values,
    labels={"x": "No of Movies", "y": "Runtime"},
    title="Runtime (Minutes) ",
)
# Put the longest bar on top and center the title.
fi.update_layout(yaxis=dict(autorange="reversed"), title_x=0.5)
# --- Top 6 genres, shown as a pie chart ---
# `genre` holds the full frequency table; it is reused by the word cloud below.
genre = df["Genres"].value_counts()
fig = px.pie(
    names=genre.index[:6],
    values=genre.values[:6],
    title="Top 6 Genres",
)
# Nudge the title toward the center of the figure.
fig.update_layout(title_x=0.432)
!pip install wordcloud
# --- Word cloud of genres ---
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# generate_from_frequencies needs string keys, so coerce the index to str.
# (The original also had a no-op `genre = genre` self-assignment; removed.)
genre.index=genre.index.map(str)
wordcloud = WordCloud(background_color="white").generate_from_frequencies(genre)
plt.figure(figsize = (12,8),dpi=50)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
genre
# --- Word cloud of rating values ---
# (The original header comment said "Top 6 Genres", which was copied from the
# previous cell by mistake — this block visualizes ratings.)
ratings = df["Rating"].value_counts()
# String keys are required by generate_from_frequencies.
ratings.index=ratings.index.map(str)
wordcloud = WordCloud().generate_from_frequencies(ratings)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
# --- Top 15 shows, ranked by vote count and then by rating ---
top_15 = df.sort_values(by=["Votes", "Rating"], ascending=False).head(15)
# Show only the columns of interest.
top_15.loc[:, ["Title", "Years", "Genres", "Rating", "Votes"]]
# --- Least-rated shows ---
top_voted = df.sort_values(by="Votes", ascending=False)
# Re-sort ascending by rating so the lowest-rated shows appear first.
top_voted.sort_values(by="Rating")