Exploratory Data Analysis for Hollywood's Most Profitable Stories
Overview of the Dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import plotly.express as px
# Let's import the project's cover image
from PIL import Image
# Display the project's cover image. Opening it in a `with` block makes sure
# the underlying image file is closed when the block exits.
with Image.open("imagen_hollywood.jpg") as image:
    image.show()
# Read the raw CSV into the DataFrame that the rest of the analysis works on.
df = pd.read_csv('/work/HollywoodsMostProfitableStories.csv')

# First look at the data: leading rows, (rows, columns), dtypes and
# non-null counts.
df.head(n=5)
df.shape
df.info()

# Per-column data quality: number of missing entries and of distinct values.
df.isna().sum()
df.nunique()

# Summary statistics produced by pandas' default describe().
df.describe()
Missing Values
# Show the full rows that have no value in each of these three columns.
df.loc[df['Lead Studio'].isna()]
df.loc[df['Audience score %'].isna()]
df.loc[df['Profitability'].isna()]
Analysis
Genre
# How many films fall into each genre: absolute counts, then proportions.
df['Genre'].value_counts(normalize=False)
df['Genre'].value_counts(normalize=True)

# One bar per genre (each in its own colour), ordered from the most to the
# least frequent genre.
fig = px.histogram(data_frame=df, x='Genre', color='Genre', title='Genre')
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()
Lead Studio
# How many films each lead studio has: absolute counts, then proportions.
df['Lead Studio'].value_counts(normalize=False)
df['Lead Studio'].value_counts(normalize=True)

# One bar per studio, ordered from the most to the least frequent studio.
fig = px.histogram(data_frame=df, x='Lead Studio', title='Lead Studios')
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()
Audience Score %
# Summary statistics of the audience score column.
df['Audience score %'].describe()

# Distribution of audience scores, with a box plot drawn in the margin.
fig = px.histogram(
    data_frame=df,
    x='Audience score %',
    marginal='box',
    title='Percentage of Audience Score %',
)
fig.show()
Profitability
# Summary statistics of the profitability column.
df['Profitability'].describe()

# Distribution of profitability, with a box plot drawn in the margin.
# hover_data names the extra columns directly instead of passing a DataFrame
# slice, so the call relies on column names only.
fig = px.histogram(
    df,
    x='Profitability',
    title='Profitability of Films',
    marginal='box',
    hover_data=['Film', 'Genre', 'Year'],
)
fig.show()
Rotten Tomatoes %
# Summary statistics of the Rotten Tomatoes score column.
df['Rotten Tomatoes %'].describe()

# Distribution of Rotten Tomatoes scores, with a box plot drawn in the margin.
# Both x and hover_data refer to columns of df by name (instead of passing a
# Series and a DataFrame slice), like the other histograms in this analysis.
fig = px.histogram(
    df,
    x='Rotten Tomatoes %',
    title='Rotten Tomatoes Scores',
    marginal='box',
    hover_data=['Film', 'Genre', 'Year'],
)
fig.show()
Worldwide Gross
# Summary statistics of the worldwide gross column.
df['Worldwide Gross'].describe()

# Distribution of worldwide gross, with a box plot drawn in the margin.
# hover_data names the extra columns directly instead of passing a DataFrame
# slice, so the call relies on column names only.
fig = px.histogram(
    df,
    x='Worldwide Gross',
    title='Worldwide Gross of Movies',
    marginal='box',
    hover_data=['Film', 'Genre', 'Year'],
)
fig.show()
Correlations
# The four numeric columns compared against each other in this section.
numeric_columns = ['Audience score %', 'Profitability', 'Rotten Tomatoes %', 'Worldwide Gross']

# Pairwise correlation table for those columns.
df[numeric_columns].corr()

# Scatter plot for every pair of those columns, coloured by genre; hovering a
# point also shows the film and its lead studio.
fig = px.scatter_matrix(
    df,
    dimensions=numeric_columns,
    color='Genre',
    hover_data=['Film', 'Lead Studio'],
    title='Correlations',
    opacity=0.6,
    width=800,
    height=800,
)
fig.show()
Behavior Over Time
Genre by Year
# Number of films per (year, genre). Rows in a group always have a non-null
# Genre, so the group size equals the count of the Genre column. Year is moved
# out of the index into a column; Genre stays as the index.
genre_by_year = (
    df.groupby(['Year', 'Genre'])
    .size()
    .rename('Genre Count')
    .reset_index(level='Year')
)
genre_by_year

# Films per year, coloured by genre.
fig = px.histogram(data_frame=df, x='Year', color='Genre')
fig.show()
Lead Studios by Year
# Count the films of each lead studio in each year, with Year and Lead Studio
# as ordinary columns.
lead_studio_by_year = (
    df.groupby(['Year', 'Lead Studio'])[['Film']]
    .count()
    .reset_index()
)
print(lead_studio_by_year.head())

# Reshape the counts just computed: one row per year, one column per studio,
# and 0 wherever a studio has no film in a year.
film_count_by_year_studio = lead_studio_by_year.pivot(
    index='Year', columns='Lead Studio', values='Film'
).fillna(0)
film_count_by_year_studio

# Absolute film counts, with the studios side by side within each year.
fig = px.bar(film_count_by_year_studio, barmode='group')
fig.show()

# Convert counts into each studio's share of its year: with years as columns,
# dividing by the column sums makes every year add up to 1.
film_relative_pivot = film_count_by_year_studio.T
film_relative = film_relative_pivot / film_relative_pivot.sum()
fig1 = px.bar(film_relative.T, barmode='relative')
fig1.show()
Profitability
# Total profitability for every (year, studio, genre, film) combination, with
# the grouping keys turned back into ordinary columns.
profit_all = (
    df.groupby(['Year', 'Lead Studio', 'Genre', 'Film'])[['Profitability']]
    .sum()
    .reset_index()
)
profit_all.sort_values(by='Profitability', ascending=False).head(10)

# Remove the row labelled 16 before plotting.
# NOTE(review): 16 is a hard-coded index label - presumably the outlier seen
# in the top-10 listing above; confirm it still points at the intended film
# whenever the input data changes.
profit = profit_all.drop(index=16)
profit.sort_values(by='Profitability', ascending=False).head(20)

# Treemap with one tile per lead studio, sized and coloured by profitability.
fig = px.treemap(
    profit,
    path=['Lead Studio'],
    values='Profitability',
    color='Profitability',
)
fig.show()

# Year x studio table for px.imshow (a heatmap, not a treemap); year/studio
# combinations without any film are filled with 0.
profit_pivot = profit.pivot_table(
    index='Year', columns='Lead Studio', values='Profitability'
).fillna(0)
fig = px.imshow(profit_pivot)
fig.show()