Hollywood Most Profitable Stories

Exploratory Data Analysis for Hollywood Most Profitable Stories

Overview of the Dataset

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

# Let's import the project's cover image
from PIL import Image

# Open the image through a context manager so the underlying file handle is
# released once the viewer has been launched.
with Image.open("imagen_hollywood.jpg") as image:
    image.show()

# Load the dataset into a DataFrame and preview its first rows.
df = pd.read_csv('/work/HollywoodsMostProfitableStories.csv')
df.head()

# Dimensions of the DataFrame as a (rows, columns) tuple.
df.shape

# Print the column names together with their non-null counts and dtypes.
df.info()

# Number of missing values in each column.
df.isnull().sum()

# Number of distinct values in each column.
df.nunique()

# Summary statistics (by default pandas restricts these to numeric columns).
df.describe()

Missing Values

# Rows that have no lead studio recorded.
df[df['Lead Studio'].isna()]

# Rows that have no audience score recorded.
df[df['Audience score %'].isna()]

# Rows that have no profitability figure recorded.
df[df['Profitability'].isna()]

Analysis

Genre

# Absolute number of films per genre.
df['Genre'].value_counts()

# Relative share of films per genre.
df['Genre'].value_counts(normalize=True)

# Film count per genre, one colour per genre, bars ordered by descending total.
fig = px.histogram(df, x='Genre', title='Genre', color='Genre')
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()

Lead Studio

# Absolute number of films per lead studio.
df['Lead Studio'].value_counts()

# Relative share of films per lead studio.
df['Lead Studio'].value_counts(normalize=True)

# Film count per lead studio, bars ordered by descending total.
fig = px.histogram(df, x='Lead Studio', title='Lead Studios')
fig.update_layout(xaxis={'categoryorder': 'total descending'})
fig.show()

Audience Score %

# Summary statistics for the audience score.
df['Audience score %'].describe()

# Distribution of the audience score with a box plot in the margin.
fig = px.histogram(
    df,
    x='Audience score %',
    title='Percentage of Audience Score %',
    marginal='box',
)
fig.show()

Profitability

# Summary statistics for profitability.
df['Profitability'].describe()

# Distribution of profitability with a box plot in the margin.
# hover_data is given as column names (rather than a DataFrame slice) so the
# film, genre and year are looked up in df, matching the other charts.
fig = px.histogram(
    df,
    x='Profitability',
    title='Profitability of Films',
    marginal='box',
    hover_data=['Film', 'Genre', 'Year'],
)
fig.show()

Rotten Tomatoes %

# Summary statistics for the Rotten Tomatoes score.
df['Rotten Tomatoes %'].describe()

# Distribution of the Rotten Tomatoes score with a box plot in the margin.
# x and hover_data are given as column names of df, consistent with the
# other histograms in this analysis.
fig = px.histogram(
    df,
    x='Rotten Tomatoes %',
    title='Rotten Tomatoes Scores',
    marginal='box',
    hover_data=['Film', 'Genre', 'Year'],
)
fig.show()

Worldwide Gross

# Summary statistics for the worldwide gross.
df['Worldwide Gross'].describe()

# Distribution of the worldwide gross with a box plot in the margin.
# hover_data is given as column names of df, consistent with the other charts.
fig = px.histogram(
    df,
    x='Worldwide Gross',
    title='Worldwide Gross of Movies',
    marginal='box',
    hover_data=['Film', 'Genre', 'Year'],
)
fig.show()

Correlations

# Pairwise correlation matrix of the four numeric measures.
df[['Audience score %', 'Profitability', 'Rotten Tomatoes %', 'Worldwide Gross']].corr()

# Scatter-plot matrix of the same four measures, coloured by genre.
fig = px.scatter_matrix(
    df,
    dimensions=['Audience score %', 'Profitability', 'Rotten Tomatoes %', 'Worldwide Gross'],
    color='Genre',
    title='Correlations',
    opacity=0.6,
    width=800,
    height=800,
    hover_data=['Film', 'Lead Studio'],
)
fig.show()

Behavior Over Time

Genre by Year

# Count films for every (Year, Genre) pair. Year is then moved out of the
# index into a regular column (Genre stays as the index) and the count column
# is renamed to 'Genre Count'.
genre_by_year = df.groupby(['Year', 'Genre'])[['Genre']].count().reset_index(level=0).rename(
    columns={'Genre': 'Genre Count'})
genre_by_year

# Films per year, split by genre.
fig = px.histogram(df, x='Year', color='Genre')
fig.show()

Lead Studios by Year

# Count the number of films per studio per year.
lead_studio_by_year = df.groupby(['Year', 'Lead Studio'])[['Film']].count()
lead_studio_by_year.reset_index(inplace=True)
print(lead_studio_by_year.head())

# Pivot the counts just computed: one row per year, one column per studio.
film_count_by_year_studio = lead_studio_by_year.pivot(
    index='Year', columns='Lead Studio', values='Film')
# Year/studio combinations with no films come out of the pivot as NaN;
# replace them with 0.
film_count_by_year_studio.fillna(0, inplace=True)
film_count_by_year_studio

# Grouped bars: film count per studio for each year.
fig = px.bar(film_count_by_year_studio, barmode='group')
fig.show()

# Turn the counts into each studio's share of the films of a given year.
# After transposing, columns are years, so .sum() yields per-year totals and
# .divide() aligns on those year columns.
film_relative_pivot = film_count_by_year_studio.T
film_relative = film_relative_pivot.divide(film_relative_pivot.sum())
fig1 = px.bar(film_relative.T, barmode='relative')
fig1.show()

Profitability

# Build a profit DataFrame with profitability totals per year, studio, genre
# and film, then list the ten most profitable entries.
profit_all = df.groupby(['Year', 'Lead Studio', 'Genre', 'Film'])[['Profitability']].sum()
profit_all.reset_index(inplace=True)
profit_all.sort_values(by='Profitability', ascending=False).head(10)

# NOTE(review): this drops the row whose index label is 16, presumably an
# outlier spotted in the top-10 listing above. The label depends on the
# group ordering, so confirm it still points at the intended film whenever
# the dataset changes.
profit = profit_all.drop(16)
profit.sort_values(by='Profitability', ascending=False).head(20)

# Treemap of total profitability per lead studio.
fig = px.treemap(profit, path=['Lead Studio'], values='Profitability', color='Profitability')
fig.show()

# Pivot to a Year x Lead Studio grid so px.imshow can draw it as a heatmap.
# pivot_table aggregates with the mean by default; year/studio combinations
# without any film become NaN and are replaced with 0.
profit_pivot = profit.pivot_table(index='Year', columns='Lead Studio', values='Profitability')
profit_pivot.fillna(0, inplace=True)
fig = px.imshow(profit_pivot)
fig.show()

Exploratory Data Analysis for Hollywood Most Profitable Stories

Overview of the Dataset

Missing Values

Analysis

Genre

Lead Studio

Audience Score %

Profitability

Rotten Tomatoes %

Worldwide Gross

Correlations

Behavior Over Time

Genre by Year

Lead Studios by Year

Profitability

Exploratory Data Analysis for Hollywood Most Profitable Stories