# Project Overview

# import libraries we will need later
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Fandango score-comparison dataset (path is relative to the
# current working directory).
df = pd.read_csv('fandango_score_comparison.csv')

# Column dtypes and non-null counts.
df.info()

# Number of missing values per column.
df.isnull().sum()

# The CSV misspells this column as 'Metacritic_user_nom'. Rename it so it
# matches the other '*_norm' columns. Unlike copy-then-drop, rename keeps the
# column in place and is a no-op if this cell is re-run after the fix.
df = df.rename(columns={'Metacritic_user_nom': 'Metacritic_user_norm'})

# Rating columns that the plots below compare on a common 0-5 axis.
norms = [
    'Fandango_Stars',
    'Fandango_Ratingvalue',
    'RT_norm',
    'RT_user_norm',
    'Metacritic_norm',
    'Metacritic_user_norm',
    'IMDB_norm',
]
df[norms].describe()

# KDE plot comparing the stars Fandango displays with the underlying rating
# values.
# Author's reading of the plot: both distributions seem skewed-right.
plt.figure(figsize=(12, 8))
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
sns.kdeplot(data=df, x='Fandango_Stars', clip=[0, 5], fill=True,
            label='Stars Displayed')
sns.kdeplot(data=df, x='Fandango_Ratingvalue', clip=[0, 5], fill=True,
            label='Actual Ratings')
plt.legend(loc=(0.05, 0.7))

# Count plot of Fandango_Stars.
# Author's reading: more than half of the movies received a rating of 4.0
# or better.
plt.figure(figsize=(18, 6))
sns.countplot(data=df, x='Fandango_Stars', palette='mako')

# Distinct values of the displayed-vs-actual rating difference.
df['Fandango_Difference'].unique()

# Count plot of Fandango_Difference.
# Author's reading: every difference lies between 0 and 0.5.
plt.figure(figsize=(18, 6))
sns.countplot(data=df, x='Fandango_Difference', palette='mako')

# Scatter plot of the RottenTomatoes (critic) and RottenTomatoes_User columns.
# Author's reading: although there seem to be a few outliers, there is a
# general linear relationship between the two columns.
plt.figure(figsize=(10, 10))
sns.scatterplot(data=df, x='RottenTomatoes', y='RottenTomatoes_User', s=100)
plt.ylim(0, 105)
plt.xlim(0, 105)

# KDE plot comparing critic ratings and user ratings.
# Author's reading: the critic ratings do not look normally distributed and
# the user ratings seem skewed-right.
plt.figure(figsize=(12, 8))
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
sns.kdeplot(data=df, x='RottenTomatoes', clip=[0, 100], fill=True,
            label='Critic Ratings')
sns.kdeplot(data=df, x='RottenTomatoes_User', clip=[0, 100], fill=True,
            label='User Ratings')
plt.legend(loc=(0.05, 0.7))

# To make labelling and moving a legend easier, we use the following function
# from GitHub (https://github.com/mwaskom/seaborn/issues/2280).
def move_legend(ax, new_loc, **kws):
    """Re-create the legend attached to *ax* at a new location.

    The handles, label texts and title are read from the legend currently
    stored on ``ax.legend_`` and passed back to ``ax.legend`` together with
    the new location.

    Args:
        ax: Axes-like object exposing ``legend_`` and ``legend``.
        new_loc: Value forwarded to ``ax.legend`` as ``loc``.
        **kws: Extra keyword arguments forwarded to ``ax.legend``.
    """
    old_legend = ax.legend_
    # Matplotlib 3.7 renamed ``Legend.legendHandles`` to ``legend_handles``
    # and later removed the old spelling; prefer the new attribute and fall
    # back to the old one so the helper works on either side of the rename.
    handles = getattr(old_legend, 'legend_handles', None)
    if handles is None:
        handles = old_legend.legendHandles
    labels = [t.get_text() for t in old_legend.get_texts()]
    title = old_legend.get_title().get_text()
    ax.legend(handles, labels, loc=new_loc, title=title, **kws)

# Overlay the KDEs of the Rotten Tomatoes and Fandango rating columns on one
# axes, then move the legend seaborn creates to the upper-left corner.
fig, ax = plt.subplots(figsize=(12, 8))
rt_vs_fandango = ['RT_user_norm', 'Fandango_Stars', 'RT_norm', 'Fandango_Ratingvalue']
# `fill=True` replaces the deprecated `shade=True`; `ax=ax` names the target
# axes explicitly (it is the current axes right after plt.subplots).
sns.kdeplot(data=df[rt_vs_fandango], clip=[0, 5], fill=True, ax=ax)
move_legend(ax, "upper left")

# Summary statistics of the raw and normalised Metacritic columns.
df[['Metacritic', 'Metacritic_User', 'Metacritic_norm', 'Metacritic_user_norm']].describe()

# Scatter plot of the Metacritic_norm and Metacritic_user_norm columns.
# Author's reading: there is a general linear relationship between the two.
plt.figure(figsize=(10, 10))
sns.scatterplot(data=df, x='Metacritic_norm', y='Metacritic_user_norm', s=100)
plt.ylim(0, 5)
plt.xlim(0, 5)

# KDE plot comparing critic ratings and user ratings.
# Author's reading: both distributions look roughly normal; the user ratings
# seem skewed-right and the critic ratings seem to have a high standard
# deviation.
plt.figure(figsize=(12, 8))
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
sns.kdeplot(data=df, x='Metacritic_norm', clip=[0, 5], fill=True,
            label='Critic Ratings')
sns.kdeplot(data=df, x='Metacritic_user_norm', clip=[0, 5], fill=True,
            label='User Ratings')
plt.legend(loc=(0.05, 0.7))

# Metacritic vs Fandango on one axes; the legend is repositioned with the
# move_legend helper defined earlier in this file.
fig, ax = plt.subplots(figsize=(12, 8))
metacritic_vs_fandango = [
    'Metacritic_user_norm',
    'Fandango_Stars',
    'Metacritic_norm',
    'Fandango_Ratingvalue',
]
sns.kdeplot(data=df[metacritic_vs_fandango], clip=[0, 5], fill=True, ax=ax)
move_legend(ax, "upper left")

# IMDB vs Fandango; the legend is repositioned with the move_legend helper
# defined earlier in this file.
fig, ax = plt.subplots(figsize=(12, 8))
# `fill=True` replaces the deprecated `shade=True` keyword of kdeplot.
sns.kdeplot(data=df[['IMDB_norm', 'Fandango_Stars', 'Fandango_Ratingvalue']],
            clip=[0, 5], fill=True, ax=ax)
move_legend(ax, "upper left")

# Recall that 'norms' (defined earlier) lists all columns with normed ratings.
fig, ax = plt.subplots(figsize=(12, 8))
sns.kdeplot(data=df[norms], clip=[0, 5], fill=True, ax=ax)
move_legend(ax, "upper left")

# Fandango stars against the user-rating columns of the other sites.
fig, ax = plt.subplots(figsize=(12, 8))
sns.kdeplot(data=df[['Fandango_Stars', 'RT_user_norm', 'Metacritic_user_norm', 'IMDB_norm']],
            clip=[0, 5], fill=True, ax=ax)
move_legend(ax, "upper left")

# For anyone interested in the 10 worst movies, include the film name as well.
# Guarded so that re-running this cell does not append 'FILM' a second time.
if 'FILM' not in norms:
    norms.append('FILM')

norms

# The 10 movies with the lowest RottenTomatoes score, restricted to the
# normed rating columns plus the film name. Computed once, then displayed.
bad_movies = df.nsmallest(10, 'RottenTomatoes')[norms]
bad_movies

# KDE plot of the 10 worst movies from Rotten Tomatoes. The text column
# 'FILM' is dropped explicitly so only the rating columns reach kdeplot.
# `fill=True` replaces the deprecated `shade=True` keyword.
fig, ax = plt.subplots(figsize=(16, 6))
sns.kdeplot(data=bad_movies.drop(columns='FILM'), clip=[0, 5], fill=True,
            palette='colorblind', ax=ax)

# Project Overview

# Project Overview