# Project Overview
# import libraries we will need later
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Fandango score-comparison data and take a first look at it
# (column dtypes / non-null counts, then missing values per column).
df = pd.read_csv('fandango_score_comparison.csv')
df.info()
df.isnull().sum()
# The CSV spells this column 'Metacritic_user_nom'; rename it so it matches
# the other '*_norm' columns. errors='raise' keeps the original behaviour of
# failing with a KeyError when the misspelled column is missing. Unlike the
# previous copy-then-drop approach, the column keeps its original position.
df = df.rename(columns={'Metacritic_user_nom': 'Metacritic_user_norm'}, errors='raise')
# Rating columns that are compared against each other below; the plots clip
# all of them to [0, 5], so they are presumably on a common 0-5 scale.
norms = ['Fandango_Stars','Fandango_Ratingvalue','RT_norm','RT_user_norm','Metacritic_norm','Metacritic_user_norm','IMDB_norm']
df[norms].describe()
# KDE plot comparing the stars Fandango displays with the underlying rating value.
# Author's observation: both distributions seem skewed-right.
# NOTE(review): the countplot comment below says more than half of the movies
# have 4.0 stars or better, which would put the long tail on the left -- confirm
# the skew direction against the rendered plot.
plt.figure(figsize=(12,8))
# fill=True replaces the deprecated shade=True keyword of sns.kdeplot.
sns.kdeplot(data=df,x='Fandango_Stars',clip=[0,5],fill=True,label='Stars Displayed')
sns.kdeplot(data=df,x='Fandango_Ratingvalue',clip=[0,5],fill=True,label='Actual Ratings')
plt.legend(loc=(0.05,0.7))
# Distinct values present in the Fandango_Difference column.
df['Fandango_Difference'].unique()
# One count plot per column, each on its own wide figure.
# Observations from the author:
#   - Fandango_Stars: more than half of the movies received 4.0 or better.
#   - Fandango_Difference: every difference lies between 0 and 0.5.
for _column in ('Fandango_Stars', 'Fandango_Difference'):
    plt.figure(figsize=(18, 6))
    sns.countplot(x=_column, data=df, palette='mako')
# Scatter plot of the RottenTomatoes column against RottenTomatoes_User.
# Observation: although there seem to be a few outliers, the two columns
# show a generally linear relationship.
plt.figure(figsize=(10, 10))
sns.scatterplot(x='RottenTomatoes', y='RottenTomatoes_User', data=df, s=100)
# Same 0-105 range on both axes so the two scores are directly comparable.
plt.xlim(0, 105)
plt.ylim(0, 105)
# KDE plot comparing Rotten Tomatoes critic ratings with user ratings.
# Author's observations:
#   - the critic ratings do not seem normally distributed
#   - the user ratings seem skewed-right
plt.figure(figsize=(12,8))
# fill=True replaces the deprecated shade=True keyword of sns.kdeplot.
sns.kdeplot(data=df,x='RottenTomatoes',clip=[0,100],fill=True,label='Critic Ratings')
sns.kdeplot(data=df,x='RottenTomatoes_User',clip=[0,100],fill=True,label='User Ratings')
plt.legend(loc=(0.05,0.7))
# in order to ease the labeling and moving legend, we use following function from github (https://github.com/mwaskom/seaborn/issues/2280)
def move_legend(ax, new_loc, **kws):
    """Re-create the legend attached to *ax* at a new location.

    The handles, label texts and title are read from the legend currently
    stored on ``ax.legend_`` and passed to ``ax.legend`` again with
    ``loc=new_loc``.

    Args:
        ax: Axes-like object exposing ``legend_`` and ``legend(...)``.
        new_loc: Value forwarded as the ``loc`` argument of ``ax.legend``.
        **kws: Extra keyword arguments forwarded unchanged to ``ax.legend``.

    Raises:
        ValueError: If ``ax`` currently has no legend.
    """
    old_legend = ax.legend_
    if old_legend is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("ax has no legend to move")
    # Matplotlib 3.7 renamed ``legendHandles`` to ``legend_handles`` and later
    # removed the old name; prefer the new attribute and fall back to the old
    # one so the helper works on both sides of the rename.
    handles = getattr(old_legend, "legend_handles", None)
    if handles is None:
        handles = old_legend.legendHandles
    labels = [t.get_text() for t in old_legend.get_texts()]
    title = old_legend.get_title().get_text()
    ax.legend(handles, labels, loc=new_loc, title=title, **kws)
# Overlay the KDEs of the normalised Rotten Tomatoes columns and the two
# Fandango columns on one axes (wide-form input: one curve per column).
fig,ax = plt.subplots(figsize=(12,8))
# fill=True replaces the deprecated shade=True keyword; ax=ax makes the
# target axes explicit instead of relying on the current-axes state.
sns.kdeplot(data=df[['RT_user_norm','Fandango_Stars','RT_norm','Fandango_Ratingvalue']],clip=[0,5],fill=True,ax=ax)
move_legend(ax, "upper left")
# Summary statistics for the raw and normalised Metacritic columns.
df.loc[:, ['Metacritic', 'Metacritic_User', 'Metacritic_norm', 'Metacritic_user_norm']].describe()
# Scatter plot of Metacritic_norm against Metacritic_user_norm.
# Observation: the two columns show a generally linear relationship.
plt.figure(figsize=(10, 10))
sns.scatterplot(x='Metacritic_norm', y='Metacritic_user_norm', data=df, s=100)
# Both axes span the same 0-5 range.
plt.xlim(0, 5)
plt.ylim(0, 5)
# KDE plot comparing Metacritic critic ratings with user ratings.
# Author's observations:
#   - both distributions look roughly bell-shaped
#   - the user ratings seem skewed-right
#   - the critic ratings seem to have a high standard deviation
plt.figure(figsize=(12,8))
# fill=True replaces the deprecated shade=True keyword of sns.kdeplot.
sns.kdeplot(data=df,x='Metacritic_norm',clip=[0,5],fill=True,label='Critic Ratings')
sns.kdeplot(data=df,x='Metacritic_user_norm',clip=[0,5],fill=True,label='User Ratings')
plt.legend(loc=(0.05,0.7))
# Overlaid KDE plots comparing Fandango with the other rating sites. Each
# entry is the list of columns drawn on one figure (wide-form input: one
# curve per column), in this order:
#   1. Metacritic (user + critic) vs Fandango
#   2. IMDB vs Fandango
#   3. every normalised column (the 'norms' list defined earlier)
#   4. Fandango stars vs the user ratings of the other sites
# After the loop, fig/ax refer to the last figure, as before.
for _columns in (
    ['Metacritic_user_norm','Fandango_Stars','Metacritic_norm','Fandango_Ratingvalue'],
    ['IMDB_norm','Fandango_Stars','Fandango_Ratingvalue'],
    norms,
    ['Fandango_Stars','RT_user_norm','Metacritic_user_norm','IMDB_norm'],
):
    fig,ax = plt.subplots(figsize=(12,8))
    # fill=True replaces the deprecated shade=True keyword; ax=ax makes the
    # target axes explicit instead of relying on the current-axes state.
    sns.kdeplot(data=df[_columns],clip=[0,5],fill=True,ax=ax)
    # Relocate the legend with the move_legend helper defined earlier in this file.
    move_legend(ax, "upper left")
# For anyone interested in the 10 worst movies, include the film name as well.
# Guarded so that re-running this section does not append 'FILM' a second
# time (which would select the column twice below).
if 'FILM' not in norms:
    norms.append('FILM')
norms
# The 10 rows with the lowest RottenTomatoes value, restricted to the columns
# in 'norms'; computed once and reused for display and plotting.
bad_movies = df.nsmallest(10,'RottenTomatoes')[norms]
bad_movies
# KDE plot of the 10 worst movies according to Rotten Tomatoes.
# NOTE(review): 'FILM' is presumably a text column; seaborn's wide-form
# handling is expected to plot only the numeric columns -- confirm.
fig,ax=plt.subplots(figsize=(16,6))
# fill=True replaces the deprecated shade=True keyword of sns.kdeplot.
sns.kdeplot(data=bad_movies,clip=[0,5],fill=True,palette='colorblind',ax=ax)