Project Overview
# import libraries we will need later
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read the Fandango score-comparison CSV into a DataFrame, then print a
# summary of its columns, non-null counts and dtypes.
df = pd.read_csv(
    'fandango_score_comparison.csv',
)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146 entries, 0 to 145
Data columns (total 22 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 FILM 146 non-null object
1 RottenTomatoes 146 non-null int64
2 RottenTomatoes_User 146 non-null int64
3 Metacritic 146 non-null int64
4 Metacritic_User 146 non-null float64
5 IMDB 146 non-null float64
6 Fandango_Stars 146 non-null float64
7 Fandango_Ratingvalue 146 non-null float64
8 RT_norm 146 non-null float64
9 RT_user_norm 146 non-null float64
10 Metacritic_norm 146 non-null float64
11 Metacritic_user_nom 146 non-null float64
12 IMDB_norm 146 non-null float64
13 RT_norm_round 146 non-null float64
14 RT_user_norm_round 146 non-null float64
15 Metacritic_norm_round 146 non-null float64
16 Metacritic_user_norm_round 146 non-null float64
17 IMDB_norm_round 146 non-null float64
18 Metacritic_user_vote_count 146 non-null int64
19 IMDB_user_vote_count 146 non-null int64
20 Fandango_votes 146 non-null int64
21 Fandango_Difference 146 non-null float64
dtypes: float64(15), int64(6), object(1)
memory usage: 25.2+ KB
# Count the missing values in each column.
df.isna().sum()
# Fix the typo in the column name ('Metacritic_user_nom' -> 'Metacritic_user_norm').
# `rename` relabels the column without duplicating its data, and because
# labels that are not found are ignored by default, re-running this cell
# after the rename has already happened is harmless.
df = df.rename(columns={'Metacritic_user_nom': 'Metacritic_user_norm'})
# Names of the columns holding the normed ratings, followed by their
# summary statistics.
norms = [
    'Fandango_Stars',
    'Fandango_Ratingvalue',
    'RT_norm',
    'RT_user_norm',
    'Metacritic_norm',
    'Metacritic_user_norm',
    'IMDB_norm',
]
df[norms].describe()
Fandango_Starsfloat64
Fandango_Ratingvaluefloat64
count
146
146
mean
4.089041096
3.845205479
std
0.540385978
0.502830909
min
3
2.7
25%
3.5
3.5
50%
4
3.9
75%
4.5
4.2
max
5
4.8
# KDE plot comparing the actual ratings with the stars displayed.
# Both distributions seem skewed-right.
plt.figure(figsize=(12, 8))
# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
sns.kdeplot(data=df, x='Fandango_Stars', clip=[0, 5], fill=True, label='Stars Displayed')
sns.kdeplot(data=df, x='Fandango_Ratingvalue', clip=[0, 5], fill=True, label='Actual Ratings')
plt.legend(loc=(0.05, 0.7))
# Countplot of Fandango_Stars.
# Note that more than half of the movies received a rating of 4.0 or better.
plt.figure(figsize=(18, 6))
sns.countplot(x='Fandango_Stars', data=df, palette='mako')
# Distinct values taken by Fandango_Difference.
df['Fandango_Difference'].unique()
# Countplot of Fandango_Difference.
# Note that every difference lies between 0 and 0.5.
plt.figure(figsize=(18, 6))
sns.countplot(x='Fandango_Difference', data=df, palette='mako')
# Scatterplot of the RottenTomatoes and RottenTomatoes_User columns.
# Although there seem to be a few outliers, a general linear relationship
# between the two columns is visible.
plt.figure(figsize=(10, 10))
sns.scatterplot(x='RottenTomatoes', y='RottenTomatoes_User', data=df, s=100)
plt.ylim(0, 105)
plt.xlim(0, 105)
# KDE plot comparing critic ratings with user ratings.
# The distribution of critic ratings does not seem to be normal.
# The distribution of user ratings seems skewed-right.
plt.figure(figsize=(12, 8))
# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
sns.kdeplot(data=df, x='RottenTomatoes', clip=[0, 100], fill=True, label='Critic Ratings')
sns.kdeplot(data=df, x='RottenTomatoes_User', clip=[0, 100], fill=True, label='User Ratings')
plt.legend(loc=(0.05, 0.7))
# To make it easier to relabel and move a legend, we use the following helper,
# adapted from GitHub (https://github.com/mwaskom/seaborn/issues/2280).
def move_legend(ax, new_loc, **kws):
    """Redraw the existing legend of ``ax`` at a new location.

    The handles, label texts and title are read from the legend currently
    attached to ``ax`` and passed to ``ax.legend`` to build the replacement.

    Parameters
    ----------
    ax : object exposing ``legend_`` and ``legend(...)`` (a matplotlib Axes)
        Axes whose legend should be moved.
    new_loc : str or tuple
        Forwarded to ``ax.legend`` as ``loc``.
    **kws
        Extra keyword arguments forwarded unchanged to ``ax.legend``.
    """
    old_legend = ax.legend_
    # Newer matplotlib exposes the handles as `legend_handles`; the camelCase
    # `legendHandles` is the older, deprecated name. Try the new one first and
    # fall back so that the helper works with either.
    handles = getattr(old_legend, "legend_handles", None)
    if handles is None:
        handles = old_legend.legendHandles
    labels = [text.get_text() for text in old_legend.get_texts()]
    title = old_legend.get_title().get_text()
    ax.legend(handles, labels, loc=new_loc, title=title, **kws)
# Overlay the KDEs of the normed Rotten Tomatoes scores and the Fandango
# scores, then move the automatically generated legend to the upper left.
fig, ax = plt.subplots(figsize=(12, 8))
# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
sns.kdeplot(
    data=df[['RT_user_norm', 'Fandango_Stars', 'RT_norm', 'Fandango_Ratingvalue']],
    clip=[0, 5],
    fill=True,
)
move_legend(ax, "upper left")
# Summary statistics for the raw and normed Metacritic critic/user columns.
metacritic_cols = ['Metacritic', 'Metacritic_User', 'Metacritic_norm', 'Metacritic_user_norm']
df[metacritic_cols].describe()
Metacriticfloat64
Metacritic_Userfloat64
count
146
146
mean
58.80821918
6.519178082
std
19.51738926
1.51071187
min
13
2.4
25%
43.5
5.7
50%
59
6.85
75%
75
7.5
max
94
9.6
# Scatterplot of the Metacritic_norm and Metacritic_user_norm columns.
# A general linear relationship between the two columns is visible.
plt.figure(figsize=(10, 10))
sns.scatterplot(x='Metacritic_norm', y='Metacritic_user_norm', data=df, s=100)
plt.ylim(0, 5)
plt.xlim(0, 5)
# KDE plot comparing critic ratings with user ratings.
# Both distributions seem roughly normal.
# The distribution of user ratings seems skewed-right.
# The distribution of critic ratings seems to have a high standard deviation.
plt.figure(figsize=(12, 8))
# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
sns.kdeplot(data=df, x='Metacritic_norm', clip=[0, 5], fill=True, label='Critic Ratings')
sns.kdeplot(data=df, x='Metacritic_user_norm', clip=[0, 5], fill=True, label='User Ratings')
plt.legend(loc=(0.05, 0.7))
# Overlay the KDEs of the normed Metacritic scores and the Fandango scores;
# the move_legend helper defined earlier in the file repositions the legend.
fig, ax = plt.subplots(figsize=(12, 8))
# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
sns.kdeplot(
    data=df[['Metacritic_user_norm', 'Fandango_Stars', 'Metacritic_norm', 'Fandango_Ratingvalue']],
    clip=[0, 5],
    fill=True,
)
move_legend(ax, "upper left")
# Overlay the KDEs of the normed IMDB score and the Fandango scores;
# the move_legend helper defined earlier in the file repositions the legend.
fig, ax = plt.subplots(figsize=(12, 8))
# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
sns.kdeplot(
    data=df[['IMDB_norm', 'Fandango_Stars', 'Fandango_Ratingvalue']],
    clip=[0, 5],
    fill=True,
)
move_legend(ax, "upper left")
# Recall that the list 'norms', defined earlier, contains all the columns
# with normed ratings; plot their KDEs together.
fig, ax = plt.subplots(figsize=(12, 8))
# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
sns.kdeplot(data=df[norms], clip=[0, 5], fill=True)
move_legend(ax, "upper left")
# Overlay the KDEs of the Fandango stars and the normed user scores from
# Rotten Tomatoes, Metacritic and IMDB.
fig, ax = plt.subplots(figsize=(12, 8))
# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
sns.kdeplot(
    data=df[['Fandango_Stars', 'RT_user_norm', 'Metacritic_user_norm', 'IMDB_norm']],
    clip=[0, 5],
    fill=True,
)
move_legend(ax, "upper left")
# For anyone interested in the 10 worst movies (lowest RottenTomatoes score),
# include the film name as well.
# The append is guarded so that re-running this cell does not add 'FILM' to
# the list a second time, which would duplicate the column in the selection.
if 'FILM' not in norms:
    norms.append('FILM')
norms
df.nsmallest(10, 'RottenTomatoes')[norms]
Fandango_Starsfloat64
3.0 - 4.5
Fandango_Ratingvaluefloat64
2.7 - 4.1
133
3.5
3.5
105
4
3.9
53
4
3.7
15
4.5
4.1
48
3
2.7
33
4
3.6
35
4
3.6
60
3.5
3.2
58
3.5
3.2
59
3.5
3.2
# Select the 10 rows with the lowest RottenTomatoes score, keeping only the
# columns listed in 'norms'.
bad_movies = df.nsmallest(10, 'RottenTomatoes')[norms]
# KDE plot of the 10 worst movies according to Rotten Tomatoes.
fig, ax = plt.subplots(figsize=(16, 6))
# `fill=True` is the current spelling of the deprecated `shade=True` keyword.
sns.kdeplot(data=bad_movies, clip=[0, 5], fill=True, palette='colorblind')