import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the scraped Fandango ratings and take a first look at them.
fandango = pd.read_csv('fandango_scrape.csv')
fandango.head()
fandango.describe()

# Vote count plotted against rating for every film.
plt.figure(figsize=(10, 4), dpi=150)
sns.scatterplot(data=fandango, x="RATING", y="VOTES")

# FILM is a text column (it is split as a string below), and pandas >= 2.0
# raises on non-numeric columns unless `numeric_only=True` is passed.
fandango.corr(numeric_only=True)

# Example of pulling the year out of a title: keep what follows the last '('
# and drop the closing ')'.
title = 'Film Title Name (Year)'
title.split('(')[-1].replace(')', '')

# Apply the same extraction to every film title.
fandango['YEAR'] = fandango['FILM'].apply(
    lambda title: title.split('(')[-1].replace(')', '')
)
fandango

# Films per year, as a table and as a bar chart.
fandango['YEAR'].value_counts()
sns.countplot(data=fandango, x='YEAR')

# Number of films without a single vote.
len(fandango[fandango['VOTES'] == 0])
# Keep only the films with at least one vote. The explicit copy makes
# `fan_reviewed` an independent frame, so adding STARS_DIFF below is a plain
# column assignment rather than a write to a slice of `fandango` (which pandas
# flags with SettingWithCopyWarning).
fan_reviewed = fandango[fandango['VOTES'] > 0].copy()

# Overlay the distributions of RATING and STARS, both clipped to 0-5.
plt.figure(figsize=(10, 4), dpi=150)
sns.kdeplot(data=fan_reviewed, x='RATING', clip=[0, 5], fill=True, label='True Rating')
sns.kdeplot(data=fan_reviewed, x='STARS', clip=[0, 5], fill=True, label='Stars Displayed')
plt.legend(loc=(1.05, 0.5))

# STARS minus RATING, rounded to two decimals before it is counted below.
fan_reviewed['STARS_DIFF'] = (fan_reviewed['STARS'] - fan_reviewed['RATING']).round(2)
fan_reviewed

# How often each difference occurs.
plt.figure(figsize=(12, 4), dpi=150)
sns.countplot(data=fan_reviewed, x='STARS_DIFF')

# Films whose STARS value is a full point above their RATING.
fan_reviewed[fan_reviewed['STARS_DIFF'] == 1]
# Load the scores collected from the other review sites and inspect them.
all_sites = pd.read_csv('all_sites_scores.csv')
all_sites.head()
all_sites.describe()

# Rotten Tomatoes: critic score against user score, both axes fixed to 0-100.
plt.figure(dpi=150)
sns.scatterplot(
    x='RottenTomatoes',
    y='RottenTomatoes_User',
    data=all_sites,
)
plt.xlim(0, 100)
plt.ylim(0, 100)

# Critic score minus user score: a negative value means the user score is the
# higher of the two.
all_sites['Rotten_Diff'] = (
    all_sites['RottenTomatoes'] - all_sites['RottenTomatoes_User']
)

# Mean absolute gap between the two Rotten Tomatoes scores.
all_sites['Rotten_Diff'].apply(abs).mean()

# Distribution of the signed difference ...
plt.figure(dpi=200, figsize=(10, 4))
sns.histplot(x='Rotten_Diff', data=all_sites, bins=25, kde=True)

# ... and of its absolute value.
plt.figure(dpi=200, figsize=(10, 4))
sns.histplot(x=all_sites['Rotten_Diff'].apply(abs), bins=25, kde=True)

# The five films with the lowest (most negative) Rotten_Diff.
all_sites.nsmallest(5, 'Rotten_Diff')[['FILM', 'Rotten_Diff']]

# Metacritic: critic score (x limited to 0-100) against user score
# (y limited to 0-10).
plt.figure(dpi=150, figsize=(10, 4))
sns.scatterplot(x='Metacritic', y='Metacritic_User', data=all_sites)
plt.ylim(0, 10)
plt.xlim(0, 100)

# Vote counts: Metacritic users against IMDB users.
plt.figure(dpi=150, figsize=(10, 4))
sns.scatterplot(
    x='Metacritic_user_vote_count',
    y='IMDB_user_vote_count',
    data=all_sites,
)

# The single film with the most IMDB votes, then the one with the most
# Metacritic votes.
all_sites.nlargest(1, 'IMDB_user_vote_count')
all_sites.nlargest(1, 'Metacritic_user_vote_count')
# Inner join on FILM: only films present in both tables are kept.
df = pd.merge(fandango, all_sites, how='inner', on='FILM')
df.info()
df.head()

# Divide each site's score by the listed factor and round to one decimal.
# Every result goes into a NEW column and the source columns are never
# overwritten, so re-running this section yields the same values again.
for _norm_col, _source_col, _divisor in (
    ('RT_Norm', 'RottenTomatoes', 20),
    ('RTU_Norm', 'RottenTomatoes_User', 20),
    ('Meta_Norm', 'Metacritic', 20),
    ('Meta_U_Norm', 'Metacritic_User', 2),
    ('IMDB_Norm', 'IMDB', 2),
):
    df[_norm_col] = np.round(df[_source_col] / _divisor, 1)

df.head()
df.columns

# The Fandango columns together with the five normalized columns.
norm_scores = df[
    [
        'STARS',
        'RATING',
        'RT_Norm',
        'RTU_Norm',
        'Meta_Norm',
        'Meta_U_Norm',
        'IMDB_Norm',
    ]
]
norm_scores.head()
def move_legend(ax, new_loc, **kws):
    """Redraw the legend of *ax* at *new_loc*, keeping its entries and title.

    The handles, label texts and title are read from the legend currently
    attached to ``ax`` (``ax.legend_``) and passed back to ``ax.legend``.

    Args:
        ax: Axes-like object exposing ``legend_`` and ``legend(...)``.
        new_loc: Value forwarded to ``ax.legend`` as ``loc``.
        **kws: Extra keyword arguments forwarded unchanged to ``ax.legend``.
    """
    old_legend = ax.legend_
    # Matplotlib 3.7 renamed ``legendHandles`` to ``legend_handles`` and later
    # removed the old spelling; try the new name first and fall back to the
    # old one so both older and newer releases work.
    try:
        handles = old_legend.legend_handles
    except AttributeError:
        handles = old_legend.legendHandles
    labels = [text.get_text() for text in old_legend.get_texts()]
    title = old_legend.get_title().get_text()
    ax.legend(handles, labels, loc=new_loc, title=title, **kws)
# Density of every normalized score on one set of axes, clipped to 0-5.
# `fill=True` replaces the deprecated `shade=True` and matches the KDE plots
# earlier in this file.
fig, ax = plt.subplots(figsize=(15, 6), dpi=150)
sns.kdeplot(data=norm_scores, clip=[0, 5], fill=True, palette='Set1', ax=ax)
move_legend(ax, 'upper left')

# Same plot restricted to RT_Norm and STARS.
fig, ax = plt.subplots(figsize=(15, 6), dpi=150)
sns.kdeplot(data=norm_scores[['RT_Norm', 'STARS']], clip=[0, 5], fill=True, palette='Set1', ax=ax)
move_legend(ax, "upper left")

# Histogram of all normalized scores.
plt.subplots(figsize=(15, 6), dpi=150)
sns.histplot(norm_scores, bins=50)

# Cluster map of the scores; only rows are clustered (col_cluster=False).
sns.clustermap(norm_scores, col_cluster=False)

# Same score columns plus the film title, so individual films can be listed.
norm_films = df[['STARS', 'RATING', 'RT_Norm', 'RTU_Norm', 'Meta_Norm', 'Meta_U_Norm', 'IMDB_Norm', 'FILM']]
norm_films.nsmallest(10, 'RT_Norm')

# Score densities for the ten films with the lowest RT_Norm; FILM is dropped
# so only the score columns are plotted.
plt.figure(figsize=(15, 6), dpi=150)
worst_films = norm_films.nsmallest(10, 'RT_Norm').drop('FILM', axis=1)
sns.kdeplot(data=worst_films, clip=[0, 5], fill=True, palette='Set1')
plt.title("Ratings for RT Critic's 10 Worst Reviewed Films");