import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the scraped Fandango data; the path is relative to the working directory.
fandango = pd.read_csv('fandango_scrape.csv')
fandango.head()
fandango.describe()

# Scatter of VOTES against RATING, one point per row.
plt.figure(figsize=(10, 4), dpi=150)
sns.scatterplot(data=fandango, x="RATING", y="VOTES")

# Correlate the numeric columns only. The frame also carries the text column
# FILM, and recent pandas releases raise on non-numeric columns instead of
# silently dropping them, so the numeric subset is selected explicitly.
fandango.select_dtypes(include='number').corr()
# Worked example of the parsing rule used below: keep whatever follows the
# last '(' in the title and drop every ')'.
title = 'Film Title Name (Year)'
title.rpartition('(')[2].replace(')', '')

# Apply the same rule to every FILM value and store the result as YEAR.
fandango['YEAR'] = fandango['FILM'].apply(
    lambda film_name: film_name.rpartition('(')[2].replace(')', '')
)
fandango

# Number of rows per extracted year, as a table and as a bar chart.
fandango['YEAR'].value_counts()
sns.countplot(x='YEAR', data=fandango)
# Number of rows with zero votes.
len(fandango[fandango['VOTES'] == 0])

# Keep only the rows with at least one vote. The explicit .copy() makes
# fan_reviewed an independent DataFrame: the original cells assigned new
# columns into a filtered slice of `fandango`, and pandas emitted
# SettingWithCopyWarning for each of those assignments.
fan_reviewed = fandango[fandango['VOTES'] > 0].copy()

# Overlay the distributions of RATING and STARS, clipped to [0, 5].
plt.figure(figsize=(10, 4), dpi=150)
sns.kdeplot(data=fan_reviewed, x='RATING', clip=[0, 5], fill=True, label='True Rating')
sns.kdeplot(data=fan_reviewed, x='STARS', clip=[0, 5], fill=True, label='Stars Displayed')
plt.legend(loc=(1.05, 0.5))

# STARS minus RATING, rounded to two decimals in the same step.
fan_reviewed['STARS_DIFF'] = (fan_reviewed['STARS'] - fan_reviewed['RATING']).round(2)
fan_reviewed

# How many rows fall on each STARS_DIFF value.
plt.figure(figsize=(12, 4), dpi=150)
sns.countplot(x='STARS_DIFF', data=fan_reviewed)

# Rows whose STARS_DIFF is exactly 1.
fan_reviewed.loc[fan_reviewed['STARS_DIFF'].eq(1)]
# Load the scores gathered from the other review sites. The original cell was
# missing the opening quote of the filename and failed with
# "SyntaxError: EOL while scanning string literal"; the literal is now closed
# on both sides.
all_sites = pd.read_csv('all_sites_scores.csv')
all_sites.head()
all_sites.describe()

# RottenTomatoes against RottenTomatoes_User, both axes pinned to 0-100.
plt.figure(dpi=150)
sns.scatterplot(x='RottenTomatoes', y='RottenTomatoes_User', data=all_sites)
plt.xlim(0, 100)
plt.ylim(0, 100)

# Signed gap between the two columns (RottenTomatoes minus RottenTomatoes_User).
all_sites['Rotten_Diff'] = all_sites['RottenTomatoes'] - all_sites['RottenTomatoes_User']

# Mean size of the gap, ignoring its sign.
all_sites['Rotten_Diff'].abs().mean()

# Distribution of the signed gap.
plt.figure(figsize=(10, 4), dpi=200)
sns.histplot(x='Rotten_Diff', data=all_sites, bins=25, kde=True)

# Distribution of the absolute gap.
plt.figure(figsize=(10, 4), dpi=200)
sns.histplot(x=all_sites['Rotten_Diff'].abs(), bins=25, kde=True)

# The five rows with the smallest (most negative) Rotten_Diff.
all_sites[['FILM', 'Rotten_Diff']].nsmallest(5, 'Rotten_Diff')
# Metacritic against Metacritic_User; x spans 0-100 and y spans 0-10.
plt.figure(figsize=(10, 4), dpi=150)
sns.scatterplot(x='Metacritic', y='Metacritic_User', data=all_sites)
plt.ylim(0, 10)
plt.xlim(0, 100)

# Vote counts on the two sites plotted against each other.
plt.figure(figsize=(10, 4), dpi=150)
sns.scatterplot(
    x='Metacritic_user_vote_count',
    y='IMDB_user_vote_count',
    data=all_sites,
)

# The single row with the highest vote count on each site.
all_sites.nlargest(n=1, columns='IMDB_user_vote_count')
all_sites.nlargest(n=1, columns='Metacritic_user_vote_count')
# Inner join on FILM: only titles present in both tables are kept.
df = fandango.merge(all_sites, how='inner', on='FILM')
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 145 entries, 0 to 144
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 FILM 145 non-null object
1 STARS 145 non-null float64
2 RATING 145 non-null float64
3 VOTES 145 non-null int64
4 YEAR 145 non-null object
5 RottenTomatoes 145 non-null int64
6 RottenTomatoes_User 145 non-null int64
7 Metacritic 145 non-null int64
8 Metacritic_User 145 non-null float64
9 IMDB 145 non-null float64
10 Metacritic_user_vote_count 145 non-null int64
11 IMDB_user_vote_count 145 non-null int64
12 Rotten_Diff 145 non-null int64
dtypes: float64(4), int64(7), object(2)
memory usage: 15.9+ KB
df.head()

# Build the *_Norm columns: each source column is divided by 20 or by 2 and
# rounded to one decimal (presumably to bring 0-100 and 0-10 scores onto the
# same scale as STARS/RATING -- confirm against the data).
# The source columns are never overwritten, so re-running these assignments
# simply recomputes the same values.
for norm_col, (raw_col, divisor) in {
    'RT_Norm': ('RottenTomatoes', 20),
    'RTU_Norm': ('RottenTomatoes_User', 20),
    'Meta_Norm': ('Metacritic', 20),
    'Meta_U_Norm': ('Metacritic_User', 2),
    'IMDB_Norm': ('IMDB', 2),
}.items():
    df[norm_col] = np.round(df[raw_col] / divisor, 1)

df.head()
df.columns

# Score columns only, in the order used by the plots that follow.
norm_scores = df[[
    'STARS',
    'RATING',
    'RT_Norm',
    'RTU_Norm',
    'Meta_Norm',
    'Meta_U_Norm',
    'IMDB_Norm',
]]
norm_scores.head()
def move_legend(ax, new_loc, **kws):
    """Redraw the legend of *ax* at *new_loc*, keeping handles, labels and title.

    The existing legend is read from ``ax.legend_`` and a new one is created
    through ``ax.legend(...)`` with the same handles, label texts and title.

    Args:
        ax: Axes-like object exposing ``legend_`` and ``legend(...)``.
        new_loc: Value forwarded as the ``loc`` argument of ``ax.legend``.
        **kws: Extra keyword arguments forwarded unchanged to ``ax.legend``.

    Returns:
        None. When the axes has no legend (``ax.legend_`` is ``None``) the
        call is a no-op instead of failing with an ``AttributeError``.
    """
    old_legend = ax.legend_
    if old_legend is None:
        # Nothing to move.
        return

    # Matplotlib 3.7 renamed ``legendHandles`` to ``legend_handles`` and later
    # removed the old spelling; accept whichever one this legend provides.
    handles = getattr(old_legend, 'legend_handles', None)
    if handles is None:
        handles = old_legend.legendHandles

    labels = [text.get_text() for text in old_legend.get_texts()]
    title = old_legend.get_title().get_text()
    ax.legend(handles, labels, loc=new_loc, title=title, **kws)
# KDE of every normalised score column on one axes. `fill=True` replaces the
# deprecated `shade=True` and matches the kdeplot calls earlier in this file.
fig, ax = plt.subplots(figsize=(15, 6), dpi=150)
sns.kdeplot(data=norm_scores, clip=[0, 5], fill=True, palette='Set1', ax=ax)
move_legend(ax, 'upper left')

# Same plot restricted to RT_Norm and STARS.
fig, ax = plt.subplots(figsize=(15, 6), dpi=150)
sns.kdeplot(data=norm_scores[['RT_Norm', 'STARS']], clip=[0, 5], fill=True, palette='Set1', ax=ax)
move_legend(ax, "upper left")

# Histogram of all normalised columns together.
plt.subplots(figsize=(15, 6), dpi=150)
sns.histplot(norm_scores, bins=50)

# Cluster the rows only; columns keep their original order.
sns.clustermap(norm_scores, col_cluster=False)
# Normalised scores together with the film title.
norm_films = df[['STARS', 'RATING', 'RT_Norm', 'RTU_Norm', 'Meta_Norm', 'Meta_U_Norm', 'IMDB_Norm', 'FILM']]
norm_films.nsmallest(10, 'RT_Norm')

# Score distributions for the ten rows with the lowest RT_Norm. FILM is
# dropped so that only numeric columns reach kdeplot. `fill=True` replaces the
# deprecated `shade=True`, matching the kdeplot calls earlier in this file.
plt.figure(figsize=(15, 6), dpi=150)
worst_films = norm_films.nsmallest(10, 'RT_Norm').drop('FILM', axis=1)
sns.kdeplot(data=worst_films, clip=[0, 5], fill=True, palette='Set1')
plt.title("Ratings for RT Critic's 10 Worst Reviewed Films");