!pip install requests
!pip install lxml
Requirement already satisfied: requests in /opt/venv/lib/python3.7/site-packages (2.25.0)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/venv/lib/python3.7/site-packages (from requests) (1.26.2)
Requirement already satisfied: certifi>=2017.4.17 in /opt/venv/lib/python3.7/site-packages (from requests) (2020.11.8)
Requirement already satisfied: chardet<4,>=3.0.2 in /opt/venv/lib/python3.7/site-packages (from requests) (3.0.4)
Requirement already satisfied: idna<3,>=2.5 in /opt/venv/lib/python3.7/site-packages (from requests) (2.10)
WARNING: You are using pip version 20.2.4; however, version 20.3.1 is available.
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.
Requirement already satisfied: lxml in /opt/venv/lib/python3.7/site-packages (4.6.2)
WARNING: You are using pip version 20.2.4; however, version 20.3.1 is available.
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.
import pandas as pd
import numpy as np
from urllib.request import Request, urlopen #from stack overflow
from lxml import html
def import_table(year,table_title):
req = Request('https://www.the-numbers.com/market/'+year+'/'+table_title, #ask a server on the internet to get data info
headers={'User-Agent': 'Mozilla/5.0'}) #had to change browser to be "Firefox"
webpage = urlopen(req).read()
return pd.read_html(webpage) #get all the tables you can from this html in the form of dataframes
df_2019_top_movies = import_table('2019','top-grossing-movies')[0]
df_2020_top_movies = import_table('2020','top-grossing-movies')[0]
df_2019_distributor = import_table('2019','distributors')[1]
df_2020_distributor = import_table('2020','distributors')[1]
df_2019_top_movies.drop(df_2019_top_movies.tail(2).index, inplace = True)
df_2020_top_movies.drop(df_2020_top_movies.tail(2).index, inplace = True)
df_2019_distributor.dtypes
df_2019_distributor['2019 Gross'] = df_2019_distributor['2019 Gross'].str.replace('$','').str.replace(',','').astype(float)
df_2020_distributor['2020 Gross'] = df_2020_distributor['2020 Gross'].str.replace('$','').str.replace(',','').astype(float)
df_2019_top_movies['2019 Gross'] = df_2019_top_movies['2019 Gross'].str.replace('$','').str.replace(',','').astype(float)
df_2020_top_movies['2020 Gross'] = df_2020_top_movies['2020 Gross'].str.replace('$','').str.replace(',','').astype(float)
df_2019_distributor['2019 Gross/Movies'] = df_2019_distributor['2019 Gross'] / df_2019_distributor['Movies']
df_2019_distributor['2019 Gross/Movies'] = df_2019_distributor['2019 Gross/Movies'].astype(float)
df_2020_distributor['2020 Gross/Movies'] = df_2020_distributor['2020 Gross'] / df_2020_distributor['Movies']
df_2020_distributor['2020 Gross/Movies'] = df_2020_distributor['2020 Gross/Movies'].astype(float)
dist_2019 = df_2019_top_movies.groupby('Distributor')['2019 Gross'].max()
dist_2019_max = pd.DataFrame(dist_2019)
dist_2019_max.reset_index(inplace=True)
dist_2019_max = dist_2019_max.rename(columns = {'index': 'Distributor','2019 Gross':'2019 Gross Max'})
df_dist_2019 = pd.merge(dist_2019_max, df_2019_distributor, on= 'Distributor')
df_dist_2019 = df_dist_2019[['Distributor','2019 Gross Max', '2019 Gross/Movies']]
df_dist_2019
dist_2020 = df_2020_top_movies.groupby('Distributor')['2020 Gross'].max()
dist_2020_max = pd.DataFrame(dist_2020)
dist_2020_max.reset_index(inplace=True)
dist_2020_max = dist_2020_max.rename(columns = {'index': 'Distributor','2020 Gross':'2020 Gross Max'})
df_dist_2020 = pd.merge(dist_2020_max, df_2020_distributor, on= 'Distributor')
df_dist_2020 = df_dist_2020[['Distributor','2020 Gross Max', '2020 Gross/Movies']]
df_dist_2020
df_merge_gross= pd.merge(df_dist_2020, df_dist_2019, on = 'Distributor', how = 'right' )
df_merge_gross['2020 Gross Max'] = df_merge_gross['2020 Gross Max'].replace(np.nan,0)
df_merge_gross['2020 Gross/Movies'] = df_merge_gross['2020 Gross/Movies'].replace(np.nan,0)
df_merge_gross
df_2018_distributor = import_table('2018','distributors')[1]
df_2017_distributor = import_table('2017','distributors')[1]
df_2018_distributor['2018 Gross'] = df_2018_distributor['2018 Gross'].str.replace('$','').str.replace(',','').astype(float)
df_2018_distributor['Share'] = df_2018_distributor['Share'].str.replace('%','').astype(float)
df_2018_distributor['Share'] = df_2018_distributor['Share']/100
df_2018_distributor['Tickets'] = df_2018_distributor['Tickets'].astype(float)
df_2018_distributor
df_2017_distributor['2017 Gross'] = df_2017_distributor['2017 Gross'].str.replace('$','').str.replace(',','').astype(float)
df_2017_distributor['Share'] = df_2017_distributor['Share'].str.replace('%','').astype(float)
df_2017_distributor['Share'] = df_2017_distributor['Share']/100
df_2017_distributor['Tickets'] = df_2017_distributor['Tickets'].astype(float)
df_2017_distributor
df_2018_distributor['2018 Gross/Movies'] = df_2018_distributor['2018 Gross'] / df_2018_distributor['Movies']
df_2017_distributor['2017 Gross/Movies'] = df_2017_distributor['2017 Gross'] / df_2017_distributor['Movies']
df_2017_distributor = df_2017_distributor.rename(columns= {'Tickets':'2017 Tickets','Share':'2017 Share'})
df_2018_distributor = df_2018_distributor.rename(columns= {'Tickets':'2018 Tickets','Share':'2018 Share'})
ml_dist_2018 = pd.merge(df_2018_distributor, df_2017_distributor, on = 'Distributor')
ml_dist_2018['Gross/Movies Difference'] = ml_dist_2018['2018 Gross/Movies']-ml_dist_2018['2017 Gross/Movies']
ml_dist_2018['Gross/Movies Difference'] = np.where(ml_dist_2018['Gross/Movies Difference'] > 0, 1, 0)
ml_dist_2018.drop(['Rank_x', 'Rank_y','Movies_x','Movies_y','2018 Gross','2017 Gross', '2018 Gross/Movies','2017 Gross/Movies','Distributor'], axis=1, inplace=True)
ml_dist_2018
ml_dist_2018['2018 Tickets'] = np.log10(ml_dist_2018['2018 Tickets'])
ml_dist_2018['2017 Tickets'] = np.log10(ml_dist_2018['2017 Tickets'])
ml_dist_2018.head(10)
df_2019_distributor['Share'] = df_2019_distributor['Share'].str.replace('%','').astype(float)
df_2019_distributor['Share'] = df_2019_distributor['Share']/100
df_2019_distributor
df_2019_distributor = df_2019_distributor.rename(columns= {'Tickets':'2019 Tickets','Share':'2019 Share'})
ml_dist_2019 = pd.merge(df_2019_distributor, df_2018_distributor, on= 'Distributor')
ml_dist_2019['Gross/Movies Difference'] = ml_dist_2019['2019 Gross/Movies']-ml_dist_2019['2018 Gross/Movies']
ml_dist_2019['Gross/Movies Difference'] = np.where(ml_dist_2019['Gross/Movies Difference'] > 0, 1, 0)
ml_dist_2019.drop(['Rank_x', 'Rank_y','Movies_x','Movies_y','2018 Gross','2019 Gross', '2019 Gross/Movies','2018 Gross/Movies','Distributor'], axis=1, inplace=True)
ml_dist_2019
ml_dist_2019['2018 Tickets'] = np.log10(ml_dist_2019['2018 Tickets'])
ml_dist_2019['2019 Tickets'] = np.log10(ml_dist_2019['2019 Tickets'])
ml_dist_2019.head(10)
df_2020_distributor['Share'] = df_2020_distributor['Share'].str.replace('%','').astype(float)
df_2020_distributor['Share'] = df_2020_distributor['Share']/100
df_2020_distributor
df_2020_distributor = df_2020_distributor.rename(columns= {'Tickets':'2020 Tickets','Share':'2020 Share'})
ml_dist_2020 = pd.merge(df_2020_distributor, df_2019_distributor, on= 'Distributor')
ml_dist_2020['Gross/Movies Difference'] = ml_dist_2020['2020 Gross/Movies']-ml_dist_2020['2019 Gross/Movies']
ml_dist_2020['Gross/Movies Difference'] = np.where(ml_dist_2020['Gross/Movies Difference'] > 0, 1, 0)
ml_dist_2020.drop(['Rank_x', 'Rank_y','Movies_x','Movies_y','2020 Gross','2019 Gross', '2019 Gross/Movies','2020 Gross/Movies','Distributor'], axis=1, inplace=True)
ml_dist_2020
ml_dist_2020['2020 Tickets'] = np.log10(ml_dist_2020['2020 Tickets'])
ml_dist_2020['2019 Tickets'] = np.log10(ml_dist_2020['2019 Tickets'])
ml_dist_2020.head(10)