# shell escape: ask pip to install seaborn, which this notebook imports below as `sns`
!pip install seaborn
WARNING: The directory '/home/jovyan/.cache/pip' or its parent directory is not owned or is not writable by the current user. The cache has been disabled. Check the permissions and owner of that directory. If executing pip with sudo, you may want sudo's -H flag.
Requirement already satisfied: seaborn in /opt/venv/lib/python3.7/site-packages (0.11.0)
Requirement already satisfied: pandas>=0.23 in /opt/venv/lib/python3.7/site-packages (from seaborn) (1.0.1)
Requirement already satisfied: scipy>=1.0 in /opt/venv/lib/python3.7/site-packages (from seaborn) (1.5.4)
Requirement already satisfied: matplotlib>=2.2 in /opt/venv/lib/python3.7/site-packages (from seaborn) (3.1.3)
Requirement already satisfied: numpy>=1.15 in /opt/venv/lib/python3.7/site-packages (from seaborn) (1.18.5)
Requirement already satisfied: pytz>=2017.2 in /opt/venv/lib/python3.7/site-packages (from pandas>=0.23->seaborn) (2020.4)
Requirement already satisfied: python-dateutil>=2.6.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.23->seaborn) (2.8.1)
Requirement already satisfied: cycler>=0.10 in /opt/venv/lib/python3.7/site-packages (from matplotlib>=2.2->seaborn) (0.10.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib>=2.2->seaborn) (2.4.7)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/venv/lib/python3.7/site-packages (from matplotlib>=2.2->seaborn) (1.3.1)
Requirement already satisfied: six>=1.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from python-dateutil>=2.6.1->pandas>=0.23->seaborn) (1.15.0)
WARNING: You are using pip version 20.2.4; however, version 20.3.3 is available.
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# run the DataCleaning notebook before the analysis below
# NOTE(review): df_merge_gross, df_2019_top_movies, df_2020_top_movies and the
# ml_dist_* frames used later are not defined in this file -- presumably they
# are created by DataCleaning.ipynb; confirm
%run /home/jovyan/work/DataCleaning.ipynb
WARNING: The directory '/home/jovyan/.cache/pip' or its parent directory is not owned or is not writable by the current user. The cache has been disabled. Check the permissions and owner of that directory. If executing pip with sudo, you may want sudo's -H flag.
Requirement already satisfied: seaborn in /opt/venv/lib/python3.7/site-packages (0.11.0)
Requirement already satisfied: pandas>=0.23 in /opt/venv/lib/python3.7/site-packages (from seaborn) (1.0.1)
Requirement already satisfied: numpy>=1.15 in /opt/venv/lib/python3.7/site-packages (from seaborn) (1.18.5)
Requirement already satisfied: matplotlib>=2.2 in /opt/venv/lib/python3.7/site-packages (from seaborn) (3.1.3)
Requirement already satisfied: scipy>=1.0 in /opt/venv/lib/python3.7/site-packages (from seaborn) (1.5.4)
Requirement already satisfied: python-dateutil>=2.6.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.23->seaborn) (2.8.1)
Requirement already satisfied: pytz>=2017.2 in /opt/venv/lib/python3.7/site-packages (from pandas>=0.23->seaborn) (2020.4)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from matplotlib>=2.2->seaborn) (2.4.7)
Requirement already satisfied: cycler>=0.10 in /opt/venv/lib/python3.7/site-packages (from matplotlib>=2.2->seaborn) (0.10.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /opt/venv/lib/python3.7/site-packages (from matplotlib>=2.2->seaborn) (1.3.1)
Requirement already satisfied: six>=1.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from python-dateutil>=2.6.1->pandas>=0.23->seaborn) (1.15.0)
WARNING: You are using pip version 20.2.4; however, version 20.3.3 is available.
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.
WARNING: The directory '/home/jovyan/.cache/pip' or its parent directory is not owned or is not writable by the current user. The cache has been disabled. Check the permissions and owner of that directory. If executing pip with sudo, you may want sudo's -H flag.
Requirement already satisfied: requests in /opt/venv/lib/python3.7/site-packages (2.25.0)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/venv/lib/python3.7/site-packages (from requests) (1.26.2)
Requirement already satisfied: certifi>=2017.4.17 in /opt/venv/lib/python3.7/site-packages (from requests) (2020.11.8)
Requirement already satisfied: idna<3,>=2.5 in /opt/venv/lib/python3.7/site-packages (from requests) (2.10)
Requirement already satisfied: chardet<4,>=3.0.2 in /opt/venv/lib/python3.7/site-packages (from requests) (3.0.4)
WARNING: You are using pip version 20.2.4; however, version 20.3.3 is available.
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.
WARNING: The directory '/home/jovyan/.cache/pip' or its parent directory is not owned or is not writable by the current user. The cache has been disabled. Check the permissions and owner of that directory. If executing pip with sudo, you may want sudo's -H flag.
Requirement already satisfied: lxml in /opt/venv/lib/python3.7/site-packages (4.6.2)
WARNING: You are using pip version 20.2.4; however, version 20.3.3 is available.
You should consider upgrading via the '/opt/venv/bin/python -m pip install --upgrade pip' command.
# scatter plot with a fitted regression line: 2019 vs 2020 gross per movie,
# with both axes clipped to the same 0..80,000,000 window
axis_limits = ( 0, 80000000 )
scatter = sns.lmplot( data=df_merge_gross, x='2019 Gross/Movies', y='2020 Gross/Movies' )
plt.xlim( *axis_limits )
plt.ylim( *axis_limits )
plt.show()
# correlation heat map of the per-movie gross and the maximum gross for 2019 and 2020
df_heat_map = df_merge_gross[['2019 Gross/Movies','2020 Gross/Movies','2019 Gross Max', '2020 Gross Max']]
# rowvar=False: each column (not each row) is one variable
correlation_coefficients = np.corrcoef( df_heat_map, rowvar=False )
labels = df_heat_map.columns
heatmap = sns.heatmap(correlation_coefficients, xticklabels = labels, yticklabels = labels, annot=True)
# finish this figure explicitly, like the other plotting cells do, so that
# the next plot does not draw onto the heat map's axes
plt.show()
# side-by-side histogram of tickets sold per movie, 2019 vs 2020
# 10 evenly spaced edges from 0 to 10,000,000 -> 9 bins
bins = np.linspace(0,10000000,10)
tickets_by_year = [ df_2019_top_movies['Tickets Sold'], df_2020_top_movies['Tickets Sold'] ]
plt.hist( tickets_by_year, bins=bins, label=['2019','2020'] )
plt.title( 'Distribution of Tickets Sold' )
plt.xlabel( 'Tickets Sold' )
plt.ylabel( 'Frequency' )
plt.legend()
plt.show()
from sklearn.linear_model import LogisticRegression

# fit a logistic regression on the 2015-2017 share and ticket columns
predictors = ml_dist_2018[["2015 Share","2016 Share","2017 Share", "2015 Tickets", "2016 Tickets", "2017 Tickets"]]
response = ml_dist_2018['Gross/Movies Difference']
# fit() returns the estimator itself, so construction and fitting can be chained
model = LogisticRegression().fit( predictors, response )

# store the in-sample predictions and whether each one matches the response
ml_dist_2018['Prediction'] = model.predict( predictors )
ml_dist_2018['Correct'] = ml_dist_2018['Prediction'].eq( ml_dist_2018['Gross/Movies Difference'] )

# fraction of rows predicted correctly (accuracy on the data the model was fit to)
ml_dist_2018['Correct'].mean()
# precision, recall and F1 of the model on the data it was fit to
predictions = model.predict( predictors )
TP = ( ml_dist_2018['Gross/Movies Difference'] & predictions ).sum()
FP = ( ~ml_dist_2018['Gross/Movies Difference'] & predictions ).sum()
FN = ( ml_dist_2018['Gross/Movies Difference'] & ~predictions ).sum()
if TP == 0:
    # without any true positive the ratios below degenerate to 0/0 (nan plus
    # a RuntimeWarning); report 0 instead, the same convention score_model uses
    precision = recall = F1 = 0.0
else:
    precision = TP / ( TP + FP )
    recall = TP / ( TP + FN )
    F1 = 2 * precision * recall / ( precision + recall )
precision, recall, F1
len(ml_dist_2018)
# keep only the six predictors followed by the response; fit_model_to and
# score_model treat the LAST column as the response
ml_dist_2018 = ml_dist_2018[["2015 Share","2016 Share","2017 Share", "2015 Tickets", "2016 Tickets", "2017 Tickets",'Gross/Movies Difference']]
import numpy as np
# pick the training rows WITHOUT replacement: np.random.choice samples with
# replacement by default, which draws duplicate labels and leaves fewer than
# 57 distinct rows in the training set
# the sample size is capped at the number of rows so the draw cannot fail
n_training_rows = min( 57, len( ml_dist_2018 ) )
rows_for_training = np.random.choice( ml_dist_2018.index, n_training_rows, replace=False )
training = ml_dist_2018.index.isin( rows_for_training )
df_training = ml_dist_2018[training]
df_validation = ml_dist_2018[~training]
len( df_training ), len( df_validation )
def fit_model_to (training):
    """Fit a logistic regression to `training` and return the fitted model.

    Every column of `training` except the last is used as a predictor; the
    response is read from the 'Gross/Movies Difference' column, so callers
    are expected to pass that column last.

    As a side effect, a second model is fit to standardized predictors and
    its coefficients are printed, ordered by decreasing absolute value, so
    the relative influence of the predictors can be compared.

    Returns the model fit to the original (unstandardized) predictors.
    """
    # choose predictors and fit model as before
    predictors = training.iloc[:,:-1]
    response = training['Gross/Movies Difference']
    model = LogisticRegression()
    model.fit( predictors, response )
    # fit another model to standardized predictors
    # a constant column has standard deviation 0 and would standardize to
    # NaN (0/0), which makes the fit raise; dividing by 1 instead leaves it
    # as an all-zero column
    spread = predictors.std().replace( 0, 1 )
    standardized = ( predictors - predictors.mean() ) / spread
    temp_model = LogisticRegression()
    temp_model.fit( standardized, response )
    # get that model's coefficients and display them
    coeffs = pd.Series( temp_model.coef_[0], index=predictors.columns )
    # order by magnitude, largest first (local name chosen so the builtin
    # `sorted` is not shadowed)
    by_magnitude = np.abs( coeffs ).sort_values( ascending=False )
    coeffs = coeffs.loc[by_magnitude.index]
    print( coeffs )
    return model
def score_model ( M, validation ):
    """Return the F1 score of fitted classifier `M` on `validation`.

    Every column of `validation` except the last is passed to `M.predict`;
    the true labels are read from the 'Gross/Movies Difference' column.
    Labels and predictions are interpreted as booleans (True / nonzero is
    the positive class).

    Returns 0.0 when there are no true positives, because precision and
    recall are then zero or undefined.
    """
    # coerce both sides to boolean arrays so that ~ and & are logical
    # operations regardless of the dtype the labels were stored with
    predicted = np.asarray( M.predict( validation.iloc[:,:-1] ) ).astype( bool )
    actual = validation['Gross/Movies Difference'].to_numpy().astype( bool )
    TP = ( actual & predicted ).sum()
    FP = ( ~actual & predicted ).sum()
    FN = ( actual & ~predicted ).sum()
    if TP == 0:
        return 0.0
    precision = TP / ( TP + FP )
    recall = TP / ( TP + FN )
    return 2 * precision * recall / ( precision + recall )
# fit on the training rows, then compare the F1 score on the training rows
# with the F1 score on the held-out validation rows
model = fit_model_to( df_training )
training_f1 = score_model( model, df_training )
validation_f1 = score_model( model, df_validation )
print( training_f1, validation_f1 )
2017 Tickets -0.580145
2017 Share -0.205851
2015 Share 0.201300
2016 Share 0.147397
2015 Tickets 0.140949
2016 Tickets 0.125064
dtype: float64
0.7419354838709677 0.45454545454545453
# refit without the first column: positions 1-5 are the remaining predictors,
# -1 is the response column
columns = [1,2,3,4,5,-1]
training_subset = df_training.iloc[:,columns]
validation_subset = df_validation.iloc[:,columns]
model = fit_model_to( training_subset )
score_model( model, training_subset ), score_model( model, validation_subset )
2017 Tickets -0.580470
2016 Share 0.243245
2015 Tickets 0.150333
2016 Tickets 0.130088
2017 Share -0.119788
dtype: float64
# refit without the first two columns: positions 2-5 are the remaining
# predictors, -1 is the response column
columns = [2,3,4,5,-1]
training_subset = df_training.iloc[:,columns]
validation_subset = df_validation.iloc[:,columns]
model = fit_model_to( training_subset )
score_model( model, training_subset ), score_model( model, validation_subset )
2017 Tickets -0.579885
2015 Tickets 0.161205
2016 Tickets 0.145424
2017 Share 0.080810
dtype: float64
# refit on predictor positions 1, 2, 4 and 5 only; -1 is the response column
columns = [1,2,4,5,-1]
training_subset = df_training.iloc[:,columns]
validation_subset = df_validation.iloc[:,columns]
model = fit_model_to( training_subset )
score_model( model, training_subset ), score_model( model, validation_subset )
2017 Tickets -0.514328
2016 Share 0.251219
2016 Tickets 0.188652
2017 Share -0.111083
dtype: float64
# refit on every predictor column of the training rows; the experiments above
# left `model` fitted to a subset of the columns
model = fit_model_to(df_training)
2017 Tickets -0.580145
2017 Share -0.205851
2015 Share 0.201300
2016 Share 0.147397
2015 Tickets 0.140949
2016 Tickets 0.125064
dtype: float64
# F1 score of the refitted model on the 2019 and 2020 frames
# NOTE(review): score_model feeds every column except the last to the model and
# reads the labels from 'Gross/Movies Difference' -- assumes ml_dist_2019 and
# ml_dist_2020 have the same column layout as the training frame; confirm
score_model( model, ml_dist_2019)
score_model( model, ml_dist_2020 )