import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the pre-split Yelp review datasets (CSV files expected in the working directory).
train = pd.read_csv('yelp142_train.csv')
test = pd.read_csv('yelp142_test.csv')
2b i) Build a linear regression model
!pip install statsmodels==0.12.2
Collecting statsmodels==0.12.2
Downloading statsmodels-0.12.2-cp37-cp37m-manylinux1_x86_64.whl (9.5 MB)
|████████████████████████████████| 9.5 MB 8.2 MB/s
Requirement already satisfied: numpy>=1.15 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.12.2) (1.19.5)
Requirement already satisfied: scipy>=1.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.12.2) (1.7.1)
Collecting patsy>=0.5
Downloading patsy-0.5.2-py2.py3-none-any.whl (233 kB)
|████████████████████████████████| 233 kB 74.9 MB/s
Requirement already satisfied: pandas>=0.21 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.12.2) (1.2.5)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.21->statsmodels==0.12.2) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.21->statsmodels==0.12.2) (2021.3)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5->statsmodels==0.12.2) (1.16.0)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.2 statsmodels-0.12.2
WARNING: You are using pip version 21.2.4; however, version 21.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
#build a linear regression model with (Missing) as reference
# Every categorical attribute is treatment-coded against its "(Missing)" level,
# so each coefficient is a contrast versus businesses that did not report it.
import statsmodels.formula.api as smf
# NOTE(review): "WheelechairAccessible" looks misspelled but presumably matches
# the dataset's actual column header -- confirm against the CSV before renaming.
ols = smf.ols(formula = 'stars ~ review_count + C(GoodForKids, Treatment(reference="(Missing)")) + C(Alcohol, Treatment(reference="(Missing)")) + C(BusinessAcceptsCreditCards, Treatment(reference="(Missing)")) + C(WiFi, Treatment(reference="(Missing)")) + C(BikeParking, Treatment(reference="(Missing)")) + C(ByAppointmentOnly, Treatment(reference="(Missing)")) + C(WheelechairAccessible, Treatment(reference="(Missing)")) + C(OutdoorSeating, Treatment(reference="(Missing)")) + C(RestaurantsReservations, Treatment(reference="(Missing)")) + C(DogsAllowed, Treatment(reference="(Missing)")) + C(Caters, Treatment(reference="(Missing)"))',
data = train)
linear_model = ols.fit()
print(linear_model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: stars R-squared: 0.175
Model: OLS Adj. R-squared: 0.171
Method: Least Squares F-statistic: 52.86
Date: Sat, 16 Oct 2021 Prob (F-statistic): 1.16e-237
Time: 21:56:27 Log-Likelihood: -7281.7
No. Observations: 6272 AIC: 1.462e+04
Df Residuals: 6246 BIC: 1.479e+04
Df Model: 25
Covariance Type: nonrobust
============================================================================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------------------------------------------------------
Intercept 3.3376 0.039 85.220 0.000 3.261 3.414
C(GoodForKids, Treatment(reference="(Missing)"))[T.FALSE] -0.0021 0.046 -0.047 0.963 -0.092 0.088
C(GoodForKids, Treatment(reference="(Missing)"))[T.TRUE] -0.0992 0.036 -2.770 0.006 -0.169 -0.029
C(Alcohol, Treatment(reference="(Missing)"))[T.'beer_and_wine'] 0.2437 0.047 5.141 0.000 0.151 0.337
C(Alcohol, Treatment(reference="(Missing)"))[T.'full_bar'] 0.1305 0.044 2.997 0.003 0.045 0.216
C(Alcohol, Treatment(reference="(Missing)"))[T.'none'] 0.1226 0.039 3.143 0.002 0.046 0.199
C(BusinessAcceptsCreditCards, Treatment(reference="(Missing)"))[T.FALSE] 0.5590 0.091 6.124 0.000 0.380 0.738
C(BusinessAcceptsCreditCards, Treatment(reference="(Missing)"))[T.TRUE] 0.1337 0.046 2.899 0.004 0.043 0.224
C(WiFi, Treatment(reference="(Missing)"))[T.'free'] 0.1094 0.035 3.165 0.002 0.042 0.177
C(WiFi, Treatment(reference="(Missing)"))[T.'no'] 0.1102 0.033 3.317 0.001 0.045 0.175
C(WiFi, Treatment(reference="(Missing)"))[T.'paid'] -0.1727 0.101 -1.703 0.089 -0.371 0.026
C(BikeParking, Treatment(reference="(Missing)"))[T.FALSE] -0.1634 0.032 -5.111 0.000 -0.226 -0.101
C(BikeParking, Treatment(reference="(Missing)"))[T.TRUE] -0.1170 0.029 -4.069 0.000 -0.173 -0.061
C(ByAppointmentOnly, Treatment(reference="(Missing)"))[T.FALSE] 0.1045 0.034 3.081 0.002 0.038 0.171
C(ByAppointmentOnly, Treatment(reference="(Missing)"))[T.TRUE] 0.2720 0.102 2.656 0.008 0.071 0.473
C(WheelechairAccessible, Treatment(reference="(Missing)"))[T.FALSE] 0.7328 0.091 8.034 0.000 0.554 0.912
C(WheelechairAccessible, Treatment(reference="(Missing)"))[T.TRUE] 0.3771 0.028 13.488 0.000 0.322 0.432
C(OutdoorSeating, Treatment(reference="(Missing)"))[T.FALSE] -0.1502 0.040 -3.773 0.000 -0.228 -0.072
C(OutdoorSeating, Treatment(reference="(Missing)"))[T.TRUE] -0.0299 0.043 -0.701 0.483 -0.113 0.054
C(RestaurantsReservations, Treatment(reference="(Missing)"))[T.FALSE] -0.1914 0.041 -4.712 0.000 -0.271 -0.112
C(RestaurantsReservations, Treatment(reference="(Missing)"))[T.TRUE] 0.0198 0.046 0.435 0.664 -0.070 0.109
C(DogsAllowed, Treatment(reference="(Missing)"))[T.FALSE] 0.2623 0.030 8.888 0.000 0.204 0.320
C(DogsAllowed, Treatment(reference="(Missing)"))[T.TRUE] 0.1912 0.054 3.513 0.000 0.085 0.298
C(Caters, Treatment(reference="(Missing)"))[T.FALSE] -0.1333 0.030 -4.436 0.000 -0.192 -0.074
C(Caters, Treatment(reference="(Missing)"))[T.TRUE] 0.1173 0.033 3.584 0.000 0.053 0.181
review_count 8.773e-05 2.62e-05 3.355 0.001 3.65e-05 0.000
==============================================================================
Omnibus: 136.028 Durbin-Watson: 1.955
Prob(Omnibus): 0.000 Jarque-Bera (JB): 144.498
Skew: -0.370 Prob(JB): 4.19e-32
Kurtosis: 3.078 Cond. No. 5.01e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.01e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
2b ii) Build a regression tree model
#identify categorical variables for dummy encoding
train.dtypes
#dummy encoding
# get_dummies one-hot encodes every object column; drop_first avoids the dummy trap.
# NOTE(review): encoding train and test separately can produce mismatched columns
# if a category level is absent from one split -- confirm both frames align.
train_enc = pd.get_dummies(train, drop_first=True)
test_enc = pd.get_dummies(test, drop_first=True)
train_enc.head()
#split between x and y
# NOTE(review): astype('int64') truncates toward zero; if 'stars' contains
# half-star values this silently floors them -- confirm that is intended.
y_train = train_enc['stars'].astype('int64')
y_test = test_enc['stars'].astype('int64')
x_train = train_enc.drop(['stars'], axis=1).astype('int64')
x_test = test_enc.drop(['stars'], axis=1).astype('int64')
# Custom CV loss (from Lab 6): errors on label-1 samples are penalized 20x.
def average_loss_function(y_test, y_pred):
    """Mean misclassification loss with a 20:1 penalty on label-1 mistakes.

    y_test / y_pred are array-likes of integer labels; returns a float.
    """
    per_sample_cost = np.array([20 if label == 1 else 1 for label in y_test])
    mistakes = (y_test != y_pred)
    return np.mean(per_sample_cost * mistakes)
#cross-validation with custom loss function. Calculate the loss for each hyperparameter (grid values for hyperparameters other than ccp alpha given in Lab 6)
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
# Only ccp_alpha is actually tuned (201 points in [0, 0.10]); the remaining
# "grids" are single fixed values, including random_state for reproducibility.
grid_values = {'ccp_alpha': np.linspace(0, 0.10, 201),
'min_samples_leaf': [5],
'min_samples_split': [20],
'max_depth': [30],
'random_state': [88]}
dtr = DecisionTreeRegressor()
# greater_is_better=False makes GridSearchCV negate the loss internally, so
# best_score_ is the negative average loss (re-negated when reported below).
dtr_cv_avgloss = GridSearchCV(dtr, param_grid = grid_values, cv=10, verbose=1,
scoring = make_scorer(average_loss_function, greater_is_better=False))
dtr_cv_avgloss.fit(x_train, y_train)
Fitting 10 folds for each of 201 candidates, totalling 2010 fits
# Extract the tuned ccp_alpha values and flip the sign of the (negated)
# scores back to an average loss for tabulation and plotting.
ccp = dtr_cv_avgloss.cv_results_['param_ccp_alpha'].data
mean_avgloss = dtr_cv_avgloss.cv_results_['mean_test_score']*(-1)
pd.DataFrame({'ccp alpha': ccp, 'Mean Validation Average Loss': mean_avgloss}).head()
# Validation-loss curve versus ccp alpha.
plt.figure(figsize=(8, 6))
plt.xlabel('ccp alpha', fontsize=16)
plt.ylabel('mean validation average loss', fontsize=16)
plt.scatter(ccp, mean_avgloss, s=1)
plt.plot(ccp, mean_avgloss, linewidth=3)
plt.grid(True, which='both')
plt.show()
print('Grid best parameter (min. Avg Loss): ', dtr_cv_avgloss.best_params_['ccp_alpha'])
print('Grid best score (Avg Loss): ', dtr_cv_avgloss.best_score_*(-1))
Grid best parameter (min. Avg Loss): 0.0
Grid best score (Avg Loss): 1.46714208799358
#plot the regression tree
print('Node count =', dtr_cv_avgloss.best_estimator_.tree_.node_count)
from sklearn.tree import plot_tree
# Only the top 3 levels are drawn (max_depth=3); the full tree has ~1000 nodes.
# NOTE(review): class_names is meant for classifiers; for a regressor it is
# presumably ignored -- confirm against the sklearn plot_tree documentation.
plt.figure(figsize=(24,12))
plot_tree(dtr_cv_avgloss.best_estimator_,
feature_names=x_train.columns,
class_names=['0','1'],
filled=True,
impurity=True,
rounded=True,
fontsize=12,
max_depth=3)
plt.show()
Node count = 1085
2b iii) Comparison with linear regression & regression tree models
#function for OSR^2
def OSR2(model, X_test, y_test, y_train):
    """Out-of-sample R^2: 1 - SSE/SST, where SST is measured against the *training* mean."""
    predictions = model.predict(X_test)
    sse = np.sum((y_test - predictions) ** 2)
    sst = np.sum((y_test - np.mean(y_train)) ** 2)
    return 1 - sse / sst
#function for MAE
from sklearn.metrics import mean_absolute_error
def MAE(model, x_test, y_test, y_train):
    """Mean absolute error of the model's predictions on the test set.

    y_train is accepted but unused, so the signature mirrors OSR2.
    """
    predictions = model.predict(x_test)
    return np.mean(np.abs(np.asarray(y_test) - np.asarray(predictions)))
train
#Calculate OSR^2 of linear regression model
# The statsmodels formula model predicts directly from the raw (unencoded) test frame.
print('OSR2 for linear regression model:', OSR2(linear_model, test.drop(columns='stars', axis=1), test['stars'], train['stars']))
OSR2 for linear regression model: 0.1457737680162715
#Calculate MAE of linear regression model
print('MAE for linear regression:', MAE(linear_model, test.drop(columns='stars', axis=1), test['stars'], train['stars']))
MAE for linear regression: 0.6295425742392495
#Calculate OSR^2 of regression tree model
# The tree, by contrast, scores the dummy-encoded matrices.
print('OSR2 for regression tree:', OSR2(dtr_cv_avgloss, x_test, y_test, y_train))
OSR2 for regression tree: 0.05116545929390115
#Calculate MAE of regression tree model
print('MAE for regression tree:', MAE(dtr_cv_avgloss, x_test, y_test, y_train))
MAE for regression tree: 0.6754928669200531
2c i) Regression to Classification
#add fourOrAbove column to train and test
# fourOrAbove = 1 iff the business averages 4+ stars; built as a boolean
# comparison, then mapped {True: 1, False: 0} via replace.
train_enc['fourOrAbove'] = train_enc[['stars']] >= 4
train_c = train_enc.replace({'fourOrAbove': {True:1, False: 0}})
test_enc['fourOrAbove'] = test_enc[['stars']] >= 4
test_c = test_enc.replace({'fourOrAbove': {True: 1, False: 0}})
train_c.head()
# Features exclude both the raw target and its binarized version.
x_train_c = train_c.drop(columns=['stars','fourOrAbove'], axis=1)
x_test_c = test_c.drop(columns=['stars','fourOrAbove'], axis=1)
y_train_c = train_c[['fourOrAbove']]
y_test_c = test_c[['fourOrAbove']]
2d i) Build Classification model with accuracy metric
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
#use the hyperparameters from lab to fit the classification model
# Hyperparameters are fixed (not cross-validated) for this first classifier.
from sklearn.tree import DecisionTreeClassifier
dtc_c = DecisionTreeClassifier(min_samples_leaf=5,
ccp_alpha=0.001,
random_state = 88)
dtc_c = dtc_c.fit(x_train_c, y_train_c)
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
print('Node count =', dtc_c.tree_.node_count)
# Draw the full (pruned) classification tree -- small enough to render whole.
plt.figure(figsize=(12,12))
plot_tree(dtc_c,
feature_names=x_train_c.columns,
class_names=['0','1'],
filled=True,
impurity=True,
rounded=True,
fontsize=12)
plt.show()
Node count = 27
y_pred_c = dtc_c.predict(x_test_c)
cm = confusion_matrix(y_test_c, y_pred_c)
print ("Confusion Matrix : \n", cm)
# cm.ravel() flattens [[tn, fp], [fn, tp]] -> [tn, fp, fn, tp], so here:
# accuracy = (tn+tp)/total, TPR = tp/(tp+fn), FPR = fp/(fp+tn) (standard forms).
# NOTE(review): the scalar names TPR/FPR are shadowed later by same-named
# functions defined in section 2d v).
acc = (cm.ravel()[0]+cm.ravel()[3])/sum(cm.ravel())
TPR = cm.ravel()[3]/(cm.ravel()[3]+cm.ravel()[2])
FPR = cm.ravel()[1]/(cm.ravel()[1]+cm.ravel()[0])
print('Accuracy is: %.4f' %acc)
print('TPR is: %.4f' % TPR)
print('FPR is: %.4f' % FPR)
Confusion Matrix :
[[1137 358]
[ 606 587]]
Accuracy is: 0.6414
TPR is: 0.4920
FPR is: 0.2395
2d ii) Use regression model with threshold at 4
#make continuous predictions into binary to fit to classification
def make_binary(predictions):
    """Threshold continuous scores at 0.5, returning a list of 0/1 ints."""
    return [1 if score >= 0.5 else 0 for score in predictions]
#linear regression model with fourOrAbove
import statsmodels.formula.api as smf
#attach fourOrAbove to unencoded train dataset
# Linear probability model: same predictors as section 2b, but the response is
# the 0/1 fourOrAbove indicator instead of raw stars.
train_unenc_c = train.drop('stars', axis=1)
train_unenc_c['fourOrAbove'] = train_c['fourOrAbove']
ols = smf.ols(formula = 'fourOrAbove ~ review_count + C(GoodForKids, Treatment(reference="(Missing)")) + C(Alcohol, Treatment(reference="(Missing)")) + C(BusinessAcceptsCreditCards, Treatment(reference="(Missing)")) + C(WiFi, Treatment(reference="(Missing)")) + C(BikeParking, Treatment(reference="(Missing)")) + C(ByAppointmentOnly, Treatment(reference="(Missing)")) + C(WheelechairAccessible, Treatment(reference="(Missing)")) + C(OutdoorSeating, Treatment(reference="(Missing)")) + C(RestaurantsReservations, Treatment(reference="(Missing)")) + C(DogsAllowed, Treatment(reference="(Missing)")) + C(Caters, Treatment(reference="(Missing)"))',
data = train_unenc_c)
linear_model2 = ols.fit()
print(linear_model2.summary())
OLS Regression Results
==============================================================================
Dep. Variable: fourOrAbove R-squared: 0.156
Model: OLS Adj. R-squared: 0.153
Method: Least Squares F-statistic: 46.23
Date: Sat, 16 Oct 2021 Prob (F-statistic): 3.39e-208
Time: 19:56:29 Log-Likelihood: -3974.6
No. Observations: 6272 AIC: 8001.
Df Residuals: 6246 BIC: 8177.
Df Model: 25
Covariance Type: nonrobust
============================================================================================================================================
coef std err t P>|t| [0.025 0.975]
--------------------------------------------------------------------------------------------------------------------------------------------
Intercept 0.3916 0.023 16.943 0.000 0.346 0.437
C(GoodForKids, Treatment(reference="(Missing)"))[T.FALSE] 0.0512 0.027 1.886 0.059 -0.002 0.104
C(GoodForKids, Treatment(reference="(Missing)"))[T.TRUE] -0.0422 0.021 -1.997 0.046 -0.084 -0.001
C(Alcohol, Treatment(reference="(Missing)"))[T.'beer_and_wine'] 0.0706 0.028 2.524 0.012 0.016 0.125
C(Alcohol, Treatment(reference="(Missing)"))[T.'full_bar'] -0.0057 0.026 -0.223 0.823 -0.056 0.045
C(Alcohol, Treatment(reference="(Missing)"))[T.'none'] 0.0819 0.023 3.557 0.000 0.037 0.127
C(BusinessAcceptsCreditCards, Treatment(reference="(Missing)"))[T.FALSE] 0.2541 0.054 4.717 0.000 0.148 0.360
C(BusinessAcceptsCreditCards, Treatment(reference="(Missing)"))[T.TRUE] 0.0313 0.027 1.148 0.251 -0.022 0.085
C(WiFi, Treatment(reference="(Missing)"))[T.'free'] 0.0635 0.020 3.112 0.002 0.024 0.104
C(WiFi, Treatment(reference="(Missing)"))[T.'no'] 0.0533 0.020 2.718 0.007 0.015 0.092
C(WiFi, Treatment(reference="(Missing)"))[T.'paid'] -0.0907 0.060 -1.516 0.130 -0.208 0.027
C(BikeParking, Treatment(reference="(Missing)"))[T.FALSE] -0.0871 0.019 -4.615 0.000 -0.124 -0.050
C(BikeParking, Treatment(reference="(Missing)"))[T.TRUE] -0.0614 0.017 -3.622 0.000 -0.095 -0.028
C(ByAppointmentOnly, Treatment(reference="(Missing)"))[T.FALSE] 0.0885 0.020 4.422 0.000 0.049 0.128
C(ByAppointmentOnly, Treatment(reference="(Missing)"))[T.TRUE] 0.1121 0.060 1.854 0.064 -0.006 0.231
C(WheelechairAccessible, Treatment(reference="(Missing)"))[T.FALSE] 0.3666 0.054 6.809 0.000 0.261 0.472
C(WheelechairAccessible, Treatment(reference="(Missing)"))[T.TRUE] 0.2225 0.017 13.480 0.000 0.190 0.255
C(OutdoorSeating, Treatment(reference="(Missing)"))[T.FALSE] -0.0981 0.023 -4.174 0.000 -0.144 -0.052
C(OutdoorSeating, Treatment(reference="(Missing)"))[T.TRUE] -0.0511 0.025 -2.031 0.042 -0.100 -0.002
C(RestaurantsReservations, Treatment(reference="(Missing)"))[T.FALSE] -0.0839 0.024 -3.501 0.000 -0.131 -0.037
C(RestaurantsReservations, Treatment(reference="(Missing)"))[T.TRUE] 0.0157 0.027 0.584 0.559 -0.037 0.068
C(DogsAllowed, Treatment(reference="(Missing)"))[T.FALSE] 0.1692 0.017 9.716 0.000 0.135 0.203
C(DogsAllowed, Treatment(reference="(Missing)"))[T.TRUE] 0.1123 0.032 3.496 0.000 0.049 0.175
C(Caters, Treatment(reference="(Missing)"))[T.FALSE] -0.0797 0.018 -4.496 0.000 -0.114 -0.045
C(Caters, Treatment(reference="(Missing)"))[T.TRUE] 0.0283 0.019 1.465 0.143 -0.010 0.066
review_count 7.835e-05 1.54e-05 5.076 0.000 4.81e-05 0.000
==============================================================================
Omnibus: 37313.842 Durbin-Watson: 1.988
Prob(Omnibus): 0.000 Jarque-Bera (JB): 528.111
Skew: 0.267 Prob(JB): 2.10e-115
Kurtosis: 1.683 Cond. No. 5.01e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 5.01e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
#calculate accuracy of linear regression
from sklearn.metrics import confusion_matrix
# Predict P(fourOrAbove) on the raw test frame, binarize at 0.5, then tabulate.
y_pred_linreg = make_binary(linear_model2.predict(test.drop('stars', axis=1)))
cm_linreg = confusion_matrix(y_test_c, y_pred_linreg)
#regression tree model with fourOrAbove
from sklearn.tree import DecisionTreeRegressor
# NOTE(review): these hyperparameters (min_samples_split=10, ccp_alpha=0.0)
# differ from the CV-tuned tree above -- confirm they come from the assignment.
dtr2 = DecisionTreeRegressor(min_samples_split=10,
ccp_alpha=0.0,
random_state=88)
dtr2 = dtr2.fit(x_train_c, y_train_c)
#calculate accuracy of regression tree
y_pred_regtree = make_binary(dtr2.predict(x_test_c))
cm_regtree = confusion_matrix(y_test_c, y_pred_regtree)
2d iii) Logistic regression model for fourOrAbove
!pip install statsmodels==0.13.0
Collecting statsmodels==0.13.0
Downloading statsmodels-0.13.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.8 MB)
|████████████████████████████████| 9.8 MB 32.7 MB/s
Requirement already satisfied: numpy>=1.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.13.0) (1.19.5)
Requirement already satisfied: patsy>=0.5.2 in /root/venv/lib/python3.7/site-packages (from statsmodels==0.13.0) (0.5.2)
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.13.0) (1.2.5)
Requirement already satisfied: scipy>=1.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels==0.13.0) (1.7.1)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels==0.13.0) (2.8.2)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels==0.13.0) (2021.3)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels==0.13.0) (1.16.0)
Installing collected packages: statsmodels
Attempting uninstall: statsmodels
Found existing installation: statsmodels 0.12.2
Uninstalling statsmodels-0.12.2:
Successfully uninstalled statsmodels-0.12.2
Successfully installed statsmodels-0.13.0
WARNING: You are using pip version 21.2.4; however, version 21.3 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
#cleaning training dataset for the model
# Map the string flags 'TRUE'/'FALSE' to 1/0 before fitting the logit.
# NOTE(review): assumes the attribute columns literally hold the strings
# 'TRUE'/'FALSE' (plus '(Missing)' etc.) -- confirm against the raw CSV.
train_logreg = train.replace({'TRUE':1, 'FALSE':0})
train_logreg['fourOrAbove'] = train_logreg[['stars']] >= 4
train_logreg = train_logreg.replace({'fourOrAbove': {True:1, False: 0}}).drop('stars', axis=1)
train_logreg.head()
#cleaning test dataset
test_logreg = test.replace({'TRUE':1, 'FALSE':0})
test_logreg['fourOrAbove'] = test_logreg[['stars']] >= 4
test_logreg = test_logreg.replace({'fourOrAbove': {True:1, False:0}}).drop('stars', axis=1)
test_logreg.head()
#fit the logistic regression model
import statsmodels.formula.api as smf
logreg = smf.logit(formula = 'fourOrAbove ~ review_count + C(GoodForKids, Treatment(reference="(Missing)")) + C(Alcohol, Treatment(reference="(Missing)")) + C(BusinessAcceptsCreditCards, Treatment(reference="(Missing)")) + C(WiFi, Treatment(reference="(Missing)")) + C(BikeParking, Treatment(reference="(Missing)")) + C(ByAppointmentOnly, Treatment(reference="(Missing)")) + C(WheelechairAccessible, Treatment(reference="(Missing)")) + C(OutdoorSeating, Treatment(reference="(Missing)")) + C(RestaurantsReservations, Treatment(reference="(Missing)")) + C(DogsAllowed, Treatment(reference="(Missing)")) + C(Caters, Treatment(reference="(Missing)"))',
data = train_logreg).fit()
print(logreg.summary())
Optimization terminated successfully.
Current function value: 0.603049
Iterations 6
Logit Regression Results
==============================================================================
Dep. Variable: fourOrAbove No. Observations: 6272
Model: Logit Df Residuals: 6246
Method: MLE Df Model: 25
Date: Sat, 16 Oct 2021 Pseudo R-squ.: 0.1209
Time: 20:37:45 Log-Likelihood: -3782.3
converged: True LL-Null: -4302.5
Covariance Type: nonrobust LLR p-value: 1.634e-203
========================================================================================================================================
coef std err z P>|z| [0.025 0.975]
----------------------------------------------------------------------------------------------------------------------------------------
Intercept -0.4425 0.105 -4.219 0.000 -0.648 -0.237
C(GoodForKids, Treatment(reference="(Missing)"))[T.0] 0.2585 0.130 1.986 0.047 0.003 0.514
C(GoodForKids, Treatment(reference="(Missing)"))[T.1] -0.2006 0.101 -1.977 0.048 -0.400 -0.002
C(Alcohol, Treatment(reference="(Missing)"))[T.'beer_and_wine'] 0.3357 0.136 2.463 0.014 0.069 0.603
C(Alcohol, Treatment(reference="(Missing)"))[T.'full_bar'] -0.0452 0.126 -0.358 0.720 -0.292 0.202
C(Alcohol, Treatment(reference="(Missing)"))[T.'none'] 0.3954 0.113 3.503 0.000 0.174 0.617
C(BusinessAcceptsCreditCards, Treatment(reference="(Missing)"))[T.0] 1.1767 0.259 4.540 0.000 0.669 1.685
C(BusinessAcceptsCreditCards, Treatment(reference="(Missing)"))[T.1] 0.1375 0.124 1.105 0.269 -0.106 0.381
C(WiFi, Treatment(reference="(Missing)"))[T.'free'] 0.2966 0.098 3.025 0.002 0.104 0.489
C(WiFi, Treatment(reference="(Missing)"))[T.'no'] 0.2490 0.094 2.637 0.008 0.064 0.434
C(WiFi, Treatment(reference="(Missing)"))[T.'paid'] -0.5527 0.325 -1.699 0.089 -1.190 0.085
C(BikeParking, Treatment(reference="(Missing)"))[T.0] -0.4221 0.092 -4.609 0.000 -0.602 -0.243
C(BikeParking, Treatment(reference="(Missing)"))[T.1] -0.2906 0.081 -3.594 0.000 -0.449 -0.132
C(ByAppointmentOnly, Treatment(reference="(Missing)"))[T.0] 0.4338 0.098 4.423 0.000 0.242 0.626
C(ByAppointmentOnly, Treatment(reference="(Missing)"))[T.1] 0.5371 0.308 1.746 0.081 -0.066 1.140
C(WheelechairAccessible, Treatment(reference="(Missing)"))[T.0] 1.8650 0.319 5.850 0.000 1.240 2.490
C(WheelechairAccessible, Treatment(reference="(Missing)"))[T.1] 0.9883 0.077 12.836 0.000 0.837 1.139
C(OutdoorSeating, Treatment(reference="(Missing)"))[T.0] -0.4577 0.112 -4.073 0.000 -0.678 -0.237
C(OutdoorSeating, Treatment(reference="(Missing)"))[T.1] -0.2323 0.120 -1.943 0.052 -0.467 0.002
C(RestaurantsReservations, Treatment(reference="(Missing)"))[T.0] -0.4095 0.115 -3.564 0.000 -0.635 -0.184
C(RestaurantsReservations, Treatment(reference="(Missing)"))[T.1] 0.0775 0.129 0.602 0.547 -0.175 0.330
C(DogsAllowed, Treatment(reference="(Missing)"))[T.0] 0.7588 0.083 9.187 0.000 0.597 0.921
C(DogsAllowed, Treatment(reference="(Missing)"))[T.1] 0.4872 0.157 3.102 0.002 0.179 0.795
C(Caters, Treatment(reference="(Missing)"))[T.0] -0.3975 0.086 -4.624 0.000 -0.566 -0.229
C(Caters, Treatment(reference="(Missing)"))[T.1] 0.1282 0.092 1.396 0.163 -0.052 0.308
review_count 0.0004 9.48e-05 4.606 0.000 0.000 0.001
========================================================================================================================================
# Score the held-out set and binarize the predicted probabilities at 1/2.
# NOTE(review): this uses a strict > 0.5 threshold while make_binary uses
# >= 0.5, so predictions of exactly 0.5 are classified differently -- confirm.
y_prob_logreg = logreg.predict(test_logreg)
y_pred_logreg = pd.Series([1 if X > 1/2 else 0 for X in y_prob_logreg], index=y_prob_logreg.index)
from sklearn.metrics import confusion_matrix
y_test_logreg = test_logreg['fourOrAbove']
cm_logreg = confusion_matrix(y_test_logreg, y_pred_logreg)
print("Confusion Matrix: \n", cm_logreg)
Confusion Matrix:
[[1177 318]
[ 610 583]]
# Logistic-regression accuracy, computed directly from the confusion matrix.
# Previously this hard-coded the printed counts (1177/318/610/583), which would
# silently go stale whenever the data or model changed on a rerun.
# cm_logreg flattens as [tn, fp, fn, tp]; accuracy = (tn + tp) / total.
accuracy_logreg = (cm_logreg.item(0) + cm_logreg.item(3)) / cm_logreg.sum()
accuracy_logreg
2d iv) Classification tree model for fourOrAbove
#cross-validation to get the optimal ccp alpha
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
# Same 201-point ccp_alpha grid as the regression tree, but scored on plain
# (unweighted) accuracy this time.
grid_values = {'ccp_alpha': np.linspace(0, 0.10, 201),
'min_samples_leaf': [5],
'min_samples_split': [20],
'max_depth': [30],
'random_state': [88]}
dtc2 = DecisionTreeClassifier()
dtc2_cv_acc = GridSearchCV(dtc2, param_grid = grid_values, cv=10, verbose=1,
scoring = 'accuracy')
dtc2_cv_acc.fit(x_train_c, y_train_c)
Fitting 10 folds for each of 201 candidates, totalling 2010 fits
acc2 = dtc2_cv_acc.cv_results_['mean_test_score'] # what sklearn calls mean_test_score is the holdout set, i.e. the validation set.
ccp2 = dtc2_cv_acc.cv_results_['param_ccp_alpha'].data
pd.DataFrame({'ccp alpha' : ccp2, 'Validation Accuracy': acc2}).head(10)
# Validation-accuracy curve versus ccp alpha.
plt.figure(figsize=(8, 6))
plt.xlabel('ccp alpha', fontsize=16)
plt.ylabel('mean validation accuracy', fontsize=16)
plt.scatter(ccp2, acc2, s=2)
plt.plot(ccp2, acc2, linewidth=3)
plt.grid(True, which='both')
plt.show()
print('Grid best parameter ccp_alpha (max. accuracy): ', dtc2_cv_acc.best_params_['ccp_alpha'])
print('Grid best score (accuracy): ', dtc2_cv_acc.best_score_)
Grid best parameter ccp_alpha (max. accuracy): 0.001
Grid best score (accuracy): 0.6624673655766516
#create confusion matrix using the optimal ccp alpha value
# NOTE(review): class_weight={0:1, 1:20} is added here even though the CV above
# optimized plain accuracy without any weighting -- confirm this is intended.
dtc2 = DecisionTreeClassifier(min_samples_leaf=5,
ccp_alpha=0.001,
class_weight = {0: 1, 1: 20},
random_state = 88)
dtc2 = dtc2.fit(x_train_c, y_train_c)
y_pred_dtc2 = dtc2.predict(x_test_c)
from sklearn.metrics import confusion_matrix
cm_clatree = confusion_matrix(y_test_c, y_pred_dtc2)
cm_clatree
2d v) Produce a table for performance comparison
#accuracy, TPR, and FPR functions
def accuracy(cm):
    """Overall accuracy from a 2x2 confusion matrix laid out [[tn, fp], [fn, tp]]."""
    tn, fp, fn, tp = (cm.item(k) for k in range(4))
    return (tn + tp) / (tn + fp + fn + tp)
def TPR(cm):
    """True positive rate (recall) from a 2x2 confusion matrix [[tn, fp], [fn, tp]].

    TPR = tp / (tp + fn); returns 0 when there are no actual positives.
    Fix: the previous version divided by (tp + fp), which is precision, and
    disagreed with the inline TPR computed from cm.ravel() in section 2d i).
    """
    fn = cm.item((1, 0))
    tp = cm.item((1, 1))
    if tp + fn == 0:
        # No positive samples at all -> recall is undefined; report 0.
        return 0
    return tp / (tp + fn)
def FPR(cm):
    """False positive rate from a 2x2 confusion matrix [[tn, fp], [fn, tp]].

    FPR = fp / (fp + tn); returns 0 when there are no actual negatives.
    Fix: the previous version divided by (tp + fp) and guarded on the wrong
    denominator; the inline version in section 2d i) uses fp / (fp + tn).
    """
    tn = cm.item((0, 0))
    fp = cm.item((0, 1))
    if fp + tn == 0:
        # No negative samples at all -> FPR is undefined; report 0.
        return 0
    return fp / (fp + tn)
#baseline assume all fourOrAbove = 0
# NOTE(review): the baseline counts come from the TRAINING labels while the
# model confusion matrices are built on the test set -- confirm intended.
tn_baseline = len(y_train_c[y_train_c['fourOrAbove']==0])
fn_baseline = len(y_train_c[y_train_c['fourOrAbove']==1])
#calculate metrics for the matrices
# A predict-all-zero baseline: every negative is a tn, every positive an fn.
cm_baseline = np.matrix([[tn_baseline, 0],[fn_baseline, 0]])
list_matrices = [cm_baseline, cm_linreg, cm_regtree, cm_logreg, cm_clatree]
for i in list_matrices:
    print('accuracy of',i,':', accuracy(i))
    print('TPR of',i,':', TPR(i))
    print('FPR of',i,':', FPR(i))
accuracy of [[3511 0]
[2761 0]] : 0.5597895408163265
TPR of [[3511 0]
[2761 0]] : 0
FPR of [[3511 0]
[2761 0]] : 0
accuracy of [[1194 301]
[ 617 576]] : 0.6584821428571429
TPR of [[1194 301]
[ 617 576]] : 0.6567844925883695
FPR of [[1194 301]
[ 617 576]] : 0.34321550741163054
accuracy of [[990 505]
[532 661]] : 0.6142113095238095
TPR of [[990 505]
[532 661]] : 0.5668953687821612
FPR of [[990 505]
[532 661]] : 0.43310463121783876
accuracy of [[1177 318]
[ 610 583]] : 0.6547619047619048
TPR of [[1177 318]
[ 610 583]] : 0.6470588235294118
FPR of [[1177 318]
[ 610 583]] : 0.35294117647058826
accuracy of [[ 0 1495]
[ 0 1193]] : 0.44382440476190477
TPR of [[ 0 1495]
[ 0 1193]] : 0.44382440476190477
FPR of [[ 0 1495]
[ 0 1193]] : 0.5561755952380952