import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read the training and test CSVs from the working directory.
train, test = map(pd.read_csv, ('yelp142_train.csv', 'yelp142_test.csv'))
2b i) build a linear regression model
!pip install statsmodels==0.12.2
# Fit an OLS model of stars on review_count plus each categorical attribute,
# with the "(Missing)" level of every attribute as the reference category.
import statsmodels.formula.api as smf

categorical_terms = ' + '.join(
    'C(%s, Treatment(reference="(Missing)"))' % column
    for column in (
        'GoodForKids', 'Alcohol', 'BusinessAcceptsCreditCards', 'WiFi',
        'BikeParking', 'ByAppointmentOnly', 'WheelechairAccessible',
        'OutdoorSeating', 'RestaurantsReservations', 'DogsAllowed', 'Caters',
    )
)
ols = smf.ols(formula='stars ~ review_count + ' + categorical_terms, data=train)
linear_model = ols.fit()
print(linear_model.summary())
2b ii) build a regression tree model
# Inspect the column dtypes to identify the categorical variables that need
# dummy encoding.
train.dtypes
# Dummy-encode train and test together, then split them back apart. Encoding
# each frame on its own can yield different columns (and a different dropped
# first level) whenever a category is absent from one of the two splits.
combined_enc = pd.get_dummies(
    pd.concat([train, test], keys=['train', 'test']),
    drop_first=True,
)
# .loc on the outer key restores each frame's original row index; copy() so
# later column assignments operate on independent frames.
train_enc = combined_enc.loc['train'].copy()
test_enc = combined_enc.loc['test'].copy()
train_enc.head()
# Split the encoded frames into predictors (x) and target (y).
# The target is kept exactly as read: casting stars to int64 would truncate
# any fractional rating (e.g. 3.5 -> 3), and the linear model is evaluated on
# the uncast stars column, so the two models would not be comparable.
y_train = train_enc['stars']
y_test = test_enc['stars']
# Predictors are cast to int64 so dummy columns become plain 0/1 integers.
x_train = train_enc.drop(['stars'], axis=1).astype('int64')
x_test = test_enc.drop(['stars'], axis=1).astype('int64')
#cross-validation with custom loss function. Create a function here (from Lab 6)
def average_loss_function(y_test, y_pred):
    """Return the mean weighted misclassification loss.

    Every position where the prediction differs from the actual value costs
    1, except positions whose actual value equals 1, which cost 20.

    Parameters:
        y_test: actual values (list, array or pandas Series).
        y_pred: predicted values, same length as y_test.

    Returns:
        The mean of the per-observation weighted losses.
    """
    # Work on flat arrays so the comparison is element-wise and positional for
    # any sequence type (two plain lists would otherwise compare as one bool).
    actual = np.asarray(y_test).ravel()
    predicted = np.asarray(y_pred).ravel()
    weights = np.where(actual == 1, 20, 1)
    return np.mean(weights * (actual != predicted))
# Cross-validate the regression tree over a grid of ccp_alpha values, scoring
# each candidate with the custom average loss. The other hyperparameters are
# held at single values (grid values given in Lab 6).
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

grid_values = dict(
    ccp_alpha=np.linspace(0, 0.10, 201),
    min_samples_leaf=[5],
    min_samples_split=[20],
    max_depth=[30],
    random_state=[88],
)
dtr = DecisionTreeRegressor()
# greater_is_better=False: the scorer reports the negated loss.
dtr_cv_avgloss = GridSearchCV(
    dtr,
    param_grid=grid_values,
    cv=10,
    verbose=1,
    scoring=make_scorer(average_loss_function, greater_is_better=False),
)
dtr_cv_avgloss.fit(x_train, y_train)

# Collect the alphas tried and flip the sign back to a positive loss.
ccp = dtr_cv_avgloss.cv_results_['param_ccp_alpha'].data
mean_avgloss = -dtr_cv_avgloss.cv_results_['mean_test_score']
pd.DataFrame({'ccp alpha': ccp, 'Mean Validation Average Loss': mean_avgloss}).head()
# Plot the mean validation loss against ccp_alpha, then report the best value.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_xlabel('ccp alpha', fontsize=16)
ax.set_ylabel('mean validation average loss', fontsize=16)
ax.scatter(ccp, mean_avgloss, s=1)
ax.plot(ccp, mean_avgloss, linewidth=3)
ax.grid(True, which='both')
plt.show()

best_alpha_avgloss = dtr_cv_avgloss.best_params_['ccp_alpha']
print('Grid best parameter (min. Avg Loss): ', best_alpha_avgloss)
# best_score_ is the negated loss, so negate it again for display.
print('Grid best score (Avg Loss): ', -dtr_cv_avgloss.best_score_)
# Plot the regression tree selected by cross-validation.
print('Node count =', dtr_cv_avgloss.best_estimator_.tree_.node_count)
from sklearn.tree import plot_tree
plt.figure(figsize=(24,12))
plot_tree(dtr_cv_avgloss.best_estimator_,
          # Pass a plain list: some scikit-learn releases validate this
          # parameter as a list and reject a pandas Index.
          feature_names=list(x_train.columns),
          # class_names is omitted: it only applies to classification trees.
          filled=True,
          impurity=True,
          rounded=True,
          fontsize=12,
          max_depth=3)  # limit the depth of the drawing
plt.show()
2b iii) Comparison of the linear regression and regression tree models
#function for OSR^2
def OSR2(model, X_test, y_test, y_train):
    """Return the out-of-sample R^2 of model on the test set.

    Computed as 1 - SSE/SST, where SSE is the squared error of the model's
    predictions on X_test and SST is the squared error of always predicting
    the mean of y_train.

    Parameters:
        model: any fitted object exposing predict(X).
        X_test: test predictors passed to model.predict.
        y_test: actual test targets.
        y_train: training targets; only their mean is used.
    """
    # Convert to flat float arrays so the subtraction is positional; pandas
    # objects would otherwise be aligned on their index labels.
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(model.predict(X_test), dtype=float).ravel()
    baseline = np.mean(np.asarray(y_train, dtype=float))
    sse = np.sum((actual - predicted) ** 2)
    sst = np.sum((actual - baseline) ** 2)
    return 1 - sse / sst
#function for MAE
from sklearn.metrics import mean_absolute_error
def MAE(model, x_test, y_test, y_train):
    """Return the mean absolute error of model's predictions on x_test.

    y_train is accepted but not used by this function.
    """
    return mean_absolute_error(y_test, model.predict(x_test))
# Show the training frame as cell output.
train
# Test-set predictors for the formula-based linear model (target removed).
test_features = test.drop(columns='stars')
# Out-of-sample R^2 and MAE of the linear regression model.
print('OSR2 for linear regression model:', OSR2(linear_model, test_features, test['stars'], train['stars']))
print('MAE for linear regression:', MAE(linear_model, test_features, test['stars'], train['stars']))
# Out-of-sample R^2 and MAE of the cross-validated regression tree.
print('OSR2 for regression tree:', OSR2(dtr_cv_avgloss, x_test, y_test, y_train))
print('MAE for regression tree:', MAE(dtr_cv_avgloss, x_test, y_test, y_train))
2c i) Regression to Classification
# Add the fourOrAbove flag (True when stars >= 4) to the encoded frames.
train_enc['fourOrAbove'] = train_enc['stars'] >= 4
test_enc['fourOrAbove'] = test_enc['stars'] >= 4
# Build the classification frames with the flag as an explicit 0/1 integer.
# An explicit cast is used instead of replace({True: 1, False: 0}) because the
# dtype produced by replace() on a boolean column varies across pandas versions.
train_c = train_enc.assign(fourOrAbove=train_enc['fourOrAbove'].astype('int64'))
test_c = test_enc.assign(fourOrAbove=test_enc['fourOrAbove'].astype('int64'))
train_c.head()
# Predictors exclude both the original rating and the derived flag.
x_train_c = train_c.drop(columns=['stars', 'fourOrAbove'])
x_test_c = test_c.drop(columns=['stars', 'fourOrAbove'])
# Targets are kept as one-column DataFrames.
y_train_c = train_c[['fourOrAbove']]
y_test_c = test_c[['fourOrAbove']]
2d i) Build Classification model with accuracy metric
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
# Fit the classification tree with the hyperparameters taken from the lab.
from sklearn.tree import DecisionTreeClassifier

lab_hyperparameters = dict(min_samples_leaf=5, ccp_alpha=0.001, random_state=88)
dtc_c = DecisionTreeClassifier(**lab_hyperparameters)
dtc_c = dtc_c.fit(x_train_c, y_train_c)
# Report the size of the fitted classification tree and draw it.
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
print('Node count =', dtc_c.tree_.node_count)
plt.figure(figsize=(12,12))
plot_tree(dtc_c,
          # Pass a plain list: some scikit-learn releases validate this
          # parameter as a list and reject a pandas Index.
          feature_names=list(x_train_c.columns),
          class_names=['0','1'],
          filled=True,
          impurity=True,
          rounded=True,
          fontsize=12)
plt.show()
# Predict on the test set and summarise the confusion matrix.
y_pred_c = dtc_c.predict(x_test_c)
cm = confusion_matrix(y_test_c, y_pred_c)
print ("Confusion Matrix : \n", cm)
# Flattened row by row, the matrix entries are tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()
acc = (tn + tp) / sum(cm.ravel())
TPR = tp / (tp + fn)
FPR = fp / (fp + tn)
for label, value in (('Accuracy is: %.4f', acc), ('TPR is: %.4f', TPR), ('FPR is: %.4f', FPR)):
    print(label % value)
2d ii) Use regression model with threshold at 4
#make continuous predictions into binary to fit to classification
def make_binary(predictions, threshold=0.5):
    """Convert continuous predictions into 0/1 labels.

    Parameters:
        predictions: iterable of numeric scores.
        threshold: scores greater than or equal to this value map to 1,
            all others to 0. Defaults to 0.5.

    Returns:
        A list of ints (0 or 1), one per prediction, in the same order.
    """
    return [1 if score >= threshold else 0 for score in predictions]
# Linear regression on the binary fourOrAbove target.
import statsmodels.formula.api as smf

# Take the un-encoded training data, swap the stars column for fourOrAbove.
train_unenc_c = train.drop(columns='stars').assign(fourOrAbove=train_c['fourOrAbove'])

# Same right-hand side as the stars model: review_count plus each categorical
# attribute with "(Missing)" as the reference level.
attribute_terms = ' + '.join(
    'C(%s, Treatment(reference="(Missing)"))' % column
    for column in (
        'GoodForKids', 'Alcohol', 'BusinessAcceptsCreditCards', 'WiFi',
        'BikeParking', 'ByAppointmentOnly', 'WheelechairAccessible',
        'OutdoorSeating', 'RestaurantsReservations', 'DogsAllowed', 'Caters',
    )
)
ols = smf.ols(formula='fourOrAbove ~ review_count + ' + attribute_terms, data=train_unenc_c)
linear_model2 = ols.fit()
print(linear_model2.summary())
# Confusion matrix of the linear regression model on the test set, after
# converting its continuous predictions to 0/1 labels.
from sklearn.metrics import confusion_matrix

linreg_scores = linear_model2.predict(test.drop(columns='stars'))
y_pred_linreg = make_binary(linreg_scores)
cm_linreg = confusion_matrix(y_test_c, y_pred_linreg)
# Regression tree fitted to the binary fourOrAbove target.
from sklearn.tree import DecisionTreeRegressor

dtr2 = DecisionTreeRegressor(min_samples_split=10, ccp_alpha=0.0, random_state=88)
dtr2 = dtr2.fit(x_train_c, y_train_c)

# Confusion matrix on the test set after converting predictions to 0/1 labels.
regtree_scores = dtr2.predict(x_test_c)
y_pred_regtree = make_binary(regtree_scores)
cm_regtree = confusion_matrix(y_test_c, y_pred_regtree)
2d iii) Logistic regression model for fourOrAbove
!pip install statsmodels==0.13.0
# Prepare the training data for the logistic model: recode the 'TRUE'/'FALSE'
# strings as 1/0 and replace stars with the binary fourOrAbove target.
train_logreg = train.replace({'TRUE':1, 'FALSE':0})
# Cast the comparison result directly to integers; mapping booleans through
# replace() leaves the resulting dtype dependent on the pandas version.
train_logreg['fourOrAbove'] = (train_logreg['stars'] >= 4).astype('int64')
train_logreg = train_logreg.drop('stars', axis=1)
train_logreg.head()
# Apply the same preparation to the test data.
test_logreg = test.replace({'TRUE':1, 'FALSE':0})
test_logreg['fourOrAbove'] = (test_logreg['stars'] >= 4).astype('int64')
test_logreg = test_logreg.drop('stars', axis=1)
test_logreg.head()
# Fit the logistic regression model for fourOrAbove.
import statsmodels.formula.api as smf

# review_count plus each categorical attribute, "(Missing)" as reference level.
logit_terms = ' + '.join(
    'C(%s, Treatment(reference="(Missing)"))' % column
    for column in (
        'GoodForKids', 'Alcohol', 'BusinessAcceptsCreditCards', 'WiFi',
        'BikeParking', 'ByAppointmentOnly', 'WheelechairAccessible',
        'OutdoorSeating', 'RestaurantsReservations', 'DogsAllowed', 'Caters',
    )
)
logreg = smf.logit(formula='fourOrAbove ~ review_count + ' + logit_terms,
                   data=train_logreg).fit()
print(logreg.summary())

# Predicted probabilities on the test set, labelled 1 when above one half.
y_prob_logreg = logreg.predict(test_logreg)
y_pred_logreg = pd.Series([1 if X > 1/2 else 0 for X in y_prob_logreg], index=y_prob_logreg.index)

from sklearn.metrics import confusion_matrix
y_test_logreg = test_logreg['fourOrAbove']
cm_logreg = confusion_matrix(y_test_logreg, y_pred_logreg)
print("Confusion Matrix: \n", cm_logreg)

# Accuracy = correct predictions (the diagonal) over all predictions, computed
# from the matrix itself so it stays right if the data or model changes.
accuracy_logreg = np.trace(cm_logreg) / cm_logreg.sum()
accuracy_logreg
2d iv) Classification tree model for fourOrAbove
# Cross-validate the classification tree over a grid of ccp_alpha values,
# scored by accuracy, to find the best pruning strength.
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

grid_values = dict(
    ccp_alpha=np.linspace(0, 0.10, 201),
    min_samples_leaf=[5],
    min_samples_split=[20],
    max_depth=[30],
    random_state=[88],
)
dtc2 = DecisionTreeClassifier()
dtc2_cv_acc = GridSearchCV(
    dtc2,
    param_grid=grid_values,
    cv=10,
    verbose=1,
    scoring='accuracy',
)
dtc2_cv_acc.fit(x_train_c, y_train_c)

# What sklearn calls mean_test_score is the score on the held-out fold, i.e.
# the validation set.
ccp2 = dtc2_cv_acc.cv_results_['param_ccp_alpha'].data
acc2 = dtc2_cv_acc.cv_results_['mean_test_score']
pd.DataFrame({'ccp alpha' : ccp2, 'Validation Accuracy': acc2}).head(10)
# Plot mean validation accuracy against ccp_alpha, then report the best value.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_xlabel('ccp alpha', fontsize=16)
ax.set_ylabel('mean validation accuracy', fontsize=16)
ax.scatter(ccp2, acc2, s=2)
ax.plot(ccp2, acc2, linewidth=3)
ax.grid(True, which='both')
plt.show()

best_alpha_acc = dtc2_cv_acc.best_params_['ccp_alpha']
print('Grid best parameter ccp_alpha (max. accuracy): ', best_alpha_acc)
print('Grid best score (accuracy): ', dtc2_cv_acc.best_score_)
# Refit the classification tree with the ccp_alpha selected by the
# cross-validation and build its test-set confusion matrix. The alpha is read
# from the grid search result instead of being typed in by hand.
# NOTE(review): class_weight, and the absence of min_samples_split/max_depth,
# differ from the settings used during the cross-validation -- confirm this is
# intended.
dtc2 = DecisionTreeClassifier(min_samples_leaf=5,
                              ccp_alpha=dtc2_cv_acc.best_params_['ccp_alpha'],
                              class_weight = {0: 1, 1: 20},
                              random_state = 88)
dtc2 = dtc2.fit(x_train_c, y_train_c)
y_pred_dtc2 = dtc2.predict(x_test_c)
from sklearn.metrics import confusion_matrix
cm_clatree = confusion_matrix(y_test_c, y_pred_dtc2)
cm_clatree
2d v) produce a table for performance comparison
#accuracy, TPR, and FPR functions
def accuracy(cm):
    """Return the accuracy, (tn + tp) / total, of a 2x2 confusion matrix.

    cm is indexed as [[tn, fp], [fn, tp]] and must support .item((row, col)).
    Returns 0 when the matrix contains no observations.
    """
    tn = cm.item((0,0))
    fp = cm.item((0,1))
    fn = cm.item((1,0))
    tp = cm.item((1,1))
    total = tn + fp + fn + tp
    # Guard the empty matrix the same way TPR and FPR guard their denominators.
    if total == 0:
        return 0
    return (tn + tp)/total
def TPR(cm):
    """Return the true positive rate, tp / (tp + fn), of a 2x2 confusion matrix.

    cm is indexed as [[tn, fp], [fn, tp]] and must support .item((row, col)).
    Returns 0 when there are no actual positives.
    """
    fn = cm.item((1,0))
    tp = cm.item((1,1))
    # The denominator is the number of actual positives (tp + fn); dividing by
    # tp + fp would give precision instead.
    if tp + fn == 0:
        return 0
    return tp/(tp+fn)
def FPR(cm):
    """Return the false positive rate, fp / (fp + tn), of a 2x2 confusion matrix.

    cm is indexed as [[tn, fp], [fn, tp]] and must support .item((row, col)).
    Returns 0 when there are no actual negatives.
    """
    tn = cm.item((0,0))
    fp = cm.item((0,1))
    # The denominator is the number of actual negatives (fp + tn); dividing by
    # tp + fp would give the false discovery rate instead.
    if fp + tn == 0:
        return 0
    return fp/(fp+tn)
# Baseline: predict fourOrAbove = 0 for every observation. The counts come
# from the test labels because every other confusion matrix in the comparison
# is computed on the test set.
tn_baseline = int((y_test_c['fourOrAbove'] == 0).sum())
fn_baseline = int((y_test_c['fourOrAbove'] == 1).sum())
# Laid out as [[tn, fp], [fn, tp]]; the baseline never predicts a positive.
cm_baseline = np.array([[tn_baseline, 0], [fn_baseline, 0]])
# Calculate the metrics for each model's confusion matrix.
list_matrices = [cm_baseline, cm_linreg, cm_regtree, cm_logreg, cm_clatree]
model_names = ['baseline', 'linear regression', 'regression tree',
               'logistic regression', 'classification tree']
for model_name, matrix in zip(model_names, list_matrices):
    # Label each line with the model name rather than dumping the matrix.
    print('accuracy of', model_name, ':', accuracy(matrix))
    print('TPR of', model_name, ':', TPR(matrix))
    print('FPR of', model_name, ':', FPR(matrix))