IEOR 142 HW3

import pandas as pd import numpy as np import matplotlib.pyplot as plt

# Load the training and test splits from CSV files in the working directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('yelp142_train.csv', 'yelp142_test.csv')
)

2b i) build a linear regression model

!pip install statsmodels==0.12.2

# Build a linear regression model of `stars`, using "(Missing)" as the
# reference level of every categorical predictor.
import statsmodels.formula.api as smf

ols = smf.ols(
    formula=(
        'stars ~ review_count'
        ' + C(GoodForKids, Treatment(reference="(Missing)"))'
        ' + C(Alcohol, Treatment(reference="(Missing)"))'
        ' + C(BusinessAcceptsCreditCards, Treatment(reference="(Missing)"))'
        ' + C(WiFi, Treatment(reference="(Missing)"))'
        ' + C(BikeParking, Treatment(reference="(Missing)"))'
        ' + C(ByAppointmentOnly, Treatment(reference="(Missing)"))'
        ' + C(WheelechairAccessible, Treatment(reference="(Missing)"))'
        ' + C(OutdoorSeating, Treatment(reference="(Missing)"))'
        ' + C(RestaurantsReservations, Treatment(reference="(Missing)"))'
        ' + C(DogsAllowed, Treatment(reference="(Missing)"))'
        ' + C(Caters, Treatment(reference="(Missing)"))'
    ),
    data=train,
)
linear_model = ols.fit()
print(linear_model.summary())

2b ii) build a regression tree model

# Identify categorical variables for dummy encoding.
train.dtypes

# Dummy-encode train and test TOGETHER so both frames end up with exactly the
# same dummy columns and the same dropped (reference) level per variable.
# Encoding each split on its own produces different columns whenever a
# category level is absent from one of the splits.
combined_enc = pd.get_dummies(
    pd.concat([train, test], keys=['train', 'test']),
    drop_first=True,
)
# .loc on the outer key restores each split's original row index; .copy()
# lets later cells add columns without chained-assignment warnings.
train_enc = combined_enc.loc['train'].copy()
test_enc = combined_enc.loc['test'].copy()
train_enc.head()

# Split between x and y.
# The target is kept as float: casting `stars` to int64 would truncate any
# fractional rating and would make the tree's metrics incomparable with the
# linear model, which is evaluated against the raw `stars` column.
y_train = train_enc['stars'].astype('float64')
y_test = test_enc['stars'].astype('float64')
# Predictors are cast to int64 so dummy columns are numeric 0/1.
x_train = train_enc.drop(columns=['stars']).astype('int64')
x_test = test_enc.drop(columns=['stars']).astype('int64')

# Cross-validation with custom loss function. Create a function here (from Lab 6).
def average_loss_function(y_test, y_pred):
    """Return the weighted average misclassification loss.

    Each observation contributes 0 when its prediction equals the true value.
    Otherwise it contributes its weight: 20 when the true value is 1, and 1
    for any other true value. The result is the mean contribution.

    Parameters
    ----------
    y_test : array-like
        True values.
    y_pred : array-like
        Predicted values, same length as ``y_test``.

    Returns
    -------
    float
        Mean of ``weight * (y_test != y_pred)``.
    """
    # Convert once so lists, Series and arrays are all compared element-wise
    # and by position (never by pandas index label).
    actual = np.asarray(y_test)
    predicted = np.asarray(y_pred)
    weights = np.where(actual == 1, 20, 1)
    return np.mean(weights * (actual != predicted))

# Cross-validation with the custom loss function: evaluate the loss for each
# candidate ccp_alpha (the other hyperparameter values are the ones given in
# Lab 6 and are held fixed).
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

grid_values = dict(
    ccp_alpha=np.linspace(0, 0.10, 201),
    min_samples_leaf=[5],
    min_samples_split=[20],
    max_depth=[30],
    random_state=[88],
)

dtr = DecisionTreeRegressor()
# greater_is_better=False makes the scorer return the negated loss, so the
# grid search's "best" score corresponds to the smallest loss.
dtr_cv_avgloss = GridSearchCV(
    dtr,
    param_grid=grid_values,
    cv=10,
    verbose=1,
    scoring=make_scorer(average_loss_function, greater_is_better=False),
)
dtr_cv_avgloss.fit(x_train, y_train)

# Collect the candidate ccp_alpha values and their mean validation loss.
# Scores are stored negated by the scorer, so flip the sign back.
ccp = dtr_cv_avgloss.cv_results_['param_ccp_alpha'].data
mean_avgloss = -dtr_cv_avgloss.cv_results_['mean_test_score']
pd.DataFrame(
    {'ccp alpha': ccp, 'Mean Validation Average Loss': mean_avgloss}
).head()

# Plot mean validation loss against ccp_alpha.
plt.figure(figsize=(8, 6))
plt.xlabel('ccp alpha', fontsize=16)
plt.ylabel('mean validation average loss', fontsize=16)
plt.scatter(ccp, mean_avgloss, s=1)
plt.plot(ccp, mean_avgloss, linewidth=3)
plt.grid(True, which='both')
plt.show()

# Report the selected ccp_alpha and its (sign-corrected) loss.
print(
    'Grid best parameter (min. Avg Loss): ',
    dtr_cv_avgloss.best_params_['ccp_alpha'],
)
print('Grid best score (Avg Loss): ', -dtr_cv_avgloss.best_score_)

# Plot the regression tree selected by cross-validation (top 3 levels only).
from sklearn.tree import plot_tree

print('Node count =', dtr_cv_avgloss.best_estimator_.tree_.node_count)
plt.figure(figsize=(24, 12))
plot_tree(
    dtr_cv_avgloss.best_estimator_,
    # Pass a plain list: some scikit-learn releases reject a pandas Index.
    feature_names=list(x_train.columns),
    # class_names is intentionally omitted: this is a regressor, which has
    # no classes to label.
    filled=True,
    impurity=True,
    rounded=True,
    fontsize=12,
    max_depth=3,
)
plt.show()

2b iii) Comparison with linear regression & regression tree models

# Function for OSR^2.
def OSR2(model, X_test, y_test, y_train):
    """Return the out-of-sample R^2 of ``model`` on the test data.

    Computed as ``1 - SSE / SST``. SSE is the sum of squared errors of
    ``model.predict(X_test)`` against ``y_test``. SST is the sum of squared
    deviations of ``y_test`` from the mean of ``y_train``.
    """
    predictions = model.predict(X_test)
    train_mean = np.mean(y_train)
    residual_ss = np.sum((y_test - predictions) ** 2)
    total_ss = np.sum((y_test - train_mean) ** 2)
    return 1 - residual_ss / total_ss

# Function for MAE.
from sklearn.metrics import mean_absolute_error


def MAE(model, x_test, y_test, y_train):
    """Return the mean absolute error of ``model``'s predictions on the test data.

    ``y_train`` is accepted but not used by this function.
    """
    return mean_absolute_error(y_test, model.predict(x_test))

train

# Calculate OSR^2 of the linear regression model. The formula-based model is
# given the raw (un-encoded) test frame without the target column.
print(
    'OSR2 for linear regression model:',
    OSR2(linear_model, test.drop(columns='stars'), test['stars'], train['stars']),
)

# Calculate MAE of the linear regression model.
print(
    'MAE for linear regression:',
    MAE(linear_model, test.drop(columns='stars'), test['stars'], train['stars']),
)

# Calculate OSR^2 of the regression tree model (dummy-encoded features).
print(
    'OSR2 for regression tree:',
    OSR2(dtr_cv_avgloss, x_test, y_test, y_train),
)

# Calculate MAE of the regression tree model.
print(
    'MAE for regression tree:',
    MAE(dtr_cv_avgloss, x_test, y_test, y_train),
)

2c i) Regression to Classification

# Add the fourOrAbove label to train and test.
# train_enc / test_enc receive the boolean indicator (stars >= 4); train_c /
# test_c are copies in which that indicator is stored as integer 0/1.
# The integer conversion uses an explicit astype rather than
# replace({True: 1, False: 0}), whose implicit bool -> int downcast is
# deprecated in pandas.
train_enc['fourOrAbove'] = train_enc['stars'] >= 4
train_c = train_enc.assign(fourOrAbove=train_enc['fourOrAbove'].astype('int64'))
test_enc['fourOrAbove'] = test_enc['stars'] >= 4
test_c = test_enc.assign(fourOrAbove=test_enc['fourOrAbove'].astype('int64'))
train_c.head()

# Features: every column except the rating and the derived label.
x_train_c, x_test_c = (
    frame.drop(columns=['stars', 'fourOrAbove'])
    for frame in (train_c, test_c)
)
# Targets are kept as single-column DataFrames (double brackets).
y_train_c, y_test_c = (
    frame[['fourOrAbove']]
    for frame in (train_c, test_c)
)

2d i) Build Classification model with accuracy metric

from sklearn.metrics import confusion_matrix from sklearn.metrics import precision_score from sklearn.metrics import recall_score

# Use the hyperparameters from lab to fit the classification model.
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree

dtc_c = DecisionTreeClassifier(min_samples_leaf=5, ccp_alpha=0.001, random_state=88)
dtc_c = dtc_c.fit(x_train_c, y_train_c)

# Report the size of the fitted tree and draw it.
print('Node count =', dtc_c.tree_.node_count)
plt.figure(figsize=(12, 12))
plot_tree(
    dtc_c,
    # Pass a plain list: some scikit-learn releases reject a pandas Index.
    feature_names=list(x_train_c.columns),
    class_names=['0', '1'],
    filled=True,
    impurity=True,
    rounded=True,
    fontsize=12,
)
plt.show()

# Evaluate the classification tree on the test set.
y_pred_c = dtc_c.predict(x_test_c)
cm = confusion_matrix(y_test_c, y_pred_c)
print ("Confusion Matrix : \n", cm)

# cm is indexed [true label, predicted label]: row 0 holds the true
# negatives/false positives, row 1 the false negatives/true positives.
acc = np.trace(cm) / cm.sum()      # (TN + TP) / total
TPR = cm[1, 1] / cm[1].sum()       # TP / (TP + FN)
FPR = cm[0, 1] / cm[0].sum()       # FP / (FP + TN)
print('Accuracy is: %.4f' %acc)
print('TPR is: %.4f' % TPR)
print('FPR is: %.4f' % FPR)

2d ii) Use regression model with threshold at 4

# Make continuous predictions into binary to fit to classification.
def make_binary(predictions):
    """Return a list with 1 for every prediction >= 0.5 and 0 otherwise."""
    return [1 if value >= 0.5 else 0 for value in predictions]

# Linear regression model with fourOrAbove as the response.
import statsmodels.formula.api as smf

# Attach fourOrAbove to the un-encoded training data (the formula interface
# needs the original categorical columns, not the dummy-encoded ones).
train_unenc_c = train.drop(columns='stars')
train_unenc_c['fourOrAbove'] = train_c['fourOrAbove']

ols = smf.ols(
    formula=(
        'fourOrAbove ~ review_count'
        ' + C(GoodForKids, Treatment(reference="(Missing)"))'
        ' + C(Alcohol, Treatment(reference="(Missing)"))'
        ' + C(BusinessAcceptsCreditCards, Treatment(reference="(Missing)"))'
        ' + C(WiFi, Treatment(reference="(Missing)"))'
        ' + C(BikeParking, Treatment(reference="(Missing)"))'
        ' + C(ByAppointmentOnly, Treatment(reference="(Missing)"))'
        ' + C(WheelechairAccessible, Treatment(reference="(Missing)"))'
        ' + C(OutdoorSeating, Treatment(reference="(Missing)"))'
        ' + C(RestaurantsReservations, Treatment(reference="(Missing)"))'
        ' + C(DogsAllowed, Treatment(reference="(Missing)"))'
        ' + C(Caters, Treatment(reference="(Missing)"))'
    ),
    data=train_unenc_c,
)
linear_model2 = ols.fit()
print(linear_model2.summary())

# Confusion matrix of the linear regression model: threshold its continuous
# predictions with make_binary, then compare against the test labels.
from sklearn.metrics import confusion_matrix

y_pred_linreg = make_binary(linear_model2.predict(test.drop(columns='stars')))
cm_linreg = confusion_matrix(y_test_c, y_pred_linreg)

# Regression tree model with fourOrAbove as the response.
from sklearn.tree import DecisionTreeRegressor

dtr2 = DecisionTreeRegressor(
    min_samples_split=10,
    ccp_alpha=0.0,
    random_state=88,
).fit(x_train_c, y_train_c)

# Confusion matrix of the regression tree, thresholded the same way.
y_pred_regtree = make_binary(dtr2.predict(x_test_c))
cm_regtree = confusion_matrix(y_test_c, y_pred_regtree)

2d iii) Logistic regression model for fourOrAbove

!pip install statsmodels==0.13.0

# Cleaning the training dataset for the model: map the 'TRUE'/'FALSE' strings
# to 1/0, derive the integer 0/1 label fourOrAbove (stars >= 4), and drop the
# original rating. The label is converted with an explicit astype rather than
# replace({True: 1, False: 0}), whose implicit bool -> int downcast is
# deprecated in pandas.
train_logreg = train.replace({'TRUE': 1, 'FALSE': 0})
train_logreg['fourOrAbove'] = (train_logreg['stars'] >= 4).astype('int64')
train_logreg = train_logreg.drop(columns='stars')
train_logreg.head()

# Cleaning the test dataset in exactly the same way.
test_logreg = test.replace({'TRUE': 1, 'FALSE': 0})
test_logreg['fourOrAbove'] = (test_logreg['stars'] >= 4).astype('int64')
test_logreg = test_logreg.drop(columns='stars')
test_logreg.head()

# Fit the logistic regression model for fourOrAbove, with "(Missing)" as the
# reference level of every categorical predictor.
import statsmodels.formula.api as smf

logreg = smf.logit(
    formula=(
        'fourOrAbove ~ review_count'
        ' + C(GoodForKids, Treatment(reference="(Missing)"))'
        ' + C(Alcohol, Treatment(reference="(Missing)"))'
        ' + C(BusinessAcceptsCreditCards, Treatment(reference="(Missing)"))'
        ' + C(WiFi, Treatment(reference="(Missing)"))'
        ' + C(BikeParking, Treatment(reference="(Missing)"))'
        ' + C(ByAppointmentOnly, Treatment(reference="(Missing)"))'
        ' + C(WheelechairAccessible, Treatment(reference="(Missing)"))'
        ' + C(OutdoorSeating, Treatment(reference="(Missing)"))'
        ' + C(RestaurantsReservations, Treatment(reference="(Missing)"))'
        ' + C(DogsAllowed, Treatment(reference="(Missing)"))'
        ' + C(Caters, Treatment(reference="(Missing)"))'
    ),
    data=train_logreg,
).fit()
print(logreg.summary())

# Predicted probabilities on the test set, classified as 1 when strictly
# greater than 1/2 (the original index is preserved).
y_prob_logreg = logreg.predict(test_logreg)
y_pred_logreg = (y_prob_logreg > 1/2).astype('int64')

# Confusion matrix of the logistic regression model.
from sklearn.metrics import confusion_matrix

y_test_logreg = test_logreg['fourOrAbove']
cm_logreg = confusion_matrix(y_test_logreg, y_pred_logreg)
print("Confusion Matrix: \n", cm_logreg)

# Accuracy = correct predictions (the diagonal) / all predictions. Computed
# from cm_logreg itself instead of numbers copied by hand from an earlier
# printout, so it stays correct whenever the data or the model changes.
accuracy_logreg = np.trace(cm_logreg) / cm_logreg.sum()
accuracy_logreg

2d iv) Classification tree model for fourOrAbove

# Cross-validation to get the optimal ccp_alpha for the classification tree;
# the remaining hyperparameters are held fixed.
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

grid_values = dict(
    ccp_alpha=np.linspace(0, 0.10, 201),
    min_samples_leaf=[5],
    min_samples_split=[20],
    max_depth=[30],
    random_state=[88],
)

dtc2 = DecisionTreeClassifier()
dtc2_cv_acc = GridSearchCV(
    dtc2,
    param_grid=grid_values,
    cv=10,
    verbose=1,
    scoring='accuracy',
)
dtc2_cv_acc.fit(x_train_c, y_train_c)

# What sklearn calls mean_test_score is measured on the held-out fold, i.e.
# the validation set.
acc2 = dtc2_cv_acc.cv_results_['mean_test_score']
ccp2 = dtc2_cv_acc.cv_results_['param_ccp_alpha'].data
pd.DataFrame({'ccp alpha' : ccp2, 'Validation Accuracy': acc2}).head(10)

# Plot mean validation accuracy against ccp_alpha.
plt.figure(figsize=(8, 6))
plt.xlabel('ccp alpha', fontsize=16)
plt.ylabel('mean validation accuracy', fontsize=16)
plt.scatter(ccp2, acc2, s=2)
plt.plot(ccp2, acc2, linewidth=3)
plt.grid(True, which='both')
plt.show()

# Report the selected ccp_alpha and its mean validation accuracy.
print(
    'Grid best parameter ccp_alpha (max. accuracy): ',
    dtc2_cv_acc.best_params_['ccp_alpha'],
)
print('Grid best score (accuracy): ', dtc2_cv_acc.best_score_)

# Create the confusion matrix using the optimal ccp_alpha value found by the
# cross-validation above (read from the grid search rather than hard-coded).
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# NOTE(review): class_weight={0: 1, 1: 20} and the absence of
# min_samples_split / max_depth differ from the settings used during the grid
# search, so the selected ccp_alpha was tuned for a slightly different tree.
# Confirm whether the class weights are intended here.
dtc2 = DecisionTreeClassifier(
    min_samples_leaf=5,
    ccp_alpha=dtc2_cv_acc.best_params_['ccp_alpha'],
    class_weight={0: 1, 1: 20},
    random_state=88,
)
dtc2 = dtc2.fit(x_train_c, y_train_c)
y_pred_dtc2 = dtc2.predict(x_test_c)

cm_clatree = confusion_matrix(y_test_c, y_pred_dtc2)
cm_clatree

2d v) produce a table for performance comparison

# Accuracy, TPR, and FPR functions.
def _confusion_counts(cm):
    """Return ``(tn, fp, fn, tp)`` read from a 2x2 matrix laid out as [[tn, fp], [fn, tp]]."""
    return cm.item((0, 0)), cm.item((0, 1)), cm.item((1, 0)), cm.item((1, 1))


def accuracy(cm):
    """Return the share of correct predictions: (tn + tp) / total."""
    tn, fp, fn, tp = _confusion_counts(cm)
    return (tn + tp) / (tn + fp + fn + tp)


def TPR(cm):
    """Return the true positive rate tp / (tp + fn).

    Returns 0 when the matrix contains no actual positives.
    """
    _, _, fn, tp = _confusion_counts(cm)
    actual_positives = tp + fn
    if actual_positives == 0:
        return 0
    return tp / actual_positives


def FPR(cm):
    """Return the false positive rate fp / (fp + tn).

    Returns 0 when the matrix contains no actual negatives.
    """
    tn, fp, _, _ = _confusion_counts(cm)
    actual_negatives = fp + tn
    if actual_negatives == 0:
        return 0
    return fp / actual_negatives

# Baseline: assume all fourOrAbove = 0. The counts are taken from the TEST
# labels so the baseline is evaluated on the same observations as every other
# confusion matrix below.
tn_baseline = int((y_test_c['fourOrAbove'] == 0).sum())
fn_baseline = int((y_test_c['fourOrAbove'] == 1).sum())

# Calculate metrics for the matrices.
# Layout is [[tn, fp], [fn, tp]]; the baseline never predicts 1, so its
# second column is all zeros. A plain ndarray is used instead of the
# deprecated np.matrix.
cm_baseline = np.array([[tn_baseline, 0], [fn_baseline, 0]])
list_matrices = [cm_baseline, cm_linreg, cm_regtree, cm_logreg, cm_clatree]
for i in list_matrices:
    print('accuracy of', i, ':', accuracy(i))
    print('TPR of', i, ':', TPR(i))
    print('FPR of', i, ':', FPR(i))

2b i) build a linear regression model