import pandas as pd
import numpy as np
train = pd.read_csv('Letters_train.csv')
train.head()
test = pd.read_csv('Letters_test.csv')
1 a) i)
#make a function that tells if the letter is B or not
def is_b(text):
    if text == 'B':
        return 1
    else:
        return 0
#apply is_b function to the training and test dataset
train['isB'] = train['letter'].apply(is_b)
test['isB'] = test['letter'].apply(is_b)
test.head()
#see how many 0's (not B) and 1's (B) there are in the training set
train.groupby('isB').count()
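A quicker equivalent check (a one-line sketch, not required by the question): value_counts with normalize=True gives the class shares directly, and the larger share is the accuracy of a baseline that always predicts "not B".
#class balance as fractions; the majority share is the baseline accuracy
train['isB'].value_counts(normalize=True)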
1 a) ii)
!pip install statsmodels==0.13.0
import statsmodels.formula.api as smf
#build a logistic regression model
logreg = smf.logit(formula = 'isB ~ xbox + ybox + width + height + onpix + xbar + ybar + x2bar + y2bar + xybar + x2ybar + xy2bar + xedge + xedgeycor + yedge + yedgexcor',
data = train).fit()
print(logreg.summary())
y_prob = logreg.predict(test)
y_pred = (y_prob > 0.5).astype(int)  #label as B when the predicted probability exceeds 0.5
from sklearn.metrics import confusion_matrix
y_test = test['isB']
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
(tn, fp, fn, tp)
#calculate accuracy of the model using test set
def accuracy(tn, fp, fn, tp):
    return (tn + tp) / (tn + fp + fn + tp)
accuracy(tn, fp, fn, tp)
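As a sanity check, sklearn's built-in accuracy_score should agree with the hand-rolled accuracy() above.
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)  #should match accuracy(tn, fp, fn, tp)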
1 a) iii)
#separate x and y of train and test set
y_train = train['isB']
x_train = train.drop(['isB', 'letter'], axis=1)
y_test = test['isB']
x_test = test.drop(['isB','letter'], axis=1)
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.title('ROC Curve', fontsize=18)
plt.xlabel('FPR', fontsize=16)
plt.ylabel('TPR', fontsize=16)
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
plt.plot(fpr, tpr, lw=3, label='Logistic Regression (area = {:0.2f})'.format(roc_auc))
plt.plot([0,1],[0,1], color='navy', lw=3, linestyle='--', label='Naive Baseline (area = 0.50)')
plt.legend(loc='lower right', fontsize=14)
plt.show()
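roc_auc_score gives a one-line cross-check of the trapezoidal AUC shown in the legend.
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_prob)  #should match auc(fpr, tpr)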
1 a) iv)
#cross-validation to get the optimal ccp alpha
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
grid_values = {'ccp_alpha': np.linspace(0, 0.10, 201),
               'min_samples_leaf': [5],
               'min_samples_split': [20],
               'max_depth': [30],
               'random_state': [88]}
dtc = DecisionTreeClassifier()
dtc_cv_acc = GridSearchCV(dtc, param_grid = grid_values, cv=10, verbose=1,
scoring = 'accuracy')
dtc_cv_acc.fit(x_train, y_train)
#show the first 10 ccp_alpha values with their cross-validated accuracy
acc = dtc_cv_acc.cv_results_['mean_test_score'] # sklearn's mean_test_score is the mean score over the held-out CV folds, i.e. the validation accuracy
ccp = dtc_cv_acc.cv_results_['param_ccp_alpha'].data
pd.DataFrame({'ccp alpha' : ccp, 'Validation Accuracy': acc}).head(10)
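Rather than scanning the table by eye, the fitted GridSearchCV object already records the winning combination and its mean validation accuracy.
#best hyperparameters found by the 10-fold grid search
print(dtc_cv_acc.best_params_)
print(dtc_cv_acc.best_score_)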
#refit the tree on the training set with the chosen ccp_alpha, then evaluate on the test set
dtc_test = DecisionTreeClassifier(min_samples_leaf=5,
                                  ccp_alpha=0.001,
                                  random_state=88)
dtc_test = dtc_test.fit(x_train, y_train)
y_pred_dtc_test = dtc_test.predict(x_test)
cm_clatree = confusion_matrix(y_test, y_pred_dtc_test)
accuracy(cm_clatree.item((0,0)), cm_clatree.item((0,1)), cm_clatree.item((1, 0)), cm_clatree.item((1, 1)))
1 a) v)
#build random forest model with the default setting
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf_model = rf.fit(x_train, y_train)
y_pred_rf_test = rf_model.predict(x_test)
cm_rf = confusion_matrix(y_test, y_pred_rf_test)
accuracy(cm_rf.item((0,0)), cm_rf.item((0,1)), cm_rf.item((1, 0)), cm_rf.item((1, 1)))
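As an optional diagnostic (not asked for by the question), the fitted forest exposes impurity-based feature importances.
#impurity-based importances of the 16 features, largest first
pd.Series(rf_model.feature_importances_, index=x_train.columns).sort_values(ascending=False)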
1 b) i)
#baseline model -- see what letter is the most frequent letter
train.groupby('letter').count()
#accuracy of the multiclass baseline: always predict the most frequent letter
578/(562+529+578+512)
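The same baseline accuracy can be computed without hard-coding the class counts.
#share of the most frequent letter in the training set = baseline accuracy
train['letter'].value_counts(normalize=True).max()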
1 b) ii)
x_train.head()
y_train_letter = train['letter']
y_test_letter = test['letter']
#no need to one-hot encode as all of the features are int64
#build LDA model
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
lda = LinearDiscriminantAnalysis()
lda.fit(x_train, y_train_letter)
y_pred_lda = lda.predict(x_test)
len(y_pred_lda)
#calculate accuracy of the LDA model
#join the prediction and y_test to see the result
lda_result = {'y_pred_lda':y_pred_lda, 'y_test':y_test_letter}
lda_result = pd.DataFrame(data=lda_result)
lda_result['correct'] = lda_result['y_pred_lda'] == lda_result['y_test']
lda_result.groupby('correct').count()
#accuracy of the LDA model
842/(93+842)
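accuracy_score collapses the groupby bookkeeping above into a single call.
accuracy_score(y_test_letter, y_pred_lda)  #should equal 842/935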
1 b) iii)
#cross-validation to get the optimal ccp alpha
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
grid_values = {'ccp_alpha': np.linspace(0, 0.10, 201),
               'min_samples_leaf': [5],
               'min_samples_split': [20],
               'max_depth': [30],
               'random_state': [88]}
dtc_b = DecisionTreeClassifier()
dtc_cv_acc_b = GridSearchCV(dtc_b, param_grid = grid_values, cv=10, verbose=1,
scoring = 'accuracy')
dtc_cv_acc_b.fit(x_train, y_train_letter)
#show the first 10 ccp_alpha values with their cross-validated accuracy
acc_b = dtc_cv_acc_b.cv_results_['mean_test_score'] # sklearn's mean_test_score is the mean score over the held-out CV folds, i.e. the validation accuracy
ccp_b = dtc_cv_acc_b.cv_results_['param_ccp_alpha'].data
pd.DataFrame({'ccp alpha' : ccp_b, 'Validation Accuracy': acc_b}).head(10)
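A quick plot of validation accuracy against ccp_alpha makes the cost-complexity trade-off easier to see than the first ten rows of the table (a presentation sketch using the arrays defined above).
plt.figure(figsize=(8, 6))
plt.xlabel('ccp alpha', fontsize=16)
plt.ylabel('CV Accuracy', fontsize=16)
plt.plot(ccp_b.astype(float), acc_b, linewidth=3)
plt.show()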
#refit the tree on the training set with the chosen ccp_alpha, then evaluate on the test set
dtc_test_b = DecisionTreeClassifier(min_samples_leaf=5,
                                    ccp_alpha=0.0005,
                                    random_state=88)
dtc_test_b = dtc_test_b.fit(x_train, y_train_letter)
y_pred_dtc_test_b = dtc_test_b.predict(x_test)
cm_clatree_b = confusion_matrix(y_test_letter, y_pred_dtc_test_b)
#the binary accuracy() helper only reads the top-left 2x2 block, which is wrong for a 4-class confusion matrix; use the diagonal over the total instead
np.trace(cm_clatree_b) / cm_clatree_b.sum()
1 b) iv)
x_train.info()
#a Random Forest with max_features=16 considers every feature at every split, which makes it a bagging model
rf_b = RandomForestClassifier(max_features=16)
rf_model_b = rf_b.fit(x_train, y_train_letter)
#accuracy of the "vanilla" bagging model with Random Forest
y_pred_rf_test_b = rf_model_b.predict(x_test)
cm_rf_b = confusion_matrix(y_test_letter, y_pred_rf_test_b)
np.trace(cm_rf_b) / cm_rf_b.sum()  #multiclass accuracy: diagonal over total
1 b) v)
#Random Forest with CV
import time
from sklearn.model_selection import KFold
grid_values = {'max_features': np.linspace(1, 16, 16, dtype='int32'),
               'min_samples_leaf': [5],
               'n_estimators': [500],
               'random_state': [88]}
tic = time.time()
rf_b2 = RandomForestClassifier()
# Note: here we set verbose=2 to keep track of the progress (the running time) of the cross validation.
cv = KFold(n_splits=5,random_state=333,shuffle=True)
rf_cv_b2 = GridSearchCV(rf_b2, param_grid=grid_values, scoring='accuracy', cv=cv,verbose=2)
rf_cv_b2.fit(x_train, y_train_letter)
toc = time.time()
print('time:', round(toc-tic, 2),'s')
#select the best hyperparameter
max_features = rf_cv_b2.cv_results_['param_max_features'].data
accuracy_scores = rf_cv_b2.cv_results_['mean_test_score']
plt.figure(figsize=(8, 6))
plt.xlabel('max features', fontsize=16)
plt.ylabel('CV Accuracy', fontsize=16)
plt.scatter(max_features, accuracy_scores, s=30)
plt.plot(max_features, accuracy_scores, linewidth=3)
plt.grid(True, which='both')
plt.xlim([1, 17])
plt.ylim([0.9, 1.0])
print(rf_cv_b2.best_params_)
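Because GridSearchCV refits on the full training set by default (refit=True), the tuned forest is also available as best_estimator_, which avoids retyping the winning values; the hard-coded refit below should give the same model.
#equivalent evaluation via the refit model stored in the search object
y_pred_refit = rf_cv_b2.best_estimator_.predict(x_test)
accuracy_score(y_test_letter, y_pred_refit)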
#evaluate the CV-d Random Forest on the test dataset using the best parameters
rf_optimal = RandomForestClassifier(max_features=2, min_samples_leaf=5,
                                    n_estimators=500, random_state=88)
rf_optimal_model = rf_optimal.fit(x_train, y_train_letter)
y_pred_rf_optimal = rf_optimal_model.predict(x_test)
cm_rf_optimal = confusion_matrix(y_test_letter, y_pred_rf_optimal)
np.trace(cm_rf_optimal) / cm_rf_optimal.sum()  #multiclass accuracy: diagonal over total
1 b) vi)
from sklearn.ensemble import GradientBoostingClassifier
gbr = GradientBoostingClassifier(n_estimators=3300, max_leaf_nodes=10)
gbr.fit(x_train, y_train_letter)
#accuracy of the Gradient Boosting model on the test set
y_pred_gbr = gbr.predict(x_test)
cm_gbr = confusion_matrix(y_test_letter, y_pred_gbr)
np.trace(cm_gbr) / cm_gbr.sum()  #multiclass accuracy: diagonal over total
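With 3,300 trees it is worth checking whether the ensemble has saturated; staged_predict replays the boosting sequence without refitting (a diagnostic sketch, not part of the assignment).
#test accuracy after every 300th boosting stage
for i, y_stage in enumerate(gbr.staged_predict(x_test)):
    if (i + 1) % 300 == 0:
        print(i + 1, round(accuracy_score(y_test_letter, y_stage), 4))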
1 c)
#code from Lab 8b for bootstrapping
import time
def bootstrap_validation(test_data, test_label, train_label, model, metrics_list, sample=500, random_state=66):
    #note: train_label and random_state are accepted but never used by the lab code
    tic = time.time()
    n_sample = sample
    n_metrics = len(metrics_list)
    output_array = np.zeros([n_sample, n_metrics])
    output_array[:] = np.nan
    print(output_array.shape)
    for bs_iter in range(n_sample):
        #resample the test set with replacement and re-score the already-fitted model
        bs_index = np.random.choice(test_data.index, len(test_data.index), replace=True)
        bs_data = test_data.loc[bs_index]
        bs_label = test_label.loc[bs_index]
        bs_predicted = model.predict(bs_data)
        for metrics_iter in range(n_metrics):
            metrics = metrics_list[metrics_iter]
            output_array[bs_iter, metrics_iter] = metrics(bs_predicted, bs_label)
        # if bs_iter % 100 == 0:
        #     print(bs_iter, time.time()-tic)
    output_df = pd.DataFrame(output_array)
    return output_df
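Note that the function accepts random_state but never uses it, so each call resamples from numpy's global state. A minimal reproducible variant (my tweak, not part of the lab code) would draw the indices from a seeded Generator instead.
#sketch: seeded bootstrap indices via numpy's Generator API
rng = np.random.default_rng(66)
bs_index_demo = rng.choice(x_test.index, len(x_test.index), replace=True)
bs_index_demo[:5]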
Bootstrap for LDA
#bootstrap for LDA
bs_output_lda = bootstrap_validation(x_test,y_test_letter,y_train_letter,lda,
metrics_list=[accuracy_score],
sample = 5000)
lda_accuracy = accuracy_score(y_test_letter, y_pred_lda)
fig, axs = plt.subplots(ncols=2, figsize=(12,5))
axs[0].set_xlabel('Bootstrap Accuracy Estimate', fontsize=16)
axs[1].set_xlabel('Boot Accuracy - Test Set Accuracy', fontsize=16)
axs[0].set_ylabel('Count', fontsize=16)
axs[0].hist(bs_output_lda.iloc[:,0], bins=20, edgecolor='green', linewidth=2, color="grey")
axs[0].set_xlim([0.8, 1.0])  #bracket the observed test accuracy (~0.90); [0.4, 0.7] would clip the histogram
axs[1].hist(bs_output_lda.iloc[:,0]-lda_accuracy, bins=20, edgecolor='green', linewidth=2, color="grey")
axs[1].set_xlim([-0.15, 0.15])
Bootstrap for CART
#bootstrap for CART
bs_output_cart = bootstrap_validation(x_test,y_test_letter,y_train_letter,dtc_test_b,
metrics_list=[accuracy_score],
sample = 5000)
cart_accuracy = accuracy_score(y_test_letter, y_pred_dtc_test_b)
fig, axs = plt.subplots(ncols=2, figsize=(12,5))
axs[0].set_xlabel('Bootstrap Accuracy Estimate', fontsize=16)
axs[1].set_xlabel('Boot Accuracy - Test Set Accuracy', fontsize=16)
axs[0].set_ylabel('Count', fontsize=16)
axs[0].hist(bs_output_cart.iloc[:,0], bins=20,edgecolor='green', linewidth=2,color = "grey")
axs[0].set_xlim([0.8, 1.0])  #bracket the observed test accuracy; [0.4, 0.7] would clip the histogram
axs[1].hist(bs_output_cart.iloc[:,0]-cart_accuracy, bins=20,edgecolor='green', linewidth=2,color = "grey")
axs[1].set_xlim([-0.15,0.15])
Bootstrap for Bagging
#bootstrap for Bagging
bs_output_bagging = bootstrap_validation(x_test,y_test_letter,y_train_letter,rf_model_b,
metrics_list=[accuracy_score],
sample = 5000)
bagging_accuracy = accuracy_score(y_test_letter, y_pred_rf_test_b)
fig, axs = plt.subplots(ncols=2, figsize=(12,5))
axs[0].set_xlabel('Bootstrap Accuracy Estimate', fontsize=16)
axs[1].set_xlabel('Boot Accuracy - Test Set Accuracy', fontsize=16)
axs[0].set_ylabel('Count', fontsize=16)
axs[0].hist(bs_output_bagging.iloc[:,0], bins=20,edgecolor='green', linewidth=2,color = "grey")
axs[0].set_xlim([0.8, 1.0])  #bracket the observed test accuracy; [0.4, 0.7] would clip the histogram
axs[1].hist(bs_output_bagging.iloc[:,0]-bagging_accuracy, bins=20,edgecolor='green', linewidth=2,color = "grey")
axs[1].set_xlim([-0.15,0.15])
Bootstrap for Random Forest
#Bootstrap for Random Forest
bs_output_rf = bootstrap_validation(x_test,y_test_letter,y_train_letter,rf_optimal_model,
metrics_list=[accuracy_score],
sample = 5000)
rf_accuracy = accuracy_score(y_test_letter, y_pred_rf_optimal)
fig, axs = plt.subplots(ncols=2, figsize=(12,5))
axs[0].set_xlabel('Bootstrap Accuracy Estimate', fontsize=16)
axs[1].set_xlabel('Boot Accuracy - Test Set Accuracy', fontsize=16)
axs[0].set_ylabel('Count', fontsize=16)
axs[0].hist(bs_output_rf.iloc[:,0], bins=20,edgecolor='green', linewidth=2,color = "grey")
axs[0].set_xlim([0.8, 1.0])  #bracket the observed test accuracy; [0.4, 0.7] would clip the histogram
axs[1].hist(bs_output_rf.iloc[:,0]-rf_accuracy, bins=20,edgecolor='green', linewidth=2,color = "grey")
axs[1].set_xlim([-0.15,0.15])
Bootstrap for Boosting
#Bootstrap for boosting
bs_output_boost = bootstrap_validation(x_test,y_test_letter,y_train_letter,gbr,
metrics_list=[accuracy_score],
sample = 5000)
boosting_accuracy = accuracy_score(y_test_letter, y_pred_gbr)
fig, axs = plt.subplots(ncols=2, figsize=(12,5))
axs[0].set_xlabel('Bootstrap Accuracy Estimate', fontsize=16)
axs[1].set_xlabel('Boot Accuracy - Test Set Accuracy', fontsize=16)
axs[0].set_ylabel('Count', fontsize=16)
axs[0].hist(bs_output_boost.iloc[:,0], bins=20,edgecolor='green', linewidth=2,color = "grey")
axs[0].set_xlim([0.8, 1.0])  #bracket the observed test accuracy; [0.4, 0.7] would clip the histogram
axs[1].hist(bs_output_boost.iloc[:,0]-boosting_accuracy, bins=20,edgecolor='green', linewidth=2,color = "grey")
axs[1].set_xlim([-0.15,0.15])
# The 95% confidence interval of LDA
CI_0_lda = np.quantile(bs_output_lda.iloc[:,0]-lda_accuracy,np.array([0.025,0.975]))
CI_lda = [lda_accuracy - CI_0_lda[1],lda_accuracy - CI_0_lda[0]]
print("The 95-percent confidence interval of LDA accuracy is %s" % CI_lda) #0.5,0.64
# The 95% confidence interval of CART
CI_0_cart = np.quantile(bs_output_cart.iloc[:,0]-cart_accuracy,np.array([0.025,0.975]))
CI_cart = [cart_accuracy - CI_0_cart[1],cart_accuracy - CI_0_cart[0]]
print("The 95-percent confidence interval of CART accuracy is %s" % CI_cart) #0.5,0.64
# The 95% confidence interval of Bagging
CI_0_bagging = np.quantile(bs_output_bagging.iloc[:,0]-bagging_accuracy,np.array([0.025,0.975]))
CI_bagging = [bagging_accuracy - CI_0_bagging[1],bagging_accuracy - CI_0_bagging[0]]
print("The 95-percent confidence interval of Bagging accuracy is %s" % CI_bagging) #0.5,0.64
# The 95% confidence interval of RF
CI_0_rf = np.quantile(bs_output_rf.iloc[:,0]-rf_accuracy,np.array([0.025,0.975]))
CI_rf = [rf_accuracy - CI_0_rf[1],rf_accuracy - CI_0_rf[0]]
print("The 95-percent confidence interval of RF accuracy is %s" % CI_rf) #0.5,0.64
# The 95% confidence interval of Boosting
CI_0_boosting = np.quantile(bs_output_boost.iloc[:,0]-boosting_accuracy,np.array([0.025,0.975]))
CI_boosting = [boosting_accuracy - CI_0_boosting[1],boosting_accuracy - CI_0_boosting[0]]
print("The 95-percent confidence interval of Boosting accuracy is %s" % CI_boosting) #0.5,0.64