import pandas as pd
import numpy as np
train = pd.read_csv('Letters_train.csv')
train.head()
test = pd.read_csv('Letters_test.csv')
1 a) i)
#make a function that tells if the letter is B or not
def is_b(text):
    if text == 'B':
        return 1
    else:
        return 0
#apply is_b function to the training and test dataset
train['isB'] = train['letter'].apply(is_b)
test['isB'] = test['letter'].apply(is_b)
test.head()
#see how many 0's (not B) and 1's (B) there are in the training set
train.groupby('isB').count()
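A quicker equivalent check (a one-line sketch, not required by the question): value_counts with normalize=True gives the class shares directly, and the larger share is the accuracy of a baseline that always predicts "not B".
#class balance as fractions; the majority share is the baseline accuracy
train['isB'].value_counts(normalize=True)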
1 a) ii)
!pip install statsmodels==0.13.0
import statsmodels.formula.api as smf
#build a logistic regression model
logreg = smf.logit(formula = 'isB ~ xbox + ybox + width + height + onpix + xbar + ybar + x2bar + y2bar + xybar + x2ybar + xy2bar + xedge + xedgeycor + yedge + yedgexcor',
data = train).fit()
print(logreg.summary())
y_prob = logreg.predict(test)
y_pred = (y_prob > 0.5).astype(int)  #label as B when the predicted probability exceeds 0.5
from sklearn.metrics import confusion_matrix
y_test = test['isB']
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
(tn, fp, fn, tp)
#calculate accuracy of the model using test set
def accuracy(tn, fp, fn, tp):
    return (tn + tp) / (tn + fp + fn + tp)
accuracy(tn, fp, fn, tp)
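As a sanity check, sklearn's built-in accuracy_score should agree with the hand-rolled accuracy() above.
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)  #should match accuracy(tn, fp, fn, tp)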
1 a) iii)
#separate x and y of train and test set
y_train = train['isB']
x_train = train.drop(['isB', 'letter'], axis=1)
y_test = test['isB']
x_test = test.drop(['isB','letter'], axis=1)
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.title('ROC Curve', fontsize=18)
plt.xlabel('FPR', fontsize=16)
plt.ylabel('TPR', fontsize=16)
plt.xlim([-0.01, 1.00])
plt.ylim([-0.01, 1.01])
plt.plot(fpr, tpr, lw=3, label='Logistic Regression (area = {:0.2f})'.format(roc_auc))
plt.plot([0,1],[0,1], color='navy', lw=3, linestyle='--', label='Naive Baseline (area = 0.50)')
plt.legend(loc='lower right', fontsize=14)
plt.show()
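roc_auc_score gives a one-line cross-check of the trapezoidal AUC shown in the legend.
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_prob)  #should match auc(fpr, tpr)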
1 a) iv)
#cross-validation to get the optimal ccp alpha
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
grid_values = {'ccp_alpha': np.linspace(0, 0.10, 201),
               'min_samples_leaf': [5],
               'min_samples_split': [20],
               'max_depth': [30],
               'random_state': [88]}
dtc = DecisionTreeClassifier()
dtc_cv_acc = GridSearchCV(dtc, param_grid = grid_values, cv=10, verbose=1,
scoring = 'accuracy')
dtc_cv_acc.fit(x_train, y_train)
#show the first 10 ccp_alpha values with their cross-validated accuracy
acc = dtc_cv_acc.cv_results_['mean_test_score'] # sklearn's mean_test_score is the mean score over the held-out CV folds, i.e. the validation accuracy
ccp = dtc_cv_acc.cv_results_['param_ccp_alpha'].data
pd.DataFrame({'ccp alpha' : ccp, 'Validation Accuracy': acc}).head(10)
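Rather than scanning the table by eye, the fitted GridSearchCV object already records the winning combination and its mean validation accuracy.
#best hyperparameters found by the 10-fold grid search
print(dtc_cv_acc.best_params_)
print(dtc_cv_acc.best_score_)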
#refit the tree on the training set with the chosen ccp_alpha, then evaluate on the test set
dtc_test = DecisionTreeClassifier(min_samples_leaf=5,
                                  ccp_alpha=0.001,
                                  random_state=88)
dtc_test = dtc_test.fit(x_train, y_train)
y_pred_dtc_test = dtc_test.predict(x_test)
cm_clatree = confusion_matrix(y_test, y_pred_dtc_test)
accuracy(cm_clatree.item((0,0)), cm_clatree.item((0,1)), cm_clatree.item((1, 0)), cm_clatree.item((1, 1)))
1 a) v)
#build random forest model with the default setting
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf_model = rf.fit(x_train, y_train)
y_pred_rf_test = rf_model.predict(x_test)
cm_rf = confusion_matrix(y_test, y_pred_rf_test)
accuracy(cm_rf.item((0,0)), cm_rf.item((0,1)), cm_rf.item((1, 0)), cm_rf.item((1, 1)))
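As an optional diagnostic (not asked for by the question), the fitted forest exposes impurity-based feature importances.
#impurity-based importances of the 16 features, largest first
pd.Series(rf_model.feature_importances_, index=x_train.columns).sort_values(ascending=False)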
1 b) i)
#baseline model -- see what letter is the most frequent letter
train.groupby('letter').count()
#accuracy of the multiclass baseline: always predict the most frequent letter
578/(562+529+578+512)
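The same baseline accuracy can be computed without hard-coding the class counts.
#share of the most frequent letter in the training set = baseline accuracy
train['letter'].value_counts(normalize=True).max()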
1 b) ii)
x_train.head()
y_train_letter = train['letter']
y_test_letter = test['letter']
#no need to one-hot encode as all of the features are int64
#build LDA model
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score
lda = LinearDiscriminantAnalysis()
lda.fit(x_train, y_train_letter)
y_pred_lda = lda.predict(x_test)
len(y_pred_lda)
#calculate accuracy of the LDA model
#join the prediction and y_test to see the result
lda_result = {'y_pred_lda':y_pred_lda, 'y_test':y_test_letter}
lda_result = pd.DataFrame(data=lda_result)
lda_result['correct'] = lda_result['y_pred_lda'] == lda_result['y_test']
lda_result.groupby('correct').count()
#accuracy of the LDA model
842/(93+842)
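accuracy_score collapses the groupby bookkeeping above into a single call.
accuracy_score(y_test_letter, y_pred_lda)  #should equal 842/935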
1 b) iii)
#cross-validation to get the optimal ccp alpha
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
grid_values = {'ccp_alpha': np.linspace(0, 0.10, 201),
               'min_samples_leaf': [5],
               'min_samples_split': [20],
               'max_depth': [30],
               'random_state': [88]}
dtc_b = DecisionTreeClassifier()
dtc_cv_acc_b = GridSearchCV(dtc_b, param_grid = grid_values, cv=10, verbose=1,
scoring = 'accuracy')
dtc_cv_acc_b.fit(x_train, y_train_letter)
#show the first 10 ccp_alpha values with their cross-validated accuracy
acc_b = dtc_cv_acc_b.cv_results_['mean_test_score'] # sklearn's mean_test_score is the mean score over the held-out CV folds, i.e. the validation accuracy
ccp_b = dtc_cv_acc_b.cv_results_['param_ccp_alpha'].data
pd.DataFrame({'ccp alpha' : ccp_b, 'Validation Accuracy': acc_b}).head(10)
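A quick plot of validation accuracy against ccp_alpha makes the cost-complexity trade-off easier to see than the first ten rows of the table (a presentation sketch using the arrays defined above).
plt.figure(figsize=(8, 6))
plt.xlabel('ccp alpha', fontsize=16)
plt.ylabel('CV Accuracy', fontsize=16)
plt.plot(ccp_b.astype(float), acc_b, linewidth=3)
plt.show()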
#refit the tree on the training set with the chosen ccp_alpha, then evaluate on the test set
dtc_test_b = DecisionTreeClassifier(min_samples_leaf=5,
                                    ccp_alpha=0.0005,
                                    random_state=88)
dtc_test_b = dtc_test_b.fit(x_train, y_train_letter)
y_pred_dtc_test_b = dtc_test_b.predict(x_test)
cm_clatree_b = confusion_matrix(y_test_letter, y_pred_dtc_test_b)
#the binary accuracy() helper only reads the top-left 2x2 block, which is wrong for a 4-class confusion matrix; use the diagonal over the total instead
np.trace(cm_clatree_b) / cm_clatree_b.sum()
1 b) iv)
x_train.info()
#a Random Forest with max_features=16 considers every feature at every split, which makes it a bagging model
rf_b = RandomForestClassifier(max_features=16)
rf_model_b = rf_b.fit(x_train, y_train_letter)
#accuracy of the "vanilla" bagging model with Random Forest
y_pred_rf_test_b = rf_model_b.predict(x_test)
cm_rf_b = confusion_matrix(y_test_letter, y_pred_rf_test_b)
np.trace(cm_rf_b) / cm_rf_b.sum()  #multiclass accuracy: diagonal over total
1 b) v)
#Random Forest with CV
import time
from sklearn.model_selection import KFold
grid_values = {'max_features': np.linspace(1, 16, 16, dtype='int32'),
               'min_samples_leaf': [5],
               'n_estimators': [500],
               'random_state': [88]}
tic = time.time()
rf_b2 = RandomForestClassifier()
# Note: here we set verbose=2 to keep track of the progress (the running time) of the cross validation.
cv = KFold(n_splits=5,random_state=333,shuffle=True)
rf_cv_b2 = GridSearchCV(rf_b2, param_grid=grid_values, scoring='accuracy', cv=cv,verbose=2)
rf_cv_b2.fit(x_train, y_train_letter)
toc = time.time()
print('time:', round(toc-tic, 2),'s')
#select the best hyperparameter
max_features = rf_cv_b2.cv_results_['param_max_features'].data
accuracy_scores = rf_cv_b2.cv_results_['mean_test_score']
plt.figure(figsize=(8, 6))
plt.xlabel('max features', fontsize=16)
plt.ylabel('CV Accuracy', fontsize=16)
plt.scatter(max_features, accuracy_scores, s=30)
plt.plot(max_features, accuracy_scores, linewidth=3)
plt.grid(True, which='both')
plt.xlim([1, 17])
plt.ylim([0.9, 1.0])
print(rf_cv_b2.best_params_)
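Because GridSearchCV refits on the full training set by default (refit=True), the tuned forest is also available as best_estimator_, which avoids retyping the winning values; the hard-coded refit below should give the same model.
#equivalent evaluation via the refit model stored in the search object
y_pred_refit = rf_cv_b2.best_estimator_.predict(x_test)
accuracy_score(y_test_letter, y_pred_refit)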
#evaluate the CV-d Random Forest on the test dataset using the best parameters
rf_optimal = RandomForestClassifier(max_features=2, min_samples_leaf=5,
                                    n_estimators=500, random_state=88)
rf_optimal_model = rf_optimal.fit(x_train, y_train_letter)
y_pred_rf_optimal = rf_optimal_model.predict(x_test)
cm_rf_optimal = confusion_matrix(y_test_letter, y_pred_rf_optimal)
np.trace(cm_rf_optimal) / cm_rf_optimal.sum()  #multiclass accuracy: diagonal over total
1 b) vi)
from sklearn.ensemble import GradientBoostingClassifier
gbr = GradientBoostingClassifier(n_estimators=3300, max_leaf_nodes=10)
gbr.fit(x_train, y_train_letter)
#accuracy of the Gradient Boosting model on the test set
y_pred_gbr = gbr.predict(x_test)
cm_gbr = confusion_matrix(y_test_letter, y_pred_gbr)
np.trace(cm_gbr) / cm_gbr.sum()  #multiclass accuracy: diagonal over total
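With 3,300 trees it is worth checking whether the ensemble has saturated; staged_predict replays the boosting sequence without refitting (a diagnostic sketch, not part of the assignment).
#test accuracy after every 300th boosting stage
for i, y_stage in enumerate(gbr.staged_predict(x_test)):
    if (i + 1) % 300 == 0:
        print(i + 1, round(accuracy_score(y_test_letter, y_stage), 4))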
1 c)
#code from Lab 8b for bootstrapping
import time
def bootstrap_validation(test_data, test_label, train_label, model, metrics_list, sample=500, random_state=66):
    #note: train_label and random_state are accepted but never used by the lab code
    tic = time.time()
    n_sample = sample
    n_metrics = len(metrics_list)
    output_array = np.zeros([n_sample, n_metrics])
    output_array[:] = np.nan
    print(output_array.shape)
    for bs_iter in range(n_sample):
        #resample the test set with replacement and re-score the already-fitted model
        bs_index = np.random.choice(test_data.index, len(test_data.index), replace=True)
        bs_data = test_data.loc[bs_index]
        bs_label = test_label.loc[bs_index]
        bs_predicted = model.predict(bs_data)
        for metrics_iter in range(n_metrics):
            metrics = metrics_list[metrics_iter]
            output_array[bs_iter, metrics_iter] = metrics(bs_predicted, bs_label)
        # if bs_iter % 100 == 0:
        #     print(bs_iter, time.time()-tic)
    output_df = pd.DataFrame(output_array)
    return output_df
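Note that the function accepts random_state but never uses it, so each call resamples from numpy's global state. A minimal reproducible variant (my tweak, not part of the lab code) would draw the indices from a seeded Generator instead.
#sketch: seeded bootstrap indices via numpy's Generator API
rng = np.random.default_rng(66)
bs_index_demo = rng.choice(x_test.index, len(x_test.index), replace=True)
bs_index_demo[:5]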
Bootstrap for LDA
#bootstrap for LDA
bs_output_lda = bootstrap_validation(x_test,y_test_letter,y_train_letter,lda,
metrics_list=[accuracy_score],
sample = 5000)
lda_accuracy = accuracy_score(y_test_letter, y_pred_lda)
fig, axs = plt.subplots(ncols=2, figsize=(12,5))
axs[0].set_xlabel('Bootstrap Accuracy Estimate', fontsize=16)
axs[1].set_xlabel('Boot Accuracy - Test Set Accuracy', fontsize=16)
axs[0].set_ylabel('Count', fontsize=16)
axs[0].hist(bs_output_lda.iloc[:,0], bins=20, edgecolor='green', linewidth=2, color="grey")
axs[0].set_xlim([0.8, 1.0])  #bracket the observed test accuracy (~0.90); [0.4, 0.7] would clip the histogram
axs[1].hist(bs_output_lda.iloc[:,0]-lda_accuracy, bins=20, edgecolor='green', linewidth=2, color="grey")
axs[1].set_xlim([-0.15, 0.15])
Bootstrap for CART
#bootstrap for CART
bs_output_cart = bootstrap_validation(x_test,y_test_letter,y_train_letter,dtc_test_b,
metrics_list=[accuracy_score],
sample = 5000)
cart_accuracy = accuracy_score(y_test_letter, y_pred_dtc_test_b)
fig, axs = plt.subplots(ncols=2, figsize=(12,5))
axs[0].set_xlabel('Bootstrap Accuracy Estimate', fontsize=16)
axs[1].set_xlabel('Boot Accuracy - Test Set Accuracy', fontsize=16)
axs[0].set_ylabel('Count', fontsize=16)
axs[0].hist(bs_output_cart.iloc[:,0], bins=20,edgecolor='green', linewidth=2,color = "grey")
axs[0].set_xlim([0.8, 1.0])  #bracket the observed test accuracy; [0.4, 0.7] would clip the histogram
axs[1].hist(bs_output_cart.iloc[:,0]-cart_accuracy, bins=20,edgecolor='green', linewidth=2,color = "grey")
axs[1].set_xlim([-0.15,0.15])
Bootstrap for Bagging
#bootstrap for Bagging
bs_output_bagging = bootstrap_validation(x_test,y_test_letter,y_train_letter,rf_model_b,
metrics_list=[accuracy_score],
sample = 5000)
bagging_accuracy = accuracy_score(y_test_letter, y_pred_rf_test_b)
fig, axs = plt.subplots(ncols=2, figsize=(12,5))
axs[0].set_xlabel('Bootstrap Accuracy Estimate', fontsize=16)
axs[1].set_xlabel('Boot Accuracy - Test Set Accuracy', fontsize=16)
axs[0].set_ylabel('Count', fontsize=16)
axs[0].hist(bs_output_bagging.iloc[:,0], bins=20,edgecolor='green', linewidth=2,color = "grey")
axs[0].set_xlim([0.8, 1.0])  #bracket the observed test accuracy; [0.4, 0.7] would clip the histogram
axs[1].hist(bs_output_bagging.iloc[:,0]-bagging_accuracy, bins=20,edgecolor='green', linewidth=2,color = "grey")
axs[1].set_xlim([-0.15,0.15])
Bootstrap for Random Forest
#Bootstrap for Random Forest
bs_output_rf = bootstrap_validation(x_test,y_test_letter,y_train_letter,rf_optimal_model,
metrics_list=[accuracy_score],
sample = 5000)
rf_accuracy = accuracy_score(y_test_letter, y_pred_rf_optimal)
fig, axs = plt.subplots(ncols=2, figsize=(12,5))
axs[0].set_xlabel('Bootstrap Accuracy Estimate', fontsize=16)
axs[1].set_xlabel('Boot Accuracy - Test Set Accuracy', fontsize=16)
axs[0].set_ylabel('Count', fontsize=16)
axs[0].hist(bs_output_rf.iloc[:,0], bins=20,edgecolor='green', linewidth=2,color = "grey")
axs[0].set_xlim([0.8, 1.0])  #bracket the observed test accuracy; [0.4, 0.7] would clip the histogram
axs[1].hist(bs_output_rf.iloc[:,0]-rf_accuracy, bins=20,edgecolor='green', linewidth=2,color = "grey")
axs[1].set_xlim([-0.15,0.15])
Bootstrap for Boosting
#Bootstrap for boosting
bs_output_boost = bootstrap_validation(x_test,y_test_letter,y_train_letter,gbr,
metrics_list=[accuracy_score],
sample = 5000)
boosting_accuracy = accuracy_score(y_test_letter, y_pred_gbr)
fig, axs = plt.subplots(ncols=2, figsize=(12,5))
axs[0].set_xlabel('Bootstrap Accuracy Estimate', fontsize=16)
axs[1].set_xlabel('Boot Accuracy - Test Set Accuracy', fontsize=16)
axs[0].set_ylabel('Count', fontsize=16)
axs[0].hist(bs_output_boost.iloc[:,0], bins=20,edgecolor='green', linewidth=2,color = "grey")
axs[0].set_xlim([0.8, 1.0])  #bracket the observed test accuracy; [0.4, 0.7] would clip the histogram
axs[1].hist(bs_output_boost.iloc[:,0]-boosting_accuracy, bins=20,edgecolor='green', linewidth=2,color = "grey")
axs[1].set_xlim([-0.15,0.15])
# The 95% confidence interval of LDA
CI_0_lda = np.quantile(bs_output_lda.iloc[:,0]-lda_accuracy,np.array([0.025,0.975]))
CI_lda = [lda_accuracy - CI_0_lda[1],lda_accuracy - CI_0_lda[0]]
print("The 95-percent confidence interval of LDA accuracy is %s" % CI_lda) #0.5,0.64
# The 95% confidence interval of CART
CI_0_cart = np.quantile(bs_output_cart.iloc[:,0]-cart_accuracy,np.array([0.025,0.975]))
CI_cart = [cart_accuracy - CI_0_cart[1],cart_accuracy - CI_0_cart[0]]
print("The 95-percent confidence interval of CART accuracy is %s" % CI_cart) #0.5,0.64
# The 95% confidence interval of Bagging
CI_0_bagging = np.quantile(bs_output_bagging.iloc[:,0]-bagging_accuracy,np.array([0.025,0.975]))
CI_bagging = [bagging_accuracy - CI_0_bagging[1],bagging_accuracy - CI_0_bagging[0]]
print("The 95-percent confidence interval of Bagging accuracy is %s" % CI_bagging) #0.5,0.64
# The 95% confidence interval of RF
CI_0_rf = np.quantile(bs_output_rf.iloc[:,0]-rf_accuracy,np.array([0.025,0.975]))
CI_rf = [rf_accuracy - CI_0_rf[1],rf_accuracy - CI_0_rf[0]]
print("The 95-percent confidence interval of RF accuracy is %s" % CI_rf) #0.5,0.64
# The 95% confidence interval of Boosting
CI_0_boosting = np.quantile(bs_output_boost.iloc[:,0]-boosting_accuracy,np.array([0.025,0.975]))
CI_boosting = [boosting_accuracy - CI_0_boosting[1],boosting_accuracy - CI_0_boosting[0]]
print("The 95-percent confidence interval of Boosting accuracy is %s" % CI_boosting) #0.5,0.64