import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import time
data = pd.read_csv("/work/spambase.data", delimiter=",", header=None)  # spambase.data ships without a header row
data = data.to_numpy()
data_X = data[:, :-1]  # 57 feature columns
data_y = data[:, -1]   # last column: label (1 = spam, 0 = non-spam)
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=1 - train_ratio)
# split the held-out 25% into validation (15%) and test (10%): 0.10 / 0.25 = 0.4 of the remainder
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio))
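A quick sanity check, not in the original notebook, that the two-step split really yields roughly 75/15/10 proportions (the fraction 0.10/0.25 = 0.4 passed to the second split takes 40% of the held-out 25% as the test set):
# illustrative check of the split sizes (assumes the variables defined above are in scope)
total = data_X.shape[0]
for name, part in [("train", X_train), ("val", X_val), ("test", X_test)]:
    print(name, part.shape[0], round(part.shape[0] / total, 3))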
# Baseline: a single, unpruned decision tree
clf = tree.DecisionTreeClassifier()
t1 = time.time()
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))  # test accuracy of the single tree
t2 = time.time()
print(t2 - t1)
0.8869565217391304
0.03530597686767578
def get_error_rate(pred, Y):
    return sum(pred != Y) / float(len(Y))
def adaboost(X, y, X_val, y_val, clf, M):
    """AdaBoost with M rounds of the weak learner clf, evaluated on (X_val, y_val).
    Labels are expected in {-1, +1}."""
    N = len(y)
    N_val = len(y_val)
    alpha_arr = np.zeros(M)
    y_predict_arr = np.zeros((M, N_val))
    error_arr = np.zeros(M)
    # Initialize weights uniformly
    weight = np.ones(N) / N
    # For m = 1 to M
    for m in range(M):
        # Fit the weak classifier on the weighted training set
        # (the same estimator object is refit each round; fit() resets it)
        classifier = clf
        classifier.fit(X, y, sample_weight=weight)
        y_predict = classifier.predict(X_val)
        predict = classifier.predict(X)
        # Indicator of training points misclassified in this round
        #indicator = (y_predict != y_val)
        indicator = (predict != y)
        # Weighted estimator error
        err_m = np.average(indicator, weights=weight, axis=0)
        err_m = max(err_m, 1e-10)  # guard against division by zero for a perfect weak learner
        # Alpha (weight of this round's classifier in the final vote)
        alpha = np.log((1. - err_m) / err_m)
        # Re-weight: misclassified points are up-weighted
        weight *= np.exp(alpha * indicator)
        weight /= weight.sum()  # normalise; np.average re-normalises anyway, this just avoids overflow
        # Save values
        error_arr[m] = err_m
        y_predict_arr[m] = y_predict.copy()
        alpha_arr[m] = alpha
    # Final prediction: sign of the alpha-weighted vote over all M rounds
    preds = np.zeros(N_val)
    for i in range(N_val):
        preds[i] = np.sum(y_predict_arr[:, i] * alpha_arr)
    preds = np.sign(preds)
    print('Accuracy = ', (preds == y_val).sum() / N_val)
    return (preds == y_val).sum() / N_val
clf = tree.DecisionTreeClassifier(max_depth=2, max_leaf_nodes=None)
# the booster votes with sign(), so map the 0/1 labels to -1/+1
y_train[y_train == 0] = -1
y_val[y_val == 0] = -1
y_test[y_test == 0] = -1
t1 = time.time()
adaboost(X_train, y_train, X_test, y_test, clf, M=100)
t2 = time.time()
print((t2-t1))
Accuracy = 0.9442028985507246
0.5021812915802002
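As a rough cross-check of the from-scratch booster, scikit-learn's built-in AdaBoostClassifier can be run with the same depth-2 base learner and 100 rounds. This is a minimal sketch, not part of the original run, and assumes the variables defined above are in scope.
# hedged cross-check with scikit-learn's AdaBoost (illustrative only);
# the base-learner argument is `estimator` in scikit-learn >= 1.2 (`base_estimator` in older releases)
from sklearn.ensemble import AdaBoostClassifier

sk_ada = AdaBoostClassifier(
    estimator=tree.DecisionTreeClassifier(max_depth=2),
    n_estimators=100,
)
sk_ada.fit(X_train, y_train)
print('sklearn AdaBoost accuracy =', sk_ada.score(X_test, y_test))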
def bagging(X, y, X_val, y_val, clf, M, N):
    """Fit M copies of clf, each on a random subsample of N training points,
    and predict on X_val by majority vote.
    Note: classic bagging draws the N points with replacement (a bootstrap);
    replace=False below means this is strictly subsampling ("pasting")."""
    prediction_arr = np.zeros((M, len(y_val)))
    for m in range(M):
        # draw a random subsample of N training points (without replacement)
        idx = np.random.choice(X.shape[0], N, replace=False)
        X_sample = X[idx, :]
        y_sample = y[idx]
        classifier = clf
        classifier.fit(X_sample, y_sample)
        prediction_arr[m] = classifier.predict(X_val)
    # majority vote over the M predictions for each validation point
    often = np.zeros(len(y_val))
    for i in range(len(y_val)):
        often[i] = mode(prediction_arr[:, i]).mode.item()  # .mode.item() works across SciPy versions
    print('Accuracy = ', (often == y_val).sum() / len(y_val))
    return (often == y_val).sum() / len(y_val)
clf = tree.DecisionTreeClassifier(max_depth=2, max_leaf_nodes=None)
y_train[y_train == 0] = -1
y_val[y_val == 0] = -1
y_test[y_test == 0] = -1
# average over 100 repetitions of bagging with M=200 rounds of N=100 samples each
tmp_bag = 0.
t1 = time.time()
#bagging(X_train, y_train, X_test, y_test, clf, 200, 100)
for i in range(100):
    tmp_bag += bagging(X_train, y_train, X_test, y_test, clf, 200, 100)
t2 = time.time()
print((t2 - t1) / 100)  # average time per repetition (seconds)
print(tmp_bag / 100)    # average accuracy over the 100 repetitions
Accuracy = 0.9094202898550725
[... per-repetition accuracies for all 100 bagging runs, between roughly 0.890 and 0.914 ...]
0.300751211643219
0.9061739130434794
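Similarly, scikit-learn's BaggingClassifier offers a rough cross-check of the hand-rolled version. The sketch below is not part of the original run; it assumes the same variables and uses n_estimators=200, max_samples=100 and bootstrap=False to mirror the subsampling without replacement used above.
# hedged cross-check with scikit-learn's bagging (illustrative only);
# the base-learner argument is `estimator` in scikit-learn >= 1.2 (`base_estimator` in older releases)
from sklearn.ensemble import BaggingClassifier

sk_bag = BaggingClassifier(
    estimator=tree.DecisionTreeClassifier(max_depth=2),
    n_estimators=200,
    max_samples=100,
    bootstrap=False,
)
sk_bag.fit(X_train, y_train)
print('sklearn bagging accuracy =', sk_bag.score(X_test, y_test))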
''' depth = np.array([1, 2, 3, 4, 5, 10])
for k in range(len(depth)):
    clf = tree.DecisionTreeClassifier(max_depth=depth[k], max_leaf_nodes=None)
    y_train[y_train == 0] = -1
    y_val[y_val == 0] = -1
    y_test[y_test == 0] = -1
    n_samples = np.array([1, 5, 10, 20, 50, 100, 200, 500])
    save = np.zeros((len(n_samples), len(n_samples)))
    for i in range(len(n_samples)):
        for j in range(len(n_samples)):
            # rows: number of bagging rounds M, columns: subsample size N
            save[i, j] = bagging(X_train, y_train, X_val, y_val, clf, n_samples[i], n_samples[j])
        plt.plot(n_samples, save[i], label="Iterations: " + str(n_samples[i]))
    title_name = "Accuracy with depth = " + str(depth[k])
    plt.title(title_name)
    plt.xlabel("Size of sampling")
    plt.ylabel("Accuracy")
    plt.legend(loc="best")
    name = "bagging_hyper_" + str(depth[k])
    plt.savefig(name + ".png")  # save before show(), otherwise the saved figure is blank
    plt.show() '''
''' depth = np.array([1, 2, 3, 4, 5, 10])
for k in range(len(depth)):
    clf = tree.DecisionTreeClassifier(max_depth=depth[k], max_leaf_nodes=None)
    y_train[y_train == 0] = -1
    y_val[y_val == 0] = -1
    y_test[y_test == 0] = -1
    n_samples = np.array([1, 5, 10, 20, 50, 100, 200, 500])
    save = np.zeros(len(n_samples))
    for i in range(len(n_samples)):
        save[i] = adaboost(X_train, y_train, X_val, y_val, clf, M=n_samples[i])
    plt.plot(n_samples, save, label="depth = " + str(depth[k]))
plt.xlabel("Number of iterations")
plt.ylabel("Accuracy")
plt.legend(loc="best")
plt.title("Accuracy of Adaboost")
plt.show()
#name = "adaboost" + str(depth[k])
#plt.savefig(name + ".png") '''