Starter Project  

by Nigoleha, Mar 20, 2021
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.mlab as mlab
import seaborn as sns
import csv
import random
import time
from scipy.stats import multivariate_normal, mode
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import accuracy_score

sns.set_style('whitegrid')
%matplotlib inline
# spambase.data ships without a header row, so header=None is needed
# to avoid silently treating the first sample as column names
data = pd.read_csv("/work/spambase.data", delimiter=",", header=None)
data = data.to_numpy()
data_X = data[:, 0:len(data[0]) - 1]  # 57 word/character-frequency features
data_y = data[:, -1]                  # label: 1 = spam, 0 = not spam
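A quick sanity check on the load (the UCI spambase set has 4,601 messages, 57 features, and a binary spam label; roughly 39% of rows are spam):

print(data_X.shape, data_y.shape)
print("spam fraction:", data_y.mean())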
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# Split off the training set first, then divide the remainder
# between validation and test
X_train, X_test, y_train, y_test = train_test_split(data_X, data_y, test_size=1 - train_ratio)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=test_ratio / (test_ratio + validation_ratio))
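Worth verifying that the two-stage split actually yields the intended 75/15/10 proportions:

for name, arr in [("train", X_train), ("val", X_val), ("test", X_test)]:
    print(name, arr.shape[0], round(arr.shape[0] / len(data_X), 2))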
# Baseline: a single unpruned decision tree
clf = tree.DecisionTreeClassifier()
t1 = time.time()
clf.fit(X_train, y_train)
prediction = clf.predict(X_test)
print(accuracy_score(y_test, prediction))
t2 = time.time()
print(t2 - t1)
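Accuracy alone hides the asymmetry that matters for spam filtering, where flagging legitimate mail is usually the costlier mistake. scikit-learn's classification_report breaks the baseline down per class (the "ham"/"spam" names here are just display labels for 0 and 1):

from sklearn.metrics import classification_report

print(classification_report(y_test, prediction, target_names=["ham", "spam"]))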
def get_error_rate(pred, Y):
    # Fraction of predictions that disagree with the labels
    return sum(pred != Y) / float(len(Y))
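This helper is defined but never called in the rest of the notebook; if used, it would look like:

# Misclassification rate of the baseline tree on the validation split
print(get_error_rate(clf.predict(X_val), y_val))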
def adaboost(X, y, X_val, y_val, clf, M):
    N = len(y)
    N_val = len(y_val)
    alpha_arr = np.zeros(M)
    y_predict_arr = np.zeros((M, N_val))
    error_arr = np.zeros(M)

    # Initialize weights uniformly
    weight = np.ones(N) / N

    for m in range(M):
        # Fit a weak learner on the weighted training set.
        # (This re-fits the same tree object each round; sklearn's fit()
        # resets the model, so this works, though sklearn.base.clone
        # would be cleaner.)
        classifier = clf
        classifier.fit(X, y, sample_weight=weight)
        y_predict = classifier.predict(X_val)
        predict = classifier.predict(X)

        # Indicator of training-set misclassifications
        indicator = (predict != y)

        # Weighted estimator error
        err_m = np.average(indicator, weights=weight, axis=0)

        # Classifier weight (alpha)
        alpha = np.log((1. - err_m) / err_m)

        # Re-weight: boost the weight of misclassified samples
        weight *= np.exp(alpha * indicator)

        # Save values
        error_arr[m] = err_m
        y_predict_arr[m] = y_predict.copy()
        alpha_arr[m] = alpha

    # Final prediction: sign of the alpha-weighted vote
    preds = np.zeros(N_val)
    for i in range(N_val):
        preds[i] = np.sum(y_predict_arr[:, i] * alpha_arr)
    preds = np.sign(preds)

    print('Accuracy = ', (preds == y_val).sum() / N_val)
    return (preds == y_val).sum() / N_val
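For reference, the function implements the discrete AdaBoost update: with $\epsilon_m$ the weighted training error of round $m$'s weak learner $h_m$,

$$\alpha_m = \ln\frac{1-\epsilon_m}{\epsilon_m}, \qquad w_i \leftarrow w_i \, e^{\alpha_m \mathbf{1}[h_m(x_i) \neq y_i]}, \qquad H(x) = \operatorname{sign}\Big(\sum_{m=1}^{M} \alpha_m h_m(x)\Big).$$

The sample weights are never renormalized here, which is harmless: np.average divides by the weight sum, so $\epsilon_m$ comes out the same either way.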
# Weak learner: a depth-2 tree
clf = tree.DecisionTreeClassifier(max_depth=2, max_leaf_nodes=None)

# AdaBoost's sign-based vote needs labels in {-1, +1}
y_train[y_train == 0] = -1
y_val[y_val == 0] = -1
y_test[y_test == 0] = -1

t1 = time.time()
adaboost(X_train, y_train, X_test, y_test, clf, M=100)
t2 = time.time()
print(t2 - t1)
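As a rough cross-check (not part of the original notebook), scikit-learn's AdaBoostClassifier with algorithm="SAMME" implements the same discrete update and should land close to the accuracy printed above. Note the base-estimator keyword changed from base_estimator to estimator in scikit-learn 1.2:

from sklearn.ensemble import AdaBoostClassifier

# Cross-check against scikit-learn's discrete AdaBoost (SAMME).
# On scikit-learn >= 1.2, pass estimator= instead of base_estimator=.
sk_boost = AdaBoostClassifier(
    base_estimator=tree.DecisionTreeClassifier(max_depth=2),
    n_estimators=100,
    algorithm="SAMME",
)
sk_boost.fit(X_train, y_train)
print(accuracy_score(y_test, sk_boost.predict(X_test)))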
def bagging(X, y, X_val, y_val, clf, M, N):
    prediction_arr = np.zeros((M, len(y_val)))
    for m in range(M):
        # Draw a random subsample of N points; replace=False means sampling
        # without replacement ("pasting"), not a classical bootstrap
        idx = np.random.choice(X.shape[0], N, replace=False)
        X_sample = X[idx, :]
        y_sample = y[idx]
        classifier = clf
        classifier.fit(X_sample, y_sample)
        prediction_arr[m] = classifier.predict(X_val)

    # Majority vote over the M fitted classifiers
    often = np.zeros(len(y_val))
    for i in range(len(y_val)):
        often[i] = mode(prediction_arr[:, i])[0][0]

    print('Accuracy = ', (often == y_val).sum() / len(y_val))
    return (often == y_val).sum() / len(y_val)
clf = tree.DecisionTreeClassifier(max_depth=2, max_leaf_nodes=None)
y_train[y_train == 0] = -1
y_val[y_val == 0] = -1
y_test[y_test == 0] = -1

# Average accuracy over 100 repetitions of bagging
# (M = 200 rounds, N = 100 samples per round)
tmp_bag = 0.
t1 = time.time()
for i in range(100):
    tmp_bag += bagging(X_train, y_train, X_test, y_test, clf, 200, 100)
t2 = time.time()
print((t2 - t1) / 100)  # mean wall-clock time per run
print(tmp_bag / 100)    # mean accuracy
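Because the function above samples without replacement, it is closer to "pasting" than to bootstrap bagging. For a rough cross-check (again, not in the original notebook), scikit-learn's BaggingClassifier with bootstrap=False reproduces that sampling scheme; as above, the keyword is estimator rather than base_estimator on scikit-learn 1.2+:

from sklearn.ensemble import BaggingClassifier

# bootstrap=False matches the replace=False subsampling used above
sk_bag = BaggingClassifier(
    base_estimator=tree.DecisionTreeClassifier(max_depth=2),
    n_estimators=200,
    max_samples=100,
    bootstrap=False,
)
sk_bag.fit(X_train, y_train)
print(accuracy_score(y_test, sk_bag.predict(X_test)))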
'''
depth = np.array([1, 2, 3, 4, 5, 10])
for k in range(len(depth)):
    clf = tree.DecisionTreeClassifier(max_depth=depth[k], max_leaf_nodes=None)
    y_train[y_train == 0] = -1
    y_val[y_val == 0] = -1
    y_test[y_test == 0] = -1
    n_samples = np.array([1, 5, 10, 20, 50, 100, 200, 500])
    x_axis = np.linspace(1, len(n_samples), len(n_samples))
    save = np.zeros((len(n_samples), len(n_samples)))
    for i in range(len(n_samples)):
        for j in range(len(n_samples)):
            save[i, j] = bagging(X_train, y_train, X_val, y_val, clf, n_samples[i], n_samples[j])
        plt.plot(n_samples, save[i], label="Iterations: " + str(n_samples[i]))
    plt.title("Accuracy with depth = " + str(depth[k]))
    plt.xlabel("Size of sampling")
    plt.ylabel("Accuracy")
    plt.legend(loc="best")
    # Save before show(), otherwise the saved figure is blank
    plt.savefig("bagging_hyper_" + str(depth[k]) + ".png")
    plt.show()
'''
'''
depth = np.array([1, 2, 3, 4, 5, 10])
for k in range(len(depth)):
    clf = tree.DecisionTreeClassifier(max_depth=depth[k], max_leaf_nodes=None)
    y_train[y_train == 0] = -1
    y_val[y_val == 0] = -1
    y_test[y_test == 0] = -1
    n_samples = np.array([1, 5, 10, 20, 50, 100, 200, 500])
    x_axis = np.linspace(1, len(n_samples), len(n_samples))
    save = np.zeros(len(n_samples))
    for i in range(len(n_samples)):
        save[i] = adaboost(X_train, y_train, X_val, y_val, clf, M=n_samples[i])
    plt.plot(n_samples, save, label="depth = " + str(depth[k]))
plt.xlabel("Number of iterations")
plt.ylabel("Accuracy")
plt.legend(loc="best")
plt.title("Accuracy of Adaboost")
plt.show()
# name = "adaboost" + str(depth[k])
# plt.savefig(name + ".png")
'''
