CV - hw5

# Setup from time import time from collections import defaultdict import numpy as np import matplotlib.pyplot as plt from matplotlib import rc from skimage import io %matplotlib inline plt.rcParams['image.interpolation'] = 'nearest' plt.rcParams['image.cmap'] = 'gray' # for auto-reloading external modules %load_ext autoreload %autoreload 2

plt.rcParams['figure.figsize'] = (15.0, 12.0) image = io.imread('pitbull.jpg', as_gray=True) plt.imshow(image) plt.axis('off') plt.show()

from compression import compress_image compressed_image, compressed_size = compress_image(image, 100) compression_ratio = compressed_size / image.size print('Original image shape:', image.shape) print('Compressed size: %d' % compressed_size) print('Compression ratio: %.3f' % compression_ratio) assert compressed_size == 298500

# Number of singular values to keep n_values = [10, 50, 100] for n in n_values: # Compress the image using `n` singular values compressed_image, compressed_size = compress_image(image, n) compression_ratio = compressed_size / image.size print("Data size (original): %d" % (image.size)) print("Data size (compressed): %d" % compressed_size) print("Compression ratio: %f" % (compression_ratio)) plt.imshow(compressed_image, cmap='gray') title = "n = %s" % n plt.title(title) plt.axis('off') plt.show()

from utils import load_dataset X_train, y_train, classes_train = load_dataset('faces', train=True, as_gray=True) X_test, y_test, classes_test = load_dataset('faces', train=False, as_gray=True) assert classes_train == classes_test classes = classes_train print('Class names:', classes) print('Training data shape:', X_train.shape) print('Training labels shape: ', y_train.shape) print('Test data shape:', X_test.shape) print('Test labels shape: ', y_test.shape)

# Visualize some examples from the dataset. # We show a few examples of training images from each class. num_classes = len(classes) samples_per_class = 10 for y, cls in enumerate(classes): idxs = np.flatnonzero(y_train == y) idxs = np.random.choice(idxs, samples_per_class, replace=False) for i, idx in enumerate(idxs): plt_idx = i * num_classes + y + 1 plt.subplot(samples_per_class, num_classes, plt_idx) plt.imshow(X_train[idx]) plt.axis('off') if i == 0: plt.title(y) plt.show()

# Flatten the image data into rows # we now have one 4096 dimensional featue vector for each example X_train = np.reshape(X_train, (X_train.shape[0], -1)) X_test = np.reshape(X_test, (X_test.shape[0], -1)) print("Training data shape:", X_train.shape) print("Test data shape:", X_test.shape)

from k_nearest_neighbor import compute_distances # Step 1: compute the distances between all features from X_train and from X_test dists = compute_distances(X_test, X_train) assert dists.shape == (160, 800) print("dists shape:", dists.shape)

from k_nearest_neighbor import predict_labels # We use k = 1 (which corresponds to only taking the nearest neighbor to decide) y_test_pred = predict_labels(dists, y_train, k=1) # Compute and print the fraction of correctly predicted examples num_test = y_test.shape[0] num_correct = np.sum(y_test_pred == y_test) accuracy = float(num_correct) / num_test print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))

from k_nearest_neighbor import split_folds # Step 2: split the data into 5 folds to perform cross-validation. num_folds = 5 X_trains, y_trains, X_vals, y_vals = split_folds(X_train, y_train, num_folds) assert X_trains.shape == (5, 640, 4096) assert y_trains.shape == (5, 640) assert X_vals.shape == (5, 160, 4096) assert y_vals.shape == (5, 160)

# Step 3: Measure the mean accuracy for each value of `k` # List of k to choose from k_choices = list(range(5, 101, 5)) # Dictionnary mapping k values to accuracies # For each k value, we will have `num_folds` accuracies to compute # k_to_accuracies[1] will be for instance [0.22, 0.23, 0.19, 0.25, 0.20] for 5 folds k_to_accuracies = {} for k in k_choices: print("Running for k=%d" % k) accuracies = [] for i in range(num_folds): # Make predictions fold_dists = compute_distances(X_vals[i], X_trains[i]) y_pred = predict_labels(fold_dists, y_trains[i], k) # Compute and print the fraction of correctly predicted examples num_correct = np.sum(y_pred == y_vals[i]) accuracy = float(num_correct) / len(y_vals[i]) accuracies.append(accuracy) k_to_accuracies[k] = accuracies

# plot the raw observations for k in k_choices: accuracies = k_to_accuracies[k] plt.scatter([k] * len(accuracies), accuracies) # plot the trend line with error bars that correspond to standard deviation accuracies_mean = np.array([np.mean(v) for k,v in sorted(k_to_accuracies.items())]) accuracies_std = np.array([np.std(v) for k,v in sorted(k_to_accuracies.items())]) plt.errorbar(k_choices, accuracies_mean, yerr=accuracies_std) plt.title('Cross-validation on k') plt.xlabel('k') plt.ylabel('Cross-validation accuracy') plt.show()

# Based on the cross-validation results above, choose the best value for k, # retrain the classifier using all the training data, and test it on the test # data. You should be able to get above 26% accuracy on the test data. best_k = None # YOUR CODE HERE # Choose the best k based on the cross validation above best_k = k_choices[np.argmax(accuracies_mean)] # END YOUR CODE y_test_pred = predict_labels(dists, y_train, k=best_k) # Compute and display the accuracy num_correct = np.sum(y_test_pred == y_test) accuracy = float(num_correct) / num_test print('For k = %d, got %d / %d correct => accuracy: %f' % (best_k, num_correct, num_test, accuracy))

from features import PCA pca = PCA()

# Perform eigenvalue decomposition on the covariance matrix of training data. e_vecs, e_vals = pca._eigen_decomp(X_train - X_train.mean(axis=0)) print(e_vals.shape) print(e_vecs.shape)

# Perform SVD on directly on the training data. u, s = pca._svd(X_train - X_train.mean(axis=0)) print(s.shape) print(u.shape)

# Test whether the square of singular values and eigenvalues are the same. # We also observe that `e_vecs` and `u` are the same (only the sign of each column can differ). N = X_train.shape[0] assert np.allclose((s ** 2) / (N - 1), e_vals[:len(s)]) for i in range(len(s) - 1): assert np.allclose(e_vecs[:, i], u[:, i]) or np.allclose(e_vecs[:, i], -u[:, i]) # (the last eigenvector for i = len(s) - 1 is very noisy because the eigenvalue is almost 0, # so imprecisions in the computation build up)

# Dimensionality reduction by projecting the data onto # lower dimensional subspace spanned by k principal components # To visualize, we will project in 2 dimensions n_components = 2 pca.fit(X_train) X_proj = pca.transform(X_train, n_components) # Plot the top two principal components for y in np.unique(y_train): plt.scatter(X_proj[y_train==y,0], X_proj[y_train==y,1], label=classes[y]) plt.xlabel('1st component') plt.ylabel('2nd component') plt.legend() plt.show()

for i in range(10): plt.subplot(1, 10, i+1) plt.imshow(pca.W_pca[:, i].reshape(64, 64)) plt.title("%.2f" % s[i]) plt.show()

# Reconstruct data with principal components n_components = 100 # Experiment with different number of components. X_proj = pca.transform(X_train, n_components) X_rec = pca.reconstruct(X_proj) print(X_rec.shape) print(classes) # Visualize reconstructed faces samples_per_class = 10 for y, cls in enumerate(classes): idxs = np.flatnonzero(y_train == y) idxs = np.random.choice(idxs, samples_per_class, replace=False) for i, idx in enumerate(idxs): plt_idx = i * num_classes + y + 1 plt.subplot(samples_per_class, num_classes, plt_idx) plt.imshow((X_rec[idx]).reshape((64, 64))) plt.axis('off') if i == 0: plt.title(y) plt.show()

# Plot reconstruction errors for different k N = X_train.shape[0] d = X_train.shape[1] ns = range(1, d, 100) errors = [] for n in ns: X_proj = pca.transform(X_train, n) X_rec = pca.reconstruct(X_proj) # Compute reconstruction error error = np.mean((X_rec - X_train) ** 2) errors.append(error) plt.plot(ns, errors) plt.xlabel('Number of Components') plt.ylabel('Reconstruction Error') plt.show()

# Plot captured variance ns = range(1, d, 100) var_cap = [] for n in ns: var_cap.append(np.sum(s[:n] ** 2)/np.sum(s ** 2)) plt.plot(ns, var_cap) plt.xlabel('Number of Components') plt.ylabel('Variance Captured') plt.show()

num_test = X_test.shape[0] # We computed the best k and n for you best_k = 20 best_n = 500 # PCA pca = PCA() pca.fit(X_train) X_proj = pca.transform(X_train, best_n) X_test_proj = pca.transform(X_test, best_n) # kNN dists = compute_distances(X_test_proj, X_proj) y_test_pred = predict_labels(dists, y_train, k=best_k) # Compute and display the accuracy num_correct = np.sum(y_test_pred == y_test) accuracy = float(num_correct) / num_test print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))

from features import LDA lda = LDA()

N = X_train.shape[0] c = num_classes pca = PCA() pca.fit(X_train) X_train_pca = pca.transform(X_train, N-c) X_test_pca = pca.transform(X_test, N-c)

# Compute within-class scatter matrix S_W = lda._within_class_scatter(X_train_pca, y_train) print(S_W.shape)

# Compute between-class scatter matrix S_B = lda._between_class_scatter(X_train_pca, y_train) print(S_B.shape)

lda.fit(X_train_pca, y_train)

# Dimensionality reduction by projecting the data onto # lower dimensional subspace spanned by k principal components n_components = 2 X_proj = lda.transform(X_train_pca, n_components) X_test_proj = lda.transform(X_test_pca, n_components) # Plot the top two principal components on the training set for y in np.unique(y_train): plt.scatter(X_proj[y_train==y, 0], X_proj[y_train==y, 1], label=classes[y]) plt.xlabel('1st component') plt.ylabel('2nd component') plt.legend() plt.title("Training set") plt.show() # Plot the top two principal components on the test set for y in np.unique(y_test): plt.scatter(X_test_proj[y_test==y, 0], X_test_proj[y_test==y,1], label=classes[y]) plt.xlabel('1st component') plt.ylabel('2nd component') plt.legend() plt.title("Test set") plt.show()

num_folds = 5 X_trains, y_trains, X_vals, y_vals = split_folds(X_train, y_train, num_folds) k_choices = [1, 5, 10, 20] n_choices = [5, 10, 20, 50, 100, 200, 500] # n_k_to_accuracies[(n, k)] should be a list of length num_folds giving the different # accuracy values that we found when using that value of n and k. n_k_to_accuracies = defaultdict(list) for i in range(num_folds): # Fit PCA pca = PCA() pca.fit(X_trains[i]) N = len(X_trains[i]) X_train_pca = pca.transform(X_trains[i], N-c) X_val_pca = pca.transform(X_vals[i], N-c) # Fit LDA lda = LDA() lda.fit(X_train_pca, y_trains[i]) for n in n_choices: X_train_proj = lda.transform(X_train_pca, n) X_val_proj = lda.transform(X_val_pca, n) dists = compute_distances(X_val_proj, X_train_proj) for k in k_choices: y_pred = predict_labels(dists, y_trains[i], k=k) # Compute and print the fraction of correctly predicted examples num_correct = np.sum(y_pred == y_vals[i]) accuracy = float(num_correct) / len(y_vals[i]) n_k_to_accuracies[(n, k)].append(accuracy) for n in n_choices: print() for k in k_choices: accuracies = n_k_to_accuracies[(n, k)] print("For n=%d, k=%d: average accuracy is %f" % (n, k, np.mean(accuracies)))

# Based on the cross-validation results above, choose the best value for k, # retrain the classifier using all the training data, and test it on the test # data. You should be able to get above 40% accuracy on the test data. best_k = None best_n = None # YOUR CODE HERE # Choose the best k based on the cross validation above best_n, best_k = max(n_k_to_accuracies, key=n_k_to_accuracies.get) # END YOUR CODE N = len(X_train) # Fit PCA pca = PCA() pca.fit(X_train) X_train_pca = pca.transform(X_train, N-c) X_test_pca = pca.transform(X_test, N-c) # Fit LDA lda = LDA() lda.fit(X_train_pca, y_train) # Project using LDA X_train_proj = lda.transform(X_train_pca, best_n) X_test_proj = lda.transform(X_test_pca, best_n) dists = compute_distances(X_test_proj, X_train_proj) y_test_pred = predict_labels(dists, y_train, k=best_k) # Compute and display the accuracy num_correct = np.sum(y_test_pred == y_test) accuracy = float(num_correct) / num_test print("For k=%d and n=%d" % (best_k, best_n)) print('Got %d / %d correct => accuracy: %f' % (num_correct, num_test, accuracy))