%matplotlib inline
import numpy as np
import scipy.optimize
import sklearn.datasets
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True, precision=6, linewidth=200)
plt.style.use('ggplot')
def f(x, y):
return x ** 2 + y ** 2 + x * (y + 2) + np.cos(3 * x)
def grad_x_f(x, y):
return 2 * x - 3 * np.sin(3 * x) + y + 2
def grad_y_f(x, y):
return x + 2 * y
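# Optional sanity check (a sketch, not part of the original exercise): compare the
# analytic gradients above against central finite differences at an arbitrary test point.
eps_check = 1e-6
x0, y0 = 1.3, -0.7
print('grad_x_f:', grad_x_f(x0, y0), 'numeric:', (f(x0 + eps_check, y0) - f(x0 - eps_check, y0)) / (2 * eps_check))
print('grad_y_f:', grad_y_f(x0, y0), 'numeric:', (f(x0, y0 + eps_check) - f(x0, y0 - eps_check)) / (2 * eps_check))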
def plot_f_contours():
xx, yy = np.meshgrid(np.linspace(-5, 5), np.linspace(-5, 5))
zz = f(xx, yy)
plt.contourf(xx, yy, zz, 50)
plt.contour(xx, yy, zz, 50, alpha=0.2, colors='black', linestyles='solid')
plt.xlabel('x')
plt.ylabel('y')
plt.figure(figsize=(10, 7))
plot_f_contours()
def optimize_f(x, y, step_size, steps):
# keep track of the parameters we tried so far
x_hist, y_hist = [x], [y]
# run gradient descent for the number of steps
for step in range(steps):
# compute the gradients at the current point
dx = grad_x_f(x, y)
dy = grad_y_f(x, y)
# apply the gradient descent updates to x and y
x = x - (step_size * dx)
y = y - (step_size * dy)
# store the new parameters
x_hist.append(x)
y_hist.append(y)
return x, y, f(x, y), x_hist, y_hist
# helper function that plots the results of the gradient descent optimization
def plot_gradient_descent_results(x, y, val, x_hist, y_hist):
# plot the path on the contour plot
plt.figure(figsize=(20, 7))
plt.subplot(1, 2, 1)
plot_f_contours()
plt.plot(x_hist, y_hist, '.-')
# plot the learning curve
plt.subplot(1, 2, 2)
plt.plot(f(np.array(x_hist), np.array(y_hist)), '.r-')
    plt.title('Final value: %f' % val)
results = optimize_f(x=3, y=2, step_size=0.1, steps=10)
plot_gradient_descent_results(*results)
# tuned parameters: a larger step size and more steps settle into a better optimum
results = optimize_f(x=3, y=2, step_size=0.16, steps=100)
plot_gradient_descent_results(*results)
def optimize_f(x, y, step_size, steps, decay=1.0):
# keep track of the parameters we tried so far
x_hist, y_hist = [x], [y]
# run gradient descent for the number of steps
for step in range(steps):
# compute the gradients at this point
dx = grad_x_f(x, y)
dy = grad_y_f(x, y)
# apply the gradient descent updates to x and y
        x = x - step_size * (decay ** step) * dx  # step size decays geometrically with the step index
        y = y - step_size * (decay ** step) * dy
# store the new parameters
x_hist.append(x)
y_hist.append(y)
return x, y, f(x, y), x_hist, y_hist
# TODO: tune the parameters to find the local optimum
results = optimize_f(x=3, y=2, step_size=0.16, steps=100)
plot_gradient_descent_results(*results)
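# Illustrative sweep (a sketch): run the decayed optimizer with a few decay factors
# from the same starting point and report the final value each schedule reaches.
# The decay values below are arbitrary choices for comparison, not tuned answers.
for decay_try in (1.0, 0.99, 0.95, 0.9):
    x_f, y_f, val_f, _, _ = optimize_f(x=3, y=2, step_size=0.16, steps=100, decay=decay_try)
    print('decay=%.2f -> f=%.4f at (%.3f, %.3f)' % (decay_try, val_f, x_f, y_f))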
def sigmoid(x):
    sig = 1 / (1 + np.exp(-x))  # the logistic sigmoid
return sig
def sigmoid_grad(x):
    grad = sigmoid(x) * (1 - sigmoid(x))  # d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x))
return grad
# try with a random input
x = np.random.uniform(-10, 10, size=5)
print('x:', x)
print('sigmoid(x):', sigmoid(x))
print('sigmoid_grad(x):', sigmoid_grad(x))
# start with some random inputs
x = np.random.uniform(-2, 2, size=5)
# compute the symbolic gradient
print('Symbolic', sigmoid_grad(x))
# compute the numerical gradient of the sigmoid with a central finite difference
def sig_numerical(x):
    # symmetric difference of half a step on either side of x
    e = 1e-4
    return (sigmoid(x + 0.5 * e) - sigmoid(x - 0.5 * e)) / e
print('Numerical', sig_numerical(x))
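# Alternative check (a sketch): scipy.optimize.check_grad expects a scalar-valued
# function, so sum the sigmoid outputs; the gradient of that sum with respect to
# each input element is then exactly sigmoid_grad.
err_sig = scipy.optimize.check_grad(lambda v: np.sum(sigmoid(v)), sigmoid_grad, x)
print('check_grad error for sigmoid:', err_sig)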
def relu(x):
return np.maximum(x,0)
def relu_grad(x):
return np.greater(x,0).astype(int)
# try with a random input
x = np.random.uniform(-10, 10, size=5)
print('x:', x)
print('relu(x):', relu(x))
print('relu_grad(x):', relu_grad(x))
print()
# compare the symbolic and numerical ReLU gradients
def relu_numerical(x):
    # symmetric difference of half a step on either side of x
    e = 1e-4
    return (relu(x + 0.5 * e) - relu(x - 0.5 * e)) / e
print('Numerical', relu_numerical(x))
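# Optional comparison (a sketch): the symbolic and numerical ReLU gradients should
# agree except very close to x = 0, where the finite difference straddles the kink.
print('max abs difference:', np.max(np.abs(relu_grad(x) - relu_numerical(x))))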
x = np.linspace(-10, 10, 100)
plt.figure(figsize=(15, 8))
plt.subplot(2, 2, 1)
plt.plot(x, sigmoid(x), label='Sigmoid')
plt.xlabel('x')
plt.legend(loc='upper left')
plt.subplot(2, 2, 2)
plt.plot(x, relu(x), label='ReLU')
plt.xlabel('x')
plt.legend(loc='upper left')
plt.subplot(2, 2, 3)
plt.plot(x, sigmoid_grad(x), label='Sigmoid gradient')
plt.xlabel('x')
plt.legend(loc='upper left')
plt.subplot(2, 2, 4)
plt.plot(x, relu_grad(x), label='ReLU gradient')
plt.xlabel('x')
plt.legend(loc='upper left');
def bce_loss(y, y_hat):
    # note: y_hat must lie strictly in (0, 1), otherwise the log terms blow up
    return -(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
def bce_loss_grad(y, y_hat):
return (y_hat - y) / (y_hat - y_hat ** 2)
# try with some random inputs
y = np.random.randint(2, size=5)
y_hat = np.random.uniform(0, 1, size=5)
print('y:', y)
print('y_hat:', y_hat)
print('bceloss(y, y_hat):', bce_loss(y, y_hat))
print('bceloss_grad(y, y_hat):', bce_loss_grad(y, y_hat))
# TODO: compute and compare the symbolic and numerical gradients
def numerical_gradient_bce(y, y_hat):
e = 0.0001
y_hat_increment = y_hat + e
y_hat_decrement = y_hat - e
return ( bce_loss(y, y_hat_increment) - bce_loss(y, y_hat_decrement) ) / (2 * e)
print('numerical gradient:', numerical_gradient_bce(y, y_hat))
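# Quick comparison (a sketch): with a central difference of step 1e-4 the symbolic
# and numerical BCE gradients should agree to several decimal places.
print('max abs difference:', np.max(np.abs(bce_loss_grad(y, y_hat) - numerical_gradient_bce(y, y_hat))))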
# initialize parameters
w = np.random.uniform(size=5)
b = np.random.rand()
# implement the model
def fn(x, y):
    # forward: compute the pre-activation h, the prediction y_hat and the loss
    h = np.dot(x, w) + b
    y_hat = sigmoid(h)
    loss = bce_loss(y, y_hat)
    # backward: propagate the loss gradient back to the input x via the chain rule
    grad_y_hat = bce_loss_grad(y, y_hat)
    grad_h = grad_y_hat * sigmoid_grad(h)
    grad_x = grad_h * w
return loss, grad_x
# test with a random input
x = np.random.uniform(size=5)
y = 1
loss, grad_x = fn(x, y)
print("Loss", loss)
print("Gradient", grad_x)
# start with some random inputs
x = np.random.uniform(size=5)
y = 1
# set epsilon to a small value
eps = 0.00001
numerical_grad = np.zeros(x.shape)
# compute the gradient for each element of x separately
for i in range(len(x)):
# compute inputs at -eps/2 and +eps/2
x_a, x_b = x.copy(), x.copy()
x_a[i] += eps / 2
x_b[i] -= eps / 2
# compute the gradient for this element
loss_a, _ = fn(x_a, y)
loss_b, _ = fn(x_b, y)
numerical_grad[i] = (loss_a - loss_b) / eps
# compute the symbolic gradient
loss, symbolic_grad = fn(x, y)
print("Symbolic gradient")
print(symbolic_grad)
print("Numerical gradient")
print(numerical_grad)
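# Optional summary metric (a sketch): a scale-invariant relative error is often more
# informative than eyeballing the raw vectors when gradient magnitudes vary.
rel_err = np.linalg.norm(symbolic_grad - numerical_grad) / (np.linalg.norm(symbolic_grad) + np.linalg.norm(numerical_grad))
print('relative error:', rel_err)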
# Computes y = x * w + b.
class Linear:
def __init__(self, n_in, n_out):
# initialize the weights randomly,
# using the Xavier initialization rule for scale
        a = np.sqrt(6 / (n_in + n_out))
self.W = np.random.uniform(-a, a, size=(n_in, n_out))
self.b = np.zeros((n_out,))
def forward(self, x):
# TODO: compute the forward pass
y = np.matmul(x, self.W) + self.b
return y
def backward(self, x, dy):
# TODO: compute the backward pass,
# given dy, compute the gradients for x, W and b
        # gradient w.r.t. the input, to be passed on to the previous layer
        dx = np.matmul(dy, self.W.transpose())
        # gradients w.r.t. the parameters, stored for the next step() call
        self.dW = np.matmul(x.transpose(), dy)
        self.db = np.sum(dy, axis=0)
return dx
def step(self, step_size):
self.W = self.W - step_size * self.dW
self.b = self.b - step_size * self.db
def __str__(self):
return 'Linear %dx%d' % self.W.shape
# Try the new class with some random values.
# Debugging tip: always choose a unique length for each
# dimension, so you'll get an error if you mix them up.
x = np.random.uniform(size=(3, 5))
layer = Linear(5, 7)
y = layer.forward(x)
dx = layer.backward(x, np.ones_like(y))
print('x:', x)
print('y:', y)
print('dx:', dx)
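# Small usage sketch: take one gradient-descent step on the layer's own parameters,
# using the dW and db accumulated by the backward() call above. The step size of 0.1
# is an arbitrary illustrative choice.
print('W[0, :3] before step:', layer.W[0, :3])
layer.step(0.1)
print('W[0, :3] after step: ', layer.W[0, :3])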
# Computes y = 1 / (1 + exp(-x)).
class Sigmoid:
def forward(self, x):
return sigmoid(x)
def backward(self, x, dy):
# TODO: compute the backward pass,
return dy * sigmoid_grad(x)
def step(self, step_size):
        pass  # the sigmoid has no trainable parameters, so there is nothing to update
def __str__(self):
return 'Sigmoid'
# try the new class with some random values
x = np.random.uniform(size=(3, 5))
layer = Sigmoid()
y = layer.forward(x)
dx = layer.backward(x, np.ones_like(y))
print('y:', y)
print('dx:', dx)
# Computes y = max(0, x).
class ReLU:
def forward(self, x):
return relu(x)
def backward(self, x, dy):
# TODO: compute the backward pass,
return dy * relu_grad(x)
def step(self, step_size):
        pass  # the ReLU has no trainable parameters, so there is nothing to update
def __str__(self):
return 'ReLU'
# try the new class with some random values
x = np.random.uniform(-10, 10, size=(3, 5))
layer = ReLU()
y = layer.forward(x)
dx = layer.backward(x, np.ones_like(y))
print('y:', y)
print('dx:', dx)
## Verify gradient computations for Linear
# test for dx
layer = Linear(5, 7)
def test_fn(x):
x = x.reshape(3, 5)
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(x):
x = x.reshape(3, 5)
# multiply the incoming dy gradient with a constant
return layer.backward(x, 2 * np.ones((3, 7))).flatten()
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=3 * 5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
# test for dW
x = np.random.uniform(size=(3, 5))
layer = Linear(5, 7)
def test_fn(w):
layer.W = w.reshape(5, 7)
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(w):
layer.W = w.reshape(5, 7)
# multiply the incoming dy gradient with a constant
layer.backward(x, 2 * np.ones((3, 7)))
return layer.dW.flatten()
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=5 * 7))
print("err on dW:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
# test for db
x = np.random.uniform(size=(3, 5))
layer = Linear(5, 7)
def test_fn(b):
layer.b = b
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(b):
layer.b = b
# multiply the incoming dy gradient with a constant
layer.backward(x, 2 * np.ones((x.shape[0], 7)))
return layer.db
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=7))
print("err on db:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
## Verify gradient computation for Sigmoid
# test for dx
layer = Sigmoid()
def test_fn(x):
# multiply the output with a constant to check if
# the gradient uses dy
return np.sum(2 * layer.forward(x))
def test_fn_grad(x):
# multiply the incoming dy gradient with a constant
return layer.backward(x, 2 * np.ones(x.shape))
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
## Verify gradient computation for ReLU
# test for dx
layer = ReLU()
def test_fn(x):
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(x):
# multiply the incoming dy gradient with a constant
return layer.backward(x, 2 * np.ones(x.shape))
# use strictly positive inputs so the finite difference never straddles the kink at x = 0
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
                                np.random.uniform(1, 10, size=5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
class Net:
def __init__(self, layers):
self.layers = layers
def forward(self, x):
# compute the forward pass for each layer
trace = []
for layer in self.layers:
# compute the forward pass
y = layer.forward(x)
# store the original input for the backward pass
trace.append((layer, x))
x = y
# return the final output and the history trace
return y, trace
def backward(self, trace, dy):
# compute the backward pass for each layer
for layer, x in trace[::-1]:
# compute the backward pass using the original input x
dy = layer.backward(x, dy)
def step(self, learning_rate):
# apply the gradient descent updates of each layer
for layer in self.layers:
layer.step(learning_rate)
def __str__(self):
return '\n'.join(str(l) for l in self.layers)
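# Minimal end-to-end sketch of the Net class: run a forward pass through a tiny
# network, confirm the output shape, then backpropagate a dummy gradient of ones.
# The layer sizes here are arbitrary illustrative choices.
tiny_net = Net([Linear(5, 4), ReLU(), Linear(4, 1), Sigmoid()])
x_tiny = np.random.uniform(size=(3, 5))
y_tiny, trace_tiny = tiny_net.forward(x_tiny)
print('output shape:', y_tiny.shape)  # expect (3, 1)
tiny_net.backward(trace_tiny, np.ones_like(y_tiny))
print('first-layer dW shape:', tiny_net.layers[0].dW.shape)  # expect (5, 4)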
# load the first two classes of the digits dataset
dataset = sklearn.datasets.load_digits()
digits_x, digits_y = dataset['data'], dataset['target']
# create a binary classification problem: digits 0-4 (class 1) vs. 5-9 (class 0)
digits_y = (digits_y < 5).astype(float)
# plot some of the digits
plt.figure(figsize=(10, 2))
plt.imshow(np.hstack([digits_x[i].reshape(8, 8) for i in range(10)]), cmap='gray')
plt.grid(False)
plt.tight_layout()
plt.axis('off')
# standardize the values to zero mean and unit variance
digits_x -= np.mean(digits_x)
digits_x /= np.std(digits_x)
# print some statistics
print('digits_x.shape:', digits_x.shape)
print('digits_y.shape:', digits_y.shape)
print('min, max values:', np.min(digits_x), np.max(digits_x))
print('labels:', np.unique(digits_y))
# make a 50%/50% train/test split
train_prop = 0.5
n_train = int(digits_x.shape[0] * train_prop)
# shuffle the images
idxs = np.random.permutation(digits_x.shape[0])
# take a subset
x = {'train': digits_x[idxs[:n_train]],
'test': digits_x[idxs[n_train:]]}
y = {'train': digits_y[idxs[:n_train]],
'test': digits_y[idxs[n_train:]]}
print('Training samples:', x['train'].shape[0])
print('Test samples:', x['test'].shape[0])
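# Sanity check (a sketch): with a random split both subsets should contain roughly
# equal fractions of the two classes, since digits 0-4 make up about half the data.
print('Positive fraction (train):', y['train'].mean())
print('Positive fraction (test): ', y['test'].mean())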
def fit(net, x, y, epochs=25, learning_rate=0.001, mb_size=10):
# initialize the loss and accuracy history
loss_hist = {'train': [], 'test': []}
accuracy_hist = {'train': [], 'test': []}
for epoch in range(epochs):
# initialize the loss and accuracy for this epoch
loss = {'train': 0.0, 'test': 0.0}
accuracy = {'train': 0.0, 'test': 0.0}
# first train on training data, then evaluate on the test data
for phase in ('train', 'test'):
# compute the number of minibatches
steps = x[phase].shape[0] // mb_size
# loop over all minibatches
for step in range(steps):
# get the samples for the current minibatch
x_mb = x[phase][(step * mb_size):((step + 1) * mb_size)]
y_mb = y[phase][(step * mb_size):((step + 1) * mb_size), None]
# compute the forward pass through the network
pred_y, trace = net.forward(x_mb)
# compute the current loss and accuracy
loss[phase] += np.mean(bce_loss(y_mb, pred_y))
accuracy[phase] += np.mean((y_mb > 0.5) == (pred_y > 0.5))
# only update the network in the training phase
if phase == 'train':
# compute the gradient for the loss
dy = bce_loss_grad(y_mb, pred_y)
# backpropagate the gradient through the network
net.backward(trace, dy)
# update the weights
net.step(learning_rate)
# compute the mean loss and accuracy over all minibatches
loss[phase] = loss[phase] / steps
accuracy[phase] = accuracy[phase] / steps
# add statistics to history
loss_hist[phase].append(loss[phase])
accuracy_hist[phase].append(accuracy[phase])
print('Epoch %3d: loss[train]=%7.4f accuracy[train]=%7.4f loss[test]=%7.4f accuracy[test]=%7.4f' %
(epoch, loss['train'], accuracy['train'], loss['test'], accuracy['test']))
# plot the learning curves
plt.figure(figsize=(20, 5))
plt.subplot(1, 2, 1)
for phase in loss_hist:
plt.plot(loss_hist[phase], label=phase)
plt.title('BCE loss')
plt.xlabel('Epoch')
plt.legend()
plt.subplot(1, 2, 2)
for phase in accuracy_hist:
plt.plot(accuracy_hist[phase], label=phase)
plt.title('Accuracy')
plt.xlabel('Epoch')
plt.legend()
# construct a network with one hidden layer and a ReLU nonlinearity
net = Net([
Linear(64, 32),
ReLU(),
Linear(32, 1),
Sigmoid()])
# TODO: tune the hyperparameters
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 32)
# construct the same network without the nonlinearity between the linear layers
net = Net([
Linear(64, 32),
Linear(32, 1),
Sigmoid()])
# TODO: tune the hyperparameters
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 32)
# construct a single linear layer followed by a sigmoid (logistic regression)
net = Net([
Linear(64, 1),
Sigmoid()])
# TODO: tune the hyperparameters
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 32)
# construct a much deeper network with four hidden layers
net = Net([
Linear(64, 32),
ReLU(),
Linear(32, 16),
ReLU(),
Linear(16, 8),
ReLU(),
Linear(8, 4),
ReLU(),
Linear(4, 1),
Sigmoid()])
# TODO: tune the hyperparameters
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 32)