%matplotlib inline
import numpy as np
import scipy.optimize
import sklearn.datasets
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True, precision=6, linewidth=200)
plt.style.use('ggplot')
def f(x, y):
return x ** 2 + y ** 2 + x * (y + 2) + np.cos(3 * x)
def grad_x_f(x, y):
return 2 * x - 3 * np.sin(3 * x) + y + 2
def grad_y_f(x, y):
return x + 2 * y
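# Sanity check (a minimal sketch; the test point and epsilon are arbitrary choices):
# compare the hand-derived gradients against central finite differences.
eps = 1e-5
x0, y0 = 1.3, -0.7
print('grad_x_f analytic vs numeric:', grad_x_f(x0, y0), (f(x0 + eps, y0) - f(x0 - eps, y0)) / (2 * eps))
print('grad_y_f analytic vs numeric:', grad_y_f(x0, y0), (f(x0, y0 + eps) - f(x0, y0 - eps)) / (2 * eps))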
def plot_f_contours():
xx, yy = np.meshgrid(np.linspace(-5, 5), np.linspace(-5, 5))
zz = f(xx, yy)
plt.contourf(xx, yy, zz, 50)
plt.contour(xx, yy, zz, 50, alpha=0.2, colors='black', linestyles='solid')
plt.xlabel('x')
plt.ylabel('y')
plt.figure(figsize=(10, 7))
plot_f_contours()
def optimize_f(x, y, step_size, steps):
# keep track of the parameters we tried so far
x_hist, y_hist = [x], [y]
# run gradient descent for the number of steps
for step in range(steps):
# compute the gradients at the current point
dx = grad_x_f(x, y)
dy = grad_y_f(x, y)
# apply the gradient descent updates to x and y
x = x - (step_size * dx)
y = y - (step_size * dy)
# store the new parameters
x_hist.append(x)
y_hist.append(y)
return x, y, f(x, y), x_hist, y_hist
# helper function that plots the results of the gradient descent optimization
def plot_gradient_descent_results(x, y, val, x_hist, y_hist):
# plot the path on the contour plot
plt.figure(figsize=(20, 7))
plt.subplot(1, 2, 1)
plot_f_contours()
plt.plot(x_hist, y_hist, '.-')
# plot the learning curve
plt.subplot(1, 2, 2)
plt.plot(f(np.array(x_hist), np.array(y_hist)), '.r-')
    plt.title('Minimum value: %f' % val)
results = optimize_f(x=3, y=2, step_size=0.1, steps=10)
plot_gradient_descent_results(*results)
# TODO: tune the parameters to find a better optimum
results = optimize_f(x=3, y=2, step_size=0.15, steps=500)
plot_gradient_descent_results(*results)
def optimize_f(x, y, step_size, steps, decay=1.0):
# keep track of the parameters we tried so far
x_hist, y_hist = [x], [y]
# run gradient descent for the number of steps
for step in range(steps):
# compute the gradients at this point
dx = grad_x_f(x, y)
dy = grad_y_f(x, y)
# apply the gradient descent updates to x and y
step_size = step_size * decay
x = x - (step_size * dx)
y = y - (step_size * dy)
# store the new parameters
x_hist.append(x)
y_hist.append(y)
return x, y, f(x, y), x_hist, y_hist
# TODO: tune the parameters to find the local optimum
results = optimize_f(x=3, y=2, step_size=0.5, steps=100, decay=0.95)
plot_gradient_descent_results(*results)
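# Cross-check (a sketch; the solver and starting point are assumptions, not part of the exercise):
# let scipy.optimize.minimize search from the same starting point and compare with the hand-rolled descent.
res = scipy.optimize.minimize(lambda p: f(p[0], p[1]),
                              x0=np.array([3.0, 2.0]),
                              jac=lambda p: np.array([grad_x_f(p[0], p[1]), grad_y_f(p[0], p[1])]))
print('scipy.optimize.minimize found f(%.4f, %.4f) = %.4f' % (res.x[0], res.x[1], res.fun))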
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
def sigmoid_grad(x):
return np.exp(-x) / ((1 + np.exp(-x))**2)
# try with a random input
x = np.random.uniform(-10, 10, size=5)
print('x:', x)
print('sigmoid(x):', sigmoid(x))
print('sigmoid_grad(x):', sigmoid_grad(x))
x: [-9.098796 9.979386 -5.079815 9.951826 -1.562108]
sigmoid(x): [0.000112 0.999954 0.006183 0.999952 0.173344]
sigmoid_grad(x): [0.000112 0.000046 0.006144 0.000048 0.143296]
# start with some random inputs
x = np.random.uniform(-2, 2, size=5)
# compute the symbolic gradient
print('Symbolic', sigmoid_grad(x))
# TODO: compute the numerical gradient
epsilon = 0.001
numerical_gradient = (sigmoid(x+0.5*epsilon)-sigmoid(x-0.5*epsilon))/epsilon
print('Numerical', numerical_gradient)
Symbolic [0.235547 0.249954 0.221157 0.176144 0.106473]
Numerical [0.235547 0.249954 0.221157 0.176144 0.106473]
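# The same comparison can be delegated to scipy.optimize.check_grad (a sketch; summing the
# outputs to obtain a scalar objective is an assumption made here, not part of the exercise).
err = scipy.optimize.check_grad(lambda v: np.sum(sigmoid(v)),
                                lambda v: sigmoid_grad(v),
                                np.random.uniform(-2, 2, size=5))
print('check_grad error for sigmoid:', err)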
def relu(x):
return np.maximum(0,x)
def relu_grad(x):
return np.maximum(0,np.sign(x))
# try with a random input
x = np.random.uniform(-10, 10, size=5)
print('x:', x)
print('relu(x):', relu(x))
print('relu_grad(x):', relu_grad(x))
epsilon = 0.001
num_relu_grad = (relu(x+0.5*epsilon)-relu(x-0.5*epsilon))/epsilon
print('num_relu_grad(x)', num_relu_grad)
# TODO: compute and compare the symbolic and numerical gradients
x: [-2.03056 -4.020691 -1.175124 -6.169135 1.66985 ]
relu(x): [0. 0. 0. 0. 1.66985]
relu_grad(x): [0. 0. 0. 0. 1.]
num_relu_grad(x) [0. 0. 0. 0. 1.]
x = np.linspace(-10, 10, 100)
plt.figure(figsize=(15, 8))
plt.subplot(2, 2, 1)
plt.plot(x, sigmoid(x), label='Sigmoid')
plt.xlabel('x')
plt.legend(loc='upper left')
plt.subplot(2, 2, 2)
plt.plot(x, relu(x), label='ReLU')
plt.xlabel('x')
plt.legend(loc='upper left')
plt.subplot(2, 2, 3)
plt.plot(x, sigmoid_grad(x), label='Sigmoid gradient')
plt.xlabel('x')
plt.legend(loc='upper left')
plt.subplot(2, 2, 4)
plt.plot(x, relu_grad(x), label='ReLU gradient')
plt.xlabel('x')
plt.legend(loc='upper left');
def bce_loss(y, y_hat):
return -(y*np.log(y_hat) + (1-y)*np.log(1-y_hat))
def bce_loss_grad(y, y_hat):
return (-y)/y_hat + (1-y)/(1-y_hat)
# try with some random inputs
y = np.random.randint(2, size=5)
y_hat = np.random.uniform(0, 1, size=5)
print('y:', y)
print('y_hat:', y_hat)
print('bceloss(y, y_hat):', bce_loss(y, y_hat))
epsilon = 0.00001
num_bce_grad = (bce_loss(y, y_hat+0.5*epsilon)-bce_loss(y,y_hat-0.5*epsilon))/epsilon
print('num_bce_grad(y, y_hat):', num_bce_grad)
print('bce_loss_grad(y, y_hat):', bce_loss_grad(y, y_hat))
y: [0 1 1 1 1]
y_hat: [0.053824 0.962561 0.198698 0.659568 0.548089]
bceloss(y, y_hat): [0.055326 0.038157 1.615968 0.41617 0.601318]
num_bce_grad(y, y_hat): [ 1.056885 -1.038895 -5.032759 -1.516144 -1.824523]
bce_loss_grad(y, y_hat): [ 1.056885 -1.038895 -5.032759 -1.516144 -1.824523]
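# Note that bce_loss is undefined at y_hat = 0 or y_hat = 1. A common safeguard (a sketch,
# not used in the experiments below) is to clip predictions away from the boundaries:
def bce_loss_safe(y, y_hat, eps=1e-12):
    y_hat = np.clip(y_hat, eps, 1 - eps)
    return -(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))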
# initialize parameters
w = np.random.uniform(size=5)
b = np.random.rand()
# implement the model
def fn(x, y):
# TODO: forward: compute h, y_hat, loss
    h = np.dot(x, w) + b
y_hat = sigmoid(h)
loss = bce_loss(y, y_hat)
# TODO: backward: compute grad_y_hat, grad_h, grad_x
grad_y_hat = bce_loss_grad(y, y_hat)
    grad_h = grad_y_hat * sigmoid_grad(h)  # chain rule: dL/dh = dL/dy_hat * sigmoid'(h)
    grad_x = grad_h * w                    # dL/dx = dL/dh * dh/dx, since h = x.w + b
return loss, grad_x
# test with a random input
x = np.random.uniform(size=5)
y = 1
loss, grad_x = fn(x, y)
print("Loss", loss)
print("Gradient", grad_x)
Loss 0.1659708964889129
Gradient [0.170876 0.076722 0.159885 0.006955 0.016888]
# start with some random inputs
x = np.random.uniform(size=5)
y = 1
# set epsilon to a small value
eps = 0.00001
numerical_grad = np.zeros(x.shape)
# compute the gradient for each element of x separately
for i in range(len(x)):
# compute inputs at -eps/2 and +eps/2
x_a, x_b = x.copy(), x.copy()
x_a[i] += eps / 2
x_b[i] -= eps / 2
# compute the gradient for this element
loss_a, _ = fn(x_a, y)
loss_b, _ = fn(x_b, y)
numerical_grad[i] = (loss_a - loss_b) / eps
# compute the symbolic gradient
loss, symbolic_grad = fn(x, y)
print("Symbolic gradient")
print(symbolic_grad)
print("Numerical gradient")
print(numerical_grad)
Symbolic gradient
[0.161812 0.072653 0.151404 0.006586 0.015992]
Numerical gradient
[-0.207983 -0.093383 -0.194604 -0.008465 -0.020555]
# Computes y = x @ W + b.
class Linear:
def __init__(self, n_in, n_out):
# initialize the weights randomly,
# using the Xavier initialization rule for scale
        a = np.sqrt(6 / (n_in + n_out))
self.W = np.random.uniform(-a, a, size=(n_in, n_out))
self.b = np.zeros((n_out,))
def forward(self, x):
# TODO: compute the forward pass
y = np.dot(x, self.W) + self.b
return y
def backward(self, x, dy):
# TODO: compute the backward pass,
# given dy, compute the gradients for x, W and b
dx = np.dot(dy, np.transpose(self.W))
self.dW = np.dot(np.transpose(x), dy)
        self.db = np.sum(dy, axis=0)  # sum the gradient over the batch dimension, shape (n_out,)
return dx
def step(self, step_size):
# TODO: apply a gradient descent update step
self.W = self.W - self.dW * step_size # TODO
self.b = self.b - self.db * step_size # TODO
def __str__(self):
return 'Linear %dx%d' % self.W.shape
# Try the new class with some random values.
# Debugging tip: always choose a unique length for each
# dimension, so you'll get an error if you mix them up.
x = np.random.uniform(size=(3, 5))
layer = Linear(5, 7)
y = layer.forward(x)
dx = layer.backward(x, np.ones_like(y))
print('y:', y)
print('dx:', dx)
y: [[ 0.339211 0.501915 0.037213 0.100355 0.278983 -0.186019 -0.291139]
[ 0.303208 0.592294 -0.073754 0.105227 0.291073 -0.26955 0.127579]
[ 0.100563 0.595091 -0.105874 -0.065961 0.368961 -0.206255 0.266175]]
dx: [[0.576871 0.132018 0.427378 0.021808 0.549294]
[0.576871 0.132018 0.427378 0.021808 0.549294]
[0.576871 0.132018 0.427378 0.021808 0.549294]]
# Computes y = 1 / (1 + exp(-x)).
class Sigmoid:
def forward(self, x):
# TODO: compute the forward pass
return sigmoid(x)
def backward(self, x, dy):
# TODO: compute the backward pass,
# return the gradient for x given dy
sigmoid_x = sigmoid(x)
self.dx = dy * (sigmoid_x * (1-sigmoid_x))
return self.dx
def step(self, step_size):
pass
def __str__(self):
return 'Sigmoid'
# try the new class with some random values
x = np.random.uniform(size=(3, 5))
layer = Sigmoid()
y = layer.forward(x)
dx = layer.backward(x, np.ones_like(y))
print('y:', y)
print('dx:', dx)
y: [[0.648987 0.545029 0.62944 0.644926 0.655771]
[0.652881 0.657442 0.549253 0.673219 0.68591 ]
[0.682264 0.509436 0.609581 0.58547 0.593213]]
dx: [[0.227803 0.247972 0.233245 0.228997 0.225735]
[0.226627 0.225212 0.247574 0.219995 0.215438]
[0.21678 0.249911 0.237992 0.242695 0.241311]]
# Computes y = max(0, x).
class ReLU:
def forward(self, x):
# TODO: compute the forward pass
return np.maximum(0,x)
def backward(self, x, dy):
# TODO: compute the backward pass,
# return the gradient for x given dy
return np.multiply(dy, np.int64(x > 0))
def step(self, step_size):
pass
def __str__(self):
return 'ReLU'
# try the new class with some random values
x = np.random.uniform(-10, 10, size=(3, 5))
layer = ReLU()
y = layer.forward(x)
dx = layer.backward(x, np.ones_like(y))
print('y:', y)
print('dx:', dx)
y: [[0. 2.290372 5.163791 0. 0. ]
[0. 0. 0. 2.037993 0. ]
[0. 0. 0. 0. 3.88808 ]]
dx: [[0. 1. 1. 0. 0.]
[0. 0. 0. 1. 0.]
[0. 0. 0. 0. 1.]]
## Verify gradient computations for Linear
# test for dx
layer = Linear(5, 7)
def test_fn(x):
x = x.reshape(3, 5)
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(x):
x = x.reshape(3, 5)
# multiply the incoming dy gradient with a constant
return layer.backward(x, 2 * np.ones((3, 7))).flatten()
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=3 * 5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
# test for dW
x = np.random.uniform(size=(3, 5))
layer = Linear(5, 7)
def test_fn(w):
layer.W = w.reshape(5, 7)
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(w):
layer.W = w.reshape(5, 7)
# multiply the incoming dy gradient with a constant
layer.backward(x, 2 * np.ones((3, 7)))
return layer.dW.flatten()
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=5 * 7))
print("err on dW:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
# test for db
x = np.random.uniform(size=(3, 5,))
layer = Linear(5, 7)
def test_fn(b):
layer.b = b
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(b):
layer.b = b
# multiply the incoming dy gradient with a constant
layer.backward(x, 2 * np.ones((x.shape[0], 7)))
return layer.db
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=7))
print("err on db:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
err on dx: OK 4.686306062709855e-07
err on dW: OK 4.035680032700044e-06
err on db: ERROR 21.166010488516726
## Verify gradient computation for Sigmoid
# test for dx
layer = Sigmoid()
def test_fn(x):
# multiply the output with a constant to check if
# the gradient uses dy
return np.sum(2 * layer.forward(x))
def test_fn_grad(x):
# multiply the incoming dy gradient with a constant
return layer.backward(x, 2 * np.ones(x.shape))
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
err on dx: OK 4.69246953421683e-08
## Verify gradient computation for ReLU
# test for dx
layer = ReLU()
def test_fn(x):
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(x):
# multiply the incoming dy gradient with a constant
return layer.backward(x, 2 * np.ones(x.shape))
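# note: the test inputs below are drawn from [1, 10] so the finite-difference check
# never straddles x = 0, where the ReLU gradient is not defined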
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(1, 10, size=5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
err on dx: OK 0.0
class Net:
def __init__(self, layers):
self.layers = layers
def forward(self, x):
# compute the forward pass for each layer
trace = []
for layer in self.layers:
# compute the forward pass
y = layer.forward(x)
# store the original input for the backward pass
trace.append((layer, x))
x = y
# return the final output and the history trace
return y, trace
def backward(self, trace, dy):
# compute the backward pass for each layer
for layer, x in trace[::-1]:
# compute the backward pass using the original input x
dy = layer.backward(x, dy)
def step(self, learning_rate):
# apply the gradient descent updates of each layer
for layer in self.layers:
layer.step(learning_rate)
def __str__(self):
return '\n'.join(str(l) for l in self.layers)
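# Quick smoke test (a sketch; the layer sizes and batch size are arbitrary):
# push random data through a small Net and check that forward and backward run with matching shapes.
tiny_net = Net([Linear(5, 3), ReLU(), Linear(3, 1), Sigmoid()])
x_check = np.random.uniform(size=(4, 5))
y_check, trace_check = tiny_net.forward(x_check)
tiny_net.backward(trace_check, np.ones_like(y_check))
print('tiny net output shape:', y_check.shape)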
# load the scikit-learn digits dataset (8x8 images of the digits 0-9)
dataset = sklearn.datasets.load_digits()
digits_x, digits_y = dataset['data'], dataset['target']
# create a binary classification problem
digits_y = (digits_y < 5).astype(float)
# plot some of the digits
plt.figure(figsize=(10, 2))
plt.imshow(np.hstack([digits_x[i].reshape(8, 8) for i in range(10)]), cmap='gray')
plt.grid(False)
plt.tight_layout()
plt.axis('off')
# standardize the values to zero mean and unit variance
digits_x -= np.mean(digits_x)
digits_x /= np.std(digits_x)
# print some statistics
print('digits_x.shape:', digits_x.shape)
print('digits_y.shape:', digits_y.shape)
print('min, max values:', np.min(digits_x), np.max(digits_x))
print('labels:', np.unique(digits_y))
digits_x.shape: (1797, 64)
digits_y.shape: (1797,)
min, max values: -0.8117561971974786 1.847470154168513
labels: [0. 1.]
# make a 50%/50% train/test split
train_prop = 0.5
n_train = int(digits_x.shape[0] * train_prop)
# shuffle the images
idxs = np.random.permutation(digits_x.shape[0])
# take a subset
x = {'train': digits_x[idxs[:n_train]],
'test': digits_x[idxs[n_train:]]}
y = {'train': digits_y[idxs[:n_train]],
'test': digits_y[idxs[n_train:]]}
print('Training samples:', x['train'].shape[0])
print('Test samples:', x['test'].shape[0])
Training samples: 898
Test samples: 899
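# Sanity check (a sketch): the random split should leave both classes roughly balanced in each subset.
for phase in ('train', 'test'):
    print('%s positive fraction: %.3f' % (phase, np.mean(y[phase])))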
def fit(net, x, y, epochs=25, learning_rate=0.001, mb_size=10):
# initialize the loss and accuracy history
loss_hist = {'train': [], 'test': []}
accuracy_hist = {'train': [], 'test': []}
for epoch in range(epochs):
# initialize the loss and accuracy for this epoch
loss = {'train': 0.0, 'test': 0.0}
accuracy = {'train': 0.0, 'test': 0.0}
# first train on training data, then evaluate on the test data
for phase in ('train', 'test'):
# compute the number of minibatches
steps = x[phase].shape[0] // mb_size
# loop over all minibatches
for step in range(steps):
# get the samples for the current minibatch
x_mb = x[phase][(step * mb_size):((step + 1) * mb_size)]
y_mb = y[phase][(step * mb_size):((step + 1) * mb_size), None]
# compute the forward pass through the network
pred_y, trace = net.forward(x_mb)
# compute the current loss and accuracy
loss[phase] += np.mean(bce_loss(y_mb, pred_y))
accuracy[phase] += np.mean((y_mb > 0.5) == (pred_y > 0.5))
# only update the network in the training phase
if phase == 'train':
# compute the gradient for the loss
dy = bce_loss_grad(y_mb, pred_y)
# backpropagate the gradient through the network
net.backward(trace, dy)
# update the weights
net.step(learning_rate)
# compute the mean loss and accuracy over all minibatches
loss[phase] = loss[phase] / steps
accuracy[phase] = accuracy[phase] / steps
# add statistics to history
loss_hist[phase].append(loss[phase])
accuracy_hist[phase].append(accuracy[phase])
print('Epoch %3d: loss[train]=%7.4f accuracy[train]=%7.4f loss[test]=%7.4f accuracy[test]=%7.4f' %
(epoch, loss['train'], accuracy['train'], loss['test'], accuracy['test']))
# plot the learning curves
plt.figure(figsize=(20, 5))
plt.subplot(1, 2, 1)
for phase in loss_hist:
plt.plot(loss_hist[phase], label=phase)
plt.title('BCE loss')
plt.xlabel('Epoch')
plt.legend()
plt.subplot(1, 2, 2)
for phase in accuracy_hist:
plt.plot(accuracy_hist[phase], label=phase)
plt.title('Accuracy')
plt.xlabel('Epoch')
plt.legend()
# construct network
net = Net([
Linear(64, 32),
ReLU(),
Linear(32, 1),
Sigmoid()])
# TODO: tune the hyperparameters
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 10)
Epoch 0: loss[train]= 0.4128 accuracy[train]= 0.8213 loss[test]= 0.2512 accuracy[test]= 0.9022
Epoch 1: loss[train]= 0.2213 accuracy[train]= 0.9270 loss[test]= 0.1727 accuracy[test]= 0.9315
Epoch 2: loss[train]= 0.1500 accuracy[train]= 0.9494 loss[test]= 0.1349 accuracy[test]= 0.9551
Epoch 3: loss[train]= 0.1150 accuracy[train]= 0.9663 loss[test]= 0.1174 accuracy[test]= 0.9596
Epoch 4: loss[train]= 0.0930 accuracy[train]= 0.9708 loss[test]= 0.1073 accuracy[test]= 0.9629
Epoch 5: loss[train]= 0.0767 accuracy[train]= 0.9764 loss[test]= 0.1015 accuracy[test]= 0.9640
Epoch 6: loss[train]= 0.0640 accuracy[train]= 0.9820 loss[test]= 0.0960 accuracy[test]= 0.9640
Epoch 7: loss[train]= 0.0541 accuracy[train]= 0.9899 loss[test]= 0.0930 accuracy[test]= 0.9663
Epoch 8: loss[train]= 0.0469 accuracy[train]= 0.9921 loss[test]= 0.0942 accuracy[test]= 0.9640
Epoch 9: loss[train]= 0.0400 accuracy[train]= 0.9933 loss[test]= 0.0932 accuracy[test]= 0.9640
Epoch 10: loss[train]= 0.0355 accuracy[train]= 0.9933 loss[test]= 0.0928 accuracy[test]= 0.9663
Epoch 11: loss[train]= 0.0303 accuracy[train]= 0.9944 loss[test]= 0.0919 accuracy[test]= 0.9640
Epoch 12: loss[train]= 0.0274 accuracy[train]= 0.9944 loss[test]= 0.0894 accuracy[test]= 0.9652
Epoch 13: loss[train]= 0.0241 accuracy[train]= 0.9966 loss[test]= 0.0899 accuracy[test]= 0.9674
Epoch 14: loss[train]= 0.0217 accuracy[train]= 0.9955 loss[test]= 0.0890 accuracy[test]= 0.9674
Epoch 15: loss[train]= 0.0191 accuracy[train]= 0.9966 loss[test]= 0.0895 accuracy[test]= 0.9685
Epoch 16: loss[train]= 0.0167 accuracy[train]= 0.9978 loss[test]= 0.0890 accuracy[test]= 0.9697
Epoch 17: loss[train]= 0.0150 accuracy[train]= 0.9978 loss[test]= 0.0896 accuracy[test]= 0.9697
Epoch 18: loss[train]= 0.0137 accuracy[train]= 0.9989 loss[test]= 0.0886 accuracy[test]= 0.9708
Epoch 19: loss[train]= 0.0121 accuracy[train]= 0.9989 loss[test]= 0.0883 accuracy[test]= 0.9685
Epoch 20: loss[train]= 0.0107 accuracy[train]= 0.9989 loss[test]= 0.0890 accuracy[test]= 0.9685
Epoch 21: loss[train]= 0.0097 accuracy[train]= 1.0000 loss[test]= 0.0886 accuracy[test]= 0.9685
Epoch 22: loss[train]= 0.0089 accuracy[train]= 1.0000 loss[test]= 0.0877 accuracy[test]= 0.9685
Epoch 23: loss[train]= 0.0082 accuracy[train]= 1.0000 loss[test]= 0.0879 accuracy[test]= 0.9697
Epoch 24: loss[train]= 0.0075 accuracy[train]= 1.0000 loss[test]= 0.0884 accuracy[test]= 0.9697
# TODO: Your code here.
# construct network
net = Net([
Linear(64, 32),
Linear(32, 1),
Sigmoid()])
# TODO: tune the hyperparameters
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 10)
Epoch 0: loss[train]= 0.4589 accuracy[train]= 0.7843 loss[test]= 0.3148 accuracy[test]= 0.8685
Epoch 1: loss[train]= 0.3484 accuracy[train]= 0.8596 loss[test]= 0.2906 accuracy[test]= 0.8764
Epoch 2: loss[train]= 0.3265 accuracy[train]= 0.8742 loss[test]= 0.2809 accuracy[test]= 0.8843
Epoch 3: loss[train]= 0.3152 accuracy[train]= 0.8798 loss[test]= 0.2760 accuracy[test]= 0.8888
Epoch 4: loss[train]= 0.3079 accuracy[train]= 0.8854 loss[test]= 0.2735 accuracy[test]= 0.8921
Epoch 5: loss[train]= 0.3026 accuracy[train]= 0.8910 loss[test]= 0.2724 accuracy[test]= 0.8955
Epoch 6: loss[train]= 0.2987 accuracy[train]= 0.8933 loss[test]= 0.2720 accuracy[test]= 0.9000
Epoch 7: loss[train]= 0.2955 accuracy[train]= 0.8921 loss[test]= 0.2720 accuracy[test]= 0.9011
Epoch 8: loss[train]= 0.2930 accuracy[train]= 0.8921 loss[test]= 0.2722 accuracy[test]= 0.9000
Epoch 9: loss[train]= 0.2908 accuracy[train]= 0.8933 loss[test]= 0.2726 accuracy[test]= 0.9000
Epoch 10: loss[train]= 0.2890 accuracy[train]= 0.8933 loss[test]= 0.2730 accuracy[test]= 0.8989
Epoch 11: loss[train]= 0.2874 accuracy[train]= 0.8944 loss[test]= 0.2734 accuracy[test]= 0.8989
Epoch 12: loss[train]= 0.2861 accuracy[train]= 0.8933 loss[test]= 0.2739 accuracy[test]= 0.8989
Epoch 13: loss[train]= 0.2848 accuracy[train]= 0.8944 loss[test]= 0.2743 accuracy[test]= 0.9000
Epoch 14: loss[train]= 0.2837 accuracy[train]= 0.8966 loss[test]= 0.2747 accuracy[test]= 0.9000
Epoch 15: loss[train]= 0.2828 accuracy[train]= 0.8978 loss[test]= 0.2752 accuracy[test]= 0.9000
Epoch 16: loss[train]= 0.2819 accuracy[train]= 0.9022 loss[test]= 0.2756 accuracy[test]= 0.9000
Epoch 17: loss[train]= 0.2811 accuracy[train]= 0.9034 loss[test]= 0.2759 accuracy[test]= 0.9000
Epoch 18: loss[train]= 0.2803 accuracy[train]= 0.9022 loss[test]= 0.2763 accuracy[test]= 0.9000
Epoch 19: loss[train]= 0.2796 accuracy[train]= 0.9022 loss[test]= 0.2767 accuracy[test]= 0.9000
Epoch 20: loss[train]= 0.2790 accuracy[train]= 0.9022 loss[test]= 0.2770 accuracy[test]= 0.9000
Epoch 21: loss[train]= 0.2784 accuracy[train]= 0.9022 loss[test]= 0.2773 accuracy[test]= 0.9000
Epoch 22: loss[train]= 0.2778 accuracy[train]= 0.9011 loss[test]= 0.2776 accuracy[test]= 0.9000
Epoch 23: loss[train]= 0.2773 accuracy[train]= 0.9022 loss[test]= 0.2779 accuracy[test]= 0.9000
Epoch 24: loss[train]= 0.2768 accuracy[train]= 0.9011 loss[test]= 0.2782 accuracy[test]= 0.9000
# TODO: Your code here.
# construct network
net = Net([
Linear(64, 1),
Sigmoid()])
# TODO: tune the hyperparameters
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 10)
Epoch 0: loss[train]= 0.4900 accuracy[train]= 0.7809 loss[test]= 0.3529 accuracy[test]= 0.8640
Epoch 1: loss[train]= 0.3616 accuracy[train]= 0.8483 loss[test]= 0.3127 accuracy[test]= 0.8865
Epoch 2: loss[train]= 0.3324 accuracy[train]= 0.8685 loss[test]= 0.2956 accuracy[test]= 0.8933
Epoch 3: loss[train]= 0.3170 accuracy[train]= 0.8820 loss[test]= 0.2862 accuracy[test]= 0.8921
Epoch 4: loss[train]= 0.3072 accuracy[train]= 0.8876 loss[test]= 0.2804 accuracy[test]= 0.8944
Epoch 5: loss[train]= 0.3004 accuracy[train]= 0.8899 loss[test]= 0.2766 accuracy[test]= 0.8989
Epoch 6: loss[train]= 0.2955 accuracy[train]= 0.8921 loss[test]= 0.2742 accuracy[test]= 0.9000
Epoch 7: loss[train]= 0.2918 accuracy[train]= 0.8933 loss[test]= 0.2725 accuracy[test]= 0.8989
Epoch 8: loss[train]= 0.2889 accuracy[train]= 0.8933 loss[test]= 0.2715 accuracy[test]= 0.9000
Epoch 9: loss[train]= 0.2866 accuracy[train]= 0.8921 loss[test]= 0.2708 accuracy[test]= 0.9011
Epoch 10: loss[train]= 0.2847 accuracy[train]= 0.8966 loss[test]= 0.2704 accuracy[test]= 0.9011
Epoch 11: loss[train]= 0.2831 accuracy[train]= 0.8978 loss[test]= 0.2702 accuracy[test]= 0.9000
Epoch 12: loss[train]= 0.2818 accuracy[train]= 0.8989 loss[test]= 0.2702 accuracy[test]= 0.9011
Epoch 13: loss[train]= 0.2807 accuracy[train]= 0.9011 loss[test]= 0.2702 accuracy[test]= 0.9011
Epoch 14: loss[train]= 0.2797 accuracy[train]= 0.9000 loss[test]= 0.2704 accuracy[test]= 0.9000
Epoch 15: loss[train]= 0.2789 accuracy[train]= 0.9000 loss[test]= 0.2706 accuracy[test]= 0.9000
Epoch 16: loss[train]= 0.2781 accuracy[train]= 0.8989 loss[test]= 0.2708 accuracy[test]= 0.9000
Epoch 17: loss[train]= 0.2775 accuracy[train]= 0.9011 loss[test]= 0.2711 accuracy[test]= 0.9022
Epoch 18: loss[train]= 0.2769 accuracy[train]= 0.9022 loss[test]= 0.2715 accuracy[test]= 0.9022
Epoch 19: loss[train]= 0.2764 accuracy[train]= 0.9022 loss[test]= 0.2718 accuracy[test]= 0.9000
Epoch 20: loss[train]= 0.2759 accuracy[train]= 0.9022 loss[test]= 0.2721 accuracy[test]= 0.9011
Epoch 21: loss[train]= 0.2755 accuracy[train]= 0.9022 loss[test]= 0.2725 accuracy[test]= 0.9011
Epoch 22: loss[train]= 0.2751 accuracy[train]= 0.9011 loss[test]= 0.2729 accuracy[test]= 0.9034
Epoch 23: loss[train]= 0.2747 accuracy[train]= 0.9011 loss[test]= 0.2732 accuracy[test]= 0.9045
Epoch 24: loss[train]= 0.2744 accuracy[train]= 0.9011 loss[test]= 0.2736 accuracy[test]= 0.9045
# TODO: Your code here.
# construct network
net = Net([
Linear(64, 32),
ReLU(),
Linear(32, 16),
ReLU(),
Linear(16,8),
ReLU(),
Linear(8,1),
Sigmoid()])
# TODO: tune the hyperparameters
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 10)
Epoch 0: loss[train]= 0.6906 accuracy[train]= 0.6315 loss[test]= 0.6830 accuracy[test]= 0.7989
Epoch 1: loss[train]= 0.5793 accuracy[train]= 0.7955 loss[test]= 0.3588 accuracy[test]= 0.8517
Epoch 2: loss[train]= 0.3062 accuracy[train]= 0.8888 loss[test]= 0.2032 accuracy[test]= 0.9258
Epoch 3: loss[train]= 0.1875 accuracy[train]= 0.9236 loss[test]= 0.1414 accuracy[test]= 0.9393
Epoch 4: loss[train]= 0.1299 accuracy[train]= 0.9494 loss[test]= 0.1232 accuracy[test]= 0.9483
Epoch 5: loss[train]= 0.0984 accuracy[train]= 0.9652 loss[test]= 0.1070 accuracy[test]= 0.9607
Epoch 6: loss[train]= 0.0697 accuracy[train]= 0.9764 loss[test]= 0.1062 accuracy[test]= 0.9618
Epoch 7: loss[train]= 0.0430 accuracy[train]= 0.9910 loss[test]= 0.0995 accuracy[test]= 0.9663
Epoch 8: loss[train]= 0.0314 accuracy[train]= 0.9933 loss[test]= 0.1002 accuracy[test]= 0.9674
Epoch 9: loss[train]= 0.0277 accuracy[train]= 0.9921 loss[test]= 0.1120 accuracy[test]= 0.9652
Epoch 10: loss[train]= 0.0218 accuracy[train]= 0.9944 loss[test]= 0.1104 accuracy[test]= 0.9708
Epoch 11: loss[train]= 0.0423 accuracy[train]= 0.9854 loss[test]= 0.0917 accuracy[test]= 0.9708
Epoch 12: loss[train]= 0.0146 accuracy[train]= 0.9978 loss[test]= 0.1048 accuracy[test]= 0.9685
Epoch 13: loss[train]= 0.0122 accuracy[train]= 0.9978 loss[test]= 0.1081 accuracy[test]= 0.9674
Epoch 14: loss[train]= 0.0101 accuracy[train]= 0.9978 loss[test]= 0.1112 accuracy[test]= 0.9674
Epoch 15: loss[train]= 0.0086 accuracy[train]= 0.9978 loss[test]= 0.1150 accuracy[test]= 0.9685
Epoch 16: loss[train]= 0.0780 accuracy[train]= 0.9753 loss[test]= 0.1661 accuracy[test]= 0.9506
Epoch 17: loss[train]= 0.0548 accuracy[train]= 0.9876 loss[test]= 0.0940 accuracy[test]= 0.9730
Epoch 18: loss[train]= 0.0229 accuracy[train]= 0.9933 loss[test]= 0.0929 accuracy[test]= 0.9697
Epoch 19: loss[train]= 0.0196 accuracy[train]= 0.9910 loss[test]= 0.1225 accuracy[test]= 0.9652
Epoch 20: loss[train]= 0.0267 accuracy[train]= 0.9933 loss[test]= 0.0927 accuracy[test]= 0.9708
Epoch 21: loss[train]= 0.0096 accuracy[train]= 0.9978 loss[test]= 0.0933 accuracy[test]= 0.9697
Epoch 22: loss[train]= 0.0051 accuracy[train]= 0.9989 loss[test]= 0.0932 accuracy[test]= 0.9730
Epoch 23: loss[train]= 0.0044 accuracy[train]= 0.9989 loss[test]= 0.0942 accuracy[test]= 0.9753
Epoch 24: loss[train]= 0.0012 accuracy[train]= 1.0000 loss[test]= 0.0968 accuracy[test]= 0.9764
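# Optional follow-up (a sketch; the 0.5 threshold and variable names are assumptions):
# evaluate the last trained network on the full test set and break the errors down.
pred_test, _ = net.forward(x['test'])
pred_labels = (pred_test[:, 0] > 0.5).astype(float)
print('test accuracy: %.4f' % np.mean(pred_labels == y['test']))
print('false positives:', int(np.sum((pred_labels == 1) & (y['test'] == 0))))
print('false negatives:', int(np.sum((pred_labels == 0) & (y['test'] == 1))))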