%matplotlib inline
import numpy as np
import scipy.optimize
import sklearn.datasets
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True, precision=6, linewidth=200)
plt.style.use('ggplot')
def f(x, y):
return x ** 2 + y ** 2 + x * (y + 2) + np.cos(3 * x)
def grad_x_f(x, y):
return 2 * x - 3 * np.sin(3 * x) + y + 2
def grad_y_f(x, y):
return x + 2 * y
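As a quick sanity check (not part of the original exercise), the hand-derived gradients can be compared against finite differences with scipy.optimize.check_grad; a small error confirms the formulas above. The test point is arbitrary.
# sketch: verify grad_x_f and grad_y_f numerically at an arbitrary point
def f_flat(p):
    return f(p[0], p[1])
def grad_f_flat(p):
    return np.array([grad_x_f(p[0], p[1]), grad_y_f(p[0], p[1])])
err = scipy.optimize.check_grad(f_flat, grad_f_flat, np.array([1.5, -0.5]))
print('check_grad error:', err)  # expect a small value, around 1e-6 or below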
def plot_f_contours():
xx, yy = np.meshgrid(np.linspace(-5, 5), np.linspace(-5, 5))
zz = f(xx, yy)
plt.contourf(xx, yy, zz, 50)
plt.contour(xx, yy, zz, 50, alpha=0.2, colors='black', linestyles='solid')
plt.xlabel('x')
plt.ylabel('y')
plt.figure(figsize=(10, 7))
plot_f_contours()
def optimize_f(x, y, step_size, steps):
# keep track of the parameters we tried so far
x_hist, y_hist = [x], [y]
# run gradient descent for the number of steps
for step in range(steps):
# compute the gradients at the current point
dx = grad_x_f(x, y)
dy = grad_y_f(x, y)
# apply the gradient descent updates to x and y
x = x - (step_size * dx)
y = y - (step_size * dy)
# store the new parameters
x_hist.append(x)
y_hist.append(y)
return x, y, f(x, y), x_hist, y_hist
# helper function that plots the results of the gradient descent optimization
def plot_gradient_descent_results(x, y, val, x_hist, y_hist):
# plot the path on the contour plot
plt.figure(figsize=(20, 7))
plt.subplot(1, 2, 1)
plot_f_contours()
plt.plot(x_hist, y_hist, '.-')
# plot the learning curve
plt.subplot(1, 2, 2)
plt.plot(f(np.array(x_hist), np.array(y_hist)), '.r-')
    plt.title('Minimum value: %f' % val)
results = optimize_f(x=3, y=2, step_size=0.1, steps=10)
plot_gradient_descent_results(*results)
# tuned: a larger step size and more steps reach a better optimum
results = optimize_f(x=3, y=2, step_size=0.16, steps=100)
plot_gradient_descent_results(*results)
def optimize_f(x, y, step_size, steps, decay=1.0):
# keep track of the parameters we tried so far
x_hist, y_hist = [x], [y]
# run gradient descent for the number of steps
for step in range(steps):
# compute the gradients at this point
dx = grad_x_f(x, y)
dy = grad_y_f(x, y)
        # apply the update, scaling the step size by decay**step
        x = x - (step_size * (decay ** step) * dx)
        y = y - (step_size * (decay ** step) * dy)
# store the new parameters
x_hist.append(x)
y_hist.append(y)
return x, y, f(x, y), x_hist, y_hist
# tuned parameters (decay is left at its default of 1.0 here; see the sketch below)
results = optimize_f(x=3, y=2, step_size=0.16, steps=100)
plot_gradient_descent_results(*results)
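The decay argument is not exercised in the run above. One illustrative setting (step size and decay chosen arbitrarily, not tuned): a larger initial step with geometric decay takes big moves early and then settles.
# illustrative only: geometric step-size decay
results = optimize_f(x=3, y=2, step_size=0.3, steps=100, decay=0.95)
plot_gradient_descent_results(*results)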
def sigmoid(x):
    sig = 1 / (1 + np.exp(-x))  # the logistic function
return sig
def sigmoid_grad(x):
    grad = sigmoid(x) * (1 - sigmoid(x))  # derivative of the logistic function
return grad
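For reference, the gradient formula follows from the quotient rule: $\sigma'(x) = \frac{e^{-x}}{(1+e^{-x})^2} = \sigma(x)\,(1-\sigma(x))$.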
# try with a random input
x = np.random.uniform(-10, 10, size=5)
print('x:', x)
print('sigmoid(x):', sigmoid(x))
print('sigmoid_grad(x):', sigmoid_grad(x))
x: [-4.44383 -9.736057 3.312347 0.975021 0.331723]
sigmoid(x): [0.011614 0.000059 0.96485 0.726119 0.582179]
sigmoid_grad(x): [0.011479 0.000059 0.033915 0.19887 0.243247]
# start with some random inputs
x = np.random.uniform(-2, 2, size=5)
# compute the symbolic gradient
print('Symbolic', sigmoid_grad(x))
# numerical gradient via a centered finite difference
def sig_numerical(x):
e = 0.0001
num = (sigmoid(x+(0.5*e)) - sigmoid(x-(0.5*e)))/e
return num
print('Numerical', sig_numerical(x))
Symbolic [0.202839 0.158799 0.178058 0.143906 0.21232 ]
Numerical [0.202839 0.158799 0.178058 0.143906 0.21232 ]
def relu(x):
return np.maximum(x,0)
def relu_grad(x):
return np.greater(x,0).astype(int)
# try with a random input
x = np.random.uniform(-10, 10, size=5)
print('x:', x)
print('relu(x):', relu(x))
print('relu_grad(x):', relu_grad(x))
print()
# compare the symbolic and numerical gradients
def relu_numerical(x):
e = 0.0001
num = (relu(x+(0.5*e)) - relu(x-(0.5*e)))/e
return num
print('Numerical', relu_numerical(x))
x: [ 0.189314 3.448005 -8.834715 -8.311136 -7.027012]
relu(x): [0.189314 3.448005 0. 0. 0. ]
relu_grad(x): [1 1 0 0 0]
Numerical [1. 1. 0. 0. 0.]
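Note that ReLU is not differentiable at x = 0; relu_grad uses the common subgradient convention relu_grad(0) = 0, while the centered difference would return exactly 0.5 there, so the symbolic and numerical values agree only away from the kink.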
x = np.linspace(-10, 10, 100)
plt.figure(figsize=(15, 8))
plt.subplot(2, 2, 1)
plt.plot(x, sigmoid(x), label='Sigmoid')
plt.xlabel('x')
plt.legend(loc='upper left')
plt.subplot(2, 2, 2)
plt.plot(x, relu(x), label='ReLU')
plt.xlabel('x')
plt.legend(loc='upper left')
plt.subplot(2, 2, 3)
plt.plot(x, sigmoid_grad(x), label='Sigmoid gradient')
plt.xlabel('x')
plt.legend(loc='upper left')
plt.subplot(2, 2, 4)
plt.plot(x, relu_grad(x), label='ReLU gradient')
plt.xlabel('x')
plt.legend(loc='upper left');
def bce_loss(y, y_hat):
return -( y * np.log(y_hat) + (1-y) * np.log(1-y_hat) )
def bce_loss_grad(y, y_hat):
return (y_hat - y) / (y_hat - y_hat ** 2)
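The gradient comes from differentiating the loss with respect to $\hat{y}$: $\frac{\partial L}{\partial \hat{y}} = -\frac{y}{\hat{y}} + \frac{1-y}{1-\hat{y}} = \frac{\hat{y}-y}{\hat{y}(1-\hat{y})}$, which matches the denominator y_hat - y_hat ** 2 in the code.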
# try with some random inputs
y = np.random.randint(2, size=5)
y_hat = np.random.uniform(0, 1, size=5)
print('y:', y)
print('y_hat:', y_hat)
print('bceloss(y, y_hat):', bce_loss(y, y_hat))
print('bceloss_grad(y, y_hat):', bce_loss_grad(y, y_hat))
# compare the symbolic and numerical gradients
def numerical_gradient_bce(y, y_hat):
e = 0.0001
y_hat_increment = y_hat + e
y_hat_decrement = y_hat - e
return ( bce_loss(y, y_hat_increment) - bce_loss(y, y_hat_decrement) ) / (2 * e)
print('numerical gradient:', numerical_gradient_bce(y, y_hat))
y: [0 1 1 1 0]
y_hat: [0.528383 0.625416 0.978426 0.610352 0.823556]
bceloss(y, y_hat): [0.751588 0.469338 0.02181 0.493719 1.734753]
bceloss_grad(y, y_hat): [ 2.120364 -1.598936 -1.02205 -1.638399 5.667528]
numerical gradient: [ 2.120364 -1.598936 -1.02205 -1.638399 5.667528]
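One caveat worth noting: bce_loss is undefined when a prediction hits exactly 0 or 1 (log of zero). A common safeguard, not required by this exercise, is to clip predictions first; a minimal sketch:
# sketch: clip predictions away from 0 and 1 before taking logs
def bce_loss_safe(y, y_hat, eps=1e-12):
    y_hat = np.clip(y_hat, eps, 1 - eps)
    return -(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))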
# initialize parameters
w = np.random.uniform(size=5)
b = np.random.rand()
# implement the model
def fn(x, y):
    # forward: compute h, y_hat, loss
    h = np.dot(x, w) + b
    y_hat = sigmoid(h)
    loss = bce_loss(y, y_hat)
    # backward: compute grad_y_hat, grad_h, grad_x via the chain rule
    grad_y_hat = bce_loss_grad(y, y_hat)
    grad_h = grad_y_hat * sigmoid_grad(h)
    grad_x = grad_h * w
    return loss, grad_x
# test with a random input
x = np.random.uniform(size=5)
y = 1
loss, grad_x = fn(x, y)
print("Loss", loss)
print("Gradient", grad_x)
Loss 0.10379424098359609
Gradient [-0.053601 -0.046361 -0.016126 -0.027251 -0.070656]
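A useful check on this backward pass: the BCE gradient and the sigmoid gradient cancel each other's $\hat{y}(1-\hat{y})$ factor, so $\frac{\partial L}{\partial h} = \frac{\hat{y}-y}{\hat{y}(1-\hat{y})} \cdot \hat{y}(1-\hat{y}) = \hat{y}-y$, the familiar logistic-regression residual.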
# start with some random inputs
x = np.random.uniform(size=5)
y = 1
# set epsilon to a small value
eps = 0.00001
numerical_grad = np.zeros(x.shape)
# compute the gradient for each element of x separately
for i in range(len(x)):
# compute inputs at -eps/2 and +eps/2
x_a, x_b = x.copy(), x.copy()
x_a[i] += eps / 2
x_b[i] -= eps / 2
# compute the gradient for this element
loss_a, _ = fn(x_a, y)
loss_b, _ = fn(x_b, y)
numerical_grad[i] = (loss_a - loss_b) / eps
# compute the symbolic gradient
loss, symbolic_grad = fn(x, y)
print("Symbolic gradient")
print(symbolic_grad)
print("Numerical gradient")
print(numerical_grad)
Symbolic gradient
[-0.073778 -0.063813 -0.022197 -0.037508 -0.097252]
Numerical gradient
[-0.073778 -0.063813 -0.022197 -0.037508 -0.097252]
# Computes y = x @ W + b.
class Linear:
def __init__(self, n_in, n_out):
# initialize the weights randomly,
# using the Xavier initialization rule for scale
        a = np.sqrt(6 / (n_in + n_out))
self.W = np.random.uniform(-a, a, size=(n_in, n_out))
self.b = np.zeros((n_out,))
    def forward(self, x):
        # forward pass: y = x @ W + b
        y = np.matmul(x, self.W) + self.b
        return y
    def backward(self, x, dy):
        # backward pass: given dy, compute the gradients for x, W and b
        dx = np.matmul(dy, self.W.transpose())
        self.dW = np.matmul(x.transpose(), dy)
        self.db = np.sum(dy, axis=0)
        return dx
def step(self, step_size):
self.W = self.W - step_size * self.dW
self.b = self.b - step_size * self.db
def __str__(self):
return 'Linear %dx%d' % self.W.shape
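The backward formulas follow from $y = xW + b$ with an upstream gradient $dy$ of shape (batch, n_out): $dx = dy\,W^\top$, $dW = x^\top dy$, and $db = \sum_{\text{batch}} dy$, which keeps the shapes at (batch, n_in), (n_in, n_out), and (n_out,) respectively.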
# Try the new class with some random values.
# Debugging tip: always choose a unique length for each
# dimension, so you'll get an error if you mix them up.
x = np.random.uniform(size=(3, 5))
layer = Linear(5, 7)
y = layer.forward(x)
dx = layer.backward(x, np.ones_like(y))
print('x:', x)
print('y:', y)
print('dx:', dx)
x: [[0.609244 0.224034 0.127317 0.662874 0.415835]
[0.104281 0.64191 0.890062 0.456009 0.195825]
[0.232494 0.172701 0.494333 0.466718 0.534599]]
y: [[-0.287955 0.469923 -0.212664 0.144343 -0.207398 -0.352429 -0.130105]
[-0.564896 0.176808 -0.116377 -0.196505 -0.088324 -0.335466 -0.17007 ]
[-0.193814 0.271203 -0.038256 0.12183 -0.122044 -0.415687 -0.189367]]
dx: [[-0.667074 -0.596145 -0.968056 -0.152265 0.451777]
[-0.667074 -0.596145 -0.968056 -0.152265 0.451777]
[-0.667074 -0.596145 -0.968056 -0.152265 0.451777]]
# Computes y = 1 / (1 + exp(-x)).
class Sigmoid:
def forward(self, x):
return sigmoid(x)
    def backward(self, x, dy):
        # backward pass: scale dy by the local sigmoid gradient
        return dy * sigmoid_grad(x)
    def step(self, step_size):
        pass  # Sigmoid has no parameters to update
def __str__(self):
return 'Sigmoid'
# try the new class with some random values
x = np.random.uniform(size=(3, 5))
layer = Sigmoid()
y = layer.forward(x)
dx = layer.backward(x, np.ones_like(y))
print('y:', y)
print('dx:', dx)
y: [[0.727463 0.52465 0.717638 0.671433 0.726193]
[0.586252 0.511077 0.507328 0.582837 0.564365]
[0.535155 0.7039 0.630621 0.682751 0.526597]]
dx: [[0.19826 0.249392 0.202634 0.220611 0.198837]
[0.242561 0.249877 0.249946 0.243138 0.245857]
[0.248764 0.208425 0.232938 0.216602 0.249293]]
# Computes y = max(0, x).
class ReLU:
def forward(self, x):
return relu(x)
    def backward(self, x, dy):
        # backward pass: dy passes through where x > 0 and is zeroed elsewhere
        return dy * relu_grad(x)
    def step(self, step_size):
        pass  # ReLU has no parameters to update
def __str__(self):
return 'ReLU'
# try the new class with some random values
x = np.random.uniform(-10, 10, size=(3, 5))
layer = ReLU()
y = layer.forward(x)
dx = layer.backward(x, np.ones_like(y))
print('y:', y)
print('dx:', dx)
y: [[0. 0. 0.993679 3.375293 0. ]
[9.677483 2.518926 8.79728 1.905408 7.152779]
[6.032617 0. 0. 1.818964 0. ]]
dx: [[0. 0. 1. 1. 0.]
[1. 1. 1. 1. 1.]
[1. 0. 0. 1. 0.]]
## Verify gradient computations for Linear
# test for dx
layer = Linear(5, 7)
def test_fn(x):
x = x.reshape(3, 5)
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(x):
x = x.reshape(3, 5)
# multiply the incoming dy gradient with a constant
return layer.backward(x, 2 * np.ones((3, 7))).flatten()
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=3 * 5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
# test for dW
x = np.random.uniform(size=(3, 5))
layer = Linear(5, 7)
def test_fn(w):
layer.W = w.reshape(5, 7)
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(w):
layer.W = w.reshape(5, 7)
# multiply the incoming dy gradient with a constant
layer.backward(x, 2 * np.ones((3, 7)))
return layer.dW.flatten()
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=5 * 7))
print("err on dW:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
# test for db
x = np.random.uniform(size=(3, 5,))
layer = Linear(5, 7)
def test_fn(b):
layer.b = b
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(b):
layer.b = b
# multiply the incoming dy gradient with a constant
layer.backward(x, 2 * np.ones((x.shape[0], 7)))
return layer.db
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=7))
print("err on db:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
err on dx: OK 1.2139252212818763e-06
err on dW: ERROR 1.7842868084499042e-05
err on db: OK 0.0
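The dW check most likely reports ERROR only because the fixed 1e-5 threshold is slightly too strict, not because the gradient is wrong: the test loss is linear in W, so the finite-difference truncation error is zero, and what remains is floating-point cancellation noise in check_grad's forward differences, which for function values of this magnitude lands right around 1e-5. Relaxing the threshold to, say, 1e-4, or scaling it by the gradient norm, should report OK.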
## Verify gradient computation for Sigmoid
# test for dx
layer = Sigmoid()
def test_fn(x):
# multiply the output with a constant to check if
# the gradient uses dy
return np.sum(2 * layer.forward(x))
def test_fn_grad(x):
# multiply the incoming dy gradient with a constant
return layer.backward(x, 2 * np.ones(x.shape))
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
err on dx: OK 7.485249618123588e-08
## Verify gradient computation for ReLU
# test for dx
layer = ReLU()
def test_fn(x):
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(x):
# multiply the incoming dy gradient with a constant
return layer.backward(x, 2 * np.ones(x.shape))
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(1, 10, size=5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
err on dx: OK 0.0
class Net:
def __init__(self, layers):
self.layers = layers
def forward(self, x):
# compute the forward pass for each layer
trace = []
for layer in self.layers:
# compute the forward pass
y = layer.forward(x)
# store the original input for the backward pass
trace.append((layer, x))
x = y
# return the final output and the history trace
return y, trace
def backward(self, trace, dy):
# compute the backward pass for each layer
for layer, x in trace[::-1]:
# compute the backward pass using the original input x
dy = layer.backward(x, dy)
def step(self, learning_rate):
        # apply the gradient descent update to each layer
for layer in self.layers:
layer.step(learning_rate)
def __str__(self):
return '\n'.join(str(l) for l in self.layers)
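Before training on real data, a quick shape check (not part of the original notebook) helps catch wiring mistakes; the layer sizes below are arbitrary, chosen to be distinct so that shape errors surface immediately.
# sketch: forward/backward shape check on a toy network
tiny = Net([Linear(5, 4), ReLU(), Linear(4, 1), Sigmoid()])
x_dbg = np.random.uniform(size=(3, 5))
y_dbg, trace_dbg = tiny.forward(x_dbg)
print('output shape:', y_dbg.shape)  # expect (3, 1)
tiny.backward(trace_dbg, np.ones_like(y_dbg))
print(tiny)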
# load the digits dataset
dataset = sklearn.datasets.load_digits()
digits_x, digits_y = dataset['data'], dataset['target']
# create a binary classification problem: digits 0-4 vs 5-9
digits_y = (digits_y < 5).astype(float)
# plot some of the digits
plt.figure(figsize=(10, 2))
plt.imshow(np.hstack([digits_x[i].reshape(8, 8) for i in range(10)]), cmap='gray')
plt.grid(False)
plt.tight_layout()
plt.axis('off')
# standardize the values to zero mean and unit variance
digits_x -= np.mean(digits_x)
digits_x /= np.std(digits_x)
# print some statistics
print('digits_x.shape:', digits_x.shape)
print('digits_y.shape:', digits_y.shape)
print('min, max values:', np.min(digits_x), np.max(digits_x))
print('labels:', np.unique(digits_y))
digits_x.shape: (1797, 64)
digits_y.shape: (1797,)
min, max values: -0.8117561971974786 1.847470154168513
labels: [0. 1.]
# make a 50%/50% train/test split
train_prop = 0.5
n_train = int(digits_x.shape[0] * train_prop)
# shuffle the images
idxs = np.random.permutation(digits_x.shape[0])
# split the shuffled data into train and test sets
x = {'train': digits_x[idxs[:n_train]],
'test': digits_x[idxs[n_train:]]}
y = {'train': digits_y[idxs[:n_train]],
'test': digits_y[idxs[n_train:]]}
print('Training samples:', x['train'].shape[0])
print('Test samples:', x['test'].shape[0])
Training samples: 898
Test samples: 899
def fit(net, x, y, epochs=25, learning_rate=0.001, mb_size=10):
# initialize the loss and accuracy history
loss_hist = {'train': [], 'test': []}
accuracy_hist = {'train': [], 'test': []}
for epoch in range(epochs):
# initialize the loss and accuracy for this epoch
loss = {'train': 0.0, 'test': 0.0}
accuracy = {'train': 0.0, 'test': 0.0}
# first train on training data, then evaluate on the test data
for phase in ('train', 'test'):
# compute the number of minibatches
steps = x[phase].shape[0] // mb_size
# loop over all minibatches
for step in range(steps):
# get the samples for the current minibatch
x_mb = x[phase][(step * mb_size):((step + 1) * mb_size)]
y_mb = y[phase][(step * mb_size):((step + 1) * mb_size), None]
# compute the forward pass through the network
pred_y, trace = net.forward(x_mb)
# compute the current loss and accuracy
loss[phase] += np.mean(bce_loss(y_mb, pred_y))
accuracy[phase] += np.mean((y_mb > 0.5) == (pred_y > 0.5))
# only update the network in the training phase
if phase == 'train':
# compute the gradient for the loss
dy = bce_loss_grad(y_mb, pred_y)
# backpropagate the gradient through the network
net.backward(trace, dy)
# update the weights
net.step(learning_rate)
# compute the mean loss and accuracy over all minibatches
loss[phase] = loss[phase] / steps
accuracy[phase] = accuracy[phase] / steps
# add statistics to history
loss_hist[phase].append(loss[phase])
accuracy_hist[phase].append(accuracy[phase])
print('Epoch %3d: loss[train]=%7.4f accuracy[train]=%7.4f loss[test]=%7.4f accuracy[test]=%7.4f' %
(epoch, loss['train'], accuracy['train'], loss['test'], accuracy['test']))
# plot the learning curves
plt.figure(figsize=(20, 5))
plt.subplot(1, 2, 1)
for phase in loss_hist:
plt.plot(loss_hist[phase], label=phase)
plt.title('BCE loss')
plt.xlabel('Epoch')
plt.legend()
plt.subplot(1, 2, 2)
for phase in accuracy_hist:
plt.plot(accuracy_hist[phase], label=phase)
plt.title('Accuracy')
plt.xlabel('Epoch')
plt.legend()
# construct network
net = Net([
Linear(64, 32),
ReLU(),
Linear(32, 1),
Sigmoid()])
# hyperparameters after tuning
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 32)
Epoch 0: loss[train]= 0.4707 accuracy[train]= 0.7734 loss[test]= 0.3186 accuracy[test]= 0.8705
Epoch 1: loss[train]= 0.2325 accuracy[train]= 0.9196 loss[test]= 0.2105 accuracy[test]= 0.9241
Epoch 2: loss[train]= 0.1401 accuracy[train]= 0.9487 loss[test]= 0.1802 accuracy[test]= 0.9342
Epoch 3: loss[train]= 0.1007 accuracy[train]= 0.9654 loss[test]= 0.1622 accuracy[test]= 0.9442
Epoch 4: loss[train]= 0.0789 accuracy[train]= 0.9732 loss[test]= 0.1552 accuracy[test]= 0.9442
Epoch 5: loss[train]= 0.0640 accuracy[train]= 0.9777 loss[test]= 0.1467 accuracy[test]= 0.9498
Epoch 6: loss[train]= 0.0524 accuracy[train]= 0.9833 loss[test]= 0.1370 accuracy[test]= 0.9554
Epoch 7: loss[train]= 0.0443 accuracy[train]= 0.9855 loss[test]= 0.1278 accuracy[test]= 0.9576
Epoch 8: loss[train]= 0.0365 accuracy[train]= 0.9911 loss[test]= 0.1194 accuracy[test]= 0.9632
Epoch 9: loss[train]= 0.0319 accuracy[train]= 0.9933 loss[test]= 0.1128 accuracy[test]= 0.9654
Epoch 10: loss[train]= 0.0275 accuracy[train]= 0.9955 loss[test]= 0.1096 accuracy[test]= 0.9665
Epoch 11: loss[train]= 0.0232 accuracy[train]= 0.9978 loss[test]= 0.1042 accuracy[test]= 0.9632
Epoch 12: loss[train]= 0.0207 accuracy[train]= 0.9978 loss[test]= 0.1034 accuracy[test]= 0.9654
Epoch 13: loss[train]= 0.0177 accuracy[train]= 0.9978 loss[test]= 0.1020 accuracy[test]= 0.9654
Epoch 14: loss[train]= 0.0157 accuracy[train]= 0.9989 loss[test]= 0.1018 accuracy[test]= 0.9654
Epoch 15: loss[train]= 0.0139 accuracy[train]= 1.0000 loss[test]= 0.1020 accuracy[test]= 0.9643
Epoch 16: loss[train]= 0.0123 accuracy[train]= 1.0000 loss[test]= 0.1021 accuracy[test]= 0.9643
Epoch 17: loss[train]= 0.0111 accuracy[train]= 1.0000 loss[test]= 0.1032 accuracy[test]= 0.9643
Epoch 18: loss[train]= 0.0100 accuracy[train]= 1.0000 loss[test]= 0.1033 accuracy[test]= 0.9643
Epoch 19: loss[train]= 0.0091 accuracy[train]= 1.0000 loss[test]= 0.1042 accuracy[test]= 0.9643
Epoch 20: loss[train]= 0.0083 accuracy[train]= 1.0000 loss[test]= 0.1042 accuracy[test]= 0.9643
Epoch 21: loss[train]= 0.0077 accuracy[train]= 1.0000 loss[test]= 0.1056 accuracy[test]= 0.9643
Epoch 22: loss[train]= 0.0071 accuracy[train]= 1.0000 loss[test]= 0.1059 accuracy[test]= 0.9643
Epoch 23: loss[train]= 0.0067 accuracy[train]= 1.0000 loss[test]= 0.1069 accuracy[test]= 0.9654
Epoch 24: loss[train]= 0.0061 accuracy[train]= 1.0000 loss[test]= 0.1072 accuracy[test]= 0.9643
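With the hidden ReLU layer, the training loss keeps falling (to 0.0061) while the test loss bottoms out around epoch 14 and then creeps back up, a mild overfitting signature; test accuracy settles near 96.4%.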
# construct network
net = Net([
Linear(64, 32),
Linear(32, 1),
Sigmoid()])
# hyperparameters after tuning
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 32)
Epoch 0: loss[train]= 0.5409 accuracy[train]= 0.7690 loss[test]= 0.3747 accuracy[test]= 0.8348
Epoch 1: loss[train]= 0.3535 accuracy[train]= 0.8583 loss[test]= 0.3219 accuracy[test]= 0.8605
Epoch 2: loss[train]= 0.3138 accuracy[train]= 0.8783 loss[test]= 0.3138 accuracy[test]= 0.8650
Epoch 3: loss[train]= 0.2977 accuracy[train]= 0.8795 loss[test]= 0.3128 accuracy[test]= 0.8705
Epoch 4: loss[train]= 0.2871 accuracy[train]= 0.8906 loss[test]= 0.3128 accuracy[test]= 0.8750
Epoch 5: loss[train]= 0.2796 accuracy[train]= 0.8940 loss[test]= 0.3128 accuracy[test]= 0.8750
Epoch 6: loss[train]= 0.2740 accuracy[train]= 0.8984 loss[test]= 0.3128 accuracy[test]= 0.8772
Epoch 7: loss[train]= 0.2696 accuracy[train]= 0.8996 loss[test]= 0.3127 accuracy[test]= 0.8783
Epoch 8: loss[train]= 0.2661 accuracy[train]= 0.9018 loss[test]= 0.3125 accuracy[test]= 0.8783
Epoch 9: loss[train]= 0.2631 accuracy[train]= 0.9029 loss[test]= 0.3123 accuracy[test]= 0.8772
Epoch 10: loss[train]= 0.2607 accuracy[train]= 0.9040 loss[test]= 0.3121 accuracy[test]= 0.8783
Epoch 11: loss[train]= 0.2585 accuracy[train]= 0.9051 loss[test]= 0.3119 accuracy[test]= 0.8772
Epoch 12: loss[train]= 0.2567 accuracy[train]= 0.9051 loss[test]= 0.3117 accuracy[test]= 0.8739
Epoch 13: loss[train]= 0.2550 accuracy[train]= 0.9062 loss[test]= 0.3115 accuracy[test]= 0.8739
Epoch 14: loss[train]= 0.2536 accuracy[train]= 0.9062 loss[test]= 0.3112 accuracy[test]= 0.8761
Epoch 15: loss[train]= 0.2523 accuracy[train]= 0.9062 loss[test]= 0.3110 accuracy[test]= 0.8761
Epoch 16: loss[train]= 0.2512 accuracy[train]= 0.9074 loss[test]= 0.3109 accuracy[test]= 0.8761
Epoch 17: loss[train]= 0.2501 accuracy[train]= 0.9096 loss[test]= 0.3107 accuracy[test]= 0.8761
Epoch 18: loss[train]= 0.2492 accuracy[train]= 0.9096 loss[test]= 0.3105 accuracy[test]= 0.8772
Epoch 19: loss[train]= 0.2483 accuracy[train]= 0.9096 loss[test]= 0.3104 accuracy[test]= 0.8783
Epoch 20: loss[train]= 0.2475 accuracy[train]= 0.9107 loss[test]= 0.3102 accuracy[test]= 0.8783
Epoch 21: loss[train]= 0.2467 accuracy[train]= 0.9107 loss[test]= 0.3101 accuracy[test]= 0.8783
Epoch 22: loss[train]= 0.2460 accuracy[train]= 0.9107 loss[test]= 0.3100 accuracy[test]= 0.8795
Epoch 23: loss[train]= 0.2454 accuracy[train]= 0.9107 loss[test]= 0.3099 accuracy[test]= 0.8795
Epoch 24: loss[train]= 0.2448 accuracy[train]= 0.9107 loss[test]= 0.3098 accuracy[test]= 0.8795
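Without a nonlinearity between them, the two Linear layers compose into a single affine map ($x W_1 W_2 + \dots$), so this network has exactly the representational power of logistic regression, and test accuracy stalls near 88% instead of 96%.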
# construct network
net = Net([
Linear(64, 1),
Sigmoid()])
# hyperparameters after tuning
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 32)
Epoch 0: loss[train]= 0.5437 accuracy[train]= 0.7266 loss[test]= 0.3806 accuracy[test]= 0.8371
Epoch 1: loss[train]= 0.3371 accuracy[train]= 0.8672 loss[test]= 0.3482 accuracy[test]= 0.8527
Epoch 2: loss[train]= 0.3018 accuracy[train]= 0.8929 loss[test]= 0.3336 accuracy[test]= 0.8594
Epoch 3: loss[train]= 0.2850 accuracy[train]= 0.8973 loss[test]= 0.3245 accuracy[test]= 0.8627
Epoch 4: loss[train]= 0.2748 accuracy[train]= 0.9029 loss[test]= 0.3185 accuracy[test]= 0.8694
Epoch 5: loss[train]= 0.2679 accuracy[train]= 0.9051 loss[test]= 0.3144 accuracy[test]= 0.8717
Epoch 6: loss[train]= 0.2630 accuracy[train]= 0.9062 loss[test]= 0.3115 accuracy[test]= 0.8761
Epoch 7: loss[train]= 0.2592 accuracy[train]= 0.9040 loss[test]= 0.3096 accuracy[test]= 0.8750
Epoch 8: loss[train]= 0.2563 accuracy[train]= 0.9029 loss[test]= 0.3082 accuracy[test]= 0.8772
Epoch 9: loss[train]= 0.2539 accuracy[train]= 0.9062 loss[test]= 0.3073 accuracy[test]= 0.8795
Epoch 10: loss[train]= 0.2520 accuracy[train]= 0.9062 loss[test]= 0.3067 accuracy[test]= 0.8795
Epoch 11: loss[train]= 0.2505 accuracy[train]= 0.9062 loss[test]= 0.3064 accuracy[test]= 0.8817
Epoch 12: loss[train]= 0.2491 accuracy[train]= 0.9062 loss[test]= 0.3063 accuracy[test]= 0.8806
Epoch 13: loss[train]= 0.2480 accuracy[train]= 0.9051 loss[test]= 0.3063 accuracy[test]= 0.8795
Epoch 14: loss[train]= 0.2470 accuracy[train]= 0.9062 loss[test]= 0.3064 accuracy[test]= 0.8806
Epoch 15: loss[train]= 0.2462 accuracy[train]= 0.9062 loss[test]= 0.3065 accuracy[test]= 0.8795
Epoch 16: loss[train]= 0.2455 accuracy[train]= 0.9074 loss[test]= 0.3068 accuracy[test]= 0.8795
Epoch 17: loss[train]= 0.2448 accuracy[train]= 0.9085 loss[test]= 0.3071 accuracy[test]= 0.8783
Epoch 18: loss[train]= 0.2443 accuracy[train]= 0.9096 loss[test]= 0.3074 accuracy[test]= 0.8783
Epoch 19: loss[train]= 0.2438 accuracy[train]= 0.9096 loss[test]= 0.3077 accuracy[test]= 0.8795
Epoch 20: loss[train]= 0.2433 accuracy[train]= 0.9107 loss[test]= 0.3081 accuracy[test]= 0.8795
Epoch 21: loss[train]= 0.2429 accuracy[train]= 0.9107 loss[test]= 0.3085 accuracy[test]= 0.8795
Epoch 22: loss[train]= 0.2425 accuracy[train]= 0.9096 loss[test]= 0.3088 accuracy[test]= 0.8795
Epoch 23: loss[train]= 0.2422 accuracy[train]= 0.9096 loss[test]= 0.3092 accuracy[test]= 0.8795
Epoch 24: loss[train]= 0.2419 accuracy[train]= 0.9107 loss[test]= 0.3096 accuracy[test]= 0.8806
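As expected, plain logistic regression (one Linear layer plus a Sigmoid) lands at essentially the same 88% test accuracy as the two-layer linear model above, confirming that the missing nonlinearity, not the parameter count, was the bottleneck there.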
net = Net([
Linear(64, 32),
ReLU(),
Linear(32, 16),
ReLU(),
Linear(16, 8),
ReLU(),
Linear(8, 4),
ReLU(),
Linear(4, 1),
Sigmoid()])
# hyperparameters after tuning
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 32)
Epoch 0: loss[train]= 0.6929 accuracy[train]= 0.4978 loss[test]= 0.6922 accuracy[test]= 0.5078
Epoch 1: loss[train]= 0.6910 accuracy[train]= 0.5513 loss[test]= 0.6874 accuracy[test]= 0.7969
Epoch 2: loss[train]= 0.6694 accuracy[train]= 0.6518 loss[test]= 0.5829 accuracy[test]= 0.7913
Epoch 3: loss[train]= 0.5031 accuracy[train]= 0.7824 loss[test]= 0.7568 accuracy[test]= 0.6384
Epoch 4: loss[train]= 0.3420 accuracy[train]= 0.8728 loss[test]= 0.1940 accuracy[test]= 0.9353
Epoch 5: loss[train]= 0.2038 accuracy[train]= 0.9230 loss[test]= 0.2431 accuracy[test]= 0.9152
Epoch 6: loss[train]= 0.1199 accuracy[train]= 0.9598 loss[test]= 0.1915 accuracy[test]= 0.9342
Epoch 7: loss[train]= 0.0874 accuracy[train]= 0.9699 loss[test]= 0.3728 accuracy[test]= 0.8828
Epoch 8: loss[train]= 0.0445 accuracy[train]= 0.9821 loss[test]= 0.0969 accuracy[test]= 0.9654
Epoch 9: loss[train]= 0.1485 accuracy[train]= 0.9699 loss[test]= 0.0987 accuracy[test]= 0.9721
Epoch 10: loss[train]= 0.0301 accuracy[train]= 0.9933 loss[test]= 0.0815 accuracy[test]= 0.9754
Epoch 11: loss[train]= 0.0113 accuracy[train]= 0.9978 loss[test]= 0.0872 accuracy[test]= 0.9699
Epoch 12: loss[train]= 0.0082 accuracy[train]= 0.9967 loss[test]= 0.0895 accuracy[test]= 0.9766
Epoch 13: loss[train]= 0.0033 accuracy[train]= 1.0000 loss[test]= 0.0907 accuracy[test]= 0.9788
Epoch 14: loss[train]= 0.0017 accuracy[train]= 1.0000 loss[test]= 0.1091 accuracy[test]= 0.9721
Epoch 15: loss[train]= 0.0014 accuracy[train]= 1.0000 loss[test]= 0.1129 accuracy[test]= 0.9732
Epoch 16: loss[train]= 0.0011 accuracy[train]= 1.0000 loss[test]= 0.1197 accuracy[test]= 0.9710
Epoch 17: loss[train]= 0.0008 accuracy[train]= 1.0000 loss[test]= 0.1197 accuracy[test]= 0.9721
Epoch 18: loss[train]= 0.0007 accuracy[train]= 1.0000 loss[test]= 0.1225 accuracy[test]= 0.9721
Epoch 19: loss[train]= 0.0007 accuracy[train]= 1.0000 loss[test]= 0.1192 accuracy[test]= 0.9743
Epoch 20: loss[train]= 0.0006 accuracy[train]= 1.0000 loss[test]= 0.1221 accuracy[test]= 0.9754
Epoch 21: loss[train]= 0.0004 accuracy[train]= 1.0000 loss[test]= 0.1221 accuracy[test]= 0.9766
Epoch 22: loss[train]= 0.0004 accuracy[train]= 1.0000 loss[test]= 0.1225 accuracy[test]= 0.9766
Epoch 23: loss[train]= 0.0003 accuracy[train]= 1.0000 loss[test]= 0.1232 accuracy[test]= 0.9766
Epoch 24: loss[train]= 0.0003 accuracy[train]= 1.0000 loss[test]= 0.1241 accuracy[test]= 0.9766
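The five-layer ReLU network trains less smoothly at first (note the test-loss spikes at epochs 3 and 7) but ends with the best test accuracy of the four runs, about 97.7%; its near-zero training loss alongside a slowly rising test loss again points to some overfitting.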