%matplotlib inline
import numpy as np
import scipy.optimize
import sklearn.datasets
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True, precision=6, linewidth=200)
plt.style.use('ggplot')
def f(x, y):
return x ** 2 + y ** 2 + x * (y + 2) + np.cos(3 * x)
def grad_x_f(x, y):
return 2 * x - 3 * np.sin(3 * x) + y + 2
def grad_y_f(x, y):
return x + 2 * y
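As a quick sanity check (not part of the original exercise), the hand-derived gradients can be compared against finite differences with scipy.optimize.check_grad; a small error confirms the formulas above. The test point is arbitrary.
# sketch: verify grad_x_f and grad_y_f numerically at an arbitrary point
def f_flat(p):
    return f(p[0], p[1])
def grad_f_flat(p):
    return np.array([grad_x_f(p[0], p[1]), grad_y_f(p[0], p[1])])
err = scipy.optimize.check_grad(f_flat, grad_f_flat, np.array([1.5, -0.5]))
print('check_grad error:', err)  # expect a small value, around 1e-6 or below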
def plot_f_contours():
xx, yy = np.meshgrid(np.linspace(-5, 5), np.linspace(-5, 5))
zz = f(xx, yy)
plt.contourf(xx, yy, zz, 50)
plt.contour(xx, yy, zz, 50, alpha=0.2, colors='black', linestyles='solid')
plt.xlabel('x')
plt.ylabel('y')
plt.figure(figsize=(10, 7))
plot_f_contours()
def optimize_f(x, y, step_size, steps):
# keep track of the parameters we tried so far
x_hist, y_hist = [x], [y]
# run gradient descent for the number of steps
for step in range(steps):
# compute the gradients at the current point
dx = grad_x_f(x, y)
dy = grad_y_f(x, y)
# apply the gradient descent updates to x and y
x = x - (step_size * dx)
y = y - (step_size * dy)
# store the new parameters
x_hist.append(x)
y_hist.append(y)
return x, y, f(x, y), x_hist, y_hist
# helper function that plots the results of the gradient descent optimization
def plot_gradient_descent_results(x, y, val, x_hist, y_hist):
# plot the path on the contour plot
plt.figure(figsize=(20, 7))
plt.subplot(1, 2, 1)
plot_f_contours()
plt.plot(x_hist, y_hist, '.-')
# plot the learning curve
plt.subplot(1, 2, 2)
plt.plot(f(np.array(x_hist), np.array(y_hist)), '.r-')
    plt.title('Minimum value: %f' % val)
results = optimize_f(x=3, y=2, step_size=0.1, steps=10)
plot_gradient_descent_results(*results)
# tuned: a larger step size and more steps reach a better optimum
results = optimize_f(x=3, y=2, step_size=0.16, steps=100)
plot_gradient_descent_results(*results)
def optimize_f(x, y, step_size, steps, decay=1.0):
# keep track of the parameters we tried so far
x_hist, y_hist = [x], [y]
# run gradient descent for the number of steps
for step in range(steps):
# compute the gradients at this point
dx = grad_x_f(x, y)
dy = grad_y_f(x, y)
        # apply the update, scaling the step size by decay**step
        x = x - (step_size * (decay ** step) * dx)
        y = y - (step_size * (decay ** step) * dy)
# store the new parameters
x_hist.append(x)
y_hist.append(y)
return x, y, f(x, y), x_hist, y_hist
# tuned parameters (decay is left at its default of 1.0 here; see the sketch below)
results = optimize_f(x=3, y=2, step_size=0.16, steps=100)
plot_gradient_descent_results(*results)
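The decay argument is not exercised in the run above. One illustrative setting (step size and decay chosen arbitrarily, not tuned): a larger initial step with geometric decay takes big moves early and then settles.
# illustrative only: geometric step-size decay
results = optimize_f(x=3, y=2, step_size=0.3, steps=100, decay=0.95)
plot_gradient_descent_results(*results)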
def sigmoid(x):
    sig = 1 / (1 + np.exp(-x))  # the logistic function
return sig
def sigmoid_grad(x):
    grad = sigmoid(x) * (1 - sigmoid(x))  # derivative of the logistic function
return grad
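For reference, the gradient formula follows from the quotient rule: $\sigma'(x) = \frac{e^{-x}}{(1+e^{-x})^2} = \sigma(x)\,(1-\sigma(x))$.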
# try with a random input
x = np.random.uniform(-10, 10, size=5)
print('x:', x)
print('sigmoid(x):', sigmoid(x))
print('sigmoid_grad(x):', sigmoid_grad(x))
x: [-4.44383 -9.736057 3.312347 0.975021 0.331723]
sigmoid(x): [0.011614 0.000059 0.96485 0.726119 0.582179]
sigmoid_grad(x): [0.011479 0.000059 0.033915 0.19887 0.243247]
# start with some random inputs
x = np.random.uniform(-2, 2, size=5)
# compute the symbolic gradient
print('Symbolic', sigmoid_grad(x))
# numerical gradient via a centered finite difference
def sig_numerical(x):
e = 0.0001
num = (sigmoid(x+(0.5*e)) - sigmoid(x-(0.5*e)))/e
return num
print('Numerical', sig_numerical(x))
Symbolic [0.202839 0.158799 0.178058 0.143906 0.21232 ]
Numerical [0.202839 0.158799 0.178058 0.143906 0.21232 ]
def relu(x):
return np.maximum(x,0)
def relu_grad(x):
return np.greater(x,0).astype(int)
# try with a random input
x = np.random.uniform(-10, 10, size=5)
print('x:', x)
print('relu(x):', relu(x))
print('relu_grad(x):', relu_grad(x))
print()
# compare the symbolic and numerical gradients
def relu_numerical(x):
e = 0.0001
num = (relu(x+(0.5*e)) - relu(x-(0.5*e)))/e
return num
print('Numerical', relu_numerical(x))
x: [ 0.189314 3.448005 -8.834715 -8.311136 -7.027012]
relu(x): [0.189314 3.448005 0. 0. 0. ]
relu_grad(x): [1 1 0 0 0]
Numerical [1. 1. 0. 0. 0.]
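Note that ReLU is not differentiable at x = 0; relu_grad uses the common subgradient convention relu_grad(0) = 0, while the centered difference would return exactly 0.5 there, so the symbolic and numerical values agree only away from the kink.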
x = np.linspace(-10, 10, 100)
plt.figure(figsize=(15, 8))
plt.subplot(2, 2, 1)
plt.plot(x, sigmoid(x), label='Sigmoid')
plt.xlabel('x')
plt.legend(loc='upper left')
plt.subplot(2, 2, 2)
plt.plot(x, relu(x), label='ReLU')
plt.xlabel('x')
plt.legend(loc='upper left')
plt.subplot(2, 2, 3)
plt.plot(x, sigmoid_grad(x), label='Sigmoid gradient')
plt.xlabel('x')
plt.legend(loc='upper left')
plt.subplot(2, 2, 4)
plt.plot(x, relu_grad(x), label='ReLU gradient')
plt.xlabel('x')
plt.legend(loc='upper left');
def bce_loss(y, y_hat):
return -( y * np.log(y_hat) + (1-y) * np.log(1-y_hat) )
def bce_loss_grad(y, y_hat):
return (y_hat - y) / (y_hat - y_hat ** 2)
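The gradient comes from differentiating the loss with respect to $\hat{y}$: $\frac{\partial L}{\partial \hat{y}} = -\frac{y}{\hat{y}} + \frac{1-y}{1-\hat{y}} = \frac{\hat{y}-y}{\hat{y}(1-\hat{y})}$, which matches the denominator y_hat - y_hat ** 2 in the code.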
# try with some random inputs
y = np.random.randint(2, size=5)
y_hat = np.random.uniform(0, 1, size=5)
print('y:', y)
print('y_hat:', y_hat)
print('bceloss(y, y_hat):', bce_loss(y, y_hat))
print('bceloss_grad(y, y_hat):', bce_loss_grad(y, y_hat))
# compare the symbolic and numerical gradients
def numerical_gradient_bce(y, y_hat):
e = 0.0001
y_hat_increment = y_hat + e
y_hat_decrement = y_hat - e
return ( bce_loss(y, y_hat_increment) - bce_loss(y, y_hat_decrement) ) / (2 * e)
print('numerical gradient:', numerical_gradient_bce(y, y_hat))
y: [0 1 1 1 0]
y_hat: [0.528383 0.625416 0.978426 0.610352 0.823556]
bceloss(y, y_hat): [0.751588 0.469338 0.02181 0.493719 1.734753]
bceloss_grad(y, y_hat): [ 2.120364 -1.598936 -1.02205 -1.638399 5.667528]
numerical gradient: [ 2.120364 -1.598936 -1.02205 -1.638399 5.667528]
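One caveat worth noting: bce_loss is undefined when a prediction hits exactly 0 or 1 (log of zero). A common safeguard, not required by this exercise, is to clip predictions first; a minimal sketch:
# sketch: clip predictions away from 0 and 1 before taking logs
def bce_loss_safe(y, y_hat, eps=1e-12):
    y_hat = np.clip(y_hat, eps, 1 - eps)
    return -(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))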
# initialize parameters
w = np.random.uniform(size=5)
b = np.random.rand()
# implement the model
def fn(x, y):
    # forward: compute h, y_hat, loss
    h = np.dot(x, w) + b
    y_hat = sigmoid(h)
    loss = bce_loss(y, y_hat)
    # backward: compute grad_y_hat, grad_h, grad_x via the chain rule
    grad_y_hat = bce_loss_grad(y, y_hat)
    grad_h = grad_y_hat * sigmoid_grad(h)
    grad_x = grad_h * w
    return loss, grad_x
# test with a random input
x = np.random.uniform(size=5)
y = 1
loss, grad_x = fn(x, y)
print("Loss", loss)
print("Gradient", grad_x)
Loss 0.10379424098359609
Gradient [-0.053601 -0.046361 -0.016126 -0.027251 -0.070656]
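A useful check on this backward pass: the BCE gradient and the sigmoid gradient cancel each other's $\hat{y}(1-\hat{y})$ factor, so $\frac{\partial L}{\partial h} = \frac{\hat{y}-y}{\hat{y}(1-\hat{y})} \cdot \hat{y}(1-\hat{y}) = \hat{y}-y$, the familiar logistic-regression residual.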
# start with some random inputs
x = np.random.uniform(size=5)
y = 1
# set epsilon to a small value
eps = 0.00001
numerical_grad = np.zeros(x.shape)
# compute the gradient for each element of x separately
for i in range(len(x)):
# compute inputs at -eps/2 and +eps/2
x_a, x_b = x.copy(), x.copy()
x_a[i] += eps / 2
x_b[i] -= eps / 2
# compute the gradient for this element
loss_a, _ = fn(x_a, y)
loss_b, _ = fn(x_b, y)
numerical_grad[i] = (loss_a - loss_b) / eps
# compute the symbolic gradient
loss, symbolic_grad = fn(x, y)
print("Symbolic gradient")
print(symbolic_grad)
print("Numerical gradient")
print(numerical_grad)
Symbolic gradient
[-0.073778 -0.063813 -0.022197 -0.037508 -0.097252]
Numerical gradient
[-0.073778 -0.063813 -0.022197 -0.037508 -0.097252]
# Computes y = x @ W + b.
class Linear:
def __init__(self, n_in, n_out):
# initialize the weights randomly,
# using the Xavier initialization rule for scale
        a = np.sqrt(6 / (n_in + n_out))
self.W = np.random.uniform(-a, a, size=(n_in, n_out))
self.b = np.zeros((n_out,))
    def forward(self, x):
        # forward pass: y = x @ W + b
        y = np.matmul(x, self.W) + self.b
        return y
    def backward(self, x, dy):
        # backward pass: given dy, compute the gradients for x, W and b
        dx = np.matmul(dy, self.W.transpose())
        self.dW = np.matmul(x.transpose(), dy)
        self.db = np.sum(dy, axis=0)
        return dx
def step(self, step_size):
self.W = self.W - step_size * self.dW
self.b = self.b - step_size * self.db
def __str__(self):
return 'Linear %dx%d' % self.W.shape
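The backward formulas follow from $y = xW + b$ with an upstream gradient $dy$ of shape (batch, n_out): $dx = dy\,W^\top$, $dW = x^\top dy$, and $db = \sum_{\text{batch}} dy$, which keeps the shapes at (batch, n_in), (n_in, n_out), and (n_out,) respectively.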
# Try the new class with some random values.
# Debugging tip: always choose a unique length for each
# dimension, so you'll get an error if you mix them up.
x = np.random.uniform(size=(3, 5))
layer = Linear(5, 7)
y = layer.forward(x)
dx = layer.backward(x, np.ones_like(y))
print('x:', x)
print('y:', y)
print('dx:', dx)
x: [[0.609244 0.224034 0.127317 0.662874 0.415835]
[0.104281 0.64191 0.890062 0.456009 0.195825]
[0.232494 0.172701 0.494333 0.466718 0.534599]]
y: [[-0.287955 0.469923 -0.212664 0.144343 -0.207398 -0.352429 -0.130105]
[-0.564896 0.176808 -0.116377 -0.196505 -0.088324 -0.335466 -0.17007 ]
[-0.193814 0.271203 -0.038256 0.12183 -0.122044 -0.415687 -0.189367]]
dx: [[-0.667074 -0.596145 -0.968056 -0.152265 0.451777]
[-0.667074 -0.596145 -0.968056 -0.152265 0.451777]
[-0.667074 -0.596145 -0.968056 -0.152265 0.451777]]
# Computes y = 1 / (1 + exp(-x)).
class Sigmoid:
def forward(self, x):
return sigmoid(x)
    def backward(self, x, dy):
        # backward pass: scale dy by the local sigmoid gradient
        return dy * sigmoid_grad(x)
    def step(self, step_size):
        pass  # Sigmoid has no parameters to update
def __str__(self):
return 'Sigmoid'
# try the new class with some random values
x = np.random.uniform(size=(3, 5))
layer = Sigmoid()
y = layer.forward(x)
dx = layer.backward(x, np.ones_like(y))
print('y:', y)
print('dx:', dx)
y: [[0.727463 0.52465 0.717638 0.671433 0.726193]
[0.586252 0.511077 0.507328 0.582837 0.564365]
[0.535155 0.7039 0.630621 0.682751 0.526597]]
dx: [[0.19826 0.249392 0.202634 0.220611 0.198837]
[0.242561 0.249877 0.249946 0.243138 0.245857]
[0.248764 0.208425 0.232938 0.216602 0.249293]]
# Computes y = max(0, x).
class ReLU:
def forward(self, x):
return relu(x)
    def backward(self, x, dy):
        # backward pass: dy passes through where x > 0 and is zeroed elsewhere
        return dy * relu_grad(x)
    def step(self, step_size):
        pass  # ReLU has no parameters to update
def __str__(self):
return 'ReLU'
# try the new class with some random values
x = np.random.uniform(-10, 10, size=(3, 5))
layer = ReLU()
y = layer.forward(x)
dx = layer.backward(x, np.ones_like(y))
print('y:', y)
print('dx:', dx)
y: [[0. 0. 0.993679 3.375293 0. ]
[9.677483 2.518926 8.79728 1.905408 7.152779]
[6.032617 0. 0. 1.818964 0. ]]
dx: [[0. 0. 1. 1. 0.]
[1. 1. 1. 1. 1.]
[1. 0. 0. 1. 0.]]
## Verify gradient computations for Linear
# test for dx
layer = Linear(5, 7)
def test_fn(x):
x = x.reshape(3, 5)
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(x):
x = x.reshape(3, 5)
# multiply the incoming dy gradient with a constant
return layer.backward(x, 2 * np.ones((3, 7))).flatten()
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=3 * 5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
# test for dW
x = np.random.uniform(size=(3, 5))
layer = Linear(5, 7)
def test_fn(w):
layer.W = w.reshape(5, 7)
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(w):
layer.W = w.reshape(5, 7)
# multiply the incoming dy gradient with a constant
layer.backward(x, 2 * np.ones((3, 7)))
return layer.dW.flatten()
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=5 * 7))
print("err on dW:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
# test for db
x = np.random.uniform(size=(3, 5,))
layer = Linear(5, 7)
def test_fn(b):
layer.b = b
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(b):
layer.b = b
# multiply the incoming dy gradient with a constant
layer.backward(x, 2 * np.ones((x.shape[0], 7)))
return layer.db
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=7))
print("err on db:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
err on dx: OK 1.2139252212818763e-06
err on dW: ERROR 1.7842868084499042e-05
err on db: OK 0.0
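The dW check most likely reports ERROR only because the fixed 1e-5 threshold is slightly too strict, not because the gradient is wrong: the test loss is linear in W, so the finite-difference truncation error is zero, and what remains is floating-point cancellation noise in check_grad's forward differences, which for function values of this magnitude lands right around 1e-5. Relaxing the threshold to, say, 1e-4, or scaling it by the gradient norm, should report OK.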
## Verify gradient computation for Sigmoid
# test for dx
layer = Sigmoid()
def test_fn(x):
# multiply the output with a constant to check if
# the gradient uses dy
return np.sum(2 * layer.forward(x))
def test_fn_grad(x):
# multiply the incoming dy gradient with a constant
return layer.backward(x, 2 * np.ones(x.shape))
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
err on dx: OK 7.485249618123588e-08
## Verify gradient computation for ReLU
# test for dx
layer = ReLU()
def test_fn(x):
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(x):
# multiply the incoming dy gradient with a constant
return layer.backward(x, 2 * np.ones(x.shape))
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(1, 10, size=5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
err on dx: OK 0.0
class Net:
def __init__(self, layers):
self.layers = layers
def forward(self, x):
# compute the forward pass for each layer
trace = []
for layer in self.layers:
# compute the forward pass
y = layer.forward(x)
# store the original input for the backward pass
trace.append((layer, x))
x = y
# return the final output and the history trace
return y, trace
def backward(self, trace, dy):
# compute the backward pass for each layer
for layer, x in trace[::-1]:
# compute the backward pass using the original input x
dy = layer.backward(x, dy)
def step(self, learning_rate):
        # apply the gradient descent update to each layer
for layer in self.layers:
layer.step(learning_rate)
def __str__(self):
return '\n'.join(str(l) for l in self.layers)
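Before training on real data, a quick shape check (not part of the original notebook) helps catch wiring mistakes; the layer sizes below are arbitrary, chosen to be distinct so that shape errors surface immediately.
# sketch: forward/backward shape check on a toy network
tiny = Net([Linear(5, 4), ReLU(), Linear(4, 1), Sigmoid()])
x_dbg = np.random.uniform(size=(3, 5))
y_dbg, trace_dbg = tiny.forward(x_dbg)
print('output shape:', y_dbg.shape)  # expect (3, 1)
tiny.backward(trace_dbg, np.ones_like(y_dbg))
print(tiny)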
# load the digits dataset
dataset = sklearn.datasets.load_digits()
digits_x, digits_y = dataset['data'], dataset['target']
# create a binary classification problem: digits 0-4 vs 5-9
digits_y = (digits_y < 5).astype(float)
# plot some of the digits
plt.figure(figsize=(10, 2))
plt.imshow(np.hstack([digits_x[i].reshape(8, 8) for i in range(10)]), cmap='gray')
plt.grid(False)
plt.tight_layout()
plt.axis('off')
# standardize the values to zero mean and unit variance
digits_x -= np.mean(digits_x)
digits_x /= np.std(digits_x)
# print some statistics
print('digits_x.shape:', digits_x.shape)
print('digits_y.shape:', digits_y.shape)
print('min, max values:', np.min(digits_x), np.max(digits_x))
print('labels:', np.unique(digits_y))
digits_x.shape: (1797, 64)
digits_y.shape: (1797,)
min, max values: -0.8117561971974786 1.847470154168513
labels: [0. 1.]
# make a 50%/50% train/test split
train_prop = 0.5
n_train = int(digits_x.shape[0] * train_prop)
# shuffle the images
idxs = np.random.permutation(digits_x.shape[0])
# split the shuffled data into train and test sets
x = {'train': digits_x[idxs[:n_train]],
'test': digits_x[idxs[n_train:]]}
y = {'train': digits_y[idxs[:n_train]],
'test': digits_y[idxs[n_train:]]}
print('Training samples:', x['train'].shape[0])
print('Test samples:', x['test'].shape[0])
Training samples: 898
Test samples: 899
def fit(net, x, y, epochs=25, learning_rate=0.001, mb_size=10):
# initialize the loss and accuracy history
loss_hist = {'train': [], 'test': []}
accuracy_hist = {'train': [], 'test': []}
for epoch in range(epochs):
# initialize the loss and accuracy for this epoch
loss = {'train': 0.0, 'test': 0.0}
accuracy = {'train': 0.0, 'test': 0.0}
# first train on training data, then evaluate on the test data
for phase in ('train', 'test'):
# compute the number of minibatches
steps = x[phase].shape[0] // mb_size
# loop over all minibatches
for step in range(steps):
# get the samples for the current minibatch
x_mb = x[phase][(step * mb_size):((step + 1) * mb_size)]
y_mb = y[phase][(step * mb_size):((step + 1) * mb_size), None]
# compute the forward pass through the network
pred_y, trace = net.forward(x_mb)
# compute the current loss and accuracy
loss[phase] += np.mean(bce_loss(y_mb, pred_y))
accuracy[phase] += np.mean((y_mb > 0.5) == (pred_y > 0.5))
# only update the network in the training phase
if phase == 'train':
# compute the gradient for the loss
dy = bce_loss_grad(y_mb, pred_y)
# backpropagate the gradient through the network
net.backward(trace, dy)
# update the weights
net.step(learning_rate)
# compute the mean loss and accuracy over all minibatches
loss[phase] = loss[phase] / steps
accuracy[phase] = accuracy[phase] / steps
# add statistics to history
loss_hist[phase].append(loss[phase])
accuracy_hist[phase].append(accuracy[phase])
print('Epoch %3d: loss[train]=%7.4f accuracy[train]=%7.4f loss[test]=%7.4f accuracy[test]=%7.4f' %
(epoch, loss['train'], accuracy['train'], loss['test'], accuracy['test']))
# plot the learning curves
plt.figure(figsize=(20, 5))
plt.subplot(1, 2, 1)
for phase in loss_hist:
plt.plot(loss_hist[phase], label=phase)
plt.title('BCE loss')
plt.xlabel('Epoch')
plt.legend()
plt.subplot(1, 2, 2)
for phase in accuracy_hist:
plt.plot(accuracy_hist[phase], label=phase)
plt.title('Accuracy')
plt.xlabel('Epoch')
plt.legend()
# construct network
net = Net([
Linear(64, 32),
ReLU(),
Linear(32, 1),
Sigmoid()])
# hyperparameters after tuning
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 32)
Epoch 0: loss[train]= 0.4707 accuracy[train]= 0.7734 loss[test]= 0.3186 accuracy[test]= 0.8705
Epoch 1: loss[train]= 0.2325 accuracy[train]= 0.9196 loss[test]= 0.2105 accuracy[test]= 0.9241
Epoch 2: loss[train]= 0.1401 accuracy[train]= 0.9487 loss[test]= 0.1802 accuracy[test]= 0.9342
Epoch 3: loss[train]= 0.1007 accuracy[train]= 0.9654 loss[test]= 0.1622 accuracy[test]= 0.9442
Epoch 4: loss[train]= 0.0789 accuracy[train]= 0.9732 loss[test]= 0.1552 accuracy[test]= 0.9442
Epoch 5: loss[train]= 0.0640 accuracy[train]= 0.9777 loss[test]= 0.1467 accuracy[test]= 0.9498
Epoch 6: loss[train]= 0.0524 accuracy[train]= 0.9833 loss[test]= 0.1370 accuracy[test]= 0.9554
Epoch 7: loss[train]= 0.0443 accuracy[train]= 0.9855 loss[test]= 0.1278 accuracy[test]= 0.9576
Epoch 8: loss[train]= 0.0365 accuracy[train]= 0.9911 loss[test]= 0.1194 accuracy[test]= 0.9632
Epoch 9: loss[train]= 0.0319 accuracy[train]= 0.9933 loss[test]= 0.1128 accuracy[test]= 0.9654
Epoch 10: loss[train]= 0.0275 accuracy[train]= 0.9955 loss[test]= 0.1096 accuracy[test]= 0.9665
Epoch 11: loss[train]= 0.0232 accuracy[train]= 0.9978 loss[test]= 0.1042 accuracy[test]= 0.9632
Epoch 12: loss[train]= 0.0207 accuracy[train]= 0.9978 loss[test]= 0.1034 accuracy[test]= 0.9654
Epoch 13: loss[train]= 0.0177 accuracy[train]= 0.9978 loss[test]= 0.1020 accuracy[test]= 0.9654
Epoch 14: loss[train]= 0.0157 accuracy[train]= 0.9989 loss[test]= 0.1018 accuracy[test]= 0.9654
Epoch 15: loss[train]= 0.0139 accuracy[train]= 1.0000 loss[test]= 0.1020 accuracy[test]= 0.9643
Epoch 16: loss[train]= 0.0123 accuracy[train]= 1.0000 loss[test]= 0.1021 accuracy[test]= 0.9643
Epoch 17: loss[train]= 0.0111 accuracy[train]= 1.0000 loss[test]= 0.1032 accuracy[test]= 0.9643
Epoch 18: loss[train]= 0.0100 accuracy[train]= 1.0000 loss[test]= 0.1033 accuracy[test]= 0.9643
Epoch 19: loss[train]= 0.0091 accuracy[train]= 1.0000 loss[test]= 0.1042 accuracy[test]= 0.9643
Epoch 20: loss[train]= 0.0083 accuracy[train]= 1.0000 loss[test]= 0.1042 accuracy[test]= 0.9643
Epoch 21: loss[train]= 0.0077 accuracy[train]= 1.0000 loss[test]= 0.1056 accuracy[test]= 0.9643
Epoch 22: loss[train]= 0.0071 accuracy[train]= 1.0000 loss[test]= 0.1059 accuracy[test]= 0.9643
Epoch 23: loss[train]= 0.0067 accuracy[train]= 1.0000 loss[test]= 0.1069 accuracy[test]= 0.9654
Epoch 24: loss[train]= 0.0061 accuracy[train]= 1.0000 loss[test]= 0.1072 accuracy[test]= 0.9643
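With the hidden ReLU layer, the training loss keeps falling (to 0.0061) while the test loss bottoms out around epoch 14 and then creeps back up, a mild overfitting signature; test accuracy settles near 96.4%.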
# construct network
net = Net([
Linear(64, 32),
Linear(32, 1),
Sigmoid()])
# hyperparameters after tuning
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 32)
Epoch 0: loss[train]= 0.5409 accuracy[train]= 0.7690 loss[test]= 0.3747 accuracy[test]= 0.8348
Epoch 1: loss[train]= 0.3535 accuracy[train]= 0.8583 loss[test]= 0.3219 accuracy[test]= 0.8605
Epoch 2: loss[train]= 0.3138 accuracy[train]= 0.8783 loss[test]= 0.3138 accuracy[test]= 0.8650
Epoch 3: loss[train]= 0.2977 accuracy[train]= 0.8795 loss[test]= 0.3128 accuracy[test]= 0.8705
Epoch 4: loss[train]= 0.2871 accuracy[train]= 0.8906 loss[test]= 0.3128 accuracy[test]= 0.8750
Epoch 5: loss[train]= 0.2796 accuracy[train]= 0.8940 loss[test]= 0.3128 accuracy[test]= 0.8750
Epoch 6: loss[train]= 0.2740 accuracy[train]= 0.8984 loss[test]= 0.3128 accuracy[test]= 0.8772
Epoch 7: loss[train]= 0.2696 accuracy[train]= 0.8996 loss[test]= 0.3127 accuracy[test]= 0.8783
Epoch 8: loss[train]= 0.2661 accuracy[train]= 0.9018 loss[test]= 0.3125 accuracy[test]= 0.8783
Epoch 9: loss[train]= 0.2631 accuracy[train]= 0.9029 loss[test]= 0.3123 accuracy[test]= 0.8772
Epoch 10: loss[train]= 0.2607 accuracy[train]= 0.9040 loss[test]= 0.3121 accuracy[test]= 0.8783
Epoch 11: loss[train]= 0.2585 accuracy[train]= 0.9051 loss[test]= 0.3119 accuracy[test]= 0.8772
Epoch 12: loss[train]= 0.2567 accuracy[train]= 0.9051 loss[test]= 0.3117 accuracy[test]= 0.8739
Epoch 13: loss[train]= 0.2550 accuracy[train]= 0.9062 loss[test]= 0.3115 accuracy[test]= 0.8739
Epoch 14: loss[train]= 0.2536 accuracy[train]= 0.9062 loss[test]= 0.3112 accuracy[test]= 0.8761
Epoch 15: loss[train]= 0.2523 accuracy[train]= 0.9062 loss[test]= 0.3110 accuracy[test]= 0.8761
Epoch 16: loss[train]= 0.2512 accuracy[train]= 0.9074 loss[test]= 0.3109 accuracy[test]= 0.8761
Epoch 17: loss[train]= 0.2501 accuracy[train]= 0.9096 loss[test]= 0.3107 accuracy[test]= 0.8761
Epoch 18: loss[train]= 0.2492 accuracy[train]= 0.9096 loss[test]= 0.3105 accuracy[test]= 0.8772
Epoch 19: loss[train]= 0.2483 accuracy[train]= 0.9096 loss[test]= 0.3104 accuracy[test]= 0.8783
Epoch 20: loss[train]= 0.2475 accuracy[train]= 0.9107 loss[test]= 0.3102 accuracy[test]= 0.8783
Epoch 21: loss[train]= 0.2467 accuracy[train]= 0.9107 loss[test]= 0.3101 accuracy[test]= 0.8783
Epoch 22: loss[train]= 0.2460 accuracy[train]= 0.9107 loss[test]= 0.3100 accuracy[test]= 0.8795
Epoch 23: loss[train]= 0.2454 accuracy[train]= 0.9107 loss[test]= 0.3099 accuracy[test]= 0.8795
Epoch 24: loss[train]= 0.2448 accuracy[train]= 0.9107 loss[test]= 0.3098 accuracy[test]= 0.8795
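Without a nonlinearity between them, the two Linear layers compose into a single affine map ($x W_1 W_2 + \dots$), so this network has exactly the representational power of logistic regression, and test accuracy stalls near 88% instead of 96%.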
# construct network
net = Net([
Linear(64, 1),
Sigmoid()])
# hyperparameters after tuning
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 32)
Epoch 0: loss[train]= 0.5437 accuracy[train]= 0.7266 loss[test]= 0.3806 accuracy[test]= 0.8371
Epoch 1: loss[train]= 0.3371 accuracy[train]= 0.8672 loss[test]= 0.3482 accuracy[test]= 0.8527
Epoch 2: loss[train]= 0.3018 accuracy[train]= 0.8929 loss[test]= 0.3336 accuracy[test]= 0.8594
Epoch 3: loss[train]= 0.2850 accuracy[train]= 0.8973 loss[test]= 0.3245 accuracy[test]= 0.8627
Epoch 4: loss[train]= 0.2748 accuracy[train]= 0.9029 loss[test]= 0.3185 accuracy[test]= 0.8694
Epoch 5: loss[train]= 0.2679 accuracy[train]= 0.9051 loss[test]= 0.3144 accuracy[test]= 0.8717
Epoch 6: loss[train]= 0.2630 accuracy[train]= 0.9062 loss[test]= 0.3115 accuracy[test]= 0.8761
Epoch 7: loss[train]= 0.2592 accuracy[train]= 0.9040 loss[test]= 0.3096 accuracy[test]= 0.8750
Epoch 8: loss[train]= 0.2563 accuracy[train]= 0.9029 loss[test]= 0.3082 accuracy[test]= 0.8772
Epoch 9: loss[train]= 0.2539 accuracy[train]= 0.9062 loss[test]= 0.3073 accuracy[test]= 0.8795
Epoch 10: loss[train]= 0.2520 accuracy[train]= 0.9062 loss[test]= 0.3067 accuracy[test]= 0.8795
Epoch 11: loss[train]= 0.2505 accuracy[train]= 0.9062 loss[test]= 0.3064 accuracy[test]= 0.8817
Epoch 12: loss[train]= 0.2491 accuracy[train]= 0.9062 loss[test]= 0.3063 accuracy[test]= 0.8806
Epoch 13: loss[train]= 0.2480 accuracy[train]= 0.9051 loss[test]= 0.3063 accuracy[test]= 0.8795
Epoch 14: loss[train]= 0.2470 accuracy[train]= 0.9062 loss[test]= 0.3064 accuracy[test]= 0.8806
Epoch 15: loss[train]= 0.2462 accuracy[train]= 0.9062 loss[test]= 0.3065 accuracy[test]= 0.8795
Epoch 16: loss[train]= 0.2455 accuracy[train]= 0.9074 loss[test]= 0.3068 accuracy[test]= 0.8795
Epoch 17: loss[train]= 0.2448 accuracy[train]= 0.9085 loss[test]= 0.3071 accuracy[test]= 0.8783
Epoch 18: loss[train]= 0.2443 accuracy[train]= 0.9096 loss[test]= 0.3074 accuracy[test]= 0.8783
Epoch 19: loss[train]= 0.2438 accuracy[train]= 0.9096 loss[test]= 0.3077 accuracy[test]= 0.8795
Epoch 20: loss[train]= 0.2433 accuracy[train]= 0.9107 loss[test]= 0.3081 accuracy[test]= 0.8795
Epoch 21: loss[train]= 0.2429 accuracy[train]= 0.9107 loss[test]= 0.3085 accuracy[test]= 0.8795
Epoch 22: loss[train]= 0.2425 accuracy[train]= 0.9096 loss[test]= 0.3088 accuracy[test]= 0.8795
Epoch 23: loss[train]= 0.2422 accuracy[train]= 0.9096 loss[test]= 0.3092 accuracy[test]= 0.8795
Epoch 24: loss[train]= 0.2419 accuracy[train]= 0.9107 loss[test]= 0.3096 accuracy[test]= 0.8806
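As expected, plain logistic regression (one Linear layer plus a Sigmoid) lands at essentially the same 88% test accuracy as the two-layer linear model above, confirming that the missing nonlinearity, not the parameter count, was the bottleneck there.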
net = Net([
Linear(64, 32),
ReLU(),
Linear(32, 16),
ReLU(),
Linear(16, 8),
ReLU(),
Linear(8, 4),
ReLU(),
Linear(4, 1),
Sigmoid()])
# hyperparameters after tuning
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 32)
Epoch 0: loss[train]= 0.6929 accuracy[train]= 0.4978 loss[test]= 0.6922 accuracy[test]= 0.5078
Epoch 1: loss[train]= 0.6910 accuracy[train]= 0.5513 loss[test]= 0.6874 accuracy[test]= 0.7969
Epoch 2: loss[train]= 0.6694 accuracy[train]= 0.6518 loss[test]= 0.5829 accuracy[test]= 0.7913
Epoch 3: loss[train]= 0.5031 accuracy[train]= 0.7824 loss[test]= 0.7568 accuracy[test]= 0.6384
Epoch 4: loss[train]= 0.3420 accuracy[train]= 0.8728 loss[test]= 0.1940 accuracy[test]= 0.9353
Epoch 5: loss[train]= 0.2038 accuracy[train]= 0.9230 loss[test]= 0.2431 accuracy[test]= 0.9152
Epoch 6: loss[train]= 0.1199 accuracy[train]= 0.9598 loss[test]= 0.1915 accuracy[test]= 0.9342
Epoch 7: loss[train]= 0.0874 accuracy[train]= 0.9699 loss[test]= 0.3728 accuracy[test]= 0.8828
Epoch 8: loss[train]= 0.0445 accuracy[train]= 0.9821 loss[test]= 0.0969 accuracy[test]= 0.9654
Epoch 9: loss[train]= 0.1485 accuracy[train]= 0.9699 loss[test]= 0.0987 accuracy[test]= 0.9721
Epoch 10: loss[train]= 0.0301 accuracy[train]= 0.9933 loss[test]= 0.0815 accuracy[test]= 0.9754
Epoch 11: loss[train]= 0.0113 accuracy[train]= 0.9978 loss[test]= 0.0872 accuracy[test]= 0.9699
Epoch 12: loss[train]= 0.0082 accuracy[train]= 0.9967 loss[test]= 0.0895 accuracy[test]= 0.9766
Epoch 13: loss[train]= 0.0033 accuracy[train]= 1.0000 loss[test]= 0.0907 accuracy[test]= 0.9788
Epoch 14: loss[train]= 0.0017 accuracy[train]= 1.0000 loss[test]= 0.1091 accuracy[test]= 0.9721
Epoch 15: loss[train]= 0.0014 accuracy[train]= 1.0000 loss[test]= 0.1129 accuracy[test]= 0.9732
Epoch 16: loss[train]= 0.0011 accuracy[train]= 1.0000 loss[test]= 0.1197 accuracy[test]= 0.9710
Epoch 17: loss[train]= 0.0008 accuracy[train]= 1.0000 loss[test]= 0.1197 accuracy[test]= 0.9721
Epoch 18: loss[train]= 0.0007 accuracy[train]= 1.0000 loss[test]= 0.1225 accuracy[test]= 0.9721
Epoch 19: loss[train]= 0.0007 accuracy[train]= 1.0000 loss[test]= 0.1192 accuracy[test]= 0.9743
Epoch 20: loss[train]= 0.0006 accuracy[train]= 1.0000 loss[test]= 0.1221 accuracy[test]= 0.9754
Epoch 21: loss[train]= 0.0004 accuracy[train]= 1.0000 loss[test]= 0.1221 accuracy[test]= 0.9766
Epoch 22: loss[train]= 0.0004 accuracy[train]= 1.0000 loss[test]= 0.1225 accuracy[test]= 0.9766
Epoch 23: loss[train]= 0.0003 accuracy[train]= 1.0000 loss[test]= 0.1232 accuracy[test]= 0.9766
Epoch 24: loss[train]= 0.0003 accuracy[train]= 1.0000 loss[test]= 0.1241 accuracy[test]= 0.9766
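The five-layer ReLU network trains less smoothly at first (note the test-loss spikes at epochs 3 and 7) but ends with the best test accuracy of the four runs, about 97.7%; its near-zero training loss alongside a slowly rising test loss again points to some overfitting.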