%matplotlib inline
import numpy as np
import scipy.optimize
import sklearn.datasets
import matplotlib.pyplot as plt
np.set_printoptions(suppress=True, precision=6, linewidth=200)
plt.style.use('ggplot')
def f(x, y):
return x ** 2 + y ** 2 + x * (y + 2) + np.cos(3 * x)
def grad_x_f(x, y):
return 2 * x - 3 * np.sin(3 * x) + y + 2
def grad_y_f(x, y):
return x + 2 * y
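# Sanity check (a minimal sketch; the test point and epsilon are arbitrary choices):
# compare the hand-derived gradients against central finite differences.
eps = 1e-5
x0, y0 = 1.3, -0.7
print('grad_x_f analytic vs numeric:', grad_x_f(x0, y0), (f(x0 + eps, y0) - f(x0 - eps, y0)) / (2 * eps))
print('grad_y_f analytic vs numeric:', grad_y_f(x0, y0), (f(x0, y0 + eps) - f(x0, y0 - eps)) / (2 * eps))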
def plot_f_contours():
xx, yy = np.meshgrid(np.linspace(-5, 5), np.linspace(-5, 5))
zz = f(xx, yy)
plt.contourf(xx, yy, zz, 50)
plt.contour(xx, yy, zz, 50, alpha=0.2, colors='black', linestyles='solid')
plt.xlabel('x')
plt.ylabel('y')
plt.figure(figsize=(10, 7))
plot_f_contours()
def optimize_f(x, y, step_size, steps):
# keep track of the parameters we tried so far
x_hist, y_hist = [x], [y]
# run gradient descent for the number of steps
for step in range(steps):
# compute the gradients at the current point
dx = grad_x_f(x, y)
dy = grad_y_f(x, y)
# apply the gradient descent updates to x and y
x = x - (step_size * dx)
y = y - (step_size * dy)
# store the new parameters
x_hist.append(x)
y_hist.append(y)
return x, y, f(x, y), x_hist, y_hist
# helper function that plots the results of the gradient descent optimization
def plot_gradient_descent_results(x, y, val, x_hist, y_hist):
# plot the path on the contour plot
plt.figure(figsize=(20, 7))
plt.subplot(1, 2, 1)
plot_f_contours()
plt.plot(x_hist, y_hist, '.-')
# plot the learning curve
plt.subplot(1, 2, 2)
plt.plot(f(np.array(x_hist), np.array(y_hist)), '.r-')
    plt.title('Minimum value: %f' % val)
results = optimize_f(x=3, y=2, step_size=0.1, steps=10)
plot_gradient_descent_results(*results)
# TODO: tune the parameters to find a better optimum
results = optimize_f(x=3, y=2, step_size=0.15, steps=500)
plot_gradient_descent_results(*results)
def optimize_f(x, y, step_size, steps, decay=1.0):
# keep track of the parameters we tried so far
x_hist, y_hist = [x], [y]
# run gradient descent for the number of steps
for step in range(steps):
# compute the gradients at this point
dx = grad_x_f(x, y)
dy = grad_y_f(x, y)
# apply the gradient descent updates to x and y
step_size = step_size * decay
x = x - (step_size * dx)
y = y - (step_size * dy)
# store the new parameters
x_hist.append(x)
y_hist.append(y)
return x, y, f(x, y), x_hist, y_hist
# TODO: tune the parameters to find the local optimum
results = optimize_f(x=3, y=2, step_size=0.5, steps=100, decay=0.95)
plot_gradient_descent_results(*results)
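# Cross-check (a sketch; the solver and starting point are assumptions, not part of the exercise):
# let scipy.optimize.minimize search from the same starting point and compare with the hand-rolled descent.
res = scipy.optimize.minimize(lambda p: f(p[0], p[1]),
                              x0=np.array([3.0, 2.0]),
                              jac=lambda p: np.array([grad_x_f(p[0], p[1]), grad_y_f(p[0], p[1])]))
print('scipy.optimize.minimize found f(%.4f, %.4f) = %.4f' % (res.x[0], res.x[1], res.fun))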
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
def sigmoid_grad(x):
return np.exp(-x) / ((1 + np.exp(-x))**2)
# try with a random input
x = np.random.uniform(-10, 10, size=5)
print('x:', x)
print('sigmoid(x):', sigmoid(x))
print('sigmoid_grad(x):', sigmoid_grad(x))
x: [-9.098796 9.979386 -5.079815 9.951826 -1.562108]
sigmoid(x): [0.000112 0.999954 0.006183 0.999952 0.173344]
sigmoid_grad(x): [0.000112 0.000046 0.006144 0.000048 0.143296]
# start with some random inputs
x = np.random.uniform(-2, 2, size=5)
# compute the symbolic gradient
print('Symbolic', sigmoid_grad(x))
# TODO: compute the numerical gradient
epsilon = 0.001
numerical_gradient = (sigmoid(x+0.5*epsilon)-sigmoid(x-0.5*epsilon))/epsilon
print('Numerical', numerical_gradient)
Symbolic [0.235547 0.249954 0.221157 0.176144 0.106473]
Numerical [0.235547 0.249954 0.221157 0.176144 0.106473]
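# The same comparison can be delegated to scipy.optimize.check_grad (a sketch; summing the
# outputs to obtain a scalar objective is an assumption made here, not part of the exercise).
err = scipy.optimize.check_grad(lambda v: np.sum(sigmoid(v)),
                                lambda v: sigmoid_grad(v),
                                np.random.uniform(-2, 2, size=5))
print('check_grad error for sigmoid:', err)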
def relu(x):
return np.maximum(0,x)
def relu_grad(x):
return np.maximum(0,np.sign(x))
# try with a random input
x = np.random.uniform(-10, 10, size=5)
print('x:', x)
print('relu(x):', relu(x))
print('relu_grad(x):', relu_grad(x))
epsilon = 0.001
num_relu_grad = (relu(x+0.5*epsilon)-relu(x-0.5*epsilon))/epsilon
print('num_relu_grad(x)', num_relu_grad)
# TODO: compute and compare the symbolic and numerical gradients
x: [-2.03056 -4.020691 -1.175124 -6.169135 1.66985 ]
relu(x): [0. 0. 0. 0. 1.66985]
relu_grad(x): [0. 0. 0. 0. 1.]
num_relu_grad(x) [0. 0. 0. 0. 1.]
x = np.linspace(-10, 10, 100)
plt.figure(figsize=(15, 8))
plt.subplot(2, 2, 1)
plt.plot(x, sigmoid(x), label='Sigmoid')
plt.xlabel('x')
plt.legend(loc='upper left')
plt.subplot(2, 2, 2)
plt.plot(x, relu(x), label='ReLU')
plt.xlabel('x')
plt.legend(loc='upper left')
plt.subplot(2, 2, 3)
plt.plot(x, sigmoid_grad(x), label='Sigmoid gradient')
plt.xlabel('x')
plt.legend(loc='upper left')
plt.subplot(2, 2, 4)
plt.plot(x, relu_grad(x), label='ReLU gradient')
plt.xlabel('x')
plt.legend(loc='upper left');
def bce_loss(y, y_hat):
return -(y*np.log(y_hat) + (1-y)*np.log(1-y_hat))
def bce_loss_grad(y, y_hat):
return (-y)/y_hat + (1-y)/(1-y_hat)
# try with some random inputs
y = np.random.randint(2, size=5)
y_hat = np.random.uniform(0, 1, size=5)
print('y:', y)
print('y_hat:', y_hat)
print('bceloss(y, y_hat):', bce_loss(y, y_hat))
epsilon = 0.00001
num_bce_grad = (bce_loss(y, y_hat+0.5*epsilon)-bce_loss(y,y_hat-0.5*epsilon))/epsilon
print('num_bce_grad(y, y_hat):', num_bce_grad)
print('bce_loss_grad(y, y_hat):', bce_loss_grad(y, y_hat))
y: [0 1 1 1 1]
y_hat: [0.053824 0.962561 0.198698 0.659568 0.548089]
bceloss(y, y_hat): [0.055326 0.038157 1.615968 0.41617 0.601318]
num_bce_grad(y, y_hat): [ 1.056885 -1.038895 -5.032759 -1.516144 -1.824523]
bce_loss_grad(y, y_hat): [ 1.056885 -1.038895 -5.032759 -1.516144 -1.824523]
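# Note that bce_loss is undefined at y_hat = 0 or y_hat = 1. A common safeguard (a sketch,
# not used in the experiments below) is to clip predictions away from the boundaries:
def bce_loss_safe(y, y_hat, eps=1e-12):
    y_hat = np.clip(y_hat, eps, 1 - eps)
    return -(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))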
# initialize parameters
w = np.random.uniform(size=5)
b = np.random.rand()
# implement the model
def fn(x, y):
# TODO: forward: compute h, y_hat, loss
    h = np.dot(x, w) + b
y_hat = sigmoid(h)
loss = bce_loss(y, y_hat)
# TODO: backward: compute grad_y_hat, grad_h, grad_x
grad_y_hat = bce_loss_grad(y, y_hat)
    grad_h = grad_y_hat * sigmoid_grad(h)  # chain rule: dL/dh = dL/dy_hat * sigmoid'(h)
    grad_x = grad_h * w                    # dL/dx = dL/dh * dh/dx, since h = x.w + b
return loss, grad_x
# test with a random input
x = np.random.uniform(size=5)
y = 1
loss, grad_x = fn(x, y)
print("Loss", loss)
print("Gradient", grad_x)
Loss 0.1659708964889129
Gradient [0.170876 0.076722 0.159885 0.006955 0.016888]
# start with some random inputs
x = np.random.uniform(size=5)
y = 1
# set epsilon to a small value
eps = 0.00001
numerical_grad = np.zeros(x.shape)
# compute the gradient for each element of x separately
for i in range(len(x)):
# compute inputs at -eps/2 and +eps/2
x_a, x_b = x.copy(), x.copy()
x_a[i] += eps / 2
x_b[i] -= eps / 2
# compute the gradient for this element
loss_a, _ = fn(x_a, y)
loss_b, _ = fn(x_b, y)
numerical_grad[i] = (loss_a - loss_b) / eps
# compute the symbolic gradient
loss, symbolic_grad = fn(x, y)
print("Symbolic gradient")
print(symbolic_grad)
print("Numerical gradient")
print(numerical_grad)
Symbolic gradient
[0.161812 0.072653 0.151404 0.006586 0.015992]
Numerical gradient
[-0.207983 -0.093383 -0.194604 -0.008465 -0.020555]
# Computes y = x @ W + b.
class Linear:
def __init__(self, n_in, n_out):
# initialize the weights randomly,
# using the Xavier initialization rule for scale
        a = np.sqrt(6 / (n_in + n_out))
self.W = np.random.uniform(-a, a, size=(n_in, n_out))
self.b = np.zeros((n_out,))
def forward(self, x):
# TODO: compute the forward pass
y = np.dot(x, self.W) + self.b
return y
def backward(self, x, dy):
# TODO: compute the backward pass,
# given dy, compute the gradients for x, W and b
dx = np.dot(dy, np.transpose(self.W))
self.dW = np.dot(np.transpose(x), dy)
        self.db = np.sum(dy, axis=0)  # sum the gradient over the batch dimension, shape (n_out,)
return dx
def step(self, step_size):
# TODO: apply a gradient descent update step
self.W = self.W - self.dW * step_size # TODO
self.b = self.b - self.db * step_size # TODO
def __str__(self):
return 'Linear %dx%d' % self.W.shape
# Try the new class with some random values.
# Debugging tip: always choose a unique length for each
# dimension, so you'll get an error if you mix them up.
x = np.random.uniform(size=(3, 5))
layer = Linear(5, 7)
y = layer.forward(x)
dx = layer.backward(x, np.ones_like(y))
print('y:', y)
print('dx:', dx)
y: [[ 0.339211 0.501915 0.037213 0.100355 0.278983 -0.186019 -0.291139]
[ 0.303208 0.592294 -0.073754 0.105227 0.291073 -0.26955 0.127579]
[ 0.100563 0.595091 -0.105874 -0.065961 0.368961 -0.206255 0.266175]]
dx: [[0.576871 0.132018 0.427378 0.021808 0.549294]
[0.576871 0.132018 0.427378 0.021808 0.549294]
[0.576871 0.132018 0.427378 0.021808 0.549294]]
# Computes y = 1 / (1 + exp(-x)).
class Sigmoid:
def forward(self, x):
# TODO: compute the forward pass
return sigmoid(x)
def backward(self, x, dy):
# TODO: compute the backward pass,
# return the gradient for x given dy
sigmoid_x = sigmoid(x)
self.dx = dy * (sigmoid_x * (1-sigmoid_x))
return self.dx
def step(self, step_size):
pass
def __str__(self):
return 'Sigmoid'
# try the new class with some random values
x = np.random.uniform(size=(3, 5))
layer = Sigmoid()
y = layer.forward(x)
dx = layer.backward(x, np.ones_like(y))
print('y:', y)
print('dx:', dx)
y: [[0.648987 0.545029 0.62944 0.644926 0.655771]
[0.652881 0.657442 0.549253 0.673219 0.68591 ]
[0.682264 0.509436 0.609581 0.58547 0.593213]]
dx: [[0.227803 0.247972 0.233245 0.228997 0.225735]
[0.226627 0.225212 0.247574 0.219995 0.215438]
[0.21678 0.249911 0.237992 0.242695 0.241311]]
# Computes y = max(0, x).
class ReLU:
def forward(self, x):
# TODO: compute the forward pass
return np.maximum(0,x)
def backward(self, x, dy):
# TODO: compute the backward pass,
# return the gradient for x given dy
return np.multiply(dy, np.int64(x > 0))
def step(self, step_size):
pass
def __str__(self):
return 'ReLU'
# try the new class with some random values
x = np.random.uniform(-10, 10, size=(3, 5))
layer = ReLU()
y = layer.forward(x)
dx = layer.backward(x, np.ones_like(y))
print('y:', y)
print('dx:', dx)
y: [[0. 2.290372 5.163791 0. 0. ]
[0. 0. 0. 2.037993 0. ]
[0. 0. 0. 0. 3.88808 ]]
dx: [[0. 1. 1. 0. 0.]
[0. 0. 0. 1. 0.]
[0. 0. 0. 0. 1.]]
## Verify gradient computations for Linear
# test for dx
layer = Linear(5, 7)
def test_fn(x):
x = x.reshape(3, 5)
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(x):
x = x.reshape(3, 5)
# multiply the incoming dy gradient with a constant
return layer.backward(x, 2 * np.ones((3, 7))).flatten()
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=3 * 5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
# test for dW
x = np.random.uniform(size=(3, 5))
layer = Linear(5, 7)
def test_fn(w):
layer.W = w.reshape(5, 7)
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(w):
layer.W = w.reshape(5, 7)
# multiply the incoming dy gradient with a constant
layer.backward(x, 2 * np.ones((3, 7)))
return layer.dW.flatten()
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=5 * 7))
print("err on dW:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
# test for db
x = np.random.uniform(size=(3, 5,))
layer = Linear(5, 7)
def test_fn(b):
layer.b = b
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(b):
layer.b = b
# multiply the incoming dy gradient with a constant
layer.backward(x, 2 * np.ones((x.shape[0], 7)))
return layer.db
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=7))
print("err on db:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
err on dx: OK 4.686306062709855e-07
err on dW: OK 4.035680032700044e-06
err on db: ERROR 21.166010488516726
## Verify gradient computation for Sigmoid
# test for dx
layer = Sigmoid()
def test_fn(x):
# multiply the output with a constant to check if
# the gradient uses dy
return np.sum(2 * layer.forward(x))
def test_fn_grad(x):
# multiply the incoming dy gradient with a constant
return layer.backward(x, 2 * np.ones(x.shape))
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(-10, 10, size=5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
err on dx: OK 4.69246953421683e-08
## Verify gradient computation for ReLU
# test for dx
layer = ReLU()
def test_fn(x):
# multiply the output with a constant to check if
# the gradient uses dy
return 2 * np.sum(layer.forward(x))
def test_fn_grad(x):
# multiply the incoming dy gradient with a constant
return layer.backward(x, 2 * np.ones(x.shape))
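# note: the test inputs below are drawn from [1, 10] so the finite-difference check
# never straddles x = 0, where the ReLU gradient is not defined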
err = scipy.optimize.check_grad(test_fn, test_fn_grad,
np.random.uniform(1, 10, size=5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
err on dx: OK 0.0
class Net:
def __init__(self, layers):
self.layers = layers
def forward(self, x):
# compute the forward pass for each layer
trace = []
for layer in self.layers:
# compute the forward pass
y = layer.forward(x)
# store the original input for the backward pass
trace.append((layer, x))
x = y
# return the final output and the history trace
return y, trace
def backward(self, trace, dy):
# compute the backward pass for each layer
for layer, x in trace[::-1]:
# compute the backward pass using the original input x
dy = layer.backward(x, dy)
def step(self, learning_rate):
# apply the gradient descent updates of each layer
for layer in self.layers:
layer.step(learning_rate)
def __str__(self):
return '\n'.join(str(l) for l in self.layers)
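# Quick smoke test (a sketch; the layer sizes and batch size are arbitrary):
# push random data through a small Net and check that forward and backward run with matching shapes.
tiny_net = Net([Linear(5, 3), ReLU(), Linear(3, 1), Sigmoid()])
x_check = np.random.uniform(size=(4, 5))
y_check, trace_check = tiny_net.forward(x_check)
tiny_net.backward(trace_check, np.ones_like(y_check))
print('tiny net output shape:', y_check.shape)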
# load the scikit-learn digits dataset (8x8 images of the digits 0-9)
dataset = sklearn.datasets.load_digits()
digits_x, digits_y = dataset['data'], dataset['target']
# create a binary classification problem
digits_y = (digits_y < 5).astype(float)
# plot some of the digits
plt.figure(figsize=(10, 2))
plt.imshow(np.hstack([digits_x[i].reshape(8, 8) for i in range(10)]), cmap='gray')
plt.grid(False)
plt.tight_layout()
plt.axis('off')
# standardize the values to zero mean and unit variance
digits_x -= np.mean(digits_x)
digits_x /= np.std(digits_x)
# print some statistics
print('digits_x.shape:', digits_x.shape)
print('digits_y.shape:', digits_y.shape)
print('min, max values:', np.min(digits_x), np.max(digits_x))
print('labels:', np.unique(digits_y))
digits_x.shape: (1797, 64)
digits_y.shape: (1797,)
min, max values: -0.8117561971974786 1.847470154168513
labels: [0. 1.]
# make a 50%/50% train/test split
train_prop = 0.5
n_train = int(digits_x.shape[0] * train_prop)
# shuffle the images
idxs = np.random.permutation(digits_x.shape[0])
# take a subset
x = {'train': digits_x[idxs[:n_train]],
'test': digits_x[idxs[n_train:]]}
y = {'train': digits_y[idxs[:n_train]],
'test': digits_y[idxs[n_train:]]}
print('Training samples:', x['train'].shape[0])
print('Test samples:', x['test'].shape[0])
Training samples: 898
Test samples: 899
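# Sanity check (a sketch): the random split should leave both classes roughly balanced in each subset.
for phase in ('train', 'test'):
    print('%s positive fraction: %.3f' % (phase, np.mean(y[phase])))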
def fit(net, x, y, epochs=25, learning_rate=0.001, mb_size=10):
# initialize the loss and accuracy history
loss_hist = {'train': [], 'test': []}
accuracy_hist = {'train': [], 'test': []}
for epoch in range(epochs):
# initialize the loss and accuracy for this epoch
loss = {'train': 0.0, 'test': 0.0}
accuracy = {'train': 0.0, 'test': 0.0}
# first train on training data, then evaluate on the test data
for phase in ('train', 'test'):
# compute the number of minibatches
steps = x[phase].shape[0] // mb_size
# loop over all minibatches
for step in range(steps):
# get the samples for the current minibatch
x_mb = x[phase][(step * mb_size):((step + 1) * mb_size)]
y_mb = y[phase][(step * mb_size):((step + 1) * mb_size), None]
# compute the forward pass through the network
pred_y, trace = net.forward(x_mb)
# compute the current loss and accuracy
loss[phase] += np.mean(bce_loss(y_mb, pred_y))
accuracy[phase] += np.mean((y_mb > 0.5) == (pred_y > 0.5))
# only update the network in the training phase
if phase == 'train':
# compute the gradient for the loss
dy = bce_loss_grad(y_mb, pred_y)
# backpropagate the gradient through the network
net.backward(trace, dy)
# update the weights
net.step(learning_rate)
# compute the mean loss and accuracy over all minibatches
loss[phase] = loss[phase] / steps
accuracy[phase] = accuracy[phase] / steps
# add statistics to history
loss_hist[phase].append(loss[phase])
accuracy_hist[phase].append(accuracy[phase])
print('Epoch %3d: loss[train]=%7.4f accuracy[train]=%7.4f loss[test]=%7.4f accuracy[test]=%7.4f' %
(epoch, loss['train'], accuracy['train'], loss['test'], accuracy['test']))
# plot the learning curves
plt.figure(figsize=(20, 5))
plt.subplot(1, 2, 1)
for phase in loss_hist:
plt.plot(loss_hist[phase], label=phase)
plt.title('BCE loss')
plt.xlabel('Epoch')
plt.legend()
plt.subplot(1, 2, 2)
for phase in accuracy_hist:
plt.plot(accuracy_hist[phase], label=phase)
plt.title('Accuracy')
plt.xlabel('Epoch')
plt.legend()
# construct network
net = Net([
Linear(64, 32),
ReLU(),
Linear(32, 1),
Sigmoid()])
# TODO: tune the hyperparameters
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 10)
Epoch 0: loss[train]= 0.4128 accuracy[train]= 0.8213 loss[test]= 0.2512 accuracy[test]= 0.9022
Epoch 1: loss[train]= 0.2213 accuracy[train]= 0.9270 loss[test]= 0.1727 accuracy[test]= 0.9315
Epoch 2: loss[train]= 0.1500 accuracy[train]= 0.9494 loss[test]= 0.1349 accuracy[test]= 0.9551
Epoch 3: loss[train]= 0.1150 accuracy[train]= 0.9663 loss[test]= 0.1174 accuracy[test]= 0.9596
Epoch 4: loss[train]= 0.0930 accuracy[train]= 0.9708 loss[test]= 0.1073 accuracy[test]= 0.9629
Epoch 5: loss[train]= 0.0767 accuracy[train]= 0.9764 loss[test]= 0.1015 accuracy[test]= 0.9640
Epoch 6: loss[train]= 0.0640 accuracy[train]= 0.9820 loss[test]= 0.0960 accuracy[test]= 0.9640
Epoch 7: loss[train]= 0.0541 accuracy[train]= 0.9899 loss[test]= 0.0930 accuracy[test]= 0.9663
Epoch 8: loss[train]= 0.0469 accuracy[train]= 0.9921 loss[test]= 0.0942 accuracy[test]= 0.9640
Epoch 9: loss[train]= 0.0400 accuracy[train]= 0.9933 loss[test]= 0.0932 accuracy[test]= 0.9640
Epoch 10: loss[train]= 0.0355 accuracy[train]= 0.9933 loss[test]= 0.0928 accuracy[test]= 0.9663
Epoch 11: loss[train]= 0.0303 accuracy[train]= 0.9944 loss[test]= 0.0919 accuracy[test]= 0.9640
Epoch 12: loss[train]= 0.0274 accuracy[train]= 0.9944 loss[test]= 0.0894 accuracy[test]= 0.9652
Epoch 13: loss[train]= 0.0241 accuracy[train]= 0.9966 loss[test]= 0.0899 accuracy[test]= 0.9674
Epoch 14: loss[train]= 0.0217 accuracy[train]= 0.9955 loss[test]= 0.0890 accuracy[test]= 0.9674
Epoch 15: loss[train]= 0.0191 accuracy[train]= 0.9966 loss[test]= 0.0895 accuracy[test]= 0.9685
Epoch 16: loss[train]= 0.0167 accuracy[train]= 0.9978 loss[test]= 0.0890 accuracy[test]= 0.9697
Epoch 17: loss[train]= 0.0150 accuracy[train]= 0.9978 loss[test]= 0.0896 accuracy[test]= 0.9697
Epoch 18: loss[train]= 0.0137 accuracy[train]= 0.9989 loss[test]= 0.0886 accuracy[test]= 0.9708
Epoch 19: loss[train]= 0.0121 accuracy[train]= 0.9989 loss[test]= 0.0883 accuracy[test]= 0.9685
Epoch 20: loss[train]= 0.0107 accuracy[train]= 0.9989 loss[test]= 0.0890 accuracy[test]= 0.9685
Epoch 21: loss[train]= 0.0097 accuracy[train]= 1.0000 loss[test]= 0.0886 accuracy[test]= 0.9685
Epoch 22: loss[train]= 0.0089 accuracy[train]= 1.0000 loss[test]= 0.0877 accuracy[test]= 0.9685
Epoch 23: loss[train]= 0.0082 accuracy[train]= 1.0000 loss[test]= 0.0879 accuracy[test]= 0.9697
Epoch 24: loss[train]= 0.0075 accuracy[train]= 1.0000 loss[test]= 0.0884 accuracy[test]= 0.9697
# TODO: Your code here.
# construct network
net = Net([
Linear(64, 32),
Linear(32, 1),
Sigmoid()])
# TODO: tune the hyperparameters
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 10)
Epoch 0: loss[train]= 0.4589 accuracy[train]= 0.7843 loss[test]= 0.3148 accuracy[test]= 0.8685
Epoch 1: loss[train]= 0.3484 accuracy[train]= 0.8596 loss[test]= 0.2906 accuracy[test]= 0.8764
Epoch 2: loss[train]= 0.3265 accuracy[train]= 0.8742 loss[test]= 0.2809 accuracy[test]= 0.8843
Epoch 3: loss[train]= 0.3152 accuracy[train]= 0.8798 loss[test]= 0.2760 accuracy[test]= 0.8888
Epoch 4: loss[train]= 0.3079 accuracy[train]= 0.8854 loss[test]= 0.2735 accuracy[test]= 0.8921
Epoch 5: loss[train]= 0.3026 accuracy[train]= 0.8910 loss[test]= 0.2724 accuracy[test]= 0.8955
Epoch 6: loss[train]= 0.2987 accuracy[train]= 0.8933 loss[test]= 0.2720 accuracy[test]= 0.9000
Epoch 7: loss[train]= 0.2955 accuracy[train]= 0.8921 loss[test]= 0.2720 accuracy[test]= 0.9011
Epoch 8: loss[train]= 0.2930 accuracy[train]= 0.8921 loss[test]= 0.2722 accuracy[test]= 0.9000
Epoch 9: loss[train]= 0.2908 accuracy[train]= 0.8933 loss[test]= 0.2726 accuracy[test]= 0.9000
Epoch 10: loss[train]= 0.2890 accuracy[train]= 0.8933 loss[test]= 0.2730 accuracy[test]= 0.8989
Epoch 11: loss[train]= 0.2874 accuracy[train]= 0.8944 loss[test]= 0.2734 accuracy[test]= 0.8989
Epoch 12: loss[train]= 0.2861 accuracy[train]= 0.8933 loss[test]= 0.2739 accuracy[test]= 0.8989
Epoch 13: loss[train]= 0.2848 accuracy[train]= 0.8944 loss[test]= 0.2743 accuracy[test]= 0.9000
Epoch 14: loss[train]= 0.2837 accuracy[train]= 0.8966 loss[test]= 0.2747 accuracy[test]= 0.9000
Epoch 15: loss[train]= 0.2828 accuracy[train]= 0.8978 loss[test]= 0.2752 accuracy[test]= 0.9000
Epoch 16: loss[train]= 0.2819 accuracy[train]= 0.9022 loss[test]= 0.2756 accuracy[test]= 0.9000
Epoch 17: loss[train]= 0.2811 accuracy[train]= 0.9034 loss[test]= 0.2759 accuracy[test]= 0.9000
Epoch 18: loss[train]= 0.2803 accuracy[train]= 0.9022 loss[test]= 0.2763 accuracy[test]= 0.9000
Epoch 19: loss[train]= 0.2796 accuracy[train]= 0.9022 loss[test]= 0.2767 accuracy[test]= 0.9000
Epoch 20: loss[train]= 0.2790 accuracy[train]= 0.9022 loss[test]= 0.2770 accuracy[test]= 0.9000
Epoch 21: loss[train]= 0.2784 accuracy[train]= 0.9022 loss[test]= 0.2773 accuracy[test]= 0.9000
Epoch 22: loss[train]= 0.2778 accuracy[train]= 0.9011 loss[test]= 0.2776 accuracy[test]= 0.9000
Epoch 23: loss[train]= 0.2773 accuracy[train]= 0.9022 loss[test]= 0.2779 accuracy[test]= 0.9000
Epoch 24: loss[train]= 0.2768 accuracy[train]= 0.9011 loss[test]= 0.2782 accuracy[test]= 0.9000
# TODO: Your code here.
# construct network
net = Net([
Linear(64, 1),
Sigmoid()])
# TODO: tune the hyperparameters
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 10)
Epoch 0: loss[train]= 0.4900 accuracy[train]= 0.7809 loss[test]= 0.3529 accuracy[test]= 0.8640
Epoch 1: loss[train]= 0.3616 accuracy[train]= 0.8483 loss[test]= 0.3127 accuracy[test]= 0.8865
Epoch 2: loss[train]= 0.3324 accuracy[train]= 0.8685 loss[test]= 0.2956 accuracy[test]= 0.8933
Epoch 3: loss[train]= 0.3170 accuracy[train]= 0.8820 loss[test]= 0.2862 accuracy[test]= 0.8921
Epoch 4: loss[train]= 0.3072 accuracy[train]= 0.8876 loss[test]= 0.2804 accuracy[test]= 0.8944
Epoch 5: loss[train]= 0.3004 accuracy[train]= 0.8899 loss[test]= 0.2766 accuracy[test]= 0.8989
Epoch 6: loss[train]= 0.2955 accuracy[train]= 0.8921 loss[test]= 0.2742 accuracy[test]= 0.9000
Epoch 7: loss[train]= 0.2918 accuracy[train]= 0.8933 loss[test]= 0.2725 accuracy[test]= 0.8989
Epoch 8: loss[train]= 0.2889 accuracy[train]= 0.8933 loss[test]= 0.2715 accuracy[test]= 0.9000
Epoch 9: loss[train]= 0.2866 accuracy[train]= 0.8921 loss[test]= 0.2708 accuracy[test]= 0.9011
Epoch 10: loss[train]= 0.2847 accuracy[train]= 0.8966 loss[test]= 0.2704 accuracy[test]= 0.9011
Epoch 11: loss[train]= 0.2831 accuracy[train]= 0.8978 loss[test]= 0.2702 accuracy[test]= 0.9000
Epoch 12: loss[train]= 0.2818 accuracy[train]= 0.8989 loss[test]= 0.2702 accuracy[test]= 0.9011
Epoch 13: loss[train]= 0.2807 accuracy[train]= 0.9011 loss[test]= 0.2702 accuracy[test]= 0.9011
Epoch 14: loss[train]= 0.2797 accuracy[train]= 0.9000 loss[test]= 0.2704 accuracy[test]= 0.9000
Epoch 15: loss[train]= 0.2789 accuracy[train]= 0.9000 loss[test]= 0.2706 accuracy[test]= 0.9000
Epoch 16: loss[train]= 0.2781 accuracy[train]= 0.8989 loss[test]= 0.2708 accuracy[test]= 0.9000
Epoch 17: loss[train]= 0.2775 accuracy[train]= 0.9011 loss[test]= 0.2711 accuracy[test]= 0.9022
Epoch 18: loss[train]= 0.2769 accuracy[train]= 0.9022 loss[test]= 0.2715 accuracy[test]= 0.9022
Epoch 19: loss[train]= 0.2764 accuracy[train]= 0.9022 loss[test]= 0.2718 accuracy[test]= 0.9000
Epoch 20: loss[train]= 0.2759 accuracy[train]= 0.9022 loss[test]= 0.2721 accuracy[test]= 0.9011
Epoch 21: loss[train]= 0.2755 accuracy[train]= 0.9022 loss[test]= 0.2725 accuracy[test]= 0.9011
Epoch 22: loss[train]= 0.2751 accuracy[train]= 0.9011 loss[test]= 0.2729 accuracy[test]= 0.9034
Epoch 23: loss[train]= 0.2747 accuracy[train]= 0.9011 loss[test]= 0.2732 accuracy[test]= 0.9045
Epoch 24: loss[train]= 0.2744 accuracy[train]= 0.9011 loss[test]= 0.2736 accuracy[test]= 0.9045
# TODO: Your code here.
# construct network
net = Net([
Linear(64, 32),
ReLU(),
Linear(32, 16),
ReLU(),
Linear(16,8),
ReLU(),
Linear(8,1),
Sigmoid()])
# TODO: tune the hyperparameters
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 10)
Epoch 0: loss[train]= 0.6906 accuracy[train]= 0.6315 loss[test]= 0.6830 accuracy[test]= 0.7989
Epoch 1: loss[train]= 0.5793 accuracy[train]= 0.7955 loss[test]= 0.3588 accuracy[test]= 0.8517
Epoch 2: loss[train]= 0.3062 accuracy[train]= 0.8888 loss[test]= 0.2032 accuracy[test]= 0.9258
Epoch 3: loss[train]= 0.1875 accuracy[train]= 0.9236 loss[test]= 0.1414 accuracy[test]= 0.9393
Epoch 4: loss[train]= 0.1299 accuracy[train]= 0.9494 loss[test]= 0.1232 accuracy[test]= 0.9483
Epoch 5: loss[train]= 0.0984 accuracy[train]= 0.9652 loss[test]= 0.1070 accuracy[test]= 0.9607
Epoch 6: loss[train]= 0.0697 accuracy[train]= 0.9764 loss[test]= 0.1062 accuracy[test]= 0.9618
Epoch 7: loss[train]= 0.0430 accuracy[train]= 0.9910 loss[test]= 0.0995 accuracy[test]= 0.9663
Epoch 8: loss[train]= 0.0314 accuracy[train]= 0.9933 loss[test]= 0.1002 accuracy[test]= 0.9674
Epoch 9: loss[train]= 0.0277 accuracy[train]= 0.9921 loss[test]= 0.1120 accuracy[test]= 0.9652
Epoch 10: loss[train]= 0.0218 accuracy[train]= 0.9944 loss[test]= 0.1104 accuracy[test]= 0.9708
Epoch 11: loss[train]= 0.0423 accuracy[train]= 0.9854 loss[test]= 0.0917 accuracy[test]= 0.9708
Epoch 12: loss[train]= 0.0146 accuracy[train]= 0.9978 loss[test]= 0.1048 accuracy[test]= 0.9685
Epoch 13: loss[train]= 0.0122 accuracy[train]= 0.9978 loss[test]= 0.1081 accuracy[test]= 0.9674
Epoch 14: loss[train]= 0.0101 accuracy[train]= 0.9978 loss[test]= 0.1112 accuracy[test]= 0.9674
Epoch 15: loss[train]= 0.0086 accuracy[train]= 0.9978 loss[test]= 0.1150 accuracy[test]= 0.9685
Epoch 16: loss[train]= 0.0780 accuracy[train]= 0.9753 loss[test]= 0.1661 accuracy[test]= 0.9506
Epoch 17: loss[train]= 0.0548 accuracy[train]= 0.9876 loss[test]= 0.0940 accuracy[test]= 0.9730
Epoch 18: loss[train]= 0.0229 accuracy[train]= 0.9933 loss[test]= 0.0929 accuracy[test]= 0.9697
Epoch 19: loss[train]= 0.0196 accuracy[train]= 0.9910 loss[test]= 0.1225 accuracy[test]= 0.9652
Epoch 20: loss[train]= 0.0267 accuracy[train]= 0.9933 loss[test]= 0.0927 accuracy[test]= 0.9708
Epoch 21: loss[train]= 0.0096 accuracy[train]= 0.9978 loss[test]= 0.0933 accuracy[test]= 0.9697
Epoch 22: loss[train]= 0.0051 accuracy[train]= 0.9989 loss[test]= 0.0932 accuracy[test]= 0.9730
Epoch 23: loss[train]= 0.0044 accuracy[train]= 0.9989 loss[test]= 0.0942 accuracy[test]= 0.9753
Epoch 24: loss[train]= 0.0012 accuracy[train]= 1.0000 loss[test]= 0.0968 accuracy[test]= 0.9764
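# Optional follow-up (a sketch; the 0.5 threshold and variable names are assumptions):
# evaluate the last trained network on the full test set and break the errors down.
pred_test, _ = net.forward(x['test'])
pred_labels = (pred_test[:, 0] > 0.5).astype(float)
print('test accuracy: %.4f' % np.mean(pred_labels == y['test']))
print('false positives:', int(np.sum((pred_labels == 1) & (y['test'] == 0))))
print('false negatives:', int(np.sum((pred_labels == 0) & (y['test'] == 1))))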