%matplotlib inline
import numpy as np
import scipy.optimize
import sklearn.datasets
import matplotlib.pyplot as plt
# print arrays in fixed-point notation (no scientific notation) with 6 decimals
# and allow up to 200 characters per line before wrapping
np.set_printoptions(suppress=True, precision=6, linewidth=200)
# use the 'ggplot' matplotlib style for every figure created below
plt.style.use('ggplot')
def f(x, y):
    """Evaluate the toy objective x^2 + y^2 + x * (y + 2) + cos(3x).

    Works element-wise, so x and y may be scalars or NumPy arrays.
    """
    bowl = x ** 2 + y ** 2
    coupling = x * (y + 2)
    ripple = np.cos(3 * x)
    return bowl + coupling + ripple
def grad_x_f(x, y):
    """Partial derivative of f with respect to x: 2x - 3 sin(3x) + y + 2."""
    slope = 2 * x
    # derivative of the cos(3x) term
    slope = slope - 3 * np.sin(3 * x)
    slope = slope + y
    return slope + 2
def grad_y_f(x, y):
    """Partial derivative of f with respect to y: x + 2y."""
    doubled_y = 2 * y
    return x + doubled_y
def plot_f_contours():
    """Draw the contours of f over the square [-5, 5] x [-5, 5] on the current axes."""
    # the same default np.linspace(-5, 5) grid is used along both axes
    axis = np.linspace(-5, 5)
    grid_x, grid_y = np.meshgrid(axis, axis)
    heights = f(grid_x, grid_y)
    # filled contours first, then faint black outlines on top
    plt.contourf(grid_x, grid_y, heights, 50)
    plt.contour(grid_x, grid_y, heights, 50,
                alpha=0.2, colors='black', linestyles='solid')
    plt.xlabel('x')
    plt.ylabel('y')
# open a new figure and draw the contours of f on it
plt.figure(figsize=(10, 7))
plot_f_contours()
def optimize_f(x, y, step_size, steps):
    """Minimise f with fixed-step gradient descent starting from (x, y).

    Returns a tuple (x, y, f(x, y), x_hist, y_hist) where the two history
    lists contain every visited point, including the starting point.
    """
    visited_x = [x]
    visited_y = [y]
    for _ in range(steps):
        # evaluate both partial derivatives before moving either coordinate
        slope_x, slope_y = grad_x_f(x, y), grad_y_f(x, y)
        # step against the gradient
        x = x - step_size * slope_x
        y = y - step_size * slope_y
        # remember where we ended up
        visited_x.append(x)
        visited_y.append(y)
    return x, y, f(x, y), visited_x, visited_y
# helper that visualises the outcome of one gradient descent run
def plot_gradient_descent_results(x, y, val, x_hist, y_hist):
    """Show the descent path on the contours of f next to its learning curve.

    Only x_hist and y_hist are read; x, y and val are accepted but unused.
    """
    plt.figure(figsize=(20, 7))

    # left panel: the visited points drawn over the contour plot
    plt.subplot(1, 2, 1)
    plot_f_contours()
    plt.plot(x_hist, y_hist, '.-')

    # right panel: value of f at every visited point
    plt.subplot(1, 2, 2)
    path_values = f(np.array(x_hist), np.array(y_hist))
    plt.plot(path_values, '.r-')
    # the title reports f at the last visited point
    final_value = f(x_hist[-1], y_hist[-1])
    plt.title('Minimum value: %f' % final_value)
# baseline run: 10 steps of size 0.1 starting from (3, 2)
results = optimize_f(x=3, y=2, step_size=0.1, steps=10)
plot_gradient_descent_results(*results)
# TODO: tune the parameters to find a better optimum
# second run from the same start: step size 0.15 and 500 steps
results = optimize_f(x=3, y=2, step_size=0.15, steps=500)
plot_gradient_descent_results(*results)
def optimize_f(x, y, step_size, steps, decay=1.0):
    """Minimise f with gradient descent and a geometrically decaying step size.

    The step size is multiplied by decay at the start of every iteration, so
    the very first update already uses step_size * decay. With the default
    decay of 1.0 the step size stays constant.

    Returns a tuple (x, y, f(x, y), x_hist, y_hist) where the two history
    lists contain every visited point, including the starting point.
    """
    visited_x, visited_y = [x], [y]
    current_step = step_size
    for _ in range(steps):
        # gradients at the current point
        slope_x = grad_x_f(x, y)
        slope_y = grad_y_f(x, y)
        # shrink the step, then move against the gradient
        current_step = current_step * decay
        x = x - current_step * slope_x
        y = y - current_step * slope_y
        # remember where we ended up
        visited_x.append(x)
        visited_y.append(y)
    return x, y, f(x, y), visited_x, visited_y
# TODO: tune the parameters to find the local optimum
# run with a large initial step (0.5) that is multiplied by 0.95 on every iteration
results = optimize_f(x=3, y=2, step_size=0.5, steps=100, decay=0.95)
plot_gradient_descent_results(*results)
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def sigmoid_grad(x):
    """Derivative of the logistic function, exp(-x) / (1 + exp(-x))^2, element-wise.

    The derivative is an even function of x, so it is evaluated at -|x|.
    That keeps the exponential in (0, 1] and avoids the overflow (and the
    resulting inf / inf = nan) that exp(-x) causes for very negative x.
    """
    decayed = np.exp(-np.abs(x))
    return decayed / ((1 + decayed) ** 2)
# try with a random input
x = np.random.uniform(-10, 10, size=5)
print('x:', x)
print('sigmoid(x):', sigmoid(x))
print('sigmoid_grad(x):', sigmoid_grad(x))
# start with some random inputs
x = np.random.uniform(-2, 2, size=5)
# compute the symbolic gradient
print('Symbolic', sigmoid_grad(x))
# compute the numerical gradient with a central difference:
# evaluate sigmoid half an epsilon to either side of x and divide by epsilon
epsilon = 0.001
numerical_gradient = (sigmoid(x+0.5*epsilon)-sigmoid(x-0.5*epsilon))/epsilon
print('Numerical', numerical_gradient)
def relu(x):
    """Rectified linear unit max(0, x), applied element-wise."""
    floor = 0
    return np.maximum(floor, x)
def relu_grad(x):
    """Gradient of the ReLU: 1 where x is positive, 0 where x is zero or negative."""
    # sign(x) is -1, 0 or 1; clamping at 0 maps the negative side to 0
    direction = np.sign(x)
    return np.maximum(0, direction)
# try with a random input
x = np.random.uniform(-10, 10, size=5)
print('x:', x)
print('relu(x):', relu(x))
print('relu_grad(x):', relu_grad(x))
# central-difference estimate of the ReLU gradient, printed for comparison
# with the symbolic relu_grad values above
epsilon = 0.001
num_relu_grad = (relu(x+0.5*epsilon)-relu(x-0.5*epsilon))/epsilon
print('num_relu_grad(x)', num_relu_grad)
# TODO: compute and compare the symbolic and numerical gradients
# NOTE(review): the lines above already print both gradients; this TODO looks stale
# plot both activation functions (top row) and their gradients (bottom row)
# on 100 evenly spaced points in [-10, 10]
x = np.linspace(-10, 10, 100)
plt.figure(figsize=(15, 8))
# top left: sigmoid
plt.subplot(2, 2, 1)
plt.plot(x, sigmoid(x), label='Sigmoid')
plt.xlabel('x')
plt.legend(loc='upper left')
# top right: ReLU
plt.subplot(2, 2, 2)
plt.plot(x, relu(x), label='ReLU')
plt.xlabel('x')
plt.legend(loc='upper left')
# bottom left: sigmoid gradient
plt.subplot(2, 2, 3)
plt.plot(x, sigmoid_grad(x), label='Sigmoid gradient')
plt.xlabel('x')
plt.legend(loc='upper left')
# bottom right: ReLU gradient
plt.subplot(2, 2, 4)
plt.plot(x, relu_grad(x), label='ReLU gradient')
plt.xlabel('x')
plt.legend(loc='upper left');
def bce_loss(y, y_hat, eps=1e-12):
    """Element-wise binary cross-entropy between targets y and predictions y_hat.

    Computes -(y * log(y_hat) + (1 - y) * log(1 - y_hat)).

    y_hat is clipped to [eps, 1 - eps] before the logarithms are taken, so a
    prediction of exactly 0 or 1 gives a finite loss instead of inf or nan
    (0 * log(0) evaluates to nan in floating point).
    """
    y_hat = np.clip(y_hat, eps, 1 - eps)
    return -(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
def bce_loss_grad(y, y_hat, eps=1e-12):
    """Element-wise derivative of the binary cross-entropy with respect to y_hat.

    Computes -y / y_hat + (1 - y) / (1 - y_hat).

    y_hat is clipped to [eps, 1 - eps] first, so a prediction of exactly
    0 or 1 gives a finite gradient instead of a division by zero.
    """
    y_hat = np.clip(y_hat, eps, 1 - eps)
    return (-y) / y_hat + (1 - y) / (1 - y_hat)
# try with some random inputs: binary targets and predictions drawn from [0, 1)
y = np.random.randint(2, size=5)
y_hat = np.random.uniform(0, 1, size=5)
print('y:', y)
print('y_hat:', y_hat)
print('bceloss(y, y_hat):', bce_loss(y, y_hat))
# central-difference estimate of the gradient with respect to y_hat,
# printed next to the symbolic gradient for comparison
epsilon = 0.00001
num_bce_grad = (bce_loss(y, y_hat+0.5*epsilon)-bce_loss(y,y_hat-0.5*epsilon))/epsilon
print('num_bce_grad(y, y_hat):', num_bce_grad)
print('bce_loss_grad(y, y_hat):', bce_loss_grad(y, y_hat))
# initialize parameters: five random weights and one random bias,
# read as module-level globals by the model function below
w = np.random.uniform(size=5)
b = np.random.rand()
# implement the model
def fn(x, y):
    """Forward and backward pass of a single logistic unit.

    Uses the module-level weights w and bias b. Returns the binary
    cross-entropy loss for target y and the gradient of that loss with
    respect to the input x.

    NOTE(review): written for a single 1-D sample x, as in the calls in this
    file; batched input is not handled.
    """
    # forward: pre-activation h, prediction y_hat, loss
    h = np.dot(np.transpose(x), w) + b
    y_hat = sigmoid(h)
    loss = bce_loss(y, y_hat)
    # backward: apply the chain rule one stage at a time
    grad_y_hat = bce_loss_grad(y, y_hat)
    # dL/dh = dL/dy_hat * sigmoid'(h); the sigmoid derivative is taken at h,
    # not at the incoming gradient
    grad_h = grad_y_hat * sigmoid_grad(h)
    # dh/dx = w, so the scalar dL/dh scales the weight vector
    grad_x = grad_h * w
    return loss, grad_x
# test with a random input of five features and a positive target
x = np.random.uniform(size=5)
y = 1
loss, grad_x = fn(x, y)
print("Loss", loss)
print("Gradient", grad_x)
# draw a fresh random sample with a positive target
x = np.random.uniform(size=5)
y = 1
# width of the central difference
eps = 0.00001
numerical_grad = np.zeros(x.shape)
# estimate the gradient one coordinate of x at a time
for i in range(len(x)):
    # two copies of x that differ only in coordinate i, by +eps/2 and -eps/2
    x_a = x.copy()
    x_b = x.copy()
    x_a[i] = x_a[i] + eps / 2
    x_b[i] = x_b[i] - eps / 2
    # the loss difference across the interval approximates the slope
    loss_a, _ = fn(x_a, y)
    loss_b, _ = fn(x_b, y)
    numerical_grad[i] = (loss_a - loss_b) / eps
# analytic gradient from the backward pass, for comparison
loss, symbolic_grad = fn(x, y)
print("Symbolic gradient")
print(symbolic_grad)
print("Numerical gradient")
print(numerical_grad)
# Computes y = x * W + b.
class Linear:
    """Fully connected layer with weights W of shape (n_in, n_out) and bias b.

    backward() stores the parameter gradients in self.dW and self.db;
    step() applies them with plain gradient descent.
    """

    def __init__(self, n_in, n_out):
        # initialize the weights randomly, using the Xavier (Glorot) uniform
        # rule: U(-a, a) with a = sqrt(6 / (fan_in + fan_out))
        a = np.sqrt(6 / (n_in + n_out))
        self.W = np.random.uniform(-a, a, size=(n_in, n_out))
        self.b = np.zeros((n_out,))

    def forward(self, x):
        """Return x @ W + b for a batch x of shape (batch, n_in)."""
        return np.dot(x, self.W) + self.b

    def backward(self, x, dy):
        """Given the output gradient dy, store dW and db and return dx.

        x is the input that was passed to forward(); dy has the shape of
        the forward output, (batch, n_out).
        """
        dx = np.dot(dy, np.transpose(self.W))
        self.dW = np.dot(np.transpose(x), dy)
        # b is broadcast over the batch axis in forward(), so its gradient is
        # the sum of dy over that axis: one value per output unit
        self.db = np.sum(dy, axis=0)
        return dx

    def step(self, step_size):
        """Apply one gradient descent update using the stored dW and db."""
        self.W = self.W - self.dW * step_size
        self.b = self.b - self.db * step_size

    def __str__(self):
        return 'Linear %dx%d' % self.W.shape
# Try the new class with some random values.
# Debugging tip: always choose a unique length for each
# dimension, so you'll get an error if you mix them up.
# Here: batch of 3 samples, 5 input features, 7 output units.
x = np.random.uniform(size=(3, 5))
layer = Linear(5, 7)
y = layer.forward(x)
# backpropagate a gradient of all ones with the same shape as the output
dx = layer.backward(x, np.ones_like(y))
print('y:', y)
print('dx:', dx)
# Computes y = 1 / (1 + exp(-x)).
class Sigmoid:
    """Element-wise logistic activation layer; it has no trainable parameters."""

    def forward(self, x):
        """Return sigmoid(x)."""
        activation = sigmoid(x)
        return activation

    def backward(self, x, dy):
        """Return dy * sigmoid'(x); the result is also kept in self.dx."""
        s = sigmoid(x)
        # the derivative of the sigmoid expressed through its own output
        local_slope = s * (1 - s)
        self.dx = dy * local_slope
        return self.dx

    def step(self, step_size):
        """Nothing to update."""
        return None

    def __str__(self):
        return 'Sigmoid'
# try the new class with some random values
x = np.random.uniform(size=(3, 5))
layer = Sigmoid()
y = layer.forward(x)
# backpropagate a gradient of all ones with the same shape as the output
dx = layer.backward(x, np.ones_like(y))
print('y:', y)
print('dx:', dx)
# Computes y = max(0, x).
class ReLU:
    """Element-wise rectifier layer; it has no trainable parameters."""

    def forward(self, x):
        """Return max(0, x) element-wise."""
        rectified = np.maximum(0, x)
        return rectified

    def backward(self, x, dy):
        """Pass dy through where x is strictly positive and zero it elsewhere."""
        # integer mask: 1 where the unit was active, 0 otherwise
        active = np.int64(x > 0)
        return np.multiply(dy, active)

    def step(self, step_size):
        """Nothing to update."""
        return None

    def __str__(self):
        return 'ReLU'
# try the new class with some random values on both sides of zero
x = np.random.uniform(-10, 10, size=(3, 5))
layer = ReLU()
y = layer.forward(x)
# backpropagate a gradient of all ones with the same shape as the output
dx = layer.backward(x, np.ones_like(y))
print('y:', y)
print('dx:', dx)
## Verify gradient computations for Linear
# test for dx
layer = Linear(5, 7)


def test_fn(x):
    """Scalar objective of the flattened input: twice the sum of the layer output."""
    batch = x.reshape(3, 5)
    # the factor 2 makes the check fail if backward ignores dy
    return 2 * np.sum(layer.forward(batch))


def test_fn_grad(x):
    """Analytic gradient of test_fn with respect to the flattened input."""
    batch = x.reshape(3, 5)
    # the incoming gradient carries the same factor 2 as the objective
    upstream = 2 * np.ones((3, 7))
    return layer.backward(batch, upstream).flatten()


err = scipy.optimize.check_grad(
    test_fn, test_fn_grad, np.random.uniform(-10, 10, size=3 * 5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
# test for dW
x = np.random.uniform(size=(3, 5))
layer = Linear(5, 7)


def test_fn(w):
    """Scalar objective of the flattened weights: twice the sum of the layer output."""
    # install the candidate weights before running the forward pass
    layer.W = w.reshape(5, 7)
    # the factor 2 makes the check fail if backward ignores dy
    return 2 * np.sum(layer.forward(x))


def test_fn_grad(w):
    """Analytic gradient of test_fn with respect to the flattened weights."""
    layer.W = w.reshape(5, 7)
    # the incoming gradient carries the same factor 2 as the objective
    upstream = 2 * np.ones((3, 7))
    layer.backward(x, upstream)
    return layer.dW.flatten()


err = scipy.optimize.check_grad(
    test_fn, test_fn_grad, np.random.uniform(-10, 10, size=5 * 7))
print("err on dW:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
# test for db
x = np.random.uniform(size=(3, 5))
layer = Linear(5, 7)


def test_fn(b):
    """Scalar objective of the bias vector: twice the sum of the layer output."""
    # install the candidate bias before running the forward pass
    layer.b = b
    # the factor 2 makes the check fail if backward ignores dy
    return 2 * np.sum(layer.forward(x))


def test_fn_grad(b):
    """Analytic gradient of test_fn with respect to the bias vector."""
    layer.b = b
    # the incoming gradient carries the same factor 2 as the objective
    upstream = 2 * np.ones((x.shape[0], 7))
    layer.backward(x, upstream)
    return layer.db


err = scipy.optimize.check_grad(
    test_fn, test_fn_grad, np.random.uniform(-10, 10, size=7))
print("err on db:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
## Verify gradient computation for Sigmoid
# test for dx
layer = Sigmoid()


def test_fn(x):
    """Scalar objective: sum of twice the sigmoid output."""
    # the factor 2 makes the check fail if backward ignores dy
    doubled = 2 * layer.forward(x)
    return np.sum(doubled)


def test_fn_grad(x):
    """Analytic gradient of test_fn with respect to x."""
    # the incoming gradient carries the same factor 2 as the objective
    upstream = 2 * np.ones(x.shape)
    return layer.backward(x, upstream)


err = scipy.optimize.check_grad(
    test_fn, test_fn_grad, np.random.uniform(-10, 10, size=5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
## Verify gradient computation for ReLU
# test for dx
layer = ReLU()


def test_fn(x):
    """Scalar objective: twice the sum of the ReLU output."""
    # multiply the output with a constant to check if
    # the gradient uses dy
    return 2 * np.sum(layer.forward(x))


def test_fn_grad(x):
    """Analytic gradient of test_fn with respect to x."""
    # multiply the incoming dy gradient with a constant
    return layer.backward(x, 2 * np.ones(x.shape))


# Use inputs on BOTH sides of zero so the zero-gradient branch is checked too.
# Magnitudes stay in [1, 10], well away from the kink at 0 where the finite
# difference used by check_grad would be unreliable.
err = scipy.optimize.check_grad(
    test_fn, test_fn_grad,
    np.array([-1.0, 1.0, -1.0, 1.0, -1.0]) * np.random.uniform(1, 10, size=5))
print("err on dx:", "OK" if np.abs(err) < 1e-5 else "ERROR", err)
class Net:
    """Sequential container that chains layers for forward and backward passes.

    Every layer must provide forward(x), backward(x, dy) and step(step_size).
    """

    def __init__(self, layers):
        self.layers = layers

    def forward(self, x):
        """Run x through all layers in order.

        Returns the final output and a trace: a list of (layer, input) pairs
        that backward() needs. A network without layers returns x unchanged
        together with an empty trace.
        """
        trace = []
        # start from the input so that an empty layer list acts as the identity
        y = x
        for layer in self.layers:
            # store the input this layer received, for the backward pass
            trace.append((layer, y))
            y = layer.forward(y)
        return y, trace

    def backward(self, trace, dy):
        """Backpropagate dy through the traced layers in reverse order.

        Returns the gradient with respect to the network input.
        """
        for layer, x in reversed(trace):
            # each layer turns the gradient of its output into that of its input
            dy = layer.backward(x, dy)
        return dy

    def step(self, learning_rate):
        """Apply the gradient descent update of every layer."""
        for layer in self.layers:
            layer.step(learning_rate)

    def __str__(self):
        return '\n'.join(str(l) for l in self.layers)
# load the digits dataset
# NOTE(review): load_digits() is called without restricting the classes, so
# this presumably loads every digit class, not just the first two - confirm
dataset = sklearn.datasets.load_digits()
digits_x, digits_y = dataset['data'], dataset['target']
# create a binary classification problem:
# label 1.0 for digits below 5 and 0.0 for all other digits
digits_y = (digits_y < 5).astype(float)
# plot the first ten digits side by side as 8x8 grayscale images
plt.figure(figsize=(10, 2))
plt.imshow(np.hstack([digits_x[i].reshape(8, 8) for i in range(10)]), cmap='gray')
plt.grid(False)
plt.tight_layout()
plt.axis('off')
# standardize the values in place: subtract the global mean and divide by the
# global standard deviation (the result is NOT confined to [0, 1])
digits_x -= np.mean(digits_x)
digits_x /= np.std(digits_x)
# print some statistics
print('digits_x.shape:', digits_x.shape)
print('digits_y.shape:', digits_y.shape)
print('min, max values:', np.min(digits_x), np.max(digits_x))
print('labels:', np.unique(digits_y))
# make a 50%/50% train/test split
train_prop = 0.5
n_train = int(digits_x.shape[0] * train_prop)
# shuffle the images by drawing a random permutation of the sample indices
idxs = np.random.permutation(digits_x.shape[0])
# the first n_train shuffled samples form the training set, the rest the test
# set; x and y are dicts keyed by 'train' and 'test'
x = {'train': digits_x[idxs[:n_train]],
'test': digits_x[idxs[n_train:]]}
y = {'train': digits_y[idxs[:n_train]],
'test': digits_y[idxs[n_train:]]}
print('Training samples:', x['train'].shape[0])
print('Test samples:', x['test'].shape[0])
def _plot_learning_curves(loss_hist, accuracy_hist):
    """Plot the per-epoch loss and accuracy histories in two side-by-side panels."""
    plt.figure(figsize=(20, 5))
    panels = (('BCE loss', loss_hist), ('Accuracy', accuracy_hist))
    for position, (title, history) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        for phase in history:
            plt.plot(history[phase], label=phase)
        plt.title(title)
        plt.xlabel('Epoch')
        plt.legend()


def fit(net, x, y, epochs=25, learning_rate=0.001, mb_size=10):
    """Train net with minibatch gradient descent and plot the learning curves.

    x and y are dicts with the keys 'train' and 'test' holding the samples
    and the labels. In every epoch the network is first updated on the
    training minibatches and then evaluated on the test minibatches; the
    per-sample mean BCE loss and accuracy of both phases are printed.

    Every sample is used: when the number of samples is not a multiple of
    mb_size, the last minibatch is simply smaller. The statistics are
    weighted by minibatch size so that they are true per-sample means.

    Raises ValueError if mb_size is smaller than 1.
    """
    if mb_size < 1:
        raise ValueError('mb_size must be at least 1, got %r' % (mb_size,))

    # initialize the loss and accuracy history
    loss_hist = {'train': [], 'test': []}
    accuracy_hist = {'train': [], 'test': []}

    for epoch in range(epochs):
        # initialize the loss and accuracy for this epoch
        loss = {'train': 0.0, 'test': 0.0}
        accuracy = {'train': 0.0, 'test': 0.0}

        # first train on training data, then evaluate on the test data
        for phase in ('train', 'test'):
            n_samples = x[phase].shape[0]

            # loop over all minibatches, including a final partial one
            for start in range(0, n_samples, mb_size):
                # get the samples for the current minibatch
                x_mb = x[phase][start:start + mb_size]
                # the added axis turns the labels into a column
                y_mb = y[phase][start:start + mb_size, None]
                batch_len = x_mb.shape[0]

                # compute the forward pass through the network
                pred_y, trace = net.forward(x_mb)

                # accumulate sample-weighted loss and accuracy
                loss[phase] += np.mean(bce_loss(y_mb, pred_y)) * batch_len
                accuracy[phase] += np.mean((y_mb > 0.5) == (pred_y > 0.5)) * batch_len

                # only update the network in the training phase
                if phase == 'train':
                    # compute the gradient for the loss
                    dy = bce_loss_grad(y_mb, pred_y)
                    # backpropagate the gradient through the network
                    net.backward(trace, dy)
                    # update the weights
                    net.step(learning_rate)

            # compute the mean loss and accuracy over all samples;
            # an empty phase has no meaningful statistics
            if n_samples:
                loss[phase] = loss[phase] / n_samples
                accuracy[phase] = accuracy[phase] / n_samples
            else:
                loss[phase] = float('nan')
                accuracy[phase] = float('nan')

            # add statistics to history
            loss_hist[phase].append(loss[phase])
            accuracy_hist[phase].append(accuracy[phase])

        print('Epoch %3d: loss[train]=%7.4f accuracy[train]=%7.4f loss[test]=%7.4f accuracy[test]=%7.4f' %
              (epoch, loss['train'], accuracy['train'], loss['test'], accuracy['test']))

    # plot the learning curves
    _plot_learning_curves(loss_hist, accuracy_hist)
# construct network: 64 inputs -> 32 hidden units with ReLU -> 1 sigmoid output
net = Net([
Linear(64, 32),
ReLU(),
Linear(32, 1),
Sigmoid()])
# TODO: tune the hyperparameters
# train on the train/test dicts x and y with minibatches of 10 samples
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 10)
# TODO: Your code here.
# construct network: same layer sizes as the 64-32-1 network, but without a
# nonlinearity between the two Linear layers
net = Net([
Linear(64, 32),
Linear(32, 1),
Sigmoid()])
# TODO: tune the hyperparameters
# train on the train/test dicts x and y with minibatches of 10 samples
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 10)
# TODO: Your code here.
# construct network: a single Linear layer followed by a Sigmoid,
# i.e. no hidden layer at all
net = Net([
Linear(64, 1),
Sigmoid()])
# TODO: tune the hyperparameters
# train on the train/test dicts x and y with minibatches of 10 samples
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 10)
# TODO: Your code here.
# construct network: three hidden ReLU layers of 32, 16 and 8 units,
# followed by a single sigmoid output
net = Net([
Linear(64, 32),
ReLU(),
Linear(32, 16),
ReLU(),
Linear(16,8),
ReLU(),
Linear(8,1),
Sigmoid()])
# TODO: tune the hyperparameters
# train on the train/test dicts x and y with minibatches of 10 samples
fit(net, x, y,
epochs = 25,
learning_rate = 0.01,
mb_size = 10)