Gradient Pseudo-swap
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Creating X: 1000 random numbers drawn uniformly from [0, 1)
X = np.random.random(size=1000)
# Creating y: 1 if X falls below a noisy threshold centred at 0.7, else 0
mask = np.random.normal(0.7, 0.1, len(X))
y = np.empty(len(X))
for i in range(len(X)):
    y[i] = 1 if X[i] < mask[i] else 0
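Since X is uniform on [0, 1) and the noisy threshold is centred at 0.7, roughly 70% of the labels should come out as 1. A quick sanity check (not part of the original cells; the exact numbers vary with the random draw):

# Expect a positive-label fraction of roughly 0.7
print(f"Fraction of positive labels: {y.mean():.3f}")
print(f"Class counts: {np.bincount(y.astype(int))}")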
# Visualizing the dataset
plt.figure(figsize=(12, 6));
sns.scatterplot(x=X, y=np.random.random(len(X)), hue=y);
plt.xlabel("$X$");
sns.despine(left=True);
plt.yticks([]);
plt.title("Visualizing the Dataset");
# The "true" forward function: a hard step that is 1 below theta and 0 above it
def func(x, theta):
    return 1 if x < theta else 0

# The surrogate ("pseudo-swap") variant: a linear stand-in whose gradient is used
# in place of the step function's gradient, which is zero almost everywhere
def func_variant(x, theta):
    return -x + theta + 0.5
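The step function `func` is flat on either side of `theta`, so differentiating it directly gives no learning signal. The pseudo-swap runs the forward pass with `func` but differentiates as if `func_variant` had been used: its slope is -1 with respect to `x` and +1 with respect to `theta`. A minimal illustration of the difference (the variable names here are just for this check):

eps = 1e-4
x0, theta0 = 0.3, 0.5
# Finite difference of the step w.r.t. theta: zero almost everywhere
step_grad = (func(x0, theta0 + eps) - func(x0, theta0 - eps)) / (2 * eps)
# Finite difference of the surrogate w.r.t. theta: a constant, usable slope
surrogate_grad = (func_variant(x0, theta0 + eps) - func_variant(x0, theta0 - eps)) / (2 * eps)
print(step_grad, surrogate_grad)  # 0.0 and approximately 1.0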
def mse(y_true, y_pred):
    return np.mean((y_true - y_pred)**2)
def forward_propagation(X, theta):
    # Vectorised version of func: 1 where X < theta, 0 elsewhere
    return np.array(X < theta).astype(np.int32)

def back_propagation(X, y, theta):
    y_pred = forward_propagation(X, theta)
    # Pseudo-swap: use d(func_variant)/d(theta) = 1, so dC/dtheta = mean(2*(y_pred - y))
    C_wrt_theta = np.mean(2*(y_pred - y))
    return C_wrt_theta
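To see why this surrogate gradient pushes `theta` the right way: with `theta` initialised at 0.1 the forward pass predicts 0 for almost every sample, while roughly 70% of the true labels are 1, so `mean(2*(y_pred - y))` is strongly negative and the update `theta - lr * gradient` increases `theta` toward the true threshold near 0.7. A quick check (values are approximate and depend on the random draw):

# Most predictions are 0 while most labels are 1, so the gradient should be clearly negative
g = back_propagation(X, y, theta=0.1)
print(f"surrogate gradient at theta=0.1: {g:.3f}")  # roughly -1.2 expected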
def gradient_descent(X, y, theta, learning_rate=0.05):
    C_wrt_theta = back_propagation(X, y, theta)
    theta_new = theta - (learning_rate*C_wrt_theta)
    return theta_new
n_epochs = 50
learning_rate = 0.05
# Let's initialize theta to a value far away from its optimal state
theta = 0.1
cost = mse(forward_propagation(X, theta), y)
theta_hist, gradient_hist, cost_hist = [theta], [back_propagation(X, y, theta)], [cost]
for epoch in range(n_epochs):
    print(f"\rOn epoch {epoch}/{n_epochs} - {round((epoch+1)*100/n_epochs, 2)}% - MSE: {round(cost, 3)} - theta: {round(theta, 3)} ", end="")
    theta = gradient_descent(X, y, theta, learning_rate=learning_rate)
    cost = mse(forward_propagation(X, theta), y)
    theta_hist.append(theta)
    gradient_hist.append(back_propagation(X, y, theta))
    cost_hist.append(cost)
On epoch 49/50 - 99.9% - MSE: 0.083 - theta: 0.7
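One way to see the swap at work is to plot how `theta` and the cost evolve over the run. This is only a suggested visualization of the histories collected above, not part of the original experiment:

# Plot the trajectory of theta and the MSE over the 50 epochs
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].plot(theta_hist)
axes[0].set_xlabel("epoch"); axes[0].set_ylabel(r"$\theta$")
axes[1].plot(cost_hist)
axes[1].set_xlabel("epoch"); axes[1].set_ylabel("MSE")
plt.tight_layout()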
from copy import copy
sigmoid = lambda x: 1 / (1 + np.exp(-x))
sigmoid_derivative = lambda x: sigmoid(x)*(1 - sigmoid(x))
def mse(y_true, y_pred):
    return np.mean((y_true - y_pred)**2)
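As a quick sanity check on the identity used in `sigmoid_derivative`, sigma'(x) = sigma(x)(1 - sigma(x)), one can compare it against a central finite difference. This check is not in the original notebook:

# Analytic sigmoid derivative vs. a central finite-difference estimate
eps = 1e-5
for x in (-2.0, 0.0, 2.0):
    analytic = sigmoid_derivative(x)
    numeric = (sigmoid(x + eps) - sigmoid(x - eps)) / (2 * eps)
    print(f"x={x:+.1f}  analytic={analytic:.6f}  numeric={numeric:.6f}")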
def forward_propagation(X, param_dict, return_stages=False):
    W1, b1, theta, W3, b3, W4, b4 = param_dict["W1"], param_dict["b1"], param_dict["theta"], param_dict["W3"], param_dict["b3"], param_dict["W4"], param_dict["b4"]
    # Layer 1: affine + sigmoid
    h1 = np.matmul(W1, X) + b1
    a1 = sigmoid(h1)
    # Layer 2: the non-differentiable step, 1 if any unit of a1 falls below theta
    h2 = np.apply_along_axis(lambda x: 1 if (x < theta).any() else 0, axis=0, arr=a1).reshape(1, -1)
    # Layer 3: affine + sigmoid
    h3 = np.matmul(W3, h2) + b3
    a3 = sigmoid(h3)
    # Layer 4 (output): affine + sigmoid
    h4 = np.matmul(W4, a3) + b4
    a4 = sigmoid(h4)
    if return_stages:
        return {"h1": h1, "a1": a1, "h2": h2, "h3": h3, "a3": a3, "h4": h4, "a4": a4}
    else:
        return a4
def back_propagation(X, y, stages_dict, param_dict):
    h1, a1, h2, h3, a3, h4, a4 = stages_dict["h1"], stages_dict["a1"], stages_dict["h2"], stages_dict["h3"], stages_dict["a3"], stages_dict["h4"], stages_dict["a4"]
    W1, b1, theta, W3, b3, W4, b4 = param_dict["W1"], param_dict["b1"], param_dict["theta"], param_dict["W3"], param_dict["b3"], param_dict["W4"], param_dict["b4"]
    # Standard backprop through the differentiable layers
    C_wrt_a4 = 2*(a4 - y)
    C_wrt_h4 = C_wrt_a4 * sigmoid_derivative(h4)
    C_wrt_a3 = np.matmul(W4.T, C_wrt_h4)
    C_wrt_h3 = C_wrt_a3 * sigmoid_derivative(h3)
    C_wrt_h2 = np.matmul(W3.T, C_wrt_h3)
    # Pseudo-swap through the step layer: differentiate as if h2 = -a1 + theta + 0.5,
    # so dh2/da1 = -1 (for each of the 3 units) and dh2/dtheta = +1
    C_wrt_a1 = np.matmul(-1*np.ones((3, 1)), C_wrt_h2)
    C_wrt_h1 = C_wrt_a1 * sigmoid_derivative(h1)
    # Parameter gradients, averaged over the batch
    C_wrt_W4 = np.matmul(C_wrt_h4, a3.T) / X.shape[1]
    C_wrt_b4 = np.mean(C_wrt_h4, axis=1).reshape((1, 1))
    C_wrt_W3 = np.matmul(C_wrt_h3, h2.T) / X.shape[1]
    C_wrt_b3 = np.mean(C_wrt_h3, axis=1).reshape((3, 1))
    C_wrt_theta = np.mean(C_wrt_h2, axis=1)[0]
    C_wrt_W1 = np.matmul(C_wrt_h1, X.T) / X.shape[1]
    C_wrt_b1 = np.mean(C_wrt_h1, axis=1).reshape((3, 1))
    return {"W1": C_wrt_W1, "b1": C_wrt_b1, "theta": C_wrt_theta, "W3": C_wrt_W3, "b3": C_wrt_b3, "W4": C_wrt_W4, "b4": C_wrt_b4}
def gradient_descent(X, y, param_dict, learning_rate=0.05):
    forward_prop = forward_propagation(X, param_dict, return_stages=True)
    gradients = back_propagation(X, y, forward_prop, param_dict)
    new_W1 = param_dict["W1"] - (learning_rate * gradients["W1"])
    new_b1 = param_dict["b1"] - (learning_rate * gradients["b1"])
    new_theta = param_dict["theta"] - (learning_rate * gradients["theta"])
    new_W3 = param_dict["W3"] - (learning_rate * gradients["W3"])
    new_b3 = param_dict["b3"] - (learning_rate * gradients["b3"])
    new_W4 = param_dict["W4"] - (learning_rate * gradients["W4"])
    new_b4 = param_dict["b4"] - (learning_rate * gradients["b4"])
    return {"W1": new_W1, "b1": new_b1, "theta": new_theta, "W3": new_W3, "b3": new_b3, "W4": new_W4, "b4": new_b4}
Shape walkthrough of the forward pass, with X of shape (1, 1000):

matmul(W1, X) + b1 := h1
(3, 1) * (1, 1000) + (3, 1) = (3, 1000)
sigmoid(h1) := a1
= (3, 1000)
1 if (a1 < theta).any() else 0 := h2
= (1, 1000)
matmul(W3, h2) + b3 := h3
(3, 1) * (1, 1000) + (3, 1) = (3, 1000)
sigmoid(h3) := a3
= (3, 1000)
matmul(W4, a3) + b4 := h4
(1, 3) * (3, 1000) + (1, 1) = (1, 1000)
sigmoid(h4) := a4
= (1, 1000)
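These shapes can be confirmed directly once `param_dict` and `stages_dict` have been created in the training cell below; a small check to run after that cell (not part of the original notebook):

# Expect (3, 1000) for h1/a1/h3/a3 and (1, 1000) for h2/h4/a4
for name, stage in stages_dict.items():
    print(name, np.shape(stage))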
n_epochs = 2000
batch_size = len(X)
learning_rate = 0.05
np.random.seed(42)
param_dict = {"W1": np.random.randn(3, 1)/2+0.5, "b1": np.random.randn(3, 1)/2+0.5,
"theta": np.random.randn()/2+0.5,
"W3": np.random.randn(3, 1)/2+0.5, "b3": np.random.randn(3, 1)/2+0.5,
"W4": np.random.randn(1, 3)/2+0.5, "b4": np.random.randn(1, 1)/2+0.5}
stages_dict = forward_propagation(X.reshape((1, -1)), param_dict, return_stages=True)
cost = mse(y, stages_dict["a4"])
cost_hist, param_hist, gradient_hist = [cost], [param_dict], [back_propagation(X.reshape((1, -1)), y, stages_dict, param_dict)]
n_batches = int(np.floor( len(X)/batch_size ))
for epoch in range(n_epochs):
    for i in range(n_batches):
        print(f"\rOn epoch {epoch}/{n_epochs} - {round(i*100/n_batches, 2)}% - MSE: {round(cost, 3)} - theta: {round(param_dict['theta'], 3)} ", end="")
        new_step = gradient_descent(X=X[i*batch_size:(i+1)*batch_size].reshape(1, -1),
                                    y=y[i*batch_size:(i+1)*batch_size], param_dict=param_dict,
                                    learning_rate=learning_rate)
        param_dict["W1"], param_dict["b1"], param_dict["theta"], param_dict["W3"], param_dict["b3"], param_dict["W4"], param_dict["b4"] = new_step["W1"], new_step["b1"], new_step["theta"], new_step["W3"], new_step["b3"], new_step["W4"], new_step["b4"]
    cost = mse(y, forward_propagation(X.reshape(1, -1), param_dict))
    cost_hist.append(cost)
    # Store a copy so the history is not a list of references to the same dict
    param_hist.append(copy(param_dict))
    stages_dict = forward_propagation(X.reshape((1, -1)), param_dict, return_stages=True)
    gradient_hist.append(back_propagation(X.reshape((1, -1)), y, stages_dict, param_dict))
On epoch 1999/2000 - 0.0% - MSE: 0.2 - theta: 1.289
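The final MSE of about 0.2 suggests the full network recovers the underlying rule less cleanly than the single-parameter model did; plotting the stored histories makes the trajectory easier to inspect. A possible visualization, not part of the original run:

# Plot the full-dataset MSE and the learned theta over the 2000 epochs
thetas = [p["theta"] for p in param_hist]
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].plot(cost_hist)
axes[0].set_xlabel("epoch"); axes[0].set_ylabel("MSE")
axes[1].plot(thetas)
axes[1].set_xlabel("epoch"); axes[1].set_ylabel(r"$\theta$")
plt.tight_layout()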
Average Forward Propagation after Training
-------------------------------
h1: [[1.62439825]
[0.61114907]
[0.79108641]]
a1: [[0.83322213]
[0.64758384]
[0.68568604]]
h2: [[1.]]
h3: [[1.10172577]
[0.48331673]
[1.46149753]]
a3: [[1.62439825]
[0.61114907]
[0.79108641]]
h4: [[0.95939172]]
a4: [[0.723]]
Average Forward Propagation Initially
------------------------
a4: [0.39975717]
h4: [-0.40647702]
a3: [0.75987759 0.63004222 0.80095298]
h3: [1.15200852 0.53239793 1.39226116]
h2: [1.]
a1: [0.83401584 0.64394283 0.6851901 ]
h1: [1.62927601 0.59466238 0.78778887]
X: [0.93512643 0.41987956 0.46406362 0.58630771 0.42449692 0.67519225
0.71912595 0.43278524 0.7246073 0.54919406]
W4: [-0.45664012 -0.36245892 0.21885624]
b4: [-0.00641556]
W3: [0.88371736 0.26526281 0.77128002]
b3: [0.26829115 0.26713512 0.62098114]
theta: [1.2896064077536957]
W1: [0.74835708 0.43086785 0.82384427]
b1: [1.26151493 0.38292331 0.38293152]
Average Back Propagation Initially
------------------------
a4: [-0.64648567]
h4: [-0.15512512]
a3: [ 0.07083636 0.05622648 -0.0339501 ]
h3: [ 0.01292506 0.01310578 -0.00541257]
h2: [0.01072396]
a1: [-0.01072396 -0.01072396 -0.01072396]
h1: [-0.00168499 -0.0025419 -0.00249668]
W4: [-0.11787611 -0.09773538 -0.12424793]
b4: [-0.15512512]
W3: [ 0.01292506 0.01310578 -0.00541257]
b3: [ 0.01292506 0.01310578 -0.00541257]
theta: [0.010723964812460325]
W1: [-0.00037038 -0.00052865 -0.00054115]
b1: [-0.00168499 -0.0025419 -0.00249668]
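Since the output layer ends in a sigmoid, its activations can be thresholded at 0.5 to turn the trained network into a classifier and compare it against the labels. A small follow-up check, not part of the original output:

# Threshold the trained network's output at 0.5 and measure training accuracy
preds = (forward_propagation(X.reshape(1, -1), param_dict) > 0.5).astype(int).flatten()
accuracy = np.mean(preds == y)
print(f"Training accuracy after the swap-based training: {accuracy:.3f}")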