import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
from sklearn import decomponsition
from sklearn import preprocessing
import statistics as stats
import mgen # Generating rotation matrices
np.set_printoptions(suppress=True) # Don't use scientific notation when printing numpy arrays
Data creation
# Function to normalize data
normalize = lambda arr: preprocessing.normalize([arr])[0]
def gen3dData(m, c, size=20, noiseFactor=0.3):
return genNdData(n=3, size=size, noiseFactor=noiseFactor)
# y=mx+c
xs = np.random.uniform(size=size)
ys = [m * x**2 + c + np.random.normal() * noiseFactor for x in xs]
zs = [m * x**2 + c + np.random.normal() * noiseFactor for x in xs]
return xs, ys, zs
# Generates n-dimensional data (n input variables)
def genNdData(n=3, size=20, noiseFactor=0.3):
variables = np.zeros((n, size))
variables[0] = xs = np.random.uniform(size=size)
for dim in range(1, n):
variables[dim] = [x**(dim+1) + np.random.normal() * noiseFactor for x in xs]
variables = [normalize(xs) for xs in variables]
return variables
def unzip(xs):
return list( zip(*xs) )
# Same as transposing a matrix
transpose = lambda xs: unzip(xs)
xs, ys = genNdData(n=2, size=20, noiseFactor=0.01)
vectors2d = transpose([xs, ys])
sns.scatterplot(x=xs, y=ys)
"Variance: ", stats.variance(xs)
PCA (sklearn)
# PCA
pca = decomposition.PCA(n_components=1)
X_reduced = pca.fit_transform(vectors2d)
pca.explained_variance_ratio_
pca.components_
# Plot PCA
xs_reduced = np.transpose(X_reduced)[0]
sns.stripplot(x=xs_reduced, jitter=0.03)
The approximation
vectorToRotate = [1, 0, 0]
rotation_degrees = 90
rotation_radians = np.radians(rotation_degrees)
plane_v1 = np.array([1, 0, 0])
plane_v2 = np.array([0, 1, 0])
# Create two rotation matrices to compare
M1 = mgen.rotation_around_axis([0, 0, 1], rotation_radians)
M2 = mgen.rotation_from_angle_and_plane(rotation_radians, plane_v1, plane_v2)
# e-16 is a floating point error and can be considered 0
print(M1.dot(vectorToRotate))
print(M2.dot(vectorToRotate))
PCA approximation in 2D
# Rotate a single vector in 2D
# (In 2D there is only one plane to rotate on)
def rotate(vec, angle_deg):
angle_rad = np.radians(angle_deg)
M = mgen.rotation_from_angle(angle_rad)
return M.dot(vec)
# Rotate a list of vectors in 2D
def rotateAll(vectors, angle_deg):
angle_rad = np.radians(angle_deg)
M = mgen.rotation_from_angle(angle_rad)
return [M.dot(v) for v in vectors]
def xsOf(vectors):
return list( list(unzip(vectors))[0] )
def ysOf(vectors):
return list( list(unzip(vectors))[1] )
def plot1d(xs):
sns.stripplot(x=xs, jitter=0.03)
def plot2d(vectors):
xs, ys = unzip(vectors)
return sns.scatterplot(xs, ys)
angle = 0
angle_change = 1
var = -1
var_new = 0
vectors = np.copy(vectors2d)
while var < var_new:
var = var_new
vectors_left = rotateAll(vectors, -angle_change)
vectors_right = rotateAll(vectors, angle_change)
var_left = stats.variance(xsOf(vectors_left))
var_right = stats.variance(xsOf(vectors_right))
if var_left < var_right:
var_new = var_right
angle += angle_change
vectors = vectors_right
elif var_right < var_left:
var_new = var_left
angle -= angle_change
vectors = vectors_left
else:
break
print(f'Optimal rotation: {angle}° clockwise')
print(f'Variance along X-axis (1st PC): {var_new}')
# Variance explained
xs, ys = unzip(vectors2d)
var_explained = var_new/(stats.variance(xs) + stats.variance(ys))
print( "Variance explained by approximation:", var_explained)
print( "Variance explained by PCA:", pca.explained_variance_ratio_)
print("Error: ", (var_explained - pca.explained_variance_ratio_)/ pca.explained_variance_ratio_)
plot1d(xsOf(vectors))
PCA approximation in 3D and higher
def plot3d(vectors):
xs, ys, zs = unzip(vectors)
fig = px.scatter_3d(x=xs, y=ys, z=zs)
fig.show()
# Generate data
xs, ys, zs = genNdData(n=3, noiseFactor=0.02)
vectors3d = transpose([xs, ys, zs])
df = pd.DataFrame(vectors3d)
sns.pairplot(df)
plot3d(vectors3d)
# Convert two axes into orthogonal vectors that describe their plane
def planeToVectors(dimension, axis1, axis2):
v1 = np.zeros(dimension)
v2 = np.zeros(dimension)
v1[axis1] = 1
v2[axis2] = 1
return v1, v2
# e.g. In 3D between X and Z axes
# planeToVectors(3, 0, 2)
# -> [1 0 0], [0 0 1]
# Perform a clock-wise rotation by [rotation_deg]° on a list of vectors on the given plane
def rotateOnPlane(plane_v1, plane_v2, rotation_deg, vectors):
# Create rotation matrix
rotation_rad = np.radians(rotation_deg)
M = mgen.rotation_from_angle_and_plane(rotation_rad, plane_v1, plane_v2)
# Rotate all vectors
return [M.dot(v) for v in vectors]
# Tries rotating vectors in both directions on the plane described by the two axes.
# Chooses the direction that maximises the variance along the optimisation_axis.
#
# Returns (rotation_deg or -rotation_deg), new_variance, rotated_vectors
def bestRotationOnAxes(optimisation_axis, dimension, axis1, axis2, rotation_deg, vectors):
# Generate orthogonal vectors describing the plane
plane_v1, plane_v2 = planeToVectors(dimension, axis1, axis2)
# Try rotating left (anti-clockwise)
vectors_left = rotateOnPlane(plane_v1, plane_v2, -rotation_deg, vectors)
points = unzip(vectors_left)[optimisation_axis]
variance_left = stats.variance(points)
# Try rotating right (clockwise)
vectors_right = rotateOnPlane(plane_v1, plane_v2, rotation_deg, vectors)
points = unzip(vectors_right)[optimisation_axis]
variance_right = stats.variance(points)
# Choose rotation the maximises the variance
if variance_left > variance_right:
return -rotation_deg, variance_left, vectors_left
else:
return rotation_deg, variance_right, vectors_right
# Rotates the points by [rotation_deg]° on every plane allowed to improve the variance along the optimisation_axis
#
# To improve the X-axis alignment, the points are allowed to rotate on every plane,
# but to improve the Y-axis, the rotations may not affect the X-axis alignment
# In 3D: For X-axis alignment rotation on XY and XZ plane (YZ is also allowed, but will not affect X-axis)
# For Y-axis alignment rotation on YZ, but not YX plane
def improveAxisAlignmentBy(optimisation_axis, dimensions, rotation_deg, vectors):
if optimisation_axis == dimensions-1:
points = unzip(vectors)[optimisation_axis]
return stats.variance(points), vectors
# TODO
# Axes to pair with optimisation_axis to form the plane to rotate on.
# Assumes all axes < optimisation_axis to be fixed and immutable
for axis2 in range(optimisation_axis+1, dimensions):
angle_diff, var, vectors = bestRotationOnAxes(
optimisation_axis = optimisation_axis,
dimension = dimensions,
axis1 = optimisation_axis,
axis2 = axis2,
rotation_deg = rotation_deg,
vectors = vectors
)
return var, vectors
# Improve alignment along the optimisation_axis by rotating on the given plane (axis1, axis2)
# until a maxima in the variance is reached.
def optimiseAxisOnPlane(optimisation_axis, dimensions, axis1, axis2, rotation_deg, vectors):
var_old = -1
var = 0
while var > var_old:
var_old = var
angle_diff, var, vectors = bestRotationOnAxes(
optimisation_axis = optimisation_axis,
dimension = dimensions,
axis1 = axis1,
axis2 = axis2,
rotation_deg = rotation_deg,
vectors = vectors
)
return var, vectors
# Find the nth principal component (target_pc) by optimising variance along its corresponding axis
# by rotating on all available planes.
# Makes sure to not affect previous principal components.
#
# It is highly recommended to execute it in order of increasing PC number
def approximatePC(from_dimension, target_pc, precision, vectors, log_factor=1.5):
# Requires that prior PCs are already approximated in the input vectors
var_old = -1
var = 0
# Instead of improving by a little bit every time, initially make big adjustments and get finer and finer
# Approach is inspired by binary search.
angle = 90 # Start at 90° changes instead of 180°, since flipping by 180° results in the same variance
while angle > precision:
var_old = var
var, vectors = improveAxisAlignmentBy(target_pc, from_dimension, angle, vectors)
angle = angle / log_factor
return var, vectors
# Simulates PCA for
# any number of input dimensions >= 2
# and any number of desired Principal Components <= no. of dimensions
def myPCA(input_vectors, components, log_factor=1.5):
precision = .1 # Smallest rotation-angle size (in degrees)
dimensions = len(input_vectors[0])
target_dimensions = components # No. of Principal Components to find
if target_dimensions > dimensions:
raise Exception(f'Requested more components ({components}) than input data has dimensions ({dimensions})')
variances = np.zeros(target_dimensions)
vectors_new = np.copy(input_vectors)
# Compute Principal Components
for pc in range(target_dimensions):
variances[pc], vectors_new = approximatePC(dimensions, pc, precision, vectors_new, log_factor)
# Calculate variances explained by components
total_var = 0
for points in unzip(input_vectors):
total_var += stats.variance(points)
varsExplained = [var / total_var for var in variances]
return {
"variance explained": varsExplained,
"components": unzip(vectors_new)[:target_dimensions]
}
vectors3d = transpose(genNdData(n=3, size=100))
results = myPCA(vectors3d, 3)
results['variance explained']
vectors5d = transpose(genNdData(n=5, size=100))
results = myPCA(vectors5d, 3)
results['variance explained']
vectors10d = transpose(genNdData(n=10, size=100))
results = myPCA(vectors10d, 3)
results['variance explained']
df = pd.DataFrame(transpose(results['components']))
sns.pairplot(df)
plot3d(transpose(results['components']))
# PCA
X = np.ndarray.transpose(np.array(unzip(vectors10d)))
pca = decomposition.PCA(n_components=3)
X_reduced = pca.fit_transform(X)
print("Standard PCA explained variances:", pca.explained_variance_ratio_)
myPCAErrors = np.absolute(results['variance explained'] - pca.explained_variance_ratio_) / pca.explained_variance_ratio_
print("myPCA errors per component:", myPCAErrors)