import numpy as np import pandas as pd import seaborn as sns from matplotlib import pyplot as plt import plotly.express as px from sklearn import decomponsition from sklearn import preprocessing import statistics as stats import mgen # Generating rotation matrices

np.set_printoptions(suppress=True) # Don't use scientific notation when printing numpy arrays

Data creation

# Function to normalize data normalize = lambda arr: preprocessing.normalize([arr])[0]

def gen3dData(m, c, size=20, noiseFactor=0.3): return genNdData(n=3, size=size, noiseFactor=noiseFactor) # y=mx+c xs = np.random.uniform(size=size) ys = [m * x**2 + c + np.random.normal() * noiseFactor for x in xs] zs = [m * x**2 + c + np.random.normal() * noiseFactor for x in xs] return xs, ys, zs # Generates n-dimensional data (n input variables) def genNdData(n=3, size=20, noiseFactor=0.3): variables = np.zeros((n, size)) variables[0] = xs = np.random.uniform(size=size) for dim in range(1, n): variables[dim] = [x**(dim+1) + np.random.normal() * noiseFactor for x in xs] variables = [normalize(xs) for xs in variables] return variables

def unzip(xs): return list( zip(*xs) ) # Same as transposing a matrix transpose = lambda xs: unzip(xs)

xs, ys = genNdData(n=2, size=20, noiseFactor=0.01) vectors2d = transpose([xs, ys]) sns.scatterplot(x=xs, y=ys) "Variance: ", stats.variance(xs)

PCA (sklearn)

# PCA pca = decomposition.PCA(n_components=1) X_reduced = pca.fit_transform(vectors2d) pca.explained_variance_ratio_ pca.components_

# Plot PCA xs_reduced = np.transpose(X_reduced)[0] sns.stripplot(x=xs_reduced, jitter=0.03)

The approximation

vectorToRotate = [1, 0, 0] rotation_degrees = 90 rotation_radians = np.radians(rotation_degrees) plane_v1 = np.array([1, 0, 0]) plane_v2 = np.array([0, 1, 0]) # Create two rotation matrices to compare M1 = mgen.rotation_around_axis([0, 0, 1], rotation_radians) M2 = mgen.rotation_from_angle_and_plane(rotation_radians, plane_v1, plane_v2) # e-16 is a floating point error and can be considered 0 print(M1.dot(vectorToRotate)) print(M2.dot(vectorToRotate))

PCA approximation in 2D

# Rotate a single vector in 2D # (In 2D there is only one plane to rotate on) def rotate(vec, angle_deg): angle_rad = np.radians(angle_deg) M = mgen.rotation_from_angle(angle_rad) return M.dot(vec) # Rotate a list of vectors in 2D def rotateAll(vectors, angle_deg): angle_rad = np.radians(angle_deg) M = mgen.rotation_from_angle(angle_rad) return [M.dot(v) for v in vectors]

def xsOf(vectors): return list( list(unzip(vectors))[0] ) def ysOf(vectors): return list( list(unzip(vectors))[1] )

def plot1d(xs): sns.stripplot(x=xs, jitter=0.03) def plot2d(vectors): xs, ys = unzip(vectors) return sns.scatterplot(xs, ys)

angle = 0 angle_change = 1 var = -1 var_new = 0 vectors = np.copy(vectors2d) while var < var_new: var = var_new vectors_left = rotateAll(vectors, -angle_change) vectors_right = rotateAll(vectors, angle_change) var_left = stats.variance(xsOf(vectors_left)) var_right = stats.variance(xsOf(vectors_right)) if var_left < var_right: var_new = var_right angle += angle_change vectors = vectors_right elif var_right < var_left: var_new = var_left angle -= angle_change vectors = vectors_left else: break print(f'Optimal rotation: {angle}° clockwise') print(f'Variance along X-axis (1st PC): {var_new}')

# Variance explained xs, ys = unzip(vectors2d) var_explained = var_new/(stats.variance(xs) + stats.variance(ys)) print( "Variance explained by approximation:", var_explained) print( "Variance explained by PCA:", pca.explained_variance_ratio_) print("Error: ", (var_explained - pca.explained_variance_ratio_)/ pca.explained_variance_ratio_)

plot1d(xsOf(vectors))

PCA approximation in 3D and higher

def plot3d(vectors): xs, ys, zs = unzip(vectors) fig = px.scatter_3d(x=xs, y=ys, z=zs) fig.show()

# Generate data xs, ys, zs = genNdData(n=3, noiseFactor=0.02) vectors3d = transpose([xs, ys, zs])

df = pd.DataFrame(vectors3d) sns.pairplot(df)

plot3d(vectors3d)

# Convert two axes into orthogonal vectors that describe their plane def planeToVectors(dimension, axis1, axis2): v1 = np.zeros(dimension) v2 = np.zeros(dimension) v1[axis1] = 1 v2[axis2] = 1 return v1, v2 # e.g. In 3D between X and Z axes # planeToVectors(3, 0, 2) # -> [1 0 0], [0 0 1]

# Perform a clock-wise rotation by [rotation_deg]° on a list of vectors on the given plane def rotateOnPlane(plane_v1, plane_v2, rotation_deg, vectors): # Create rotation matrix rotation_rad = np.radians(rotation_deg) M = mgen.rotation_from_angle_and_plane(rotation_rad, plane_v1, plane_v2) # Rotate all vectors return [M.dot(v) for v in vectors]

# Tries rotating vectors in both directions on the plane described by the two axes. # Chooses the direction that maximises the variance along the optimisation_axis. # # Returns (rotation_deg or -rotation_deg), new_variance, rotated_vectors def bestRotationOnAxes(optimisation_axis, dimension, axis1, axis2, rotation_deg, vectors): # Generate orthogonal vectors describing the plane plane_v1, plane_v2 = planeToVectors(dimension, axis1, axis2) # Try rotating left (anti-clockwise) vectors_left = rotateOnPlane(plane_v1, plane_v2, -rotation_deg, vectors) points = unzip(vectors_left)[optimisation_axis] variance_left = stats.variance(points) # Try rotating right (clockwise) vectors_right = rotateOnPlane(plane_v1, plane_v2, rotation_deg, vectors) points = unzip(vectors_right)[optimisation_axis] variance_right = stats.variance(points) # Choose rotation the maximises the variance if variance_left > variance_right: return -rotation_deg, variance_left, vectors_left else: return rotation_deg, variance_right, vectors_right

# Rotates the points by [rotation_deg]° on every plane allowed to improve the variance along the optimisation_axis # # To improve the X-axis alignment, the points are allowed to rotate on every plane, # but to improve the Y-axis, the rotations may not affect the X-axis alignment # In 3D: For X-axis alignment rotation on XY and XZ plane (YZ is also allowed, but will not affect X-axis) # For Y-axis alignment rotation on YZ, but not YX plane def improveAxisAlignmentBy(optimisation_axis, dimensions, rotation_deg, vectors): if optimisation_axis == dimensions-1: points = unzip(vectors)[optimisation_axis] return stats.variance(points), vectors # TODO # Axes to pair with optimisation_axis to form the plane to rotate on. # Assumes all axes < optimisation_axis to be fixed and immutable for axis2 in range(optimisation_axis+1, dimensions): angle_diff, var, vectors = bestRotationOnAxes( optimisation_axis = optimisation_axis, dimension = dimensions, axis1 = optimisation_axis, axis2 = axis2, rotation_deg = rotation_deg, vectors = vectors ) return var, vectors

# Improve alignment along the optimisation_axis by rotating on the given plane (axis1, axis2) # until a maxima in the variance is reached. def optimiseAxisOnPlane(optimisation_axis, dimensions, axis1, axis2, rotation_deg, vectors): var_old = -1 var = 0 while var > var_old: var_old = var angle_diff, var, vectors = bestRotationOnAxes( optimisation_axis = optimisation_axis, dimension = dimensions, axis1 = axis1, axis2 = axis2, rotation_deg = rotation_deg, vectors = vectors ) return var, vectors

# Find the nth principal component (target_pc) by optimising variance along its corresponding axis # by rotating on all available planes. # Makes sure to not affect previous principal components. # # It is highly recommended to execute it in order of increasing PC number def approximatePC(from_dimension, target_pc, precision, vectors, log_factor=1.5): # Requires that prior PCs are already approximated in the input vectors var_old = -1 var = 0 # Instead of improving by a little bit every time, initially make big adjustments and get finer and finer # Approach is inspired by binary search. angle = 90 # Start at 90° changes instead of 180°, since flipping by 180° results in the same variance while angle > precision: var_old = var var, vectors = improveAxisAlignmentBy(target_pc, from_dimension, angle, vectors) angle = angle / log_factor return var, vectors

# Simulates PCA for # any number of input dimensions >= 2 # and any number of desired Principal Components <= no. of dimensions def myPCA(input_vectors, components, log_factor=1.5): precision = .1 # Smallest rotation-angle size (in degrees) dimensions = len(input_vectors[0]) target_dimensions = components # No. of Principal Components to find if target_dimensions > dimensions: raise Exception(f'Requested more components ({components}) than input data has dimensions ({dimensions})') variances = np.zeros(target_dimensions) vectors_new = np.copy(input_vectors) # Compute Principal Components for pc in range(target_dimensions): variances[pc], vectors_new = approximatePC(dimensions, pc, precision, vectors_new, log_factor) # Calculate variances explained by components total_var = 0 for points in unzip(input_vectors): total_var += stats.variance(points) varsExplained = [var / total_var for var in variances] return { "variance explained": varsExplained, "components": unzip(vectors_new)[:target_dimensions] }

vectors3d = transpose(genNdData(n=3, size=100)) results = myPCA(vectors3d, 3) results['variance explained']

vectors5d = transpose(genNdData(n=5, size=100)) results = myPCA(vectors5d, 3) results['variance explained']

vectors10d = transpose(genNdData(n=10, size=100)) results = myPCA(vectors10d, 3) results['variance explained']

df = pd.DataFrame(transpose(results['components'])) sns.pairplot(df) plot3d(transpose(results['components']))

# PCA X = np.ndarray.transpose(np.array(unzip(vectors10d))) pca = decomposition.PCA(n_components=3) X_reduced = pca.fit_transform(X) print("Standard PCA explained variances:", pca.explained_variance_ratio_) myPCAErrors = np.absolute(results['variance explained'] - pca.explained_variance_ratio_) / pca.explained_variance_ratio_ print("myPCA errors per component:", myPCAErrors)

.css-15w88e5{color:var(--chakra-colors-fg-neutral-primary);font-weight:inherit;letter-spacing:-0.09px;}Data creation

PCA (sklearn)

The approximation

PCA approximation in 2D

PCA approximation in 3D and higher

Data creation