import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
class PCA_sklearn:
    """Convenience wrapper around sklearn's PCA.

    Standardizes the input data (zero mean, unit variance), fits a PCA
    with the requested number of components, and exposes the projected
    data as a pandas DataFrame (optionally with a 'target' column).
    """

    def __init__(self, data, n_components=2, target=None, colNames=None):
        """Fit PCA on standardized *data*.

        Parameters
        ----------
        data : array-like or DataFrame of shape (n_samples, n_features)
        n_components : int, clamped to n_features if larger
        target : optional target values appended as a 'target' column
        colNames : optional list of column names for the components;
            defaults to 'PC1', 'PC2', ...
        """
        self.data = data
        # Standardize features before PCA so each feature contributes
        # on a comparable scale.
        self.scaled_data = StandardScaler().fit_transform(self.data)
        # PCA cannot produce more components than there are features.
        self.n_components = min(n_components, self.data.shape[1])
        self.pca = PCA(n_components=self.n_components)
        self.principalComponents = self.pca.fit_transform(self.scaled_data)
        # 'x != None' is fragile for array-likes; 'is not None' is the
        # correct identity check.
        if colNames is not None:
            columns = colNames
        else:
            columns = ['PC' + str(i + 1) for i in range(self.n_components)]
        self.principalDf = pd.DataFrame(self.principalComponents, columns=columns)
        if target is not None:
            self.principalDf = pd.concat(
                [self.principalDf, pd.DataFrame(target, columns=['target'])], axis=1)

    # Accessors kept for backward compatibility with existing callers.
    def get_data(self):
        """Return the original (unscaled) input data."""
        return self.data

    def get_scaled_data(self):
        """Return the standardized input data."""
        return self.scaled_data

    def get_n_components(self):
        """Return the (possibly clamped) number of components."""
        return self.n_components

    def get_pca(self):
        """Return the fitted sklearn PCA object."""
        return self.pca

    def get_principalComponents(self):
        """Return the projected data as a numpy array."""
        return self.principalComponents

    def get_principalDf(self):
        """Return the projected data as a DataFrame (plus 'target' if given)."""
        return self.principalDf

    def get_explained_variance(self):
        """Return the explained-variance ratio per component (vector)."""
        return self.pca.explained_variance_ratio_
# --- Load the iris dataset and project it onto 2 principal components ---
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
features = ['sepal length', 'sepal width', 'petal length', 'petal width']
target = ['target']
features_target = features + target
# The UCI file has no header row, so supply the column names explicitly.
df = pd.read_csv(url, names=features_target)

X = df.loc[:, features]
y = df.loc[:, target]

# Fit PCA with 2 components; short 'PC1'/'PC2' names are used so the
# plotting code below can index the component columns directly.
# (A previous fit with long column names was dead code and was removed.)
pca = PCA_sklearn(X, n_components=2, target=y, colNames=['PC1', 'PC2'])
principalDf = pca.get_principalDf()
# --- Scatter plot of the 2-component projection, colored by class ---
plt.figure(figsize=(5, 5))
plt.xlabel('PC1', fontsize=15)
plt.ylabel('PC2', fontsize=15)
plt.title('PCA: 2 components projection', fontsize=20)
outClasses = np.unique(y)  # e.g. ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
colors = ['r', 'g', 'b']
for outClass, color in zip(outClasses, colors):
    # Select rows by a boolean mask rather than np.where + .loc:
    # np.where yields positional indices, which .loc reinterprets as
    # labels and only happens to work with a default RangeIndex.
    mask = (y['target'] == outClass).to_numpy()
    plt.scatter(principalDf.loc[mask, 'PC1'],
                principalDf.loc[mask, 'PC2'],
                c=color, label=outClass)
# Labeling each scatter directly is robust to artist creation order,
# unlike passing a list of names to plt.legend().
plt.legend()
plt.grid()
plt.show()  # required to actually display the figure when run as a script

# Report how much variance the kept components explain.
explained_variance = pca.get_explained_variance()
print("explained_variance per principal component [%]: ", np.round(explained_variance*100,decimals=2))
print("sum of explained_variance [%]: ", np.round(sum(explained_variance)*100, decimals=2))