# Exploratory-analysis setup: data handling, plotting, and feature scaling.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
# IPython/Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Load the pre-cleaned dataset.
# NOTE(review): the path has no .csv extension — confirm it matches the file on disk.
df = pd.read_csv('data_clean')
# Quick sanity check of the first rows.
df.head()
# Checking for Correlation
df.corr()
# Correlation matrix of absolute values only: we are interested in the
# magnitude of association, not its direction (sign).
corr_mat = df.corr().abs()
corr_mat
# Visualize the correlation magnitudes.
# FIX: seaborn ignores `fmt` unless `annot=True` is also passed — the
# original passed `fmt='g'` alone, which had no effect.
sns.heatmap(corr_mat, annot=True, fmt='g')
# Keep only the upper triangle of the correlation matrix (k=1 excludes the
# diagonal) so each variable pair is examined exactly once.
triangle_mask = np.triu(np.ones(corr_mat.shape), k=1).astype(bool)
upper = corr_mat.where(triangle_mask)
upper
# Columns with at least one pairwise correlation above 0.45
max_corr = [col for col in upper.columns if (upper[col] > 0.45).any()]
max_corr
# Same scan with the looser 0.4 cutoff
max_corr = [col for col in upper.columns if (upper[col] > 0.4).any()]
max_corr
# Standardize every feature to zero mean / unit variance.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)
# fit_transform returns a bare ndarray; rebuild a labelled DataFrame.
# NOTE(review): assumes this list matches df's column order exactly —
# verify against the cleaned dataset.
scaled_feat_col = [
    'Occupation', 'Marital_Status', 'Product_Category_1',
    'Product_Category_2', 'Product_Category_3', 'Purchase',
    'gender_cat', 'Age_cat', 'City_Category_cat',
    'Stay_In_Current_City_Years_cat',
]
df_scaled = pd.DataFrame(df_scaled, columns=scaled_feat_col)
df_scaled.head()
# Covariance matrix of the standardized features (for standardized data
# this closely matches the correlation matrix).
cov_mat = pd.DataFrame(
    np.cov(df_scaled.T),
    columns=scaled_feat_col,
    index=scaled_feat_col,
)
cov_mat
# Upper triangle only (k=1 drops the diagonal), then visualize it.
cov_upper_mask = np.triu(np.ones(cov_mat.shape), k=1).astype(bool)
upper = cov_mat.where(cov_upper_mask)
sns.heatmap(upper)
# Columns whose absolute covariance with any other column exceeds 0.45.
max_cov = [col for col in upper.columns if (upper[col].abs() > 0.45).any()]
max_cov
# Repeat the scan with the looser 0.40 cutoff.
max_cov = [col for col in upper.columns if (upper[col].abs() > 0.40).any()]
max_cov
# Eigen-decomposition of the covariance matrix — the mathematical core of
# PCA: eigenvalues measure the variance captured along each principal
# direction, eigenvectors define the directions themselves.
from numpy.linalg import eig
eigenvalues, eigenvectors = eig(cov_mat)
# Wrap both in DataFrames for nicer notebook display.
eig_val = pd.DataFrame(eigenvalues)
eig_vec = pd.DataFrame(eigenvectors)
print(eig_val)
eig_vec
# Creating X and y: every feature except the target `Purchase`.
X = df.drop(['Purchase'], axis=1)
y = df['Purchase']
# Splitting the dataset in train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=10)
# Normalize: fit the scaler on the training set only (so no information
# from the test set leaks into the scaling), then transform both splits.
# BUG FIX: the original called scaler.transform(...) without assigning the
# result, so X_train/X_test were never actually scaled.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Baseline: ordinary least-squares regression BEFORE PCA, so the error can
# be compared against the post-PCA model later.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
lr = LinearRegression()
lr.fit(X_train, y_train)
# Evaluate on the held-out test split.
pred = lr.predict(X_test)
# Report RMSE (square root of the MSE), in the target's own units.
print(np.sqrt(mse(y_test, pred)))
# PCA: project the features onto the top principal components.
from sklearn.decomposition import PCA
# NOTE: the original comment claimed "8 components" while the code used 2;
# n_components=2 is kept to preserve the configured behavior.
pca = PCA(n_components=2)
# Fit on the training set only (avoids test-set leakage).
pca.fit(X_train)
# BUG FIX: the original discarded the results of pca.transform(...), so the
# model below was refit on the untransformed features and the "after PCA"
# RMSE was meaningless. Assign the projected data and train on it instead.
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
# Checking if PCA has worked by finding the RMSE of the LR model on the
# reduced feature space.
lr.fit(X_train_pca, y_train)
# Predicting the values using the projected test set.
pred = lr.predict(X_test_pca)
# RMSE of the post-PCA model, for comparison with the baseline above.
from sklearn.metrics import mean_squared_error as mse
print(np.sqrt(mse(y_test, pred)))