# Exploratory-analysis setup: data handling, plotting, and feature scaling.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
# IPython/Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Load the pre-cleaned dataset.
# NOTE(review): the path has no .csv extension — confirm it matches the file on disk.
df = pd.read_csv('data_clean')
# Quick sanity check of the first rows.
df.head()
# Checking for Correlation
df.corr()
# Correlation matrix of absolute values only: we are interested in the
# magnitude of association, not its direction (sign).
corr_mat = df.corr().abs()
corr_mat
# Visualize the correlation magnitudes.
# FIX: seaborn ignores `fmt` unless `annot=True` is also passed — the
# original passed `fmt='g'` alone, which had no effect.
sns.heatmap(corr_mat, annot=True, fmt='g')
# Keep only the upper triangle of the correlation matrix (k=1 excludes the
# diagonal) so each variable pair is examined exactly once.
triangle_mask = np.triu(np.ones(corr_mat.shape), k=1).astype(bool)
upper = corr_mat.where(triangle_mask)
upper
# Columns with at least one pairwise correlation above 0.45
max_corr = [col for col in upper.columns if (upper[col] > 0.45).any()]
max_corr
# Same scan with the looser 0.4 cutoff
max_corr = [col for col in upper.columns if (upper[col] > 0.4).any()]
max_corr
# Standardize every feature to zero mean / unit variance.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)
# fit_transform returns a bare ndarray; rebuild a labelled DataFrame.
# NOTE(review): assumes this list matches df's column order exactly —
# verify against the cleaned dataset.
scaled_feat_col = [
    'Occupation', 'Marital_Status', 'Product_Category_1',
    'Product_Category_2', 'Product_Category_3', 'Purchase',
    'gender_cat', 'Age_cat', 'City_Category_cat',
    'Stay_In_Current_City_Years_cat',
]
df_scaled = pd.DataFrame(df_scaled, columns=scaled_feat_col)
df_scaled.head()
# Covariance matrix of the standardized features (for standardized data
# this closely matches the correlation matrix).
cov_mat = pd.DataFrame(
    np.cov(df_scaled.T),
    columns=scaled_feat_col,
    index=scaled_feat_col,
)
cov_mat
# Upper triangle only (k=1 drops the diagonal), then visualize it.
cov_upper_mask = np.triu(np.ones(cov_mat.shape), k=1).astype(bool)
upper = cov_mat.where(cov_upper_mask)
sns.heatmap(upper)
# Columns whose absolute covariance with any other column exceeds 0.45.
max_cov = [col for col in upper.columns if (upper[col].abs() > 0.45).any()]
max_cov
# Repeat the scan with the looser 0.40 cutoff.
max_cov = [col for col in upper.columns if (upper[col].abs() > 0.40).any()]
max_cov
# Eigen-decomposition of the covariance matrix — the mathematical core of
# PCA: eigenvalues measure the variance captured along each principal
# direction, eigenvectors define the directions themselves.
from numpy.linalg import eig
eigenvalues, eigenvectors = eig(cov_mat)
# Wrap both in DataFrames for nicer notebook display.
eig_val = pd.DataFrame(eigenvalues)
eig_vec = pd.DataFrame(eigenvectors)
print(eig_val)
eig_vec
# Creating X and y: every feature except the target `Purchase`.
X = df.drop(['Purchase'], axis=1)
y = df['Purchase']
# Splitting the dataset in train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=10)
# Normalize: fit the scaler on the training set only (so no information
# from the test set leaks into the scaling), then transform both splits.
# BUG FIX: the original called scaler.transform(...) without assigning the
# result, so X_train/X_test were never actually scaled.
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Baseline: ordinary least-squares regression BEFORE PCA, so the error can
# be compared against the post-PCA model later.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse
lr = LinearRegression()
lr.fit(X_train, y_train)
# Evaluate on the held-out test split.
pred = lr.predict(X_test)
# Report RMSE (square root of the MSE), in the target's own units.
print(np.sqrt(mse(y_test, pred)))
# PCA: project the features onto the top principal components.
from sklearn.decomposition import PCA
# NOTE: the original comment claimed "8 components" while the code used 2;
# n_components=2 is kept to preserve the configured behavior.
pca = PCA(n_components=2)
# Fit on the training set only (avoids test-set leakage).
pca.fit(X_train)
# BUG FIX: the original discarded the results of pca.transform(...), so the
# model below was refit on the untransformed features and the "after PCA"
# RMSE was meaningless. Assign the projected data and train on it instead.
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
# Checking if PCA has worked by finding the RMSE of the LR model on the
# reduced feature space.
lr.fit(X_train_pca, y_train)
# Predicting the values using the projected test set.
pred = lr.predict(X_test_pca)
# RMSE of the post-PCA model, for comparison with the baseline above.
from sklearn.metrics import mean_squared_error as mse
print(np.sqrt(mse(y_test, pred)))