Customer Segmentation using PCA and K-Means
Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
Loading data
data_df = pd.read_csv('/work/Mall_Customers.csv')
data_df.head()
print(data_df.shape)
print(data_df.info())
Checking for missing data
data_df.isnull().sum()
Dropping a column
# drop CustomerID; it is an identifier, not a feature
data_df = data_df.drop('CustomerID', axis=1)
data_df.head(2)
Data exploration
plt.figure(1, figsize=(15, 6))
n = 0
for x in ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']:
    n += 1
    plt.subplot(1, 3, n)
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    # distplot is deprecated in recent seaborn; histplot with a KDE is the replacement
    sns.histplot(data_df[x], bins=20, kde=True)
    plt.title('Distribution of {}'.format(x))
plt.show()
plt.figure(1, figsize=(15, 5))
sns.countplot(y='Gender', data=data_df)
plt.show()
plt.figure(1, figsize=(15, 7))
n = 0
for x in ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']:
    for y in ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']:
        n += 1
        plt.subplot(3, 3, n)
        plt.subplots_adjust(hspace=0.5, wspace=0.5)
        sns.regplot(x=x, y=y, data=data_df)
        # shorten multi-word labels such as 'Annual Income (k$)' to their first two words
        plt.ylabel(' '.join(y.split()[:2]))
plt.show()
Train/test split
train_X, test_X = train_test_split(data_df, test_size=0.2, random_state=42)
print(len(train_X), "train +", len(test_X), "test")
# take a copy of the training data for preprocessing
df = train_X.copy()
Prepare the data
# Fit and transform the Gender attribute into numeric codes
le = LabelEncoder()
le.fit(df.Gender)
df.loc[:,'Gender'] = le.transform(df.Gender)
df.head(3)
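As an aside, scikit-learn documents LabelEncoder for target labels; for an input feature, OrdinalEncoder (or one-hot encoding) is the more idiomatic choice. A minimal sketch on a fresh copy of the training split, so it does not re-encode the column transformed above:
from sklearn.preprocessing import OrdinalEncoder

# OrdinalEncoder expects 2-D input, hence the double brackets
df_alt = train_X.copy()
oe = OrdinalEncoder()
df_alt[['Gender']] = oe.fit_transform(df_alt[['Gender']])
df_alt.head(3)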
# Create scaler: scaler
scaler = StandardScaler()
scaler.fit(df)
# transform
data_scaled = scaler.transform(df)
data_scaled[0]
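A quick sanity check, as a minimal sketch: after StandardScaler every column should have mean roughly 0 and standard deviation roughly 1.
# per-column mean and std of the scaled matrix (expected: ~0 and ~1)
print(data_scaled.mean(axis=0).round(6))
print(data_scaled.std(axis=0).round(6))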
Principal Component Analysis (PCA)
pca = PCA()
# fit PCA
pca.fit(data_scaled)
# PCA features
features = range(pca.n_components_)
# PCA transformed data
data_pca = pca.transform(data_scaled)
pca.explained_variance_ratio_
plt.bar(features, pca.explained_variance_ratio_)
plt.xticks(features)
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal component')
plt.show()
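A cumulative view of the same ratios makes it easier to judge how many components to keep; a minimal sketch:
# cumulative explained variance: how much variance the first k components retain
cum_var = np.cumsum(pca.explained_variance_ratio_)
plt.plot(range(1, len(cum_var) + 1), cum_var, 'o-')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.show()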
# Principal component analysis (PCA) and singular value decomposition (SVD)
# are closely related; both can be applied to decompose any rectangular matrix.
# (A short check after the scatter plot below demonstrates the equivalence.)
pca2 = PCA(n_components=2, svd_solver='full')
# fit PCA
pca2.fit(data_scaled)
# PCA transformed data
data_pca2 = pca2.transform(data_scaled)
print(data_pca2.shape)
xs = data_pca2[:,0]
ys = data_pca2[:,1]
plt.scatter(xs, ys)
plt.grid(False)
plt.title('Scatter Plot of Customers data')
plt.xlabel('PCA-01')
plt.ylabel('PCA-02')
plt.show()
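To make the PCA/SVD relationship noted above concrete, a minimal sketch: the PCA scores equal U*S from the SVD of the centered data, up to the signs of the components.
# SVD of the (already centered) scaled data; scores = U * S
centered = data_scaled - data_scaled.mean(axis=0)
U, S, Vt = np.linalg.svd(centered, full_matrices=False)
svd_scores = U[:, :2] * S[:2]
# identical to the PCA projection up to per-component sign flips
print(np.allclose(np.abs(svd_scores), np.abs(data_pca2)))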
Determine the number of K-means clusters needed
# find the elbow: compute the inertia for different numbers of clusters
X = data_pca2
inertia = []
for n in range(1, 11):
    # n_init set explicitly; 10 was the long-standing scikit-learn default
    algorithm = KMeans(n_clusters=n, init='k-means++', n_init=10, random_state=42)
    algorithm.fit(X)
    inertia.append(algorithm.inertia_)
plt.figure(1, figsize=(15, 6))
plt.plot(np.arange(1, 11), inertia, 'o')
plt.plot(np.arange(1, 11), inertia, '-', alpha=0.5)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()
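The elbow read-off can be cross-checked with the silhouette score, which peaks for compact, well-separated clusters; a minimal sketch:
from sklearn.metrics import silhouette_score

# silhouette is undefined for a single cluster, so start at 2
for n in range(2, 11):
    km = KMeans(n_clusters=n, init='k-means++', n_init=10, random_state=42)
    print(n, round(silhouette_score(X, km.fit_predict(X)), 3))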
K-means clustering
# KMeans model
# 5 clusters to start with
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=0)
Pipeline
# Build pipeline
pipeline = make_pipeline(scaler, pca2, kmeans)
# fit the pipeline on the encoded training data (scaling and PCA happen inside the pipeline)
model_fit = pipeline.fit(df)
model_fit
# assign a cluster label to each training point
labels = model_fit.predict(df)
train_X['Clusters'] = labels
# Number of data points for each feature in each cluster
train_X.groupby('Clusters').count()
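The fitted centroids live in the 2-D PCA space. To read them in the original feature units, they can be mapped back through the pipeline's pca and standardscaler steps (make_pipeline names each step after its class, lowercased); a minimal sketch:
# centroids in PCA space -> scaled space -> original feature units
# note: Gender comes back on its encoded 0/1 scale
centers_pca = model_fit.named_steps['kmeans'].cluster_centers_
centers_scaled = model_fit.named_steps['pca'].inverse_transform(centers_pca)
centers_orig = model_fit.named_steps['standardscaler'].inverse_transform(centers_scaled)
pd.DataFrame(centers_orig, columns=df.columns).round(1)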
# Scatter plot visuals with labels
xs = data_pca2[:,0]
ys = data_pca2[:,1]
plt.scatter(xs, ys, c=labels)
plt.grid(False)
plt.title('Scatter Plot of Customers data')
plt.xlabel('PCA-01')
plt.ylabel('PCA-02')
plt.show()
Validate with test data
# encode Gender with the encoder already fitted on the training data
# (refitting on the test split could remap the categories)
test_X.loc[:, 'Gender'] = le.transform(test_X.Gender)
labels_test = model_fit.predict(test_X)
test_X['Clusters'] = labels_test
labels_test
# Number of data points for each feature in each cluster
test_X.groupby('Clusters').count()
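As a rough sanity check, the cluster proportions in the test split should resemble those in the training split if the segmentation generalises; a minimal sketch:
# normalised cluster shares, train vs. test
print(train_X['Clusters'].value_counts(normalize=True).sort_index().round(2))
print(test_X['Clusters'].value_counts(normalize=True).sort_index().round(2))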
Finally: a cluster summary heatmap
cluster_heatmap_df = train_X.groupby(['Clusters'])[['Age','Annual Income (k$)','Spending Score (1-100)']].median()
heatcmap = sns.diverging_palette(230, 20, as_cmap=True)
female_counts = train_X[train_X['Gender'] == 'Female'].groupby('Clusters')['Gender'].count()
cluster_heatmap_df['FemaleRatio'] = female_counts / train_X.groupby('Clusters')['Gender'].count() * 100
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
sns.heatmap(cluster_heatmap_df.T,
            square=True,
            linewidth=3,
            vmax=80,
            vmin=1,
            cmap=heatcmap,
            cbar=False,
            annot=True,
            fmt='3.0f',
            ax=ax);
ax.set_title('Cluster Summary');
plt.tight_layout()