Customer Segmentation using PCA and K-Means
(Two interactive notebook components failed to render in this export; their content is unavailable.)
Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
Loading data
# Load the raw mall-customer dataset.
data_df = pd.read_csv('/work/Mall_Customers.csv')
data_df.head()

# Quick sanity check: dimensions plus column names/dtypes/non-null counts.
print(data_df.shape)
print(data_df.info())
Checking for missing data
# Count missing values per column (isna is the modern alias of isnull).
data_df.isna().sum()
Dropping the identifier column
# CustomerID is a row identifier with no predictive value — remove it.
data_df = data_df.drop(columns='CustomerID')
data_df.head(2)
Data exploration
# --- Univariate distributions -------------------------------------------
# FIX: sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot(..., kde=True) is the supported replacement.
numeric_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
plt.figure(1, figsize=(15, 6))
for i, col in enumerate(numeric_cols, start=1):
    plt.subplot(1, 3, i)
    plt.subplots_adjust(hspace=0.5, wspace=0.5)
    sns.histplot(data_df[col], bins=20, kde=True)
    plt.title('Distribution of {}'.format(col))
plt.show()

# Count plot of the only categorical feature.
plt.figure(1, figsize=(15, 5))
sns.countplot(y='Gender', data=data_df)
plt.show()

# --- Pairwise relationships ---------------------------------------------
# 3x3 grid of regression plots for every ordered pair of numeric features
# (the diagonal regresses a feature on itself).
plt.figure(1, figsize=(15, 7))
n = 0
for x in numeric_cols:
    for y in numeric_cols:
        n += 1
        plt.subplot(3, 3, n)
        plt.subplots_adjust(hspace=0.5, wspace=0.5)
        sns.regplot(x=x, y=y, data=data_df)
        # Shorten labels to their first two words (same result as the
        # original conditional expression, just clearer).
        plt.ylabel(' '.join(y.split()[:2]))
plt.show()
Train/test split
# Hold out 20% of the customers to validate the clustering later.
train_X, test_X = train_test_split(data_df, test_size=0.2, random_state=42)
print(f"{len(train_X)} train + {len(test_X)} test")

# Work on a copy so the raw training split stays untouched.
df = train_X.copy()
Prepare the data
# Encode the Gender strings as integer codes (LabelEncoder orders the
# classes alphabetically). fit_transform == fit followed by transform.
le = LabelEncoder()
df.loc[:, 'Gender'] = le.fit_transform(df.Gender)
df.head(3)

# Standardize every column to zero mean / unit variance before PCA.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(df)
data_scaled[0]
Principal Component Analysis (PCA)
# Full-rank PCA: keep every component so the variance spectrum can be
# inspected before choosing how many components to retain.
pca = PCA()
data_pca = pca.fit_transform(data_scaled)

# Bar chart: each component's share of the total variance.
components = range(pca.n_components_)
pca.explained_variance_ratio_
plt.bar(components, pca.explained_variance_ratio_)
plt.xticks(components)
plt.ylabel('variance')
plt.xlabel('PCA feature')
plt.show()
# Reduce to 2 components for visualisation and clustering.
# PCA and SVD are closely related decompositions; svd_solver='full' forces
# the exact LAPACK SVD rather than a randomized approximation.
pca2 = PCA(n_components=2, svd_solver='full')
pca2.fit(data_scaled)
data_pca2 = pca2.transform(data_scaled)
print(data_pca2.shape)

xs = data_pca2[:, 0]  # first principal component
ys = data_pca2[:, 1]  # second principal component
# BUG FIX: the original called scatter(ys, xs), putting PCA-02 on the
# x-axis while labelling that axis 'PCA-01'. Plot (xs, ys) so the axis
# labels match the data.
plt.scatter(xs, ys)
plt.grid(False)
plt.title('Scatter Plot of Customers data')
plt.xlabel('PCA-01')
plt.ylabel('PCA-02')
plt.show()
Determine the number of K-means clusters needed
# Elbow method: record the inertia (within-cluster sum of squares)
# for k = 1..10 clusters on the 2-D PCA projection.
X = data_pca2
inertia = []
for k in range(1, 11):
    model = KMeans(n_clusters=k, init='k-means++', random_state=42)
    model.fit(X)
    inertia.append(model.inertia_)

# Plot inertia against k; the bend ("elbow") suggests a good cluster count.
plt.figure(1, figsize=(15, 6))
ks = np.arange(1, 11)
plt.plot(ks, inertia, 'o')
plt.plot(ks, inertia, '-', alpha=0.5)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()
K-means clustering
# KMeans model
# 5 clusters to start with (chosen from the elbow plot above);
# fixed random_state keeps the cluster labelling reproducible.
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=0)
Pipeline
# Build pipeline: scaling -> 2-component PCA -> k-means, chained so the
# same preprocessing is applied consistently at fit and predict time.
# NOTE: fitting the pipeline re-fits scaler/pca2/kmeans on df, replacing
# the fitted state those objects acquired earlier in the notebook.
pipeline = make_pipeline(scaler, pca2, kmeans)
# fit the model to the scaled dataset (df has Gender already label-encoded)
model_fit = pipeline.fit(df)
model_fit
# return a label for each data point based on their cluster
labels = model_fit.predict(df)
train_X['Clusters'] = labels
# Number of data points for each feature in each cluster
train_X.groupby('Clusters').count()
# Visualise the training clusters in the 2-D PCA plane, coloured by label.
xs = data_pca2[:, 0]
ys = data_pca2[:, 1]
# BUG FIX: the original called scatter(ys, xs, c=labels), swapping the
# axes relative to the xlabel/ylabel below; plot (xs, ys) so PCA-01 is
# genuinely on the x-axis. (Dead commented-out 3-D code removed.)
plt.scatter(xs, ys, c=labels)
plt.grid(False)
plt.title('Scatter Plot of Customers data')
plt.xlabel('PCA-01')
plt.ylabel('PCA-02')
plt.show()
Validate with test data
# Validate the clustering on the held-out customers.
# BUG FIX: the original re-fit the LabelEncoder on the test split. That is
# preprocessing leakage, and if one gender were absent from the test set
# the integer codes would no longer match the training encoding. Reuse the
# encoder fitted on the training data and only transform here.
test_X.loc[:, 'Gender'] = le.transform(test_X.Gender)

# Predict a cluster for each held-out customer with the trained pipeline.
labels_test = model_fit.predict(test_X)
test_X['Clusters'] = labels_test
labels_test

# Number of data points for each feature in each cluster
test_X.groupby('Clusters').count()
Finally: cluster summary heatmap
# Summarise each cluster: median age/income/spending score plus % female.
cluster_heatmap_df = train_X.groupby(['Clusters'])[
    ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].median()
heatcmap = sns.diverging_palette(230, 20, as_cmap=True)

# ROBUSTNESS FIX: a cluster containing no 'Female' rows is missing from the
# numerator groupby, so the aligned division yields NaN; fill with 0 so the
# heatmap annotation stays numeric (0% female is the correct value).
female_counts = (train_X[train_X['Gender'] == 'Female']
                 .groupby(['Clusters']).count()['Gender'])
total_counts = train_X.groupby(['Clusters']).count()['Gender']
cluster_heatmap_df['FemaleRatio'] = (
    female_counts / total_counts * 100).fillna(0)

fig, ax = plt.subplots(1, 1, figsize=(10, 10))
sns.heatmap(cluster_heatmap_df.T,
            square=True,
            linewidth=3,
            vmax=80,
            vmin=1,
            cmap=heatcmap,
            cbar=False,
            annot=True,
            fmt='3.0f',
            ax=ax,
            );
ax.set_title('Cluster Summary');
plt.tight_layout()