# Customer Segmentation

# Standard library.
import os
import warnings

# Numerical / tabular data.
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Plotting.
import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.graph_objs as go

# Clustering.
from sklearn.cluster import KMeans

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Switch plotly to offline notebook rendering.
py.offline.init_notebook_mode(connected=True)

# Load the mall-customer table and take a first look at it.
# In a notebook, each bare expression below is what the cell displays.
customers_csv = '/work/Mall_Customers.csv'
df = pd.read_csv(customers_csv)
df.head()

# Dimensions of the table as (rows, columns).
df.shape

# Summary statistics.
df.describe()

# Data type of every column.
df.dtypes

# Number of missing values in every column.
df.isna().sum()

# Apply the 'fivethirtyeight' matplotlib style to the figures drawn below.
plt.style.use('fivethirtyeight')

# One row of three panels: histogram plus density curve for each of the
# three numeric columns.
plt.figure(1, figsize=(15, 6))
# The spacing is identical for every panel, so configure it once up front
# instead of on every loop iteration.
plt.subplots_adjust(hspace=0.5, wspace=0.5)
for n, x in enumerate(['Age', 'Annual Income (k$)', 'Spending Score (1-100)'], start=1):
    plt.subplot(1, 3, n)
    # sns.distplot is deprecated; seaborn documents histplot with a density
    # scale, a KDE overlay and cut=3 as its replacement.
    sns.histplot(df[x], bins=20, kde=True, stat='density', kde_kws={'cut': 3})
    plt.title('Distplot of {}'.format(x))
plt.show()

# Count of rows per 'Gender' value, with the categories laid out on the y axis.
plt.figure(num=1, figsize=(15, 5))
sns.countplot(data=df, y='Gender')
plt.show()

# 3 x 3 grid of regression plots: every (x, y) pairing of the three numeric
# columns, including each column against itself.
plt.figure(num=1, figsize=(15, 7))
n = 0
for x in ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']:
    for y in ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']:
        n += 1
        plt.subplot(3, 3, n)
        plt.subplots_adjust(hspace=0.5, wspace=0.5)
        sns.regplot(data=df, x=x, y=y)

        # Shorten multi-word column names to their first two words so the
        # axis label stays compact (drops the trailing unit/range suffix).
        words = y.split()
        if len(words) > 1:
            y_label = ' '.join(words[:2])
        else:
            y_label = y
        plt.ylabel(y_label)
plt.show()

# Age against annual income, one translucent scatter series per gender.
plt.figure(num=1, figsize=(15, 6))
for gender in ['Male', 'Female']:
    gender_rows = df[df['Gender'] == gender]
    plt.scatter(
        x='Age',
        y='Annual Income (k$)',
        data=gender_rows,
        s=200,
        alpha=0.5,
        label=gender,
    )
plt.xlabel('Age')
plt.ylabel('Annual Income (k$)')
plt.title('Age vs Annual Income w.r.t Gender')
plt.legend()
plt.show()

# Annual income against spending score, drawn separately for each gender so
# the legend can tell the two groups apart.
plt.figure(num=1, figsize=(15, 6))
for gender in ['Male', 'Female']:
    is_gender = df['Gender'] == gender
    plt.scatter(
        x='Annual Income (k$)',
        y='Spending Score (1-100)',
        data=df[is_gender],
        s=200,
        alpha=0.5,
        label=gender,
    )
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.title('Annual Income vs Spending Score w.r.t Gender')
plt.legend()
plt.show()

# Per-gender distribution of each numeric column: a violin plot with the
# individual observations overlaid as a swarm plot, three panels in one row.
plt.figure(1, figsize=(15, 7))
# Panel spacing does not change between iterations, so set it once.
plt.subplots_adjust(hspace=0.5, wspace=0.5)
for n, cols in enumerate(['Age', 'Annual Income (k$)', 'Spending Score (1-100)'], start=1):
    plt.subplot(1, 3, n)
    sns.violinplot(x=cols, y='Gender', data=df, palette='vlag')
    sns.swarmplot(x=cols, y='Gender', data=df)
    # Only the left-most panel carries the shared y-axis label.
    plt.ylabel('Gender' if n == 1 else '')
    # Only the middle panel carries the figure-wide title. The plots drawn
    # are violin plots, so the title names them as such (it previously said
    # "Boxplots").
    plt.title('Violinplots & Swarmplots' if n == 2 else '')
plt.show()

# Age and spending Score
# Fit KMeans for 1 through 10 clusters on the two-column feature matrix and
# record the inertia of each fit for the elbow plot.
X1 = df[['Age', 'Spending Score (1-100)']].values
inertia = []
for n in range(1, 11):
    algorithm = KMeans(
        n_clusters=n,
        init='k-means++',
        n_init=10,
        max_iter=300,
        tol=0.0001,
        random_state=111,
        algorithm='elkan',
    )
    # fit() returns the estimator itself, so the inertia can be read directly.
    inertia.append(algorithm.fit(X1).inertia_)

# Elbow plot: inertia against the number of clusters, drawn as markers plus
# a faint connecting line.
cluster_counts = np.arange(1, 11)
plt.figure(num=1, figsize=(15, 6))
plt.plot(cluster_counts, inertia, 'o')
plt.plot(cluster_counts, inertia, '-', alpha=0.5)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# Final model for the Age / Spending Score features: 4 clusters.
algorithm = KMeans(
    n_clusters=4,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=111,
    algorithm='elkan',
).fit(X1)

# Keep the per-row cluster assignment and the cluster centres for plotting.
labels1 = algorithm.labels_
centroids1 = algorithm.cluster_centers_

# Predict a cluster for every point of a dense grid (step h) that spans the
# data range padded by 1 on each side; used as the background of the plot.
h = 0.02
x_min = X1[:, 0].min() - 1
x_max = X1[:, 0].max() + 1
y_min = X1[:, 1].min() - 1
y_max = X1[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Flatten the grid into an (n_points, 2) array of coordinates for predict().
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = algorithm.predict(grid_points)

# Cluster map for Age vs Spending Score: predicted cluster of every grid
# point as a pastel background, the customers coloured by their cluster
# label on top, and the cluster centres as translucent red dots.
plt.figure(num=1, figsize=(15, 7))
plt.clf()

# Bring the flat predictions back to the 2-D shape of the grid.
Z = Z.reshape(xx.shape)
grid_extent = (xx.min(), xx.max(), yy.min(), yy.max())
plt.imshow(
    Z,
    interpolation='nearest',
    extent=grid_extent,
    cmap=plt.cm.Pastel2,
    aspect='auto',
    origin='lower',
)

plt.scatter(x='Age', y='Spending Score (1-100)', data=df, c=labels1, s=200)
plt.scatter(x=centroids1[:, 0], y=centroids1[:, 1], s=300, c='red', alpha=0.5)
plt.ylabel('Spending Score (1-100)')
plt.xlabel('Age')
plt.show()

# Annual Income and spending Score
# Elbow analysis: fit KMeans with 1..10 clusters and collect the inertia of
# each fit.
X2 = df[['Annual Income (k$)', 'Spending Score (1-100)']].values

# Settings shared by every fit in the sweep; only n_clusters varies.
kmeans_options = dict(
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=111,
    algorithm='elkan',
)
inertia = []
for n in range(1, 11):
    algorithm = KMeans(n_clusters=n, **kmeans_options)
    algorithm.fit(X2)
    inertia.append(algorithm.inertia_)

# Inertia versus number of clusters for the income / spending features.
ks = np.arange(1, 11)
plt.figure(num=1, figsize=(15, 6))
plt.plot(ks, inertia, 'o')             # one marker per cluster count
plt.plot(ks, inertia, '-', alpha=0.5)  # faint line joining the markers
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# Final model for the Annual Income / Spending Score features: 5 clusters.
algorithm = KMeans(
    n_clusters=5,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=111,
    algorithm='elkan',
)
algorithm.fit(X2)

# Cluster index of every row, and the coordinates of the cluster centres.
labels2, centroids2 = algorithm.labels_, algorithm.cluster_centers_

# Dense evaluation grid (step h) over the income / spending plane, padded by
# 1 beyond the observed minimum and maximum of each column.
h = 0.02
income_values = X2[:, 0]
score_values = X2[:, 1]
x_min, x_max = income_values.min() - 1, income_values.max() + 1
y_min, y_max = score_values.min() - 1, score_values.max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h),
)

# Predicted cluster for every grid point, as a flat array.
Z2 = algorithm.predict(np.column_stack((xx.ravel(), yy.ravel())))

# Cluster map for Annual Income vs Spending Score.
plt.figure(num=1, figsize=(15, 7))
plt.clf()

# Background: the grid predictions, reshaped to the grid and drawn as an image.
Z2 = Z2.reshape(xx.shape)
left, right = xx.min(), xx.max()
bottom, top = yy.min(), yy.max()
plt.imshow(
    Z2,
    interpolation='nearest',
    extent=(left, right, bottom, top),
    cmap=plt.cm.Pastel2,
    aspect='auto',
    origin='lower',
)

# Foreground: customers coloured by cluster label, then the cluster centres
# as translucent red dots.
plt.scatter(x='Annual Income (k$)', y='Spending Score (1-100)', data=df, c=labels2, s=200)
plt.scatter(x=centroids2[:, 0], y=centroids2[:, 1], s=300, c='red', alpha=0.5)
plt.ylabel('Spending Score (1-100)')
plt.xlabel('Annual Income (k$)')
plt.show()

# Elbow analysis on all three numeric features: fit KMeans for 1 through 10
# clusters and store the inertia of each fit.
three_feature_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
X3 = df[three_feature_columns].values
inertia = []
for n in range(1, 11):
    algorithm = KMeans(
        n_clusters=n,
        init='k-means++',
        n_init=10,
        max_iter=300,
        tol=0.0001,
        random_state=111,
        algorithm='elkan',
    )
    algorithm.fit(X3)
    inertia += [algorithm.inertia_]

# Elbow plot for the three-feature sweep: markers first, then a translucent
# line through the same points.
n_clusters_axis = np.arange(1, 11)
plt.figure(num=1, figsize=(15, 6))
for line_format, extra in (('o', {}), ('-', {'alpha': 0.5})):
    plt.plot(n_clusters_axis, inertia, line_format, **extra)
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.show()

# Final model on all three features: 6 clusters.
algorithm = KMeans(
    n_clusters=6,
    init='k-means++',
    n_init=10,
    max_iter=300,
    tol=0.0001,
    random_state=111,
    algorithm='elkan',
)
algorithm = algorithm.fit(X3)  # fit() hands back the same estimator

# Per-row cluster assignment and cluster centres of the fitted model.
labels3 = algorithm.labels_
centroids3 = algorithm.cluster_centers_

df.columns

# 3-D view of a five-cluster KMeans segmentation over all three numeric
# features. One colour per cluster id; the list length fixes the number of
# clusters.
cluster_colors = ['blue', 'red', 'green', 'orange', 'purple']
feature_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

# Creating KMeans instance.
# random_state and n_init are pinned to the values every other KMeans fit in
# this file uses, so the cluster ids (and therefore the colours) are the same
# on every run instead of changing with the random initialisation.
# NOTE(review): the preceding three-feature fit uses 6 clusters while this
# plot uses 5 - confirm which is intended.
kmeans = KMeans(n_clusters=len(cluster_colors), n_init=10, random_state=111)

# Fitting the data
kmeans.fit(df[feature_columns])

# Adding cluster labels to the dataframe
df['Cluster'] = kmeans.labels_

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')

# One scatter series per cluster. Each series carries its own label so the
# legend entries cannot get out of step with the plotting order.
for cluster_id, color in enumerate(cluster_colors):
    members = df[df['Cluster'] == cluster_id]
    ax.scatter(
        members['Age'],
        members['Annual Income (k$)'],
        members['Spending Score (1-100)'],
        c=color,
        s=60,
        label='Cluster {}'.format(cluster_id),
    )

ax.set_xlabel('Age')
ax.set_ylabel('Annual Income (k$)')
ax.set_zlabel('Spending Score (1-100)')
ax.legend()
plt.show()