%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Sanity check: reaching this line means the imports above ran without error.
print('Libraries imported')
Libraries imported
# Load the mall-customer CSV into a DataFrame and preview the first rows.
customers_csv_path = '../data/Mall_Customers.csv'
df_customers = pd.read_csv(filepath_or_buffer=customers_csv_path)
df_customers.head()
CustomerIDint64
Genderobject
0
1
Male
1
2
Male
2
3
Female
3
4
Female
4
5
Female
# Drop the CustomerID column before analysis (in the preview above it looks
# like a plain running row identifier), then preview the result.
df_customers = df_customers.drop(columns=['CustomerID'])
df_customers.head()
Genderobject
Ageint64
0
Male
19
1
Male
21
2
Female
20
3
Female
23
4
Female
31
# Count missing values per column.
df_customers.isna().sum()
from sklearn.preprocessing import StandardScaler
# Standardise the three numeric columns with StandardScaler and wrap the
# scaled array back into a DataFrame that keeps the original column names.
numerical_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
raw_numeric_values = df_customers[numerical_columns].values

# The fitted scaler is kept in `scaler` so the same transform can be reused.
scaler = StandardScaler()
numerical_data = scaler.fit_transform(raw_numeric_values)

scaled_num_data = pd.DataFrame(data=numerical_data, columns=numerical_columns)
scaled_num_data.head()
Agefloat64
Annual Income (k$)float64
0
-1.42456879
-1.738999193
1
-1.281035411
-1.738999193
2
-1.3528021
-1.700829764
3
-1.137502031
-1.700829764
4
-0.563368514
-1.662660335
# Re-attach Gender to the scaled numeric columns and one-hot encode it.
# Only Gender_Female is kept: Gender_Male is dropped after encoding.
gender = df_customers['Gender']
df = (
    pd.get_dummies(scaled_num_data.join(gender))
    .drop(columns=['Gender_Male'])
)
df.head()
Agefloat64
Annual Income (k$)float64
0
-1.42456879
-1.738999193
1
-1.281035411
-1.738999193
2
-1.3528021
-1.700829764
3
-1.137502031
-1.700829764
4
-0.563368514
-1.662660335
# Summary statistics of the scaled numeric columns; after standardisation the
# means should be close to 0 and the standard deviations close to 1.
df.loc[:, ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']].describe()
Agefloat64
Annual Income (k$)float64
count
200
200
mean
-1.021405183e-16
-2.131628207e-16
std
1.002509414
1.002509414
min
-1.49633548
-1.738999193
25%
-0.7248435657
-0.72750932
50%
-0.2045350656
0.03587926342
75%
0.7284319002
0.6656748448
max
2.235532383
2.917671166
# Covariance matrix of the three scaled numeric columns. np.cov expects one
# variable per row, hence the transpose of the (rows x columns) frame.
scaled_numeric_frame = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]
covariance_matrix = np.cov(scaled_numeric_frame.T)
covariance_matrix
import seaborn as sns
# Draw the covariance matrix as an annotated square heatmap. The same label
# list is used on both axes because the matrix is feature-by-feature.
heatmap_labels = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

plt.figure(figsize=(10, 10))
sns.set(font_scale=1.5)
hm = sns.heatmap(
    covariance_matrix,
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 12},
    yticklabels=list(heatmap_labels),
    xticklabels=list(heatmap_labels),
)
from sklearn.decomposition import PCA
# Project the prepared feature frame onto four principal components and plot
# the share of variance explained by each one.
pca = PCA(n_components=4)
principal_components = pca.fit_transform(df)

features = range(pca.n_components_)
variance_ratios = pca.explained_variance_ratio_

plt.bar(features, variance_ratios)
plt.xlabel('PCA features')
plt.ylabel('variance %')
plt.xticks(features)
print(variance_ratios)

# Keep the projected data as a DataFrame (integer column labels 0..3) so the
# later cells can slice components by position.
PCA_components = pd.DataFrame(data=principal_components)
[0.4095285 0.3081674 0.20723465 0.07506945]
from sklearn.cluster import KMeans
# Elbow method: fit K-Means for k = 1..9 on the first three principal
# components and record each model's inertia (within-cluster sum of squares).
ks = range(1, 10)
elbow_features = PCA_components.iloc[:, :3]
inertias = []
for k in ks:
    # A fixed random_state makes the curve reproducible between runs, and an
    # explicit n_init pins the number of restarts, whose default differs
    # across scikit-learn releases.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(elbow_features)
    inertias.append(model.inertia_)

# Plot inertia against k; the "elbow" of this curve suggests a cluster count.
plt.plot(ks, inertias, '-o', color='black')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()
# Reload the original (unscaled) customer data so that the per-cluster
# summaries computed later are expressed in the raw column values.
df = pd.read_csv('../data/Mall_Customers.csv')
df = df.drop(['CustomerID'], axis=1)

# Cluster on the first three principal components with k = 4. The seed and
# explicit n_init make the cluster labels reproducible between runs.
cluster_features = PCA_components.iloc[:, :3]
model = KMeans(n_clusters=4, n_init=10, random_state=42)
model.fit(cluster_features)
k_predict = model.predict(cluster_features)

# Write the labels directly onto df. Later cells group df by 'cluster'; the
# previous `frame = pd.DataFrame(df)` only propagated the new column back to
# df when both objects happened to share internal storage, which pandas
# Copy-on-Write no longer does. `frame` is kept as a name for the same table.
df['cluster'] = k_predict
frame = df
frame.head()
Genderobject
Ageint64
0
Male
19
1
Male
21
2
Female
20
3
Female
23
4
Female
31
# Mean of every numeric column per cluster. numeric_only=True skips the
# string-valued Gender column, which pandas >= 2.0 would otherwise try to
# average and fail on with a TypeError.
avg_df = df.groupby(['cluster'], as_index=False).mean(numeric_only=True)
avg_df
clusterint64
Agefloat64
0
0
25.43859649
1
1
53.98461538
2
2
32.875
3
3
39.36842105
# One row of three bar charts comparing the per-cluster averages:
# age, spending score and annual income (left to right).
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 5))
age_axis, spending_axis, income_axis = ax

sns.barplot(data=avg_df, x='cluster', y='Age', ax=age_axis)
sns.barplot(data=avg_df, x='cluster', y='Spending Score (1-100)', ax=spending_axis)
sns.barplot(data=avg_df, x='cluster', y='Annual Income (k$)', ax=income_axis)
# Number of customers of each gender inside every cluster, as a one-column
# DataFrame indexed by (cluster, Gender).
gender_counts_by_cluster = df.groupby(['cluster', 'Gender'])['Gender'].count()
df2 = gender_counts_by_cluster.to_frame()
df2
Genderint64
(0, 'Female')
34
(0, 'Male')
23
(1, 'Female')
37
(1, 'Male')
28
(2, 'Female')
22
(2, 'Male')
18
(3, 'Female')
19
(3, 'Male')
19
# Overall number of customers per gender, for comparison with the
# per-cluster breakdown.
gender_totals = df.groupby(['Gender'])['Gender'].count()
df3 = gender_totals.to_frame()
df3
Genderint64
Female
112
Male
88