%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Progress message printed once the imports above have run.
print('Libraries imported')

# Load the mall-customer table from disk and preview the first rows.
df_customers = pd.read_csv('../data/Mall_Customers.csv')
df_customers.head()

# Remove the CustomerID column before any further processing, then preview.
df_customers = df_customers.drop(columns=['CustomerID'])
df_customers.head()

# Number of missing values in each remaining column.
df_customers.isna().sum()
from sklearn.preprocessing import StandardScaler
# Columns that get standardised before PCA / clustering.
numerical_columns = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

# Fit the scaler on the raw numeric values, then transform those same values.
scaler = StandardScaler()
scaler.fit(df_customers[numerical_columns].values)
numerical_data = scaler.transform(df_customers[numerical_columns].values)

# Wrap the scaled array back into a labelled DataFrame and preview it.
scaled_num_data = pd.DataFrame(data=numerical_data, columns=numerical_columns)
scaled_num_data.head()

# Re-attach the Gender column (joined on the index), one-hot encode it and
# drop the Gender_Male indicator column.
gender = df_customers['Gender']
df = pd.get_dummies(scaled_num_data.join(gender)).drop(columns=['Gender_Male'])
df.head()

# Summary statistics of the scaled numeric columns.
df[numerical_columns].describe()

# Covariance matrix of the scaled numeric columns (one row per variable after
# the transpose, which is the layout np.cov expects by default).
covariance_matrix = np.cov(df[numerical_columns].T)
covariance_matrix
import seaborn as sns
# Annotated heatmap of `covariance_matrix`, labelled on both axes with the
# three numeric column names.
axis_labels = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
heatmap_options = dict(
    cbar=True,
    annot=True,
    square=True,
    fmt='.2f',
    annot_kws={'size': 12},
    yticklabels=axis_labels,
    xticklabels=axis_labels,
)

plt.figure(figsize=(10, 10))
sns.set(font_scale=1.5)
hm = sns.heatmap(covariance_matrix, **heatmap_options)
from sklearn.decomposition import PCA
# Fit a 4-component PCA on the prepared frame and project the data onto it.
pca = PCA(n_components=4)
principal_components = pca.fit_transform(df)

# Bar chart of the explained-variance ratio of each component, with one tick
# per component index.
features = range(pca.n_components_)
variance_ratios = pca.explained_variance_ratio_
plt.bar(features, variance_ratios)
plt.xticks(features)
plt.xlabel('PCA features')
plt.ylabel('variance %')
print(variance_ratios)

# Keep the projected data as a DataFrame for the clustering steps.
PCA_components = pd.DataFrame(data=principal_components)
from sklearn.cluster import KMeans
# Elbow analysis: fit k-means for k = 1..9 on the first three principal
# components and record the inertia of each fitted model.
ks = range(1, 10)
inertias = []

# The clustering input does not change between iterations, so slice it once.
elbow_input = PCA_components.iloc[:, :3]

for k in ks:
    # A fixed seed and an explicit n_init make the curve reproducible from run
    # to run and independent of the scikit-learn version's n_init default.
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(elbow_input)
    inertias.append(model.inertia_)

# Inertia as a function of k; the "elbow" of this curve suggests a cluster count.
plt.plot(ks, inertias, '-o', color='black')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()
# Start again from the unscaled customer table (CustomerID already removed).
# Copying df_customers avoids a second disk read and guarantees the rows line
# up with PCA_components, which was derived from that same frame.
df = df_customers.copy()

# Final clustering: 4 clusters on the first three principal components.
# The seed and n_init are pinned so cluster labels are stable between runs.
model = KMeans(n_clusters=4, n_init=10, random_state=42)
k_predict = model.fit_predict(PCA_components.iloc[:, :3])

# `frame` is deliberately the very same object as `df`, so the 'cluster'
# column is visible through both names. Wrapping with pd.DataFrame(df) only
# achieves that when the wrapper happens to share internals with `df`, which
# newer pandas releases do not do.
frame = df
frame['cluster'] = k_predict
frame.head()
# Per-cluster averages of the numeric columns. numeric_only=True skips the
# string-valued Gender column, which a plain mean() cannot average.
# `frame` is used because it is the object the 'cluster' column was assigned to.
avg_df = frame.groupby(['cluster'], as_index=False).mean(numeric_only=True)
avg_df

# One bar chart per numeric feature, side by side, showing the cluster means.
fig, ax = plt.subplots(1, 3, figsize=(20, 5))
profile_columns = ['Age', 'Spending Score (1-100)', 'Annual Income (k$)']
for axis, column in zip(ax, profile_columns):
    sns.barplot(x='cluster', y=column, data=avg_df, ax=axis)

# Customer counts per (cluster, gender) pair.
df2 = pd.DataFrame(frame.groupby(['cluster', 'Gender'])['Gender'].count())
df2

# Overall customer counts per gender, for comparison with the per-cluster split.
df3 = pd.DataFrame(frame.groupby(['Gender'])['Gender'].count())
df3