import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
# Activate seaborn's default plotting theme; it applies to the matplotlib
# figures drawn after this call.
sns.set()
from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import pickle
import matplotlib.pyplot as plt
from matplotlib import image
# Show the introductory clustering illustration on a square 10x10-inch canvas.
img = image.imread("/work/clustering.jpg")
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(img)
plt.show()
# Load the customer table. The first CSV column becomes the row index, so it
# is not treated as a feature by the steps that follow.
df_segmentation = pd.read_csv(
    "/work/segmentation data.csv",
    index_col=0,
)

# Quick inspection. These bare expressions only render output when run as
# notebook cells; in a plain script they are evaluated and discarded.
df_segmentation.head()      # first rows
df_segmentation.describe()  # per-column summary statistics
df_segmentation.corr()      # pairwise correlation matrix
# Annotated correlation matrix. The colour scale is pinned to [-1, 1] so the
# diverging 'RdBu' palette is symmetric around zero correlation.
correlations = df_segmentation.corr()
fig, ax = plt.subplots(figsize=(12, 9))
s = sns.heatmap(
    correlations,
    annot=True,
    cmap='RdBu',
    vmin=-1,
    vmax=1,
    ax=ax,
)
# Keep the row labels horizontal and the column labels vertical.
s.set_yticklabels(s.get_yticklabels(), rotation=0, fontsize=12)
s.set_xticklabels(s.get_xticklabels(), rotation=90, fontsize=12)
s.set_title("Correlation Heatmap")
plt.show()
# Scatter of the unscaled data: the columns at positions 2 and 4, which the
# axis labels below identify as Age and Income.
age_values = df_segmentation.iloc[:, 2]
income_values = df_segmentation.iloc[:, 4]

fig, ax = plt.subplots(figsize=(12, 9))
ax.scatter(age_values, income_values)
ax.set_xlabel("Age")
ax.set_ylabel("Income")
ax.set_title("Visualization of raw data")
plt.show()
# Standardise every column before clustering. The fitted scaler is kept in
# `scaler` because it is pickled at the end of the script.
scaler = StandardScaler()
scaler.fit(df_segmentation)
segmentation_std = scaler.transform(df_segmentation)
# Agglomerative clustering with Ward linkage on the standardised features.
hier_clust = linkage(segmentation_std, method='ward')

# Draw the same tree twice: first the full dendrogram in a single colour
# (color_threshold=0), then a version truncated to the top five levels.
dendrogram_variants = (
    {'color_threshold': 0},
    {'truncate_mode': 'level', 'p': 5},
)
for variant_options in dendrogram_variants:
    plt.figure(figsize=(12, 9))
    plt.title("Hierarchical Clustering Dendrogram")
    plt.xlabel("Observations")
    plt.ylabel("Distance")
    dendrogram(
        hier_clust,
        show_leaf_counts=False,
        no_labels=True,
        **variant_options,
    )
    plt.show()
# Elbow method: fit K-Means for 1..10 clusters on the standardised data and
# record each fit's within-cluster sum of squares (WCSS, `inertia_`).
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    # The three statements below form the loop body and must be indented;
    # without the indentation the script does not even parse.
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(segmentation_std)
    wcss.append(kmeans.inertia_)

# Plot WCSS against the number of clusters; the "elbow" of the curve suggests
# a reasonable cluster count.
plt.figure(figsize=(10, 8))
plt.plot(cluster_counts, wcss)
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS")
plt.title("K-Means Clustering")
plt.show()
# Final K-Means model with four clusters on the standardised features.
kmean = KMeans(n_clusters=4, init='k-means++', random_state=42)
kmean.fit(segmentation_std)

# Attach each row's cluster id to a copy of the original (unscaled) data.
df_segm_kmeans = df_segmentation.copy()
df_segm_kmeans['Segment k-Means'] = kmean.labels_

# Per-segment feature means.
df_segm_analysis = df_segm_kmeans.groupby(['Segment k-Means']).mean()
df_segm_analysis

# Segment sizes: absolute counts and share of all observations.
df_segm_analysis['N-obs'] = df_segm_kmeans[['Sex', 'Segment k-Means']].groupby(['Segment k-Means']).count()
df_segm_analysis['Prop-obs'] = df_segm_analysis['N-obs'] / df_segm_analysis['N-obs'].sum()
df_segm_analysis

# Human-readable names for the numeric cluster ids, defined once and reused
# for both the summary table and the per-row labels.
# NOTE(review): which id corresponds to which name depends on this particular
# fit (random_state=42 and the installed scikit-learn) -- re-check after any
# change to the data or the model.
segment_names = {
    0: 'Well off',
    1: 'Standard',
    2: 'Fewer Opportunities',
    3: 'Career Focussed',
}

# DataFrame.rename returns a new frame, so the result has to be assigned back;
# otherwise the summary table keeps its numeric index.
df_segm_analysis = df_segm_analysis.rename(segment_names)
df_segm_analysis

df_segm_kmeans['Labels'] = df_segm_kmeans['Segment k-Means'].map(segment_names)

x_axis = df_segm_kmeans['Age']
y_axis = df_segm_kmeans['Income']
plt.figure(figsize=(12, 9))
# x/y are passed by keyword (recent seaborn releases reject positional data
# arguments) and the palette is an ordered list: a set has no defined order,
# so the segment-to-colour assignment could change between runs.
sns.scatterplot(
    x=x_axis,
    y=y_axis,
    hue=df_segm_kmeans['Labels'],
    palette=['g', 'r', 'c', 'm'],
)
plt.title("Segmented K-Means")
plt.show()
# Fit a PCA that keeps every component, to see how much variance each explains.
pca = PCA()
pca.fit(segmentation_std)
pca.explained_variance_ratio_

# Cumulative explained variance versus number of components. The x-axis is
# derived from the fitted ratios instead of a hard-coded range(1, 8), so the
# plot keeps working if the number of input features changes.
cumulative_variance = pca.explained_variance_ratio_.cumsum()
component_counts = range(1, len(cumulative_variance) + 1)
plt.figure(figsize=(12, 9))
plt.plot(component_counts, cumulative_variance, marker='o', linestyle='--')
plt.title("Explained Variance")
plt.show()
# Refit PCA keeping only the first three components.
pca = PCA(n_components=3)
pca.fit(segmentation_std)
pca.components_

# Loadings table: one row per component, one column per original feature.
component_names = ['Component 1', 'Component 2', 'Component 3']
df_pca_comp = pd.DataFrame(
    data=pca.components_,
    columns=df_segmentation.columns.values,
    index=component_names,
)
df_pca_comp

# Heatmap of the loadings. seaborn already labels each row from the frame's
# index and centres the tick on the cell, so only the label styling is
# adjusted here; forcing tick positions 0/1/2 would move the labels onto the
# cell edges.
loadings_ax = sns.heatmap(df_pca_comp, vmin=-1, vmax=1, annot=True, cmap='RdBu')
loadings_ax.set_yticklabels(loadings_ax.get_yticklabels(), rotation=45, fontsize=9)
plt.show()

# Project the standardised data onto the three retained components.
scores_pca = pca.transform(segmentation_std)
scores_pca
# Elbow method on the PCA scores: fit K-Means for 1..10 clusters and record
# each fit's within-cluster sum of squares (WCSS, `inertia_`).
pca_cluster_counts = range(1, 11)
wcss_pca = []
for i in pca_cluster_counts:
    # These statements are the loop body and must be indented; without the
    # indentation the script does not parse.
    kmeans_pca = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans_pca.fit(scores_pca)
    wcss_pca.append(kmeans_pca.inertia_)

plt.figure(figsize=(12, 9))
plt.plot(pca_cluster_counts, wcss_pca, marker='o', linestyle='--')
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS")
plt.title("K-Means with PCA")
plt.show()
# Final K-Means model with four clusters, fitted on the PCA scores.
kmeans_pca = KMeans(n_clusters=4, init='k-means++', random_state=42)
kmeans_pca.fit(scores_pca)

# Original features side by side with the three component scores. The score
# columns are named when the frame is built; writing into
# `columns.values[-3:]` afterwards mutates the index's backing array in
# place, which pandas does not guarantee to honour.
pca_score_columns = ['Component 1', 'Component 2', 'Component 3']
df_segm_pca_kmeans = pd.concat(
    [
        df_segmentation.reset_index(drop=True),
        pd.DataFrame(scores_pca, columns=pca_score_columns),
    ],
    axis=1,
)
df_segm_pca_kmeans['Segment k-means PCA'] = kmeans_pca.labels_
df_segm_pca_kmeans

# Per-segment means, plus segment sizes as counts and as a share of all rows.
df_segm_pca_kmeans_freq = df_segm_pca_kmeans.groupby(['Segment k-means PCA']).mean()
df_segm_pca_kmeans_freq
df_segm_pca_kmeans_freq['N-obs'] = df_segm_pca_kmeans[['Sex', 'Segment k-means PCA']].groupby(['Segment k-means PCA']).count()
df_segm_pca_kmeans_freq['Prop-obs'] = df_segm_pca_kmeans_freq['N-obs'] / df_segm_pca_kmeans_freq['N-obs'].sum()

# Human-readable names for the numeric cluster ids, defined once and reused
# for both the summary table and the per-row legend.
# NOTE(review): the id-to-name mapping depends on this particular fit --
# re-check after any change to the data, the PCA or the K-Means model.
pca_segment_names = {
    0: 'Standard',
    1: 'Career Focussed',
    2: 'Fewer Opportunities',
    3: 'Well off',
}
df_segm_pca_kmeans_freq = df_segm_pca_kmeans_freq.rename(pca_segment_names)
df_segm_pca_kmeans_freq

df_segm_pca_kmeans['Legend'] = df_segm_pca_kmeans['Segment k-means PCA'].map(pca_segment_names)

# Segments in the plane of the first two components (Component 2 on x,
# Component 1 on y). x/y are passed by keyword because recent seaborn
# releases reject positional data arguments.
plt.figure(figsize=(12, 9))
x_axis = df_segm_pca_kmeans['Component 2']
y_axis = df_segm_pca_kmeans['Component 1']
sns.scatterplot(
    x=x_axis,
    y=y_axis,
    hue=df_segm_pca_kmeans['Legend'],
    palette=['g', 'r', 'c', 'm'],
)
plt.title('Clusters by PCA Components')
plt.show()
# Persist the fitted scaler, PCA and K-Means-on-PCA models so they can be
# reloaded later. Each file is opened in a `with` block so its handle is
# flushed and closed even if pickling raises.
models_to_save = (
    (scaler, 'scaler.pickle'),
    (pca, 'pca.pickle'),
    (kmeans_pca, 'kmeans_pca.pickle'),
)
for fitted_model, pickle_path in models_to_save:
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(fitted_model, pickle_file)