# import useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the customer spending data.
df = pd.read_csv('wholesome_customers_data.csv')
# Original author's observations from the summary below: 440 customer rows
# and no missing values.
df.info()

# pairplot is a figure-level function: it always builds its own figure, so a
# preceding plt.figure(dpi=150) only leaves an empty figure behind and the dpi
# never reaches the grid. Set the dpi through rc_context instead.
with plt.rc_context({'figure.dpi': 150}):
    region_grid = sns.pairplot(data=df, hue='Region', palette='Set1')
    channel_grid = sns.pairplot(data=df, hue='Channel', palette='Set1')

# Stacked spending histograms split by sales channel, one figure per feature.
for column in ('Milk', 'Frozen', 'Detergents_Paper', 'Grocery'):
    plt.figure(figsize=(10, 8))
    sns.histplot(data=df, x=column, hue='Channel', palette='Set1',
                 multiple='stack')

# clustermap is figure-level as well, so the size goes in via its own
# figsize argument rather than a separate plt.figure call.
feature_corr = df.drop(['Region', 'Channel'], axis=1).corr()
corr_grid = sns.clustermap(feature_corr, annot=True, figsize=(10, 8))
from sklearn.preprocessing import StandardScaler

# Standardise every column of df (Channel and Region included, since the
# whole frame is passed) so that no single feature dominates the distance
# computations used for clustering below.
scaler = StandardScaler()
scaler.fit(df)
scaled_df = scaler.transform(df)
# sanity check: the result is a plain array with one row per customer
scaled_df.shape
from sklearn.cluster import DBSCAN

# Sweep the DBSCAN neighbourhood radius and record how many customers end up
# unassigned for each value. The grid is built once so the loop and both
# plots are guaranteed to share the same x values.
eps_values = np.linspace(0.1, 3, 50)
outlier_count = []
outlier_percentage = []
for epsilon in eps_values:
    db = DBSCAN(eps=epsilon, min_samples=8)
    db.fit(scaled_df)
    # when label == -1, this means that customer is not in any cluster
    outliers = np.sum(db.labels_ == -1)
    outlier_count.append(outliers)
    outlier_percentage.append(100 * outliers / len(df))

# Give each curve its own figure: without an explicit figure the lines are
# drawn onto whatever axes happen to be current (and on top of each other).
plt.figure()
plt.plot(eps_values, outlier_count)
plt.xlabel('epsilon')
plt.ylabel('number of outliers')

plt.figure()
plt.plot(eps_values, outlier_percentage)
plt.xlabel('epsilon')
plt.ylabel('percentage of outliers')
# Fit DBSCAN once more with the chosen radius; fit() returns the estimator
# itself, so construction and fitting can be chained.
final_model = DBSCAN(eps=1.5, min_samples=8).fit(scaled_df)
final_model.labels_
labels = final_model.labels_
# Tally how many customers received each cluster label.
unique, counts = np.unique(labels, return_counts=True)
{label: size for label, size in zip(unique, counts)}
# Colour every plot below by the cluster label DBSCAN assigned to each row.
cluster_labels = final_model.labels_

# Detergents_Paper against Milk, one point per customer.
plt.figure(figsize=(10, 8))
sns.scatterplot(x='Detergents_Paper', y='Milk', data=df,
                hue=cluster_labels, alpha=0.7, palette='Set1')

# Per-feature spending distributions, each in its own figure.
for feature in ('Milk', 'Detergents_Paper', 'Grocery'):
    plt.figure(figsize=(10, 8))
    sns.histplot(x=feature, data=df, hue=cluster_labels, palette='Set1')
# Attach the cluster label of each customer to the original (unscaled) data.
df['Labels'] = final_model.labels_
# mean of each feature, grouped by 'Labels' (Channel and Region are left out)
df.groupby(['Labels']).mean().drop(['Channel', 'Region'], axis=1)
# number of customers per label
dict(zip(unique, counts))
# Share of customers per label, computed from the labels themselves rather
# than from hard-coded counts, so it stays correct if eps, min_samples or
# the data change.
df['Labels'].value_counts(normalize=True).sort_index()