import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Times New Roman'] + plt.rcParams['font.serif']
# Loading the dataset
df = pd.read_csv("marketing_campaign.csv", sep="\t")
print("Number of datapoints:", len(df))
df.head()
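# --- Quick missing-value check (a minimal sanity-check sketch; this version of the
# dataset is known to contain a few missing Income values, which matters for the
# outlier filtering further below) ---
missing = df.isna().sum()
print(missing[missing > 0])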
# --- Dataset Report ---
from ydata_profiling import ProfileReport  # formerly pandas_profiling
ProfileReport(df, title='Customer Segmentation:', minimal=True, progress_bar=False, samples=None, correlations=None, interactions=None, explorative=True, dark_mode=True, notebook={'iframe': {'height': '600px'}}, html={'style': {'primary_color': '#FFCC00'}}, missing_diagrams={'heatmap': False, 'dendrogram': False}).to_notebook_iframe()
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
# --- Correlation Map (Heatmap) ---
# Create a boolean mask for the upper triangle of the correlation matrix
# (numeric_only=True avoids errors from the remaining object columns in pandas >= 2.0)
corr = df.corr(numeric_only=True)
mask = np.triu(np.ones_like(corr, dtype=bool))
# Create a new figure with a single subplot
fig, ax = plt.subplots(figsize=(15, 15), dpi=300)
# Draw a heatmap in the subplot, using the mask, and with the specified appearance settings
sns.heatmap(corr, mask=mask, annot=True, cmap='Blues', linewidths=0.1, cbar=False, annot_kws={"size": 5})
# Get the current tick locations and labels of the y-axis and x-axis
yticks, ylabels = plt.yticks()
xticks, xlabels = plt.xticks()
# Set the x-axis and y-axis labels with specified font size and family
ax.set_xticklabels(xlabels, size=6, fontfamily='serif', rotation=75)
ax.set_yticklabels(ylabels, size=6, fontfamily='serif')
# Set the main title of the figure
plt.suptitle('Correlation Map of Numerical Variables', fontweight='heavy', x=0.327, y=0.96, ha='left', fontsize=13, fontfamily='serif')
# Adjust the padding of the subplots
plt.tight_layout(rect=[0, 0.04, 1, 1.01])
plt.show()
df.info()
# Age of customer today
pd.set_option('display.max_columns', None)
df["Age"] = 2023 - df["Year_Birth"]
# Total spending on various items
df["Spent"] = df["MntWines"] + df["MntFruits"] + df["MntMeatProducts"] + df["MntFishProducts"] + df["MntSweetProducts"] + df["MntGoldProds"]
# Deriving living situation from marital status
df["Living_With"] = df["Marital_Status"].replace({"Married": "Partner", "Together": "Partner", "Absurd": "Alone", "Widow": "Alone", "YOLO": "Alone", "Divorced": "Alone", "Single": "Alone"})
# Feature indicating total children living in the household
df["Children"] = df["Kidhome"] + df["Teenhome"]
# Feature for total members in the household
df["Family_Size"] = df["Living_With"].replace({"Alone": 1, "Partner": 2}) + df["Children"]
# Feature pertaining to parenthood
df["Is_Parent"] = np.where(df.Children > 0, 1, 0)
# Segmenting education levels into three groups
df["Education"] = df["Education"].replace({"Basic": "Undergraduate", "2n Cycle": "Undergraduate", "Graduation": "Graduate", "Master": "Postgraduate", "PhD": "Postgraduate"})
# Renaming columns for clarity
df = df.rename(columns={"MntWines": "Wines", "MntFruits": "Fruits", "MntMeatProducts": "Meat", "MntFishProducts": "Fish", "MntSweetProducts": "Sweets", "MntGoldProds": "Gold"})
# Dropping some of the redundant features
to_drop = ["Marital_Status", "Dt_Customer", "Z_CostContact", "Z_Revenue", "Year_Birth", "ID"]
df = df.drop(to_drop, axis=1)
df.head(10)
# Dropping outliers by capping Age and Income.
# Note: rows with missing Income are dropped here too, since NaN comparisons evaluate to False.
df = df[(df["Age"] < 90)]
df = df[(df["Income"] < 600000)]
from sklearn.preprocessing import LabelEncoder
#Get list of categorical variables
s = (df.dtypes == 'object')
object_cols = list(s[s].index)
print("Categorical variables in the dataset:", object_cols)
#Label Encoding the object dtypes.
LE = LabelEncoder()
for i in object_cols:
    df[i] = LE.fit_transform(df[i])
print("All features are now numerical")
df.head(20)
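# --- Note (sketch): LabelEncoder assigns codes alphabetically, so Education becomes
# Graduate=0, Postgraduate=1, Undergraduate=2, which does not follow the natural order
# of the levels. An explicit ordinal mapping applied *before* encoding would preserve
# it; a minimal illustration on a throwaway Series:
edu_example = pd.Series(["Undergraduate", "Graduate", "Postgraduate"])
print(edu_example.map({"Undergraduate": 0, "Graduate": 1, "Postgraduate": 2}).tolist())  # [0, 1, 2]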
from sklearn.preprocessing import StandardScaler
#Creating a copy of data
ds = df.copy()
# creating a subset of dataframe by dropping the features on deals accepted and promotions
cols_del = ['AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1','AcceptedCmp2', 'Complain', 'Response']
ds = ds.drop(cols_del, axis=1)
#Scaling
scaler = StandardScaler()
scaler.fit(ds)
X = pd.DataFrame(scaler.transform(ds),columns= ds.columns )
print("All features are now scaled")
from sklearn.neighbors import NearestNeighbors
from math import isnan
from random import sample
from numpy.random import uniform
def hopkins(X):
    # Hopkins statistic: compares the nearest-neighbor distances of m uniformly
    # generated points (ujd) with those of m randomly sampled real points (wjd).
    # H near 0.5 suggests uniform data; H near 1 suggests a strong clustering tendency.
    d = X.shape[1]
    n = len(X)
    m = int(0.1 * n)
    nbrs = NearestNeighbors(n_neighbors=1).fit(X)
    rand_X = sample(range(0, n, 1), m)
    ujd = []
    wjd = []
    for j in range(0, m):
        u_sample = pd.DataFrame(uniform(np.amin(X, axis=0), np.amax(X, axis=0), d).reshape(1, -1), columns=X.columns)
        u_dist, _ = nbrs.kneighbors(u_sample, 2, return_distance=True)
        # the uniform point is not in the data, so its true nearest neighbor is at index 0
        ujd.append(u_dist[0][0])
        w_sample = pd.DataFrame([X.iloc[rand_X[j]].values], columns=X.columns)
        w_dist, _ = nbrs.kneighbors(w_sample, 2, return_distance=True)
        # the sampled real point matches itself at index 0, so take index 1
        wjd.append(w_dist[0][1])
    H = sum(ujd) / (sum(ujd) + sum(wjd))
    if isnan(H):
        print(ujd, wjd)
        H = 0
    return H
# --- Perform Hopkins Test ---
hopkins_value = hopkins(X)
print('Result: {:.4f}'.format(hopkins_value))
# H0 of the Hopkins test: the data is uniformly distributed (no cluster structure).
if 0.7 < hopkins_value < 0.99:
    print('>> The data has a high tendency to cluster (contains meaningful clusters)')
    print('   Conclusion: reject H0')
else:
    print('>> The data shows no meaningful clustering tendency')
    print('   Conclusion: fail to reject H0')
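# --- Baseline check (a minimal sketch): on uniformly distributed random data the
# Hopkins statistic should hover around 0.5, which gives a reference point for the
# score printed above ---
rng_uniform = pd.DataFrame(np.random.uniform(0, 1, size=(500, X.shape[1])), columns=X.columns)
print('Uniform baseline: {:.4f}'.format(hopkins(rng_uniform)))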
from sklearn.decomposition import PCA
# --- Transform into Array ---
X = np.asarray(X)
# --- Applying PCA ---
pca = PCA(n_components=2, random_state=24)
X = pca.fit_transform(X)
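# --- How much structure survives the 2-D projection? (sketch) ---
# explained_variance_ratio_ reports the share of total variance captured per component.
print('Explained variance ratio:', pca.explained_variance_ratio_.round(3))
print('Total variance retained: {:.1%}'.format(pca.explained_variance_ratio_.sum()))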
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import davies_bouldin_score, silhouette_score, calinski_harabasz_score
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
from yellowbrick.style import set_palette
# --- Define K-Means Functions ---
def kmeans():
    # --- Figures Settings ---
    color_palette = ['#FFCC00', '#54318C']
    set_palette(color_palette)
    title = dict(fontsize=12, fontweight='bold', style='italic', fontfamily='serif')
    text_style = dict(fontweight='bold', fontfamily='serif')
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
    # --- Elbow Score (Distortion) ---
    elbow_score = KElbowVisualizer(KMeans(random_state=32, max_iter=500, n_init=10), k=(2, 10), timings=False, ax=ax1)
    elbow_score.fit(X)
    elbow_score.finalize()
    elbow_score.ax.set_title('Distortion Score Elbow\n', **title)
    elbow_score.ax.tick_params(labelsize=7)
    for text in elbow_score.ax.legend_.texts:
        text.set_fontsize(9)
    for spine in elbow_score.ax.spines.values():
        spine.set_color('None')
    elbow_score.ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), borderpad=2, frameon=False, fontsize=8)
    elbow_score.ax.grid(axis='y', alpha=0.5, color='#9B9A9C', linestyle='dotted')
    elbow_score.ax.grid(axis='x', alpha=0)
    elbow_score.ax.set_xlabel('\nK Values', fontsize=9, **text_style)
    elbow_score.ax.set_ylabel('Distortion Scores\n', fontsize=9, **text_style)
    # --- Elbow Score (Calinski-Harabasz Index) ---
    elbow_score_ch = KElbowVisualizer(KMeans(random_state=32, max_iter=500, n_init=10), k=(2, 10), metric='calinski_harabasz', timings=False, ax=ax2)
    elbow_score_ch.fit(X)
    elbow_score_ch.finalize()
    elbow_score_ch.ax.set_title('Calinski-Harabasz Score Elbow\n', **title)
    elbow_score_ch.ax.tick_params(labelsize=7)
    for text in elbow_score_ch.ax.legend_.texts:
        text.set_fontsize(9)
    for spine in elbow_score_ch.ax.spines.values():
        spine.set_color('None')
    elbow_score_ch.ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), borderpad=2, frameon=False, fontsize=8)
    elbow_score_ch.ax.grid(axis='y', alpha=0.5, color='#9B9A9C', linestyle='dotted')
    elbow_score_ch.ax.grid(axis='x', alpha=0)
    elbow_score_ch.ax.set_xlabel('\nK Values', fontsize=9, **text_style)
    elbow_score_ch.ax.set_ylabel('Calinski-Harabasz Score\n', fontsize=9, **text_style)
    plt.suptitle('Customer Segmentation using K-Means', fontsize=14, **text_style)
    plt.tight_layout()
    plt.show()
# --- Calling K-Means Functions ---
kmeans()
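# --- Cross-check (minimal sketch): silhouette score for each candidate k ---
# The elbow plots above suggest a k; the silhouette score gives an independent read.
for k in range(2, 7):
    labels_k = KMeans(n_clusters=k, random_state=32, max_iter=500, n_init=10).fit_predict(X)
    print('k={}: silhouette={:.3f}'.format(k, silhouette_score(X, labels_k)))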
from pywaffle import Waffle
# --- Implementing K-Means ---
kmeans = KMeans(n_clusters=4, random_state=32, max_iter=500,n_init=5)
y_kmeans = kmeans.fit_predict(X)
# --- Define K-Means Visualizer & Plots ---
def visualizer(kmeans, y_kmeans):
    # --- Figures Settings ---
    cluster_colors = ['#B9C0C9', '#3C096C', '#9D4EDD', '#FFBB00']
    labels = ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4', 'Centroids']
    title = dict(fontsize=12, fontweight='bold', style='italic', fontfamily='serif')
    text_style = dict(fontweight='bold', fontfamily='serif')
    scatter_style = dict(linewidth=0.65, edgecolor='#100C07', alpha=0.85)
    legend_style = dict(borderpad=2, frameon=False, fontsize=8)
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(14, 10))
    # --- Silhouette Plots ---
    s_viz = SilhouetteVisualizer(kmeans, ax=ax1, colors=cluster_colors)
    s_viz.fit(X)
    s_viz.finalize()
    s_viz.ax.set_title('Silhouette Plots of Clusters\n', **title)
    s_viz.ax.tick_params(labelsize=7)
    for text in s_viz.ax.legend_.texts:
        text.set_fontsize(9)
    for spine in s_viz.ax.spines.values():
        spine.set_color('None')
    s_viz.ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), **legend_style)
    s_viz.ax.grid(axis='x', alpha=0.5, color='#9B9A9C', linestyle='dotted')
    s_viz.ax.grid(axis='y', alpha=0)
    s_viz.ax.set_xlabel('\nCoefficient Values', fontsize=9, **text_style)
    s_viz.ax.set_ylabel('Cluster Labels\n', fontsize=9, **text_style)
    # --- Clusters Distribution ---
    y_kmeans_labels = list(set(y_kmeans.tolist()))
    for i in y_kmeans_labels:
        ax2.scatter(X[y_kmeans == i, 0], X[y_kmeans == i, 1], s=50, c=cluster_colors[i], **scatter_style)
    ax2.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=65, c='#0353A4', label='Centroids', **scatter_style)
    for spine in ax2.spines.values():
        spine.set_color('None')
    ax2.set_title('Scatter Plot Clusters Distributions\n', **title)
    ax2.legend(labels, bbox_to_anchor=(0.95, -0.05), ncol=5, **legend_style)
    ax2.grid(axis='both', alpha=0.5, color='#9B9A9C', linestyle='dotted')
    ax2.tick_params(left=False, right=False, labelleft=False, labelbottom=False, bottom=False)
    ax2.spines['bottom'].set_visible(True)
    ax2.spines['bottom'].set_color('#CAC9CD')
    # --- Waffle Chart ---
    unique, counts = np.unique(y_kmeans, return_counts=True)
    df_waffle = dict(zip(unique, counts))
    total = sum(df_waffle.values())
    wfl_square = {key: value / 25 for key, value in df_waffle.items()}
    wfl_label = {key: round(value / total * 100, 2) for key, value in df_waffle.items()}
    ax3 = plt.subplot(2, 2, (3, 4))
    ax3.set_title('Percentage of Each Clusters\n', **title)
    ax3.set_aspect(aspect='auto')
    Waffle.make_waffle(ax=ax3, rows=6, values=wfl_square, colors=cluster_colors,
                       labels=[f"Cluster {i+1} - ({k}%)" for i, k in wfl_label.items()], icons='child', icon_size=30,
                       legend={'loc': 'upper center', 'bbox_to_anchor': (0.5, -0.05), 'ncol': 4, 'borderpad': 2,
                               'frameon': False, 'fontsize': 10})
    ax3.text(0.01, -0.09, '** 1 square ≈ 25 customers', weight='bold', style='italic', fontsize=8)
    # --- Suptitle ---
    plt.suptitle('Customer Segmentation using K-Means\n', fontsize=14, **text_style)
    plt.tight_layout()
    plt.show()
# --- Calling K-Means Functions ---
visualizer(kmeans, y_kmeans)
# --- Evaluate Clustering Quality Function ---
def evaluate_clustering(X, y):
    db_index = round(davies_bouldin_score(X, y), 3)
    s_score = round(silhouette_score(X, y), 3)
    ch_index = round(calinski_harabasz_score(X, y), 3)
    print('Evaluate Clustering Quality:')
    print('    Davies-Bouldin Index:', db_index)
    print('        Silhouette Score:', s_score)
    print(' Calinski-Harabasz Index:', ch_index)
    return db_index, s_score, ch_index
# --- Evaluate K-Means Cluster Quality ---
db_kmeans, ss_kmeans, ch_kmeans = evaluate_clustering(X, y_kmeans)
from matplotlib.patches import Rectangle
# --- Define Epsilon Values ---
def epsilon():
    # --- Calculate Nearest Neighbors ---
    neighbors = NearestNeighbors(n_neighbors=2)
    nbrs = neighbors.fit(X)
    distances, indices = nbrs.kneighbors(X)
    distances = np.sort(distances, axis=0)
    # --- Figure Settings ---
    bbox = dict(boxstyle='round', pad=0.3, color='#FFDA47', alpha=0.6)
    txt1 = dict(textcoords='offset points', va='center', ha='center', fontfamily='serif', style='italic')
    txt2 = dict(textcoords='offset points', va='center', fontfamily='serif', style='italic')
    kw = dict(arrowstyle='Simple, tail_width=0.1, head_width=0.4, head_length=1', color='black')
    text_style = dict(fontweight='bold', fontfamily='serif')
    fig = plt.figure(figsize=(14, 5))
    # --- Epsilon Plot (k-distance graph) ---
    distances_1 = distances[:, 1]
    ax1 = fig.add_subplot(1, 3, (1, 2))
    plt.plot(distances_1, color='#5829A7')
    plt.xlabel('\nData Points (sorted by distance)', fontsize=9, **text_style)
    plt.ylabel('Distance to Nearest Neighbor\n', fontsize=9, **text_style)
    ax1.add_patch(Rectangle((2090, 0.14), 150, 0.35, edgecolor='#FFCC00', fill=False, lw=1.5))
    plt.annotate('The optimal Epsilon value is\nat the point of maximum curvature.', xy=(1200, 0.87), xytext=(1, 1), fontsize=10, bbox=bbox, **txt1)
    plt.annotate('', xy=(2100, 0.4), xytext=(1300, 0.75), arrowprops=kw)
    for spine in ax1.spines.values():
        spine.set_color('None')
    plt.grid(axis='y', alpha=0.5, color='#9B9A9C', linestyle='dotted')
    plt.grid(axis='x', alpha=0)
    plt.tick_params(labelsize=7)
    # --- Explanations ---
    ax2 = fig.add_subplot(1, 3, 3)
    plt.annotate('From the plot, the maximum curvature\nof the curve is at about 0.35, and thus\nwe pick our Eps as 0.35.', xy=(0.1, 0.5), xytext=(1, 1), fontsize=14, bbox=bbox, **txt2)
    for spine in ax2.spines.values():
        spine.set_color('None')
    plt.grid(axis='both', alpha=0)
    plt.axis('off')
    plt.suptitle('DBSCAN Epsilon Value\n', fontsize=13, **text_style)
    plt.tight_layout()
    plt.show()
# --- Calling Epsilon Functions ---
epsilon()
# --- Fit DBSCAN and Count Clusters ---
dbscan = DBSCAN(eps=0.35, min_samples=4)
y_dbscan = dbscan.fit_predict(X)
# Get the labels from the model; noise points are labeled -1
labels = dbscan.labels_
# Count the number of unique labels, excluding noise
n_clusters = len(np.unique(labels[labels != -1]))
print('Number of clusters: {}'.format(n_clusters))
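# --- Also report how many points DBSCAN flagged as noise (sketch) ---
n_noise = int(np.sum(labels == -1))
print('Noise points: {} ({:.1f}% of data)'.format(n_noise, 100 * n_noise / len(labels)))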
from matplotlib.lines import Line2D
# --- Define DBSCAN Result Distribution ---
def dbscan_visualizer(dbscan, y_dbscan):
    # --- Figures Settings ---
    cluster_colors = ['#FFBB00', '#9D4EDD', '#00FF00', '#0000FF', '#FA8072', '#808080', '#ADFF2F', 'red', 'black']
    suptitle = dict(fontsize=12, fontweight='heavy', fontfamily='serif')
    title = dict(fontsize=8, fontfamily='serif')
    scatter_style = dict(linewidth=0.65, edgecolor='#100C07', alpha=0.85)
    bbox = dict(boxstyle='round', pad=0.3, color='#FFDA47', alpha=0.6)
    txt = dict(textcoords='offset points', va='center', ha='center', fontfamily='serif', style='italic')
    legend_style = dict(borderpad=2, frameon=False, fontsize=6)
    # --- Arrow Settings ---
    style = 'Simple, tail_width=0.3, head_width=3, head_length=5'
    kw = dict(arrowstyle=style, color='#3E3B39')
    # --- Percentage Labels ---
    unique, counts = np.unique(y_dbscan, return_counts=True)
    dbscan_count = dict(zip(unique, counts))
    total = sum(dbscan_count.values())
    dbscan_label = {key: round(value / total * 100, 2) for key, value in dbscan_count.items() if key != -1}
    # --- Clusters Distribution ---
    fig, ax = plt.subplots(1, 1, figsize=(7, 5), dpi=300)
    unique_labels = np.unique(y_dbscan)
    legend_elements = []
    for label in unique_labels:
        if label == -1:  # noise points
            plt.scatter(X[y_dbscan == label, 0], X[y_dbscan == label, 1], s=15, c=cluster_colors[-1], **scatter_style)
            legend_elements.append(Line2D([0], [0], marker='o', color='w', markerfacecolor=cluster_colors[-1], markersize=5, label='Outliers'))
        else:  # regular clusters
            plt.scatter(X[y_dbscan == label, 0], X[y_dbscan == label, 1], s=50, c=cluster_colors[label], **scatter_style)
            legend_elements.append(Line2D([0], [0], marker='o', color='w', markerfacecolor=cluster_colors[label], markersize=5, label=f'Cluster {label+1} - ({dbscan_label[label]}%)'))
    ax.legend(handles=legend_elements, bbox_to_anchor=(1, 1), **legend_style)
    for spine in ax.spines.values():
        spine.set_color('None')
    plt.grid(axis='both', alpha=0.3, color='#9B9A9C', linestyle='dotted')
    plt.annotate('Outliers', xy=(23, 18.8), xytext=(1, 1), fontsize=8, bbox=bbox, **txt)
    plt.tick_params(left=False, right=False, labelleft=False, labelbottom=False, bottom=False)
    plt.title('Two clusters of customers were formed. There are also some outliers detected.\n', loc='left', **title)
    plt.suptitle('Customer Clustering using DBSCAN', x=0.123, y=0.98, ha='left', **suptitle)
    plt.show()
# --- Calling DBSCAN Functions ---
dbscan_visualizer(dbscan, y_dbscan)
# --- Evaluate DBSCAN Cluster Quality ---
db_dbscan, ss_dbscan, ch_dbscan = evaluate_clustering(X, y_dbscan)
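# --- Caveat (sketch): the scores above include DBSCAN's noise points (label -1),
# which drags the metrics down. Re-evaluating on clustered points only gives a fairer
# read; this is an extra check, not part of the comparison table below ---
mask_core = y_dbscan != -1
if len(np.unique(y_dbscan[mask_core])) > 1:
    evaluate_clustering(X[mask_core], y_dbscan[mask_core])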
import scipy.cluster.hierarchy as shc
from seaborn import set_palette  # note: shadows yellowbrick's set_palette imported earlier
def agg_dendrogram():
    # Define and set color palette
    color_palette = ['#FFBB00', '#3C096C', '#9D4EDD', '#3C096C']
    set_palette(color_palette)
    # Define styles for plot elements
    text_style = dict(fontweight='bold', fontfamily='serif')
    ann = dict(textcoords='offset points', va='center', ha='center', fontfamily='serif', style='italic')
    title = dict(fontsize=12, fontweight='bold', style='italic', fontfamily='serif')
    bbox = dict(boxstyle='round', pad=0.3, color='#FFDA47', alpha=0.6)
    # Initialize plot
    fig = plt.figure(figsize=(14, 7))
    ax = fig.add_subplot()
    # Generate dendrogram (Ward linkage on the PCA-projected data)
    dend = shc.dendrogram(shc.linkage(X, method='ward', metric='euclidean'))
    # Add visual elements to plot
    plt.axhline(y=115, color='#3E3B39', linestyle='--')
    plt.xlabel('\nData Points', fontsize=9, **text_style)
    plt.ylabel('Euclidean Distances\n', fontsize=9, **text_style)
    plt.annotate('Horizontal Cut Line', xy=(6500, 122), xytext=(1, 1), fontsize=13, bbox=bbox, **ann)
    plt.tick_params(labelbottom=False)
    for spine in ax.spines.values():
        spine.set_color('None')
    plt.grid(axis='both', alpha=0)
    plt.tick_params(labelsize=7)
    plt.title('Dendrograms\n', **title)
    plt.suptitle('Customer Clustering using Hierarchical Clustering\n', fontsize=14, **text_style)
    plt.tight_layout()
    # Display the plot
    plt.show()
# Call function
agg_dendrogram()
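# --- Verify the cut (minimal sketch): cutting the dendrogram at the dashed line
# (distance 115) should reproduce the two clusters used below ---
cut_labels = shc.fcluster(shc.linkage(X, method='ward', metric='euclidean'), t=115, criterion='distance')
print('Clusters at cut height 115:', len(np.unique(cut_labels)))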
# --- Implementing Hierarchical Clustering ---
agg_cluster = AgglomerativeClustering(n_clusters=2, metric='euclidean', linkage='ward')  # 'affinity' was renamed to 'metric' in scikit-learn 1.2
y_agg_cluster = agg_cluster.fit_predict(X)
# --- Define Hierarchical Clustering Distributions ---
def agg_visualizer(agg_cluster, y_agg_cluster):
    # --- Figures Settings ---
    cluster_colors = ['#FFBB00', '#3C096C']
    labels = ['Cluster 1', 'Cluster 2']
    suptitle = dict(fontsize=14, fontweight='heavy', fontfamily='serif')
    title = dict(fontsize=10, fontweight='bold', style='italic', fontfamily='serif')
    scatter_style = dict(linewidth=0.65, edgecolor='#100C07', alpha=0.85)
    legend_style = dict(borderpad=2, frameon=False, fontsize=9)
    fig = plt.figure(figsize=(14, 7))
    # --- Percentage Labels ---
    unique, counts = np.unique(y_agg_cluster, return_counts=True)
    df_waffle = dict(zip(unique, counts))
    total = sum(df_waffle.values())
    wfl_square = {key: value / 25 for key, value in df_waffle.items()}
    wfl_label = {key: round(value / total * 100, 2) for key, value in df_waffle.items()}
    # --- Clusters Distribution ---
    y_agg_labels = list(set(y_agg_cluster.tolist()))
    ax1 = fig.add_subplot(1, 3, (1, 2))
    for i in y_agg_labels:
        ax1.scatter(X[y_agg_cluster == i, 0], X[y_agg_cluster == i, 1], s=50, c=cluster_colors[i], label=labels[i], **scatter_style)
    for spine in ax1.spines.values():
        spine.set_color('None')
    for spine in ['bottom', 'left']:
        ax1.spines[spine].set_visible(True)
        ax1.spines[spine].set_color('#CAC9CD')
    ax1.legend([f"Cluster {i+1} - ({k}%)" for i, k in wfl_label.items()], bbox_to_anchor=(1.3, -0.03), ncol=4, **legend_style)
    ax1.grid(axis='both', alpha=0.3, color='#9B9A9C', linestyle='dotted')
    ax1.tick_params(left=False, right=False, labelleft=False, labelbottom=False, bottom=False)
    plt.title('Scatter Plot Clusters Distributions\n', **title)
    # --- Waffle Chart ---
    ax2 = fig.add_subplot(1, 3, 3)
    ax2.set_title('Percentage of Each Clusters\n', **title)
    ax2.set_aspect(aspect='auto')
    Waffle.make_waffle(ax=ax2, rows=7, values=wfl_square, colors=cluster_colors, icons='user', icon_size=16)
    if ax2.get_legend():  # remove the auto-generated legend, if any
        ax2.get_legend().remove()
    ax2.text(0.01, 0.95, '** 1 square ≈ 25 customers', style='italic', fontsize=7)
    plt.suptitle('Customer Clustering using Hierarchical Clustering\n', **suptitle)
    plt.show()
# --- Calling Hierarchical Clustering Functions ---
agg_visualizer(agg_cluster, y_agg_cluster)
# --- Evaluate Hierarchical Clustering Quality ---
db_agg, ss_agg, ch_agg = evaluate_clustering(X, y_agg_cluster)
# --- Comparison Table ---
compare = pd.DataFrame({'Model': ['K-Means', 'DBSCAN', 'Hierarchical Clustering'],
'Davies-Bouldin Index': [db_kmeans, db_dbscan, db_agg],
'Silhouette Score': [ss_kmeans, ss_dbscan, ss_agg],
'Calinski-Harabasz Index': [ch_kmeans, ch_dbscan, ch_agg]})
# --- Create Model Quality Comparison Table ---
print(' Model Quality Comparison: ')
compare.sort_values(by='Model', ascending=False).style.background_gradient(cmap='Blues').hide(axis='index').set_properties(**{'font-family': 'Segoe UI'})
# --- Add K-Means Prediction to Data Frame ----
df['cluster_result'] = y_kmeans+1
df['cluster_result'] = 'Cluster '+df['cluster_result'].astype(str)
# --- Calculating Overall Mean from Current Data Frame ---
df_profile_overall = df.describe().loc[['mean']].T
# --- Summarize Mean of Each Clusters ---
df_cluster_summary = df.groupby('cluster_result').describe().T.reset_index().rename(columns={'level_0': 'Column Name', 'level_1': 'Metrics'})
df_cluster_summary = df_cluster_summary[df_cluster_summary['Metrics'] == 'mean'].set_index('Column Name')
# --- Combining Both Data Frame ---
df_profile = df_cluster_summary.join(df_profile_overall).reset_index()
df_profile.style.hide(axis='index')  # Styler.hide_index() was removed in pandas 2.0
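# --- Quick read of the profile (sketch): compare each cluster's means on a few key
# columns against the overall mean to help label the segments ---
print(df.groupby('cluster_result')[['Income', 'Spent', 'Age', 'Children']].mean().round(1))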