import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.vq import kmeans, kmeans2, vq
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import SilhouetteVisualizer
# --- Setup and first look at the data ---------------------------------------
# Use the "fivethirtyeight" matplotlib style sheet for every figure drawn below.
plt.style.use("fivethirtyeight")

# Load the dataset. The rest of the script reads the columns "x", "y",
# "x_scaled" and "y_scaled" from this frame.
data = pd.read_csv("data/data.csv")
# Bare expression: the preview is only rendered when an interactive session
# echoes it; it has no other effect.
data.head()

# Scatter the two scaled features. The title is Indonesian for
# "Two clusters are clearly visible".
ax = sns.scatterplot(data=data, x="x_scaled", y="y_scaled")
ax.set_title("Jelas Terlihat Terdapat 2 Cluster", loc="left")
ax.grid(False)
plt.show()
# --- K-means via scipy.cluster.vq.kmeans2 -----------------------------------
# Split the two scaled features into k=2 groups, seeding the centroids with
# the "++" (k-means++) initialisation. kmeans2 returns (centroids, labels).
features = data[["x_scaled", "y_scaled"]]
centroids, labels = kmeans2(features, k=2, minit="++")
print("centroids:", centroids, sep="\n")
print("labels:", labels)

# Keep the assignment next to the observations so seaborn can colour by it.
data["labels_kmeans2"] = labels

ax = sns.scatterplot(data=data, x="x_scaled", y="y_scaled", hue="labels_kmeans2")
# Overlay the centroids as black stars on the same axes.
plt.scatter(
    centroids[:, 0],
    centroids[:, 1],
    marker="*",
    c="black",
    label="centroid",
)
ax.set_title("Implementasi kmeans2", loc="left")
ax.legend()
ax.grid(False)
plt.show()

# Bare expression: original coordinates with their kmeans2 label; only shown
# when an interactive session echoes it.
data[["x", "y", "labels_kmeans2"]]
# --- K-means via scipy.cluster.vq.kmeans + vq -------------------------------
# NOTE(review): presumably "x_scaled"/"y_scaled" are already whitened, as
# scipy's kmeans expects -- confirm against the data preparation step.
features = data[["x_scaled", "y_scaled"]]

# Step 1 - compute the cluster centres for k=2, plus the distortion that
# kmeans reports for them.
cluster_centers, distortion = kmeans(features, k_or_guess=2)
print("centroids:", cluster_centers, sep="\n")
print("distortion:", distortion)

# Step 2 - assign a cluster label to every observation. vq returns the index
# of the closest centre and the distance to it, one entry per observation.
assigned, distortion_list = vq(features, cluster_centers)
data["labels_kmeans"] = assigned
print(f"{distortion_list = }\n{distortion_list.mean() = }")
print("labels:", data["labels_kmeans"].tolist())

# Step 3 - plot the clusters and mark the centres with black stars.
ax = sns.scatterplot(data=data, x="x_scaled", y="y_scaled", hue="labels_kmeans")
plt.scatter(
    cluster_centers[:, 0],
    cluster_centers[:, 1],
    marker="*",
    c="black",
    label="centroid",
)
ax.set_title("Implementasi kmeans", loc="left")
ax.legend()
ax.grid(False)
plt.show()

# Bare expression: compare the labels from both scipy approaches side by side;
# only shown when an interactive session echoes it.
data[["x", "y", "labels_kmeans2", "labels_kmeans"]]
# --- scikit-learn KMeans with k = 2 -----------------------------------------
features = data[["x_scaled", "y_scaled"]]

# random_state is pinned to the same value the inertia elbow loop uses, so the
# cluster ids (0/1) and the per-cluster summaries below are stable across runs.
k_means_2 = KMeans(n_clusters=2, random_state=11)
# fit_predict fits the model and returns the label of every training row in
# one call (documented as equivalent to fit(X) followed by predict(X)).
labels = k_means_2.fit_predict(features)

centroids = k_means_2.cluster_centers_
inertia = k_means_2.inertia_
print("centroids:", centroids, sep="\n")
print("inertia:", inertia)

data["labels_sklearn"] = labels
print("labels:", labels)

# Colour the observations by cluster and mark the centroids with black stars.
sns.scatterplot(x="x_scaled", y="y_scaled", hue="labels_sklearn", data=data)
plt.scatter(centroids[:, 0], centroids[:, 1], marker="*", c="black", label="centroid")
plt.title("Implementasi kmeans", loc="left")
plt.legend()
plt.grid(False)
plt.show()

# Summary statistics of the original (unscaled) coordinates per cluster.
cluster_0 = data.loc[data["labels_sklearn"] == 0, ["x", "y"]]
cluster_1 = data.loc[data["labels_sklearn"] == 1, ["x", "y"]]
# Use display() for both tables, as the k=3 section does: a bare expression is
# only echoed when it is the last statement of an interactive cell, so the
# first table could be silently dropped.
# NOTE(review): `display` is the IPython/Jupyter builtin this file already
# relies on without importing -- assumes a notebook/IPython session.
display(cluster_0.describe().T)
display(cluster_1.describe().T)
# --- scikit-learn KMeans with k = 3 -----------------------------------------
features = data[["x_scaled", "y_scaled"]]

# random_state is pinned to the same value the inertia elbow loop uses, so the
# cluster ids (0/1/2) and the per-cluster summaries below are stable across
# runs instead of being shuffled by the random initialisation.
k_means_3 = KMeans(n_clusters=3, random_state=11)
# fit_predict fits the model and returns the label of every training row in
# one call (documented as equivalent to fit(X) followed by predict(X)).
labels = k_means_3.fit_predict(features)

centroids = k_means_3.cluster_centers_
inertia = k_means_3.inertia_
print("centroids:", centroids, sep="\n")
print("inertia:", inertia)

# This overwrites the "labels_sklearn" column with the k=3 assignment.
data["labels_sklearn"] = labels
print("labels:", labels)

# Colour the observations by cluster and mark the centroids with black stars.
sns.scatterplot(x="x_scaled", y="y_scaled", hue="labels_sklearn", data=data)
plt.scatter(centroids[:, 0], centroids[:, 1], marker="*", c="black", label="centroid")
plt.title("Implementasi kmeans", loc="left")
plt.legend()
plt.grid(False)
plt.show()

# Summary statistics of the original (unscaled) coordinates per cluster.
# NOTE(review): `display` is not imported in this file; presumably this runs
# in a notebook/IPython session where it is a builtin.
cluster_0 = data.loc[data["labels_sklearn"] == 0, ["x", "y"]]
cluster_1 = data.loc[data["labels_sklearn"] == 1, ["x", "y"]]
cluster_2 = data.loc[data["labels_sklearn"] == 2, ["x", "y"]]
display(cluster_0.describe().T)
display(cluster_1.describe().T)
display(cluster_2.describe().T)
# --- Elbow method using the distortion from scipy's kmeans ------------------
features = data[["x_scaled", "y_scaled"]]
num_clusters = range(1, 7)

# Collect the distortion for every candidate k. kmeans returns
# (codebook, distortion); only the distortion is needed here, so the result is
# indexed instead of unpacked. This keeps the sweep from overwriting the
# module-level `cluster_centers` / `distortion` computed for k=2 earlier.
distortions = [kmeans(features, k_or_guess=k)[1] for k in num_clusters]

# Build a dataframe from the two sequences: num_clusters and distortions.
elbow_plot = pd.DataFrame({"num_clusters": num_clusters, "distortions": distortions})

# Draw the line plot, with one tick per candidate k.
sns.lineplot(x="num_clusters", y="distortions", data=elbow_plot)
plt.xticks(num_clusters)
plt.title("Elbow method using distortion")
plt.grid(False)
plt.show()
# --- Elbow method using scikit-learn's inertia ------------------------------
features = data[["x_scaled", "y_scaled"]]
clusters = range(1, 11)

# Fit one seeded KMeans per candidate k and keep its inertia_. fit() returns
# the estimator itself, so the attribute can be read off the call chain.
ssd = [
    KMeans(n_clusters=k, random_state=11).fit(features).inertia_
    for k in clusters
]

plt.plot(clusters, ssd)
# One tick per candidate k, matching the distortion-based elbow plot.
plt.xticks(clusters)
plt.xlabel("Clusters")
plt.ylabel("Inertia")
plt.title("Elbow method using inertia", loc="left")
plt.grid(False)
plt.show()
# --- Average silhouette score for k = 2 and k = 3 ---------------------------
# Both models are seeded with the random_state the inertia elbow loop uses, so
# the printed scores (and the silhouette plots that reuse k_means_2 and
# k_means_3 afterwards) are reproducible between runs.
features = data[["x_scaled", "y_scaled"]]

# fit_predict fits the model and returns the label of every training row
# (documented as equivalent to fit(X) followed by predict(X)).
k_means_2 = KMeans(n_clusters=2, random_state=11)
labels_2 = k_means_2.fit_predict(features)
average_silhouette_score_2 = silhouette_score(features, labels_2)
print(f"(k=2) {average_silhouette_score_2 = }")

k_means_3 = KMeans(n_clusters=3, random_state=11)
labels_3 = k_means_3.fit_predict(features)
average_silhouette_score_3 = silhouette_score(features, labels_3)
print(f"(k=3) {average_silhouette_score_3 = }")
# --- Silhouette plots -------------------------------------------------------
# Draw one yellowbrick silhouette plot per KMeans estimator, k=2 first and
# then k=3. Each pass wraps the estimator in a fresh visualizer, fits it on the
# scaled features, switches the grid off and shows the figure.
for estimator in (k_means_2, k_means_3):
    silhouette_viz = SilhouetteVisualizer(estimator)
    silhouette_viz.fit(data[["x_scaled", "y_scaled"]])
    plt.grid(False)
    silhouette_viz.show()
    plt.show()