import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.vq import kmeans, kmeans2, vq, whiten
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from yellowbrick.cluster import SilhouetteVisualizer
# Use the "fivethirtyeight" matplotlib style sheet for every figure below.
plt.style.use("fivethirtyeight")

# Read the point data and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='data/data.csv')
data.head(n=5)
xint64
yint64
0
17
4
1
20
6
2
35
0
3
14
0
4
37
4
# Every clustering cell reads "x_scaled"/"y_scaled", but the preview of the
# loaded frame only shows the raw "x" and "y" columns. Derive the scaled
# features with SciPy's whiten (each column divided by its standard
# deviation) unless something earlier has already provided them.
if not {"x_scaled", "y_scaled"}.issubset(data.columns):
    scaled_xy = whiten(data[["x", "y"]].to_numpy(dtype=float))
    data["x_scaled"] = scaled_xy[:, 0]
    data["y_scaled"] = scaled_xy[:, 1]

# Plot the scaled points to eyeball how many clusters there are.
sns.scatterplot(x='x_scaled', y='y_scaled', data=data)
plt.title("Jelas Terlihat Terdapat 2 Cluster", loc="left")
plt.grid(False)
plt.show()
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
findfont: Font family ['sans-serif'] not found. Falling back to DejaVu Sans.
findfont: Generic family 'sans-serif' not found because none of the following families were found: Arial, Liberation Sans, Bitstream Vera Sans, sans-serif
# Split the scaled points into two clusters with SciPy's kmeans2, using the
# "++" centroid initialisation.
scaled_points = data[["x_scaled", "y_scaled"]]
centroids, labels = kmeans2(scaled_points, k=2, minit="++")
print("centroids:", centroids, sep="\n")
print("labels:", labels)

# Store the assignment next to the raw data so it can be inspected later.
data["labels_kmeans2"] = labels

# Colour the points by cluster and mark each centroid with a black star.
ax = sns.scatterplot(data=data, x='x_scaled', y='y_scaled', hue="labels_kmeans2")
centroid_x, centroid_y = centroids[:, 0], centroids[:, 1]
ax.scatter(centroid_x, centroid_y, marker="*", c="black", label="centroid")
ax.set_title("Implementasi kmeans2", loc="left")
ax.legend()
ax.grid(False)
plt.show()
centroids:
[[2.6446202 2.17073753]
[0.66058433 0.15108805]]
labels: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1]
# Show the raw coordinates next to the kmeans2 cluster assignment.
data.loc[:, ["x", "y", "labels_kmeans2"]]
xint64
10 - 97
yint64
0 - 100
0
17
4
1
20
6
2
35
0
3
14
0
4
37
4
5
33
3
6
14
1
7
30
6
8
35
5
9
17
4
# Compute the cluster centres with SciPy's kmeans.
scaled_points = data[['x_scaled', 'y_scaled']]
cluster_centers, distortion = kmeans(scaled_points, k_or_guess=2)
print("centroids:", cluster_centers, sep="\n")
print("distortion:", distortion)

# Assign every observation to its nearest centre; vq also returns the
# distance from each observation to that centre.
codes, distortion_list = vq(scaled_points, cluster_centers)
data['labels_kmeans'] = codes
print(f"{distortion_list = }\n{distortion_list.mean() = }")
print("labels:", data["labels_kmeans"].tolist())

# Plot the clusters together with their centres.
ax = sns.scatterplot(data=data, x='x_scaled', y='y_scaled', hue='labels_kmeans')
center_x, center_y = cluster_centers[:, 0], cluster_centers[:, 1]
ax.scatter(center_x, center_y, marker="*", c="black", label="centroid")
ax.set_title("Implementasi kmeans", loc="left")
ax.legend()
ax.grid(False)
plt.show()
centroids:
[[2.6446202 2.17073753]
[0.66058433 0.15108805]]
distortion: 0.20140471735984214
distortion_list = array([0.1631032 , 0.06342087, 0.41644966, 0.28454633, 0.45214305,
0.33862857, 0.27326432, 0.23880818, 0.38998825, 0.1631032 ,
0.33139534, 0.08032372, 0.27262188, 0.36846179, 0.21805273,
0.13936667, 0.0684838 , 0.16121261, 0.06153335, 0.26194088,
0.18099276, 0.05756371, 0.05969703, 0.15284734, 0.18353941,
0.19090412, 0.23178557, 0.13637183, 0.08886738, 0.18645441,
0.07925213, 0.17507865, 0.11528924, 0.14852088, 0.27648338,
0.24007361])
distortion_list.mean() = 0.20140471735984214
labels: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
# Compare the kmeans2 and kmeans assignments side by side with the raw data.
data.loc[:, ["x", "y", "labels_kmeans2", "labels_kmeans"]]
xint64
10 - 97
yint64
0 - 100
30
23
10
31
25
0
32
25
10
33
27
7
34
17
17
35
17
15
# Fit scikit-learn's KMeans with two clusters on the scaled features.
# random_state is pinned (same seed as the inertia elbow loop in this file)
# so the cluster numbering is reproducible; the cells that follow select
# clusters by label id and would otherwise change between runs.
k_means_2 = KMeans(n_clusters=2, random_state=11)
k_means_2.fit(data[["x_scaled", "y_scaled"]])

centroids = k_means_2.cluster_centers_
inertia = k_means_2.inertia_
print("centroids:", centroids, sep="\n")
print("inertia:", inertia)

# Label every observation with the index of its nearest centroid.
labels = k_means_2.predict(data[["x_scaled", "y_scaled"]])
data["labels_sklearn"] = labels
print("labels:", labels)

# Plot the clusters together with their centroids.
sns.scatterplot(x='x_scaled', y='y_scaled', hue='labels_sklearn', data=data)
plt.scatter(centroids[:, 0], centroids[:, 1], marker="*", c="black", label="centroid")
plt.title("Implementasi kmeans", loc="left")
plt.legend()
plt.grid(False)
plt.show()
centroids:
[[2.6446202 2.17073753]
[0.66058433 0.15108805]]
inertia: 1.8654066617677527
labels: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1]
# Split the raw coordinates by the sklearn cluster label.
sklearn_labels = data["labels_sklearn"]
cluster_0 = data.loc[sklearn_labels == 0, ["x", "y"]]
cluster_1 = data.loc[sklearn_labels == 1, ["x", "y"]]

# Summary statistics for cluster 0, one row per coordinate.
cluster_0.describe().transpose()
countfloat64
meanfloat64
x
21
22.04761905
y
21
6.714285714
# Summary statistics for cluster 1, one row per coordinate.
cluster_1.describe().transpose()
countfloat64
meanfloat64
x
15
88.26666667
y
15
96.46666667
# Fit scikit-learn's KMeans with three clusters on the scaled features.
# random_state is pinned (same seed as the inertia elbow loop in this file)
# so the cluster numbering is reproducible; the cells that follow select
# clusters by label id and would otherwise change between runs.
k_means_3 = KMeans(n_clusters=3, random_state=11)
k_means_3.fit(data[["x_scaled", "y_scaled"]])

centroids = k_means_3.cluster_centers_
inertia = k_means_3.inertia_
print("centroids:", centroids, sep="\n")
print("inertia:", inertia)

# Label every observation with the index of its nearest centroid.
# NOTE: this overwrites the two-cluster labels stored in "labels_sklearn".
labels = k_means_3.predict(data[["x_scaled", "y_scaled"]])
data["labels_sklearn"] = labels
print("labels:", labels)

# Plot the clusters together with their centroids.
sns.scatterplot(x='x_scaled', y='y_scaled', hue='labels_sklearn', data=data)
plt.scatter(centroids[:, 0], centroids[:, 1], marker="*", c="black", label="centroid")
plt.title("Implementasi kmeans", loc="left")
plt.legend()
plt.grid(False)
plt.show()
centroids:
[[0.49782531 0.18348177]
[2.6446202 2.17073753]
[0.92506775 0.09844825]]
inertia: 0.9256091567302105
labels: [0 0 2 0 2 2 0 2 2 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 2 2 2 0 0]
# Split the raw coordinates by the sklearn cluster label.
sklearn_labels = data["labels_sklearn"]
cluster_0 = data.loc[sklearn_labels == 0, ["x", "y"]]
cluster_1 = data.loc[sklearn_labels == 1, ["x", "y"]]
cluster_2 = data.loc[sklearn_labels == 2, ["x", "y"]]

# Show the summary statistics of every cluster, one row per coordinate.
for cluster in (cluster_0, cluster_1, cluster_2):
    display(cluster.describe().transpose())
countfloat64
meanfloat64
x
13
16.61538462
y
13
8.153846154
distortions = []
num_clusters = range(1, 7)

# Collect the SciPy kmeans distortion for each candidate number of clusters.
for i in num_clusters:
    cluster_centers, distortion = kmeans(data[['x_scaled', 'y_scaled']], k_or_guess=i)
    distortions.append(distortion)

# Build a dataframe from the two sequences: num_clusters and distortions.
elbow_plot = pd.DataFrame({'num_clusters': num_clusters, 'distortions': distortions})

# Draw the line plot; the "elbow" suggests a suitable number of clusters.
sns.lineplot(x='num_clusters', y='distortions', data=elbow_plot)
plt.xticks(num_clusters)
plt.title("Elbow method using distortion")
plt.grid(False)
plt.show()
ssd = []
clusters = range(1, 11)

# Fit a seeded KMeans for every candidate k and record its inertia.
for k in clusters:
    k_means = KMeans(n_clusters=k, random_state=11)
    k_means.fit(data[["x_scaled", "y_scaled"]])
    ssd.append(k_means.inertia_)

# Plot inertia against the number of clusters to look for the elbow.
plt.plot(clusters, ssd)
plt.xlabel("Clusters")
plt.ylabel("Inertia")
plt.title("Elbow method using inertia", loc="left")
plt.grid(False)
plt.show()
# Compare the average silhouette score of the 2- and 3-cluster solutions.
# Both models are seeded (same seed as the inertia elbow loop in this file)
# so the comparison is reproducible from run to run.
k_means_2 = KMeans(n_clusters=2, random_state=11)
k_means_2.fit(data[["x_scaled", "y_scaled"]])
labels_2 = k_means_2.predict(data[["x_scaled", "y_scaled"]])
average_silhouette_score_2 = silhouette_score(
    data[["x_scaled", "y_scaled"]], labels_2
)
print(f"(k=2) {average_silhouette_score_2 = }")

k_means_3 = KMeans(n_clusters=3, random_state=11)
k_means_3.fit(data[["x_scaled", "y_scaled"]])
labels_3 = k_means_3.predict(data[["x_scaled", "y_scaled"]])
average_silhouette_score_3 = silhouette_score(
    data[["x_scaled", "y_scaled"]], labels_3
)
print(f"(k=3) {average_silhouette_score_3 = }")
(k=2) average_silhouette_score_2 = 0.8993895429449152
(k=3) average_silhouette_score_3 = 0.670566102305304
# Silhouette plot for the two-cluster model.
# k_means_2 is already fitted, so say so explicitly: yellowbrick's default
# "auto" check probes the estimator with a bare NumPy array, which makes
# scikit-learn warn that X has no valid feature names.
silhouette_viz = SilhouetteVisualizer(k_means_2, is_fitted=True)
silhouette_viz.fit(data[["x_scaled", "y_scaled"]])
plt.grid(False)
silhouette_viz.show()
plt.show()
/shared-libs/python3.9/py/lib/python3.9/site-packages/sklearn/base.py:445: UserWarning: X does not have valid feature names, but KMeans was fitted with feature names
warnings.warn(
# Silhouette plot for the three-cluster model.
# k_means_3 is already fitted, so say so explicitly: yellowbrick's default
# "auto" check probes the estimator with a bare NumPy array, which makes
# scikit-learn warn that X has no valid feature names.
silhouette_viz = SilhouetteVisualizer(k_means_3, is_fitted=True)
silhouette_viz.fit(data[["x_scaled", "y_scaled"]])
plt.grid(False)
silhouette_viz.show()
plt.show()
/shared-libs/python3.9/py/lib/python3.9/site-packages/sklearn/base.py:445: UserWarning: X does not have valid feature names, but KMeans was fitted with feature names
warnings.warn(