from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
import numpy as np
# Feature matrix used for clustering: the ten columns c01..c10 of StudentRecord.
feature_columns = [
    'c01', 'c02', 'c03', 'c04', 'c05',
    'c06', 'c07', 'c08', 'c09', 'c10',
]
X = StudentRecord[feature_columns]
# Fit KMeans on X for k = 2, 3 and 4. Each pass records the fitted model's
# inertia in `sse` and shows a scatter of the first two columns of X
# (labelled c01 / c02) coloured by the assigned cluster label.
sse = []
for k in range(2, 5):
    # Fixed random_state so repeated runs give the same clustering.
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    labels = kmeans.labels_
    sse.append(kmeans.inertia_)

    fig, ax = plt.subplots()
    scatter = ax.scatter(X.iloc[:, 0], X.iloc[:, 1], c=labels)
    ax.set(xlabel='c01', ylabel='c02')
    fig.colorbar(scatter)
    ax.set_title(f'K-Means Clustering with k={k}')
    plt.show()
# Elbow plot: the values collected in `sse` against the number of clusters.
# The k axis is derived from len(sse) so it cannot drift out of step with the
# list; it assumes sse[0] belongs to k=2, as the range(2, 5) axis did.
first_k = 2
k_values = range(first_k, first_k + len(sse))
plt.plot(k_values, sse, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method')

# The elbow is the point where the curve bends most sharply, i.e. where the
# discrete second difference of SSE is largest. Taking argmin of the first
# difference would instead flag the k *before* the steepest drop, which for a
# decreasing SSE curve is nearly always the smallest k tried.
if len(sse) >= 3:
    # Entry i of the second difference is centred on sse[i + 1].
    optimal_k = first_k + 1 + int(np.argmax(np.diff(sse, n=2)))
else:
    # Fewer than three points have no interior point to bend at.
    optimal_k = first_k
plt.axvline(x=optimal_k, color='r', linestyle='--')
plt.show()
from sklearn.metrics import silhouette_samples, silhouette_score
# Silhouette sweep: for each candidate cluster count (2, 3, 4) fit KMeans on X
# and record the silhouette score of the resulting labelling.
range_n_clusters = range(2, 5)
silhouette_scores = []
for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = clusterer.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    silhouette_scores.append(silhouette_avg)

plt.plot(range_n_clusters, silhouette_scores, 'bx-')
plt.title('Silhouette Method')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')

# Mark the candidate with the highest score (the first one on ties).
best_position = max(
    range(len(silhouette_scores)),
    key=silhouette_scores.__getitem__,
)
optimal_k = range_n_clusters[best_position]
plt.axvline(x=optimal_k, color='r', linestyle='--')
plt.show()
# Fit KMeans on X for k = 2, 3 and 4. Each pass records the fitted model's
# inertia in `sse` and shows a scatter of the third and fourth columns of X
# (labelled c03 / c04) coloured by the assigned cluster label.
sse = []
for k in range(2, 5):
    # Fixed random_state so repeated runs give the same clustering.
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)
    labels = kmeans.labels_
    sse.append(kmeans.inertia_)

    fig, ax = plt.subplots()
    scatter = ax.scatter(X.iloc[:, 2], X.iloc[:, 3], c=labels)
    ax.set(xlabel='c03', ylabel='c04')
    fig.colorbar(scatter)
    ax.set_title(f'K-Means Clustering with k={k}')
    plt.show()
# Elbow plot: the values collected in `sse` against the number of clusters.
# The k axis is derived from len(sse) so it cannot drift out of step with the
# list; it assumes sse[0] belongs to k=2, as the range(2, 5) axis did.
first_k = 2
k_values = range(first_k, first_k + len(sse))
plt.plot(k_values, sse, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Sum of squared distances')
plt.title('Elbow Method')

# The elbow is the point where the curve bends most sharply, i.e. where the
# discrete second difference of SSE is largest. Taking argmin of the first
# difference would instead flag the k *before* the steepest drop, which for a
# decreasing SSE curve is nearly always the smallest k tried.
if len(sse) >= 3:
    # Entry i of the second difference is centred on sse[i + 1].
    optimal_k = first_k + 1 + int(np.argmax(np.diff(sse, n=2)))
else:
    # Fewer than three points have no interior point to bend at.
    optimal_k = first_k
plt.axvline(x=optimal_k, color='r', linestyle='--')
plt.show()
from sklearn.metrics import silhouette_samples, silhouette_score
# Silhouette sweep: for each candidate cluster count (2, 3, 4) fit KMeans on X
# and record the silhouette score of the resulting labelling.
range_n_clusters = range(2, 5)
silhouette_scores = []
for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, random_state=42)
    cluster_labels = clusterer.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    silhouette_scores.append(silhouette_avg)

plt.plot(range_n_clusters, silhouette_scores, 'bx-')
plt.title('Silhouette Method')
plt.xlabel('Number of clusters')
plt.ylabel('Silhouette score')

# Mark the candidate with the highest score (the first one on ties).
best_position = max(
    range(len(silhouette_scores)),
    key=silhouette_scores.__getitem__,
)
optimal_k = range_n_clusters[best_position]
plt.axvline(x=optimal_k, color='r', linestyle='--')
plt.show()