from sklearn.datasets import make_blobs
import pandas as pd
import matplotlib.pyplot as plt
x,y = make_blobs(n_samples=100, centers=4, n_features=2, cluster_std=[1,1.5,2,2], random_state=7)
# n_samples: total number of samples (points) to generate.
# n_features: number of features (dimensions) of each sample.
# centers: number of clusters to generate, or an array with the cluster centroid coordinates.
# cluster_std: standard deviation of each cluster; the larger the value, the more spread out the points within each cluster.
# random_state: seed for the random data generation.
(x,y)
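# As noted above, `centers` can also be an explicit array of centroid
# coordinates. A minimal sketch with made-up (hypothetical) centroids:
x_demo, y_demo = make_blobs(
    n_samples=90,
    centers=[[0, 0], [5, 5], [0, 5]],  # explicit centroids instead of a count
    cluster_std=1.0,
    random_state=7,
)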
df_blobs = pd.DataFrame({
    "x1": x[:, 0],
    "x2": x[:, 1],
    'y': y
})
df_blobs
def plot_2d_cluster(x, y, ax):
    # Plot each cluster using its label as the marker symbol
    y_uniques = pd.Series(y).unique()
    for label in y_uniques:
        x[y == label].plot(
            title=f'{len(y_uniques)} Clusters',
            kind='scatter',
            x='x1',
            y='x2',
            marker=f'${label}$',
            s=50,
            ax=ax
        )
fig,ax = plt.subplots(1,1, figsize=(15,10))
x,y = df_blobs[['x1','x2']], df_blobs['y']
plot_2d_cluster(x,y,ax)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4, random_state=7 )
y_pred = kmeans.fit_predict(x)
fig,ax = plt.subplots(1,2, figsize=(20,8))
plot_2d_cluster(x, y, ax[0])
plot_2d_cluster(x, y_pred, ax[1])
ax[0].set_title(f'Actual {ax[0].get_title()}')
ax[1].set_title(f'KMeans {ax[1].get_title()}')
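# K-Means assigns arbitrary cluster IDs, so the predicted labels need not match
# the generating labels numerically even when the grouping is the same. A quick,
# permutation-invariant agreement check (sketch using y and y_pred from above):
from sklearn.metrics import adjusted_rand_score
print('Adjusted Rand index:', adjusted_rand_score(y, y_pred))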
!pip install yellowbrick==1.5
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, kmeans_plusplus
from sklearn.metrics import silhouette_score
from yellowbrick.cluster.silhouette import silhouette_visualizer
sns.set_style(style='whitegrid')
# Generate a synthetic dataset with the make_blobs function
# n_samples: total number of data points to generate
# cluster_std: standard deviation of the generated clusters
# centers: number of clusters (groups) to generate
# n_features: number of features (dimensions) of the data points
# return_centers: whether to return the centers of the generated clusters
x,y,c = make_blobs(n_samples=500, cluster_std=0.8, centers=4, n_features=2, return_centers=True, random_state=42)
x
y
c
# Build a DataFrame with pandas
df_blobs = pd.DataFrame(
    {
        'x1': x[:, 0],  # 'x1' comes from the first column of the array x
        'x2': x[:, 1],  # 'x2' comes from the second column of the array x
        'y': y          # 'y' comes from the array y
    }
)
df_blobs
# Build a DataFrame with the true cluster centers
df_centers = pd.DataFrame(
    {
        'x1': c[:, 0],  # 'x1' comes from the first column of the array c
        'x2': c[:, 1]   # 'x2' comes from the second column of the array c
    }
)
df_centers
# Scatter plot of the data in df_blobs
sns.scatterplot(data=df_blobs, x='x1', y='x2')
# Scatter plot of the centers in df_centers
sns.scatterplot(data=df_centers, x='x1', y='x2', marker='*', s=250, color='red')
plt.show()
# Create a K-means instance with 4 clusters
kmeans = KMeans(n_clusters=4)
# Fit and get the cluster label for each data point in x
df_cluster = kmeans.fit_predict(x)
# Add a 'cluster' column to df_blobs with the cluster labels
df_blobs['cluster'] = df_cluster
# Get the cluster centers found by K-means
k_means_centers = kmeans.cluster_centers_
# Build a DataFrame df_kmeans_center with the K-means centers
df_kmeans_center = pd.DataFrame(
    {
        'x1': k_means_centers[:, 0],  # first column of k_means_centers
        'x2': k_means_centers[:, 1]   # second column of k_means_centers
    }
)
plt.figure(figsize=(12,8))
# Scatter plot of the data in df_blobs
sns.scatterplot(data=df_blobs, x='x1', y='x2', hue='cluster', palette='pastel')
# Scatter plot of the true centers in df_centers
sns.scatterplot(data=df_centers, x='x1', y='x2', marker='*', s=250, color='red')
# Scatter plot of the K-means centers in df_kmeans_center
sns.scatterplot(data=df_kmeans_center, x='x1', y='x2', marker='*', s=250, color='purple')
plt.title(f'silhouette score: {silhouette_score(x,df_blobs["cluster"]).round(5)}')
plt.show()
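# The K-Means centers come out in arbitrary order; to compare them against the
# true generating centers c, match each estimated center to its nearest true
# center. A sketch using the Hungarian assignment from scipy:
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cdist
cost = cdist(k_means_centers, c)                # pairwise Euclidean distances
row_ind, col_ind = linear_sum_assignment(cost)  # optimal one-to-one matching
print('Mean center error:', cost[row_ind, col_ind].mean())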
def viz_cluster(k, ax=None):
    # Create a K-means instance with k clusters
    kmeans = KMeans(n_clusters=k)
    # Fit and get the cluster label for each data point in x
    df_cluster = kmeans.fit_predict(x)
    # Add a 'cluster' column to df_blobs with the cluster labels
    df_blobs['cluster'] = df_cluster
    # Get the cluster centers found by K-means
    k_means_centers = kmeans.cluster_centers_
    # Build a DataFrame df_kmeans_center with the K-means centers
    df_kmeans_center = pd.DataFrame(
        {
            'x1': k_means_centers[:, 0],  # first column of k_means_centers
            'x2': k_means_centers[:, 1]   # second column of k_means_centers
        }
    )
    # Only create a new figure when no axes object was passed in
    if ax is None:
        plt.figure(figsize=(12, 8))
    # Scatter plot of the data in df_blobs
    sns.scatterplot(data=df_blobs, x='x1', y='x2', hue='cluster', palette='pastel', ax=ax)
    # Scatter plot of the true centers in df_centers
    sns.scatterplot(data=df_centers, x='x1', y='x2', marker='*', s=250, color='red', ax=ax)
    # Scatter plot of the K-means centers in df_kmeans_center
    sns.scatterplot(data=df_kmeans_center, x='x1', y='x2', marker='*', s=250, color='purple', ax=ax)
    if ax is not None:
        ax.set_title(f'silhouette score: {silhouette_score(x, df_blobs["cluster"]).round(5)}')
    else:
        plt.title(f'silhouette score: {silhouette_score(x, df_blobs["cluster"]).round(5)}')
fig, axs = plt.subplots(2, 2, figsize=(12, 12));
for i in range(2):
for j in range(2):
ax = axs[i, j]
viz_cluster(i*2 + j + 3,ax);
sum_of_squared_distances = []
K = range(2, 15)
for k in K:
    km = KMeans(n_clusters=k, random_state=42)
    km = km.fit(x)
    sum_of_squared_distances.append(km.inertia_)
plt.figure(figsize=(8,8))
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Within-cluster sum of squares (inertia)')
plt.title('Elbow for KMeans clustering')
plt.show()
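# The elbow can also be located programmatically: on a decreasing convex curve
# the sharpest bend has the largest second difference. A rough heuristic sketch
# (reusing K and sum_of_squared_distances from above):
second_diff = np.diff(sum_of_squared_distances, n=2)
elbow_k = K[int(np.argmax(second_diff)) + 1]  # +1 offsets the double differencing
print('Heuristic elbow at k =', elbow_k)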
silhouetteScore = []
K = range(2, 15)
for k in K:
    km = KMeans(n_clusters=k, random_state=42)
    km = km.fit(x)
    y = km.predict(x)
    silhouetteScore.append(silhouette_score(x, y))
plt.figure(figsize=(8,8))
plt.plot(K, silhouetteScore, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.show()
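# Pick the k with the highest average silhouette score:
best_k = K[int(np.argmax(silhouetteScore))]
print('Best k by silhouette:', best_k)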
# Visualize the clusters with the optimal value of K
viz_cluster(4)
km = KMeans(n_clusters=4)
km.fit(x)
plt.figure(figsize=(15,8))
silhouette_visualizer(km,x,colors='yellowbrick');
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
# Generate a synthetic dataset with the make_blobs function
x,y,c = make_blobs(n_samples=500, cluster_std=0.8, centers=4, n_features=2, return_centers=True, random_state=40)
df_blobs = pd.DataFrame({
    "x1": x[:, 0],
    "x2": x[:, 1],
    'y': y
})
df_centers = pd.DataFrame(
    {
        'x1': c[:, 0],
        'x2': c[:, 1]
    }
)
fig = plt.figure(figsize=(8,8))
sns.scatterplot(data=df_blobs, x='x1', y='x2')
plt.show()
sns.set_style(style='white')
fig = plt.figure(figsize=(12,8))
dendrogram_plot = dendrogram(linkage(x, method='ward'))
plt.title('Dendrogram using Ward linkage')
plt.xlabel('Cluster', fontsize=12)
plt.ylabel('Euclidean distance', fontsize=12)
plt.show();
sns.set_style(style='white')
fig = plt.figure(figsize=(12,8))
dendrogram_plot = dendrogram(linkage(x, method='ward'))
plt.title('Dendrogram using Ward linkage')
plt.hlines(y=40, xmin=0, xmax=5000, linestyles='dashed', colors='r')
plt.xlabel('Cluster', fontsize=12)
plt.ylabel('Euclidean distance', fontsize=12)
plt.show();
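# scipy can extract the flat clusters implied by a horizontal cut of the
# dendrogram directly from the linkage matrix. A sketch for the dashed cut at
# distance 40 drawn above:
from scipy.cluster.hierarchy import fcluster
flat_labels = fcluster(linkage(x, method='ward'), t=40, criterion='distance')
print('Clusters at distance 40:', len(np.unique(flat_labels)))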
# hc = Hierarchical Clustering (Ward linkage uses Euclidean distance)
hc = AgglomerativeClustering(n_clusters=4, linkage='ward')
y_hc = hc.fit_predict(x)
estimator_hc = hc.fit(x)
df_blobs['cluster'] = y_hc
fig = plt.figure(figsize=(8,8))
sns.scatterplot(data=df_blobs, x='x1', y='x2', hue='cluster')
plt.show();
silhouette_score(x,y_hc)
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
range_n_clusters = [3,4,5,6]
def plot_hierarchical_cluster(x, range_n_clusters):
    for n_clusters in range_n_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)
        # The 1st subplot is the silhouette plot.
        # The silhouette coefficient can range from -1 to 1, but in this
        # example all values lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(x) + (n_clusters + 1) * 10])
        # Initialize the clusterer with the n_clusters value
        # (Ward linkage uses Euclidean distance)
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        cluster_labels = clusterer.fit_predict(x)
        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the
        # formed clusters.
        silhouette_avg = silhouette_score(x, cluster_labels)
        print(
            "For n_clusters =",
            n_clusters,
            "The average silhouette_score is :",
            silhouette_avg,
        )
        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(x, cluster_labels)
        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()
            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i
            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(
                np.arange(y_lower, y_upper),
                0,
                ith_cluster_silhouette_values,
                facecolor=color,
                edgecolor=color,
                alpha=0.7,
            )
            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            # Compute the new y_lower for the next plot
            y_lower = y_upper + 10  # 10 for the 0 samples
        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")
        # The vertical line for the average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
        ax1.set_yticks([])  # Clear the y-axis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
        # 2nd plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(
            x[:, 0], x[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
        )
        plt.show()
range_n_clusters = [4]
plot_hierarchical_cluster(x, range_n_clusters)
range_n_clusters = [3,5,6]
plot_hierarchical_cluster(x, range_n_clusters)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs, make_moons
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
X, y, c = make_blobs(n_samples=500, cluster_std=0.8, centers=4, n_features=2, return_centers=True, random_state=42)
df_blobs = pd.DataFrame(
    {
        'x1': X[:, 0],
        'x2': X[:, 1]
    }
)
df_centers = pd.DataFrame(
    {
        'x1': c[:, 0],
        'x2': c[:, 1]
    }
)
X_m, y_m = make_moons(n_samples=250, noise=0.05, random_state=42)
df_moons = pd.DataFrame(
    {
        'x1': X_m[:, 0],
        'x2': X_m[:, 1]
    }
)
sns.scatterplot(data=df_blobs, x='x1',y='x2')
plt.scatter(data=df_centers, x='x1',y='x2', marker='*', s=400, color='brown')
sns.scatterplot(data=df_moons, x='x1',y='x2')
dbscan_cluster = DBSCAN(eps=0.3, min_samples=3)
y_m_predict = dbscan_cluster.fit_predict(X_m)
df_moons['cluster'] = y_m_predict
df_moons
sns.scatterplot(data=df_moons, x='x1',y='x2',hue='cluster');
dbscan = DBSCAN(eps=1, min_samples=3)
y_predict = dbscan.fit_predict(X)
df_blobs['cluster'] = y_predict
sns.scatterplot(data=df_blobs, x='x1',y='x2',hue='cluster');
plt.title('''DBSCAN
ε: 1
minPts:3''');
dbscan = DBSCAN(eps=0.3, min_samples=3)
y_predict = dbscan.fit_predict(X)
df_blobs['cluster'] = y_predict
df_blobs
sns.scatterplot(data=df_blobs, x='x1',y='x2',hue='cluster');
plt.title('''DBSCAN
ε: 0.3
minPts:3''');
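# DBSCAN labels noise points as -1; with eps this small, many blob points fall
# outside any dense region. A quick sanity check on the labels above:
n_noise = (df_blobs['cluster'] == -1).sum()
n_clusters_found = df_blobs['cluster'].nunique() - (1 if n_noise > 0 else 0)
print(f'clusters: {n_clusters_found}, noise points: {n_noise}')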
from sklearn.neighbors import NearestNeighbors
neighbors = NearestNeighbors(n_neighbors=3)
neighbors_fit = neighbors.fit(X)
distances, indices = neighbors_fit.kneighbors(X)
distances
indices
distances = np.sort(distances, axis=0)
distances = distances[:,1]
fig = plt.figure(figsize=(10,10))
plt.plot(distances)
plt.xlabel('data')
plt.ylabel('Distance')
plt.hlines(y=0.4 ,xmin=0,xmax=500, linestyles="dashed", colors='r')
plt.hlines(y=0.2 ,xmin=0,xmax=500, linestyles="dashed", colors='r')
plt.show()
eps_values = np.arange(0.25, 0.80, 0.1)
min_samples = np.arange(2,10)
min_samples
from itertools import product
dbscan_params = list(product(eps_values, min_samples))
sil_scores = []
for p in dbscan_params:
    y_pred = DBSCAN(eps=p[0], min_samples=p[1]).fit_predict(X)
    sil_scores.append(silhouette_score(X, y_pred))
df_params_tunning = pd.DataFrame.from_records(dbscan_params, columns=['Eps','Min_Samples'])
df_params_tunning['sil_score'] = sil_scores
pivot_data = pd.pivot_table(df_params_tunning, values='sil_score', index='Min_Samples', columns='Eps')
pivot_data
fig, ax = plt.subplots(figsize=(18, 6))
sns.heatmap(pivot_data, annot=True, annot_kws={'size':10}, cmap='coolwarm')
dbscan_cluster = DBSCAN(eps=0.8, min_samples=4)
y_predict = dbscan_cluster.fit_predict(X)
df_blobs['cluster'] = y_predict
df_blobs
sns.scatterplot(df_blobs, x='x1', y='x2', hue='cluster');
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
def plot_dbscan_cluster(x, eps, min_samples):
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    # The 1st subplot is the silhouette plot.
    # The silhouette coefficient can range from -1 to 1, but in this example
    # all values lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # Fit DBSCAN; the number of groups is derived from the labels it returns
    # (the label -1 marks noise points)
    clusterer = DBSCAN(eps=eps, min_samples=min_samples)
    cluster_labels = clusterer.fit_predict(x)
    n_clusters = len(np.unique(cluster_labels))
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(x) + (n_clusters + 1) * 10])
    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the
    # formed clusters.
    silhouette_avg = silhouette_score(x, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )
    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(x, cluster_labels)
    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples with label i - 1
        # (shifted by one so the DBSCAN noise label -1 is included), and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i - 1]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )
        # Label the silhouette plots with their cluster labels at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i - 1))
        # Compute the new y_lower for the next plot
        y_lower = y_upper + 10  # 10 for the 0 samples
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    # The vertical line for the average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])  # Clear the y-axis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    # 2nd plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(
        x[:, 0], x[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
    )
    plt.show()
plot_dbscan_cluster(X,eps=0.749,min_samples=5)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style(style='whitegrid')
df_country = pd.read_csv('./Country-data.csv')
df_country
df_country.describe().T
df_country.isnull().sum()
df_country[df_country.duplicated()]
int_cols = df_country.select_dtypes(exclude='object').columns
int_cols
for cols in int_cols:
    print(cols)
fig, axs = plt.subplots(3, 3, figsize=(18, 18))
for i, cols in enumerate(int_cols):
    fila = i // 3     # compute the corresponding row index
    columna = i % 3   # compute the corresponding column index
    sns.boxplot(data=df_country, y=cols, ax=axs[fila, columna])
    axs[fila, columna].set_title(f'{cols.upper()}')
plt.figure(figsize=(14,12))
sns.heatmap(
df_country.select_dtypes(exclude='object').corr(),
annot=True,
cmap='coolwarm');
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_country_scaled = scaler.fit_transform(df_country.drop('country', axis=1))
df_country_scaled = pd.DataFrame(
df_country_scaled,
columns=df_country.drop('country', axis=1).columns)
df_country_scaled
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(df_country_scaled)
pca_data_scaled = pca.transform(df_country_scaled)
pd.DataFrame(pca_data_scaled)
# Compute the variance explained by each principal component
var = pca.explained_variance_ratio_
var
# Compute the cumulative explained variance
cum_var = np.cumsum(np.round(var, decimals=4)) * 100
plt.figure(figsize=(10,10))
# Plot the cumulative variance, marking each component with a circle
plt.plot(cum_var, marker='o', markersize=10, color='b')
# Annotate each point with its cumulative variance value
for i, ix in enumerate(cum_var):
    plt.annotate(np.round(ix, decimals=1), xy=(i, ix - 2.5), ha='center')
pca_data_standar = pd.DataFrame(pca_data_scaled)
pca_data_standar.drop([4,5,6,7,8],axis=1,inplace=True)
pca_data_standar
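# Alternatively, PCA can choose the number of components from a target
# explained-variance ratio instead of dropping columns by hand. A sketch
# keeping roughly 90% of the variance (scikit-learn accepts a float in (0, 1)):
pca_90 = PCA(n_components=0.9)
pca_data_90 = pca_90.fit_transform(df_country_scaled)
print('Components kept:', pca_90.n_components_)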
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
def plot_elbow_silhouette(K, values, title, ax):
    """
    Plots an elbow curve or a silhouette-coefficient curve against the number of clusters.
    Parameters:
    - K: A list or range of values representing the number of clusters.
    - values: A list of values corresponding to the y-axis of the plot.
    - title: The title of the plot.
    - ax: The matplotlib axes object on which the plot will be drawn.
    """
    sns.lineplot(ax=ax, x=K, y=values, linestyle='dashed', marker='o', markersize=10, color='b')
    ax.set_xlabel('K')
    ax.set_ylabel('Inertia' if 'WCSS' in title else 'Silhouette Coefficient')
    ax.set_title(title, size=15)
def analyze_cluster(data):
    """
    Performs a cluster analysis using the K-Means algorithm.
    Parameters:
    - data: The data on which the clustering analysis will be performed.
    Returns:
    - sum_of_squared_distances: A list of the sum of squared distances for each number of clusters.
    - silhouette_scores: A list of the silhouette coefficients for each number of clusters.
    """
    sum_of_squared_distances = []
    silhouette_scores = []
    K = range(2, 15)
    for k in K:
        km = KMeans(n_clusters=k, random_state=42)
        y = km.fit_predict(data)
        sum_of_squared_distances.append(km.inertia_)
        silhouette_scores.append(silhouette_score(data, y))
    return sum_of_squared_distances, silhouette_scores
def plot_cluster_elbow_silhouette(data, title, axes, row, col):
    """
    Plots the elbow curves for WCSS and silhouette coefficients.
    Parameters:
    - data: The data on which the clustering analysis will be performed.
    - title: The title of the analysis. (PCA | No PCA)
    - axes: The matplotlib axes grid on which the plots will be drawn.
    - row: The row index of the current subplot.
    - col: The column index of the current subplot.
    """
    K = range(2, 15)  # same range used inside analyze_cluster
    elbow, silhouette = analyze_cluster(data)
    plot_elbow_silhouette(K, elbow, f'Elbow Method WCSS ({title})', axes[row, col])
    plot_elbow_silhouette(K, silhouette, f'Elbow Method Silhouette Coefficient ({title})', axes[row, col + 1])
def calculate_and_assign_clusters(data, title, df, k):
    """
    Calculates clusters using the K-Means algorithm and assigns the cluster labels to a DataFrame.
    Parameters:
    - data: The data on which the clustering analysis will be performed.
    - title: The title of the analysis. (PCA | No PCA)
    - df: The DataFrame to which the cluster labels will be assigned.
    - k: The number of clusters to be generated by the K-Means algorithm.
    """
    km = KMeans(n_clusters=k, random_state=42)
    y = km.fit_predict(data)
    print('silhouette_score', title, ':', silhouette_score(data, y))
    if title == 'No PCA':
        df['k_means'] = y
    else:
        df['k_means_pca'] = y
score = pd.DataFrame([])
def calculate_and_assign_clusters(data: pd.DataFrame = None, method: str = None, title: str = None, df: pd.DataFrame = None, k: int = None, eps: float = None, min_samples: int = None):
    """
    Calculates clusters using the specified algorithm and assigns the cluster labels to a DataFrame.
    Parameters:
    - data: The data on which the clustering analysis will be performed.
    - method: The clustering algorithm to be used ('kmeans', 'hierarchical', or 'dbscan').
    - title: The title of the analysis ('PCA' or 'No PCA').
    - df: The DataFrame to which the cluster labels will be assigned.
    - k: The number of clusters to be generated (applicable for 'kmeans' and 'hierarchical').
    - eps: The maximum distance between two samples for them to be considered in the same neighborhood (applicable for 'dbscan').
    - min_samples: The number of samples in a neighborhood for a point to be considered a core point (applicable for 'dbscan').
    """
    if method == 'kmeans':
        # Perform K-Means clustering
        km = KMeans(n_clusters=k, random_state=42)
        y = km.fit_predict(data)
        if title == 'No PCA':
            df['k_means'] = y
        else:
            df['k_means_pca'] = y
    elif method == 'hierarchical':
        # Perform hierarchical clustering (Ward linkage uses Euclidean distance)
        hc = AgglomerativeClustering(n_clusters=k, linkage='ward')
        y = hc.fit_predict(data)
        if title == 'No PCA':
            df['hc'] = y
        else:
            df['hc_pca'] = y
    elif method == 'dbscan':
        # Perform DBSCAN clustering
        dbscan_train = DBSCAN(eps=eps, min_samples=min_samples)
        y = dbscan_train.fit_predict(data)
        if title == 'No PCA':
            df['dbscan'] = y
        else:
            df['dbscan_pca'] = y
    # Calculate and print the silhouette score, and store it for later comparison
    print('silhouette_score', title, ':', silhouette_score(data, y))
    score[method + '_' + title] = [silhouette_score(data, y)]
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
# Elbow Method with PCA
plot_cluster_elbow_silhouette(pca_data_standar, 'PCA', axes, 0, 0)
# Elbow Method without PCA
plot_cluster_elbow_silhouette(df_country_scaled, 'No PCA', axes, 1, 0)
plt.suptitle('Elbow Method analysis: WCSS and Silhouette', size=25)
plt.tight_layout()
plt.show()
calculate_and_assign_clusters(pca_data_standar, 'kmeans', 'PCA', df_country, 4)
calculate_and_assign_clusters(df_country_scaled, 'kmeans', 'No PCA', df_country, 4)
df_country
fig = plt.figure(figsize=(12,10))
sns.set_style('white')
dendrogram_plot = dendrogram(linkage(pca_data_scaled, method='ward'))
plt.title('Dendrogram country')
plt.xlabel('Cluster')
plt.ylabel('Euclidean distance')
for i in [80, 45, 20]:
    plt.hlines(y=i, xmin=0, xmax=5000, linestyles='dashed', alpha=.5, colors='black')
plt.show()
sns.set_style('whitegrid')
for i in range(2, 7):
    hc = AgglomerativeClustering(n_clusters=i, linkage='ward')
    y_hc = hc.fit_predict(pca_data_scaled)
    print(f'{i}_cluster :', silhouette_score(pca_data_scaled, y_hc))
calculate_and_assign_clusters(pca_data_scaled,'hierarchical','PCA',df_country,2)
fig = plt.figure(figsize=(12,10))
sns.set_style('white')
dendrogram_plot = dendrogram(linkage(df_country_scaled, method='ward'))
plt.title('Dendrogram country')
plt.xlabel('Cluster')
plt.ylabel('Euclidean distance')
for i in [23, 18, 14]:
    plt.hlines(y=i, xmin=0, xmax=5000, linestyles='dashed', alpha=.5, colors='black')
plt.show()
sns.set_style('whitegrid')
for i in range(2, 7):
    hc = AgglomerativeClustering(n_clusters=i, linkage='ward')
    y_hc = hc.fit_predict(df_country_scaled)
    print(f'{i}_cluster :', silhouette_score(df_country_scaled, y_hc))
calculate_and_assign_clusters(df_country_scaled, 'hierarchical', 'No PCA', df_country, 2)
from sklearn.neighbors import NearestNeighbors
# Create a NearestNeighbors model with n_neighbors=2
neighbors = NearestNeighbors(n_neighbors=2)
# Fit the NearestNeighbors model on the PCA-transformed data
neighbors_fit = neighbors.fit(pca_data_standar)
# Compute the distances and indices of the nearest neighbors
distances, indices = neighbors_fit.kneighbors(pca_data_standar)
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.plot(distances)
plt.xlabel('data')
plt.ylabel('Distance')
plt.hlines(y=2 ,xmin=0,xmax=165, linestyles="dashed", colors='r')
plt.hlines(y=0.5 ,xmin=0,xmax=165, linestyles="dashed", colors='r')
# Create an array of epsilon values from 0.5 to 2 in steps of 0.1
eps_values = np.arange(0.5, 2, 0.1)
# Create an array of min_samples values from 3 to 11
min_samples = np.arange(3, 12)
from itertools import product
# Generate every combination of eps_values and min_samples
dbscan_params = list(product(eps_values, min_samples))
no_of_clusters = []
sil_scores = []
for p in dbscan_params:
    y_dbscan = DBSCAN(eps=p[0], min_samples=p[1]).fit_predict(pca_data_standar)
    try:
        sil_scores.append(silhouette_score(pca_data_standar, y_dbscan))
    except ValueError:
        # silhouette_score raises ValueError when DBSCAN finds fewer than 2 labels
        sil_scores.append(0)
    no_of_clusters.append(len(np.unique(y_dbscan)))
df_params_tunning = pd.DataFrame.from_records(dbscan_params, columns=['Eps','Min_samples'])
df_params_tunning['sil_score'] = sil_scores
df_params_tunning['n_clusters'] = no_of_clusters
df_params_tunning
pivot_1 = pd.pivot_table(df_params_tunning, values='sil_score', columns='Eps', index='Min_samples')
pivot_2 = pd.pivot_table(df_params_tunning, values='n_clusters', columns='Eps', index='Min_samples')
plt.figure(figsize=(18,8))
sns.heatmap(pivot_1,annot=True, annot_kws={'size':10}, cmap='coolwarm');
plt.title('sil_score')
plt.show()
plt.figure(figsize=(18,8))
sns.heatmap(pivot_2,annot=True, annot_kws={'size':10}, cmap='coolwarm');
plt.title('n_clusters')
plt.show()
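# Read off the best (eps, min_samples) pair from the grid search before fitting
# the final model (a convenience sketch using the results table above):
best_params = df_params_tunning.loc[df_params_tunning['sil_score'].idxmax()]
best_params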
calculate_and_assign_clusters(
    data=pca_data_standar,
    method='dbscan',
    title='PCA',
    df=df_country,
    eps=1.2,
    min_samples=3)
from sklearn.neighbors import NearestNeighbors
# Create a NearestNeighbors model with n_neighbors=2
neighbors = NearestNeighbors(n_neighbors=2)
# Fit the NearestNeighbors model on the scaled (non-PCA) data
neighbors_fit = neighbors.fit(df_country_scaled)
# Compute the distances and indices of the nearest neighbors
distances, indices = neighbors_fit.kneighbors(df_country_scaled)
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.plot(distances)
plt.xlabel('data')
plt.ylabel('Distance')
plt.hlines(y=3 ,xmin=0,xmax=165, linestyles="dashed", colors='r')
plt.hlines(y=1 ,xmin=0,xmax=165, linestyles="dashed", colors='r')
# Create an array of epsilon values from 1 to 3 in steps of 0.1
eps_values = np.arange(1, 3, 0.1)
# Create an array of min_samples values from 3 to 11
min_samples = np.arange(3, 12)
from itertools import product
# Generate every combination of eps_values and min_samples
dbscan_params = list(product(eps_values, min_samples))
no_of_clusters = []
sil_scores = []
for p in dbscan_params:
    y_dbscan = DBSCAN(eps=p[0], min_samples=p[1]).fit_predict(df_country_scaled)
    try:
        sil_scores.append(silhouette_score(df_country_scaled, y_dbscan))
    except ValueError:
        # silhouette_score raises ValueError when DBSCAN finds fewer than 2 labels
        sil_scores.append(0)
    no_of_clusters.append(len(np.unique(y_dbscan)))
df_params_tunning = pd.DataFrame.from_records(dbscan_params, columns=['Eps','Min_samples'])
df_params_tunning['sil_score'] = sil_scores
df_params_tunning['n_clusters'] = no_of_clusters
df_params_tunning
pivot_1 = pd.pivot_table(df_params_tunning, values='sil_score', columns='Eps', index='Min_samples')
pivot_2 = pd.pivot_table(df_params_tunning, values='n_clusters', columns='Eps', index='Min_samples')
plt.figure(figsize=(18,8))
sns.heatmap(pivot_1,annot=True, annot_kws={'size':10}, cmap='coolwarm');
plt.title('sil_score')
plt.show()
plt.figure(figsize=(18,8))
sns.heatmap(pivot_2,annot=True, annot_kws={'size':10}, cmap='coolwarm');
plt.title('n_clusters')
plt.show()
calculate_and_assign_clusters(
    data=df_country_scaled,
    method='dbscan',
    title='No PCA',
    df=df_country,
    eps=1.4,
    min_samples=6)
df_country
score.values
plt.figure(figsize=(14,10))
sns.barplot(score)
for i, value in enumerate(score.values[0]):
    plt.annotate(f'{(value*100).round(3)}%', xy=(i, value), ha='center', va='bottom')
df_country.columns
df_country[['child_mort', 'exports', 'health', 'imports', 'income',
'inflation', 'life_expec', 'total_fer', 'gdpp']]
sns.pairplot(data=df_country[['child_mort', 'exports', 'health', 'imports', 'income',
'inflation', 'life_expec', 'total_fer', 'gdpp', 'k_means_pca',]], hue='k_means_pca', palette='coolwarm')
plt.figure(figsize=(12,8))
sns.scatterplot(data=df_country, x='child_mort', y='gdpp', hue='k_means_pca', palette='coolwarm')
plt.title('Relationship between Child Mortality and GDP per capita')
df_country.columns
!pip install pycountry
!pip install plotly
import pycountry
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
def get_alpha_3(location):
    # Look up the ISO 3166-1 alpha-3 code for a country name
    try:
        return pycountry.countries.get(name=location).alpha_3
    except AttributeError:
        # countries.get returns None when the name is not an exact match
        return None
df_map = df_country.copy()
df_map['iso_alpha'] = df_country['country'].apply(get_alpha_3)
iso3_to_iso2 = {c.alpha_3: c.alpha_2 for c in pycountry.countries}
df_map["iso_alpha2"] = df_map["iso_alpha"].map(iso3_to_iso2)
# Top 10 countries by child mortality, used for the bar chart below
df_bar = df_map[['child_mort', 'country', 'k_means_pca']].sort_values(by='child_mort', ascending=False).head(10)
# Initialize figure with subplots
fig = make_subplots(
rows=2, cols=2,
column_widths=[0.5, 0.5],
row_heights=[0.6, 0.4],
subplot_titles=('Child mortality vs GDP with size by Child mortality',
'10 Countries with the Highest Child Mortality',
'Infant Mortality & GDP: Country Analysis with K-means Clusters'),
specs=[[{"type": "scatter", "colspan": 2}, None],
[{"type": "bar"}, {"type": "choropleth"}]])
#######################
fig1 = px.scatter(
df_map,
x="child_mort",
y="gdpp",
hover_name="country",
hover_data=['gdpp', "k_means_pca", "child_mort"]
)
fig1.update_traces(marker_color="rgba(0,0,0,0)")
# Use the largest axis value among the plotted columns to scale the flag images
max_col = df_map[["child_mort", "gdpp"]].max().idxmax()
maxi = df_map[max_col].max()
fig1.update_layout(height=600, width=1000, plot_bgcolor="#dfdfdf", yaxis_range=[-5e3, 55e3])
trace1 = fig1.data[0]
fig.add_trace(trace1, row=1, col=1)
# Overlay each country's flag at its (child_mort, gdpp) position
for i, row in df_map.iterrows():
    country_iso = row["iso_alpha2"]
    fig.add_layout_image(
        dict(
            source=f"https://raw.githubusercontent.com/matahombres/CSS-Country-Flags-Rounded/master/flags/{country_iso}.png",
            xref="x",
            yref="y",
            xanchor="center",
            yanchor="middle",
            x=row["child_mort"],
            y=row["gdpp"],
            sizex=np.sqrt(row["child_mort"] / df_map["child_mort"].max()) * maxi * 0.15 + maxi * 0.03,
            sizey=np.sqrt(row["child_mort"] / df_map["child_mort"].max()) * maxi * 0.15 + maxi * 0.03,
            sizing="contain",
            opacity=0.8,
            layer="above"
        )
    )
##########################
# Add locations bar chart
fig.add_trace(
    go.Bar(x=df_bar.country,
           y=df_bar.child_mort,
           marker=dict(color=df_bar.child_mort,
                       showscale=True,
                       colorscale=px.colors.diverging.RdBu[::-1],
                       cmax=df_map.child_mort.max(), cmin=df_map.child_mort.min(),
                       colorbar=dict(len=0.45, x=0.47, y=0.2),
                       colorbar_title="child_mort"),  # the bar colors encode child_mort, not gdpp
           showlegend=False,
           ),
    row=2, col=1
)
fig.add_trace(
go.Choropleth(
locations=df_map.iso_alpha,
z=df_map.k_means_pca,
marker_line_color='red',
colorbar_title = "Cluster",
colorscale=[[0, 'green'], [0.25, 'green'],
[0.25, 'red'], [0.5, 'red'],
[0.5, 'yellow'], [0.75, 'yellow'],
[0.75, 'rgb(117, 112, 179)'], [1, 'rgb(117, 112, 179)']],
marker_opacity=0.5,
marker_line_width=0,
hoverinfo='location+z',
zmin=df_map.k_means_pca.min(),
zmax=df_map.k_means_pca.max(),
colorbar=dict(len=0.45, y=0.2),
)
,
row=2, col=2
)
#############################################
# Update geo subplot properties
fig.update_geos(
visible=True,
fitbounds="locations",
projection_type="natural earth",
projection_scale=2,
landcolor="white",
oceancolor="LightBlue",
showocean=True,
lakecolor="white",
lataxis_showgrid=True,
lonaxis_showgrid=True,
)
# Rotate x-axis labels
fig.update_xaxes(tickangle=30, automargin='height')
# Set theme, margin, and annotation in layout
fig.update_layout(
width=1400, height=800,
margin=dict(r=5, t=25, b=20, l=60),
)
# Update xaxis properties
fig.update_xaxes(title_text="child mort", row=1, col=1)
fig.update_xaxes(title_text="Paises", row=2, col=1)
# Update yaxis properties
fig.update_yaxes(title_text="gdpp", row=1, col=1)
fig.update_yaxes(title_text="gdpp", row=2, col=1)
fig.show()