from sklearn.datasets import make_blobs
import pandas as pd
import matplotlib.pyplot as plt
x,y = make_blobs(n_samples=100, centers=4, n_features=2, cluster_std=[1,1.5,2,2], random_state=7)
# n_samples: total number of samples (points) to generate.
# n_features: number of features (dimensions) of each sample.
# centers: number of clusters to generate, or an array with the cluster centroid coordinates.
# cluster_std: standard deviation of each cluster; the larger the value, the more spread out the points within each cluster.
# random_state: seed for the random data generation.
(x,y)
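# As noted above, `centers` can also be an explicit array of centroid
# coordinates. A minimal sketch with made-up (hypothetical) centroids:
x_demo, y_demo = make_blobs(
    n_samples=90,
    centers=[[0, 0], [5, 5], [0, 5]],  # explicit centroids instead of a count
    cluster_std=1.0,
    random_state=7,
)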
df_blobs = pd.DataFrame({
    "x1": x[:, 0],
    "x2": x[:, 1],
    'y': y
})
df_blobs
def plot_2d_cluster(x, y, ax):
    # Plot each cluster using its label as the marker symbol
    y_uniques = pd.Series(y).unique()
    for label in y_uniques:
        x[y == label].plot(
            title=f'{len(y_uniques)} Clusters',
            kind='scatter',
            x='x1',
            y='x2',
            marker=f'${label}$',
            s=50,
            ax=ax
        )
fig,ax = plt.subplots(1,1, figsize=(15,10))
x,y = df_blobs[['x1','x2']], df_blobs['y']
plot_2d_cluster(x,y,ax)
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=4, random_state=7 )
y_pred = kmeans.fit_predict(x)
fig,ax = plt.subplots(1,2, figsize=(20,8))
plot_2d_cluster(x, y, ax[0])
plot_2d_cluster(x, y_pred, ax[1])
ax[0].set_title(f'Actual {ax[0].get_title()}')
ax[1].set_title(f'KMeans {ax[1].get_title()}')
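# K-Means assigns arbitrary cluster IDs, so the predicted labels need not match
# the generating labels numerically even when the grouping is the same. A quick,
# permutation-invariant agreement check (sketch using y and y_pred from above):
from sklearn.metrics import adjusted_rand_score
print('Adjusted Rand index:', adjusted_rand_score(y, y_pred))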
!pip install yellowbrick==1.5
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans, kmeans_plusplus
from sklearn.metrics import silhouette_score
from yellowbrick.cluster.silhouette import silhouette_visualizer
sns.set_style(style='whitegrid')
# Generate a synthetic dataset with the make_blobs function
# n_samples: total number of data points to generate
# cluster_std: standard deviation of the generated clusters
# centers: number of clusters (groups) to generate
# n_features: number of features (dimensions) of the data points
# return_centers: whether to return the centers of the generated clusters
x,y,c = make_blobs(n_samples=500, cluster_std=0.8, centers=4, n_features=2, return_centers=True, random_state=42)
x
y
c
# Build a DataFrame with pandas
df_blobs = pd.DataFrame(
    {
        'x1': x[:, 0],  # 'x1' comes from the first column of the array x
        'x2': x[:, 1],  # 'x2' comes from the second column of the array x
        'y': y          # 'y' comes from the array y
    }
)
df_blobs
# Build a DataFrame with the true cluster centers
df_centers = pd.DataFrame(
    {
        'x1': c[:, 0],  # 'x1' comes from the first column of the array c
        'x2': c[:, 1]   # 'x2' comes from the second column of the array c
    }
)
df_centers
# Scatter plot of the data in df_blobs
sns.scatterplot(data=df_blobs, x='x1', y='x2')
# Scatter plot of the centers in df_centers
sns.scatterplot(data=df_centers, x='x1', y='x2', marker='*', s=250, color='red')
plt.show()
# Create a K-means instance with 4 clusters
kmeans = KMeans(n_clusters=4)
# Fit and get the cluster label for each data point in x
df_cluster = kmeans.fit_predict(x)
# Add a 'cluster' column to df_blobs with the cluster labels
df_blobs['cluster'] = df_cluster
# Get the cluster centers found by K-means
k_means_centers = kmeans.cluster_centers_
# Build a DataFrame df_kmeans_center with the K-means centers
df_kmeans_center = pd.DataFrame(
    {
        'x1': k_means_centers[:, 0],  # first column of k_means_centers
        'x2': k_means_centers[:, 1]   # second column of k_means_centers
    }
)
plt.figure(figsize=(12,8))
# Scatter plot of the data in df_blobs
sns.scatterplot(data=df_blobs, x='x1', y='x2', hue='cluster', palette='pastel')
# Scatter plot of the true centers in df_centers
sns.scatterplot(data=df_centers, x='x1', y='x2', marker='*', s=250, color='red')
# Scatter plot of the K-means centers in df_kmeans_center
sns.scatterplot(data=df_kmeans_center, x='x1', y='x2', marker='*', s=250, color='purple')
plt.title(f'silhouette score: {silhouette_score(x,df_blobs["cluster"]).round(5)}')
plt.show()
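# The K-Means centers come out in arbitrary order; to compare them against the
# true generating centers c, match each estimated center to its nearest true
# center. A sketch using the Hungarian assignment from scipy:
from scipy.optimize import linear_sum_assignment
from scipy.spatial.distance import cdist
cost = cdist(k_means_centers, c)                # pairwise Euclidean distances
row_ind, col_ind = linear_sum_assignment(cost)  # optimal one-to-one matching
print('Mean center error:', cost[row_ind, col_ind].mean())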
def viz_cluster(k, ax=None):
    # Create a K-means instance with k clusters
    kmeans = KMeans(n_clusters=k)
    # Fit and get the cluster label for each data point in x
    df_cluster = kmeans.fit_predict(x)
    # Add a 'cluster' column to df_blobs with the cluster labels
    df_blobs['cluster'] = df_cluster
    # Get the cluster centers found by K-means
    k_means_centers = kmeans.cluster_centers_
    # Build a DataFrame df_kmeans_center with the K-means centers
    df_kmeans_center = pd.DataFrame(
        {
            'x1': k_means_centers[:, 0],  # first column of k_means_centers
            'x2': k_means_centers[:, 1]   # second column of k_means_centers
        }
    )
    # Only create a new figure when no axes object was passed in
    if ax is None:
        plt.figure(figsize=(12, 8))
    # Scatter plot of the data in df_blobs
    sns.scatterplot(data=df_blobs, x='x1', y='x2', hue='cluster', palette='pastel', ax=ax)
    # Scatter plot of the true centers in df_centers
    sns.scatterplot(data=df_centers, x='x1', y='x2', marker='*', s=250, color='red', ax=ax)
    # Scatter plot of the K-means centers in df_kmeans_center
    sns.scatterplot(data=df_kmeans_center, x='x1', y='x2', marker='*', s=250, color='purple', ax=ax)
    if ax is not None:
        ax.set_title(f'silhouette score: {silhouette_score(x, df_blobs["cluster"]).round(5)}')
    else:
        plt.title(f'silhouette score: {silhouette_score(x, df_blobs["cluster"]).round(5)}')
fig, axs = plt.subplots(2, 2, figsize=(12, 12));
for i in range(2):
for j in range(2):
ax = axs[i, j]
viz_cluster(i*2 + j + 3,ax);
sum_of_squared_distances = []
K = range(2, 15)
for k in K:
    km = KMeans(n_clusters=k, random_state=42)
    km = km.fit(x)
    sum_of_squared_distances.append(km.inertia_)
plt.figure(figsize=(8,8))
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Within-cluster sum of squares (inertia)')
plt.title('Elbow for KMeans clustering')
plt.show()
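# The elbow can also be located programmatically: on a decreasing convex curve
# the sharpest bend has the largest second difference. A rough heuristic sketch
# (reusing K and sum_of_squared_distances from above):
second_diff = np.diff(sum_of_squared_distances, n=2)
elbow_k = K[int(np.argmax(second_diff)) + 1]  # +1 offsets the double differencing
print('Heuristic elbow at k =', elbow_k)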
silhouetteScore = []
K = range(2, 15)
for k in K:
    km = KMeans(n_clusters=k, random_state=42)
    km = km.fit(x)
    y = km.predict(x)
    silhouetteScore.append(silhouette_score(x, y))
plt.figure(figsize=(8,8))
plt.plot(K, silhouetteScore, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette score')
plt.show()
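# Pick the k with the highest average silhouette score:
best_k = K[int(np.argmax(silhouetteScore))]
print('Best k by silhouette:', best_k)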
# Visualize the clusters with the optimal value of K
viz_cluster(4)
km = KMeans(n_clusters=4)
km.fit(x)
plt.figure(figsize=(15,8))
silhouette_visualizer(km,x,colors='yellowbrick');
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
# Generate a synthetic dataset with the make_blobs function
x,y,c = make_blobs(n_samples=500, cluster_std=0.8, centers=4, n_features=2, return_centers=True, random_state=40)
df_blobs = pd.DataFrame({
    "x1": x[:, 0],
    "x2": x[:, 1],
    'y': y
})
df_centers = pd.DataFrame(
    {
        'x1': c[:, 0],
        'x2': c[:, 1]
    }
)
fig = plt.figure(figsize=(8,8))
sns.scatterplot(data=df_blobs, x='x1', y='x2')
plt.show()
sns.set_style(style='white')
fig = plt.figure(figsize=(12,8))
dendrogram_plot = dendrogram(linkage(x, method='ward'))
plt.title('Dendrogram using Ward linkage')
plt.xlabel('Cluster', fontsize=12)
plt.ylabel('Euclidean distance', fontsize=12)
plt.show();
sns.set_style(style='white')
fig = plt.figure(figsize=(12,8))
dendrogram_plot = dendrogram(linkage(x, method='ward'))
plt.title('Dendrogram using Ward linkage')
plt.hlines(y=40, xmin=0, xmax=5000, linestyles='dashed', colors='r')
plt.xlabel('Cluster', fontsize=12)
plt.ylabel('Euclidean distance', fontsize=12)
plt.show();
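# scipy can extract the flat clusters implied by a horizontal cut of the
# dendrogram directly from the linkage matrix. A sketch for the dashed cut at
# distance 40 drawn above:
from scipy.cluster.hierarchy import fcluster
flat_labels = fcluster(linkage(x, method='ward'), t=40, criterion='distance')
print('Clusters at distance 40:', len(np.unique(flat_labels)))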
# hc = Hierarchical Clustering (Ward linkage uses Euclidean distance)
hc = AgglomerativeClustering(n_clusters=4, linkage='ward')
y_hc = hc.fit_predict(x)
estimator_hc = hc.fit(x)
df_blobs['cluster'] = y_hc
fig = plt.figure(figsize=(8,8))
sns.scatterplot(data=df_blobs, x='x1', y='x2', hue='cluster')
plt.show();
silhouette_score(x,y_hc)
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
range_n_clusters = [3,4,5,6]
def plot_hierarchical_cluster(x, range_n_clusters):
    for n_clusters in range_n_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)
        # The 1st subplot is the silhouette plot.
        # The silhouette coefficient can range from -1 to 1, but in this
        # example all values lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(x) + (n_clusters + 1) * 10])
        # Initialize the clusterer with the n_clusters value
        # (Ward linkage uses Euclidean distance)
        clusterer = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
        cluster_labels = clusterer.fit_predict(x)
        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the
        # formed clusters.
        silhouette_avg = silhouette_score(x, cluster_labels)
        print(
            "For n_clusters =",
            n_clusters,
            "The average silhouette_score is :",
            silhouette_avg,
        )
        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(x, cluster_labels)
        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()
            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i
            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(
                np.arange(y_lower, y_upper),
                0,
                ith_cluster_silhouette_values,
                facecolor=color,
                edgecolor=color,
                alpha=0.7,
            )
            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            # Compute the new y_lower for the next plot
            y_lower = y_upper + 10  # 10 for the 0 samples
        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")
        # The vertical line for the average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
        ax1.set_yticks([])  # Clear the y-axis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
        # 2nd plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(
            x[:, 0], x[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
        )
        plt.show()
range_n_clusters = [4]
plot_hierarchical_cluster(x, range_n_clusters)
range_n_clusters = [3,5,6]
plot_hierarchical_cluster(x, range_n_clusters)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_blobs, make_moons
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
X, y, c = make_blobs(n_samples=500, cluster_std=0.8, centers=4, n_features=2, return_centers=True, random_state=42)
df_blobs = pd.DataFrame(
    {
        'x1': X[:, 0],
        'x2': X[:, 1]
    }
)
df_centers = pd.DataFrame(
    {
        'x1': c[:, 0],
        'x2': c[:, 1]
    }
)
X_m, y_m = make_moons(n_samples=250, noise=0.05, random_state=42)
df_moons = pd.DataFrame(
    {
        'x1': X_m[:, 0],
        'x2': X_m[:, 1]
    }
)
sns.scatterplot(data=df_blobs, x='x1',y='x2')
plt.scatter(data=df_centers, x='x1',y='x2', marker='*', s=400, color='brown')
sns.scatterplot(data=df_moons, x='x1',y='x2')
dbscan_cluster = DBSCAN(eps=0.3, min_samples=3)
y_m_predict = dbscan_cluster.fit_predict(X_m)
df_moons['cluster'] = y_m_predict
df_moons
sns.scatterplot(data=df_moons, x='x1',y='x2',hue='cluster');
dbscan = DBSCAN(eps=1, min_samples=3)
y_predict = dbscan.fit_predict(X)
df_blobs['cluster'] = y_predict
sns.scatterplot(data=df_blobs, x='x1',y='x2',hue='cluster');
plt.title('''DBSCAN
ε: 1
minPts:3''');
dbscan = DBSCAN(eps=0.3, min_samples=3)
y_predict = dbscan.fit_predict(X)
df_blobs['cluster'] = y_predict
df_blobs
sns.scatterplot(data=df_blobs, x='x1',y='x2',hue='cluster');
plt.title('''DBSCAN
ε: 0.3
minPts:3''');
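# DBSCAN labels noise points as -1; with eps this small, many blob points fall
# outside any dense region. A quick sanity check on the labels above:
n_noise = (df_blobs['cluster'] == -1).sum()
n_clusters_found = df_blobs['cluster'].nunique() - (1 if n_noise > 0 else 0)
print(f'clusters: {n_clusters_found}, noise points: {n_noise}')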
from sklearn.neighbors import NearestNeighbors
neighbors = NearestNeighbors(n_neighbors=3)
neighbors_fit = neighbors.fit(X)
distances, indices = neighbors_fit.kneighbors(X)
distances
indices
distances = np.sort(distances, axis=0)
distances = distances[:,1]
fig = plt.figure(figsize=(10,10))
plt.plot(distances)
plt.xlabel('data')
plt.ylabel('Distance')
plt.hlines(y=0.4 ,xmin=0,xmax=500, linestyles="dashed", colors='r')
plt.hlines(y=0.2 ,xmin=0,xmax=500, linestyles="dashed", colors='r')
plt.show()
eps_values = np.arange(0.25, 0.80, 0.1)
min_samples = np.arange(2,10)
min_samples
from itertools import product
dbscan_params = list(product(eps_values, min_samples))
sil_scores = []
for p in dbscan_params:
    y_pred = DBSCAN(eps=p[0], min_samples=p[1]).fit_predict(X)
    sil_scores.append(silhouette_score(X, y_pred))
df_params_tunning = pd.DataFrame.from_records(dbscan_params, columns=['Eps','Min_Samples'])
df_params_tunning['sil_score'] = sil_scores
pivot_data = pd.pivot_table(df_params_tunning, values='sil_score', index='Min_Samples', columns='Eps')
pivot_data
fig, ax = plt.subplots(figsize=(18, 6))
sns.heatmap(pivot_data, annot=True, annot_kws={'size':10}, cmap='coolwarm')
dbscan_cluster = DBSCAN(eps=0.8, min_samples=4)
y_predict = dbscan_cluster.fit_predict(X)
df_blobs['cluster'] = y_predict
df_blobs
sns.scatterplot(df_blobs, x='x1', y='x2', hue='cluster');
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
def plot_dbscan_cluster(x, eps, min_samples):
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)
    # The 1st subplot is the silhouette plot.
    # The silhouette coefficient can range from -1 to 1, but in this example
    # all values lie within [-0.1, 1]
    ax1.set_xlim([-0.1, 1])
    # Fit DBSCAN; the number of groups is derived from the labels it returns
    # (the label -1 marks noise points)
    clusterer = DBSCAN(eps=eps, min_samples=min_samples)
    cluster_labels = clusterer.fit_predict(x)
    n_clusters = len(np.unique(cluster_labels))
    # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # plots of individual clusters, to demarcate them clearly.
    ax1.set_ylim([0, len(x) + (n_clusters + 1) * 10])
    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the
    # formed clusters.
    silhouette_avg = silhouette_score(x, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score is :",
        silhouette_avg,
    )
    # Compute the silhouette scores for each sample
    sample_silhouette_values = silhouette_samples(x, cluster_labels)
    y_lower = 10
    for i in range(n_clusters):
        # Aggregate the silhouette scores for samples with label i - 1
        # (shifted by one so the DBSCAN noise label -1 is included), and sort them
        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i - 1]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.nipy_spectral(float(i) / n_clusters)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )
        # Label the silhouette plots with their cluster labels at the middle
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i - 1))
        # Compute the new y_lower for the next plot
        y_lower = y_upper + 10  # 10 for the 0 samples
    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    # The vertical line for the average silhouette score of all the values
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])  # Clear the y-axis labels / ticks
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
    # 2nd plot showing the actual clusters formed
    colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
    ax2.scatter(
        x[:, 0], x[:, 1], marker=".", s=30, lw=0, alpha=0.7, c=colors, edgecolor="k"
    )
    plt.show()
plot_dbscan_cluster(X,eps=0.749,min_samples=5)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style(style='whitegrid')
df_country = pd.read_csv('./Country-data.csv')
df_country
df_country.describe().T
df_country.isnull().sum()
df_country[df_country.duplicated()]
int_cols = df_country.select_dtypes(exclude='object').columns
int_cols
for cols in int_cols:
    print(cols)
fig, axs = plt.subplots(3, 3, figsize=(18, 18))
for i, cols in enumerate(int_cols):
    fila = i // 3     # compute the corresponding row index
    columna = i % 3   # compute the corresponding column index
    sns.boxplot(data=df_country, y=cols, ax=axs[fila, columna])
    axs[fila, columna].set_title(f'{cols.upper()}')
plt.figure(figsize=(14,12))
sns.heatmap(
df_country.select_dtypes(exclude='object').corr(),
annot=True,
cmap='coolwarm');
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_country_scaled = scaler.fit_transform(df_country.drop('country', axis=1))
df_country_scaled = pd.DataFrame(
df_country_scaled,
columns=df_country.drop('country', axis=1).columns)
df_country_scaled
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(df_country_scaled)
pca_data_scaled = pca.transform(df_country_scaled)
pd.DataFrame(pca_data_scaled)
# Compute the variance explained by each principal component
var = pca.explained_variance_ratio_
var
# Compute the cumulative explained variance
cum_var = np.cumsum(np.round(var, decimals=4)) * 100
plt.figure(figsize=(10,10))
# Plot the cumulative variance, marking each component with a circle
plt.plot(cum_var, marker='o', markersize=10, color='b')
# Annotate each point with its cumulative variance value
for i, ix in enumerate(cum_var):
    plt.annotate(np.round(ix, decimals=1), xy=(i, ix - 2.5), ha='center')
pca_data_standar = pd.DataFrame(pca_data_scaled)
pca_data_standar.drop([4,5,6,7,8],axis=1,inplace=True)
pca_data_standar
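# Alternatively, PCA can choose the number of components from a target
# explained-variance ratio instead of dropping columns by hand. A sketch
# keeping roughly 90% of the variance (scikit-learn accepts a float in (0, 1)):
pca_90 = PCA(n_components=0.9)
pca_data_90 = pca_90.fit_transform(df_country_scaled)
print('Components kept:', pca_90.n_components_)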
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
def plot_elbow_silhouette(K, values, title, ax):
    """
    Plots an elbow curve or a silhouette-coefficient curve against the number of clusters.
    Parameters:
    - K: A list or range of values representing the number of clusters.
    - values: A list of values corresponding to the y-axis of the plot.
    - title: The title of the plot.
    - ax: The matplotlib axes object on which the plot will be drawn.
    """
    sns.lineplot(ax=ax, x=K, y=values, linestyle='dashed', marker='o', markersize=10, color='b')
    ax.set_xlabel('K')
    ax.set_ylabel('Inertia' if 'WCSS' in title else 'Silhouette Coefficient')
    ax.set_title(title, size=15)
def analyze_cluster(data):
    """
    Performs a cluster analysis using the K-Means algorithm.
    Parameters:
    - data: The data on which the clustering analysis will be performed.
    Returns:
    - sum_of_squared_distances: A list of the sum of squared distances for each number of clusters.
    - silhouette_scores: A list of the silhouette coefficients for each number of clusters.
    """
    sum_of_squared_distances = []
    silhouette_scores = []
    K = range(2, 15)
    for k in K:
        km = KMeans(n_clusters=k, random_state=42)
        y = km.fit_predict(data)
        sum_of_squared_distances.append(km.inertia_)
        silhouette_scores.append(silhouette_score(data, y))
    return sum_of_squared_distances, silhouette_scores
def plot_cluster_elbow_silhouette(data, title, axes, row, col):
    """
    Plots the elbow curves for WCSS and silhouette coefficients.
    Parameters:
    - data: The data on which the clustering analysis will be performed.
    - title: The title of the analysis. (PCA | No PCA)
    - axes: The matplotlib axes grid on which the plots will be drawn.
    - row: The row index of the current subplot.
    - col: The column index of the current subplot.
    """
    K = range(2, 15)  # same range used inside analyze_cluster
    elbow, silhouette = analyze_cluster(data)
    plot_elbow_silhouette(K, elbow, f'Elbow Method WCSS ({title})', axes[row, col])
    plot_elbow_silhouette(K, silhouette, f'Elbow Method Silhouette Coefficient ({title})', axes[row, col + 1])
def calculate_and_assign_clusters(data, title, df, k):
    """
    Calculates clusters using the K-Means algorithm and assigns the cluster labels to a DataFrame.
    Parameters:
    - data: The data on which the clustering analysis will be performed.
    - title: The title of the analysis. (PCA | No PCA)
    - df: The DataFrame to which the cluster labels will be assigned.
    - k: The number of clusters to be generated by the K-Means algorithm.
    """
    km = KMeans(n_clusters=k, random_state=42)
    y = km.fit_predict(data)
    print('silhouette_score', title, ':', silhouette_score(data, y))
    if title == 'No PCA':
        df['k_means'] = y
    else:
        df['k_means_pca'] = y
score = pd.DataFrame([])
def calculate_and_assign_clusters(data: pd.DataFrame = None, method: str = None, title: str = None, df: pd.DataFrame = None, k: int = None, eps: float = None, min_samples: int = None):
    """
    Calculates clusters using the specified algorithm and assigns the cluster labels to a DataFrame.
    Parameters:
    - data: The data on which the clustering analysis will be performed.
    - method: The clustering algorithm to be used ('kmeans', 'hierarchical', or 'dbscan').
    - title: The title of the analysis ('PCA' or 'No PCA').
    - df: The DataFrame to which the cluster labels will be assigned.
    - k: The number of clusters to be generated (applicable for 'kmeans' and 'hierarchical').
    - eps: The maximum distance between two samples for them to be considered in the same neighborhood (applicable for 'dbscan').
    - min_samples: The number of samples in a neighborhood for a point to be considered a core point (applicable for 'dbscan').
    """
    if method == 'kmeans':
        # Perform K-Means clustering
        km = KMeans(n_clusters=k, random_state=42)
        y = km.fit_predict(data)
        if title == 'No PCA':
            df['k_means'] = y
        else:
            df['k_means_pca'] = y
    elif method == 'hierarchical':
        # Perform hierarchical clustering (Ward linkage uses Euclidean distance)
        hc = AgglomerativeClustering(n_clusters=k, linkage='ward')
        y = hc.fit_predict(data)
        if title == 'No PCA':
            df['hc'] = y
        else:
            df['hc_pca'] = y
    elif method == 'dbscan':
        # Perform DBSCAN clustering
        dbscan_train = DBSCAN(eps=eps, min_samples=min_samples)
        y = dbscan_train.fit_predict(data)
        if title == 'No PCA':
            df['dbscan'] = y
        else:
            df['dbscan_pca'] = y
    # Calculate and print the silhouette score, and store it for later comparison
    print('silhouette_score', title, ':', silhouette_score(data, y))
    score[method + '_' + title] = [silhouette_score(data, y)]
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
# Elbow Method with PCA
plot_cluster_elbow_silhouette(pca_data_standar, 'PCA', axes, 0, 0)
# Elbow Method without PCA
plot_cluster_elbow_silhouette(df_country_scaled, 'No PCA', axes, 1, 0)
plt.suptitle('Elbow Method analysis: WCSS and Silhouette', size=25)
plt.tight_layout()
plt.show()
calculate_and_assign_clusters(pca_data_standar, 'kmeans', 'PCA', df_country, 4)
calculate_and_assign_clusters(df_country_scaled, 'kmeans', 'No PCA', df_country, 4)
df_country
fig = plt.figure(figsize=(12,10))
sns.set_style('white')
dendrogram_plot = dendrogram(linkage(pca_data_scaled, method='ward'))
plt.title('Dendrogram country')
plt.xlabel('Cluster')
plt.ylabel('Euclidean distance')
for i in [80, 45, 20]:
    plt.hlines(y=i, xmin=0, xmax=5000, linestyles='dashed', alpha=.5, colors='black')
plt.show()
sns.set_style('whitegrid')
for i in range(2, 7):
    hc = AgglomerativeClustering(n_clusters=i, linkage='ward')
    y_hc = hc.fit_predict(pca_data_scaled)
    print(f'{i}_cluster :', silhouette_score(pca_data_scaled, y_hc))
calculate_and_assign_clusters(pca_data_scaled,'hierarchical','PCA',df_country,2)
fig = plt.figure(figsize=(12,10))
sns.set_style('white')
dendrogram_plot = dendrogram(linkage(df_country_scaled, method='ward'))
plt.title('Dendrogram country')
plt.xlabel('Cluster')
plt.ylabel('Euclidean distance')
for i in [23, 18, 14]:
    plt.hlines(y=i, xmin=0, xmax=5000, linestyles='dashed', alpha=.5, colors='black')
plt.show()
sns.set_style('whitegrid')
for i in range(2, 7):
    hc = AgglomerativeClustering(n_clusters=i, linkage='ward')
    y_hc = hc.fit_predict(df_country_scaled)
    print(f'{i}_cluster :', silhouette_score(df_country_scaled, y_hc))
calculate_and_assign_clusters(df_country_scaled, 'hierarchical', 'No PCA', df_country, 2)
from sklearn.neighbors import NearestNeighbors
# Create a NearestNeighbors model with n_neighbors=2
neighbors = NearestNeighbors(n_neighbors=2)
# Fit the NearestNeighbors model on the PCA-transformed data
neighbors_fit = neighbors.fit(pca_data_standar)
# Compute the distances and indices of the nearest neighbors
distances, indices = neighbors_fit.kneighbors(pca_data_standar)
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.plot(distances)
plt.xlabel('data')
plt.ylabel('Distance')
plt.hlines(y=2 ,xmin=0,xmax=165, linestyles="dashed", colors='r')
plt.hlines(y=0.5 ,xmin=0,xmax=165, linestyles="dashed", colors='r')
# Create an array of epsilon values from 0.5 to 2 in steps of 0.1
eps_values = np.arange(0.5, 2, 0.1)
# Create an array of min_samples values from 3 to 11
min_samples = np.arange(3, 12)
from itertools import product
# Generate every combination of eps_values and min_samples
dbscan_params = list(product(eps_values, min_samples))
no_of_clusters = []
sil_scores = []
for p in dbscan_params:
    y_dbscan = DBSCAN(eps=p[0], min_samples=p[1]).fit_predict(pca_data_standar)
    try:
        sil_scores.append(silhouette_score(pca_data_standar, y_dbscan))
    except ValueError:
        # silhouette_score raises ValueError when DBSCAN finds fewer than 2 labels
        sil_scores.append(0)
    no_of_clusters.append(len(np.unique(y_dbscan)))
df_params_tunning = pd.DataFrame.from_records(dbscan_params, columns=['Eps','Min_samples'])
df_params_tunning['sil_score'] = sil_scores
df_params_tunning['n_clusters'] = no_of_clusters
df_params_tunning
pivot_1 = pd.pivot_table(df_params_tunning, values='sil_score', columns='Eps', index='Min_samples')
pivot_2 = pd.pivot_table(df_params_tunning, values='n_clusters', columns='Eps', index='Min_samples')
plt.figure(figsize=(18,8))
sns.heatmap(pivot_1,annot=True, annot_kws={'size':10}, cmap='coolwarm');
plt.title('sil_score')
plt.show()
plt.figure(figsize=(18,8))
sns.heatmap(pivot_2,annot=True, annot_kws={'size':10}, cmap='coolwarm');
plt.title('n_clusters')
plt.show()
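# Read off the best (eps, min_samples) pair from the grid search before fitting
# the final model (a convenience sketch using the results table above):
best_params = df_params_tunning.loc[df_params_tunning['sil_score'].idxmax()]
best_params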
calculate_and_assign_clusters(
    data=pca_data_standar,
    method='dbscan',
    title='PCA',
    df=df_country,
    eps=1.2,
    min_samples=3)
from sklearn.neighbors import NearestNeighbors
# Create a NearestNeighbors model with n_neighbors=2
neighbors = NearestNeighbors(n_neighbors=2)
# Fit the NearestNeighbors model on the scaled (non-PCA) data
neighbors_fit = neighbors.fit(df_country_scaled)
# Compute the distances and indices of the nearest neighbors
distances, indices = neighbors_fit.kneighbors(df_country_scaled)
distances = np.sort(distances, axis=0)
distances = distances[:,1]
plt.plot(distances)
plt.xlabel('data')
plt.ylabel('Distance')
plt.hlines(y=3 ,xmin=0,xmax=165, linestyles="dashed", colors='r')
plt.hlines(y=1 ,xmin=0,xmax=165, linestyles="dashed", colors='r')
# Create an array of epsilon values from 1 to 3 in steps of 0.1
eps_values = np.arange(1, 3, 0.1)
# Create an array of min_samples values from 3 to 11
min_samples = np.arange(3, 12)
from itertools import product
# Generate every combination of eps_values and min_samples
dbscan_params = list(product(eps_values, min_samples))
no_of_clusters = []
sil_scores = []
for p in dbscan_params:
    y_dbscan = DBSCAN(eps=p[0], min_samples=p[1]).fit_predict(df_country_scaled)
    try:
        sil_scores.append(silhouette_score(df_country_scaled, y_dbscan))
    except ValueError:
        # silhouette_score raises ValueError when DBSCAN finds fewer than 2 labels
        sil_scores.append(0)
    no_of_clusters.append(len(np.unique(y_dbscan)))
df_params_tunning = pd.DataFrame.from_records(dbscan_params, columns=['Eps','Min_samples'])
df_params_tunning['sil_score'] = sil_scores
df_params_tunning['n_clusters'] = no_of_clusters
df_params_tunning
pivot_1 = pd.pivot_table(df_params_tunning, values='sil_score', columns='Eps', index='Min_samples')
pivot_2 = pd.pivot_table(df_params_tunning, values='n_clusters', columns='Eps', index='Min_samples')
plt.figure(figsize=(18,8))
sns.heatmap(pivot_1,annot=True, annot_kws={'size':10}, cmap='coolwarm');
plt.title('sil_score')
plt.show()
plt.figure(figsize=(18,8))
sns.heatmap(pivot_2,annot=True, annot_kws={'size':10}, cmap='coolwarm');
plt.title('n_clusters')
plt.show()
calculate_and_assign_clusters(
    data=df_country_scaled,
    method='dbscan',
    title='No PCA',
    df=df_country,
    eps=1.4,
    min_samples=6)
df_country
score.values
plt.figure(figsize=(14,10))
sns.barplot(score)
for i, value in enumerate(score.values[0]):
    plt.annotate(f'{(value*100).round(3)}%', xy=(i, value), ha='center', va='bottom')
df_country.columns
df_country[['child_mort', 'exports', 'health', 'imports', 'income',
'inflation', 'life_expec', 'total_fer', 'gdpp']]
sns.pairplot(data=df_country[['child_mort', 'exports', 'health', 'imports', 'income',
'inflation', 'life_expec', 'total_fer', 'gdpp', 'k_means_pca',]], hue='k_means_pca', palette='coolwarm')
plt.figure(figsize=(12,8))
sns.scatterplot(data=df_country, x='child_mort', y='gdpp', hue='k_means_pca', palette='coolwarm')
plt.title('Relationship between Child Mortality and GDP per capita')
df_country.columns
!pip install pycountry
!pip install plotly
import pycountry
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
def get_alpha_3(location):
    # Look up the ISO 3166-1 alpha-3 code for a country name
    try:
        return pycountry.countries.get(name=location).alpha_3
    except AttributeError:
        # countries.get returns None when the name is not an exact match
        return None
df_map = df_country.copy()
df_map['iso_alpha'] = df_country['country'].apply(get_alpha_3)
iso3_to_iso2 = {c.alpha_3: c.alpha_2 for c in pycountry.countries}
df_map["iso_alpha2"] = df_map["iso_alpha"].map(iso3_to_iso2)
# Top 10 countries by child mortality, used for the bar chart below
df_bar = df_map[['child_mort', 'country', 'k_means_pca']].sort_values(by='child_mort', ascending=False).head(10)
# Initialize figure with subplots
fig = make_subplots(
rows=2, cols=2,
column_widths=[0.5, 0.5],
row_heights=[0.6, 0.4],
subplot_titles=('Child mortality vs GDP with size by Child mortality',
'10 Countries with the Highest Child Mortality',
'Infant Mortality & GDP: Country Analysis with K-means Clusters'),
specs=[[{"type": "scatter", "colspan": 2}, None],
[{"type": "bar"}, {"type": "choropleth"}]])
#######################
fig1 = px.scatter(
df_map,
x="child_mort",
y="gdpp",
hover_name="country",
hover_data=['gdpp', "k_means_pca", "child_mort"]
)
fig1.update_traces(marker_color="rgba(0,0,0,0)")
# Use the largest axis value among the plotted columns to scale the flag images
max_col = df_map[["child_mort", "gdpp"]].max().idxmax()
maxi = df_map[max_col].max()
fig1.update_layout(height=600, width=1000, plot_bgcolor="#dfdfdf", yaxis_range=[-5e3, 55e3])
trace1 = fig1.data[0]
fig.add_trace(trace1, row=1, col=1)
# Overlay each country's flag at its (child_mort, gdpp) position
for i, row in df_map.iterrows():
    country_iso = row["iso_alpha2"]
    fig.add_layout_image(
        dict(
            source=f"https://raw.githubusercontent.com/matahombres/CSS-Country-Flags-Rounded/master/flags/{country_iso}.png",
            xref="x",
            yref="y",
            xanchor="center",
            yanchor="middle",
            x=row["child_mort"],
            y=row["gdpp"],
            sizex=np.sqrt(row["child_mort"] / df_map["child_mort"].max()) * maxi * 0.15 + maxi * 0.03,
            sizey=np.sqrt(row["child_mort"] / df_map["child_mort"].max()) * maxi * 0.15 + maxi * 0.03,
            sizing="contain",
            opacity=0.8,
            layer="above"
        )
    )
##########################
# Add locations bar chart
fig.add_trace(
    go.Bar(x=df_bar.country,
           y=df_bar.child_mort,
           marker=dict(color=df_bar.child_mort,
                       showscale=True,
                       colorscale=px.colors.diverging.RdBu[::-1],
                       cmax=df_map.child_mort.max(), cmin=df_map.child_mort.min(),
                       colorbar=dict(len=0.45, x=0.47, y=0.2),
                       colorbar_title="child_mort"),  # the bar colors encode child_mort, not gdpp
           showlegend=False,
           ),
    row=2, col=1
)
fig.add_trace(
go.Choropleth(
locations=df_map.iso_alpha,
z=df_map.k_means_pca,
marker_line_color='red',
colorbar_title = "Cluster",
colorscale=[[0, 'green'], [0.25, 'green'],
[0.25, 'red'], [0.5, 'red'],
[0.5, 'yellow'], [0.75, 'yellow'],
[0.75, 'rgb(117, 112, 179)'], [1, 'rgb(117, 112, 179)']],
marker_opacity=0.5,
marker_line_width=0,
hoverinfo='location+z',
zmin=df_map.k_means_pca.min(),
zmax=df_map.k_means_pca.max(),
colorbar=dict(len=0.45, y=0.2),
)
,
row=2, col=2
)
#############################################
# Update geo subplot properties
fig.update_geos(
visible=True,
fitbounds="locations",
projection_type="natural earth",
projection_scale=2,
landcolor="white",
oceancolor="LightBlue",
showocean=True,
lakecolor="white",
lataxis_showgrid=True,
lonaxis_showgrid=True,
)
# Rotate x-axis labels
fig.update_xaxes(tickangle=30, automargin='height')
# Set theme, margin, and annotation in layout
fig.update_layout(
width=1400, height=800,
margin=dict(r=5, t=25, b=20, l=60),
)
# Update xaxis properties
fig.update_xaxes(title_text="child mort", row=1, col=1)
fig.update_xaxes(title_text="Paises", row=2, col=1)
# Update yaxis properties
fig.update_yaxes(title_text="gdpp", row=1, col=1)
fig.update_yaxes(title_text="gdpp", row=2, col=1)
fig.show()