import matplotlib
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.datasets import make_blobs
from sklearn.neighbors import KNeighborsClassifier
from ipywidgets import interactive
from collections import defaultdict
import folium
import re
# Colour palette used to paint clusters: 20 distinct hex colours repeated ten
# times (200 entries) so that higher cluster labels still map to a colour.
cols = 10 * [
    '#e6194b', '#3cb44b', '#ffe119', '#4363d8', '#f58231',
    '#911eb4', '#46f0f0', '#f032e6', '#bcf60c', '#fabebe',
    '#008080', '#e6beff', '#9a6324', '#fffac8', '#800000',
    '#aaffc3', '#808000', '#ffd8b1', '#000075', '#808080',
]
# Load the coordinates dataset and discard the 'Unnamed: 0' column
# (presumably a leftover index column from an earlier CSV export - TODO confirm).
df = pd.read_csv('/content/coordenadas_sii_dummy_variables2.csv').drop(
    columns=['Unnamed: 0']
)
df.head()
# NOTE(review): the string lands in DataFrame.info's first parameter
# (`verbose`), so it does not select a column - confirm what was intended.
df.info('rut_completo')
# Base map centred on the mean coordinate of the dataset, with one small
# circle marker per row.
m = folium.Map(
    location=[df.lat.mean(), df.long.mean()],
    zoom_start=13,
    tiles='OpenStreet Map',
)
for _, row in df.iterrows():
    folium.CircleMarker(
        location=[row.lat, row.long],
        radius=0.3,
        # Popup text: rut_completo with every character other than ASCII
        # letters and spaces stripped out.
        popup=re.sub(r'[^a-zA-Z ]+', '', row.rut_completo),
        color='#1787FE',
        fill=True,
        # folium's keyword is `fill_color`; the earlier `fill_colour`
        # spelling is not an option folium recognises.
        fill_color='#1787FE',
    ).add_to(m)
Revisión de los puntos geoespaciales en el mapa
# Show the map of raw points (a bare expression is rendered as output when it
# is the last statement of a notebook cell).
m
# Feature matrix for clustering: one row per record, columns ordered
# (long, lat), stored as 64-bit floats.
X = np.array(df.loc[:, ['long', 'lat']], dtype=np.float64)
X
from sklearn.cluster import KMeans

# Elbow method: fit K-Means once per candidate k and record the inertia
# (within-cluster sum of squared distances) of each fit.
# random_state is fixed so the curve is reproducible between runs.
k_values = range(1, 10)
loss = [
    KMeans(n_clusters=n_clusters, init="k-means++", random_state=40).fit(X).inertia_
    for n_clusters in k_values
]

# Plot inertia against k; the same k_values drive both the fits and the
# x-axis so the two cannot drift apart.
sns.set_style("darkgrid")
plt.figure(figsize=(5, 5))
plt.plot(k_values, loss)
plt.title("Método del Codo")
plt.xlabel("Numero de Clusters")
plt.ylabel("Perdida")
plt.show()
# Fit K-Means with 9 clusters and evaluate the silhouette score of the
# resulting labelling. random_state is fixed so the score does not change
# from one run to the next.
kmeans = KMeans(n_clusters=9, init="k-means++", random_state=40)
predicted_clusters = kmeans.fit_predict(X)
silhouette_score(X, predicted_clusters)
# K-Means with k clusters and a fixed seed; the predicted label of every row
# is stored in a column whose name embeds k.
k = 9
model = KMeans(n_clusters=k, random_state=40)
model.fit(X)
class_predictions = model.predict(X)
df[f'CLUSTER_kmeans{k}'] = class_predictions
df.head(2)
def create_map(df, cluster_column):
    """Build a folium map with one circle marker per row, coloured by cluster.

    Args:
        df: DataFrame with 'lat' and 'long' columns plus the column named by
            ``cluster_column``.
        cluster_column: Name of the column holding integer cluster labels.
            The label -1 is drawn in black; every other label takes its
            colour from the module-level ``cols`` palette.

    Returns:
        The populated ``folium.Map``, centred on the mean latitude/longitude.
    """
    m = folium.Map(location=[df.lat.mean(), df.long.mean()], zoom_start=12, tiles='OpenStreet Map')
    palette_size = len(cols)
    # Iterate the three columns directly: row-wise iteration can upcast
    # integer labels to float, which cannot be used as a list index.
    for lat, lon, label in zip(df['lat'], df['long'], df[cluster_column]):
        label = int(label)
        if label == -1:
            cluster_colour = '#000000'
        else:
            # Wrap around the palette so a label beyond its length reuses a
            # colour instead of raising IndexError.
            cluster_colour = cols[label % palette_size]
        folium.CircleMarker(
            location=[lat, lon],
            radius=0.5,
            popup=str(label),
            color=cluster_colour,
            fill=True,
            fill_color=cluster_colour,
        ).add_to(m)
    return m
# Render the K-Means clusters. The column name is built from k, the same way
# the column was named when the labels were stored, so the two stay in sync
# if k changes.
m = create_map(df, f'CLUSTER_kmeans{k}')
print(f'K={k}')
print(f'Silhouette Score: {silhouette_score(X, class_predictions)}')
# m.save('kmeans_9_02.html')  # Saves the map as HTML so it can be opened outside the notebook
Visualización del mapa con los clusters de K-Means
# Show the K-Means cluster map (a bare expression is rendered as output when
# it is the last statement of a notebook cell).
m
# Search k = 2..9 for the K-Means fit with the highest silhouette score.
# The first k reaching the maximum wins because only a strictly greater
# score replaces the current best.
best_silhouette, best_k = -1, 0
for k in tqdm(range(2, 10)):
    model = KMeans(n_clusters=k, random_state=1)
    model.fit(X)
    class_predictions = model.predict(X)
    curr_silhouette = silhouette_score(X, class_predictions)
    if not curr_silhouette > best_silhouette:
        continue
    best_silhouette, best_k = curr_silhouette, k
print(f'K={best_k}')
print(f'Silhouette Score: {best_silhouette}')
# Toy example of the outlier relabelling: every -1 at position i becomes
# -(i + 2), giving each outlier its own unique negative label; all other
# labels are kept unchanged.
dummy = np.array([-1, -1, -1, 2, 3, 4, 5, -1])
new = np.where(dummy == -1, -(np.arange(dummy.size) + 2), dummy)
# Density-based clustering on the (long, lat) matrix; the label of every row
# is stored in the 'CLUSTERS_DBSCAN' column.
model = DBSCAN(eps=0.01, min_samples=2).fit(X)
class_predictions = model.labels_
df['CLUSTERS_DBSCAN'] = class_predictions

# DBSCAN labels noise points -1. That label is not a cluster, so it is
# excluded from the cluster count and reported separately as outliers.
is_outlier = class_predictions == -1
n_clusters = len(np.unique(class_predictions[~is_outlier]))
print(f'Number of clusters found: {n_clusters}')
print(f'Number of outliers found: {int(is_outlier.sum())}')
#print(f'Silhouette ignoring outliers: {silhouette_score(X[class_predictions!=-1], class_predictions[class_predictions!=-1])}')

# Relabel every outlier at position i as -(i + 2) so each one becomes its own
# singleton cluster; non-outlier labels are kept unchanged.
no_outliers = np.where(
    is_outlier,
    -(np.arange(class_predictions.size) + 2),
    class_predictions,
)
#print(f'Silhouette outliers as singletons: {silhouette_score(X, no_outliers)}')
df.head()
Visualización del mapa con los clusters de DBSCAN
# Render the DBSCAN clusters on a map and export the labelled dataset.
m = create_map(df, cluster_column='CLUSTERS_DBSCAN')
m
df.head()
# NOTE(review): to_csv writes the index by default, which produces an unnamed
# leading column when the file is read back - confirm whether index=False is
# wanted before changing it, since consumers of this file are not visible here.
df.to_csv(path_or_buf='Coordenadas_modelo_kmeans_dbscan.csv')