import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the country-level data set and run quick sanity checks. The three
# bare expressions only render output in an interactive/notebook session.
df_country = pd.read_csv('/work/Country-data.csv')
df_country.describe()                    # summary statistics
df_country.isna().sum()                  # missing values per column
df_country.loc[df_country.duplicated()]  # fully duplicated rows, if any
# Draw one box plot per non-object (numeric) column to eyeball outliers.
int_cols = df_country.select_dtypes(exclude='object').columns
for col in int_cols:
    # A fresh figure per column keeps the plots from stacking on one axes.
    plt.figure(figsize=(5, 5))
    sns.boxplot(data=df_country, y=col)
    plt.show()
# Correlation heat map of the numeric indicators.
# Only non-object columns are correlated: DataFrame.corr() raises on text
# columns in pandas >= 2.0 (older releases silently dropped them), so the
# selection is made explicit to behave the same on every pandas version.
fig = plt.figure(figsize=(10, 5))
numeric_corr = df_country.select_dtypes(exclude='object').corr()
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.show()
from sklearn.preprocessing import StandardScaler
# Standardise every feature (all columns except the 'country' identifier)
# and wrap the result back into a DataFrame with the original column names.
features = df_country.drop('country', axis=1)
scaaler = StandardScaler()
df_country_scaled = pd.DataFrame(
    scaaler.fit_transform(features),
    columns=features.columns,
)
df_country_scaled
from sklearn.decomposition import PCA
# Fit a PCA with all components on the standardised data and project the
# data onto the principal axes.
pca = PCA().fit(df_country_scaled)
pca_data_scaled = pca.transform(df_country_scaled)
pca_data_scaled

# Share of variance explained by each component, then the running total
# in percent, plotted to help choose how many components to keep.
var = pca.explained_variance_ratio_
print(var)
cum_var = (var.round(4) * 100).cumsum()

plt.figure(figsize=(7, 7))
plt.plot(cum_var, 'r-x')
plt.show()
# Keep only the first four principal components (presumably chosen from the
# cumulative-variance plot -- confirm). Slicing the leading columns replaces
# the previous hard-coded drop of columns 4..8, which only worked when the
# PCA output had exactly nine components.
pca_data_standard = pd.DataFrame(pca_data_scaled[:, :4])
pca_data_standard
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram, linkage
# Elbow / silhouette scan for K-Means on the PCA-reduced data.
# For each candidate k the inertia and the silhouette score are recorded.
Sum_of_squared_distances = []
silhouette_scores = []
K = range(2, 15)
for k in K:
    # Fixed seed and explicit n_init make the curves reproducible and
    # independent of the scikit-learn release (the n_init default changed).
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    y = km.fit_predict(pca_data_standard)
    Sum_of_squared_distances.append(km.inertia_)
    silhouette_scores.append(silhouette_score(pca_data_standard, y))

# Inertia versus k (elbow curve).
fig = plt.figure(figsize=(7, 7))
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('K')
plt.ylabel('Inertia')
plt.show()

# Silhouette score versus k.
fig = plt.figure(figsize=(7, 7))
plt.plot(K, silhouette_scores, 'rx-')
plt.xlabel('K')
plt.ylabel('silhouette score')
plt.show()
# Final K-Means model on the PCA-reduced data with 4 clusters.
# The seed is fixed so the cluster ids stored in 'k_means_pca' stay the same
# between runs; later cells filter on a specific id.
km = KMeans(n_clusters=4, n_init=10, random_state=42)
y = km.fit_predict(pca_data_standard)
print(silhouette_score(pca_data_standard, y))
df_country['k_means_pca'] = y
# Elbow / silhouette scan for K-Means on the full standardised feature set
# (no PCA). For each candidate k the inertia and silhouette are recorded.
Sum_of_squared_distances = []
silhouette_scores = []
K = range(2, 15)
for k in K:
    # Fixed seed and explicit n_init make the curves reproducible and
    # independent of the scikit-learn release (the n_init default changed).
    km = KMeans(n_clusters=k, n_init=10, random_state=42)
    y = km.fit_predict(df_country_scaled)
    Sum_of_squared_distances.append(km.inertia_)
    silhouette_scores.append(silhouette_score(df_country_scaled, y))

# Inertia versus k (elbow curve).
fig = plt.figure(figsize=(7, 7))
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('K')
plt.ylabel('Inertia')
plt.show()

# Silhouette score versus k.
fig = plt.figure(figsize=(7, 7))
plt.plot(K, silhouette_scores, 'rx-')
plt.xlabel('K')
plt.ylabel('silhouette score')
plt.show()
# Final K-Means model on the standardised features with 5 clusters.
# The seed is fixed so the cluster ids stored in 'k_means' are reproducible.
km = KMeans(n_clusters=5, n_init=10, random_state=42)
y = km.fit_predict(df_country_scaled)
print(silhouette_score(df_country_scaled, y))
df_country['k_means'] = y
# Ward-linkage dendrogram of the PCA-reduced data, used to judge how many
# clusters to request from the agglomerative model.
ward_links_pca = linkage(pca_data_standard, method='ward')

fig = plt.figure(figsize=(6, 6))
dendrogram_plot = dendrogram(ward_links_pca)
plt.title('Dendrograma country')
plt.xlabel('Clusters')
plt.ylabel('Euclidean distance')
plt.show()
# Agglomerative (Ward) clustering on the PCA-reduced data with 2 clusters.
# The `affinity` keyword was removed in scikit-learn 1.4; Euclidean distance
# is the default (and the only option Ward accepts), so it is simply omitted.
hc = AgglomerativeClustering(n_clusters=2, linkage='ward')
y_hc = hc.fit_predict(pca_data_standard)
print(silhouette_score(pca_data_standard, y_hc))
df_country['hc_pca'] = y_hc
# Ward-linkage dendrogram of the full standardised feature set.
ward_links_scaled = linkage(df_country_scaled, method='ward')

fig = plt.figure(figsize=(6, 6))
dendrogram_plot = dendrogram(ward_links_scaled)
plt.title('Dendrograma country')
plt.xlabel('Clusters')
plt.ylabel('Euclidean distance')
plt.show()
# Agglomerative (Ward) clustering on the standardised features, 2 clusters.
# The `affinity` keyword was removed in scikit-learn 1.4; Euclidean distance
# is the default (and the only option Ward accepts), so it is simply omitted.
hc = AgglomerativeClustering(n_clusters=2, linkage='ward')
y_hc = hc.fit_predict(df_country_scaled)
print(silhouette_score(df_country_scaled, y_hc))
df_country['hc'] = y_hc
from sklearn.neighbors import NearestNeighbors
# k-distance plot on the PCA-reduced data, used to pick a DBSCAN eps range.
neighbors = NearestNeighbors(n_neighbors=2)
neighbors_fit = neighbors.fit(pca_data_standard)
distances, indices = neighbors_fit.kneighbors(pca_data_standard)

# Sort each column independently and keep column 1, i.e. the distance to
# the second-closest fitted point (presumably the nearest *other* point,
# since each sample is queried against the data it was fitted on).
distances = np.sort(distances, axis=0)[:, 1]

fig = plt.figure(figsize=(7, 7))
plt.plot(distances)
from itertools import product

# Grid search over DBSCAN hyper-parameters on the PCA-reduced data.
eps_values = np.arange(0.5, 2.0, 0.10)
min_samples = np.arange(3, 12)
dbscan_paramns = list(product(eps_values, min_samples))

no_of_clusters = []
sil_score = []
for eps, min_pts in dbscan_paramns:
    y_dbscan = DBSCAN(eps=eps, min_samples=min_pts).fit_predict(pca_data_standard)
    try:
        sil_score.append(silhouette_score(pca_data_standard, y_dbscan))
    except ValueError:
        # silhouette_score rejects labelings with fewer than two distinct
        # labels; score those parameter pairs as 0 instead of aborting.
        sil_score.append(0)
    # NOTE: this counts distinct labels, so the DBSCAN noise label (-1) is
    # included in the count whenever noise points are present.
    no_of_clusters.append(len(np.unique(y_dbscan)))

# One row per (eps, min_samples) pair with its score and label count.
df_param_tunning = pd.DataFrame.from_records(dbscan_paramns, columns=['Eps', 'Min_samples'])
df_param_tunning['sil_score'] = sil_score
df_param_tunning['n_clusters'] = no_of_clusters

# Reshape to Min_samples x Eps grids for the heat maps.
pivot_1 = pd.pivot_table(df_param_tunning, values='sil_score', columns='Eps', index='Min_samples')
pivot_2 = pd.pivot_table(df_param_tunning, values='n_clusters', columns='Eps', index='Min_samples')

# Silhouette score per parameter pair.
fig, ax = plt.subplots(figsize=(15, 4))
sns.heatmap(pivot_1, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()

# Number of distinct labels per parameter pair.
fig, ax = plt.subplots(figsize=(15, 4))
sns.heatmap(pivot_2, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()
# Final DBSCAN model on the PCA-reduced data (eps=1.2, min_samples=3);
# the resulting labels are stored in the 'dbscan_pca' column.
dbscan_train = DBSCAN(eps=1.2, min_samples=3).fit(pca_data_standard)
y_dbscan = dbscan_train.labels_
dbscan_pca_silhouette = silhouette_score(pca_data_standard, y_dbscan)
print(dbscan_pca_silhouette)
df_country['dbscan_pca'] = y_dbscan
# k-distance plot on the full standardised features, used to pick a DBSCAN
# eps range.
neighbors = NearestNeighbors(n_neighbors=2)
neighbors_fit = neighbors.fit(df_country_scaled)
distances, indices = neighbors_fit.kneighbors(df_country_scaled)

# Sort each column independently and keep column 1, i.e. the distance to
# the second-closest fitted point (presumably the nearest *other* point,
# since each sample is queried against the data it was fitted on).
distances = np.sort(distances, axis=0)[:, 1]

fig = plt.figure(figsize=(7, 7))
plt.plot(distances)
from itertools import product

# Grid search over DBSCAN hyper-parameters on the standardised features.
eps_values = np.arange(1, 3.0, 0.10)
min_samples = np.arange(3, 12)
dbscan_paramns = list(product(eps_values, min_samples))

no_of_clusters = []
sil_score = []
for eps, min_pts in dbscan_paramns:
    y_dbscan = DBSCAN(eps=eps, min_samples=min_pts).fit_predict(df_country_scaled)
    try:
        sil_score.append(silhouette_score(df_country_scaled, y_dbscan))
    except ValueError:
        # silhouette_score rejects labelings with fewer than two distinct
        # labels; score those parameter pairs as 0 instead of aborting.
        sil_score.append(0)
    # NOTE: this counts distinct labels, so the DBSCAN noise label (-1) is
    # included in the count whenever noise points are present.
    no_of_clusters.append(len(np.unique(y_dbscan)))

# One row per (eps, min_samples) pair with its score and label count.
df_param_tunning = pd.DataFrame.from_records(dbscan_paramns, columns=['Eps', 'Min_samples'])
df_param_tunning['sil_score'] = sil_score
df_param_tunning['n_clusters'] = no_of_clusters

# Reshape to Min_samples x Eps grids for the heat maps.
pivot_1 = pd.pivot_table(df_param_tunning, values='sil_score', columns='Eps', index='Min_samples')
pivot_2 = pd.pivot_table(df_param_tunning, values='n_clusters', columns='Eps', index='Min_samples')

# Silhouette score per parameter pair.
fig, ax = plt.subplots(figsize=(15, 4))
sns.heatmap(pivot_1, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()

# Number of distinct labels per parameter pair.
fig, ax = plt.subplots(figsize=(15, 4))
sns.heatmap(pivot_2, annot=True, annot_kws={'size':10}, cmap='coolwarm', ax=ax)
plt.show()
# Final DBSCAN model on the standardised features (eps=1.7, min_samples=3);
# the resulting labels are stored in the 'dbscan_cluster' column.
dbscan_train = DBSCAN(eps=1.7, min_samples=3).fit(df_country_scaled)
y_dbscan = dbscan_train.labels_
dbscan_silhouette = silhouette_score(df_country_scaled, y_dbscan)
print(dbscan_silhouette)
df_country['dbscan_cluster'] = y_dbscan
# Inspect the frame with all cluster-label columns attached (the bare
# expressions only render output in an interactive/notebook session).
df_country
df_country.columns

# The nine indicators plus the K-Means-on-PCA labels used for colouring.
pairplot_cols = [
    'child_mort', 'exports', 'health', 'imports', 'income',
    'inflation', 'life_expec', 'total_fer', 'gdpp', 'k_means_pca',
]
df_country[pairplot_cols]

# Pairwise scatter plots of every indicator, coloured by cluster id.
sns.pairplot(data=df_country[pairplot_cols], hue='k_means_pca', palette='coolwarm')
df_country
# Child mortality against GDP per capita, coloured by the K-Means-on-PCA
# cluster id.
fig = plt.figure(figsize=(7, 7))
sns.scatterplot(
    data=df_country,
    x='child_mort',
    y='gdpp',
    hue='k_means_pca',
    palette='coolwarm',
)
plt.show()

# Rows assigned to cluster id 3.
in_cluster_3 = df_country['k_means_pca'] == 3
df_country[in_cluster_3]