# Clustering Countries (CIA Factbook)

# import useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the CIA Factbook country data; every later cell works on this frame.
df = pd.read_csv('CIA_Country_Facts.csv')

# features of countries
df.columns

# 227 countries with 20 attributes per country
df.shape

# 5 countries with the largest population
# China and India have much larger populations than any other country
df.sort_values('Population', ascending=False)[['Country', 'Population']].head(5)

# histogram of population of countries
# we set xlim so that we exclude China and India
plt.figure(figsize=(10, 8))
sns.histplot(data=df, x='Population', bins=100)
plt.xlim(0, 3e8);

# barplot of Region vs. GDP
plt.figure(figsize=(12, 8))
sns.barplot(data=df, x='Region', y='GDP ($ per capita)')
# region names are long, so rotate the tick labels to keep them readable
plt.xticks(rotation=90);

# 5 countries with highest GDP per Capita
df.sort_values('GDP ($ per capita)', ascending=False)[['Country', 'GDP ($ per capita)']].head(5)

# scatter plot of GDP per Capita vs. Phones, hue by Region
# it seems that as GDP per Capita increases, more of the population tends to have phones
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df, x='GDP ($ per capita)', y='Phones (per 1000)', hue='Region');

# barplot of Region vs. Phones
# again, Western Europe and NA have the highest numbers for Phones
plt.figure(figsize=(12, 8))
sns.barplot(data=df, x='Region', y='Phones (per 1000)')
plt.xticks(rotation=90);

# country where everyone has a phone
df.sort_values('Phones (per 1000)', ascending=False)[['Country', 'Phones (per 1000)', 'GDP ($ per capita)']].head(1)

# rich country, but not that many phones
df.sort_values('GDP ($ per capita)', ascending=False)[['Country', 'Phones (per 1000)', 'GDP ($ per capita)']].head(1)

# scatter plot of GDP per Capita vs. Literacy Rate, hue by Region
# the red horizontal line marks a 70% literacy rate as a visual reference
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df, x='GDP ($ per capita)', y='Literacy (%)', hue='Region')
plt.hlines(y=70, xmin=0, xmax=50000, colors='red');

# scatter plot of GDP per Capita vs. Infant Mortality rate, hue by Region
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df, x='GDP ($ per capita)', y='Infant mortality (per 1000 births)', hue='Region');

# scatter plot of GDP per Capita vs. Literacy Rate, hue by Region
# (same chart as the first one in this group, without the reference line)
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df, x='GDP ($ per capita)', y='Literacy (%)', hue='Region');

# scatter plot of GDP per Capita vs. Death Rate, hue by Region
plt.figure(figsize=(12, 8))
sns.scatterplot(data=df, x='GDP ($ per capita)', y='Deathrate', hue='Region');

# Hierarchically-clustered heatmap of the pairwise feature correlations.
# df still contains string columns (Country, Region); DataFrame.corr() raises
# on those in pandas >= 2.0, so restrict the frame to numeric columns first.
# select_dtypes is used instead of corr(numeric_only=True) so that this also
# works on pandas versions that predate the numeric_only argument.
numeric_df = df.select_dtypes(include='number')
sns.clustermap(numeric_df.corr())

# number of missing values in each column
df.isnull().sum()

# We start with the Climate column, as it is missing 22 values
df[df['Climate'].isnull()]

# since climate is closely related to a country's region,
# we use the region's mean climate to fill NA
# (transform('mean') returns a Series aligned with df, one value per row)
mean_climate = df.groupby('Region')['Climate'].transform('mean')
df['Climate'] = df['Climate'].fillna(mean_climate)

# although not strong, there seems to be a correlation between literacy rate
# and the region of a country, hence we do the same process as above
mean_literacy = df.groupby('Region')['Literacy (%)'].transform('mean')
df['Literacy (%)'] = df['Literacy (%)'].fillna(mean_literacy)

# countries with missing agriculture values are mostly small islands
# these countries simply do not have enough land for agriculture, so we fill NA with 0
df.loc[df['Agriculture'].isnull(), 'Country']

df['Agriculture'] = df['Agriculture'].fillna(0)

# fill empty entries with 0
df['Industry'] = df['Industry'].fillna(0)
df['Service'] = df['Service'].fillna(0)

# only a few countries still have missing values now
df.isnull().sum()

# since only a few countries are still missing data, we simply drop these countries
df = df.dropna(axis=0)

# sanity check: no missing values should remain
df.isnull().sum()

# we have dropped 9 countries
df.shape

# Country name is more of an index rather than a feature
X = df.drop('Country', axis=1)

# Create dummy features for Region column
# (get_dummies one-hot encodes every remaining object/categorical column;
#  presumably Region is the only one left at this point)
X = pd.get_dummies(X)
X

from sklearn.preprocessing import StandardScaler

# Standardise every feature so that columns with large numeric ranges
# do not dominate the distance computations used by K-Means.
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X

from sklearn.cluster import KMeans

# candidate numbers of clusters for the elbow analysis
k_values = range(2, 40)

ssd = []
for k in k_values:
    # fixed random_state so the SSD curve is reproducible between runs
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(scaled_X)
    # inertia_ is sum of squared distances of samples to their closest cluster center
    ssd.append(kmeans.inertia_)

# SSD against k; start a fresh figure so it does not share axes with an earlier chart
plt.figure()
plt.plot(k_values, ssd, 'o-');

# change in SSD from k-1 to k; indexed by k so the bar labels show the actual
# cluster count rather than the list position
plt.figure()
pd.Series(ssd, index=k_values).diff().plot(kind='bar');

# final clustering model with 5 clusters
# random_state is fixed so that the cluster labels (and therefore the map
# colours derived from them) are reproducible across runs
final_model = KMeans(n_clusters=5, random_state=42)
final_model.fit(scaled_X)

# cluster index assigned to each row of scaled_X
final_model.labels_

# ISO code of every country
iso = pd.read_csv('country_iso_codes.csv')

iso.head()

iso.set_index('Country')

iso.set_index('Country')['ISO Code'].to_dict()

# create a dictionary for mapping country name -> ISO code
iso_map = iso.set_index('Country')['ISO Code'].to_dict()
# countries whose name is not a key of iso_map get NaN here
df['ISO'] = df['Country'].map(iso_map)

# attach the cluster label of each country
# (labels_ follows the row order of the feature matrix built from df)
df['Cluster'] = final_model.labels_

import plotly.express as px

df.columns

# world map with each country coloured by its cluster label
fig = px.choropleth(
    df,
    locations="ISO",       # column holding each country's ISO code
    color="Cluster",       # colour countries by their K-Means cluster label
    hover_name="Country",  # column to add to hover information
)
fig.show()