# import useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the CIA country facts table; every later step works on this frame.
df = pd.read_csv('CIA_Country_Facts.csv')

# Names of the attributes recorded for each country.
df.columns

# (rows, columns) of the raw table: 227 countries x 20 attributes in the
# original run.
df.shape

# The five most populous countries.
# China and India are far ahead of every other country.
(
    df.sort_values(by='Population', ascending=False)
    .loc[:, ['Country', 'Population']]
    .head(5)
)
# Distribution of country populations (100 bins).
# The x-axis is capped at 3e8 so that China and India do not squash the
# rest of the histogram.
_, ax = plt.subplots(figsize=(10, 8))
sns.histplot(x='Population', data=df, bins=100, ax=ax)
ax.set_xlim(0, 3e8)
# GDP per capita aggregated per region (seaborn's barplot shows the mean of
# each group by default).
_, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Region', y='GDP ($ per capita)', data=df, ax=ax)
ax.tick_params(axis='x', labelrotation=90)  # region names are long

# The five countries with the highest GDP per capita.
(
    df.sort_values(by='GDP ($ per capita)', ascending=False)
    .loc[:, ['Country', 'GDP ($ per capita)']]
    .head(5)
)
# GDP per capita against phones per 1000 people, coloured by region.
# Phone ownership appears to rise together with GDP per capita.
_, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(
    x='GDP ($ per capita)',
    y='Phones (per 1000)',
    hue='Region',
    data=df,
    ax=ax,
)

# Phones per 1000 people aggregated per region.
# Western Europe and Northern America again come out on top.
_, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Region', y='Phones (per 1000)', data=df, ax=ax)
ax.tick_params(axis='x', labelrotation=90)

# Columns shown for the two single-country lookups below.
phone_columns = ['Country', 'Phones (per 1000)', 'GDP ($ per capita)']

# The country with the most phones per 1000 people.
df.sort_values(by='Phones (per 1000)', ascending=False).loc[:, phone_columns].head(1)

# The country with the highest GDP per capita, shown with its phone count:
# rich, but with comparatively few phones.
df.sort_values(by='GDP ($ per capita)', ascending=False).loc[:, phone_columns].head(1)
# GDP per capita against three development indicators, coloured by region.
# Each indicator gets its own figure.

# Literacy rate. The red reference line marks 70% literacy; axhline spans the
# full width of the axes, so it does not depend on a hard-coded GDP range.
plt.figure(figsize=(12,8))
sns.scatterplot(data=df,x='GDP ($ per capita)',y='Literacy (%)',hue='Region')
plt.axhline(y=70,color='red');

# Infant mortality rate.
plt.figure(figsize=(12,8))
sns.scatterplot(data=df,x='GDP ($ per capita)',y='Infant mortality (per 1000 births)',hue='Region');

# Death rate.
plt.figure(figsize=(12,8))
sns.scatterplot(data=df,x='GDP ($ per capita)',y='Deathrate',hue='Region');
# Hierarchically clustered heatmap of the pairwise feature correlations.
# Only numeric columns are correlated: the frame also holds text columns
# (Country, Region), and pandas >= 2.0 raises on those instead of silently
# skipping them.
sns.clustermap(df.select_dtypes(include='number').corr())
# Number of missing values in each column.
df.isnull().sum()

# Start with the Climate column (22 missing values in the original run).
df[df['Climate'].isnull()]

# Climate is closely related to a country's region, so each gap is filled
# with the mean Climate of the region the country belongs to.
mean_climate = df.groupby('Region')['Climate'].transform('mean')
df['Climate'] = df['Climate'].fillna(mean_climate)

# Literacy rate also correlates with region, although less strongly, so it is
# filled the same way.
mean_literacy = df.groupby('Region')['Literacy (%)'].transform('mean')
df['Literacy (%)'] = df['Literacy (%)'].fillna(mean_literacy)

# Countries with missing Agriculture values are mostly small islands.
# They presumably lack the land for agriculture, so the gaps are filled with 0.
df[df['Agriculture'].isnull()]['Country']
df['Agriculture'] = df['Agriculture'].fillna(0)

# Missing Industry and Service entries are filled with 0 as well.
df['Industry'] = df['Industry'].fillna(0)
df['Service'] = df['Service'].fillna(0)

# Only a few countries still have missing values at this point.
df.isnull().sum()

# Because so few rows are affected, they are simply dropped.
# .copy() makes the result an independent frame, so the columns added to `df`
# later in this script do not trigger SettingWithCopyWarning.
df = df.dropna(axis=0).copy()
df.isnull().sum()

# Shape after dropping (9 countries were removed in the original run).
df.shape
from sklearn.preprocessing import StandardScaler

# Feature matrix: the country name identifies a row rather than describing
# it, so it is left out; the Region column is expanded into dummy columns.
X = pd.get_dummies(df.drop(columns='Country'))
X

# Standardise the features so that no single column dominates the distance
# computations used for clustering.
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X
from sklearn.cluster import KMeans

# Elbow search: fit K-Means for every k in [2, 40) and record the inertia,
# i.e. the sum of squared distances of samples to their closest cluster center.
k_values = range(2, 40)
ssd = []
for k in k_values:
    # A fixed seed and an explicit n_init keep the curve reproducible across
    # runs and across scikit-learn versions (the n_init default has changed).
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(scaled_X)
    ssd.append(kmeans.inertia_)

# Inertia against k; the "elbow" is where the curve starts to flatten.
plt.figure()
plt.plot(k_values, ssd, 'o-')

# Change in inertia from one k to the next, drawn on its own figure and
# labelled by k rather than by list position.
plt.figure()
pd.Series(ssd, index=k_values).diff().plot(kind='bar')
# Final clustering with 5 clusters (presumably read off the elbow plot).
# random_state pins the initialisation so that cluster ids and memberships
# stay the same from run to run; n_init is explicit because its default
# differs between scikit-learn versions.
final_model = KMeans(n_clusters=5, n_init=10, random_state=42)
final_model.fit(scaled_X)

# Cluster id assigned to each row of scaled_X.
final_model.labels_
# Lookup table holding the ISO code of every country.
iso = pd.read_csv('country_iso_codes.csv')
iso.head()

# Country name -> ISO code dictionary, used to tag every row of df.
# Countries without an entry in the lookup end up with NaN.
iso_map = dict(zip(iso['Country'], iso['ISO Code']))
df['ISO'] = df['Country'].map(iso_map)

# Cluster id of each country, as assigned by the final K-Means model.
df['Cluster'] = final_model.labels_
import plotly.express as px

df.columns

# World map with each country coloured by its K-Means cluster.
# Cluster ids are labels, not quantities, so they are cast to strings for
# plotting: plotly then uses one discrete colour per cluster instead of a
# continuous colour scale that would suggest an ordering.
cluster_labels = df['Cluster'].astype(str)
fig = px.choropleth(
    df.assign(Cluster=cluster_labels),
    locations="ISO",        # ISO code column that places each country on the map
    color="Cluster",        # cluster id assigned by the final model
    hover_name="Country",   # shown as the title of the hover tooltip
    category_orders={"Cluster": sorted(cluster_labels.unique())},
)
fig.show()