IMPORTS
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'darkgrid' style to the plots drawn below.
sns.set(style='darkgrid')
LOADING DATA
# Load the CSV into the DataFrame `df` used by every later cell.
# NOTE(review): the path is absolute and hard-coded; it only resolves where
# '/work/AsiaPopulation2020.csv' exists.
df = pd.read_csv('/work/AsiaPopulation2020.csv')
DATA DESCRIPTION
# Quick look at the data: first rows, dimensions and summary statistics.
# NOTE: these bare expressions only display a result in an interactive /
# notebook session; run as a plain script they produce no output.
df.head()
df.shape
df.describe()
# Heatmap of the boolean missing-value mask, to see which columns contain NaN.
sns.heatmap(df.isna())
FILLNA
# Show the rows whose 'UrbanPop' value is missing.
df[df['UrbanPop'].isna()]
# Replace the missing 'UrbanPop' values with 100 and assign the result back to
# the column. Calling fillna(inplace=True) on `df.UrbanPop` is chained
# assignment: under pandas Copy-on-Write it only updates a temporary object
# and leaves `df` untouched (pandas 2.2 already emits a FutureWarning for it).
# NOTE(review): 100 presumably means "fully urban" for these rows - confirm.
df['UrbanPop'] = df['UrbanPop'].fillna(100)
# Redraw the missing-value heatmap to check the result of the fill.
sns.heatmap(df.isna());
df.dtypes
EXPLORATORY DATA ANALYSIS (EDA)
# Show 3 randomly chosen rows (no seed is set, so the rows differ per run).
df.sample(3)
WHICH ARE ASIA'S MOST POPULOUS COUNTRIES?
# Bar chart of the 10 rows with the largest 'Population' values.
plt.figure(figsize=(15, 7))
most_populous = df.nlargest(10, 'Population')
sns.barplot(data=most_populous, x='Country', y='Population')
plt.title('The most populous countries in Asia');
WHAT ARE THE 10 LEAST URBANIZED COUNTRIES IN ASIA?
# Bar chart of the 10 rows with the smallest 'UrbanPop' values.
plt.figure(figsize=(15, 7))
g = sns.barplot(x='Country', y='UrbanPop', data=df.nsmallest(10, 'UrbanPop'))
# Write each bar's rounded value, followed by '%', centred horizontally on the
# bar and placed at half its height.
for bar in g.patches:
    bar_height = bar.get_height()
    g.text(bar.get_x() + bar.get_width() / 2.,
           bar_height / 2,
           '{}%'.format(round(bar_height)),
           ha="center", color='white')
plt.ylabel('Urbanization rate')
# The title describes what is plotted: the 10 lowest 'UrbanPop' values.
plt.title('The 10 least urbanized countries in Asia');
WHICH COUNTRIES HAVE THE HIGHEST POPULATION DENSITY IN ASIA?
# Bar chart of the 10 rows with the largest 'Density' values.
plt.figure(figsize=(15, 7))
densest = df.nlargest(10, 'Density')
g = sns.barplot(data=densest, x='Country', y='Density')
plt.title("The 10 densest countries in Asia");
# Label every bar with its rounded value, centred at half the bar's height.
for bar in g.patches:
    bar_top = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    g.text(label_x, bar_top / 2, str(round(bar_top)),
           ha="center", color='white')
WHAT ARE THE 10 BIGGEST COUNTRIES IN ASIA?
# Bar chart of the 10 rows with the largest 'LandArea' values.
plt.figure(figsize=(15, 7))
largest_by_area = df.nlargest(10, 'LandArea')
g = sns.barplot(data=largest_by_area, x='Country', y='LandArea')
plt.title('The 10 biggest countries in Asia');
WHICH ARE THE 10 COUNTRIES WITH THE HIGHEST AVERAGE AGE IN ASIA?
# Bar chart of the 10 rows with the largest 'MedAge' values.
plt.figure(figsize=(15, 7))
oldest = df.nlargest(10, 'MedAge')
g = sns.barplot(data=oldest, x='Country', y='MedAge')
# Label every bar with its rounded value, centred at half the bar's height.
for bar in g.patches:
    bar_top = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    g.text(label_x, bar_top / 2, str(round(bar_top)),
           ha="center", color='white')
plt.title('Average age in Asia');
As we can see, Japan has the highest average age in Asia.
WHAT SHARE OF THE WORLD'S POPULATION DOES EACH COUNTRY REPRESENT?
# Bar chart of the 10 rows with the largest 'WorldShare' values.
plt.figure(figsize=(15, 7))
g = sns.barplot(x='Country', y='WorldShare', data=df.nlargest(10, 'WorldShare'))
# Arrow annotation pointing at data coordinates (0.5, 6), with its text placed
# at (3, 5).
# NOTE(review): the 36.17% figure and both positions are hard-coded, not
# computed from `df`; re-check them if the data changes.
g.annotate(
    "China and India together represent 36.17% of the world's population",
    xy=(0.5, 6),
    xytext=(3, 5),
    arrowprops=dict(facecolor="black", shrink=0.09),
)
# Label every bar with its rounded value, centred at half the bar's height.
for bar in g.patches:
    bar_height = bar.get_height()
    g.text(bar.get_x() + bar.get_width() / 2.,
           bar_height / 2,
           '{}'.format(round(bar_height)),
           ha="center", color='white')
# The title names the plotted metric ('WorldShare') instead of repeating the
# title of the 'Population' chart.
plt.title("Share of the world's population: the top 10 countries in Asia");
Correlation
# Pairwise correlations between the numeric columns only. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises on
# them instead ('Country' is presumably a text column), so select the numeric
# columns explicitly; this also works on older pandas versions.
correlations = df.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(10, 5))
# Draw the annotated heatmap on the axes created above.
sns.heatmap(correlations, annot=True, ax=ax);