import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import minmax_scale
import plotly.express as px
import altair as alt
wdi = pd.read_csv('/datasets/wdi-data/WDIData.csv')
wdi.head()
# drop unnecessary unnamed column
wdi = wdi.drop(columns='Unnamed: 66')
# pandas "melt" method reshapes data from wide to long
wdi = \
wdi.melt(id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'], \
var_name='Year', value_name='Value')
# drop remaining nans
wdi = wdi.dropna()
# convert year to int for easy filtering/comparision
wdi['Year'] = wdi['Year'].astype('int')
wdi = wdi.rename(columns={'Country Name': 'Country', 'Indicator Name': 'Indicator'})
asean_states = \
['Brunei Darussalam', 'Cambodia', 'Indonesia', 'Lao PDR', 'Malaysia', 'Myanmar', 'Philippines', 'Singapore', 'Thailand', 'Vietnam']
se_asian_states = wdi[(wdi['Country'].isin(asean_states)) & (wdi['Year'] >= 2010)]
# define the key health and education indicators used for the analysis
key_growth_indicators = \
[
'Human capital index (HCI) (scale 0-1)',
'Life expectancy at birth, total (years)',
'Mortality rate, adult, male (per 1,000 male adults)',
'GDP per capita (current US$)',
'Literacy rate, adult total (% of people ages 15 and above)',
'Progression to secondary school (%)',
'Government expenditure per student, primary (% of GDP per capita)',
'Physicians (per 1,000 people)',
'People using safely managed drinking water services (% of population)'
]
df = \
se_asian_states[se_asian_states['Indicator']\
.isin(key_growth_indicators)]
df = df.drop(columns=['Country Code', 'Indicator Code'])
# let's inspect the data
df.head()
# break out each column value into it's own column for easy comparison
df = df\
.pivot_table(values='Value', index=['Year', 'Country'], columns='Indicator').reset_index()
df.columns.name = None
# and viola, each indicator is it's own unique column now
df.columns
region_year_outcomes = \
['Year', 'Country', 'GDP per capita (current US$)', 'Human capital index (HCI) (scale 0-1)']
education = [
'Government expenditure per student, primary (% of GDP per capita)',
'Literacy rate, adult total (% of people ages 15 and above)',
'Progression to secondary school (%)',
]
education.extend(region_year_outcomes)
health = [ind for ind in df.columns.tolist() if ind not in education]
health.extend(region_year_outcomes)
edu = df[education]
heal = df[health]
# check pct of nans in edu dataframe
edu.isna().sum().sum() / (edu.shape[0] * edu.shape[1])
edu.drop(columns='Year').describe()
# check pct of nans in heal dataframe
heal.isna().sum().sum() / (heal.shape[0] * heal.shape[1])
heal.drop(columns='Year').describe()
'''
let's inspect the education correlation coefficents
using the default 'pearson' correlation as part of
pandas .corr() method
'''
edu.corr()
hci_to_prog_second_school = \
edu[['Human capital index (HCI) (scale 0-1)', 'Progression to secondary school (%)']]
scaled_hci_to_prog_school = hci_to_prog_second_school.copy()
# scale hci and progression variables between 0 and 1 to accurately depict relationship
scaled_hci_to_prog_school[:] = minmax_scale(hci_to_prog_second_school)
px.scatter(scaled_hci_to_prog_school, x='Progression to secondary school (%)',
y='Human capital index (HCI) (scale 0-1)', trendline='lowess',
trendline_scope = 'overall')
edu.corr(method='spearman').loc['Progression to secondary school (%)', 'Human capital index (HCI) (scale 0-1)']
edu.corr(method='kendall').loc['Progression to secondary school (%)', 'Human capital index (HCI) (scale 0-1)']
'''
let's filter our education and health dataframe
to exclude certain columns, namely year and
country to look at the feature variables'
relationship with the target
'''
heal_corr_gdp = heal\
.loc[:, ~heal.columns.isin(['Year', 'Country', 'Human capital index (HCI) (scale 0-1)', 'GDP per capita (current US$)'])].corrwith(heal['GDP per capita (current US$)']).sort_values(ascending=False)
heal_corr_hci = heal\
.loc[:, ~heal.columns.isin(['Year', 'Country', 'GDP per capita (current US$)', 'Human capital index (HCI) (scale 0-1)'])].corrwith(heal['Human capital index (HCI) (scale 0-1)']).sort_values(ascending=False)
heal_hci_data = pd.DataFrame(heal_corr_hci)
heal_hci_data = heal_hci_data.rename(columns={0: 'HCI'})
heal_gdp_data = pd.DataFrame(heal_corr_gdp)
heal_gdp_data = heal_gdp_data.rename(columns={0: 'GDP per capita'})
edu_corr_gdp = edu\
.loc[:, ~edu.columns.isin(['Year', 'Country', 'Human capital index (HCI) (scale 0-1)', 'GDP per capita (current US$)'])].corrwith(edu['GDP per capita (current US$)']).sort_values(ascending=False)
edu_corr_hci = edu\
.loc[:, ~edu.columns.isin(['Year', 'Country', 'GDP per capita (current US$)', 'Human capital index (HCI) (scale 0-1)'])].corrwith(edu['Human capital index (HCI) (scale 0-1)']).sort_values(ascending=False)
edu_hci_data = pd.DataFrame(edu_corr_hci)
edu_hci_data = edu_hci_data.rename(columns={0: 'HCI'})
edu_gdp_data = pd.DataFrame(edu_corr_gdp)
edu_gdp_data = edu_gdp_data.rename(columns={0: 'GDP per capita'})
heal_gdp_data.head()
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = [10.0,10.0]
fig = px.imshow(edu_gdp_data, text_auto=True)
fig.update_layout(
title_text='Correlation of Education Indicators\' on GDP per capita',
xaxis=dict({'side': 'top'}),
)
fig.show()
fig = px.imshow(edu_hci_data, text_auto=True)
fig.update_layout(
title_text='Correlation of Education Indicators\' on HCI',
xaxis=dict({'side': 'top'})
)
fig.show()
fig = px.imshow(heal_gdp_data, text_auto=True)
fig.update_layout(
# height=600,
# width=800,
title_text='Correlation of Health Indicators\' on GDP per capita',
xaxis=dict({'side': 'top'}),
# yaxis=dict({title_text:'Education Indicators'})
)
fig.show()
fig = px.imshow(heal_hci_data, text_auto=True)
fig.update_layout(
# height=600,
# width=800,
title_text='Correlation of Health Indicators\' on HCI',
xaxis=dict({'side': 'top'}),
# yaxis=dict({title_text:'Education Indicators'})
)
fig.show()
# education-related df's, measuring govt expenditure on literacy & progression to secondary school
literacy = edu[[
# 'Government expenditure on education, total (% of GDP)', \
'Government expenditure per student, primary (% of GDP per capita)',\
'Literacy rate, adult total (% of people ages 15 and above)',\
'Year',\
'Country'
]]
grouped = \
literacy.groupby(['Country'])[['Government expenditure per student, primary (% of GDP per capita)',
'Literacy rate, adult total (% of people ages 15 and above)']]
mean_govt_spending_on_literacy = \
grouped.mean().reset_index().dropna()
mean_govt_spending_on_literacy = mean_govt_spending_on_literacy.sort_values(by=['Government expenditure per student, primary (% of GDP per capita)'])
mean_govt_spending_on_literacy['Government expenditure per student, primary (% of GDP per capita)'] = \
round(mean_govt_spending_on_literacy['Government expenditure per student, primary (% of GDP per capita)'], 2)
fig = px.bar(mean_govt_spending_on_literacy,
x='Government expenditure per student, primary (% of GDP per capita)',\
y='Literacy rate, adult total (% of people ages 15 and above)', color='Country')
fig.update_xaxes(type='category')
fig.show()
hci_second = edu[['Progression to secondary school (%)',
'Human capital index (HCI) (scale 0-1)',
'Year', 'Country']].dropna()
fig1 = px.bar(
hci_second,
x='Progression to secondary school (%)',
y='Human capital index (HCI) (scale 0-1)',
color='Country'
# , size_max=50
)
fig1.show()
gdp = heal[['Physicians (per 1,000 people)',
'GDP per capita (current US$)',
'Year', 'Country', 'Life expectancy at birth, total (years)',
'Human capital index (HCI) (scale 0-1)']].dropna()
fig = px.scatter(
gdp,
x='Physicians (per 1,000 people)',
y='Life expectancy at birth, total (years)',
color='Country', hover_name='Country',
trendline='ols', trendline_scope = 'overall')
fig.update_layout(title="Physicians per 1000 people on Life expectancy")
fig.show()
fig = px.scatter(
gdp,
x='Life expectancy at birth, total (years)',
y='Human capital index (HCI) (scale 0-1)',
color='Country', hover_name='Country',
trendline='ols', trendline_scope = 'overall')
fig.update_layout(title="Life Expectancy on HCI")
fig.show()
# health data - see how no. of physicians & basic santiation impacts life expectancy
phys_on_mortality = heal[[
'Physicians (per 1,000 people)',
'Year',
'Country',
# 'Mortality rate, adult, female (per 1,000 female adults)',
'Mortality rate, adult, male (per 1,000 male adults)',
'Life expectancy at birth, total (years)'
]]
basic_sani_on_life_exp = heal[[
'People using safely managed drinking water services (% of population)',
'Year',
'Country',
'Life expectancy at birth, total (years)',
'Mortality rate, adult, male (per 1,000 male adults)'
# 'Human capital index (HCI) (scale 0-1)'
]]
import plotly.express as px
fig = px.scatter(basic_sani_on_life_exp,
x="People using safely managed drinking water services (% of population)", \
y='Mortality rate, adult, male (per 1,000 male adults)',
animation_frame='Year', animation_group='Country',
size=basic_sani_on_life_exp["Life expectancy at birth, total (years)"].fillna(0), color='Country',
height=500, width=800, hover_name='Country', size_max=30, log_x=True)
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 700
fig.update_layout(title="Safe drinking water services on Mortality Rate")
fig.show()
# phys_on_mortality = phys_on_mortality.dropna()
fig = px.scatter(phys_on_mortality, x="Physicians (per 1,000 people)", y="Mortality rate, adult, male (per 1,000 male adults)", animation_frame="Year", animation_group="Country",
color="Country", size=phys_on_mortality["Life expectancy at birth, total (years)"].fillna(0), hover_name="Country",
log_x=True, size_max=30)
fig.update_layout(title="No. of Physicians per 1000 people on Mortality Rate")
fig.show()
grouped = \
df.groupby(['Year', 'Country'])[['Human capital index (HCI) (scale 0-1)',
'GDP per capita (current US$)']]
mean_govt_spending_on_gdp_per_capita = \
grouped.mean().reset_index().dropna()
fig = px.scatter(mean_govt_spending_on_gdp_per_capita,
x='Human capital index (HCI) (scale 0-1)',\
y='GDP per capita (current US$)', color='Country', trendline='lowess',
trendline_scope = 'overall')
fig.update_layout(title="HCI vs GDP per Capita")
def pivot(df):
df = df.pivot_table(values='Value', index=['Year', 'Country'], columns='Indicator').reset_index()
df.columns.name = None
return df
def filter_for_countries_with_high_indicator_count(df):
grouped = df.set_index(['Year', 'Country']).groupby(level=[0, 1])
df = \
grouped\
.filter(lambda x: x['Indicator'].count() >= len(groupby_df['Indicator'].unique()) / 2).reset_index()
display('Count of unique Health and Education Indicators per Year & Country', df.groupby(['Year', 'Country'])['Indicator'].count().to_frame().rename(columns={'Indicator': 'Indicator Count'}))
return df
groupby_df = \
se_asian_states[se_asian_states['Indicator']\
.isin(key_growth_indicators)]
groupby_df = \
groupby_df.drop(columns=['Country Code', 'Indicator Code'])
groupby_df = filter_for_countries_with_high_indicator_count(groupby_df)
grouped_df_pivot = pivot(groupby_df)
grouped = \
grouped_df_pivot.groupby(['Year', 'Country'])[['Literacy rate, adult total (% of people ages 15 and above)',
'Progression to secondary school (%)']]
mean_literacy_rate_on_prog_to_second_school = \
grouped.mean().reset_index().dropna()
px.scatter(mean_literacy_rate_on_prog_to_second_school,
x='Literacy rate, adult total (% of people ages 15 and above)',\
y='Progression to secondary school (%)', color='Country', trendline='ols',
trendline_scope = 'overall')