Oral Exam - WDI analysis

import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt from sklearn.preprocessing import minmax_scale import plotly.express as px import altair as alt

wdi = pd.read_csv('/datasets/wdi-data/WDIData.csv')

wdi.head()

# drop unnecessary unnamed column wdi = wdi.drop(columns='Unnamed: 66')

# pandas "melt" method reshapes data from wide to long wdi = \ wdi.melt(id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'], \ var_name='Year', value_name='Value') # drop remaining nans wdi = wdi.dropna() # convert year to int for easy filtering/comparision wdi['Year'] = wdi['Year'].astype('int') wdi = wdi.rename(columns={'Country Name': 'Country', 'Indicator Name': 'Indicator'})

asean_states = \ ['Brunei Darussalam', 'Cambodia', 'Indonesia', 'Lao PDR', 'Malaysia', 'Myanmar', 'Philippines', 'Singapore', 'Thailand', 'Vietnam'] se_asian_states = wdi[(wdi['Country'].isin(asean_states)) & (wdi['Year'] >= 2010)]

# define the key health and education indicators used for the analysis key_growth_indicators = \ [ 'Human capital index (HCI) (scale 0-1)', 'Life expectancy at birth, total (years)', 'Mortality rate, adult, male (per 1,000 male adults)', 'GDP per capita (current US$)', 'Literacy rate, adult total (% of people ages 15 and above)', 'Progression to secondary school (%)', 'Government expenditure per student, primary (% of GDP per capita)', 'Physicians (per 1,000 people)', 'People using safely managed drinking water services (% of population)' ] df = \ se_asian_states[se_asian_states['Indicator']\ .isin(key_growth_indicators)] df = df.drop(columns=['Country Code', 'Indicator Code'])

# let's inspect the data df.head()

# break out each column value into it's own column for easy comparison df = df\ .pivot_table(values='Value', index=['Year', 'Country'], columns='Indicator').reset_index() df.columns.name = None

# and viola, each indicator is it's own unique column now df.columns

region_year_outcomes = \ ['Year', 'Country', 'GDP per capita (current US$)', 'Human capital index (HCI) (scale 0-1)'] education = [ 'Government expenditure per student, primary (% of GDP per capita)', 'Literacy rate, adult total (% of people ages 15 and above)', 'Progression to secondary school (%)', ] education.extend(region_year_outcomes) health = [ind for ind in df.columns.tolist() if ind not in education] health.extend(region_year_outcomes) edu = df[education] heal = df[health]

# check pct of nans in edu dataframe edu.isna().sum().sum() / (edu.shape[0] * edu.shape[1])

edu.drop(columns='Year').describe()

# check pct of nans in heal dataframe heal.isna().sum().sum() / (heal.shape[0] * heal.shape[1])

heal.drop(columns='Year').describe()

''' let's inspect the education correlation coefficents using the default 'pearson' correlation as part of pandas .corr() method ''' edu.corr()

hci_to_prog_second_school = \ edu[['Human capital index (HCI) (scale 0-1)', 'Progression to secondary school (%)']] scaled_hci_to_prog_school = hci_to_prog_second_school.copy() # scale hci and progression variables between 0 and 1 to accurately depict relationship scaled_hci_to_prog_school[:] = minmax_scale(hci_to_prog_second_school) px.scatter(scaled_hci_to_prog_school, x='Progression to secondary school (%)', y='Human capital index (HCI) (scale 0-1)', trendline='lowess', trendline_scope = 'overall')

edu.corr(method='spearman').loc['Progression to secondary school (%)', 'Human capital index (HCI) (scale 0-1)']

edu.corr(method='kendall').loc['Progression to secondary school (%)', 'Human capital index (HCI) (scale 0-1)']

''' let's filter our education and health dataframe to exclude certain columns, namely year and country to look at the feature variables' relationship with the target ''' heal_corr_gdp = heal\ .loc[:, ~heal.columns.isin(['Year', 'Country', 'Human capital index (HCI) (scale 0-1)', 'GDP per capita (current US$)'])].corrwith(heal['GDP per capita (current US$)']).sort_values(ascending=False) heal_corr_hci = heal\ .loc[:, ~heal.columns.isin(['Year', 'Country', 'GDP per capita (current US$)', 'Human capital index (HCI) (scale 0-1)'])].corrwith(heal['Human capital index (HCI) (scale 0-1)']).sort_values(ascending=False) heal_hci_data = pd.DataFrame(heal_corr_hci) heal_hci_data = heal_hci_data.rename(columns={0: 'HCI'}) heal_gdp_data = pd.DataFrame(heal_corr_gdp) heal_gdp_data = heal_gdp_data.rename(columns={0: 'GDP per capita'}) edu_corr_gdp = edu\ .loc[:, ~edu.columns.isin(['Year', 'Country', 'Human capital index (HCI) (scale 0-1)', 'GDP per capita (current US$)'])].corrwith(edu['GDP per capita (current US$)']).sort_values(ascending=False) edu_corr_hci = edu\ .loc[:, ~edu.columns.isin(['Year', 'Country', 'GDP per capita (current US$)', 'Human capital index (HCI) (scale 0-1)'])].corrwith(edu['Human capital index (HCI) (scale 0-1)']).sort_values(ascending=False) edu_hci_data = pd.DataFrame(edu_corr_hci) edu_hci_data = edu_hci_data.rename(columns={0: 'HCI'}) edu_gdp_data = pd.DataFrame(edu_corr_gdp) edu_gdp_data = edu_gdp_data.rename(columns={0: 'GDP per capita'})

heal_gdp_data.head()

from plotly.subplots import make_subplots import plotly.graph_objects as go import matplotlib as mpl mpl.rcParams['figure.figsize'] = [10.0,10.0] fig = px.imshow(edu_gdp_data, text_auto=True) fig.update_layout( title_text='Correlation of Education Indicators\' on GDP per capita', xaxis=dict({'side': 'top'}), ) fig.show() fig = px.imshow(edu_hci_data, text_auto=True) fig.update_layout( title_text='Correlation of Education Indicators\' on HCI', xaxis=dict({'side': 'top'}) ) fig.show()

fig = px.imshow(heal_gdp_data, text_auto=True) fig.update_layout( # height=600, # width=800, title_text='Correlation of Health Indicators\' on GDP per capita', xaxis=dict({'side': 'top'}), # yaxis=dict({title_text:'Education Indicators'}) ) fig.show() fig = px.imshow(heal_hci_data, text_auto=True) fig.update_layout( # height=600, # width=800, title_text='Correlation of Health Indicators\' on HCI', xaxis=dict({'side': 'top'}), # yaxis=dict({title_text:'Education Indicators'}) ) fig.show()

# education-related df's, measuring govt expenditure on literacy & progression to secondary school literacy = edu[[ # 'Government expenditure on education, total (% of GDP)', \ 'Government expenditure per student, primary (% of GDP per capita)',\ 'Literacy rate, adult total (% of people ages 15 and above)',\ 'Year',\ 'Country' ]] grouped = \ literacy.groupby(['Country'])[['Government expenditure per student, primary (% of GDP per capita)', 'Literacy rate, adult total (% of people ages 15 and above)']] mean_govt_spending_on_literacy = \ grouped.mean().reset_index().dropna() mean_govt_spending_on_literacy = mean_govt_spending_on_literacy.sort_values(by=['Government expenditure per student, primary (% of GDP per capita)']) mean_govt_spending_on_literacy['Government expenditure per student, primary (% of GDP per capita)'] = \ round(mean_govt_spending_on_literacy['Government expenditure per student, primary (% of GDP per capita)'], 2) fig = px.bar(mean_govt_spending_on_literacy, x='Government expenditure per student, primary (% of GDP per capita)',\ y='Literacy rate, adult total (% of people ages 15 and above)', color='Country') fig.update_xaxes(type='category') fig.show()

hci_second = edu[['Progression to secondary school (%)', 'Human capital index (HCI) (scale 0-1)', 'Year', 'Country']].dropna() fig1 = px.bar( hci_second, x='Progression to secondary school (%)', y='Human capital index (HCI) (scale 0-1)', color='Country' # , size_max=50 ) fig1.show()

gdp = heal[['Physicians (per 1,000 people)', 'GDP per capita (current US$)', 'Year', 'Country', 'Life expectancy at birth, total (years)', 'Human capital index (HCI) (scale 0-1)']].dropna() fig = px.scatter( gdp, x='Physicians (per 1,000 people)', y='Life expectancy at birth, total (years)', color='Country', hover_name='Country', trendline='ols', trendline_scope = 'overall') fig.update_layout(title="Physicians per 1000 people on Life expectancy") fig.show() fig = px.scatter( gdp, x='Life expectancy at birth, total (years)', y='Human capital index (HCI) (scale 0-1)', color='Country', hover_name='Country', trendline='ols', trendline_scope = 'overall') fig.update_layout(title="Life Expectancy on HCI") fig.show()

# health data - see how no. of physicians & basic santiation impacts life expectancy phys_on_mortality = heal[[ 'Physicians (per 1,000 people)', 'Year', 'Country', # 'Mortality rate, adult, female (per 1,000 female adults)', 'Mortality rate, adult, male (per 1,000 male adults)', 'Life expectancy at birth, total (years)' ]] basic_sani_on_life_exp = heal[[ 'People using safely managed drinking water services (% of population)', 'Year', 'Country', 'Life expectancy at birth, total (years)', 'Mortality rate, adult, male (per 1,000 male adults)' # 'Human capital index (HCI) (scale 0-1)' ]]

import plotly.express as px fig = px.scatter(basic_sani_on_life_exp, x="People using safely managed drinking water services (% of population)", \ y='Mortality rate, adult, male (per 1,000 male adults)', animation_frame='Year', animation_group='Country', size=basic_sani_on_life_exp["Life expectancy at birth, total (years)"].fillna(0), color='Country', height=500, width=800, hover_name='Country', size_max=30, log_x=True) fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 700 fig.update_layout(title="Safe drinking water services on Mortality Rate") fig.show()

# phys_on_mortality = phys_on_mortality.dropna() fig = px.scatter(phys_on_mortality, x="Physicians (per 1,000 people)", y="Mortality rate, adult, male (per 1,000 male adults)", animation_frame="Year", animation_group="Country", color="Country", size=phys_on_mortality["Life expectancy at birth, total (years)"].fillna(0), hover_name="Country", log_x=True, size_max=30) fig.update_layout(title="No. of Physicians per 1000 people on Mortality Rate") fig.show()

grouped = \ df.groupby(['Year', 'Country'])[['Human capital index (HCI) (scale 0-1)', 'GDP per capita (current US$)']] mean_govt_spending_on_gdp_per_capita = \ grouped.mean().reset_index().dropna() fig = px.scatter(mean_govt_spending_on_gdp_per_capita, x='Human capital index (HCI) (scale 0-1)',\ y='GDP per capita (current US$)', color='Country', trendline='lowess', trendline_scope = 'overall') fig.update_layout(title="HCI vs GDP per Capita")

def pivot(df): df = df.pivot_table(values='Value', index=['Year', 'Country'], columns='Indicator').reset_index() df.columns.name = None return df def filter_for_countries_with_high_indicator_count(df): grouped = df.set_index(['Year', 'Country']).groupby(level=[0, 1]) df = \ grouped\ .filter(lambda x: x['Indicator'].count() >= len(groupby_df['Indicator'].unique()) / 2).reset_index() display('Count of unique Health and Education Indicators per Year & Country', df.groupby(['Year', 'Country'])['Indicator'].count().to_frame().rename(columns={'Indicator': 'Indicator Count'})) return df

groupby_df = \ se_asian_states[se_asian_states['Indicator']\ .isin(key_growth_indicators)] groupby_df = \ groupby_df.drop(columns=['Country Code', 'Indicator Code']) groupby_df = filter_for_countries_with_high_indicator_count(groupby_df)

grouped_df_pivot = pivot(groupby_df) grouped = \ grouped_df_pivot.groupby(['Year', 'Country'])[['Literacy rate, adult total (% of people ages 15 and above)', 'Progression to secondary school (%)']] mean_literacy_rate_on_prog_to_second_school = \ grouped.mean().reset_index().dropna() px.scatter(mean_literacy_rate_on_prog_to_second_school, x='Literacy rate, adult total (% of people ages 15 and above)',\ y='Progression to secondary school (%)', color='Country', trendline='ols', trendline_scope = 'overall')