import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import minmax_scale
import plotly.express as px
import altair as alt
wdi = pd.read_csv('/datasets/wdi-data/WDIData.csv')
wdi.head()
# drop unnecessary unnamed column
wdi = wdi.drop(columns='Unnamed: 66')
# pandas "melt" method reshapes data from wide to long 
wdi = \
wdi.melt(id_vars=['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code'], \
         var_name='Year', value_name='Value')
# drop remaining nans
wdi = wdi.dropna()
# convert year to int for easy filtering/comparision 
wdi['Year'] = wdi['Year'].astype('int')
wdi = wdi.rename(columns={'Country Name': 'Country', 'Indicator Name': 'Indicator'})
asean_states = \
['Brunei Darussalam', 'Cambodia', 'Indonesia', 'Lao PDR', 'Malaysia', 'Myanmar', 'Philippines', 'Singapore', 'Thailand', 'Vietnam']
se_asian_states = wdi[(wdi['Country'].isin(asean_states)) & (wdi['Year'] >= 2010)]
# define the key health and education indicators used for the analysis
key_growth_indicators = \
[
    'Human capital index (HCI) (scale 0-1)',
    'Life expectancy at birth, total (years)',
    'Mortality rate, adult, male (per 1,000 male adults)',
    'GDP per capita (current US$)',
    'Literacy rate, adult total (% of people ages 15 and above)',
    'Progression to secondary school (%)',
    'Government expenditure per student, primary (% of GDP per capita)',
    'Physicians (per 1,000 people)',
    'People using safely managed drinking water services (% of population)'
]
df = \
se_asian_states[se_asian_states['Indicator']\
.isin(key_growth_indicators)]
df = df.drop(columns=['Country Code', 'Indicator Code'])
# let's inspect the data
df.head()
# break out each column value into it's own column for easy comparison 
df = df\
.pivot_table(values='Value', index=['Year', 'Country'], columns='Indicator').reset_index()
df.columns.name = None
# and viola, each indicator is it's own unique column now
df.columns
region_year_outcomes = \
['Year', 'Country', 'GDP per capita (current US$)', 'Human capital index (HCI) (scale 0-1)']
education = [
             'Government expenditure per student, primary (% of GDP per capita)',
             'Literacy rate, adult total (% of people ages 15 and above)',
            'Progression to secondary school (%)',
            ]
education.extend(region_year_outcomes)
health = [ind for ind in df.columns.tolist() if ind not in education]
health.extend(region_year_outcomes)
edu = df[education]
heal = df[health]
# check pct of nans in edu dataframe
edu.isna().sum().sum() / (edu.shape[0] * edu.shape[1])
edu.drop(columns='Year').describe()
# check pct of nans in heal dataframe
heal.isna().sum().sum() / (heal.shape[0] * heal.shape[1])
heal.drop(columns='Year').describe()
'''
let's inspect the education correlation coefficents 
using the default 'pearson' correlation as part of 
pandas .corr() method
'''
edu.corr()
hci_to_prog_second_school = \
edu[['Human capital index (HCI) (scale 0-1)', 'Progression to secondary school (%)']]
scaled_hci_to_prog_school = hci_to_prog_second_school.copy()
# scale hci and progression variables between 0 and 1 to accurately depict relationship
scaled_hci_to_prog_school[:] = minmax_scale(hci_to_prog_second_school)
px.scatter(scaled_hci_to_prog_school, x='Progression to secondary school (%)', 
y='Human capital index (HCI) (scale 0-1)', trendline='lowess', 
trendline_scope = 'overall')
edu.corr(method='spearman').loc['Progression to secondary school (%)', 'Human capital index (HCI) (scale 0-1)']
edu.corr(method='kendall').loc['Progression to secondary school (%)', 'Human capital index (HCI) (scale 0-1)']
'''
let's filter our education and health dataframe 
to exclude certain columns, namely year and 
country to look at the feature variables' 
relationship with the target
'''
heal_corr_gdp = heal\
.loc[:, ~heal.columns.isin(['Year', 'Country', 'Human capital index (HCI) (scale 0-1)', 'GDP per capita (current US$)'])].corrwith(heal['GDP per capita (current US$)']).sort_values(ascending=False)
heal_corr_hci = heal\
.loc[:, ~heal.columns.isin(['Year', 'Country', 'GDP per capita (current US$)', 'Human capital index (HCI) (scale 0-1)'])].corrwith(heal['Human capital index (HCI) (scale 0-1)']).sort_values(ascending=False)
heal_hci_data = pd.DataFrame(heal_corr_hci)
heal_hci_data = heal_hci_data.rename(columns={0: 'HCI'})
heal_gdp_data = pd.DataFrame(heal_corr_gdp)
heal_gdp_data = heal_gdp_data.rename(columns={0: 'GDP per capita'})
edu_corr_gdp = edu\
.loc[:, ~edu.columns.isin(['Year', 'Country', 'Human capital index (HCI) (scale 0-1)', 'GDP per capita (current US$)'])].corrwith(edu['GDP per capita (current US$)']).sort_values(ascending=False)
edu_corr_hci = edu\
.loc[:, ~edu.columns.isin(['Year', 'Country', 'GDP per capita (current US$)', 'Human capital index (HCI) (scale 0-1)'])].corrwith(edu['Human capital index (HCI) (scale 0-1)']).sort_values(ascending=False)
edu_hci_data = pd.DataFrame(edu_corr_hci)
edu_hci_data = edu_hci_data.rename(columns={0: 'HCI'})
edu_gdp_data = pd.DataFrame(edu_corr_gdp)
edu_gdp_data = edu_gdp_data.rename(columns={0: 'GDP per capita'})
heal_gdp_data.head()
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = [10.0,10.0]
fig = px.imshow(edu_gdp_data, text_auto=True)
fig.update_layout(
    title_text='Correlation of Education Indicators\' on GDP per capita',
    xaxis=dict({'side': 'top'}),
    
)
fig.show()
fig = px.imshow(edu_hci_data, text_auto=True)
fig.update_layout(
    title_text='Correlation of Education Indicators\' on HCI',
    xaxis=dict({'side': 'top'})
    
)
fig.show()
fig = px.imshow(heal_gdp_data, text_auto=True)
fig.update_layout(
#     height=600, 
#     width=800,
    title_text='Correlation of Health Indicators\' on GDP per capita',
    xaxis=dict({'side': 'top'}),
#     yaxis=dict({title_text:'Education Indicators'})
    
)
fig.show()
fig = px.imshow(heal_hci_data, text_auto=True)
fig.update_layout(
#     height=600, 
#     width=800,
    title_text='Correlation of Health Indicators\' on HCI',
    xaxis=dict({'side': 'top'}),
#     yaxis=dict({title_text:'Education Indicators'})
    
)
fig.show()
# education-related df's, measuring govt expenditure on literacy & progression to secondary school
literacy = edu[[
                # 'Government expenditure on education, total (% of GDP)', \
                'Government expenditure per student, primary (% of GDP per capita)',\
                'Literacy rate, adult total (% of people ages 15 and above)',\
                'Year',\
                'Country'
                ]]
grouped = \
literacy.groupby(['Country'])[['Government expenditure per student, primary (% of GDP per capita)',
'Literacy rate, adult total (% of people ages 15 and above)']]
mean_govt_spending_on_literacy = \
grouped.mean().reset_index().dropna()
mean_govt_spending_on_literacy = mean_govt_spending_on_literacy.sort_values(by=['Government expenditure per student, primary (% of GDP per capita)'])
mean_govt_spending_on_literacy['Government expenditure per student, primary (% of GDP per capita)'] = \
round(mean_govt_spending_on_literacy['Government expenditure per student, primary (% of GDP per capita)'], 2)
fig = px.bar(mean_govt_spending_on_literacy,
x='Government expenditure per student, primary (% of GDP per capita)',\
y='Literacy rate, adult total (% of people ages 15 and above)', color='Country')
fig.update_xaxes(type='category')
fig.show()
hci_second = edu[['Progression to secondary school (%)',
'Human capital index (HCI) (scale 0-1)',
'Year', 'Country']].dropna()
fig1 = px.bar(
           hci_second, 
           x='Progression to secondary school (%)',
           y='Human capital index (HCI) (scale 0-1)',
           color='Country'
        #    , size_max=50
           )
fig1.show()
  
gdp = heal[['Physicians (per 1,000 people)', 
'GDP per capita (current US$)',
'Year', 'Country', 'Life expectancy at birth, total (years)',
'Human capital index (HCI) (scale 0-1)']].dropna()
fig = px.scatter(
           gdp, 
           x='Physicians (per 1,000 people)',
           y='Life expectancy at birth, total (years)',
           color='Country', hover_name='Country',
           trendline='ols', trendline_scope = 'overall')
fig.update_layout(title="Physicians per 1000 people on Life expectancy")
fig.show()
fig = px.scatter(
           gdp, 
           x='Life expectancy at birth, total (years)',
           y='Human capital index (HCI) (scale 0-1)',
           color='Country', hover_name='Country', 
           trendline='ols', trendline_scope = 'overall')
fig.update_layout(title="Life Expectancy on HCI")
fig.show()
# health data - see how no. of physicians & basic santiation impacts life expectancy
phys_on_mortality = heal[[
    'Physicians (per 1,000 people)', 
    'Year', 
    'Country',
    # 'Mortality rate, adult, female (per 1,000 female adults)',
    'Mortality rate, adult, male (per 1,000 male adults)',
    'Life expectancy at birth, total (years)'
]]
basic_sani_on_life_exp = heal[[
    'People using safely managed drinking water services (% of population)',
    'Year', 
    'Country',
    'Life expectancy at birth, total (years)',
    'Mortality rate, adult, male (per 1,000 male adults)'
    # 'Human capital index (HCI) (scale 0-1)'
]]
import plotly.express as px                
fig = px.scatter(basic_sani_on_life_exp, 
                 x="People using safely managed drinking water services (% of population)", \
                 y='Mortality rate, adult, male (per 1,000 male adults)',
                 animation_frame='Year', animation_group='Country',
                 size=basic_sani_on_life_exp["Life expectancy at birth, total (years)"].fillna(0), color='Country', 
                 height=500, width=800, hover_name='Country', size_max=30, log_x=True)
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 700
fig.update_layout(title="Safe drinking water services on Mortality Rate")
fig.show()
# phys_on_mortality = phys_on_mortality.dropna()
fig = px.scatter(phys_on_mortality, x="Physicians (per 1,000 people)", y="Mortality rate, adult, male (per 1,000 male adults)", animation_frame="Year", animation_group="Country",
           color="Country", size=phys_on_mortality["Life expectancy at birth, total (years)"].fillna(0), hover_name="Country",
           log_x=True, size_max=30)
fig.update_layout(title="No. of Physicians per 1000 people on Mortality Rate")
fig.show()
grouped = \
df.groupby(['Year', 'Country'])[['Human capital index (HCI) (scale 0-1)',
'GDP per capita (current US$)']]
mean_govt_spending_on_gdp_per_capita = \
grouped.mean().reset_index().dropna()
fig = px.scatter(mean_govt_spending_on_gdp_per_capita,
x='Human capital index (HCI) (scale 0-1)',\
y='GDP per capita (current US$)', color='Country', trendline='lowess', 
trendline_scope = 'overall')
fig.update_layout(title="HCI vs GDP per Capita")
def pivot(df):
    df = df.pivot_table(values='Value', index=['Year', 'Country'], columns='Indicator').reset_index()
    df.columns.name = None
    return df
def filter_for_countries_with_high_indicator_count(df):
    
    grouped = df.set_index(['Year', 'Country']).groupby(level=[0, 1])
    df = \
    grouped\
    .filter(lambda x: x['Indicator'].count() >= len(groupby_df['Indicator'].unique()) / 2).reset_index()
    
    display('Count of unique Health and Education Indicators per Year & Country', df.groupby(['Year', 'Country'])['Indicator'].count().to_frame().rename(columns={'Indicator': 'Indicator Count'}))
    return df
    
    
groupby_df = \
se_asian_states[se_asian_states['Indicator']\
.isin(key_growth_indicators)]
groupby_df = \
groupby_df.drop(columns=['Country Code', 'Indicator Code'])
groupby_df = filter_for_countries_with_high_indicator_count(groupby_df)
grouped_df_pivot = pivot(groupby_df)
grouped = \
grouped_df_pivot.groupby(['Year', 'Country'])[['Literacy rate, adult total (% of people ages 15 and above)',
'Progression to secondary school (%)']]
mean_literacy_rate_on_prog_to_second_school = \
grouped.mean().reset_index().dropna()
px.scatter(mean_literacy_rate_on_prog_to_second_school,
x='Literacy rate, adult total (% of people ages 15 and above)',\
y='Progression to secondary school (%)', color='Country', trendline='ols', 
trendline_scope = 'overall')