Motivation
Setup
# Load libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib.ticker import FormatStrFormatter
from matplotlib_scalebar.scalebar import ScaleBar
from pylab import rcParams
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 72
import seaborn as sns
#sns.set_style("darkgrid")
#sns.set_context(context="paper", font_scale=1.5, rc=None)
#sns.set(font="serif")
import plotly.express as px
import plotly.graph_objects as go
import geopandas as gpd
import contextily as cx
import libpysal
from libpysal import weights
from libpysal.weights import Queen
import esda
from esda.moran import Moran, Moran_Local
import splot
from splot.esda import moran_scatterplot, plot_moran, lisa_cluster, plot_local_autocorrelation
from splot.libpysal import plot_spatial_weights
from giddy.directional import Rose
import statsmodels.api as sm
import statsmodels.formula.api as smf
from spreg import OLS
from spreg import MoranRes
from spreg import ML_Lag
from spreg import ML_Error
from mgwr.gwr import GWR, MGWR
from mgwr.sel_bw import Sel_BW
from mgwr.utils import shift_colormap, truncate_colormap
import warnings
warnings.filterwarnings('ignore')
Import data
# Print a summary of the dataset (presumably a pandas DataFrame).
# NOTE(review): `df` is not created anywhere in the visible source; the
# data-loading step appears to be missing from this section — confirm.
df.info()
Prepare data
# Store the year as text; the filters below compare it against string literals.
df["year"] = df["year"].astype(str)

# Subnational sample: every row whose level is not 'National'.
df1 = df.loc[df["level"] != 'National']

# Single-year cross-sections for 1990 and 2019.
# NOTE(review): both are filtered from `df`, not `df1`, so rows with
# level == 'National' are still included despite the `df1in...` names —
# confirm this is intended before changing it (the map merge may rely on it).
df1in1990 = df.loc[df["year"] == '1990']
df1in2019 = df.loc[df["year"] == '2019']

# National sample: only the rows whose level is 'National'.
df2 = df.loc[df["level"] == 'National']
Descriptive statistics
# Summary statistics of the four index columns over all rows of df1.
df1.loc[:, ['shdi', 'healthindex', 'incindex', 'edindex']].describe().round(2)

# Mean, standard deviation, minimum and maximum of shdi per subcontinent in 2019.
(
    df1[df1["year"] == '2019']
    .groupby('subcontinent')['shdi']
    .agg(['mean', 'std', 'min', 'max'])
    .round(2)
)

# Same summary statistics, restricted to South America in 2019.
(
    df1.query("year == '2019' and subcontinent == 'South America'")
    .loc[:, ['shdi', 'healthindex', 'incindex', 'edindex']]
    .describe()
    .round(2)
)

# Region-by-region index values for Bolivia in 2019.
(
    df1.query("year == '2019' and country == 'Bolivia'")
    .loc[:, ['region', 'shdi', 'healthindex', 'incindex', 'edindex']]
    .round(2)
)
Exploratory data analysis (EDA)
Cross-section dynamics
Strip plot
# Animated strip plot (one frame per year) of shdi by continent, using the
# national rows in df2.
px.strip(
    df2,
    x="shdi",
    y="continentName",
    color="continentName",
    range_x=[0, 1],
    animation_frame="year",
    hover_name="countryName2",
    hover_data=["countryName2"],
    title="Dynamics of human development across countries classified by continent",
    labels={
        "continentName": "Continent",
        "shdi": "Subnational human development index",
    },
)

# Same layout for the subnational rows in df1; the hover title is the region
# and the country name is shown as extra hover data.
px.strip(
    df1,
    x="shdi",
    y="continentName",
    color="continentName",
    range_x=[0, 1],
    animation_frame="year",
    hover_name="region",
    hover_data=["countryName2"],
    title="Dynamics of human development across subnational regions classified by continent",
    labels={
        "continentName": "Continent",
        "countryName2": "Country",
        "shdi": "Subnational human development index",
    },
)

# Subnational rows of China, Bolivia and Japan only, one strip per country.
px.strip(
    df1[df1["country"].isin(["China", "Bolivia", "Japan"])],
    x="shdi",
    y="country",
    color="country",
    range_x=[0.3, 1],
    animation_frame="year",
    hover_name="region",
    hover_data=["countryName2"],
    title="Dynamics of human development across subnational regions of selected countries",
    labels={
        "continentName": "Continent",
        "country": "Country",
        "shdi": "Subnational human development index",
    },
)
Histogram
# Animated histogram of shdi coloured by subcontinent; rows with a missing
# subcontinent are dropped first. The marginal plot is a rug ('box' is the
# alternative noted by the author).
px.histogram(
    df1[df1["subcontinent"].notna()],
    x="shdi",
    color="subcontinent",
    nbins=80,
    range_x=[0.15, 1],
    marginal="rug",
    animation_frame="year",
    hover_name="region",
    hover_data=["countryName2"],
    title="Distribution of human development across subnational regions classified by subcontinent",
    labels={
        "subcontinent": "Subcontinent",
        "shdi": "Subnational human development index",
    },
)
ECDF
# Animated empirical CDF of shdi, one curve per continent; rows with a missing
# continent name are dropped first.
# Options left disabled by the author: markers=True, lines=False, and a
# "histogram" marginal instead of the rug.
px.ecdf(
    df1[df1["continentName"].notna()],
    x="shdi",
    color="continentName",
    range_x=[0.15, 1],
    marginal="rug",
    animation_frame="year",
    hover_name="region",
    hover_data=["country"],
    title="Cumulative distribution of human development across subnational regions classified by continent",
    labels={
        "continentName": "Continent",
        "shdi": "Subnational human development index",
    },
)
Boxplot
# Animated horizontal box plots of shdi, one box per subcontinent; rows with a
# missing subcontinent are dropped first.
px.box(
    df1[df1["subcontinent"].notna()],
    x="shdi",
    y="subcontinent",
    color="subcontinent",
    range_x=[0.15, 1],
    animation_frame="year",
    hover_name="region",
    hover_data=["countryName2"],
    title="Distribution of human development across subnational regions classified by subcontinent",
    labels={
        "subcontinent": "Sub-continent",
        "countryName2": "Country name",
        "shdi": "Subnational human development index",
    },
)
Maps
# Animated world choropleth of shdi for the national rows (df2), keyed by the
# iso3 column and drawn with the Plasma colour scale.
px.choropleth(
    df2,
    locations="iso3",
    color="shdi",
    color_continuous_scale=px.colors.sequential.Plasma,
    projection="natural earth",
    animation_frame="year",
    hover_name="country",
    title="Spatial distribution of human development across countries",
)
# Read the subnational boundary layer (GeoJSON) from the remote gist.
subnationalMap = gpd.read_file(
    "https://gist.github.com/cmg777/81b6e2b1ea5865c09daf3b2bbf35bbf8/raw/5e0a0370f453421bc503594cfae1e466c89e449f/GDL%2520Shapefiles%2520V4.geojson"
)
subnationalMap.info()

# Left-join the 2019 records onto the boundaries by GDLcode. Columns present on
# both sides keep the map's copy unsuffixed; the data-side copy gets '_y'.
gdf2019 = subnationalMap.merge(
    df1in2019,
    how='left',
    on='GDLcode',
    suffixes=('', '_y'),
)
# Discard the duplicated '_y' columns in place.
gdf2019.drop(columns=gdf2019.filter(regex='_y$').columns.tolist(), inplace=True)
gdf2019.info()

# Static map of shdi in 2019, classified with the 'BoxPlot' scheme.
fig, ax = plt.subplots(figsize=(12, 8))
gdf2019.plot(
    ax=ax,
    column="shdi",
    scheme='BoxPlot',
    cmap='coolwarm',
    legend=True,
    legend_kwds={'bbox_to_anchor': (1.00, 0.65)},
)
plt.title('Spatial distribution of human development in 2019: \nWhich regions are below/above the median?')
plt.tight_layout()
ax.axis("off")
# To save the figure: plt.savefig('myMap.png',dpi=300, bbox_inches='tight')
plt.show()
Time series evolution
Continents
# One line per country (national rows, df2) on a log-scaled y axis, with one
# facet per continent laid out two per row. The author's note suggests also
# comparing the tr6_shdi column with shdi.
fig = px.line(
    df2,
    x="year",
    y="shdi",
    color="country",
    log_y=True,
    facet_col="continentName",
    facet_col_wrap=2,
    facet_row_spacing=0.01,
    height=1800,
    hover_name="country",
    hover_data=["country"],
    title="Evolution of human development across countries classified by continent",
    labels={
        "shdi": "HDI",
        "continentName": "Continent",
        "country": "Country",
        "year": "Year",
    },
)
# The legend is hidden; there is one entry per line.
fig.update_layout(showlegend=False)

# Same layout for the subnational rows (df1): one line per region.
fig = px.line(
    df1,
    x="year",
    y="shdi",
    color="region",
    log_y=True,
    facet_col="continentName",
    facet_col_wrap=2,
    facet_row_spacing=0.01,
    height=1800,
    hover_name="region",
    hover_data=["country"],
    title="Evolution of human development across subnational regions classified by continent",
    labels={
        "shdi": "HDI",
        "continentName": "Continent",
        "subcontinent": "Subcontinent",
        "year": "Year",
    },
)
fig.update_layout(showlegend=False)
Subcontinents
# One line per country (national rows, df2) on a log-scaled y axis, with one
# facet per subcontinent laid out three per row. Rows with a missing
# subcontinent are dropped first.
# The title now says "subcontinent" to match the facet variable; it previously
# said "continent", carried over from the continent-level figure.
fig = px.line(
    df2[df2['subcontinent'].notna()],
    x="year",
    y="shdi",
    log_y=True,
    color="country",
    hover_name="subcontinent",
    hover_data=['country'],
    title="Evolution of human development across countries classified by subcontinent",
    labels=dict(shdi="HDI",
                subcontinent="Subcontinent",
                year="Year"),
    facet_col="subcontinent",
    facet_col_wrap=3,
    facet_row_spacing=0.01,
    height=2400
)
# The legend is hidden; there is one entry per line.
fig.update_layout(showlegend=False)

# Same layout for the subnational rows (df1): one line per region, with the
# title corrected to "subcontinent" as above.
fig = px.line(
    df1[df1['subcontinent'].notna()],
    x="year",
    y="shdi",
    log_y=True,
    color="region",
    hover_name="region",
    hover_data=['country', 'continentName', 'subcontinent'],
    title="Evolution of human development across subnational regions classified by subcontinent",
    labels=dict(shdi="HDI",
                subcontinent="Subcontinent",
                year="Year"),
    facet_col="subcontinent",
    facet_col_wrap=3,
    facet_row_spacing=0.01,
    height=2400
)
fig.update_layout(showlegend=False)
Countries
# National series (df2) for China, Bolivia and Japan: one facet per country on
# a log-scaled y axis.
# Left disabled by the author: facet_row_spacing = 0.01, height = 2400.
fig = px.line(
    df2[df2["country"].isin(["China", "Bolivia", "Japan"])],
    x="year",
    y="shdi",
    color="country",
    log_y=True,
    facet_col="country",
    facet_col_wrap=3,
    hover_name="subcontinent",
    hover_data=["country"],
    title="Evolution of human development in selected countries",
    labels={
        "shdi": "Human development index",
        "country": "Country",
        "year": "Year",
    },
)
# The legend is hidden; there is one entry per line.
fig.update_layout(showlegend=False)

# Subnational series (df1) for the same three countries: one line per region.
# Left disabled by the author: facet_row_spacing = 0.01, height = 2400.
fig = px.line(
    df1[df1["country"].isin(["China", "Bolivia", "Japan"])],
    x="year",
    y="shdi",
    color="region",
    log_y=True,
    facet_col="country",
    facet_col_wrap=3,
    hover_name="subcontinent",
    hover_data=["country"],
    title="Evolution of human development across subnational regions of selected countries",
    labels={
        "shdi": "Human development index",
        "country": "Country",
        "year": "Year",
    },
)
fig.update_layout(showlegend=False)
Correlations over time
Continents
# Animated scatter of lifexp against lgnic for the national rows (df2), coloured
# by continent, with an OLS trendline per colour group.
# Options left disabled by the author: size="pop" with size_max=60,
# trendline_scope="overall", marginal_x='box', marginal_y='box'.
px.scatter(
    df2,
    x="lgnic",
    y="lifexp",
    color="continentName",
    range_x=[5.5, 12.5],
    range_y=[30, 90],
    trendline="ols",
    animation_frame="year",
    hover_name="country",
    hover_data=["country"],
    title="The relationship between income and health across countries classified by continent",
    labels={
        "country": "Country",
        "continentName": "Continent",
        "lgnic": "Log of gross national income per capita",
        "lifexp": "Life Expectancy (years)",
    },
)

# Same chart for the subnational rows (df1); the hover title is the region.
# The same optional arguments are left disabled here.
px.scatter(
    df1,
    x="lgnic",
    y="lifexp",
    color="continentName",
    range_x=[5.5, 12.5],
    range_y=[30, 90],
    trendline="ols",
    animation_frame="year",
    hover_name="region",
    hover_data=["country"],
    title="Relationship between income and health across subnational regions classified by continent",
    labels={
        "country": "Country",
        "continentName": "Continent",
        "lgnic": "Log of gross national income per capita",
        "lifexp": "Life Expectancy (years)",
    },
)
Subcontinents
# Animated scatter of lifexp against lgnic for the national rows (df2), coloured
# by subcontinent, with an OLS trendline per colour group. Rows with a missing
# subcontinent are dropped first.
# Options left disabled by the author: size="pop" with size_max=60,
# trendline_scope="overall", marginal_x='box', marginal_y='box'.
px.scatter(
    df2[df2["subcontinent"].notna()],
    x="lgnic",
    y="lifexp",
    color="subcontinent",
    range_x=[5.5, 12.5],
    range_y=[30, 90],
    trendline="ols",
    animation_frame="year",
    hover_name="country",
    hover_data=["country"],
    title="Relationship between income and health across countries classified by subcontinent",
    labels={
        "country": "Country",
        "subcontinent": "Subcontinent",
        "lgnic": "Log of gross national income per capita",
        "lifexp": "Life Expectancy (years)",
    },
)

# Same chart for the subnational rows (df1); the hover title is the region.
# The same optional arguments are left disabled here.
px.scatter(
    df1[df1["subcontinent"].notna()],
    x="lgnic",
    y="lifexp",
    color="subcontinent",
    range_x=[5.5, 12.5],
    range_y=[30, 90],
    trendline="ols",
    animation_frame="year",
    hover_name="region",
    hover_data=["country"],
    title="Relationship between income and health across subnational regions classified by subcontinent",
    labels={
        "country": "Country",
        "subcontinent": "Subcontinent",
        "lgnic": "Log of gross national income per capita",
        "lifexp": "Life Expectancy (years)",
    },
)
Countries
# Animated scatter of lifexp against lgnic for the subnational rows of China,
# Bolivia and Japan, coloured by country, with an OLS trendline per country.
# Options left disabled by the author: size="pop" with size_max=60,
# marginal_x='box', marginal_y='box'.
px.scatter(
    df1[df1["country"].isin(["China", "Bolivia", "Japan"])],
    x="lgnic",
    y="lifexp",
    color="country",
    range_x=[6, 11],
    range_y=[50, 90],
    trendline="ols",
    animation_frame="year",
    hover_name="region",
    hover_data=["country"],
    title="Relationship between income and health across subnational regions in selected countries",
    labels={
        "country": "Country",
        "lgnic": "Log of gross national income per capita",
        "lifexp": "Life Expectancy (years)",
    },
)
Export data
Panel data
# Keep the df1 rows whose subcontinent is 'South America' and renumber the
# index from zero.
southAmerica = df1[df1["subcontinent"] == 'South America'].reset_index(drop=True)
southAmerica
# Write the panel to CSV without the index column.
southAmerica.to_csv('southAmerica.csv', index=False)
Digital map
# Inspect the merged 2019 layer and list the subcontinent values it contains.
gdf2019.info()
gdf2019["subcontinent"].unique()

# Keep the features whose subcontinent is 'South America', renumber the index
# from zero, and write the result as GeoJSON.
southAmerica2019 = gdf2019[gdf2019["subcontinent"] == 'South America'].reset_index(drop=True)
southAmerica2019
southAmerica2019.to_file("southAmerica2019.geojson", driver='GeoJSON')