Science is ...
How can we stand on the shoulders of giants?
We could use scientific data repositories
What is the human development index?
Let's access the subnational human development dataset
Recommended papers based on this dataset
Understand the contents of the dataset
Data science on the shoulders of giants
Setup
# Load general libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import chart_studio
#chart_studio.tools.set_credentials_file(username='econdata777', api_key='ADDhere')
import chart_studio.plotly as save2cs
import statsmodels.api as sm
import statsmodels.formula.api as smf
# Load spatial analysis libraries
import geopandas as gpd
import libpysal
from libpysal import weights
from libpysal.weights import Queen
import esda
from esda.moran import Moran, Moran_Local
from giddy.directional import Rose
import splot
from splot.esda import moran_scatterplot, plot_moran, lisa_cluster, plot_local_autocorrelation
from splot.libpysal import plot_spatial_weights
from spreg import OLS
from spreg import MoranRes
from spreg import ML_Lag
from spreg import ML_Error
from mgwr.gwr import GWR, MGWR
from mgwr.sel_bw import Sel_BW
from mgwr.utils import shift_colormap, truncate_colormap
import warnings
warnings.filterwarnings('ignore')
Import data
df.info()
df
map = gpd.read_file("https://gist.github.com/cmg777/81b6e2b1ea5865c09daf3b2bbf35bbf8/raw/5e0a0370f453421bc503594cfae1e466c89e449f/GDL%2520Shapefiles%2520V4.geojson")
map.plot('shdi');
map.info()
map.head(9)
fig, ax = plt.subplots(figsize=(12,8))
map.plot(column="shdi", scheme='BoxPlot', cmap='coolwarm', legend=True, ax=ax, legend_kwds={'bbox_to_anchor':(1.00, 0.70)})
plt.title('Spatial distribution of the subnational HDI: \nWhich regions are below/above the median?')
plt.tight_layout()
ax.axis("off")
#plt.savefig('myMap.png',dpi=150, bbox_inches='tight')
plt.show()
map['shdi'].describe().round(2)
Prepare data
# Remove national observations
df1 = df[df.level != 'National']
df1
df.columns
df["year"]= df["year"].astype(str)
df1990 = df[df['year'] == '1990']
df2019 = df[df['year'] == '2019']
df2019['Continent'].unique()
df2019['SubContinent'].unique()
df2019['country'].unique()
Descriptive statistics
df1.loc[:, df1.columns != 'year'].describe().round(2)
df1990.describe().round(2)
df2019.describe().round(2)
df2019.query("country == 'China'").describe().round(2)
Regional differences
Strip plot
px.strip(df1,
x = 'shdi',
y = 'continent2',
range_x= [0, 1],
color = 'continent2',
hover_name= 'region',
hover_data = ['country'],
animation_frame= 'year',
labels=dict(continent2 = "Continent",
shdi ="Subnational human development index")
)
px.strip(df1,
x = 'shdi',
y = 'SubContinent',
range_x= [0, 1],
color = 'SubContinent',
hover_name= 'region',
hover_data = ['country'],
animation_frame= 'year',
labels=dict(SubContinent = "Sub-continent",
shdi ="Subnational human development index")
)
px.strip(df1[df1['country'].isin(['China', 'Bolivia', 'Japan'])], x = 'shdi', y = 'country', color = 'country', hover_name= 'region', hover_data = ['country'], range_x = [0.3, 1], animation_frame = 'year')
Box plot
px.box(df1,
x = 'shdi',
y = 'continent2',
range_x= [0.15, 1],
color = 'continent2',
hover_name= 'region',
hover_data = ['country'],
animation_frame= 'year',
labels=dict(continent2 = "Continent",
shdi ="Subnational human development index")
)
px.box(df1,
x = 'shdi',
y = 'SubContinent',
range_x= [0.15, 1],
color = 'SubContinent',
hover_name= 'region',
hover_data = ['country'],
animation_frame= 'year',
labels=dict(SubContinent = "Sub-continent",
shdi ="Subnational human development index")
)
Histogram
px.histogram(df1,
x = 'shdi',
range_x= [0.15, 1],
nbins= 50,
color = 'continent2',
hover_name= 'region',
hover_data= ['country'],
marginal='box', # rug is also a good option
animation_frame = 'year',
labels=dict(continent2 = "Continent",
shdi ="Subnational human development index")
)
px.histogram(df1,
x = 'shdi',
range_x= [0.15, 1],
nbins= 80,
color = 'SubContinent',
hover_name= 'region',
hover_data= ['country'],
marginal='box', # rug is also a good option
animation_frame = 'year',
labels=dict(SubContinent = "Continent",
shdi ="Subnational human development index")
)
ECDF
px.ecdf(df1,
x="shdi",
range_x= [0.15, 1],
color="continent2",
hover_name= 'region',
hover_data= ['country'],
#markers=True,
#lines=False,
marginal="rug", # histogram
animation_frame = 'year',
labels=dict(continent2 = "Continent",
shdi ="Subnational human development index")
)
Time series
Continents
fig = px.line(
df1,
x="year",
y="shdi",
log_y= True,
color="region",
hover_name="continent2",
hover_data= ['country'],
labels=dict(shdi="HDI",
continent2 ="Continent",
year = ""),
facet_col="continent2",
facet_col_wrap = 2,
facet_row_spacing = 0.01,
height= 1800
)
fig.update_layout(showlegend=False)
Subcontinents
fig = px.line(
df1,
x="year",
y="shdi",
log_y= True,
color="region",
hover_name="SubContinent",
hover_data= ['country'],
labels=dict(shdi="HDI",
SubContinent ="Subcontinent",
year = ""),
facet_col="SubContinent",
facet_col_wrap = 3,
facet_row_spacing = 0.01,
height= 2400
)
fig.update_layout(showlegend=False)
Countries
fig = px.line(
df1[df1['country'].isin(['China', 'Bolivia', 'Japan'])],
x="year",
y="shdi",
log_y= True,
color="region",
hover_name="SubContinent",
hover_data= ['country'],
labels=dict(shdi="Human development index",
country ="Country",
year = ""),
facet_col="country",
facet_col_wrap = 3,
#facet_row_spacing = 0.01,
#height= 2400
)
fig.update_layout(showlegend=False)
Correlations
px.scatter(df1,
y = "lifexp",
x = "lgnic",
range_y= [20, 90],
range_x= [5, 13],
hover_name = "region",
hover_data= ['country'],
color = "continent2",
#size = "pop", size_max = 60,
trendline= 'ols',
#trendline_scope= "overall",
#marginal_x= 'box',
#marginal_y= 'box',
animation_frame= 'year',
labels=dict(country = "Country",
continent2 = "Continent",
lgnic ="Log of gross national income per capita",
lifexp="Life Expectancy (years)")
)
px.scatter(df1[df1['country'].isin(['China', 'Bolivia', 'Japan'])],
y = "lifexp",
x = "lgnic",
range_y= [50, 90],
range_x= [6, 11],
hover_name = "region",
hover_data= ['country'],
color = "country",
#size = "pop", size_max = 60,
trendline= 'ols',
#marginal_x= 'box',
#marginal_y= 'box',
animation_frame= 'year',
labels=dict(country = "Country",
lgnic ="Log of gross national income per capita",
lifexp="Life Expectancy (years)")
)
px.scatter(df1,
y = "shdi",
x = "lgnic",
#range_y= [0.3, 1],
#range_x= [6, 16],
hover_name = "region",
hover_data= ['country'],
color = "continent2",
#size = "pop", size_max = 60,
trendline= 'ols',
#marginal_x= 'box',
#marginal_y= 'box',
animation_frame= 'year',
labels=dict(country = "Country",
continent2 = "Continent",
lgnic ="Log of gross national income per capita",
shdi="Human development index")
)
px.scatter(df1[df1['country'].isin(['China', 'Bolivia', 'Japan'])],
y = "shdi",
x = "lgnic",
range_y= [0.3, 1],
#range_x= [6, 16],
hover_name = "region",
hover_data= ['country'],
color = "country",
#size = "pop", size_max = 60,
trendline= 'ols',
#marginal_x= 'box',
#marginal_y= 'box',
animation_frame= 'year',
labels=dict(country = "Country",
lgnic ="Log of gross national income per capita",
shdi="Human development index")
)
Maps
gdf1990 = map.merge(df1990, on = 'GDLcode', how = 'left')
gdf2019 = map.merge(df2019, on = 'GDLcode', how = 'left')
gdf2019.columns
fig, ax = plt.subplots(figsize=(12,8))
gdf1990.plot(column="shdi_y",
scheme='BoxPlot',
cmap='coolwarm',
legend=True,
ax=ax,
legend_kwds={'bbox_to_anchor':(1.00, 0.65)})
plt.title('Spatial distribution of human development in 1990: \nWhich regions are below/above the median?')
plt.tight_layout()
ax.axis("off")
#plt.savefig('myMap.png',dpi=300, bbox_inches='tight')
plt.show()
fig, ax = plt.subplots(figsize=(12,8))
gdf2019.plot(column="shdi_y",
scheme='BoxPlot',
cmap='coolwarm',
legend=True,
ax=ax,
legend_kwds={'bbox_to_anchor':(1.00, 0.65)})
plt.title('Spatial distribution of human development in 2019: \nWhich regions are below/above the median?')
plt.tight_layout()
ax.axis("off")
#plt.savefig('myMap.png',dpi=300, bbox_inches='tight')
plt.show()
fig, ax = plt.subplots(figsize=(12,8))
gdf2019.query("continent2 == 'Africa'").plot(column="shdi_y",
scheme='BoxPlot',
cmap='coolwarm',
legend=True,
ax=ax,
legend_kwds={'bbox_to_anchor':(1.00, 0.65)})
plt.title('Spatial distribution of human development in 2019: \nWhich regions are below/above the median?')
plt.tight_layout()
ax.axis("off")
#plt.savefig('myMap.png',dpi=300, bbox_inches='tight')
plt.show()
Spatial dependence
gdf2019Africa = gdf2019.query("continent == 'Africa'").reset_index(drop=True)
#W = weights.KNN.from_dataframe(gdf2019Africa, k=4)
W = weights.KNN.from_dataframe(gdf2019Africa, k=6)
#W = weights.KNN.from_dataframe(ggdf2019Africa, k=8)
#W = weights.Queen.from_dataframe(gdf2019Africa)
#W = weights.Rook.from_dataframe(gdf2019Africa)
#>>>> Compute inverse distance squared based on maximum nearest neighbor distance between the n observations
#maxMin = weights.min_threshold_dist_from_shapefile("nepal.shp")
#W = weights.DistanceBand.from_dataframe(nepal, threshold = maxMin, binary=False, alpha=-2)
# >>> Row-standardize W
W.transform = 'r'
plot_spatial_weights(W, gdf2019Africa);
gdf2019Africa['Wshdi_y'] = weights.lag_spatial(W, gdf2019Africa['shdi_y'])
globalMoran = Moran(gdf2019Africa['shdi_y'], W)
MoranI = globalMoran.I
MoranI = "{:.2f}".format(MoranI)
localMoran = Moran_Local(gdf2019Africa['shdi_y'], W, permutations = 999, seed=12345)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,5))
moran_scatterplot(localMoran, p=0.10, aspect_equal=False, zstandard = False, ax=axes[0])
lisa_cluster(localMoran, gdf2019Africa, p=0.10, legend_kwds={'bbox_to_anchor':(0.02, 0.90)}, ax=axes[1])
#cx.add_basemap(axes[1], crs=gdf.crs.to_string(), source=cx.providers.Stamen.TonerLite, attribution=False)
#cx.add_basemap(axes[1], crs=gdf.crs.to_string(), source=cx.providers.Stamen.TonerLabels, attribution=False)
axes[0].set_xlabel('Indicator')
axes[0].set_ylabel('Spatial lag of indicator')
axes[0].set_title(f"(a) Moran scatterplot (Moran's I = {MoranI})")
axes[1].set_title("(b) Spatial clusters and outliers (p<0.10)")
plt.show()
gdf2019Africa.explore(
column= 'shdi_y',
tooltip=['shdi_y','country_y', 'region_y'],
k = 3,
scheme='FisherJenks',
cmap='coolwarm',
legend=True,
tiles='CartoDB dark_matter',
style_kwds =dict(color="gray", weight=0.4),
legend_kwds=dict(colorbar=False)
)
px.scatter(
gdf2019Africa,
x='shdi_y',
y='Wshdi_y',
hover_name="region_y",
hover_data = ['shdi_y', 'Wshdi_y', 'country_y'],
trendline="ols",
marginal_x="box",
marginal_y="box")