Science is ...

How can we stand on the shoulders of giants?

We could use scientific data repositories

What is the human development index?

Let's access the subnational human development dataset

Recommended papers based on this dataset

Understand the contents of the dataset

Data science on the shoulders of giants

Setup

# Load general libraries import numpy as np import pandas as pd pd.set_option('display.max_columns', None) import matplotlib.pyplot as plt import seaborn as sns import plotly.express as px import plotly.graph_objects as go import chart_studio #chart_studio.tools.set_credentials_file(username='econdata777', api_key='ADDhere') import chart_studio.plotly as save2cs import statsmodels.api as sm import statsmodels.formula.api as smf # Load spatial analysis libraries import geopandas as gpd import libpysal from libpysal import weights from libpysal.weights import Queen import esda from esda.moran import Moran, Moran_Local from giddy.directional import Rose import splot from splot.esda import moran_scatterplot, plot_moran, lisa_cluster, plot_local_autocorrelation from splot.libpysal import plot_spatial_weights from spreg import OLS from spreg import MoranRes from spreg import ML_Lag from spreg import ML_Error from mgwr.gwr import GWR, MGWR from mgwr.sel_bw import Sel_BW from mgwr.utils import shift_colormap, truncate_colormap import warnings warnings.filterwarnings('ignore')

Import data

df.info()

df

map = gpd.read_file("https://gist.github.com/cmg777/81b6e2b1ea5865c09daf3b2bbf35bbf8/raw/5e0a0370f453421bc503594cfae1e466c89e449f/GDL%2520Shapefiles%2520V4.geojson")

map.plot('shdi');

map.info()

map.head(9)

fig, ax = plt.subplots(figsize=(12,8)) map.plot(column="shdi", scheme='BoxPlot', cmap='coolwarm', legend=True, ax=ax, legend_kwds={'bbox_to_anchor':(1.00, 0.70)}) plt.title('Spatial distribution of the subnational HDI: \nWhich regions are below/above the median?') plt.tight_layout() ax.axis("off") #plt.savefig('myMap.png',dpi=150, bbox_inches='tight') plt.show()

map['shdi'].describe().round(2)

Prepare data

# Remove national observations df1 = df[df.level != 'National'] df1

df.columns

df["year"]= df["year"].astype(str)

df1990 = df[df['year'] == '1990']

df2019 = df[df['year'] == '2019']

df2019['Continent'].unique()

df2019['SubContinent'].unique()

df2019['country'].unique()

Descriptive statistics

df1.loc[:, df1.columns != 'year'].describe().round(2)

df1990.describe().round(2)

df2019.describe().round(2)

df2019.query("country == 'China'").describe().round(2)

Regional differences

Strip plot

px.strip(df1, x = 'shdi', y = 'continent2', range_x= [0, 1], color = 'continent2', hover_name= 'region', hover_data = ['country'], animation_frame= 'year', labels=dict(continent2 = "Continent", shdi ="Subnational human development index") )

px.strip(df1, x = 'shdi', y = 'SubContinent', range_x= [0, 1], color = 'SubContinent', hover_name= 'region', hover_data = ['country'], animation_frame= 'year', labels=dict(SubContinent = "Sub-continent", shdi ="Subnational human development index") )

px.strip(df1[df1['country'].isin(['China', 'Bolivia', 'Japan'])], x = 'shdi', y = 'country', color = 'country', hover_name= 'region', hover_data = ['country'], range_x = [0.3, 1], animation_frame = 'year')

Box plot

px.box(df1, x = 'shdi', y = 'continent2', range_x= [0.15, 1], color = 'continent2', hover_name= 'region', hover_data = ['country'], animation_frame= 'year', labels=dict(continent2 = "Continent", shdi ="Subnational human development index") )

px.box(df1, x = 'shdi', y = 'SubContinent', range_x= [0.15, 1], color = 'SubContinent', hover_name= 'region', hover_data = ['country'], animation_frame= 'year', labels=dict(SubContinent = "Sub-continent", shdi ="Subnational human development index") )

Histogram

px.histogram(df1, x = 'shdi', range_x= [0.15, 1], nbins= 50, color = 'continent2', hover_name= 'region', hover_data= ['country'], marginal='box', # rug is also a good option animation_frame = 'year', labels=dict(continent2 = "Continent", shdi ="Subnational human development index") )

px.histogram(df1, x = 'shdi', range_x= [0.15, 1], nbins= 80, color = 'SubContinent', hover_name= 'region', hover_data= ['country'], marginal='box', # rug is also a good option animation_frame = 'year', labels=dict(SubContinent = "Continent", shdi ="Subnational human development index") )

ECDF

px.ecdf(df1, x="shdi", range_x= [0.15, 1], color="continent2", hover_name= 'region', hover_data= ['country'], #markers=True, #lines=False, marginal="rug", # histogram animation_frame = 'year', labels=dict(continent2 = "Continent", shdi ="Subnational human development index") )

Time series

Continents

fig = px.line( df1, x="year", y="shdi", log_y= True, color="region", hover_name="continent2", hover_data= ['country'], labels=dict(shdi="HDI", continent2 ="Continent", year = ""), facet_col="continent2", facet_col_wrap = 2, facet_row_spacing = 0.01, height= 1800 ) fig.update_layout(showlegend=False)

Subcontinents

fig = px.line( df1, x="year", y="shdi", log_y= True, color="region", hover_name="SubContinent", hover_data= ['country'], labels=dict(shdi="HDI", SubContinent ="Subcontinent", year = ""), facet_col="SubContinent", facet_col_wrap = 3, facet_row_spacing = 0.01, height= 2400 ) fig.update_layout(showlegend=False)

Countries

fig = px.line( df1[df1['country'].isin(['China', 'Bolivia', 'Japan'])], x="year", y="shdi", log_y= True, color="region", hover_name="SubContinent", hover_data= ['country'], labels=dict(shdi="Human development index", country ="Country", year = ""), facet_col="country", facet_col_wrap = 3, #facet_row_spacing = 0.01, #height= 2400 ) fig.update_layout(showlegend=False)

Correlations

px.scatter(df1, y = "lifexp", x = "lgnic", range_y= [20, 90], range_x= [5, 13], hover_name = "region", hover_data= ['country'], color = "continent2", #size = "pop", size_max = 60, trendline= 'ols', #trendline_scope= "overall", #marginal_x= 'box', #marginal_y= 'box', animation_frame= 'year', labels=dict(country = "Country", continent2 = "Continent", lgnic ="Log of gross national income per capita", lifexp="Life Expectancy (years)") )

px.scatter(df1[df1['country'].isin(['China', 'Bolivia', 'Japan'])], y = "lifexp", x = "lgnic", range_y= [50, 90], range_x= [6, 11], hover_name = "region", hover_data= ['country'], color = "country", #size = "pop", size_max = 60, trendline= 'ols', #marginal_x= 'box', #marginal_y= 'box', animation_frame= 'year', labels=dict(country = "Country", lgnic ="Log of gross national income per capita", lifexp="Life Expectancy (years)") )

px.scatter(df1, y = "shdi", x = "lgnic", #range_y= [0.3, 1], #range_x= [6, 16], hover_name = "region", hover_data= ['country'], color = "continent2", #size = "pop", size_max = 60, trendline= 'ols', #marginal_x= 'box', #marginal_y= 'box', animation_frame= 'year', labels=dict(country = "Country", continent2 = "Continent", lgnic ="Log of gross national income per capita", shdi="Human development index") )

px.scatter(df1[df1['country'].isin(['China', 'Bolivia', 'Japan'])], y = "shdi", x = "lgnic", range_y= [0.3, 1], #range_x= [6, 16], hover_name = "region", hover_data= ['country'], color = "country", #size = "pop", size_max = 60, trendline= 'ols', #marginal_x= 'box', #marginal_y= 'box', animation_frame= 'year', labels=dict(country = "Country", lgnic ="Log of gross national income per capita", shdi="Human development index") )

Maps

gdf1990 = map.merge(df1990, on = 'GDLcode', how = 'left') gdf2019 = map.merge(df2019, on = 'GDLcode', how = 'left')

gdf2019.columns

fig, ax = plt.subplots(figsize=(12,8)) gdf1990.plot(column="shdi_y", scheme='BoxPlot', cmap='coolwarm', legend=True, ax=ax, legend_kwds={'bbox_to_anchor':(1.00, 0.65)}) plt.title('Spatial distribution of human development in 1990: \nWhich regions are below/above the median?') plt.tight_layout() ax.axis("off") #plt.savefig('myMap.png',dpi=300, bbox_inches='tight') plt.show()

fig, ax = plt.subplots(figsize=(12,8)) gdf2019.plot(column="shdi_y", scheme='BoxPlot', cmap='coolwarm', legend=True, ax=ax, legend_kwds={'bbox_to_anchor':(1.00, 0.65)}) plt.title('Spatial distribution of human development in 2019: \nWhich regions are below/above the median?') plt.tight_layout() ax.axis("off") #plt.savefig('myMap.png',dpi=300, bbox_inches='tight') plt.show()

fig, ax = plt.subplots(figsize=(12,8)) gdf2019.query("continent2 == 'Africa'").plot(column="shdi_y", scheme='BoxPlot', cmap='coolwarm', legend=True, ax=ax, legend_kwds={'bbox_to_anchor':(1.00, 0.65)}) plt.title('Spatial distribution of human development in 2019: \nWhich regions are below/above the median?') plt.tight_layout() ax.axis("off") #plt.savefig('myMap.png',dpi=300, bbox_inches='tight') plt.show()

Spatial dependence

gdf2019Africa = gdf2019.query("continent == 'Africa'").reset_index(drop=True)

#W = weights.KNN.from_dataframe(gdf2019Africa, k=4) W = weights.KNN.from_dataframe(gdf2019Africa, k=6) #W = weights.KNN.from_dataframe(ggdf2019Africa, k=8) #W = weights.Queen.from_dataframe(gdf2019Africa) #W = weights.Rook.from_dataframe(gdf2019Africa) #>>>> Compute inverse distance squared based on maximum nearest neighbor distance between the n observations #maxMin = weights.min_threshold_dist_from_shapefile("nepal.shp") #W = weights.DistanceBand.from_dataframe(nepal, threshold = maxMin, binary=False, alpha=-2) # >>> Row-standardize W W.transform = 'r'

plot_spatial_weights(W, gdf2019Africa);

gdf2019Africa['Wshdi_y'] = weights.lag_spatial(W, gdf2019Africa['shdi_y'])

globalMoran = Moran(gdf2019Africa['shdi_y'], W) MoranI = globalMoran.I MoranI = "{:.2f}".format(MoranI) localMoran = Moran_Local(gdf2019Africa['shdi_y'], W, permutations = 999, seed=12345)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,5)) moran_scatterplot(localMoran, p=0.10, aspect_equal=False, zstandard = False, ax=axes[0]) lisa_cluster(localMoran, gdf2019Africa, p=0.10, legend_kwds={'bbox_to_anchor':(0.02, 0.90)}, ax=axes[1]) #cx.add_basemap(axes[1], crs=gdf.crs.to_string(), source=cx.providers.Stamen.TonerLite, attribution=False) #cx.add_basemap(axes[1], crs=gdf.crs.to_string(), source=cx.providers.Stamen.TonerLabels, attribution=False) axes[0].set_xlabel('Indicator') axes[0].set_ylabel('Spatial lag of indicator') axes[0].set_title(f"(a) Moran scatterplot (Moran's I = {MoranI})") axes[1].set_title("(b) Spatial clusters and outliers (p<0.10)") plt.show()

gdf2019Africa.explore( column= 'shdi_y', tooltip=['shdi_y','country_y', 'region_y'], k = 3, scheme='FisherJenks', cmap='coolwarm', legend=True, tiles='CartoDB dark_matter', style_kwds =dict(color="gray", weight=0.4), legend_kwds=dict(colorbar=False) )

px.scatter( gdf2019Africa, x='shdi_y', y='Wshdi_y', hover_name="region_y", hover_data = ['shdi_y', 'Wshdi_y', 'country_y'], trendline="ols", marginal_x="box", marginal_y="box")

.css-15w88e5{color:var(--chakra-colors-fg-neutral-primary);font-weight:inherit;letter-spacing:-0.09px;}Science is ...