Exploring subnational human development in India
Setup
# Load libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib.ticker import FormatStrFormatter
from matplotlib_scalebar.scalebar import ScaleBar
from pylab import rcParams
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 72
import seaborn as sns
#sns.set_style("darkgrid")
#sns.set_context(context="paper", font_scale=1.5, rc=None)
#sns.set(font="serif")
import plotly.express as px
import plotly.graph_objects as go
import geopandas as gpd
import libpysal
from libpysal import weights
from libpysal.weights import Queen
import esda
from esda.moran import Moran, Moran_Local
import splot
from splot.esda import moran_scatterplot, plot_moran, lisa_cluster, plot_local_autocorrelation
from splot.libpysal import plot_spatial_weights
from giddy.directional import Rose
import statsmodels.api as sm
import statsmodels.formula.api as smf
from spreg import OLS
from spreg import MoranRes
from spreg import ML_Lag
from spreg import ML_Error
from mgwr.gwr import GWR, MGWR
from mgwr.sel_bw import Sel_BW
from mgwr.utils import shift_colormap, truncate_colormap
import warnings
warnings.filterwarnings('ignore')
Import data
df = pd.read_csv("https://github.com/quarcs-lab/data-open/raw/master/shdi/SHDIold.csv").sort_values(by=['year', 'SubContinent'])
df.info()
df
map = gpd.read_file("https://gist.github.com/cmg777/81b6e2b1ea5865c09daf3b2bbf35bbf8/raw/5e0a0370f453421bc503594cfae1e466c89e449f/GDL%2520Shapefiles%2520V4.geojson")
map.plot('shdi');
map.info()
map.head(9)
fig, ax = plt.subplots(figsize=(12,8))
map.plot(column="shdi", scheme='BoxPlot', cmap='coolwarm', legend=True, ax=ax, legend_kwds={'bbox_to_anchor':(1.00, 0.70)})
plt.title('Spatial distribution of the subnational HDI: \nWhich regions are below/above the median?')
plt.tight_layout()
ax.axis("off")
#plt.savefig('myMap.png',dpi=150, bbox_inches='tight')
plt.show()
map['shdi'].describe().round(2)
Prepare data
# Remove national observations and focus on India
df1 = df[(df.level != 'National') & (df.country == 'India')]
df1
df1.columns
df1["year"]= df["year"].astype(str)
df1990 = df1[df1['year'] == '1990']
df2019 = df1[df1['year'] == '2019']
df2019['Continent'].unique()
df2019['SubContinent'].unique()
df2019['country'].unique()
Descriptive statistics
df1.loc[:, df1.columns != 'year'].describe().round(2)
df1990.describe().round(2)
df2019.describe().round(2)
Regional differences
Strip plot
px.strip(df1,
x = ['shdi'],
y = 'country',
range_x= [0.35, 0.80],
color = 'continent2',
hover_name= 'region',
hover_data = ['country'],
animation_frame= 'year',
labels=dict(continent2 = "Continent",
shdi ="Subnational human development index")
)
Box plot
px.box(df1,
x = 'shdi',
y = 'country',
range_x= [0.35, 0.80],
color = 'continent2',
hover_name= 'region',
hover_data = ['country'],
animation_frame= 'year',
labels=dict(continent2 = "Continent",
shdi ="Subnational human development index")
)
Histogram
px.histogram(df1,
x = 'shdi',
range_x= [0.35, 0.80],
nbins= 15,
color = 'country',
hover_name= 'region',
hover_data= ['country'],
marginal='rug', # box is also a good option
animation_frame = 'year',
labels=dict(continent2 = "Continent",
shdi ="Subnational human development index")
)
ECDF
px.ecdf(df1,
x="shdi",
range_x= [0.30, 0.80],
color="country",
hover_name= 'region',
hover_data= ['country'],
#markers=True,
#lines=False,
marginal="rug", # histogram
animation_frame = 'year',
labels=dict(continent2 = "Continent",
shdi ="Subnational human development index")
)
Time series
Continents
fig = px.line(
df1,
x="year",
y="shdi",
log_y= True,
color="region",
hover_name="country",
hover_data= ['country'],
labels=dict(shdi="HDI",
continent2 ="Continent",
year = ""),
facet_col="country",
facet_col_wrap = 2,
facet_row_spacing = 0.01,
height= 500
)
fig.update_layout(showlegend=False)
Correlations
px.scatter(df1,
y = "shdi",
x = "incindex",
range_y= [0.2, 0.9],
range_x= [0.2, 0.9],
hover_name = "region",
hover_data= ['country'],
color = "country",
#size = "pop", size_max = 60,
trendline= 'ols',
#trendline_scope= "overall",
#marginal_x= 'box',
#marginal_y= 'box',
animation_frame= 'year',
labels=dict(country = "Country",
continent2 = "Continent",
lgnic ="Log of gross national income per capita",
lifexp="Life Expectancy (years)")
)
Maps
gdf1990 = map.merge(df1990, on = 'GDLcode', how = 'left')
gdf2019 = map.merge(df2019, on = 'GDLcode', how = 'left')
gdf2019.columns
fig, ax = plt.subplots(figsize=(12,8))
gdf1990.plot(column="shdi_y",
scheme='BoxPlot',
cmap='coolwarm',
legend=True,
ax=ax,
edgecolor='black',
legend_kwds={'bbox_to_anchor':(0.80, 0.95)})
plt.title('Spatial distribution of human development in 1990: \nWhich regions are below/above the median?')
plt.tight_layout()
ax.axis("off")
#plt.savefig('myMap.png',dpi=300, bbox_inches='tight')
plt.show()
fig, ax = plt.subplots(figsize=(12,8))
gdf2019.plot(column="shdi_y",
scheme='BoxPlot',
cmap='coolwarm',
legend=True,
ax=ax,
edgecolor='black',
legend_kwds={'bbox_to_anchor':(0.80, 0.95)})
plt.title('Spatial distribution of human development in 1990: \nWhich regions are below/above the median?')
plt.tight_layout()
ax.axis("off")
#plt.savefig('myMap.png',dpi=300, bbox_inches='tight')
plt.show()
Spatial dependence
gdf2019India = gdf2019.query("country_x == 'India'").reset_index(drop=True)
#W = weights.KNN.from_dataframe(gdf2019India, k=4)
W = weights.KNN.from_dataframe(gdf2019India, k=6)
#W = weights.KNN.from_dataframe(gdf2019India, k=8)
#W = weights.Queen.from_dataframe(gdf2019India)
#W = weights.Rook.from_dataframe(gdf2019India)
#>>>> Compute inverse distance squared based on maximum nearest neighbor distance between the n observations
#maxMin = weights.min_threshold_dist_from_shapefile("nepal.shp")
#W = weights.DistanceBand.from_dataframe(nepal, threshold = maxMin, binary=False, alpha=-2)
# >>> Row-standardize W
W.transform = 'r'
plot_spatial_weights(W, gdf2019India);
gdf2019India['Wshdi_y'] = weights.lag_spatial(W, gdf2019India['shdi_y'])
globalMoran = Moran(gdf2019India['shdi_y'], W)
MoranI = globalMoran.I
MoranI = "{:.2f}".format(MoranI)
localMoran = Moran_Local(gdf2019India['shdi_y'], W, permutations = 999, seed=12345)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,5))
moran_scatterplot(localMoran, p=0.10, aspect_equal=False, zstandard = False, ax=axes[0])
lisa_cluster(localMoran, gdf2019India, p=0.10, legend_kwds={'bbox_to_anchor':(0.02, 0.90)}, ax=axes[1])
#cx.add_basemap(axes[1], crs=gdf.crs.to_string(), source=cx.providers.Stamen.TonerLite, attribution=False)
#cx.add_basemap(axes[1], crs=gdf.crs.to_string(), source=cx.providers.Stamen.TonerLabels, attribution=False)
axes[0].set_xlabel('Indicator')
axes[0].set_ylabel('Spatial lag of indicator')
axes[0].set_title(f"(a) Moran scatterplot (Moran's I = {MoranI})")
axes[1].set_title("(b) Spatial clusters and outliers (p<0.10)")
plt.show()
gdf2019India.explore(
column= 'shdi_y',
tooltip=['shdi_y','country_y', 'region_y'],
k = 3,
scheme='FisherJenks',
cmap='coolwarm',
legend=True,
tiles='CartoDB dark_matter',
style_kwds =dict(color="gray", weight=0.4),
legend_kwds=dict(colorbar=False)
)
px.scatter(
gdf2019India,
x='shdi_y',
y='Wshdi_y',
hover_name="region_y",
hover_data = ['shdi_y', 'Wshdi_y', 'country_y'],
trendline="ols",
marginal_x="box",
marginal_y="box")