Spatial convergence across subnational regions of South America
Slides
from IPython.display import IFrame
IFrame(src='https://project2020e-slides.netlify.app/#1', width=600, height=400)
Setup
# Load libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib.ticker import FormatStrFormatter
from pylab import rcParams
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 72
import seaborn as sns
#sns.set_style("darkgrid")
#sns.set_context(context="paper", font_scale=1.5, rc=None)
#sns.set(font="serif")
import plotly.express as px
import plotly.graph_objects as go
import geopandas as gpd
import libpysal
from libpysal import weights
from libpysal.weights import Queen
import esda
from esda.moran import Moran, Moran_Local
import splot
from splot.esda import moran_scatterplot, plot_moran, lisa_cluster, plot_local_autocorrelation
from splot.libpysal import plot_spatial_weights
from giddy.directional import Rose
import statsmodels.api as sm
import statsmodels.formula.api as smf
from spreg import OLS
from spreg import MoranRes
from spreg import ML_Lag
from spreg import ML_Error
from mgwr.gwr import GWR, MGWR
from mgwr.sel_bw import Sel_BW
from mgwr.utils import shift_colormap, truncate_colormap
import warnings
warnings.filterwarnings('ignore')
Import data
gdf = gpd.read_file("https://github.com/quarcs-lab/data-quarcs/raw/master/southAmerica151/balanced_hdi_beta_south_merica_withCoors.geojson")
gdf
Explore data
gdf[['ln_gnic90', 'ln_gnic00', 'g_gnic9000', 'ln_hdi90', 'ln_hdi00', 'gr_hdi9000']].describe().round(2)
Beta convergence
px.scatter(
gdf,
x="ln_gnic90",
y="g_gnic9000",
hover_name="region",
#color="country",
trendline="ols",
marginal_x="rug",
marginal_y="rug",
labels={"ln_gnic90": "Log GNIpc in 1990",
"g_gnic9000": "Growth GNIpc 1990-2000"}
)
px.strip(gdf, x = 'ln_gnic90', hover_name= 'region', color= 'country')
Heterogenous beta convergence
px.scatter(gdf,
x="ln_gnic90",
y="g_gnic9000",
hover_name="region",
color="country",
trendline="ols",
marginal_x="rug",
marginal_y="rug",
labels={"ln_gnic90": "Log GNIpc in 1990",
"g_gnic9000": "Growth GNIpc 1990-2000"})
Basic maps
Static
gdf.plot('ln_gnic90', cmap = 'coolwarm', legend = True, scheme = 'BoxPlot', legend_kwds={'bbox_to_anchor':(0.42, 0.50)})
gdf.plot('g_gnic9000', cmap = 'coolwarm', legend = True, scheme = 'BoxPlot', legend_kwds={'bbox_to_anchor':(0.42, 0.50)});
Interactive
#help(gdf.explore)
gdf.explore(
column='ln_gnic90',
tooltip=['country', 'region', 'ln_gnic90', 'g_gnic9000'],
scheme='BoxPlot',
#k=5,
cmap='coolwarm',
legend=True,
tiles='CartoDB positron', # OpenStreetMap, Stamen Terrain, Stamen Toner, Stamen Watercolor, CartoDB positron, CartoDB dark_matter
style_kwds =dict(color="gray", weight=0.5),
legend_kwds=dict(colorbar=False)
)
Spatial weights
W = weights.Queen.from_dataframe(gdf)
#W = weights.Rook.from_dataframe(gdf)
#W = weights.KNN.from_dataframe(gdf, k=4)
#W = weights.KNN.from_dataframe(gdf, k=6)
#W = weights.KNN.from_dataframe(gdf, k=8)
#>>>> Compute inverse distance squared based on maximum nearest neighbor distance between the n observations
#maxMin = weights.min_threshold_dist_from_shapefile("map.shp")
#W = weights.DistanceBand.from_dataframe(gdf, threshold = maxMin, binary=False, alpha=-2)
# >>> Row-standardize W
W.transform = 'r'
# Ref: https://splot.readthedocs.io/en/stable/users/tutorials/weights.html#a-closer-look-at-w
plot_spatial_weights(W, gdf);
W.mean_neighbors
W.min_neighbors
W.max_neighbors
Spatial dependence
gdf["Wln_gnic90"] = weights.lag_spatial(W, gdf['ln_gnic90'])
px.scatter(gdf,
x="ln_gnic90",
y="Wln_gnic90",
hover_name="region",
labels = dict(ln_gnic90 = "Ln GNPpc in focal region",
Wln_gnic90 = "Neighbors' average Ln GNPpc"
),
trendline="ols",
marginal_x="box",
marginal_y="box")
Global dependence
globalMoran = Moran(gdf['ln_gnic90'], W)
print(globalMoran.I)
0.5760929600585696
print(globalMoran.p_sim)
0.001
moran_scatterplot(globalMoran, aspect_equal=False, zstandard=False);
Local dependence
localMoran = Moran_Local(gdf['ln_gnic90'], W, permutations = 999, seed=12345)
moran_scatterplot(localMoran, p=0.10, zstandard =False);
lisa_cluster(localMoran, gdf, p=0.10);
Spatial convergence
y = gdf['g_gnic9000'].values
y_name = 'Growth GNIpc'
x = np.array([gdf.ln_gnic90]).T
x_name = 'Log GNIpc in 1990'
OLS
ols = OLS(y = y, x = x, w = W,
name_y=y_name, name_x = [x_name], name_w="Wqueen", name_ds='gdf',
white_test=True, spat_diag=True, moran=True)
print(ols.summary)
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set : gdf
Weights matrix : Wqueen
Dependent Variable :Growth GNIpc Number of Observations: 151
Mean dependent var : 1.2259 Number of Variables : 2
S.D. dependent var : 0.2916 Degrees of Freedom : 149
R-squared : 0.2948
Adjusted R-squared : 0.2901
Sum squared residual: 8.994 F-statistic : 62.2990
Sigma-square : 0.060 Prob(F-statistic) : 5.898e-13
S.E. of regression : 0.246 Log likelihood : -1.294
Sigma-square ML : 0.060 Akaike info criterion : 6.589
S.E of regression ML: 0.2441 Schwarz criterion : 12.623
------------------------------------------------------------------------------------
Variable Coefficient Std.Error t-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 1.6929883 0.0624636 27.1036091 0.0000000
Log GNIpc in 1990 -0.2422577 0.0306928 -7.8929728 0.0000000
------------------------------------------------------------------------------------
REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER 6.084
TEST ON NORMALITY OF ERRORS
TEST DF VALUE PROB
Jarque-Bera 2 11.315 0.0035
DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST DF VALUE PROB
Breusch-Pagan test 1 17.897 0.0000
Koenker-Bassett test 1 20.496 0.0000
SPECIFICATION ROBUST TEST
TEST DF VALUE PROB
White 2 23.247 0.0000
DIAGNOSTICS FOR SPATIAL DEPENDENCE
TEST MI/DF VALUE PROB
Moran's I (error) 0.6533 12.546 0.0000
Lagrange Multiplier (lag) 1 122.338 0.0000
Robust LM (lag) 1 0.179 0.6719
Lagrange Multiplier (error) 1 145.524 0.0000
Robust LM (error) 1 23.365 0.0000
Lagrange Multiplier (SARMA) 2 145.704 0.0000
================================ END OF REPORT =====================================
SAR
sar = ML_Lag(y=y, x=x, w=W,
name_y=y_name, name_x = [x_name], name_w="Wqueen", name_ds='gdf')
print(sar.summary)
REGRESSION
----------
SUMMARY OF OUTPUT: MAXIMUM LIKELIHOOD SPATIAL LAG (METHOD = FULL)
-----------------------------------------------------------------
Data set : gdf
Weights matrix : Wqueen
Dependent Variable :Growth GNIpc Number of Observations: 151
Mean dependent var : 1.2259 Number of Variables : 3
S.D. dependent var : 0.2916 Degrees of Freedom : 148
Pseudo R-squared : 0.7136
Spatial Pseudo R-squared: 0.2526
Sigma-square ML : 0.025 Log likelihood : 52.956
S.E of regression : 0.157 Akaike info criterion : -99.912
Schwarz criterion : -90.860
------------------------------------------------------------------------------------
Variable Coefficient Std.Error z-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 0.5616761 0.0948118 5.9241144 0.0000000
Log GNIpc in 1990 -0.1278745 0.0228983 -5.5844589 0.0000000
W_Growth GNIpc 0.7449687 0.0510695 14.5873432 0.0000000
------------------------------------------------------------------------------------
================================ END OF REPORT =====================================
SEM
sem = ML_Error(y=y, x=x, w=W,
name_y=y_name, name_x = [x_name], name_w="Wqueen", name_ds='gdf')
print(sem.summary)
REGRESSION
----------
SUMMARY OF OUTPUT: MAXIMUM LIKELIHOOD SPATIAL ERROR (METHOD = FULL)
-------------------------------------------------------------------
Data set : gdf
Weights matrix : Wqueen
Dependent Variable :Growth GNIpc Number of Observations: 151
Mean dependent var : 1.2259 Number of Variables : 2
S.D. dependent var : 0.2916 Degrees of Freedom : 149
Pseudo R-squared : 0.2948
Sigma-square ML : 0.021 Log likelihood : 61.335
S.E of regression : 0.145 Akaike info criterion : -118.669
Schwarz criterion : -112.635
------------------------------------------------------------------------------------
Variable Coefficient Std.Error z-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 1.6624672 0.0846098 19.6486322 0.0000000
Log GNIpc in 1990 -0.2140703 0.0272916 -7.8438270 0.0000000
lambda 0.8191001 0.0464692 17.6267400 0.0000000
------------------------------------------------------------------------------------
================================ END OF REPORT =====================================
Heterogeneous convergence
GWR data preparation
y = gdf['g_gnic9000'].values.reshape((-1,1))
X = gdf['ln_gnic90'].values.reshape((-1,1))
print(y.shape, X.shape)
(151, 1) (151, 1)
u = gdf['COORD_X']
v = gdf['COORD_Y']
coords = list(zip(u,v))
GWR results
gwr_selector = Sel_BW(coords, y, X)
gwr_bw = gwr_selector.search(bw_min=2)
gwr_bw
gwr_results = GWR(coords, y, X, gwr_bw).fit()
gwr_results.summary()
===========================================================================
Model type Gaussian
Number of observations: 151
Number of covariates: 2
Global Regression Results
---------------------------------------------------------------------------
Residual sum of squares: 8.994
Log-likelihood: -1.294
AIC: 6.589
AICc: 8.752
BIC: -738.581
R2: 0.295
Adj. R2: 0.290
Variable Est. SE t(Est/SE) p-value
------------------------------- ---------- ---------- ---------- ----------
X0 1.693 0.062 27.104 0.000
X1 -0.242 0.031 -7.893 0.000
Geographically Weighted Regression (GWR) Results
---------------------------------------------------------------------------
Spatial kernel: Adaptive bisquare
Bandwidth used: 37.000
Diagnostic information
---------------------------------------------------------------------------
Residual sum of squares: 2.853
Effective number of parameters (trace(S)): 18.309
Degree of freedom (n - trace(S)): 132.691
Sigma estimate: 0.147
Log-likelihood: 85.400
AIC: -132.181
AICc: -126.179
BIC: -73.919
R2: 0.776
Adjusted R2: 0.745
Adj. alpha (95%): 0.005
Adj. critical t value (95%): 2.819
Summary Statistics For GWR Parameter Estimates
---------------------------------------------------------------------------
Variable Mean STD Min Median Max
-------------------- ---------- ---------- ---------- ---------- ----------
X0 1.455 0.425 0.791 1.364 2.476
X1 -0.129 0.193 -0.443 -0.143 0.225
===========================================================================
GWR maps
# Add GWR parameters to GeoDataframe
gdf['GWRintercept'] = gwr_results.params[:,0]
gdf['GWRln_gnic90'] = gwr_results.params[:,1]
# Obtain t-values filtered based on multiple testing correction
gwr_filtered_t = gwr_results.filter_tvals()