Basic spatial econometrics: OLS, SAR, SEM
Set up
# Load libraries
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style('darkgrid')
#sns.set_style('whitegrid')
#sns.set_style('white')
import geopandas as gpd
import matplotlib.pyplot as plt
import libpysal
from libpysal import weights
from libpysal.weights import Queen
import esda
from esda.moran import Moran, Moran_Local
import splot
from splot.esda import moran_scatterplot, plot_moran, lisa_cluster, plot_local_autocorrelation
from splot.libpysal import plot_spatial_weights
from giddy.directional import Rose
from spreg import OLS
from spreg import MoranRes
from spreg import ML_Lag
from spreg import ML_Error
OMP: Info #270: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
Import data
gdf = gpd.read_file("https://github.com/quarcs-lab/data-quarcs/raw/master/City/City.geojson")
# Or, if the file is already in your Data directory, you can use #gdf = gpd.read_file("../../Data/City.geojson")
gdf.head()
gdf.crs
Spatial weights matrix
W = weights.KNN.from_dataframe(gdf, k=8)
#W = weights.KNN.from_dataframe(gdf, k=6)
#W = weights.KNN.from_dataframe(gdf, k=4)
#W = weights.Queen.from_dataframe(gdf)
#W = weights.Rook.from_dataframe(gdf)
# Compute inverse distance squared based on maximum nearest neighbor distance between the n observations
#maxMin = weights.min_threshold_dist_from_shapefile("map.shp")
#W = weights.DistanceBand.from_dataframe(gdf, threshold = maxMin, binary=False, alpha=-2)
# Row-standardize W
W.transform = 'r'
plot_spatial_weights(W, gdf);
/opt/conda/lib/python3.8/site-packages/splot/_viz_libpysal_mpl.py:115: UserWarning: Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.
centroids_shp = gdf.centroid.values
/opt/conda/lib/python3.8/site-packages/splot/_viz_libpysal_mpl.py:154: UserWarning: Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.
gdf.centroid.plot(ax=ax, **node_kws)
W.neighbors
W.min_neighbors
W.mean_neighbors
W.max_neighbors
W.histogram
#Note: (numbers of neighbors, frequency)
Regression preparations
y = gdf['Income'].values
y_name = 'Income'
x = np.array([gdf.Insurance]).T
x_name = 'Insurance'
OLS
ols = OLS(y = y, x = x, w = W,
name_y=y_name, name_x = [x_name], name_w="W", name_ds='gdf',
white_test=True, spat_diag=True, moran=True)
print(ols.summary)
REGRESSION
----------
SUMMARY OF OUTPUT: ORDINARY LEAST SQUARES
-----------------------------------------
Data set : gdf
Weights matrix : W
Dependent Variable : Income Number of Observations: 90
Mean dependent var : 16316.7536 Number of Variables : 2
S.D. dependent var : 4975.6156 Degrees of Freedom : 88
R-squared : 0.8104
Adjusted R-squared : 0.8083
Sum squared residual:417719006.849 F-statistic : 376.1754
Sigma-square : 4746806.896 Prob(F-statistic) : 1.571e-33
S.E. of regression : 2178.717 Log likelihood : -818.477
Sigma-square ML : 4641322.298 Akaike info criterion : 1640.955
S.E of regression ML: 2154.3728 Schwarz criterion : 1645.954
------------------------------------------------------------------------------------
Variable Coefficient Std.Error t-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 6815.4668877 541.0378866 12.5970233 0.0000000
Insurance 39.6472092 2.0441720 19.3952408 0.0000000
------------------------------------------------------------------------------------
REGRESSION DIAGNOSTICS
MULTICOLLINEARITY CONDITION NUMBER 4.489
TEST ON NORMALITY OF ERRORS
TEST DF VALUE PROB
Jarque-Bera 2 216.901 0.0000
DIAGNOSTICS FOR HETEROSKEDASTICITY
RANDOM COEFFICIENTS
TEST DF VALUE PROB
Breusch-Pagan test 1 170.260 0.0000
Koenker-Bassett test 1 36.189 0.0000
SPECIFICATION ROBUST TEST
TEST DF VALUE PROB
White 2 42.348 0.0000
DIAGNOSTICS FOR SPATIAL DEPENDENCE
TEST MI/DF VALUE PROB
Moran's I (error) 0.4476 10.509 0.0000
Lagrange Multiplier (lag) 1 17.983 0.0000
Robust LM (lag) 1 0.000 0.9890
Lagrange Multiplier (error) 1 83.638 0.0000
Robust LM (error) 1 65.654 0.0000
Lagrange Multiplier (SARMA) 2 83.638 0.0000
================================ END OF REPORT =====================================
SEM
sem = ML_Error(y=y, x=x, w=W,
name_y=y_name, name_x = [x_name], name_w="W", name_ds='gdf')
print(sem.summary)
REGRESSION
----------
SUMMARY OF OUTPUT: MAXIMUM LIKELIHOOD SPATIAL ERROR (METHOD = FULL)
-------------------------------------------------------------------
Data set : gdf
Weights matrix : W
Dependent Variable : Income Number of Observations: 90
Mean dependent var : 16316.7536 Number of Variables : 2
S.D. dependent var : 4975.6156 Degrees of Freedom : 88
Pseudo R-squared : 0.8104
Sigma-square ML : 2695606.608 Log likelihood : -797.894
S.E of regression : 1641.830 Akaike info criterion : 1599.789
Schwarz criterion : 1604.789
------------------------------------------------------------------------------------
Variable Coefficient Std.Error z-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 6953.6619556 944.8613120 7.3594525 0.0000000
Insurance 40.0801115 2.7498202 14.5755388 0.0000000
lambda 0.7749350 0.0893545 8.6725876 0.0000000
------------------------------------------------------------------------------------
================================ END OF REPORT =====================================
/opt/conda/lib/python3.8/site-packages/scipy/optimize/_minimize.py:779: RuntimeWarning: Method 'bounded' does not support relative tolerance in x; defaulting to absolute tolerance.
warn("Method 'bounded' does not support relative tolerance in x; "
SAR
sar = ML_Lag(y=y, x=x, w=W,
name_y=y_name, name_x = [x_name], name_w="W", name_ds='gdf')
print(sar.summary)
REGRESSION
----------
SUMMARY OF OUTPUT: MAXIMUM LIKELIHOOD SPATIAL LAG (METHOD = FULL)
-----------------------------------------------------------------
Data set : gdf
Weights matrix : W
Dependent Variable : Income Number of Observations: 90
Mean dependent var : 16316.7536 Number of Variables : 3
S.D. dependent var : 4975.6156 Degrees of Freedom : 87
Pseudo R-squared : 0.8441
Spatial Pseudo R-squared: 0.7818
Sigma-square ML : 3815809.415 Log likelihood : -810.308
S.E of regression : 1953.410 Akaike info criterion : 1626.617
Schwarz criterion : 1634.116
------------------------------------------------------------------------------------
Variable Coefficient Std.Error z-Statistic Probability
------------------------------------------------------------------------------------
CONSTANT 3032.3835376 908.6900588 3.3370933 0.0008466
Insurance 29.8082339 3.0878242 9.6534750 0.0000000
W_Income 0.3674626 0.0805803 4.5602066 0.0000051
------------------------------------------------------------------------------------
================================ END OF REPORT =====================================
/opt/conda/lib/python3.8/site-packages/scipy/optimize/_minimize.py:779: RuntimeWarning: Method 'bounded' does not support relative tolerance in x; defaulting to absolute tolerance.
warn("Method 'bounded' does not support relative tolerance in x; "