Introduction to exploratory space-time data analysis (ESTDA)
Setup
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
from pylab import rcParams
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 72
import seaborn as sns
sns.set_style("darkgrid")
sns.set_context(context="paper", font_scale=1.5, rc=None)
sns.set(font="serif")
import plotly.express as px
import plotly.graph_objects as go
import geopandas as gpd
import libpysal
from libpysal import weights
from libpysal.weights import Queen
import esda
from esda.moran import Moran, Moran_Local
import splot
from splot.esda import moran_scatterplot, plot_moran, lisa_cluster, plot_local_autocorrelation
from splot.libpysal import plot_spatial_weights
from giddy.directional import Rose
import statsmodels.api as sm
import statsmodels.formula.api as smf
from spreg import OLS
from spreg import MoranRes
from spreg import ML_Lag
from spreg import ML_Error
import warnings
warnings.filterwarnings('ignore')
Import data
# Import a .zip file containing spatial files and data
#! wget https://github.com/quarcs-lab/data-open/raw/master/us_income/widePanel.zip
#! unzip widePanel.zip
gdf = gpd.read_file("https://raw.githubusercontent.com/quarcs-lab/data-open/master/us_income/widePanel.geojson")
gdf
gdf['gdp1969'] = np.exp(gdf.ln_gp_1969)
gdf['gdp2008'] = np.exp(gdf.ln_gp_2008)
gdf.columns
gdfSimple = gdf.loc[:, ['id', 'state', 'gdp1969', 'gdp2008', 'geometry']]
type(gdfSimple)
gdfSimple['rel1969'] = gdfSimple['gdp1969'] / gdfSimple['gdp1969'].mean()
gdfSimple['rel2008'] = gdfSimple['gdp2008'] / gdfSimple['gdp2008'].mean()
gdfSimple
gdfSimple.columns
Describe data
gdfSimple[['gdp1969', 'gdp2008', 'rel1969', 'rel2008']].describe().round(2)
sns.histplot(x=gdfSimple['rel1969'], kde=True);
sns.histplot(x=gdfSimple['rel2008'], kde=True);
px.histogram(gdfSimple, x = 'rel1969', hover_name= 'state', marginal='rug')
px.histogram(gdfSimple, x = 'rel2008', hover_name= 'state', marginal='rug')
px.scatter(
gdfSimple,
x="rel1969",
y="rel2008",
hover_name="state",
trendline="ols",
marginal_x="box",
marginal_y="box")
gdfSimple.crs
Basic maps
fig, ax = plt.subplots(figsize=(6,4))
gdfSimple.plot(column="rel1969", scheme='NaturalBreaks', k=5, cmap='coolwarm', legend=True, legend_kwds={'bbox_to_anchor':(1.12, 0.55)}, ax=ax)
plt.title('Relative GDP per capita in 1969')
plt.tight_layout()
ax.axis("off")
plt.savefig('map1969',dpi=300, bbox_inches='tight')
plt.show()
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,4))
gdfSimple.plot(column="rel1969", scheme='Quantiles', k=5, cmap='coolwarm', legend=True, legend_kwds={'bbox_to_anchor':(1.12, 0.55)}, ax=axes[0])
gdfSimple.plot(column="rel2008", scheme='user_defined', classification_kwds={'bins':[0.85, 0.95, 1.03, 1.16]}, cmap='coolwarm', legend=True, legend_kwds={'bbox_to_anchor':(1.12, 0.55)},ax=axes[1])
plt.tight_layout()
axes[0].axis("off")
axes[1].axis("off")
axes[0].set_title('(a) Relative GDP per capita in 1969')
axes[1].set_title('(b) Relative GDP per capita in 2008')
plt.savefig('map1969map2008',dpi=300, bbox_inches='tight')
plt.show()
gdfSimple.explore(
column='rel1969',
tooltip=['state', 'rel1969', 'rel2008'],
scheme='naturalbreaks',
k=5,
cmap='coolwarm',
legend=True,
tiles='Stamen Terrain',
style_kwds =dict(color="gray", weight=0.5),
legend_kwds=dict(colorbar=False)
)
Spatial weights matrix
Import W
# Load weights matrix from Geoda .gal file
W = libpysal.io.open("Wqueen_fromGEODA.gal").read()
W.n
W.neighbors
W.min_neighbors
W.mean_neighbors
W.max_neighbors
# Row-standardize weights matrix
W.transform = "R"
Create W
W = weights.Queen.from_dataframe(gdf)
#W = weights.Rook.from_dataframe(gdf)
#W = weights.KNN.from_dataframe(gdf, k=4)
#W = weights.KNN.from_dataframe(gdf, k=6)
#W = weights.KNN.from_dataframe(gdf, k=8)
#>>>> Compute inverse distance squared based on maximum nearest neighbor distance between the n observations
#maxMin = weights.min_threshold_dist_from_shapefile("map.shp")
#W = weights.DistanceBand.from_dataframe(gdf, threshold = maxMin, binary=False, alpha=-2)
# >>> Row-standardize W
W.transform = 'r'
# Ref: https://splot.readthedocs.io/en/stable/users/tutorials/weights.html#a-closer-look-at-w
W.neighbors
W.min_neighbors
W.mean_neighbors
W.max_neighbors
Plot W
plot_spatial_weights(W, gdf);
Spatial lag
# Create spatial lag variables
gdfSimple["Wrel1969"] = weights.lag_spatial(W, gdfSimple["rel1969"])
gdfSimple["Wrel2008"] = weights.lag_spatial(W, gdfSimple["rel2008"])
gdfSimple[["rel1969", "Wrel1969", "rel2008", "Wrel2008"]].round(2)
gdfSimple[["rel1969", "Wrel1969", "rel2008", "Wrel2008"]].describe().round(2)
px.scatter(
gdfSimple,
x="rel1969",
y="Wrel1969",
hover_name="state",
labels = dict(rel1969 = "Relative GDPpc in 1969",
Wrel1969 = "Neighbors' relative GDPpc in 1969",
rel2008 = "Relative GDPpc in 2008",
Wrel2008 = "Neighbors' relative GDPpc in 2008"),
trendline="ols",
marginal_x="box",
marginal_y="box")
f,ax = plt.subplots(1,2, figsize=(12, 4))
gdfSimple.plot(column='rel1969', ax=ax[0], edgecolor='k',
scheme='NaturalBreaks', k=5, cmap='coolwarm', legend=False)
ax[0].set_title("Relative GDP pc in 1969")
gdfSimple.plot(column='Wrel1969', ax=ax[1], edgecolor='k',
scheme='NaturalBreaks', k=5, cmap='coolwarm', legend=False)
ax[1].set_title("Spatial lag of relative GDP pc in 1969");
Spatial autocorrelation
x1969 = gdfSimple["rel1969"]
x2008 = gdfSimple["rel2008"]
moran1969 = Moran(x1969, W)
moran2008 = Moran(x2008, W)
print(moran1969.I, moran2008.I)
print(moran1969.p_sim, moran2008.p_sim)
print(moran1969.p_norm, moran2008.p_norm)
sns.kdeplot(moran1969.sim, shade=True)
plt.vlines(moran1969.I, 0, 1, color='r')
plt.vlines(moran1969.EI, 0, 1)
plt.xlabel("Moran's I in 1969");
fig, axes = plt.subplots(nrows=1, ncols=2,figsize=(12,4))
moran_scatterplot(moran1969, aspect_equal=False, zstandard=False, ax=axes[0])
moran_scatterplot(moran2008, aspect_equal=False, zstandard=False, ax=axes[1])
axes[0].set_xlim([0.7, 1.4])
axes[0].set_xlabel("Relative GDPpc in 1969")
axes[0].set_ylim([0.7, 1.4])
axes[0].set_ylabel("Neighbors' relative GDPpc in 1969")
axes[1].set_xlim([0.7, 1.4])
axes[1].set_xlabel("Relative GDPpc in 2008")
axes[1].set_ylim([0.7, 1.4])
axes[1].set_ylabel("Neighbors' relative GDPpc in 2008")
plt.savefig('moran1969moran2008',dpi=300, bbox_inches='tight')
plt.show()
px.scatter(
gdfSimple,
x="rel1969",
y="Wrel1969",
hover_name="state",
labels = dict(rel1969 = "Relative GDPpc in 1969",
Wrel1969 = "Neighbors' relative GDPpc in 1969",
rel2008 = "Relative GDPpc in 2008",
Wrel2008 = "Neighbors' relative GDPpc in 2008"),
trendline="ols",
marginal_x="box",
marginal_y="box")
px.scatter(
gdfSimple,
x="rel2008",
y="Wrel2008",
hover_name="state",
labels = dict(rel1969 = "Relative GDPpc in 1969",
Wrel1969 = "Neighbors' relative GDPpc in 1969",
rel2008 = "Relative GDPpc in 2008",
Wrel2008 = "Neighbors' relative GDPpc in 2008"),
trendline="ols",
marginal_x="box",
marginal_y="box")
Evolution of spatial dependence
# Create multidimentional array
dfa = gdf.loc[:,'ln_gp_1929':'ln_gp_1994'].values
dfa
dfa.shape
# Create array of years
years = np.arange(1929,1995)
years
years.shape
mits = [Moran(cs, W) for cs in dfa.T]
res = np.array([(m.I, m.EI, m.p_sim, m.z_sim) for m in mits])
plt.plot(years, res[:,0])
plt.title("Evolution of spatial dependence")
plt.ylabel("Moran's I of log GDP pc");
Evolution of regional disparities
sigma = dfa.std(axis=0)
plt.plot(years, sigma)
plt.title("Sigma Convergence")
plt.ylabel('Stand. Dev.of log GDP pc');
plt.plot(years, res[:,0], label="Spatial Dep.")
plt.plot(years, sigma, label="Disparities")
plt.legend(bbox_to_anchor=(1.31,1), loc="upper right");
plt.style.use('seaborn-white')
# create figure and axis objects with subplots()
fig,ax = plt.subplots()
# make a plot
ax.plot(years, res[:,0], color="royalblue")
# set y-axis label
ax.set_ylabel("Spatial Dependence", color="royalblue")
# twin object for two different y-axis on the sample plot
ax2=ax.twinx()
# make a plot with different y-axis using second axis object
ax2.plot(years, sigma, color="orange")
ax2.set_ylabel("Regional Disparities", color="darkorange")
plt.show()
sns.set_style("darkgrid")
for row in dfa:
plt.plot(years, row)
rel_dfa = dfa / dfa.mean(axis=0)
rel_dfa
rel_dfa.shape
for row in rel_dfa:
plt.plot(years, row)
sns.set_palette(sns.color_palette("coolwarm", 2010-1929))
for row in range(1995-1929):
sns.kdeplot(rel_dfa[:,row])
LISA scatterplot and map
Lmoran1969 = Moran_Local(x1969, W, permutations = 999, seed=12345)
Lmoran2008 = Moran_Local(x2008, W, permutations = 999, seed=12345)
Lmoran1969.p_sim
# Plot local Moran
fig, ax = moran_scatterplot(Lmoran1969, p=0.01, zstandard =False)
ax.set_xlabel('Relative GDPpc in 1969')
ax.set_ylabel('Spatial Lag of relative GDPpc in 1969')
plt.show()
# Trying to fix the colors
# It seems that at least one region each quadrant needs to be significant for the proper coloring of points
moran_scatterplot(Lmoran1969, p=0.15, zstandard = False);
# Plot LISA map
fig, ax = plt.subplots(figsize=(14,12))
lisa_cluster(Lmoran1969, gdf, p=0.15, figsize = (16,12), ax=ax)
plt.show()
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12,4))
lisa_cluster(Lmoran1969, gdf, p=0.01, ax=axes[0])
lisa_cluster(Lmoran2008, gdf, p=0.01, ax=axes[1])
plt.tight_layout()
axes[0].axis("off")
axes[1].axis("off")
axes[0].set_title('Year 1969')
axes[1].set_title('Year 2008')
plt.show()
Directional LISA
Y = gdfSimple[['rel1969', 'rel2008']].values
Y.shape
np.random.seed(100)
r4 = Rose(Y, W, k=4)
r8 = Rose(Y, W, k=8)
Four segments
r4.plot_vectors();
r4.plot_origin();
r4.plot(Y[:,0]); # condition on starting relative income
#Number of vectors contained in each sector.
r4.counts
#Vector lengths
r4.r
# Psuedo p-values for the observed sector counts under the two-sided alternative
r4.permute()
r4.p
# Hypotheisis: The focal unit and its lag move in the same directions.
r4.permute(alternative='positive')
r4.p
# Hypotheisis: The focal unit and its lag move in oposite directions.
r4.permute(alternative='negative')
r4.p
Eight segments
# condition on starting relative income
r8.plot(Y[:,0]) ;
# Number of vectors contained in each sector.
r8.counts
# Vector lengths
r8.r
r8.permute()
r8.p
# origin standardized
r8.plot_origin();
Preparation for regression models
y = gdf['g1929_1994'].values
y_name = 'Growth 1929-1994'
x = np.array([gdf.ln_gp_1929]).T
x_name = 'Log GPDpc in 1929'
OLS
ols = OLS(y = y, x = x, w = W,
name_y=y_name, name_x = [x_name], name_w="Wqueen", name_ds='gdf',
white_test=True, spat_diag=True, moran=True)
print(ols.summary)
SAR
sar = ML_Lag(y=y, x=x, w=W,
name_y=y_name, name_x = [x_name], name_w="Wqueen", name_ds='gdf')
print(sar.summary)
SEM
sem = ML_Error(y=y, x=x, w=W,
name_y=y_name, name_x = [x_name], name_w="Wqueen", name_ds='gdf')
print(sem.summary)