import numpy as np
import pandas as pd
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import libpysal as ps
from mgwr.gwr import GWR, MGWR
from mgwr.sel_bw import Sel_BW
import scipy as sp
from matplotlib_scalebar.scalebar import ScaleBar
from mgwr.utils import shift_colormap, truncate_colormap
import itertools
import time
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
import statsmodels.api as statm
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
df = pd.read_csv("https://msachde1.carto.com/api/v2/sql?filename=poverty_rates&format=CSV&q=SELECT+*+FROM+poverty_rates+as+poverty_rates")
gdf = gpd.read_file("https://msachde1.carto.com/api/v2/sql?filename=indonesia_shapefile&format=SHP&q=SELECT+*+FROM+indonesia_shapes+as+indonesia_shapefile")
print(df.columns)
print(gdf.columns)
gdf.plot()
df_corr = df[['pov', 'gap', 'sev', 'mys', 'gdpgr', 'unemp','shr_agr', 'shr_ind', 'inv_shr', 'subs_rice', 'gdi']]
#Checking for any nulls in our attribute columns
df_corr.isnull().values.any()
sns.set(rc = {'figure.figsize':(15,10)})
cmap = sns.diverging_palette(240, 10, n=9, as_cmap=True)
ax=sns.heatmap(df_corr.corr(),cmap=cmap)
sns.set_style("whitegrid")
#distplot is deprecated in recent seaborn; histplot with a KDE overlay is the modern equivalent
sns.histplot(df['pov'], kde=True, stat='density')
#Log transformation check for poverty rate
sns.set_style("whitegrid")
sns.histplot(np.log(df['pov']), color='red', kde=True, stat='density')
sns.histplot(df['pov'], kde=True, stat='density')
from scipy.stats import normaltest
# normality test
stat, p = normaltest(df['pov'])
print('Statistics=%.3f, p=%.3f' % (stat, p))
# interpret
alpha = 0.05
if p > alpha:
    print('Sample looks Gaussian (fail to reject H0)')
else:
    print('Sample does not look Gaussian (reject H0)')
df['ln_pov'] = np.log(df['pov'])
from scipy.stats import normaltest
# normality test
stat, p = normaltest(df['ln_pov'])
print('Statistics=%.3f, p=%.3f' % (stat, p))
# interpret
alpha = 0.05
if p > alpha:
    print('Sample looks Gaussian (fail to reject H0)')
else:
    print('Sample does not look Gaussian (reject H0)')
pd.set_option('display.precision', 2)
df_corr.describe()
#Log transformation check for poverty gap index
sns.set_style("whitegrid")
sns.histplot(np.log(df['gap']), color='red', kde=True, stat='density')
sns.histplot(df['gap'], kde=True, stat='density')
np.isnan(df['pov']).any()
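#The attribute table and the shapefile come from separate Carto queries, and
#the columns below are copied over by position. A minimal sanity check
#(assumption: both queries return the districts in the same order):
assert len(df) == len(gdf), "row counts differ between attributes and shapes"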
gdf['poverty_rate'] = df['pov']
gdf['poverty_gap_index'] = df['gap']
#Define a mapping function for consistent maps
def mapp(col, name, title, bw_col=None):
    vmin = min(col[name])
    vmax = max(col[name])
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 7), facecolor='white')
    if bw_col:
        ax.set_title(title + ' (BW: ' + str(bw_col) + ')', fontsize=12)
    else:
        ax.set_title(title, fontsize=12)
    #Truncate or shift the diverging colormap so that white sits at zero
    cmap = plt.cm.seismic
    if (vmin < 0) & (vmax < 0):
        cmap = truncate_colormap(cmap, 0.0, 0.5)
    elif (vmin > 0) & (vmax > 0):
        cmap = truncate_colormap(cmap, 0.5, 1.0)
    else:
        cmap = shift_colormap(cmap, start=0.0, midpoint=1 - vmax/(vmax + abs(vmin)), stop=1.)
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=vmin, vmax=vmax))
    col.plot(name, cmap=sm.cmap, ax=ax, vmin=vmin, vmax=vmax, edgecolor=None, alpha=0.65)
    #Plot zero-valued (masked) units in white with a grey outline
    if (col[name] == 0).any():
        col[col[name] == 0].plot(color='white', ax=ax, edgecolor='grey', alpha=1)
    fig.tight_layout()
    sm._A = []
    cbar = fig.colorbar(sm, ax=ax, fraction=0.0175, pad=0.04)
    cbar.ax.tick_params(labelsize=12)
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)
    plt.show()
mapp(gdf,'poverty_rate',"Poverty Rate in Indonesia - 2010")
mapp(gdf,'poverty_gap_index',"Poverty Gap Index in Indonesia - 2010")
df_corr.columns
correlation = df_corr.corr(method='pearson')
columns = correlation.nlargest(10, 'pov').index
columns
correlation = df_corr.corr(method='pearson')
columns = correlation.nlargest(11, 'gap').index
columns
def fit_linear_reg(X, Y):
    #Fit linear regression model and return RSS and R squared values
    model_k = linear_model.LinearRegression(fit_intercept=True)
    model_k.fit(X, Y)
    RSS = mean_squared_error(Y, model_k.predict(X)) * len(Y)
    R_squared = model_k.score(X, Y)
    return RSS, R_squared
# Best subset selection
#Importing tqdm for the progress bar
import tqdm
#Initialization variables
Y = df.ln_pov
X = df_corr.drop(columns=['pov', 'gap'])
RSS_list, R_squared_list, feature_list = [], [], []
numb_features = []
#Looping over k = 1 to k = 9 features in X
for k in tqdm.trange(1, len(X.columns) + 1, desc='Loop...'):
    #Looping over all possible combinations: from 9 choose k
    for combo in itertools.combinations(X.columns, k):
        tmp_result = fit_linear_reg(X[list(combo)], Y)  #Store temp result
        RSS_list.append(tmp_result[0])                  #Append lists
        R_squared_list.append(tmp_result[1])
        feature_list.append(combo)
        numb_features.append(len(combo))
#Store in DataFrame
best_sub = pd.DataFrame({'Feature_count': numb_features,'RSS': RSS_list, 'R_squared':R_squared_list,'Covariates':feature_list})
best_min = best_sub[best_sub.groupby('Feature_count')['RSS'].transform(min) == best_sub['RSS']]
display(best_min.head(3))
best_sub['Min_RSS'] = best_sub.groupby('Feature_count')['RSS'].transform(min)
best_sub['Max_R_squared'] = best_sub.groupby('Feature_count')['R_squared'].transform(max)
best_sub.head()
fig = plt.figure(figsize = (16,6),facecolor='white')
ax = fig.add_subplot(1, 2, 1)
ax.scatter(best_sub['Feature_count'],best_sub.RSS, alpha = .2, color = 'darkblue' )
ax.set_xlabel('# Features')
ax.set_ylabel('RSS')
ax.set_title('RSS - Best subset selection')
ax.plot(best_sub['Feature_count'],best_sub.Min_RSS,color = 'r', label = 'Best subset')
ax.grid(False);ax.set_xticks([]);ax.set_yticks([]);ax.set_facecolor('white')
ax.legend()
ax = fig.add_subplot(1, 2, 2)
ax.scatter(best_sub['Feature_count'],best_sub.R_squared, alpha = .2, color = 'darkblue' )
ax.plot(best_sub['Feature_count'],best_sub.Max_R_squared,color = 'r', label = 'Best subset')
ax.set_xlabel('# Features')
ax.set_ylabel('R squared')
ax.set_title('R_squared - Best subset selection')
ax.legend()
# Hide grid lines
ax.grid(False);ax.set_xticks([]);ax.set_yticks([]);ax.set_facecolor('white')
plt.show()
best_Rsq=best_sub.sort_values(by='R_squared',ascending=False)[:1]
best_Rss=best_sub.sort_values(by='RSS',ascending=True)[:1]
best_Rsq['Covariates'].to_list()
best_Rss['Covariates'].to_list()
#Initialization variables
Y = df.ln_pov
X = df_corr.drop(columns=['pov', 'gap'])
k = 9
remaining_features = list(X.columns.values)
features = []
RSS_list, R_squared_list = [np.inf], [np.inf]  #Due to 1 indexing of the loop...
features_list = dict()
for i in range(1, k + 1):
    best_RSS = np.inf
    for combo in itertools.combinations(remaining_features, 1):
        RSS = fit_linear_reg(X[list(combo) + features], Y)  #Store temp result
        if RSS[0] < best_RSS:
            best_RSS = RSS[0]
            best_R_squared = RSS[1]
            best_feature = combo[0]
    #Updating variables for next loop
    features.append(best_feature)
    remaining_features.remove(best_feature)  #must be removed, or the same feature stays available
    #Saving values for plotting
    RSS_list.append(best_RSS)
    R_squared_list.append(best_R_squared)
    features_list[i] = features.copy()
print('Forward stepwise subset selection')
print('Number of features |', 'Features |', 'RSS')
display([(i,features_list[i], round(RSS_list[i])) for i in range(1,5)])
df1 = pd.concat([pd.DataFrame({'features':features_list}),pd.DataFrame({'RSS':RSS_list, 'R_squared': R_squared_list})], axis=1, join='inner')
df1['numb_features'] = df1.index
#Initializing useful variables
m = len(Y)
p = 9
hat_sigma_squared = (1/(m - p -1)) * min(df1['RSS'])
#Computing
df1['C_p'] = (1/m) * (df1['RSS'] + 2 * df1['numb_features'] * hat_sigma_squared )
df1['AIC'] = (1/(m*hat_sigma_squared)) * (df1['RSS'] + 2 * df1['numb_features'] * hat_sigma_squared )
df1['BIC'] = (1/(m*hat_sigma_squared)) * (df1['RSS'] + np.log(m) * df1['numb_features'] * hat_sigma_squared )
df1['R_squared_adj'] = 1 - ( (1 - df1['R_squared'])*(m-1)/(m-df1['numb_features'] -1))
df1
variables = ['C_p', 'AIC','BIC','R_squared_adj']
fig = plt.figure(figsize = (18,6))
for i, v in enumerate(variables):
    ax = fig.add_subplot(1, 4, i + 1)
    ax.plot(df1['numb_features'], df1[v], color='lightblue')
    ax.scatter(df1['numb_features'], df1[v], color='darkblue')
    if v == 'R_squared_adj':
        ax.plot(df1[v].idxmax(), df1[v].max(), marker='x', markersize=20)
    else:
        ax.plot(df1[v].idxmin(), df1[v].min(), marker='x', markersize=20)
    ax.set_xlabel('Number of predictors')
    ax.set_ylabel(v)
fig.suptitle('Subset selection using C_p, AIC, BIC, Adjusted R2', fontsize = 16)
plt.show()
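#One common (not the only) way to settle on a model size from the plots above:
#take the feature count that minimizes BIC, the most conservative criterion.
#df1 is indexed by number of features, so idxmin() returns k directly.
best_k = df1['BIC'].idxmin()
print('Best k by BIC:', best_k, '->', features_list[best_k])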
best_Rsq['Covariates'].to_list()
from statsmodels.stats.outliers_influence import variance_inflation_factor
X = df_corr[[ 'sev','mys' , 'gdpgr' , 'unemp', 'shr_agr', 'shr_ind' , 'inv_shr' , 'subs_rice' , 'gdi']]
# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
for i in range(len(X.columns))]
print(vif_data)
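#Flag features above the common VIF > 10 rule of thumb for severe
#multicollinearity (the cutoff is a convention, not part of the original code):
print(vif_data[vif_data["VIF"] > 10])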
gdf['gdi'] = df['gdi']
gdf['mys'] = df['mys']
mapp(gdf,'gdi',"Gender Development Index")
mapp(gdf,'mys',"Mean years of schooling")
sns.set_style("whitegrid")
sns.regplot(x=df['mys'], y=df['gdi'])
X = df_corr[[ 'sev','mys' , 'gdpgr' , 'unemp', 'shr_agr', 'shr_ind' , 'inv_shr' ]]
# VIF dataframe
vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
for i in range(len(X.columns))]
print(vif_data)
df['ln_pov'] = np.log(df['pov'])
X = df[['sev','mys' , 'gdpgr' , 'unemp', 'shr_agr', 'shr_ind' , 'inv_shr']].copy()
X_std = (X-X.mean(axis=0))/X.std(axis=0)
X_std=statm.add_constant(X_std)
y = df['ln_pov']
y_std = (y-y.mean(axis=0))/y.std(axis=0)
model = statm.OLS(y_std,X_std).fit()
predictions=model.predict(X_std)
model.summary()
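#Optional diagnostic, not in the original analysis: a Breusch-Pagan test for
#heteroskedastic OLS residuals, which often foreshadows the spatial
#nonstationarity explored below.
from statsmodels.stats.diagnostic import het_breuschpagan
bp_stat, bp_p, _, _ = het_breuschpagan(model.resid, X_std)
print('Breusch-Pagan p-value: %.3f' % bp_p)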
gdf['global_resid'] = model.resid
mapp(gdf,'global_resid',"Global residuals - OLS")
import esda
from esda.moran import Moran, Moran_Local
import libpysal
from libpysal import weights
from libpysal.weights.util import min_threshold_distance
W = weights.Rook.from_dataframe(gdf)
W.transform = 'r'
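#Indonesia is an archipelago, so Rook contiguity can leave units with no
#neighbors; worth checking before interpreting the row-standardized weights:
print('Units with no neighbors:', W.islands)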
import splot
from splot.esda import moran_scatterplot, plot_moran
from splot.libpysal import plot_spatial_weights
plot_spatial_weights(W, gdf);
moran_pov = Moran(gdf["global_resid"], W)
print(moran_pov.I,moran_pov.p_sim)
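#Visual companion to the statistic above, using the moran_scatterplot
#already imported from splot.esda:
fig, ax = moran_scatterplot(moran_pov, aspect_equal=True)
plt.show()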
df.columns
pov_y = df['ln_pov'].values.reshape((-1,1))
X = df[['sev','mys' , 'gdpgr', 'shr_agr' , 'inv_shr']].values
u = df['coord_x']
v = df['coord_y']
coords = list(zip(u,v))
pov_y = (pov_y - pov_y.mean(axis=0)) / pov_y.std(axis=0)
X = (X - X.mean(axis=0)) / X.std(axis=0)
gwr_selector = Sel_BW(coords, pov_y, X,spherical=True)
gwr_bw = gwr_selector.search()
gwr_bw
gwr_results = GWR(coords, pov_y, X, gwr_bw,spherical=True).fit()
gwr_results.summary()
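#Optional diagnostic (assumption: GWRResults exposes localR2, as in mgwr's
#Gaussian results) to see where the single-bandwidth model fits well:
gdf['gwr_localR2'] = gwr_results.localR2.reshape(-1)
mapp(gdf, 'gwr_localR2', "GWR local R-squared", gwr_bw)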
mgwr_selector = Sel_BW(coords, pov_y, X, multi=True,spherical=True)
mgwr_bw = mgwr_selector.search(verbose=True)
mgwr_results = MGWR(coords, pov_y, X, mgwr_selector,spherical=True).fit()
mgwr_results.summary()
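#Each MGWR covariate gets its own bandwidth (intercept first, in the order of
#the columns of X); smaller values indicate more local processes:
print(dict(zip(['intercept', 'sev', 'mys', 'gdpgr', 'shr_agr', 'inv_shr'], mgwr_bw)))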
gdf.columns
t_crit=mgwr_results.critical_tval()
#'sev','mys' , 'gdpgr', 'shr_agr' , 'inv_shr'
gdf['b_constant'] = mgwr_results.params[:,0]
gdf['b_sev'] = mgwr_results.params[:,1]
gdf['b_mys'] = mgwr_results.params[:,2]
gdf['b_gdpgr'] = mgwr_results.params[:,3]
gdf['b_shr_agr'] = mgwr_results.params[:,4]
gdf['b_inv_shr'] = mgwr_results.params[:,5]
gdf['t_constant'] = mgwr_results.tvalues[:,0]
gdf['t_sev'] = mgwr_results.tvalues[:,1]
gdf['t_mys'] = mgwr_results.tvalues[:,2]
gdf['t_gdpgr'] = mgwr_results.tvalues[:,3]
gdf['t_shr_agr'] = mgwr_results.tvalues[:,4]
gdf['t_inv_shr'] = mgwr_results.tvalues[:,5]
#'sev','mys' , 'gdpgr', 'shr_agr' , 'inv_shr'
gdf['constant'] = np.ones_like(df['sev'])
gdf['ln_pov'] = df['ln_pov']
org_cols = ['constant','sev','mys' , 'gdpgr', 'shr_agr' , 'inv_shr']
b_cols = ['b_constant','b_sev','b_mys' , 'b_gdpgr', 'b_shr_agr' , 'b_inv_shr']
bu_cols = ['bu_constant','bu_sev','bu_mys' , 'bu_gdpgr', 'bu_shr_agr' , 'bu_inv_shr']
bt_cols = ['bt_constant','bt_sev','bt_mys' , 'bt_gdpgr', 'bt_shr_agr' , 'bt_inv_shr']
t_cols = ['t_constant','t_sev','t_mys' , 't_gdpgr', 't_shr_agr' , 't_inv_shr']
#Unstandardize the MGWR coefficients back to the original data scale
for i in range(1, 6):
    gdf[bu_cols[i]] = (gdf[b_cols[i]] * np.std(gdf['ln_pov'])) / np.std(df[org_cols[i]])
#Keep only local estimates whose t-value exceeds the corrected critical value;
#insignificant locations stay NaN (filled with 0 below for mapping)
for i in range(6):
    gdf.loc[gdf[t_cols[i]] >= t_crit[i], bt_cols[i]] = gdf[b_cols[i]]
    gdf.loc[gdf[t_cols[i]] <= -t_crit[i], bt_cols[i]] = gdf[b_cols[i]]
gdf['bt_inv_shr']
np.isnan(gdf['bt_inv_shr']).all()
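#How many districts retain a significant local estimate for each surface
#(rows masked by the t-test above are still NaN at this point):
print(gdf[bt_cols].notna().sum())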
gdf=gdf.fillna(0.0)
mapp(gdf,'bt_constant',"Local intercept")
gdf['conv_mys'] = (np.exp(gdf['bt_mys'])-1)*100
mapp(gdf,'conv_mys',"Mean years of schooling - local estimate")
gdf['conv_shr_agr'] = (np.exp(gdf['bt_shr_agr'])-1)*100
mapp(gdf,'conv_shr_agr',"Share of agriculture to GDP - local estimate")
gdf['local_resid'] = mgwr_results.resid_response
mapp(gdf,'local_resid',"Local errors")
moran_pov = Moran(gdf["local_resid"], W)
print(moran_pov.I,moran_pov.p_sim)
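#A closing side-by-side (assumption: both result objects expose mgwr's aicc
#attribute); a lower AICc favors the multi-scale specification:
print('GWR AICc: %.2f | MGWR AICc: %.2f' % (gwr_results.aicc, mgwr_results.aicc))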