Motivation
Setup
#!pip install --quiet wbgapi
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import chart_studio
# Registers the Chart Studio account used by the (currently commented-out)
# `save2cs.plot(...)` upload further down in this notebook.
# NOTE(review): 'HERE' looks like a placeholder -- presumably it has to be
# replaced with a real API key before uploading. Do not commit a real key.
chart_studio.tools.set_credentials_file(username='econdata777', api_key='HERE')
import chart_studio.plotly as save2cs
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
import inequality
import wbgapi as wb
WBGAPI library
Sources
wb.source.info()
wb.series.info(db = 12) # Series in the Education Statistics Database
Economies
wb.economy.info()
wb.economy.DataFrame()
wb.economy.info(wb.income.members('HIC')) # high-income economies
Regions
wb.region.info()
# Total population of African economies, sampled every second year (2010-2018)
wb.data.DataFrame(
    'SP.POP.TOTL',
    economy=wb.region.members('AFR'),
    time=range(2010, 2020, 2),
)
Income groups
wb.income.info()
Topics
wb.topic.info()
Series
wb.series.info() # WDI by default
wb.series.info(db = 6) # Series in the Debt Statistics database
wb.series.info(topic = 3) # Series in the topic Economy & Growth
Search series
#Not working
#wb.series.info(q='gdp')
Metadata of series
wb.series.metadata.get('NY.GNS.ICTR.GN.ZS')
wb.series.metadata.get('NY.GDP.PCAP.PP.KD')
Import data
Function DataFrame
help(wb.data.DataFrame)
# Population for the five years 2010-2014, with economy names attached
wb.data.DataFrame(
    'SP.POP.TOTL',
    labels=True,
    time=range(2010, 2015),
)
# Latest available poverty and income data for Latin American countries (LAC)
wb.data.DataFrame(
    ['SI.POV.NAHC', 'NY.GDP.PCAP.CD'],
    economy=wb.region.members('LAC'),
    labels=True,
    mrnev=1,           # most recent non-empty value (the time period varies)
    timeColumns=True,  # show the time dimension for each series/economy
)
wb.series.metadata.get('SI.POV.NAHC')
# Note: poverty rates based on national poverty lines are not necessarily comparable across countries
wb.series.metadata.get('EN.ATM.CO2E.PC')
# Latest CO2 emissions per capita for each country, joined with its income group
(
    wb.data.DataFrame('EN.ATM.CO2E.PC', mrnev=1, labels=True)
    .join(wb.economy.DataFrame()['incomeLevel'])
)
# The ten largest emitters per capita
(
    wb.data.DataFrame('EN.ATM.CO2E.PC', mrnev=1, labels=True)
    .sort_values('EN.ATM.CO2E.PC', ascending=False)
    .head(10)
)
Long and wide panel data
# Import GDP per capita, PPP (current international $)
GDPpc_long = wb.data.DataFrame(
    ['NY.GDP.PCAP.PP.CD'],
    time=range(2000, 2021),
    columns='series',   # the series becomes the value column (long layout)
    labels=True,
    skipAggs=True,
    skipBlanks=True,
)
# Turn the index levels into ordinary columns so they can be used by name later
GDPpc_long = GDPpc_long.reset_index()
GDPpc_long
# Rearrange the data into wide form and drop countries with a missing value in any year
GDPpc_wide = GDPpc_long.pivot_table(
    index='economy',
    columns='time',
    values='NY.GDP.PCAP.PP.CD',
).dropna()
GDPpc_wide
# Series NY.GDP.PCAP.CD for the members of region 'EMU', years 1960-2019
wb.data.DataFrame(
    'NY.GDP.PCAP.CD',
    economy=wb.region.members('EMU'),
    time=range(1960, 2020),
)
# Same request, keeping only the rows without any missing value
wb.data.DataFrame(
    'NY.GDP.PCAP.CD',
    economy=wb.region.members('EMU'),
    time=range(1960, 2020),
).dropna()
# Series EG.ELC.RNEW.ZS for five economies, every fifth year from 2000 to 2015
ren = wb.data.DataFrame(
    'EG.ELC.RNEW.ZS',
    economy=['DEU', 'FRA', 'ESP', 'GBR', 'USA'],
    time=range(2000, 2016, 5),
)
ren
Countries and regions
# Economy-level attributes (aggregates skipped); the index is moved into a
# regular column so that it can serve as a merge key.
regionalIndentifiers = wb.economy.DataFrame(skipAggs=True)
regionalIndentifiers = regionalIndentifiers.reset_index()
regionalIndentifiers
Merge datasets
# Merge regional identifiers into the long-form panel data (one-to-many merge)
GDPpc_longWITHri = GDPpc_long.merge(
    regionalIndentifiers,
    how="left",          # keep every row of the long panel
    left_on="economy",
    right_on="index",
)
GDPpc_longWITHri
# One line per country on a linear y axis, in a single panel.
# (log_y=True, facet_col='incomeLevel' and facet_col_wrap=2 are left disabled here.)
px.line(
    GDPpc_longWITHri,
    x='Time',
    y='NY.GDP.PCAP.PP.CD',
    color='Country',
    labels={'NY.GDP.PCAP.PP.CD': 'GDP per capita'},
)
# Same chart on a log y axis, one panel per income level (two panels per row)
px.line(
    GDPpc_longWITHri,
    x='Time',
    y='NY.GDP.PCAP.PP.CD',
    color='Country',
    facet_col='incomeLevel',
    facet_col_wrap=2,
    log_y=True,
    labels={'NY.GDP.PCAP.PP.CD': 'GDP per capita'},
)
# Keep a handle on the faceted log-scale figure so it can be saved to Chart Studio
figPlotly20220520 = px.line(
    GDPpc_longWITHri,
    x='Time',
    y='NY.GDP.PCAP.PP.CD',
    color='Country',
    facet_col='incomeLevel',
    facet_col_wrap=2,
    log_y=True,
    labels={'NY.GDP.PCAP.PP.CD': 'GDP per capita'},
)
# Upload step (currently disabled):
#save2cs.plot(figPlotly20220520, filename = 'figPlotly20220520', auto_open=True)
Visualize data
wb.series.metadata.get('EG.ELC.RNEW.ZS')
Bar plots
# Bar chart straight from the wide frame returned by wbgapi
wb.data.DataFrame(
    'EG.ELC.RNEW.ZS',
    economy=['DEU', 'FRA', 'ESP', 'GBR', 'USA'],
    time=range(2000, 2016, 5),
).plot.bar();
Line plots
# Transpose first so that each economy becomes a column, i.e. one line each
wb.data.DataFrame(
    'EG.ELC.RNEW.ZS',
    economy=['DEU', 'FRA', 'ESP', 'GBR', 'USA'],
    time=range(2000, 2016, 5),
).T.plot();
# Same request with numeric time keys, reshaped for plotly express
df8 = wb.data.DataFrame(
    'EG.ELC.RNEW.ZS',
    economy=['DEU', 'FRA', 'ESP', 'GBR', 'USA'],
    time=range(2000, 2016, 5),
    numericTimeKeys=True,
)
# Economies become columns; the former column labels land in column 'index'
df8 = df8.T.reset_index()
df8
px.line(
    df8,
    x='index',
    y=['DEU', 'FRA', 'ESP', 'GBR', 'USA'],
    labels={
        'index': '',
        'value': 'Renewable electricity share',
        'variable': 'Country',
    },
)
wb.series.metadata.get('NY.GDP.PCAP.CD')
# What about this other indicator?
wb.series.metadata.get('NY.GDP.PCAP.PP.KD')
# Basic chart of income growth for countries in the South Asia region (SAS)
wb.data.DataFrame(
    'NY.GDP.PCAP.PP.KD',
    economy=wb.region.members('SAS'),
    time=range(2000, 2021),
    labels=True,
    numericTimeKeys=True,
).set_index('Country').T.plot(title='GDP per capita in South Asia');
# Wide table for the same region, here limited to 2000-2015
wb.data.DataFrame(
    'NY.GDP.PCAP.PP.KD',
    economy=wb.region.members('SAS'),
    time=range(2000, 2016),
    labels=True,
    numericTimeKeys=True,
)
# Import GDP per capita (PPP, constant international $) for South Asia in long
# form. Blank observations are kept (skipBlanks is deliberately not enabled).
GDPpc_long_SAS = (
    wb.data.DataFrame(
        ['NY.GDP.PCAP.PP.KD'],
        economy=wb.region.members('SAS'),
        time=range(2000, 2021),
        columns='series',
        labels=True,
        skipAggs=True,
    )
    .reset_index()
    .sort_values(['economy', 'Time'])
)
GDPpc_long_SAS
# One line per South Asian country
px.line(
    GDPpc_long_SAS,
    x='Time',
    y='NY.GDP.PCAP.PP.KD',
    color='Country',
    labels={'NY.GDP.PCAP.PP.KD': 'GDP per capita'},
)
Scatter plots
# Import GDP per capita and secondary school enrollment for 2017-2019.
# range() excludes its stop value, so the stop has to be 2020 for 2019 to be
# part of the request (a stop of 2019 would fetch 2017 and 2018 only).
df2017_2019 = wb.data.DataFrame(
    ['NY.GDP.PCAP.PP.KD', 'SE.SEC.NENR'],
    time=range(2017, 2020),
    labels=True,
    skipAggs=True,
    skipBlanks=True,
    columns='series',
).reset_index()
df2017_2019
# Cross-section for 2017: enrollment against GDP per capita (log y axis),
# with an OLS trend line fitted on the log of y.
# (color="region" and size="pop", size_max=60 are left disabled here.)
px.scatter(
    df2017_2019.query("Time == '2017'"),
    x="SE.SEC.NENR",
    y="NY.GDP.PCAP.PP.KD",
    hover_name="Country",
    log_y=True,
    trendline="ols",
    trendline_options={"log_y": True},
    labels={
        "SE.SEC.NENR": "School enrollment, secondary (% net)",
        "NY.GDP.PCAP.PP.KD": "GDP per capita",
    },
)
Maps
# World map of the 2017 cross-section, coloured by GDP per capita;
# the 'economy' column supplies the location codes.
px.choropleth(
    df2017_2019.query("Time == '2017'"),
    locations="economy",
    color="NY.GDP.PCAP.PP.KD",
    hover_name="Country",
    projection="natural earth",
    color_continuous_scale=px.colors.sequential.Plasma,
    labels={"NY.GDP.PCAP.PP.KD": "GDP per capita"},
)
Cross-country inequality
# Wide panel of series NY.GDP.PCAP.KD for the members of 'WLD', 1970-2020,
# keeping only the economies with a value in every year.
GDPpc_wide = wb.data.DataFrame(
    'NY.GDP.PCAP.KD',
    economy=wb.region.members('WLD'),
    time=range(1970, 2021),
    skipAggs=True,
).dropna()
GDPpc_wide
# Coefficient of variation (scipy.stats.variation) of every column of the wide panel
cv = GDPpc_wide.apply(stats.variation, axis = 0)
def gini_by_col(column):
    """Return the Gini coefficient of a single DataFrame column.

    The column's raw values are passed to ``inequality.gini.Gini`` and the
    ``g`` attribute of the resulting object is returned.
    """
    values = column.values
    return inequality.gini.Gini(values).g


# One Gini coefficient per column of the wide panel
gini = GDPpc_wide.apply(gini_by_col, axis=0)
def theil_by_col(column):
    """Return the Theil index of a single DataFrame column.

    The column's raw values are passed to ``inequality.theil.Theil`` and the
    ``T`` attribute of the resulting object is returned.
    """
    values = column.values
    return inequality.theil.Theil(values).T


# One Theil index per column of the wide panel
theil = GDPpc_wide.apply(theil_by_col, axis=0)
# Collect the three dispersion measures side by side, indexed like the columns
# of the wide panel.
df = pd.DataFrame(dict(cv=cv, gini=gini, theil=theil))
df.round(2)
# One chart per measure, then Theil and Gini together
px.line(df, x=df.index, y='gini')
px.line(df, x=df.index, y='cv')
px.line(df, x=df.index, y='theil')
px.line(df, x=df.index, y=['theil', 'gini'])
World evolution
# Unweighted mean across economies for every column of the wide panel
df = GDPpc_wide.mean(axis=0)
px.line(df, x=df.index, y=df.values)