New Expansion of Plotly Express -- Based on Penn World Table Version 10.0
Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
#chart_studio.tools.set_credentials_file(username='econdata777', api_key='ADDhere')
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
#warnings.filterwarnings('ignore')
Import data
# Read the PWT 10.0 extract (with country identifiers) into `df`.
df = pd.read_csv(filepath_or_buffer="/work/data/PWT100+countryIDs.csv")
# Quick inspection: the table itself, its column labels, the column dtypes,
# and the distinct country names it contains.
df
df.columns
df.dtypes
df.CountryName.unique()
Describe variables
# Read the companion file with the dataset's variable descriptions into
# `df_def`, then display it.
df_def = pd.read_csv(filepath_or_buffer="/work/data/PWT100+countryIDs-descriptions.csv")
df_def
Transform data & Summarize data
Create key variables & Subset a sample
# Labor productivity: log of real GDP per worker.
df['lp'] = df['rgdpo'].div(df['emp']).pipe(np.log)
# Physical capital term: log of the capital-output ratio, scaled by
# alpha / (1 - alpha) with alpha assumed to be 1/3.
df['k'] = df['cn'].div(df['cgdpo']).pipe(np.log).mul((1/3)/(2/3))
# Human capital term: log of human capital per worker.
df['h'] = df['hc'].pipe(np.log)
# Aggregate efficiency (total factor productivity), obtained as the part of
# log labor productivity not accounted for by the two capital terms.
df['a'] = df['lp'].sub(df['k']).sub(df['h'])
# Keep the 1980-2019 window.
# np.arange excludes its stop value, so the stop is 2020 in order to keep 2019.
df0 = df.loc[df['year'].isin(np.arange(1980, 2020))]
# Keep only the identifiers, the grouping variables, population, and the
# four log variables created above.
df1 = df0.loc[:, [
    'CountryCode', 'CountryName', 'year',
    'SubRegion_PWT100', 'SubContinent', 'Continent',
    'incomegroup', 'nonoil', 'intermediate', 'OECD',
    'pop', 'lp', 'k', 'h', 'a',
]]
# Eliminate missing values: drop every row with a missing value in any column.
# The defaults of dropna already do exactly this. `how` and `thresh` must not
# be passed together: they are mutually exclusive, and recent pandas releases
# raise a TypeError when both are supplied (even as `thresh=None`).
df2 = df1.dropna()
df2
Descriptive statistics
# Summary statistics, rounded to two decimals, of the selected variables
# for the 2014 cross-section.
(
    df2.loc[df2['year'] == 2014,
            ['year', 'nonoil', 'intermediate', 'OECD', 'pop', 'lp', 'k', 'h', 'a']]
    .describe()
    .round(2)
)
Group data
# Mean, standard deviation, minimum and maximum of log labor productivity
# within each PWT sub-region, rounded to two decimals.
df2.groupby('SubRegion_PWT100')['lp'].agg(['mean', 'std', 'min', 'max']).round(2)
# The same four statistics within each sub-continent.
df2.groupby('SubContinent')['lp'].agg(['mean', 'std', 'min', 'max']).round(2)
Visualize data
Strip plot
# Animated strip plot of log labor productivity by sub-region, one frame per year.
# The x range is fixed because an animated figure does not re-fit its axes
# from frame to frame; without it, later years can fall outside the view.
# [6, 13] is the range used for `lp` in the other animated figures of this file.
px.strip(
    df2,
    x='lp',
    y='SubRegion_PWT100',
    color='SubRegion_PWT100',
    hover_name='CountryName',
    hover_data=['pop', 'lp', 'k', 'h', 'a'],
    labels={
        'lp': 'Labor productivity (in logs)',
        'SubRegion_PWT100': 'Subregion',
    },
    range_x=[6, 13],
    animation_frame='year',
)
# Animated strip plot of log labor productivity by income group, one frame per year.
# The x range is fixed because an animated figure does not re-fit its axes
# from frame to frame; without it, later years can fall outside the view.
# [6, 13] is the range used for `lp` in the other animated figures of this file.
px.strip(
    df2,
    x='lp',
    y='incomegroup',
    color='incomegroup',
    hover_name='CountryName',
    hover_data=['pop', 'lp', 'k', 'h', 'a'],
    labels={
        'lp': 'Labor productivity (in logs)',
        'incomegroup': 'Income groups',
    },
    range_x=[6, 13],
    animation_frame='year',
)
Line plots
# Time series of log labor productivity, one line per country.
px.line(
    df2,
    x='year',
    y='lp',
    color='CountryName',
    hover_name='CountryName',
    hover_data=['Continent', 'pop', 'lp'],
    labels={
        'lp': 'Labor productivity (in logs)',
        'CountryName': 'Country',
    },
)
# Country time series of log labor productivity, one panel per PWT
# sub-region, laid out two panels per row.
px.line(
    df2,
    x='year',
    y='lp',
    color='CountryName',
    hover_name='CountryName',
    facet_col='SubRegion_PWT100',
    facet_col_wrap=2,
    width=900,
    height=1200,
    title='Labor productivity across sub-regions (defined by PWT 10.0)',
    labels={
        'lp': 'Log LP',
        'SubRegion_PWT100': 'SR',
        'CountryName': 'Country',
    },
)
# Country time series of log labor productivity, one panel per income
# group, laid out two panels per row.
px.line(
    df2,
    x='year',
    y='lp',
    color='CountryName',
    hover_name='CountryName',
    facet_col='incomegroup',
    facet_col_wrap=2,
    width=1000,
    height=1000,
    title='Labor productivity across income groups (defined by JP (2020))',
    labels={
        'lp': 'Log LP',
        'incomegroup': 'IC group',
        'CountryName': 'Country',
    },
)
# Country time series of log labor productivity, restricted to the
# 'Latin America & Caribbean' sub-region.
px.line(
    df2.query("SubRegion_PWT100 == 'Latin America & Caribbean'"),
    x='year',
    y='lp',
    color='CountryName',
    hover_name='CountryName',
    title='Weird performance (?): labor productivity in Latin America & Caribbean',
    labels={
        'lp': 'Labor productivity (in logs)',
        'CountryName': 'Country',
    },
)
Scatter plots
Regression
# Animated scatter of log labor productivity against log aggregate
# efficiency, colored by sub-region, with a single OLS trendline fitted
# across all sub-regions. Axis ranges are fixed for the animation.
px.scatter(
    df2,
    x='a',
    y='lp',
    color='SubRegion_PWT100',
    hover_name='CountryName',
    animation_frame='year',
    trendline='ols',
    trendline_scope='overall',
    range_x=[3.5, 11.5],
    range_y=[6, 13],
    labels={
        'a': 'Aggregate efficiency (in logs)',
        'lp': 'Labor productivity (in logs)',
        'SubRegion_PWT100': 'Sub-region',
    },
)
# Animated scatter of log labor productivity against log aggregate
# efficiency with one OLS trendline per sub-region.
# Axis ranges are fixed because an animated figure does not re-fit its axes
# from frame to frame; the values match the other animated scatters in this file.
px.scatter(
    df2,
    x='a',
    y='lp',
    color='SubRegion_PWT100',
    hover_name='CountryName',
    animation_frame='year',
    trendline='ols',
    range_x=[3.5, 11.5],
    range_y=[6, 13],
)
Margins
# 1980 cross-section: scatter with per-sub-region OLS trendlines and box
# plots on both margins.
px.scatter(
    df2.query("year == 1980"),
    x='a',
    y='lp',
    color='SubRegion_PWT100',
    hover_name='CountryName',
    marginal_x='box',
    marginal_y='box',
    trendline='ols',
)
# 2019 cross-section: scatter with per-sub-region OLS trendlines and box
# plots on both margins.
px.scatter(
    df2.query("year == 2019"),
    x='a',
    y='lp',
    color='SubRegion_PWT100',
    hover_name='CountryName',
    marginal_x='box',
    marginal_y='box',
    trendline='ols',
)
Animated
# Animated bubble chart: marker size follows population, markers are
# tracked across frames by country, and axis ranges are fixed.
px.scatter(
    df2,
    x='a',
    y='lp',
    color='SubRegion_PWT100',
    size='pop',
    size_max=60,
    hover_name='CountryName',
    animation_frame='year',
    animation_group='CountryName',
    range_x=[3.5, 11.5],
    range_y=[6, 13],
    labels={
        'a': 'Aggregate efficiency (in logs)',
        'lp': 'Labor productivity (in logs)',
        'SubRegion_PWT100': 'Sub-region',
    },
)
Facets
# Animated bubble chart with OLS trendlines, split into one panel per
# value of the `nonoil` column.
px.scatter(
    df2,
    x='a',
    y='lp',
    color='SubRegion_PWT100',
    size='pop',
    size_max=60,
    facet_col='nonoil',
    hover_name='CountryName',
    animation_frame='year',
    animation_group='CountryName',
    trendline='ols',
    range_x=[3.5, 11.5],
    range_y=[6, 13],
    labels={
        'a': 'Aggregate efficiency (in logs)',
        'lp': 'Labor productivity (in logs)',
        'SubRegion_PWT100': 'Sub-region',
    },
)
# Animated bubble chart with OLS trendlines, split into one panel per
# value of the `intermediate` column.
px.scatter(
    df2,
    x='a',
    y='lp',
    color='SubRegion_PWT100',
    size='pop',
    size_max=60,
    facet_col='intermediate',
    hover_name='CountryName',
    animation_frame='year',
    animation_group='CountryName',
    trendline='ols',
    range_x=[3.5, 11.5],
    range_y=[6, 13],
    labels={
        'a': 'Aggregate efficiency (in logs)',
        'lp': 'Labor productivity (in logs)',
        'SubRegion_PWT100': 'Sub-region',
    },
)
# Animated bubble chart with OLS trendlines, split into one panel per
# value of the `OECD` column.
px.scatter(
    df2,
    x='a',
    y='lp',
    color='SubRegion_PWT100',
    size='pop',
    size_max=60,
    facet_col='OECD',
    hover_name='CountryName',
    animation_frame='year',
    animation_group='CountryName',
    trendline='ols',
    range_x=[3.5, 11.5],
    range_y=[6, 13],
    labels={
        'a': 'Aggregate efficiency (in logs)',
        'lp': 'Labor productivity (in logs)',
        'SubRegion_PWT100': 'Sub-region',
    },
)
# World map of log labor productivity for the 2019 cross-section, with
# countries matched through the `CountryCode` column.
# The color label states that the values are logs, in line with the
# labels used for `lp` in the other figures.
fig = px.choropleth(
    df2.query("year == 2019"),
    locations='CountryCode',
    color='lp',
    hover_name='CountryName',
    color_continuous_scale=px.colors.sequential.Plasma,
    projection='natural earth',
    labels={
        'CountryCode': 'ISO country code',
        'lp': 'Labor productivity (in logs)',
    },
)
fig.show()
# Animated world map of log labor productivity, one frame per year.
# The color range is pinned to the minimum and maximum of `lp` over the
# whole sample so that a given color means the same value in every frame.
# The color label states that the values are logs, in line with the
# labels used for `lp` in the other figures.
fig = px.choropleth(
    df2,
    locations='CountryCode',
    color='lp',
    hover_name='CountryName',
    animation_frame="year",
    range_color=[df2['lp'].min(), df2['lp'].max()],
    color_continuous_scale=px.colors.sequential.Plasma,
    projection='natural earth',
    labels={
        'CountryCode': 'ISO country code',
        'lp': 'Labor productivity (in logs)',
    },
)
fig.show()