Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
#chart_studio.tools.set_credentials_file(username='econdata777', api_key='ADDhere')
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
#warnings.filterwarnings('ignore')
Import data
# Read the panel dataset directly from the project's GitHub repository.
df1 = pd.read_csv(
    "https://raw.githubusercontent.com/quarcs-lab/mendez2020-convergence-clubs-code-data/master/assets/dat.csv"
)
# Alternative that parses the 'year' column as dates while reading:
#df1 = pd.read_csv("https://raw.githubusercontent.com/quarcs-lab/mendez2020-convergence-clubs-code-data/master/assets/dat.csv", parse_dates =['year'])

# Quick inspection: full frame, column labels, dtypes and the country list.
df1
#df1.head()
df1.columns
df1.dtypes
df1["country"].unique()
Dataset definitions
# Import the definitions (data dictionary) that accompany the dataset.
# The path uses a single slash after the branch name, matching the URL used
# for the main data file; the previous 'master//assets' form relied on the
# server tolerating an empty path segment.
df1_def = pd.read_csv(
    "https://raw.githubusercontent.com/quarcs-lab/mendez2020-convergence-clubs-code-data/master/assets/dat-definitions.csv"
)
df1_def
Descriptive statistics
# Summary statistics of df1, rounded to two decimals.
df1.describe().round(decimals=2)
Prepare data
select
# A single column can be read as an attribute of the frame.
df1.GDPpc
# Several columns are selected by passing a list of labels.
df2 = df1[
    ["country", "year", "GDPpc", "lp", "kp", "TFP"]
]
df2
query
# Cross-section for the year 2014 (keeps region and hi1990 for grouping).
df_2014 = df1[
    ["country", "region", "hi1990", "year", "GDPpc", "lp", "kp", "TFP"]
].query("year == 2014")
df_2014

# All rows for a single country.
df_china = df1[
    ["country", "year", "GDPpc", "lp", "kp", "TFP"]
].query("country == 'China'")
df_china

# Several countries in one year: a list on the right-hand side of `==`
# inside query() matches any of its elements.
df1[
    ["country", "year", "lp", "kp", "TFP", "GDPpc"]
].query("country==['China', 'Japan','Vietnam'] and year == 2014")

# Define the logical conditions as boolean masks over df1
is_Asia = df1["region"] == "Asia"
is_HighIncome = df1["hi1990"] == "yes"
is_1990 = df1["year"] == 1990
# Apply the logical conditions: rows must satisfy all three masks
df1[is_Asia & is_HighIncome & is_1990]
groupby
# Regional mean and standard deviation for the 2014 cross-section.
# Only numeric columns are aggregated: 'country' and 'hi1990' hold text, and
# recent pandas versions raise a TypeError on mean()/std() instead of
# silently dropping such columns. Selecting the columns explicitly works on
# both old and new pandas releases.
df_2014.groupby('region')[['year', 'GDPpc', 'lp', 'kp', 'TFP']].mean().round(2)
df_2014.groupby('region')[['year', 'GDPpc', 'lp', 'kp', 'TFP']].std().round(2)
# Several statistics at once for one variable, by region
#df_2014.groupby('region')['GDPpc'].agg(['mean', 'std', 'min', 'max']).round(2)
df_2014.groupby('region').GDPpc.agg(['mean', 'std', 'min', 'max']).round(2)
# Same set of statistics for column 's', by year over the full panel
df1.groupby('year').s.agg(['mean', 'std', 'min', 'max']).round(2)
pivot_table
# Pivot panel data from long form to wide form: one row per country
# (with its region and hi1990 labels) and one column of 'h' per year.
df1_h_wide = df1.pivot_table(
             index = ['country', 'region', 'hi1990'],
             columns = 'year',
             values = 'h').reset_index(drop=False)
# Make sure the column names are strings
df1_h_wide.columns = df1_h_wide.columns.astype(str)
df1_h_wide
df1_h_wide.describe().round(2)
# Standard deviation of each year column, plotted as a line.
# After reset_index the frame also carries the text columns 'country',
# 'region' and 'hi1990'; numeric_only=True keeps std() from raising a
# TypeError on them in recent pandas versions.
df1_h_wide.std(numeric_only=True).round(2).plot();
loc
# Label-based slice: every row, columns from '1990' through '2014' inclusive.
df1_h_wide.loc[:, "1990":"2014"]
# Standard deviation of each selected year column, drawn as a line plot.
df1_h_wide.loc[:, "1990":"2014"].std().round(2).plot();
isin
# Keep the rows whose country is one of the listed names.
df1[df1["country"].isin(["Vietnam", "China", "Japan"])]
Visualize data
Strip plot
# Strip plot of TFP for the 2014 cross-section; hovering shows the country.
px.strip(df_2014, x="TFP", hover_name="country")
#fig1 = px.strip(df_2014, x = 'kp', hover_name= 'country')
#save2cs.plot(fig1, filename = 'figureName', auto_open=True)

# Strip plot of TFP split and coloured by region, animated over the years.
px.strip(
    df1,
    x="TFP",
    y="region",
    hover_name="country",
    hover_data=["lp", "kp"],
    color="region",
    animation_frame="year",
)
Line plots
# One line per country over the whole panel.
px.line(df1, x="year", y="log_tfp", color="country")

# Same lines, one facet per region laid out two facets per row.
px.line(
    df1,
    x="year",
    y="log_tfp",
    color="country",
    facet_col="region",
    facet_col_wrap=2,
    height=800,
)

# A single country.
px.line(df1.query("country=='China'"), x="year", y="log_tfp")

# Same figure kept in a variable so it can be written to an HTML file.
fig1 = px.line(df1.query("country=='China'"), x="year", y="log_tfp")
fig1.write_html("fig1.html")

# A handful of selected countries.
px.line(
    df1.query("country==['China', 'Japan','Vietnam']"),
    x="year",
    y="log_tfp",
    color="country",
)

# Every country of one region.
px.line(
    df1.query("region=='Americas'"),
    x="year",
    y="log_tfp",
    color="country",
)
Histogram
# Histogram of TFP in 2014, coloured by hi1990, with a rug plot on the margin.
px.histogram(
    df_2014,
    x="TFP",
    hover_name="country",
    color="hi1990",
    marginal="rug",
)

# Histogram of log_tfp with a marginal box plot, animated over the years.
px.histogram(
    df1,
    x="log_tfp",
    color="hi1990",
    hover_name="country",
    marginal="box",
    animation_frame="year",
)
Box plot
# Box plot of log_tfp, one box per hi1990 value, animated over the years.
px.box(
    df1,
    x="log_tfp",
    color="hi1990",
    hover_name="country",
    animation_frame="year",
)
Violin plot
# Violin plot of log_tfp by hi1990 with an inner box and all points drawn;
# the x range is fixed so the axis does not move between animation frames.
px.violin(
    df1,
    y="hi1990",
    x="log_tfp",
    range_x=[4, 9],
    color="hi1990",
    box=True,
    hover_name="country",
    animation_frame="year",
    points="all",
)
ECDF
# Empirical cumulative distribution of log_tfp by hi1990, animated over the
# years, with a fixed x range.
px.ecdf(
    df1,
    x="log_tfp",
    range_x=[4, 9],
    color="hi1990",
    hover_name="country",
    animation_frame="year",
)
Treemap plot
# Treemap for 2014: region -> country hierarchy, tiles sized by pop and
# coloured by log_tfp.
px.treemap(
    df1.query("year == 2014"),
    color="log_tfp",
    values="pop",
    path=["region", "country"],
    hover_name="country",
)
Sunburst plot
# Sunburst for 2014: region -> country hierarchy, sectors sized by pop and
# coloured by log_tfp.
px.sunburst(
    df1.query("year == 2014"),
    color="log_tfp",
    values="pop",
    path=["region", "country"],
    hover_name="country",
)
Scatter plots
Simple
# Scatter of log_tfp against log_lp, coloured by region, animated over years.
px.scatter(
    df1,
    x="log_lp",
    y="log_tfp",
    color="region",
    hover_name="country",
    animation_frame="year",
)
Regression
# 1990 cross-section with a single OLS trendline fitted over all points
# (trendline_scope="overall") while the markers stay coloured by hi1990.
px.scatter(
    df1.query("year == 1990"),
    x="log_lp",
    y="log_tfp",
    color="hi1990",
    hover_name="country",
    hover_data=["region"],
    trendline="ols",
    trendline_scope="overall",
)
Regression by groups
# OLS trendlines with markers coloured by hi1990, animated over the years.
px.scatter(
    df1,
    x="log_lp",
    y="log_tfp",
    color="hi1990",
    hover_name="country",
    animation_frame="year",
    trendline="ols",
)
Lowess regression
# Lowess trendlines with markers coloured by hi1990, animated over the years;
# both axis ranges are fixed so the frames share the same scale.
px.scatter(
    df1,
    x="log_lp",
    y="log_tfp",
    range_x=[6, 12],
    range_y=[4, 9],
    color="hi1990",
    hover_name="country",
    animation_frame="year",
    trendline="lowess",
)
Margins
# 1990 cross-section with OLS trendlines and box plots on both margins.
px.scatter(
    df1.query("year == 1990"),
    x="log_lp",
    y="log_tfp",
    hover_name="country",
    color="hi1990",
    trendline="ols",
    marginal_x="box",
    marginal_y="box",
)
Multivariate
# Bubble chart for 1990: colour encodes region, marker size encodes pop,
# and the labels mapping supplies readable axis/legend titles.
px.scatter(
    df1.query("year == 1990"),
    x="log_lp",
    y="log_tfp",
    color="region",
    size="pop",
    size_max=60,
    hover_name="country",
    labels={
        "log_lp": "Labor productivity in 1990 (in logs)",
        "log_tfp": "Aggregate efficiency in 1990 (in logs)",
        "region": "Continent",
        "pop": "Population",
    },
)
Customized
# Customized bubble chart for 1990: log_lp on the x axis, log_tfp on the
# y axis, colour by region and marker size by pop.
fig = px.scatter(df1.query("year == 1990"), 
                y="log_tfp",
                x="log_lp", 
                log_x = False,
                color = "region",
                size ="pop", size_max=60,
                hover_name = "country", 
                height =500, width=800, 
                template = "simple_white",
                color_discrete_sequence=px.colors.qualitative.G10,
                #title = "Year 1990",
                labels=dict(region = "Continent",
                            pop = "Population",
                            log_lp = "Labor productivity (in logs)",
                            log_tfp  = "Aggregate efficiency (in logs)")
                            )
# Horizontal legend placed above the plotting area, right-aligned, no title.
fig.update_layout(font_family = "Rockwell",
                  legend=dict(orientation = "h", title="", y=1.1, x=1, xanchor="right", yanchor="bottom"))
# Dotted reference lines at the 1990 sample means.
# A horizontal line is positioned by a y value, so it must use the mean of
# the y variable (log_tfp); a vertical line is positioned by an x value, so
# it must use the mean of the x variable (log_lp). The two were previously
# swapped, which drew each line at the other variable's mean.
fig.add_hline(y=df1.query("year == 1990")['log_tfp'].mean(), line_width=1, line_dash="dot")
fig.add_vline(x=df1.query("year == 1990")['log_lp'].mean(), line_width=1, line_dash="dot")
fig.show()
# Animated bubble chart over the years with fixed axis ranges, extra hover
# fields and readable labels.
px.scatter(
    df1,
    animation_frame="year",
    x="log_lp",
    y="log_tfp",
    range_x=[6, 12],
    range_y=[4, 9],
    color="region",
    size="pop",
    size_max=60,
    hover_name="country",
    hover_data=["log_h", "log_GDPpc"],
    labels={
        "log_lp": "Labor productivity (in logs)",
        "log_tfp": "Aggregate efficiency (in logs)",
        "region": "Continent",
        "pop": "Population",
    },
)

# Animated bubble chart with OLS trendlines, one facet column per hi1990 value.
px.scatter(
    df1,
    x="log_lp",
    y="log_tfp",
    color="region",
    size="pop",
    size_max=60,
    trendline="ols",
    hover_name="country",
    facet_col="hi1990",
    animation_frame="year",
)
# Animated density contours of log_tfp against log_lp; the three variants
# differ only in the kind of marginal plot drawn on each axis.

# Marginal histograms.
px.density_contour(
    df1,
    x="log_lp",
    y="log_tfp",
    hover_name="country",
    marginal_x="histogram",
    marginal_y="histogram",
    animation_frame="year",
)

# Marginal box plots.
px.density_contour(
    df1,
    x="log_lp",
    y="log_tfp",
    marginal_x="box",
    marginal_y="box",
    animation_frame="year",
)

# Marginal rug plots.
px.density_contour(
    df1,
    x="log_lp",
    y="log_tfp",
    hover_name="country",
    marginal_x="rug",
    marginal_y="rug",
    animation_frame="year",
)
# Filled, labelled density contours for the 2014 cross-section.
den = px.density_contour(df1.query("year == 2014"), x="log_lp", y="log_tfp")
den.update_traces(
    contours_coloring="fill",
    contours_showlabels=True,
)
den.show()

# Same figure rebuilt with the Viridis colour scale.
den = px.density_contour(df1.query("year == 2014"), x="log_lp", y="log_tfp")
den.update_traces(
    contours_coloring="fill",
    contours_showlabels=True,
    colorscale="Viridis",
)
den.show()
3D
# 3D scatter for 1990: log_lp, log_tfp and pop on the three axes,
# coloured by region.
px.scatter_3d(
    df1.query("year == 1990"),
    x="log_lp",
    y="log_tfp",
    z="pop",
    color="region",
    hover_name="country",
)
# World map of log_tfp in 1990; countries are located by the 'isocode' column.
px.choropleth(
    df1.query("year == 1990"),
    locations="isocode",
    color="log_tfp",
    hover_name="country",
    color_continuous_scale=px.colors.sequential.Plasma,
    projection="natural earth",
)

# Same map over the full panel, animated over the years.
px.choropleth(
    df1,
    locations="isocode",
    color="log_tfp",
    hover_name="country",
    animation_frame="year",
    color_continuous_scale=px.colors.sequential.Plasma,
    projection="natural earth",
)