Plotly Express
Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
#chart_studio.tools.set_credentials_file(username='econdata777', api_key='ADDhere')
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
#warnings.filterwarnings('ignore')
Import data
# Read the main dataset directly from the project's GitHub repository.
df1 = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/quarcs-lab/mendez2020-convergence-clubs-code-data/master/assets/dat.csv"
)
# Alternative left for reference: parse the `year` column as dates while reading.
#df1 = pd.read_csv("https://raw.githubusercontent.com/quarcs-lab/mendez2020-convergence-clubs-code-data/master/assets/dat.csv", parse_dates =['year'])

# Quick inspection: the frame itself, its column labels, the column dtypes,
# and the distinct values of the `country` column.
df1
df1.columns
df1.dtypes
df1.country.unique()
Dataset definitions
# Import the definitions of the dataset's variables.
# The path uses a single slash after `master`, matching the URL form used to
# load `df1`; the original had a stray double slash (`master//assets`).
df1_def = pd.read_csv(
    "https://raw.githubusercontent.com/quarcs-lab/mendez2020-convergence-clubs-code-data/master/assets/dat-definitions.csv"
)
df1_def
Descriptive statistics
# Summary statistics of df1, rounded to two decimals.
df1.describe().round(decimals=2)
Prepare data
select
# Keep only the identifier columns and the four log-transformed variables.
df2 = df1.loc[
    :,
    ['country', 'year', 'log_GDPpc', 'log_h', 'log_ky', 'log_tfp'],
]
df2
query
# Rows for China only, restricted to the identifier and log-variable columns.
df_chn = (
    df1.loc[:, ['country', 'year', 'log_GDPpc', 'log_h', 'log_ky', 'log_tfp']]
    .query("country == 'China'")
)
df_chn

# Same column subset for China, India and Indonesia in 2008 ...
(
    df1.loc[:, ['country', 'year', 'log_GDPpc', 'log_h', 'log_ky', 'log_tfp']]
    .query("country==['China', 'India', 'Indonesia'] and year == 2008")
)
# ... and in 2014.
(
    df1.loc[:, ['country', 'year', 'log_GDPpc', 'log_h', 'log_ky', 'log_tfp']]
    .query("country==['China', 'India', 'Indonesia'] and year == 2014")
)
# Define the logical conditions as boolean masks over the rows of df1
is_Asia = df1['region'].eq('Asia')
is_HighIncome = df1['hi1990'].eq('yes')
is_1990 = df1['year'].eq(1990)
# Apply the logical conditions: keep rows where all three masks are True
df1.loc[is_Asia & is_HighIncome & is_1990]
groupby
# `df_2014` was used in this section without being defined anywhere above,
# which raises NameError. Build it here as the rows of df1 for year 2014.
df_2014 = df1.query("year == 2014")

# Mean and standard deviation by region.
# numeric_only=True restricts the aggregation to numeric columns; without it,
# pandas >= 2.0 raises TypeError on the string columns (e.g. `country`).
# NOTE: `numeric_only` on GroupBy.std requires pandas >= 1.5.
df_2014.groupby('region').mean(numeric_only=True).round(2)
df_2014.groupby('region').std(numeric_only=True).round(2)

# Summary statistics of log_GDPpc by region (2014 rows only).
#df_2014.groupby('region')['log_GDPpc'].agg(['mean', 'std', 'min', 'max']).round(2)
df_2014.groupby('region')['log_GDPpc'].agg(['mean', 'std', 'min', 'max']).round(2)

# Summary statistics of the column `s` by year, over the full dataset.
# Bracket access is used instead of attribute access so the column lookup
# cannot be confused with a GroupBy attribute.
df1.groupby('year')['s'].agg(['mean', 'std', 'min', 'max']).round(2)
pivot_table
# Reshape log_GDPpc from long to wide: one row per (country, region, hi1990)
# combination and one column per year. reset_index turns the three index
# levels back into ordinary columns.
df1_gdppc_wide = (
    df1.pivot_table(
        index=['country', 'region', 'hi1990'],
        columns='year',
        values='log_GDPpc',
    )
    .reset_index(drop=False)
)
# Make sure the column names are strings
df1_gdppc_wide.columns = df1_gdppc_wide.columns.astype(str)
df1_gdppc_wide
df1_gdppc_wide.describe().round(2)
# Standard deviation of each year column across rows, plotted as a line.
# numeric_only=True skips the string columns (country, region, hi1990);
# without it, pandas >= 2.0 raises TypeError instead of dropping them.
# The trailing semicolon suppresses the notebook's text output for the plot.
df1_gdppc_wide.std(numeric_only=True).round(2).plot();
loc
# Label-based column slice: every column from '1990' through '2014' inclusive.
df1_gdppc_wide.loc[:, slice('1990', '2014')]
# Standard deviation of each selected year column, plotted as a line
# (trailing semicolon suppresses the notebook's text output).
df1_gdppc_wide.loc[:, slice('1990', '2014')].std().round(2).plot();
isin
# Keep the rows whose country is one of the three listed names.
df1.loc[df1['country'].isin(['China', 'Indonesia', 'India'])]
Visualize data
Strip plot
# Earlier draft, left commented out:
#fig1 = px.strip(df_2014, x = 'GDPpc', hover_name= 'country')
#save2cs.plot(fig1, filename = 'figureName', auto_open=True)

# Strip plot of log_GDPpc by region, coloured by region and animated over
# `year`; the x-axis range is fixed so frames stay comparable.
px.strip(
    data_frame=df1,
    x="log_GDPpc",
    y="region",
    color="region",
    hover_name="country",
    hover_data=["log_h", "log_ky"],
    animation_frame="year",
    range_x=[6, 12],
)
Line plots
# log_GDPpc over time, one line per country.
px.line(data_frame=df1, x="year", y="log_GDPpc", color="country")

# Same plot, split into one panel per region (two panels per row).
px.line(
    data_frame=df1,
    x="year",
    y="log_GDPpc",
    color="country",
    facet_col="region",
    facet_col_wrap=2,
    height=800,
)

# Single-country series for China: one figure per variable.
china = df1.query("country=='China'")
px.line(data_frame=china, x="year", y="log_GDPpc")
px.line(data_frame=china, x="year", y="log_ky")
px.line(data_frame=china, x="year", y="log_tfp")
px.line(data_frame=china, x="year", y="log_h")

# Keep a handle on the China log_GDPpc figure and export it as standalone HTML.
fig1 = px.line(data_frame=china, x="year", y="log_GDPpc")
fig1.write_html("fig1.html")

# Three selected countries, one line each.
px.line(
    data_frame=df1.query("country==['Indonesia', 'China','India']"),
    x="year",
    y="log_GDPpc",
    color="country",
)

# All countries whose region is Asia.
px.line(
    data_frame=df1.query("region=='Asia'"),
    x="year",
    y="log_GDPpc",
    color="country",
)
Box plot
# Box plot of log_GDPpc, one box per hi1990 group, animated over `year`.
px.box(
    data_frame=df1,
    x="log_GDPpc",
    color="hi1990",
    hover_name="country",
    animation_frame="year",
)
ECDF
# Empirical CDF of log_GDPpc for the Asia rows, one curve per hi1990 group,
# animated over `year` with a fixed x-axis range.
px.ecdf(
    data_frame=df1.query('region== "Asia"'),
    x="log_GDPpc",
    color="hi1990",
    hover_name="country",
    animation_frame="year",
    range_x=[6, 12],
)
Sunburst plot
# Sunburst for 2008: region -> country hierarchy, sector size taken from
# `pop`, colour taken from log_GDPpc.
px.sunburst(
    data_frame=df1.query("year == 2008"),
    path=["region", "country"],
    values="pop",
    color="log_GDPpc",
    hover_name="country",
)
# Same chart for 2014.
px.sunburst(
    data_frame=df1.query("year == 2014"),
    path=["region", "country"],
    values="pop",
    color="log_GDPpc",
    hover_name="country",
)
Scatter plots
Regression by groups
# log_GDPpc against log_tfp with a separate OLS trendline per hi1990 group,
# animated over `year`; both axis ranges are fixed.
px.scatter(
    data_frame=df1,
    x="log_tfp",
    y="log_GDPpc",
    color="hi1990",
    trendline="ols",
    hover_name="country",
    animation_frame="year",
    range_x=[4, 9],
    range_y=[6, 11.5],
)

# Same layout with log_ky on the x-axis.
px.scatter(
    data_frame=df1,
    x="log_ky",
    y="log_GDPpc",
    color="hi1990",
    trendline="ols",
    hover_name="country",
    animation_frame="year",
    range_x=[-2, 2.5],
    range_y=[6, 11.5],
)

# Same layout with log_h on the x-axis; axis ranges are left to Plotly.
px.scatter(
    data_frame=df1,
    x="log_h",
    y="log_GDPpc",
    color="hi1990",
    trendline="ols",
    hover_name="country",
    animation_frame="year",
)
Margins
# OLS scatter of log_GDPpc against log_tfp by hi1990 group, with box plots
# drawn in both margins, animated over `year`.
px.scatter(
    data_frame=df1,
    x="log_tfp",
    y="log_GDPpc",
    color="hi1990",
    trendline="ols",
    marginal_x="box",
    marginal_y="box",
    hover_name="country",
    animation_frame="year",
    range_x=[4, 9],
    range_y=[6, 11.5],
)
Lowess regression
# Same scatter as the OLS version, but with a LOWESS trendline per hi1990 group.
px.scatter(
    data_frame=df1,
    x="log_tfp",
    y="log_GDPpc",
    color="hi1990",
    trendline="lowess",
    hover_name="country",
    animation_frame="year",
    range_x=[4, 9],
    range_y=[6, 11.5],
)
Multivariate
# Bubble chart for 2008: position from log_tfp / log_GDPpc, colour from
# region, marker size from `pop` (capped at 60), with readable axis and
# legend labels.
px.scatter(
    data_frame=df1.query("year == 2008"),
    x="log_tfp",
    y="log_GDPpc",
    color="region",
    size="pop",
    size_max=60,
    hover_name="country",
    labels={
        "log_tfp": "Total factor productivity in 2008 (in logs)",
        "log_GDPpc": "GDP per capita in 2008 (in logs)",
        "region": "Continent",
        "pop": "Population",
    },
)
Customized
# Customized 2008 bubble chart. The 2008 rows are selected once and reused
# for the figure and for the two reference lines.
snapshot_2008 = df1.query("year == 2008")

fig = px.scatter(
    snapshot_2008,
    x="log_tfp",
    y="log_GDPpc",
    log_x=False,
    color="region",
    color_discrete_sequence=px.colors.qualitative.G10,
    size="pop",
    size_max=60,
    hover_name="country",
    template="simple_white",
    width=800,
    height=500,
    #title = "Year 1990",
    labels={
        "region": "Continent",
        "pop": "Population",
        "log_GDPpc": "GDP per capita (in logs)",
        "log_tfp": "Total factor productivity (in logs)",
    },
)

# Horizontal legend without a title, anchored by its bottom-right corner
# at x=1, y=1.1; font family set for the whole figure.
fig.update_layout(
    font_family="Rockwell",
    legend={
        "orientation": "h",
        "title": "",
        "y": 1.1,
        "x": 1,
        "xanchor": "right",
        "yanchor": "bottom",
    },
)

# Dotted reference lines at the 2008 means of each axis variable.
fig.add_hline(y=snapshot_2008['log_GDPpc'].mean(), line_width=1, line_dash="dot")
fig.add_vline(x=snapshot_2008['log_tfp'].mean(), line_width=1, line_dash="dot")
fig.show()
# Animated bubble chart over all years: colour from region, marker size from
# `pop`, fixed axis ranges, and log_ky / log_h added to the hover box.
px.scatter(
    data_frame=df1,
    x="log_tfp",
    y="log_GDPpc",
    color="region",
    size="pop",
    size_max=60,
    hover_name="country",
    hover_data=['log_ky', 'log_h'],
    animation_frame="year",
    range_x=[4, 9],
    range_y=[6, 11.5],
    labels={
        "log_tfp": "Total factor productivity (in logs)",
        "log_GDPpc": "GDP per capita (in logs)",
        "region": "Continent",
        "pop": "Population",
    },
)

# Animated bubble chart with OLS trendlines, one panel per hi1990 value.
px.scatter(
    data_frame=df1,
    x="log_tfp",
    y="log_GDPpc",
    color="region",
    size="pop",
    size_max=60,
    trendline="ols",
    facet_col="hi1990",
    hover_name="country",
    animation_frame="year",
)
# World map of log_GDPpc for 2008. Countries are matched through the
# `isocode` column (presumably ISO-3 codes, Plotly's default location mode
# -- confirm against the data).
px.choropleth(
    data_frame=df1.query("year == 2008"),
    locations="isocode",
    color="log_GDPpc",
    color_continuous_scale=px.colors.sequential.Plasma,
    hover_name="country",
    projection="natural earth",
)

# Same map for the full dataset, animated over `year`.
px.choropleth(
    data_frame=df1,
    locations="isocode",
    color="log_GDPpc",
    color_continuous_scale=px.colors.sequential.Plasma,
    hover_name="country",
    animation_frame="year",
    projection="natural earth",
)