Setup
# Install libraries.
# The leading "!" is an IPython shell escape, so these lines only run inside a
# notebook/IPython session (they are not valid plain Python).
# First upgrade pip itself, then install statsmodels, which is imported below.
!pip install --upgrade pip --quiet
!pip install statsmodels --quiet
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import seaborn as sns
# Configure the seaborn theme in a single call.
# `sns.set()` re-applies every theme default, so calling it *after*
# `set_style`/`set_context` (as the previous code did) silently discarded the
# "paper" context and the 1.5 font scale. Passing everything to one
# `set_theme` call keeps the dark grid, paper context, font scale and serif
# font together.
sns.set_theme(context="paper", style="darkgrid", font="serif", font_scale=1.5)
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
#warnings.filterwarnings('ignore')
Import data
# Read the dataset straight from the project's GitHub repository into `df1`.
df1 = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/quarcs-lab/mendez2020-convergence-clubs-code-data/master/assets/dat.csv",
)
# Alternative (disabled): add parse_dates=['year'] to the call above to read
# the `year` column as dates instead of leaving pandas to infer its type.
# Show the table (use df1.head() to preview only the first rows).
df1
# Column labels, followed by the dtype pandas inferred for each column.
df1.columns
df1.dtypes
# Distinct values of the `country` column.
df1["country"].unique()
Dataset definitions
# Load the table of variable definitions that accompanies the dataset.
# The path previously contained a doubled slash ("master//assets"); it now
# uses the same canonical "master/assets/" layout as the main data URL.
df1_def = pd.read_csv(
    "https://raw.githubusercontent.com/quarcs-lab/mendez2020-convergence-clubs-code-data/master/assets/dat-definitions.csv"
)
# Show the definitions table.
df1_def
Descriptive statistics
# Summary statistics from DataFrame.describe(), rounded to two decimals.
df1.describe().round(decimals=2)
Transform data
Select columns
# A single column, accessed by label (returns a Series).
df1["log_GDPpc"]
# Keep only the identifier columns plus the two log variables.
df2 = df1.loc[:, ["country", "year", "log_GDPpc", "log_h"]]
df2
Filter rows with query
# Cross-section for 2014: filter the rows first, then keep the columns of interest.
df_2014 = df1.query("year == 2014")[
    ["country", "region", "hi1990", "year", "log_GDPpc", "log_h"]
]
df_2014
# Rows for Indonesia only, restricted to the same core variables as df2.
df_indonesia = df1.query("country == 'Indonesia'")[
    ["country", "year", "log_GDPpc", "log_h"]
]
df_indonesia
Visualize data
Strip plot
# One marker per row of the 2014 cross-section, coloured by region;
# hovering a marker shows the country name.
px.strip(
    data_frame=df_2014,
    x="log_GDPpc",
    hover_name="country",
    color="region",
)
Line plots
# Indonesia's log GDP per capita by year, with the country as the colour key.
px.line(
    data_frame=df1.query("country == 'Indonesia'"),
    x="year",
    y="log_GDPpc",
    color="country",
)
# The same series without a colour key, kept as `fig1` and exported to a
# standalone HTML file in the working directory.
fig1 = px.line(
    data_frame=df1.query("country=='Indonesia'"),
    x="year",
    y="log_GDPpc",
)
fig1.write_html("fig1.html")
# Every country's series, one colour per country and one panel per region.
px.line(
    data_frame=df1,
    x="year",
    y="log_GDPpc",
    color="country",
    facet_col="region",
)
Box plots
# Distribution of log GDP per capita in 2014, one box per `hi1990` group.
px.box(
    data_frame=df1.query("year == 2014"),
    x="log_GDPpc",
    color="hi1990",
    hover_name="country",
)
# Animated distribution of `log_h` by `hi1990` group, one frame per year.
px.box(
    df1,
    x="log_h",
    color="hi1990",
    hover_name="country",
    animation_frame="year",
    # Pin the x-axis to the full-sample range. Without it the axis is sized
    # from the first frame only, so values in later years can fall outside
    # the visible area (Plotly's animation guidance recommends fixing ranges).
    range_x=[df1["log_h"].min(), df1["log_h"].max()],
)
Scatter plots
Simple
# 1990 cross-section: log GDP per capita (x) against log_h (y),
# coloured by region, with the country name shown on hover.
px.scatter(
    data_frame=df1.query("year == 1990"),
    x="log_GDPpc",
    y="log_h",
    color="region",
    hover_name="country",
)
Regression
# Same 1990 scatter, coloured by `hi1990`, with an OLS trendline requested
# through the `trendline` argument.
px.scatter(
    data_frame=df1.query("year == 1990"),
    x="log_GDPpc",
    y="log_h",
    color="hi1990",
    hover_name="country",
    trendline="ols",
)
# Bubble chart for 1990: marker size follows `pop` (capped at 60 px),
# colour follows region, and axis/legend titles are replaced by readable labels.
px.scatter(
    df1.query("year == 1990"),
    x="log_GDPpc",
    y="log_h",
    color="region",
    size="pop",
    size_max=60,
    hover_name="country",
    # Keys must match the DataFrame column names exactly (case-sensitive).
    # The previous `log_H` key did not match the `log_h` column, so the
    # y-axis label was never applied; the label text typo is fixed as well.
    labels=dict(
        log_GDPpc="GDP per capita in 1990 (in logs)",
        log_h="Human capital index in 1990 (in logs)",
        region="Continent",
        pop="Population",
    ),
)
# 1990 bubble chart split into one panel per `hi1990` value, with an OLS
# trendline requested through the `trendline` argument.
px.scatter(
    data_frame=df1.query("year == 1990"),
    x="log_GDPpc",
    y="log_h",
    color="region",
    size="pop",
    size_max=60,
    trendline="ols",
    hover_name="country",
    facet_col="hi1990",
)
# Choropleth map: locations are matched through the `isocode` column and
# shaded by `log_h` on the Plasma colour scale, animated with one frame per year.
px.choropleth(
    data_frame=df1,
    locations="isocode",
    color="log_h",
    hover_name="country",
    animation_frame="year",
    color_continuous_scale=px.colors.sequential.Plasma,
    projection="natural earth",
)