Setup
# Install libraries
# Notebook shell escapes (`!`): these lines run pip as shell commands and are
# not plain Python, so they only work inside an IPython/Jupyter-style kernel.
# Upgrade pip itself first; --quiet suppresses most of the installer output.
!pip install --upgrade pip --quiet
# statsmodels is imported below (sm / smf). NOTE(review): the scatter plots
# that pass trendline="ols" presumably rely on it as well — confirm against
# the plotly documentation.
!pip install statsmodels --quiet
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import FormatStrFormatter
import seaborn as sns
# Configure seaborn in ONE call. `sns.set(...)` re-applies context, style,
# palette and font together, so calling it last with only `font=` (as this
# notebook did before) reset the plotting context to its defaults and threw
# away the "paper" context and the 1.5 font scale chosen on the lines above it.
sns.set(context="paper", style="darkgrid", font="serif", font_scale=1.5)
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
#warnings.filterwarnings('ignore')
Import data
# Load the Stata file into a DataFrame. The path is absolute, so this only
# works where the project data is available under /work.
# NOTE(review): the file name suggests Penn World Table 10.0 merged with
# extra country metadata — confirm with the data owner.
df1 = pd.read_stata("/work/project2022e_penny/data/pwt100_merge.dta")
# Disabled alternative loader kept by the author. NOTE(review): it points
# read_csv at the same .dta file — verify before re-enabling.
#df1 = pd.read_csv("/work/project2022e_penny/data/pwt100_merge.dta", parse_dates =['year'])
# The bare expressions below exist only for notebook display: the frame
# itself, its column labels, and the dtype of each column.
df1
df1.columns
df1.dtypes
# Uncomment to list the distinct values of the country column.
#df1['country'].unique()
Descriptive statistics
# Summary statistics of df1 (as returned by DataFrame.describe), rounded to
# two decimals and written to "describe.csv" in the working directory.
summary_table = df1.describe().round(2)
summary_table.to_csv("describe.csv")
Transform data
Select columns and years
# Columns carried over from df1 into the working sample df2.
selected_columns = [
    'countrycode', 'country', 'year', 'rgdpo', 'pop', 'cn', 'ck', 'emp',
    'hc', 'region', 'incomegroup', 'MRWgroup', 'y', 'h', 'k', 'a',
]
# Index labels of the rows whose year is below 1960; exactly those rows are
# dropped, every other row (in the selected columns) is kept.
pre_1960_labels = df1.index[df1.year.lt(1960)]
df2 = df1[selected_columns].drop(index=pre_1960_labels)
# Display the resulting frame in the notebook.
df2
Group by income group
# Standard deviation of every numeric column within each income group,
# rounded to two decimals.
# The aggregation is restricted to numeric columns explicitly: df2 also holds
# non-numeric columns (e.g. country), and recent pandas releases raise a
# TypeError instead of silently skipping such columns when std() reaches them.
numeric_columns = df2.select_dtypes(include="number").columns
df2.groupby('incomegroup')[numeric_columns].std().round(2)
# Mean, standard deviation, minimum and maximum of rgdpo per income group,
# rounded to whole numbers.
df2.groupby('incomegroup')['rgdpo'].agg(['mean', 'std', 'min', 'max']).round(0)
Visualize data
Strip plot
# Strip plot of y for the 1960 rows of df2; hovering shows the country.
px.strip(
    df2.loc[df2.year.eq(1960)],
    x='y',
    hover_name='country',
)
# Same 1960 rows, with points coloured by income group.
px.strip(
    df2.loc[df2.year.eq(1960)],
    x='y',
    color='incomegroup',
    hover_name='country',
)
# All rows of df2, with year used as the animation frame.
px.strip(
    df2,
    x='y',
    color='incomegroup',
    hover_name='country',
    animation_frame='year',
)
Line plots
# h over time, one line per country, for rows in the
# "Lower middle income" group only.
px.line(
    df2.loc[df2.incomegroup.eq("Lower middle income")],
    x='year',
    y='h',
    color='country',
)
# h over time for all of df2, one facet column per income group.
px.line(
    df2,
    x='year',
    y='h',
    color='country',
    facet_col='incomegroup',
)
# rgdpo over time for Japan, taken from the full df1 (not the filtered df2).
px.line(
    df1.query("country=='Japan'"),
    x="year",
    y="rgdpo",
)
# The same Japan figure, kept in fig1 so it can be exported as a standalone
# HTML file.
fig1 = px.line(
    df1.query("country=='Japan'"),
    x="year",
    y="rgdpo",
)
fig1.write_html("fig1.html")
# y over time for Japan, China and India, one line per country.
px.line(
    df1.query("country==['Japan', 'China', 'India']"),
    color="country",
    x="year",
    y="y",
)
Histogram
# Histogram of y over all rows of df2, coloured by income group, with a rug
# plot as the marginal.
px.histogram(
    df2,
    x='y',
    color='incomegroup',
    marginal='rug',
    hover_name='country',
)
# Same histogram with year used as the animation frame.
px.histogram(
    df2,
    x="y",
    color="incomegroup",
    marginal='rug',
    hover_name='country',
    animation_frame='year',
)
Box plot
# Box plots of y, coloured by income group, for the 2019 rows only.
px.box(
    df2.query("year == 2019"),
    hover_name='country',
    color="incomegroup",
    x="y",
)
# Same box plots over all rows, with year used as the animation frame.
px.box(
    df2,
    animation_frame='year',
    hover_name='country',
    color="incomegroup",
    x="y",
)
Violin plot
# Violin plots of y by region for the 1960 rows, coloured by income group.
# box=True draws a box inside each violin; points="all" shows every
# observation.
px.violin(
    df2.query("year == 1960"),
    x="region",
    y="y",
    color="incomegroup",
    hover_name='country',
    points="all",
    box=True,
)
# Same violin plots over all rows, with year used as the animation frame.
px.violin(
    df2,
    x="region",
    y="y",
    color="incomegroup",
    hover_name='country',
    points="all",
    box=True,
    animation_frame='year',
)
Scatter plots
Simple
# Scatter of y against h for the 2019 rows, coloured by income group.
px.scatter(
    df2.query("year == 2019"),
    hover_name="country",
    color="incomegroup",
    y="y",
    x="h",
)
# Same scatter over all rows, with year used as the animation frame.
px.scatter(
    df2,
    animation_frame='year',
    hover_name="country",
    color="incomegroup",
    y="y",
    x="h",
)
Regression
# Scatter of y against h for the 1990 rows, coloured by income group, with
# trendline="ols" requested from plotly express.
px.scatter(
    df2.query("year == 1990"),
    trendline="ols",
    hover_name="country",
    color="incomegroup",
    y="y",
    x="h",
)
# Same trendline scatter over all rows, with year as the animation frame.
px.scatter(
    df2,
    trendline="ols",
    animation_frame='year',
    hover_name="country",
    color="incomegroup",
    y="y",
    x="h",
)
Margins
# Scatter of y against h for the 1990 rows of df1 (the full frame, not df2),
# with trendline="ols" and box plots as the marginals on both axes.
px.scatter(
    df1.query("year == 1990"),
    marginal_y="box",
    marginal_x="box",
    trendline="ols",
    color="incomegroup",
    hover_name="country",
    y="y",
    x="h",
)
Multivariate
# Bubble chart of y against h for the 2019 rows: colour is the income group,
# marker size is pop (capped at size_max=60).
# The keys of `labels` must be the column names actually used in the figure.
# The x-axis label was previously keyed on "log_hc", which is not a plotted
# column, so it was never applied; it is now keyed on "h".
# NOTE(review): the label texts say "(in logs)" — confirm that h and y are
# stored in logs in this dataset.
px.scatter(
    df2.query("year == 2019"),
    x="h",
    y="y",
    color="incomegroup",
    size="pop",
    size_max=60,
    hover_name="country",
    labels=dict(
        h="Human capital index in 2019 (in logs)",
        y="Labor productivity in 2019 (in logs)",
        incomegroup="Income group",
        pop="Population",
    ),
)
# Animated bubble chart of y against h for the rows with year > 2004: colour
# is the income group, marker size is pop (capped at size_max=60), and year is
# the animation frame.
# The keys of `labels` must be the column names actually used in the figure.
# The axis labels were previously keyed on "log_h" and "log_lp", which are not
# plotted columns, so neither was applied; they are now keyed on "h" and "y".
# NOTE(review): the label texts say "(in logs)" — confirm that h and y are
# stored in logs in this dataset.
px.scatter(
    df2[df2.year > 2004],
    animation_frame="year",
    x="h",
    y="y",
    color="incomegroup",
    size="pop",
    size_max=60,
    hover_name="country",
    labels=dict(
        h="Human capital index (in logs)",
        y="Labor productivity (in logs)",
        incomegroup="Income group",
        pop="Population",
    ),
)
# y against h for the 2019 rows of df1, one facet column per income group:
# colour is the region, marker size is pop (capped at size_max=60), and
# trendline="ols" is requested.
px.scatter(
    df1.query("year == 2019"),
    facet_col="incomegroup",
    hover_name="country",
    trendline="ols",
    size_max=60,
    size="pop",
    color="region",
    y="y",
    x="h",
)
# Same faceted chart built from the df2 rows with year > 2004, with year used
# as the animation frame.
px.scatter(
    df2.loc[df2.year.gt(2004)],
    animation_frame="year",
    facet_col="incomegroup",
    hover_name="country",
    trendline="ols",
    size_max=60,
    size="pop",
    color="region",
    y="y",
    x="h",
)
# Density contour of y against h for the 1990 rows of df1, with histograms as
# the marginals on both axes.
px.density_contour(
    df1.query("year == 1990"),
    marginal_y="histogram",
    marginal_x="histogram",
    hover_name="country",
    y="y",
    x="h",
)
# Density contour over all rows of df2 with histogram marginals, using year
# as the animation frame.
px.density_contour(
    df2,
    animation_frame="year",
    marginal_y="histogram",
    marginal_x="histogram",
    y="y",
    x="h",
)
# Same animated contour, with rug plots as the marginals instead.
px.density_contour(
    df2,
    animation_frame="year",
    marginal_y="rug",
    marginal_x="rug",
    hover_name="country",
    y="y",
    x="h",
)
# Filled density contour of y against h for the 2019 rows of df1, with the
# contour labels switched on.
den = px.density_contour(df1.query("year == 2019"), x="h", y="y")
den.update_traces(
    contours_showlabels=True,
    contours_coloring="fill",
)
den.show()
# The same figure rebuilt from scratch, this time with the 'Viridis'
# colorscale applied to the traces.
den = px.density_contour(df1.query("year == 2019"), x="h", y="y")
den.update_traces(
    colorscale='Viridis',
    contours_showlabels=True,
    contours_coloring="fill",
)
den.show()
# World map of y for the 1990 rows: locations come from countrycode, colours
# use the sequential Plasma scale, and the projection is "natural earth".
px.choropleth(
    df2.query("year == 1990"),
    projection="natural earth",
    color_continuous_scale=px.colors.sequential.Plasma,
    hover_name="country",
    color="y",
    locations="countrycode",
)
# Same map over all rows of df2, with year used as the animation frame.
px.choropleth(
    df2,
    projection="natural earth",
    color_continuous_scale=px.colors.sequential.Plasma,
    animation_frame="year",
    hover_name="country",
    color="y",
    locations="countrycode",
)