Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
#chart_studio.tools.set_credentials_file(username='econdata777', api_key='ADDhere')
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
#warnings.filterwarnings('ignore')
Import data
# Load the main dataset straight from the project's GitHub repository.
# To have pandas parse the `year` column as dates, add parse_dates=['year'].
df1 = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/quarcs-lab/mendez2020-convergence-clubs-code-data/master/assets/dat.csv"
)
# Bare expression: shows the loaded frame when run as a notebook cell.
df1
idint64
1 - 108
countryobject
Zambia0.9%
Italy0.9%
106 others98.1%
2699
108
Zambia
1199
48
Italy
1324
53
Kyrgyz Republic
1299
52
Kenya
1274
51
Kazakhstan
2649
106
Vietnam
1249
50
Japan
1224
49
Jamaica
1174
47
Israel
1374
55
Lithuania
# Quick structural checks on the raw data (df1.head() gives a row preview).
df1.columns  # column labels
df1.dtypes  # dtype of each column
df1.country.unique()  # distinct values of the country column
Dataset definitions
# Load the data dictionary that accompanies the main dataset: one row per
# variable, with its name (var_name) and its definition (var_def).
# The path uses a single slash after "master", consistent with the URL of
# the main data file.
df1_def = pd.read_csv("https://raw.githubusercontent.com/quarcs-lab/mendez2020-convergence-clubs-code-data/master/assets/dat-definitions.csv")
# Bare expression: shows the definitions table when run as a notebook cell.
df1_def
var_nameobject
country3.6%
year3.6%
26 others92.9%
var_defobject
Standardized country name (from PWT)3.6%
Year3.6%
26 others92.9%
0
country
Standardized country name (from PWT)
1
year
Year
2
Y
GDP
3
K
Physical Capital
4
pop
Population
5
L
Labor Force
6
s
Years of Schooling
7
alpha_it
Variable Capital Share
8
GDPpc
GDP per capita
9
lp
Labor Productivity
Descriptive statistics
# Summary statistics (count, mean, std, min, quartiles, max), rounded to
# two decimal places.
df1.describe().round(decimals=2)
idfloat64
yearfloat64
count
2700
2700
mean
54.5
2002
std
31.18
7.21
min
1
1990
25%
27.75
1996
50%
54.5
2002
75%
81.25
2008
max
108
2014
Prepare data
select
# Look at the GDP per capita column on its own.
df1['GDPpc']

# Keep a subset of the columns in a new frame.
# Variable notes:
#   hi1990  : high-income country flag (the original note read "as of 1999";
#             the column name suggests 1990 -- TODO confirm)
#   log_lp  : labor productivity
#   log_ky  : capital-output ratio
#   log_h   : human capital index
#   s       : years of schooling
#   log_tfp : aggregate efficiency
#   (the log_ prefix presumably marks log-transformed values -- verify)
df2 = df1[
    [
        'country', 'year', 'region', 'hi1990',
        'log_GDPpc', 'GDPpc', 'pop',
        'log_lp', 'log_ky', 'h', 'log_h', 's',
        'log_tfp', 'isocode',
    ]
]
df2
countryobject
Albania0.9%
Algeria0.9%
106 others98.1%
yearint64
1990 - 2014
2680
Zambia
1995
2681
Zambia
1996
2682
Zambia
1997
2683
Zambia
1998
2684
Zambia
1999
2685
Zambia
2000
2686
Zambia
2001
2687
Zambia
2002
2688
Zambia
2003
2689
Zambia
2004
query
# Single-year cross-sections of the selected columns.
df_1990 = df2.query("year==1990")
df_2014 = df2.query("year == 2014")

# Restrict each cross-section to the Asia region.
df_asia_1990 = df_1990.query("region == 'Asia'")
df_asia_2014 = df_2014.query("region=='Asia'")

# Country, year and the h column for Asian countries in 2014.
df_asia_2014_h = df_asia_2014[['country', 'year', 'h']]

# Asian countries whose hi1990 flag is 'yes', taken from the 1990 rows.
df_asia_HighIncome_1990 = df_asia_1990.query("hi1990=='yes'")
df_asia_HighIncome_1990
countryobject
yearint64
1000
Hong Kong
1990
1150
Israel
1990
1225
Japan
1990
1975
Saudi Arabia
1990
2050
Singapore
1990
2300
Taiwan
1990
# Build one boolean mask per logical condition ...
is_Asia = df1['region'].eq('Asia')
is_HighIncome = df1['hi1990'].eq('yes')
is_1990 = df1['year'].eq(1990)
# ... then keep only the rows where all three conditions hold.
df1[is_Asia & is_HighIncome & is_1990]
idint64
countryobject
1000
41
Hong Kong
1150
47
Israel
1225
50
Japan
1975
80
Saudi Arabia
2050
83
Singapore
2300
93
Taiwan
Visualize data
# Histogram of log_lp, colored by the hi1990 flag, with a box plot in the
# margin and one animation frame per year.
px.histogram(
    data_frame=df1,
    x="log_lp",
    color="hi1990",
    marginal='box',
    hover_name='country',
    animation_frame='year',
)
8. The relationship between GDP per capita and labor productivity
# log_GDPpc against log_lp, colored by region, with one animation frame
# per year.
px.scatter(
    data_frame=df1,
    x="log_lp",
    y="log_GDPpc",
    color="region",
    hover_name="country",
    animation_frame='year',
)
# 1990 cross-section: log_GDPpc against log_lp, colored by the hi1990 flag,
# with a single OLS trendline fitted across all points
# (trendline_scope="overall").
px.scatter(
    data_frame=df1.query("year == 1990"),
    x="log_lp",
    y="log_GDPpc",
    color="hi1990",
    hover_name="country",
    hover_data=['region'],
    trendline="ols",
    trendline_scope="overall",
)
# All years: log_GDPpc against log_lp, colored by the hi1990 flag, with OLS
# trendlines and one animation frame per year.
px.scatter(
    data_frame=df1,
    x="log_lp",
    y="log_GDPpc",
    color="hi1990",
    hover_name="country",
    trendline="ols",
    animation_frame='year',
)