[PYTHON] Into the amazing world of Data Science

Set up

# Load computational modules import numpy as np import pandas as pd import matplotlib.pyplot as plt # from matplotlib import pylot as plt import seaborn as sns import plotly.express as px import plotly.graph_objects as go import statsmodels.api as sm import statsmodels.formula.api as smf

Import and describing data

Import

df = pd.read_csv('/work/pwt100withGroups_productionAccounting.csv') # df = pd.read_stata('/work/pwt100withGroups_productionAccounting.dta') df

df.shape

df.columns

df.dtypes

df['CountryName_pwt100'].nunique() # 183 countries

df['CountryName_pwt100'].unique()

df['Continent'].unique()

df['incomegroup'].unique()

Data definitions

var_def = pd.read_csv('/work/variable_definitions.csv') var_def

Data descriptives

df.describe().round(2)

Prepare data

Subset

Select columns

df_col = df[['CountryName_pwt100', 'SubContinent', 'Continent', 'GDPpc', 'pop', 'L', 'H','lp', 'H']] df_col

Select rows

# Select rows with Continient being specified as "Asia" df_asia = df.query('Continent == "Asia"') df_asia

# Combine 2 conditions df_asia_2019 = df.query('Continent == "Asia" and year == 2019') df_asia_2019

# Combine multiple conditions is_asia = df['Continent'] == "Asia" is_2019 = df['year'] >= 2009 is_highincome = df['incomegroup'] == 'High income' df_asia_2019_high = df[is_asia & is_2019 & is_highincome] df_asia_2019_high

# isin vnjpth = df.CountryName_pwt100.isin(['Viet Nam', 'Japan', 'Thailand']) y90_00_10 = df.year.isin([1990, 2000, 2010]) df_3countries = df[vnjpth & y90_00_10] df_3countries

# loc df_vn = df.loc[df["countrycode"] == "VNM", :] df_vn

# try other way with loc df.set_index('countrycode', inplace=True) df_vn = df.loc['VNM',:] df_vn

Aggregation

df[df['year'] == 2019].groupby('Continent').mean().round(2)

df[df['year'] == 2019].groupby('incomegroup').GDPpc.agg(['mean', 'std', 'min', 'max']).round(2)

Transform

From long to wide

# Pivot panel data with GDPpc and H df_GDPpc_wide = df.pivot_table( index = ['CountryName_pwt100', 'SubContinent', 'Continent', 'incomegroup','countrycode'], columns = 'year', values = ['GDPpc', 'H']).reset_index(drop=False) df_GDPpc_wide.columns = ['_'.join(str(s).strip() for s in col if s) for col in df_GDPpc_wide.columns] df_GDPpc_wide # Note: Let use the following code line to create column names like GDPpc_2010 # df.columns = ['_'.join(str(s).strip() for s in col if s) for col in df.columns]

From wide to long

df_GDPpc_long = pd.wide_to_long(df_GDPpc_wide, stubnames=['GDPpc', 'H'], i=['CountryName_pwt100', 'SubContinent', 'Continent', 'incomegroup','countrycode'], j='year', sep='_') df_GDPpc_long.reset_index()

Merging

# I am creating a 2-column dataframe df_lp = df.reset_index()[['countrycode','lp']] df_lp

df_GDPpc_lp = pd.merge(df_GDPpc_long, df_lp, on='countrycode') df_GDPpc_lp # Note: if the key column for merging has different name in 2 datatsets, we use # option: left_on, right_on

Visualization

From the world vision

px.choropleth( df.reset_index(), locations="countrycode", color="ln_GDPpc", hover_name="CountryName_pwt100", color_continuous_scale='RdBu_r', projection="natural earth", labels=dict(ln_GDPpc = 'GDP per capita(in logs)'), title='Distribution of world income level' )

px.choropleth(df.reset_index(), locations="countrycode", color="ln_h", hover_name="CountryName_pwt100", projection="natural earth", color_continuous_scale='RdBu_r', title='Distribution of Human Capital Index', labels=dict(ln_h='Human capital index in logs'), )

How about Asia

Line plot

# subset data for Asia from 1980 onwards, then sort values df_asia_1980_sorted = df_asia.query("year >= 1980").sort_values(['year','pop','ln_GDPpc','K','ln_lp','ln_h','ln_TFP'])

px.line(df_asia_1980_sorted, x='year', y='ln_GDPpc', color='CountryName_pwt100', hover_name='CountryName_pwt100', hover_data=['SubContinent','incomegroup','pop','pop','H','K','ln_TFP'], labels=dict(ln_GDPpc="GDP per capita in log", CountryName_pwt100 = "Country name"), title='How GDP per capita of Asian country go through time')

px.line(df_asia_1980_sorted, x='year', y='ln_GDPpc', color='CountryName_pwt100', hover_name='CountryName_pwt100', hover_data=['SubContinent','incomegroup','pop','pop','H','K','ln_TFP'], labels=dict(ln_lp="Labor productivity", CountryName_pwt100 = "Country name"), facet_col='SubContinent', facet_col_wrap=1, height=1000, width=600, title='How labor productivity of Asian countries go through time')

Strip plot

px.strip(df_asia_1980_sorted, x='ln_GDPpc', y='SubContinent', range_x= [6, 13], color='SubContinent', hover_name='CountryName_pwt100', hover_data=['SubContinent','incomegroup','pop','pop','H','K','ln_TFP'], labels=dict(GDPpc="Real GDP per capita (in logs)", CountryName_pwt100 = "Country name", SubContinent = "Region"), template="simple_white", animation_frame='year', title='How GDP per capita of Asian countries go through time')

Scatter plot

px.scatter(df_asia_1980_sorted, x="ln_h", y="ln_GDPpc", range_x=[0,1.6], range_y=[6,12.5], color="SubContinent", hover_name="CountryName_pwt100", hover_data = ['GDPpc','pop','H','K','ln_TFP'], labels=dict(ln_GDPpc="GDP per capita (in logs)", ln_h="Human capital (in logs)", SubContinent="Region"), animation_frame='year' )

px.scatter(df_asia_1980_sorted, x="ln_h", y="ln_GDPpc", range_x=[0,1.6], range_y=[6,12.5], color="SubContinent", hover_name="CountryName_pwt100", hover_data = ['GDPpc','pop','H','K','ln_TFP'], labels=dict(ln_GDPpc="GDP per capita (in logs)", ln_h="Human capital (in logs)", SubContinent="Region"), animation_frame='year', trendline="ols" )

fig = px.scatter(df_asia_1980_sorted.query("year == 2019"), x="ln_h", y="ln_GDPpc", log_x = False, color = "SubContinent", size ="pop", size_max=60, hover_name = "CountryName_pwt100", height =500, width=800, template = "simple_white", color_discrete_sequence=px.colors.qualitative.G10, title = "Asia in 2019", labels=dict(SubContinent = "Region", pop = "Population", ln_GDPpc = "GDP per capita (in logs)", ln_h = "Human capital index (in logs)") ) fig.update_layout(font_family = "Rockwell", legend=dict(orientation = "h", title="", y=1.1, x=1, xanchor="right", yanchor="bottom")) fig.add_hline(df_asia_1980_sorted.query("year == 2019")['ln_GDPpc'].mean(), line_width=1, line_dash="dot") fig.add_vline(df_asia_1980_sorted.query("year == 2019")['ln_h'].mean(), line_width=1, line_dash="dot") fig.show()

df_asia_1980_sorted_withoutNA = df_asia_1980_sorted.dropna(inplace=True) px.scatter(df_asia_1980_sorted, animation_frame="year", x="ln_h", y="ln_GDPpc", range_x=[0, 1.6], range_y=[6, 12.5], color="SubContinent", size="pop", size_max=60, hover_name="CountryName_pwt100", hover_data = ['GDPpc','pop','H','K','ln_TFP'], labels=dict(log_h="Human capital index (in logs)", log_GDPpc="GDP per capita (in logs)", SubContinent="Region", pop= "Population"), trendline='ols', trendline_scope='overall' )

Sunburst plot

px.sunburst(df_asia_1980_sorted.query('year==2019'), path=['SubContinent','CountryName_pwt100'], values='pop', color='ln_GDPpc', hover_data=df_asia_1980_sorted[['pop','GDPpc','K','H','L','lp','ln_TFP']], color_continuous_scale='RdBu', color_continuous_midpoint=np.average(df['ln_GDPpc'], weights=df['pop']), title='Population and GDP per capita of Asia in 2019', labels=dict(ln_GDPpc='GDP per capita') )