[PYTHON] Into the amazing world of Data Science
Set up
# Load computational modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # from matplotlib import pylot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.api as sm
import statsmodels.formula.api as smf
Import and describing data
Import
df = pd.read_csv('/work/pwt100withGroups_productionAccounting.csv')
# df = pd.read_stata('/work/pwt100withGroups_productionAccounting.dta')
df
df.shape
df.columns
df.dtypes
df['CountryName_pwt100'].nunique() # 183 countries
df['CountryName_pwt100'].unique()
df['Continent'].unique()
df['incomegroup'].unique()
Data definitions
var_def = pd.read_csv('/work/variable_definitions.csv')
var_def
Data descriptives
df.describe().round(2)
Prepare data
Subset
Select columns
df_col = df[['CountryName_pwt100', 'SubContinent', 'Continent', 'GDPpc', 'pop', 'L', 'H','lp', 'H']]
df_col
Select rows
# Select rows with Continient being specified as "Asia"
df_asia = df.query('Continent == "Asia"')
df_asia
# Combine 2 conditions
df_asia_2019 = df.query('Continent == "Asia" and year == 2019')
df_asia_2019
# Combine multiple conditions
is_asia = df['Continent'] == "Asia"
is_2019 = df['year'] >= 2009
is_highincome = df['incomegroup'] == 'High income'
df_asia_2019_high = df[is_asia & is_2019 & is_highincome]
df_asia_2019_high
# isin
vnjpth = df.CountryName_pwt100.isin(['Viet Nam', 'Japan', 'Thailand'])
y90_00_10 = df.year.isin([1990, 2000, 2010])
df_3countries = df[vnjpth & y90_00_10]
df_3countries
# loc
df_vn = df.loc[df["countrycode"] == "VNM", :]
df_vn
# try other way with loc
df.set_index('countrycode', inplace=True)
df_vn = df.loc['VNM',:]
df_vn
Aggregation
df[df['year'] == 2019].groupby('Continent').mean().round(2)
df[df['year'] == 2019].groupby('incomegroup').GDPpc.agg(['mean', 'std', 'min', 'max']).round(2)
Transform
From long to wide
# Pivot panel data with GDPpc and H
df_GDPpc_wide = df.pivot_table(
index = ['CountryName_pwt100', 'SubContinent', 'Continent', 'incomegroup','countrycode'],
columns = 'year',
values = ['GDPpc', 'H']).reset_index(drop=False)
df_GDPpc_wide.columns = ['_'.join(str(s).strip() for s in col if s) for col in df_GDPpc_wide.columns]
df_GDPpc_wide
# Note: Let use the following code line to create column names like GDPpc_2010
# df.columns = ['_'.join(str(s).strip() for s in col if s) for col in df.columns]
From wide to long
df_GDPpc_long = pd.wide_to_long(df_GDPpc_wide,
stubnames=['GDPpc', 'H'],
i=['CountryName_pwt100', 'SubContinent', 'Continent', 'incomegroup','countrycode'],
j='year',
sep='_')
df_GDPpc_long.reset_index()
Merging
# I am creating a 2-column dataframe
df_lp = df.reset_index()[['countrycode','lp']]
df_lp
df_GDPpc_lp = pd.merge(df_GDPpc_long, df_lp, on='countrycode')
df_GDPpc_lp
# Note: if the key column for merging has different name in 2 datatsets, we use
# option: left_on, right_on
Visualization
From the world vision
px.choropleth(
df.reset_index(),
locations="countrycode",
color="ln_GDPpc",
hover_name="CountryName_pwt100",
color_continuous_scale='RdBu_r',
projection="natural earth",
labels=dict(ln_GDPpc = 'GDP per capita(in logs)'),
title='Distribution of world income level'
)
px.choropleth(df.reset_index(),
locations="countrycode",
color="ln_h",
hover_name="CountryName_pwt100",
projection="natural earth",
color_continuous_scale='RdBu_r',
title='Distribution of Human Capital Index',
labels=dict(ln_h='Human capital index in logs'),
)
How about Asia
Line plot
# subset data for Asia from 1980 onwards, then sort values
df_asia_1980_sorted = df_asia.query("year >= 1980").sort_values(['year','pop','ln_GDPpc','K','ln_lp','ln_h','ln_TFP'])
px.line(df_asia_1980_sorted,
x='year',
y='ln_GDPpc',
color='CountryName_pwt100',
hover_name='CountryName_pwt100',
hover_data=['SubContinent','incomegroup','pop','pop','H','K','ln_TFP'],
labels=dict(ln_GDPpc="GDP per capita in log",
CountryName_pwt100 = "Country name"),
title='How GDP per capita of Asian country go through time')
px.line(df_asia_1980_sorted,
x='year',
y='ln_GDPpc',
color='CountryName_pwt100',
hover_name='CountryName_pwt100',
hover_data=['SubContinent','incomegroup','pop','pop','H','K','ln_TFP'],
labels=dict(ln_lp="Labor productivity",
CountryName_pwt100 = "Country name"),
facet_col='SubContinent',
facet_col_wrap=1,
height=1000,
width=600,
title='How labor productivity of Asian countries go through time')
Strip plot
px.strip(df_asia_1980_sorted,
x='ln_GDPpc',
y='SubContinent',
range_x= [6, 13],
color='SubContinent',
hover_name='CountryName_pwt100',
hover_data=['SubContinent','incomegroup','pop','pop','H','K','ln_TFP'],
labels=dict(GDPpc="Real GDP per capita (in logs)",
CountryName_pwt100 = "Country name",
SubContinent = "Region"),
template="simple_white",
animation_frame='year',
title='How GDP per capita of Asian countries go through time')
Scatter plot
px.scatter(df_asia_1980_sorted,
x="ln_h",
y="ln_GDPpc",
range_x=[0,1.6],
range_y=[6,12.5],
color="SubContinent",
hover_name="CountryName_pwt100",
hover_data = ['GDPpc','pop','H','K','ln_TFP'],
labels=dict(ln_GDPpc="GDP per capita (in logs)",
ln_h="Human capital (in logs)",
SubContinent="Region"),
animation_frame='year'
)
px.scatter(df_asia_1980_sorted,
x="ln_h",
y="ln_GDPpc",
range_x=[0,1.6],
range_y=[6,12.5],
color="SubContinent",
hover_name="CountryName_pwt100",
hover_data = ['GDPpc','pop','H','K','ln_TFP'],
labels=dict(ln_GDPpc="GDP per capita (in logs)",
ln_h="Human capital (in logs)",
SubContinent="Region"),
animation_frame='year',
trendline="ols"
)
fig = px.scatter(df_asia_1980_sorted.query("year == 2019"),
x="ln_h",
y="ln_GDPpc",
log_x = False,
color = "SubContinent",
size ="pop", size_max=60,
hover_name = "CountryName_pwt100",
height =500, width=800,
template = "simple_white",
color_discrete_sequence=px.colors.qualitative.G10,
title = "Asia in 2019",
labels=dict(SubContinent = "Region",
pop = "Population",
ln_GDPpc = "GDP per capita (in logs)",
ln_h = "Human capital index (in logs)")
)
fig.update_layout(font_family = "Rockwell",
legend=dict(orientation = "h", title="", y=1.1, x=1, xanchor="right", yanchor="bottom"))
fig.add_hline(df_asia_1980_sorted.query("year == 2019")['ln_GDPpc'].mean(), line_width=1, line_dash="dot")
fig.add_vline(df_asia_1980_sorted.query("year == 2019")['ln_h'].mean(), line_width=1, line_dash="dot")
fig.show()
df_asia_1980_sorted_withoutNA = df_asia_1980_sorted.dropna(inplace=True)
px.scatter(df_asia_1980_sorted,
animation_frame="year",
x="ln_h",
y="ln_GDPpc",
range_x=[0, 1.6],
range_y=[6, 12.5],
color="SubContinent",
size="pop", size_max=60,
hover_name="CountryName_pwt100",
hover_data = ['GDPpc','pop','H','K','ln_TFP'],
labels=dict(log_h="Human capital index (in logs)",
log_GDPpc="GDP per capita (in logs)",
SubContinent="Region",
pop= "Population"),
trendline='ols',
trendline_scope='overall'
)
Sunburst plot
px.sunburst(df_asia_1980_sorted.query('year==2019'),
path=['SubContinent','CountryName_pwt100'],
values='pop',
color='ln_GDPpc',
hover_data=df_asia_1980_sorted[['pop','GDPpc','K','H','L','lp','ln_TFP']],
color_continuous_scale='RdBu',
color_continuous_midpoint=np.average(df['ln_GDPpc'], weights=df['pop']),
title='Population and GDP per capita of Asia in 2019',
labels=dict(ln_GDPpc='GDP per capita')
)