This Deepnote notebook was inspired by my exemplary data science lecturer, Professor Carlos Mendez.

# Here is a motivational video for all future data scientists, by the famous Swedish physician and academic Hans Rosling.

Setup

# Load all the computational libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

# chart_studio.tools.set_credentials_file(username='econdata777', api_key='ADDhere')
# warnings.filterwarnings('ignore')

Import data

# Read the dataset (a CSV hosted on GitHub) into the main data frame.
# Alternative: also pass parse_dates=['year'] to have pandas parse the
# 'year' column as dates while reading.
df1 = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/quarcs-lab/mendez2020-convergence-clubs-code-data/master/assets/dat.csv",
)

# "df" is short for data frame. Leaving the name as the last expression of a
# notebook cell displays the table.
df1

# df1.columns lists the names of all the columns in data frame one (df1)
df1.columns

# df1.dtypes shows the data type of each column, e.g. float or integer
df1.dtypes

# unique() returns the distinct values that appear in the selected column
df1['region'].unique()

Dataset definitions

# Import the dataset definitions (stored next to the data file in the same
# repository) and display them.
df1_def = pd.read_csv("https://raw.githubusercontent.com/quarcs-lab/mendez2020-convergence-clubs-code-data/master/assets/dat-definitions.csv")
df1_def

Descriptive statistics

# Use describe() to compute descriptive statistics, rounded to two decimals
df1.describe().round(2)

Prepare data

# Here is a video to help you understand how pandas works. Enjoy!

select

# Selecting a single column by name gives a quick glance at that variable.
# Bracket notation is used rather than df1.s because attribute-style access
# only works when the column name does not clash with an existing DataFrame
# attribute or method.
df1['s']

# Subset your data and tailor it to your needs

# Keep only the columns of interest in a new data frame, then display it
df2 = df1[['country', 'year', 'GDPpc', 'h', 'ky', 's', 'log_lp', 'TFP']]
df2

query

# Select a subset of columns and keep only the rows where year is 2014
df_2014 = df1[['country', 'region', 'hi1990', 'year', 'GDPpc', 'h', 'ky', 'TFP']].query("year == 2014")
df_2014

# Select a subset of columns and keep only the rows for Kenya
df_Kenya = df2[['country', 's', 'log_lp', 'year', 'GDPpc', 'h', 'ky', 'TFP']].query("country == 'Kenya'")
df_Kenya

# Rows for a selected group of countries in 2014.
# `in` states the membership test explicitly (query() treats `== [list]` the
# same way); the duplicated 'Tanzania' entry has been removed.
df2[['country', 'year', 'GDPpc']].query(
    "country in ['Kenya', 'Tanzania', 'Democratic Republic of Congo', "
    "'Rwanda', 'Burundi', 'Uganda'] and year == 2014"
)

# Define the logical conditions (one boolean mask per condition)
is_Africa = df1['region'] == 'Africa'
# NOTE: despite its name, this mask selects the rows where hi1990 is 'no'.
# The name is kept unchanged in case other cells refer to it.
is_HighIncome = df1['hi1990'] == 'no'
is_1990 = df1['year'] == 1990

# Apply the logical conditions: keep the rows that satisfy all three masks
df1[is_Africa & is_HighIncome & is_1990]

groupby

# Mean of each numeric column by region, rounded to two decimals.
# numeric_only=True is required because df_2014 also holds text columns
# (e.g. 'country'); without it, pandas >= 2.0 raises a TypeError instead of
# silently dropping them.
df_2014.groupby('region').mean(numeric_only=True).round(2)

# Standard deviation of each numeric column by region, rounded to two decimals.
# numeric_only=True (accepted by GroupBy.std since pandas 1.5) skips the text
# columns (e.g. 'country'), which would otherwise raise a TypeError on
# pandas >= 2.0.
df_2014.groupby('region').std(numeric_only=True).round(2)

# Several summary statistics of column 'h' by region, rounded to two decimals.
# Bracket selection is equivalent to the attribute form (`.h`) and does not
# depend on the column name being a valid, non-clashing attribute name.
df_2014.groupby('region')['h'].agg(['mean', 'std', 'min', 'max']).round(2)

Setup

Import data

Dataset definitions

Descriptive statistics

Prepare data

select

query

groupby

Visualize data

Strip plot

Sunburst plot

References