This Deepnote notebook was inspired by my exemplary data science lecturer, Professor Carlos Mendez.
# Here is a motivational video for all future data scientists from the famous Swedish physician and academic Hans Rosling
Setup
#load all the computational libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
#chart_studio.tools.set_credentials_file(username='econdata777', api_key='ADDhere')
import statsmodels.api as sm
import statsmodels.formula.api as smf
import warnings
#warnings.filterwarnings('ignore')
Import data
# Load the dataset from the project's GitHub repository into a pandas DataFrame.
df1 = pd.read_csv("https://raw.githubusercontent.com/quarcs-lab/mendez2020-convergence-clubs-code-data/master/assets/dat.csv")
# Disabled alternative: the same load, but asking pandas to parse the 'year' column as dates.
#df1 = pd.read_csv("https://raw.githubusercontent.com/quarcs-lab/mendez2020-convergence-clubs-code-data/master/assets/dat.csv", parse_dates =['year'])
# "df" stands for DataFrame; a bare name on its own line displays the object in a notebook cell.
df1
# df1.columns lists the labels of all columns in data frame one (df1).
df1.columns
# df1.dtypes shows the data type of each column, e.g. float or integer.
df1.dtypes
# unique() returns the distinct values that occur in the selected column.
df1['region'].unique()
Dataset definitions
# Import the variable definitions that accompany the dataset.
# The path uses a single "/" after "master", the same form as the URL that loads df1;
# the previous "master//assets" contained a doubled separator.
df1_def = pd.read_csv("https://raw.githubusercontent.com/quarcs-lab/mendez2020-convergence-clubs-code-data/master/assets/dat-definitions.csv")
# Display the definitions table.
df1_def
Descriptive statistics
# Use describe() to compute summary statistics for df1 and round the results to two decimals.
df1.describe().round(2)
Prepare data
# Here is a video to help you understand how pandas works. Enjoy!
select
# Selecting a single column gives a quick look at its values
# (bracket indexing returns the same Series as dotted attribute access).
df1['s']
# Build a smaller working frame that keeps only the columns of interest.
df2 = df1[
    [
        'country',
        'year',
        'GDPpc',
        'h',
        'ky',
        's',
        'log_lp',
        'TFP',
    ]
]
df2
query
# Selected columns of df1, restricted to the rows where year is 2014.
df_2014 = (
    df1.loc[:, ['country', 'region', 'hi1990', 'year', 'GDPpc', 'h', 'ky', 'TFP']]
    .query("year == 2014")
)
df_2014
# Selected columns of df2, restricted to the rows for Kenya.
df_Kenya = (
    df2.loc[:, ['country', 's', 'log_lp', 'year', 'GDPpc', 'h', 'ky', 'TFP']]
    .query("country == 'Kenya'")
)
df_Kenya
# GDPpc in 2014 for a hand-picked list of countries.
# NOTE(review): 'Tanzania' appears twice in the list below; the second entry is redundant.
(
    df2.loc[:, ['country', 'year', 'GDPpc']]
    .query("country==['Kenya','Tanzania', 'Democratic Republic of Congo', 'Rwanda', 'Burundi', 'Uganda', 'Tanzania'] and year == 2014")
)
# One boolean mask per logical condition.
is_Africa = df1['region'].eq('Africa')
# NOTE(review): despite its name, this mask is True where hi1990 equals 'no' - confirm this is intended.
is_HighIncome = df1['hi1990'].eq('no')
is_1990 = df1['year'].eq(1990)
# Keep only the rows that satisfy all three masks at once.
df1.loc[is_Africa & is_HighIncome & is_1990]
groupby
# Regional means for 2014. df_2014 also carries text columns ('country', 'hi1990'),
# so numeric_only=True limits the aggregation to numeric columns; without it, recent
# pandas versions raise a TypeError instead of silently dropping the text columns.
df_2014.groupby('region').mean(numeric_only=True).round(2)
# Regional standard deviations for 2014, again over the numeric columns only.
df_2014.groupby('region').std(numeric_only=True).round(2)
# Mean, standard deviation, minimum and maximum of 'h' per region in 2014.
df_2014.groupby('region')['h'].agg(['mean', 'std', 'min', 'max']).round(2)
# The same four statistics for 'GDPpc', computed per year over the full dataset.
df1.groupby('year')['GDPpc'].agg(['mean', 'std', 'min', 'max']).round(2)
Visualize data
# Here is a video that simplifies data visualization. Take a look!