Introduction to AI and data science

a. Scatter plot

import csv

import pandas as pd

# Directory holding the CSV exports. Each file name keeps its leading "/"
# because the full path is built by plain string concatenation below.
data_path = "."
gdp_per_capita_file_name = "/gdp-per-capita-maddison-2020.csv"
life_expectancy_file_name = "/life-expectancy.csv"
gdp_file_name = "/gdp-world-regions-stacked-area.csv"

# pd.read_csv already returns a DataFrame, so the results are used directly
# instead of being wrapped in pd.DataFrame a second time.
gdp_per_capita_data = pd.read_csv(data_path + gdp_per_capita_file_name)
life_expectancy_data = pd.read_csv(data_path + life_expectancy_file_name)
gdp_data = pd.read_csv(data_path + gdp_file_name)

# Join the GDP-per-capita and life-expectancy tables on the columns they share.
data = pd.merge(
    gdp_per_capita_data,
    life_expectancy_data,
    on=["Year", "Code", "Entity"],
)

# Keep one row per entity: with the rows ordered newest year first,
# drop_duplicates retains the first (most recent) row of every entity.
grouped_data = data.sort_values("Year", ascending=False)
grouped_data = grouped_data.drop_duplicates(["Entity"])

# Remove the annotations column and the "World" row.
grouped_data = grouped_data.drop(columns=["145446-annotations"])
grouped_data = grouped_data.loc[grouped_data["Entity"] != "World"]
grouped_data

import matplotlib.pyplot as plt
import matplotlib.lines as li
import numpy as np

# Pull the plotted columns out as plain lists in one step each (no row-wise
# iteration). life_exp and gdp_cap are reused by later cells for statistics.
life_exp = grouped_data['Life expectancy'].tolist()
gdp_cap = grouped_data['GDP per capita'].tolist()
labels = grouped_data['Entity'].tolist()

f, ax = plt.subplots()
f.set_figwidth(30)
f.set_figheight(30)
ax.set_xlabel('GDP per capita')
ax.set_ylabel('Life expectancy')
ax.scatter(gdp_cap, life_exp)

# Label every point with its entity name.
for label_text, x_pos, y_pos in zip(labels, gdp_cap, life_exp):
    ax.annotate(label_text, (x_pos, y_pos))

# Draw a line for the average life expectancy
average_life_exp = np.mean(life_exp)
max_gdp_cap = max(gdp_cap)  # computed once; used for the line and its label
ax.plot([0, max_gdp_cap], [average_life_exp] * 2, linestyle='dashed')
ax.annotate('Average life expectancy', (max_gdp_cap * (6 / 7), average_life_exp + 0.2))

# NOTE(review): the country count in the title is hard-coded; confirm that it
# still matches len(labels) for the data set being loaded.
ax.set_title('Life expectancy to GDP per capita for 166 countries')

b. Analysis

c. Data cleaning

d. Highest life expectancy

from tabulate import tabulate

# Threshold: one standard deviation (as computed by np.std) above the mean
# life expectancy of the plotted entities.
mean_plus_std = np.mean(life_exp) + np.std(life_exp)

# Entities above the threshold, highest life expectancy first.
highest_life_expectancy = (
    grouped_data
    .loc[grouped_data['Life expectancy'] > mean_plus_std]
    .sort_values(by=['Life expectancy'], ascending=False)
)
print(tabulate(highest_life_expectancy, headers='keys', tablefmt='psql'))

e. High life expectancy with low GDP per capita

# Thresholds sit a third of a standard deviation away from the mean:
# above it for life expectancy, below it for GDP per capita.
life_exp_threshold = np.mean(life_exp) + np.std(life_exp) / 3
gdp_cap_threshold = np.mean(gdp_cap) - np.std(gdp_cap) / 3

# NOTE: this rebinds highest_life_expectancy with a looser threshold than the
# one used in section d.
highest_life_expectancy = grouped_data.loc[
    grouped_data['Life expectancy'] > life_exp_threshold
]

# Build the GDP mask from the frame being filtered (not from grouped_data) so
# the boolean index is guaranteed to align with the rows it selects.
highest_life_expectancy_with_low_gdp = highest_life_expectancy.loc[
    highest_life_expectancy['GDP per capita'] < gdp_cap_threshold
]
print(tabulate(highest_life_expectancy_with_low_gdp, headers='keys', tablefmt='psql'))

f. Does a strong economy always lead to a high life expectancy?

First, we define some helper functions.

def _as_list(value):
    """Return *value* as a list, treating a bare string as a single name."""
    return [value] if isinstance(value, str) else list(value)


def merge_datasets(set_a, set_b, columns=None, columns_to_exclude=None, rows_to_exclude=None):
    """Merge two data sets and keep the most recent row of every entity.

    The frames are joined on 'Year', 'Code' and 'Entity'. The result is
    ordered newest year first and reduced to the first row per 'Entity'.

    Args:
        set_a: Left DataFrame; must contain 'Year', 'Code' and 'Entity'.
        set_b: Right DataFrame; must contain 'Year', 'Code' and 'Entity'.
        columns: Optional column name(s) to keep in addition to the three
            key columns. A single string is treated as one name.
        columns_to_exclude: Optional column name(s) to drop from the result.
        rows_to_exclude: Optional 'Entity' value(s) whose rows are removed.
            A single string is treated as one name rather than being
            iterated character by character.

    Returns:
        The merged and filtered DataFrame.
    """
    key_columns = ["Year", "Code", "Entity"]
    merged = pd.merge(set_a, set_b, on=key_columns)

    # Newest year first, so drop_duplicates keeps each entity's latest row.
    latest = merged.sort_values("Year", ascending=False).drop_duplicates(["Entity"])

    if columns:
        latest = latest[key_columns + _as_list(columns)]
    if columns_to_exclude:
        latest = latest.drop(columns=_as_list(columns_to_exclude))
    if rows_to_exclude:
        # One membership mask instead of re-filtering the frame per name.
        latest = latest[~latest["Entity"].isin(_as_list(rows_to_exclude))]
    return latest

def draw_scatter_plot(data, x_label, y_label, title=None):
    """Draw a scatter plot of two columns of *data*, labelling each point.

    Args:
        data: DataFrame containing the columns named by x_label and y_label
            plus an 'Entity' column used for the point labels.
        x_label: Column plotted on the x-axis; also used as the axis label.
        y_label: Column plotted on the y-axis; also used as the axis label.
        title: Optional plot title; no title is set when falsy.

    Returns:
        The (figure, axes) pair of the new plot.
    """
    import matplotlib.pyplot as plt

    # Column-wise extraction instead of iterating over the rows one by one.
    x = data[x_label].tolist()
    y = data[y_label].tolist()
    labels = data['Entity'].tolist()

    f, ax = plt.subplots()
    f.set_figwidth(30)
    f.set_figheight(30)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.scatter(x, y)

    # Label every point with its entity name.
    for label_text, x_pos, y_pos in zip(labels, x, y):
        ax.annotate(label_text, (x_pos, y_pos))

    if title:
        ax.set_title(title)
    return f, ax

Now we can analyse the data

# Most recent GDP / life-expectancy row per entity, without the annotations
# column and without the listed entities.
# NOTE(review): United States, China and India are presumably left out as
# outliers in total GDP; confirm.
gdp_life = merge_datasets(
    gdp_data,
    life_expectancy_data,
    columns_to_exclude=['146201-annotations'],
    rows_to_exclude=['World', 'United States', 'China', 'India'],
)

# Check for countries that have a high GDP and low life expectancy

# By looking at the quartiles
upper_quartile_gdp = gdp_life['GDP'].quantile(0.75)
countries_upper_quartile_gdp = gdp_life[gdp_life['GDP'] > upper_quartile_gdp]
lower_quartile_life = gdp_life['Life expectancy'].quantile(0.25)
countries_lower_quartile_life = gdp_life[gdp_life['Life expectancy'] < lower_quartile_life]
high_gdp_low_life_quartile = gdp_life[
    (gdp_life['Life expectancy'] < lower_quartile_life)
    & (gdp_life['GDP'] > upper_quartile_gdp)
]

# By looking at the means and standard deviations: thresholds lie a third of
# a standard deviation above the mean GDP and below the mean life expectancy.
gdp_mean_plus_std = gdp_life['GDP'].mean() + (1/3) * gdp_life['GDP'].std()
life_mean_minus_std = gdp_life['Life expectancy'].mean() - (1/3) * gdp_life['Life expectancy'].std()
high_gdp_low_life_std_dev = gdp_life[
    (gdp_life['Life expectancy'] < life_mean_minus_std)
    & (gdp_life['GDP'] > gdp_mean_plus_std)
]

# Printing out results
print('Countries that are in the upper quartile for GDP and lower quartile for life expectancy')
print(tabulate(high_gdp_low_life_quartile, headers='keys', tablefmt='psql'))
# The heading states "Life expectancy <" to match the filter applied above.
print('Countries that are in the category GDP > mean(GDP)+(1/3)*stddev(GDP) and Life expectancy < mean(Life expectancy)-(1/3)*stddev(Life expectancy)')
print(tabulate(high_gdp_low_life_std_dev, headers='keys', tablefmt='psql'))

# Plot total GDP against life expectancy; the returned (figure, axes) pair is
# bound to "_" because it is not used afterwards.
_ = draw_scatter_plot(
    data=gdp_life,
    x_label="GDP",
    y_label="Life expectancy",
)

g. GDP per capita as an indicator of the economy

gdp_per_cap_life = grouped_data

# Check for countries that have a high GDP per capita and low life expectancy

# By looking at the quartiles
upper_quartile_gdp = gdp_per_cap_life['GDP per capita'].quantile(0.75)
countries_upper_quartile_gdp = gdp_per_cap_life[gdp_per_cap_life['GDP per capita'] > upper_quartile_gdp]
lower_quartile_life = gdp_per_cap_life['Life expectancy'].quantile(0.25)
countries_lower_quartile_life = gdp_per_cap_life[gdp_per_cap_life['Life expectancy'] < lower_quartile_life]
high_gdp_low_life_quartile = gdp_per_cap_life[
    (gdp_per_cap_life['Life expectancy'] < lower_quartile_life)
    & (gdp_per_cap_life['GDP per capita'] > upper_quartile_gdp)
]

# By looking at the means and standard deviations: thresholds lie a third of
# a standard deviation above the mean GDP per capita and below the mean life
# expectancy.
gdp_mean_plus_std = gdp_per_cap_life['GDP per capita'].mean() + (1/3) * gdp_per_cap_life['GDP per capita'].std()
life_mean_minus_std = gdp_per_cap_life['Life expectancy'].mean() - (1/3) * gdp_per_cap_life['Life expectancy'].std()
high_gdp_per_capita_low_life_std_dev = gdp_per_cap_life[
    (gdp_per_cap_life['Life expectancy'] < life_mean_minus_std)
    & (gdp_per_cap_life['GDP per capita'] > gdp_mean_plus_std)
]

# Printing out results
print('Countries that are in the upper quartile for GDP per capita and lower quartile for life expectancy')
print(tabulate(high_gdp_low_life_quartile, headers='keys', tablefmt='psql'))
# The heading states the (1/3) factor and "<" to match the filter applied above.
print('Countries that are in the category GDP per capita > mean(GDP per capita)+(1/3)*stddev(GDP per capita) and Life expectancy < mean(Life expectancy)-(1/3)*stddev(Life expectancy)')
print(tabulate(high_gdp_per_capita_low_life_std_dev, headers='keys', tablefmt='psql'))

a. Scatter plot