a. Scatter plot
import csv
import pandas as pd
# Location of the input CSV files. Each file name carries its own leading "/"
# so that it can be appended directly to data_path.
data_path = "."
gdp_per_capita_file_name = "/gdp-per-capita-maddison-2020.csv"
life_expectancy_file_name = "/life-expectancy.csv"
gdp_file_name = "/gdp-world-regions-stacked-area.csv"

# pd.read_csv already returns a DataFrame, so no further conversion is needed.
gdp_per_capita_data = pd.read_csv(data_path + gdp_per_capita_file_name)
life_expectancy_data = pd.read_csv(data_path + life_expectancy_file_name)
gdp_data = pd.read_csv(data_path + gdp_file_name)

# Join GDP per capita with life expectancy on the shared key columns
# (pd.merge performs an inner join by default).
data = pd.merge(gdp_per_capita_data, life_expectancy_data, on=["Year", "Code", "Entity"])

# Sorting by year (newest first) and keeping the first row per entity leaves
# one row per entity: the one with the latest year present in the merged data.
grouped_data = data.sort_values('Year', ascending=False).drop_duplicates(['Entity'])
# Drop the annotations column and the "World" row before the analysis.
grouped_data = grouped_data.drop(['145446-annotations'], axis=1)
grouped_data = grouped_data[grouped_data.Entity != "World"]
grouped_data
import matplotlib.pyplot as plt
import matplotlib.lines as li
import numpy as np
# Pull the plotted columns out of grouped_data as plain lists (one entry per
# row) instead of appending row by row through iterrows().
gdp_cap = grouped_data['GDP per capita'].tolist()
life_exp = grouped_data['Life expectancy'].tolist()
labels = grouped_data['Entity'].tolist()

f, ax = plt.subplots()
f.set_figwidth(30)
f.set_figheight(30)
ax.set_xlabel('GDP per capita')
ax.set_ylabel('Life expectancy')
ax.scatter(gdp_cap, life_exp)
# Label every point with the name of its entity.
for txt, x_pos, y_pos in zip(labels, gdp_cap, life_exp):
    ax.annotate(txt, (x_pos, y_pos))

# Draw a line for the average life expectancy
average_life_exp = np.mean(life_exp)
max_gdp_cap = max(gdp_cap)
ax.plot([0, max_gdp_cap], [average_life_exp] * 2, linestyle='dashed')
# Place the caption slightly above the line, towards its right-hand end.
ax.annotate('Average life expectancy', (max_gdp_cap * (6 / 7), average_life_exp + 0.2))
# Derive the count from the data so the title stays correct if the input changes.
ax.title.set_text('Life expectancy to GDP per capita for {} countries'.format(len(labels)))
b. Analysis
c. Data cleaning
d. Highest life expectancy
from tabulate import tabulate
# Threshold: the mean life expectancy plus one standard deviation
# (np.std computes the population standard deviation by default).
mean_plus_std = np.mean(life_exp) + np.std(life_exp)

# Keep the rows above the threshold, highest life expectancy first.
above_threshold = grouped_data['Life expectancy'] > mean_plus_std
highest_life_expectancy = (
    grouped_data.loc[above_threshold]
    .sort_values(by=['Life expectancy'], ascending=False)
)
print(tabulate(highest_life_expectancy, headers='keys', tablefmt='psql'))
e. High life expectancy with low GDP per capita
# Both thresholds sit a third of a standard deviation away from the mean
# (np.std computes the population standard deviation by default).
high_life_threshold = np.mean(life_exp) + np.std(life_exp) / 3
low_gdp_threshold = np.mean(gdp_cap) - np.std(gdp_cap) / 3

# Rows whose life expectancy is above the upper threshold.
highest_life_expectancy = grouped_data.loc[grouped_data['Life expectancy'] > high_life_threshold]

# Build the GDP mask from the frame that is being filtered, so that mask and
# frame share the same index instead of relying on implicit realignment of a
# mask computed on the larger, unfiltered frame.
highest_life_expectancy_with_low_gdp = highest_life_expectancy.loc[
    highest_life_expectancy['GDP per capita'] < low_gdp_threshold
]
print(tabulate(highest_life_expectancy_with_low_gdp, headers='keys', tablefmt='psql'))
f. Does a strong economy always lead to a high life expectancy?
First, we define some helper functions.
def merge_datasets(set_a, set_b, columns=None, columns_to_exclude=None, rows_to_exclude=None):
    """Merge two datasets and keep a single row per entity.

    The frames are joined on ``Year``, ``Code`` and ``Entity``. The merged rows
    are sorted by ``Year`` in descending order and only the first row of every
    ``Entity`` is kept, i.e. the row with the latest year in the merged data.

    Args:
        set_a: Left DataFrame; must contain ``Year``, ``Code`` and ``Entity``.
        set_b: Right DataFrame; must contain ``Year``, ``Code`` and ``Entity``.
        columns: Optional iterable of additional column names to keep. When
            given, the result contains the three key columns followed by
            these columns.
        columns_to_exclude: Optional column label(s) to drop from the result.
        rows_to_exclude: Optional entity name, or iterable of entity names,
            whose rows are removed from the result.

    Returns:
        The merged DataFrame with one row per remaining entity.
    """
    key_columns = ["Year", "Code", "Entity"]
    data = pd.merge(set_a, set_b, on=key_columns)
    grouped_data = data.sort_values('Year', ascending=False).drop_duplicates(['Entity'])
    if columns:
        # list(...) lets callers pass any iterable (e.g. a tuple) of names.
        grouped_data = grouped_data[key_columns + list(columns)]
    if columns_to_exclude:
        grouped_data = grouped_data.drop(columns_to_exclude, axis=1)
    if rows_to_exclude:
        # A bare string is a single entity name, not a sequence of characters.
        if isinstance(rows_to_exclude, str):
            rows_to_exclude = [rows_to_exclude]
        # One vectorised mask instead of re-filtering the frame once per name.
        grouped_data = grouped_data[~grouped_data['Entity'].isin(list(rows_to_exclude))]
    return grouped_data
def draw_scatter_plot(data, x_label, y_label, title=None):
    """Draw a labelled scatter plot of two columns of ``data``.

    Every row of ``data`` becomes one point, annotated with the value of its
    ``Entity`` column. The axis labels are the column names themselves.

    Args:
        data: DataFrame providing the columns ``x_label``, ``y_label`` and
            ``Entity``.
        x_label: Name of the column plotted on the x axis.
        y_label: Name of the column plotted on the y axis.
        title: Optional plot title; no title is set when it is falsy.

    Returns:
        A ``(figure, axes)`` tuple as created by ``plt.subplots()``.
    """
    import matplotlib.pyplot as plt

    # Read the columns directly rather than appending row by row.
    x = data[x_label].tolist()
    y = data[y_label].tolist()
    labels = data['Entity'].tolist()

    f, ax = plt.subplots()
    f.set_figwidth(30)
    f.set_figheight(30)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.scatter(x, y)
    # Label every point with the name of its entity.
    for txt, x_pos, y_pos in zip(labels, x, y):
        ax.annotate(txt, (x_pos, y_pos))
    if title:
        ax.title.set_text(title)
    return f, ax
Now we can analyse the data
# Merge total GDP with life expectancy, leaving out the annotations column
# and the rows for 'World', 'United States', 'China' and 'India'.
gdp_life = merge_datasets(
    gdp_data,
    life_expectancy_data,
    columns_to_exclude=['146201-annotations'],
    rows_to_exclude=['World', 'United States', 'China', 'India'],
)

# Check for countries that have a high GDP and low life expectancy
# By looking at the quartiles
upper_quartile_gdp = gdp_life['GDP'].quantile(0.75)
lower_quartile_life = gdp_life['Life expectancy'].quantile(0.25)
# Compute each boolean mask once and reuse it for the combined selection.
above_upper_quartile_gdp = gdp_life['GDP'] > upper_quartile_gdp
below_lower_quartile_life = gdp_life['Life expectancy'] < lower_quartile_life
countries_upper_quartile_gdp = gdp_life[above_upper_quartile_gdp]
countries_lower_quartile_life = gdp_life[below_lower_quartile_life]
high_gdp_low_life_quartile = gdp_life[below_lower_quartile_life & above_upper_quartile_gdp]

# By looking at the means and standard deviations
# (thresholds lie a third of a standard deviation away from the mean)
gdp_mean_plus_std = gdp_life['GDP'].mean() + (1/3) * gdp_life['GDP'].std()
life_mean_minus_std = gdp_life['Life expectancy'].mean() - (1/3) * gdp_life['Life expectancy'].std()
high_gdp_low_life_std_dev = gdp_life[
    (gdp_life['Life expectancy'] < life_mean_minus_std) & (gdp_life['GDP'] > gdp_mean_plus_std)
]

# Printing out results
print('Countries that are in the upper quartile for GDP and lower quartile for life expectancy')
print(tabulate(high_gdp_low_life_quartile, headers='keys', tablefmt='psql'))
# The description matches the filter above: life expectancy BELOW its threshold.
print('Countries that are in the category GDP > mean(GDP)+(1/3)*stddev(GDP) and Life expectancy < mean(Life expectancy)-(1/3)*stddev(Life expectancy)')
print(tabulate(high_gdp_low_life_std_dev, headers='keys', tablefmt='psql'))
_ = draw_scatter_plot(gdp_life, "GDP", "Life expectancy")
g. GDP per capita as economy indicator
# Reuse the GDP-per-capita / life-expectancy table built for the scatter plot.
gdp_per_cap_life = grouped_data
# NOTE(review): bare expression; it only has an effect as the last line of a
# notebook cell. It names gdp_life rather than gdp_per_cap_life - confirm intent.
gdp_life

# Check for countries that have a high GDP per capita and low life expectancy
# By looking at the quartiles
upper_quartile_gdp = gdp_per_cap_life['GDP per capita'].quantile(0.75)
lower_quartile_life = gdp_per_cap_life['Life expectancy'].quantile(0.25)
# Compute each boolean mask once and reuse it for the combined selection.
above_upper_quartile_gdp = gdp_per_cap_life['GDP per capita'] > upper_quartile_gdp
below_lower_quartile_life = gdp_per_cap_life['Life expectancy'] < lower_quartile_life
countries_upper_quartile_gdp = gdp_per_cap_life[above_upper_quartile_gdp]
countries_lower_quartile_life = gdp_per_cap_life[below_lower_quartile_life]
high_gdp_low_life_quartile = gdp_per_cap_life[below_lower_quartile_life & above_upper_quartile_gdp]
high_gdp_low_life_quartile

# By looking at the means and standard deviations
# (thresholds lie a third of a standard deviation away from the mean)
gdp_mean_plus_std = gdp_per_cap_life['GDP per capita'].mean() + (1/3) * gdp_per_cap_life['GDP per capita'].std()
life_mean_minus_std = gdp_per_cap_life['Life expectancy'].mean() - (1/3) * gdp_per_cap_life['Life expectancy'].std()
high_gdp_per_capita_low_life_std_dev = gdp_per_cap_life[
    (gdp_per_cap_life['Life expectancy'] < life_mean_minus_std)
    & (gdp_per_cap_life['GDP per capita'] > gdp_mean_plus_std)
]
high_gdp_per_capita_low_life_std_dev

# Printing out results
print('Countries that are in the upper quartile for GDP per capita and lower quartile for life expectancy')
print(tabulate(high_gdp_low_life_quartile, headers='keys', tablefmt='psql'))
# The description matches the filter above: a (1/3) standard-deviation margin
# and life expectancy BELOW its threshold.
print('Countries that are in the category GDP per capita > mean(GDP per capita)+(1/3)*stddev(GDP per capita) and Life expectancy < mean(Life expectancy)-(1/3)*stddev(Life expectancy)')
print(tabulate(high_gdp_per_capita_low_life_std_dev, headers='keys', tablefmt='psql'))