a. Scatter plot
import csv
import pandas as pd
# Directory holding the three CSV exports. Each file name below starts with
# "/" so it can be appended to data_path directly.
data_path = "."
gdp_per_capita_file_name = "/gdp-per-capita-maddison-2020.csv"
life_expectancy_file_name = "/life-expectancy.csv"
gdp_file_name = "/gdp-world-regions-stacked-area.csv"

# pd.read_csv already returns a DataFrame, so the results are used as-is
# (no extra pd.DataFrame(...) wrapping). A FileNotFoundError is raised here
# when a CSV is not present under data_path.
gdp_per_capita_data = pd.read_csv(data_path + gdp_per_capita_file_name)
life_expectancy_data = pd.read_csv(data_path + life_expectancy_file_name)
gdp_data = pd.read_csv(data_path + gdp_file_name)
Execution Error
FileNotFoundError: [Errno 2] No such file or directory: './gdp-per-capita-maddison-2020.csv'
# Join GDP-per-capita and life-expectancy rows that share year, country code
# and entity name.
data = gdp_per_capita_data.merge(life_expectancy_data, on=["Year", "Code", "Entity"])

# Newest year first, keep the first (i.e. most recent) row per entity, drop
# the annotation column and the aggregate "World" row.
grouped_data = (
    data.sort_values(by='Year', ascending=False)
    .drop_duplicates(subset=['Entity'])
    .drop(columns=['145446-annotations'])
    .loc[lambda frame: frame.Entity != "World"]
)
grouped_data
import matplotlib.pyplot as plt
import matplotlib.lines as li
import numpy as np
# Pull the plotted columns out as plain lists. life_exp and gdp_cap are
# reused by later cells for their mean / standard deviation.
life_exp = grouped_data['Life expectancy'].tolist()
gdp_cap = grouped_data['GDP per capita'].tolist()
labels = grouped_data['Entity'].tolist()

f, ax = plt.subplots()
f.set_figwidth(30)
f.set_figheight(30)
ax.set_xlabel('GDP per capita')
ax.set_ylabel('Life expectancy')
ax.scatter(gdp_cap, life_exp)

# Label every point with its entity name.
for label, x_value, y_value in zip(labels, gdp_cap, life_exp):
    ax.annotate(label, (x_value, y_value))

# Draw a line for the average life expectancy
average_life_exp = np.mean(life_exp)
ax.plot([0, max(gdp_cap)], [average_life_exp] * 2, linestyle='dashed')
# Place the caption towards the right end of the line, slightly above it.
ax.annotate('Average life expectancy', (max(gdp_cap) * (6 / 7), average_life_exp + 0.2))

# Derive the count from the data so the title cannot go stale when the
# input files change.
ax.title.set_text(f'Life expectancy to GDP per capita for {len(labels)} countries')
b. Analysis
c. Data cleaning
d. Highest life expectancy
from tabulate import tabulate
# Threshold: one standard deviation above the mean life expectancy.
mean_plus_std = np.mean(life_exp) + np.std(life_exp)

# Keep the entities above the threshold, longest-lived first.
highest_life_expectancy = (
    grouped_data[grouped_data['Life expectancy'] > mean_plus_std]
    .sort_values(by='Life expectancy', ascending=False)
)
print(tabulate(highest_life_expectancy, headers='keys', tablefmt='psql'))
+-------+----------------+--------+--------+------------------+-------------------+
| | Entity | Code | Year | GDP per capita | Life expectancy |
|-------+----------------+--------+--------+------------------+-------------------|
| 4914 | Hong Kong | HKG | 2018 | 50839.4 | 84.687 |
| 5801 | Japan | JPN | 2018 | 38673.8 | 84.47 |
| 10867 | Switzerland | CHE | 2018 | 61372.7 | 83.63 |
| 9950 | Singapore | SGP | 2018 | 68402.4 | 83.458 |
| 10310 | Spain | ESP | 2018 | 31496.5 | 83.433 |
| 5636 | Italy | ITA | 2018 | 34364.2 | 83.352 |
| 500 | Australia | AUS | 2018 | 49830.8 | 83.281 |
| 5058 | Iceland | ISL | 2018 | 43438.5 | 82.855 |
| 10196 | South Korea | KOR | 2018 | 37927.6 | 82.846 |
| 5489 | Israel | ISR | 2018 | 32954.8 | 82.819 |
| 10724 | Sweden | SWE | 2018 | 45541.9 | 82.654 |
| 4084 | France | FRA | 2018 | 38515.9 | 82.541 |
| 6927 | Malta | MLT | 2018 | 32028.9 | 82.376 |
| 1843 | Canada | CAN | 2018 | 44868.7 | 82.315 |
| 8427 | Norway | NOR | 2018 | 84580.1 | 82.271 |
| 7941 | New Zealand | NZL | 2018 | 35336.1 | 82.145 |
| 7870 | Netherlands | NLD | 2018 | 47474.1 | 82.143 |
| 5420 | Ireland | IRL | 2018 | 64684.3 | 82.103 |
| 6582 | Luxembourg | LUX | 2018 | 57427.5 | 82.102 |
| 4493 | Greece | GRC | 2018 | 23450.8 | 82.072 |
| 9064 | Portugal | PRT | 2018 | 27035.6 | 81.857 |
| 3885 | Finland | FIN | 2018 | 38896.7 | 81.736 |
| 1034 | Belgium | BEL | 2018 | 39756.2 | 81.468 |
| 579 | Austria | AUT | 2018 | 42988.1 | 81.434 |
| 11917 | United Kingdom | GBR | 2018 | 38058.1 | 81.236 |
| 4340 | Germany | DEU | 2018 | 46177.6 | 81.18 |
| 10051 | Slovenia | SVN | 2018 | 29244.9 | 81.172 |
| 2776 | Cyprus | CYP | 2018 | 27184.4 | 80.828 |
| 3078 | Denmark | DNK | 2018 | 46312.3 | 80.784 |
+-------+----------------+--------+--------+------------------+-------------------+
e. High life expectancy with low GDP per capita
# "High" life expectancy: more than a third of a standard deviation above the mean.
# "Low" GDP per capita: more than a third of a standard deviation below the mean.
life_exp_threshold = np.mean(life_exp) + np.std(life_exp) / 3
gdp_cap_threshold = np.mean(gdp_cap) - np.std(gdp_cap) / 3

# NOTE(review): this rebinds highest_life_expectancy, which an earlier cell
# computed with a stricter threshold (mean + 1 std).
highest_life_expectancy = grouped_data.loc[grouped_data['Life expectancy'] > life_exp_threshold]

# Build the mask from the frame being filtered; the previous version used a
# mask computed on grouped_data and relied on implicit index alignment.
highest_life_expectancy_with_low_gdp = highest_life_expectancy.loc[
    highest_life_expectancy['GDP per capita'] < gdp_cap_threshold
]
print(tabulate(highest_life_expectancy_with_low_gdp, headers='keys', tablefmt='psql'))
+-------+------------------------+--------+--------+------------------+-------------------+
| | Entity | Code | Year | GDP per capita | Life expectancy |
|-------+------------------------+--------+--------+------------------+-------------------|
| 3354 | Ecuador | ECU | 2018 | 10638.8 | 76.8 |
| 9469 | Saint Lucia | LCA | 2018 | 10475.4 | 76.057 |
| 1245 | Bosnia and Herzegovina | BIH | 2018 | 10460.5 | 77.262 |
| 10387 | Sri Lanka | LKA | 2018 | 11662.9 | 76.812 |
| 2707 | Cuba | CUB | 2018 | 8325.63 | 78.726 |
| 826 | Barbados | BRB | 2018 | 11995.2 | 79.081 |
| 7424 | Morocco | MAR | 2018 | 8451.14 | 76.453 |
| 11390 | Tunisia | TUN | 2018 | 11353.9 | 76.505 |
| 137 | Albania | ALB | 2018 | 11104.2 | 78.458 |
| 12294 | Vietnam | VNM | 2018 | 6814.14 | 75.317 |
+-------+------------------------+--------+--------+------------------+-------------------+
f. Does a strong economy always lead to a high life expectancy?
First, we define some helper functions.
def merge_datasets(set_a, set_b, columns=None, columns_to_exclude=None, rows_to_exclude=None):
    """Merge two datasets and keep the most recent row for every entity.

    The frames are inner-joined on the "Year", "Code" and "Entity" columns,
    sorted by year (newest first) and de-duplicated on "Entity", so each
    entity appears once with its latest year present in both inputs.

    Args:
        set_a: Left DataFrame; must contain "Year", "Code" and "Entity".
        set_b: Right DataFrame; must contain "Year", "Code" and "Entity".
        columns: Optional list/tuple of extra column names to keep. When
            given, the result holds "Year", "Code", "Entity" and these
            columns, in that order.
        columns_to_exclude: Optional column name(s) to drop from the result.
        rows_to_exclude: Optional entity name, or list/tuple of entity
            names, whose rows are removed from the result.

    Returns:
        The merged DataFrame with one row per remaining entity.
    """
    key_columns = ['Year', 'Code', 'Entity']
    data = pd.merge(set_a, set_b, on=key_columns)
    grouped_data = data.sort_values('Year', ascending=False).drop_duplicates(['Entity'])

    if columns:
        # list(...) so that tuples are accepted as well as lists.
        grouped_data = grouped_data[key_columns + list(columns)]
    if columns_to_exclude:
        grouped_data = grouped_data.drop(columns_to_exclude, axis=1)
    if rows_to_exclude:
        # A bare string means "this one entity", not an iterable of characters.
        if isinstance(rows_to_exclude, str):
            rows_to_exclude = [rows_to_exclude]
        # One vectorised mask instead of re-filtering the frame once per name.
        grouped_data = grouped_data[~grouped_data['Entity'].isin(list(rows_to_exclude))]
    return grouped_data
def draw_scatter_plot(data, x_label, y_label, title=None):
    """Draw a labelled scatter plot of two columns of ``data``.

    Every row becomes one point, annotated with the value of its "Entity"
    column. The column names double as the axis labels.

    Args:
        data: DataFrame containing ``x_label``, ``y_label`` and "Entity".
        x_label: Name of the column plotted on the x-axis.
        y_label: Name of the column plotted on the y-axis.
        title: Optional plot title; no title is set when empty or None.

    Returns:
        The ``(figure, axes)`` pair returned by ``plt.subplots()``.
    """
    import matplotlib.pyplot as plt

    x = data[x_label].tolist()
    y = data[y_label].tolist()
    labels = data['Entity'].tolist()

    f, ax = plt.subplots()
    f.set_figwidth(30)
    f.set_figheight(30)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.scatter(x, y)

    # Write each entity's name next to its point.
    for label, x_value, y_value in zip(labels, x, y):
        ax.annotate(label, (x_value, y_value))

    if title:
        ax.title.set_text(title)
    return f, ax
Now we can analyse the data
# Latest total-GDP / life-expectancy row per entity, without the "World"
# aggregate and without the United States, China and India.
# NOTE(review): presumably those three are excluded as total-GDP outliers - confirm.
gdp_life = merge_datasets(
    gdp_data,
    life_expectancy_data,
    columns_to_exclude=['146201-annotations'],
    rows_to_exclude=['World', 'United States', 'China', 'India'],
)

# Check for countries that have a high GDP and low life expectancy
# By looking at the quartiles
upper_quartile_gdp = gdp_life['GDP'].quantile(0.75)
countries_upper_quartile_gdp = gdp_life[gdp_life['GDP'] > upper_quartile_gdp]
lower_quartile_life = gdp_life['Life expectancy'].quantile(0.25)
countries_lower_quartile_life = gdp_life[gdp_life['Life expectancy'] < lower_quartile_life]
high_gdp_low_life_quartile = gdp_life[
    (gdp_life['Life expectancy'] < lower_quartile_life)
    & (gdp_life['GDP'] > upper_quartile_gdp)
]

# By looking at the means and standard deviations
# (a third of a standard deviation away from the mean on each axis)
gdp_mean_plus_std = gdp_life['GDP'].mean() + (1/3) * gdp_life['GDP'].std()
life_mean_minus_std = gdp_life['Life expectancy'].mean() - (1/3) * gdp_life['Life expectancy'].std()
high_gdp_low_life_std_dev = gdp_life[
    (gdp_life['Life expectancy'] < life_mean_minus_std)
    & (gdp_life['GDP'] > gdp_mean_plus_std)
]

# Printing out results
print('Countries that are in the upper quartile for GDP and lower quartile for life expectancy')
print(tabulate(high_gdp_low_life_quartile, headers='keys', tablefmt='psql'))
# The caption states the filter actually applied: life expectancy BELOW the threshold.
print('Countries that are in the category GDP > mean(GDP)+(1/3)*stddev(GDP) and Life expectancy < mean(Life expectancy)-(1/3)*stddev(Life expectancy)')
print(tabulate(high_gdp_low_life_std_dev, headers='keys', tablefmt='psql'))
Countries that are in the upper quartile for GDP and lower quartile for life expectancy
+-------+--------------+--------+--------+-------------+-------------------+
| | Entity | Code | Year | GDP | Life expectancy |
|-------+--------------+--------+--------+-------------+-------------------|
| 8145 | Nigeria | NGA | 2018 | 1.06081e+12 | 54.332 |
| 10108 | South Africa | ZAF | 2018 | 6.73272e+11 | 63.857 |
+-------+--------------+--------+--------+-------------+-------------------+
Countires that are in the category GDP > mean(GDP)+(1/3)*stddev(GDP) and Life expectancy > mean(Life expectancy)-(1/3)*stddev(Life expectancy)
+------+----------+--------+--------+-------------+-------------------+
| | Entity | Code | Year | GDP | Life expectancy |
|------+----------+--------+--------+-------------+-------------------|
| 8553 | Pakistan | PAK | 2018 | 1.21209e+12 | 67.114 |
| 8145 | Nigeria | NGA | 2018 | 1.06081e+12 | 54.332 |
+------+----------+--------+--------+-------------+-------------------+
# Plot total GDP against life expectancy for gdp_life; the returned
# (figure, axes) pair is deliberately discarded.
_ = draw_scatter_plot(data=gdp_life, x_label="GDP", y_label="Life expectancy")
g. GDP per capita as economy indicator
# Reuse the GDP-per-capita / life-expectancy table built earlier
# (this is an alias of grouped_data, not a copy).
gdp_per_cap_life = grouped_data
# NOTE(review): this displays the total-GDP table from the previous section;
# possibly gdp_per_cap_life was intended - confirm.
gdp_life

# Check for countries that have a high GDP per capita and low life expectancy
# By looking at the quartiles
upper_quartile_gdp = gdp_per_cap_life['GDP per capita'].quantile(0.75)
countries_upper_quartile_gdp = gdp_per_cap_life[gdp_per_cap_life['GDP per capita'] > upper_quartile_gdp]
lower_quartile_life = gdp_per_cap_life['Life expectancy'].quantile(0.25)
countries_lower_quartile_life = gdp_per_cap_life[gdp_per_cap_life['Life expectancy'] < lower_quartile_life]
high_gdp_low_life_quartile = gdp_per_cap_life[
    (gdp_per_cap_life['Life expectancy'] < lower_quartile_life)
    & (gdp_per_cap_life['GDP per capita'] > upper_quartile_gdp)
]
high_gdp_low_life_quartile

# By looking at the means and standard deviations
# (a third of a standard deviation away from the mean on each axis)
gdp_mean_plus_std = gdp_per_cap_life['GDP per capita'].mean() + (1/3) * gdp_per_cap_life['GDP per capita'].std()
life_mean_minus_std = gdp_per_cap_life['Life expectancy'].mean() - (1/3) * gdp_per_cap_life['Life expectancy'].std()
high_gdp_per_capita_low_life_std_dev = gdp_per_cap_life[
    (gdp_per_cap_life['Life expectancy'] < life_mean_minus_std)
    & (gdp_per_cap_life['GDP per capita'] > gdp_mean_plus_std)
]
high_gdp_per_capita_low_life_std_dev

# Printing out results
print('Countries that are in the upper quartile for GDP per capita and lower quartile for life expectancy')
print(tabulate(high_gdp_low_life_quartile, headers='keys', tablefmt='psql'))
# The caption states the filter actually applied: a (1/3) factor on both
# standard deviations and life expectancy BELOW the threshold.
print('Countries that are in the category GDP per capita > mean(GDP per capita)+(1/3)*stddev(GDP per capita) and Life expectancy < mean(Life expectancy)-(1/3)*stddev(Life expectancy)')
print(tabulate(high_gdp_per_capita_low_life_std_dev, headers='keys', tablefmt='psql'))
Countries that are in the upper quartile for GDP per capita and lower quartile for life expectancy
+------+-------------------+--------+--------+------------------+-------------------+
| | Entity | Code | Year | GDP per capita | Life expectancy |
|------+-------------------+--------+--------+------------------+-------------------|
| 3564 | Equatorial Guinea | GNQ | 2018 | 28529 | 58.402 |
+------+-------------------+--------+--------+------------------+-------------------+
Countires that are in the category GDP > mean(GDP per capita)+stddev(GDP per capita) and Life expectancy > mean(Life expectancy)-stddev(Life expectancy)
+-------+-------------------+--------+--------+------------------+-------------------+
| | Entity | Code | Year | GDP per capita | Life expectancy |
|-------+-------------------+--------+--------+------------------+-------------------|
| 3564 | Equatorial Guinea | GNQ | 2018 | 28529 | 58.402 |
| 11502 | Turkmenistan | TKM | 2018 | 26318.4 | 68.073 |
+-------+-------------------+--------+--------+------------------+-------------------+