import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from helper import LIFE_EXPECTANCY_KEY, GDP_PC_KEY, GDP_KEY, COUNTRY_KEY, YEAR_KEY
# Load the raw CSV tables.
# GDP table (stored in gdp_capita_data), source: https://ourworldindata.org/economic-growth
gdp_capita_data = pd.read_csv('datasets/gdp-2016.csv')
# Life expectancy table, source: https://ourworldindata.org/life-expectancy
life_expectancy_data = pd.read_csv('datasets/life-expectancy.csv')

# Keep only the rows whose year column equals 2016.
gdp_capita_data_2016 = gdp_capita_data.loc[gdp_capita_data[YEAR_KEY] == 2016]
life_expectancy_data_2016 = life_expectancy_data.loc[
    life_expectancy_data[YEAR_KEY] == 2016
]

# Inner-join the two 2016 tables on the country column, discard every row
# that has a missing value, and order the result by the GDP_PC_KEY column
# (ascending, the sort_values default).
merged = (
    gdp_capita_data_2016
    .merge(life_expectancy_data_2016, how='inner', on=COUNTRY_KEY)
    .dropna()
    .sort_values(GDP_PC_KEY)
)
# X (GDP_PC_KEY column) and Y (LIFE_EXPECTANCY_KEY column) series reused by
# the plots and the regression below.
x = merged[GDP_PC_KEY]
y = merged[LIFE_EXPECTANCY_KEY]

# Scatter the two columns against each other and set the title through
# Artist.update().
# NOTE: scatter_plot is bound to whatever update() returns, not to the Axes
# object produced by .scatter().
scatter_plot = (
    merged.plot
    .scatter(x=GDP_PC_KEY, y=LIFE_EXPECTANCY_KEY, c="c")
    .update({"title": "Life expectancy vs GDP per capita"})
)
# Regress y on log(x): apply the natural logarithm to x, reshaped into a
# single-feature column, then fit an ordinary linear regression on the result.
transformer = FunctionTransformer(np.log, validate=True)
x_transformed = transformer.fit_transform(np.asarray(x).reshape(-1, 1))

model = LinearRegression()
model.fit(x_transformed, y)

# Predictions of the fitted model at the observed (log-transformed) x values.
yfit = model.predict(x_transformed)

# Visualise the fitted curve (dashed black line) on top of the raw points.
plt.scatter(x, y, c="c")
plt.plot(x, yfit, "k--")
plt.title('Fitting data to logarithmic curve')
plt.xlabel(GDP_PC_KEY)
plt.ylabel(LIFE_EXPECTANCY_KEY)
plt.show()
# Mean and standard deviation of life expectancy.
# The statistics are taken on the life-expectancy column only: reducing the
# whole frame (merged.mean() / merged.std()) raises TypeError on pandas >= 2.0
# as soon as the frame contains a non-numeric column (the country column is
# presumably text), and needlessly reduces every other column otherwise.
mean_life = merged[LIFE_EXPECTANCY_KEY].mean()
std_life = merged[LIFE_EXPECTANCY_KEY].std()

# Countries whose life expectancy is more than one standard deviation above
# the mean, keeping only the country and life-expectancy columns and sorted
# from highest to lowest life expectancy.
country_life_over_std = (
    merged.loc[
        merged[LIFE_EXPECTANCY_KEY] > mean_life + std_life,
        [COUNTRY_KEY, LIFE_EXPECTANCY_KEY],
    ]
    .sort_values(LIFE_EXPECTANCY_KEY, ascending=False)
)
# Bare expression: only displays the table when run as a notebook cell.
country_life_over_std
# Load the GDP dataset.
# source: https://ourworldindata.org/economic-growth
gdp_data = pd.read_csv('datasets/gross-domestic-product.csv')

# Keep only the rows whose year column equals 2016.
gdp_data_2016 = gdp_data.loc[gdp_data[YEAR_KEY] == 2016]

# Build a single table: GDP joined with life expectancy, then with the
# gdp_capita_data_2016 table, all inner-joined on the country column.
# Rows with any missing value are dropped.
gdp_le = (
    gdp_data_2016
    .merge(life_expectancy_data_2016, how='inner', on=COUNTRY_KEY)
    .merge(gdp_capita_data_2016, how='inner', on=COUNTRY_KEY)
    .dropna()
)

# 'World' is not a country: locate its row label(s) and drop them, then
# order the remaining entries by GDP from largest to smallest.
world_index = gdp_le.loc[gdp_le[COUNTRY_KEY] == 'World'].index
gdp_le = gdp_le.drop(world_index).sort_values(GDP_KEY, ascending=False)
# One figure with two side-by-side panels describing the GDP column of gdp_le.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# Left panel: distribution of GDP as a 100-bin histogram.
axs[0].hist(gdp_le[[GDP_KEY]], bins=100)
axs[0].set(
    xlabel=GDP_KEY,
    ylabel="Frequency",
    title="Histogram of GDP year 2016",
)

# Right panel: the same GDP values summarised as a boxplot.
axs[1].boxplot(gdp_le[GDP_KEY])
axs[1].set(ylabel=GDP_KEY, title="Boxplot of GDP year 2016")

fig.tight_layout()
# Mean and standard deviation of GDP across the rows of gdp_le.
# Computed on the GDP column alone: gdp_le.mean() / gdp_le.std() reduce the
# whole frame, which raises TypeError on pandas >= 2.0 when a non-numeric
# column (the country column is presumably text) is present.
mean_gdp = gdp_le[GDP_KEY].mean()
std_gdp = gdp_le[GDP_KEY].std()

# Countries with low GDP, i.e. more than one standard deviation below the
# mean. Bare expression: only displays the table when run as a notebook cell.
gdp_le.loc[
    gdp_le[GDP_KEY] < mean_gdp - std_gdp,
    [COUNTRY_KEY, LIFE_EXPECTANCY_KEY, GDP_KEY],
]
# Attach the 2016 GDP figures to the countries that have a high life
# expectancy (inner join on the country column), keeping three columns.
high_le_gdp = (
    country_life_over_std
    .merge(gdp_data_2016, how="inner", on=COUNTRY_KEY)
    [[COUNTRY_KEY, LIFE_EXPECTANCY_KEY, GDP_KEY]]
)

# Mean and standard deviation of GDP within that subset.
# Taken on the GDP column only: high_le_gdp still carries the country column,
# so the frame-wide high_le_gdp.mean() / .std() raise TypeError on
# pandas >= 2.0 if that column is non-numeric (presumably text).
mean_gdp_highle = high_le_gdp[GDP_KEY].mean()
std_gdp_highle = high_le_gdp[GDP_KEY].std()

# High life expectancy AND low GDP, where "low" means more than one standard
# deviation below the subset's mean GDP.
low_gdp_high_le = high_le_gdp[high_le_gdp[GDP_KEY] < mean_gdp_highle - std_gdp_highle]
# Bare expression: only displays the table when run as a notebook cell.
low_gdp_high_le
# Alternative definition of "low GDP": below the 25th percentile of the GDP
# column of gdp_data_2016.
gdp_lower_quartile = np.percentile(gdp_data_2016[GDP_KEY], q=25)
low_gdp_high_le = high_le_gdp.loc[high_le_gdp[GDP_KEY] < gdp_lower_quartile]
# Bare expression: only displays the table when run as a notebook cell.
low_gdp_high_le
from helper import _gdp_le_scatter_plot
# Split gdp_le into two subpopulations: rows whose life expectancy exceeds
# mean + one standard deviation (the *_high_LE series) and all other rows.
mask = gdp_le[LIFE_EXPECTANCY_KEY] > mean_life + std_life
x = gdp_le.loc[~mask, GDP_KEY]
x_high_LE = gdp_le.loc[mask, GDP_KEY]
y = gdp_le.loc[~mask, LIFE_EXPECTANCY_KEY]
y_high_LE = gdp_le.loc[mask, LIFE_EXPECTANCY_KEY]

# Positional arguments handed to the helper for both panels.
args = [
    x,
    y,
    x_high_LE,
    y_high_LE,
    gdp_lower_quartile,
    "Lower quartile of GDP",
]

# Two-panel figure: the same helper call is issued once per panel.
plt.subplots(1, 2, figsize=(12, 5))

# Left panel: full view.
plt.subplot(1, 2, 1)
_gdp_le_scatter_plot(*args)
plt.title("Life expectancy vs GDP")

# Right panel: same call, then zoom in by resetting the axis limits around
# the lower-quartile GDP and the high-life-expectancy threshold.
plt.subplot(1, 2, 2)
_gdp_le_scatter_plot(*args)
plt.title("Zoomed In")
plt.xlim(0, gdp_lower_quartile + 1e10)
plt.ylim(mean_life + std_life - 5, mean_life + std_life + 6)

plt.tight_layout()
plt.show()
# Recompute the lower quartile, this time over the GDP of the
# high-life-expectancy subset only, and filter that subset with it.
gdp_lower_quartile = np.percentile(high_le_gdp[GDP_KEY], q=25)
low_gdp_high_le = high_le_gdp.loc[high_le_gdp[GDP_KEY] < gdp_lower_quartile]
# Bare expression: only displays the table when run as a notebook cell.
low_gdp_high_le
# Two candidate thresholds for "high GDP", both taken over gdp_data_2016:
# the 75th percentile, and the mean plus one standard deviation.
gdp_upper_quartile = np.percentile(gdp_data_2016[GDP_KEY], q=75)
gdp_std_above_mean = (
    gdp_data_2016[GDP_KEY].mean() + gdp_data_2016[GDP_KEY].std()
)

plt.figure(figsize=(10, 6))

# Reuse the earlier argument list, swapping in the new limit and its label
# (positions 4 and 5).
args[4:6] = [gdp_upper_quartile, "Upper quartile of GDP"]
_gdp_le_scatter_plot(
    *args,
    second_limit=gdp_std_above_mean,
    second_label="One σ above µ of GDP",
)
plt.title("Life Expectancy vs GDP")
plt.show()
# Countries whose GDP lies above the upper quartile.
high_gdp = gdp_le[gdp_le[GDP_KEY] > gdp_upper_quartile]

# Of those, keep the ones whose life expectancy is NOT above
# mean + one standard deviation (our definition of a high life expectancy).
mask = high_gdp[LIFE_EXPECTANCY_KEY] <= mean_life + std_life
high_gdp_low_le = high_gdp.loc[mask, [COUNTRY_KEY, GDP_KEY, LIFE_EXPECTANCY_KEY]]

# The three largest economies (by GDP) in that group.
# Bare expression: only displays the table when run as a notebook cell.
high_gdp_low_le.sort_values(GDP_KEY, ascending=False).iloc[:3]
# Thresholds on the GDP_PC_KEY column of gdp_capita_data_2016:
# the 75th percentile, and the mean plus one standard deviation.
gdp_pc_upper_quartile = np.percentile(gdp_capita_data_2016[GDP_PC_KEY], q=75)
gdp_pc_std_above_mean = (
    gdp_capita_data_2016[GDP_PC_KEY].mean()
    + gdp_capita_data_2016[GDP_PC_KEY].std()
)

# Split the rows of `merged` into those with a high life expectancy
# (above mean + one standard deviation) and the rest.
mask = merged[LIFE_EXPECTANCY_KEY] > mean_life + std_life
x = merged.loc[~mask, GDP_PC_KEY]
x_high_LE = merged.loc[mask, GDP_PC_KEY]
y = merged.loc[~mask, LIFE_EXPECTANCY_KEY]
y_high_LE = merged.loc[mask, LIFE_EXPECTANCY_KEY]

# Positional arguments for the scatter-plot helper.
args = [
    x,
    y,
    x_high_LE,
    y_high_LE,
    gdp_pc_upper_quartile,
    "Upper quartile of GDP per capita",
    GDP_PC_KEY,
]

# Draw the figure.
plt.figure(figsize=(9, 6))
_gdp_le_scatter_plot(
    *args,
    second_limit=gdp_pc_std_above_mean,
    second_label="One σ above µ of GDP per capita",
)
plt.title("Life expectancy vs GDP per capita")
plt.show()
from helper import KEY_USA, LIFE_SATISFACTION_KEY, SUICIDE_KEY, BEER_KEY, WINE_KEY, SPIRITS_KEY, ALCOHOL_KEY
# Load the three datasets.
# source: https://ourworldindata.org/happiness-and-life-satisfaction
happiness_data = pd.read_csv('datasets-2/happiness.csv')
# source: https://ourworldindata.org/suicide#suicide-is-a-leading-cause-of-death-especially-in-young-people
suicide_data = pd.read_csv('datasets-2/suicide-death-rates.csv')
# source: https://ourworldindata.org/alcohol-consumption
alcohol_data = pd.read_csv('datasets-2/alcohol-consumption-per-person-us.csv')

# Total alcohol consumption = beer + wine + spirits, stored as a new column.
alcohol_data[ALCOHOL_KEY] = (
    alcohol_data[BEER_KEY]
    .add(alcohol_data[WINE_KEY])
    .add(alcohol_data[SPIRITS_KEY])
)

# Happiness: USA rows only (no year filter is applied here).
happiness_data_usa = happiness_data.loc[happiness_data[COUNTRY_KEY] == KEY_USA]
# Suicide: USA rows with a year strictly greater than 2005.
suicide_data_usa = suicide_data.loc[
    suicide_data[COUNTRY_KEY].eq(KEY_USA) & suicide_data[YEAR_KEY].gt(2005)
]
# Alcohol: rows with a year strictly greater than 2005 (no country filter).
alcohol_data = alcohol_data.loc[alcohol_data[YEAR_KEY] > 2005]
# Line plots of the three USA time series.

# Life satisfaction over time, in its own figure.
plt.plot(happiness_data_usa[YEAR_KEY], happiness_data_usa[LIFE_SATISFACTION_KEY], c="g")
# Title typo fixed ("satisfiscation" -> "satisfaction").
plt.title("Life satisfaction in USA over time (1-10)")
plt.xlabel(YEAR_KEY)
plt.ylabel(LIFE_SATISFACTION_KEY)
plt.show()

# Alcohol consumption and suicide rate, side by side in a second figure.
fig, axs = plt.subplots(1, 2, figsize=(14, 4))

# Left panel: total alcohol consumption.
axs[0].plot(alcohol_data[YEAR_KEY], alcohol_data[ALCOHOL_KEY], c="c")
axs[0].set_title("Total alcohol consumption per person per year in USA")
axs[0].set_xlabel(YEAR_KEY)
axs[0].set_ylabel(ALCOHOL_KEY)

# Right panel: suicide rate.
axs[1].plot(suicide_data_usa[YEAR_KEY], suicide_data_usa[SUICIDE_KEY], c="r")
axs[1].set_title("Suicides per 100K people, USA over time")
axs[1].set_xlabel(YEAR_KEY)
axs[1].set_ylabel("Suicide rate")

# Actually apply the tight layout: the previous `_ = plt.tight_layout` only
# referenced the function without calling it, so the layout was never adjusted.
fig.tight_layout()
# Join the USA life-satisfaction and suicide-rate tables on country and year,
# dropping rows with missing values.
merged = (
    happiness_data_usa
    .merge(suicide_data_usa, how='inner', on=[COUNTRY_KEY, YEAR_KEY])
    .dropna()
)

# Draw both series in one chart with two y-axes.
fig, axs = plt.subplots(figsize=(10, 4))
axs.set(
    title="Life Satisfaction and Suicide Rate",
    xlabel=YEAR_KEY,
    ylabel=LIFE_SATISFACTION_KEY,
)
[l1] = axs.plot(
    merged[YEAR_KEY],
    merged[LIFE_SATISFACTION_KEY],
    c="g",
    label="Life satisfaction (1-10)",
)

# Second y-axis sharing the same x-axis; `axs` now refers to the twin axes.
axs = axs.twinx()
axs.set_ylabel(SUICIDE_KEY)
[l2] = axs.plot(merged[YEAR_KEY], merged[SUICIDE_KEY], c="r", label="Suicide rate")

fig.tight_layout()
# A single legend covering the lines from both axes, placed on the twin axes
# (the current axes at this point).
axs.legend(
    [l1, l2],
    ["Life satisfaction (1-10)", "Suicide rate"],
    loc="center right",
)
plt.show()
# Join the alcohol-consumption and USA suicide-rate tables on the year,
# dropping rows with missing values.
merged = (
    alcohol_data
    .merge(suicide_data_usa, how='inner', on=[YEAR_KEY])
    .dropna()
)

# Draw both series in one chart with two y-axes.
fig, axs = plt.subplots(figsize=(10, 4))
axs.set(
    title="Alcohol consumption and Suicide Rate",
    xlabel=YEAR_KEY,
    ylabel=ALCOHOL_KEY,
)
[l1] = axs.plot(
    merged[YEAR_KEY],
    merged[ALCOHOL_KEY],
    c="c",
    label="Alcohol consumption (litres / person)",
)

# Second y-axis sharing the same x-axis; `axs` now refers to the twin axes.
axs = axs.twinx()
axs.set_ylabel(SUICIDE_KEY)
[l2] = axs.plot(merged[YEAR_KEY], merged[SUICIDE_KEY], c="r", label="Suicide rate")

fig.tight_layout()
# A single legend covering the lines from both axes, placed on the twin axes
# (the current axes at this point).
axs.legend(
    [l1, l2],
    ["Alcohol consumption (litres / person)", "Suicide rate"],
    loc="center right",
)
plt.show()
# Join the alcohol-consumption and USA life-satisfaction tables on the year,
# dropping rows with missing values.
merged = (
    alcohol_data
    .merge(happiness_data_usa, how='inner', on=[YEAR_KEY])
    .dropna()
)

# Draw both series in one chart with two y-axes.
fig, axs = plt.subplots(figsize=(10, 4))
axs.set(
    title="Alcohol consumption and Life Satisfaction",
    xlabel=YEAR_KEY,
    ylabel=ALCOHOL_KEY,
)
[l1] = axs.plot(
    merged[YEAR_KEY],
    merged[ALCOHOL_KEY],
    c="c",
    label="Alcohol consumption (litres / person)",
)

# Second y-axis sharing the same x-axis; `axs` now refers to the twin axes.
axs = axs.twinx()
axs.set_ylabel(LIFE_SATISFACTION_KEY)
[l2] = axs.plot(
    merged[YEAR_KEY],
    merged[LIFE_SATISFACTION_KEY],
    c="g",
    label="Life satisfaction (1-10)",
)

fig.tight_layout()
# A single legend covering the lines from both axes, placed on the twin axes
# (the current axes at this point).
axs.legend(
    [l1, l2],
    ["Alcohol consumption (litres / person)", "Life Satisfaction"],
    loc="center right",
)
plt.show()