# Directory that holds the CSV input files; edit this before running the code.
# The figures are embedded through markdown so that they are always visible;
# markdown does NOT resolve paths against this directory.
workdir = "/work/"
# Import modules needed for the program
import numpy
import matplotlib
import pandas
# Display dataframes in full: no limit on the number of rows or columns shown.
pandas.set_option("display.max_rows", None)
pandas.set_option("display.max_columns", None)

# Load the life-expectancy and GDP-per-capita tables from the working directory.
expectancy, gdp = (
    pandas.read_csv(workdir + csv_name)
    for csv_name in ('life-expectancy.csv', 'gdp-per-capita-worldbank.csv')
)

# Keep the 2019 rows only and discard every row that has a missing field.
# Per the authors' note, rows with empty fields represent continents rather
# than countries.
expectancy2019 = expectancy.loc[expectancy['Year'].eq(2019)].dropna()
gdp2019 = gdp.loc[gdp['Year'].eq(2019)].dropna()
# Restrict each 2019 table to the entities (countries) found in the other one.
_has_gdp = expectancy2019['Entity'].isin(gdp2019['Entity'])
expectancyMatch = expectancy2019[_has_gdp]
_has_expectancy = gdp2019['Entity'].isin(expectancyMatch['Entity'])
gdpMatch = gdp2019[_has_expectancy]

# Sanity check: count how many entities of expectancyMatch occur in gdpMatch
# (True) and how many do not (False), and print the counts.
dotheymatch = expectancyMatch['Entity'].isin(gdpMatch['Entity']).value_counts()
print(dotheymatch)

# Order both tables alphabetically by entity and renumber their rows from 0,
# so that the same row position refers to the same entity in both tables.
sortedExpectancy = expectancyMatch.sort_values('Entity').reset_index(drop=True)
sortedgdp = gdpMatch.sort_values('Entity').reset_index(drop=True)

# Put the two tables side by side. Rows are paired by position, and shared
# column names (e.g. 'Entity', 'Year') appear once per table in the result.
# NOTE(review): this pairing assumes each entity occurs exactly once per
# table for 2019 - confirm against the data.
gdpAndExpectancy = pandas.concat([sortedgdp, sortedExpectancy], axis=1)
# Scatter plot of life expectancy (y) against GDP per capita (x); the points
# are coloured by GDP per capita using the reversed viridis colormap.
_gdp_per_capita = 'GDP per capita, PPP (constant 2017 international $)'
expectancy_gdp_plot = gdpAndExpectancy.plot.scatter(
    x=_gdp_per_capita,
    y='Life expectancy',
    c=_gdp_per_capita,
    colormap='viridis_r',
    sharex=False,
    figsize=(15, 10),
    title='Life expectancy vs GDP per capita',
)
# The file name is relative, so the image is written to the current directory.
expectancy_gdp_plot.figure.savefig('plot1.png')
1d: Which countries have a life expectancy higher than one standard deviation above the mean?
# Mean and standard deviation of life expectancy over the combined table.
_life_expectancy = gdpAndExpectancy['Life expectancy']
expectancymean = _life_expectancy.mean()
expectancystd = _life_expectancy.std()
# Threshold: the life expectancy lying one standard deviation above the mean.
expectancyoneovermean = expectancymean + expectancystd
# Keep only the rows whose life expectancy is strictly above that threshold.
expectancyoneovermeancountries = gdpAndExpectancy.loc[
    _life_expectancy.gt(expectancyoneovermean)
]
# Bare expression: shows the selected countries when run as a notebook cell.
expectancyoneovermeancountries
1e: Which countries have high life expectancy but have low GDP (per capita)?
# Take the high-life-expectancy countries identified in 1d and order them by
# GDP per capita, lowest first; the first five rows are then presented.
_gdp_column = 'GDP per capita, PPP (constant 2017 international $)'
_lowest_gdp_first = expectancyoneovermeancountries.sort_values(_gdp_column)
# reset_index() without drop keeps the previous row labels as an 'index' column.
expectancyovermean_sorted = _lowest_gdp_first.reset_index()
expectancyovermean_sorted.head(5)
# Load the total-GDP table from the working directory.
gdpTotal = pandas.read_csv(workdir + 'gross-domestic-product.csv')
# Keep the 2019 rows only and discard rows with any missing field.
gdpTotalfiltered = gdpTotal.loc[gdpTotal['Year'].eq(2019)].dropna()
# Order by entity and renumber the rows from 0.
gdpSorted = gdpTotalfiltered.sort_values('Entity').reset_index(drop=True)
# Join the 2019 life-expectancy rows with total GDP on the entity name
# (pandas' default join type, i.e. only entities present in both tables).
mergedgdpTotalandLife = expectancy2019.merge(gdpSorted, on='Entity')
# Scatter plot of life expectancy (y) against total GDP (x); the points are
# coloured by total GDP using the reversed viridis colormap.
_total_gdp = 'GDP (constant 2010 US$)'
totalAndlifeplot = mergedgdpTotalandLife.plot.scatter(
    x=_total_gdp,
    y='Life expectancy',
    c=_total_gdp,
    colormap='viridis_r',
    sharex=False,
    figsize=(15, 10),
    title='Life expectancy vs GDP',
)
# The file name is relative, so the image is written to the current directory.
totalAndlifeplot.figure.savefig('plot2.png')
# Draw a vertical reference line at x = 20000 on the axes of the first plot
# (life expectancy vs GDP per capita).
expectancywline = expectancy_gdp_plot.axvline(20000)
# Bare expression: shows the updated figure when run as a notebook cell.
_annotated_figure = expectancy_gdp_plot.figure
_annotated_figure
# Save the annotated figure under a new name; plot1.png is not overwritten.
_annotated_figure.savefig('plot3.png')
Part 2
While working on Part 2 we found some more appropriate methods. We have nevertheless kept the cruder approaches in Part 1 to show how our knowledge of dataframes developed.
Sources: Income inequality: https://ourworldindata.org/income-inequality, Corruption statistics: https://ourworldindata.org/grapher/share-of-people-paying-bribes-vs-corruption-perception, Happiness: https://ourworldindata.org/grapher/happiness-wvs-vs-gallup
def read_csv(file: str, base_dir=None) -> pandas.DataFrame:
    """Read a CSV file from the data directory into a dataframe.

    Args:
        file: Name of the CSV file, relative to the data directory.
        base_dir: Directory that contains the file. When omitted, the
            module-level ``workdir`` setting is used.

    Returns:
        The dataframe produced by ``pandas.read_csv``.
    """
    import os  # local import so the file's import block stays untouched

    directory = workdir if base_dir is None else base_dir
    # os.path.join adds the path separator only when it is missing, so the
    # directory may be given with or without a trailing slash.
    return pandas.read_csv(os.path.join(directory, file))
def df_filter(df: pandas.DataFrame, label, subset, sort_by=None) -> pandas.DataFrame:
    """Keep rows that have a value in ``label``, one row per ``subset`` key.

    Rows whose ``label`` column is null are dropped first. Among the
    remaining rows that share the same ``subset`` value(s), only the last
    one is kept.

    Args:
        df: Dataframe to filter; it is not modified.
        label: Column that must be non-null for a row to be kept.
        subset: Column name (or list of names) identifying duplicates.
        sort_by: Optional column (or list of columns) to sort by before
            de-duplicating, e.g. a year column, so that "last" means the
            highest value rather than whatever row comes last in the input.
            When omitted, the input row order decides, as before.

    Returns:
        A new dataframe with the filtered, de-duplicated rows.
    """
    rows_with_value = df[df[label].notnull()]
    if sort_by is not None:
        # A stable sort keeps the input order among rows with equal keys.
        rows_with_value = rows_with_value.sort_values(sort_by, kind="mergesort")
    return rows_with_value.drop_duplicates(subset=subset, keep="last")
def df_merge(df1: pandas.DataFrame, df2: pandas.DataFrame, label, how: str = "inner") -> pandas.DataFrame:
    """Join two dataframes on a shared column.

    Args:
        df1: Left dataframe.
        df2: Right dataframe.
        label: Name of the column (or list of column names) present in both
            dataframes to join on.
        how: Join type passed to ``pandas.merge``. Defaults to ``"inner"``,
            which keeps only the keys found in both dataframes.

    Returns:
        The merged dataframe.
    """
    return pandas.merge(df1, df2, on=label, how=how)
# Load the three raw tables: bribery/corruption, happiness, income inequality.
bribes = read_csv('share-of-people-paying-bribes-vs-corruption-perception.csv')
happiness = read_csv('Happiness-WVS-vs-Gallup.csv')
inequality = read_csv('economic-inequality-gini-index.csv')

# For each table keep only the rows that have a value in the column of
# interest, and for each entity only the last such row.
# NOTE(review): "last" is the last row in file order - presumably the most
# recent year; confirm that the CSV files are ordered by year.
bribesFilter = df_filter(df=bribes, label='Bribery Rate (%)', subset='Entity')
happinessFilter = df_filter(
    df=happiness,
    label='Share of people who are happy (World Value Survey 2014)',
    subset='Entity',
)
inequalityFilter = df_filter(df=inequality, label='Gini index', subset='Entity')

# Inner join of bribery and happiness data on the entity name ...
happinessAndBribes = df_merge(bribesFilter, happinessFilter, 'Entity')
# ... then join the inequality data onto that result to get the full table.
merged = df_merge(happinessAndBribes, inequalityFilter, 'Entity')
# Scatter plot of bribery rate (y) against the share of happy people (x); the
# points are coloured by the Corruption Perception Index.
_plot4_options = {
    'x': "Share of people who are happy (World Value Survey 2014)",
    'y': "Bribery Rate (%)",
    'c': 'Corruption Perception Index - Transparency International (2018)',
    'colormap': 'viridis_r',
    'sharex': False,
    'figsize': (15, 10),
    'title': 'Bribery Rate vs. Happiness',
}
plot4 = merged.plot.scatter(**_plot4_options)
# The file name is relative, so the image is written to the current directory.
plot4.figure.savefig('plot4.png')
# Scatter plot of bribery rate (y) against the Gini index (x); the points are
# coloured by the Corruption Perception Index.
_plot5_options = {
    'x': "Gini index",
    'y': "Bribery Rate (%)",
    'c': 'Corruption Perception Index - Transparency International (2018)',
    'colormap': 'viridis_r',
    'sharex': False,
    'figsize': (15, 10),
    'title': 'Bribery Rate vs Economic inequality',
}
plot5 = merged.plot.scatter(**_plot5_options)
# Replace the raw column name on the x-axis with a self-explanatory label.
plot5.set_xlabel('Income inequality (Gini Index, lower = more equal)')
plot5.figure.savefig('plot5.png')
# Scatter plot of the share of happy people (y) against the Gini index (x),
# drawn with green markers.
_gini = "Gini index"
_happy_share = "Share of people who are happy (World Value Survey 2014)"
happyVSgini = merged.plot.scatter(
    x=_gini,
    y=_happy_share,
    sharex=False,
    figsize=(15, 10),
    title='Happiness vs Economic inequality',
    c='g',
)
# Thin black lines at the mean of each variable split the plot into quadrants.
happyVSgini.axvline(x=merged[_gini].mean(), c='k', lw=0.5)
happyVSgini.axhline(y=merged[_happy_share].mean(), c='k', lw=0.5)
happyVSgini.set_xlabel('Income inequality (Gini Index, lower = more equal)')
happyVSgini.figure.savefig('plot6.png')