# Authors: Samuel Ivarsson and Martin Andersson
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Load the three data sets used in this section.
gdpDF = pd.read_csv('/work/gdp-per-capita-worldbank.csv')
lifeDF = pd.read_csv('/work/life-expectancy-at-birth-total-years.csv')
gdp2DF = pd.read_csv('/work/gdp-world-regions-stacked-area.csv')

# Names of the value column in each data set.
gdpString = 'GDP per capita, PPP (constant 2017 international $)'
lifeString = 'Life expectancy at birth, total (years)'
gdp2String = 'GDP'

# GDP per capita for 2017, specific countries only: entries without a code
# (areas) and the world entry ('OWID_WRL') are removed.
# dropna() returns a new frame here instead of running in place on the .loc
# selection, which is what triggers pandas' SettingWithCopyWarning.
gdp2017 = gdpDF.loc[gdpDF['Year'] == 2017].dropna(subset=['Code'])
gdp2017 = gdp2017[gdp2017['Code'] != 'OWID_WRL']

# Life expectancy for 2017, filtered the same way.
life2017 = lifeDF.loc[lifeDF['Year'] == 2017].dropna(subset=['Code'])
life2017 = life2017[life2017['Code'] != 'OWID_WRL']

# Combine both data sets; only rows matching on code, entity and year remain.
result = pd.merge(gdp2017, life2017, on=['Code', 'Entity', 'Year'])

# GDP per capita on the x-axis, life expectancy on the y-axis.
x = result[gdpString]
y = result[lifeString]

# Scatter plot, labelled with the column names so the axes are self-explanatory.
plt.scatter(x, y)
plt.title('GDP per capita (PPP) vs. Life expectancy')
plt.xlabel(gdpString)
plt.ylabel(lifeString)
plt.show()
# Centre and spread of life expectancy over the merged 2017 data.
lifeSeries = result[lifeString]
lifeMean = np.mean(lifeSeries)
lifeSTD = np.std(lifeSeries)

# Entries whose life expectancy is more than one standard deviation above the mean.
highLifeCutoff = lifeMean + lifeSTD
var1 = result.loc[lifeSeries > highLifeCutoff]

# Entries with above-average life expectancy although GDP per capita is below lowGDP.
lowGDP = 23937  # NOTE(review): hard-coded threshold; its origin is not shown here
belowLowGDP = result[gdpString] < lowGDP
aboveMeanLife = lifeSeries > lifeMean
var2 = result.loc[belowLowGDP & aboveMeanLife]
# Does every strong economy (normally indicated by GDP) have high life expectancy?

# Total GDP for 2017, specific countries only: entries without a code (areas)
# and the world entry ('OWID_WRL') are removed.
# dropna() returns a new frame instead of running in place on the .loc
# selection, which is what triggers pandas' SettingWithCopyWarning.
gdp2_2017 = gdp2DF.loc[gdp2DF['Year'] == 2017].dropna(subset=['Code'])
gdp2_2017 = gdp2_2017[gdp2_2017['Code'] != 'OWID_WRL']

# Attach the 2017 life expectancy to each country's total GDP.
result2 = pd.merge(gdp2_2017, life2017, on=['Code', 'Entity', 'Year'])

# Largest total GDP first; keep the ten largest for the chart.
result2Sorted = result2.sort_values(by=[gdp2String], ascending=False)
top10 = result2Sorted.head(10)
x = top10['Code']
y = top10[lifeString]

# plt.subplots() keeps the default figure margins, so tick labels, axis labels
# and the title have room inside the figure (axes created with
# add_axes([0, 0, 1, 1]) span the entire canvas and leave none).
fig, ax = plt.subplots()
ax.bar(x, y)
ax.set_title('Life expectancy in the 10 countries with the highest GDP')
ax.set_xlabel('Country code')
ax.set_ylabel(lifeString)
plt.show()
# Same comparison, but ranking by GDP per capita: highest first, ten highest
# kept for the chart.
resultSorted = result.sort_values(by=[gdpString], ascending=False)
top10PerCapita = resultSorted.head(10)
x = top10PerCapita['Code']
y = top10PerCapita[lifeString]

# plt.subplots() keeps the default figure margins, so tick labels, axis labels
# and the title have room inside the figure (axes created with
# add_axes([0, 0, 1, 1]) span the entire canvas and leave none).
fig, ax = plt.subplots()
ax.bar(x, y)
ax.set_title('Life expectancy in the 10 countries with the highest GDP per capita')
ax.set_xlabel('Country code')
ax.set_ylabel(lifeString)
plt.show()
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
################# Deaths by cancer then vs now #################
df = pd.read_csv('/work/cancerDeathsThenVsNow/cancer-death-rates-by-type.csv')
# Column holding the breast cancer death rate.
colString = 'Deaths - Breast cancer - Sex: Both - Age: Age-standardized (Rate)'

# Data from 2019, specific countries only: entries without a code (areas) and
# the world entry ('OWID_WRL') are removed.
# dropna() returns a new frame instead of running in place on the .loc
# selection, which is what triggers pandas' SettingWithCopyWarning.
xByYear = df.loc[df['Year'] == 2019].dropna(subset=['Code'])
xByYear = xByYear[xByYear['Code'] != 'OWID_WRL']

# Data from 1990, filtered the same way.
yByYear = df.loc[df['Year'] == 1990].dropna(subset=['Code'])
yByYear = yByYear[yByYear['Code'] != 'OWID_WRL']

# Combine on code and entity. Columns present in both frames get the suffix
# '_x' (2019, left frame) or '_y' (1990, right frame).
result = pd.merge(xByYear, yByYear, on=['Code', 'Entity'])

# Sort by the breast cancer death rate in 1990, highest first.
resultSorted = result.sort_values(by=[colString + '_y'], ascending=False)

# Bar plot of the five countries with the highest rate in 1990, showing the
# 1990 value next to the 2019 value.
res = resultSorted.head(5)
ax = res.plot.bar(x='Code', y=[colString + '_y', colString + '_x'], xlabel='Country code', ylabel='Deaths per 100 000')
ax.set_title('Deaths - Breast cancer')
ax.legend(['1990', '2019'])
# Select the same countries as in the bar plot, but include all years.
# The codes are read from `res` (the rows shown in the bar plot) instead of
# being hard-coded, so both plots stay in sync if the data or ranking changes.
# Rows without a code can never match, so no separate dropna() is needed.
topCodes = res['Code']
allYears = df[df['Code'].isin(topCodes)]

# Reshape to one column per country code, indexed by year, and plot each
# country as a line.
allYears = allYears.pivot(index='Year', columns='Code', values=colString)
ax = allYears.plot(xlabel='Year', ylabel='Deaths per 100 000')
ax.set_title('Deaths - Breast cancer')
# Display the open figures when the file is run as a plain script.
plt.show()
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
################# Happiness vs Vacation #################
xDF = pd.read_csv('/work/Happiness vs vacation/days-of-vacation-and-holidays.csv')
yDF = pd.read_csv('/work/Happiness vs vacation/share-of-people-who-say-they-are-happy.csv')
# Names of the value column in each data set.
xString = 'Days of vacation and holidays for full-time production workers in non-agricultural activities (Huberman & Minns 2007)'
yString = 'Share of people who are happy (World Value Survey 2014)'

# Vacation data from 2000, without entries that have no code (areas).
# dropna() returns a new frame instead of running in place on the .loc
# selection, which is what triggers pandas' SettingWithCopyWarning.
xByYear = xDF.loc[xDF['Year'] == 2000].dropna(subset=['Code'])

# Happiness data from 1998.
yByYear = yDF.loc[yDF['Year'] == 1998]

# The two selections use different years, so they are combined on code and
# entity only.
result = pd.merge(xByYear, yByYear, on=['Code', 'Entity'])
# Keep only rows where both values are present: a NaN in either column would
# otherwise propagate into (or break) the least-squares fit below.
paired = result.dropna(subset=[xString, yString])

# Vacation days on the x-axis, happiness percentage on the y-axis.
x = paired[xString]
y = paired[yString]

# Scatter plot of the data.
plt.scatter(x, y)
plt.title('Happiness vs Vacation')
plt.xlabel("days")
plt.ylabel("percent")

# Fit a first-degree polynomial (slope m, intercept b) and draw it as the
# regression line on the same axes as the scatter plot.
m, b = np.polyfit(x, y, 1)
plt.plot(x, m*x + b)
# Display the figure when the file is run as a plain script.
plt.show()