DAT490 - Assignment 1
Question 1
Clean data and create a new csv file with relevant entries
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# read GDP per capita and life expectancy data
df1 = pd.read_csv("/work/sample_data/gdp-per-capita-worldbank.csv")
df2 = pd.read_csv("/work/sample_data/life-expectancy-at-birth-total-years.csv")
SELECT Entity, GDPPerCapita
FROM df1
WHERE Year = 2019
SELECT Entity, LifeExpectancyAtBirth
FROM df2
WHERE Year = 2019
# save the two new csv files
df_1.to_csv('df1_2019.csv')
df_2.to_csv('df2_2019.csv')
select one.Entity, one.GDPPerCapita, two.LifeExpectancyAtBirth
from 'df1_2019.csv' as one
left join 'df2_2019.csv' as two on (one.Entity = two.Entity)
df_3.to_csv('pre_data.csv')
SELECT *
FROM 'pre_data.csv'
EXCEPT
SELECT *
FROM 'pre_data.csv'
WHERE Entity = 'Africa Eastern and Southern'
or Entity = 'Africa Western and Central'
or Entity = 'Arab World'
or Entity = 'Caribbean Small States'
or Entity = 'Central African Republic'
or Entity = 'Central Europe and the Baltics'
or Entity = 'Early-demographic dividend'
or Entity = 'East Asia & Pacific'
or Entity = 'East Asia & Pacific (IDA & IBRD)'
or Entity = 'East Asia & Pacific (excluding high income)'
or Entity = 'Euro area'
or Entity = 'Europe & Central Asia'
or Entity = 'Europe & Central Asia (IDA & IBRD)'
or Entity = 'Europe & Central Asia (excluding high income)'
or Entity = 'European Union'
or Entity = 'Heavily indebted poor countries (HIPC)'
or Entity = 'IBRD only'
or Entity = 'IDA & IBRD total'
or Entity = 'IDA blend'
or Entity = 'IDA only'
or Entity = 'IDA total'
or Entity = 'Late-demographic dividend'
or Entity = 'Latin America & Caribbean'
or Entity = 'Latin America & Caribbean (IDA & IBRD)'
or Entity = 'Latin America & Caribbean (excluding high income)'
or Entity = 'Least developed countries: UN classification'
or Entity = 'Low & middle income'
or Entity = 'Lower middle income'
or Entity = 'Middle East & North Africa'
or Entity = 'Middle East & North Africa (IDA & IBRD)'
or Entity = 'Middle East & North Africa (excluding high income)'
or Entity = 'Middle income'
or Entity = 'OECD members'
OR Entity = 'World'
OR Entity = 'Upper middle income'
OR Entity = 'Sub-Saharan Africa (excluding high income)'
OR Entity = 'Sub-Saharan Africa (IDA & IBRD)'
OR Entity = 'Sub-Saharan Africa'
OR Entity = 'South Asia (IDA & IBRD)'
OR Entity = 'Small states'
OR Entity = 'Pre-demographic dividend'
OR Entity = 'Post-demographic dividend'
OR Entity = 'Pacific island small states'
OR Entity = 'Other small states'
OR Entity = 'High income'
# saves the final csv file and reads it as a data frame.
df_4.to_csv('data.csv')
df = pd.read_csv('data.csv')
a)
df.plot(kind='scatter',x= 'GDPPerCapita', y= 'LifeExpectancyAtBirth')
plt.title('BNP per capita vs Life expectancy at birth')
b)
c)
d)
lifeExpectancy_std = df['LifeExpectancyAtBirth'].std()
lifeExpectancy_mean = df['LifeExpectancyAtBirth'].mean()
gdpPerCapita_std = df['GDPPerCapita'].std()
gdpPerCapita_mean = df['GDPPerCapita'].mean()
oneOver = lifeExpectancy_mean + lifeExpectancy_std
for ind in df.index:
if df['LifeExpectancyAtBirth'][ind] > oneOver:
print(df['Entity'][ind])
e)
for ind in df.index:
if df['LifeExpectancyAtBirth'][ind] > lifeExpectancy_mean and df['GDPPerCapita'][ind] < gdpPerCapita_mean:
print(df['Entity'][ind])
f)
for ind in df.index:
if df['LifeExpectancyAtBirth'][ind] < lifeExpectancy_mean and df['GDPPerCapita'][ind] > gdpPerCapita_mean:
print(df['Entity'][ind])
g)
SELECT *
FROM 'data.csv'
WHERE Entity = 'Sweden' or Entity = 'India'
Question 2
Clean data and create a new csv file
# read the two csv files with data
df3 = pd.read_csv("/work/sample_data/happiness-cantril-ladder.csv")
df4 = pd.read_csv("/work/sample_data/TI-corruption-perception-index.csv")
SELECT Entity, LifeSatisfaction
FROM df3
WHERE Year = 2018
SELECT Entity, Corruption
FROM df4
WHERE Year = 2018
# the two tables are saved as csv files
df_5.to_csv('df3_2018.csv')
df_6.to_csv('df4_2018.csv')
select one.Entity, one.LifeSatisfaction, two.Corruption
from 'df3_2018.csv' as one
left join 'df4_2018.csv' as two on (one.Entity = two.Entity)
a)
# save the final table as a new csv file and reads the file as a data frame
df_7.to_csv('data2.csv')
df2 = pd.read_csv('data2.csv')
# scatter plot showing the correlation between percieved corruption and life satisfaction
# y-axis: 0 = percieved as highly corrupt
# x-axis: 0 = worst life satisfaction
df2.plot(kind='scatter',x= 'LifeSatisfaction', y= 'Corruption')
plt.title('Self-reported Life Satisfaction vs Corruption Perception')
df2['Corruption'].plot.kde()
plt.title('Density of percieved corruption')
df2['LifeSatisfaction'].plot.kde()
plt.title('Density of self-reported life satisfaction')