DAT490 - Assignment 1

Question 1

Clean data and create a new csv file with relevant entries

import matplotlib.pyplot as plt import pandas as pd import numpy as np # read GDP per capita and life expectancy data df1 = pd.read_csv("/work/sample_data/gdp-per-capita-worldbank.csv") df2 = pd.read_csv("/work/sample_data/life-expectancy-at-birth-total-years.csv")

SELECT Entity, GDPPerCapita FROM df1 WHERE Year = 2019

SELECT Entity, LifeExpectancyAtBirth FROM df2 WHERE Year = 2019

# save the two new csv files df_1.to_csv('df1_2019.csv') df_2.to_csv('df2_2019.csv')

select one.Entity, one.GDPPerCapita, two.LifeExpectancyAtBirth from 'df1_2019.csv' as one left join 'df2_2019.csv' as two on (one.Entity = two.Entity)

df_3.to_csv('pre_data.csv')

SELECT * FROM 'pre_data.csv' EXCEPT SELECT * FROM 'pre_data.csv' WHERE Entity = 'Africa Eastern and Southern' or Entity = 'Africa Western and Central' or Entity = 'Arab World' or Entity = 'Caribbean Small States' or Entity = 'Central African Republic' or Entity = 'Central Europe and the Baltics' or Entity = 'Early-demographic dividend' or Entity = 'East Asia & Pacific' or Entity = 'East Asia & Pacific (IDA & IBRD)' or Entity = 'East Asia & Pacific (excluding high income)' or Entity = 'Euro area' or Entity = 'Europe & Central Asia' or Entity = 'Europe & Central Asia (IDA & IBRD)' or Entity = 'Europe & Central Asia (excluding high income)' or Entity = 'European Union' or Entity = 'Heavily indebted poor countries (HIPC)' or Entity = 'IBRD only' or Entity = 'IDA & IBRD total' or Entity = 'IDA blend' or Entity = 'IDA only' or Entity = 'IDA total' or Entity = 'Late-demographic dividend' or Entity = 'Latin America & Caribbean' or Entity = 'Latin America & Caribbean (IDA & IBRD)' or Entity = 'Latin America & Caribbean (excluding high income)' or Entity = 'Least developed countries: UN classification' or Entity = 'Low & middle income' or Entity = 'Lower middle income' or Entity = 'Middle East & North Africa' or Entity = 'Middle East & North Africa (IDA & IBRD)' or Entity = 'Middle East & North Africa (excluding high income)' or Entity = 'Middle income' or Entity = 'OECD members' OR Entity = 'World' OR Entity = 'Upper middle income' OR Entity = 'Sub-Saharan Africa (excluding high income)' OR Entity = 'Sub-Saharan Africa (IDA & IBRD)' OR Entity = 'Sub-Saharan Africa' OR Entity = 'South Asia (IDA & IBRD)' OR Entity = 'Small states' OR Entity = 'Pre-demographic dividend' OR Entity = 'Post-demographic dividend' OR Entity = 'Pacific island small states' OR Entity = 'Other small states' OR Entity = 'High income'

# saves the final csv file and reads it as a data frame. df_4.to_csv('data.csv') df = pd.read_csv('data.csv')

a)

df.plot(kind='scatter',x= 'GDPPerCapita', y= 'LifeExpectancyAtBirth') plt.title('BNP per capita vs Life expectancy at birth')

b)

c)

d)

lifeExpectancy_std = df['LifeExpectancyAtBirth'].std() lifeExpectancy_mean = df['LifeExpectancyAtBirth'].mean() gdpPerCapita_std = df['GDPPerCapita'].std() gdpPerCapita_mean = df['GDPPerCapita'].mean() oneOver = lifeExpectancy_mean + lifeExpectancy_std for ind in df.index: if df['LifeExpectancyAtBirth'][ind] > oneOver: print(df['Entity'][ind])

e)

for ind in df.index: if df['LifeExpectancyAtBirth'][ind] > lifeExpectancy_mean and df['GDPPerCapita'][ind] < gdpPerCapita_mean: print(df['Entity'][ind])

f)

for ind in df.index: if df['LifeExpectancyAtBirth'][ind] < lifeExpectancy_mean and df['GDPPerCapita'][ind] > gdpPerCapita_mean: print(df['Entity'][ind])

g)

SELECT * FROM 'data.csv' WHERE Entity = 'Sweden' or Entity = 'India'

Question 2

Clean data and create a new csv file

# read the two csv files with data df3 = pd.read_csv("/work/sample_data/happiness-cantril-ladder.csv") df4 = pd.read_csv("/work/sample_data/TI-corruption-perception-index.csv")

SELECT Entity, LifeSatisfaction FROM df3 WHERE Year = 2018

SELECT Entity, Corruption FROM df4 WHERE Year = 2018

# the two tables are saved as csv files df_5.to_csv('df3_2018.csv') df_6.to_csv('df4_2018.csv')

select one.Entity, one.LifeSatisfaction, two.Corruption from 'df3_2018.csv' as one left join 'df4_2018.csv' as two on (one.Entity = two.Entity)

a)

# save the final table as a new csv file and reads the file as a data frame df_7.to_csv('data2.csv') df2 = pd.read_csv('data2.csv') # scatter plot showing the correlation between percieved corruption and life satisfaction # y-axis: 0 = percieved as highly corrupt # x-axis: 0 = worst life satisfaction df2.plot(kind='scatter',x= 'LifeSatisfaction', y= 'Corruption') plt.title('Self-reported Life Satisfaction vs Corruption Perception')

df2['Corruption'].plot.kde() plt.title('Density of percieved corruption')

df2['LifeSatisfaction'].plot.kde() plt.title('Density of self-reported life satisfaction')

.css-15w88e5{color:var(--chakra-colors-fg-neutral-primary);font-weight:inherit;letter-spacing:-0.09px;}DAT490 - Assignment 1

Question 1

Clean data and create a new csv file with relevant entries

a)

b)

c)

d)

e)

f)

g)

Question 2

Clean data and create a new csv file

a)

b)

DAT490 - Assignment 1