#1
import pandas as pd
import matplotlib.pyplot as plt
# datasets
gdp_data = pd.read_csv('datasets/gdp_per_capita.csv', skiprows=4, usecols=lambda x: 'Unnamed' not in x)
malnutrition_data = pd.read_excel('datasets/malnutrition.xls', sheet_name='Data', skiprows=3)
gdp_metadata = pd.read_csv('datasets/gdp_metadata.csv', usecols=lambda x: 'Unnamed' not in x)
# set country name as an index
indexed_gdg_data = gdp_data.set_index('Country Name')
indexed_malnutrition_data = malnutrition_data.set_index('Country Name')
# malnutrition against gdp scatter plot
plt.figure(figsize=(10, 6))
plt.title("Malnutrition prevalence vs GPD per capita")
plt.xlabel("GDP per capita")
plt.ylabel("Malnutrition Prevalence")
for y in range(1980,2021):
plt.scatter(indexed_gdg_data[str(y)], indexed_malnutrition_data[str(y)])
plt.show()
# merging the data with metadata
df_metadata = gdp_metadata[['Country Code', 'Region', 'IncomeGroup']]
df_mal_merged = pd.merge(malnutrition_data, df_metadata, on='Country Code')
df_gdp_merged = pd.merge(gdp_data, df_metadata, on='Country Code')
# regions
df_mal_new1 = df_mal_merged.drop(['Indicator Name', 'Indicator Code', 'IncomeGroup', 'Country Code', 'Country Name'], axis=1)
df_mal_grouped_regions= df_mal_new1.groupby('Region')
df_regions = df_mal_merged['Region'].dropna().unique()
df_gdp_new1 = df_gdp_merged.drop(['Indicator Name', 'Indicator Code', 'IncomeGroup', 'Country Code', 'Country Name'], axis=1)
df_gdp_grouped_regions = df_gdp_new1.groupby('Region')
# regions scatter plot
plt.figure(figsize=(10, 6))
plt.title("Malnutrition prevalence vs GPD per capita")
plt.xlabel("GDP per capita")
plt.ylabel("Malnutrition Prevalence")
for r in df_regions:
if r != 'North America':
plt.scatter(df_gdp_grouped_regions.get_group(r).drop(['Region'], axis=1).values, df_mal_grouped_regions.get_group(r).drop(['Region'], axis=1).values)
plt.legend(title='Region', labels=df_regions)
plt.show()
# income levels
df_incomes = df_mal_merged['IncomeGroup'].dropna().unique()
df_mal_new2 = df_mal_merged.drop(['Indicator Name', 'Indicator Code', 'Region', 'Country Code', 'Country Name'], axis=1)
df_mal_grouped_incomes = df_mal_new2.groupby('IncomeGroup')
df_gdp_new2 = df_gdp_merged.drop(['Indicator Name', 'Indicator Code', 'Region', 'Country Code', 'Country Name'], axis=1)
df_gdp_grouped_incomes = df_gdp_new2.groupby('IncomeGroup')
# income levels scatter plot
plt.figure(figsize=(10, 6))
plt.title("Malnutrition prevalence vs GPD per capita")
plt.xlabel("GDP per capita")
plt.ylabel("Malnutrition Prevalence")
for inc in df_incomes:
plt.scatter(df_gdp_grouped_incomes.get_group(inc).drop(['IncomeGroup'], axis=1).values, df_mal_grouped_incomes.get_group(inc).drop(['IncomeGroup'], axis=1).values)
plt.legend(title='Income levels', labels=df_incomes)
plt.show()
# 2
import quandl
quandl.ApiConfig.api_key = 'o8AsPMeibEkFTbsJS3mP'
# datasets
wheat_price_data = quandl.get('ODA/PWHEAMT_USD', sort_order='desc')
crude_oil_price_data = quandl.get('WGEC/WLD_CRUDE_WTI', sort_order='desc')
gold_price_data = quandl.get('BUNDESBANK/BBK01_WT5511', sort_order='desc')
# merged data
df_merged_data_1 = pd.merge(wheat_price_data, crude_oil_price_data, on='Date', suffixes=('wheat', 'crude_oil')) #suffixes=('wheat', 'crude_oil')
df_merged_data_2 = pd.merge(df_merged_data_1, gold_price_data, on='Date')
# minimums
min_wheat_price = min(df_merged_data_2['Valuewheat'])
min_crude_oil_price = min(df_merged_data_2['Valuecrude_oil'])
min_gold_price = min(df_merged_data_2['Value'])
# maximums
max_wheat_price = max(df_merged_data_2['Valuewheat'])
max_crude_oil_price = max(df_merged_data_2['Valuecrude_oil'])
max_gold_price = max(df_merged_data_2['Value'])
# plot structure
plt.figure(figsize=(10, 6))
plt.title("Synchronized Wheat, Crude oil and Gold prices")
plt.xlabel("Years")
plt.ylabel("Price")
# plot of value against date
plt.plot(df_merged_data_2.index, df_merged_data_2['Valuewheat'], label='Wheat Price')
plt.plot(df_merged_data_2.index, df_merged_data_2['Valuecrude_oil'], label='Crude Oil Price')
plt.plot(df_merged_data_2.index, df_merged_data_2['Value'], label='Gold Price')
# # plot a dot of min values
plt.plot(df_merged_data_2['Valuewheat'].where(df_merged_data_2['Valuewheat'] == min_wheat_price), 'bo', label='min(wheat_price)')
plt.plot(df_merged_data_2['Valuecrude_oil'].where(df_merged_data_2['Valuecrude_oil'] == min_crude_oil_price), 'yo', label='min(crude_oil_price)')
plt.plot(df_merged_data_2['Value'].where(df_merged_data_2['Value'] == min_gold_price), 'go', label='min(gold_price)')
# # plot a dot of max values
plt.plot(df_merged_data_2['Valuewheat'].where(df_merged_data_2['Valuewheat'] == max_wheat_price), 'ro', label='max(wheat_price)')
plt.plot(df_merged_data_2['Valuecrude_oil'].where(df_merged_data_2['Valuecrude_oil'] == max_crude_oil_price), 'mo', label='max(crude_oil_price)')
plt.plot(df_merged_data_2['Value'].where(df_merged_data_2['Value'] == max_gold_price), 'co', label='max(gold_price)')
plt.legend()
plt.show()
# 3
# datasets
df_carbon_emmission = pd.read_csv('datasets/carbon_emmission.csv', skiprows=4)
df_enrolment_primary = pd.read_csv('datasets/enrolment_primary.csv', skiprows=4)
# 2010 data
carbon_emmission_2010 = df_carbon_emmission['2010']
enrolment_primary_2010 = df_enrolment_primary['2010']
# carbon emmission results
c_e_describe = carbon_emmission_2010.describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])
c_e_results = c_e_describe[['mean', 'std', '5%', '25%', '50%', '75%', '95%']]
new_ce_results = c_e_results.set_axis(['mean', 'std', '5th percentile', '25th percentile',\
'median', '75th percentile', '95th percentile'], inplace=False)
df_ce_results = pd.DataFrame({'RESULTS': new_ce_results})
df_ce_results = df_ce_results.reset_index()
# enrolment primary results
e_p_describe = enrolment_primary_2010.describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])
e_p_results = e_p_describe[['mean', 'std', '5%', '25%', '50%', '75%', '95%']]
new_ep_results = e_p_results.set_axis(['mean', 'std', '5th percentile', '25th percentile',\
'median', '75th percentile', '95th percentile'], inplace=False)
df_ep_results = pd.DataFrame({'RESULTS': new_ep_results})
df_ep_results = df_ep_results.reset_index()
# styling tables by adding captions
df_ce_style = df_ce_results.style.\
set_table_attributes("style='display:inline'").\
set_caption('CARBON EMISSION 2010')
df_ep_style = df_ep_results.\
style.set_table_attributes("style='display:inline'").\
set_caption('ENROLMENT PRIMARY 2010')
display(df_ce_style)
display(df_ep_style)
# 4
import numpy as np
fertility_rate_data = pd.read_csv('datasets/fertility_rate.csv', skiprows=4, skip_blank_lines=True)
gdp_per_capita_data = pd.read_csv('datasets/gdp_per_capita.csv', skiprows=4)
# set country name as an index
indexed_fertility_rate_data = fertility_rate_data.set_index('Country Name')
indexed_gdp_per_capita_data = gdp_per_capita_data.set_index('Country Name')
# 2010 data
df_fr_2010 = indexed_fertility_rate_data['2010']
df_gdp_2010 = indexed_gdp_per_capita_data['2010']
# scatter plot of 2010 data
plt.figure(figsize=(10,5))
plt.title('Fertility Rate vs GDP per Capita')
plt.xlabel('GPD/Capita 2010')
plt.ylabel('Fertility Rate 2010')
plt.scatter(df_gdp_2010, df_fr_2010)
plt.show()
# 1990 data
df_fr_1990 = indexed_fertility_rate_data['1990']
df_gdp_1990 = indexed_gdp_per_capita_data['1990']
# sorted fertility data
new_1990 = np.sort(df_fr_1990.values)
new_2010 = np.sort(df_fr_2010.values)
# cdf in 1990 and 2010
fertility_1990_cdf = np.arange(len(new_1990)) / len(new_1990)
fertility_2010_cdf = np.arange(len(new_2010)) / len(new_2010)
# cdf vs fertility rate plots
plt.figure(figsize=(10,5))
plt.title('CDF vs Fertility rate')
plt.xlabel('Fertility rate')
plt.ylabel('CDF')
plt.plot(new_1990, fertility_1990_cdf, label='Fertility Rate 1990 CDF')
plt.plot(new_2010, fertility_2010_cdf, label='Fertility Rate 2010 CDF')
# mean and median on lines
plt.axvline(x=df_fr_1990.mean(), color='g', linewidth=2, label='Mean Fertility Rate 1990')
plt.axvline(x=df_fr_1990.median(), color='r', linewidth=2, label='Median Fertility Rate 1990')
plt.axvline(x=df_fr_2010.mean(), color='y', linewidth=2, label='Mean Fertility Rate 2010')
plt.axvline(x=df_fr_2010.median(), color='b', linewidth=2, label='Median Fertility Rate 2010')
plt.legend()
plt.show()
# 5
# datasets
hpi_data = pd.read_excel('datasets/hpi-data-2016.xlsx', skiprows=5, sheet_name='Complete HPI data', usecols=lambda x: 'Unnamed' not in x)
cpi_data = pd.read_excel('datasets/CPI2016_Results.xlsx', usecols=lambda x: 'Unnamed' not in x)
# # set country name as an index
indexed_cpi_data = cpi_data.set_index('Country')
indexed_hpi_data = hpi_data.set_index('Country')
# merge indexed dataframes
df_merged_data = indexed_hpi_data.merge(indexed_cpi_data, left_index=True, right_index=True)
# scatter plot labelling
plt.figure(figsize=(16,12))
plt.title('CPI vs HPI /2016')
plt.xlabel('HPI')
plt.ylabel('CPI')
plt.scatter(df_merged_data['HPI Rank'], df_merged_data['Rank'])
# use country abbreviations to annotate points
for i in range(len(df_merged_data['WB Code'])) :
plt.annotate(df_merged_data['WB Code'].values[i], (df_merged_data['HPI Rank'].values[i], df_merged_data['Rank'].values[i]))
plt.show()