# loading libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style='darkgrid')
# Loading datasets
df1 = pd.read_excel('DIGITAL_ECONOMY_SOCIETY-GLOBAL.xlsx')
df2 = pd.read_excel('HEALTH INDICATOR-GLOBAL.xlsx')
df3 = pd.read_excel('HEALTH STATUS- GLOBAL.xlsx')
# Removing columns (mostly single observation in these rows and others not necessary)
df1.drop(['Super Region', 'Source', 'Information society indicator','Sub-Sector', 'Time Level', 'Sub-Sector Level 2', 'Sub-Sector Level 3'], axis=1, inplace=True)
df2.drop(['Sub-Sector', 'Super Region', 'SOURCE ORGANIZATION', 'Indicator Name', 'Source', 'Special Notes', 'Region'], axis=1, inplace=True)
df3.drop(['Sub-Sector', 'Super Region', 'Measure', 'Frequency', 'Source', 'Indicator'], axis=1, inplace=True)
# Digital dataset has records for three years. I filtered only these years so we have common datasets
df2 = df2[df2['Date'].isin(list(df1['Date'].unique()))]
df3 = df3[df3['Date'].isin(list(df1['Date'].unique()))]
# Renamed the columns for merging
df1.rename(columns={'Geopolitical entity (reporting)': 'Country Name'}, inplace=True)
df3.rename(columns={'Region': 'Country Name'}, inplace=True)
# first merge of the column is based on Country Name. So we are working on Europeans countries only
# second merge of the column is on Date. So the observations correspond to same dates on both datasets
data = pd.merge(df1, df2, how='inner', on=['Country Name', 'Date'], sort=True, suffixes=('_digital', '_health'))
# Converted the date column to datetime object
# Also converted all the dates to year since they are all reports of end of the year
data['Date'] = pd.to_datetime(data['Date'])
data['Date'] = pd.DatetimeIndex(data['Date']).year
# A lot of columns have similar data on several rows. So i grouped and aggregated them for visualisation
data = data.groupby(['Country Name', 'Date', 'Individual type', 'Unit of measure', 'Income Group']).agg('mean')
# saved clean data locally
#data.to_csv('clean_data.csv')
data