DATAnalyze

# loading libraries import pandas as pd import matplotlib.pyplot as plt import seaborn as sns sns.set_theme(style='darkgrid')

# Loading datasets df1 = pd.read_excel('DIGITAL_ECONOMY_SOCIETY-GLOBAL.xlsx') df2 = pd.read_excel('HEALTH INDICATOR-GLOBAL.xlsx') df3 = pd.read_excel('HEALTH STATUS- GLOBAL.xlsx')

# Removing columns (mostly single observation in these rows and others not necessary) df1.drop(['Super Region', 'Source', 'Information society indicator','Sub-Sector', 'Time Level', 'Sub-Sector Level 2', 'Sub-Sector Level 3'], axis=1, inplace=True) df2.drop(['Sub-Sector', 'Super Region', 'SOURCE ORGANIZATION', 'Indicator Name', 'Source', 'Special Notes', 'Region'], axis=1, inplace=True) df3.drop(['Sub-Sector', 'Super Region', 'Measure', 'Frequency', 'Source', 'Indicator'], axis=1, inplace=True)

# Digital dataset has records for three years. I filtered only these years so we have common datasets df2 = df2[df2['Date'].isin(list(df1['Date'].unique()))] df3 = df3[df3['Date'].isin(list(df1['Date'].unique()))]

# Renamed the columns for merging df1.rename(columns={'Geopolitical entity (reporting)': 'Country Name'}, inplace=True) df3.rename(columns={'Region': 'Country Name'}, inplace=True)

# first merge of the column is based on Country Name. So we are working on Europeans countries only # second merge of the column is on Date. So the observations correspond to same dates on both datasets data = pd.merge(df1, df2, how='inner', on=['Country Name', 'Date'], sort=True, suffixes=('_digital', '_health'))

# Converted the date column to datetime object # Also converted all the dates to year since they are all reports of end of the year data['Date'] = pd.to_datetime(data['Date']) data['Date'] = pd.DatetimeIndex(data['Date']).year

# A lot of columns have similar data on several rows. So i grouped and aggregated them for visualisation data = data.groupby(['Country Name', 'Date', 'Individual type', 'Unit of measure', 'Income Group']).agg('mean')

# saved clean data locally #data.to_csv('clean_data.csv')

data