About This Project
# Let's import the project's cover image
from PIL import Image
image = Image.open("/work/front_internet_worlwide.png")
image.show()
Set Up
# Import required libraries
import pandas as pd
import numpy as np
import plotly.express as px
import time
from datetime import datetime
!pip install gspread
!pip install gspread_dataframe
from gspread_dataframe import set_with_dataframe
import gspread
# Authenticate with a service account; note that the target spreadsheet must be shared with the service account's email
gc = gspread.service_account(filename='/work/world-wide-web-356017-564c77672260.json')
Extract
# Extract the first df and create the variable "prices".
# The list of default NA values is redefined to exclude the string 'NA', which happens to be Namibia's country code
prices = pd.read_csv('/work/cerebro/worldwide internet prices in 2022 - IN 2022.csv',keep_default_na=False, na_values=['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', ''])
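As a quick illustration of why redefining the NA list matters, here is a minimal, self-contained sketch (the inline CSV is invented for demonstration): with pandas' defaults, Namibia's country code 'NA' is silently parsed as a missing value.
import io
# Default parsing treats the string 'NA' as missing
demo = pd.read_csv(io.StringIO('Country code,Name\nNA,Namibia\nFR,France'))
print(demo['Country code'].isna().sum())  # 1 -> Namibia's code was lost
# keep_default_na=False preserves 'NA' as a real string
demo = pd.read_csv(io.StringIO('Country code,Name\nNA,Namibia\nFR,France'), keep_default_na=False)
print(demo['Country code'].isna().sum())  # 0 -> 'NA' survives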
# extract second df and create variable "speed"
speed = pd.read_csv('/work/cerebro/worldwide internet speed in 2022 - avg speed.csv')
# extract third df and create variable "users"
users = pd.read_csv('/work/cerebro/worldwide internet users - users.csv')
Transform
Prices
# This is our first DF
prices
# Checking prices columns, null values and data types
prices.info()
# Inspect the rows where the plan count is null
prices.loc[prices['NO. OF Internet Plans '].isnull()]
# Drop rows with null values for selected columns only
prices.dropna(subset=['NO. OF Internet Plans ','Average price of 1GB (USD)','Cheapest 1GB for 30 days (USD)',
'Most expensive 1GB (USD)','Average price of 1GB (USD at the start of 2021)',
'Average price of 1GB (USD – at start of 2020)'],inplace=True)
prices
# First, let's apply a function to delete the $ sign
cols = ['Average price of 1GB (USD)', 'Cheapest 1GB for 30 days (USD)', 'Most expensive 1GB (USD)','Average price of 1GB (USD at the start of 2021)','Average price of 1GB (USD – at start of 2020)']
prices[cols] = prices[cols].apply(lambda x: x.str.lstrip('$'))
# now let's convert these columns to numeric
prices[cols] = prices[cols].apply(pd.to_numeric, errors= 'coerce')
# Finally, the column names are a bit messy; let's improve them
prices.rename({'NO. OF Internet Plans ':'num_internet_plans','Average price of 1GB (USD)': 'avg_price_1gb','Cheapest 1GB for 30 days (USD)':'cheapest_1gb_30_days',
'Most expensive 1GB (USD)':'most_expensive_1gb','Average price of 1GB (USD at the start of 2021)':'avg_price_1gb_2021','Average price of 1GB (USD – at start of 2020)':'avg_price_1gb_2020'}, axis=1, inplace=True)
prices.head()
# As a good practice, let's get rid of whitespace at the beginning and end of all three columns
cols = ['Country code', 'Name', 'Continental region']
prices[cols] = prices[cols].apply(lambda x: x.str.strip())
# We can also improve the column names
prices.rename({'Country code':'country_code','Name': 'name','Continental region':'continental_region'}, axis=1, inplace=True)
prices['continental_region'].value_counts()
# Reduce continental_region column to 5 regions
conditions = [ (prices['continental_region'].isin(('SOUTH AMERICA', 'CENTRAL AMERICA','NORTHERN AMERICA','CARIBBEAN'))),
(prices['continental_region'].isin(('WESTERN EUROPE', 'EASTERN EUROPE', 'BALTICS'))),
(prices['continental_region'].isin(('SUB-SAHARAN AFRICA', 'NORTHERN AFRICA'))),
(prices['continental_region'].isin(('ASIA (EX. NEAR EAST)', 'NEAR EAST','CIS (FORMER USSR)'))),
(prices['continental_region'].str.contains('OCEANIA')) ]
regions = ['Americas', 'Europe', 'Africa','Asia','Oceania']
prices["continental_region"] = np.select(conditions, regions, default=np.nan)
# There seems to be a problem with a few European countries that appear as Asian. Let's correct this.
prices.loc[prices['name'] == 'Belarus','continental_region'] = 'Europe'
prices.loc[prices['name'] == 'Ukraine','continental_region'] = 'Europe'
prices.loc[prices['name'] == 'Cyprus','continental_region'] = 'Europe'
prices['continental_region'].value_counts()
prices.describe()
fig = px.histogram(prices, x='avg_price_1gb', title='Distribution_avg_price_1gb', marginal='box')
fig.show()
# Let's determine outliers for this distribution using a common formula to set minimum and maximum values: min_limit = q1 - 1.5*iqr and max_limit = q3 + 1.5*iqr
q1 = prices['avg_price_1gb'].quantile(0.25)
q3 = prices['avg_price_1gb'].quantile(0.75)
print('Quartile 1: ' + str(q1) + ', ' + 'Quartile 3: ' + str(q3))
# Interquartile range (IQR)
iqr = q3 - q1
print('Interquartile Range: ' + str(iqr))
# defining limits for outliers
min_limit_col1 = q1 - 1.5*iqr
max_limit_col1 = q3 + 1.5*iqr
print('Min limit: ' + str(min_limit_col1) + ', ' + 'Max limit: ' + str(max_limit_col1))
fig = px.histogram(prices, x='cheapest_1gb_30_days', title='Distribution_cheapest_1gb_30_days', marginal='box')
fig.show()
# Let's determine outliers for this distribution using a common formula to set minimum and maximum values: min_limit = q1 - 1.5*iqr and max_limit = q3 + 1.5*iqr
q1 = prices['cheapest_1gb_30_days'].quantile(0.25)
q3 = prices['cheapest_1gb_30_days'].quantile(0.75)
print('Quartile 1: ' + str(q1) + ', ' + 'Quartile 3: ' + str(q3))
# Interquartile range (IQR)
iqr = q3 - q1
print('Interquartile Range: ' + str(iqr))
# defining limits for outliers
min_limit_col2 = q1 - 1.5*iqr
max_limit_col2 = q3 + 1.5*iqr
print('Min limit: ' + str(min_limit_col2) + ', ' + 'Max limit: ' + str(max_limit_col2))
fig = px.histogram(prices, x='most_expensive_1gb', title='Distribution_most_expensive_1gb', marginal='box')
fig.show()
# Let's determine outliers for this distribution using a common formula to set minimum and maximum values: min_limit = q1 - 1.5*iqr and max_limit = q3 + 1.5*iqr
q1 = prices['most_expensive_1gb'].quantile(0.25)
q3 = prices['most_expensive_1gb'].quantile(0.75)
print('Quartile 1: ' + str(q1) + ', ' + 'Quartile 3: ' + str(q3))
# Interquartile range (IQR)
iqr = q3 - q1
print('Interquartile Range: ' + str(iqr))
# defining limits for outliers
min_limit_col3 = q1 - 1.5*iqr
max_limit_col3 = q3 + 1.5*iqr
print('Min limit: ' + str(min_limit_col3) + ', ' + 'Max limit: ' + str(max_limit_col3))
fig = px.histogram(prices, x='avg_price_1gb_2021', title='Distribution_avg_price_1gb_2021', marginal='box')
fig.show()
# Let's determine outliers for this distribution using a common formula to set minimum and maximum values: min_limit = q1 - 1.5*iqr and max_limit = q3 + 1.5*iqr
q1 = prices['avg_price_1gb_2021'].quantile(0.25)
q3 = prices['avg_price_1gb_2021'].quantile(0.75)
print('Quartile 1: ' + str(q1) + ', ' + 'Quartile 3: ' + str(q3))
# Interquartile range (IQR)
iqr = q3 - q1
print('Interquartile Range: ' + str(iqr))
# defining limits for outliers
min_limit_col4 = q1 - 1.5*iqr
max_limit_col4 = q3 + 1.5*iqr
print('Min limit: ' + str(min_limit_col4) + ', ' + 'Max limit: ' + str(max_limit_col4))
fig = px.histogram(prices, x='avg_price_1gb_2020', title='Distribution_avg_price_1gb_2020', marginal='box')
fig.show()
# Let's determine outliers for this distribution using a common formula to set minimum and maximum values: min_limit = q1 - 1.5*iqr and max_limit = q3 + 1.5*iqr
q1 = prices['avg_price_1gb_2020'].quantile(0.25)
q3 = prices['avg_price_1gb_2020'].quantile(0.75)
print('Quartile 1: ' + str(q1) + ', ' + 'Quartile 3: ' + str(q3))
# Interquartile range (IQR)
iqr = q3 - q1
print('Interquartile Range: ' + str(iqr))
# defining limits for outliers
min_limit_col5 = q1 - 1.5*iqr
max_limit_col5 = q3 + 1.5*iqr
print('Min limit: ' + str(min_limit_col5) + ', ' + 'Max limit: ' + str(max_limit_col5))
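Since the same quartile arithmetic is repeated for every price column, the five blocks above could be collapsed into a small helper. This is only a sketch; the function name and loop are mine, not part of the original pipeline:
def iqr_limits(series):
    # Tukey-fence outlier limits: (q1 - 1.5*IQR, q3 + 1.5*IQR)
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    return q1 - 1.5*iqr, q3 + 1.5*iqr

for col in ['avg_price_1gb', 'cheapest_1gb_30_days', 'most_expensive_1gb',
            'avg_price_1gb_2021', 'avg_price_1gb_2020']:
    lo, hi = iqr_limits(prices[col])
    print(col + ' -> Min limit: ' + str(lo) + ', Max limit: ' + str(hi))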
# Create a column with the variation between 2020 and 2021. We can build a function for that.
# Note that this returns a fraction (e.g. -0.5 for a 50% drop), not a value multiplied by 100
def percentage_change(col1, col2):
    return (col2 - col1) / col1
prices['%_change_2020_2021'] = round(percentage_change(prices['avg_price_1gb_2020'], prices['avg_price_1gb_2021']), 4)
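To make the semantics concrete, a quick scalar sanity check (values invented):
print(percentage_change(2.0, 1.0))  # -0.5 -> the price halved between the two years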
Speed
# Let's inspect our data frame
speed
# Checking null values and data types
speed.info()
speed.columns
# Renaming the column
speed.rename({'Avg \n(Mbit/s)Ookla':'avg_speed_mbit/s'}, axis=1, inplace=True)
speed.columns
# Let's look at the only row where the second column is null
speed.loc[speed['avg_speed_mbit/s'].isnull(),]
# Sorry Puerto Rico, but you need to go
# drop rows with null values for selected columns only
speed.dropna(subset=['avg_speed_mbit/s'],inplace=True)
# Confirm there are no null values left in our data set
speed.isna().sum()
# The speed values don't need decimals, so let's round them
speed['avg_speed_mbit/s'] = round(speed['avg_speed_mbit/s'],0)
# Finally, as a good practice, let's get rid of whitespace at the beginning and end
speed['Country'] = speed['Country'].str.strip()
# Setting up speed categories:
conditions = [
(speed['avg_speed_mbit/s'] <= 12),
(speed['avg_speed_mbit/s'] > 12) & (speed['avg_speed_mbit/s'] <= 25),
(speed['avg_speed_mbit/s'] > 25)
]
values = ['low_speed (12 or less)','medium_speed (12 to 25)','high_speed (over 25)']
speed['speed_category'] = np.select(conditions, values)
speed['speed_category'].value_counts()
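For what it's worth, pd.cut can express the same binning in a single call. A sketch using the same edges and labels; the speed_category_cut column name is mine, used so the original column is left untouched:
# The default right=True makes the bins (-inf, 12], (12, 25], (25, inf], matching the <= comparisons above
speed['speed_category_cut'] = pd.cut(
    speed['avg_speed_mbit/s'],
    bins=[-np.inf, 12, 25, np.inf],
    labels=['low_speed (12 or less)', 'medium_speed (12 to 25)', 'high_speed (over 25)']
)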
speed.shape
speed.nunique()
# A quick check shows that the country Lebanon is duplicated
speed.loc[speed['Country']=='Lebanon',]
# drop_duplicates keeps the first occurrence by default
speed.drop_duplicates(subset='Country', inplace=True)
speed.loc[speed['Country']=='Lebanon',]
Users
users
users.info()
# Let's now convert the numeric columns to numeric type, after a little trick needed for the conversion to work: removing the commas
users['Population'] = pd.to_numeric(users['Population'].str.replace(',',''),errors='coerce')
users['Internet users'] = pd.to_numeric(users['Internet users'].str.replace(',',''),errors='coerce')
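As an aside, pd.read_csv accepts a thousands parameter that would handle the commas at load time. Purely illustrative; the users_alt name is mine and the pipeline above keeps the str.replace approach:
# Alternative: let the parser strip thousands separators during the read
users_alt = pd.read_csv('/work/cerebro/worldwide internet users - users.csv', thousands=',')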
# Checking null values for the only column that has any.
users.loc[users['Population'].isnull(),]
# Filling the two missing population values (rows 98 and 197: Palestine and Jersey)
users.loc[[98, 197], 'Population'] = 4803000, 99000
users.loc[(users['Country or area'] == 'Palestine') | (users['Country or area'] == 'Jersey'),]
# As a good practice, let's get rid of whitespace at the beginning and end of all three columns
cols = ['Country or area', 'Subregion', 'Region']
users[cols] = users[cols].apply(lambda x: x.str.strip())
# We can also improve the column names
users.rename({'Country or area':'country','Region': 'continent','Internet users':'internet_users','Population':'population'}, axis=1, inplace=True)
# Drop the 'Subregion' column, as we'll only use the continent information
users.drop(['Subregion'],axis=1, inplace=True)
users['%_internet_users_vs_pop'] = users['internet_users'] / users['population']
users.head()
Load
# Load the data to Google Sheets
# The target spreadsheet's key
key = "1ta2iOwjKzIRu7CL60DqzXpVlYhvaTOz7618GmgF8Kts"
# Select the worksheets to export to
sheet = gc.open_by_key(key).worksheet('prices')
sheet1 = gc.open_by_key(key).worksheet('speed')
sheet2 = gc.open_by_key(key).worksheet('users')
# Clear each sheet, then export the data
sheet.batch_clear(["A1:L40000"])
set_with_dataframe(sheet, prices)
sheet1.batch_clear(["A1:L4000"])
set_with_dataframe(sheet1, speed)
sheet2.batch_clear(["A1:L4000"])
set_with_dataframe(sheet2, users)
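One possible refinement, assuming the installed gspread_dataframe version supports these parameters (they exist in recent releases): set_with_dataframe's resize flag fits the worksheet to the DataFrame's shape, which can replace the hard-coded batch_clear ranges. A hedged sketch:
# resize=True shrinks/grows the worksheet to match the DataFrame, so stale rows don't linger
set_with_dataframe(sheet, prices, include_index=False, resize=True)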