About This Project
# Let's import the project's cover image
from PIL import Image
image = Image.open("/work/front_internet_worlwide.png")
image.show()
Set Up
# Import required libraries
import pandas as pd
import numpy as np
import plotly.express as px
import time
from datetime import datetime
!pip install gspread
!pip install gspread_dataframe
from gspread_dataframe import set_with_dataframe
import gspread
# Authenticate with a service account; note that the target spreadsheet must be shared with the service account's email
gc = gspread.service_account(filename='/work/world-wide-web-356017-564c77672260.json')
Extract
# Extract the first df and create the variable "prices".
# The list of default NA values is redefined to exclude the string 'NA', which happens to be Namibia's country code
prices = pd.read_csv('/work/cerebro/worldwide internet prices in 2022 - IN 2022.csv',keep_default_na=False, na_values=['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', ''])
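As a quick illustration of why redefining the NA list matters, here is a minimal, self-contained sketch (the inline CSV is invented for demonstration): with pandas' defaults, Namibia's country code 'NA' is silently parsed as a missing value.
import io
# Default parsing treats the string 'NA' as missing
demo = pd.read_csv(io.StringIO('Country code,Name\nNA,Namibia\nFR,France'))
print(demo['Country code'].isna().sum())  # 1 -> Namibia's code was lost
# keep_default_na=False preserves 'NA' as a real string
demo = pd.read_csv(io.StringIO('Country code,Name\nNA,Namibia\nFR,France'), keep_default_na=False)
print(demo['Country code'].isna().sum())  # 0 -> 'NA' survives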
# extract second df and create variable "speed"
speed = pd.read_csv('/work/cerebro/worldwide internet speed in 2022 - avg speed.csv')
# extract third df and create variable "users"
users = pd.read_csv('/work/cerebro/worldwide internet users - users.csv')
Transform
Prices
# This is our first DF
prices
# Checking prices columns, null values and data types
prices.info()
# Inspect the rows where the plan count is null
prices.loc[prices['NO. OF Internet Plans '].isnull()]
# Drop rows with null values for selected columns only
prices.dropna(subset=['NO. OF Internet Plans ','Average price of 1GB (USD)','Cheapest 1GB for 30 days (USD)',
'Most expensive 1GB (USD)','Average price of 1GB (USD at the start of 2021)',
'Average price of 1GB (USD – at start of 2020)'],inplace=True)
prices
# First, let's apply a function to delete the $ sign
cols = ['Average price of 1GB (USD)', 'Cheapest 1GB for 30 days (USD)', 'Most expensive 1GB (USD)','Average price of 1GB (USD at the start of 2021)','Average price of 1GB (USD – at start of 2020)']
prices[cols] = prices[cols].apply(lambda x: x.str.lstrip('$'))
# now let's convert these columns to numeric
prices[cols] = prices[cols].apply(pd.to_numeric, errors= 'coerce')
# Finally, the column names are a bit messy; let's improve them
prices.rename({'NO. OF Internet Plans ':'num_internet_plans','Average price of 1GB (USD)': 'avg_price_1gb','Cheapest 1GB for 30 days (USD)':'cheapest_1gb_30_days',
'Most expensive 1GB (USD)':'most_expensive_1gb','Average price of 1GB (USD at the start of 2021)':'avg_price_1gb_2021','Average price of 1GB (USD – at start of 2020)':'avg_price_1gb_2020'}, axis=1, inplace=True)
prices.head()
# As a good practice, let's get rid of whitespace at the beginning and end of all three columns
cols = ['Country code', 'Name', 'Continental region']
prices[cols] = prices[cols].apply(lambda x: x.str.strip())
# We can also improve the column names
prices.rename({'Country code':'country_code','Name': 'name','Continental region':'continental_region'}, axis=1, inplace=True)
prices['continental_region'].value_counts()
# Reduce continental_region column to 5 regions
conditions = [ (prices['continental_region'].isin(('SOUTH AMERICA', 'CENTRAL AMERICA','NORTHERN AMERICA','CARIBBEAN'))),
(prices['continental_region'].isin(('WESTERN EUROPE', 'EASTERN EUROPE', 'BALTICS'))),
(prices['continental_region'].isin(('SUB-SAHARAN AFRICA', 'NORTHERN AFRICA'))),
(prices['continental_region'].isin(('ASIA (EX. NEAR EAST)', 'NEAR EAST','CIS (FORMER USSR)'))),
(prices['continental_region'].str.contains('OCEANIA')) ]
regions = ['Americas', 'Europe', 'Africa','Asia','Oceania']
prices["continental_region"] = np.select(conditions, regions, default=np.nan)
# There seems to be a problem with a few European countries that appear as Asian. Let's correct this.
prices.loc[prices['name'] == 'Belarus','continental_region'] = 'Europe'
prices.loc[prices['name'] == 'Ukraine','continental_region'] = 'Europe'
prices.loc[prices['name'] == 'Cyprus','continental_region'] = 'Europe'
prices['continental_region'].value_counts()
prices.describe()
fig = px.histogram(prices, x='avg_price_1gb', title='Distribution_avg_price_1gb', marginal='box')
fig.show()
# Let's determine outliers for this distribution using a common formula to set minimum and maximum values: min_limit = q1 - 1.5*iqr and max_limit = q3 + 1.5*iqr
q1 = prices['avg_price_1gb'].quantile(0.25)
q3 = prices['avg_price_1gb'].quantile(0.75)
print('Quartile 1: ' + str(q1) + ', ' + 'Quartile 3: ' + str(q3))
# Interquartile range (IQR)
iqr = q3 - q1
print('Interquartile Range: ' + str(iqr))
# defining limits for outliers
min_limit_col1 = q1 - 1.5*iqr
max_limit_col1 = q3 + 1.5*iqr
print('Min limit: ' + str(min_limit_col1) + ', ' + 'Max limit: ' + str(max_limit_col1))
fig = px.histogram(prices, x='cheapest_1gb_30_days', title='Distribution_cheapest_1gb_30_days', marginal='box')
fig.show()
# Let's determine outliers for this distribution using a common formula to set minimum and maximum values: min_limit = q1 - 1.5*iqr and max_limit = q3 + 1.5*iqr
q1 = prices['cheapest_1gb_30_days'].quantile(0.25)
q3 = prices['cheapest_1gb_30_days'].quantile(0.75)
print('Quartile 1: ' + str(q1) + ', ' + 'Quartile 3: ' + str(q3))
# Interquartile range (IQR)
iqr = q3 - q1
print('Interquartile Range: ' + str(iqr))
# defining limits for outliers
min_limit_col2 = q1 - 1.5*iqr
max_limit_col2 = q3 + 1.5*iqr
print('Min limit: ' + str(min_limit_col2) + ', ' + 'Max limit: ' + str(max_limit_col2))
fig = px.histogram(prices, x='most_expensive_1gb', title='Distribution_most_expensive_1gb', marginal='box')
fig.show()
# Let's determine outliers for this distribution using a common formula to set minimum and maximum values: min_limit = q1 - 1.5*iqr and max_limit = q3 + 1.5*iqr
q1 = prices['most_expensive_1gb'].quantile(0.25)
q3 = prices['most_expensive_1gb'].quantile(0.75)
print('Quartile 1: ' + str(q1) + ', ' + 'Quartile 3: ' + str(q3))
# Interquartile range (IQR)
iqr = q3 - q1
print('Interquartile Range: ' + str(iqr))
# defining limits for outliers
min_limit_col3 = q1 - 1.5*iqr
max_limit_col3 = q3 + 1.5*iqr
print('Min limit: ' + str(min_limit_col3) + ', ' + 'Max limit: ' + str(max_limit_col3))
fig = px.histogram(prices, x='avg_price_1gb_2021', title='Distribution_avg_price_1gb_2021', marginal='box')
fig.show()
# Let's determine outliers for this distribution using a common formula to set minimum and maximum values: min_limit = q1 - 1.5*iqr and max_limit = q3 + 1.5*iqr
q1 = prices['avg_price_1gb_2021'].quantile(0.25)
q3 = prices['avg_price_1gb_2021'].quantile(0.75)
print('Quartile 1: ' + str(q1) + ', ' + 'Quartile 3: ' + str(q3))
# Interquartile range (IQR)
iqr = q3 - q1
print('Interquartile Range: ' + str(iqr))
# defining limits for outliers
min_limit_col4 = q1 - 1.5*iqr
max_limit_col4 = q3 + 1.5*iqr
print('Min limit: ' + str(min_limit_col4) + ', ' + 'Max limit: ' + str(max_limit_col4))
fig = px.histogram(prices, x='avg_price_1gb_2020', title='Distribution_avg_price_1gb_2020', marginal='box')
fig.show()
# Let's determine outliers for this distribution using a common formula to set minimum and maximum values: min_limit = q1 - 1.5*iqr and max_limit = q3 + 1.5*iqr
q1 = prices['avg_price_1gb_2020'].quantile(0.25)
q3 = prices['avg_price_1gb_2020'].quantile(0.75)
print('Quartile 1: ' + str(q1) + ', ' + 'Quartile 3: ' + str(q3))
# Interquartile range (IQR)
iqr = q3 - q1
print('Interquartile Range: ' + str(iqr))
# defining limits for outliers
min_limit_col5 = q1 - 1.5*iqr
max_limit_col5 = q3 + 1.5*iqr
print('Min limit: ' + str(min_limit_col5) + ', ' + 'Max limit: ' + str(max_limit_col5))
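Since the same quartile arithmetic is repeated for every price column, the five blocks above could be collapsed into a small helper. This is only a sketch; the function name and loop are mine, not part of the original pipeline:
def iqr_limits(series):
    # Tukey-fence outlier limits: (q1 - 1.5*IQR, q3 + 1.5*IQR)
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    return q1 - 1.5*iqr, q3 + 1.5*iqr

for col in ['avg_price_1gb', 'cheapest_1gb_30_days', 'most_expensive_1gb',
            'avg_price_1gb_2021', 'avg_price_1gb_2020']:
    lo, hi = iqr_limits(prices[col])
    print(col + ' -> Min limit: ' + str(lo) + ', Max limit: ' + str(hi))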
# Create a column with the variation between 2020 and 2021. We can build a function for that.
# Note that this returns a fraction (e.g. -0.5 for a 50% drop), not a value multiplied by 100
def percentage_change(col1, col2):
    return (col2 - col1) / col1
prices['%_change_2020_2021'] = round(percentage_change(prices['avg_price_1gb_2020'], prices['avg_price_1gb_2021']), 4)
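To make the semantics concrete, a quick scalar sanity check (values invented):
print(percentage_change(2.0, 1.0))  # -0.5 -> the price halved between the two years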
Speed
# Let's inspect our data frame
speed
# Checking null values and data types
speed.info()
speed.columns
# Renaming the column
speed.rename({'Avg \n(Mbit/s)Ookla':'avg_speed_mbit/s'}, axis=1, inplace=True)
speed.columns
# Let's look at the only row where the second column is null
speed.loc[speed['avg_speed_mbit/s'].isnull(),]
# Sorry Puerto Rico, but you need to go
# drop rows with null values for selected columns only
speed.dropna(subset=['avg_speed_mbit/s'],inplace=True)
# Confirm there are no null values left in our data set
speed.isna().sum()
# The speed values don't need decimals, so let's round them
speed['avg_speed_mbit/s'] = round(speed['avg_speed_mbit/s'],0)
# Finally, as a good practice, let's get rid of whitespace at the beginning and end
speed['Country'] = speed['Country'].str.strip()
# Setting up speed categories:
conditions = [
(speed['avg_speed_mbit/s'] <= 12),
(speed['avg_speed_mbit/s'] > 12) & (speed['avg_speed_mbit/s'] <= 25),
(speed['avg_speed_mbit/s'] > 25)
]
values = ['low_speed (12 or less)','medium_speed (12 to 25)','high_speed (over 25)']
speed['speed_category'] = np.select(conditions, values)
speed['speed_category'].value_counts()
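For what it's worth, pd.cut can express the same binning in a single call. A sketch using the same edges and labels; the speed_category_cut column name is mine, used so the original column is left untouched:
# The default right=True makes the bins (-inf, 12], (12, 25], (25, inf], matching the <= comparisons above
speed['speed_category_cut'] = pd.cut(
    speed['avg_speed_mbit/s'],
    bins=[-np.inf, 12, 25, np.inf],
    labels=['low_speed (12 or less)', 'medium_speed (12 to 25)', 'high_speed (over 25)']
)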
speed.shape
speed.nunique()
# A quick check shows that the country Lebanon is duplicated
speed.loc[speed['Country']=='Lebanon',]
# drop_duplicates keeps the first occurrence by default
speed.drop_duplicates(subset='Country', inplace=True)
speed.loc[speed['Country']=='Lebanon',]
Users
users
users.info()
# Let's now convert the numeric columns to numeric type, after a little trick needed for the conversion to work: removing the commas
users['Population'] = pd.to_numeric(users['Population'].str.replace(',',''),errors='coerce')
users['Internet users'] = pd.to_numeric(users['Internet users'].str.replace(',',''),errors='coerce')
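As an aside, pd.read_csv accepts a thousands parameter that would handle the commas at load time. Purely illustrative; the users_alt name is mine and the pipeline above keeps the str.replace approach:
# Alternative: let the parser strip thousands separators during the read
users_alt = pd.read_csv('/work/cerebro/worldwide internet users - users.csv', thousands=',')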
# Checking null values for the only column that has any.
users.loc[users['Population'].isnull(),]
# Filling the two missing population values (rows 98 and 197: Palestine and Jersey)
users.loc[[98, 197], 'Population'] = 4803000, 99000
users.loc[(users['Country or area'] == 'Palestine') | (users['Country or area'] == 'Jersey'),]
# As a good practice, let's get rid of whitespace at the beginning and end of all three columns
cols = ['Country or area', 'Subregion', 'Region']
users[cols] = users[cols].apply(lambda x: x.str.strip())
# We can also improve the column names
users.rename({'Country or area':'country','Region': 'continent','Internet users':'internet_users','Population':'population'}, axis=1, inplace=True)
# Drop the 'Subregion' column, as we'll only use the continent information
users.drop(['Subregion'],axis=1, inplace=True)
users['%_internet_users_vs_pop'] = users['internet_users'] / users['population']
users.head()
Load
# Load the data to Google Sheets
# The target spreadsheet's key
key = "1ta2iOwjKzIRu7CL60DqzXpVlYhvaTOz7618GmgF8Kts"
# Select the worksheets to export to
sheet = gc.open_by_key(key).worksheet('prices')
sheet1 = gc.open_by_key(key).worksheet('speed')
sheet2 = gc.open_by_key(key).worksheet('users')
# Clear each sheet, then export the data
sheet.batch_clear(["A1:L40000"])
set_with_dataframe(sheet, prices)
sheet1.batch_clear(["A1:L4000"])
set_with_dataframe(sheet1, speed)
sheet2.batch_clear(["A1:L4000"])
set_with_dataframe(sheet2, users)
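One possible refinement, assuming the installed gspread_dataframe version supports these parameters (they exist in recent releases): set_with_dataframe's resize flag fits the worksheet to the DataFrame's shape, which can replace the hard-coded batch_clear ranges. A hedged sketch:
# resize=True shrinks/grows the worksheet to match the DataFrame, so stale rows don't linger
set_with_dataframe(sheet, prices, include_index=False, resize=True)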