import pandas as pd
import datetime as datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Apply seaborn's default theme to the matplotlib figures drawn after this point.
sns.set()
# Printed marker showing that the setup statements above ran to completion.
print('Setup completed ^_____^')
Setup completed ^_____^
# Load the per-manufacturer vaccination records and peek at both ends.
df = pd.read_csv('/content/country_vaccinations_by_manufacturer.csv')
df.head()
df.tail()

# Number of distinct values in each column.
df.nunique()

# Column dtypes before conversion; the 'date' column is the one of interest.
df.dtypes

# Parse the 'date' strings (YYYY-MM-DD) into pandas datetimes so the column
# is easier to work with in the time-series steps.
df = df.assign(date=pd.to_datetime(df['date'], format="%Y-%m-%d"))

# Column dtypes after the conversion.
df.dtypes

# Count the missing values in each column.
missing_values = df.isna().sum()
missing_values[:]

df.columns
df.dtypes
# Grouping keys are every entry but the last; the last entry is the summed column.
variables = ['date', 'vaccine', 'total_vaccinations']
*group_variables, outcome_variable = variables

# One row per (date, vaccine) pair holding the summed outcome column.
data = (
    df.groupby(group_variables)[outcome_variable]
    .sum()
    .reset_index()
)
data.head()
data.tail()

# Horizontal bar chart of the aggregated values for each vaccine.
sns.barplot(data=data, x='total_vaccinations', y='vaccine')

# Use the date as the index for the date-based selection that follows.
data = data.set_index('date')
data.head()
print(data.index)
DatetimeIndex(['2020-12-04', '2020-12-07', '2020-12-09', '2020-12-15',
'2020-12-16', '2020-12-17', '2020-12-18', '2020-12-20',
'2020-12-20', '2020-12-21',
...
'2021-12-20', '2021-12-20', '2021-12-20', '2021-12-21',
'2021-12-21', '2021-12-21', '2021-12-21', '2021-12-21',
'2021-12-21', '2021-12-21'],
dtype='datetime64[ns]', name='date', length=2749, freq=None)
# Get the observations of 2020.
# Selecting rows by a partial date string has to go through .loc:
# data['2020'] relied on a DataFrame[...] row-lookup shortcut that newer
# pandas versions no longer provide (the string is treated as a column
# label and raises KeyError).
print(data.loc['2020'].head())
print(data.loc['2020'].tail())
vaccine total_vaccinations
date
2020-12-04 Moderna 1
2020-12-07 Pfizer/BioNTech 1
2020-12-09 Pfizer/BioNTech 2
2020-12-15 Pfizer/BioNTech 3
2020-12-16 Pfizer/BioNTech 4
vaccine total_vaccinations
date
2020-12-31 Oxford/AstraZeneca 6
2020-12-31 Pfizer/BioNTech 568935
2020-12-31 Sinopharm/Beijing 0
2020-12-31 Sinovac 0
2020-12-31 Sputnik V 0
# Day of month for every entry of the DatetimeIndex.
print('Day:', data.index.day, '\n')

# ISO week number for every entry. DatetimeIndex.week is deprecated in
# favour of isocalendar().week, which returns a Series; it is wrapped back
# into an int64 Index carrying the original index name so the printed
# object stays an Index.
iso_week = data.index.isocalendar().week.astype('int64')
print('Week:', pd.Index(iso_week, name=data.index.name), '\n')

# Preview the rows with a day-of-week column (Monday=0, Sunday=6).
# assign() works on a copy, so `data` itself never carries the extra column
# and there is nothing to delete afterwards.
with_dayofweek = data.assign(dayofweek=data.index.dayofweek)
print(with_dayofweek.head())
print(with_dayofweek.tail())
Day: Int64Index([ 4, 7, 9, 15, 16, 17, 18, 20, 20, 21,
...
20, 20, 20, 21, 21, 21, 21, 21, 21, 21],
dtype='int64', name='date', length=2749)
Week: Int64Index([49, 50, 50, 51, 51, 51, 51, 51, 51, 52,
...
51, 51, 51, 51, 51, 51, 51, 51, 51, 51],
dtype='int64', name='date', length=2749)
vaccine total_vaccinations dayofweek
date
2020-12-04 Moderna 1 4
2020-12-07 Pfizer/BioNTech 1 0
2020-12-09 Pfizer/BioNTech 2 2
2020-12-15 Pfizer/BioNTech 3 1
2020-12-16 Pfizer/BioNTech 4 2
vaccine total_vaccinations dayofweek
date
2021-12-21 Oxford/AstraZeneca 69352175 1
2021-12-21 Pfizer/BioNTech 514601597 1
2021-12-21 Sinopharm/Beijing 2222929 1
2021-12-21 Sinovac 9 1
2021-12-21 Sputnik V 1845078 1
/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:2: FutureWarning: weekofyear and week have been deprecated, please use DatetimeIndex.isocalendar().week instead, which returns a Series. To exactly reproduce the behavior of week and weekofyear and return an Index, you may call pd.Int64Index(idx.isocalendar().week)
data.head()

# Move the index back into a regular column; 'date' is used as the pivot index below.
data = data.reset_index()

# Wide table: one row per date, one column per vaccine.
data_pivot = data.pivot(
    index='date',
    columns='vaccine',
    values='total_vaccinations',
)
data_pivot.head()
data_pivot.tail()

# Replace the NaN cells of the wide table with 0.
data_pivot = data_pivot.fillna(0)
data_pivot.head()
# Wide date x vaccine table built by unstacking the 'vaccine' index level;
# missing combinations are filled with 0.
vaccination_amount = data.set_index(['date', 'vaccine']).unstack('vaccine').fillna(0)

# Flatten the two-level column index down to the vaccine names.
# The labels are taken from the columns themselves (droplevel), so each name
# stays attached to its own column. `columns.levels[1]` is the level's sorted
# list of unique values and is not guaranteed to line up with the actual
# column order or length.
vaccination_amount.columns = vaccination_amount.columns.droplevel(0).rename(None)
vaccination_amount.head()

print(vaccination_amount.index)
print('\nUnique dates in our data: ', len(vaccination_amount.index.unique()), 'Days')
DatetimeIndex(['2020-12-04', '2020-12-07', '2020-12-09', '2020-12-15',
'2020-12-16', '2020-12-17', '2020-12-18', '2020-12-20',
'2020-12-21', '2020-12-22',
...
'2021-12-12', '2021-12-13', '2021-12-14', '2021-12-15',
'2021-12-16', '2021-12-17', '2021-12-18', '2021-12-19',
'2021-12-20', '2021-12-21'],
dtype='datetime64[ns]', name='date', length=374, freq=None)
Unique dates in our data: 374 Days
# Elapsed time between the earliest and the latest date in the index.
date_ranges = vaccination_amount.index.max() - vaccination_amount.index.min()

# Compare how many distinct dates are present with the length of the span in days.
print('\nUnique dates in our data: ', vaccination_amount.index.unique().size, 'Days')
print('Total days in our date range: ', date_ranges.days, 'Days')
Unique dates in our data: 374 Days
Total days in our date range: 382 Days
# Daily date range running from the first to the last date of the index.
new_index = pd.date_range(
    start=vaccination_amount.index.min(),
    end=vaccination_amount.index.max(),
    freq='D',
)
print(new_index)
DatetimeIndex(['2020-12-04', '2020-12-05', '2020-12-06', '2020-12-07',
'2020-12-08', '2020-12-09', '2020-12-10', '2020-12-11',
'2020-12-12', '2020-12-13',
...
'2021-12-12', '2021-12-13', '2021-12-14', '2021-12-15',
'2021-12-16', '2021-12-17', '2021-12-18', '2021-12-19',
'2021-12-20', '2021-12-21'],
dtype='datetime64[ns]', length=383, freq='D')
# Reindex onto new_index; labels absent from vaccination_amount are filled with 0.
new_vaccinations_amount = vaccination_amount.reindex(new_index, fill_value=0)
new_vaccinations_amount.index

# Sum the daily rows into weekly, monthly, quarterly and yearly buckets.
# NOTE(review): if total_vaccinations is a running total, summing it over a
# period counts the same doses repeatedly - confirm that .sum() is intended.
weekly_vaccinations = new_vaccinations_amount.resample('W').sum()
monthly_vacinations = new_vaccinations_amount.resample('M').sum()
quaterly_vaccinations = new_vaccinations_amount.resample('Q').sum()
annual_vaccinations = new_vaccinations_amount.resample('Y').sum()

# Print a heading followed by the first rows of each resampled table.
for heading, resampled in (
    ('Weekly vaccinations', weekly_vaccinations),
    ('Monthly Vaccinations', monthly_vacinations),
    ('Quarterly Vaccinations', quaterly_vaccinations),
    ('Annual Vaccinations', annual_vaccinations),
):
    print(heading)
    print(resampled.head(), '\n')
Weekly vaccinations
CanSino Johnson&Johnson ... Sinovac Sputnik V
2020-12-06 0.0 0.0 ... 0.0 0.0
2020-12-13 0.0 0.0 ... 0.0 0.0
2020-12-20 0.0 0.0 ... 0.0 0.0
2020-12-27 0.0 7.0 ... 0.0 0.0
2021-01-03 0.0 14.0 ... 0.0 0.0
[5 rows x 8 columns]
Monthly Vaccinations
CanSino Johnson&Johnson ... Sinovac Sputnik V
2020-12-31 0.0 11.0 ... 0.0 0.0
2021-01-31 0.0 2817.0 ... 3.0 0.0
2021-02-28 0.0 5755.0 ... 51501405.0 244236.0
2021-03-31 0.0 47071817.0 ... 217875017.0 3480789.0
2021-04-30 0.0 203658514.0 ... 389180038.0 27532209.0
[5 rows x 8 columns]
Quarterly Vaccinations
CanSino Johnson&Johnson ... Sinovac Sputnik V
2020-12-31 0.0 1.100000e+01 ... 0.000000e+00 0.0
2021-03-31 0.0 4.708039e+07 ... 2.693764e+08 3725025.0
2021-06-30 6478467.0 1.231991e+09 ... 1.597338e+09 142010121.0
2021-09-30 58460860.0 3.082204e+09 ... 3.259170e+09 192041676.0
2021-12-31 76269565.0 3.420388e+09 ... 3.837549e+09 172956564.0
[5 rows x 8 columns]
Annual Vaccinations
CanSino Johnson&Johnson ... Sinovac Sputnik V
2020-12-31 0.0 1.100000e+01 ... 0.000000e+00 0.0
2021-12-31 141208892.0 7.781663e+09 ... 8.963433e+09 510733386.0
[2 rows x 8 columns]
# Difference between each row and the previous one.
monthly_first_difference = monthly_vacinations.diff()
print('Monthly Vaccination, First Difference \n', monthly_first_difference.head())

# Relative change between each row and the previous one.
monthly_pct_change = monthly_vacinations.pct_change()
print('Monthly Vaccinations % Change \n', monthly_pct_change.head())

# Original columns side by side with their suffixed % change counterparts.
monthly_vacinations.join(monthly_pct_change.add_suffix('_%_Change')).head()
Monthly Vaccination, First Difference
CanSino Johnson&Johnson ... Sinovac Sputnik V
2020-12-31 NaN NaN ... NaN NaN
2021-01-31 0.0 2806.0 ... 3.0 0.0
2021-02-28 0.0 2938.0 ... 51501402.0 244236.0
2021-03-31 0.0 47066062.0 ... 166373612.0 3236553.0
2021-04-30 0.0 156586697.0 ... 171305021.0 24051420.0
[5 rows x 8 columns]
Monthly Vaccinations % Change
CanSino Johnson&Johnson ... Sinovac Sputnik V
2020-12-31 NaN NaN ... NaN NaN
2021-01-31 NaN 255.090909 ... inf NaN
2021-02-28 NaN 1.042953 ... 1.716713e+07 inf
2021-03-31 NaN 8178.290530 ... 3.230467e+00 13.251744
2021-04-30 NaN 3.326549 ... 7.862536e-01 6.909761
[5 rows x 8 columns]
# One line chart per resampled table, all drawn at the same figure size.
plotsize = (13, 5)
for frame, chart_title in (
    (quaterly_vaccinations, 'Quarterly Vaccinations'),
    (monthly_vacinations, 'Monthly Vaccinations'),
    (weekly_vaccinations, 'Weekly Vaccinations'),
):
    frame.plot(figsize=plotsize, title=chart_title)