import pandas as pd
import datetime as datetime
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Notebook setup: enable inline matplotlib rendering when running under
# IPython/Jupyter, apply seaborn's default theme, and confirm the setup ran.
try:
    # `%matplotlib inline` is IPython-only syntax and makes the file unparsable
    # as plain Python; this call is the equivalent programmatic form.
    get_ipython().run_line_magic('matplotlib', 'inline')
except NameError:
    # `get_ipython` only exists inside an IPython session; outside of one
    # there is no inline backend to enable, so skip it deliberately.
    pass
sns.set()
print('Setup completed ^_____^')
# Load the per-manufacturer vaccination CSV and take a first look at it.
csv_path = '/content/country_vaccinations_by_manufacturer.csv'
df = pd.read_csv(csv_path)
df.head()
df.tail()

# Number of distinct values in each column.
df.nunique()

# Column dtypes before the conversion below (the 'date' column is the one of interest).
df.dtypes

# Parse the 'date' strings (YYYY-MM-DD) into pandas datetimes so the column
# can later serve as a time-series index.
df['date'] = pd.to_datetime(df['date'], format="%Y-%m-%d")

# Column dtypes after the conversion.
df.dtypes

# Count of missing values per column.
missing_values = df.isna().sum()
missing_values[:]

df.columns
df.dtypes
# Columns used for the aggregation: the leading entries are the grouping keys,
# the last entry is the value being summed.
variables = ['date', 'vaccine', 'total_vaccinations']
*group_variables, outcome_variable = variables

# Sum the outcome over every (date, vaccine) pair and flatten the result back
# into ordinary columns.
grouped_totals = df.groupby(group_variables)[outcome_variable].sum()
data = grouped_totals.reset_index()
data.head()
data.tail()

# Horizontal bar chart of total_vaccinations for each vaccine.
sns.barplot(data=data, x='total_vaccinations', y='vaccine')
# Explore the datetime index: use 'date' as the index, slice one year with a
# partial date string, and read calendar components off the index.
data.set_index('date', inplace=True)
data.head()
print(data.index)

# Get the observations of 2020.
# Row selection by partial date string must go through .loc: the plain
# `data['2020']` form was deprecated and then removed in pandas 2.0 (there,
# `[]` with a string looks up a column).
observations_2020 = data.loc['2020']
print(observations_2020.head())
print(observations_2020.tail())

print('Day:', data.index.day, '\n')
# DatetimeIndex.week was removed in pandas 2.0; isocalendar().week is its
# replacement (ISO 8601 week number).
print('Week:', data.index.isocalendar().week, '\n')

# Temporarily add the day of week (Monday=0, Sunday=6) to inspect it, then
# remove it again so `data` keeps its original columns.
data['dayofweek'] = data.index.dayofweek
print(data.head())
print(data.tail())
del data['dayofweek']
data.head()
# Turn 'date' back into a regular column so it can be used as the pivot index.
data.reset_index(inplace=True)

# Wide format: one row per date, one column per vaccine, cells holding
# total_vaccinations.
data_pivot = data.pivot(
    index='date',
    columns='vaccine',
    values='total_vaccinations',
)
data_pivot.head()
data_pivot.tail()

# (date, vaccine) combinations with no row come out of the pivot as NaN;
# replace them with 0.
data_pivot = data_pivot.fillna(0)
data_pivot.head()
# Build the same wide (date x vaccine) table via set_index/unstack, then
# re-index it onto a gap-free daily calendar.
vaccination_amount = data.set_index(['date', 'vaccine']).unstack('vaccine').fillna(0)
# unstack() leaves two-level column labels (value column, vaccine). Drop the
# outer level so each column keeps its own vaccine label, and clear the axis
# name. This is positional, unlike `levels[1]`, which holds the level's unique
# values rather than the per-column labels.
vaccination_amount.columns = vaccination_amount.columns.droplevel(0).rename(None)
vaccination_amount.head()
print(vaccination_amount.index)

# Compare the number of distinct dates present with the span of the range to
# see whether dates are missing.
print('\nUnique dates in our data: ', len(vaccination_amount.index.unique()), 'Days')
date_ranges = vaccination_amount.index.max() - vaccination_amount.index.min()
# Calculate number of days in date range. This is the span (max - min); the
# inclusive daily calendar built below has one more entry than this.
print('Total days in our date range: ', date_ranges.days, 'Days')

# Daily calendar from the first to the last observed date; dates absent from
# the data are added as rows filled with 0.
new_index = pd.date_range(vaccination_amount.index.min(), vaccination_amount.index.max())
print(new_index)
new_vaccinations_amount = vaccination_amount.reindex(new_index, fill_value=0)
new_vaccinations_amount.index
# Aggregate the daily table to coarser periods by summing within each period.
# NOTE(review): if total_vaccinations is a cumulative count (the name suggests
# it may be), summing it over a period double-counts - confirm against the
# data source.
weekly_vaccinations = new_vaccinations_amount.resample('W').sum()
monthly_vacinations = new_vaccinations_amount.resample('M').sum()
quaterly_vaccinations = new_vaccinations_amount.resample('Q').sum()
annual_vaccinations = new_vaccinations_amount.resample('Y').sum()

# Print the first rows of each aggregate under its heading.
for _heading, _frame in (
    ('Weekly vaccinations', weekly_vaccinations),
    ('Monthly Vaccinations', monthly_vacinations),
    ('Quarterly Vaccinations', quaterly_vaccinations),
    ('Annual Vaccinations', annual_vaccinations),
):
    print(_heading)
    print(_frame.head(), '\n')
# Month-over-month change of the monthly aggregates.

# First difference: each month minus the previous month.
monthly_diff = monthly_vacinations.diff()
print('Monthly Vaccination, First Difference \n', monthly_diff.head())

# Percentage change relative to the previous month.
monthly_pct_change = monthly_vacinations.pct_change()
print('Monthly Vaccinations % Change \n', monthly_pct_change.head())

# Show the monthly values side by side with their percentage change; the
# change columns carry a '_%_Change' suffix to keep the names distinct.
monthly_vacinations.join(monthly_pct_change.add_suffix('_%_Change')).head()
# Line plots of the aggregated series, one figure per period, from the
# coarsest period to the finest, all at the same figure size.
plotsize = (13, 5)
for _frame, _title in (
    (quaterly_vaccinations, 'Quarterly Vaccinations'),
    (monthly_vacinations, 'Monthly Vaccinations'),
    (weekly_vaccinations, 'Weekly Vaccinations'),
):
    _frame.plot(figsize=plotsize, title=_title)