import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import ttest_ind
# from scipy.stats import f_oneway
# from arch.unitroot import VarianceRatio
# from sklearn.model_selection import train_test_split
sns.set_theme(style="darkgrid")
# loading the data
grid_data = pd.read_csv('dataset/EirGridSystemDemand2014.csv')
# shape
grid_data.shape
# first 5 rows
grid_data.head()
# last 5 rows
grid_data.tail()
# structure information of the data
grid_data.info()
# checking what's wrong with 'Demand'
grid_data['Demand'].isnull().sum()
# checking the type of object types
grid_data.select_dtypes(include=[np.datetime64])
grid_data.select_dtypes(include=[np.number])
grid_data.describe()
# turning 'Date' column to datetime64
grid_data['Date'] = pd.to_datetime(grid_data['Date'])
grid_data['Date'].dtype
# turning 'Time' column to timedelta64
grid_data['Time'] = pd.to_timedelta(grid_data['Time'] + ':00')
# combining date and time with pop()
grid_data['Datetime'] = grid_data.pop('Date') + grid_data.pop('Time')
# grid_data['Datetime'] = grid_data['Date'] + grid_data['Time']
grid_data.info()
# looking for outliers
plt.figure(figsize=(12, 6))
sns.boxplot(grid_data['Demand'])
# relationship between 'Demand' and 'Datetime'
plt.figure(figsize=(12, 6))
sns.scatterplot(grid_data['Datetime'], grid_data['Demand'], label='Demand vs Datetime').set_title('Relationship between Datetime and Demand')
# grid_data_og = pd.read_excel('dataset/System-Data-Qtr-Hourly-2014-2015.xlsx')
# to csv
# grid_data_og.to_csv('dataset/System-Data-Qtr-Hourly-2014-2015.csv', index=False)
# read the original data
grid_data_og = pd.read_csv('dataset/System-Data-Qtr-Hourly-2014-2015.csv')
grid_data_og.columns
# drop everything except 'Datetime' and 'Demand'
grid_data_og = grid_data_og.drop(grid_data_og.columns.difference(['DateTime', 'IE Demand']), 1)
# grid_data_og.drop(grid_data_og.columns.difference(['DateTime', 'IE Demand']), 1, inplace=True)
grid_data_og.info()
# setting date as datetime64
grid_data_og['DateTime'] = pd.to_datetime(grid_data_og['DateTime'])
# select only the data from 2014
grid_data_og = grid_data_og[grid_data_og['DateTime'].dt.year == 2014]
grid_data_og.info()
# range of dates
min_date = grid_data_og['DateTime'].min()
max_date = grid_data_og['DateTime'].max()
# compute the number of days between the first and last date with a frequency of 15 min
range_dates = pd.date_range(start=min_date, end=max_date, freq='15min')
# difference -> look for missing dates in the original data
# probably because of the daylight saving time
print(range_dates.difference(grid_data_og['DateTime']))
# displaying the missing values in the new dataset
grid_data[grid_data['Demand'].isna() == True]
# 2014-08-14 14:15:00 of the original data
grid_data_og[grid_data_og['DateTime'] == '2014-08-14 14:15:00']
# interpolating missing values
grid_data['Demand'] = grid_data['Demand'].interpolate(method='linear')
grid_data.info()
# plot the time series
plt.figure(figsize=(15,10))
sns.lineplot(x='Datetime', y='Demand', data=grid_data, label='Demand').set_title('Energy demand overtime')
plt.show()
# estimate autocorrelation coefficient for 10 days
ten_days_lags_samples = 10 * 24 * 4
fig, ax = plt.subplots(figsize=(15,10))
plt.xlabel('10 days lag')
plt.ylabel('Autocorrelation')
sm.graphics.tsa.plot_acf(grid_data['Demand'], lags=ten_days_lags_samples, ax=ax, title='Autocorrelation of Energy demand')
plt.show()
# 3
# time of year variable that varies from 0 to 1
time_of_year = grid_data['Datetime'].dt.dayofyear / 365
time_of_year
# how demand varies over the year
plt.figure(figsize=(15,10))
plt.xlabel("Time of the year(0-1)")
plt.ylabel("Demand variation")
sns.lineplot(x=time_of_year, y=grid_data['Demand'], data=grid_data, label='Demand variation').set_title('Demand variation overtime')
plt.show()
# 4
# average demand per month
# avg_demand_per_month = grid_data.resample('M', on='Datetime').mean()
avg_demand_per_month = grid_data.groupby(grid_data['Datetime'].dt.month).mean()
avg_demand_per_month
# 4
# months = grid_data['Datetime'].dt.month
# divide months in 12 categories
# months_categories = pd.cut(months, 12, labels=False)
twelve_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# plot average demand per month over months
plt.figure(figsize=(15,10))
# bar plot
fig = sns.barplot(x=twelve_months, y=avg_demand_per_month['Demand'], data=avg_demand_per_month).set_title('Average Demand per month')
plt.xlabel("Months")
plt.ylabel("Average Demand")
plt.show()
# 5
# average demand each hour of the day
avg_demand_per_hour = grid_data.groupby(grid_data['Datetime'].dt.hour).mean()
# avg_demand_per_hour = grid_data.resample('H', on='Datetime').mean()
# avg_demand_per_hour
# # plot average demand per hour over hours
plt.figure(figsize=(15,10))
# bar plot
sns.barplot(x=avg_demand_per_hour.index, y=avg_demand_per_hour['Demand'], data=avg_demand_per_hour).set_title('Average Demand per hour')
plt.xlabel("Hours")
plt.ylabel("Average Demand")
plt.show()
# 6
# average demand for each day of the week
avg_demand_per_day_of_week = grid_data.groupby(grid_data['Datetime'].dt.dayofweek).mean()
# avg_demand_per_week = grid_data.resample('D', on='Datetime').mean()
avg_demand_per_day_of_week
# 6
days_of_the_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# plot average demand per day of the week
plt.figure(figsize=(15,10))
# bar plot
sns.barplot(x=days_of_the_week, y=avg_demand_per_day_of_week['Demand'], data=avg_demand_per_day_of_week).set_title('Average Demand per day of the week')
plt.xlabel("Days of the week")
plt.ylabel("Average demand")
plt.show()
# 7
# days_of_the_week_in_year = grid_data['Datetime'].dt.dayofweek
# print(days_of_the_week_in_year)
# avg_demand_per_hour_of_day = []
# for d in days_of_the_week_in_year:
def avg_daily_demand(d):
avg_demand_per_hour_of_day = grid_data[grid_data['Datetime'].dt.dayofweek == d].groupby(grid_data['Datetime'].dt.hour).mean()
return avg_demand_per_hour_of_day
# print(avg_daily_demand(0)['Demand'].values)
# average daily demand for seven days of the week
plt.figure(figsize=(15,10))
for d in range(0, 7):
sns.lineplot(x=np.arange(0,24), y=avg_daily_demand(d)['Demand'].values, label=d).set_title('Average Demand per hour in 7 days')
plt.xlabel("Hours")
plt.ylabel("Average Demand")
plt.show()
# 8
# weekdays
weekdays = grid_data[grid_data['Datetime'].dt.dayofweek <= 4]
weekdays
# weekends
weekends = grid_data[grid_data['Datetime'].dt.dayofweek > 4]
weekends
# statistical hypothesis test between weekdays and weekends
t_stat, p_value = ttest_ind(weekdays['Demand'], weekends['Demand'])
print(t_stat, p_value)
# t_stat, p_value = f_oneway(weekdays['Demand'], weekends['Demand'])
# print(t_stat, p_value)
# 9
# split the data into halves
train = grid_data[:int(len(grid_data) / 2)]
# evaluation set
test = grid_data[int(len(grid_data) / 2):]
# mean absolute error
def mean_abs_error(actual, predicted):
return np.mean(np.abs(actual - predicted))
# persistence benchmark, forecast horizons
persistence_model = []
error = []
fh = 24*4
for n in range(0, fh):
persistence_model = pd.Series(test['Demand'].shift(n))
persistence_model.dropna(inplace=True)
error.append(mean_abs_error(test['Demand'], persistence_model))
# print(error)
# maximum (worst) error
print(max(error), "error(energy)","at ", "horizon", error.index(max(error)))
# minimum (best) error
print(error[1], "error(energy)","at ", "horizon", error.index(error[1]))
# plot MAE against time leads
plt.figure(figsize=(15,10))
plt.xlabel("Forecast horizons")
plt.ylabel("Mean absolute error")
sns.lineplot(x=range(0, fh), y=error).set_title('Mean absolute error vs forecast horizons')
plt.show()
# MAPE
def mean_abs_percentage_error(actual, predicted):
return np.mean(np.abs((actual - predicted) / actual)) * 100
p_error = []
p_persistence_model = []
for n in range(0, fh):
p_persistence_model = pd.Series(test['Demand'].shift(n))
p_persistence_model.dropna(inplace=True)
p_error.append(mean_abs_percentage_error(test['Demand'], p_persistence_model))
# print(p_error)
# maximum (worst) error
print(max(p_error), "error(energy)","at", "horizon", p_error.index(max(p_error)))
# minimum (best) error
# print(min(p_error), "error(energy)","at", "horizon", p_error.index(min(p_error)))
print(p_error[1], "error(energy)","at ", "horizon", p_error.index(p_error[1]))
# plot MAPE against time leads
plt.figure(figsize=(15,10))
plt.xlabel("Forecast horizons")
plt.ylabel("Mean absolute percentage error")
sns.lineplot(x=range(0, fh), y=p_error, ).set_title('Mean absolute percentage error vs forecast horizons')