Time series analysis on EirGrid intra-day wind-power generation

import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import statsmodels.api as sm from scipy.stats import ttest_ind # from scipy.stats import f_oneway # from arch.unitroot import VarianceRatio # from sklearn.model_selection import train_test_split sns.set_theme(style="darkgrid")

# loading the data grid_data = pd.read_csv('dataset/EirGridSystemDemand2014.csv')

# shape grid_data.shape

# first 5 rows grid_data.head()

# last 5 rows grid_data.tail()

# structure information of the data grid_data.info()

# checking what's wrong with 'Demand' grid_data['Demand'].isnull().sum()

# checking the type of object types grid_data.select_dtypes(include=[np.datetime64])

grid_data.select_dtypes(include=[np.number])

grid_data.describe()

# turning 'Date' column to datetime64 grid_data['Date'] = pd.to_datetime(grid_data['Date'])

grid_data['Date'].dtype

# turning 'Time' column to timedelta64 grid_data['Time'] = pd.to_timedelta(grid_data['Time'] + ':00')

# combining date and time with pop() grid_data['Datetime'] = grid_data.pop('Date') + grid_data.pop('Time') # grid_data['Datetime'] = grid_data['Date'] + grid_data['Time'] grid_data.info()

# looking for outliers plt.figure(figsize=(12, 6)) sns.boxplot(grid_data['Demand'])

# relationship between 'Demand' and 'Datetime' plt.figure(figsize=(12, 6)) sns.scatterplot(grid_data['Datetime'], grid_data['Demand'], label='Demand vs Datetime').set_title('Relationship between Datetime and Demand')

# grid_data_og = pd.read_excel('dataset/System-Data-Qtr-Hourly-2014-2015.xlsx') # to csv # grid_data_og.to_csv('dataset/System-Data-Qtr-Hourly-2014-2015.csv', index=False)

# read the original data grid_data_og = pd.read_csv('dataset/System-Data-Qtr-Hourly-2014-2015.csv') grid_data_og.columns

# drop everything except 'Datetime' and 'Demand' grid_data_og = grid_data_og.drop(grid_data_og.columns.difference(['DateTime', 'IE Demand']), 1) # grid_data_og.drop(grid_data_og.columns.difference(['DateTime', 'IE Demand']), 1, inplace=True) grid_data_og.info()

# setting date as datetime64 grid_data_og['DateTime'] = pd.to_datetime(grid_data_og['DateTime']) # select only the data from 2014 grid_data_og = grid_data_og[grid_data_og['DateTime'].dt.year == 2014] grid_data_og.info()

# range of dates min_date = grid_data_og['DateTime'].min() max_date = grid_data_og['DateTime'].max() # compute the number of days between the first and last date with a frequency of 15 min range_dates = pd.date_range(start=min_date, end=max_date, freq='15min')

# difference -> look for missing dates in the original data # probably because of the daylight saving time print(range_dates.difference(grid_data_og['DateTime']))

# displaying the missing values in the new dataset grid_data[grid_data['Demand'].isna() == True]

# 2014-08-14 14:15:00 of the original data grid_data_og[grid_data_og['DateTime'] == '2014-08-14 14:15:00']

# interpolating missing values grid_data['Demand'] = grid_data['Demand'].interpolate(method='linear')

grid_data.info()

# plot the time series plt.figure(figsize=(15,10)) sns.lineplot(x='Datetime', y='Demand', data=grid_data, label='Demand').set_title('Energy demand overtime') plt.show()

# estimate autocorrelation coefficient for 10 days ten_days_lags_samples = 10 * 24 * 4 fig, ax = plt.subplots(figsize=(15,10)) plt.xlabel('10 days lag') plt.ylabel('Autocorrelation') sm.graphics.tsa.plot_acf(grid_data['Demand'], lags=ten_days_lags_samples, ax=ax, title='Autocorrelation of Energy demand') plt.show()

# 3 # time of year variable that varies from 0 to 1 time_of_year = grid_data['Datetime'].dt.dayofyear / 365 time_of_year

# how demand varies over the year plt.figure(figsize=(15,10)) plt.xlabel("Time of the year(0-1)") plt.ylabel("Demand variation") sns.lineplot(x=time_of_year, y=grid_data['Demand'], data=grid_data, label='Demand variation').set_title('Demand variation overtime') plt.show()

# 4 # average demand per month # avg_demand_per_month = grid_data.resample('M', on='Datetime').mean() avg_demand_per_month = grid_data.groupby(grid_data['Datetime'].dt.month).mean() avg_demand_per_month

# 4 # months = grid_data['Datetime'].dt.month # divide months in 12 categories # months_categories = pd.cut(months, 12, labels=False) twelve_months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] # plot average demand per month over months plt.figure(figsize=(15,10)) # bar plot fig = sns.barplot(x=twelve_months, y=avg_demand_per_month['Demand'], data=avg_demand_per_month).set_title('Average Demand per month') plt.xlabel("Months") plt.ylabel("Average Demand") plt.show()

# 5 # average demand each hour of the day avg_demand_per_hour = grid_data.groupby(grid_data['Datetime'].dt.hour).mean() # avg_demand_per_hour = grid_data.resample('H', on='Datetime').mean() # avg_demand_per_hour # # plot average demand per hour over hours plt.figure(figsize=(15,10)) # bar plot sns.barplot(x=avg_demand_per_hour.index, y=avg_demand_per_hour['Demand'], data=avg_demand_per_hour).set_title('Average Demand per hour') plt.xlabel("Hours") plt.ylabel("Average Demand") plt.show()

# 6 # average demand for each day of the week avg_demand_per_day_of_week = grid_data.groupby(grid_data['Datetime'].dt.dayofweek).mean() # avg_demand_per_week = grid_data.resample('D', on='Datetime').mean() avg_demand_per_day_of_week

# 6 days_of_the_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] # plot average demand per day of the week plt.figure(figsize=(15,10)) # bar plot sns.barplot(x=days_of_the_week, y=avg_demand_per_day_of_week['Demand'], data=avg_demand_per_day_of_week).set_title('Average Demand per day of the week') plt.xlabel("Days of the week") plt.ylabel("Average demand") plt.show()

# 7 # days_of_the_week_in_year = grid_data['Datetime'].dt.dayofweek # print(days_of_the_week_in_year) # avg_demand_per_hour_of_day = [] # for d in days_of_the_week_in_year: def avg_daily_demand(d): avg_demand_per_hour_of_day = grid_data[grid_data['Datetime'].dt.dayofweek == d].groupby(grid_data['Datetime'].dt.hour).mean() return avg_demand_per_hour_of_day

# print(avg_daily_demand(0)['Demand'].values)

# average daily demand for seven days of the week plt.figure(figsize=(15,10)) for d in range(0, 7): sns.lineplot(x=np.arange(0,24), y=avg_daily_demand(d)['Demand'].values, label=d).set_title('Average Demand per hour in 7 days') plt.xlabel("Hours") plt.ylabel("Average Demand") plt.show()

# 8 # weekdays weekdays = grid_data[grid_data['Datetime'].dt.dayofweek <= 4] weekdays

# weekends weekends = grid_data[grid_data['Datetime'].dt.dayofweek > 4] weekends

# statistical hypothesis test between weekdays and weekends t_stat, p_value = ttest_ind(weekdays['Demand'], weekends['Demand']) print(t_stat, p_value)

# t_stat, p_value = f_oneway(weekdays['Demand'], weekends['Demand']) # print(t_stat, p_value)

# 9 # split the data into halves train = grid_data[:int(len(grid_data) / 2)] # evaluation set test = grid_data[int(len(grid_data) / 2):]

# mean absolute error def mean_abs_error(actual, predicted): return np.mean(np.abs(actual - predicted))

# persistence benchmark, forecast horizons persistence_model = [] error = [] fh = 24*4 for n in range(0, fh): persistence_model = pd.Series(test['Demand'].shift(n)) persistence_model.dropna(inplace=True) error.append(mean_abs_error(test['Demand'], persistence_model))

# print(error)

# maximum (worst) error print(max(error), "error(energy)","at ", "horizon", error.index(max(error)))

# minimum (best) error print(error[1], "error(energy)","at ", "horizon", error.index(error[1]))

# plot MAE against time leads plt.figure(figsize=(15,10)) plt.xlabel("Forecast horizons") plt.ylabel("Mean absolute error") sns.lineplot(x=range(0, fh), y=error).set_title('Mean absolute error vs forecast horizons') plt.show()

# MAPE def mean_abs_percentage_error(actual, predicted): return np.mean(np.abs((actual - predicted) / actual)) * 100 p_error = [] p_persistence_model = [] for n in range(0, fh): p_persistence_model = pd.Series(test['Demand'].shift(n)) p_persistence_model.dropna(inplace=True) p_error.append(mean_abs_percentage_error(test['Demand'], p_persistence_model))

# print(p_error)

# maximum (worst) error print(max(p_error), "error(energy)","at", "horizon", p_error.index(max(p_error)))

# minimum (best) error # print(min(p_error), "error(energy)","at", "horizon", p_error.index(min(p_error))) print(p_error[1], "error(energy)","at ", "horizon", p_error.index(p_error[1]))

# plot MAPE against time leads plt.figure(figsize=(15,10)) plt.xlabel("Forecast horizons") plt.ylabel("Mean absolute percentage error") sns.lineplot(x=range(0, fh), y=p_error, ).set_title('Mean absolute percentage error vs forecast horizons')