Setting up a Data Pipeline for Energy Data Extraction ⚙️
Setting up the Extraction Logic
import io
from datetime import datetime
import requests
import pandas as pd
from dateutil.rrule import rrule, MONTHLY
import numpy as np
### Data Eng. Functions ###
def month_iter(start_month, start_year, end_month, end_year):
    """
    Build a lazy sequence of (month, year) tuples, one per calendar month,
    running from the start month/year through the end month/year inclusive.
    Nothing is yielded when the end precedes the start.
    """
    # Constructing the datetimes up front keeps the eager validation of the
    # arguments (e.g. a month outside 1-12 raises ValueError immediately).
    first = datetime(start_year, start_month, 1)
    last = datetime(end_year, end_month, 1)

    # Flatten each (year, month) into a single running month index so the
    # whole span becomes a plain integer range.
    lowest = first.year * 12 + (first.month - 1)
    highest = last.year * 12 + (last.month - 1)

    return (
        (index % 12 + 1, index // 12)
        for index in range(lowest, highest + 1)
    )
def download_aemo_data_nem(state, start_month, start_year, end_month, end_year,
                           *, timeout=60):
    """
    Download AEMO price-and-demand CSV files, one per month, and combine them
    into a single pandas DataFrame.

    One file is requested for every month from the starting month/year
    through the ending month/year (inclusive).

    Args:
        state: Region identifier inserted into the file name ahead of the
            literal '1' suffix (a value such as 'NSW' produces
            'PRICE_AND_DEMAND_<YYYYMM>_NSW1.csv').
        start_month: First month to download (1-12).
        start_year: Year of the first month.
        end_month: Last month to download (1-12).
        end_year: Year of the last month.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        DataFrame indexed by 'datetime' (parsed from the 'SETTLEMENTDATE'
        column), with the 'SETTLEMENTDATE' and 'PERIODTYPE' columns removed.

    Raises:
        ValueError: If the requested range contains no months (end before
            start).
        requests.HTTPError: If any monthly file request returns an error
            status.
    """
    # Adjacent literals (rather than a backslash continuation inside the
    # string) keep source indentation out of the header value.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/50.0.2661.102 Safari/537.36'
        ),
    }

    # Collect the monthly frames and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0, and appending inside the
    # loop re-copies the accumulated data on every iteration.
    monthly_frames = []
    for month_number, year_number in month_iter(start_month, start_year,
                                                end_month, end_year):
        month = str(month_number).zfill(2)
        year = str(year_number)
        reference = year + month
        print(f'Adding Month/Year: {month}/{year}')
        url = (
            'https://www.aemo.com.au/aemo/data/nem/priceanddemand/'
            f'PRICE_AND_DEMAND_{reference}_{state}1.csv'
        )
        response = requests.get(url, headers=headers, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of feeding an error page to read_csv.
        response.raise_for_status()
        csv_text = response.content.decode('utf-8')
        monthly_frames.append(pd.read_csv(io.StringIO(csv_text)))

    if not monthly_frames:
        raise ValueError(
            'No months to download: the end month/year is before the '
            'start month/year.'
        )

    aemo_df = pd.concat(monthly_frames, ignore_index=True)
    aemo_df['datetime'] = pd.to_datetime(aemo_df['SETTLEMENTDATE'])
    aemo_df = aemo_df.set_index('datetime')
    return aemo_df.drop(['SETTLEMENTDATE', 'PERIODTYPE'], axis=1)