import pandas as pd
import plotly.express as px
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from datetime import datetime
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
# Load the ASHRAE building train/test CSVs straight from GitHub.
train_url = 'https://raw.githubusercontent.com/benthecoder/HackIllinois2022/main/data/Ashrae/buildings_train.csv'
test_url = 'https://raw.githubusercontent.com/benthecoder/HackIllinois2022/main/data/Ashrae/buildings_test.csv'
train = pd.read_csv(train_url)
test = pd.read_csv(test_url)
train.head()

# Histogram of meter readings with a log-scaled count axis; shown and saved.
hist_labels = {
    "meter_reading": "Energy Consumption (kWh)",
    "count": "Count (log)",
}
fig = px.histogram(
    train,
    x="meter_reading",
    log_y=True,
    labels=hist_labels,
    title="Counts of Energy Consumption Readings",
)
fig.show()
fig.write_html("hist.html")

# Box plot of meter readings per primary use; written to HTML only, not shown.
box_plt = px.box(train, x="primary_use", y="meter_reading")
box_plt.update_layout(height=1000)
box_plt.write_html("box_plt.html")
# Seaborn box plots of meter readings, one per grouping column.
# Each spec is (grouping column, x-axis label, figure title).
boxplot_specs = [
    ('primary_use', 'Building Usage',
     'Energy Distribution based on Building Usage'),
    ('year_built', 'Year Building was Constructed',
     'Energy Distribution based on Construction year'),
]
for group_col, axis_label, plot_title in boxplot_specs:
    plt.figure(figsize=(13, 7))
    sns.set_theme()
    sns.boxplot(data=train, x=group_col, y="meter_reading")
    plt.xlabel(axis_label)
    plt.ylabel('Energy Consumption (kWh)')
    plt.title(plot_title)
    # Tilt the tick labels so the category names do not overlap.
    plt.xticks(rotation=45)
    plt.show()
# Mean meter reading per construction year.
# Only `meter_reading` is aggregated: calling .mean() on the whole grouped
# frame also tries to average text columns such as `primary_use`, which raises
# a TypeError on pandas >= 2.0.
mean_by_year = train.groupby('year_built')['meter_reading'].mean().reset_index()
fig = px.bar(
    mean_by_year,
    x='year_built',
    y="meter_reading",
    labels={
        'year_built': "Year building was constructed",
        "meter_reading": "Energy Consumption (kWh)",
    },
    title="Mean Energy Consumption based on Construction Year",
)
fig.show()

# Mean meter reading per (wind speed, wind direction) pair, animated with one
# frame per wind direction.
mean_by_wind = (
    train.groupby(['wind_speed', 'wind_direction'])['meter_reading']
    .mean()
    .reset_index()
)
fig = px.bar(
    mean_by_wind,
    x='wind_speed',
    y="meter_reading",
    color='wind_speed',
    # Label the columns this chart actually uses.
    labels={
        'wind_speed': "Wind Speed",
        'wind_direction': "Wind Direction",
        "meter_reading": "Energy Consumption (kWh)",
    },
    title="Energy Consumption vs Wind Conditions",
    animation_frame="wind_direction",
    animation_group="wind_speed",
    # Fixed axis ranges keep the scale stable from frame to frame.
    range_y=[0, 450],
    range_x=[0, 20],
)
fig.show()
def scatterplt(var, x_label):
    """Show a scatter plot of ``meter_reading`` against one column of ``train``.

    Args:
        var: Name of the column in the module-level ``train`` frame to plot
            on the x-axis.
        x_label: Human-readable x-axis label; also used in the figure title.
    """
    plt.figure(figsize=(13, 7))
    sns.set_theme()
    # Low alpha so overlapping points in dense regions stay distinguishable.
    sns.scatterplot(data=train, x=var, y="meter_reading", alpha=0.1)
    plt.xlabel(x_label)
    plt.ylabel('Energy Consumption (kWh)')
    # Keep the space before the label so the title reads
    # "... based on Air Temperature" rather than "... based onAir Temperature".
    plt.title(f'Energy consumption based on {x_label}')
    plt.show()
# Temperature features against energy consumption.
for column, label in [
    ('air_temperature', 'Air Temperature'),
    ('dew_temperature', 'Dew Temperature'),
]:
    scatterplt(column, label)

# Cloud coverage is drawn as a box plot instead of a scatter plot.
plt.figure(figsize=(13, 7))
sns.set_theme()
sns.boxplot(data=train, x='cloud_coverage', y="meter_reading")
plt.title('Energy consumption based on cloud coverage')
plt.ylabel('Energy Consumption (kWh)')
plt.xlabel('Cloud Coverage')
plt.show()

# Remaining numeric features against energy consumption.
for column, label in [
    ('precip_depth_1_hr', 'Precipitation Depth'),
    ('sea_level_pressure', 'Sea Level Pressure'),
    ('square_feet', 'Square Feet'),
]:
    scatterplt(column, label)
# Reload the training data so feature engineering starts from the raw CSV.
train = pd.read_csv('https://raw.githubusercontent.com/benthecoder/HackIllinois2022/main/data/Ashrae/buildings_train.csv')
train['timestamp'] = pd.to_datetime(train['timestamp'])
# Rows without a timestamp would be removed by the final dropna() anyway;
# dropping them up front keeps the calendar features integer-typed.
train = train.dropna(subset=['timestamp'])

# Calendar features derived from the timestamp, using the vectorized .dt
# accessors instead of a Python-level loop over every row.
ts = train['timestamp']
train['hour'] = ts.dt.hour
train['day'] = ts.dt.day
train['month'] = ts.dt.month
# ISO week number; cast from pandas' nullable UInt32 to a plain integer dtype.
train['week'] = ts.dt.isocalendar().week.astype('int64')
train['day_name'] = ts.dt.day_name()

# US federal holiday flag. The calendar returns holidays as midnight dates, so
# both the range start and the timestamps are normalized to midnight first;
# comparing raw timestamps would only ever match readings taken at 00:00.
cal = calendar()
holidays = cal.holidays(start=ts.min().normalize(), end=ts.max())
train['holiday'] = ts.dt.normalize().isin(holidays).astype(int)

# One-hot encode building usage and weekday name, replacing the source columns.
# cloud_coverage is dropped as well (original author's note: about 40% of its
# values are missing).
usage_dummies = pd.get_dummies(train['primary_use'])
weekday_dummies = pd.get_dummies(train['day_name'])
train = pd.concat(
    [
        train.drop(['primary_use', 'cloud_coverage', 'day_name'], axis=1),
        usage_dummies,
        weekday_dummies,
    ],
    axis=1,
)

# Drop every row that still has a missing value.
train = train.dropna()

# Target and feature matrix; the raw timestamp is left out of the features
# because the derived calendar columns stand in for it.
y = train['meter_reading']
X = train.drop(['meter_reading', 'timestamp'], axis=1)
# Hold out 20% of the rows for final evaluation. The remaining 80% is the
# development set, which GridSearchCV splits further with 10-fold CV.
dev_X, test_X, dev_y, test_y = train_test_split(X, y, test_size=0.2, random_state=42)

# Standalone scaled copies of the two splits. The search below scales inside
# its own pipeline and does not read these; they are retained in case later
# code refers to them.
scaler = StandardScaler()
dev_X_transformed = scaler.fit_transform(dev_X)
test_X_transformed = scaler.transform(test_X)

# Scaling is a pipeline step so that PCA sees standardized features and each
# CV fold fits the scaler on its own training portion only.
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('dr', PCA()),
    ('rf', RandomForestRegressor()),
])
params = {
    # NOTE(review): each value must not exceed the number of columns in X;
    # confirm 28 is valid for the current feature set.
    "dr__n_components": [15, 20, 28],
    # Depth is pinned to 5; a wider grid ([5, 10, 25, 40]) was disabled earlier.
    "rf__max_depth": [5],
    "rf__n_estimators": [50, 100, 150, 200],
}
search = GridSearchCV(pipe, params, return_train_score=True, cv=10)
search.fit(dev_X, dev_y)
best_params = search.best_params_

# No `scoring` was given, so this is the estimator's default score
# (R^2 for a regressor), evaluated on the held-out split.
score = search.score(test_X, test_y)
pred_y = search.predict(test_X)
# Take the square root explicitly: the `squared=False` argument is deprecated
# in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(test_y, pred_y))