HackIllinois 2022

import pandas as pd import plotly.express as px import numpy as np import seaborn as sns import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split, GridSearchCV from sklearn.metrics import mean_squared_error from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA from sklearn.pipeline import Pipeline from sklearn.ensemble import RandomForestRegressor from datetime import datetime from pandas.tseries.holiday import USFederalHolidayCalendar as calendar

# Load the train and test tables straight from the project repository.
train_url = 'https://raw.githubusercontent.com/benthecoder/HackIllinois2022/main/data/Ashrae/buildings_train.csv'
test_url = 'https://raw.githubusercontent.com/benthecoder/HackIllinois2022/main/data/Ashrae/buildings_test.csv'
train = pd.read_csv(train_url)
test = pd.read_csv(test_url)

# Preview the first rows of the training table.
train.head()

# Histogram of meter readings with a log-scaled count axis (log_y=True);
# the figure is shown and also saved as a standalone HTML page.
hist_labels = {
    "meter_reading": "Energy Consumption (kWh)",
    "count": "Count (log)",
}
fig = px.histogram(
    train,
    x="meter_reading",
    log_y=True,
    labels=hist_labels,
    title="Counts of Energy Consumption Readings",
)
fig.show()
fig.write_html("hist.html")

# Interactive box plot of meter readings per building usage, saved to HTML.
# update_layout returns the figure itself, so the calls can be chained.
box_plt = px.box(train, x="primary_use", y="meter_reading").update_layout(
    height=1000
)
box_plt.write_html("box_plt.html")

# Static box plot: meter reading distribution for each building usage.
plt.figure(figsize=(13, 7))
sns.set_theme()
usage_ax = sns.boxplot(x='primary_use', y="meter_reading", data=train)
usage_ax.set(
    xlabel='Building Usage',
    ylabel='Energy Consumption (kWh)',
    title='Energy Distribution based on Building Usage',
)
# Tilt the category labels by 45 degrees.
plt.xticks(rotation=45)
plt.show()

# Static box plot: meter reading distribution for each construction year.
plt.figure(figsize=(13, 7))
sns.set_theme()
year_ax = sns.boxplot(x='year_built', y="meter_reading", data=train)
year_ax.set(
    xlabel='Year Building was Constructed',
    ylabel='Energy Consumption (kWh)',
    title='Energy Distribution based on Construction year',
)
# Tilt the year labels by 45 degrees.
plt.xticks(rotation=45)
plt.show()

# Mean meter reading per construction year.
# Only `meter_reading` is aggregated: averaging the whole frame would also hit
# the string columns (e.g. `primary_use`, `timestamp`), which raises a
# TypeError on pandas >= 2.0 where `numeric_only` no longer defaults to True.
mean_by_year = train.groupby('year_built', as_index=False)['meter_reading'].mean()
fig = px.bar(
    mean_by_year,
    x='year_built',
    y="meter_reading",
    labels={
        'year_built': "Year building was constructed",
        "meter_reading": "Energy Consumption (kWh)",
    },
    title="Mean Energy Consumption based on Construction Year",
)
fig.show()

# Mean meter reading per (wind speed, wind direction) pair, animated over
# wind direction.
# Only `meter_reading` is aggregated: averaging the whole frame would also hit
# the string columns, which raises a TypeError on pandas >= 2.0.
# Rows are sorted by wind direction so the animation frames are emitted in
# increasing direction order rather than in first-appearance order.
wind_means = (
    train.groupby(['wind_speed', 'wind_direction'], as_index=False)['meter_reading']
    .mean()
    .sort_values(['wind_direction', 'wind_speed'])
)
fig = px.bar(
    wind_means,
    x='wind_speed',
    y="meter_reading",
    color='wind_speed',
    # Labels now describe the columns this chart actually uses (the original
    # carried over a `year_built` label from the previous chart).
    labels={
        'wind_speed': "Wind Speed",
        'wind_direction': "Wind Direction",
        "meter_reading": "Energy Consumption (kWh)",
    },
    title="Energy Consumption vs Wind Conditions",
    animation_frame="wind_direction",
    animation_group="wind_speed",
    range_y=[0, 450],
    range_x=[0, 20],
)
fig.show()

def scatterplt(var, x_label):
    """Show a scatter plot of one `train` column against `meter_reading`.

    Args:
        var: Name of the `train` column to put on the x-axis.
        x_label: Human-readable name used for the x-axis label and appended
            to the plot title.
    """
    plt.figure(figsize=(13, 7))
    sns.set_theme()
    # Low alpha keeps overlapping points distinguishable.
    sns.scatterplot(data=train, x=var, y="meter_reading", alpha=0.1)
    plt.xlabel(x_label)
    plt.ylabel('Energy Consumption (kWh)')
    # The trailing space after "on" separates the prefix from the label
    # (previously rendered as e.g. "based onAir Temperature").
    plt.title('Energy consumption based on ' + str(x_label))
    plt.show()

# Scatter plots of meter reading against the two temperature columns.
for column, label in (
    ('air_temperature', 'Air Temperature'),
    ('dew_temperature', 'Dew Temperature'),
):
    scatterplt(column, label)

# Static box plot: meter reading distribution for each cloud coverage value.
plt.figure(figsize=(13, 7))
sns.set_theme()
cloud_ax = sns.boxplot(x='cloud_coverage', y="meter_reading", data=train)
cloud_ax.set(
    xlabel='Cloud Coverage',
    ylabel='Energy Consumption (kWh)',
    title='Energy consumption based on cloud coverage',
)
plt.show()

# Scatter plots of meter reading against the remaining numeric columns.
for column, label in (
    ('precip_depth_1_hr', 'Precipitation Depth'),
    ('sea_level_pressure', 'Sea Level Pressure'),
    ('square_feet', 'Square Feet'),
):
    scatterplt(column, label)

# For training and testing data: reload the raw table and parse timestamps.
train = pd.read_csv('https://raw.githubusercontent.com/benthecoder/HackIllinois2022/main/data/Ashrae/buildings_train.csv')
train['timestamp'] = pd.to_datetime(train['timestamp'])

# Get info from timestamp using the vectorised .dt accessor instead of
# per-row Python loops.
ts = train['timestamp']
train['hour'] = ts.dt.hour
train['day'] = ts.dt.day
train['month'] = ts.dt.month
# ISO week number; cast from the nullable UInt32 that isocalendar() returns
# to a plain integer column (assumes no missing timestamps).
train['week'] = ts.dt.isocalendar().week.astype('int64')
train['day_name'] = ts.dt.day_name()

# US federal holidays covering the span of the data.
cal = calendar()
holidays = cal.holidays(start=ts.min().normalize(), end=ts.max())
# Holiday dates are midnight timestamps, so compare on the calendar date:
# matching the raw timestamp would flag only rows stamped exactly 00:00.
train['holiday'] = ts.dt.normalize().isin(holidays).astype(int)

## One hot encoding
# cloud_coverage is discarded outright (the original note cites ~40% of its
# values as missing).
train = train.drop(columns=['cloud_coverage'])

# Swap each categorical column for its indicator columns, building usage
# first and weekday name second.
for categorical in ('primary_use', 'day_name'):
    one_hot = pd.get_dummies(train[categorical])
    train = train.drop(columns=[categorical]).join(one_hot)

# Now drop rows that still contain missing values.
train = train.dropna()

# Target vector and feature matrix: the target and the raw timestamp are
# excluded from the features.
non_feature_cols = ['meter_reading', 'timestamp']
y = train['meter_reading']
X = train.drop(columns=non_feature_cols)
X

# Hold out 20% of the rows as the test set; the other 80% form the
# development set (a single split with a fixed seed).
dev_X, test_X, dev_y, test_y = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Scale data: statistics are learned on the development set only and then
# applied to both partitions.
scaler = StandardScaler().fit(dev_X)
dev_X_transformed = scaler.transform(dev_X)
test_X_transformed = scaler.transform(test_X)

# Model: standardise -> PCA -> random forest, tuned by grid search.
# Scaling is a pipeline step so that PCA works on standardised features and
# each cross-validation fold is scaled using only its own training portion
# (the pipeline was previously fitted on unscaled data).
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('dr', PCA()),
    ('rf', RandomForestRegressor()),
])
params = {
    "dr__n_components": [15, 20, 28],
    # NOTE: tree depth is pinned to 5; a wider grid ([5, 10, 25, 40]) was
    # tried and disabled in the original code.
    "rf__max_depth": [5],
    "rf__n_estimators": [50, 100, 150, 200],
}
search = GridSearchCV(pipe, params, return_train_score=True, cv=10)
search.fit(dev_X, dev_y)
best_params = search.best_params_

# Evaluate the refitted best estimator on the held-out test set.
score = search.score(test_X, test_y)
pred_y = search.predict(test_X)
# RMSE computed explicitly: the `squared=False` flag of mean_squared_error is
# deprecated and no longer accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(test_y, pred_y))