Regression Analysis on TfL Santander Cycles

import os DATASET_DIR = '/work/datasets' PROJECT_DATASET_DIR = os.path.join(DATASET_DIR, 'santander-cycle-london-tfl') if not os.path.exists(PROJECT_DATASET_DIR): os.makedirs(PROJECT_DATASET_DIR)

os.environ['KAGGLE_CONFIG_DIR'] = DATASET_DIR !kaggle datasets download hmavrodiev/london-bike-sharing-dataset !unzip -o london-bike-sharing-dataset.zip -d {PROJECT_DATASET_DIR} !rm london-bike-sharing-dataset.zip

import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns sns.set(rc={'figure.figsize':(11.7,8.27)}) plt.style.use("fivethirtyeight")

data_path = os.path.join(PROJECT_DATASET_DIR, 'london_merged.csv') hourly_df_raw = pd.read_csv(data_path) hourly_df_raw.insert(1, "hires", hourly_df_raw.cnt) hourly_df_raw.drop(["cnt"], axis=1, inplace=True) hourly_df_raw.head()

hourly_df_raw.info()

hourly_df_raw['timestamp'] = pd.to_datetime(hourly_df_raw.timestamp) hourly_df_raw.dtypes

for col in ["weather_code", "is_holiday", "is_weekend", "season"]: hourly_df_raw[col] = hourly_df_raw[col].astype(int).astype("category") hourly_df_raw.dtypes

hourly_df_raw.isna().sum()

hourly_df_raw.hires.describe()

sns.boxplot(x=hourly_df_raw.hires);

# Triangle heatmap: https://medium.com/@szabo.bibor/how-to-create-a-seaborn-correlation-heatmap-in-python-834c0686b88e mask = np.triu(np.ones_like(hourly_df_raw.corr(), dtype=np.bool)) ax = sns.heatmap(hourly_df_raw.corr(), mask=mask, annot=True, cmap="BrBG")

hourly_df_raw[["t1", "t2"]].describe()

_, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 10)) ax[0].scatter(x=hourly_df_raw.t1, y=hourly_df_raw.hires, alpha=0.5) ax[1].scatter(x=hourly_df_raw.t2, y=hourly_df_raw.hires, alpha=0.5) fontsize = 14 ax[0].set_xlabel("Actual temperature (celsius)", fontsize=fontsize) ax[0].set_title("Count of cycle hires vs actual temperature", fontsize=fontsize) ax[1].set_title("Count of cycle hires vs 'feels like' temperature", fontsize=fontsize) ax[1].set_xlabel("'Feels like' temperature (celsius)", fontsize=fontsize) ax[0].set_ylabel("Count of cycle hires", fontsize=fontsize) plt.show()

hourly_df_raw.insert(2, "temp", hourly_df_raw.t1) hourly_df_raw.drop(["t1", "t2"], axis=1, inplace=True) hourly_df_raw.head()

def scatterplot_feat_vs_hires(df: pd.DataFrame, feat_name: str, feat_label: str = None): if feat_label is None: feat_label = feat_name if feat_name not in df.columns: raise KeyError(f"'{feat_name}' is not a column in the dataframe.") _, ax = plt.subplots(figsize=(10, 5)) sns.regplot(x=feat_name, y="hires", data=df, ci=95, scatter_kws={"alpha": 0.2}, line_kws={"color": "red", "lw": 2, "alpha": 0.8}) fontsize = 14 ax.set_xlabel(feat_label, fontsize=fontsize) ax.set_ylabel("Count of cycle hires", fontsize=fontsize) ax.set_title(f"Count of cycle hires vs {feat_label}", fontsize=fontsize) plt.show()

scatterplot_feat_vs_hires(hourly_df_raw, "temp")

scatterplot_feat_vs_hires(hourly_df_raw, "hum")

hourly_df_raw.wind_speed.describe()

ax = sns.histplot(hourly_df_raw.wind_speed, kde=True) ax.set_title("Distribution of wind speed")

hourly_df_raw.wind_speed.value_counts(bins=5, sort=False)

scatterplot_feat_vs_hires(hourly_df_raw, "wind_speed")

def get_n_sample_of_windspeed_interval(n: int, lo: float, hi: float): df_filtered_on_interval = hourly_df_raw[((hourly_df_raw.wind_speed >= lo) & (hourly_df_raw.wind_speed < hi))] # Sampling with replacement. n_samples = df_filtered_on_interval.sample(n=n, random_state=44, replace=True) return n_samples[["wind_speed", "hires"]] n = 100 # Large n is ok since we are sampling with replacement. samples = [get_n_sample_of_windspeed_interval(n, i, i+10) for i in range(0, 60, 10)] balanced_windspeed_df = pd.concat(samples)

scatterplot_feat_vs_hires(balanced_windspeed_df, "wind_speed")

season_mapping = { 0: "spring", 1: "summer", 2: "fall", 3: "winter", } weather_code_mapping = { 1: "clear", 2: "scattered clouds", 3: "broken clouds", 4: "cloudy", 7: "rain", 10: "rain with thunderstorm", 26: "snowfall", 94: "freezing fog", } hourly_df_raw["weather_code_str"] = hourly_df_raw.weather_code.map(weather_code_mapping) hourly_df_raw["season_str"] = hourly_df_raw.season.map(season_mapping)

ax = sns.barplot(data=hourly_df_raw, x="hires", y="weather_code_str") ax.set_title("Count of cycle hires by weather");

ax = sns.barplot(data=hourly_df_raw, x="is_holiday", y="hires") ax.set_title("Count of cycle hires by holidays");

ax = sns.barplot(data=hourly_df_raw, x="is_holiday", y="hires", hue="season_str") ax.set_title("Count of cycle hires by holidays grouped by seasons");

ax = sns.barplot(data=hourly_df_raw, x="is_weekend", y="hires") ax.set_title("Count of cycle hires by weekends");

ax = sns.barplot(data=hourly_df_raw, x="is_weekend", y="hires", hue="season_str") ax.set_title("Count of cycle hires by weekends grouped by seasons");

hourly_df_raw.drop(["season_str", "weather_code_str"], axis=1, inplace=True) hourly_df_raw.head()

hourly_df_raw.insert(1, "hour", hourly_df_raw.timestamp.dt.hour) hourly_df_raw.head()

ax = sns.barplot(data=hourly_df_raw, x="hour", y="hires", color="#ef553b") ax.set_title("Count of rented bikes by hour");

target = hourly_df_raw["hires"] features = hourly_df_raw.drop(["hires", "timestamp"], axis=1) features.head()

import statsmodels.api as sm from sklearn.model_selection import train_test_split from sklearn.metrics import mean_squared_error, r2_score

X_train, X_test, y_train, y_test = train_test_split( features, target, test_size=0.3, random_state=44)

X_train_sm = sm.add_constant(X_train) X_test_sm = sm.add_constant(X_test) model_sm = sm.OLS(y_train, X_train).fit() y_pred = model_sm.predict(X_test) rmse = mean_squared_error(y_test, y_pred, squared=False) r2 = r2_score(y_test, y_pred) print(f"RMSE naive linear regression: {rmse}") print(f"R^2 score: {r2}")

target.describe()

from sklearn.preprocessing import PolynomialFeatures

rmses = [] r2_scores = [] degrees = [1, 2, 3, 4] for d in degrees: poly = PolynomialFeatures(degree=d) X_train_tmp = poly.fit_transform(X_train) X_test_tmp = poly.transform(X_test) # No need to do sm.add_constant() as that is # already done by PolynomialFeatures. model_sm = sm.OLS(y_train, X_train_tmp).fit() y_pred = model_sm.predict(X_test_tmp) rmse = mean_squared_error(y_test, y_pred, squared=False) r2 = r2_score(y_test, y_pred) rmses.append(rmse) r2_scores.append(r2)

for d, rmse, r2 in zip(degrees, rmses, r2_scores): print(f"Degree {d} | RMSE: {rmse}, r^2: {r2}")

ax = sns.lineplot(x=degrees, y=rmses) ax.set(xticks=degrees) ax.set_title("RMSE vs degree of power terms considered") ax.set_xlabel("Degree of power terms") ax.set_ylabel("Root mean squared error (RMSE)")

ax = sns.lineplot(x=degrees, y=r2_scores) ax.set(xticks=degrees) ax.set_title("$R^2$ scores vs degree of power term considered") ax.set_xlabel("Degree of power term") ax.set_ylabel("$R^2$ scores")

from sklearn.preprocessing import OneHotEncoder

categorical_cols = X_train.columns[X_train.dtypes == "category"].tolist() enc = OneHotEncoder(handle_unknown="error", sparse=False) enc_X_train = enc.fit_transform(X_train[categorical_cols]) # Encoded X_train above is a numpy array. To convert it back to a DataFrame, # we use pd.DataFrame with the transformed columns and original X_train index. # The index is important when concatenating to the original X_train dataframe # containing the non-categorical columns as the join is on index. enc_X_train = pd.DataFrame(enc_X_train, columns=enc.get_feature_names_out(categorical_cols), index=X_train.index) enc_X_train = pd.concat([X_train, enc_X_train], axis=1)\ .drop(categorical_cols, axis=1) enc_X_train.head()

def preprocess_new_sample(new_sample, ohe_encoder): for col in ["weather_code", "is_holiday", "is_weekend", "season"]: new_sample[col] = new_sample[col].astype(int).astype("category") categorical_cols = new_sample.columns[new_sample.dtypes == "category"].tolist() enc_new_sample = ohe_encoder.transform(new_sample[categorical_cols]) index = new_sample.index enc_new_sample = pd.DataFrame(enc_new_sample, columns=ohe_encoder.get_feature_names_out( categorical_cols), index=index) enc_df = pd.concat([new_sample, enc_new_sample], axis=1)\ .drop(categorical_cols, axis=1) return enc_df

test_sample = pd.DataFrame( dict(hour=13, temp=15, hum=21, wind_speed=17.1, weather_code=3, is_holiday=0, is_weekend=0, season=3), index=[0] ) preprocess_new_sample(test_sample, enc)

enc_X_test = preprocess_new_sample(X_test, enc) enc_X_test.head()

enc_X_train_sm = sm.add_constant(enc_X_train) enc_X_test_sm = sm.add_constant(enc_X_test) model_sm = sm.OLS(y_train, enc_X_train_sm).fit() y_pred = model_sm.predict(enc_X_test_sm) rmse = mean_squared_error(y_test, y_pred, squared=False) r2 = r2_score(y_test, y_pred) print(f"RMSE linear regression on one hot encoded data: {rmse}") print(f"R^2 score: {r2}")

poly = PolynomialFeatures(degree=3) enc_X_train_tmp = poly.fit_transform(enc_X_train) enc_X_test_tmp = poly.transform(enc_X_test) # No need to do sm.add_constant() as that is # already done by PolynomialFeatures. model_sm = sm.OLS(y_train, enc_X_train_tmp).fit() y_pred = model_sm.predict(enc_X_test_tmp) rmse = mean_squared_error(y_test, y_pred, squared=False) r2 = r2_score(y_test, y_pred) print("RMSE linear regression on one-hot encoded + cubic power term data:" + f" {rmse}") print(f"R^2 score: {r2}")

poly = PolynomialFeatures(degree=3) X_train_deg3 = poly.fit_transform(X_train) X_test_deg3 = poly.transform(X_test) # No need to do sm.add_constant() as that is # already done by PolynomialFeatures. model_sm = sm.OLS(y_train, X_train_deg3).fit() y_pred = model_sm.predict(X_test_deg3) rmse = mean_squared_error(y_test, y_pred, squared=False) r2 = r2_score(y_test, y_pred) print("RMSE linear regression on cubic power term transformation:" + f" {rmse}") print(f"R^2 score: {r2}")

def adjusted_r2_score(r2, n, p): return 1 - (1-r2) * ((n-1.)/(n-p-1.)) adjusted_r2 = adjusted_r2_score(r2, len(y_test), X_test_deg3.shape[1]) print(f"Adjusted R^2 score: {adjusted_r2}")

hourly_df_lag = hourly_df_raw.copy() ma_lags = [2, 5, 10] for i in ma_lags: hourly_df_lag["hires_ma_" + str(i)] = hourly_df_lag.hires.rolling(i).mean() hourly_df_lag.dropna(inplace=True)

target = hourly_df_lag.hires features = hourly_df_lag.drop(["hires", "timestamp"], axis=1) X_train_lag, X_test_lag, y_train_lag, y_test_lag = train_test_split( features, target, test_size=0.3, random_state=44)

X_train_lag_sm = sm.add_constant(X_train_lag) X_test_lag_sm = sm.add_constant(X_test_lag) model_lag_sm = sm.OLS(y_train_lag, X_train_lag_sm).fit() y_pred = model_lag_sm.predict(X_test_lag_sm) rmse = mean_squared_error(y_test_lag, y_pred, squared=False) r2 = r2_score(y_test_lag, y_pred) print(f"RMSE linear regression on data with lag: {rmse}") print(f"R^2 score: {r2}")

mask = np.triu(np.ones_like(hourly_df_lag.corr(), dtype=np.bool)) ax = sns.heatmap(hourly_df_lag.corr(), mask=mask, annot=True, cmap="BrBG")

model_sm = sm.OLS(y_train, X_train_deg3).fit() y_pred = model_sm.predict(X_test_deg3)

def residuals_vs_fitted_plot(y_pred, y_true, ax=None): if ax is None: _, ax = plt.subplots() sns.residplot( x=y_pred, y=y_true, lowess=True, scatter_kws={"alpha": 0.2}, line_kws={"color": "red", "lw": 2, "alpha": 0.8}, ax=ax, ) ax.set_title("Residuals vs Fitted plot") ax.set_xlabel("Fitted values") ax.set_ylabel("Residuals"); residuals_vs_fitted_plot(y_pred, y_test)

from statsmodels.graphics.gofplots import qqplot

std_residuals = model_sm.get_influence().resid_studentized_internal qq = qqplot(std_residuals, line="s", alpha=0.2);

sns.kdeplot(y_train)

# Observe that min = 0, so we can't apply Boxcox target.min()

import scipy.stats as stats

y_train_yj, lambda_yj = stats.yeojohnson(y_train) sns.kdeplot(y_train_yj)

model_sm = sm.OLS(y_train_yj, X_train_deg3).fit() y_pred = model_sm.predict(X_test_deg3) y_test_yj = stats.yeojohnson(y_test, lmbda=lambda_yj) rmse = mean_squared_error(y_test_yj, y_pred, squared=False) r2 = r2_score(y_test_yj, y_pred) adjusted_r2 = adjusted_r2_score(r2, len(y_test_yj), X_test.shape[1]) print(("RMSE linear model on Yeo-Johnson transformed data" + f" + cubic power term features: {rmse:.5f}")) print(f"R^2 score: {r2:.5f}") print(f"Adjusted R^2 score: {adjusted_r2:.5f}")

residuals_vs_fitted_plot(y_pred, y_test_yj)

std_residuals = model_sm.get_influence().resid_studentized_internal qq = qqplot(std_residuals, line="s", alpha=0.2);

test_sample = pd.DataFrame( dict(hour=13, temp=15, hum=21, wind_speed=17.1, weather_code=3, is_holiday=0, is_weekend=0, season=3), index=[0] )

def inverse_yeojohnson(y, lam): return (y*lam + 1)**(1/lam) - 1

def pipeline(new_sample): new_sample = poly.transform(new_sample) # Auto add constant y_pred = model_sm.predict(new_sample) return inverse_yeojohnson(y_pred, lambda_yj)

pipeline(test_sample)