import os
DATASET_DIR = '/work/datasets'
PROJECT_DATASET_DIR = os.path.join(DATASET_DIR, 'santander-cycle-london-tfl')
if not os.path.exists(PROJECT_DATASET_DIR):
os.makedirs(PROJECT_DATASET_DIR)
os.environ['KAGGLE_CONFIG_DIR'] = DATASET_DIR
!kaggle datasets download hmavrodiev/london-bike-sharing-dataset
!unzip -o london-bike-sharing-dataset.zip -d {PROJECT_DATASET_DIR}
!rm london-bike-sharing-dataset.zip
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(11.7,8.27)})
plt.style.use("fivethirtyeight")
data_path = os.path.join(PROJECT_DATASET_DIR, 'london_merged.csv')
hourly_df_raw = pd.read_csv(data_path)
hourly_df_raw.insert(1, "hires", hourly_df_raw.cnt)
hourly_df_raw.drop(["cnt"], axis=1, inplace=True)
hourly_df_raw.head()
hourly_df_raw.info()
hourly_df_raw['timestamp'] = pd.to_datetime(hourly_df_raw.timestamp)
hourly_df_raw.dtypes
for col in ["weather_code", "is_holiday", "is_weekend", "season"]:
hourly_df_raw[col] = hourly_df_raw[col].astype(int).astype("category")
hourly_df_raw.dtypes
hourly_df_raw.isna().sum()
hourly_df_raw.hires.describe()
sns.boxplot(x=hourly_df_raw.hires);
# Triangle heatmap: https://medium.com/@szabo.bibor/how-to-create-a-seaborn-correlation-heatmap-in-python-834c0686b88e
mask = np.triu(np.ones_like(hourly_df_raw.corr(), dtype=np.bool))
ax = sns.heatmap(hourly_df_raw.corr(), mask=mask, annot=True, cmap="BrBG")
hourly_df_raw[["t1", "t2"]].describe()
_, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 10))
ax[0].scatter(x=hourly_df_raw.t1, y=hourly_df_raw.hires, alpha=0.5)
ax[1].scatter(x=hourly_df_raw.t2, y=hourly_df_raw.hires, alpha=0.5)
fontsize = 14
ax[0].set_xlabel("Actual temperature (celsius)", fontsize=fontsize)
ax[0].set_title("Count of cycle hires vs actual temperature",
fontsize=fontsize)
ax[1].set_title("Count of cycle hires vs 'feels like' temperature",
fontsize=fontsize)
ax[1].set_xlabel("'Feels like' temperature (celsius)", fontsize=fontsize)
ax[0].set_ylabel("Count of cycle hires", fontsize=fontsize)
plt.show()
hourly_df_raw.insert(2, "temp", hourly_df_raw.t1)
hourly_df_raw.drop(["t1", "t2"], axis=1, inplace=True)
hourly_df_raw.head()
def scatterplot_feat_vs_hires(df: pd.DataFrame, feat_name: str, feat_label: str = None):
if feat_label is None:
feat_label = feat_name
if feat_name not in df.columns:
raise KeyError(f"'{feat_name}' is not a column in the dataframe.")
_, ax = plt.subplots(figsize=(10, 5))
sns.regplot(x=feat_name, y="hires", data=df, ci=95,
scatter_kws={"alpha": 0.2}, line_kws={"color": "red", "lw": 2, "alpha": 0.8})
fontsize = 14
ax.set_xlabel(feat_label, fontsize=fontsize)
ax.set_ylabel("Count of cycle hires", fontsize=fontsize)
ax.set_title(f"Count of cycle hires vs {feat_label}", fontsize=fontsize)
plt.show()
scatterplot_feat_vs_hires(hourly_df_raw, "temp")
scatterplot_feat_vs_hires(hourly_df_raw, "hum")
hourly_df_raw.wind_speed.describe()
ax = sns.histplot(hourly_df_raw.wind_speed, kde=True)
ax.set_title("Distribution of wind speed")
hourly_df_raw.wind_speed.value_counts(bins=5, sort=False)
scatterplot_feat_vs_hires(hourly_df_raw, "wind_speed")
def get_n_sample_of_windspeed_interval(n: int, lo: float, hi: float):
df_filtered_on_interval = hourly_df_raw[((hourly_df_raw.wind_speed >= lo)
& (hourly_df_raw.wind_speed < hi))]
# Sampling with replacement.
n_samples = df_filtered_on_interval.sample(n=n, random_state=44, replace=True)
return n_samples[["wind_speed", "hires"]]
n = 100 # Large n is ok since we are sampling with replacement.
samples = [get_n_sample_of_windspeed_interval(n, i, i+10)
for i in range(0, 60, 10)]
balanced_windspeed_df = pd.concat(samples)
scatterplot_feat_vs_hires(balanced_windspeed_df, "wind_speed")
season_mapping = {
0: "spring",
1: "summer",
2: "fall",
3: "winter",
}
weather_code_mapping = {
1: "clear",
2: "scattered clouds",
3: "broken clouds",
4: "cloudy",
7: "rain",
10: "rain with thunderstorm",
26: "snowfall",
94: "freezing fog",
}
hourly_df_raw["weather_code_str"] = hourly_df_raw.weather_code.map(weather_code_mapping)
hourly_df_raw["season_str"] = hourly_df_raw.season.map(season_mapping)
ax = sns.barplot(data=hourly_df_raw, x="hires", y="weather_code_str")
ax.set_title("Count of cycle hires by weather");
ax = sns.barplot(data=hourly_df_raw, x="is_holiday", y="hires")
ax.set_title("Count of cycle hires by holidays");
ax = sns.barplot(data=hourly_df_raw, x="is_holiday", y="hires", hue="season_str")
ax.set_title("Count of cycle hires by holidays grouped by seasons");
ax = sns.barplot(data=hourly_df_raw, x="is_weekend", y="hires")
ax.set_title("Count of cycle hires by weekends");
ax = sns.barplot(data=hourly_df_raw, x="is_weekend", y="hires", hue="season_str")
ax.set_title("Count of cycle hires by weekends grouped by seasons");
hourly_df_raw.drop(["season_str", "weather_code_str"], axis=1, inplace=True)
hourly_df_raw.head()
hourly_df_raw.insert(1, "hour", hourly_df_raw.timestamp.dt.hour)
hourly_df_raw.head()
ax = sns.barplot(data=hourly_df_raw, x="hour", y="hires", color="#ef553b")
ax.set_title("Count of rented bikes by hour");
target = hourly_df_raw["hires"]
features = hourly_df_raw.drop(["hires", "timestamp"], axis=1)
features.head()
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
X_train, X_test, y_train, y_test = train_test_split(
features, target, test_size=0.3, random_state=44)
X_train_sm = sm.add_constant(X_train)
X_test_sm = sm.add_constant(X_test)
model_sm = sm.OLS(y_train, X_train).fit()
y_pred = model_sm.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)
print(f"RMSE naive linear regression: {rmse}")
print(f"R^2 score: {r2}")
target.describe()
from sklearn.preprocessing import PolynomialFeatures
rmses = []
r2_scores = []
degrees = [1, 2, 3, 4]
for d in degrees:
poly = PolynomialFeatures(degree=d)
X_train_tmp = poly.fit_transform(X_train)
X_test_tmp = poly.transform(X_test)
# No need to do sm.add_constant() as that is
# already done by PolynomialFeatures.
model_sm = sm.OLS(y_train, X_train_tmp).fit()
y_pred = model_sm.predict(X_test_tmp)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)
rmses.append(rmse)
r2_scores.append(r2)
for d, rmse, r2 in zip(degrees, rmses, r2_scores):
print(f"Degree {d} | RMSE: {rmse}, r^2: {r2}")
ax = sns.lineplot(x=degrees, y=rmses)
ax.set(xticks=degrees)
ax.set_title("RMSE vs degree of power terms considered")
ax.set_xlabel("Degree of power terms")
ax.set_ylabel("Root mean squared error (RMSE)")
ax = sns.lineplot(x=degrees, y=r2_scores)
ax.set(xticks=degrees)
ax.set_title("$R^2$ scores vs degree of power term considered")
ax.set_xlabel("Degree of power term")
ax.set_ylabel("$R^2$ scores")
from sklearn.preprocessing import OneHotEncoder
categorical_cols = X_train.columns[X_train.dtypes == "category"].tolist()
enc = OneHotEncoder(handle_unknown="error", sparse=False)
enc_X_train = enc.fit_transform(X_train[categorical_cols])
# Encoded X_train above is a numpy array. To convert it back to a DataFrame,
# we use pd.DataFrame with the transformed columns and original X_train index.
# The index is important when concatenating to the original X_train dataframe
# containing the non-categorical columns as the join is on index.
enc_X_train = pd.DataFrame(enc_X_train,
columns=enc.get_feature_names_out(categorical_cols),
index=X_train.index)
enc_X_train = pd.concat([X_train, enc_X_train], axis=1)\
.drop(categorical_cols, axis=1)
enc_X_train.head()
def preprocess_new_sample(new_sample, ohe_encoder):
for col in ["weather_code", "is_holiday", "is_weekend", "season"]:
new_sample[col] = new_sample[col].astype(int).astype("category")
categorical_cols = new_sample.columns[new_sample.dtypes
== "category"].tolist()
enc_new_sample = ohe_encoder.transform(new_sample[categorical_cols])
index = new_sample.index
enc_new_sample = pd.DataFrame(enc_new_sample,
columns=ohe_encoder.get_feature_names_out(
categorical_cols),
index=index)
enc_df = pd.concat([new_sample, enc_new_sample], axis=1)\
.drop(categorical_cols, axis=1)
return enc_df
test_sample = pd.DataFrame(
dict(hour=13,
temp=15,
hum=21,
wind_speed=17.1,
weather_code=3,
is_holiday=0,
is_weekend=0,
season=3),
index=[0]
)
preprocess_new_sample(test_sample, enc)
enc_X_test = preprocess_new_sample(X_test, enc)
enc_X_test.head()
enc_X_train_sm = sm.add_constant(enc_X_train)
enc_X_test_sm = sm.add_constant(enc_X_test)
model_sm = sm.OLS(y_train, enc_X_train_sm).fit()
y_pred = model_sm.predict(enc_X_test_sm)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)
print(f"RMSE linear regression on one hot encoded data: {rmse}")
print(f"R^2 score: {r2}")
poly = PolynomialFeatures(degree=3)
enc_X_train_tmp = poly.fit_transform(enc_X_train)
enc_X_test_tmp = poly.transform(enc_X_test)
# No need to do sm.add_constant() as that is
# already done by PolynomialFeatures.
model_sm = sm.OLS(y_train, enc_X_train_tmp).fit()
y_pred = model_sm.predict(enc_X_test_tmp)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)
print("RMSE linear regression on one-hot encoded + cubic power term data:"
+ f" {rmse}")
print(f"R^2 score: {r2}")
poly = PolynomialFeatures(degree=3)
X_train_deg3 = poly.fit_transform(X_train)
X_test_deg3 = poly.transform(X_test)
# No need to do sm.add_constant() as that is
# already done by PolynomialFeatures.
model_sm = sm.OLS(y_train, X_train_deg3).fit()
y_pred = model_sm.predict(X_test_deg3)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)
print("RMSE linear regression on cubic power term transformation:"
+ f" {rmse}")
print(f"R^2 score: {r2}")
def adjusted_r2_score(r2, n, p):
return 1 - (1-r2) * ((n-1.)/(n-p-1.))
adjusted_r2 = adjusted_r2_score(r2, len(y_test), X_test_deg3.shape[1])
print(f"Adjusted R^2 score: {adjusted_r2}")
hourly_df_lag = hourly_df_raw.copy()
ma_lags = [2, 5, 10]
for i in ma_lags:
hourly_df_lag["hires_ma_" + str(i)] = hourly_df_lag.hires.rolling(i).mean()
hourly_df_lag.dropna(inplace=True)
target = hourly_df_lag.hires
features = hourly_df_lag.drop(["hires", "timestamp"], axis=1)
X_train_lag, X_test_lag, y_train_lag, y_test_lag = train_test_split(
features, target, test_size=0.3, random_state=44)
X_train_lag_sm = sm.add_constant(X_train_lag)
X_test_lag_sm = sm.add_constant(X_test_lag)
model_lag_sm = sm.OLS(y_train_lag, X_train_lag_sm).fit()
y_pred = model_lag_sm.predict(X_test_lag_sm)
rmse = mean_squared_error(y_test_lag, y_pred, squared=False)
r2 = r2_score(y_test_lag, y_pred)
print(f"RMSE linear regression on data with lag: {rmse}")
print(f"R^2 score: {r2}")
mask = np.triu(np.ones_like(hourly_df_lag.corr(), dtype=np.bool))
ax = sns.heatmap(hourly_df_lag.corr(), mask=mask, annot=True, cmap="BrBG")
model_sm = sm.OLS(y_train, X_train_deg3).fit()
y_pred = model_sm.predict(X_test_deg3)
def residuals_vs_fitted_plot(y_pred, y_true, ax=None):
if ax is None:
_, ax = plt.subplots()
sns.residplot(
x=y_pred,
y=y_true,
lowess=True,
scatter_kws={"alpha": 0.2},
line_kws={"color": "red", "lw": 2, "alpha": 0.8},
ax=ax,
)
ax.set_title("Residuals vs Fitted plot")
ax.set_xlabel("Fitted values")
ax.set_ylabel("Residuals");
residuals_vs_fitted_plot(y_pred, y_test)
from statsmodels.graphics.gofplots import qqplot
std_residuals = model_sm.get_influence().resid_studentized_internal
qq = qqplot(std_residuals, line="s", alpha=0.2);
sns.kdeplot(y_train)
# Observe that min = 0, so we can't apply Boxcox
target.min()
import scipy.stats as stats
y_train_yj, lambda_yj = stats.yeojohnson(y_train)
sns.kdeplot(y_train_yj)
model_sm = sm.OLS(y_train_yj, X_train_deg3).fit()
y_pred = model_sm.predict(X_test_deg3)
y_test_yj = stats.yeojohnson(y_test, lmbda=lambda_yj)
rmse = mean_squared_error(y_test_yj, y_pred, squared=False)
r2 = r2_score(y_test_yj, y_pred)
adjusted_r2 = adjusted_r2_score(r2, len(y_test_yj), X_test.shape[1])
print(("RMSE linear model on Yeo-Johnson transformed data"
+ f" + cubic power term features: {rmse:.5f}"))
print(f"R^2 score: {r2:.5f}")
print(f"Adjusted R^2 score: {adjusted_r2:.5f}")
residuals_vs_fitted_plot(y_pred, y_test_yj)
std_residuals = model_sm.get_influence().resid_studentized_internal
qq = qqplot(std_residuals, line="s", alpha=0.2);
test_sample = pd.DataFrame(
dict(hour=13,
temp=15,
hum=21,
wind_speed=17.1,
weather_code=3,
is_holiday=0,
is_weekend=0,
season=3),
index=[0]
)
def inverse_yeojohnson(y, lam):
return (y*lam + 1)**(1/lam) - 1
def pipeline(new_sample):
new_sample = poly.transform(new_sample) # Auto add constant
y_pred = model_sm.predict(new_sample)
return inverse_yeojohnson(y_pred, lambda_yj)
pipeline(test_sample)