import warnings
import functools
import numpy as np
import pandas as pd
from group_lasso import GroupLasso
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import matplotlib
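# Compatibility shim: older scikit-learn exposes RMSE only via
# mean_squared_error(squared=False); newer versions ship
# sklearn.metrics.root_mean_squared_error directly.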
root_mean_squared_error = functools.partial(mean_squared_error, squared=False)
font = {"weight": "bold", "size": 22}
matplotlib.rc("font", **font)
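# Load the medical-insurance dataset and inspect summary statistics.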
dataset = pd.read_csv("/datasets/insurance/insurance.csv")
dataset.describe(include="all")
corr = dataset.corr(numeric_only=True)  # restrict to numeric columns (required in pandas >= 2.0)
corr.style.background_gradient(cmap='coolwarm')
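# Feature engineering: encode the binary categoricals (sex, smoker) as booleans
# and one-hot encode the four-valued region column.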
data = dataset.copy()
data["is_male"] = data["sex"] == "male"
del data["sex"]
data["is_smoker"] = data["smoker"] == "yes"
del data["smoker"]
data = pd.get_dummies(data, columns=["region"])
y = data["charges"]
del data["charges"]
X = data.to_numpy(dtype=np.float32)
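# Only 5% of the rows go to training; such a small training set makes
# overfitting likely, so the benefit of regularization is easier to see
# on the large held-out test set.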
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    train_size=0.05,
)
def create_standardized_model(model):
    """Wrap a regressor so that both X and y are standardized before fitting."""
    # Standardize the features.
    model = Pipeline(
        [
            ("std", StandardScaler()),
            ("regr", model),
        ]
    )
    # Standardize the target as well.
    model = TransformedTargetRegressor(
        regressor=model,
        transformer=StandardScaler(),
    )
    return model
num_samples = 50  # number of bootstrap resamples drawn later
m = create_standardized_model(LinearRegression())
m.fit(X_train, y_train)
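# Residual diagnostics for the unregularized baseline: plot standardized
# residuals against the model's predictions.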
plt.figure(figsize=(25,10))
y_pred = m.predict(X_train)
residuals = y_train - y_pred
residuals -= residuals.mean()
residuals /= residuals.std()
plt.axhline(0, linestyle="--", c="k", alpha=0.6)
plt.scatter(y_pred, residuals)
plt.title("Residuals")
plt.xlabel("Value")
plt.ylabel("Standardized residual")
plt.show()
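# Ridge regression: pick the regularization strength λ (scikit-learn's `alpha`)
# by 5-fold cross-validation over a log-spaced grid.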
param_name = "regressor__regr__alpha"
search_space = np.logspace(-3, 3, num=50, base=10)
scoring = "neg_root_mean_squared_error"
cv = 5
model = Ridge(fit_intercept=False)
grid_search = GridSearchCV(
    create_standardized_model(model),
    {param_name: search_space},
    scoring=scoring,
    cv=cv,
)
grid_search.fit(X_train, y_train)
best_ridge_model = grid_search.best_estimator_
print("lambda:", grid_search.best_params_[param_name])
print(" RMSE:", -grid_search.best_score_)
# Output:
# lambda: 4.714866363457395
#   RMSE: 8192.232562038713
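# Cross-validation curve: mean RMSE across folds ± one standard deviation,
# with the selected λ marked by the vertical line.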
mean = -grid_search.cv_results_["mean_test_score"]
std = grid_search.cv_results_["std_test_score"]
plt.figure(figsize=(25,10))
plt.plot(search_space, mean)
plt.fill_between(search_space, mean-std, mean+std, alpha=0.1)
plt.axvline(grid_search.best_params_[param_name], c="k")
plt.xscale("log")
plt.ylabel("RMSE")
plt.title("Expected value of RMSE given λ")
plt.xlabel("λ")
plt.show()
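# Ridge coefficient paths: refit over a range of λ values and track how each
# coefficient shrinks as the penalty grows.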
coefs = []
space = np.logspace(-1, 4, 50)
for alpha in space:
    model = create_standardized_model(Ridge(alpha, fit_intercept=False))
    model.fit(X_train, y_train)
    coefs.append(model.regressor_["regr"].coef_)
coefs = np.array(coefs)
plt.figure(figsize=(25,10))
for i, column_name in enumerate(data.columns):
    plt.plot(space, coefs[:, i], label=column_name)
plt.xscale("log")
plt.title("Coefficient values by λ for Ridge regression")
plt.xlabel("λ")
plt.ylabel("Coefficients")
plt.axvline(grid_search.best_params_[param_name], c="k")
plt.legend()
plt.show()
def bootstrap(
    model,
    X,
    y,
    num_samples,
    cv=5,
    scoring="neg_root_mean_squared_error",
    search_space=np.logspace(-3, 1, num=50, base=10),
    param_name="regressor__regr__alpha",
):
    """Tune and refit `model` on `num_samples` bootstrap resamples of (X, y).

    Returns the fitted coefficients and the selected regularization
    strength for each resample.
    """
    coefs = []
    params = []
    for _ in range(num_samples):
        grid_search = GridSearchCV(
            create_standardized_model(model),
            {param_name: search_space},
            scoring=scoring,
            cv=cv,
        )
        X_, y_ = resample(X, y)  # sample with replacement
        grid_search.fit(X_, y_)
        params.append(grid_search.best_params_[param_name])
        coefs.append(grid_search.best_estimator_.regressor_["regr"].coef_)
    return coefs, params
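# Bootstrap the tuned ridge fit to gauge how stable each coefficient is.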
coefs, params = bootstrap(
    Ridge(max_iter=10000, fit_intercept=False), X_train, y_train, num_samples
)
plt.figure(figsize=(25, 11))
plt.boxplot(np.stack(coefs, axis=0), labels=data.columns)
plt.title("Boxplot for bootstrapped ridge coefficient values")
plt.ylabel("Coefficient value")
plt.xticks(rotation=45)
plt.show()
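# Lasso: same cross-validated search over λ; unlike ridge, the L1 penalty can
# drive coefficients exactly to zero.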
param_name = "regressor__regr__alpha"
search_space = np.logspace(-3, 1, num=50, base=10)
scoring = "neg_root_mean_squared_error"
cv = 5
model = Lasso(fit_intercept=False)
grid_search = GridSearchCV(
    create_standardized_model(model),
    {param_name: search_space},
    scoring=scoring,
    cv=cv,
)
grid_search.fit(X_train, y_train)
best_lasso_model = grid_search.best_estimator_
print("lambda:", grid_search.best_params_[param_name])
print(" RMSE:", -grid_search.best_score_)
# Output:
# lambda: 0.0625055192527397
#   RMSE: 7649.917352752478
mean = -grid_search.cv_results_["mean_test_score"]
std = grid_search.cv_results_["std_test_score"]
plt.figure(figsize=(25,10))
plt.plot(search_space, mean)
plt.fill_between(search_space, mean-std, mean+std, alpha=0.1)
plt.axvline(grid_search.best_params_[param_name], c="k")
plt.xscale("log")
plt.ylabel("RMSE")
plt.title("Expected value of RMSE given λ")
plt.xlabel("λ")
plt.show()
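# Lasso coefficient paths: note coefficients hitting exactly zero as λ grows.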
space = np.logspace(-3, 1, 50)
coefs = []
for alpha in space:
    model = create_standardized_model(Lasso(alpha, fit_intercept=False))
    model.fit(X_train, y_train)
    coefs.append(model.regressor_["regr"].coef_)
coefs = np.array(coefs)
plt.figure(figsize=(25,10))
for i, column_name in enumerate(data.columns):
    plt.plot(space, coefs[:, i], label=column_name)
plt.xscale("log")
plt.xlabel("λ")
plt.title("Coefficient values by λ for Lasso regression")
plt.ylabel("Coefficients")
plt.axvline(grid_search.best_params_[param_name], c="k")
plt.legend()
plt.show()
coefs, params = bootstrap(
    Lasso(max_iter=10000, fit_intercept=False), X_train, y_train, num_samples
)
plt.figure(figsize=(25,11))
plt.boxplot(np.stack(coefs, axis=0), labels=data.columns)
plt.title("Boxplot for bootstrapped lasso coefficient values")
plt.ylabel("Coefficient value")
plt.xticks(rotation=45)
plt.show()
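# Group lasso: penalize the region dummies jointly so they enter or leave the
# model together; two hyperparameters control the plain L1 penalty (l1_reg)
# and the group penalty (group_reg).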
model = create_standardized_model(
    GroupLasso(
        # None marks ungrouped features (age, bmi, children, is_male,
        # is_smoker); the four region dummies share group 1.
        groups=np.array([None, None, None, None, None, 1, 1, 1, 1]),
        supress_warning=True,  # sic: the group-lasso package spells it this way
        n_iter=100,
        random_state=0,
    )
)
param_1 = "regressor__regr__l1_reg"
param_2 = "regressor__regr__group_reg"
search_space = np.logspace(-3, -1, num=10)
scoring = "neg_root_mean_squared_error"
cv = 5
grid_search = GridSearchCV(
    model,
    {param_1: search_space, param_2: search_space},
    scoring=scoring,
    cv=cv,
)
with warnings.catch_warnings():
    warnings.filterwarnings("ignore")
    grid_search.fit(X_train, y_train)
print("lambda_regular:", grid_search.best_params_[param_1])
print(" lambda_group:", grid_search.best_params_[param_2])
print(" RMSE:", -grid_search.best_score_)
best_group_model = grid_search.best_estimator_
# Output:
# lambda_regular: 0.021544346900318832
#   lambda_group: 0.1
#           RMSE: 7595.59541101182
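# Final comparison: refit the plain linear baseline and report held-out RMSE
# for all four models.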
mlr = create_standardized_model(LinearRegression())
mlr.fit(X_train, y_train)
print("Root Mean Squared Error:")
print("MLR:\t\t", root_mean_squared_error(y_test, mlr.predict(X_test)))
print("Ridge:\t\t", root_mean_squared_error(y_test, best_ridge_model.predict(X_test)))
print("Lasso:\t\t", root_mean_squared_error(y_test, best_lasso_model.predict(X_test)))
print("Group Lasso:\t", root_mean_squared_error(y_test, best_group_model.predict(X_test)))
# Output:
# Root Mean Squared Error:
# MLR:          6450.789539072769
# Ridge:        6278.762360678457
# Lasso:        6223.883671218353
# Group Lasso:  6250.8908500688