Tuning Hyperparameters with Optuna
import pandas as pd
import numpy as np
import optuna
from sklearn import compose
from sklearn import impute
from sklearn import metrics
from sklearn import model_selection
from sklearn import pipeline
from sklearn import preprocessing
import xgboost as xgb
import catboost as cat
# Handy constant for turning the GPU on and off. When `False`, the
# notebook ignores the GPU even when one is present.
GPU_ENABLED = True
# Work on a 10% sample of the training data to keep the tuning runs fast.
train = pd.read_csv("train.csv").sample(frac=0.10, random_state=42)
cont_features = [f for f in train.columns.tolist() if f.startswith('cont')]
cat_features = [f for f in train.columns.tolist() if f.startswith('cat')]
y = train.target
# X keeps the full frame; the ColumnTransformer below selects only the
# `cont*` and `cat*` columns, so the target never leaks into the features.
X = train
numerical_preprocessor = pipeline.Pipeline(steps=[
    ("imputer", impute.SimpleImputer(strategy="mean")),
    ("scaler", preprocessing.MinMaxScaler())
])
categorical_preprocessor = pipeline.Pipeline(steps=[
    ("imputer", impute.SimpleImputer(strategy="most_frequent")),
    ("ordinal", preprocessing.OrdinalEncoder())
])
preprocessor = compose.ColumnTransformer(
    transformers=[
        ("numerical_preprocessor", numerical_preprocessor, cont_features),
        ("categorical_preprocessor", categorical_preprocessor, cat_features)
    ]
)
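Before wiring the preprocessor into the training loop, it can be worth a quick sanity check in isolation. A minimal sketch; the expected column count is just the number of `cont*` plus `cat*` features:
# Fit on the sampled frame and confirm one output column per feature.
Xt = preprocessor.fit_transform(X, y)
print(Xt.shape)  # expect (len(X), len(cont_features) + len(cat_features))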
def train_model_for_study(X, y, model):
    """
    Hold out 20% of the data, fit `model` with early stopping on the
    hold-out set, and return the validation RMSE. Works for both
    `XGBRegressor` and `CatBoostRegressor`, whose `fit` methods accept
    the same `early_stopping_rounds`, `eval_set`, and `verbose` arguments.
    """
    X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
        X,
        y,
        test_size=0.20,
        random_state=42
    )
    # Fit the preprocessor on the training split only, so no validation
    # statistics leak into the imputers and scaler.
    X_train = preprocessor.fit_transform(X_train, y_train)
    X_valid = preprocessor.transform(X_valid)
    model.fit(
        X_train,
        y_train,
        early_stopping_rounds=300,
        eval_set=[(X_valid, y_valid)],
        verbose=False
    )
    yhat = model.predict(X_valid)
    # squared=False makes this the root mean squared error.
    return metrics.mean_squared_error(y_valid, yhat, squared=False)
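A quick way to exercise the helper before tuning anything, using an arbitrary small baseline model (the `n_estimators=100` here is purely illustrative):
baseline = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, random_state=42)
print(train_model_for_study(X, y, baseline))  # baseline validation RMSE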
def objective_xgb(trial):
    """
    Objective function to tune an `XGBRegressor` model.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 1000, 10000),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 100.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 100.0, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0, step=0.1),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1.0, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 9),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
    }
    if GPU_ENABLED:
        params["tree_method"] = "gpu_hist"
        params["predictor"] = "gpu_predictor"
    model = xgb.XGBRegressor(
        booster="gbtree",
        objective="reg:squarederror",
        random_state=42,
        **params
    )
    return train_model_for_study(X, y, model)
def objective_cat(trial):
    """
    Objective function to tune a `CatBoostRegressor` model.
    """
    params = {
        "iterations": trial.suggest_int("iterations", 4000, 25000),
        "od_wait": trial.suggest_int("od_wait", 500, 2300),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 1),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-5, 100),
        "subsample": trial.suggest_float("subsample", 0, 1),
        "random_strength": trial.suggest_float("random_strength", 10, 50),
        "depth": trial.suggest_int("depth", 1, 15),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 30),
        "leaf_estimation_iterations": trial.suggest_int("leaf_estimation_iterations", 1, 15),
    }
    if GPU_ENABLED:
        params["task_type"] = "GPU"
        # The Poisson bootstrap is only supported on GPU.
        params["bootstrap_type"] = "Poisson"
    model = cat.CatBoostRegressor(
        loss_function="RMSE",
        random_state=42,
        **params,
    )
    return train_model_for_study(X, y, model)
study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=5)
study_xgb.best_params
[I 2021-09-01 17:55:42,724] A new study created in memory with name: no-name-0e3cc451-aa00-406c-93f5-ef1f11526c9c
[I 2021-09-01 17:55:48,279] Trial 0 finished with value: 0.738416091458543 and parameters: {'n_estimators': 7992, 'reg_alpha': 1.3133580413265216e-06, 'reg_lambda': 16.290093746736527, 'subsample': 0.6, 'learning_rate': 0.16027908156522835, 'max_depth': 5, 'colsample_bytree': 0.6595439362298875}. Best is trial 0 with value: 0.738416091458543.
[I 2021-09-01 17:55:58,392] Trial 1 finished with value: 0.7373312498627275 and parameters: {'n_estimators': 5838, 'reg_alpha': 4.396167769884513e-08, 'reg_lambda': 5.326525015477008e-05, 'subsample': 0.7, 'learning_rate': 0.06114842392791542, 'max_depth': 7, 'colsample_bytree': 0.42392061015953364}. Best is trial 1 with value: 0.7373312498627275.
[I 2021-09-01 17:56:17,568] Trial 2 finished with value: 0.7902618092751004 and parameters: {'n_estimators': 6637, 'reg_alpha': 1.2146118678935444e-07, 'reg_lambda': 1.0920854782930351e-05, 'subsample': 0.8, 'learning_rate': 0.5532814143906422, 'max_depth': 9, 'colsample_bytree': 0.7532729084007421}. Best is trial 1 with value: 0.7373312498627275.
[I 2021-09-01 17:56:25,627] Trial 3 finished with value: 0.7358492670026711 and parameters: {'n_estimators': 4955, 'reg_alpha': 0.0031430412204915526, 'reg_lambda': 1.1775420703839643, 'subsample': 0.7, 'learning_rate': 0.09235910120865394, 'max_depth': 6, 'colsample_bytree': 0.4168448487051154}. Best is trial 3 with value: 0.7358492670026711.
[I 2021-09-01 17:56:51,362] Trial 4 finished with value: 0.7375163511817688 and parameters: {'n_estimators': 6086, 'reg_alpha': 0.48387553975401604, 'reg_lambda': 0.0072717179065714625, 'subsample': 0.6, 'learning_rate': 0.03840690780111572, 'max_depth': 9, 'colsample_bytree': 0.5856589040681989}. Best is trial 3 with value: 0.7358492670026711.
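With the study finished, the tuned settings can go straight back into a fresh regressor. A minimal sketch, noting that the GPU options were set outside `trial.suggest_*` and therefore do not appear in `best_params`, so they have to be reapplied:
best_params = dict(study_xgb.best_params)
if GPU_ENABLED:
    best_params["tree_method"] = "gpu_hist"
    best_params["predictor"] = "gpu_predictor"
best_xgb = xgb.XGBRegressor(
    booster="gbtree",
    objective="reg:squarederror",
    random_state=42,
    **best_params,
)
print(train_model_for_study(X, y, best_xgb))  # RMSE with the tuned settings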
study_cat = optuna.create_study(direction="minimize")
study_cat.optimize(objective_cat, n_trials=1)
study_cat.best_params
[I 2021-09-01 18:13:34,984] A new study created in memory with name: no-name-da3b86a4-9a22-45ed-af28-c382ae8df078
Custom logger is already specified. Specify more than one logger at same time is not thread safe.
[I 2021-09-01 18:13:55,191] Trial 0 finished with value: 0.7350755084577055 and parameters: {'iterations': 24686, 'od_wait': 1507, 'learning_rate': 0.2910523692193494, 'reg_lambda': 78.53417683107563, 'subsample': 0.803401786024069, 'random_strength': 35.357782114945834, 'depth': 4, 'min_data_in_leaf': 22, 'leaf_estimation_iterations': 11}. Best is trial 0 with value: 0.7350755084577055.
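The same pattern works for the CatBoost study; again the GPU-only settings have to be reapplied by hand. A sketch:
best_cat = cat.CatBoostRegressor(
    loss_function="RMSE",
    random_state=42,
    **study_cat.best_params,
)
if GPU_ENABLED:
    best_cat.set_params(task_type="GPU", bootstrap_type="Poisson")
print(train_model_for_study(X, y, best_cat))  # RMSE with the tuned settings
Optuna's built-in plotting helpers, such as optuna.visualization.plot_optimization_history(study_xgb) and optuna.visualization.plot_param_importances(study_xgb), are a convenient way to inspect either study afterwards.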