import pandas as pd
from pandas.api.types import is_object_dtype
import numpy as np
from easydict import EasyDict as edict
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import optuna
import warnings
# NOTE(review): globally silences ALL warnings (including deprecation
# notices from sklearn/xgboost/optuna) — convenient in a notebook, but it
# hides exactly the messages that flag API breakage on upgrades.
warnings.filterwarnings('ignore')
# File paths kept in an EasyDict so they read with attribute access
# (config.main.TRAIN_FILE); EasyDict converts the nested dict recursively.
config = edict({
    "main": {
        "TRAIN_FILE": "/work/train.csv",
        "TEST_FILE": "/work/test.csv",
    },
})
# Load the training frame and pull out the label column for later use.
df = pd.read_csv(config.main.TRAIN_FILE)
target = df["target"]
# For the baseline we just encode our categorical features so that they can be handled by our model later on
def feature_engineering(dataframe):
    """Integer-encode every object-dtype column and select the model features.

    Encoding is done in place on *dataframe*: each string column is replaced
    by alphabetical integer codes (``pd.factorize(sort=True)`` produces the
    same codes as sklearn's ``LabelEncoder.fit_transform``, without needing
    a separate fit + transform pass).

    Parameters
    ----------
    dataframe : pd.DataFrame
        Raw frame as read from CSV; mutated in place.

    Returns
    -------
    tuple[pd.DataFrame, pd.Index]
        The encoded frame and the feature columns at positions 1..29
        (presumably everything between an id column and the target —
        TODO confirm against the actual CSV layout).
    """
    for col in dataframe.columns:
        if is_object_dtype(dataframe[col]):
            # sort=True -> codes assigned in sorted label order, matching
            # the original LabelEncoder behaviour exactly.
            dataframe[col] = pd.factorize(dataframe[col], sort=True)[0]
    features = dataframe.columns[1:30]
    return dataframe, features
# Encode the categorical columns (in place) and grab the feature columns
# used for modelling below.
df, features = feature_engineering(df)
# What is Optuna? Optuna is a hyper-parameter optimisation framework that
# searches the parameter space by sampling "trials" and pruning bad ones.
def objective(trial, data=df[features], target=target):
    """Optuna objective: train an XGBoost classifier on a stratified 80/20
    split and return the hold-out ROC-AUC (the study maximises this).

    NOTE: the defaults are evaluated once, at definition time, so they
    snapshot ``df[features]`` / ``target`` as they exist right now.

    Parameters
    ----------
    trial : optuna.Trial
        Supplies the sampled hyper-parameters.
    data, target : array-like
        Features and binary labels; default to the module-level frame.

    Returns
    -------
    float
        Validation ROC-AUC of the fitted model.
    """
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2, random_state=95, stratify=target
    )
    param = {
        'objective': "binary:logistic",
        'eval_metric': 'auc',
        'seed': 95,
        # early_stopping_rounds belongs on the estimator in xgboost >= 1.6;
        # passing it to fit() is deprecated and removed in xgboost 2.x.
        'early_stopping_rounds': 100,
        'n_estimators': trial.suggest_int("n_estimators", 500, 10000),
        # suggest_loguniform is deprecated in Optuna — suggest_float(log=True)
        # is the supported equivalent (same distribution).
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006, 0.008, 0.01, 0.014, 0.017, 0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [5, 10, 15, 20, 25]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    model = xgb.XGBClassifier(**param)
    # Early stopping monitors AUC on the held-out fold via eval_set.
    model.fit(train_x, train_y, eval_set=[(test_x, test_y)], verbose=0)
    preds = model.predict_proba(test_x)[:, 1]
    auc = roc_auc_score(test_y, preds)
    return auc
# %%time  — IPython cell magic (times the cell); it is invalid syntax
# outside a notebook, so it is kept here only as a comment.
# Run the search: maximise validation AUC for up to one hour of wall-clock
# time (timeout-based — no fixed number of trials).
study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=3600)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)
# NOTE(review): the plot_* calls return Plotly figures; they render inline
# in a notebook but are silently discarded when run as a plain script.
optuna.visualization.plot_optimization_history(study)
#Visualize parameter importances.
optuna.visualization.plot_param_importances(study)
# Best hyper-parameters found by the study; the trailing bare expression
# is notebook-style display only.
params=study.best_params
params