#4 K-fold cross-validation - My machine learning pipeline
Imports
import pandas as pd
from pandas.api.types import is_object_dtype
import numpy as np
from easydict import EasyDict as edict
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
Config
config = edict()
config.main = edict()
config.main.TRAIN_FILE = "/work/train.csv"
config.main.TEST_FILE = "/work/test.csv"
config.main.N_FOLDS = 5
config.model = edict()
config.model.XGB_PARAM = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "seed": 95,
    "n_estimators": 7434,
    "reg_alpha": 6.567985477578714e-05,
    "reg_lambda": 0.0909202163062362,
    "colsample_bytree": 0.3,
    "subsample": 1.0,
    "learning_rate": 0.014,
    "max_depth": 5,
    "min_child_weight": 289
}
Loading data
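The loading step is not reproduced in this snippet; a minimal sketch, assuming the train file referenced in the config contains a binary target column:
df = pd.read_csv(config.main.TRAIN_FILE) #Read the training set defined in the config
print(df.shape) #Quick sanity check on the number of rows and columns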
Training and validation folds
df = df.sample(frac=1).reset_index(drop=True) #Shuffle the whole dataset (sample(frac=1) returns all rows in random order) before assigning folds
df["split"] = 0
target = df["target"]
skf = StratifiedKFold(n_splits=config.main.N_FOLDS) #We use StratifiedKFold so that the target keeps the same distribution in every fold
for fold, (t_, v_) in enumerate(skf.split(X=df, y=target)):
    df.loc[v_, "split"] = fold
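To confirm that the stratification worked, a quick check of the size and positive rate of each fold (assuming a binary target):
print(df.groupby("split")["target"].agg(["size", "mean"])) #Rows per fold and fraction of positives per fold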
Feature engineering
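The feature_engineering function used in the training loop below is not shown in this snippet; a minimal sketch, assuming it only label-encodes the object columns and returns the frame together with the list of feature names (everything except target and split):
def feature_engineering(df):
    features = [col for col in df.columns if col not in ("target", "split")] #Every column except the target and the fold indicator
    for col in features:
        if is_object_dtype(df[col]): #Encode categorical (object) columns as integers so XGBoost can consume them
            df[col] = LabelEncoder().fit_transform(df[col].astype(str))
    return df, features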
Model & metric
model = xgb.XGBClassifier(**config.model.XGB_PARAM)
auc = roc_auc_score
Training & validation
scores = pd.DataFrame(columns = ['fold', 'score'])
for fold in range(config.main.N_FOLDS):
    print(f"Starting training for fold : {fold}")
    df_train = df[df.split != fold].reset_index(drop=True)
    df_train, features_train = feature_engineering(df_train)
    df_valid = df[df.split == fold].reset_index(drop=True)
    df_valid, features_valid = feature_engineering(df_valid)
    target_train = df_train["target"].values
    target_valid = df_valid["target"].values
    model.fit(df_train[features_train], target_train, eval_set=[(df_valid[features_valid], target_valid)], early_stopping_rounds=200, verbose=500)
    preds = model.predict_proba(df_valid[features_valid])[:, 1] #Predicted probability of the positive class on the validation fold
    score = auc(target_valid, preds)
    #Saving the model
    joblib.dump(model, f"XGB_{fold}_{round(score, 2)}")
    print(f"Validation score : {score} for fold {fold}")
    scores.loc[len(scores)] = [fold, score] #DataFrame.append was removed in recent pandas versions, so we add the row with .loc instead
scores
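With one model saved per fold, the test-set predictions can be averaged over the folds; a rough sketch, assuming the test file holds the same feature columns as the train file (minus the target) and that the XGB_* filenames match the joblib.dump calls above:
import glob
df_test = pd.read_csv(config.main.TEST_FILE)
df_test, features_test = feature_engineering(df_test)
test_preds = np.zeros(len(df_test))
for path in glob.glob("XGB_*"): #One saved model per fold
    fold_model = joblib.load(path)
    test_preds += fold_model.predict_proba(df_test[features_test])[:, 1] / config.main.N_FOLDS #Average the positive-class probabilities across folds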