# Imports
import time
import pandas as pd
from pandas.api.types import is_object_dtype
import numpy as np
from easydict import EasyDict as edict
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
# Config
# Experiment configuration: data file locations plus per-model hyperparameters.
# A single seed (95) is shared across every estimator for comparability.
config = edict()

config.main = edict()
config.main.TRAIN_FILE = "/work/train.csv"
config.main.TEST_FILE = "/work/test.csv"

config.model = edict()
config.model.LOGIT_PARAM = {"penalty": "l2", "random_state": 95, "max_iter": 100}
config.model.DT_PARAM = {"criterion": "gini", "random_state": 95}
config.model.RF_PARAM = {"random_state": 95}
config.model.XGB_PARAM = {"objective": "binary:logistic", "eval_metric": "auc", "seed": 95}
# Data loading
# Load the raw training data (the TEST_FILE from config is not read in this script).
df = pd.read_csv(config.main.TRAIN_FILE)
# Training & validation splits
# Shuffle the rows, then carve out a stratified 20% validation split.
# random_state=95 matches the seed used for every model, so the whole
# experiment (previously unseeded here) is now reproducible end to end.
df = df.sample(frac=1, random_state=95).reset_index(drop=True)
df["split"] = 0
target = df["target"]
# stratify keeps the target distribution identical in train and validation.
train, valid = train_test_split(df, test_size=0.2, stratify=target, random_state=95)
# Flag validation rows in one vectorized assignment instead of one scalar
# .loc write per row (the original loop was O(n) individual lookups).
df.loc[valid.index, "split"] = 1
df_train = df[df.split != 1].reset_index(drop=True)
df_valid = df[df.split == 1].reset_index(drop=True)
# Feature engineering
# For the baseline we just encode our categorical features so that they can be handled by our model later on
def feature_engineering(dataframe):
    """Label-encode object-dtype columns in place and pick the model features.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Input frame; every object-dtype column is overwritten with its
        integer label codes (the frame is mutated in place, not copied).

    Returns
    -------
    (pd.DataFrame, pd.Index)
        The same (mutated) frame, and the feature columns at positions 1..29
        — skips position 0 (presumably an id column; TODO confirm) and any
        trailing columns such as ``target``/``split``.
    """
    cat_features = [col for col in dataframe.columns if is_object_dtype(dataframe[col])]
    for feat in cat_features:
        # fit_transform replaces the original separate fit + transform calls.
        dataframe[feat] = LabelEncoder().fit_transform(dataframe[feat])
    # Hard-coded slice: relies on the dataset layout described above.
    features = dataframe.columns[1:30]
    return dataframe, features
# Models & metric
# Instantiate each candidate classifier from its configured hyperparameters.
model_dict = {
    label: estimator(**params)
    for label, estimator, params in (
        ("LogisticRegression", LogisticRegression, config.model.LOGIT_PARAM),
        ("DTClassifier", DecisionTreeClassifier, config.model.DT_PARAM),
        ("RFClassifier", RandomForestClassifier, config.model.RF_PARAM),
        ("XGBClassifier", xgb.XGBClassifier, config.model.XGB_PARAM),
    )
}
# Single evaluation metric shared by every model comparison below.
auc = roc_auc_score
# Model selection
def compare_models(models, X_train, y_train, X_valid, y_valid, metric):
    """Fit each model, score it on the validation split, and tabulate results.

    Parameters
    ----------
    models : dict[str, estimator]
        Mapping of display name to an unfitted estimator exposing
        ``fit`` and ``predict_proba``.
    X_train, y_train : array-like
        Training features and labels.
    X_valid, y_valid : array-like
        Validation features and labels used for scoring.
    metric : callable
        ``metric(y_true, y_score)`` -> float, e.g. ``roc_auc_score``.

    Returns
    -------
    pd.DataFrame
        One row per model with columns ``score`` and ``model``.
    """
    # Collect rows in a plain list and build the DataFrame once at the end:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
    rows = []
    for name, model in models.items():
        start = time.perf_counter()
        print(f"Training model : {name}")
        model.fit(X_train, y_train)
        # Probability of the positive class (column 1).
        preds = model.predict_proba(X_valid)[:, 1]
        score = metric(y_valid, preds)
        end = time.perf_counter()
        print(f"Validation score : {score} in {end - start:0.4f} seconds")
        rows.append({"score": score, "model": name})
    return pd.DataFrame(rows, columns=["score", "model"])
# Build the model-ready features and targets for both splits. These four
# names (features_*/target_*) were previously referenced without ever being
# defined anywhere in the file, which raises NameError at this call.
df_train, features_train = feature_engineering(df_train)
df_valid, features_valid = feature_engineering(df_valid)
target_train = df_train["target"]
target_valid = df_valid["target"]

scores = compare_models(model_dict, df_train[features_train], target_train,
                        df_valid[features_valid], target_valid, auc)
scores
# And the winner is...
# Visualise the validation score of each model as a bar chart.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=scores, x="model", y="score", ax=ax)
plt.show()