Imports
import pandas as pd
from pandas.api.types import is_object_dtype
import numpy as np
from easydict import EasyDict as edict
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')
Config
# Central configuration object: `main` holds the input file locations and
# `model` holds the keyword arguments later unpacked into LogisticRegression.
# EasyDict converts the nested plain dicts into attribute-accessible EasyDicts.
config = edict(
    {
        "main": {
            "TRAIN_FILE": "/work/train.csv",
            "TEST_FILE": "/work/test.csv",
        },
        "model": {
            "LOGIT_PARAM": {
                "penalty": "l2",
                "random_state": 95,
                "max_iter": 100,
            },
        },
    }
)
Data loading
# Load the training CSV named in the configuration into a DataFrame.
train_csv_path = config.main.TRAIN_FILE
df = pd.read_csv(train_csv_path)
# Bare expression: shows the first five rows when it is the last statement of
# a notebook cell; it has no effect when the file is run as a plain script.
df.head()
Training & validation splits
# Shuffle every row (frac=1 keeps all of them) and rebuild a clean 0..n-1 index
# so index labels and row positions coincide from here on.
# NOTE(review): neither this shuffle nor train_test_split below is given a
# random_state, so the train/validation partition changes between runs.
df = df.sample(frac=1).reset_index(drop=True)

# "split" flags the fold of each row: 0 = training, 1 = validation.
df["split"] = 0
target = df["target"]

# Stratify on the target so the training and validation folds keep the same
# target distribution.
train, valid = train_test_split(df, test_size=0.2, stratify=target)

# Flag all validation rows in one vectorised assignment instead of writing
# one cell per Python-loop iteration; the result is identical.
df.loc[valid.index, "split"] = 1

df_train = df[df.split != 1].reset_index(drop=True)
df_valid = df[df.split == 1].reset_index(drop=True)
Feature engineering
# For the baseline we just encode our categorical features so that they can be
# handled by our model later on.
def _encode_with_fitted(encoder, column):
    """Encode *column* with an already fitted LabelEncoder; unseen values become -1."""
    known = column.isin(encoder.classes_).to_numpy()
    codes = np.full(len(column), -1, dtype=np.int64)
    if known.any():
        # Only values the encoder has seen are passed to transform(), which
        # raises ValueError on labels it was not fitted on.
        codes[known] = encoder.transform(column[known])
    return codes


def feature_engineering(dataframe, encoders=None):
    """Label-encode every object-dtype column of *dataframe* in place.

    Args:
        dataframe: Frame whose object-dtype columns are replaced by integer
            codes. The frame is modified in place and also returned.
        encoders: Optional dict mapping column name -> fitted LabelEncoder.
            When given, a column already present in the dict is encoded with
            the stored encoder (categories it has not seen are mapped to -1),
            and a column not yet present is fitted on this frame and stored.
            Passing the same dict for the training frame first and for other
            frames afterwards gives every frame the same category -> code
            mapping. When None (the default) a fresh encoder is fitted per
            column on every call, so codes are only comparable within that
            single frame.

    Returns:
        A ``(dataframe, features)`` tuple where ``features`` is
        ``dataframe.columns[1:30]``.
    """
    cat_features = [col for col in dataframe.columns if is_object_dtype(dataframe[col])]
    for feat in cat_features:
        if encoders is not None and feat in encoders:
            dataframe[feat] = _encode_with_fitted(encoders[feat], dataframe[feat])
            continue
        le = LabelEncoder()
        dataframe[feat] = le.fit_transform(dataframe[feat])
        if encoders is not None:
            encoders[feat] = le
    # Positional selection: skips column 0 and keeps at most the next 29
    # columns. Presumably column 0 is a row id and columns 1-29 are the model
    # inputs - TODO confirm against the CSV schema; with fewer than 30 columns
    # this slice would also pick up trailing columns such as target/split.
    features = dataframe.columns[1:30]
    return dataframe, features


# Fit the encoders on the training fold and reuse them for the validation fold
# so that a given category is represented by the same integer in both.
label_encoders = {}
df_train, features_train = feature_engineering(df_train, label_encoders)
df_valid, features_valid = feature_engineering(df_valid, label_encoders)
target_train = df_train["target"].values
target_valid = df_valid["target"].values
Model
# Baseline classifier: a logistic regression built from the keyword arguments
# stored under config.model.LOGIT_PARAM.
logit_kwargs = config.model.LOGIT_PARAM
model = LogisticRegression(**logit_kwargs)
Training & validation
# Fit on the training fold.
train_inputs = df_train[features_train]
model.fit(train_inputs, target_train)

# Score the validation fold: keep column 1 of predict_proba (the second class
# column) and compare it with the true targets through ROC AUC.
valid_inputs = df_valid[features_valid]
valid_proba = model.predict_proba(valid_inputs)
valid_preds = valid_proba[:, 1]
score = roc_auc_score(target_valid, valid_preds)
print(f"Validation AUC : {score}")
Predictions on new data
# Load the unseen data and run it through the same feature-engineering step.
# NOTE(review): this call label-encodes the test file on its own, so the
# integer codes line up with the training codes only when every categorical
# column holds the same set of values in both files.
test_csv_path = config.main.TEST_FILE
df_test = pd.read_csv(test_csv_path)
df_test, features_test = feature_engineering(df_test)

# Keep column 1 of predict_proba (the second class column) as the prediction
# and store it next to the features.
test_proba = model.predict_proba(df_test[features_test])
preds = test_proba[:, 1]
df_test["preds"] = preds
# Bare expression: shows the first five predictions when it is the last
# statement of a notebook cell.
df_test["preds"].head()