#Basics
import pandas as pd
import numpy as np
#Plotting
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')
#Train Test Split
from sklearn.model_selection import train_test_split
# Imputer
from sklearn.impute import SimpleImputer
# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
OneHotEncoder,
StandardScaler,
FunctionTransformer)
# Classifiers
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
#Pipeline
from sklearn.pipeline import Pipeline
#Grid Search
from sklearn.model_selection import GridSearchCV
# Model evaluation
from sklearn.metrics import plot_confusion_matrix
# Seed handed to train_test_split below so the split is repeatable.
random_state = 42
# Columns excluded from preprocessing; empty means every column is used.
drop_cols = []

# Feature and label tables are both indexed by 'id'.
features = pd.read_csv('/work/Data/training_features.csv', index_col='id')
targets = pd.read_csv('/work/Data/training_labels.csv', index_col='id')

# Left join on the shared index keeps every feature row.
df = features.join(targets, how='left')
y = df['status_group']
X = df.drop(columns='status_group')

# Reserve 10% of the rows as a test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=random_state
)
def convert_categorical_to_string(data):
    """Cast every value in ``data`` to ``str`` while keeping missing values.

    A plain ``astype(str)`` turns NaN/None into the literal strings
    ``'nan'``/``'None'``, which hides them from any imputer that runs
    after this converter. Missing cells are therefore restored to NaN
    once the cast is done.

    Parameters:
    - data: anything accepted by ``pd.DataFrame`` (DataFrame, 2-D array, ...).

    Returns:
    - A DataFrame of the same shape whose non-missing cells are strings
      and whose missing cells are still NaN.
    """
    frame = pd.DataFrame(data)
    # `where` keeps the stringified value only where the source cell was
    # present; everything else falls back to NaN.
    return frame.astype(str).where(frame.notna())
# Wrap the string-casting helper so it can be used as a pipeline step.
CategoricalTypeConverter = FunctionTransformer(func=convert_categorical_to_string)
def classify_columns(df, drop_cols):
    """Split the columns of a dataframe into categorical and numerical.

    Takes a dataframe and a list of columns to drop and returns:
    - cat_cols: A list of categorical columns.
    - num_cols: A list of numerical columns.

    A column is numerical when pandas reports a numeric dtype for it
    (integers, floats, booleans, ...); every other dtype (object,
    category, string, ...) is treated as categorical. Columns named in
    drop_cols appear in neither list. Column order is preserved.
    """
    # A set makes the exclusion test O(1) per column; None means "drop nothing".
    excluded = set(drop_cols) if drop_cols is not None else set()
    cat_cols = []
    num_cols = []
    # Iterating df.dtypes (rather than df[col].dtype) also works when
    # column labels are duplicated.
    for col, dtype in df.dtypes.items():
        if col in excluded:
            continue
        if pd.api.types.is_numeric_dtype(dtype):
            num_cols.append(col)
        else:
            cat_cols.append(col)
    return cat_cols, num_cols
# Column groups are derived from the training partition.
cat_cols, num_cols = classify_columns(X_train, drop_cols)

# Categorical branch: string conversion, constant imputation with
# 'missing', then one-hot encoding that ignores unseen categories.
categorical_pipeline = Pipeline(
    steps=[
        ('typeConverter', CategoricalTypeConverter),
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('standardizer', OneHotEncoder(handle_unknown='ignore', dtype=float)),
    ]
)

# Numerical branch: median imputation followed by standard scaling.
numerical_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('standardizer', StandardScaler()),
    ]
)

# The explicit 'drop' entry is only registered when there is something to drop.
preprocessor = ColumnTransformer(
    transformers=[
        ('numericalPreprocessor', numerical_pipeline, num_cols),
        ('categoricalPreprocessor', categorical_pipeline, cat_cols),
    ] + ([('dropPreprocessor', 'drop', drop_cols)] if drop_cols else [])
)

# The 'passthrough' estimator is a placeholder; the real one is supplied
# through the 'classifier__estimator' entry of the parameter grid.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', OneVsRestClassifier(estimator='passthrough')),
    ]
)
from catboost import CatBoostClassifier
# Search space: one XGBoost configuration plugged into the one-vs-rest
# wrapper of the pipeline.
parameter_grid = [
    {
        'classifier__estimator': [XGBClassifier()],
        'classifier__estimator__max_depth': [20],
        'classifier__estimator__n_estimators': [800],
    }
]

# 5-fold cross-validation scored on accuracy; refit=True retrains the
# best candidate so best_estimator_ is available afterwards.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter_grid,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-2,
    refit=True
)

# NOTE(review): the search is fitted on the full X, y rather than on
# X_train, y_train, so X_test is not held out from this model.
grid_search.fit(X, y)
model = grid_search.best_estimator_
Fitting 5 folds for each of 1 candidates, totalling 5 fits
/root/venv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[21:29:11] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[22:04:55] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[22:37:09] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
/root/venv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[23:13:56] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[23:49:59] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[00:22:28] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
/root/venv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[00:48:33] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[00:55:21] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[01:02:03] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
/root/venv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[01:09:33] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[01:16:37] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[01:23:21] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
/root/venv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[01:30:18] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[01:37:13] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[01:44:06] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
/root/venv/lib/python3.7/site-packages/xgboost/sklearn.py:888: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1].
warnings.warn(label_encoder_deprecation_msg, UserWarning)
[01:51:14] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[01:59:57] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[02:08:17] WARNING: ../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
# Tabulate the cross-validation results and keep a copy on disk.
grid_search_results = pd.DataFrame(grid_search.cv_results_)
grid_search_results.to_csv('./grid_search_results.csv')
# Bare expression left from the notebook: displays the table when run in a cell.
grid_search_results
# Load the features to predict on; rows are indexed by 'id'.
X_validate = pd.read_csv('Data/testing_features.csv', index_col='id')

# Predict with the refitted best pipeline.
y_validate = model.predict(X_validate)

# One 'status_group' prediction per input id, written out as the submission file.
df_predictions = pd.DataFrame(
    y_validate, index=X_validate.index, columns=['status_group']
)
df_predictions.to_csv('Submission.csv')