import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import tree
from xgboost import XGBClassifier, plot_tree
import matplotlib.pyplot as plt
# Load the Kaggle credit-card fraud dataset (features already PCA-transformed).
df = pd.read_csv("creditcard_Kaggle_PCA.csv")

# Separate the feature columns from the class label (last column), then
# hold out a third of the rows as the test set with a fixed seed for
# reproducibility.
feature_cols = df.columns[:-1]
features = df[feature_cols]
label = df[df.columns[-1]]
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.33, random_state=77
)
# Explorative Analyse (exploratory analysis of the dataset)
# In a plain script (unlike a notebook cell) bare expressions are discarded,
# so print the summaries explicitly.
print(df.describe())               # per-column summary statistics
print(df['Class'].value_counts())  # class balance (fraud vs. non-fraud counts)

# Compute the correlation matrix once and reuse it for the heatmap
# (the original recomputed df.corr() for the plot).
corr = df.corr()
print(corr)

plt.rcParams["figure.figsize"] = (25, 10)
plt.matshow(corr)  # visualise pairwise feature correlations
plt.show()
# Einzelner Entscheidungsbaum (single decision tree baseline)
# Baseline model: fit an unpruned CART decision tree on the training split.
dec_tree = tree.DecisionTreeClassifier()
dec_tree = dec_tree.fit(X_train, y_train)

# Predict on the held-out test set. round() is a safeguard only —
# predict() already returns hard 0/1 class labels.
y_predict = dec_tree.predict(X_test)
predictions = [round(value) for value in y_predict]
print("F1 Score: ",f1_score(y_test, predictions))
# Observed output (was pasted as a bare line, which is a SyntaxError):
#   F1 Score: 0.7484276729559748

# Render the fitted tree structure.
fig = tree.plot_tree(dec_tree)
# XGBoost (gradient-boosted trees)
# Gradient-boosted tree ensemble. eval_metric='logloss' is the default for
# 'binary:logistic' since XGBoost 1.3.0, so setting it explicitly only
# silences the deprecation warning without changing behavior; the label
# encoder is disabled because the labels are already integer 0/1.
xgb_tree = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_tree = xgb_tree.fit(X_train, y_train)
print(xgb_tree)  # show config of xgboost model
# Observed output (default hyperparameters; was pasted as bare lines,
# which are SyntaxErrors in a script):
#   XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
#                 colsample_bynode=1, colsample_bytree=1, gamma=0,
#                 learning_rate=0.300000012, max_delta_step=0, max_depth=6,
#                 min_child_weight=1, n_estimators=100, n_jobs=2,
#                 num_parallel_tree=1, random_state=0, reg_alpha=0,
#                 reg_lambda=1, scale_pos_weight=1, subsample=1,
#                 tree_method='exact', validate_parameters=1)
# Evaluate the boosted model on the same held-out test set as the baseline.
y_predict = xgb_tree.predict(X_test)
predictions = [round(value) for value in y_predict]
print("F1 Score: ",f1_score(y_test, predictions))
# Observed output (was pasted as a bare line, which is a SyntaxError):
#   F1 Score: 0.850498338870432

# Plot the first boosted tree of the fitted ensemble.
fig = plot_tree(xgb_tree)