import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import tree
from xgboost import XGBClassifier, plot_tree
import matplotlib.pyplot as plt
# Load the data set from the working directory (the file name suggests the
# PCA-transformed Kaggle credit card data -- confirm against the data source).
df = pd.read_csv("creditcard_Kaggle_PCA.csv")

# The last column is the target; every column before it is a model input.
label = df.iloc[:, -1]
features = df.iloc[:, :-1]

# Hold out 33 % of the rows for evaluation. The fixed seed keeps the
# train/test partition identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.33,
    random_state=77,
)
# --- Exploratory analysis ---
# Bare expressions are only displayed by an interactive shell/notebook; print
# them explicitly so the summaries are also visible when run as a script.

# Summary statistics for every numeric column.
print(df.describe())

# Number of rows per distinct value of the 'Class' column.
print(df['Class'].value_counts())

# Pairwise column correlations. Computed once and reused for the plot below
# instead of recomputing the full matrix a second time.
corr_matrix = df.corr()
print(corr_matrix)

# Global default figure size; also applies to figures created later on.
plt.rcParams["figure.figsize"] = (25,10)
plt.matshow(corr_matrix)
plt.show()
# --- Single decision tree ---
# Fit a single decision tree with default hyper-parameters. The tree builder
# permutes features randomly when searching for splits, so a fixed seed is
# needed for the fitted tree (and the score below) to be reproducible.
dec_tree = tree.DecisionTreeClassifier(random_state=77)
dec_tree = dec_tree.fit(X_train, y_train)

# predict() already returns discrete class labels, so no rounding is needed.
y_predict = dec_tree.predict(X_test)
predictions = y_predict
print("F1 Score: ",f1_score(y_test, predictions))
# Recorded from an earlier, unseeded run (may differ slightly with the seed):
# F1 Score: 0.7484276729559748

# Draw the fitted tree; plot_tree returns the list of drawn annotations.
fig = tree.plot_tree(dec_tree)
# Render the figure explicitly so it also appears when run as a script.
plt.show()
# --- XGBoost ---
# Fit a gradient-boosted tree ensemble with XGBoost's default settings.
xgb_tree = XGBClassifier()
xgb_tree = xgb_tree.fit(X_train, y_train)
print(xgb_tree) # show config of xgboost model

# XGBClassifier.predict() already returns discrete class labels, so the
# predictions can be scored directly without rounding.
y_predict = xgb_tree.predict(X_test)
predictions = y_predict
print("F1 Score: ",f1_score(y_test, predictions))
# Recorded from an earlier run: F1 Score: 0.850498338870432

# Draw one tree of the ensemble (xgboost's plot_tree needs the graphviz
# package to be installed).
fig = plot_tree(xgb_tree)
# Render the figure explicitly so it also appears when run as a script.
plt.show()