from datetime import date, datetime, timedelta
import os
import math
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Default to large matplotlib figures for readability.
mpl.rcParams['figure.figsize'] = (12, 10)
# Reuse matplotlib's default color cycle for consistent colors across plots.
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
# Simulated card-transaction data stored in Feather format.
transactions_df = pd.read_feather("data/transactions.feather")
# Peek at ten random rows (fixed seed for reproducibility).
transactions_df.sample(10, random_state=0)
TRANSACTION_IDint64
102974 - 1676835
TX_DATETIMEdatetime64[ns]
2018-04-11 15:25:15 - 2018-09-22 18:58:10
1359175
1368663
2018-08-21T15:58:32.000000
744402
753890
2018-06-18T13:24:19.000000
743737
753225
2018-06-18T12:29:50.000000
681649
691137
2018-06-12T04:01:22.000000
412664
422152
2018-05-14T22:03:16.000000
1275701
1285189
2018-08-13T03:32:52.000000
821080
830568
2018-06-26T13:30:53.000000
1667347
1676835
2018-09-22T18:58:10.000000
93486
102974
2018-04-11T15:25:15.000000
896790
906278
2018-07-04T11:33:25.000000
# Class balance: fraud is a rare event, so keep the raw counts around for
# later class weighting.
class_counts = np.bincount(transactions_df["TX_FRAUD"])
not_fraud_count, fraud_count = class_counts
total_count = class_counts.sum()
summary = (
    f"Data:\n"
    f" Total: {total_count}\n"
    f" Fraud: {fraud_count} ({100 * fraud_count / total_count:.2f}% of total)\n"
)
print(summary)
Data:
Total: 1744667
Fraud: 14678 (0.84% of total)
# Down-sample to 1000 legitimate + 1000 fraudulent transactions so the two
# overlaid histograms are visually comparable despite the class imbalance.
legit_sample = transactions_df[transactions_df["TX_FRAUD"] == 0].sample(
    1000, random_state=0
)
fraud_sample = transactions_df[transactions_df["TX_FRAUD"] == 1].sample(
    1000, random_state=0
)
sample_df = pd.concat([legit_sample, fraud_sample])
# Overlaid, semi-transparent histograms with marginal box plots.
fig = px.histogram(
    sample_df,
    title="Transaction count for different amounts",
    x="TX_AMOUNT",
    color="TX_FRAUD",
    marginal="box",
)
fig.update_traces(opacity=0.75)
fig.update_layout(barmode="overlay")
fig.show()
# Assemble the per-transaction feature table used for modeling.
cleaned_df = pd.DataFrame()
cleaned_df["amount"] = transactions_df["TX_AMOUNT"]
cleaned_df["is_fraud"] = transactions_df["TX_FRAUD"]
# Calendar features: weekday >= 5 is Saturday/Sunday; hour <= 6 covers
# midnight through 6am.
cleaned_df["is_weekend"] = transactions_df["TX_DATETIME"].dt.weekday >= 5
cleaned_df["is_night"] = transactions_df["TX_DATETIME"].dt.hour <= 6
# Per-customer activity over trailing 1/7/30-day windows. TX_AMOUNT is used
# only as a non-null column for rolling .count() / .mean().
# NOTE(review): groupby().apply() normally returns a group-keyed index; these
# assignments rely on pandas aligning that result back onto cleaned_df's flat
# index — confirm this behaves as intended on the pandas version in use.
cleaned_df["customer_num_transactions_1_day"] = transactions_df.groupby(
    "CUSTOMER_ID"
).apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("1d", on="TX_DATETIME").count()
)[
    "TX_AMOUNT"
]
cleaned_df["customer_num_transactions_7_day"] = transactions_df.groupby(
    "CUSTOMER_ID"
).apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("7d", on="TX_DATETIME").count()
)[
    "TX_AMOUNT"
]
cleaned_df["customer_num_transactions_30_day"] = transactions_df.groupby(
    "CUSTOMER_ID"
).apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("30d", on="TX_DATETIME").count()
)[
    "TX_AMOUNT"
]
# Per-customer average spend over the same trailing windows.
cleaned_df["customer_avg_amount_1_day"] = transactions_df.groupby("CUSTOMER_ID").apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("1d", on="TX_DATETIME").mean()
)["TX_AMOUNT"]
cleaned_df["customer_avg_amount_7_day"] = transactions_df.groupby("CUSTOMER_ID").apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("7d", on="TX_DATETIME").mean()
)["TX_AMOUNT"]
cleaned_df["customer_avg_amount_30_day"] = transactions_df.groupby("CUSTOMER_ID").apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("30d", on="TX_DATETIME").mean()
)["TX_AMOUNT"]
# Frauds are assumed to take this many days to be discovered/reported.
DAY_DELAY = 7


def get_count_risk_rolling_window(
    terminal_transactions, window_size, delay_period=DAY_DELAY
):
    """Append a `fraud_risk` column to one terminal's transactions, in place.

    The risk at each transaction is the fraud rate observed at this terminal
    over a `window_size`-day window that ends `delay_period` days in the past:
    frauds take `delay_period` days to be reported, so more recent labels
    would not be available in production.

    Args:
        terminal_transactions: DataFrame with `TX_DATETIME` (monotonic) and
            `TX_FRAUD` columns; mutated in place.
        window_size: length of the risk window, in days.
        delay_period: reporting delay, in days.

    Returns:
        The same DataFrame, with the `fraud_risk` column added.
    """

    def _rolling_fraud_stats(days):
        # Trailing `days`-day rolling sum and count of the fraud flag.
        rolled = terminal_transactions.rolling(f"{days}d", on="TX_DATETIME")["TX_FRAUD"]
        return rolled.sum(), rolled.count()

    frauds_in_delay, transactions_in_delay = _rolling_fraud_stats(delay_period)
    frauds_until_window, transactions_until_window = _rolling_fraud_stats(
        delay_period + window_size
    )
    # Stats for the window alone = (delay + window) stats minus delay stats.
    frauds_in_window = frauds_until_window - frauds_in_delay
    transactions_in_window = transactions_until_window - transactions_in_delay
    # 0/0 yields NaN when the window holds no transactions; treat as zero risk.
    terminal_transactions["fraud_risk"] = (
        frauds_in_window / transactions_in_window
    ).fillna(0)
    return terminal_transactions
# Per-terminal activity over trailing 1/7/30-day windows, mirroring the
# customer features above (TX_AMOUNT used only as a non-null column to count).
# NOTE(review): as with the customer features, these rely on pandas aligning
# the groupby().apply() result back onto cleaned_df's flat index — confirm.
cleaned_df["terminal_num_transactions_1_day"] = transactions_df.groupby(
    "TERMINAL_ID"
).apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("1d", on="TX_DATETIME").count()
)[
    "TX_AMOUNT"
]
cleaned_df["terminal_num_transactions_7_day"] = transactions_df.groupby(
    "TERMINAL_ID"
).apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("7d", on="TX_DATETIME").count()
)[
    "TX_AMOUNT"
]
cleaned_df["terminal_num_transactions_30_day"] = transactions_df.groupby(
    "TERMINAL_ID"
).apply(
    lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("30d", on="TX_DATETIME").count()
)[
    "TX_AMOUNT"
]
# Per-terminal fraud rate over delayed 1/7/30-day windows (see
# get_count_risk_rolling_window for the delay semantics).
cleaned_df["terminal_fraud_risk_1_day"] = transactions_df.groupby("TERMINAL_ID").apply(
    lambda x: get_count_risk_rolling_window(x, 1, 7)
)["fraud_risk"]
cleaned_df["terminal_fraud_risk_7_day"] = transactions_df.groupby("TERMINAL_ID").apply(
    lambda x: get_count_risk_rolling_window(x, 7, 7)
)["fraud_risk"]
cleaned_df["terminal_fraud_risk_30_day"] = transactions_df.groupby("TERMINAL_ID").apply(
    lambda x: get_count_risk_rolling_window(x, 30, 7)
)["fraud_risk"]
# Bookkeeping columns used for the chronological train/test split, not as
# model features.
cleaned_df["day"] = transactions_df["TX_TIME_DAYS"]
cleaned_df["datetime"] = transactions_df["TX_DATETIME"]
cleaned_df["customer_id"] = transactions_df["CUSTOMER_ID"]
cleaned_df["id"] = transactions_df["TRANSACTION_ID"]
pd.concat(
    # show some fraudulent and non-fraudulent transactions
    [
        cleaned_df[cleaned_df["is_fraud"] == 1].sample(5, random_state=0),
        cleaned_df[cleaned_df["is_fraud"] == 0].sample(5, random_state=0),
    ]
).sample(10, random_state=0)
amountfloat64
5.15 - 132.03
is_fraudint64
0 - 1
764833
40.66
1
1109119
54.77
0
408126
26.29
1
1437242
132.03
0
1073063
5.15
1
1027460
48.39
0
1340733
69.47
0
1416564
54.42
1
498910
29.27
1
1741939
87.12
0
# this is adapted from get_train_test_set at
# https://fraud-detection-handbook.github.io/fraud-detection-handbook/Chapter_References/shared_functions.html#get-train-test-set
def get_train_test_set(
    df,
    start_date_training,
    delta_train=7,
    delta_delay=DAY_DELAY,
    delta_test=7,
    random_state=0,
):
    """Split `df` chronologically into a train set and a leakage-safe test set.

    Training covers `delta_train` days starting at `start_date_training`; the
    test period starts `delta_delay` days after training ends and lasts
    `delta_test` days. For each test day, transactions from customers whose
    fraud would already be known by then (frauds take `delta_delay` days to be
    reported) are removed, mimicking a production setting.

    Returns (train_df, test_df), both sorted by transaction id.

    NOTE(review): `random_state` is currently unused.
    """
    # Get the training set data
    train_df = df[
        (df["datetime"] >= start_date_training)
        & (df["datetime"] < start_date_training + timedelta(days=delta_train))
    ]
    # Get the test set data
    test_df = []
    # Note: Cards known to be compromised after the delay period are removed from the test set
    # That is, for each test day, all frauds known at (test_day-delay_period) are removed
    # First, get known defrauded customers from the training set
    known_defrauded_customers = set(train_df[train_df["is_fraud"] == 1]["customer_id"])
    # Get the relative starting day of training set (easier than TX_DATETIME to collect test data)
    start_tx_time_days_training = train_df["day"].min()
    # Then, for each day of the test set
    for day in range(delta_test):
        # Get test data for that day
        test_df_day = df[
            df["day"] == start_tx_time_days_training + delta_train + delta_delay + day
        ]
        # Compromised cards from that test day, minus the delay period, are added to the pool of known defrauded customers
        test_df_day_delay_period = df[
            df["day"] == start_tx_time_days_training + delta_train + day - 1
        ]
        new_defrauded_customers = set(
            test_df_day_delay_period[test_df_day_delay_period["is_fraud"] == 1][
                "customer_id"
            ]
        )
        known_defrauded_customers = known_defrauded_customers.union(
            new_defrauded_customers
        )
        # Keep only customers not yet known to be defrauded on this test day.
        test_df_day = test_df_day[
            ~test_df_day["customer_id"].isin(known_defrauded_customers)
        ]
        test_df.append(test_df_day)
    test_df = pd.concat(test_df)
    # Sort data sets by ascending order of transaction ID
    train_df = train_df.sort_values("id")
    test_df = test_df.sort_values("id")
    return (train_df, test_df)
# Chronological splits: 3 weeks of training data, then a validation split
# carved out of the training period; the delay gap is handled inside
# get_train_test_set.
train_df, test_df = get_train_test_set(
    cleaned_df, datetime(2018, 7, 25), delta_train=21
)
train_df, val_df = get_train_test_set(train_df, datetime(2018, 7, 25))
label_columns = ["is_fraud"]
feature_columns = [
    "amount",
    "is_weekend",
    "is_night",
    "customer_num_transactions_1_day",
    "customer_num_transactions_7_day",
    "customer_num_transactions_30_day",
    "customer_avg_amount_1_day",
    "customer_avg_amount_7_day",
    "customer_avg_amount_30_day",
    "terminal_num_transactions_1_day",
    "terminal_num_transactions_7_day",
    "terminal_num_transactions_30_day",
    "terminal_fraud_risk_1_day",
    "terminal_fraud_risk_7_day",
    "terminal_fraud_risk_30_day",
]
splits = (train_df, val_df, test_df)
train_labels, val_labels, test_labels = (
    np.array(split[label_columns]) for split in splits
)
train_features, val_features, test_features = (
    np.array(split[feature_columns]) for split in splits
)
# Standardize features with training-set statistics only, so no information
# from validation/test leaks into the scaler.
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
val_features = scaler.transform(val_features)
test_features = scaler.transform(test_features)
for split_name, array in (
    ("Training labels", train_labels),
    ("Validation labels", val_labels),
    ("Test labels", test_labels),
    ("Training features", train_features),
    ("Validation features", val_features),
    ("Test features", test_features),
):
    print(f"{split_name} shape:", array.shape)
Training labels shape: (67240, 1)
Validation labels shape: (58264, 1)
Test labels shape: (50321, 1)
Training features shape: (67240, 15)
Validation features shape: (58264, 15)
Test features shape: (50321, 15)
# Re-weight the loss so each class contributes equally overall: a class seen
# half as often gets twice the weight.
class_weight = {
    0: (1.0 / not_fraud_count) * total_count / 2.0,
    1: (1.0 / fraud_count) * total_count / 2.0,
}
class_weight
# bias fix to speed up training
# see https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#optional_set_the_correct_initial_bias
initial_log_odds = np.log([fraud_count / not_fraud_count])
output_bias = tf.keras.initializers.Constant(initial_log_odds)
# Two ReLU hidden layers + dropout; sigmoid output for the binary fraud
# probability.
model = keras.Sequential(
    [
        # Only the first layer needs input_shape; Keras ignores it on later
        # layers, so the redundant copy on the second Dense was removed.
        keras.layers.Dense(
            500, activation="relu", input_shape=(train_features.shape[-1],)
        ),
        keras.layers.Dense(500, activation="relu"),
        keras.layers.Dropout(0.2),
        # Output bias starts at the log base rate (see output_bias above) so
        # initial predictions match the class prior.
        keras.layers.Dense(1, activation="sigmoid", bias_initializer=output_bias),
    ]
)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=[
        keras.metrics.Precision(name="precision"),
        keras.metrics.Recall(name="recall"),
        keras.metrics.AUC(name="auc"),
        # Area under the precision-recall curve: the most informative metric
        # for heavily imbalanced data; also used for early stopping.
        keras.metrics.AUC(name="prc", curve="PR"),
    ],
)
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 500) 8000
dense_1 (Dense) (None, 500) 250500
dropout (Dropout) (None, 500) 0
dense_2 (Dense) (None, 1) 501
=================================================================
Total params: 259,001
Trainable params: 259,001
Non-trainable params: 0
_________________________________________________________________
BATCH_SIZE = 64
# Stop when validation PR-AUC has not improved for 10 epochs, and roll back
# to the best-scoring weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_prc", verbose=1, patience=10, mode="max", restore_best_weights=True
)
# class_weight makes the rare fraud class contribute as much to the loss as
# the majority class.
training_history = model.fit(
    train_features,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=40,
    callbacks=[early_stopping],
    validation_data=(val_features, val_labels),
    class_weight=class_weight,
)
Epoch 1/40
1051/1051 [==============================] - 6s 5ms/step - loss: 0.4941 - precision: 0.0703 - recall: 0.7341 - auc: 0.8588 - prc: 0.4330 - val_loss: 0.1791 - val_precision: 0.2200 - val_recall: 0.6857 - val_auc: 0.8455 - val_prc: 0.5665
Epoch 2/40
1051/1051 [==============================] - 5s 4ms/step - loss: 0.3639 - precision: 0.1013 - recall: 0.7843 - auc: 0.8975 - prc: 0.5457 - val_loss: 0.1643 - val_precision: 0.2773 - val_recall: 0.6857 - val_auc: 0.8536 - val_prc: 0.5064
Epoch 3/40
1051/1051 [==============================] - 5s 4ms/step - loss: 0.3477 - precision: 0.1080 - recall: 0.8010 - auc: 0.9027 - prc: 0.5893 - val_loss: 0.1683 - val_precision: 0.1562 - val_recall: 0.6961 - val_auc: 0.8521 - val_prc: 0.5504
Epoch 4/40
1051/1051 [==============================] - 5s 4ms/step - loss: 0.3529 - precision: 0.1029 - recall: 0.8043 - auc: 0.9041 - prc: 0.5336 - val_loss: 0.3273 - val_precision: 0.0568 - val_recall: 0.7403 - val_auc: 0.8591 - val_prc: 0.4484
Epoch 5/40
1051/1051 [==============================] - 5s 4ms/step - loss: 0.3473 - precision: 0.0981 - recall: 0.8077 - auc: 0.9094 - prc: 0.5159 - val_loss: 0.2161 - val_precision: 0.0938 - val_recall: 0.7117 - val_auc: 0.8590 - val_prc: 0.5172
Epoch 6/40
1051/1051 [==============================] - 5s 4ms/step - loss: 0.3464 - precision: 0.0954 - recall: 0.8027 - auc: 0.9127 - prc: 0.5232 - val_loss: 0.1878 - val_precision: 0.1090 - val_recall: 0.7143 - val_auc: 0.8561 - val_prc: 0.5082
Epoch 7/40
1051/1051 [==============================] - 5s 4ms/step - loss: 0.3405 - precision: 0.1012 - recall: 0.8027 - auc: 0.9165 - prc: 0.5199 - val_loss: 0.1828 - val_precision: 0.1418 - val_recall: 0.7143 - val_auc: 0.8551 - val_prc: 0.5539
Epoch 8/40
1051/1051 [==============================] - 5s 4ms/step - loss: 0.3613 - precision: 0.0910 - recall: 0.8043 - auc: 0.9087 - prc: 0.4759 - val_loss: 0.2578 - val_precision: 0.1024 - val_recall: 0.7169 - val_auc: 0.8588 - val_prc: 0.4040
Epoch 9/40
1051/1051 [==============================] - 5s 4ms/step - loss: 0.3563 - precision: 0.1014 - recall: 0.8027 - auc: 0.9114 - prc: 0.4653 - val_loss: 0.2156 - val_precision: 0.1343 - val_recall: 0.7091 - val_auc: 0.8474 - val_prc: 0.4739
Epoch 10/40
1051/1051 [==============================] - 5s 4ms/step - loss: 0.3440 - precision: 0.1070 - recall: 0.8094 - auc: 0.9147 - prc: 0.5179 - val_loss: 0.4411 - val_precision: 0.0307 - val_recall: 0.7766 - val_auc: 0.8563 - val_prc: 0.3938
Epoch 11/40
1034/1051 [============================>.] - ETA: 0s - loss: 0.3642 - precision: 0.0871 - recall: 0.8048 - auc: 0.9140 - prc: 0.4533Restoring model weights from the end of the best epoch: 1.
1051/1051 [==============================] - 5s 4ms/step - loss: 0.3631 - precision: 0.0878 - recall: 0.8060 - auc: 0.9143 - prc: 0.4530 - val_loss: 0.1506 - val_precision: 0.2033 - val_recall: 0.6987 - val_auc: 0.8573 - val_prc: 0.4634
Epoch 11: early stopping
# Plot each training metric (training vs. validation) as its own figure.
# Removed dead code from the original: an unused `res = []` accumulator and a
# make_subplots figure that was immediately overwritten inside the loop.
metrics_to_plot = [
    ("loss", "Loss"),
    ("precision", "Precision"),
    ("recall", "Recall"),
    ("auc", "Area under ROC curve"),
    ("prc", "Area under PR curve"),
]
for i, (metric, name) in enumerate(metrics_to_plot):
    fig = go.Figure(
        data=[
            go.Scatter(
                x=training_history.epoch,
                y=training_history.history[metric],
                mode="lines",
                name="Training",
            ),
            go.Scatter(
                x=training_history.epoch,
                y=training_history.history["val_" + metric],
                mode="lines",
                line={"dash": "dash"},
                name="Validation",
            ),
        ]
    )
    fig.update_yaxes(title=name)
    fig.update_xaxes(title="Epoch")
    if i == 0:
        # The first plot carries the shared title and leaves room for it.
        fig.update_layout(
            height=250, title="Training history", margin={"b": 0, "t": 50}
        )
    else:
        fig.update_layout(height=200, margin={"b": 0, "t": 0})
    fig.show()
train_predictions = model.predict(train_features, batch_size=BATCH_SIZE)
test_predictions = model.predict(test_features, batch_size=BATCH_SIZE)
# Down-sample to a 10:1 legitimate-to-fraud ratio so both histograms are
# visible when overlaid.
predictions_df = pd.DataFrame(
    {"Prediction": train_predictions.ravel(), "Label": train_labels.ravel()}
)
predictions_df = pd.concat(
    [
        predictions_df[predictions_df["Label"] == 0].sample(5000, random_state=0),
        predictions_df[predictions_df["Label"] == 1].sample(500, random_state=0),
    ]
)
# Map numeric labels to readable legend entries. The original passed
# labels={"0": ..., "1": ...} to px.histogram, but `labels` renames *columns*,
# not category values, so it had no effect on the legend.
predictions_df["Label"] = predictions_df["Label"].map(
    {0: "Legitimate", 1: "Fraudulent"}
)
fig = px.histogram(
    predictions_df,
    x="Prediction",
    title="Prediction values",
    color="Label",
    marginal="box",
)
fig.update_traces(opacity=0.75)
fig.update_layout(barmode="overlay")
fig.show()
def make_roc_df(name, predictions, labels):
    """Return a DataFrame of ROC-curve points (in percent) tagged with `name`."""
    false_pos_rate, true_pos_rate, _ = sklearn.metrics.roc_curve(labels, predictions)
    return pd.DataFrame(
        {"fp": 100 * false_pos_rate, "tp": 100 * true_pos_rate, "Dataset": name}
    )
# Compare ROC curves on the training and test sets.
roc_df = pd.concat(
    [
        make_roc_df("Training", train_predictions, train_labels),
        make_roc_df("Test", test_predictions, test_labels),
    ]
)
fig = px.line(
    roc_df,
    title="ROC Curve",
    x="fp",
    y="tp",
    color="Dataset",
    labels={"fp": "False Positives (%)", "tp": "True Positives (%)"},
)
fig.update_yaxes(range=[60, 100])
# Dash the test curve. The original selector used "test", which matches no
# trace (the dataset is named "Test"), so the dashing was never applied.
fig.update_traces(line={"dash": "dash"}, selector={"name": "Test"})
fig.show()