from datetime import datetime, timedelta
import plotly.express as px
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import sklearn.metrics
from sklearn.preprocessing import StandardScaler
mpl.rcParams['figure.figsize'] = (12, 10)
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
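# Load the transaction data and check how imbalanced the fraud label is.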
transactions_df = pd.read_feather("data/transactions.feather")
transactions_df.sample(10, random_state=0)
not_fraud_count, fraud_count = np.bincount(transactions_df["TX_FRAUD"])
total_count = not_fraud_count + fraud_count
print(
(
f"Data:\n"
f" Total: {total_count}\n"
f" Fraud: {fraud_count} ({100 * fraud_count / total_count:.2f}% of total)\n"
)
)
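# Compare the amount distributions of fraudulent and legitimate transactions
# on a balanced sample of 1,000 transactions from each class.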
df = pd.concat(
[
transactions_df[transactions_df["TX_FRAUD"] == 0].sample(1000, random_state=0),
transactions_df[transactions_df["TX_FRAUD"] == 1].sample(1000, random_state=0),
]
)
fig = px.histogram(
df,
title="Transaction count for different amounts",
x="TX_AMOUNT",
color="TX_FRAUD",
marginal="box",
)
fig.update_traces(opacity=0.75)
fig.update_layout(barmode="overlay")
fig.show()
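# Assemble the feature table: transaction amount, fraud label, and simple
# calendar flags (weekend, night-time).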
cleaned_df = pd.DataFrame()
cleaned_df["amount"] = transactions_df["TX_AMOUNT"]
cleaned_df["is_fraud"] = transactions_df["TX_FRAUD"]
cleaned_df["is_weekend"] = transactions_df["TX_DATETIME"].dt.weekday >= 5
cleaned_df["is_night"] = transactions_df["TX_DATETIME"].dt.hour <= 6
cleaned_df["customer_num_transactions_1_day"] = transactions_df.groupby(
"CUSTOMER_ID"
).apply(
lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("1d", on="TX_DATETIME").count()
)[
"TX_AMOUNT"
]
cleaned_df["customer_num_transactions_7_day"] = transactions_df.groupby(
"CUSTOMER_ID"
).apply(
lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("7d", on="TX_DATETIME").count()
)[
"TX_AMOUNT"
]
cleaned_df["customer_num_transactions_30_day"] = transactions_df.groupby(
"CUSTOMER_ID"
).apply(
lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("30d", on="TX_DATETIME").count()
)[
"TX_AMOUNT"
]
cleaned_df["customer_avg_amount_1_day"] = transactions_df.groupby("CUSTOMER_ID").apply(
lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("1d", on="TX_DATETIME").mean()
)["TX_AMOUNT"]
cleaned_df["customer_avg_amount_7_day"] = transactions_df.groupby("CUSTOMER_ID").apply(
lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("7d", on="TX_DATETIME").mean()
)["TX_AMOUNT"]
cleaned_df["customer_avg_amount_30_day"] = transactions_df.groupby("CUSTOMER_ID").apply(
lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("30d", on="TX_DATETIME").mean()
)["TX_AMOUNT"]
# Frauds only become known after a reporting delay, so risk features must not
# use labels from that window.
DAY_DELAY = 7
def get_count_risk_rolling_window(
    terminal_transactions, window_size, delay_period=DAY_DELAY
):
    # Fraud rate of a terminal over a `window_size`-day window that ends
    # `delay_period` days before each transaction, so only frauds that would
    # already have been reported are counted.
    frauds_in_delay = terminal_transactions.rolling(
        f"{delay_period}d", on="TX_DATETIME"
    )["TX_FRAUD"].sum()
    transactions_in_delay = terminal_transactions.rolling(
        f"{delay_period}d", on="TX_DATETIME"
    )["TX_FRAUD"].count()
    frauds_until_window = terminal_transactions.rolling(
        f"{delay_period + window_size}d", on="TX_DATETIME"
    )["TX_FRAUD"].sum()
    transactions_until_window = terminal_transactions.rolling(
        f"{delay_period + window_size}d", on="TX_DATETIME"
    )["TX_FRAUD"].count()
    # Subtract the delay period to keep only the window that precedes it
    frauds_in_window = frauds_until_window - frauds_in_delay
    transactions_in_window = transactions_until_window - transactions_in_delay
    terminal_transactions["fraud_risk"] = (
        frauds_in_window / transactions_in_window
    ).fillna(0)
    return terminal_transactions
cleaned_df["terminal_num_transactions_1_day"] = transactions_df.groupby(
"TERMINAL_ID"
).apply(
lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("1d", on="TX_DATETIME").count()
)[
"TX_AMOUNT"
]
cleaned_df["terminal_num_transactions_7_day"] = transactions_df.groupby(
"TERMINAL_ID"
).apply(
lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("7d", on="TX_DATETIME").count()
)[
"TX_AMOUNT"
]
cleaned_df["terminal_num_transactions_30_day"] = transactions_df.groupby(
"TERMINAL_ID"
).apply(
lambda x: x[["TX_DATETIME", "TX_AMOUNT"]].rolling("30d", on="TX_DATETIME").count()
)[
"TX_AMOUNT"
]
cleaned_df["terminal_fraud_risk_1_day"] = transactions_df.groupby("TERMINAL_ID").apply(
lambda x: get_count_risk_rolling_window(x, 1, 7)
)["fraud_risk"]
cleaned_df["terminal_fraud_risk_7_day"] = transactions_df.groupby("TERMINAL_ID").apply(
lambda x: get_count_risk_rolling_window(x, 7, 7)
)["fraud_risk"]
cleaned_df["terminal_fraud_risk_30_day"] = transactions_df.groupby("TERMINAL_ID").apply(
lambda x: get_count_risk_rolling_window(x, 30, 7)
)["fraud_risk"]
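# Keep a few columns that are only needed for the temporal split below,
# not as model features.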
cleaned_df["day"] = transactions_df["TX_TIME_DAYS"]
cleaned_df["datetime"] = transactions_df["TX_DATETIME"]
cleaned_df["customer_id"] = transactions_df["CUSTOMER_ID"]
cleaned_df["id"] = transactions_df["TRANSACTION_ID"]
pd.concat(
# show some fraudulent and non-fraudulent transactions
[
cleaned_df[cleaned_df["is_fraud"] == 1].sample(5, random_state=0),
cleaned_df[cleaned_df["is_fraud"] == 0].sample(5, random_state=0),
]
).sample(10, random_state=0)
# this is adapted from get_train_test_set at
# https://fraud-detection-handbook.github.io/fraud-detection-handbook/Chapter_References/shared_functions.html#get-train-test-set
def get_train_test_set(
df,
start_date_training,
delta_train=7,
delta_delay=DAY_DELAY,
delta_test=7,
random_state=0,
):
# Get the training set data
train_df = df[
(df["datetime"] >= start_date_training)
& (df["datetime"] < start_date_training + timedelta(days=delta_train))
]
# Get the test set data
test_df = []
# Note: Cards known to be compromised after the delay period are removed from the test set
# That is, for each test day, all frauds known at (test_day-delay_period) are removed
# First, get known defrauded customers from the training set
known_defrauded_customers = set(train_df[train_df["is_fraud"] == 1]["customer_id"])
# Get the relative starting day of training set (easier than TX_DATETIME to collect test data)
start_tx_time_days_training = train_df["day"].min()
# Then, for each day of the test set
for day in range(delta_test):
# Get test data for that day
test_df_day = df[
df["day"] == start_tx_time_days_training + delta_train + delta_delay + day
]
# Compromised cards from that test day, minus the delay period, are added to the pool of known defrauded customers
test_df_day_delay_period = df[
df["day"] == start_tx_time_days_training + delta_train + day - 1
]
new_defrauded_customers = set(
test_df_day_delay_period[test_df_day_delay_period["is_fraud"] == 1][
"customer_id"
]
)
known_defrauded_customers = known_defrauded_customers.union(
new_defrauded_customers
)
test_df_day = test_df_day[
~test_df_day["customer_id"].isin(known_defrauded_customers)
]
test_df.append(test_df_day)
test_df = pd.concat(test_df)
# Sort data sets by ascending order of transaction ID
train_df = train_df.sort_values("id")
test_df = test_df.sort_values("id")
return (train_df, test_df)
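# Take 21 days from 2018-07-25 for training and a 7-day test period that starts
# after the 7-day delay; then split the training window again into a 7-day
# training set and a 7-day validation set separated by the same delay period.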
train_df, test_df = get_train_test_set(
cleaned_df, datetime(2018, 7, 25), delta_train=21
)
train_df, val_df = get_train_test_set(train_df, datetime(2018, 7, 25))
label_columns = ["is_fraud"]
feature_columns = [
"amount",
"is_weekend",
"is_night",
"customer_num_transactions_1_day",
"customer_num_transactions_7_day",
"customer_num_transactions_30_day",
"customer_avg_amount_1_day",
"customer_avg_amount_7_day",
"customer_avg_amount_30_day",
"terminal_num_transactions_1_day",
"terminal_num_transactions_7_day",
"terminal_num_transactions_30_day",
"terminal_fraud_risk_1_day",
"terminal_fraud_risk_7_day",
"terminal_fraud_risk_30_day",
]
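# Convert to NumPy arrays and standardize the features. The scaler is fit on the
# training set only and reused for validation and test to avoid leakage.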
train_labels = np.array(train_df[label_columns])
val_labels = np.array(val_df[label_columns])
test_labels = np.array(test_df[label_columns])
train_features = np.array(train_df[feature_columns])
val_features = np.array(val_df[feature_columns])
test_features = np.array(test_df[feature_columns])
scaler = StandardScaler()
train_features = scaler.fit_transform(train_features)
val_features = scaler.transform(val_features)
test_features = scaler.transform(test_features)
print('Training labels shape:', train_labels.shape)
print('Validation labels shape:', val_labels.shape)
print('Test labels shape:', test_labels.shape)
print('Training features shape:', train_features.shape)
print('Validation features shape:', val_features.shape)
print('Test features shape:', test_features.shape)
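# Weight each class inversely to its frequency (using the overall fraud /
# non-fraud counts) so the rare fraud class contributes equally to the loss.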
weight_for_not_fraud = (1.0 / not_fraud_count) * total_count / 2.0
weight_for_fraud = (1.0 / fraud_count) * total_count / 2.0
class_weight = {0: weight_for_not_fraud, 1: weight_for_fraud}
class_weight
# bias fix to speed up training
# see https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#optional_set_the_correct_initial_bias
output_bias = tf.keras.initializers.Constant(np.log([fraud_count / not_fraud_count]))
# Simple fully connected classifier: two hidden layers with dropout and a sigmoid
# output whose bias is initialized from the class ratio computed above.
model = keras.Sequential(
    [
        keras.layers.Dense(
            500, activation="relu", input_shape=(train_features.shape[-1],)
        ),
        keras.layers.Dense(500, activation="relu"),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(1, activation="sigmoid", bias_initializer=output_bias),
    ]
)
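# Optimize binary cross-entropy with Adam and track precision, recall,
# ROC AUC and PR AUC ("prc").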
model.compile(
optimizer=keras.optimizers.Adam(learning_rate=1e-3),
loss=keras.losses.BinaryCrossentropy(),
metrics=[
keras.metrics.Precision(name="precision"),
keras.metrics.Recall(name="recall"),
keras.metrics.AUC(name="auc"),
keras.metrics.AUC(name="prc", curve="PR"),
],
)
model.summary()
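# Train for up to 40 epochs with the class weights from above; early stopping
# monitors the validation PR AUC and restores the best weights.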
BATCH_SIZE = 64
early_stopping = tf.keras.callbacks.EarlyStopping(
monitor="val_prc", verbose=1, patience=10, mode="max", restore_best_weights=True
)
training_history = model.fit(
train_features,
train_labels,
batch_size=BATCH_SIZE,
epochs=40,
callbacks=[early_stopping],
validation_data=(val_features, val_labels),
class_weight=class_weight,
)
# Plot the training history for each tracked metric (training vs. validation).
metrics_to_plot = [
    ("loss", "Loss"),
    ("precision", "Precision"),
    ("recall", "Recall"),
    ("auc", "Area under ROC curve"),
    ("prc", "Area under PR curve"),
]
for metric, name in metrics_to_plot:
fig = go.Figure(
data=[
go.Scatter(
x=training_history.epoch,
y=training_history.history[metric],
mode="lines",
name="Training",
),
go.Scatter(
x=training_history.epoch,
y=training_history.history["val_" + metric],
mode="lines",
line={"dash": "dash"},
name="Validation",
),
]
)
fig.update_yaxes(title=name)
fig.update_xaxes(title="Epoch")
if (metric, name) == metrics_to_plot[0]:
fig.update_layout(
height=250, title="Training history", margin={"b": 0, "t": 50}
)
else:
fig.update_layout(height=200, margin={"b": 0, "t": 0})
fig.show()
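# Score the training and test sets, then look at how the predicted fraud
# probabilities are distributed for a sample of each class in the training set.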
train_predictions = model.predict(train_features, batch_size=BATCH_SIZE)
test_predictions = model.predict(test_features, batch_size=BATCH_SIZE)
predictions_df = pd.DataFrame(
{"Prediction": train_predictions.ravel(), "Label": train_labels.ravel()}
)
predictions_df = pd.concat(
[
predictions_df[predictions_df["Label"] == 0].sample(5000, random_state=0),
predictions_df[predictions_df["Label"] == 1].sample(500, random_state=0),
]
)
# Map the numeric labels to readable names so the legend shows them.
predictions_df["Label"] = predictions_df["Label"].map(
    {0: "Legitimate", 1: "Fraudulent"}
)
fig = px.histogram(
    predictions_df,
    x="Prediction",
    title="Prediction values",
    color="Label",
    marginal="box",
)
fig.update_traces(opacity=0.75)
fig.update_layout(barmode="overlay")
fig.show()
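# Compare ROC curves on the training and test sets.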
def make_roc_df(name, predictions, labels):
fp, tp, _ = sklearn.metrics.roc_curve(labels, predictions)
return pd.DataFrame({"fp": fp * 100, "tp": tp * 100, "Dataset": name})
roc_df = pd.concat(
[
make_roc_df("Training", train_predictions, train_labels),
make_roc_df("Test", test_predictions, test_labels),
]
)
fig = px.line(
roc_df,
title="ROC Curve",
x="fp",
y="tp",
color="Dataset",
labels={"fp": "False Positives (%)", "tp": "True Positives (%)"},
)
fig.update_yaxes(range=[60, 100])
fig.update_traces(line={"dash": "dash"}, selector={"name": "Test"})
fig.show()