Efficient Jet Tagging using the BIP Framework
Introduction
The BIPs framework
# Lets import our Boost Invariant Polynomial Package
using BIPs
# And other useful packages:
using Statistics
using Plots
using PyCall
Let's bring some data in:
We show in our paper that the BIP framework is extremely efficient with respect to the amount of data used for training. Therefore, we will only use a fraction of the training dataset (20% of the original dataset), but still obtain state-of-the-art results.
# Load the training jets and their labels from the HDF5 dataset.
data_path = "/work"
train_path, val_path = data_path * "/mini_train.h5", data_path * "/mini_val.h5"
train_jets, train_labels = BIPs.read_data("TQ", train_path)
# Labels arrive as floats (0.0 / 1.0). `b == 1.0` already yields a `Bool`,
# so the original `reinterpret(Bool, ...)` wrapper was redundant.
train_labels = [b == 1.0 for b in train_labels]
print("Number of entries in the training data: ", length(train_jets))
# Peek at the first three constituents of the first jet.
train_jets[1][begin:3]
# Map the jets into the (hyperbolic) coordinates expected by the BIP basis.
# NOTE(review): `data2hyp` is presumably exported by BIPs — confirm.
train_transf_jets = data2hyp(train_jets)
println("Transformed jets")
# Basis hyperparameters. NOTE(review): the original lines here were garbled
# ("nu", "3 / 5", "gamma", "6 / 7"); they presumably set the correlation
# order and the level parameter used by `build_ip` below — TODO confirm the
# exact values against the original notebook/paper.
order = 3
gamma = 6
# Build the boost-invariant polynomial basis: `f_bip` evaluates the feature
# vector of a jet, `specs` describes each basis function.
f_bip, specs = build_ip(order=order, levels=gamma)
"""
    bip_data(dataset_jets)

Embed every jet in `dataset_jets` into the BIP feature space.

Returns a `Matrix{Float64}` with one row per jet and one column per basis
function in the file-level `specs`; each row is computed with `f_bip`.
"""
function bip_data(dataset_jets)
    storage = zeros(length(dataset_jets), length(specs))
    # `enumerate` instead of `1:length(...)` — works for any iterable and is
    # the idiomatic way to pair a row counter with each jet.
    for (i, jet) in enumerate(dataset_jets)
        storage[i, :] = f_bip(jet)
    end
    return storage
end
Let us inspect the number of features in our model...
# Report the embedding dimensionality (typos fixed: "descriptor", "features").
print("Each jet descriptor contains ", length(specs), " features")
# Embed the full training set into the BIP feature space.
train_embedded_jets = bip_data(train_transf_jets)
println("Embedded jets correctly")
using PyCall
@pyimport sklearn.ensemble as sk_ensemble
Training a classifier
# Train a histogram-based gradient-boosting classifier (scikit-learn, called
# through PyCall). `verbose=true` prints progress for every boosting round.
clf_model = sk_ensemble.HistGradientBoostingClassifier(; verbose=true)
clf_model.fit(train_embedded_jets, train_labels)
Notice that we are able to train the model in only a short amount of time on a single CPU.
# Plot the per-iteration scores recorded by scikit-learn during fitting.
# NOTE(review): `validation_score_` is only populated when early stopping /
# a validation fraction is enabled — confirm it exists on this fitted model.
Plots.plot(clf_model[:validation_score_], label="Validation", xlabel="Iteration", ylabel="Log Loss")
# Typo fixed in the title string: "Trainig" -> "Training".
Plots.plot!(clf_model[:train_score_], label="Training", title="Training curve for the Histogram XGB Classifier")
Classifier performance
# Load and embed the validation set with the same pipeline used for training.
val_jets, val_labels = BIPs.read_data("TQ", val_path)
# `b == 1.0` already yields a `Bool`; the original `reinterpret` was redundant.
val_labels = [b == 1.0 for b in val_labels]
val_transf_jets = data2hyp(val_jets)
val_embedded_jets = bip_data(val_transf_jets)
print("Embedded test jets correctly")
# BUG FIX: the original referenced an undefined `xgb_clf`; the trained
# classifier is bound to `clf_model` earlier in this file.
test_preds = clf_model.score(val_embedded_jets, val_labels)
print("Model performance on validation dataset ", round(test_preds, digits=3))
# Column 2 of `predict_proba` holds P(label == true), i.e. the top-jet score.
val_probas = clf_model.predict_proba(val_embedded_jets)
# Boolean mask selecting the background (QCD, label == false) jets.
bkg_index = [label == false for label in val_labels]
Plots.histogram(val_probas[:, 2][val_labels], color="Blue", bins=100, label="Top Jets", xlabel="Model's Output Probability", ylabel="Number of Jets")
Plots.histogram!(val_probas[:, 2][bkg_index], color="Red", bins=100, label="QCD Jets", title="Probability Scores")