Credit Card Fraud Detection System

# importing libraries and modules import pandas as pd import numpy as np import matplotlib.pyplot as plt from collections import Counter import itertools import seaborn as sns from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, recall_score, f1_score from sklearn.metrics import plot_confusion_matrix

# Load the credit-card transactions into a DataFrame and preview the first rows.
df = pd.read_csv("creditcard.csv")
df.head()

# Report whether any cell of the frame is null.
# (The original notebook author noted that no nulls are present.)
df.isna().to_numpy().any()

# Summary statistics of the "Amount" column.
df.loc[:, "Amount"].describe()

# Column names, non-null counts and dtypes of the whole frame.
df.info()

# Number of rows per class label (fraudulent vs. non-fraudulent).
df.loc[:, "Class"].value_counts()

# Count the rows of each class (0 = genuine, 1 = fraud) and report the
# share of fraudulent transactions as a percentage.
non_fraud = int((df["Class"] == 0).sum())
fraud = int((df["Class"] == 1).sum())
fraud_percent = (fraud / (fraud + non_fraud)) * 100

print("Number of Genuine transactions: ", non_fraud)
print("Number of Fraud transactions: ", fraud)
print(f"Percentage of Fraud transactions: {fraud_percent:.4f}")

# Bar chart of the class balance computed above.
import matplotlib.pyplot as plt

# Tick labels are applied by position, so they are listed in class order
# (0 = genuine, 1 = fraud) and the counts are sorted by class label below.
labels = ["Genuine", "Fraud"]

# Count on the "Class" Series itself: DataFrame.value_counts() expects column
# labels as `subset`, not a Series. Sorting by index (rather than by frequency)
# keeps bar i aligned with labels[i] whatever the class proportions are.
count_classes = df["Class"].value_counts().sort_index()

count_classes.plot(kind="bar", rot=0)
plt.title("Visualization of Labels")
plt.ylabel("Count")
plt.xticks(range(len(labels)), labels)
plt.show()

import numpy as np
from sklearn.preprocessing import StandardScaler

# Standardise "Amount" into a new "NormalizedAmount" column.
# NOTE(review): the scaler is fitted on every row of df, i.e. before any
# train/test split takes place.
scaler = StandardScaler()
df["NormalizedAmount"] = scaler.fit_transform(df[["Amount"]].to_numpy())

# The raw "Amount" and "Time" columns are not used as features; this drop
# mutates df in place.
df.drop(columns=["Amount", "Time"], inplace=True)

# Feature matrix = every remaining column except the label; target = "Class".
X = df.drop(columns=["Class"])
Y = df["Class"]

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the partition
# identical between runs.
train_X, test_X, train_Y, test_Y = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

print("Shape of train_X: ", train_X.shape)
print("Shape of test_X: ", test_X.shape)

# Build the Decision Tree and Random Forest classifiers.
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Both estimators are seeded so that repeated runs produce the same models
# and scores; 42 matches the seed this script passes to train_test_split.

# Decision Tree
decision_tree = DecisionTreeClassifier(random_state=42)

# Random Forest (100 trees)
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)

# Decision tree: train, predict on the held-out rows, and keep the value
# returned by .score() scaled to a percentage.
predictions_dt = decision_tree.fit(train_X, train_Y).predict(test_X)
decision_tree_score = 100 * decision_tree.score(test_X, test_Y)

# Random forest: same three steps.
predictions_rf = random_forest.fit(train_X, train_Y).predict(test_X)
random_forest_score = 100 * random_forest.score(test_X, test_Y)

print("Random Forest Score: ", random_forest_score)
print("Decision Tree Score: ", decision_tree_score)

from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)


def metrics(actuals, predictions):
    """Print accuracy, precision, recall and F1-score for a set of predictions.

    Each score is computed by the corresponding ``sklearn.metrics`` function,
    called as ``scorer(actuals, predictions)``, and printed on its own line
    with five decimal places. Nothing is returned.

    Args:
        actuals: The true labels.
        predictions: The predicted labels, aligned with ``actuals``.
    """
    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-score", f1_score),
    )
    for label, scorer in scorers:
        print(f"{label}: {scorer(actuals, predictions):.5f}")

# Evaluation of the Decision Tree model: confusion matrix (text + heatmap)
# followed by the summary scores.
from sklearn.metrics import confusion_matrix
import seaborn as sn

# The predictions are already class labels, so no rounding is needed, and the
# matrix is computed once and shared by the printout and the plot.
confusion_matrix_dt = confusion_matrix(test_Y, predictions_dt)
cm_plot = confusion_matrix_dt
print("Confusion Matrix - Decision Tree")
print(confusion_matrix_dt)

# Heatmap of the same matrix; `cmap` selects the colour palette and
# fmt="d" prints the cell counts as integers.
plt.figure(figsize=(10, 8))
sn.heatmap(cm_plot, annot=True, cmap="icefire_r", fmt="d", cbar=True)
plt.xlabel("Predicted_Label")
plt.ylabel("Truth_Label")
plt.title("Confusion Matrix - Decision Tree")
# Render explicitly, as the class-balance plot earlier in this script does.
plt.show()

print("Evaluation of Decision Tree Model")
print()
metrics(test_Y, predictions_dt)

# Evaluation of the Random Forest model: confusion matrix (text + heatmap)
# followed by the summary scores.
from sklearn.metrics import confusion_matrix
import seaborn as sn

# The predictions are already class labels, so no rounding is needed, and the
# matrix is computed once and shared by the printout and the plot.
confusion_matrix_rf = confusion_matrix(test_Y, predictions_rf)
cm_plot = confusion_matrix_rf
print("Confusion Matrix - Random Forest")
print(confusion_matrix_rf)

# Heatmap of the same matrix; `cmap` selects the colour palette and
# fmt="d" prints the cell counts as integers.
plt.figure(figsize=(10, 8))
sn.heatmap(cm_plot, annot=True, cmap="tab20c", fmt="d", cbar=True)
plt.xlabel("Predicted_Label")
plt.ylabel("Truth_Label")
plt.title("Confusion Matrix - Random Forest")
# Render explicitly, as the class-balance plot earlier in this script does.
plt.show()

print("Evaluation of Random Forest Model")
print()
metrics(test_Y, predictions_rf)

# Make sure imbalanced-learn (import name: `imblearn`) is available; it
# provides the SMOTE oversampler used below.
import importlib.util
import subprocess
import sys

# Install through the interpreter that is running this code, so the package
# lands in the same environment. A `!pip3 ...` shell escape only works under
# IPython and may resolve to a different Python installation.
# The install is skipped when the package can already be imported.
if importlib.util.find_spec("imblearn") is None:
    subprocess.check_call(
        [sys.executable, "-m", "pip", "install", "imbalanced-learn"]
    )

# Oversample the minority class with SMOTE for the Random Forest experiment.
from imblearn.over_sampling import SMOTE

# Split BEFORE resampling. SMOTE synthesises new minority rows by
# interpolating between existing ones; running it on the full dataset and
# splitting afterwards lets information from test rows leak into training
# and fills the test set with synthetic rows. The split uses the same
# test_size and random_state as the earlier split of X and Y.
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, test_size=0.3, random_state=42
)

# Resample the training portion only; the test portion stays untouched real
# data. The seed makes the synthetic rows reproducible.
X_resampled, Y_resampled = SMOTE(random_state=42).fit_resample(train_X, train_Y)
print("Resampled shape of X: ", X_resampled.shape)
print("Resampled shape of Y: ", Y_resampled.shape)

# Class counts after resampling.
value_counts = Counter(Y_resampled)
print(value_counts)

# Downstream code trains on train_X / train_Y, so point them at the
# balanced training data.
train_X, train_Y = X_resampled, Y_resampled

# Only the Random Forest is retrained on the resampled data: per the original
# notebook author it performed better than the Decision Tree.
# The estimator is seeded (same value as the train/test split) so repeated
# runs give the same model and score.
rf_resampled = RandomForestClassifier(n_estimators=100, random_state=42)
rf_resampled.fit(train_X, train_Y)

predictions_resampled = rf_resampled.predict(test_X)
# Value returned by .score() on the test data, scaled to a percentage.
random_forest_score_resampled = rf_resampled.score(test_X, test_Y) * 100

# Evaluation of the Random Forest trained after oversampling: confusion
# matrix (text + heatmap) followed by the summary scores.
from sklearn.metrics import confusion_matrix
import seaborn as sn

# The predictions are already class labels, so no rounding is needed, and the
# matrix is computed once and shared by the printout and the plot.
cm_resampled = confusion_matrix(test_Y, predictions_resampled)
cm_plot = cm_resampled
print("Confusion Matrix - Random Forest")
print(cm_resampled)

# Heatmap of the same matrix; `cmap` selects the colour palette and
# fmt="d" prints the cell counts as integers.
plt.figure(figsize=(10, 8))
sn.heatmap(cm_plot, annot=True, cmap="tab20c", fmt="d", cbar=True)
plt.xlabel("Predicted_Label")
plt.ylabel("Truth_Label")
plt.title("Confusion Matrix - Random Forest After Oversampling")
# Render explicitly, as the class-balance plot earlier in this script does.
plt.show()

print("Evaluation of Random Forest Model")
print()
metrics(test_Y, predictions_resampled)