# Libraries and modules used throughout the script
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import itertools
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, recall_score, f1_score
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2, so this import raises ImportError on current releases.
# Nothing in the visible code calls it -- confirm it is unused before dropping
# it (ConfusionMatrixDisplay is the documented replacement).
from sklearn.metrics import plot_confusion_matrix
# Load the transactions data set from disk.
df = pd.read_csv("creditcard.csv")
df.head()
# Is there a missing value anywhere in the frame?
df.isna().to_numpy().any()
# there are no null values present in the dataset
# Descriptive statistics of the "Amount" column
df.Amount.describe()
# Overview of the frame: column count, non-null counts and data types
df.info()
# Number of rows per class (fraudulent vs. non-fraudulent)
df.Class.value_counts()
# Count the rows of each class and report the share of fraudulent ones.
# Summing a boolean mask counts its True entries, i.e. the matching rows.
non_fraud = int((df["Class"] == 0).sum())
fraud = int((df["Class"] == 1).sum())
fraud_percent = fraud / (fraud + non_fraud) * 100
print("Number of Genuine transactions: ", non_fraud)
print("Number of Fraud transactions: ", fraud)
print(f"Percentage of Fraud transactions: {fraud_percent:.4f}")
# Bar chart of the number of rows in each class.
import matplotlib.pyplot as plt
labels = ["Genuine", "Fraud"]
# Count per class, then order the bars by class value (0, 1) rather than by
# frequency: the tick labels below are assigned by position, so the bars must
# always come out as [class 0, class 1] for "Genuine"/"Fraud" to be correct.
# A class that does not occur at all is shown as an empty bar.
count_classes = df["Class"].value_counts().reindex([0, 1], fill_value=0)
count_classes.plot(kind="bar", rot=0)
plt.title("Visualization of Labels")
plt.ylabel("Count")
plt.xticks(range(2), labels)
plt.show()
# Standardise the transaction amount, drop the raw columns, separate features
# from the target and hold out 30% of the rows for testing.
import numpy as np
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# The scaler expects a 2-D input, hence the reshape to a single column.
df["NormalizedAmount"] = scaler.fit_transform(
    df["Amount"].to_numpy().reshape(-1, 1)
)
df.drop(columns=["Amount", "Time"], inplace=True)
Y = df["Class"]
X = df.drop(columns=["Class"])
from sklearn.model_selection import train_test_split
# Fixed random_state keeps the split identical between runs.
train_X, test_X, train_Y, test_Y = train_test_split(
    X, Y, test_size=0.3, random_state=42
)
print("Shape of train_X: ", train_X.shape)
print("Shape of test_X: ", test_X.shape)
# Build the Decision Tree and Random Forest classifiers, fit both on the
# training split and report their accuracy on the held-out test split.
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
# Both estimators are seeded, like the train/test split, so that the scores
# printed below are reproducible from one run to the next.
# Decision Tree
decision_tree = DecisionTreeClassifier(random_state=42)
# Random Forest
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
decision_tree.fit(train_X, train_Y)
predictions_dt = decision_tree.predict(test_X)
# score() is the mean accuracy as a fraction; scale it to a percentage.
decision_tree_score = decision_tree.score(test_X, test_Y) * 100
random_forest.fit(train_X, train_Y)
predictions_rf = random_forest.predict(test_X)
random_forest_score = random_forest.score(test_X, test_Y) * 100
print("Random Forest Score: ", random_forest_score)
print("Decision Tree Score: ", decision_tree_score)
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, recall_score, f1_score, ConfusionMatrixDisplay
def metrics(actuals, predictions):
    """Print accuracy, precision, recall and F1-score for a set of predictions.

    Args:
        actuals: Ground-truth class labels.
        predictions: Predicted class labels, aligned with ``actuals``.

    Returns:
        None. Each score is written to stdout with five decimal places.
    """
    print("Accuracy: {:.5f}".format(accuracy_score(actuals, predictions)))
    print("Precision: {:.5f}".format(precision_score(actuals, predictions)))
    print("Recall: {:.5f}".format(recall_score(actuals, predictions)))
    print("F1-score: {:.5f}".format(f1_score(actuals, predictions)))
# Confusion matrix of the Decision Tree on the test split.
# predictions_dt comes straight from predict(), i.e. it already holds class
# labels, so no rounding is needed and the matrix is computed only once.
from sklearn.metrics import confusion_matrix
confusion_matrix_dt = confusion_matrix(test_Y, predictions_dt)
print("Confusion Matrix - Decision Tree")
print(confusion_matrix_dt)
# Same matrix under the name used by the plotting code.
cm_plot = confusion_matrix_dt
cm_plot
# Plot the confusion matrix as a heatmap for easier reading.
import seaborn as sn
plt.figure(figsize=(10, 8))
# 'cmap' selects the colour palette; fmt='d' prints the cell counts as integers.
sn.heatmap(cm_plot, annot=True, cmap='icefire_r', fmt='d', cbar=True)
plt.xlabel('Predicted_Label')
plt.ylabel('Truth_Label')
plt.title('Confusion Matrix - Decision Tree')
# Render explicitly so the figure also appears when run outside a notebook.
plt.show()
# Evaluation of the Decision Tree model
print("Evaluation of Decision Tree Model")
print()
metrics(test_Y, predictions_dt)
# Confusion matrix of the Random Forest on the test split.
# predictions_rf comes straight from predict(), i.e. it already holds class
# labels, so no rounding is needed and the matrix is computed only once.
from sklearn.metrics import confusion_matrix
confusion_matrix_rf = confusion_matrix(test_Y, predictions_rf)
print("Confusion Matrix - Random Forest")
print(confusion_matrix_rf)
# Same matrix under the name used by the plotting code.
cm_plot = confusion_matrix_rf
cm_plot
# Plot the confusion matrix as a heatmap for easier reading.
import seaborn as sn
plt.figure(figsize=(10, 8))
# 'cmap' selects the colour palette; fmt='d' prints the cell counts as integers.
sn.heatmap(cm_plot, annot=True, cmap='tab20c', fmt='d', cbar=True)
plt.xlabel('Predicted_Label')
plt.ylabel('Truth_Label')
plt.title('Confusion Matrix - Random Forest')
# Render explicitly so the figure also appears when run outside a notebook.
plt.show()
# Evaluation of the Random Forest model
print("Evaluation of Random Forest Model")
print()
metrics(test_Y, predictions_rf)
# Install imblearn from inside the notebook.
# NOTE(review): the leading "!" is an IPython/Jupyter shell escape, not Python
# syntax -- this line only works in a notebook cell and is a SyntaxError when
# the file is run as a plain script.
!pip3 install imblearn
# Oversample the minority class with SMOTE and retrain the Random Forest.
from imblearn.over_sampling import SMOTE
# Split BEFORE resampling. SMOTE creates synthetic minority rows by
# interpolating between existing ones, so oversampling the whole data set first
# would leak information from the test rows into training and would score the
# model on synthetic rather than real transactions.
(train_X, test_X, train_Y, test_Y) = train_test_split(X, Y, test_size=0.3, random_state=42)
# Only the training split is oversampled; the test split stays untouched.
# SMOTE is seeded so the synthetic rows are the same on every run.
X_resampled, Y_resampled = SMOTE(random_state=42).fit_resample(train_X, train_Y)
print("Resampled shape of X: ", X_resampled.shape)
print("Resampled shape of Y: ", Y_resampled.shape)
value_counts = Counter(Y_resampled)
print(value_counts)
# train_X / train_Y keep referring to the oversampled data the model is fitted on.
train_X, train_Y = X_resampled, Y_resampled
# we use the Random Forest algorithm since it performed better than the Decision Tree algorithm
rf_resampled = RandomForestClassifier(n_estimators=100, random_state=42)
rf_resampled.fit(train_X, train_Y)
predictions_resampled = rf_resampled.predict(test_X)
# score() is the mean accuracy as a fraction; scale it to a percentage.
random_forest_score_resampled = rf_resampled.score(test_X, test_Y) * 100
# Confusion matrix of the Random Forest trained after oversampling.
# predictions_resampled comes straight from predict(), i.e. it already holds
# class labels, so no rounding is needed and the matrix is computed only once.
from sklearn.metrics import confusion_matrix
cm_resampled = confusion_matrix(test_Y, predictions_resampled)
print("Confusion Matrix - Random Forest")
print(cm_resampled)
# Same matrix under the name used by the plotting code.
cm_plot = cm_resampled
cm_plot
# Plot the confusion matrix as a heatmap for easier reading.
import seaborn as sn
plt.figure(figsize=(10, 8))
# 'cmap' selects the colour palette; fmt='d' prints the cell counts as integers.
sn.heatmap(cm_plot, annot=True, cmap='tab20c', fmt='d', cbar=True)
plt.xlabel('Predicted_Label')
plt.ylabel('Truth_Label')
plt.title('Confusion Matrix - Random Forest After Oversampling')
# Render explicitly so the figure also appears when run outside a notebook.
plt.show()
# Evaluation of the Random Forest model trained after oversampling
print("Evaluation of Random Forest Model")
print()
metrics(test_Y, predictions_resampled)