# Credit card fraud detection

# Standard library
import warnings

# Third-party: data handling and plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

# Third-party: resampling, preprocessing, models and metrics
import xgboost
from imblearn.over_sampling import SMOTE
from sklearn import metrics
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Global plot styling for every seaborn/matplotlib figure below.
sns.set_theme(context='notebook', style='darkgrid', palette='muted')

# Silence all warnings for the rest of the run.
warnings.filterwarnings("ignore")

# Do not truncate columns when a frame is displayed.
pd.set_option('display.max_columns', None)

# Load the transactions and preview the first rows.
data = pd.read_csv('/work/creditcard.csv')
data.head()

# Pairwise correlation between every column of the data set.
corr = data.corr()

# Annotated heatmap of the correlation matrix on a single large axes.
corr_fig, corr_ax = plt.subplots(figsize=(30, 15))
sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm', ax=corr_ax)
plt.show()

# Columns in plotting order: V1..V28 first, then Time and Amount,
# filling a 6x5 grid of subplots.
distribution_columns = [f"V{i}" for i in range(1, 29)] + ["Time", "Amount"]

plt.figure(figsize=(30, 22), dpi=150)
for position, column in enumerate(distribution_columns, start=1):
    plt.subplot(6, 5, position)
    # histplot(kde=True, stat="density") is the supported replacement for the
    # deprecated sns.distplot; a fixed bin count keeps the number of bars
    # bounded on heavy-tailed columns.
    sns.histplot(data[column], kde=True, stat="density", bins=50)
plt.suptitle("Distribution of Data column wise", fontsize=20)
plt.show()

# Regression plot of Amount against every other feature, coloured by class.
# 'Amount' itself (the y variable) and 'Class' (the hue variable) are skipped
# because plotting them on the x axis carries no information.
# NOTE(review): the original resized plt.gcf() before the loop; lmplot creates
# its own figure, so that call only produced an extra empty figure and has
# been dropped. Use lmplot's height/aspect arguments if a size is wanted.
regression_columns = [col for col in data.columns if col not in ('Amount', 'Class')]

for col in regression_columns:
    sns.lmplot(x=col, y='Amount', hue='Class', data=data, line_kws={"color": "red"})
    # Show and release each figure in turn instead of keeping them all open.
    plt.show()
    plt.close()

# Transaction amount per row index, coloured by the Class column.
fig = px.scatter(
    data,
    x='Amount',
    y=data.index,
    color=data['Class'],
    title='Distribution of Amount in respect of class',
).update_layout(
    xaxis_title='Transaction Amount',
    yaxis_title='Transactions',
)
fig.show()

# Visualizing the Class distribution (0 = genuine, 1 = fraud).
# Counts are looked up by class label so each slice always gets the right
# name, instead of relying on value_counts() happening to list the genuine
# class first.
counts_by_class = data.Class.value_counts()
class_summary = pd.DataFrame({
    'Transaction': ['Genuine', 'Fraud'],
    'Count': [counts_by_class.get(0, 0), counts_by_class.get(1, 0)],
})

fig = px.pie(
    class_summary,
    values='Count',
    names='Transaction',
    title='Genuine vs Fraud Transactions in Data',
)
fig.show()

# Amounts of the fraudulent transactions only (rows where Class is 1),
# plotted against their original row index.
fraud_rows = data.query("Class==1")

fig = px.scatter(
    fraud_rows,
    x='Amount',
    y=fraud_rows.index,
    title='Distribution of Fraud transactions Amounts',
)
fig.update_layout(xaxis_title='Transaction Amount', yaxis_title='Transactions')
fig.show()

# The Time column is not used as a model input; remove it.
data = data.drop(columns='Time')

# Split the frame into the label column and the model inputs.
target = data['Class']
features = data.drop(columns='Class')

# 70/30 train/test split with a fixed seed. The split is stratified on the
# target so that both parts keep the same fraud ratio; without it the share of
# the rare fraud class in the test set is left to chance.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=7,
    stratify=target,
)

# Report the shape of each split and its rounded share of the rows,
# first for the feature frames and then for the label series.
separator = '-----------------------------------'

print("Data Size after Splitting : ")
print(separator)
for prefix, train_part, test_part in (('X', X_train, X_test), ('Y', y_train, y_test)):
    total_rows = len(train_part) + len(test_part)
    train_share = round((len(train_part) / total_rows) * 100)
    test_share = round((len(test_part) / total_rows) * 100)
    print(f'{prefix} Train size : ', train_part.shape, ' :', f"{train_share}%")
    print(f'{prefix} Test size : ', test_part.shape, ' :', f"{test_share}%")
    print(separator)

# Standardise the Amount column. The scaler is fitted on the training set
# only (so no test-set statistics leak into training) and the same fitted
# transform is then applied to the test set; previously the test Amount was
# left unscaled, so the models were evaluated on a different feature scale
# than they were trained on.
ss = StandardScaler()

# Work on explicit copies so the column assignment below does not write into
# a slice of the original frame.
X_train = X_train.copy()
X_test = X_test.copy()

X_train['Amount'] = ss.fit_transform(X_train['Amount'].values.reshape(-1, 1)).ravel()
X_test['Amount'] = ss.transform(X_test['Amount'].values.reshape(-1, 1)).ravel()

# Box plot of every training feature, with rotated labels on a wide figure.
X_train.plot(kind='box', figsize=(20, 5), rot=90)
plt.show()

# Class balance of the full data set: 0 = Genuine Transactions | 1 = Fraud
target.value_counts()

# Oversample the training set only with SMOTE. The seed is fixed (matching
# the seed used elsewhere in this file) so the resampled training set, and
# therefore every model trained on it, is reproducible between runs.
X_train, y_train = SMOTE(random_state=7).fit_resample(X_train, y_train)

# Class balance of the training labels after oversampling.
y_train.value_counts()

# Report split shapes again now that the training set has been oversampled.
# Shares are relative to the combined train + test row count.
rule = '-----------------------------------'
x_total = len(X_train) + len(X_test)
y_total = len(y_train) + len(y_test)

print("Data Size after Oversampling : ")
print(rule)
print('X Train size : ', X_train.shape, ' :', f"{round((len(X_train) / x_total) * 100)}%")
print('X Test size : ', X_test.shape, ' :', f"{round((len(X_test) / x_total) * 100)}%")
print(rule)
print('Y Train size : ', y_train.shape, ' :', f"{round((len(y_train) / y_total) * 100)}%")
print('Y Test size : ', y_test.shape, ' :', f"{round((len(y_test) / y_total) * 100)}%")
print(rule)

# Train every classifier on the training data. Each estimator's fit() returns
# the estimator itself, so construction and fitting are chained.
lr = LogisticRegression(max_iter=150).fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators=100, random_state=7).fit(X_train, y_train)
dt = DecisionTreeClassifier(random_state=7).fit(X_train, y_train)
ab = AdaBoostClassifier(n_estimators=100, random_state=7).fit(X_train, y_train)
gb = GradientBoostingClassifier(n_estimators=100, random_state=7).fit(X_train, y_train)
xg = xgboost.XGBClassifier().fit(X_train, y_train)
kn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

# Hard class predictions on the held-out test set, one array per model.
y_predictions_lr = lr.predict(X_test)  # Logistic Regression
y_predictions_rf = rf.predict(X_test)  # Random Forest
y_predictions_dt = dt.predict(X_test)  # Decision Tree
y_predictions_ab = ab.predict(X_test)  # Ada Boost
y_predictions_gb = gb.predict(X_test)  # Gradient Boosting
y_predictions_xg = xg.predict(X_test)  # xgboost
y_predictions_kn = kn.predict(X_test)  # K-Nearest Neighbours

met = ['Accuracy', 'Precision', 'Recall', 'F1_score', 'AUROC']


def _score_model(model, y_pred, X_eval, y_true):
    """Return [accuracy, precision, recall, F1, AUROC] for one fitted model.

    Accuracy, precision, recall and F1 are computed from the hard predictions
    ``y_pred``. AUROC is computed from the model's predicted probability of
    the positive class (column 1 of ``predict_proba``) rather than from the
    hard 0/1 predictions, because ROC AUC is a ranking metric and needs
    scores. All five values are reported on the same 0-1 scale; the previous
    code multiplied AUROC alone by 100.
    """
    positive_scores = model.predict_proba(X_eval)[:, 1]
    return [
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        roc_auc_score(y_true, positive_scores),
    ]


# One list of metric values per model, in the order given by `met`.
lgr = _score_model(lr, y_predictions_lr, X_test, y_test)
rfc = _score_model(rf, y_predictions_rf, X_test, y_test)
dtc = _score_model(dt, y_predictions_dt, X_test, y_test)
abc = _score_model(ab, y_predictions_ab, X_test, y_test)
gbc = _score_model(gb, y_predictions_gb, X_test, y_test)
xgb = _score_model(xg, y_predictions_xg, X_test, y_test)
knn = _score_model(kn, y_predictions_kn, X_test, y_test)

# Column-per-model comparison table.
dic = {
    "Metric": met,
    'Logistic Regression': lgr,
    'Random Forest Classifier': rfc,
    'Decision Tree Classifier': dtc,
    'Ada Boost Classifier': abc,
    'Gradient Boosting Classifier': gbc,
    'xgboost': xgb,
    'K-Neighbors Classifier': knn,
}

pd.DataFrame(dic)

# Confusion matrix of each model's test-set predictions
# (rows = actual class, columns = predicted class).
confusion_matrix_lr = confusion_matrix(y_test, y_predictions_lr)
confusion_matrix_rf = confusion_matrix(y_test, y_predictions_rf)
confusion_matrix_dt = confusion_matrix(y_test, y_predictions_dt)
confusion_matrix_ab = confusion_matrix(y_test, y_predictions_ab)
confusion_matrix_gb = confusion_matrix(y_test, y_predictions_gb)
confusion_matrix_xg = confusion_matrix(y_test, y_predictions_xg)
confusion_matrix_kn = confusion_matrix(y_test, y_predictions_kn)

labelled_matrices = (
    ('Logistic Regression', confusion_matrix_lr),
    ('Random Forest', confusion_matrix_rf),
    ('Decision Tree', confusion_matrix_dt),
    ('Ada Boost', confusion_matrix_ab),
    ('Gradient Boosting', confusion_matrix_gb),
    ('XGBoost', confusion_matrix_xg),
    ('K-Nearest Neighbour', confusion_matrix_kn),
)

# One annotated heatmap per model, each on its own figure. The original also
# opened a large 30x18 in, 200 dpi figure up front that nothing was ever drawn
# on; that empty figure is no longer created.
for model_name, matrix in labelled_matrices:
    plt.figure(figsize=(10, 7))
    ax = plt.subplot()
    sns.heatmap(matrix, annot=True, fmt='g', ax=ax)
    ax.set_xlabel('Predicted Values')
    ax.set_ylabel('Actual Values')
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.xaxis.set_ticklabels(['Genuine', 'Fraud'])
    ax.yaxis.set_ticklabels(['Genuine', 'Fraud'])

plt.show()