import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
sns.set_theme(context = 'notebook', style='darkgrid',palette='muted')
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, recall_score, f1_score
import xgboost
import warnings
warnings.filterwarnings("ignore")
# Show every column when a frame is displayed, then load the CSV.
pd.set_option('display.max_columns', None)
data = pd.read_csv('/work/creditcard.csv')
data.head()

# Annotated heatmap of the pairwise column correlations.
corr = data.corr()
plt.figure(figsize=(30, 15))
sns.heatmap(data=corr, annot=True, fmt='.2f', cmap='coolwarm')
plt.show()
# Distribution of every plotted column on a 6x5 grid: V1..V28 first, then
# Time and Amount, in the same panel order as before.
dist_columns = [f'V{i}' for i in range(1, 29)] + ['Time', 'Amount']

plt.figure(figsize=(30, 22), dpi=150)
for panel, column in enumerate(dist_columns, start=1):
    plt.subplot(6, 5, panel)
    # sns.distplot is deprecated; histplot with a KDE overlay and density
    # normalisation is its documented replacement.  bins=50 caps the bin
    # count so heavy-tailed columns do not produce an excessive number of bars.
    sns.histplot(data[column], kde=True, stat='density', bins=50)
plt.suptitle("Distribution of Data column wise", fontsize=20)
plt.show()
# Regression plot of every column against Amount, split by Class.
# sns.lmplot is a figure-level function that creates its own figure on each
# call, so resizing the "current" figure beforehand only produced a stray
# empty figure; that call is dropped.
for col in data.columns:
    sns.lmplot(x=col, y='Amount', hue='Class', data=data, line_kws={"color": "red"})
    # Show each figure as soon as it is built instead of accumulating one
    # open figure per column before a single show().
    plt.show()
# Scatter of Amount against the row index, coloured by the Class column.
fig = px.scatter(
    data_frame=data,
    x='Amount',
    y=data.index,
    color=data['Class'],
    title='Distribution of Amount in respect of class',
)
axis_titles = {'xaxis_title': 'Transaction Amount', 'yaxis_title': 'Transactions'}
fig.update_layout(**axis_titles)
fig.show()
# Pie chart of the class balance.
# The counts are looked up by class label (0 then 1) so that the hard-coded
# names always line up with the right slice; relying on value_counts()'s
# frequency ordering only works while class 0 happens to be the majority.
class_counts = data.Class.value_counts().reindex([0, 1], fill_value=0)
fig = px.pie(values=class_counts.values,
             names=['Genuine', 'Fraud'],
             title='Genuine vs Fraud Transactions in Data')
fig.show()
# Scatter of Amount against row index for the rows where Class == 1.
fraud_rows = data.query("Class==1")
fig = px.scatter(
    fraud_rows,
    x='Amount',
    y=fraud_rows.index,
    title='Distribution of Fraud transactions Amounts',
)
fig.update_layout(
    xaxis_title='Transaction Amount',
    yaxis_title='Transactions',
)
fig.show()
# Discard the Time column, then separate the Class label from the predictors.
data = data.drop(columns=['Time'])
target = data['Class']
features = data.drop(columns=['Class'])

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=.3,
    random_state=7,
)
# Report the shape of each split and its share of the combined row count.
separator = '-----------------------------------'
print("Data Size after Splitting : ")
print(separator)
for axis_name, train_part, test_part in (('X', X_train, X_test), ('Y', y_train, y_test)):
    combined = len(train_part) + len(test_part)
    train_share = round((len(train_part) / combined) * 100)
    test_share = round((len(test_part) / combined) * 100)
    print(f'{axis_name} Train size : ', train_part.shape, ' :', "%s%%" % train_share)
    print(f'{axis_name} Test size : ', test_part.shape, ' :', "%s%%" % test_share)
    print(separator)
# Standardise Amount.  The scaler is fitted on the training split only and the
# same fitted transform is then applied to the test split; previously the test
# split kept its raw Amount values, so the models were evaluated on a feature
# measured on a different scale from the one they were trained on.
ss = StandardScaler()
# Work on explicit copies so the column assignment below does not write into
# a slice of the original frame.
X_train = X_train.copy()
X_test = X_test.copy()
X_train['Amount'] = ss.fit_transform(X_train.Amount.values.reshape(-1, 1)).ravel()
X_test['Amount'] = ss.transform(X_test.Amount.values.reshape(-1, 1)).ravel()

# Box plot of the training features after scaling.
X_train.plot.box(figsize=(20, 5), rot=90)
plt.show()

target.value_counts() # 0 = Genuine Transactions | 1 = Fraud

# Oversample the training split only.  The seed matches the random_state used
# for the split and the models so that repeated runs give the same data.
X_train, y_train = SMOTE(random_state=7).fit_resample(X_train, y_train)
y_train.value_counts()
# Same size report as after the split, now reflecting the resampled training data.
separator = '-----------------------------------'
print("Data Size after Oversampling : ")
print(separator)
for axis_name, train_part, test_part in (('X', X_train, X_test), ('Y', y_train, y_test)):
    combined = len(train_part) + len(test_part)
    train_share = round((len(train_part) / combined) * 100)
    test_share = round((len(test_part) / combined) * 100)
    print(f'{axis_name} Train size : ', train_part.shape, ' :', "%s%%" % train_share)
    print(f'{axis_name} Test size : ', test_part.shape, ' :', "%s%%" % test_share)
    print(separator)
# Every estimator below is fitted on the training split and then asked for
# hard class predictions on the test features.  fit() returns the estimator
# itself, so fitting and predicting are chained.

# Logistic regression
lr = LogisticRegression(max_iter=150)
y_predictions_lr = lr.fit(X_train, y_train).predict(X_test)

# Random forest
rf = RandomForestClassifier(n_estimators=100, random_state=7)
y_predictions_rf = rf.fit(X_train, y_train).predict(X_test)

# Decision tree
dt = DecisionTreeClassifier(random_state=7)
y_predictions_dt = dt.fit(X_train, y_train).predict(X_test)

# AdaBoost
ab = AdaBoostClassifier(n_estimators=100, random_state=7)
y_predictions_ab = ab.fit(X_train, y_train).predict(X_test)

# Gradient boosting
gb = GradientBoostingClassifier(n_estimators=100, random_state=7)
y_predictions_gb = gb.fit(X_train, y_train).predict(X_test)

# XGBoost with its default settings
xg = xgboost.XGBClassifier()
y_predictions_xg = xg.fit(X_train, y_train).predict(X_test)

# Single-nearest-neighbour classifier
kn = KNeighborsClassifier(n_neighbors=1)
y_predictions_kn = kn.fit(X_train, y_train).predict(X_test)
def _score_row(y_true, y_pred):
    """Return [accuracy, precision, recall, F1, AUROC] for one set of predictions.

    All five values are reported as fractions in [0, 1].  AUROC used to be
    multiplied by 100 while the other four were not, which put the rows of
    the comparison table on different scales.

    NOTE: AUROC is computed from hard class predictions, as before, not from
    predicted probabilities.
    """
    return [
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        recall_score(y_true, y_pred),
        f1_score(y_true, y_pred),
        roc_auc_score(y_true, y_pred),
    ]


# Row labels, in the same order as the values returned by _score_row.
met = ['Accuracy','Precision','Recall','F1_score','AUROC']

lgr = _score_row(y_test, y_predictions_lr)
rfc = _score_row(y_test, y_predictions_rf)
dtc = _score_row(y_test, y_predictions_dt)
abc = _score_row(y_test, y_predictions_ab)
gbc = _score_row(y_test, y_predictions_gb)
xgb = _score_row(y_test, y_predictions_xg)
knn = _score_row(y_test, y_predictions_kn)

# One column per model, one row per metric.
dic = {
    "Metric": met,
    'Logistic Regression': lgr,
    'Random Forest Classifier': rfc,
    'Decision Tree Classifier': dtc,
    'Ada Boost Classifier': abc,
    'Gradient Boosting Classifier': gbc,
    'xgboost': xgb,
    'K-Neighbors Classifier': knn,
}
pd.DataFrame(dic)
# Confusion matrix for every model, computed against the test labels.
confusion_matrix_lr = confusion_matrix(y_test, y_predictions_lr)
confusion_matrix_rf = confusion_matrix(y_test, y_predictions_rf)
confusion_matrix_dt = confusion_matrix(y_test, y_predictions_dt)
confusion_matrix_ab = confusion_matrix(y_test, y_predictions_ab)
confusion_matrix_gb = confusion_matrix(y_test, y_predictions_gb)
confusion_matrix_xg = confusion_matrix(y_test, y_predictions_xg)
confusion_matrix_kn = confusion_matrix(y_test, y_predictions_kn)

# One heatmap figure per model.  The former leading
# plt.figure(figsize=(30, 18), dpi=200) call is dropped: nothing was ever drawn
# on it, so it only produced a large blank figure.
labelled_matrices = (
    ('Logistic Regression', confusion_matrix_lr),
    ('Random Forest', confusion_matrix_rf),
    ('Decision Tree', confusion_matrix_dt),
    ('Ada Boost', confusion_matrix_ab),
    ('Gradient Boosting', confusion_matrix_gb),
    ('XGBoost', confusion_matrix_xg),
    ('K-Nearest Neighbour', confusion_matrix_kn),
)
for model_label, matrix in labelled_matrices:
    plt.figure(figsize=(10,7))
    ax = plt.subplot()
    sns.heatmap(matrix, annot=True, fmt='g', ax=ax)
    ax.set_xlabel('Predicted Values')
    ax.set_ylabel('Actual Values')
    ax.set_title('Confusion Matrix - ' + model_label)
    ax.xaxis.set_ticklabels(['Genuine','Fraud'])
    ax.yaxis.set_ticklabels(['Genuine','Fraud'])
# A single show() at the end displays all of the figures built above.
plt.show()