!python -m pip install -q --upgrade pip
!pip install -q -r requirements.txt
# Miscellaneous
from __future__ import print_function
from IPython.display import display
# Importing required libraries
import pandas as pd
import numpy as np
# For interactive graphics
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio  # needed below for pio.templates
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from ydata_profiling import ProfileReport
from yellowbrick.classifier import ClassPredictionError
from tqdm.notebook import trange, tqdm
# Sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, precision_score, f1_score, RocCurveDisplay, confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score, recall_score
from sklearn.model_selection import train_test_split, LearningCurveDisplay, ShuffleSplit
from imblearn.pipeline import Pipeline
## Algorithms for Binary Classification
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier
## Visualization of Decision Tree
from sklearn.tree import plot_tree, export_text
# SciPy robust methods for outlier treatment
from scipy.stats import gmean, gstd, describe
from scipy.stats.mstats import winsorize
# Magic functions
%load_ext autoreload
%autoreload 2
%run "template_visualitation.ipynb" # Export function pandas_title()
%run "pandas-missing-extension.ipynb" # Export missing functions from the API of Pandas library
%%time
# One-time conversion: parse the CSV and cache it in the Feather format
df_data = pd.read_csv('creditcard.csv')
df_data.to_feather("creditcard.feather")
%%time
# Reload the cached Feather file (much faster than re-parsing the CSV)
df_data = pd.read_feather("creditcard.feather")
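# Quick sanity check (an added sketch, not part of the original notebook):
# compare the on-disk size of the CSV and the Feather cache created above.
import os
print(f"CSV: {os.path.getsize('creditcard.csv') / 1024**2:.1f} MB | "
      f"Feather: {os.path.getsize('creditcard.feather') / 1024**2:.1f} MB")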
pandas_title(df_data.describe().iloc[:, :15].style.format('{:.5f}').applymap(above_zero), 'Describe Data: First half')
pandas_title(df_data.describe().iloc[:, 15:].style.format('{:.5f}').applymap(above_zero), 'Describe Data: Second half')
memory_usage = df_data.memory_usage(deep=True) / 1024 ** 2
memory_usage.loc['total'] = memory_usage.sum()
memory_usage = memory_usage.to_frame(name="memory usage of variable (MB)")
pandas_title(memory_usage, 'memory usage of features', True).bar(subset=["memory usage of variable (MB)",], color='#ee1f5f', axis=0).format("{:.6f} MB")
def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} MB ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df
df_data = reduce_memory_usage(df_data, verbose=True)
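# Note (an added sketch, not from the original notebook): float16 keeps only
# ~3 significant decimal digits, so the downcast above introduces rounding
# error. Quantify it on 'Amount' against the cached Feather copy:
df_orig = pd.read_feather("creditcard.feather")
amount_err = (df_orig["Amount"] - df_data["Amount"].astype("float64")).abs().max()
print(f"Max absolute error introduced in 'Amount': {amount_err:.4f}")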
df_data.info(memory_usage="deep")  # info() prints its summary and returns None, so no DataFrame wrapper is needed
df_data_processing = df_data.copy()
pandas_title(df_data_processing.missing.missing_variable_summary().set_index('variable').T.style.format('{:.0f}'), 'Quantity and percentage of Missing Values')
pandas_title(df_data_processing[df_data_processing.duplicated()].head(5).style.format('{:.4f}'), 'Duplicated rows (first 5)')
df_data_processing.duplicated().value_counts()
df_data_processing.drop_duplicates(inplace=True)
pandas_title(df_data_processing.nunique().to_frame(name="Quantity").T, 'Unique values')
df_data_processing = reduce_memory_usage(df_data_processing, verbose=True)
profile = ProfileReport(
    df_data_processing,
    title="Pandas Profiling Report",
    html={"style": {"primary_color": "#FA0087"}},
    minimal=True,
)
profile
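# Optionally persist the profiling report for sharing; to_file() is part of
# the ydata_profiling API (the output filename here is our own choice).
profile.to_file("pandas_profiling_report.html")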
labels, counts = np.unique(df_data_processing.Class, return_counts=True)
fig = go.Figure(data=[go.Pie(labels=['Legal', 'Fraudulent'], values=counts, insidetextorientation='radial', hole=.4)])
fig.update_traces(textposition='auto', textinfo='percent+label')
# fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig.update_layout(
    title_text="Percentage analysis of the predictor variable",
    title_y=0.98,
    # Add an annotation in the center of the donut pie.
    annotations=[dict(text='Transactions', x=0.5, y=0.5, font_size=20, showarrow=False)])
fig.show()
ax = sns.boxplot(data=df_data_processing, y='Amount', x='Class', hue="Class", dodge=False)
ax.set_xticklabels(["Legal", "Fraud"])
ax.set_xlabel("Transaction type")
plt.title("Distribution of the Amount variable", fontsize=24, fontdict={"weight": "bold"})
plt.show()
report_fraud = describe(df_data_processing['Amount'].astype("float32")[(df_data_processing['Class'] == 1)])._asdict()
report_legal = describe(df_data_processing['Amount'].astype("float32")[(df_data_processing['Class'] == 0)])._asdict()
report_amount = pd.DataFrame([report_legal, report_fraud], columns=report_legal.keys(), index=['Legal', 'Fraud'])
pandas_title(report_amount, 'Describe variable Amount')
print(f'Geometric mean of positive Amount values = {gmean(df_data_processing["Amount"][(df_data_processing["Amount"] > 0)].astype("float32"))}')
print(f'Geometric standard deviation of positive Amount values = {gstd(df_data_processing["Amount"][(df_data_processing["Amount"] > 0)].astype("float32"))}\n\n')
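# The geometric mean is exp(mean(log(x))); a quick equivalence check
# (an added sketch, not part of the original notebook):
amt = df_data_processing["Amount"][df_data_processing["Amount"] > 0].astype("float64")
print(np.isclose(gmean(amt), np.exp(np.log(amt).mean())))  # expected: True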
# Apply winsorization to cap the top 0.2% of Amount values (upper-tail outliers)
amount_winsorize = winsorize(df_data_processing['Amount'].astype("float32"), limits=[0, 0.002])
# Comparative report with the arithmetic mean and variance
report_amount_winsorize = describe(amount_winsorize)._asdict()
report_amount_winsorize = pd.DataFrame([report_amount_winsorize], columns=report_amount_winsorize.keys(), index=['Amount winsorize'])
pandas_title(report_amount_winsorize, 'Describe amount_winsorize')
print('\n')
print(f'Geometric mean of winsorized Amount (nonzero values) = {gmean(amount_winsorize.data[amount_winsorize.data != 0])}')
print(f'Geometric standard deviation of winsorized Amount (nonzero values) = {gstd(amount_winsorize.data[amount_winsorize.data != 0])}')
df_data_processing["Amount"] = amount_winsorize.data
report_fraud = describe(df_data_processing['Amount'].astype("float32")[(df_data_processing['Class'] == 1)])._asdict()
report_legal = describe(df_data_processing['Amount'].astype("float32")[(df_data_processing['Class'] == 0)])._asdict()
report_amount = pd.DataFrame([report_legal, report_fraud], columns=report_legal.keys(), index=['Legal', 'Fraud'])
pandas_title(report_amount, 'Describe variable Amount processed')
ax = sns.boxplot(data=df_data_processing, y='Amount', x='Class', hue="Class", dodge=False)
ax.set_xticklabels(["Legal", "Fraud"])
ax.set_xlabel("Transaction type")
plt.title("Distribution of the winsorized Amount variable", fontsize=24, fontdict={"weight": "bold"})
plt.show()
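# A minimal toy sketch (illustration only) of what winsorize(limits=[0, 0.002])
# does: the top 0.2% of values are replaced by the next-highest remaining value.
toy = np.arange(1, 1001, dtype=float)       # 1, 2, ..., 1000
capped = winsorize(toy, limits=[0, 0.002])  # caps the 2 largest values
print(toy.max(), capped.max())              # 1000.0 vs 998.0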
data_corre = df_data_processing.corr()['Class'].sort_values(ascending=True).reset_index().iloc[:-1]
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=data_corre['index'],
        y=data_corre['Class'], opacity=0.8,
        marker=dict(cmax=1, cmin=-1, color=data_corre['Class'], showscale=True, colorbar={"title": "Correlation"})))
fig.update_xaxes(tickangle=60, tickfont=dict(size=10), automargin='height')
fig.update_layout(title_text='Correlation of each feature with the Class variable', xaxis_title="Feature", yaxis_title="Correlation")
fig.show()
data_corre = df_data_processing.corr()['Amount'].sort_values(ascending=True).reset_index().iloc[:-1]
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=data_corre['index'],
        y=data_corre['Amount'], opacity=0.8,
        marker=dict(cmax=1, cmin=-1, color=data_corre['Amount'], showscale=True, colorbar={"title": "Correlation"})))
fig.update_xaxes(tickangle=60, tickfont=dict(size=10), automargin='height')
fig.update_layout(title_text='Correlation of each feature with the Amount variable', xaxis_title="Feature", yaxis_title="Correlation")
fig.show()
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 16, 'color': 'white'}
fig = px.histogram(df_data_processing, x="Time", color="Class", opacity=0.8, barmode='overlay')
fig.update_layout(
    title="Histogram of Time",
)
fig.show()
fig = px.histogram(df_data_processing, x="Amount", color="Class", opacity=0.8, barmode='overlay', nbins=200)
fig.update_layout(
    title="Histogram of Amount",
)
fig.show()
scaler = StandardScaler()
df_data_scaled = scaler.fit_transform(df_data_processing)
df_data_scaled = pd.DataFrame(df_data_scaled, columns = df_data_processing.columns)
X = df_data_scaled.drop(['Class'], axis=1)
y = df_data_processing['Class']
sm = SMOTE(random_state=42)
X, y = sm.fit_resample(X, y)
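# Sanity check (an added sketch): after SMOTE both classes should have equal counts.
print(pd.Series(y).value_counts())
# Methodological caveat: oversampling before train_test_split lets synthetic
# samples leak into the test fold. A common alternative (a sketch, not this
# notebook's method) keeps SMOTE inside the imblearn Pipeline imported above,
# so resampling happens on the training folds only:
# pipe = Pipeline([('smote', SMOTE(random_state=42)), ('clf', LogisticRegression(max_iter=1000))])
# pipe.fit(X_train, y_train)  # SMOTE resamples only the training data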
def matrix_confusion_plotly(cm, name):
    pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 16, 'color': 'white'}
    x_matrix = ['False', 'True']
    y_matrix = ['True', 'False']
    z_text = [['TN', 'FP'],
              ['FN', 'TP']]
    # Display element name and prediction on hover
    hover_text = [['True Neg', 'False Pos'],
                  ['False Neg', 'True Pos']]
    hover = []
    for _ in range(len(z_text)):
        hover.append([i + '<br>' + 'Quantity prediction: ' + str(j) + '<br>' + 'Percentage data in each quadrant: ' + str(round(j / np.sum(cm) * 100, 2)) + '%'
                      for i, j in zip(hover_text[_], cm[_])])
    fig = ff.create_annotated_heatmap(cm[::-1], x=x_matrix, y=y_matrix, annotation_text=z_text[::-1], text=hover[::-1], hoverinfo='text', colorscale=px.colors.diverging.RdBu[::-1], showscale=True)
    fig.update_layout(title_text=f'<i><b>Confusion matrix for {name}</b></i>')
    # Add a custom x-axis title
    fig.add_annotation(dict(font=dict(color="white", size=14),
                            x=0.5,
                            y=-0.12,
                            showarrow=False,
                            text="Predicted value",
                            xref="paper",
                            yref="paper"))
    # Add a custom y-axis title
    fig.add_annotation(dict(font=dict(color="white", size=14),
                            x=-0.08,
                            y=0.5,
                            showarrow=False,
                            text="Real value",
                            textangle=-90,
                            xref="paper",
                            yref="paper"))
    fig.update_layout(width=560, height=560)
    fig.show()
import warnings
warnings.filterwarnings("ignore")
# Cross-validation strategy: 3 random shuffle splits with an 80/20 train/test ratio
cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=0)
# Grid for classifier
reg_classifier = {
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
}
params = {
    'LogisticRegression':
        {
            "C": np.logspace(-4, 4, 4),
            "penalty": ['l1', 'l2'],
            "solver": ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
            'max_iter': [1000],
            'random_state': [42],
            'n_jobs': [-1],
        },
    'RandomForestClassifier':
        {
            "criterion": ['entropy', 'gini'],
            "n_estimators": [64],
            "max_depth": [32],
            'random_state': [42],
            'n_jobs': [-1],
        },
    'DecisionTreeClassifier':
        {
            "criterion": ['entropy', 'gini'],
            "max_depth": [32],
            'random_state': [42],
        },
    'KNeighborsClassifier':
        {
            'n_neighbors': [3],
            'weights': ['distance'],
            'n_jobs': [-1],
            'metric': ['minkowski'],
            'p': [2],
        },
}
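# Note: not every solver above supports every penalty ('lbfgs', 'newton-cg',
# 'newton-cholesky' and 'sag' reject 'l1'); GridSearchCV scores those fits as
# NaN instead of aborting. A hedged sketch that avoids the invalid combinations
# by splitting the LogisticRegression grid into compatible sub-grids:
# params['LogisticRegression'] = [
#     {"C": np.logspace(-4, 4, 4), "penalty": ['l2'],
#      "solver": ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag'],
#      'max_iter': [1000], 'random_state': [42], 'n_jobs': [-1]},
#     {"C": np.logspace(-4, 4, 4), "penalty": ['l1', 'l2'],
#      "solver": ['liblinear', 'saga'],
#      'max_iter': [1000], 'random_state': [42], 'n_jobs': [-1]},
# ]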
# Grid for LearningCurveDisplay
common_params = {
    "X": X,
    "y": y,
    "train_sizes": np.linspace(0.2, 1.0, 5),
    "cv": cv,
    "score_type": "both",
    "n_jobs": -1,
    "line_kw": {"marker": "o"},
    "std_display_style": "fill_between",
    "score_name": "Accuracy",
    "random_state": 42,
}
# Dict and lists to store each model's scores
models_best = {}
models_names = []
test_scores = []
train_scores = []
###################################### Models deployment ###########################################
for name, reg in tqdm(reg_classifier.items()):
    grid_reg = GridSearchCV(reg, params.get(name), n_jobs=-1, cv=cv).fit(X, y)
    score = np.abs(grid_reg.best_score_)
    # Report the best estimator, score and parameters
    print(f'├──GridSearchCV of {name}:')
    print(f'\t├──Best Estimator: {grid_reg.best_estimator_}')
    print(f'\t├──Best CV Score on Train: {grid_reg.best_score_}')
    print(f'\t├──Best Params: {grid_reg.best_params_} \n\n')
    # Keep the best model for the comparative ROC curves
    models_best[name] = grid_reg.best_estimator_
    # Split the balanced 'X' and 'y'
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    # Train the best model and predict on the held-out split
    model = grid_reg.best_estimator_.fit(X_train, y_train)
    prediction = model.predict(X_test)
    # View the classification report with styled formatting
    df = pd.DataFrame(classification_report(y_test,
                                            prediction,
                                            digits=2,
                                            output_dict=True)).T
    df['support'] = df.support.apply(int)
    pandas_title(df.style.background_gradient(cmap='coolwarm',
                                              subset=pd.IndexSlice['0':'1', :'f1-score']), f'Report metrics of best estimator {name}')
    # Collect Test_Score and Train_Score for the comparison table of all models
    models_names.append(name)
    test_scores.append(model.score(X_test, y_test) * 100)
    train_scores.append(model.score(X_train, y_train) * 100)
    # Plot the confusion matrix with Plotly
    cm = confusion_matrix(y_test, prediction, labels=model.classes_)
    matrix_confusion_plotly(cm, name)
    # Class Prediction Error of the best estimator
    ## Instantiate the classification model and visualizer
    fig, ax = plt.subplots(figsize=(9, 7))
    visualizer = ClassPredictionError(grid_reg.best_estimator_, classes=['Legal', 'Fraud'])
    ## Fit the training data to the visualizer
    visualizer.fit(X_train, y_train)
    ## Evaluate the model on the test data
    visualizer.score(X_test, y_test)
    ## Draw the visualization
    visualizer.show()
    # Learning Curve visualization
    fig, ax = plt.subplots(figsize=(9, 9))
    LearningCurveDisplay.from_estimator(grid_reg.best_estimator_, **common_params, ax=ax)
    handles, label = ax.get_legend_handles_labels()
    ax.legend(handles[:2], ["Training Score", "Test Score"], fontsize=12)
    plt.title(f"Learning Curve for {name}", fontsize=24, fontdict={"weight": "bold"})
    plt.show()
    # # ROC Curve single
    # fig, ax = plt.subplots(figsize=(9, 9))
    # clf = grid_reg.best_estimator_.fit(X_train, y_train)
    # RocCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax)
    # plt.title(f"ROC Curve for {name}",
    #           fontsize=24, fontdict={"weight": "bold"})
    # plt.show()
    # Plot a decision tree (first two levels only)
    if name == 'DecisionTreeClassifier':
        mpl.rcParams['text.color'] = 'black'
        sns.set_style('darkgrid', {"grid.color": ".6", "grid.linestyle": ":"})
        fig, ax = plt.subplots(figsize=(12, 12), facecolor='k')
        plot_tree(grid_reg.best_estimator_, feature_names=list(X.columns), class_names=['Legal', 'Fraud'], filled=True, fontsize=12, max_depth=2, ax=ax)
        plt.title(f"Decision tree for {name}", color='white', fontsize=28, fontweight='bold')
        plt.show()
    print('\n' * 3, '_' * 85, '\n' * 3)
def RocCurves_plotly(models):
    # Create an empty figure and iteratively add a new line
    # for each fitted model
    fig = go.Figure()
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )
    for name, reg in models.items():
        # Fit the model and score the positive class
        reg.fit(X_train, y_train)
        y_score = reg.predict_proba(X_test)[:, 1]
        fpr, tpr, thresholds = roc_curve(y_test, y_score)
        auc_score = roc_auc_score(y_test, y_score)
        name = f"{name} (AUC={auc_score:.2f})"
        fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))
    fig.update_layout(
        title="ROC curves for all best classifiers",
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain='domain'),
    )
    fig.show()
# Plot ROC curves for the best classifiers
RocCurves_plotly(models_best)
dict_models_scores = {'Model': models_names, 'Test Score': test_scores, 'Train Score': train_scores}
df = pd.DataFrame(dict_models_scores)
# Bar chart comparing test and train scores for the best classifiers
fig = go.Figure()
fig.add_trace(go.Bar(
    y=df['Test Score'],
    x=df['Model'],
    name='Test Scores',
    marker_color='indianred'
))
fig.add_trace(go.Bar(
    y=df['Train Score'],
    x=df['Model'],
    name='Train Scores',
    marker_color='lightsalmon'
))
# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(barmode='group', xaxis_tickangle=-45, title="Comparison of test and train scores for the best classifiers")
fig.update_yaxes(tickfont=dict(family='Arial', size=14), automargin='height')
fig.update_xaxes(tickfont=dict(family='Arial', size=14), automargin='height')
fig.show()
# !pip install -q pipreqsnb
# !pipreqsnb '.' --force
# Edit requirements.txt to add any additional libraries you require