!python -m pip install -q --upgrade pip
!pip install -q -r requirements.txt
# Miscellaneous
from __future__ import print_function
from IPython.display import display
# Importing required libraries
import pandas as pd
import numpy as np
# For interactive graphics
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio  # needed below for pio.templates
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
from ydata_profiling import ProfileReport
from yellowbrick.classifier import ClassPredictionError
from tqdm.notebook import trange, tqdm
# Sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, roc_auc_score, precision_score, f1_score, RocCurveDisplay, confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score, recall_score
from sklearn.model_selection import train_test_split, LearningCurveDisplay, ShuffleSplit
from imblearn.pipeline import Pipeline
## Algorithms for Binary Classification
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier
## Visualization of Decision Tree
from sklearn.tree import plot_tree, export_text
# SciPy robust methods for outlier treatment
from scipy.stats import gmean, gstd, describe
from scipy.stats.mstats import winsorize
# Magic functions
%load_ext autoreload
%autoreload 2
%run "template_visualitation.ipynb" # Export function pandas_title()
%run "pandas-missing-extension.ipynb" # Export missing functions from the API of Pandas library
%%time
# One-time conversion: parse the CSV and cache it in the Feather format
df_data = pd.read_csv('creditcard.csv')
df_data.to_feather("creditcard.feather")
%%time
# Reload the cached Feather file (much faster than re-parsing the CSV)
df_data = pd.read_feather("creditcard.feather")
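# Quick sanity check (an added sketch, not part of the original notebook):
# compare the on-disk size of the CSV and the Feather cache created above.
import os
print(f"CSV: {os.path.getsize('creditcard.csv') / 1024**2:.1f} MB | "
      f"Feather: {os.path.getsize('creditcard.feather') / 1024**2:.1f} MB")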
pandas_title(df_data.describe().iloc[:, :15].style.format('{:.5f}').applymap(above_zero), 'Describe Data: First half')
pandas_title(df_data.describe().iloc[:, 15:].style.format('{:.5f}').applymap(above_zero), 'Describe Data: Second half')
memory_usage = df_data.memory_usage(deep=True) / 1024 ** 2
memory_usage.loc['total'] = memory_usage.sum()
memory_usage = memory_usage.to_frame(name="memory usage of variable (MB)")
pandas_title(memory_usage, 'memory usage of features', True).bar(subset=["memory usage of variable (MB)",], color='#ee1f5f', axis=0).format("{:.6f} MB")
def reduce_memory_usage(df, verbose=True):
    numerics = ["int8", "int16", "int32", "int64", "float16", "float32", "float64"]
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if (
                    c_min > np.finfo(np.float16).min
                    and c_max < np.finfo(np.float16).max
                ):
                    df[col] = df[col].astype(np.float16)
                elif (
                    c_min > np.finfo(np.float32).min
                    and c_max < np.finfo(np.float32).max
                ):
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print(
            "Mem. usage decreased to {:.2f} MB ({:.1f}% reduction)".format(
                end_mem, 100 * (start_mem - end_mem) / start_mem
            )
        )
    return df
df_data = reduce_memory_usage(df_data, verbose=True)
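# Note (an added sketch, not from the original notebook): float16 keeps only
# ~3 significant decimal digits, so the downcast above introduces rounding
# error. Quantify it on 'Amount' against the cached Feather copy:
df_orig = pd.read_feather("creditcard.feather")
amount_err = (df_orig["Amount"] - df_data["Amount"].astype("float64")).abs().max()
print(f"Max absolute error introduced in 'Amount': {amount_err:.4f}")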
df_data.info(memory_usage="deep")  # info() prints its summary and returns None, so no DataFrame wrapper is needed
df_data_processing = df_data.copy()
pandas_title(df_data_processing.missing.missing_variable_summary().set_index('variable').T.style.format('{:.0f}'), 'Quantity and percentage of Missing Values')
pandas_title(df_data_processing[df_data_processing.duplicated()].head(5).style.format('{:.4f}'), 'Duplicated rows (first 5)')
df_data_processing.duplicated().value_counts()
df_data_processing.drop_duplicates(inplace=True)
pandas_title(df_data_processing.nunique().to_frame(name="Quantity").T, 'Unique values')
df_data_processing = reduce_memory_usage(df_data_processing, verbose=True)
profile = ProfileReport(
    df_data_processing,
    title="Pandas Profiling Report",
    html={"style": {"primary_color": "#FA0087"}},
    minimal=True,
)
profile
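# Optionally persist the profiling report for sharing; to_file() is part of
# the ydata_profiling API (the output filename here is our own choice).
profile.to_file("pandas_profiling_report.html")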
labels, counts = np.unique(df_data_processing.Class, return_counts=True)
fig = go.Figure(data=[go.Pie(labels=['Legal', 'Fraudulent'], values=counts, insidetextorientation='radial', hole=.4)])
fig.update_traces(textposition='auto', textinfo='percent+label')
# fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig.update_layout(
    title_text="Percentage analysis of the predictor variable",
    title_y=0.98,
    # Add an annotation in the center of the donut pie.
    annotations=[dict(text='Transactions', x=0.5, y=0.5, font_size=20, showarrow=False)])
fig.show()
ax = sns.boxplot(data=df_data_processing, y='Amount', x='Class', hue="Class", dodge=False)
ax.set_xticklabels(["Legal", "Fraud"])
ax.set_xlabel("Transaction type")
plt.title("Distribution of the Amount variable", fontsize=24, fontdict={"weight": "bold"})
plt.show()
report_fraud = describe(df_data_processing['Amount'].astype("float32")[(df_data_processing['Class'] == 1)])._asdict()
report_legal = describe(df_data_processing['Amount'].astype("float32")[(df_data_processing['Class'] == 0)])._asdict()
report_amount = pd.DataFrame([report_legal, report_fraud], columns=report_legal.keys(), index=['Legal', 'Fraud'])
pandas_title(report_amount, 'Describe variable Amount')
print(f'Geometric mean of positive Amount values = {gmean(df_data_processing["Amount"][(df_data_processing["Amount"] > 0)].astype("float32"))}')
print(f'Geometric standard deviation of positive Amount values = {gstd(df_data_processing["Amount"][(df_data_processing["Amount"] > 0)].astype("float32"))}\n\n')
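# The geometric mean is exp(mean(log(x))); a quick equivalence check
# (an added sketch, not part of the original notebook):
amt = df_data_processing["Amount"][df_data_processing["Amount"] > 0].astype("float64")
print(np.isclose(gmean(amt), np.exp(np.log(amt).mean())))  # expected: True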
# Apply winsorization to cap the top 0.2% of Amount values (upper-tail outliers)
amount_winsorize = winsorize(df_data_processing['Amount'].astype("float32"), limits=[0, 0.002])
# Comparative report with the arithmetic mean and variance
report_amount_winsorize = describe(amount_winsorize)._asdict()
report_amount_winsorize = pd.DataFrame([report_amount_winsorize], columns=report_amount_winsorize.keys(), index=['Amount winsorize'])
pandas_title(report_amount_winsorize, 'Describe amount_winsorize')
print('\n')
print(f'Geometric mean of winsorized Amount (nonzero values) = {gmean(amount_winsorize.data[amount_winsorize.data != 0])}')
print(f'Geometric standard deviation of winsorized Amount (nonzero values) = {gstd(amount_winsorize.data[amount_winsorize.data != 0])}')
df_data_processing["Amount"] = amount_winsorize.data
report_fraud = describe(df_data_processing['Amount'].astype("float32")[(df_data_processing['Class'] == 1)])._asdict()
report_legal = describe(df_data_processing['Amount'].astype("float32")[(df_data_processing['Class'] == 0)])._asdict()
report_amount = pd.DataFrame([report_legal, report_fraud], columns=report_legal.keys(), index=['Legal', 'Fraud'])
pandas_title(report_amount, 'Describe variable Amount processed')
ax = sns.boxplot(data=df_data_processing, y='Amount', x='Class', hue="Class", dodge=False)
ax.set_xticklabels(["Legal", "Fraud"])
ax.set_xlabel("Transaction type")
plt.title("Distribution of the winsorized Amount variable", fontsize=24, fontdict={"weight": "bold"})
plt.show()
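# A minimal toy sketch (illustration only) of what winsorize(limits=[0, 0.002])
# does: the top 0.2% of values are replaced by the next-highest remaining value.
toy = np.arange(1, 1001, dtype=float)       # 1, 2, ..., 1000
capped = winsorize(toy, limits=[0, 0.002])  # caps the 2 largest values
print(toy.max(), capped.max())              # 1000.0 vs 998.0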
data_corre = df_data_processing.corr()['Class'].sort_values(ascending=True).reset_index().iloc[:-1]
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=data_corre['index'],
        y=data_corre['Class'], opacity=0.8,
        marker=dict(cmax=1, cmin=-1, color=data_corre['Class'], showscale=True, colorbar={"title": "Correlation"})))
fig.update_xaxes(tickangle=60, tickfont=dict(size=10), automargin='height')
fig.update_layout(title_text='Correlation of each feature with the Class variable', xaxis_title="Feature", yaxis_title="Correlation")
fig.show()
data_corre = df_data_processing.corr()['Amount'].sort_values(ascending=True).reset_index().iloc[:-1]
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=data_corre['index'],
        y=data_corre['Amount'], opacity=0.8,
        marker=dict(cmax=1, cmin=-1, color=data_corre['Amount'], showscale=True, colorbar={"title": "Correlation"})))
fig.update_xaxes(tickangle=60, tickfont=dict(size=10), automargin='height')
fig.update_layout(title_text='Correlation of each feature with the Amount variable', xaxis_title="Feature", yaxis_title="Correlation")
fig.show()
pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 16, 'color': 'white'}
fig = px.histogram(df_data_processing, x="Time", color="Class", opacity=0.8, barmode='overlay')
fig.update_layout(
    title="Histogram of Time",
)
fig.show()
fig = px.histogram(df_data_processing, x="Amount", color="Class", opacity=0.8, barmode='overlay', nbins=200)
fig.update_layout(
    title="Histogram of Amount",
)
fig.show()
scaler = StandardScaler()
df_data_scaled = scaler.fit_transform(df_data_processing)
df_data_scaled = pd.DataFrame(df_data_scaled, columns = df_data_processing.columns)
X = df_data_scaled.drop(['Class'], axis=1)
y = df_data_processing['Class']
sm = SMOTE(random_state=42)
X, y = sm.fit_resample(X, y)
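# Sanity check (an added sketch): after SMOTE both classes should have equal counts.
print(pd.Series(y).value_counts())
# Methodological caveat: oversampling before train_test_split lets synthetic
# samples leak into the test fold. A common alternative (a sketch, not this
# notebook's method) keeps SMOTE inside the imblearn Pipeline imported above,
# so resampling happens on the training folds only:
# pipe = Pipeline([('smote', SMOTE(random_state=42)), ('clf', LogisticRegression(max_iter=1000))])
# pipe.fit(X_train, y_train)  # SMOTE resamples only the training data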
def matrix_confusion_plotly(cm, name):
    pio.templates['new_template']['layout']['font'] = {'family': 'verdana', 'size': 16, 'color': 'white'}
    x_matrix = ['False', 'True']
    y_matrix = ['True', 'False']
    z_text = [['TN', 'FP'],
              ['FN', 'TP']]
    # Display element name and prediction on hover
    hover_text = [['True Neg', 'False Pos'],
                  ['False Neg', 'True Pos']]
    hover = []
    for _ in range(len(z_text)):
        hover.append([i + '<br>' + 'Quantity prediction: ' + str(j) + '<br>' + 'Percentage data in each quadrant: ' + str(round(j / np.sum(cm) * 100, 2)) + '%'
                      for i, j in zip(hover_text[_], cm[_])])
    fig = ff.create_annotated_heatmap(cm[::-1], x=x_matrix, y=y_matrix, annotation_text=z_text[::-1], text=hover[::-1], hoverinfo='text', colorscale=px.colors.diverging.RdBu[::-1], showscale=True)
    fig.update_layout(title_text=f'<i><b>Confusion matrix for {name}</b></i>')
    # Add a custom x-axis title
    fig.add_annotation(dict(font=dict(color="white", size=14),
                            x=0.5,
                            y=-0.12,
                            showarrow=False,
                            text="Predicted value",
                            xref="paper",
                            yref="paper"))
    # Add a custom y-axis title
    fig.add_annotation(dict(font=dict(color="white", size=14),
                            x=-0.08,
                            y=0.5,
                            showarrow=False,
                            text="Real value",
                            textangle=-90,
                            xref="paper",
                            yref="paper"))
    fig.update_layout(width=560, height=560)
    fig.show()
import warnings
warnings.filterwarnings("ignore")
# Cross-validation strategy: 3 random shuffle splits with an 80/20 train/test ratio
cv = ShuffleSplit(n_splits=3, test_size=0.2, random_state=0)
# Grid for classifier
reg_classifier = {
    'LogisticRegression': LogisticRegression(),
    'RandomForestClassifier': RandomForestClassifier(),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
}
params = {
    'LogisticRegression':
        {
            "C": np.logspace(-4, 4, 4),
            "penalty": ['l1', 'l2'],
            "solver": ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
            'max_iter': [1000],
            'random_state': [42],
            'n_jobs': [-1],
        },
    'RandomForestClassifier':
        {
            "criterion": ['entropy', 'gini'],
            "n_estimators": [64],
            "max_depth": [32],
            'random_state': [42],
            'n_jobs': [-1],
        },
    'DecisionTreeClassifier':
        {
            "criterion": ['entropy', 'gini'],
            "max_depth": [32],
            'random_state': [42],
        },
    'KNeighborsClassifier':
        {
            'n_neighbors': [3],
            'weights': ['distance'],
            'n_jobs': [-1],
            'metric': ['minkowski'],
            'p': [2],
        },
}
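# Note: not every solver above supports every penalty ('lbfgs', 'newton-cg',
# 'newton-cholesky' and 'sag' reject 'l1'); GridSearchCV scores those fits as
# NaN instead of aborting. A hedged sketch that avoids the invalid combinations
# by splitting the LogisticRegression grid into compatible sub-grids:
# params['LogisticRegression'] = [
#     {"C": np.logspace(-4, 4, 4), "penalty": ['l2'],
#      "solver": ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag'],
#      'max_iter': [1000], 'random_state': [42], 'n_jobs': [-1]},
#     {"C": np.logspace(-4, 4, 4), "penalty": ['l1', 'l2'],
#      "solver": ['liblinear', 'saga'],
#      'max_iter': [1000], 'random_state': [42], 'n_jobs': [-1]},
# ]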
# Grid for LearningCurveDisplay
common_params = {
    "X": X,
    "y": y,
    "train_sizes": np.linspace(0.2, 1.0, 5),
    "cv": cv,
    "score_type": "both",
    "n_jobs": -1,
    "line_kw": {"marker": "o"},
    "std_display_style": "fill_between",
    "score_name": "Accuracy",
    "random_state": 42,
}
# Dict and lists to store each model's scores
models_best = {}
models_names = []
test_scores = []
train_scores = []
###################################### Models deployment ###########################################
for name, reg in tqdm(reg_classifier.items()):
    grid_reg = GridSearchCV(reg, params.get(name), n_jobs=-1, cv=cv).fit(X, y)
    score = np.abs(grid_reg.best_score_)
    # Report the best estimator, score and parameters
    print(f'├──GridSearchCV of {name}:')
    print(f'\t├──Best Estimator: {grid_reg.best_estimator_}')
    print(f'\t├──Best CV Score on Train: {grid_reg.best_score_}')
    print(f'\t├──Best Params: {grid_reg.best_params_} \n\n')
    # Keep the best model for the comparative ROC curves
    models_best[name] = grid_reg.best_estimator_
    # Split the balanced 'X' and 'y'
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    # Train the best model and predict on the held-out split
    model = grid_reg.best_estimator_.fit(X_train, y_train)
    prediction = model.predict(X_test)
    # View the classification report with styled formatting
    df = pd.DataFrame(classification_report(y_test,
                                            prediction,
                                            digits=2,
                                            output_dict=True)).T
    df['support'] = df.support.apply(int)
    pandas_title(df.style.background_gradient(cmap='coolwarm',
                                              subset=pd.IndexSlice['0':'1', :'f1-score']), f'Report metrics of best estimator {name}')
    # Collect Test_Score and Train_Score for the comparison table of all models
    models_names.append(name)
    test_scores.append(model.score(X_test, y_test) * 100)
    train_scores.append(model.score(X_train, y_train) * 100)
    # Plot the confusion matrix with Plotly
    cm = confusion_matrix(y_test, prediction, labels=model.classes_)
    matrix_confusion_plotly(cm, name)
    # Class Prediction Error of the best estimator
    ## Instantiate the classification model and visualizer
    fig, ax = plt.subplots(figsize=(9, 7))
    visualizer = ClassPredictionError(grid_reg.best_estimator_, classes=['Legal', 'Fraud'])
    ## Fit the training data to the visualizer
    visualizer.fit(X_train, y_train)
    ## Evaluate the model on the test data
    visualizer.score(X_test, y_test)
    ## Draw the visualization
    visualizer.show()
    # Learning Curve visualization
    fig, ax = plt.subplots(figsize=(9, 9))
    LearningCurveDisplay.from_estimator(grid_reg.best_estimator_, **common_params, ax=ax)
    handles, label = ax.get_legend_handles_labels()
    ax.legend(handles[:2], ["Training Score", "Test Score"], fontsize=12)
    plt.title(f"Learning Curve for {name}", fontsize=24, fontdict={"weight": "bold"})
    plt.show()
    # # ROC Curve single
    # fig, ax = plt.subplots(figsize=(9, 9))
    # clf = grid_reg.best_estimator_.fit(X_train, y_train)
    # RocCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax)
    # plt.title(f"ROC Curve for {name}",
    #           fontsize=24, fontdict={"weight": "bold"})
    # plt.show()
    # Plot a decision tree (first two levels only)
    if name == 'DecisionTreeClassifier':
        mpl.rcParams['text.color'] = 'black'
        sns.set_style('darkgrid', {"grid.color": ".6", "grid.linestyle": ":"})
        fig, ax = plt.subplots(figsize=(12, 12), facecolor='k')
        plot_tree(grid_reg.best_estimator_, feature_names=list(X.columns), class_names=['Legal', 'Fraud'], filled=True, fontsize=12, max_depth=2, ax=ax)
        plt.title(f"Decision tree for {name}", color='white', fontsize=28, fontweight='bold')
        plt.show()
    print('\n' * 3, '_' * 85, '\n' * 3)
def RocCurves_plotly(models):
    # Create an empty figure and iteratively add a new line
    # for each fitted model
    fig = go.Figure()
    fig.add_shape(
        type='line', line=dict(dash='dash'),
        x0=0, x1=1, y0=0, y1=1
    )
    for name, reg in models.items():
        # Fit the model and score the positive class
        reg.fit(X_train, y_train)
        y_score = reg.predict_proba(X_test)[:, 1]
        fpr, tpr, thresholds = roc_curve(y_test, y_score)
        auc_score = roc_auc_score(y_test, y_score)
        name = f"{name} (AUC={auc_score:.2f})"
        fig.add_trace(go.Scatter(x=fpr, y=tpr, name=name, mode='lines'))
    fig.update_layout(
        title="ROC curves for all best classifiers",
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain='domain'),
    )
    fig.show()
# Plot ROC curves for the best classifiers
RocCurves_plotly(models_best)
dict_models_scores = {'Model': models_names, 'Test Score': test_scores, 'Train Score': train_scores}
df = pd.DataFrame(dict_models_scores)
# Bar chart comparing test and train scores for the best classifiers
fig = go.Figure()
fig.add_trace(go.Bar(
    y=df['Test Score'],
    x=df['Model'],
    name='Test Scores',
    marker_color='indianred'
))
fig.add_trace(go.Bar(
    y=df['Train Score'],
    x=df['Model'],
    name='Train Scores',
    marker_color='lightsalmon'
))
# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.update_layout(barmode='group', xaxis_tickangle=-45, title="Comparison of test and train scores for the best classifiers")
fig.update_yaxes(tickfont=dict(family='Arial', size=14), automargin='height')
fig.update_xaxes(tickfont=dict(family='Arial', size=14), automargin='height')
fig.show()
# !pip install -q pipreqsnb
# !pipreqsnb '.' --force
# Edit requirements.txt to add any additional libraries you require