import os
import warnings

# TF_CPP_MIN_LOG_LEVEL must be set before TensorFlow is first imported to take effect
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
warnings.filterwarnings("ignore", category=DeprecationWarning)

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping

from DeepClassifier import DeepClassifier, create_classifier
df = pd.read_csv('credit_risk_dataset.csv')
df
df["loan_status"].value_counts()
print(f"{int(100*df[df['loan_status']==1].shape[0] / df.shape[0])}% of Rows are Defaults")
df["person_emp_length"].fillna(df["person_emp_length"].median(), inplace=True)
df["loan_int_rate"].fillna(df["loan_int_rate"].median(), inplace=True)
df.drop_duplicates(inplace=True)
# Inspect any rows that still contain missing values
df[df.isnull().any(axis=1)]
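# Optional sanity check (not in the original flow): fail fast if any NaNs
# survived the imputation above.
assert df.isnull().sum().sum() == 0, "unexpected missing values remain"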
categorical_columns = df.columns[df.dtypes == 'object']
categorical_columns
# One-hot encode each categorical column; drop_first avoids the dummy-variable trap
for column in categorical_columns:
    df = pd.concat(
        [df.drop(columns=[column]), pd.get_dummies(df[column], prefix=column, drop_first=True)],
        axis=1,
    )
df
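# The same encoding fits in one call, since pd.get_dummies accepts a columns=
# argument; left commented out because running it after the loop above would
# attempt to encode columns that no longer exist:
# df = pd.get_dummies(df, columns=list(categorical_columns), drop_first=True)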
X = df.drop(columns=['loan_status', 'loan_percent_income'])
Y = df['loan_status']
# Numerical columns are everything except the one-hot dummies
# (pd.get_dummies yields uint8 in older pandas, bool in newer versions)
numerical_columns = X.select_dtypes(exclude=['uint8', 'bool']).columns
X.loc[:, numerical_columns]
# StandardScaler only on the numerical columns, to preserve the one-hot encoded variables
scaler = StandardScaler().fit(X.loc[:, numerical_columns])
X.loc[:, numerical_columns] = scaler.transform(X.loc[:, numerical_columns])
X
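# Caveat worth noting: the scaler is fit on all of X before the train/test
# split below, so test rows leak into the scaling statistics. A stricter
# variant (a sketch reusing the names defined further down) would be:
#   X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=7)
#   scaler = StandardScaler().fit(X_train.loc[:, numerical_columns])
#   X_train.loc[:, numerical_columns] = scaler.transform(X_train.loc[:, numerical_columns])
#   X_test.loc[:, numerical_columns] = scaler.transform(X_test.loc[:, numerical_columns])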
classifier = DeepClassifier(
    model=create_classifier,
    loss="binary_crossentropy",
    model__architecture={
        "Layers": ["Dense", "Dense"],
        "ActivationFunctions": ["relu", "sigmoid"],
        "Neurons": [50, 1],
    },
    callbacks=EarlyStopping,
    callbacks__monitor="val_loss",
    callbacks__min_delta=0.0001,
    callbacks__patience=20,
    callbacks__verbose=0,
    callbacks__restore_best_weights=True,
    verbose=0,
    epochs=100,
    train_ratio=0.8,
    val_ratio=0.2,
    batch_size_custom=32 * 8,
    fit__shuffle=True,
    optimizer=optimizers.Adam,
)
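# For orientation only: create_classifier ships with the DeepClassifier
# wrapper, so its real signature lives there. A minimal model factory
# consuming the architecture dict above might look like this sketch; the
# function name and signature are assumptions, and input shape plus
# compilation are assumed to be handled by the wrapper:
from tensorflow.keras import Sequential, layers

def build_from_architecture(architecture):
    """Stack Dense layers as described by the architecture dict."""
    model = Sequential()
    for layer_type, activation, units in zip(
        architecture["Layers"],
        architecture["ActivationFunctions"],
        architecture["Neurons"],
    ):
        if layer_type == "Dense":
            model.add(layers.Dense(units, activation=activation))
    return model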
param_grid = {
    "optimizer__learning_rate": [0.00001, 0.0001, 0.001],
    "loss": ["binary_crossentropy"],
    # Note: softmax over a single output unit is constant at 1.0, so the
    # softmax candidates below are degenerate and should lose to the sigmoid heads
    "model__architecture": [
        {
            "Layers": ["Dense", "Dense"],
            "ActivationFunctions": ["relu", "sigmoid"],
            "Neurons": [25, 1],
        },
        {
            "Layers": ["Dense", "Dense"],
            "ActivationFunctions": ["relu", "softmax"],
            "Neurons": [25, 1],
        },
        {
            "Layers": ["Dense", "Dense"],
            "ActivationFunctions": ["relu", "sigmoid"],
            "Neurons": [50, 1],
        },
        {
            "Layers": ["Dense", "Dense"],
            "ActivationFunctions": ["relu", "softmax"],
            "Neurons": [50, 1],
        },
        {
            "Layers": ["Dense", "Dense", "Dense"],
            "ActivationFunctions": ["relu", "relu", "sigmoid"],
            "Neurons": [50, 25, 1],
        },
        {
            "Layers": ["Dense", "Dense", "Dense"],
            "ActivationFunctions": ["relu", "relu", "softmax"],
            "Neurons": [50, 25, 1],
        },
    ],
}
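# Search size: 3 learning rates x 6 architectures = 18 candidates; with the
# 3-fold cross-validation below, that is 54 model fits in total.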
clf = GridSearchCV(
    refit=True,
    estimator=classifier,
    param_grid=param_grid,
    # Train candidate models in parallel
    n_jobs=-1,
    cv=3,
    verbose=10,
    # Macro-averaged F1 weights both classes equally, which matters given the
    # class imbalance observed above
    scoring='f1_macro',
)
# 80% train / 20% test; DeepClassifier splits the train set again internally
# (train_ratio/val_ratio above) to carve out a Keras validation set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=7)
# The wrapped network expects 3-D input of shape (samples, features, 1)
X_train = X_train.values.reshape((X_train.shape[0], X_train.shape[1], 1))
clf.fit(X_train, y_train)
# Collect the cross-validation results, best-ranked first
# (head(100) exceeds the 18 candidates, so all rows are kept)
grid_search_cv = (
    pd.DataFrame.from_dict(clf.cv_results_)
    .sort_values(by=["rank_test_score"], ascending=True)
    .head(100)
)
grid_search_cv.index = grid_search_cv.rank_test_score
grid_search_cv.to_csv("GridSearchResults.csv")
pd.read_csv('GridSearchResults.csv')
# Same 3-D reshape for the test set, then predict with the refit best model
X_test = X_test.values.reshape((X_test.shape[0], X_test.shape[1], 1))
y_pred = clf.best_estimator_.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f"True Negative: {tn}\nFalse Positive: {fp}\nFalse Negative: {fn}\nTrue Positive: {tp}")
print("\n")
print(classification_report(y_test, y_pred, target_names=['Non-Default', 'Default']))
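# Per-class recalls derived from the confusion matrix above (a redundant
# sanity check against the classification report):
print(f"Default recall (sensitivity): {tp / (tp + fn):.3f}")
print(f"Non-default recall (specificity): {tn / (tn + fp):.3f}")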
print('Best Estimator')
print('-'*90)
print(clf.best_estimator_)
print('-'*90)
# Back to 2-D: the SHAP KernelExplainer below works on flat feature vectors
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1]))
import shap
warnings.filterwarnings('ignore')

# Wrapper so SHAP can call the 3-D-input Keras model on 2-D feature matrices;
# the flattened output is the model's prediction for each row
def f(X):
    X = X.reshape((X.shape[0], X.shape[1], 1))
    return clf.best_estimator_.predict(X).flatten()
# Background distribution: the first 100 test rows; explain 50 other rows
explainer = shap.KernelExplainer(f, X_test[:100, :])
shap_values = explainer.shap_values(X_test[300:350, :], nsamples=500)
plot = shap.force_plot(
    explainer.expected_value,
    shap_values,
    pd.DataFrame(X_test[300:350, :], columns=X.columns),
    matplotlib=False,
    show=False,
)
plot
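# The interactive force plot can also be persisted with shap's HTML writer
# (a side note; the filename here is arbitrary):
# shap.save_html("force_plot.html", plot)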
# Single-row view: shap_values[0] explains the first explained row, so pass
# that same row's features (X_test[300:301]) rather than an unrelated row
plot = shap.force_plot(
    explainer.expected_value,
    shap_values[0],
    pd.DataFrame(X_test[300:301, :], columns=X.columns),
    matplotlib=True,
    show=False,
)
plt.savefig('uni_row_plot.png')
plt.close()
# Reload the saved force plot image at a readable size
plt.figure(figsize=(20, 4))
img = mpimg.imread('uni_row_plot.png')
plt.imshow(img)
plt.axis('off')
plt.show()
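# A global view over the same explained slice: shap.summary_plot ranks
# features by SHAP value magnitude. Note this covers only the 50 rows
# explained above, so it is a rough picture rather than a full-test-set
# importance ranking.
shap.summary_plot(
    shap_values,
    pd.DataFrame(X_test[300:350, :], columns=X.columns),
)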