import os
import warnings

# TF_CPP_MIN_LOG_LEVEL must be set before TensorFlow is first imported to take effect
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
warnings.filterwarnings("ignore", category=DeprecationWarning)

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import optimizers
from tensorflow.keras.callbacks import EarlyStopping

from DeepClassifier import DeepClassifier, create_classifier
df = pd.read_csv('credit_risk_dataset.csv')
df
df["loan_status"].value_counts()
print(f"{int(100*df[df['loan_status']==1].shape[0] / df.shape[0])}% of Rows are Defaults")
df["person_emp_length"].fillna(df["person_emp_length"].median(), inplace=True)
df["loan_int_rate"].fillna(df["loan_int_rate"].median(), inplace=True)
df.drop_duplicates(inplace=True)
# Inspect any rows that still contain missing values
df[df.isnull().any(axis=1)]
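# Optional sanity check (not in the original flow): fail fast if any NaNs
# survived the imputation above.
assert df.isnull().sum().sum() == 0, "unexpected missing values remain"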
categorical_columns = df.columns[df.dtypes == 'object']
categorical_columns
# One-hot encode each categorical column; drop_first avoids the dummy-variable trap
for column in categorical_columns:
    df = pd.concat(
        [df.drop(columns=[column]), pd.get_dummies(df[column], prefix=column, drop_first=True)],
        axis=1,
    )
df
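# The same encoding fits in one call, since pd.get_dummies accepts a columns=
# argument; left commented out because running it after the loop above would
# attempt to encode columns that no longer exist:
# df = pd.get_dummies(df, columns=list(categorical_columns), drop_first=True)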
X = df.drop(columns=['loan_status', 'loan_percent_income'])
Y = df['loan_status']
# Numerical columns are everything except the one-hot dummies
# (pd.get_dummies yields uint8 in older pandas, bool in newer versions)
numerical_columns = X.select_dtypes(exclude=['uint8', 'bool']).columns
X.loc[:, numerical_columns]
# StandardScaler only on the numerical columns, to preserve the one-hot encoded variables
scaler = StandardScaler().fit(X.loc[:, numerical_columns])
X.loc[:, numerical_columns] = scaler.transform(X.loc[:, numerical_columns])
X
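# Caveat worth noting: the scaler is fit on all of X before the train/test
# split below, so test rows leak into the scaling statistics. A stricter
# variant (a sketch reusing the names defined further down) would be:
#   X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=7)
#   scaler = StandardScaler().fit(X_train.loc[:, numerical_columns])
#   X_train.loc[:, numerical_columns] = scaler.transform(X_train.loc[:, numerical_columns])
#   X_test.loc[:, numerical_columns] = scaler.transform(X_test.loc[:, numerical_columns])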
classifier = DeepClassifier(
    model=create_classifier,
    loss="binary_crossentropy",
    model__architecture={
        "Layers": ["Dense", "Dense"],
        "ActivationFunctions": ["relu", "sigmoid"],
        "Neurons": [50, 1],
    },
    callbacks=EarlyStopping,
    callbacks__monitor="val_loss",
    callbacks__min_delta=0.0001,
    callbacks__patience=20,
    callbacks__verbose=0,
    callbacks__restore_best_weights=True,
    verbose=0,
    epochs=100,
    train_ratio=0.8,
    val_ratio=0.2,
    batch_size_custom=32 * 8,
    fit__shuffle=True,
    optimizer=optimizers.Adam,
)
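# For orientation only: create_classifier ships with the DeepClassifier
# wrapper, so its real signature lives there. A minimal model factory
# consuming the architecture dict above might look like this sketch; the
# function name and signature are assumptions, and input shape plus
# compilation are assumed to be handled by the wrapper:
from tensorflow.keras import Sequential, layers

def build_from_architecture(architecture):
    """Stack Dense layers as described by the architecture dict."""
    model = Sequential()
    for layer_type, activation, units in zip(
        architecture["Layers"],
        architecture["ActivationFunctions"],
        architecture["Neurons"],
    ):
        if layer_type == "Dense":
            model.add(layers.Dense(units, activation=activation))
    return model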
param_grid = {
    "optimizer__learning_rate": [0.00001, 0.0001, 0.001],
    "loss": ["binary_crossentropy"],
    # Note: softmax over a single output unit is constant at 1.0, so the
    # softmax candidates below are degenerate and should lose to the sigmoid heads
    "model__architecture": [
        {
            "Layers": ["Dense", "Dense"],
            "ActivationFunctions": ["relu", "sigmoid"],
            "Neurons": [25, 1],
        },
        {
            "Layers": ["Dense", "Dense"],
            "ActivationFunctions": ["relu", "softmax"],
            "Neurons": [25, 1],
        },
        {
            "Layers": ["Dense", "Dense"],
            "ActivationFunctions": ["relu", "sigmoid"],
            "Neurons": [50, 1],
        },
        {
            "Layers": ["Dense", "Dense"],
            "ActivationFunctions": ["relu", "softmax"],
            "Neurons": [50, 1],
        },
        {
            "Layers": ["Dense", "Dense", "Dense"],
            "ActivationFunctions": ["relu", "relu", "sigmoid"],
            "Neurons": [50, 25, 1],
        },
        {
            "Layers": ["Dense", "Dense", "Dense"],
            "ActivationFunctions": ["relu", "relu", "softmax"],
            "Neurons": [50, 25, 1],
        },
    ],
}
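# Search size: 3 learning rates x 6 architectures = 18 candidates; with the
# 3-fold cross-validation below, that is 54 model fits in total.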
clf = GridSearchCV(
    refit=True,
    estimator=classifier,
    param_grid=param_grid,
    # Train candidate models in parallel
    n_jobs=-1,
    cv=3,
    verbose=10,
    # Macro-averaged F1 weights both classes equally, which matters given the
    # class imbalance observed above
    scoring='f1_macro',
)
# 80% train / 20% test; DeepClassifier splits the train set again internally
# (train_ratio/val_ratio above) to carve out a Keras validation set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=7)
# The wrapped network expects 3-D input of shape (samples, features, 1)
X_train = X_train.values.reshape((X_train.shape[0], X_train.shape[1], 1))
clf.fit(X_train, y_train)
# Collect the cross-validation results, best-ranked first
# (head(100) exceeds the 18 candidates, so all rows are kept)
grid_search_cv = (
    pd.DataFrame.from_dict(clf.cv_results_)
    .sort_values(by=["rank_test_score"], ascending=True)
    .head(100)
)
grid_search_cv.index = grid_search_cv.rank_test_score
grid_search_cv.to_csv("GridSearchResults.csv")
pd.read_csv('GridSearchResults.csv')
# Same 3-D reshape for the test set, then predict with the refit best model
X_test = X_test.values.reshape((X_test.shape[0], X_test.shape[1], 1))
y_pred = clf.best_estimator_.predict(X_test)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f"True Negative: {tn}\nFalse Positive: {fp}\nFalse Negative: {fn}\nTrue Positive: {tp}")
print("\n")
print(classification_report(y_test, y_pred, target_names=['Non-Default', 'Default']))
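# Per-class recalls derived from the confusion matrix above (a redundant
# sanity check against the classification report):
print(f"Default recall (sensitivity): {tp / (tp + fn):.3f}")
print(f"Non-default recall (specificity): {tn / (tn + fp):.3f}")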
print('Best Estimator')
print('-'*90)
print(clf.best_estimator_)
print('-'*90)
# Back to 2-D: the SHAP KernelExplainer below works on flat feature vectors
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1]))
import shap
warnings.filterwarnings('ignore')

# Wrapper so SHAP can call the 3-D-input Keras model on 2-D feature matrices;
# the flattened output is the model's prediction for each row
def f(X):
    X = X.reshape((X.shape[0], X.shape[1], 1))
    return clf.best_estimator_.predict(X).flatten()
# Background distribution: the first 100 test rows; explain 50 other rows
explainer = shap.KernelExplainer(f, X_test[:100, :])
shap_values = explainer.shap_values(X_test[300:350, :], nsamples=500)
plot = shap.force_plot(
    explainer.expected_value,
    shap_values,
    pd.DataFrame(X_test[300:350, :], columns=X.columns),
    matplotlib=False,
    show=False,
)
plot
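# The interactive force plot can also be persisted with shap's HTML writer
# (a side note; the filename here is arbitrary):
# shap.save_html("force_plot.html", plot)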
# Single-row view: shap_values[0] explains the first explained row, so pass
# that same row's features (X_test[300:301]) rather than an unrelated row
plot = shap.force_plot(
    explainer.expected_value,
    shap_values[0],
    pd.DataFrame(X_test[300:301, :], columns=X.columns),
    matplotlib=True,
    show=False,
)
plt.savefig('uni_row_plot.png')
plt.close()
# Reload the saved force plot image at a readable size
plt.figure(figsize=(20, 4))
img = mpimg.imread('uni_row_plot.png')
plt.imshow(img)
plt.axis('off')
plt.show()
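# A global view over the same explained slice: shap.summary_plot ranks
# features by SHAP value magnitude. Note this covers only the 50 rows
# explained above, so it is a rough picture rather than a full-test-set
# importance ranking.
shap.summary_plot(
    shap_values,
    pd.DataFrame(X_test[300:350, :], columns=X.columns),
)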