Data Pre-Processing:
- Missing Values Treatment: Numerical (Mean/Median imputation) and Categorical (Separate Missing Category or Merging)
- Univariate Analysis: Outlier and Frequency Analysis
The dataset contains some missing values, and upon closer examination, it was discovered that the missing values in the "default" column are associated with a new group of customers.
-> To address outliers in the dataset, we applied the technique of Winsorization.
-> We examined the distribution of defaulting versus non-defaulting customers to determine whether the dataset is balanced or imbalanced. Both pre-processing steps are sketched below.
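A minimal sketch of these pre-processing steps, assuming the data sits in a single DataFrame with a `default` column; the file name and the 1% winsorization limits are illustrative assumptions, not the notebook's originals:

```python
import pandas as pd
from scipy.stats.mstats import winsorize

df = pd.read_csv("bankloans.csv")  # hypothetical file name

# The missing 'default' values mark the 150 new customers (no repayment
# history yet), so we separate them instead of imputing.
existing = df[df["default"].notna()].copy()
new_customers = df[df["default"].isna()].copy()

# Winsorization: cap each numeric feature at the 1st and 99th percentiles
# (the limits are an assumption).
for col in existing.columns.drop("default"):
    existing[col] = winsorize(existing[col], limits=[0.01, 0.01])

# Class balance among the existing customers.
print(existing["default"].value_counts(normalize=True))
```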
Exploratory Data Analysis:
- Bivariate Analysis - Numeric (T-test) / Categorical (Chi-square)
- Bivariate Analysis - Visualization
- Variable Reduction - Multicollinearity
Bivariate Analysis:
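A sketch of these tests, reusing the `existing` frame from the pre-processing sketch above:

```python
from scipy import stats

# Two-sample t-test per numeric feature: defaulters vs non-defaulters.
defaulters = existing[existing["default"] == 1]
non_defaulters = existing[existing["default"] == 0]

for col in existing.columns.drop("default"):
    t_stat, p_val = stats.ttest_ind(defaulters[col], non_defaulters[col],
                                    equal_var=False)
    print(f"{col}: t = {t_stat:.2f}, p = {p_val:.4f}")

# For a categorical predictor (this data set has none), a chi-square test
# of independence on the cross-tab with 'default' would be used instead.
```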
Multicollinearity Check
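A typical VIF computation with statsmodels, again reusing the `existing` frame:

```python
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF per predictor; values above roughly 5-10 flag problematic collinearity.
X_vif = sm.add_constant(existing.drop(columns="default"))
for i, col in enumerate(X_vif.columns):
    if col != "const":
        print(f"{col}: VIF = {variance_inflation_factor(X_vif.values, i):.2f}")
```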
Observations:
- There are 850 observations and 9 features in the data set.
- All 9 features are numerical in nature.
- Apart from the "default" label, which is blank for the new customers, there are no missing values in the data set.
- Of the 850 customers, 700 are existing customers and 150 are new customers.
- Among the 700 existing customers, 517 are tagged as non-defaulters and the remaining 183 as defaulters.
- The data is imbalanced: defaulters make up roughly 26% of existing customers.
- The VIF check shows that the correlation between the variables is within acceptable limits.
Model Building and Model Diagnostics
Logistic Regression & Decision Tree Classification:
Train/Test Split: Splitting data into training and testing sets to check the model's performance on unseen data.
Variable Significance: Assessing significance of each variable using statistical tests to determine their association with the outcome variable.
Gini and ROC/Concordance: Measures of the model's ranking performance - the Gini coefficient (Gini = 2*AUC - 1) summarizes how well the model separates defaulters from non-defaulters, the ROC curve shows the sensitivity vs specificity trade-off across cutoffs, and concordance measures how often an actual defaulter receives a higher predicted probability than a non-defaulter.
Classification Table Accuracy: Evaluate model performance using a classification table to compare predicted vs actual values and calculate accuracy.
Decision Tree Classifier: Same analysis as logistic regression, with measures such as the Gini index, Chi-square, or information gain used to select significant split variables. ROC curves and concordance analysis are less direct when the tree outputs hard class labels, but can still be computed from its predicted class probabilities. A confusion matrix can also be used to evaluate model performance. A combined code sketch of these diagnostics follows.
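A minimal sketch of these steps, reusing the `existing` frame from the pre-processing sketch; the 70/30 split, random seed, and use of statsmodels are illustrative choices:

```python
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, accuracy_score

# Hold out 30% of existing customers for testing (ratio and seed are assumptions).
X = existing.drop(columns="default")
y = existing["default"].astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Variable significance: statsmodels reports a p-value per coefficient.
logit = sm.Logit(y_train, sm.add_constant(X_train)).fit(disp=0)
print(logit.summary())

# Gini and ROC: Gini = 2*AUC - 1.
test_probs = logit.predict(sm.add_constant(X_test))
auc = roc_auc_score(y_test, test_probs)
print(f"AUC = {auc:.3f}, Gini = {2 * auc - 1:.3f}")

# Classification table at the default 0.5 cutoff.
preds = (test_probs >= 0.5).astype(int)
print(confusion_matrix(y_test, preds))
print(f"Accuracy = {accuracy_score(y_test, preds):.3f}")
```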
Logistic Regression
Model Performance
Test dataset:
Recall measures the ratio of correctly classified positive examples to all actual positive examples: Recall = TP / (TP + FN). A high recall indicates that the model is correctly recognizing the positive examples in the dataset.
Precision measures the ratio of correctly classified positive examples to all predicted positive examples: Precision = TP / (TP + FP). A high precision indicates that an example labeled as positive by the model is indeed positive.
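As a quick check, scikit-learn's classification_report prints both metrics per class; this reuses `y_test` and `preds` from the diagnostics sketch above:

```python
from sklearn.metrics import classification_report

# Precision and recall per class at the current cutoff.
print(classification_report(y_test, preds,
                            target_names=["no default", "default"]))
```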
Inference: The model's overall test accuracy is 80%, but accuracy alone is not sufficient to evaluate the model, because the primary objective is to identify customers who are likely to default. There are numerous cases where customers defaulted but the model predicted them as non-defaulters, i.e. a high rate of false negatives.
-> To improve the model's ability to identify customers who are likely to default, the classification threshold can be lowered from the default of 0.5. The bank can then intervene and take action based on this more sensitive prediction of default risk.
Find the optimum cutoff value
The optimal cutoff is the threshold at which sensitivity and specificity are jointly maximized - for example, where their sum peaks (equivalent to maximizing Youden's J statistic).
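One common way to locate this cutoff is to maximize Youden's J (TPR - FPR) along the ROC curve. This sketch reuses `y_test` and `test_probs` from the diagnostics sketch; the exact value returned depends on the fitted model (the text reports 0.224):

```python
import numpy as np
from sklearn.metrics import roc_curve

# Youden's J = sensitivity + specificity - 1 = TPR - FPR;
# pick the threshold where it peaks.
fpr, tpr, thresholds = roc_curve(y_test, test_probs)
optimal_cutoff = thresholds[np.argmax(tpr - fpr)]
print(f"Optimal cutoff = {optimal_cutoff:.3f}")

# Re-classify the test set at the new, lower cutoff.
preds_opt = (test_probs >= optimal_cutoff).astype(int)
```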
Inference:
The model's overall accuracy drops from 80% to 75% when the cutoff is lowered to 0.224, but this change improves the recall score significantly, from 54% to 89%. Recall is an important metric here because it measures the model's ability to identify all positive samples, i.e. customers who are likely to default. The adjustment comes at a cost: the precision score drops from 67% to 52%, meaning more non-default customers are labeled as defaulters. The choice of cutoff ultimately depends on the business's priorities and the relative value placed on true positives versus false positives; in practice, it is usually set as a business decision.
Decision Tree Classifier
Effective machine learning models: -> While cross-validation is an important process in evaluating a model, it is not specifically focused on finding the best combination of parameters. Instead, cross-validation is a technique for assessing how well a model will generalize to new data.
-> Hyperparameter tuning, on the other hand, involves selecting the best hyperparameters for a model, such as the learning rate or number of layers in a neural network. This is typically done by training multiple models with different hyperparameters and evaluating their performance on a validation set. Cross-validation can be used as part of this process to estimate the generalization performance of each model.
Declare the hyperparameters used to fine-tune the Decision Tree Classifier:
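A hypothetical grid for the pruning-related hyperparameters; the notebook's actual values are not shown:

```python
# Candidate grid for the pruning-related hyperparameters
# (the values are illustrative, not the notebook's originals).
dt_hyperparameters = {
    "max_depth": [3, 5, 7, 10],
    "min_samples_split": [2, 10, 20],
    "min_samples_leaf": [1, 5, 10],
}
```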
The decision tree algorithm can overfit the training data, resulting in poor performance on unseen data. Pruning is a process used to prevent this by stopping the tree from growing too complex.
-> Hyperparameters in a decision tree control pruning and include parameters like the maximum depth of the tree and the minimum number of samples required to split an internal node.
-> Tuning hyperparameters can optimize the model's performance by finding the best combination of parameters.
Cross-validation is used to evaluate the effectiveness of different hyperparameter settings by training the model on one subset of data and testing it on another, allowing for an understanding of how well the model generalizes to unseen data and which hyperparameters work best.
Decision Tree classifier with the Gini index -> Fitting and tuning the model with cross-validation
Now that we have our pipelines and hyperparameter dictionaries declared, we're ready to tune our models with cross-validation.
We use 5-fold cross-validation, as sketched below.
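A sketch of the tuning step, assuming the `dt_hyperparameters` grid declared above and the train split from the earlier diagnostics sketch; the recall scoring choice reflects the text's emphasis on catching defaulters and is an assumption:

```python
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# 5-fold cross-validated grid search over the pruning hyperparameters.
dt = DecisionTreeClassifier(criterion="gini", random_state=42)
grid = GridSearchCV(dt, dt_hyperparameters, cv=5,
                    scoring="recall", n_jobs=-1)
grid.fit(X_train, y_train)

print("Best parameters:", grid.best_params_)
best_tree = grid.best_estimator_
```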
Model Performance Evaluation
Visualization of Decision Tree
Dependencies:
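The original dependency list is not shown; one common setup uses matplotlib together with scikit-learn's plot_tree (graphviz/pydotplus is an alternative). This sketch reuses `best_tree` and `X_train` from the tuning sketch:

```python
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Render the tuned tree with feature names from the training frame.
plt.figure(figsize=(20, 10))
plot_tree(best_tree, feature_names=X_train.columns.tolist(),
          class_names=["no default", "default"], filled=True)
plt.show()
```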
Model Selection and Business Insights
The logistic regression model has shown better performance than the decision tree model based on their respective F1-scores, with the logistic model having an F1-score of 0.66 for positive labels (default customers) compared to 0.44 for the decision tree model. Therefore, the logistic model will be used to predict the creditworthiness of the remaining 150 customers. A cutoff of 0.224 will be used to classify customers as either default or non-default.
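A sketch of the scoring step, reusing the fitted `logit` model and the `new_customers` frame from earlier sketches; the 0.224 cutoff comes from the analysis above:

```python
import statsmodels.api as sm

# Score the 150 new customers with the logistic model at the 0.224 cutoff.
new_X = sm.add_constant(new_customers.drop(columns="default"))
new_customers["predicted_default"] = (logit.predict(new_X) >= 0.224).astype(int)
print(new_customers["predicted_default"].value_counts())
```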
Insights
Applying the logistic regression model to the 150 new customers gives the following predictions:
-> 85 customers are predicted not to default on the bank loan
-> 65 customers are predicted to most likely default on the loan
Model Performance Validation:
- KS Chart
- Lift and Gain Chart
We will use the concept of decile analysis for these validations.
To prepare the training dataset and testing dataset for these charts:
We plot KS, Lift, and Gain charts for both the training and testing datasets to compare the model's performance on each. This helps determine whether the model is overfitting to the training dataset and also provides insight into its performance on unseen data.
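A sketch of the decile analysis behind these charts, producing the gain, lift, and KS columns; it reuses the fitted `logit` model and the train/test objects from earlier sketches:

```python
import numpy as np
import pandas as pd
import statsmodels.api as sm

def decile_analysis(y_true, y_prob, n_bins=10):
    # Rank customers by predicted default probability and cut into deciles.
    tbl = pd.DataFrame({"actual": np.asarray(y_true), "prob": np.asarray(y_prob)})
    tbl["decile"] = pd.qcut(tbl["prob"].rank(method="first"), n_bins, labels=False)
    g = (tbl.groupby("decile")
            .agg(total=("actual", "size"), events=("actual", "sum"))
            .sort_index(ascending=False))  # highest-risk decile first
    g["cum_pop_pct"] = g["total"].cumsum() / g["total"].sum()
    g["gain"] = g["events"].cumsum() / g["events"].sum()  # cumulative % of defaulters captured
    g["lift"] = g["gain"] / g["cum_pop_pct"]              # vs random selection
    non_events = g["total"] - g["events"]
    g["ks"] = (g["gain"] - non_events.cumsum() / non_events.sum()).abs()
    return g  # KS statistic = g["ks"].max()

train_probs = logit.predict(sm.add_constant(X_train))
print(decile_analysis(y_train, train_probs))  # training dataset
print(decile_analysis(y_test, test_probs))    # testing dataset
```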
Observations:
- The gain chart indicates that about 90% of the customers likely to default can be identified by targeting just the top 50% of customers ranked by predicted risk.
- The lift chart indicates that selecting the top 20% of records based on the model captures 2.7 times as many defaulters as randomly selecting 20% of the data without a model.
The winning model should be saved using Python's standard object-serialization mechanism, pickle, so it can be reused later for testing the model on new data, comparing multiple models, or other purposes.
Later you can load this file to deserialize your model and use it to make new predictions.
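A minimal sketch of the save/load round trip; the file name is illustrative:

```python
import pickle

# Serialize the winning model to disk (file name is illustrative).
with open("logistic_model.pkl", "wb") as f:
    pickle.dump(logit, f)

# Later: deserialize the model and reuse it on new data.
with open("logistic_model.pkl", "rb") as f:
    loaded_model = pickle.load(f)
```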