# Uncomment and install the packages below before first use.
#!pip install xgboost
!pip install pywebio
import numpy as np # import numpy for numeric calculations
import pandas as pd #import pandas for dataframe analysis
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import random as rd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report,auc,roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from pprint import pprint
import warnings
import pickle
warnings.filterwarnings('ignore')
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.utils import class_weight
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from mpl_toolkits import mplot3d
sns.set_theme(style="darkgrid", palette='dark',font='serif')
color = sns.color_palette('dark')
import pywebio # web app library for building the end-user prediction app
from pywebio.input import *
from pywebio.output import *
from pywebio import *
def plot_categorical2(data):
'''
    This function plots categorical variables as pie charts
'''
cat_col = [col for col in data.columns if data[col].dtypes=='object']
f,ax = plt.subplots(int(len(cat_col)/7),7, figsize=(25,10))
for i in range(len(cat_col)):
count = data[cat_col[i]].value_counts()
explode_list = [0]
for j in range(len(count)-1):
explode_list.append(0.05)
count.plot.pie(
colors = color,
autopct = '%0.0f%%',
explode = explode_list,
            shadow = True,
startangle = 40,
textprops = {'color': 'White','fontsize':12},
wedgeprops = {'linewidth': 3, 'edgecolor':'grey'},
            rotatelabels = True,
ax=ax[i//7,i%7],)
ax[i//7,i%7].grid(True)
f.suptitle('Column Distribution', y=0.98,ha='center',va='center',size=15, weight=150)
plt.show()
def plot_categorical(data,height):
'''
    This function plots categorical variables as bar charts with the % weight of each value on the bar
'''
cat_col = [col for col in data.columns if data[col].dtypes=='object']
f,ax = plt.subplots(int(len(cat_col)/7),7, figsize=(33,20))
for i in range(len(cat_col)):
count = data[cat_col[i]].value_counts(normalize=True).reset_index()
sns.barplot(data=count,x='index',y=cat_col[i], ax=ax[i//7,i%7], edgecolor='white',linewidth=2,palette='dark')
ax[i//7,i%7].set_xlabel('')
ax[i//7,i%7].set_ylabel('')
ax[i//7,i%7].set_title(cat_col[i], size=15)
ax[i//7,i%7].tick_params('x', labelrotation=30, size=12)
for j in count.index:
            ax[i//7,i%7].text(x=j, y=count[cat_col[i]][j]+height, s=str(round(count[cat_col[i]][j]*100, 2))+'%', ha="center", va="center", color="black", size=13) # value_counts(normalize=True) returns fractions, so scale by 100 before labelling as %
f.suptitle('Column Distribution', y=0.98,ha='center',va='center',size=15, weight=150)
plt.show()
def plot_missing_vals(data,height):
'''
    This function plots missing-value counts as bars and annotates each bar with its count
'''
f,ax = plt.subplots(1,1,figsize=(25,10))
na_df = pd.DataFrame(data.isna().sum()).reset_index()
na_df.columns=['Columns','Missing Values']
sns.barplot(data=na_df, x='Columns',y='Missing Values',edgecolor='white', linewidth=3,palette='dark')
ax.tick_params('x',labelrotation=90)
for i in na_df.index:
ax.text(x=i,y=na_df['Missing Values'][i]+height, s=na_df['Missing Values'][i],ha="center", va="center", color="black",rotation=25,size=14)
    f.suptitle('Plot of Missing Values',y=0.98,ha='center',va='center',size=15, weight=150)
plt.show()
def plot_columns(data):
'''
    This function makes a strip plot of the dataset's column names
'''
f,ax = plt.subplots(1,1,figsize=(12,10))
df = pd.DataFrame(data.columns, columns=['Columns'])
sns.stripplot(data=df, x=df.index,y='Columns', palette='dark',ax=ax,size=10)
ax.tick_params('x',labelrotation=60)
f.suptitle('Plot Of Columns',y=0.98,ha='center',va='center',size=15, weight=150)
plt.show()
def plot_targetcol(data,col):
'''
    This function plots the target column distribution as both a pie chart and a bar chart
'''
f,ax= plt.subplots(1,2,figsize=(20,5))
sns.countplot(data = data, x=col, ax=ax[1],palette='dark',edgecolor='white', linewidth=4)
    data[col].value_counts().plot.pie(autopct='%.2f%%', explode=[0,0.05], ax=ax[0], colors=color, shadow = True,
startangle = 40,
textprops = {'color': 'White','fontsize':12},
wedgeprops = {'linewidth':3,'edgecolor':'white'},
        rotatelabels = True)
ax[1].text(x=0,y=100000,s=data[col].value_counts()[0], color='white', va='center', ha='center')
ax[1].text(x=1,y=25000,s=data[col].value_counts()[1], color='white', va='center', ha='center')
f.suptitle('Target Column Distribution',y=0.98,ha='center',va='center',size=15, weight=150)
plt.show()
def plot_columndist(data, cols):
'''
This function plots the column distribution alongside kde of the entire dataset
'''
col_len = int(len(data.columns)/4)
fig,axes = plt.subplots(col_len,4, figsize=(40,40))
_data = data.drop(labels = cols, axis=1)
for i in range(len(_data.columns)):
sns.histplot(data=_data, x=_data.columns[i] , hue='Status', multiple='stack', kde=True, palette='dark', ax=axes[i%col_len,i//col_len])
plt.show()
def show_datashape(data):
'''
This function shows the rows and column count of a dataset
'''
row, column = data.shape
f,axes = plt.subplots(1,2, figsize=(10,2))
axes[0].text(x=0, y=1, s='Number of rows', color='black', size=20,)
axes[0].set_xticks([])
axes[0].set_yticks([])
axes[0].set_facecolor('xkcd:white')
axes[0].text(x=0, y=0.5, s=row, color='purple', size=50,fontfamily='serif')
axes[1].text(x=0,y=1, s='Number of columns', color='black', fontfamily='serif', size=20)
axes[1].set_xticks([])
axes[1].set_yticks([])
axes[1].set_facecolor('xkcd:white')
axes[1].text(x=0, y=0.5, s=column, color='purple', size=50,fontfamily='serif')
def boxy(dum_data,figsize=(20,40)):
'''
This function makes boxplots
'''
num_cols = [cols for cols in dum_data.columns if dum_data[cols].dtypes != 'object']
row = int(len(num_cols)/4)
fig,axes = plt.subplots(row,4, figsize=figsize)
for i in range(len(num_cols)):
sns.boxplot(data = dum_data, y= num_cols[i], x='Status', ax=axes[i//4,i%4], width=0.5,linewidth=0.5,palette='dark')
axes[i//4,i%4].set_ylabel('')
axes[i//4,i%4].set_xlabel('')
axes[i//4,i%4].set_title(num_cols[i])
plt.show()
def vplot(data,labels=[]):
'''
This function makes a violin plot
'''
_data =data.drop(labels = labels, axis=1)
row= int(len(_data.columns)/4)
f,axes = plt.subplots(row,4, figsize=(30,20))
for i in range(len(_data.columns)):
sns.violinplot(data=_data, x='Status',y=_data.columns[i], ax=axes[i//4,i%4])
plt.show()
def replace_na(df):
'''
    This function replaces null values in each column with that column's mode
'''
na_cols = [cols for cols in df.columns]
for cols in na_cols:
md = df[cols].mode()[0]
df[cols] = df[cols].fillna(md)
return df
def dummify(df):
'''
This function dummifies the categorical columns in the dataset, making a new column for each
'''
x=[col for col in df.columns if df[col].dtypes=='object']
for column in x:
dummy_df = pd.get_dummies(data = df[column], prefix= column, dummy_na=False)
df = pd.concat([df,dummy_df], axis = 1)
df.drop(column, axis=1, inplace=True)
return df
def corrplot(data,labels=[]):
'''
    This function plots the correlation heatmap of a dataset
'''
f,ax = plt.subplots(figsize=(15,10))
sns.heatmap(data=data.drop(labels=labels, axis=1).corr(),annot=True, cmap=plt.cm.Reds,ax=ax,fmt='.2f', cbar=False)
f.suptitle('CORRELATION PLOT OF NUMERIC COLUMNS')
plt.show()
def plot_conf_mat(model,xtest,ytest):
'''
    This function plots a confusion matrix and shows the balanced accuracy and ROC curve of a model on test data
    (with hard 0/1 predictions, the AUC of the single-threshold ROC equals the balanced accuracy).
'''
fpr,tpr,_ = roc_curve(ytest,model.predict(xtest))
f,ax =plt.subplots(1,3,figsize=(20,5))
sns.heatmap(pd.DataFrame(confusion_matrix(ytest,model.predict(xtest)), index=['Actual: 0','Actual: 1'], columns=['Predicted: 0','Predicted: 1']),annot=True,fmt='.0f', cbar=False, cmap=plt.cm.Blues, ax=ax[0])
sns.despine()
ax[0].set_title('Confusion Matrix')
ax[1].text(x=0.1,y=0.8, s='Balanced Accuracy', color='black', fontfamily='serif', size=20)
ax[1].set_xticks([])
ax[1].set_yticks([])
ax[1].set_facecolor('xkcd:white')
ax[1].text(x=0.3, y=0.5, s=round(auc(fpr,tpr),2), color='purple', size=50,fontfamily='serif')
ax[2].plot(fpr,tpr, label='AUC: %.2f' %auc(fpr,tpr))
ax[2].legend(loc='lower right')
    ax[2].set_xlabel('False Positive Rate')
    ax[2].set_ylabel('True Positive Rate')
    ax[2].set_title('Receiver Operating Characteristic')
def three_d_up(x='rate_of_interest',y='Credit_Score',z='income', color='Status'):
'''
This function makes a 3d plot
'''
plt.figure(figsize=(15,10))
ax= plt.axes(projection='3d',)
ax.scatter3D(data[x],data[y],data[z],c= data[color])
ax.set_xlabel(x)
ax.set_ylabel(y)
ax.set_zlabel(z)
def plot_all_roc(models,names):
'''
    This function plots ROC curves for all models (uses the global X_test and y_test).
'''
with plt.xkcd(scale=0.01):
plt.figure(figsize=(16,10))
colors = ['red','orange','brown','green','blue']
plt.plot([0,1],[0,1],color='black',linestyle='--',linewidth=3)
for i in range(len(models)):
fpr,tpr,_ = roc_curve(y_test,models[i].predict(X_test))
            plt.plot(fpr,tpr, label=f'{names[i].upper()} AUC: %.3f' %auc(fpr,tpr),color=colors[i%len(colors)],linewidth=3)
plt.legend(loc='lower right')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.suptitle('Receiver Operating Characteristic')
plt.show()
def get_prediction ():
'''
    This generates an app that collects the data we need from users and predicts the likelihood of a loan default
'''
#create dictionary to populate with values
dic_of_values ={}
cat_dict_vals ={}
num_dict_vals = {}
cat_cols = [cols for cols in data.columns if data[cols].dtypes == 'object']
num_cols = [ cols for cols in data.columns if cols not in cat_cols+['Status']]
    # populate dictionary with the unique values of each column
for vals in cat_cols:
dic_of_values[vals] = list(data[vals].unique())
#collect input from webapp
for i in range(len(cat_cols)):
if i%2 ==0 :
cat_dict_vals[cat_cols[i]] = input.select(label = cat_cols[i], options = dic_of_values[cat_cols[i]] , required=True)
else:
cat_dict_vals[cat_cols[i]] = input.radio(label = cat_cols[i], options = dic_of_values[cat_cols[i]], required=True)
#do same for numeric values
for i in range(len(num_cols)):
num_dict_vals [num_cols[i]]= [input.input(label=num_cols[i],type=FLOAT,required=True,help_text=num_cols[i])]
num_dict_vals.update(cat_dict_vals)
stated_data = pd.DataFrame(num_dict_vals)
dummy_stated_data = dummify(stated_data)
    # add any training columns missing from the user's one-row frame so it matches X_train
for vals in X_train.columns:
if vals not in dummy_stated_data.columns:
dummy_stated_data[vals] = 0
    X_stated = pd.DataFrame(data=scaler.transform(dummy_stated_data),columns=dummy_stated_data.columns)
#load saved model and make prediction and print on screen
loaded_model = pickle.load(open('finalized_model.sav', 'rb'))
result = loaded_model.predict(X_stated)
    if result[0] == 1:
        put_collapse(title='This customer is likely to default on their loan repayment', content=put_markdown('model predicted with %.4f percent probability' % (loaded_model.predict_proba(X_stated).max()*100))) # parentheses needed: '%'-formatting binds before '*'
else:
        put_collapse(title='This customer is not likely to default on their loan repayment', content=put_markdown('model predicted with %.4f percent probability' % (loaded_model.predict_proba(X_stated).max()*100)))
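# list the files available in the Kaggle input directory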
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
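# load the loan default dataset from Kaggle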
data = pd.read_csv('/kaggle/input/loan-default-dataset/Loan_Default.csv')
data.head().style.background_gradient(vmin=0)
data.describe().style.background_gradient(vmin=0)
data.info(verbose=True,)
show_datashape(data)
plot_missing_vals(data,1200)
plot_columns(data)
plot_categorical(data,0.01)
plot_targetcol(data,'Status')
plt.rcParams['figure.figsize'] = (12, 6)
sns.pairplot(data=data.drop(['ID','year'],axis=1), hue='Status').map_upper(sns.kdeplot, n_levels=5, color='red',multiple='stack',thresh=0,)
plt.show()
data = replace_na(data)
plot_missing_vals(data,0)
vplot(data, labels=['ID','year'])
boxy(data.drop('ID',axis=1), (20,15))
corrplot(data,labels=['ID','year'])
data.groupby('Neg_ammortization')['Status'].mean().plot(kind='bar', title='Default Rate by Neg_ammortization', colormap='Blues_r')
plt.show()
three_d_up()
three_d_up('loan_amount','Upfront_charges','income','Status')
# drop columns that are not relevant to our models
data.drop(labels = ['year','Secured_by','ID','property_value','construction_type','occupancy_type','total_units','lump_sum_payment','Credit_Score','term',], axis='columns', inplace=True)
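# collapse sparse category levels into 'other' so dummification stays compact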
data['age'] = ['other' if x in ['>74', '<25'] else x for x in data['age']]
data['Region'] = ['other' if region not in ['North', 'south'] else region for region in data['Region']]
data['loan_type'] = [loan if loan=='type1' else 'other' for loan in data['loan_type']]
plot_columndist(data, [])
dummy_data = dummify(data)
dummy_data.head().style.background_gradient(vmin=0)
show_datashape(dummy_data)
f,ax = plt.subplots(figsize=(100,50))
sns.heatmap(data=dummy_data.corr(),annot=True, cmap=plt.cm.Blues,ax=ax,fmt='.2f',square=True,linewidth=2,linecolor='white', annot_kws={'animated':True,'fontsize':20,'fontweight':'light', 'color':'black','fontfamily':'fantasy',})
ax.tick_params('x',labelsize=40)
ax.tick_params('y',labelsize=40)
f.suptitle('CORRELATION PLOT OF NUMERIC COLUMNS')
plt.show()
_,ax = plt.subplots(1,1,figsize=(16,12))
sns.barplot( x = dummy_data.corr().Status.drop('Status').values, y = dummy_data.corr().Status.drop('Status').index, palette='dark', ax=ax)
ax.set_title('Correlation of Prediction Columns to Target Column')
plt.show()
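# separate the target, then min-max scale the features to [0, 1]; the fitted scaler is reused later on the app's user input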
Y= dummy_data.pop('Status')
scaler = MinMaxScaler().fit(dummy_data)
normalized_data = pd.DataFrame(data=scaler.transform(dummy_data), columns= dummy_data.columns)
X_train,X_test,y_train,y_test = train_test_split(normalized_data,Y,test_size=0.3, random_state=17)
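# balanced class weights (n_samples / (n_classes * class_count), sklearn's 'balanced' heuristic) to offset the heavy target imbalance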
class_weights = dict(zip(y_train.unique(),class_weight.compute_class_weight(class_weight='balanced',classes=y_train.unique(),y=y_train)))
class_weights
show_datashape(X_train)
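# --- Logistic Regression: plain baseline, class-weighted, then a solver grid search ---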
lr= LogisticRegression(C=1,verbose=0,solver = 'liblinear',random_state=17)
lr.fit(X_train,y_train)
plot_conf_mat(lr,X_test,y_test)
print(classification_report(y_test,lr.predict(X_test)))
lrcw= LogisticRegression(C=1,solver= 'liblinear',verbose=0,random_state=17,class_weight=class_weights)
lrcw.fit(X_train,y_train)
plot_conf_mat(lrcw,X_test,y_test)
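# grid-search the solver with 5-fold CV, scored by ROC AUC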
solver_list = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
parameters = dict(solver=solver_list)
lrclf = LogisticRegression(random_state=17,n_jobs=-1, C=1,class_weight=class_weights)
clf = GridSearchCV(lrclf, parameters, cv=5, scoring='roc_auc')
clf.fit(X_train, y_train)
scores = clf.cv_results_['mean_test_score']
for score, solver, in zip(scores, solver_list):
print(f"{solver}: {score:.4f}")
box = sns.barplot(x=solver_list,y=scores)
for i in range(len(solver_list)):
box.annotate(f"{scores[i]:.3f}",xy=(i,scores[i]-0.1),color='white',horizontalalignment='center')
plot_conf_mat(clf,X_test,y_test)
print(classification_report(y_test,clf.predict(X_test)))
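# --- Random Forest: plain baseline, class-weighted, then a grid search ---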
rf= RandomForestClassifier(random_state=17)
rf.fit(X_train,y_train)
plot_conf_mat(rf,X_test,y_test)
print(classification_report(y_test,rf.predict(X_test)))
rfcw= RandomForestClassifier(class_weight=class_weights, random_state=17)
rfcw.fit(X_train,y_train)
plot_conf_mat(rfcw,X_test,y_test)
print(classification_report(y_test,rfcw.predict(X_test)))
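# random-forest hyperparameter grid: 2 criteria x 2 feature rules x 7 estimator counts = 28 candidates; cv=2 keeps the search tractable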
criterion=['gini','entropy']
max_features = ["sqrt", "log2"]
n_estimators = [100,500,1000,1500,2000,2500,3000]
random_grid = {
'n_estimators': n_estimators,
'criterion':criterion,
'max_features':max_features,
}
rfcv = GridSearchCV(RandomForestClassifier(random_state=17), random_grid, scoring='roc_auc',cv=2,n_jobs=-1, verbose=1)
rfcv.fit(X_train, y_train)
scores = rfcv.cv_results_['mean_test_score']
print(scores)
plot_conf_mat(rfcv,X_test,y_test)
print(classification_report(y_test,rfcv.predict(X_test)))
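# --- Support Vector Classifier: plain baseline, class-weighted, then a grid search ---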
svc = SVC(random_state=17)
svc.fit(X_train,y_train)
plot_conf_mat(svc,X_test,y_test)
svccw = SVC(class_weight=class_weights, random_state=17)
svccw.fit(X_train,y_train)
plot_conf_mat(svccw,X_test,y_test)
print(classification_report(y_test,svccw.predict(X_test)))
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
c=[0.1,.01,1]
params ={
'kernel':kernel,
'C':c
}
svccv = GridSearchCV(SVC(class_weight=class_weights),params,scoring='roc_auc',verbose=1,n_jobs=-1,cv=2)
svccv.fit(X_train,y_train)
plot_conf_mat(svccv,X_test,y_test)
print(classification_report(y_test,svccv.predict(X_test)))
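# --- XGBoost: baseline, then a grid search ---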
xgb=XGBClassifier(random_state=17)
xgb.fit(X_train,y_train)
plot_conf_mat(xgb,X_test,y_test)
n_estimators = [100,500,1000,1500]
gamma = [0,1]
scale_pos_weight = [0.1,0.01,1,10]
tree_method = ['auto', 'exact', 'approx', 'hist', 'gpu_hist']
xgb_grid = {
'n_estimators': n_estimators,
'gamma':gamma,
'scale_pos_weight':scale_pos_weight,
'tree_method':tree_method,
'eval_metric':['auc']
}
xgbcv = GridSearchCV(XGBClassifier(random_state=17,),xgb_grid,scoring='roc_auc',cv=2,verbose=0)
xgbcv.fit(X_train,y_train)
plot_conf_mat(xgbcv,X_test,y_test)
xgbcv.best_estimator_
print(classification_report(y_test,xgbcv.predict(X_test)))
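# feature importances of the best XGBoost estimator found by the grid search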
plt.figure(figsize=(16,10))
sns.barplot(x=X_train.columns, y=xgbcv.best_estimator_.feature_importances_, palette='dark')
plt.tick_params('x',rotation=90)
plt.suptitle('Feature Importance')
plt.show()
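# --- Multi-Layer Perceptron with hidden layers (100, 10, 2): baseline, then a grid search ---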
mlp = MLPClassifier(hidden_layer_sizes=(100,10,2),random_state=17)
mlp.fit(X_train,y_train)
plot_conf_mat(mlp,X_test,y_test)
print(classification_report(y_test,mlp.predict(X_test)))
hidden_layer_sizes = [(100,10,2)]
alpha = [0.0001,0.001,0.01,0.1,1]
learning_rate_init = [0.0001,0.001,0.01,0.1,1]
solver = ['lbfgs', 'sgd', 'adam']
mlpgrid = {
'hidden_layer_sizes':hidden_layer_sizes,
'alpha':alpha,
'learning_rate_init':learning_rate_init,
'solver':solver
}
mlpcv = GridSearchCV(MLPClassifier(random_state=17,),mlpgrid,scoring='roc_auc',cv=2,verbose=1)
mlpcv.fit(X_train,y_train)
plot_conf_mat(mlpcv, X_test,y_test)
print(classification_report(y_test,mlpcv.predict(X_test)))
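# compare ROC curves across the baseline, class-weighted, and tuned model families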
plot_all_roc([rf,svc,mlp,lr,xgb],['RF','SVC','MLP','LR','XGB'])
plot_all_roc([rfcw,svccw,lrcw],['RF','SVC','LR'])
plot_all_roc([rfcv,svccv,mlpcv,clf,xgbcv],['RF','SVC','MLP','LR','XGB'])
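# persist the tuned XGBoost model for the pywebio prediction app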
filename = 'finalized_model.sav'
pickle.dump(xgbcv, open(filename, 'wb'))
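# A minimal sketch of loading the model back, mirroring what get_prediction() does:
# loaded = pickle.load(open(filename, 'rb'))
# loaded.predict(X_test.head())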
# uncomment the call below to run the web app
#get_prediction()