# Uncomment and install the packages below before first use.
#!pip install xgboost
!pip install pywebio
import numpy as np # import numpy for numeric calculations
import pandas as pd #import pandas for dataframe analysis
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import random as rd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report,auc,roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from pprint import pprint
import warnings
import pickle
warnings.filterwarnings('ignore')
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.utils import class_weight
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from mpl_toolkits import mplot3d
sns.set_theme(style="darkgrid", palette='dark',font='serif')
color = sns.color_palette('dark')
import pywebio # web app library for building the end-user prediction app
from pywebio.input import *
from pywebio.output import *
from pywebio import *
def plot_categorical2(data):
'''
    This function plots categorical variables as pie charts
'''
cat_col = [col for col in data.columns if data[col].dtypes=='object']
f,ax = plt.subplots(int(len(cat_col)/7),7, figsize=(25,10))
for i in range(len(cat_col)):
count = data[cat_col[i]].value_counts()
explode_list = [0]
for j in range(len(count)-1):
explode_list.append(0.05)
count.plot.pie(
colors = color,
autopct = '%0.0f%%',
explode = explode_list,
            shadow = True,
startangle = 40,
textprops = {'color': 'White','fontsize':12},
wedgeprops = {'linewidth': 3, 'edgecolor':'grey'},
            rotatelabels = True,
ax=ax[i//7,i%7],)
ax[i//7,i%7].grid(True)
f.suptitle('Column Distribution', y=0.98,ha='center',va='center',size=15, weight=150)
plt.show()
def plot_categorical(data,height):
'''
    This function plots categorical variables as bar charts with the % weight of each value on the bar
'''
cat_col = [col for col in data.columns if data[col].dtypes=='object']
f,ax = plt.subplots(int(len(cat_col)/7),7, figsize=(33,20))
for i in range(len(cat_col)):
count = data[cat_col[i]].value_counts(normalize=True).reset_index()
sns.barplot(data=count,x='index',y=cat_col[i], ax=ax[i//7,i%7], edgecolor='white',linewidth=2,palette='dark')
ax[i//7,i%7].set_xlabel('')
ax[i//7,i%7].set_ylabel('')
ax[i//7,i%7].set_title(cat_col[i], size=15)
ax[i//7,i%7].tick_params('x', labelrotation=30, size=12)
for j in count.index:
            ax[i//7,i%7].text(x=j, y=count[cat_col[i]][j]+height, s=str(round(count[cat_col[i]][j]*100, 2))+'%', ha="center", va="center", color="black", size=13) # value_counts(normalize=True) returns fractions, so scale by 100 before labelling as %
f.suptitle('Column Distribution', y=0.98,ha='center',va='center',size=15, weight=150)
plt.show()
def plot_missing_vals(data,height):
'''
    This function plots missing-value counts as bars and annotates each bar with its count
'''
f,ax = plt.subplots(1,1,figsize=(25,10))
na_df = pd.DataFrame(data.isna().sum()).reset_index()
na_df.columns=['Columns','Missing Values']
sns.barplot(data=na_df, x='Columns',y='Missing Values',edgecolor='white', linewidth=3,palette='dark')
ax.tick_params('x',labelrotation=90)
for i in na_df.index:
ax.text(x=i,y=na_df['Missing Values'][i]+height, s=na_df['Missing Values'][i],ha="center", va="center", color="black",rotation=25,size=14)
    f.suptitle('Plot of Missing Values',y=0.98,ha='center',va='center',size=15, weight=150)
plt.show()
def plot_columns(data):
'''
    This function makes a strip plot of the dataset's column names
'''
f,ax = plt.subplots(1,1,figsize=(12,10))
df = pd.DataFrame(data.columns, columns=['Columns'])
sns.stripplot(data=df, x=df.index,y='Columns', palette='dark',ax=ax,size=10)
ax.tick_params('x',labelrotation=60)
f.suptitle('Plot Of Columns',y=0.98,ha='center',va='center',size=15, weight=150)
plt.show()
def plot_targetcol(data,col):
'''
    This function plots the target column distribution as both a pie chart and a bar chart
'''
f,ax= plt.subplots(1,2,figsize=(20,5))
sns.countplot(data = data, x=col, ax=ax[1],palette='dark',edgecolor='white', linewidth=4)
    data[col].value_counts().plot.pie(autopct='%.2f%%', explode=[0,0.05], ax=ax[0], colors=color, shadow = True,
startangle = 40,
textprops = {'color': 'White','fontsize':12},
wedgeprops = {'linewidth':3,'edgecolor':'white'},
        rotatelabels = True)
ax[1].text(x=0,y=100000,s=data[col].value_counts()[0], color='white', va='center', ha='center')
ax[1].text(x=1,y=25000,s=data[col].value_counts()[1], color='white', va='center', ha='center')
f.suptitle('Target Column Distribution',y=0.98,ha='center',va='center',size=15, weight=150)
plt.show()
def plot_columndist(data, cols):
'''
This function plots the column distribution alongside kde of the entire dataset
'''
col_len = int(len(data.columns)/4)
fig,axes = plt.subplots(col_len,4, figsize=(40,40))
_data = data.drop(labels = cols, axis=1)
for i in range(len(_data.columns)):
sns.histplot(data=_data, x=_data.columns[i] , hue='Status', multiple='stack', kde=True, palette='dark', ax=axes[i%col_len,i//col_len])
plt.show()
def show_datashape(data):
'''
This function shows the rows and column count of a dataset
'''
row, column = data.shape
f,axes = plt.subplots(1,2, figsize=(10,2))
axes[0].text(x=0, y=1, s='Number of rows', color='black', size=20,)
axes[0].set_xticks([])
axes[0].set_yticks([])
axes[0].set_facecolor('xkcd:white')
axes[0].text(x=0, y=0.5, s=row, color='purple', size=50,fontfamily='serif')
axes[1].text(x=0,y=1, s='Number of columns', color='black', fontfamily='serif', size=20)
axes[1].set_xticks([])
axes[1].set_yticks([])
axes[1].set_facecolor('xkcd:white')
axes[1].text(x=0, y=0.5, s=column, color='purple', size=50,fontfamily='serif')
def boxy(dum_data,figsize=(20,40)):
'''
This function makes boxplots
'''
num_cols = [cols for cols in dum_data.columns if dum_data[cols].dtypes != 'object']
row = int(len(num_cols)/4)
fig,axes = plt.subplots(row,4, figsize=figsize)
for i in range(len(num_cols)):
sns.boxplot(data = dum_data, y= num_cols[i], x='Status', ax=axes[i//4,i%4], width=0.5,linewidth=0.5,palette='dark')
axes[i//4,i%4].set_ylabel('')
axes[i//4,i%4].set_xlabel('')
axes[i//4,i%4].set_title(num_cols[i])
plt.show()
def vplot(data,labels=[]):
'''
This function makes a violin plot
'''
_data =data.drop(labels = labels, axis=1)
row= int(len(_data.columns)/4)
f,axes = plt.subplots(row,4, figsize=(30,20))
for i in range(len(_data.columns)):
sns.violinplot(data=_data, x='Status',y=_data.columns[i], ax=axes[i//4,i%4])
plt.show()
def replace_na(df):
'''
    This function replaces null values in each column with that column's mode
'''
na_cols = [cols for cols in df.columns]
for cols in na_cols:
md = df[cols].mode()[0]
df[cols] = df[cols].fillna(md)
return df
def dummify(df):
'''
This function dummifies the categorical columns in the dataset, making a new column for each
'''
x=[col for col in df.columns if df[col].dtypes=='object']
for column in x:
dummy_df = pd.get_dummies(data = df[column], prefix= column, dummy_na=False)
df = pd.concat([df,dummy_df], axis = 1)
df.drop(column, axis=1, inplace=True)
return df
def corrplot(data,labels=[]):
'''
    This function plots the correlation heatmap of a dataset
'''
f,ax = plt.subplots(figsize=(15,10))
sns.heatmap(data=data.drop(labels=labels, axis=1).corr(),annot=True, cmap=plt.cm.Reds,ax=ax,fmt='.2f', cbar=False)
f.suptitle('CORRELATION PLOT OF NUMERIC COLUMNS')
plt.show()
def plot_conf_mat(model,xtest,ytest):
'''
    This function plots a confusion matrix and shows the balanced accuracy and ROC curve of a model on test data
    (with hard 0/1 predictions, the AUC of the single-threshold ROC equals the balanced accuracy).
'''
fpr,tpr,_ = roc_curve(ytest,model.predict(xtest))
f,ax =plt.subplots(1,3,figsize=(20,5))
sns.heatmap(pd.DataFrame(confusion_matrix(ytest,model.predict(xtest)), index=['Actual: 0','Actual: 1'], columns=['Predicted: 0','Predicted: 1']),annot=True,fmt='.0f', cbar=False, cmap=plt.cm.Blues, ax=ax[0])
sns.despine()
ax[0].set_title('Confusion Matrix')
ax[1].text(x=0.1,y=0.8, s='Balanced Accuracy', color='black', fontfamily='serif', size=20)
ax[1].set_xticks([])
ax[1].set_yticks([])
ax[1].set_facecolor('xkcd:white')
ax[1].text(x=0.3, y=0.5, s=round(auc(fpr,tpr),2), color='purple', size=50,fontfamily='serif')
ax[2].plot(fpr,tpr, label='AUC: %.2f' %auc(fpr,tpr))
ax[2].legend(loc='lower right')
    ax[2].set_xlabel('False Positive Rate')
    ax[2].set_ylabel('True Positive Rate')
    ax[2].set_title('Receiver Operating Characteristic')
def three_d_up(x='rate_of_interest',y='Credit_Score',z='income', color='Status'):
'''
This function makes a 3d plot
'''
plt.figure(figsize=(15,10))
ax= plt.axes(projection='3d',)
ax.scatter3D(data[x],data[y],data[z],c= data[color])
ax.set_xlabel(x)
ax.set_ylabel(y)
ax.set_zlabel(z)
def plot_all_roc(models,names):
'''
    This function plots ROC curves for all models (uses the global X_test and y_test).
'''
with plt.xkcd(scale=0.01):
plt.figure(figsize=(16,10))
colors = ['red','orange','brown','green','blue']
plt.plot([0,1],[0,1],color='black',linestyle='--',linewidth=3)
for i in range(len(models)):
fpr,tpr,_ = roc_curve(y_test,models[i].predict(X_test))
            plt.plot(fpr,tpr, label=f'{names[i].upper()} AUC: %.3f' %auc(fpr,tpr),color=colors[i%len(colors)],linewidth=3)
plt.legend(loc='lower right')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.suptitle('Receiver Operating Characteristic')
plt.show()
def get_prediction ():
'''
    This generates an app that collects the data we need from users and predicts the likelihood of a loan default
'''
#create dictionary to populate with values
dic_of_values ={}
cat_dict_vals ={}
num_dict_vals = {}
cat_cols = [cols for cols in data.columns if data[cols].dtypes == 'object']
num_cols = [ cols for cols in data.columns if cols not in cat_cols+['Status']]
    # populate dictionary with the unique values of each column
for vals in cat_cols:
dic_of_values[vals] = list(data[vals].unique())
#collect input from webapp
for i in range(len(cat_cols)):
if i%2 ==0 :
cat_dict_vals[cat_cols[i]] = input.select(label = cat_cols[i], options = dic_of_values[cat_cols[i]] , required=True)
else:
cat_dict_vals[cat_cols[i]] = input.radio(label = cat_cols[i], options = dic_of_values[cat_cols[i]], required=True)
#do same for numeric values
for i in range(len(num_cols)):
num_dict_vals [num_cols[i]]= [input.input(label=num_cols[i],type=FLOAT,required=True,help_text=num_cols[i])]
num_dict_vals.update(cat_dict_vals)
stated_data = pd.DataFrame(num_dict_vals)
dummy_stated_data = dummify(stated_data)
    # add any training columns missing from the user's one-row frame so it matches X_train
for vals in X_train.columns:
if vals not in dummy_stated_data.columns:
dummy_stated_data[vals] = 0
    X_stated = pd.DataFrame(data=scaler.transform(dummy_stated_data),columns=dummy_stated_data.columns)
#load saved model and make prediction and print on screen
loaded_model = pickle.load(open('finalized_model.sav', 'rb'))
result = loaded_model.predict(X_stated)
    if result[0] == 1:
        put_collapse(title='This customer is likely to default on their loan repayment', content=put_markdown('model predicted with %.4f percent probability' % (loaded_model.predict_proba(X_stated).max()*100))) # parentheses needed: '%'-formatting binds before '*'
else:
        put_collapse(title='This customer is not likely to default on their loan repayment', content=put_markdown('model predicted with %.4f percent probability' % (loaded_model.predict_proba(X_stated).max()*100)))
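# list the files available in the Kaggle input directory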
for dirname, _, filenames in os.walk('/kaggle/input'):
for filename in filenames:
print(os.path.join(dirname, filename))
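# load the loan default dataset from Kaggle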
data = pd.read_csv('/kaggle/input/loan-default-dataset/Loan_Default.csv')
data.head().style.background_gradient(vmin=0)
data.describe().style.background_gradient(vmin=0)
data.info(verbose=True,)
show_datashape(data)
plot_missing_vals(data,1200)
plot_columns(data)
plot_categorical(data,0.01)
plot_targetcol(data,'Status')
plt.rcParams['figure.figsize'] = (12, 6)
sns.pairplot(data=data.drop(['ID','year'],axis=1), hue='Status').map_upper(sns.kdeplot, n_levels=5, color='red',multiple='stack',thresh=0,)
plt.show()
data = replace_na(data)
plot_missing_vals(data,0)
vplot(data, labels=['ID','year'])
boxy(data.drop('ID',axis=1), (20,15))
corrplot(data,labels=['ID','year'])
data.groupby('Neg_ammortization')['Status'].mean().plot(kind='bar', title='Default Rate by Neg_ammortization', colormap='Blues_r')
plt.show()
three_d_up()
three_d_up('loan_amount','Upfront_charges','income','Status')
# drop columns that are not relevant to our models
data.drop(labels = ['year','Secured_by','ID','property_value','construction_type','occupancy_type','total_units','lump_sum_payment','Credit_Score','term',], axis='columns', inplace=True)
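# collapse sparse category levels into 'other' so dummification stays compact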
data['age'] = ['other' if x in ['>74', '<25'] else x for x in data['age']]
data['Region'] = ['other' if region not in ['North', 'south'] else region for region in data['Region']]
data['loan_type'] = [loan if loan=='type1' else 'other' for loan in data['loan_type']]
plot_columndist(data, [])
dummy_data = dummify(data)
dummy_data.head().style.background_gradient(vmin=0)
show_datashape(dummy_data)
f,ax = plt.subplots(figsize=(100,50))
sns.heatmap(data=dummy_data.corr(),annot=True, cmap=plt.cm.Blues,ax=ax,fmt='.2f',square=True,linewidth=2,linecolor='white', annot_kws={'animated':True,'fontsize':20,'fontweight':'light', 'color':'black','fontfamily':'fantasy',})
ax.tick_params('x',labelsize=40)
ax.tick_params('y',labelsize=40)
f.suptitle('CORRELATION PLOT OF NUMERIC COLUMNS')
plt.show()
_,ax = plt.subplots(1,1,figsize=(16,12))
sns.barplot( x = dummy_data.corr().Status.drop('Status').values, y = dummy_data.corr().Status.drop('Status').index, palette='dark', ax=ax)
ax.set_title('Correlation of Prediction Columns to Target Column')
plt.show()
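# separate the target, then min-max scale the features to [0, 1]; the fitted scaler is reused later on the app's user input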
Y= dummy_data.pop('Status')
scaler = MinMaxScaler().fit(dummy_data)
normalized_data = pd.DataFrame(data=scaler.transform(dummy_data), columns= dummy_data.columns)
X_train,X_test,y_train,y_test = train_test_split(normalized_data,Y,test_size=0.3, random_state=17)
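# balanced class weights (n_samples / (n_classes * class_count), sklearn's 'balanced' heuristic) to offset the heavy target imbalance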
class_weights = dict(zip(y_train.unique(),class_weight.compute_class_weight(class_weight='balanced',classes=y_train.unique(),y=y_train)))
class_weights
show_datashape(X_train)
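# --- Logistic Regression: plain baseline, class-weighted, then a solver grid search ---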
lr= LogisticRegression(C=1,verbose=0,solver = 'liblinear',random_state=17)
lr.fit(X_train,y_train)
plot_conf_mat(lr,X_test,y_test)
print(classification_report(y_test,lr.predict(X_test)))
lrcw= LogisticRegression(C=1,solver= 'liblinear',verbose=0,random_state=17,class_weight=class_weights)
lrcw.fit(X_train,y_train)
plot_conf_mat(lrcw,X_test,y_test)
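# grid-search the solver with 5-fold CV, scored by ROC AUC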
solver_list = ['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga']
parameters = dict(solver=solver_list)
lrclf = LogisticRegression(random_state=17,n_jobs=-1, C=1,class_weight=class_weights)
clf = GridSearchCV(lrclf, parameters, cv=5, scoring='roc_auc')
clf.fit(X_train, y_train)
scores = clf.cv_results_['mean_test_score']
for score, solver, in zip(scores, solver_list):
print(f"{solver}: {score:.4f}")
box = sns.barplot(x=solver_list,y=scores)
for i in range(len(solver_list)):
box.annotate(f"{scores[i]:.3f}",xy=(i,scores[i]-0.1),color='white',horizontalalignment='center')
plot_conf_mat(clf,X_test,y_test)
print(classification_report(y_test,clf.predict(X_test)))
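# --- Random Forest: plain baseline, class-weighted, then a grid search ---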
rf= RandomForestClassifier(random_state=17)
rf.fit(X_train,y_train)
plot_conf_mat(rf,X_test,y_test)
print(classification_report(y_test,rf.predict(X_test)))
rfcw= RandomForestClassifier(class_weight=class_weights, random_state=17)
rfcw.fit(X_train,y_train)
plot_conf_mat(rfcw,X_test,y_test)
print(classification_report(y_test,rfcw.predict(X_test)))
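# random-forest hyperparameter grid: 2 criteria x 2 feature rules x 7 estimator counts = 28 candidates; cv=2 keeps the search tractable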
criterion=['gini','entropy']
max_features = ["sqrt", "log2"]
n_estimators = [100,500,1000,1500,2000,2500,3000]
random_grid = {
'n_estimators': n_estimators,
'criterion':criterion,
'max_features':max_features,
}
rfcv = GridSearchCV(RandomForestClassifier(random_state=17), random_grid, scoring='roc_auc',cv=2,n_jobs=-1, verbose=1)
rfcv.fit(X_train, y_train)
scores = rfcv.cv_results_['mean_test_score']
print(scores)
plot_conf_mat(rfcv,X_test,y_test)
print(classification_report(y_test,rfcv.predict(X_test)))
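# --- Support Vector Classifier: plain baseline, class-weighted, then a grid search ---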
svc = SVC(random_state=17)
svc.fit(X_train,y_train)
plot_conf_mat(svc,X_test,y_test)
svccw = SVC(class_weight=class_weights, random_state=17)
svccw.fit(X_train,y_train)
plot_conf_mat(svccw,X_test,y_test)
print(classification_report(y_test,svccw.predict(X_test)))
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
c=[0.1,.01,1]
params ={
'kernel':kernel,
'C':c
}
svccv = GridSearchCV(SVC(class_weight=class_weights),params,scoring='roc_auc',verbose=1,n_jobs=-1,cv=2)
svccv.fit(X_train,y_train)
plot_conf_mat(svccv,X_test,y_test)
print(classification_report(y_test,svccv.predict(X_test)))
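# --- XGBoost: baseline, then a grid search ---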
xgb=XGBClassifier(random_state=17)
xgb.fit(X_train,y_train)
plot_conf_mat(xgb,X_test,y_test)
n_estimators = [100,500,1000,1500]
gamma = [0,1]
scale_pos_weight = [0.1,0.01,1,10]
tree_method = ['auto', 'exact', 'approx', 'hist', 'gpu_hist']
xgb_grid = {
'n_estimators': n_estimators,
'gamma':gamma,
'scale_pos_weight':scale_pos_weight,
'tree_method':tree_method,
'eval_metric':['auc']
}
xgbcv = GridSearchCV(XGBClassifier(random_state=17,),xgb_grid,scoring='roc_auc',cv=2,verbose=0)
xgbcv.fit(X_train,y_train)
plot_conf_mat(xgbcv,X_test,y_test)
xgbcv.best_estimator_
print(classification_report(y_test,xgbcv.predict(X_test)))
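# feature importances of the best XGBoost estimator found by the grid search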
plt.figure(figsize=(16,10))
sns.barplot(x=X_train.columns, y=xgbcv.best_estimator_.feature_importances_, palette='dark')
plt.tick_params('x',rotation=90)
plt.suptitle('Feature Importance')
plt.show()
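# --- Multi-Layer Perceptron with hidden layers (100, 10, 2): baseline, then a grid search ---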
mlp = MLPClassifier(hidden_layer_sizes=(100,10,2),random_state=17)
mlp.fit(X_train,y_train)
plot_conf_mat(mlp,X_test,y_test)
print(classification_report(y_test,mlp.predict(X_test)))
hidden_layer_sizes = [(100,10,2)]
alpha = [0.0001,0.001,0.01,0.1,1]
learning_rate_init = [0.0001,0.001,0.01,0.1,1]
solver = ['lbfgs', 'sgd', 'adam']
mlpgrid = {
'hidden_layer_sizes':hidden_layer_sizes,
'alpha':alpha,
'learning_rate_init':learning_rate_init,
'solver':solver
}
mlpcv = GridSearchCV(MLPClassifier(random_state=17,),mlpgrid,scoring='roc_auc',cv=2,verbose=1)
mlpcv.fit(X_train,y_train)
plot_conf_mat(mlpcv, X_test,y_test)
print(classification_report(y_test,mlpcv.predict(X_test)))
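# compare ROC curves across the baseline, class-weighted, and tuned model families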
plot_all_roc([rf,svc,mlp,lr,xgb],['RF','SVC','MLP','LR','XGB'])
plot_all_roc([rfcw,svccw,lrcw],['RF','SVC','LR'])
plot_all_roc([rfcv,svccv,mlpcv,clf,xgbcv],['RF','SVC','MLP','LR','XGB'])
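# persist the tuned XGBoost model for the pywebio prediction app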
filename = 'finalized_model.sav'
pickle.dump(xgbcv, open(filename, 'wb'))
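# A minimal sketch of loading the model back, mirroring what get_prediction() does:
# loaded = pickle.load(open(filename, 'rb'))
# loaded.predict(X_test.head())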
# uncomment the call below to run the web app
#get_prediction()