!pip install -U klib
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # visualization
from matplotlib import pyplot as plt # visualization
import klib # visualization
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from google.colab import drive
drive.mount('/content/gdrive')
train=pd.read_csv("/content/gdrive/MyDrive/deloitte_ml_challenge_predict_loan_defaulters./train.csv")
test=pd.read_csv("/content/gdrive/MyDrive/deloitte_ml_challenge_predict_loan_defaulters./test.csv")
# The 'Employment Duration' and 'Home Ownership' columns appear to be swapped in the
# raw files, so swap the names back on both train and test.
train = train.rename(columns={'Employment Duration': 'Home Ownership',
                              'Home Ownership': 'Employment Duration'})
test = test.rename(columns={'Employment Duration': 'Home Ownership',
                            'Home Ownership': 'Employment Duration'})
train.info()
test.info()
train.isnull().sum()
test.isnull().sum()
train.head(3)
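# klib is installed and imported above but not used later; a minimal sketch of its
# one-line EDA helpers (assuming the installed klib version exposes these functions):
klib.missingval_plot(train)                              # missing values per column
klib.corr_plot(train.select_dtypes(exclude='object'))    # correlation heatmap of numeric columns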
def cat_plot(df, catcol, title='', **arg):
    # count plot for a categorical column, ordered by frequency
    _ = plt.figure(figsize=(8, 5))
    _ = sns.countplot(data=df, x=catcol, order=df[catcol].value_counts().index, **arg)
    _ = plt.title(title, fontsize=25)
    _ = plt.xlabel(catcol, fontsize=15)
    _ = plt.xticks(fontsize=10, rotation=90)
cat_plot(train, "Loan Status", "Target (Loan Status) Column Distribution")
train["Loan Status"].value_counts()
_=plt.figure(figsize=(8,15))
_=sns.countplot(y=train['Batch Enrolled'],hue=train['Loan Status'].astype('object'))
train.select_dtypes(include="object").columns.values
fig = plt.subplots(figsize=(20, 20))
for i, col in enumerate(['Grade', 'Sub Grade', 'Home Ownership',
                         'Verification Status', 'Payment Plan',
                         'Initial List Status', 'Application Type']):
    _ = plt.subplot(4, 2, i+1)
    _ = sns.countplot(x=train[col], hue=train['Loan Status'].astype('object'))
    _ = plt.title(col + ' Distribution', fontsize=15)
    _ = plt.xlabel(col, fontsize=10)
    _ = plt.xticks(fontsize=15)
    _ = plt.tight_layout()
plt.show()
for col in train.select_dtypes(include="object").columns.values:
    print("==" * 50)
    print(col)
    print(train[col].value_counts())
    print("==" * 50)
fig = plt.subplots(figsize=(20, 20))
for i, col in enumerate(train.select_dtypes(exclude="object").columns.values):
    plt.subplot(13, 2, i+1)
    _ = sns.histplot(data=train, x=col, hue="Loan Status")
    _ = plt.title(col + ' Distribution', fontsize=15)
    _ = plt.xlabel("")
    _ = plt.xticks(fontsize=8)
    _ = plt.tight_layout()
plt.show()
for col in train.select_dtypes(exclude="object").columns.values:
    print("==" * 50)
    print(col)
    print(train[col].describe())
    print("==" * 50)
train['Accounts Delinquent'].value_counts()
def pre_process(df):
    # drop the Loan Title and Payment Plan columns
    df = df.drop(columns=['Loan Title', 'Payment Plan'])
    # interest per month
    df['Interest_per_mon'] = ((df['Loan Amount'] * df['Interest Rate']) / 100) / df['Term']
    # total interest amount
    df['total_intr_amt'] = df['Interest_per_mon'] * df['Term']
    # flag: investor-funded amount greater than funded amount
    df['Fund_amnt_grt'] = (df['Funded Amount Investor'] > df['Funded Amount']).astype('int')
    # total revolving amount
    df['total_revolve'] = df['Revolving Balance'] + df['Revolving Utilities']
    # total received amount
    df['total_received'] = df['Total Received Interest'] + df['Total Collection Amount']
    # total recovery amount
    df['total_recovery'] = df['Recoveries'] + df['Collection Recovery Fee']
    # flag: total revolving amount less than total revolving credit limit
    df['revolve_amnt_grt'] = (df['total_revolve'] < df['Total Revolving Credit Limit']).astype('int')
    # flag: loan amount greater than total current balance
    df['loan_grt_balance'] = (df['Loan Amount'] > df['Total Current Balance']).astype('int')
    # number of customers per batch (representative's customer count)
    df['reprs_cust_count'] = df.groupby(['Batch Enrolled'])['Batch Enrolled'].transform('count')
    return df
train=pre_process(train)
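# Quick sanity check on the engineered columns created by pre_process (a minimal sketch)
train[['Interest_per_mon', 'total_intr_amt', 'Fund_amnt_grt', 'total_revolve',
       'total_received', 'total_recovery', 'revolve_amnt_grt', 'loan_grt_balance',
       'reprs_cust_count']].describe()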
grpcol=['Batch Enrolled', 'Grade', 'Sub Grade', 'Home Ownership',
'Verification Status', 'Initial List Status', 'Application Type']
numcols=['Loan Amount', 'Funded Amount', 'Funded Amount Investor',
'Interest Rate', 'Employment Duration', 'Debit to Income',
'Open Account', 'Revolving Balance', 'Revolving Utilities',
'Total Accounts', 'Total Received Interest', 'Total Received Late Fee',
'Recoveries', 'Collection Recovery Fee', 'Total Collection Amount',
'Total Current Balance', 'Total Revolving Credit Limit',
'Interest_per_mon', 'total_intr_amt',
'total_revolve', 'total_received', 'total_recovery']
for col in numcols:
    df1 = (train.groupby(grpcol)[col]
                .agg(['min', 'median', 'max']).reset_index())
    df1 = df1.rename(columns={c: col + '_' + c for c in ['min', 'median', 'max']})
    train = pd.merge(train, df1, on=grpcol, how='left')
for c, i in enumerate(train.columns.values):
    print(f"{c}_{i}")
test=pre_process(test)
for col in numcols:
    df1 = (test.groupby(grpcol)[col]
               .agg(['min', 'median', 'max']).reset_index())
    df1 = df1.rename(columns={c: col + '_' + c for c in ['min', 'median', 'max']})
    test = pd.merge(test, df1, on=grpcol, how='left')
for c, i in enumerate(test.columns.values):
    print(f"{c}_{i}")
!pip install optuna
!pip install catboost
from sklearn.model_selection import cross_val_score,KFold,train_test_split,ShuffleSplit,StratifiedKFold,learning_curve
from catboost import CatBoostClassifier,Pool,cv,monoforest
import optuna
from optuna.samplers import RandomSampler,TPESampler,MOTPESampler,CmaEsSampler
from sklearn.metrics import f1_score,classification_report,confusion_matrix,log_loss
from xgboost import XGBClassifier,plot_tree
import xgboost as xgb
from optuna.integration import XGBoostPruningCallback,LightGBMPruningCallback
from sklearn.preprocessing import LabelEncoder,StandardScaler,PolynomialFeatures
import lightgbm as lgbm
import math
for c, i in enumerate(train.columns.values):
    print(f"{c}_{i}")
train.select_dtypes(include='object').columns
# feature matrix: all columns except position 0 (presumably the ID) and the target 'Loan Status' at position 32
X = train.iloc[:, np.r_[1:32, 33:108]]
y=train['Loan Status']
def objective(trial):
    # CatBoost search space; the mean validation log loss over 5 stratified folds is minimized
    param = {
        'reg_lambda': trial.suggest_categorical('reg_lambda', [0.1, 1.0, 5.0, 10.0, 50.0, 100.0, 150.0, 200.0, 250.0]),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0),
        'n_estimators': trial.suggest_categorical('n_estimators', [200, 400, 600, 800, 1000]),
        'max_depth': trial.suggest_int('max_depth', 2, 12),
        'random_state': trial.suggest_categorical('random_state', [1024, 1048, 2020]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
    }
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
    skf = StratifiedKFold(n_splits=5, random_state=2000, shuffle=True)
    scores = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
        cat_clf = CatBoostClassifier(early_stopping_rounds=30, eval_metric="Logloss",
                                     logging_level="Silent",
                                     **param)
        cat_clf.fit(X_train, y_train,
                    cat_features=['Batch Enrolled', 'Grade', 'Sub Grade',
                                  'Home Ownership', 'Verification Status',
                                  'Initial List Status', 'Application Type'])
        preds = cat_clf.predict_proba(X_valid)
        scores.append(log_loss(y_valid, preds))
    return np.mean(scores)
if __name__ == "__main__":
    study = optuna.create_study(direction='minimize', sampler=MOTPESampler(),
                                pruner=optuna.pruners.SuccessiveHalvingPruner())
    study.optimize(objective, n_trials=100)
    print('Number of finished trials:', len(study.trials))
    print('Best trial:', study.best_trial.params)
study.best_value
study.best_params
optuna.visualization.plot_optimization_history(study)
optuna.visualization.plot_slice(study)
optuna.visualization.plot_param_importances(study)
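# Optional: persist the finished study so the tuned parameters survive a Colab restart
# (a minimal sketch; the Drive path below is only an assumption).
import joblib
joblib.dump(study, '/content/gdrive/MyDrive/catboost_optuna_study.pkl')
# study = joblib.load('/content/gdrive/MyDrive/catboost_optuna_study.pkl')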
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2,
random_state = 42,
shuffle=True,stratify=y)
param_cat={'bagging_temperature': 2.0022013667340754,
'boosting_type': 'Ordered',
'bootstrap_type': 'Bayesian',
'colsample_bylevel': 0.010267520707686379,
'learning_rate': 0.08580081062052972,
'max_depth': 2,
'n_estimators': 1000,
'random_state': 2020,
'reg_lambda': 250.0}
cat_model = CatBoostClassifier(**param_cat,
eval_metric="Logloss",)
cat_model.fit(X_train,y_train,eval_set=(X_valid,y_valid),use_best_model=True,
verbose=True,early_stopping_rounds=30,
cat_features=['Batch Enrolled', 'Grade', 'Sub Grade',
'Home Ownership','Verification Status',
'Initial List Status', 'Application Type'])
pd.DataFrame({'train_logloss':cat_model.evals_result_['learn']['Logloss'],
'validation_logloss':cat_model.evals_result_['validation']['Logloss']}).plot()
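# The classification metrics imported earlier (f1_score, classification_report,
# confusion_matrix) are not used above; a minimal hold-out evaluation sketch for the
# fitted CatBoost model (assuming Loan Status is a 0/1 label):
val_pred = cat_model.predict(X_valid)
print(classification_report(y_valid, val_pred))
print(confusion_matrix(y_valid, val_pred))
print('macro F1:', f1_score(y_valid, val_pred, average='macro'))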
!pip install shap
import shap
shap.initjs()
explainer = shap.TreeExplainer(cat_model)
shap_values = explainer.shap_values(Pool(X,y,cat_features=['Batch Enrolled', 'Grade', 'Sub Grade',
'Home Ownership','Verification Status',
'Initial List Status', 'Application Type']))
shap.summary_plot(shap_values,X,class_names=['a','b'],plot_type ='bar')
shap.summary_plot(shap_values,X)
shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:],matplotlib=True)
# Label-encode the remaining string columns for the XGBoost and LightGBM models
# (CatBoost above consumed them as raw categoricals).
lbl = LabelEncoder()
for i in X.select_dtypes(include='object').columns.values:
    X[i] = lbl.fit_transform(X[i])
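# Caution: the loop above refits a single LabelEncoder in place, so the per-column
# mappings are not kept. A sketch that stores one fitted encoder per column (useful
# if the same mapping must later be applied to the test features) could have looked like:
encoders = {}
for col in X.select_dtypes(include='object').columns:
    encoders[col] = LabelEncoder().fit(X[col])
    X[col] = encoders[col].transform(X[col])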
def objective(trial):
    # XGBoost search space; the mean validation log loss over 10 stratified folds is minimized
    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        # exact is usable here because the dataset is small
        "tree_method": trial.suggest_categorical("tree_method", ['exact', 'approx', 'hist']),
        "eta": trial.suggest_float("eta", 0.001, 1.0),
        # 'interaction_constraints':[[2,3,8,12],[12,13,14],[5,6,9,12]],
        # booster type; gblinear would give linear functions
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        # L2 regularization weight
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # row sampling ratio for the training data
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
        # column sampling per tree
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "n_estimators": trial.suggest_categorical("n_estimators", [200, 400, 600, 800, 1000]),
    }
    if param["booster"] in ["gbtree", "dart"]:
        # maximum tree depth controls the complexity of each tree
        param["max_depth"] = trial.suggest_int("max_depth", 2, 60)
        # larger min_child_weight makes the tree more conservative
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 60)
        # gamma defines how selective the splits are
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation_0-logloss")
    skf = StratifiedKFold(n_splits=10, random_state=22, shuffle=True)
    scores = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
        xgb_clf = XGBClassifier(**param)
        xgb_clf.fit(X_train, y_train,
                    eval_set=[(X_valid, y_valid)],
                    eval_metric='logloss',
                    early_stopping_rounds=50,
                    callbacks=[pruning_callback])
        preds = xgb_clf.predict_proba(X_valid)
        scores.append(log_loss(y_valid, preds))
    return np.mean(scores)
if __name__ == "__main__":
    study1 = optuna.create_study(direction='minimize', sampler=RandomSampler(),
                                 pruner=optuna.pruners.SuccessiveHalvingPruner())
    study1.optimize(objective, n_trials=100)
    print('Number of finished trials:', len(study1.trials))
    print('Best trial:', study1.best_trial.params)
100*study1.best_value
study1.best_params
optuna.visualization.plot_optimization_history(study1)
optuna.visualization.plot_slice(study1)
optuna.visualization.plot_param_importances(study1)
params_xgb={'alpha': 0.00013628501532199046,
'booster': 'gbtree',
'colsample_bytree': 0.4510459869291909,
'eta': 0.23201860736451954,
'gamma': 1.18221420230537e-05,
'grow_policy': 'depthwise',
'lambda': 7.036258743817963e-06,
'max_depth': 6,
'min_child_weight': 52,
'n_estimators': 400,
'subsample': 0.44069082236462886,
'tree_method': 'exact'}
X.rename(columns=lambda x: x.replace(' ', '_'),inplace=True)
X_train1, X_valid1, y_train1, y_valid1 = train_test_split(X, y, test_size = 0.2,
random_state = 42,
shuffle=True,stratify=y)
xgb_model = XGBClassifier(**params_xgb)
xgb_model.fit(X_train1,y_train1,eval_set=[(X_valid1,y_valid1)],
eval_metric='logloss',early_stopping_rounds=30,verbose=True)
plot_tree(xgb_model)
fig = plt.gcf()
fig.set_size_inches(30, 30)
explainer_xgb = shap.TreeExplainer(xgb_model)
shap_values_xgb = explainer_xgb.shap_values(X)
shap.summary_plot(shap_values_xgb,X,plot_type ='bar')
def objective(trial):
    # LightGBM search space; the scaled (x100) mean validation log loss over 10 stratified folds is minimized
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "boosting_type": trial.suggest_categorical("boosting_type", ['rf', 'gbdt']),
        "n_estimators": trial.suggest_categorical("n_estimators", [200, 400, 600, 800, 1000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 1.0),
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "max_depth": trial.suggest_int("max_depth", 2, 50),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 100),
        "lambda_l1": trial.suggest_int("lambda_l1", 1, 1000),
        "lambda_l2": trial.suggest_int("lambda_l2", 1, 1000),
        'random_state': trial.suggest_categorical('random_state', [24, 48, 2020]),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.95, step=0.1),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1, 2, 4, 6, 8, 10]),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.95, step=0.1),
    }
    skf = StratifiedKFold(n_splits=10, random_state=22, shuffle=True)
    scores = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
        model = lgbm.LGBMClassifier(objective="binary", **param_grid)
        model.fit(X_train, y_train,
                  eval_set=[(X_valid, y_valid)],
                  eval_metric='logloss',
                  early_stopping_rounds=50,
                  # pruning callback reports the fold's binary_logloss to Optuna
                  callbacks=[LightGBMPruningCallback(trial, 'binary_logloss')])
        preds = model.predict_proba(X_valid)
        scores.append(log_loss(y_valid, preds))
    return 100 * np.mean(scores)
if __name__ == "__main__":
    study2 = optuna.create_study(direction='minimize', sampler=RandomSampler(),
                                 pruner=optuna.pruners.ThresholdPruner(lower=0.0))
    study2.optimize(objective, n_trials=100)
    print('Number of finished trials:', len(study2.trials))
    print('Best trial:', study2.best_trial.params)
study2.best_value
study2.best_params
optuna.visualization.plot_optimization_history(study2)
optuna.visualization.plot_slice(study2)
optuna.visualization.plot_param_importances(study2)
params_lgbm={'bagging_fraction': 0.7,
'bagging_freq': 8,
'boosting_type': 'rf',
'feature_fraction': 0.4,
'lambda_l1': 9,
'lambda_l2': 517,
'learning_rate': 0.30902486498872506,
'max_depth': 4,
'min_data_in_leaf': 7,
'min_gain_to_split': 1.776112821650258,
'n_estimators': 400,
'num_leaves': 36,
'random_state': 2020}
lgb_model=lgbm.LGBMClassifier(objective="binary", **params_lgbm)
lgb_model.fit(X_train1,y_train1,eval_set=[(X_valid1,y_valid1)],
eval_metric='logloss',early_stopping_rounds=30)
lgbm.plot_tree(lgb_model)
fig = plt.gcf()
fig.set_size_inches(30, 30)
lgbm.plot_metric(lgb_model)
explainer_lgb = shap.TreeExplainer(lgb_model)
shap_values_lgb = explainer_lgb.shap_values(X)
shap.summary_plot(shap_values_lgb,X,plot_type ='bar')
# standardize the (now fully numeric) feature columns before feeding the neural network
scl = StandardScaler()
for i in X.select_dtypes(exclude='object').columns.values:
    X[i] = scl.fit_transform(X[[i]])
from tensorflow import keras
from tensorflow.keras import layers
nn = keras.Sequential([
layers.BatchNormalization(input_shape = [X.shape[1]]),
layers.Dense(units = 128, activation = 'relu'),
layers.BatchNormalization(),
layers.Dropout(rate = 0.5),
layers.Dense(units = 64, activation = 'relu'),
layers.BatchNormalization(),
layers.Dropout(rate = 0.4),
layers.Dense(units = 32, activation = 'relu'),
layers.BatchNormalization(),
layers.Dropout(rate = 0.3),
layers.Dense(units = 1, activation = 'sigmoid')
])
# track BinaryAccuracy during training (binary cross-entropy is the loss itself)
acc = keras.metrics.BinaryAccuracy()
nn.compile(optimizer=keras.optimizers.Adam(),
           loss=keras.losses.BinaryCrossentropy(),
           metrics=[acc])
early_stopping = keras.callbacks.EarlyStopping(patience = 20,
min_delta = 0.001,
restore_best_weights = False)
initial_learning_rate = 0.01
def lr_step_decay(epoch, lr):
    # halve the learning rate every 10 epochs, starting from initial_learning_rate
    drop_rate = 0.5
    epochs_drop = 10.0
    return initial_learning_rate * math.pow(drop_rate, math.floor(epoch / epochs_drop))
callback_lr = keras.callbacks.LearningRateScheduler(lr_step_decay, verbose=1)
history =nn.fit(X, y,
validation_split=0.2,
batch_size = 64,
epochs = 100,
callbacks = [early_stopping,callback_lr
]
)
history_df = pd.DataFrame(history.history)
history_df.head()
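# Quick look at the network's own learning curves (assuming Keras's default metric
# name 'binary_accuracy' for the BinaryAccuracy metric compiled above):
history_df[['loss', 'val_loss']].plot(title='NN binary cross-entropy')
history_df[['binary_accuracy', 'val_binary_accuracy']].plot(title='NN binary accuracy')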
plt.figure(figsize=(10,10))
plt.plot(xgb_model.evals_result_['validation_0']['logloss'],label="xgb_validation_loss")
plt.plot(cat_model.evals_result_['validation']['Logloss'],label="catboost_validation_loss")
plt.plot(lgb_model.evals_result_['valid_0']['binary_logloss'],label="lgbm_validation_loss")
plt.plot(history_df['val_loss'],label="Keras_Nnet_validation_loss")
plt.legend()
plt.xticks(color='w')
plt.title("Model's Validation-Logloss Comparison")