!pip install -U klib
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # visualization
from matplotlib import pyplot as plt # visualization
import klib # visualization
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from google.colab import drive
drive.mount('/content/gdrive')
train=pd.read_csv("/content/gdrive/MyDrive/deloitte_ml_challenge_predict_loan_defaulters./train.csv")
test=pd.read_csv("/content/gdrive/MyDrive/deloitte_ml_challenge_predict_loan_defaulters./test.csv")
# The 'Employment Duration' and 'Home Ownership' columns appear to be swapped in the
# raw files, so swap the names back on both train and test.
train = train.rename(columns={'Employment Duration': 'Home Ownership',
                              'Home Ownership': 'Employment Duration'})
test = test.rename(columns={'Employment Duration': 'Home Ownership',
                            'Home Ownership': 'Employment Duration'})
train.info()
test.info()
train.isnull().sum()
test.isnull().sum()
train.head(3)
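# klib is installed and imported above but not used later; a minimal sketch of its
# one-line EDA helpers (assuming the installed klib version exposes these functions):
klib.missingval_plot(train)                              # missing values per column
klib.corr_plot(train.select_dtypes(exclude='object'))    # correlation heatmap of numeric columns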
def cat_plot(df, catcol, title='', **arg):
    # count plot for a categorical column, ordered by frequency
    _ = plt.figure(figsize=(8, 5))
    _ = sns.countplot(data=df, x=catcol, order=df[catcol].value_counts().index, **arg)
    _ = plt.title(title, fontsize=25)
    _ = plt.xlabel(catcol, fontsize=15)
    _ = plt.xticks(fontsize=10, rotation=90)
cat_plot(train, "Loan Status", "Target (Loan Status) Column Distribution")
train["Loan Status"].value_counts()
_=plt.figure(figsize=(8,15))
_=sns.countplot(y=train['Batch Enrolled'],hue=train['Loan Status'].astype('object'))
train.select_dtypes(include="object").columns.values
fig = plt.subplots(figsize=(20, 20))
for i, col in enumerate(['Grade', 'Sub Grade', 'Home Ownership',
                         'Verification Status', 'Payment Plan',
                         'Initial List Status', 'Application Type']):
    _ = plt.subplot(4, 2, i+1)
    _ = sns.countplot(x=train[col], hue=train['Loan Status'].astype('object'))
    _ = plt.title(col + ' Distribution', fontsize=15)
    _ = plt.xlabel(col, fontsize=10)
    _ = plt.xticks(fontsize=15)
    _ = plt.tight_layout()
plt.show()
for col in train.select_dtypes(include="object").columns.values:
    print("==" * 50)
    print(col)
    print(train[col].value_counts())
    print("==" * 50)
fig = plt.subplots(figsize=(20, 20))
for i, col in enumerate(train.select_dtypes(exclude="object").columns.values):
    plt.subplot(13, 2, i+1)
    _ = sns.histplot(data=train, x=col, hue="Loan Status")
    _ = plt.title(col + ' Distribution', fontsize=15)
    _ = plt.xlabel("")
    _ = plt.xticks(fontsize=8)
    _ = plt.tight_layout()
plt.show()
for col in train.select_dtypes(exclude="object").columns.values:
    print("==" * 50)
    print(col)
    print(train[col].describe())
    print("==" * 50)
train['Accounts Delinquent'].value_counts()
def pre_process(df):
    # drop the Loan Title and Payment Plan columns
    df = df.drop(columns=['Loan Title', 'Payment Plan'])
    # interest per month
    df['Interest_per_mon'] = ((df['Loan Amount'] * df['Interest Rate']) / 100) / df['Term']
    # total interest amount
    df['total_intr_amt'] = df['Interest_per_mon'] * df['Term']
    # flag: investor-funded amount greater than funded amount
    df['Fund_amnt_grt'] = (df['Funded Amount Investor'] > df['Funded Amount']).astype('int')
    # total revolving amount
    df['total_revolve'] = df['Revolving Balance'] + df['Revolving Utilities']
    # total received amount
    df['total_received'] = df['Total Received Interest'] + df['Total Collection Amount']
    # total recovery amount
    df['total_recovery'] = df['Recoveries'] + df['Collection Recovery Fee']
    # flag: total revolving amount less than total revolving credit limit
    df['revolve_amnt_grt'] = (df['total_revolve'] < df['Total Revolving Credit Limit']).astype('int')
    # flag: loan amount greater than total current balance
    df['loan_grt_balance'] = (df['Loan Amount'] > df['Total Current Balance']).astype('int')
    # number of customers per batch (representative's customer count)
    df['reprs_cust_count'] = df.groupby(['Batch Enrolled'])['Batch Enrolled'].transform('count')
    return df
train=pre_process(train)
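# Quick sanity check on the engineered columns created by pre_process (a minimal sketch)
train[['Interest_per_mon', 'total_intr_amt', 'Fund_amnt_grt', 'total_revolve',
       'total_received', 'total_recovery', 'revolve_amnt_grt', 'loan_grt_balance',
       'reprs_cust_count']].describe()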
grpcol=['Batch Enrolled', 'Grade', 'Sub Grade', 'Home Ownership',
'Verification Status', 'Initial List Status', 'Application Type']
numcols=['Loan Amount', 'Funded Amount', 'Funded Amount Investor',
'Interest Rate', 'Employment Duration', 'Debit to Income',
'Open Account', 'Revolving Balance', 'Revolving Utilities',
'Total Accounts', 'Total Received Interest', 'Total Received Late Fee',
'Recoveries', 'Collection Recovery Fee', 'Total Collection Amount',
'Total Current Balance', 'Total Revolving Credit Limit',
'Interest_per_mon', 'total_intr_amt',
'total_revolve', 'total_received', 'total_recovery']
for col in numcols:
    df1 = (train.groupby(grpcol)[col]
                .agg(['min', 'median', 'max']).reset_index())
    df1 = df1.rename(columns={c: col + '_' + c for c in ['min', 'median', 'max']})
    train = pd.merge(train, df1, on=grpcol, how='left')
for c, i in enumerate(train.columns.values):
    print(f"{c}_{i}")
test=pre_process(test)
for col in numcols:
    df1 = (test.groupby(grpcol)[col]
               .agg(['min', 'median', 'max']).reset_index())
    df1 = df1.rename(columns={c: col + '_' + c for c in ['min', 'median', 'max']})
    test = pd.merge(test, df1, on=grpcol, how='left')
for c, i in enumerate(test.columns.values):
    print(f"{c}_{i}")
!pip install optuna
!pip install catboost
from sklearn.model_selection import cross_val_score,KFold,train_test_split,ShuffleSplit,StratifiedKFold,learning_curve
from catboost import CatBoostClassifier,Pool,cv,monoforest
import optuna
from optuna.samplers import RandomSampler,TPESampler,MOTPESampler,CmaEsSampler
from sklearn.metrics import f1_score,classification_report,confusion_matrix,log_loss
from xgboost import XGBClassifier,plot_tree
import xgboost as xgb
from optuna.integration import XGBoostPruningCallback,LightGBMPruningCallback
from sklearn.preprocessing import LabelEncoder,StandardScaler,PolynomialFeatures
import lightgbm as lgbm
import math
for c, i in enumerate(train.columns.values):
    print(f"{c}_{i}")
train.select_dtypes(include='object').columns
# feature matrix: all columns except position 0 (presumably the ID) and the target 'Loan Status' at position 32
X = train.iloc[:, np.r_[1:32, 33:108]]
y=train['Loan Status']
def objective(trial):
    # CatBoost search space; the mean validation log loss over 5 stratified folds is minimized
    param = {
        'reg_lambda': trial.suggest_categorical('reg_lambda', [0.1, 1.0, 5.0, 10.0, 50.0, 100.0, 150.0, 200.0, 250.0]),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0),
        'n_estimators': trial.suggest_categorical('n_estimators', [200, 400, 600, 800, 1000]),
        'max_depth': trial.suggest_int('max_depth', 2, 12),
        'random_state': trial.suggest_categorical('random_state', [1024, 1048, 2020]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
    }
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
    skf = StratifiedKFold(n_splits=5, random_state=2000, shuffle=True)
    scores = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
        cat_clf = CatBoostClassifier(early_stopping_rounds=30, eval_metric="Logloss",
                                     logging_level="Silent",
                                     **param)
        cat_clf.fit(X_train, y_train,
                    cat_features=['Batch Enrolled', 'Grade', 'Sub Grade',
                                  'Home Ownership', 'Verification Status',
                                  'Initial List Status', 'Application Type'])
        preds = cat_clf.predict_proba(X_valid)
        scores.append(log_loss(y_valid, preds))
    return np.mean(scores)
if __name__ == "__main__":
    study = optuna.create_study(direction='minimize', sampler=MOTPESampler(),
                                pruner=optuna.pruners.SuccessiveHalvingPruner())
    study.optimize(objective, n_trials=100)
    print('Number of finished trials:', len(study.trials))
    print('Best trial:', study.best_trial.params)
study.best_value
study.best_params
optuna.visualization.plot_optimization_history(study)
optuna.visualization.plot_slice(study)
optuna.visualization.plot_param_importances(study)
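# Optional: persist the finished study so the tuned parameters survive a Colab restart
# (a minimal sketch; the Drive path below is only an assumption).
import joblib
joblib.dump(study, '/content/gdrive/MyDrive/catboost_optuna_study.pkl')
# study = joblib.load('/content/gdrive/MyDrive/catboost_optuna_study.pkl')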
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.2,
random_state = 42,
shuffle=True,stratify=y)
param_cat={'bagging_temperature': 2.0022013667340754,
'boosting_type': 'Ordered',
'bootstrap_type': 'Bayesian',
'colsample_bylevel': 0.010267520707686379,
'learning_rate': 0.08580081062052972,
'max_depth': 2,
'n_estimators': 1000,
'random_state': 2020,
'reg_lambda': 250.0}
cat_model = CatBoostClassifier(**param_cat,
eval_metric="Logloss",)
cat_model.fit(X_train,y_train,eval_set=(X_valid,y_valid),use_best_model=True,
verbose=True,early_stopping_rounds=30,
cat_features=['Batch Enrolled', 'Grade', 'Sub Grade',
'Home Ownership','Verification Status',
'Initial List Status', 'Application Type'])
pd.DataFrame({'train_logloss':cat_model.evals_result_['learn']['Logloss'],
'validation_logloss':cat_model.evals_result_['validation']['Logloss']}).plot()
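# The classification metrics imported earlier (f1_score, classification_report,
# confusion_matrix) are not used above; a minimal hold-out evaluation sketch for the
# fitted CatBoost model (assuming Loan Status is a 0/1 label):
val_pred = cat_model.predict(X_valid)
print(classification_report(y_valid, val_pred))
print(confusion_matrix(y_valid, val_pred))
print('macro F1:', f1_score(y_valid, val_pred, average='macro'))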
!pip install shap
import shap
shap.initjs()
explainer = shap.TreeExplainer(cat_model)
shap_values = explainer.shap_values(Pool(X,y,cat_features=['Batch Enrolled', 'Grade', 'Sub Grade',
'Home Ownership','Verification Status',
'Initial List Status', 'Application Type']))
shap.summary_plot(shap_values,X,class_names=['a','b'],plot_type ='bar')
shap.summary_plot(shap_values,X)
shap.force_plot(explainer.expected_value, shap_values[0,:], X.iloc[0,:],matplotlib=True)
# Label-encode the remaining string columns for the XGBoost and LightGBM models
# (CatBoost above consumed them as raw categoricals).
lbl = LabelEncoder()
for i in X.select_dtypes(include='object').columns.values:
    X[i] = lbl.fit_transform(X[i])
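# Caution: the loop above refits a single LabelEncoder in place, so the per-column
# mappings are not kept. A sketch that stores one fitted encoder per column (useful
# if the same mapping must later be applied to the test features) could have looked like:
encoders = {}
for col in X.select_dtypes(include='object').columns:
    encoders[col] = LabelEncoder().fit(X[col])
    X[col] = encoders[col].transform(X[col])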
def objective(trial):
    # XGBoost search space; the mean validation log loss over 10 stratified folds is minimized
    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        # exact is usable here because the dataset is small
        "tree_method": trial.suggest_categorical("tree_method", ['exact', 'approx', 'hist']),
        "eta": trial.suggest_float("eta", 0.001, 1.0),
        # 'interaction_constraints':[[2,3,8,12],[12,13,14],[5,6,9,12]],
        # booster type; gblinear would give linear functions
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        # L2 regularization weight
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # row sampling ratio for the training data
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
        # column sampling per tree
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "n_estimators": trial.suggest_categorical("n_estimators", [200, 400, 600, 800, 1000]),
    }
    if param["booster"] in ["gbtree", "dart"]:
        # maximum tree depth controls the complexity of each tree
        param["max_depth"] = trial.suggest_int("max_depth", 2, 60)
        # larger min_child_weight makes the tree more conservative
        param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 60)
        # gamma defines how selective the splits are
        param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation_0-logloss")
    skf = StratifiedKFold(n_splits=10, random_state=22, shuffle=True)
    scores = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
        xgb_clf = XGBClassifier(**param)
        xgb_clf.fit(X_train, y_train,
                    eval_set=[(X_valid, y_valid)],
                    eval_metric='logloss',
                    early_stopping_rounds=50,
                    callbacks=[pruning_callback])
        preds = xgb_clf.predict_proba(X_valid)
        scores.append(log_loss(y_valid, preds))
    return np.mean(scores)
if __name__ == "__main__":
    study1 = optuna.create_study(direction='minimize', sampler=RandomSampler(),
                                 pruner=optuna.pruners.SuccessiveHalvingPruner())
    study1.optimize(objective, n_trials=100)
    print('Number of finished trials:', len(study1.trials))
    print('Best trial:', study1.best_trial.params)
100*study1.best_value
study1.best_params
optuna.visualization.plot_optimization_history(study1)
optuna.visualization.plot_slice(study1)
optuna.visualization.plot_param_importances(study1)
params_xgb={'alpha': 0.00013628501532199046,
'booster': 'gbtree',
'colsample_bytree': 0.4510459869291909,
'eta': 0.23201860736451954,
'gamma': 1.18221420230537e-05,
'grow_policy': 'depthwise',
'lambda': 7.036258743817963e-06,
'max_depth': 6,
'min_child_weight': 52,
'n_estimators': 400,
'subsample': 0.44069082236462886,
'tree_method': 'exact'}
X.rename(columns=lambda x: x.replace(' ', '_'),inplace=True)
X_train1, X_valid1, y_train1, y_valid1 = train_test_split(X, y, test_size = 0.2,
random_state = 42,
shuffle=True,stratify=y)
xgb_model = XGBClassifier(**params_xgb)
xgb_model.fit(X_train1,y_train1,eval_set=[(X_valid1,y_valid1)],
eval_metric='logloss',early_stopping_rounds=30,verbose=True)
plot_tree(xgb_model)
fig = plt.gcf()
fig.set_size_inches(30, 30)
explainer_xgb = shap.TreeExplainer(xgb_model)
shap_values_xgb = explainer_xgb.shap_values(X)
shap.summary_plot(shap_values_xgb,X,plot_type ='bar')
def objective(trial):
    # LightGBM search space; the scaled (x100) mean validation log loss over 10 stratified folds is minimized
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "boosting_type": trial.suggest_categorical("boosting_type", ['rf', 'gbdt']),
        "n_estimators": trial.suggest_categorical("n_estimators", [200, 400, 600, 800, 1000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 1.0),
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "max_depth": trial.suggest_int("max_depth", 2, 50),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 100),
        "lambda_l1": trial.suggest_int("lambda_l1", 1, 1000),
        "lambda_l2": trial.suggest_int("lambda_l2", 1, 1000),
        'random_state': trial.suggest_categorical('random_state', [24, 48, 2020]),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.95, step=0.1),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1, 2, 4, 6, 8, 10]),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.95, step=0.1),
    }
    skf = StratifiedKFold(n_splits=10, random_state=22, shuffle=True)
    scores = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
        model = lgbm.LGBMClassifier(objective="binary", **param_grid)
        model.fit(X_train, y_train,
                  eval_set=[(X_valid, y_valid)],
                  eval_metric='logloss',
                  early_stopping_rounds=50,
                  # pruning callback reports the fold's binary_logloss to Optuna
                  callbacks=[LightGBMPruningCallback(trial, 'binary_logloss')])
        preds = model.predict_proba(X_valid)
        scores.append(log_loss(y_valid, preds))
    return 100 * np.mean(scores)
if __name__ == "__main__":
    study2 = optuna.create_study(direction='minimize', sampler=RandomSampler(),
                                 pruner=optuna.pruners.ThresholdPruner(lower=0.0))
    study2.optimize(objective, n_trials=100)
    print('Number of finished trials:', len(study2.trials))
    print('Best trial:', study2.best_trial.params)
study2.best_value
study2.best_params
optuna.visualization.plot_optimization_history(study2)
optuna.visualization.plot_slice(study2)
optuna.visualization.plot_param_importances(study2)
params_lgbm={'bagging_fraction': 0.7,
'bagging_freq': 8,
'boosting_type': 'rf',
'feature_fraction': 0.4,
'lambda_l1': 9,
'lambda_l2': 517,
'learning_rate': 0.30902486498872506,
'max_depth': 4,
'min_data_in_leaf': 7,
'min_gain_to_split': 1.776112821650258,
'n_estimators': 400,
'num_leaves': 36,
'random_state': 2020}
lgb_model=lgbm.LGBMClassifier(objective="binary", **params_lgbm)
lgb_model.fit(X_train1,y_train1,eval_set=[(X_valid1,y_valid1)],
eval_metric='logloss',early_stopping_rounds=30)
lgbm.plot_tree(lgb_model)
fig = plt.gcf()
fig.set_size_inches(30, 30)
lgbm.plot_metric(lgb_model)
explainer_lgb = shap.TreeExplainer(lgb_model)
shap_values_lgb = explainer_lgb.shap_values(X)
shap.summary_plot(shap_values_lgb,X,plot_type ='bar')
# standardize the (now fully numeric) feature columns before feeding the neural network
scl = StandardScaler()
for i in X.select_dtypes(exclude='object').columns.values:
    X[i] = scl.fit_transform(X[[i]])
from tensorflow import keras
from tensorflow.keras import layers
nn = keras.Sequential([
layers.BatchNormalization(input_shape = [X.shape[1]]),
layers.Dense(units = 128, activation = 'relu'),
layers.BatchNormalization(),
layers.Dropout(rate = 0.5),
layers.Dense(units = 64, activation = 'relu'),
layers.BatchNormalization(),
layers.Dropout(rate = 0.4),
layers.Dense(units = 32, activation = 'relu'),
layers.BatchNormalization(),
layers.Dropout(rate = 0.3),
layers.Dense(units = 1, activation = 'sigmoid')
])
# track BinaryAccuracy during training (binary cross-entropy is the loss itself)
acc = keras.metrics.BinaryAccuracy()
nn.compile(optimizer=keras.optimizers.Adam(),
           loss=keras.losses.BinaryCrossentropy(),
           metrics=[acc])
early_stopping = keras.callbacks.EarlyStopping(patience = 20,
min_delta = 0.001,
restore_best_weights = False)
initial_learning_rate = 0.01
def lr_step_decay(epoch, lr):
    # halve the learning rate every 10 epochs, starting from initial_learning_rate
    drop_rate = 0.5
    epochs_drop = 10.0
    return initial_learning_rate * math.pow(drop_rate, math.floor(epoch / epochs_drop))
callback_lr = keras.callbacks.LearningRateScheduler(lr_step_decay, verbose=1)
history =nn.fit(X, y,
validation_split=0.2,
batch_size = 64,
epochs = 100,
callbacks = [early_stopping,callback_lr
]
)
history_df = pd.DataFrame(history.history)
history_df.head()
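# Quick look at the network's own learning curves (assuming Keras's default metric
# name 'binary_accuracy' for the BinaryAccuracy metric compiled above):
history_df[['loss', 'val_loss']].plot(title='NN binary cross-entropy')
history_df[['binary_accuracy', 'val_binary_accuracy']].plot(title='NN binary accuracy')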
plt.figure(figsize=(10,10))
plt.plot(xgb_model.evals_result_['validation_0']['logloss'],label="xgb_validation_loss")
plt.plot(cat_model.evals_result_['validation']['Logloss'],label="catboost_validation_loss")
plt.plot(lgb_model.evals_result_['valid_0']['binary_logloss'],label="lgbm_validation_loss")
plt.plot(history_df['val_loss'],label="Keras_Nnet_validation_loss")
plt.legend()
plt.xticks(color='w')
plt.title("Model's Validation-Logloss Comparison")