!pip install -U klib
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # visualization
from matplotlib import pyplot as plt # visualization
import klib # visualization
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_selection import chi2
from scipy.stats import chi2_contingency
from google.colab import drive
drive.mount('/content/gdrive')
Mounted at /content/gdrive
# Read the train and test splits from the mounted Google Drive folder.
train = pd.read_csv(
    "/content/gdrive/MyDrive/deloitte_ml_challenge_predict_loan_defaulters./train.csv"
)
test = pd.read_csv(
    "/content/gdrive/MyDrive/deloitte_ml_challenge_predict_loan_defaulters./test.csv"
)

# Exchange the 'Employment Duration' and 'Home Ownership' labels in both
# frames. After the swap 'Home Ownership' is the object-dtype column and
# 'Employment Duration' the float one (see the info() output below), so the
# raw headers are presumably interchanged in the files -- TODO confirm.
swapped_labels = {
    'Employment Duration': 'Home Ownership',
    'Home Ownership': 'Employment Duration',
}
train = train.rename(columns=swapped_labels)
test = test.rename(columns=swapped_labels)

# Summarise dtypes and non-null counts of the training frame.
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67463 entries, 0 to 67462
Data columns (total 35 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 67463 non-null int64
1 Loan Amount 67463 non-null int64
2 Funded Amount 67463 non-null int64
3 Funded Amount Investor 67463 non-null float64
4 Term 67463 non-null int64
5 Batch Enrolled 67463 non-null object
6 Interest Rate 67463 non-null float64
7 Grade 67463 non-null object
8 Sub Grade 67463 non-null object
9 Home Ownership 67463 non-null object
10 Employment Duration 67463 non-null float64
11 Verification Status 67463 non-null object
12 Payment Plan 67463 non-null object
13 Loan Title 67463 non-null object
14 Debit to Income 67463 non-null float64
15 Delinquency - two years 67463 non-null int64
16 Inquires - six months 67463 non-null int64
17 Open Account 67463 non-null int64
18 Public Record 67463 non-null int64
19 Revolving Balance 67463 non-null int64
20 Revolving Utilities 67463 non-null float64
21 Total Accounts 67463 non-null int64
22 Initial List Status 67463 non-null object
23 Total Received Interest 67463 non-null float64
24 Total Received Late Fee 67463 non-null float64
25 Recoveries 67463 non-null float64
26 Collection Recovery Fee 67463 non-null float64
27 Collection 12 months Medical 67463 non-null int64
28 Application Type 67463 non-null object
29 Last week Pay 67463 non-null int64
30 Accounts Delinquent 67463 non-null int64
31 Total Collection Amount 67463 non-null int64
32 Total Current Balance 67463 non-null int64
33 Total Revolving Credit Limit 67463 non-null int64
34 Loan Status 67463 non-null int64
dtypes: float64(9), int64(17), object(9)
memory usage: 18.0+ MB
# Summarise dtypes and non-null counts of the test frame.
test.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28913 entries, 0 to 28912
Data columns (total 35 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 28913 non-null int64
1 Loan Amount 28913 non-null int64
2 Funded Amount 28913 non-null int64
3 Funded Amount Investor 28913 non-null float64
4 Term 28913 non-null int64
5 Batch Enrolled 28913 non-null object
6 Interest Rate 28913 non-null float64
7 Grade 28913 non-null object
8 Sub Grade 28913 non-null object
9 Home Ownership 28913 non-null object
10 Employment Duration 28913 non-null float64
11 Verification Status 28913 non-null object
12 Payment Plan 28913 non-null object
13 Loan Title 28913 non-null object
14 Debit to Income 28913 non-null float64
15 Delinquency - two years 28913 non-null int64
16 Inquires - six months 28913 non-null int64
17 Open Account 28913 non-null int64
18 Public Record 28913 non-null int64
19 Revolving Balance 28913 non-null int64
20 Revolving Utilities 28913 non-null float64
21 Total Accounts 28913 non-null int64
22 Initial List Status 28913 non-null object
23 Total Received Interest 28913 non-null float64
24 Total Received Late Fee 28913 non-null float64
25 Recoveries 28913 non-null float64
26 Collection Recovery Fee 28913 non-null float64
27 Collection 12 months Medical 28913 non-null int64
28 Application Type 28913 non-null object
29 Last week Pay 28913 non-null int64
30 Accounts Delinquent 28913 non-null int64
31 Total Collection Amount 28913 non-null int64
32 Total Current Balance 28913 non-null int64
33 Total Revolving Credit Limit 28913 non-null int64
34 Loan Status 0 non-null float64
dtypes: float64(10), int64(16), object(9)
memory usage: 7.7+ MB
# Per-column count of missing values in the training frame.
train.isnull().sum()
# Per-column count of missing values in the test frame.
test.isnull().sum()
# Preview the first three training rows.
train.head(3)
def cat_plot(df, catcol, title='', **arg):
    """Draw a count plot of ``df[catcol]`` with bars ordered by frequency.

    Parameters
    ----------
    df : DataFrame handed to seaborn as ``data``.
    catcol : name of the column whose values are counted.
    title : text shown as the figure title.
    **arg : extra keyword arguments forwarded to ``sns.countplot``.
    """
    # Most frequent category first.
    bar_order = df[catcol].value_counts().index

    plt.figure(figsize=(8, 5))
    sns.countplot(data=df, x=catcol, order=bar_order, **arg)
    plt.title(title, fontsize=25)
    plt.xlabel(catcol, fontsize=15)
    # Vertical tick labels so long category names do not overlap.
    plt.xticks(fontsize=10, rotation=90)
# Class balance of the target column: first as a count plot, then as raw
# counts. (Plot title spelling corrected: "Taget" -> "Target".)
cat_plot(train, "Loan Status", "Target(Loan Status) Column Distribution")
train["Loan Status"].value_counts()
# Horizontal count plot of every batch id, with one bar per loan status.
# The target is cast to object so seaborn treats it as a categorical hue.
loan_status_hue = train['Loan Status'].astype('object')
batch_ids = train['Batch Enrolled']
_ = plt.figure(figsize=(8, 15))
_ = sns.countplot(y=batch_ids, hue=loan_status_hue)
# Names of the object-dtype columns of the training frame.
train.select_dtypes(include="object").columns.values
# Grid of count plots (4 rows x 2 columns), one per categorical column,
# each split by loan status.
#
# plt.figure is used rather than plt.subplots: plt.subplots would also create
# a full-size Axes that stays underneath the panels added by plt.subplot
# below, and would bind `fig` to a (Figure, Axes) tuple instead of the Figure.
fig = plt.figure(figsize=(20, 20))

# Cast once, outside the loop; object dtype makes seaborn treat the target
# as a categorical hue.
loan_status_hue = train['Loan Status'].astype('object')

for i, col in enumerate(['Grade', 'Sub Grade', 'Home Ownership',
                         'Verification Status', 'Payment Plan',
                         'Initial List Status', 'Application Type']):
    _ = plt.subplot(4, 2, i + 1)
    _ = sns.countplot(x=train[col], hue=loan_status_hue)
    _ = plt.title(col + ' Distribution', fontsize=15)
    _ = plt.xlabel(col, fontsize=10)
    _ = plt.xticks(fontsize=15)

# Lay the panels out once, after all of them exist.
_ = plt.tight_layout()
plt.show()
# Print the value counts of every object-dtype column, each section framed
# by a 100-character rule above and below.
rule = "==" * 50
for col in train.select_dtypes(include="object").columns.values:
    print(rule, col, train[col].value_counts(), rule, sep="\n")
====================================================================================================
Batch Enrolled
BAT3873588 3626
BAT1586599 3142
BAT1104812 2996
BAT2252229 2557
BAT2803411 2425
BAT1780517 2403
BAT1184694 2298
BAT2078974 2290
BAT2575549 2257
BAT4694572 2248
BAT4271519 2054
BAT2558388 1963
BAT3193689 1864
BAT1930365 1844
BAT2136391 1790
BAT2333412 1775
BAT3726927 1774
BAT4136152 1766
BAT5341619 1717
BAT5525466 1709
BAT5489674 1677
BAT5629144 1639
BAT1766061 1461
BAT2833642 1421
BAT5924421 1404
BAT2522922 1399
BAT2428731 1398
BAT4808022 1303
BAT4351734 1140
BAT5547201 1127
BAT5714674 1105
BAT3461431 1068
BAT224923 895
BAT1761981 894
BAT4722912 887
BAT2003848 842
BAT1467036 802
BAT5849876 768
BAT3865626 728
BAT5811547 711
BAT1135695 296
Name: Batch Enrolled, dtype: int64
====================================================================================================
====================================================================================================
Grade
C 19085
B 18742
A 12055
D 8259
E 6446
F 2246
G 630
Name: Grade, dtype: int64
====================================================================================================
====================================================================================================
Sub Grade
B4 4462
C1 4188
B3 3999
A5 3540
B2 3520
B5 3408
D1 3304
C4 3250
C2 3219
C3 3121
B1 2924
C5 2472
A4 2264
D4 2050
D2 1963
D5 1952
A2 1837
D3 1824
E2 1746
A3 1685
A1 1364
E3 1321
E1 1298
E4 1117
F2 947
F1 824
E5 769
F5 582
F3 578
G2 447
F4 441
G1 366
G5 284
G3 246
G4 151
Name: Sub Grade, dtype: int64
====================================================================================================
====================================================================================================
Home Ownership
MORTGAGE 36351
RENT 24150
OWN 6962
Name: Home Ownership, dtype: int64
====================================================================================================
====================================================================================================
Verification Status
Source Verified 33036
Verified 18078
Not Verified 16349
Name: Verification Status, dtype: int64
====================================================================================================
====================================================================================================
Payment Plan
n 67463
Name: Payment Plan, dtype: int64
====================================================================================================
====================================================================================================
Loan Title
Credit card refinancing 30728
Debt consolidation 24841
Debt Consolidation 3544
Other 2455
Home improvement 2211
...
Home loan 6
Personal loan 5
Getting Ahead 5
bills 4
Credit 4
Name: Loan Title, Length: 109, dtype: int64
====================================================================================================
====================================================================================================
Initial List Status
w 36299
f 31164
Name: Initial List Status, dtype: int64
====================================================================================================
====================================================================================================
Application Type
INDIVIDUAL 67340
JOINT 123
Name: Application Type, dtype: int64
====================================================================================================
# Grid of histograms, one per non-object column, coloured by loan status.
numeric_cols = train.select_dtypes(exclude="object").columns.values

# Two panels per row; the row count is derived from the number of columns
# (ceil division) so the grid still fits if the frame gains columns, instead
# of relying on a hard-coded 13 x 2 layout.
n_rows = -(-len(numeric_cols) // 2)

# plt.figure rather than plt.subplots: the latter would add a full-size Axes
# underneath the panels created by plt.subplot and bind `fig` to a tuple.
fig = plt.figure(figsize=(20, 20))

for i, col in enumerate(numeric_cols):
    plt.subplot(n_rows, 2, i + 1)
    _ = sns.histplot(data=train, x=col, hue="Loan Status")
    _ = plt.title(col + ' Distribution', fontsize=15)
    _ = plt.xlabel("")
    _ = plt.xticks(fontsize=8)

# Lay the panels out once, after all of them exist.
_ = plt.tight_layout()
plt.show()
# Print the summary statistics of every non-object column, each section
# framed by a 100-character rule above and below.
rule = "==" * 50
for col in train.select_dtypes(exclude="object").columns.values:
    print(rule, col, train[col].describe(), rule, sep="\n")
====================================================================================================
ID
count 6.746300e+04
mean 2.562761e+07
std 2.109155e+07
min 1.297933e+06
25% 6.570288e+06
50% 1.791565e+07
75% 4.271521e+07
max 7.224578e+07
Name: ID, dtype: float64
====================================================================================================
====================================================================================================
Loan Amount
count 67463.000000
mean 16848.902776
std 8367.865726
min 1014.000000
25% 10012.000000
50% 16073.000000
75% 22106.000000
max 35000.000000
Name: Loan Amount, dtype: float64
====================================================================================================
====================================================================================================
Funded Amount
count 67463.000000
mean 15770.599114
std 8150.992662
min 1014.000000
25% 9266.500000
50% 13042.000000
75% 21793.000000
max 34999.000000
Name: Funded Amount, dtype: float64
====================================================================================================
====================================================================================================
Funded Amount Investor
count 67463.000000
mean 14621.799323
std 6785.345170
min 1114.590204
25% 9831.684984
50% 12793.682170
75% 17807.594120
max 34999.746430
Name: Funded Amount Investor, dtype: float64
====================================================================================================
====================================================================================================
Term
count 67463.000000
mean 58.173814
std 3.327441
min 36.000000
25% 58.000000
50% 59.000000
75% 59.000000
max 59.000000
Name: Term, dtype: float64
====================================================================================================
====================================================================================================
Interest Rate
count 67463.000000
mean 11.846258
std 3.718629
min 5.320006
25% 9.297147
50% 11.377696
75% 14.193533
max 27.182348
Name: Interest Rate, dtype: float64
====================================================================================================
====================================================================================================
Employment Duration
count 67463.000000
mean 80541.502522
std 45029.120366
min 14573.537170
25% 51689.843335
50% 69335.832680
75% 94623.322785
max 406561.536400
Name: Employment Duration, dtype: float64
====================================================================================================
====================================================================================================
Debit to Income
count 67463.000000
mean 23.299241
std 8.451824
min 0.675299
25% 16.756416
50% 22.656658
75% 30.048400
max 39.629862
Name: Debit to Income, dtype: float64
====================================================================================================
====================================================================================================
Delinquency - two years
count 67463.000000
mean 0.327127
std 0.800888
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 8.000000
Name: Delinquency - two years, dtype: float64
====================================================================================================
====================================================================================================
Inquires - six months
count 67463.000000
mean 0.145754
std 0.473291
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 5.000000
Name: Inquires - six months, dtype: float64
====================================================================================================
====================================================================================================
Open Account
count 67463.000000
mean 14.266561
std 6.225060
min 2.000000
25% 10.000000
50% 13.000000
75% 16.000000
max 37.000000
Name: Open Account, dtype: float64
====================================================================================================
====================================================================================================
Public Record
count 67463.000000
mean 0.081437
std 0.346606
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 4.000000
Name: Public Record, dtype: float64
====================================================================================================
====================================================================================================
Revolving Balance
count 67463.000000
mean 7699.342425
std 7836.148190
min 0.000000
25% 2557.000000
50% 5516.000000
75% 10184.500000
max 116933.000000
Name: Revolving Balance, dtype: float64
====================================================================================================
====================================================================================================
Revolving Utilities
count 67463.000000
mean 52.889443
std 22.539450
min 0.005172
25% 38.658825
50% 54.082334
75% 69.177117
max 100.880050
Name: Revolving Utilities, dtype: float64
====================================================================================================
====================================================================================================
Total Accounts
count 67463.000000
mean 18.627929
std 8.319246
min 4.000000
25% 13.000000
50% 18.000000
75% 23.000000
max 72.000000
Name: Total Accounts, dtype: float64
====================================================================================================
====================================================================================================
Total Received Interest
count 67463.000000
mean 2068.992542
std 2221.918745
min 4.736746
25% 570.903814
50% 1330.842771
75% 2656.956837
max 14301.368310
Name: Total Received Interest, dtype: float64
====================================================================================================
====================================================================================================
Total Received Late Fee
count 67463.000000
mean 1.143969
std 5.244365
min 0.000003
25% 0.021114
50% 0.043398
75% 0.071884
max 42.618882
Name: Total Received Late Fee, dtype: float64
====================================================================================================
====================================================================================================
Recoveries
count 67463.000000
mean 59.691578
std 357.026346
min 0.000036
25% 1.629818
50% 3.344524
75% 5.453727
max 4354.467419
Name: Recoveries, dtype: float64
====================================================================================================
====================================================================================================
Collection Recovery Fee
count 67463.000000
mean 1.125141
std 3.489885
min 0.000036
25% 0.476259
50% 0.780141
75% 1.070566
max 166.833000
Name: Collection Recovery Fee, dtype: float64
====================================================================================================
====================================================================================================
Collection 12 months Medical
count 67463.000000
mean 0.021301
std 0.144385
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 1.000000
Name: Collection 12 months Medical, dtype: float64
====================================================================================================
====================================================================================================
Last week Pay
count 67463.000000
mean 71.163260
std 43.315845
min 0.000000
25% 35.000000
50% 68.000000
75% 105.000000
max 161.000000
Name: Last week Pay, dtype: float64
====================================================================================================
====================================================================================================
Accounts Delinquent
count 67463.0
mean 0.0
std 0.0
min 0.0
25% 0.0
50% 0.0
75% 0.0
max 0.0
Name: Accounts Delinquent, dtype: float64
====================================================================================================
====================================================================================================
Total Collection Amount
count 67463.000000
mean 146.467990
std 744.382233
min 1.000000
25% 24.000000
50% 36.000000
75% 46.000000
max 16421.000000
Name: Total Collection Amount, dtype: float64
====================================================================================================
====================================================================================================
Total Current Balance
count 6.746300e+04
mean 1.595739e+05
std 1.390332e+05
min 6.170000e+02
25% 5.037900e+04
50% 1.183690e+05
75% 2.283750e+05
max 1.177412e+06
Name: Total Current Balance, dtype: float64
====================================================================================================
====================================================================================================
Total Revolving Credit Limit
count 67463.000000
mean 23123.005544
std 20916.699999
min 1000.000000
25% 8155.500000
50% 16733.000000
75% 32146.500000
max 201169.000000
Name: Total Revolving Credit Limit, dtype: float64
====================================================================================================
====================================================================================================
Loan Status
count 67463.000000
mean 0.092510
std 0.289747
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 1.000000
Name: Loan Status, dtype: float64
====================================================================================================
# Value distribution of 'Accounts Delinquent' (its describe() output above
# reports min, max and std all equal to 0).
train['Accounts Delinquent'].value_counts()
def pre_process(df):
    """Return a new frame without the unused columns and with derived features.

    The input frame is not modified; ``drop`` returns a new DataFrame and all
    new columns are added to that copy.

    Added columns
    -------------
    Interest_per_mon  : ``Loan Amount * Interest Rate / 100 / Term``
    total_intr_amt    : ``Interest_per_mon * Term``
    Fund_amnt_grt     : 1 if ``Funded Amount Investor > Funded Amount`` else 0
    total_revolve     : ``Revolving Balance + Revolving Utilities``
    total_received    : ``Total Received Interest + Total Collection Amount``
    total_recovery    : ``Recoveries + Collection Recovery Fee``
    revolve_amnt_grt  : 1 if ``total_revolve < Total Revolving Credit Limit``
                        else 0 (note: despite the name, a "less than" test)
    loan_grt_balance  : 1 if ``Loan Amount > Total Current Balance`` else 0
    reprs_cust_count  : number of rows sharing the row's ``Batch Enrolled``

    Parameters
    ----------
    df : DataFrame containing the columns referenced above.

    Returns
    -------
    DataFrame with 'Loan Title' and 'Payment Plan' removed (when present)
    and the derived columns appended.
    """
    # errors='ignore' makes the function safe to re-apply to a frame that has
    # already been processed (e.g. when a notebook cell is executed twice);
    # without it the second call raises KeyError.
    df = df.drop(columns=['Loan Title', 'Payment Plan'], errors='ignore')

    # Interest per month and over the whole term.
    df['Interest_per_mon'] = ((df['Loan Amount'] * df['Interest Rate']) / 100) / df['Term']
    df['total_intr_amt'] = df['Interest_per_mon'] * df['Term']

    # Flag: investor-funded amount exceeds the funded amount.
    df['Fund_amnt_grt'] = (df['Funded Amount Investor'] > df['Funded Amount']).astype('int')

    # Simple sums of related columns.
    df['total_revolve'] = df['Revolving Balance'] + df['Revolving Utilities']
    df['total_received'] = df['Total Received Interest'] + df['Total Collection Amount']
    df['total_recovery'] = df['Recoveries'] + df['Collection Recovery Fee']

    # Flag: total revolve amount is below the total revolving credit limit.
    df['revolve_amnt_grt'] = (df['total_revolve'] < df['Total Revolving Credit Limit']).astype('int')

    # Flag: loan amount exceeds the total current balance.
    df['loan_grt_balance'] = (df['Loan Amount'] > df['Total Current Balance']).astype('int')

    # Size of the batch each row belongs to.
    df['reprs_cust_count'] = df.groupby(['Batch Enrolled'])['Batch Enrolled'].transform('count')
    return df
# Row-level feature engineering on the training frame.
train = pre_process(train)

# Categorical keys that together define a group of loans.
grpcol = ['Batch Enrolled', 'Grade', 'Sub Grade', 'Home Ownership',
          'Verification Status', 'Initial List Status', 'Application Type']

# Numeric columns that are summarised within each group.
numcols = ['Loan Amount', 'Funded Amount', 'Funded Amount Investor',
           'Interest Rate', 'Employment Duration', 'Debit to Income',
           'Open Account', 'Revolving Balance', 'Revolving Utilities',
           'Total Accounts', 'Total Received Interest', 'Total Received Late Fee',
           'Recoveries', 'Collection Recovery Fee', 'Total Collection Amount',
           'Total Current Balance', 'Total Revolving Credit Limit',
           'Interest_per_mon', 'total_intr_amt',
           'total_revolve', 'total_received', 'total_recovery']

# A list, not a set: a set of strings has no stable iteration order, which
# made the generated columns come out as min/median/max in one run and
# max/median/min in another.
agg_stats = ['min', 'median', 'max']

# For every numeric column, attach its per-group min / median / max to each
# row as '<column>_<stat>'.
for col in numcols:
    df1 = train.groupby(grpcol)[col].agg(agg_stats).reset_index()
    df1 = df1.rename(columns={stat: f"{col}_{stat}" for stat in agg_stats})
    train = pd.merge(train, df1, on=grpcol, how='left')

# List every resulting column with its positional index.
for c, i in enumerate(train.columns.values):
    print(f"{c}_{i}")
0_ID
1_Loan Amount
2_Funded Amount
3_Funded Amount Investor
4_Term
5_Batch Enrolled
6_Interest Rate
7_Grade
8_Sub Grade
9_Home Ownership
10_Employment Duration
11_Verification Status
12_Debit to Income
13_Delinquency - two years
14_Inquires - six months
15_Open Account
16_Public Record
17_Revolving Balance
18_Revolving Utilities
19_Total Accounts
20_Initial List Status
21_Total Received Interest
22_Total Received Late Fee
23_Recoveries
24_Collection Recovery Fee
25_Collection 12 months Medical
26_Application Type
27_Last week Pay
28_Accounts Delinquent
29_Total Collection Amount
30_Total Current Balance
31_Total Revolving Credit Limit
32_Loan Status
33_Interest_per_mon
34_total_intr_amt
35_Fund_amnt_grt
36_total_revolve
37_total_received
38_total_recovery
39_revolve_amnt_grt
40_loan_grt_balance
41_reprs_cust_count
42_Loan Amount_max
43_Loan Amount_median
44_Loan Amount_min
45_Funded Amount_max
46_Funded Amount_median
47_Funded Amount_min
48_Funded Amount Investor_max
49_Funded Amount Investor_median
50_Funded Amount Investor_min
51_Interest Rate_max
52_Interest Rate_median
53_Interest Rate_min
54_Employment Duration_max
55_Employment Duration_median
56_Employment Duration_min
57_Debit to Income_max
58_Debit to Income_median
59_Debit to Income_min
60_Open Account_max
61_Open Account_median
62_Open Account_min
63_Revolving Balance_max
64_Revolving Balance_median
65_Revolving Balance_min
66_Revolving Utilities_max
67_Revolving Utilities_median
68_Revolving Utilities_min
69_Total Accounts_max
70_Total Accounts_median
71_Total Accounts_min
72_Total Received Interest_max
73_Total Received Interest_median
74_Total Received Interest_min
75_Total Received Late Fee_max
76_Total Received Late Fee_median
77_Total Received Late Fee_min
78_Recoveries_max
79_Recoveries_median
80_Recoveries_min
81_Collection Recovery Fee_max
82_Collection Recovery Fee_median
83_Collection Recovery Fee_min
84_Total Collection Amount_max
85_Total Collection Amount_median
86_Total Collection Amount_min
87_Total Current Balance_max
88_Total Current Balance_median
89_Total Current Balance_min
90_Total Revolving Credit Limit_max
91_Total Revolving Credit Limit_median
92_Total Revolving Credit Limit_min
93_Interest_per_mon_max
94_Interest_per_mon_median
95_Interest_per_mon_min
96_total_intr_amt_max
97_total_intr_amt_median
98_total_intr_amt_min
99_total_revolve_max
100_total_revolve_median
101_total_revolve_min
102_total_received_max
103_total_received_median
104_total_received_min
105_total_recovery_max
106_total_recovery_median
107_total_recovery_min
# Row-level feature engineering on the test frame.
test = pre_process(test)

# A list, not a set: a set of strings has no stable iteration order, so the
# generated columns could come out in a different order on every run (and in
# a different order than the columns built for the training frame).
agg_stats = ['min', 'median', 'max']

# For every numeric column, attach its per-group min / median / max (computed
# on the test frame itself) to each row as '<column>_<stat>'.
for col in numcols:
    df1 = test.groupby(grpcol)[col].agg(agg_stats).reset_index()
    df1 = df1.rename(columns={stat: f"{col}_{stat}" for stat in agg_stats})
    test = pd.merge(test, df1, on=grpcol, how='left')

# List every resulting column with its positional index.
for c, i in enumerate(test.columns.values):
    print(f"{c}_{i}")
0_ID
1_Loan Amount
2_Funded Amount
3_Funded Amount Investor
4_Term
5_Batch Enrolled
6_Interest Rate
7_Grade
8_Sub Grade
9_Home Ownership
10_Employment Duration
11_Verification Status
12_Debit to Income
13_Delinquency - two years
14_Inquires - six months
15_Open Account
16_Public Record
17_Revolving Balance
18_Revolving Utilities
19_Total Accounts
20_Initial List Status
21_Total Received Interest
22_Total Received Late Fee
23_Recoveries
24_Collection Recovery Fee
25_Collection 12 months Medical
26_Application Type
27_Last week Pay
28_Accounts Delinquent
29_Total Collection Amount
30_Total Current Balance
31_Total Revolving Credit Limit
32_Loan Status
33_Interest_per_mon
34_total_intr_amt
35_Fund_amnt_grt
36_total_revolve
37_total_received
38_total_recovery
39_revolve_amnt_grt
40_loan_grt_balance
41_reprs_cust_count
42_Loan Amount_max
43_Loan Amount_median
44_Loan Amount_min
45_Funded Amount_max
46_Funded Amount_median
47_Funded Amount_min
48_Funded Amount Investor_max
49_Funded Amount Investor_median
50_Funded Amount Investor_min
51_Interest Rate_max
52_Interest Rate_median
53_Interest Rate_min
54_Employment Duration_max
55_Employment Duration_median
56_Employment Duration_min
57_Debit to Income_max
58_Debit to Income_median
59_Debit to Income_min
60_Open Account_max
61_Open Account_median
62_Open Account_min
63_Revolving Balance_max
64_Revolving Balance_median
65_Revolving Balance_min
66_Revolving Utilities_max
67_Revolving Utilities_median
68_Revolving Utilities_min
69_Total Accounts_max
70_Total Accounts_median
71_Total Accounts_min
72_Total Received Interest_max
73_Total Received Interest_median
74_Total Received Interest_min
75_Total Received Late Fee_max
76_Total Received Late Fee_median
77_Total Received Late Fee_min
78_Recoveries_max
79_Recoveries_median
80_Recoveries_min
81_Collection Recovery Fee_max
82_Collection Recovery Fee_median
83_Collection Recovery Fee_min
84_Total Collection Amount_max
85_Total Collection Amount_median
86_Total Collection Amount_min
87_Total Current Balance_max
88_Total Current Balance_median
89_Total Current Balance_min
90_Total Revolving Credit Limit_max
91_Total Revolving Credit Limit_median
92_Total Revolving Credit Limit_min
93_Interest_per_mon_max
94_Interest_per_mon_median
95_Interest_per_mon_min
96_total_intr_amt_max
97_total_intr_amt_median
98_total_intr_amt_min
99_total_revolve_max
100_total_revolve_median
101_total_revolve_min
102_total_received_max
103_total_received_median
104_total_received_min
105_total_recovery_max
106_total_recovery_median
107_total_recovery_min
!pip install optuna
!pip install catboost
from sklearn.model_selection import cross_val_score,KFold,train_test_split,ShuffleSplit,StratifiedKFold,learning_curve
from catboost import CatBoostClassifier,Pool,cv,monoforest
import optuna
from optuna.samplers import RandomSampler,TPESampler,MOTPESampler,CmaEsSampler
from sklearn.metrics import f1_score,classification_report,confusion_matrix,log_loss
from xgboost import XGBClassifier,plot_tree
import xgboost as xgb
from optuna.integration import XGBoostPruningCallback,LightGBMPruningCallback
from sklearn.preprocessing import LabelEncoder,StandardScaler,PolynomialFeatures
import lightgbm as lgbm
import math
# Show each training column next to its positional index ("<index>_<name>").
for position, column_name in enumerate(train.columns):
    print(f"{position}_{column_name}")
0_ID
1_Loan Amount
2_Funded Amount
3_Funded Amount Investor
4_Term
5_Batch Enrolled
6_Interest Rate
7_Grade
8_Sub Grade
9_Home Ownership
10_Employment Duration
11_Verification Status
12_Debit to Income
13_Delinquency - two years
14_Inquires - six months
15_Open Account
16_Public Record
17_Revolving Balance
18_Revolving Utilities
19_Total Accounts
20_Initial List Status
21_Total Received Interest
22_Total Received Late Fee
23_Recoveries
24_Collection Recovery Fee
25_Collection 12 months Medical
26_Application Type
27_Last week Pay
28_Accounts Delinquent
29_Total Collection Amount
30_Total Current Balance
31_Total Revolving Credit Limit
32_Loan Status
33_Interest_per_mon
34_total_intr_amt
35_Fund_amnt_grt
36_total_revolve
37_total_received
38_total_recovery
39_revolve_amnt_grt
40_loan_grt_balance
41_reprs_cust_count
42_Loan Amount_min
43_Loan Amount_median
44_Loan Amount_max
45_Funded Amount_min
46_Funded Amount_median
47_Funded Amount_max
48_Funded Amount Investor_min
49_Funded Amount Investor_median
50_Funded Amount Investor_max
51_Interest Rate_min
52_Interest Rate_median
53_Interest Rate_max
54_Employment Duration_min
55_Employment Duration_median
56_Employment Duration_max
57_Debit to Income_min
58_Debit to Income_median
59_Debit to Income_max
60_Open Account_min
61_Open Account_median
62_Open Account_max
63_Revolving Balance_min
64_Revolving Balance_median
65_Revolving Balance_max
66_Revolving Utilities_min
67_Revolving Utilities_median
68_Revolving Utilities_max
69_Total Accounts_min
70_Total Accounts_median
71_Total Accounts_max
72_Total Received Interest_min
73_Total Received Interest_median
74_Total Received Interest_max
75_Total Received Late Fee_min
76_Total Received Late Fee_median
77_Total Received Late Fee_max
78_Recoveries_min
79_Recoveries_median
80_Recoveries_max
81_Collection Recovery Fee_min
82_Collection Recovery Fee_median
83_Collection Recovery Fee_max
84_Total Collection Amount_min
85_Total Collection Amount_median
86_Total Collection Amount_max
87_Total Current Balance_min
88_Total Current Balance_median
89_Total Current Balance_max
90_Total Revolving Credit Limit_min
91_Total Revolving Credit Limit_median
92_Total Revolving Credit Limit_max
93_Interest_per_mon_min
94_Interest_per_mon_median
95_Interest_per_mon_max
96_total_intr_amt_min
97_total_intr_amt_median
98_total_intr_amt_max
99_total_revolve_min
100_total_revolve_median
101_total_revolve_max
102_total_received_min
103_total_received_median
104_total_received_max
105_total_recovery_min
106_total_recovery_median
107_total_recovery_max
# Object-dtype columns of the training frame.
train.select_dtypes(include='object').columns

# Feature matrix: every column except the row identifier and the target.
# Selecting by name picks the same columns, in the same order, as the former
# positional slice np.r_[1:32, 33:108], but does not silently go wrong when
# the number or position of the columns changes.
X = train.drop(columns=['ID', 'Loan Status'])
y = train['Loan Status']

train.select_dtypes(include='object').columns
def objective(trial):
    """Optuna objective: mean cross-validated log loss of a CatBoost model.

    One hyperparameter configuration is sampled per trial and then fitted on
    every fold of a stratified 5-fold split of the module-level ``X`` / ``y``.
    The score of the trial is the log loss averaged over all validation
    folds, so a single lucky (or unlucky) split cannot decide the search.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        float: Mean validation log loss over the 5 folds (lower is better).
    """
    cat_features = ['Batch Enrolled', 'Grade', 'Sub Grade',
                    'Home Ownership', 'Verification Status',
                    'Initial List Status', 'Application Type']

    # Sample the configuration once; every fold is trained with the same one.
    param = {
        'reg_lambda': trial.suggest_categorical(
            'reg_lambda', [0.1, 1.0, 5.0, 10.0, 50.0, 100.0, 150.0, 200.0, 250.0]),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0),
        'n_estimators': trial.suggest_categorical(
            'n_estimators', [200, 400, 600, 800, 1000]),
        'max_depth': trial.suggest_int('max_depth', 2, 12),
        'random_state': trial.suggest_categorical('random_state', [1024, 1048, 2020]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
    }
    # These two parameters are only valid for their matching bootstrap type.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    skf = StratifiedKFold(n_splits=5, random_state=2000, shuffle=True)
    fold_losses = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

        cat_clf = CatBoostClassifier(early_stopping_rounds=30,
                                     eval_metric="Logloss",
                                     logging_level="Silent",
                                     **param)
        # early_stopping_rounds needs validation data to monitor; without an
        # eval_set it is silently ignored and all n_estimators are trained.
        cat_clf.fit(X_train, y_train,
                    cat_features=cat_features,
                    eval_set=(X_valid, y_valid))

        preds = cat_clf.predict_proba(X_valid)
        fold_losses.append(log_loss(y_valid, preds))

    # Average over every fold instead of reporting a single split.
    return sum(fold_losses) / len(fold_losses)
if __name__ == "__main__":
    # Minimise the value returned by `objective` over 100 trials.
    study = optuna.create_study(
        sampler=MOTPESampler(),
        pruner=optuna.pruners.SuccessiveHalvingPruner(),
        direction='minimize',
    )
    study.optimize(objective, n_trials=100)

    # Summarise the search: how many trials ran and the best parameters found.
    finished_trials = len(study.trials)
    print('Number of finished trials:', finished_trials)
    print('Best trial:', study.best_trial.params)
[I 2021-12-28 13:19:48,727] A new study created in memory with name: no-name-dff88beb-72a5-48ec-a075-940e721f6802
Custom logger is already specified. Specify more than one logger at same time is not thread safe.[I 2021-12-28 13:21:38,488] Trial 0 finished with value: 0.4100010985018204 and parameters: {'reg_lambda': 200.0, 'learning_rate': 0.9552004688504021, 'n_estimators': 800, 'max_depth': 8, 'random_state': 2020, 'colsample_bylevel': 0.08140795734520816, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.4734171462679141}. Best is trial 0 with value: 0.4100010985018204.
[I 2021-12-28 13:22:04,720] Trial 1 finished with value: 0.31667966817519916 and parameters: {'reg_lambda': 50.0, 'learning_rate': 0.22908297508327882, 'n_estimators': 1000, 'max_depth': 8, 'random_state': 1024, 'colsample_bylevel': 0.010789467699509715, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.31667966817519916.
[I 2021-12-28 13:23:29,795] Trial 2 finished with value: 0.6644377310777809 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.4907128774180346, 'n_estimators': 600, 'max_depth': 11, 'random_state': 1024, 'colsample_bylevel': 0.08409755247942045, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.0476285042720348}. Best is trial 1 with value: 0.31667966817519916.
[I 2021-12-28 13:26:51,294] Trial 3 finished with value: 0.6441318953162362 and parameters: {'reg_lambda': 5.0, 'learning_rate': 0.6072281987571729, 'n_estimators': 800, 'max_depth': 12, 'random_state': 2020, 'colsample_bylevel': 0.09828546260935145, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.31667966817519916.
[I 2021-12-28 13:27:37,484] Trial 4 finished with value: 0.31595243376249565 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.2054762763652627, 'n_estimators': 600, 'max_depth': 7, 'random_state': 1024, 'colsample_bylevel': 0.05518975818331533, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.1710960305616296}. Best is trial 4 with value: 0.31595243376249565.
[I 2021-12-28 13:27:55,039] Trial 5 finished with value: 0.30865882778312453 and parameters: {'reg_lambda': 5.0, 'learning_rate': 0.4824021905021146, 'n_estimators': 400, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.05016541377598957, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.642969598452986}. Best is trial 5 with value: 0.30865882778312453.
[I 2021-12-28 13:29:56,729] Trial 6 finished with value: 0.5960156764001286 and parameters: {'reg_lambda': 50.0, 'learning_rate': 0.9973247845857358, 'n_estimators': 1000, 'max_depth': 11, 'random_state': 1048, 'colsample_bylevel': 0.04633694165559028, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 5 with value: 0.30865882778312453.
[I 2021-12-28 13:31:45,797] Trial 7 finished with value: 0.6494287241532615 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.889824572532522, 'n_estimators': 800, 'max_depth': 12, 'random_state': 2020, 'colsample_bylevel': 0.030429052412292913, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 5 with value: 0.30865882778312453.
[I 2021-12-28 13:32:21,756] Trial 8 finished with value: 0.31039193616078997 and parameters: {'reg_lambda': 0.1, 'learning_rate': 0.07120933762096478, 'n_estimators': 400, 'max_depth': 6, 'random_state': 1024, 'colsample_bylevel': 0.09371015572041257, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.1615454799997345}. Best is trial 5 with value: 0.30865882778312453.
[I 2021-12-28 13:32:54,548] Trial 9 finished with value: 0.34790236365789184 and parameters: {'reg_lambda': 50.0, 'learning_rate': 0.537035648242743, 'n_estimators': 1000, 'max_depth': 4, 'random_state': 1048, 'colsample_bylevel': 0.09305993218346688, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.5078539013455677}. Best is trial 5 with value: 0.30865882778312453.
[I 2021-12-28 13:33:03,926] Trial 10 finished with value: 0.30956134657010265 and parameters: {'reg_lambda': 5.0, 'learning_rate': 0.725132928411979, 'n_estimators': 200, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.0602365598835579, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 9.689488389844328}. Best is trial 5 with value: 0.30865882778312453.
[I 2021-12-28 13:33:22,795] Trial 11 finished with value: 0.30830904610404347 and parameters: {'reg_lambda': 150.0, 'learning_rate': 0.3663598185117414, 'n_estimators': 400, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.03268824602344013, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.09061257217755}. Best is trial 11 with value: 0.30830904610404347.
[I 2021-12-28 13:33:41,171] Trial 12 finished with value: 0.3107439287639494 and parameters: {'reg_lambda': 150.0, 'learning_rate': 0.36630079410257294, 'n_estimators': 400, 'max_depth': 4, 'random_state': 1048, 'colsample_bylevel': 0.013147059693571057, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.49413530659445026}. Best is trial 11 with value: 0.30830904610404347.
[I 2021-12-28 13:33:52,699] Trial 13 finished with value: 0.3126120126448186 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.7674274927514826, 'n_estimators': 200, 'max_depth': 4, 'random_state': 2020, 'colsample_bylevel': 0.030794726248672875, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 9.132662204898661}. Best is trial 11 with value: 0.30830904610404347.
[I 2021-12-28 13:34:17,258] Trial 14 finished with value: 0.3100713944783724 and parameters: {'reg_lambda': 150.0, 'learning_rate': 0.32293624629552825, 'n_estimators': 400, 'max_depth': 5, 'random_state': 2020, 'colsample_bylevel': 0.03182953143065234, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.108678877520031}. Best is trial 11 with value: 0.30830904610404347.
[I 2021-12-28 13:35:26,947] Trial 15 finished with value: 0.3106249861112574 and parameters: {'reg_lambda': 150.0, 'learning_rate': 0.09053893066523971, 'n_estimators': 400, 'max_depth': 9, 'random_state': 1048, 'colsample_bylevel': 0.06739905226535768, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.8814953897509814}. Best is trial 11 with value: 0.30830904610404347.
[I 2021-12-28 13:35:51,773] Trial 16 finished with value: 0.3080115307011757 and parameters: {'reg_lambda': 0.1, 'learning_rate': 0.010232376347675265, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.038777259160665156, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 7.036264331268556}. Best is trial 16 with value: 0.3080115307011757.
[I 2021-12-28 13:36:20,919] Trial 17 finished with value: 0.32092658148024794 and parameters: {'reg_lambda': 0.1, 'learning_rate': 0.0024824579284581294, 'n_estimators': 600, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.042173756850336155, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.9950295776067846}. Best is trial 16 with value: 0.3080115307011757.
[I 2021-12-28 13:36:47,646] Trial 18 finished with value: 0.32592028239956705 and parameters: {'reg_lambda': 0.1, 'learning_rate': 0.18049555099381945, 'n_estimators': 600, 'max_depth': 6, 'random_state': 1048, 'colsample_bylevel': 0.07047022661817581, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 7.352700877033383}. Best is trial 16 with value: 0.3080115307011757.
[I 2021-12-28 13:37:32,600] Trial 19 finished with value: 0.35123040457979327 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.6940345290470873, 'n_estimators': 600, 'max_depth': 9, 'random_state': 1024, 'colsample_bylevel': 0.020600506409986005, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 16 with value: 0.3080115307011757.
[I 2021-12-28 13:37:41,683] Trial 20 finished with value: 0.30796640704515776 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.33882214314664055, 'n_estimators': 200, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.03578120544200743, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.701392567958283}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:37:52,417] Trial 21 finished with value: 0.30859615225391535 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.2894760008256461, 'n_estimators': 200, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.04003421499523795, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 7.42220300403602}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:38:02,152] Trial 22 finished with value: 0.3081579375865388 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.11051749013026863, 'n_estimators': 200, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.0223841969671017, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 7.2594057202635485}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:38:15,745] Trial 23 finished with value: 0.30988585026373144 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.4174945707267844, 'n_estimators': 200, 'max_depth': 5, 'random_state': 2020, 'colsample_bylevel': 0.03907008454470365, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.39495804316205}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:38:31,020] Trial 24 finished with value: 0.38325819347463413 and parameters: {'reg_lambda': 200.0, 'learning_rate': 0.003470484942332831, 'n_estimators': 200, 'max_depth': 5, 'random_state': 2020, 'colsample_bylevel': 0.059667912229920655, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.539183040049113}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:38:55,868] Trial 25 finished with value: 0.3081018926133609 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.23597855088836595, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.02294271783509348, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.6179427758701808}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:39:25,688] Trial 26 finished with value: 0.3176193701983347 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.5938364157517737, 'n_estimators': 600, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.049693921237819634, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 8.584491854195686}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:39:41,718] Trial 27 finished with value: 0.3122706767917134 and parameters: {'reg_lambda': 0.1, 'learning_rate': 0.13868036685126545, 'n_estimators': 200, 'max_depth': 6, 'random_state': 1048, 'colsample_bylevel': 0.07355366433666687, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:40:00,374] Trial 28 finished with value: 0.3194869900184245 and parameters: {'reg_lambda': 0.1, 'learning_rate': 0.4198047063559755, 'n_estimators': 800, 'max_depth': 4, 'random_state': 1024, 'colsample_bylevel': 0.017374518892885864, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.10607444119211629}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:41:32,840] Trial 29 finished with value: 0.3276336844470651 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.2643572687983563, 'n_estimators': 1000, 'max_depth': 7, 'random_state': 2020, 'colsample_bylevel': 0.06325729097795736, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 8.233886782860344}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:41:57,986] Trial 30 finished with value: 0.3080162712982084 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.1660773845006956, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.024730562490008774, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.193901465164717}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:42:23,498] Trial 31 finished with value: 0.308146343448234 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.16260971408286365, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.025550538373078326, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.430379968035934}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:42:53,744] Trial 32 finished with value: 0.30801458890943334 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.05584768927673188, 'n_estimators': 600, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.035785952677856246, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.950426627500387}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:43:24,225] Trial 33 finished with value: 0.30785029071462094 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.0563049857054157, 'n_estimators': 600, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.0375976522119426, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.161919118032015}. Best is trial 33 with value: 0.30785029071462094.
[I 2021-12-28 13:44:00,431] Trial 34 finished with value: 0.3081083066109118 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.059167911630636436, 'n_estimators': 600, 'max_depth': 4, 'random_state': 2020, 'colsample_bylevel': 0.04550445722001303, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.681095947227308}. Best is trial 33 with value: 0.30785029071462094.
[I 2021-12-28 13:44:11,047] Trial 35 finished with value: 0.30859831252859493 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.30000076290540223, 'n_estimators': 200, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.05398003566205212, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 8.098845389145255}. Best is trial 33 with value: 0.30785029071462094.
[I 2021-12-28 13:44:49,104] Trial 36 finished with value: 0.3140262564431863 and parameters: {'reg_lambda': 200.0, 'learning_rate': 0.003106924185174607, 'n_estimators': 600, 'max_depth': 5, 'random_state': 2020, 'colsample_bylevel': 0.03666400120344703, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.415244577696475}. Best is trial 33 with value: 0.30785029071462094.
[I 2021-12-28 13:45:17,370] Trial 37 finished with value: 0.30797088016327895 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.11731392256095559, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.04353896742282525, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.263974631990708}. Best is trial 33 with value: 0.30785029071462094.
[I 2021-12-28 13:45:28,402] Trial 38 finished with value: 0.30993022867406184 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.3399827866991869, 'n_estimators': 200, 'max_depth': 3, 'random_state': 1048, 'colsample_bylevel': 0.05230017706100512, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 33 with value: 0.30785029071462094.
[I 2021-12-28 13:45:52,526] Trial 39 finished with value: 0.3299671446383035 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.414506926208746, 'n_estimators': 800, 'max_depth': 4, 'random_state': 1024, 'colsample_bylevel': 0.044573350184222295, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.7370442965110242}. Best is trial 33 with value: 0.30785029071462094.
[I 2021-12-28 13:46:19,377] Trial 40 finished with value: 0.30788983164494393 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.12245332874878374, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.028562415478461462, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.331776254379165}. Best is trial 33 with value: 0.30785029071462094.
[I 2021-12-28 13:46:46,565] Trial 41 finished with value: 0.3075584206617818 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.12106647986177312, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.02771062925991618, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.976679263417514}. Best is trial 41 with value: 0.3075584206617818.
[I 2021-12-28 13:47:16,717] Trial 42 finished with value: 0.3084138912880576 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.255888808198655, 'n_estimators': 600, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.02734596006292813, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.825313838703929}. Best is trial 41 with value: 0.3075584206617818.
[I 2021-12-28 13:47:43,811] Trial 43 finished with value: 0.3075713833552239 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.17924460055942992, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.028441377492810418, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.720148265189939}. Best is trial 41 with value: 0.3075584206617818.
[I 2021-12-28 13:48:12,035] Trial 44 finished with value: 0.30807460755630495 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.19689379712753324, 'n_estimators': 600, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.017857872470314368, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.3561124462289205}. Best is trial 41 with value: 0.3075584206617818.
[I 2021-12-28 13:48:39,899] Trial 45 finished with value: 0.30754235530557406 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.13329828734526583, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.02800155755287607, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.5691284250573703}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:49:09,341] Trial 46 finished with value: 0.3077844707729344 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.0564022730385032, 'n_estimators': 600, 'max_depth': 4, 'random_state': 2020, 'colsample_bylevel': 0.017748551752997985, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.7942847670631612}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:49:37,772] Trial 47 finished with value: 0.30809672491371987 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.15763541078569066, 'n_estimators': 600, 'max_depth': 4, 'random_state': 2020, 'colsample_bylevel': 0.015347491117266833, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.6870129233571016}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:50:04,277] Trial 48 finished with value: 0.3078857243602207 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.08427577019345282, 'n_estimators': 600, 'max_depth': 5, 'random_state': 2020, 'colsample_bylevel': 0.010561719695356263, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.485688181103633}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:50:30,621] Trial 49 finished with value: 0.3081719456439294 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.26730201879461335, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.018254933863627212, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.699428099362245}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:51:01,187] Trial 50 finished with value: 0.3089203366939519 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.13638703751004952, 'n_estimators': 600, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.026581972956907653, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.7688652937870724}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:51:30,080] Trial 51 finished with value: 0.30792937310734625 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.0331600885789784, 'n_estimators': 600, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.020880351691040272, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.391605857706495}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:51:58,211] Trial 52 finished with value: 0.30777842599045563 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.053210493573485895, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.03367054589376216, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.984676745513563}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:52:26,391] Trial 53 finished with value: 0.30823872528759644 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.2088903798621219, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.03372689451792627, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.825401857572525}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:52:52,135] Trial 54 finished with value: 0.3077095777189635 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.10408603684078382, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.014530165803284055, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.330932319528904}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:53:20,425] Trial 55 finished with value: 0.3079577870019453 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.10180245720178235, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.028268171100153776, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.1227693947272825}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:54:01,846] Trial 56 finished with value: 0.3076783408291407 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.1814278085018206, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.014694006339004018, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.9917929098283906}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:54:42,262] Trial 57 finished with value: 0.307434512450736 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.22202683443420487, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.011852431987746277, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.9611631489676657}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 13:55:21,945] Trial 58 finished with value: 0.3091207746627096 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.23317500474272052, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 1048, 'colsample_bylevel': 0.01236008795222979, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.3745540911095694}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 13:55:48,080] Trial 59 finished with value: 0.3124426495439441 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.2904939413843156, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 1024, 'colsample_bylevel': 0.02153328987913529, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 13:56:29,127] Trial 60 finished with value: 0.30769711737383637 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.18610383686010318, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.014339366658207845, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.054618212718263}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 13:57:10,425] Trial 61 finished with value: 0.3081170148894457 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.20140171771107332, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.014960953017796874, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.9766940180027124}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 13:57:57,728] Trial 62 finished with value: 0.3084481641283402 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.15540006118635152, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.0200433754067231, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.668744983224088}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 13:58:42,493] Trial 63 finished with value: 0.30843999968506436 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.18864976712161993, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.025139539802041675, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.6963300987855834}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 13:59:26,705] Trial 64 finished with value: 0.30844061149596924 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.23615649630269608, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.015538140996618684, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.2201867393445855}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:00:06,223] Trial 65 finished with value: 0.30780405326364346 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.14726107801328875, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.011232301211031123, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.755806739464729}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:00:56,022] Trial 66 finished with value: 0.3092458290947544 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.1786017629006514, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.023493548537692834, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.0394019206650715}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:01:35,105] Trial 67 finished with value: 0.3090707355174857 and parameters: {'reg_lambda': 5.0, 'learning_rate': 0.3053102172036845, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.019206541713959503, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.990327368951434}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:02:20,076] Trial 68 finished with value: 0.30903255877305735 and parameters: {'reg_lambda': 200.0, 'learning_rate': 0.2661219219232608, 'n_estimators': 1000, 'max_depth': 4, 'random_state': 2020, 'colsample_bylevel': 0.014125857515133428, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.365412988399983}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:02:56,426] Trial 69 finished with value: 0.3076469661168601 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.22473943400823987, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010310619631260995, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.4017374144677635}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:03:33,563] Trial 70 finished with value: 0.3082435815008343 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.21959297026801608, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.012855432267646405, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.3415286599657676}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:04:09,443] Trial 71 finished with value: 0.3089773736311915 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.37000381228594587, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.01021415699688447, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.0934511817285424}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:04:51,325] Trial 72 finished with value: 0.30820690560792424 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.12764559171362783, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.0168407674795086, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.521113922735104}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:05:30,906] Trial 73 finished with value: 0.30828739525010385 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.17818549171698533, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.02934914531093111, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.7959423619978017}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:06:19,551] Trial 74 finished with value: 0.3091530094140145 and parameters: {'reg_lambda': 50.0, 'learning_rate': 0.25557695164562216, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.023809062824247336, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.70691542412889}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:06:59,956] Trial 75 finished with value: 0.307696540265026 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.09458670882813933, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010038022476935509, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.228660796676878}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:07:41,141] Trial 76 finished with value: 0.30730791194462437 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.08580081062052972, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010267520707686379, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.0022013667340754}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:08:27,309] Trial 77 finished with value: 0.3079363368031429 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.13134837603355667, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.016585184647102177, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.6317388943038855}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:09:12,733] Trial 78 finished with value: 0.3077409284160152 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.035478889433778624, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.019953219863178356, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.2979298781856263}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:09:53,385] Trial 79 finished with value: 0.3079259743261942 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.08218598002213888, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.01285909756057752, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.9183882073133267}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:10:34,692] Trial 80 finished with value: 0.307543352753024 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.10455491902957995, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010626333321137174, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.098985812540241}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:11:16,800] Trial 81 finished with value: 0.30777009892834034 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.030388653796977597, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.012212144106790473, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.0020989995703067}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:12:01,576] Trial 82 finished with value: 0.3077826325050363 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.10926014489407798, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.018995959410143787, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.6499844212628298}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:12:44,950] Trial 83 finished with value: 0.3081240300178788 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.14532500808558851, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.01654267409193488, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.9821312743917145}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:13:28,037] Trial 84 finished with value: 0.30808062130046154 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.2179122376160405, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.013067456824596937, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.4311928044392253}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:14:08,602] Trial 85 finished with value: 0.30763200259961204 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.08158861789201716, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010086726000962694, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.5760825233967264}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:14:49,866] Trial 86 finished with value: 0.3076623277292799 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.03481042974934406, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010190168406570745, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.2550207181909907}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:15:41,127] Trial 87 finished with value: 0.3077001043379401 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.0861489293507142, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.021956150127820873, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.1692987193000635}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:16:16,491] Trial 88 finished with value: 0.3077201799043222 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.07277509159340519, 'n_estimators': 800, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.01689929791170685, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.5884748454348174}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:16:59,027] Trial 89 finished with value: 0.30813338376726573 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.12149840021129285, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 1024, 'colsample_bylevel': 0.012388716278478195, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.6722913923519971}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:17:43,989] Trial 90 finished with value: 0.30746054298748443 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.03838194807833625, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010913391940164316, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.1457965345339334}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:18:26,306] Trial 91 finished with value: 0.30762295798006867 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.04531265501307803, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010165036394957666, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.45346901853716903}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:19:10,785] Trial 92 finished with value: 0.3076288772679253 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.02712599747021193, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.015159468383245807, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.3252185014853817}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:19:55,416] Trial 93 finished with value: 0.3076194019283185 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.029889240608320912, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.015220439810751147, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.1681180901283668}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:20:40,639] Trial 94 finished with value: 0.3076980567937852 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.03578899852067052, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.019519762631036386, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.683420744179926}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:21:31,965] Trial 95 finished with value: 0.30869871730974413 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.0618333530334237, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.026582446917876267, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.09860467704874662}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:22:15,257] Trial 96 finished with value: 0.30768189610784086 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.10163182690307862, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.013326646958461103, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.9403215375021393}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:23:05,643] Trial 97 finished with value: 0.30805641315780585 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.04309094125787194, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.023423556965846234, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.421649418720508}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:23:32,348] Trial 98 finished with value: 0.30869032191344525 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.1428252564351233, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 1048, 'colsample_bylevel': 0.018539327351832797, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:24:16,653] Trial 99 finished with value: 0.3077183558437069 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.017401625577887297, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.015882097345279532, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.1982140103302499}. Best is trial 76 with value: 0.30730791194462437.
Number of finished trials: 100
Best trial: {'reg_lambda': 250.0, 'learning_rate': 0.08580081062052972, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010267520707686379, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.0022013667340754}
# Inspect the finished Optuna study `study` (its trial log is printed above):
# the best objective value and the hyper-parameters that produced it ...
study.best_value
study.best_params
# ... followed by Optuna's built-in diagnostic figures: objective value per
# trial, objective value against each individual parameter, and the estimated
# importance of each parameter.
optuna.visualization.plot_optimization_history(study)
optuna.visualization.plot_slice(study)
optuna.visualization.plot_param_importances(study)
# Final CatBoost fit on a stratified 80/20 hold-out split of (X, y).
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

# Hyper-parameters copied from the best Optuna trial logged above (trial 76).
param_cat = dict(
    bagging_temperature=2.0022013667340754,
    boosting_type='Ordered',
    bootstrap_type='Bayesian',
    colsample_bylevel=0.010267520707686379,
    learning_rate=0.08580081062052972,
    max_depth=2,
    n_estimators=1000,
    random_state=2020,
    reg_lambda=250.0,
)

cat_model = CatBoostClassifier(eval_metric="Logloss", **param_cat)

# The validation split drives early stopping (30 rounds without improvement);
# use_best_model=True keeps the model at its best validation iteration.
cat_model.fit(
    X_train,
    y_train,
    cat_features=[
        'Batch Enrolled', 'Grade', 'Sub Grade',
        'Home Ownership', 'Verification Status',
        'Initial List Status', 'Application Type',
    ],
    eval_set=(X_valid, y_valid),
    use_best_model=True,
    early_stopping_rounds=30,
    verbose=True,
)

# Learning curves: training vs. validation log-loss per boosting iteration.
pd.DataFrame(
    {
        'train_logloss': cat_model.evals_result_['learn']['Logloss'],
        'validation_logloss': cat_model.evals_result_['validation']['Logloss'],
    }
).plot()
!pip install shap
import shap
# SHAP explanation of the fitted CatBoost model over the full feature frame X.
shap.initjs()
explainer = shap.TreeExplainer(cat_model)

# CatBoost needs to be told which columns are categorical, so the data is
# wrapped in a Pool carrying the same categorical column list used for fitting.
shap_values = explainer.shap_values(
    Pool(
        data=X,
        label=y,
        cat_features=[
            'Batch Enrolled', 'Grade', 'Sub Grade',
            'Home Ownership', 'Verification Status',
            'Initial List Status', 'Application Type',
        ],
    )
)

# Bar summary first, then the default summary plot of the same values.
shap.summary_plot(shap_values, X, plot_type='bar', class_names=['a', 'b'])
shap.summary_plot(shap_values, X)

# Force plot for the first row of X only.
shap.force_plot(
    explainer.expected_value,
    shap_values[0],
    X.iloc[0],
    matplotlib=True,
)
# Integer-encode every object-typed column of X in place so that the models
# trained further down receive purely numeric features.
# NOTE: the single encoder is refit for every column, so after the loop `lbl`
# only remembers the classes of the last encoded column.
lbl = LabelEncoder()
for i in X.select_dtypes(include='object').columns.values:
    # LabelEncoder works on 1-D arrays: pass the column as a Series (X[i])
    # rather than a one-column DataFrame (X[[i]]), which scikit-learn only
    # accepts after emitting a DataConversionWarning.
    X[i] = lbl.fit_transform(X[i])
def objective(trial):
    """Optuna objective for XGBoost: hold-out log-loss of one sampled configuration.

    Reads the module-level feature frame ``X`` and target ``y``.

    Args:
        trial: the ``optuna`` trial used to sample hyper-parameters and to
            receive intermediate values from the pruning callback.

    Returns:
        Log-loss of the fitted classifier on the hold-out fold (lower is better).

    Note:
        Only ONE of the ten stratified folds is used as hold-out (a 90/10
        split, one fit per trial); this is not a full cross-validation.
    """
    skf = StratifiedKFold(n_splits=10, random_state=22, shuffle=True)
    # Exhaust the splitter and keep its final fold as the hold-out set.
    *_, (train_index, test_index) = skf.split(X, y)
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        # tree construction algorithm.
        "tree_method": trial.suggest_categorical("tree_method", ['exact', 'approx', 'hist']),
        "eta": trial.suggest_float("eta", 0.001, 1.0),
        # tree-based boosters only (gblinear is not part of the search).
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "n_estimators": trial.suggest_categorical("n_estimators", [200, 400, 600, 800, 1000]),
    }

    # Both candidate boosters are tree boosters, so the tree-shape parameters
    # are always sampled.
    # maximum depth of the tree, signifies complexity of the tree.
    param["max_depth"] = trial.suggest_int("max_depth", 2, 60)
    # minimum child weight, larger the term more conservative the tree.
    param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 60)
    # defines how selective algorithm is.
    param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
    param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    # Dropout-specific parameters only exist for the dart booster.
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # Reports the validation log-loss to Optuna after every boosting round so
    # the study's pruner can stop unpromising trials early.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation_0-logloss")

    # Early stopping is configured once, at fit time (the same way the final
    # XGBoost model is fitted in this file). Passing it to the constructor as
    # well is redundant and is rejected as a duplicate by some XGBoost releases.
    xgb_clf = XGBClassifier(**param)
    xgb_clf.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        callbacks=[pruning_callback],
    )

    preds = xgb_clf.predict_proba(X_valid)
    # The study minimises this value: it is a loss, not an accuracy.
    return log_loss(y_valid, preds)
if __name__ == "__main__":
    # Random search over the XGBoost search space; the successive-halving
    # pruner acts on the per-round values reported by the objective's callback.
    study1 = optuna.create_study(
        direction='minimize',
        sampler=RandomSampler(),
        pruner=optuna.pruners.SuccessiveHalvingPruner(),
    )
    study1.optimize(objective, n_trials=100)
    print('Number of finished trials:', len(study1.trials))
    print('Best trial:', study1.best_trial.params)

# Best hold-out log-loss, scaled by 100 for display, and its parameters.
100 * study1.best_value
study1.best_params

# Optuna diagnostics for the XGBoost study.
optuna.visualization.plot_optimization_history(study1)
optuna.visualization.plot_slice(study1)
optuna.visualization.plot_param_importances(study1)
# Fixed XGBoost hyper-parameters used for the final model fit.
params_xgb = {
    'alpha': 0.00013628501532199046,
    'booster': 'gbtree',
    'colsample_bytree': 0.4510459869291909,
    'eta': 0.23201860736451954,
    'gamma': 1.18221420230537e-05,
    'grow_policy': 'depthwise',
    'lambda': 7.036258743817963e-06,
    'max_depth': 6,
    'min_child_weight': 52,
    'n_estimators': 400,
    'subsample': 0.44069082236462886,
    'tree_method': 'exact',
}

# Replace spaces in the column names with underscores, in place.
# NOTE(review): presumably needed so plot_tree can handle the feature names -
# confirm.
X.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)

# Stratified 80/20 hold-out split, reused by the LightGBM model further down.
X_train1, X_valid1, y_train1, y_valid1 = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

# `verbosity` is the documented XGBoost logging parameter; the previous
# `verbose=1` keyword is not an XGBClassifier parameter and was merely
# forwarded as an unknown booster parameter.
xgb_model = XGBClassifier(**params_xgb, verbosity=1)
xgb_model.fit(
    X_train1,
    y_train1,
    eval_set=[(X_valid1, y_valid1)],
    eval_metric='logloss',
    early_stopping_rounds=30,
    verbose=True,
)

# Draw a tree of the fitted model and enlarge the current figure.
plot_tree(xgb_model)
fig = plt.gcf()
fig.set_size_inches(30, 30)

# SHAP bar summary for the XGBoost model over the full frame X.
explainer_xgb = shap.TreeExplainer(xgb_model)
shap_values_xgb = explainer_xgb.shap_values(X)
shap.summary_plot(shap_values_xgb, X, plot_type='bar')
def objective(trial):
    """Optuna objective for LightGBM: 100 x hold-out log-loss of one configuration.

    Reads the module-level feature frame ``X`` and target ``y``.

    Args:
        trial: the ``optuna`` trial used to sample hyper-parameters and to
            receive intermediate values from the pruning callback.

    Returns:
        Log-loss on the hold-out fold multiplied by 100 (lower is better).

    Note:
        Only ONE of the ten stratified folds is used as hold-out (a 90/10
        split, one fit per trial); this is not a full cross-validation.
    """
    skf = StratifiedKFold(n_splits=10, random_state=22, shuffle=True)
    # Exhaust the splitter and keep its final fold as the hold-out set.
    *_, (train_index, test_index) = skf.split(X, y)
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "boosting_type": trial.suggest_categorical("boosting_type", ['rf', 'gbdt']),
        "n_estimators": trial.suggest_categorical("n_estimators", [200, 400, 600, 800, 1000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 1.0),
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "max_depth": trial.suggest_int("max_depth", 2, 50),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 100),
        "lambda_l1": trial.suggest_int("lambda_l1", 1, 1000),
        "lambda_l2": trial.suggest_int("lambda_l2", 1, 1000),
        'random_state': trial.suggest_categorical('random_state', [24, 48, 2020]),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        # Upper bound is 0.9, not 0.95: with step=0.1 starting at 0.2 the
        # largest reachable value is 0.9, and Optuna warns and truncates the
        # range when it is not divisible by the step. Sampled values are the same.
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.9, step=0.1),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1, 2, 4, 6, 8, 10]),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.9, step=0.1),
    }

    model = lgbm.LGBMClassifier(objective="binary", **param_grid)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        callbacks=[
            # Reports the validation binary_logloss to Optuna each iteration.
            LightGBMPruningCallback(trial, 'binary_logloss'),
        ],
    )

    preds = model.predict_proba(X_valid)
    score = log_loss(y_valid, preds)
    # NOTE(review): the returned value is scaled by 100 while the pruning
    # callback reports the unscaled metric - confirm the mismatch is intended.
    return 100 * score
if __name__ == "__main__":
    # Random search over the LightGBM search space.
    # NOTE(review): per Optuna's docs ThresholdPruner(lower=0.0) prunes only
    # when an intermediate value drops below 0.0; log-loss is non-negative, so
    # this pruner is not expected to stop ordinary trials - confirm intent.
    study2 = optuna.create_study(
        direction='minimize',
        sampler=RandomSampler(),
        pruner=optuna.pruners.ThresholdPruner(lower=0.0),
    )
    study2.optimize(objective, n_trials=100)
    print('Number of finished trials:', len(study2.trials))
    print('Best trial:', study2.best_trial.params)

# Best objective value (already scaled by 100 inside the objective) and the
# parameters that produced it.
study2.best_value
study2.best_params

# Optuna diagnostics for the LightGBM study.
optuna.visualization.plot_optimization_history(study2)
optuna.visualization.plot_slice(study2)
optuna.visualization.plot_param_importances(study2)
# Fixed LightGBM hyper-parameters used for the final model fit.
params_lgbm = dict(
    bagging_fraction=0.7,
    bagging_freq=8,
    boosting_type='rf',
    feature_fraction=0.4,
    lambda_l1=9,
    lambda_l2=517,
    learning_rate=0.30902486498872506,
    max_depth=4,
    min_data_in_leaf=7,
    min_gain_to_split=1.776112821650258,
    n_estimators=400,
    num_leaves=36,
    random_state=2020,
)

# Fit on the X_train1 / X_valid1 split with early stopping on validation
# log-loss (30 rounds without improvement).
lgb_model = lgbm.LGBMClassifier(objective="binary", **params_lgbm)
lgb_model.fit(
    X_train1,
    y_train1,
    eval_set=[(X_valid1, y_valid1)],
    eval_metric='logloss',
    early_stopping_rounds=30,
)

# Draw a tree of the fitted model and enlarge the current figure.
lgbm.plot_tree(lgb_model)
fig = plt.gcf()
fig.set_size_inches(30, 30)

# Metric recorded during training, per iteration.
lgbm.plot_metric(lgb_model)

# SHAP bar summary for the LightGBM model over the full frame X.
explainer_lgb = shap.TreeExplainer(lgb_model)
shap_values_lgb = explainer_lgb.shap_values(X)
shap.summary_plot(shap_values_lgb, X, plot_type='bar')
# Standardise every non-object column of X in place (zero mean, unit variance
# per column) before the frame is fed to the neural network.
scl = StandardScaler()
num_cols = X.select_dtypes(exclude='object').columns
if len(num_cols):
    # Fit ONCE on all columns together. StandardScaler works column-wise, so
    # the scaled values match a per-column fit, but `scl` now keeps the
    # mean/scale of every column (instead of only the last one fitted) and
    # can re-apply the same transform to other data with the same columns.
    # NOTE(review): the statistics are computed on all rows of X, including
    # rows later used for validation - confirm this is acceptable.
    X[num_cols] = scl.fit_transform(X[num_cols])
from tensorflow import keras
from tensorflow.keras import layers
# Feed-forward binary classifier. An input batch-normalisation layer is
# followed by three hidden stages of Dense(relu) -> BatchNormalization ->
# Dropout with (units, dropout rate) = (128, 0.5), (64, 0.4), (32, 0.3), and a
# single sigmoid output unit. Layers are created in the same order as they
# appear in the network.
nn = keras.Sequential(
    [layers.BatchNormalization(input_shape=[X.shape[1]])]
    + [
        layer
        for width, drop in ((128, 0.5), (64, 0.4), (32, 0.3))
        for layer in (
            layers.Dense(units=width, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(rate=drop),
        )
    ]
    + [layers.Dense(units=1, activation='sigmoid')]
)
# Despite its name, `auc` holds a BinaryAccuracy metric (not an AUC metric).
auc = keras.metrics.BinaryAccuracy()

# Adam optimiser with default settings, binary cross-entropy loss.
nn.compile(
    loss=keras.losses.BinaryCrossentropy(),
    optimizer=keras.optimizers.Adam(),
    metrics=[auc],
)

# Early stopping with a patience of 20 epochs and a minimum improvement of
# 0.001; the best weights are NOT restored when training stops.
early_stopping = keras.callbacks.EarlyStopping(
    min_delta=0.001,
    patience=20,
    restore_best_weights=False,
)
# Learning rate used for the first `epochs_drop` epochs of the step schedule.
initial_learning_rate = 0.01


def lr_step_decay(epoch, lr, drop_rate=0.5, epochs_drop=10.0):
    """Step-decay schedule: multiply the rate by ``drop_rate`` every ``epochs_drop`` epochs.

    Args:
        epoch: zero-based epoch index.
        lr: current learning rate; part of the scheduler callback signature
            but ignored, because the rate is always derived from
            ``initial_learning_rate``.
        drop_rate: multiplicative factor applied at each step (default 0.5).
        epochs_drop: number of epochs between two steps (default 10).

    Returns:
        ``initial_learning_rate * drop_rate ** floor(epoch / epochs_drop)``.
    """
    # Imported locally so the schedule does not depend on a module-level
    # `import math` being present elsewhere in the file.
    import math

    return initial_learning_rate * math.pow(drop_rate, math.floor(epoch / epochs_drop))
# Applies lr_step_decay at the start of every epoch (verbose=1 logs the rate).
callback_lr = keras.callbacks.LearningRateScheduler(lr_step_decay, verbose=1)

# Train for at most 100 epochs in batches of 64. validation_split=0.2 lets
# Keras hold out part of (X, y) for validation (per the Keras docs, the last
# samples of the data as passed in, taken before shuffling).
history = nn.fit(
    x=X,
    y=y,
    batch_size=64,
    epochs=100,
    validation_split=0.2,
    callbacks=[early_stopping, callback_lr],
)
# Per-epoch Keras training history as a DataFrame.
history_df = pd.DataFrame(history.history)
history_df.head()

# Overlay the validation log-loss curves of all four models on one figure.
plt.figure(figsize=(10, 10))
for curve, curve_label in (
    (xgb_model.evals_result_['validation_0']['logloss'], "xgb_validation_loss"),
    (cat_model.evals_result_['validation']['Logloss'], "catboost_validation_loss"),
    (lgb_model.evals_result_['valid_0']['binary_logloss'], "lgbm_validation_loss"),
    (history_df['val_loss'], "Keras_Nnet_validation_loss"),
):
    plt.plot(curve, label=curve_label)
plt.legend()
# White tick labels on the x axis.
plt.xticks(color='w')
plt.title("Model's Validation-Logloss Comparison")