deloitte_ml_challenge_predict_loan

!pip install -U klib

import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) import seaborn as sns # visualization from matplotlib import pyplot as plt # visualization import klib # visualization %matplotlib inline import warnings warnings.filterwarnings("ignore") from sklearn.feature_selection import chi2 from scipy.stats import chi2_contingency

# Make Google Drive available under /content/gdrive; the CSV files are read from there.
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive

# Read the train and test tables from the mounted Drive folder.
train = pd.read_csv(
    "/content/gdrive/MyDrive/deloitte_ml_challenge_predict_loan_defaulters./train.csv"
)
test = pd.read_csv(
    "/content/gdrive/MyDrive/deloitte_ml_challenge_predict_loan_defaulters./test.csv"
)

# Swap the 'Employment Duration' and 'Home Ownership' column labels in both frames.
train, test = (
    frame.rename(
        columns={
            'Employment Duration': 'Home Ownership',
            'Home Ownership': 'Employment Duration',
        }
    )
    for frame in (train, test)
)

# Print column names, non-null counts and dtypes of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67463 entries, 0 to 67462
Data columns (total 35 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   ID                            67463 non-null  int64  
 1   Loan Amount                   67463 non-null  int64  
 2   Funded Amount                 67463 non-null  int64  
 3   Funded Amount Investor        67463 non-null  float64
 4   Term                          67463 non-null  int64  
 5   Batch Enrolled                67463 non-null  object 
 6   Interest Rate                 67463 non-null  float64
 7   Grade                         67463 non-null  object 
 8   Sub Grade                     67463 non-null  object 
 9   Home Ownership                67463 non-null  object 
 10  Employment Duration           67463 non-null  float64
 11  Verification Status           67463 non-null  object 
 12  Payment Plan                  67463 non-null  object 
 13  Loan Title                    67463 non-null  object 
 14  Debit to Income               67463 non-null  float64
 15  Delinquency - two years       67463 non-null  int64  
 16  Inquires - six months         67463 non-null  int64  
 17  Open Account                  67463 non-null  int64  
 18  Public Record                 67463 non-null  int64  
 19  Revolving Balance             67463 non-null  int64  
 20  Revolving Utilities           67463 non-null  float64
 21  Total Accounts                67463 non-null  int64  
 22  Initial List Status           67463 non-null  object 
 23  Total Received Interest       67463 non-null  float64
 24  Total Received Late Fee       67463 non-null  float64
 25  Recoveries                    67463 non-null  float64
 26  Collection Recovery Fee       67463 non-null  float64
 27  Collection 12 months Medical  67463 non-null  int64  
 28  Application Type              67463 non-null  object 
 29  Last week Pay                 67463 non-null  int64  
 30  Accounts Delinquent           67463 non-null  int64  
 31  Total Collection Amount       67463 non-null  int64  
 32  Total Current Balance         67463 non-null  int64  
 33  Total Revolving Credit Limit  67463 non-null  int64  
 34  Loan Status                   67463 non-null  int64  
dtypes: float64(9), int64(17), object(9)
memory usage: 18.0+ MB

# Print column names, non-null counts and dtypes of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28913 entries, 0 to 28912
Data columns (total 35 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   ID                            28913 non-null  int64  
 1   Loan Amount                   28913 non-null  int64  
 2   Funded Amount                 28913 non-null  int64  
 3   Funded Amount Investor        28913 non-null  float64
 4   Term                          28913 non-null  int64  
 5   Batch Enrolled                28913 non-null  object 
 6   Interest Rate                 28913 non-null  float64
 7   Grade                         28913 non-null  object 
 8   Sub Grade                     28913 non-null  object 
 9   Home Ownership                28913 non-null  object 
 10  Employment Duration           28913 non-null  float64
 11  Verification Status           28913 non-null  object 
 12  Payment Plan                  28913 non-null  object 
 13  Loan Title                    28913 non-null  object 
 14  Debit to Income               28913 non-null  float64
 15  Delinquency - two years       28913 non-null  int64  
 16  Inquires - six months         28913 non-null  int64  
 17  Open Account                  28913 non-null  int64  
 18  Public Record                 28913 non-null  int64  
 19  Revolving Balance             28913 non-null  int64  
 20  Revolving Utilities           28913 non-null  float64
 21  Total Accounts                28913 non-null  int64  
 22  Initial List Status           28913 non-null  object 
 23  Total Received Interest       28913 non-null  float64
 24  Total Received Late Fee       28913 non-null  float64
 25  Recoveries                    28913 non-null  float64
 26  Collection Recovery Fee       28913 non-null  float64
 27  Collection 12 months Medical  28913 non-null  int64  
 28  Application Type              28913 non-null  object 
 29  Last week Pay                 28913 non-null  int64  
 30  Accounts Delinquent           28913 non-null  int64  
 31  Total Collection Amount       28913 non-null  int64  
 32  Total Current Balance         28913 non-null  int64  
 33  Total Revolving Credit Limit  28913 non-null  int64  
 34  Loan Status                   0 non-null      float64
dtypes: float64(10), int64(16), object(9)
memory usage: 7.7+ MB

# Number of missing values in each training column.
train.isnull().sum()

# Number of missing values in each test column.
test.isnull().sum()

# First three rows of the training frame.
train.head(3)

def cat_plot(df, catcol, title='', **arg):
    """Draw a count plot of ``df[catcol]`` on a new 8x5 figure.

    Parameters
    ----------
    df : DataFrame that holds the column to plot.
    catcol : name of the column whose values are counted.
    title : text shown above the plot (empty by default).
    **arg : extra keyword arguments forwarded to ``sns.countplot``.
    """
    plt.figure(figsize=(8, 5))
    # Bars follow the order of value_counts(), i.e. most frequent value first.
    bar_order = df[catcol].value_counts().index
    sns.countplot(data=df, x=catcol, order=bar_order, **arg)
    plt.title(title, fontsize=25)
    plt.xlabel(catcol, fontsize=15)
    # Vertical tick labels so long category names do not overlap.
    plt.xticks(fontsize=10, rotation=90)

# Count plot of the target column (title typo "Taget" corrected).
cat_plot(train, "Loan Status", "Target(Loan Status) Column Distribution")

# Number of rows per target value.
train["Loan Status"].value_counts()

# Horizontal count plot of 'Batch Enrolled', one bar colour per 'Loan Status' value.
# The target is cast to object first, presumably so the hue is treated as categorical.
_ = plt.figure(figsize=(8, 15))
_ = sns.countplot(
    y=train['Batch Enrolled'],
    hue=train['Loan Status'].astype('object'),
)

# Names of all object-dtype columns in the training frame.
train.select_dtypes(include="object").columns.values

# Count plots of selected categorical columns (one panel each, 4x2 grid),
# split by 'Loan Status'.
# plt.figure is used instead of plt.subplots: the latter also creates a
# full-size Axes that would stay underneath the panels added by plt.subplot.
fig = plt.figure(figsize=(20, 20))
for i, col in enumerate(['Grade', 'Sub Grade', 'Home Ownership',
                         'Verification Status', 'Payment Plan',
                         'Initial List Status', 'Application Type']):
    plt.subplot(4, 2, i + 1)
    sns.countplot(x=train[col], hue=train['Loan Status'].astype('object'))
    plt.title(col + ' Distribution', fontsize=15)
    plt.xlabel(col, fontsize=10)
    plt.xticks(fontsize=15)
# Adjust spacing once, after every panel exists, rather than once per panel.
plt.tight_layout()
plt.show()

# For every object-dtype column print its name and value frequencies,
# framed by a separator line above and below.
for col in train.select_dtypes(include="object").columns:
    print("==" * 50, col, train[col].value_counts(), "==" * 50, sep="\n")

====================================================================================================
Batch Enrolled
BAT3873588    3626
BAT1586599    3142
BAT1104812    2996
BAT2252229    2557
BAT2803411    2425
BAT1780517    2403
BAT1184694    2298
BAT2078974    2290
BAT2575549    2257
BAT4694572    2248
BAT4271519    2054
BAT2558388    1963
BAT3193689    1864
BAT1930365    1844
BAT2136391    1790
BAT2333412    1775
BAT3726927    1774
BAT4136152    1766
BAT5341619    1717
BAT5525466    1709
BAT5489674    1677
BAT5629144    1639
BAT1766061    1461
BAT2833642    1421
BAT5924421    1404
BAT2522922    1399
BAT2428731    1398
BAT4808022    1303
BAT4351734    1140
BAT5547201    1127
BAT5714674    1105
BAT3461431    1068
BAT224923      895
BAT1761981     894
BAT4722912     887
BAT2003848     842
BAT1467036     802
BAT5849876     768
BAT3865626     728
BAT5811547     711
BAT1135695     296
Name: Batch Enrolled, dtype: int64
====================================================================================================
====================================================================================================
Grade
C    19085
B    18742
A    12055
D     8259
E     6446
F     2246
G      630
Name: Grade, dtype: int64
====================================================================================================
====================================================================================================
Sub Grade
B4    4462
C1    4188
B3    3999
A5    3540
B2    3520
B5    3408
D1    3304
C4    3250
C2    3219
C3    3121
B1    2924
C5    2472
A4    2264
D4    2050
D2    1963
D5    1952
A2    1837
D3    1824
E2    1746
A3    1685
A1    1364
E3    1321
E1    1298
E4    1117
F2     947
F1     824
E5     769
F5     582
F3     578
G2     447
F4     441
G1     366
G5     284
G3     246
G4     151
Name: Sub Grade, dtype: int64
====================================================================================================
====================================================================================================
Home Ownership
MORTGAGE    36351
RENT        24150
OWN          6962
Name: Home Ownership, dtype: int64
====================================================================================================
====================================================================================================
Verification Status
Source Verified    33036
Verified           18078
Not Verified       16349
Name: Verification Status, dtype: int64
====================================================================================================
====================================================================================================
Payment Plan
n    67463
Name: Payment Plan, dtype: int64
====================================================================================================
====================================================================================================
Loan Title
Credit card refinancing    30728
Debt consolidation         24841
Debt Consolidation          3544
Other                       2455
Home improvement            2211
                           ...  
Home loan                      6
Personal loan                  5
Getting Ahead                  5
bills                          4
Credit                         4
Name: Loan Title, Length: 109, dtype: int64
====================================================================================================
====================================================================================================
Initial List Status
w    36299
f    31164
Name: Initial List Status, dtype: int64
====================================================================================================
====================================================================================================
Application Type
INDIVIDUAL    67340
JOINT           123
Name: Application Type, dtype: int64
====================================================================================================

# Histogram of every non-object column, split by 'Loan Status', two panels per row.
num_cols = train.select_dtypes(exclude="object").columns.values
# Derive the row count from the number of columns (13 rows for the 26 raw
# numeric columns) so the grid never runs out of slots.
n_rows = (len(num_cols) + 1) // 2
# plt.figure is used instead of plt.subplots: the latter also creates a
# full-size Axes that would stay underneath the panels added by plt.subplot.
fig = plt.figure(figsize=(20, 20))
for i, col in enumerate(num_cols):
    plt.subplot(n_rows, 2, i + 1)
    sns.histplot(data=train, x=col, hue="Loan Status")
    plt.title(col + ' Distribution', fontsize=15)
    plt.xlabel("")
    plt.xticks(fontsize=8)
# Adjust spacing once, after every panel exists, rather than once per panel.
plt.tight_layout()
plt.show()

# For every non-object column print its name and summary statistics,
# framed by a separator line above and below.
for col in train.select_dtypes(exclude="object").columns:
    print("==" * 50, col, train[col].describe(), "==" * 50, sep="\n")

====================================================================================================
ID
count    6.746300e+04
mean     2.562761e+07
std      2.109155e+07
min      1.297933e+06
25%      6.570288e+06
50%      1.791565e+07
75%      4.271521e+07
max      7.224578e+07
Name: ID, dtype: float64
====================================================================================================
====================================================================================================
Loan Amount
count    67463.000000
mean     16848.902776
std       8367.865726
min       1014.000000
25%      10012.000000
50%      16073.000000
75%      22106.000000
max      35000.000000
Name: Loan Amount, dtype: float64
====================================================================================================
====================================================================================================
Funded Amount
count    67463.000000
mean     15770.599114
std       8150.992662
min       1014.000000
25%       9266.500000
50%      13042.000000
75%      21793.000000
max      34999.000000
Name: Funded Amount, dtype: float64
====================================================================================================
====================================================================================================
Funded Amount Investor
count    67463.000000
mean     14621.799323
std       6785.345170
min       1114.590204
25%       9831.684984
50%      12793.682170
75%      17807.594120
max      34999.746430
Name: Funded Amount Investor, dtype: float64
====================================================================================================
====================================================================================================
Term
count    67463.000000
mean        58.173814
std          3.327441
min         36.000000
25%         58.000000
50%         59.000000
75%         59.000000
max         59.000000
Name: Term, dtype: float64
====================================================================================================
====================================================================================================
Interest Rate
count    67463.000000
mean        11.846258
std          3.718629
min          5.320006
25%          9.297147
50%         11.377696
75%         14.193533
max         27.182348
Name: Interest Rate, dtype: float64
====================================================================================================
====================================================================================================
Employment Duration
count     67463.000000
mean      80541.502522
std       45029.120366
min       14573.537170
25%       51689.843335
50%       69335.832680
75%       94623.322785
max      406561.536400
Name: Employment Duration, dtype: float64
====================================================================================================
====================================================================================================
Debit to Income
count    67463.000000
mean        23.299241
std          8.451824
min          0.675299
25%         16.756416
50%         22.656658
75%         30.048400
max         39.629862
Name: Debit to Income, dtype: float64
====================================================================================================
====================================================================================================
Delinquency - two years
count    67463.000000
mean         0.327127
std          0.800888
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          8.000000
Name: Delinquency - two years, dtype: float64
====================================================================================================
====================================================================================================
Inquires - six months
count    67463.000000
mean         0.145754
std          0.473291
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          5.000000
Name: Inquires - six months, dtype: float64
====================================================================================================
====================================================================================================
Open Account
count    67463.000000
mean        14.266561
std          6.225060
min          2.000000
25%         10.000000
50%         13.000000
75%         16.000000
max         37.000000
Name: Open Account, dtype: float64
====================================================================================================
====================================================================================================
Public Record
count    67463.000000
mean         0.081437
std          0.346606
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          4.000000
Name: Public Record, dtype: float64
====================================================================================================
====================================================================================================
Revolving Balance
count     67463.000000
mean       7699.342425
std        7836.148190
min           0.000000
25%        2557.000000
50%        5516.000000
75%       10184.500000
max      116933.000000
Name: Revolving Balance, dtype: float64
====================================================================================================
====================================================================================================
Revolving Utilities
count    67463.000000
mean        52.889443
std         22.539450
min          0.005172
25%         38.658825
50%         54.082334
75%         69.177117
max        100.880050
Name: Revolving Utilities, dtype: float64
====================================================================================================
====================================================================================================
Total Accounts
count    67463.000000
mean        18.627929
std          8.319246
min          4.000000
25%         13.000000
50%         18.000000
75%         23.000000
max         72.000000
Name: Total Accounts, dtype: float64
====================================================================================================
====================================================================================================
Total Received Interest
count    67463.000000
mean      2068.992542
std       2221.918745
min          4.736746
25%        570.903814
50%       1330.842771
75%       2656.956837
max      14301.368310
Name: Total Received Interest, dtype: float64
====================================================================================================
====================================================================================================
Total Received Late Fee
count    67463.000000
mean         1.143969
std          5.244365
min          0.000003
25%          0.021114
50%          0.043398
75%          0.071884
max         42.618882
Name: Total Received Late Fee, dtype: float64
====================================================================================================
====================================================================================================
Recoveries
count    67463.000000
mean        59.691578
std        357.026346
min          0.000036
25%          1.629818
50%          3.344524
75%          5.453727
max       4354.467419
Name: Recoveries, dtype: float64
====================================================================================================
====================================================================================================
Collection Recovery Fee
count    67463.000000
mean         1.125141
std          3.489885
min          0.000036
25%          0.476259
50%          0.780141
75%          1.070566
max        166.833000
Name: Collection Recovery Fee, dtype: float64
====================================================================================================
====================================================================================================
Collection 12 months Medical
count    67463.000000
mean         0.021301
std          0.144385
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: Collection 12 months Medical, dtype: float64
====================================================================================================
====================================================================================================
Last week Pay
count    67463.000000
mean        71.163260
std         43.315845
min          0.000000
25%         35.000000
50%         68.000000
75%        105.000000
max        161.000000
Name: Last week Pay, dtype: float64
====================================================================================================
====================================================================================================
Accounts Delinquent
count    67463.0
mean         0.0
std          0.0
min          0.0
25%          0.0
50%          0.0
75%          0.0
max          0.0
Name: Accounts Delinquent, dtype: float64
====================================================================================================
====================================================================================================
Total Collection Amount
count    67463.000000
mean       146.467990
std        744.382233
min          1.000000
25%         24.000000
50%         36.000000
75%         46.000000
max      16421.000000
Name: Total Collection Amount, dtype: float64
====================================================================================================
====================================================================================================
Total Current Balance
count    6.746300e+04
mean     1.595739e+05
std      1.390332e+05
min      6.170000e+02
25%      5.037900e+04
50%      1.183690e+05
75%      2.283750e+05
max      1.177412e+06
Name: Total Current Balance, dtype: float64
====================================================================================================
====================================================================================================
Total Revolving Credit Limit
count     67463.000000
mean      23123.005544
std       20916.699999
min        1000.000000
25%        8155.500000
50%       16733.000000
75%       32146.500000
max      201169.000000
Name: Total Revolving Credit Limit, dtype: float64
====================================================================================================
====================================================================================================
Loan Status
count    67463.000000
mean         0.092510
std          0.289747
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: Loan Status, dtype: float64
====================================================================================================

# Frequency of each distinct value in 'Accounts Delinquent'
# (the describe() output for this column reports min = max = 0).
train['Accounts Delinquent'].value_counts()

def pre_process(df):
    """Drop unused text columns and append engineered loan features.

    Parameters
    ----------
    df : pandas.DataFrame
        Loan table containing the raw columns referenced below.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input object is not modified) without
        'Loan Title' and 'Payment Plan' and with the derived columns added.

    Raises
    ------
    KeyError
        If a raw column needed for one of the derived features is missing.
    """
    # errors='ignore' keeps the function re-runnable on an already processed
    # frame (e.g. when the notebook cell is executed twice). drop() still
    # returns a new frame, so the caller's object is left untouched.
    df = df.drop(columns=['Loan Title', 'Payment Plan'], errors='ignore')

    # Interest per month: (loan amount * rate %) spread over the term.
    df['Interest_per_mon'] = ((df['Loan Amount'] * df['Interest Rate']) / 100) / df['Term']
    # Total interest amount over the whole term.
    df['total_intr_amt'] = df['Interest_per_mon'] * df['Term']

    # 1 when the investor-funded amount exceeds the funded amount.
    df['Fund_amnt_grt'] = (df['Funded Amount Investor'] > df['Funded Amount']).astype('int')

    # Sum of 'Revolving Balance' and 'Revolving Utilities'.
    # NOTE(review): the two columns are added as-is; confirm they share a scale.
    df['total_revolve'] = df['Revolving Balance'] + df['Revolving Utilities']
    # Received interest plus collection amount.
    df['total_received'] = df['Total Received Interest'] + df['Total Collection Amount']
    # Recoveries plus the collection recovery fee.
    df['total_recovery'] = df['Recoveries'] + df['Collection Recovery Fee']

    # 1 when total_revolve is BELOW the revolving credit limit
    # (despite the '_grt' suffix in the column name).
    df['revolve_amnt_grt'] = (df['total_revolve'] < df['Total Revolving Credit Limit']).astype('int')
    # 1 when the loan amount exceeds the total current balance.
    df['loan_grt_balance'] = (df['Loan Amount'] > df['Total Current Balance']).astype('int')

    # Number of rows in this frame that share the row's 'Batch Enrolled' value.
    df['reprs_cust_count'] = df.groupby(['Batch Enrolled'])['Batch Enrolled'].transform('count')
    return df

# Add the engineered features to the training frame.
train = pre_process(train)

# Categorical columns that jointly define a group for the group statistics.
grpcol = [
    'Batch Enrolled',
    'Grade',
    'Sub Grade',
    'Home Ownership',
    'Verification Status',
    'Initial List Status',
    'Application Type',
]

# Numeric columns (raw and engineered) for which group statistics are computed.
numcols = [
    'Loan Amount',
    'Funded Amount',
    'Funded Amount Investor',
    'Interest Rate',
    'Employment Duration',
    'Debit to Income',
    'Open Account',
    'Revolving Balance',
    'Revolving Utilities',
    'Total Accounts',
    'Total Received Interest',
    'Total Received Late Fee',
    'Recoveries',
    'Collection Recovery Fee',
    'Total Collection Amount',
    'Total Current Balance',
    'Total Revolving Credit Limit',
    'Interest_per_mon',
    'total_intr_amt',
    'total_revolve',
    'total_received',
    'total_recovery',
]

# For every numeric column, attach the min / median / max of that column within
# the row's group (rows sharing all `grpcol` values) as '<col>_min',
# '<col>_median' and '<col>_max'.
# The aggregations are given as a list, not a set: a set has no defined order,
# so the position of the generated columns could differ from one run to the next.
agg_funcs = ['min', 'median', 'max']
for col in numcols:
    df1 = train.groupby(grpcol)[col].agg(agg_funcs).reset_index()
    df1 = df1.rename(columns={func: col + '_' + func for func in agg_funcs})
    train = pd.merge(train, df1, on=grpcol, how='left')

# Print every training column as "<position>_<name>".
for position, name in enumerate(train.columns.values):
    print(f"{position}_{name}")

0_ID
1_Loan Amount
2_Funded Amount
3_Funded Amount Investor
4_Term
5_Batch Enrolled
6_Interest Rate
7_Grade
8_Sub Grade
9_Home Ownership
10_Employment Duration
11_Verification Status
12_Debit to Income
13_Delinquency - two years
14_Inquires - six months
15_Open Account
16_Public Record
17_Revolving Balance
18_Revolving Utilities
19_Total Accounts
20_Initial List Status
21_Total Received Interest
22_Total Received Late Fee
23_Recoveries
24_Collection Recovery Fee
25_Collection 12 months Medical
26_Application Type
27_Last week Pay
28_Accounts Delinquent
29_Total Collection Amount
30_Total Current Balance
31_Total Revolving Credit Limit
32_Loan Status
33_Interest_per_mon
34_total_intr_amt
35_Fund_amnt_grt
36_total_revolve
37_total_received
38_total_recovery
39_revolve_amnt_grt
40_loan_grt_balance
41_reprs_cust_count
42_Loan Amount_max
43_Loan Amount_median
44_Loan Amount_min
45_Funded Amount_max
46_Funded Amount_median
47_Funded Amount_min
48_Funded Amount Investor_max
49_Funded Amount Investor_median
50_Funded Amount Investor_min
51_Interest Rate_max
52_Interest Rate_median
53_Interest Rate_min
54_Employment Duration_max
55_Employment Duration_median
56_Employment Duration_min
57_Debit to Income_max
58_Debit to Income_median
59_Debit to Income_min
60_Open Account_max
61_Open Account_median
62_Open Account_min
63_Revolving Balance_max
64_Revolving Balance_median
65_Revolving Balance_min
66_Revolving Utilities_max
67_Revolving Utilities_median
68_Revolving Utilities_min
69_Total Accounts_max
70_Total Accounts_median
71_Total Accounts_min
72_Total Received Interest_max
73_Total Received Interest_median
74_Total Received Interest_min
75_Total Received Late Fee_max
76_Total Received Late Fee_median
77_Total Received Late Fee_min
78_Recoveries_max
79_Recoveries_median
80_Recoveries_min
81_Collection Recovery Fee_max
82_Collection Recovery Fee_median
83_Collection Recovery Fee_min
84_Total Collection Amount_max
85_Total Collection Amount_median
86_Total Collection Amount_min
87_Total Current Balance_max
88_Total Current Balance_median
89_Total Current Balance_min
90_Total Revolving Credit Limit_max
91_Total Revolving Credit Limit_median
92_Total Revolving Credit Limit_min
93_Interest_per_mon_max
94_Interest_per_mon_median
95_Interest_per_mon_min
96_total_intr_amt_max
97_total_intr_amt_median
98_total_intr_amt_min
99_total_revolve_max
100_total_revolve_median
101_total_revolve_min
102_total_received_max
103_total_received_median
104_total_received_min
105_total_recovery_max
106_total_recovery_median
107_total_recovery_min

# Add the engineered features to the test frame.
test = pre_process(test)

# For every numeric column, attach the min / median / max of that column within
# the row's group (rows sharing all `grpcol` values) as '<col>_min',
# '<col>_median' and '<col>_max'.
# The aggregations are given as a list, not a set: a set has no defined order,
# so the position of the generated columns could differ from one run to the next.
# NOTE(review): the statistics are computed from the test frame itself, not
# reused from the training frame; confirm this is intended.
agg_funcs = ['min', 'median', 'max']
for col in numcols:
    df1 = test.groupby(grpcol)[col].agg(agg_funcs).reset_index()
    df1 = df1.rename(columns={func: col + '_' + func for func in agg_funcs})
    test = pd.merge(test, df1, on=grpcol, how='left')

# Print every test column as "<position>_<name>".
for position, name in enumerate(test.columns.values):
    print(f"{position}_{name}")

0_ID
1_Loan Amount
2_Funded Amount
3_Funded Amount Investor
4_Term
5_Batch Enrolled
6_Interest Rate
7_Grade
8_Sub Grade
9_Home Ownership
10_Employment Duration
11_Verification Status
12_Debit to Income
13_Delinquency - two years
14_Inquires - six months
15_Open Account
16_Public Record
17_Revolving Balance
18_Revolving Utilities
19_Total Accounts
20_Initial List Status
21_Total Received Interest
22_Total Received Late Fee
23_Recoveries
24_Collection Recovery Fee
25_Collection 12 months Medical
26_Application Type
27_Last week Pay
28_Accounts Delinquent
29_Total Collection Amount
30_Total Current Balance
31_Total Revolving Credit Limit
32_Loan Status
33_Interest_per_mon
34_total_intr_amt
35_Fund_amnt_grt
36_total_revolve
37_total_received
38_total_recovery
39_revolve_amnt_grt
40_loan_grt_balance
41_reprs_cust_count
42_Loan Amount_max
43_Loan Amount_median
44_Loan Amount_min
45_Funded Amount_max
46_Funded Amount_median
47_Funded Amount_min
48_Funded Amount Investor_max
49_Funded Amount Investor_median
50_Funded Amount Investor_min
51_Interest Rate_max
52_Interest Rate_median
53_Interest Rate_min
54_Employment Duration_max
55_Employment Duration_median
56_Employment Duration_min
57_Debit to Income_max
58_Debit to Income_median
59_Debit to Income_min
60_Open Account_max
61_Open Account_median
62_Open Account_min
63_Revolving Balance_max
64_Revolving Balance_median
65_Revolving Balance_min
66_Revolving Utilities_max
67_Revolving Utilities_median
68_Revolving Utilities_min
69_Total Accounts_max
70_Total Accounts_median
71_Total Accounts_min
72_Total Received Interest_max
73_Total Received Interest_median
74_Total Received Interest_min
75_Total Received Late Fee_max
76_Total Received Late Fee_median
77_Total Received Late Fee_min
78_Recoveries_max
79_Recoveries_median
80_Recoveries_min
81_Collection Recovery Fee_max
82_Collection Recovery Fee_median
83_Collection Recovery Fee_min
84_Total Collection Amount_max
85_Total Collection Amount_median
86_Total Collection Amount_min
87_Total Current Balance_max
88_Total Current Balance_median
89_Total Current Balance_min
90_Total Revolving Credit Limit_max
91_Total Revolving Credit Limit_median
92_Total Revolving Credit Limit_min
93_Interest_per_mon_max
94_Interest_per_mon_median
95_Interest_per_mon_min
96_total_intr_amt_max
97_total_intr_amt_median
98_total_intr_amt_min
99_total_revolve_max
100_total_revolve_median
101_total_revolve_min
102_total_received_max
103_total_received_median
104_total_received_min
105_total_recovery_max
106_total_recovery_median
107_total_recovery_min

!pip install optuna !pip install catboost

from sklearn.model_selection import cross_val_score,KFold,train_test_split,ShuffleSplit,StratifiedKFold,learning_curve from catboost import CatBoostClassifier,Pool,cv,monoforest import optuna from optuna.samplers import RandomSampler,TPESampler,MOTPESampler,CmaEsSampler from sklearn.metrics import f1_score,classification_report,confusion_matrix,log_loss from xgboost import XGBClassifier,plot_tree import xgboost as xgb from optuna.integration import XGBoostPruningCallback,LightGBMPruningCallback from sklearn.preprocessing import LabelEncoder,StandardScaler,PolynomialFeatures import lightgbm as lgbm import math

# List every column of `train` together with its positional index,
# formatted as "<position>_<column name>".
for position, column_name in enumerate(train.columns):
    print(f"{position}_{column_name}")

0_ID
1_Loan Amount
2_Funded Amount
3_Funded Amount Investor
4_Term
5_Batch Enrolled
6_Interest Rate
7_Grade
8_Sub Grade
9_Home Ownership
10_Employment Duration
11_Verification Status
12_Debit to Income
13_Delinquency - two years
14_Inquires - six months
15_Open Account
16_Public Record
17_Revolving Balance
18_Revolving Utilities
19_Total Accounts
20_Initial List Status
21_Total Received Interest
22_Total Received Late Fee
23_Recoveries
24_Collection Recovery Fee
25_Collection 12 months Medical
26_Application Type
27_Last week Pay
28_Accounts Delinquent
29_Total Collection Amount
30_Total Current Balance
31_Total Revolving Credit Limit
32_Loan Status
33_Interest_per_mon
34_total_intr_amt
35_Fund_amnt_grt
36_total_revolve
37_total_received
38_total_recovery
39_revolve_amnt_grt
40_loan_grt_balance
41_reprs_cust_count
42_Loan Amount_min
43_Loan Amount_median
44_Loan Amount_max
45_Funded Amount_min
46_Funded Amount_median
47_Funded Amount_max
48_Funded Amount Investor_min
49_Funded Amount Investor_median
50_Funded Amount Investor_max
51_Interest Rate_min
52_Interest Rate_median
53_Interest Rate_max
54_Employment Duration_min
55_Employment Duration_median
56_Employment Duration_max
57_Debit to Income_min
58_Debit to Income_median
59_Debit to Income_max
60_Open Account_min
61_Open Account_median
62_Open Account_max
63_Revolving Balance_min
64_Revolving Balance_median
65_Revolving Balance_max
66_Revolving Utilities_min
67_Revolving Utilities_median
68_Revolving Utilities_max
69_Total Accounts_min
70_Total Accounts_median
71_Total Accounts_max
72_Total Received Interest_min
73_Total Received Interest_median
74_Total Received Interest_max
75_Total Received Late Fee_min
76_Total Received Late Fee_median
77_Total Received Late Fee_max
78_Recoveries_min
79_Recoveries_median
80_Recoveries_max
81_Collection Recovery Fee_min
82_Collection Recovery Fee_median
83_Collection Recovery Fee_max
84_Total Collection Amount_min
85_Total Collection Amount_median
86_Total Collection Amount_max
87_Total Current Balance_min
88_Total Current Balance_median
89_Total Current Balance_max
90_Total Revolving Credit Limit_min
91_Total Revolving Credit Limit_median
92_Total Revolving Credit Limit_max
93_Interest_per_mon_min
94_Interest_per_mon_median
95_Interest_per_mon_max
96_total_intr_amt_min
97_total_intr_amt_median
98_total_intr_amt_max
99_total_revolve_min
100_total_revolve_median
101_total_revolve_max
102_total_received_min
103_total_received_median
104_total_received_max
105_total_recovery_min
106_total_recovery_median
107_total_recovery_max

# Display the names of the object-dtype columns in `train`.
train.select_dtypes(include=["object"]).columns

# Build the modelling inputs.
# X: every column except the row identifier ('ID') and the target.
# y: the binary target column.
# Columns are excluded by name rather than by position, so the split stays
# correct if the engineered columns are re-ordered or new ones are appended.
TARGET_COLUMN = 'Loan Status'
X = train.drop(columns=['ID', TARGET_COLUMN])
y = train[TARGET_COLUMN]

# Display the object-dtype column names of `train` again.
train.select_dtypes(include=["object"]).columns

def objective(trial):
    """Optuna objective: mean stratified 5-fold log loss of a CatBoost model.

    Hyperparameters are sampled once per trial, a ``CatBoostClassifier`` is
    trained on each fold of the module-level ``X`` / ``y``, and the
    validation log losses are averaged.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters and to
            report intermediate (per-fold) scores to the study's pruner.

    Returns:
        The validation log loss averaged over all folds (lower is better).

    Raises:
        optuna.TrialPruned: If the pruner decides to stop the trial after an
            intermediate fold report.
    """
    cat_features = [
        'Batch Enrolled', 'Grade', 'Sub Grade', 'Home Ownership',
        'Verification Status', 'Initial List Status', 'Application Type',
    ]

    # Sample the search space once per trial; every fold shares these values.
    # NOTE(review): the seed ('random_state') is tuned like a hyperparameter;
    # left as-is so the search space and best_trial.params keys are unchanged.
    param = {
        'reg_lambda': trial.suggest_categorical(
            'reg_lambda',
            [0.1, 1.0, 5.0, 10.0, 50.0, 100.0, 150.0, 200.0, 250.0],
        ),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 1.0),
        'n_estimators': trial.suggest_categorical(
            'n_estimators', [200, 400, 600, 800, 1000]
        ),
        'max_depth': trial.suggest_int('max_depth', 2, 12),
        'random_state': trial.suggest_categorical(
            'random_state', [1024, 1048, 2020]
        ),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "boosting_type": trial.suggest_categorical(
            "boosting_type", ["Ordered", "Plain"]
        ),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
    }
    # Each bootstrap type has its own extra knob; MVS needs none here.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float(
            "bagging_temperature", 0, 10
        )
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    skf = StratifiedKFold(n_splits=5, random_state=2000, shuffle=True)
    fold_losses = []
    for fold, (train_index, valid_index) in enumerate(skf.split(X, y)):
        X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        cat_clf = CatBoostClassifier(
            early_stopping_rounds=30,
            eval_metric="Logloss",
            logging_level="Silent",
            **param,
        )
        # early_stopping_rounds only takes effect when a validation set is
        # supplied, so hand the held-out fold to fit().
        cat_clf.fit(
            X_train,
            y_train,
            cat_features=cat_features,
            eval_set=(X_valid, y_valid),
        )

        fold_losses.append(log_loss(y_valid, cat_clf.predict_proba(X_valid)))

        # Report the running mean so the study's pruner can actually stop
        # unpromising trials before all folds are trained.
        trial.report(sum(fold_losses) / len(fold_losses), fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    # Score the trial on all folds, not on a single split.
    return sum(fold_losses) / len(fold_losses)


if __name__ == "__main__":
    # Single-objective minimisation of log loss: use the standard TPE sampler
    # (MOTPESampler is the deprecated multi-objective variant).
    study = optuna.create_study(
        direction='minimize',
        sampler=TPESampler(),
        pruner=optuna.pruners.SuccessiveHalvingPruner(),
    )
    study.optimize(objective, n_trials=100)
    print('Number of finished trials:', len(study.trials))
    print('Best trial:', study.best_trial.params)

[I 2021-12-28 13:19:48,727] A new study created in memory with name: no-name-dff88beb-72a5-48ec-a075-940e721f6802
Custom logger is already specified. Specify more than one logger at same time is not thread safe.[I 2021-12-28 13:21:38,488] Trial 0 finished with value: 0.4100010985018204 and parameters: {'reg_lambda': 200.0, 'learning_rate': 0.9552004688504021, 'n_estimators': 800, 'max_depth': 8, 'random_state': 2020, 'colsample_bylevel': 0.08140795734520816, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.4734171462679141}. Best is trial 0 with value: 0.4100010985018204.
[I 2021-12-28 13:22:04,720] Trial 1 finished with value: 0.31667966817519916 and parameters: {'reg_lambda': 50.0, 'learning_rate': 0.22908297508327882, 'n_estimators': 1000, 'max_depth': 8, 'random_state': 1024, 'colsample_bylevel': 0.010789467699509715, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.31667966817519916.
[I 2021-12-28 13:23:29,795] Trial 2 finished with value: 0.6644377310777809 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.4907128774180346, 'n_estimators': 600, 'max_depth': 11, 'random_state': 1024, 'colsample_bylevel': 0.08409755247942045, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.0476285042720348}. Best is trial 1 with value: 0.31667966817519916.
[I 2021-12-28 13:26:51,294] Trial 3 finished with value: 0.6441318953162362 and parameters: {'reg_lambda': 5.0, 'learning_rate': 0.6072281987571729, 'n_estimators': 800, 'max_depth': 12, 'random_state': 2020, 'colsample_bylevel': 0.09828546260935145, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.31667966817519916.
[I 2021-12-28 13:27:37,484] Trial 4 finished with value: 0.31595243376249565 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.2054762763652627, 'n_estimators': 600, 'max_depth': 7, 'random_state': 1024, 'colsample_bylevel': 0.05518975818331533, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.1710960305616296}. Best is trial 4 with value: 0.31595243376249565.
[I 2021-12-28 13:27:55,039] Trial 5 finished with value: 0.30865882778312453 and parameters: {'reg_lambda': 5.0, 'learning_rate': 0.4824021905021146, 'n_estimators': 400, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.05016541377598957, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.642969598452986}. Best is trial 5 with value: 0.30865882778312453.
[I 2021-12-28 13:29:56,729] Trial 6 finished with value: 0.5960156764001286 and parameters: {'reg_lambda': 50.0, 'learning_rate': 0.9973247845857358, 'n_estimators': 1000, 'max_depth': 11, 'random_state': 1048, 'colsample_bylevel': 0.04633694165559028, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 5 with value: 0.30865882778312453.
[I 2021-12-28 13:31:45,797] Trial 7 finished with value: 0.6494287241532615 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.889824572532522, 'n_estimators': 800, 'max_depth': 12, 'random_state': 2020, 'colsample_bylevel': 0.030429052412292913, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 5 with value: 0.30865882778312453.
[I 2021-12-28 13:32:21,756] Trial 8 finished with value: 0.31039193616078997 and parameters: {'reg_lambda': 0.1, 'learning_rate': 0.07120933762096478, 'n_estimators': 400, 'max_depth': 6, 'random_state': 1024, 'colsample_bylevel': 0.09371015572041257, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.1615454799997345}. Best is trial 5 with value: 0.30865882778312453.
[I 2021-12-28 13:32:54,548] Trial 9 finished with value: 0.34790236365789184 and parameters: {'reg_lambda': 50.0, 'learning_rate': 0.537035648242743, 'n_estimators': 1000, 'max_depth': 4, 'random_state': 1048, 'colsample_bylevel': 0.09305993218346688, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.5078539013455677}. Best is trial 5 with value: 0.30865882778312453.
[I 2021-12-28 13:33:03,926] Trial 10 finished with value: 0.30956134657010265 and parameters: {'reg_lambda': 5.0, 'learning_rate': 0.725132928411979, 'n_estimators': 200, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.0602365598835579, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 9.689488389844328}. Best is trial 5 with value: 0.30865882778312453.
[I 2021-12-28 13:33:22,795] Trial 11 finished with value: 0.30830904610404347 and parameters: {'reg_lambda': 150.0, 'learning_rate': 0.3663598185117414, 'n_estimators': 400, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.03268824602344013, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.09061257217755}. Best is trial 11 with value: 0.30830904610404347.
[I 2021-12-28 13:33:41,171] Trial 12 finished with value: 0.3107439287639494 and parameters: {'reg_lambda': 150.0, 'learning_rate': 0.36630079410257294, 'n_estimators': 400, 'max_depth': 4, 'random_state': 1048, 'colsample_bylevel': 0.013147059693571057, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.49413530659445026}. Best is trial 11 with value: 0.30830904610404347.
[I 2021-12-28 13:33:52,699] Trial 13 finished with value: 0.3126120126448186 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.7674274927514826, 'n_estimators': 200, 'max_depth': 4, 'random_state': 2020, 'colsample_bylevel': 0.030794726248672875, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 9.132662204898661}. Best is trial 11 with value: 0.30830904610404347.
[I 2021-12-28 13:34:17,258] Trial 14 finished with value: 0.3100713944783724 and parameters: {'reg_lambda': 150.0, 'learning_rate': 0.32293624629552825, 'n_estimators': 400, 'max_depth': 5, 'random_state': 2020, 'colsample_bylevel': 0.03182953143065234, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.108678877520031}. Best is trial 11 with value: 0.30830904610404347.
[I 2021-12-28 13:35:26,947] Trial 15 finished with value: 0.3106249861112574 and parameters: {'reg_lambda': 150.0, 'learning_rate': 0.09053893066523971, 'n_estimators': 400, 'max_depth': 9, 'random_state': 1048, 'colsample_bylevel': 0.06739905226535768, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.8814953897509814}. Best is trial 11 with value: 0.30830904610404347.
[I 2021-12-28 13:35:51,773] Trial 16 finished with value: 0.3080115307011757 and parameters: {'reg_lambda': 0.1, 'learning_rate': 0.010232376347675265, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.038777259160665156, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 7.036264331268556}. Best is trial 16 with value: 0.3080115307011757.
[I 2021-12-28 13:36:20,919] Trial 17 finished with value: 0.32092658148024794 and parameters: {'reg_lambda': 0.1, 'learning_rate': 0.0024824579284581294, 'n_estimators': 600, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.042173756850336155, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.9950295776067846}. Best is trial 16 with value: 0.3080115307011757.
[I 2021-12-28 13:36:47,646] Trial 18 finished with value: 0.32592028239956705 and parameters: {'reg_lambda': 0.1, 'learning_rate': 0.18049555099381945, 'n_estimators': 600, 'max_depth': 6, 'random_state': 1048, 'colsample_bylevel': 0.07047022661817581, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 7.352700877033383}. Best is trial 16 with value: 0.3080115307011757.
[I 2021-12-28 13:37:32,600] Trial 19 finished with value: 0.35123040457979327 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.6940345290470873, 'n_estimators': 600, 'max_depth': 9, 'random_state': 1024, 'colsample_bylevel': 0.020600506409986005, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 16 with value: 0.3080115307011757.
[I 2021-12-28 13:37:41,683] Trial 20 finished with value: 0.30796640704515776 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.33882214314664055, 'n_estimators': 200, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.03578120544200743, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.701392567958283}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:37:52,417] Trial 21 finished with value: 0.30859615225391535 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.2894760008256461, 'n_estimators': 200, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.04003421499523795, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 7.42220300403602}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:38:02,152] Trial 22 finished with value: 0.3081579375865388 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.11051749013026863, 'n_estimators': 200, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.0223841969671017, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 7.2594057202635485}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:38:15,745] Trial 23 finished with value: 0.30988585026373144 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.4174945707267844, 'n_estimators': 200, 'max_depth': 5, 'random_state': 2020, 'colsample_bylevel': 0.03907008454470365, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.39495804316205}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:38:31,020] Trial 24 finished with value: 0.38325819347463413 and parameters: {'reg_lambda': 200.0, 'learning_rate': 0.003470484942332831, 'n_estimators': 200, 'max_depth': 5, 'random_state': 2020, 'colsample_bylevel': 0.059667912229920655, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.539183040049113}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:38:55,868] Trial 25 finished with value: 0.3081018926133609 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.23597855088836595, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.02294271783509348, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.6179427758701808}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:39:25,688] Trial 26 finished with value: 0.3176193701983347 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.5938364157517737, 'n_estimators': 600, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.049693921237819634, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 8.584491854195686}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:39:41,718] Trial 27 finished with value: 0.3122706767917134 and parameters: {'reg_lambda': 0.1, 'learning_rate': 0.13868036685126545, 'n_estimators': 200, 'max_depth': 6, 'random_state': 1048, 'colsample_bylevel': 0.07355366433666687, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:40:00,374] Trial 28 finished with value: 0.3194869900184245 and parameters: {'reg_lambda': 0.1, 'learning_rate': 0.4198047063559755, 'n_estimators': 800, 'max_depth': 4, 'random_state': 1024, 'colsample_bylevel': 0.017374518892885864, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.10607444119211629}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:41:32,840] Trial 29 finished with value: 0.3276336844470651 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.2643572687983563, 'n_estimators': 1000, 'max_depth': 7, 'random_state': 2020, 'colsample_bylevel': 0.06325729097795736, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 8.233886782860344}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:41:57,986] Trial 30 finished with value: 0.3080162712982084 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.1660773845006956, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.024730562490008774, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.193901465164717}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:42:23,498] Trial 31 finished with value: 0.308146343448234 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.16260971408286365, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.025550538373078326, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.430379968035934}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:42:53,744] Trial 32 finished with value: 0.30801458890943334 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.05584768927673188, 'n_estimators': 600, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.035785952677856246, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.950426627500387}. Best is trial 20 with value: 0.30796640704515776.
[I 2021-12-28 13:43:24,225] Trial 33 finished with value: 0.30785029071462094 and parameters: {'reg_lambda': 10.0, 'learning_rate': 0.0563049857054157, 'n_estimators': 600, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.0375976522119426, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.161919118032015}. Best is trial 33 with value: 0.30785029071462094.
[I 2021-12-28 13:44:00,431] Trial 34 finished with value: 0.3081083066109118 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.059167911630636436, 'n_estimators': 600, 'max_depth': 4, 'random_state': 2020, 'colsample_bylevel': 0.04550445722001303, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.681095947227308}. Best is trial 33 with value: 0.30785029071462094.
[I 2021-12-28 13:44:11,047] Trial 35 finished with value: 0.30859831252859493 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.30000076290540223, 'n_estimators': 200, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.05398003566205212, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 8.098845389145255}. Best is trial 33 with value: 0.30785029071462094.
[I 2021-12-28 13:44:49,104] Trial 36 finished with value: 0.3140262564431863 and parameters: {'reg_lambda': 200.0, 'learning_rate': 0.003106924185174607, 'n_estimators': 600, 'max_depth': 5, 'random_state': 2020, 'colsample_bylevel': 0.03666400120344703, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.415244577696475}. Best is trial 33 with value: 0.30785029071462094.
[I 2021-12-28 13:45:17,370] Trial 37 finished with value: 0.30797088016327895 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.11731392256095559, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.04353896742282525, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.263974631990708}. Best is trial 33 with value: 0.30785029071462094.
[I 2021-12-28 13:45:28,402] Trial 38 finished with value: 0.30993022867406184 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.3399827866991869, 'n_estimators': 200, 'max_depth': 3, 'random_state': 1048, 'colsample_bylevel': 0.05230017706100512, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 33 with value: 0.30785029071462094.
[I 2021-12-28 13:45:52,526] Trial 39 finished with value: 0.3299671446383035 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.414506926208746, 'n_estimators': 800, 'max_depth': 4, 'random_state': 1024, 'colsample_bylevel': 0.044573350184222295, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.7370442965110242}. Best is trial 33 with value: 0.30785029071462094.
[I 2021-12-28 13:46:19,377] Trial 40 finished with value: 0.30788983164494393 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.12245332874878374, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.028562415478461462, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 6.331776254379165}. Best is trial 33 with value: 0.30785029071462094.
[I 2021-12-28 13:46:46,565] Trial 41 finished with value: 0.3075584206617818 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.12106647986177312, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.02771062925991618, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.976679263417514}. Best is trial 41 with value: 0.3075584206617818.
[I 2021-12-28 13:47:16,717] Trial 42 finished with value: 0.3084138912880576 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.255888808198655, 'n_estimators': 600, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.02734596006292813, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.825313838703929}. Best is trial 41 with value: 0.3075584206617818.
[I 2021-12-28 13:47:43,811] Trial 43 finished with value: 0.3075713833552239 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.17924460055942992, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.028441377492810418, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.720148265189939}. Best is trial 41 with value: 0.3075584206617818.
[I 2021-12-28 13:48:12,035] Trial 44 finished with value: 0.30807460755630495 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.19689379712753324, 'n_estimators': 600, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.017857872470314368, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.3561124462289205}. Best is trial 41 with value: 0.3075584206617818.
[I 2021-12-28 13:48:39,899] Trial 45 finished with value: 0.30754235530557406 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.13329828734526583, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.02800155755287607, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.5691284250573703}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:49:09,341] Trial 46 finished with value: 0.3077844707729344 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.0564022730385032, 'n_estimators': 600, 'max_depth': 4, 'random_state': 2020, 'colsample_bylevel': 0.017748551752997985, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.7942847670631612}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:49:37,772] Trial 47 finished with value: 0.30809672491371987 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.15763541078569066, 'n_estimators': 600, 'max_depth': 4, 'random_state': 2020, 'colsample_bylevel': 0.015347491117266833, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.6870129233571016}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:50:04,277] Trial 48 finished with value: 0.3078857243602207 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.08427577019345282, 'n_estimators': 600, 'max_depth': 5, 'random_state': 2020, 'colsample_bylevel': 0.010561719695356263, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.485688181103633}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:50:30,621] Trial 49 finished with value: 0.3081719456439294 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.26730201879461335, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.018254933863627212, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.699428099362245}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:51:01,187] Trial 50 finished with value: 0.3089203366939519 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.13638703751004952, 'n_estimators': 600, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.026581972956907653, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.7688652937870724}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:51:30,080] Trial 51 finished with value: 0.30792937310734625 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.0331600885789784, 'n_estimators': 600, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.020880351691040272, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.391605857706495}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:51:58,211] Trial 52 finished with value: 0.30777842599045563 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.053210493573485895, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.03367054589376216, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.984676745513563}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:52:26,391] Trial 53 finished with value: 0.30823872528759644 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.2088903798621219, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.03372689451792627, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.825401857572525}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:52:52,135] Trial 54 finished with value: 0.3077095777189635 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.10408603684078382, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.014530165803284055, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.330932319528904}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:53:20,425] Trial 55 finished with value: 0.3079577870019453 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.10180245720178235, 'n_estimators': 600, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.028268171100153776, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.1227693947272825}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:54:01,846] Trial 56 finished with value: 0.3076783408291407 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.1814278085018206, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.014694006339004018, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.9917929098283906}. Best is trial 45 with value: 0.30754235530557406.
[I 2021-12-28 13:54:42,262] Trial 57 finished with value: 0.307434512450736 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.22202683443420487, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.011852431987746277, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.9611631489676657}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 13:55:21,945] Trial 58 finished with value: 0.3091207746627096 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.23317500474272052, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 1048, 'colsample_bylevel': 0.01236008795222979, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.3745540911095694}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 13:55:48,080] Trial 59 finished with value: 0.3124426495439441 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.2904939413843156, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 1024, 'colsample_bylevel': 0.02153328987913529, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 13:56:29,127] Trial 60 finished with value: 0.30769711737383637 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.18610383686010318, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.014339366658207845, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.054618212718263}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 13:57:10,425] Trial 61 finished with value: 0.3081170148894457 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.20140171771107332, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.014960953017796874, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.9766940180027124}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 13:57:57,728] Trial 62 finished with value: 0.3084481641283402 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.15540006118635152, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.0200433754067231, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.668744983224088}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 13:58:42,493] Trial 63 finished with value: 0.30843999968506436 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.18864976712161993, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.025139539802041675, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.6963300987855834}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 13:59:26,705] Trial 64 finished with value: 0.30844061149596924 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.23615649630269608, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.015538140996618684, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.2201867393445855}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:00:06,223] Trial 65 finished with value: 0.30780405326364346 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.14726107801328875, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.011232301211031123, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.755806739464729}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:00:56,022] Trial 66 finished with value: 0.3092458290947544 and parameters: {'reg_lambda': 100.0, 'learning_rate': 0.1786017629006514, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.023493548537692834, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.0394019206650715}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:01:35,105] Trial 67 finished with value: 0.3090707355174857 and parameters: {'reg_lambda': 5.0, 'learning_rate': 0.3053102172036845, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.019206541713959503, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.990327368951434}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:02:20,076] Trial 68 finished with value: 0.30903255877305735 and parameters: {'reg_lambda': 200.0, 'learning_rate': 0.2661219219232608, 'n_estimators': 1000, 'max_depth': 4, 'random_state': 2020, 'colsample_bylevel': 0.014125857515133428, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.365412988399983}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:02:56,426] Trial 69 finished with value: 0.3076469661168601 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.22473943400823987, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010310619631260995, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.4017374144677635}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:03:33,563] Trial 70 finished with value: 0.3082435815008343 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.21959297026801608, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.012855432267646405, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.3415286599657676}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:04:09,443] Trial 71 finished with value: 0.3089773736311915 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.37000381228594587, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.01021415699688447, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.0934511817285424}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:04:51,325] Trial 72 finished with value: 0.30820690560792424 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.12764559171362783, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.0168407674795086, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.521113922735104}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:05:30,906] Trial 73 finished with value: 0.30828739525010385 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.17818549171698533, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.02934914531093111, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.7959423619978017}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:06:19,551] Trial 74 finished with value: 0.3091530094140145 and parameters: {'reg_lambda': 50.0, 'learning_rate': 0.25557695164562216, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.023809062824247336, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.70691542412889}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:06:59,956] Trial 75 finished with value: 0.307696540265026 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.09458670882813933, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010038022476935509, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.228660796676878}. Best is trial 57 with value: 0.307434512450736.
[I 2021-12-28 14:07:41,141] Trial 76 finished with value: 0.30730791194462437 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.08580081062052972, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010267520707686379, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.0022013667340754}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:08:27,309] Trial 77 finished with value: 0.3079363368031429 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.13134837603355667, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.016585184647102177, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.6317388943038855}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:09:12,733] Trial 78 finished with value: 0.3077409284160152 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.035478889433778624, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.019953219863178356, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.2979298781856263}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:09:53,385] Trial 79 finished with value: 0.3079259743261942 and parameters: {'reg_lambda': 1.0, 'learning_rate': 0.08218598002213888, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.01285909756057752, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.9183882073133267}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:10:34,692] Trial 80 finished with value: 0.307543352753024 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.10455491902957995, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010626333321137174, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.098985812540241}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:11:16,800] Trial 81 finished with value: 0.30777009892834034 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.030388653796977597, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.012212144106790473, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.0020989995703067}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:12:01,576] Trial 82 finished with value: 0.3077826325050363 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.10926014489407798, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.018995959410143787, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.6499844212628298}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:12:44,950] Trial 83 finished with value: 0.3081240300178788 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.14532500808558851, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.01654267409193488, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.9821312743917145}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:13:28,037] Trial 84 finished with value: 0.30808062130046154 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.2179122376160405, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.013067456824596937, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 3.4311928044392253}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:14:08,602] Trial 85 finished with value: 0.30763200259961204 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.08158861789201716, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010086726000962694, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.5760825233967264}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:14:49,866] Trial 86 finished with value: 0.3076623277292799 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.03481042974934406, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010190168406570745, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.2550207181909907}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:15:41,127] Trial 87 finished with value: 0.3077001043379401 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.0861489293507142, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.021956150127820873, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.1692987193000635}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:16:16,491] Trial 88 finished with value: 0.3077201799043222 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.07277509159340519, 'n_estimators': 800, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.01689929791170685, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.5884748454348174}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:16:59,027] Trial 89 finished with value: 0.30813338376726573 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.12149840021129285, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 1024, 'colsample_bylevel': 0.012388716278478195, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.6722913923519971}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:17:43,989] Trial 90 finished with value: 0.30746054298748443 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.03838194807833625, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010913391940164316, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.1457965345339334}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:18:26,306] Trial 91 finished with value: 0.30762295798006867 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.04531265501307803, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010165036394957666, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.45346901853716903}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:19:10,785] Trial 92 finished with value: 0.3076288772679253 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.02712599747021193, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.015159468383245807, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.3252185014853817}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:19:55,416] Trial 93 finished with value: 0.3076194019283185 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.029889240608320912, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.015220439810751147, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.1681180901283668}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:20:40,639] Trial 94 finished with value: 0.3076980567937852 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.03578899852067052, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.019519762631036386, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.683420744179926}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:21:31,965] Trial 95 finished with value: 0.30869871730974413 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.0618333530334237, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.026582446917876267, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.09860467704874662}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:22:15,257] Trial 96 finished with value: 0.30768189610784086 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.10163182690307862, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.013326646958461103, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.9403215375021393}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:23:05,643] Trial 97 finished with value: 0.30805641315780585 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.04309094125787194, 'n_estimators': 1000, 'max_depth': 3, 'random_state': 2020, 'colsample_bylevel': 0.023423556965846234, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.421649418720508}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:23:32,348] Trial 98 finished with value: 0.30869032191344525 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.1428252564351233, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 1048, 'colsample_bylevel': 0.018539327351832797, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 76 with value: 0.30730791194462437.
[I 2021-12-28 14:24:16,653] Trial 99 finished with value: 0.3077183558437069 and parameters: {'reg_lambda': 250.0, 'learning_rate': 0.017401625577887297, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.015882097345279532, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.1982140103302499}. Best is trial 76 with value: 0.30730791194462437.

Number of finished trials: 100
Best trial: {'reg_lambda': 250.0, 'learning_rate': 0.08580081062052972, 'n_estimators': 1000, 'max_depth': 2, 'random_state': 2020, 'colsample_bylevel': 0.010267520707686379, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.0022013667340754}

# Best objective value found by the CatBoost study (the trial log shows the
# best value only ever being replaced by a smaller one, i.e. it is minimised).
study.best_value

# Hyperparameters of the best trial.
study.best_params

# Objective value per trial, to see how the search progressed.
optuna.visualization.plot_optimization_history(study)

# Objective value against each individual hyperparameter.
optuna.visualization.plot_slice(study)

# Estimated importance of each hyperparameter for the objective.
optuna.visualization.plot_param_importances(study)

# Stratified 80/20 hold-out split used to fit and monitor the final CatBoost model.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)

# Hyperparameters of the best trial reported by the Optuna study above.
param_cat = {
    'bagging_temperature': 2.0022013667340754,
    'boosting_type': 'Ordered',
    'bootstrap_type': 'Bayesian',
    'colsample_bylevel': 0.010267520707686379,
    'learning_rate': 0.08580081062052972,
    'max_depth': 2,
    'n_estimators': 1000,
    'random_state': 2020,
    'reg_lambda': 250.0,
}

cat_model = CatBoostClassifier(eval_metric="Logloss", **param_cat)

# Fit with early stopping on the hold-out set; the best iteration is kept.
cat_model.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    use_best_model=True,
    verbose=True,
    early_stopping_rounds=30,
    cat_features=[
        'Batch Enrolled',
        'Grade',
        'Sub Grade',
        'Home Ownership',
        'Verification Status',
        'Initial List Status',
        'Application Type',
    ],
)

# Plot the per-iteration train and validation log-loss recorded during the fit.
pd.DataFrame(
    {
        'train_logloss': cat_model.evals_result_['learn']['Logloss'],
        'validation_logloss': cat_model.evals_result_['validation']['Logloss'],
    }
).plot()

!pip install shap

import shap

# Load the JavaScript that SHAP's interactive notebook plots rely on.
shap.initjs()

# Explain the fitted CatBoost model on the full data set. The data is wrapped
# in a Pool so the categorical columns are declared the same way as in fit().
explainer = shap.TreeExplainer(cat_model)
shap_values = explainer.shap_values(
    Pool(
        X,
        y,
        cat_features=[
            'Batch Enrolled',
            'Grade',
            'Sub Grade',
            'Home Ownership',
            'Verification Status',
            'Initial List Status',
            'Application Type',
        ],
    )
)

# Global importance: mean absolute SHAP value per feature.
shap.summary_plot(shap_values, X, class_names=['a', 'b'], plot_type='bar')

# Distribution of per-row SHAP values for each feature.
shap.summary_plot(shap_values, X)

# Contribution breakdown for the first row only.
shap.force_plot(
    explainer.expected_value,
    shap_values[0, :],
    X.iloc[0, :],
    matplotlib=True,
)

lbl = LabelEncoder()

# Replace every object-typed column of X with integer codes, in place.
# The same encoder instance is refitted per column, so after the loop `lbl`
# only remembers the mapping of the last encoded column.
for i in X.select_dtypes(include='object').columns.values:
    # LabelEncoder works on a 1-D array of labels: pass the Series (X[i]),
    # not a one-column DataFrame (X[[i]]), which triggers a shape warning.
    X[i] = lbl.fit_transform(X[i])

def objective(trial):
    """Optuna objective for tuning an XGBoost classifier.

    Samples one hyperparameter configuration from ``trial``, fits an
    ``XGBClassifier`` on the training part of a stratified fold of the
    module-level ``X`` / ``y`` and scores it on the held-out part.

    Args:
        trial: Optuna trial used to sample hyperparameters and to report
            intermediate values to the pruning callback.

    Returns:
        Log-loss of the predicted probabilities on the validation fold
        (lower is better).
    """
    skf = StratifiedKFold(n_splits=10, random_state=22, shuffle=True)
    # NOTE(review): the notebook's indentation was lost in this export. The
    # fold loop is assumed to have contained only the two split assignments,
    # which leaves the LAST of the ten folds as the validation set. That fold
    # is selected explicitly here - confirm against the original notebook.
    train_index, test_index = list(skf.split(X, y))[-1]
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    param = {
        "verbosity": 0,
        "objective": "binary:logistic",
        # use exact for small dataset.
        "tree_method": trial.suggest_categorical("tree_method", ['exact', 'approx', 'hist']),
        "eta": trial.suggest_float("eta", 0.001, 1.0),
        #'interaction_constraints':[[2,3,8,12],[12,13,14],[5,6,9,12]],
        # defines booster, gblinear for linear functions.
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        # L2 regularization weight.
        "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
        # L1 regularization weight.
        "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        # sampling ratio for training data.
        "subsample": trial.suggest_float("subsample", 0.1, 1.0),
        # sampling according to each tree.
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.2, 1.0),
        "n_estimators": trial.suggest_categorical("n_estimators", [200, 400, 600, 800, 1000]),
    }

    # Both boosters that can be sampled ("gbtree", "dart") are tree boosters,
    # so the tree-shape parameters are always sampled.
    # maximum depth of the tree, signifies complexity of the tree.
    param["max_depth"] = trial.suggest_int("max_depth", 2, 60)
    # minimum child weight, larger the term more conservative the tree.
    param["min_child_weight"] = trial.suggest_int("min_child_weight", 2, 60)
    # defines how selective algorithm is.
    param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
    param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
        param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

    # Reports the validation log-loss after each boosting round so the
    # study's pruner can stop unpromising trials early.
    pruning_callback = optuna.integration.XGBoostPruningCallback(trial, "validation_0-logloss")

    # early_stopping_rounds is given once, to fit(), matching the other
    # XGBoost fit in this file (it used to be passed to the constructor too).
    xgb_clf = XGBClassifier(**param)
    xgb_clf.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        callbacks=[pruning_callback],
    )

    preds = xgb_clf.predict_proba(X_valid)
    return log_loss(y_valid, preds)


if __name__ == "__main__":
    study1 = optuna.create_study(
        direction='minimize',
        sampler=RandomSampler(),
        pruner=optuna.pruners.SuccessiveHalvingPruner(),
    )
    study1.optimize(objective, n_trials=100)
    print('Number of finished trials:', len(study1.trials))
    print('Best trial:', study1.best_trial.params)

# Best objective value of the XGBoost study, multiplied by 100.
100*study1.best_value

# Hyperparameters of the best XGBoost trial.
study1.best_params

# Objective value per trial, to see how the search progressed.
optuna.visualization.plot_optimization_history(study1)

# Objective value against each individual hyperparameter.
optuna.visualization.plot_slice(study1)

# Estimated importance of each hyperparameter for the objective.
optuna.visualization.plot_param_importances(study1)

# Hyperparameters used for the final XGBoost model.
params_xgb = {
    'alpha': 0.00013628501532199046,
    'booster': 'gbtree',
    'colsample_bytree': 0.4510459869291909,
    'eta': 0.23201860736451954,
    'gamma': 1.18221420230537e-05,
    'grow_policy': 'depthwise',
    'lambda': 7.036258743817963e-06,
    'max_depth': 6,
    'min_child_weight': 52,
    'n_estimators': 400,
    'subsample': 0.44069082236462886,
    'tree_method': 'exact',
}

# Replace spaces in the column names with underscores, in place.
X.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)

# Stratified 80/20 hold-out split on the renamed frame.
X_train1, X_valid1, y_train1, y_valid1 = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True, stratify=y
)

# XGBClassifier's logging level is controlled by `verbosity`; `verbose` is not
# one of its parameters and was only forwarded as an unknown booster option.
xgb_model = XGBClassifier(**params_xgb, verbosity=1)

xgb_model.fit(
    X_train1,
    y_train1,
    eval_set=[(X_valid1, y_valid1)],
    eval_metric='logloss',
    early_stopping_rounds=30,
    verbose=True,
)

# Draw a tree of the fitted booster and enlarge the figure so it is readable.
plot_tree(xgb_model)
fig = plt.gcf()
fig.set_size_inches(30, 30)

# SHAP values of the fitted XGBoost model over the full data set.
explainer_xgb = shap.TreeExplainer(xgb_model)
shap_values_xgb = explainer_xgb.shap_values(X)

# Global importance: mean absolute SHAP value per feature.
shap.summary_plot(shap_values_xgb, X, plot_type='bar')

def objective(trial):
    """Optuna objective for tuning a LightGBM classifier.

    Samples one hyperparameter configuration from ``trial``, fits an
    ``LGBMClassifier`` on the training part of a stratified fold of the
    module-level ``X`` / ``y`` and scores it on the held-out part.

    Args:
        trial: Optuna trial used to sample hyperparameters and to report
            intermediate values to the pruning callback.

    Returns:
        100 times the log-loss of the predicted probabilities on the
        validation fold (lower is better).
    """
    skf = StratifiedKFold(n_splits=10, random_state=22, shuffle=True)
    # NOTE(review): the notebook's indentation was lost in this export. The
    # fold loop is assumed to have contained only the two split assignments,
    # which leaves the LAST of the ten folds as the validation set. That fold
    # is selected explicitly here - confirm against the original notebook.
    train_index, test_index = list(skf.split(X, y))[-1]
    X_train, X_valid = X.iloc[train_index], X.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]

    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "boosting_type": trial.suggest_categorical("boosting_type", ['rf', 'gbdt']),
        "n_estimators": trial.suggest_categorical("n_estimators", [200, 400, 600, 800, 1000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 1.0),
        "num_leaves": trial.suggest_int("num_leaves", 20, 100),
        "max_depth": trial.suggest_int("max_depth", 2, 50),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 100),
        "lambda_l1": trial.suggest_int("lambda_l1", 1, 1000),
        "lambda_l2": trial.suggest_int("lambda_l2", 1, 1000),
        'random_state': trial.suggest_categorical('random_state', [24, 48, 2020]),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        # With step=0.1 starting at 0.2 the largest reachable value is 0.9;
        # the previous bound of 0.95 was not a multiple of the step and had
        # to be adjusted by Optuna with a warning.
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.9, step=0.1),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1, 2, 4, 6, 8, 10]),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.9, step=0.1),
    }

    model = lgbm.LGBMClassifier(objective="binary", **param_grid)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='logloss',
        early_stopping_rounds=50,
        callbacks=[
            LightGBMPruningCallback(trial, 'binary_logloss')
        ],  # Add a pruning callback
    )

    preds = model.predict_proba(X_valid)
    score = log_loss(y_valid, preds)
    return 100 * score


if __name__ == "__main__":
    # NOTE(review): ThresholdPruner(lower=0.0) prunes only when a reported
    # value drops below 0, which a log-loss cannot do - so this pruner looks
    # like it never fires. Confirm whether an `upper` bound was intended.
    study2 = optuna.create_study(
        direction='minimize',
        sampler=RandomSampler(),
        pruner=optuna.pruners.ThresholdPruner(lower=0.0),
    )
    study2.optimize(objective, n_trials=100)
    print('Number of finished trials:', len(study2.trials))
    print('Best trial:', study2.best_trial.params)

# Best objective value of the LightGBM study (its objective returns 100 * log-loss).
study2.best_value

# Hyperparameters of the best LightGBM trial.
study2.best_params

# Objective value per trial, to see how the search progressed.
optuna.visualization.plot_optimization_history(study2)

# Objective value against each individual hyperparameter.
optuna.visualization.plot_slice(study2)

# Estimated importance of each hyperparameter for the objective.
optuna.visualization.plot_param_importances(study2)

# Hyperparameters used for the final LightGBM model.
params_lgbm = {
    'bagging_fraction': 0.7,
    'bagging_freq': 8,
    'boosting_type': 'rf',
    'feature_fraction': 0.4,
    'lambda_l1': 9,
    'lambda_l2': 517,
    'learning_rate': 0.30902486498872506,
    'max_depth': 4,
    'min_data_in_leaf': 7,
    'min_gain_to_split': 1.776112821650258,
    'n_estimators': 400,
    'num_leaves': 36,
    'random_state': 2020,
}

lgb_model = lgbm.LGBMClassifier(objective="binary", **params_lgbm)

# Fit on the same hold-out split as the XGBoost model, with early stopping.
lgb_model.fit(
    X_train1,
    y_train1,
    eval_set=[(X_valid1, y_valid1)],
    eval_metric='logloss',
    early_stopping_rounds=30,
)

# Draw a tree of the fitted model and enlarge the figure so it is readable.
lgbm.plot_tree(lgb_model)
fig = plt.gcf()
fig.set_size_inches(30, 30)

# Validation metric recorded per boosting round.
lgbm.plot_metric(lgb_model)

# SHAP values of the fitted LightGBM model over the full data set.
explainer_lgb = shap.TreeExplainer(lgb_model)
shap_values_lgb = explainer_lgb.shap_values(X)

# Global importance: mean absolute SHAP value per feature.
shap.summary_plot(shap_values_lgb, X, plot_type='bar')

scl = StandardScaler()

# Standardise every non-object column of X in place, one column at a time.
# The single scaler is refitted per column, so after the loop it only holds
# the statistics of the last column processed.
for i in X.select_dtypes(exclude='object').columns.values:
    single_column = X[[i]]
    X[i] = scl.fit_transform(single_column)

from tensorflow import keras
from tensorflow.keras import layers

# Feed-forward binary classifier: batch-normalised inputs, then three
# Dense -> BatchNormalization -> Dropout stages (128, 64, 32 units) and a
# single sigmoid output unit.
nn = keras.Sequential([
    layers.BatchNormalization(input_shape=[X.shape[1]]),
    layers.Dense(units=128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.5),
    layers.Dense(units=64, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.4),
    layers.Dense(units=32, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(rate=0.3),
    layers.Dense(units=1, activation='sigmoid'),
])

# NOTE: despite its name, `auc` holds a binary *accuracy* metric, not AUC.
# The name is kept unchanged in case other cells refer to it.
auc = keras.metrics.BinaryAccuracy()
nn.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.BinaryCrossentropy(),
    metrics=[auc],
)

# Stop after 20 epochs without an improvement of at least 0.001 in the
# monitored quantity (Keras's default). restore_best_weights is False, so the
# model keeps the weights of the last epoch run, not those of the best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    patience=20,
    min_delta=0.001,
    restore_best_weights=False,
)

# Learning rate used for the first `epochs_drop` epochs of the schedule.
initial_learning_rate = 0.01


def lr_step_decay(epoch, lr, drop_rate=0.5, epochs_drop=10.0):
    """Return the step-decayed learning rate for ``epoch``.

    The rate starts at the module-level ``initial_learning_rate`` and is
    multiplied by ``drop_rate`` once every ``epochs_drop`` epochs.

    Args:
        epoch: Zero-based epoch index.
        lr: Current learning rate. It is part of the two-argument schedule
            signature but is not used: the result depends only on ``epoch``.
        drop_rate: Multiplicative factor applied at each drop.
        epochs_drop: Number of epochs between consecutive drops.

    Returns:
        The learning rate to use for ``epoch``.
    """
    # Imported locally so this cell does not depend on an import made elsewhere.
    import math

    completed_drops = math.floor(epoch / epochs_drop)
    return initial_learning_rate * math.pow(drop_rate, completed_drops)

# Scheduler callback that applies lr_step_decay each epoch and logs the rate.
callback_lr = keras.callbacks.LearningRateScheduler(
    lr_step_decay,
    verbose=1,
)

# Train the network, holding out 20% of the rows for validation.
history = nn.fit(
    X,
    y,
    validation_split=0.2,
    batch_size=64,
    epochs=100,
    callbacks=[early_stopping, callback_lr],
)

# Per-epoch metrics as a DataFrame, one row per epoch.
history_df = pd.DataFrame(history.history)

history_df.head()

# Overlay the validation log-loss curve of each fitted model on one figure.
plt.figure(figsize=(10, 10))
plt.plot(xgb_model.evals_result_['validation_0']['logloss'], label="xgb_validation_loss")
plt.plot(cat_model.evals_result_['validation']['Logloss'], label="catboost_validation_loss")
plt.plot(lgb_model.evals_result_['valid_0']['binary_logloss'], label="lgbm_validation_loss")
plt.plot(history_df['val_loss'], label="Keras_Nnet_validation_loss")
plt.legend()
# Draw the x tick labels in white.
plt.xticks(color='w')
plt.title("Model's Validation-Logloss Comparison")