!pip install torch==1.2.0 torchvision==0.4.0 tensorboardx LightGBM==2.2.1 scikit-learn==0.19.2 category-encoders tqdm
import torch
torch.__version__
import pandas as pd
# pd.set_option('display.max_columns', 200)
import numpy as np
import category_encoders as ce
from tqdm import tqdm
import collections, os
import gc
import pdb
# UCI white wine-quality dataset; the file is semicolon-delimited, not comma-delimited.
data_url = (
    'https://archive.ics.uci.edu/ml/'
    'machine-learning-databases/wine-quality/winequality-white.csv'
)
df = pd.read_csv(data_url, sep=';')
# Notebook display of the first two rows (no effect when run as a plain script).
df.head(2)
Output of df.head(2) (first two feature columns shown; all feature columns are float64):

   fixed acidity  volatile acidity
0            7.0              0.27
1            6.3              0.30
# Partition columns by dtype: numeric features vs. string/categorical features.
# NOTE(review): 'quality' is numeric, so it also lands in nume_col; presumably
# the downstream encoders drop label_col from the feature set — confirm.
nume_col = list(df.select_dtypes(include='number').columns)
cate_col = list(df.select_dtypes(include='object').columns)
label_col = 'quality'

# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)
# Write the raw train/test splits to disk for the offline encoders.
out_dir = '/work/data/data_offline'
# makedirs(..., exist_ok=True) also creates missing parents and is race-free,
# unlike the isdir()-then-mkdir pattern.
os.makedirs(out_dir, exist_ok=True)

# Encoder hyperparameters: min category frequency, cumulative coverage rate,
# and number of numeric bins (semantics defined by NumEncoder/CateEncoder —
# confirm against their implementation).
threshold = 10
thresrate = 0.99
num_bins = 32

test_csv_path = os.path.join(out_dir, 'test.csv')
train_csv_path = os.path.join(out_dir, 'train.csv')
# index=False: without it pandas writes the row index as an extra unnamed
# column, which the downstream encoders would pick up as a spurious feature
# (the later "train_x shape: (3918, 12)" log is consistent with that).
X_train.to_csv(train_csv_path, index=False)
X_test.to_csv(test_csv_path, index=False)
# Encode all features numerically and dump train/test artifacts under out_dir_num.
out_dir_num = '/work/data/data_offline_num'
# Robust directory creation: also makes missing parents, no TOCTOU race.
os.makedirs(out_dir_num, exist_ok=True)

# NumEncoder is project code (not visible here); presumably it filters rare
# categories by `threshold`/`thresrate` and target-encodes the rest — confirm.
ec = NumEncoder(cate_col, nume_col, threshold, thresrate, label_col)
# Fit on the training split, then apply the fitted transform to the test split
# so no test-set statistics leak into the encoding.
ec.fit_transform(train_csv_path, os.path.join(out_dir_num, 'train'))
ec.transform(test_csv_path, os.path.join(out_dir_num, 'test'))
----------------------------------------------------------------------
Fitting and Transforming /work/data/data_offline/train.csv .
----------------------------------------------------------------------
Filtering and fillna features
0it [00:00, ?it/s]
100%|██████████| 12/12 [00:00<00:00, 693.28it/s]
Ordinal encoding cate features
Target encoding cate features
0it [00:00, ?it/s]
Start manual binary encode
100%|██████████| 12/12 [00:00<00:00, 18.97it/s]
0it [00:00, ?it/s]
----------------------------------------------------------------------
Transforming /work/data/data_offline/test.csv .
----------------------------------------------------------------------
Filtering and fillna features
0it [00:00, ?it/s]
100%|██████████| 12/12 [00:00<00:00, 2227.95it/s]
Ordinal encoding cate features
Target encoding cate features
0it [00:00, ?it/s]
Start manual binary encode
100%|██████████| 12/12 [00:00<00:00, 19.57it/s]
0it [00:00, ?it/s]
# Encode all features categorically (numeric features bucketized into num_bins)
# and dump train/test artifacts under out_dir_cate.
out_dir_cate = '/work/data/data_offline_cate'
# Robust directory creation: also makes missing parents, no TOCTOU race.
os.makedirs(out_dir_cate, exist_ok=True)

# CateEncoder is project code (not visible here); the log suggests it ordinal-
# encodes categories and bucketizes numerics — confirm against its source.
ec = CateEncoder(cate_col, nume_col, threshold, thresrate, num_bins, label_col)
# Trailing slash kept deliberately: the loader logs show it reads from
# "data_offline_cate/train/", i.e. the prefix is used verbatim.
ec.fit_transform(train_csv_path, out_dir_cate + '/train/')
ec.transform(test_csv_path, out_dir_cate + '/test/')
----------------------------------------------------------------------
Fitting and Transforming /work/data/data_offline/train.csv .
----------------------------------------------------------------------
Filtering and fillna features
0it [00:00, ?it/s]
Fillna and Bucketize numeric features
100%|██████████| 12/12 [00:00<00:00, 315.16it/s]
Ordinal encoding cate features
----------------------------------------------------------------------
Transforming /work/data/data_offline/test.csv .
----------------------------------------------------------------------
Filtering and fillna features
0it [00:00, ?it/s]
100%|██████████| 12/12 [00:00<00:00, 545.39it/s]
Ordinal encoding cate features
Train DeepGBM
# Make the project's model modules importable by the training entry point.
import sys
sys.path.insert(0, '/work/data/models')
# Launch DeepGBM training as a subprocess (Jupyter "!" shell escape).
# NOTE(review): the flags presumably map to main.py's argparse options —
# regression task, Adam optimizer, 20 epochs, 5 random seeds, GBDT with 64
# leaves distilled into embeddings of size 20; confirm against /work/main.py.
!python /work/main.py -data data_offline -batch_size 512 -plot_title 'paper_0201' \
-max_epoch 20 -lr 1e-3 -opt Adam -test_batch_size 128 -model deepgbm \
-task regression -l2_reg 1e-6 -test_freq 50 -seed 1,2,3,4,5 -group_method Random \
-emb_epoch 2 -loss_de 2 -loss_dr 0.7 -tree_lr 0.1 -cate_layers 16,16 -nslices 5 \
-tree_layers 100,100,100,50 -embsize 20 -maxleaf 64 -log_freq 50
2021-12-26 20:15:03,726 [INFO] data loaded.
train_x shape: (3918, 12). train_y shape: (3918, 1).
test_x shape: (980, 12). test_y shape: (980, 1).
loaded from data//data_offline_cate/train/.
loaded from data//data_offline_cate/test/.
2021-12-26 20:15:03,747 [INFO] Categorical data loaded.
train_x shape: (3918, 12). train_y shape: (3918, 1).
test_x shape: (980, 12). test_y shape: (980, 1).
[LightGBM] [Info] Total Bins 1349
[LightGBM] [Info] Number of data: 3918, number of used features: 12
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[1] valid_0's l2: 28.8602
Training until validation scores don't improve for 20 rounds.
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[2] valid_0's l2: 23.3746
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[3] valid_0's l2: 18.9312
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[4] valid_0's l2: 15.3323
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
[5] valid_0's l2: 12.4181
[6] valid_0's l2: 10.0927
[7] valid_0's l2: 8.17432
[8] valid_0's l2: 6.6204
[9] valid_0's l2: 5.36224
[10] valid_0's l2: 4.34353
[11] valid_0's l2: 3.51785
[12] valid_0's l2: 2.84947
[13] valid_0's l2: 2.32035
[14] valid_0's l2: 1.89044
[15] valid_0's l2: 1.53176
[16] valid_0's l2: 1.2411
[17] valid_0's l2: 1.006
[18] valid_0's l2: 0.815457
[19] valid_0's l2: 0.661149
[20] valid_0's l2: 0.536325
[21] valid_0's l2: 0.435143
[22] valid_0's l2: 0.355798
[23] valid_0's l2: 0.289072
[24] valid_0's l2: 0.234945
[25] valid_0's l2: 0.19292
[26] valid_0's l2: 0.157193
[27] valid_0's l2: 0.128321
[28] valid_0's l2: 0.104879
[29] valid_0's l2: 0.085969
[30] valid_0's l2: 0.0706291
[31] valid_0's l2: 0.0581826
[32] valid_0's l2: 0.0481455
[33] valid_0's l2: 0.0400125
[34] valid_0's l2: 0.0334011
[35] valid_0's l2: 0.0280634
[36] valid_0's l2: 0.0237327
[37] valid_0's l2: 0.0202725
[38] valid_0's l2: 0.0174394
[39] valid_0's l2: 0.0151574
[40] valid_0's l2: 0.0133604
[41] valid_0's l2: 0.0118826
[42] valid_0's l2: 0.0106659
[43] valid_0's l2: 0.00974625
[44] valid_0's l2: 0.00892071
[45] valid_0's l2: 0.00828114
[46] valid_0's l2: 0.00775478
[47] valid_0's l2: 0.00734878
[48] valid_0's l2: 0.0070255
[49] valid_0's l2: 0.00680749
[50] valid_0's l2: 0.00657801
[51] valid_0's l2: 0.00640597
[52] valid_0's l2: 0.00628913
[53] valid_0's l2: 0.00619658
[54] valid_0's l2: 0.00611129
[55] valid_0's l2: 0.00600852
[56] valid_0's l2: 0.00595943
[57] valid_0's l2: 0.00588267
[58] valid_0's l2: 0.00585195
[59] valid_0's l2: 0.00576745
[60] valid_0's l2: 0.00574538
[61] valid_0's l2: 0.00570889
[62] valid_0's l2: 0.00568758
[63] valid_0's l2: 0.00565424
[64] valid_0's l2: 0.00564084
[65] valid_0's l2: 0.00559572
[66] valid_0's l2: 0.00557614
[67] valid_0's l2: 0.00558327
[68] valid_0's l2: 0.00558008
[69] valid_0's l2: 0.00557734
[70] valid_0's l2: 0.00553099
[71] valid_0's l2: 0.00556286
[72] valid_0's l2: 0.00553455
[73] valid_0's l2: 0.00552231
[74] valid_0's l2: 0.00551154
[75] valid_0's l2: 0.00550066
[76] valid_0's l2: 0.00550182
[77] valid_0's l2: 0.00551161
[78] valid_0's l2: 0.0055286
[79] valid_0's l2: 0.00547652
[80] valid_0's l2: 0.00543651
[81] valid_0's l2: 0.00540197
[82] valid_0's l2: 0.00541317
[83] valid_0's l2: 0.00540328
[84] valid_0's l2: 0.00540826
[85] valid_0's l2: 0.00540427
[86] valid_0's l2: 0.00544133
[87] valid_0's l2: 0.00547701
[88] valid_0's l2: 0.0054919
[89] valid_0's l2: 0.00549112
[90] valid_0's l2: 0.00549627
[91] valid_0's l2: 0.00547389
[92] valid_0's l2: 0.00547925
[93] valid_0's l2: 0.00550259
[94] valid_0's l2: 0.00550739
[95] valid_0's l2: 0.00552155
[96] valid_0's l2: 0.00551727
[97] valid_0's l2: 0.00547091
[98] valid_0's l2: 0.00542945
[99] valid_0's l2: 0.00542732
[100] valid_0's l2: 0.00542483
Did not meet early stopping. Best iteration is:
[81] valid_0's l2: 0.00540197
Model Interpreting...
[(17,), (16,), (16,), (16,), (16,)]
emb-Evaluate Result:
Epoch-000 8 Batches, Step 8, Testing Loss: 34.826273, Used Time: 0.0m, Remaining Time: 0.0m
-------------------------------------------------------------------------------
Best Metric: 34.826272972262636
####################################################################################
emb-Evaluate Result:
Epoch-001 8 Batches, Step 16, Testing Loss: 32.158336, Used Time: 0.0m, Remaining Time: 0.0m
-------------------------------------------------------------------------------
Best Metric: 32.15833629686005
####################################################################################
Final Best Metric: 32.15833629686005
Init GBDT2NN
Init GBDT2NN succeed!
Cuda is not available, automatically changed into cpu model
The model is deepfm(fm+deep layers)
Init fm part
Init fm part succeed
Init deep part
Init deep part succeed
Init succeed
Init DeepGBM succeed!
Alpha: Parameter containing:
tensor(0., requires_grad=True)
Beta: Parameter containing:
tensor(0., requires_grad=True)
Traceback (most recent call last):
File "/work/main.py", line 81, in <module>
main()
File "/work/main.py", line 78, in main
train_DEEPGBM(args, num_data, cate_data, plot_title)
File "/work/train_models.py", line 157, in train_DEEPGBM
train_x_opt=train_xc, test_x_opt=test_xc)
File "/work/helper.py", line 168, in TrainWithLog
test_loss, pred_y = EvalTestset(test_x, test_y, model, args.test_batch_size, test_x_opt)
File "/work/helper.py", line 80, in EvalTestset
outputs = model(inputs, inputs_opt)
File "/root/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__
result = self.forward(*input, **kwargs)
File "/work/models/deepgbm.py", line 65, in forward
deepfm_out = self.deepfm(Xd)
File "/root/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__
result = self.forward(*input, **kwargs)
File "/work/models/deepfm.py", line 151, in forward
fm_first_order = self.fm_first_order_embedding(Xi.view(X.size(0)*self.field_size)).view(X.size(0), -1)
File "/root/venv/lib/python3.7/site-packages/torch/nn/modules/module.py", line 547, in __call__
result = self.forward(*input, **kwargs)
File "/root/venv/lib/python3.7/site-packages/torch/nn/modules/sparse.py", line 114, in forward
self.norm_type, self.scale_grad_by_freq, self.sparse)
File "/root/venv/lib/python3.7/site-packages/torch/nn/functional.py", line 1467, in embedding
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: index out of range: Tried to access index -1 out of table with 347 rows. at /pytorch/aten/src/TH/generic/THTensorEvenMoreMath.cpp:237