!pip install torch==1.2.0 torchvision==0.4.0 tensorboardx LightGBM==2.2.1 scikit-learn==0.19.2 category-encoders tqdm
import torch
torch.__version__
import pandas as pd
# pd.set_option('display.max_columns', 200)
import numpy as np
import category_encoders as ce
from tqdm import tqdm
import collections, os
import gc
import pdb
# Load the UCI white-wine quality dataset (a semicolon-separated CSV).
file_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
df = pd.read_csv(file_url, sep=';')
df.head(2)

# Target column. Declared before the feature lists so it can be excluded
# from them.
label_col = 'quality'

# Feature columns grouped by dtype. `quality` is a numeric score, so an
# unfiltered select_dtypes('number') would list the target among the numeric
# features handed to the encoders; filter it out of both lists.
nume_col = [
    col for col in df.select_dtypes('number').columns if col != label_col
]
cate_col = [
    col for col in df.select_dtypes('object').columns if col != label_col
]

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test = train_test_split(df, test_size=0.2, random_state=42)
# Directory that receives the raw train/test CSV splits.
out_dir = '/work/data/data_offline'
# makedirs(exist_ok=True) creates missing parent directories too and is a
# no-op when the directory already exists, so no separate isdir check is needed.
os.makedirs(out_dir, exist_ok=True)

# Settings passed to the NumEncoder / CateEncoder constructors below.
threshold = 10
thresrate = 0.99
num_bins = 32

test_csv_path = os.path.join(out_dir, 'test.csv')
train_csv_path = os.path.join(out_dir, 'train.csv')

# Persist both splits; the encoders below take these CSV paths as input.
X_train.to_csv(train_csv_path)
X_test.to_csv(test_csv_path)
# Output location for the NumEncoder results.
out_dir_num = '/work/data/data_offline_num'
# Create the directory (and any missing parents); a no-op if it already exists.
os.makedirs(out_dir_num, exist_ok=True)

# Fit the encoder on the training CSV, then apply the fitted encoder to the
# test CSV. The second argument of each call is the output path prefix.
ec = NumEncoder(cate_col, nume_col, threshold, thresrate, label_col)
ec.fit_transform(train_csv_path, out_dir_num + '/train')
ec.transform(test_csv_path, out_dir_num + '/test')
# Output location for the CateEncoder results.
out_dir_cate = '/work/data/data_offline_cate'
# Create the directory (and any missing parents); a no-op if it already exists.
os.makedirs(out_dir_cate, exist_ok=True)

# Fit the encoder on the training CSV, then apply the fitted encoder to the
# test CSV. Unlike the numeric step, the output paths here end with a
# trailing slash; they are kept exactly as given.
ec = CateEncoder(cate_col, nume_col, threshold, thresrate, num_bins, label_col)
ec.fit_transform(train_csv_path, out_dir_cate + '/train/')
ec.transform(test_csv_path, out_dir_cate + '/test/')
# Train DeepGBM
import sys
# Put /work/data/models at the front of this interpreter's module search path.
# NOTE(review): the `!python` command below runs as a separate process, so it
# presumably does not see this sys.path change — confirm it is still needed.
sys.path.insert(0, '/work/data/models')
# Run /work/main.py as a shell command (notebook `!` syntax) with:
#   -model deepgbm, -task regression, -max_epoch 20, -opt Adam, -lr 1e-3,
#   -batch_size 512, -test_batch_size 128, -seed 1,2,3,4,5.
# `-data data_offline` presumably refers to the data_offline* directories
# written by the preprocessing steps — verify against main.py.
!python /work/main.py -data data_offline -batch_size 512 -plot_title 'paper_0201' \
-max_epoch 20 -lr 1e-3 -opt Adam -test_batch_size 128 -model deepgbm \
-task regression -l2_reg 1e-6 -test_freq 50 -seed 1,2,3,4,5 -group_method Random \
-emb_epoch 2 -loss_de 2 -loss_dr 0.7 -tree_lr 0.1 -cate_layers 16,16 -nslices 5 \
-tree_layers 100,100,100,50 -embsize 20 -maxleaf 64 -log_freq 50