library(dplyr)
library(tidyr)
library(stringr)
library(purrr)
library(ggplot2)
library(MLmetrics)
library(data.table)
library(lightgbm)
library(FNN)
dat <-
  fread("Data/train.csv", data.table = F) %>%
  mutate(is_test = 0) %>%
  bind_rows(
    fread("Data/test.csv", data.table = F) %>%
      mutate(is_test = 1)
  )
dat[1:5, -1]
dat %>% group_by(Ticket) %>% summarise(n = n()) %>% arrange(desc(n))
dat %>% group_by(Embarked) %>% summarise(n = n()) %>% arrange(desc(n))
dat <- dat %>%
  mutate(Ticket = str_replace_all(Ticket, "\\.", "")) %>%
  add_count(Ticket) %>%
  rename(Ticket_N = n) %>%
  separate(Ticket, into = c("Ticket_1", "Ticket_2"), sep = "/", remove = T) %>%
  separate(Ticket_1, into = c("Ticket_11", "Ticket_12"), sep = " ", remove = T) %>%
  separate(Ticket_2, into = c("Ticket_21", "Ticket_22"), sep = " ", remove = T) %>%
  mutate_if(negate(is.numeric), function(x){as.numeric(as.factor(x))})
Warning message:
“Expected 2 pieces. Missing pieces filled with `NA` in 185200 rows [1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, ...].”
Warning message:
“Expected 2 pieces. Missing pieces filled with `NA` in 165668 rows [1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 22, 23, 24, ...].”
Warning message:
“Expected 2 pieces. Missing pieces filled with `NA` in 1586 rows [7, 86, 273, 320, 426, 449, 488, 691, 693, 745, 823, 833, 1070, 1486, 1898, 2077, 2230, 2299, 2311, 2384, ...].”
dat[1:5, -1]
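## Illustrative only: how the separate() steps above break up a single ticket
## string. "STON/O2 3101282" is a made-up example value, not a row from the data;
## its Ticket_1 piece has no space, which triggers the same "Missing pieces"
## warning shown above.
tibble(Ticket = "STON/O2 3101282") %>%
  separate(Ticket, into = c("Ticket_1", "Ticket_2"), sep = "/") %>%
  separate(Ticket_1, into = c("Ticket_11", "Ticket_12"), sep = " ") %>%
  separate(Ticket_2, into = c("Ticket_21", "Ticket_22"), sep = " ")
## -> Ticket_11 = "STON", Ticket_12 = NA, Ticket_21 = "O2", Ticket_22 = "3101282"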
non_feats <- c("PassengerId", "Survived", "is_test")
feats <- setdiff(colnames(dat), non_feats)
NMODELS <- 50
p_test <- matrix(0, nrow = dat %>% filter(is_test == 1) %>% nrow(), ncol = NMODELS)
train_fold_bagged <- function(model, seed_list){
  set.seed(seed_list[model])
  ## Split the training data into a bootstrap (in-bag) sample and an out-of-bag validation set
  train_dat <- dat[dat$is_test == 0, ]
  tr_ind  <- sample(1:nrow(train_dat), size = nrow(train_dat), replace = T)
  val_ind <- setdiff(1:nrow(train_dat), tr_ind)
  ## Define X and y for both train and validation
  x_train <- train_dat[tr_ind, feats]
  y_train <- train_dat[tr_ind, ]$Survived
  x_val   <- train_dat[val_ind, feats]
  y_val   <- train_dat[val_ind, ]$Survived
  ## LightGBM parameters
  p <- list(objective = "binary",
            metric = "auc",
            boosting = "gbdt",
            learning_rate = 0.01,
            sub_feature = 0.8,
            sub_row = 0.8,
            seed = seed_list[model])
  ## Create LightGBM datasets for both train and validation
  cat_feats <- c("Sex", "Pclass", "Ticket_11", "Ticket_12", "Ticket_21", "Ticket_22")
  xtr  <- lgb.Dataset(as.matrix(x_train), label = y_train, categorical_feature = cat_feats)
  xval <- lgb.Dataset(as.matrix(x_val), label = y_val, categorical_feature = cat_feats)
  ## Train for up to 5000 rounds with early stopping on the validation AUC
  m_lgb <- lgb.train(params = p,
                     data = xtr,
                     nrounds = 5000,
                     valids = list(val = xval),
                     early_stopping_rounds = 100, #500,
                     eval_freq = 250,
                     verbose = -1)
  ## Predict on the validation data
  p_val <- predict(m_lgb, as.matrix(x_val), num_iteration = m_lgb$best_iter)
  ## Convert probabilities to binary labels at a given quantile threshold and score the accuracy
  find_thresh <- function(thresh, p){
    p_bin <- ifelse(p >= quantile(p, thresh), 1, 0)
    score <- MLmetrics::Accuracy(y_pred = p_bin, y_true = y_val)
    return(tibble(score, thresh))
  }
  ## Search quantile thresholds from 0.4 to 0.7 in steps of 0.001 for the best validation accuracy
  thresh_df <- purrr::map_dfr(seq(.4, .7, .001), find_thresh, p = p_val)
  best_thresh <- arrange(thresh_df, desc(score)) %>% head(1) %>% .$thresh
  best_acc    <- arrange(thresh_df, desc(score)) %>% head(1) %>% .$score
  ## Report the best accuracy achieved and the best AUC score
  cat("model #:", model, "seed:", seed_list[model], "ACC:", best_acc, "AUC:", m_lgb$best_score, "\n")
  ## Predict on the test data for the competition submission
  p_te <- predict(m_lgb, as.matrix(dat %>% filter(is_test == 1) %>% select(all_of(feats))), num_iteration = m_lgb$best_iter)
  p_te <- ifelse(p_te >= quantile(p_te, best_thresh), 1, 0)
  p_test[, model] <<- p_te
  return(tibble(auc_scores = m_lgb$best_score, acc_scores = best_acc))
}
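## Illustrative only (helper names n_tr / boot exist just for this check): the
## bootstrap split inside train_fold_bagged() leaves roughly 1 - 1/e of the
## training rows out-of-bag, and those rows become each model's validation set.
n_tr <- sum(dat$is_test == 0)
boot <- sample(seq_len(n_tr), size = n_tr, replace = TRUE)
length(setdiff(seq_len(n_tr), boot)) / n_tr  # typically close to 0.368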
seed_list <- sample(1:1000,NMODELS)
out <- purrr::map_dfr(1:length(seed_list), train_fold_bagged, seed_list)
cat("final model scores:", "ACC:", mean(out$acc_scores), "AUC:", mean(out$auc_scores), "\n")
model #: 1 seed: 470 ACC: 0.7824624 AUC: 0.8543032
model #: 2 seed: 843 ACC: 0.7849791 AUC: 0.8567712
model #: 3 seed: 343 ACC: 0.7837757 AUC: 0.8558361
model #: 4 seed: 118 ACC: 0.7856013 AUC: 0.8562494
model #: 5 seed: 249 ACC: 0.7837027 AUC: 0.8553962
model #: 6 seed: 786 ACC: 0.7817762 AUC: 0.853922
model #: 7 seed: 747 ACC: 0.7794706 AUC: 0.8505838
model #: 8 seed: 807 ACC: 0.7855219 AUC: 0.8585228
model #: 9 seed: 157 ACC: 0.7836643 AUC: 0.8560131
model #: 10 seed: 26 ACC: 0.7862039 AUC: 0.8573822
model #: 11 seed: 306 ACC: 0.7801303 AUC: 0.8537842
model #: 12 seed: 656 ACC: 0.789253 AUC: 0.8600723
model #: 13 seed: 185 ACC: 0.7824354 AUC: 0.8542458
model #: 14 seed: 952 ACC: 0.7833129 AUC: 0.8551152
model #: 15 seed: 129 ACC: 0.787293 AUC: 0.8575129
model #: 16 seed: 135 ACC: 0.7866529 AUC: 0.8580856
model #: 17 seed: 298 ACC: 0.7817411 AUC: 0.855127
model #: 18 seed: 514 ACC: 0.7835681 AUC: 0.8571224
model #: 19 seed: 535 ACC: 0.7806592 AUC: 0.8539657
model #: 20 seed: 741 ACC: 0.7855809 AUC: 0.8572515
model #: 21 seed: 694 ACC: 0.7825696 AUC: 0.8543187
model #: 22 seed: 337 ACC: 0.7844811 AUC: 0.8576231
model #: 23 seed: 295 ACC: 0.7872839 AUC: 0.8572733
model #: 24 seed: 342 ACC: 0.7836005 AUC: 0.8558735
model #: 25 seed: 492 ACC: 0.7855921 AUC: 0.8567851
model #: 26 seed: 105 ACC: 0.7817869 AUC: 0.8546047
model #: 27 seed: 648 ACC: 0.7842387 AUC: 0.8568261
model #: 28 seed: 828 ACC: 0.7871297 AUC: 0.8579704
model #: 29 seed: 445 ACC: 0.7873966 AUC: 0.8589307
model #: 30 seed: 876 ACC: 0.783144 AUC: 0.8543877
model #: 31 seed: 957 ACC: 0.7846763 AUC: 0.8554405
model #: 32 seed: 33 ACC: 0.7839175 AUC: 0.8555747
model #: 33 seed: 234 ACC: 0.7846813 AUC: 0.8567741
model #: 34 seed: 12 ACC: 0.7841309 AUC: 0.8580328
model #: 35 seed: 665 ACC: 0.784364 AUC: 0.8556491
model #: 36 seed: 441 ACC: 0.7822082 AUC: 0.8542989
model #: 37 seed: 794 ACC: 0.7847044 AUC: 0.8564387
model #: 38 seed: 836 ACC: 0.78236 AUC: 0.8544833
model #: 39 seed: 179 ACC: 0.7824436 AUC: 0.8560015
model #: 40 seed: 748 ACC: 0.7818172 AUC: 0.8551899
model #: 41 seed: 529 ACC: 0.7857707 AUC: 0.8585893
model #: 42 seed: 263 ACC: 0.7833193 AUC: 0.8548201
model #: 43 seed: 739 ACC: 0.7869035 AUC: 0.8577615
model #: 44 seed: 677 ACC: 0.7846049 AUC: 0.8559847
model #: 45 seed: 608 ACC: 0.7852589 AUC: 0.8576003
model #: 46 seed: 297 ACC: 0.783462 AUC: 0.8555373
model #: 47 seed: 414 ACC: 0.7879141 AUC: 0.8592708
model #: 48 seed: 494 ACC: 0.7856968 AUC: 0.8565074
model #: 49 seed: 921 ACC: 0.7839069 AUC: 0.855788
model #: 50 seed: 519 ACC: 0.7833518 AUC: 0.8545504
final model scores: ACC: 0.78421 AUC: 0.856123
p_majority <- apply(p_test, 1, function(x) as.numeric(names(which.max(table(x)))))
sub <- fread("Data/sample_submission.csv") %>%
  mutate(Survived = p_majority)
fwrite(sub, "submission.csv")
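## Optional sanity checks, not required for the submission: the class balance of
## the bagged labels, and how strongly the individual models agree with the
## majority vote (vote_share is a helper name introduced here; rowMeans(p_test)
## is the share of models voting 1 for each test row).
table(sub$Survived)
vote_share <- rowMeans(p_test)
summary(ifelse(sub$Survived == 1, vote_share, 1 - vote_share))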