library(tidyverse)
library(tidymodels)
library(psych)
library(ROSE)
# Load the raw data set. `TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned.
company <- read.csv(
  file = "../input/company-bankruptcy-prediction/data.csv",
  header = TRUE
)

# List the integer-typed columns (used to spot which variables are binary).
company %>%
  select(where(is.integer)) %>%
  colnames()

# Turn the 0/1 outcome into a labelled factor. This must happen BEFORE the
# integer columns are dropped below, otherwise the outcome would be removed
# together with them.
company$Bankrupt. <- factor(
  company$Bankrupt.,
  levels = c(0, 1),
  labels = c("NO", "YES")
)

# Drop every column that is still integer-typed.
company <- company %>% select(-where(is.integer))

# Disabled: factorize the other binary variables with labels.
# NOTE(review): presumably these two flags are among the integer columns
# dropped above, so re-enabling these lines would require moving them before
# that statement - confirm.
#company$Liability.Assets.Flag <- factor(company$Liability.Assets.Flag, levels = c(0,1) , labels = c('NO', 'Exceeds'))
#company$Net.Income.Flag <- factor(company$Net.Income.Flag , levels = c(0,1) , labels = c('NO' , 'Negative'))
# Exploratory histograms of every numeric column, plus a closer look at
# Current.Ratio.
company %>%
  select(where(is.numeric)) %>%
  multi.hist()
hist(company$Current.Ratio)

# Histograms restricted to the numeric columns whose mean exceeds 10.
company %>%
  select(where(~ is.numeric(.x) && mean(.x) > 10)) %>%
  multi.hist()

# Numeric columns whose mean is above 10^8 (Total.Asset.Growth.Rate is
# exempt) are removed from the data set altogether.
abnormal_col <- company %>%
  select(where(is.numeric), -Total.Asset.Growth.Rate) %>%
  select(where(~ mean(.x) > 1e8)) %>%
  colnames()
company_processed <- company %>%
  select(-c(all_of(abnormal_col)))

# Among the remaining numeric columns with mean above 10 (again sparing
# Total.Asset.Growth.Rate), keep only the rows in which every such column is
# below 10, i.e. discard the observations carrying the outliers.
abnormal_col <- company_processed %>%
  select(where(~ is.numeric(.x) && mean(.x) > 10)) %>%
  select(-Total.Asset.Growth.Rate) %>%
  colnames()
company_processed <- company_processed %>%
  filter_at(vars(all_of(abnormal_col)), all_vars(. < 10))

# Re-inspect the numeric distributions after the clean-up.
company_processed %>%
  select(where(is.numeric)) %>%
  multi.hist()
# Reduce the cleaned data to the outcome plus the hand-picked predictors that
# go into the model. Column order follows the order listed here.
company_remove_col <- company_processed %>%
  select(all_of(c(
    "Bankrupt.",
    "ROA.B..before.interest.and.depreciation.after.tax",
    "Operating.Gross.Margin",
    "Net.Value.Per.Share..A.",
    "Realized.Sales.Gross.Profit.Growth.Rate",
    "Total.Asset.Growth.Rate",
    "Net.Value.Growth.Rate",
    "Quick.Ratio",
    "Debt.ratio..",
    "Revenue.per.person",
    "Long.term.Liability.to.Current.Assets",
    "Total.income.Total.expense",
    "Liability.to.Equity",
    "Equity.to.Liability"
  )))
# Split into training (70%) and test (30%) sets. The RNG is seeded first so
# the partition - and everything derived from it - is reproducible between
# runs; previously only the ROSE step was seeded.
set.seed(4)
com_split <- company_remove_col %>% initial_split(prop = 0.7)

# Preprocessing recipe, estimated on the training set only:
#   1. step_corr   - drop predictors that are highly correlated with others
#   2. step_center - subtract each predictor's training mean
#   3. step_scale  - divide each predictor by its training standard deviation
# all_predictors() already excludes the outcome, so no extra exclusion of
# all_outcomes() is needed.
com_recipe <- com_split %>%
  training() %>%
  recipe(Bankrupt. ~ .) %>%
  step_corr(all_predictors()) %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  prep()

# Preprocessed training data (as retained by the prepped recipe).
com_training <- juice(com_recipe)

# Test data passed through the same, training-estimated preprocessing.
com_test <- bake(com_recipe, new_data = testing(com_split))

# Rebalance the classes of the TRAINING set with ROSE (Random Over-Sampling
# Examples; this is not SMOTE). The test set is left untouched.
rose <- ROSE(Bankrupt. ~ ., data = com_training, seed = 4)
com_rose <- rose$data
# Prepare validation set ----
# 10-fold cross-validation over the ROSE-resampled training data; each fold's
# analysis and assessment data frames are stored as list-columns.
# NOTE(review): the folds are drawn from already resampled data, so the
# assessment sets contain ROSE-generated rows and the metrics below are likely
# optimistic - treat them as a sanity check, not as a performance estimate.
com_vfold <- vfold_cv(com_rose, v = 10)
com_val <- com_vfold %>%
  mutate(
    com_ana = map(splits, analysis),
    com_ass = map(splits, assessment)
  )

# Validation ----
# Random-forest classifier specification (randomForest engine).
rand_forest_model <- rand_forest() %>%
  set_engine("randomForest") %>%
  set_mode("classification")

# Fit one model per fold on that fold's analysis set and predict its
# assessment set.
#
# com_rose was built from the juiced training data, i.e. it has already been
# filtered, centered and scaled by com_recipe, so the fold data is used as-is.
# The earlier version called prep(com_recipe, training = <fold>) on the
# already-trained recipe: with the default fresh = FALSE, prep() ignores the
# supplied data, so juice() returned the same original training set for every
# fold and bake() normalized the assessment rows a second time.
com_val <- com_val %>%
  mutate(
    com_fit = map(
      com_ana,
      ~ fit(rand_forest_model, Bankrupt. ~ ., data = .x)
    )
  ) %>%
  mutate(
    model_pred = map2(com_fit, com_ass, ~ predict(.x, new_data = .y))
  )

# Pair the true and predicted classes per fold and stack them into one table.
# The table is intentionally left grouped by fold id so that metrics() reports
# one set of metrics per fold.
com_pred <- com_val %>%
  mutate(
    res = map2(
      com_ass, model_pred,
      ~ data.frame(Bankrupt. = .x$Bankrupt., pred = .y$.pred_class)
    )
  ) %>%
  select(id, res) %>%
  unnest(res) %>%
  group_by(id)

metrics(com_pred, truth = Bankrupt., estimate = pred) # Metrics for validation
# Test-set evaluation ----
# The random forest is refitted n_runs times on the ROSE-resampled training
# data and scored on the held-out test set each time, so the run-to-run
# variation of the fit can be seen in the results.
n_runs <- 10

# Preallocate the outputs instead of growing them inside the loop.
result <- vector("list", n_runs)
bankrupt_accuracy <- numeric(n_runs)

for (i in seq_len(n_runs)) {
  com_rf <- rand_forest_model %>% fit(Bankrupt. ~ ., data = com_rose)

  # Predicted class next to the true class for every test row.
  com_pre <- com_rf %>%
    predict(com_test) %>%
    bind_cols(com_test %>% select(Bankrupt.))

  # yardstick treats the FIRST factor level as the event by default; for
  # Bankrupt. (levels NO, YES) that is "NO". event_level = "second" makes
  # precision, recall and F1 describe the bankrupt ("YES") class instead.
  accu <- accuracy(com_pre, truth = Bankrupt., estimate = .pred_class)
  prec <- yardstick::precision(
    com_pre,
    truth = Bankrupt., estimate = .pred_class, event_level = "second"
  )
  rec <- yardstick::recall(
    com_pre,
    truth = Bankrupt., estimate = .pred_class, event_level = "second"
  )
  f1 <- f_meas(
    com_pre,
    truth = Bankrupt., estimate = .pred_class, event_level = "second"
  )
  result[[i]] <- bind_rows(accu, prec, rec, f1)

  # Confusion matrix as a tibble: one row per predicted class, one column per
  # true class.
  conf_matrix <- com_pre %>%
    conf_mat(Bankrupt., .pred_class) %>%
    pluck(1) %>%
    as_tibble() %>%
    pivot_wider(names_from = Truth, values_from = n)

  # Share of truly bankrupt companies that were predicted as bankrupt. The
  # row is picked by label rather than by position.
  bankrupt_accuracy[i] <-
    conf_matrix$YES[conf_matrix$Prediction == "YES"] / sum(conf_matrix$YES)
}

result
paste0("The mean of bankrupt_accuracy : ", mean(bankrupt_accuracy))

# ROC curve / AUC of the last fitted model on the test set. The curve is built
# from the predicted probability of the "YES" class; hard class labels would
# give only a single operating point.
com_prob <- predict(com_rf, new_data = com_test, type = "prob")
roc.curve(response = com_test$Bankrupt., predicted = com_prob$.pred_YES)