A Company Bankruptcy Prediction

# Packages used below: tidyverse (dplyr / tidyr / purrr verbs), tidymodels
# (rsample, recipes, parsnip, yardstick), psych (multi.hist) and ROSE
# (ROSE, roc.curve).
library(tidyverse)
library(tidymodels)
library(psych)
library(ROSE)

# Load the raw data set; the first row of the file holds the column names.
company <- read.csv(
  file = "../input/company-bankruptcy-prediction/data.csv",
  header = TRUE
)

# Which variables are binary? List the integer-typed columns to find out.
company %>%
  select(where(is.integer)) %>%
  colnames()

# Convert the 0/1 outcome into a labelled factor. This has to run before the
# integer columns are dropped below: once it is a factor the outcome no longer
# matches `is.integer` and therefore survives the drop.
company$Bankrupt. <- factor(
  company$Bankrupt.,
  levels = c(0, 1),
  labels = c("NO", "YES")
)

# Drop every column that is still integer-typed.
company <- company %>%
  select(-where(is.integer))

# Alternative kept for reference: factorize the binary variables with labels
# instead of dropping them.
# company$Liability.Assets.Flag <- factor(company$Liability.Assets.Flag, levels = c(0, 1), labels = c("NO", "Exceeds"))
# company$Net.Income.Flag <- factor(company$Net.Income.Flag, levels = c(0, 1), labels = c("NO", "Negative"))

# Draw one histogram per numeric column of `company` in a single grid.
multi.hist(select(company, where(is.numeric)))

# Histogram of the Current.Ratio column on its own.
hist(company$Current.Ratio)

# Histogram grid restricted to the numeric columns whose mean exceeds 10.
# `&&` short-circuits, so `mean()` is only evaluated on numeric columns.
company %>%
  select(where(function(col) is.numeric(col) && mean(col) > 10)) %>%
  multi.hist()

# Step 1: find numeric columns whose mean exceeds 1e8 and drop them.
# Total.Asset.Growth.Rate is excluded from the check so it is always kept.
abnormal_col <- company %>%
  select(where(is.numeric), -Total.Asset.Growth.Rate) %>%
  select(where(function(col) mean(col) > 10^8)) %>%
  colnames()

company_processed <- company %>%
  select(-all_of(abnormal_col))

# Step 2: among the remaining numeric columns (again leaving
# Total.Asset.Growth.Rate aside), find those whose mean still exceeds 10 and
# keep only the observations where every one of them is below 10.
# Excluding the growth-rate column before the mean filter avoids an error when
# that column would not have passed the filter.
abnormal_col <- company_processed %>%
  select(where(is.numeric), -Total.Asset.Growth.Rate) %>%
  select(where(function(col) mean(col) > 10)) %>%
  colnames()

# `if_all()` keeps a row only when the condition holds for all listed columns;
# with no columns listed it keeps every row.
company_processed <- company_processed %>%
  filter(if_all(all_of(abnormal_col), ~ .x < 10))

# Re-inspect the distributions after the clean-up.
company_processed %>%
  select(where(is.numeric)) %>%
  multi.hist()

# Keep the outcome plus the hand-picked subset of predictors used for modelling.
company_remove_col <- select(
  company_processed,
  Bankrupt.,
  ROA.B..before.interest.and.depreciation.after.tax,
  Operating.Gross.Margin,
  Net.Value.Per.Share..A.,
  Realized.Sales.Gross.Profit.Growth.Rate,
  Total.Asset.Growth.Rate,
  Net.Value.Growth.Rate,
  Quick.Ratio,
  Debt.ratio..,
  Revenue.per.person,
  Long.term.Liability.to.Current.Assets,
  Total.income.Total.expense,
  Liability.to.Equity,
  Equity.to.Liability
)

# Split into training (70%) and test (30%) sets.
# NOTE(review): no seed is set in the visible code, so the split differs
# between runs; the split is also not stratified on the outcome.
com_split <- initial_split(company_remove_col, prop = 0.7)

# Recipe, estimated on the training set only: drop highly correlated
# predictors, then center and scale the remaining predictors.
com_recipe <- training(com_split) %>%
  recipe(Bankrupt. ~ .) %>%
  step_corr(all_predictors()) %>%
  step_center(all_predictors(), -all_outcomes()) %>%
  step_scale(all_predictors(), -all_outcomes()) %>%
  prep()

# Processed training data (the data retained inside the prepped recipe).
com_training <- juice(com_recipe)

# Test data processed with the statistics learned on the training set.
com_test <- bake(com_recipe, new_data = testing(com_split))

# Balance the classes with ROSE (Random Over-Sampling Examples). This is a
# smoothed-bootstrap resampler, not SMOTE. `seed = 4` makes the generated
# sample reproducible.
rose <- ROSE(Bankrupt. ~ ., data = com_training, seed = 4)
com_rose <- rose$data

# Prepare validation set

# Generate 10-fold cross-validation splits from the ROSE-balanced data.
com_vfold <- vfold_cv(com_rose, v = 10)

# Materialise the analysis (fit) and assessment (evaluate) data of each fold
# as list-columns.
com_val <- com_vfold %>%
  mutate(
    com_ana = map(splits, analysis),
    com_ass = map(splits, assessment)
  )

# Validation

# Model specification: random forest classifier using the randomForest engine.
rand_forest_model <- rand_forest() %>%
  set_engine("randomForest") %>%
  set_mode("classification")

# Fit the model on each fold's analysis set and predict its assessment set.
# The folds are used as they are: they were drawn from data that has already
# been processed by `com_recipe`, and `com_recipe` is already prepped. Calling
# `prep(com_recipe, training = ...)` again (with the default `fresh = FALSE`)
# would not re-estimate the recipe on the fold, and baking the assessment set
# would center and scale it a second time.
com_val <- com_val %>%
  mutate(
    com_fit = map(
      com_ana,
      ~ fit(rand_forest_model, Bankrupt. ~ ., data = .x)
    ),
    model_pred = map2(com_fit, com_ass, ~ predict(.x, new_data = .y))
  )

# Pair the true outcome with the predicted class for every assessment row.
# The result stays grouped by fold id so that the metrics below are reported
# per fold.
com_pred <- com_val %>%
  mutate(
    res = map2(
      com_ass,
      model_pred,
      ~ data.frame(Bankrupt. = .x$Bankrupt., pred = .y$.pred_class)
    )
  ) %>%
  select(id, res) %>%
  unnest(res) %>%
  group_by(id)

# Metrics for validation (one set per fold because of the grouping).
metrics(com_pred, truth = Bankrupt., estimate = pred)

# Final evaluation on the held-out test set. The random forest is refitted
# several times on the balanced training data and scored on `com_test` each
# time, so the spread caused by the model's own randomness is visible.
n_runs <- 10
result <- vector("list", n_runs)      # per-run tibble of the four metrics
bankrupt_accuracy <- numeric(n_runs)  # per-run share of true "YES" predicted "YES"

for (i in seq_len(n_runs)) {
  com_rf <- fit(rand_forest_model, Bankrupt. ~ ., data = com_rose)

  # Predicted class next to the true outcome.
  com_pre <- predict(com_rf, com_test) %>%
    bind_cols(select(com_test, Bankrupt.))

  # NOTE(review): yardstick treats the first factor level as the event by
  # default; if the levels are still ("NO", "YES") then precision / recall /
  # F1 below describe the "NO" class - confirm this is intended.
  accu <- accuracy(com_pre, truth = Bankrupt., estimate = .pred_class)
  prec <- yardstick::precision(com_pre, truth = Bankrupt., estimate = .pred_class)
  rec <- yardstick::recall(com_pre, truth = Bankrupt., estimate = .pred_class)
  f1 <- f_meas(com_pre, truth = Bankrupt., estimate = .pred_class)
  result[[i]] <- bind_rows(accu, prec, rec, f1)

  # Confusion matrix as a tibble: one row per predicted class, one column per
  # true class.
  conf_matrix <- com_pre %>%
    conf_mat(Bankrupt., .pred_class) %>%
    pluck(1) %>%
    as_tibble() %>%
    pivot_wider(names_from = Truth, values_from = n)

  # Share of truly bankrupt companies that were predicted as bankrupt. The row
  # is looked up by label instead of by position.
  bankrupt_accuracy[i] <-
    conf_matrix$YES[conf_matrix$Prediction == "YES"] / sum(conf_matrix$YES)
}

result
paste0("The mean of bankrupt_accuracy : ", mean(bankrupt_accuracy))

# ROC curve for the predictions of the last run.
# NOTE(review): hard class labels are passed as `predicted`; class
# probabilities would give a fuller curve - confirm which is wanted.
roc.curve(response = com_test$Bankrupt., predicted = com_pre$.pred_class)