(SANDBOX) LPA Final Project
Experiment setup and predictive modeling for AnCo
Purpose:
This notebook sets up the experiment and builds predictive models of employee promotion for AnCo, so that HR can identify likely promotion candidates early.
Data source: https://www.kaggle.com/akbaldawa/hr-analytics?select=Train.csv
The data originally comes from this challenge, which we will use for our test model verification: https://datahack.analyticsvidhya.com/contest/wns-analytics-hackathon-2018-1/
Section 1: Load data and packages
# Load the packages used throughout the notebook first, then the data.
library(tidyverse)   # data wrangling and readr::parse_factor
library(jtools)      # model summary utilities
library(stargazer)   # regression tables
library(sjPlot)      # model visualization

# Training data from the WNS Analytics hackathon (via Kaggle).
df_train <- read.csv("Train.csv")

# Default figure size for notebook output.
options(repr.plot.width = 6, repr.plot.height = 5)
Section 2: Prepare data. As a first step, we need to "prepare" the data for our analysis by transforming each column into an appropriate data type.
# Inspect the distinct values of every categorical column so we know what
# cleaning is required before modeling.
df_train %>%
  count(department)
df_train %>%
  count(region)
df_train %>%
  count(education)
df_train %>%
  count(gender)
df_train %>%
  count(recruitment_channel)
df_train %>%
  count(previous_year_rating)

# previous_year_rating contains NAs (presumably first-year employees with no
# prior rating -- TODO confirm against the data dictionary).
table(is.na(df_train$previous_year_rating))

# Keep only rows with a valid rating. Note: the comparison `>= 1` evaluates to
# NA for missing ratings, and filter() drops NA rows, so this also removes
# every NA in one step.
df_train <- filter(df_train, previous_year_rating >= 1)
table(is.na(df_train$previous_year_rating))

# education has both empty strings and NAs; drop both in a single filter()
# call (conditions separated by a comma are ANDed). `!is.na(x)` replaces the
# original non-idiomatic `is.na(x) == FALSE`.
df_train <- filter(df_train, education != "", !is.na(education))
# Convert the nominal categorical columns to unordered factors. parse_factor()
# (readr) behaves like factor() but warns on unexpected values.
df_train$department <- parse_factor(df_train$department, levels = NULL, ordered = FALSE)
class(df_train$department)
df_train$region <- parse_factor(df_train$region, levels = NULL, ordered = FALSE)
class(df_train$region)
df_train$gender <- parse_factor(df_train$gender, levels = NULL, ordered = FALSE)
class(df_train$gender)
df_train$recruitment_channel <- parse_factor(df_train$recruitment_channel, levels = NULL, ordered = FALSE)
class(df_train$recruitment_channel)

# education is ordinal: Below Secondary < Bachelor's < Master's & above.
ed_levels <- c("Below Secondary", "Bachelor's", "Master's & above")
df_train$education <- parse_factor(df_train$education, levels = ed_levels, ordered = TRUE)
class(df_train$education)

# previous_year_rating is ordinal (1 < 2 < ... < 5). A single factor() call
# replaces the original redundant as.factor() + factor() round trip; factor()
# coerces the numeric values to character before matching them to the levels.
rating_levels <- c("1", "2", "3", "4", "5")
df_train$previous_year_rating <- factor(df_train$previous_year_rating, levels = rating_levels, ordered = TRUE)
class(df_train$previous_year_rating)

# employee_id is an identifier, not a quantity.
df_train$employee_id <- as.character(df_train$employee_id)

# The 0/1 indicator columns become logicals (0 -> FALSE, 1 -> TRUE).
df_train$is_promoted <- as.logical(df_train$is_promoted)
df_train$awards_won. <- as.logical(df_train$awards_won.)
df_train$KPIs_met..80. <- as.logical(df_train$KPIs_met..80.)
Section 3: Visualize and understand data
Section 4: Running the models
4.1 "Intuitive" model
In the first attempt, we will use our business understanding and intuition to pick variables that we believe have influenced promotion results.
Given that we are predicting "is_promoted", a binary variable, we will use logistic regression
# Logistic regression on hand-picked predictors. The original call omitted
# family = binomial, so glm() silently fit a Gaussian (ordinary linear) model
# rather than the logistic regression the text describes.
model_berten <- glm(
  is_promoted ~ education + previous_year_rating + KPIs_met..80. + avg_training_score,
  family = binomial,
  data = df_train
)
4.2 Step-wise model
The second model will utilize a stepwise method to automatically select variables. In this example we will go from the full model (that includes all variables) to a more selective one
# Backward step-wise selection, starting from the full model (every column
# except the employee_id key). family = binomial was missing in the original,
# which made this a linear rather than the intended logistic regression.
model_cem_step1 <- glm(is_promoted ~ . - employee_id, family = binomial, data = df_train)

# step() repeatedly drops the term whose removal most improves AIC.
model_cem_step2 <- step(model_cem_step1, direction = "backward", trace = FALSE)
summary(model_cem_step2)
4.3 Random Forest
Lastly, let's also try running a random forest algorithm
# Install randomForest only when it is missing, instead of re-installing
# unconditionally on every run of the notebook.
if (!requireNamespace("randomForest", quietly = TRUE)) {
  install.packages("randomForest")
}
library(rpart.plot)
library(lubridate)
library(randomForest)

# randomForest() treats a factor response as a classification problem.
df_train$is_promoted <- as.factor(df_train$is_promoted)

# Fix the RNG seed so the bootstrap samples and per-split mtry draws are
# reproducible across runs.
set.seed(42)
model_rf <- randomForest(
  is_promoted ~ department + region + education + gender + recruitment_channel +
    no_of_trainings + age + previous_year_rating + length_of_service +
    KPIs_met..80. + awards_won. + avg_training_score,
  data = df_train,
  importance = TRUE,
  mtry = 4  # predictors sampled at each split
)

# type = 1: permutation importance (mean decrease in accuracy).
varImpPlot(model_rf, type = 1)