(SANDBOX) LPA Final Project
Experiment setup and predictive modeling for AnCo
Purpose: This notebook sets up the experiment and builds predictive models of employee promotion for AnCo.
Data source: https://www.kaggle.com/akbaldawa/hr-analytics?select=Train.csv
The data actually originates from this challenge, which we will use to verify our test model: https://datahack.analyticsvidhya.com/contest/wns-analytics-hackathon-2018-1/
Section 1: Load data and packages
# Read the training data
df_train <- read.csv("Train.csv")

# Attach required libraries (extend this list as the analysis grows)
# install.packages("tidyverse")
pkgs <- c("tidyverse", "jtools", "stargazer", "sjPlot")
invisible(lapply(pkgs, library, character.only = TRUE))

# Default plot dimensions for the notebook
options(repr.plot.width = 6, repr.plot.height = 5)
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
✔ ggplot2 3.3.3 ✔ purrr 0.3.4
✔ tibble 3.0.5 ✔ dplyr 1.0.3
✔ tidyr 1.1.2 ✔ stringr 1.4.0
✔ readr 1.4.0 ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
Please cite as:
Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
Section 2: Prepare data
# Initial look at the raw data
head(df_train)

# Several variables are stored as characters (<chr>) but conceptually act as
# factors (department, region, education, gender, recruitment_channel,
# previous_year_rating). Inspect the observed values before converting them.
df_train %>%
  count(department)
df_train %>%
  count(region)
df_train %>%
  count(education)
df_train %>%
  count(gender)
df_train %>%
  count(recruitment_channel)
df_train %>%
  count(previous_year_rating)

# There are some NAs in previous_year_rating; these are likely new employees.
# Count how many:
table(is.na(df_train$previous_year_rating))
# Only ~8% of observations lack this data. Intuitively, employees are probably
# not promoted within their first year, so deleting these rows is reasonable.
# (Explicit !is.na() states the intent directly; the valid ratings are 1-5.)
df_train <- filter(df_train, !is.na(previous_year_rating))
# Confirm the NA rows are now deleted
table(is.na(df_train$previous_year_rating))

# Similarly, some employees have no education on file (recorded as "" or NA).
# NOTE(review): the original comment claims 2409 such rows -- verify against
# the count(education) output above. Remove them in one filter:
df_train <- filter(df_train, !is.na(education), education != "")
# Now convert the character columns into factors.

# department (unordered)
df_train$department <- parse_factor(df_train$department, levels=NULL, ordered=FALSE)
class(df_train$department)
# region (unordered)
df_train$region <- parse_factor(df_train$region, levels=NULL, ordered=FALSE)
class(df_train$region)
# gender (unordered)
df_train$gender <- parse_factor(df_train$gender, levels=NULL, ordered=FALSE)
class(df_train$gender)
# recruitment channel (unordered)
df_train$recruitment_channel <- parse_factor(df_train$recruitment_channel, levels=NULL, ordered=FALSE)
class(df_train$recruitment_channel)

# education: ordered factor, sorted from lowest to highest attainment
ed_levels <- c("Below Secondary","Bachelor's","Master's & above")
df_train$education <- parse_factor(df_train$education, levels=ed_levels, ordered=TRUE)
class(df_train$education)

# previous_year_rating is numeric, so parse_factor (characters only) does not
# apply; factor() coerces the values to character and matches them against the
# supplied levels directly, so the intermediate as.factor() step is unnecessary.
rating_levels <- c("1","2","3","4","5")
df_train$previous_year_rating <- factor(df_train$previous_year_rating, levels=rating_levels, ordered = TRUE)
class(df_train$previous_year_rating)

# A few more transformations:
# employee_id is an identifier, not a quantity -> store as character
df_train$employee_id <- as.character(df_train$employee_id)
# is_promoted, awards_won., KPIs_met..80. are 0/1 indicators -> store as logical
df_train$is_promoted <- as.logical(df_train$is_promoted)
df_train$awards_won. <- as.logical(df_train$awards_won.)
df_train$KPIs_met..80. <- as.logical(df_train$KPIs_met..80.)
Section 3: Visualize and understand data
Section 4: Running the models
# Initial hypothesis: promotion decisions are linked to previous ratings,
# meeting KPIs, average training scores, and education levels.
# NOTE(review): no family is specified, so glm() fits a gaussian model --
# i.e. a linear probability model on the TRUE/FALSE outcome. If logistic
# regression was intended, add family = binomial.
model_berten <- glm(is_promoted ~ education + previous_year_rating + KPIs_met..80. + avg_training_score, data = df_train)
summary(model_berten)

# Backward stepwise selection starting from all predictors except employee_id
model_cem_step1 <- glm(is_promoted ~ . - employee_id, data = df_train)
model_cem_step2 <- step(model_cem_step1, direction = "backward", trace = FALSE)
summary(model_cem_step2)
# We will need some new libraries to try machine-learning algorithms.
# Keep install.packages() commented out (matching Section 1) so the script
# does not attempt a fresh install on every run -- install once interactively.
# install.packages("randomForest")
library(rpart.plot)
library(lubridate)
library(randomForest)
Installing package into ‘/work/.R/library’
(as ‘lib’ is unspecified)
Warning message in install.packages("randomForest"):
“installation of package ‘randomForest’ had non-zero exit status”
Loading required package: rpart
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.
Attaching package: ‘randomForest’
The following object is masked from ‘package:dplyr’:
combine
The following object is masked from ‘package:ggplot2’:
margin
# randomForest does classification only when the outcome is a factor,
# so convert is_promoted first.
df_train$is_promoted <- as.factor(df_train$is_promoted)

# Fix the RNG seed so the fitted forest is reproducible across runs
set.seed(42)

# Fit the random forest; mtry = 4 by the rule of thumb mtry ~ p/3 (p = 12)
model_rf <- randomForest(is_promoted ~ department + region + education + gender + recruitment_channel + no_of_trainings + age + previous_year_rating + length_of_service + KPIs_met..80. + awards_won. + avg_training_score, data = df_train, importance = TRUE, mtry = 4)
model_rf

# Variable importance as ranked by the algorithm
# (type = 1: mean decrease in accuracy)
varImpPlot(model_rf, type=1)