import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
np.set_printoptions(precision=4)
from sklearn.metrics import (accuracy_score, recall_score, precision_score,
                             confusion_matrix, f1_score, roc_auc_score,
                             classification_report)
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GridSearchCV
from IPython.display import display, Markdown, HTML
import warnings
warnings.filterwarnings('ignore')
# Import the dataset (openpyxl is required by pandas to read .xlsx files)
!pip install openpyxl
df = pd.read_excel('hr-employee-attrition.xlsx', engine='openpyxl', sheet_name='Data')
df
df.info()
Data Cleaning and Manipulation
#Checking for missing/null values
df.isna().sum()
#Checking for duplicates
df.duplicated().sum()
There are no missing values and no duplicate rows in the dataset, so no imputation or deduplication is needed.
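As a quick sanity check (a minimal sketch on the same DataFrame), these assertions fail loudly if either cleaning assumption is ever violated:
# Fail fast if the cleaning assumptions above stop holding
assert df.isna().sum().sum() == 0, 'unexpected missing values'
assert df.duplicated().sum() == 0, 'unexpected duplicate rows'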
df['Attrition'] = df['Attrition'].map({"No":0, "Yes":1})
text_negative = "Negative"
text_positive = "Positive"
target_column = "Attrition"
df_all = df.copy()
df_positive = df[df[target_column]==1]
df_negative = df[df[target_column]==0]
def plot_pie(column, title="All Group/Class"):
    data = df_all[column].value_counts()
    plt.pie(data, autopct='%1.2f%%', labels=data.index)
    plt.title(title)
    plt.show()
def plot_hist(column, title="All Group/Class"):
    plt.hist(df_all[column], density=True)
    plt.title(title)
    plt.show()
def plot_bar(column, sort=False, title="All Group/Class"):
    if sort:
        data_all = df_all[column].value_counts().sort_index()
    else:
        data_all = df_all[column].value_counts()
    plt.bar(data_all.index.astype(str), data_all)
    plt.title(title)
    plt.show()
def plot_bar_compare(column, sort=False):
    if sort:
        data_positive = df_positive[column].value_counts().sort_index()
        data_negative = df_negative[column].value_counts().sort_index()
    else:
        data_positive = df_positive[column].value_counts()
        data_negative = df_negative[column].value_counts()
    fig, axs = plt.subplots(2, 1)
    plt.subplots_adjust(left=0, bottom=0, right=1, top=2, wspace=0, hspace=0.2)
    axs[0].bar(data_negative.index.astype(str), data_negative)
    axs[0].title.set_text(text_negative)
    axs[1].bar(data_positive.index.astype(str), data_positive)
    axs[1].title.set_text(text_positive)
    plt.show()
def plot_hist_compare(column, bins=5):
    # Pass bins through to plt.hist (the parameter was previously unused)
    plt.hist([df_negative[column], df_positive[column]], bins=bins, color=['c', 'r'])
    plt.legend((text_negative, text_positive))
    plt.show()
def plot_pie_compare(column):
    data_positive = df_positive[column].value_counts()
    data_negative = df_negative[column].value_counts()
    fig, axs = plt.subplots(2, 1)
    plt.subplots_adjust(left=0, bottom=0, right=1, top=2, wspace=0, hspace=0.2)
    axs[0].pie(data_negative, autopct='%1.2f%%', labels=data_negative.index)
    axs[0].title.set_text(text_negative)
    axs[1].pie(data_positive, autopct='%1.2f%%', labels=data_positive.index)
    axs[1].title.set_text(text_positive)
    plt.show()
def plot_boxplot(column, title=""):
    sns.boxplot(x=target_column, y=column, palette=["c", "r"],
                hue=target_column, data=df_all).set_title(title, fontsize=15)
    plt.show()
def check_median(column):
    data_negative = df_negative[column].describe()
    data_positive = df_positive[column].describe()
    print("Median:")
    print('{}: {}'.format(text_negative, data_negative['50%']))
    print('{}: {}'.format(text_positive, data_positive['50%']))
def check_most(column):
    data_negative = df_negative[column].value_counts()
    data_positive = df_positive[column].value_counts()
    print("Most frequent:")
    print('{}: {}'.format(text_negative, data_negative.index[0]))
    print('{}: {}'.format(text_positive, data_positive.index[0]))
def eda(df_all):
    display(HTML('<h1>Exploratory Data Analysis</h1>'))
    for column in df_all.columns:
        if column == target_column:
            continue
        display(HTML('<h2>{}</h2>'.format(column)))
        if df[column].dtype == 'int64' or df[column].dtype == 'float64':
            if len(df[column].unique()) > 10:
                plot_boxplot(column)
                check_median(column)
            else:
                plot_bar(column)
                plot_pie(column)
                plot_pie_compare(column)
                check_most(column)
        elif df[column].dtype == 'object':
            if len(df[column].unique()) > 10:
                # High-cardinality text columns: show the five most frequent
                # values (display is needed; bare expressions inside a
                # function would otherwise be silently discarded)
                display(df[column].value_counts().head(5))
                display(df_negative[column].value_counts().head(5))
                display(df_positive[column].value_counts().head(5))
            else:
                plot_bar(column)
                plot_pie(column)
                plot_pie_compare(column)
                check_most(column)
Attrition
df['Attrition'].value_counts()
plot_pie('Attrition')
The dataset is imbalanced: far more employees stayed (Attrition = 0) than left (Attrition = 1), so the training set will be rebalanced before modeling.
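To quantify the imbalance (a small sketch on the same column):
# Share of each Attrition class in percent; with this skew, accuracy alone is misleading
df['Attrition'].value_counts(normalize=True) * 100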
eda(df_all)
Data Processing
X = df.copy()
y = X[target_column]
# Drop the target plus columns that are constant or judged uninformative
X = X.drop([target_column, 'Over18', 'EmployeeCount', 'EmployeeNumber',
            'HourlyRate', 'MonthlyRate', 'PercentSalaryHike', 'StandardHours',
            'YearsSinceLastPromotion'], axis=1)
# One-hot encode the categorical columns, dropping one level each to avoid redundancy
X = pd.get_dummies(X, columns=['BusinessTravel', 'Department', 'EducationField',
                               'Gender', 'JobRole', 'MaritalStatus', 'OverTime'],
                   drop_first=True)
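After one-hot encoding, every remaining column should be numeric; a quick check (a small sketch):
# Sanity check: no object-dtype columns should survive get_dummies
print(X.dtypes.value_counts())
print('Feature matrix shape:', X.shape)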
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)
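Since the target is imbalanced, a stratified split would keep the Yes/No ratio identical in both splits; a hedged alternative, not used below:
# Hypothetical variant: stratify on y so train and test keep the original class ratio
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234, stratify=y)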
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
#sm = SMOTE(random_state=1234)
ros = RandomOverSampler(sampling_strategy='minority',random_state=1234)
#rus = RandomUnderSampler(sampling_strategy='majority', random_state=1234)
#X_balance, y_balance = sm.fit_resample(X_train, y_train)
X_balance, y_balance = ros.fit_resample(X_train, y_train)
#X_balance, y_balance = rus.fit_resample(X_train, y_train)
print(f'''Shape of X_train before balancing: {X_train.shape}
Shape of X_train after balancing: {X_balance.shape}''')
print('\nBalance of positive and negative classes (%):')
y_balance.value_counts(normalize=True) * 100
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_balance = sc.fit_transform(X_balance)
X_test = sc.transform(X_test)
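Note that the scaler is fit on the resampled training data only and merely applied to the test set, which avoids leaking test statistics into training. If cross-validation were added later, a scikit-learn Pipeline (a sketch, not used below) would refit the scaler inside each fold automatically:
# Hedged sketch: a Pipeline re-fits StandardScaler per CV fold, preventing leakage
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
# e.g. cross_val_score(pipe, X_res, y_balance, cv=5, scoring='f1'),
# where X_res (hypothetical name) is the resampled data *before* scaling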
!pip install xgboost
!pip install catboost
!pip install lightgbm
# Import ML Libraries
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
classifiers = [
    [CatBoostClassifier(verbose=0), 'CatBoost Classifier'],
    [XGBClassifier(eval_metric='error'), 'XGB Classifier'],
    [RandomForestClassifier(), 'Random Forest'],
    [KNeighborsClassifier(), 'K-Nearest Neighbours'],
    [SGDClassifier(), 'SGD Classifier'],
    [SVC(), 'SVC'],
    [LGBMClassifier(), 'LGBM Classifier'],
    [GaussianNB(), 'GaussianNB'],
    [DecisionTreeClassifier(), 'Decision Tree Classifier'],
    [LogisticRegression(), 'Logistic Regression'],
    [AdaBoostClassifier(), 'AdaBoost Classifier'],
]
for cls in classifiers:
    model = cls[0]
    model.fit(X_balance, y_balance)
    y_pred = model.predict(X_test)
    print(cls[1])
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    print("Accuracy  : ", accuracy_score(y_test, y_pred) * 100)
    print("Recall    : ", recall_score(y_test, y_pred) * 100)
    print("Precision : ", precision_score(y_test, y_pred) * 100)
    print("F1        : ", f1_score(y_test, y_pred) * 100)
    print("ROC AUC   : ", roc_auc_score(y_test, y_pred) * 100)
    print("\n")
Artificial Neural Network
import tensorflow as tf
# Import consistently from tensorflow.keras; mixing it with the standalone
# keras package can cause version conflicts
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from numpy.random import seed
seed(1234)
tf.random.set_seed(1234)
# Build the model: a funnel of dense layers with dropout for regularization
model = Sequential()
model.add(Dense(32, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(8, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(4, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
opt = Adam(learning_rate=0.001)
earlystopper = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy',mode='max',patience=15, verbose=1,restore_best_weights=True)
model.compile(optimizer = opt, loss = 'binary_crossentropy', metrics = ['accuracy'])
history = model.fit(X_balance, y_balance, batch_size=32, epochs=200,validation_split = 0.15, callbacks = [earlystopper],verbose = 1)
history_dict = history.history
loss_values = history_dict['loss']
val_loss_values = history_dict['val_loss']
plt.plot(loss_values, 'b', label='training loss')
plt.plot(val_loss_values, 'r', label='validation loss')
plt.legend()
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.show()
accuracy_values = history_dict['accuracy']
val_accuracy_values = history_dict['val_accuracy']
plt.plot(val_accuracy_values, '-r', label='validation accuracy')
plt.plot(accuracy_values, '-b', label='training accuracy')
plt.legend()
plt.xlabel("Epochs")
plt.ylabel("Accuracy")
plt.show()
# Threshold the sigmoid outputs at 0.5 to get hard 0/1 class labels
y_pred = (model.predict(X_test).ravel() > 0.5).astype(int)
print(confusion_matrix(y_test, y_pred))
print("Accuracy : ", accuracy_score(y_test, y_pred) * 100)
print("Recall : ", recall_score(y_test, y_pred) * 100)
print("Precision : ", precision_score(y_test, y_pred) * 100)
print("F1 : ", f1_score(y_test, y_pred) * 100)
print("ROC AUC : ", roc_auc_score(y_test, y_pred) * 100)
The best algorithm is the XGB Classifier:
Accuracy  : 88.09523809523809
Recall    : 44.680851063829785
Precision : 70.0
F1        : 54.54545454545453
ROC AUC   : 70.51856318373675
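permutation_importance was imported at the top but never used; as a hedged follow-up sketch, it can show which features drive the winning model (XGBClassifier is refitted here under the same settings as in the comparison loop):
# Sketch: permutation importance of the best model on the held-out test set
xgb = XGBClassifier(eval_metric='error').fit(X_balance, y_balance)
imp = permutation_importance(xgb, X_test, y_test, n_repeats=10, random_state=1234)
display(pd.Series(imp.importances_mean, index=X.columns)
          .sort_values(ascending=False).head(10))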