# Employee Turnover

import pandas as pd

# Read the HR dataset from the working directory.
hr = pd.read_csv('HR.csv')

# Print the column headers, then preview the first rows.
col_names = list(hr.columns)
print("Column names:")
print(col_names)
print("\nSample data:")
hr.head()

# The column named 'sales' is used as the department throughout the rest of
# the analysis, so rename it accordingly.
column_renames = {'sales': 'department'}
hr = hr.rename(columns=column_renames)

# Data type of each column.
hr.dtypes

# Per-column flag: True when the column contains at least one missing value.
hr.isnull().any()

# (number of rows, number of columns).
hr.shape

# Summary statistics as produced by DataFrame.describe().
hr.describe()

# Distinct values found in the department column.
hr['department'].unique()

import numpy as np

# Fold the 'support' and 'IT' departments into the 'technical' label.
is_support_or_it = hr['department'].isin(['support', 'IT'])
hr['department'] = np.where(is_support_or_it, 'technical', hr['department'])

# Show the distinct department labels currently present in the frame.
print(hr['department'].unique())

# Number of rows for each value of the 'left' column.
hr['left'].value_counts()

# Column averages split by the 'left' flag, by department and by salary band.
# numeric_only=True keeps the remaining text columns (department / salary)
# out of the aggregation: older pandas skipped them silently, while recent
# pandas raises TypeError when asked to average strings.
hr.groupby('left').mean(numeric_only=True)

hr.groupby('department').mean(numeric_only=True)

hr.groupby('salary').mean(numeric_only=True)

%matplotlib inline import matplotlib.pyplot as plt pd.crosstab(hr.department,hr.left).plot(kind='bar') plt.title('Turnover Frequency for Department') plt.xlabel('Department') plt.ylabel('Frequency of Turnover') plt.savefig('department_bar_chart')

# Stacked bar chart of the share of each 'left' value within a salary level.
table = pd.crosstab(hr.salary, hr.left)

# Divide every row by its own total so each bar sums to 1.
row_totals = table.sum(axis=1).astype(float)
salary_proportions = table.div(row_totals, axis=0)

salary_proportions.plot(kind='bar', stacked=True)
plt.title('Stacked Bar Chart of Salary Level vs Turnover')
plt.xlabel('Salary Level')
plt.ylabel('Proportion of Employees')
plt.savefig('salary_bar_chart')

# Row counts for every department / 'left' combination.
pd.crosstab(hr.department, hr.left)

# One histogram per column drawn by DataFrame.hist, saved to disk and shown.
num_bins = 10
histogram_figsize = (20, 15)
hr.hist(bins=num_bins, figsize=histogram_figsize)
plt.savefig("hr_histogram_plots")
plt.show()

# Preview the first rows of the frame.
hr.head()

# One-hot encode the categorical columns and append the indicator columns
# to the frame (named '<column>_<value>' via the prefix argument).
cat_vars = ['department', 'salary']
for var in cat_vars:
    cat_list = pd.get_dummies(hr[var], prefix=var)
    hr = hr.join(cat_list)

# Remove the original text columns now that their one-hot indicators exist.
# Dropping by name rather than by position (8, 9) keeps this correct even if
# the column order of the input file changes.
hr.drop(columns=['department', 'salary'], inplace=True)

# Current column names as a NumPy array.
hr.columns.values

# Separate the column names into the target ('left') and everything else.
y = ['left']
hr_vars = list(hr.columns.values)
X = [column for column in hr_vars if column not in y]

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Recursive feature elimination with logistic regression, keeping 10 features.
model = LogisticRegression()
# n_features_to_select has to be given by keyword in current scikit-learn.
rfe = RFE(model, n_features_to_select=10)
# hr[y] is a one-column DataFrame; flatten it to the 1-D target fit() expects.
rfe = rfe.fit(hr[X], hr[y].values.ravel())
print(rfe.support_)
print(rfe.ranking_)

# Hard-coded predictor columns for the models that follow.
# NOTE(review): presumably the ten features RFE marked as selected - confirm
# against the printed support mask.
cols = [
    'satisfaction_level',
    'last_evaluation',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
    'department_RandD',
    'department_hr',
    'department_management',
    'salary_high',
    'salary_low',
]
y = hr['left']
X = hr[cols]

# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module no longer exists in scikit-learn.
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Fit a logistic regression with default settings on the training split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

from sklearn.metrics import accuracy_score

# Accuracy on the held-out test split.
logreg_test_predictions = logreg.predict(X_test)
logreg_accuracy = accuracy_score(y_test, logreg_test_predictions)
print('Logistic regression accuracy: {:.3f}'.format(logreg_accuracy))

from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with default settings on the training split.
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Accuracy on the held-out test split.
rf_test_predictions = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_test_predictions)
print('Random Forest Accuracy: {:.3f}'.format(rf_accuracy))

from sklearn.svm import SVC

# Fit a support vector classifier with default settings on the training split.
svc = SVC()
svc.fit(X_train, y_train)

# Accuracy on the held-out test split.
svc_test_predictions = svc.predict(X_test)
svc_accuracy = accuracy_score(y_test, svc_test_predictions)
print('Support vector machine accuracy: {:.3f}'.format(svc_accuracy))

from sklearn import model_selection
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of a random forest on the training split.
# random_state only has an effect when shuffling is enabled; current
# scikit-learn raises ValueError if a seed is given with shuffle=False, so
# shuffle explicitly to get the seeded folds the seed was meant to produce.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=7)
modelCV = RandomForestClassifier()
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, X_train, y_train, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

from sklearn.metrics import classification_report

# Precision / recall / F1 of the random forest on the test split.
print(classification_report(y_test, rf.predict(X_test)))

from sklearn.metrics import confusion_matrix
import seaborn as sns

y_pred = rf.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows are then the true class and
# columns the predicted class, matching the axis labels set below.
# labels=[1, 0] orders both axes as Left, Stayed.
forest_cm = metrics.confusion_matrix(y_test, y_pred, labels=[1, 0])
# Start a fresh figure so the heatmap never lands on a previous plot's axes.
plt.figure()
sns.heatmap(forest_cm, annot=True, fmt='.2f', xticklabels=["Left", "Stayed"], yticklabels=["Left", "Stayed"])
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('Random Forest')
plt.savefig('random_forest')

# Precision / recall / F1 of the logistic regression on the test split.
print(classification_report(y_test, logreg.predict(X_test)))

logreg_y_pred = logreg.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows are then the true class and
# columns the predicted class, matching the axis labels set below.
# labels=[1, 0] orders both axes as Left, Stayed.
logreg_cm = metrics.confusion_matrix(y_test, logreg_y_pred, labels=[1, 0])
# Start a fresh figure so the heatmap never lands on a previous plot's axes.
plt.figure()
sns.heatmap(logreg_cm, annot=True, fmt='.2f', xticklabels=["Left", "Stayed"], yticklabels=["Left", "Stayed"])
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('Logistic Regression')
plt.savefig('logistic_regression')

# Precision / recall / F1 of the support vector classifier on the test split.
print(classification_report(y_test, svc.predict(X_test)))

svc_y_pred = svc.predict(X_test)
# confusion_matrix expects (y_true, y_pred): rows are then the true class and
# columns the predicted class, matching the axis labels set below.
# labels=[1, 0] orders both axes as Left, Stayed.
svc_cm = metrics.confusion_matrix(y_test, svc_y_pred, labels=[1, 0])
# Start a fresh figure so the heatmap never lands on a previous plot's axes.
plt.figure()
sns.heatmap(svc_cm, annot=True, fmt='.2f', xticklabels=["Left", "Stayed"], yticklabels=["Left", "Stayed"])
plt.ylabel('True class')
plt.xlabel('Predicted class')
plt.title('Support Vector Machine')
plt.savefig('support_vector_machine')

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Score each model with its predicted probability of the positive class (1).
# The AUC shown in the legend is computed from these same scores so that it
# is the area under the curve actually drawn; computing it from hard
# predict() labels would describe a different, single-threshold curve.
logreg_scores = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, logreg_scores)
fpr, tpr, thresholds = roc_curve(y_test, logreg_scores)

rf_scores = rf.predict_proba(X_test)[:, 1]
rf_roc_auc = roc_auc_score(y_test, rf_scores)
rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_test, rf_scores)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot(rf_fpr, rf_tpr, label='Random Forest (area = %0.2f)' % rf_roc_auc)
# Diagonal reference line: true positive rate equal to false positive rate.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('ROC')
plt.show()

# Print the random forest's feature importances as percentages, in ascending
# order of importance (argsort sorts from smallest to largest).
feature_labels = np.array([
    'satisfaction_level',
    'last_evaluation',
    'time_spend_company',
    'Work_accident',
    'promotion_last_5years',
    'department_RandD',
    'department_hr',
    'department_management',
    'salary_high',
    'salary_low',
])
importance = rf.feature_importances_
feature_indexes_by_importance = importance.argsort()

ranked_labels = feature_labels[feature_indexes_by_importance]
ranked_importance = importance[feature_indexes_by_importance]
for label, weight in zip(ranked_labels, ranked_importance):
    print('{}-{:.2f}%'.format(label, (weight * 100.0)))