import sys
# ensure openpyxl is available: pandas needs it to read .xlsx files
!{sys.executable} -m pip install openpyxl

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# load the two sheets: employees still at the company and employees who have left
existing_employees = pd.read_excel("Hash-Analytic-Python-Analytics-Problem-case-study-1.xlsx", sheet_name="Existing employees")
existing_employees.head()
existing_employees.describe()

left_employees = pd.read_excel("Hash-Analytic-Python-Analytics-Problem-case-study-1.xlsx", sheet_name="Employees who have left")
left_employees.head()
left_employees.describe()
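# a quick numeric comparison of the two groups before plotting;
# a minimal sketch assuming both sheets share the same column layout
print("mean satisfaction (stayed):", existing_employees['satisfaction_level'].mean())
print("mean satisfaction (left):  ", left_employees['satisfaction_level'].mean())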
# boxplot of satisfaction level by department for the employees who left
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(x=left_employees.dept, y=left_employees.satisfaction_level, ax=ax)
fig.savefig("boxplot.png")
# exploratory countplots over key features of the employees who left
features = ['number_project', 'time_spend_company', 'Work_accident', 'promotion_last_5years', 'dept', 'salary']
fig = plt.figure(figsize=(15, 30))
for i, feature in enumerate(features):
    plt.subplot(4, 2, i + 1)
    plt.subplots_adjust(hspace=1.0)
    sns.countplot(x=feature, data=left_employees)
    plt.xticks(rotation=90)
    plt.title("No. of employees")
fig.savefig("countplot.png")  # save once, after all subplots are drawn
# loading the merged dataset of existing and former employees
existing_employees2 = pd.read_excel("Hash-Analytic-Python-Analytics-Problem-combined.xlsx", sheet_name="Existing employees")
existing_employees2.head()
# countplots on the merged dataset, split by 'left' (0 = existing employees, 1 = employees who left)
features = ['number_project', 'time_spend_company', 'Work_accident', 'promotion_last_5years', 'dept', 'salary']
fig = plt.figure(figsize=(20, 40))
for i, feature in enumerate(features):
    plt.subplot(4, 2, i + 1)
    plt.subplots_adjust(hspace=1.0)
    sns.countplot(x=feature, data=existing_employees2, hue='left')
    plt.xticks(rotation=90)
    plt.title("No. of employees")
fig.savefig("countplotall.png")  # save once, after all subplots are drawn
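# a numeric companion to the countplots above: attrition rate per salary band;
# a minimal sketch using the 'salary' and 'left' columns already present in the merged sheet
existing_employees2.groupby('salary')['left'].mean()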
# using k-means and the elbow method to choose the number of clusters, based on
# satisfaction level and last evaluation (columns 1 and 2) of the existing employees
from sklearn.cluster import KMeans

employee_cluster = existing_employees.iloc[:, [1, 2]].values
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(employee_cluster)
    wcss.append(kmeans.inertia_)  # within-cluster sum of squares
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
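# optional cross-check of the elbow with silhouette scores (higher is better);
# a minimal sketch, sampling 2000 points to keep the pairwise-distance cost down
from sklearn.metrics import silhouette_score

for k in range(2, 8):
    labels = KMeans(n_clusters=k, init='k-means++', random_state=42).fit_predict(employee_cluster)
    print(k, silhouette_score(employee_cluster, labels, sample_size=2000, random_state=42))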
# the elbow levels off around 5 clusters: fit the final model and plot the clusters
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(employee_cluster)

fig, ax = plt.subplots(figsize=(10, 10))
labels = ['comfortable', 'frustrated', 'ambitious', 'bad match', 'winners']
colors = ['red', 'blue', 'green', 'orange', 'cyan']
for cluster, (label, color) in enumerate(zip(labels, colors)):
    ax.scatter(employee_cluster[y_kmeans == cluster, 0],
               employee_cluster[y_kmeans == cluster, 1],
               s=100, c=color, label=label)
ax.set_title('Clusters of Employees')
ax.set_xlabel('satisfaction level')
ax.set_ylabel('last evaluation')
ax.legend()
fig.savefig("scatterall.png")  # save the finished figure once, legend and labels included
plt.show()
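# inspecting the fitted centroids (satisfaction, evaluation) is a sanity check on the
# cluster labels above; a minimal sketch using the kmeans object fitted above
print(pd.DataFrame(kmeans.cluster_centers_, columns=['satisfaction_level', 'last_evaluation']))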
# building a predictive model using a random forest classifier
from sklearn.preprocessing import LabelEncoder

# encode the categorical columns; a separate encoder per column keeps each mapping recoverable
dept_encoder = LabelEncoder()
salary_encoder = LabelEncoder()
existing_employees2.dept = dept_encoder.fit_transform(existing_employees2.dept)
existing_employees2.salary = salary_encoder.fit_transform(existing_employees2.salary)

y = existing_employees2["left"]
x = existing_employees2.drop(["left"], axis=1)  # note: any employee-ID column still in x should be dropped too
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
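# quick check that the split preserves the class balance between train and test;
# a small sanity sketch, value_counts(normalize=True) gives class proportions
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))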
# listing the features the fitted model relies on most, with their importance ratios
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(random_state=42)  # fixed seed for reproducible importances
random_forest.fit(x_train, y_train)
importances = pd.DataFrame({'feature': x_train.columns,
                            'importance': np.round(random_forest.feature_importances_, 3)})
importances = importances.sort_values('importance', ascending=False).set_index('feature')
importances.head(11)
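# evaluating the fitted model on the held-out 30% test split;
# a minimal sketch using standard sklearn metrics, not part of the original analysis
from sklearn.metrics import accuracy_score, classification_report

y_pred = random_forest.predict(x_test)
print("test accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))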