import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys
!{sys.executable} -m pip install openpyxl
Requirement already satisfied: openpyxl in /usr/local/lib/python3.7/site-packages (3.0.9)
Requirement already satisfied: et-xmlfile in /usr/local/lib/python3.7/site-packages (from openpyxl) (1.1.0)
WARNING: You are using pip version 20.1.1; however, version 22.0.3 is available.
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.
# Load the "Existing employees" sheet of the case-study workbook and
# preview the first rows to sanity-check the import.
existing_employees = pd.read_excel(
    "Hash-Analytic-Python-Analytics-Problem-case-study-1.xlsx",
    sheet_name="Existing employees",
)
existing_employees.head()
Emp IDint64
satisfaction_levelfloat64
0
2001
0.58
1
2002
0.82
2
2003
0.45
3
2004
0.78
4
2005
0.49
existing_employees.describe()
Emp IDfloat64
satisfaction_levelfloat64
count
11428
11428
mean
7812.340742
0.6668095905
std
3453.947461
0.2171037545
min
2001
0.12
25%
4857.75
0.54
50%
7714.5
0.69
75%
10571.25
0.84
max
14211
1
# Load the sheet of employees who have already left the company and
# preview it.
left_employees = pd.read_excel(
    "Hash-Analytic-Python-Analytics-Problem-case-study-1.xlsx",
    sheet_name="Employees who have left",
)
left_employees.head()
Emp IDint64
satisfaction_levelfloat64
0
1
0.38
1
2
0.8
2
3
0.11
3
4
0.72
4
5
0.37
left_employees.describe()
Emp IDfloat64
satisfaction_levelfloat64
count
3571
3571
mean
6500.439653
0.4400980118
std
6266.484705
0.2639334362
min
1
0.09
25%
893.5
0.13
50%
1786
0.41
75%
12678.5
0.73
max
14999
0.92
# Boxplot: satisfaction level per department for the employees that left.
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(x=left_employees.dept, y=left_employees.satisfaction_level, ax=ax)
fig.savefig("boxplot.png")
# Exploratory countplots of the discrete/categorical features for the
# employees who left, laid out on a 4x2 grid.
# FIX: the paste lost the loop-body indentation, and the original called
# savefig() inside the loop — the file was rewritten every iteration and
# the final image missed the last subplot's rotated ticks and title.
# Save once, after the grid is complete.
features = ['number_project', 'time_spend_company', 'Work_accident',
            'promotion_last_5years', 'dept', 'salary']
fig = plt.figure(figsize=(15, 30))
for i, feature in enumerate(features):
    plt.subplot(4, 2, i + 1)
    plt.subplots_adjust(hspace=1.0)
    sns.countplot(x=feature, data=left_employees)
    plt.xticks(rotation=90)
    plt.title("No. of employee")
fig.savefig("countplot.png")
# Load the merged workbook: the existing-employees sheet annotated with a
# `left` flag (0 = still employed, 1 = left), used for combined analysis.
existing_employees2 = pd.read_excel(
    "Hash-Analytic-Python-Analytics-Problem-combined.xlsx",
    sheet_name="Existing employees",
)
existing_employees2.head()
Emp IDint64
satisfaction_levelfloat64
0
2001
0.58
1
2002
0.82
2
2003
0.45
3
2004
0.78
4
2005
0.49
# Countplot analysis of the merged dataset, split by attrition status via
# hue='left' (0 = existing employee, 1 = left).
# FIX: loop-body indentation restored, and savefig() hoisted out of the
# loop — the original rewrote the file every iteration and the saved image
# missed the last subplot's rotated ticks and title.
features = ['number_project', 'time_spend_company', 'Work_accident',
            'promotion_last_5years', 'dept', 'salary']
fig = plt.figure(figsize=(20, 40))
for i, feature in enumerate(features):
    plt.subplot(4, 2, i + 1)
    plt.subplots_adjust(hspace=1.0)
    sns.countplot(x=feature, data=existing_employees2, hue='left')
    plt.xticks(rotation=90)
    plt.title("No. of employee")
fig.savefig("countplotall.png")
# K-means clustering on [satisfaction_level, last_evaluation] of the
# existing employees: sweep k = 1..10 with the elbow method, then fit the
# chosen model.
# FIX: the paste lost the loop-body indentation; structure restored and
# the sklearn import hoisted to the top of the cell.
from sklearn.cluster import KMeans

# Columns 1 and 2 of the sheet are satisfaction_level and last_evaluation.
employee_cluster = existing_employees.iloc[:, [1, 2]].values

wcss = []  # within-cluster sum of squares (inertia) for each candidate k
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42)
    kmeans.fit(employee_cluster)
    wcss.append(kmeans.inertia_)

plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

# The elbow sits at k = 5; fit the final model and label every employee.
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(employee_cluster)
# Scatter plot of the five k-means clusters in the
# (satisfaction level, last evaluation) plane, one color/label per cluster.
fig = plt.subplots(figsize=(10, 10))
cluster_styles = [
    (0, 'red', 'comfortable'),
    (1, 'blue', 'frustrated'),
    (2, 'green', 'ambitious'),
    (3, 'orange', 'bad match'),
    (4, 'cyan', 'winners'),
]
for cluster_id, color, label in cluster_styles:
    members = y_kmeans == cluster_id
    plt.scatter(employee_cluster[members, 0], employee_cluster[members, 1],
                s=100, c=color, label=label)
plt.gcf().savefig("scatteroall.png")
plt.title('Clusters of Employees')
plt.xlabel('satisfaction level')
plt.ylabel('last evaluation')
plt.legend()
plt.gcf().savefig("scattelegtall.png")
plt.show()
# Predictive model: random-forest classifier for attrition (`left`).
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Encode the two categorical columns as integer codes.  Use one encoder
# per column so each keeps its own fitted classes.
dept_encoder = LabelEncoder()
salary_encoder = LabelEncoder()
existing_employees2.dept = dept_encoder.fit_transform(existing_employees2.dept)
existing_employees2.salary = salary_encoder.fit_transform(existing_employees2.salary)

# Target and feature matrix.
# FIX: drop "Emp ID" from the features — it is an arbitrary identifier,
# and because leavers were assigned the low IDs in this workbook it leaks
# the label (it dominated the original importance table at 0.446).  A
# model trained on it cannot generalize to new employees.
y = existing_employees2["left"]
x = existing_employees2.drop(["left", "Emp ID"], axis=1)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42)

random_forest = RandomForestClassifier()
random_forest.fit(x_train, y_train)

# Rank the features by the forest's impurity-based importances.
importances = pd.DataFrame({
    'feature': x_train.columns,
    'importance': np.round(random_forest.feature_importances_, 3),
})
importances = importances.sort_values('importance', ascending=False).set_index('feature')
importances.head(11)
importancefloat64
0.001 - 0.446
Emp ID
0.446
satisfaction_level
0.215
number_project
0.116
time_spend_company
0.091
average_montly_hours
0.07
last_evaluation
0.053
Work_accident
0.003
dept
0.003
salary
0.003
promotion_last_5years
0.001