import sys
# ensure openpyxl is available: pandas needs it to read .xlsx files
!{sys.executable} -m pip install openpyxl

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# load the two sheets: employees still at the company and employees who have left
existing_employees = pd.read_excel("Hash-Analytic-Python-Analytics-Problem-case-study-1.xlsx", sheet_name="Existing employees")
existing_employees.head()
existing_employees.describe()

left_employees = pd.read_excel("Hash-Analytic-Python-Analytics-Problem-case-study-1.xlsx", sheet_name="Employees who have left")
left_employees.head()
left_employees.describe()
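# a quick numeric comparison of the two groups before plotting;
# a minimal sketch assuming both sheets share the same column layout
print("mean satisfaction (stayed):", existing_employees['satisfaction_level'].mean())
print("mean satisfaction (left):  ", left_employees['satisfaction_level'].mean())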
# boxplot of satisfaction level by department for the employees who left
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(x=left_employees.dept, y=left_employees.satisfaction_level, ax=ax)
fig.savefig("boxplot.png")
# exploratory countplots over key features of the employees who left
features = ['number_project', 'time_spend_company', 'Work_accident', 'promotion_last_5years', 'dept', 'salary']
fig = plt.figure(figsize=(15, 30))
for i, feature in enumerate(features):
    plt.subplot(4, 2, i + 1)
    plt.subplots_adjust(hspace=1.0)
    sns.countplot(x=feature, data=left_employees)
    plt.xticks(rotation=90)
    plt.title("No. of employees")
fig.savefig("countplot.png")  # save once, after all subplots are drawn
# loading the merged dataset of existing and former employees
existing_employees2 = pd.read_excel("Hash-Analytic-Python-Analytics-Problem-combined.xlsx", sheet_name="Existing employees")
existing_employees2.head()
# countplots on the merged dataset, split by 'left' (0 = existing employees, 1 = employees who left)
features = ['number_project', 'time_spend_company', 'Work_accident', 'promotion_last_5years', 'dept', 'salary']
fig = plt.figure(figsize=(20, 40))
for i, feature in enumerate(features):
    plt.subplot(4, 2, i + 1)
    plt.subplots_adjust(hspace=1.0)
    sns.countplot(x=feature, data=existing_employees2, hue='left')
    plt.xticks(rotation=90)
    plt.title("No. of employees")
fig.savefig("countplotall.png")  # save once, after all subplots are drawn
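# a numeric companion to the countplots above: attrition rate per salary band;
# a minimal sketch using the 'salary' and 'left' columns already present in the merged sheet
existing_employees2.groupby('salary')['left'].mean()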
# using k-means and the elbow method to choose the number of clusters, based on
# satisfaction level and last evaluation (columns 1 and 2) of the existing employees
from sklearn.cluster import KMeans

employee_cluster = existing_employees.iloc[:, [1, 2]].values
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42)
    kmeans.fit(employee_cluster)
    wcss.append(kmeans.inertia_)  # within-cluster sum of squares
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
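# optional cross-check of the elbow with silhouette scores (higher is better);
# a minimal sketch, sampling 2000 points to keep the pairwise-distance cost down
from sklearn.metrics import silhouette_score

for k in range(2, 8):
    labels = KMeans(n_clusters=k, init='k-means++', random_state=42).fit_predict(employee_cluster)
    print(k, silhouette_score(employee_cluster, labels, sample_size=2000, random_state=42))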
# the elbow levels off around 5 clusters: fit the final model and plot the clusters
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(employee_cluster)

fig, ax = plt.subplots(figsize=(10, 10))
labels = ['comfortable', 'frustrated', 'ambitious', 'bad match', 'winners']
colors = ['red', 'blue', 'green', 'orange', 'cyan']
for cluster, (label, color) in enumerate(zip(labels, colors)):
    ax.scatter(employee_cluster[y_kmeans == cluster, 0],
               employee_cluster[y_kmeans == cluster, 1],
               s=100, c=color, label=label)
ax.set_title('Clusters of Employees')
ax.set_xlabel('satisfaction level')
ax.set_ylabel('last evaluation')
ax.legend()
fig.savefig("scatterall.png")  # save the finished figure once, legend and labels included
plt.show()
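# inspecting the fitted centroids (satisfaction, evaluation) is a sanity check on the
# cluster labels above; a minimal sketch using the kmeans object fitted above
print(pd.DataFrame(kmeans.cluster_centers_, columns=['satisfaction_level', 'last_evaluation']))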
# building a predictive model using a random forest classifier
from sklearn.preprocessing import LabelEncoder

# encode the categorical columns; a separate encoder per column keeps each mapping recoverable
dept_encoder = LabelEncoder()
salary_encoder = LabelEncoder()
existing_employees2.dept = dept_encoder.fit_transform(existing_employees2.dept)
existing_employees2.salary = salary_encoder.fit_transform(existing_employees2.salary)

y = existing_employees2["left"]
x = existing_employees2.drop(["left"], axis=1)  # note: any employee-ID column still in x should be dropped too
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
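# quick check that the split preserves the class balance between train and test;
# a small sanity sketch, value_counts(normalize=True) gives class proportions
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))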
# listing the features the fitted model relies on most, with their importance ratios
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(random_state=42)  # fixed seed for reproducible importances
random_forest.fit(x_train, y_train)
importances = pd.DataFrame({'feature': x_train.columns,
                            'importance': np.round(random_forest.feature_importances_, 3)})
importances = importances.sort_values('importance', ascending=False).set_index('feature')
importances.head(11)
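# evaluating the fitted model on the held-out 30% test split;
# a minimal sketch using standard sklearn metrics, not part of the original analysis
from sklearn.metrics import accuracy_score, classification_report

y_pred = random_forest.predict(x_test)
print("test accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))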