import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as matplot
import seaborn as sns
# Render figures inline in the notebook instead of opening a separate window.
%matplotlib inline
# Load the HR dataset; index_col=None keeps pandas' default integer row index.
df = pd.read_csv('HR_comma_sep.csv', index_col=None)
# Preview the first rows of the table.
df.head()
satisfaction_levelfloat64
last_evaluationfloat64
0
0.38
0.53
1
0.8
0.86
2
0.11
0.88
3
0.72
0.87
4
0.37
0.52
# Dimensions of the dataset as (rows, columns).
df.shape
# Data type of every column.
df.dtypes
# Per-column flag: True when the column contains at least one missing value.
df.isnull().any()
# Map the raw CSV headers to shorter camelCase names.
column_aliases = {
    'satisfaction_level': 'satisfaction',
    'last_evaluation': 'evaluation',
    'number_project': 'projectCount',
    'average_montly_hours': 'averageMonthlyHours',
    'time_spend_company': 'yearsAtCompany',
    'Work_accident': 'workAccident',
    'promotion_last_5years': 'promotion',
    'sales': 'department',
    'left': 'turnover',
}
df = df.rename(columns=column_aliases)

# Move the label column to the first position: take it out of the frame,
# then put it back at index 0.
front = df.pop('turnover')
df.insert(0, 'turnover', front)
df.head()
turnoverint64
satisfactionfloat64
0
1
0.38
1
1
0.8
2
1
0.11
3
1
0.72
4
1
0.37
# Share of each turnover class: per-class row counts divided by the total row count.
class_counts = df['turnover'].value_counts()
turnover_rate = class_counts / len(df)
turnover_rate
# Summary statistics of the dataset.
df.describe()
turnoverfloat64
satisfactionfloat64
count
14999
14999
mean
0.2380825388
0.6128335222
std
0.4259240994
0.2486306511
min
0
0.09
25%
0
0.44
50%
0
0.64
75%
0
0.82
max
1
1
# Compare employee characteristics between the two turnover groups.
turnover_Summary = df.groupby('turnover')
# 'department' and 'salary' still hold text at this point (they are only
# converted to integer codes further down), and pandas >= 2.0 raises a
# TypeError when asked to average text columns. numeric_only=True restricts
# the aggregation to the numeric features.
turnover_Summary.mean(numeric_only=True)
satisfactionfloat64
evaluationfloat64
0
0.6668095905
0.7154733987
1
0.4400980118
0.7181125735
# Correlation matrix.
# 'department' and 'salary' still hold text at this point, and pandas >= 2.0
# no longer drops non-numeric columns silently in DataFrame.corr() (it raises
# ValueError instead). Selecting the numeric/boolean columns explicitly works
# on every pandas version.
numeric_features = df.select_dtypes(include=['number', 'bool'])
corr = numeric_features.corr()
corr
turnoverfloat64
satisfactionfloat64
turnover
1
-0.3883749834
satisfaction
-0.3883749834
1
evaluation
0.006567120448
0.105021214
projectCount
0.02378718507
-0.142969586
averageMonthlyHours
0.07128717878
-0.02004811322
yearsAtCompany
0.1448221749
-0.1008660726
workAccident
-0.1546216337
0.05869724105
promotion
-0.06178810658
0.02560518571
# Show the correlation matrix as a heatmap, labelling both axes with the
# matrix's column names.
axis_labels = corr.columns.values
sns.heatmap(corr, xticklabels=axis_labels, yticklabels=axis_labels)
# Mean satisfaction of employees who stayed (turnover == 0) and of employees
# who left (turnover == 1).
stayed_mask = df['turnover'] == 0
left_mask = df['turnover'] == 1
emp_population = df.loc[stayed_mask, 'satisfaction'].mean()
emp_turnover_satisfaction = df.loc[left_mask, 'satisfaction'].mean()
# `!s` forces str() so the printed text matches plain string concatenation.
print(f'未离职员工满意度: {emp_population!s}')
print(f'离职员工满意度: {emp_turnover_satisfaction!s}')
未离职员工满意度: 0.666809590479524
离职员工满意度: 0.4400980117614114
import scipy.stats as stats

# One-sample t-test: is the mean satisfaction of employees who left different
# from the mean satisfaction of employees who stayed?
# NOTE(review): the stayers' mean is passed as a fixed reference value
# (popmean), so its own sampling variability is not part of the test.
left_satisfaction = df.loc[df['turnover'] == 1, 'satisfaction']  # sample: employees who left
stats.ttest_1samp(a=left_satisfaction, popmean=emp_population)
def _plot_turnover_kde(column, xlabel, title):
    """Overlay the kernel density estimates of one feature for both turnover groups.

    Args:
        column: Name of the ``df`` column to plot.
        xlabel: Text for the x-axis label.
        title: Text for the figure title.

    Returns:
        The ``(fig, ax)`` pair the curves were drawn on.
    """
    fig, ax = plt.subplots(figsize=(15, 4))
    groups = ((0, 'b', 'no turnover'), (1, 'r', 'turnover'))
    for flag, color, label in groups:
        # `fill=` is the current name of the deprecated `shade=` keyword.
        sns.kdeplot(
            df.loc[df['turnover'] == flag, column],
            color=color,
            fill=True,
            label=label,
            ax=ax,
        )
    # The y-axis of a KDE is a probability density, not a count.
    ax.set(xlabel=xlabel, ylabel='Density', title=title)
    # Request the legend explicitly so the group labels are always shown.
    ax.legend()
    return fig, ax


# Two groups can have similar means and still be distributed differently, so
# compare the estimated probability density of each feature for employees who
# stayed versus employees who left.
for column, xlabel in (
    ('evaluation', 'Employee Evaluation'),
    ('averageMonthlyHours', 'Employee averageMonthlyHours'),
    ('satisfaction', 'Employee satisfaction'),
):
    fig, ax = _plot_turnover_kde(
        column,
        xlabel,
        f'{xlabel} Distribution - Turnover V.S. No Turnover',
    )
# Convert the text columns to integer category codes.
# NOTE(review): the codes follow the category order pandas infers, so they
# should not be read as an ordinal ranking (e.g. of salary) - confirm whether
# that matters downstream.
for text_column in ('department', 'salary'):
    df[text_column] = df[text_column].astype('category').cat.codes
df.head()
turnoverint64
satisfactionfloat64
0
1
0.38
1
1
0.8
2
1
0.11
3
1
0.72
4
1
0.37
# Build the training and test sets.
from sklearn.model_selection import train_test_split

y = df['turnover']
X = df.drop(columns=['turnover'])
# Employees who left are only about 24% of the data, so the classes are
# imbalanced; without stratification the test set could end up with very few
# "left" rows. stratify=y keeps the left/stayed ratio the same in the original
# data, the training set and the test set. It would be unnecessary for a
# balanced sample.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=123,
    stratify=y,
)
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# Decision tree
dtree = DecisionTreeClassifier(
    criterion='entropy',
    # max_depth=3 would cap the depth of the tree as another guard against overfitting.
    # A float min_samples_leaf is a fraction of the training samples: every
    # leaf must keep at least 1% of them, which limits overfitting.
    min_samples_leaf=0.01,
    # Seed the tree like the train/test split so repeated runs give the same metrics.
    random_state=123,
)
dtree = dtree.fit(X_train, y_train)

# Predict once and reuse the result for every metric.
y_pred = dtree.predict(X_test)
# ROC AUC needs a ranking score, not hard labels: scoring 0/1 predictions
# collapses the ROC curve to a single operating point. Use the predicted
# probability of the positive class (column 1 = class 1, "left").
y_score = dtree.predict_proba(X_test)[:, 1]

print("--决策树--")
dt_roc_auc = roc_auc_score(y_test, y_score)
print("决策树AUC:%2.2f" % dt_roc_auc)
print(classification_report(y_test, y_pred))
--决策树--
决策树AUC:0.93
precision recall f1-score support
0 0.97 0.98 0.97 1714
1 0.93 0.89 0.91 536
accuracy 0.96 2250
macro avg 0.95 0.93 0.94 2250
weighted avg 0.96 0.96 0.96 2250