import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as matplot
import seaborn as sns
# 以内嵌的方式显示图像,不跳出新的窗口
%matplotlib inline
# Load the HR dataset from the working directory; pandas assigns its default
# integer index because no index column is requested.
df = pd.read_csv('HR_comma_sep.csv')
# Quick inspection: leading rows, (rows, columns), per-column dtypes, and
# whether any column contains missing values.
df.head()
df.shape
df.dtypes
df.isna().any()
# Map the raw header names this script expects to the shorter names used in
# the rest of the analysis (note that 'average_montly_hours' is kept exactly
# as written).
column_aliases = {
    'satisfaction_level': 'satisfaction',
    'last_evaluation': 'evaluation',
    'number_project': 'projectCount',
    'average_montly_hours': 'averageMonthlyHours',
    'time_spend_company': 'yearsAtCompany',
    'Work_accident': 'workAccident',
    'promotion_last_5years': 'promotion',
    'sales': 'department',
    'left': 'turnover',
}
df = df.rename(columns=column_aliases)

# Move the label column to the front: pop() removes 'turnover' from the frame
# and hands it back, insert() then places it at position 0.
front = df.pop('turnover')
df.insert(0, 'turnover', front)
df.head()
# Share of each turnover class: value counts divided by the number of rows.
turnover_rate = df.turnover.value_counts() / len(df.turnover)
turnover_rate
df.describe()

# Per-class feature means.
# 'department' and 'salary' are still string columns at this point (they are
# only converted to integer codes later in the script), so the aggregation is
# restricted to numeric columns. Recent pandas versions raise a TypeError on
# non-numeric columns instead of silently dropping them.
turnover_Summary = df.groupby('turnover')
turnover_Summary.mean(numeric_only=True)

# Correlation matrix, computed over the numeric columns only for the same
# reason. select_dtypes is used rather than corr(numeric_only=True) so the
# line also works on pandas versions that predate that keyword.
corr = df.select_dtypes(include='number').corr()
corr

# Show the correlation matrix as a heatmap, labelling both axes with the
# column names.
sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values)
# Mean satisfaction of employees who stayed (turnover == 0) and of employees
# who left (turnover == 1).
stayed_satisfaction = df.loc[df['turnover'] == 0, 'satisfaction']
left_satisfaction = df.loc[df['turnover'] == 1, 'satisfaction']
emp_population = stayed_satisfaction.mean()
emp_turnover_satisfaction = left_satisfaction.mean()
print('未离职员工满意度: ', emp_population, sep='')
print('离职员工满意度: ', emp_turnover_satisfaction, sep='')

import scipy.stats as stats

# One-sample t-test: the satisfaction scores of employees who left are the
# sample, and the mean satisfaction of employees who stayed is used as the
# reference population mean.
stats.ttest_1samp(a=left_satisfaction, popmean=emp_population)
# Two groups can agree on a summary statistic and still be distributed
# differently, so compare the estimated probability density of each feature
# for employees who stayed (blue) against employees who left (red).
# Each entry is (dataframe column, wording used in the axis label and title).
# NOTE(review): `shade` is deprecated in newer seaborn releases in favour of
# `fill` -- confirm the installed version before switching.
kde_panels = (
    ('evaluation', 'Evaluation'),
    ('averageMonthlyHours', 'averageMonthlyHours'),
    ('satisfaction', 'satisfaction'),
)
for column, caption in kde_panels:
    # One wide figure per feature.
    fig = plt.figure(figsize=(15, 4))
    stayed_values = df.loc[df['turnover'] == 0, column]
    left_values = df.loc[df['turnover'] == 1, column]
    # Both curves are drawn onto the same axes.
    ax = sns.kdeplot(stayed_values, color='b', shade=True, label='no turnover')
    ax = sns.kdeplot(left_values, color='r', shade=True, label='turnover')
    ax.set(xlabel='Employee ' + caption, ylabel='Frequency')
    plt.title('Employee ' + caption + ' Distribution - Turnover V.S. No Turnover')
# Convert the string-typed columns to integer category codes.
for text_column in ('department', 'salary'):
    df[text_column] = df[text_column].astype('category').cat.codes
df.head()
# Build the training and test sets.
from sklearn.model_selection import train_test_split

# The label is 'turnover'; the features are every other column.
y = df['turnover']
X = df.drop(columns=['turnover'])

# Employees who left make up only about 24% of the data, so the classes are
# imbalanced; without stratify the test set could end up with very few
# turnover rows. Stratifying on y keeps the leaver / non-leaver ratio the same
# in the original, training and test data. With balanced classes this would
# not be necessary.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=123,
    stratify=y,
)
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# Decision tree classifier.
dtree = tree.DecisionTreeClassifier(
    criterion='entropy',
    # max_depth=3 would cap the tree depth as another guard against
    # overfitting; it is intentionally left disabled here.
    # A float min_samples_leaf is a fraction of the training samples: every
    # leaf must hold at least 1% of them, which limits overfitting.
    min_samples_leaf=0.01,
    # Seed the tree with the same value as the train/test split so repeated
    # runs give the same model and metrics.
    random_state=123,
)
dtree = dtree.fit(X_train, y_train)

# Predict once and reuse: hard labels feed the classification report, class
# probabilities feed the ROC AUC.
dt_pred = dtree.predict(X_test)
# predict_proba columns follow dtree.classes_; with the 0/1 turnover label,
# column 1 is the probability of turnover == 1.
dt_proba = dtree.predict_proba(X_test)[:, 1]

print("--决策树--")
# ROC AUC measures how well the scores rank positives above negatives, so it
# must be given probabilities. Passing hard 0/1 predictions reduces the curve
# to a single operating point and misreports the AUC.
dt_roc_auc = roc_auc_score(y_test, dt_proba)
print("决策树AUC:%2.2f" %dt_roc_auc)
print(classification_report(y_test, dt_pred))