import numpy as np
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Plot styling; all library warnings are silenced from here on.
sns.set_style('darkgrid')
warnings.filterwarnings('ignore')

# Load the dataset from the working directory.
df = pd.read_csv('MFG10YearTerminationData.csv')

# First five rows.
df.head(n=5)

# (rows, columns) of the frame.
df.shape

# Number of missing values per column.
df.isna().sum()

# Column dtypes and non-null counts.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49653 entries, 0 to 49652
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 EmployeeID 49653 non-null int64
1 recorddate_key 49653 non-null object
2 birthdate_key 49653 non-null object
3 orighiredate_key 49653 non-null object
4 terminationdate_key 49653 non-null object
5 age 49653 non-null int64
6 length_of_service 49653 non-null int64
7 city_name 49653 non-null object
8 department_name 49653 non-null object
9 job_title 49653 non-null object
10 store_name 49653 non-null int64
11 gender_short 49653 non-null object
12 gender_full 49653 non-null object
13 termreason_desc 49653 non-null object
14 termtype_desc 49653 non-null object
15 STATUS_YEAR 49653 non-null int64
16 STATUS 49653 non-null object
17 BUSINESS_UNIT 49653 non-null object
dtypes: int64(5), object(13)
memory usage: 6.8+ MB
# Rows that are exact duplicates of an earlier row.
df.loc[df.duplicated()]

# Remove columns that are not used further in this script.
df.drop(columns=['EmployeeID', 'birthdate_key', 'recorddate_key', 'gender_full'], inplace=True)
df.head()

# Distinct values of the categorical columns, one column at a time.
df['city_name'].unique()
df['department_name'].unique()
df['termreason_desc'].unique()
df['BUSINESS_UNIT'].unique()
df['job_title'].unique()
# Raw job titles grouped into four categories.
# Spellings (e.g. 'CHief Information Officer', 'Accounts Receiveable Clerk')
# are kept exactly as written -- presumably they match the raw data; confirm
# against the CSV before "correcting" them.
board = [
    'CEO', 'VP Stores', 'Director, Recruitment', 'VP Human Resources', 'VP Finance',
    'Director, Accounts Receivable', 'Director, Accounting', 'Director, Employee Records',
    'Director, Accounts Payable', 'Director, HR Technology', 'Director, Investments',
    'Director, Labor Relations', 'Director, Audit', 'Director, Training',
    'Director, Compensation',
]
executive = [
    'Exec Assistant, VP Stores', 'Exec Assistant, Legal Counsel', 'CHief Information Officer',
    'Exec Assistant, Human Resources', 'Exec Assistant, Finance',
]
manager = [
    'Customer Service Manager', 'Processed Foods Manager', 'Meats Manager',
    'Bakery Manager', 'Produce Manager', 'Store Manager', 'Trainer', 'Dairy Manager',
]
employee = [
    'Meat Cutter', 'Dairy Person', 'Produce Clerk', 'Baker', 'Cashier',
    'Shelf Stocker', 'Recruiter', 'HRIS Analyst', 'Accounting Clerk',
    'Benefits Admin', 'Labor Relations Analyst', 'Accounts Receiveable Clerk',
    'Accounts Payable Clerk', 'Auditor', 'Compensation Analyst',
    'Investment Analyst', 'Systems Analyst', 'Corporate Lawyer', 'Legal Counsel',
]

# Title -> category table, built once so each lookup is a single dict access
# instead of up to four list scans. setdefault keeps the first category listed
# for a title, i.e. the same precedence as checking board, executive, manager,
# employee in that order.
_JOB_CATEGORY = {}
for _category, _titles in (
    ('board', board),
    ('executive', executive),
    ('manager', manager),
    ('employee', employee),
):
    for _title in _titles:
        _JOB_CATEGORY.setdefault(_title, _category)


def job_title(job):
    """Return the category of a raw job title.

    Args:
        job: A job title exactly as it appears in the four lists above.

    Returns:
        'board', 'executive', 'manager' or 'employee'; ``None`` when the
        title is in none of the lists (so ``Series.map`` yields a missing
        value for it).
    """
    return _JOB_CATEGORY.get(job)
# Collapse every raw title into its category; titles job_title() does not
# recognise come back as missing values.
df['job_title'] = df.job_title.map(job_title)
df['job_title'].unique()
# City name -> population lookup.
# NOTE(review): 'New Westminister' repeats the 'New Westminster' figure under a
# second spelling -- presumably to match a misspelling in the data; confirm.
city_population = {
    'Vancouver': 2313328,
    'Victoria': 289625,
    'Nanaimo': 84905,
    'New Westminster': 58549,
    'Kelowna': 125109,
    'Burnaby': 202799,
    'Kamloops': 68714,
    'Prince George': 65558,
    'Cranbrook': 18610,
    'Surrey': 394976,
    'Richmond': 182000,
    'Terrace': 19443,
    'Chilliwack': 77000,
    'Trail': 9707,
    'Langley': 23606,
    'Vernon': 47274,
    'Squamish': 19512,
    'Quesnel': 13799,
    'Abbotsford': 151683,
    'North Vancouver': 48000,
    'Fort St John': 17402,
    'Williams Lake': 14168,
    'West Vancouver': 42694,
    'Port Coquitlam': 114565,
    'Aldergrove': 12363,
    'Fort Nelson': 3561,
    'Nelson': 9813,
    'New Westminister': 58549,
    'Grand Forks': 4049,
    'White Rock': 66450,
    'Haney': 82256,
    'Princeton': 2828,
    'Dawson Creek': 10802,
    'Bella Bella': 1019,
    'Ocean Falls': 129,
    'Pitt Meadows': 174410,
    'Cortes Island': 1042,
    'Valemount': 1021,
    'Dease Lake': 335,
    'Blue River': 157,
}


def change_city_into_city_pop(city):
    """Return the population recorded for *city* in ``city_population``.

    Args:
        city: City name, spelled exactly as a key of ``city_population``.

    Returns:
        The integer population stored for that city.

    Raises:
        KeyError: If *city* has no entry in ``city_population``.
    """
    # Subscript the dict; calling it (``city_population(city)``) raised
    # "TypeError: 'dict' object is not callable" on every invocation.
    return city_population[city]
# Swap each city name for its population; names that are not keys of
# city_population become NaN.
df['city_name'] = df.city_name.map(city_population)
# Bucket a population figure into 'rural', 'town' or 'mega'.
def change_city(population):
    """Classify a population count into a settlement-size label.

    Args:
        population: Numeric population figure.

    Returns:
        'mega' for 100000 and above, 'town' for 10000 up to (but not
        including) 100000, and 'rural' otherwise. NaN compares false against
        both thresholds and therefore also yields 'rural'.
    """
    # Plain comparisons and early returns replace the bitwise `&` test and the
    # local variable that was named `str` (shadowing the builtin).
    if population >= 100000:
        return 'mega'
    if population >= 10000:
        return 'town'
    return 'rural'
# Replace each population figure with its size label from change_city().
df['city_name'] = df['city_name'].map(change_city)
df.city_name.unique()

# Dataset after the job-title and city mappings.
df.head()
# Bar plot of the number of records per STATUS value.
status_label = df.STATUS.value_counts()
plt.figure(figsize=(10, 5))
# x and y are passed by keyword: seaborn 0.12+ rejects them as positional
# arguments, while older releases accept the keyword form as well.
sns.barplot(x=status_label.index, y=status_label.values);
plt.xlabel('STATUS', fontsize=20);
plt.ylabel('Count', fontsize=20)
# Kernel density estimate of each numeric column, one figure per column.
# NOTE(review): the series is not filtered by STATUS, so the 'terminated'
# label is attached to every record; no legend is drawn, so it is not shown.
for column in ['store_name', 'STATUS_YEAR', 'length_of_service', 'age']:
    plt.figure(figsize=(10, 5))
    # `fill` is the replacement for `shade`, which seaborn deprecated in 0.11
    # and has announced will become an error.
    sns.kdeplot(df[column], color='#32384D', fill=True, label='terminated', alpha=0.5)
    plt.title(column, size=14)
df.head()
# Label-encode the categorical columns: every distinct value is replaced by
# the position at which it first appears in its column.
def _first_seen_codes(series):
    """Map each distinct value of *series* to its order of first appearance."""
    return {label: code for code, label in enumerate(series.unique())}


city_name_label = _first_seen_codes(df['city_name'])
df['city_name'] = df['city_name'].map(city_name_label)

department_name_label = _first_seen_codes(df['department_name'])
df['department_name'] = df['department_name'].map(department_name_label)

job_title_label = _first_seen_codes(df['job_title'])
df['job_title'] = df['job_title'].map(job_title_label)

gender_short_label = _first_seen_codes(df['gender_short'])
df['gender_short'] = df['gender_short'].map(gender_short_label)

term_desc_label = _first_seen_codes(df['termreason_desc'])
df['termreason_desc'] = df['termreason_desc'].map(term_desc_label)

termtype_desc_label = _first_seen_codes(df['termtype_desc'])
df['termtype_desc'] = df['termtype_desc'].map(termtype_desc_label)

status_label = _first_seen_codes(df['STATUS'])
df['STATUS'] = df['STATUS'].map(status_label)

business_label = _first_seen_codes(df['BUSINESS_UNIT'])
df['BUSINESS_UNIT'] = df['BUSINESS_UNIT'].map(business_label)

# First five rows after encoding.
df.head()
# Correlation matrix.
# Only numeric columns are correlated: 'orighiredate_key' and
# 'terminationdate_key' are still object-dtype at this point, and pandas 2.x
# raises on non-numeric columns instead of silently skipping them.
# The matrix is computed once and reused for both the heatmap and the ranking.
numeric_corr = df.select_dtypes(include='number').corr()
plt.figure(figsize=(15, 10))
sns.heatmap(numeric_corr, annot=True);
plt.title('Correlation Matrix', fontsize=20);

# Correlation of every other feature with STATUS, highest first. STATUS is
# removed by label rather than by assuming it sorts into first position.
numeric_corr['STATUS'].drop('STATUS').sort_values(ascending=False)
# Remove the termination-description and raw date columns; none of them
# appear in the feature list used for modelling.
df.drop(
    columns=['termreason_desc', 'termtype_desc', 'orighiredate_key', 'terminationdate_key'],
    inplace=True,
)
df.head()
# Train / test / evaluation helper.
def result(X, y, ts, rs, model):
    """Split, scale, fit and report a classifier's hold-out performance.

    Args:
        X: Feature table accepted by ``train_test_split``.
        y: Target labels. A single-column 2-D input (such as
            ``df[['STATUS']]``) is flattened to 1-D before fitting.
        ts: ``test_size`` forwarded to ``train_test_split``.
        rs: ``random_state`` forwarded to ``train_test_split``.
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in
            place on the scaled training split.

    Returns:
        None. The classification report, accuracy score and a confusion-matrix
        heatmap are printed / drawn as side effects.
    """
    # Estimators expect a 1-D target; a column vector only works through an
    # implicit conversion that emits a warning.
    y = np.asarray(y)
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ts, random_state=rs)

    # The scaler is fitted on the training split only and then applied to the
    # test split, so no test-set statistics reach the model.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    separator = "-" * 100
    print("Classification Report: \n", classification_report(y_test, pred))
    print(separator)
    print()
    print("Accuracy Score: ", accuracy_score(y_test, pred))
    print(separator)
    print()
    print("Confusion Matrix: ")
    plt.figure(figsize=(10, 5))
    # fmt='g' keeps the cell counts from being shown in scientific notation.
    sns.heatmap(confusion_matrix(y_test, pred), annot=True, fmt='g')
    plt.title('Confusion Matrix', fontsize=20)
# Independent (feature) and dependent (target) variables.
feature_columns = [
    'age', 'length_of_service', 'city_name', 'department_name', 'job_title',
    'store_name', 'gender_short', 'STATUS_YEAR', 'BUSINESS_UNIT',
]
X = df[feature_columns]
y = df[['STATUS']]

# Logistic Regression, evaluated on a 25% hold-out with split seed 0.
model = LogisticRegression()
result(X, y, ts=0.25, rs=0, model=model)
Classification Report:
precision recall f1-score support
0 0.97 1.00 0.98 12034
1 0.55 0.08 0.15 380
accuracy 0.97 12414
macro avg 0.76 0.54 0.57 12414
weighted avg 0.96 0.97 0.96 12414
----------------------------------------------------------------------------------------------------
Accuracy Score: 0.9698727243434832
----------------------------------------------------------------------------------------------------
Confusion Matrix:
# Random forest, evaluated on a 25% hold-out with split seed 42.
# NOTE(review): the forest itself is created without a random_state, and the
# split seed differs from the other models' runs.
rf = RandomForestClassifier()
result(X, y, ts=0.25, rs=42, model=rf)
Classification Report:
precision recall f1-score support
0 0.99 1.00 1.00 12058
1 0.91 0.74 0.82 356
accuracy 0.99 12414
macro avg 0.95 0.87 0.91 12414
weighted avg 0.99 0.99 0.99 12414
----------------------------------------------------------------------------------------------------
Accuracy Score: 0.9906557112937007
----------------------------------------------------------------------------------------------------
Confusion Matrix:
# K-nearest neighbours with default settings, evaluated on a 30% hold-out
# with split seed 25.
knn = KNeighborsClassifier()
result(X, y, ts=0.3, rs=25, model=knn)
Classification Report:
precision recall f1-score support
0 0.98 1.00 0.99 14458
1 0.74 0.44 0.55 438
accuracy 0.98 14896
macro avg 0.86 0.72 0.77 14896
weighted avg 0.98 0.98 0.98 14896
----------------------------------------------------------------------------------------------------
Accuracy Score: 0.9789876476906552
----------------------------------------------------------------------------------------------------
Confusion Matrix: