import numpy as np
import pandas as pd
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
sns.set_style('darkgrid')
warnings.filterwarnings('ignore')
''' reading dataset '''
# Load the 10-year employee termination dataset (path is relative; assumes
# the CSV sits next to this notebook).
df = pd.read_csv('MFG10YearTerminationData.csv')
''' displaying first 5 rows '''
df.head()
''' shape of data '''
df.shape
''' checking null values '''
df.isnull().sum()
''' checking info of data '''
df.info()
''' checking duplicates ros '''
# NOTE(review): typo in the marker above ('ros' -> 'rows'); this displays
# fully duplicated rows without removing them.
df[df.duplicated()]
''' dropping some columns '''
# Drop identifier/date-key columns and gender_full (presumably redundant
# with gender_short — verify) before analysis.
df.drop(['EmployeeID', 'birthdate_key', 'recorddate_key', 'gender_full'], axis=1, inplace=True)
df.head()
''' unique values in city_name '''
df.city_name.unique()
''' unique values in department_name '''
df.department_name.unique()
''' unique values in termreason_desc '''
df.termreason_desc.unique()
''' unique values in BUSINESS_UNIT '''
df.BUSINESS_UNIT.unique()
''' unique values in job_title '''
df.job_title.unique()
''' converting job_title into 3 categories '''
# Raw job titles grouped into seniority tiers.  Spellings intentionally match
# the CSV, including 'CHief Information Officer' and
# 'Accounts Receiveable Clerk'.
board = [
    'CEO', 'VP Stores', 'Director, Recruitment', 'VP Human Resources',
    'VP Finance', 'Director, Accounts Receivable', 'Director, Accounting',
    'Director, Employee Records', 'Director, Accounts Payable',
    'Director, HR Technology', 'Director, Investments',
    'Director, Labor Relations', 'Director, Audit', 'Director, Training',
    'Director, Compensation',
]
executive = [
    'Exec Assistant, VP Stores', 'Exec Assistant, Legal Counsel',
    'CHief Information Officer', 'Exec Assistant, Human Resources',
    'Exec Assistant, Finance',
]
manager = [
    'Customer Service Manager', 'Processed Foods Manager', 'Meats Manager',
    'Bakery Manager', 'Produce Manager', 'Store Manager', 'Trainer',
    'Dairy Manager',
]
employee = [
    'Meat Cutter', 'Dairy Person', 'Produce Clerk', 'Baker', 'Cashier',
    'Shelf Stocker', 'Recruiter', 'HRIS Analyst', 'Accounting Clerk',
    'Benefits Admin', 'Labor Relations Analyst', 'Accounts Receiveable Clerk',
    'Accounts Payable Clerk', 'Auditor', 'Compensation Analyst',
    'Investment Analyst', 'Systems Analyst', 'Corporate Lawyer',
    'Legal Counsel',
]

# Flattened lookup built once: raw title -> tier label (O(1) per call instead
# of scanning four lists).
_tier_by_title = {
    title: tier
    for tier, titles in (('board', board), ('executive', executive),
                         ('manager', manager), ('employee', employee))
    for title in titles
}

def job_title(job):
    """Return the seniority tier ('board'/'executive'/'manager'/'employee')
    for a raw job title, or None when the title is not recognized."""
    return _tier_by_title.get(job)
# Replace each raw title in-place with its tier label; titles outside the
# four lists become None/NaN (job_title returns None for them).
df['job_title'] = df['job_title'].map(job_title)
df.job_title.unique()
''' now change the city_names '''
# Approximate population for every city appearing in the dataset.  Both the
# correct 'New Westminster' and the misspelled 'New Westminister' are listed
# with the same value so either spelling in the CSV maps successfully.
city_population = {
    'Vancouver': 2313328,
    'Victoria': 289625,
    'Nanaimo': 84905,
    'New Westminster': 58549,
    'Kelowna': 125109,
    'Burnaby': 202799,
    'Kamloops': 68714,
    'Prince George': 65558,
    'Cranbrook': 18610,
    'Surrey': 394976,
    'Richmond': 182000,
    'Terrace': 19443,
    'Chilliwack': 77000,
    'Trail': 9707,
    'Langley': 23606,
    'Vernon': 47274,
    'Squamish': 19512,
    'Quesnel': 13799,
    'Abbotsford': 151683,
    'North Vancouver': 48000,
    'Fort St John': 17402,
    'Williams Lake': 14168,
    'West Vancouver': 42694,
    'Port Coquitlam': 114565,
    'Aldergrove': 12363,
    'Fort Nelson': 3561,
    'Nelson': 9813,
    'New Westminister': 58549,
    'Grand Forks': 4049,
    'White Rock': 66450,
    'Haney': 82256,
    'Princeton': 2828,
    'Dawson Creek': 10802,
    'Bella Bella': 1019,
    'Ocean Falls': 129,
    'Pitt Meadows': 174410,
    'Cortes Island': 1042,
    'Valemount': 1021,
    'Dease Lake': 335,
    'Blue River': 157
}
def change_city_into_city_pop(city):
    """Return the population for `city`, or None when the city is unknown.

    Bug fix: the original did `city_population(city)` — calling the dict —
    which raises TypeError ('dict' object is not callable) on every call.
    A dict lookup must use subscription; `.get` mirrors the None-for-missing
    semantics of `Series.map(dict)`.
    """
    return city_population.get(city)
# Map each city name to its population using the dict directly; any city
# missing from city_population would become NaN.
df['city_name'] = df['city_name'].map(city_population)
''' now converting city_name into rural or town '''
def change_city(population):
    """Bucket a city population into a size category.

    Returns 'mega' for population >= 100,000, 'town' for
    10,000 <= population < 100,000, and 'rural' otherwise.

    Fixes vs. original: the local variable shadowed the builtin `str`,
    bitwise `&` was used where a chained comparison is idiomatic, and both
    `if`s were evaluated even after a match.  Behavior is unchanged.
    """
    if population >= 100000:
        return 'mega'
    if population >= 10000:  # 10,000 <= population < 100,000
        return 'town'
    return 'rural'
df['city_name'] = df.city_name.map(change_city)
df['city_name'].unique()
''' now displaying dataset after mapping values '''
df.head()
''' barplot of status'''
# Count of each STATUS value.  Fix: seaborn >= 0.12 removed positional x/y
# arguments from barplot — they must be passed as keywords.
status_counts = df.STATUS.value_counts()
plt.figure(figsize=(10, 5))
sns.barplot(x=status_counts.index, y=status_counts.values);
plt.xlabel('STATUS', fontsize=20);
plt.ylabel('Count', fontsize=20)
''' kde plot '''
# One density plot per numeric feature.  Fix: `shade=` was deprecated in
# seaborn 0.11 and later removed; `fill=` is the replacement.
for column in ['store_name', 'STATUS_YEAR', 'length_of_service', 'age']:
    plt.figure(figsize=(10, 5))
    sns.kdeplot(df[column], color='#32384D', fill=True, label='terminated', alpha=0.5)
    plt.title(column, size=14)
df.head()
''' preprocessing of dataset '''
# Integer-encode every remaining categorical column.  Codes follow order of
# first appearance (enumerate over .unique()), exactly matching the original
# eight copy-pasted per-column dict comprehensions; the per-column mapping
# dicts were never used afterwards, so a single loop replaces them.
for _col in ['city_name', 'department_name', 'job_title', 'gender_short',
             'termreason_desc', 'termtype_desc', 'STATUS', 'BUSINESS_UNIT']:
    _codes = {value: key for key, value in enumerate(df[_col].unique())}
    df[_col] = df[_col].map(_codes)
''' after preprocessing data, top 5 rows '''
df.head()
''' correlation matrix '''
# All retained columns are numeric after label encoding except the two raw
# date-key strings dropped below, so df.corr() is meaningful for the encoded
# features.
plt.figure(figsize=(15, 10))
sns.heatmap(df.corr(), annot=True);
plt.title('Correlation Matrix', fontsize=20);
''' checking correlation of STATUS w.r.t other features '''
# [1:] skips STATUS's self-correlation (always 1.0) at the top of the sort.
df.corr()['STATUS'].sort_values(ascending=False)[1:]
''' dropping some columns '''
# Drop the termination descriptors and the raw date-key strings before
# modeling (the descriptors presumably encode the outcome itself — verify
# against the raw data before trusting model scores).
df.drop(['termreason_desc', 'termtype_desc', 'orighiredate_key', 'terminationdate_key'], axis=1, inplace=True)
df.head()
''' train, test, evaluation function '''
def result(X, y, ts, rs, model):
    """Split, scale, fit, and report on one classifier.

    Splits X/y with test fraction `ts` and random_state `rs`, standardizes
    features using training-set statistics only, fits `model`, then prints
    the classification report and accuracy score and draws a
    confusion-matrix heatmap.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=ts, random_state=rs)

    # Fit the scaler on the training split only to avoid test-set leakage.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    separator = "-" * 100
    print("Classification Report: \n", classification_report(y_test, y_pred))
    print(separator)
    print()
    print("Accuracy Score: ", accuracy_score(y_test, y_pred))
    print(separator)
    print()
    print("Confusion Matrix: ")
    plt.figure(figsize=(10, 5))
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='g');
    plt.title('Confusion Matrix', fontsize=20)
''' independent and dependent features '''
X = df[['age', 'length_of_service', 'city_name', 'department_name', 'job_title',
        'store_name', 'gender_short', 'STATUS_YEAR', 'BUSINESS_UNIT']]
# Fix: use a 1-D Series for the target.  df[['STATUS']] is a single-column
# DataFrame, which makes sklearn estimators emit DataConversionWarning and
# ravel it internally on every fit.
y = df['STATUS']
''' Logistic Regression '''
model = LogisticRegression()
result(X, y, 0.25, 0, model)
''' RandomForest Classifier '''
rf = RandomForestClassifier()
result(X, y, 0.25, 42, rf)
''' KNN '''
knn = KNeighborsClassifier()
result(X, y, 0.3, 25, knn)