# categorical features identified by their data type
# and put into a new dataframe
df_categorical = df.loc[:, df.dtypes == object]
df_categorical = df_categorical[['gender', 'education_level', 'marital_status', 'income_category','card_category','attrition_flag']]
df_categorical.head()
label = preprocessing.LabelEncoder()
df_categorical_encoded = pd.DataFrame()
for i in df_categorical.columns:
    df_categorical_encoded[i] = label.fit_transform(df_categorical[i])
def cramers_V(var1, var2):
    crosstab = np.array(pd.crosstab(var1, var2, rownames=None, colnames=None))  # build the cross table
    stat = chi2_contingency(crosstab)[0]  # keep the chi-squared test statistic
    obs = np.sum(crosstab)                # number of observations
    mini = min(crosstab.shape) - 1        # min(number of rows, number of columns) - 1
    return np.sqrt(stat / (obs * mini))   # Cramer's V includes the square root
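# For reference: this is Cramer's V, V = sqrt(chi2 / (n * (min(r, c) - 1))),
# which ranges from 0 (no association) to 1 (perfect association).
# Quick sanity check (a minimal sketch): a multi-category variable paired
# with itself should score (essentially) 1.0
cramers_V(df_categorical_encoded['education_level'], df_categorical_encoded['education_level'])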
rows = []
for var1 in df_categorical_encoded:
    col = []
    for var2 in df_categorical_encoded:
        cramers = cramers_V(df_categorical_encoded[var1], df_categorical_encoded[var2])  # Cramer's V for each pair
        col.append(round(cramers, 2))  # keep the rounded value
    rows.append(col)
cramers_results = np.array(rows)
# Putting all the results into a dataframe
cramerv_matrix = pd.DataFrame(cramers_results, columns = df_categorical_encoded.columns, index =df_categorical_encoded.columns)
# visualising
mask = np.triu(np.ones_like(cramerv_matrix, dtype=bool))
cat_heatmap = sns.heatmap(cramerv_matrix, mask=mask, vmin=0, vmax=1, annot=True, cmap='BrBG')  # Cramer's V ranges from 0 to 1
cat_heatmap.set_title("Cramer's V Heatmap (categorical features)", fontdict={'fontsize':12}, pad=12);
# creating a separate dataframe for the numerical features,
# with the target one-hot encoded so it can enter the correlation matrix
df_numerical = df.loc[:, df.dtypes != object].copy()
df_numerical['attrition_flag'] = df['attrition_flag']
oh=pd.get_dummies(df_numerical['attrition_flag'])
df_numerical=df_numerical.drop(['attrition_flag'],axis=1)
df_numerical=df_numerical.join(oh)
df_numerical.head()
# using Pearson's r for the numerical features
num_corr = df_numerical.corr()
plt.figure(figsize=(16, 6))
mask = np.triu(np.ones_like(num_corr, dtype=bool))
num_heatmap = sns.heatmap(num_corr, mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')
num_heatmap.set_title('Pearson Correlation Heatmap (numerical features)', fontdict={'fontsize':12}, pad=12);
fig, ax=plt.subplots(ncols=2,figsize=(15, 5))
heatmap = sns.heatmap(num_corr[['Existing Customer']].sort_values(by='Existing Customer', ascending=False), ax=ax[0],vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Features Correlating with Existing Customers', fontdict={'fontsize':18}, pad=16);
heatmap = sns.heatmap(num_corr[['Attrited Customer']].sort_values(by='Attrited Customer', ascending=False), ax=ax[1],vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Features Correlating with Attrited Customers', fontdict={'fontsize':18}, pad=16);
fig.tight_layout(pad=5)
df2 = df.copy()
# features with weak correlations, as noted earlier, are dropped
df2 = df2.drop(columns=["credit_limit", "avg_open_to_buy", "months_on_book", "customer_age", "dependent_count"])
# encoding target feature
attrition_map = {'Attrited Customer': 1, 'Existing Customer': 0}
df2["attrition_flag"] = df2["attrition_flag"].replace(attrition_map)
# combining categories within some features together
df2.education_level = df2.education_level.replace({'Graduate':'Tertiary','Post-Graduate':'Tertiary','Doctorate':'Tertiary'})
df2.income_category = df2.income_category.replace({'$40K - $60K':'$40K - $80K','$60K - $80K':'$40K - $80K'})
# get dummies for rest of categorical features
df2.columns = df2.columns.str.replace(' ', '')
dummies = ['gender','education_level', 'marital_status', 'income_category','card_category']
dummy_data = pd.get_dummies(df2[dummies])
df2 = pd.concat([df2, dummy_data], axis = 1)
df2.drop(dummies, axis=1, inplace=True)
df2.head(3)
# checking encoding results
df2.info()
# X contains the independent variables, y the dependent variable (target)
X = df2.drop('attrition_flag', axis = 1)
y = df2['attrition_flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 77, stratify=y)
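# sanity check (a small sketch): with stratify=y, both splits should
# preserve the original class proportions almost exactly
print(y_train.value_counts(normalize=True).round(3))
print(y_test.value_counts(normalize=True).round(3))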
# visualising the imbalance of the training set
fig,ax=plt.subplots(figsize=(6, 5))
patches, texts, autotexts = ax.pie(x=y_train.value_counts(),labels=["Existing customers","Attrited Customers"],colors=["#c2cccc", "#a1e6e4"], autopct="%.1f%%")
fig.suptitle("Checking the imbalance of the Attrition class in the training set",y=1,fontweight="bold",fontsize=15)
[autotext.set_color('#000000') for autotext in autotexts]
plt.show()
# perform random under-sampling on training dataset
rus = RandomUnderSampler(random_state= 77)
X_rus, y_rus = rus.fit_resample(X_train, y_train)
# visualising the outcome
fig,ax=plt.subplots(figsize=(6, 5))
patches, texts, autotexts = ax.pie(x=y_rus.value_counts(),labels=["Existing customers","Attrited Customers"],colors=["#c2cccc", "#a1e6e4"], autopct="%.1f%%")
fig.suptitle("Imbalanced class in the training set after under-sampling",y=1,fontweight="bold",fontsize=15)
[autotext.set_color('#000000') for autotext in autotexts]
plt.show()
# over-sampling the training set with SMOTE
sm = SMOTE(random_state = 77, sampling_strategy = "not majority")
X_smote, y_smote = sm.fit_resample(X_train, y_train)
# visualising the outcome
fig,ax=plt.subplots(figsize=(6, 5))
patches, texts, autotexts = ax.pie(x=y_smote.value_counts(),labels=["Existing customers","Attrited Customers"],colors=["#c2cccc", "#a1e6e4"], autopct="%.1f%%")
fig.suptitle("Imbalanced class in the training set after under-sampling",y=1,fontweight="bold",fontsize=15)
[autotext.set_color('#000000') for autotext in autotexts]
plt.show()
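# Caveat (a minimal sketch, not used below): if X_smote or X_rus were passed to a
# cross-validated search, the validation folds would themselves contain resampled
# rows. imbalanced-learn's own pipeline avoids this by resampling only the
# training portion of each fold:
from imblearn.pipeline import Pipeline as ImbPipeline
smote_clf = ImbPipeline(steps=[("smote", SMOTE(random_state=77)),
                               ("classifier", LogisticRegression(max_iter=600))])
smote_clf.fit(X_train, y_train)  # can be dropped into GridSearchCV directly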
# baseline logistic regression on the raw (unscaled) features
model = LogisticRegression(max_iter=600)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*model.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
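# Note: roc_auc_score above is computed from hard 0/1 predictions, which
# understates ranking ability; a sketch of the probability-based AUC and curve:
from sklearn.metrics import roc_curve
y_test_proba = model.predict_proba(X_test)[:, 1]
print(f'Probability-based ROC AUC: {100*roc_auc_score(y_test, y_test_proba)}%')
fpr, tpr, _ = roc_curve(y_test, y_test_proba)
plt.plot(fpr, tpr, label='logistic regression')
plt.plot([0, 1], [0, 1], linestyle='--', label='chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()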
numerical = ['total_relationship_count', 'months_inactive_12_month',
'contacts_count_12_month', 'total_revolving_bal',
'total_amt_change_q4_q1', 'total_trans_amt',
'total_trans_count', 'total_count_change_q4_q1', 'avg_utilization_ratio']
numeric_transformer = MinMaxScaler()
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numerical)], remainder="passthrough")  # keep the one-hot encoded features alongside the scaled numericals
classifier = LogisticRegression(random_state= 77)
clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of unoptimised logistics model")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
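# With classes this imbalanced, raw counts can hide the recall gap; a sketch of
# the row-normalised matrix (assuming scikit-learn >= 0.22, where
# confusion_matrix accepts normalize), in which each cell reads as per-class recall:
cm_norm = confusion_matrix(y_test, y_test_pred, normalize='true')
sns.heatmap(cm_norm, annot=True, fmt='.2f', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()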
clf.fit(X_rus, y_rus)
y_test_pred = clf.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of unoptimised logistics regression model, under-sampling")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf.fit(X_smote, y_smote)
y_test_pred = clf.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of unoptimised logistics regression model, SMOTE")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
# defining parameter range; lbfgs only supports the l2 penalty, so the grid is split in two
cw = [{0: 0.8, 1: 1}, {0: 0.85, 1: 1}, {0: 0.9, 1: 1}, {0: 0.95, 1: 1}]
Cs = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = [{'classifier__solver': ['liblinear'], 'classifier__penalty': ['l1', 'l2'], 'classifier__class_weight': cw, 'classifier__C': Cs},
              {'classifier__solver': ['lbfgs'], 'classifier__penalty': ['l2'], 'classifier__class_weight': cw, 'classifier__C': Cs}]
grid = GridSearchCV(clf, param_grid, refit = True, scoring= 'accuracy', verbose = 3)
# fitting the model for grid search
grid.fit(X_train, y_train)
print(grid.best_estimator_)
print('Best penalty:', grid.best_estimator_.get_params()['classifier__penalty'])
print('Best C:', grid.best_estimator_.get_params()['classifier__C'])
print('Best class weight:', grid.best_estimator_.get_params()['classifier__class_weight'])
print('Best solver:', grid.best_estimator_.get_params()['classifier__solver'])
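# Beyond the single best estimator, it can help to see how close the runner-up
# settings were (a small sketch over the fitted grid object):
cv_results = pd.DataFrame(grid.cv_results_)
cv_results.sort_values('rank_test_score')[['params', 'mean_test_score', 'std_test_score']].head()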
numerical = ['total_relationship_count', 'months_inactive_12_month',
'contacts_count_12_month', 'total_revolving_bal',
'total_amt_change_q4_q1', 'total_trans_amt',
'total_trans_count', 'total_count_change_q4_q1', 'avg_utilization_ratio']
numeric_transformer = MinMaxScaler()
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numerical)], remainder="passthrough")
classifier = LogisticRegression(solver="liblinear", class_weight={0: 0.8, 1: 1}, C=10, penalty="l2")
clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of optimised logistics model")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf.fit(X_rus, y_rus)
y_test_pred = clf.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of optimised logistics regression model, under-sampling")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf.fit(X_smote, y_smote)
y_test_pred = clf.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of optimised logistics regression model, SMOTE")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
numerical = ['total_relationship_count', 'months_inactive_12_month',
'contacts_count_12_month', 'total_revolving_bal',
'total_amt_change_q4_q1', 'total_trans_amt',
'total_trans_count', 'total_count_change_q4_q1', 'avg_utilization_ratio']
numeric_transformer = MinMaxScaler()
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numerical)], remainder="passthrough")
classifier = LogisticRegression(random_state= 77)
rfecv = RFECV(estimator= classifier)
clf2 = Pipeline(steps=[("preprocessor", preprocessor), ("selector", rfecv), ("classifier", classifier)])
clf2.fit(X_train, y_train)
y_test_pred = clf2.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf2.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of unoptimised logistics model")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf2.fit(X_rus, y_rus)
y_test_pred = clf2.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf2.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of unoptimised logistics regression model, under-sampling")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf2.fit(X_smote, y_smote)
y_test_pred = clf2.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf2.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of unoptimised logistics regression model, SMOTE")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
# defining parameter range; lbfgs only supports the l2 penalty, so the grid is split in two
cw = [{0: 0.8, 1: 1}, {0: 0.85, 1: 1}, {0: 0.9, 1: 1}, {0: 0.95, 1: 1}]
Cs = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = [{'classifier__solver': ['liblinear'], 'classifier__penalty': ['l1', 'l2'], 'classifier__class_weight': cw, 'classifier__C': Cs},
              {'classifier__solver': ['lbfgs'], 'classifier__penalty': ['l2'], 'classifier__class_weight': cw, 'classifier__C': Cs}]
grid = GridSearchCV(clf2, param_grid, refit = True, scoring= 'accuracy', verbose = 3)
# fitting the model for grid search
grid.fit(X_train, y_train)
print(grid.best_estimator_)
print('Best penalty:', grid.best_estimator_.get_params()['classifier__penalty'])
print('Best C:', grid.best_estimator_.get_params()['classifier__C'])
print('Best class weight:', grid.best_estimator_.get_params()['classifier__class_weight'])
print('Best solver:', grid.best_estimator_.get_params()['classifier__solver'])
numerical = ['total_relationship_count', 'months_inactive_12_month',
'contacts_count_12_month', 'total_revolving_bal',
'total_amt_change_q4_q1', 'total_trans_amt',
'total_trans_count', 'total_count_change_q4_q1', 'avg_utilization_ratio']
numeric_transformer = MinMaxScaler()
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numerical)], remainder="passthrough")
classifier = LogisticRegression(solver="liblinear", class_weight={0: 0.85, 1: 1}, C=10, penalty="l1")
rfecv = RFECV(estimator= classifier)
clf2 = Pipeline(steps=[("preprocessor", preprocessor), ("selector", rfecv), ("classifier", classifier)])
clf2.fit(X_train, y_train)
y_test_pred = clf2.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf2.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of optimised logistics regression model")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf2.fit(X_rus, y_rus)
y_test_pred = clf2.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf2.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of optimised logistics model, under-sampling")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf2.fit(X_smote, y_smote)
y_test_pred = clf2.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf2.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of optimised logistics model, SMOTE")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
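# Before reading off the optimal feature count, the cross-validation curve of the
# selector can be plotted (a sketch assuming scikit-learn >= 1.0, where RFECV
# exposes cv_results_; older versions used grid_scores_ instead):
scores = clf2['selector'].cv_results_['mean_test_score']
plt.figure(figsize=(8, 4))
plt.plot(range(1, len(scores) + 1), scores, marker='o')
plt.xlabel('Number of features selected')
plt.ylabel('Mean cross-validated score')
plt.title('RFECV feature-selection curve')
plt.show()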
print(f'Optimal number of features: {rfecv.n_features_}')
# map the selected features back to names; the column order after the
# ColumnTransformer is the scaled numericals first, then the passthrough columns
transformed_cols = numerical + [c for c in X_train.columns if c not in numerical]
mask = clf2['selector'].get_support(indices=True)
new_features = pd.Index(transformed_cols)[mask]
new_features
select_vars = new_features.tolist()
# get the model's coefficients, mapped to the selected feature names
logReg_coeff = pd.DataFrame({'feature_name': select_vars, 'model_coefficient': clf2['classifier'].coef_.ravel()})
logReg_coeff = logReg_coeff.sort_values('model_coefficient',ascending=False)
# visualising
plt.figure(figsize=(10, 6))
fg3 = sns.barplot(x='feature_name', y='model_coefficient', data=logReg_coeff, palette="Blues_d")
fg3.set_xticklabels(logReg_coeff.feature_name, rotation=35)
plt.xlabel('Feature')
plt.ylabel('Coefficient')
plt.subplots_adjust(bottom=0.4)
plt.show()
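# Logistic regression coefficients live on the log-odds scale; exponentiating
# them gives odds ratios, which are often easier to interpret (values above 1
# increase the odds of attrition, values below 1 decrease them):
logReg_coeff['odds_ratio'] = np.exp(logReg_coeff['model_coefficient'])
logReg_coeff[['feature_name', 'odds_ratio']]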
numerical = ['total_relationship_count', 'months_inactive_12_month',
'contacts_count_12_month', 'total_revolving_bal',
'total_amt_change_q4_q1', 'total_trans_amt',
'total_trans_count', 'total_count_change_q4_q1', 'avg_utilization_ratio']
numeric_transformer = MinMaxScaler()
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numerical)], remainder="passthrough")
classifier = LogisticRegression(random_state= 77)
skb = SelectKBest(f_classif, k='all')
clf3 = Pipeline(steps=[("preprocessor", preprocessor), ("selector", skb), ("classifier", classifier)])
clf3.fit(X_train, y_train)
y_test_pred = clf3.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf3.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(12,6))
plt.title("Confusion Matrix of unoptimised logistics regression model")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf3.fit(X_rus, y_rus)
y_test_pred = clf3.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf3.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(12,6))
plt.title("Confusion Matrix of unoptimised model, under-sampling")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf3.fit(X_smote, y_smote)
y_test_pred = clf3.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf3.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(12,6))
plt.title("Confusion Matrix of unoptimised model, SMOTE")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
# defining parameter range; lbfgs only supports the l2 penalty, so the grid is split in two
cw = [{0: 1, 1: 1}, {0: 0.8, 1: 1}, {0: 0.85, 1: 1}, {0: 0.9, 1: 1}, {0: 0.95, 1: 1}]
Cs = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = [{'classifier__solver': ['liblinear'], 'classifier__penalty': ['l1', 'l2'], 'classifier__class_weight': cw, 'classifier__C': Cs, 'selector__k': list(range(1, 30))},
              {'classifier__solver': ['lbfgs'], 'classifier__penalty': ['l2'], 'classifier__class_weight': cw, 'classifier__C': Cs, 'selector__k': list(range(1, 30))}]
grid = GridSearchCV(clf3, param_grid, refit = True, scoring= 'accuracy', verbose = 3)
# fitting the model for grid search
grid.fit(X_train, y_train)
print(grid.best_estimator_)
print('Best penalty:', grid.best_estimator_.get_params()['classifier__penalty'])
print('Best C:', grid.best_estimator_.get_params()['classifier__C'])
print('Best class weight:', grid.best_estimator_.get_params()['classifier__class_weight'])
print('Best solver:', grid.best_estimator_.get_params()['classifier__solver'])
print('Best k value:', grid.best_estimator_.get_params()['selector__k'])
numerical = ['total_relationship_count', 'months_inactive_12_month',
'contacts_count_12_month', 'total_revolving_bal',
'total_amt_change_q4_q1', 'total_trans_amt',
'total_trans_count', 'total_count_change_q4_q1', 'avg_utilization_ratio']
numeric_transformer = MinMaxScaler()
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numerical)], remainder="passthrough")
classifier = LogisticRegression(solver="lbfgs", class_weight={0: 0.8, 1: 1}, C=1, penalty="l2")
skb = SelectKBest(f_classif, k=8)
clf3 = Pipeline(steps=[("preprocessor", preprocessor), ("selector", skb), ("classifier", classifier)])
clf3.fit(X_train, y_train)
y_test_pred = clf3.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf3.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(12,6))
plt.title("Confusion Matrix of optimised logistics regression model")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf3.fit(X_rus, y_rus)
y_test_pred = clf3.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf3.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(12,6))
plt.title("Confusion Matrix of optimised model, under-sampling")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf3.fit(X_smote, y_smote)
y_test_pred = clf3.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf3.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(12,6))
plt.title("Confusion Matrix of optimised model, SMOTE")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
# map the selected features back to names, again using the post-ColumnTransformer
# order: the scaled numericals first, then the passthrough columns
transformed_cols = numerical + [c for c in X_train.columns if c not in numerical]
mask = clf3['selector'].get_support(indices=True)
new_features = pd.Index(transformed_cols)[mask]
new_features
select_vars = new_features.tolist()
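# As a sanity check, the univariate ANOVA F-scores that SelectKBest used can be
# ranked to see why these columns survived (a sketch over the fitted selector):
f_scores = pd.Series(clf3['selector'].scores_, index=transformed_cols)
f_scores.sort_values(ascending=False).head(10)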
# get the model's coefficients, mapped to the selected feature names
logReg_coeff = pd.DataFrame({'feature_name': select_vars, 'model_coefficient': clf3['classifier'].coef_.ravel()})
logReg_coeff = logReg_coeff.sort_values('model_coefficient',ascending=False)
# visualising
plt.figure(figsize=(10, 6))
fg3 = sns.barplot(x='feature_name', y='model_coefficient', data=logReg_coeff, palette="Blues_d")
fg3.set_xticklabels(logReg_coeff.feature_name, rotation=35)
plt.xlabel('Feature')
plt.ylabel('Coefficient')
plt.subplots_adjust(bottom=0.4)
plt.show()