# categorical features identified by their data type
# and put into a new dataframe
df_categorical = df.loc[:, df.dtypes == object]
df_categorical = df_categorical[['gender', 'education_level', 'marital_status', 'income_category','card_category','attrition_flag']]
df_categorical.head()
label = preprocessing.LabelEncoder()
df_categorical_encoded = pd.DataFrame()
for i in df_categorical.columns:
    df_categorical_encoded[i] = label.fit_transform(df_categorical[i])
def cramers_V(var1, var2):
    crosstab = np.array(pd.crosstab(var1, var2, rownames=None, colnames=None))  # build the cross table
    stat = chi2_contingency(crosstab)[0]  # keep the chi-squared test statistic
    obs = np.sum(crosstab)                # number of observations
    mini = min(crosstab.shape) - 1        # min(number of rows, number of columns) - 1
    return np.sqrt(stat / (obs * mini))   # Cramer's V includes the square root
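# For reference: this is Cramer's V, V = sqrt(chi2 / (n * (min(r, c) - 1))),
# which ranges from 0 (no association) to 1 (perfect association).
# Quick sanity check (a minimal sketch): a multi-category variable paired
# with itself should score (essentially) 1.0
cramers_V(df_categorical_encoded['education_level'], df_categorical_encoded['education_level'])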
rows = []
for var1 in df_categorical_encoded:
    col = []
    for var2 in df_categorical_encoded:
        cramers = cramers_V(df_categorical_encoded[var1], df_categorical_encoded[var2])  # Cramer's V for each pair
        col.append(round(cramers, 2))  # keep the rounded value
    rows.append(col)
cramers_results = np.array(rows)
# Putting all the results into a dataframe
cramerv_matrix = pd.DataFrame(cramers_results, columns = df_categorical_encoded.columns, index =df_categorical_encoded.columns)
# visualising
mask = np.triu(np.ones_like(cramerv_matrix, dtype=bool))
cat_heatmap = sns.heatmap(cramerv_matrix, mask=mask, vmin=0, vmax=1, annot=True, cmap='BrBG')  # Cramer's V ranges from 0 to 1
cat_heatmap.set_title("Cramer's V Heatmap (categorical features)", fontdict={'fontsize':12}, pad=12);
# creating a separate dataframe for the numerical features,
# with the target one-hot encoded so it can enter the correlation matrix
df_numerical = df.loc[:, df.dtypes != object].copy()
df_numerical['attrition_flag'] = df['attrition_flag']
oh=pd.get_dummies(df_numerical['attrition_flag'])
df_numerical=df_numerical.drop(['attrition_flag'],axis=1)
df_numerical=df_numerical.join(oh)
df_numerical.head()
# using Pearson's r for the numerical features
num_corr = df_numerical.corr()
plt.figure(figsize=(16, 6))
mask = np.triu(np.ones_like(num_corr, dtype=bool))
num_heatmap = sns.heatmap(num_corr, mask=mask, vmin=-1, vmax=1, annot=True, cmap='BrBG')
num_heatmap.set_title('Pearson Correlation Heatmap (numerical features)', fontdict={'fontsize':12}, pad=12);
fig, ax=plt.subplots(ncols=2,figsize=(15, 5))
heatmap = sns.heatmap(num_corr[['Existing Customer']].sort_values(by='Existing Customer', ascending=False), ax=ax[0],vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Features Correlating with Existing Customers', fontdict={'fontsize':18}, pad=16);
heatmap = sns.heatmap(num_corr[['Attrited Customer']].sort_values(by='Attrited Customer', ascending=False), ax=ax[1],vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Features Correlating with Attrited Customers', fontdict={'fontsize':18}, pad=16);
fig.tight_layout(pad=5)
df2 = df.copy()
# features with weak correlations, as noted earlier, are dropped
df2 = df2.drop(columns=["credit_limit", "avg_open_to_buy", "months_on_book", "customer_age", "dependent_count"])
# encoding target feature
attrition_map = {'Attrited Customer': 1, 'Existing Customer': 0}
df2["attrition_flag"] = df2["attrition_flag"].replace(attrition_map)
# combining categories within some features together
df2.education_level = df2.education_level.replace({'Graduate':'Tertiary','Post-Graduate':'Tertiary','Doctorate':'Tertiary'})
df2.income_category = df2.income_category.replace({'$40K - $60K':'$40K - $80K','$60K - $80K':'$40K - $80K'})
# get dummies for rest of categorical features
df2.columns = df2.columns.str.replace(' ', '')
dummies = ['gender','education_level', 'marital_status', 'income_category','card_category']
dummy_data = pd.get_dummies(df2[dummies])
df2 = pd.concat([df2, dummy_data], axis = 1)
df2.drop(dummies, axis=1, inplace=True)
df2.head(3)
# checking encoding results
df2.info()
# X contains the independent variables, y the dependent variable (target)
X = df2.drop('attrition_flag', axis = 1)
y = df2['attrition_flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 77, stratify=y)
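# sanity check (a small sketch): with stratify=y, both splits should
# preserve the original class proportions almost exactly
print(y_train.value_counts(normalize=True).round(3))
print(y_test.value_counts(normalize=True).round(3))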
# visualising the imbalance of the training set
fig,ax=plt.subplots(figsize=(6, 5))
patches, texts, autotexts = ax.pie(x=y_train.value_counts(),labels=["Existing customers","Attrited Customers"],colors=["#c2cccc", "#a1e6e4"], autopct="%.1f%%")
fig.suptitle("Checking the imbalance of the Attrition class in the training set",y=1,fontweight="bold",fontsize=15)
[autotext.set_color('#000000') for autotext in autotexts]
plt.show()
# perform random under-sampling on training dataset
rus = RandomUnderSampler(random_state= 77)
X_rus, y_rus = rus.fit_resample(X_train, y_train)
# visualising the outcome
fig,ax=plt.subplots(figsize=(6, 5))
patches, texts, autotexts = ax.pie(x=y_rus.value_counts(),labels=["Existing customers","Attrited Customers"],colors=["#c2cccc", "#a1e6e4"], autopct="%.1f%%")
fig.suptitle("Imbalanced class in the training set after under-sampling",y=1,fontweight="bold",fontsize=15)
[autotext.set_color('#000000') for autotext in autotexts]
plt.show()
# over-sampling the training set with SMOTE
sm = SMOTE(random_state = 77, sampling_strategy = "not majority")
X_smote, y_smote = sm.fit_resample(X_train, y_train)
# visualising the outcome
fig,ax=plt.subplots(figsize=(6, 5))
patches, texts, autotexts = ax.pie(x=y_smote.value_counts(),labels=["Existing customers","Attrited Customers"],colors=["#c2cccc", "#a1e6e4"], autopct="%.1f%%")
fig.suptitle("Imbalanced class in the training set after under-sampling",y=1,fontweight="bold",fontsize=15)
[autotext.set_color('#000000') for autotext in autotexts]
plt.show()
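# Caveat (a minimal sketch, not used below): if X_smote or X_rus were passed to a
# cross-validated search, the validation folds would themselves contain resampled
# rows. imbalanced-learn's own pipeline avoids this by resampling only the
# training portion of each fold:
from imblearn.pipeline import Pipeline as ImbPipeline
smote_clf = ImbPipeline(steps=[("smote", SMOTE(random_state=77)),
                               ("classifier", LogisticRegression(max_iter=600))])
smote_clf.fit(X_train, y_train)  # can be dropped into GridSearchCV directly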
# baseline logistic regression on the raw (unscaled) features
model = LogisticRegression(max_iter=600)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*model.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
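# Note: roc_auc_score above is computed from hard 0/1 predictions, which
# understates ranking ability; a sketch of the probability-based AUC and curve:
from sklearn.metrics import roc_curve
y_test_proba = model.predict_proba(X_test)[:, 1]
print(f'Probability-based ROC AUC: {100*roc_auc_score(y_test, y_test_proba)}%')
fpr, tpr, _ = roc_curve(y_test, y_test_proba)
plt.plot(fpr, tpr, label='logistic regression')
plt.plot([0, 1], [0, 1], linestyle='--', label='chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()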
numerical = ['total_relationship_count', 'months_inactive_12_month',
'contacts_count_12_month', 'total_revolving_bal',
'total_amt_change_q4_q1', 'total_trans_amt',
'total_trans_count', 'total_count_change_q4_q1', 'avg_utilization_ratio']
numeric_transformer = MinMaxScaler()
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numerical)], remainder="passthrough")  # keep the one-hot encoded features alongside the scaled numericals
classifier = LogisticRegression(random_state= 77)
clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of unoptimised logistics model")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
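# With classes this imbalanced, raw counts can hide the recall gap; a sketch of
# the row-normalised matrix (assuming scikit-learn >= 0.22, where
# confusion_matrix accepts normalize), in which each cell reads as per-class recall:
cm_norm = confusion_matrix(y_test, y_test_pred, normalize='true')
sns.heatmap(cm_norm, annot=True, fmt='.2f', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()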
clf.fit(X_rus, y_rus)
y_test_pred = clf.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of unoptimised logistics regression model, under-sampling")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf.fit(X_smote, y_smote)
y_test_pred = clf.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of unoptimised logistics regression model, SMOTE")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
# defining parameter range; lbfgs only supports the l2 penalty, so the grid is split in two
cw = [{0: 0.8, 1: 1}, {0: 0.85, 1: 1}, {0: 0.9, 1: 1}, {0: 0.95, 1: 1}]
Cs = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = [{'classifier__solver': ['liblinear'], 'classifier__penalty': ['l1', 'l2'], 'classifier__class_weight': cw, 'classifier__C': Cs},
              {'classifier__solver': ['lbfgs'], 'classifier__penalty': ['l2'], 'classifier__class_weight': cw, 'classifier__C': Cs}]
grid = GridSearchCV(clf, param_grid, refit = True, scoring= 'accuracy', verbose = 3)
# fitting the model for grid search
grid.fit(X_train, y_train)
print(grid.best_estimator_)
print('Best penalty:', grid.best_estimator_.get_params()['classifier__penalty'])
print('Best C:', grid.best_estimator_.get_params()['classifier__C'])
print('Best class weight:', grid.best_estimator_.get_params()['classifier__class_weight'])
print('Best solver:', grid.best_estimator_.get_params()['classifier__solver'])
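# Beyond the single best estimator, it can help to see how close the runner-up
# settings were (a small sketch over the fitted grid object):
cv_results = pd.DataFrame(grid.cv_results_)
cv_results.sort_values('rank_test_score')[['params', 'mean_test_score', 'std_test_score']].head()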
numerical = ['total_relationship_count', 'months_inactive_12_month',
'contacts_count_12_month', 'total_revolving_bal',
'total_amt_change_q4_q1', 'total_trans_amt',
'total_trans_count', 'total_count_change_q4_q1', 'avg_utilization_ratio']
numeric_transformer = MinMaxScaler()
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numerical)], remainder="passthrough")
classifier = LogisticRegression(solver="liblinear", class_weight={0: 0.8, 1: 1}, C=10, penalty="l2")
clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", classifier)])
clf.fit(X_train, y_train)
y_test_pred = clf.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of optimised logistics model")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf.fit(X_rus, y_rus)
y_test_pred = clf.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of optimised logistics regression model, under-sampling")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf.fit(X_smote, y_smote)
y_test_pred = clf.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of optimised logistics regression model, SMOTE")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
numerical = ['total_relationship_count', 'months_inactive_12_month',
'contacts_count_12_month', 'total_revolving_bal',
'total_amt_change_q4_q1', 'total_trans_amt',
'total_trans_count', 'total_count_change_q4_q1', 'avg_utilization_ratio']
numeric_transformer = MinMaxScaler()
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numerical)], remainder="passthrough")
classifier = LogisticRegression(random_state= 77)
rfecv = RFECV(estimator= classifier)
clf2 = Pipeline(steps=[("preprocessor", preprocessor), ("selector", rfecv), ("classifier", classifier)])
clf2.fit(X_train, y_train)
y_test_pred = clf2.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf2.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of unoptimised logistics model")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf2.fit(X_rus, y_rus)
y_test_pred = clf2.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf2.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of unoptimised logistics regression model, under-sampling")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf2.fit(X_smote, y_smote)
y_test_pred = clf2.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf2.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of unoptimised logistics regression model, SMOTE")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
# defining parameter range; lbfgs only supports the l2 penalty, so the grid is split in two
cw = [{0: 0.8, 1: 1}, {0: 0.85, 1: 1}, {0: 0.9, 1: 1}, {0: 0.95, 1: 1}]
Cs = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = [{'classifier__solver': ['liblinear'], 'classifier__penalty': ['l1', 'l2'], 'classifier__class_weight': cw, 'classifier__C': Cs},
              {'classifier__solver': ['lbfgs'], 'classifier__penalty': ['l2'], 'classifier__class_weight': cw, 'classifier__C': Cs}]
grid = GridSearchCV(clf2, param_grid, refit = True, scoring= 'accuracy', verbose = 3)
# fitting the model for grid search
grid.fit(X_train, y_train)
print(grid.best_estimator_)
print('Best penalty:', grid.best_estimator_.get_params()['classifier__penalty'])
print('Best C:', grid.best_estimator_.get_params()['classifier__C'])
print('Best class weight:', grid.best_estimator_.get_params()['classifier__class_weight'])
print('Best solver:', grid.best_estimator_.get_params()['classifier__solver'])
numerical = ['total_relationship_count', 'months_inactive_12_month',
'contacts_count_12_month', 'total_revolving_bal',
'total_amt_change_q4_q1', 'total_trans_amt',
'total_trans_count', 'total_count_change_q4_q1', 'avg_utilization_ratio']
numeric_transformer = MinMaxScaler()
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numerical)], remainder="passthrough")
classifier = LogisticRegression(solver="liblinear", class_weight={0: 0.85, 1: 1}, C=10, penalty="l1")
rfecv = RFECV(estimator= classifier)
clf2 = Pipeline(steps=[("preprocessor", preprocessor), ("selector", rfecv), ("classifier", classifier)])
clf2.fit(X_train, y_train)
y_test_pred = clf2.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf2.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of optimised logistics regression model")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf2.fit(X_rus, y_rus)
y_test_pred = clf2.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf2.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of optimised logistics model, under-sampling")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf2.fit(X_smote, y_smote)
y_test_pred = clf2.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf2.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(10,6))
plt.title("Confusion matrix of optimised logistics model, SMOTE")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
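# Before reading off the optimal feature count, the cross-validation curve of the
# selector can be plotted (a sketch assuming scikit-learn >= 1.0, where RFECV
# exposes cv_results_; older versions used grid_scores_ instead):
scores = clf2['selector'].cv_results_['mean_test_score']
plt.figure(figsize=(8, 4))
plt.plot(range(1, len(scores) + 1), scores, marker='o')
plt.xlabel('Number of features selected')
plt.ylabel('Mean cross-validated score')
plt.title('RFECV feature-selection curve')
plt.show()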
print(f'Optimal number of features: {rfecv.n_features_}')
# map the selected features back to names; the column order after the
# ColumnTransformer is the scaled numericals first, then the passthrough columns
transformed_cols = numerical + [c for c in X_train.columns if c not in numerical]
mask = clf2['selector'].get_support(indices=True)
new_features = pd.Index(transformed_cols)[mask]
new_features
select_vars = new_features.tolist()
# get the model's coefficients, mapped to the selected feature names
logReg_coeff = pd.DataFrame({'feature_name': select_vars, 'model_coefficient': clf2['classifier'].coef_.ravel()})
logReg_coeff = logReg_coeff.sort_values('model_coefficient',ascending=False)
# visualising
plt.figure(figsize=(10, 6))
fg3 = sns.barplot(x='feature_name', y='model_coefficient', data=logReg_coeff, palette="Blues_d")
fg3.set_xticklabels(logReg_coeff.feature_name, rotation=35)
plt.xlabel('Feature')
plt.ylabel('Coefficient')
plt.subplots_adjust(bottom=0.4)
plt.show()
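# Logistic regression coefficients live on the log-odds scale; exponentiating
# them gives odds ratios, which are often easier to interpret (values above 1
# increase the odds of attrition, values below 1 decrease them):
logReg_coeff['odds_ratio'] = np.exp(logReg_coeff['model_coefficient'])
logReg_coeff[['feature_name', 'odds_ratio']]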
numerical = ['total_relationship_count', 'months_inactive_12_month',
'contacts_count_12_month', 'total_revolving_bal',
'total_amt_change_q4_q1', 'total_trans_amt',
'total_trans_count', 'total_count_change_q4_q1', 'avg_utilization_ratio']
numeric_transformer = MinMaxScaler()
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numerical)], remainder="passthrough")
classifier = LogisticRegression(random_state= 77)
skb = SelectKBest(f_classif, k='all')
clf3 = Pipeline(steps=[("preprocessor", preprocessor), ("selector", skb), ("classifier", classifier)])
clf3.fit(X_train, y_train)
y_test_pred = clf3.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf3.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test,y_test_pred)
plt.figure(figsize=(12,6))
plt.title("Confusion Matrix of unoptimised logistics regression model")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf3.fit(X_rus, y_rus)
y_test_pred = clf3.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf3.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(12,6))
plt.title("Confusion Matrix of unoptimised model, under-sampling")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf3.fit(X_smote, y_smote)
y_test_pred = clf3.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf3.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(12,6))
plt.title("Confusion Matrix of unoptimised model, SMOTE")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
# defining parameter range; lbfgs only supports the l2 penalty, so the grid is split in two
cw = [{0: 1, 1: 1}, {0: 0.8, 1: 1}, {0: 0.85, 1: 1}, {0: 0.9, 1: 1}, {0: 0.95, 1: 1}]
Cs = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = [{'classifier__solver': ['liblinear'], 'classifier__penalty': ['l1', 'l2'], 'classifier__class_weight': cw, 'classifier__C': Cs, 'selector__k': list(range(1, 30))},
              {'classifier__solver': ['lbfgs'], 'classifier__penalty': ['l2'], 'classifier__class_weight': cw, 'classifier__C': Cs, 'selector__k': list(range(1, 30))}]
grid = GridSearchCV(clf3, param_grid, refit = True, scoring= 'accuracy', verbose = 3)
# fitting the model for grid search
grid.fit(X_train, y_train)
print(grid.best_estimator_)
print('Best penalty:', grid.best_estimator_.get_params()['classifier__penalty'])
print('Best C:', grid.best_estimator_.get_params()['classifier__C'])
print('Best class weight:', grid.best_estimator_.get_params()['classifier__class_weight'])
print('Best solver:', grid.best_estimator_.get_params()['classifier__solver'])
print('Best k value:', grid.best_estimator_.get_params()['selector__k'])
numerical = ['total_relationship_count', 'months_inactive_12_month',
'contacts_count_12_month', 'total_revolving_bal',
'total_amt_change_q4_q1', 'total_trans_amt',
'total_trans_count', 'total_count_change_q4_q1', 'avg_utilization_ratio']
numeric_transformer = MinMaxScaler()
preprocessor = ColumnTransformer(transformers=[("num", numeric_transformer, numerical)], remainder="passthrough")
classifier = LogisticRegression(solver="lbfgs", class_weight={0: 0.8, 1: 1}, C=1, penalty="l2")
skb = SelectKBest(f_classif, k=8)
clf3 = Pipeline(steps=[("preprocessor", preprocessor), ("selector", skb), ("classifier", classifier)])
clf3.fit(X_train, y_train)
y_test_pred = clf3.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf3.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(12,6))
plt.title("Confusion Matrix of optimised logistics regression model")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf3.fit(X_rus, y_rus)
y_test_pred = clf3.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf3.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(12,6))
plt.title("Confusion Matrix of optimised model, under-sampling")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
clf3.fit(X_smote, y_smote)
y_test_pred = clf3.predict(X_test)
# get ROC AUC score
rocscore = roc_auc_score(y_test, y_test_pred)
#print results
print(f'Overall Accuracy: {100*clf3.score(X_test, y_test)}%')
print(f'ROC AUC Score: {100*rocscore}%')
print(classification_report(y_test, y_test_pred))
cm=confusion_matrix(y_test, y_test_pred)
plt.figure(figsize=(12,6))
plt.title("Confusion Matrix of optimised model, SMOTE")
sns.heatmap(cm, annot=True,fmt='d', cmap='gist_yarg')
plt.ylabel("Actual Values")
plt.xlabel("Predicted Values")
plt.show()
# map the selected features back to names, again using the post-ColumnTransformer
# order: the scaled numericals first, then the passthrough columns
transformed_cols = numerical + [c for c in X_train.columns if c not in numerical]
mask = clf3['selector'].get_support(indices=True)
new_features = pd.Index(transformed_cols)[mask]
new_features
select_vars = new_features.tolist()
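# As a sanity check, the univariate ANOVA F-scores that SelectKBest used can be
# ranked to see why these columns survived (a sketch over the fitted selector):
f_scores = pd.Series(clf3['selector'].scores_, index=transformed_cols)
f_scores.sort_values(ascending=False).head(10)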
# get the model's coefficients, mapped to the selected feature names
logReg_coeff = pd.DataFrame({'feature_name': select_vars, 'model_coefficient': clf3['classifier'].coef_.ravel()})
logReg_coeff = logReg_coeff.sort_values('model_coefficient',ascending=False)
# visualising
plt.figure(figsize=(10, 6))
fg3 = sns.barplot(x='feature_name', y='model_coefficient', data=logReg_coeff, palette="Blues_d")
fg3.set_xticklabels(logReg_coeff.feature_name, rotation=35)
plt.xlabel('Feature')
plt.ylabel('Coefficient')
plt.subplots_adjust(bottom=0.4)
plt.show()