import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
df = pd.read_csv('train.csv')
df.head()
df.shape
df.info()
print(df['churn'].value_counts())
not_churned = df[df['churn']=='no']['churn'].count()
churned = df[df['churn']=='yes']['churn'].count()
fig = plt.figure(figsize=(5,5))
plt.pie([not_churned, churned],
labels = ['not churned', 'churned'],
autopct='%.2f')
plt.title('Pie chart customers churned vs not churned')
plt.show()
# df.describe(include='all')
df.describe()
# Account length
ac_len_churned = df[df['churn']=='yes']['account_length']
ac_len_notchurned = df[df['churn']=='no']['account_length']
# sns.distplot is deprecated in recent seaborn; histplot(kde=True) is the modern equivalent
sns.histplot(ac_len_churned, kde=True, stat='density', label='Churned')
sns.histplot(ac_len_notchurned, kde=True, stat='density', label='Not Churned')
plt.legend()
plt.title('Distribution of Account Length for Churned and Not Churned')
plt.show()
print("Distributionn of account_length for Churned examples and the distributionn of account_length for Not_churned examples is almost overlapping")
# Total_day_charge
# plot PDF of the day charge for both the class labels
day_charge_churned = df[df['churn']=='yes']['total_day_charge']
day_charge_notchurned = df[df['churn']=='no']['total_day_charge']
sns.histplot(day_charge_churned, kde=True, stat='density', label='Churned')
sns.histplot(day_charge_notchurned, kde=True, stat='density', label='Not Churned')
plt.legend()
plt.title('Distribution of day charge by churned and not churned')
plt.show()
print("Observation: The orange pdf (not churned class) and blue pdf for churned class are not completely overlapping. Higher number of samples are churning when charge is over 40")
# Total eve charge
eve_charge_churned = df[df['churn']=='yes']['total_eve_charge']
eve_charge_notchurned = df[df['churn']=='no']['total_eve_charge']
sns.histplot(eve_charge_churned, kde=True, stat='density', label='Churned')
sns.histplot(eve_charge_notchurned, kde=True, stat='density', label='Not Churned')
plt.legend()
plt.title('Distribution of eve charge by churned and not churned')
plt.show()
# Total night charge
night_charge_churned = df[df['churn']=='yes']['total_night_charge']
night_charge_notchurned = df[df['churn']=='no']['total_night_charge']
sns.histplot(night_charge_churned, kde=True, stat='density', label='Churned')
sns.histplot(night_charge_notchurned, kde=True, stat='density', label='Not Churned')
plt.legend()
plt.title('Distribution of night charge by churned and not churned')
plt.show()
# Total intnl charge
intl_charge_churned = df[df['churn']=='yes']['total_intl_charge']
intl_charge_notchurned = df[df['churn']=='no']['total_intl_charge']
sns.histplot(intl_charge_churned, kde=True, stat='density', label='Churned')
sns.histplot(intl_charge_notchurned, kde=True, stat='density', label='Not Churned')
plt.legend()
plt.title('Distribution of intl charge by churned and not churned')
plt.show()
# Number_vmail_messages
# Plotting PDF of number_vmail_messages for both class labels
vmail_msgs_churned = df[df['churn']=='yes']['number_vmail_messages']
vmail_msgs_not_churned =df[df['churn']=='no']['number_vmail_messages']
sns.kdeplot(vmail_msgs_churned, label='Churned')
sns.kdeplot(vmail_msgs_not_churned, label='Not_churned')
plt.title('Distribution of number_vmail_messages for Churned and Not_Churned')
plt.legend()
plt.show()
df.head()
# number of customer service calls
sercall_churned = df[df['churn']=='yes']['number_customer_service_calls']
sercall_notchurned = df[df['churn']=='no']['number_customer_service_calls']
sns.kdeplot(sercall_churned, label='Churned')
sns.kdeplot(sercall_notchurned, label='Not Churned')
plt.legend()
plt.title('Distribution of number of customer service calls by churned and not churned')
plt.show()
print("Observation: Contrary to the initial assumptions that customers who called customer service more frequently customers would be likely to churn, customers that churned are likely to contact customer service.")
# area code
area_churned = df[df['churn']=='yes']['area_code']
area_notchurned = df[df['churn']=='no']['area_code']
sns.histplot(area_churned, label = 'churned')
sns.histplot(area_notchurned, label = 'not churned')
plt.legend()
plt.show()
print('Not much insight from area code.')
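# A normalized crosstab backs this up: churn rates are similar across area codes
print(pd.crosstab(df['area_code'], df['churn'], normalize='index'))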
sns.pairplot(df,
vars=['account_length', 'number_vmail_messages','total_day_charge','total_eve_charge','total_night_charge','total_intl_charge', 'number_customer_service_calls'],
hue='churn')
# Feature correlation
sns.set(style='white', font_scale=1)
corr = df.corr(numeric_only=True)  # numeric_only: skip the categorical columns
mask = np.zeros_like(corr, dtype=bool)  # np.bool was removed in NumPy 1.24
mask[np.triu_indices_from(mask)] = True
sns.heatmap(corr, mask=mask, cmap='YlGnBu', vmax=.3, center=0, annot=True, annot_kws={"size":5},
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
# Remove the highly correlated features
df2 = df.drop(['total_day_minutes', 'total_eve_minutes','total_night_minutes', 'total_intl_minutes'],
axis=1)
df2.head()
# Get highly correlated features
corr_matrix = df.corr(numeric_only=True).abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
considered_features = [col for col in upper.columns if any(upper[col] > 0.95)]
considered_features
df[considered_features].values
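# The pairs behind these flags can be read directly off the upper triangle
for col in considered_features:
    partner = upper[col].idxmax()
    print(f"{col} ~ {partner}: corr = {upper[col].max():.3f}")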
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
# conduct VIF on the selected features
def compute_vif(considered_features):
    # add_constant appends the intercept column that VIF requires
    X = add_constant(df[considered_features])
    # Collect one VIF value per column
    vif = pd.DataFrame()
    vif['Variable'] = X.columns
    vif['VIF'] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    # Drop the intercept row before returning
    return vif[vif['Variable'] != 'const']
# remove 1 & repeat
# Compute VIF
compute_vif(considered_features).sort_values('VIF', ascending=False)
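# The remove-and-repeat loop can be automated: drop the feature with the
# highest VIF and recompute until every VIF falls below a cutoff. Minimal
# sketch, assuming the common rule-of-thumb cutoff of 5.
features = list(considered_features)
while features:
    vif = compute_vif(features).sort_values('VIF', ascending=False)
    if vif['VIF'].iloc[0] <= 5:
        break
    features.remove(vif['Variable'].iloc[0])
print('Features kept after VIF elimination:', features)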
from sklearn.model_selection import train_test_split
df2['churn'] = df2['churn'].map({'yes': 1, 'no': 0})  # chained inplace replace is deprecated
Y = df2['churn']
X = df2.drop('churn', axis=1)
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.1,
stratify=Y, random_state=11)
print('Shape of x_train and y_train:', x_train.shape, y_train.shape)
print('Shape of x_test and y_test:', x_test.shape, y_test.shape)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
train_state = vectorizer.fit_transform(x_train['state'].values)
test_state = vectorizer.transform(x_test['state'].values)  # transform only: reuse the train vocabulary
train_state.shape
state_fea = list(vectorizer.get_feature_names_out())  # get_feature_names() was removed in scikit-learn 1.2
print(state_fea)
vectorizer = CountVectorizer()
train_area_code = vectorizer.fit_transform(x_train['area_code'].values)
test_area_code = vectorizer.transform(x_test['area_code'].values)
train_area_code.shape
area_fea = list(vectorizer.get_feature_names_out())
print(area_fea)
vectorizer = CountVectorizer()
train_inter_plan = vectorizer.fit_transform(x_train['international_plan'].values)
test_inter_plan = vectorizer.transform(x_test['international_plan'].values)
# international features
intnl_fea = ['intl_plan_no', 'intl_plan_yes']
print(intnl_fea)
train_inter_plan.shape
vectorizer = CountVectorizer()
train_voicem_plan = vectorizer.fit_transform(x_train['voice_mail_plan'].values)
test_voicem_plan = vectorizer.transform(x_test['voice_mail_plan'].values)
# voice mail
voice_mail_plan_fea = ['voice_plan_no', 'voice_plan_yes']
print(voice_mail_plan_fea)
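# CountVectorizer works here only because every cell holds a single token;
# scikit-learn's OneHotEncoder is the more direct tool for categorical
# columns. Equivalent sketch (fit on train, reuse the encoder on test):
from sklearn.preprocessing import OneHotEncoder
cat_cols = ['state', 'area_code', 'international_plan', 'voice_mail_plan']
ohe = OneHotEncoder(handle_unknown='ignore')
train_cats = ohe.fit_transform(x_train[cat_cols])
test_cats = ohe.transform(x_test[cat_cols])
print(train_cats.shape, test_cats.shape)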
numerical_features = []
for col in df2.columns:
    if col != 'churn' and (df2[col].dtype == 'int64' or df2[col].dtype == 'float64'):
        numerical_features.append(col)
print(numerical_features)
train_numericals = x_train[numerical_features]
test_numericals = x_test[numerical_features]
# Scaling the data using sklearn's Standard scaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
train_numericals_scaled = scaler.fit_transform(train_numericals)
test_numericals_scaled = scaler.transform(test_numericals)  # transform only: reuse the train mean/std
print(train_numericals_scaled.shape)
print(test_numericals_scaled.shape)
print(train_numericals_scaled.shape)
print(train_inter_plan.shape)
print(train_voicem_plan.shape)
print(train_state.shape)
print(train_area_code.shape)
# To stack sparse matrices column-wise
from scipy.sparse import hstack
x_train_merged = hstack((train_area_code, train_inter_plan, train_numericals_scaled, train_state, train_voicem_plan))
x_test_merged = hstack((test_area_code, test_inter_plan, test_numericals_scaled, test_state, test_voicem_plan))
# Order must match the hstack column order used above
all_features = area_fea + intnl_fea + numerical_features + state_fea + voice_mail_plan_fea
# all_features
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# Grid search to get the best parameter
parameters = {'C':[0.01, 0.1, 1, 3, 5, 10]} # larger C -> tighter fit, higher overfitting risk
# Initiate classifier; C is the penalty parameter of the error term
svm_clf = SVC(random_state=43)
# Define the GridSearchCV object
grid = GridSearchCV(estimator = svm_clf,
param_grid = parameters,
scoring = 'accuracy',
return_train_score = True,
verbose = 1)
# Fit GridS. object on training data
grid.fit(x_train_merged, y_train) #returns gridsearch obj
# GridSearch results in df
cv_results = pd.DataFrame(grid.cv_results_)
# Plot train vs test accuracy
plt.scatter(cv_results['param_C'], cv_results['mean_train_score'])
plt.plot(cv_results['param_C'], cv_results['mean_train_score'], label='Train')
plt.scatter(cv_results['param_C'], cv_results['mean_test_score'])
plt.plot(cv_results['param_C'], cv_results['mean_test_score'], label='CV')
plt.title('Hyperparameter vs Accuracy')
plt.legend()
plt.xlabel('C')
plt.ylabel('Accuracy')
plt.show()
print('Observation: beyond C=3 the CV accuracy barely improves while the gap between train and CV accuracy widens, i.e. the model starts to overfit.')
# Store best parameter.
best_parameter = 3
print(f"Best value of C: {best_parameter}")
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
# Train a new model using optimum hyperpara. from GridS
svm_clf = SVC(C = best_parameter, random_state = 43)
svm_clf.fit(x_train_merged, y_train)
# Check performance in test data
result1 = ["1.", "SVM", "No Class Balancing"]
y_pred_tr = svm_clf.predict(x_train_merged)
print(f"Train accuracy SVM: {accuracy_score(y_train, y_pred_tr)}")
result1.append(round(accuracy_score(y_train, y_pred_tr), 2))
y_pred_test = svm_clf.predict(x_test_merged)
print(f"Test accuracy SVM: {accuracy_score(y_test, y_pred_test)}")
result1.append(round(accuracy_score(y_test, y_pred_test), 2))
# Save accuracy and recall in the list
recall = recall_score(y_test, y_pred_test)
print("Recall Score:", recall)
result1.append(round(recall, 2))
print("-----"*20)
cm = confusion_matrix(y_test, y_pred_test)
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt='d', linewidths = 1, linecolor = 'black',cmap='YlGnBu',ax=ax) #fmt: add text to cell
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_ylim(2.0, 0)
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['Neg', 'Pos'])
ax.yaxis.set_ticklabels(['Neg', 'Pos'])
plt.show()
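# Sanity check: recall can be recomputed from the confusion matrix itself,
# recall = TP / (TP + FN), using sklearn's row = actual, column = predicted layout
tn, fp, fn, tp = cm.ravel()
print('Recall from confusion matrix:', round(tp / (tp + fn), 2))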
# Grid search parameters
parameters = {'C':[0.01,0.1,1,3,5,10]}
# Initiate SVM "class_weight='balanced'"
svm_clf = SVC(class_weight='balanced', random_state = 43)
# Define GridSearchCV
grid = GridSearchCV(svm_clf,
param_grid=parameters,
scoring='accuracy',
return_train_score=True,
verbose = 1)
# grid.fit
grid.fit(x_train_merged, y_train)
cv_results = pd.DataFrame(grid.cv_results_)
# plot
plt.scatter(cv_results['param_C'], cv_results['mean_train_score'])
plt.plot(cv_results['param_C'], cv_results['mean_train_score'], label='Train')
plt.scatter(cv_results['param_C'], cv_results['mean_test_score'])
plt.plot(cv_results['param_C'], cv_results['mean_test_score'], label='CV')
plt.legend()
plt.title('Hyperparameter vs Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('C')
plt.show()
# Store best c value
best_parameter = 3
print(f"Best value of C:", best_parameter)
# Train the model with the best para. discovered
svm_clf = SVC(C = best_parameter,
class_weight='balanced', random_state=43)
svm_clf.fit(x_train_merged, y_train)
# Initiate result2 list
result2 = ['2.', 'SVM', 'Balanced using class weights']
# Predict train set
y_pred_train = svm_clf.predict(x_train_merged)
# Print y_pred_tr accuracy score
print(f"Train accuracy SVM: {accuracy_score(y_train, y_pred_train)}")
result2.append(round(accuracy_score(y_train, y_pred_train), 2))
# Predict test set
y_pred_test = svm_clf.predict(x_test_merged)
print(f"Test accuracy SVM: {accuracy_score(y_test, y_pred_test)}")
result2.append(round(accuracy_score(y_test, y_pred_test), 2))
# Recall Score. Ability to classify positive samples in the model
recall = recall_score(y_test, y_pred_test)
print("Recall score:", recall)
result2.append(round(recall, 2))
print("---"*20)
# Confusion Matrix
cm = confusion_matrix(y_test, y_pred_test)
ax = plt.subplot();
sns.heatmap(cm, annot=True,fmt='d',linecolor='black',linewidths=1, cmap='YlGnBu', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_ylim(2.0,0)
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['Neg','Pos'])
ax.yaxis.set_ticklabels(['Neg','Pos'])
plt.show()
from sklearn.ensemble import RandomForestClassifier
# create dict of rf best para
parameters = {'max_depth':[5,10,20,50],
'n_estimators':[100,200,300,400,500]}
# initiate random forest object
rf_clf = RandomForestClassifier(class_weight='balanced',
random_state=43)
# initiate GridSearch
grid = GridSearchCV(rf_clf, param_grid=parameters,
scoring='accuracy',
return_train_score=True,
verbose=1)
# fit rf
grid.fit(x_train_merged, y_train)
# store result as df
cv_results = pd.DataFrame(grid.cv_results_)
# Convert the param dicts to strings so matplotlib can use them as x-axis values
params = list(cv_results['params'].astype('str'))
# plot results
plt.figure(figsize=(12,6))
plt.scatter(params, cv_results['mean_train_score'], label='Mean train score')
plt.plot(params, cv_results['mean_train_score'], label='Train')
plt.scatter(params, cv_results['mean_test_score'], label='Mean test score')
plt.plot(params, cv_results['mean_test_score'], label='CV')
plt.title('Hyperparameter vs Accuracy')
plt.xlabel('Hyperparam dict.')
plt.ylabel('Accuracy')
plt.xticks(rotation=90)
plt.legend()
plt.show()
# Best parameters observed from gridsearch results
best_max_depth = 10
best_n_estimators = 300
# Create a new model with optimum hyperparameters
rf_clf = RandomForestClassifier(max_depth = best_max_depth,
n_estimators = best_n_estimators,
class_weight='balanced',
verbose=1, random_state=43)
# fit model
rf_clf.fit(x_train_merged, y_train)
# Initialize result3
result3 = ['3.','Random Forest', 'Balanced using class weights']
# predict
y_pred_train = rf_clf.predict(x_train_merged)
print(f"Train accuracy RF: {accuracy_score(y_train, y_pred_train)}")
result3.append(round(accuracy_score(y_train, y_pred_train), 2))
y_pred_test = rf_clf.predict(x_test_merged)
print(f"Train accuracy RF: {accuracy_score(y_test, y_pred_test)}")
result3.append(round(accuracy_score(y_test, y_pred_test), 2))
# recall
recall = recall_score(y_test, y_pred_test)
print(f'Recall score:{recall}')
result3.append(round(recall, 2))
print('---'*20)
# confusion matrix
cm = confusion_matrix(y_test, y_pred_test)
ax = plt.subplot();
sns.heatmap(cm, annot=True,fmt='d', linewidths=1, linecolor='black', cmap='YlGnBu',ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_ylim(2.0,0)
ax.xaxis.set_ticklabels(['Neg', 'Pos'])
ax.yaxis.set_ticklabels(['Neg','Pos'])
plt.show()
importance = rf_clf.feature_importances_
# plot
plt.figure(figsize=(18,5))
plt.bar(all_features, importance)
plt.xticks(rotation=90)
plt.ylabel('Importance')
plt.xlabel('Features')
plt.show()
print("Observation: important features are 'total_day_charge', 'total_eve_calls', 'total_intl_calls', 'number_customer_service_calls','intl_plan'")
y_train.value_counts()
import math
# scale_pos_weight = sqrt(neg/pos); the square root dampens the weight so the
# positive class is not multiplied by an excessively large factor
scale = round(math.sqrt(y_train.value_counts()[0] / y_train.value_counts()[1]), 2)
scale
from xgboost import XGBClassifier
# Grid search to get the best parameter
# Parameters
parameters = {'learning_rate':[0.1, 0.2, 0.3],
'max_depth':[3,5,10, 20],
'n_estimators':[100,200,300,500],
'colsample_bytree':[0.3, 0.5, 0.7]}
# initiate & define xgboost classifier
xgboost_clf = XGBClassifier(scale_pos_weight = scale, eval_metric = 'logloss')  # binary task, so logloss rather than mlogloss
# initiate grid search
grid = GridSearchCV(xgboost_clf,
param_grid=parameters,
scoring='accuracy',
return_train_score=True,
verbose=1)
# fit model results
grid.fit(x_train_merged, y_train)
# store grid search result as df in new var.
cv_results = pd.DataFrame(grid.cv_results_)
cv_results.head()
# plot test scores (next cells)
params = list(cv_results['params'].astype('str'))
plt.figure(figsize=(18,6))
plt.scatter(params, cv_results['mean_train_score'], label='mean train score')
plt.plot(params, cv_results['mean_train_score'], label='train')
plt.scatter(params, cv_results['mean_test_score'], label='mean test score')
plt.plot(params, cv_results['mean_test_score'], label='test')
plt.title('Hyperparameters vs. Accuracy')
plt.xlabel('Dict. of hyperparameters')
plt.ylabel('Accuracy')
plt.legend()
plt.xticks(rotation=90)
plt.show()
# Keep the 70 configurations with the lowest mean train score (least overfit)
# and plot them by index; full param dicts would clutter the x-axis
cv_results = pd.DataFrame(grid.cv_results_).sort_values(by='mean_train_score', ascending=True)[:70]
params = list(cv_results['params'])
params_index = np.arange(70)
plt.figure(figsize=(18,6))
plt.scatter(params_index, cv_results['mean_train_score'], label='mean train score')
plt.plot(params_index, cv_results['mean_train_score'], label='train')
plt.scatter(params_index, cv_results['mean_test_score'], label='mean test score')
plt.plot(params_index, cv_results['mean_test_score'], label='cv')
plt.title('Hyperparameters index vs Accuracy')
plt.xlabel('Hyperparameters index')
plt.ylabel('Accuracy')
plt.xticks(params_index)
plt.legend()
plt.grid()
plt.show()
# Best parameter combination, index read off the plot above
best_param = params[34]
best_param
# Train model with best parameter
# initiate & define model
xgboost_clf = XGBClassifier(learning_rate = best_param['learning_rate'],
max_depth = best_param['max_depth'],
n_estimators = best_param['n_estimators'],
colsample_bytree = best_param['colsample_bytree'],
eval_metric = 'logloss', scale_pos_weight = scale)
# fit model on x_train & y_train
xgboost_clf.fit(x_train_merged, y_train)
# create result4 object
result4 = ['4.', 'XGBoost', 'Balanced using scale_pos_weight']
# predict on x_train
y_pred_tr = xgboost_clf.predict(x_train_merged)
print(f'Train accuracy score: {round(accuracy_score(y_train, y_pred_tr),2)}')
result4.append(round(accuracy_score(y_train, y_pred_tr),2))
# predict on x_test
y_pred_test = xgboost_clf.predict(x_test_merged)
print(f'Test accuracy score: {round(accuracy_score(y_test, y_pred_test),2)}')
result4.append(round(accuracy_score(y_test, y_pred_test),2))
# recall
recall = recall_score(y_test, y_pred_test)
print(f'Recall score: {recall}')
result4.append(round(recall, 2))
# confusion matrix
cm = confusion_matrix(y_test, y_pred_test)
ax = plt.subplot();
sns.heatmap(cm, annot=True, fmt='d',linewidths=1, linecolor='Black', cmap='YlGnBu', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.xaxis.set_ticklabels(['Neg','Pos'])
ax.yaxis.set_ticklabels(['Neg','Pos'])
ax.set_title('Confusion Matrix')
plt.show()
# Oversample the minority class with SMOTE to balance the training set
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=11).fit_resample(x_train_merged, y_train)
print(X_resampled.shape, y_resampled.shape)
print(y_resampled.value_counts())
# parameters
parameters = {'learning_rate':[0.1, 0.2, 0.3],
'max_depth':[3,5,10, 20],
'n_estimators':[100,200,300,500],
'colsample_bytree':[0.3, 0.5, 0.7]}
# initiate baseline xgb clf
xgb_clf = XGBClassifier(eval_metric='logloss')
# initiate grid search
grid = GridSearchCV(estimator=xgb_clf,
param_grid=parameters,
verbose = 1,
scoring='accuracy',
return_train_score=True)
# grid fit
grid.fit(X_resampled, y_resampled)
# cv_results
cv_results = pd.DataFrame(grid.cv_results_).sort_values(by='mean_train_score', ascending=True)[:70]
# plot cv results
params = list(cv_results['params'])  # list() so positional indexing works after the sort
params_index = np.arange(70)
plt.figure(figsize=(18,6))
plt.scatter(params_index, cv_results['mean_train_score'], label='mean train score')
plt.plot(params_index, cv_results['mean_train_score'], label='train')
plt.scatter(params_index, cv_results['mean_test_score'], label='mean test score')
plt.plot(params_index, cv_results['mean_test_score'], label='test')
plt.title('Hyperparam index vs Accuracy')
plt.xlabel('Hyperparam index')
plt.ylabel('Accuracy')
plt.grid()
plt.xticks(params_index)
plt.legend()
plt.show()
# Best parameter combination, index read off the plot above
best_para = params[56]
print(best_para)
xgb_clf = XGBClassifier(learning_rate=best_para['learning_rate'],
max_depth = best_para['max_depth'],
n_estimators = best_para['n_estimators'],
colsample_bytree = best_para['colsample_bytree'],
eval_metric='logloss')
xgb_clf.fit(X_resampled, y_resampled)
result5 = ['5.', 'XGBoost', 'Balanced class using SMOTE']
y_pred_train = xgb_clf.predict(X_resampled)
print("Train Accuracy Score:", round(accuracy_score(y_resampled, y_pred_train),2))
result5.append(round(accuracy_score(y_resampled, y_pred_train),2))
y_pred_test = xgb_clf.predict(x_test_merged)
print("Test Accuracy Score:", round(accuracy_score(y_test, y_pred_test),2))
result5.append(round(accuracy_score(y_test, y_pred_test),2))
recall = recall_score(y_test, y_pred_test)
print("Recall score:", recall)
result5.append(round(recall,2))
cm = confusion_matrix(y_test, y_pred_test)
ax = plt.subplot()
sns.heatmap(cm, fmt = 'd', linecolor='black', linewidths=1, annot=True, ax=ax, cmap='YlGnBu')
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.xaxis.set_ticklabels(['Neg','Pos'])
ax.yaxis.set_ticklabels(['Neg','Pos'])
plt.show()
print("Observation: highest recall score of 85%, however the test accuracy score is low. High gap between test and train acc. score = high variance. Tune smote to fix this.")
# !pip install prettytable
result1
from prettytable import PrettyTable
myTable = PrettyTable(['Model No.', 'Model Name', 'Variant', 'Train Acc. Score', 'Test Acc. Score', 'Recall Score'])
myTable.add_row(result1)
myTable.add_row(result2)
myTable.add_row(result3)
myTable.add_row(result4)
myTable.add_row(result5)
print(myTable)