import pandas as pd
import matplotlib
import seaborn as sns
import numpy as np
import imblearn
import sklearn
from platform import python_version
packages = {'Pandas': pd,
'Matplotlib': matplotlib,
'Seaborn': sns,
'NumPy': np,
'Scikit-Learn': sklearn,
'Imbalanced-Learn': imblearn}
print('Versions of the packages:\n')
print('{0:-^20} | {1:-^10}'.format('', ''))
print('{0:^20} | {1:^10}'.format('Package', 'Version'))
print('{0:-^20} | {1:-^10}'.format('', ''))
for name, alias in sorted(packages.items()):
    print(f'{name:<20} | {alias.__version__:>10}')
print()
print('{0}: {1}'.format('Python version', python_version()))
# some warnings may arise during GridSearchCV; they can be safely ignored
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
# plots
import matplotlib.pyplot as plt
from matplotlib.ticker import EngFormatter, PercentFormatter, MultipleLocator
plt.style.use('scripts/flsbustamante.mplstyle')
# scaling and composing
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
# imbalanced
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
# classifiers
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import xgboost
xgboost.set_config(verbosity=0)
# custom functions and variables
from scripts.functions import (model_summary,
confusion_matrix_norm, confusion_matrix_plot,
score_table,
hyperparameter_tuning,
RANDOM_STATE, scoring)
# presentation and local constants
# from IPython.core.display_functions import display
JOBS = -2  # n_jobs for joblib: use all CPU cores except one
DATAFILE = 'data/ibm_telco_customer_churn.csv'
df = pd.read_csv(DATAFILE)
with pd.option_context('display.max_columns', 25):
    display(df.head())
print(f'Number of instances (rows): {df.shape[0]:>10}')
print(f'Number of attributes (columns): {df.shape[1]:>5}')
df.info()
# TotalCharges holds whitespace-only strings instead of NaN in the raw CSV
sum(map(lambda x: x.isspace(), df['TotalCharges']))
object_columns = [column for column in df.columns if (df[column].dtype == 'object')]
# count whitespace-only entries in every object column
for column in object_columns:
    print(f'{column:<20} {sum(map(lambda x: x.isspace(), df[column]))}')
# replace whitespace-only strings with NaN and convert to numeric where possible
# (only TotalCharges actually changes; errors='ignore' leaves genuine text columns untouched)
df = df.replace(r'^\s*$', np.nan, regex=True)
df = df.apply(pd.to_numeric, errors='ignore')
# SeniorCitizen is stored as 0/1; map it to Yes/No so it is treated as categorical
df['SeniorCitizen'] = df['SeniorCitizen'].replace({0: 'No', 1: 'Yes'})
df.info()
df.isnull().sum()
with pd.option_context('display.precision', 2, 'display.max_columns', 25):
    display(df.describe())
df['TotalCharges'].isnull().sum() / df.shape[0] * 100  # percentage of rows with missing TotalCharges
df.loc[df['TotalCharges'].isnull()]
# impute the few missing TotalCharges values with the column median
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())
df[df == 0].count()  # number of zero values per column
df.nunique()
df.describe(exclude=[np.number])
churn = df.groupby('Churn').size()
churn
churn.div(churn.sum()).multiply(100)  # churn split as a percentage of all customers
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, nrows=1)
for ax in (ax2, ax3):
    ax.axis('off')
eng_format = EngFormatter(places=1)
labels = churn.values
churn.plot(kind='bar', ax=ax1)
ax1.yaxis.set_major_formatter(EngFormatter())
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
ax1.tick_params(axis='x', labelrotation=0)
ax1.bar_label(ax1.containers[0], label_type='edge',
labels=labels,
color='black', fontsize=10,
weight='bold')
ax2.text(-0.15, 0.75, 'Of the 7043 customers', fontsize=19, color='blue')
ax2.text(-0.15, 0.58, r'$\bf{1869}$', fontsize=80, color='purple')
ax2.text(-0.15, 0.5, r'$\mathrm{\bf{have\ left}}$', fontsize=19, color='red')
ax3.text(0.15, 0.51, 'This means that', fontsize=19, color='gray')
ax3.text(0.15, 0.35, r'$\mathrm{\bf{26.5\%}}$', fontsize=80, color='red')
ax3.text(0.15, 0.3, r'$\mathrm{\bf{of\ the\ customers\ left\ the\ company}}$',
fontsize=19, color='blue')
fig.suptitle('Data summary - Telco customer churn')
plt.show()
fig, axes = plt.subplots(nrows=1, ncols=3)
df.hist(ax=axes)
axes[0].set_xlabel('Months')
axes[0].set_ylabel('Customers')
axes[1].set_xlabel('Charge')
axes[1].set_ylabel('Customers')
axes[2].set_xlabel('Charge')
axes[2].set_ylabel('Customers')
fig.suptitle('Distribution of the numerical features')
plt.show()
fig, ax = plt.subplots()
sns.histplot(x='tenure', hue='Contract', data=df, ax=ax,
kde=False, fill=True, stat='density', common_norm=True,
multiple='layer')
sns.kdeplot(x='tenure', hue='Contract', data=df,
ax=ax, fill=True, multiple='layer')
fig.suptitle('Tenure distribution by contract type')
sns.move_legend(ax, 'upper center', bbox_to_anchor=(0.5, 1.15), ncol=3)
plt.show()
df = df.drop('customerID', axis=1)
# not_categorical[1:] gives the numerical features ('customerID' is already dropped);
# np.setdiff1d sorts alphabetically, so categorical[0] is the target 'Churn' and
# categorical[1:] gives the categorical features
not_categorical = ('customerID', 'tenure', 'MonthlyCharges', 'TotalCharges')
categorical = list(np.setdiff1d(df.columns.values, not_categorical))
fig, axes = plt.subplots(ncols=3, figsize=(14, 6))
for i, feature in enumerate(not_categorical[1:]):
    k = sns.kdeplot(x=feature, hue='Churn', data=df, ax=axes.flat[i],
                    fill=True, multiple='layer')
    # keep the hue handles/labels so a single figure-level legend can be drawn,
    # then drop the per-axes legend (legendHandles was renamed to legend_handles in Matplotlib 3.7+)
    handles = k.legend_.legendHandles
    labels = [t.get_text() for t in k.legend_.get_texts()]
    k.legend_.remove()
fig.suptitle('Churn distribution in numerical features\n\n')
fig.legend(bbox_to_anchor=(0.5, 0.925), loc='upper center', ncol=2,
labels=labels, handles=handles, fontsize=12, title='Churn')
plt.show()
fig, axes = plt.subplots(ncols=3, figsize=(16, 4), sharey=True)
for i, feature in enumerate(not_categorical[1:]):
    sns.boxplot(x=feature, y='Churn', data=df, ax=axes.flat[i])
axes.flat[0].xaxis.set_major_locator(MultipleLocator(10))
axes.flat[1].xaxis.set_major_locator(MultipleLocator(10))
axes.flat[1].set_ylabel('')
axes.flat[2].xaxis.set_major_locator(MultipleLocator(1000))
axes.flat[2].set_ylabel('')
axes.flat[2].tick_params(axis='x', rotation=25)
fig.suptitle('Churn distribution in numerical features')
fig.align_labels()
plt.show()
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(14, 14), sharey=True)
for i, feature in enumerate(categorical[1:]):
    c = sns.histplot(x=feature, data=df, ax=axes.flat[i],
                     stat='percent', multiple='dodge', shrink=0.8)
    c.tick_params(axis='x', labelsize=12, rotation=25)
    c.grid(False)
    c.yaxis.set_major_formatter(PercentFormatter())
    c.yaxis.set_major_locator(MultipleLocator(20))
    c.set_ylabel('')
    for bar in c.containers:
        c.bar_label(bar, label_type='edge', color='gray',
                    labels=[f'{b.get_height():.1f}' if b.get_height() > 0 else '' for b in bar],
                    fontsize=12, weight='bold')
        # use `color` here so the axes variable `c` is not shadowed by the colour tuple
        for b, color in zip(bar, sns.color_palette('tab20c')):
            b.set_facecolor(color)
fig.align_labels()
fig.suptitle('Categorical attributes percentage')
plt.show()
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(14, 20), sharey=True)
for i, feature in enumerate(categorical[1:]):
    c = sns.histplot(x=feature, hue='Churn', data=df, ax=axes.flat[i],
                     stat='percent', multiple='fill', shrink=0.8,
                     common_norm=True)
    c.tick_params(axis='x', labelsize=12, rotation=45)
    c.grid(False)
    # keep the hue handles/labels for a single figure-level legend, then drop the
    # per-axes legend (legendHandles was renamed to legend_handles in Matplotlib 3.7+)
    handles = c.legend_.legendHandles
    labels = [t.get_text() for t in c.legend_.get_texts()]
    c.legend_.remove()
    c.yaxis.set_major_formatter(PercentFormatter(1))
    c.set_ylabel('')
    for bar in c.containers:
        c.bar_label(bar, label_type='center', color='white',
                    labels=[f'{b.get_height() * 100:.1f}' if b.get_height() > 0 else '' for b in bar],
                    fontsize=12, weight='bold')
fig.suptitle('Churn by categorical attribute\n\n')
fig.legend(bbox_to_anchor=(0.5, 0.965), loc='upper center', ncol=2,
labels=labels, handles=handles, fontsize=12, title='Churn')
fig.align_labels()
plt.show()
c = sns.catplot(x='InternetService', y='MonthlyCharges', data=df,
hue='Churn', kind='box', height=8, aspect=1.4)
sns.move_legend(c, 'upper center', bbox_to_anchor=(0.5, 1.1), ncol=2)
plt.show()
partner_dependents = df.groupby(['Partner', 'Dependents']).size().unstack()
partner_dependents
# row-normalised percentages: within each partner group, the share with/without dependents
partner_dependents_pct = partner_dependents.div(partner_dependents.sum(axis=1),
                                                axis=0)
partner_dependents_pct
fig, ax = plt.subplots(figsize=(10, 8))
c = partner_dependents_pct.plot(kind='bar', stacked=True, rot=0, ax=ax,
width=0.6)
c.yaxis.set_major_formatter(PercentFormatter(1))
c.grid(False)
c.legend_.remove()
# annotate each stacked segment with its percentage
for p in c.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    c.annotate(f'{height*100:.1f}', (x + width/2, y + height*0.5), ha='center',
               color='white', weight='bold', size=14)
fig.suptitle('Customers - partners vs dependents\n\n')
plt.gcf().text(0.025, 0.865,
'Percentage of customers, grouped by partner status, that have dependents\n',
fontsize=18, color='gray')
fig.legend(bbox_to_anchor=(0.5, 0.885), loc='upper center', ncol=2,
labels=['Yes', 'No'], fontsize=12, title='Dependents')
plt.show()
# encode the target numerically so it can be correlated with the dummy-encoded features
df['Churn'] = df['Churn'].replace({'No': 0, 'Yes': 1})
df_dummies = pd.get_dummies(df)
df_dummies.head()
fig, ax = plt.subplots()
colormap = plt.get_cmap('RdBu')
churn_corr_dummies = df_dummies.corr()['Churn'].sort_values(ascending=False)
# [1:] drops Churn's correlation with itself (always 1.0 and sorted first)
churn_corr_dummies[1:].plot(kind='bar',
                            color=colormap(np.linspace(0, 1, len(churn_corr_dummies))),
                            ax=ax)
ax.yaxis.set_major_locator(MultipleLocator(0.1))
ax.set_title('Churn correlation with each feature')
plt.show()
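# Quick numeric view (illustrative addition, not in the original analysis): the
# dummy features most positively and most negatively correlated with churn.
print(churn_corr_dummies[1:6])
print(churn_corr_dummies[-5:])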
std_scaler = StandardScaler()
label_encoder = LabelEncoder()
ohe = OneHotEncoder(handle_unknown='ignore')
# scale the numerical features and one-hot encode the categorical ones
# (not_categorical[1:] and categorical[1:] skip 'customerID' and the target 'Churn')
preprocessing = ColumnTransformer(transformers=[('std_scaler', std_scaler,
                                                 not_categorical[1:]),
                                                ('ohe', ohe, categorical[1:])])
df_clean = df.copy()
# Churn is already 0/1 after the earlier replace, so this LabelEncoder call leaves it unchanged
df_clean['Churn'] = label_encoder.fit_transform(df_clean['Churn'])
df_clean.head()
X = df_clean.drop('Churn', axis=1)
y = df_clean['Churn']
X.head()
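# Illustrative sanity check (not part of the original analysis): fit the
# ColumnTransformer on the features and inspect how many columns the scaling
# plus one-hot encoding produce.
X_preview = preprocessing.fit_transform(X)
print(f'Preprocessed feature matrix shape: {X_preview.shape}')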
# baseline: a dummy classifier that always predicts the positive class (churn)
dummy_model = DummyClassifier(strategy='constant', constant=1)
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3,
random_state=RANDOM_STATE)
scores = cross_validate(dummy_model, X, y, scoring=scoring, cv=rskf,
n_jobs=JOBS)
score_table(scores)
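# Quick check of the baseline expectation (illustrative): a classifier that
# always predicts churn has precision equal to the churn rate itself.
print(f'Churn rate (expected precision of the constant baseline): {y.mean():.3f}')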
models = [('LR', LogisticRegression(max_iter=10000)),
('LDA', LinearDiscriminantAnalysis()),
('KNN', KNeighborsClassifier()),
('CART', DecisionTreeClassifier(random_state=RANDOM_STATE)),
('NB', GaussianNB()),
('SVC', SVC(random_state=RANDOM_STATE)),
('RF', RandomForestClassifier(random_state=RANDOM_STATE)),
('SGD', SGDClassifier(random_state=RANDOM_STATE, loss='modified_huber')),
('LGBM', LGBMClassifier(random_state=RANDOM_STATE,
objective='binary')),
('XGB', XGBClassifier(tree_method='hist',
objective='binary:logistic'))]
# no resampling: only the preprocessing step precedes each classifier
steps_before_model = [('preprocessing', preprocessing)]
model_summary(X, y, models, steps_before_model, jobs=JOBS);
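# A minimal sketch of what one of these evaluations looks like when assembled by
# hand; it assumes model_summary chains the given steps with each classifier in
# an imblearn Pipeline and cross-validates it (an assumption, shown for clarity).
from imblearn.pipeline import Pipeline

example_pipe = Pipeline(steps=steps_before_model + [('model', LogisticRegression(max_iter=10000))])
example_cv = cross_validate(example_pipe, X, y, scoring='average_precision',
                            cv=rskf, n_jobs=JOBS)
print(f"LR mean average precision: {example_cv['test_score'].mean():.3f}")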
cm_means_norm_models = confusion_matrix_norm(X, y, models, steps_before_model)
confusion_matrix_plot(models, cm_means_norm_models, nrows=4, ncols=3,
figsize=(16, 16), remove_empty_axes=2)
# random undersampling of the majority (non-churn) class, applied after preprocessing
steps_before_model_rus = [('preprocessing', preprocessing),
                          ('under', RandomUnderSampler(random_state=RANDOM_STATE))]
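# Illustrative check (not part of the original notebook): the class balance
# RandomUnderSampler produces when resampling the full target vector.
_, y_rus_check = RandomUnderSampler(random_state=RANDOM_STATE).fit_resample(X[['tenure']], y)
print(pd.Series(y_rus_check).value_counts())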
model_summary(X, y, models, steps_before_model_rus, jobs=JOBS);
cm_means_norm_rus = confusion_matrix_norm(X, y, models, steps_before_model_rus)
confusion_matrix_plot(models, cm_means_norm_rus, nrows=4, ncols=3,
figsize=(16, 16), remove_empty_axes=2)
models_params = []
models_params.append(('LR_l1',
LogisticRegression(max_iter=10000),
{'C': [0.01, 0.1, 1, 10],
'class_weight': [None, 'balanced'],
'penalty': ['l1'],
'solver': ['liblinear', 'saga']}))
models_params.append(('LR_l2',
LogisticRegression(max_iter=10000),
{'C': [0.01, 0.1, 1, 10],
'class_weight': [None, 'balanced'],
'penalty': ['l2'],
'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag',
'saga']}))
models_params.append(('LDA',
                      LinearDiscriminantAnalysis(),
                      {'solver': ['svd', 'lsqr', 'eigen'],
                       # shrinkage is only supported by 'lsqr' and 'eigen'; the failing
                       # 'svd' combinations are one source of the warnings suppressed earlier
                       'shrinkage': np.arange(0, 1, 0.01)}))
# models_params.append(('KNN',
# KNeighborsClassifier(),
# {'n_neighbors': list(range(2, 16, 1)),
# 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
# 'weights': ['uniform', 'distance']}))
#
# models_params.append(('CART',
# DecisionTreeClassifier(random_state=RANDOM_STATE),
# {'max_depth': [2, 3, 5, 10],
# # 'min_samples_leaf': [5, 10, 20, 50, 100],
# 'criterion': ['gini', 'entropy']}))
#
# models_params.append(('NB',
# GaussianNB(),
# {'var_smoothing': np.logspace(0, -9, num=100), }))
models_params.append(('SVC',
SVC(random_state=RANDOM_STATE),
{'C': [0.5, 0.75, 1],
'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
'class_weight': [None, 'balanced'], }))
# models_params.append(('RF',
# RandomForestClassifier(random_state=RANDOM_STATE),
# {'max_depth': [2, 3, 5, 10, 20],
# # 'min_samples_split': [2, 5, 10],
# # 'min_samples_leaf': [5, 10, 20, 50, 100],
# 'criterion': ['gini', 'entropy', 'log_loss'],
# 'max_features': ['sqrt', 'log2', None],
# 'bootstrap': [True, False],
# 'n_estimators': [100, 250, 500, 1000]}))
# models_params.append(('SGD',
# SGDClassifier(random_state=RANDOM_STATE,
# early_stopping=True,
# validation_fraction=0.2),
# {'alpha': [1E-4, 1E-3, 1E-2, 1E-1, 1E0],
# 'eta0': [0.01],
# 'loss': ['hinge', 'log_loss', 'modified_huber',
# 'squared_hinge', 'perceptron'],
# 'penalty': ['l2', 'l1', 'elasticnet'],
# 'learning_rate': ['constant', 'optimal', 'invscaling',
# 'adaptive']}))
models_params.append(('LGBM',
                      LGBMClassifier(random_state=RANDOM_STATE,
                                     objective='binary',
                                     subsample=0.5, subsample_freq=5,
                                     colsample_bytree=0.7),
                      # 'num_iterations' is LightGBM's alias for n_estimators
                      {'max_depth': range(1, 6, 1),
                       'num_iterations': [10, 50, 100, 500],
                       'learning_rate': [0.01, 0.1, 0.3],
                       'min_child_samples': [500, 1000],
                       # 'min_gain_to_split': [1, 5, 10],
                       # 'boosting_type': ['gbdt', 'dart', 'goss', 'rf']
                       }))
# models_params.append(('XGB',
# XGBClassifier(tree_method='hist',
# objective='binary:logistic'),
# {'learning_rate': [0.01, 0.1, 0.3],
# 'max_depth': range(1, 6, 1),
# 'n_estimators': [10, 50, 100, 500],
# # 'subsample': [0.5, 0.75, 1.0],
# # 'colsample_bytree': [0.5, 0.75, 1.0]
# }))
results_rus = hyperparameter_tuning(X, y, models_params, steps_before_model_rus,
scoring='average_precision', verb=0,
jobs=JOBS)
results_rus
models_tuning_rus = [r[2] for r in results_rus]  # tuned estimator returned for each model
models_tuning_rus
names = [n for n, _, _ in models_params]
names
# pair each tuned estimator with its model name, mirroring the structure of `models`
models_tuning_rus = list(zip(names, models_tuning_rus))
models_tuning_rus
model_summary(X, y, models_tuning_rus, steps_before_model_rus, jobs=JOBS);
cm_means_norm_rus_tuning = confusion_matrix_norm(X, y, models_tuning_rus,
steps_before_model_rus)
confusion_matrix_plot(models_tuning_rus, cm_means_norm_rus_tuning,
nrows=2, ncols=3,
figsize=(14, 8), remove_empty_axes=1)
# the LGBM classifier is the last entry in models_tuning_rus
lgbm_tuning_rus = models_tuning_rus[-1][-1]
lgbm_tuning_rus
lgbm_tuning_rus_importances = pd.DataFrame({'Feature': preprocessing.get_feature_names_out(),
'Importance': lgbm_tuning_rus.feature_importances_})
lgbm_tuning_rus_importances.sort_values(by='Importance', ascending=False)
fig, ax = plt.subplots()
lgbm_tuning_rus_importances_top_10 = lgbm_tuning_rus_importances.sort_values(by='Importance',
ascending=False)[:10]
lgbm_tuning_rus_importances_top_10.plot(kind='bar',
color=colormap(np.linspace(0, 0.4, 10)),
ax=ax, x='Feature', y='Importance',
legend=False)
ax.set_title('Feature importance - LGBM classifier')
plt.show()
models
# SMOTE oversampling of the minority (churn) class, applied after preprocessing
steps_before_model_over = [('preprocessing', preprocessing),
                           ('over', SMOTE(random_state=RANDOM_STATE))]
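# Illustrative check (not part of the original notebook): SMOTE operates on the
# encoded feature matrix inside the pipeline, so this balance check uses the
# fitted ColumnTransformer output.
_, y_smote_check = SMOTE(random_state=RANDOM_STATE).fit_resample(preprocessing.fit_transform(X), y)
print(pd.Series(y_smote_check).value_counts())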
model_summary(X, y, models, steps_before_model_over, jobs=JOBS);
cm_means_norm_over = confusion_matrix_norm(X, y, models,
steps_before_model_over)
confusion_matrix_plot(models, cm_means_norm_over, nrows=4, ncols=3,
figsize=(16, 16), remove_empty_axes=2)
results_over = hyperparameter_tuning(X, y, models_params,
steps_before_model_over,
scoring='average_precision', verb=0,
jobs=JOBS)
results_over
models_tuning_over = [r[2] for r in results_over]
models_tuning_over = list(zip(names, models_tuning_over))
model_summary(X, y, models_tuning_over, steps_before_model_over, jobs=JOBS);
cm_means_norm_over_tuning = confusion_matrix_norm(X, y, models_tuning_over,
steps_before_model_over)
confusion_matrix_plot(models_tuning_over, cm_means_norm_over_tuning,
nrows=2, ncols=3, figsize=(14, 8), remove_empty_axes=1)
# index 1 picks the tuned L2-penalised logistic regression ('LR_l2')
lr_tuning_over = models_tuning_over[1][-1]
lr_tuning_over
lr_tuning_over_importances = pd.DataFrame({'Feature': preprocessing.get_feature_names_out(),
'Importance': lr_tuning_over.coef_[0]})
lr_tuning_over_importances.sort_values(by='Importance').head()  # most negative coefficients
lr_tuning_over_importances.sort_values(by='Importance').tail()  # most positive coefficients
# convert the log-odds coefficients into odds ratios
coeff_to_odds = np.exp(lr_tuning_over.coef_[0])
odds = pd.DataFrame({'Feature': preprocessing.get_feature_names_out(),
'odds': coeff_to_odds}).sort_values(by='odds',
ascending=False)
odds
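# Interpretation aid (illustrative): each odds value is exp(coefficient), i.e. the
# multiplicative change in churn odds for a one-unit increase in that (scaled or
# one-hot) feature. For example, a coefficient of 0.7 maps to:
print(f'exp(0.7) = {np.exp(0.7):.2f}  ->  churn odds roughly doubled')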
fig, ax = plt.subplots(figsize=(14, 10))
odds.plot(kind='bar', color=colormap(np.linspace(0, 1, len(odds))), ax=ax,
x='Feature', y='odds', legend=False)
ax.set_ylabel('Odds')
ax.yaxis.set_major_locator(MultipleLocator(0.25))
fig.suptitle('Odds of churn per feature\n')
plt.gcf().text(0.025, 0.915,
               'Strong red: features that increase the odds of churn. Strong blue: the opposite',
               fontsize=18, color='gray')
plt.show()