import pandas as pd
import matplotlib
import seaborn as sns
import numpy as np
import imblearn
import sklearn
from platform import python_version
packages = {'Pandas': pd,
'Matplotlib': matplotlib,
'Seaborn': sns,
'NumPy': np,
'Scikit-Learn': sklearn,
'Imbalanced-Learn': imblearn}
print('Versions of the packages:\n')
print('{0:-^20} | {1:-^10}'.format('', ''))
print('{0:^20} | {1:^10}'.format('Package', 'Version'))
print('{0:-^20} | {1:-^10}'.format('', ''))
for name, alias in sorted(packages.items()):
    print(f'{name:<20} | {alias.__version__:>10}')
print()
print('{0}: {1}'.format('Python version', python_version()))
# some warnings may arise during GridSearchCV; they can be safely ignored
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
# plots
import matplotlib.pyplot as plt
from matplotlib.ticker import EngFormatter, PercentFormatter, MultipleLocator
plt.style.use('scripts/flsbustamante.mplstyle')
# scaling and composing
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
# imbalanced
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
# classifiers
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import xgboost
xgboost.set_config(verbosity=0)
# custom functions and variables
from scripts.functions import (model_summary,
confusion_matrix_norm, confusion_matrix_plot,
score_table,
hyperparameter_tuning,
RANDOM_STATE, scoring)
# presentation and local constants
# from IPython.core.display_functions import display
JOBS = -2  # n_jobs for joblib: use all CPU cores except one
DATAFILE = 'data/ibm_telco_customer_churn.csv'
df = pd.read_csv(DATAFILE)
with pd.option_context('display.max_columns', 25):
    display(df.head())
print(f'Number of instances (rows): {df.shape[0]:>10}')
print(f'Number of attributes (columns): {df.shape[1]:>5}')
df.info()
# TotalCharges holds whitespace-only strings instead of NaN in the raw CSV
sum(map(lambda x: x.isspace(), df['TotalCharges']))
object_columns = [column for column in df.columns if (df[column].dtype == 'object')]
# count whitespace-only entries in every object column
for column in object_columns:
    print(f'{column:<20} {sum(map(lambda x: x.isspace(), df[column]))}')
# replace whitespace-only strings with NaN and convert to numeric where possible
# (only TotalCharges actually changes; errors='ignore' leaves genuine text columns untouched)
df = df.replace(r'^\s*$', np.nan, regex=True)
df = df.apply(pd.to_numeric, errors='ignore')
# SeniorCitizen is stored as 0/1; map it to Yes/No so it is treated as categorical
df['SeniorCitizen'] = df['SeniorCitizen'].replace({0: 'No', 1: 'Yes'})
df.info()
df.isnull().sum()
with pd.option_context('display.precision', 2, 'display.max_columns', 25):
    display(df.describe())
df['TotalCharges'].isnull().sum() / df.shape[0] * 100  # percentage of rows with missing TotalCharges
df.loc[df['TotalCharges'].isnull()]
# impute the few missing TotalCharges values with the column median
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())
df[df == 0].count()  # number of zero values per column
df.nunique()
df.describe(exclude=[np.number])
churn = df.groupby('Churn').size()
churn
churn.div(churn.sum()).multiply(100)  # churn split as a percentage of all customers
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, nrows=1)
for ax in (ax2, ax3):
    ax.axis('off')
eng_format = EngFormatter(places=1)
labels = churn.values
churn.plot(kind='bar', ax=ax1)
ax1.yaxis.set_major_formatter(EngFormatter())
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
ax1.tick_params(axis='x', labelrotation=0)
ax1.bar_label(ax1.containers[0], label_type='edge',
labels=labels,
color='black', fontsize=10,
weight='bold')
ax2.text(-0.15, 0.75, 'Of the 7043 customers', fontsize=19, color='blue')
ax2.text(-0.15, 0.58, r'$\bf{1869}$', fontsize=80, color='purple')
ax2.text(-0.15, 0.5, r'$\mathrm{\bf{have\ left}}$', fontsize=19, color='red')
ax3.text(0.15, 0.51, 'This means that', fontsize=19, color='gray')
ax3.text(0.15, 0.35, r'$\mathrm{\bf{26.5\%}}$', fontsize=80, color='red')
ax3.text(0.15, 0.3, r'$\mathrm{\bf{of\ the\ customers\ left\ the\ company}}$',
fontsize=19, color='blue')
fig.suptitle('Data summary - Telco customer churn')
plt.show()
fig, axes = plt.subplots(nrows=1, ncols=3)
df.hist(ax=axes)
axes[0].set_xlabel('Months')
axes[0].set_ylabel('Customers')
axes[1].set_xlabel('Charge')
axes[1].set_ylabel('Customers')
axes[2].set_xlabel('Charge')
axes[2].set_ylabel('Customers')
fig.suptitle('Distribution of the numerical features')
plt.show()
fig, ax = plt.subplots()
sns.histplot(x='tenure', hue='Contract', data=df, ax=ax,
kde=False, fill=True, stat='density', common_norm=True,
multiple='layer')
sns.kdeplot(x='tenure', hue='Contract', data=df,
ax=ax, fill=True, multiple='layer')
fig.suptitle('Tenure distribution by contract type')
sns.move_legend(ax, 'upper center', bbox_to_anchor=(0.5, 1.15), ncol=3)
plt.show()
df = df.drop('customerID', axis=1)
# not_categorical[1:] gives the numerical features ('customerID' is already dropped);
# np.setdiff1d sorts alphabetically, so categorical[0] is the target 'Churn' and
# categorical[1:] gives the categorical features
not_categorical = ('customerID', 'tenure', 'MonthlyCharges', 'TotalCharges')
categorical = list(np.setdiff1d(df.columns.values, not_categorical))
fig, axes = plt.subplots(ncols=3, figsize=(14, 6))
for i, feature in enumerate(not_categorical[1:]):
    k = sns.kdeplot(x=feature, hue='Churn', data=df, ax=axes.flat[i],
                    fill=True, multiple='layer')
    # keep the hue handles/labels so a single figure-level legend can be drawn,
    # then drop the per-axes legend (legendHandles was renamed to legend_handles in Matplotlib 3.7+)
    handles = k.legend_.legendHandles
    labels = [t.get_text() for t in k.legend_.get_texts()]
    k.legend_.remove()
fig.suptitle('Churn distribution in numerical features\n\n')
fig.legend(bbox_to_anchor=(0.5, 0.925), loc='upper center', ncol=2,
labels=labels, handles=handles, fontsize=12, title='Churn')
plt.show()
fig, axes = plt.subplots(ncols=3, figsize=(16, 4), sharey=True)
for i, feature in enumerate(not_categorical[1:]):
    sns.boxplot(x=feature, y='Churn', data=df, ax=axes.flat[i])
axes.flat[0].xaxis.set_major_locator(MultipleLocator(10))
axes.flat[1].xaxis.set_major_locator(MultipleLocator(10))
axes.flat[1].set_ylabel('')
axes.flat[2].xaxis.set_major_locator(MultipleLocator(1000))
axes.flat[2].set_ylabel('')
axes.flat[2].tick_params(axis='x', rotation=25)
fig.suptitle('Churn distribution in numerical features')
fig.align_labels()
plt.show()
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(14, 14), sharey=True)
for i, feature in enumerate(categorical[1:]):
    c = sns.histplot(x=feature, data=df, ax=axes.flat[i],
                     stat='percent', multiple='dodge', shrink=0.8)
    c.tick_params(axis='x', labelsize=12, rotation=25)
    c.grid(False)
    c.yaxis.set_major_formatter(PercentFormatter())
    c.yaxis.set_major_locator(MultipleLocator(20))
    c.set_ylabel('')
    for bar in c.containers:
        c.bar_label(bar, label_type='edge', color='gray',
                    labels=[f'{b.get_height():.1f}' if b.get_height() > 0 else '' for b in bar],
                    fontsize=12, weight='bold')
        # use `color` here so the axes variable `c` is not shadowed by the colour tuple
        for b, color in zip(bar, sns.color_palette('tab20c')):
            b.set_facecolor(color)
fig.align_labels()
fig.suptitle('Categorical attributes percentage')
plt.show()
fig, axes = plt.subplots(nrows=4, ncols=4, figsize=(14, 20), sharey=True)
for i, feature in enumerate(categorical[1:]):
    c = sns.histplot(x=feature, hue='Churn', data=df, ax=axes.flat[i],
                     stat='percent', multiple='fill', shrink=0.8,
                     common_norm=True)
    c.tick_params(axis='x', labelsize=12, rotation=45)
    c.grid(False)
    # keep the hue handles/labels for a single figure-level legend, then drop the
    # per-axes legend (legendHandles was renamed to legend_handles in Matplotlib 3.7+)
    handles = c.legend_.legendHandles
    labels = [t.get_text() for t in c.legend_.get_texts()]
    c.legend_.remove()
    c.yaxis.set_major_formatter(PercentFormatter(1))
    c.set_ylabel('')
    for bar in c.containers:
        c.bar_label(bar, label_type='center', color='white',
                    labels=[f'{b.get_height() * 100:.1f}' if b.get_height() > 0 else '' for b in bar],
                    fontsize=12, weight='bold')
fig.suptitle('Churn by categorical attribute\n\n')
fig.legend(bbox_to_anchor=(0.5, 0.965), loc='upper center', ncol=2,
labels=labels, handles=handles, fontsize=12, title='Churn')
fig.align_labels()
plt.show()
c = sns.catplot(x='InternetService', y='MonthlyCharges', data=df,
hue='Churn', kind='box', height=8, aspect=1.4)
sns.move_legend(c, 'upper center', bbox_to_anchor=(0.5, 1.1), ncol=2)
plt.show()
partner_dependents = df.groupby(['Partner', 'Dependents']).size().unstack()
partner_dependents
# row-normalised percentages: within each partner group, the share with/without dependents
partner_dependents_pct = partner_dependents.div(partner_dependents.sum(axis=1),
                                                axis=0)
partner_dependents_pct
fig, ax = plt.subplots(figsize=(10, 8))
c = partner_dependents_pct.plot(kind='bar', stacked=True, rot=0, ax=ax,
width=0.6)
c.yaxis.set_major_formatter(PercentFormatter(1))
c.grid(False)
c.legend_.remove()
# annotate each stacked segment with its percentage
for p in c.patches:
    width, height = p.get_width(), p.get_height()
    x, y = p.get_xy()
    c.annotate(f'{height*100:.1f}', (x + width/2, y + height*0.5), ha='center',
               color='white', weight='bold', size=14)
fig.suptitle('Customers - partners vs dependents\n\n')
plt.gcf().text(0.025, 0.865,
'Percentage of customers, grouped by partner status, that have dependents\n',
fontsize=18, color='gray')
fig.legend(bbox_to_anchor=(0.5, 0.885), loc='upper center', ncol=2,
labels=['Yes', 'No'], fontsize=12, title='Dependents')
plt.show()
# encode the target numerically so it can be correlated with the dummy-encoded features
df['Churn'] = df['Churn'].replace({'No': 0, 'Yes': 1})
df_dummies = pd.get_dummies(df)
df_dummies.head()
fig, ax = plt.subplots()
colormap = plt.get_cmap('RdBu')
churn_corr_dummies = df_dummies.corr()['Churn'].sort_values(ascending=False)
# [1:] drops Churn's correlation with itself (always 1.0 and sorted first)
churn_corr_dummies[1:].plot(kind='bar',
                            color=colormap(np.linspace(0, 1, len(churn_corr_dummies))),
                            ax=ax)
ax.yaxis.set_major_locator(MultipleLocator(0.1))
ax.set_title('Churn correlation with each feature')
plt.show()
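# Quick numeric view (illustrative addition, not in the original analysis): the
# dummy features most positively and most negatively correlated with churn.
print(churn_corr_dummies[1:6])
print(churn_corr_dummies[-5:])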
std_scaler = StandardScaler()
label_encoder = LabelEncoder()
ohe = OneHotEncoder(handle_unknown='ignore')
# scale the numerical features and one-hot encode the categorical ones
# (not_categorical[1:] and categorical[1:] skip 'customerID' and the target 'Churn')
preprocessing = ColumnTransformer(transformers=[('std_scaler', std_scaler,
                                                 not_categorical[1:]),
                                                ('ohe', ohe, categorical[1:])])
df_clean = df.copy()
# Churn is already 0/1 after the earlier replace, so this LabelEncoder call leaves it unchanged
df_clean['Churn'] = label_encoder.fit_transform(df_clean['Churn'])
df_clean.head()
X = df_clean.drop('Churn', axis=1)
y = df_clean['Churn']
X.head()
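# Illustrative sanity check (not part of the original analysis): fit the
# ColumnTransformer on the features and inspect how many columns the scaling
# plus one-hot encoding produce.
X_preview = preprocessing.fit_transform(X)
print(f'Preprocessed feature matrix shape: {X_preview.shape}')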
# baseline: a dummy classifier that always predicts the positive class (churn)
dummy_model = DummyClassifier(strategy='constant', constant=1)
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3,
random_state=RANDOM_STATE)
scores = cross_validate(dummy_model, X, y, scoring=scoring, cv=rskf,
n_jobs=JOBS)
score_table(scores)
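# Quick check of the baseline expectation (illustrative): a classifier that
# always predicts churn has precision equal to the churn rate itself.
print(f'Churn rate (expected precision of the constant baseline): {y.mean():.3f}')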
models = [('LR', LogisticRegression(max_iter=10000)),
('LDA', LinearDiscriminantAnalysis()),
('KNN', KNeighborsClassifier()),
('CART', DecisionTreeClassifier(random_state=RANDOM_STATE)),
('NB', GaussianNB()),
('SVC', SVC(random_state=RANDOM_STATE)),
('RF', RandomForestClassifier(random_state=RANDOM_STATE)),
('SGD', SGDClassifier(random_state=RANDOM_STATE, loss='modified_huber')),
('LGBM', LGBMClassifier(random_state=RANDOM_STATE,
objective='binary')),
('XGB', XGBClassifier(tree_method='hist',
objective='binary:logistic'))]
# no resampling: only the preprocessing step precedes each classifier
steps_before_model = [('preprocessing', preprocessing)]
model_summary(X, y, models, steps_before_model, jobs=JOBS);
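# A minimal sketch of what one of these evaluations looks like when assembled by
# hand; it assumes model_summary chains the given steps with each classifier in
# an imblearn Pipeline and cross-validates it (an assumption, shown for clarity).
from imblearn.pipeline import Pipeline

example_pipe = Pipeline(steps=steps_before_model + [('model', LogisticRegression(max_iter=10000))])
example_cv = cross_validate(example_pipe, X, y, scoring='average_precision',
                            cv=rskf, n_jobs=JOBS)
print(f"LR mean average precision: {example_cv['test_score'].mean():.3f}")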
cm_means_norm_models = confusion_matrix_norm(X, y, models, steps_before_model)
confusion_matrix_plot(models, cm_means_norm_models, nrows=4, ncols=3,
figsize=(16, 16), remove_empty_axes=2)
# random undersampling of the majority (non-churn) class, applied after preprocessing
steps_before_model_rus = [('preprocessing', preprocessing),
                          ('under', RandomUnderSampler(random_state=RANDOM_STATE))]
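# Illustrative check (not part of the original notebook): the class balance
# RandomUnderSampler produces when resampling the full target vector.
_, y_rus_check = RandomUnderSampler(random_state=RANDOM_STATE).fit_resample(X[['tenure']], y)
print(pd.Series(y_rus_check).value_counts())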
model_summary(X, y, models, steps_before_model_rus, jobs=JOBS);
cm_means_norm_rus = confusion_matrix_norm(X, y, models, steps_before_model_rus)
confusion_matrix_plot(models, cm_means_norm_rus, nrows=4, ncols=3,
figsize=(16, 16), remove_empty_axes=2)
models_params = []
models_params.append(('LR_l1',
LogisticRegression(max_iter=10000),
{'C': [0.01, 0.1, 1, 10],
'class_weight': [None, 'balanced'],
'penalty': ['l1'],
'solver': ['liblinear', 'saga']}))
models_params.append(('LR_l2',
LogisticRegression(max_iter=10000),
{'C': [0.01, 0.1, 1, 10],
'class_weight': [None, 'balanced'],
'penalty': ['l2'],
'solver': ['liblinear', 'lbfgs', 'newton-cg', 'sag',
'saga']}))
models_params.append(('LDA',
                      LinearDiscriminantAnalysis(),
                      {'solver': ['svd', 'lsqr', 'eigen'],
                       # shrinkage is only supported by 'lsqr' and 'eigen'; the failing
                       # 'svd' combinations are one source of the warnings suppressed earlier
                       'shrinkage': np.arange(0, 1, 0.01)}))
# models_params.append(('KNN',
# KNeighborsClassifier(),
# {'n_neighbors': list(range(2, 16, 1)),
# 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
# 'weights': ['uniform', 'distance']}))
#
# models_params.append(('CART',
# DecisionTreeClassifier(random_state=RANDOM_STATE),
# {'max_depth': [2, 3, 5, 10],
# # 'min_samples_leaf': [5, 10, 20, 50, 100],
# 'criterion': ['gini', 'entropy']}))
#
# models_params.append(('NB',
# GaussianNB(),
# {'var_smoothing': np.logspace(0, -9, num=100), }))
models_params.append(('SVC',
SVC(random_state=RANDOM_STATE),
{'C': [0.5, 0.75, 1],
'kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
'class_weight': [None, 'balanced'], }))
# models_params.append(('RF',
# RandomForestClassifier(random_state=RANDOM_STATE),
# {'max_depth': [2, 3, 5, 10, 20],
# # 'min_samples_split': [2, 5, 10],
# # 'min_samples_leaf': [5, 10, 20, 50, 100],
# 'criterion': ['gini', 'entropy', 'log_loss'],
# 'max_features': ['sqrt', 'log2', None],
# 'bootstrap': [True, False],
# 'n_estimators': [100, 250, 500, 1000]}))
# models_params.append(('SGD',
# SGDClassifier(random_state=RANDOM_STATE,
# early_stopping=True,
# validation_fraction=0.2),
# {'alpha': [1E-4, 1E-3, 1E-2, 1E-1, 1E0],
# 'eta0': [0.01],
# 'loss': ['hinge', 'log_loss', 'modified_huber',
# 'squared_hinge', 'perceptron'],
# 'penalty': ['l2', 'l1', 'elasticnet'],
# 'learning_rate': ['constant', 'optimal', 'invscaling',
# 'adaptive']}))
models_params.append(('LGBM',
                      LGBMClassifier(random_state=RANDOM_STATE,
                                     objective='binary',
                                     subsample=0.5, subsample_freq=5,
                                     colsample_bytree=0.7),
                      # 'num_iterations' is LightGBM's alias for n_estimators
                      {'max_depth': range(1, 6, 1),
                       'num_iterations': [10, 50, 100, 500],
                       'learning_rate': [0.01, 0.1, 0.3],
                       'min_child_samples': [500, 1000],
                       # 'min_gain_to_split': [1, 5, 10],
                       # 'boosting_type': ['gbdt', 'dart', 'goss', 'rf']
                       }))
# models_params.append(('XGB',
# XGBClassifier(tree_method='hist',
# objective='binary:logistic'),
# {'learning_rate': [0.01, 0.1, 0.3],
# 'max_depth': range(1, 6, 1),
# 'n_estimators': [10, 50, 100, 500],
# # 'subsample': [0.5, 0.75, 1.0],
# # 'colsample_bytree': [0.5, 0.75, 1.0]
# }))
results_rus = hyperparameter_tuning(X, y, models_params, steps_before_model_rus,
scoring='average_precision', verb=0,
jobs=JOBS)
results_rus
models_tuning_rus = [r[2] for r in results_rus]  # tuned estimator returned for each model
models_tuning_rus
names = [n for n, _, _ in models_params]
names
# pair each tuned estimator with its model name, mirroring the structure of `models`
models_tuning_rus = list(zip(names, models_tuning_rus))
models_tuning_rus
model_summary(X, y, models_tuning_rus, steps_before_model_rus, jobs=JOBS);
cm_means_norm_rus_tuning = confusion_matrix_norm(X, y, models_tuning_rus,
steps_before_model_rus)
confusion_matrix_plot(models_tuning_rus, cm_means_norm_rus_tuning,
nrows=2, ncols=3,
figsize=(14, 8), remove_empty_axes=1)
# the LGBM classifier is the last entry in models_tuning_rus
lgbm_tuning_rus = models_tuning_rus[-1][-1]
lgbm_tuning_rus
lgbm_tuning_rus_importances = pd.DataFrame({'Feature': preprocessing.get_feature_names_out(),
'Importance': lgbm_tuning_rus.feature_importances_})
lgbm_tuning_rus_importances.sort_values(by='Importance', ascending=False)
fig, ax = plt.subplots()
lgbm_tuning_rus_importances_top_10 = lgbm_tuning_rus_importances.sort_values(by='Importance',
ascending=False)[:10]
lgbm_tuning_rus_importances_top_10.plot(kind='bar',
color=colormap(np.linspace(0, 0.4, 10)),
ax=ax, x='Feature', y='Importance',
legend=False)
ax.set_title('Feature importance - LGBM classifier')
plt.show()
models
# SMOTE oversampling of the minority (churn) class, applied after preprocessing
steps_before_model_over = [('preprocessing', preprocessing),
                           ('over', SMOTE(random_state=RANDOM_STATE))]
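# Illustrative check (not part of the original notebook): SMOTE operates on the
# encoded feature matrix inside the pipeline, so this balance check uses the
# fitted ColumnTransformer output.
_, y_smote_check = SMOTE(random_state=RANDOM_STATE).fit_resample(preprocessing.fit_transform(X), y)
print(pd.Series(y_smote_check).value_counts())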
model_summary(X, y, models, steps_before_model_over, jobs=JOBS);
cm_means_norm_over = confusion_matrix_norm(X, y, models,
steps_before_model_over)
confusion_matrix_plot(models, cm_means_norm_over, nrows=4, ncols=3,
figsize=(16, 16), remove_empty_axes=2)
results_over = hyperparameter_tuning(X, y, models_params,
steps_before_model_over,
scoring='average_precision', verb=0,
jobs=JOBS)
results_over
models_tuning_over = [r[2] for r in results_over]
models_tuning_over = list(zip(names, models_tuning_over))
model_summary(X, y, models_tuning_over, steps_before_model_over, jobs=JOBS);
cm_means_norm_over_tuning = confusion_matrix_norm(X, y, models_tuning_over,
steps_before_model_over)
confusion_matrix_plot(models_tuning_over, cm_means_norm_over_tuning,
nrows=2, ncols=3, figsize=(14, 8), remove_empty_axes=1)
# index 1 picks the tuned L2-penalised logistic regression ('LR_l2')
lr_tuning_over = models_tuning_over[1][-1]
lr_tuning_over
lr_tuning_over_importances = pd.DataFrame({'Feature': preprocessing.get_feature_names_out(),
'Importance': lr_tuning_over.coef_[0]})
lr_tuning_over_importances.sort_values(by='Importance').head()  # most negative coefficients
lr_tuning_over_importances.sort_values(by='Importance').tail()  # most positive coefficients
# convert the log-odds coefficients into odds ratios
coeff_to_odds = np.exp(lr_tuning_over.coef_[0])
odds = pd.DataFrame({'Feature': preprocessing.get_feature_names_out(),
'odds': coeff_to_odds}).sort_values(by='odds',
ascending=False)
odds
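# Interpretation aid (illustrative): each odds value is exp(coefficient), i.e. the
# multiplicative change in churn odds for a one-unit increase in that (scaled or
# one-hot) feature. For example, a coefficient of 0.7 maps to:
print(f'exp(0.7) = {np.exp(0.7):.2f}  ->  churn odds roughly doubled')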
fig, ax = plt.subplots(figsize=(14, 10))
odds.plot(kind='bar', color=colormap(np.linspace(0, 1, len(odds))), ax=ax,
x='Feature', y='odds', legend=False)
ax.set_ylabel('Odds')
ax.yaxis.set_major_locator(MultipleLocator(0.25))
fig.suptitle('Odds of churn per feature\n')
plt.gcf().text(0.025, 0.915,
               'Strong red: features that increase the odds of churn. Strong blue: the opposite',
               fontsize=18, color='gray')
plt.show()