# import useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('Telco-Customer-Churn.csv')
# 21 columns: customerID, 19 features and 1 label (Churn)
# 7032 rows (customers)
df.shape
# there seem to be no missing values
# many of the columns are categorical
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   customerID        7032 non-null   object
 1   gender            7032 non-null   object
 2   SeniorCitizen     7032 non-null   int64
 3   Partner           7032 non-null   object
 4   Dependents        7032 non-null   object
 5   tenure            7032 non-null   int64
 6   PhoneService      7032 non-null   object
 7   MultipleLines     7032 non-null   object
 8   InternetService   7032 non-null   object
 9   OnlineSecurity    7032 non-null   object
 10  OnlineBackup      7032 non-null   object
 11  DeviceProtection  7032 non-null   object
 12  TechSupport       7032 non-null   object
 13  StreamingTV       7032 non-null   object
 14  StreamingMovies   7032 non-null   object
 15  Contract          7032 non-null   object
 16  PaperlessBilling  7032 non-null   object
 17  PaymentMethod     7032 non-null   object
 18  MonthlyCharges    7032 non-null   float64
 19  TotalCharges      7032 non-null   float64
 20  Churn             7032 non-null   object
dtypes: float64(2), int64(2), object(17)
memory usage: 1.1+ MB
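# as an extra check on the 'no missing values' observation, count NaNs per column explicitly
# (a quick sketch on the same DataFrame; every count should be zero for this cleaned file)
df.isna().sum().sort_values(ascending=False)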
# let's first check the Churn column
# there are fewer people who discontinued the service
plt.figure(figsize=(8,6))
sns.countplot(x='Churn',data=df);
# our data is a little bit imbalanced, but not too extreme
df['Churn'].value_counts()
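# to quantify the imbalance, the same counts can be expressed as percentages
# (a small sketch using normalize=True)
df['Churn'].value_counts(normalize=True) * 100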
# distribution of tenure between Churn categories with a violin plot
# people with 'No' churn tend to have longer tenure
plt.figure(figsize=(10,8))
sns.violinplot(x='Churn',y='tenure',data=df);
# kde plot of tenure between Churn categories
# it seems that the longer the tenure, the less likely people are to churn
plt.figure(figsize=(10,8))
sns.kdeplot(data=df,x='tenure',hue='Churn');
# distribution of MonthlyCharges between Churn categories with a violin plot
# people with 'No' churn tend to have lower monthly charges
plt.figure(figsize=(10,8))
sns.violinplot(x='Churn',y='MonthlyCharges',data=df);
# kde plot of MonthlyCharges between Churn categories
# it seems that the churn density increases rapidly once the monthly charge exceeds about 60
plt.figure(figsize=(10,8))
sns.kdeplot(data=df,x='MonthlyCharges',hue='Churn');
# distribution of TotalCharges between Churn categories with a violin plot
# people with 'No' churn tend to spend more on total charges
# this makes sense, since they usually used the service for a longer period of time
plt.figure(figsize=(10,8))
sns.violinplot(x='Churn',y='TotalCharges',data=df);
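# quick check supporting the point above: tenure and TotalCharges should be strongly
# positively correlated, since total charges accumulate over the months of service
df[['tenure', 'MonthlyCharges', 'TotalCharges']].corr()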
# countplot of InternetService with hue set to Churn
plt.figure(figsize=(10,4))
sns.countplot(data=df,x='InternetService',hue='Churn');
mask = (df['InternetService']=='Fiber optic') &(df['Churn']=='Yes')
# percentage of fiber optic customers who churned
(len(df[mask])/len(df[df['InternetService']=='Fiber optic']))*100
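# the same churn percentage can be computed for every InternetService category at once
# (a sketch using pd.crosstab with row-wise normalization)
pd.crosstab(df['InternetService'], df['Churn'], normalize='index')['Yes'] * 100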
plt.figure(figsize=(10,4))
sns.countplot(data=df,x='Contract',hue='Churn');
plt.figure(figsize=(10,4))
sns.countplot(data=df,x='PaymentMethod',hue='Churn');
# create a pd.Series with the groupby method
# groupby ['Churn','tenure'] -> count number of elements -> transpose ->
# only choose columns under 'Yes' -> look at the 'customerID' row
yes_churn_tenure = df.groupby(['Churn','tenure']).count().T['Yes'].loc['customerID']
# same for no churn
no_churn_tenure = df.groupby(['Churn','tenure']).count().T['No'].loc['customerID']
# churn rate
churn_rate = yes_churn_tenure*100/(yes_churn_tenure+no_churn_tenure)
churn_rate
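# the same per-tenure churn rate can be obtained more directly with a crosstab
# (a sketch; 'churn_rate_alt' is an illustrative name and should match churn_rate above)
churn_rate_alt = pd.crosstab(df['tenure'], df['Churn'], normalize='index')['Yes'] * 100
churn_rate_alt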
# we can visualize this churn rate
plt.figure(figsize=(10,8))
churn_rate.plot()
plt.scatter(x=churn_rate.index,y=churn_rate)
plt.ylabel('Churn Rate')
plt.xlabel('Tenure');
# define a function for the apply method
def cohort(tenure):
    if tenure < 13:
        return '0-12 Months'
    elif tenure < 25:
        return '12-24 Months'
    elif tenure < 49:
        return '24-48 Months'
    else:
        return 'Over 48 Months'
# apply the function to create a new column 'Cohort'
df['Cohort'] = df['tenure'].apply(cohort)
df['Cohort']
df['Cohort'].value_counts(sort=False)
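# the same cohorts can also be built with pd.cut instead of a custom apply function
# (a sketch; 'cohort_alt' is an illustrative column name, the bin edges match the function above,
# and tenure starts at 1 in this cleaned file so the (0, 12] bin is exactly '0-12 Months')
df['cohort_alt'] = pd.cut(df['tenure'],
                          bins=[0, 12, 24, 48, df['tenure'].max()],
                          labels=['0-12 Months', '12-24 Months', '24-48 Months', 'Over 48 Months'])
df['cohort_alt'].value_counts(sort=False)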
df[df['Churn']=='Yes']['Cohort'].value_counts(sort=False)
# churn rate drops significantly once a customer has used the service for more than 1 year
churn_rate_cohort = (df[df['Churn']=='Yes']['Cohort'].value_counts(sort=False))/(df['Cohort'].value_counts(sort=False))
churn_rate_cohort
plt.figure(figsize=(10,8))
sns.countplot(x='Cohort',data=df,hue='Churn');
# catplot of Cohort and Contract with hue set to Churn
# people on a month-to-month contract with less than 12 months of tenure are very likely to churn
# note: catplot creates its own figure, so its size is set with height/aspect instead of plt.figure
sns.catplot(data=df,x='Cohort',hue='Churn',col='Contract',kind='count',height=5,aspect=1);
# MonthlyCharges vs. TotalCharges with hue = Cohort
plt.figure(figsize=(12,8))
sns.scatterplot(data=df,x='MonthlyCharges',y='TotalCharges',hue='Cohort',palette='mako',alpha=.77);
df
# first split into X and y
X = df.drop(['Churn','Cohort','customerID'],axis=1)
X = pd.get_dummies(X,drop_first=True)
y = df['Churn']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
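# note: with an imbalanced target, passing stratify=y keeps the 'Yes'/'No' ratio the same in
# train and test (an alternative sketch, not what was used above)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,
#                                                     random_state=42, stratify=y)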
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,ConfusionMatrixDisplay,confusion_matrix
from sklearn.model_selection import GridSearchCV
# create base model
dt_model = DecisionTreeClassifier(max_depth=6,random_state=42,class_weight='balanced')
# parameter grid
dt_param_grid = {'criterion':['gini','entropy'],'splitter':['best','random']}
# grid model
dt_grid = GridSearchCV(estimator=dt_model,param_grid=dt_param_grid)
# fit the model
dt_grid.fit(X_train,y_train)
dt_grid.best_estimator_.get_params()
# In order to ease the evaluation process, we define a helper function
# (note: the name 'eval' shadows Python's built-in eval)
def eval(model, X_true, y_true):
    pred = model.predict(X_true)
    cm = confusion_matrix(y_true, pred)
    ConfusionMatrixDisplay(cm, display_labels=model.classes_).plot()
    print(classification_report(y_true, pred))
eval(dt_grid,X_test,y_test)
precision recall f1-score support
No 0.88 0.74 0.81 771
Yes 0.51 0.73 0.60 284
accuracy 0.74 1055
macro avg 0.70 0.73 0.70 1055
weighted avg 0.78 0.74 0.75 1055
from sklearn.ensemble import RandomForestClassifier
# base model
rf_model = RandomForestClassifier(max_depth=6, random_state=42, class_weight='balanced')
# parameter grid
rf_param_grid = {'criterion':['gini','entropy']}
# grid model
rf_grid = GridSearchCV(estimator=rf_model,param_grid=rf_param_grid)
# fit the model
rf_grid.fit(X_train,y_train)
rf_grid.best_estimator_.get_params()
eval(rf_grid,X_test,y_test)
precision recall f1-score support
No 0.90 0.70 0.79 771
Yes 0.49 0.80 0.61 284
accuracy 0.73 1055
macro avg 0.70 0.75 0.70 1055
weighted avg 0.79 0.73 0.74 1055
from sklearn.ensemble import AdaBoostClassifier
# base model
ab_model = AdaBoostClassifier(random_state=42)
# parameter grid
ab_param_grid = {'learning_rate':[0.1,0.5,1.0],'n_estimators':[50,100]}
# grid model
ab_grid = GridSearchCV(estimator=ab_model,param_grid=ab_param_grid)
# fit the model
ab_grid.fit(X_train,y_train)
ab_grid.best_estimator_.get_params()
eval(ab_grid,X_test,y_test)
precision recall f1-score support
No 0.83 0.89 0.86 771
Yes 0.63 0.52 0.57 284
accuracy 0.79 1055
macro avg 0.73 0.70 0.72 1055
weighted avg 0.78 0.79 0.78 1055
The AdaBoost model reaches a higher overall accuracy (0.79) than the single decision tree (0.74) and the random forest (0.73), although its recall on the 'Yes' class is lower (0.52 vs. 0.73 and 0.80).
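# to make the comparison concrete, the three grid-searched models can be scored side by side
# (a sketch that assumes the fitted dt_grid, rf_grid and ab_grid objects from above)
from sklearn.metrics import accuracy_score, f1_score
for name, model in [('DecisionTree', dt_grid), ('RandomForest', rf_grid), ('AdaBoost', ab_grid)]:
    pred = model.predict(X_test)
    print(f"{name}: accuracy={accuracy_score(y_test, pred):.3f}, "
          f"f1(Yes)={f1_score(y_test, pred, pos_label='Yes'):.3f}")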
from sklearn.ensemble import GradientBoostingClassifier
# base model
gb_model = GradientBoostingClassifier(random_state=42)
# parameter grid
gb_param_grid = {'learning_rate':[0.05,0.1,0.5,1.0],'n_estimators':[50,100],'max_depth':[4,5,6]}
# grid model
gb_grid = GridSearchCV(estimator=gb_model,param_grid=gb_param_grid)
# fit the model
gb_grid.fit(X_train,y_train)
gb_grid.best_estimator_.get_params()
eval(gb_grid,X_test,y_test)
precision recall f1-score support
No 0.83 0.90 0.86 771
Yes 0.65 0.50 0.56 284
accuracy 0.79 1055
macro avg 0.74 0.70 0.71 1055
weighted avg 0.78 0.79 0.78 1055
# final model with fixed hyperparameters (values taken from the grid search above)
final_model = GradientBoostingClassifier(learning_rate=0.1,max_depth=4,n_estimators=50,random_state=42)
final_model.fit(X_train,y_train)
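# the final model can also be built directly from the search object
# (an alternative sketch using the best_params_ attribute)
# final_model = GradientBoostingClassifier(**gb_grid.best_params_, random_state=42)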
imp_feats = pd.DataFrame(index=X.columns,data=final_model.feature_importances_,columns=['ImpFeats'])
imp_feats = imp_feats[imp_feats['ImpFeats']>0].sort_values('ImpFeats')
plt.figure(figsize=(12,8))
sns.barplot(x=imp_feats.index,y='ImpFeats',data=imp_feats)
plt.xticks(rotation=90);
imp_feats[-6:]
                                 ImpFeats
Contract_One year                0.048442
PaymentMethod_Electronic check   0.056930
MonthlyCharges                   0.073981
TotalCharges                     0.085359
InternetService_Fiber optic      0.232472
tenure                           0.328752
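# once the final model is settled, it can be persisted for later scoring
# (a sketch using joblib; the filename 'churn_gb_model.joblib' is just an example)
from joblib import dump
dump(final_model, 'churn_gb_model.joblib')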