# import useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('Telco-Customer-Churn.csv')
# 21 columns: customerID, 19 features and 1 label (Churn)
# 7032 rows (customers)
df.shape
# there seem to be no missing values
# many of the columns are categorical
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   customerID        7032 non-null   object
 1   gender            7032 non-null   object
 2   SeniorCitizen     7032 non-null   int64
 3   Partner           7032 non-null   object
 4   Dependents        7032 non-null   object
 5   tenure            7032 non-null   int64
 6   PhoneService      7032 non-null   object
 7   MultipleLines     7032 non-null   object
 8   InternetService   7032 non-null   object
 9   OnlineSecurity    7032 non-null   object
 10  OnlineBackup      7032 non-null   object
 11  DeviceProtection  7032 non-null   object
 12  TechSupport       7032 non-null   object
 13  StreamingTV       7032 non-null   object
 14  StreamingMovies   7032 non-null   object
 15  Contract          7032 non-null   object
 16  PaperlessBilling  7032 non-null   object
 17  PaymentMethod     7032 non-null   object
 18  MonthlyCharges    7032 non-null   float64
 19  TotalCharges      7032 non-null   float64
 20  Churn             7032 non-null   object
dtypes: float64(2), int64(2), object(17)
memory usage: 1.1+ MB
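# as an extra check on the 'no missing values' observation, count NaNs per column explicitly
# (a quick sketch on the same DataFrame; every count should be zero for this cleaned file)
df.isna().sum().sort_values(ascending=False)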
# let's first check the Churn column
# there are fewer people who discontinued the service
plt.figure(figsize=(8,6))
sns.countplot(x='Churn',data=df);
# our data is a little bit imbalanced, but not too extreme
df['Churn'].value_counts()
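# to quantify the imbalance, the same counts can be expressed as percentages
# (a small sketch using normalize=True)
df['Churn'].value_counts(normalize=True) * 100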
# distribution of tenure between Churn categories with a violin plot
# people with 'No' churn tend to have longer tenure
plt.figure(figsize=(10,8))
sns.violinplot(x='Churn',y='tenure',data=df);
# kde plot of tenure between Churn categories
# it seems that the longer the tenure, the less likely people are to churn
plt.figure(figsize=(10,8))
sns.kdeplot(data=df,x='tenure',hue='Churn');
# distribution of MonthlyCharges between Churn categories with a violin plot
# people with 'No' churn tend to have lower monthly charges
plt.figure(figsize=(10,8))
sns.violinplot(x='Churn',y='MonthlyCharges',data=df);
# kde plot of MonthlyCharges between Churn categories
# it seems that the churn density increases rapidly once the monthly charge exceeds about 60
plt.figure(figsize=(10,8))
sns.kdeplot(data=df,x='MonthlyCharges',hue='Churn');
# distribution of TotalCharges between Churn categories with a violin plot
# people with 'No' churn tend to spend more on total charges
# this makes sense, since they usually used the service for a longer period of time
plt.figure(figsize=(10,8))
sns.violinplot(x='Churn',y='TotalCharges',data=df);
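# quick check supporting the point above: tenure and TotalCharges should be strongly
# positively correlated, since total charges accumulate over the months of service
df[['tenure', 'MonthlyCharges', 'TotalCharges']].corr()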
# countplot of InternetService with hue set to Churn
plt.figure(figsize=(10,4))
sns.countplot(data=df,x='InternetService',hue='Churn');
mask = (df['InternetService']=='Fiber optic') &(df['Churn']=='Yes')
# percentage of fiber optic customers who churned
(len(df[mask])/len(df[df['InternetService']=='Fiber optic']))*100
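# the same churn percentage can be computed for every InternetService category at once
# (a sketch using pd.crosstab with row-wise normalization)
pd.crosstab(df['InternetService'], df['Churn'], normalize='index')['Yes'] * 100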
plt.figure(figsize=(10,4))
sns.countplot(data=df,x='Contract',hue='Churn');
plt.figure(figsize=(10,4))
sns.countplot(data=df,x='PaymentMethod',hue='Churn');
# create a pd.Series with the groupby method
# groupby ['Churn','tenure'] -> count number of elements -> transpose ->
# only choose columns under 'Yes' -> look at the 'customerID' row
yes_churn_tenure = df.groupby(['Churn','tenure']).count().T['Yes'].loc['customerID']
# same for no churn
no_churn_tenure = df.groupby(['Churn','tenure']).count().T['No'].loc['customerID']
# churn rate
churn_rate = yes_churn_tenure*100/(yes_churn_tenure+no_churn_tenure)
churn_rate
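# the same per-tenure churn rate can be obtained more directly with a crosstab
# (a sketch; 'churn_rate_alt' is an illustrative name and should match churn_rate above)
churn_rate_alt = pd.crosstab(df['tenure'], df['Churn'], normalize='index')['Yes'] * 100
churn_rate_alt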
# we can visualize this churn rate
plt.figure(figsize=(10,8))
churn_rate.plot()
plt.scatter(x=churn_rate.index,y=churn_rate)
plt.ylabel('Churn Rate')
plt.xlabel('Tenure');
# define a function for the apply method
def cohort(tenure):
    if tenure < 13:
        return '0-12 Months'
    elif tenure < 25:
        return '12-24 Months'
    elif tenure < 49:
        return '24-48 Months'
    else:
        return 'Over 48 Months'
# apply the function to create a new column 'Cohort'
df['Cohort'] = df['tenure'].apply(cohort)
df['Cohort']
df['Cohort'].value_counts(sort=False)
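# the same cohorts can also be built with pd.cut instead of a custom apply function
# (a sketch; 'cohort_alt' is an illustrative column name, the bin edges match the function above,
# and tenure starts at 1 in this cleaned file so the (0, 12] bin is exactly '0-12 Months')
df['cohort_alt'] = pd.cut(df['tenure'],
                          bins=[0, 12, 24, 48, df['tenure'].max()],
                          labels=['0-12 Months', '12-24 Months', '24-48 Months', 'Over 48 Months'])
df['cohort_alt'].value_counts(sort=False)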
df[df['Churn']=='Yes']['Cohort'].value_counts(sort=False)
# churn rate drops significantly once a customer has used the service for more than 1 year
churn_rate_cohort = (df[df['Churn']=='Yes']['Cohort'].value_counts(sort=False))/(df['Cohort'].value_counts(sort=False))
churn_rate_cohort
plt.figure(figsize=(10,8))
sns.countplot(x='Cohort',data=df,hue='Churn');
# catplot of Cohort and Contract with hue set to Churn
# people on a month-to-month contract with less than 12 months of tenure are very likely to churn
# note: catplot creates its own figure, so its size is set with height/aspect instead of plt.figure
sns.catplot(data=df,x='Cohort',hue='Churn',col='Contract',kind='count',height=5,aspect=1);
# MonthlyCharges vs. TotalCharges with hue = Cohort
plt.figure(figsize=(12,8))
sns.scatterplot(data=df,x='MonthlyCharges',y='TotalCharges',hue='Cohort',palette='mako',alpha=.77);
df
# first split into X and y
X = df.drop(['Churn','Cohort','customerID'],axis=1)
X = pd.get_dummies(X,drop_first=True)
y = df['Churn']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
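# note: with an imbalanced target, passing stratify=y keeps the 'Yes'/'No' ratio the same in
# train and test (an alternative sketch, not what was used above)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15,
#                                                     random_state=42, stratify=y)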
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,ConfusionMatrixDisplay,confusion_matrix
from sklearn.model_selection import GridSearchCV
# create base model
dt_model = DecisionTreeClassifier(max_depth=6,random_state=42,class_weight='balanced')
# parameter grid
dt_param_grid = {'criterion':['gini','entropy'],'splitter':['best','random']}
# grid model
dt_grid = GridSearchCV(estimator=dt_model,param_grid=dt_param_grid)
# fit the model
dt_grid.fit(X_train,y_train)
dt_grid.best_estimator_.get_params()
# In order to ease the evaluation process, we define a helper function
# (note: the name 'eval' shadows Python's built-in eval)
def eval(model, X_true, y_true):
    pred = model.predict(X_true)
    cm = confusion_matrix(y_true, pred)
    ConfusionMatrixDisplay(cm, display_labels=model.classes_).plot()
    print(classification_report(y_true, pred))
eval(dt_grid,X_test,y_test)
precision recall f1-score support
No 0.88 0.74 0.81 771
Yes 0.51 0.73 0.60 284
accuracy 0.74 1055
macro avg 0.70 0.73 0.70 1055
weighted avg 0.78 0.74 0.75 1055
from sklearn.ensemble import RandomForestClassifier
# base model
rf_model = RandomForestClassifier(max_depth=6, random_state=42, class_weight='balanced')
# parameter grid
rf_param_grid = {'criterion':['gini','entropy']}
# grid model
rf_grid = GridSearchCV(estimator=rf_model,param_grid=rf_param_grid)
# fit the model
rf_grid.fit(X_train,y_train)
rf_grid.best_estimator_.get_params()
eval(rf_grid,X_test,y_test)
precision recall f1-score support
No 0.90 0.70 0.79 771
Yes 0.49 0.80 0.61 284
accuracy 0.73 1055
macro avg 0.70 0.75 0.70 1055
weighted avg 0.79 0.73 0.74 1055
from sklearn.ensemble import AdaBoostClassifier
# base model
ab_model = AdaBoostClassifier(random_state=42)
# parameter grid
ab_param_grid = {'learning_rate':[0.1,0.5,1.0],'n_estimators':[50,100]}
# grid model
ab_grid = GridSearchCV(estimator=ab_model,param_grid=ab_param_grid)
# fit the model
ab_grid.fit(X_train,y_train)
ab_grid.best_estimator_.get_params()
eval(ab_grid,X_test,y_test)
precision recall f1-score support
No 0.83 0.89 0.86 771
Yes 0.63 0.52 0.57 284
accuracy 0.79 1055
macro avg 0.73 0.70 0.72 1055
weighted avg 0.78 0.79 0.78 1055
The AdaBoost model reaches a higher overall accuracy (0.79) than the single decision tree (0.74) and the random forest (0.73), although its recall on the 'Yes' class is lower (0.52 vs. 0.73 and 0.80).
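# to make the comparison concrete, the three grid-searched models can be scored side by side
# (a sketch that assumes the fitted dt_grid, rf_grid and ab_grid objects from above)
from sklearn.metrics import accuracy_score, f1_score
for name, model in [('DecisionTree', dt_grid), ('RandomForest', rf_grid), ('AdaBoost', ab_grid)]:
    pred = model.predict(X_test)
    print(f"{name}: accuracy={accuracy_score(y_test, pred):.3f}, "
          f"f1(Yes)={f1_score(y_test, pred, pos_label='Yes'):.3f}")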
from sklearn.ensemble import GradientBoostingClassifier
# base model
gb_model = GradientBoostingClassifier(random_state=42)
# parameter grid
gb_param_grid = {'learning_rate':[0.05,0.1,0.5,1.0],'n_estimators':[50,100],'max_depth':[4,5,6]}
# grid model
gb_grid = GridSearchCV(estimator=gb_model,param_grid=gb_param_grid)
# fit the model
gb_grid.fit(X_train,y_train)
gb_grid.best_estimator_.get_params()
eval(gb_grid,X_test,y_test)
precision recall f1-score support
No 0.83 0.90 0.86 771
Yes 0.65 0.50 0.56 284
accuracy 0.79 1055
macro avg 0.74 0.70 0.71 1055
weighted avg 0.78 0.79 0.78 1055
# final model with fixed hyperparameters (values taken from the grid search above)
final_model = GradientBoostingClassifier(learning_rate=0.1,max_depth=4,n_estimators=50,random_state=42)
final_model.fit(X_train,y_train)
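# the final model can also be built directly from the search object
# (an alternative sketch using the best_params_ attribute)
# final_model = GradientBoostingClassifier(**gb_grid.best_params_, random_state=42)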
imp_feats = pd.DataFrame(index=X.columns,data=final_model.feature_importances_,columns=['ImpFeats'])
imp_feats = imp_feats[imp_feats['ImpFeats']>0].sort_values('ImpFeats')
plt.figure(figsize=(12,8))
sns.barplot(x=imp_feats.index,y='ImpFeats',data=imp_feats)
plt.xticks(rotation=90);
imp_feats[-6:]
                                 ImpFeats
Contract_One year                0.048442
PaymentMethod_Electronic check   0.056930
MonthlyCharges                   0.073981
TotalCharges                     0.085359
InternetService_Fiber optic      0.232472
tenure                           0.328752
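# once the final model is settled, it can be persisted for later scoring
# (a sketch using joblib; the filename 'churn_gb_model.joblib' is just an example)
from joblib import dump
dump(final_model, 'churn_gb_model.joblib')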