# import useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('Telco-Customer-Churn.csv')
# 20 features and 1 label
# 7032 rows (customers)
df.shape
# there seem to be no missing values
# many of the columns are categorical
df.info()
# let's first check the Churn column
# fewer people discontinued the service than stayed
plt.figure(figsize=(8,6))
sns.countplot(x='Churn',data=df);
# our data is a little imbalanced, but not extremely so
df['Churn'].value_counts()
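# as a quick sketch, the imbalance can also be expressed as class percentages
df['Churn'].value_counts(normalize=True)*100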
# distribution of tenure between Churn categories with a violin plot
# people with 'No' churn tend to have longer tenure
plt.figure(figsize=(10,8))
sns.violinplot(x='Churn',y='tenure',data=df);
# KDE plot of tenure between Churn categories
# it seems that the longer the tenure, the less likely people are to churn
plt.figure(figsize=(10,8))
sns.kdeplot(data=df,x='tenure',hue='Churn');
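# a quick numeric check of the claim above: median tenure per Churn group
df.groupby('Churn')['tenure'].median()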
# distribution of MonthlyCharges between Churn categories with a violin plot
# people with 'No' churn tend to spend less on monthly charges
plt.figure(figsize=(10,8))
sns.violinplot(x='Churn',y='MonthlyCharges',data=df);
# KDE plot of MonthlyCharges between Churn categories
# it seems that once the monthly charge is larger than 60, the churn density increases rapidly
plt.figure(figsize=(10,8))
sns.kdeplot(data=df,x='MonthlyCharges',hue='Churn');
# distribution of TotalCharges between Churn categories with a violin plot
# people with 'No' churn tend to spend more on total charges
# this makes sense, since they usually used the service for a longer period of time
plt.figure(figsize=(10,8))
sns.violinplot(x='Churn',y='TotalCharges',data=df);
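# a numeric summary backing the three plots above: medians per Churn group
# (assuming TotalCharges is numeric in this CSV)
df.groupby('Churn')[['tenure','MonthlyCharges','TotalCharges']].median()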
# countplot of InternetService with hue equal to Churn
plt.figure(figsize=(10,4))
sns.countplot(data=df,x='InternetService',hue='Churn');
# percentage of fiber optic customers who churned
mask = (df['InternetService']=='Fiber optic') & (df['Churn']=='Yes')
(len(df[mask])/len(df[df['InternetService']=='Fiber optic']))*100
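# the same comparison for all Internet service types at once, via a normalized crosstab
pd.crosstab(df['InternetService'],df['Churn'],normalize='index')*100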
# countplot of Contract with hue equal to Churn
plt.figure(figsize=(10,4))
sns.countplot(data=df,x='Contract',hue='Churn');
# countplot of PaymentMethod with hue equal to Churn
plt.figure(figsize=(10,4))
sns.countplot(data=df,x='PaymentMethod',hue='Churn');
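# churn percentages per contract type via the same normalized crosstab (PaymentMethod works analogously)
pd.crosstab(df['Contract'],df['Churn'],normalize='index')*100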
# create a pd.Series with the groupby method:
# groupby ['Churn','tenure'] -> count elements -> transpose ->
# select the columns under 'Yes' -> take the 'customerID' row
yes_churn_tenure = df.groupby(['Churn','tenure']).count().T['Yes'].loc['customerID']
# the same for customers with no churn
no_churn_tenure = df.groupby(['Churn','tenure']).count().T['No'].loc['customerID']
# churn rate in percent for each tenure value
churn_rate = yes_churn_tenure*100/(yes_churn_tenure+no_churn_tenure)
churn_rate
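# sanity check: the same churn rate can be computed directly with a groupby mean
df['Churn'].eq('Yes').groupby(df['tenure']).mean()*100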
# we can visualize this churn rate
plt.figure(figsize=(10,8))
churn_rate.plot()
plt.scatter(x=churn_rate.index,y=churn_rate)
plt.ylabel('Churn Rate')
plt.xlabel('Tenure');
# define a function for the apply method
def cohort(tenure):
    if tenure < 13:
        return '0-12 Months'
    elif tenure < 25:
        return '12-24 Months'
    elif tenure < 49:
        return '24-48 Months'
    else:
        return 'Over 48 Months'
# apply the function to create a new column 'Cohort'
df['Cohort'] = df['tenure'].apply(cohort)
df['Cohort']
df['Cohort'].value_counts(sort=False)
df[df['Churn']=='Yes']['Cohort'].value_counts(sort=False)
# churn rate drops significantly after a customer has used the service for more than a year
churn_rate_cohort = (df[df['Churn']=='Yes']['Cohort'].value_counts(sort=False))/(df['Cohort'].value_counts(sort=False))
churn_rate_cohort
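# as percentages, reindexed into chronological cohort order (using the four labels defined above)
cohort_order = ['0-12 Months','12-24 Months','24-48 Months','Over 48 Months']
(churn_rate_cohort*100).reindex(cohort_order).round(1)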
plt.figure(figsize=(10,8))
sns.countplot(x='Cohort',data=df,hue='Churn');
# catplot of Cohort and Contract type
# people on a month-to-month contract with less than 12 months of tenure are very likely to churn
# note: catplot creates its own figure, so a separate plt.figure() call is not needed
sns.catplot(data=df,x='Cohort',hue='Churn',col='Contract',kind='count');
# MonthlyCharges vs. TotalCharges with hue = Cohort
plt.figure(figsize=(12,8))
sns.scatterplot(data=df,x='MonthlyCharges',y='TotalCharges',hue='Cohort',palette='mako',alpha=.77);
df
# first split into features X and label y
X = df.drop(['Churn','Cohort','customerID'],axis=1)
# one-hot encode the categorical features, dropping the first level of each to avoid redundancy
X = pd.get_dummies(X,drop_first=True)
y = df['Churn']
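# sanity check: after get_dummies, every column should be numeric
# note: this assumes TotalCharges is already numeric in this CSV; in the raw Kaggle file it is
# read as object (it contains blank strings) and would need e.g.
# pd.to_numeric(df['TotalCharges'],errors='coerce') first
X.dtypes.value_counts()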
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
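# given the class imbalance noted earlier, a stratified split is an option (a sketch, not what was used above):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)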
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,ConfusionMatrixDisplay,confusion_matrix
from sklearn.model_selection import GridSearchCV
# create base model
dt_model = DecisionTreeClassifier(max_depth=6,random_state=42,class_weight='balanced')
# parameter grid
dt_param_grid = {'criterion':['gini','entropy'],'splitter':['best','random']}
# grid model
dt_grid = GridSearchCV(estimator=dt_model,param_grid=dt_param_grid)
# fit the model
dt_grid.fit(X_train,y_train)
dt_grid.best_estimator_.get_params()
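# mean cross-validated score of the best decision tree (GridSearchCV defaults to 5-fold CV)
dt_grid.best_score_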
# to ease the evaluation process, we define a helper function
# (named evaluate rather than eval, to avoid shadowing the Python builtin)
def evaluate(model,X_true,y_true):
    pred = model.predict(X_true)
    cm = confusion_matrix(y_true,pred)
    ConfusionMatrixDisplay(cm,display_labels=model.classes_).plot()
    print(classification_report(y_true,pred))
evaluate(dt_grid,X_test,y_test)
from sklearn.ensemble import RandomForestClassifier
# base model
rf_model = RandomForestClassifier(max_depth=6, random_state=42, class_weight='balanced')
# parameter grid
rf_param_grid = {'criterion':['gini','entropy']}
# grid model
rf_grid = GridSearchCV(estimator=rf_model,param_grid=rf_param_grid)
# fit the model
rf_grid.fit(X_train,y_train)
rf_grid.best_estimator_.get_params()
evaluate(rf_grid,X_test,y_test)
from sklearn.ensemble import AdaBoostClassifier
# base model
ab_model = AdaBoostClassifier(random_state=42)
# parameter grid
ab_param_grid = {'learning_rate':[0.1,0.5,1.0],'n_estimators':[50,100]}
# grid model
ab_grid = GridSearchCV(estimator=ab_model,param_grid=ab_param_grid)
# fit the model
ab_grid.fit(X_train,y_train)
ab_grid.best_estimator_.get_params()
evaluate(ab_grid,X_test,y_test)
# the AdaBoost model performs better than the single Decision Tree and Random Forest models
from sklearn.ensemble import GradientBoostingClassifier
# base model
gb_model = GradientBoostingClassifier(random_state=42)
# parameter grid
gb_param_grid = {'learning_rate':[0.05,0.1,0.5,1.0],'n_estimators':[50,100],'max_depth':[4,5,6]}
# grid model
gb_grid = GridSearchCV(estimator=gb_model,param_grid=gb_param_grid)
# fit the model
gb_grid.fit(X_train,y_train)
gb_grid.best_estimator_.get_params()
evaluate(gb_grid,X_test,y_test)
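# refit a standalone model with the winning hyperparameters (assumed to match gb_grid.best_params_)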
final_model = GradientBoostingClassifier(learning_rate=0.1,max_depth=4,n_estimators=50,random_state=42)
final_model.fit(X_train,y_train)
# collect the nonzero feature importances of the final model, sorted ascending
imp_feats = pd.DataFrame(index=X.columns,data=final_model.feature_importances_,columns=['ImpFeats'])
imp_feats = imp_feats[imp_feats['ImpFeats']>0].sort_values('ImpFeats')
plt.figure(figsize=(12,8))
sns.barplot(x=imp_feats.index,y='ImpFeats',data=imp_feats)
plt.xticks(rotation=90);
# the six most important features (the frame is sorted ascending, so these are the last rows)
imp_feats[-6:]