# import useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('Telco-Customer-Churn.csv')
# 20 features and 1 label
# 7032 rows (customers)
df.shape
# there seem to be no missing values
# many of the columns are categorical
df.info()
# let's first check the Churn column
# fewer people discontinued the service than stayed
plt.figure(figsize=(8,6))
sns.countplot(x='Churn',data=df);
# our data is a little imbalanced, but not extremely so
df['Churn'].value_counts()
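# as a quick sketch, the imbalance can also be expressed as class percentages
df['Churn'].value_counts(normalize=True)*100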
# distribution of tenure between Churn categories with a violin plot
# people with 'No' churn tend to have longer tenure
plt.figure(figsize=(10,8))
sns.violinplot(x='Churn',y='tenure',data=df);
# KDE plot of tenure between Churn categories
# it seems that the longer the tenure, the less likely people are to churn
plt.figure(figsize=(10,8))
sns.kdeplot(data=df,x='tenure',hue='Churn');
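# a quick numeric check of the claim above: median tenure per Churn group
df.groupby('Churn')['tenure'].median()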
# distribution of MonthlyCharges between Churn categories with a violin plot
# people with 'No' churn tend to spend less on monthly charges
plt.figure(figsize=(10,8))
sns.violinplot(x='Churn',y='MonthlyCharges',data=df);
# KDE plot of MonthlyCharges between Churn categories
# it seems that once the monthly charge is larger than 60, the churn density increases rapidly
plt.figure(figsize=(10,8))
sns.kdeplot(data=df,x='MonthlyCharges',hue='Churn');
# distribution of TotalCharges between Churn categories with a violin plot
# people with 'No' churn tend to spend more on total charges
# this makes sense, since they usually used the service for a longer period of time
plt.figure(figsize=(10,8))
sns.violinplot(x='Churn',y='TotalCharges',data=df);
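# a numeric summary backing the three plots above: medians per Churn group
# (assuming TotalCharges is numeric in this CSV)
df.groupby('Churn')[['tenure','MonthlyCharges','TotalCharges']].median()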
# countplot of InternetService with hue equal to Churn
plt.figure(figsize=(10,4))
sns.countplot(data=df,x='InternetService',hue='Churn');
# percentage of fiber optic customers who churned
mask = (df['InternetService']=='Fiber optic') & (df['Churn']=='Yes')
(len(df[mask])/len(df[df['InternetService']=='Fiber optic']))*100
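# the same comparison for all Internet service types at once, via a normalized crosstab
pd.crosstab(df['InternetService'],df['Churn'],normalize='index')*100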
# countplot of Contract with hue equal to Churn
plt.figure(figsize=(10,4))
sns.countplot(data=df,x='Contract',hue='Churn');
# countplot of PaymentMethod with hue equal to Churn
plt.figure(figsize=(10,4))
sns.countplot(data=df,x='PaymentMethod',hue='Churn');
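# churn percentages per contract type via the same normalized crosstab (PaymentMethod works analogously)
pd.crosstab(df['Contract'],df['Churn'],normalize='index')*100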
# create a pd.Series with the groupby method:
# groupby ['Churn','tenure'] -> count elements -> transpose ->
# select the columns under 'Yes' -> take the 'customerID' row
yes_churn_tenure = df.groupby(['Churn','tenure']).count().T['Yes'].loc['customerID']
# the same for customers with no churn
no_churn_tenure = df.groupby(['Churn','tenure']).count().T['No'].loc['customerID']
# churn rate in percent for each tenure value
churn_rate = yes_churn_tenure*100/(yes_churn_tenure+no_churn_tenure)
churn_rate
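# sanity check: the same churn rate can be computed directly with a groupby mean
df['Churn'].eq('Yes').groupby(df['tenure']).mean()*100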
# we can visualize this churn rate
plt.figure(figsize=(10,8))
churn_rate.plot()
plt.scatter(x=churn_rate.index,y=churn_rate)
plt.ylabel('Churn Rate')
plt.xlabel('Tenure');
# define a function for the apply method
def cohort(tenure):
    if tenure < 13:
        return '0-12 Months'
    elif tenure < 25:
        return '12-24 Months'
    elif tenure < 49:
        return '24-48 Months'
    else:
        return 'Over 48 Months'
# apply the function to create a new column 'Cohort'
df['Cohort'] = df['tenure'].apply(cohort)
df['Cohort']
df['Cohort'].value_counts(sort=False)
df[df['Churn']=='Yes']['Cohort'].value_counts(sort=False)
# churn rate drops significantly after a customer has used the service for more than a year
churn_rate_cohort = (df[df['Churn']=='Yes']['Cohort'].value_counts(sort=False))/(df['Cohort'].value_counts(sort=False))
churn_rate_cohort
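# as percentages, reindexed into chronological cohort order (using the four labels defined above)
cohort_order = ['0-12 Months','12-24 Months','24-48 Months','Over 48 Months']
(churn_rate_cohort*100).reindex(cohort_order).round(1)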
plt.figure(figsize=(10,8))
sns.countplot(x='Cohort',data=df,hue='Churn');
# catplot of Cohort and Contract type
# people on a month-to-month contract with less than 12 months of tenure are very likely to churn
# note: catplot creates its own figure, so a separate plt.figure() call is not needed
sns.catplot(data=df,x='Cohort',hue='Churn',col='Contract',kind='count');
# MonthlyCharges vs. TotalCharges with hue = Cohort
plt.figure(figsize=(12,8))
sns.scatterplot(data=df,x='MonthlyCharges',y='TotalCharges',hue='Cohort',palette='mako',alpha=.77);
df
# first split into features X and label y
X = df.drop(['Churn','Cohort','customerID'],axis=1)
# one-hot encode the categorical features, dropping the first level of each to avoid redundancy
X = pd.get_dummies(X,drop_first=True)
y = df['Churn']
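# sanity check: after get_dummies, every column should be numeric
# note: this assumes TotalCharges is already numeric in this CSV; in the raw Kaggle file it is
# read as object (it contains blank strings) and would need e.g.
# pd.to_numeric(df['TotalCharges'],errors='coerce') first
X.dtypes.value_counts()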
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
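# given the class imbalance noted earlier, a stratified split is an option (a sketch, not what was used above):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)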
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,ConfusionMatrixDisplay,confusion_matrix
from sklearn.model_selection import GridSearchCV
# create base model
dt_model = DecisionTreeClassifier(max_depth=6,random_state=42,class_weight='balanced')
# parameter grid
dt_param_grid = {'criterion':['gini','entropy'],'splitter':['best','random']}
# grid model
dt_grid = GridSearchCV(estimator=dt_model,param_grid=dt_param_grid)
# fit the model
dt_grid.fit(X_train,y_train)
dt_grid.best_estimator_.get_params()
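# mean cross-validated score of the best decision tree (GridSearchCV defaults to 5-fold CV)
dt_grid.best_score_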
# to ease the evaluation process, we define a helper function
# (named evaluate rather than eval, to avoid shadowing the Python builtin)
def evaluate(model,X_true,y_true):
    pred = model.predict(X_true)
    cm = confusion_matrix(y_true,pred)
    ConfusionMatrixDisplay(cm,display_labels=model.classes_).plot()
    print(classification_report(y_true,pred))
evaluate(dt_grid,X_test,y_test)
from sklearn.ensemble import RandomForestClassifier
# base model
rf_model = RandomForestClassifier(max_depth=6, random_state=42, class_weight='balanced')
# parameter grid
rf_param_grid = {'criterion':['gini','entropy']}
# grid model
rf_grid = GridSearchCV(estimator=rf_model,param_grid=rf_param_grid)
# fit the model
rf_grid.fit(X_train,y_train)
rf_grid.best_estimator_.get_params()
evaluate(rf_grid,X_test,y_test)
from sklearn.ensemble import AdaBoostClassifier
# base model
ab_model = AdaBoostClassifier(random_state=42)
# parameter grid
ab_param_grid = {'learning_rate':[0.1,0.5,1.0],'n_estimators':[50,100]}
# grid model
ab_grid = GridSearchCV(estimator=ab_model,param_grid=ab_param_grid)
# fit the model
ab_grid.fit(X_train,y_train)
ab_grid.best_estimator_.get_params()
evaluate(ab_grid,X_test,y_test)
# the AdaBoost model performs better than the single Decision Tree and Random Forest models
from sklearn.ensemble import GradientBoostingClassifier
# base model
gb_model = GradientBoostingClassifier(random_state=42)
# parameter grid
gb_param_grid = {'learning_rate':[0.05,0.1,0.5,1.0],'n_estimators':[50,100],'max_depth':[4,5,6]}
# grid model
gb_grid = GridSearchCV(estimator=gb_model,param_grid=gb_param_grid)
# fit the model
gb_grid.fit(X_train,y_train)
gb_grid.best_estimator_.get_params()
evaluate(gb_grid,X_test,y_test)
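# refit a standalone model with the winning hyperparameters (assumed to match gb_grid.best_params_)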
final_model = GradientBoostingClassifier(learning_rate=0.1,max_depth=4,n_estimators=50,random_state=42)
final_model.fit(X_train,y_train)
# collect the nonzero feature importances of the final model, sorted ascending
imp_feats = pd.DataFrame(index=X.columns,data=final_model.feature_importances_,columns=['ImpFeats'])
imp_feats = imp_feats[imp_feats['ImpFeats']>0].sort_values('ImpFeats')
plt.figure(figsize=(12,8))
sns.barplot(x=imp_feats.index,y='ImpFeats',data=imp_feats)
plt.xticks(rotation=90);
# the six most important features (the frame is sorted ascending, so these are the last rows)
imp_feats[-6:]