# import useful libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv('Telco-Customer-Churn.csv')
# 20 features and 1 label
# 7032 rows (customers)
df.shape
# there seems no missing value
# many of the columns are categorical columns
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 customerID 7032 non-null object
1 gender 7032 non-null object
2 SeniorCitizen 7032 non-null int64
3 Partner 7032 non-null object
4 Dependents 7032 non-null object
5 tenure 7032 non-null int64
6 PhoneService 7032 non-null object
7 MultipleLines 7032 non-null object
8 InternetService 7032 non-null object
9 OnlineSecurity 7032 non-null object
10 OnlineBackup 7032 non-null object
11 DeviceProtection 7032 non-null object
12 TechSupport 7032 non-null object
13 StreamingTV 7032 non-null object
14 StreamingMovies 7032 non-null object
15 Contract 7032 non-null object
16 PaperlessBilling 7032 non-null object
17 PaymentMethod 7032 non-null object
18 MonthlyCharges 7032 non-null float64
19 TotalCharges 7032 non-null float64
20 Churn 7032 non-null object
dtypes: float64(2), int64(2), object(17)
memory usage: 1.1+ MB
# let's first check churn column
# there are less people who discontinued the service
plt.figure(figsize=(8,6))
sns.countplot(x='Churn',data=df);
# our data is little bit imbalanced, but not too extreme
df['Churn'].value_counts()
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 customerID 7032 non-null object
1 gender 7032 non-null object
2 SeniorCitizen 7032 non-null int64
3 Partner 7032 non-null object
4 Dependents 7032 non-null object
5 tenure 7032 non-null int64
6 PhoneService 7032 non-null object
7 MultipleLines 7032 non-null object
8 InternetService 7032 non-null object
9 OnlineSecurity 7032 non-null object
10 OnlineBackup 7032 non-null object
11 DeviceProtection 7032 non-null object
12 TechSupport 7032 non-null object
13 StreamingTV 7032 non-null object
14 StreamingMovies 7032 non-null object
15 Contract 7032 non-null object
16 PaperlessBilling 7032 non-null object
17 PaymentMethod 7032 non-null object
18 MonthlyCharges 7032 non-null float64
19 TotalCharges 7032 non-null float64
20 Churn 7032 non-null object
dtypes: float64(2), int64(2), object(17)
memory usage: 1.1+ MB
# distribution of tenure bettween Churn categories with a violin plot
# people with 'No' churn tend to have longer tenure
plt.figure(figsize=(10,8))
sns.violinplot(x='Churn',y='tenure',data=df);
# kde plot of tenure betwen Churn categories
# it seems that the longer the tenure the less likely people turn away
plt.figure(figsize=(10,8))
sns.kdeplot(data=df,x='tenure',hue='Churn');
# distribution of MonthlyCharges bettween Churn categories with a violin plot
# people with 'No' churn tend to spend less for monthly charges
plt.figure(figsize=(10,8))
sns.violinplot(x='Churn',y='MonthlyCharges',data=df);
# kde plot of MonthlyCharges betwen Churn categories
# it seems that when monthly charge is lager than 60, density increases rapidly
plt.figure(figsize=(10,8))
sns.kdeplot(data=df,x='MonthlyCharges',hue='Churn');
# distribution of TotalCharges bettween Churn categories with a violin plot
# people with 'No' churn tend to spend more on total charges
# this makes sense, they usually used the service for longer period of time
plt.figure(figsize=(10,8))
sns.violinplot(x='Churn',y='TotalCharges',data=df);
# countplot of InternetService with hue equals to Churn
plt.figure(figsize=(10,4))
sns.countplot(data=df,x='InternetService',hue='Churn');
mask = (df['InternetService']=='Fiber optic') &(df['Churn']=='Yes')
# percentage of people churn after using fiber optic Internet service
(len(df[mask])/len(df[df['InternetService']=='Fiber optic']))*100
plt.figure(figsize=(10,4))
sns.countplot(data=df,x='Contract',hue='Churn');
plt.figure(figsize=(10,4))
sns.countplot(data=df,x='PaymentMethod',hue='Churn');
# create a pd.series with groupby method
# groubpy [churn,tenure] -> count number of elements -> transpose ->
# only choose columns with 'yes' -> look at first row
yes_churn_tenure = df.groupby(['Churn','tenure']).count().T['Yes'].loc['customerID']
# same for no churn
no_churn_tenure = df.groupby(['Churn','tenure']).count().T['No'].loc['customerID']
# churn rate
churn_rate = yes_churn_tenure*100/(yes_churn_tenure+no_churn_tenure)
churn_rate
# we can visualize this churn rate
plt.figure(figsize=(10,8))
churn_rate.plot()
plt.scatter(x=churn_rate.index,y=churn_rate)
plt.ylabel('Churn Rate')
plt.xlabel('Tenure');
# define a function for apply method
def cohort(tenure):
if tenure<13:
return '0-12 Months'
elif tenure<25:
return '12-24 Monhts'
elif tenure<49:
return '24-48 Months'
else:
return 'Over 48 Months'
# apply the function to create a new column 'Cohort'
df['Cohort'] = df['tenure'].apply(cohort)
df['Cohort']
df['Cohort'].value_counts(sort=False)
df[df['Churn']=='Yes']['Cohort'].value_counts(sort=False)
# churn rate drops significantly after a customer uses the service of 1 year
churn_rate_cohort = (df[df['Churn']=='Yes']['Cohort'].value_counts(sort=False))/(df['Cohort'].value_counts(sort=False))
churn_rate_cohort
plt.figure(figsize=(10,8))
sns.countplot(x='Cohort',data=df,hue='Churn');
# catplot with cohort and contract method
# people with monthly contract and less than 12 month tenure are very likely to churn
plt.figure(figsize=(12,10))
sns.catplot(data=df,x='Cohort',hue='Churn',col='Contract',kind='count');
# MonthlyCharges vs. TotalCharges with hue = Cohort
plt.figure(figsize=(12,8))
sns.scatterplot(data=df,x='MonthlyCharges',y='TotalCharges',hue='Cohort',palette='mako',alpha=.77);
df
customerIDobject
7590-VHVEG0%
5575-GNVDE0%
7030 others100%
genderobject
Male50.5%
Female49.5%
0