import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Telco churn dataset and take a first look at its structure.
# The bare expressions below only display output when run interactively
# (e.g. as notebook cells); they are kept so the exploration flow is intact.
df = pd.read_csv('Telco-Customer-Churn.csv')
df.head()
df.info()
df.describe()
df.isnull().sum()

# Class balance of the target. Each plot gets its own figure: without an
# explicit figure the following boxplot would be drawn on the same axes
# as this count plot.
plt.figure()
sns.countplot(data=df, x='Churn')

# Distribution of TotalCharges for each churn class.
plt.figure()
sns.boxplot(data=df, x='Churn', y='TotalCharges')

# TotalCharges per contract type, split by churn class.
plt.figure(figsize=(8, 5), dpi=100)
sns.boxplot(data=df, x='Contract', y='TotalCharges', hue='Churn')
# Legend is placed outside the axes (axes-fraction coordinates).
plt.legend(loc=(1.1, 0.5))
df.columns
# Columns that are one-hot encoded (together with the target) before
# computing pairwise correlations.
dummy_source_cols = [
    'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
    'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
    'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn',
]
corr_df = pd.get_dummies(df[dummy_source_cols]).corr()

# Correlation of every encoded feature with the 'Churn_Yes' indicator,
# sorted ascending. The first and last entries of the sorted series are
# trimmed (expected to be Churn_No at -1 and Churn_Yes itself at +1).
churn_corr = corr_df['Churn_Yes'].sort_values().iloc[1:-1]
churn_corr

plt.figure(figsize=(10, 4), dpi=200)
sns.barplot(x=churn_corr.index, y=churn_corr.values)
plt.title("Feature Correlation to Yes Churn")
plt.xticks(rotation=90)
# Contract types
df['Contract'].unique()

# Overall distribution of customer tenure.
plt.figure(figsize=(10, 4), dpi=150)
sns.histplot(data=df, x='tenure', bins=50)

# Tenure distribution faceted by contract type (columns) and churn (rows).
# displot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it -- one would only leave an empty figure behind.
sns.displot(data=df, x='tenure', col='Contract', row='Churn')

# Monthly vs. total charges, coloured by churn class.
plt.figure(figsize=(10, 4), dpi=150)
sns.scatterplot(data=df, x='MonthlyCharges', y='TotalCharges', hue='Churn')
df.head()
# Count rows per (Churn, tenure) pair once, then transpose so that the
# churn label becomes the outer column level and tenure the inner one.
tenure_counts = df.groupby(['Churn', 'tenure']).count().transpose()
yes_churn = tenure_counts['Yes']
no_churn = tenure_counts['No']

# Percentage of customers that churned, per tenure value.
churn_rate = 100 * yes_churn / (no_churn + yes_churn)

# Every row of churn_rate holds per-column counts; the 'customerID' row is
# used as the per-tenure churn percentage.
churn_by_tenure = churn_rate.transpose()['customerID']
churn_by_tenure

plt.figure(figsize=(10, 4))
# Use the tenure values actually present in the data as x coordinates
# instead of a hard-coded range(1, 73), which fails with a length mismatch
# whenever the dataset's tenure values are not exactly 1..72.
plt.plot(churn_by_tenure.index, churn_by_tenure)
plt.xlabel('tenure')
plt.ylabel('Churn Percentage')
def tenure_cohort(tenure) -> str:
    """Return the cohort label for a customer's tenure.

    Buckets (upper bounds are inclusive):
        tenure < 13  -> '0-12 Months'
        tenure < 25  -> '12- 24 Months'
        tenure < 49  -> '24-48 Months'
        otherwise    -> 'Over 48 Months'

    The label strings are used as category values elsewhere in the
    analysis, so they are kept exactly as originally spelled.
    """
    if tenure < 13:
        return '0-12 Months'
    if tenure < 25:
        return '12- 24 Months'
    if tenure < 49:
        return '24-48 Months'
    return 'Over 48 Months'
# Label every customer with a tenure cohort.
df['Tenure Cohort'] = df['tenure'].apply(tenure_cohort)
df.head()

# Monthly vs. total charges, coloured by tenure cohort.
plt.figure(figsize=(10, 4), dpi=200)
sns.scatterplot(data=df, x='MonthlyCharges', y='TotalCharges', hue='Tenure Cohort',
                linewidth=0.5, alpha=0.5, palette='Dark2')

# Churn counts per tenure cohort.
plt.figure(figsize=(10, 4))
sns.countplot(data=df, x='Tenure Cohort', hue='Churn')

# Churn counts per cohort, one panel per contract type.
# catplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it -- one would only leave an empty figure behind.
sns.catplot(data=df, x='Tenure Cohort', hue='Churn', col='Contract', kind='count')
from sklearn.model_selection import train_test_split

# Target: the churn label. Features: every other column except the
# customer identifier, with categorical columns one-hot encoded
# (first level of each dropped).
y = df['Churn']
feature_frame = df.drop(columns=['Churn', 'customerID'])
X = pd.get_dummies(feature_frame, drop_first=True)

# Hold out 10% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=101,
)
from sklearn.tree import DecisionTreeClassifier
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
# which was removed from scikit-learn in 1.2 (importing it fails there).
from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier

# --- Single decision tree -------------------------------------------------
dt_model = DecisionTreeClassifier(max_depth=6)
dt_model.fit(X_train, y_train)
y_preds = dt_model.predict(X_test)
accuracy_score(y_test, y_preds)
print(classification_report(y_test, y_preds))
ConfusionMatrixDisplay.from_estimator(dt_model, X_test, y_test)

# Feature importances reported by the fitted tree, sorted ascending.
imp_feature = pd.DataFrame(
    index=X.columns,
    data=dt_model.feature_importances_,
    columns=['Importance'],
).sort_values('Importance')
imp_feature
# Optional: keep only the features with non-zero importance.
# imp_feature = imp_feature[imp_feature['Importance'] > 0]

plt.figure(figsize=(14, 6), dpi=200)
# imp_feature is already sorted by 'Importance', so it is plotted as is.
sns.barplot(data=imp_feature, x=imp_feature.index, y='Importance')
plt.xticks(rotation=90)

plt.figure(figsize=(12, 8), dpi=150)
# feature_names is passed as a plain list: some scikit-learn releases
# reject a pandas Index for this parameter.
plot_tree(dt_model, filled=True, feature_names=list(X.columns))

# --- Random forest --------------------------------------------------------
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, y_train)
y_preds = rf_model.predict(X_test)
print(classification_report(y_test, y_preds))
ConfusionMatrixDisplay.from_estimator(rf_model, X_test, y_test)

# --- AdaBoost -------------------------------------------------------------
ada_model = AdaBoostClassifier()
ada_model.fit(X_train, y_train)
y_preds = ada_model.predict(X_test)
print(classification_report(y_test, y_preds))
ConfusionMatrixDisplay.from_estimator(ada_model, X_test, y_test)

# --- Gradient boosting ----------------------------------------------------
gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
y_preds = gb_model.predict(X_test)
print(classification_report(y_test, y_preds))
ConfusionMatrixDisplay.from_estimator(gb_model, X_test, y_test)