# Supervised Learning Project - Tree Methods Focus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# --- Load the data and take a first look ------------------------------------
df = pd.read_csv('Telco-Customer-Churn.csv')

# head()/describe()/isnull().sum() are bare expressions: their values are only
# shown as cell output when this is run in a notebook. info() prints directly.
df.head()

df.info()

df.describe()

# Number of missing values per column.
df.isnull().sum()

# Class balance of the target. Each axes-level plot gets its own figure so
# that, when run as a plain script, successive plots are not drawn on top of
# each other on the same implicit axes.
plt.figure()
sns.countplot(data=df, x='Churn')

# Total charges for churned vs. retained customers.
plt.figure()
sns.boxplot(data=df, x='Churn', y='TotalCharges')

# Same comparison, split by contract type.
plt.figure(figsize=(8, 5), dpi=100)
sns.boxplot(data=df, x='Contract', y='TotalCharges', hue='Churn')
plt.legend(loc=(1.1, 0.5))  # legend anchored to the right, outside the axes

df.columns

# One-hot encode the categorical columns (including the target) so that every
# category level gets its own correlation coefficient.
corr_df = pd.get_dummies(
    df[[
        'gender', 'SeniorCitizen', 'Partner', 'Dependents',
        'PhoneService', 'MultipleLines', 'InternetService',
        'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
        'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
        'PaymentMethod', 'Churn',
    ]]
).corr()

# Correlation of every dummy column with 'Churn_Yes', ascending. The slice
# drops the first and last entries of the sorted series; these are expected to
# be 'Churn_No' and 'Churn_Yes' itself (assumes Churn is binary Yes/No).
# Computed once and reused for both the display and the plot.
churn_corr = corr_df['Churn_Yes'].sort_values().iloc[1:-1]
churn_corr

plt.figure(figsize=(10, 4), dpi=200)
sns.barplot(x=churn_corr.index, y=churn_corr.values)
plt.title("Feature Correlation to Yes Churn")
plt.xticks(rotation=90);

# Contract types
df['Contract'].unique()

# Overall distribution of tenure.
plt.figure(figsize=(10, 4), dpi=150)
sns.histplot(data=df, x='tenure', bins=50)

# Tenure distribution faceted by contract type (columns) and churn (rows).
# displot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (one would only leave an empty extra figure).
sns.displot(data=df, x='tenure', col='Contract', row='Churn')

# Monthly vs. total charges, coloured by churn.
plt.figure(figsize=(10, 4), dpi=150)
sns.scatterplot(data=df, x='MonthlyCharges', y='TotalCharges', hue='Churn')

df.head()

# Non-null counts per (Churn, tenure) group, computed once. After the
# transpose the columns are keyed by (Churn, tenure), so selecting 'Yes' or
# 'No' leaves one column per tenure value.
tenure_counts = df.groupby(['Churn', 'tenure']).count().transpose()
yes_churn = tenure_counts['Yes']
no_churn = tenure_counts['No']

# Percentage of customers at each tenure value that churned.
churn_rate = 100 * yes_churn / (no_churn + yes_churn)

# Each row of churn_rate comes from one original column's counts; the
# 'customerID' row is the one used as the per-tenure churn percentage.
tenure_churn_pct = churn_rate.transpose()['customerID']
tenure_churn_pct

plt.figure(figsize=(10, 4))
# Use the tenure values actually present as the x-axis. A hard-coded
# range(1, 73) only lines up when tenure takes exactly the values 1..72.
plt.plot(tenure_churn_pct.index, tenure_churn_pct.values)
plt.xlabel('tenure')
plt.ylabel('Churn Percentage')

def tenure_cohort(tenure):
    """Return the cohort label for a tenure value.

    Args:
        tenure: Tenure to classify; compared numerically against the
            thresholds 13, 25 and 49.

    Returns:
        '0-12 Months' when tenure < 13, '12- 24 Months' when tenure < 25,
        '24-48 Months' when tenure < 49, otherwise 'Over 48 Months'.
    """
    # Guard clauses, checked from the smallest threshold upwards.
    if tenure < 13:
        return '0-12 Months'
    if tenure < 25:
        return '12- 24 Months'
    if tenure < 49:
        return '24-48 Months'
    return 'Over 48 Months'

# Bucket every customer's tenure into a cohort label.
df['Tenure Cohort'] = df['tenure'].apply(tenure_cohort)

df.head()

# Monthly vs. total charges, coloured by tenure cohort.
plt.figure(figsize=(10, 4), dpi=200)
sns.scatterplot(
    data=df,
    x='MonthlyCharges',
    y='TotalCharges',
    hue='Tenure Cohort',
    linewidth=0.5,
    alpha=0.5,
    palette='Dark2',
)

# Churn counts per cohort.
plt.figure(figsize=(10, 4))
sns.countplot(data=df, x='Tenure Cohort', hue='Churn')

# Churn counts per cohort, one panel per contract type. catplot is a
# figure-level function that creates its own figure, so no plt.figure() call
# precedes it (one would only leave an empty extra figure).
sns.catplot(data=df, x='Tenure Cohort', hue='Churn', col='Contract', kind='count')

from sklearn.model_selection import train_test_split

# Features: everything except the target and the customer identifier, with
# categorical columns one-hot encoded. drop_first=True drops one level per
# encoded column.
X = df.drop(['Churn', 'customerID'], axis=1)
X = pd.get_dummies(X, drop_first=True)

# Target label.
y = df['Churn']

# Hold out 10% of the rows for testing; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=101
)

# plot_confusion_matrix was removed from scikit-learn in version 1.2;
# ConfusionMatrixDisplay.from_estimator (available since 1.0) replaces it.
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
)
from sklearn.tree import DecisionTreeClassifier, plot_tree

# --- Single decision tree ----------------------------------------------------
dt_model = DecisionTreeClassifier(max_depth=6)

dt_model.fit(X_train, y_train)

y_preds = dt_model.predict(X_test)

accuracy_score(y_test, y_preds)

print(classification_report(y_test, y_preds))

ConfusionMatrixDisplay.from_estimator(dt_model, X_test, y_test)

# Feature importances of the fitted tree, sorted ascending.
imp_feature = pd.DataFrame(
    index=X.columns,
    data=dt_model.feature_importances_,
    columns=['Importance'],
).sort_values('Importance')

imp_feature

# imp_feature = imp_feature[imp_feature['Importance'] > 0]

# imp_feature is already sorted, so it is plotted as-is.
plt.figure(figsize=(14, 6), dpi=200)
sns.barplot(data=imp_feature, x=imp_feature.index, y='Importance')
plt.xticks(rotation=90);

# feature_names is passed as a plain list rather than a pandas Index.
plt.figure(figsize=(12, 8), dpi=150)
plot_tree(dt_model, filled=True, feature_names=list(X.columns));

# --- Random forest -----------------------------------------------------------
rf_model = RandomForestClassifier(n_estimators=100)

rf_model.fit(X_train, y_train)

y_preds = rf_model.predict(X_test)

print(classification_report(y_test, y_preds))

ConfusionMatrixDisplay.from_estimator(rf_model, X_test, y_test)

# --- AdaBoost ----------------------------------------------------------------
ada_model = AdaBoostClassifier()

ada_model.fit(X_train, y_train)

y_preds = ada_model.predict(X_test)

print(classification_report(y_test, y_preds))

ConfusionMatrixDisplay.from_estimator(ada_model, X_test, y_test)

# --- Gradient boosting -------------------------------------------------------
gb_model = GradientBoostingClassifier()

gb_model.fit(X_train, y_train)

y_preds = gb_model.predict(X_test)

print(classification_report(y_test, y_preds))

ConfusionMatrixDisplay.from_estimator(gb_model, X_test, y_test)