# PREDICT CHURN

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, recall_score, precision_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

sns.set(style='darkgrid')


def get_label(g):
    """Label every bar of a bar-style plot with its rounded height.

    The text is centred horizontally on the bar and placed at half the bar's
    height.

    Args:
        g: Object exposing ``patches`` (items with ``get_height()``,
            ``get_x()`` and ``get_width()``) and a ``text(x, y, s, ...)``
            method; in this file it is what ``sns.countplot`` returns.
    """
    for bar in g.patches:
        height = bar.get_height()
        # round() raises on NaN/inf, so bars without a finite height are
        # skipped instead of aborting the whole plot.
        if not np.isfinite(height):
            continue
        x_centre = bar.get_x() + bar.get_width() / 2.
        g.text(x_centre, height / 2, '{}'.format(round(height)), ha="center", color='black')

# Load the raw customer table.
df = pd.read_csv('/work/BankChurners.csv')

# Bare expressions below only display something in a notebook/REPL.
df.sample(5)

df.shape

df.describe()

# Drop irrelevant columns. This call must be its own statement: when it
# shares a line with the comment above, it is never executed.
df.drop(
    [
        'CLIENTNUM',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    ],
    axis=1,
    inplace=True,
)

df.info()

# Heatmap of the boolean missing-value mask (one cell per row/column pair).
sns.heatmap(df.isna())

df.head(1)

# Target feature is 'Attrition_Flag' in which 'Attrited Customer' means it's
# a churn (1) and 'Existing Customer' means there is no churn (0).
# The mapping must be its own statement; fused onto the comment line it is
# never executed and the target stays as text.
df['Attrition_Flag'] = df['Attrition_Flag'].map({'Attrited Customer': 1, 'Existing Customer': 0})

# Work on a copy so the cleaned frame `df` stays untouched.
df1 = df.copy()

df1.dtypes

# Split the columns by dtype: int64/float64 columns vs. everything else.
df_num = df1.select_dtypes(include=['int64', 'float64'])
df_cat = df1.select_dtypes(exclude=['int64', 'float64'])

# Remove the target by name instead of the positional slice `iloc[:, 1:15]`.
# NOTE(review): the slice presumably existed to drop 'Attrition_Flag' as the
# first numeric column - confirm against the CSV's column order. Dropping by
# name keeps the target out of both feature frames whether it is numeric or
# still text.
df_num = df_num.drop(columns=['Attrition_Flag'], errors='ignore')
df_cat = df_cat.drop(columns=['Attrition_Flag'], errors='ignore')

df_num.shape

df_cat.sample()

df_num.sample()

# Separate copy used by the plots and as the source of the target column.
df2 = df1.copy()

df2.head()

# Count plot of the target column, titled first and then labelled per bar.
plt.figure(figsize=(10, 5))
g = sns.countplot(data=df2, x='Attrition_Flag')
plt.title('Count values to target variable')
get_label(g)

# One 30-bin histogram per column of df_num.
df_num.hist(figsize=(15, 15), bins=30)

df_cat.sample()

# Shorten the lowest income label so it fits on the axis.
df_cat['Income_Category'] = df_cat['Income_Category'].apply(
    lambda x: '< $40k' if x == 'Less than $40K' else x
)

# One labelled count plot per categorical column, each with its own figure size.
for column, figsize in (
    ('Gender', (15, 7)),
    ('Education_Level', (15, 7)),
    ('Marital_Status', (15, 7)),
    ('Income_Category', (15, 5)),
):
    plt.figure(figsize=figsize)
    g = sns.countplot(x=column, data=df_cat)
    get_label(g)

# The original opened an extra figure here and drew nothing on it, leaving a
# blank canvas; that stray figure is removed.

plt.figure(figsize=(15, 5))
plt.title('Number of Customers Inactive - Months_Inactive_12_mon')
g = sns.countplot(x='Months_Inactive_12_mon', data=df2)
get_label(g)

# Customer age (x) split by gender (y).
plt.figure(figsize=[15, 5])
sns.boxplot(x=df_num['Customer_Age'], y=df_cat['Gender'])
plt.xlabel('Age')

df2.sample()

# Pearson correlation over the numeric columns only. df2 still holds text
# columns, and calling DataFrame.corr on those raises on pandas >= 2.0;
# selecting the numeric columns explicitly works on every pandas version.
correlations = df2.select_dtypes(include='number').corr(method='pearson')

f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(correlations, annot=True, ax=ax)

df2.sample()

def _annotate_bar_heights(ax):
    """Write each bar's height as text next to the bar's top-left corner."""
    for p in ax.patches:
        ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))


# Target feature is 'Attrition_Flag' in which 'Attrited Customer' means it's
# a churn (1) and 'Existing Customer' means there is no churn (0).
# The plot below must live on its own lines; fused onto the comment line it
# is never drawn.
plt.figure(figsize=[20, 7])
sns.countplot(x='Customer_Age', hue='Attrition_Flag', data=df2)
plt.xlabel('Age')

plt.figure(figsize=[20, 7])
ax = sns.countplot(x='Months_Inactive_12_mon', hue='Attrition_Flag', data=df2)
_annotate_bar_heights(ax)
plt.xlabel('Months inactive')

# Shorten the lowest income label before plotting it.
df2['Income_Category'] = df2['Income_Category'].apply(lambda x: '< 40K' if x == 'Less than $40K' else x)

plt.figure(figsize=[20, 7])
ax = sns.countplot(x='Income_Category', hue='Attrition_Flag', data=df2)
_annotate_bar_heights(ax)
plt.xlabel('FAIXA SALARIAL')

df2.sample()

plt.figure(figsize=[20, 7])
ax = sns.countplot(x='Contacts_Count_12_mon', hue='Attrition_Flag', data=df2)
_annotate_bar_heights(ax)
plt.xlabel('Contacts')

# Card category comes from df_cat; the hue values come from df2's target column.
plt.figure(figsize=(15, 7))
g = sns.countplot(x='Card_Category', hue=df2['Attrition_Flag'], data=df_cat)
get_label(g)

df_cat.sample()

# One-hot encode the categorical columns.
df_cat_dummies = pd.get_dummies(df_cat)

df_cat_dummies.head()

# Feature matrix: numeric columns side by side with the dummy columns.
df3 = pd.concat([df_num, df_cat_dummies], axis=1)

X = df3.copy()
y = df2['Attrition_Flag']

# Hold out 30% for testing. The split is seeded so the reported metrics are
# reproducible, and stratified so both parts keep the same churn ratio -
# matching the seeded, stratified split used for the under-sampled data
# later in this file.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

def _report_classifier(report_header, accuracy_label, y_true, y_pred):
    """Print the evaluation metrics for one classifier and plot its confusion matrix.

    Args:
        report_header: Text printed before the classification report.
        accuracy_label: Text printed before the accuracy score.
        y_true: Ground-truth labels of the test set.
        y_pred: Labels predicted by the classifier for the same rows.

    Returns:
        The confusion matrix computed by ``confusion_matrix(y_true, y_pred)``.
    """
    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps precision and recall in the classification report.
    print(report_header, classification_report(y_true, y_pred))
    print(accuracy_label, accuracy_score(y_true, y_pred))

    print('MAE:', mean_absolute_error(y_true, y_pred))
    print('MSE:', mean_squared_error(y_true, y_pred))
    print('RMSE:', np.sqrt(mean_squared_error(y_true, y_pred)))

    # Generate the confusion matrix on a fresh figure so successive matrices
    # do not overdraw each other when the file runs as a script.
    matrix = confusion_matrix(y_true, y_pred)
    plt.figure()
    sns.heatmap(matrix, annot=True, fmt='d')
    return matrix


# Logistic regression
modelLR = LogisticRegression()
modelLR.fit(X_train, y_train)
predictLR = modelLR.predict(X_test)
cm = _report_classifier(
    'Logistic Regression: \n', 'Logistic Regression Accuracy: ', y_test, predictLR
)

# Random forest
modelRF = RandomForestClassifier()
modelRF.fit(X_train, y_train)
predictRF = modelRF.predict(X_test)
cm = _report_classifier(
    'Random Forest Classifier : \n', 'Random Forest Classifier Accuracy: ', y_test, predictRF
)

# Gradient boosting
modelGB = GradientBoostingClassifier()
modelGB.fit(X_train, y_train)
predictGB = modelGB.predict(X_test)
cm = _report_classifier(
    'Gradient Boost Classifier : \n', 'Gradient Boost Classifier Accuracy: ', y_test, predictGB
)

# FEATURE IMPORTANCES

# Get the important features: the ten largest importances of modelGB, drawn
# as a horizontal bar chart with the largest at the top. The code must live
# on its own lines; fused onto the comment line it is never executed.
plt.figure(figsize=(15, 7))
feature_list = pd.Series(modelGB.feature_importances_, index=X_train.columns).sort_values(ascending=False)
feature_list.nlargest(10).sort_values(ascending=True).plot(kind='barh')
plt.show()

from imblearn.under_sampling import RandomUnderSampler

# Start again from the full feature matrix and target.
x_under = df3.copy()
y_under = df2['Attrition_Flag']

# Seeded so the same rows are kept on every run, consistent with the seeded
# train/test split applied to this data afterwards.
# NOTE(review): resampling happens before the train/test split, so the test
# set is resampled too - confirm that this is intended.
under_sampler = RandomUnderSampler(random_state=1)

x_under, y_under = under_sampler.fit_resample(x_under, y_under)

# Class counts after resampling.
plt.figure(figsize=(15, 7))
g = sns.countplot(x=y_under)
get_label(g)

# Seeded, stratified split of the under-sampled data (default test size).
(
    X_under_train,
    X_under_test,
    y_under_train,
    y_under_test,
) = train_test_split(x_under, y_under, random_state=1, stratify=y_under)

# Logistic regression on the under-sampled training part; fit() returns the
# estimator itself, so construction and fitting are chained.
lr = LogisticRegression().fit(X_under_train, y_under_train)
y_pred = lr.predict(X_under_test)

print(classification_report(y_under_test, y_pred))

# Gradient boosting on the same split.
gb = GradientBoostingClassifier().fit(X_under_train, y_under_train)
gb_pred = gb.predict(X_under_test)

print(classification_report(y_under_test, gb_pred))

# Get the important features of `gb`, the gradient-boosting model trained on
# the under-sampled data. The original read the importances from `modelGB`
# (the model fitted on the full training split), so this chart repeated the
# earlier one instead of describing the model of this section.
plt.figure(figsize=(15, 7))
feature_list = pd.Series(gb.feature_importances_, index=X_under_train.columns).sort_values(ascending=False)
feature_list.nlargest(10).sort_values(ascending=True).plot(kind='barh')
plt.show()

# FEATURE IMPORTANCES

# FEATURE IMPORTANCES