import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, mean_squared_error, mean_absolute_error
sns.set(style='darkgrid')
def get_label(g):
    """Annotate every bar of a count/bar plot with its rounded height.

    Parameters
    ----------
    g : matplotlib Axes (e.g. the return value of seaborn's countplot)
        Each bar in ``g.patches`` gets its height written at the bar's
        horizontal centre, halfway up the bar.
    """
    for p in g.patches:
        height = p.get_height()
        # Hue-grouped countplots can yield empty patches whose height is NaN;
        # round() raises ValueError on NaN, so skip those bars.
        if not np.isfinite(height):
            continue
        g.text(p.get_x() + p.get_width() / 2.,
               height / 2,
               '{}'.format(round(height)),
               ha="center", color='black')
# Load the raw churn data and take a first look at it.
df = pd.read_csv('/work/BankChurners.csv')
df.sample(5)
df.shape
df.describe()
# DROP IRRELEVANT COLUMN: the customer id and the two pre-computed
# Naive Bayes score columns carry no predictive signal for this analysis.
irrelevant_cols = [
    'CLIENTNUM',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
df.drop(irrelevant_cols, axis=1, inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Attrition_Flag 10127 non-null object
1 Customer_Age 10127 non-null int64
2 Gender 10127 non-null object
3 Dependent_count 10127 non-null int64
4 Education_Level 10127 non-null object
5 Marital_Status 10127 non-null object
6 Income_Category 10127 non-null object
7 Card_Category 10127 non-null object
8 Months_on_book 10127 non-null int64
9 Total_Relationship_Count 10127 non-null int64
10 Months_Inactive_12_mon 10127 non-null int64
11 Contacts_Count_12_mon 10127 non-null int64
12 Credit_Limit 10127 non-null float64
13 Total_Revolving_Bal 10127 non-null int64
14 Avg_Open_To_Buy 10127 non-null float64
15 Total_Amt_Chng_Q4_Q1 10127 non-null float64
16 Total_Trans_Amt 10127 non-null int64
17 Total_Trans_Ct 10127 non-null int64
18 Total_Ct_Chng_Q4_Q1 10127 non-null float64
19 Avg_Utilization_Ratio 10127 non-null float64
dtypes: float64(5), int64(9), object(6)
memory usage: 1.5+ MB
# Quick missing-value check (all columns are fully populated per df.info()).
sns.heatmap(df.isna())
df.head(1)
# Target feature is 'Attrition_Flag': 'Attrited Customer' means churn (1),
# 'Existing Customer' means no churn (0).
df['Attrition_Flag'] = df['Attrition_Flag'].map({'Attrited Customer': 1, 'Existing Customer': 0})
df1 = df.copy()
df1.dtypes
# Explicit .copy() so the later column assignment on df_cat operates on an
# independent frame and cannot trigger pandas' SettingWithCopyWarning.
df_num = df1.select_dtypes(include=['int64', 'float64']).copy()
df_cat = df1.select_dtypes(exclude=['int64', 'float64']).copy()
# Drop the target (column 0) from the numeric feature frame.
df_num = df_num.iloc[:, 1:15]
df_num.shape
df_cat.sample()
df_num.sample()
df2 = df1.copy()
df2.head()
plt.figure(figsize=(10, 5))
g = sns.countplot(x='Attrition_Flag', data=df2)
get_label(g)
plt.title('Count values to target variable')
df_num.hist(bins=30, figsize=(15, 15))
df_cat.sample()
# Shorten the income label; '< $40K' is used as the single canonical spelling
# for this relabelling throughout the notebook.
df_cat['Income_Category'] = df_cat['Income_Category'].apply(lambda x: '< $40K' if x == 'Less than $40K' else x)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
# Distribution of each categorical feature.
plt.figure(figsize=(15, 7))
g = sns.countplot(x='Gender', data=df_cat)
get_label(g)
plt.figure(figsize=(15, 7))
g = sns.countplot(x='Education_Level', data=df_cat)
get_label(g)
plt.figure(figsize=(15, 7))
g = sns.countplot(x='Marital_Status', data=df_cat)
get_label(g)
plt.figure(figsize=(15, 5))
g = sns.countplot(x='Income_Category', data=df_cat)
get_label(g)
# Single figure here: the original created two figures back-to-back and
# left the first one blank.
plt.figure(figsize=(15, 5))
plt.title('Number of Customers Inactive - Months_Inactive_12_mon')
g = sns.countplot(x='Months_Inactive_12_mon', data=df2)
get_label(g)
plt.figure(figsize=[15, 5])
sns.boxplot(x=df_num['Customer_Age'], y=df_cat['Gender'])
plt.xlabel('Age')
df2.sample()
# Pairwise Pearson correlations over the numeric columns only. Selecting
# numeric dtypes explicitly keeps this working on modern pandas, where
# DataFrame.corr no longer silently drops object columns.
correlations = df2.select_dtypes(include=[np.number]).corr(method='pearson')
f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(correlations, annot=True)
df2.sample()
# Target feature is 'Attrition_Flag': 1 = 'Attrited Customer' (churn),
# 0 = 'Existing Customer' (no churn). Compare churn vs. non-churn across
# the most interesting features.
plt.figure(figsize=[20, 7])
sns.countplot(x='Customer_Age', hue='Attrition_Flag', data=df2)
plt.xlabel('Age')
plt.figure(figsize=[20, 7])
ax = sns.countplot(x='Months_Inactive_12_mon', hue='Attrition_Flag', data=df2)
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
plt.xlabel('Months inactive')
# Same canonical shortened label ('< $40K') as the df_cat relabelling above,
# so the income plots agree instead of using two different spellings.
df2['Income_Category'] = df2['Income_Category'].apply(lambda x: '< $40K' if x == 'Less than $40K' else x)
plt.figure(figsize=[20, 7])
ax = sns.countplot(x='Income_Category', hue='Attrition_Flag', data=df2)
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
plt.xlabel('FAIXA SALARIAL')
df2.sample()
plt.figure(figsize=[20, 7])
ax = sns.countplot(x='Contacts_Count_12_mon', hue='Attrition_Flag', data=df2)
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
plt.xlabel('Contacts')
plt.figure(figsize=(15, 7))
g = sns.countplot(x='Card_Category', hue=df2['Attrition_Flag'], data=df_cat)
get_label(g)
df_cat.sample()
# One-hot encode the categorical features and join them with the numeric ones.
df_cat_dummies = pd.get_dummies(df_cat)
df_cat_dummies.head()
df3 = pd.concat([df_num, df_cat_dummies], axis=1)
X = df3.copy()
y = df2['Attrition_Flag']
# random_state makes the split reproducible; stratify keeps the churn ratio
# identical in train and test (the target is imbalanced — see the countplot
# of Attrition_Flag above).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)
# max_iter raised: the lbfgs default (100) does not converge on this
# unscaled data (ConvergenceWarning in the original run).
modelLR = LogisticRegression(max_iter=1000)
modelLR.fit(X_train, y_train)
predictLR = modelLR.predict(X_test)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:818: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,
# sklearn metric functions take (y_true, y_pred) — the original passed them
# reversed, which swaps per-class precision and recall in the report.
print('Logistic Regression: \n', classification_report(y_test, predictLR))
print('Logistic Regression Accuracy: ', accuracy_score(y_test, predictLR))
Logistic Regression:
precision recall f1-score support
0 0.97 0.89 0.93 2765
1 0.40 0.70 0.51 274
accuracy 0.88 3039
macro avg 0.68 0.80 0.72 3039
weighted avg 0.92 0.88 0.89 3039
Logistic Regression Accuracy: 0.876604146100691
# Conventional (y_true, y_pred) order; MAE/MSE are symmetric so the values
# are unchanged, but the consistent order avoids confusion.
print('MAE:', mean_absolute_error(y_test, predictLR))
print('MSE:', mean_squared_error(y_test, predictLR))
print('RMSE:', np.sqrt(mean_squared_error(y_test, predictLR)))
MAE: 0.12339585389930899
MSE: 0.12339585389930899
RMSE: 0.3512774599932495
# Confusion matrix (rows: actual, cols: predicted) for logistic regression.
cm = confusion_matrix(y_test, predictLR)
sns.heatmap(cm, annot=True)
# Random forest with default hyper-parameters.
modelRF = RandomForestClassifier()
modelRF.fit(X_train, y_train)
predictRF = modelRF.predict(X_test)
# (y_true, y_pred) order — the reversed arguments in the original swap
# per-class precision and recall in the report.
print('Random Forest Classifier : \n', classification_report(y_test, predictRF))
print('Random Forest Classifier Accuracy: ', accuracy_score(y_test, predictRF))
Random Forest Classifier :
precision recall f1-score support
0 0.99 0.96 0.97 2633
1 0.77 0.93 0.84 406
accuracy 0.95 3039
macro avg 0.88 0.94 0.91 3039
weighted avg 0.96 0.95 0.96 3039
Random Forest Classifier Accuracy: 0.9536031589338598
# Conventional (y_true, y_pred) order; MAE/MSE are symmetric so the values
# are unchanged, but the consistent order avoids confusion.
print('MAE:', mean_absolute_error(y_test, predictRF))
print('MSE:', mean_squared_error(y_test, predictRF))
print('RMSE:', np.sqrt(mean_squared_error(y_test, predictRF)))
MAE: 0.04639684106614018
MSE: 0.04639684106614018
RMSE: 0.21539925966943382
# Confusion matrix (rows: actual, cols: predicted) for the random forest.
cm = confusion_matrix(y_test, predictRF)
sns.heatmap(cm, annot=True)
# Gradient boosting with default hyper-parameters.
modelGB = GradientBoostingClassifier()
modelGB.fit(X_train, y_train)
predictGB = modelGB.predict(X_test)
# (y_true, y_pred) order — the reversed arguments in the original swap
# per-class precision and recall in the report.
print('Gradient Boost Classifier : \n', classification_report(y_test, predictGB))
print('Gradient Boost Classifier Accuracy: ', accuracy_score(y_test, predictGB))
Gradient Boost Classifier :
precision recall f1-score support
0 0.99 0.97 0.98 2618
1 0.82 0.95 0.88 421
accuracy 0.96 3039
macro avg 0.91 0.96 0.93 3039
weighted avg 0.97 0.96 0.96 3039
Gradient Boost Classifier Accuracy: 0.9638038828562027
# Conventional (y_true, y_pred) order; MAE/MSE are symmetric so the values
# are unchanged, but the consistent order avoids confusion.
print('MAE:', mean_absolute_error(y_test, predictGB))
print('MSE:', mean_squared_error(y_test, predictGB))
print('RMSE:', np.sqrt(mean_squared_error(y_test, predictGB)))
MAE: 0.0361961171437973
MSE: 0.0361961171437973
RMSE: 0.19025277171120872
# Confusion matrix (rows: actual, cols: predicted) for gradient boosting.
cm = confusion_matrix(y_test, predictGB)
sns.heatmap(cm, annot=True)
# FEATURE IMPORTANCES
# Top-10 most important features according to the full-data gradient
# boosting model. (nlargest already selects the top values, so no prior
# sort is needed.)
plt.figure(figsize=(15, 7))
importances = pd.Series(modelGB.feature_importances_, index=X_train.columns)
importances.nlargest(10).sort_values(ascending=True).plot(kind='barh')
plt.show()
# Balance the classes by randomly undersampling the majority class
# (far more existing customers than churners).
x_under = df3.copy()
y_under = df2['Attrition_Flag']
from imblearn.under_sampling import RandomUnderSampler
# random_state makes the undersampling reproducible across runs.
under_sampler = RandomUnderSampler(random_state=1)
x_under, y_under = under_sampler.fit_resample(x_under, y_under)
plt.figure(figsize=(15, 7))
g = sns.countplot(x=y_under)
get_label(g)
X_under_train, X_under_test, y_under_train, y_under_test = train_test_split(
    x_under, y_under, random_state=1, stratify=y_under)
# max_iter raised so lbfgs converges on this unscaled data
# (ConvergenceWarning in the original run).
lr = LogisticRegression(max_iter=1000)
lr.fit(X_under_train, y_under_train)
y_pred = lr.predict(X_under_test)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:818: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,
print(classification_report(y_under_test, y_pred))
precision recall f1-score support
0 0.81 0.85 0.83 407
1 0.84 0.80 0.82 407
accuracy 0.83 814
macro avg 0.83 0.83 0.83 814
weighted avg 0.83 0.83 0.83 814
# Gradient boosting trained on the balanced (undersampled) split.
gb = GradientBoostingClassifier()
gb.fit(X_under_train, y_under_train)
gb_pred = gb.predict(X_under_test)
# Per-class precision/recall/F1 on the held-out balanced test set.
print(classification_report(y_under_test, gb_pred))
precision recall f1-score support
0 0.97 0.94 0.95 407
1 0.94 0.97 0.95 407
accuracy 0.95 814
macro avg 0.95 0.95 0.95 814
weighted avg 0.95 0.95 0.95 814
# get important features (undersampled model)
plt.figure(figsize=(15, 7))
# BUG FIX: this section evaluates the undersampled model `gb`, but the
# original read feature_importances_ from `modelGB` (the model trained on
# the full, imbalanced data). Use the matching model.
feature_list = pd.Series(gb.feature_importances_, index=X_under_train.columns).sort_values(ascending=False)
feature_list.nlargest(10).sort_values(ascending=True).plot(kind='barh')
plt.show()