import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, mean_squared_error, mean_absolute_error
sns.set(style='darkgrid')
def get_label(g):
    """Annotate every bar of a count/bar plot with its rounded height.

    Parameters
    ----------
    g : matplotlib Axes (e.g. the return value of seaborn's countplot)
        Each bar in ``g.patches`` gets its height written at the bar's
        horizontal centre, halfway up the bar.
    """
    for p in g.patches:
        height = p.get_height()
        # Hue-grouped countplots can yield empty patches whose height is NaN;
        # round() raises ValueError on NaN, so skip those bars.
        if not np.isfinite(height):
            continue
        g.text(p.get_x() + p.get_width() / 2.,
               height / 2,
               '{}'.format(round(height)),
               ha="center", color='black')
# Load the raw churn data and take a first look at it.
df = pd.read_csv('/work/BankChurners.csv')
df.sample(5)
df.shape
df.describe()
# DROP IRRELEVANT COLUMN: the customer id and the two pre-computed
# Naive Bayes score columns carry no predictive signal for this analysis.
irrelevant_cols = [
    'CLIENTNUM',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
df.drop(irrelevant_cols, axis=1, inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Attrition_Flag 10127 non-null object
1 Customer_Age 10127 non-null int64
2 Gender 10127 non-null object
3 Dependent_count 10127 non-null int64
4 Education_Level 10127 non-null object
5 Marital_Status 10127 non-null object
6 Income_Category 10127 non-null object
7 Card_Category 10127 non-null object
8 Months_on_book 10127 non-null int64
9 Total_Relationship_Count 10127 non-null int64
10 Months_Inactive_12_mon 10127 non-null int64
11 Contacts_Count_12_mon 10127 non-null int64
12 Credit_Limit 10127 non-null float64
13 Total_Revolving_Bal 10127 non-null int64
14 Avg_Open_To_Buy 10127 non-null float64
15 Total_Amt_Chng_Q4_Q1 10127 non-null float64
16 Total_Trans_Amt 10127 non-null int64
17 Total_Trans_Ct 10127 non-null int64
18 Total_Ct_Chng_Q4_Q1 10127 non-null float64
19 Avg_Utilization_Ratio 10127 non-null float64
dtypes: float64(5), int64(9), object(6)
memory usage: 1.5+ MB
# Quick missing-value check (all columns are fully populated per df.info()).
sns.heatmap(df.isna())
df.head(1)
# Target feature is 'Attrition_Flag': 'Attrited Customer' means churn (1),
# 'Existing Customer' means no churn (0).
df['Attrition_Flag'] = df['Attrition_Flag'].map({'Attrited Customer': 1, 'Existing Customer': 0})
df1 = df.copy()
df1.dtypes
# Explicit .copy() so the later column assignment on df_cat operates on an
# independent frame and cannot trigger pandas' SettingWithCopyWarning.
df_num = df1.select_dtypes(include=['int64', 'float64']).copy()
df_cat = df1.select_dtypes(exclude=['int64', 'float64']).copy()
# Drop the target (column 0) from the numeric feature frame.
df_num = df_num.iloc[:, 1:15]
df_num.shape
df_cat.sample()
df_num.sample()
df2 = df1.copy()
df2.head()
plt.figure(figsize=(10, 5))
g = sns.countplot(x='Attrition_Flag', data=df2)
get_label(g)
plt.title('Count values to target variable')
df_num.hist(bins=30, figsize=(15, 15))
df_cat.sample()
# Shorten the income label; '< $40K' is used as the single canonical spelling
# for this relabelling throughout the notebook.
df_cat['Income_Category'] = df_cat['Income_Category'].apply(lambda x: '< $40K' if x == 'Less than $40K' else x)
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
"""Entry point for launching an IPython kernel.
# Distribution of each categorical feature.
plt.figure(figsize=(15, 7))
g = sns.countplot(x='Gender', data=df_cat)
get_label(g)
plt.figure(figsize=(15, 7))
g = sns.countplot(x='Education_Level', data=df_cat)
get_label(g)
plt.figure(figsize=(15, 7))
g = sns.countplot(x='Marital_Status', data=df_cat)
get_label(g)
plt.figure(figsize=(15, 5))
g = sns.countplot(x='Income_Category', data=df_cat)
get_label(g)
# Single figure here: the original created two figures back-to-back and
# left the first one blank.
plt.figure(figsize=(15, 5))
plt.title('Number of Customers Inactive - Months_Inactive_12_mon')
g = sns.countplot(x='Months_Inactive_12_mon', data=df2)
get_label(g)
plt.figure(figsize=[15, 5])
sns.boxplot(x=df_num['Customer_Age'], y=df_cat['Gender'])
plt.xlabel('Age')
df2.sample()
# Pairwise Pearson correlations over the numeric columns only. Selecting
# numeric dtypes explicitly keeps this working on modern pandas, where
# DataFrame.corr no longer silently drops object columns.
correlations = df2.select_dtypes(include=[np.number]).corr(method='pearson')
f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(correlations, annot=True)
df2.sample()
# Target feature is 'Attrition_Flag': 1 = 'Attrited Customer' (churn),
# 0 = 'Existing Customer' (no churn). Compare churn vs. non-churn across
# the most interesting features.
plt.figure(figsize=[20, 7])
sns.countplot(x='Customer_Age', hue='Attrition_Flag', data=df2)
plt.xlabel('Age')
plt.figure(figsize=[20, 7])
ax = sns.countplot(x='Months_Inactive_12_mon', hue='Attrition_Flag', data=df2)
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
plt.xlabel('Months inactive')
# Same canonical shortened label ('< $40K') as the df_cat relabelling above,
# so the income plots agree instead of using two different spellings.
df2['Income_Category'] = df2['Income_Category'].apply(lambda x: '< $40K' if x == 'Less than $40K' else x)
plt.figure(figsize=[20, 7])
ax = sns.countplot(x='Income_Category', hue='Attrition_Flag', data=df2)
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
plt.xlabel('FAIXA SALARIAL')
df2.sample()
plt.figure(figsize=[20, 7])
ax = sns.countplot(x='Contacts_Count_12_mon', hue='Attrition_Flag', data=df2)
for p in ax.patches:
    ax.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))
plt.xlabel('Contacts')
plt.figure(figsize=(15, 7))
g = sns.countplot(x='Card_Category', hue=df2['Attrition_Flag'], data=df_cat)
get_label(g)
df_cat.sample()
# One-hot encode the categorical features and join them with the numeric ones.
df_cat_dummies = pd.get_dummies(df_cat)
df_cat_dummies.head()
df3 = pd.concat([df_num, df_cat_dummies], axis=1)
X = df3.copy()
y = df2['Attrition_Flag']
# random_state makes the split reproducible; stratify keeps the churn ratio
# identical in train and test (the target is imbalanced — see the countplot
# of Attrition_Flag above).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)
# max_iter raised: the lbfgs default (100) does not converge on this
# unscaled data (ConvergenceWarning in the original run).
modelLR = LogisticRegression(max_iter=1000)
modelLR.fit(X_train, y_train)
predictLR = modelLR.predict(X_test)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:818: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,
# sklearn metric functions take (y_true, y_pred) — the original passed them
# reversed, which swaps per-class precision and recall in the report.
print('Logistic Regression: \n', classification_report(y_test, predictLR))
print('Logistic Regression Accuracy: ', accuracy_score(y_test, predictLR))
Logistic Regression:
precision recall f1-score support
0 0.97 0.89 0.93 2765
1 0.40 0.70 0.51 274
accuracy 0.88 3039
macro avg 0.68 0.80 0.72 3039
weighted avg 0.92 0.88 0.89 3039
Logistic Regression Accuracy: 0.876604146100691
# Conventional (y_true, y_pred) order; MAE/MSE are symmetric so the values
# are unchanged, but the consistent order avoids confusion.
print('MAE:', mean_absolute_error(y_test, predictLR))
print('MSE:', mean_squared_error(y_test, predictLR))
print('RMSE:', np.sqrt(mean_squared_error(y_test, predictLR)))
MAE: 0.12339585389930899
MSE: 0.12339585389930899
RMSE: 0.3512774599932495
# Confusion matrix (rows: actual, cols: predicted) for logistic regression.
cm = confusion_matrix(y_test, predictLR)
sns.heatmap(cm, annot=True)
# Random forest with default hyper-parameters.
modelRF = RandomForestClassifier()
modelRF.fit(X_train, y_train)
predictRF = modelRF.predict(X_test)
# (y_true, y_pred) order — the reversed arguments in the original swap
# per-class precision and recall in the report.
print('Random Forest Classifier : \n', classification_report(y_test, predictRF))
print('Random Forest Classifier Accuracy: ', accuracy_score(y_test, predictRF))
Random Forest Classifier :
precision recall f1-score support
0 0.99 0.96 0.97 2633
1 0.77 0.93 0.84 406
accuracy 0.95 3039
macro avg 0.88 0.94 0.91 3039
weighted avg 0.96 0.95 0.96 3039
Random Forest Classifier Accuracy: 0.9536031589338598
# Conventional (y_true, y_pred) order; MAE/MSE are symmetric so the values
# are unchanged, but the consistent order avoids confusion.
print('MAE:', mean_absolute_error(y_test, predictRF))
print('MSE:', mean_squared_error(y_test, predictRF))
print('RMSE:', np.sqrt(mean_squared_error(y_test, predictRF)))
MAE: 0.04639684106614018
MSE: 0.04639684106614018
RMSE: 0.21539925966943382
# Confusion matrix (rows: actual, cols: predicted) for the random forest.
cm = confusion_matrix(y_test, predictRF)
sns.heatmap(cm, annot=True)
# Gradient boosting with default hyper-parameters.
modelGB = GradientBoostingClassifier()
modelGB.fit(X_train, y_train)
predictGB = modelGB.predict(X_test)
# (y_true, y_pred) order — the reversed arguments in the original swap
# per-class precision and recall in the report.
print('Gradient Boost Classifier : \n', classification_report(y_test, predictGB))
print('Gradient Boost Classifier Accuracy: ', accuracy_score(y_test, predictGB))
Gradient Boost Classifier :
precision recall f1-score support
0 0.99 0.97 0.98 2618
1 0.82 0.95 0.88 421
accuracy 0.96 3039
macro avg 0.91 0.96 0.93 3039
weighted avg 0.97 0.96 0.96 3039
Gradient Boost Classifier Accuracy: 0.9638038828562027
# Conventional (y_true, y_pred) order; MAE/MSE are symmetric so the values
# are unchanged, but the consistent order avoids confusion.
print('MAE:', mean_absolute_error(y_test, predictGB))
print('MSE:', mean_squared_error(y_test, predictGB))
print('RMSE:', np.sqrt(mean_squared_error(y_test, predictGB)))
MAE: 0.0361961171437973
MSE: 0.0361961171437973
RMSE: 0.19025277171120872
# Confusion matrix (rows: actual, cols: predicted) for gradient boosting.
cm = confusion_matrix(y_test, predictGB)
sns.heatmap(cm, annot=True)
# FEATURE IMPORTANCES
# Top-10 most important features according to the full-data gradient
# boosting model. (nlargest already selects the top values, so no prior
# sort is needed.)
plt.figure(figsize=(15, 7))
importances = pd.Series(modelGB.feature_importances_, index=X_train.columns)
importances.nlargest(10).sort_values(ascending=True).plot(kind='barh')
plt.show()
# Balance the classes by randomly undersampling the majority class
# (far more existing customers than churners).
x_under = df3.copy()
y_under = df2['Attrition_Flag']
from imblearn.under_sampling import RandomUnderSampler
# random_state makes the undersampling reproducible across runs.
under_sampler = RandomUnderSampler(random_state=1)
x_under, y_under = under_sampler.fit_resample(x_under, y_under)
plt.figure(figsize=(15, 7))
g = sns.countplot(x=y_under)
get_label(g)
X_under_train, X_under_test, y_under_train, y_under_test = train_test_split(
    x_under, y_under, random_state=1, stratify=y_under)
# max_iter raised so lbfgs converges on this unscaled data
# (ConvergenceWarning in the original run).
lr = LogisticRegression(max_iter=1000)
lr.fit(X_under_train, y_under_train)
y_pred = lr.predict(X_under_test)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:818: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,
print(classification_report(y_under_test, y_pred))
precision recall f1-score support
0 0.81 0.85 0.83 407
1 0.84 0.80 0.82 407
accuracy 0.83 814
macro avg 0.83 0.83 0.83 814
weighted avg 0.83 0.83 0.83 814
# Gradient boosting trained on the balanced (undersampled) split.
gb = GradientBoostingClassifier()
gb.fit(X_under_train, y_under_train)
gb_pred = gb.predict(X_under_test)
# Per-class precision/recall/F1 on the held-out balanced test set.
print(classification_report(y_under_test, gb_pred))
precision recall f1-score support
0 0.97 0.94 0.95 407
1 0.94 0.97 0.95 407
accuracy 0.95 814
macro avg 0.95 0.95 0.95 814
weighted avg 0.95 0.95 0.95 814
# get important features (undersampled model)
plt.figure(figsize=(15, 7))
# BUG FIX: this section evaluates the undersampled model `gb`, but the
# original read feature_importances_ from `modelGB` (the model trained on
# the full, imbalanced data). Use the matching model.
feature_list = pd.Series(gb.feature_importances_, index=X_under_train.columns).sort_values(ascending=False)
feature_list.nlargest(10).sort_values(ascending=True).plot(kind='barh')
plt.show()