import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, log_loss
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, mean_squared_error, mean_absolute_error
sns.set(style='darkgrid')
def get_label(g):
    """Write the rounded height of every bar inside the bar itself.

    Parameters
    ----------
    g : object exposing ``patches`` and ``text``
        In this file it is the value returned by ``sns.countplot``. Each
        entry of ``g.patches`` must provide ``get_x()``, ``get_width()``
        and ``get_height()``.

    Returns
    -------
    None
        The labels are drawn on ``g`` as a side effect.
    """
    for p in g.patches:
        height = p.get_height()
        # round() raises ValueError on NaN and OverflowError on inf, so a
        # patch without a finite height (e.g. an empty bar) is left unlabeled.
        if not np.isfinite(height):
            continue
        g.text(
            p.get_x() + p.get_width() / 2.,  # horizontal centre of the bar
            height / 2,                      # halfway up the bar
            '{}'.format(round(height)),
            ha="center",
            color='black',
        )
# ---------------------------------------------------------------------------
# Load the raw data and take a first look at it.
# The bare expressions below (sample/shape/describe/head/dtypes) only render
# output in an interactive session; in a plain script they are evaluated and
# discarded.
# ---------------------------------------------------------------------------
df = pd.read_csv('/work/BankChurners.csv')
df.sample(5)
df.shape
df.describe()

# Drop the client identifier and the two Naive_Bayes_Classifier_* columns.
df = df.drop(
    columns=[
        'CLIENTNUM',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    ]
)
df.info()

# Visual check for missing values: one heatmap cell per (row, column).
sns.heatmap(df.isna())
df.head(1)

# Encode the target: 'Attrited Customer' is a churn (1) and
# 'Existing Customer' is no churn (0). Any other value maps to NaN.
df['Attrition_Flag'] = df['Attrition_Flag'].map(
    {
        'Attrited Customer': 1,
        'Existing Customer': 0,
    }
)

df1 = df.copy()
df1.dtypes

# Split the columns by dtype. Of the int64/float64 columns only positions
# 1..14 are kept; position 0 is presumably the encoded Attrition_Flag target
# (verify against the CSV column order).
df_cat = df1.select_dtypes(exclude=['int64', 'float64'])
df_num = df1.select_dtypes(include=['int64', 'float64']).iloc[:, 1:15]
df_num.shape
df_cat.sample()
df_num.sample()

# Working copy used by the plots and as the source of the target column.
df2 = df1.copy()
df2.head()
# ---------------------------------------------------------------------------
# Univariate exploration: target balance, numeric histograms, categorical
# counts, inactivity counts and age by gender.
# ---------------------------------------------------------------------------

# Count of each target class.
plt.figure(figsize=(10, 5))
g = sns.countplot(x='Attrition_Flag', data=df2)
get_label(g)
plt.title('Count values to target variable')

# One histogram per numeric feature.
df_num.hist(bins=30, figsize=(15, 15))

df_cat.sample()

# Shorten the lowest income bracket label; every other value is kept as is.
df_cat['Income_Category'] = df_cat['Income_Category'].apply(
    lambda x: '< $40k' if x == 'Less than $40K' else x
)

# One labelled count plot per categorical column, each on its own figure.
for column, figsize in (
    ('Gender', (15, 7)),
    ('Education_Level', (15, 7)),
    ('Marital_Status', (15, 7)),
    ('Income_Category', (15, 5)),
):
    plt.figure(figsize=figsize)
    g = sns.countplot(x=column, data=df_cat)
    get_label(g)

# Customers per number of inactive months. (A second, never-used
# plt.figure call that only produced an empty figure was removed here.)
plt.figure(figsize=(15, 5))
plt.title('Number of Customers Inactive - Months_Inactive_12_mon')
g = sns.countplot(x='Months_Inactive_12_mon', data=df2)
get_label(g)

# Age distribution per gender.
plt.figure(figsize=[15, 5])
sns.boxplot(x=df_num['Customer_Age'], y=df_cat['Gender'])
plt.xlabel('Age')

df2.sample()
# ---------------------------------------------------------------------------
# Correlations and bivariate plots against the target.
# Target feature is 'Attrition_Flag' in which 'Attrited Customer' means it's
# a churn (1) and 'Existing Customer' means there is no churn (0).
# ---------------------------------------------------------------------------


def _annotate_bar_heights(axes):
    """Write each bar's height near its left edge, slightly above its top."""
    for p in axes.patches:
        axes.annotate(str(p.get_height()), (p.get_x() * 1.005, p.get_height() * 1.005))


# Pearson correlation over the numeric columns only: df2 still holds the
# string-valued categorical columns, and DataFrame.corr raises on those in
# pandas >= 2.0 (older versions silently ignored them).
correlations = df2.select_dtypes(include='number').corr(method='pearson')
f, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(correlations, annot=True)
df2.sample()

# Churn by customer age.
plt.figure(figsize=[20, 7])
sns.countplot(x='Customer_Age', hue='Attrition_Flag', data=df2)
plt.xlabel('Age')

# Churn by number of inactive months.
plt.figure(figsize=[20, 7])
ax = sns.countplot(x='Months_Inactive_12_mon', hue='Attrition_Flag', data=df2)
_annotate_bar_heights(ax)
plt.xlabel('Months inactive')

# Churn by income bracket (lowest bracket label shortened first).
df2['Income_Category'] = df2['Income_Category'].apply(lambda x: '< 40K' if x == 'Less than $40K' else x)
plt.figure(figsize=[20, 7])
ax = sns.countplot(x='Income_Category', hue='Attrition_Flag', data=df2)
_annotate_bar_heights(ax)
plt.xlabel('FAIXA SALARIAL')  # axis label is Portuguese for "salary range"
df2.sample()

# Churn by number of contacts.
plt.figure(figsize=[20, 7])
ax = sns.countplot(x='Contacts_Count_12_mon', hue='Attrition_Flag', data=df2)
_annotate_bar_heights(ax)
plt.xlabel('Contacts')

# Churn by card category; the hue values come from df2 while the x values
# come from df_cat.
plt.figure(figsize=(15, 7))
g = sns.countplot(x='Card_Category', hue=df2['Attrition_Flag'], data=df_cat)
get_label(g)
# ---------------------------------------------------------------------------
# Feature matrix, train/test split and three baseline classifiers.
# ---------------------------------------------------------------------------


def _evaluate(report_label, accuracy_label, y_true, y_pred):
    """Print the metrics for one classifier and draw its confusion matrix.

    Parameters
    ----------
    report_label, accuracy_label : str
        Text printed in front of the classification report and the accuracy.
    y_true, y_pred : array-like
        Ground-truth labels and the labels predicted by the model.

    Returns
    -------
    The confusion matrix computed by ``confusion_matrix(y_true, y_pred)``.
    """
    # scikit-learn metrics take (y_true, y_pred). The order matters for the
    # report: with the arguments swapped, precision and recall trade places
    # and the support column counts predictions instead of true labels.
    print(report_label, classification_report(y_true, y_pred))
    print(accuracy_label, accuracy_score(y_true, y_pred))
    print('MAE:', mean_absolute_error(y_true, y_pred))
    mse = mean_squared_error(y_true, y_pred)
    print('MSE:', mse)
    print('RMSE:', np.sqrt(mse))

    # Build the confusion matrix and draw it on a figure of its own, so it
    # does not land on whichever figure happens to be current. fmt='d' shows
    # the integer counts in full instead of the default '.2g' notation.
    matrix = confusion_matrix(y_true, y_pred)
    plt.figure()
    sns.heatmap(matrix, annot=True, fmt='d')
    return matrix


df_cat.sample()

# One-hot encode the categorical columns and join them to the numeric ones.
df_cat_dummies = pd.get_dummies(df_cat)
df_cat_dummies.head()
df3 = pd.concat([df_num, df_cat_dummies], axis=1)

X = df3.copy()
y = df2['Attrition_Flag']

# Seeded so the split is repeatable, and stratified so both partitions keep
# the class ratio of y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

# Logistic regression
modelLR = LogisticRegression()
modelLR.fit(X_train, y_train)
predictLR = modelLR.predict(X_test)
cm = _evaluate('Logistic Regression: \n', 'Logistic Regression Accuracy: ', y_test, predictLR)

# Random forest
modelRF = RandomForestClassifier()
modelRF.fit(X_train, y_train)
predictRF = modelRF.predict(X_test)
cm = _evaluate('Random Forest Classifier : \n', 'Random Forest Classifier Accuracy: ', y_test, predictRF)

# Gradient boosting
modelGB = GradientBoostingClassifier()
modelGB.fit(X_train, y_train)
predictGB = modelGB.predict(X_test)
cm = _evaluate('Gradient Boost Classifier : \n', 'Gradient Boost Classifier Accuracy: ', y_test, predictGB)
# FEATURE IMPORTANCES
# Horizontal bar chart of the ten largest feature importances reported by
# modelGB, with the largest drawn last.
plt.figure(figsize=(15, 7))
feature_list = pd.Series(
    modelGB.feature_importances_,
    index=X_train.columns,
).sort_values(ascending=False)
feature_list.nlargest(10).sort_values().plot.barh()
plt.show()
# ---------------------------------------------------------------------------
# Repeat the experiment on a randomly undersampled data set.
# ---------------------------------------------------------------------------
x_under = df3.copy()
y_under = df2['Attrition_Flag']

from imblearn.under_sampling import RandomUnderSampler

# NOTE(review): resampling is applied before the train/test split, so the
# test partition is resampled as well and presumably no longer reflects the
# original class ratio - confirm this is intended.
# Seeded like the split below so the whole experiment is repeatable.
under_sampler = RandomUnderSampler(random_state=1)
x_under, y_under = under_sampler.fit_resample(x_under, y_under)

# Class counts after resampling.
plt.figure(figsize=(15, 7))
g = sns.countplot(x=y_under)
get_label(g)

X_under_train, X_under_test, y_under_train, y_under_test = train_test_split(
    x_under, y_under, random_state=1, stratify=y_under
)

# Logistic regression on the undersampled data.
lr = LogisticRegression()
lr.fit(X_under_train, y_under_train)
y_pred = lr.predict(X_under_test)
print(classification_report(y_under_test, y_pred))

# Gradient boosting on the undersampled data.
gb = GradientBoostingClassifier()
gb.fit(X_under_train, y_under_train)
gb_pred = gb.predict(X_under_test)
print(classification_report(y_under_test, gb_pred))

# Ten largest feature importances of gb, the model fitted just above on the
# undersampled data (previously this plotted modelGB, the model fitted on the
# original data, a second time).
plt.figure(figsize=(15, 7))
feature_list = pd.Series(gb.feature_importances_, index=X_under_train.columns).sort_values(ascending=False)
feature_list.nlargest(10).sort_values(ascending=True).plot(kind='barh')
plt.show()