# Two-year churn predictive model

import pandas as pd

# Absolute locations of the four raw input tables.
tb_clients_path = '/content/drive/MyDrive/Jobs/KIN/clients_table.txt'
tb_credit_score_path = '/content/drive/MyDrive/Jobs/KIN/credit_score_table.txt'
tb_products_path = '/content/drive/MyDrive/Jobs/KIN/products_table.txt'
tb_transactions_path = '/content/drive/MyDrive/Jobs/KIN/transactions_table.txt'

# Read every table with pandas' default CSV settings, in the same order as
# the paths are declared above.
df_clients, df_credit_score, df_products, df_transactions = map(
    pd.read_csv,
    (
        tb_clients_path,
        tb_credit_score_path,
        tb_products_path,
        tb_transactions_path,
    ),
)

# Quick look at the clients table (only rendered when run as notebook cells).
df_clients.head()

df_clients.shape

# Step 1: keep clients whose application_date is '2015-01-01' or later.
# The comparison is made against a string literal.
# NOTE(review): this presumably relies on the column holding ISO
# 'YYYY-MM-DD' strings, whose text order equals date order -- confirm.
df_clients_desired = df_clients[df_clients['application_date'] >= '2015-01-01']
print(df_clients_desired.head())
print(df_clients_desired.shape)

# Step 2: drop every client whose Geography is 'Italy'.
df_clients_desired = df_clients_desired[df_clients_desired['Geography'] != 'Italy']
print(df_clients_desired.head())
print(df_clients_desired.shape)

# Number of missing values per column (only rendered in a notebook).
df_clients_desired.isna().sum()

# Step 3: a row is discarded when its count of missing fields reaches 75 % of 9.
# NOTE(review): 9 is presumably the number of informative client columns -- confirm.
number_info_missing = 75/100 * 9
print(number_info_missing)

df_clients_desired = df_clients_desired[df_clients_desired.isnull().sum(axis=1) < number_info_missing]
print(df_clients_desired.head())
print(df_clients_desired.shape)

# Step 4: keep the first record of each CustomerId.
# The frame at this point is the result of boolean filtering; calling
# drop_duplicates(inplace=True) on it triggers pandas' SettingWithCopyWarning.
# Reassigning the returned frame and taking an explicit copy gives the same
# rows as an independent DataFrame that is safe to modify afterwards.
df_clients_desired = df_clients_desired.drop_duplicates(subset=['CustomerId'], keep='first').copy()
print(df_clients_desired.head())
print(df_clients_desired.shape)

# Reference date (day/month/year) used as the exit date of clients that have
# no exit_date recorded.
today_date = '30/11/2019'

# Work on an independent copy so the column assignments below do not act on
# a filtered slice of another frame (SettingWithCopyWarning).
df_clients_desired = df_clients_desired.copy()

application_date = pd.to_datetime(df_clients_desired['application_date'])

# Parse the recorded exit dates first and fill the gaps afterwards with an
# explicitly parsed timestamp. Filling the raw column with the day-first
# string before parsing would mix two date formats in one column, which
# pandas cannot parse consistently.
exit_date = pd.to_datetime(df_clients_desired['exit_date']).fillna(
    pd.to_datetime(today_date, format='%d/%m/%Y')
)

df_clients_desired['application_date'] = application_date
df_clients_desired['exit_date'] = exit_date
df_clients_desired.head()

# Keep only clients observed for at least 730 days between application and
# exit (or the reference date when no exit was recorded).
df_clients_desired = df_clients_desired[
    df_clients_desired['exit_date'] - df_clients_desired['application_date']
    >= pd.Timedelta(days=730)
]

print(df_clients_desired.head())
print(df_clients_desired.shape)

# Target: the IsActiveMember column as a single-column frame.
y = df_clients_desired['IsActiveMember'].to_frame()

# Class balance, first in percent and then as absolute counts
# (only rendered when run as notebook cells).
y['IsActiveMember'].value_counts(normalize=True) * 100

y['IsActiveMember'].value_counts()

# Base feature table: customer id plus the two features taken directly from
# the clients table.
dataset = df_clients_desired[
    ['CustomerId', 'EstimatedSalary', 'HasCrCard']
]
df_dataset = pd.DataFrame(data=dataset)

from datetime import datetime, date


def age(born, today=None):
    """Return the age in completed years of someone born on ``born``.

    Args:
        born: Birth date as a ``"YYYY-MM-DD"`` string.
        today: Optional date (anything exposing ``year``, ``month`` and
            ``day``) at which the age is measured. Defaults to the current
            system date, which is what the function always used before this
            parameter existed. Pass a fixed date to get results that do not
            change with the day the script is run.

    Returns:
        The number of full years between ``born`` and ``today`` as an int.

    Raises:
        ValueError: If ``born`` does not match the ``%Y-%m-%d`` format.
    """
    birth = datetime.strptime(born, "%Y-%m-%d").date()
    if today is None:
        today = date.today()
    # The tuple comparison is True (counts as 1) when this year's birthday
    # has not been reached yet, so one year is subtracted in that case.
    birthday_pending = (today.month, today.day) < (birth.month, birth.day)
    return today.year - birth.year - birthday_pending

# Derive each client's age from the birth_date column with age(); the values
# are aligned to df_dataset through the row index.
birth_dates = df_clients_desired['birth_date']
df_dataset['Age'] = birth_dates.map(age)

# From here on the feature table is indexed by CustomerId.
df_dataset = df_dataset.set_index('CustomerId')

# Inspection only: set_index returns a new frame and the result is not kept,
# so df_products still has its CustomerId column for the count below.
df_products.set_index('CustomerId')

# Number of rows in the products table per customer.
products_count = df_products['CustomerId'].value_counts()

# Name the column explicitly instead of renaming it after the fact. The name
# of the column produced from value_counts() depends on the pandas version
# ('CustomerId' before 2.0, 'count' from 2.0 on), so renaming 'CustomerId'
# silently did nothing on newer versions.
df_products_count = products_count.rename('NumProducts').to_frame()

# Align with the customers of the feature table. A customer without any row
# in the products table gets 0 products; a plain .loc lookup would raise
# KeyError for such a customer.
df_products_count = df_products_count.reindex(df_dataset.index, fill_value=0)

df_products_count

# Index the transactions by customer and keep only the customers present in
# the feature table.
df_transactions = df_transactions.set_index('CustomerId').loc[df_dataset.index, :]

# Sum of the Value column per customer.
value_by_customer = df_transactions.groupby('CustomerId', as_index=True)[['Value']]
df_balance = value_by_customer.sum()
df_balance

# The summed values are exposed under the feature name Balance.
df_balance = df_balance.rename(columns={'Value': 'Balance'})

def get_month(appDate):
    """Return the ``(month, year)`` pair read from a date-like object.

    Args:
        appDate: Any object exposing ``month`` and ``year`` attributes.

    Returns:
        A 2-tuple with the month first and the year second.
    """
    month_number = appDate.month
    year_number = appDate.year
    return (month_number, year_number)

# Month and year of every client's application, split into two sequences.
month_year_pairs = df_clients_desired['application_date'].apply(get_month)
application_months, application_years = zip(*month_year_pairs)

# One row per client: CustomerId (as index) with the application month/year.
df_client_app_date = pd.DataFrame(df_clients_desired['CustomerId'])
df_client_app_date['Month'] = application_months
df_client_app_date['Year'] = application_years
df_client_app_date = df_client_app_date.set_index('CustomerId')

# Index the credit scores by customer and keep only the customers present in
# the feature table.
df_credit_score = df_credit_score.set_index('CustomerId').loc[df_dataset.index, :]

# Convert the Date column to datetimes.
df_credit_score_date = pd.to_datetime(df_credit_score['Date'])
df_credit_score['Date'] = df_credit_score_date

# Store the month and the year of every score as separate columns.
score_months, score_years = zip(*df_credit_score['Date'].apply(get_month))
df_credit_score['Month'] = score_months
df_credit_score['Year'] = score_years

def getScore(row):
    """Return the credit score of one customer for a given month and year.

    Reads the module-level ``df_credit_score`` frame.

    Args:
        row: A row whose ``name`` is matched against the index of
            ``df_credit_score`` and which provides ``'Year'`` and
            ``'Month'`` entries.

    Returns:
        The first ``Score`` value among the rows of ``df_credit_score`` that
        match the row's name, year and month.

    Raises:
        IndexError: If no row matches, because the first element of an
            empty result is requested.
    """
    customer_scores = df_credit_score[df_credit_score.index == row.name]
    same_year = customer_scores['Year'] == row['Year']
    same_month = customer_scores['Month'] == row['Month']
    matching_scores = customer_scores[same_year & same_month]
    return matching_scores['Score'].values[0]

# Look up, row by row, the credit score matching each client's application
# month and year.
score = df_client_app_date.apply(getScore, axis=1)

# Turn the result into a frame and name its column 0 'Score'.
df_score = pd.DataFrame(score).rename(columns={0: 'Score'})

# Feature matrix: column-wise combination of all feature tables, keeping only
# the index labels present in every one of them.
feature_frames = [df_dataset, df_products_count, df_balance, df_score]
X = pd.concat(feature_frames, axis=1, join='inner')

X.describe()

from sklearn.preprocessing import LabelEncoder from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split from imblearn.under_sampling import RandomUnderSampler from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report from sklearn.svm import SVC from sklearn.linear_model import LogisticRegression from xgboost import XGBClassifier, plot_importance from sklearn.model_selection import cross_validate from sklearn.model_selection import GridSearchCV import matplotlib.pyplot as plt import seaborn as sns; sns.set()

# Re-select the labels by customer so that row i of y belongs to the same
# customer as row i of X. X is indexed by CustomerId and is the result of an
# inner join, whereas y was built from the clients table's own row index, so
# the two could differ in length or order whenever the join dropped a row.
y = df_clients_desired.set_index('CustomerId').loc[X.index, ['IsActiveMember']]

# Hold out 10 % of the customers for testing. The fixed random_state makes
# the split, and therefore the reports below, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, random_state=42
)

# final SVC model
svc = SVC()
# y_train is a single-column frame; flatten it to the 1-D label array the
# estimator expects (a column vector makes scikit-learn emit a
# DataConversionWarning before flattening it itself).
svc.fit(X_train, y_train.to_numpy().ravel())

# prediction
X_test_svc = X_test
y_pred_svc = svc.predict(X_test_svc)

# classification report
print(classification_report(y_test, y_pred_svc))

# confusion matrix, normalised over the true labels (each row sums to 1)
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix(y_test, y_pred_svc, normalize='true'), annot=True, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Real Value')
ax.set_xlabel('Predicted Value')
plt.show()

# final Logistic Regression model
lr = LogisticRegression(solver='newton-cg', C=0.0001)
# y_train is a single-column frame; flatten it to the 1-D label array the
# estimator expects (a column vector makes scikit-learn emit a
# DataConversionWarning before flattening it itself).
lr.fit(X_train, y_train.to_numpy().ravel())

# prediction
X_test_lr = X_test
y_pred_lr = lr.predict(X_test_lr)

# classification report
print(classification_report(y_test, y_pred_lr))

# confusion matrix, normalised over the true labels (each row sums to 1)
fig, ax = plt.subplots()
sns.heatmap(confusion_matrix(y_test, y_pred_lr, normalize='true'), annot=True, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Real Value')
ax.set_xlabel('Predicted Value')
plt.show()

# final XGBoost model
xgb = XGBClassifier(
    max_depth=8,
    learning_rate=0.08,
    objective='binary:logistic',
    n_jobs=-1,
)
xgb.fit(X_train, y_train)

# prediction
X_test_xgb = X_test
y_pred_xgb = xgb.predict(X_test_xgb)

# classification report
report_xgb = classification_report(y_test, y_pred_xgb)
print(report_xgb)

# confusion matrix, normalised over the true labels
cm_xgb = confusion_matrix(y_test, y_pred_xgb, normalize='true')
fig, ax = plt.subplots()
sns.heatmap(cm_xgb, annot=True, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Real Value')
ax.set_xlabel('Predicted Value')
plt.show()