import pandas as pd
# Raw input tables. Each path is declared right next to the frame that is
# loaded from it; the read order (clients, credit score, products,
# transactions) is unchanged.
tb_clients_path = '/content/drive/MyDrive/Jobs/KIN/clients_table.txt'
df_clients = pd.read_csv(tb_clients_path)

tb_credit_score_path = '/content/drive/MyDrive/Jobs/KIN/credit_score_table.txt'
df_credit_score = pd.read_csv(tb_credit_score_path)

tb_products_path = '/content/drive/MyDrive/Jobs/KIN/products_table.txt'
df_products = pd.read_csv(tb_products_path)

tb_transactions_path = '/content/drive/MyDrive/Jobs/KIN/transactions_table.txt'
df_transactions = pd.read_csv(tb_transactions_path)

# Quick look at the clients table. These bare expressions only render
# something when run as the last statement of a notebook cell.
df_clients.head()
df_clients.shape
# Select the clients eligible for the study. The filters run in sequence and
# the head/shape of the intermediate frame is printed after each one:
#   1. application_date on or after 2015-01-01
#   2. Geography other than 'Italy'
#   3. fewer than 75% of the columns missing
#   4. a single row per CustomerId (first occurrence kept)
#   5. at least 730 days between application_date and exit_date, where a
#      missing exit_date is replaced by the reference date 30/11/2019

# Fraction of a row's columns that must be missing for the row to be dropped.
MISSING_INFO_RATIO = 75 / 100
# Minimum time between application and exit for a client to be kept.
MINIMUM_TENURE = pd.Timedelta(days=730)

# 1. application_date is still the raw value read from the file here, so
#    this compares text.
# NOTE(review): a text comparison orders dates correctly only for ISO-8601
# (YYYY-MM-DD) strings — confirm the file uses that format.
df_clients_desired = df_clients[df_clients['application_date'] >= '2015-01-01']
print(df_clients_desired.head())
print(df_clients_desired.shape)

# 2. Drop Italian clients.
df_clients_desired = df_clients_desired[df_clients_desired['Geography'] != 'Italy']
print(df_clients_desired.head())
print(df_clients_desired.shape)

# Missing values per column (notebook display only).
df_clients_desired.isna().sum()

# 3. The threshold follows the actual number of columns instead of a
#    hard-coded column count, so it stays right if the table schema changes.
number_info_missing = MISSING_INFO_RATIO * df_clients_desired.shape[1]
print(number_info_missing)
df_clients_desired = df_clients_desired[df_clients_desired.isnull().sum(axis=1) < number_info_missing]
print(df_clients_desired.head())
print(df_clients_desired.shape)

# 4. De-duplicate by assignment rather than in place on a filtered slice, and
#    take an explicit copy so the column assignments below write to an
#    independent frame (no SettingWithCopyWarning).
df_clients_desired = df_clients_desired.drop_duplicates(subset=['CustomerId'], keep='first').copy()
print(df_clients_desired.head())
print(df_clients_desired.shape)

# Reference "today" of the study, written as day/month/year.
today_date = '30/11/2019'
reference_date = pd.to_datetime(today_date, format='%d/%m/%Y')

application_date = pd.to_datetime(df_clients_desired['application_date'])
# Parse first, fill afterwards: putting the day/month/year text into the
# column before parsing would mix two date formats in one column, which
# pd.to_datetime (single inferred format in pandas >= 2) cannot parse reliably.
exit_date = pd.to_datetime(df_clients_desired['exit_date']).fillna(reference_date)
df_clients_desired['application_date'] = application_date
df_clients_desired['exit_date'] = exit_date
df_clients_desired.head()

# 5. Keep clients whose application-to-exit span reaches the minimum tenure.
tenure = df_clients_desired['exit_date'] - df_clients_desired['application_date']
df_clients_desired = df_clients_desired[tenure >= MINIMUM_TENURE].copy()
print(df_clients_desired.head())
print(df_clients_desired.shape)
# Target: the IsActiveMember flag, kept as a one-column frame.
y = df_clients_desired['IsActiveMember'].to_frame()

# Class balance, first as percentages and then as absolute counts
# (notebook display only).
y['IsActiveMember'].value_counts(normalize=True) * 100
y['IsActiveMember'].value_counts()

# Columns taken from the clients table as they are; CustomerId is carried
# along as a regular column at this point.
base_columns = ['CustomerId', 'EstimatedSalary', 'HasCrCard']
dataset = df_clients_desired[base_columns]
df_dataset = pd.DataFrame(data=dataset)
from datetime import datetime, date
def age(born, today=None):
    """Return the age, in completed years, of someone born on *born*.

    Args:
        born: Birth date, either a ``"YYYY-MM-DD"`` string or a
            ``date``/``datetime`` object.
        today: Date at which the age is evaluated. Defaults to the current
            system date; pass a fixed date to get results that do not
            change with the day the code is run.

    Returns:
        The number of birthdays reached on or before *today*, as an int.

    Raises:
        ValueError: If *born* is a string that is not in ``YYYY-MM-DD`` form.
    """
    # datetime is a subclass of date, so it has to be checked first.
    if isinstance(born, datetime):
        born = born.date()
    elif not isinstance(born, date):
        born = datetime.strptime(born, "%Y-%m-%d").date()

    if today is None:
        today = date.today()

    # When this year's birthday is still ahead, the tuple comparison is True
    # (which counts as 1) and one year is subtracted.
    birthday_still_ahead = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - birthday_still_ahead
# Derive each client's age from the birth date, then key the feature frame
# by CustomerId so it can be joined with the other per-client frames.
client_ages = df_clients_desired['birth_date'].map(age)
df_dataset['Age'] = client_ages
df_dataset = df_dataset.set_index(keys='CustomerId')
# Number of rows in the products table per client, as a one-column frame
# named 'NumProducts' and indexed by CustomerId.
#
# The original un-assigned `df_products.set_index('CustomerId')` was a no-op
# (its result was discarded) and is dropped; CustomerId has to remain a
# column for the count below anyway.
products_count = df_products['CustomerId'].value_counts()

# Name the column and the index explicitly. The name carried by the
# value_counts result differs between pandas versions ('CustomerId' before
# 2.0, 'count' from 2.0 on), so renaming a 'CustomerId' column afterwards
# silently does nothing on recent versions.
df_products_count = (
    products_count
    .rename('NumProducts')
    .rename_axis('CustomerId')
    .to_frame()
)

# Align to the selected clients. A client that does not appear in the
# products table has zero products; reindex records that as 0 instead of
# raising KeyError the way a strict .loc lookup does.
df_products_count = df_products_count.reindex(df_dataset.index, fill_value=0)
df_products_count
# Balance per client: the sum of the 'Value' column over the client's
# transactions.
df_transactions = df_transactions.set_index('CustomerId')

# Keep only the transactions of the selected clients. A membership test is
# used rather than a strict .loc lookup, which raises KeyError as soon as one
# selected client has no transaction at all.
df_transactions = df_transactions[df_transactions.index.isin(df_dataset.index)]

df_balance = (
    df_transactions
    .groupby(level='CustomerId')[['Value']]
    .sum()
    .rename(columns={'Value': 'Balance'})
    # Clients without any transaction get a balance of 0 (an empty sum).
    .reindex(df_dataset.index, fill_value=0)
)
df_balance
def get_month(appDate):
    """Return the ``(month, year)`` pair of a date-like value.

    Args:
        appDate: Any object exposing ``month`` and ``year`` attributes, such
            as a ``datetime.date``, ``datetime.datetime`` or pandas
            ``Timestamp``.

    Returns:
        A 2-tuple ``(month, year)``. Note the order: month first.
    """
    return appDate.month, appDate.year
# Month and year of each client's application, indexed by CustomerId.
#
# application_date holds datetimes at this point (it was converted with
# pd.to_datetime earlier), so the vectorised .dt accessor is used instead of
# a per-row Python call followed by zip(*...). Besides being faster, this
# also works on an empty frame, where the zip-unpacking raises ValueError.
application_dates = df_clients_desired['application_date']
df_client_app_date = df_clients_desired[['CustomerId']].copy()
df_client_app_date['Month'] = application_dates.dt.month
df_client_app_date['Year'] = application_dates.dt.year
df_client_app_date = df_client_app_date.set_index('CustomerId')
# Credit-score history restricted to the selected clients, with the score
# date parsed and split into 'Month' and 'Year' columns.
df_credit_score = df_credit_score.set_index('CustomerId')

# The strict .loc lookup is kept on purpose: unlike a product count or a
# balance, a missing score has no neutral default, so a selected client with
# no score history at all should fail here (KeyError).
df_credit_score = df_credit_score.loc[df_dataset.index, :]

df_credit_score_date = pd.to_datetime(df_credit_score['Date'])
df_credit_score['Date'] = df_credit_score_date

# Vectorised extraction through .dt instead of a per-row Python call plus
# zip(*...), which is slower and raises ValueError on an empty frame.
df_credit_score['Month'] = df_credit_score_date.dt.month
df_credit_score['Year'] = df_credit_score_date.dt.year
def getScore(row, scores=None):
    """Return a client's credit score for the month and year given by *row*.

    Args:
        row: Series whose ``name`` is the CustomerId and which has
            ``'Month'`` and ``'Year'`` entries, i.e. the row shape passed by
            ``DataFrame.apply(getScore, axis=1)``.
        scores: Frame indexed by CustomerId with ``'Month'``, ``'Year'`` and
            ``'Score'`` columns. Defaults to the module-level
            ``df_credit_score``.

    Returns:
        The first ``'Score'`` value found for that client in that month.

    Raises:
        IndexError: If the client has no score for that month and year.
    """
    if scores is None:
        scores = df_credit_score

    # One combined mask instead of two successive filtered copies.
    wanted = (
        (scores['Year'] == row['Year'])
        & (scores['Month'] == row['Month'])
        & (scores.index == row.name)
    )
    matches = scores.loc[wanted.to_numpy(), 'Score']

    if matches.empty:
        # Same exception type as indexing an empty array, but the message
        # names the client and period that are missing.
        raise IndexError(
            f"no credit score for customer {row.name!r} "
            f"in {row['Month']}/{row['Year']}"
        )
    return matches.iloc[0]
# Credit score of every client for the month of their application, wrapped
# in a one-column frame named 'Score'.
score = df_client_app_date.apply(getScore, axis=1)
df_score = pd.DataFrame(score).rename(columns={0: 'Score'})

# Feature matrix: the per-client frames placed side by side, keeping only
# the index labels present in all of them.
feature_frames = [df_dataset, df_products_count, df_balance, df_score]
X = pd.concat(feature_frames, axis=1, join='inner')
X.describe()
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier, plot_importance
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
# Train and evaluate the three final models on a single 90/10 split.

# Fixed seed so the split, and therefore every metric below, is reproducible.
RANDOM_STATE = 42


def _report_model(model, features, target):
    """Predict with a fitted model, print its report and plot its confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        features: Feature rows to predict on.
        target: True labels for *features*.

    Returns:
        The predictions returned by ``model.predict(features)``.
    """
    predictions = model.predict(features)
    # classification report
    print(classification_report(target, predictions))
    # confusion matrix, normalised over the true labels (rows)
    fig, ax = plt.subplots()
    sns.heatmap(confusion_matrix(target, predictions, normalize='true'), annot=True, ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_ylabel('Real Value')
    ax.set_xlabel('Predicted Value')
    plt.show()
    return predictions


# X is indexed by CustomerId and is the result of an inner join, whereas y
# keeps the row labels of the clients table. Look the target up by CustomerId
# so each label is guaranteed to belong to its feature row, and pass it as a
# 1-D Series, which is the shape the estimators expect.
y_model = df_clients_desired.set_index('CustomerId').loc[X.index, 'IsActiveMember']

X_train, X_test, y_train, y_test = train_test_split(
    X, y_model, test_size=0.10, random_state=RANDOM_STATE
)

# NOTE(review): the features are fed to the models unscaled although
# StandardScaler is imported above; SVC and a strongly regularised logistic
# regression are usually sensitive to feature scale — confirm this is intended.

# final SVC model
svc = SVC()
svc.fit(X_train, y_train)
X_test_svc = X_test
y_pred_svc = _report_model(svc, X_test_svc, y_test)

# final Logistic Regression model
lr = LogisticRegression(solver='newton-cg', C=0.0001)
lr.fit(X_train, y_train)
X_test_lr = X_test
y_pred_lr = _report_model(lr, X_test_lr, y_test)

# final XGBoost model
xgb = XGBClassifier(max_depth=8, learning_rate=0.08, objective= 'binary:logistic',n_jobs=-1)
xgb.fit(X_train, y_train)
X_test_xgb = X_test
y_pred_xgb = _report_model(xgb, X_test_xgb, y_test)