import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, auc
# Load the loan dataset and run a few quick sanity checks on it.
df = pd.read_csv('loan_data.csv')
df.head(5)

# Column dtypes laid out as a single wide row for easy scanning.
df.dtypes.to_frame().transpose()

# Total number of missing cells across the whole frame.
df.isna().sum().sum()

# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

# (rows, columns)
df.shape
# Split the columns into a categorical and a numerical group.  Object-dtype
# columns are categorical; `loan_status` is added to the categorical group so
# that it is excluded from `num_cols`.
# The list is built in column order (instead of through a set union) so the
# result is identical on every run.
cat_cols = [col for col in df.columns if df[col].dtype == "O"]
if "loan_status" not in cat_cols:
    cat_cols.append("loan_status")
num_cols = [col for col in df.columns if col not in cat_cols]

# Three panels per row; ceiling division gives the number of rows needed.
nNum = len(num_cols)
nRow = max(1, -(-nNum // 3))

# squeeze=False keeps `ax` two-dimensional even for a single-row grid, so the
# ax[row, column] indexing below works for any number of features.
fig, ax = plt.subplots(nrows=nRow, ncols=3, figsize=(12, 1.5 * nRow), squeeze=False)
for i, col in enumerate(num_cols):
    cur_ax = ax[i // 3, i % 3]
    sns.boxplot(data=df, x=col, ax=cur_ax, orient="h")

# Hide the panels of the last row that received no feature.
for unused_ax in ax.flat[nNum:]:
    unused_ax.set_visible(False)

fig.suptitle('Boxplots of Numerical Features: Raw Data', fontsize=12)
plt.tight_layout()
plt.show()
# Drop every row that has an outlier in at least one of these columns.  A value
# is an outlier when it lies more than 1.75 * IQR outside the quartiles.
outlier_cols = ['person_age', 'person_income', 'person_emp_exp', 'cb_person_cred_hist_length', 'loan_percent_income', 'loan_amnt']
Q1 = df[outlier_cols].quantile(0.25)
Q3 = df[outlier_cols].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.75 * IQR
upper_fence = Q3 + 1.75 * IQR
outlier_mask = ((df[outlier_cols] < lower_fence) | (df[outlier_cols] > upper_fence)).any(axis=1)

# remove outliers
# .copy() makes the filtered frame independent of the original, so adding
# columns to `df` later does not raise pandas' SettingWithCopyWarning.
df = df[~outlier_mask].copy()

# Re-draw the boxplots on the filtered data (same grid size as the raw plot).
# squeeze=False keeps `ax` two-dimensional even for a single-row grid.
fig, ax = plt.subplots(nrows=nRow, ncols=3, figsize=(14, 1.5 * nRow), squeeze=False)
for i, col in enumerate(num_cols):
    cur_ax = ax[i // 3, i % 3]
    sns.boxplot(data=df, x=col, ax=cur_ax, orient="h")

# Hide the panels of the last row that received no feature.
for unused_ax in ax.flat[len(num_cols):]:
    unused_ax.set_visible(False)

fig.suptitle('Boxplots of Numerical Features After Removing Outliers', fontsize=12)
plt.tight_layout()
plt.show()
# Histogram of every numerical feature, three panels per row.
nNum = len(num_cols)
nRow = max(1, -(-nNum // 3))  # ceiling division, at least one row

# squeeze=False keeps `ax` two-dimensional even for a single-row grid, so the
# ax[row, column] indexing below works for any number of features.
fig, ax = plt.subplots(nrows=nRow, ncols=3, figsize=(14, 2 * nRow), squeeze=False)
for i, col in enumerate(num_cols):
    cur_ax = ax[i // 3, i % 3]
    sns.histplot(data=df, x=col, ax=cur_ax)

# Hide the panels of the last row that received no feature.
for unused_ax in ax.flat[nNum:]:
    unused_ax.set_visible(False)

fig.suptitle('Histogram of Numerical Features', fontsize=12)
plt.tight_layout()
plt.show()
# Categorical columns: every object-dtype column plus `loan_status`.
# Built as an ordered list (instead of through a set union) so the column
# order, and therefore the panel order of the plots below, is the same on
# every run.
cat_cols = [col for col in df.columns if df[col].dtype == "O"]
if "loan_status" not in cat_cols:
    cat_cols.append("loan_status")
def autopct_func(pct, allvals):
    """Format a pie-wedge label as a percentage plus the absolute count.

    Args:
        pct: Share of the wedge, in percent.
        allvals: The values of all wedges; their sum is the total count.

    Returns:
        A two-line string: the percentage with one decimal, then the
        rounded absolute count in parentheses.
    """
    total = np.sum(allvals)
    # Convert the percentage back into a count and round to a whole number.
    absolute = int(np.round(pct / 100. * total))
    return f"{pct:.1f}%\n({absolute:d})"
# One pie chart per categorical column, three panels per row.
nCat = len(cat_cols)
nRow = max(1, -(-nCat // 3))  # ceiling division, at least one row

# squeeze=False keeps `ax` two-dimensional even for a single-row grid, so the
# ax[row, column] indexing below works for any number of columns.
fig, ax = plt.subplots(nrows=nRow, ncols=3, figsize=(14, 6.5 * nRow), squeeze=False)
for i, col in enumerate(cat_cols):
    cur_ax = ax[i // 3, i % 3]
    s = df[col].value_counts()
    total = s.sum()  # computed once instead of once per legend label
    # The lambda is called by pie() right away, so it sees this iteration's `s`.
    wedges, texts, autotexts = cur_ax.pie(s, labels=s.index, autopct=lambda pct: autopct_func(pct, s))
    # Legend below the pie: category name and its share with two decimals.
    cur_ax.legend(labels=['{} - {:.2f}%'.format(index, value / total * 100) for index, value in zip(s.index, s)],
                  loc='upper center', bbox_to_anchor=(0.5, 0), fancybox=True, shadow=True)
    cur_ax.set_title("Frequency of " + col)

# Hide the panels of the last row that received no column.
for unused_ax in ax.flat[nCat:]:
    unused_ax.set_visible(False)
# Bar plot of the mean `loan_status` for each level of every categorical
# feature.  `loan_status` itself is the plotted value, so drop it from the
# list of features first.
if 'loan_status' in cat_cols:
    cat_cols.remove('loan_status')

nCat = len(cat_cols)
nRow = max(1, -(-nCat // 3))  # ceiling division, at least one row

# squeeze=False keeps `ax` two-dimensional even for a single-row grid, so the
# ax[row, column] indexing below works for any number of features.
fig, ax = plt.subplots(nrows=nRow, ncols=3, figsize=(14, 4 * nRow), squeeze=False)
for i, col in enumerate(cat_cols):
    cur_ax = ax[i // 3, i % 3]
    # hue=col gives every bar its own colour from the palette.
    sns.barplot(x=col, hue=col, y='loan_status', data=df, palette="pastel", ax=cur_ax)
    cur_ax.set_title("Mean Application per " + col, fontsize=12)
    cur_ax.tick_params(axis='x', labelrotation=45)

# Hide the panels of the last row that received no feature.
for unused_ax in ax.flat[nCat:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()
# Count the rows for each (previous_loan_defaults_on_file, loan_status) pair.
df.groupby(['previous_loan_defaults_on_file','loan_status']).size()
# Density histogram of every numerical feature, split by `loan_status`.
nNum = len(num_cols)
nRow = max(1, -(-nNum // 3))  # ceiling division, at least one row

# squeeze=False keeps `ax` two-dimensional even for a single-row grid, so the
# ax[row, column] indexing below works for any number of features.
fig, ax = plt.subplots(nrows=nRow, ncols=3, figsize=(14, 3.5 * nRow), squeeze=False)
for i, col in enumerate(num_cols):
    cur_ax = ax[i // 3, i % 3]
    # Only the income panel is drawn on a log scale.
    log_scale = col == 'person_income'
    # common_norm=False normalises each loan_status group separately.
    sns.histplot(data=df, x=col, ax=cur_ax, kde=False, common_norm=False, stat="density",
                 hue='loan_status', element='bars', log_scale=log_scale)
    cur_ax.set_title("Distribution of " + col)

# Hide the panels of the last row that received no feature.
for unused_ax in ax.flat[nNum:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()
# Add the natural log of income as a new feature.  assign() returns a new
# frame, which avoids pandas' SettingWithCopyWarning when `df` is a filtered
# slice of the loaded data.
# NOTE(review): assumes person_income is strictly positive - np.log yields
# -inf / NaN otherwise; confirm against the data.
df = df.assign(log_income=np.log(df['person_income']))

# Pearson correlation between the numerical features, log income and the target.
df[num_cols + ['log_income', 'loan_status']].corr(method='pearson')
# Hold out 20% of the rows for testing; every column except the target is a
# candidate feature.
target = 'loan_status'
features = df.columns[df.columns != target].tolist()

# Stratify on the target so both splits keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    stratify=df[target],
    random_state=42,
)
X_train.columns
# Model 1: logistic regression on standardized numerical features plus
# one-hot encoded categorical features.
model1_numerical_cols = ['person_age', 'person_income', 'person_emp_exp', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length','credit_score']
model1_cat_cols = ['person_gender', 'person_education', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file']

# ColumnTransformer for preprocessing
model1_preprocessor = ColumnTransformer(
    transformers=[
        # Apply StandardScaler to the continuous features
        ('scaler', StandardScaler(), model1_numerical_cols),
        # Apply OneHotEncoder to the categorical features.  `sparse_output`
        # replaces the `sparse` argument, which scikit-learn deprecated in 1.2
        # and removed in 1.4.
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), model1_cat_cols),
    ]
)

# Full pipeline with Logistic Regression
model1_pipeline = Pipeline(steps=[
    ('preprocessor', model1_preprocessor),
    ('classifier', LogisticRegression(
        C=10, penalty='l2', solver='lbfgs', random_state=42
    )),
])

# fit the pipeline
model1_pipeline.fit(X_train, y_train)
model1_y_test_pred = model1_pipeline.predict(X_test)
# Probability of the positive class (second column of predict_proba).
model1_y_test_proba = model1_pipeline.predict_proba(X_test)[:, 1]

# Evaluate the model.  Both AUC metrics rank the samples by score, so they are
# computed from the predicted probabilities; hard 0/1 labels would reduce each
# curve to a single operating point.
model1_roc_auc = roc_auc_score(y_test, model1_y_test_proba)
precision, recall, _ = precision_recall_curve(y_test, model1_y_test_proba)
model1_pr_auc = auc(recall, precision)
print(f"ROC-AUC: {model1_roc_auc:.2f}")
print(f"PR AUC: {model1_pr_auc:.2f}")
# Model 2: logistic regression where some numerical features are binned into
# quartiles and one-hot encoded, and log income replaces raw income.
model2_num_bucket_cols = ['person_age', 'credit_score', 'person_emp_exp', 'loan_amnt', 'cb_person_cred_hist_length', ]
model2_num_cols = ['loan_int_rate', 'loan_percent_income', 'log_income']
model2_cat_cols = ['person_gender', 'person_education', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file']

# Nested pipeline for the binned features:
# StandardScaler -> KBinsDiscretizer -> OneHotEncoder
# `sparse_output` replaces the `sparse` argument of OneHotEncoder, which
# scikit-learn deprecated in 1.2 and removed in 1.4.
model2_binned_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),  # Standardize the feature
    ('binning', KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='quantile')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False)),  # One-hot encode with drop first
])

# ColumnTransformer for preprocessing
model2_preprocessor = ColumnTransformer(
    transformers=[
        ('binned', model2_binned_pipeline, model2_num_bucket_cols),
        ('scaler', StandardScaler(), model2_num_cols),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), model2_cat_cols),
    ]
)

# Full pipeline with Logistic Regression
model2_pipeline = Pipeline(steps=[
    ('preprocessor', model2_preprocessor),
    ('classifier', LogisticRegression(
        C=10, penalty='l2', solver='lbfgs', random_state=42
    )),
])

# fit the pipeline
model2_pipeline.fit(X_train, y_train)
model2_y_test_pred = model2_pipeline.predict(X_test)
# Probability of the positive class (second column of predict_proba).
model2_y_test_proba = model2_pipeline.predict_proba(X_test)[:, 1]

# Evaluate the model.  Both AUC metrics rank the samples by score, so they are
# computed from the predicted probabilities; hard 0/1 labels would reduce each
# curve to a single operating point.
model2_roc_auc = roc_auc_score(y_test, model2_y_test_proba)
precision, recall, _ = precision_recall_curve(y_test, model2_y_test_proba)
model2_pr_auc = auc(recall, precision)
print(f"ROC-AUC: {model2_roc_auc:.2f}")
print(f"PR AUC: {model2_pr_auc:.2f}")
# Refit model 2 on the complete dataset, then list its coefficients next to
# the names of the transformed features.
model2_pipeline.fit(df[features], df[target])

# Fitted pipeline steps
fitted_steps = model2_pipeline.named_steps
classifier = fitted_steps['classifier']
preprocessor = fitted_steps['preprocessor']

# Output feature names of each transformer, in the order in which the
# ColumnTransformer was declared: binned, scaled, one-hot.
fitted_transformers = preprocessor.named_transformers_
binned_encoder = fitted_transformers['binned'].named_steps['onehot']
binned_feature_names = binned_encoder.get_feature_names_out(model2_num_bucket_cols)
num_feature_names = fitted_transformers['scaler'].get_feature_names_out(model2_num_cols)
categorical_feature_names = fitted_transformers['onehot'].get_feature_names_out(model2_cat_cols)
feature_names = np.concatenate(
    (binned_feature_names, num_feature_names, categorical_feature_names)
)

# coef_ holds one row per fitted decision function; row 0 is the only one
# for a binary classifier.
coefficients = classifier.coef_[0]

# Table of features and coefficients, largest coefficient first.
coefficients_df = pd.DataFrame(
    {
        'Feature': feature_names,
        'Coefficient': coefficients,
    }
)
display(coefficients_df.sort_values(by='Coefficient', ascending=False))