Weiran Cai’s project of Loan Application Approval Classification

import pandas as pd import matplotlib.pyplot as plt import numpy as np import seaborn as sns from sklearn.compose import ColumnTransformer from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, StandardScaler from sklearn.pipeline import Pipeline from sklearn.linear_model import LogisticRegression, LinearRegression from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve, auc

# Load the loan application dataset from the working directory.
df = pd.read_csv(
    'loan_data.csv'
)

# Preview the first rows (rendered when run as a notebook cell).
df.head()

# Column dtypes, laid out as a single wide row for easy scanning.
df.dtypes.to_frame().T

# Total number of missing cells across the whole frame.
df.isna().sum().sum()

# Number of fully duplicated rows.
df.duplicated().sum()

# (rows, columns) of the raw data.
df.shape

# Split the columns into categorical and numerical groups.
# Object-dtype columns are categorical; the target `loan_status` is added to
# the categorical group explicitly, whatever its dtype.
# The list is built in df.columns order rather than through a set union:
# set iteration order for strings depends on hash randomization, which made
# the order of the plot panels driven by `cat_cols` vary between runs.
cat_cols = [
    col for col in df.columns
    if df[col].dtype == "O" or col == "loan_status"
]
num_cols = [col for col in df.columns if col not in cat_cols]

# Boxplots of every numerical feature on the raw data, three panels per row.
nNum = len(num_cols)
nRow = (nNum + 2) // 3  # ceil(nNum / 3) without importing math

# squeeze=False keeps `ax` two-dimensional even when the grid has a single
# row; with the default, ax[i // 3, i % 3] raises IndexError for <= 3 features.
fig, ax = plt.subplots(
    nrows=nRow, ncols=3, figsize=(12, 1.5 * nRow), squeeze=False
)
for i, col in enumerate(num_cols):
    cur_ax = ax[i // 3, i % 3]
    sns.boxplot(data=df, x=col, ax=cur_ax, orient="h")

# Hide trailing panels of the last row that have no feature assigned.
for unused_ax in ax.flat[nNum:]:
    unused_ax.set_visible(False)

fig.suptitle('Boxplots of Numerical Features: Raw Data', fontsize=12)
plt.tight_layout()
plt.show()

# Drop rows that are outliers in any of the selected columns, using an
# IQR fence of 1.75 * IQR beyond the first and third quartiles.
outlier_cols = [
    'person_age',
    'person_income',
    'person_emp_exp',
    'cb_person_cred_hist_length',
    'loan_percent_income',
    'loan_amnt',
]
Q1 = df[outlier_cols].quantile(0.25)
Q3 = df[outlier_cols].quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.75 * IQR
upper_fence = Q3 + 1.75 * IQR

# A row is flagged when at least one of its checked columns is outside the fence.
outlier_mask = (
    (df[outlier_cols] < lower_fence) | (df[outlier_cols] > upper_fence)
).any(axis=1)

# remove outliers
# .copy() makes the filtered frame independent of the original, so adding
# columns to `df` later does not raise pandas' SettingWithCopyWarning.
df = df[~outlier_mask].copy()

# Boxplots of the numerical features after outlier removal, three per row.
# The grid size is recomputed here instead of relying on `nRow` left over
# from an earlier cell, so this cell does not depend on execution order.
nNum = len(num_cols)
nRow = (nNum + 2) // 3  # ceil(nNum / 3)

# squeeze=False keeps `ax` two-dimensional even for a single-row grid, so
# the ax[row, col] indexing below cannot raise IndexError.
fig, ax = plt.subplots(
    nrows=nRow, ncols=3, figsize=(14, 1.5 * nRow), squeeze=False
)
for i, col in enumerate(num_cols):
    cur_ax = ax[i // 3, i % 3]
    sns.boxplot(data=df, x=col, ax=cur_ax, orient="h")

# Hide trailing panels of the last row that have no feature assigned.
for unused_ax in ax.flat[nNum:]:
    unused_ax.set_visible(False)

fig.suptitle('Boxplots of Numerical Features After Removing Outliers', fontsize=12)
plt.tight_layout()
plt.show()

# Histogram of every numerical feature, three panels per row.
nNum = len(num_cols)
nRow = (nNum + 2) // 3  # ceil(nNum / 3)

# squeeze=False keeps `ax` two-dimensional even for a single-row grid; with
# the default, ax[i // 3, i % 3] raises IndexError for <= 3 features.
fig, ax = plt.subplots(
    nrows=nRow, ncols=3, figsize=(14, 2 * nRow), squeeze=False
)
for i, col in enumerate(num_cols):
    cur_ax = ax[i // 3, i % 3]
    sns.histplot(data=df, x=col, ax=cur_ax)

# Hide trailing panels of the last row that have no feature assigned.
for unused_ax in ax.flat[nNum:]:
    unused_ax.set_visible(False)

fig.suptitle('Histogram of Numerical Features', fontsize=12)
plt.tight_layout()
plt.show()

# Recompute the categorical columns on the current frame: object-dtype
# columns plus the target `loan_status`, which is added whatever its dtype.
# Built in df.columns order instead of via a set union, because set
# iteration order for strings is hash-dependent and made the panel order of
# the categorical plots vary between runs.
cat_cols = [
    col for col in df.columns
    if df[col].dtype == "O" or col == "loan_status"
]

def autopct_func(pct, allvals):
    """Format a pie-wedge label as a percentage plus the absolute count.

    Args:
        pct: Share of the wedge in percent (0-100).
        allvals: The values of all wedges; their sum is the total count.

    Returns:
        A two-line string: the percentage with one decimal, then the
        rounded absolute count in parentheses.
    """
    total = np.sum(allvals)
    # Convert the percentage back to a count and round to the nearest integer.
    count = int(np.round(pct / 100. * total))
    return f"{pct:.1f}%\n({count:d})"

# Pie chart of the value frequencies of every categorical column, three
# panels per row, each with a legend listing the category shares.
nCat = len(cat_cols)
nRow = (nCat + 2) // 3  # ceil(nCat / 3)

# squeeze=False keeps `ax` two-dimensional even for a single-row grid; with
# the default, ax[i // 3, i % 3] raises IndexError for <= 3 columns.
fig, ax = plt.subplots(
    nrows=nRow, ncols=3, figsize=(14, 6.5 * nRow), squeeze=False
)
for i, col in enumerate(cat_cols):
    cur_ax = ax[i // 3, i % 3]
    s = df[col].value_counts()
    total = s.sum()

    # The lambda is invoked by pie() during this iteration, so it sees the
    # current `s`; wedge labels show both the percentage and the count.
    wedges, texts, autotexts = cur_ax.pie(
        s, labels=s.index, autopct=lambda pct: autopct_func(pct, s)
    )
    cur_ax.legend(
        labels=[
            '{} - {:.2f}%'.format(index, value / total * 100)
            for index, value in zip(s.index, s)
        ],
        loc='upper center',
        bbox_to_anchor=(0.5, 0),
        fancybox=True,
        shadow=True,
    )
    cur_ax.set_title("Frequency of " + col)

# Hide trailing panels of the last row that have no column assigned.
for unused_ax in ax.flat[nCat:]:
    unused_ax.set_visible(False)

# Render explicitly, as every other plotting cell in this file does.
plt.show()

# Bar plot of the mean of `loan_status` within each category of every
# categorical feature, three panels per row.
# The target itself is not a feature, so drop it from the list first.
if 'loan_status' in cat_cols:
    cat_cols.remove('loan_status')

nCat = len(cat_cols)
nRow = (nCat + 2) // 3  # ceil(nCat / 3)

# squeeze=False keeps `ax` two-dimensional even for a single-row grid; with
# the default, ax[i // 3, i % 3] raises IndexError for <= 3 columns.
fig, ax = plt.subplots(
    nrows=nRow, ncols=3, figsize=(14, 4 * nRow), squeeze=False
)
for i, col in enumerate(cat_cols):
    cur_ax = ax[i // 3, i % 3]
    sns.barplot(
        x=col, hue=col, y='loan_status', data=df, palette="pastel", ax=cur_ax
    )
    cur_ax.set_title("Mean Application per " + col, fontsize=12)
    cur_ax.tick_params(axis='x', labelrotation=45)

# Hide trailing panels of the last row that have no column assigned.
for unused_ax in ax.flat[nCat:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

# Row counts for every combination of previous-default flag and loan status.
df.groupby(
    by=['previous_loan_defaults_on_file', 'loan_status']
).size()

# Density histograms of every numerical feature, split by `loan_status`,
# three panels per row. Each class is normalized separately
# (common_norm=False) so the two distributions are comparable in shape.
nNum = len(num_cols)
nRow = (nNum + 2) // 3  # ceil(nNum / 3)

# squeeze=False keeps `ax` two-dimensional even for a single-row grid; with
# the default, ax[i // 3, i % 3] raises IndexError for <= 3 features.
fig, ax = plt.subplots(
    nrows=nRow, ncols=3, figsize=(14, 3.5 * nRow), squeeze=False
)
for i, col in enumerate(num_cols):
    cur_ax = ax[i // 3, i % 3]
    # Only income is drawn on a logarithmic x-axis.
    log_scale = col == 'person_income'
    sns.histplot(
        data=df,
        x=col,
        ax=cur_ax,
        kde=False,
        common_norm=False,
        stat="density",
        hue='loan_status',
        element='bars',
        log_scale=log_scale,
    )
    cur_ax.set_title("Distribution of " + col)

# Hide trailing panels of the last row that have no feature assigned.
for unused_ax in ax.flat[nNum:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

# add a log income as a new feature
# assign() returns a new frame instead of writing a column into `df` in
# place; `df` is a filtered selection at this point, and in-place column
# assignment on it triggers pandas' SettingWithCopyWarning.
# NOTE(review): np.log yields -inf / NaN for non-positive incomes — confirm
# that person_income is strictly positive in this dataset.
df = df.assign(log_income=np.log(df['person_income']))

# Pearson correlation between the numerical features, the log-income
# feature and the target.
corr_cols = [*num_cols, 'log_income', 'loan_status']
df.loc[:, corr_cols].corr(method='pearson')

# Separate the target from the features: every other column of `df` is a
# candidate feature (each model pipeline selects the columns it uses).
target = 'loan_status'
features = [col for col in df.columns if col != target]

# 80/20 split, stratified on the target so both parts keep the same class
# balance; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[features],
    df[target],
    test_size=0.2,
    stratify=df[target],
    random_state=42,
)

# Columns available to the models.
X_train.columns

# Model 1: logistic regression on standardized numerical features and
# one-hot encoded categorical features.
model1_numerical_cols = [
    'person_age',
    'person_income',
    'person_emp_exp',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
    'credit_score',
]

model1_cat_cols = [
    'person_gender',
    'person_education',
    'person_home_ownership',
    'loan_intent',
    'previous_loan_defaults_on_file',
]

# ColumnTransformer for preprocessing
model1_preprocessor = ColumnTransformer(
    transformers=[
        # Apply StandardScaler to continuous feature
        ('scaler', StandardScaler(), model1_numerical_cols),
        # Apply OneHotEncoder to the categorical feature.
        # `sparse_output` replaces the `sparse` keyword, which was deprecated
        # in scikit-learn 1.2 and removed in 1.4 (requires scikit-learn >= 1.2).
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), model1_cat_cols),
    ]
)

# Full pipeline with Logistic Regression
model1_pipeline = Pipeline(steps=[
    ('preprocessor', model1_preprocessor),
    ('classifier', LogisticRegression(
        C=10,
        penalty='l2',
        solver='lbfgs',
        random_state=42,
    )),
])

# fit the pipeline
model1_pipeline.fit(X_train, y_train)

# Hard class predictions (kept for threshold-based metrics) and the predicted
# probability of the positive class (second column of predict_proba).
model1_y_test_pred = model1_pipeline.predict(X_test)
model1_y_test_proba = model1_pipeline.predict_proba(X_test)[:, 1]

# Evaluate the model
# ROC-AUC and the precision-recall curve are ranking metrics and need
# continuous scores; feeding them 0/1 predictions collapses each curve to a
# single operating point and misreports the AUC.
model1_roc_auc = roc_auc_score(y_test, model1_y_test_proba)
precision, recall, _ = precision_recall_curve(y_test, model1_y_test_proba)
model1_pr_auc = auc(recall, precision)

print(f"ROC-AUC: {model1_roc_auc:.2f}")
print(f"PR AUC: {model1_pr_auc:.2f}")

# Model 2: logistic regression where some numerical features are bucketed
# into quantile bins and one-hot encoded, the remaining numerical features
# (including log_income) are standardized, and categorical features are
# one-hot encoded.
model2_num_bucket_cols = [
    'person_age',
    'credit_score',
    'person_emp_exp',
    'loan_amnt',
    'cb_person_cred_hist_length',
]
model2_num_cols = ['loan_int_rate', 'loan_percent_income', 'log_income']
model2_cat_cols = [
    'person_gender',
    'person_education',
    'person_home_ownership',
    'loan_intent',
    'previous_loan_defaults_on_file',
]

# Nested pipeline for continuous_feature: StandardScaler -> KBinsDiscretizer -> OneHotEncoder
# NOTE(review): quantile binning is rank-based, so the preceding scaler
# probably does not change the bins; it is kept to preserve the original
# pipeline structure.
# `sparse_output` replaces the `sparse` keyword of OneHotEncoder, which was
# deprecated in scikit-learn 1.2 and removed in 1.4 (requires >= 1.2).
model2_binned_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),  # Standardize the feature
    ('binning', KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='quantile')),
    ('onehot', OneHotEncoder(drop='first', sparse_output=False)),  # One-hot encode with drop first
])

# ColumnTransformer for preprocessing
model2_preprocessor = ColumnTransformer(
    transformers=[
        ('binned', model2_binned_pipeline, model2_num_bucket_cols),
        ('scaler', StandardScaler(), model2_num_cols),
        ('onehot', OneHotEncoder(drop='first', sparse_output=False), model2_cat_cols),
    ]
)

# Full pipeline with Logistic Regression
model2_pipeline = Pipeline(steps=[
    ('preprocessor', model2_preprocessor),
    ('classifier', LogisticRegression(
        C=10,
        penalty='l2',
        solver='lbfgs',
        random_state=42,
    )),
])

# fit the pipeline
model2_pipeline.fit(X_train, y_train)

# Hard class predictions (kept for threshold-based metrics) and the predicted
# probability of the positive class (second column of predict_proba).
model2_y_test_pred = model2_pipeline.predict(X_test)
model2_y_test_proba = model2_pipeline.predict_proba(X_test)[:, 1]

# Evaluate the model
# ROC-AUC and the precision-recall curve are ranking metrics and need
# continuous scores; feeding them 0/1 predictions collapses each curve to a
# single operating point and misreports the AUC.
model2_roc_auc = roc_auc_score(y_test, model2_y_test_proba)
precision, recall, _ = precision_recall_curve(y_test, model2_y_test_proba)
model2_pr_auc = auc(recall, precision)

print(f"ROC-AUC: {model2_roc_auc:.2f}")
print(f"PR AUC: {model2_pr_auc:.2f}")

# Refit model 2 on the complete dataset and tabulate its coefficients.
model2_pipeline.fit(df[features], df[target])

# Pull the fitted steps out of the pipeline.
steps = model2_pipeline.named_steps
classifier = steps['classifier']
preprocessor = steps['preprocessor']

# Output feature names of each transformer, in the order the
# ColumnTransformer lists them: binned, scaled, then one-hot categorical.
fitted_transformers = preprocessor.named_transformers_
binned_onehot = fitted_transformers['binned'].named_steps['onehot']
binned_feature_names = binned_onehot.get_feature_names_out(model2_num_bucket_cols)
num_feature_names = fitted_transformers['scaler'].get_feature_names_out(model2_num_cols)
categorical_feature_names = fitted_transformers['onehot'].get_feature_names_out(model2_cat_cols)

# Combine all feature names
feature_names = np.concatenate(
    [binned_feature_names, num_feature_names, categorical_feature_names]
)

# coef_ holds one row per fitted decision function; for this binary
# problem the first (only) row is used.
coefficients = classifier.coef_[0]

# One row per transformed feature, shown from largest to smallest coefficient.
coefficients_df = pd.DataFrame(
    {'Feature': feature_names, 'Coefficient': coefficients}
)
display(coefficients_df.sort_values(by='Coefficient', ascending=False))