ALY6040 Final Project

import warnings

# Silence library warnings for the whole notebook; set before the third-party
# imports so that import-time warnings are suppressed as well.
warnings.filterwarnings('ignore')

import bamboolib as bam
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Load the training set into the working frame used by every later cell.
default = pd.read_csv(
    r'train.csv',
    sep=',',
    decimal='.',
)

# Normalise the header names: lower-case, trim surrounding whitespace and
# collapse every run of spaces, underscores or hyphens into one underscore.
# regex=True is spelled out because Series.str.replace treats the pattern as
# a literal string by default since pandas 2.0; without it no header would be
# rewritten and the drop() below would fail on the un-normalised names.
col_names = (
    default.columns
    .str.lower()
    .str.strip()
    .str.replace(r'[ _-]+', '_', regex=True)
)
default.columns = col_names

# Columns that are not used in the rest of the analysis.
default = default.drop(columns=[
    'id',
    'batch_enrolled',
    'term',
    'initial_list_status',
    'application_type',
    'payment_plan',
])

# The two names are deliberately crossed: later cells treat `homeOwner` as a
# categorical column (one-hot encoded, used as a colour group) and
# `employmentDuration` as a numeric one (quantile-based outlier filter).
# NOTE(review): presumably the raw file ships these two headers swapped --
# confirm against train.csv.
default = default.rename(columns={
    'employment_duration': 'homeOwner',
    'home_ownership': 'employmentDuration',
})
default.head()

# Last five rows of the cleaned frame.
default.tail(5)

# (rows, columns)
default.shape

# Column dtypes and non-null counts.
default.info()

# Summary statistics, one row per described column, rounded to two decimals.
default.describe().T.round(2)

# accounts_delinquent holds a single value throughout, so it is of no use
# for modelling.
default = default.drop(columns='accounts_delinquent')

# Number of fully duplicated rows.
default.duplicated().sum()

# Missing values per column.
default.isna().sum()

# Box plot of every variable on one figure, to eyeball outliers.
sns.set(rc={"figure.figsize": (12, 8)})
box_axes = sns.boxplot(data=default)
box_plot = box_axes.set(title='Box plot revealing outliers from each variable')
# Tilt the tick labels so the column names stay readable.
plt.xticks(rotation=70)

outlier_col = [
    'employmentDuration',
    'revolving_balance',
    'total_current_balance',
    'total_revolving_credit_limit',
]

# IQR filter, applied one column at a time on the already-filtered frame:
# rows on or beyond Q1 - 1.5*IQR / Q3 + 1.5*IQR are removed. The fences are
# inclusive, and rows where the column is NaN are kept because both
# comparisons evaluate to False for them.
for col in outlier_col:
    Q1 = default[col].quantile(0.25)
    Q3 = default[col].quantile(0.75)
    IQR = Q3 - Q1
    too_high = default[col] >= (Q3 + 1.5 * IQR)
    too_low = default[col] <= (Q1 - 1.5 * IQR)
    default = default.loc[~(too_high | too_low)]

# Renumber the surviving rows 0..n-1.
default = default.reset_index(drop=True)

# Density of total_current_balance (histogram with a KDE overlay).
fig, ax = plt.subplots(figsize=(10, 6))
# sns.distplot is deprecated and scheduled for removal from seaborn; the
# documented replacement is histplot on a density scale with a KDE overlay.
# The axes are passed explicitly instead of relying on the "current axes".
sns.histplot(
    default['total_current_balance'],
    kde=True,
    stat='density',
    kde_kws={'cut': 3},
    ax=ax,
)
plt.xlabel('Total current balance')
plt.title("Most people only have 50k balance.", fontsize=18)
plt.suptitle('Density of total current balance', fontsize=22)
plt.show()

import plotly.graph_objs as go

# Histogram of debit_to_income using bins of width 1 between 1 and 41.
dti_trace = go.Histogram(
    x=default["debit_to_income"],
    xbins={"start": 1.0, "end": 41.0, "size": 1.0},
)
dti_layout = go.Layout(
    title="Histogram of debit_to_income",
    yaxis={"title": "Count"},
    bargap=0.05,
)
go.Figure(data=[dti_trace], layout=dti_layout)

# Horizontal grouped bars: row counts per delinquency_two_years value,
# one bar colour per homeOwner category.
fig = px.histogram(
    default,
    y='delinquency_two_years',
    color='homeOwner',
    orientation='h',
    barmode='group',
    height=600,
    title='Bar chart of Delinquency in Two Years vs Home Ownership',
)
fig

# Distribution of total_current_balance, one box per
# delinquency_two_years value.
fig = px.box(
    default,
    x='total_current_balance',
    color='delinquency_two_years',
    boxmode='group',
    title='Boxplot of Total Current Balance vs Delinquency',
)
fig

# The cloud platform cannot render the bamboolib widget; this plot is meant
# to be produced on a local machine once the code is finished.
x_column = 'debit_to_income'
y_column = 'total_current_balance'
bam.plot(default, x_column, y_column)

# One-hot encode the categorical variables; each listed column is replaced
# by one indicator column per category.
categorical_cols = [
    'verification_status',
    'grade',
    'sub_grade',
    'homeOwner',
    'loan_title',
]
default = pd.get_dummies(default, columns=categorical_cols)

from sklearn.feature_selection import SelectKBest, chi2

# Target is loan_status; every other column is a candidate feature.
y = default['loan_status']
X = default.drop(columns='loan_status')

# Score each feature against the target with the chi-squared statistic.
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)

# Put feature names and scores side by side in one frame.
defaultscores = pd.DataFrame(fit.scores_)
defaultcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([defaultcolumns, defaultscores], axis=1)
featureScores = featureScores.set_axis(['Specs', 'Score'], axis=1)

# Print the ten highest-scoring features.
print(featureScores.nlargest(10, 'Score'))

# Bar plot of the ten highest feature scores.
fig, ax = plt.subplots(figsize=(16, 10))
feature_score = featureScores.nlargest(10, 'Score')
score_axes = sns.barplot(data=feature_score, x="Specs", y="Score")
score_axes.set(title='Bar plot of scores of the 10 best features')
plt.rc('font', size=15)
# Slant the x tick labels so the long feature names do not overlap.
fig.autofmt_xdate()

# Correlation matrix of the ten selected features plus the target,
# colour-coded cell by cell.
features = [*feature_score['Specs'], 'loan_status']
top_corr = default[features].corr()
top_corr.style.background_gradient(cmap='coolwarm')

# Render the rounded correlation matrix of `features` as a matplotlib table.

# define figure and axes
fig, ax = plt.subplots(figsize=(16, 16))

# The oversized base font is applied through rc_context so that it is
# restored when the block ends; a plain rcParams.update() would keep the
# 500-pt font for every figure drawn afterwards.
with plt.rc_context({'font.size': 500}):
    # hide the axes
    fig.patch.set_visible(False)
    ax.axis('off')
    ax.axis('tight')

    corr = default[features].corr().round(2)

    # create table
    table = ax.table(cellText=corr.values, colLabels=corr.columns, loc='center')
    # Keep the column width, make every row ten times taller.
    table.scale(1, 10)
    # table.set_fontsize(100)

    # display table
    fig.tight_layout()
    plt.show()

# Heat map of the correlation matrix of every column, on a fixed -1..1
# colour scale centred at zero.
fig, ax = plt.subplots(figsize=(16, 16))
full_corr = default.corr()
sns.heatmap(
    full_corr,
    vmin=-1,
    vmax=1,
    center=0,
    cmap='coolwarm',
    ax=ax,
)

#default.corr()

#default.corr().style.background_gradient(cmap='coolwarm')

# Distribution of the requested loan amount.
sns.displot(data=default, x="loan_amount")

# Distribution of the funded amount.
sns.displot(data=default, x="funded_amount")

# Confusion matrix for the logistic-regression classifier. Do it later.
# NOTE(review): y_test and y_pred_lr are not defined in the visible part of
# this file; this cell can only run after the train/test split and the
# model fit that produce them.

# Tick labels are taken from the classes actually present in the data (the
# previous hard-coded "Edible"/"Poisonous" labels belonged to a different
# data set). The same sorted list is handed to confusion_matrix so that the
# rows/columns of the matrix and the tick labels are guaranteed to line up.
class_labels = sorted(set(y_test) | set(y_pred_lr))
cm = confusion_matrix(y_test, y_pred_lr, labels=class_labels)

x_axis_labels = class_labels
y_axis_labels = class_labels

f, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(
    cm,
    annot=True,
    linewidths=0.2,
    linecolor="black",
    fmt=".0f",
    ax=ax,
    cmap="Purples",
    xticklabels=x_axis_labels,
    yticklabels=y_axis_labels,
)
plt.xlabel("PREDICTED LABEL")
plt.ylabel("TRUE LABEL")
plt.title('Confusion Matrix for Logistic Regression Classifier')
#plt.savefig("lrcm.png", format='png', dpi=500, bbox_inches='tight')
plt.show()