import bamboolib as bam
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
# Load the training data (comma-separated, '.' as the decimal mark).
default = pd.read_csv(r'train.csv', sep=',', decimal='.')

# Normalise the headers to snake_case: lower-case, trim surrounding
# whitespace, then collapse every run of spaces, underscores or hyphens into
# a single underscore.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, so headers containing spaces or hyphens would be
# left untouched and the snake_case names used below would not be found.
col_names = (
    default.columns
    .str.lower()
    .str.strip()
    .str.replace(r'[ _-]+', '_', regex=True)
)
default.columns = col_names

# Discard the columns that are not used in the rest of the analysis.
default = default.drop(
    columns=[
        'id',
        'batch_enrolled',
        'term',
        'initial_list_status',
        'application_type',
        'payment_plan',
    ]
)

# Swap the labels of these two columns. Later steps treat
# 'employmentDuration' as numeric and 'homeOwner' as categorical, so the
# source file presumably ships with the two headers mixed up -- TODO confirm
# against the data dictionary.
default = default.rename(
    columns={
        'employment_duration': 'homeOwner',
        'home_ownership': 'employmentDuration',
    }
)
# First look at the data: leading and trailing rows, dimensions, dtypes and
# summary statistics (transposed and rounded to two decimals).
# NOTE: the bare expressions only display something when run as the last
# statement of a notebook cell; default.info() prints in any context.
default.head()
default.tail()
default.shape
default.info()
default.describe().T.round(2)

# accounts_delinquent holds a single value, so it is of no use for modelling.
default = default.drop('accounts_delinquent', axis=1)

# Data-quality checks: count of duplicated rows, missing values per column.
default.duplicated().sum()
default.isna().sum()
# Box plots of all columns side by side, used to spot variables with outliers.
sns.set(rc={"figure.figsize": (12, 8)})
box_plot = sns.boxplot(data=default).set(
    title='Box plot revealing outliers from each variable',
)
# Tilt the column names so they do not overlap on the x axis.
plt.xticks(rotation=70)
# Remove outliers with the 1.5 * IQR rule, one column after another.
# Rows whose value lies on or beyond either fence are dropped in place, so the
# quartiles of each column are computed on the rows that survived the
# previous columns. Rows with a missing value in the column are kept.
outlier_col = [
    'employmentDuration',
    'revolving_balance',
    'total_current_balance',
    'total_revolving_credit_limit',
]
for col in outlier_col:
    values = default[col]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    upper_fence = q3 + 1.5 * iqr
    lower_fence = q1 - 1.5 * iqr
    is_outlier = (values >= upper_fence) | (values <= lower_fence)
    default.drop(index=default.index[is_outlier.to_numpy()], inplace=True)

# Renumber the remaining rows 0..n-1 once all columns have been filtered.
default.reset_index(drop=True, inplace=True)
# Density of total_current_balance: histogram with a KDE curve on top.
# sns.distplot is deprecated and scheduled for removal from seaborn;
# histplot(kde=True, stat='density', kde_kws={'cut': 3}) is the documented
# replacement that draws the same kind of figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    default['total_current_balance'],
    kde=True,
    stat='density',
    kde_kws={'cut': 3},
    ax=ax,
)
plt.xlabel('Total current balance')
plt.title("Most people only have 50k balance.", fontsize=18)
plt.suptitle('Density of total current balance', fontsize=22)
plt.show()
import plotly.graph_objs as go

# Histogram of debit_to_income with bins of width 1 spanning 1 to 41.
# The figure is not assigned to a name; presumably it is meant to render as
# the output of a notebook cell.
go.Figure(
    data=[
        go.Histogram(
            x=default["debit_to_income"],
            xbins=dict(start=1.0, end=41.0, size=1.0),
        ),
    ],
    layout=go.Layout(
        title="Histogram of debit_to_income",
        yaxis=dict(title="Count"),
        bargap=0.05,
    ),
)
# Horizontal grouped bars: delinquency_two_years on the y axis, one colour
# per homeOwner category.
fig = px.histogram(
    default,
    y='delinquency_two_years',
    color='homeOwner',
    orientation='h',
    barmode='group',
    height=600,
    title='Bar chart of Delinquency in Two Years vs Home Ownership',
)
fig

# Box plots of total_current_balance, one box per delinquency_two_years value.
fig = px.box(
    default,
    x='total_current_balance',
    color='delinquency_two_years',
    boxmode='group',
    title='Boxplot of Total Current Balance vs Delinquency',
)
fig

# The bamboolib widget cannot be rendered on the cloud platform; this plot is
# meant to be produced on a local machine once the code is finished.
bam.plot(default, 'debit_to_income', 'total_current_balance')
from sklearn.feature_selection import SelectKBest, chi2

# One-hot encode the categorical variables.
default = pd.get_dummies(
    data=default,
    columns=[
        'verification_status',
        'grade',
        'sub_grade',
        'homeOwner',
        'loan_title',
    ],
)

# Predictors are every column except the target, loan_status.
X = default.drop(columns='loan_status')
y = default['loan_status']

# Score every predictor against the target with the chi-squared statistic;
# the selector is configured to keep the 10 best.
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)

# One row per predictor: its name ('Specs') next to its chi-squared score.
featureScores = pd.DataFrame({'Specs': X.columns, 'Score': fit.scores_})

# Show the 10 highest-scoring features.
print(featureScores.nlargest(10, 'Score'))
# Bar plot of the ten highest chi-squared scores.
feature_score = featureScores.nlargest(10, 'Score')
fig, ax = plt.subplots(figsize=(16, 10))
sns.barplot(data=feature_score, x="Specs", y="Score", ax=ax).set(
    title='Bar plot of scores of the 10 best features',
)
# Changes the global default font size (applies to text created from here on).
plt.rc('font', size=15)
# Rotate and right-align the x tick labels so the long feature names fit.
fig.autofmt_xdate()

# Correlation matrix of the selected features together with the target,
# shown as a colour-graded table.
features = [*feature_score['Specs'], 'loan_status']
default[features].corr().style.background_gradient(cmap='coolwarm')
# Render the correlation matrix of the selected features as a table figure.
fig, ax = plt.subplots(figsize=(16, 16))

# The very large font size is confined to this figure with rc_context.
# Setting it through plt.rcParams.update would change the global default and
# make every later plot in the script draw its text at 500 pt.
# NOTE(review): the 500 pt value presumably only serves as a large starting
# size for the table's automatic font fitting -- confirm the rendered result.
with plt.rc_context({'font.size': 500}):
    # Hide the figure background and the axes; only the table should be visible.
    fig.patch.set_visible(False)
    ax.axis('off')
    ax.axis('tight')

    # Correlations rounded to two decimals for display.
    corr = default[features].corr().round(2)

    # Build the table and stretch the rows vertically by a factor of 10.
    table = ax.table(cellText=corr.values, colLabels=corr.columns, loc='center')
    table.scale(1, 10)

    # Display the table.
    fig.tight_layout()
    plt.show()
# Heat map of the correlation matrix of every column, on a diverging colour
# scale fixed to [-1, 1] and centred on 0.
fig, ax = plt.subplots(figsize=(16, 16))
sns.heatmap(
    default.corr(),
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
    ax=ax,
)

# Distribution plots of the two loan amount columns, one figure each.
for amount_col in ("loan_amount", "funded_amount"):
    sns.displot(default, x=amount_col)
# Confusion matrix of the logistic regression classifier (work in progress).
# TODO: y_test and y_pred_lr are not created anywhere in the visible part of
# this script; a train/test split and a fitted model must provide them before
# this section can run.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred_lr)

# Tick labels for the two loan_status classes, in ascending class order.
# NOTE(review): assumes loan_status is coded 0 = not default, 1 = default --
# confirm against the data dictionary.
x_axis_labels = ["Not default", "Default"]
y_axis_labels = ["Not default", "Default"]

f, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(
    cm,
    annot=True,
    linewidths=0.2,
    linecolor="black",
    fmt=".0f",
    ax=ax,
    cmap="Purples",
    xticklabels=x_axis_labels,
    yticklabels=y_axis_labels,
)
plt.xlabel("PREDICTED LABEL")
plt.ylabel("TRUE LABEL")
plt.title('Confusion Matrix for Logistic Regression Classifier')
#plt.savefig("lrcm.png", format='png', dpi=500, bbox_inches='tight')
plt.show()