# Necessary imports
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")
from tqdm import tqdm, trange
import warnings; warnings.filterwarnings("ignore")
# Load data
data = pd.read_csv("data/pilot Report as at 31st jan.csv", parse_dates=['CreateDate', 'InvoiceDate'])
data
# Data descriptions
data_dict = pd.read_excel("data/data dictionary.xlsx")
data_dict
# Correct column name and print descriptions in full
data_dict.columns = ['Attribute', 'Description']
for row in data_dict.index:
print(f"{data_dict.loc[row, 'Attribute']} : {data_dict.loc[row, 'Description']}\n")
from datastand.datastand import datastand
datastand(data)
# fix data types
def to_int(value):
if str(value) != 'nan':
value = int(''.join(str(value).split(',')))
return value
def fix_dtypes(df):
df.CreditLimit = df.CreditLimit.apply(to_int)
#df.Cleared = df.Cleared.astype(bool)
#df.Overdue = df.Overdue.astype(bool)
df.CreditUsed = df.CreditUsed.apply(to_int)
df.AmountRepaid = df.AmountRepaid.apply(to_int)
df.Balance = df.Balance.apply(to_int)
return df
data = fix_dtypes(data)
print(data.dtypes)
data.dropna(subset=['Overdue',
'CreditUsed', 'AmountRepaid', 'Balance', 'Fees', 'DaysOverdue',
'CreateDate'], axis=0, inplace=True)
# reset index
data.reset_index(drop=True, inplace=True)
data.isnull().sum()
data[data.Cleared.isna()].sample(5, random_state=101)
data[data.Fees == 0].count()[0]
for idx in tqdm(data[data.Cleared.isna()].index):
if data.loc[idx, 'AmountRepaid'] >= data.loc[idx, 'CreditUsed']:
data.loc[idx, 'Cleared'] = True
else:
data.loc[idx, 'Cleared'] = False
# Check for duplicates
data[data.duplicated(subset=['CreditLimit', 'SONumber', 'Cleared', 'Overdue',
'CreditUsed', 'AmountRepaid', 'Balance', 'Fees', 'DaysOverdue',
'CreateDate', 'InvoiceDate', 'group'])]
def feature_eng(df):
# datetime features
df['CreateDate_year'] = df.CreateDate.dt.year.astype(int)
df['CreateDate_month'] = df.CreateDate.dt.month.astype(int)
df['CreateDate_day'] = df.CreateDate.dt.day.astype(int)
df['CreateDate_dayname'] = df.CreateDate.dt.day_name()
df['InvoiceDate_year'] = df.InvoiceDate.dt.year.astype(int)
df['InvoiceDate_month'] = df.InvoiceDate.dt.month.astype(int)
df['InvoiceDate_day'] = df.InvoiceDate.dt.day.astype(int)
df['InvoiceDate_dayname'] = df.InvoiceDate.dt.day_name()
# time difference in days between create date and invoice date
df['CrtInv_dateDiff'] = (df.InvoiceDate - df.CreateDate).apply(lambda x: int(str(x).split()[0]))
# binary category columns for cleared and overdue columns
df['cleared_cat'] = df.Cleared.map({True: 'Cleared', False: 'Not Cleared'})
df['overdue_cat'] = df.Overdue.map({True: 'Overdue', False: 'On time'})
return df
data = feature_eng(data)
data.head()
# Total number of customers spanned in this dataset
print(f"Total Customers: {data.PartnerID.nunique()}")
# Distribution of Cleared column
print(data.Cleared.value_counts())
# as a percentage
print(data.Cleared.value_counts()*100 / len(data))
plt.figure(figsize=(10,6))
sns.countplot(data.Cleared)
plt.title("Distribution of Cleared status")
# Distribution of Overdue column
print(data.Overdue.value_counts())
# as a percentage
print(data.Overdue.value_counts()*100 / len(data))
plt.figure(figsize=(10,6))
sns.countplot(data.Overdue)
plt.title("Distribution of Overdue status")
loan_times_dict = data.groupby('PartnerID')['SONumber'].count().to_dict()
# sort dictionary
marklist = sorted(loan_times_dict.items(), key=lambda x:x[1], reverse=True)
loan_times_dict = dict(marklist)
for key, val in zip(list(loan_times_dict.keys())[:10], list(loan_times_dict.values())[:10]):
print(f"{key} : {val}")
defaulting_customers = pd.DataFrame(sorted(data[data.Cleared == False].groupby('PartnerID')['Balance'].sum().to_dict().items(),
key=lambda x:x[1], reverse=True),
columns=['PartnerID', 'Total_balance'])
defaulting_customers = defaulting_customers[defaulting_customers.Total_balance > 0]
defaulting_customers
defaulting_customers.head(10)
data[data.DaysOverdue >= 100]#.count()[0]
data[data.DaysOverdue >= 100].count()[0]
plt.figure(figsize=(15,8))
sns.lineplot(x='CreateDate', y='CreditUsed', data=data.sort_values(by='CreateDate'),
hue='Cleared')
plt.title("Credit usage over time")
data[(data.CreateDate > pd.to_datetime('2021-12-01')) & (data.CreateDate < pd.to_datetime('2021-12-15')) &
(data.CreditUsed > 6000) & (data.Cleared == False)]
plt.figure(figsize=(10,6))
sns.countplot(y=data.CreateDate_dayname,
order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'])
plt.title("Number of Loan orders day-wise(CreateDate)")
plt.figure(figsize=(10,6))
sns.countplot(y=data.InvoiceDate_dayname,
order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'])
plt.title("Number of Invoices day-wise(InvoiceDate)")
data.CreateDate_year.value_counts()
data.CreateDate_month.value_counts()
data.InvoiceDate_month.value_counts()
data.CrtInv_dateDiff.value_counts()
fig, ax = plt.subplots(1,2, figsize=(14,6))
ax[0].pie(data.CrtInv_dateDiff.value_counts(), labels=['2 days','3 days','4 days','6 days','7 days','9 days','5 days'])
sns.countplot(y=data.CrtInv_dateDiff, ax=ax[1])
fig.suptitle("Difference in days from create date to invoice date")
cols, rows = 3, 2
fig, axes = plt.subplots(rows, cols, figsize=(16,12))
columns = ['CreditLimit', 'CreditUsed', 'AmountRepaid', 'Balance', 'Fees', 'DaysOverdue']
for index, col in enumerate(columns):
# new subplot with (i + 1)-th index laying on a grid
plt.subplot(rows, cols, index + 1)
# drawing the plot
sns.boxplot(x='cleared_cat', y=col, data=data)
plt.title(f"{col}")
fig.suptitle("Numerical columns in relation to Cleared status")
plt.show()
print(data[['CreditLimit', 'CreditUsed']].corr())
sns.heatmap(data[['CreditLimit', 'CreditUsed']].corr())
sns.jointplot(x='CreditLimit', y='CreditUsed', data=data, kind='reg')
sns.pairplot(data, vars=['CreditLimit', 'CreditUsed', 'AmountRepaid', 'Balance', 'Fees', 'DaysOverdue'], hue='Cleared', corner=True)
from pandas_profiling import ProfileReport
ProfileReport(data)