# Necessary imports
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("darkgrid")
from tqdm import tqdm, trange
import warnings; warnings.filterwarnings("ignore")
# Load data
data = pd.read_csv("data/pilot Report as at 31st jan.csv", parse_dates=['CreateDate', 'InvoiceDate'])
data
# Data descriptions
data_dict = pd.read_excel("data/data dictionary.xlsx")
data_dict
# Correct column name and print descriptions in full
data_dict.columns = ['Attribute', 'Description']
for row in data_dict.index:
print(f"{data_dict.loc[row, 'Attribute']} : {data_dict.loc[row, 'Description']}\n")
PartnerID : Customer Unique Identifier
CreditLimit : Maximum amount a customer can borrow at a given time
SONumber : Unique loan identifier
Cleared : Loan Status
True = Loan has been paid
False = Loan is still pending
Overdue : Loan Tenure Status
True = Loan has exceeded its repayment days
False = Loan is still within its repayment days
CreditUsed : Total Amount borrowed
AmountRepaid : Total Loan amount paid back
Balance : CreditUsed - AmountRepaid
Fees : Fees accrued from late repayment
DaysOverdue : Number of days the loan is overdue by
CreatedDate : Date order was placed
InvoiceDate : Date order was delivered
from datastand.datastand import datastand
datastand(data)
General stats:
==================
Size of DataFrame: 30667
Shape of DataFrame: (2359, 13)
Number of unique data types : {dtype('float64'), dtype('O'), dtype('int64'), dtype('<M8[ns]')}
Number of numerical columns: 3
Number of non-numerical columns: 8
Head of DataFrame:
__________________
PartnerID CreditLimit SONumber Cleared Overdue CreditUsed AmountRepaid \
0 36262 26,100 SO11705794 True False 1,464 1,464
1 36262 26,100 SO11705909 True True 146 148
2 36262 26,100 SO11780664 True False 1,650 1,650
3 36262 26,100 SO11833594 True False 8,220 8,220
4 36262 26,100 SO11909592 True False 2,080 2,080
Balance Fees DaysOverdue CreateDate InvoiceDate group
0 0 0.0 0.0 2021-10-15 2021-10-18 Test
1 0 2.0 0.0 2021-10-15 2021-10-18 Test
2 0 0.0 0.0 2021-10-19 2021-10-21 Test
3 0 0.0 0.0 2021-10-22 2021-10-25 Test
4 0 0.0 0.0 2021-10-27 2021-10-29 Test
Tail of DataFrame:
__________________
PartnerID CreditLimit SONumber Cleared Overdue CreditUsed \
2354 1298401 1,669 SO13572455 True True 870
2355 1298401 1,669 SO13572754 True True 220
2356 1298401 1,669 SO13810848 True False 1,344
2357 1298401 1,669 SO13810938 True False 236
2358 1298401 1,669 SO14061060 False False 1,320
AmountRepaid Balance Fees DaysOverdue CreateDate InvoiceDate group
2354 875 0 5.0 0.0 2022-01-05 2022-01-07 control
2355 222 0 2.0 0.0 2022-01-05 2022-01-07 control
2356 1,344 0 0.0 0.0 2022-01-15 2022-01-18 control
2357 236 0 0.0 0.0 2022-01-15 2022-01-18 control
2358 0 1,320 0.0 0.0 2022-01-25 2022-01-27 control
Data description:
_________________
count mean std min 25% \
PartnerID 2359.0 442192.579907 263014.785382 36262.0 303101.0
Fees 2337.0 18.633718 46.335946 0.0 0.0
DaysOverdue 2337.0 8.203680 23.101706 0.0 0.0
50% 75% max
PartnerID 388436.0 544293.5 1298401.0
Fees 0.0 14.0 504.0
DaysOverdue 0.0 0.0 106.0
Missing data:
=======================
DataFrame contains 314 missing values (1.02%) as follows column-wise:
-----------------------------------------------------------------------
PartnerID 0
CreditLimit 0
SONumber 94
Cleared 66
Overdue 22
CreditUsed 22
AmountRepaid 22
Balance 22
Fees 22
DaysOverdue 22
CreateDate 22
InvoiceDate 0
group 0
dtype: int64
-----------------------------------------------------------------------
Do you wish to long-list missing data statistics?(y/n): y
Column:
SONumber
_______________
Missing data points 94 out of total 2359.
Most occurring value: SO11554320, count: 1
Column:
Cleared
_______________
Missing data points 66 out of total 2359.
Most occurring value: True, count: 400
Column:
Overdue
_______________
Missing data points 22 out of total 2359.
Most occurring value: False, count: 1351
Column:
CreditUsed
_______________
Missing data points 22 out of total 2359.
Most occurring value: 860, count: 61
Column:
AmountRepaid
_______________
Missing data points 22 out of total 2359.
Most occurring value: 0, count: 389
Column:
Balance
_______________
Missing data points 22 out of total 2359.
Most occurring value: 0, count: 1928
Column:
Fees
_______________
Missing data points 22 out of total 2359.
Max value: 504.0 Min value: 0.0
Mean: 18.63371844244758 Std: 46.32603136768036
_________________________________________________________________
Column:
DaysOverdue
_______________
Missing data points 22 out of total 2359.
Max value: 106.0 Min value: 0.0
Mean: 8.203679931536158 Std: 23.096762544371032
_________________________________________________________________
Column:
CreateDate
_______________
Missing data points 22 out of total 2359.
You can visualize missing data automatically right away or you can use the
function plot_missing() after importing it from DataStand. Visualize now?(y/n): y
# fix data types
def to_int(value):
if str(value) != 'nan':
value = int(''.join(str(value).split(',')))
return value
def fix_dtypes(df):
df.CreditLimit = df.CreditLimit.apply(to_int)
#df.Cleared = df.Cleared.astype(bool)
#df.Overdue = df.Overdue.astype(bool)
df.CreditUsed = df.CreditUsed.apply(to_int)
df.AmountRepaid = df.AmountRepaid.apply(to_int)
df.Balance = df.Balance.apply(to_int)
return df
data = fix_dtypes(data)
print(data.dtypes)
PartnerID int64
CreditLimit int64
SONumber object
Cleared object
Overdue object
CreditUsed float64
AmountRepaid float64
Balance float64
Fees float64
DaysOverdue float64
CreateDate datetime64[ns]
InvoiceDate datetime64[ns]
group object
dtype: object
data.dropna(subset=['Overdue',
'CreditUsed', 'AmountRepaid', 'Balance', 'Fees', 'DaysOverdue',
'CreateDate'], axis=0, inplace=True)
# reset index
data.reset_index(drop=True, inplace=True)
data.isnull().sum()
data[data.Cleared.isna()].sample(5, random_state=101)
data[data.Fees == 0].count()[0]
for idx in tqdm(data[data.Cleared.isna()].index):
if data.loc[idx, 'AmountRepaid'] >= data.loc[idx, 'CreditUsed']:
data.loc[idx, 'Cleared'] = True
else:
data.loc[idx, 'Cleared'] = False
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:00<00:00, 1292.60it/s]
# Check for duplicates
data[data.duplicated(subset=['CreditLimit', 'SONumber', 'Cleared', 'Overdue',
'CreditUsed', 'AmountRepaid', 'Balance', 'Fees', 'DaysOverdue',
'CreateDate', 'InvoiceDate', 'group'])]
def feature_eng(df):
# datetime features
df['CreateDate_year'] = df.CreateDate.dt.year.astype(int)
df['CreateDate_month'] = df.CreateDate.dt.month.astype(int)
df['CreateDate_day'] = df.CreateDate.dt.day.astype(int)
df['CreateDate_dayname'] = df.CreateDate.dt.day_name()
df['InvoiceDate_year'] = df.InvoiceDate.dt.year.astype(int)
df['InvoiceDate_month'] = df.InvoiceDate.dt.month.astype(int)
df['InvoiceDate_day'] = df.InvoiceDate.dt.day.astype(int)
df['InvoiceDate_dayname'] = df.InvoiceDate.dt.day_name()
# time difference in days between create date and invoice date
df['CrtInv_dateDiff'] = (df.InvoiceDate - df.CreateDate).apply(lambda x: int(str(x).split()[0]))
# binary category columns for cleared and overdue columns
df['cleared_cat'] = df.Cleared.map({True: 'Cleared', False: 'Not Cleared'})
df['overdue_cat'] = df.Overdue.map({True: 'Overdue', False: 'On time'})
return df
data = feature_eng(data)
data.head()
# Total number of customers spanned in this dataset
print(f"Total Customers: {data.PartnerID.nunique()}")
Total Customers: 190
# Distribution of Cleared column
print(data.Cleared.value_counts())
# as a percentage
print(data.Cleared.value_counts()*100 / len(data))
plt.figure(figsize=(10,6))
sns.countplot(data.Cleared)
plt.title("Distribution of Cleared status")
True 1937
False 400
Name: Cleared, dtype: int64
True 82.884039
False 17.115961
Name: Cleared, dtype: float64
# Distribution of Overdue column
print(data.Overdue.value_counts())
# as a percentage
print(data.Overdue.value_counts()*100 / len(data))
plt.figure(figsize=(10,6))
sns.countplot(data.Overdue)
plt.title("Distribution of Overdue status")
False 1351
True 986
Name: Overdue, dtype: int64
False 57.809157
True 42.190843
Name: Overdue, dtype: float64
loan_times_dict = data.groupby('PartnerID')['SONumber'].count().to_dict()
# sort dictionary
marklist = sorted(loan_times_dict.items(), key=lambda x:x[1], reverse=True)
loan_times_dict = dict(marklist)
for key, val in zip(list(loan_times_dict.keys())[:10], list(loan_times_dict.values())[:10]):
print(f"{key} : {val}")
388436 : 122
363796 : 116
548447 : 86
437063 : 84
400649 : 78
105975 : 73
303101 : 60
302148 : 54
410793 : 52
340828 : 44
defaulting_customers = pd.DataFrame(sorted(data[data.Cleared == False].groupby('PartnerID')['Balance'].sum().to_dict().items(),
key=lambda x:x[1], reverse=True),
columns=['PartnerID', 'Total_balance'])
defaulting_customers = defaulting_customers[defaulting_customers.Total_balance > 0]
defaulting_customers
defaulting_customers.head(10)
data[data.DaysOverdue >= 100]#.count()[0]
data[data.DaysOverdue >= 100].count()[0]
plt.figure(figsize=(15,8))
sns.lineplot(x='CreateDate', y='CreditUsed', data=data.sort_values(by='CreateDate'),
hue='Cleared')
plt.title("Credit usage over time")
data[(data.CreateDate > pd.to_datetime('2021-12-01')) & (data.CreateDate < pd.to_datetime('2021-12-15')) &
(data.CreditUsed > 6000) & (data.Cleared == False)]
plt.figure(figsize=(10,6))
sns.countplot(y=data.CreateDate_dayname,
order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'])
plt.title("Number of Loan orders day-wise(CreateDate)")
plt.figure(figsize=(10,6))
sns.countplot(y=data.InvoiceDate_dayname,
order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday'])
plt.title("Number of Invoices day-wise(InvoiceDate)")
data.CreateDate_year.value_counts()
data.CreateDate_month.value_counts()
data.InvoiceDate_month.value_counts()
data.CrtInv_dateDiff.value_counts()
fig, ax = plt.subplots(1,2, figsize=(14,6))
ax[0].pie(data.CrtInv_dateDiff.value_counts(), labels=['2 days','3 days','4 days','6 days','7 days','9 days','5 days'])
sns.countplot(y=data.CrtInv_dateDiff, ax=ax[1])
fig.suptitle("Difference in days from create date to invoice date")
cols, rows = 3, 2
fig, axes = plt.subplots(rows, cols, figsize=(16,12))
columns = ['CreditLimit', 'CreditUsed', 'AmountRepaid', 'Balance', 'Fees', 'DaysOverdue']
for index, col in enumerate(columns):
# new subplot with (i + 1)-th index laying on a grid
plt.subplot(rows, cols, index + 1)
# drawing the plot
sns.boxplot(x='cleared_cat', y=col, data=data)
plt.title(f"{col}")
fig.suptitle("Numerical columns in relation to Cleared status")
plt.show()
print(data[['CreditLimit', 'CreditUsed']].corr())
sns.heatmap(data[['CreditLimit', 'CreditUsed']].corr())
CreditLimit CreditUsed
CreditLimit 1.000000 0.360038
CreditUsed 0.360038 1.000000
sns.jointplot(x='CreditLimit', y='CreditUsed', data=data, kind='reg')
sns.pairplot(data, vars=['CreditLimit', 'CreditUsed', 'AmountRepaid', 'Balance', 'Fees', 'DaysOverdue'], hue='Cleared', corner=True)
from pandas_profiling import ProfileReport
ProfileReport(data)
IPyWidgets are not supported
IPyWidgets are not supported
IPyWidgets are not supported