from jupyterthemes.stylefx import set_nb_theme
set_nb_theme('gruvboxd')
%run Libraries.ipynb
%run Utility_tools.ipynb
raw = pd.read_csv('Risk_Analytics.csv')
df = deepcopy(raw)
df.drop('Loan ID', axis=1, inplace=True)
#Normalize column names: drop '?', trim whitespace (before spaces become stray underscores), snake_case everything
df.rename(columns=lambda col: col.replace('?', '').strip().replace(' ', '_').lower(), inplace=True)
reduce_memory_usage(df)
Memory usage before: 39.88 MB
Memory usage now : 7.04 MB
Memory usage decreased by 82.3%
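#reduce_memory_usage comes from Utility_tools.ipynb. A minimal sketch of such a
#helper, assuming it just downcasts numeric columns and converts low-cardinality
#object columns to category (the real implementation may differ):
def reduce_memory_usage_sketch(frame):
    before = frame.memory_usage(deep=True).sum() / 1024**2
    for col in frame.columns:
        if pd.api.types.is_integer_dtype(frame[col]):
            frame[col] = pd.to_numeric(frame[col], downcast='integer')
        elif pd.api.types.is_float_dtype(frame[col]):
            frame[col] = pd.to_numeric(frame[col], downcast='float')
        elif frame[col].dtype == object and frame[col].nunique() < 100:
            frame[col] = frame[col].astype('category')
    after = frame.memory_usage(deep=True).sum() / 1024**2
    print(f'Memory usage before: {before:.2f} MB')
    print(f'Memory usage now : {after:.2f} MB')
    print(f'Memory usage decreased by {(before - after) / before * 100:.1f}%')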
def plot_countplot(data):
    sns.set_theme(style="darkgrid")
    plt.figure(figsize=(9, 7))
    ax = sns.countplot(data=data, x='payment')
    total = len(data)  #constant, so compute it once outside the loop
    for p in ax.patches:
        ax.annotate('{:.1f}%'.format(p.get_height() / total * 100),
                    (p.get_x() + 0.25, p.get_height() + 0.01))
    plt.show()
plot_countplot(df)
df.isna().sum()
df[df['profession'].isna()].shape[0]/df.shape[0]
df[df['suite_type'].isna()].shape[0]/df.shape[0]
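#The same missing-value shares for every column in one pass:
df.isna().mean().sort_values(ascending=False)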
df.drop('profession', axis=1, inplace=True)
df[df['number_of_family_members'].isna()]
#I reckon there has to be a linear relationship between number_of_children and number_of_family_members.
df['number_of_children'].corr(df['number_of_family_members'])
#As seen above, every row with a missing number_of_family_members has zero children.
df[df['number_of_children']==0]['number_of_family_members'].value_counts(normalize=True)*100
#So we'll fill the missing values with 2, the modal family size among childless applicants.
df['number_of_family_members'].fillna(value=2, inplace=True)
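#A more general (hypothetical) alternative: impute with the modal family size of
#each number_of_children group. Since the missing rows all have zero children,
#this reduces to the same fill value of 2.
modal_size = df.groupby('number_of_children')['number_of_family_members'] \
               .transform(lambda s: s.mode().iloc[0])
df['number_of_family_members'] = df['number_of_family_members'].fillna(modal_size)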
#Sanity check: no record should report more children than family members.
df[df['number_of_children']>df['number_of_family_members']]
#Let's fill in our last missing values column.
def plot_pie(data, column):
    plt.figure(figsize=(9, 6))
    data[column].value_counts(normalize=True).plot(kind='pie')
    plt.show()
plot_pie(df, 'suite_type')
df['suite_type'].value_counts()
df['suite_type'].fillna(value='Unaccompanied', inplace=True)
df.duplicated().sum()
#Remove duplicate records
duplicate_check_remove(df)
Number of duplicate rows before: 15213
Number of duplicate rows now: 0
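#duplicate_check_remove also lives in Utility_tools.ipynb. A plausible sketch,
#assuming it simply reports and drops exact duplicate rows in place:
def duplicate_check_remove_sketch(frame):
    print(f'Number of duplicate rows before: {frame.duplicated().sum()}')
    frame.drop_duplicates(inplace=True)
    print(f'Number of duplicate rows now: {frame.duplicated().sum()}')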
df.shape
df_cont = df.select_dtypes(include='number').copy()  #.copy() avoids SettingWithCopyWarning below
df_cat = df.select_dtypes(include='category').copy()
#These flags and counts are stored as numbers but are really categorical
df_cat['have_a_phone'] = df_cont['have_a_phone']
df_cat['have_a_mail'] = df_cont['have_a_mail']
df_cat['number_of_children'] = df_cont['number_of_children']
df_cat['payment'] = df_cont['payment']
df_cat['number_of_family_members'] = df_cont['number_of_family_members']
df_cont['income_credit_ratio'] = df_cont['total_income'] / df_cont['amount_of_credit'] #engineered feature
df_cont.drop(['number_of_children', 'have_a_phone', 'have_a_mail', 'number_of_family_members'], axis=1, inplace=True)
df_cont.sample(4)
#Comparing means
pd.pivot_table(data=df_cont, index='payment', aggfunc=['mean', 'median'])
#Checking the equal-variances assumption. It is violated for most features, so we continue with Welch's t-test
pd.pivot_table(data=df_cont, index='payment', aggfunc=['var', 'count'])
for col in df_cont.columns[1:]:
    pval = ttest(df_cont[df_cont['payment'] == 0][col],
                 df_cont[df_cont['payment'] == 1][col],
                 correction='auto')['p-val'].iloc[0]
    print(f'P-value for {col}: {pval}')
P-value for total_income: 0.25663433686295833
P-value for amount_of_credit: 1.7653363445999923e-101
P-value for income_credit_ratio: 0.5355667560092505
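#Cross-checking one p-value with SciPy's Welch t-test (equal_var=False); this
#should agree closely with the corrected ttest above (assumed to be pingouin's,
#imported in Libraries.ipynb):
from scipy.stats import ttest_ind
ttest_ind(df_cont[df_cont['payment'] == 0]['amount_of_credit'],
          df_cont[df_cont['payment'] == 1]['amount_of_credit'],
          equal_var=False)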
from sklearn.feature_selection import VarianceThreshold
print(df_cont.shape)
var_filter = VarianceThreshold(threshold=0.0)  #removes only zero-variance (constant) features
train = var_filter.fit_transform(df_cont)
print(train.shape) #All four columns survive, so none are constant
(292298, 4)
(292298, 4)
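#fit_transform returns a bare ndarray; the surviving column names can be
#recovered from the fitted selector's boolean mask:
print(list(df_cont.columns[var_filter.get_support()]))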
find_outliers_iqr(df_cont)
Number of outlier records in total_income column: 13938
Number of outlier records in amount_of_credit column: 6502
Number of outlier records in income_credit_ratio column: 16955
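#find_outliers_iqr is another Utility_tools.ipynb helper. A minimal sketch,
#assuming it counts values outside the 1.5*IQR Tukey fences for each feature:
def find_outliers_iqr_sketch(frame):
    for col in frame.columns[1:]:  #skip the payment target
        q1, q3 = frame[col].quantile(0.25), frame[col].quantile(0.75)
        fence = 1.5 * (q3 - q1)
        n_out = ((frame[col] < q1 - fence) | (frame[col] > q3 + fence)).sum()
        print(f'Number of outlier records in {col} column: {n_out}')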
def coerce_outliers(value):
    #Cap a value at the current feature's Tukey fences
    #(upperlimit/lowerlimit are set by the loop below before each apply)
    if value > upperlimit:
        value = upperlimit
    elif value < lowerlimit:
        value = lowerlimit
    return value

for feature in df_cont.columns[1:]:
    Q3 = df_cont[feature].quantile(q=0.75)
    Q1 = df_cont[feature].quantile(q=0.25)
    IQR = Q3 - Q1
    outlier_range = IQR * 1.5
    upperlimit = Q3 + outlier_range
    lowerlimit = Q1 - outlier_range
    df_cont[feature] = df_cont[feature].apply(coerce_outliers)
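#The same capping can be done in one vectorized call per column, with no helper
#function or shared state:
for feature in df_cont.columns[1:]:
    q1, q3 = df_cont[feature].quantile(0.25), df_cont[feature].quantile(0.75)
    fence = 1.5 * (q3 - q1)
    df_cont[feature] = df_cont[feature].clip(lower=q1 - fence, upper=q3 + fence)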
#Keep only the engineered ratio; drop its two components
df_cont.drop(['total_income', 'amount_of_credit'], axis=1, inplace=True)
#Most features have only a few categories, but one-hot encoding the high-cardinality ones (organization in particular) would add a lot of columns
for col in df_cat:
    print(f'Number of unique values for {col}: {df_cat[col].nunique()}')
Number of unique values for loan_type: 2
Number of unique values for gender: 3
Number of unique values for have_a_car: 2
Number of unique values for have_a_house: 2
Number of unique values for suite_type: 7
Number of unique values for income_type: 8
Number of unique values for education_type: 5
Number of unique values for family_status: 6
Number of unique values for organization: 58
Number of unique values for have_a_phone: 2
Number of unique values for have_a_mail: 2
Number of unique values for number_of_children: 15
Number of unique values for payment: 2
Number of unique values for number_of_family_members: 17
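#To see the cost concretely: one-hot encoding organization alone would replace
#one column with 58 binary columns.
pd.get_dummies(df_cat['organization']).shape[1]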
features = [col for col in df_cat.columns if col != 'payment']  #test every feature against the target
for col in features:
    expected, observed, stats = chi2_independence(data=df, x=col, y='payment')
    print(f'--- {col} ---')
    print(stats)
--- loan_type ---
                 test    lambda        chi2  dof          pval    cramer  power
0             pearson  1.000000  235.189733  1.0  4.401512e-53  0.028366    1.0
1        cressie-read  0.666667  242.216794  1.0  1.292274e-54  0.028787    1.0
2      log-likelihood  0.000000  257.532030  1.0  5.922218e-58  0.029683    1.0
3       freeman-tukey -0.500000  270.241632  1.0  1.005187e-60  0.030406    1.0
4  mod-log-likelihood -1.000000  284.127645  1.0  9.465289e-64  0.031178    1.0
5              neyman -2.000000  315.967608  1.0  1.094613e-70  0.032878    1.0
--- gender ---
                 test    lambda        chi2  dof           pval    cramer  power
0             pearson  1.000000  786.287237  2.0  1.819265e-171  0.051865    1.0
1        cressie-read  0.666667  778.735112  2.0  7.939972e-170  0.051616    1.0
2      log-likelihood  0.000000  765.191590  2.0  6.930386e-167  0.051165    1.0
3       freeman-tukey -0.500000         NaN  2.0            NaN       NaN    NaN
4  mod-log-likelihood -1.000000         inf  2.0   0.000000e+00       inf    1.0
5              neyman -2.000000         NaN  2.0            NaN       NaN    NaN
--- have_a_car ---
                 test    lambda        chi2  dof          pval    cramer  power
0             pearson  1.000000  210.585358  1.0  1.023666e-47  0.026841    1.0
1        cressie-read  0.666667  211.865985  1.0  5.379838e-48  0.026923    1.0
2      log-likelihood  0.000000  214.556338  1.0  1.392680e-48  0.027093    1.0
3       freeman-tukey -0.500000  216.690747  1.0  4.766948e-49  0.027227    1.0
4  mod-log-likelihood -1.000000  218.928809  1.0  1.548960e-49  0.027368    1.0
5              neyman -2.000000  223.729975  1.0  1.389348e-50  0.027666    1.0
--- have_a_house ---
                 test    lambda      chi2  dof      pval    cramer     power
0             pearson  1.000000  4.377144  1.0  0.036424  0.003870  0.552612
1        cressie-read  0.666667  4.372577  1.0  0.036522  0.003868  0.552180
2      log-likelihood  0.000000  4.363503  1.0  0.036717  0.003864  0.551322
3       freeman-tukey -0.500000  4.356749  1.0  0.036863  0.003861  0.550682
4  mod-log-likelihood -1.000000  4.350040  1.0  0.037008  0.003858  0.550046
5              neyman -2.000000  4.336753  1.0  0.037298  0.003852  0.548784
--- suite_type ---
                 test    lambda       chi2  dof          pval    cramer     power
0             pearson  1.000000  50.690368  6.0  3.417623e-09  0.013169  0.999976
1        cressie-read  0.666667  50.973125  6.0  2.998930e-09  0.013206  0.999978
2      log-likelihood  0.000000  51.562847  6.0  2.283034e-09  0.013282  0.999981
3       freeman-tukey -0.500000  52.026590  6.0  1.841999e-09  0.013341  0.999984
4  mod-log-likelihood -1.000000  52.509005  6.0  1.473144e-09  0.013403  0.999986
5              neyman -2.000000  53.531004  6.0  9.171370e-10  0.013533  0.999989
--- income_type ---
                 test    lambda         chi2  dof           pval    cramer  power
0             pearson  1.000000   991.690884  7.0  7.516938e-210  0.058247    1.0
1        cressie-read  0.666667   996.811307  7.0  5.884853e-211  0.058397    1.0
2      log-likelihood  0.000000  1014.812192  7.0  7.590778e-215  0.058922    1.0
3       freeman-tukey -0.500000          NaN  7.0            NaN       NaN    NaN
4  mod-log-likelihood -1.000000          inf  7.0   0.000000e+00       inf    1.0
5              neyman -2.000000          NaN  7.0            NaN       NaN    NaN
--- education_type ---
                 test    lambda         chi2  dof           pval    cramer  power
0             pearson  1.000000  1161.602728  4.0  3.356968e-250  0.063040    1.0
1        cressie-read  0.666667  1192.893614  4.0  5.530219e-257  0.063883    1.0
2      log-likelihood  0.000000  1264.240284  4.0  1.884502e-272  0.065766    1.0
3       freeman-tukey -0.500000  1326.695402  4.0  5.421873e-286  0.067371    1.0
4  mod-log-likelihood -1.000000  1398.396201  4.0  1.539374e-301  0.069168    1.0
5              neyman -2.000000  1578.157880  4.0   0.000000e+00  0.073479    1.0
--- family_status ---
                 test    lambda        chi2  dof          pval    cramer  power
0             pearson  1.000000  406.821010  5.0  1.004761e-85  0.037307    1.0
1        cressie-read  0.666667  404.023602  5.0  4.027533e-85  0.037178    1.0
2      log-likelihood  0.000000  399.249317  5.0  4.305828e-84  0.036958    1.0
3       freeman-tukey -0.500000         NaN  5.0           NaN       NaN    NaN
4  mod-log-likelihood -1.000000         inf  5.0  0.000000e+00       inf    1.0
5              neyman -2.000000         NaN  5.0           NaN       NaN    NaN
--- organization ---
                 test    lambda         chi2   dof           pval    cramer  power
0             pearson  1.000000  1366.792176  57.0  2.615352e-248  0.068381    1.0
1        cressie-read  0.666667  1370.241323  57.0  4.995886e-249  0.068468    1.0
2      log-likelihood  0.000000  1385.406172  57.0  3.442396e-252  0.068846    1.0
3       freeman-tukey -0.500000  1404.215908  57.0  4.103932e-256  0.069311    1.0
4  mod-log-likelihood -1.000000  1429.760260  57.0  1.911199e-261  0.069939    1.0
5              neyman -2.000000  1503.247531  57.0  8.346452e-277  0.071714    1.0
--- have_a_phone ---
                 test    lambda        chi2  dof          pval    cramer  power
0             pearson  1.000000  350.633308  1.0  3.084808e-78  0.034635    1.0
1        cressie-read  0.666667  358.836254  1.0  5.046467e-80  0.035038    1.0
2      log-likelihood  0.000000  376.548330  1.0  7.021872e-84  0.035892    1.0
3       freeman-tukey -0.500000  391.081122  1.0  4.814115e-87  0.036578    1.0
4  mod-log-likelihood -1.000000  406.795631  1.0  1.826593e-90  0.037306    1.0
5              neyman -2.000000  442.228612  1.0  3.543375e-98  0.038896    1.0
--- have_a_mail ---
                 test    lambda      chi2  dof      pval    cramer     power
0             pearson  1.000000  6.880529  1.0  0.008714  0.004852  0.746373
1        cressie-read  0.666667  6.922771  1.0  0.008510  0.004867  0.748940
2      log-likelihood  0.000000  7.008665  1.0  0.008112  0.004897  0.754094
3       freeman-tukey -0.500000  7.074342  1.0  0.007820  0.004920  0.757976
4  mod-log-likelihood -1.000000  7.141120  1.0  0.007534  0.004943  0.761870
5              neyman -2.000000  7.278067  1.0  0.006980  0.004990  0.769691
--- number_of_children ---
                 test    lambda        chi2   dof          pval    cramer  power
0             pearson  1.000000  120.652171  14.0  4.688791e-19  0.020317    1.0
1        cressie-read  0.666667  108.663314  14.0  1.015785e-16  0.019281    1.0
2      log-likelihood  0.000000   98.304199  14.0  1.001487e-14  0.018339    1.0
3       freeman-tukey -0.500000         NaN  14.0           NaN       NaN    NaN
4  mod-log-likelihood -1.000000         inf  14.0  0.000000e+00       inf    1.0
5              neyman -2.000000         NaN  14.0           NaN       NaN    NaN
--- number_of_family_members ---
                 test    lambda        chi2   dof          pval    cramer  power
0             pearson  1.000000  119.124696  16.0  8.091246e-18  0.020188    1.0
1        cressie-read  0.666667  109.837967  16.0  4.814677e-16  0.019385    1.0
2      log-likelihood  0.000000  101.272930  16.0  1.998822e-14  0.018614    1.0
3       freeman-tukey -0.500000         NaN  16.0           NaN       NaN    NaN
4  mod-log-likelihood -1.000000         inf  16.0  0.000000e+00       inf    1.0
5              neyman -2.000000         NaN  16.0           NaN       NaN    NaN
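#A compact way to pull just the Pearson rows into a single ranked table
#(illustrative loop, not part of the original notebook):
pearson_pvals = {}
for col in features:
    _, _, stats = chi2_independence(data=df, x=col, y='payment')
    pearson_pvals[col] = stats.loc[stats['test'] == 'pearson', 'pval'].iloc[0]
pd.Series(pearson_pvals, name='pearson_pval').sort_values()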
df_cat.drop(['loan_type', 'gender', 'have_a_car', 'have_a_phone',
'suite_type', 'education_type', 'family_status', 'income_type',
'organization', 'number_of_children', 'number_of_family_members'], axis=1, inplace=True)
#Flip the payment label (0 -> 1, 1 -> 0). Replacing on the whole frame would
#also corrupt the other 0/1 flag columns, so target the column directly.
df_cat['payment'] = df_cat['payment'].replace({0: 1, 1: 0})
df_cat['have_a_house'] = df_cat['have_a_house'].replace({'Y': 1, 'N': 0})
df = pd.concat([df_cont.iloc[:,1:], df_cat], axis=1)
df.reset_index(drop=True, inplace=True)
df.tail()
df.to_csv('Risk--Analytics.csv', index=False)