Predicting Credit Risk - German Credit Dataset

import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.preprocessing import StandardScaler from sklearn.decomposition import PCA from sklearn.pipeline import Pipeline from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV from sklearn.ensemble import RandomForestClassifier from sklearn.linear_model import LinearRegression from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import accuracy_score, r2_score, make_scorer, mean_absolute_error, roc_curve, auc

# Load the German credit data and take a first look at it.
credit_df = pd.read_csv('german_credit_data.csv')
credit_df.head()

# Column dtypes and non-null counts.
credit_df.info()

# Boolean mask marking the missing entries of the two account columns.
credit_df.loc[:, ['Saving accounts', 'Checking account']].isna()

Purpose of the credits

# Number of credits per purpose (horizontal bars, one colour per purpose).
plt.figure(figsize=(8, 8))
plt.style.use('ggplot')
sns.histplot(data=credit_df, y='Purpose', hue='Purpose', palette='rocket')
plt.show()

# Spread of the credit amount within each purpose.
plt.figure(figsize=(8, 8))
plt.style.use('ggplot')
sns.boxplot(data=credit_df, x='Credit amount', y='Purpose', palette='rocket')
plt.show()

Credit amount and duration

# Credit amount against duration, drawn twice: first coloured by the amount
# itself, then by the risk label.
for hue_column in ('Credit amount', 'Risk'):
    plt.figure(figsize=(8, 8))
    plt.style.use('ggplot')
    sns.scatterplot(
        data=credit_df,
        x='Duration',
        y='Credit amount',
        hue=hue_column,
        palette='rocket',
        marker='o',
    )
    plt.show()

Wealth of the credit borrowers

# Share of borrowers per savings / checking account category.
plt.style.use('ggplot')
pie_colors = sns.color_palette('BuPu')

savings_counts = credit_df['Saving accounts'].value_counts()
checking_counts = credit_df['Checking account'].value_counts()

# Take the labels from the counts themselves: value_counts() orders its
# result by frequency, so a hand-written label list can silently end up
# attached to the wrong wedge.
savings_labels = savings_counts.index.tolist()
checkings_labels = checking_counts.index.tolist()

# One explicitly sized figure per pie (no empty helper figures).
plt.figure(figsize=(8, 8))
plt.pie(
    savings_counts,
    labels=savings_labels,
    colors=pie_colors,
    autopct='%.0f%%'
)

plt.figure(figsize=(8, 8))
plt.pie(
    checking_counts,
    labels=checkings_labels,
    colors=pie_colors,
    autopct='%.0f%%'
)
plt.show()

Age and gender distribution

# Age distribution overall and split by gender.
age = credit_df['Age'].values.tolist()
# Split on the 'Sex' column; filtering on 'Risk' here would plot the ages of
# good/bad risks under the "male"/"female" titles.
male_age = credit_df.loc[credit_df['Sex'] == 'male', 'Age'].values.tolist()
female_age = credit_df.loc[credit_df['Sex'] == 'female', 'Age'].values.tolist()

plt.style.use('ggplot')
# plt.subplots creates the figure itself, so no separate plt.figure() call
# is needed (it would only add an empty figure).
fig, ax = plt.subplots(1, 3, sharex=False, figsize=(16, 8))

panels = [
    ('Age total', age, 'dimgrey'),
    ('Age male', male_age, 'steelblue'),
    ('Age female', female_age, 'orangered'),
]
for axis, (title, ages, color) in zip(ax, panels):
    plot = sns.histplot(ages, color=color, kde=True, ax=axis)
    plot.set_title(title)
plt.show()

# Number of borrowers per gender.
credit_df['Sex'].value_counts()

# Sub-frames for every gender / risk combination.
is_male = credit_df['Sex'] == 'male'
is_female = credit_df['Sex'] == 'female'
is_bad = credit_df['Risk'] == 'bad'
is_good = credit_df['Risk'] == 'good'

male_bad = credit_df[is_male & is_bad]
female_bad = credit_df[is_female & is_bad]
male_good = credit_df[is_male & is_good]
female_good = credit_df[is_female & is_good]

# Each row holds the good and bad counts of ONE gender; the female
# 'Good Risk' entry must therefore come from female_good.
gender_risk = pd.DataFrame({
    'Gender': ['Male', 'Female'],
    'Good Risk': [len(male_good), len(female_good)],
    'Bad Risk': [len(male_bad), len(female_bad)],
})

plt.style.use('ggplot')
# DataFrame.plot opens its own figure, so the size is passed to it directly
# instead of to a separate plt.figure() that would stay empty.
gender_risk.set_index('Gender').plot(
    kind='bar',
    stacked=True,
    color=['powderblue', 'lightcoral'],
    figsize=(10, 10),
)
plt.show()

# Remove the two account columns from the frame.
credit_df = credit_df.drop(columns=['Saving accounts', 'Checking account'])

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each category.
encoded_df = pd.get_dummies(data=credit_df, drop_first=True)

# Pairwise correlations between all encoded columns, drawn as a heatmap.
correlation = encoded_df.corr()
plt.figure(figsize=(8, 8))
plt.style.use('ggplot')
sns.heatmap(correlation, cmap='rocket')
# Show the figure explicitly, like every other plot in this file; otherwise
# it stays the current figure and later plt.plot calls draw onto it.
plt.show()

# First rows of the encoded frame.
encoded_df.head()

# Column dtypes / non-null counts, then summary statistics.
encoded_df.info()
encoded_df.describe()

# First rows once more.
encoded_df.head()

# Columns of the encoded frame that serve as model inputs.
feature_cols = [
    'Age',
    'Job',
    'Credit amount',
    'Duration',
    'Sex_male',
    'Housing_own',
    'Purpose_car',
    'Purpose_radio/TV',
    'Purpose_education',
    'Purpose_vacation/others',
]

# Feature matrix and target column.
X = encoded_df[feature_cols]
y = encoded_df.loc[:, 'Risk_good']

# Hold out 20% of the rows for testing.
split = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = split

# Baseline: ordinary least squares fitted directly on the training split.
# NOTE(review): the target is the 'Risk_good' dummy column, so a classifier
# may be the more appropriate baseline here - confirm the intent.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict the held-out rows and report the coefficient of determination.
pred = lr.predict(X_test)
score = r2_score(y_true=y_test, y_pred=pred)
print(score)

# Hyper-parameter search space for the random forest.
param_dist = {
    'n_estimators': [50, 150, 200, 250],
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 8, 12, 16],
    'min_samples_split': [2, 4, 8, 10],
    'min_samples_leaf': [2, 5, 8, 10],
}

# make_scorer treats a metric as "greater is better" unless told otherwise,
# so an error metric must be flagged as a loss; used unflagged, the search
# would select the candidate with the LARGEST error. The scorer is kept
# under its original name but is not used by the search below.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Instantiate the Random Forest Classifier
rfc = RandomForestClassifier()

# Randomized search over 20 sampled parameter sets with 5-fold CV. It is
# scored on accuracy, which is what the report below prints.
rs = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
)

# Fit the search on the training data
rs.fit(X_train, y_train)

# Mean cross-validated score of every sampled candidate
print('The accuracy for each run was: {}.'.format(rs.cv_results_['mean_test_score']))
# Score of the best candidate
print('The best accuracy for a single model was: {}'.format(rs.best_score_))
# Parameter set of the best candidate
print('The best parameters for a single model was: {}'.format(rs.best_params_))

# Random forest with a fixed set of hyper-parameters, trained on the
# training split.
RandomForest = RandomForestClassifier(
    criterion='gini',
    max_depth=12,
    min_samples_leaf=2,
    min_samples_split=4,
    n_estimators=200,
)
RandomForest.fit(X_train, y_train)

# Predict the held-out rows and print the share of correct predictions.
pred = RandomForest.predict(X_test)
print(accuracy_score(y_test, pred))

# Second column of predict_proba: one probability per test row.
class_probabilities = RandomForest.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]

# False/true positive rates over the decision thresholds.
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Dashed diagonal as the reference line, then the model's ROC curve.
diagonal = [0, 1]
plt.plot(diagonal, diagonal, 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate', color="r")
plt.ylabel('True Positive Rate', color="g")
plt.title('ROC Curve')
plt.show()

Purpose of the credits

Credit amount and duration

Wealth of the credit borrowers

Age and gender distribution

Purpose of the credits