import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, r2_score, make_scorer, mean_absolute_error, roc_curve, auc
# Load the German credit dataset from the CSV file in the working directory.
credit_df = pd.read_csv(filepath_or_buffer='german_credit_data.csv')

# Notebook-style inspection. The bare expressions only render something in an
# interactive session; info() prints its summary in any context.
credit_df.head(n=5)
credit_df.info()

# Boolean mask marking the missing entries of the two account columns.
credit_df.loc[:, ['Saving accounts', 'Checking account']].isna()
# Purpose of the credits
# Number of credits per purpose, drawn as horizontal bars with one palette
# colour per purpose category.
plt.figure(figsize=(8, 8))
plt.style.use('ggplot')
sns.histplot(data=credit_df, y='Purpose', hue='Purpose', palette='rocket')
plt.show()
# Spread of the credit amount within each purpose: one horizontal box per
# purpose category.
plt.figure(figsize=(8, 8))
plt.style.use('ggplot')
sns.boxplot(data=credit_df, x='Credit amount', y='Purpose', palette='rocket')
plt.show()
# Credit amount and duration
# Credit amount against duration; the point colour encodes the amount itself.
plt.figure(figsize=(8, 8))
plt.style.use('ggplot')
sns.scatterplot(
    data=credit_df,
    x='Duration',
    y='Credit amount',
    hue='Credit amount',
    marker='o',
    palette='rocket',
)
plt.show()
# Same amount-vs-duration scatter, this time coloured by the 'Risk' column.
plt.figure(figsize=(8, 8))
plt.style.use('ggplot')
sns.scatterplot(
    data=credit_df,
    x='Duration',
    y='Credit amount',
    hue='Risk',
    marker='o',
    palette='rocket',
)
plt.show()
# Wealth of the credit borrowers
# Share of borrowers per savings-account and checking-account bracket,
# drawn as two pie charts side by side.
plt.style.use('ggplot')
pie_colors = sns.color_palette('BuPu')

# value_counts() drops missing entries and orders the brackets by descending
# frequency. The wedge labels are therefore taken from the index of the counts
# instead of being hard-coded, so a label can never end up on the wrong wedge.
savings_counts = credit_df['Saving accounts'].value_counts()
checkings_counts = credit_df['Checking account'].value_counts()
savings_labels = savings_counts.index.tolist()
checkings_labels = checkings_counts.index.tolist()

# One figure with two axes: nothing is shown before the pies are drawn and no
# empty helper figure is left behind.
wealth_fig, wealth_axes = plt.subplots(1, 2, figsize=(16, 8))
wealth_axes[0].pie(
    savings_counts,
    labels=savings_labels,
    colors=pie_colors,
    autopct='%.0f%%',
)
wealth_axes[0].set_title('Saving accounts')
wealth_axes[1].pie(
    checkings_counts,
    labels=checkings_labels,
    colors=pie_colors,
    autopct='%.0f%%',
)
wealth_axes[1].set_title('Checking account')
plt.show()
# Age and gender distribution
# Age distribution of all borrowers and of each gender separately.
age = credit_df['Age'].values.tolist()
# The gender split has to filter on 'Sex'; filtering on 'Risk' would plot the
# ages of good/bad risks under the male/female titles.
male_age = credit_df.loc[credit_df['Sex'] == 'male', 'Age'].values.tolist()
female_age = credit_df.loc[credit_df['Sex'] == 'female', 'Age'].values.tolist()

plt.style.use('ggplot')
# plt.subplots() creates the only figure that is needed; a preceding
# plt.clf()/plt.figure() pair would just add empty windows to plt.show().
fig, ax = plt.subplots(1, 3, sharex=False, figsize=(16, 8))

# (values, bar colour, panel title) for the three histograms, left to right.
age_panels = [
    (age, 'dimgrey', 'Age total'),
    (male_age, 'steelblue', 'Age male'),
    (female_age, 'orangered', 'Age female'),
]
for panel_ax, (panel_ages, panel_color, panel_title) in zip(ax, age_panels):
    plot = sns.histplot(panel_ages, color=panel_color, kde=True, ax=panel_ax)
    plot.set_title(panel_title)
plt.show()
# Number of borrowers per gender (only rendered in an interactive session).
credit_df['Sex'].value_counts()

# The four gender x risk subsets of the data.
male_bad = credit_df[(credit_df['Sex'] == 'male') & (credit_df['Risk'] == 'bad')]
female_bad = credit_df[(credit_df['Sex'] == 'female') & (credit_df['Risk'] == 'bad')]
male_good = credit_df[(credit_df['Sex'] == 'male') & (credit_df['Risk'] == 'good')]
female_good = credit_df[(credit_df['Sex'] == 'female') & (credit_df['Risk'] == 'good')]

# Row counts per gender and risk class. The female "Good Risk" entry must come
# from female_good, not female_bad.
gender_risk = pd.DataFrame(
    {
        'Gender': ['Male', 'Female'],
        'Good Risk': [len(male_good), len(female_good)],
        'Bad Risk': [len(male_bad), len(female_bad)],
    }
)

plt.style.use('ggplot')
# DataFrame.plot opens its own figure, so the size is passed to it directly;
# a separate plt.figure(figsize=...) would only produce an empty figure.
gender_risk.set_index('Gender').plot(
    kind='bar',
    stacked=True,
    color=['powderblue', 'lightcoral'],
    figsize=(10, 10),
)
plt.show()
# Remove the two account columns before encoding.
# NOTE(review): presumably dropped because they contain missing values (they
# are the columns checked with isna() right after loading) - confirm.
credit_df = credit_df.drop(columns=['Saving accounts', 'Checking account'])

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each category.
encoded_df = pd.get_dummies(credit_df, drop_first=True)

# Pairwise correlation of the encoded columns, drawn as a heatmap.
correlation = encoded_df.corr()
plt.figure(figsize=(8, 8))
plt.style.use('ggplot')
sns.heatmap(correlation, cmap='rocket')
# Show the heatmap here. Otherwise, when run as a script, it stays the current
# figure and the next plt.plot() call draws on top of it.
plt.show()

# Notebook-style inspection of the encoded frame (head()/describe() only
# render in an interactive session; info() always prints).
encoded_df.head()
encoded_df.info()
encoded_df.describe()
# Columns of the encoded frame that are used as model inputs
feature_cols = [
    'Age', 'Job', 'Credit amount', 'Duration', 'Sex_male',
    'Housing_own', 'Purpose_car', 'Purpose_radio/TV', 'Purpose_education',
    'Purpose_vacation/others',
]
# Feature matrix and target ('Risk_good' is the dummy column created for Risk)
X = encoded_df.loc[:, feature_cols]
y = encoded_df['Risk_good']
# Hold out 20% of the rows for testing. The fixed seed makes every score below
# reproducible between runs, and stratifying keeps the good/bad ratio the same
# in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Baseline: ordinary least-squares regression fitted directly on the target
# column (fit() returns the estimator itself).
lr = LinearRegression().fit(X_train, y_train)
# Continuous predictions for the held-out rows
pred = lr.predict(X_test)
# R^2 of those predictions against the true test labels
score = r2_score(y_true=y_test, y_pred=pred)
print(score)
# Candidate values the randomized search samples from
param_dist = {
    'n_estimators': [50, 150, 200, 250],
    'criterion': ['gini', 'entropy'],
    'max_depth': [2, 8, 12, 16],
    'min_samples_split': [2, 4, 8, 10],
    'min_samples_leaf': [2, 5, 8, 10],
}
# Error-based scorer. greater_is_better=False is required for an error metric:
# with the default (True) a search would maximise the error and select the
# worst candidate. It is not used for the search below.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
# Instantiate the Random Forest Classifier
rfc = RandomForestClassifier()
# Randomized search: 20 sampled candidates, 5-fold cross-validation, ranked by
# accuracy so that the reported numbers are what the messages below say.
rs = RandomizedSearchCV(
    estimator=rfc,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
)
# Run the search on the training data
rs.fit(X_train, y_train)
# Mean cross-validated accuracy of every sampled candidate
print('The accuracy for each run was: {}.'.format(rs.cv_results_['mean_test_score']))
# Best mean cross-validated accuracy
print('The best accuracy for a single model was: {}'.format(rs.best_score_))
# Parameter combination that achieved it
print('The best parameters for a single model was: {}'.format(rs.best_params_))
# Final classifier with fixed hyper-parameters.
# NOTE(review): the values are hard-coded rather than read from the search
# result - presumably copied from an earlier search run; confirm.
RandomForest = RandomForestClassifier(
    criterion='gini',
    max_depth=12,
    min_samples_leaf=2,
    min_samples_split=4,
    n_estimators=200,
)
RandomForest.fit(X_train, y_train)
# Class predictions for the held-out rows
pred = RandomForest.predict(X_test)
# Fraction of held-out rows that were classified correctly
print(accuracy_score(y_test, pred))
# Predicted probability of the second class in RandomForest.classes_
# (presumably Risk_good == True/1 - confirm against the encoded target).
y_pred_prob = RandomForest.predict_proba(X_test)[:, 1]
# Generate ROC curve values and the area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
roc_auc = auc(fpr, tpr)
# Start a fresh figure so the curve is never drawn onto a figure left open by
# an earlier plotting step.
plt.figure()
# Diagonal = performance of a random classifier
plt.plot([0, 1], [0, 1], 'k--', label='Chance')
plt.plot(fpr, tpr, label='Random forest (AUC = {:.2f})'.format(roc_auc))
plt.xlabel('False Positive Rate', color="r")
plt.ylabel('True Positive Rate', color="g")
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()