import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from snsynth.mwem import MWEMSynthesizer
from themis_ml.datasets import german_credit
credit = german_credit(raw=True)
credit.head(10)
credit.info()
credit['credit_history'].unique()
credit['housing'].unique()
credit['status_of_existing_checking_account'].unique()
credit['purpose'].unique()
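# Optional exploratory plots (illustrative sketch using the seaborn/matplotlib imports above):
# target class balance and the raw credit_amount distribution, which motivates the binning below.
sns.countplot(x='credit_risk', data=credit)
plt.title('Class balance of credit_risk')
plt.show()
sns.histplot(credit['credit_amount'], bins=30)
plt.title('Distribution of credit_amount')
plt.show()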
def preprocessGermanCredit(temp_df):
    df = temp_df.copy()
    df = df[['credit_risk', 'duration_in_month', 'credit_amount',
             'installment_rate_in_percentage_of_disposable_income', 'age_in_years',
             'credit_history', 'housing', 'status_of_existing_checking_account',
             'foreign_worker', 'personal_status_and_sex', 'present_employment_since',
             'purpose']]

    # Coarsen credit_amount into 500-unit bins so its cardinality stays manageable for MWEM
    df['credit_amount'] /= 500
    df['credit_amount'] = df['credit_amount'].astype(int)
    # Map categorical string values to integer codes
    housing = {2: 'for free', 1: 'own',  0: 'rent'}
    credit_history = {0: 'no_credits_taken/all_credits_paid_back_duly', 
                        1: 'all_credits_at_this_bank_paid_back_duly',
                        2: 'existing_credits_paid_back_duly_till_now', 
                        3: 'delay_in_paying_off_in_the_past',
                        4: 'critical_account/other_credits_existing_not_at_this_bank',
                    }
    sex = {'male_married/widowed': 1,'male_single': 1,'male_divorced/separated': 1,
           'female_divorced/separated/married': 0,
        }
    df["credit_history"] = df["credit_history"].map({v: k for k, v in credit_history.items()})
    df["housing"] = df["housing"].map({v: k for k,v in housing.items()})
    df["personal_status_and_sex"] = df["personal_status_and_sex"].map({v: k for v,k in sex.items()})
    df['purpose'] = pd.factorize(df['purpose'])[0]
    return df
df = preprocessGermanCredit(credit)
df.shape
df.info()
df
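# Sanity checks (illustrative): the MWEM pipeline below consumes df via .to_numpy(), so all
# retained columns are assumed to be numerically encoded, and .map() silently turns any
# unmapped category into NaN. These two lines make that assumption easy to verify.
df.dtypes
df.isna().sum()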
def logisticRegression(x_train, y_train, x_test):
    # Fit a logistic regression classifier (LogisticRegression is already imported at the top)
    logisticRegr = LogisticRegression(solver='lbfgs', max_iter=500)
    logisticRegr.fit(x_train, y_train)
    train_predictions = logisticRegr.predict(x_train)
    test_predictions = logisticRegr.predict(x_test)
    return train_predictions, test_predictions
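# Quick self-contained check of the helper above (illustrative sketch): synthetic data from
# sklearn's make_classification, plus classification_report (imported at the top) to
# summarize the returned test predictions. Names prefixed with _demo are placeholders.
from sklearn.datasets import make_classification
_demo_X, _demo_y = make_classification(n_samples=200, n_features=5, random_state=0)
_demo_Xtr, _demo_Xte, _demo_ytr, _demo_yte = train_test_split(_demo_X, _demo_y, test_size=0.2, random_state=0)
_, _demo_yte_pred = logisticRegression(_demo_Xtr, _demo_ytr, _demo_Xte)
print(classification_report(_demo_yte, _demo_yte_pred, labels=[0, 1]))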
def binaryClassification(var_name, df):
    # Split processed data into train and test (fixed random seed)
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=0)
    # Split test data into X and y
    x_test, y_test = test_data.drop(["credit_risk",var_name], axis=1), test_data["credit_risk"]
    train_results = []
    test_results = []
    epsilons = list(range(1, 6))  # synthesis gets noticeably slow for epsilon values >= 7
    niteration = 1  # number of synthetic datasets generated per epsilon (raise to average over runs)
    # Loop through range of epsilon values
    for epsilon in epsilons:
        # Six accumulators per group: FPR, FNR, TPR, TNR, precision, accuracy
        train_overall, train_unprotected, train_protected = [0]*6, [0]*6, [0]*6
        test_overall, test_unprotected, test_protected = [0]*6, [0]*6, [0]*6
        print("epsilon:", epsilon)
        # Generate synthetic data many times (to average results)
        for i in range(niteration):
            print("i: ",i)
            # Configure the MWEM synthesizer with this iteration's privacy budget
            synth = MWEMSynthesizer(epsilon=epsilon, q_count=500, iterations=30, mult_weights_iterations=15,
                                    splits=[], split_factor=2, max_bin_count=400)
            # Learn a differentially private approximation of the real training data's distribution
            synth.fit(train_data.to_numpy())
            sample_size = train_data.shape[0]
            train_synth = pd.DataFrame(synth.sample(sample_size), columns=train_data.columns) 
            # Split synthetic training data into X and y
            x_train, y_train = train_synth.drop(["credit_risk",var_name], axis=1), train_synth["credit_risk"]
            # Run the binary classification pipeline
            y_train_predict, y_test_predict = logisticRegression(x_train, y_train, x_test)
            # Accumulate the rates from this iteration i (averaged per epsilon below)
            curr_train_overall, curr_train_unprotected, curr_train_protected = predictionSummary(x_train, y_train, y_train_predict, train_synth, var_name)
            curr_test_overall, curr_test_unprotected, curr_test_protected = predictionSummary(x_test, y_test, y_test_predict, test_data, var_name)

            train_overall = [rate1+rate2 for rate1,rate2 in zip(train_overall,curr_train_overall)]
            train_protected = [rate1+rate2 for rate1,rate2 in zip(train_protected,curr_train_protected)]
            train_unprotected = [rate1+rate2 for rate1,rate2 in zip(train_unprotected,curr_train_unprotected)]
            test_overall = [rate1+rate2 for rate1,rate2 in zip(test_overall,curr_test_overall)]
            test_protected = [rate1+rate2 for rate1,rate2 in zip(test_protected,curr_test_protected)]
            test_unprotected = [rate1+rate2 for rate1,rate2 in zip(test_unprotected,curr_test_unprotected)]
        # Average over the niteration runs and store one result set per epsilon
        train_overall = [rate/niteration for rate in train_overall]
        train_unprotected = [rate/niteration for rate in train_unprotected]
        train_protected = [rate/niteration for rate in train_protected]
        train_results.append([train_overall,train_unprotected,train_protected])
        test_overall = [rate/niteration for rate in test_overall]
        test_unprotected = [rate/niteration for rate in test_unprotected]
        test_protected = [rate/niteration for rate in test_protected]
        test_results.append([test_overall,test_unprotected,test_protected])
    return test_results, train_results
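# For context (illustrative sketch, not part of the DP pipeline above): the same split,
# classifier, and summary applied directly to the real training data gives a non-private
# baseline to compare the synthetic-data results against. nonPrivateBaseline is a hypothetical helper.
def nonPrivateBaseline(var_name, df):
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=0)
    x_train, y_train = train_data.drop(["credit_risk", var_name], axis=1), train_data["credit_risk"]
    x_test, y_test = test_data.drop(["credit_risk", var_name], axis=1), test_data["credit_risk"]
    _, y_test_predict = logisticRegression(x_train, y_train, x_test)
    return predictionSummary(x_test, y_test, y_test_predict, test_data, var_name)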
def predictionSummary(temp_x, y_real, y_predict, df, var_name):
    tn, fp, fn, tp = confusion_matrix(y_real, y_predict, labels=[0,1]).ravel()
    # Rates in order: FPR, FNR, TPR, TNR, precision, overall accuracy
    overall = [fp/(fp+tn), fn/(fn+tp), tp/(tp+fn), tn/(tn+fp), tp/(tp+fp), (tp+tn)/(tp+fp+fn+tn)]
    
    x = temp_x.copy()
    x['Predicted Creditability'] = y_predict
    x['Real Creditability'] = y_real
    if var_name == 'age_in_years':
        x_protected = x[df['age_in_years'] >= 25].copy()
    else:
        x_protected = x[df[var_name] == 1].copy()
    tn, fp, fn, tp  = confusion_matrix(x_protected['Real Creditability'], x_protected['Predicted Creditability'], labels=[0,1]).ravel()
    protected = [fp/(fp+tn),fn/(fn+tp),tp/(tp+fn),tn/(tn+fp),tp/(tp+fp),(tp+tn)/(tp+fp+fn+tn)]
    if var_name == 'age_in_years':
        x_unprotected = x[df['age_in_years'] < 25].copy()
    else:
        x_unprotected = x[df[var_name] == 0].copy()
    tn, fp, fn, tp  = confusion_matrix(x_unprotected['Real Creditability'], x_unprotected['Predicted Creditability'], labels=[0,1]).ravel()
    unprotected = [fp/(fp+tn),fn/(fn+tp),tp/(tp+fn),tn/(tn+fp),tp/(tp+fp),(tp+tn)/(tp+fp+fn+tn)]
    # The difference in TPR (equal opportunity distance) and in FPR (equalized odds distance)
    # can be computed from the returned rate lists; see the sketch after this function.
    # equalized_opportunity = abs(protected[2]-unprotected[2])
    # equalized_odds = abs(protected[0]-unprotected[0])
    return overall, unprotected, protected
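# Illustrative helper (hypothetical name): the fairness gaps sketched in the comments above,
# computed from the rate lists returned by predictionSummary
# ([FPR, FNR, TPR, TNR, precision, accuracy]).
def fairnessDistances(protected, unprotected):
    equalized_opportunity = abs(protected[2] - unprotected[2])  # TPR gap between groups
    equalized_odds_fpr = abs(protected[0] - unprotected[0])     # FPR gap between groups
    return equalized_opportunity, equalized_odds_fpr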
# Protected attribute column names:
# gender: 'personal_status_and_sex'
# nationality: 'foreign_worker'
# age: 'age_in_years'
test_results_sex, train_results_sex = binaryClassification('personal_status_and_sex', df)
test_results_nationality, train_results_nationality = binaryClassification('foreign_worker', df)
test_results_age, train_results_age = binaryClassification('age_in_years', df)
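# Illustrative sketch: plot overall test accuracy against the privacy budget for one protected
# attribute, assuming the runs above completed and binaryClassification returned one
# [overall, unprotected, protected] entry per epsilon (accuracy at index 5).
epsilon_values = list(range(1, 6))  # must match the epsilons used inside binaryClassification
sex_test_accuracy = [entry[0][5] for entry in test_results_sex]
plt.plot(epsilon_values, sex_test_accuracy, marker='o')
plt.xlabel('epsilon')
plt.ylabel('overall test accuracy')
plt.title('Utility of MWEM-synthesized training data vs. privacy budget')
plt.show()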