import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from snsynth.mwem import MWEMSynthesizer
from themis_ml.datasets import german_credit
# Load the German credit dataset from themis_ml, requesting the raw variant.
credit = german_credit(raw=True)
# Preview the first 10 rows.
credit.head(10)
# Print column names, non-null counts and dtypes.
credit.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 status_of_existing_checking_account 1000 non-null int64
1 duration_in_month 1000 non-null int64
2 credit_history 1000 non-null object
3 purpose 1000 non-null object
4 credit_amount 1000 non-null int64
5 savings_account/bonds 1000 non-null int64
6 present_employment_since 1000 non-null int64
7 installment_rate_in_percentage_of_disposable_income 1000 non-null int64
8 personal_status_and_sex 1000 non-null object
9 other_debtors/guarantors 1000 non-null object
10 present_residence_since 1000 non-null int64
11 property 1000 non-null object
12 age_in_years 1000 non-null int64
13 other_installment_plans 1000 non-null object
14 housing 1000 non-null object
15 number_of_existing_credits_at_this_bank 1000 non-null int64
16 job 1000 non-null int64
17 number_of_people_being_liable_to_provide_maintenance_for 1000 non-null int64
18 telephone 1000 non-null int64
19 foreign_worker 1000 non-null int64
20 credit_risk 1000 non-null int64
dtypes: int64(14), object(7)
memory usage: 164.2+ KB
# Inspect the distinct values of several columns that preprocessGermanCredit
# selects, to see which category labels/codes are present in the data.
credit['credit_history'].unique()
credit['housing'].unique()
credit['status_of_existing_checking_account'].unique()
credit['purpose'].unique()
def preprocessGermanCredit(temp_df):
    """Return a reduced, fully numeric copy of the raw German credit frame.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Raw data containing (at least) the twelve columns selected below.
        ``credit_history``, ``housing`` and ``personal_status_and_sex`` are
        expected to hold the string labels listed in the mappings below.

    Returns
    -------
    pandas.DataFrame
        A new frame with the twelve selected columns where

        * ``credit_amount`` is divided by 500 and truncated to an integer,
        * ``credit_history`` and ``housing`` are replaced by integer codes,
        * ``personal_status_and_sex`` is collapsed to 1 (male) / 0 (female),
        * ``purpose`` is replaced by ``pd.factorize`` codes (order of first
          appearance).

        ``temp_df`` itself is never modified.

    Notes
    -----
    Labels that are missing from a mapping become NaN (``Series.map``
    semantics), which also turns the affected column into a float column.
    """
    selected_columns = [
        'credit_risk',
        'duration_in_month',
        'credit_amount',
        'installment_rate_in_percentage_of_disposable_income',
        'age_in_years',
        'credit_history',
        'housing',
        'status_of_existing_checking_account',
        'foreign_worker',
        'personal_status_and_sex',
        'present_employment_since',
        'purpose',
    ]
    # One explicit copy of just the needed columns: the assignments below then
    # operate on an independent frame (no SettingWithCopyWarning, and the
    # caller's data is left untouched).
    df = temp_df[selected_columns].copy()

    # Coarsen the amount into buckets of 500 to keep the number of distinct
    # values small.
    df['credit_amount'] = (df['credit_amount'] / 500).astype(int)

    # Categorical features: label -> integer code.
    housing_codes = {'for free': 2, 'own': 1, 'rent': 0}
    credit_history_codes = {
        'no_credits_taken/all_credits_paid_back_duly': 0,
        'all_credits_at_this_bank_paid_back_duly': 1,
        'existing_credits_paid_back_duly_till_now': 2,
        'delay_in_paying_off_in_the_past': 3,
        'critical_account/other_credits_existing_not_at_this_bank': 4,
    }
    # Marital status is dropped on purpose; only sex is kept (1 = male).
    sex_codes = {
        'male_married/widowed': 1,
        'male_single': 1,
        'male_divorced/separated': 1,
        'female_divorced/separated/married': 0,
    }

    df["credit_history"] = df["credit_history"].map(credit_history_codes)
    df["housing"] = df["housing"].map(housing_codes)
    df["personal_status_and_sex"] = df["personal_status_and_sex"].map(sex_codes)
    df['purpose'] = pd.factorize(df['purpose'])[0]
    return df
# Build the numeric modelling frame from the raw data.
df = preprocessGermanCredit(credit)
# Check the resulting dimensions and per-column dtypes / non-null counts.
df.shape
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 credit_risk 1000 non-null int64
1 duration_in_month 1000 non-null int64
2 credit_amount 1000 non-null int64
3 installment_rate_in_percentage_of_disposable_income 1000 non-null int64
4 age_in_years 1000 non-null int64
5 credit_history 1000 non-null int64
6 housing 1000 non-null int64
7 status_of_existing_checking_account 1000 non-null int64
8 foreign_worker 1000 non-null int64
9 personal_status_and_sex 1000 non-null int64
10 present_employment_since 1000 non-null int64
11 purpose 1000 non-null int64
dtypes: int64(12)
memory usage: 93.9 KB
# Bare expression: displays the preprocessed frame when run as a notebook cell.
df
def logisticRegression(x_train, y_train, x_test):
    """Fit a logistic regression and predict on both training and test features.

    Parameters
    ----------
    x_train, y_train
        Features and labels the classifier is fitted on.
    x_test
        Features of the held-out set.

    Returns
    -------
    tuple
        ``(train_predictions, test_predictions)`` as returned by
        ``LogisticRegression.predict`` for ``x_train`` and ``x_test``.
    """
    from sklearn.linear_model import LogisticRegression

    # fit() returns the estimator itself, so construction and fitting chain.
    classifier = LogisticRegression(solver='lbfgs', max_iter=500).fit(x_train, y_train)
    return classifier.predict(x_train), classifier.predict(x_test)
def _addMetricGroups(running_totals, current):
    """Element-wise add one predictionSummary result onto the running totals.

    ``running_totals`` is ``None`` before the first repetition; the totals
    then take their shape from ``current`` so no metric is ever truncated.
    """
    if running_totals is None:
        return [list(group) for group in current]
    return [
        [total + value for total, value in zip(total_group, current_group)]
        for total_group, current_group in zip(running_totals, current)
    ]


def binaryClassfication(var_name, df, epsilons=None, niteration=1):
    """Evaluate a classifier trained on MWEM-synthesised data for several epsilons.

    The frame is split 80/20 with a fixed seed. For every epsilon a
    MWEMSynthesizer is fitted on the training split, a synthetic training set
    of the same size is sampled, a logistic regression is trained on it
    (without ``credit_risk`` and ``var_name`` as features) and the predictions
    are summarised with ``predictionSummary`` on both the synthetic training
    data and the real test split.

    Parameters
    ----------
    var_name : str
        Column holding the protected attribute; it is removed from the
        features and used by ``predictionSummary`` to form the two groups.
    df : pandas.DataFrame
        Preprocessed data containing ``credit_risk`` and ``var_name``.
    epsilons : iterable of numbers, optional
        Privacy budgets to evaluate. Defaults to ``1..5``.
    niteration : int, optional
        Number of synthetic datasets generated (and averaged) per epsilon.
        Defaults to 1.

    Returns
    -------
    tuple
        ``(test_results, train_results)`` - note the order. Each is a list
        with one entry per epsilon of the form
        ``[overall, unprotected, protected]``, where every element is the list
        of metrics returned by ``predictionSummary`` averaged over the
        repetitions.

    Raises
    ------
    ValueError
        If ``niteration`` is smaller than 1.
    """
    if niteration < 1:
        raise ValueError("niteration must be at least 1")
    if epsilons is None:
        epsilons = range(1, 6)  # get slow after >=7

    # Split processed data into train and test (fixed random seed)
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=0)
    # Split test data into X and y
    x_test = test_data.drop(["credit_risk", var_name], axis=1)
    y_test = test_data["credit_risk"]

    train_results = []
    test_results = []

    # Loop through range of epsilon values
    for epsilon in epsilons:
        # Running sums of [overall, unprotected, protected]. They are sized
        # from the first summary instead of a fixed number of zeros, so every
        # metric (including the trailing overall accuracy) is kept.
        train_totals = None
        test_totals = None
        print(epsilon)

        # Generate synthetic data many times (to average results)
        for i in range(niteration):
            print("i: ", i)
            # Generate synthetic training data
            synth = MWEMSynthesizer(epsilon=epsilon, q_count=500, iterations=30,
                                    mult_weights_iterations=15, splits=[],
                                    split_factor=2, max_bin_count=400)
            # learn the distribution of the real data
            synth.fit(train_data.to_numpy())
            sample_size = train_data.shape[0]
            train_synth = pd.DataFrame(synth.sample(sample_size),
                                       columns=train_data.columns)

            # Split synthetic training data into X and y
            x_train = train_synth.drop(["credit_risk", var_name], axis=1)
            y_train = train_synth["credit_risk"]

            # Run the binary classification pipeline
            y_train_predict, y_test_predict = logisticRegression(x_train, y_train, x_test)

            # Accumulate the summaries of this repetition
            train_totals = _addMetricGroups(
                train_totals,
                predictionSummary(x_train, y_train, y_train_predict, train_synth, var_name),
            )
            test_totals = _addMetricGroups(
                test_totals,
                predictionSummary(x_test, y_test, y_test_predict, test_data, var_name),
            )

        # Average over the repetitions for this epsilon
        train_results.append([[rate / niteration for rate in group] for group in train_totals])
        test_results.append([[rate / niteration for rate in group] for group in test_totals])

    return test_results, train_results
def _confusionRates(y_true, y_pred):
    """Return [FPR, FNR, TPR, TNR, precision, accuracy] for binary labels 0/1."""
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    def ratio(numerator, denominator):
        # A zero denominator means the rate is undefined (e.g. an empty group
        # or no positive predictions); report NaN without a RuntimeWarning.
        return numerator / denominator if denominator else float('nan')

    return [
        ratio(fp, fp + tn),                 # false positive rate
        ratio(fn, fn + tp),                 # false negative rate
        ratio(tp, tp + fn),                 # true positive rate (recall)
        ratio(tn, tn + fp),                 # true negative rate
        ratio(tp, tp + fp),                 # precision
        ratio(tp + tn, tp + fp + fn + tn),  # overall accuracy
    ]


def predictionSummary(temp_x, y_real, y_predict, df, var_name):
    """Summarise predictions overall and for the two protected-attribute groups.

    Parameters
    ----------
    temp_x : pandas.DataFrame
        Feature rows the predictions were made for (not modified).
    y_real
        True labels (0/1) for the rows of ``temp_x``.
    y_predict
        Predicted labels (0/1) for the rows of ``temp_x``.
    df : pandas.DataFrame
        Frame with the same index as ``temp_x`` that still contains the
        ``var_name`` column.
    var_name : str
        Protected attribute. For age (``'age_in_years'``, or the legacy name
        ``'age'``) the protected group is ``>= 25`` and the unprotected group
        is ``< 25``; for any other column the protected group is ``== 1`` and
        the unprotected group is ``== 0``.

    Returns
    -------
    tuple of lists
        ``(overall, unprotected, protected)``; each list holds
        ``[FPR, FNR, TPR, TNR, precision, accuracy]``. A rate whose
        denominator is zero is NaN.
    """
    overall = _confusionRates(y_real, y_predict)

    x = temp_x.copy()
    x['Predicted Creditability'] = y_predict
    x['Real Creditability'] = y_real

    group_values = df[var_name]
    if var_name in ('age', 'age_in_years'):
        # Age is not binary, so the groups are formed with a threshold.
        protected_rows = group_values >= 25
        unprotected_rows = group_values < 25
    else:
        protected_rows = group_values == 1
        unprotected_rows = group_values == 0

    x_protected = x[protected_rows]
    protected = _confusionRates(x_protected['Real Creditability'],
                                x_protected['Predicted Creditability'])

    x_unprotected = x[unprotected_rows]
    unprotected = _confusionRates(x_unprotected['Real Creditability'],
                                  x_unprotected['Predicted Creditability'])

    # Fairness distances can be derived by the caller from these lists, e.g.
    # abs(protected[2] - unprotected[2]) for the TPR gap and
    # abs(protected[0] - unprotected[0]) for the FPR gap.
    return overall, unprotected, protected
# Column names passed as var_name (the protected attribute) in the runs below:
# gender: 'personal_status_and_sex'
# nationality: 'foreign_worker'
# age: 'age_in_years'
# Run the experiment with gender as the protected attribute.
binaryClassfication('personal_status_and_sex', df)
1
i: 0
/work/smartnoise-sdk/synth/snsynth/mwem.py:315: Warning: Bin count 18175 in column: 2 exceeds max_bin_count, defaulting to: 400. Is this a continuous variable?
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
Warning,
2
i: 0
/work/smartnoise-sdk/synth/snsynth/mwem.py:315: Warning: Bin count 18175 in column: 2 exceeds max_bin_count, defaulting to: 400. Is this a continuous variable?
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
Warning,
3
i: 0
/work/smartnoise-sdk/synth/snsynth/mwem.py:315: Warning: Bin count 18175 in column: 2 exceeds max_bin_count, defaulting to: 400. Is this a continuous variable?
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
Warning,
4
i: 0
/work/smartnoise-sdk/synth/snsynth/mwem.py:315: Warning: Bin count 18175 in column: 2 exceeds max_bin_count, defaulting to: 400. Is this a continuous variable?
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
Warning,
5
i: 0
/work/smartnoise-sdk/synth/snsynth/mwem.py:315: Warning: Bin count 18175 in column: 2 exceeds max_bin_count, defaulting to: 400. Is this a continuous variable?
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
Warning,
# Run the experiment with nationality (foreign_worker) as the protected attribute.
binaryClassfication('foreign_worker', df)
1
i: 0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
Warning,
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
2
i: 0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
Warning,
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
3
i: 0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
Warning,
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
4
i: 0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
Warning,
5
i: 0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
Warning,
# Run the experiment with age (age_in_years) as the protected attribute.
binaryClassfication('age_in_years', df)
1
i: 0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
Warning,
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
2
i: 0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
Warning,
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
3
i: 0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
Warning,
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
4
i: 0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
Warning,
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
5
i: 0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
Warning,
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars