# Microsoft Capstone

import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns from sklearn.metrics import confusion_matrix from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.metrics import classification_report from snsynth.mwem import MWEMSynthesizer

# Load the raw German Credit dataset shipped with themis_ml and preview
# the first ten rows.
from themis_ml.datasets import german_credit

credit = german_credit(raw=True)
credit.head(10)

# Print column names, non-null counts and dtypes of the raw frame.
credit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 21 columns):
 #   Column                                                    Non-Null Count  Dtype 
---  ------                                                    --------------  ----- 
 0   status_of_existing_checking_account                       1000 non-null   int64 
 1   duration_in_month                                         1000 non-null   int64 
 2   credit_history                                            1000 non-null   object
 3   purpose                                                   1000 non-null   object
 4   credit_amount                                             1000 non-null   int64 
 5   savings_account/bonds                                     1000 non-null   int64 
 6   present_employment_since                                  1000 non-null   int64 
 7   installment_rate_in_percentage_of_disposable_income       1000 non-null   int64 
 8   personal_status_and_sex                                   1000 non-null   object
 9   other_debtors/guarantors                                  1000 non-null   object
 10  present_residence_since                                   1000 non-null   int64 
 11  property                                                  1000 non-null   object
 12  age_in_years                                              1000 non-null   int64 
 13  other_installment_plans                                   1000 non-null   object
 14  housing                                                   1000 non-null   object
 15  number_of_existing_credits_at_this_bank                   1000 non-null   int64 
 16  job                                                       1000 non-null   int64 
 17  number_of_people_being_liable_to_provide_maintenance_for  1000 non-null   int64 
 18  telephone                                                 1000 non-null   int64 
 19  foreign_worker                                            1000 non-null   int64 
 20  credit_risk                                               1000 non-null   int64 
dtypes: int64(14), object(7)
memory usage: 164.2+ KB

# Inspect the distinct values of a few columns that are kept as model
# features by preprocessGermanCredit.
credit['credit_history'].unique()

credit['housing'].unique()

credit['status_of_existing_checking_account'].unique()

credit['purpose'].unique()

def preprocessGermanCredit(temp_df):
    """Return a numeric modelling frame derived from the raw German Credit data.

    The input frame is not modified. The result keeps twelve columns
    (``credit_risk`` first, then the features), with these transformations:

    * ``credit_amount`` is divided by 500 and truncated to an integer bucket.
    * ``credit_history``, ``housing`` and ``personal_status_and_sex`` are
      replaced by integer codes (sex is collapsed to 1 = male, 0 = female).
    * ``purpose`` is replaced by ``pd.factorize`` codes (order of first
      appearance).

    Args:
        temp_df: DataFrame containing at least the twelve selected columns.

    Returns:
        A new DataFrame with the selected, recoded columns.

    Raises:
        KeyError: if a selected column is missing from ``temp_df``.
        ValueError: if a recoded column contains a non-null category that has
            no integer code. Previously such values silently became NaN.
    """
    selected_columns = [
        'credit_risk',
        'duration_in_month',
        'credit_amount',
        'installment_rate_in_percentage_of_disposable_income',
        'age_in_years',
        'credit_history',
        'housing',
        'status_of_existing_checking_account',
        'foreign_worker',
        'personal_status_and_sex',
        'present_employment_since',
        'purpose',
    ]
    # Copy *after* selecting so every assignment below writes to an
    # independent frame rather than to a slice of the caller's data.
    df = temp_df[selected_columns].copy()

    # Coarsen the credit amount into 500-unit buckets.
    df['credit_amount'] = (df['credit_amount'] / 500).astype(int)

    # Category label -> integer code, written in the direction it is applied.
    category_codes = {
        'credit_history': {
            'no_credits_taken/all_credits_paid_back_duly': 0,
            'all_credits_at_this_bank_paid_back_duly': 1,
            'existing_credits_paid_back_duly_till_now': 2,
            'delay_in_paying_off_in_the_past': 3,
            'critical_account/other_credits_existing_not_at_this_bank': 4,
        },
        'housing': {'for free': 2, 'own': 1, 'rent': 0},
        'personal_status_and_sex': {
            'male_married/widowed': 1,
            'male_single': 1,
            'male_divorced/separated': 1,
            'female_divorced/separated/married': 0,
        },
    }
    for column, codes in category_codes.items():
        encoded = df[column].map(codes)
        # A value that was present but has no code would turn into NaN and
        # later drop out of both fairness groups; fail loudly instead.
        unknown = df.loc[encoded.isna() & df[column].notna(), column].unique()
        if len(unknown):
            raise ValueError(
                "Unmapped categories in column '{}': {}".format(
                    column, sorted(map(str, unknown))
                )
            )
        df[column] = encoded

    df['purpose'] = pd.factorize(df['purpose'])[0]
    return df

# Build the recoded modelling frame from the raw data and check its shape.
df = preprocessGermanCredit(credit)
df.shape

# Print the dtypes and non-null counts of the processed frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column                                               Non-Null Count  Dtype
---  ------                                               --------------  -----
 0   credit_risk                                          1000 non-null   int64
 1   duration_in_month                                    1000 non-null   int64
 2   credit_amount                                        1000 non-null   int64
 3   installment_rate_in_percentage_of_disposable_income  1000 non-null   int64
 4   age_in_years                                         1000 non-null   int64
 5   credit_history                                       1000 non-null   int64
 6   housing                                              1000 non-null   int64
 7   status_of_existing_checking_account                  1000 non-null   int64
 8   foreign_worker                                       1000 non-null   int64
 9   personal_status_and_sex                              1000 non-null   int64
 10  present_employment_since                             1000 non-null   int64
 11  purpose                                              1000 non-null   int64
dtypes: int64(12)
memory usage: 93.9 KB

def logisticRegression(x_train, y_train, x_test):
    """Fit a logistic-regression classifier and predict on both feature sets.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Features to predict on after fitting.

    Returns:
        A ``(train_predictions, test_predictions)`` tuple with the fitted
        model's predictions for ``x_train`` and ``x_test``.
    """
    from sklearn.linear_model import LogisticRegression

    classifier = LogisticRegression(solver='lbfgs', max_iter=500)
    classifier.fit(x_train, y_train)
    return classifier.predict(x_train), classifier.predict(x_test)

def _mean_metric_rows(runs):
    """Average equally long metric lists element-wise across ``runs``."""
    return [sum(column) / len(runs) for column in zip(*runs)]


def binaryClassfication(var_name, df, epsilons=None, niteration=1):
    """Train on MWEM-synthesised data and summarise per-group prediction rates.

    The processed data is split 80/20 with a fixed seed. For every epsilon a
    synthetic copy of the training split is generated ``niteration`` times, a
    logistic regression is trained on it (without ``credit_risk`` and
    ``var_name`` as features), and ``predictionSummary`` is evaluated on the
    synthetic training data and on the real test split. The per-iteration
    summaries are averaged.

    Args:
        var_name: Column used to split rows into protected/unprotected groups;
            it is excluded from the model features.
        df: Processed frame containing ``credit_risk`` and ``var_name``.
        epsilons: Iterable of privacy budgets to evaluate. Defaults to 1..5.
        niteration: Number of synthetic datasets averaged per epsilon.

    Returns:
        ``(test_results, train_results)``. Each is a list with one entry per
        epsilon of the form ``[overall, unprotected, protected]``, where every
        element is the averaged metric list produced by ``predictionSummary``.

    Raises:
        ValueError: if ``niteration`` is smaller than 1.
    """
    if niteration < 1:
        raise ValueError("niteration must be at least 1")
    if epsilons is None:
        epsilons = range(1, 6)  # gets slow for epsilon >= 7

    # Split processed data into train and test (fixed random seed)
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=0)

    # Split test data into X and y
    x_test = test_data.drop(["credit_risk", var_name], axis=1)
    y_test = test_data["credit_risk"]
    sample_size = train_data.shape[0]

    train_results = []
    test_results = []

    # Loop through range of epsilon values
    for epsilon in epsilons:
        print(epsilon)
        # Keep the full summaries of every iteration. Summing into fixed-size
        # zero lists (as before) truncated each metric list via zip() and
        # silently discarded its last entry.
        train_runs = []
        test_runs = []

        # Generate synthetic data many times (to average results)
        for i in range(niteration):
            print("i: ", i)

            # Generate synthetic training data
            synth = MWEMSynthesizer(
                epsilon=epsilon,
                q_count=500,
                iterations=30,
                mult_weights_iterations=15,
                splits=[],
                split_factor=2,
                max_bin_count=400,
            )
            # Learn the distribution of the real data
            synth.fit(train_data.to_numpy())
            train_synth = pd.DataFrame(
                synth.sample(sample_size), columns=train_data.columns
            )

            # Split synthetic training data into X and y
            x_train = train_synth.drop(["credit_risk", var_name], axis=1)
            y_train = train_synth["credit_risk"]

            # Run the binary classification pipeline
            y_train_predict, y_test_predict = logisticRegression(
                x_train, y_train, x_test
            )

            # Each summary is an (overall, unprotected, protected) triple
            train_runs.append(
                predictionSummary(x_train, y_train, y_train_predict, train_synth, var_name)
            )
            test_runs.append(
                predictionSummary(x_test, y_test, y_test_predict, test_data, var_name)
            )

        # zip(*runs) regroups the triples into all-overall, all-unprotected
        # and all-protected rows, which are then averaged element-wise.
        train_results.append([_mean_metric_rows(rows) for rows in zip(*train_runs)])
        test_results.append([_mean_metric_rows(rows) for rows in zip(*test_runs)])

    return test_results, train_results

def _confusion_rates(y_true, y_pred):
    """Return [FPR, FNR, TPR, TNR, precision, accuracy] for 0/1 labels.

    A rate whose denominator is zero (for example an empty group) is reported
    as NaN instead of triggering a NumPy division warning.
    """
    if len(y_true) == 0:
        return [float('nan')] * 6

    tn, fp, fn, tp = (
        int(count) for count in confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    )

    def ratio(numerator, denominator):
        return numerator / denominator if denominator else float('nan')

    return [
        ratio(fp, fp + tn),                  # false positive rate
        ratio(fn, fn + tp),                  # false negative rate
        ratio(tp, tp + fn),                  # true positive rate
        ratio(tn, tn + fp),                  # true negative rate
        ratio(tp, tp + fp),                  # precision
        ratio(tp + tn, tp + fp + fn + tn),   # overall accuracy
    ]


def predictionSummary(temp_x, y_real, y_predict, df, var_name):
    """Summarise prediction rates overall and for the two groups of ``var_name``.

    For the age attribute (``'age'`` or ``'age_in_years'``) the protected
    group is ``>= 25`` and the unprotected group is ``< 25``. For any other
    attribute the protected group is ``== 1`` and the unprotected group is
    ``== 0``.

    Args:
        temp_x: Feature frame the predictions were made on (not modified).
        y_real: True labels, index-aligned with ``temp_x``.
        y_predict: Predicted labels, in the row order of ``temp_x``.
        df: Frame with the same index as ``temp_x`` that still contains the
            ``var_name`` column.
        var_name: Name of the column used to split the groups.

    Returns:
        ``(overall, unprotected, protected)``; each is the list
        ``[FPR, FNR, TPR, TNR, precision, accuracy]``. Fairness distances can
        be derived by the caller, e.g. ``abs(protected[2] - unprotected[2])``
        for the TPR gap and ``abs(protected[0] - unprotected[0])`` for the
        FPR gap.
    """
    x = temp_x.copy()
    x['Predicted Creditability'] = y_predict
    x['Real Creditability'] = y_real

    # The processed data names the age column 'age_in_years'; the previous
    # check only matched 'age', so age runs fell through to the == 1 / == 0
    # split and produced empty groups.
    if var_name in ('age', 'age_in_years'):
        protected_mask = df[var_name] >= 25
        unprotected_mask = df[var_name] < 25
    else:
        protected_mask = df[var_name] == 1
        unprotected_mask = df[var_name] == 0

    overall = _confusion_rates(y_real, y_predict)

    x_protected = x[protected_mask]
    protected = _confusion_rates(
        x_protected['Real Creditability'], x_protected['Predicted Creditability']
    )

    x_unprotected = x[unprotected_mask]
    unprotected = _confusion_rates(
        x_unprotected['Real Creditability'], x_unprotected['Predicted Creditability']
    )

    return overall, unprotected, protected

# Column names to pass as `var_name` for each protected attribute:
#   gender:      'personal_status_and_sex'
#   nationality: 'foreign_worker'
#   age:         'age_in_years'

# Run the synthetic-data experiment with the recoded sex column
# (1 = male, 0 = female) as the attribute that defines the two groups.
binaryClassfication('personal_status_and_sex', df)

1
i:  0
/work/smartnoise-sdk/synth/snsynth/mwem.py:315: Warning: Bin count 18175 in column: 2 exceeds max_bin_count, defaulting to: 400. Is this a continuous variable?
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
  Warning,
2
i:  0
/work/smartnoise-sdk/synth/snsynth/mwem.py:315: Warning: Bin count 18175 in column: 2 exceeds max_bin_count, defaulting to: 400. Is this a continuous variable?
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
  Warning,
3
i:  0
/work/smartnoise-sdk/synth/snsynth/mwem.py:315: Warning: Bin count 18175 in column: 2 exceeds max_bin_count, defaulting to: 400. Is this a continuous variable?
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
  Warning,
4
i:  0
/work/smartnoise-sdk/synth/snsynth/mwem.py:315: Warning: Bin count 18175 in column: 2 exceeds max_bin_count, defaulting to: 400. Is this a continuous variable?
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
  Warning,
5
i:  0
/work/smartnoise-sdk/synth/snsynth/mwem.py:315: Warning: Bin count 18175 in column: 2 exceeds max_bin_count, defaulting to: 400. Is this a continuous variable?
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
  Warning,

# Repeat the experiment with `foreign_worker` as the group-defining attribute.
binaryClassfication('foreign_worker', df)

1
i:  0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
  Warning,
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
2
i:  0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
  Warning,
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
3
i:  0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
  Warning,
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
4
i:  0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
  Warning,
5
i:  0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
  Warning,

# Repeat the experiment with `age_in_years` as the group-defining attribute.
# NOTE(review): the captured output for this run shows "invalid value"
# RuntimeWarnings from the rate computations - check the group split.
binaryClassfication('age_in_years', df)

1
i:  0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
  Warning,
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
  
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
  
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
2
i:  0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
  Warning,
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
  
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
  
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
3
i:  0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
  Warning,
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
  
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
  
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
4
i:  0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
  Warning,
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
  
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
  
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
5
i:  0
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 12 Split: [6 7]
  Warning,
/work/smartnoise-sdk/synth/snsynth/mwem.py:229: Warning: Flattened dimensionality of synthetic histogram is less than the number of iterations. This is a privacy risk. Consider increasing your split_factor (especially if it is 1), or decreasing the number of iterations. Dim: 4 Split: [8 9]
  Warning,
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
  
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in long_scalars
  
/shared-libs/python3.7/py-core/lib/python3.7/site-packages/ipykernel_launcher.py:21: RuntimeWarning: invalid value encountered in long_scalars