import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from snsynth.mwem import MWEMSynthesizer
from themis_ml.datasets import german_credit
credit = german_credit(raw=True)
credit.head(10)
credit.info()
credit['credit_history'].unique()
credit['housing'].unique()
credit['status_of_existing_checking_account'].unique()
credit['purpose'].unique()
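# Optional exploratory plots (illustrative sketch using the seaborn/matplotlib imports above):
# target class balance and the raw credit_amount distribution, which motivates the binning below.
sns.countplot(x='credit_risk', data=credit)
plt.title('Class balance of credit_risk')
plt.show()
sns.histplot(credit['credit_amount'], bins=30)
plt.title('Distribution of credit_amount')
plt.show()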
def preprocessGermanCredit(temp_df):
    df = temp_df.copy()
    df = df[['credit_risk', 'duration_in_month', 'credit_amount',
             'installment_rate_in_percentage_of_disposable_income', 'age_in_years',
             'credit_history', 'housing', 'status_of_existing_checking_account',
             'foreign_worker', 'personal_status_and_sex', 'present_employment_since',
             'purpose']]

    # Coarsen credit_amount into 500-unit bins so its cardinality stays manageable for MWEM
    df['credit_amount'] /= 500
    df['credit_amount'] = df['credit_amount'].astype(int)
    # Map categorical string values to integer codes
    housing = {2: 'for free', 1: 'own',  0: 'rent'}
    credit_history = {0: 'no_credits_taken/all_credits_paid_back_duly', 
                        1: 'all_credits_at_this_bank_paid_back_duly',
                        2: 'existing_credits_paid_back_duly_till_now', 
                        3: 'delay_in_paying_off_in_the_past',
                        4: 'critical_account/other_credits_existing_not_at_this_bank',
                    }
    sex = {'male_married/widowed': 1,'male_single': 1,'male_divorced/separated': 1,
           'female_divorced/separated/married': 0,
        }
    df["credit_history"] = df["credit_history"].map({v: k for k, v in credit_history.items()})
    df["housing"] = df["housing"].map({v: k for k,v in housing.items()})
    df["personal_status_and_sex"] = df["personal_status_and_sex"].map({v: k for v,k in sex.items()})
    df['purpose'] = pd.factorize(df['purpose'])[0]
    return df
df = preprocessGermanCredit(credit)
df.shape
df.info()
df
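# Sanity checks (illustrative): the MWEM pipeline below consumes df via .to_numpy(), so all
# retained columns are assumed to be numerically encoded, and .map() silently turns any
# unmapped category into NaN. These two lines make that assumption easy to verify.
df.dtypes
df.isna().sum()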
def logisticRegression(x_train, y_train, x_test):
    # Fit a logistic regression classifier (LogisticRegression is already imported at the top)
    logisticRegr = LogisticRegression(solver='lbfgs', max_iter=500)
    logisticRegr.fit(x_train, y_train)
    train_predictions = logisticRegr.predict(x_train)
    test_predictions = logisticRegr.predict(x_test)
    return train_predictions, test_predictions
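# Quick self-contained check of the helper above (illustrative sketch): synthetic data from
# sklearn's make_classification, plus classification_report (imported at the top) to
# summarize the returned test predictions. Names prefixed with _demo are placeholders.
from sklearn.datasets import make_classification
_demo_X, _demo_y = make_classification(n_samples=200, n_features=5, random_state=0)
_demo_Xtr, _demo_Xte, _demo_ytr, _demo_yte = train_test_split(_demo_X, _demo_y, test_size=0.2, random_state=0)
_, _demo_yte_pred = logisticRegression(_demo_Xtr, _demo_ytr, _demo_Xte)
print(classification_report(_demo_yte, _demo_yte_pred, labels=[0, 1]))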
def binaryClassification(var_name, df):
    # Split processed data into train and test (fixed random seed)
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=0)
    # Split test data into X and y
    x_test, y_test = test_data.drop(["credit_risk",var_name], axis=1), test_data["credit_risk"]
    train_results = []
    test_results = []
    epsilons = list(range(1, 6))  # synthesis gets noticeably slow for epsilon values >= 7
    niteration = 1  # number of synthetic datasets generated per epsilon (raise to average over runs)
    # Loop through range of epsilon values
    for epsilon in epsilons:
        # Six accumulators per group: FPR, FNR, TPR, TNR, precision, accuracy
        train_overall, train_unprotected, train_protected = [0]*6, [0]*6, [0]*6
        test_overall, test_unprotected, test_protected = [0]*6, [0]*6, [0]*6
        print("epsilon:", epsilon)
        # Generate synthetic data many times (to average results)
        for i in range(niteration):
            print("i: ",i)
            # Configure the MWEM synthesizer with this iteration's privacy budget
            synth = MWEMSynthesizer(epsilon=epsilon, q_count=500, iterations=30, mult_weights_iterations=15,
                                    splits=[], split_factor=2, max_bin_count=400)
            # Learn a differentially private approximation of the real training data's distribution
            synth.fit(train_data.to_numpy())
            sample_size = train_data.shape[0]
            train_synth = pd.DataFrame(synth.sample(sample_size), columns=train_data.columns) 
            # Split synthetic training data into X and y
            x_train, y_train = train_synth.drop(["credit_risk",var_name], axis=1), train_synth["credit_risk"]
            # Run the binary classification pipeline
            y_train_predict, y_test_predict = logisticRegression(x_train, y_train, x_test)
            # Accumulate the rates from this iteration i (averaged per epsilon below)
            curr_train_overall, curr_train_unprotected, curr_train_protected = predictionSummary(x_train, y_train, y_train_predict, train_synth, var_name)
            curr_test_overall, curr_test_unprotected, curr_test_protected = predictionSummary(x_test, y_test, y_test_predict, test_data, var_name)

            train_overall = [rate1+rate2 for rate1,rate2 in zip(train_overall,curr_train_overall)]
            train_protected = [rate1+rate2 for rate1,rate2 in zip(train_protected,curr_train_protected)]
            train_unprotected = [rate1+rate2 for rate1,rate2 in zip(train_unprotected,curr_train_unprotected)]
            test_overall = [rate1+rate2 for rate1,rate2 in zip(test_overall,curr_test_overall)]
            test_protected = [rate1+rate2 for rate1,rate2 in zip(test_protected,curr_test_protected)]
            test_unprotected = [rate1+rate2 for rate1,rate2 in zip(test_unprotected,curr_test_unprotected)]
        # Average over the niteration runs and store one result set per epsilon
        train_overall = [rate/niteration for rate in train_overall]
        train_unprotected = [rate/niteration for rate in train_unprotected]
        train_protected = [rate/niteration for rate in train_protected]
        train_results.append([train_overall,train_unprotected,train_protected])
        test_overall = [rate/niteration for rate in test_overall]
        test_unprotected = [rate/niteration for rate in test_unprotected]
        test_protected = [rate/niteration for rate in test_protected]
        test_results.append([test_overall,test_unprotected,test_protected])
    return test_results, train_results
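# For context (illustrative sketch, not part of the DP pipeline above): the same split,
# classifier, and summary applied directly to the real training data gives a non-private
# baseline to compare the synthetic-data results against. nonPrivateBaseline is a hypothetical helper.
def nonPrivateBaseline(var_name, df):
    train_data, test_data = train_test_split(df, test_size=0.2, random_state=0)
    x_train, y_train = train_data.drop(["credit_risk", var_name], axis=1), train_data["credit_risk"]
    x_test, y_test = test_data.drop(["credit_risk", var_name], axis=1), test_data["credit_risk"]
    _, y_test_predict = logisticRegression(x_train, y_train, x_test)
    return predictionSummary(x_test, y_test, y_test_predict, test_data, var_name)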
def predictionSummary(temp_x, y_real, y_predict, df, var_name):
    tn, fp, fn, tp = confusion_matrix(y_real, y_predict, labels=[0,1]).ravel()
    # Rates in order: FPR, FNR, TPR, TNR, precision, overall accuracy
    overall = [fp/(fp+tn), fn/(fn+tp), tp/(tp+fn), tn/(tn+fp), tp/(tp+fp), (tp+tn)/(tp+fp+fn+tn)]
    
    x = temp_x.copy()
    x['Predicted Creditability'] = y_predict
    x['Real Creditability'] = y_real
    if var_name == 'age_in_years':
        x_protected = x[df['age_in_years'] >= 25].copy()
    else:
        x_protected = x[df[var_name] == 1].copy()
    tn, fp, fn, tp  = confusion_matrix(x_protected['Real Creditability'], x_protected['Predicted Creditability'], labels=[0,1]).ravel()
    protected = [fp/(fp+tn),fn/(fn+tp),tp/(tp+fn),tn/(tn+fp),tp/(tp+fp),(tp+tn)/(tp+fp+fn+tn)]
    if var_name == 'age_in_years':
        x_unprotected = x[df['age_in_years'] < 25].copy()
    else:
        x_unprotected = x[df[var_name] == 0].copy()
    tn, fp, fn, tp  = confusion_matrix(x_unprotected['Real Creditability'], x_unprotected['Predicted Creditability'], labels=[0,1]).ravel()
    unprotected = [fp/(fp+tn),fn/(fn+tp),tp/(tp+fn),tn/(tn+fp),tp/(tp+fp),(tp+tn)/(tp+fp+fn+tn)]
    # The difference in TPR (equal opportunity distance) and in FPR (equalized odds distance)
    # can be computed from the returned rate lists; see the sketch after this function.
    # equalized_opportunity = abs(protected[2]-unprotected[2])
    # equalized_odds = abs(protected[0]-unprotected[0])
    return overall, unprotected, protected
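# Illustrative helper (hypothetical name): the fairness gaps sketched in the comments above,
# computed from the rate lists returned by predictionSummary
# ([FPR, FNR, TPR, TNR, precision, accuracy]).
def fairnessDistances(protected, unprotected):
    equalized_opportunity = abs(protected[2] - unprotected[2])  # TPR gap between groups
    equalized_odds_fpr = abs(protected[0] - unprotected[0])     # FPR gap between groups
    return equalized_opportunity, equalized_odds_fpr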
# Protected attribute column names:
# gender: 'personal_status_and_sex'
# nationality: 'foreign_worker'
# age: 'age_in_years'
test_results_sex, train_results_sex = binaryClassification('personal_status_and_sex', df)
test_results_nationality, train_results_nationality = binaryClassification('foreign_worker', df)
test_results_age, train_results_age = binaryClassification('age_in_years', df)
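# Illustrative sketch: plot overall test accuracy against the privacy budget for one protected
# attribute, assuming the runs above completed and binaryClassification returned one
# [overall, unprotected, protected] entry per epsilon (accuracy at index 5).
epsilon_values = list(range(1, 6))  # must match the epsilons used inside binaryClassification
sex_test_accuracy = [entry[0][5] for entry in test_results_sex]
plt.plot(epsilon_values, sex_test_accuracy, marker='o')
plt.xlabel('epsilon')
plt.ylabel('overall test accuracy')
plt.title('Utility of MWEM-synthesized training data vs. privacy budget')
plt.show()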