# Dependency note: statsmodels is required by this script.
# The original line was the IPython cell magic `!pip install statsmodels`,
# which is a SyntaxError in a plain .py module -- install it out-of-band:
#   pip install statsmodels
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
import os
import pickle
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
# Loading model: unpickle the fitted logistic regression from ./models.
models_path = os.path.join(os.getcwd(), 'models')
log_reg_model_path = os.path.join(models_path, 'log_reg.pickle')
with open(log_reg_model_path, 'rb') as handle:
    log_reg = pickle.load(handle)
# Load the evaluation data, flag predictions above the 0.7 decision
# threshold, and mark rows where that call disagrees with the Edema label.
df = pd.read_csv('data.csv')
df['edema_prediction'] = df['Model_prediction_edema'] > 0.7
df['misclassified'] = df['edema_prediction'] != df['Edema']
# Predictor columns fed to the misclassification model.
train_columns = ['Sex', 'Age', 'Frontal/Lateral', 'AP/PA', 'No Finding',
                 'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Consolidation',
                 'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion',
                 'Pleural Other', 'Support Devices']
# Build the one-hot-encoded design matrix and the 0/1 target.
x_train = pd.get_dummies(df[train_columns], drop_first=True)
Y = np.multiply(df.misclassified, 1)
# Predicted probability that each row is misclassified by the edema model.
df['misclassification_probabilities'] = log_reg.predict(x_train)
# Turn the fitted model's coefficient table into a DataFrame of per-feature stats.
logreg_summary = log_reg.summary()
summary_html = logreg_summary.tables[1].as_html()
logreg_df = pd.read_html(summary_html, header=0, index_col=0)[0]
# Odds ratios and their confidence bounds come from exponentiating the
# coefficients and the 95% CI columns of the summary table.
logreg_df['odds'] = np.exp(logreg_df['coef'])
logreg_df['odds_distance'] = np.abs(1 - logreg_df['odds'])
logreg_df['odds_lower'] = np.exp(logreg_df['[0.025'])
logreg_df['odds_upper'] = np.exp(logreg_df['0.975]'])
logreg_df['significant_05'] = logreg_df['P>|z|'] < 0.05
# Significant, far-from-1-odds features first.
logreg_df = logreg_df.sort_values(['significant_05', 'odds_distance'], ascending=False)
logreg_df
# Bin the edema probability scores into ten 0.1-wide buckets and count
# observations per (bucket, true-label) pair.
df['probability_bin'] = pd.cut(df.Model_prediction_edema, bins=np.arange(0, 1.1, 0.1), labels=np.arange(0, 1, 0.1))
prob_buckets = df.groupby(['probability_bin', 'Edema'])['Path'].count().reset_index()\
    .rename(columns={"Path": "observations"}).pivot_table(columns="Edema", index="probability_bin", values="observations")
# Clear the "Edema" name on the columns axis. Assigning .name directly replaces
# Index.rename(None, inplace=True), whose `inplace` argument was removed in
# pandas 2.0; the assignment works on every pandas version.
prob_buckets.columns.name = None
prob_buckets = prob_buckets.reset_index().rename(columns={0: "no_disease", 1: "disease"})
# Per-bucket class proportions (the two percents sum to 1).
prob_buckets['disease_percent'] = prob_buckets['disease'] / (prob_buckets['disease'] + prob_buckets['no_disease'])
prob_buckets['no_disease_percent'] = 1 - prob_buckets['disease_percent']
prob_buckets['probability_bin'] = [np.round(x, 2) for x in prob_buckets['probability_bin']]
prob_buckets
# X tick labels shared by both panels: the rounded bucket edges.
_bin_labels = [str(np.round(x, 2)) for x in prob_buckets['probability_bin']]

def _stacked_probability_bars(axis, no_disease_counts, disease_counts, ylabel, title):
    """Draw one stacked no-disease/disease bar chart with decision-region shading.

    The orange span (buckets 8-9, i.e. scores >= 0.7ish) marks the region
    classified as disease; the green line at 7.5 is the decision boundary.
    """
    axis.axvspan(7.5, 9.5, facecolor='orange', alpha=0.2)
    axis.axvspan(-0.5, 7.5, facecolor='b', alpha=0.2)
    axis.bar(_bin_labels, no_disease_counts)
    axis.bar(_bin_labels, disease_counts, bottom=no_disease_counts)
    axis.legend(["No Disease", "Disease"])
    axis.axvline(x=7.5, color='g')
    axis.set_xlabel("Predicted Probability")
    axis.set_ylabel(ylabel)
    axis.set_title(title)

fig, ax = plt.subplots(1, 2, figsize=(18, 5))
# Left: raw counts per bucket; right: each bar normalized to 100%.
# ("probabilty" typo in the original titles fixed.)
_stacked_probability_bars(ax[0], prob_buckets['no_disease'], prob_buckets['disease'],
                          "Frequency", "Histogram of probability scores")
_stacked_probability_bars(ax[1], prob_buckets['no_disease_percent'], prob_buckets['disease_percent'],
                          "Percentage", "Histogram of probability scores (Each bar scaled to 100%)");
# Scatter of disease probability vs. predicted misclassification probability:
# misclassified rows in red, correctly classified rows in green.
plt.scatter(df.loc[df.misclassified, 'Model_prediction_edema'],
            df.loc[df.misclassified, 'misclassification_probabilities'],
            alpha=0.5, color='r')
plt.scatter(df.loc[~df.misclassified, 'Model_prediction_edema'],
            df.loc[~df.misclassified, 'misclassification_probabilities'],
            alpha=0.5, color='g')
plt.xlabel("Disease Prediction Probabilities")
plt.ylabel("Misclassification Probabilities")
plt.title("Disease vs Misclassification Probabilities")
plt.axvline(x=0.7, color='black')
plt.axhline(y=0.5, color='blue')
# NOTE(review): this label list relies on matplotlib's legend handle ordering
# (historically Line2D handles before scatter PathCollections); newer matplotlib
# versions return handles in add order, which would pair these labels with the
# wrong artists -- verify against the installed matplotlib version.
plt.legend(["Disease Classification Threshold", "Flipping Threshold", 'Misclassified', 'Correctly classified'])
# The stray standalone ';' that followed (a notebook output-suppression
# artifact) was removed: it is a SyntaxError in a plain .py module.
from sdv.relational import HMA1
import xgboost as xgb
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
def score_model(model, x_val, y_val):
    """Return (class probabilities, ROC-AUC) of *model* on a validation set.

    The AUC is computed from the positive-class column of predict_proba.
    """
    probabilities = model.predict_proba(x_val)
    auc = roc_auc_score(y_val, probabilities[:, 1])
    return probabilities, auc
# SDV model/scaler/encoder pathways:
models_path = os.path.join(os.getcwd(), 'models')

def _model_file(filename):
    """Absolute path of a pickled artifact inside the local 'models' folder."""
    return os.path.join(models_path, filename)

sdv_path = _model_file('sdv.pickle')
le_gender_path = _model_file('le_gender.pickle')
le_fl_path = _model_file('le_fl.pickle')
le_appa_path = _model_file('le_appa.pickle')
sc_path = _model_file('sc.pickle')
xgb_sdv_path = _model_file('xgb_sdv.pickle')
gpc_sdv_path = _model_file('gpc_sdv.pickle')
# Toy dataset model pathways:
xgb_toy_path = _model_file('xgb_toy.pickle')
gpc_toy_path = _model_file('gpc_toy.pickle')
# Loading SDV models:
sdv = HMA1.load(sdv_path)

def _load_pickle(path):
    """Unpickle and return the object stored at *path*."""
    with open(path, 'rb') as fh:
        return pickle.load(fh)

# Encoders, scaler, and the two sdv-trained classifiers.
le_gender = _load_pickle(le_gender_path)
le_fl = _load_pickle(le_fl_path)
le_appa = _load_pickle(le_appa_path)
sc = _load_pickle(sc_path)
xgb_sdv = _load_pickle(xgb_sdv_path)
gpc_sdv = _load_pickle(gpc_sdv_path)
# Loading toy dataset models:
xgb_toy = _load_pickle(xgb_toy_path)
gpc_toy = _load_pickle(gpc_toy_path)
# Loading sdv model:
# NOTE(review): the same pickle was already loaded into `sdv` above -- this
# re-load looks like a leftover notebook cell (redundant but harmless).
model_path = os.path.join(os.getcwd(), 'models', 'sdv.pickle')
sdv = HMA1.load(model_path)
# Reading in sdv data with subset of ages (ages=31-69)
df_age_subset = sdv.sample('df_age_id', num_rows=10000, sample_children=False)
# Reading in sdv data with ood ages (all ages outside subset):
df_age_ood = sdv.sample('df_age_ood', num_rows=10000, sample_children=False)
# Keep only rows strictly outside the in-distribution age range.
# NOTE(review): the strict inequalities exclude ages exactly 30 and 70 from
# BOTH samples (the subset is 31-69) -- confirm whether >= 70 / <= 30 was intended.
df_age_ood = df_age_ood[(df_age_ood['Age'] > 70) | (df_age_ood['Age'] < 30)].reset_index(drop=True)
# Label encoding categorical variables:
# NOTE(review): these freshly constructed encoders (and the StandardScaler
# below) overwrite the pickled le_gender/le_fl/le_appa/sc objects loaded
# earlier in this file -- confirm whether those loads are still needed.
le_gender = LabelEncoder()
le_fl = LabelEncoder()
le_appa = LabelEncoder()
# Fitting on in-distribution data:
df_age_subset['Sex'] = le_gender.fit_transform(df_age_subset['Sex'])
df_age_subset['Frontal/Lateral'] = le_fl.fit_transform(df_age_subset['Frontal/Lateral'])
df_age_subset['AP/PA'] = le_appa.fit_transform(df_age_subset['AP/PA'])
# Transforming on ood data (LabelEncoder.transform raises ValueError on
# categories unseen during fit, so the ood sample must share category values):
df_age_ood['Sex'] = le_gender.transform(df_age_ood['Sex'])
df_age_ood['Frontal/Lateral'] = le_fl.transform(df_age_ood['Frontal/Lateral'])
df_age_ood['AP/PA'] = le_appa.transform(df_age_ood['AP/PA'])
# Scaling age: fit on in-distribution ages, then apply the same affine
# transform to the ood ages so both share one scale.
sc = StandardScaler()
df_age_subset['Age'] = sc.fit_transform(df_age_subset['Age'].values.reshape(-1, 1))
df_age_ood['Age'] = sc.transform(df_age_ood['Age'].values.reshape(-1, 1))
# Using only predictor columns: drop the model-output / label columns that
# must not leak into the misclassification features. The list is shared by
# both frames (the original duplicated it verbatim, inviting drift).
_non_predictor_columns = ['Model_prediction_cardiomegaly',
                          'Model_prediction_edema', 'Model_prediction_consolidation', 'No Finding',
                          'Model_prediction_atelectasis', 'Model_prediction_pleuraleffusion',
                          'edema_prediction', 'Edema']
df_age_subset = df_age_subset.drop(_non_predictor_columns, axis=1)
df_age_ood = df_age_ood.drop(_non_predictor_columns, axis=1)
# Subsetting to x/y:
x_age_id = df_age_subset.drop(['is_misclassed'], axis=1)
y_age_id = df_age_subset['is_misclassed']
x_age_ood = df_age_ood.drop(['is_misclassed'], axis=1)
y_age_ood = df_age_ood['is_misclassed']
# Generating in-distribution and out-of-distribution toy dataframes:
# both are 2-feature binary problems; the ood set is shifted by 6 units.
id_data = make_classification(n_samples=20000, n_features=2, n_informative=2,
                              n_redundant=0, weights=[0.85, 0.15], random_state=27)
ood_data = make_classification(n_samples=20000, n_features=2, n_informative=2,
                               n_redundant=0, shift=6, weights=[0.9, 0.1], random_state=27)
id_features, id_labels = id_data
ood_features, ood_labels = ood_data
id_df = pd.DataFrame({'x_1': id_features[:, 0], 'x_2': id_features[:, 1], 'is_misclassed': id_labels})
ood_df = pd.DataFrame({'x_1': ood_features[:, 0], 'x_2': ood_features[:, 1], 'is_misclassed': ood_labels})
# Splitting into x and y variables:
x_id = id_df.drop(columns=['is_misclassed'])
y_id = id_df['is_misclassed']
x_ood = ood_df.drop(columns=['is_misclassed'])
y_ood = ood_df['is_misclassed']
# Visualizing toy data: split each frame by class for per-class plotting.
id_df_pos = id_df[id_df['is_misclassed'].eq(1)]
id_df_neg = id_df[id_df['is_misclassed'].eq(0)]
ood_df_pos = ood_df[ood_df['is_misclassed'].eq(1)]
ood_df_neg = ood_df[ood_df['is_misclassed'].eq(0)]
# Feature/label views of each per-class frame.
x_ood_pos = ood_df_pos.drop(columns=['is_misclassed'])
y_ood_pos = ood_df_pos['is_misclassed']
x_ood_neg = ood_df_neg.drop(columns=['is_misclassed'])
y_ood_neg = ood_df_neg['is_misclassed']
x_id_pos = id_df_pos.drop(columns=['is_misclassed'])
y_id_pos = id_df_pos['is_misclassed']
x_id_neg = id_df_neg.drop(columns=['is_misclassed'])
y_id_neg = id_df_neg['is_misclassed']
# Scatter of both toy sets: circles for in-distribution, crosses for ood,
# blue for class 0 and red for class 1.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
for frame, mark, colour, label in ((x_id_neg, 'o', 'blue', 'class 0'),
                                   (x_id_pos, 'o', 'red', 'class 1'),
                                   (x_ood_neg, 'x', 'blue', 'test class 0'),
                                   (x_ood_pos, 'x', 'red', 'test class 1')):
    ax.scatter(frame['x_1'], frame['x_2'], alpha=0.3, marker=mark, color=colour, label=label)
ax.set_xlabel('x_1')
ax.set_ylabel('x_2')
ax.set_title('toy classification data')
ax.legend(loc='best')
plt.show()
def _report_scores(title, model, x_val, y_val):
    """Score *model* on (x_val, y_val) via score_model, print its AUC plus
    summary statistics of the positive-class probabilities, and return
    (predictions, auc) so each call site keeps its per-model globals.

    Print output is byte-identical to the six hand-duplicated sections this
    helper replaces.
    """
    pred, score = score_model(model, x_val, y_val)
    pos_probs = pred[:, 1]
    print('AUC of {}: {}'.format(title, score), '\n')
    # Best-effort stats: np.mean of an empty slice yields nan rather than
    # raising, but keep the original guard (narrowed from a bare `except:`,
    # which would also have swallowed KeyboardInterrupt/SystemExit).
    try:
        print('mean probability when predicting true: {}'.format(np.mean(pos_probs[pos_probs > 0.5])))
        print('mean probability when predicting false: {}'.format(np.mean(pos_probs[pos_probs < 0.5])), '\n')
    except Exception:
        pass
    print('max probability overall: {}'.format(max(pos_probs)))
    print('min probability overall: {}'.format(min(pos_probs)))
    return pred, score

# Gathering predictions and score:
xgb_toy_id_pred, xgb_toy_id_score = _report_scores(
    'XGB Classifier on In-Distribution Toy Dataset', xgb_toy, x_id, y_id)
xgb_toy_ood_pred, xgb_toy_ood_score = _report_scores(
    'XGB Classifier on Out-of-Distribution Toy Dataset', xgb_toy, x_ood, y_ood)
gpc_toy_id_pred, gpc_toy_id_score = _report_scores(
    'GPC Classifier on In-Distribution Toy Dataset', gpc_toy, x_id, y_id)
gpc_toy_ood_pred, gpc_toy_ood_score = _report_scores(
    'GPC Classifier on Out-of-Distribution Toy Dataset', gpc_toy, x_ood, y_ood)
# Visualizing data: overlay the (already standardized) id vs ood age histograms.
plt.hist(df_age_subset['Age'], label='in-distribution values')
plt.hist(df_age_ood['Age'], label='out-of-distribution values')
plt.title('in-distribution vs out-of-distribution values')
plt.legend(loc='best')
gpc_sdv_id_pred, gpc_sdv_id_score = _report_scores(
    'GPC Classifier on In-Distribution sdv Dataset', gpc_sdv, x_age_id, y_age_id)
gpc_sdv_ood_pred, gpc_sdv_ood_score = _report_scores(
    'GPC Classifier on Out-of-Distribution sdv Dataset', gpc_sdv, x_age_ood, y_age_ood)