!pip install statsmodels
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
import os
import pickle
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
Requirement already satisfied: statsmodels in /root/venv/lib/python3.7/site-packages (0.13.1)
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.2.5)
Requirement already satisfied: scipy>=1.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.7.3)
Requirement already satisfied: patsy>=0.5.2 in /root/venv/lib/python3.7/site-packages (from statsmodels) (0.5.2)
Requirement already satisfied: numpy>=1.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.19.5)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2021.3)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2.8.2)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
WARNING: You are using pip version 20.1.1; however, version 21.3.1 is available.
You should consider upgrading via the '/root/venv/bin/python -m pip install --upgrade pip' command.
# Loading model:
# Load the pre-trained logistic regression from models/log_reg.pickle.
# Judging by the .summary() call below this is a fitted statsmodels results
# object, not the sklearn LogisticRegression imported above.
models_path = os.path.join(os.getcwd(), 'models')
log_reg_model_path = os.path.join(models_path, 'log_reg.pickle')
with open(log_reg_model_path, 'rb') as handle:
    log_reg = pickle.load(handle)
# Chest-X-ray predictions dataframe; 'Model_prediction_edema' is presumably
# the disease model's probability output -- TODO confirm with data source.
df = pd.read_csv('data.csv')
# Binarize the edema probability at a 0.7 decision threshold.
df['edema_prediction']=df.Model_prediction_edema>0.7
# A row is misclassified when the thresholded prediction disagrees with the label.
df['misclassified']=df.edema_prediction!=df.Edema
# Predictor columns fed to the misclassification model.
train_columns = ['Sex','Age','Frontal/Lateral','AP/PA','No Finding','Enlarged Cardiomediastinum','Cardiomegaly',\
    'Consolidation','Pneumonia','Atelectasis',\
    'Pneumothorax','Pleural Effusion','Pleural Other','Support Devices']
#dataset
# One-hot encode categoricals; drop_first=True must match how the pickled
# model was trained -- column order/names are assumed compatible.
x_train = pd.get_dummies(df[train_columns],drop_first=True)
# 0/1 misclassification target.  NOTE(review): Y is not used anywhere below
# in this file -- confirm it is needed before removing.
Y = np.multiply(df.misclassified,1)
# statsmodels .predict returns probabilities (not class labels).
df['misclassification_probabilities'] = log_reg.predict(x_train)
# Parse the coefficient table out of the statsmodels text summary
# (tables[1] is the per-variable coef/std-err/CI table).
logreg_summary = log_reg.summary()
logreg_df = pd.read_html(logreg_summary.tables[1].as_html(), header=0, index_col=0)[0]
# Convert log-odds coefficients to odds ratios; 'odds_distance' measures how
# far each odds ratio is from 1 (no effect).  exp of very large |coef|
# (e.g. 'Pleural Other' at -44.7) overflows -- hence the RuntimeWarning below.
logreg_df['odds'] = np.exp(logreg_df.coef)
logreg_df['odds_distance'] = np.abs(1-logreg_df['odds'])
# Column names '[0.025' and '0.975]' come from the summary-table header split.
logreg_df['odds_lower'] = np.exp(logreg_df['[0.025'])
logreg_df['odds_upper'] = np.exp(logreg_df['0.975]'])
logreg_df['significant_05'] = logreg_df['P>|z|']<0.05
# Show significant predictors first, strongest effect sizes on top.
logreg_df = logreg_df.sort_values(['significant_05','odds_distance'],ascending=False)
logreg_df
/shared-libs/python3.7/py/lib/python3.7/site-packages/pandas/core/arraylike.py:358: RuntimeWarning: overflow encountered in exp
result = getattr(ufunc, method)(*inputs, **kwargs)
coeffloat64
-44.7089 - 2.1246
std errfloat64
0.007 - 8870000000.0
Consolidation
2.1246
0.667
No Finding
-1.6037
0.787
Pleural Effusion
-1.2753
0.604
Age
-0.0145
0.007
Enlarged Cardiomediastinum
0.7244
0.486
Pleural Other
-44.7089
8870000000
AP/PA_PA
-1.0228
0.805
Cardiomegaly
-0.7827
0.507
Atelectasis
-0.7702
0.499
Sex_Male
-0.4319
0.377
# Bucket the edema probabilities into 10 equal-width bins; each bin is
# labelled by its left edge (0.0, 0.1, ..., 0.9).
df['probability_bin'] = pd.cut(df.Model_prediction_edema,bins=np.arange(0,1.1,0.1),labels=np.arange(0,1,0.1))
# Count observations per (bin, label) pair, then pivot so each bin is a row
# with one column per Edema label (0 = no disease, 1 = disease).
prob_buckets = df.groupby(['probability_bin','Edema'])['Path'].count().reset_index()\
    .rename(columns={"Path":"observations"}).pivot_table(columns="Edema",index="probability_bin",values="observations")
prob_buckets.columns.rename(None,inplace=True)
prob_buckets = prob_buckets.reset_index().rename(columns={0:"no_disease",1:"disease"})
# Per-bin disease prevalence (and its complement).
prob_buckets['disease_percent'] = prob_buckets['disease']/(prob_buckets['disease']+prob_buckets['no_disease'])
prob_buckets['no_disease_percent'] = 1 - prob_buckets['disease_percent']
# Round float bin labels for clean axis tick labels.
prob_buckets['probability_bin'] = [np.round(x,2) for x in prob_buckets['probability_bin']]
prob_buckets
# Left panel: raw per-bin counts stacked by label.  Right panel: the same
# bars rescaled so each bar sums to 1.  The orange span and green line mark
# the bins at/above the 0.7 classification threshold; 7.5 is the boundary in
# categorical bar coordinates (between the 0.7 and 0.8 bars).
fig, ax = plt.subplots(1,2, figsize=(18, 5))
ax[0].axvspan(7.5, 9.5, facecolor='orange', alpha=0.2)
ax[0].axvspan(-0.5, 7.5, facecolor='b', alpha=0.2)
ax[0].bar([str(np.round(x,2)) for x in prob_buckets['probability_bin']],prob_buckets['no_disease'])
ax[0].bar([str(np.round(x,2)) for x in prob_buckets['probability_bin']],prob_buckets['disease'],
          bottom=prob_buckets['no_disease'])
ax[0].legend(["No Disease","Disease"])
ax[0].axvline(x=7.5, color='g')
ax[0].set_xlabel("Predicted Probability")
ax[0].set_ylabel("Frequency")
ax[0].set_title("Histogram of probabilty scores")
ax[1].axvspan(7.5, 9.5, facecolor='orange', alpha=0.2)
ax[1].axvspan(-0.5, 7.5, facecolor='b', alpha=0.2)
ax[1].bar([str(np.round(x,2)) for x in prob_buckets['probability_bin']],prob_buckets['no_disease_percent'])
ax[1].bar([str(np.round(x,2)) for x in prob_buckets['probability_bin']],prob_buckets['disease_percent'],
          bottom=prob_buckets['no_disease_percent'])
ax[1].legend(["No Disease","Disease"])
ax[1].axvline(x=7.5, color='g')
ax[1].set_xlabel("Predicted Probability")
ax[1].set_ylabel("Percentage")
ax[1].set_title("Histogram of probabilty scores (Each bar scaled to 100%)");
# Disease probability vs misclassification probability, coloured by whether
# the row was actually misclassified (red) or not (green).  Vertical line =
# 0.7 disease threshold; horizontal line = 0.5 "flipping" threshold.
plt.scatter(df.loc[df.misclassified,'Model_prediction_edema'],df.loc[df.misclassified,'misclassification_probabilities'],
            alpha=0.5,color='r')
plt.scatter(df.loc[~df.misclassified,'Model_prediction_edema'],df.loc[~df.misclassified,'misclassification_probabilities'],
            alpha=0.5,color='g')
plt.xlabel("Disease Prediction Probabilities")
plt.ylabel("Misclassification Probabilities")
plt.title("Disease vs Misclassification Probabilities")
plt.axvline(x=0.7,color='black')
plt.axhline(y=0.5,color='blue')
plt.legend(["Disease Classification Threshold","Flipping Threshold",'Misclassified','Correctly classified'])
# Trailing semicolon suppresses the notebook's repr of the last expression.
;
from sdv.relational import HMA1
import xgboost as xgb
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
import pickle
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
def score_model(model, x_val, y_val):
    """Evaluate a fitted binary classifier on a validation set.

    Args:
        model: fitted classifier exposing ``predict_proba``.
        x_val: validation feature matrix.
        y_val: true binary labels.

    Returns:
        Tuple of (full predict_proba output, ROC-AUC computed from the
        positive-class column).
    """
    probabilities = model.predict_proba(x_val)
    positive_class = probabilities[:, 1]
    auc = roc_auc_score(y_val, positive_class)
    return probabilities, auc
# SDV model/scaler/encoder pathways:
# All artifacts live under ./models relative to the working directory.
models_path = os.path.join(os.getcwd(), 'models')
sdv_path = os.path.join(models_path, 'sdv.pickle')
le_gender_path = os.path.join(models_path, 'le_gender.pickle')
le_fl_path = os.path.join(models_path, 'le_fl.pickle')
le_appa_path = os.path.join(models_path, 'le_appa.pickle')
sc_path = os.path.join(models_path, 'sc.pickle')
xgb_sdv_path = os.path.join(models_path, 'xgb_sdv.pickle')
gpc_sdv_path = os.path.join(models_path, 'gpc_sdv.pickle')
# Toy dataset model pathways:
xgb_toy_path = os.path.join(models_path, 'xgb_toy.pickle')
gpc_toy_path = os.path.join(models_path, 'gpc_toy.pickle')
# Loading SDV models:
# HMA1 is sdv's hierarchical relational synthesizer; sdv here is the fitted
# synthetic-data generator, not the library itself.
sdv = HMA1.load(sdv_path)
# Pickled label encoders / scaler fitted at training time.
# NOTE(review): these are overwritten by freshly-fitted encoders further
# down -- confirm which set is intended to be used.
with open(le_gender_path, 'rb') as handle:
    le_gender = pickle.load(handle)
with open(le_fl_path, 'rb') as handle:
    le_fl = pickle.load(handle)
with open(le_appa_path, 'rb') as handle:
    le_appa = pickle.load(handle)
with open(sc_path, 'rb') as handle:
    sc = pickle.load(handle)
# Misclassification classifiers trained on the SDV synthetic data.
with open(xgb_sdv_path, 'rb') as handle:
    xgb_sdv = pickle.load(handle)
with open(gpc_sdv_path, 'rb') as handle:
    gpc_sdv = pickle.load(handle)
# Loading toy dataset models:
with open(xgb_toy_path, 'rb') as handle:
    xgb_toy = pickle.load(handle)
with open(gpc_toy_path, 'rb') as handle:
    gpc_toy = pickle.load(handle)
# Loading sdv model:
# NOTE(review): this re-loads the exact same sdv pickle already loaded above
# (sdv_path and model_path are identical) -- the second load looks redundant;
# confirm and remove.
model_path = os.path.join(os.getcwd(), 'models', 'sdv.pickle')
sdv = HMA1.load(model_path)
# Reading in sdv data with subset of ages (ages=31-69)
df_age_subset = sdv.sample('df_age_id', num_rows=10000, sample_children=False)
# Reading in sdv data with ood ages (all ages outside subset):
df_age_ood = sdv.sample('df_age_ood', num_rows=10000, sample_children=False)
# Keep only out-of-distribution ages.  NOTE(review): strict inequalities mean
# ages 30 and 70 land in neither the "31-69" subset nor this OOD set --
# confirm the intended boundaries.
df_age_ood = df_age_ood[(df_age_ood['Age'] > 70) | (df_age_ood['Age'] < 30)].reset_index(drop=True)
# Label encoding categorical variables:
# NOTE(review): these fresh, unfitted encoders shadow the pickled encoders
# loaded earlier from models/ -- confirm that re-fitting here is intentional.
le_gender = LabelEncoder()
le_fl = LabelEncoder()
le_appa = LabelEncoder()
# Fitting on in-distribution data:
df_age_subset['Sex'] = le_gender.fit_transform(df_age_subset['Sex'])
df_age_subset['Frontal/Lateral'] = le_fl.fit_transform(df_age_subset['Frontal/Lateral'])
df_age_subset['AP/PA'] = le_appa.fit_transform(df_age_subset['AP/PA'])
# Transforming on ood data:
# transform() (not fit_transform) so OOD rows use the in-distribution
# category mapping; unseen categories would raise here.
df_age_ood['Sex'] = le_gender.transform(df_age_ood['Sex'])
df_age_ood['Frontal/Lateral'] = le_fl.transform(df_age_ood['Frontal/Lateral'])
df_age_ood['AP/PA'] = le_appa.transform(df_age_ood['AP/PA'])
# Scaling age:
# Fit on in-distribution ages only; OOD ages are scaled with the same
# parameters (this also shadows the pickled scaler loaded earlier).
sc = StandardScaler()
df_age_subset['Age'] = sc.fit_transform(df_age_subset['Age'].values.reshape(-1, 1))
df_age_ood['Age'] = sc.transform(df_age_ood['Age'].values.reshape(-1, 1))
# Using only predictor columns:
# Drop disease-model outputs and labels, keeping only the misclassification
# model's predictors plus the 'is_misclassed' target.
df_age_subset = df_age_subset.drop(['Model_prediction_cardiomegaly',
    'Model_prediction_edema', 'Model_prediction_consolidation', 'No Finding',
    'Model_prediction_atelectasis', 'Model_prediction_pleuraleffusion',
    'edema_prediction', 'Edema'], axis=1)
df_age_ood = df_age_ood.drop(['Model_prediction_cardiomegaly',
    'Model_prediction_edema', 'Model_prediction_consolidation', 'No Finding',
    'Model_prediction_atelectasis', 'Model_prediction_pleuraleffusion',
    'edema_prediction', 'Edema'], axis=1)
# Subsetting to x/y:
x_age_id = df_age_subset.drop(['is_misclassed'], axis=1)
y_age_id = df_age_subset['is_misclassed']
x_age_ood = df_age_ood.drop(['is_misclassed'], axis=1)
y_age_ood = df_age_ood['is_misclassed']
# Generating in-distribution and out-of-distribution toy dataframes:
# Same generative process except the OOD set is shifted by +6 in feature
# space (and slightly more imbalanced), so it lies far from the training
# distribution while remaining separable.
id_data = make_classification(n_samples=20000, n_features=2, n_informative=2,
                              n_redundant=0, weights=[0.85, 0.15], random_state=27)
ood_data = make_classification(n_samples=20000, n_features=2, n_informative=2,
                               n_redundant=0, shift=6, weights=[0.9, 0.1], random_state=27)
id_df = pd.DataFrame({'x_1': id_data[0][:,0], 'x_2': id_data[0][:,1], 'is_misclassed': id_data[1]})
ood_df = pd.DataFrame({'x_1': ood_data[0][:,0], 'x_2': ood_data[0][:,1], 'is_misclassed': ood_data[1]})
# Splitting into x and y variables:
x_id = id_df.drop(['is_misclassed'], axis=1)
y_id = id_df['is_misclassed']
x_ood = ood_df.drop(['is_misclassed'], axis=1)
y_ood = ood_df['is_misclassed']
# Visualizing toy data:
# Split each set by class so the scatter plot can colour by class (red/blue)
# and mark ID vs OOD by marker shape (dot vs x).
id_df_pos = id_df[id_df['is_misclassed'] == 1]
id_df_neg = id_df[id_df['is_misclassed'] == 0]
ood_df_pos = ood_df[ood_df['is_misclassed'] == 1]
ood_df_neg = ood_df[ood_df['is_misclassed'] == 0]
x_ood_pos = ood_df_pos.drop(['is_misclassed'], axis=1)
y_ood_pos = ood_df_pos['is_misclassed']
x_ood_neg = ood_df_neg.drop(['is_misclassed'], axis=1)
y_ood_neg = ood_df_neg['is_misclassed']
x_id_pos = id_df_pos.drop(['is_misclassed'], axis=1)
y_id_pos = id_df_pos['is_misclassed']
x_id_neg = id_df_neg.drop(['is_misclassed'], axis=1)
y_id_neg = id_df_neg['is_misclassed']
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
ax.scatter(x_id_neg['x_1'], x_id_neg['x_2'], alpha=0.3, color='blue', label='class 0')
ax.scatter(x_id_pos['x_1'], x_id_pos['x_2'], alpha=0.3, color='red', label='class 1')
ax.scatter(x_ood_neg['x_1'], x_ood_neg['x_2'], alpha=0.3, marker='x', color='blue', label='test class 0')
ax.scatter(x_ood_pos['x_1'], x_ood_pos['x_2'], alpha=0.3, marker='x', color='red', label='test class 1')
ax.set_xlabel('x_1')
ax.set_ylabel('x_2')
ax.set_title('toy classification data')
ax.legend(loc='best')
plt.show()
# Gathering predictions and score:
# XGB on the in-distribution toy data.
xgb_toy_id_pred, xgb_toy_id_score = score_model(xgb_toy, x_id, y_id)
print('AUC of XGB Classifier on In-Distribution Toy Dataset: {}'.format(xgb_toy_id_score), '\n')
# Mean predicted probability on each side of the 0.5 decision boundary.
# np.mean of an empty slice returns nan (with a RuntimeWarning) rather than
# raising; the except is a last-resort guard.  A bare `except:` would also
# swallow SystemExit/KeyboardInterrupt, so catch Exception instead.
try:
    print('mean probability when predicting true: {}'.format(np.mean(xgb_toy_id_pred[:,1][xgb_toy_id_pred[:,1] > 0.5])))
    print('mean probability when predicting false: {}'.format(np.mean(xgb_toy_id_pred[:,1][xgb_toy_id_pred[:,1] < 0.5])),
          '\n')
except Exception:
    pass
print('max probability overall: {}'.format(max(xgb_toy_id_pred[:,1])))
print('min probability overall: {}'.format(min(xgb_toy_id_pred[:,1])))
AUC of XGB Classifier on In-Distribution Toy Dataset: 0.8937230519999
mean probability when predicting true: 0.9070900678634644
mean probability when predicting false: 0.03598755598068237
max probability overall: 1.0
min probability overall: 2.211924687856026e-08
# XGB on the out-of-distribution toy data: AUC near 0.5 and uniformly high
# probabilities illustrate XGB's overconfidence off-distribution.
xgb_toy_ood_pred, xgb_toy_ood_score = score_model(xgb_toy, x_ood, y_ood)
print('AUC of XGB Classifier on Out-of-Distribution Toy Dataset: {}'.format(xgb_toy_ood_score), '\n')
# np.mean of an empty slice returns nan (with a RuntimeWarning) rather than
# raising; catch Exception (never a bare `except:`, which would also swallow
# SystemExit/KeyboardInterrupt).
try:
    print('mean probability when predicting true: {}'.format(np.mean(xgb_toy_ood_pred[:,1][xgb_toy_ood_pred[:,1] > 0.5])))
    print('mean probability when predicting false: {}'.format(np.mean(xgb_toy_ood_pred[:,1][xgb_toy_ood_pred[:,1] < 0.5])),
          '\n')
except Exception:
    pass
print('max probability overall: {}'.format(max(xgb_toy_ood_pred[:,1])))
print('min probability overall: {}'.format(min(xgb_toy_ood_pred[:,1])))
AUC of XGB Classifier on Out-of-Distribution Toy Dataset: 0.4991041380494505
mean probability when predicting true: 0.9997965693473816
mean probability when predicting false: nan
max probability overall: 0.9999195337295532
min probability overall: 0.9815993905067444
C:\Users\ddbai\anaconda3\envs\final-proj-env\lib\site-packages\numpy\core\fromnumeric.py:3419: RuntimeWarning: Mean of empty slice.
return _methods._mean(a, axis=axis, dtype=dtype,
C:\Users\ddbai\anaconda3\envs\final-proj-env\lib\site-packages\numpy\core\_methods.py:188: RuntimeWarning: invalid value encountered in true_divide
ret = ret.dtype.type(ret / rcount)
# Gaussian process classifier on the in-distribution toy data.
gpc_toy_id_pred, gpc_toy_id_score = score_model(gpc_toy, x_id, y_id)
print('AUC of GPC Classifier on In-Distribution Toy Dataset: {}'.format(gpc_toy_id_score), '\n')
# np.mean of an empty slice returns nan (with a RuntimeWarning) rather than
# raising; catch Exception (never a bare `except:`, which would also swallow
# SystemExit/KeyboardInterrupt).
try:
    print('mean probability when predicting true: {}'.format(np.mean(gpc_toy_id_pred[:,1][gpc_toy_id_pred[:,1] > 0.5])))
    print('mean probability when predicting false: {}'.format(np.mean(gpc_toy_id_pred[:,1][gpc_toy_id_pred[:,1] < 0.5])), '\n')
except Exception:
    pass
print('max probability overall: {}'.format(max(gpc_toy_id_pred[:,1])))
print('min probability overall: {}'.format(min(gpc_toy_id_pred[:,1])))
AUC of GPC Classifier on In-Distribution Toy Dataset: 0.8837065825751177
mean probability when predicting true: 0.8902085862531139
mean probability when predicting false: 0.05242018663763399
max probability overall: 0.9971661372757126
min probability overall: 0.0022328589567450763
# GPC on the out-of-distribution toy data: probabilities collapse toward 0.5,
# i.e. the GPC signals uncertainty off-distribution where XGB was overconfident.
gpc_toy_ood_pred, gpc_toy_ood_score = score_model(gpc_toy, x_ood, y_ood)
print('AUC of GPC Classifier on Out-of-Distribution Toy Dataset: {}'.format(gpc_toy_ood_score), '\n')
# np.mean of an empty slice returns nan (with a RuntimeWarning) rather than
# raising; catch Exception (never a bare `except:`, which would also swallow
# SystemExit/KeyboardInterrupt).
try:
    print('mean probability when predicting true: {}'.format(np.mean(gpc_toy_ood_pred[:,1][gpc_toy_ood_pred[:,1] > 0.5])))
    print('mean probability when predicting false: {}'.format(np.mean(gpc_toy_ood_pred[:,1][gpc_toy_ood_pred[:,1] < 0.5])),
          '\n')
except Exception:
    pass
print('max probability overall: {}'.format(max(gpc_toy_ood_pred[:,1])))
print('min probability overall: {}'.format(min(gpc_toy_ood_pred[:,1])))
AUC of GPC Classifier on Out-of-Distribution Toy Dataset: 0.13254182584993132
mean probability when predicting true: 0.5012118882116352
mean probability when predicting false: 0.49999999612742785
max probability overall: 0.6988604419909734
min probability overall: 0.49999999500017694
# Visualizing data:
# Overlaid age histograms showing the in-distribution vs OOD split.
plt.hist(df_age_subset['Age'], label='in-distribution values')
plt.hist(df_age_ood['Age'], label='out-of-distribution values')
plt.title('in-distribution vs out-of-distribution values')
plt.legend(loc='best')
# GPC on the in-distribution sdv data.
gpc_sdv_id_pred, gpc_sdv_id_score = score_model(gpc_sdv, x_age_id, y_age_id)
print('AUC of GPC Classifier on In-Distribution sdv Dataset: {}'.format(gpc_sdv_id_score), '\n')
# np.mean of an empty slice returns nan (with a RuntimeWarning) rather than
# raising; catch Exception (never a bare `except:`, which would also swallow
# SystemExit/KeyboardInterrupt).
try:
    print('mean probability when predicting true: {}'.format(np.mean(gpc_sdv_id_pred[:,1][gpc_sdv_id_pred[:,1] > 0.5])))
    print('mean probability when predicting false: {}'.format(np.mean(gpc_sdv_id_pred[:,1][gpc_sdv_id_pred[:,1] < 0.5])), '\n')
except Exception:
    pass
print('max probability overall: {}'.format(max(gpc_sdv_id_pred[:,1])))
print('min probability overall: {}'.format(min(gpc_sdv_id_pred[:,1])))
AUC of GPC Classifier on In-Distribution sdv Dataset: 0.6251323086687633
mean probability when predicting true: 0.5570559413785936
mean probability when predicting false: 0.25510980201812594
max probability overall: 0.6727652252548637
min probability overall: 0.04489319533820435
# GPC on the out-of-distribution sdv data: expect probabilities pulled toward
# 0.5 relative to the in-distribution run, mirroring the toy-data result.
gpc_sdv_ood_pred, gpc_sdv_ood_score = score_model(gpc_sdv, x_age_ood, y_age_ood)
print('AUC of GPC Classifier on Out-of-Distribution sdv Dataset: {}'.format(gpc_sdv_ood_score), '\n')
# np.mean of an empty slice returns nan (with a RuntimeWarning) rather than
# raising; catch Exception (never a bare `except:`, which would also swallow
# SystemExit/KeyboardInterrupt).
try:
    print('mean probability when predicting true: {}'.format(np.mean(gpc_sdv_ood_pred[:,1][gpc_sdv_ood_pred[:,1] > 0.5])))
    print('mean probability when predicting false: {}'.format(np.mean(gpc_sdv_ood_pred[:,1][gpc_sdv_ood_pred[:,1] < 0.5])),
          '\n')
except Exception:
    pass
print('max probability overall: {}'.format(max(gpc_sdv_ood_pred[:,1])))
print('min probability overall: {}'.format(min(gpc_sdv_ood_pred[:,1])))
AUC of GPC Classifier on Out-of-Distribution sdv Dataset: 0.5349234875620905
mean probability when predicting true: 0.5153369494828355
mean probability when predicting false: 0.4346178102434051
max probability overall: 0.6566209390911126
min probability overall: 0.1215429100699339