import pandas as pd
from xgboost import XGBRegressor, DMatrix
import shap
from sklearn.model_selection import train_test_split
import numpy as np
import transformers
import nlp
import scipy as sp
import logging
from transformers import AutoTokenizer, AutoModelForCausalLM,AutoModelWithLMHead, AutoTokenizer
import torch
import datasets
# Replace `get_verbosity` on both libraries' logging modules with a stub that
# always returns logging.NOTSET.
# NOTE(review): presumably a workaround for noisy or incompatible logging in
# the hosted notebook environment - confirm it is still required.
for _patched_package in (transformers, datasets):
    _patched_package.logging.get_verbosity = lambda: logging.NOTSET
Machine Learning Explainability
What are SHAP Values?
How do they do this?
The SHAP Library
Example Use-cases
Tabular Data
What makes a good OkCupid date?
# --- Tabular example: build an income-regression dataset from the profiles CSV ---
df = pd.read_csv("/work/okcupid_profiles.csv", engine='python')
# Untouched copy of the raw data, kept so integer category codes can be mapped
# back to their original labels later on.
original = df.copy()
df.head()

# Keep only rows with a positive income. The explicit .copy() makes `df` an
# independent frame, so the column assignments below do not write into a slice
# of the raw data (pandas chained-assignment hazard).
df = df[df.income > 0].copy()

# DataFrame.drop returns a new frame; the result has to be rebound, otherwise
# the column is silently kept. The free-text essay columns are dropped in the
# same step so they are never encoded at all.
df = df.drop(columns=['last_online'])
df = df.drop(columns=[col for col in df.columns if 'essay' in col])

# Encode every non-numeric column as integer category codes (missing values
# become -1). Numeric columns are left untouched so their real values survive;
# comparing against the literal 'int64' also re-coded float columns.
for column in df.columns:
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].astype('category').cat.codes

# Impute remaining numeric gaps with the most frequent value. Series.mode()
# returns a Series (there may be ties); passing it straight to fillna aligns on
# index labels instead of filling, so take the first mode as a scalar.
for col in df.select_dtypes(include=np.number):
    modes = df[col].mode()
    if not modes.empty:  # an all-NaN column has no mode; leave it as-is
        df[col] = df[col].fillna(modes.iloc[0])
df.head()

# Target is the reported income; every remaining column is a feature.
y = df.pop('income').copy()
X = df.copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the regressor that is explained below. Without this step `model` is
# undefined at this point in the file (it is only bound later, to GPT-2).
# NOTE(review): default hyperparameters are an assumption - restore the
# original training settings if they are known.
model = XGBRegressor()
model.fit(X_train, y_train)

# Compute per-feature SHAP attributions for every row of the held-out set.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Inspect the features and the true income of the test row explained below.
X_test.iloc[100]
y_test.iloc[100]
import tempfile
# `display` and `HTML` are part of the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Load the JavaScript that SHAP's interactive plots need in the notebook.
shap.initjs()
def shap_deepnote_show(plot):
    """Render a SHAP plot inline by round-tripping it through an HTML file.

    The plot is written with ``shap.save_html``, read back as text and handed
    to IPython's ``display(HTML(...))``.

    Args:
        plot: A SHAP visualisation object accepted by ``shap.save_html``
            (for example the return value of ``shap.force_plot``).

    Returns:
        None. The plot is shown as a side effect.
    """
    import os  # local import: `os` is not among the file's module-level imports

    # A private temporary directory gives a path nobody else can claim and
    # guarantees the exported file is removed afterwards. The previous approach
    # took the name of an already-discarded NamedTemporaryFile and left the
    # file written by save_html behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_output_filename = os.path.join(tmp_dir, "shap_plot.html")
        shap.save_html(tmp_output_filename, plot)
        # Context manager closes the handle (it was leaked before).
        # NOTE(review): read as UTF-8 on the assumption that save_html writes
        # UTF-8, as current shap releases do - confirm for the pinned version.
        with open(tmp_output_filename, "r", encoding="utf-8") as f:
            data = f.read()
    display(HTML(data))
# Force plot explaining the model's prediction for test row 100.
row_index = 100
row_force_plot = shap.force_plot(
    explainer.expected_value,
    shap_values[row_index],
    X_test.iloc[row_index].values,
    feature_names=X_test.columns,
)
shap_deepnote_show(row_force_plot)

# Lookup table from integer category code back to the original religion label.
religion_labels = original['religion'].astype('category').cat.categories
dict(enumerate(religion_labels))

# Compare the mean training target with the explainer's base value.
y_train.mean()
explainer.expected_value
Transformers and Text Generation
# Load the pretrained GPT-2 tokenizer and causal language model.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Flag the model as a decoder in its config.
model.config.is_decoder = True

# Register the sampling settings under the "text-generation" task entry.
model.config.task_specific_params.update({
    "text-generation": dict(
        do_sample=True,
        max_length=50,
        temperature=0.4,
        top_k=50,
        no_repeat_ngram_size=2,
    ),
})

prompt = [
    'Math is probably the most difficult subject in school. The problem is ',
]

# Build an explainer from the model/tokenizer pair, explain the prompt and
# render the attributions as annotated text.
explainer = shap.Explainer(model, tokenizer)
shap_values = explainer(prompt)
shap.plots.text(shap_values)
Computer Vision
from tensorflow import keras

# Load the saved Keras CNN that is explained below.
model = keras.models.load_model('cnn.h5')

# Seed NumPy's global RNG so the background sample is reproducible. This must
# be a call: assigning `np.random.seed = 356` replaces the function with an
# int and seeds nothing.
np.random.seed(356)

# NOTE(review): `x_train` and `x_test` are not defined anywhere in the visible
# file - they must be loaded before this section runs.
# Select a set of background examples to take an expectation over.
background = x_train[np.random.choice(x_train.shape[0], 100, replace=False)]

# Explain the model's predictions on five consecutive test images.
e = shap.DeepExplainer(model, background)
start = 10
images_to_explain = x_test[start:start + 5]
shap_values = e.shap_values(images_to_explain)

# Plot the feature attributions next to the images they belong to.
shap.image_plot(shap_values, images_to_explain)