import sys
import math
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# import plotly
import plotly.express as px
# import plotly.offline
# import cufflinks as cf
# cf.go_offline()
# cf.set_config_file(offline=False, world_readable=True)
# import ipywidgets as widgets
# from ipywidgets import interact
sns.set_style('whitegrid')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings('ignore')
print(f'Python version: {sys.version}')
print(f'pandas version: {pd.__version__}')
print(f'numpy version: {np.__version__}')
print(f'seaborn version: {sns.__version__}')
pd.Timestamp.now()
# pd.Timestamp.now().strftime('%Y-%m-%d')
Python version: 3.7.12 (default, Oct 12 2021, 03:36:26)
[GCC 8.3.0]
pandas version: 1.2.5
numpy version: 1.19.5
seaborn version: 0.11.2
url = 'https://raw.githubusercontent.com/GuySuphakit/Heart-Failure-Prediction/main/heart.csv'
temp_df = pd.read_csv(url)
temp_df.head()
temp_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 918 non-null int64
1 Sex 918 non-null object
2 ChestPainType 918 non-null object
3 RestingBP 918 non-null int64
4 Cholesterol 918 non-null int64
5 FastingBS 918 non-null int64
6 RestingECG 918 non-null object
7 MaxHR 918 non-null int64
8 ExerciseAngina 918 non-null object
9 Oldpeak 918 non-null float64
10 ST_Slope 918 non-null object
11 HeartDisease 918 non-null int64
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB
def read_data():
url = 'https://raw.githubusercontent.com/GuySuphakit/Heart-Failure-Prediction/main/heart.csv'
df = pd.read_csv(url)
return df
# def change_label_of_HeartDisease(df):
#     # change the label of HeartDisease because it has caused some ambiguity.
#     df.HeartDisease = df.HeartDisease.replace({0: 'Normal', 1: 'Heart Disease'})
#     return df
def convert_FastingBS_to_category(df):
df.FastingBS = df.FastingBS.astype('category')
return df
def convert_obj_columns_to_category(df):
for c in df.columns:
col_type = df[c].dtype
if col_type == 'object' or col_type.name == 'category':
df[c] = df[c].astype('category')
return df
def drop_duplicate(df):
df = df.drop_duplicates()
return df
df = (read_data()
.pipe(convert_FastingBS_to_category)
.pipe(convert_obj_columns_to_category)
.pipe(drop_duplicate))
df.head()
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 918 entries, 0 to 917
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 918 non-null int64
1 Sex 918 non-null category
2 ChestPainType 918 non-null category
3 RestingBP 918 non-null int64
4 Cholesterol 918 non-null int64
5 FastingBS 918 non-null category
6 RestingECG 918 non-null category
7 MaxHR 918 non-null int64
8 ExerciseAngina 918 non-null category
9 Oldpeak 918 non-null float64
10 ST_Slope 918 non-null category
11 HeartDisease 918 non-null int64
dtypes: category(6), float64(1), int64(5)
memory usage: 56.4 KB
def print_category_columns(df):
for c in df.columns:
col_type = df[c].dtype
if col_type.name == 'category':
print(f'{c:15}: {list(enumerate(df[c].cat.categories))}')
print('-' * 60)
print_category_columns(df)
Sex : [(0, 'F'), (1, 'M')]
------------------------------------------------------------
ChestPainType : [(0, 'ASY'), (1, 'ATA'), (2, 'NAP'), (3, 'TA')]
------------------------------------------------------------
FastingBS : [(0, 0), (1, 1)]
------------------------------------------------------------
RestingECG : [(0, 'LVH'), (1, 'Normal'), (2, 'ST')]
------------------------------------------------------------
ExerciseAngina : [(0, 'N'), (1, 'Y')]
------------------------------------------------------------
ST_Slope : [(0, 'Down'), (1, 'Flat'), (2, 'Up')]
------------------------------------------------------------
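# Optional sketch (not part of the original flow): each category column above carries integer
# codes behind its labels, which is what a simple ordinal encoding would use downstream.
encoded = df.copy()
for c in encoded.select_dtypes('category').columns:
    encoded[c] = encoded[c].cat.codes  # e.g. Sex: 'F' -> 0, 'M' -> 1
encoded[['Sex', 'ChestPainType', 'ST_Slope']].head()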
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 918 entries, 0 to 917
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 918 non-null int64
1 Sex 918 non-null category
2 ChestPainType 918 non-null category
3 RestingBP 918 non-null int64
4 Cholesterol 918 non-null int64
5 FastingBS 918 non-null category
6 RestingECG 918 non-null category
7 MaxHR 918 non-null int64
8 ExerciseAngina 918 non-null category
9 Oldpeak 918 non-null float64
10 ST_Slope 918 non-null category
11 HeartDisease 918 non-null int64
dtypes: category(6), float64(1), int64(5)
memory usage: 56.4 KB
def plot_mn(df, cols, n_rows: int = 1, kind: str = 'boxplot', color='salmon'):
    """
    Plot boxplot, boxen, violin, or hist panels in m (rows) by n (columns).
    >>> plot_mn(df, ['Age', 'Cholesterol'], 2, 'hist')
    """
    n = len(cols)
    n_cols = math.ceil(n / n_rows)
    fig, ax = plt.subplots(n_rows, n_cols, figsize=(n_cols*3, n_rows*3.5))
    ax = ax.ravel()
    fig.tight_layout()
    for i, c in enumerate(cols):
        col_type = df[c].dtype
        if col_type.name == 'category':
            sns.countplot(data=df, x=c, ax=ax[i])
        else:
            if kind.lower() == 'boxplot':
                sns.boxplot(data=df[[c]], ax=ax[i], color=color)
            elif kind.lower() == 'boxen':
                sns.boxenplot(data=df[[c]], ax=ax[i], color=color)
            elif kind.lower() == 'violin':
                sns.violinplot(data=df[[c]], ax=ax[i], color=color)
            elif kind.lower() == 'hist':
                sns.histplot(df[c], kde=False, ax=ax[i], color=color)
plot_mn(df, df.columns, 3, 'hist')
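# The same helper can render box plots for a quick outlier check of the numeric columns
# (an optional extra call, assuming the column names listed above).
plot_mn(df, ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak'], n_rows=2, kind='boxplot')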
df.columns
df.describe().T.round(1)
df.describe(include='category').T
numerical= df.select_dtypes('int64').columns
categorical = df.select_dtypes('category').columns
print(f'Numerical Columns: {df[numerical].columns}')
print('-'* 100)
print(f'Categorical Columns: {df[categorical].columns}')
Numerical Columns: Index(['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'HeartDisease'], dtype='object')
----------------------------------------------------------------------------------------------------
Categorical Columns: Index(['Sex', 'ChestPainType', 'FastingBS', 'RestingECG', 'ExerciseAngina',
'ST_Slope'],
dtype='object')
df.nunique().sort_values(ascending=False)
df["Age"].describe().round(1)
df.Age.mode()[0]
print(stats.skew(df.Age))
-0.19561273124487544
fig = px.histogram(df, x='Age', marginal='box')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
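# The per-column summaries below repeat the same describe/mode/skew calls; a small helper
# like this (a sketch, not part of the original notebook) bundles them for any numeric column.
def summarize_numeric(data, col):
    print(data[col].describe().round(1))
    print(f'mode: {data[col].mode()[0]}')
    print(f'skew: {data[col].skew():.3f}')

summarize_numeric(df, 'Age')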
df.Sex.value_counts()
temp_df = df.groupby(by="Sex", as_index=False).agg(
counts=pd.NamedAgg(column="Sex", aggfunc="count")).sort_values(by="counts", ascending=False)
fig = px.bar(temp_df,
x='Sex',
y='counts',
color='Sex',
color_discrete_sequence=px.colors.qualitative.D3)
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df.ChestPainType.value_counts()
temp_df = df.groupby(by="ChestPainType", as_index=False).agg(
counts=pd.NamedAgg(column="ChestPainType", aggfunc="count")).sort_values(by="counts", ascending=False)
fig = px.bar(temp_df,
x='ChestPainType',
y='counts',
color='ChestPainType',
color_discrete_sequence = px.colors.qualitative.D3)
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df.RestingBP.describe().round()
df.RestingBP.mode()[0]
print(stats.skew(df.RestingBP))
0.17954532149156327
fig = px.histogram(df, x='RestingBP', marginal='box')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df[df['RestingBP'] == 0]
df.Cholesterol.describe().round()
df.Cholesterol.mode()[0]
print(stats.skew(df.Cholesterol))
-0.6090891046626045
fig = px.histogram(df, x='Cholesterol', marginal='box')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df[df['Cholesterol'] == 0].head()
df[df['Cholesterol'] >= 500]['Cholesterol']
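# Quick count of the physiologically implausible zeros flagged above (a sanity check,
# not in the original flow); these values are handled later in data_preprocessing().
(df[['RestingBP', 'Cholesterol']] == 0).sum()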
df.FastingBS.value_counts()
fig = px.histogram(df, x='FastingBS', color='FastingBS')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df["RestingECG"].value_counts()
temp_df = df.groupby(by="RestingECG", as_index=False).agg(
counts=pd.NamedAgg(column="RestingECG", aggfunc="count")).sort_values(by="counts", ascending=False)
fig = px.bar(temp_df,
x='RestingECG',
y='counts',
color='RestingECG',
color_discrete_sequence = px.colors.qualitative.D3)
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df.MaxHR.describe().round()
df.MaxHR.mode()[0]
df.MaxHR.skew()
fig = px.histogram(df, x='MaxHR', marginal='box')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df.ExerciseAngina.value_counts()
temp_df = df.groupby(by="ExerciseAngina", as_index=False).agg(
counts=pd.NamedAgg(column="ExerciseAngina", aggfunc="count")).sort_values(by="counts", ascending=False)
fig = px.bar(temp_df,
x='ExerciseAngina',
y='counts',
color='ExerciseAngina',
color_discrete_sequence=px.colors.qualitative.D3)
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df.Oldpeak.describe().round()
df.Oldpeak.mode()[0]
df.Oldpeak.skew()
fig = px.histogram(df, x='Oldpeak', marginal='box')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df.ST_Slope.value_counts()
temp_df = df.groupby(by="ST_Slope", as_index=False).agg(
counts=pd.NamedAgg(column="ST_Slope", aggfunc="count")).sort_values(by="counts", ascending=False)
fig = px.bar(temp_df,
x='ST_Slope',
y='counts',
color='ST_Slope',
color_discrete_sequence=px.colors.qualitative.D3)
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df.HeartDisease.value_counts()
fig = px.histogram(df, x='HeartDisease', color='HeartDisease')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
dcorr = df.corr()
# dcorr
mask = np.zeros_like(dcorr)
# mask.shape
mask[np.triu_indices_from(mask)] = True
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(dcorr, cmap=sns.diverging_palette(10, 145, n=100),
vmin=-1, vmax=1, center=0, linewidths=1, annot=True, mask=mask, ax=ax).set_title("Correlation\nHeatmap", fontsize=22,fontweight="bold");
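# df.corr() above only sees the numeric columns. As a rough companion check (a sketch that
# treats category codes as ordinal, which is a simplification), the categorical features can
# be folded in by mapping them to their integer codes first.
df_codes = df.copy()
for c in df_codes.select_dtypes('category').columns:
    df_codes[c] = df_codes[c].cat.codes
df_codes.corr()['HeartDisease'].sort_values(ascending=False)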
!pip install statsmodels
Requirement already satisfied: statsmodels in /root/venv/lib/python3.7/site-packages (0.13.1)
Requirement already satisfied: patsy>=0.5.2 in /root/venv/lib/python3.7/site-packages (from statsmodels) (0.5.2)
Requirement already satisfied: numpy>=1.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.19.5)
Requirement already satisfied: scipy>=1.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.7.2)
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.2.5)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2021.3)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2.8.2)
corr_value = df["Age"].corr(df["RestingBP"])
fig = px.scatter(
data_frame=df,
x="Age",
y="RestingBP",
marginal_x="histogram",
marginal_y="histogram",
trendline="ols",
trendline_color_override="red",
title=f"Correlation between Age and RestingBP is: {corr_value:.2f}",
)
fig.show()
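# trendline="ols" fits a statsmodels OLS model under the hood; if the slope and R² are of
# interest, the fitted results can be pulled back out of the figure (a sketch using the
# plotly express helper for trendline results).
trend = px.get_trendline_results(fig)
print(trend.px_fit_results.iloc[0].summary())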
corr_value = df["Age"].corr(df["Cholesterol"])
fig = px.scatter(
data_frame=df,
x="Age",
y="Cholesterol",
marginal_x="histogram",
marginal_y="histogram",
trendline="ols",
trendline_color_override="red",
title=f"Correlation between Age and Cholester0l is: {corr_value:.2f}",
)
fig.show()
corr_value = df["Age"].corr(df["Oldpeak"])
fig = px.scatter(
data_frame=df,
x="Age",
y="Oldpeak",
marginal_x="histogram",
marginal_y="histogram",
trendline="ols",
trendline_color_override="red",
title=f"Correlation between Age and Oldpeak is: {corr_value:.2f}",
)
fig.show()
corr_value = df["Age"].corr(df["MaxHR"])
fig = px.scatter(
data_frame=df,
x="Age",
y="MaxHR",
marginal_x="histogram",
marginal_y="histogram",
trendline="ols",
trendline_color_override="red",
title=f"Correlation between Age and MaxHR is: {corr_value:.2f}",
)
fig.show()
fig = px.density_heatmap(
data_frame=df, x="Age", y="RestingBP", color_continuous_scale="PuBu"
)
fig.show()
fig = px.density_heatmap(
data_frame=df, x="Age", y="Cholesterol", color_continuous_scale="PuBu"
)
fig.show()
fig = px.density_heatmap(
data_frame=df, x="Age", y="MaxHR", color_continuous_scale="PuBu"
)
fig.show()
fig = px.density_heatmap(
data_frame=df, x="Age", y="Oldpeak", color_continuous_scale="PuBu"
)
fig.show()
sns.pairplot(df, hue="HeartDisease", palette="husl", corner=True);
px.density_heatmap(
data_frame=df,
x="Age",
y="RestingBP",
color_continuous_scale="PuBu",
facet_col="HeartDisease",
title="Age vs. Chelesterol for different Heart Disease values",
)
px.density_heatmap(
data_frame=df,
x="Age",
y="Cholesterol",
color_continuous_scale="PuBu",
facet_col="HeartDisease",
title="Age vs. Chelesterol for different Heart Disease values",
)
px.density_heatmap(
data_frame=df,
x="Age",
y="MaxHR",
color_continuous_scale="PuBu",
facet_col="HeartDisease",
title="Age vs. Chelesterol for different Heart Disease values",
)
def num_plot(df, col):
fig = px.histogram(df, x=col, color="HeartDisease",
marginal="box")
fig.update_layout(height=400, width=700, showlegend=True)
fig.update_traces(marker_line_width=1,marker_line_color="black")
fig.show()
cols= df.columns
for col in cols:
num_plot(df, col)
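# A compact numeric companion to the per-feature histograms above: class-wise means of the
# numeric columns (a quick summary, not part of the original notebook flow).
df.groupby('HeartDisease')[['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']].mean().round(1)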
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, ShuffleSplit, cross_val_score, cross_val_predict, cross_validate
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, precision_recall_fscore_support, f1_score
feature_cols = ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope']
target_cols = 'HeartDisease'
X = df[feature_cols]
y = df[target_cols]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
def data_preprocessing(X_train):
    '''Replace implausible zeros (and extreme Cholesterol outliers) with the training mean
    to make the distributions more normal.'''
    mean_chol = X_train.loc[X_train['Cholesterol'] > 0, 'Cholesterol'].mean()
    mean_rest = X_train.loc[X_train['RestingBP'] > 0, 'RestingBP'].mean()
    X_train.loc[(X_train['Cholesterol'] == 0) | (X_train['Cholesterol'] >= 500), 'Cholesterol'] = mean_chol
    X_train.loc[X_train['RestingBP'] == 0, 'RestingBP'] = mean_rest
    return X_train
X_train = data_preprocessing(X_train)
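# The imputation above is fit on (and applied to) the training split only; the recorded run
# below leaves X_test untouched, so its zeros survive into the test data. A minimal,
# hypothetical sketch of applying the same train-derived rules to the test split
# (X_test_imputed is a new name and is not used elsewhere in this notebook; note the means
# are recomputed from the already-imputed X_train, a close approximation of the original
# training means).
mean_chol = X_train.loc[X_train['Cholesterol'] > 0, 'Cholesterol'].mean()
mean_rest = X_train.loc[X_train['RestingBP'] > 0, 'RestingBP'].mean()
X_test_imputed = X_test.copy()
X_test_imputed.loc[(X_test_imputed['Cholesterol'] == 0) | (X_test_imputed['Cholesterol'] >= 500), 'Cholesterol'] = mean_chol
X_test_imputed.loc[X_test_imputed['RestingBP'] == 0, 'RestingBP'] = mean_rest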
df.columns
y_train.value_counts()
!pip install flaml
Collecting flaml
Downloading FLAML-0.7.1-py3-none-any.whl (160 kB)
|████████████████████████████████| 160 kB 30.9 MB/s
Collecting xgboost<=1.3.3,>=0.90
Downloading xgboost-1.3.3-py3-none-manylinux2010_x86_64.whl (157.5 MB)
|████████████████████████████████| 157.5 MB 112 kB/s
Requirement already satisfied: NumPy>=1.16.2 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from flaml) (1.19.5)
Requirement already satisfied: scipy>=1.4.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from flaml) (1.7.2)
Requirement already satisfied: pandas>=1.1.4 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from flaml) (1.2.5)
Collecting lightgbm>=2.3.1
Downloading lightgbm-3.3.1-py3-none-manylinux1_x86_64.whl (2.0 MB)
|████████████████████████████████| 2.0 MB 42.8 MB/s
Requirement already satisfied: scikit-learn>=0.24 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from flaml) (1.0.1)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=1.1.4->flaml) (2021.3)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=1.1.4->flaml) (2.8.2)
Requirement already satisfied: wheel in /usr/local/lib/python3.7/site-packages (from lightgbm>=2.3.1->flaml) (0.37.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn>=0.24->flaml) (3.0.0)
Requirement already satisfied: joblib>=0.11 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn>=0.24->flaml) (1.1.0)
Requirement already satisfied: six>=1.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas>=1.1.4->flaml) (1.16.0)
Installing collected packages: xgboost, lightgbm, flaml
Successfully installed flaml-0.7.1 lightgbm-3.3.1 xgboost-1.3.3
from flaml import AutoML
automl = AutoML()
settings = {
"time_budget": 180, # total running time in seconds
"metric": 'accuracy', # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
# 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'
"task": 'classification', # task type
"log_file_name": 'airlines_experiment.log', # flaml log file
}
automl.fit(X_train=X_train, y_train=y_train, **settings)
[flaml.automl: 11-16 11:46:57] {1485} INFO - Data split method: stratified
[flaml.automl: 11-16 11:46:57] {1489} INFO - Evaluation method: cv
[flaml.automl: 11-16 11:46:57] {1540} INFO - Minimizing error metric: 1-accuracy
[flaml.automl: 11-16 11:46:57] {1577} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'lrl1']
[flaml.automl: 11-16 11:46:57] {1826} INFO - iteration 0, current learner lgbm
[flaml.automl: 11-16 11:46:57] {1944} INFO - Estimated sufficient time budget=2245s. Estimated necessary time budget=37s.
[flaml.automl: 11-16 11:46:57] {2029} INFO - at 0.3s, estimator lgbm's best error=0.1769, best estimator lgbm's best error=0.1769
[flaml.automl: 11-16 11:46:57] {1826} INFO - iteration 1, current learner lgbm
[flaml.automl: 11-16 11:46:57] {2029} INFO - at 0.6s, estimator lgbm's best error=0.1769, best estimator lgbm's best error=0.1769
[flaml.automl: 11-16 11:46:57] {1826} INFO - iteration 2, current learner lgbm
[flaml.automl: 11-16 11:46:58] {2029} INFO - at 0.8s, estimator lgbm's best error=0.1564, best estimator lgbm's best error=0.1564
[flaml.automl: 11-16 11:46:58] {1826} INFO - iteration 3, current learner xgboost
[flaml.automl: 11-16 11:46:58] {2029} INFO - at 1.0s, estimator xgboost's best error=0.1744, best estimator lgbm's best error=0.1564
[flaml.automl: 11-16 11:46:58] {1826} INFO - iteration 4, current learner lgbm
[flaml.automl: 11-16 11:46:58] {2029} INFO - at 1.3s, estimator lgbm's best error=0.1500, best estimator lgbm's best error=0.1500
[flaml.automl: 11-16 11:46:58] {1826} INFO - iteration 5, current learner lgbm
[flaml.automl: 11-16 11:46:58] {2029} INFO - at 1.5s, estimator lgbm's best error=0.1500, best estimator lgbm's best error=0.1500
[flaml.automl: 11-16 11:46:58] {1826} INFO - iteration 6, current learner lgbm
[flaml.automl: 11-16 11:46:59] {2029} INFO - at 1.7s, estimator lgbm's best error=0.1487, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:46:59] {1826} INFO - iteration 7, current learner lgbm
[flaml.automl: 11-16 11:46:59] {2029} INFO - at 2.0s, estimator lgbm's best error=0.1487, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:46:59] {1826} INFO - iteration 8, current learner lgbm
[flaml.automl: 11-16 11:46:59] {2029} INFO - at 2.3s, estimator lgbm's best error=0.1487, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:46:59] {1826} INFO - iteration 9, current learner xgboost
[flaml.automl: 11-16 11:46:59] {2029} INFO - at 2.5s, estimator xgboost's best error=0.1731, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:46:59] {1826} INFO - iteration 10, current learner extra_tree
[flaml.automl: 11-16 11:47:01] {2029} INFO - at 3.8s, estimator extra_tree's best error=0.1551, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:47:01] {1826} INFO - iteration 11, current learner rf
[flaml.automl: 11-16 11:47:02] {2029} INFO - at 5.2s, estimator rf's best error=0.1718, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:47:02] {1826} INFO - iteration 12, current learner lgbm
[flaml.automl: 11-16 11:47:02] {2029} INFO - at 5.4s, estimator lgbm's best error=0.1487, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:47:02] {1826} INFO - iteration 13, current learner rf
[flaml.automl: 11-16 11:47:04] {2029} INFO - at 6.7s, estimator rf's best error=0.1551, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:47:04] {1826} INFO - iteration 14, current learner lgbm
[flaml.automl: 11-16 11:47:04] {2029} INFO - at 6.9s, estimator lgbm's best error=0.1487, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:47:04] {1826} INFO - iteration 15, current learner xgboost
[flaml.automl: 11-16 11:47:04] {2029} INFO - at 7.2s, estimator xgboost's best error=0.1667, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:47:04] {1826} INFO - iteration 16, current learner lgbm
[flaml.automl: 11-16 11:47:04] {2029} INFO - at 7.4s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:04] {1826} INFO - iteration 17, current learner extra_tree
[flaml.automl: 11-16 11:47:06] {2029} INFO - at 8.8s, estimator extra_tree's best error=0.1551, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:06] {1826} INFO - iteration 18, current learner lgbm
[flaml.automl: 11-16 11:47:06] {2029} INFO - at 9.1s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:06] {1826} INFO - iteration 19, current learner rf
[flaml.automl: 11-16 11:47:07] {2029} INFO - at 10.4s, estimator rf's best error=0.1551, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:07] {1826} INFO - iteration 20, current learner lgbm
[flaml.automl: 11-16 11:47:08] {2029} INFO - at 10.7s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:08] {1826} INFO - iteration 21, current learner rf
[flaml.automl: 11-16 11:47:09] {2029} INFO - at 12.2s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:09] {1826} INFO - iteration 22, current learner lgbm
[flaml.automl: 11-16 11:47:09] {2029} INFO - at 12.4s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:09] {1826} INFO - iteration 23, current learner lgbm
[flaml.automl: 11-16 11:47:10] {2029} INFO - at 12.7s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:10] {1826} INFO - iteration 24, current learner xgboost
[flaml.automl: 11-16 11:47:10] {2029} INFO - at 12.9s, estimator xgboost's best error=0.1500, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:10] {1826} INFO - iteration 25, current learner xgboost
[flaml.automl: 11-16 11:47:10] {2029} INFO - at 13.2s, estimator xgboost's best error=0.1500, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:10] {1826} INFO - iteration 26, current learner rf
[flaml.automl: 11-16 11:47:11] {2029} INFO - at 14.5s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:11] {1826} INFO - iteration 27, current learner xgboost
[flaml.automl: 11-16 11:47:12] {2029} INFO - at 14.7s, estimator xgboost's best error=0.1500, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:12] {1826} INFO - iteration 28, current learner xgboost
[flaml.automl: 11-16 11:47:12] {2029} INFO - at 15.0s, estimator xgboost's best error=0.1500, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:12] {1826} INFO - iteration 29, current learner xgboost
[flaml.automl: 11-16 11:47:12] {2029} INFO - at 15.2s, estimator xgboost's best error=0.1500, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:12] {1826} INFO - iteration 30, current learner lgbm
[flaml.automl: 11-16 11:47:12] {2029} INFO - at 15.5s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:12] {1826} INFO - iteration 31, current learner lgbm
[flaml.automl: 11-16 11:47:13] {2029} INFO - at 15.8s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:13] {1826} INFO - iteration 32, current learner lgbm
[flaml.automl: 11-16 11:47:13] {2029} INFO - at 16.0s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:13] {1826} INFO - iteration 33, current learner xgboost
[flaml.automl: 11-16 11:47:13] {2029} INFO - at 16.3s, estimator xgboost's best error=0.1500, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:13] {1826} INFO - iteration 34, current learner lgbm
[flaml.automl: 11-16 11:47:13] {2029} INFO - at 16.5s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:13] {1826} INFO - iteration 35, current learner rf
[flaml.automl: 11-16 11:47:15] {2029} INFO - at 18.0s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:15] {1826} INFO - iteration 36, current learner xgboost
[flaml.automl: 11-16 11:47:15] {2029} INFO - at 18.3s, estimator xgboost's best error=0.1500, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:15] {1826} INFO - iteration 37, current learner lgbm
[flaml.automl: 11-16 11:47:15] {2029} INFO - at 18.6s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:15] {1826} INFO - iteration 38, current learner lgbm
[flaml.automl: 11-16 11:47:16] {2029} INFO - at 18.9s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:16] {1826} INFO - iteration 39, current learner lgbm
[flaml.automl: 11-16 11:47:16] {2029} INFO - at 19.4s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:16] {1826} INFO - iteration 40, current learner rf
[flaml.automl: 11-16 11:47:18] {2029} INFO - at 20.7s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:18] {1826} INFO - iteration 41, current learner rf
[flaml.automl: 11-16 11:47:19] {2029} INFO - at 22.2s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:19] {1826} INFO - iteration 42, current learner lgbm
[flaml.automl: 11-16 11:47:19] {2029} INFO - at 22.5s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:19] {1826} INFO - iteration 43, current learner rf
[flaml.automl: 11-16 11:47:21] {2029} INFO - at 23.8s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:21] {1826} INFO - iteration 44, current learner rf
[flaml.automl: 11-16 11:47:22] {2029} INFO - at 25.3s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:22] {1826} INFO - iteration 45, current learner lgbm
[flaml.automl: 11-16 11:47:22] {2029} INFO - at 25.6s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:22] {1826} INFO - iteration 46, current learner lgbm
[flaml.automl: 11-16 11:47:23] {2029} INFO - at 25.9s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:23] {1826} INFO - iteration 47, current learner rf
[flaml.automl: 11-16 11:47:24] {2029} INFO - at 27.4s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:24] {1826} INFO - iteration 48, current learner lgbm
[flaml.automl: 11-16 11:47:24] {2029} INFO - at 27.7s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:24] {1826} INFO - iteration 49, current learner lgbm
[flaml.automl: 11-16 11:47:25] {2029} INFO - at 27.9s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:25] {1826} INFO - iteration 50, current learner rf
[flaml.automl: 11-16 11:47:26] {2029} INFO - at 29.3s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:26] {1826} INFO - iteration 51, current learner lgbm
[flaml.automl: 11-16 11:47:26] {2029} INFO - at 29.5s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:26] {1826} INFO - iteration 52, current learner lgbm
[flaml.automl: 11-16 11:47:27] {2029} INFO - at 29.8s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:27] {1826} INFO - iteration 53, current learner lgbm
[flaml.automl: 11-16 11:47:27] {2029} INFO - at 30.0s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:27] {1826} INFO - iteration 54, current learner lgbm
[flaml.automl: 11-16 11:47:27] {2029} INFO - at 30.3s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:27] {1826} INFO - iteration 55, current learner lgbm
[flaml.automl: 11-16 11:47:27] {2029} INFO - at 30.6s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:27] {1826} INFO - iteration 56, current learner lgbm
[flaml.automl: 11-16 11:47:28] {2029} INFO - at 30.8s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:28] {1826} INFO - iteration 57, current learner lgbm
[flaml.automl: 11-16 11:47:28] {2029} INFO - at 31.1s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:28] {1826} INFO - iteration 58, current learner lgbm
[flaml.automl: 11-16 11:47:28] {2029} INFO - at 31.3s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:28] {1826} INFO - iteration 59, current learner lgbm
[flaml.automl: 11-16 11:47:29] {2029} INFO - at 31.7s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:29] {1826} INFO - iteration 60, current learner lgbm
[flaml.automl: 11-16 11:47:29] {2029} INFO - at 32.0s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:29] {1826} INFO - iteration 61, current learner lgbm
[flaml.automl: 11-16 11:47:29] {2029} INFO - at 32.3s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:29] {1826} INFO - iteration 62, current learner lgbm
[flaml.automl: 11-16 11:47:29] {2029} INFO - at 32.6s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:29] {1826} INFO - iteration 63, current learner rf
[flaml.automl: 11-16 11:47:31] {2029} INFO - at 34.1s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:31] {1826} INFO - iteration 64, current learner rf
[flaml.automl: 11-16 11:47:32] {2029} INFO - at 35.4s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:32] {1826} INFO - iteration 65, current learner rf
[flaml.automl: 11-16 11:47:34] {2029} INFO - at 36.9s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:34] {1826} INFO - iteration 66, current learner rf
[flaml.automl: 11-16 11:47:35] {2029} INFO - at 38.4s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:35] {1826} INFO - iteration 67, current learner rf
[flaml.automl: 11-16 11:47:37] {2029} INFO - at 39.9s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:37] {1826} INFO - iteration 68, current learner rf
[flaml.automl: 11-16 11:47:38] {2029} INFO - at 41.3s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:38] {1826} INFO - iteration 69, current learner rf
[flaml.automl: 11-16 11:47:40] {2029} INFO - at 42.8s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:40] {1826} INFO - iteration 70, current learner rf
[flaml.automl: 11-16 11:47:41] {2029} INFO - at 44.3s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:41] {1826} INFO - iteration 71, current learner rf
[flaml.automl: 11-16 11:47:43] {2029} INFO - at 45.8s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:43] {1826} INFO - iteration 72, current learner rf
[flaml.automl: 11-16 11:47:44] {2029} INFO - at 47.2s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:44] {1826} INFO - iteration 73, current learner rf
[flaml.automl: 11-16 11:47:46] {2029} INFO - at 48.7s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:46] {1826} INFO - iteration 74, current learner rf
[flaml.automl: 11-16 11:47:47] {2029} INFO - at 50.2s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:47] {1826} INFO - iteration 75, current learner rf
[flaml.automl: 11-16 11:47:48] {2029} INFO - at 51.6s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:48] {1826} INFO - iteration 76, current learner rf
[flaml.automl: 11-16 11:47:50] {2029} INFO - at 53.1s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:50] {1826} INFO - iteration 77, current learner rf
[flaml.automl: 11-16 11:47:51] {2029} INFO - at 54.6s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:51] {1826} INFO - iteration 78, current learner rf
[flaml.automl: 11-16 11:47:53] {2029} INFO - at 56.1s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:53] {1826} INFO - iteration 79, current learner rf
[flaml.automl: 11-16 11:47:54] {2029} INFO - at 57.5s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:54] {1826} INFO - iteration 80, current learner rf
[flaml.automl: 11-16 11:47:56] {2029} INFO - at 59.0s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:56] {1826} INFO - iteration 81, current learner rf
[flaml.automl: 11-16 11:47:57] {2029} INFO - at 60.5s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:57] {1826} INFO - iteration 82, current learner rf
[flaml.automl: 11-16 11:47:59] {2029} INFO - at 62.1s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:59] {1826} INFO - iteration 83, current learner rf
[flaml.automl: 11-16 11:48:00] {2029} INFO - at 63.5s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:00] {1826} INFO - iteration 84, current learner rf
[flaml.automl: 11-16 11:48:02] {2029} INFO - at 65.0s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:02] {1826} INFO - iteration 85, current learner rf
[flaml.automl: 11-16 11:48:03] {2029} INFO - at 66.5s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:03] {1826} INFO - iteration 86, current learner rf
[flaml.automl: 11-16 11:48:05] {2029} INFO - at 68.0s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:05] {1826} INFO - iteration 87, current learner rf
[flaml.automl: 11-16 11:48:07] {2029} INFO - at 69.7s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:07] {1826} INFO - iteration 88, current learner rf
[flaml.automl: 11-16 11:48:08] {2029} INFO - at 71.3s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:08] {1826} INFO - iteration 89, current learner lrl1
[flaml.automl: 11-16 11:48:09] {2029} INFO - at 72.0s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:09] {1826} INFO - iteration 90, current learner lrl1
[flaml.automl: 11-16 11:48:10] {2029} INFO - at 72.7s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:10] {1826} INFO - iteration 91, current learner lrl1
[flaml.automl: 11-16 11:48:10] {2029} INFO - at 73.4s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:10] {1826} INFO - iteration 92, current learner lrl1
[flaml.automl: 11-16 11:48:11] {2029} INFO - at 74.2s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:11] {1826} INFO - iteration 93, current learner lrl1
[flaml.automl: 11-16 11:48:12] {2029} INFO - at 74.8s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:12] {1826} INFO - iteration 94, current learner lrl1
[flaml.automl: 11-16 11:48:12] {2029} INFO - at 75.6s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:12] {1826} INFO - iteration 95, current learner lrl1
[flaml.automl: 11-16 11:48:13] {2029} INFO - at 76.3s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:13] {1826} INFO - iteration 96, current learner lrl1
[flaml.automl: 11-16 11:48:14] {2029} INFO - at 77.0s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:14] {1826} INFO - iteration 97, current learner lrl1
[flaml.automl: 11-16 11:48:14] {2029} INFO - at 77.7s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:14] {1826} INFO - iteration 98, current learner lrl1
[flaml.automl: 11-16 11:48:15] {2029} INFO - at 78.4s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:15] {1826} INFO - iteration 99, current learner lrl1
[flaml.automl: 11-16 11:48:16] {2029} INFO - at 79.1s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:16] {1826} INFO - iteration 100, current learner lrl1
[flaml.automl: 11-16 11:48:17] {2029} INFO - at 79.8s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:17] {1826} INFO - iteration 101, current learner lrl1
''' retrieve best config and best learner'''
print('Best ML learner:', automl.best_estimator)
print('Best hyperparameter config:', automl.best_config)
print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))
Best ML learner: lgbm
Best hyperparameter config: {'n_estimators': 17.0, 'num_leaves': 4.0, 'min_child_samples': 25.0, 'learning_rate': 0.38406305271476093, 'subsample': 0.7898698216956215, 'log_max_bin': 8.0, 'colsample_bytree': 0.9425456251062123, 'reg_alpha': 0.6913795549330475, 'reg_lambda': 0.12357355094487746}
Best accuracy on validation data: 0.8692
Training duration of best run: 0.1457 s
automl.model
''' compute predictions of testing dataset '''
y_pred = automl.model.predict(X_test)
print('Predicted labels', y_pred)
print('True labels', y_test)
y_pred_proba = automl.model.predict_proba(X_test)[:,1]
Predicted labels [0 0 1 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0
0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0
0 0 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 1
0 1 1 0 1 1 1 0 1 1 0 0 0 1 0 1 1 0 1 0 1 0 0 1 1 1 0]
True labels 29 0
615 1
383 1
783 0
684 1
..
238 1
242 1
234 0
605 0
492 1
Name: HeartDisease, Length: 138, dtype: int64
''' compute different metric values on testing dataset'''
from flaml.ml import sklearn_metric_loss_score
print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))
print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))
print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))
accuracy = 0.34782608695652173
roc_auc = 0.24469439728353148
log_loss = 0.9483898773683159
Training duration of best run: 0.1457 s
cm = confusion_matrix(y_test, automl.model.predict(X_test))
cm
plot_confusion_matrix(automl.model, X_test, y_test, cmap='Oranges', normalize='true')
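# classification_report was imported earlier but never used; it gives the per-class
# precision/recall/F1 behind the confusion matrix above (an optional extra check).
print(classification_report(y_test, y_pred))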
import shap
print(f'shap version {shap.__version__}')
# load JS visualization code to notebook
shap.initjs()
shap version 0.39.0
def case_detail(case_data):
'''
format obj returned from shap.force_plot()
'''
de=pd.DataFrame(case_data.data['features'])
fcols=[]
for i in case_data.data['features'].keys():
fcols.append(case_data.data['featureNames'][i])
de.columns=fcols
return de
def individual_case_plot(explainer, X, case_index, verbose=False):
    """
    >>> individual_case_plot(explainer, X_train, 1)
    """
    from pprint import pprint  # needed for the verbose branch below
    shap_values = explainer.shap_values(X.iloc[[case_index]])
    g = shap.force_plot(explainer.expected_value, shap_values=shap_values, features=X.iloc[case_index, :])
    if verbose:
        pprint(g.__dict__)
    return g
# explain the model's predictions using SHAP
explainer = shap.TreeExplainer(automl.model)
shap_values = explainer.shap_values(X_test)
X
shap_values[:3]
shap.summary_plot(shap_values, X_test, plot_type="bar")
feature_cols
dshap=pd.DataFrame(shap_values, columns=feature_cols)
dshap
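# The bar summary plot ranks features by mean |SHAP| value; the same ranking can be read off
# numerically from the per-sample SHAP table (assuming shap_values is a single
# (n_samples, n_features) array as built above).
dshap.abs().mean().sort_values(ascending=False)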