import sys
import math
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# import plotly
import plotly.express as px
# import plotly.offline
# import cufflinks as cf
# cf.go_offline()
# cf.set_config_file(offline=False, world_readable=True)
# import ipywidgets as widgets
# from ipywidgets import interact
sns.set_style('whitegrid')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import warnings
warnings.filterwarnings('ignore')
print(f'Python version: {sys.version}')
print(f'pandas version: {pd.__version__}')
print(f'numpy version: {np.__version__}')
print(f'seaborn version: {sns.__version__}')
pd.Timestamp.now()
# pd.Timestamp.now().strftime('%Y-%m-%d')
Python version: 3.7.12 (default, Oct 12 2021, 03:36:26)
[GCC 8.3.0]
pandas version: 1.2.5
numpy version: 1.19.5
seaborn version: 0.11.2
url = 'https://raw.githubusercontent.com/GuySuphakit/Heart-Failure-Prediction/main/heart.csv'
temp_df = pd.read_csv(url)
temp_df.head()
temp_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 918 non-null int64
1 Sex 918 non-null object
2 ChestPainType 918 non-null object
3 RestingBP 918 non-null int64
4 Cholesterol 918 non-null int64
5 FastingBS 918 non-null int64
6 RestingECG 918 non-null object
7 MaxHR 918 non-null int64
8 ExerciseAngina 918 non-null object
9 Oldpeak 918 non-null float64
10 ST_Slope 918 non-null object
11 HeartDisease 918 non-null int64
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB
def read_data():
url = 'https://raw.githubusercontent.com/GuySuphakit/Heart-Failure-Prediction/main/heart.csv'
df = pd.read_csv(url)
return df
# def change_label_of_HeartDisease(df):
#     # change the label of HeartDisease because it has caused some ambiguity.
#     df.HeartDisease = df.HeartDisease.replace({0: 'Normal', 1: 'Heart Disease'})
#     return df
def convert_FastingBS_to_category(df):
df.FastingBS = df.FastingBS.astype('category')
return df
def convert_obj_columns_to_category(df):
for c in df.columns:
col_type = df[c].dtype
if col_type == 'object' or col_type.name == 'category':
df[c] = df[c].astype('category')
return df
def drop_duplicate(df):
df = df.drop_duplicates()
return df
df = (read_data()
.pipe(convert_FastingBS_to_category)
.pipe(convert_obj_columns_to_category)
.pipe(drop_duplicate))
df.head()
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 918 entries, 0 to 917
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 918 non-null int64
1 Sex 918 non-null category
2 ChestPainType 918 non-null category
3 RestingBP 918 non-null int64
4 Cholesterol 918 non-null int64
5 FastingBS 918 non-null category
6 RestingECG 918 non-null category
7 MaxHR 918 non-null int64
8 ExerciseAngina 918 non-null category
9 Oldpeak 918 non-null float64
10 ST_Slope 918 non-null category
11 HeartDisease 918 non-null int64
dtypes: category(6), float64(1), int64(5)
memory usage: 56.4 KB
def print_category_columns(df):
for c in df.columns:
col_type = df[c].dtype
if col_type.name == 'category':
print(f'{c:15}: {list(enumerate(df[c].cat.categories))}')
print('-' * 60)
print_category_columns(df)
Sex : [(0, 'F'), (1, 'M')]
------------------------------------------------------------
ChestPainType : [(0, 'ASY'), (1, 'ATA'), (2, 'NAP'), (3, 'TA')]
------------------------------------------------------------
FastingBS : [(0, 0), (1, 1)]
------------------------------------------------------------
RestingECG : [(0, 'LVH'), (1, 'Normal'), (2, 'ST')]
------------------------------------------------------------
ExerciseAngina : [(0, 'N'), (1, 'Y')]
------------------------------------------------------------
ST_Slope : [(0, 'Down'), (1, 'Flat'), (2, 'Up')]
------------------------------------------------------------
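# Optional sketch (not part of the original flow): each category column above carries integer
# codes behind its labels, which is what a simple ordinal encoding would use downstream.
encoded = df.copy()
for c in encoded.select_dtypes('category').columns:
    encoded[c] = encoded[c].cat.codes  # e.g. Sex: 'F' -> 0, 'M' -> 1
encoded[['Sex', 'ChestPainType', 'ST_Slope']].head()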
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 918 entries, 0 to 917
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 918 non-null int64
1 Sex 918 non-null category
2 ChestPainType 918 non-null category
3 RestingBP 918 non-null int64
4 Cholesterol 918 non-null int64
5 FastingBS 918 non-null category
6 RestingECG 918 non-null category
7 MaxHR 918 non-null int64
8 ExerciseAngina 918 non-null category
9 Oldpeak 918 non-null float64
10 ST_Slope 918 non-null category
11 HeartDisease 918 non-null int64
dtypes: category(6), float64(1), int64(5)
memory usage: 56.4 KB
def plot_mn(df, cols, n_rows: int = 1, kind: str = 'boxplot', color='salmon'):
    """
    Plot boxplot, boxen, violin, or hist panels in m (rows) by n (columns).
    >>> plot_mn(df, ['Age', 'Cholesterol'], 2, 'hist')
    """
    n = len(cols)
    n_cols = math.ceil(n / n_rows)
    fig, ax = plt.subplots(n_rows, n_cols, figsize=(n_cols*3, n_rows*3.5))
    ax = ax.ravel()
    fig.tight_layout()
    for i, c in enumerate(cols):
        col_type = df[c].dtype
        if col_type.name == 'category':
            sns.countplot(data=df, x=c, ax=ax[i])
        else:
            if kind.lower() == 'boxplot':
                sns.boxplot(data=df[[c]], ax=ax[i], color=color)
            elif kind.lower() == 'boxen':
                sns.boxenplot(data=df[[c]], ax=ax[i], color=color)
            elif kind.lower() == 'violin':
                sns.violinplot(data=df[[c]], ax=ax[i], color=color)
            elif kind.lower() == 'hist':
                sns.histplot(df[c], kde=False, ax=ax[i], color=color)
plot_mn(df, df.columns, 3, 'hist')
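# The same helper can render box plots for a quick outlier check of the numeric columns
# (an optional extra call, assuming the column names listed above).
plot_mn(df, ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak'], n_rows=2, kind='boxplot')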
df.columns
df.describe().T.round(1)
df.describe(include='category').T
numerical= df.select_dtypes('int64').columns
categorical = df.select_dtypes('category').columns
print(f'Numerical Columns: {df[numerical].columns}')
print('-'* 100)
print(f'Categorical Columns: {df[categorical].columns}')
Numerical Columns: Index(['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'HeartDisease'], dtype='object')
----------------------------------------------------------------------------------------------------
Categorical Columns: Index(['Sex', 'ChestPainType', 'FastingBS', 'RestingECG', 'ExerciseAngina',
'ST_Slope'],
dtype='object')
df.nunique().sort_values(ascending=False)
df["Age"].describe().round(1)
df.Age.mode()[0]
print(stats.skew(df.Age))
-0.19561273124487544
fig = px.histogram(df, x='Age', marginal='box')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
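# The per-column summaries below repeat the same describe/mode/skew calls; a small helper
# like this (a sketch, not part of the original notebook) bundles them for any numeric column.
def summarize_numeric(data, col):
    print(data[col].describe().round(1))
    print(f'mode: {data[col].mode()[0]}')
    print(f'skew: {data[col].skew():.3f}')

summarize_numeric(df, 'Age')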
df.Sex.value_counts()
temp_df = df.groupby(by="Sex", as_index=False).agg(
counts=pd.NamedAgg(column="Sex", aggfunc="count")).sort_values(by="counts", ascending=False)
fig = px.bar(temp_df,
x='Sex',
y='counts',
color='Sex',
color_discrete_sequence=px.colors.qualitative.D3)
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df.ChestPainType.value_counts()
temp_df = df.groupby(by="ChestPainType", as_index=False).agg(
counts=pd.NamedAgg(column="ChestPainType", aggfunc="count")).sort_values(by="counts", ascending=False)
fig = px.bar(temp_df,
x='ChestPainType',
y='counts',
color='ChestPainType',
color_discrete_sequence = px.colors.qualitative.D3)
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df.RestingBP.describe().round()
df.RestingBP.mode()[0]
print(stats.skew(df.RestingBP))
0.17954532149156327
fig = px.histogram(df, x='RestingBP', marginal='box')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df[df['RestingBP'] == 0]
df.Cholesterol.describe().round()
df.Cholesterol.mode()[0]
print(stats.skew(df.Cholesterol))
-0.6090891046626045
fig = px.histogram(df, x='Cholesterol', marginal='box')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df[df['Cholesterol'] == 0].head()
df[df['Cholesterol'] >= 500]['Cholesterol']
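# Quick count of the physiologically implausible zeros flagged above (a sanity check,
# not in the original flow); these values are handled later in data_preprocessing().
(df[['RestingBP', 'Cholesterol']] == 0).sum()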
df.FastingBS.value_counts()
fig = px.histogram(df, x='FastingBS', color='FastingBS')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df["RestingECG"].value_counts()
temp_df = df.groupby(by="RestingECG", as_index=False).agg(
counts=pd.NamedAgg(column="RestingECG", aggfunc="count")).sort_values(by="counts", ascending=False)
fig = px.bar(temp_df,
x='RestingECG',
y='counts',
color='RestingECG',
color_discrete_sequence = px.colors.qualitative.D3)
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df.MaxHR.describe().round()
df.MaxHR.mode()[0]
df.MaxHR.skew()
fig = px.histogram(df, x='MaxHR', marginal='box')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df.ExerciseAngina.value_counts()
temp_df = df.groupby(by="ExerciseAngina", as_index=False).agg(
counts=pd.NamedAgg(column="ExerciseAngina", aggfunc="count")).sort_values(by="counts", ascending=False)
fig = px.bar(temp_df,
x='ExerciseAngina',
y='counts',
color='ExerciseAngina',
color_discrete_sequence=px.colors.qualitative.D3)
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df.Oldpeak.describe().round()
df.Oldpeak.mode()[0]
df.Oldpeak.skew()
fig = px.histogram(df, x='Oldpeak', marginal='box')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df.ST_Slope.value_counts()
temp_df = df.groupby(by="ST_Slope", as_index=False).agg(
counts=pd.NamedAgg(column="ST_Slope", aggfunc="count")).sort_values(by="counts", ascending=False)
fig = px.bar(temp_df,
x='ST_Slope',
y='counts',
color='ST_Slope',
color_discrete_sequence=px.colors.qualitative.D3)
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
df.HeartDisease.value_counts()
fig = px.histogram(df, x='HeartDisease', color='HeartDisease')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
dcorr = df.corr()
# dcorr
mask = np.zeros_like(dcorr)
# mask.shape
mask[np.triu_indices_from(mask)] = True
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(dcorr, cmap=sns.diverging_palette(10, 145, n=100),
vmin=-1, vmax=1, center=0, linewidths=1, annot=True, mask=mask, ax=ax).set_title("Correlation\nHeatmap", fontsize=22,fontweight="bold");
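# df.corr() above only sees the numeric columns. As a rough companion check (a sketch that
# treats category codes as ordinal, which is a simplification), the categorical features can
# be folded in by mapping them to their integer codes first.
df_codes = df.copy()
for c in df_codes.select_dtypes('category').columns:
    df_codes[c] = df_codes[c].cat.codes
df_codes.corr()['HeartDisease'].sort_values(ascending=False)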
!pip install statsmodels
Requirement already satisfied: statsmodels in /root/venv/lib/python3.7/site-packages (0.13.1)
Requirement already satisfied: patsy>=0.5.2 in /root/venv/lib/python3.7/site-packages (from statsmodels) (0.5.2)
Requirement already satisfied: numpy>=1.17 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.19.5)
Requirement already satisfied: scipy>=1.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.7.2)
Requirement already satisfied: pandas>=0.25 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from statsmodels) (1.2.5)
Requirement already satisfied: six in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from patsy>=0.5.2->statsmodels) (1.16.0)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2021.3)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=0.25->statsmodels) (2.8.2)
corr_value = df["Age"].corr(df["RestingBP"])
fig = px.scatter(
data_frame=df,
x="Age",
y="RestingBP",
marginal_x="histogram",
marginal_y="histogram",
trendline="ols",
trendline_color_override="red",
title=f"Correlation between Age and RestingBP is: {corr_value:.2f}",
)
fig.show()
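# trendline="ols" fits a statsmodels OLS model under the hood; if the slope and R² are of
# interest, the fitted results can be pulled back out of the figure (a sketch using the
# plotly express helper for trendline results).
trend = px.get_trendline_results(fig)
print(trend.px_fit_results.iloc[0].summary())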
corr_value = df["Age"].corr(df["Cholesterol"])
fig = px.scatter(
data_frame=df,
x="Age",
y="Cholesterol",
marginal_x="histogram",
marginal_y="histogram",
trendline="ols",
trendline_color_override="red",
title=f"Correlation between Age and Cholester0l is: {corr_value:.2f}",
)
fig.show()
corr_value = df["Age"].corr(df["Oldpeak"])
fig = px.scatter(
data_frame=df,
x="Age",
y="Oldpeak",
marginal_x="histogram",
marginal_y="histogram",
trendline="ols",
trendline_color_override="red",
title=f"Correlation between Age and Oldpeak is: {corr_value:.2f}",
)
fig.show()
corr_value = df["Age"].corr(df["MaxHR"])
fig = px.scatter(
data_frame=df,
x="Age",
y="MaxHR",
marginal_x="histogram",
marginal_y="histogram",
trendline="ols",
trendline_color_override="red",
title=f"Correlation between Age and MaxHR is: {corr_value:.2f}",
)
fig.show()
fig = px.density_heatmap(
data_frame=df, x="Age", y="RestingBP", color_continuous_scale="PuBu"
)
fig.show()
fig = px.density_heatmap(
data_frame=df, x="Age", y="Cholesterol", color_continuous_scale="PuBu"
)
fig.show()
fig = px.density_heatmap(
data_frame=df, x="Age", y="MaxHR", color_continuous_scale="PuBu"
)
fig.show()
fig = px.density_heatmap(
data_frame=df, x="Age", y="Oldpeak", color_continuous_scale="PuBu"
)
fig.show()
sns.pairplot(df, hue="HeartDisease", palette="husl", corner=True);
px.density_heatmap(
data_frame=df,
x="Age",
y="RestingBP",
color_continuous_scale="PuBu",
facet_col="HeartDisease",
title="Age vs. Chelesterol for different Heart Disease values",
)
px.density_heatmap(
data_frame=df,
x="Age",
y="Cholesterol",
color_continuous_scale="PuBu",
facet_col="HeartDisease",
title="Age vs. Chelesterol for different Heart Disease values",
)
px.density_heatmap(
data_frame=df,
x="Age",
y="MaxHR",
color_continuous_scale="PuBu",
facet_col="HeartDisease",
title="Age vs. Chelesterol for different Heart Disease values",
)
def num_plot(df, col):
fig = px.histogram(df, x=col, color="HeartDisease",
marginal="box")
fig.update_layout(height=400, width=700, showlegend=True)
fig.update_traces(marker_line_width=1,marker_line_color="black")
fig.show()
cols= df.columns
for col in cols:
num_plot(df, col)
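# A compact numeric companion to the per-feature histograms above: class-wise means of the
# numeric columns (a quick summary, not part of the original notebook flow).
df.groupby('HeartDisease')[['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']].mean().round(1)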
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, ShuffleSplit, cross_val_score, cross_val_predict, cross_validate
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, precision_recall_fscore_support, f1_score
feature_cols = ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope']
target_cols = 'HeartDisease'
X = df[feature_cols]
y = df[target_cols]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
def data_preprocessing(X_train):
    '''Replace implausible zeros (and extreme Cholesterol outliers) with the training mean
    to make the distributions more normal.'''
    mean_chol = X_train.loc[X_train['Cholesterol'] > 0, 'Cholesterol'].mean()
    mean_rest = X_train.loc[X_train['RestingBP'] > 0, 'RestingBP'].mean()
    X_train.loc[(X_train['Cholesterol'] == 0) | (X_train['Cholesterol'] >= 500), 'Cholesterol'] = mean_chol
    X_train.loc[X_train['RestingBP'] == 0, 'RestingBP'] = mean_rest
    return X_train
X_train = data_preprocessing(X_train)
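# The imputation above is fit on (and applied to) the training split only; the recorded run
# below leaves X_test untouched, so its zeros survive into the test data. A minimal,
# hypothetical sketch of applying the same train-derived rules to the test split
# (X_test_imputed is a new name and is not used elsewhere in this notebook; note the means
# are recomputed from the already-imputed X_train, a close approximation of the original
# training means).
mean_chol = X_train.loc[X_train['Cholesterol'] > 0, 'Cholesterol'].mean()
mean_rest = X_train.loc[X_train['RestingBP'] > 0, 'RestingBP'].mean()
X_test_imputed = X_test.copy()
X_test_imputed.loc[(X_test_imputed['Cholesterol'] == 0) | (X_test_imputed['Cholesterol'] >= 500), 'Cholesterol'] = mean_chol
X_test_imputed.loc[X_test_imputed['RestingBP'] == 0, 'RestingBP'] = mean_rest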
df.columns
y_train.value_counts()
!pip install flaml
Collecting flaml
Downloading FLAML-0.7.1-py3-none-any.whl (160 kB)
|████████████████████████████████| 160 kB 30.9 MB/s
Collecting xgboost<=1.3.3,>=0.90
Downloading xgboost-1.3.3-py3-none-manylinux2010_x86_64.whl (157.5 MB)
|████████████████████████████████| 157.5 MB 112 kB/s
Requirement already satisfied: NumPy>=1.16.2 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from flaml) (1.19.5)
Requirement already satisfied: scipy>=1.4.1 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from flaml) (1.7.2)
Requirement already satisfied: pandas>=1.1.4 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from flaml) (1.2.5)
Collecting lightgbm>=2.3.1
Downloading lightgbm-3.3.1-py3-none-manylinux1_x86_64.whl (2.0 MB)
|████████████████████████████████| 2.0 MB 42.8 MB/s
Requirement already satisfied: scikit-learn>=0.24 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from flaml) (1.0.1)
Requirement already satisfied: pytz>=2017.3 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from pandas>=1.1.4->flaml) (2021.3)
Requirement already satisfied: python-dateutil>=2.7.3 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from pandas>=1.1.4->flaml) (2.8.2)
Requirement already satisfied: wheel in /usr/local/lib/python3.7/site-packages (from lightgbm>=2.3.1->flaml) (0.37.0)
Requirement already satisfied: threadpoolctl>=2.0.0 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn>=0.24->flaml) (3.0.0)
Requirement already satisfied: joblib>=0.11 in /shared-libs/python3.7/py/lib/python3.7/site-packages (from scikit-learn>=0.24->flaml) (1.1.0)
Requirement already satisfied: six>=1.5 in /shared-libs/python3.7/py-core/lib/python3.7/site-packages (from python-dateutil>=2.7.3->pandas>=1.1.4->flaml) (1.16.0)
Installing collected packages: xgboost, lightgbm, flaml
Successfully installed flaml-0.7.1 lightgbm-3.3.1 xgboost-1.3.3
from flaml import AutoML
automl = AutoML()
settings = {
"time_budget": 180, # total running time in seconds
"metric": 'accuracy', # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
# 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'
"task": 'classification', # task type
"log_file_name": 'airlines_experiment.log', # flaml log file
}
automl.fit(X_train=X_train, y_train=y_train, **settings)
[flaml.automl: 11-16 11:46:57] {1485} INFO - Data split method: stratified
[flaml.automl: 11-16 11:46:57] {1489} INFO - Evaluation method: cv
[flaml.automl: 11-16 11:46:57] {1540} INFO - Minimizing error metric: 1-accuracy
[flaml.automl: 11-16 11:46:57] {1577} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'lrl1']
[flaml.automl: 11-16 11:46:57] {1826} INFO - iteration 0, current learner lgbm
[flaml.automl: 11-16 11:46:57] {1944} INFO - Estimated sufficient time budget=2245s. Estimated necessary time budget=37s.
[flaml.automl: 11-16 11:46:57] {2029} INFO - at 0.3s, estimator lgbm's best error=0.1769, best estimator lgbm's best error=0.1769
[flaml.automl: 11-16 11:46:57] {1826} INFO - iteration 1, current learner lgbm
[flaml.automl: 11-16 11:46:57] {2029} INFO - at 0.6s, estimator lgbm's best error=0.1769, best estimator lgbm's best error=0.1769
[flaml.automl: 11-16 11:46:57] {1826} INFO - iteration 2, current learner lgbm
[flaml.automl: 11-16 11:46:58] {2029} INFO - at 0.8s, estimator lgbm's best error=0.1564, best estimator lgbm's best error=0.1564
[flaml.automl: 11-16 11:46:58] {1826} INFO - iteration 3, current learner xgboost
[flaml.automl: 11-16 11:46:58] {2029} INFO - at 1.0s, estimator xgboost's best error=0.1744, best estimator lgbm's best error=0.1564
[flaml.automl: 11-16 11:46:58] {1826} INFO - iteration 4, current learner lgbm
[flaml.automl: 11-16 11:46:58] {2029} INFO - at 1.3s, estimator lgbm's best error=0.1500, best estimator lgbm's best error=0.1500
[flaml.automl: 11-16 11:46:58] {1826} INFO - iteration 5, current learner lgbm
[flaml.automl: 11-16 11:46:58] {2029} INFO - at 1.5s, estimator lgbm's best error=0.1500, best estimator lgbm's best error=0.1500
[flaml.automl: 11-16 11:46:58] {1826} INFO - iteration 6, current learner lgbm
[flaml.automl: 11-16 11:46:59] {2029} INFO - at 1.7s, estimator lgbm's best error=0.1487, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:46:59] {1826} INFO - iteration 7, current learner lgbm
[flaml.automl: 11-16 11:46:59] {2029} INFO - at 2.0s, estimator lgbm's best error=0.1487, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:46:59] {1826} INFO - iteration 8, current learner lgbm
[flaml.automl: 11-16 11:46:59] {2029} INFO - at 2.3s, estimator lgbm's best error=0.1487, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:46:59] {1826} INFO - iteration 9, current learner xgboost
[flaml.automl: 11-16 11:46:59] {2029} INFO - at 2.5s, estimator xgboost's best error=0.1731, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:46:59] {1826} INFO - iteration 10, current learner extra_tree
[flaml.automl: 11-16 11:47:01] {2029} INFO - at 3.8s, estimator extra_tree's best error=0.1551, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:47:01] {1826} INFO - iteration 11, current learner rf
[flaml.automl: 11-16 11:47:02] {2029} INFO - at 5.2s, estimator rf's best error=0.1718, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:47:02] {1826} INFO - iteration 12, current learner lgbm
[flaml.automl: 11-16 11:47:02] {2029} INFO - at 5.4s, estimator lgbm's best error=0.1487, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:47:02] {1826} INFO - iteration 13, current learner rf
[flaml.automl: 11-16 11:47:04] {2029} INFO - at 6.7s, estimator rf's best error=0.1551, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:47:04] {1826} INFO - iteration 14, current learner lgbm
[flaml.automl: 11-16 11:47:04] {2029} INFO - at 6.9s, estimator lgbm's best error=0.1487, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:47:04] {1826} INFO - iteration 15, current learner xgboost
[flaml.automl: 11-16 11:47:04] {2029} INFO - at 7.2s, estimator xgboost's best error=0.1667, best estimator lgbm's best error=0.1487
[flaml.automl: 11-16 11:47:04] {1826} INFO - iteration 16, current learner lgbm
[flaml.automl: 11-16 11:47:04] {2029} INFO - at 7.4s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:04] {1826} INFO - iteration 17, current learner extra_tree
[flaml.automl: 11-16 11:47:06] {2029} INFO - at 8.8s, estimator extra_tree's best error=0.1551, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:06] {1826} INFO - iteration 18, current learner lgbm
[flaml.automl: 11-16 11:47:06] {2029} INFO - at 9.1s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:06] {1826} INFO - iteration 19, current learner rf
[flaml.automl: 11-16 11:47:07] {2029} INFO - at 10.4s, estimator rf's best error=0.1551, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:07] {1826} INFO - iteration 20, current learner lgbm
[flaml.automl: 11-16 11:47:08] {2029} INFO - at 10.7s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:08] {1826} INFO - iteration 21, current learner rf
[flaml.automl: 11-16 11:47:09] {2029} INFO - at 12.2s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:09] {1826} INFO - iteration 22, current learner lgbm
[flaml.automl: 11-16 11:47:09] {2029} INFO - at 12.4s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:09] {1826} INFO - iteration 23, current learner lgbm
[flaml.automl: 11-16 11:47:10] {2029} INFO - at 12.7s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:10] {1826} INFO - iteration 24, current learner xgboost
[flaml.automl: 11-16 11:47:10] {2029} INFO - at 12.9s, estimator xgboost's best error=0.1500, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:10] {1826} INFO - iteration 25, current learner xgboost
[flaml.automl: 11-16 11:47:10] {2029} INFO - at 13.2s, estimator xgboost's best error=0.1500, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:10] {1826} INFO - iteration 26, current learner rf
[flaml.automl: 11-16 11:47:11] {2029} INFO - at 14.5s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:11] {1826} INFO - iteration 27, current learner xgboost
[flaml.automl: 11-16 11:47:12] {2029} INFO - at 14.7s, estimator xgboost's best error=0.1500, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:12] {1826} INFO - iteration 28, current learner xgboost
[flaml.automl: 11-16 11:47:12] {2029} INFO - at 15.0s, estimator xgboost's best error=0.1500, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:12] {1826} INFO - iteration 29, current learner xgboost
[flaml.automl: 11-16 11:47:12] {2029} INFO - at 15.2s, estimator xgboost's best error=0.1500, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:12] {1826} INFO - iteration 30, current learner lgbm
[flaml.automl: 11-16 11:47:12] {2029} INFO - at 15.5s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:12] {1826} INFO - iteration 31, current learner lgbm
[flaml.automl: 11-16 11:47:13] {2029} INFO - at 15.8s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:13] {1826} INFO - iteration 32, current learner lgbm
[flaml.automl: 11-16 11:47:13] {2029} INFO - at 16.0s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:13] {1826} INFO - iteration 33, current learner xgboost
[flaml.automl: 11-16 11:47:13] {2029} INFO - at 16.3s, estimator xgboost's best error=0.1500, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:13] {1826} INFO - iteration 34, current learner lgbm
[flaml.automl: 11-16 11:47:13] {2029} INFO - at 16.5s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:13] {1826} INFO - iteration 35, current learner rf
[flaml.automl: 11-16 11:47:15] {2029} INFO - at 18.0s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:15] {1826} INFO - iteration 36, current learner xgboost
[flaml.automl: 11-16 11:47:15] {2029} INFO - at 18.3s, estimator xgboost's best error=0.1500, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:15] {1826} INFO - iteration 37, current learner lgbm
[flaml.automl: 11-16 11:47:15] {2029} INFO - at 18.6s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:15] {1826} INFO - iteration 38, current learner lgbm
[flaml.automl: 11-16 11:47:16] {2029} INFO - at 18.9s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:16] {1826} INFO - iteration 39, current learner lgbm
[flaml.automl: 11-16 11:47:16] {2029} INFO - at 19.4s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:16] {1826} INFO - iteration 40, current learner rf
[flaml.automl: 11-16 11:47:18] {2029} INFO - at 20.7s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:18] {1826} INFO - iteration 41, current learner rf
[flaml.automl: 11-16 11:47:19] {2029} INFO - at 22.2s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:19] {1826} INFO - iteration 42, current learner lgbm
[flaml.automl: 11-16 11:47:19] {2029} INFO - at 22.5s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:19] {1826} INFO - iteration 43, current learner rf
[flaml.automl: 11-16 11:47:21] {2029} INFO - at 23.8s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:21] {1826} INFO - iteration 44, current learner rf
[flaml.automl: 11-16 11:47:22] {2029} INFO - at 25.3s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:22] {1826} INFO - iteration 45, current learner lgbm
[flaml.automl: 11-16 11:47:22] {2029} INFO - at 25.6s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:22] {1826} INFO - iteration 46, current learner lgbm
[flaml.automl: 11-16 11:47:23] {2029} INFO - at 25.9s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:23] {1826} INFO - iteration 47, current learner rf
[flaml.automl: 11-16 11:47:24] {2029} INFO - at 27.4s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:24] {1826} INFO - iteration 48, current learner lgbm
[flaml.automl: 11-16 11:47:24] {2029} INFO - at 27.7s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:24] {1826} INFO - iteration 49, current learner lgbm
[flaml.automl: 11-16 11:47:25] {2029} INFO - at 27.9s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:25] {1826} INFO - iteration 50, current learner rf
[flaml.automl: 11-16 11:47:26] {2029} INFO - at 29.3s, estimator rf's best error=0.1449, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:26] {1826} INFO - iteration 51, current learner lgbm
[flaml.automl: 11-16 11:47:26] {2029} INFO - at 29.5s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:26] {1826} INFO - iteration 52, current learner lgbm
[flaml.automl: 11-16 11:47:27] {2029} INFO - at 29.8s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:27] {1826} INFO - iteration 53, current learner lgbm
[flaml.automl: 11-16 11:47:27] {2029} INFO - at 30.0s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:27] {1826} INFO - iteration 54, current learner lgbm
[flaml.automl: 11-16 11:47:27] {2029} INFO - at 30.3s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:27] {1826} INFO - iteration 55, current learner lgbm
[flaml.automl: 11-16 11:47:27] {2029} INFO - at 30.6s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:27] {1826} INFO - iteration 56, current learner lgbm
[flaml.automl: 11-16 11:47:28] {2029} INFO - at 30.8s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:28] {1826} INFO - iteration 57, current learner lgbm
[flaml.automl: 11-16 11:47:28] {2029} INFO - at 31.1s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:28] {1826} INFO - iteration 58, current learner lgbm
[flaml.automl: 11-16 11:47:28] {2029} INFO - at 31.3s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:28] {1826} INFO - iteration 59, current learner lgbm
[flaml.automl: 11-16 11:47:29] {2029} INFO - at 31.7s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:29] {1826} INFO - iteration 60, current learner lgbm
[flaml.automl: 11-16 11:47:29] {2029} INFO - at 32.0s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:29] {1826} INFO - iteration 61, current learner lgbm
[flaml.automl: 11-16 11:47:29] {2029} INFO - at 32.3s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:29] {1826} INFO - iteration 62, current learner lgbm
[flaml.automl: 11-16 11:47:29] {2029} INFO - at 32.6s, estimator lgbm's best error=0.1397, best estimator lgbm's best error=0.1397
[flaml.automl: 11-16 11:47:29] {1826} INFO - iteration 63, current learner rf
[flaml.automl: 11-16 11:47:31] {2029} INFO - at 34.1s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:31] {1826} INFO - iteration 64, current learner rf
[flaml.automl: 11-16 11:47:32] {2029} INFO - at 35.4s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:32] {1826} INFO - iteration 65, current learner rf
[flaml.automl: 11-16 11:47:34] {2029} INFO - at 36.9s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:34] {1826} INFO - iteration 66, current learner rf
[flaml.automl: 11-16 11:47:35] {2029} INFO - at 38.4s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:35] {1826} INFO - iteration 67, current learner rf
[flaml.automl: 11-16 11:47:37] {2029} INFO - at 39.9s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:37] {1826} INFO - iteration 68, current learner rf
[flaml.automl: 11-16 11:47:38] {2029} INFO - at 41.3s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:38] {1826} INFO - iteration 69, current learner rf
[flaml.automl: 11-16 11:47:40] {2029} INFO - at 42.8s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:40] {1826} INFO - iteration 70, current learner rf
[flaml.automl: 11-16 11:47:41] {2029} INFO - at 44.3s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:41] {1826} INFO - iteration 71, current learner rf
[flaml.automl: 11-16 11:47:43] {2029} INFO - at 45.8s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:43] {1826} INFO - iteration 72, current learner rf
[flaml.automl: 11-16 11:47:44] {2029} INFO - at 47.2s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:44] {1826} INFO - iteration 73, current learner rf
[flaml.automl: 11-16 11:47:46] {2029} INFO - at 48.7s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:46] {1826} INFO - iteration 74, current learner rf
[flaml.automl: 11-16 11:47:47] {2029} INFO - at 50.2s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:47] {1826} INFO - iteration 75, current learner rf
[flaml.automl: 11-16 11:47:48] {2029} INFO - at 51.6s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:48] {1826} INFO - iteration 76, current learner rf
[flaml.automl: 11-16 11:47:50] {2029} INFO - at 53.1s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:50] {1826} INFO - iteration 77, current learner rf
[flaml.automl: 11-16 11:47:51] {2029} INFO - at 54.6s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:51] {1826} INFO - iteration 78, current learner rf
[flaml.automl: 11-16 11:47:53] {2029} INFO - at 56.1s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:53] {1826} INFO - iteration 79, current learner rf
[flaml.automl: 11-16 11:47:54] {2029} INFO - at 57.5s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:54] {1826} INFO - iteration 80, current learner rf
[flaml.automl: 11-16 11:47:56] {2029} INFO - at 59.0s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:56] {1826} INFO - iteration 81, current learner rf
[flaml.automl: 11-16 11:47:57] {2029} INFO - at 60.5s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:57] {1826} INFO - iteration 82, current learner rf
[flaml.automl: 11-16 11:47:59] {2029} INFO - at 62.1s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:47:59] {1826} INFO - iteration 83, current learner rf
[flaml.automl: 11-16 11:48:00] {2029} INFO - at 63.5s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:00] {1826} INFO - iteration 84, current learner rf
[flaml.automl: 11-16 11:48:02] {2029} INFO - at 65.0s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:02] {1826} INFO - iteration 85, current learner rf
[flaml.automl: 11-16 11:48:03] {2029} INFO - at 66.5s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:03] {1826} INFO - iteration 86, current learner rf
[flaml.automl: 11-16 11:48:05] {2029} INFO - at 68.0s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:05] {1826} INFO - iteration 87, current learner rf
[flaml.automl: 11-16 11:48:07] {2029} INFO - at 69.7s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:07] {1826} INFO - iteration 88, current learner rf
[flaml.automl: 11-16 11:48:08] {2029} INFO - at 71.3s, estimator rf's best error=0.1372, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:08] {1826} INFO - iteration 89, current learner lrl1
[flaml.automl: 11-16 11:48:09] {2029} INFO - at 72.0s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:09] {1826} INFO - iteration 90, current learner lrl1
[flaml.automl: 11-16 11:48:10] {2029} INFO - at 72.7s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:10] {1826} INFO - iteration 91, current learner lrl1
[flaml.automl: 11-16 11:48:10] {2029} INFO - at 73.4s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:10] {1826} INFO - iteration 92, current learner lrl1
[flaml.automl: 11-16 11:48:11] {2029} INFO - at 74.2s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:11] {1826} INFO - iteration 93, current learner lrl1
[flaml.automl: 11-16 11:48:12] {2029} INFO - at 74.8s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:12] {1826} INFO - iteration 94, current learner lrl1
[flaml.automl: 11-16 11:48:12] {2029} INFO - at 75.6s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:12] {1826} INFO - iteration 95, current learner lrl1
[flaml.automl: 11-16 11:48:13] {2029} INFO - at 76.3s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:13] {1826} INFO - iteration 96, current learner lrl1
[flaml.automl: 11-16 11:48:14] {2029} INFO - at 77.0s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:14] {1826} INFO - iteration 97, current learner lrl1
[flaml.automl: 11-16 11:48:14] {2029} INFO - at 77.7s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:14] {1826} INFO - iteration 98, current learner lrl1
[flaml.automl: 11-16 11:48:15] {2029} INFO - at 78.4s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:15] {1826} INFO - iteration 99, current learner lrl1
[flaml.automl: 11-16 11:48:16] {2029} INFO - at 79.1s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:16] {1826} INFO - iteration 100, current learner lrl1
[flaml.automl: 11-16 11:48:17] {2029} INFO - at 79.8s, estimator lrl1's best error=0.2756, best estimator rf's best error=0.1372
[flaml.automl: 11-16 11:48:17] {1826} INFO - iteration 101, current learner lrl1
''' retrieve best config and best learner'''
print('Best ML learner:', automl.best_estimator)
print('Best hyperparameter config:', automl.best_config)
print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))
Best ML learner: lgbm
Best hyperparameter config: {'n_estimators': 17.0, 'num_leaves': 4.0, 'min_child_samples': 25.0, 'learning_rate': 0.38406305271476093, 'subsample': 0.7898698216956215, 'log_max_bin': 8.0, 'colsample_bytree': 0.9425456251062123, 'reg_alpha': 0.6913795549330475, 'reg_lambda': 0.12357355094487746}
Best accuracy on validation data: 0.8692
Training duration of best run: 0.1457 s
automl.model
''' compute predictions of testing dataset '''
y_pred = automl.model.predict(X_test)
print('Predicted labels', y_pred)
print('True labels', y_test)
y_pred_proba = automl.model.predict_proba(X_test)[:,1]
Predicted labels [0 0 1 1 0 0 0 1 0 0 1 0 1 1 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0
0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0
0 0 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 1
0 1 1 0 1 1 1 0 1 1 0 0 0 1 0 1 1 0 1 0 1 0 0 1 1 1 0]
True labels 29 0
615 1
383 1
783 0
684 1
..
238 1
242 1
234 0
605 0
492 1
Name: HeartDisease, Length: 138, dtype: int64
''' compute different metric values on testing dataset'''
from flaml.ml import sklearn_metric_loss_score
print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))
print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))
print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))
accuracy = 0.34782608695652173
roc_auc = 0.24469439728353148
log_loss = 0.9483898773683159
Training duration of best run: 0.1457 s
cm = confusion_matrix(y_test, automl.model.predict(X_test))
cm
plot_confusion_matrix(automl.model, X_test, y_test, cmap='Oranges', normalize='true')
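# classification_report was imported earlier but never used; it gives the per-class
# precision/recall/F1 behind the confusion matrix above (an optional extra check).
print(classification_report(y_test, y_pred))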
import shap
print(f'shap version {shap.__version__}')
# load JS visualization code to notebook
shap.initjs()
shap version 0.39.0
def case_detail(case_data):
'''
format obj returned from shap.force_plot()
'''
de=pd.DataFrame(case_data.data['features'])
fcols=[]
for i in case_data.data['features'].keys():
fcols.append(case_data.data['featureNames'][i])
de.columns=fcols
return de
def individual_case_plot(explainer, X, case_index, verbose=False):
    """
    >>> individual_case_plot(explainer, X_train, 1)
    """
    from pprint import pprint  # needed for the verbose branch below
    shap_values = explainer.shap_values(X.iloc[[case_index]])
    g = shap.force_plot(explainer.expected_value, shap_values=shap_values, features=X.iloc[case_index, :])
    if verbose:
        pprint(g.__dict__)
    return g
# explain the model's predictions using SHAP
explainer = shap.TreeExplainer(automl.model)
shap_values = explainer.shap_values(X_test)
X
shap_values[:3]
shap.summary_plot(shap_values, X_test, plot_type="bar")
feature_cols
dshap=pd.DataFrame(shap_values, columns=feature_cols)
dshap
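# The bar summary plot ranks features by mean |SHAP| value; the same ranking can be read off
# numerically from the per-sample SHAP table (assuming shap_values is a single
# (n_samples, n_features) array as built above).
dshap.abs().mean().sort_values(ascending=False)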