import sys
import math
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# import plotly
import plotly.express as px
# import plotly.offline
# import cufflinks as cf
# cf.go_offline()
# cf.set_config_file(offline=False, world_readable=True)
# import ipywidgets as widgets
# from ipywidgets import interact
# Notebook-wide seaborn theme for every matplotlib/seaborn figure below.
sns.set_style('whitegrid')
# IPython magics: these two lines are only valid inside Jupyter/IPython,
# not in a plain Python interpreter.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import warnings
# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Record the interpreter and library versions used for this run.
print(f'Python version: {sys.version}')
print(f'pandas version: {pd.__version__}')
print(f'numpy version: {np.__version__}')
print(f'seaborn version: {sns.__version__}')
# Timestamp of this run.
pd.Timestamp.now()
# pd.Timestamp.now().strftime('%Y-%m-%d')
# Quick look at the raw CSV before the cleaning pipeline is defined.
url = 'https://raw.githubusercontent.com/GuySuphakit/Heart-Failure-Prediction/main/heart.csv'
temp_df = pd.read_csv(url)
temp_df.head()
temp_df.info()
HEART_CSV_URL = 'https://raw.githubusercontent.com/GuySuphakit/Heart-Failure-Prediction/main/heart.csv'


def read_data(url=HEART_CSV_URL):
    """
    Load the heart-disease CSV into a DataFrame.

    Args:
        url: Anything ``pandas.read_csv`` accepts (URL, local path, or
            file-like object). Defaults to the project's hosted
            ``heart.csv``, so existing ``read_data()`` calls are unchanged.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(url)
# def change_lebel_of_HeartDisease(df):
# # change the label of HeartDisease because it has caused some ambiguity.
# df.HeartDisease = df.HeartDisease.replace({0: 'Normal', 1: 'Heart Disease'})
# return df
def convert_FastingBS_to_category(df):
    """
    Cast the ``FastingBS`` column to pandas ``category`` dtype.

    The frame is modified in place and the same object is returned, so the
    function can be used as a ``DataFrame.pipe`` step.
    """
    as_category = df['FastingBS'].astype('category')
    df['FastingBS'] = as_category
    return df
def convert_obj_columns_to_category(df):
    """
    Cast every object-dtype column to ``category`` dtype.

    Columns that are already categorical are re-cast as well, which leaves
    them unchanged. All other columns are untouched. The frame is modified
    in place and the same object is returned.
    """
    for name, dtype in df.dtypes.items():
        wanted = dtype.name == 'category' or dtype == 'object'
        if not wanted:
            continue
        df[name] = df[name].astype('category')
    return df
def drop_duplicate(df):
    """Return a new frame with fully duplicated rows removed (first kept)."""
    return df.drop_duplicates()
# Build the working frame: load the CSV, then pipe it through the cleaning
# steps in order (FastingBS -> category, object columns -> category,
# duplicate rows dropped).
df = (read_data()
.pipe(convert_FastingBS_to_category)
.pipe(convert_obj_columns_to_category)
.pipe(drop_duplicate))
# Inspect the first rows and the resulting dtypes.
df.head()
df.info()
def print_category_columns(df):
    """
    Print the categories of every category-dtype column.

    Each matching column produces one line, ``<name>: [(position, label), ...]``,
    with the name left-aligned in 15 characters, followed by a 60-dash
    separator line. Columns of any other dtype are skipped.
    """
    separator = '-' * 60
    for name, dtype in df.dtypes.items():
        if dtype.name != 'category':
            continue
        numbered = list(enumerate(df[name].cat.categories))
        print(f'{name:15}: {numbered}')
        print(separator)
# List the categories (with their integer positions) of each category column,
# then re-check the frame's dtypes.
print_category_columns(df)
df.info()
def plot_mn(df, cols, n_rows:int=1, kind:str='boxplot', color='salmon'):
    """
    Draw one chart per column on an ``n_rows`` x ``ceil(len(cols) / n_rows)`` grid.

    Category-dtype columns are always drawn as count plots; every other
    column is drawn with the chart type selected by ``kind``.

    Args:
        df: DataFrame holding the data.
        cols: Iterable of column names to plot, one subplot each.
        n_rows: Number of subplot rows; the column count is derived from it.
        kind: ``'boxplot'``, ``'boxen'``, ``'violin'`` or ``'hist'``
            (case-insensitive).
        color: Colour passed to the seaborn call for non-category columns.

    Raises:
        ValueError: If ``kind`` is not one of the supported chart types
            (previously such a call silently produced empty axes).

    >>> plot_mn(df, ['Calories', 'Fat'], 2, 'hist')
    """
    kind = kind.lower()
    single_column_plots = {
        'boxplot': sns.boxplot,
        'boxen': sns.boxenplot,
        'violin': sns.violinplot,
    }
    if kind != 'hist' and kind not in single_column_plots:
        supported = sorted(list(single_column_plots) + ['hist'])
        raise ValueError(f'unsupported kind {kind!r}; expected one of {supported}')

    cols = list(cols)
    n = len(cols)
    if n == 0:
        # Nothing to draw; matplotlib cannot build a grid with zero columns.
        return
    n_cols = math.ceil(n / n_rows)
    # squeeze=False always yields a 2-D array of axes, so ravel() also works
    # for a 1x1 grid (where plt.subplots would otherwise return a bare Axes).
    fig, ax = plt.subplots(n_rows, n_cols, figsize=(n_cols*3, n_rows*3.5), squeeze=False)
    ax = ax.ravel()
    for i, c in enumerate(cols):
        if df[c].dtype.name == 'category':
            sns.countplot(data=df, x=c, ax=ax[i])
        elif kind == 'hist':
            if hasattr(sns, 'histplot'):
                sns.histplot(df[c], kde=False, ax=ax[i], color=color)
            else:
                # Older seaborn without histplot: keep the original call.
                sns.distplot(df[c], hist=True, kde=False, ax=ax[i], color=color)
        else:
            single_column_plots[kind](data=df[[c]], ax=ax[i], color=color)
    # Hide grid cells that have no column assigned to them.
    for spare in ax[n:]:
        spare.set_visible(False)
    # Lay out after drawing so labels and titles are taken into account.
    fig.tight_layout()
# Histogram / count-plot grid for every column, laid out on 3 rows.
plot_mn(df, df.columns, 3, 'hist')
df.columns
# Summary statistics: numeric columns first, then the categorical ones.
df.describe().T.round(1)
df.describe(include='category').T
# Select by 'number' rather than 'int64' so float-valued numeric columns
# are listed as numerical too.
numerical = df.select_dtypes('number').columns
categorical = df.select_dtypes('category').columns
print(f'Numerical Columns: {numerical}')
print('-'* 100)
print(f'Categorical Columns: {categorical}')
# Number of distinct values per column, largest first.
df.nunique().sort_values(ascending=False)
# Age: summary statistics, most frequent value, and skewness.
df["Age"].describe().round(1)
df.Age.mode()[0]
print(stats.skew(df.Age))
# Histogram of Age with a marginal box plot; bars get a thin black outline.
fig = px.histogram(df, x='Age', marginal='box')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
# Sex: frequency of each category as a table and as a bar chart.
df.Sex.value_counts()
temp_df = df.groupby(by="Sex", as_index=False).agg(
    counts=pd.NamedAgg(column="Sex", aggfunc="count")).sort_values(by="counts", ascending=False)
# The bars are coloured by a categorical column, so the qualitative D3
# palette has to be supplied as a discrete sequence; as a continuous scale
# it was ignored.
fig = px.bar(temp_df,
             x='Sex',
             y='counts',
             color='Sex',
             color_discrete_sequence=px.colors.qualitative.D3)
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
# ChestPainType: frequency of each category as a table and as a bar chart.
df.ChestPainType.value_counts()
temp_df = df.groupby(by="ChestPainType", as_index=False).agg(
    counts=pd.NamedAgg(column="ChestPainType", aggfunc="count")).sort_values(by="counts", ascending=False)
# Colour by the category itself so the qualitative D3 palette is applied;
# with the numeric 'counts' column as colour the discrete sequence was ignored.
fig = px.bar(temp_df,
             x='ChestPainType',
             y='counts',
             color='ChestPainType',
             color_discrete_sequence=px.colors.qualitative.D3)
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
# RestingBP: summary statistics, most frequent value, and skewness.
df.RestingBP.describe().round()
df.RestingBP.mode()[0]
print(stats.skew(df.RestingBP))
# Histogram with a marginal box plot; bars get a thin black outline.
fig = px.histogram(df, x='RestingBP', marginal='box')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
# Rows whose RestingBP is recorded as 0.
df[df['RestingBP'] == 0]
# Cholesterol: summary statistics, most frequent value, and skewness.
df.Cholesterol.describe().round()
df.Cholesterol.mode()[0]
print(stats.skew(df.Cholesterol))
# Histogram with a marginal box plot; bars get a thin black outline.
fig = px.histogram(df, x='Cholesterol', marginal='box')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
# Rows recorded as 0, and the values at or above 500.
df[df['Cholesterol'] == 0].head()
df[df['Cholesterol'] >= 500]['Cholesterol']
# FastingBS: frequency of each category, then a histogram coloured by category.
df.FastingBS.value_counts()
fig = px.histogram(df, x='FastingBS', color='FastingBS')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
# RestingECG: frequency of each category as a table and as a bar chart.
df["RestingECG"].value_counts()
temp_df = df.groupby(by="RestingECG", as_index=False).agg(
    counts=pd.NamedAgg(column="RestingECG", aggfunc="count")).sort_values(by="counts", ascending=False)
# Colour by the category itself so the qualitative D3 palette is applied;
# with the numeric 'counts' column as colour the discrete sequence was ignored.
fig = px.bar(temp_df,
             x='RestingECG',
             y='counts',
             color='RestingECG',
             color_discrete_sequence=px.colors.qualitative.D3)
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
# MaxHR: summary statistics, most frequent value, and skewness
# (computed with pandas' Series.skew here rather than scipy.stats.skew).
df.MaxHR.describe().round()
df.MaxHR.mode()[0]
df.MaxHR.skew()
# Histogram with a marginal box plot; bars get a thin black outline.
fig = px.histogram(df, x='MaxHR', marginal='box')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
# ExerciseAngina: frequency of each category as a table and as a bar chart.
df.ExerciseAngina.value_counts()
temp_df = df.groupby(by="ExerciseAngina", as_index=False).agg(
    counts=pd.NamedAgg(column="ExerciseAngina", aggfunc="count")).sort_values(by="counts", ascending=False)
# The bars are coloured by a categorical column, so the qualitative D3
# palette has to be supplied as a discrete sequence; as a continuous scale
# it was ignored.
fig = px.bar(temp_df,
             x='ExerciseAngina',
             y='counts',
             color='ExerciseAngina',
             color_discrete_sequence=px.colors.qualitative.D3)
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
# Oldpeak: summary statistics, most frequent value, and skewness.
df.Oldpeak.describe().round()
df.Oldpeak.mode()[0]
df.Oldpeak.skew()
# Histogram with a marginal box plot; bars get a thin black outline.
fig = px.histogram(df, x='Oldpeak', marginal='box')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
# ST_Slope: frequency of each category as a table and as a bar chart.
df.ST_Slope.value_counts()
temp_df = df.groupby(by="ST_Slope", as_index=False).agg(
    counts=pd.NamedAgg(column="ST_Slope", aggfunc="count")).sort_values(by="counts", ascending=False)
# The bars are coloured by a categorical column, so the qualitative D3
# palette has to be supplied as a discrete sequence; as a continuous scale
# it was ignored.
fig = px.bar(temp_df,
             x='ST_Slope',
             y='counts',
             color='ST_Slope',
             color_discrete_sequence=px.colors.qualitative.D3)
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
# HeartDisease (the target): class counts, then a histogram coloured by class.
df.HeartDisease.value_counts()
fig = px.histogram(df, x='HeartDisease', color='HeartDisease')
fig.update_traces(marker_line_width=1, marker_line_color="black")
fig.show()
# Pairwise correlation of the numeric columns only. The category columns are
# excluded explicitly because recent pandas versions raise when corr() is
# asked to handle non-numeric data instead of silently dropping it.
dcorr = df.select_dtypes('number').corr()
# dcorr
# Boolean mask hiding the upper triangle (diagonal included), so each pair
# is shown once.
mask = np.triu(np.ones(dcorr.shape, dtype=bool))
# mask.shape
fig, ax = plt.subplots(figsize=(10,8))
sns.heatmap(dcorr, cmap=sns.diverging_palette(10, 145, n=100),
            vmin=-1, vmax=1, center=0, linewidths=1, annot=True, mask=mask, ax=ax).set_title("Correlation\nHeatmap", fontsize=22,fontweight="bold");
# IPython shell escape (not valid in plain Python). Presumably installed for
# the trendline="ols" scatter plots that follow - confirm.
!pip install statsmodels
# Scatter each numeric feature against Age with marginal histograms and a red
# OLS trendline; the correlation with Age is shown in the title. The title is
# built from the column name, which also fixes the former "Cholester0l" typo.
for y_col in ("RestingBP", "Cholesterol", "Oldpeak", "MaxHR"):
    corr_value = df["Age"].corr(df[y_col])
    fig = px.scatter(
        data_frame=df,
        x="Age",
        y=y_col,
        marginal_x="histogram",
        marginal_y="histogram",
        trendline="ols",
        trendline_color_override="red",
        title=f"Correlation between Age and {y_col} is: {corr_value:.2f}",
    )
    fig.show()
# 2-D density heatmaps of Age against RestingBP, Cholesterol, MaxHR and
# Oldpeak in turn, all using the "PuBu" colour scale.
fig = px.density_heatmap(
data_frame=df, x="Age", y="RestingBP", color_continuous_scale="PuBu"
)
fig.show()
fig = px.density_heatmap(
data_frame=df, x="Age", y="Cholesterol", color_continuous_scale="PuBu"
)
fig.show()
fig = px.density_heatmap(
data_frame=df, x="Age", y="MaxHR", color_continuous_scale="PuBu"
)
fig.show()
fig = px.density_heatmap(
data_frame=df, x="Age", y="Oldpeak", color_continuous_scale="PuBu"
)
fig.show()
# Pairwise plots coloured by HeartDisease, in seaborn's corner layout.
# The trailing semicolon presumably suppresses the notebook's echo of the
# returned object.
sns.pairplot(df, hue="HeartDisease", palette="husl", corner=True);
# Age vs. RestingBP / Cholesterol / MaxHR density heatmaps, one facet per
# HeartDisease value. Each title now names the variable actually plotted on
# the y axis (all three previously read "Chelesterol").
# These are bare expressions: they display only when they are the last
# statement of a notebook cell.
px.density_heatmap(
    data_frame=df,
    x="Age",
    y="RestingBP",
    color_continuous_scale="PuBu",
    facet_col="HeartDisease",
    title="Age vs. RestingBP for different Heart Disease values",
)
px.density_heatmap(
    data_frame=df,
    x="Age",
    y="Cholesterol",
    color_continuous_scale="PuBu",
    facet_col="HeartDisease",
    title="Age vs. Cholesterol for different Heart Disease values",
)
px.density_heatmap(
    data_frame=df,
    x="Age",
    y="MaxHR",
    color_continuous_scale="PuBu",
    facet_col="HeartDisease",
    title="Age vs. MaxHR for different Heart Disease values",
)
def num_plot(df, col):
    """
    Show a 700x400 histogram of ``col`` split by ``HeartDisease``.

    The figure carries a marginal box plot, a legend, and a thin black
    outline around each bar; it is displayed immediately and nothing is
    returned.
    """
    (
        px.histogram(df, x=col, color="HeartDisease", marginal="box")
        .update_layout(height=400, width=700, showlegend=True)
        .update_traces(marker_line_width=1, marker_line_color="black")
        .show()
    )
# Per-column histogram split by HeartDisease, for every column of the frame.
cols = df.columns
for col in cols:
    num_plot(df, col)

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, ShuffleSplit, cross_val_score, cross_val_predict, cross_validate
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, precision_recall_fscore_support, f1_score
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was removed from newer scikit-learn releases;
    # provide a same-named shim so later calls keep working.
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Forward to ``ConfusionMatrixDisplay.from_estimator`` with the same arguments."""
        return ConfusionMatrixDisplay.from_estimator(estimator, X, y_true, **kwargs)

# Model inputs and target.
feature_cols = ['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
                'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope']
target_cols = 'HeartDisease'
X = df[feature_cols]
y = df[target_cols]
# 85/15 split, stratified on the target, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
def data_preprocessing(X_train):
    """
    Replace implausible Cholesterol / RestingBP readings with the column mean.

    * ``Cholesterol``: values equal to 0 or >= 500 are replaced by the mean
      of the strictly positive Cholesterol values.
    * ``RestingBP``: values equal to 0 are replaced by the mean of the
      strictly positive RestingBP values. (The mean was previously taken
      from the Cholesterol column by mistake.)

    Args:
        X_train: DataFrame containing ``Cholesterol`` and ``RestingBP``.

    Returns:
        A cleaned copy of ``X_train``; the input frame is left unmodified.
        The two cleaned columns become float because the replacement value
        is a mean.
    """
    # Work on a copy: the frame usually comes out of train_test_split, and
    # chained in-place edits on it are unreliable across pandas versions.
    X_train = X_train.copy()

    chol = X_train['Cholesterol']
    mean_chol = chol[chol > 0].mean()
    X_train['Cholesterol'] = chol.mask((chol == 0) | (chol >= 500), mean_chol)

    rest = X_train['RestingBP']
    mean_rest = rest[rest > 0].mean()
    X_train['RestingBP'] = rest.mask(rest == 0, mean_rest)

    return X_train
# Clean the training features.
# NOTE(review): X_test is not passed through data_preprocessing anywhere in
# the visible code - confirm that this is intended.
X_train = data_preprocessing(X_train)
df.columns
# Class balance of the training target.
y_train.value_counts()
# IPython shell escape (not valid in plain Python): install FLAML.
!pip install flaml
from flaml import AutoML
automl = AutoML()
# Search settings handed to automl.fit() below.
# NOTE(review): the log file name mentions "airlines" although the data here
# is heart.csv - possibly left over from another experiment; confirm.
settings = {
"time_budget": 180, # total running time in seconds
"metric": 'accuracy', # can be: 'r2', 'rmse', 'mae', 'mse', 'accuracy', 'roc_auc', 'roc_auc_ovr',
# 'roc_auc_ovo', 'log_loss', 'mape', 'f1', 'ap', 'ndcg', 'micro_f1', 'macro_f1'
"task": 'classification', # task type
"log_file_name": 'airlines_experiment.log', # flaml log file
}
# Run the search on the training split.
automl.fit(X_train=X_train, y_train=y_train, **settings)
''' retrieve best config and best learner'''
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
# best_loss is reported as a loss, so accuracy is printed as 1 - best_loss.
print('Best accuracy on validation data: {0:.4g}'.format(1-automl.best_loss))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))
automl.model
''' compute predictions of testing dataset '''
# NOTE(review): X_test is used exactly as returned by train_test_split; only
# X_train went through data_preprocessing in the visible code.
y_pred = automl.model.predict(X_test)
print('Predicted labels', y_pred)
print('True labels', y_test)
# Second column of predict_proba; presumably the probability of class 1 - confirm.
y_pred_proba = automl.model.predict_proba(X_test)[:,1]
''' compute different metric values on testing dataset'''
from flaml.ml import sklearn_metric_loss_score
# accuracy and roc_auc are printed as 1 - loss; log_loss is printed as returned.
print('accuracy', '=', 1 - sklearn_metric_loss_score('accuracy', y_pred, y_test))
print('roc_auc', '=', 1 - sklearn_metric_loss_score('roc_auc', y_pred_proba, y_test))
print('log_loss', '=', sklearn_metric_loss_score('log_loss', y_pred_proba, y_test))
print('Training duration of best run: {0:.4g} s'.format(automl.best_config_train_time))
# Confusion matrix on the test split: raw counts first, then a plot
# normalised with normalize='true'.
cm = confusion_matrix(y_test, automl.model.predict(X_test))
cm
plot_confusion_matrix(automl.model, X_test, y_test, cmap='Oranges', normalize='true')
# SHAP is imported here, just before the explanation code that uses it.
import shap
print(f'shap version {shap.__version__}')
# load JS visualization code to notebook
shap.initjs()
def case_detail(case_data):
    """
    Turn the object returned by ``shap.force_plot()`` into a DataFrame.

    ``case_data.data['features']`` is loaded into a frame with one column
    per key; each column is then renamed to the entry of
    ``case_data.data['featureNames']`` found at that key.
    """
    payload = case_data.data
    names = payload['featureNames']
    table = pd.DataFrame(payload['features'])
    table.columns = [names[key] for key in payload['features']]
    return table
def individual_case_plot(explainer, X, case_index, verbose=False):
    """
    Build a SHAP force plot for a single row of ``X``.

    Args:
        explainer: Object exposing ``shap_values(...)`` and ``expected_value``.
        X: DataFrame of features; the row is selected by position (``.iloc``).
        case_index: Integer position of the row to explain.
        verbose: When true, pretty-print the attributes of the plot object.

    Returns:
        The object returned by ``shap.force_plot``.

    >>> individual_case_plot(explainer, X_train, 1)
    """
    # Imported locally: no file-level import of pprint is visible, so the
    # verbose branch previously raised NameError.
    from pprint import pprint

    # Double brackets keep the selected row as a one-row DataFrame.
    shap_values = explainer.shap_values(X.iloc[[case_index]])
    g = shap.force_plot(explainer.expected_value, shap_values=shap_values, features=X.iloc[case_index, :])
    if verbose:
        pprint(g.__dict__)
    return g
# explain the model's predictions using SHAP
explainer = shap.TreeExplainer(automl.model)
shap_values = explainer.shap_values(X_test)
X
shap_values[:3]
# The SHAP values above were computed on X_test, so the feature frame passed
# alongside them must be X_test as well (it used to be X_train).
shap.summary_plot(shap_values, X_test, plot_type="bar")
feature_cols
# One row of SHAP values per explained sample, one column per feature.
dshap = pd.DataFrame(shap_values, columns=feature_cols)
dshap