# Heart Disease Prediction

# Libraries used for exploratory data analysis (EDA) and visualization.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load the heart-disease data set; the 'target' column is used as the label.
df = pd.read_csv("heart.csv")

# Preview the first rows.
df.head()

# (rows, columns). The original notes report 303 patients and 13 features
# in addition to the 'target' label.
df.shape

# Count of missing values per column (the original notes report none).
df.isna().sum()

# Count plot: number of patients with / without heart disease.
plt.figure(figsize=(8, 6))
sns.countplot(
    data=df,
    x="target",
);  # trailing ';' keeps notebooks from echoing the returned Axes

# Heatmap of the pairwise correlations between all columns, with the
# coefficient printed in every cell.
corr_matrix = df.corr()
plt.figure(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True, cmap="viridis");

# Correlation of each column with the label, sorted ascending. The final
# (largest) entry is dropped; that is normally 'target' itself, whose
# self-correlation is 1.
target_corr_sorted = df.corr()["target"].sort_values()
corr_target = target_corr_sorted.iloc[:-1]

# Bar plot of those correlations, feature names rotated to stay readable.
plt.figure(figsize=(10, 8))
sns.barplot(x=corr_target.index, y=corr_target)
plt.xticks(rotation=90);

# The five features with the highest absolute correlation with 'target'.
# Position 0 is skipped because it is the top-ranked entry (normally
# 'target' itself).
abs_target_corr = df.corr()["target"].abs()
abs_target_corr.sort_values(ascending=False).iloc[1:6]

# Pairwise scatter plots of the selected features, coloured by label.
# corner=True draws only the lower triangle of the grid.
high_corr_cols = ["exang", "cp", "oldpeak", "thalach", "ca"]
sns.pairplot(
    data=df,
    vars=high_corr_cols,
    hue="target",
    markers=["o", "s"],
    corner=True,
);

# Tools for splitting the data and standardizing the features.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Train | Test split: features are every column except the label.
y = df["target"]
X = df.drop(columns="target")

# Hold out 15% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics leak into it.
scaler = StandardScaler()
scaler.fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)

# Classifier and hyper-parameter search.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Base model. The 'saga' solver is used so that the l1, l2 and elastic-net
# penalties can all be searched with the same estimator.
log_model = LogisticRegression(solver='saga', max_iter=3000, random_state=42)

# Parameter grid for the search.
# 'l1_ratio' only has an effect when penalty='elasticnet'. Crossing it with
# 'l1' and 'l2' in a single dict refits the same model ten times per C value
# and raises a warning on each fit, so the grid is split into two sub-grids
# covering exactly the same set of distinct models.
C_values = np.logspace(0, 10, 20)
param_grid = [
    {'penalty': ['l1', 'l2'], 'C': C_values},
    {
        'penalty': ['elasticnet'],
        'l1_ratio': np.linspace(0, 1, 10),
        'C': C_values,
    },
]

# Grid search that selects the candidate with the best recall.
grid = GridSearchCV(estimator=log_model, param_grid=param_grid, scoring='recall')

# Run the hyper-parameter search on the standardized training data.
grid.fit(X=scaled_X_train, y=y_train)

# Hyper-parameters of the best-scoring candidate.
grid.best_params_

# Create a fresh logistic-regression model from the hyper-parameters the grid
# search selected. Reading them from grid.best_params_ (instead of copying
# the values by hand) keeps this model in sync if the data or the grid change.
best_params = dict(grid.best_params_)
if best_params.get('penalty') != 'elasticnet':
    # l1_ratio is only meaningful for the elastic-net penalty; dropping it
    # avoids a spurious warning for l1 / l2.
    best_params.pop('l1_ratio', None)

log_grid = LogisticRegression(
    solver='saga',
    max_iter=3000,
    random_state=42,
    **best_params,
)
log_grid.fit(scaled_X_train, y_train)

# Fitted coefficients of the model (row 0 is the one used below).
log_grid.coef_

# Label each coefficient with its feature name and order them from most
# negative to most positive.
coefficients = pd.Series(data=log_grid.coef_[0], index=X.columns)
coefs = coefficients.sort_values()

# Visualize the sorted coefficients as a bar plot.
plt.figure(figsize=(10, 8))
sns.barplot(x=coefs.index, y=coefs.values);

from sklearn.metrics import (
    ConfusionMatrixDisplay,
    PrecisionRecallDisplay,
    RocCurveDisplay,
    classification_report,
)

# The plot_* helpers were deprecated in scikit-learn 1.0 and removed in 1.2.
# Keep the names available on every version by falling back to the
# equivalent Display.from_estimator constructors, which take the same
# (estimator, X, y) positional arguments.
try:
    from sklearn.metrics import (
        plot_confusion_matrix,
        plot_precision_recall_curve,
        plot_roc_curve,
    )
except ImportError:
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
    plot_precision_recall_curve = PrecisionRecallDisplay.from_estimator
    plot_roc_curve = RocCurveDisplay.from_estimator

# Predicted labels for the held-out (standardized) test set.
log_pred = log_grid.predict(X=scaled_X_test)

# Confusion matrix on the test set.
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator (scikit-learn >= 1.0) replaces it.
ConfusionMatrixDisplay.from_estimator(log_grid, scaled_X_test, y_test)

# False negative rate (%) = FN / (FN + TP).
# Computed from the test-set predictions rather than from counts copied by
# hand off the confusion-matrix plot, so it stays correct if the split or the
# model changes.
# Assumes 'target' is coded 0/1 with 1 as the positive class (the default
# positive label of the 'recall' scorer) -- TODO confirm.
actual_positive = np.asarray(y_test) == 1
false_negatives = np.sum(actual_positive & (np.asarray(log_pred) == 0))
np.round(false_negatives / actual_positive.sum() * 100, 2)

# Per-class precision / recall / f1 on the test set
# (the original run reported scores around 80%).
report = classification_report(y_test, log_pred)
print(report)

# ROC (Receiver Operating Characteristic) curve on the test set.
# For this problem we should accept more false positives in order to reduce
# false negatives.
# plot_roc_curve was removed in scikit-learn 1.2;
# RocCurveDisplay.from_estimator (scikit-learn >= 1.0) replaces it.
RocCurveDisplay.from_estimator(log_grid, scaled_X_test, y_test)

# Precision-recall curve on the test set.
# plot_precision_recall_curve was removed in scikit-learn 1.2;
# PrecisionRecallDisplay.from_estimator (scikit-learn >= 1.0) replaces it.
PrecisionRecallDisplay.from_estimator(log_grid, scaled_X_test, y_test)