# import useful libraries for EDA and Visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the heart-disease records from the CSV in the working directory.
df = pd.read_csv('heart.csv')
# Peek at the first rows to sanity-check the parse.
df.head()
# the data set has 13 features, and 303 patients
# our label will be the 'target' column
df.shape
# Per-column count of missing values (the author found none).
df.isna().sum()
# Bar chart of how many rows fall into each value of the 'target' label,
# i.e. patients with / without heart disease.
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='target');
# Compute the pairwise correlation matrix once; every plot below reuses it
# instead of calling df.corr() again.
corr_matrix = df.corr()

# heatmap of correlation between all the columns
plt.figure(figsize=(10,10))
sns.heatmap(corr_matrix,annot=True,cmap='viridis');

# Correlation of every feature with the label, in ascending order.
# The label's self-correlation is dropped by name rather than by position,
# so the result does not depend on where 'target' lands after sorting.
corr_target = corr_matrix['target'].drop('target').sort_values()
plt.figure(figsize=(10,8))
sns.barplot(x=corr_target.index,y=corr_target)
plt.xticks(rotation=90);

# these are 5 features with highest (absolute) correlation with target column
top_corr = corr_target.abs().sort_values(ascending=False).head(5)
top_corr

# Derive the column list from the ranking above so it cannot drift out of
# sync with the data (previously it was typed in by hand).
high_corr_cols = top_corr.index.tolist()
sns.pairplot(data=df,vars=high_corr_cols,hue='target',markers=['o','s'],corner=True);
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Separate the feature matrix from the label column.
y = df['target']
X = df.drop(columns='target')

# Hold out 15% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both the training and the test features.
scaler = StandardScaler().fit(X_train)
scaled_X_train = scaler.transform(X_train)
scaled_X_test = scaler.transform(X_test)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# set base model
# 'saga' is used because it supports the l1, l2 and elasticnet penalties.
log_model = LogisticRegression(solver='saga',max_iter=3000,random_state=42)

# set parameter grid for grid search
# l1_ratio only has an effect when penalty='elasticnet', so it is searched
# only in that sub-grid. Crossing it with 'l1'/'l2' (as a single dict would)
# refits identical models ten times over and emits an "l1_ratio is ignored"
# warning for each of them.
c_values = np.logspace(0,10,20)
param_grid = [
    {'penalty':['l1','l2'],'C':c_values},
    {'penalty':['elasticnet'],'l1_ratio':np.linspace(0,1,10),'C':c_values},
]

# set grid model
# Candidates are ranked by recall (cross-validated on the training set).
grid = GridSearchCV(estimator=log_model,param_grid=param_grid,scoring='recall')
grid.fit(scaled_X_train,y_train)

# best hyperparameters
# NOTE: 'l1_ratio' is present in this dict only when elasticnet wins.
grid.best_params_
# Use the model selected by the grid search.
# GridSearchCV (refit=True, its default) has already refit the best
# hyperparameter combination on the whole training set, so best_estimator_
# is a fitted LogisticRegression. Taking it directly keeps this model tied
# to grid.best_params_ instead of to hand-copied values that go stale
# whenever the search result changes.
log_grid = grid.best_estimator_
# how are the coefficients used?
# coef_ has one row per decision function; for this binary problem, row 0
# holds one weight per (scaled) feature.
log_grid.coef_
# Plot the fitted logistic-regression weights, one bar per feature,
# ordered from most negative to most positive.
coefficients = pd.Series(log_grid.coef_[0], index=X.columns)
coefs = coefficients.sort_values()
plt.figure(figsize=(10, 8))
sns.barplot(x=coefs.index, y=coefs.values);
from sklearn.metrics import classification_report, confusion_matrix
# The plot_* helpers were deprecated in scikit-learn 1.0 and removed in 1.2.
# Fall back to the Display.from_estimator constructors, which take the same
# leading (estimator, X, y) arguments, and bind them to the old names so the
# calls below work on both old and new releases.
try:
    from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, plot_precision_recall_curve
except ImportError:
    from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator
    plot_roc_curve = RocCurveDisplay.from_estimator
    plot_precision_recall_curve = PrecisionRecallDisplay.from_estimator

# Hard class predictions for the held-out test set.
log_pred = log_grid.predict(scaled_X_test)
# Confusion Matrix
plot_confusion_matrix(log_grid,scaled_X_test,y_test)
# False Negative Rate (in percent)
# FN/(FN+TP), read from the confusion matrix rather than typed in by hand so
# it stays correct when the model or the split changes.
# For a binary problem, ravel() yields the counts in the order tn, fp, fn, tp.
tn, fp, fn, tp = confusion_matrix(y_test,log_pred).ravel()
np.round(fn/(fn+tp)*100,2)
# Per-class precision / recall / f1 (the author observed scores around 80%)
print(classification_report(y_test,log_pred))
# ROC, Receiver Operator Characteristic Curve
# for our case, we should accept more false positives to reduce false negatives
plot_roc_curve(log_grid,scaled_X_test,y_test)
# Precision-Recall curve for the same model and test set
plot_precision_recall_curve(log_grid,scaled_X_test,y_test)