# importing used libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
#### ⬇️⬇️ uncomment this code to install the plotly library - it may take a minute or two :) ######
# !pip install plotly==5.4.0
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Read the heart-disease CSV into a DataFrame and run basic sanity checks.
# NOTE: the ">> ..." messages below are fixed text; they are not derived from
# the values printed next to them.
df = pd.read_csv('heart.csv')
print(df.shape, ">> we have 918 records, and 12 attributes")
# Bare expressions: their value is only rendered in an interactive session.
df.dtypes
df.head()
# Per-column count of null entries.
missing_per_column = df.isna().sum()
print(missing_per_column)
print(">> there are no empty/missing/null records")
# Number of rows flagged as duplicates of another row.
duplicate_rows = df.duplicated().sum()
print(duplicate_rows, ">> there are no duplicate records found")
# Separate the feature columns by type so each group can be explored on its own.
# Numerical features.
num_columns = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak']
# Categorical features.
cat_columns = [
    'Sex',
    'ChestPainType',
    'RestingECG',
    'ExerciseAngina',
    'ST_Slope',
    'FastingBS',
]
# One histogram per numerical feature, coloured by the HeartDisease column,
# with a box plot in the margin.
for col in num_columns:
    fig = px.histogram(
        df,
        x=col,
        color="HeartDisease",
        marginal="box",
        width=600,
        height=400,
        title="Exploring " + col + " column",
    )
    # Outline each bar with a thin black line, then render the figure.
    fig.update_traces(marker_line_width=1, marker_line_color="black")
    fig.show()
# RestingBP check: count the rows recorded with a value of exactly 0.
zero_bp_mask = df["RestingBP"] == 0
print(zero_bp_mask.sum(), ">> row with RestingBP = 0")
# Display the matching row(s); only rendered in an interactive session.
df[zero_bp_mask]
# Keep only the rows with a strictly positive RestingBP.
df = df[df["RestingBP"] > 0]
print(df.shape, ">> we now have 917 records; we have successfully dropped the anomaly")
# Cholesterol check: count the rows recorded with a value of exactly 0.
chol_zero_mask = df["Cholesterol"] == 0
print(chol_zero_mask.sum(), ">> rows with Cholesterol = 0")
# Rows with a zero reading; the bare expression only renders interactively.
df_chol_is_zero = df[chol_zero_mask]
df_chol_is_zero
# Subset with every zero-cholesterol row excluded (df itself is left untouched here).
df_chol_is_not_zero = df[df["Cholesterol"] != 0]
print(df_chol_is_not_zero.shape)  # shape once the zero-cholesterol rows are excluded
# Repeat the per-feature histograms on the subset that excludes the
# zero-cholesterol rows.
for col in num_columns:
    fig = px.histogram(
        df_chol_is_not_zero,
        x=col,
        color="HeartDisease",
        marginal="box",
        width=600,
        height=400,
        title="Exploring " + col + " column (without Cholesterol outliers)",
    )
    # Outline each bar with a thin black line, then render the figure.
    fig.update_traces(marker_line_width=1, marker_line_color="black")
    fig.show()
# Compare the HeartDisease class balance with and without the zero-cholesterol
# rows, side by side on one figure.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 7))
balance_panels = (
    (df, axes[0], "#8d426e", "with outliers"),
    (df_chol_is_not_zero, axes[1], "#ce6d6c", "without outliers"),
)
for panel_frame, panel_ax, panel_color, panel_title in balance_panels:
    sns.histplot(panel_frame["HeartDisease"], ax=panel_ax, color=panel_color).set(title=panel_title)
# HeartDisease is plotted on the x axis; show ticks only at 0 and 1.
plt.setp(axes, xticks=[0, 1])
fig.tight_layout(pad=6)
# Keep only the rows with a strictly positive Cholesterol value.
df = df[df["Cholesterol"] > 0]
print(df.shape, ">> we now have 746 records; we have successfully dropped the anomalies")
# Add human-readable label columns for plotting.
# df was produced by boolean filtering, so take an explicit copy first:
# adding columns to such a slice raises pandas' SettingWithCopyWarning.
df = df.copy()
# HeartDisease == 1 -> 'Has Heart Disease'; every other value -> 'No Heart Disease'.
df["HeartDiseaseLabel"] = np.where(
    df["HeartDisease"] == 1, 'Has Heart Disease', 'No Heart Disease'
)
# FastingBS == 1 -> '> 120 mg/dl'; every other value -> '<= 120 mg/dl'.
df["FastingBSLabel"] = np.where(
    df["FastingBS"] == 1, '> 120 mg/dl', '<= 120 mg/dl'
)
# Bare expression: only rendered in an interactive session.
df
# Split the data by outcome.
disease = df[df['HeartDisease'] == 1]
noDisease = df[df['HeartDisease'] == 0]
# Open a fresh figure before the heatmap: sns.heatmap draws on the
# currently-active Axes when no `ax` is passed, so without this it can land
# on an axes left over from an earlier plot.
plt.figure()
# Heatmap of the raw Cholesterol and Age values for the rows with HeartDisease == 1.
sns.heatmap(data=disease[["Cholesterol", "Age"]]);
# Earlier attempt, left disabled:
# df_hm = df.pivot("Cholesterol", "Age")
#sns.heatmap(df_hm)
# Bivariate distribution of Age vs Cholesterol, coloured by heart-disease label.
ax0=sns.displot(data = df, x = "Age", y = "Cholesterol", hue="HeartDiseaseLabel", palette="husl", height=6);
plt.title("Heart Disease vs Cholesterol");
# Helper for plotting a single attribute against the heart-disease label.
def plot_factor(factor_name, xticks = None, title = None):
    """Show a count plot of one column of ``df`` split by ``HeartDiseaseLabel``.

    Reads the module-level ``df``; draws on a new 13x7 figure and displays it.

    Args:
        factor_name: Name of the ``df`` column placed on the x axis.
        xticks: Optional list of replacement x tick labels. They are applied
            positionally, so the caller must supply them in the order the
            categories are drawn.
        title: Optional plot title; when omitted (or empty) the title is
            "Heart Disease vs <factor_name>".

    Returns:
        None.
    """
    plt.figure(figsize=(13, 7))
    categories = df[factor_name]
    outcome = df["HeartDiseaseLabel"]
    ax = sns.countplot(x=categories, data=df, hue=outcome, palette="flare")
    # NOTE: the theme/font scale is set after the bars are drawn, exactly as
    # before; it is in effect for the text set below and for later plots.
    sns.set(font_scale=1.5)
    heading = title if title else "Heart Disease vs " + factor_name
    ax.set_title(heading)
    if xticks:
        ax.set_xticklabels(xticks)
    ax.set(ylabel="Count")
    plt.show()
# Count plots with custom tick labels.
# NOTE(review): the labels are applied positionally by plot_factor — confirm
# they match the order in which the categories are drawn.
plot_factor(
    "Sex",
    xticks=["Male", "Female"],
    title="Heart Disease Distribution Among Male and Female",
)
plot_factor("RestingECG", xticks=["Normal", "ST","LVH"])
# Count plots that keep the default tick labels and title.
for factor in ("ChestPainType", "ExerciseAngina", "FastingBSLabel", "ST_Slope"):
    plot_factor(factor)
# Distribution of MaxHR split by heart-disease label, with a KDE curve overlaid.
plt.figure(figsize=(14, 7))
ax6 = sns.histplot(
    data=df,
    x=df["MaxHR"],
    hue=df["HeartDiseaseLabel"],
    palette="flare",
    kde=True,
);
# Font scale goes back to 1 here, after the histogram has been drawn.
sns.set(font_scale=1)
ax6.set(title="Heart Disease Vs Maximum Heart Rate", ylabel="Count");
# Two bivariate plots that share ChestPainType on the x axis.
chest_pain_kwargs = dict(data=df, x="ChestPainType", palette="husl")
# Age by chest pain type, coloured by Sex.
ax8 = sns.displot(y="Age", hue="Sex", height=5, **chest_pain_kwargs);
plt.title("Sex & Age vs Chest Pain types");
# Cholesterol by chest pain type, coloured by heart-disease label.
# ax8 is rebound to the second plot's return value.
ax8 = sns.displot(y="Cholesterol", hue="HeartDiseaseLabel", height=6, **chest_pain_kwargs);
plt.title("Chest Pain Types & Cholesterol vs Heart Disease");