# CSCI200U Final Project

# Libraries used throughout the analysis.
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

#### ⬇️⬇️ uncomment the pip line below to install the plotly library -
#### it may take a minute or two :) ######
# !pip install plotly==5.4.0
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Read the CSV dataset into a pandas DataFrame.
df = pd.read_csv('heart.csv')

# Dimensions of the loaded frame as (rows, columns).
dataset_shape = df.shape
print(dataset_shape, ">> we have 918 records, and 12 attributes")

# Data type of every column.
df.dtypes

# Preview of the first rows.
df.head()

# Number of missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)
print(">> there are no empty/missing/null records")

# Number of rows that duplicate an earlier row.
duplicate_rows = df.duplicated().sum()
print(duplicate_rows, ">> there are no duplicate records found")

# Separate the columns into numerical and categorical groups.

# Columns holding numerical data.
num_columns = [
    'Age',
    'RestingBP',
    'Cholesterol',
    'MaxHR',
    'Oldpeak',
]

# Columns holding categorical data.
cat_columns = [
    'Sex',
    'ChestPainType',
    'RestingECG',
    'ExerciseAngina',
    'ST_Slope',
    'FastingBS',
]

# One histogram per numerical column, coloured by the HeartDisease value and
# topped with a marginal box plot.
histogram_options = dict(color="HeartDisease", marginal="box", width=600, height=400)
for col in num_columns:
    fig = px.histogram(df, x=col, title=f"Exploring {col} column", **histogram_options)
    # Outline every bar in black.
    fig.update_traces(marker_line_color="black", marker_line_width=1)
    fig.show()

# Count the records whose RestingBP is exactly 0.
zero_bp_mask = df["RestingBP"].eq(0)
print(zero_bp_mask.sum(), ">> row with RestingBP = 0")

# Display the matching record(s).
df.loc[zero_bp_mask]

# Keep only the records with a positive RestingBP and report the new shape.
df = df.loc[df["RestingBP"].gt(0)]
print(df.shape, ">> we now have 917 records; we have successfully dropped the anomaly")

# Count the records whose Cholesterol is recorded as 0.
zero_chol_mask = df["Cholesterol"].eq(0)
print(zero_chol_mask.sum(), ">> rows with Cholesterol = 0")

# Keep those records in their own frame and display them.
df_chol_is_zero = df.loc[zero_chol_mask]
df_chol_is_zero

# Frame without the zero-Cholesterol records; print its shape to confirm.
df_chol_is_not_zero = df.loc[~zero_chol_mask]
print(df_chol_is_not_zero.shape)

# Re-draw the numerical histograms on the frame without those records.
outlier_free_options = dict(color="HeartDisease", marginal="box", width=600, height=400)
for col in num_columns:
    fig = px.histogram(
        df_chol_is_not_zero,
        x=col,
        title=f"Exploring {col} column (without Cholesterol outliers)",
        **outlier_free_options,
    )
    # Outline every bar in black.
    fig.update_traces(marker_line_color="black", marker_line_width=1)
    fig.show()

# Compare the distribution of HeartDisease with the zero-Cholesterol records
# (left panel) and without them (right panel).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 7))

with_outliers_ax = sns.histplot(df["HeartDisease"], ax=axes[0], color="#8d426e")
with_outliers_ax.set_title("with outliers")

without_outliers_ax = sns.histplot(df_chol_is_not_zero["HeartDisease"], ax=axes[1], color="#ce6d6c")
without_outliers_ax.set_title("without outliers")

# Only tick the two values 0 and 1 on both x axes, then pad the layout.
plt.setp(axes, xticks=[0, 1])
fig.tight_layout(pad=6)

# Keep only the records with a positive Cholesterol value.
# An explicit copy makes ``df`` an independent frame: columns are added to it
# later, and assigning to a bare boolean-filtered selection makes pandas emit
# SettingWithCopyWarning.
df = df[df["Cholesterol"] > 0].copy()
print(df.shape, ">> we now have 746 records; we have successfully dropped the anomalies")

# Human-readable label for HeartDisease: 1 -> 'Has Heart Disease',
# anything else -> 'No Heart Disease'.
df["HeartDiseaseLabel"] = df['HeartDisease'].eq(1).map(
    {True: 'Has Heart Disease', False: 'No Heart Disease'}
)

# Human-readable label for FastingBS: 1 -> '> 120 mg/dl',
# anything else -> '<= 120 mg/dl'.
df["FastingBSLabel"] = df['FastingBS'].eq(1).map(
    {True: '> 120 mg/dl', False: '<= 120 mg/dl'}
)

# Display the frame with the two new columns.
df

# Split the records by their HeartDisease value.
disease = df.loc[df['HeartDisease'].eq(1)]
noDisease = df.loc[df['HeartDisease'].eq(0)]

# Heatmap of the Cholesterol and Age values of the HeartDisease == 1 records.
# (A pivot-based variant, df.pivot("Cholesterol", "Age"), was left disabled
# in the original.)
cholesterol_and_age = disease[["Cholesterol", "Age"]]
sns.heatmap(data=cholesterol_and_age);

# Plot Age (x) against Cholesterol (y) with sns.displot, coloured by the
# heart-disease label.
ax0 = sns.displot(
    df,
    x="Age",
    y="Cholesterol",
    hue="HeartDiseaseLabel",
    height=6,
    palette="husl",
)
plt.title("Heart Disease vs Cholesterol");

def plot_factor(factor_name, xticks=None, title=None):
    """Draw a count plot of one ``df`` column, split by heart-disease label.

    Reads the module-level ``df`` and shows the figure immediately.

    Args:
        factor_name: Name of the column of ``df`` to count on the x axis.
        xticks: Optional sequence of replacement x tick labels, given in the
            order in which the categories are drawn. Must contain one label
            per category.
        title: Optional plot title. Defaults to
            ``"Heart Disease vs " + factor_name``.
    """
    # Theme settings only apply to figures created afterwards, so the font
    # scale is set before drawing. Setting it after the plot left the first
    # chart with different styling than every later one.
    sns.set(font_scale=1.5)

    plt.figure(figsize=(13, 7))
    ax = sns.countplot(data=df, x=factor_name, hue="HeartDiseaseLabel", palette="flare")

    ax.set_title(title if title else "Heart Disease vs " + factor_name)

    if xticks:
        # Pin the current tick positions first: set_xticklabels is only meant
        # to be used on fixed tick positions, and this also makes matplotlib
        # reject a label list whose length does not match the categories.
        ax.set_xticks(ax.get_xticks())
        ax.set_xticklabels(xticks)

    ax.set(ylabel="Count")
    plt.show()

# Count plots of each categorical factor against the heart-disease label.

# Sex, with spelled-out tick labels and a custom title.
plot_factor(
    "Sex",
    xticks=["Male", "Female"],
    title="Heart Disease Distribution Among Male and Female",
)

# Resting ECG result, with explicit tick labels.
plot_factor("RestingECG", xticks=["Normal", "ST", "LVH"])

# Remaining factors use the default tick labels and title.
plot_factor("ChestPainType")

plot_factor("ExerciseAngina")

plot_factor("FastingBSLabel")

plot_factor("ST_Slope")

# Histogram of MaxHR with a KDE curve, split by the heart-disease label.
plt.figure(figsize=(14, 7))
ax6 = sns.histplot(df, x="MaxHR", hue="HeartDiseaseLabel", kde=True, palette="flare")

# NOTE(review): the font scale is set back to 1 only after the histogram has
# been drawn (order kept as in the original) - confirm whether it was meant
# to apply to this chart as well.
sns.set(font_scale=1)

ax6.set_title("Heart Disease Vs Maximum Heart Rate")
ax6.set(ylabel="Count");

# Chest pain type (x) against Age (y), coloured by Sex.
ax8 = sns.displot(
    df,
    x="ChestPainType",
    y="Age",
    hue="Sex",
    height=5,
    palette="husl",
)
plt.title("Sex & Age vs Chest Pain types");

# Chest pain type (x) against Cholesterol (y), coloured by the heart-disease
# label. This rebinds ax8, as the original did.
ax8 = sns.displot(
    df,
    x="ChestPainType",
    y="Cholesterol",
    hue="HeartDiseaseLabel",
    height=6,
    palette="husl",
)
plt.title("Chest Pain Types & Cholesterol vs Heart Disease");