import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Load the data set
# The four site files are read the same way. A "?" field in the raw files is
# parsed directly as NaN so that missing values can be counted below.
df1 = pd.read_csv('processed.cleveland.data', header=None, na_values='?')
df2 = pd.read_csv('processed.switzerland.data', header=None, na_values='?')
df3 = pd.read_csv('processed.hungarian.data', header=None, na_values='?')
df4 = pd.read_csv('processed.va.data', header=None, na_values='?')
# Each frame is read with its own 0-based row labels; renumber while stacking
# so the combined frame does not carry duplicate index labels.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)
df.columns = [
    'age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol',
    'fasting_blood_sugar', 'resting_electrocardiographic_results',
    'maximum_heart_rate', 'exercise_induced_angina', 'ST_depression',
    'slope_peak_exercise', 'no_major_vessels_colored', 'thal',
    'diagnosis_of_heart_disease',
]
# Collapse the diagnosis into two states instead of multiple levels:
# 0 stays 0 ('Healthy'); 1, 2, 3 and 4 all become 1 ('Sick').
replace_table = {1: 1, 2: 1, 3: 1, 4: 1}
df.diagnosis_of_heart_disease = df.diagnosis_of_heart_disease.replace(replace_table)
df
# Inspect the dtype pandas holds for each column.
datatypes = df.dtypes
datatypes
# Total percentage of missing data: missing cells divided by all cells
# (rows x columns) in the frame.
missing_data = df.isnull().sum()
n_rows, n_columns = df.shape
total_percentage = missing_data.sum() / (n_rows * n_columns) * 100
print(f'The total percentage of missing data is {round(total_percentage,2)}%')
# Output (recorded from an earlier run):
# The total percentage of missing data is 13.66%
# Missing data per column: absolute count and percentage of rows, largest
# first, keeping only the columns that actually have missing values.
null_counts = df.isnull().sum()
total = null_counts.sort_values(ascending=False)
share_missing = (null_counts / len(df)).sort_values(ascending=False)
percent_total = round(share_missing * 100, 2)
missing = pd.concat([total, percent_total], axis=1, keys=["Total", "Percentage"])
has_missing = missing['Total'] > 0
missing_data = missing[has_missing]
missing_data
# Bar chart of the percentage of missing values for each affected feature.
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
plt.figure(figsize=(24, 6))
sns.set(style="whitegrid")
# barplot draws on the current axes and returns them; label through that handle.
missing_ax = sns.barplot(
    x=missing_data.index,
    y=missing_data['Percentage'],
    data=missing_data,
)
missing_ax.set_title('Percentage of missing data by feature')
missing_ax.set_xlabel('Features', fontsize=14)
missing_ax.set_ylabel('Percentage', fontsize=14)
plt.show()
# The two columns with too many missing values are dropped, then the
# per-column missing-data table is rebuilt for the remaining columns.
df.drop(columns=['no_major_vessels_colored', 'thal'], inplace=True)
null_counts = df.isnull().sum()
total = null_counts.sort_values(ascending=False)
share_missing = (null_counts / len(df)).sort_values(ascending=False)
percent_total = round(share_missing * 100, 2)
missing = pd.concat([total, percent_total], axis=1, keys=["Total", "Percentage"])
has_missing = missing['Total'] > 0
missing_data = missing[has_missing]
missing_data
# Replace every remaining missing value with the numeric placeholder -9 and
# convert the whole dataframe to float.
# NOTE(review): -9 is a sentinel, not a measurement; every later plot and
# model in this file sees it as a real value - confirm this is intended.
df = df.fillna(-9).astype(float)
# [As a quick reference] Columns data for the plot
# *3 age: age in years
# *4 sex: sex (1 = male; 0 = female)
# *9 cp: chest pain type
# *10 trestbps: resting blood pressure (in mm Hg on admission to the hospital)
# 11 htn
# *12 chol: serum cholesterol in mg/dl
# 14 cigs (cigarettes per day)
# 15 years (number of years as a smoker)
# *16 fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
# 18 famhist: family history of coronary artery disease (1 = yes; 0 = no)
# *19 restecg: resting electrocardiographic results
# 29 thaldur: duration of exercise test in minutes
# 30 thaltime: time when ST measure depression was noted
# 31 met: mets achieved
# *32 thalach: maximum heart rate achieved
# 33 thalrest: resting heart rate
# 34 tpeakbps: peak exercise blood pressure (first of 2 parts)
# 35 tpeakbpd: peak exercise blood pressure (second of 2 parts)
# 37 trestbpd: resting blood pressure
# *38 exang: exercise induced angina (1 = yes; 0 = no)
# *40 oldpeak = ST depression induced by exercise relative to rest
# *41 slope: the slope of the peak exercise ST segment
# *44 ca: number of major vessels (0-3) colored by fluoroscopy
# *51 thal: 3 = normal; 6 = fixed defect; 7 = reversible defect
# *58 num: diagnosis of heart disease (angiographic disease status)
# Make some plots to check the data: one distribution plot per feature,
# laid out two per row in a single tall figure.
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import numpy as np
warnings.filterwarnings('ignore')

# Columns left after dropping 'thal' and 'no_major_vessels_colored'.
variables = ['age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol',
             'fasting_blood_sugar', 'resting_electrocardiographic_results',
             'maximum_heart_rate', 'exercise_induced_angina', 'ST_depression',
             'slope_peak_exercise', 'diagnosis_of_heart_disease']
# Features drawn as 20-bin histograms.
to_group = ['age', 'resting_blood_pressure', 'cholesterol', 'maximum_heart_rate']
# Features drawn as count plots (one bar per distinct value).
others = ['ST_depression', 'sex', 'chest_pain_type', 'fasting_blood_sugar',
          'resting_electrocardiographic_results', 'exercise_induced_angina',
          'slope_peak_exercise', 'diagnosis_of_heart_disease']

# Split each list into rows of (up to) two features and size the grid from
# those rows, so the axes can never be indexed out of range.
hist_rows = [to_group[k:k + 2] for k in range(0, len(to_group), 2)]
count_rows = [others[k:k + 2] for k in range(0, len(others), 2)]
fig, axes = plt.subplots(nrows=len(hist_rows) + len(count_rows), ncols=2,
                         figsize=(30, 60), squeeze=False)

plotting_index = 0
for row_features in hist_rows:
    for ax, feature in zip(axes[plotting_index], row_features):
        ax.hist(df[feature], bins=20)
        ax.set_title(feature + ' Distribution')
        ax.set_xlabel(feature)
        ax.set_ylabel('No. of people')
    plotting_index += 1

for row_features in count_rows:
    for ax, feature in zip(axes[plotting_index], row_features):
        # Pass the column as x= explicitly; a positional Series is not
        # interpreted the same way by every seaborn release.
        sns.countplot(x=df[feature], ax=ax)
        ax.set_title(feature + ' Distribution')
        ax.set_xlabel(feature)
        ax.set_ylabel('No. of people')
    plotting_index += 1

plt.suptitle("Heart Disease Data", fontsize=20)
# tight_layout recomputes the subplot spacing, so reserve the top strip for
# the figure title here instead of calling subplots_adjust beforehand.
plt.tight_layout(rect=(0, 0, 1, 0.97))
# Output (recorded from an earlier run):
# IndexError: index 6 is out of bounds for axis 0 with size 6
# Methodology - multivariate analysis
# A correlation matrix is used to compare the relationship between each pair
# of variables and is drawn as an annotated heatmap.
# Closely related features have a coefficient closest to 1.00 (or -1.00).
# Author's observation from the heatmap: resting_blood_pressure is closely
# related to ST_depression, so it can be considered for dropping in future
# experimentation.
import seaborn as sns
# df_corr was never defined in this file; presumably the pairwise correlation
# of the cleaned dataframe was intended - confirm.
df_corr = df.corr()
f, ax = plt.subplots(figsize=(13, 13))
sns.heatmap(df_corr,
            # All-False mask: every cell is shown. The builtin bool replaces
            # the np.bool alias, which NumPy has removed.
            mask=np.zeros_like(df_corr, dtype=bool),
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax, annot=True)
# Methodology - logistic regression
# >>> King Yeh is on this
# Fit a logistic regression on the 11 feature columns and report its accuracy
# on a held-out 20% test split.
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

new_features = df[['age', 'sex', 'chest_pain_type', 'resting_blood_pressure', 'cholesterol',
                   'fasting_blood_sugar', 'resting_electrocardiographic_results',
                   'maximum_heart_rate', 'exercise_induced_angina', 'ST_depression',
                   'slope_peak_exercise', 'diagnosis_of_heart_disease']]
# The last column is the target; everything before it is a predictor.
x = new_features.iloc[:, :-1]
y = new_features.iloc[:, -1]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.20, random_state=5)

# input from KingYeh
# The features are not scaled, so give the solver more than its default
# iteration budget to converge.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
# Import accuracy_score explicitly (sklearn.metrics is not guaranteed to be
# loaded by `import sklearn` alone) and print the result so it is not
# discarded when the file runs as a script.
logreg_accuracy = accuracy_score(y_test, y_pred)
print("Logistic regression accuracy", logreg_accuracy)
# Tricia
# Inputs for the decision tree, following the 6B diabetes example template.
#####
# The 11 shortlisted predictor columns; the last column,
# 'diagnosis_of_heart_disease', is excluded because it is the target.
features = [
    'age', 'sex', 'chest_pain_type',
    'resting_blood_pressure', 'cholesterol', 'fasting_blood_sugar',
    'resting_electrocardiographic_results', 'maximum_heart_rate',
    'exercise_induced_angina', 'ST_depression', 'slope_peak_exercise',
]
##### define x and y
df_x = df[features]
df_y = df['diagnosis_of_heart_disease']
# Hold out 20% of the rows for testing; the split is seeded so it is repeatable.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=0)
y_test.shape
# Tricia
# Full Tree #df_clf_fulltree
# Train an unrestricted decision tree (entropy criterion; gini is the
# alternative) and report how deep it grew.
from sklearn.tree import DecisionTreeClassifier
# Seed the classifier as well as the split: tie-breaking between equally good
# splits is randomised, so an unseeded tree can differ from run to run.
df_clf_fulltree = DecisionTreeClassifier(criterion='entropy', random_state=0)
df_clf_fulltree.fit(x_train, y_train)
print(df_clf_fulltree.tree_.max_depth)
# Output (recorded from an earlier run): 19
# Tricia
# Pruned Tree #df_clf_prunedtree
# Train a decision tree limited to depth 4 (entropy criterion; gini is the
# alternative) and report its depth.
from sklearn.tree import DecisionTreeClassifier
# Seed the classifier as well as the split: tie-breaking between equally good
# splits is randomised, so an unseeded tree can differ from run to run.
df_clf_prunedtree = DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=0)
df_clf_prunedtree.fit(x_train, y_train)
print(df_clf_prunedtree.tree_.max_depth)
# Output (recorded from an earlier run): 4
# Tricia Decision Tree
# Predict the held-out test rows with both trees, then print and plot the
# confusion matrix of each.
full_results = df_clf_fulltree.predict(x_test)
pruned_results = df_clf_prunedtree.predict(x_test)

# Create confusion matrix
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
cm_f = confusion_matrix(y_test, full_results)  # actual labels, predicted labels
cm_p = confusion_matrix(y_test, pruned_results)
print(cm_f)
print(cm_p)

titles_options = [("Confusion matrix (Full tree)", df_clf_fulltree),
                  ("Confusion matrix (Pruned Tree)", df_clf_prunedtree)]
for title, classifier in titles_options:
    # ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
    # which scikit-learn has removed.
    disp = ConfusionMatrixDisplay.from_estimator(classifier, x_test, y_test,
                                                 cmap=plt.cm.Blues, values_format='d',
                                                 normalize=None)
    disp.ax_.set_title(title)
    print(title)
    print(disp.confusion_matrix)
plt.show()
# Output (recorded from an earlier run):
# [[50 18]
#  [39 77]]
# [[40 28]
#  [20 96]]
# Confusion matrix (Full tree)
# [[50 18]
#  [39 77]]
# Confusion matrix (Pruned Tree)
# [[40 28]
#  [20 96]]
# ## Tricia Decision Tree
# Recall, precision and accuracy for both trees, computed from the confusion
# matrices cm_f (full tree) and cm_p (pruned tree).
# Recall    = TruePositive / (TruePositive + FalseNegative)
# Accuracy  = (TP+TN) / (P+N)
# Precision = TP / (TP+FP)
#
# confusion_matrix(actual, predicted) puts actual classes on the rows and
# predicted classes on the columns, so for labels {0, 1} ravel() yields
# TN, FP, FN, TP in that order.

## Full
# Name suffix: P<predicted label>A<actual label>.
TNP0A0, FPP1A0, FNP0A1, TPP1A1 = cm_f.ravel()
#Recall
recall_predict_noheartdisease_full = TNP0A0 / (TNP0A0 + FPP1A0)  # divide by actual = 0
recall_predict_heartdisease_full = TPP1A1 / (TPP1A1 + FNP0A1)  # divide by actual = 1
print(recall_predict_noheartdisease_full)  # predict no heart disease
print(recall_predict_heartdisease_full)  # predict heart disease
#Accuracy
print("Full tree accuracy", df_clf_fulltree.score(x_test, y_test))
#Precision
precision_predict_heartdisease_full = TPP1A1 / (TPP1A1 + FPP1A0)  # divide by predicted = 1
print("Full tree precision", precision_predict_heartdisease_full)

## Pruned
tn_pruned, fp_pruned, fn_pruned, tp_pruned = cm_p.ravel()
#Recall
recall_predict_noheartdisease_pruned = tn_pruned / (tn_pruned + fp_pruned)  # divide by actual = 0
recall_predict_heartdisease_pruned = tp_pruned / (tp_pruned + fn_pruned)  # divide by actual = 1
print(recall_predict_noheartdisease_pruned)  # predict no heart disease
print(recall_predict_heartdisease_pruned)  # predict heart disease
#Accuracy
print("Pruned tree accuracy", df_clf_prunedtree.score(x_test, y_test))
#Precision
precision_predict_heartdisease_pruned = tp_pruned / (tp_pruned + fp_pruned)  # divide by predicted = 1
print("Pruned tree precision", precision_predict_heartdisease_pruned)
# Output (recorded from an earlier run):
# 0.9877430262045647
# 0.02817265443266994
# Full tree accuracy 0.6902173913043478
# 0.9877430262045647
# 0.02817265443266994
# Pruned tree accuracy 0.7608695652173914
## Tricia update on feature importance
# Horizontal bar charts of each tree's feature importances, stacked in a
# 2-row x 1-column grid: full tree on top, pruned tree below.
import matplotlib.pyplot as plt
fig, (ax0, ax1) = plt.subplots(nrows=2, ncols=1, figsize=(12, 8))
importance_panels = (
    (ax0, df_clf_fulltree, 'Feature Importance Full Tree'),
    (ax1, df_clf_prunedtree, 'Feature Importance Pruned Tree'),
)
for panel_ax, tree, heading in importance_panels:
    panel_ax.barh(features, tree.feature_importances_)
    panel_ax.set_title(heading)
plt.show()
# Pruned tree is a more precise indicator of heart disease
# Chest pain, Age, ST_depression, gender are more key features