UWI Heart Failure Prediction

import pandas as pd

# Used for preprocessing of the data
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder

# Figuring out if our data variables' distribution is Gaussian
from scipy.stats import normaltest
from scipy.stats import shapiro

# Used for the models to predict the data
# Logistic Regression Model
from sklearn.linear_model import LogisticRegression
# Neural Net Model
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
# Support Vector Machine Model
from sklearn.svm import SVC

# Getting rid of outliers
from sklearn.neighbors import LocalOutlierFactor

# Training
from sklearn.model_selection import train_test_split

# Used for visualization
#import scikitplot as skplt
from sklearn.metrics import classification_report, confusion_matrix

# Used for accuracy of the models
from sklearn.metrics import mean_absolute_error

# Load the training data set from disk and preview its first five rows.
heartData = pd.read_csv("/work/heart_data.csv")
heartData.head(n=5)

# Label-encode the categorical columns and choose the feature columns to be used.
categoricalColumns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

le = LabelEncoder()
for columnName in categoricalColumns:
    # fit_transform() refits the encoder on every call, so a single encoder
    # object can be reused; the integer-coded copy of each column is stored
    # next to the original under "<column>01".
    heartData[columnName + '01'] = le.fit_transform(heartData[columnName])

# Feature columns used for the normality tests, the scaling and the models.
featureNames = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
                'Sex01', 'ChestPainType01', 'RestingECG01', 'ExerciseAngina01', 'ST_Slope01']
cols = heartData[featureNames]

# Positional indices of the chosen columns. They are derived from the names
# above instead of being hard-coded, so they stay correct if the column
# layout of the CSV changes.
# NOTE(review): with the layout the original indices assumed
# (1, 4, 5, 6, 8, 10, 13..17) this yields exactly the same list - confirm.
allColumnNames = list(heartData.columns)
colsValues = [allColumnNames.index(name) for name in featureNames]  # chosen columns from our data

heartData.columns[colsValues]

# Before wrangling the data we first need to understand what is under the
# hood, so start with the summary statistics of the frame.
heartData.describe()

# Normality checks for every column selected through colsValues
# ([Age, RestingBP, Cholesterol, FastingBS, MaxHR, Oldpeak] and the remaining
# selected columns). A column is reported as "Not Gaussian" when the p-value
# of the test is below 0.05.
# Each test is evaluated once per column and the result is reused for both
# the verdict and the printed statistics.

print('\n###D’Agostino and Pearson’s Test####')
for i in heartData.columns[colsValues]:
    testResult = normaltest(heartData[i].values)
    # Index 1 of the result is the p-value.
    verdict = "Not Gaussian" if testResult[1] < 0.05 else "Gaussian"
    print(f'{i}: {verdict} {testResult}')

print('\n###SHAPIRO TEST####')
for i in heartData.columns[colsValues]:
    testResult = shapiro(heartData[i])
    # Index 1 of the result is the p-value.
    verdict = "Not Gaussian" if testResult[1] < 0.05 else "Gaussian"
    print(f'{i}: {verdict} {testResult}')

# Step 1
# Report the total number of entries together with the non-null count and
# the datatype of every feature.
heartData.info()

# Count the null values in each column.
heartData.isnull().sum()

# Step 2
# Remove any duplicate rows found in the dataset.
# drop_duplicates() returns a new DataFrame and leaves the original untouched
# unless inplace=True is given, so the result has to be assigned back;
# otherwise the duplicates are never actually removed.
heartData = heartData.drop_duplicates()

# STEP 3 - Normalization
# Select the chosen feature columns and rescale them with a min-max scaler.
# Despite its name, normalizedHeartData holds the columns *before* scaling;
# the scaled values end up in heartDataMinMax.
normalizedHeartData = heartData.loc[:, heartData.columns[colsValues]]

minmax = MinMaxScaler()
minmax.fit(normalizedHeartData)
heartDataMinMax = minmax.transform(normalizedHeartData)

# Target column, kept as a single-column DataFrame.
heartData_HeartDisease_extracted = heartData.loc[:, ['HeartDisease']]

# Logistic regression baseline: split the data, drop outliers from the
# training part, fit the model and report its quality on the test part.

# input data
X = heartDataMinMax
# what i am trying to predict
Y = heartData_HeartDisease_extracted

# split into train and test sets (60% of the rows are held out for testing)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.60, random_state=1)

# K FOLD
# NOTE: the splitter is only created here; nothing in this cell uses it.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# summarize the shape of the training dataset
print("\nsummarize the shape of the training dataset")
print(X_train.shape, y_train.shape)

# identify outliers in the training dataset (fit_predict marks them with -1)
lof = LocalOutlierFactor()
outlierLabels = lof.fit_predict(X_train)

# select all rows that are not outliers
mask = outlierLabels != -1
X_train, y_train = X_train[mask, :], y_train[mask]

# summarize the shape of the updated training dataset
print("\nsummarize the shape of the updated training dataset (outliers removed)")
print(X_train.shape, y_train.shape)

# fit the model
# y_train is a single-column DataFrame; the estimator expects a 1-D target,
# so flatten it for the fit to avoid the column-vector DataConversionWarning.
model = LogisticRegression()
model.fit(X_train, y_train.values.ravel())

# evaluate the model
yhat = model.predict(X_test)

# evaluate predictions
print('\nMAE: %.3f' % mean_absolute_error(y_test, yhat))
print(classification_report(y_test, yhat))
print(confusion_matrix(y_test, yhat))
#skplt.metrics.plot_confusion_matrix(y_test, yhat, figsize=(6,6), title = "Confusion matrix")

# First Test - neural network with hidden layers of 100, 6 and 6 neurons.

# summarize the shape of the training dataset
print("\nsummarize the shape of the training dataset")
print(X_train.shape, y_train.shape)

# The 6-neuron layers follow the rule of thumb that the number of neurons in
# a layer is the mean of the neurons in the input and output layers.
modelNN = MLPClassifier(
    max_iter=300,
    learning_rate_init=0.002,
    activation='relu',
    solver='adam',
    hidden_layer_sizes=(100, 6, 6),
    random_state=1,
)

# fitting the model
# y_train is a single-column DataFrame; flatten it to the 1-D target the
# estimator expects so no column-vector DataConversionWarning is raised.
modelNN.fit(X_train, y_train.values.ravel())

# make prediction
modelNNPred = modelNN.predict(X_test)

# evaluate predictions
print('\nMAE: %.3f' % mean_absolute_error(y_test, modelNNPred))
print(classification_report(y_test, modelNNPred))
print(confusion_matrix(y_test, modelNNPred))
#skplt.metrics.plot_confusion_matrix(y_test, modelNNPred, figsize=(6,6), title = "Confusion matrix")

# Second Test - 100 more neurons in the first hidden layer (200, 6, 6).

# summarize the shape of the training dataset
print("\nsummarize the shape of the training dataset")
print(X_train.shape, y_train.shape)

# The 6-neuron layers follow the rule of thumb that the number of neurons in
# a layer is the mean of the neurons in the input and output layers.
modelNN2 = MLPClassifier(
    max_iter=300,
    learning_rate_init=0.002,
    activation='relu',
    solver='adam',
    hidden_layer_sizes=(200, 6, 6),
    random_state=1,
)

# fitting the model
# y_train is a single-column DataFrame; flatten it to the 1-D target the
# estimator expects so no column-vector DataConversionWarning is raised.
modelNN2.fit(X_train, y_train.values.ravel())

# make prediction
modelNNPred2 = modelNN2.predict(X_test)

# evaluate predictions
print('\nMAE: %.3f' % mean_absolute_error(y_test, modelNNPred2))
print(classification_report(y_test, modelNNPred2))
print(confusion_matrix(y_test, modelNNPred2))
#skplt.metrics.plot_confusion_matrix(y_test, modelNNPred2, figsize=(6,6), title = "Confusion matrix")

# Score the unlabeled prediction file with the fitted logistic regression model.
testDF = pd.read_csv('/work/heart_predict.csv')

# Encode the categorical columns with the SAME category -> integer mapping as
# the training data: the encoder is fitted on the training column and only
# applied (transform) to the prediction column. Re-fitting on the prediction
# file would silently shift the codes whenever that file does not contain
# every category, and the model would then see mislabeled features.
# A category that never occurred in training now raises a ValueError instead.
le2 = LabelEncoder()
for columnName in ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']:
    le2.fit(heartData[columnName])
    testDF[columnName + '01'] = le2.transform(testDF[columnName])

# Use exactly the columns (and column order) the scaler was fitted on:
# Age, RestingBP, Cholesterol, FastingBS, MaxHR, Oldpeak and the five
# "...01" encoded columns, as selected by colsValues on the training frame.
# Passing a DataFrame keeps the feature names the scaler was fitted with.
featureColumns = list(heartData.columns[colsValues])
xNew = minmax.transform(testDF[featureColumns])

predictions = model.predict(xNew)

# Assemble the submission table (patient id plus predicted label) and write
# it to a CSV file without the index column.
submissionColumns = {
    'PatientId': testDF['PatientId'],
    'HeartDisease': predictions,
}
submission = pd.DataFrame(submissionColumns)
submission.to_csv('submission11.csv', index=False)

# SVM TEST - support vector classifier on the same train/test split.
# (SVC is imported with the other model imports at the top of the file.)

# summarize the shape of the training dataset
print("\nsummarize the shape of the training dataset")
print(X_train.shape, y_train.shape)

# Polynomial-kernel support vector classifier with regularization parameter C=10.
modelSVM = SVC(kernel='poly', C=10)

# fitting the model
# y_train is a single-column DataFrame; flatten it to the 1-D target the
# estimator expects so no column-vector DataConversionWarning is raised.
modelSVM.fit(X_train, y_train.values.ravel())

# make prediction
modelSVMPred = modelSVM.predict(X_test)

# evaluate predictions
print('\nMAE: %.3f' % mean_absolute_error(y_test, modelSVMPred))
print(classification_report(y_test, modelSVMPred))
print(confusion_matrix(y_test, modelSVMPred))