import pandas as pd
# Use for preprocessing of the data
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
# Checking whether each variable's distribution is Gaussian
from scipy.stats import normaltest
from scipy.stats import shapiro
# Used for the models to predict the data
# Logistic Regression Model
from sklearn.linear_model import LogisticRegression
#Neural Net Model
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
# Getting rid of outliers
from sklearn.neighbors import LocalOutlierFactor
# Training
from sklearn.model_selection import train_test_split
# Used for visualization
#import scikitplot as skplt
from sklearn.metrics import classification_report, confusion_matrix
# Used for accuracy of the models
from sklearn.metrics import mean_absolute_error
# Load the labelled heart-disease dataset.
heartData = pd.read_csv("/work/heart_data.csv")
heartData.head(5)  # notebook-style preview; a bare expression shows nothing in a script

# Integer-encode every categorical column into a new "<name>01" column.
# One encoder object is re-fitted per column, so after the loop its fitted
# state only reflects the last column processed.
le = LabelEncoder()
for categorical in ('Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'):
    heartData[categorical + '01'] = le.fit_transform(heartData[categorical])

# Feature columns used as model inputs (numeric columns plus the encoded ones).
featureNames = [
    'Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
    'Sex01', 'ChestPainType01', 'RestingECG01', 'ExerciseAngina01', 'ST_Slope01',
]
cols = heartData[featureNames]

# Positional indices of the chosen columns. They are looked up by name instead
# of being hard-coded, so the selection stays correct if the CSV column layout
# changes; the result is still a plain list of ints.
colsValues = [heartData.columns.get_loc(name) for name in featureNames]
heartData.head(5)
heartData.columns[colsValues]
# Before wrangling the data, first understand what is under the hood.
heartData.describe()

# Normality checks over every selected feature column (heartData.columns[colsValues]).
# A p-value below 0.05 is reported as "Not Gaussian". Each test is computed once
# per column and the same result object is used for both the verdict and the
# printed statistics.
print('\n###D’Agostino and Pearson’s Test####')
for column in heartData.columns[colsValues]:
    result = normaltest(heartData[column].values)
    verdict = "Not Gaussian" if result[1] < 0.05 else "Gaussian"
    print(f'{column}: {verdict} {result}')

print('\n###SHAPIRO TEST####')
for column in heartData.columns[colsValues]:
    result = shapiro(heartData[column])
    verdict = "Not Gaussian" if result[1] < 0.05 else "Gaussian"
    print(f'{column}: {verdict} {result}')
# Step 1
# Total number of entries, plus the non-null count and dtype of every column.
heartData.info()
heartData.isna().sum()  # per-column count of null values (bare expression: notebook display only)
# Step 2
# drop_duplicates() returns a new DataFrame rather than modifying in place, so
# the result has to be assigned back for duplicate rows to actually be removed.
heartData = heartData.drop_duplicates()
# STEP 3 - Normalization
# Scale the selected feature columns with a MinMaxScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so held-out rows influence the scaling parameters — confirm this is intended.
normalizedHeartData = heartData[heartData.columns[colsValues]]
minmax = MinMaxScaler()
heartDataMinMax = minmax.fit_transform(normalizedHeartData)
heartData_HeartDisease_extracted = heartData[['HeartDisease']]

# Input data
X = heartDataMinMax
# Target to predict. A 1-D Series is used instead of the one-column DataFrame
# so the estimators receive y in the 1-D form they expect rather than a column
# vector that has to be converted on every fit.
Y = heartData_HeartDisease_extracted['HeartDisease']

# Split into train and test sets.
# NOTE(review): test_size=0.60 holds out 60% of the rows and trains on 40% —
# confirm this ratio is deliberate.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.60, random_state=1)
# Repeated k-fold splitter: 10 folds, 3 repeats, fixed seed.
# It is only constructed here; nothing in this section consumes it.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Report the training-set shape before outlier removal.
print("\nsummarize the shape of the training dataset")
print(X_train.shape, y_train.shape)

# Flag outliers in the training features with a Local Outlier Factor model.
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)

# Rows labelled -1 are treated as outliers; keep every other row in both the
# features and the targets.
mask = (yhat != -1)
X_train = X_train[mask, :]
y_train = y_train[mask]

# Report the training-set shape after outlier removal.
print("\nsummarize the shape of the updated training dataset (outliers removed)")
print(X_train.shape, y_train.shape)
# Baseline classifier: logistic regression with default settings, trained on
# the outlier-filtered training set.
model = LogisticRegression()
model.fit(X_train, y_train)

# Predict on the held-out set and report error, per-class metrics and the
# confusion matrix.
yhat = model.predict(X_test)
logRegError = mean_absolute_error(y_test, yhat)
print('\nMAE: %.3f' % logRegError)
logRegReport = classification_report(y_test, yhat)
print(logRegReport)
logRegConfusion = confusion_matrix(y_test, yhat)
print(logRegConfusion)
# Optional plot (requires scikitplot):
#skplt.metrics.plot_confusion_matrix(y_test, yhat, figsize=(6,6), title = "Confusion matrix")
# First neural-network test
# Report the shape of the training data the network will see.
print("\nsummarize the shape of the training dataset")
print(X_train.shape, y_train.shape)

# Three hidden layers of 100, 6 and 6 units.
# Original note: the number of neurons in a layer is the mean of the neurons in
# the input and output layers — presumably the rationale for the 6-unit layers.
modelNN = MLPClassifier(
    hidden_layer_sizes=(100, 6, 6),
    activation='relu',
    solver='adam',
    learning_rate_init=0.002,
    max_iter=300,
    random_state=1,
)

# Train, then predict on the held-out set.
modelNN.fit(X_train, y_train)
modelNNPred = modelNN.predict(X_test)

# Report error, per-class metrics and the confusion matrix.
nnError = mean_absolute_error(y_test, modelNNPred)
print('\nMAE: %.3f' % nnError)
print(classification_report(y_test, modelNNPred))
print(confusion_matrix(y_test, modelNNPred))
# Optional plot (requires scikitplot):
#skplt.metrics.plot_confusion_matrix(y_test, modelNNPred, figsize=(6,6), title = "Confusion matrix")
# Second neural-network test - 100 more neurons in the first hidden layer
# Report the shape of the training data the network will see.
print("\nsummarize the shape of the training dataset")
print(X_train.shape, y_train.shape)

# Same configuration as the first network except that the first hidden layer
# has 200 units instead of 100.
modelNN2 = MLPClassifier(
    hidden_layer_sizes=(200, 6, 6),
    activation='relu',
    solver='adam',
    learning_rate_init=0.002,
    max_iter=300,
    random_state=1,
)

# Train, then predict on the held-out set.
modelNN2.fit(X_train, y_train)
modelNNPred2 = modelNN2.predict(X_test)

# Report error, per-class metrics and the confusion matrix.
nn2Error = mean_absolute_error(y_test, modelNNPred2)
print('\nMAE: %.3f' % nn2Error)
print(classification_report(y_test, modelNNPred2))
print(confusion_matrix(y_test, modelNNPred2))
# Optional plot (requires scikitplot):
#skplt.metrics.plot_confusion_matrix(y_test, modelNNPred2, figsize=(6,6), title = "Confusion matrix")
# Load the unlabelled rows to predict.
testDF = pd.read_csv('/work/heart_predict.csv')

# Encode the categorical columns with encoders fitted on the TRAINING data.
# Re-fitting on the prediction file would assign codes from whatever categories
# happen to appear in that file, which can differ from the codes the model was
# trained on. transform() raises ValueError for a category absent from the
# training data, which is preferable to silently mis-encoding it.
for categorical in ('Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'):
    trainedEncoder = LabelEncoder().fit(heartData[categorical])
    testDF[categorical + '01'] = trainedEncoder.transform(testDF[categorical])

# Same feature columns, in the same order, as used for training.
predictColumns = [
    'Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak',
    'Sex01', 'ChestPainType01', 'RestingECG01', 'ExerciseAngina01', 'ST_Slope01',
]

# Apply the already-fitted scaler. A named DataFrame is passed (not .values)
# because the scaler was fitted on a DataFrame with these column names.
xNew = minmax.transform(testDF[predictColumns])

# Predict with the logistic-regression model and write the submission file.
predictions = model.predict(xNew)
submission = pd.DataFrame({'PatientId': testDF['PatientId'], 'HeartDisease': predictions})
submission.to_csv('submission11.csv', index=False)
# SVM test
from sklearn.svm import SVC

# Report the shape of the training data the classifier will see.
print("\nsummarize the shape of the training dataset")
print(X_train.shape, y_train.shape)

# Support-vector classifier with a polynomial kernel and C=10.
modelSVM = SVC(C=10, kernel='poly')

# Train, then predict on the held-out set.
modelSVM.fit(X_train, y_train)
modelSVMPred = modelSVM.predict(X_test)

# Report error, per-class metrics and the confusion matrix.
svmError = mean_absolute_error(y_test, modelSVMPred)
print('\nMAE: %.3f' % svmError)
print(classification_report(y_test, modelSVMPred))
print(confusion_matrix(y_test, modelSVMPred))