import pandas as pd
import numpy as np
# Use for preprocessing of the data
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
# Figuring out if our data variables' distributions are Gaussian
from scipy.stats import normaltest
from scipy.stats import shapiro
# Used for the models to predict the data
# Logistic Regression Model
from sklearn.linear_model import LogisticRegression
#Neural Net Model
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
# Getting rid of outliers
from sklearn.neighbors import LocalOutlierFactor
# Training
from sklearn.model_selection import train_test_split
# Used for visualization
#import scikitplot as skplt
from sklearn.metrics import classification_report, confusion_matrix
# Used for accuracy of the models
from sklearn.metrics import mean_absolute_error
# Load the heart-disease dataset (environment-specific /work mount).
heartData = pd.read_csv("/work/heart_data.csv")
heartData.head(5)  # no-op in a script; kept from the notebook for quick inspection
# Columns that have been extracted to be used.
# Encode each categorical column into a numeric companion column "<name>01".
# The five duplicated fit_transform lines are folded into one loop.
# NOTE(review): a single LabelEncoder is re-fitted per column, so after the
# loop `le` only remembers the last column's ('ST_Slope') categories — keep
# per-column encoders if new data must be transformed consistently later.
le = LabelEncoder()
for catCol in ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']:
    heartData[catCol + '01'] = le.fit_transform(heartData[catCol])
cols = heartData[['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak', 'Sex01', 'ChestPainType01', 'RestingECG01', 'ExerciseAngina01', 'ST_Slope01']]
colsValues = [1,4,5,6,8,10,13,14,15,16,17] # chosen columns from our data (positional indices)
heartData.head(5)
heartData.columns[colsValues]
# Before we choose to wrangle with our data we must first come to understand what's underneath the hood.
heartData.describe()
#[Age, RestingBP, Cholesterol, FastingBS, MaxHR, Oldpeak]
# Normality tests: p < 0.05 rejects the Gaussian hypothesis.
# Each statistic is now computed once per column instead of twice.
print('\n###D’Agostino and Pearson’s Test####')
for i in heartData.columns[colsValues]:
    normRes = normaltest(heartData[i].values)
    print(f'{i}: {"Not Gaussian" if normRes[1] < 0.05 else "Gaussian"} {normRes}')
print('\n###SHAPIRO TEST####')
for i in heartData.columns[colsValues]:
    shapRes = shapiro(heartData[i])
    print(f'{i}: {"Not Gaussian" if shapRes[1] < 0.05 else "Gaussian"} {shapRes}')
###D’Agostino and Pearson’s Test####
Age: Not Gaussian NormaltestResult(statistic=14.10154250215735, pvalue=0.0008667402251263827)
RestingBP: Not Gaussian NormaltestResult(statistic=76.50982522759105, pvalue=2.4327783718283894e-17)
Cholesterol: Not Gaussian NormaltestResult(statistic=45.6811678375251, pvalue=1.203539755796199e-10)
FastingBS: Not Gaussian NormaltestResult(statistic=147.93997898036378, pvalue=7.503117333266766e-33)
MaxHR: Not Gaussian NormaltestResult(statistic=13.336898191027805, pvalue=0.0012703674425520017)
Oldpeak: Not Gaussian NormaltestResult(statistic=132.9970956996849, pvalue=1.3184012746452946e-29)
###SHAPIRO TEST####
Age: Not Gaussian ShapiroResult(statistic=0.990276575088501, pvalue=3.1005161872599274e-05)
RestingBP: Not Gaussian ShapiroResult(statistic=0.9544223546981812, pvalue=3.1177614133524705e-15)
Cholesterol: Not Gaussian ShapiroResult(statistic=0.8702148795127869, pvalue=1.7137967420666714e-25)
FastingBS: Not Gaussian ShapiroResult(statistic=0.5203630924224854, pvalue=2.6554605898955283e-42)
MaxHR: Not Gaussian ShapiroResult(statistic=0.9926148056983948, pvalue=0.00043499658931978047)
Oldpeak: Not Gaussian ShapiroResult(statistic=0.8573881387710571, pvalue=1.4811088444142e-26)
#Step 1
# Inspect the raw frame: row count, per-column non-null counts, and dtypes.
# The pasted output below shows 818 rows, 13 columns, and no missing values.
heartData.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 818 entries, 0 to 817
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PatientId 818 non-null int64
1 Age 818 non-null int64
2 Sex 818 non-null object
3 ChestPainType 818 non-null object
4 RestingBP 818 non-null int64
5 Cholesterol 818 non-null int64
6 FastingBS 818 non-null int64
7 RestingECG 818 non-null object
8 MaxHR 818 non-null int64
9 ExerciseAngina 818 non-null object
10 Oldpeak 818 non-null float64
11 ST_Slope 818 non-null object
12 HeartDisease 818 non-null int64
dtypes: float64(1), int64(7), object(5)
memory usage: 83.2+ KB
heartData.isna().sum() # totalling all of the null values in each column
#Step 2
# BUG FIX: drop_duplicates() is not in-place; the original call discarded its
# return value, so duplicate rows were never actually removed. Assign it back.
heartData = heartData.drop_duplicates()
# STEP 3 - Normalization
# Scale every selected feature into [0, 1]. `minmax` stays fitted on the
# training data so the prediction file can be transformed with the same ranges.
normalizedHeartData = heartData[heartData.columns[colsValues]]
minmax = MinMaxScaler()
heartDataMinMax = minmax.fit_transform(normalizedHeartData)
heartData_HeartDisease_extracted = heartData[['HeartDisease']] # target column
#input data
X = heartDataMinMax
# what i am trying to predict — flattened to 1-D so sklearn estimators do not
# emit DataConversionWarning about a column-vector y (seen in the output below)
Y = heartData_HeartDisease_extracted.values.ravel()
# split into train and test sets
# NOTE(review): test_size=0.60 trains on only 40% of the rows — confirm this
# split is intentional.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.60, random_state=1)
#K FOLD
# NOTE(review): cv is defined but never passed to cross_val_score in this file.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# summarize the shape of the training dataset
print("\nsummarize the shape of the training dataset")
print(X_train.shape, y_train.shape)
# identify outliers in the training dataset (LocalOutlierFactor labels them -1)
lof = LocalOutlierFactor()
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print("\nsummarize the shape of the updated training dataset (outliers removed)")
print(X_train.shape, y_train.shape)
# fit the model
model = LogisticRegression()
model.fit(X_train, y_train)
# evaluate the model on the held-out split
yhat = model.predict(X_test)
# evaluate predictions
print('\nMAE: %.3f' % mean_absolute_error(y_test, yhat))
print(classification_report(y_test, yhat))
print(confusion_matrix(y_test, yhat))
#skplt.metrics.plot_confusion_matrix(y_test, yhat, figsize=(6,6), title = "Confusion matrix")
summarize the shape of the training dataset
(327, 11) (327, 1)
summarize the shape of the updated training dataset (outliers removed)
(319, 11) (319, 1)
MAE: 0.157
precision recall f1-score support
0 0.82 0.82 0.82 212
1 0.86 0.86 0.86 279
accuracy 0.84 491
macro avg 0.84 0.84 0.84 491
weighted avg 0.84 0.84 0.84 491
[[174 38]
[ 39 240]]
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
#First Test
# summarize the shape of the training dataset
print("\nsummarize the shape of the training dataset")
print(X_train.shape, y_train.shape)
#the number of neurons in that layer is the mean of the neurons in the input and output layers.
# MLP classifier: hidden layers (100, 6, 6), ReLU + Adam, fixed seed for
# reproducibility.
# NOTE(review): training previously hit max_iter=300 without converging
# (ConvergenceWarning in the pasted output); raising max_iter would change the
# fitted weights, so it is deliberately left as-is here.
modelNN = MLPClassifier(max_iter=300, learning_rate_init=0.002, activation='relu' ,solver='adam', hidden_layer_sizes=(100,6,6), random_state=1)
#fitting the model
# np.ravel flattens a column-vector target, silencing the DataConversionWarning
# seen in the output without changing the fit.
modelNN.fit(X_train, np.ravel(y_train))
# make prediction
modelNNPred = modelNN.predict(X_test)
# evaluate predictions
print('\nMAE: %.3f' % mean_absolute_error(y_test, modelNNPred))
print(classification_report(y_test, modelNNPred))
print(confusion_matrix(y_test, modelNNPred))
#skplt.metrics.plot_confusion_matrix(y_test, modelNNPred, figsize=(6,6), title = "Confusion matrix")
summarize the shape of the training dataset
(319, 11) (319, 1)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py:1109: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
MAE: 0.167
precision recall f1-score support
0 0.80 0.82 0.81 212
1 0.86 0.84 0.85 279
accuracy 0.83 491
macro avg 0.83 0.83 0.83 491
weighted avg 0.83 0.83 0.83 491
[[174 38]
[ 44 235]]
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py:696: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (300) reached and the optimization hasn't converged yet.
ConvergenceWarning,
#Second Test - 100 more neurons
# summarize the shape of the training dataset
print("\nsummarize the shape of the training dataset")
print(X_train.shape, y_train.shape)
#the number of neurons in that layer is the mean of the neurons in the input and output layers.
# Same MLP as the first test but with a wider first hidden layer (200 vs 100).
# NOTE(review): this run also stops at max_iter=300 before converging (see the
# ConvergenceWarning below); unchanged so results stay comparable.
modelNN2 = MLPClassifier(max_iter=300, learning_rate_init=0.002, activation='relu' ,solver='adam', hidden_layer_sizes=(200,6,6), random_state=1)
#fitting the model
# np.ravel flattens a column-vector target, silencing the DataConversionWarning
# seen in the output without changing the fit.
modelNN2.fit(X_train, np.ravel(y_train))
# make prediction
modelNNPred2 = modelNN2.predict(X_test)
# evaluate predictions
print('\nMAE: %.3f' % mean_absolute_error(y_test, modelNNPred2))
print(classification_report(y_test, modelNNPred2))
print(confusion_matrix(y_test, modelNNPred2))
#skplt.metrics.plot_confusion_matrix(y_test, modelNNPred, figsize=(6,6), title = "Confusion matrix")
summarize the shape of the training dataset
(319, 11) (319, 1)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py:1109: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)
MAE: 0.163
precision recall f1-score support
0 0.82 0.80 0.81 212
1 0.85 0.87 0.86 279
accuracy 0.84 491
macro avg 0.83 0.83 0.83 491
weighted avg 0.84 0.84 0.84 491
[[169 43]
[ 37 242]]
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py:696: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (300) reached and the optimization hasn't converged yet.
ConvergenceWarning,
testDF = pd.read_csv('/work/heart_predict.csv')
# BUG FIX: the original created `le2` but never used it, and re-ran
# le.fit_transform on the *test* data — re-fitting on test columns can assign
# different integer codes than the training encoding, silently corrupting the
# predictions. Fit a fresh encoder per column on the training data and only
# transform() the matching test column.
# NOTE(review): transform() raises on categories unseen in training — confirm
# the prediction file shares the training categories.
for colName in ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']:
    colEncoder = LabelEncoder().fit(heartData[colName])
    testDF[colName + '01'] = colEncoder.transform(testDF[colName])
# Keep xNew as a DataFrame with the training column names so MinMaxScaler does
# not warn about missing feature names (warning visible in the output below).
xNew = testDF[['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak', 'Sex01', 'ChestPainType01', 'RestingECG01', 'ExerciseAngina01', 'ST_Slope01']]
# Reuse the scaler fitted on the training data — do not re-fit on test data.
xNew = minmax.transform(xNew)
predictions = model.predict(xNew)
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/base.py:446: UserWarning: X does not have valid feature names, but MinMaxScaler was fitted with feature names
"X does not have valid feature names, but"
# Assemble the submission file: one row per patient with its predicted label.
submission = pd.DataFrame()
submission['PatientId'] = testDF['PatientId']
submission['HeartDisease'] = predictions
submission.to_csv('submission11.csv', index=False)
#SVM TEST
from sklearn.svm import SVC
# summarize the shape of the training dataset
print("\nsummarize the shape of the training dataset")
print(X_train.shape, y_train.shape)
# Support-vector classifier with a polynomial kernel and regularization C=10.
modelSVM = SVC(kernel='poly', C=10)
#fitting the model
# np.ravel flattens a column-vector target, silencing the DataConversionWarning
# seen in the output without changing the fit.
modelSVM.fit(X_train, np.ravel(y_train))
# make prediction
modelSVMPred = modelSVM.predict(X_test)
# evaluate predictions
print('\nMAE: %.3f' % mean_absolute_error(y_test, modelSVMPred))
print(classification_report(y_test, modelSVMPred))
print(confusion_matrix(y_test, modelSVMPred))
summarize the shape of the training dataset
(319, 11) (319, 1)
MAE: 0.165
precision recall f1-score support
0 0.79 0.83 0.81 212
1 0.87 0.84 0.85 279
accuracy 0.84 491
macro avg 0.83 0.84 0.83 491
weighted avg 0.84 0.84 0.84 491
[[177 35]
[ 46 233]]
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/validation.py:985: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)