import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# Load the dataset from diabetes.csv and take a first look at it.
df_Pima = pd.read_csv('diabetes.csv')
df_Pima.head()
df_Pima.describe()

# In these columns a value of 0 is treated as "missing" and turned into NaN
# (presumably because a zero reading is not a real measurement -- confirm
# against the dataset description).
# np.nan is used instead of np.NaN: the capitalised alias was removed in
# NumPy 2.0.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df_Pima[zero_as_missing] = df_Pima[zero_as_missing].replace(0, np.nan)

# Number of missing values per column after the replacement.
df_Pima.isna().sum()
def find_mean(column, df=None):
    """Return the mean of *column* for each value of ``Outcome``.

    Args:
        column: Name of the column to average.
        df: DataFrame to read from. It must contain *column* and an
            ``Outcome`` column. Defaults to the module-level ``df_Pima``.

    Returns:
        A DataFrame with one row per ``Outcome`` value and two columns:
        ``Outcome`` and *column* (the group mean).
    """
    if df is None:
        df = df_Pima
    # No explicit not-null filter is needed: the groupby mean skips NaN by
    # default, so missing values do not influence the result.
    return df[[column, 'Outcome']].groupby('Outcome')[[column]].mean().reset_index()
# Show the per-Outcome mean of each column whose zeros were replaced by NaN.
for column_name in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'):
    print(find_mean(column_name))
Outcome Glucose
0 0 110.643863
1 1 142.319549
Outcome BloodPressure
0 0 70.877339
1 1 75.321429
Outcome SkinThickness
0 0 27.235457
1 1 33.000000
Outcome Insulin
0 0 130.287879
1 1 206.846154
Outcome BMI
0 0 30.859674
1 1 35.406767
# Fill each missing value with the mean of its own Outcome class.
# The means are computed from the data instead of being hard-coded: the
# previous hand-typed constants had drifted from the real class means
# (Glucose for Outcome 1 was filled with 169.5 although the class mean is
# about 142.3) and were truncated inconsistently.
# NOTE(review): imputing from the label uses Outcome information in the
# features, which may inflate the model scores computed later -- confirm
# this is intended.
columns_to_impute = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for column_name in columns_to_impute:
    # transform('mean') yields, for every row, the NaN-skipping mean of the
    # row's Outcome group, aligned on the original index.
    class_mean = df_Pima.groupby('Outcome')[column_name].transform('mean')
    df_Pima[column_name] = df_Pima[column_name].fillna(class_mean)

# Missing values remaining per column after imputation.
df_Pima.isna().sum()
# Scatter BMI against Pregnancies and then against Insulin, one figure each,
# with points coloured by Outcome.
for x_column in ('Pregnancies', 'Insulin'):
    plt.figure(figsize=(10, 6))
    sns.scatterplot(
        x=x_column,
        y='BMI',
        data=df_Pima,
        hue='Outcome',
        alpha=0.7,
    )
    plt.title(f'Comparison of BMI and {x_column}')
    plt.show()
# Number of rows per Outcome value.
# Each plot gets its own figure and an explicit plt.show(), matching the
# scatter plots above, so the figures are also displayed when this runs as a
# plain script rather than in a notebook.
plt.figure()
sns.countplot(data=df_Pima, y='Outcome')
plt.title('Count of Positive / Negative Outcome')
plt.show()

# Age distribution drawn as unfilled step histograms, one per Outcome value.
plt.figure(figsize=(10, 12))
sns.histplot(data=df_Pima, x='Age', hue='Outcome', element='step', fill=False)
plt.title('Age count with Positive / Negative Outcome')
plt.show()

# Pairwise correlation between all columns.
df_Pima.corr()
from sklearn.preprocessing import scale

# Features used by the first experiment; the label is the Outcome column.
diabetes_features = df_Pima.loc[
    :, ['Pregnancies', 'Glucose', 'SkinThickness', 'Insulin', 'BMI', 'Age']
]
diabetes_labels = df_Pima.loc[:, 'Outcome']

# Standardise each feature column (axis=0) over the full data set.
# NOTE(review): scaling happens before the train/test split, so the scaling
# statistics include the test rows -- confirm this is acceptable.
scaled_features = scale(diabetes_features, axis=0)
print(scaled_features)
[[ 0.63994726 0.85945808 0.64570209 0.56083978 0.16790636 1.4259954 ]
[-0.84488505 -1.20465424 -0.02697724 -0.30067319 -0.85032783 -0.19067191]
[ 1.23388019 2.00618715 0.42147564 0.56083978 -1.33035252 -0.10558415]
...
[ 0.3429808 -0.02516149 -0.69965657 -0.50676061 -0.90851264 -0.27575966]
[-0.84488505 0.13865695 0.42147564 0.56083978 -0.34121073 1.17073215]
[-0.84488505 -0.94254474 0.1972492 -0.30067319 -0.29757212 -0.87137393]]
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold out 20% of the rows for testing; the split is seeded.
train_data, test_data, train_label, test_label = train_test_split(
    scaled_features,
    diabetes_labels,
    test_size=0.2,
    random_state=1,
)

# 3-nearest-neighbours classifier (fit() returns the estimator itself).
classifier = KNeighborsClassifier(n_neighbors=3).fit(train_data, train_label)
knn_prediction = classifier.predict(test_data)
classifier.score(test_data, test_label)
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees, each limited to depth 6.
# random_state is fixed (same seed as the train/test split) so the reported
# scores are reproducible; without it every run builds a different forest.
forest_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    random_state=1,
)
forest_classifier.fit(train_data, train_label)
forest_prediction = forest_classifier.predict(test_data)
forest_classifier.score(test_data, test_label)
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# Report accuracy, precision and F1 of the KNN predictions on the test split.
# sep='' keeps the label and the value adjacent, as plain concatenation would.
print('KNN Accuracey is: ', accuracy_score(test_label, knn_prediction), sep='')
print('KNN Precision score is ', precision_score(test_label, knn_prediction), sep='')
print('KNN F1 score is ', f1_score(test_label, knn_prediction), sep='')
KNN Accuracey is: 0.8831168831168831
KNN Precision score is 0.8627450980392157
KNN F1 score is 0.8301886792452831
# Report accuracy, precision and F1 of the random-forest predictions on the
# test split, one line per metric.
for prefix, metric in (
    ('Random forest Accuracey is: ', accuracy_score),
    ('Random forest Precision score is ', precision_score),
    ('Random forest F1 score is ', f1_score),
):
    print(prefix + str(metric(test_label, forest_prediction)))
Random forest Accuracey is: 0.8961038961038961
Random forest Precision score is 0.8823529411764706
Random forest F1 score is 0.8490566037735848
# Second experiment: a reduced feature set (no Pregnancies, no Age).
diabetes_labels2 = df_Pima['Outcome']
diabetes_features2 = df_Pima[['Glucose', 'SkinThickness', 'Insulin', 'BMI']]

# Scale the *reduced* feature set. This previously scaled diabetes_features
# (the first experiment's columns), so the second experiment silently
# trained on exactly the same data as the first.
scaled_features2 = scale(diabetes_features2, axis=0)

# Same 80/20 seeded split as the first experiment.
train_data2, test_data2, train_label2, test_label2 = train_test_split(
    scaled_features2,
    diabetes_labels2,
    test_size=0.2,
    random_state=1,
)
# 3-nearest-neighbours classifier trained on the second train/test split.
classifier2 = KNeighborsClassifier(n_neighbors=3)
classifier2.fit(train_data2, train_label2)
classifier2.score(test_data2, test_label2)
knn_prediction2 = classifier2.predict(test_data2)

# Report accuracy, precision and F1 on the second test split.
knn2_accuracy = accuracy_score(test_label2, knn_prediction2)
knn2_precision = precision_score(test_label2, knn_prediction2)
knn2_f1 = f1_score(test_label2, knn_prediction2)
print(f'KNN2 Accuracey is: {knn2_accuracy!s}')
print(f'KNN2 Precision score is {knn2_precision!s}')
print(f'KNN2 F1 score is {knn2_f1!s}')
KNN2 Accuracey is: 0.8831168831168831
KNN2 Precision score is 0.8627450980392157
KNN2 F1 score is 0.8301886792452831
# Random forest for the second experiment: 75 trees, each limited to depth 6.
# random_state is fixed (same seed as the train/test split) so the reported
# scores are reproducible; without it every run builds a different forest.
forest_classifier2 = RandomForestClassifier(
    n_estimators=75,
    max_depth=6,
    random_state=1,
)
forest_classifier2.fit(train_data2, train_label2)
forest_classifier2.score(test_data2, test_label2)
predictions2 = forest_classifier2.predict(test_data2)

# Report accuracy, precision and F1 on the second test split.
print('Random forest2 Accuracey is: ' + str(accuracy_score(test_label2, predictions2)))
print('Random forest2 Precision score is ' + str(precision_score(test_label2, predictions2)))
print('Random forest2 F1 score is ' + str(f1_score(test_label2, predictions2)))
Random forest2 Accuracey is: 0.8831168831168831
Random forest2 Precision score is 0.8627450980392157
Random forest2 F1 score is 0.8301886792452831