import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
# Silence library warnings so the exploratory output stays readable.
warnings.filterwarnings('ignore')

# Load the dataset from the current working directory and take a first look.
df_Pima = pd.read_csv('diabetes.csv')
df_Pima.head()
df_Pima.describe()

# In these measurement columns a literal 0 is treated as "value missing", so
# zeros are converted to NaN before any statistics are computed.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
_zero_means_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df_Pima[_zero_means_missing] = df_Pima[_zero_means_missing].replace(0, np.nan)

# Count of missing values per column after the conversion.
df_Pima.isna().sum()
def find_mean(column, data=None):
    """Return the mean of ``column`` for each ``Outcome`` group.

    Args:
        column: Name of the numeric column to average.
        data: DataFrame containing ``column`` and an ``Outcome`` column.
            When omitted, the module-level ``df_Pima`` is used.

    Returns:
        A DataFrame with the columns ``Outcome`` and ``column``, holding one
        row per outcome value that has at least one non-null ``column`` entry.
    """
    if data is None:
        data = df_Pima
    # Keep only rows where the column is present, then average per outcome.
    observed = data.loc[data[column].notnull(), [column, 'Outcome']]
    return observed.groupby('Outcome')[[column]].mean().reset_index()
# Print the per-outcome mean of each measurement column, one table per column.
for _column in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'):
    print(find_mean(_column))
# Fill every missing measurement with the mean of that column among the rows
# sharing the same Outcome. The fill values are computed from the loaded data
# instead of being typed in by hand, so they cannot drift from the dataset or
# suffer from copy mistakes.
# NOTE(review): imputing per Outcome uses the target label to build features;
# confirm this is acceptable before trusting the downstream model scores.
for _column in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'):
    # transform('mean') yields, for each row, the mean of its Outcome group.
    _group_mean = df_Pima.groupby('Outcome')[_column].transform('mean')
    df_Pima[_column] = df_Pima[_column].fillna(_group_mean)

# Remaining missing values per column after imputation.
df_Pima.isna().sum()
# BMI against number of pregnancies, coloured by outcome.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Pregnancies', y='BMI', data=df_Pima, hue='Outcome', alpha=0.7)
plt.title('Comparison of BMI and Pregnancies')
plt.show()

# BMI against insulin, coloured by outcome.
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Insulin', y='BMI', data=df_Pima, hue='Outcome', alpha=0.7)
plt.title('Comparison of BMI and Insulin')
plt.show()

# Number of rows per outcome value. The figure is created explicitly and
# shown, so the plot also appears when the file is run as a plain script.
plt.figure()
plt.title('Count of Positive / Negative Outcome')
sns.countplot(data=df_Pima, y='Outcome')
plt.show()

# Age distribution drawn as unfilled step outlines, one per outcome value.
plt.figure(figsize=(10, 12))
plt.title('Age count with Positive / Negative Outcome')
sns.histplot(data=df_Pima, x='Age', hue='Outcome', element='step', fill=False)
plt.show()

# Pairwise correlation matrix of the columns.
df_Pima.corr()
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Target column and the six predictor columns used for the first models.
diabetes_labels = df_Pima['Outcome']
diabetes_features = df_Pima[
    ['Pregnancies', 'Glucose', 'SkinThickness', 'Insulin', 'BMI', 'Age']
]

# Standardise column by column (axis=0) and print the resulting array.
# NOTE(review): scaling is applied before the train/test split, so the test
# rows contribute to the scaling statistics - confirm this is intended.
scaled_features = scale(diabetes_features, axis=0)
print(scaled_features)

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
train_data, test_data, train_label, test_label = train_test_split(
    scaled_features,
    diabetes_labels,
    test_size=0.2,
    random_state=1,
)

# 3-nearest-neighbours classifier: fit on the training rows, predict the
# held-out rows, and evaluate the mean accuracy on them.
classifier = KNeighborsClassifier(n_neighbors=3).fit(train_data, train_label)
knn_prediction = classifier.predict(test_data)
classifier.score(test_data, test_label)
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees limited to depth 6, trained on the same split as
# the KNN model. The seed is fixed (matching the train/test split) so that the
# reported scores are reproducible from run to run.
forest_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=6,
    random_state=1,
)
forest_classifier.fit(train_data, train_label)
forest_prediction = forest_classifier.predict(test_data)
forest_classifier.score(test_data, test_label)
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# One entry per printed line: message prefix, metric function, predictions.
# Every metric is evaluated against the held-out labels in `test_label`.
_metric_report = (
    ('KNN Accuracey is: ', accuracy_score, knn_prediction),
    ('KNN Precision score is ', precision_score, knn_prediction),
    ('KNN F1 score is ', f1_score, knn_prediction),
    ('Random forest Accuracey is: ', accuracy_score, forest_prediction),
    ('Random forest Precision score is ', precision_score, forest_prediction),
    ('Random forest F1 score is ', f1_score, forest_prediction),
)
for _prefix, _metric, _predicted in _metric_report:
    print(_prefix + str(_metric(test_label, _predicted)))
# Second experiment: repeat both models using only four predictor columns.
diabetes_labels2 = df_Pima['Outcome']
diabetes_features2 = df_Pima[['Glucose', 'SkinThickness', 'Insulin', 'BMI']]

# Scale the reduced feature set itself (diabetes_features2), so that this
# experiment really runs on the four selected columns.
scaled_features2 = scale(diabetes_features2, axis=0)
train_data2, test_data2, train_label2, test_label2 = train_test_split(
    scaled_features2,
    diabetes_labels2,
    test_size=0.2,
    random_state=1,
)

# 3-nearest-neighbours classifier on the reduced features.
classifier2 = KNeighborsClassifier(n_neighbors=3)
classifier2.fit(train_data2, train_label2)
classifier2.score(test_data2, test_label2)
knn_prediction2 = classifier2.predict(test_data2)
print('KNN2 Accuracey is: ' + str(accuracy_score(test_label2, knn_prediction2)))
print('KNN2 Precision score is ' + str(precision_score(test_label2, knn_prediction2)))
print('KNN2 F1 score is ' + str(f1_score(test_label2, knn_prediction2)))

# Random forest (75 trees, depth 6) on the reduced features; the seed is fixed
# so the printed scores are reproducible.
forest_classifier2 = RandomForestClassifier(
    n_estimators=75,
    max_depth=6,
    random_state=1,
)
forest_classifier2.fit(train_data2, train_label2)
forest_classifier2.score(test_data2, test_label2)
predictions2 = forest_classifier2.predict(test_data2)
print('Random forest2 Accuracey is: ' + str(accuracy_score(test_label2, predictions2)))
print('Random forest2 Precision score is ' + str(precision_score(test_label2, predictions2)))
print('Random forest2 F1 score is ' + str(f1_score(test_label2, predictions2)))