import pandas as pd
import numpy as np
#1,2
# Read the dataset from 'diabetes.csv' (resolved relative to the current
# working directory) into a DataFrame, then show its first five rows.
diabetes_data = pd.read_csv('diabetes.csv')
print(diabetes_data.head(n=5))
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \
0 6 148 72 35 0 33.6
1 1 85 66 29 0 26.6
2 8 183 64 0 0 23.3
3 1 89 66 23 94 28.1
4 0 137 40 35 168 43.1
DiabetesPedigreeFunction Age Outcome
0 0.627 50 1
1 0.351 31 0
2 0.672 32 1
3 0.167 21 0
4 2.288 33 1
#3
# Number of columns in the DataFrame (second entry of its shape tuple).
print(diabetes_data.shape[1])
9
#4
# Number of rows in the DataFrame (first entry of its shape tuple).
print(diabetes_data.shape[0])
768
#5
# Count the missing (null) entries in each column.
print(diabetes_data.isna().sum())
Pregnancies 0
Glucose 0
BloodPressure 0
SkinThickness 0
Insulin 0
BMI 0
DiabetesPedigreeFunction 0
Age 0
Outcome 0
dtype: int64
#6
# Print summary statistics (count, mean, std, min, quartiles, max) per column.
# NOTE(review): Outcome does not appear in the printed summary; presumably
# because it is stored as object dtype (see the info() output further down)
# and describe() only covers numeric columns by default — confirm.
print(diabetes_data.describe())
Pregnancies Glucose BloodPressure SkinThickness Insulin \
count 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479
std 3.369578 31.972618 19.355807 15.952218 115.244002
min 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000
75% 6.000000 140.250000 80.000000 32.000000 127.250000
max 17.000000 199.000000 122.000000 99.000000 846.000000
BMI DiabetesPedigreeFunction Age
count 768.000000 768.000000 768.000000
mean 31.992578 0.471876 33.240885
std 7.884160 0.331329 11.760232
min 0.000000 0.078000 21.000000
25% 27.300000 0.243750 24.000000
50% 32.000000 0.372500 29.000000
75% 36.600000 0.626250 41.000000
max 67.100000 2.420000 81.000000
#7,8,9
# In these measurement columns a stored 0 is treated as "value not recorded",
# so zeros are converted to NaN to make them show up in isnull()/isna() counts.
# Columns not listed here (e.g. Pregnancies) keep their zeros.
# NOTE(review): presumably a zero reading is not a plausible measurement for
# these fields — confirm against the dataset description.
# np.nan is used rather than np.NaN: the capitalised alias was removed in
# NumPy 2.0 and raises AttributeError there.
zero_as_missing_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
diabetes_data[zero_as_missing_columns] = diabetes_data[zero_as_missing_columns].replace(0, np.nan)
#10
# Recount the missing entries per column now that zeros in the selected
# measurement columns have been turned into NaN.
print(diabetes_data.isna().sum())
Pregnancies 0
Glucose 5
BloodPressure 35
SkinThickness 227
Insulin 374
BMI 11
DiabetesPedigreeFunction 0
Age 0
Outcome 0
dtype: int64
#11,12,13
# Show only the rows that contain a missing value in at least one column:
# build a per-row boolean mask and use it to select from the DataFrame.
print(diabetes_data.loc[diabetes_data.isna().any(axis='columns')])
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI \
0 6 148.0 72.0 35.0 NaN 33.6
1 1 85.0 66.0 29.0 NaN 26.6
2 8 183.0 64.0 NaN NaN 23.3
5 5 116.0 74.0 NaN NaN 25.6
7 10 115.0 NaN NaN NaN 35.3
.. ... ... ... ... ... ...
761 9 170.0 74.0 31.0 NaN 44.0
762 9 89.0 62.0 NaN NaN 22.5
764 2 122.0 70.0 27.0 NaN 36.8
766 1 126.0 60.0 NaN NaN 30.1
767 1 93.0 70.0 31.0 NaN 30.4
DiabetesPedigreeFunction Age Outcome
0 0.627 50 1
1 0.351 31 0
2 0.672 32 1
5 0.201 30 0
7 0.134 29 0
.. ... ... ...
761 0.403 43 1
762 0.142 33 0
764 0.340 27 0
766 0.349 47 1
767 0.315 23 0
[376 rows x 9 columns]
# Print the column names, non-null counts and dtypes of the DataFrame.
# NOTE(review): DataFrame.info() writes its report to stdout itself and
# returns None, so wrapping it in print() emits the extra trailing "None"
# line seen in the output; calling diabetes_data.info() on its own would
# avoid that.
print(diabetes_data.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Pregnancies 768 non-null int64
1 Glucose 763 non-null float64
2 BloodPressure 733 non-null float64
3 SkinThickness 541 non-null float64
4 Insulin 394 non-null float64
5 BMI 757 non-null float64
6 DiabetesPedigreeFunction 768 non-null float64
7 Age 768 non-null int64
8 Outcome 768 non-null object
dtypes: float64(6), int64(2), object(1)
memory usage: 54.1+ KB
None
#14,15
# List the distinct values stored in the Outcome column.
# NOTE(review): the printed result includes the letter 'O' next to '1' and
# '0', which looks like a mistyped zero in the data — verify and clean up
# before converting Outcome to a numeric type.
print(diabetes_data['Outcome'].unique())
['1' '0' 'O']