import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
d = pd.read_csv("diabetes_data_upload.csv")
d.head()
d.columns = ['age', 'gender', 'polyuria','polydipsia','swl','weak', 'polyphagia', 'gt', 'vb',
'itch','irritate','dh', 'pp', 'ms', 'alopesia', 'obesity', 'class']
le = preprocessing.LabelEncoder()
for i in d.columns:
    d[i] = le.fit_transform(d[i])  # note: this also label-encodes the numeric age column
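Because the same encoder is refit for every column, only the last column's classes_ survive on le. If the original string labels will be needed later, a variant of the loop above can record each column's encoding (a sketch; mappings is a name introduced here):
mappings = {}
for col in d.columns:
    col_le = preprocessing.LabelEncoder()
    d[col] = col_le.fit_transform(d[col])
    mappings[col] = dict(zip(col_le.classes_, col_le.transform(col_le.classes_)))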
d.head()
d.shape
d['class'].value_counts()
d.groupby('class').mean()
for i in d.columns:
    pd.crosstab(d[i], d['class']).plot(kind='bar')
    plt.title('diabetes vs ' + i)
    plt.xlabel(i)
    plt.ylabel('counts')
    plt.show()
X = d.loc[:, d.columns != 'class']
y = d.loc[:, d.columns == 'class']
!pip install imblearn
Collecting imblearn
Collecting imbalanced-learn
Collecting scikit-learn>=1.0.1
Installing collected packages: scikit-learn, imbalanced-learn, imblearn
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ppscore 1.2.0 requires scikit-learn<1.0.0,>=0.20.2, but you have scikit-learn 1.0.2 which is incompatible.
Successfully installed imbalanced-learn-0.9.0 imblearn-0.0 scikit-learn-1.0.2
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns
# fit_sample was removed in imbalanced-learn 0.8; fit_resample is the current name
os_data_X, os_data_y = os.fit_resample(X_train, y_train)
print("length of oversampled data is ",len(os_data_X))
print("Number of no diabetes in oversampled data",len(os_data_y[os_data_y['class']==0]))
print("Number of people with diabetes",len(os_data_y[os_data_y['class']==1]))
print("Proportion of no diabetes data in oversampled data is ",len(os_data_y[os_data_y['class']==0])/len(os_data_X))
print("Proportion of diabetes data in oversampled data is ",len(os_data_y[os_data_y['class']==1])/len(os_data_X))
length of oversampled data is 452
Number of no diabetes in oversampled data 226
Number of people with diabetes 226
Proportion of no diabetes data in oversampled data is 0.5
Proportion of diabetes data in oversampled data is 0.5
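Note that SMOTE is fit on the training split only; resampling before the split would leak synthetic points into the test set. One way to make that ordering hard to get wrong is imblearn's pipeline, which applies SMOTE during fit only (a sketch, not part of the original flow; max_iter=1000 is an assumption added to ensure convergence):
from imblearn.pipeline import Pipeline
pipe = Pipeline([('smote', SMOTE(random_state=0)),
                 ('clf', LogisticRegression(max_iter=1000))])
pipe.fit(X_train, y_train.values.ravel())  # SMOTE runs inside fit only, never at predict time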
### Run recursive feature elimination (RFE) once here to check whether any poorly performing variables should be removed
d_vars = d.columns.values.tolist()
target = ['class']
feature_names = [i for i in d_vars if i not in target]  # renamed so the X and y DataFrames above are not overwritten
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
rfe = RFE(logreg, n_features_to_select=20)
### 20 here is the number of features to select; any value at least as large as the number of variables keeps them all
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())  # ravel() avoids the column-vector warning
print(rfe.support_)
print(rfe.ranking_)
[ True True True True True True True True True True True True
True True True True]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
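To read support_ against the actual column names rather than positions, a short sketch (assuming os_data_X came back as a DataFrame):
selected = [name for name, keep in zip(os_data_X.columns, rfe.support_) if keep]
print(selected)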
### From the output above, no variables need to be removed
### Now fit the logistic regression model
import statsmodels.api as sm
logit_model = sm.Logit(y_train, X_train)  # note: no constant is added, so this model has no intercept
result = logit_model.fit()
print(result.summary2())
Optimization terminated successfully.
Current function value: 0.172118
Iterations 9
Results: Logit
=================================================================
Model: Logit Pseudo R-squared: 0.741
Dependent Variable: class AIC: 157.3017
Date: 2021-01-28 06:55 BIC: 219.6561
No. Observations: 364 Log-Likelihood: -62.651
Df Model: 15 LL-Null: -241.56
Df Residuals: 348 LLR p-value: 4.8509e-67
Converged: 1.0000 Scale: 1.0000
No. Iterations: 9.0000
------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
------------------------------------------------------------------
age -0.0140 0.0223 -0.6282 0.5299 -0.0578 0.0297
gender -3.7350 0.6531 -5.7186 0.0000 -5.0151 -2.4549
polyuria 3.8914 0.7433 5.2356 0.0000 2.4347 5.3482
polydipsia 4.8229 0.9580 5.0346 0.0000 2.9454 6.7005
swl 0.4687 0.6749 0.6944 0.4874 -0.8541 1.7914
weak 0.8314 0.6682 1.2442 0.2134 -0.4783 2.1411
polyphagia 0.8471 0.5809 1.4582 0.1448 -0.2915 1.9856
gt 2.1532 0.6658 3.2337 0.0012 0.8481 3.4582
vb 0.1928 0.7367 0.2617 0.7936 -1.2511 1.6366
itch -2.0452 0.7039 -2.9057 0.0037 -3.4248 -0.6656
irritate 2.1782 0.7367 2.9569 0.0031 0.7344 3.6221
dh -0.4645 0.6692 -0.6942 0.4876 -1.7761 0.8471
pp 1.3949 0.6610 2.1104 0.0348 0.0995 2.6904
ms -0.2034 0.6122 -0.3323 0.7397 -1.4033 0.9965
alopesia -0.1923 0.6752 -0.2848 0.7758 -1.5158 1.1311
obesity 0.0285 0.6617 0.0430 0.9657 -1.2684 1.3254
=================================================================
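sm.Logit(y, X) as used above fits the model without an intercept because no constant column is added. If an intercept is wanted, a sketch (the coefficient table will change accordingly):
X_train_c = sm.add_constant(X_train)  # prepends an intercept column named 'const'
result_c = sm.Logit(y_train, X_train_c).fit()
print(result_c.summary2())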
logit_model = sm.Logit(os_data_y, os_data_X)
result = logit_model.fit()
print(result.summary2())
Optimization terminated successfully.
Current function value: 0.153451
Iterations 10
Results: Logit
=================================================================
Model: Logit Pseudo R-squared: 0.779
Dependent Variable: class AIC: 170.7199
Date: 2021-01-28 06:55 BIC: 236.5388
No. Observations: 452 Log-Likelihood: -69.360
Df Model: 15 LL-Null: -313.30
Df Residuals: 436 LLR p-value: 2.0607e-94
Converged: 1.0000 Scale: 1.0000
No. Iterations: 10.0000
------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
------------------------------------------------------------------
age -0.0280 0.0218 -1.2826 0.1996 -0.0708 0.0148
gender -4.0515 0.6481 -6.2516 0.0000 -5.3217 -2.7813
polyuria 4.0793 0.7386 5.5228 0.0000 2.6316 5.5270
polydipsia 5.2951 0.9764 5.4232 0.0000 3.3814 7.2087
swl 0.5876 0.7002 0.8392 0.4013 -0.7847 1.9600
weak 0.8791 0.6825 1.2881 0.1977 -0.4585 2.2167
polyphagia 0.8906 0.5828 1.5282 0.1265 -0.2516 2.0329
gt 2.3781 0.6572 3.6184 0.0003 1.0899 3.6663
vb 0.4000 0.7403 0.5402 0.5890 -1.0511 1.8510
itch -2.1634 0.7076 -3.0575 0.0022 -3.5501 -0.7766
irritate 2.5823 0.7559 3.4163 0.0006 1.1008 4.0638
dh -0.5465 0.6680 -0.8181 0.4133 -1.8557 0.7628
pp 1.3940 0.6639 2.0996 0.0358 0.0927 2.6953
ms -0.1276 0.6211 -0.2054 0.8373 -1.3449 1.0897
alopesia -0.1322 0.6907 -0.1914 0.8482 -1.4859 1.2215
obesity -0.1282 0.6603 -0.1942 0.8460 -1.4223 1.1659
=================================================================
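Logit coefficients are on the log-odds scale; exponentiating them gives odds ratios, which are easier to read off (a quick sketch using the fitted result above):
odds_ratios = np.exp(result.params)
print(odds_ratios.sort_values(ascending=False))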
logit_model = sm.Logit(y_test, X_test)
result = logit_model.fit()
print(result.summary2())
Optimization terminated successfully.
Current function value: 0.144945
Iterations 10
Results: Logit
=================================================================
Model: Logit Pseudo R-squared: 0.784
Dependent Variable: class AIC: 77.2229
Date: 2021-01-28 06:55 BIC: 126.0206
No. Observations: 156 Log-Likelihood: -22.611
Df Model: 15 LL-Null: -104.83
Df Residuals: 140 LLR p-value: 3.2004e-27
Converged: 1.0000 Scale: 1.0000
No. Iterations: 10.0000
------------------------------------------------------------------
Coef. Std.Err. z P>|z| [0.025 0.975]
------------------------------------------------------------------
age -0.0196 0.0550 -0.3561 0.7218 -0.1273 0.0882
gender -4.1361 1.2444 -3.3239 0.0009 -6.5750 -1.6972
polyuria 5.7761 1.8137 3.1847 0.0014 2.2213 9.3309
polydipsia 5.4883 1.6854 3.2564 0.0011 2.1850 8.7917
swl -0.1104 1.1132 -0.0992 0.9210 -2.2922 2.0714
weak 1.8571 1.1102 1.6728 0.0944 -0.3188 4.0329
polyphagia 1.6790 1.2548 1.3380 0.1809 -0.7805 4.1385
gt 1.9952 1.2874 1.5498 0.1212 -0.5281 4.5186
vb 2.7913 1.4097 1.9801 0.0477 0.0284 5.5543
itch -4.0175 1.6191 -2.4813 0.0131 -7.1909 -0.8441
irritate 2.3554 1.2193 1.9318 0.0534 -0.0344 4.7452
dh -0.7667 1.2578 -0.6096 0.5422 -3.2319 1.6985
pp 1.5849 1.0993 1.4418 0.1494 -0.5696 3.7394
ms -2.3828 1.2492 -1.9074 0.0565 -4.8312 0.0657
alopesia -0.5948 1.3142 -0.4526 0.6508 -3.1706 1.9810
obesity -0.5416 1.2069 -0.4487 0.6536 -2.9071 1.8239
=================================================================
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train, y_train.values.ravel())  # ravel() avoids the column-vector warning
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
Accuracy of logistic regression classifier on test set: 0.94
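Accuracy alone can be misleading when classes are imbalanced; a quick sketch of a fuller picture using the same y_pred:
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))       # rows: true class, columns: predicted class
print(classification_report(y_test, y_pred))  # per-class precision, recall and F1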
## Plot an ROC curve to evaluate the model
### The farther the ROC curve sits above the diagonal, the better
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])  # AUC should use probabilities, not hard 0/1 predictions
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
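A single train/test split can be optimistic; as a final check, a cross-validated AUC over the full data (a sketch, assuming X and y are still the feature and target DataFrames built earlier; max_iter=1000 is an assumption):
from sklearn.model_selection import cross_val_score
cv_auc = cross_val_score(LogisticRegression(max_iter=1000), X, y.values.ravel(), cv=5, scoring='roc_auc')
print('5-fold ROC AUC: %.3f +/- %.3f' % (cv_auc.mean(), cv_auc.std()))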