import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
d = pd.read_csv("diabetes_data_upload.csv")
d.head()
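# Optional sanity check before encoding: count missing values per column
d.isnull().sum()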
d.columns = ['age', 'gender', 'polyuria', 'polydipsia', 'swl', 'weak', 'polyphagia', 'gt', 'vb',
             'itch', 'irritate', 'dh', 'pp', 'ms', 'alopecia', 'obesity', 'class']
le = preprocessing.LabelEncoder()
# Encode each column's categorical labels (Yes/No, Male/Female, ...) as integers
for i in d.columns:
    d[i] = le.fit_transform(d[i])
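# Note: `le` is re-fit on every column, so it now holds only the last column's
# mapping (here 'class'). A minimal sanity check of that final encoding:
print(dict(zip(le.classes_, le.transform(le.classes_))))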
d.head()
d.shape
d['class'].value_counts()
d.groupby('class').mean()
# Bar-plot each feature against the diabetes class label
for i in d.columns:
    pd.crosstab(d[i], d['class']).plot(kind='bar')
    plt.title('diabetes vs ' + i)
    plt.xlabel(i)
    plt.ylabel('counts')
    plt.show()
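### seaborn is imported above but otherwise unused; as an alternative sketch, a single
### countplot gives the same per-class counts for one feature ('polyuria' chosen arbitrarily):
sns.countplot(x='polyuria', hue='class', data=d)
plt.title('diabetes vs polyuria')
plt.show()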
X = d.loc[:, d.columns != 'class']
y = d.loc[:, d.columns == 'class']
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=0)  # avoid naming the oversampler 'os', which shadows the standard-library module
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns
os_data_X, os_data_y = smote.fit_resample(X_train, y_train)  # fit_sample was renamed fit_resample in imbalanced-learn 0.4
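# Older imbalanced-learn releases return NumPy arrays from fit_resample; rebuilding
# DataFrames with the saved column names keeps the 'class' lookups below working
# (a defensive step, harmless when DataFrames are already returned)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['class'])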
print("length of oversampled data is ",len(os_data_X))
print("Number of no diabetes in oversampled data",len(os_data_y[os_data_y['class']==0]))
print("Number of people with diabetes",len(os_data_y[os_data_y['class']==1]))
print("Proportion of no diabetes data in oversampled data is ",len(os_data_y[os_data_y['class']==0])/len(os_data_X))
print("Proportion of diabetes data in oversampled data is ",len(os_data_y[os_data_y['class']==1])/len(os_data_X))
### Run recursive feature elimination (RFE) once here to check whether any poorly performing variables should be removed
d_vars = d.columns.values.tolist()
target = ['class']
feature_names = [i for i in d_vars if i not in target]  # candidate features for RFE; kept separate so the X/y data above are not overwritten
from sklearn.feature_selection import RFE
logreg = LogisticRegression(max_iter=1000)  # raise max_iter so the lbfgs solver converges
rfe = RFE(logreg, n_features_to_select=20)
## 20 is the number of features to select; any value at least as large as the number of variables keeps them all, so RFE only reports rankings here
rfe = rfe.fit(os_data_X, os_data_y.values.ravel())  # ravel() flattens the single-column target to 1-D, as sklearn expects
print(rfe.support_)
print(rfe.ranking_)
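### A small convenience step: map the boolean mask and rankings back to the feature
### names saved from X_train above, so the RFE verdict is explicit
print(list(zip(columns, rfe.support_, rfe.ranking_)))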
### Based on the output above, no variables need to be removed
### Now fit the logistic regression model
import statsmodels.api as sm
# Fit a statsmodels Logit on the raw training split to inspect coefficients and p-values
logit_model = sm.Logit(y_train, X_train)
result = logit_model.fit()
print(result.summary2())
# Repeat on the SMOTE-oversampled training data
logit_model = sm.Logit(os_data_y, os_data_X)
result = logit_model.fit()
print(result.summary2())
# Repeat on the held-out test split for comparison
logit_model = sm.Logit(y_test, X_test)
result = logit_model.fit()
print(result.summary2())
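### Note: sm.Logit does not add an intercept by default; a hedged variant of the
### training-split fit that includes one via statsmodels' add_constant helper:
X_train_const = sm.add_constant(X_train)
result_const = sm.Logit(y_train, X_train_const).fit()
print(result_const.summary2())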
logreg = LogisticRegression(max_iter=1000)  # raise max_iter so the lbfgs solver converges
logreg.fit(X_train, y_train.values.ravel())  # ravel() flattens the single-column target, avoiding sklearn's shape warning
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))
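## Accuracy alone can hide per-class errors; standard sklearn.metrics diagnostics:
from sklearn.metrics import confusion_matrix, classification_report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))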
## Plot a ROC curve to evaluate the model.
### The farther the ROC curve sits from the diagonal, the better the classifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
logit_roc_auc = roc_auc_score(y_test, logreg.predict_proba(X_test)[:, 1])  # AUC is computed from predicted probabilities, not hard 0/1 predictions
fpr, tpr, thresholds = roc_curve(y_test, logreg.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.savefig('Log_ROC')
plt.show()
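### As a hedge against a lucky single split, a minimal 10-fold cross-validation sketch
### on the full encoded data (X and y as defined before the train/test split):
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(LogisticRegression(max_iter=1000), X, y.values.ravel(), cv=10)
print("10-fold CV accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std()))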