# DMBA Imports
from dmba import regressionSummary, exhaustive_search
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import adjusted_r2_score, AIC_score, BIC_score
from dmba import plotDecisionTree, classificationSummary, gainsChart, liftChart
%matplotlib inline
import warnings
# Load/Review/Visualize
from zipfile import ZipFile
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_profiling
# For k-means clustering
from sklearn.cluster import KMeans
from pandas.plotting import parallel_coordinates
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
# Model/Prediction
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, auc
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
import statsmodels.api as sm
from keras_visualizer import visualizer
from keras.models import Sequential
from keras.layers import Dense
# Inputs
datafile="dataset_diabetes.zip"
mapping=[]
diabetic=[]
with ZipFile(datafile) as rawData:
    for info in rawData.infolist():
        print("Reading: ", info.filename)
        with rawData.open(info.filename) as f:
            if 'IDs_mapping' in info.filename:
                mapping = pd.read_csv(f)
            else:
                diabetic = pd.read_csv(f)
# Remove columns with too many unknown values or little relevance, based on the data description
dropped_Cols=['weight', 'medical_specialty', 'payer_code', 'encounter_id', 'patient_nbr']
diabetic.drop(columns=dropped_Cols, inplace=True)
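# Optional sanity check before the row drop below (a quick sketch): count how many
# '?' placeholders remain per column, so we know how much data the drop will remove.
missing_counts = diabetic.isin(['?']).sum()
print(missing_counts[missing_counts > 0].sort_values(ascending=False))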
# Drop remaining rows with unknown values
diabetic.drop(index=(diabetic[diabetic.isin(['?']).any(axis=1)].index), inplace=True)
# Let's check what's left
diabetic.shape
diabetic.head(20)
#diabetic = pd.get_dummies(diabetic, prefix_sep='_', drop_first=False)
Reading: dataset_diabetes/diabetic_data.csv
Reading: dataset_diabetes/IDs_mapping.csv
diabetic["isFemale"] = False
diabetic.loc[diabetic.gender == "Female", "isFemale"] = True
diabetic = diabetic[diabetic.gender != "Unknown/Invalid"]
diabetic.drop(columns=['gender'], inplace=True)
diabetic = pd.get_dummies(data = diabetic, columns = ["race"], prefix = "race", drop_first=False)
# Bucket the raw ICD-9 code in column `icd9` into a broad diagnosis group
def fix_diag(diabetic, icd9):
    if diabetic[icd9][0] == "E":
        return "other"
    elif diabetic[icd9][0] == "V":
        return "other"
    else:
        num = float(diabetic[icd9])
        if np.trunc(num) == 250:
            return "diabetes"
        elif num <= 139:
            return "other"
        elif num <= 279:
            return "neoplasms"
        elif num <= 389:
            return "other"
        elif num <= 459:
            return "circulatory"
        elif num <= 519:
            return "respiratory"
        elif num <= 579:
            return "digestive"
        elif num <= 629:
            return "genitourinary"
        elif num <= 679:
            return "other"
        elif num <= 709:
            return "neoplasms"
        elif num <= 739:
            return "musculoskeletal"
        elif num <= 759:
            return "other"
        elif num in [780, 781, 782, 783, 784]:
            return "neoplasms"
        elif num == 785:
            return "circulatory"
        elif num == 786:
            return "respiratory"
        elif num == 787:
            return "digestive"
        elif num == 788:
            return "genitourinary"
        elif num == 789:
            return "digestive"
        elif num in np.arange(790, 800):
            return "neoplasms"
        elif num >= 800:
            return "injury"
        else:
            return num
diabetic["diag1_norm"] = diabetic.apply(fix_diag, axis=1, icd9="diag_1")
diabetic["diag2_norm"] = diabetic.apply(fix_diag, axis=1, icd9="diag_2")
diabetic["diag3_norm"] = diabetic.apply(fix_diag, axis=1, icd9="diag_3")
def diagnose(diabetic, diag):
    if (diabetic["diag1_norm"] == diag) | (diabetic["diag2_norm"] == diag) | (diabetic["diag3_norm"] == diag):
        return True
    else:
        return False
for val in ['diabetes', 'other', 'circulatory', 'neoplasms', 'respiratory', 'injury', 'musculoskeletal', 'digestive', 'genitourinary']:
    name = val + "_diagnosis"
    diabetic[name] = diabetic.apply(diagnose, axis=1, diag=val)
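# Vectorized equivalent of the apply-based loop above (sketch): isin() across the three
# normalized diagnosis columns produces the same boolean flags and is much faster on a
# table with tens of thousands of rows; running it here simply overwrites the columns
# with identical values.
for val in ['diabetes', 'other', 'circulatory', 'neoplasms', 'respiratory', 'injury', 'musculoskeletal', 'digestive', 'genitourinary']:
    diabetic[val + "_diagnosis"] = diabetic[["diag1_norm", "diag2_norm", "diag3_norm"]].isin([val]).any(axis=1)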
dropped_Cols=['diag1_norm', 'diag2_norm', 'diag3_norm', 'diag_1', 'diag_2', 'diag_3']
diabetic.drop(columns=dropped_Cols, inplace=True)
common_drugs = ['metformin', 'repaglinide', 'glimepiride', 'glipizide',
'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin']
rare_drugs = ["nateglinide", "chlorpropamide", "acetohexamide", "tolbutamide",
"acarbose", "miglitol", "troglitazone", "tolazamide", "examide",
"citoglipton", "glyburide-metformin", "glipizide-metformin",
"glimepiride-pioglitazone", "metformin-rosiglitazone", "metformin-pioglitazone"]
drugs = common_drugs + rare_drugs
for drug in common_drugs:
    name = "take_" + drug
    diabetic[name] = diabetic[drug].isin(["Down", "Steady", "Up"])
diabetic.drop(columns=drugs, inplace=True)
diabetic = pd.get_dummies(data = diabetic, columns = ["readmitted"], prefix = "readmit", drop_first=False)
# drop_first is not used here so that readmit_NO (and readmit_>30) can be dropped explicitly
diabetic = diabetic.drop(["readmit_NO"], axis = 1)
diabetic = diabetic.drop(["readmit_>30"], axis = 1)
diabetic.rename(columns = {'readmit_<30': 'readmitted'}, inplace = True)
diabetic["A1C"] = diabetic["A1Cresult"]
diabetic.loc[diabetic.A1Cresult.isin([">7", ">8"]), "A1C"] = "Abnorm"
diabetic = pd.get_dummies(data = diabetic, columns = ["A1C"], prefix = "A1C", drop_first=False)
diabetic = diabetic.drop(["A1Cresult", "A1C_None"], axis = 1)
diabetic["glu_serum"] = diabetic["max_glu_serum"]
diabetic.loc[diabetic.max_glu_serum.isin([">200", ">300"]), "glu_serum"] = "Abnorm"
diabetic = pd.get_dummies(data = diabetic, columns = ["glu_serum"], prefix = "glu_serum", drop_first=False)
diabetic = diabetic.drop(["max_glu_serum", "glu_serum_None"], axis = 1)
diabetic.loc[diabetic.change == "Ch", "change"] = True
diabetic.loc[diabetic.change == "No", "change"] = False
diabetic.loc[diabetic.diabetesMed == "Yes", "diabetesMed"] = True
diabetic.loc[diabetic.diabetesMed == "No", "diabetesMed"] = False
diabetic.drop(columns=['admission_type_id', 'discharge_disposition_id', 'admission_source_id'], inplace=True)
# Extract the upper bound of the age bracket, e.g. "[70-80)" -> 80
def cutage(df):
    return int(df.age[-4::].replace('-', '').replace(')', ''))
diabetic["decade"] = diabetic.apply(cutage, axis = 1)
#var_quanti.append("age_num")
diabetic = diabetic.drop("age", axis = 1)
quant_values = ['decade', 'time_in_hospital', 'num_lab_procedures', 'num_procedures',
'num_medications', 'number_outpatient', 'number_emergency',
'number_inpatient','number_diagnoses' ]
diabetic
corr = diabetic.corr()
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns)
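# Hedged note: on newer pandas releases, DataFrame.corr() no longer silently drops
# non-numeric columns; selecting numeric and boolean dtypes explicitly keeps this cell
# working there as well and reproduces the older behaviour.
corr_numeric = diabetic.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(corr_numeric, xticklabels=corr_numeric.columns, yticklabels=corr_numeric.columns)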
# We will be using the list of quant_values
# Number of rows needed for a 3-column grid (ceiling division)
rowsTot = -(-(len(quant_values)) // 3)
# Starting grid position
rowNum = 0
colNum = 0
fig, axs = plt.subplots(rowsTot, 3, figsize=(20, 20))
for feature in quant_values:
    sns.countplot(data=diabetic, y=feature, ax=axs[rowNum, colNum])
    axs[rowNum, colNum].set_title(feature + str(rowNum) + str(colNum))
    # The column index cycles through 0, 1, 2; the row index advances after the last column
    if colNum < 2:
        colNum += 1
    else:
        colNum = 0
        rowNum += 1
plt.show()
sns.countplot(data = diabetic, y = 'readmitted').set_title('Distribution of Readmission')
fig = plt.figure(figsize=(13,7),)
ax=sns.kdeplot(diabetic.loc[(diabetic['readmitted'] == 0),'time_in_hospital'] , color='b',shade=True,label='Not Readmitted')
ax=sns.kdeplot(diabetic.loc[(diabetic['readmitted'] == 1),'time_in_hospital'] , color='r',shade=True, label='Readmitted')
ax.set(xlabel='Time in Hospital', ylabel='Frequency')
plt.legend()
plt.title('Time in Hospital compared to readmission')
# k-means clustering
inertia = []
for n_clusters in range(1, 10):
    kmeans = KMeans(n_clusters=n_clusters, random_state=1).fit(diabetic)
    inertia.append(kmeans.inertia_ / n_clusters)
inertias = pd.DataFrame({'n_clusters': range(1, 10), 'inertia': inertia})
ax = inertias.plot(x='n_clusters', y='inertia')
plt.xlabel('Number of clusters(k)')
plt.ylabel('Average Within-Cluster Squared Distances')
plt.ylim((0, 1.1 * inertias.inertia.max()))
ax.legend().set_visible(False)
plt.show()
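# K-means distances are scale-dependent and the counts/dummies above live on very
# different scales, so a common refinement (sketch only, not used by the results below)
# is to standardize the features first; StandardScaler is the only extra import.
from sklearn.preprocessing import StandardScaler
diabetic_scaled = StandardScaler().fit_transform(diabetic.astype(float))
kmeans_scaled = KMeans(n_clusters=4, random_state=0).fit(diabetic_scaled)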
kmeans = KMeans(n_clusters=4, random_state=0).fit(diabetic)
#Plot of centroid
centroids = pd.DataFrame(kmeans.cluster_centers_, columns=diabetic.columns)
centroids['cluster'] = ['Cluster {}'.format(i+1) for i in centroids.index]
fig = plt.figure(figsize=(10,6))
fig.subplots_adjust(right=3)
ax = parallel_coordinates(centroids, class_column='cluster', colormap='Dark2', linewidth=5)
plt.legend(loc='center left', bbox_to_anchor=(0.95, 0.5))
plt.xlim(-0.5,7.5)
centroids
# We first partition our data and prepare our input variables
X = diabetic.drop(['readmitted'],axis=1)
y = diabetic["readmitted"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)
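# The readmitted class is rare (roughly one encounter in nine), so a stratified split
# (sketch, optional) keeps the class ratio identical in both partitions:
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.4, random_state=1, stratify=y)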
rf = RandomForestClassifier(n_estimators=500, random_state=1)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
# variable (feature) importance plot
importances = rf.feature_importances_
std = np.std([tree.feature_importances_ for tree in rf.estimators_], axis=0)
rf_importance = pd.DataFrame({'feature': X_train.columns, 'importance': importances, 'std': std})
rf_importance = rf_importance.sort_values('importance')
pd.crosstab(pd.Series(y_test, name = 'Actual'), pd.Series(rf_pred, name = 'Predict'), margins = True)
accuracy_rf = accuracy_score(y_test, rf_pred)
precision_rf = precision_score(y_test, rf_pred)
recall_rf = recall_score(y_test, rf_pred)
summary_rf = classificationSummary(y_test, rf.predict(X_test))
print("Accuracy is {0:.2f}".format(accuracy_rf))
print("Precision is {0:.2f}".format(precision_rf))
print("Recall is {0:.2f}".format(recall_rf))
rf_proba = rf.predict_proba(X_test)
rf_result = pd.DataFrame({'actual': y_test,
'p(0)': [p[0] for p in rf_proba],
'p(1)': [p[1] for p in rf_proba],
'predicted': rf.predict(X_test),
})
rf_result = rf_result.sort_values(by=['p(1)'], ascending=False)
Confusion Matrix (Accuracy 0.8848)

       Prediction
Actual     0     1
     0 34668    34
     1  4484    35

Accuracy is 0.88
Precision is 0.51
Recall is 0.01
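# Recall is near zero at the default 0.5 cutoff because readmissions are rare. A quick
# sketch of threshold tuning on the class-1 probabilities (the 0.3 cutoff is purely
# illustrative, not tuned):
rf_pred_03 = (rf_proba[:, 1] >= 0.3).astype(int)
print("Recall at a 0.3 cutoff is {0:.2f}".format(recall_score(y_test, rf_pred_03)))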
# Variable (feature) importance plot: horizontal bars with std error bars
plt.rcParams['figure.figsize'] = [6, 20]
ax = rf_importance.plot(kind='barh', xerr='std', x='feature', legend=False)
ax.set_ylabel('')
plt.show()
logit = LogisticRegression(fit_intercept=True, penalty='l2', max_iter=1000)
logit.fit(X_train, y_train)
logit_pred = logit.predict(X_test)
pd.crosstab(pd.Series(y_test, name = 'Actual'), pd.Series(logit_pred, name = 'Predict'), margins = True)
print("Accuracy is {0:.2f}".format(accuracy_score(y_test, logit_pred)))
print("Precision is {0:.2f}".format(precision_score(y_test, logit_pred)))
print("Recall is {0:.2f}".format(recall_score(y_test, logit_pred)))
accuracy_logit = accuracy_score(y_test, logit_pred)
precision_logit = precision_score(y_test, logit_pred)
recall_logit = recall_score(y_test, logit_pred)
summary_logit = classificationSummary(y_test, logit.predict(X_test))
logit_proba = logit.predict_proba(X_test)
logit_result = pd.DataFrame({'actual': y_test,
'p(0)': [p[0] for p in logit_proba],
'p(1)': [p[1] for p in logit_proba],
'predicted': logit.predict(X_test),
})
logit_result = logit_result.sort_values(by=['p(1)'], ascending=False)
Accuracy is 0.88
Precision is 0.45
Recall is 0.01
Confusion Matrix (Accuracy 0.8844)

       Prediction
Actual     0     1
     0 34627    75
     1  4457    62
# Backward elimination: repeatedly refit a statsmodels Logit and drop the predictor
# with the largest p-value until all remaining p-values are below 0.05
def reduction_variable_logit(X_train, y_train, showVarToDel=False):
    final_model = False
    var_to_del = []
    while not final_model:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            log_reg = sm.Logit(y_train,
                               X_train.drop(var_to_del, axis=1).astype(float)).fit(maxiter=100, disp=False)
        max_pvalue = max(log_reg.pvalues)
        if max_pvalue < 0.05:
            final_model = True
        else:
            varToDel = log_reg.pvalues.index[log_reg.pvalues == max_pvalue].values[0]
            if showVarToDel:
                print(varToDel + ", p-value = " + str(max_pvalue))
            var_to_del.append(varToDel)
    return log_reg, var_to_del
log_reg, var_to_del = reduction_variable_logit(X_train, y_train, False)
var_to_del
newlogit = LogisticRegression(fit_intercept=True, penalty='l2', max_iter=1000)
newlogit.fit(X_train.drop(var_to_del, axis = 1), y_train)
newlogit_pred = newlogit.predict(X_test.drop(var_to_del, axis = 1))
pd.crosstab(pd.Series(y_test, name = 'Actual'), pd.Series(newlogit_pred, name = 'Predict'), margins = True)
print("Accuracy is {0:.2f}".format(accuracy_score(y_test, newlogit_pred)))
print("Precision is {0:.2f}".format(precision_score(y_test, newlogit_pred)))
print("Recall is {0:.2f}".format(recall_score(y_test, newlogit_pred)))
accuracy_newlogit = accuracy_score(y_test, newlogit_pred)
precision_newlogit = precision_score(y_test, newlogit_pred)
recall_newlogit = recall_score(y_test, newlogit_pred)
newlogit_proba = newlogit.predict_proba(X_test.drop(var_to_del, axis = 1))
newlogit_result = pd.DataFrame({'actual': y_test,
'p(0)': [p[0] for p in newlogit_proba],
'p(1)': [p[1] for p in newlogit_proba],
'predicted': newlogit.predict(X_test.drop(var_to_del, axis = 1)),
})
newlogit_result = newlogit_result.sort_values(by=['p(1)'], ascending=False)
Accuracy is 0.88
Precision is 0.47
Recall is 0.01
# Classification tree grown deep (entropy criterion, max_depth=28, min_samples_split=10)
fullClassTree = DecisionTreeClassifier(max_depth=28, criterion = "entropy", min_samples_split=10)
fullClassTree.fit(X_train, y_train)
fullClassTree_pred = fullClassTree.predict(X_test)
pd.crosstab(pd.Series(y_test, name = 'Actual'), pd.Series(fullClassTree_pred, name = 'Predict'), margins = True)
print("Accuracy is {0:.2f}".format(accuracy_score(y_test, fullClassTree_pred)))
print("Precision is {0:.2f}".format(precision_score(y_test, fullClassTree_pred)))
print("Recall is {0:.2f}".format(recall_score(y_test, fullClassTree_pred)))
accuracy_fulltree = accuracy_score(y_test, fullClassTree_pred)
precision_fulltree = precision_score(y_test, fullClassTree_pred)
recall_fulltree = recall_score(y_test, fullClassTree_pred)
fullClassTree_proba = fullClassTree.predict_proba(X_test)
fullClassTree_result = pd.DataFrame({'actual': y_test,
'p(0)': [p[0] for p in fullClassTree_proba],
'p(1)': [p[1] for p in fullClassTree_proba],
'predicted': fullClassTree.predict(X_test),
})
fullClassTree_result = fullClassTree_result.sort_values(by=['p(1)'], ascending=False)
Accuracy is 0.82
Precision is 0.14
Recall is 0.10
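# The depth-28 tree has the lowest test accuracy of the models above (0.82) and low
# precision (0.14). A hedged sketch using the already-imported cross_val_score to compare
# a few depths before committing to one; the depth grid is illustrative.
for depth in [5, 10, 20]:
    cvtree = DecisionTreeClassifier(max_depth=depth, criterion="entropy", min_samples_split=10)
    print("max_depth {}: mean CV accuracy {:.3f}".format(
        depth, cross_val_score(cvtree, X_train, y_train, cv=5).mean()))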
# create model
annmodel = Sequential()
annmodel.add(Dense(12, input_dim=38, activation='relu'))
annmodel.add(Dense(8, activation='relu'))
annmodel.add(Dense(1, activation='sigmoid'))
# Compile model
annmodel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
#print model summary
print('Neural Network Model Summary: ')
print(annmodel.summary())
Neural Network Model Summary:
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
dense (Dense) (None, 12) 468
_________________________________________________________________
dense_1 (Dense) (None, 8) 104
_________________________________________________________________
dense_2 (Dense) (None, 1) 9
=================================================================
Total params: 581
Trainable params: 581
Non-trainable params: 0
_________________________________________________________________
None
# Cast to plain integer NumPy arrays so Keras/TensorFlow can consume them
# (np.int is removed in recent NumPy releases, so use the builtin int)
X_ann = np.asarray(X).astype(int)
y_ann = np.asarray(y).astype(int)
X_train_ann = np.asarray(X_train).astype(int)
y_train_ann = np.asarray(y_train).astype(int)
X_test_ann = np.asarray(X_test).astype(int)
y_test_ann = np.asarray(y_test).astype(int)
# Fit the model; set verbose to 1 or 2 to see the training metrics in each epoch
history = annmodel.fit(X_ann, y_ann, validation_split=0.40, epochs=100, batch_size=2000, verbose=0)
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.rcParams['figure.figsize'] = [6, 6]
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
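# With roughly 88% of encounters not readmitted, the network can sit near 0.88 accuracy
# by predicting the majority class. A hedged sketch: train a fresh copy of the same
# architecture with class weights (the 1:8 weight is illustrative, roughly inverse frequency).
annmodel_w = Sequential()
annmodel_w.add(Dense(12, input_dim=X_ann.shape[1], activation='relu'))
annmodel_w.add(Dense(8, activation='relu'))
annmodel_w.add(Dense(1, activation='sigmoid'))
annmodel_w.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history_w = annmodel_w.fit(X_ann, y_ann, validation_split=0.40, epochs=100,
                           batch_size=2000, verbose=0, class_weight={0: 1.0, 1: 8.0})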
print("Confusion Matrix and acccuracy for Random Forest:")
classificationSummary(rf_result.actual, rf_result.predicted)
print("")
print("---------------------------------------------------------------")
print("Confusion Matrix and acccuracy for Logistic Regression:")
classificationSummary(logit_result.actual, logit_result.predicted)
print("")
print("---------------------------------------------------------------")
print("Confusion Matrix and acccuracy for Reduced Logistic Regression:")
classificationSummary(newlogit_result.actual, newlogit_result.predicted)
print("")
print("---------------------------------------------------------------")
print("Confusion Matrix and acccuracy for Classification Tree:")
classificationSummary(fullClassTree_result.actual, fullClassTree_result.predicted)
print("")
print("---------------------------------------------------------------")
print("Acccuracy for ANN model:")
history.history['val_accuracy'][-1]
Confusion Matrix and accuracy for Random Forest:
Confusion Matrix (Accuracy 0.8848)

       Prediction
Actual     0     1
     0 34668    34
     1  4484    35

---------------------------------------------------------------
Confusion Matrix and accuracy for Logistic Regression:
Confusion Matrix (Accuracy 0.8844)

       Prediction
Actual     0     1
     0 34627    75
     1  4457    62

---------------------------------------------------------------
Confusion Matrix and accuracy for Reduced Logistic Regression:
Confusion Matrix (Accuracy 0.8846)

       Prediction
Actual     0     1
     0 34631    71
     1  4457    62

---------------------------------------------------------------
Confusion Matrix and accuracy for Classification Tree:
Confusion Matrix (Accuracy 0.8211)

       Prediction
Actual     0     1
     0 31733  2969
     1  4049   470

---------------------------------------------------------------
Accuracy for ANN model:
ax = gainsChart(rf_result.actual, label='Random Forest', color='C3', figsize=[5, 5])
ax = gainsChart(logit_result.actual, label='Full model', color='C1', ax=ax)
ax = gainsChart(newlogit_result.actual, label='Reduced model', color='C0', ax=ax)
ax = gainsChart(fullClassTree_result.actual, label='Classification model', color='C2', ax=ax)
ax.legend()
plt.show()
fig, axs = plt.subplots(1, 4, figsize = (10, 6))
liftChart(rf_result['p(1)'], title='Random Forest', labelBars=False, ax=axs[0])
liftChart(logit_result['p(1)'], title='Full Model', labelBars=False, ax=axs[1])
liftChart(newlogit_result['p(1)'], title='Reduced model', labelBars=False, ax=axs[2])
liftChart(fullClassTree_result['p(1)'], title='Classification model', labelBars=False, ax=axs[3])
plt.tight_layout()
plt.show()
pred_y0 = rf.predict_proba(X_test)[:,1]  # Random Forest
pred_y1 = logit.predict_proba(X_test)[:,1]  # Logistic Regression
pred_y2 = newlogit.predict_proba(X_test.drop(var_to_del, axis=1))[:,1]  # Logistic Regression w/ reduced variables
pred_y3 = fullClassTree.predict_proba(X_test)[:,1]  # Classification tree
pred_y4 = annmodel.predict(X_test_ann).ravel()  # Keras ANN; flatten the (n, 1) output for roc_curve
fpr0, tpr0, threshold0 = roc_curve(y_test,pred_y0)
fpr1, tpr1, threshold1 = roc_curve(y_test,pred_y1)
fpr2, tpr2, threshold2 = roc_curve(y_test,pred_y2)
fpr3, tpr3, threshold3 = roc_curve(y_test,pred_y3)
fpr4, tpr4, threshold4 = roc_curve(y_test,pred_y4)
roc_auc0 = auc(fpr0, tpr0)
roc_auc1 = auc(fpr1, tpr1)
roc_auc2 = auc(fpr2, tpr2)
roc_auc3 = auc(fpr3, tpr3)
roc_auc4 = auc(fpr4, tpr4)
plt.figure(figsize=(7,7))
plt.title('Receiver Operating Characteristic')
plt.plot(fpr0, tpr0, 'k', label = 'AUC Random Forest= %0.2f' % roc_auc0)
plt.plot(fpr1, tpr1, 'b', label = 'AUC Sklearn Logit= %0.2f' % roc_auc1)
plt.plot(fpr2, tpr2, 'y', label = 'AUC Sklearn Log Red= %0.2f' % roc_auc2)
plt.plot(fpr3, tpr3, 'r', label = 'AUC Classification= %0.2f' % roc_auc3)
plt.plot(fpr4, tpr4, 'g', label = 'AUC Keras-ANN= %0.2f' % roc_auc4)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()