import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn import metrics
from matplotlib import pyplot as plt
from sklearn import tree
from mlxtend.plotting import plot_decision_regions
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
import random
from pydotplus import graph_from_dot_data
from sklearn.tree import export_graphviz
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import graphviz
from IPython.display import display
Carregando o dataset
# Load the dataset from 'diabetes.csv' (path is relative to the working directory).
data = pd.read_csv('diabetes.csv')
# Preview the first rows (notebook cell output).
data.head()
Pregnanciesint64
Glucoseint64
0
6
148
1
1
85
2
8
183
3
1
89
4
0
137
# Show per-column summary statistics (count, mean, std, min, quartiles, max).
data.describe()
Pregnanciesfloat64
Glucosefloat64
count
768
768
mean
3.845052083
120.8945313
std
3.369578063
31.9726182
min
0
0
25%
1
99
50%
3
117
75%
6
140.25
max
17
199
# Feature matrix: every column of the dataset except the last one.
X = data.iloc[:,:-1]
# Display the feature matrix (notebook cell output).
X
Pregnanciesint64
0 - 17
Glucoseint64
0 - 199
0
6
148
1
1
85
2
8
183
3
1
89
4
0
137
5
5
116
6
3
78
7
10
115
8
2
197
9
8
125
# Target vector: the last column of the dataset.
y = data.iloc[:, -1]
Utilizando Cross-Validation Estratificado do Sklearn
# Stratified shuffle-split cross-validation reused by the experiments below:
# 10 random splits, each holding out 20% of the rows as the test set.
# random_state=None leaves the splitter unseeded, so splits differ between
# calls. (random.seed() returns None, so it cannot be used to supply a seed
# here.) Pass an integer instead to make the splits reproducible.
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=None)
Experimento A
# Experiment A: entropy-based decision tree with no depth or leaf-size limit,
# scored with the shared cross-validation splitter.
clf = DecisionTreeClassifier(criterion="entropy")
scores = cross_val_score(estimator=clf, X=X, y=y, cv=cv)
print("Accuracy:", scores.mean())
Accuracy: 0.688961038961039
# Labels handed to the tree renderers for the feature columns.
feature_names = [
    "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
    "Insulin", "BMI", "DiabetesPedigreeFunction", "Age",
]

# Refit the Experiment A tree on the training part of every CV split. Each fit
# replaces the previous one, so the tree drawn below comes from the last split.
# cv.split yields positional row indices, hence .iloc rather than label-based .loc.
# NOTE(review): indentation was lost in this export; the loop body is assumed
# to be the fit call only -- confirm against the notebook.
for train_index, test_index in cv.split(X, y):
    clf.fit(X.iloc[train_index], y.iloc[train_index])

# Draw the whole fitted tree with matplotlib.
fig, ax = plt.subplots(figsize=(10, 10))
tree.plot_tree(clf, fontsize=2, feature_names=feature_names)
plt.show()

# Export the tree (truncated at depth 3) to Graphviz DOT and save it as a PNG.
dot_data = export_graphviz(
    clf, filled=True, rounded=True, max_depth=3, out_file=None,
    feature_names=feature_names, class_names=['0', '1'],
)
graph = graph_from_dot_data(dot_data)
graph.write_png('/work/masters-degree-class-assignments/AM/decision-trees/ExperimentA-plot-tree.png')
# testX = np.array(data.iloc[:, [1, 7]])  # selects columns 1 and 7
# testY = np.array(data.iloc[:, -1])      # selects the last column
Experimento B
# Experiment B: sweep max_depth from 1 to 30 and print the mean
# cross-validated accuracy for each value.
# The "max_deph" spelling in the printed label is kept unchanged so new output
# stays comparable with previously recorded runs.
for depth in range(1, 31):
    clf = DecisionTreeClassifier(
        criterion="entropy",
        max_depth=depth,
        random_state=31,
    )
    score = cross_val_score(clf, X, y, cv=cv)
    print("Accuracy of max_deph=" + str(depth) + " | " + str(score.mean()))
Accuracy of max_deph=1 | 0.718181818181818
Accuracy of max_deph=2 | 0.7383116883116883
Accuracy of max_deph=3 | 0.7311688311688311
Accuracy of max_deph=4 | 0.7181818181818183
Accuracy of max_deph=5 | 0.72987012987013
Accuracy of max_deph=6 | 0.72987012987013
Accuracy of max_deph=7 | 0.7006493506493506
Accuracy of max_deph=8 | 0.7181818181818183
Accuracy of max_deph=9 | 0.7266233766233766
Accuracy of max_deph=10 | 0.7253246753246753
Accuracy of max_deph=11 | 0.7084415584415585
Accuracy of max_deph=12 | 0.6863636363636363
Accuracy of max_deph=13 | 0.7045454545454545
Accuracy of max_deph=14 | 0.6863636363636364
Accuracy of max_deph=15 | 0.7077922077922079
Accuracy of max_deph=16 | 0.7045454545454546
Accuracy of max_deph=17 | 0.7123376623376623
Accuracy of max_deph=18 | 0.6948051948051948
Accuracy of max_deph=19 | 0.7071428571428572
Accuracy of max_deph=20 | 0.7129870129870131
Accuracy of max_deph=21 | 0.7214285714285714
Accuracy of max_deph=22 | 0.7097402597402597
Accuracy of max_deph=23 | 0.7084415584415584
Accuracy of max_deph=24 | 0.7227272727272728
Accuracy of max_deph=25 | 0.7110389610389609
Accuracy of max_deph=26 | 0.6967532467532467
Accuracy of max_deph=27 | 0.6922077922077922
Accuracy of max_deph=28 | 0.6746753246753248
Accuracy of max_deph=29 | 0.7214285714285713
Accuracy of max_deph=30 | 0.6805194805194805
# Sweep min_samples_leaf from 5 to 200 (step 5) and print the mean
# cross-validated accuracy for each value.
for leaf_size in range(5, 201, 5):
    # random_state=None keeps the tree unseeded. (random.seed() returns None,
    # so it cannot be used to supply a seed here.)
    clf = DecisionTreeClassifier(
        random_state=None,
        criterion="entropy",
        min_samples_leaf=leaf_size,
    )
    score = cross_val_score(clf, X, y, cv=cv)
    print("Accuracy of min_samples_leaf=" + str(leaf_size) + " | " + str(score.mean()))
Accuracy of min_samples_leaf=5 | 0.6948051948051949
Accuracy of min_samples_leaf=10 | 0.7668831168831168
Accuracy of min_samples_leaf=15 | 0.735064935064935
Accuracy of min_samples_leaf=20 | 0.7623376623376623
Accuracy of min_samples_leaf=25 | 0.7480519480519481
Accuracy of min_samples_leaf=30 | 0.7616883116883117
Accuracy of min_samples_leaf=35 | 0.7363636363636364
Accuracy of min_samples_leaf=40 | 0.7474025974025975
Accuracy of min_samples_leaf=45 | 0.7454545454545454
Accuracy of min_samples_leaf=50 | 0.7454545454545455
Accuracy of min_samples_leaf=55 | 0.7415584415584415
Accuracy of min_samples_leaf=60 | 0.7603896103896105
Accuracy of min_samples_leaf=65 | 0.7415584415584415
Accuracy of min_samples_leaf=70 | 0.7454545454545455
Accuracy of min_samples_leaf=75 | 0.7344155844155844
Accuracy of min_samples_leaf=80 | 0.7480519480519481
Accuracy of min_samples_leaf=85 | 0.7272727272727273
Accuracy of min_samples_leaf=90 | 0.7376623376623377
Accuracy of min_samples_leaf=95 | 0.740909090909091
Accuracy of min_samples_leaf=100 | 0.7266233766233766
Accuracy of min_samples_leaf=105 | 0.7311688311688311
Accuracy of min_samples_leaf=110 | 0.7298701298701299
Accuracy of min_samples_leaf=115 | 0.7155844155844155
Accuracy of min_samples_leaf=120 | 0.7298701298701299
Accuracy of min_samples_leaf=125 | 0.7285714285714285
Accuracy of min_samples_leaf=130 | 0.7246753246753247
Accuracy of min_samples_leaf=135 | 0.7344155844155844
Accuracy of min_samples_leaf=140 | 0.6980519480519479
Accuracy of min_samples_leaf=145 | 0.7272727272727273
Accuracy of min_samples_leaf=150 | 0.7149350649350649
Accuracy of min_samples_leaf=155 | 0.7116883116883116
Accuracy of min_samples_leaf=160 | 0.7175324675324675
Accuracy of min_samples_leaf=165 | 0.7253246753246753
Accuracy of min_samples_leaf=170 | 0.7344155844155844
Accuracy of min_samples_leaf=175 | 0.7207792207792207
Accuracy of min_samples_leaf=180 | 0.7077922077922078
Accuracy of min_samples_leaf=185 | 0.724025974025974
Accuracy of min_samples_leaf=190 | 0.712987012987013
Accuracy of min_samples_leaf=195 | 0.7227272727272728
Accuracy of min_samples_leaf=200 | 0.7337662337662338
# Labels handed to the tree renderer for the feature columns.
feature_names = [
    "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
    "Insulin", "BMI", "DiabetesPedigreeFunction", "Age",
]

# Draw the trees grown with min_samples_leaf=50 and then min_samples_leaf=5.
for leaf_size in (50, 5):
    clf = DecisionTreeClassifier(random_state=1, criterion="entropy", min_samples_leaf=leaf_size)

    # Refit on the training part of every CV split; only the last fit is kept
    # and plotted. cv.split yields positional row indices, hence .iloc.
    # NOTE(review): indentation was lost in this export; the inner loop body is
    # assumed to be the fit call only -- confirm against the notebook.
    for train_index, test_index in cv.split(X, y):
        clf.fit(X.iloc[train_index], y.iloc[train_index])

    fig, ax = plt.subplots(figsize=(10, 10))
    tree.plot_tree(clf, fontsize=4, feature_names=feature_names)
    plt.show()
# Performance when the min_samples_leaf and max_depth hyperparameters are combined.
clf = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=2,
    min_samples_leaf=20,
    random_state=31,
)
score = cross_val_score(estimator=clf, X=X, y=y, cv=cv)
print("Accuracy of " + str(score.mean()))
Accuracy of 0.7337662337662337
# Keep a single train/test partition for later cells: the last split produced
# by the CV splitter. cv.split yields positional row indices, hence .iloc
# rather than label-based .loc.
splits = list(cv.split(X, y))
train_index, test_index = splits[-1]

# Copies, so later changes to the partitions cannot touch X / y.
X_train = X.iloc[train_index].copy()
y_train = y.iloc[train_index].copy()
X_test = X.iloc[test_index].copy()
y_test = y.iloc[test_index].copy()
Baseline do experimento A:
# Baseline: entropy tree without size limits (as in Experiment A), here with a
# fixed random_state, scored with the shared cross-validation splitter.
clf = DecisionTreeClassifier(criterion="entropy", random_state=31)
score = cross_val_score(estimator=clf, X=X, y=y, cv=cv)
print("Accuracy of " + str(score.mean()))
Accuracy of 0.6922077922077922
# Compute the cost-complexity pruning path on the training partition.
path = clf.cost_complexity_pruning_path(X_train, y_train)
path
ccp_alphas = path.ccp_alphas
impurities = path.impurities

# Plot total leaf impurity against the effective alpha.
plt.figure(figsize=(10, 6))
plt.plot(ccp_alphas, impurities)
plt.xlabel("effective alpha")
plt.ylabel("total impurity of leaves")
# Fit one tree on the training partition for every alpha on the pruning path.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(
        ccp_alpha=ccp_alpha,
        criterion="entropy",
        random_state=31,
    )
    clf.fit(X_train, y_train)
    clfs.append(clf)

# Plot tree depth against alpha; the final alpha/tree pair is left off the plot.
tree_depths = [fitted.tree_.max_depth for fitted in clfs]
plt.figure(figsize=(10, 6))
plt.plot(ccp_alphas[:-1], tree_depths[:-1])
plt.xlabel("effective alpha")
plt.ylabel("total depth")
# Accuracy of every pruned tree on the held-out test partition.
acc_scores = [
    accuracy_score(y_test, model.predict(X_test))
    for model in clfs
]
# Depths are recomputed here; the accuracy plot below does not use them.
tree_depths = [model.tree_.max_depth for model in clfs]

# Plot test accuracy against alpha; the final alpha/tree pair is left off the plot.
plt.figure(figsize=(10, 6))
plt.grid()
plt.plot(ccp_alphas[:-1], acc_scores[:-1])
plt.xlabel("effective alpha")
plt.ylabel("Test Accuracy scores")
# Cross-validate an entropy tree pruned with ccp_alpha=0.012.
clf = DecisionTreeClassifier(ccp_alpha=0.012, criterion="entropy")
score = cross_val_score(estimator=clf, X=X, y=y, cv=cv)
print("Accuracy:", score.mean())
Accuracy: 0.7402597402597403
Experimento D
# Tree with the chosen hyperparameters (max_depth=2, min_samples_leaf=20).
clf = DecisionTreeClassifier(
    criterion="entropy",
    min_samples_leaf=20,
    max_depth=2,
)
score = cross_val_score(estimator=clf, X=X, y=y, cv=cv)
print("Accuracy:", score.mean())
Accuracy: 0.7532467532467533
# Refit the chosen tree on the training part of every CV split; each fit
# replaces the previous one, so the tree rendered below comes from the last split.
# cv.split yields positional row indices, hence .iloc rather than label-based .loc.
# NOTE(review): indentation was lost in this export; the loop body is assumed
# to be the fit call only -- confirm against the notebook.
for train_index, test_index in cv.split(X, y):
    clf.fit(X.iloc[train_index], y.iloc[train_index])

# Export the fitted tree to Graphviz DOT and display it as the cell output.
dot_data = export_graphviz(
    clf, filled=True, rounded=True, max_depth=3, out_file=None,
    feature_names=["Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
                   "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"],
    class_names=['0', '1'],
)
graph = graphviz.Source(dot_data)
graph
Criando 10 árvores aleatórias com o método holdout
# Build 10 trees with plain (non-stratified) hold-out: every iteration draws a
# new random 80/20 split, fits the chosen tree, and reports its test accuracy
# and confusion matrix.
for i in range(10):
    # random_state=None gives a different split on every iteration.
    # (random.seed() returns None, so it cannot be used to supply a seed here.)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=None)
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=2, min_samples_leaf=20)
    # Train the decision tree classifier.
    clf = clf.fit(X_train, y_train)
    # Predict the response for the test partition.
    y_pred = clf.predict(X_test)
    print("Tree Number", i+1)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    # ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
    # which is deprecated since scikit-learn 1.0 and removed in 1.2.
    metrics.ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
    plt.show()
    print("-------------------------")
Tree Number 1
Accuracy: 0.7662337662337663
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
-------------------------
Tree Number 2
Accuracy: 0.7597402597402597
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
-------------------------
Tree Number 3
Accuracy: 0.6493506493506493
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
-------------------------
Tree Number 4
Accuracy: 0.7532467532467533
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
-------------------------
Tree Number 5
Accuracy: 0.7467532467532467
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
-------------------------
Tree Number 6
Accuracy: 0.7597402597402597
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
-------------------------
Tree Number 7
Accuracy: 0.7077922077922078
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
-------------------------
Tree Number 8
Accuracy: 0.6948051948051948
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
-------------------------
Tree Number 9
Accuracy: 0.7532467532467533
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
-------------------------
Tree Number 10
Accuracy: 0.7402597402597403
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
-------------------------
Criando 10 árvores aleatórias com o método holdout estratificado
# Build 10 trees with stratified hold-out: every iteration draws a new
# stratified 80/20 split, fits the chosen tree, and reports its test accuracy,
# confusion matrix and Graphviz rendering.
for i in range(10):
    # random_state=None gives a different split on every iteration.
    # (random.seed() returns None, so it cannot be used to supply a seed here.)
    cv = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=None)
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=2, min_samples_leaf=20)

    # n_splits=1, so the splitter yields exactly one (train, test) index pair.
    # The indices are positional, hence .iloc rather than label-based .loc.
    train_index, test_index = next(cv.split(X, y))

    # Take X_test and y_test from the same split, so the confusion matrix below
    # is computed on matching rows rather than on a stale X_test left over
    # from an earlier cell.
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]

    clf.fit(X.iloc[train_index], y.iloc[train_index])
    y_pred = clf.predict(X_test)

    print("Tree Number", i+1)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    # ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix,
    # which is deprecated since scikit-learn 1.0 and removed in 1.2.
    metrics.ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test)
    #predicted_probas = clf.predict_proba(X_test)
    #skplt.metrics.plot_roc(y_test, predicted_probas)
    plt.show()

    # Render this iteration's tree inline.
    dot_data = export_graphviz(
        clf, filled=True, rounded=True, max_depth=3, out_file=None,
        feature_names=["Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
                       "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"],
        class_names=['0', '1'],
    )
    graph = graphviz.Source(dot_data)
    display(graph)
Tree Number 1
Accuracy: 0.7272727272727273
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
Tree Number 2
Accuracy: 0.7597402597402597
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
Tree Number 3
Accuracy: 0.7597402597402597
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
Tree Number 4
Accuracy: 0.7532467532467533
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
Tree Number 5
Accuracy: 0.7402597402597403
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
Tree Number 6
Accuracy: 0.7142857142857143
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
Tree Number 7
Accuracy: 0.7597402597402597
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
Tree Number 8
Accuracy: 0.7597402597402597
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
Tree Number 9
Accuracy: 0.7337662337662337
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)
Tree Number 10
Accuracy: 0.7532467532467533
/shared-libs/python3.7/py/lib/python3.7/site-packages/sklearn/utils/deprecation.py:87: FutureWarning: Function plot_confusion_matrix is deprecated; Function `plot_confusion_matrix` is deprecated in 1.0 and will be removed in 1.2. Use one of the class methods: ConfusionMatrixDisplay.from_predictions or ConfusionMatrixDisplay.from_estimator.
warnings.warn(msg, category=FutureWarning)