import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn import metrics
from matplotlib import pyplot as plt
from sklearn import tree
from mlxtend.plotting import plot_decision_regions
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
import random
from pydotplus import graph_from_dot_data
from sklearn.tree import export_graphviz
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import graphviz
from IPython.display import display
# Loading the dataset
# Read the diabetes CSV from the working directory.
data = pd.read_csv('diabetes.csv')

# Notebook-style previews: the results are discarded unless an interactive
# front end echoes them.
data.head()
data.describe()

# The last column is the target; every column before it is a feature.
y = data.iloc[:, -1]
X = data.iloc[:, :-1]
X
# Using scikit-learn's stratified cross-validation
# Ten stratified random splits with 20% of the rows held out in each; this
# splitter is reused by the experiments that follow.
# `random.seed()` returns None, so the previous `random_state=random.seed()`
# was already equivalent to random_state=None, with the extra side effect of
# reseeding Python's global RNG. The None is now spelled out. Pass an integer
# here instead if the splits must be reproducible across `cv.split` calls.
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=None)
# Experiment A
# Unconstrained entropy tree, scored with the shared splitter.
clf = DecisionTreeClassifier(criterion="entropy")
scores = cross_val_score(clf, X, y, cv=cv)
print("Accuracy:", scores.mean())

# Fit a single tree for visualisation. The previous loop refit the same
# estimator on every split and kept only the last fit, so one split is enough.
# `split` yields positional indices, hence `.iloc` rather than `.loc`.
train_index, test_index = next(cv.split(X, y))
clf.fit(X.iloc[train_index], y.iloc[train_index])

# Labels shared by both renderings of the tree.
feature_names = [
    "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
    "Insulin", "BMI", "DiabetesPedigreeFunction", "Age",
]

# Full tree drawn with matplotlib.
fig, ax = plt.subplots(figsize=(10, 10))
tree.plot_tree(clf, fontsize=2, feature_names=feature_names)
plt.show()

# Top three levels exported through Graphviz and saved as a PNG.
# NOTE(review): hard-coded absolute output path; confirm it exists on the
# machine running this script.
dot_data = export_graphviz(
    clf,
    filled=True,
    rounded=True,
    max_depth=3,
    out_file=None,
    feature_names=feature_names,
    class_names=['0', '1'],
)
graph = graph_from_dot_data(dot_data)
graph.write_png('/work/masters-degree-class-assignments/AM/decision-trees/ExperimentA-plot-tree.png')

# Unused alternative kept from the original notebook: take columns 1 and 7 as
# features and the last column as the target.
# testX = np.array(data.iloc[:, [1, 7]])
# testY = np.array(data.iloc[:, -1])
# Experiment B
# Sweep max_depth over every value from 1 to 30 and report the mean
# cross-validated accuracy for each depth.
for depth in range(1, 31):
    clf = DecisionTreeClassifier(random_state=31, criterion="entropy", max_depth=depth)
    score = cross_val_score(clf, X, y, cv=cv)
    print(f"Accuracy of max_deph={depth} | {score.mean()}")

# Sweep min_samples_leaf from 5 to 200 in steps of 5.
# `random.seed()` returns None, so the estimator was already unseeded here;
# random_state=None states that directly and avoids reseeding the global RNG
# on every iteration.
for leaf_size in range(5, 201, 5):
    clf = DecisionTreeClassifier(random_state=None, criterion="entropy", min_samples_leaf=leaf_size)
    score = cross_val_score(clf, X, y, cv=cv)
    print(f"Accuracy of min_samples_leaf={leaf_size} | {score.mean()}")
# Draw one fitted tree for each of two leaf-size settings (50, then 5) so the
# resulting structures can be compared.
feature_names = [
    "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
    "Insulin", "BMI", "DiabetesPedigreeFunction", "Age",
]

for leaf_size in (50, 5):
    clf = DecisionTreeClassifier(random_state=1, criterion="entropy", min_samples_leaf=leaf_size)

    # One split is enough: the previous code refit on all ten splits and kept
    # only the last fit. `split` yields positional indices, hence `.iloc`.
    train_index, test_index = next(cv.split(X, y))
    clf.fit(X.iloc[train_index], y.iloc[train_index])

    fig, ax = plt.subplots(figsize=(10, 10))
    tree.plot_tree(clf, fontsize=4, feature_names=feature_names)
    plt.show()
# Cross-validated accuracy with min_samples_leaf and max_depth set together.
clf = DecisionTreeClassifier(
    criterion="entropy",
    random_state=31,
    max_depth=2,
    min_samples_leaf=20,
)
score = cross_val_score(clf, X, y, cv=cv)
print(f"Accuracy of {score.mean()!s}")
# Keep one stratified train/test split as standalone copies.
# The previous version created empty DataFrame placeholders and then overwrote
# the same four names on every one of the ten splits, so only a single split
# was ever retained. Drawing one split directly gives the same outcome.
# `split` yields positional indices, hence `.iloc` rather than `.loc`.
train_index, test_index = next(cv.split(X, y))
X_train = X.iloc[train_index].copy()
y_train = y.iloc[train_index].copy()
X_test = X.iloc[test_index].copy()
y_test = y.iloc[test_index].copy()
# Baseline from Experiment A:
# Reference score: entropy tree with no depth, leaf-size or pruning limits.
clf = DecisionTreeClassifier(criterion="entropy", random_state=31)
score = cross_val_score(clf, X, y, cv=cv)
print(f"Accuracy of {score.mean()!s}")

# Cost-complexity pruning path computed on the retained training split.
path = clf.cost_complexity_pruning_path(X_train, y_train)
path
ccp_alphas = path.ccp_alphas
impurities = path.impurities

# Total leaf impurity as a function of the effective alpha.
plt.figure(figsize=(10, 6))
plt.plot(ccp_alphas, impurities)
plt.xlabel("effective alpha")
plt.ylabel("total impurity of leaves")
# Fit one entropy tree per candidate alpha on the retained training split.
clfs = [
    DecisionTreeClassifier(
        random_state=31, criterion="entropy", ccp_alpha=alpha
    ).fit(X_train, y_train)
    for alpha in ccp_alphas
]

# Tree depth versus alpha; the entry for the last alpha is left out of the plot.
tree_depths = [fitted.tree_.max_depth for fitted in clfs]
plt.figure(figsize=(10, 6))
plt.plot(ccp_alphas[:-1], tree_depths[:-1])
plt.xlabel("effective alpha")
plt.ylabel("total depth")

# Held-out accuracy versus alpha, again without the last alpha.
acc_scores = [accuracy_score(y_test, fitted.predict(X_test)) for fitted in clfs]
plt.figure(figsize=(10, 6))
plt.grid()
plt.plot(ccp_alphas[:-1], acc_scores[:-1])
plt.xlabel("effective alpha")
plt.ylabel("Test Accuracy scores")

# Cross-validate a tree pruned with a fixed alpha.
# NOTE(review): 0.012 was presumably read off the accuracy plot above; confirm.
clf = DecisionTreeClassifier(criterion="entropy", ccp_alpha=0.012)
score = cross_val_score(clf, X, y, cv=cv)
print("Accuracy:", score.mean())
# Experiment D
# Tree built with the chosen hyperparameters.
clf = DecisionTreeClassifier(criterion="entropy", max_depth=2, min_samples_leaf=20)
score = cross_val_score(clf, X, y, cv=cv)
print("Accuracy:", score.mean())

# Fit once for visualisation. The previous loop refit on all ten splits and
# kept only the last fit. `split` yields positional indices, hence `.iloc`.
train_index, test_index = next(cv.split(X, y))
clf.fit(X.iloc[train_index], y.iloc[train_index])

# Graphviz rendering of the fitted tree, limited to three levels.
dot_data = export_graphviz(
    clf,
    filled=True,
    rounded=True,
    max_depth=3,
    out_file=None,
    feature_names=[
        "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
        "Insulin", "BMI", "DiabetesPedigreeFunction", "Age",
    ],
    class_names=['0', '1'],
)
graph = graphviz.Source(dot_data)
# Bare expression: only rendered when an interactive front end echoes it.
graph
# Creating 10 random trees with the holdout method
# Train and evaluate ten trees, each on a fresh random 80/20 holdout split.
for i in range(10):
    # `random.seed()` returns None, so the split was already unseeded;
    # random_state=None states that directly and avoids reseeding Python's
    # global RNG on every iteration.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=None
    )
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=2, min_samples_leaf=20)

    # Train the classifier; `fit` mutates the estimator in place, so no
    # reassignment is needed.
    clf.fit(X_train, y_train)

    # Predict the held-out rows and report accuracy plus the confusion matrix.
    y_pred = clf.predict(X_test)
    print("Tree Number", i + 1)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    plot_confusion_matrix(clf, X_test, y_test)
    plt.show()
    print("-------------------------")
# Creating 10 random trees with the stratified holdout method
# Train and evaluate ten trees, each on a fresh stratified 80/20 holdout split,
# and render every fitted tree with Graphviz.
feature_names = [
    "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
    "Insulin", "BMI", "DiabetesPedigreeFunction", "Age",
]

for i in range(10):
    # A single-split stratified splitter, unseeded so each iteration differs.
    # (`random.seed()` returns None, so random_state was already None.)
    cv = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=None)
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=2, min_samples_leaf=20)

    # `split` yields positional indices, hence `.iloc` rather than `.loc`.
    train_index, test_index = next(cv.split(X, y))
    clf.fit(X.iloc[train_index], y.iloc[train_index])

    # Bug fix: X_test must come from this iteration's split. The previous code
    # passed a stale X_test, left over from an earlier section, to the
    # confusion-matrix plot alongside the new y_test.
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]
    y_pred = clf.predict(X_test)

    print("Tree Number", i + 1)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    plot_confusion_matrix(clf, X_test, y_test)
    # Disabled ROC plot kept from the original notebook:
    # predicted_probas = clf.predict_proba(X_test)
    # skplt.metrics.plot_roc(y_test, predicted_probas)
    plt.show()

    dot_data = export_graphviz(
        clf,
        filled=True,
        rounded=True,
        max_depth=3,
        out_file=None,
        feature_names=feature_names,
        class_names=['0', '1'],
    )
    graph = graphviz.Source(dot_data)
    display(graph)