Decision Trees

import pandas as pd from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier from sklearn import metrics from matplotlib import pyplot as plt from sklearn import tree from mlxtend.plotting import plot_decision_regions from sklearn.model_selection import StratifiedShuffleSplit import numpy as np import random from pydotplus import graph_from_dot_data from sklearn.tree import export_graphviz from sklearn.metrics import accuracy_score from sklearn.metrics import plot_confusion_matrix import matplotlib.pyplot as plt import scikitplot as skplt from sklearn.model_selection import cross_val_score from sklearn.model_selection import train_test_split import graphviz from IPython.display import display

Carregando o dataset

# Read 'diabetes.csv' (path relative to the working directory) into a
# DataFrame, then preview its first rows.
data = pd.read_csv('diabetes.csv')
data.head()

# Summarize the columns of `data` with DataFrame.describe().
data.describe()

# Feature matrix: every column of `data` except the last one.
X = data.iloc[:, :-1]
X

# Target vector: the last column of `data`.
y = data.iloc[:, -1]

Utilizando Cross-Validation Estratificado do Sklearn

# Stratified shuffle-split cross-validator used by the experiments below:
# 10 random train/test partitions, each holding out 20% of the rows.
#
# This used to pass `random_state=random.seed()`. `random.seed()` returns
# None, so that was only an obscure spelling of `random_state=None`, i.e. a
# fresh set of random splits on every call to `cv.split`. The value is now
# written out explicitly. Pass an integer instead to make the splits
# reproducible across calls and runs.
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=None)

Experimento A

# Experiment A: entropy-criterion decision tree with every other setting left
# at its default, scored on the splits produced by `cv`. The mean of the
# per-split scores is printed.
clf = DecisionTreeClassifier(criterion="entropy")
scores = cross_val_score(estimator=clf, X=X, y=y, cv=cv)
print("Accuracy:", scores.mean())

# Fit the Experiment A tree on the training rows of one split so that it can
# be drawn afterwards.
#
# `cv.split` yields positional row indices, so rows are selected with `.iloc`.
# `.loc` selects by index label and only returned the same rows while the
# frame kept its default 0..n-1 index.
#
# The original loop refitted `clf` on every split and kept only the last fit;
# the last split is now taken directly and the tree is fitted once.
train_index, test_index = list(cv.split(X, y))[-1]
clf.fit(X.iloc[train_index], y.iloc[train_index])

# Draw the fitted tree held in `clf` on a 10x10-inch figure, labelling the
# split features with the eight predictor names.
fig, ax = plt.subplots(figsize=(10, 10))
tree.plot_tree(
    clf,
    fontsize=2,
    feature_names=[
        "Pregnancies",
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
        "DiabetesPedigreeFunction",
        "Age",
    ],
)
plt.show()

# Export the fitted tree (representation limited to depth 3) as Graphviz DOT
# text, then render that text to a PNG file.
# NOTE(review): the output path is absolute and machine-specific.
dot_data = export_graphviz(
    clf,
    out_file=None,
    max_depth=3,
    feature_names=[
        "Pregnancies",
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
        "DiabetesPedigreeFunction",
        "Age",
    ],
    class_names=['0', '1'],
    filled=True,
    rounded=True,
)
graph = graph_from_dot_data(dot_data)
graph.write_png('/work/masters-degree-class-assignments/AM/decision-trees/ExperimentA-plot-tree.png')

# Disabled alternative kept from the notebook:
# testX = np.array(data.iloc[:, [1,7]])  -> takes columns 1 and 7
# testY = np.array(data.iloc[:, -1])     -> takes the last column

Experimento B

# Experiment B, max_depth sweep: for every depth from 1 to 30, score an
# entropy tree (fixed random_state=31) on the splits produced by `cv` and
# print the mean score.
for x in range(1, 31):
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=x, random_state=31)
    score = cross_val_score(estimator=clf, X=X, y=y, cv=cv)
    print("Accuracy of max_deph=" + str(x) + " | " + str(score.mean()))

# Experiment B, min_samples_leaf sweep: for every value from 5 to 200 in steps
# of 5, score an entropy tree on the splits produced by `cv` and print the
# mean score.
#
# The tree used to be built with `random_state=random.seed()`. `random.seed()`
# returns None, so the tree was in fact unseeded. That is now written
# explicitly as `random_state=None`. Use an integer (the max_depth sweep uses
# 31) if the runs should be repeatable.
for x in range(5, 201, 5):
    clf = DecisionTreeClassifier(
        random_state=None,
        criterion="entropy",
        min_samples_leaf=x,
    )
    score = cross_val_score(clf, X, y, cv=cv)
    print("Accuracy of min_samples_leaf=" + str(x) + " | " + str(score.mean()))

# Fit an entropy tree with min_samples_leaf=50 on the training rows of one
# split so that it can be drawn afterwards.
#
# `cv.split` yields positional row indices, so rows are selected with `.iloc`.
# `.loc` selects by index label and only matched while the frame kept its
# default 0..n-1 index.
#
# The original loop refitted the tree on every split and kept only the last
# fit; the last split is now taken directly and the tree is fitted once.
clf = DecisionTreeClassifier(random_state=1, criterion="entropy", min_samples_leaf=50)
train_index, test_index = list(cv.split(X, y))[-1]
clf.fit(X.iloc[train_index], y.iloc[train_index])

# Draw the fitted tree held in `clf` on a 10x10-inch figure, labelling the
# split features with the eight predictor names.
fig, ax = plt.subplots(figsize=(10, 10))
tree.plot_tree(
    clf,
    fontsize=4,
    feature_names=[
        "Pregnancies",
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
        "DiabetesPedigreeFunction",
        "Age",
    ],
)
plt.show()

# Fit an entropy tree with min_samples_leaf=5 on the training rows of one
# split so that it can be drawn afterwards.
#
# `cv.split` yields positional row indices, so rows are selected with `.iloc`.
# `.loc` selects by index label and only matched while the frame kept its
# default 0..n-1 index.
#
# The original loop refitted the tree on every split and kept only the last
# fit; the last split is now taken directly and the tree is fitted once.
clf = DecisionTreeClassifier(random_state=1, criterion="entropy", min_samples_leaf=5)
train_index, test_index = list(cv.split(X, y))[-1]
clf.fit(X.iloc[train_index], y.iloc[train_index])

# Draw the fitted tree held in `clf` on a 10x10-inch figure, labelling the
# split features with the eight predictor names.
fig, ax = plt.subplots(figsize=(10, 10))
tree.plot_tree(
    clf,
    fontsize=4,
    feature_names=[
        "Pregnancies",
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
        "DiabetesPedigreeFunction",
        "Age",
    ],
)
plt.show()

# Performance with both hyperparameters combined: min_samples_leaf=20 and
# max_depth=2, scored on the splits produced by `cv`.
clf = DecisionTreeClassifier(
    criterion="entropy",
    max_depth=2,
    min_samples_leaf=20,
    random_state=31,
)
score = cross_val_score(estimator=clf, X=X, y=y, cv=cv)
print("Accuracy of " + str(score.mean()))

# Materialize one train/test partition from `cv` as independent copies.
#
# Changes from the original cell:
# - The empty-DataFrame placeholders were dropped: they were always
#   overwritten, and `y_train`/`y_test` end up as slices of `y`, not frames.
# - `cv.split` yields positional row indices, so rows are selected with
#   `.iloc`. `.loc` selects by label and only matched while the data kept its
#   default 0..n-1 index.
# - The original loop copied every split and kept only the last one; the last
#   split is now taken directly.
train_index, test_index = list(cv.split(X, y))[-1]
X_train = X.iloc[train_index].copy()
y_train = y.iloc[train_index].copy()
X_test = X.iloc[test_index].copy()
y_test = y.iloc[test_index].copy()

Baseline do experimento A:

# Baseline: entropy tree with a fixed random_state and no other constraints,
# scored on the splits produced by `cv`.
clf = DecisionTreeClassifier(criterion="entropy", random_state=31)
score = cross_val_score(estimator=clf, X=X, y=y, cv=cv)
print("Accuracy of " + str(score.mean()))

# Compute the cost-complexity pruning path of `clf` on the training split.
# The result exposes `ccp_alphas` and `impurities`, which are read below.
path = clf.cost_complexity_pruning_path(X_train, y_train)
path

# Plot the total leaf impurity against the effective alpha along the pruning
# path.
ccp_alphas = path.ccp_alphas
impurities = path.impurities
plt.figure(figsize=(10, 6))
plt.plot(ccp_alphas, impurities)
plt.xlabel("effective alpha")
plt.ylabel("total impurity of leaves")

# Train one entropy tree per alpha on the training split and collect the
# fitted trees in `clfs`, in the same order as `ccp_alphas`.
clfs = []
for ccp_alpha in ccp_alphas:
    clf = DecisionTreeClassifier(
        criterion="entropy",
        ccp_alpha=ccp_alpha,
        random_state=31,
    )
    # fit() returns the estimator itself, so the fitted `clf` is appended.
    clfs.append(clf.fit(X_train, y_train))

# Plot the depth of each pruned tree against its alpha.
# The last (largest) alpha is left out of the plot.
# NOTE(review): presumably because that alpha prunes the tree down to its
# root - confirm.
tree_depths = [fitted.tree_.max_depth for fitted in clfs]
plt.figure(figsize=(10, 6))
plt.plot(ccp_alphas[:-1], tree_depths[:-1])
plt.xlabel("effective alpha")
plt.ylabel("total depth")

# Score every pruned tree on the held-out test split and plot test accuracy
# against alpha. The last alpha is left out of the plot.
acc_scores = [accuracy_score(y_test, fitted.predict(X_test)) for fitted in clfs]
# Depths are recomputed here but are not used by this plot.
tree_depths = [fitted.tree_.max_depth for fitted in clfs]
plt.figure(figsize=(10, 6))
plt.grid()
plt.plot(ccp_alphas[:-1], acc_scores[:-1])
plt.xlabel("effective alpha")
plt.ylabel("Test Accuracy scores")

# Score an entropy tree pruned with ccp_alpha=0.012 on the splits produced by
# `cv`, and print the mean score.
clf = DecisionTreeClassifier(ccp_alpha=0.012, criterion="entropy")
score = cross_val_score(estimator=clf, X=X, y=y, cv=cv)
print("Accuracy:", score.mean())

Experimento D

# Tree with the chosen hyperparameters (max_depth=2, min_samples_leaf=20),
# scored on the splits produced by `cv`.
clf = DecisionTreeClassifier(
    criterion="entropy",
    min_samples_leaf=20,
    max_depth=2,
)
score = cross_val_score(estimator=clf, X=X, y=y, cv=cv)
print("Accuracy:", score.mean())

# Fit the Experiment D tree on the training rows of one split so that it can
# be rendered afterwards.
#
# `cv.split` yields positional row indices, so rows are selected with `.iloc`.
# `.loc` selects by index label and only matched while the frame kept its
# default 0..n-1 index.
#
# The original loop refitted `clf` on every split and kept only the last fit;
# the last split is now taken directly and the tree is fitted once.
train_index, test_index = list(cv.split(X, y))[-1]
clf.fit(X.iloc[train_index], y.iloc[train_index])

# Export the fitted tree (representation limited to depth 3) as Graphviz DOT
# text and wrap it in a graphviz.Source object, which is the cell's value.
dot_data = export_graphviz(
    clf,
    out_file=None,
    max_depth=3,
    feature_names=[
        "Pregnancies",
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
        "DiabetesPedigreeFunction",
        "Age",
    ],
    class_names=['0', '1'],
    filled=True,
    rounded=True,
)
graph = graphviz.Source(dot_data)
graph

Criando 10 árvores aleatórias com o método holdout

# Plain (non-stratified) hold-out repeated 10 times. Each round draws a new
# random 80/20 split, trains a tree with the chosen hyperparameters, and
# reports its test accuracy and confusion matrix.
#
# The split used to be requested with `random_state=random.seed()`.
# `random.seed()` returns None, so the split was simply unseeded. That is now
# written explicitly as `random_state=None`, which keeps every round
# different.
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.20, random_state=None
    )
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=2, min_samples_leaf=20)

    # Train the decision tree classifier.
    clf.fit(X_train, y_train)

    # Predict the response for the test rows.
    y_pred = clf.predict(X_test)

    print("Tree Number", i+1)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    plot_confusion_matrix(clf, X_test, y_test)
    plt.show()
    print("-------------------------")

Criando 10 árvores aleatórias com o método holdout estratificado

# Stratified hold-out repeated 10 times. Each round draws one stratified
# 80/20 split, trains a tree with the chosen hyperparameters, reports its test
# accuracy and confusion matrix, and displays the tree.
#
# Fixes relative to the original cell:
# - The confusion matrix was plotted with an `X_test` that this cell never
#   assigned (a leftover from an earlier cell), paired with this round's
#   `y_test`. `X_test` is now taken from the same split as `y_test`.
# - `cv.split` yields positional row indices, so rows are selected with
#   `.iloc` instead of the label-based `.loc`.
# - `random_state=random.seed()` always evaluated to None, because
#   `random.seed()` returns None. It is now written explicitly.
#
# Note that this rebinds the module-level `cv` to a single-split splitter, as
# the original cell did.
for i in range(10):
    cv = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=None)
    clf = DecisionTreeClassifier(criterion="entropy", max_depth=2, min_samples_leaf=20)

    # n_splits=1, so the splitter yields exactly one (train, test) pair.
    train_index, test_index = next(cv.split(X, y))
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]

    clf.fit(X.iloc[train_index], y.iloc[train_index])
    y_pred = clf.predict(X_test)

    print("Tree Number", i+1)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    plot_confusion_matrix(clf, X_test, y_test)
    # Disabled ROC plot kept from the notebook:
    # predicted_probas = clf.predict_proba(X_test)
    # skplt.metrics.plot_roc(y_test, predicted_probas)
    plt.show()

    dot_data = export_graphviz(
        clf,
        filled=True,
        rounded=True,
        max_depth=3,
        out_file=None,
        feature_names=[
            "Pregnancies",
            "Glucose",
            "BloodPressure",
            "SkinThickness",
            "Insulin",
            "BMI",
            "DiabetesPedigreeFunction",
            "Age",
        ],
        class_names=['0', '1'],
    )
    graph = graphviz.Source(dot_data)
    display(graph)