#%config Completer.use_jedi = False
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, make_scorer, confusion_matrix, fbeta_score
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import ClassifierMixin, BaseEstimator
from dtreeviz.trees import dtreeviz
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
import seaborn as sns
# set the paper context, font and scale in one call (sns.set() would reset the context to 'notebook')
sns.set_theme(context='paper', font='sans-serif', font_scale=1.4)
plt.rc('legend', fontsize=12)
import pickle
data = pd.read_pickle('wdbc.pkl')
data
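# A quick structural check of the loaded frame (a sketch, not part of the original notebook):
# number of rows/columns and the available column names.
print(data.shape)
print(list(data.columns))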
target = 'malignant'
X = data.drop(columns=[target, 'id'])
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                     random_state=42, shuffle=True)
print(f'size of X_train = {len(X_train)}, malignant = {100*sum(y_train==1)/len(y_train):.2f}%')
print(f'size of X_test = {len(X_test)}, malignant = {100*sum(y_test==1)/len(y_test):.2f}%')
dummy_clf = DummyClassifier()
cv_scores = cross_val_score(dummy_clf, X_train, y_train, cv=5,
                            # scoring=make_scorer(fbeta_score, beta=2)
                            )
mean_dummy, std = np.mean(cv_scores), np.std(cv_scores)
print(f'accuracy {mean_dummy: .3f} +- {std: .4f}')
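# Sanity check (a minimal sketch, not part of the original analysis): the default DummyClassifier
# predicts the most frequent class, so its cross-validated accuracy should match the benign
# fraction of the training labels shown here.
print(y_train.value_counts(normalize=True))
print(f'majority-class accuracy ~ {max(y_train.mean(), 1 - y_train.mean()):.3f}')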
def plot_features(feature='radius'):
    fig, axs = plt.subplots(1, 3, figsize=(16, 4))
    for i in range(3):
        ax = axs[i]
        ax.hist(X_train[f'{feature}_{i}'][y_train==0], alpha=0.5, label='benign', density=True)
        ax.hist(X_train[f'{feature}_{i}'][y_train==1], alpha=0.5, label='malignant', density=True)
        ax.set(title=f'{feature}_{i}', xlabel=f'{feature}_{i}')
        ax.legend()
    plt.show()
cell_size_features = ['radius', 'area']
for feature in cell_size_features:
    plot_features(feature)
cell_shape_features = ['smoothness', 'compactness', 'concavity', 'concave points', 'symmetry', 'fractal dimension']
for feature in cell_shape_features:
    plot_features(feature)
plot_features('texture')
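# A minimal sketch (not part of the original analysis) of how a cutoff can be read off these
# histograms: take a high quantile of the benign distribution for a well-separated feature, so
# that almost all benign samples fall below the candidate threshold. 'radius_2' is used here
# purely as an illustrative choice; the rule-based classifier below generalises this idea.
candidate = X_train['radius_2'][y_train == 0].quantile(0.95)
print(f'95th percentile of radius_2 among benign samples: {candidate:.2f}')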
# initial quantile level for the threshold (the value of a given column below which 95% of benign samples fall)
threshold = 0.95
class RuleBasedClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.size_features = ['radius_2', 'area_2']
        self.shape_features = ['concave points_0', 'concave points_2', 'concavity_0']
        self.texture_features = ['texture_2']
        self.similarity_features = ['radius_1', 'area_1']

    def fit(self, X, y):
        self.X = X
        self.y = y
        # thresholds found manually by inspecting the histograms above
        self.size_thresholds = [17, 900]
        self.shape_thresholds = [0.07, 0.18, 0.18]
        self.texture_thresholds = [50]
        self.similarity_thresholds = [0.8, 55]
        return self

    def get_threshold(self, col_name):
        # `threshold` is the module-level quantile level set outside the class
        return self.X[self.y == 0][col_name].quantile(threshold)

    def predict(self, X):
        return self.predict_with_calculated_threshold(X)
        # return self.predict_manual(X)

    def predict_manual(self, X):
        preds = np.zeros(len(X), dtype=bool)
        # flag if size is abnormal
        for i, size_feature in enumerate(self.size_features):
            preds = (X[size_feature] >= self.size_thresholds[i]) | preds
        # flag if shape is abnormal
        for i, shape_feature in enumerate(self.shape_features):
            preds = (X[shape_feature] >= self.shape_thresholds[i]) | preds
        # flag if texture is abnormal
        for i, texture_feature in enumerate(self.texture_features):
            preds = (X[texture_feature] >= self.texture_thresholds[i]) | preds
        # flag if similarity is abnormal
        for i, similarity_feature in enumerate(self.similarity_features):
            preds = (X[similarity_feature] >= self.similarity_thresholds[i]) | preds
        return preds

    def predict_with_calculated_threshold(self, X):
        preds = np.zeros(len(X), dtype=bool)
        # flag if size is abnormal
        for size_feature in self.size_features:
            preds = (X[size_feature] >= self.get_threshold(size_feature)) | preds
        # flag if shape is abnormal
        for shape_feature in self.shape_features:
            preds = (X[shape_feature] >= self.get_threshold(shape_feature)) | preds
        # flag if texture is abnormal
        for texture_feature in self.texture_features:
            preds = (X[texture_feature] >= self.get_threshold(texture_feature)) | preds
        # flag if similarity is abnormal
        for similarity_feature in self.similarity_features:
            preds = (X[similarity_feature] >= self.get_threshold(similarity_feature)) | preds
        return preds
for threshold in np.linspace(0.95, 1, 20):
    clf = RuleBasedClassifier()
    cv_scores = cross_val_score(clf, X_train, y_train, cv=2, scoring=make_scorer(fbeta_score, beta=2))
    mean, std = np.mean(cv_scores), np.std(cv_scores)
    print(f'{threshold:.5f}: f2 score{mean: .3f} +- {std: .4f}')
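# Why beta=2: F-beta is the weighted harmonic mean of precision and recall,
#   F_beta = (1 + beta^2) * P * R / (beta^2 * P + R),
# so beta=2 weights recall (catching malignant tumours) roughly four times as heavily as
# precision. A tiny illustrative check on made-up labels (a sketch, not from the original):
print(fbeta_score([1, 1, 1, 0], [1, 0, 0, 0], beta=2))  # P=1.0, R=1/3 -> F2 = 5/13 ~ 0.385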
# we choose threshold = 0.997
threshold = 0.997
clf = RuleBasedClassifier()
clf.fit(X_train, y_train)
preds1 = clf.predict(X_test)
print(f'f2 score = {fbeta_score(y_test, preds1, beta=2):.3f}')
print("Model evaluation metrics:")
print(classification_report(list(y_test),preds1, target_names = ['Benign','Malignant'], digits = 3))
threshold = 0.997
fig, ax = plt.subplots(figsize=(4, 4))
clf = RuleBasedClassifier()
clf.fit(X_train, y_train)
preds1 = clf.predict(X_test)
ConfusionMatrixDisplay.from_predictions(list(y_test),preds1, normalize = 'true', display_labels = ['Benign','Malignant'],
values_format = ".3f", ax=ax, cmap = plt.get_cmap('Blues'), colorbar=False)
plt.grid(None)
plt.show()
def get_parameter_values(search):
    """Extract per-candidate parameter values and mean test scores from a fitted search object."""
    scores = search.cv_results_['mean_test_score']
    param_vals_list = search.cv_results_['params']
    titles = list(param_vals_list[0].keys())
    n_params = len(titles)
    P = [[] for _ in range(n_params)]
    for param_vals in param_vals_list:
        for i, p_val in enumerate(param_vals.values()):
            P[i].append(p_val)
    P = np.array(P)
    return P, titles, scores
rf = RandomForestClassifier()
n=25
est_dist = np.unique([int(i) for i in np.logspace(0.4, 2.7, n)])
depth_dist = np.unique([int(i) for i in np.logspace(0.4, 1.8, n)])
distributions = {'n_estimators': est_dist,
                 'max_depth': depth_dist}
clf = GridSearchCV(rf, distributions, cv=5, n_jobs=-1, scoring=make_scorer(fbeta_score, beta=2))
search = clf.fit(X_train, y_train)
P, titles, scores = get_parameter_values(search)
scores = scores.reshape(len(depth_dist), len(est_dist))
fig, ax = plt.subplots(1,1, figsize=(8,4))
cs = ax.contourf(est_dist, depth_dist, scores)
cbar = fig.colorbar(cs)
cbar.set_label('F2 score')
ax.set(xlabel='n estimators', ylabel='max depth', title='Random Forest hyperparameter search', xscale='log', yscale='log');
fig.savefig('rfhyperparam.png', bbox_inches='tight')
search.best_params_
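# RandomizedSearchCV (imported above but not used) is a cheaper alternative to the exhaustive
# grid when the grid gets large; a minimal sketch over the same parameter ranges (n_iter=30 and
# random_state=42 are arbitrary illustrative choices, and the fit is left commented out):
rnd_search = RandomizedSearchCV(RandomForestClassifier(), distributions, n_iter=30, cv=5,
                                n_jobs=-1, scoring=make_scorer(fbeta_score, beta=2),
                                random_state=42)
# rnd_search.fit(X_train, y_train); rnd_search.best_params_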
# values chosen from the grid search results above
n_estimators = 133
max_depth = 24
rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring=make_scorer(fbeta_score, beta=2))
mean, std = np.mean(cv_scores), np.std(cv_scores)
print(f'f2 score{mean: .3f} +- {std: .4f}')
rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
rf.fit(X_train, y_train)
preds2=rf.predict(X_test)
print(f'accuracy score: {accuracy_score(y_test, preds2):.3f}')
print(f'f2 score = {fbeta_score(y_test, preds2, beta=2):.3f}')
print("Model evaluation metrics:")
print(classification_report(list(y_test),preds2, target_names = ['Benign','Malignant'], digits = 3))
fig, ax = plt.subplots(figsize=(4, 4))
ConfusionMatrixDisplay.from_predictions(list(y_test),preds2, normalize = 'true', display_labels = ['Benign','Malignant'],
values_format = ".3f", ax=ax, cmap = plt.get_cmap('Blues'), colorbar=False)
plt.grid(None)
plt.show()
clf = DecisionTreeClassifier(random_state=42, max_depth=3)
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring=make_scorer(fbeta_score, beta=2))
mean, std = np.mean(cv_scores), np.std(cv_scores)
print(f'f2 score{mean: .3f} +- {std: .4f}')
clf = DecisionTreeClassifier(random_state=42, max_depth=3)
clf.fit(X_train, y_train)
viz = dtreeviz(clf, X_train, y_train,
               target_name=target,
               class_names=['benign', 'malignant'],  # class 0 = benign, class 1 = malignant
               feature_names=list(X_train.columns),
               )
viz
clf = DecisionTreeClassifier(random_state=42, max_depth=3)
clf.fit(X_train, y_train)
preds3=clf.predict(X_test)
print(f'accuracy score: {accuracy_score(y_test, preds3):.3f}')
print(f'f2 score = {fbeta_score(y_test, preds3, beta=2):.3f}')
print("Model evaluation metrics:")
print(classification_report(list(y_test),preds3, target_names = ['Benign','Malignant'], digits = 3))
fig, ax = plt.subplots(figsize=(4, 4))
ConfusionMatrixDisplay.from_predictions(list(y_test),preds3, normalize = 'true', display_labels = ['Benign','Malignant'],
values_format = ".3f", ax=ax, cmap = plt.get_cmap('Blues'), colorbar=False)
plt.grid(None)
plt.show()
viz.save("decision_tree.svg")
fig, axs = plt.subplots(1, 3, figsize=(12, 4), sharey=True)
preds_list=[preds1, preds2, preds3]
titles=['rule based', 'random forest', 'decision tree']
for i in range(3):
    ax = axs[i]
    preds = preds_list[i]
    ConfusionMatrixDisplay.from_predictions(list(y_test), preds, normalize='true', display_labels=['Benign', 'Malignant'],
                                            values_format=".3f", ax=ax, cmap=plt.get_cmap('Blues'), colorbar=False)
    ax.grid(None)
    ax.set(title=titles[i])
    if i != 0:
        ax.set(ylabel='')
fig.savefig('confusion_matrix.png', bbox_inches='tight')
clf1 = RuleBasedClassifier()
clf2 = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
clf3 = DecisionTreeClassifier(random_state=42, max_depth=3)
clfs = [clf1, clf2, clf3]
df = pd.DataFrame(index=['5-cv F2 score', '5-cv Accuracy', 'Test F2 score', 'Test Accuracy'])
models = ['Rule Based', 'Random Forest', 'Decision Tree']
for i in range(3):
    model = models[i]
    preds = preds_list[i]
    clf = clfs[i]
    cvf2 = cross_val_score(clf, X_train, y_train, cv=5, scoring=make_scorer(fbeta_score, beta=2))
    cvacc = cross_val_score(clf, X_train, y_train, cv=5)
    temp = [rf"{np.mean(cvf2):.3f}$\pm${np.std(cvf2):.4f}",
            rf"{np.mean(cvacc):.3f}$\pm${np.std(cvacc):.4f}",
            f"{fbeta_score(y_test, preds, beta=2):.3f}",
            f"{accuracy_score(y_test, preds):.3f}"]
    df[model] = temp
df = df.T
df
print(df.to_latex(escape=False))
df.to_latex()