#%config Completer.use_jedi = False
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, make_scorer, confusion_matrix, fbeta_score
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import ClassifierMixin, BaseEstimator
from dtreeviz.trees import dtreeviz
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
import seaborn as sns
# set the paper context, font and scale in one call (sns.set() would reset the context to 'notebook')
sns.set_theme(context='paper', font='sans-serif', font_scale=1.4)
plt.rc('legend', fontsize=12)
import pickle
data = pd.read_pickle('wdbc.pkl')
data
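# A quick structural check of the loaded frame (a sketch, not part of the original notebook):
# number of rows/columns and the available column names.
print(data.shape)
print(list(data.columns))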
target = 'malignant'
X = data.drop(columns=[target, 'id'])
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                     random_state=42, shuffle=True)
print(f'size of X_train = {len(X_train)}, malignant = {100*sum(y_train==1)/len(y_train):.2f}%')
print(f'size of X_test = {len(X_test)}, malignant = {100*sum(y_test==1)/len(y_test):.2f}%')
dummy_clf = DummyClassifier()
cv_scores = cross_val_score(dummy_clf, X_train, y_train, cv=5,
                            # scoring=make_scorer(fbeta_score, beta=2)
                            )
mean_dummy, std = np.mean(cv_scores), np.std(cv_scores)
print(f'accuracy {mean_dummy: .3f} +- {std: .4f}')
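# Sanity check (a minimal sketch, not part of the original analysis): the default DummyClassifier
# predicts the most frequent class, so its cross-validated accuracy should match the benign
# fraction of the training labels shown here.
print(y_train.value_counts(normalize=True))
print(f'majority-class accuracy ~ {max(y_train.mean(), 1 - y_train.mean()):.3f}')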
def plot_features(feature='radius'):
    fig, axs = plt.subplots(1, 3, figsize=(16, 4))
    for i in range(3):
        ax = axs[i]
        ax.hist(X_train[f'{feature}_{i}'][y_train==0], alpha=0.5, label='benign', density=True)
        ax.hist(X_train[f'{feature}_{i}'][y_train==1], alpha=0.5, label='malignant', density=True)
        ax.set(title=f'{feature}_{i}', xlabel=f'{feature}_{i}')
        ax.legend()
    plt.show()
cell_size_features = ['radius', 'area']
for feature in cell_size_features:
    plot_features(feature)
cell_shape_features = ['smoothness', 'compactness', 'concavity', 'concave points', 'symmetry', 'fractal dimension']
for feature in cell_shape_features:
    plot_features(feature)
plot_features('texture')
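# A minimal sketch (not part of the original analysis) of how a cutoff can be read off these
# histograms: take a high quantile of the benign distribution for a well-separated feature, so
# that almost all benign samples fall below the candidate threshold. 'radius_2' is used here
# purely as an illustrative choice; the rule-based classifier below generalises this idea.
candidate = X_train['radius_2'][y_train == 0].quantile(0.95)
print(f'95th percentile of radius_2 among benign samples: {candidate:.2f}')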
# initial quantile level for the threshold (the value of a given column below which 95% of benign samples fall)
threshold = 0.95
class RuleBasedClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.size_features = ['radius_2', 'area_2']
        self.shape_features = ['concave points_0', 'concave points_2', 'concavity_0']
        self.texture_features = ['texture_2']
        self.similarity_features = ['radius_1', 'area_1']

    def fit(self, X, y):
        self.X = X
        self.y = y
        # thresholds found manually by inspecting the histograms above
        self.size_thresholds = [17, 900]
        self.shape_thresholds = [0.07, 0.18, 0.18]
        self.texture_thresholds = [50]
        self.similarity_thresholds = [0.8, 55]
        return self

    def get_threshold(self, col_name):
        # `threshold` is the module-level quantile level set outside the class
        return self.X[self.y == 0][col_name].quantile(threshold)

    def predict(self, X):
        return self.predict_with_calculated_threshold(X)
        # return self.predict_manual(X)

    def predict_manual(self, X):
        preds = np.zeros(len(X), dtype=bool)
        # flag if size is abnormal
        for i, size_feature in enumerate(self.size_features):
            preds = (X[size_feature] >= self.size_thresholds[i]) | preds
        # flag if shape is abnormal
        for i, shape_feature in enumerate(self.shape_features):
            preds = (X[shape_feature] >= self.shape_thresholds[i]) | preds
        # flag if texture is abnormal
        for i, texture_feature in enumerate(self.texture_features):
            preds = (X[texture_feature] >= self.texture_thresholds[i]) | preds
        # flag if similarity is abnormal
        for i, similarity_feature in enumerate(self.similarity_features):
            preds = (X[similarity_feature] >= self.similarity_thresholds[i]) | preds
        return preds

    def predict_with_calculated_threshold(self, X):
        preds = np.zeros(len(X), dtype=bool)
        # flag if size is abnormal
        for size_feature in self.size_features:
            preds = (X[size_feature] >= self.get_threshold(size_feature)) | preds
        # flag if shape is abnormal
        for shape_feature in self.shape_features:
            preds = (X[shape_feature] >= self.get_threshold(shape_feature)) | preds
        # flag if texture is abnormal
        for texture_feature in self.texture_features:
            preds = (X[texture_feature] >= self.get_threshold(texture_feature)) | preds
        # flag if similarity is abnormal
        for similarity_feature in self.similarity_features:
            preds = (X[similarity_feature] >= self.get_threshold(similarity_feature)) | preds
        return preds
for threshold in np.linspace(0.95, 1, 20):
    clf = RuleBasedClassifier()
    cv_scores = cross_val_score(clf, X_train, y_train, cv=2, scoring=make_scorer(fbeta_score, beta=2))
    mean, std = np.mean(cv_scores), np.std(cv_scores)
    print(f'{threshold:.5f}: f2 score{mean: .3f} +- {std: .4f}')
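# Why beta=2: F-beta is the weighted harmonic mean of precision and recall,
#   F_beta = (1 + beta^2) * P * R / (beta^2 * P + R),
# so beta=2 weights recall (catching malignant tumours) roughly four times as heavily as
# precision. A tiny illustrative check on made-up labels (a sketch, not from the original):
print(fbeta_score([1, 1, 1, 0], [1, 0, 0, 0], beta=2))  # P=1.0, R=1/3 -> F2 = 5/13 ~ 0.385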
# we choose threshold = 0.997
threshold = 0.997
clf = RuleBasedClassifier()
clf.fit(X_train, y_train)
preds1 = clf.predict(X_test)
print(f'f2 score = {fbeta_score(y_test, preds1, beta=2):.3f}')
print("Model evaluation metrics:")
print(classification_report(list(y_test),preds1, target_names = ['Benign','Malignant'], digits = 3))
threshold = 0.997
fig, ax = plt.subplots(figsize=(4, 4))
clf = RuleBasedClassifier()
clf.fit(X_train, y_train)
preds1 = clf.predict(X_test)
ConfusionMatrixDisplay.from_predictions(list(y_test),preds1, normalize = 'true', display_labels = ['Benign','Malignant'],
values_format = ".3f", ax=ax, cmap = plt.get_cmap('Blues'), colorbar=False)
plt.grid(None)
plt.show()
def get_parameter_values(search):
    """Extract per-candidate parameter values and mean test scores from a fitted search object."""
    scores = search.cv_results_['mean_test_score']
    param_vals_list = search.cv_results_['params']
    titles = list(param_vals_list[0].keys())
    n_params = len(titles)
    P = [[] for _ in range(n_params)]
    for param_vals in param_vals_list:
        for i, p_val in enumerate(param_vals.values()):
            P[i].append(p_val)
    P = np.array(P)
    return P, titles, scores
rf = RandomForestClassifier()
n=25
est_dist = np.unique([int(i) for i in np.logspace(0.4, 2.7, n)])
depth_dist = np.unique([int(i) for i in np.logspace(0.4, 1.8, n)])
distributions = {'n_estimators': est_dist,
                 'max_depth': depth_dist}
clf = GridSearchCV(rf, distributions, cv=5, n_jobs=-1, scoring=make_scorer(fbeta_score, beta=2))
search = clf.fit(X_train, y_train)
P, titles, scores = get_parameter_values(search)
scores = scores.reshape(len(depth_dist), len(est_dist))
fig, ax = plt.subplots(1,1, figsize=(8,4))
cs = ax.contourf(est_dist, depth_dist, scores)
cbar = fig.colorbar(cs)
cbar.set_label('F2 score')
ax.set(xlabel='n estimators', ylabel='max depth', title='Random Forest hyperparameter search', xscale='log', yscale='log');
fig.savefig('rfhyperparam.png', bbox_inches='tight')
search.best_params_
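# RandomizedSearchCV (imported above but not used) is a cheaper alternative to the exhaustive
# grid when the grid gets large; a minimal sketch over the same parameter ranges (n_iter=30 and
# random_state=42 are arbitrary illustrative choices, and the fit is left commented out):
rnd_search = RandomizedSearchCV(RandomForestClassifier(), distributions, n_iter=30, cv=5,
                                n_jobs=-1, scoring=make_scorer(fbeta_score, beta=2),
                                random_state=42)
# rnd_search.fit(X_train, y_train); rnd_search.best_params_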
# values chosen from the grid search results above
n_estimators = 133
max_depth = 24
rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring=make_scorer(fbeta_score, beta=2))
mean, std = np.mean(cv_scores), np.std(cv_scores)
print(f'f2 score{mean: .3f} +- {std: .4f}')
rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
rf.fit(X_train, y_train)
preds2=rf.predict(X_test)
print(f'accuracy score: {accuracy_score(y_test, preds2):.3f}')
print(f'f2 score = {fbeta_score(y_test, preds2, beta=2):.3f}')
print("Model evaluation metrics:")
print(classification_report(list(y_test),preds2, target_names = ['Benign','Malignant'], digits = 3))
fig, ax = plt.subplots(figsize=(4, 4))
ConfusionMatrixDisplay.from_predictions(list(y_test),preds2, normalize = 'true', display_labels = ['Benign','Malignant'],
values_format = ".3f", ax=ax, cmap = plt.get_cmap('Blues'), colorbar=False)
plt.grid(None)
plt.show()
clf = DecisionTreeClassifier(random_state=42, max_depth=3)
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring=make_scorer(fbeta_score, beta=2))
mean, std = np.mean(cv_scores), np.std(cv_scores)
print(f'f2 score{mean: .3f} +- {std: .4f}')
clf = DecisionTreeClassifier(random_state=42, max_depth=3)
clf.fit(X_train, y_train)
viz = dtreeviz(clf, X_train, y_train,
               target_name=target,
               class_names=['benign', 'malignant'],  # class 0 = benign, class 1 = malignant
               feature_names=list(X_train.columns),
               )
viz
clf = DecisionTreeClassifier(random_state=42, max_depth=3)
clf.fit(X_train, y_train)
preds3=clf.predict(X_test)
print(f'accuracy score: {accuracy_score(y_test, preds3):.3f}')
print(f'f2 score = {fbeta_score(y_test, preds3, beta=2):.3f}')
print("Model evaluation metrics:")
print(classification_report(list(y_test),preds3, target_names = ['Benign','Malignant'], digits = 3))
fig, ax = plt.subplots(figsize=(4, 4))
ConfusionMatrixDisplay.from_predictions(list(y_test),preds3, normalize = 'true', display_labels = ['Benign','Malignant'],
values_format = ".3f", ax=ax, cmap = plt.get_cmap('Blues'), colorbar=False)
plt.grid(None)
plt.show()
viz.save("decision_tree.svg")
fig, axs = plt.subplots(1, 3, figsize=(12, 4), sharey=True)
preds_list=[preds1, preds2, preds3]
titles=['rule based', 'random forest', 'decision tree']
for i in range(3):
    ax = axs[i]
    preds = preds_list[i]
    ConfusionMatrixDisplay.from_predictions(list(y_test), preds, normalize='true', display_labels=['Benign', 'Malignant'],
                                            values_format=".3f", ax=ax, cmap=plt.get_cmap('Blues'), colorbar=False)
    ax.grid(None)
    ax.set(title=titles[i])
    if i != 0:
        ax.set(ylabel='')
fig.savefig('confusion_matrix.png', bbox_inches='tight')
clf1 = RuleBasedClassifier()
clf2 = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
clf3 = DecisionTreeClassifier(random_state=42, max_depth=3)
clfs = [clf1, clf2, clf3]
df = pd.DataFrame(index=['5-cv F2 score', '5-cv Accuracy', 'Test F2 score', 'Test Accuracy'])
models = ['Rule Based', 'Random Forest', 'Decision Tree']
for i in range(3):
    model = models[i]
    preds = preds_list[i]
    clf = clfs[i]
    cvf2 = cross_val_score(clf, X_train, y_train, cv=5, scoring=make_scorer(fbeta_score, beta=2))
    cvacc = cross_val_score(clf, X_train, y_train, cv=5)
    temp = [rf"{np.mean(cvf2):.3f}$\pm${np.std(cvf2):.4f}",
            rf"{np.mean(cvacc):.3f}$\pm${np.std(cvacc):.4f}",
            f"{fbeta_score(y_test, preds, beta=2):.3f}",
            f"{accuracy_score(y_test, preds):.3f}"]
    df[model] = temp
df = df.T
df
print(df.to_latex(escape=False))
df.to_latex()