#%config Completer.use_jedi = False
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import (train_test_split, cross_val_score,
                                     GridSearchCV, RandomizedSearchCV)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (accuracy_score, f1_score, recall_score, precision_score,
                             make_scorer, fbeta_score, confusion_matrix,
                             ConfusionMatrixDisplay, classification_report)
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import ClassifierMixin, BaseEstimator
from dtreeviz.trees import dtreeviz
import seaborn as sns
sns.set_context('paper')
sns.set(font='sans-serif', font_scale=1.4)
plt.rc('legend', fontsize=12)
import pickle
data = pd.read_pickle('wdbc.pkl')
data
(first 10 rows of the data frame; id is int64 spanning 8670-911320502, malignant is int32 in {0, 1}; the feature columns are not shown in this preview)
         id  malignant
0    842302          1
1    842517          1
2  84300903          1
3  84348301          1
4  84358402          1
5    843786          1
6    844359          1
7  84458202          1
8    844981          1
9  84501001          1
target = 'malignant'
X = data.drop(columns=[target, 'id'])
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X,
y,
test_size=0.25,
random_state=42,
shuffle=True)
print(f'size of X_train = {len(X_train)}, malignant = {100*sum(y_train==1)/len(y_train):.2f}%')
print(f'size of X_test = {len(X_test)}, malignant = {100*sum(y_test==1)/len(y_test):.2f}%')
size of X_train = 426, malignant = 37.09%
size of X_test = 143, malignant = 37.76%
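# optional sketch (not what the split above uses): passing stratify=y makes
# train_test_split preserve the malignant fraction exactly in both splits
# instead of relying on shuffling alone; X_tr/X_te/y_tr/y_te are throwaway names
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=42,
                                          shuffle=True, stratify=y)
print(f'stratified: train malignant = {100*y_tr.mean():.2f}%, test malignant = {100*y_te.mean():.2f}%')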
dummy_clf = DummyClassifier()
cv_scores = cross_val_score(dummy_clf,
X_train,
y_train,
cv=5,
#scoring=make_scorer(fbeta_score, beta=2)
)
mean_dummy, std = np.mean(cv_scores), np.std(cv_scores)
print(f'accuracy {mean_dummy: .3f} +- {std: .4f}')
accuracy 0.629 +- 0.0053
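# the models below are scored with the F-beta score at beta=2, which treats
# recall as twice as important as precision (a missed malignant case should
# cost more than a false alarm):
#   F_beta = (1 + beta^2) * P * R / (beta^2 * P + R)
# quick sanity check of the formula on toy labels (not the WDBC data):
y_true_toy = np.array([1, 1, 1, 0, 0, 0])
y_pred_toy = np.array([1, 0, 1, 0, 1, 0])
p, r = precision_score(y_true_toy, y_pred_toy), recall_score(y_true_toy, y_pred_toy)
print(5 * p * r / (4 * p + r), fbeta_score(y_true_toy, y_pred_toy, beta=2))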
def plot_features(feature='radius'):
    # class-conditional histograms of the three variants of a feature
    # (suffixes _0/_1/_2, presumably the mean / standard error / worst values in WDBC)
    fig, axs = plt.subplots(1, 3, figsize=(16, 4))
    for i in range(3):
        ax = axs[i]
        ax.hist(X_train[f'{feature}_{i}'][y_train==0], alpha=0.5, label='benign', density=True)
        ax.hist(X_train[f'{feature}_{i}'][y_train==1], alpha=0.5, label='malignant', density=True)
        ax.set(title=f'{feature}_{i}', xlabel=f'{feature}_{i}')
        ax.legend()
    plt.show()
cell_size_features = ['radius', 'area']
for feature in cell_size_features:
    plot_features(feature)
cell_shape_features = ['smoothness', 'compactness', 'concavity', 'concave points', 'symmetry', 'fractal dimension']
for feature in cell_shape_features:
    plot_features(feature)
plot_features('texture')
# initial value for the threshold: the quantile of a feature (among benign
# training samples) below which 95% of benign cells fall
threshold = 0.95
class RuleBasedClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self):
        self.size_features = ['radius_2', 'area_2']
        self.shape_features = ['concave points_0', 'concave points_2', 'concavity_0']
        self.texture_features = ['texture_2']
        self.similarity_features = ['radius_1', 'area_1']

    def fit(self, X, y):
        self.X = X
        self.y = y
        # thresholds found manually by inspecting the histograms above
        self.size_thresholds = [17, 900]
        self.shape_thresholds = [0.07, 0.18, 0.18]
        self.texture_thresholds = [50]
        self.similarity_thresholds = [0.8, 55]
        return self

    def get_threshold(self, col_name):
        # the `threshold` quantile of this column among the benign training
        # samples; `threshold` is the module-level value tuned in the sweep below
        thres = self.X[self.y == 0][col_name].quantile([threshold])[threshold]
        return thres

    def predict(self, X):
        return self.predict_with_calculated_threshold(X)
        # return self.predict_manual(X)

    def predict_manual(self, X):
        # flag a sample as malignant if any monitored feature exceeds its hand-picked cut-off
        preds = np.zeros(len(X)).astype('bool')
        # if size is abnormal
        for i, size_feature in enumerate(self.size_features):
            preds = (X[size_feature] >= self.size_thresholds[i]) | preds
        # if shape is abnormal
        for i, shape_feature in enumerate(self.shape_features):
            preds = (X[shape_feature] >= self.shape_thresholds[i]) | preds
        # if texture is abnormal
        for i, texture_feature in enumerate(self.texture_features):
            preds = (X[texture_feature] >= self.texture_thresholds[i]) | preds
        # if similarity is abnormal
        for i, similarity_feature in enumerate(self.similarity_features):
            preds = (X[similarity_feature] >= self.similarity_thresholds[i]) | preds
        return preds

    def predict_with_calculated_threshold(self, X):
        # same rule, but with cut-offs computed from the benign quantiles
        preds = np.zeros(len(X)).astype('bool')
        for feature in (self.size_features + self.shape_features
                        + self.texture_features + self.similarity_features):
            preds = (X[feature] >= self.get_threshold(feature)) | preds
        return preds
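# illustrative sketch: inspect the quantile-based cut-offs the classifier derives
# from the benign training samples at the current global threshold (0.95);
# clf_demo is a throwaway instance used only for this check
clf_demo = RuleBasedClassifier()
clf_demo.fit(X_train, y_train)
for col in (clf_demo.size_features + clf_demo.shape_features
            + clf_demo.texture_features + clf_demo.similarity_features):
    print(f'{col}: flagged malignant above {clf_demo.get_threshold(col):.3f}')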
for threshold in np.linspace(0.95, 1, 20):
    clf = RuleBasedClassifier()
    cv_scores = cross_val_score(clf, X_train, y_train, cv=2, scoring=make_scorer(fbeta_score, beta=2))
    mean, std = np.mean(cv_scores), np.std(cv_scores)
    print(f'{threshold:.5f}: f2 score{mean: .3f} +- {std: .4f}')
0.95000: f2 score 0.917 +- 0.0144
0.95263: f2 score 0.911 +- 0.0112
0.95526: f2 score 0.913 +- 0.0113
0.95789: f2 score 0.916 +- 0.0102
0.96053: f2 score 0.918 +- 0.0103
0.96316: f2 score 0.916 +- 0.0152
0.96579: f2 score 0.919 +- 0.0119
0.96842: f2 score 0.921 +- 0.0142
0.97105: f2 score 0.924 +- 0.0153
0.97368: f2 score 0.927 +- 0.0154
0.97632: f2 score 0.927 +- 0.0238
0.97895: f2 score 0.933 +- 0.0251
0.98158: f2 score 0.936 +- 0.0241
0.98421: f2 score 0.928 +- 0.0130
0.98684: f2 score 0.923 +- 0.0220
0.98947: f2 score 0.913 +- 0.0148
0.99211: f2 score 0.906 +- 0.0240
0.99474: f2 score 0.895 +- 0.0086
0.99737: f2 score 0.889 +- 0.0109
1.00000: f2 score 0.875 +- 0.0034
# we choose threshold = 0.997
threshold = 0.997
clf = RuleBasedClassifier()
clf.fit(X_train, y_train)
preds1 = clf.predict(X_test)
print(f'f2 score = {fbeta_score(y_test, preds1, beta=2):.3f}')
f2 score = 0.941
print("Model evaluation metrics:")
print(classification_report(list(y_test),preds1, target_names = ['Benign','Malignant'], digits = 3))
Model evaluation metrics:
              precision    recall  f1-score   support

      Benign      0.966     0.955     0.960        89
   Malignant      0.927     0.944     0.936        54

    accuracy                          0.951       143
   macro avg      0.947     0.950     0.948       143
weighted avg      0.951     0.951     0.951       143
threshold = 0.997
fig, ax = plt.subplots(figsize=(4, 4))
clf = RuleBasedClassifier()
clf.fit(X_train, y_train)
preds1 = clf.predict(X_test)
ConfusionMatrixDisplay.from_predictions(list(y_test),preds1, normalize = 'true', display_labels = ['Benign','Malignant'],
values_format = ".3f", ax=ax, cmap = plt.get_cmap('Blues'), colorbar=False)
plt.grid(None)
plt.show()
def get_parameter_values(search):
    # unpack a fitted CV search object into parameter value arrays, their names,
    # and the corresponding mean test scores
    scores = search.cv_results_['mean_test_score']
    param_vals_list = search.cv_results_['params']
    titles = list(param_vals_list[0].keys())
    n_params = len(titles)
    P = [[] for _ in range(n_params)]
    for param_vals in param_vals_list:
        for i, p_val in enumerate(param_vals.values()):
            # store tuple-valued parameters as strings so the array stays rectangular
            P[i].append(str(p_val) if isinstance(p_val, tuple) else p_val)
    P = np.array(P)
    return P, titles, scores
rf = RandomForestClassifier()
n=25
est_dist = np.unique([int(i) for i in np.logspace(0.4, 2.7, n)])
depth_dist = np.unique([int(i) for i in np.logspace(0.4, 1.8, n)])
distributions = {'n_estimators': est_dist,
'max_depth': depth_dist,
}
clf = GridSearchCV(rf, distributions, cv=5, n_jobs=-1, scoring=make_scorer(fbeta_score, beta=2))
search = clf.fit(X_train, y_train)
P, titles, scores = get_parameter_values(search)
scores = scores.reshape(len(depth_dist), len(est_dist))
fig, ax = plt.subplots(1,1, figsize=(8,4))
cs = ax.contourf(est_dist, depth_dist, scores)
cbar = fig.colorbar(cs)
cbar.set_label('F2 score')
ax.set(xlabel='n estimators', ylabel='max depth', title='Random Forest hyper parameter search', xscale='log', yscale='log');
fig.savefig('rfhyperparam.png', bbox_inches='tight')
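# a cheaper alternative sketch: RandomizedSearchCV (imported above) samples the
# same parameter ranges instead of evaluating the full grid; randint comes from
# scipy.stats and the ranges roughly match est_dist and depth_dist
from scipy.stats import randint
rand_search = RandomizedSearchCV(RandomForestClassifier(),
                                 {'n_estimators': randint(3, 500),
                                  'max_depth': randint(3, 64)},
                                 n_iter=50, cv=5, n_jobs=-1,
                                 scoring=make_scorer(fbeta_score, beta=2),
                                 random_state=42)
# rand_search.fit(X_train, y_train); rand_search.best_params_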
search.best_params_
# parameters chosen from the grid search above
n_estimators = 133
max_depth = 24
rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
cv_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring=make_scorer(fbeta_score, beta=2))
mean, std = np.mean(cv_scores), np.std(cv_scores)
print(f'f2 score{mean: .3f} +- {std: .4f}')
f2 score 0.939 +- 0.0339
rf = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)
rf.fit(X_train, y_train)
preds2=rf.predict(X_test)
print(f'accuracy score: {accuracy_score(y_test, preds2):.3f}')
print(f'f2 score = {fbeta_score(y_test, preds2, beta=2):.3f}')
accuracy score: 0.972
f2 score = 0.951
print("Model evaluation metrics:")
print(classification_report(list(y_test),preds2, target_names = ['Benign','Malignant'], digits = 3))
Model evaluation metrics:
              precision    recall  f1-score   support

      Benign      0.967     0.989     0.978        89
   Malignant      0.981     0.944     0.962        54

    accuracy                          0.972       143
   macro avg      0.974     0.967     0.970       143
weighted avg      0.972     0.972     0.972       143
fig, ax = plt.subplots(figsize=(4, 4))
ConfusionMatrixDisplay.from_predictions(list(y_test),preds2, normalize = 'true', display_labels = ['Benign','Malignant'],
values_format = ".3f", ax=ax, cmap = plt.get_cmap('Blues'), colorbar=False)
plt.grid(None)
plt.show()
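# optional sketch: impurity-based feature importances of the fitted forest,
# to see which measurements drive its predictions
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False).head(10))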
clf = DecisionTreeClassifier(random_state=42, max_depth=3)
cv_scores = cross_val_score(clf, X_train, y_train, cv=5, scoring=make_scorer(fbeta_score, beta=2))
mean, std = np.mean(cv_scores), np.std(cv_scores)
print(f'f2 score{mean: .3f} +- {std: .4f}')
f2 score 0.906 +- 0.0244
clf = DecisionTreeClassifier(random_state=42, max_depth=3)
clf.fit(X_train, y_train)
viz = dtreeviz(clf, X_train, y_train,
               target_name=target,
               # class 0 is benign, class 1 is malignant
               class_names=['benign', 'malignant'],
               feature_names=list(X_train.columns),
               )
viz
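# lighter-weight alternative sketch: sklearn's built-in plot_tree needs no extra
# dependency and sidesteps dtreeviz's font/graphviz requirements
from sklearn.tree import plot_tree
fig, ax = plt.subplots(figsize=(14, 6))
plot_tree(clf, feature_names=list(X_train.columns),
          class_names=['benign', 'malignant'], filled=True, ax=ax)
plt.show()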
clf = DecisionTreeClassifier(random_state=42, max_depth=3)
clf.fit(X_train, y_train)
preds3=clf.predict(X_test)
print(f'accuracy score: {accuracy_score(y_test, preds3):.3f}')
print(f'f2 score = {fbeta_score(y_test, preds3, beta=2):.3f}')
accuracy score: 0.958
f2 score = 0.933
print("Model evaluation metrics:")
print(classification_report(list(y_test),preds3, target_names = ['Benign','Malignant'], digits = 3))