# ----------------------------------------------------------------------
# Created: 2022-02-21
# Authors: Anouka Ranby, Andreas Nilsson
# Description: Diagnostics
# Course: Design of AI Systems
# ----------------------------------------------------------------------
# Import required libraries
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
from sklearn.feature_selection import f_classif, mutual_info_classif, chi2
from sklearn.model_selection import (train_test_split, RepeatedStratifiedKFold, cross_val_score,
                                     cross_validate, RandomizedSearchCV, GridSearchCV)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import (roc_curve, roc_auc_score, confusion_matrix, precision_recall_curve,
                             auc, accuracy_score, classification_report)
## Import data
with open('wdbc.pkl', 'rb') as f:
data = pickle.load(f)
data.head(1)
X = data.drop(['malignant'],axis=1)
y = data['malignant']
# create the TEST set to check the final model - X_test, y_test (20% of the data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)
def ANOVA(X_train, y_train):
    """Make the output of the ANOVA filter feature-selection method visible."""
    # ANOVA - calculate the F statistic and corresponding p values
    F_statistic, p_values = f_classif(X_train, y_train)
    # convert to a DataFrame
    ANOVA_F_table = pd.DataFrame(data={'Numerical_Feature': X_train.columns.values,
                                       'F-Score': F_statistic, 'p values': p_values.round(decimals=10)})
    ANOVA_F_table.sort_values(by=['F-Score'], ascending=False, ignore_index=True, inplace=True)
    # save the top 20 numerical features in a list
    top_20_features = ANOVA_F_table.iloc[:20, 0].to_list()
    display(ANOVA_F_table)
    return ANOVA_F_table
ANOVA( X_train, y_train)
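# Cross-check (illustrative sketch, not part of the original pipeline): the same top-20
# selection can be reproduced with scikit-learn's SelectKBest and the ANOVA F-test,
# assuming the same X_train/y_train as above.
from sklearn.feature_selection import SelectKBest
selector = SelectKBest(score_func=f_classif, k=20)
selector.fit(X_train, y_train)
top_20_sklearn = X_train.columns[selector.get_support()].to_list()
print(top_20_sklearn)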
selectedVariabels = pd.DataFrame(y_train)
selectedVariabels['perimeter_0'] = X_train['perimeter_0']
selectedVariabels['area_0'] = X_train['area_0']
selectedVariabels['radius_0'] = X_train['radius_0']
X = selectedVariabels.drop(['malignant'],axis=1)
y = selectedVariabels['malignant']
X.corr()
selectedVariabels = pd.DataFrame(y_train)
selectedVariabels['concave points_0'] = X_train['concave points_0']
selectedVariabels['concavity_0'] = X_train['concavity_0']
selectedVariabels['cTIMESz'] = X_train['concavity_0']* X_train['concave points_0']
selectedVariabels['cTIMESz_05'] = (X_train['concavity_0']* X_train['concave points_0'])**(1/3.2)
selectedVariabels['concave points_0_2'] = X_train['concave points_0']**2
selectedVariabels['concavity_0_2'] = X_train['concavity_0']**2
selectedVariabels['concave points_0_05'] = X_train['concave points_0']**(1/1.5)
selectedVariabels['concavity_0_05'] = X_train['concavity_0']**(1/2)
X = selectedVariabels.drop(['malignant'],axis=1)
y = selectedVariabels['malignant']
ANOVA( X, y)
def findMaximum(inputVar, y_train):
    """Take a feature (or combination of features) and sweep over a range of exponents to
    assess whether an optimum can be found in its dependency on the target.
    """
iter_val = []
f_val = []
for i in range(200):
i = (i/10) + 0.01
selectedVariabels = pd.DataFrame(y_train)
selectedVariabels['Test_Var_'+str(i)] = inputVar**(1/i)
X = selectedVariabels.drop(['malignant'],axis=1)
# ANOVA Test
F_statistic, p_values = f_classif(X, y_train)
# convert to a DF
ANOVA_F_table = pd.DataFrame(data = {'Numerical_Feature': X.columns.values,
'F-Score': F_statistic, 'p values': p_values.round(decimals=10)})
ANOVA_F_table.sort_values(by = ['F-Score'], ascending = False, ignore_index = True, inplace = True)
iter_val.append(i)
f_val.append(ANOVA_F_table['F-Score'].iloc[0])
frame = pd.DataFrame()
frame["iter_val"] = iter_val
frame["f_val"] = f_val
return frame
## store results
CP0 = findMaximum(X_train['concave points_0'], y_train)
CP0['Variable'] = 'CP0'
## store results
inputVar = X_train['concavity_0']* X_train['concave points_0']
CC = findMaximum(inputVar, y_train)
CC['Variable'] = 'CC'
## store results
inputVar = X_train['concavity_0']
C0 = findMaximum(inputVar, y_train)
C0['Variable'] = 'C0'
## Combine all results
frame = pd.concat([CP0, CC, C0], ignore_index=True)
print(frame[frame["f_val"] == max(frame["f_val"])])
plt.figure(figsize=(12,8))
sns.scatterplot(data=frame, x="iter_val", y="f_val", hue="Variable")
plt.title("Feature Enginearing", size=18)
selectedVariabels = pd.DataFrame(y_train)
selectedVariabels['compactness_0'] = X_train['compactness_0']
selectedVariabels['texture_0'] = X_train['texture_0']
X = selectedVariabels.drop(['malignant'],axis=1)
y = selectedVariabels['malignant']
X.corr()
selectedVariabels = pd.DataFrame(y_train)
selectedVariabels['compactness_0'] = X_train['compactness_0']
selectedVariabels['texture_0'] = X_train['texture_0']
selectedVariabels['compactness_05'] = X_train['compactness_0']**(1/2)
selectedVariabels['texture_05'] = X_train['texture_0']**(1/2)
selectedVariabels['compactness_x2'] = X_train['compactness_0']**(2)
selectedVariabels['texture_x2'] = X_train['texture_0']**(2)
selectedVariabels['CT'] = X_train['compactness_0'] * X_train['texture_0']
selectedVariabels['CT_05'] = X_train['compactness_0'] * X_train['texture_0']**(1/2)
selectedVariabels['CT_x2'] = X_train['compactness_0'] * X_train['texture_0']**(2)
X = selectedVariabels.drop(['malignant'],axis=1)
y = selectedVariabels['malignant']
ANOVA( X, y)
inputVar = X_train['compactness_0'] * X_train['texture_0']
CT = findMaximum(inputVar, y_train)
CT['Variable'] = 'CT'
inputVar = X_train['compactness_0']
C0 = findMaximum(inputVar, y_train)
C0['Variable'] = 'C0'
inputVar = X_train['texture_0']
T0 = findMaximum(inputVar, y_train)
T0['Variable'] = 'T0'
frame = pd.concat([CT, C0, T0], ignore_index=True)
print(frame[frame["f_val"] == max(frame["f_val"])])
plt.figure(figsize=(12,8))
sns.scatterplot(data=frame, x="iter_val", y="f_val", hue="Variable")
plt.title("Feature Enginearing", size=18)
selectedVariabels = pd.DataFrame(y_train)
selectedVariabels['radius_1'] = X_train['radius_1']
selectedVariabels['perimeter_1'] = X_train['perimeter_1']
selectedVariabels['concave points_2'] = X_train['concave points_2']
selectedVariabels['perimeter_2'] = X_train['perimeter_2']
selectedVariabels['radius_2'] = X_train['radius_2']
selectedVariabels['area_2'] = X_train['area_2']
selectedVariabels['concavity_2'] = X_train['concavity_2']
selectedVariabels['compactness_2'] = X_train['compactness_2']
X = selectedVariabels.drop(['malignant'],axis=1)
y = selectedVariabels['malignant']
X.corr()
selectedVariabels = pd.DataFrame(y_train)
selectedVariabels['radius_1'] = X_train['radius_1']
selectedVariabels['perimeter_2'] = X_train['perimeter_2']
selectedVariabels['concavity_2'] = X_train['concavity_2']
selectedVariabels['concave points_2'] = X_train['concave points_2']
selectedVariabels['radius_1_05'] = X_train['radius_1']**(1/2)
selectedVariabels['radius_1_x2'] = X_train['radius_1']**(2)
selectedVariabels['perimeter_2_05'] = X_train['perimeter_2']**(1/2)
selectedVariabels['perimeter_2_x2'] = X_train['perimeter_2']**(2)
selectedVariabels['concavity_2_05'] = X_train['concavity_2']**(1/2)
selectedVariabels['concavity_2x2'] = X_train['concavity_2']**(2)
selectedVariabels['concave points_2_05'] = X_train['concave points_2']**(1/2)
selectedVariabels['concave points_2_x2'] = X_train['concave points_2']**(2)
## Try combinations of two variables
selectedVariabels['r1xp2'] = X_train['radius_1'] * X_train['perimeter_2']
selectedVariabels['r1xp2_05'] = (X_train['radius_1'] * X_train['perimeter_2'])**(1/2)
selectedVariabels['r1xp2_2'] = (X_train['radius_1'] * X_train['perimeter_2'])**(2)
selectedVariabels['r1xC2'] = X_train['radius_1'] * X_train['concavity_2']
selectedVariabels['r1xC2_05'] = (X_train['radius_1'] * X_train['concavity_2'])**(1/2)
selectedVariabels['r1xC2_x2'] = (X_train['radius_1'] * X_train['concavity_2'])**(2)
selectedVariabels['r1xCP2'] = X_train['radius_1'] * X_train['concave points_2']
selectedVariabels['r1xCP2_05'] = (X_train['radius_1'] * X_train['concave points_2'])**(1/2)
selectedVariabels['r1xCP2_x2'] = (X_train['radius_1'] * X_train['concave points_2'])**(2)
selectedVariabels['p2xC2'] = X_train['perimeter_2'] * X_train['concavity_2']
selectedVariabels['p2xC2_05'] = (X_train['perimeter_2'] * X_train['concavity_2'])**(1/2)
selectedVariabels['p2xC2_x2'] = (X_train['perimeter_2'] * X_train['concavity_2'])**(2)
selectedVariabels['C2xCP2'] = X_train['concavity_2'] * X_train['concave points_2']
selectedVariabels['C2xCP2_05'] = (X_train['concavity_2'] * X_train['concave points_2'])**(1/2)
selectedVariabels['C2xCP2_x2'] = (X_train['concavity_2'] * X_train['concave points_2'])**(2)
selectedVariabels['p2xCP2'] = X_train['perimeter_2'] * X_train['concave points_2']
selectedVariabels['p2xCP2_05'] = (X_train['perimeter_2'] * X_train['concave points_2'])**(1/2)
selectedVariabels['p2xCP2_x2'] = (X_train['perimeter_2'] * X_train['concave points_2'])**(2)
## Try combinations of three variables
selectedVariabels['r1_p2_c2'] = X_train['radius_1'] * X_train['perimeter_2'] * X_train['concavity_2']
selectedVariabels['r1_p2_c2_03'] = (X_train['radius_1'] * X_train['perimeter_2'] * X_train['concavity_2'])**(1/3)
selectedVariabels['r1_p2_cp2'] = X_train['radius_1'] * X_train['perimeter_2'] * X_train['concave points_2']
selectedVariabels['r1_p2_cp2_03'] = (X_train['radius_1'] * X_train['perimeter_2'] * X_train['concave points_2'])**(1/3)
selectedVariabels['c2_p2_cp2'] = X_train['concavity_2'] * X_train['perimeter_2'] * X_train['concave points_2']
selectedVariabels['c2_p2_cp2-03'] = (X_train['concavity_2'] * X_train['perimeter_2'] * X_train['concave points_2'])**(1/3)
## Try combinations of FOUR variables
selectedVariabels['r1_p2_c2_cp2'] = X_train['radius_1'] * X_train['perimeter_2'] * X_train['concavity_2'] * X_train['concave points_2']
selectedVariabels['r1_p2_c2_cp2_04'] = (X_train['radius_1'] * X_train['perimeter_2'] * X_train['concavity_2'] * X_train['concave points_2'])**(1/4)
X = selectedVariabels.drop(['malignant'],axis=1)
y = selectedVariabels['malignant']
see = ANOVA( X, y)
inputVar = (X_train['perimeter_2'] * X_train['concave points_2'])
p2xCP2 = findMaximum(inputVar, y_train)
p2xCP2['Variable'] = 'p2xCP2'
inputVar = X_train['radius_1'] * X_train['perimeter_2'] * X_train['concavity_2'] * X_train['concave points_2']
r1_p2_c2_cp2 = findMaximum(inputVar, y_train)
r1_p2_c2_cp2['Variable'] = 'r1_p2_c2_cp2'
inputVar =X_train['concave points_2']
CP2 = findMaximum(inputVar, y_train)
CP2['Variable'] = 'CP2'
inputVar = X_train['concavity_2'] * X_train['perimeter_2'] * X_train['concave points_2']
c2_p2_cp2 = findMaximum(inputVar, y_train)
c2_p2_cp2['Variable'] = 'c2_p2_cp2'
## Combine the result dataframes
frame = pd.concat([p2xCP2, r1_p2_c2_cp2, CP2, c2_p2_cp2], ignore_index=True)
frame['f_val'] = frame['f_val'].fillna(0)
print(frame[frame["f_val"] == max(frame["f_val"])])
plt.figure(figsize=(12,8))
sns.scatterplot(data=frame, x="iter_val", y="f_val", hue="Variable")
plt.title("Feature Enginearing", size=18)
celldata = pd.DataFrame(y_train)
celldata['size'] = X_train['perimeter_0']
celldata['shape'] = (X_train['concave points_0'])**(1/1.41)  ## engineered shape feature from concave points_0
celldata['texture'] = (X_train['compactness_0'] * X_train['texture_0'])**(1/2.71)  ## engineered CT feature (compactness x texture)
celldata['abnormality'] = (X_train['perimeter_2'] * X_train['concave points_2'])**(1/1.51)  ## engineered p2xCP2 feature
X = celldata.drop(['malignant'],axis=1)
y = celldata['malignant']
X.corr()
ANOVA( X, y)
plt.figure(figsize=(8,6))
sns.histplot(celldata, x="abnormality", hue="malignant", element="poly")
#Filter data after new rule
round1 = celldata[celldata['abnormality'] < 5.9]
plt.figure(figsize=(8,6))
sns.histplot(round1, x='size', hue="malignant", element="poly")
#Filter data after new rule
round2 = round1[round1['size'] < 108]
plt.figure(figsize=(8,6))
sns.histplot(round2, x="shape", hue="malignant", element="poly")
#Filter data after new rule
round3 = round2[round2['shape'] < 0.16]
plt.figure(figsize=(8,6))
sns.histplot(round3, x="texture", hue="malignant", element="poly")
#Filter data after new rule
round4 = round3[round3['texture'] < 1.62]
plt.figure(figsize=(8,6))
sns.histplot(round4, x="texture", hue="malignant", element="poly")
celldata = pd.DataFrame(y_test)
celldata['size'] = X_test['perimeter_0']
celldata['shape'] = (X_test['concave points_0'])**(1/1.41)  ## engineered shape feature from concave points_0
celldata['texture'] = (X_test['compactness_0'] * X_test['texture_0'])**(1/2.71)  ## engineered CT feature (compactness x texture)
celldata['abnormality'] = (X_test['perimeter_2'] * X_test['concave points_2'])**(1/1.51)  ## engineered p2xCP2 feature
## run rules on test features
celldata['predict'] = np.where((celldata['abnormality'] < 5.9) &
(celldata['size'] < 108) &
(celldata['shape'] < 0.16) &
(celldata['texture'] < 1.62) , 0, 1)
accuracy_score(y_test, celldata['predict'])
clf_report = classification_report(y_test, celldata['predict'])
print("\nClassification report:\n",clf_report)
# Find good hyperparameters for Random Forest classifier
rfc = RandomForestClassifier()
# specify grid
hparams_grid = {'max_depth': [1,3,6,9,12,15,18,21,51],
'n_estimators': [1,3,6,9,12,15,21,26,51,76,101,126,151,176,201,226,251] }
# Random search since faster than GridSearchCV
random_search = RandomizedSearchCV(rfc, hparams_grid, n_iter=10)
#fit model
random_search.fit(X_train, y_train)
# store best params
r_best_m_d = random_search.best_params_['max_depth']
r_best_n_e = random_search.best_params_['n_estimators']
# print result
print("Best Score: {}".format(random_search.best_score_))
print("Best params: {}".format(random_search.best_params_))
# MAKE PIPELINE
pipeline = make_pipeline(
# use best params found from above grid search
RandomForestClassifier(max_depth=r_best_m_d, n_estimators=r_best_n_e, random_state=0, n_jobs=-1) )
# train pipeline
pipeline.fit(X_train, y_train)
# get accuracy on unseen test data
accuracy_score(y_test, pipeline.predict(X_test))
rfc_report = classification_report(y_test, pipeline.predict(X_test))
print("\nClassification report:\n",rfc_report)
## Find the best-performing individual tree in the forest
treeNr = []
treeScore = []
estimators = pipeline.named_steps["randomforestclassifier"].estimators_
for i, tree in enumerate(estimators):
    # refit each tree on the full training set and score it on the held-out test set
    tree.fit(X_train, y_train)
    treeNr.append(i)
    treeScore.append(accuracy_score(y_test, tree.predict(X_test)))
Tframe = pd.DataFrame({'treeNr': treeNr, 'treeScore': treeScore})
winningTree = int(Tframe.loc[Tframe['treeScore'].idxmax(), 'treeNr'])
winningTree
see = pipeline.named_steps["randomforestclassifier"].estimators_[winningTree]
names = list(X_train.columns)
from sklearn.tree import plot_tree
fig = plt.figure(figsize=(50, 40))
plot_tree(pipeline.named_steps["randomforestclassifier"].estimators_[winningTree],
feature_names=names,
class_names=['0', '1'],  # class labels in ascending order (0 = benign, 1 = malignant)
filled=True, impurity=True,
rounded=True)
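# Illustrative addition (not in the original notebook): the forest's impurity-based feature
# importances summarise which inputs drive the ensemble, complementing the single-tree plot.
importances = pd.Series(
    pipeline.named_steps["randomforestclassifier"].feature_importances_,
    index=X_train.columns).sort_values(ascending=False)
print(importances.head(10))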
# make predictions on our test set
y_hat_test = pipeline.predict(X_test)
# get the predicted probabilities
y_hat_test_proba = pipeline.predict_proba(X_test)
# select the probabilities of only the positive class (class 1 = malignant)
y_hat_test_proba = y_hat_test_proba[:, 1]
# we will now create a new DF with actual classes and the predicted probabilities
# create a temp y_test DF to reset its index to allow proper concatenation with y_hat_test_proba
y_test_temp = y_test.copy()
y_test_temp.reset_index(drop = True, inplace = True)
y_test_proba = pd.concat([y_test_temp, pd.DataFrame(y_hat_test_proba)], axis = 1)
# Rename the columns
y_test_proba.columns = ['y_test_class_actual', 'y_hat_test_proba']
# Align y_test_proba's index with X_test's index
y_test_proba.index = X_test.index
# get the values required to plot a ROC curve
fpr, tpr, thresholds = roc_curve(y_test_proba['y_test_class_actual'],
y_test_proba['y_hat_test_proba'])
# plot the ROC curve
plt.plot(fpr, tpr)
# plot a secondary diagonal line, with dashed line style and black color to represent a no-skill classifier
plt.plot(fpr, fpr, linestyle = '--', color = 'k')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve');
# Calculate the Area Under the Receiver Operating Characteristic Curve (AUROC) on test set
AUROC = roc_auc_score(y_test_proba['y_test_class_actual'], y_test_proba['y_hat_test_proba'])
# calculate Gini from AUROC
Gini = AUROC * 2 - 1
# print AUROC and Gini
print('AUROC: %.4f' % (AUROC))
print('Gini: %.4f' % (Gini))
# draw a PR curve
# calculate the no skill line as the proportion of the positive class
no_skill = len(y_test[y_test == 1]) / len(y_test)
# plot the no skill precision-recall curve
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
# get the values required to plot a PR curve
precision, recall, thresholds = precision_recall_curve(y_test_proba['y_test_class_actual'],
y_test_proba['y_hat_test_proba'])
# plot PR curve
plt.plot(recall, precision, marker='.', label='Random Forest')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.title('PR curve');
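# Illustrative addition (not in the original notebook): summarise the PR curve with its area,
# using the auc() helper imported above.
pr_auc = auc(recall, precision)
print('PR AUC: %.4f' % pr_auc)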
# Setting hyperparameters for Logit
logit_params = {
'logit__C': [0.01, 0.1, 1, 10, 100],
'logit__tol': [1e-3, 1e-4, 1e-5],
'logit__penalty': ['l1', 'l2'],
'logit__solver': ['liblinear']}
# Logit pipeline setup
logit_pipe = Pipeline([
('logit', LogisticRegression( max_iter=1200)) ])
# GridSearch set up for Logit
logit_gs = GridSearchCV(logit_pipe, param_grid=logit_params, cv=5, n_jobs=-1)
# Fit on train data
logit_time = time()
logit_gs.fit(X_train, y_train)
print("Grid search took %.2f s" % (time() - logit_time))
print("Logistic Regression:\nBest train score:", logit_gs.best_score_, "\nBest params:", logit_gs.best_params_)
clf = LogisticRegression(random_state = 0,
solver='liblinear',
multi_class='auto' ,
C = 100,
penalty = 'l1',
tol = 0.0001)
clf.fit(X_train,y_train)
rfc_report = classification_report(y_test, clf.predict(X_test))
print("\nClassification report:\n",rfc_report)
# make predictions on our test set
y_hat_test = clf.predict(X_test)
# get the predicted probabilities
y_hat_test_proba = clf.predict_proba(X_test)
# select the probabilities of only the positive class (class 1 = malignant)
y_hat_test_proba = y_hat_test_proba[:, 1]
# we will now create a new DF with actual classes and the predicted probabilities
# create a temp y_test DF to reset its index to allow proper concatenation with y_hat_test_proba
y_test_temp = y_test.copy()
y_test_temp.reset_index(drop = True, inplace = True)
y_test_proba = pd.concat([y_test_temp, pd.DataFrame(y_hat_test_proba)], axis = 1)
# Rename the columns
y_test_proba.columns = ['y_test_class_actual', 'y_hat_test_proba']
# Align y_test_proba's index with X_test's index
y_test_proba.index = X_test.index
# get the values required to plot a ROC curve
fpr, tpr, thresholds = roc_curve(y_test_proba['y_test_class_actual'],
y_test_proba['y_hat_test_proba'])
# plot the ROC curve
plt.plot(fpr, tpr)
# plot a secondary diagonal line, with dashed line style and black color to represent a no-skill classifier
plt.plot(fpr, fpr, linestyle = '--', color = 'k')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve');
# Calculate the Area Under the Receiver Operating Characteristic Curve (AUROC) on test set
AUROC = roc_auc_score(y_test_proba['y_test_class_actual'], y_test_proba['y_hat_test_proba'])
# calculate Gini from AUROC
Gini = AUROC * 2 - 1
# print AUROC and Gini
print('AUROC: %.4f' % (AUROC))
print('Gini: %.4f' % (Gini))
# draw a PR curve
# calculate the no skill line as the proportion of the positive class
no_skill = len(y_test[y_test == 1]) / len(y_test)
# plot the no skill precision-recall curve
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
# get the values required to plot a PR curve
precision, recall, thresholds = precision_recall_curve(y_test_proba['y_test_class_actual'],
y_test_proba['y_hat_test_proba'])
# plot PR curve
plt.plot(recall, precision, marker='.', label='Logistic')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.title('PR curve');
# Store the column names in X_train as a list
feature_name = X_train.columns.values
# Create a summary table of our logistic regression model
summary_table = pd.DataFrame(columns = ['Feature name'], data = feature_name)
# Create a new column in the dataframe, called 'Coefficients'
summary_table['Coefficients'] = np.transpose(clf.coef_)
# Shift the index of every row by 1 so the model intercept can be stored in the first row
summary_table.index = summary_table.index + 1
# Assign our model intercept to this new row
summary_table.loc[0] = ['Intercept', clf.intercept_[0]]
finalVal = X_train
summary_table = summary_table[summary_table['Feature name'] != 'id']
# Sort the dataframe by coefficient value
summary_table.sort_values(by=['Coefficients'])
# https://www.displayr.com/how-to-interpret-logistic-regression-coefficients/
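# Illustrative sketch following the interpretation guide linked above (not in the original
# code): exponentiating the coefficients gives odds ratios, i.e. the multiplicative change
# in the odds of malignancy per unit increase in each feature.
summary_table['Odds ratio'] = np.exp(summary_table['Coefficients'])
display(summary_table.sort_values(by=['Odds ratio'], ascending=False))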