# ----------------------------------------------------------------------
# Created: 2022-02-21
# Authors: Anouka Ranby, Andreas Nilsson
# Description: Diagnostics
# Course: Design of AI Systems
# ----------------------------------------------------------------------
# Import required libraries
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
from sklearn.feature_selection import f_classif, mutual_info_classif, chi2
from sklearn.model_selection import (train_test_split, RepeatedStratifiedKFold, cross_val_score,
                                     cross_validate, RandomizedSearchCV, GridSearchCV)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import (roc_curve, roc_auc_score, confusion_matrix, precision_recall_curve,
                             auc, accuracy_score, classification_report)
## Import data
with open('wdbc.pkl', 'rb') as f:
data = pickle.load(f)
data.head(1)
X = data.drop(['malignant'],axis=1)
y = data['malignant']
# create the TEST set to check the final model - X_test, y_test (20% of the data)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)
def ANOVA(X_train, y_train):
    """Make the output of the ANOVA filter feature-selection method visible."""
    # ANOVA - calculate the F statistic and corresponding p values
    F_statistic, p_values = f_classif(X_train, y_train)
    # convert to a DataFrame
    ANOVA_F_table = pd.DataFrame(data={'Numerical_Feature': X_train.columns.values,
                                       'F-Score': F_statistic, 'p values': p_values.round(decimals=10)})
    ANOVA_F_table.sort_values(by=['F-Score'], ascending=False, ignore_index=True, inplace=True)
    # save the top 20 numerical features in a list
    top_20_features = ANOVA_F_table.iloc[:20, 0].to_list()
    display(ANOVA_F_table)
    return ANOVA_F_table
ANOVA( X_train, y_train)
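# Cross-check (illustrative sketch, not part of the original pipeline): the same top-20
# selection can be reproduced with scikit-learn's SelectKBest and the ANOVA F-test,
# assuming the same X_train/y_train as above.
from sklearn.feature_selection import SelectKBest
selector = SelectKBest(score_func=f_classif, k=20)
selector.fit(X_train, y_train)
top_20_sklearn = X_train.columns[selector.get_support()].to_list()
print(top_20_sklearn)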
selectedVariabels = pd.DataFrame(y_train)
selectedVariabels['perimeter_0'] = X_train['perimeter_0']
selectedVariabels['area_0'] = X_train['area_0']
selectedVariabels['radius_0'] = X_train['radius_0']
X = selectedVariabels.drop(['malignant'],axis=1)
y = selectedVariabels['malignant']
X.corr()
selectedVariabels = pd.DataFrame(y_train)
selectedVariabels['concave points_0'] = X_train['concave points_0']
selectedVariabels['concavity_0'] = X_train['concavity_0']
selectedVariabels['cTIMESz'] = X_train['concavity_0']* X_train['concave points_0']
selectedVariabels['cTIMESz_05'] = (X_train['concavity_0']* X_train['concave points_0'])**(1/3.2)
selectedVariabels['concave points_0_2'] = X_train['concave points_0']**2
selectedVariabels['concavity_0_2'] = X_train['concavity_0']**2
selectedVariabels['concave points_0_05'] = X_train['concave points_0']**(1/1.5)
selectedVariabels['concavity_0_05'] = X_train['concavity_0']**(1/2)
X = selectedVariabels.drop(['malignant'],axis=1)
y = selectedVariabels['malignant']
ANOVA( X, y)
def findMaximum(inputVar, y_train):
    """Take a feature (or combination of features) and sweep over a range of exponents to
    assess whether an optimum can be found in its dependency on the target.
    """
iter_val = []
f_val = []
for i in range(200):
i = (i/10) + 0.01
selectedVariabels = pd.DataFrame(y_train)
selectedVariabels['Test_Var_'+str(i)] = inputVar**(1/i)
X = selectedVariabels.drop(['malignant'],axis=1)
# ANOVA Test
F_statistic, p_values = f_classif(X, y_train)
# convert to a DF
ANOVA_F_table = pd.DataFrame(data = {'Numerical_Feature': X.columns.values,
'F-Score': F_statistic, 'p values': p_values.round(decimals=10)})
ANOVA_F_table.sort_values(by = ['F-Score'], ascending = False, ignore_index = True, inplace = True)
iter_val.append(i)
f_val.append(ANOVA_F_table['F-Score'].iloc[0])
frame = pd.DataFrame()
frame["iter_val"] = iter_val
frame["f_val"] = f_val
return frame
## store results
CP0 = findMaximum(X_train['concave points_0'], y_train)
CP0['Variable'] = 'CP0'
## store results
inputVar = X_train['concavity_0']* X_train['concave points_0']
CC = findMaximum(inputVar, y_train)
CC['Variable'] = 'CC'
## store results
inputVar = X_train['concavity_0']
C0 = findMaximum(inputVar, y_train)
C0['Variable'] = 'C0'
## Combine all results
frame = pd.concat([CP0, CC, C0], ignore_index=True)
print(frame[frame["f_val"] == max(frame["f_val"])])
plt.figure(figsize=(12,8))
sns.scatterplot(data=frame, x="iter_val", y="f_val", hue="Variable")
plt.title("Feature Enginearing", size=18)
selectedVariabels = pd.DataFrame(y_train)
selectedVariabels['compactness_0'] = X_train['compactness_0']
selectedVariabels['texture_0'] = X_train['texture_0']
X = selectedVariabels.drop(['malignant'],axis=1)
y = selectedVariabels['malignant']
X.corr()
selectedVariabels = pd.DataFrame(y_train)
selectedVariabels['compactness_0'] = X_train['compactness_0']
selectedVariabels['texture_0'] = X_train['texture_0']
selectedVariabels['compactness_05'] = X_train['compactness_0']**(1/2)
selectedVariabels['texture_05'] = X_train['texture_0']**(1/2)
selectedVariabels['compactness_x2'] = X_train['compactness_0']**(2)
selectedVariabels['texture_x2'] = X_train['texture_0']**(2)
selectedVariabels['CT'] = X_train['compactness_0'] * X_train['texture_0']
selectedVariabels['CT_05'] = X_train['compactness_0'] * X_train['texture_0']**(1/2)
selectedVariabels['CT_x2'] = X_train['compactness_0'] * X_train['texture_0']**(2)
X = selectedVariabels.drop(['malignant'],axis=1)
y = selectedVariabels['malignant']
ANOVA( X, y)
inputVar = X_train['compactness_0'] * X_train['texture_0']
CT = findMaximum(inputVar, y_train)
CT['Variable'] = 'CT'
inputVar = X_train['compactness_0']
C0 = findMaximum(inputVar, y_train)
C0['Variable'] = 'C0'
inputVar = X_train['texture_0']
T0 = findMaximum(inputVar, y_train)
T0['Variable'] = 'T0'
frame = pd.concat([CT, C0, T0], ignore_index=True)
print(frame[frame["f_val"] == max(frame["f_val"])])
plt.figure(figsize=(12,8))
sns.scatterplot(data=frame, x="iter_val", y="f_val", hue="Variable")
plt.title("Feature Enginearing", size=18)
selectedVariabels = pd.DataFrame(y_train)
selectedVariabels['radius_1'] = X_train['radius_1']
selectedVariabels['perimeter_1'] = X_train['perimeter_1']
selectedVariabels['concave points_2'] = X_train['concave points_2']
selectedVariabels['perimeter_2'] = X_train['perimeter_2']
selectedVariabels['radius_2'] = X_train['radius_2']
selectedVariabels['area_2'] = X_train['area_2']
selectedVariabels['concavity_2'] = X_train['concavity_2']
selectedVariabels['compactness_2'] = X_train['compactness_2']
X = selectedVariabels.drop(['malignant'],axis=1)
y = selectedVariabels['malignant']
X.corr()
selectedVariabels = pd.DataFrame(y_train)
selectedVariabels['radius_1'] = X_train['radius_1']
selectedVariabels['perimeter_2'] = X_train['perimeter_2']
selectedVariabels['concavity_2'] = X_train['concavity_2']
selectedVariabels['concave points_2'] = X_train['concave points_2']
selectedVariabels['radius_1_05'] = X_train['radius_1']**(1/2)
selectedVariabels['radius_1_x2'] = X_train['radius_1']**(2)
selectedVariabels['perimeter_2_05'] = X_train['perimeter_2']**(1/2)
selectedVariabels['perimeter_2_x2'] = X_train['perimeter_2']**(2)
selectedVariabels['concavity_2_05'] = X_train['concavity_2']**(1/2)
selectedVariabels['concavity_2x2'] = X_train['concavity_2']**(2)
selectedVariabels['concave points_2_05'] = X_train['concave points_2']**(1/2)
selectedVariabels['concave points_2_x2'] = X_train['concave points_2']**(2)
## Try combinations of two variables
selectedVariabels['r1xp2'] = X_train['radius_1'] * X_train['perimeter_2']
selectedVariabels['r1xp2_05'] = (X_train['radius_1'] * X_train['perimeter_2'])**(1/2)
selectedVariabels['r1xp2_2'] = (X_train['radius_1'] * X_train['perimeter_2'])**(2)
selectedVariabels['r1xC2'] = X_train['radius_1'] * X_train['concavity_2']
selectedVariabels['r1xC2_05'] = (X_train['radius_1'] * X_train['concavity_2'])**(1/2)
selectedVariabels['r1xC2_x2'] = (X_train['radius_1'] * X_train['concavity_2'])**(2)
selectedVariabels['r1xCP2'] = X_train['radius_1'] * X_train['concave points_2']
selectedVariabels['r1xCP2_05'] = (X_train['radius_1'] * X_train['concave points_2'])**(1/2)
selectedVariabels['r1xCP2_x2'] = (X_train['radius_1'] * X_train['concave points_2'])**(2)
selectedVariabels['p2xC2'] = X_train['perimeter_2'] * X_train['concavity_2']
selectedVariabels['p2xC2_05'] = (X_train['perimeter_2'] * X_train['concavity_2'])**(1/2)
selectedVariabels['p2xC2_x2'] = (X_train['perimeter_2'] * X_train['concavity_2'])**(2)
selectedVariabels['C2xCP2'] = X_train['concavity_2'] * X_train['concave points_2']
selectedVariabels['C2xCP2_05'] = (X_train['concavity_2'] * X_train['concave points_2'])**(1/2)
selectedVariabels['C2xCP2_x2'] = (X_train['concavity_2'] * X_train['concave points_2'])**(2)
selectedVariabels['p2xCP2'] = X_train['perimeter_2'] * X_train['concave points_2']
selectedVariabels['p2xCP2_05'] = (X_train['perimeter_2'] * X_train['concave points_2'])**(1/2)
selectedVariabels['p2xCP2_x2'] = (X_train['perimeter_2'] * X_train['concave points_2'])**(2)
## Try combinations of three variables
selectedVariabels['r1_p2_c2'] = X_train['radius_1'] * X_train['perimeter_2'] * X_train['concavity_2']
selectedVariabels['r1_p2_c2_03'] = (X_train['radius_1'] * X_train['perimeter_2'] * X_train['concavity_2'])**(1/3)
selectedVariabels['r1_p2_cp2'] = X_train['radius_1'] * X_train['perimeter_2'] * X_train['concave points_2']
selectedVariabels['r1_p2_cp2_03'] = (X_train['radius_1'] * X_train['perimeter_2'] * X_train['concave points_2'])**(1/3)
selectedVariabels['c2_p2_cp2'] = X_train['concavity_2'] * X_train['perimeter_2'] * X_train['concave points_2']
selectedVariabels['c2_p2_cp2-03'] = (X_train['concavity_2'] * X_train['perimeter_2'] * X_train['concave points_2'])**(1/3)
## Try combinations of FOUR variables
selectedVariabels['r1_p2_c2_cp2'] = X_train['radius_1'] * X_train['perimeter_2'] * X_train['concavity_2'] * X_train['concave points_2']
selectedVariabels['r1_p2_c2_cp2_04'] = (X_train['radius_1'] * X_train['perimeter_2'] * X_train['concavity_2'] * X_train['concave points_2'])**(1/4)
X = selectedVariabels.drop(['malignant'],axis=1)
y = selectedVariabels['malignant']
see = ANOVA( X, y)
inputVar = (X_train['perimeter_2'] * X_train['concave points_2'])
p2xCP2 = findMaximum(inputVar, y_train)
p2xCP2['Variable'] = 'p2xCP2'
inputVar = X_train['radius_1'] * X_train['perimeter_2'] * X_train['concavity_2'] * X_train['concave points_2']
r1_p2_c2_cp2 = findMaximum(inputVar, y_train)
r1_p2_c2_cp2['Variable'] = 'r1_p2_c2_cp2'
inputVar =X_train['concave points_2']
CP2 = findMaximum(inputVar, y_train)
CP2['Variable'] = 'CP2'
inputVar = X_train['concavity_2'] * X_train['perimeter_2'] * X_train['concave points_2']
c2_p2_cp2 = findMaximum(inputVar, y_train)
c2_p2_cp2['Variable'] = 'c2_p2_cp2'
## Combine the result dataframes
frame = pd.concat([p2xCP2, r1_p2_c2_cp2, CP2, c2_p2_cp2], ignore_index=True)
frame['f_val'] = frame['f_val'].fillna(0)
print(frame[frame["f_val"] == max(frame["f_val"])])
plt.figure(figsize=(12,8))
sns.scatterplot(data=frame, x="iter_val", y="f_val", hue="Variable")
plt.title("Feature Enginearing", size=18)
celldata = pd.DataFrame(y_train)
celldata['size'] = X_train['perimeter_0']
celldata['shape'] = (X_train['concave points_0'])**(1/1.41)  ## engineered shape feature from concave points_0
celldata['texture'] = (X_train['compactness_0'] * X_train['texture_0'])**(1/2.71)  ## engineered CT feature (compactness x texture)
celldata['abnormality'] = (X_train['perimeter_2'] * X_train['concave points_2'])**(1/1.51)  ## engineered p2xCP2 feature
X = celldata.drop(['malignant'],axis=1)
y = celldata['malignant']
X.corr()
ANOVA( X, y)
plt.figure(figsize=(8,6))
sns.histplot(celldata, x="abnormality", hue="malignant", element="poly")
#Filter data after new rule
round1 = celldata[celldata['abnormality'] < 5.9]
plt.figure(figsize=(8,6))
sns.histplot(round1, x='size', hue="malignant", element="poly")
#Filter data after new rule
round2 = round1[round1['size'] < 108]
plt.figure(figsize=(8,6))
sns.histplot(round2, x="shape", hue="malignant", element="poly")
#Filter data after new rule
round3 = round2[round2['shape'] < 0.16]
plt.figure(figsize=(8,6))
sns.histplot(round3, x="texture", hue="malignant", element="poly")
#Filter data after new rule
round4 = round3[round3['texture'] < 1.62]
plt.figure(figsize=(8,6))
sns.histplot(round4, x="texture", hue="malignant", element="poly")
celldata = pd.DataFrame(y_test)
celldata['size'] = X_test['perimeter_0']
celldata['shape'] = (X_test['concave points_0'])**(1/1.41)  ## engineered shape feature from concave points_0
celldata['texture'] = (X_test['compactness_0'] * X_test['texture_0'])**(1/2.71)  ## engineered CT feature (compactness x texture)
celldata['abnormality'] = (X_test['perimeter_2'] * X_test['concave points_2'])**(1/1.51)  ## engineered p2xCP2 feature
## run rules on test features
celldata['predict'] = np.where((celldata['abnormality'] < 5.9) &
(celldata['size'] < 108) &
(celldata['shape'] < 0.16) &
(celldata['texture'] < 1.62) , 0, 1)
accuracy_score(y_test, celldata['predict'])
clf_report = classification_report(y_test, celldata['predict'])
print("\nClassification report:\n",clf_report)
# Find good hyperparameters for Random Forest classifier
rfc = RandomForestClassifier()
# specify grid
hparams_grid = {'max_depth': [1,3,6,9,12,15,18,21,51],
'n_estimators': [1,3,6,9,12,15,21,26,51,76,101,126,151,176,201,226,251] }
# Random search since faster than GridSearchCV
random_search = RandomizedSearchCV(rfc, hparams_grid, n_iter=10)
#fit model
random_search.fit(X_train, y_train)
# store best params
r_best_m_d = random_search.best_params_['max_depth']
r_best_n_e = random_search.best_params_['n_estimators']
# print result
print("Best Score: {}".format(random_search.best_score_))
print("Best params: {}".format(random_search.best_params_))
# MAKE PIPELINE
pipeline = make_pipeline(
# use best params found from above grid search
RandomForestClassifier(max_depth=r_best_m_d, n_estimators=r_best_n_e, random_state=0, n_jobs=-1) )
# train pipeline
pipeline.fit(X_train, y_train)
# get accuracy on unseen test data
accuracy_score(y_test, pipeline.predict(X_test))
rfc_report = classification_report(y_test, pipeline.predict(X_test))
print("\nClassification report:\n",rfc_report)
## Find the best-performing individual tree in the forest
treeNr = []
treeScore = []
estimators = pipeline.named_steps["randomforestclassifier"].estimators_
for i, tree in enumerate(estimators):
    # refit each tree on the full training set and score it on the held-out test set
    tree.fit(X_train, y_train)
    treeNr.append(i)
    treeScore.append(accuracy_score(y_test, tree.predict(X_test)))
Tframe = pd.DataFrame({'treeNr': treeNr, 'treeScore': treeScore})
winningTree = int(Tframe.loc[Tframe['treeScore'].idxmax(), 'treeNr'])
winningTree
see = pipeline.named_steps["randomforestclassifier"].estimators_[winningTree]
names = list(X_train.columns)
from sklearn.tree import plot_tree
fig = plt.figure(figsize=(50, 40))
plot_tree(pipeline.named_steps["randomforestclassifier"].estimators_[winningTree],
feature_names=names,
class_names=['0', '1'],  # class labels in ascending order (0 = benign, 1 = malignant)
filled=True, impurity=True,
rounded=True)
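# Illustrative addition (not in the original notebook): the forest's impurity-based feature
# importances summarise which inputs drive the ensemble, complementing the single-tree plot.
importances = pd.Series(
    pipeline.named_steps["randomforestclassifier"].feature_importances_,
    index=X_train.columns).sort_values(ascending=False)
print(importances.head(10))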
# make predictions on our test set
y_hat_test = pipeline.predict(X_test)
# get the predicted probabilities
y_hat_test_proba = pipeline.predict_proba(X_test)
# select the probabilities of only the positive class (class 1 = malignant)
y_hat_test_proba = y_hat_test_proba[:, 1]
# we will now create a new DF with actual classes and the predicted probabilities
# create a temp y_test DF to reset its index to allow proper concatenation with y_hat_test_proba
y_test_temp = y_test.copy()
y_test_temp.reset_index(drop = True, inplace = True)
y_test_proba = pd.concat([y_test_temp, pd.DataFrame(y_hat_test_proba)], axis = 1)
# Rename the columns
y_test_proba.columns = ['y_test_class_actual', 'y_hat_test_proba']
# Align y_test_proba's index with X_test's index
y_test_proba.index = X_test.index
# get the values required to plot a ROC curve
fpr, tpr, thresholds = roc_curve(y_test_proba['y_test_class_actual'],
y_test_proba['y_hat_test_proba'])
# plot the ROC curve
plt.plot(fpr, tpr)
# plot a secondary diagonal line, with dashed line style and black color to represent a no-skill classifier
plt.plot(fpr, fpr, linestyle = '--', color = 'k')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve');
# Calculate the Area Under the Receiver Operating Characteristic Curve (AUROC) on test set
AUROC = roc_auc_score(y_test_proba['y_test_class_actual'], y_test_proba['y_hat_test_proba'])
# calculate Gini from AUROC
Gini = AUROC * 2 - 1
# print AUROC and Gini
print('AUROC: %.4f' % (AUROC))
print('Gini: %.4f' % (Gini))
# draw a PR curve
# calculate the no skill line as the proportion of the positive class
no_skill = len(y_test[y_test == 1]) / len(y_test)
# plot the no skill precision-recall curve
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
# get the values required to plot a PR curve
precision, recall, thresholds = precision_recall_curve(y_test_proba['y_test_class_actual'],
y_test_proba['y_hat_test_proba'])
# plot PR curve
plt.plot(recall, precision, marker='.', label='Random Forest')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.title('PR curve');
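# Illustrative addition (not in the original notebook): summarise the PR curve with its area,
# using the auc() helper imported above.
pr_auc = auc(recall, precision)
print('PR AUC: %.4f' % pr_auc)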
# Setting hyperparameters for Logit
logit_params = {
'logit__C': [0.01, 0.1, 1, 10, 100],
'logit__tol': [1e-3, 1e-4, 1e-5],
'logit__penalty': ['l1', 'l2'],
'logit__solver': ['liblinear']}
# Logit pipeline setup
logit_pipe = Pipeline([
('logit', LogisticRegression( max_iter=1200)) ])
# GridSearch set up for Logit
logit_gs = GridSearchCV(logit_pipe, param_grid=logit_params, cv=5, n_jobs=-1)
# Fit on train data
logit_time = time()
logit_gs.fit(X_train, y_train)
print("Grid search took %.2f s" % (time() - logit_time))
print("Logistic Regression:\nBest train score:", logit_gs.best_score_, "\nBest params:", logit_gs.best_params_)
clf = LogisticRegression(random_state = 0,
solver='liblinear',
multi_class='auto' ,
C = 100,
penalty = 'l1',
tol = 0.0001)
clf.fit(X_train,y_train)
rfc_report = classification_report(y_test, clf.predict(X_test))
print("\nClassification report:\n",rfc_report)
# make predictions on our test set
y_hat_test = clf.predict(X_test)
# get the predicted probabilities
y_hat_test_proba = clf.predict_proba(X_test)
# select the probabilities of only the positive class (class 1 = malignant)
y_hat_test_proba = y_hat_test_proba[:, 1]
# we will now create a new DF with actual classes and the predicted probabilities
# create a temp y_test DF to reset its index to allow proper concatenation with y_hat_test_proba
y_test_temp = y_test.copy()
y_test_temp.reset_index(drop = True, inplace = True)
y_test_proba = pd.concat([y_test_temp, pd.DataFrame(y_hat_test_proba)], axis = 1)
# Rename the columns
y_test_proba.columns = ['y_test_class_actual', 'y_hat_test_proba']
# Align y_test_proba's index with X_test's index
y_test_proba.index = X_test.index
# get the values required to plot a ROC curve
fpr, tpr, thresholds = roc_curve(y_test_proba['y_test_class_actual'],
y_test_proba['y_hat_test_proba'])
# plot the ROC curve
plt.plot(fpr, tpr)
# plot a secondary diagonal line, with dashed line style and black color to represent a no-skill classifier
plt.plot(fpr, fpr, linestyle = '--', color = 'k')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve');
# Calculate the Area Under the Receiver Operating Characteristic Curve (AUROC) on test set
AUROC = roc_auc_score(y_test_proba['y_test_class_actual'], y_test_proba['y_hat_test_proba'])
# calculate Gini from AUROC
Gini = AUROC * 2 - 1
# print AUROC and Gini
print('AUROC: %.4f' % (AUROC))
print('Gini: %.4f' % (Gini))
# draw a PR curve
# calculate the no skill line as the proportion of the positive class
no_skill = len(y_test[y_test == 1]) / len(y_test)
# plot the no skill precision-recall curve
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
# get the values required to plot a PR curve
precision, recall, thresholds = precision_recall_curve(y_test_proba['y_test_class_actual'],
y_test_proba['y_hat_test_proba'])
# plot PR curve
plt.plot(recall, precision, marker='.', label='Logistic')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.title('PR curve');
# Store the column names in X_train as a list
feature_name = X_train.columns.values
# Create a summary table of our logistic regression model
summary_table = pd.DataFrame(columns = ['Feature name'], data = feature_name)
# Create a new column in the dataframe, called 'Coefficients'
summary_table['Coefficients'] = np.transpose(clf.coef_)
# Shift the index of every row by 1 so the model intercept can be stored in the first row
summary_table.index = summary_table.index + 1
# Assign our model intercept to this new row
summary_table.loc[0] = ['Intercept', clf.intercept_[0]]
finalVal = X_train
summary_table = summary_table[summary_table['Feature name'] != 'id']
# Sort the dataframe by coefficient value
summary_table.sort_values(by=['Coefficients'])
# https://www.displayr.com/how-to-interpret-logistic-regression-coefficients/
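# Illustrative sketch following the interpretation guide linked above (not in the original
# code): exponentiating the coefficients gives odds ratios, i.e. the multiplicative change
# in the odds of malignancy per unit increase in each feature.
summary_table['Odds ratio'] = np.exp(summary_table['Coefficients'])
display(summary_table.sort_values(by=['Odds ratio'], ascending=False))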