# Breast Cancer Classification

import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import scipy.stats as stats from collections import Counter %matplotlib inline

from sklearn.datasets import load_breast_cancer from sklearn.model_selection import train_test_split

# Load the breast cancer data set; `target_names` is read from it below.
data = load_breast_cancer()

# Map each integer class code (its position in `target_names`) to the class name.
target_label = dict(enumerate(data.target_names))

# Load the data set again, this time as a (features DataFrame, target Series) pair.
df_X, df_y = load_breast_cancer(return_X_y=True, as_frame=True)

# Split the data into train (80%) and test (20%) sets. Only these two sets
# are produced; random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.2, random_state=20
)

# Keep a copy of each split with the target column attached to the features,
# for the analyses that need both in one frame.
df_train = pd.concat([X_train, y_train], axis="columns")
df_test = pd.concat([X_test, y_test], axis="columns")

print("The size of train data = ", len(X_train))
print("The size of test data = ", len(X_test))

# The ten base measurements. Each appears three times in the feature frame:
# as "mean <name>", "<name> error" and "worst <name>".
base_features = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal dimension',
]

# One training sub-frame per group, columns in the order listed above.
X_train_mean = X_train[[f'mean {name}' for name in base_features]]
X_train_error = X_train[[f'{name} error' for name in base_features]]
X_train_worst = X_train[[f'worst {name}' for name in base_features]]

# Pairwise correlations between the "mean" features, rounded to two decimals
# so the cell annotations stay readable.
corr = X_train_mean.corr().round(2)

# Draw the matrix as an annotated heatmap on a fixed [-1, 1] colour scale
# centred at 0. The axes are passed explicitly rather than relying on
# Matplotlib's "current axes" state.
f, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    corr,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
    annot_kws={"fontsize": 7},
    ax=ax,
)
plt.show()

# Pairwise correlations between the "error" features, rounded to two decimals
# so the cell annotations stay readable.
corr = X_train_error.corr().round(2)

# Draw the matrix as an annotated heatmap on a fixed [-1, 1] colour scale
# centred at 0. The axes are passed explicitly rather than relying on
# Matplotlib's "current axes" state.
f, ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    corr,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
    annot_kws={"fontsize": 7},
    ax=ax,
)
plt.show()

# Cross-correlation between the "mean" and "worst" features: correlate the
# combined frame, then keep only the mean-feature rows and the worst-feature
# columns (the mean/mean and worst/worst quadrants are dropped).
corr = (
    pd.concat([X_train_mean, X_train_worst], axis=1)
    .corr()
    .round(2)
    .drop(columns=X_train_mean.columns, index=X_train_worst.columns)
)

# Draw the matrix as an annotated heatmap on a fixed [-1, 1] colour scale
# centred at 0. The axes are passed explicitly rather than relying on
# Matplotlib's "current axes" state.
f, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    corr,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
    annot_kws={"fontsize": 7},
    ax=ax,
)
plt.show()

# Drop all unnecessary columns.
cols = [
    # columns related to the "perimeter" and "area"
    'mean perimeter', 'perimeter error', 'mean area', 'area error',
    # columns correlated to the "concavity" and "concave points"
    'mean concavity', 'concavity error',
    'mean concave points', 'concave points error',
    # all the "worst" columns
    'worst radius', 'worst texture', 'worst perimeter', 'worst area',
    'worst smoothness', 'worst compactness', 'worst concavity',
    'worst concave points', 'worst symmetry', 'worst fractal dimension',
]
print("Columns to be removed:")
print(cols)
print("\n")

# Apply the same column selection to every split that exists. The data is
# only split into train and test sets, so there is no validation frame to
# reduce here.
X_train_selected = X_train.drop(columns=cols)
X_test_selected = X_test.drop(columns=cols)

# Verify the remaining columns.
print("Remaining columns:")
print(list(X_train_selected.columns))

# Pairwise correlations between the selected features, rounded to two
# decimals so the cell annotations stay readable.
corr = X_train_selected.corr().round(2)

# Add a 'target' column holding the point-biserial correlation of each
# selected feature with the binary target (element [0] of the result is the
# correlation coefficient). Later cells read this column as corr['target'].
corr['target'] = [
    stats.pointbiserialr(df_train['target'], df_train[feature])[0].round(2)
    for feature in corr.index
]

# Draw the matrix as an annotated heatmap on a fixed [-1, 1] colour scale
# centred at 0. The axes are passed explicitly rather than relying on
# Matplotlib's "current axes" state.
f, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    corr,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
    annot_kws={"fontsize": 7},
    ax=ax,
)
plt.show()

def _plot_feature_histograms(frame, features, target_corr, class_names):
    """Draw a 2x3 grid of histograms, one panel per feature, split by class.

    Args:
        frame: DataFrame containing the feature columns and a 'target' column.
        features: Column names to plot, at most six; panels are filled row
            by row.
        target_corr: Mapping/Series giving each feature's correlation with
            the target; shown in the panel title.
        class_names: Legend labels, in the order of the legend handles that
            seaborn creates for the 'target' hue.

    Returns:
        The (figure, axes array) pair created by ``plt.subplots``.
    """
    fig, axes = plt.subplots(2, 3, figsize=(30, 15))
    for ax, feature in zip(axes.flat, features):
        sns.histplot(frame, x=feature, hue='target', ax=ax)
        ax.set_title(
            feature
            + "\n (correlation with target: r = %.2f)" % target_corr[feature],
            fontsize=14,
        )
        # Rebuild the legend with class names instead of raw class codes.
        # Matplotlib 3.7 renamed Legend.legendHandles to legend_handles and
        # later removed the old name, so try the new one first.
        legend = ax.get_legend()
        handles = getattr(legend, 'legend_handles', None)
        if handles is None:
            handles = legend.legendHandles
        ax.legend(handles=handles, labels=class_names)
    return fig, axes


# Per-class distributions of the selected "mean" features.
columns = ['mean radius', 'mean texture', 'mean smoothness',
           'mean compactness', 'mean symmetry', 'mean fractal dimension']
fig, axes = _plot_feature_histograms(
    df_train, columns, corr['target'], list(target_label.values())
)

# Per-class distributions of the selected "error" features.
columns = ['radius error', 'texture error', 'smoothness error',
           'compactness error', 'symmetry error', 'fractal dimension error']
fig, axes = _plot_feature_histograms(
    df_train, columns, corr['target'], list(target_label.values())
)

# Number of training samples per class, ordered by class code and relabelled
# with the class names from target_label.
class_counts = (
    df_train['target']
    .value_counts()
    .sort_index()
    .rename(index=target_label)
)

# Bar chart of the class balance, with the count printed above each bar.
bars = plt.bar(x=class_counts.index, height=class_counts.values)
plt.bar_label(bars, padding=1)
plt.yticks(ticks=np.arange(0, 300, 25))
plt.title('The Number of Malignant and Benign Samples ')
plt.show()

print("The result of the two sample t-test for each feature")
separator = "====" * 8
print(separator)

# Split the training rows by class once; the split does not depend on the
# feature being tested.
class0_rows = df_train[df_train['target'] == 0]
class1_rows = df_train[df_train['target'] == 1]

for column in X_train_selected.columns:
    # equal_var=False: the two classes are not assumed to share a variance.
    # Only the p-value is reported; the test statistic is discarded.
    _, p_value = stats.ttest_ind(
        class0_rows[column], class1_rows[column], equal_var=False
    )
    print("feature: %s" % column)
    print("corr with target:", corr['target'][column])
    print("p value:%.8f" % p_value)
    # Decision at the 5% significance level.
    if p_value >= 0.05:
        print("Fail to reject null hypothesis")
    else:
        print("Reject null hypothesis")
    print(separator)

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits so no test-set statistics enter the scaling.
std_scaler = StandardScaler().fit(X_train_selected)
X_train_std = std_scaler.transform(X_train_selected)
X_test_std = std_scaler.transform(X_test_selected)

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the logistic-regression search: every combination
# of the values below is a candidate.
# NOTE(review): LogisticRegression does not support every penalty/solver
# pairing in this cross-product (e.g. 'l1' with 'lbfgs') - confirm that the
# candidates which cannot be fitted are intended to be part of the grid.
params = {
    'penalty': ['l1', 'l2', 'elasticnet', None],
    # Ten values spaced evenly on a log scale from 10**-1 to 10**0.5.
    'C': np.logspace(-1, 0.5, 10),
    'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 1000],
}

# Base estimator with default settings; the search overrides the
# hyper-parameters listed in `params` for each candidate.
lr = LogisticRegression()

# Exhaustive search over `params` with 4-fold cross-validation, using all
# available cores (n_jobs=-1) and progress output enabled.
grid = GridSearchCV(
    estimator=lr,
    param_grid=params,
    cv=4,
    verbose=True,
    n_jobs=-1,
)

# Run the search on the standardized training data.
grid.fit(X=X_train_std, y=y_train)

# Check the best combination of the hyper-parameters based on the grid search.
grid.best_params_

# Cross-validated score achieved by that best combination.
grid.best_score_

# Logistic-regression classifier with fixed hyper-parameters. C equals the
# fifth value of np.logspace(-1, 0.5, 10), i.e. one of the searched grid
# points.
# NOTE(review): these settings are hard-coded, presumably copied from
# grid.best_params_ - confirm they still match after re-running the search.
lr_std = LogisticRegression(
    penalty='l2',
    C=0.46415888336127786,
    solver='lbfgs',
    max_iter=100,
)

# Train the model on the standardized training data.
lr_std.fit(X_train_std, y_train)

from sklearn import metrics

# Mean accuracy of the trained classifier on the standardized test set.
score = lr_std.score(X_test_std, y_test)

# Predicted class labels for the test set; the confusion matrix below
# compares them with the true labels.
y_test_hat = lr_std.predict(X_test_std)

# Confusion matrix: rows are actual classes, columns are predicted classes,
# both in sorted label order, so index 0 refers to class 0.
cm = metrics.confusion_matrix(y_test, y_test_hat)

# Class 0 is treated as the "positive" class here (target_label[0]; this is
# 'malignant' in scikit-learn's breast cancer data set):
#   tp = actual class 0 predicted as class 0
#   fn = actual class 0 predicted as class 1
tp = cm[0][0]
fn = cm[0][1]

# False negative rate: share of actual class-0 samples that were missed.
fn_rate = fn / (fn + tp)

# Show the confusion matrix as an annotated heatmap, with class names on
# both axes and the headline metrics in the title.
plt.figure(figsize=(7, 7))
class_names = target_label.values()
sns.heatmap(
    cm,
    annot=True,
    linewidths=.5,
    square=True,
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.ylabel('Actual label', size=13)
plt.xlabel('Predicted label', size=13)
title_text = 'Accuracy Score: {:.5f} \n False Negative Rate: {:.5f}'.format(
    score, fn_rate
)
plt.title(title_text, size=15)
plt.show()