import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from collections import Counter
%matplotlib inline
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# Load the breast cancer data set.
data = load_breast_cancer()
# Map each integer class code to its name as listed in data.target_names.
target_label = dict(enumerate(data.target_names))
# Load the same data again as a feature DataFrame and a target Series.
df_X, df_y = load_breast_cancer(return_X_y=True, as_frame=True)

# Split the data into train and test sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    df_X,
    df_y,
    test_size=0.2,
    random_state=20,
)

# Keep features and target side by side for plotting and statistics.
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

print("The size of train data = ", len(X_train))
print("The size of test data = ", len(X_test))
# The selected columns follow three naming patterns over the same ten base
# measurements: "mean <name>", "<name> error" and "worst <name>".
# Build one training-set view per pattern, keeping the measurement order.
_base_measurements = [
    'radius',
    'texture',
    'perimeter',
    'area',
    'smoothness',
    'compactness',
    'concavity',
    'concave points',
    'symmetry',
    'fractal dimension',
]
X_train_mean = X_train[[f'mean {name}' for name in _base_measurements]]
X_train_error = X_train[[f'{name} error' for name in _base_measurements]]
X_train_worst = X_train[[f'worst {name}' for name in _base_measurements]]
def _plot_corr_heatmap(corr_matrix, figsize):
    """Draw an annotated correlation heatmap on a new figure and show it.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Correlation coefficients to display (rows and columns may differ).
    figsize : tuple
        Figure size forwarded to ``plt.subplots``.

    Returns
    -------
    tuple
        The ``(figure, axes)`` pair created for the plot.
    """
    fig, axis = plt.subplots(figsize=figsize)
    # The colour scale is pinned to [-1, 1] and centred on 0 so that colours
    # are comparable between the different heatmaps.
    sns.heatmap(
        corr_matrix,
        cmap='coolwarm',
        vmin=-1,
        vmax=1,
        center=0,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
        annot=True,
        annot_kws={"fontsize": 7},
        ax=axis,
    )
    plt.show()
    return fig, axis


# Correlations among the "mean" features.
corr = X_train_mean.corr().round(2)
f, ax = _plot_corr_heatmap(corr, figsize=(6, 6))

# Correlations among the "error" features.
corr = X_train_error.corr().round(2)
f, ax = _plot_corr_heatmap(corr, figsize=(6, 6))

# Cross-correlations between the "mean" features (rows) and the "worst"
# features (columns): compute the joint matrix, then keep only that quadrant.
corr = pd.concat([X_train_mean, X_train_worst], axis=1).corr().round(2)
corr = corr.loc[X_train_mean.columns, X_train_worst.columns]
f, ax = _plot_corr_heatmap(corr, figsize=(8, 8))
# Columns to drop from the feature matrices before modelling.
cols = [
    # columns related to the "perimeter" and "area"
    'mean perimeter',
    'perimeter error',
    'mean area',
    'area error',
    # columns correlated to the "concavity" and "concave points"
    'mean concavity',
    'concavity error',
    'mean concave points',
    'concave points error',
    # all the "worst" columns
    'worst radius',
    'worst texture',
    'worst perimeter',
    'worst area',
    'worst smoothness',
    'worst compactness',
    'worst concavity',
    'worst concave points',
    'worst symmetry',
    'worst fractal dimension',
]
print("Columns to be removed:")
print(cols)
print("\n")

# Only a train/test split exists (no validation frame is ever created), so
# the same columns are removed from exactly those two feature frames.
X_train_selected = X_train.drop(columns=cols)
X_test_selected = X_test.drop(columns=cols)

# Verify the remaining columns.
print("Remaining columns:")
print(list(X_train_selected.columns))
# Feature-to-feature correlations of the retained training columns.
corr = X_train_selected.corr().round(2)

# Add one extra column holding the point-biserial correlation between each
# retained feature and the binary target, rounded like the rest of the matrix.
corr['target'] = [
    stats.pointbiserialr(df_train['target'], df_train[feature])[0].round(2)
    for feature in corr.index
]

# Draw the heatmap; the colour scale is pinned to [-1, 1] and centred on 0.
f, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    corr,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=.5,
    cbar_kws={"shrink": .5},
    annot=True,
    annot_kws={"fontsize": 7},
    ax=ax,
)
plt.show()
def _plot_feature_histograms(feature_names):
    """Plot per-class histograms of up to six training features on a 2x3 grid.

    Each panel is titled with the feature name and its correlation with the
    target (read from ``corr['target']``), and the legend entries are
    relabelled with the class names stored in ``target_label``.

    Parameters
    ----------
    feature_names : list of str
        Columns of ``df_train`` to plot, filled row by row into the grid.

    Returns
    -------
    tuple
        The ``(figure, axes_array)`` pair created for the grid.
    """
    fig, axes = plt.subplots(2, 3, figsize=(30, 15))
    class_names = list(target_label.values())
    # axes.flat walks the grid row by row, matching (i // 3, i % 3) indexing.
    for axis, feature in zip(axes.flat, feature_names):
        sns.histplot(df_train, x=feature, hue='target', ax=axis)
        axis.set_title(
            feature
            + "\n (correlation with target: r = %.2f)" % corr['target'][feature],
            fontsize=14,
        )
        legend = axis.get_legend()
        # Matplotlib renamed Legend.legendHandles to Legend.legend_handles
        # (3.7) and later removed the old name; support both spellings.
        handles = getattr(legend, "legend_handles", None)
        if handles is None:
            handles = legend.legendHandles
        axis.legend(handles=handles, labels=class_names)
    return fig, axes


# Distributions of the retained "mean" features, split by class.
columns = ['mean radius', 'mean texture', 'mean smoothness', 'mean compactness','mean symmetry', 'mean fractal dimension']
fig, axes = _plot_feature_histograms(columns)

# Distributions of the retained "error" features, split by class.
columns = ['radius error', 'texture error', 'smoothness error', 'compactness error', 'symmetry error', 'fractal dimension error']
fig, axes = _plot_feature_histograms(columns)
# Count the training samples per class, order them by class code and swap
# the integer codes for the class names.
class_counts = df_train['target'].value_counts().sort_index()
target = class_counts.rename(index=target_label)

# Bar chart of the class balance, with the count written above each bar.
bar1 = plt.bar(x=target.index, height=target.values)
plt.bar_label(bar1, padding=1)
plt.yticks(ticks=np.arange(0, 300, 25))
plt.title('The Number of Malignant and Benign Samples ')
plt.show()
print("The result of the two sample t-test for each feature")
separator = "====" * 8
print(separator)

# Split the training rows by class once instead of on every iteration.
class0_rows = df_train[df_train['target'] == 0]
class1_rows = df_train[df_train['target'] == 1]
target_corr = corr['target']

for column in X_train_selected.columns:
    # equal_var=False: do not assume the two classes share a variance
    # (Welch's t-test in scipy).
    ttest, p_value = stats.ttest_ind(
        class0_rows[column],
        class1_rows[column],
        equal_var=False,
    )
    print("feature: %s" % column)
    print("corr with target:", target_corr[column])
    print("p value:%.8f" % p_value)
    # Significance threshold of 0.05.
    verdict = (
        "Reject null hypothesis"
        if p_value < 0.05
        else "Fail to reject null hypothesis"
    )
    print(verdict)
    print(separator)
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to the test features.
std_scaler = StandardScaler()
X_train_std = std_scaler.fit_transform(X_train_selected)
X_test_std = std_scaler.transform(X_test_selected)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Values shared by every sub-grid.
_c_values = np.logspace(-1, 0.5, 10)
_max_iters = [100, 1000]

# Not every solver supports every penalty, so the search space is a list of
# sub-grids containing only combinations scikit-learn accepts. A single
# Cartesian grid would schedule many fits that can only fail.
# 'elasticnet' is left out: it needs an l1_ratio value, which this search
# does not tune.
params = [
    {
        'solver': ['lbfgs', 'newton-cg', 'sag'],
        'penalty': ['l2', None],
        'C': _c_values,
        'max_iter': _max_iters,
    },
    {
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2'],
        'C': _c_values,
        'max_iter': _max_iters,
    },
    {
        'solver': ['saga'],
        'penalty': ['l1', 'l2', None],
        'C': _c_values,
        'max_iter': _max_iters,
    },
]

lr = LogisticRegression()
# 4-fold cross-validated search over the standardized training data.
grid = GridSearchCV(lr, param_grid=params, cv=4, verbose=True, n_jobs=-1)
grid.fit(X_train_std, y_train)

# Report the best hyper-parameter combination and its mean CV score.
# Printed explicitly because a bare expression is only displayed when it is
# the last statement of a notebook cell.
print("Best parameters:", grid.best_params_)
print("Best cross-validation score:", grid.best_score_)
# Create a Logistic Regression classifier from the best hyper-parameters
# found by the grid search. Reading them from grid.best_params_ (instead of
# a copied literal) keeps the model in sync if the search is re-run.
lr_std = LogisticRegression(**grid.best_params_)
# Train the model with the standardized train data.
lr_std.fit(X_train_std, y_train)
from sklearn import metrics

# Predict the test labels; the confusion matrix below needs them.
y_test_hat = lr_std.predict(X_test_std)
# Mean accuracy of the classifier on the standardized test data.
score = lr_std.score(X_test_std, y_test)

# Rows are actual labels, columns are predicted labels, ordered by class code.
cm = metrics.confusion_matrix(y_test, y_test_hat)
# Class 0 (the first entry of target_label) is treated as the positive class:
# cm[0][0] counts class-0 samples predicted as 0 and cm[0][1] counts class-0
# samples predicted as 1, i.e. the missed positives.
tp = cm[0][0]
fn = cm[0][1]
fn_rate = fn / (fn + tp)

class_names = list(target_label.values())
plt.figure(figsize=(7, 7))
# fmt='d' prints the counts as plain integers rather than in the default
# general format, which switches to scientific notation for larger values.
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    linewidths=.5,
    square=True,
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.ylabel('Actual label', size=13)
plt.xlabel('Predicted label', size=13)
all_sample_title = 'Accuracy Score: {:.5f} \n False Negative Rate: {:.5f}'.format(score, fn_rate)
plt.title(all_sample_title, size=15)
plt.show()