mammoCAD

import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from kaggle_datasets import KaggleDatasets
from sklearn.model_selection import train_test_split

# Choose the distribution strategy: use a TPU when one can be resolved and
# initialised, otherwise fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as err:
    # Still a best-effort fallback, but unlike a bare `except:` this does not
    # swallow KeyboardInterrupt/SystemExit, and the reason is reported
    # instead of being silently discarded.
    print('TPU not available, using the default strategy:', repr(err))
    strategy = tf.distribute.get_strategy()

print('Number of replicas:', strategy.num_replicas_in_sync)
print(tf.__version__)

from kaggle_datasets import KaggleDatasets
from kaggle_secrets import UserSecretsClient

# Fetch the Cloud SDK credential attached to this session ...
user_secrets = UserSecretsClient()
user_credential = user_secrets.get_gcloud_credential()

# ... hand it to TensorFlow ...
user_secrets.set_tensorflow_credential(user_credential)

# ... and look up the GCS path of the dataset.
GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Global settings shared by the input pipeline and the training runs.
GCS_PATH = KaggleDatasets().get_gcs_path()
AUTOTUNE = tf.data.experimental.AUTOTUNE
# 16 images per replica, scaled by the number of replicas in sync.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
IMAGE_SIZE = [512, 512]
EPOCHS = 100
print('done')

# Gather every JPEG from the two class folders (Cancer first, then Normal).
filenames = []
for pattern in (
    '/MINI-DDSM CANCER-NORMAL DATASET/Cancer/*.jpg',
    '/MINI-DDSM CANCER-NORMAL DATASET/Normal/*.jpg',
):
    filenames.extend(tf.io.gfile.glob(str(GCS_PATH + pattern)))

# Hold out 20% of the files for validation.
train_filenames, val_filenames = train_test_split(filenames, test_size=0.2)
print('done')

# Per-class counts, based on the class name appearing in the file path.
COUNT_NORMAL = sum("Normal" in path for path in train_filenames)
print("Normal images count in training set: " + str(COUNT_NORMAL))

COUNT_CANCER = sum("Cancer" in path for path in train_filenames)
print("Cancer images count in training set: " + str(COUNT_CANCER))

COUNT_NORMAL_V = sum("Normal" in path for path in val_filenames)
print("Normal images count in testing set: " + str(COUNT_NORMAL_V))

COUNT_CANCER_V = sum("Cancer" in path for path in val_filenames)
print("Cancer images count in testing set: " + str(COUNT_CANCER_V))

# Wrap the filename lists in tf.data datasets.
train_list_ds = tf.data.Dataset.from_tensor_slices(train_filenames)
val_list_ds = tf.data.Dataset.from_tensor_slices(val_filenames)

# Peek at the first five training paths.
for path_tensor in train_list_ds.take(5):
    print(path_tensor.numpy())

# Dataset sizes, used later to derive steps per epoch.
TRAIN_IMG_COUNT = tf.data.experimental.cardinality(train_list_ds).numpy()
print(f"Training images count: {TRAIN_IMG_COUNT}")

VAL_IMG_COUNT = tf.data.experimental.cardinality(val_list_ds).numpy()
print(f"Validating images count: {VAL_IMG_COUNT}")

# Class names are the last path component of each entry directly under the
# dataset root. `tf.io.gfile.glob` already returns Python strings, so the name
# is taken straight from the path instead of going through a string tensor and
# slicing the repr of its bytes (`str(b'..')[2:-1]`), which mangles any
# non-ASCII name. GCS paths always use "/" as separator, whatever the host OS.
CLASS_NAMES = np.array([
    class_dir.split("/")[-1]
    for class_dir in tf.io.gfile.glob(str(GCS_PATH + "/MINI-DDSM CANCER-NORMAL DATASET/*"))
])
CLASS_NAMES

def get_label(file_path):
    """Return a boolean tensor telling whether the image is a cancer case.

    The label is read from the path itself: the second-to-last path
    component is the class directory, and the result is True when that
    directory is named "Cancer".
    """
    path_components = tf.strings.split(file_path, os.path.sep)
    class_dir = path_components[-2]
    return class_dir == "Cancer"


print('done')

def decode_img(img):
    """Decode JPEG bytes into a float32 image resized to IMAGE_SIZE.

    The image is decoded with three channels, converted to float32 with
    `convert_image_dtype` (values in the [0, 1] range) and then resized.
    """
    decoded = tf.image.decode_jpeg(img, channels=3)
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    return tf.image.resize(as_float, IMAGE_SIZE)


print('done')

def process_path(file_path):
    """Map a file path to an `(image, label)` pair.

    The file is read as raw bytes and decoded with `decode_img`; the label
    comes from `get_label` applied to the same path.
    """
    raw_bytes = tf.io.read_file(file_path)
    return decode_img(raw_bytes), get_label(file_path)


print('done')

# Turn each filename dataset into an (image, label) dataset. The test split
# is built from the same validation filenames as `val_ds`.
train_ds, val_ds, test_ds = (
    file_ds.map(process_path, num_parallel_calls=AUTOTUNE)
    for file_ds in (train_list_ds, val_list_ds, val_list_ds)
)
print('done')

# Inspect one decoded training example.
for sample_image, sample_label in train_ds.take(1):
    print("Image shape: ", sample_image.numpy().shape)
    print("Label: ", sample_label.numpy())

def prepare_for_training(ds, cache=True, shuffle_buffer_size=1000):
    """Cache, shuffle, repeat, batch and prefetch a dataset for training.

    Args:
        ds: dataset to prepare.
        cache: falsy to skip caching; a string to cache to that file name
            (for preprocessing results that do not fit in memory); any other
            truthy value to cache in memory.
        shuffle_buffer_size: buffer size passed to `shuffle`.

    Returns:
        The dataset, repeated forever and batched by BATCH_SIZE, with
        batches prefetched in the background while the model trains.
    """
    if cache:
        ds = ds.cache(cache) if isinstance(cache, str) else ds.cache()

    return (
        ds.shuffle(buffer_size=shuffle_buffer_size)
        .repeat()
        .batch(BATCH_SIZE)
        .prefetch(buffer_size=AUTOTUNE)
    )


print('done')

# Run both splits through the training input pipeline, then pull one batch
# of training images and labels for inspection.
train_ds, val_ds = map(prepare_for_training, (train_ds, val_ds))
image_batch, label_batch = next(iter(train_ds))
print('done')

def show_batch(image_batch, label_batch):
    """Plot up to 25 images of a batch on a 5x5 grid, titled by class.

    Args:
        image_batch: indexable batch of images accepted by `plt.imshow`.
        label_batch: indexable batch of labels; a truthy label is titled
            "CANCER", a falsy one "NORMAL".
    """
    plt.figure(figsize=(10, 10))
    # A batch can hold fewer than 25 images (BATCH_SIZE is 16 per replica),
    # so never index past the end of either input.
    shown = min(25, len(image_batch), len(label_batch))
    for n in range(shown):
        plt.subplot(5, 5, n + 1)
        plt.imshow(image_batch[n])
        plt.title("CANCER" if label_batch[n] else "NORMAL")
        plt.axis("off")


print('done')

# Convert the sampled batch to NumPy arrays and display it.
batch_images = image_batch.numpy()
batch_labels = label_batch.numpy()
show_batch(batch_images, batch_labels)

def build_model():
    """Build the (uncompiled) CNN for binary classification of 512x512 RGB images.

    Layout: two plain 3x3 convolutions, four stages of paired separable
    convolutions (32, 64, 128, 256 filters) with batch normalisation, then
    three dense stages (512, 128, 64 units) and a single sigmoid output.
    """
    conv_kwargs = dict(activation='relu', padding='same')

    def separable_stage(filters):
        # Two separable 3x3 convolutions followed by batch normalisation.
        return [
            tf.keras.layers.SeparableConv2D(filters, 3, **conv_kwargs),
            tf.keras.layers.SeparableConv2D(filters, 3, **conv_kwargs),
            tf.keras.layers.BatchNormalization(),
        ]

    def dense_stage(units, drop_rate):
        # Dense layer, batch normalisation, then dropout.
        return [
            tf.keras.layers.Dense(units, activation='relu'),
            tf.keras.layers.BatchNormalization(),
            tf.keras.layers.Dropout(drop_rate),
        ]

    stack = [
        tf.keras.Input(shape=(512, 512, 3)),
        tf.keras.layers.Conv2D(16, 3, **conv_kwargs),
        tf.keras.layers.Conv2D(16, 3, **conv_kwargs),
        tf.keras.layers.MaxPool2D(),
    ]
    stack += separable_stage(32) + [tf.keras.layers.MaxPool2D()]
    stack += separable_stage(64) + [tf.keras.layers.MaxPool2D()]
    stack += separable_stage(128) + [
        tf.keras.layers.MaxPool2D(),
        tf.keras.layers.Dropout(0.2),
    ]
    # The last convolutional stage is not pooled before flattening.
    stack += separable_stage(256) + [
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Flatten(),
    ]
    stack += dense_stage(512, 0.7)
    stack += dense_stage(128, 0.5)
    stack += dense_stage(64, 0.3)
    stack.append(tf.keras.layers.Dense(1, activation='sigmoid'))

    return tf.keras.Sequential(stack)


print('done')

# Create and compile the model inside the strategy scope so that its
# variables and metrics are created under the chosen strategy.
with strategy.scope():
    model = build_model()

    precision_metric = tf.keras.metrics.Precision(name='precision')
    recall_metric = tf.keras.metrics.Recall(name='recall')
    METRICS = ['accuracy', precision_metric, recall_metric]

    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=METRICS,
    )

model.summary()

# Log of the cancer-to-normal ratio in the training set.
cancer_to_normal = COUNT_CANCER / COUNT_NORMAL
initial_bias = np.log([cancer_to_normal])
initial_bias

# Weight each class inversely to its frequency, scaled by half the number of
# training images.
weight_for_0 = (1 / COUNT_NORMAL) * TRAIN_IMG_COUNT / 2.0
weight_for_1 = (1 / COUNT_CANCER) * TRAIN_IMG_COUNT / 2.0

class_weight = {0: weight_for_0, 1: weight_for_1}

print(f'Weight for class 0: {weight_for_0:.2f}')
print(f'Weight for class 1: {weight_for_1:.2f}')

# Run 1: plain training for 150 epochs, no callbacks or class weights.
fit_options = dict(
    steps_per_epoch=TRAIN_IMG_COUNT // BATCH_SIZE,
    epochs=150,
    validation_data=val_ds,
    validation_steps=VAL_IMG_COUNT // BATCH_SIZE,
)
history = model.fit(train_ds, **fit_options)

model.save('./mammoCAD_V1.h5')

# Two figures: recall/precision, then training vs. validation accuracy.
history_df = pd.DataFrame(history.history)
for columns in (['recall', 'precision'], ['accuracy', 'val_accuracy']):
    history_df[columns].plot()

# Keep only the best model seen so far on disk.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath="mammoCAD_model.h5",
    save_best_only=True,
)

# Stop after 10 epochs without improvement and roll back to the best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=10,
)

def exponential_decay(lr0, s):
    """Return a schedule `epoch -> lr0 * 0.1 ** (epoch / s)`.

    Args:
        lr0: learning rate at epoch 0.
        s: number of epochs over which the rate shrinks by a factor of 10.
    """
    def schedule(epoch):
        exponent = epoch / s
        return lr0 * 0.1 ** exponent

    return schedule


# Start at 0.01 and divide the learning rate by 10 every 20 epochs.
exponential_decay_fn = exponential_decay(0.01, 20)
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(exponential_decay_fn)

# Run 2: up to 200 epochs with class weights, checkpointing, early stopping
# and the exponential learning-rate schedule.
fit_options = dict(
    steps_per_epoch=TRAIN_IMG_COUNT // BATCH_SIZE,
    epochs=200,
    validation_data=val_ds,
    validation_steps=VAL_IMG_COUNT // BATCH_SIZE,
    class_weight=class_weight,
    callbacks=[checkpoint_cb, early_stopping_cb, lr_scheduler],
)
history = model.fit(train_ds, **fit_options)

model.save('./mammoCAD_V2.h5')

# One panel per metric, training curve against validation curve.
fig, ax = plt.subplots(1, 4, figsize=(20, 3))
ax = ax.ravel()
for panel, met in zip(ax, ['precision', 'recall', 'accuracy', 'loss']):
    panel.plot(history.history[met])
    panel.plot(history.history['val_' + met])
    panel.set_title(f'Model {met}')
    panel.set_xlabel('epochs')
    panel.set_ylabel(met)
    panel.legend(['train', 'val'])

# Run 3: continue training for 55 epochs without callbacks.
fit_options = dict(
    steps_per_epoch=TRAIN_IMG_COUNT // BATCH_SIZE,
    epochs=55,
    validation_data=val_ds,
    validation_steps=VAL_IMG_COUNT // BATCH_SIZE,
)
history = model.fit(train_ds, **fit_options)

model.save('./mammoCAD_V3.h5')

# One panel per metric, training curve against validation curve.
fig, ax = plt.subplots(1, 4, figsize=(20, 3))
ax = ax.ravel()
for panel, met in zip(ax, ['precision', 'recall', 'accuracy', 'loss']):
    panel.plot(history.history[met])
    panel.plot(history.history['val_' + met])
    panel.set_title(f'Model {met}')
    panel.set_xlabel('epochs')
    panel.set_ylabel(met)
    panel.legend(['train', 'val'])

class CustomCallback(tf.keras.callbacks.Callback):
    """Stop training once validation accuracy or validation recall reaches 0.9."""

    def on_epoch_end(self, epoch, logs=None):
        """Request a stop when either monitored validation metric is >= 0.9.

        Args:
            epoch: index of the epoch that just ended (unused).
            logs: metrics dict for the epoch; may be None or lack the
                monitored keys, in which case those metrics are ignored.
        """
        # `logs` defaults to None and a metric may be absent; comparing None
        # against a float would raise, so skip anything that is missing.
        logs = logs or {}
        monitored = (logs.get('val_accuracy'), logs.get('val_recall'))
        if any(value is not None and value >= 9e-1 for value in monitored):
            self.model.stop_training = True


callback = CustomCallback()

# Run 4: up to 300 epochs, stopped early by the threshold callback.
fit_options = dict(
    steps_per_epoch=TRAIN_IMG_COUNT // BATCH_SIZE,
    epochs=300,
    validation_data=val_ds,
    validation_steps=VAL_IMG_COUNT // BATCH_SIZE,
    callbacks=[callback],
)
history = model.fit(train_ds, **fit_options)

model.save('./mammoCAD_V4.h5')

# One panel per metric, training curve against validation curve.
fig, ax = plt.subplots(1, 4, figsize=(20, 3))
ax = ax.ravel()
for panel, met in zip(ax, ['precision', 'recall', 'accuracy', 'loss']):
    panel.plot(history.history[met])
    panel.plot(history.history['val_' + met])
    panel.set_title(f'Model {met}')
    panel.set_xlabel('epochs')
    panel.set_ylabel(met)
    panel.legend(['train', 'val'])

class CustomCallback(tf.keras.callbacks.Callback):
    """Stop training once validation accuracy reaches 0.91 or validation recall reaches 0.98."""

    def on_epoch_end(self, epoch, logs=None):
        """Request a stop when either validation metric meets its threshold.

        Args:
            epoch: index of the epoch that just ended (unused).
            logs: metrics dict for the epoch; may be None or lack the
                monitored keys, in which case those metrics are ignored.
        """
        # `logs` defaults to None and a metric may be absent; comparing None
        # against a float would raise, so skip anything that is missing.
        logs = logs or {}
        val_accuracy = logs.get('val_accuracy')
        val_recall = logs.get('val_recall')

        accuracy_reached = val_accuracy is not None and val_accuracy >= 91e-2
        recall_reached = val_recall is not None and val_recall >= 98e-2
        if accuracy_reached or recall_reached:
            self.model.stop_training = True


callback = CustomCallback()

# Final run: up to 300 epochs, stopped early by the threshold callback.
fit_options = dict(
    steps_per_epoch=TRAIN_IMG_COUNT // BATCH_SIZE,
    epochs=300,
    validation_data=val_ds,
    validation_steps=VAL_IMG_COUNT // BATCH_SIZE,
    callbacks=[callback],
)
history = model.fit(train_ds, **fit_options)

model.save('./mammoCAD_V6.h5')

# One panel per metric, training curve against validation curve.
fig, ax = plt.subplots(1, 4, figsize=(20, 3))
ax = ax.ravel()
for panel, met in zip(ax, ['precision', 'recall', 'accuracy', 'loss']):
    panel.plot(history.history[met])
    panel.plot(history.history['val_' + met])
    panel.set_title(f'Model {met}')
    panel.set_xlabel('epochs')
    panel.set_ylabel(met)
    panel.legend(['train', 'val'])

# Thanks for reading my code. If you plan to include it in your work, please cite me properly.
# Thanks again!
# This code was inspired by the work of Amy Jang:
# https://www.kaggle.com/code/amyjang/tensorflow-pneumonia-classification-on-x-rays
# and Abhinav Sagar:
# https://www.kdnuggets.com/2019/10/convolutional-neural-network-breast-cancer-classification.html