import re
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from kaggle_datasets import KaggleDatasets
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# Pick a distribution strategy: use the TPU when one can be resolved and
# initialised, otherwise fall back to TensorFlow's default strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Device:', tpu.master())
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # `Exception` (rather than a bare `except:`) keeps the best-effort
    # fallback but lets KeyboardInterrupt / SystemExit propagate, and the
    # reason for the fallback is reported instead of silently dropped.
    print('TPU not available, using default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()
print('Number of replicas:', strategy.num_replicas_in_sync)
print(tf.__version__)
# Step 1: Get the credential from the Cloud SDK
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
user_credential = user_secrets.get_gcloud_credential()
# Step 2: Set the credentials
user_secrets.set_tensorflow_credential(user_credential)
# Step 3: Use a familiar call to get the GCS path of the dataset
from kaggle_datasets import KaggleDatasets
GCS_DS_PATH = KaggleDatasets().get_gcs_path()
# Both names are kept for backward compatibility, but the bucket path is
# resolved only once instead of issuing a second identical lookup.
GCS_PATH = GCS_DS_PATH

# Input-pipeline and training configuration.
AUTOTUNE = tf.data.experimental.AUTOTUNE
# 16 samples per replica, scaled by however many replicas the strategy has.
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
IMAGE_SIZE = [512, 512]
EPOCHS = 100
print('done')
# Collect every JPEG from the two class folders, then hold out 20% of the
# files for validation.
filenames = (
    tf.io.gfile.glob(str(GCS_PATH + '/MINI-DDSM CANCER-NORMAL DATASET/Cancer/*.jpg'))
    + tf.io.gfile.glob(str(GCS_PATH + '/MINI-DDSM CANCER-NORMAL DATASET/Normal/*.jpg'))
)
train_filenames, val_filenames = train_test_split(filenames, test_size=0.2)
print('done')

# Per-class counts, derived from the class name appearing in each path.
COUNT_NORMAL = sum(1 for path in train_filenames if "Normal" in path)
print("Normal images count in training set: " + str(COUNT_NORMAL))
COUNT_CANCER = sum(1 for path in train_filenames if "Cancer" in path)
print("Cancer images count in training set: " + str(COUNT_CANCER))
COUNT_NORMAL_V = sum(1 for path in val_filenames if "Normal" in path)
print("Normal images count in testing set: " + str(COUNT_NORMAL_V))
COUNT_CANCER_V = sum(1 for path in val_filenames if "Cancer" in path)
print("Cancer images count in testing set: " + str(COUNT_CANCER_V))
# Wrap the filename lists in tf.data datasets (one string element per file).
train_list_ds = tf.data.Dataset.from_tensor_slices(train_filenames)
val_list_ds = tf.data.Dataset.from_tensor_slices(val_filenames)

# Preview the first few training paths.
for f in train_list_ds.take(5):
    print(f.numpy())

# Dataset sizes, reused later to derive steps per epoch.
TRAIN_IMG_COUNT = tf.data.experimental.cardinality(train_list_ds).numpy()
print("Training images count: " + str(TRAIN_IMG_COUNT))

VAL_IMG_COUNT = tf.data.experimental.cardinality(val_list_ds).numpy()
print("Validating images count: " + str(VAL_IMG_COUNT))
# Class names are the last path component of each entry directly under the
# dataset root. The bytes returned by `.numpy()` are decoded properly instead
# of slicing the "b'...'" repr, which would mangle non-ASCII names.
CLASS_NAMES = np.array([
    tf.strings.split(item, os.path.sep)[-1].numpy().decode('utf-8')
    for item in tf.io.gfile.glob(str(GCS_PATH + "/MINI-DDSM CANCER-NORMAL DATASET/*"))
])
CLASS_NAMES
def get_label(file_path):
    """Return a boolean tensor that is True when the file's parent folder is "Cancer".

    Args:
        file_path: string tensor holding the path of one image file.
    """
    # Break the path into its components; the class folder is the one
    # immediately before the file name.
    path_components = tf.strings.split(file_path, os.path.sep)
    class_folder = path_components[-2]
    return class_folder == "Cancer"
print('done')
def decode_img(img):
    """Decode JPEG bytes into a float32 RGB image resized to IMAGE_SIZE.

    Args:
        img: scalar string tensor with the encoded JPEG contents.

    Returns:
        The resized image tensor with three channels.
    """
    # Compressed bytes -> 3-channel uint8 tensor.
    decoded = tf.image.decode_jpeg(img, channels=3)
    # uint8 -> float32 scaled into the [0, 1] range.
    as_float = tf.image.convert_image_dtype(decoded, tf.float32)
    # Bring every image to the common spatial size.
    return tf.image.resize(as_float, IMAGE_SIZE)
print('done')
def process_path(file_path):
    """Map an image path to an ``(image, label)`` pair.

    The label comes from `get_label` and the image from reading the file and
    passing its raw contents through `decode_img`.
    """
    label = get_label(file_path)
    # Read the file contents as a raw string, then decode and resize them.
    raw_contents = tf.io.read_file(file_path)
    return decode_img(raw_contents), label
print('done')
# Turn each filename dataset into an (image, label) dataset. Note that the
# "test" dataset is built from the same validation filenames.
train_ds, val_ds, test_ds = (
    path_ds.map(process_path, num_parallel_calls=AUTOTUNE)
    for path_ds in (train_list_ds, val_list_ds, val_list_ds)
)
print('done')

# Sanity check: inspect a single decoded training example.
for image, label in train_ds.take(1):
    print("Image shape: ", image.numpy().shape)
    print("Label: ", label.numpy())
def prepare_for_training(ds, cache=True, shuffle_buffer_size=1000):
    """Cache, shuffle, repeat, batch and prefetch a dataset for `model.fit`.

    Args:
        ds: dataset of individual examples.
        cache: falsy to skip caching; a non-empty string to cache to that
            file (for data whose preprocessing does not fit in memory); any
            other truthy value to cache in memory.
        shuffle_buffer_size: buffer size handed to `shuffle`.

    Returns:
        A dataset that repeats forever and yields batches of BATCH_SIZE.
    """
    if cache and isinstance(cache, str):
        ds = ds.cache(cache)
    elif cache:
        ds = ds.cache()

    # Shuffle, repeat indefinitely, batch, and let `prefetch` fetch upcoming
    # batches in the background while the model is training.
    return (
        ds.shuffle(buffer_size=shuffle_buffer_size)
        .repeat()
        .batch(BATCH_SIZE)
        .prefetch(buffer_size=AUTOTUNE)
    )
print('done')
# Run both splits through the same caching/shuffling/batching pipeline.
train_ds, val_ds = (
    prepare_for_training(split) for split in (train_ds, val_ds)
)
# Pull one training batch for visual inspection.
image_batch, label_batch = next(iter(train_ds))
print('done')
def show_batch(image_batch, label_batch):
    """Plot up to 25 images of a batch in a 5x5 grid, titled by class.

    Args:
        image_batch: indexable batch of images accepted by `plt.imshow`.
        label_batch: indexable batch of truthy (cancer) / falsy (normal) labels.
    """
    plt.figure(figsize=(10, 10))
    # A batch can hold fewer than 25 examples (BATCH_SIZE is 16 on a single
    # replica), so never index past the end of either array.
    shown = min(25, len(image_batch), len(label_batch))
    for n in range(shown):
        plt.subplot(5, 5, n + 1)
        plt.imshow(image_batch[n])
        plt.title("CANCER" if label_batch[n] else "NORMAL")
        plt.axis("off")
print('done')
show_batch(image_batch.numpy(), label_batch.numpy())
def build_model():
    """Build the (uncompiled) convolutional binary classifier.

    The network takes 512x512x3 inputs: two plain 3x3 convolutions, four
    pairs of separable convolutions (32, 64, 128 and 256 filters) each
    followed by batch normalisation, a flatten, three dense blocks
    (512, 128, 64 units) with batch normalisation and dropout, and one
    sigmoid output unit.

    Returns:
        A `tf.keras.Sequential` model.
    """
    layers = tf.keras.layers

    def separable_pair(filters):
        # Two 3x3 separable convolutions followed by batch normalisation.
        return [
            layers.SeparableConv2D(filters, 3, activation='relu', padding='same'),
            layers.SeparableConv2D(filters, 3, activation='relu', padding='same'),
            layers.BatchNormalization(),
        ]

    def dense_block(units, drop_rate):
        # Fully connected layer, batch normalisation, then dropout.
        return [
            layers.Dense(units, activation='relu'),
            layers.BatchNormalization(),
            layers.Dropout(drop_rate),
        ]

    stack = [
        tf.keras.Input(shape=(512, 512, 3)),
        layers.Conv2D(16, 3, activation='relu', padding='same'),
        layers.Conv2D(16, 3, activation='relu', padding='same'),
        layers.MaxPool2D(),
    ]

    # 32- and 64-filter stages: separable pair, then pooling.
    for filters in (32, 64):
        stack += separable_pair(filters)
        stack.append(layers.MaxPool2D())

    # 128-filter stage: pooling followed by dropout.
    stack += separable_pair(128)
    stack += [layers.MaxPool2D(), layers.Dropout(0.2)]

    # 256-filter stage: dropout only, no pooling.
    stack += separable_pair(256)
    stack.append(layers.Dropout(0.2))

    # Classifier head.
    stack.append(layers.Flatten())
    for units, drop_rate in ((512, 0.7), (128, 0.5), (64, 0.3)):
        stack += dense_block(units, drop_rate)
    stack.append(layers.Dense(1, activation='sigmoid'))

    return tf.keras.Sequential(stack)
print('done')
# Build and compile inside the strategy scope so the model and the metric
# objects are created under the selected distribution strategy.
with strategy.scope():
    model = build_model()

    # Accuracy plus named precision/recall; the names are the keys that
    # later show up in the training history.
    METRICS = [
        'accuracy',
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall'),
    ]

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=METRICS)

model.summary()
# Log-odds of the cancer class in the training split. It is displayed here
# but not passed to the model anywhere in the visible code.
initial_bias = np.log([COUNT_CANCER / COUNT_NORMAL])
initial_bias

# Weight each class inversely to its frequency, scaled by half the training
# set size.
weight_for_0, weight_for_1 = (
    (1 / class_count) * (TRAIN_IMG_COUNT) / 2.0
    for class_count in (COUNT_NORMAL, COUNT_CANCER)
)
class_weight = {0: weight_for_0, 1: weight_for_1}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))
# Run 1: plain training without class weights or callbacks. The datasets
# repeat forever, so the step counts define the length of an epoch.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=150,
    steps_per_epoch=TRAIN_IMG_COUNT // BATCH_SIZE,
    validation_steps=VAL_IMG_COUNT // BATCH_SIZE,
)
model.save('./mammoCAD_V1.h5')

# Plot recall/precision, then training vs validation accuracy.
history_df = pd.DataFrame(history.history)
history_df[['recall', 'precision']].plot()

history_df = pd.DataFrame(history.history)
history_df[['accuracy', 'val_accuracy']].plot()
# Save the model to disk only when it improves on the best result so far.
checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    "mammoCAD_model.h5",
    save_best_only=True,
)
# Stop after 10 epochs without improvement and restore the best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    patience=10,
    restore_best_weights=True,
)
def exponential_decay(lr0, s):
    """Create an epoch -> learning-rate schedule with exponential decay.

    Args:
        lr0: learning rate at epoch 0.
        s: number of epochs over which the rate shrinks by a factor of 10.

    Returns:
        A function of the epoch index returning ``lr0 * 0.1 ** (epoch / s)``.
    """
    def schedule(epoch):
        decay_factor = 0.1 ** (epoch / s)
        return lr0 * decay_factor

    return schedule
# Learning-rate schedule: start at 0.01 and divide by 10 every 20 epochs.
exponential_decay_fn = exponential_decay(0.01, 20)
lr_scheduler = tf.keras.callbacks.LearningRateScheduler(exponential_decay_fn)

# Run 2: keep training the same model, now with class weights,
# checkpointing, early stopping and the decaying learning rate.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=200,
    steps_per_epoch=TRAIN_IMG_COUNT // BATCH_SIZE,
    validation_steps=VAL_IMG_COUNT // BATCH_SIZE,
    class_weight=class_weight,
    callbacks=[checkpoint_cb, early_stopping_cb, lr_scheduler],
)
model.save('./mammoCAD_V2.h5')

# One panel per metric, training curve against validation curve.
fig, ax = plt.subplots(1, 4, figsize=(20, 3))
ax = ax.ravel()
for i, met in enumerate(['precision', 'recall', 'accuracy', 'loss']):
    panel = ax[i]
    panel.plot(history.history[met])
    panel.plot(history.history['val_' + met])
    panel.set_title('Model {}'.format(met))
    panel.set_xlabel('epochs')
    panel.set_ylabel(met)
    panel.legend(['train', 'val'])
# Run 3: another 55 epochs on the same model, without class weights or
# callbacks.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=55,
    steps_per_epoch=TRAIN_IMG_COUNT // BATCH_SIZE,
    validation_steps=VAL_IMG_COUNT // BATCH_SIZE,
)
model.save('./mammoCAD_V3.h5')

# One panel per metric, training curve against validation curve.
fig, ax = plt.subplots(1, 4, figsize=(20, 3))
ax = ax.ravel()
for i, met in enumerate(['precision', 'recall', 'accuracy', 'loss']):
    panel = ax[i]
    panel.plot(history.history[met])
    panel.plot(history.history['val_' + met])
    panel.set_title('Model {}'.format(met))
    panel.set_xlabel('epochs')
    panel.set_ylabel(met)
    panel.legend(['train', 'val'])
class CustomCallback(tf.keras.callbacks.Callback):
    """Stop training once validation accuracy or validation recall reaches 0.90."""

    def on_epoch_end(self, epoch, logs=None):
        """Request a stop when either monitored validation metric is >= 0.90.

        Args:
            epoch: index of the epoch that just finished (unused).
            logs: metric dict for the epoch; may be None or lack the
                validation keys, in which case that metric is ignored.
        """
        # `logs` defaults to None, and `.get` returns None for a missing key;
        # comparing None with a float would raise TypeError, so guard both.
        logs = logs or {}
        val_accuracy = logs.get('val_accuracy')
        val_recall = logs.get('val_recall')
        accuracy_reached = val_accuracy is not None and val_accuracy >= 9e-1
        recall_reached = val_recall is not None and val_recall >= 9e-1
        if accuracy_reached or recall_reached:
            self.model.stop_training = True
# Run 4: train for up to 300 epochs, letting the custom callback end the
# run early once its validation threshold is met.
callback = CustomCallback()
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=300,
    steps_per_epoch=TRAIN_IMG_COUNT // BATCH_SIZE,
    validation_steps=VAL_IMG_COUNT // BATCH_SIZE,
    callbacks=[callback],
)
model.save('./mammoCAD_V4.h5')

# One panel per metric, training curve against validation curve.
fig, ax = plt.subplots(1, 4, figsize=(20, 3))
ax = ax.ravel()
for i, met in enumerate(['precision', 'recall', 'accuracy', 'loss']):
    panel = ax[i]
    panel.plot(history.history[met])
    panel.plot(history.history['val_' + met])
    panel.set_title('Model {}'.format(met))
    panel.set_xlabel('epochs')
    panel.set_ylabel(met)
    panel.legend(['train', 'val'])
class CustomCallback(tf.keras.callbacks.Callback):
    """Stop training once validation accuracy reaches 0.91 or validation recall reaches 0.98."""

    def on_epoch_end(self, epoch, logs=None):
        """Request a stop when either validation threshold is met.

        Args:
            epoch: index of the epoch that just finished (unused).
            logs: metric dict for the epoch; may be None or lack the
                validation keys, in which case that metric is ignored.
        """
        # `logs` defaults to None, and `.get` returns None for a missing key;
        # comparing None with a float would raise TypeError, so guard both.
        logs = logs or {}
        val_accuracy = logs.get('val_accuracy')
        val_recall = logs.get('val_recall')
        accuracy_reached = val_accuracy is not None and val_accuracy >= 91e-2
        recall_reached = val_recall is not None and val_recall >= 98e-2
        if accuracy_reached or recall_reached:
            self.model.stop_training = True
# Run 6: same setup as the previous callback-driven run, using the
# most recently defined CustomCallback.
callback = CustomCallback()
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=300,
    steps_per_epoch=TRAIN_IMG_COUNT // BATCH_SIZE,
    validation_steps=VAL_IMG_COUNT // BATCH_SIZE,
    callbacks=[callback],
)
model.save('./mammoCAD_V6.h5')

# One panel per metric, training curve against validation curve.
fig, ax = plt.subplots(1, 4, figsize=(20, 3))
ax = ax.ravel()
for i, met in enumerate(['precision', 'recall', 'accuracy', 'loss']):
    panel = ax[i]
    panel.plot(history.history[met])
    panel.plot(history.history['val_' + met])
    panel.set_title('Model {}'.format(met))
    panel.set_xlabel('epochs')
    panel.set_ylabel(met)
    panel.legend(['train', 'val'])
# Thanks for reading my code. If you plan to include it in your work, please cite me properly.
# Thanks again!
# This code was inspired by the work of Amy Jang: https://www.kaggle.com/code/amyjang/tensorflow-pneumonia-classification-on-x-rays
# and Abhinav Sagar: https://www.kdnuggets.com/2019/10/convolutional-neural-network-breast-cancer-classification.html