# Configuration file for the project
# A configuration object is a dictionary where each key has the format <prefix>__<parameter>.
# Data volumes are formatted as (channel, height, width, depth)
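# For example, the kernel_size parameter under the unet section below would
# presumably flatten to the key "unet__kernel_size".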
data:
  # Path to directory containing directories of DICOM slices
  data_dir: "./data"
  # Path to the preprocessed h5 file for training and testing data
  h5_path: "./data/preprocessed_dataset.h5"
  # Number of processes for data preprocessing
  n_workers: 2
  # When writing data to the h5 file, how to handle duplicate keys (patient IDs)
  # Any of: "skip" the duplicate, "overwrite" the duplicate, or "rename" by adding a suffix
  duplicate_name_strategy: "skip"
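  # A minimal sketch (hypothetical helper, not necessarily this repo's code) of how
  # duplicate_name_strategy might be applied when writing a volume to the h5 file:
  #   import h5py
  #   def write_volume(path, patient_id, volume, strategy="skip"):
  #       with h5py.File(path, "a") as f:
  #           if patient_id in f:
  #               if strategy == "skip":
  #                   return
  #               if strategy == "overwrite":
  #                   del f[patient_id]
  #               if strategy == "rename":
  #                   patient_id = f"{patient_id}_1"  # suffix scheme is illustrative
  #           f.create_dataset(patient_id, data=volume)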
unet:
  # ------------------ Convolution Block ------------------
  # Size of kernel for convolutional layers, a single integer
  kernel_size: 3
  # Number of convolutional layers in each block
  n_convolutions_per_block: 2
  # Activation class name from torch.nn for convolutional layers
  activation: "LeakyReLU"
  # Dropout rate applied after activation
  dropout_rate: 0.5
  # Whether to apply instance norm after convolution and before activation
  # - Batch norm doesn't work well with small batch sizes
  use_instance_norm: true
  # AKA decay, how much of the new mean and variance is added to the running mean and variance
  instance_norm_momentum: 0.9
  # Small value added to the variance to avoid division by zero
  instance_norm_epsilon: 0.00001
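  # A minimal sketch (assuming a standard conv block; channel counts are illustrative)
  # of how the settings above could map to PyTorch layers, repeated
  # n_convolutions_per_block times, in the order conv -> instance norm -> activation -> dropout:
  #   import torch.nn as nn
  #   in_channels, out_channels = 32, 64  # illustrative
  #   block = nn.Sequential(
  #       nn.Conv3d(in_channels, out_channels, kernel_size=3, padding=1),
  #       nn.InstanceNorm3d(out_channels, momentum=0.9, eps=0.00001,
  #                         track_running_stats=True),  # running stats per the comment above
  #       nn.LeakyReLU(),
  #       nn.Dropout3d(p=0.5),
  #   )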
  # ------------------ Encoder-Decoder Settings ------------------
  # Channels of the input volume, 1 for grayscale, 3 for RGB
  input_channels: 1
  # Output channels of the final layer, i.e. the number of classes to predict
  output_channels: 3
  # Number of kernels in the output of the first level of the encoder
  # - This is doubled/halved at each level in the encoder/decoder
  n_kernels_init: 32
  # Maximum allowed number of kernels for a level
  n_kernels_max: 512
  # Number of resolutions/levels for the U-Net
  n_levels: 5
  # Activation function name from torch.nn for the final layer, usually
  # "Sigmoid" for binary classification or "Softmax" for multi-class classification
  final_layer_activation: "Sigmoid"
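  # A quick sketch of the per-level kernel counts these settings imply (assuming the
  # usual doubling capped at n_kernels_max):
  #   [min(32 * 2 ** level, 512) for level in range(5)]  # -> [32, 64, 128, 256, 512]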
  # ------------------ U-Net Training Settings ------------------
  # Weight initialiser for model from torch.nn.init
  initialiser: "kaiming_normal_"
  # Optimiser for training from torch.optim
  optimiser: "SGD"
  # Optional arguments for optimiser
  optimiser_kwargs:
    momentum: 0.99
    nesterov: true
  # Learning rate scheduler from torch.optim.lr_scheduler
  lr_scheduler: "PolynomialLR"
  # Optional arguments for learning rate scheduler
  lr_scheduler_kwargs:
    total_iters: 1000
    power: 0.9
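  # A minimal sketch (hypothetical wiring; model is a placeholder and lr is assumed,
  # it isn't set in this file) of how these names and kwargs could be resolved against torch:
  #   import torch
  #   opt_cls = getattr(torch.optim, "SGD")
  #   optimiser = opt_cls(model.parameters(), lr=0.01, momentum=0.99, nesterov=True)
  #   sched_cls = getattr(torch.optim.lr_scheduler, "PolynomialLR")
  #   scheduler = sched_cls(optimiser, total_iters=1000, power=0.9)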
  # Loss will be a weighted sum of losses from each level of the U-Net decoder
  deep_supervision: true
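  # A sketch of what deep supervision typically means here (assumption: one auxiliary
  # output per decoder level; level_weights and the downsample helper are illustrative):
  #   loss = sum(weight * criterion(output, downsample(target, output.shape))
  #              for weight, output in zip(level_weights, decoder_outputs))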
confidnet:
  # Dimensions (channels) of the convolution layers
  # The number of convolutions is specified by the length of this list
  hidden_conv_dims: [128, 128, 128, 64, 64]
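  # Illustrative reading of the list above: five conv layers whose output channels are
  # 128, 128, 128, 64, 64 (the input channel count and kernel size here are assumptions):
  #   dims = [unet_features] + [128, 128, 128, 64, 64]
  #   convs = [nn.Conv3d(dims[i], dims[i + 1], kernel_size=1) for i in range(5)]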
  # Activation function name from torch.nn for convolutional layers
  activation: "LeakyReLU"
  # Activation function name from torch.nn for the final layer, usually
  # "Sigmoid" for binary classification or "Softmax" for multi-class classification
  last_activation: "Sigmoid"
  # Path to pretrained U-Net model to use as encoder
  # pretrained_unet_path: "./checkpoints/unet"
  # ------------------ ConfidNet Training Settings ------------------
  # Weight initialiser for model from torch.nn.init
  initialiser: "kaiming_normal_"
  # Optimiser for training from torch.optim
  optimiser: "SGD"
  # Optional arguments for optimiser
  optimiser_kwargs:
    momentum: 0.99
    nesterov: true
  # Learning rate scheduler from torch.optim.lr_scheduler
  lr_scheduler: "PolynomialLR"
  # Optional arguments for learning rate scheduler
  lr_scheduler_kwargs:
    total_iters: 1000
    power: 0.9
training:
  # Number of epochs to train for
  # NOTE: CHANGE total_iters in lr_scheduler_kwargs to match this if the scheduler has that arg!!
  n_epochs: 1000
  # Window for the running loss calculation, reduces noise in the loss plot
  running_loss_window: 10
  # Dump x, y, and predictions from the model every n epochs
  dump_tensors_every_n_epoch: 50
  # Directory for the tensor dumps
  tensor_dump_dir: "./tensor-dumps"
  # Perform a validation check every n epochs, 0 to disable validation
  check_val_every_n_epoch: 50
  # Number of validation steps to run before starting training, 0 to disable
  num_sanity_val_steps: 2
  # ------------------ Tensorboard Settings ------------------
  # See https://lightning.ai/docs/pytorch/stable/extensions/logging.html
  log_dir: "./tensorboard-logs/"
  # ------------------ Trainer Settings ------------------
  # Specifies the distributed training strategy, see https://lightning.ai/docs/pytorch/stable/extensions/strategy.html
  strategy: "ddp"
  # Which device to train on, "cpu", "gpu", "tpu", or "auto"
  accelerator: "gpu"
  # Precision of the floating point numbers
  precision: "16-mixed"
  # Whether to show a progress bar during training
  enable_progress_bar: true
  # Whether to display a model summary at the start of training
  enable_model_summary: true
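  # A minimal sketch (assuming PyTorch Lightning, per the docs linked above) of how
  # these keys map onto the Trainer constructor:
  #   import lightning.pytorch as pl
  #   trainer = pl.Trainer(strategy="ddp", accelerator="gpu", precision="16-mixed",
  #                        max_epochs=1000, check_val_every_n_epoch=50,
  #                        num_sanity_val_steps=2, enable_progress_bar=True,
  #                        enable_model_summary=True)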
  # ------------------ Checkpoint Settings ------------------
  # Path to directory to save model checkpoints; model checkpoints are saved as
  # "<model_checkpoint_path>/fold_<int>/<model_name_lowercase>_<int>.pth"
  # The configuration file, data splits, and model checkpoints will be saved here
  train_dir: "./training-output"
  # See https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html
  checkpoint_name: "{epoch:03d}-{val_loss:.4f}"
  # Save a model checkpoint every n epochs
  checkpoint_every_n_epoch: 100
  # Whether to save the last checkpoint as "last.pth"
  save_last_checkpoint: true
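  # A minimal sketch (assuming the Lightning ModelCheckpoint callback linked above;
  # the fold_<int> subfolder layout is presumably handled elsewhere):
  #   from lightning.pytorch.callbacks import ModelCheckpoint
  #   ckpt = ModelCheckpoint(dirpath="./training-output",
  #                          filename="{epoch:03d}-{val_loss:.4f}",
  #                          every_n_epochs=100, save_last=True)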
  # ------------------ Data Settings ------------------
  # How many folds to split the data into for cross-validation (determines the validation split ratio)
  n_folds: 5
  # Percentage of the total dataset split off for testing, the rest is for training
  test_split: 0.4
  # Number of batches for each epoch in training
  n_batches_per_epoch: 250
  # Number of batches for each epoch in validation
  n_batches_val: 50
  # Size of each batch in training
  batch_size: 2
  # Size of each batch in validation and testing
  batch_size_eval: 4
  # Patch size to sample from a volume to form training data
  # - try to keep each dimension divisible by (2 ** (n_levels - 1))
  # - because the U-Net downsamples by 2 at each level
  patch_size: [256, 256, 64]
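  # Quick check of the divisibility rule above with n_levels = 5, i.e. 2 ** 4 = 16:
  # 256 / 16 = 16 and 64 / 16 = 4, so [256, 256, 64] satisfies it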
  # Step size for the sliding window patch sampler (used for validation and testing)
  patch_step: 32
  # Percentage of patches in a batch guaranteed to contain the foreground class
  foreground_oversample_ratio: 0.333
  # ------------------ Dataloader Settings ------------------
  # How many subprocesses to use for data loading (training and validation sets)
  # Set to 0 to load data in the main process (the prefetch_factor_* settings must then be None)
  num_workers_train: 8
  num_workers_val: 8
  # Number of batches loaded in advance across all workers
  # e.g. factor = 2 means total_batches = 2 * num_workers
  prefetch_factor_train: 1
  prefetch_factor_val: 1
  # Don't kill dataloader processes after the dataset has been consumed once
  persistent_workers_train: false
  persistent_workers_val: false
  # Copy tensors into device/CUDA pinned memory before returning them
  pin_memory_train: true
  pin_memory_val: true
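  # A minimal sketch (train_dataset is a placeholder) of how the training loader could
  # be built from these keys:
  #   from torch.utils.data import DataLoader
  #   train_loader = DataLoader(train_dataset, batch_size=2, num_workers=8,
  #                             prefetch_factor=1, persistent_workers=False,
  #                             pin_memory=True)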
evaluation:
  # List of evaluation modes to run, can be any of
  # "single", "mcdo" (Monte Carlo dropout), "tta" (test-time augmentation), "ensemble"
  modes:
    - "single"
    - "mcdo"
    - "tta"
    - "ensemble"
  # List of metric names, can be any of
  # 'dice', 'hd', 'hd95', 'asd', 'assd', 'recall', 'sen', 'precision',
  # 'ppv', 'surface_dice', 'mean_variance', 'mean_entropy', or
  # 'surface_dice_[float]' where [float] is the threshold for surface overlap
  metrics:
    - "dice"
    - "surface_dice_1.0"
    - "surface_dice_1.5"
    - "surface_dice_2.0"
    - "surface_dice_2.5"
    - "surface_dice_3.0"
    - "surface_dice_5.099"
    - "surface_dice_13.928"
    - "hd95"
    - "assd"
    - "sen"
    - "ppv"
    - "pairwise_dice"
  # List of names for each class in the dataset along the channel dimension
  class_names: ["prostate", "bladder", "rectum"]
  # How many outputs to produce for the MCDO and TTA modes
  n_outputs: 10
  # Folder to store predictions from the model
  predictions_dir: "./predictions"
  # Folder to store the CSV with evaluation metrics
  csv_dir: "./evaluations"
logger:
  # ------------------ Loguru Settings ------------------
  # See https://loguru.readthedocs.io/en/stable/api/logger.html
  # Where to output the logs; can also be "stdout" or "stderr" for standard output and error
  sink: "./logs/out_{time}.log"
  # Log message format
  format: "<green>{time:YYYY-MM-DD at HH:mm:ss}</green> <level>{level: <8} {message}</level>"
  # Log retention period, not used if sink is "stdout" or "stderr"
  retention: "7 days"
  # Whether to log a detailed backtrace for debugging
  backtrace: true
  # Whether to add extra diagnostic information to exception logs
  diagnose: true
  # Log level; any of "TRACE", "DEBUG", "INFO", "SUCCESS", "WARNING", "ERROR", "CRITICAL"
  level: "INFO"
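  # A minimal sketch of applying these settings with loguru (see the docs linked above):
  #   from loguru import logger
  #   logger.add("./logs/out_{time}.log",
  #              format="<green>{time:YYYY-MM-DD at HH:mm:ss}</green> <level>{level: <8} {message}</level>",
  #              retention="7 days", backtrace=True, diagnose=True, level="INFO")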