# Configuration file for the project
# A configuration object is a dictionary where each key has the format <prefix>__<parameter>.
# Data volumes are formatted as (channel, height, width, depth)
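# For example, the kernel_size parameter under the unet section below would
# presumably flatten to the key "unet__kernel_size".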
data:
  # Path to directory containing directories of DICOM slices
  data_dir: "./data"
  # Path to the preprocessed h5 file for training and testing data
  h5_path: "./data/preprocessed_dataset.h5"
  # Number of processes for data preprocessing
  n_workers: 2
  # When writing data to the h5 file, how to handle duplicate keys (patient IDs)
  # Any of: "skip" the duplicate, "overwrite" the duplicate, or "rename" by adding a suffix
  duplicate_name_strategy: "skip"
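  # A minimal sketch (hypothetical helper, not necessarily this repo's code) of how
  # duplicate_name_strategy might be applied when writing a volume to the h5 file:
  #   import h5py
  #   def write_volume(path, patient_id, volume, strategy="skip"):
  #       with h5py.File(path, "a") as f:
  #           if patient_id in f:
  #               if strategy == "skip":
  #                   return
  #               if strategy == "overwrite":
  #                   del f[patient_id]
  #               if strategy == "rename":
  #                   patient_id = f"{patient_id}_1"  # suffix scheme is illustrative
  #           f.create_dataset(patient_id, data=volume)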
unet:
  # ------------------ Convolution Block ------------------
  # Size of kernel for convolutional layers, a single integer
  kernel_size: 3
  # Number of convolutional layers in each block
  n_convolutions_per_block: 2
  # Activation class name from torch.nn for convolutional layers
  activation: "LeakyReLU"
  # Dropout rate applied after activation
  dropout_rate: 0.5
  # Whether to apply instance norm after convolution and before activation
  # - Batch norm doesn't work well with small batch sizes
  use_instance_norm: true
  # AKA decay, how much of the new mean and variance is added to the running mean and variance
  instance_norm_momentum: 0.9
  # Small value added to the variance to avoid division by zero
  instance_norm_epsilon: 0.00001
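  # A minimal sketch (assuming a standard conv block; channel counts are illustrative)
  # of how the settings above could map to PyTorch layers, repeated
  # n_convolutions_per_block times, in the order conv -> instance norm -> activation -> dropout:
  #   import torch.nn as nn
  #   in_channels, out_channels = 32, 64  # illustrative
  #   block = nn.Sequential(
  #       nn.Conv3d(in_channels, out_channels, kernel_size=3, padding=1),
  #       nn.InstanceNorm3d(out_channels, momentum=0.9, eps=0.00001,
  #                         track_running_stats=True),  # running stats per the comment above
  #       nn.LeakyReLU(),
  #       nn.Dropout3d(p=0.5),
  #   )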
  # ------------------ Encoder-Decoder Settings ------------------
  # Channels of the input volume, 1 for grayscale, 3 for RGB
  input_channels: 1
  # Output channels of the final layer, i.e. the number of classes to predict
  output_channels: 3
  # Number of kernels in the output of the first level of the encoder
  # - This is doubled/halved at each level in the encoder/decoder
  n_kernels_init: 32
  # Maximum allowed number of kernels for a level
  n_kernels_max: 512
  # Number of resolutions/levels for the U-Net
  n_levels: 5
  # Activation function name from torch.nn for the final layer, usually
  # "Sigmoid" for binary classification or "Softmax" for multi-class classification
  final_layer_activation: "Sigmoid"
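  # A quick sketch of the per-level kernel counts these settings imply (assuming the
  # usual doubling capped at n_kernels_max):
  #   [min(32 * 2 ** level, 512) for level in range(5)]  # -> [32, 64, 128, 256, 512]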
  # ------------------ U-Net Training Settings ------------------
  # Weight initialiser for model from torch.nn.init
  initialiser: "kaiming_normal_"
  # Optimiser for training from torch.optim
  optimiser: "SGD"
  # Optional arguments for optimiser
  optimiser_kwargs:
    momentum: 0.99
    nesterov: true
  # Learning rate scheduler from torch.optim.lr_scheduler
  lr_scheduler: "PolynomialLR"
  # Optional arguments for learning rate scheduler
  lr_scheduler_kwargs:
    total_iters: 1000
    power: 0.9
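  # A minimal sketch (hypothetical wiring; model is a placeholder and lr is assumed,
  # it isn't set in this file) of how these names and kwargs could be resolved against torch:
  #   import torch
  #   opt_cls = getattr(torch.optim, "SGD")
  #   optimiser = opt_cls(model.parameters(), lr=0.01, momentum=0.99, nesterov=True)
  #   sched_cls = getattr(torch.optim.lr_scheduler, "PolynomialLR")
  #   scheduler = sched_cls(optimiser, total_iters=1000, power=0.9)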
  # Loss will be a weighted sum of losses from each level of the U-Net decoder
  deep_supervision: true
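  # A sketch of what deep supervision typically means here (assumption: one auxiliary
  # output per decoder level; level_weights and the downsample helper are illustrative):
  #   loss = sum(weight * criterion(output, downsample(target, output.shape))
  #              for weight, output in zip(level_weights, decoder_outputs))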
confidnet:
  # Dimensions (channels) of the convolution layers
  # The number of convolutions is specified by the length of this list
  hidden_conv_dims: [128, 128, 128, 64, 64]
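  # Illustrative reading of the list above: five conv layers whose output channels are
  # 128, 128, 128, 64, 64 (the input channel count and kernel size here are assumptions):
  #   dims = [unet_features] + [128, 128, 128, 64, 64]
  #   convs = [nn.Conv3d(dims[i], dims[i + 1], kernel_size=1) for i in range(5)]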
  # Activation function name from torch.nn for convolutional layers
  activation: "LeakyReLU"
  # Activation function name from torch.nn for the final layer, usually
  # "Sigmoid" for binary classification or "Softmax" for multi-class classification
  last_activation: "Sigmoid"
  # Path to pretrained U-Net model to use as encoder
  # pretrained_unet_path: "./checkpoints/unet"
  # ------------------ ConfidNet Training Settings ------------------
  # Weight initialiser for model from torch.nn.init
  initialiser: "kaiming_normal_"
  # Optimiser for training from torch.optim
  optimiser: "SGD"
  # Optional arguments for optimiser
  optimiser_kwargs:
    momentum: 0.99
    nesterov: true
  # Learning rate scheduler from torch.optim.lr_scheduler
  lr_scheduler: "PolynomialLR"
  # Optional arguments for learning rate scheduler
  lr_scheduler_kwargs:
    total_iters: 1000
    power: 0.9
training:
  # Number of epochs to train for
  # NOTE: CHANGE total_iters in lr_scheduler_kwargs to match this if the scheduler has that arg!!
  n_epochs: 1000
  # Window for the running loss calculation, reduces noise in the loss plot
  running_loss_window: 10
  # Dump x, y, and predictions from the model every n epochs
  dump_tensors_every_n_epoch: 50
  # Directory for the tensor dumps
  tensor_dump_dir: "./tensor-dumps"
  # Perform a validation check every n epochs, 0 to disable validation
  check_val_every_n_epoch: 50
  # Number of validation steps to run before starting training, 0 to disable
  num_sanity_val_steps: 2
  # ------------------ Tensorboard Settings ------------------
  # See https://lightning.ai/docs/pytorch/stable/extensions/logging.html
  log_dir: "./tensorboard-logs/"
  # ------------------ Trainer Settings ------------------
  # Specifies the distributed training strategy, see https://lightning.ai/docs/pytorch/stable/extensions/strategy.html
  strategy: "ddp"
  # Which device to train on, "cpu", "gpu", "tpu", or "auto"
  accelerator: "gpu"
  # Precision of the floating point numbers
  precision: "16-mixed"
  # Whether to show a progress bar during training
  enable_progress_bar: true
  # Whether to display a model summary at the start of training
  enable_model_summary: true
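  # A minimal sketch (assuming PyTorch Lightning, per the docs linked above) of how
  # these keys map onto the Trainer constructor:
  #   import lightning.pytorch as pl
  #   trainer = pl.Trainer(strategy="ddp", accelerator="gpu", precision="16-mixed",
  #                        max_epochs=1000, check_val_every_n_epoch=50,
  #                        num_sanity_val_steps=2, enable_progress_bar=True,
  #                        enable_model_summary=True)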
  # ------------------ Checkpoint Settings ------------------
  # Path to directory to save model checkpoints; model checkpoints are saved as
  # "<model_checkpoint_path>/fold_<int>/<model_name_lowercase>_<int>.pth"
  # The configuration file, data splits, and model checkpoints will be saved here
  train_dir: "./training-output"
  # See https://lightning.ai/docs/pytorch/stable/api/lightning.pytorch.callbacks.ModelCheckpoint.html
  checkpoint_name: "{epoch:03d}-{val_loss:.4f}"
  # Save a model checkpoint every n epochs
  checkpoint_every_n_epoch: 100
  # Whether to save the last checkpoint as "last.pth"
  save_last_checkpoint: true
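  # A minimal sketch (assuming the Lightning ModelCheckpoint callback linked above;
  # the fold_<int> subfolder layout is presumably handled elsewhere):
  #   from lightning.pytorch.callbacks import ModelCheckpoint
  #   ckpt = ModelCheckpoint(dirpath="./training-output",
  #                          filename="{epoch:03d}-{val_loss:.4f}",
  #                          every_n_epochs=100, save_last=True)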
  # ------------------ Data Settings ------------------
  # How many folds to split the data into for cross-validation (determines the validation split ratio)
  n_folds: 5
  # Percentage of the total dataset split off for testing, the rest is for training
  test_split: 0.4
  # Number of batches for each epoch in training
  n_batches_per_epoch: 250
  # Number of batches for each epoch in validation
  n_batches_val: 50
  # Size of each batch in training
  batch_size: 2
  # Size of each batch in validation and testing
  batch_size_eval: 4
  # Patch size to sample from a volume to form training data
  # - try to keep each dimension divisible by (2 ** (n_levels - 1))
  # - because the U-Net downsamples by 2 at each level
  patch_size: [256, 256, 64]
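  # Quick check of the divisibility rule above with n_levels = 5, i.e. 2 ** 4 = 16:
  # 256 / 16 = 16 and 64 / 16 = 4, so [256, 256, 64] satisfies it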
  # Step size for the sliding window patch sampler (used for validation and testing)
  patch_step: 32
  # Percentage of patches in a batch guaranteed to contain the foreground class
  foreground_oversample_ratio: 0.333
  # ------------------ Dataloader Settings ------------------
  # How many subprocesses to use for data loading (training and validation sets)
  # Set to 0 to load data in the main process (the prefetch_factor_* settings must then be None)
  num_workers_train: 8
  num_workers_val: 8
  # Number of batches loaded in advance across all workers
  # e.g. factor = 2 means total_batches = 2 * num_workers
  prefetch_factor_train: 1
  prefetch_factor_val: 1
  # Don't kill dataloader processes after the dataset has been consumed once
  persistent_workers_train: false
  persistent_workers_val: false
  # Copy tensors into device/CUDA pinned memory before returning them
  pin_memory_train: true
  pin_memory_val: true
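  # A minimal sketch (train_dataset is a placeholder) of how the training loader could
  # be built from these keys:
  #   from torch.utils.data import DataLoader
  #   train_loader = DataLoader(train_dataset, batch_size=2, num_workers=8,
  #                             prefetch_factor=1, persistent_workers=False,
  #                             pin_memory=True)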
evaluation:
  # List of evaluation modes to run, can be any of
  # "single", "mcdo" (Monte Carlo dropout), "tta" (test-time augmentation), "ensemble"
  modes:
    - "single"
    - "mcdo"
    - "tta"
    - "ensemble"
  # List of metric names, can be any of
  # 'dice', 'hd', 'hd95', 'asd', 'assd', 'recall', 'sen', 'precision',
  # 'ppv', 'surface_dice', 'mean_variance', 'mean_entropy', or
  # 'surface_dice_[float]' where [float] is the threshold for surface overlap
  metrics:
    - "dice"
    - "surface_dice_1.0"
    - "surface_dice_1.5"
    - "surface_dice_2.0"
    - "surface_dice_2.5"
    - "surface_dice_3.0"
    - "surface_dice_5.099"
    - "surface_dice_13.928"
    - "hd95"
    - "assd"
    - "sen"
    - "ppv"
    - "pairwise_dice"
  # List of names for each class in the dataset along the channel dimension
  class_names: ["prostate", "bladder", "rectum"]
  # How many outputs to produce for the MCDO and TTA modes
  n_outputs: 10
  # Folder to store predictions from the model
  predictions_dir: "./predictions"
  # Folder to store the CSV with evaluation metrics
  csv_dir: "./evaluations"
logger:
  # ------------------ Loguru Settings ------------------
  # See https://loguru.readthedocs.io/en/stable/api/logger.html
  # Where to output the logs; can also be "stdout" or "stderr" for standard output and error
  sink: "./logs/out_{time}.log"
  # Log message format
  format: "<green>{time:YYYY-MM-DD at HH:mm:ss}</green> <level>{level: <8} {message}</level>"
  # Log retention period, not used if sink is "stdout" or "stderr"
  retention: "7 days"
  # Whether to log a detailed backtrace for debugging
  backtrace: true
  # Whether to add extra diagnostic information to exception logs
  diagnose: true
  # Log level; any of "TRACE", "DEBUG", "INFO", "SUCCESS", "WARNING", "ERROR", "CRITICAL"
  level: "INFO"
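  # A minimal sketch of applying these settings with loguru (see the docs linked above):
  #   from loguru import logger
  #   logger.add("./logs/out_{time}.log",
  #              format="<green>{time:YYYY-MM-DD at HH:mm:ss}</green> <level>{level: <8} {message}</level>",
  #              retention="7 days", backtrace=True, diagnose=True, level="INFO")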