Description
I have the following code for fine-tuning:
import os
from omegaconf import OmegaConf
import copy
import torch
from lightning.pytorch import Trainer
from nemo.collections.asr.models import EncDecFrameClassificationModel
from nemo.utils.exp_manager import exp_manager
torch.cuda.empty_cache()
# RESTORING PRETRAINED MODEL
frame_marblenet_model = EncDecFrameClassificationModel.from_pretrained(
    model_name="vad_multilingual_frame_marblenet",
    strict=False
)
train_subdataset_cfg = copy.deepcopy(frame_marblenet_model.cfg.train_ds)
val_subdataset_cfg = copy.deepcopy(frame_marblenet_model.cfg.validation_ds)
test_subdataset_cfg = copy.deepcopy(frame_marblenet_model.cfg.test_ds)
frame_marblenet_model._cfg.train_ds.manifest_filepath = "manifest/frame_train.json"
frame_marblenet_model._cfg.train_ds.batch_size = 2
frame_marblenet_model._cfg.train_ds.num_workers = 1
# Disable augmentations for fine-tuning
frame_marblenet_model._cfg.train_ds.augmentor.noise.manifest_path = "data/manifest/background_training_manifest.json"
frame_marblenet_model._cfg.train_ds.augmentor.noise.prob = 0.0
frame_marblenet_model._cfg.train_ds.augmentor.white_noise.prob = 0.0
frame_marblenet_model._cfg.train_ds.augmentor.gain.prob = 0.0
frame_marblenet_model._cfg.validation_ds.manifest_filepath = "manifest/frame_val.json"
frame_marblenet_model._cfg.validation_ds.batch_size = 2
frame_marblenet_model._cfg.validation_ds.num_workers = 1
frame_marblenet_model._cfg.test_ds.manifest_filepath = "manifest/frame_test.json"
frame_marblenet_model._cfg.test_ds.batch_size = 2
frame_marblenet_model._cfg.test_ds.num_workers = 1
# Setup the data loader for the restored model
frame_marblenet_model.setup_training_data(copy.deepcopy(frame_marblenet_model.cfg.train_ds))
frame_marblenet_model.setup_multiple_validation_data(copy.deepcopy(frame_marblenet_model.cfg.validation_ds))
frame_marblenet_model.setup_multiple_test_data(copy.deepcopy(frame_marblenet_model.cfg.test_ds))
# SETTING UP A NEW TRAINER AND EXPERIMENT MANAGER
accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'
trainer_config = OmegaConf.create(dict(
    devices=1,
    accelerator=accelerator,
    max_epochs=20,
    max_steps=-1,  # computed at runtime if not set
    num_nodes=1,
    accumulate_grad_batches=1,
    enable_checkpointing=False,  # Provided by exp_manager
    logger=False,  # Provided by exp_manager
    log_every_n_steps=1,  # Interval of logging.
    val_check_interval=1.0,  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
))
trainer_finetune = Trainer(**trainer_config)
# Setting the trainer to the pretrained model
frame_marblenet_model.set_trainer(trainer_finetune)
# Check data loaders are correct
print("Train dataset labels :", frame_marblenet_model._train_dl.dataset.labels)
print("Val dataset labels :", frame_marblenet_model._validation_dl.dataset.labels)
print("Test dataset labels :", frame_marblenet_model._test_dl.dataset.labels)
# Setting up the experiment manager
config = copy.deepcopy(frame_marblenet_model._cfg)
print("Experiment config:")
print(OmegaConf.to_yaml(config))
exp_dir_finetune = exp_manager(trainer_finetune, config.get("exp_manager", None))
print(f"Experiment directory: {str(exp_dir_finetune)}")
# SETUP OPTIMIZER + SCHEDULER
optim_sched_cfg = copy.deepcopy(frame_marblenet_model.cfg.optim)
# Struct mode prevents us from popping off elements from the config, so let's disable it
OmegaConf.set_struct(optim_sched_cfg, False)
# Let's change the maximum learning rate to the previous minimum learning rate
optim_sched_cfg.lr = 0.001
# Let's change the scheduler
optim_sched_cfg.sched.name = "CosineAnnealing"
# "power" isn't applicable to CosineAnnealing so let's remove it
optim_sched_cfg.sched.pop('power')
# "hold_ratio" isn't applicable to CosineAnnealing, so let's remove it
optim_sched_cfg.sched.pop('hold_ratio')
# Set "min_lr" to lower value
optim_sched_cfg.sched.min_lr = 1e-4
print(OmegaConf.to_yaml(optim_sched_cfg))
# Now let's update the optimizer settings
frame_marblenet_model.setup_optimization(optim_sched_cfg)
# We can also just replace the config in place if we choose to
frame_marblenet_model.cfg.optim = optim_sched_cfg
# FINE-TUNING FOR 20 EPOCHS
trainer_finetune.fit(frame_marblenet_model)
# Evaluate the model on the test set
trainer_finetune.test(frame_marblenet_model, ckpt_path=None)
# Save a model as a tarfile
frame_marblenet_model.save_to(os.path.join("resources/saved_models", "tuned_multilingual_marblenet.nemo"))
and I got the following error:
"""
Sanity Checking DataLoader 0: 0%| | 0/2 [00:00<?, ?it/s]Traceback (most recent call last):
File "frame_vad_train.py", line 118, in
trainer_finetune.fit(frame_marblenet_model)
File "env/lib/python3.12/site-packages/lightning/pytorch/trainer/trainer.py", line 538, in fit
call._call_and_handle_interrupt(
File "env/lib/python3.12/site-packages/lightning/pytorch/trainer/call.py", line 47, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "env/lib/python3.12/site-packages/lightning/pytorch/trainer/trainer.py", line 574, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "env/lib/python3.12/site-packages/lightning/pytorch/trainer/trainer.py", line 981, in _run
results = self._run_stage()
^^^^^^^^^^^^^^^^^
File "env/lib/python3.12/site-packages/lightning/pytorch/trainer/trainer.py", line 1023, in _run_stage
self._run_sanity_check()
File "env/lib/python3.12/site-packages/lightning/pytorch/trainer/trainer.py", line 1052, in _run_sanity_check
val_loop.run()
File "env/lib/python3.12/site-packages/lightning/pytorch/loops/utilities.py", line 178, in _decorator
return loop_run(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "env/lib/python3.12/site-packages/lightning/pytorch/loops/evaluation_loop.py", line 135, in run
self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
File "env/lib/python3.12/site-packages/lightning/pytorch/loops/evaluation_loop.py", line 396, in _evaluation_step
output = call._call_strategy_hook(trainer, hook_name, *step_args)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "env/lib/python3.12/site-packages/lightning/pytorch/trainer/call.py", line 319, in _call_strategy_hook
output = fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "env/lib/python3.12/site-packages/lightning/pytorch/strategies/strategy.py", line 411, in validation_step
return self.lightning_module.validation_step(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "env/lib/python3.12/site-packages/nemo/collections/asr/models/classification_models.py", line 1216, in validation_step
labels, labels_len = self.reshape_labels(logits, labels, audio_signal_len, labels_len)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "env/lib/python3.12/site-packages/nemo/collections/asr/models/classification_models.py", line 1290, in reshape_labels
labels_max_len = labels.size(1)
^^^^^^^^^^^^^^
IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
""".
A similar strategy worked for fine-tuning EncDecClassificationModel (the segment-level VAD model).
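My guess, which I have not verified, is that my manifests still use a single segment-level label per entry (as they did for EncDecClassificationModel), while the frame-level model expects a space-separated string of per-frame labels, so the collated labels tensor ends up 1-D instead of (batch, frames). Purely to illustrate the difference I mean, with made-up paths and values:

# Segment-level entry, as I used for EncDecClassificationModel (single label per clip)
segment_entry = {"audio_filepath": "audio/utt1.wav", "offset": 0.0, "duration": 0.63, "label": "speech"}

# Frame-level entry, as EncDecFrameClassificationModel seems to expect (one 0/1 label per frame)
frame_entry = {"audio_filepath": "audio/utt1.wav", "offset": 0.0, "duration": 0.63, "label": "0 0 1 1 1 1 0"}

Is the frame-level label string the expected manifest format here, or is something else wrong in my setup?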