
Multilingual Frame Marblenet fine-tuning error #14312


Description

@Akbarkhuja

I have the following code for fine-tuning:

import os
from omegaconf import OmegaConf
import copy
import torch

from lightning.pytorch import Trainer

from nemo.collections.asr.models import EncDecFrameClassificationModel
from nemo.utils.exp_manager import exp_manager


torch.cuda.empty_cache()

# RESTORING PRETRAINED MODEL
frame_marblenet_model = EncDecFrameClassificationModel.from_pretrained(
    model_name="vad_multilingual_frame_marblenet",
    strict=False
)

train_subdataset_cfg = copy.deepcopy(frame_marblenet_model.cfg.train_ds)
val_subdataset_cfg = copy.deepcopy(frame_marblenet_model.cfg.validation_ds)
test_subdataset_cfg = copy.deepcopy(frame_marblenet_model.cfg.test_ds)


frame_marblenet_model._cfg.train_ds.manifest_filepath = "manifest/frame_train.json"
frame_marblenet_model._cfg.train_ds.batch_size = 2
frame_marblenet_model._cfg.train_ds.num_workers = 1

# Disable augmentations for fine-tuning
frame_marblenet_model._cfg.train_ds.augmentor.noise.manifest_path = "data/manifest/background_training_manifest.json"
frame_marblenet_model._cfg.train_ds.augmentor.noise.prob = 0.0
frame_marblenet_model._cfg.train_ds.augmentor.white_noise.prob = 0.0
frame_marblenet_model._cfg.train_ds.augmentor.gain.prob = 0.0

frame_marblenet_model._cfg.validation_ds.manifest_filepath = "manifest/frame_val.json"
frame_marblenet_model._cfg.validation_ds.batch_size = 2
frame_marblenet_model._cfg.validation_ds.num_workers = 1

frame_marblenet_model._cfg.test_ds.manifest_filepath = "manifest/frame_test.json"
frame_marblenet_model._cfg.test_ds.batch_size = 2
frame_marblenet_model._cfg.test_ds.num_workers = 1

# Setup the data loader for the restored model
frame_marblenet_model.setup_training_data(copy.deepcopy(frame_marblenet_model.cfg.train_ds))
frame_marblenet_model.setup_multiple_validation_data(copy.deepcopy(frame_marblenet_model.cfg.validation_ds))
frame_marblenet_model.setup_multiple_test_data(copy.deepcopy(frame_marblenet_model.cfg.test_ds))


# SETTING UP A NEW TRAINER AND EXPERIMENT MANAGER
accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'

trainer_config = OmegaConf.create(dict(
    devices=1,
    accelerator=accelerator,
    max_epochs=20,
    max_steps=-1,  # computed at runtime if not set
    num_nodes=1,
    accumulate_grad_batches=1,
    enable_checkpointing=False,  # Provided by exp_manager
    logger=False,  # Provided by exp_manager
    log_every_n_steps=1,  # Interval of logging.
    val_check_interval=1.0,  # Set to 0.25 to check 4 times per epoch, or an int for number of iterations
))

trainer_finetune = Trainer(**trainer_config)


# Setting the trainer to the pretrained model
frame_marblenet_model.set_trainer(trainer_finetune)


# Check data loaders are correct
print("Train dataset labels :", frame_marblenet_model._train_dl.dataset.labels)
print("Val dataset labels :", frame_marblenet_model._validation_dl.dataset.labels)
print("Test dataset labels :", frame_marblenet_model._test_dl.dataset.labels)


# Setting up the experiment manager
config = copy.deepcopy(frame_marblenet_model._cfg)

print("Experiment config:")
print(OmegaConf.to_yaml(config))

exp_dir_finetune = exp_manager(trainer_finetune, config.get("exp_manager", None))
print(f"Experiment directory: {str(exp_dir_finetune)}")


# SETUP OPTIMIZER + SCHEDULER
optim_sched_cfg = copy.deepcopy(frame_marblenet_model.cfg.optim)
# Struct mode prevents us from popping off elements from the config, so let's disable it
OmegaConf.set_struct(optim_sched_cfg, False)

# Let's change the maximum learning rate to the previous minimum learning rate
optim_sched_cfg.lr = 0.001

# Let's change the scheduler
optim_sched_cfg.sched.name = "CosineAnnealing"

# "power" isn't applicable to CosineAnnealing so let's remove it
optim_sched_cfg.sched.pop('power')

# "hold_ratio" isn't applicable to CosineAnnealing, so let's remove it
optim_sched_cfg.sched.pop('hold_ratio')

# Set "min_lr" to lower value
optim_sched_cfg.sched.min_lr = 1e-4

print(OmegaConf.to_yaml(optim_sched_cfg))

# Now let's update the optimizer settings
frame_marblenet_model.setup_optimization(optim_sched_cfg)

# We can also just directly replace the config in place if we choose to
frame_marblenet_model.cfg.optim = optim_sched_cfg


# FINE-TUNING FOR 20 EPOCHS
trainer_finetune.fit(frame_marblenet_model)

# Evaluate the model on the test set
trainer_finetune.test(frame_marblenet_model, ckpt_path=None)

# Save a model as a tarfile
frame_marblenet_model.save_to(os.path.join("resources/saved_models", "tuned_multilingual_marblenet.nemo"))

and I got the following error:
"""
Sanity Checking DataLoader 0:   0%| | 0/2 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "frame_vad_train.py", line 118, in <module>
    trainer_finetune.fit(frame_marblenet_model)
  File "env/lib/python3.12/site-packages/lightning/pytorch/trainer/trainer.py", line 538, in fit
    call._call_and_handle_interrupt(
  File "env/lib/python3.12/site-packages/lightning/pytorch/trainer/call.py", line 47, in _call_and_handle_interrupt
    return trainer_fn(*args, **kwargs)
  File "env/lib/python3.12/site-packages/lightning/pytorch/trainer/trainer.py", line 574, in _fit_impl
    self._run(model, ckpt_path=ckpt_path)
  File "env/lib/python3.12/site-packages/lightning/pytorch/trainer/trainer.py", line 981, in _run
    results = self._run_stage()
  File "env/lib/python3.12/site-packages/lightning/pytorch/trainer/trainer.py", line 1023, in _run_stage
    self._run_sanity_check()
  File "env/lib/python3.12/site-packages/lightning/pytorch/trainer/trainer.py", line 1052, in _run_sanity_check
    val_loop.run()
  File "env/lib/python3.12/site-packages/lightning/pytorch/loops/utilities.py", line 178, in _decorator
    return loop_run(self, *args, **kwargs)
  File "env/lib/python3.12/site-packages/lightning/pytorch/loops/evaluation_loop.py", line 135, in run
    self._evaluation_step(batch, batch_idx, dataloader_idx, dataloader_iter)
  File "env/lib/python3.12/site-packages/lightning/pytorch/loops/evaluation_loop.py", line 396, in _evaluation_step
    output = call._call_strategy_hook(trainer, hook_name, *step_args)
  File "env/lib/python3.12/site-packages/lightning/pytorch/trainer/call.py", line 319, in _call_strategy_hook
    output = fn(*args, **kwargs)
  File "env/lib/python3.12/site-packages/lightning/pytorch/strategies/strategy.py", line 411, in validation_step
    return self.lightning_module.validation_step(*args, **kwargs)
  File "env/lib/python3.12/site-packages/nemo/collections/asr/models/classification_models.py", line 1216, in validation_step
    labels, labels_len = self.reshape_labels(logits, labels, audio_signal_len, labels_len)
  File "env/lib/python3.12/site-packages/nemo/collections/asr/models/classification_models.py", line 1290, in reshape_labels
    labels_max_len = labels.size(1)
IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
"""

The same strategy worked for fine-tuning EncDecClassificationModel (the segment-level VAD model).
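
In case the manifest format is the relevant difference: if I read the frame-VAD docs correctly, the "label" field for EncDecFrameClassificationModel has to be a string of space-separated per-frame labels, whereas the segment model takes a single label per entry. Hypothetical entries of each kind (paths and durations made up):

{"audio_filepath": "audio/utt1.wav", "offset": 0, "duration": 2.0, "label": "speech"}
{"audio_filepath": "audio/utt1.wav", "offset": 0, "duration": 2.0, "label": "0 0 1 1 1 0 1 1 0 0"}

Is that the format frame_train.json is expected to follow here?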
