Granite speech - minor fixes to support training with the HF trainer (#38833)

avihu111 · web-flow · commit be10d4df60be · 2025-06-24T17:06:52.000+02:00
* Ensure the query is updated during training; this avoids unused parameters, which DDP does not like. * Avoid a crash when `kwargs` contains `padding=True`; trainers often pass this argument automatically. * Minor. * Remove the `melspec` lazy init, and rename the attribute to `mel_filters`; this ensures `save_pretrained` will not crash when saving the processor during training (see https://github.com/huggingface/transformers/blob/d5d007a1a0f0c11a726a54c8f00bd71825f84d02/src/transformers/feature_extraction_utils.py#L595). * Minor - most feature extractors have a `sampling_rate` property.
diff --git a/src/transformers/models/granite_speech/feature_extraction_granite_speech.py b/src/transformers/models/granite_speech/feature_extraction_granite_speech.py
@@ -50,15 +50,16 @@ def __init__(
         **kwargs,
     ):
         super().__init__(**kwargs)
+        self.sampling_rate = sampling_rate
         self.melspec_kwargs = {
             "sample_rate": sampling_rate,
             "n_fft": n_fft,
             "win_length": win_length,
             "hop_length": hop_length,
             "n_mels": n_mels,
         }
-        # Currently lazily initialized
-        self.melspec = None
+        requires_backends(self, ["torchaudio"])
+        self.mel_filters = torchaudio.transforms.MelSpectrogram(**self.melspec_kwargs)
         self.projector_window_size = projector_window_size
         self.projector_downsample_rate = projector_downsample_rate
 
@@ -91,34 +92,16 @@ def __call__(
         ).view(-1, 1)
         return BatchFeature(data=speech_inputs)
 
-    def _ensure_melspec_transform_is_initialized(self):
-        """
-        Ensures the mel spectrogram transform on this instance is initialized.
-
-        We do this for now since some logging explodes since the mel spectrogram
-        transform is not JSON serializable.
-        """
-        requires_backends(self, ["torchaudio"])
-
-        if self.melspec is None:
-            # TODO (@alex-jw-brooks / @eustlb) move this to common batch
-            # feature extraction in audio utils once they are written!
-            self.melspec = torchaudio.transforms.MelSpectrogram(**self.melspec_kwargs)
-
     def _extract_mel_spectrograms(self, audio: "torch.Tensor", device="cpu"):
         """
         Compute the Mel features to be passed to the conformer encoder.
         """
         requires_backends(self, ["torchaudio"])
-
-        # Initialize the mel spectrogram if isn't not already and
-        # move the melspec / audio to the computation device.
-        self._ensure_melspec_transform_is_initialized()
         if device is not None:
-            melspec = self.melspec.to(device)
+            melspec = self.mel_filters.to(device)
             audio = audio.to(device)
         else:
-            melspec = self.melspec
+            melspec = self.mel_filters
 
         bsz = audio.shape[0]
         with torch.no_grad():
diff --git a/src/transformers/models/granite_speech/modeling_granite_speech.py b/src/transformers/models/granite_speech/modeling_granite_speech.py
@@ -83,7 +83,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         hidden_states = hidden_states.view(batch_size * nblocks, self.window_size, dim)
 
         query_output = self.qformer(
-            query_embeds=self.query.data,
+            query_embeds=self.query,
             encoder_hidden_states=hidden_states,
             encoder_attention_mask=None,
             return_dict=True,
diff --git a/src/transformers/models/granite_speech/processing_granite_speech.py b/src/transformers/models/granite_speech/processing_granite_speech.py
@@ -88,7 +88,9 @@ def __call__(
         else:
             audio_inputs = {}
 
-        text_inputs = self.tokenizer(prompt_strings, padding=True, **kwargs)
+        if "padding" not in kwargs:
+            kwargs["padding"] = True
+        text_inputs = self.tokenizer(prompt_strings, **kwargs)
         return BatchFeature(data={**text_inputs, **audio_inputs})
 
     def _get_validated_text(self, text: Union[str, list]) -> list[str]: