
Commit 6bdd4ec

Add kyutai stt (#38909)
* first draft
* cleaner version
* update tests + modeling
* add tests
* init
* update test_modeling_common
* fix tests
* csm Processor draft
* conversion update
* mimi cache padding convolutions draft
* mimi streaming updates
* update mimi padding cache test
* update cache padding mimi test
* make style mimi
* updates generate moshi asr
* moshi asr integration tests (single + batched)
* update tests
* update conversion script
* good default sliding window value
* update generate
* update test checkpoint
* nit
* fix mimi
* fix codec prefix
* revert
* revert
* update config
* update config
* unnecessary mimi input restriction
* remove delay in tokens
* remove _prepare_4d_causal_attention_mask_with_cache_position and _update_causal_mask
* test update
* modular update
* make style
* nit
* rename
* create codec model generation config at init
* remove delay
* max_new_tokens/length warning
* correct conv1 padding cache import for modular
* nit
* fix on encoder_past_key_values
* convert modular
* move frame_size to config
* move frame_size to config
* update test name
* handle first token is bos
* better handling of max_new_tokens
* fix
* fix batch size in test input prep
* update docstring
* convert modular
* make style
* make style
* add feature extractor
* correct modular convention name for feature_extraction file
* update conversion script
* doc processor
* update doc
* update init
* update model type
* fixes
* update tests
* fix
* make
* add doc
* nit
* fix
* doc
* auto mappings
* doc
* nit
* convert modular
* doc
* nit
* extend _keep_in_fp32_modules to enforce fp32
* renaming to stt
* doc update + test update
* doc fixes
* doc fix
* doc fix
* fix musicgen tests
* fix musicgen tests
* make style
* fix musicgen tests
* correct frame_rate config param for mimi
* update mimi test
* revert update mimi test
* enforce cpu test
* move cache init in cache class
* convert modular
* docstring update
* update model id
* feature_extractor -> feature_extraction (SEW)
* convert modular
* update model id
1 parent 08bf7f1 commit 6bdd4ec

23 files changed (+3999 / -199 lines changed)

docs/source/en/_toctree.yml

Lines changed: 2 additions & 0 deletions
```diff
@@ -843,6 +843,8 @@
       title: GraniteSpeech
     - local: model_doc/hubert
       title: Hubert
+    - local: model_doc/stt
+      title: Kyutai Speech-To-Text
     - local: model_doc/mctct
       title: MCTCT
     - local: model_doc/mimi
```

docs/source/en/model_doc/stt.md

Lines changed: 122 additions & 0 deletions
New file:

<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# Kyutai Speech-To-Text

## Overview

Kyutai STT is a speech-to-text model architecture based on the [Mimi codec](https://huggingface.co/docs/transformers/en/model_doc/mimi), which encodes audio into discrete tokens in a streaming fashion, and a [Moshi-like](https://huggingface.co/docs/transformers/en/model_doc/moshi) autoregressive decoder. Kyutai has released two model checkpoints:
- [kyutai/stt-1b-en_fr](https://huggingface.co/kyutai/stt-1b-en_fr): a 1B-parameter model capable of transcribing both English and French
- [kyutai/stt-2.6b-en](https://huggingface.co/kyutai/stt-2.6b-en): a 2.6B-parameter model focused solely on English, optimized for maximum transcription accuracy

<div class="flex justify-center">
    <img src="https://huggingface.co/datasets/eustlb/documentation-images/resolve/main/kyutai_stt.png"/>
</div>

## Usage Tips

### Inference

```python
import torch
from datasets import load_dataset, Audio
from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration

# 1. load the model and the processor
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "kyutai/stt-2.6b-en"

processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)

# 2. load audio samples
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
)
ds = ds.cast_column("audio", Audio(sampling_rate=24000))

# 3. prepare the model inputs
inputs = processor(
    ds[0]["audio"]["array"],
)
inputs.to(torch_device)

# 4. run inference
output_tokens = model.generate(**inputs)

# 5. decode the generated tokens
print(processor.batch_decode(output_tokens, skip_special_tokens=True))
```

### Batched Inference

```python
import torch
from datasets import load_dataset, Audio
from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration

# 1. load the model and the processor
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "kyutai/stt-2.6b-en"

processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)

# 2. load audio samples
ds = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation"
)
ds = ds.cast_column("audio", Audio(sampling_rate=24000))

# 3. prepare the model inputs
audio_arrays = [ds[i]["audio"]["array"] for i in range(4)]
inputs = processor(audio_arrays, return_tensors="pt", padding=True)
inputs = inputs.to(torch_device)

# 4. run inference
output_tokens = model.generate(**inputs)

# 5. decode the generated tokens
decoded_outputs = processor.batch_decode(output_tokens, skip_special_tokens=True)
for output in decoded_outputs:
    print(output)
```

This model was contributed by [Eustache Le Bihan](https://huggingface.co/eustlb).
The original code can be found [here](https://github.com/kyutai-labs/moshi).

## KyutaiSpeechToTextConfig

[[autodoc]] KyutaiSpeechToTextConfig

## KyutaiSpeechToTextProcessor

[[autodoc]] KyutaiSpeechToTextProcessor
    - __call__

## KyutaiSpeechToTextFeatureExtractor

[[autodoc]] KyutaiSpeechToTextFeatureExtractor

## KyutaiSpeechToTextForConditionalGeneration

[[autodoc]] KyutaiSpeechToTextForConditionalGeneration
    - forward
    - generate

## KyutaiSpeechToTextModel

[[autodoc]] KyutaiSpeechToTextModel
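
The examples in the new doc page read audio from a Hub dataset. As a usage note, here is a minimal sketch (not part of the committed file) for transcribing a local recording; it assumes `librosa` for loading and resampling, uses a placeholder path `audio.wav`, and follows the same processor/generate pattern shown above.

```python
import torch
import librosa
from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForConditionalGeneration

torch_device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "kyutai/stt-2.6b-en"

processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)

# load a local file and resample it to the 24 kHz mono audio that Mimi expects
# ("audio.wav" is a placeholder path)
waveform, _ = librosa.load("audio.wav", sr=24000, mono=True)

inputs = processor(waveform)
inputs.to(torch_device)

output_tokens = model.generate(**inputs)
print(processor.batch_decode(output_tokens, skip_special_tokens=True)[0])
```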

src/transformers/modeling_utils.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -4658,8 +4658,11 @@ def from_pretrained(
         # The _keep_in_fp32_modules flag is only used to avoid bf16 -> fp16 casting precision issues. It was introduced
         # in case of force loading a model that should stay bf16 in fp16 (which includes a few quantizers as this is a pre-processing
         # step for e.g. bitsandbytes). See https://github.com/huggingface/transformers/issues/20287 for details.
+        # Update: to extend _keep_in_fp32_modules flag feature, it can also be used to force modules that should stay in fp32
         if model._keep_in_fp32_modules is not None and (
-            torch_dtype == torch.float16 or getattr(hf_quantizer, "use_keep_in_fp32_modules", False)
+            torch_dtype == torch.float16
+            or torch_dtype == torch.bfloat16
+            or getattr(hf_quantizer, "use_keep_in_fp32_modules", False)
         ):
             # We need to match exact layers, so we add either `.` on each side, or start/end of string
             keep_in_fp32_regex = re.compile(
```
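
This change means any architecture declaring `_keep_in_fp32_modules` now keeps those modules in float32 when the rest of the model is loaded in bfloat16, not only in float16. A minimal sketch of the expected behavior, using T5 (which lists `"wo"` in `_keep_in_fp32_modules`) purely as an illustration:

```python
import torch
from transformers import AutoModel

# T5 declares _keep_in_fp32_modules = ["wo"]; with this change the "wo" projection
# should stay in float32 even when the rest of the model is loaded in bfloat16.
model = AutoModel.from_pretrained("google-t5/t5-small", torch_dtype=torch.bfloat16)

print(model.encoder.block[0].layer[1].DenseReluDense.wo.weight.dtype)  # expected: torch.float32
print(model.encoder.block[0].layer[0].SelfAttention.q.weight.dtype)    # expected: torch.bfloat16
```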

src/transformers/models/__init__.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -285,6 +285,7 @@
     from .squeezebert import *
     from .stablelm import *
     from .starcoder2 import *
+    from .stt import *
     from .superglue import *
     from .superpoint import *
     from .swiftformer import *
```

src/transformers/models/auto/configuration_auto.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -322,6 +322,7 @@
         ("squeezebert", "SqueezeBertConfig"),
         ("stablelm", "StableLmConfig"),
         ("starcoder2", "Starcoder2Config"),
+        ("stt", "KyutaiSpeechToTextConfig"),
         ("superglue", "SuperGlueConfig"),
         ("superpoint", "SuperPointConfig"),
         ("swiftformer", "SwiftFormerConfig"),
@@ -707,6 +708,7 @@
         ("squeezebert", "SqueezeBERT"),
         ("stablelm", "StableLm"),
         ("starcoder2", "Starcoder2"),
+        ("stt", "KyutaiSpeechToText"),
         ("superglue", "SuperGlue"),
         ("superpoint", "SuperPoint"),
         ("swiftformer", "SwiftFormer"),
```

src/transformers/models/auto/feature_extraction_auto.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -91,6 +91,7 @@
         ("sew-d", "Wav2Vec2FeatureExtractor"),
         ("speech_to_text", "Speech2TextFeatureExtractor"),
         ("speecht5", "SpeechT5FeatureExtractor"),
+        ("stt", "KyutaiSpeechToTextFeatureExtractor"),
         ("swiftformer", "ViTFeatureExtractor"),
         ("swin", "ViTFeatureExtractor"),
         ("swinv2", "ViTFeatureExtractor"),
```

src/transformers/models/auto/modeling_auto.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -300,6 +300,7 @@
         ("squeezebert", "SqueezeBertModel"),
         ("stablelm", "StableLmModel"),
         ("starcoder2", "Starcoder2Model"),
+        ("stt", "KyutaiSpeechToTextModel"),
         ("superglue", "SuperGlueForKeypointMatching"),
         ("swiftformer", "SwiftFormerModel"),
         ("swin", "SwinModel"),
@@ -1055,6 +1056,7 @@
         ("speech-encoder-decoder", "SpeechEncoderDecoderModel"),
         ("speech_to_text", "Speech2TextForConditionalGeneration"),
         ("speecht5", "SpeechT5ForSpeechToText"),
+        ("stt", "KyutaiSpeechToTextForConditionalGeneration"),
         ("whisper", "WhisperForConditionalGeneration"),
     ]
 )
```

src/transformers/models/auto/processing_auto.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -116,6 +116,7 @@
         ("speech_to_text", "Speech2TextProcessor"),
         ("speech_to_text_2", "Speech2Text2Processor"),
         ("speecht5", "SpeechT5Processor"),
+        ("stt", "KyutaiSpeechToTextProcessor"),
         ("trocr", "TrOCRProcessor"),
         ("tvlt", "TvltProcessor"),
         ("tvp", "TvpProcessor"),
```

src/transformers/models/mimi/configuration_mimi.py

Lines changed: 46 additions & 4 deletions
```diff
@@ -38,8 +38,8 @@ class MimiConfig(PretrainedConfig):
     Args:
         sampling_rate (`int`, *optional*, defaults to 24000):
             The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
-        frame_rate (`float`, *optional*, defaults to 12.5):
-            Framerate of the model.
+        frame_rate (`float`, *optional*):
+            Should be computed from the other parameters, yet kept for backward compatibility.
         audio_channels (`int`, *optional*, defaults to 1):
             Number of channels in the audio data. Either 1 for mono or 2 for stereo.
         hidden_size (`int`, *optional*, defaults to 512):
@@ -111,6 +111,8 @@ class MimiConfig(PretrainedConfig):
         use_cache (`bool`, *optional*, defaults to `False`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
             relevant if `config.is_decoder=True`.
+        use_streaming (`bool`, *optional*, defaults to `False`):
+            Whether to use streaming mode. If `True`, the model encode method will return the padding cache that can be used in a subsequent call to the encode method.
         rope_theta (`float`, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings.
         sliding_window (`int`, *optional*, defaults to 250):
@@ -141,7 +143,7 @@ class MimiConfig(PretrainedConfig):
     def __init__(
         self,
         sampling_rate=24_000,
-        frame_rate=12.5,
+        frame_rate=None,
         audio_channels=1,
         hidden_size=512,
         num_filters=64,
@@ -172,6 +174,7 @@ def __init__(
         initializer_range=0.02,
         norm_eps=1e-5,
         use_cache=False,
+        use_streaming=False,
         rope_theta=10000.0,
         sliding_window=250,
         attention_dropout=0.0,
@@ -180,7 +183,6 @@ def __init__(
         **kwargs,
     ):
         self.sampling_rate = sampling_rate
-        self.frame_rate = frame_rate
         self.audio_channels = audio_channels
         self.hidden_size = hidden_size
         self.num_filters = num_filters
@@ -209,13 +211,22 @@ def __init__(
         self.initializer_range = initializer_range
         self.norm_eps = norm_eps
         self.use_cache = use_cache
+        self.use_streaming = use_streaming
         self.rope_theta = rope_theta
         self.sliding_window = sliding_window
         self.attention_dropout = attention_dropout
         self.head_dim = head_dim or hidden_size // num_attention_heads
         self.layer_scale_initial_scale = layer_scale_initial_scale
         self.attention_bias = attention_bias
 
+        # Handle backward compatibility for frame_rate:
+        # If frame_rate is explicitly provided, use it (backward compatibility)
+        # Otherwise, compute it from other parameters (correctly)
+        if frame_rate is not None:
+            self._frame_rate = frame_rate
+        else:
+            self._frame_rate = None
+
         if num_semantic_quantizers >= self.num_quantizers:
             raise ValueError(
                 f"The number of semantic quantizers should be lower than the total number of quantizers {self.num_quantizers}, but is currently {num_semantic_quantizers}."
@@ -233,5 +244,36 @@ def num_codebooks(self) -> int:
         # alias to num_quantizers
         return self.num_quantizers
 
+    @property
+    def frame_size(self) -> int:
+        # 1. we need each encoder conv stride
+        # first conv
+        strides = [1]
+
+        # layer convs
+        for ratio in reversed(self.upsampling_ratios):
+            for j in range(self.num_residual_layers):
+                len_kernel_sizes = len(self.residual_kernel_size) if isinstance(self.residual_kernel_size, list) else 1
+                strides.extend([1] * (len_kernel_sizes + 1))
+                if self.use_conv_shortcut:  # skip connection
+                    strides.append(1)
+
+            strides.append(ratio)
+
+        # last conv
+        strides.append(1)
+
+        # downsampling layer
+        strides.append(2)
+
+        return math.prod(strides)
+
+    @property
+    def frame_rate(self) -> float:
+        # handle backward compatibility
+        if self._frame_rate is not None:
+            return self._frame_rate
+        return self.sampling_rate / self.frame_size
+
 
 __all__ = ["MimiConfig"]
```
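
A quick sketch of what the new `frame_size` and `frame_rate` properties should yield with default Mimi parameters (upsampling ratios of 8, 6, 5 and 4 plus the extra stride-2 downsampling layer give a cumulative stride of 1920 samples, i.e. 12.5 Hz at 24 kHz, matching the previously hard-coded default):

```python
from transformers import MimiConfig

config = MimiConfig()
print(config.frame_size)   # expected: 1920 (cumulative encoder stride in samples)
print(config.frame_rate)   # expected: 12.5 (24000 / 1920)

# passing frame_rate explicitly keeps the old behavior for existing configs
legacy = MimiConfig(frame_rate=12.5)
print(legacy.frame_rate)   # 12.5
```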
