Commit c800883

Cyrilvallez authored and ArthurZucker committed
🚨🚨[core] Completely rewrite the masking logic for all attentions (huggingface#37866)
* start * start having a clean 4d mask primitive * Update mask_utils.py * Update mask_utils.py * switch name * Update masking_utils.py * add a new AttentionMask tensor class * fix import * nits * fixes * use full and quandrants * general sdpa mask for all caches * style * start some tests * tests with sliding, chunked * add styling * test hybrid * Update masking_utils.py * small temp fixes * Update modeling_gemma2.py * compile compatible * Update masking_utils.py * improve * start making it more general * Update masking_utils.py * generate * make it work with flex style primitives! * Update masking_utils.py * Update masking_utils.py * Update masking_utils.py * improve * Update cache_utils.py * Update masking_utils.py * simplify - starting to look good! * Update masking_utils.py * name * Update masking_utils.py * style * Update masking_utils.py * Update masking_utils.py * Update masking_utils.py * Update masking_utils.py * small fix for flex * flex compile * FA2 * Update masking_utils.py * Escape for TGI/vLLM! * Update masking_utils.py * Update masking_utils.py * Update masking_utils.py * General case without cache * rename * full test on llama4 * small fix for FA2 guard with chunk * Update modeling_gemma2.py * post rebase cleanup * FA2 supports static cache! * Update modeling_flash_attention_utils.py * Update flex_attention.py * Update masking_utils.py * Update masking_utils.py * Update utils.py * override for export * Update executorch.py * Update executorch.py * Update executorch.py * Update executorch.py * Update masking_utils.py * Update masking_utils.py * output attentions * style * Update masking_utils.py * Update executorch.py * Add doicstring * Add license and put mask visualizer at the end * Update test_modeling_common.py * fix broken test * Update test_modeling_gemma.py * Update test_modeling_gemma2.py * Use fullgraph=False with FA2 * Update utils.py * change name * Update masking_utils.py * improve doc * change name * Update modeling_attn_mask_utils.py * more explicit logic based on model's property * pattern in config * extend * fixes * make it better * generalize to other test models * fix * Update masking_utils.py * fix * do not check mask equivalence if layer types are different * executorch * Update modeling_gemma2.py * Update masking_utils.py * use layer_idx instead * adjust * Update masking_utils.py * test * fix imports * Update modeling_gemma2.py * other test models * Update modeling_llama4.py * Update masking_utils.py * improve * simplify * Update masking_utils.py * typos * typo * fix * Update masking_utils.py * default DynamicCache * remove default cache * simplify * Update masking_utils.py * Update masking_utils.py * Update masking_utils.py * Update masking_utils.py * simplify * Update masking_utils.py * Update masking_utils.py * Update masking_utils.py * export * Update executorch.py * Update executorch.py * Update flex_attention.py * Update executorch.py * upstream to modular gemma 1 & 2 * Update modular_mistral.py * switch names * use dict * put it in the Layer directly * update copy model source for mask functions * apply so many modular (hopefully 1 shot) * use explicite dicts for make style happy * protect import * check docstring * better default in hybrid caches * qwens * Update modular_qwen2.py * simplify core logic! 
* Update executorch.py * qwen3 moe * Update masking_utils.py * Update masking_utils.py * simplify a lot sdpa causal skip * Update masking_utils.py * post-rebase * gemma3 finally * style * check it before * gemma3 * More general with newer torch * align gemma3 * Update utils.py * Update utils.py * Update masking_utils.py * Update test_modeling_common.py * Update flex_attention.py * Update flex_attention.py * Update flex_attention.py * test * executorch * Update test_modeling_common.py * Update masking_utils.py * Update masking_utils.py * Update masking_utils.py * Update masking_utils.py * Update executorch.py * Update test_modeling_common.py * fix copies * device * sdpa can be used without mask -> pass the torchscript tests in this case * Use enum for check * revert enum and add check instead * remove broken test * cohere2 * some doc & reorganize the Interface * Update tensor_parallel.py * Update tensor_parallel.py * doc and dummy * Update test_modeling_paligemma2.py * Update modeling_falcon_h1.py * Update masking_utils.py * executorch patch * style * CIs * use register in executorch * final comments! --------- Co-authored-by: Arthur Zucker <[email protected]>
1 parent ce808ad commit c800883

File tree

129 files changed: +2984 -6808 lines changed

docs/source/en/attention_interface.md

Lines changed: 41 additions & 1 deletion
@@ -125,4 +125,44 @@ would expect from a usual Python dictionary:
 
 # You can also globally `register` a new function directly on it
 >>> ALL_ATTENTION_FUNCTIONS.register("new_func", new_func)
-```
+```
+
+## Attention Mask Interface
+
+Having a new attention function may mean that you need a new format of attention mask to decide what key and value tokens
+the query tokens should attend to. This is now possible with the `AttentionMaskInterface`! It works in the same way as
+the `AttentionInterface`:
+
+```python
+from transformers import AttentionMaskInterface
+from transformers.masking_utils import sdpa_mask
+import torch
+
+def my_new_sdpa_mask(*args, **kwargs):
+    print("I just entered the attention mask computation")
+    return sdpa_mask(*args, **kwargs)
+
+AttentionMaskInterface.register("my_new_sdpa_mask", my_new_sdpa_mask)
+```
+
+The reason you have to register it is because we need to automatically correct your mask format based on the attention implementation (for example, flex attention uses a BlockMask format, while sdpa uses a 4D tensor).
+By default, if you do not register an attention mask function along with your attention function, mask creation will be skipped
+and `attention_mask=None` will be passed along to the Attention layers.
+
+The default signature of the attention mask functions is the following:
+
+```python
+def custom_attention_mask(
+    batch_size: int,  # required arg
+    cache_position: torch.Tensor,  # required arg
+    kv_length: int,  # required arg
+    kv_offset: int = 0,  # required arg
+    mask_function: Callable = causal_mask_function,  # required arg
+    attention_mask: Optional[torch.Tensor] = None,  # required arg
+    **kwargs,  # a few additional args may be passed as kwargs, especially the model's config is always passed
+) -> Optional[torch.Tensor]:
+```
+
+It mostly works thanks to the `mask_function`, which is a `Callable` in the form of [torch's mask_mod functions](https://pytorch.org/blog/flexattention/), taking 4 indices as input and returning a boolean to indicate if this position should take part in the attention computation.
+
+If you cannot use the `mask_function` to create your mask for some reason, you can try to work around it by doing something similar to our [torch export workaround](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/executorch.py).
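
For reference, a minimal sketch of such a `mask_function` in the mask_mod style described by the new doc above (the name `sliding_causal_mask_function` and the window size of 8 are illustrative, not part of this commit): it receives the four indices (batch, head, query position, key/value position) and returns `True` when that pair should take part in attention.

```python
import torch

# Illustrative mask_mod-style callable: True means the (query, key/value) pair is attended to.
# Combines causality with a hypothetical sliding window of 8 positions.
def sliding_causal_mask_function(batch_idx, head_idx, q_idx, kv_idx):
    return (kv_idx <= q_idx) & (q_idx - kv_idx < 8)

# Quick sanity check on scalar tensors
print(sliding_causal_mask_function(0, 0, torch.tensor(10), torch.tensor(5)))  # tensor(True)
print(sliding_causal_mask_function(0, 0, torch.tensor(20), torch.tensor(5)))  # tensor(False): outside the window
```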

docs/source/en/internal/modeling_utils.md

Lines changed: 5 additions & 0 deletions
@@ -29,6 +29,11 @@ Most of those are only useful if you are studying the code of the models in the
 [[autodoc]] AttentionInterface
     - register
 
+## Attention Mask Functions
+
+[[autodoc]] AttentionMaskInterface
+    - register
+
 ## Rotary Position Embedding Functions
 
 [[autodoc]] dynamic_rope_update

src/transformers/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -445,6 +445,7 @@
     _import_structure["modeling_outputs"] = []
     _import_structure["modeling_rope_utils"] = ["ROPE_INIT_FUNCTIONS", "dynamic_rope_update"]
     _import_structure["modeling_utils"] = ["PreTrainedModel", "AttentionInterface"]
+    _import_structure["masking_utils"] = ["AttentionMaskInterface"]
     _import_structure["optimization"] = [
         "Adafactor",
         "get_constant_schedule",
@@ -914,6 +915,7 @@
         TorchExportableModuleWithStaticCache,
         convert_and_export_with_cache,
     )
+    from .masking_utils import AttentionMaskInterface
     from .model_debugging_utils import (
         model_addition_debugger_context,
     )

src/transformers/cache_utils.py

Lines changed: 103 additions & 16 deletions
@@ -196,6 +196,18 @@ def seen_tokens(self):
         else:
             return None
 
+    def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
+        """
+        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
+        the given layer at `layer_idx`.
+        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
+        for each layer.
+        """
+        query_length = cache_position.shape[0]
+        past_seen_tokens = self.get_seq_length()
+        kv_length = query_length + past_seen_tokens
+        return kv_length, 0
+
 
 @dataclass
 class CacheConfig:
@@ -1084,8 +1096,6 @@ class SinkCache(Cache):
     ```
     """
 
-    is_sliding = True
-
     def __init__(self, window_length: int, num_sink_tokens: int) -> None:
         super().__init__()
         self.key_cache: List[torch.Tensor] = []
@@ -1390,6 +1400,16 @@ def reset(self):
             self.key_cache[layer_idx].zero_()
             self.value_cache[layer_idx].zero_()
 
+    def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
+        """
+        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
+        the given layer at `layer_idx`.
+        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
+        for each layer.
+        """
+        kv_length = self.get_max_cache_shape()
+        return kv_length, 0
+
 
 class SlidingWindowCache(StaticCache):
     """
@@ -1446,7 +1466,6 @@ class SlidingWindowCache(StaticCache):
     ```
     """
 
-    is_sliding = True
     is_compileable = True
 
     def __init__(
@@ -1465,6 +1484,7 @@ def __init__(
                 "config and it's not set to None."
             )
         max_cache_len = min(config.sliding_window, max_cache_len)
+        self.sliding_window = config.sliding_window
         super().__init__(
             config=config,
             max_batch_size=max_batch_size,
@@ -1509,6 +1529,21 @@ def reset(self):
             self.key_cache[layer_idx].zero_()
             self.value_cache[layer_idx].zero_()
 
+    def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
+        """
+        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
+        the given layer at `layer_idx`.
+        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
+        for each layer.
+        """
+        query_length = cache_position.shape[0]
+        first_cache_position = cache_position[0]
+        # torch.clamp() is equivalent to max() but should be compile-friendly/exportable as first_cache_position is a Tensor
+        kv_offset = torch.clamp(first_cache_position - self.sliding_window + 1, min=0)
+        # This is not general (see HybridChunkedCache for the whole general case), but it's what the cache returns
+        kv_length = max(query_length, self.get_max_cache_shape())
+        return kv_length, kv_offset
+
 
 class EncoderDecoderCache(Cache):
     """
@@ -1761,12 +1796,17 @@ def __init__(
             else config.num_key_value_heads
         )
 
-        layer_switch = config.sliding_window_pattern if hasattr(config, "sliding_window_pattern") else 2  # 2 is for BC
-        self.is_sliding_list = [bool((i + 1) % layer_switch) for i in range(config.num_hidden_layers)]
+        # If the attribute does not exist in the config, fallback to a simple StaticCache
+        if hasattr(config, "layer_types"):
+            self.is_sliding = [layer_type != "full_attention" for layer_type in config.layer_types]
+        else:
+            self.is_sliding = [False] * config.num_hidden_layers
+
         self.key_cache: List[torch.Tensor] = []
         self.value_cache: List[torch.Tensor] = []
         global_cache_shape = (self.max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
         sliding_cache_shape = (self.max_batch_size, self.num_key_value_heads, self.sliding_window_len, self.head_dim)
+        self.sliding_window = min(config.sliding_window, max_cache_len)
         device = torch.device(device) if device is not None else None
         for i in range(config.num_hidden_layers):
             if layer_device_map is not None:
@@ -1775,7 +1815,7 @@ def __init__(
                 layer_device = device
             # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
             # breaks when updating the cache.
-            cache_shape = sliding_cache_shape if self.is_sliding_list[i] else global_cache_shape
+            cache_shape = sliding_cache_shape if self.is_sliding[i] else global_cache_shape
             new_layer_key_cache = torch.zeros(cache_shape, dtype=self._dtype, device=layer_device)
             new_layer_value_cache = torch.zeros(cache_shape, dtype=self._dtype, device=layer_device)
             torch._dynamo.mark_static_address(new_layer_key_cache)
@@ -1796,7 +1836,7 @@ def update(
         if cache_position is None:
             raise ValueError("`cache_position` must be provided for HybridCache.")
 
-        is_sliding_layer = self.is_sliding_list[layer_idx]
+        is_sliding_layer = self.is_sliding[layer_idx]
 
         # These two `if` blocks are only reached in multigpu and if `layer_device_map` is not passed. They are used
         # when the cache is initialized in the forward pass (e.g. Gemma2)
@@ -1843,6 +1883,26 @@ def reset(self):
             self.key_cache[layer_idx].zero_()
             self.value_cache[layer_idx].zero_()
 
+    def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
+        """
+        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
+        the given layer at `layer_idx`.
+        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
+        for each layer.
+        """
+        if self.is_sliding[layer_idx]:
+            query_length = cache_position.shape[0]
+            first_cache_position = cache_position[0]
+
+            local_mask_kv_offset = torch.clamp(first_cache_position - self.sliding_window + 1, min=0)
+            # This is not general (see HybridChunkedCache for the whole general case), but it's what the cache returns
+            local_mask_kv_length = max(query_length, self.sliding_window)
+            return local_mask_kv_length, local_mask_kv_offset
+
+        full_mask_kv_offset = 0
+        full_mask_kv_length = self.get_max_cache_shape()
+        return full_mask_kv_length, full_mask_kv_offset
+
 
 class HybridChunkedCache(Cache):
     """
@@ -1912,11 +1972,11 @@ def __init__(
         self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
         self._dtype = dtype
 
-        if hasattr(config.get_text_config(), "no_rope_layers"):
-            self.is_sliding = config.no_rope_layers
+        # If the attribute does not exist in the config, fallback to a simple StaticCache
+        if hasattr(config, "layer_types"):
+            self.is_sliding = [layer_type != "full_attention" for layer_type in config.layer_types]
         else:
-            layer_switch = getattr(config, "sliding_window_pattern", 2)
-            self.is_sliding = [bool((i + 1) % layer_switch) for i in range(config.num_hidden_layers)]
+            self.is_sliding = [False] * config.num_hidden_layers
 
         self.key_cache: List[torch.Tensor] = []
         self.value_cache: List[torch.Tensor] = []
@@ -1999,11 +2059,7 @@ def update(
         key_states = key_states.to(k_out.dtype)
         value_states = value_states.to(v_out.dtype)
 
-        if self.is_sliding[layer_idx]:
-            update_fn = self._sliding_update
-        else:
-            update_fn = self._static_update
-
+        update_fn = self._sliding_update if self.is_sliding[layer_idx] else self._static_update
         return update_fn(
             cache_position,
             layer_idx,
@@ -2038,6 +2094,37 @@ def reset(self):
             self.value_cache[layer_idx].zero_()
         self.cumulative_length = [0 for _ in range(len(self.cumulative_length))]
 
+    def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
+        """
+        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
+        the given layer at `layer_idx`.
+        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
+        for each layer.
+        """
+        if self.is_sliding[layer_idx]:
+            query_length = cache_position.shape[0]
+            first_cache_position = cache_position[0]
+
+            local_mask_kv_offset = torch.clamp(first_cache_position - self.sliding_window + 1, min=0)
+            # This is the true general case for any Cache using local attention (sliding or chunked)
+            if first_cache_position >= self.sliding_window:
+                # Here the Cache is already full
+                local_mask_kv_length = self.sliding_window + query_length - 1
+            elif (
+                first_cache_position < self.sliding_window
+                and first_cache_position + query_length > self.sliding_window
+            ):
+                # Here the Cache becomes full with the new input
+                local_mask_kv_length = first_cache_position + query_length
+            else:
+                # Here the Cache is still smaller than the local size, but we return the local size as it's static
+                local_mask_kv_length = self.sliding_window
+            return local_mask_kv_length, local_mask_kv_offset
+
+        full_mask_kv_offset = 0
+        full_mask_kv_length = self.get_max_cache_shape()
+        return full_mask_kv_length, full_mask_kv_offset
+
 
 class OffloadedHybridCache(HybridChunkedCache):
     def __init__(
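
To make the `(kv_length, kv_offset)` contract of these new `get_mask_sizes` methods concrete, here is an illustrative sketch (the helper `build_boolean_mask` is hypothetical; inside the library this expansion is handled by `masking_utils`, not by the cache) of how an offset/length pair can be turned into a boolean sliding-window mask over the cached keys:

```python
import torch

def build_boolean_mask(cache_position: torch.Tensor, kv_length: int, kv_offset: int, sliding_window: int) -> torch.Tensor:
    # Absolute key/value positions covered by the cached window
    kv_position = torch.arange(kv_length) + kv_offset      # shape (kv_length,)
    q_position = cache_position.unsqueeze(-1)               # shape (q_len, 1)
    causal = kv_position <= q_position                      # cannot attend to the future
    in_window = q_position - kv_position < sliding_window   # cannot attend too far back
    return causal & in_window                                # shape (q_len, kv_length)

# With sliding_window=4 and a single query at absolute position 10, SlidingWindowCache.get_mask_sizes
# above returns kv_length=4, kv_offset=7: only keys 7..10 are materialized, and all are visible here.
mask = build_boolean_mask(torch.tensor([10]), kv_length=4, kv_offset=7, sliding_window=4)
print(mask)  # tensor([[True, True, True, True]])
```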

src/transformers/configuration_utils.py

Lines changed: 13 additions & 0 deletions
@@ -1209,3 +1209,16 @@ def recursive_diff_dict(dict_a, dict_b, config_obj=None):
 PretrainedConfig.push_to_hub.__doc__ = PretrainedConfig.push_to_hub.__doc__.format(
     object="config", object_class="AutoConfig", object_files="configuration file"
 )
+
+
+ALLOWED_LAYER_TYPES = (
+    "full_attention",
+    "sliding_attention",
+    "chunked_attention",
+)
+
+
+def layer_type_validation(layer_types: list[str]):
+    """Check that each entry in `layer_types` are allowed."""
+    if not all(layer_type in ALLOWED_LAYER_TYPES for layer_type in layer_types):
+        raise ValueError(f"The `layer_types` entries must be in {ALLOWED_LAYER_TYPES}")
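
A quick usage sketch of the new validator (assuming it is imported straight from `transformers.configuration_utils`, where this diff defines it):

```python
from transformers.configuration_utils import layer_type_validation

layer_type_validation(["sliding_attention", "full_attention"])  # allowed entries: passes silently

try:
    layer_type_validation(["global_attention"])  # not one of the allowed layer types
except ValueError as err:
    print(err)  # The `layer_types` entries must be in ('full_attention', 'sliding_attention', 'chunked_attention')
```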

src/transformers/generation/utils.py

Lines changed: 30 additions & 5 deletions
@@ -46,6 +46,7 @@
 )
 from ..integrations.deepspeed import is_deepspeed_zero3_enabled
 from ..integrations.fsdp import is_fsdp_managed_module
+from ..masking_utils import create_masks_for_generate
 from ..modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput
 from ..pytorch_utils import isin_mps_friendly
 from ..tokenization_utils import ExtensionsTrie
@@ -74,6 +75,7 @@
 from .configuration_utils import (
     NEED_SETUP_CACHE_CLASSES_MAPPING,
     QUANT_BACKEND_CLASSES_MAPPING,
+    CompileConfig,
     GenerationConfig,
     GenerationMode,
 )
@@ -649,12 +651,22 @@ def prepare_inputs_for_generation(
             causal_mask_creation_function = getattr(
                 decoder, "_prepare_4d_causal_attention_mask_with_cache_position", None
             )
+
+            # If it's not defined, it means the model uses the new general mask API
             if causal_mask_creation_function is None:  # can't be found
-                logger.warning_once(
-                    f"{self.__class__.__name__} has no `_prepare_4d_causal_attention_mask_with_cache_position` method "
-                    "defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're "
-                    "writing code, see Llama for an example implementation. If you're a user, please report this "
-                    "issue on GitHub."
+                output_attentions = kwargs.get("output_attentions", False)
+                token_type_ids = getattr(model_input, "token_type_ids", None)
+                # Some models may overwrite the general one
+                causal_mask_creation_function = getattr(self, "create_masks_for_generate", create_masks_for_generate)
+                attention_mask = causal_mask_creation_function(
+                    config=self.config,
+                    # we only need batch size, seq_length and dtype here - we don't care about the values of the embeddings
+                    input_embeds=torch.empty((batch_size, sequence_length), dtype=self.dtype),
+                    attention_mask=attention_mask,
+                    cache_position=cache_position,
+                    past_key_values=past_key_values,
+                    output_attentions=output_attentions,
+                    token_type_ids=token_type_ids,
                 )
             else:
                 attention_mask = causal_mask_creation_function(
@@ -3533,6 +3545,19 @@ def _sample(
         compile_forward = self._valid_auto_compile_criteria(model_kwargs, generation_config)
         if compile_forward:
             os.environ["TOKENIZERS_PARALLELISM"] = "0"
+            # If we use FA2 and a static cache, we cannot compile with fullgraph
+            if self.config._attn_implementation == "flash_attention_2" and getattr(
+                model_kwargs.get("past_key_values"), "is_compileable", False
+            ):
+                if generation_config.compile_config is None:
+                    generation_config.compile_config = CompileConfig(fullgraph=False)
+                # only raise warning if the user passed an explicit compile-config (otherwise, simply change the default without confusing the user)
+                elif generation_config.compile_config.fullgraph:
+                    logger.warning_once(
+                        "When using Flash Attention 2 and a static cache, you cannot use the option `CompileConfig(fullgraph=True)` as "
+                        "FA2 introduces graph breaks. We overrode the option with `fullgraph=False`."
+                    )
+                    generation_config.compile_config.fullgraph = False
             model_forward = self.get_compiled_call(generation_config.compile_config)
 
         if generation_config.prefill_chunk_size is not None: