Remove graph breaks for torch.compile() in flash_attention_forward when the Llama model is padding-free tuned #33932
Diff 1: flash attention utilities (`prepare_fa2_from_position_ids` / `_flash_attention_forward`)

@@ -180,6 +180,10 @@ def prepare_fa2_from_position_ids(query, key, value, position_ids):
     return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length))


+flash_241 = is_flash_attn_greater_or_equal("2.4.1")
+deterministic_g = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
+
+
 def _flash_attention_forward(
     query_states: torch.Tensor,
     key_states: torch.Tensor,
@@ -194,6 +198,11 @@ def _flash_attention_forward(
     use_top_left_mask: bool = False,
     softcap: Optional[float] = None,
     deterministic: bool = None,
+    cu_seq_lens_q: Optional[torch.LongTensor] = None,
+    cu_seq_lens_k: Optional[torch.LongTensor] = None,
+    max_length_q: int = 0,
+    max_length_k: int = 0,
+    batch_size: int = 2,
 ):
     """
     Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
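The new keyword arguments let the caller hand precomputed varlen metadata to the kernel instead of deriving it from `position_ids` inside the compiled forward. As a rough illustration only (this is not the PR's collator code, and `get_cu_seq_lens_from_position_ids` is a hypothetical helper), the cumulative sequence lengths and max length could be computed once, outside the model, from packed `position_ids` that restart at 0 for every sequence:

```python
import torch

def get_cu_seq_lens_from_position_ids(position_ids: torch.Tensor):
    """Derive flash-attn varlen metadata from packed position_ids (sketch).

    position_ids has shape (1, total_tokens), where each packed sequence
    restarts at 0, e.g. [0, 1, 2, 0, 1, 0, 1, 2, 3].
    Returns (cu_seq_lens, max_length) in the format flash_attn_varlen_func expects.
    """
    flat = position_ids.flatten()
    # A new sequence starts wherever the position counter resets to 0.
    starts = torch.nonzero(flat == 0, as_tuple=False).flatten()
    cu_seq_lens = torch.cat(
        [starts, torch.tensor([flat.numel()], device=flat.device)]
    ).to(torch.int32)
    max_length = int((cu_seq_lens[1:] - cu_seq_lens[:-1]).max())
    return cu_seq_lens, max_length

# Example: two packed sequences of lengths 3 and 5
position_ids = torch.tensor([[0, 1, 2, 0, 1, 2, 3, 4]])
cu, max_len = get_cu_seq_lens_from_position_ids(position_ids)
print(cu)       # tensor([0, 3, 8], dtype=torch.int32)
print(max_len)  # 5
```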
@@ -232,9 +241,9 @@ def _flash_attention_forward(
     )
     flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {}

-    if is_flash_attn_greater_or_equal("2.4.1"):
+    if flash_241:
         if deterministic is None:
-            deterministic = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
+            deterministic = deterministic_g
         flash_kwargs["deterministic"] = deterministic

     if softcap is not None:
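Hoisting the version check and the environment lookup to module scope means they are evaluated once at import time; inside the compiled forward they are then plain Python constants, which sidesteps the graph breaks these lookups were causing under `torch.compile`. A minimal sketch of the pattern, assuming a recent PyTorch with `torch.compile` (names are illustrative, not from the PR):

```python
import os
import torch

# Evaluated once at import time; inside the compiled function this is an
# ordinary Python constant that torch.compile can specialize on.
USE_DETERMINISTIC = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"

@torch.compile(fullgraph=True)  # fullgraph=True raises on any graph break
def scaled(x: torch.Tensor) -> torch.Tensor:
    # Reading os.environ here instead may force a graph break
    # (and hence an error under fullgraph=True).
    if USE_DETERMINISTIC:
        return x * 2.0
    return x * 3.0

print(scaled(torch.ones(2)))
```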
@@ -267,24 +276,15 @@ def _flash_attention_forward(
     # If position_ids is provided and check all examples do not contain only 1 sequence, If tensor in increasing
     # then we probably have one sequence, otherwise it is packed. Additionally check we are in pre-fill/training stage.
     # Use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach
-    # Note: the `torch.diff(...)` condition is last to use short-circuit and avoid the cuda synchronization it incurs during inference (query_length == 1 always)
-    elif position_ids is not None and query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all():
-        batch_size = query_states.size(0)
-        query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids(
-            query_states, key_states, value_states, position_ids
-        )
-
-        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
-        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
Review comment: We should keep this in case `cu_seq_lens_q`/`cu_seq_lens_k` etc. are not passed, so we can still compute them here!
-
+    elif position_ids is not None and max_length_q is not None:
         attn_output = flash_attn_varlen_func(
             query_states,
             key_states,
             value_states,
-            cu_seqlens_q=cu_seqlens_q,
-            cu_seqlens_k=cu_seqlens_k,
-            max_seqlen_q=max_seqlen_in_batch_q,
-            max_seqlen_k=max_seqlen_in_batch_k,
+            cu_seqlens_q=cu_seq_lens_q,
+            cu_seqlens_k=cu_seq_lens_k,
+            max_seqlen_q=max_length_q,
+            max_seqlen_k=max_length_k,
             dropout_p=dropout,
             softmax_scale=softmax_scale,
             causal=causal,
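With the metadata passed in explicitly, the padding-free branch no longer needs the data-dependent `torch.diff(position_ids)` check or the on-the-fly `prepare_fa2_from_position_ids` call inside the compiled region. For reference, a self-contained sketch of calling `flash_attn_varlen_func` directly on two packed sequences (assumes the `flash-attn` package and a CUDA device; shapes and values are illustrative):

```python
import torch
from flash_attn import flash_attn_varlen_func  # pip install flash-attn

num_heads, head_dim = 8, 64
# Two sequences of lengths 3 and 5 packed into one ragged batch of
# 8 total tokens: tensors have shape (total_tokens, num_heads, head_dim).
total_tokens = 8
q = torch.randn(total_tokens, num_heads, head_dim, dtype=torch.float16, device="cuda")
k = torch.randn_like(q)
v = torch.randn_like(q)

cu_seqlens = torch.tensor([0, 3, 8], dtype=torch.int32, device="cuda")
max_seqlen = 5

out = flash_attn_varlen_func(
    q, k, v,
    cu_seqlens_q=cu_seqlens,
    cu_seqlens_k=cu_seqlens,
    max_seqlen_q=max_seqlen,
    max_seqlen_k=max_seqlen,
    causal=True,  # each packed sequence attends only to its own past tokens
)
print(out.shape)  # torch.Size([8, 8, 64])
```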
Diff 2: Llama modeling code

@@ -48,6 +48,9 @@
     logging,
     replace_return_docstrings,
 )
+from ...processing_utils import (
+    Fa2Kwargs,
+)
 from .configuration_llama import LlamaConfig


@@ -421,6 +424,7 @@ def forward(
         use_cache: bool = False,
         cache_position: Optional[torch.LongTensor] = None,
         position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
+        **kwargs,
Suggested change:
-        **kwargs,
+        **kwargs: Unpack[Fa2Kwargs],
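Annotating the catch-all as `Unpack[Fa2Kwargs]` tells type checkers exactly which flash-attention kwargs may flow through `**kwargs`. A sketch of what such a TypedDict could look like, assuming it mirrors the new `_flash_attention_forward` parameters; the actual `Fa2Kwargs` definition in `processing_utils` may differ:

```python
from typing import Optional

import torch
from typing_extensions import TypedDict, Unpack


class Fa2Kwargs(TypedDict, total=False):
    """Flash Attention 2 metadata forwarded through **kwargs (sketch only)."""

    cu_seq_lens_q: Optional[torch.LongTensor]
    cu_seq_lens_k: Optional[torch.LongTensor]
    max_length_q: int
    max_length_k: int


def forward(hidden_states: torch.Tensor, **kwargs: Unpack[Fa2Kwargs]) -> torch.Tensor:
    # Type checkers now know exactly which extra keyword arguments are allowed here.
    print(kwargs.get("max_length_q", 0))
    return hidden_states


forward(torch.ones(1), max_length_q=128)
```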
Referenced lines (from the modeling diff):
        batch_size = query_states.size(0)
        query_states = query_states.reshape(-1, query_states.size(-2), query_states.size(-1))
        key_states = key_states.reshape(-1, key_states.size(-2), key_states.size(-1))
        value_states = value_states.reshape(-1, value_states.size(-2), value_states.size(-1))
Review comment: I think we should do this in the `_flash_attention_forward` wrapper; that way we have zero modeling changes, and all models will benefit easily from this!
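The quoted reshape collapses `(batch, seq_len, num_heads, head_dim)` tensors into the `(total_tokens, num_heads, head_dim)` ragged layout that `flash_attn_varlen_func` expects; moving it into `_flash_attention_forward`, as suggested, keeps the per-model code untouched. A sketch of that flattening as a standalone helper (the function name is illustrative, not the merged code):

```python
import torch

def _flatten_for_varlen(query_states, key_states, value_states):
    """Collapse the batch dimension so packed sequences form one ragged batch.

    Input:  (batch, seq_len, num_heads, head_dim)
    Output: (batch * seq_len, num_heads, head_dim), plus the original batch size
            so the output can be reshaped back after the varlen kernel.
    """
    batch_size = query_states.size(0)
    query_states = query_states.reshape(-1, query_states.size(-2), query_states.size(-1))
    key_states = key_states.reshape(-1, key_states.size(-2), key_states.size(-1))
    value_states = value_states.reshape(-1, value_states.size(-2), value_states.size(-1))
    return query_states, key_states, value_states, batch_size

q = torch.randn(2, 4, 8, 64)
q_flat, k_flat, v_flat, bsz = _flatten_for_varlen(q, q.clone(), q.clone())
print(q_flat.shape, bsz)  # torch.Size([8, 8, 64]) 2
```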
Suggested change:
-    **fa2_kwargs: Fa2Kwargs,
+    **fa2_kwargs: Unpack[Fa2Kwargs],
Review comment: Nice catch, this was breaking compile on my side as well.