Merged
Commits
38 commits
d541997
fix: fixes for graph breaks
Abhishek-TAMU Oct 3, 2024
35b2aa6
fix: formatting
Abhishek-TAMU Oct 3, 2024
5cefb84
fix: import error
Abhishek-TAMU Oct 3, 2024
aa7b014
fix: Add Fa2Kwargs
Abhishek-TAMU Oct 7, 2024
c42deaa
Merge branch 'main' into compile_llama
Abhishek-TAMU Oct 8, 2024
926481b
fix: PR Changes
Abhishek-TAMU Oct 9, 2024
85f1330
Merge branch 'compile_llama' of https://github.com/Abhishek-TAMU/tran…
Abhishek-TAMU Oct 9, 2024
01fb377
Merge branch 'main' into compile_llama
Abhishek-TAMU Oct 9, 2024
5ec657f
Merge branch 'main' into compile_llama
Abhishek-TAMU Oct 10, 2024
20a4dd6
PR changes
Abhishek-TAMU Oct 10, 2024
045ef16
PR changes
Abhishek-TAMU Oct 10, 2024
d2796f6
PR changes
Abhishek-TAMU Oct 11, 2024
39d2868
PR changes
Abhishek-TAMU Oct 11, 2024
83747b5
Revert "PR changes"
Abhishek-TAMU Oct 11, 2024
b642d45
PR changes
Abhishek-TAMU Oct 11, 2024
d760818
Merge branch 'huggingface:main' into compile_llama
Abhishek-TAMU Oct 14, 2024
d03e673
fix: FlashAttentionKwarg
Abhishek-TAMU Oct 14, 2024
91f6fa1
Merge branch 'compile_llama' of https://github.com/Abhishek-TAMU/tran…
Abhishek-TAMU Oct 14, 2024
80e0d5f
fix: FlashAttentionKwarg
Abhishek-TAMU Oct 14, 2024
ca42b8b
PR Changes
Abhishek-TAMU Oct 15, 2024
b8d2568
PR Changes
Abhishek-TAMU Oct 15, 2024
ae11c96
PR Changes
Abhishek-TAMU Oct 15, 2024
76c51ca
PR Changes
Abhishek-TAMU Oct 15, 2024
2a69f6c
Merge branch 'huggingface:main' into compile_llama
Abhishek-TAMU Oct 15, 2024
77c7a3d
PR Changes
Abhishek-TAMU Oct 16, 2024
5333e89
Merge remote-tracking branch 'huggingface/main' into compile_llama
Abhishek-TAMU Oct 18, 2024
391715a
addition of documentation
Abhishek-TAMU Oct 18, 2024
f23c955
change in _flash_attention_forward
Abhishek-TAMU Oct 21, 2024
ba54841
Merge remote-tracking branch 'huggingface/main' into compile_llama
Abhishek-TAMU Oct 22, 2024
67c7828
make fix-copies
Abhishek-TAMU Oct 22, 2024
8d2ec29
revert make fix-copies
Abhishek-TAMU Oct 22, 2024
480c78d
Merge remote-tracking branch 'huggingface/main' into compile_llama
Abhishek-TAMU Oct 23, 2024
5a903da
fix copies
ArthurZucker Oct 23, 2024
05f9a80
style
ArthurZucker Oct 23, 2024
6843a9c
Merge branch 'main' of github.com:huggingface/transformers into compi…
ArthurZucker Oct 23, 2024
a6e2601
loss kwargs typing
ArthurZucker Oct 23, 2024
dd0bd9a
Merge branch 'main' of github.com:huggingface/transformers into compi…
ArthurZucker Oct 24, 2024
cb08b63
style and pull latest changes
ArthurZucker Oct 24, 2024
23 changes: 12 additions & 11 deletions src/transformers/modeling_flash_attention_utils.py
@@ -180,6 +180,10 @@ def prepare_fa2_from_position_ids(query, key, value, position_ids):
return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length))


flash_241 = is_flash_attn_greater_or_equal("2.4.1")
deterministic_g = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
Comment on lines +183 to +184
nice catch this was breaking compile on my side as well
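
(Illustrative sketch, not part of the diff: hoisting the version and environment checks to module scope means they are evaluated once at import time, so torch.compile never has to trace os.environ inside the attention forward. The names below are hypothetical.)

import os
from typing import Optional

import torch

# Evaluated once at import time, mirroring flash_241 / deterministic_g above,
# so the compiled region never reads os.environ (which was causing the graph break).
_DETERMINISTIC_DEFAULT = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"


def attention_stub(q: torch.Tensor, deterministic: Optional[bool] = None) -> torch.Tensor:
    # No environment or version lookups in here, so the function traces as a single graph.
    if deterministic is None:
        deterministic = _DETERMINISTIC_DEFAULT
    return q  # stand-in for the real flash-attention kernel call


compiled_stub = torch.compile(attention_stub, fullgraph=True)  # fullgraph=True errors on any graph break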



def _flash_attention_forward(
query_states: torch.Tensor,
key_states: torch.Tensor,
@@ -194,6 +198,11 @@ def _flash_attention_forward(
use_top_left_mask: bool = False,
softcap: Optional[float] = None,
deterministic: bool = None,
cu_seqlens_q: Optional[torch.LongTensor] = None,
cu_seqlens_k: Optional[torch.LongTensor] = None,
max_seqlen_in_batch_q: int = 0,
max_seqlen_in_batch_k: int = 0,
batch_size: int = 2,
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
@@ -232,9 +241,9 @@
)
flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {}

if is_flash_attn_greater_or_equal("2.4.1"):
if flash_241:
if deterministic is None:
deterministic = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
deterministic = deterministic_g
flash_kwargs["deterministic"] = deterministic

if softcap is not None:
@@ -267,15 +276,7 @@
# If position_ids is provided and check all examples do not contain only 1 sequence, If tensor in increasing
# then we probably have one sequence, otherwise it is packed. Additionally check we are in pre-fill/training stage.
# Use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach
elif position_ids is not None and not (torch.diff(position_ids, dim=-1) >= 0).all() and query_length != 1:
batch_size = query_states.size(0)
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids(
query_states, key_states, value_states, position_ids
)

cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

elif position_ids is not None and max_seqlen_in_batch_q is not None:
attn_output = flash_attn_varlen_func(
query_states,
key_states,
38 changes: 38 additions & 0 deletions src/transformers/models/llama/modeling_llama.py
@@ -422,6 +422,10 @@ def forward(
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
cu_seq_lens_q: Optional[torch.LongTensor] = None,
cu_seq_lens_k: Optional[torch.LongTensor] = None,
max_length_q: int = 0,
max_length_k: int = 0,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if isinstance(past_key_value, StaticCache):
raise ValueError(
@@ -495,6 +499,11 @@ def forward(
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)

batch_size = query_states.size(0)
query_states = query_states.reshape(-1, query_states.size(-2), query_states.size(-1))
key_states = key_states.reshape(-1, key_states.size(-2), key_states.size(-1))
value_states = value_states.reshape(-1, value_states.size(-2), value_states.size(-1))
Suggested change
batch_size = query_states.size(0)
query_states = query_states.reshape(-1, query_states.size(-2), query_states.size(-1))
key_states = key_states.reshape(-1, key_states.size(-2), key_states.size(-1))
value_states = value_states.reshape(-1, value_states.size(-2), value_states.size(-1))

I think we should do this in the _flash_attention_forward wrapper, this way we have 0 modeling changes, and all models will benefit easily from this!
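
(Sketch only, with a hypothetical helper name: the wrapper itself could flatten the (batch, seq_len, num_heads, head_dim) tensors into the varlen layout whenever cumulative sequence lengths are supplied, so the per-model attention classes would not need these reshape lines at all.)

from typing import Optional, Tuple

import torch


def _maybe_flatten_for_varlen(
    query: torch.Tensor,  # (batch, seq_len, num_heads, head_dim)
    key: torch.Tensor,
    value: torch.Tensor,
    cu_seqlens_q: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]:
    # Hypothetical helper for _flash_attention_forward: only the padding-free /
    # varlen path needs the (total_tokens, num_heads, head_dim) layout.
    batch_size = query.size(0)
    if cu_seqlens_q is not None:
        query = query.reshape(-1, query.size(-2), query.size(-1))
        key = key.reshape(-1, key.size(-2), key.size(-1))
        value = value.reshape(-1, value.size(-2), value.size(-1))
    return query, key, value, batch_size

The wrapper would then pass the flattened tensors to flash_attn_varlen_func and use batch_size to restore the output shape, keeping the modeling files unchanged.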


attn_output = _flash_attention_forward(
query_states,
key_states,
@@ -506,6 +515,11 @@
sliding_window=getattr(self, "sliding_window", None),
use_top_left_mask=self._flash_attn_uses_top_left_mask,
is_causal=self.is_causal,
cu_seqlens_q=cu_seq_lens_q,
cu_seqlens_k=cu_seq_lens_k,
max_seqlen_in_batch_q=max_length_q if isinstance(max_length_q, int) else max_length_q.item(),
max_seqlen_in_batch_k=max_length_k if isinstance(max_length_k, int) else max_length_k.item(),
batch_size=batch_size,
)

attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
@@ -644,6 +658,10 @@ def forward(
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46
cu_seq_lens_q: Optional[torch.LongTensor] = None,
cu_seq_lens_k: Optional[torch.LongTensor] = None,
max_length_q: int = 0,
max_length_k: int = 0,
**kwargs,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
@@ -682,6 +700,10 @@
use_cache=use_cache,
cache_position=cache_position,
position_embeddings=position_embeddings,
cu_seq_lens_q=cu_seq_lens_q,
cu_seq_lens_k=cu_seq_lens_k,
max_length_q=max_length_q,
max_length_k=max_length_k,
**kwargs,
)
hidden_states = residual + hidden_states
@@ -870,6 +892,10 @@ def forward(
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
cu_seq_lens_q: Optional[torch.LongTensor] = None,
cu_seq_lens_k: Optional[torch.LongTensor] = None,
max_length_q: int = 0,
max_length_k: int = 0,
) -> Union[Tuple, BaseModelOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
@@ -953,6 +979,10 @@
use_cache=use_cache,
cache_position=cache_position,
position_embeddings=position_embeddings,
cu_seq_lens_q=cu_seq_lens_q,
cu_seq_lens_k=cu_seq_lens_k,
max_length_q=max_length_q,
max_length_k=max_length_k,
Actually something we had planned 😅 cc @gante on generate unpadding the input!

@Cyrilvallez as well if you want to have fun IMO can be quite impactful!

)

hidden_states = layer_outputs[0]
@@ -1148,6 +1178,10 @@ def forward(
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
cu_seq_lens_q: Optional[torch.LongTensor] = None,
cu_seq_lens_k: Optional[torch.LongTensor] = None,
max_length_q: int = 0,
max_length_k: int = 0,
these are FlashAttention specific. IMO it would make sense to just add them as fa2_kwargs for example. We can use something like this:

class TextKwargs(TypedDict, total=False):
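
(Rough sketch of such a kwargs container: the field names come from this diff, while the class name and the Unpack-based **kwargs typing are assumptions in the spirit of the Fa2Kwargs / FlashAttentionKwarg commits, not the final API.)

from typing import Optional

import torch
from typing_extensions import TypedDict, Unpack


class FlashAttentionKwargs(TypedDict, total=False):
    # Varlen metadata that decoder layers thread through untouched to the attention call.
    cu_seq_lens_q: Optional[torch.LongTensor]
    cu_seq_lens_k: Optional[torch.LongTensor]
    max_length_q: int
    max_length_k: int


def layer_forward(hidden_states: torch.Tensor, **kwargs: Unpack[FlashAttentionKwargs]) -> torch.Tensor:
    # The layer never inspects these kwargs, so non-FA2 attention backends can simply ignore them.
    return hidden_states  # placeholder body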

num_logits_to_keep: int = 0,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
@@ -1198,6 +1232,10 @@
output_hidden_states=output_hidden_states,
return_dict=return_dict,
cache_position=cache_position,
cu_seq_lens_q=cu_seq_lens_q,
cu_seq_lens_k=cu_seq_lens_k,
max_length_q=max_length_q,
max_length_k=max_length_k,
)

hidden_states = outputs[0]
5 changes: 4 additions & 1 deletion src/transformers/tokenization_utils_base.py
@@ -31,6 +31,7 @@
from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union

import numpy as np
import torch
from packaging import version

from . import __version__
@@ -813,7 +814,9 @@ def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding":
# Otherwise it passes the casts down and casts the LongTensor containing the token idxs
# into a HalfTensor
if isinstance(device, str) or is_torch_device(device) or isinstance(device, int):
self.data = {k: v.to(device=device) for k, v in self.data.items() if v is not None}
self.data = {
k: v.to(device=device) for k, v in self.data.items() if v is not None and isinstance(v, torch.Tensor)
}
else:
logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.")
return self
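
(Illustrative usage, assuming a build that includes this change: a BatchEncoding can now carry non-tensor entries, e.g. plain ints for the new length metadata, and .to(device) skips them instead of raising.)

import torch
from transformers import BatchEncoding

enc = BatchEncoding({
    "input_ids": torch.tensor([[1, 2, 3]]),
    "max_length_q": 3,  # plain int; the old dict comprehension would fail on int.to(device)
})
enc = enc.to("cpu")  # only tensor values are moved, the int is left untouched
print(type(enc["input_ids"]), type(enc["max_length_q"]))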