
Commit 2c4f548

FlashAttentionImpl -> AttnType (#117)
1 parent: c7460f5

12 files changed (+49, -49 lines)

README.md (+4, -4)
@@ -46,7 +46,7 @@ As shown in the figure below, there are three usage methods based on the flash_a
 
 2. For A100, L40, hardware that supports FA v2, ring_flash_attn uses FA v2.
 
-3. For hardware such as NPUs that does not support FA, use torch to implement attention computation. In this case, there is no need to install `flash_attn`, and you should apply `LongContextAttention(ring_impl_type="basic", attn_type=FlashAttentionImpl.TORCH)`.
+3. For hardware such as NPUs that does not support FA, use torch to implement attention computation. In this case, there is no need to install `flash_attn`, and you should apply `LongContextAttention(ring_impl_type="basic", attn_type=AttnType.TORCH)`.
 
 Option 1: pip install
 
@@ -85,7 +85,7 @@ from yunchang import (
     set_seq_parallel_pg,
     EXTRACT_FUNC_DICT
 )
-from yunchang.kernels import FlashAttentionImpl
+from yunchang.kernels import AttnType
 
 sp_ulysses_degree = 2
 sp_ring_degree = 4
@@ -94,10 +94,10 @@ sp_ring_degree = 4
 set_seq_parallel_pg(sp_ulysses_degree, sp_ring_degree, rank, world_size)
 
 # attn_type could be FA, FA3, TORCH.
-longctx_attn = LongContextAttention(ring_impl_type="zigzag", attn_type=FlashAttentionImpl.FA)
+longctx_attn = LongContextAttention(ring_impl_type="zigzag", attn_type=AttnType.FA)
 
 # if you use NPUs, where no flash_attn is supported, you can use the following code.
-# LongContextAttention(ring_impl_type="zigzag", attn_type=FlashAttentionImpl.TORCH)
+# LongContextAttention(ring_impl_type="zigzag", attn_type=AttnType.TORCH)
 
 # extract a local shard for the global Q, K, V.
 local_q = EXTRACT_FUNC_DICT["zigzag"](
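
For downstream code, the migration this commit asks for is mechanical: only the imported name and the enum references change. A minimal sketch of the post-rename call pattern, mirroring the README snippet above; it assumes `LongContextAttention` is exported from the top-level `yunchang` package and that process groups have already been set up via `set_seq_parallel_pg`:

```python
# Sketch only: mirrors the README diff above, not an additional API.
from yunchang import LongContextAttention, set_seq_parallel_pg
from yunchang.kernels import AttnType

# set_seq_parallel_pg(sp_ulysses_degree, sp_ring_degree, rank, world_size) must
# have been called first, exactly as in the README example.

# FA    -> flash_attn v2 kernels (A100, L40, ...)
# FA3   -> flash_attn v3 kernels
# TORCH -> pure-PyTorch attention for hardware without flash_attn (e.g. NPUs)
attn_fa = LongContextAttention(ring_impl_type="zigzag", attn_type=AttnType.FA)
attn_npu = LongContextAttention(ring_impl_type="basic", attn_type=AttnType.TORCH)
```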

benchmark/benchmark_longctx.py (+2, -2)
@@ -160,8 +160,8 @@ def benchmark(num_iter=10, forward_only=True, log=True, profile=False):
         sp_ulysses_degree, sp_ring_degree, rank, world_size, args.use_ulysses_lowdim
     )
 
-    from yunchang.kernels import FlashAttentionImpl
-    attn_type = FlashAttentionImpl.from_string(args.attn_type)
+    from yunchang.kernels import AttnType
+    attn_type = AttnType.from_string(args.attn_type)
     if args.use_ulysses:
         longctx_attn = UlyssesAttention(attn_type=attn_type)
     else:
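
The benchmark now turns its CLI string into the enum with `AttnType.from_string`. A rough sketch of that pattern, assuming the flag is named `--attn_type` (only `args.attn_type` is visible in this hunk) and that `from_string` matches the lowercase member values:

```python
# Sketch only: the flag name and choices below are assumptions based on args.attn_type.
import argparse
from yunchang.kernels import AttnType

parser = argparse.ArgumentParser()
parser.add_argument("--attn_type", type=str, default="fa", choices=["fa", "fa3", "torch"])
args = parser.parse_args()

attn_type = AttnType.from_string(args.attn_type)  # e.g. "fa" -> AttnType.FA
```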

test/test_hybrid_attn.py (+5, -5)
@@ -6,7 +6,7 @@
 import torch
 import torch.distributed as dist
 from flash_attn import flash_attn_func
-from yunchang.kernels import FlashAttentionImpl
+from yunchang.kernels import AttnType
 from test_utils import attention_ref
 import argparse
 
@@ -133,11 +133,11 @@ def log(msg, a, rank0_only=False):
     local_k.requires_grad = True
     local_v.requires_grad = True
 
-    # Map argument to FlashAttentionImpl enum
+    # Map argument to AttnType enum
     attn_impl_map = {
-        'torch': FlashAttentionImpl.TORCH,
-        'fa': FlashAttentionImpl.FA,
-        'fa3': FlashAttentionImpl.FA3
+        'torch': AttnType.TORCH,
+        'fa': AttnType.FA,
+        'fa3': AttnType.FA3
     }
 
     usp_attn = LongContextAttention(ring_impl_type=ring_impl_type,

test/test_hybrid_qkvpacked_attn.py (+2, -2)
@@ -6,7 +6,7 @@
     EXTRACT_FUNC_DICT,
     RING_IMPL_QKVPACKED_DICT
 )
-from yunchang.kernels import FlashAttentionImpl
+from yunchang.kernels import AttnType
 
 
 def log(msg, a, rank0_only=False):
@@ -66,7 +66,7 @@ def test(ring_impl_type="zigzag"):
 
     set_seq_parallel_pg(sp_ulysses_degree, sp_ring_degree, rank, world_size)
 
-    longctx_attn = LongContextAttentionQKVPacked(ring_impl_type=ring_impl_type, attn_type=FlashAttentionImpl.TORCH)
+    longctx_attn = LongContextAttentionQKVPacked(ring_impl_type=ring_impl_type, attn_type=AttnType.TORCH)
 
     ## prepare input and output tensors
 
test/test_ulysses_attn.py (+2, -2)
@@ -3,7 +3,7 @@
 from yunchang import UlyssesAttention
 
 from flash_attn import flash_attn_func
-from yunchang.kernels import FlashAttentionImpl
+from yunchang.kernels import AttnType
 
 def log(msg, a, rank0_only=False):
     world_size = dist.get_world_size()
@@ -79,7 +79,7 @@ def log(msg, a, rank0_only=False):
     # prcess_group == sequence_process_group
     sp_pg = None #dist.new_group(ranks=[i for i in range(world_size)])
 
-    dist_attn = UlyssesAttention(sp_pg, attn_type=FlashAttentionImpl.FA)
+    dist_attn = UlyssesAttention(sp_pg, attn_type=AttnType.FA)
 
     if rank == 0:
         print("#" * 30)

yunchang/hybrid/attn_layer.py (+3, -3)
@@ -8,7 +8,7 @@
 import torch.distributed as dist
 from .utils import RING_IMPL_DICT, RING_IMPL_QKVPACKED_DICT
 from yunchang.globals import PROCESS_GROUP
-from yunchang.kernels import FlashAttentionImpl
+from yunchang.kernels import AttnType
 
 
 class LongContextAttention(torch.nn.Module):
@@ -29,7 +29,7 @@ def __init__(
         ring_impl_type: str = "basic",
         use_pack_qkv: bool = False,
         use_sync: bool = False,
-        attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+        attn_type: AttnType = AttnType.FA,
     ) -> None:
 
         super(LongContextAttention, self).__init__()
@@ -157,7 +157,7 @@ def __init__(
         gather_idx: int = 1,
         ring_impl_type: str = "basic",
         use_sync: bool = False,
-        attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+        attn_type: AttnType = AttnType.FA,
     ) -> None:
 
         super(LongContextAttentionQKVPacked, self).__init__()

yunchang/kernels/__init__.py (+6, -6)
@@ -14,7 +14,7 @@
 if HAS_FLASH_ATTN:
     from flash_attn import flash_attn_func
 
-class FlashAttentionImpl(Enum):
+class AttnType(Enum):
     FA = "fa"
     FA3 = "fa3"
     TORCH = "torch"
@@ -26,8 +26,8 @@ def from_string(cls, s: str):
             return member
         raise ValueError(f"'{s}' is not a valid {cls.__name__}")
 
-def select_flash_attn_impl(impl_type: FlashAttentionImpl, stage : str = "fwd-bwd"):
-    if impl_type == FlashAttentionImpl.FA:
+def select_flash_attn_impl(impl_type: AttnType, stage : str = "fwd-bwd"):
+    if impl_type == AttnType.FA:
         if stage == "fwd-only":
             return flash_attn_forward
         elif stage == "bwd-only":
@@ -38,7 +38,7 @@ def select_flash_attn_impl(impl_type: FlashAttentionImpl, stage : str = "fwd-bwd
         else:
             raise ValueError(f"Unknown stage: {stage}")
 
-    elif impl_type == FlashAttentionImpl.FA3:
+    elif impl_type == AttnType.FA3:
         if stage == "fwd-only":
             return flash_attn3_func_forward
         elif stage == "bwd-only":
@@ -64,7 +64,7 @@ def fn(q,
         else:
             raise ValueError(f"Unknown stage: {stage}")
 
-    elif impl_type == FlashAttentionImpl.TORCH:
+    elif impl_type == AttnType.TORCH:
         if stage == "fwd-only":
             return pytorch_attn_forward
         elif stage == "bwd-only":
@@ -77,4 +77,4 @@ def fn(q,
     else:
         raise ValueError(f"Unknown flash attention implementation: {impl_type}")
 
-__all__ = ["flash_attn_forward", "flash_attn_backward", "flash_attn3_func_forward", "flash_attn3_func_forward", "FlashAttentionImpl"]
+__all__ = ["flash_attn_forward", "flash_attn_backward", "flash_attn3_func_forward", "flash_attn3_func_forward", "AttnType"]
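
The hunks above spell out the dispatch contract: `select_flash_attn_impl` maps an `AttnType` member plus a `stage` string ("fwd-only", "bwd-only", or "fwd-bwd") to a concrete kernel function, and unknown stages or implementations raise `ValueError`. A short sketch of how callers use it; the exact signatures of the returned kernels are not part of this diff:

```python
# Sketch only: exercises the dispatch shown in yunchang/kernels/__init__.py.
from yunchang.kernels import AttnType, select_flash_attn_impl

attn_type = AttnType.from_string("torch")                       # -> AttnType.TORCH
fwd_only = select_flash_attn_impl(attn_type, stage="fwd-only")  # -> pytorch_attn_forward
fwd_bwd = select_flash_attn_impl(attn_type, stage="fwd-bwd")    # full forward/backward path (default stage)

# An unrecognized stage string raises ValueError(f"Unknown stage: {stage}").
```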

yunchang/ring/ring_flash_attn.py (+6, -6)
@@ -2,7 +2,7 @@
 import torch.distributed as dist
 # from flash_attn.flash_attn_interface import _flash_attn_forward, _flash_attn_backward
 from .utils import RingComm, update_out_and_lse
-from yunchang.kernels import select_flash_attn_impl, FlashAttentionImpl
+from yunchang.kernels import select_flash_attn_impl, AttnType
 
 def ring_flash_attn_forward(
     process_group,
@@ -16,7 +16,7 @@ def ring_flash_attn_forward(
     softcap=0.0,
     alibi_slopes=None,
     deterministic=False,
-    attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+    attn_type: AttnType = AttnType.FA,
 ):
     comm = RingComm(process_group)
 
@@ -72,7 +72,7 @@ def ring_flash_attn_backward(
     softcap=0.0,
     alibi_slopes=None,
     deterministic=False,
-    attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+    attn_type: AttnType = AttnType.FA,
 ):
     kv_comm = RingComm(process_group)
     d_kv_comm = RingComm(process_group)
@@ -227,7 +227,7 @@ def ring_flash_attn_qkvpacked_func(
     deterministic=False,
     return_attn_probs=False,
     group=None,
-    attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+    attn_type: AttnType = AttnType.FA,
 ):
     return RingFlashAttnFunc.apply(
         qkv[:, :, 0],
@@ -258,7 +258,7 @@ def ring_flash_attn_kvpacked_func(
     deterministic=False,
     return_attn_probs=False,
     group=None,
-    attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+    attn_type: AttnType = AttnType.FA,
 ):
     return RingFlashAttnFunc.apply(
         q,
@@ -290,7 +290,7 @@ def ring_flash_attn_func(
     deterministic=False,
     return_attn_probs=False,
     group=None,
-    attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+    attn_type: AttnType = AttnType.FA,
 ):
     return RingFlashAttnFunc.apply(
         q,

yunchang/ring/ring_pytorch_attn.py (+2, -2)
@@ -5,7 +5,7 @@
 import torch
 import torch.nn.functional as F
 from typing import Any, Optional, Tuple
-from yunchang.kernels import select_flash_attn_impl, FlashAttentionImpl
+from yunchang.kernels import select_flash_attn_impl, AttnType
 from .utils import RingComm, update_out_and_lse
 from yunchang.kernels.attention import pytorch_attn_forward, pytorch_attn_backward
 
@@ -22,7 +22,7 @@ def ring_pytorch_attn_func(
     deterministic=False,
     return_attn_probs=False,
     group=None,
-    attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+    attn_type: AttnType = AttnType.FA,
 ):
     return RingAttentionFunc.apply(group, q, k, v, softmax_scale, causal)
 
yunchang/ring/stripe_flash_attn.py (+7, -7)
@@ -1,5 +1,5 @@
 import torch
-from yunchang.kernels import select_flash_attn_impl, FlashAttentionImpl
+from yunchang.kernels import select_flash_attn_impl, AttnType
 from .utils import RingComm, update_out_and_lse
 
 
@@ -15,7 +15,7 @@ def stripe_flash_attn_forward(
     softcap=0.0,
     alibi_slopes=None,
     deterministic=False,
-    attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+    attn_type: AttnType = AttnType.FA,
 ):
     assert (
         causal
@@ -91,7 +91,7 @@ def stripe_flash_attn_backward(
     softcap=0.0,
     alibi_slopes=None,
     deterministic=False,
-    attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+    attn_type: AttnType = AttnType.FA,
 ):
     assert (
         causal
@@ -211,7 +211,7 @@ def forward(
         deterministic,
         return_softmax,
         group,
-        attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+        attn_type: AttnType = AttnType.FA,
     ):
         if softmax_scale is None:
             softmax_scale = q.shape[-1] ** (-0.5)
@@ -280,7 +280,7 @@ def stripe_flash_attn_qkvpacked_func(
     deterministic=False,
     return_attn_probs=False,
     group=None,
-    attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+    attn_type: AttnType = AttnType.FA,
 ):
     return StripeFlashAttnFunc.apply(
         qkv[:, :, 0],
@@ -311,7 +311,7 @@ def stripe_flash_attn_kvpacked_func(
     deterministic=False,
     return_attn_probs=False,
     group=None,
-    attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+    attn_type: AttnType = AttnType.FA,
 ):
     return StripeFlashAttnFunc.apply(
         q,
@@ -343,7 +343,7 @@ def stripe_flash_attn_func(
     deterministic=False,
     return_attn_probs=False,
     group=None,
-    attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+    attn_type: AttnType = AttnType.FA,
 ):
     return StripeFlashAttnFunc.apply(
         q,

yunchang/ring/zigzag_ring_flash_attn.py (+6, -6)
@@ -1,6 +1,6 @@
 import torch
 from .utils import RingComm, update_out_and_lse
-from yunchang.kernels import FlashAttentionImpl, select_flash_attn_impl
+from yunchang.kernels import AttnType, select_flash_attn_impl
 
 def zigzag_ring_flash_attn_forward(
     process_group,
@@ -14,7 +14,7 @@ def zigzag_ring_flash_attn_forward(
     softcap=0.0,
     alibi_slopes=None,
     deterministic=False,
-    attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+    attn_type: AttnType = AttnType.FA,
 ):
     assert causal == True, "zigzag ring is meaningless for causal=False"
     comm = RingComm(process_group)
@@ -91,7 +91,7 @@ def zigzag_ring_flash_attn_backward(
     softcap=0.0,
     alibi_slopes=None,
     deterministic=False,
-    attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+    attn_type: AttnType = AttnType.FA,
 ):
     assert causal == True, "zigzag ring is meaningless for causal=False"
     kv_comm = RingComm(process_group)
@@ -268,7 +268,7 @@ def zigzag_ring_flash_attn_qkvpacked_func(
     deterministic=False,
     return_attn_probs=False,
     group=None,
-    attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+    attn_type: AttnType = AttnType.FA,
 ):
     return ZigZagRingFlashAttnFunc.apply(
         qkv[:, :, 0],
@@ -299,7 +299,7 @@ def zigzag_ring_flash_attn_kvpacked_func(
     deterministic=False,
     return_attn_probs=False,
     group=None,
-    attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+    attn_type: AttnType = AttnType.FA,
 ):
     return ZigZagRingFlashAttnFunc.apply(
         q,
@@ -331,7 +331,7 @@ def zigzag_ring_flash_attn_func(
     deterministic=False,
     return_attn_probs=False,
     group=None,
-    attn_type: FlashAttentionImpl = FlashAttentionImpl.FA,
+    attn_type: AttnType = AttnType.FA,
 ):
     return ZigZagRingFlashAttnFunc.apply(
         q,

yunchang/ulysses/attn_layer.py (+4, -4)
@@ -7,7 +7,7 @@
 
 from typing import Any
 from torch import Tensor
-from yunchang.kernels import FlashAttentionImpl, select_flash_attn_impl
+from yunchang.kernels import AttnType, select_flash_attn_impl
 import torch.distributed as dist
 from yunchang.comm.all_to_all import SeqAllToAll4D
 
@@ -21,7 +21,7 @@ class UlyssesAttention(torch.nn.Module):
         scatter_idx (int): scatter_idx for all2all comm
         gather_idx (int): gather_idx for all2all comm
         use_sync (bool): whether to synchronize after all-to-all. This flag can save cuda memory but will slow down the speed.
-        attn_type (FlashAttentionImpl): attention type enum
+        attn_type (AttnType): attention type enum
     """
 
     def __init__(
@@ -30,7 +30,7 @@ def __init__(
         scatter_idx: int = 2,
         gather_idx: int = 1,
         use_sync: bool = False,
-        attn_type : FlashAttentionImpl = FlashAttentionImpl.FA,
+        attn_type : AttnType = AttnType.FA,
     ) -> None:
 
         super(UlyssesAttention, self).__init__()
@@ -43,7 +43,7 @@ def __init__(
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         gpu_name = torch.cuda.get_device_name(device)
         if "Turing" in gpu_name or "Tesla" in gpu_name or "T4" in gpu_name:
-            self.attn_type = FlashAttentionImpl.TORCH
+            self.attn_type = AttnType.TORCH
         self.attn_fn = select_flash_attn_impl(self.attn_type, stage="fwd-bwd")
 
     def forward(
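
The last hunk above keeps the existing hardware guard: on Turing, Tesla, or T4 GPUs the constructor overrides the requested backend with `AttnType.TORCH` before resolving `self.attn_fn`. A minimal construction sketch mirroring test/test_ulysses_attn.py, assuming the script is launched under torchrun so a default process group can be initialized:

```python
# Sketch only: mirrors the UlyssesAttention usage in test/test_ulysses_attn.py.
import torch.distributed as dist
from yunchang import UlyssesAttention
from yunchang.kernels import AttnType

dist.init_process_group("nccl")  # assumption: launched via torchrun
sp_pg = None  # None -> default process group, as in the test

# On Turing/Tesla/T4 GPUs the constructor silently falls back to AttnType.TORCH.
dist_attn = UlyssesAttention(sp_pg, attn_type=AttnType.FA)
```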
