 import os
-from yunchang import (
-    LongContextAttention,
-    set_seq_parallel_pg,
-    EXTRACT_FUNC_DICT
-)
+from yunchang import LongContextAttention, set_seq_parallel_pg, EXTRACT_FUNC_DICT
 import torch
 import torch.distributed as dist
+
 try:
     from flash_attn import flash_attn_func
 except ImportError:
@@ -14,32 +11,82 @@
 from test_utils import attention_ref
 import argparse
 
+
 def parse_args():
-    parser = argparse.ArgumentParser(description='Test hybrid attention with configurable sequence length')
-    parser.add_argument('--seqlen', type=int, default=1024,
-                        help='sequence length (default: 1024)')
-    parser.add_argument('--use_bwd', action='store_true',
-                        help='whether to test backward pass (default: False)')
-    parser.add_argument('--sp_ulysses_degree', type=int, default=None,
-                        help='sp_ulysses_degree (default: world_size)')
-    parser.add_argument('--ring_impl_type', type=str, default='basic',
-                        choices=['basic', 'zigzag', 'basic_flashinfer'],
-                        help='ring implementation type (default: basic)')
-    parser.add_argument('--causal', action='store_true',
-                        help='whether to use causal attention (default: False)')
-    parser.add_argument('--attn_impl', type=str, default='torch',
-                        choices=['torch', 'fa', 'fa3', 'flashinfer', 'sage_fp16', 'sage_fp16_triton', 'sage_fp8', 'sparse_sage'],
-                        help='attention implementation type (default: torch)')
-    parser.add_argument('--sparse_sage_l1', type=float, default=0.07,
-                        help='l1 for sparse sage attention (default: 0.07)')
-    parser.add_argument('--sparse_sage_pv_l1', type=float, default=0.08,
-                        help='pv_l1 for sparse sage attention (default: 0.08)')
-    parser.add_argument('--sparse_sage_tune_mode', action='store_true', default=False,
-                        help='enable tune mode for sparse sage attention (default: False)')
-    parser.add_argument('--sparse_sage_tune_path', type=str, default='./sparsesage_autotune.pt',
-                        help='path to the sparse sage autotune results (default: ./sparsesage_autotune.pt)')
+    parser = argparse.ArgumentParser(
+        description="Test hybrid attention with configurable sequence length"
+    )
+    parser.add_argument(
+        "--seqlen", type=int, default=1024, help="sequence length (default: 1024)"
+    )
+    parser.add_argument(
+        "--use_bwd",
+        action="store_true",
+        help="whether to test backward pass (default: False)",
+    )
+    parser.add_argument(
+        "--sp_ulysses_degree",
+        type=int,
+        default=None,
+        help="sp_ulysses_degree (default: world_size)",
+    )
+    parser.add_argument(
+        "--ring_impl_type",
+        type=str,
+        default="basic",
+        choices=["basic", "zigzag", "basic_flashinfer"],
+        help="ring implementation type (default: basic)",
+    )
+    parser.add_argument(
+        "--causal",
+        action="store_true",
+        help="whether to use causal attention (default: False)",
+    )
+    parser.add_argument(
+        "--attn_impl",
+        type=str,
+        default="torch",
+        choices=[
+            "torch",
+            "fa",
+            "fa3",
+            "flashinfer",
+            "sage_fp16",
+            "sage_fp8",
+            "sparse_sage",
+            "sage_fp8_sm90",
+            "sage_fp16_triton",
+            "sage_auto",
+        ],
+        help="attention implementation type (default: torch)",
+    )
+    parser.add_argument(
+        "--sparse_sage_l1",
+        type=float,
+        default=0.07,
+        help="l1 for sparse sage attention (default: 0.07)",
+    )
+    parser.add_argument(
+        "--sparse_sage_pv_l1",
+        type=float,
+        default=0.08,
+        help="pv_l1 for sparse sage attention (default: 0.08)",
+    )
+    parser.add_argument(
+        "--sparse_sage_tune_mode",
+        action="store_true",
+        default=False,
+        help="enable tune mode for sparse sage attention (default: False)",
+    )
+    parser.add_argument(
+        "--sparse_sage_tune_path",
+        type=str,
+        default="./sparsesage_autotune.pt",
+        help="path to the sparse sage autotune results (default: ./sparsesage_autotune.pt)",
+    )
     return parser.parse_args()
 
+
 def log(msg, a, rank0_only=False):
     world_size = dist.get_world_size()
     rank = dist.get_rank()
@@ -65,19 +112,20 @@ def log(msg, a, rank0_only=False):
             )
         dist.barrier()
 
+
 # test it with:
 # torchrun --nproc_per_node=4 test/test_hybrid_attn.py
 if __name__ == "__main__":
     args = parse_args()
-
+
     torch.random.manual_seed(0)
 
     dist.init_process_group("nccl")
 
     rank = dist.get_rank()
     world_size = dist.get_world_size()
 
-    # Inference mainly uses fp16; ROCM flash attention with bf16 precision is slightly larger, will be fixed soon
+    # Inference mainly uses fp16; ROCM flash attention with bf16 precision is slightly larger, will be fixed soon
     dtype = torch.bfloat16
     device = torch.device(f"cuda:{rank}")
 
@@ -88,7 +136,7 @@ def log(msg, a, rank0_only=False):
     dropout_p = 0
     causal = args.causal
     deterministic = False
-
+
     use_bwd = args.use_bwd
 
     assert seqlen % world_size == 0
@@ -98,13 +146,31 @@ def log(msg, a, rank0_only=False):
 
     # Prepare inputs
     q = torch.randn(
-        batch_size, seqlen, nheads, d, device=device, dtype=dtype, requires_grad=True if use_bwd else False
+        batch_size,
+        seqlen,
+        nheads,
+        d,
+        device=device,
+        dtype=dtype,
+        requires_grad=True if use_bwd else False,
     )
     k = torch.randn(
-        batch_size, seqlen, nheads, d, device=device, dtype=dtype, requires_grad=True if use_bwd else False
+        batch_size,
+        seqlen,
+        nheads,
+        d,
+        device=device,
+        dtype=dtype,
+        requires_grad=True if use_bwd else False,
     )
     v = torch.randn(
-        batch_size, seqlen, nheads, d, device=device, dtype=dtype, requires_grad=True if use_bwd else False
+        batch_size,
+        seqlen,
+        nheads,
+        d,
+        device=device,
+        dtype=dtype,
+        requires_grad=True if use_bwd else False,
     )
     dout = torch.randn(batch_size, seqlen, nheads, d, device=device, dtype=dtype)
 
@@ -116,7 +182,9 @@ def log(msg, a, rank0_only=False):
     # prepare process group for hybrid sequence parallelism
     use_ring_low_dim = True
 
-    sp_ulysses_degree = args.sp_ulysses_degree if args.sp_ulysses_degree is not None else world_size
+    sp_ulysses_degree = (
+        args.sp_ulysses_degree if args.sp_ulysses_degree is not None else world_size
+    )
     sp_ring_degree = world_size // sp_ulysses_degree
 
     print(
@@ -126,19 +194,29 @@ def log(msg, a, rank0_only=False):
     set_seq_parallel_pg(sp_ulysses_degree, sp_ring_degree, rank, world_size)
 
     # Use EXTRACT_FUNC_DICT to shard the tensors
-    local_q = EXTRACT_FUNC_DICT[ring_impl_type](
-        q, rank, world_size=world_size, rd=sp_ring_degree, ud=sp_ulysses_degree
-    ).detach().clone()
-
-
-    local_k = EXTRACT_FUNC_DICT[ring_impl_type](
-        k, rank, world_size=world_size, rd=sp_ring_degree, ud=sp_ulysses_degree
-    ).detach().clone()
+    local_q = (
+        EXTRACT_FUNC_DICT[ring_impl_type](
+            q, rank, world_size=world_size, rd=sp_ring_degree, ud=sp_ulysses_degree
+        )
+        .detach()
+        .clone()
+    )
 
+    local_k = (
+        EXTRACT_FUNC_DICT[ring_impl_type](
+            k, rank, world_size=world_size, rd=sp_ring_degree, ud=sp_ulysses_degree
+        )
+        .detach()
+        .clone()
+    )
 
-    local_v = EXTRACT_FUNC_DICT[ring_impl_type](
-        v, rank, world_size=world_size, rd=sp_ring_degree, ud=sp_ulysses_degree
-    ).detach().clone()
+    local_v = (
+        EXTRACT_FUNC_DICT[ring_impl_type](
+            v, rank, world_size=world_size, rd=sp_ring_degree, ud=sp_ulysses_degree
+        )
+        .detach()
+        .clone()
+    )
 
     if use_bwd:
         local_q.requires_grad = True
@@ -147,32 +225,46 @@ def log(msg, a, rank0_only=False):
 
     # Map argument to AttnType enum
     attn_impl_map = {
-        'torch': AttnType.TORCH,
-        'fa': AttnType.FA,
-        'fa3': AttnType.FA3,
-        'flashinfer': AttnType.FLASHINFER,
-        'sage_fp16': AttnType.SAGE_FP16,
-        'sage_fp16_triton': AttnType.SAGE_FP16_TRITON,
-        'sage_fp8': AttnType.SAGE_FP8,
-        'sparse_sage': AttnType.SPARSE_SAGE
+        "torch": AttnType.TORCH,
+        "fa": AttnType.FA,
+        "fa3": AttnType.FA3,
+        "flashinfer": AttnType.FLASHINFER,
+        "sage_fp16": AttnType.SAGE_FP16,
+        "sage_fp8": AttnType.SAGE_FP8,
+        "sage_fp8_sm90": AttnType.SAGE_FP8_SM90,
+        "sage_fp16_triton": AttnType.SAGE_FP16_TRITON,
+        "sage_auto": AttnType.SAGE_AUTO,
+        "sparse_sage": AttnType.SPARSE_SAGE,
     }
 
-    if args.attn_impl == 'sparse_sage':
+    if args.attn_impl == "sparse_sage":
         if use_bwd:
             raise RuntimeError("Sparse Sage attention does not support backward pass")
-        from spas_sage_attn.autotune import SparseAttentionMeansim, load_sparse_attention_state_dict
-        attn_processor = SparseAttentionMeansim(l1=args.sparse_sage_l1, pv_l1=args.sparse_sage_pv_l1, tune_pv=True)
+        from spas_sage_attn.autotune import (
+            SparseAttentionMeansim,
+            load_sparse_attention_state_dict,
+        )
+
+        attn_processor = SparseAttentionMeansim(
+            l1=args.sparse_sage_l1, pv_l1=args.sparse_sage_pv_l1, tune_pv=True
+        )
     else:
         attn_processor = None
 
-    usp_attn = LongContextAttention(ring_impl_type=ring_impl_type,
-                                    attn_type=attn_impl_map[args.attn_impl],
-                                    attn_processor=attn_processor)
+    usp_attn = LongContextAttention(
+        ring_impl_type=ring_impl_type,
+        attn_type=attn_impl_map[args.attn_impl],
+        attn_processor=attn_processor,
+    )
 
-    if args.attn_impl == 'sparse_sage':
+    if args.attn_impl == "sparse_sage":
         if not args.sparse_sage_tune_mode:
-            saved_state_dict = torch.load(args.sparse_sage_tune_path + f".rank{dist.get_rank()}")
-            load_sparse_attention_state_dict(usp_attn, saved_state_dict, multigpu=True, verbose=True)
+            saved_state_dict = torch.load(
+                args.sparse_sage_tune_path + f".rank{dist.get_rank()}"
+            )
+            load_sparse_attention_state_dict(
+                usp_attn, saved_state_dict, multigpu=True, verbose=True
+            )
         else:
             os.environ["sparse_sage_tune_mode"] = "1"
 
@@ -182,7 +274,7 @@ def log(msg, a, rank0_only=False):
         print("#" * 30)
 
     # common test parameters
-    window_size = (-1, -1)
+    window_size = (-1, -1)
     alibi_slopes, attn_bias = None, None
     dropout_mask = None
 
@@ -203,21 +295,25 @@ def log(msg, a, rank0_only=False):
     )
 
     # extract local dout
-    local_dout = EXTRACT_FUNC_DICT[ring_impl_type](
-        dout, rank, world_size=world_size, rd=sp_ring_degree, ud=sp_ulysses_degree
-    ).detach().clone()
+    local_dout = (
+        EXTRACT_FUNC_DICT[ring_impl_type](
+            dout, rank, world_size=world_size, rd=sp_ring_degree, ud=sp_ulysses_degree
+        )
+        .detach()
+        .clone()
+    )
 
-
-    max_memory = torch.cuda.max_memory_allocated(device) / (1024 * 1024)  # Convert to MB
+    max_memory = torch.cuda.max_memory_allocated(device) / (
+        1024 * 1024
+    )  # Convert to MB
     print(f"[Rank#{rank}] Maximum GPU memory used: {max_memory:.2f} MB")
     torch.cuda.reset_peak_memory_stats(device)  # Reset stats
 
-
     if rank == 0:
         print("#" * 30)
         print("# ds-ulysses backward:")
         print("#" * 30)
-
+
     # usp attn backward
     if use_bwd:
         local_out.backward(local_dout)
@@ -282,11 +378,19 @@ def log(msg, a, rank0_only=False):
     torch.testing.assert_close(local_out, local_out_ref, atol=1e-1, rtol=0)
     # torch.testing.assert_close(out_ref, out_pt_ref, atol=1e-2, rtol=0)
 
-    if args.attn_impl == 'sparse_sage':
-        from spas_sage_attn.autotune import SparseAttentionMeansim, extract_sparse_attention_state_dict
+    if args.attn_impl == "sparse_sage":
+        from spas_sage_attn.autotune import (
+            SparseAttentionMeansim,
+            extract_sparse_attention_state_dict,
+        )
+
         if args.sparse_sage_tune_mode:
-            saved_state_dict = extract_sparse_attention_state_dict(usp_attn, verbose=True)
-            torch.save(saved_state_dict, args.sparse_sage_tune_path + f".rank{dist.get_rank()}")
+            saved_state_dict = extract_sparse_attention_state_dict(
+                usp_attn, verbose=True
+            )
+            torch.save(
+                saved_state_dict, args.sparse_sage_tune_path + f".rank{dist.get_rank()}"
+            )
 
     if use_bwd:
         local_dq_ref = EXTRACT_FUNC_DICT[ring_impl_type](