
Commit 6e8b003

bump version to 0.6.2 (#140)
1 parent 270ef55 commit 6e8b003

File tree

- README.md (+15 -2)
- pyproject.toml (+1 -1)
- yunchang/__init__.py (+1 -1)
- yunchang/globals.py (+10 -1)

4 files changed: +27 -5 lines changed


README.md

+15 -2

@@ -122,7 +122,12 @@ local_out = usp_attn(
 
 ### 3. Test
 
-- Causal Attention Test
+If you have not installed yunchang, add the project root directory to the PYTHONPATH:
+```
+export PYTHONPATH=$PWD:$PYTHONPATH
+```
+
+- FlashAttn/Torch Test
 ```bash
 torchrun --nproc_per_node=4 ./test/test_hybrid_attn.py --sp_ulysses_degree 2 --ring_impl_type "zigzag" --causal --attn_impl fa --use_bwd
 torchrun --nproc_per_node=4 ./test/test_hybrid_attn.py --sp_ulysses_degree 2 --ring_impl_type "zigzag" --causal --attn_impl torch
@@ -134,13 +139,21 @@ torchrun --nproc_per_node 8 test/test_hybrid_qkvpacked_attn.py
 You need to install [SpargeAttn](https://github.com/thu-ml/SpargeAttn) and [SageAttention](https://github.com/thu-ml/SageAttention) from source.
 
 ```bash
-torchrun --nproc_per_node=4 ./test/test_hybrid_attn.py --sp_ulysses_degree 4 --attn_impl sage_fp8
+torchrun --nproc_per_node=4 ./test/test_hybrid_attn.py --sp_ulysses_degree 2 --attn_impl sage_fp8
 ```
 
 ```bash
 torchrun --nproc_per_node=4 ./test/test_hybrid_attn.py --sp_ulysses_degree 4 --attn_impl sparse_sage --sparse_sage_tune_mode
 ```
 
+- FlashInfer Test (fwd only)
+
+Install FlashInfer from [here](https://docs.flashinfer.ai/installation.html#quick-start).
+
+```bash
+torchrun --nproc_per_node=4 --master_port=1234 ./test/test_hybrid_attn.py --sp_ulysses_degree 2 --ring_impl_type 'basic_flashinfer' --attn_impl flashinfer
+```
+
 ### 4. Verified in Megatron-LM
 The loss curves for Data Parallel (DP) and Unified Sequence Parallel (ulysses=2+ring=2) are closely aligned, as illustrated in the figure. This alignment confirms the accuracy of the unified sequence parallel.

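The PYTHONPATH note added in this hunk assumes the tests import `yunchang` directly from the source tree. As a quick sanity check before launching `torchrun`, the following sketch (not part of the commit; it assumes you are in the repository root and have either exported PYTHONPATH as shown above or pip-installed the package) confirms the import resolves:

```python
# Sketch: confirm yunchang is importable before running the distributed tests.
# Assumes PYTHONPATH includes the repo root, or yunchang is installed.
import yunchang

print(yunchang.__file__)     # should point into this repository when using PYTHONPATH
print(yunchang.__version__)  # "0.6.2" after this commit
```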
pyproject.toml

+1 -1

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "yunchang"
-version = "0.6.1"
+version = "0.6.2"
 authors = [
   { name="Jiarui Fang", email="[email protected]" },
 ]

yunchang/__init__.py

+1 -1

@@ -4,5 +4,5 @@
 from .globals import set_seq_parallel_pg
 from .comm.extract_local import stripe_extract_local, basic_extract_local, zigzag_extract_local, EXTRACT_FUNC_DICT
 
-__version__ = "0.6.1"
+__version__ = "0.6.2"
 

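The version string now lives in two places, pyproject.toml and yunchang/__init__.py, and this commit bumps both. A small consistency check (a sketch, not part of the commit; it assumes yunchang has been installed, e.g. with `pip install -e .`) can catch the two drifting apart on future bumps:

```python
# Sketch: verify the module-level __version__ matches the installed package
# metadata generated from pyproject.toml. Assumes yunchang is installed.
from importlib.metadata import version

import yunchang

assert yunchang.__version__ == version("yunchang"), (
    f"__init__.py reports {yunchang.__version__}, "
    f"package metadata reports {version('yunchang')}"
)
print(f"yunchang {yunchang.__version__} is consistent")
```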
yunchang/globals.py

+10 -1

@@ -1,4 +1,5 @@
 import torch
+import os
 
 
 class Singleton:
@@ -98,6 +99,13 @@ def set_seq_parallel_pg(
 try:
     from flashinfer.prefill import single_prefill_with_kv_cache
     HAS_FLASHINFER = True
+    def get_cuda_arch():
+        major, minor = torch.cuda.get_device_capability()
+        return f"{major}.{minor}"
+
+    cuda_arch = get_cuda_arch()
+    os.environ['TORCH_CUDA_ARCH_LIST'] = cuda_arch
+    print(f"Set TORCH_CUDA_ARCH_LIST to {cuda_arch}")
 except ImportError:
     HAS_FLASHINFER = False
 
@@ -111,4 +119,5 @@ def set_seq_parallel_pg(
     import spas_sage_attn
     HAS_SPARSE_SAGE_ATTENTION = True
 except ImportError:
-    HAS_SPARSE_SAGE_ATTENTION = False
+    HAS_SPARSE_SAGE_ATTENTION = False
+

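The new code in the FlashInfer branch reads the local GPU's compute capability and exports it via TORCH_CUDA_ARCH_LIST, presumably so that any subsequent JIT compilation builds kernels only for that architecture. The same pattern in isolation (a minimal sketch, not the library's API; it assumes a CUDA-capable device is visible) looks like this:

```python
# Sketch of the arch-detection pattern used above: query the device's compute
# capability and narrow TORCH_CUDA_ARCH_LIST to it, so later JIT compilation
# targets only the local GPU. Assumes PyTorch with CUDA support.
import os

import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    os.environ["TORCH_CUDA_ARCH_LIST"] = f"{major}.{minor}"
    print(f"TORCH_CUDA_ARCH_LIST={os.environ['TORCH_CUDA_ARCH_LIST']}")
else:
    print("No CUDA device visible; TORCH_CUDA_ARCH_LIST left unset")
```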