linkedin · Tcc0403 · Nov 3, 2025 · Nov 3, 2025 · Nov 3, 2025 · Nov 3, 2025
diff --git a/benchmark/scripts/benchmark_fused_linear_cross_entropy.py b/benchmark/scripts/benchmark_fused_linear_cross_entropy.py
@@ -8,6 +8,7 @@
 from utils import parse_benchmark_script_args
 from utils import run_benchmarks
 
+from liger_kernel.ops.helion.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyHelion
 from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss
 from liger_kernel.utils import infer_device
 
@@ -45,6 +46,20 @@ def forward(self, x, y):
         return self.ce_loss(self.lin.weight, x, y)
 
 
+class LigerLMHeadCEHelion(torch.nn.Module):  # benchmark module: bias-free LM head + Helion fused linear CE loss
+    def __init__(
+        self, H: int, V: int, dtype: torch.dtype, ignore_index: int = -100, bwd_impl="chunk", grad_in_forward=False
+    ):
+        super().__init__()
+        self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=False, dtype=dtype)  # H -> V, no bias
+        self.ce_loss = LigerFusedLinearCrossEntropyHelion(  # reduction is fixed to "mean"; other options pass through
+            ignore_index=ignore_index, reduction="mean", bwd_impl=bwd_impl, grad_in_forward=grad_in_forward
+        )
+
+    def forward(self, x, y):  # NOTE(review): passes (x, weight, y); the preceding module's forward passes weight first
+        return self.ce_loss(x, self.lin.weight, y)  # self.lin is never called; only its weight is handed to the loss
+
+
 #############################################################################
 # Test the memory consumption of the linear fused cross entropy loss
 #############################################################################
@@ -64,6 +79,10 @@ def bench_memory_fused_linear_cross_entropy(
         lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype).to(device)
     elif provider == "liger-fp32-accum":
         lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype, accum_dtype=torch.float32).to(device)
+    elif provider == "liger-helion":
+        lm_head_ce = LigerLMHeadCEHelion(H=H, V=V, dtype=dtype, bwd_impl="chunk", grad_in_forward=False).to(device)
+    elif provider == "liger-helion-grad-in-fwd":
+        lm_head_ce = LigerLMHeadCEHelion(H=H, V=V, dtype=dtype, bwd_impl="chunk", grad_in_forward=True).to(device)
     else:
         lm_head_ce = TorchLMHeadCE(H=H, V=V, dtype=dtype).to(device)
 
@@ -106,6 +125,10 @@ def bench_speed_fused_linear_cross_entropy(
         lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype).to(device)
     elif provider == "liger-fp32-accum":
         lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype, accum_dtype=torch.float32).to(device)
+    elif provider == "liger-helion":
+        lm_head_ce = LigerLMHeadCEHelion(H=H, V=V, dtype=dtype, bwd_impl="chunk", grad_in_forward=False).to(device)
+    elif provider == "liger-helion-grad-in-fwd":
+        lm_head_ce = LigerLMHeadCEHelion(H=H, V=V, dtype=dtype, bwd_impl="chunk", grad_in_forward=True).to(device)
     else:
         lm_head_ce = TorchLMHeadCE(H=H, V=V, dtype=dtype).to(device)
 
@@ -163,7 +186,7 @@ def full():
         "x_name": "BT",
         "x_label": "B x T",
         "x_values": [2**i for i in range(12, 16)],
-        "kernel_providers": ["liger", "liger-fp32-accum", "huggingface"],
+        "kernel_providers": ["liger", "liger-fp32-accum", "huggingface", "liger-helion", "liger-helion-grad-in-fwd"],
         "extra_benchmark_configs": [{"H": 4096, "V": 128256, "mode": "forward", "dtype": torch.bfloat16}],
         "overwrite": args.overwrite,
     }

diff --git a/src/liger_kernel/ops/helion/configs/fused_linear_cross_entropy_fwd_h100_llama_bf16.json b/src/liger_kernel/ops/helion/configs/fused_linear_cross_entropy_fwd_h100_llama_bf16.json
@@ -0,0 +1,45 @@
+{
+  "block_sizes": [
+    64,
+    64,
+    256
+  ],
+  "range_unroll_factors": [
+    0,
+    1,
+    1
+  ],
+  "range_num_stages": [
+    0,
+    3,
+    4
+  ],
+  "range_multi_buffers": [
+    null,
+    false,
+    null
+  ],
+  "range_flattens": [
+    null,
+    true,
+    true
+  ],
+  "load_eviction_policies": [
+    "last",
+    "last",
+    "",
+    ""
+  ],
+  "num_warps": 4,
+  "num_stages": 8,
+  "indexing": [
+    "tensor_descriptor",
+    "pointer",
+    "tensor_descriptor",
+    "tensor_descriptor",
+    "pointer",
+    "pointer"
+  ],
+  "pid_type": "flat",
+  "range_warp_specializes": []
+}
diff --git a/src/liger_kernel/ops/helion/configs/fused_linear_cross_entropy_fwd_h100_llama_fp32.json b/src/liger_kernel/ops/helion/configs/fused_linear_cross_entropy_fwd_h100_llama_fp32.json
@@ -0,0 +1,45 @@
+{
+  "block_sizes": [
+    64,
+    32,
+    256
+  ],
+  "range_unroll_factors": [
+    0,
+    1,
+    1
+  ],
+  "range_num_stages": [
+    0,
+    3,
+    4
+  ],
+  "range_multi_buffers": [
+    null,
+    true,
+    null
+  ],
+  "range_flattens": [
+    null,
+    null,
+    true
+  ],
+  "load_eviction_policies": [
+    "last",
+    "last",
+    "",
+    ""
+  ],
+  "num_warps": 4,
+  "num_stages": 6,
+  "indexing": [
+    "tensor_descriptor",
+    "tensor_descriptor",
+    "pointer",
+    "tensor_descriptor",
+    "pointer",
+    "tensor_descriptor"
+  ],
+  "pid_type": "flat",
+  "range_warp_specializes": []
+}
diff --git a/...el/ops/helion/configs/fused_linear_cross_entropy_grad_logits_compute_h100_llama_fp32.json b/...el/ops/helion/configs/fused_linear_cross_entropy_grad_logits_compute_h100_llama_fp32.json
@@ -0,0 +1,50 @@
+{
+  "block_sizes": [
+    64,
+    32,
+    256
+  ],
+  "loop_orders": [
+    [
+      0,
+      1
+    ]
+  ],
+  "l2_groupings": [
+    32
+  ],
+  "range_unroll_factors": [
+    0,
+    1
+  ],
+  "range_num_stages": [
+    4,
+    2
+  ],
+  "range_multi_buffers": [
+    true,
+    null
+  ],
+  "range_flattens": [
+    true,
+    true
+  ],
+  "load_eviction_policies": [
+    "last",
+    "last",
+    "first",
+    "first"
+  ],
+  "num_warps": 8,
+  "num_stages": 1,
+  "indexing": [
+    "tensor_descriptor",
+    "tensor_descriptor",
+    "tensor_descriptor",
+    "tensor_descriptor",
+    "pointer",
+    "tensor_descriptor"
+  ],
+  "pid_type": "persistent_interleaved",
+  "range_warp_specializes": []
+}
diff --git a/...helion/configs/fused_linear_cross_entropy_nll_and_grad_logit_compute_h100_llama_fp32.json b/...helion/configs/fused_linear_cross_entropy_nll_and_grad_logit_compute_h100_llama_fp32.json
@@ -0,0 +1,49 @@
+{
+  "block_sizes": [
+    64,
+    32,
+    512
+  ],
+  "range_unroll_factors": [
+    3,
+    0,
+    0
+  ],
+  "range_num_stages": [
+    4,
+    0,
+    3
+  ],
+  "range_multi_buffers": [
+    true,
+    true,
+    false
+  ],
+  "range_flattens": [
+    true,
+    true,
+    false
+  ],
+  "load_eviction_policies": [
+    "last",
+    "first",
+    "",
+    "last",
+    "",
+    "first"
+  ],
+  "num_warps": 8,
+  "num_stages": 7,
+  "indexing": [
+    "pointer",
+    "tensor_descriptor",
+    "pointer",
+    "pointer",
+    "pointer",
+    "pointer",
+    "pointer",
+    "pointer"
+  ],
+  "pid_type": "persistent_blocked",
+  "range_warp_specializes": []
+}