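Cleanup: remove unused torch.compile quantize fallbacks, commented-out autotune configs, and leftover debug comments from the Triton/XPU backends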

Devjiu · Devjiu · commit fb48d7620bdf · 2025-05-22T16:52:07.000Z
diff --git a/bitsandbytes/backends/triton/ops.py b/bitsandbytes/backends/triton/ops.py
@@ -13,39 +13,6 @@
     triton_available = False
 
 
-# torch compile:
-# 1.53s call     tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-256-nested=T-bf16-xpu]
-#
-# triton:
-# 1.07s call     tests/test_functional.py::Test8BitBlockwiseQuantizeFunctional::test_dynamic_blockwise_quantization[signed=F-256-nested=T-bf16-xpu]
-@torch.compile
-def quantize_blockwise_torch(A, code, blocksize):
-    n = A.numel()
-    blocks = -(n // -blocksize)
-
-    absmax = torch.empty((blocks,), device=A.device, dtype=A.dtype)
-    quantized_out = torch.empty_like(A.flatten(), dtype=torch.uint8)
-
-    rem = n % blocksize
-    has_rem = rem > 0
-    blocks = n // blocksize + has_rem
-    A_reshaped = A.reshape(n)
-    A_com = A_reshaped[: n - rem]
-    A_com_reshaped = A_com.reshape(n // blocksize, blocksize)
-    absmax[: blocks - has_rem] = torch.abs(A_com_reshaped).max(dim=-1)[0]
-    scaled_A = torch.clamp(A_com_reshaped / absmax[: blocks - has_rem].view(-1, 1), -1, 1)
-    scaled_A = scaled_A.reshape(-1)
-    if has_rem:
-        absmax[-1] = torch.abs(A_reshaped[n - rem :]).max()
-        scaled_A_rem = torch.clamp((A_reshaped[n - rem :] / absmax[-1]), -1, 1)
-        scaled_A = torch.cat([scaled_A, scaled_A_rem], dim=0)
-
-    diff = torch.abs(scaled_A.unsqueeze(-1) - code.to(scaled_A.device))
-    quantized_out = torch.argmin(diff, dim=-1).to(torch.uint8).to(scaled_A.device).reshape(A.shape)
-    quantized_out = quantized_out.reshape(A.shape)
-    return quantized_out, absmax
-
-
 def quantize_blockwise(A: torch.Tensor, code: torch.Tensor, blocksize: int) -> tuple[torch.Tensor, torch.Tensor]:
     torch._check_is_size(blocksize)
     # torch._check(A.dtype == torch.float32, lambda: f"A must be float32 on xpu, got {A.dtype}")
@@ -99,33 +66,6 @@ def dequantize_blockwise_inplace(
     )
 
 
-# torch compile
-# 1.01s call     tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[64-fp4-fp32-xpu]
-#
-# triton
-# 0.80s call     tests/test_functional.py::TestQuantize4BitFunctional::test_4bit_quant[64-fp4-fp32-xpu]
-@torch.compile
-def quantize_4bit_torch(
-    A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
-) -> tuple[torch.Tensor, torch.Tensor]:
-    # Divide into blocks and normalize
-    blocks = A.reshape(-1, blocksize)
-    absmax = blocks.abs().max(dim=1).values.float()
-    scaled = blocks / absmax.unsqueeze(-1)
-    if quant_type == "fp4":
-        quantized = torch.argmin(torch.abs(scaled.view(-1, 1) - _FP4_QUANT_TABLE), dim=-1, keepdim=True).to(
-            torch.uint8
-        )
-    else:
-        quantized = torch.argmin(torch.abs(scaled.view(-1, 1) - _NF4_QUANT_TABLE), dim=-1, keepdim=True).to(
-            torch.uint8
-        )
-    packed = quantized[::2] << 4 | quantized[1::2]
-    if quant_storage != torch.uint8:
-        packed = packed.squeeze().view(quant_storage).unsqueeze(1)
-    return packed, absmax.float()
-
-
 def quantize_4bit(
     A: torch.Tensor, blocksize: int, quant_type: str, quant_storage: torch.dtype
 ) -> tuple[torch.Tensor, torch.Tensor]:
diff --git a/bitsandbytes/backends/triton/triton_kernels.py b/bitsandbytes/backends/triton/triton_kernels.py
@@ -7,41 +7,24 @@
 from .utils import _FP4_QUANT_TABLE, _NF4_QUANT_TABLE
 
 
-# @triton.autotune(
-#     configs=[
-#         triton.Config({'SPLIT_SIZE': 64}),
-#         triton.Config({'SPLIT_SIZE': 128}),
-#         triton.Config({'SPLIT_SIZE': 256}),
-#         triton.Config({'SPLIT_SIZE': 512}),
-#         triton.Config({'SPLIT_SIZE': 1024}),
-#         triton.Config({'SPLIT_SIZE': 2048}),
-#         triton.Config({'SPLIT_SIZE': 4096}),
-#         triton.Config({'SPLIT_SIZE': 8192}),
-#         triton.Config({'SPLIT_SIZE': 16384}),
-#     ],
-#     key=['SPLIT_SIZE'],
-# )
 @triton.jit
 def dequant_8bit_kernel(
     a_ptr,
     c_ptr,
     quant_ptr,
     absmax_ptr,
-    # bias_ptr,
     num_paired_elements,
     QUANT_BLOCK: tl.constexpr,
     SPLIT_SIZE: tl.constexpr,
 ):
-    pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0.
+    pid = tl.program_id(axis=0)
     block_start = pid * SPLIT_SIZE
     offsets = block_start + tl.arange(0, SPLIT_SIZE)
     mask = offsets < num_paired_elements
 
     a = tl.load(a_ptr + offsets, mask)
     a = a.to(tl.uint8, bitcast=True)
 
-    # bias = tl.load(bias_ptr)
-
     # apply conversion
     scaled_int8 = tl.load(quant_ptr + a, mask)
 
@@ -52,7 +35,6 @@ def dequant_8bit_kernel(
     absmax = tl.load(absmax_ptr + abs_offsets, mask_blocked)
     # apply scales
     out_dq = scaled_int8 * absmax
-    # out_dq = out_dq + bias
 
     offs = block_start + tl.arange(0, SPLIT_SIZE)
     mask = offs < num_paired_elements
@@ -79,19 +61,7 @@ def dequant_int8_blockwise(
 
 @triton.autotune(
     configs=[
-        # triton.Config({'SPLIT_NUM_BLOCKS': 1, 'grf_mode': 'large'}, num_stages=2, num_warps=32),
-        # triton.Config({'SPLIT_NUM_BLOCKS': 1, 'grf_mode': 'auto'}, num_stages=2, num_warps=32),
-        # triton.Config({'SPLIT_NUM_BLOCKS': 1, 'grf_mode': 'large'}, num_stages=4, num_warps=32),
-        #
         triton.Config({"SPLIT_NUM_BLOCKS": 1, "grf_mode": "auto"}, num_stages=4, num_warps=32),
-        #
-        # triton.Config({"SPLIT_NUM_BLOCKS": 2, "grf_mode": "large"}, num_stages=2, num_warps=32),
-        # # triton.Config({'SPLIT_NUM_BLOCKS': 2, 'grf_mode': 'large'}, num_stages=4, num_warps=32),
-        # triton.Config({"SPLIT_NUM_BLOCKS": 2, "grf_mode": "auto"}, num_stages=2, num_warps=32),
-        # triton.Config({"SPLIT_NUM_BLOCKS": 2, "grf_mode": "auto"}, num_stages=4, num_warps=32),
-        # triton.Config({"SPLIT_NUM_BLOCKS": 4, "grf_mode": "large"}, num_stages=2, num_warps=32),
-        # triton.Config({"SPLIT_NUM_BLOCKS": 4, "grf_mode": "large"}, num_stages=4, num_warps=32),
-        # triton.Config({'SPLIT_NUM_BLOCKS': 8, 'grf_mode': 'large'}, num_stages=2, num_warps=32),
     ],
     key=["BLOCK_SIZE"],
 )
@@ -124,9 +94,6 @@ def quantize_blockwise_kernel(
     A_normalized = A_reshaped / absmax[:, None]
     A_normalized = tl.clamp(A_normalized, -1.0, 1.0)
 
-    # This can be fruitful, but compiler should preload it
-    # code = tl.load(code_ptr + tl.arange(0, CODE_SIZE))
-
     lower_pivot = tl.zeros((SPLIT_NUM_BLOCKS, BLOCK_SIZE), dtype=tl.int32)
     upper_pivot = tl.full((SPLIT_NUM_BLOCKS, BLOCK_SIZE), CODE_SIZE - 1, dtype=tl.int32)
 
@@ -176,24 +143,6 @@ def unite_2_int4(x, y):
     return (x & 0xF) | (y << 4)
 
 
-# @triton.autotune(
-#     configs=[
-#         # triton.Config({'SPLIT_NUM_BLOCKS': 1, 'grf_mode': 'large'}, num_stages=2, num_warps=32),
-#         # triton.Config({'SPLIT_NUM_BLOCKS': 1, 'grf_mode': 'auto'}, num_stages=2, num_warps=32),
-#         # triton.Config({'SPLIT_NUM_BLOCKS': 1, 'grf_mode': 'large'}, num_stages=4, num_warps=32),
-#         #
-#         triton.Config({"SPLIT_NUM_BLOCKS": 1, "grf_mode": "auto"}, num_stages=4, num_warps=32),
-#         #
-#         # triton.Config({"SPLIT_NUM_BLOCKS": 2, "grf_mode": "large"}, num_stages=2, num_warps=32),
-#         # # triton.Config({'SPLIT_NUM_BLOCKS': 2, 'grf_mode': 'large'}, num_stages=4, num_warps=32),
-#         # triton.Config({"SPLIT_NUM_BLOCKS": 2, "grf_mode": "auto"}, num_stages=2, num_warps=32),
-#         # triton.Config({"SPLIT_NUM_BLOCKS": 2, "grf_mode": "auto"}, num_stages=4, num_warps=32),
-#         # triton.Config({"SPLIT_NUM_BLOCKS": 4, "grf_mode": "large"}, num_stages=2, num_warps=32),
-#         # triton.Config({"SPLIT_NUM_BLOCKS": 4, "grf_mode": "large"}, num_stages=4, num_warps=32),
-#         # triton.Config({'SPLIT_NUM_BLOCKS': 8, 'grf_mode': 'large'}, num_stages=2, num_warps=32),
-#     ],
-#     key=["BLOCK_SIZE"],
-# )
 @triton.jit
 def quantize_4bit_blockwise_kernel(
     A_ptr,
@@ -261,11 +210,6 @@ def quantize_4bit_blockwise_triton(A, blocksize, code, blocks, absmax, quantized
 
     split_num_blocks = 1
     grid = (triton.cdiv(blocks, split_num_blocks),)
-    # grid = (1, )
-    # grid = lambda META: (triton.cdiv(blocks, META["SPLIT_NUM_BLOCKS"]),)
-    # print(" blocksize, split_num_blocks: ", blocksize, split_num_blocks)
-    # print(" blocksize, split_num_blocks: ", blocksize, split_num_blocks*2)
-    # print("A shape: ", A.shape, " numel: ", n, " blocks: ", blocks)
     quantize_4bit_blockwise_kernel[grid](
         A_ptr=A,
         code_ptr=code,
@@ -280,20 +224,6 @@ def quantize_4bit_blockwise_triton(A, blocksize, code, blocks, absmax, quantized
     return quantized_out, absmax
 
 
-# @triton.autotune(
-#     configs=[
-#         # triton.Config({'SPLIT_SIZE': 64}),
-#         # triton.Config({'SPLIT_SIZE': 128}),
-#         # triton.Config({'SPLIT_SIZE': 256}),
-#         triton.Config({'SPLIT_SIZE': 512}),
-#         # triton.Config({'SPLIT_SIZE': 1024}),
-#         # triton.Config({'SPLIT_SIZE': 2048}),
-#         # triton.Config({'SPLIT_SIZE': 4096}),
-#         # triton.Config({'SPLIT_SIZE': 8192}),
-#         # triton.Config({'SPLIT_SIZE': 16384}),
-#     ],
-#     key=['SPLIT_SIZE'],
-# )
 @triton.jit
 def dequant_4bit_kernel(
     a_ptr, c_ptr, quant_ptr, absmax_ptr, num_paired_elements, QUANT_BLOCK: tl.constexpr, SPLIT_SIZE: tl.constexpr
diff --git a/bitsandbytes/backends/xpu/ops.py b/bitsandbytes/backends/xpu/ops.py
@@ -12,13 +12,11 @@
 
 if triton_available:
     register_kernel("bitsandbytes::quantize_blockwise", "xpu")(triton_ops.quantize_blockwise)
-    # register_kernel("bitsandbytes::quantize_blockwise", "xpu")(quantize_blockwise_torch)
     register_kernel("bitsandbytes::dequantize_blockwise.out", "xpu")(triton_ops.dequantize_blockwise_inplace)
     register_kernel("bitsandbytes::dequantize_blockwise", "xpu")(triton_ops.dequantize_blockwise)
     register_kernel("bitsandbytes::quantize_4bit", "xpu")(triton_ops.quantize_4bit)
     register_kernel("bitsandbytes::dequantize_4bit.out", "xpu")(triton_ops.dequantize_4bit_inplace)
     register_kernel("bitsandbytes::dequantize_4bit", "xpu")(triton_ops.dequantize_4bit)
     register_kernel("bitsandbytes::gemv_4bit", "xpu")(triton_ops.gemv_4bit)
-    # register_kernel("bitsandbytes::gemv_4bit.out", "xpu")(triton_ops.gemv_4bit_inpalce)
 else:
     warnings.warn("XPU available, but trtion package is missing.")