
Commit 53661d0

Support Olive new uint8 quantization format (#1916)
This pull request updates the handling of the Olive quantization format in `quantized_model.py` to match the latest specification and improve code clarity. The main changes correct how in/out features are computed for Olive quantized layers, document the Olive format, and update the repacking logic for compatibility with ONNX Runtime (ORT).

**Olive quantization format support and documentation:**

* Updated the computation of `in_features` and `out_features` for Olive quantized layers to match the new format, which packs weights along the last dimension (`qweight` is now `(out_features, packed_in_features)`), and adjusted all relevant projections in the self-attention and MLP modules.
* Added a docstring to the `OliveModel` class explaining the Olive quantization format for weights, scales, and zero points.

**Repacking and compatibility improvements:**

* Implemented a new `repack` method for Olive quantized modules that reshapes tensors for ONNX Runtime (ORT) compatibility: `qweight` is reshaped into blocks, while `scales` and `qzeros` are flattened.
* Added placeholder `handle_qzeros` and `unpack` methods for the Olive format to make explicit that no offset handling or unpacking is required.
1 parent cec93a0 commit 53661d0
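The shape arithmetic the message describes is easy to sanity-check. A minimal sketch, assuming 4-bit values packed two per uint8 byte along the last dimension (the nibble order here is an illustrative assumption, not taken from the repo):

```python
import numpy as np

# Pack a toy 4-bit weight matrix along the last dim, per the commit message:
# qweight goes from (out_features, in_features) values to
# (out_features, in_features * bits // 8) uint8 bytes.
bits = 4
out_features, in_features = 2, 8
w = np.random.randint(0, 2**bits, size=(out_features, in_features), dtype=np.uint8)

packed = w[:, 0::2] | (w[:, 1::2] << bits)  # low nibble first (assumed order)
assert packed.shape == (out_features, in_features * bits // 8)

# Recovering in_features from the packed shape, as set_properties() now does:
assert packed.shape[1] * 8 // bits == in_features
```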

File tree

1 file changed: +138 −56 lines


src/python/py/models/quantized_model.py

Lines changed: 138 additions & 56 deletions
```diff
@@ -366,11 +366,18 @@ def __init__(self, quant_type, input_path, quant_attrs, q_size, kv_size, interme
     # model.layers.layer_id.self_attention.query_key_value.qweight
     # model.layers.layer_id.self_attn.qkv_proj.weight
     # model.layers.layer_id.self_attention.query_key_value.weight
-    q_dim = q_size // (32 // local_bits) if quant_type in {"awq", "quark"} else q_size
-    kv_dim = kv_size // (32 // local_bits) if quant_type in {"awq", "quark"} else kv_size
-    tensor_map["self_attn.q_proj.qweight"] = tensor[:, :q_dim]
-    tensor_map["self_attn.k_proj.qweight"] = tensor[:, q_dim : q_dim + kv_dim]
-    tensor_map["self_attn.v_proj.qweight"] = tensor[:, q_dim + kv_dim :]
+    if quant_type == "olive":
+        # Olive: (out_features, in_features), split on dim=0
+        tensor_map["self_attn.q_proj.qweight"] = tensor[:q_size, :]
+        tensor_map["self_attn.k_proj.qweight"] = tensor[q_size : q_size + kv_size, :]
+        tensor_map["self_attn.v_proj.qweight"] = tensor[q_size + kv_size :, :]
+    else:
+        # AWQ/GPTQ/Quark: (in_features, out_features), split on dim=1
+        q_dim = q_size // (32 // local_bits) if quant_type in {"awq", "quark"} else q_size
+        kv_dim = kv_size // (32 // local_bits) if quant_type in {"awq", "quark"} else kv_size
+        tensor_map["self_attn.q_proj.qweight"] = tensor[:, :q_dim]
+        tensor_map["self_attn.k_proj.qweight"] = tensor[:, q_dim : q_dim + kv_dim]
+        tensor_map["self_attn.v_proj.qweight"] = tensor[:, q_dim + kv_dim :]
 elif bool(
     re.match(
         r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.(scales|weight_scale)$",
```
```diff
@@ -381,9 +388,16 @@ def __init__(self, quant_type, input_path, quant_attrs, q_size, kv_size, interme
     # model.layers.layer_id.self_attention.query_key_value.scales
     # model.layers.layer_id.self_attn.qkv_proj.weight_scale
     # model.layers.layer_id.self_attention.query_key_value.weight_scale
-    tensor_map["self_attn.q_proj.scales"] = tensor[:, :q_size]
-    tensor_map["self_attn.k_proj.scales"] = tensor[:, q_size : q_size + kv_size]
-    tensor_map["self_attn.v_proj.scales"] = tensor[:, q_size + kv_size :]
+    if quant_type == "olive":
+        # Olive: (out_features, num_groups), split on dim=0
+        tensor_map["self_attn.q_proj.scales"] = tensor[:q_size, :]
+        tensor_map["self_attn.k_proj.scales"] = tensor[q_size : q_size + kv_size, :]
+        tensor_map["self_attn.v_proj.scales"] = tensor[q_size + kv_size :, :]
+    else:
+        # AWQ/GPTQ/Quark: split on dim=1
+        tensor_map["self_attn.q_proj.scales"] = tensor[:, :q_size]
+        tensor_map["self_attn.k_proj.scales"] = tensor[:, q_size : q_size + kv_size]
+        tensor_map["self_attn.v_proj.scales"] = tensor[:, q_size + kv_size :]
 elif bool(
     re.match(
         r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.(qzeros|weight_zero_point)$",
```
```diff
@@ -394,19 +408,28 @@ def __init__(self, quant_type, input_path, quant_attrs, q_size, kv_size, interme
     # model.layers.layer_id.self_attention.query_key_value.qzeros
     # model.layers.layer_id.self_attn.qkv_proj.weight_zero_point
     # model.layers.layer_id.self_attention.query_key_value.weight_zero_point
-    q_dim = (
-        q_size // (32 // local_bits)
-        if quant_type in {"awq", "gptq", "olive", "quark"}
-        else q_size
-    )
-    kv_dim = (
-        kv_size // (32 // local_bits)
-        if quant_type in {"awq", "gptq", "olive", "quark"}
-        else kv_size
-    )
-    tensor_map["self_attn.q_proj.qzeros"] = tensor[:, :q_dim]
-    tensor_map["self_attn.k_proj.qzeros"] = tensor[:, q_dim : q_dim + kv_dim]
-    tensor_map["self_attn.v_proj.qzeros"] = tensor[:, q_dim + kv_dim :]
+    if quant_type == "olive":
+        # Olive: (out_features, packed_num_groups) uint8, split on dim=0
+        q_dim = q_size // (8 // local_bits)
+        kv_dim = kv_size // (8 // local_bits)
+        tensor_map["self_attn.q_proj.qzeros"] = tensor[:q_dim, :]
+        tensor_map["self_attn.k_proj.qzeros"] = tensor[q_dim : q_dim + kv_dim, :]
+        tensor_map["self_attn.v_proj.qzeros"] = tensor[q_dim + kv_dim :, :]
+    else:
+        # AWQ/GPTQ/Quark: int32 packing, split on dim=1
+        q_dim = (
+            q_size // (32 // local_bits)
+            if quant_type in {"awq", "gptq", "quark"}
+            else q_size
+        )
+        kv_dim = (
+            kv_size // (32 // local_bits)
+            if quant_type in {"awq", "gptq", "quark"}
+            else kv_size
+        )
+        tensor_map["self_attn.q_proj.qzeros"] = tensor[:, :q_dim]
+        tensor_map["self_attn.k_proj.qzeros"] = tensor[:, q_dim : q_dim + kv_dim]
+        tensor_map["self_attn.v_proj.qzeros"] = tensor[:, q_dim + kv_dim :]
 elif bool(
     re.match(
         r"^model.layers\.\d+\.(self_attn.qkv_proj|self_attention.query_key_value)\.g_idx$", name
```
```diff
@@ -434,13 +457,19 @@ def __init__(self, quant_type, input_path, quant_attrs, q_size, kv_size, interme
     # model.layers.layer_id.mlp.dense_h_to_4h.qweight
     # model.layers.layer_id.mlp.gate_up_proj.weight
     # model.layers.layer_id.mlp.dense_h_to_4h.weight
-    intermediate_dim = (
-        intermediate_size // (32 // local_bits)
-        if quant_type in {"awq", "quark"}
-        else intermediate_size
-    )
-    tensor_map["mlp.gate_proj.qweight"] = tensor[:, :intermediate_dim]
-    tensor_map["mlp.up_proj.qweight"] = tensor[:, intermediate_dim:]
+    if quant_type == "olive":
+        # Olive: (out_features, in_features), split on dim=0
+        tensor_map["mlp.gate_proj.qweight"] = tensor[:intermediate_size, :]
+        tensor_map["mlp.up_proj.qweight"] = tensor[intermediate_size:, :]
+    else:
+        # AWQ/GPTQ/Quark: (in_features, out_features), split on dim=1
+        intermediate_dim = (
+            intermediate_size // (32 // local_bits)
+            if quant_type in {"awq", "quark"}
+            else intermediate_size
+        )
+        tensor_map["mlp.gate_proj.qweight"] = tensor[:, :intermediate_dim]
+        tensor_map["mlp.up_proj.qweight"] = tensor[:, intermediate_dim:]
 elif bool(
     re.match(
         r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h|gate_proj)\.(scales|weight_scale)$",
```
```diff
@@ -451,8 +480,14 @@ def __init__(self, quant_type, input_path, quant_attrs, q_size, kv_size, interme
     # model.layers.layer_id.mlp.dense_h_to_4h.scales
     # model.layers.layer_id.mlp.gate_up_proj.weight_scale
     # model.layers.layer_id.mlp.dense_h_to_4h.weight_scale
-    tensor_map["mlp.gate_proj.scales"] = tensor[:, :intermediate_size]
-    tensor_map["mlp.up_proj.scales"] = tensor[:, intermediate_size:]
+    if quant_type == "olive":
+        # Olive: (out_features, num_groups), split on dim=0
+        tensor_map["mlp.gate_proj.scales"] = tensor[:intermediate_size, :]
+        tensor_map["mlp.up_proj.scales"] = tensor[intermediate_size:, :]
+    else:
+        # AWQ/GPTQ/Quark: split on dim=1
+        tensor_map["mlp.gate_proj.scales"] = tensor[:, :intermediate_size]
+        tensor_map["mlp.up_proj.scales"] = tensor[:, intermediate_size:]
 elif bool(
     re.match(
         r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h|gate_proj)\.(qzeros|weight_zero_point)$",
```
```diff
@@ -463,13 +498,20 @@ def __init__(self, quant_type, input_path, quant_attrs, q_size, kv_size, interme
     # model.layers.layer_id.mlp.dense_h_to_4h.qzeros
     # model.layers.layer_id.mlp.gate_up_proj.weight_zero_point
     # model.layers.layer_id.mlp.dense_h_to_4h.weight_zero_point
-    intermediate_dim = (
-        intermediate_size // (32 // local_bits)
-        if quant_type in {"awq", "gptq", "quark", "olive"}
-        else intermediate_size
-    )
-    tensor_map["mlp.gate_proj.qzeros"] = tensor[:, :intermediate_dim]
-    tensor_map["mlp.up_proj.qzeros"] = tensor[:, intermediate_dim:]
+    if quant_type == "olive":
+        # Olive: (out_features, packed_num_groups) uint8, split on dim=0
+        intermediate_dim = intermediate_size // (8 // local_bits)
+        tensor_map["mlp.gate_proj.qzeros"] = tensor[:intermediate_dim, :]
+        tensor_map["mlp.up_proj.qzeros"] = tensor[intermediate_dim:, :]
+    else:
+        # AWQ/GPTQ/Quark: int32 packing, split on dim=1
+        intermediate_dim = (
+            intermediate_size // (32 // local_bits)
+            if quant_type in {"awq", "gptq", "quark"}
+            else intermediate_size
+        )
+        tensor_map["mlp.gate_proj.qzeros"] = tensor[:, :intermediate_dim]
+        tensor_map["mlp.up_proj.qzeros"] = tensor[:, intermediate_dim:]
 elif bool(re.match(r"^model.layers\.\d+\.mlp.(gate_up_proj|dense_h_to_4h)\.g_idx$", name)):
     # model.layers.layer_id.mlp.gate_up_proj.g_idx
     # model.layers.layer_id.mlp.dense_h_to_4h.g_idx
```
```diff
@@ -554,10 +596,10 @@ def set_properties(self):
     self.lm_head.out_features = self.lm_head.qweight.shape[1]
     self.lm_head.in_features = self.lm_head.g_idx.shape[0]
 elif self.quant_type == "olive":
-    self.lm_head.out_features = self.lm_head.qweight.shape[1]
-    # expects in_features to be divisible by the packing factor (32 // bits)
-    # not a new assumption since no code here accounts for padded packed weights
-    self.lm_head.in_features = self.lm_head.qweight.shape[0] * 32 // self.lm_head.bits
+    # Olive format: qweight is (out_features, packed_in_features) uint8
+    # packed_in_features = in_features * bits / 8
+    self.lm_head.out_features = self.lm_head.qweight.shape[0]
+    self.lm_head.in_features = self.lm_head.qweight.shape[1] * 8 // self.lm_head.bits
 else:
     raise NotImplementedError(f"The {self.quant_type} quantization method is not recognized.")
 for module in self.layers:
```
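A worked example of the corrected arithmetic, using hypothetical dimensions (a 4-bit `lm_head` with `in_features=3072`, `out_features=32064`):

```python
# packed_in_features = in_features * bits // 8
bits = 4
qweight_shape = (32064, 3072 * bits // 8)   # (32064, 1536) uint8

out_features = qweight_shape[0]             # 32064
in_features = qweight_shape[1] * 8 // bits  # 3072
assert (out_features, in_features) == (32064, 3072)
```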
```diff
@@ -654,32 +696,31 @@ def set_properties(self):
     module.mlp.down_proj.in_features = module.mlp.down_proj.g_idx.shape[0]

 elif self.quant_type == "olive":
-    # Set in_features and out_features
-    module.self_attn.q_proj.out_features = module.self_attn.q_proj.qweight.shape[1]
+    module.self_attn.q_proj.out_features = module.self_attn.q_proj.qweight.shape[0]
     module.self_attn.q_proj.in_features = (
-        module.self_attn.q_proj.qweight.shape[0] * 32 // module.self_attn.q_proj.bits
+        module.self_attn.q_proj.qweight.shape[1] * 8 // module.self_attn.q_proj.bits
     )
-    module.self_attn.k_proj.out_features = module.self_attn.k_proj.qweight.shape[1]
+    module.self_attn.k_proj.out_features = module.self_attn.k_proj.qweight.shape[0]
     module.self_attn.k_proj.in_features = (
-        module.self_attn.k_proj.qweight.shape[0] * 32 // module.self_attn.k_proj.bits
+        module.self_attn.k_proj.qweight.shape[1] * 8 // module.self_attn.k_proj.bits
     )
-    module.self_attn.v_proj.out_features = module.self_attn.v_proj.qweight.shape[1]
+    module.self_attn.v_proj.out_features = module.self_attn.v_proj.qweight.shape[0]
     module.self_attn.v_proj.in_features = (
-        module.self_attn.v_proj.qweight.shape[0] * 32 // module.self_attn.v_proj.bits
+        module.self_attn.v_proj.qweight.shape[1] * 8 // module.self_attn.v_proj.bits
     )
-    module.self_attn.o_proj.out_features = module.self_attn.o_proj.qweight.shape[1]
+    module.self_attn.o_proj.out_features = module.self_attn.o_proj.qweight.shape[0]
     module.self_attn.o_proj.in_features = (
-        module.self_attn.o_proj.qweight.shape[0] * 32 // module.self_attn.o_proj.bits
+        module.self_attn.o_proj.qweight.shape[1] * 8 // module.self_attn.o_proj.bits
     )
-    module.mlp.gate_proj.out_features = module.mlp.gate_proj.qweight.shape[1]
+    module.mlp.gate_proj.out_features = module.mlp.gate_proj.qweight.shape[0]
     module.mlp.gate_proj.in_features = (
-        module.mlp.gate_proj.qweight.shape[0] * 32 // module.mlp.gate_proj.bits
+        module.mlp.gate_proj.qweight.shape[1] * 8 // module.mlp.gate_proj.bits
     )
-    module.mlp.up_proj.out_features = module.mlp.up_proj.qweight.shape[1]
-    module.mlp.up_proj.in_features = module.mlp.up_proj.qweight.shape[0] * 32 // module.mlp.up_proj.bits
-    module.mlp.down_proj.out_features = module.mlp.down_proj.qweight.shape[1]
+    module.mlp.up_proj.out_features = module.mlp.up_proj.qweight.shape[0]
+    module.mlp.up_proj.in_features = module.mlp.up_proj.qweight.shape[1] * 8 // module.mlp.up_proj.bits
+    module.mlp.down_proj.out_features = module.mlp.down_proj.qweight.shape[0]
     module.mlp.down_proj.in_features = (
-        module.mlp.down_proj.qweight.shape[0] * 32 // module.mlp.down_proj.bits
+        module.mlp.down_proj.qweight.shape[1] * 8 // module.mlp.down_proj.bits
     )

 else:
```
```diff
@@ -1138,6 +1179,13 @@ def reverse_reorder_tensor(self, tensor, bits):


 class OliveModel(GPTQModel):
+    """
+    Olive quantization format:
+    - qweight: (out_features, packed_in_features) uint8, packed along last dim
+    - scales: (out_features, num_groups) float
+    - qzeros: (out_features, packed_num_groups) uint8, packed along last dim
+    """
+
     def _load_quant_config(self, quant_attrs):
         super()._load_quant_config(quant_attrs)
         self.overrides = quant_attrs["config"]["overrides"] or {}
```
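For readers unfamiliar with the layout, a minimal dequantization sketch following the docstring above; the nibble order and zero-point packing details are assumptions for illustration, not the repo's kernels:

```python
import numpy as np

bits, group_size = 4, 32
out_features, in_features = 4, 64

# Tensors shaped per the OliveModel docstring (zero-filled placeholders).
qweight = np.zeros((out_features, in_features * bits // 8), dtype=np.uint8)
scales = np.ones((out_features, in_features // group_size), dtype=np.float32)
qzeros = np.full((out_features, (in_features // group_size) * bits // 8), 0x88, dtype=np.uint8)

# Unpack two 4-bit values per byte (low nibble first, by assumption).
w = np.empty((out_features, in_features), dtype=np.uint8)
w[:, 0::2] = qweight & 0x0F
w[:, 1::2] = qweight >> 4

z = np.empty((out_features, in_features // group_size), dtype=np.uint8)
z[:, 0::2] = qzeros & 0x0F
z[:, 1::2] = qzeros >> 4

# Dequantize per group: weight = (q - zero) * scale.
groups = np.repeat(np.arange(in_features // group_size), group_size)
deq = (w.astype(np.float32) - z[:, groups]) * scales[:, groups]
```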
```diff
@@ -1150,6 +1198,40 @@ def get_layer_group_size(self, layer_name):
         name = ".".join(layer_name.split(".")[:-1])
         return self.overrides.get(name, {}).get("group_size", self.global_group_size)

+    def handle_qzeros(self, module):
+        """Olive uses unsigned quantization, no offset needed."""
+        pass
+
+    def unpack(self, module):
+        """Skip unpack for Olive format."""
+        pass
+
+    def repack(self, module):
+        """
+        Olive format:
+        - qweight: (out_features, packed_in_features) uint8
+        - scales: (out_features, num_groups) float
+        - qzeros: (out_features, packed_num_groups) uint8
+
+        ORT format:
+        - qweight: (out_features, k_blocks, blob_size) uint8
+        - scales: (out_features * num_groups,) float, flattened
+        - qzeros: (out_features * packed_num_groups,) uint8, flattened
+        """
+        kpack = 8 // module.bits
+        k_blocks = module.in_features // module.group_size
+        blob_size = module.group_size // kpack
+
+        # qweight: (out_features, packed_in_features) -> (out_features, k_blocks, blob_size)
+        module.qweight = module.qweight.reshape(module.out_features, k_blocks, blob_size).contiguous()
+
+        # scales: (out_features, num_groups) -> flatten to 1D
+        module.scales = module.scales.reshape(-1).contiguous()
+
+        # qzeros: (out_features, packed_num_groups) -> flatten to 1D
+        if module.qzeros is not None and module.qzeros.numel() > 0:
+            module.qzeros = module.qzeros.reshape(-1).contiguous()
+

 class QuantModel:
     @staticmethod
```
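The repack is a pure relayout: `packed_in_features == k_blocks * blob_size`, so the reshape only regroups bytes within each output row. A standalone shape check under the same assumptions (`module` here is a stand-in object, not the repo's class):

```python
import torch
from types import SimpleNamespace

bits, group_size, out_features, in_features = 4, 32, 8, 128
kpack = 8 // bits
k_blocks = in_features // group_size  # 4
blob_size = group_size // kpack       # 16

module = SimpleNamespace(
    qweight=torch.zeros(out_features, in_features * bits // 8, dtype=torch.uint8),
    scales=torch.ones(out_features, k_blocks),
    qzeros=torch.zeros(out_features, k_blocks * bits // 8, dtype=torch.uint8),
)

# Mirror the repack above: the packed row width factors exactly into blocks.
assert module.qweight.shape[1] == k_blocks * blob_size
qweight = module.qweight.reshape(out_features, k_blocks, blob_size)
scales = module.scales.reshape(-1)   # (out_features * k_blocks,)
qzeros = module.qzeros.reshape(-1)
print(qweight.shape, scales.shape, qzeros.shape)
```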
