huggingface · w3rew · Nov 4, 2024 · SunMarc · Nov 5, 2024 · zucchini-nlp
diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
@@ -813,7 +813,8 @@ def _quantize(self, tensor, axis):
         if is_optimum_quanto_available():
             from optimum.quanto import quantize_weight
 
-            qtensor = quantize_weight(tensor, self.qtype, axis, self.q_group_size)
+            scale, zeropoint = self.optimizer(tensor, self.qtype, axis, self.q_group_size)
+            qtensor = quantize_weight(tensor, self.qtype, axis, scale, zeropoint, self.q_group_size)
             return qtensor
         elif is_quanto_available():
             logger.warning_once(