pytorch · metascroy · Mar 24, 2025 · Mar 24, 2025 · Mar 24, 2025 · Mar 24, 2025
diff --git a/.ci/scripts/test_llama_torchao_lowbit.sh b/.ci/scripts/test_llama_torchao_lowbit.sh
@@ -78,7 +78,6 @@ ${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \
     -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
     --group_size ${QLINEAR_GROUP_SIZE} \
     -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
-    --disable_dynamic_shape \
     -d fp32
 
 # Test run

@@ -382,7 +382,7 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de
 
 ## Running with low-bit kernels
 
-We now give instructions for quantizating and running your model with low-bit kernels.  These are still experimental, and require you do development on an Arm-based Mac.  Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results.  Currently dynamic shapes must be disabled when exporting a model with these kernels.
+We now give instructions for quantizing and running your model with low-bit kernels.  These are still experimental, and require you to do development on an Arm-based Mac.  Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results.
 
 First export your model for lowbit quantization (step 2 above):
 
@@ -408,7 +408,6 @@ python -m examples.models.llama.export_llama \
   -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
   --group_size ${QLINEAR_GROUP_SIZE} \
   -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
-  --disable_dynamic_shape \
   -d fp32
 ```
 

@@ -699,19 +699,6 @@ def _validate_args(args):
                 "Shared embedding is only supported with torchao quantization."
             )
 
-    if (
-        args.quantization_mode is not None
-        and args.quantization_mode.startswith("torchao:")
-    ) or (
-        args.embedding_quantize is not None
-        and args.embedding_quantize.startswith("torchao:")
-    ):
-        if args.enable_dynamic_shape:
-            raise ValueError(
-                "Dynamic shape is not currently supported with torchao ops. Please use --disable_dynamic_shape."
-                "If you need this feature, please file an issue."
-            )
-
 
 def _to_edge_and_lower_llama_xnnpack(
     builder_exported,
+3 −8		torchao/__init__.py
+26 −12		torchao/csrc/cuda/rowwise_scaled_linear_sparse_cutlass/rowwise_scaled_linear_sparse_cutlass.cuh
+57 −0		torchao/experimental/op_lib.py
+12 −70		torchao/experimental/ops/embedding_xbit/op_embedding_xbit-impl.h
+3 −7		torchao/experimental/ops/embedding_xbit/op_embedding_xbit_aten.cpp
+4 −4		torchao/experimental/ops/embedding_xbit/op_embedding_xbit_executorch.cpp
+4 −0		torchao/experimental/ops/library.h
+4 −31		torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h
+1 −4		torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_aten.cpp
+2 −6		torchao/experimental/quant_api.py
+47 −0		torchao/experimental/tests/test_int8_dynamic_activation_intx_weight.py