diff --git a/.ci/scripts/test_llama_torchao_lowbit.sh b/.ci/scripts/test_llama_torchao_lowbit.sh
index 76fabb04250..43f60c45524 100644
--- a/.ci/scripts/test_llama_torchao_lowbit.sh
+++ b/.ci/scripts/test_llama_torchao_lowbit.sh
@@ -78,7 +78,6 @@ ${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \
     -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
     --group_size ${QLINEAR_GROUP_SIZE} \
     -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
-    --disable_dynamic_shape \
     -d fp32
 
 # Test run
diff --git a/examples/models/llama/README.md b/examples/models/llama/README.md
index 95f92ddb887..0bef45ea3ae 100644
--- a/examples/models/llama/README.md
+++ b/examples/models/llama/README.md
@@ -382,7 +382,7 @@ Please refer to [this tutorial](https://pytorch.org/executorch/main/llm/llama-de
 
 ## Running with low-bit kernels
 
-We now give instructions for quantizating and running your model with low-bit kernels. These are still experimental, and require you do development on an Arm-based Mac. Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results. Currently dynamic shapes must be disabled when exporting a model with these kernels.
+We now give instructions for quantizing and running your model with low-bit kernels. These are still experimental, and require that you do development on an Arm-based Mac. Also note that low-bit quantization often requires QAT (quantization-aware training) to give good quality results.
 
 First export your model for lowbit quantization (step 2 above):
 
@@ -408,7 +408,6 @@ python -m examples.models.llama.export_llama \
     -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
     --group_size ${QLINEAR_GROUP_SIZE} \
     -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
-    --disable_dynamic_shape \
     -d fp32
 ```
 
diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py
index 1620924f4f6..afa72def2f4 100644
--- a/examples/models/llama/export_llama_lib.py
+++ b/examples/models/llama/export_llama_lib.py
@@ -699,19 +699,6 @@ def _validate_args(args):
             "Shared embedding is only supported with torchao quantization."
         )
 
-    if (
-        args.quantization_mode is not None
-        and args.quantization_mode.startswith("torchao:")
-    ) or (
-        args.embedding_quantize is not None
-        and args.embedding_quantize.startswith("torchao:")
-    ):
-        if args.enable_dynamic_shape:
-            raise ValueError(
-                "Dynamic shape is not currently supported with torchao ops. Please use --disable_dynamic_shape."
-                "If you need this feature, please file an issue."
-            )
-
 
 def _to_edge_and_lower_llama_xnnpack(
     builder_exported,
diff --git a/third-party/ao b/third-party/ao
index 83eb4903916..923242e22b5 160000
--- a/third-party/ao
+++ b/third-party/ao
@@ -1 +1 @@
-Subproject commit 83eb4903916340900c140afd0fe35dfaddf23c23
+Subproject commit 923242e22b5fb67646473605ab959b90cc450abc
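
For readers skimming the hunks: the quantization variables referenced above are defined outside the changed lines, so the following is a minimal, hedged sketch of the lowbit export invocation as it reads after this change, with dynamic shapes left enabled. The variable values are illustrative assumptions (not taken from this diff), and the non-quantization flags (checkpoint, params, output name, and so on) are omitted; only `-qmode`, `--group_size`, `-E`, and `-d` appear in the hunks themselves.

```
# Illustrative settings only; the real values are defined earlier in the
# script/README that this patch touches (these numbers are assumptions).
QLINEAR_BITWIDTH=4
QLINEAR_GROUP_SIZE=256
QEMBEDDING_BITWIDTH=4
QEMBEDDING_GROUP_SIZE=32

# Quantization-related flags as they read after this change; note that
# --disable_dynamic_shape is no longer passed for the torchao modes.
# Other required export flags (checkpoint, params, output name, ...) are
# omitted from this sketch.
python -m examples.models.llama.export_llama \
    -qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
    --group_size ${QLINEAR_GROUP_SIZE} \
    -E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
    -d fp32
```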