diff --git a/.github/workflows/modelconverter_test.yaml b/.github/workflows/modelconverter_test.yaml
index 0833773..638be04 100644
--- a/.github/workflows/modelconverter_test.yaml
+++ b/.github/workflows/modelconverter_test.yaml
@@ -43,7 +43,7 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Set up Docker
-        uses: crazy-max/ghaction-setup-docker@v3
+        uses: crazy-max/ghaction-setup-docker@v4
 
       - name: Set up Python
         uses: actions/setup-python@v5
diff --git a/README.md b/README.md
index 51cfcfa..a116a96 100644
--- a/README.md
+++ b/README.md
@@ -361,6 +361,19 @@ Below is a table of common command-line options available when using the `modelc
 > [!NOTE]
 > This table is not exhaustive. For more detailed information about available options, run `modelconverter convert --help` in your command line interface. You can also check all the `[ config overrides ]` available at [defaults.yaml](shared_with_container/configs/defaults.yaml).
 
+##### RVC4 Quantization Mode Option
+
+The `rvc4.quantization_mode` CLI option lets you select one of several pre-defined quantization modes for RVC4 conversions. The available modes are:
+
+- `INT8_STANDARD`: Standard INT8 quantization **with calibration** (default), for optimal performance (FPS) and model size.
+- `INT8_ACCURACY_FOCUSED`: INT8 quantization **with calibration**. This mode uses more advanced quantization techniques that, depending on the model, may improve accuracy without reducing performance (FPS) or increasing model size.
+- `INT8_INT16_MIXED`: Mixed INT8 and INT16 quantization **with calibration**. This mode uses 8-bit weights and 16-bit activations across all layers for improved numeric stability and accuracy, at the cost of reduced performance (FPS) and increased model size.
+- `FP16_STANDARD`: FP16 quantization **without calibration**, for models that require higher accuracy and numeric stability, at the cost of reduced performance (FPS) and increased model size.
+- `CUSTOM`: Custom quantization mode, where the user can specify more advanced options in the configuration file or via command-line arguments.
+
+> [!IMPORTANT]
+> When `rvc4.quantization_mode` is set to anything other than `CUSTOM`, the default settings for that mode override any custom settings provided through the `rvc4.snpe_onnx_to_dlc_args`, `rvc4.snpe_dlc_quant_args`, or `rvc4.snpe_dlc_graph_prepare_args` options, whether given in the configuration file or on the command line.
+
 #### Handling Large ONNX Files (Exceeding 2GB)
 
 When working with ONNX models that exceed 2GB in size, the model data must be stored using ONNX's external data mechanism. This separates the model structure from the large weight data.
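To make the precedence rule from the README note above concrete, here is a minimal, self-contained sketch. It is not the actual modelconverter internals, and the helper name `effective_snpe_args` is hypothetical; it only illustrates how any pre-defined mode displaces user-supplied SNPE argument lists while `CUSTOM` preserves them:

```python
from enum import Enum


class QuantizationMode(Enum):
    # Mirrors the enum added in modelconverter/utils/types.py below.
    INT8_STD = "INT8_STANDARD"
    INT8_ACC = "INT8_ACCURACY_FOCUSED"
    INT8_16_MIX = "INT8_INT16_MIXED"
    FP16_STD = "FP16_STANDARD"
    CUSTOM = "CUSTOM"


def effective_snpe_args(
    mode: QuantizationMode, user_args: list[str]
) -> list[str]:
    # Pre-defined modes discard user-provided SNPE arguments entirely;
    # only CUSTOM keeps full control in the user's hands.
    return list(user_args) if mode is QuantizationMode.CUSTOM else []


assert effective_snpe_args(
    QuantizationMode.CUSTOM, ["--act_bitwidth", "16"]
) == ["--act_bitwidth", "16"]
assert effective_snpe_args(QuantizationMode.INT8_STD, ["--act_bitwidth", "16"]) == []
```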
diff --git a/modelconverter/packages/rvc4/exporter.py b/modelconverter/packages/rvc4/exporter.py
index 08724a4..744cf4e 100644
--- a/modelconverter/packages/rvc4/exporter.py
+++ b/modelconverter/packages/rvc4/exporter.py
@@ -1,3 +1,4 @@
+import json
 import shutil
 import subprocess
 import time
@@ -22,6 +23,7 @@
     DataType,
     Encoding,
     InputFileType,
+    QuantizationMode,
     ResizeMethod,
     Target,
 )
@@ -37,7 +39,6 @@ def __init__(self, config: SingleStageConfig, output_dir: Path):
         super().__init__(config=config, output_dir=output_dir)
         rvc4_cfg = config.rvc4
-        self.compress_to_fp16 = rvc4_cfg.compress_to_fp16
         self.snpe_onnx_to_dlc = rvc4_cfg.snpe_onnx_to_dlc_args
         self.snpe_dlc_quant = rvc4_cfg.snpe_dlc_quant_args
         self.snpe_dlc_graph_prepare = rvc4_cfg.snpe_dlc_graph_prepare_args
@@ -46,6 +47,14 @@ def __init__(self, config: SingleStageConfig, output_dir: Path):
         )
         self.use_per_row_quantization = rvc4_cfg.use_per_row_quantization
         self.optimization_level = rvc4_cfg.optimization_level
+        self.quantization_mode = rvc4_cfg.quantization_mode
+        if self.quantization_mode != QuantizationMode.CUSTOM:
+            self.snpe_onnx_to_dlc = []
+            self.snpe_dlc_quant = []
+            self.snpe_dlc_graph_prepare = []
+            logger.warning(
+                f"Using pre-defined arguments for quantization mode {self.quantization_mode.value}, which will override user-provided SNPE arguments. If you need full control of SNPE arguments, set `rvc4.quantization_mode: CUSTOM` in the config or CLI."
+            )
         self.keep_raw_images = rvc4_cfg.keep_raw_images
         if "--htp_socs" in self.snpe_dlc_graph_prepare:
             i = self.snpe_dlc_graph_prepare.index("--htp_socs")
@@ -114,7 +123,7 @@ def export(self) -> Path:
             args, ["--optimization_level", str(self.optimization_level)]
         )
         self._add_args(args, ["--htp_socs", ",".join(self.htp_socs)])
-        if self.compress_to_fp16:
+        if self.quantization_mode == QuantizationMode.FP16_STD:
             self._add_args(args, ["--use_float_io"])
         self._subprocess_run(
             ["snpe-dlc-graph-prepare", *args], meta_name="graph_prepare"
@@ -157,6 +166,15 @@ def calibrate(self, dlc_path: Path) -> Path:
         if self.use_per_row_quantization:
             args.append("--use_per_row_quantization")
 
+        if self.quantization_mode == QuantizationMode.INT8_ACC:
+            self._add_args(args, ["--param_quantizer", "enhanced"])
+            self._add_args(args, ["--act_quantizer", "enhanced"])
+        elif self.quantization_mode == QuantizationMode.INT8_16_MIX:
+            self._add_args(args, ["--param_quantizer", "enhanced"])
+            self._add_args(args, ["--act_quantizer", "enhanced"])
+            self._add_args(args, ["--act_bitwidth", "16"])
+            args.append("--override_params")
+
         start_time = time.time()
         self._subprocess_run(
             ["snpe-dlc-quant", *args], meta_name="quantization_cmd"
         )
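For reference, the extra `snpe-dlc-quant` flags contributed by each pre-defined mode in the `calibrate()` hunk above can be summarized as plain data. This is a sketch derived from the diff, not code that exists in the repository; `INT8_STANDARD` adds no flags, and `FP16_STANDARD` skips calibration altogether:

```python
# Extra snpe-dlc-quant flags per pre-defined mode, as added in calibrate().
EXTRA_DLC_QUANT_ARGS: dict[str, list[str]] = {
    "INT8_STANDARD": [],
    "INT8_ACCURACY_FOCUSED": [
        "--param_quantizer", "enhanced",
        "--act_quantizer", "enhanced",
    ],
    "INT8_INT16_MIXED": [
        "--param_quantizer", "enhanced",
        "--act_quantizer", "enhanced",
        "--act_bitwidth", "16",
        "--override_params",
    ],
}
```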
@@ -241,6 +259,21 @@ class Entry(NamedTuple):
                 f.write(entry_str + "\n")
         return self.input_list_path
 
+    def generate_io_encodings(self) -> Path:
+        encodings_dict = {"activation_encodings": {}, "param_encodings": {}}
+        if not (list(self.inputs.keys()) and list(self.outputs.keys())):
+            logger.warning(
+                "Cannot generate I/O encodings as inputs or outputs are not defined. The resulting DLC may not be compatible with DAI."
+            )
+        for name in list(self.inputs.keys()) + list(self.outputs.keys()):
+            encodings_dict["activation_encodings"][name] = [
+                {"bitwidth": 8, "dtype": "int"}
+            ]
+        encodings_path = self.intermediate_outputs_dir / "io_encodings.json"
+        with open(encodings_path, "w") as encodings_file:
+            json.dump(encodings_dict, encodings_file, indent=4)
+        return encodings_path
+
     def onnx_to_dlc(self) -> Path:
         logger.info("Exporting for RVC4")
         args = self.snpe_onnx_to_dlc
@@ -293,8 +326,17 @@ def onnx_to_dlc(self) -> Path:
                 "Proceeding wihtout specifying layout."
             )
 
-        if self.compress_to_fp16:
+        if self.quantization_mode == QuantizationMode.FP16_STD:
             self._add_args(args, ["--float_bitwidth", "16"])
+        elif self.quantization_mode == QuantizationMode.INT8_16_MIX:
+            io_encodings_file = self.generate_io_encodings()
+            self._add_args(
+                args,
+                [
+                    "--quantization_overrides",
+                    f"{io_encodings_file}",
+                ],
+            )
 
         if self.is_tflite:
             command = "snpe-tflite-to-dlc"
diff --git a/modelconverter/utils/config.py b/modelconverter/utils/config.py
index aa2a5ed..6cbccd7 100644
--- a/modelconverter/utils/config.py
+++ b/modelconverter/utils/config.py
@@ -27,6 +27,7 @@
     Encoding,
     InputFileType,
     PotDevice,
+    QuantizationMode,
     ResizeMethod,
     Target,
 )
@@ -264,7 +265,6 @@ class RVC3Config(BlobBaseConfig):
 
 
 class RVC4Config(TargetConfig):
-    compress_to_fp16: bool = False
     snpe_onnx_to_dlc_args: list[str] = []
     snpe_dlc_quant_args: list[str] = []
    snpe_dlc_graph_prepare_args: list[str] = []
@@ -272,17 +272,16 @@ class RVC4Config(TargetConfig):
     use_per_channel_quantization: bool = True
     use_per_row_quantization: bool = False
     optimization_level: Literal[1, 2, 3] = 2
+    quantization_mode: QuantizationMode = QuantizationMode.INT8_STD
     htp_socs: list[
         Literal["sm8350", "sm8450", "sm8550", "sm8650", "qcs6490", "qcs8550"]
     ] = ["sm8550"]
 
     @model_validator(mode="after")
     def _validate_fp16(self) -> Self:
-        if not self.compress_to_fp16:
+        if self.quantization_mode != QuantizationMode.FP16_STD:
             return self
         self.disable_calibration = True
-        if "qcs8550" not in self.htp_socs:
-            self.htp_socs.append("qcs8550")
         return self
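The reshaped `_validate_fp16` validator above now keys off the quantization mode rather than the removed `compress_to_fp16` flag, and it no longer appends `qcs8550` to `htp_socs`. A behavioral sketch, with a hypothetical free function standing in for the pydantic validator:

```python
def resolve_disable_calibration(mode: str, disable_calibration: bool) -> bool:
    # FP16_STANDARD always converts without calibration; every other mode
    # leaves the user's disable_calibration setting untouched.
    if mode == "FP16_STANDARD":
        return True
    return disable_calibration


assert resolve_disable_calibration("FP16_STANDARD", False) is True
assert resolve_disable_calibration("INT8_STANDARD", False) is False
assert resolve_disable_calibration("CUSTOM", True) is True
```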
"snpe_onnx_to_dlc_args", []) + prep_args = getattr(target_cfg, "snpe_dlc_graph_prepare_args", []) + fb16 = any( + a == "--float_bitwidth" and str(b) == "16" + for a, b in pairwise(onnx_args) + ) or any( + isinstance(x, str) + and x.startswith("--float_bitwidth=") + and x.split("=", 1)[1] == "16" + for x in onnx_args + ) + compress_to_fp16 = getattr(target_cfg, "compress_to_fp16", False) or ( + fb16 and "--use_float_io" in prep_args + ) + else: + compress_to_fp16 = quantization_mode == QuantizationMode.FP16_STD disable_calibration = target_cfg.disable_calibration match target, compress_to_fp16, disable_calibration: @@ -290,10 +301,10 @@ def modelconverter_config_to_nn( layout = make_default_layout(new_shape) dai_type = inp.encoding.to.value if inp.data_type == DataType.FLOAT16: - type = "F16F16F16" + channel_format = "F16F16F16" else: - type = "888" - dai_type += type + channel_format = "888" + dai_type += channel_format dai_type += "i" if layout == "NHWC" else "p" dtype = _get_io_dtype( diff --git a/modelconverter/utils/types.py b/modelconverter/utils/types.py index fa9832f..757f4c1 100644 --- a/modelconverter/utils/types.py +++ b/modelconverter/utils/types.py @@ -229,6 +229,14 @@ class Target(Enum): RVC4 = "rvc4" +class QuantizationMode(Enum): + INT8_STD = "INT8_STANDARD" + INT8_ACC = "INT8_ACCURACY_FOCUSED" + INT8_16_MIX = "INT8_INT16_MIXED" + FP16_STD = "FP16_STANDARD" + CUSTOM = "CUSTOM" + + class InputFileType(Enum): ONNX = "ONNX" IR = "IR" diff --git a/shared_with_container/configs/defaults.yaml b/shared_with_container/configs/defaults.yaml index fa83921..3928fae 100644 --- a/shared_with_container/configs/defaults.yaml +++ b/shared_with_container/configs/defaults.yaml @@ -198,6 +198,9 @@ stages: # List of additional arguments to pass to SNPE snpe-dlc-graph-prepare. snpe_dlc_graph_prepare_args: [] + # Disables calibration/quantization. + disable_calibration: false + # Whether to include the raw images in the intermediate outputs. # Warning: the raw images can get very large. keep_raw_images: False @@ -213,12 +216,8 @@ stages: # List of platforms to pre-compute the DLC graph for. htp_socs: ["sm8550"] - # Configures conversion to float16 precision. This will - # disable the calibration, add the `--float_bitwidth 16` - # flag to `snpe-onnx-to-dlc`, the `--use_float_io` flag - # to `snpe-dlc-graph-prepare`, and the `qcs8550` platform - # to the `htp_socs` list. - compress_to_fp16: False - # Optimization level for the DLC graph preparation. The available levels are: 1, 2, and 3. Higher optimization levels incur longer offline prepare time but yields more optimal graph and hence faster execution time for most graphs. optimization_level: 2 + + # Pre-defined quantization modes for the RVC4 exporter. Pre-defined modes (except CUSTOM) will override any user-provided SNPE arguments via `snpe_onnx_to_dlc_args`, `snpe_dlc_quant_args`, and `snpe_dlc_graph_prepare_args`. The available quantization modes are: INT8_STANDARD, INT8_ACCURACY_FOCUSED, INT8_INT16_MIXED, FP16_STANDARD, and CUSTOM. 
diff --git a/shared_with_container/configs/defaults.yaml b/shared_with_container/configs/defaults.yaml
index fa83921..3928fae 100644
--- a/shared_with_container/configs/defaults.yaml
+++ b/shared_with_container/configs/defaults.yaml
@@ -198,6 +198,9 @@ stages:
       # List of additional arguments to pass to SNPE snpe-dlc-graph-prepare.
       snpe_dlc_graph_prepare_args: []
 
+      # Disables calibration/quantization.
+      disable_calibration: False
+
       # Whether to include the raw images in the intermediate outputs.
       # Warning: the raw images can get very large.
       keep_raw_images: False
@@ -213,12 +216,8 @@ stages:
       # List of platforms to pre-compute the DLC graph for.
       htp_socs: ["sm8550"]
 
-      # Configures conversion to float16 precision. This will
-      # disable the calibration, add the `--float_bitwidth 16`
-      # flag to `snpe-onnx-to-dlc`, the `--use_float_io` flag
-      # to `snpe-dlc-graph-prepare`, and the `qcs8550` platform
-      # to the `htp_socs` list.
-      compress_to_fp16: False
-
       # Optimization level for the DLC graph preparation. The available levels are: 1, 2, and 3. Higher optimization levels incur longer offline prepare time but yields more optimal graph and hence faster execution time for most graphs.
       optimization_level: 2
+
+      # Pre-defined quantization modes for the RVC4 exporter. Pre-defined modes (except CUSTOM) will override any user-provided SNPE arguments via `snpe_onnx_to_dlc_args`, `snpe_dlc_quant_args`, and `snpe_dlc_graph_prepare_args`. The available quantization modes are: INT8_STANDARD, INT8_ACCURACY_FOCUSED, INT8_INT16_MIXED, FP16_STANDARD, and CUSTOM.
+      quantization_mode: INT8_STANDARD
diff --git a/tests/test_utils/test_config.py b/tests/test_utils/test_config.py
index 74073fe..5f4e775 100644
--- a/tests/test_utils/test_config.py
+++ b/tests/test_utils/test_config.py
@@ -25,6 +25,7 @@
     Encoding,
     InputFileType,
     PotDevice,
+    QuantizationMode,
     ResizeMethod,
     Target,
 )
@@ -65,7 +66,7 @@
         "disable_calibration": False,
         "use_per_channel_quantization": True,
         "use_per_row_quantization": False,
-        "compress_to_fp16": False,
+        "quantization_mode": QuantizationMode.INT8_STD,
         "optimization_level": 2,
     },
     "hailo": {
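Finally, to illustrate what `generate_io_encodings()` (added in `exporter.py` above) writes for the `INT8_INT16_MIXED` mode, here is the expected shape of `io_encodings.json` for a hypothetical model with one input `images` and one output `scores`; the tensor names are placeholders, and the dict shape follows the diff:

```python
import json

# Every graph input and output tensor is pinned to 8-bit integer
# activation encodings; param_encodings stays empty.
io_encodings = {
    "activation_encodings": {
        "images": [{"bitwidth": 8, "dtype": "int"}],
        "scores": [{"bitwidth": 8, "dtype": "int"}],
    },
    "param_encodings": {},
}
print(json.dumps(io_encodings, indent=4))
```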