
Commit da24463

Enable QNN-GPU in Olive through QNN-EP (microsoft#2220)
## Describe your changes

- Enable gpu in the QNNExecutionProvider device list
- Update the StaticLLM pass for gpu
- Update the ContextBinaryGeneration pass for bin generation through QNN GPU
- Use npu as the default device for QNN-EP
- Added an Olive recipe for GPU under PR: microsoft/olive-recipes#145

Testing:
- Validated the following models on Olive through gpu:
  - Qwen-Qwen2.5-1.5B-Instruct
  - deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B
  - meta-llama-Llama-3.2-1B-Instruct
  - microsoft-Phi-3.5-mini-instruct
- Validated HTP configs to make sure there are no regressions

Copy of microsoft#2217 with unit test fixes

---------

Co-authored-by: skadaver-qti <[email protected]>
1 parent b5bb07d commit da24463
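For orientation, the sketch below shows what an Olive system configuration exercising the new behavior could look like. It is a hypothetical example assembled from the test configs further down in this diff, not the actual recipe from microsoft/olive-recipes#145; the "device" and "execution_providers" field names follow the existing Olive accelerator schema.

# Hypothetical Olive system configs (illustrative only, not from the recipe PR).
# With only QNNExecutionProvider listed, the device now defaults to "npu";
# the QNN GPU backend has to be requested with an explicit "device": "gpu".
qnn_npu_default = {
    "type": "LocalSystem",
    "config": {
        "accelerators": [{"execution_providers": ["QNNExecutionProvider"]}],
    },
}

qnn_gpu_explicit = {
    "type": "LocalSystem",
    "config": {
        "accelerators": [{"device": "gpu", "execution_providers": ["QNNExecutionProvider"]}],
    },
}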

6 files changed: 203 additions, 15 deletions

olive/hardware/accelerator.py

Lines changed: 6 additions & 4 deletions
@@ -123,12 +123,12 @@ def infer_devices_from_execution_providers(execution_providers: list[str]):
                 # cannot infer device for CPUExecutionProvider since all ORT EP supports CPU
                 continue

-            inferered_devices = []
+            inferred_devices = []
             for device, eps in DEVICE_TO_EXECUTION_PROVIDERS.items():
                 if ep in eps:
-                    inferered_devices.append(device)
-            if inferered_devices:
-                ep_to_devices[ep] = inferered_devices
+                    inferred_devices.append(device)
+            if inferred_devices:
+                ep_to_devices[ep] = inferred_devices
             else:
                 ep_to_devices[ep] = None

@@ -163,6 +163,8 @@ def infer_single_device_from_execution_providers(execution_providers: list[str])

         if execution_providers == [ExecutionProvider.CPUExecutionProvider]:
             inferred_devices = ["cpu"]
+        elif execution_providers == [ExecutionProvider.QNNExecutionProvider]:
+            inferred_devices = ["npu"]
         else:
             inferred_devices = AcceleratorLookup.infer_devices_from_execution_providers(execution_providers)
         assert inferred_devices, (
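Why the special case: once QNNExecutionProvider is listed under both "gpu" and "npu" (see the constants.py change below), the generic lookup can no longer resolve a QNN-only EP list to a single device, so the code above short-circuits it to npu. A self-contained sketch of that rule, using a toy device-to-EP table rather than Olive's real one:

# Standalone illustration of the device-inference rule; not Olive's actual helper.
DEVICE_TO_EPS = {
    "cpu": {"CPUExecutionProvider"},
    "gpu": {"QNNExecutionProvider", "CUDAExecutionProvider", "DmlExecutionProvider"},
    "npu": {"QNNExecutionProvider", "VitisAIExecutionProvider"},
}

def infer_devices(execution_providers: list[str]) -> list[str]:
    if execution_providers == ["CPUExecutionProvider"]:
        return ["cpu"]
    if execution_providers == ["QNNExecutionProvider"]:
        # QNN-EP maps to both gpu and npu, so npu stays the default
        return ["npu"]
    devices = []
    for ep in execution_providers:
        for device, eps in DEVICE_TO_EPS.items():
            if ep in eps and device not in devices:
                devices.append(device)
    return devices

print(infer_devices(["QNNExecutionProvider"]))                               # ['npu']
print(infer_devices(["VitisAIExecutionProvider", "CUDAExecutionProvider"]))  # ['npu', 'gpu'] -> ambiguous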

olive/hardware/constants.py

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ class ExecutionProvider(StrEnumBase):
 DEVICE_TO_EXECUTION_PROVIDERS = {
     "cpu": {ExecutionProvider.CPUExecutionProvider, ExecutionProvider.OpenVINOExecutionProvider},
     "gpu": {
+        ExecutionProvider.QNNExecutionProvider,
         ExecutionProvider.DmlExecutionProvider,
         ExecutionProvider.CUDAExecutionProvider,
         ExecutionProvider.ROCMExecutionProvider,

olive/passes/onnx/common.py

Lines changed: 97 additions & 0 deletions
@@ -713,3 +713,100 @@ def update_llm_pipeline_genai_config(
     additional_files.append(str(new_genai_config_path))

     return model
+
+
+def update_llm_pipeline_genai_config_gpu(
+    model: ONNXModelHandler,
+    output_model_dir: Union[str, Path],
+    input_model_path: Union[str, Path],
+    decoder_config_extra: Optional[dict[str, Any]] = None,
+) -> ONNXModelHandler:
+    """Update the LLM pipeline in the model's genai_config.json file.
+
+    :param model: The model to update.
+    :param decoder_config_extra: Extra configuration for the decoder.
+    """
+    output_model_dir = Path(output_model_dir)
+
+    # update genai_config if it exists
+    genai_config_path = None
+    genai_config_path = Path(input_model_path).parent / "genai_config.json"
+
+    if genai_config_path.exists():
+        genai_config_path = str(genai_config_path.resolve())
+    else:
+        return model
+
+    with open(genai_config_path) as f:
+        genai_config = json.load(f)
+
+    # update model_type
+    genai_config["model"]["type"] = "decoder-pipeline"
+
+    # Update the provider_options list
+    provider_option = {"qnn": {"backend_type": "gpu"}}
+    genai_config["model"]["decoder"]["session_options"]["provider_options"] = [provider_option]
+
+    # update decoder config
+    decoder_config = genai_config["model"]["decoder"]
+    decoder_config.get("sliding_window", {}).pop("slide_inputs", None)
+    for key, value in (decoder_config_extra or {}).items():
+        existing_value = decoder_config.get(key)
+        if isinstance(existing_value, dict):
+            existing_value.update(value)
+        elif isinstance(existing_value, list):
+            existing_value.extend(value)
+        else:
+            decoder_config[key] = value
+
+    pipeline_config = {}
+    component_io_config = model.io_config
+    pipeline_config["model_onnx"] = {
+        "filename": Path(model.model_path).name,
+        "inputs": component_io_config["input_names"],
+        "outputs": component_io_config["output_names"],
+    }
+
+    decoder_config["pipeline"] = [pipeline_config]
+
+    # save the updated genai_config
+    new_genai_config_path = output_model_dir / "genai_config.json"
+    with new_genai_config_path.open("w") as f:
+        json.dump(genai_config, f, indent=4)
+
+    return model
+
+
+def update_llm_pipeline_genai_config_gpu_ctxbin(
+    model_path: Union[str, Path],
+) -> None:
+    """Update the filename fields in the model's genai_config.json file from 'model' to 'model_ctx'.
+
+    The genai_config.json file is updated in place in the model's directory.
+    :param model_path: Path to the model file.
+    """
+    # Find genai_config in the model's directory
+    model_dir = Path(model_path).parent
+    genai_config_path = model_dir / "genai_config.json"
+
+    if not genai_config_path.exists():
+        return
+
+    with open(genai_config_path) as f:
+        genai_config = json.load(f)
+
+    # Update decoder filename to 'model_ctx'
+    if "decoder" in genai_config.get("model", {}):
+        if "filename" in genai_config["model"]["decoder"]:
+            genai_config["model"]["decoder"]["filename"] = "model/model_ctx.onnx"
+
+    # Update filename in pipeline configuration
+    decoder_config = genai_config["model"]["decoder"]
+    if "pipeline" in decoder_config and isinstance(decoder_config["pipeline"], list):
+        for pipeline_item in decoder_config["pipeline"]:
+            if "model_onnx" in pipeline_item and "filename" in pipeline_item["model_onnx"]:
+                pipeline_item["model_onnx"]["filename"] = "model/model_ctx.onnx"
+
+    # Save the updated genai_config back to the same location
+    with genai_config_path.open("w") as f:
+        json.dump(genai_config, f, indent=4)
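To make the two helpers above concrete, here is a hand-written, heavily trimmed example of the genai_config.json shape that update_llm_pipeline_genai_config_gpu produces; real configs emitted by onnxruntime-genai carry many more fields, and the values shown are illustrative.

import json

# Trimmed, hand-written example of the result; not an actual generated config.
genai_config = {
    "model": {
        "type": "decoder-pipeline",  # rewritten from the original model type
        "decoder": {
            "session_options": {
                "provider_options": [{"qnn": {"backend_type": "gpu"}}],  # QNN GPU backend
            },
            "inputs": {
                "past_sequence_length": "past_seq_len",
                "total_sequence_length": "total_seq_len",
            },
            "sliding_window": {
                "window_size": 4096,  # config.context_length in the pass
                "pad_value": 0,
                "alignment": "left",
                "slide_key_value_cache": False,
            },
            "pipeline": [
                {
                    "model_onnx": {
                        "filename": "model.onnx",  # becomes "model/model_ctx.onnx" after the ctxbin helper
                        "inputs": ["input_ids", "past_seq_len", "total_seq_len"],
                        "outputs": ["logits"],
                    }
                }
            ],
        },
    }
}
print(json.dumps(genai_config, indent=4))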

olive/passes/onnx/context_binary.py

Lines changed: 13 additions & 5 deletions
@@ -15,7 +15,11 @@
 from olive.model import CompositeModelHandler, ONNXModelHandler
 from olive.model.utils import resolve_onnx_path
 from olive.passes import Pass
-from olive.passes.onnx.common import get_context_bin_file_names, process_llm_pipeline
+from olive.passes.onnx.common import (
+    get_context_bin_file_names,
+    process_llm_pipeline,
+    update_llm_pipeline_genai_config_gpu_ctxbin,
+)
 from olive.passes.pass_config import BasePassConfig, PassConfigParam

 logger = logging.getLogger(__name__)
@@ -237,10 +241,14 @@ def _generate_context_binary(
         # prepare provider options
         provider_options = provider_options or {}
         if execution_provider == ExecutionProvider.QNNExecutionProvider:
-            if version.parse(OrtVersion).release < version.parse("1.22.0").release:
-                provider_options["backend_path"] = "libQnnHtp.so" if platform.system() == "Linux" else "QnnHtp.dll"
-            if share_ep_contexts:
-                provider_options["enable_htp_weight_sharing"] = "1"
+            if str(device).lower() == "gpu":
+                provider_options["backend_path"] = "libQnnGpu.so" if platform.system() == "Linux" else "QnnGpu.dll"
+                update_llm_pipeline_genai_config_gpu_ctxbin(model_path)
+            else:
+                if version.parse(OrtVersion).release < version.parse("1.22.0").release:
+                    provider_options["backend_path"] = "libQnnHtp.so" if platform.system() == "Linux" else "QnnHtp.dll"
+                if share_ep_contexts:
+                    provider_options["enable_htp_weight_sharing"] = "1"

         # prepare session options
         session_options = session_options or {}
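As a standalone illustration of the backend selection above: the QNN GPU path only swaps in the GPU backend library, while the HTP path keeps the version-gated backend_path and weight-sharing options. The sketch below mirrors that choice but omits the ORT-version check and the genai_config update for brevity; it is not the Olive pass itself.

import platform

def qnn_backend_options(device: str, share_ep_contexts: bool = False) -> dict:
    """Sketch of the QNN provider-option selection; not the Olive pass."""
    options = {}
    if device.lower() == "gpu":
        options["backend_path"] = "libQnnGpu.so" if platform.system() == "Linux" else "QnnGpu.dll"
    else:
        # HTP path; the real pass only sets backend_path for ORT < 1.22.0
        options["backend_path"] = "libQnnHtp.so" if platform.system() == "Linux" else "QnnHtp.dll"
        if share_ep_contexts:
            options["enable_htp_weight_sharing"] = "1"
    return options

print(qnn_backend_options("gpu"))
print(qnn_backend_options("npu", share_ep_contexts=True))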

olive/passes/onnx/static_llm.py

Lines changed: 69 additions & 3 deletions
@@ -7,14 +7,17 @@

 import onnx

+from olive.hardware import Device
 from olive.hardware.accelerator import AcceleratorSpec
+from olive.hardware.constants import ExecutionProvider
 from olive.model import CompositeModelHandler, ONNXModelHandler
 from olive.passes import Pass
 from olive.passes.onnx.common import (
     add_version_metadata_to_model_proto,
     fix_dim_params,
     process_llm_pipeline,
     resave_model,
+    update_llm_pipeline_genai_config_gpu,
 )
 from olive.passes.onnx.onnx_dag import OnnxDAG
 from olive.passes.pass_config import BasePassConfig, PassConfigParam
@@ -61,9 +64,18 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
             ),
         }

-    def _run_for_config(
-        self, model: CompositeModelHandler, config: type[BasePassConfig], output_model_path: str
-    ) -> CompositeModelHandler:
+    def _run_for_config(self, model, config: type[BasePassConfig], output_model_path: str):
+        if (
+            self.accelerator_spec.execution_provider == ExecutionProvider.QNNExecutionProvider
+            and self.accelerator_spec.accelerator_type == Device.GPU
+        ):
+            assert isinstance(model, ONNXModelHandler), "StaticLLM (qnn-gpu) requires a single ONNXModelHandler."
+            return self._run_qnn_gpu(model, config, output_model_path)
+
+        else:
+            return self._run_generic(model, config, output_model_path)
+
+    def _run_generic(self, model: CompositeModelHandler, config: type[BasePassConfig], output_model_path: str):
         assert isinstance(model, CompositeModelHandler), "StaticLLM pass only supports CompositeModelHandler"
         model_components = list(model.model_components)
         assert all(isinstance(m, ONNXModelHandler) for m in model_components), "All components must be ONNXModelHandler"
@@ -169,6 +181,60 @@ def process_context_iterator(component_models, llm_pipeline, output_dir):
             group_session_options=config.group_session_options,
         )

+    def _run_qnn_gpu(self, model: ONNXModelHandler, config: type[BasePassConfig], output_model_path: Path):
+        output_model_dir = Path(output_model_path).with_suffix("")
+        model_path = Path(model.model_path)
+
+        # --- Step 1: Load model (handle both single and external data) ---
+        try:
+            model_proto = onnx.load(model_path, load_external_data=True)
+        except Exception as e:
+            raise RuntimeError(f"Failed to load ONNX model: {e}") from e
+
+        # --- Step 2: Fix symbolic dimensions ---
+        batch_size, sequence_length = OnnxDAG(model_proto).get_io_shape("input_ids")
+        if not (isinstance(batch_size, str) and isinstance(sequence_length, str)):
+            raise ValueError("Input dimensions must be symbolic before static shape fixing.")
+
+        param_mapping = {batch_size: config.batch_size, sequence_length: config.context_length}
+        self.fix_shape(model_proto, param_mapping)
+
+        # --- Step 3: Save model as external-data format ---
+        output_model_file = Path(output_model_dir) / "model.onnx"
+        external_data_file = Path(output_model_dir) / "model.onnx.data"
+
+        onnx.save(
+            model_proto,
+            str(output_model_file),
+            save_as_external_data=True,
+            all_tensors_to_one_file=True,
+            location=external_data_file.name,
+            convert_attribute=False,
+        )
+
+        decoder_config_extra = {
+            "inputs": {
+                "past_sequence_length": "past_seq_len",
+                "total_sequence_length": "total_seq_len",
+            },
+            "sliding_window": {
+                "window_size": config.context_length,
+                "pad_value": 0,
+                "alignment": "left",
+                "slide_key_value_cache": False,
+            },
+        }
+
+        input_model_path = model.model_path
+        model_static = ONNXModelHandler(model_path=output_model_dir, onnx_file_name=output_model_file.name)
+
+        return update_llm_pipeline_genai_config_gpu(
+            model_static,
+            output_model_dir,
+            input_model_path,
+            decoder_config_extra,
+        )
+
     @staticmethod
     def fix_shape(model_proto: onnx.ModelProto, param_mapping: dict[str, int]):
         """Fix the shape of the model based on the param mapping.

test/hardware/test_accelerator.py

Lines changed: 17 additions & 3 deletions
@@ -26,7 +26,7 @@
         ([ExecutionProvider.CUDAExecutionProvider], ["gpu"]),
         ([ExecutionProvider.CPUExecutionProvider, ExecutionProvider.CUDAExecutionProvider], ["gpu"]),
         ([ExecutionProvider.DmlExecutionProvider, ExecutionProvider.CUDAExecutionProvider], None),
-        ([ExecutionProvider.QNNExecutionProvider, ExecutionProvider.CUDAExecutionProvider], ["npu", "gpu"]),
+        ([ExecutionProvider.VitisAIExecutionProvider, ExecutionProvider.CUDAExecutionProvider], ["npu", "gpu"]),
     ],
 )
 def test_infer_accelerators_from_execution_provider(execution_providers_test):
@@ -35,6 +35,8 @@ def test_infer_accelerators_from_execution_provider(execution_providers_test):
     assert actual_rls == expected_accelerators


+# NOTE: Use PythonEnvironmentSystem test cases when using EPs that are not CPU EP
+# The @patch("onnxruntime.get_available_providers") doesn't seem to work on Windows in CI
 @pytest.mark.parametrize(
     ("system_config", "expected_acc_specs", "available_providers"),
     [
@@ -94,6 +96,18 @@ def test_infer_accelerators_from_execution_provider(execution_providers_test):
            [("cpu", ExecutionProvider.CPUExecutionProvider)],
            [ExecutionProvider.CPUExecutionProvider],
        ),
+        # for qnn, if only EP provided, we map it to npu device
+        (
+            {
+                "type": "PythonEnvironment",
+                "config": {
+                    "accelerators": [{"execution_providers": [ExecutionProvider.QNNExecutionProvider]}],
+                    "python_environment_path": Path(sys.executable).parent,
+                },
+            },
+            [("npu", ExecutionProvider.QNNExecutionProvider)],
+            [ExecutionProvider.QNNExecutionProvider],
+        ),
         # both device and EP provided
         (
            {
@@ -509,7 +523,7 @@ def test_normalize_accelerators_skip_ep_check(system_config, expected_acc):
                "accelerators": [
                    {
                        "execution_providers": [
-                            ExecutionProvider.QNNExecutionProvider,
+                            ExecutionProvider.VitisAIExecutionProvider,
                            ExecutionProvider.CUDAExecutionProvider,
                        ]
                    }
@@ -520,7 +534,7 @@ def test_normalize_accelerators_skip_ep_check(system_config, expected_acc):
            AssertionError,
            (
                "Cannot infer the devices from the execution providers "
-                "['QNNExecutionProvider', 'CUDAExecutionProvider']. Multiple devices are inferred: ['npu', 'gpu']."
+                "['VitisAIExecutionProvider', 'CUDAExecutionProvider']. Multiple devices are inferred: ['npu', 'gpu']."
            ),
        ),
    ],
