
Commit da24463

Enable QNN-GPU in Olive through QNN-EP (microsoft#2220)
## Describe your changes

- Enable gpu in the QNNExecutionProvider device list
- Update the StaticLLM pass for gpu
- Update the ContextBinaryGeneration pass for bin generation through QNN GPU
- Use npu as the default device for QNN-EP
- Added an Olive recipe for GPU under PR: microsoft/olive-recipes#145

Testing:
- Validated the following models on Olive through gpu:
  - Qwen-Qwen2.5-1.5B-Instruct
  - deepseek-ai-DeepSeek-R1-Distill-Qwen-1.5B
  - meta-llama-Llama-3.2-1B-Instruct
  - microsoft-Phi-3.5-mini-instruct
- Validated HTP configs to make sure there are no regressions

Copy of microsoft#2217 with unit test fixes

---------

Co-authored-by: skadaver-qti <[email protected]>
1 parent b5bb07d commit da24463
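For orientation, the sketch below shows what an Olive system configuration exercising the new behavior could look like. It is a hypothetical example assembled from the test configs further down in this diff, not the actual recipe from microsoft/olive-recipes#145; the "device" and "execution_providers" field names follow the existing Olive accelerator schema.

# Hypothetical Olive system configs (illustrative only, not from the recipe PR).
# With only QNNExecutionProvider listed, the device now defaults to "npu";
# the QNN GPU backend has to be requested with an explicit "device": "gpu".
qnn_npu_default = {
    "type": "LocalSystem",
    "config": {
        "accelerators": [{"execution_providers": ["QNNExecutionProvider"]}],
    },
}

qnn_gpu_explicit = {
    "type": "LocalSystem",
    "config": {
        "accelerators": [{"device": "gpu", "execution_providers": ["QNNExecutionProvider"]}],
    },
}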

6 files changed: 203 additions, 15 deletions

olive/hardware/accelerator.py

Lines changed: 6 additions & 4 deletions
@@ -123,12 +123,12 @@ def infer_devices_from_execution_providers(execution_providers: list[str]):
                 # cannot infer device for CPUExecutionProvider since all ORT EP supports CPU
                 continue

-            inferered_devices = []
+            inferred_devices = []
             for device, eps in DEVICE_TO_EXECUTION_PROVIDERS.items():
                 if ep in eps:
-                    inferered_devices.append(device)
-            if inferered_devices:
-                ep_to_devices[ep] = inferered_devices
+                    inferred_devices.append(device)
+            if inferred_devices:
+                ep_to_devices[ep] = inferred_devices
             else:
                 ep_to_devices[ep] = None

@@ -163,6 +163,8 @@ def infer_single_device_from_execution_providers(execution_providers: list[str])

         if execution_providers == [ExecutionProvider.CPUExecutionProvider]:
             inferred_devices = ["cpu"]
+        elif execution_providers == [ExecutionProvider.QNNExecutionProvider]:
+            inferred_devices = ["npu"]
         else:
             inferred_devices = AcceleratorLookup.infer_devices_from_execution_providers(execution_providers)
         assert inferred_devices, (
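Why the special case: once QNNExecutionProvider is listed under both "gpu" and "npu" (see the constants.py change below), the generic lookup can no longer resolve a QNN-only EP list to a single device, so the code above short-circuits it to npu. A self-contained sketch of that rule, using a toy device-to-EP table rather than Olive's real one:

# Standalone illustration of the device-inference rule; not Olive's actual helper.
DEVICE_TO_EPS = {
    "cpu": {"CPUExecutionProvider"},
    "gpu": {"QNNExecutionProvider", "CUDAExecutionProvider", "DmlExecutionProvider"},
    "npu": {"QNNExecutionProvider", "VitisAIExecutionProvider"},
}

def infer_devices(execution_providers: list[str]) -> list[str]:
    if execution_providers == ["CPUExecutionProvider"]:
        return ["cpu"]
    if execution_providers == ["QNNExecutionProvider"]:
        # QNN-EP maps to both gpu and npu, so npu stays the default
        return ["npu"]
    devices = []
    for ep in execution_providers:
        for device, eps in DEVICE_TO_EPS.items():
            if ep in eps and device not in devices:
                devices.append(device)
    return devices

print(infer_devices(["QNNExecutionProvider"]))                               # ['npu']
print(infer_devices(["VitisAIExecutionProvider", "CUDAExecutionProvider"]))  # ['npu', 'gpu'] -> ambiguous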

olive/hardware/constants.py

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ class ExecutionProvider(StrEnumBase):
 DEVICE_TO_EXECUTION_PROVIDERS = {
     "cpu": {ExecutionProvider.CPUExecutionProvider, ExecutionProvider.OpenVINOExecutionProvider},
     "gpu": {
+        ExecutionProvider.QNNExecutionProvider,
         ExecutionProvider.DmlExecutionProvider,
         ExecutionProvider.CUDAExecutionProvider,
         ExecutionProvider.ROCMExecutionProvider,

olive/passes/onnx/common.py

Lines changed: 97 additions & 0 deletions
@@ -713,3 +713,100 @@ def update_llm_pipeline_genai_config(
     additional_files.append(str(new_genai_config_path))

     return model
+
+
+def update_llm_pipeline_genai_config_gpu(
+    model: ONNXModelHandler,
+    output_model_dir: Union[str, Path],
+    input_model_path: Union[str, Path],
+    decoder_config_extra: Optional[dict[str, Any]] = None,
+) -> ONNXModelHandler:
+    """Update the LLM pipeline in the model's genai_config.json file.
+
+    :param model: The model to update.
+    :param decoder_config_extra: Extra configuration for the decoder.
+    """
+    output_model_dir = Path(output_model_dir)
+
+    # update genai_config if it exists
+    genai_config_path = None
+    genai_config_path = Path(input_model_path).parent / "genai_config.json"
+
+    if genai_config_path.exists():
+        genai_config_path = str(genai_config_path.resolve())
+    else:
+        return model
+
+    with open(genai_config_path) as f:
+        genai_config = json.load(f)
+
+    # update model_type
+    genai_config["model"]["type"] = "decoder-pipeline"
+
+    # Update the provider_options list
+    provider_option = {"qnn": {"backend_type": "gpu"}}
+    genai_config["model"]["decoder"]["session_options"]["provider_options"] = [provider_option]
+
+    # update decoder config
+    decoder_config = genai_config["model"]["decoder"]
+    decoder_config.get("sliding_window", {}).pop("slide_inputs", None)
+    for key, value in (decoder_config_extra or {}).items():
+        existing_value = decoder_config.get(key)
+        if isinstance(existing_value, dict):
+            existing_value.update(value)
+        elif isinstance(existing_value, list):
+            existing_value.extend(value)
+        else:
+            decoder_config[key] = value
+
+    pipeline_config = {}
+    component_io_config = model.io_config
+    pipeline_config["model_onnx"] = {
+        "filename": Path(model.model_path).name,
+        "inputs": component_io_config["input_names"],
+        "outputs": component_io_config["output_names"],
+    }
+
+    decoder_config["pipeline"] = [pipeline_config]
+
+    # save the updated genai_config
+    new_genai_config_path = output_model_dir / "genai_config.json"
+    with new_genai_config_path.open("w") as f:
+        json.dump(genai_config, f, indent=4)
+
+    return model
+
+
+def update_llm_pipeline_genai_config_gpu_ctxbin(
+    model_path: Union[str, Path],
+) -> None:
+    """Update the filename fields in the model's genai_config.json file from 'model' to 'model_ctx'.
+
+    The genai_config.json file is updated in place in the model's directory.
+    :param model_path: Path to the model file.
+    """
+    # Find genai_config in the model's directory
+    model_dir = Path(model_path).parent
+    genai_config_path = model_dir / "genai_config.json"
+
+    if not genai_config_path.exists():
+        return
+
+    with open(genai_config_path) as f:
+        genai_config = json.load(f)
+
+    # Update decoder filename to 'model_ctx'
+    if "decoder" in genai_config.get("model", {}):
+        if "filename" in genai_config["model"]["decoder"]:
+            genai_config["model"]["decoder"]["filename"] = "model/model_ctx.onnx"
+
+    # Update filename in pipeline configuration
+    decoder_config = genai_config["model"]["decoder"]
+    if "pipeline" in decoder_config and isinstance(decoder_config["pipeline"], list):
+        for pipeline_item in decoder_config["pipeline"]:
+            if "model_onnx" in pipeline_item and "filename" in pipeline_item["model_onnx"]:
+                pipeline_item["model_onnx"]["filename"] = "model/model_ctx.onnx"
+
+    # Save the updated genai_config back to the same location
+    with genai_config_path.open("w") as f:
+        json.dump(genai_config, f, indent=4)
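To make the two helpers above concrete, here is a hand-written, heavily trimmed example of the genai_config.json shape that update_llm_pipeline_genai_config_gpu produces; real configs emitted by onnxruntime-genai carry many more fields, and the values shown are illustrative.

import json

# Trimmed, hand-written example of the result; not an actual generated config.
genai_config = {
    "model": {
        "type": "decoder-pipeline",  # rewritten from the original model type
        "decoder": {
            "session_options": {
                "provider_options": [{"qnn": {"backend_type": "gpu"}}],  # QNN GPU backend
            },
            "inputs": {
                "past_sequence_length": "past_seq_len",
                "total_sequence_length": "total_seq_len",
            },
            "sliding_window": {
                "window_size": 4096,  # config.context_length in the pass
                "pad_value": 0,
                "alignment": "left",
                "slide_key_value_cache": False,
            },
            "pipeline": [
                {
                    "model_onnx": {
                        "filename": "model.onnx",  # becomes "model/model_ctx.onnx" after the ctxbin helper
                        "inputs": ["input_ids", "past_seq_len", "total_seq_len"],
                        "outputs": ["logits"],
                    }
                }
            ],
        },
    }
}
print(json.dumps(genai_config, indent=4))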

olive/passes/onnx/context_binary.py

Lines changed: 13 additions & 5 deletions
@@ -15,7 +15,11 @@
 from olive.model import CompositeModelHandler, ONNXModelHandler
 from olive.model.utils import resolve_onnx_path
 from olive.passes import Pass
-from olive.passes.onnx.common import get_context_bin_file_names, process_llm_pipeline
+from olive.passes.onnx.common import (
+    get_context_bin_file_names,
+    process_llm_pipeline,
+    update_llm_pipeline_genai_config_gpu_ctxbin,
+)
 from olive.passes.pass_config import BasePassConfig, PassConfigParam

 logger = logging.getLogger(__name__)
@@ -237,10 +241,14 @@ def _generate_context_binary(
         # prepare provider options
         provider_options = provider_options or {}
         if execution_provider == ExecutionProvider.QNNExecutionProvider:
-            if version.parse(OrtVersion).release < version.parse("1.22.0").release:
-                provider_options["backend_path"] = "libQnnHtp.so" if platform.system() == "Linux" else "QnnHtp.dll"
-            if share_ep_contexts:
-                provider_options["enable_htp_weight_sharing"] = "1"
+            if str(device).lower() == "gpu":
+                provider_options["backend_path"] = "libQnnGpu.so" if platform.system() == "Linux" else "QnnGpu.dll"
+                update_llm_pipeline_genai_config_gpu_ctxbin(model_path)
+            else:
+                if version.parse(OrtVersion).release < version.parse("1.22.0").release:
+                    provider_options["backend_path"] = "libQnnHtp.so" if platform.system() == "Linux" else "QnnHtp.dll"
+                if share_ep_contexts:
+                    provider_options["enable_htp_weight_sharing"] = "1"

         # prepare session options
         session_options = session_options or {}
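As a standalone illustration of the backend selection above: the QNN GPU path only swaps in the GPU backend library, while the HTP path keeps the version-gated backend_path and weight-sharing options. The sketch below mirrors that choice but omits the ORT-version check and the genai_config update for brevity; it is not the Olive pass itself.

import platform

def qnn_backend_options(device: str, share_ep_contexts: bool = False) -> dict:
    """Sketch of the QNN provider-option selection; not the Olive pass."""
    options = {}
    if device.lower() == "gpu":
        options["backend_path"] = "libQnnGpu.so" if platform.system() == "Linux" else "QnnGpu.dll"
    else:
        # HTP path; the real pass only sets backend_path for ORT < 1.22.0
        options["backend_path"] = "libQnnHtp.so" if platform.system() == "Linux" else "QnnHtp.dll"
        if share_ep_contexts:
            options["enable_htp_weight_sharing"] = "1"
    return options

print(qnn_backend_options("gpu"))
print(qnn_backend_options("npu", share_ep_contexts=True))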

olive/passes/onnx/static_llm.py

Lines changed: 69 additions & 3 deletions
@@ -7,14 +7,17 @@

 import onnx

+from olive.hardware import Device
 from olive.hardware.accelerator import AcceleratorSpec
+from olive.hardware.constants import ExecutionProvider
 from olive.model import CompositeModelHandler, ONNXModelHandler
 from olive.passes import Pass
 from olive.passes.onnx.common import (
     add_version_metadata_to_model_proto,
     fix_dim_params,
     process_llm_pipeline,
     resave_model,
+    update_llm_pipeline_genai_config_gpu,
 )
 from olive.passes.onnx.onnx_dag import OnnxDAG
 from olive.passes.pass_config import BasePassConfig, PassConfigParam
@@ -61,9 +64,18 @@ def _default_config(cls, accelerator_spec: AcceleratorSpec) -> dict[str, PassCon
             ),
         }

-    def _run_for_config(
-        self, model: CompositeModelHandler, config: type[BasePassConfig], output_model_path: str
-    ) -> CompositeModelHandler:
+    def _run_for_config(self, model, config: type[BasePassConfig], output_model_path: str):
+        if (
+            self.accelerator_spec.execution_provider == ExecutionProvider.QNNExecutionProvider
+            and self.accelerator_spec.accelerator_type == Device.GPU
+        ):
+            assert isinstance(model, ONNXModelHandler), "StaticLLM (qnn-gpu) requires a single ONNXModelHandler."
+            return self._run_qnn_gpu(model, config, output_model_path)
+
+        else:
+            return self._run_generic(model, config, output_model_path)
+
+    def _run_generic(self, model: CompositeModelHandler, config: type[BasePassConfig], output_model_path: str):
         assert isinstance(model, CompositeModelHandler), "StaticLLM pass only supports CompositeModelHandler"
         model_components = list(model.model_components)
         assert all(isinstance(m, ONNXModelHandler) for m in model_components), "All components must be ONNXModelHandler"
@@ -169,6 +181,60 @@ def process_context_iterator(component_models, llm_pipeline, output_dir):
             group_session_options=config.group_session_options,
         )

+    def _run_qnn_gpu(self, model: ONNXModelHandler, config: type[BasePassConfig], output_model_path: Path):
+        output_model_dir = Path(output_model_path).with_suffix("")
+        model_path = Path(model.model_path)
+
+        # --- Step 1: Load model (handle both single and external data) ---
+        try:
+            model_proto = onnx.load(model_path, load_external_data=True)
+        except Exception as e:
+            raise RuntimeError(f"Failed to load ONNX model: {e}") from e
+
+        # --- Step 2: Fix symbolic dimensions ---
+        batch_size, sequence_length = OnnxDAG(model_proto).get_io_shape("input_ids")
+        if not (isinstance(batch_size, str) and isinstance(sequence_length, str)):
+            raise ValueError("Input dimensions must be symbolic before static shape fixing.")
+
+        param_mapping = {batch_size: config.batch_size, sequence_length: config.context_length}
+        self.fix_shape(model_proto, param_mapping)
+
+        # --- Step 3: Save model as external-data format ---
+        output_model_file = Path(output_model_dir) / "model.onnx"
+        external_data_file = Path(output_model_dir) / "model.onnx.data"
+
+        onnx.save(
+            model_proto,
+            str(output_model_file),
+            save_as_external_data=True,
+            all_tensors_to_one_file=True,
+            location=external_data_file.name,
+            convert_attribute=False,
+        )
+
+        decoder_config_extra = {
+            "inputs": {
+                "past_sequence_length": "past_seq_len",
+                "total_sequence_length": "total_seq_len",
+            },
+            "sliding_window": {
+                "window_size": config.context_length,
+                "pad_value": 0,
+                "alignment": "left",
+                "slide_key_value_cache": False,
+            },
+        }
+
+        input_model_path = model.model_path
+        model_static = ONNXModelHandler(model_path=output_model_dir, onnx_file_name=output_model_file.name)
+
+        return update_llm_pipeline_genai_config_gpu(
+            model_static,
+            output_model_dir,
+            input_model_path,
+            decoder_config_extra,
+        )
+
     @staticmethod
     def fix_shape(model_proto: onnx.ModelProto, param_mapping: dict[str, int]):
         """Fix the shape of the model based on the param mapping.

test/hardware/test_accelerator.py

Lines changed: 17 additions & 3 deletions
@@ -26,7 +26,7 @@
         ([ExecutionProvider.CUDAExecutionProvider], ["gpu"]),
         ([ExecutionProvider.CPUExecutionProvider, ExecutionProvider.CUDAExecutionProvider], ["gpu"]),
         ([ExecutionProvider.DmlExecutionProvider, ExecutionProvider.CUDAExecutionProvider], None),
-        ([ExecutionProvider.QNNExecutionProvider, ExecutionProvider.CUDAExecutionProvider], ["npu", "gpu"]),
+        ([ExecutionProvider.VitisAIExecutionProvider, ExecutionProvider.CUDAExecutionProvider], ["npu", "gpu"]),
     ],
 )
 def test_infer_accelerators_from_execution_provider(execution_providers_test):
@@ -35,6 +35,8 @@ def test_infer_accelerators_from_execution_provider(execution_providers_test):
     assert actual_rls == expected_accelerators


+# NOTE: Use PythonEnvironmentSystem test cases when using EPs that are not CPU EP
+# The @patch("onnxruntime.get_available_providers") doesn't seem to work on Windows in CI
 @pytest.mark.parametrize(
     ("system_config", "expected_acc_specs", "available_providers"),
     [
@@ -94,6 +96,18 @@ def test_infer_accelerators_from_execution_provider(execution_providers_test):
            [("cpu", ExecutionProvider.CPUExecutionProvider)],
            [ExecutionProvider.CPUExecutionProvider],
        ),
+        # for qnn, if only EP provided, we map it to npu device
+        (
+            {
+                "type": "PythonEnvironment",
+                "config": {
+                    "accelerators": [{"execution_providers": [ExecutionProvider.QNNExecutionProvider]}],
+                    "python_environment_path": Path(sys.executable).parent,
+                },
+            },
+            [("npu", ExecutionProvider.QNNExecutionProvider)],
+            [ExecutionProvider.QNNExecutionProvider],
+        ),
         # both device and EP provided
         (
            {
@@ -509,7 +523,7 @@ def test_normalize_accelerators_skip_ep_check(system_config, expected_acc):
                "accelerators": [
                    {
                        "execution_providers": [
-                            ExecutionProvider.QNNExecutionProvider,
+                            ExecutionProvider.VitisAIExecutionProvider,
                            ExecutionProvider.CUDAExecutionProvider,
                        ]
                    }
@@ -520,7 +534,7 @@ def test_normalize_accelerators_skip_ep_check(system_config, expected_acc):
            AssertionError,
            (
                "Cannot infer the devices from the execution providers "
-                "['QNNExecutionProvider', 'CUDAExecutionProvider']. Multiple devices are inferred: ['npu', 'gpu']."
+                "['VitisAIExecutionProvider', 'CUDAExecutionProvider']. Multiple devices are inferred: ['npu', 'gpu']."
            ),
        ),
    ],
