Skip to content

Commit 43451ae

Browse files
authored
Merge branch 'main' into asr-nemo-1-ckpt
2 parents 51d8948 + faa2be8 commit 43451ae

File tree

8 files changed

+429
-2
lines changed

8 files changed

+429
-2
lines changed

.github/workflows/cicd-main.yml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# --- .github/workflows/cicd-main.yml, hunk @@ -1713: new jobs under `jobs:` ---
# Three export/deploy test jobs, each dispatched through the shared
# _test_template.yml and gated on the pre-flight test selection output.
  L2_NeMo_2_Export_Deploy_Query_In_Framework:
    needs: [pre-flight, cicd-test-container-build]
    uses: ./.github/workflows/_test_template.yml
    if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_Export_Deploy_Query_In_Framework')
    with:
      RUNNER: self-hosted-azure
      SCRIPT: L2_NeMo_2_Export_Deploy_Query_In_Framework

  L2_ONNX_TRT_LLM_Embedding_Export:
    needs: [pre-flight, cicd-test-container-build]
    uses: ./.github/workflows/_test_template.yml
    if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_ONNX_TRT_LLM_Embedding_Export')
    with:
      RUNNER: self-hosted-azure
      SCRIPT: L2_ONNX_TRT_LLM_Embedding_Export

  L2_NeMo_2_Export_TRT_LLM:
    needs: [pre-flight, cicd-test-container-build]
    uses: ./.github/workflows/_test_template.yml
    if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_Export_TRT_LLM')
    with:
      RUNNER: self-hosted-azure
      SCRIPT: L2_NeMo_2_Export_TRT_LLM

# --- hunk @@ -2138: register the new jobs in the aggregate `needs` list ---
#       - L2_NeMo_2_Export_In_Framework        (existing context line)
      - L2_ONNX_TRT_LLM_Embedding_Export
      - L2_NeMo_2_Export_Deploy_Query_In_Framework
      - L2_NeMo_2_Export_TRT_LLM

nemo/export/tensorrt_llm.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -865,7 +865,10 @@ def convert_to_safe_tensors(
865865
for weight_dict, model_config in zip(weights_dicts, model_configs):
866866
rank = model_config.mapping.tp_rank
867867
for k, v in weight_dict.items():
868-
weight_dict[k] = numpy_to_torch(v)
868+
if isinstance(v, np.ndarray):
869+
weight_dict[k] = numpy_to_torch(v)
870+
else:
871+
weight_dict[k] = v
869872

870873
safetensors.torch.save_file(weight_dict, os.path.join(self.model_dir, f'rank{rank}.safetensors'))
871874
model_configs[0].to_json_file(os.path.join(self.model_dir, 'config.json'))
@@ -874,7 +877,8 @@ def convert_to_safe_tensors(
874877
if os.path.exists(tokenizer_path):
875878
shutil.copy(tokenizer_path, self.model_dir)
876879
else:
877-
self.tokenizer.save_pretrained(self.model_dir)
880+
if self.tokenizer is not None:
881+
self.tokenizer.save_pretrained(self.model_dir)
878882

879883
nemo_model_config = os.path.join(nemo_export_dir, "model_config.yaml")
880884
if os.path.exists(nemo_model_config):

tests/deploy/test_deploy_query.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
import numpy as np
17+
from pytriton.decorators import batch
18+
from pytriton.model_config import Tensor
19+
20+
from nemo.deploy import DeployPyTriton, ITritonDeployable
21+
from nemo.deploy.nlp import NemoQueryLLM
22+
from nemo.deploy.utils import cast_output, str_ndarray2list
23+
24+
25+
class MockModel(ITritonDeployable):
    """Minimal Triton-deployable stub that returns a canned reply for deploy/query tests."""

    @property
    def get_triton_input(self):
        """Triton input spec: prompts plus length/logit control tensors."""
        return (
            Tensor(name="prompts", shape=(-1,), dtype=bytes),
            Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True),
            Tensor(name="output_context_logits", shape=(-1,), dtype=np.bool_, optional=False),
            Tensor(name="output_generation_logits", shape=(-1,), dtype=np.bool_, optional=False),
        )

    @property
    def get_triton_output(self):
        """Triton output spec: a single byte-string tensor."""
        return (Tensor(name="outputs", shape=(-1,), dtype=bytes),)

    @batch
    def triton_infer_fn(self, **inputs: np.ndarray):
        """Consume the request tensors and answer with a fixed string.

        The decoded prompts and optional max_output_len are read (so the
        request is well-formed) but do not influence the canned response.
        """
        request = {"input_texts": str_ndarray2list(inputs.pop("prompts"))}
        if "max_output_len" in inputs:
            request["max_output_len"] = inputs.pop("max_output_len")[0][0]

        return {"outputs": cast_output("I am good, how about you?", np.bytes_)}
51+
52+
53+
def test_nemo_deploy_query():
    """Deploy MockModel behind PyTriton, query it via NemoQueryLLM, and verify the canned reply.

    The Triton server is always stopped, even if the query raises, so a failed
    run does not leave a server bound to the test ports.
    """
    model_name = "mock_model"
    model = MockModel()
    nm = DeployPyTriton(
        model=model,
        triton_model_name=model_name,
        max_batch_size=32,
        http_port=9002,
        grpc_port=8001,
        address="0.0.0.0",
        allow_grpc=True,
        allow_http=True,
        streaming=False,
    )
    nm.deploy()
    nm.run()

    try:
        nq = NemoQueryLLM(url="localhost:9002", model_name=model_name)
        output_deployed = nq.query_llm(
            prompts=["Hey, how is it going?"],
            max_output_len=20,
        )
    finally:
        # Guarantee server shutdown even when the query fails.
        nm.stop()

    assert output_deployed is not None, "Output cannot be none."
    # Fixed copy-paste assertion message: this checks the content, not None-ness.
    assert output_deployed == "I am good, how about you?", "Output does not match the expected canned reply."

tests/export/test_export_onnx.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import argparse
16+
import os
17+
18+
import tensorrt as trt
19+
20+
from nemo.collections.llm.gpt.model.hf_llama_embedding import get_llama_bidirectional_hf_model
21+
from nemo.export.onnx_llm_exporter import OnnxLLMExporter
22+
from nemo.utils import logging
23+
24+
25+
def get_args():
    """Parse command-line options for the ONNX/TensorRT embedding export test."""
    parser = argparse.ArgumentParser(description='Test ONNX and TensorRT export for LLM embedding models.')
    add = parser.add_argument  # local alias keeps the option table compact

    add('--hf_model_path', type=str, required=True, help="Hugging Face model id or path.")
    add('--pooling_strategy', type=str, default="avg", help="Pooling strategy for the model.")
    add("--normalize", default=False, action="store_true", help="Normalize the embeddings or not.")
    add('--onnx_export_path', type=str, default="/tmp/onnx_model/", help="Path to store ONNX model.")
    add('--onnx_opset', type=int, default=17, help="ONNX version to use for export.")
    add('--trt_model_path', type=str, default="/tmp/trt_model/", help="Path to store TensorRT model.")
    add(
        "--trt_version_compatible",
        default=False,
        action="store_true",
        help="Whether to generate version compatible TensorRT models.",
    )

    return parser.parse_args()
41+
42+
43+
def export_onnx_trt(args):
    """Export an HF Llama embedding model to ONNX, build a TensorRT engine, and smoke-test it.

    Args:
        args: Parsed CLI namespace from ``get_args()``.
    """
    # Base Llama model needs to be adapted to turn it into an embedding model.
    model, tokenizer = get_llama_bidirectional_hf_model(
        model_name_or_path=args.hf_model_path,
        normalize=args.normalize,
        pooling_mode=args.pooling_strategy,
        trust_remote_code=True,
    )

    input_names = ["input_ids", "attention_mask", "dimensions"]  # ONNX specific arguments, input names in this case.
    dynamic_axes_input = {
        "input_ids": {0: "batch_size", 1: "seq_length"},
        "attention_mask": {0: "batch_size", 1: "seq_length"},
        "dimensions": {0: "batch_size"},
    }

    output_names = ["embeddings"]  # ONNX specific arguments, output names in this case.
    dynamic_axes_output = {"embeddings": {0: "batch_size", 1: "embedding_dim"}}

    # Initialize ONNX exporter.
    onnx_exporter = OnnxLLMExporter(
        onnx_model_dir=args.onnx_export_path,
        model=model,
        tokenizer=tokenizer,
    )

    # Export ONNX model.
    onnx_exporter.export(
        input_names=input_names,
        output_names=output_names,
        opset=args.onnx_opset,
        dynamic_axes_input=dynamic_axes_input,
        dynamic_axes_output=dynamic_axes_output,
        export_dtype="fp32",
    )

    # TensorRT optimization profiles: [min, opt, max] shapes per input.
    input_profiles = [
        {
            "input_ids": [[1, 3], [16, 128], [64, 256]],
            "attention_mask": [[1, 3], [16, 128], [64, 256]],
            "dimensions": [[1], [16], [64]],
        }
    ]

    # TensorRT builder flags.
    trt_builder_flags = None
    if args.trt_version_compatible:
        trt_builder_flags = [trt.BuilderFlag.VERSION_COMPATIBLE]

    # Model specific layers to override the precision to fp32.
    override_layers_to_fp32 = [
        "/model/norm/",
        "/pooling_module",
        "/ReduceL2",
        "/Div",
    ]
    # Model specific operation: whether to override layernorm precision or not.
    override_layernorm_precision_to_fp32 = True
    profiling_verbosity = "layer_names_only"

    # Export ONNX to TensorRT.
    onnx_exporter.export_onnx_to_trt(
        trt_model_dir=args.trt_model_path,
        profiles=input_profiles,
        override_layernorm_precision_to_fp32=override_layernorm_precision_to_fp32,
        override_layers_to_fp32=override_layers_to_fp32,
        profiling_verbosity=profiling_verbosity,
        trt_builder_flags=trt_builder_flags,
    )

    assert os.path.exists(args.trt_model_path), "TensorRT model path was not created."
    assert os.path.exists(args.onnx_export_path), "ONNX export path was not created."

    # Smoke-test the exported model on a tiny two-item batch.
    prompt = ["hello", "world"]
    prompt = onnx_exporter.get_tokenizer(prompt)
    prompt["dimensions"] = [[2]]

    output = onnx_exporter.forward(prompt)
    if output is None:
        # Fixed: was an f-string with no placeholders (ruff F541).
        logging.warning("Output is None because ONNX runtime is not installed.")


if __name__ == '__main__':
    export_onnx_trt(get_args())
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import shutil
16+
17+
import pytest
18+
19+
20+
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
@pytest.mark.parametrize("tensor_parallelism_size,pipeline_parallelism_size", [(2, 1), (1, 2)])
def test_nemo2_convert_to_safe_tensors(tensor_parallelism_size, pipeline_parallelism_size):
    """
    Test safe tensor exporter. This tests the whole nemo export until engine building.
    """
    from pathlib import Path

    from nemo.export.tensorrt_llm import TensorRTLLM

    model_dir = Path("/tmp/safe_tensor_test/")
    trt_llm_exporter = TensorRTLLM(model_dir=str(model_dir))
    try:
        trt_llm_exporter.convert_to_safe_tensors(
            nemo_checkpoint_path="/home/TestData/llm/models/llama32_1b_nemo2",
            model_type="llama",
            delete_existing_files=True,
            tensor_parallelism_size=tensor_parallelism_size,
            pipeline_parallelism_size=pipeline_parallelism_size,
            gpus_per_node=2,
            use_parallel_embedding=False,
            use_embedding_sharing=False,
            dtype="bfloat16",
        )

        assert model_dir.exists(), "Safe tensors were not generated."
        assert (model_dir / "rank0.safetensors").exists(), "Safe tensors for rank0 were not generated."
        # rank1 only exists when weights are sharded across two TP ranks.
        if pipeline_parallelism_size == 1 and tensor_parallelism_size == 2:
            assert (model_dir / "rank1.safetensors").exists(), "Safe tensors for rank1 were not generated."
        # Fixed copy-paste message: the exporter writes config.json, not config.yaml.
        assert (model_dir / "config.json").exists(), "config.json was not generated."
    finally:
        # Clean up even when an assertion fails so /tmp does not accumulate state.
        shutil.rmtree(model_dir, ignore_errors=True)
51+
52+
53+
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_nemo2_convert_to_export():
    """
    Test the full TRT-LLM export path: NeMo checkpoint -> engine build -> inference.
    """
    from pathlib import Path

    from nemo.export.tensorrt_llm import TensorRTLLM

    model_dir = "/tmp/safe_tensor_test_2/"
    trt_llm_exporter = TensorRTLLM(model_dir=model_dir)
    try:
        trt_llm_exporter.export(
            nemo_checkpoint_path="/home/TestData/llm/models/llama32_1b_nemo2",
            model_type="llama",
            delete_existing_files=True,
            tensor_parallelism_size=1,
            pipeline_parallelism_size=1,
            gpus_per_node=None,
            max_input_len=1024,
            max_output_len=256,
            max_batch_size=4,
            max_prompt_embedding_table_size=None,
            use_parallel_embedding=False,
            use_embedding_sharing=False,
            paged_kv_cache=True,
            remove_input_padding=True,
            paged_context_fmha=False,
            dtype=None,
            load_model=True,
            use_lora_plugin=None,
            lora_target_modules=None,
            max_lora_rank=64,
            max_num_tokens=None,
            opt_num_tokens=None,
            max_seq_len=512,
            multiple_profiles=False,
            gpt_attention_plugin="auto",
            gemm_plugin="auto",
            use_mcore_path=True,
            reduce_fusion=True,
            fp8_quantized=None,
            fp8_kvcache=None,
            gather_context_logits=True,
            gather_generation_logits=True,
            build_rank=None,
        )

        output = trt_llm_exporter.forward(
            input_texts=["Tell me the capitol of France "],
            max_output_len=16,
            top_k=1,
            top_p=0.0,
            temperature=0.1,
            stop_words_list=None,
            bad_words_list=None,
            no_repeat_ngram_size=None,
            task_ids=None,
            lora_uids=None,
            prompt_embeddings_table=None,
            prompt_embeddings_checkpoint_path=None,
            streaming=False,
            output_log_probs=False,
            output_context_logits=False,
            output_generation_logits=False,
        )

        print(output)

        # Fixed copy-paste messages: these assertions check the built TRT-LLM
        # engine artifacts, not safetensors (and config.json, not config.yaml).
        engine_dir = Path(model_dir) / "trtllm_engine"
        assert engine_dir.exists(), "TRT-LLM engine directory was not generated."
        assert (engine_dir / "rank0.engine").exists(), "TRT-LLM engine for rank0 was not generated."
        assert (engine_dir / "config.json").exists(), "config.json was not generated."
    finally:
        # Clean up even when the export or an assertion fails.
        shutil.rmtree(model_dir, ignore_errors=True)

0 commit comments

Comments
 (0)