Skip to content

Commit 43451ae

Browse files
authored
Merge branch 'main' into asr-nemo-1-ckpt
2 parents 51d8948 + faa2be8 commit 43451ae

File tree

8 files changed

+429
-2
lines changed

8 files changed

+429
-2
lines changed

.github/workflows/cicd-main.yml

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# --- .github/workflows/cicd-main.yml, hunk @@ -1713: new jobs under `jobs:` ---
# Three export/deploy test jobs, each dispatched through the shared
# _test_template.yml and gated on the pre-flight test selection output.
  L2_NeMo_2_Export_Deploy_Query_In_Framework:
    needs: [pre-flight, cicd-test-container-build]
    uses: ./.github/workflows/_test_template.yml
    if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_Export_Deploy_Query_In_Framework')
    with:
      RUNNER: self-hosted-azure
      SCRIPT: L2_NeMo_2_Export_Deploy_Query_In_Framework

  L2_ONNX_TRT_LLM_Embedding_Export:
    needs: [pre-flight, cicd-test-container-build]
    uses: ./.github/workflows/_test_template.yml
    if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_ONNX_TRT_LLM_Embedding_Export')
    with:
      RUNNER: self-hosted-azure
      SCRIPT: L2_ONNX_TRT_LLM_Embedding_Export

  L2_NeMo_2_Export_TRT_LLM:
    needs: [pre-flight, cicd-test-container-build]
    uses: ./.github/workflows/_test_template.yml
    if: contains(fromJSON(needs.pre-flight.outputs.test_to_run), 'L2_NeMo_2_Export_TRT_LLM')
    with:
      RUNNER: self-hosted-azure
      SCRIPT: L2_NeMo_2_Export_TRT_LLM

# --- hunk @@ -2138: register the new jobs in the aggregate `needs` list ---
#       - L2_NeMo_2_Export_In_Framework        (existing context line)
      - L2_ONNX_TRT_LLM_Embedding_Export
      - L2_NeMo_2_Export_Deploy_Query_In_Framework
      - L2_NeMo_2_Export_TRT_LLM

nemo/export/tensorrt_llm.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -865,7 +865,10 @@ def convert_to_safe_tensors(
865865
for weight_dict, model_config in zip(weights_dicts, model_configs):
866866
rank = model_config.mapping.tp_rank
867867
for k, v in weight_dict.items():
868-
weight_dict[k] = numpy_to_torch(v)
868+
if isinstance(v, np.ndarray):
869+
weight_dict[k] = numpy_to_torch(v)
870+
else:
871+
weight_dict[k] = v
869872

870873
safetensors.torch.save_file(weight_dict, os.path.join(self.model_dir, f'rank{rank}.safetensors'))
871874
model_configs[0].to_json_file(os.path.join(self.model_dir, 'config.json'))
@@ -874,7 +877,8 @@ def convert_to_safe_tensors(
874877
if os.path.exists(tokenizer_path):
875878
shutil.copy(tokenizer_path, self.model_dir)
876879
else:
877-
self.tokenizer.save_pretrained(self.model_dir)
880+
if self.tokenizer is not None:
881+
self.tokenizer.save_pretrained(self.model_dir)
878882

879883
nemo_model_config = os.path.join(nemo_export_dir, "model_config.yaml")
880884
if os.path.exists(nemo_model_config):

tests/deploy/test_deploy_query.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
16+
import numpy as np
17+
from pytriton.decorators import batch
18+
from pytriton.model_config import Tensor
19+
20+
from nemo.deploy import DeployPyTriton, ITritonDeployable
21+
from nemo.deploy.nlp import NemoQueryLLM
22+
from nemo.deploy.utils import cast_output, str_ndarray2list
23+
24+
25+
class MockModel(ITritonDeployable):
    """Minimal Triton-deployable stub that returns a canned reply for deploy/query tests."""

    @property
    def get_triton_input(self):
        """Triton input spec: prompts plus length/logit control tensors."""
        return (
            Tensor(name="prompts", shape=(-1,), dtype=bytes),
            Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True),
            Tensor(name="output_context_logits", shape=(-1,), dtype=np.bool_, optional=False),
            Tensor(name="output_generation_logits", shape=(-1,), dtype=np.bool_, optional=False),
        )

    @property
    def get_triton_output(self):
        """Triton output spec: a single byte-string tensor."""
        return (Tensor(name="outputs", shape=(-1,), dtype=bytes),)

    @batch
    def triton_infer_fn(self, **inputs: np.ndarray):
        """Consume the request tensors and answer with a fixed string.

        The decoded prompts and optional max_output_len are read (so the
        request is well-formed) but do not influence the canned response.
        """
        request = {"input_texts": str_ndarray2list(inputs.pop("prompts"))}
        if "max_output_len" in inputs:
            request["max_output_len"] = inputs.pop("max_output_len")[0][0]

        return {"outputs": cast_output("I am good, how about you?", np.bytes_)}
51+
52+
53+
def test_nemo_deploy_query():
    """Deploy MockModel behind PyTriton, query it via NemoQueryLLM, and verify the canned reply.

    The Triton server is always stopped, even if the query raises, so a failed
    run does not leave a server bound to the test ports.
    """
    model_name = "mock_model"
    model = MockModel()
    nm = DeployPyTriton(
        model=model,
        triton_model_name=model_name,
        max_batch_size=32,
        http_port=9002,
        grpc_port=8001,
        address="0.0.0.0",
        allow_grpc=True,
        allow_http=True,
        streaming=False,
    )
    nm.deploy()
    nm.run()

    try:
        nq = NemoQueryLLM(url="localhost:9002", model_name=model_name)
        output_deployed = nq.query_llm(
            prompts=["Hey, how is it going?"],
            max_output_len=20,
        )
    finally:
        # Guarantee server shutdown even when the query fails.
        nm.stop()

    assert output_deployed is not None, "Output cannot be none."
    # Fixed copy-paste assertion message: this checks the content, not None-ness.
    assert output_deployed == "I am good, how about you?", "Output does not match the expected canned reply."

tests/export/test_export_onnx.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import argparse
16+
import os
17+
18+
import tensorrt as trt
19+
20+
from nemo.collections.llm.gpt.model.hf_llama_embedding import get_llama_bidirectional_hf_model
21+
from nemo.export.onnx_llm_exporter import OnnxLLMExporter
22+
from nemo.utils import logging
23+
24+
25+
def get_args():
    """Parse command-line options for the ONNX/TensorRT embedding export test."""
    parser = argparse.ArgumentParser(description='Test ONNX and TensorRT export for LLM embedding models.')
    add = parser.add_argument  # local alias keeps the option table compact

    add('--hf_model_path', type=str, required=True, help="Hugging Face model id or path.")
    add('--pooling_strategy', type=str, default="avg", help="Pooling strategy for the model.")
    add("--normalize", default=False, action="store_true", help="Normalize the embeddings or not.")
    add('--onnx_export_path', type=str, default="/tmp/onnx_model/", help="Path to store ONNX model.")
    add('--onnx_opset', type=int, default=17, help="ONNX version to use for export.")
    add('--trt_model_path', type=str, default="/tmp/trt_model/", help="Path to store TensorRT model.")
    add(
        "--trt_version_compatible",
        default=False,
        action="store_true",
        help="Whether to generate version compatible TensorRT models.",
    )

    return parser.parse_args()
41+
42+
43+
def export_onnx_trt(args):
    """Export an HF Llama embedding model to ONNX, build a TensorRT engine, and smoke-test it.

    Args:
        args: Parsed CLI namespace from ``get_args()``.
    """
    # Base Llama model needs to be adapted to turn it into an embedding model.
    model, tokenizer = get_llama_bidirectional_hf_model(
        model_name_or_path=args.hf_model_path,
        normalize=args.normalize,
        pooling_mode=args.pooling_strategy,
        trust_remote_code=True,
    )

    input_names = ["input_ids", "attention_mask", "dimensions"]  # ONNX specific arguments, input names in this case.
    dynamic_axes_input = {
        "input_ids": {0: "batch_size", 1: "seq_length"},
        "attention_mask": {0: "batch_size", 1: "seq_length"},
        "dimensions": {0: "batch_size"},
    }

    output_names = ["embeddings"]  # ONNX specific arguments, output names in this case.
    dynamic_axes_output = {"embeddings": {0: "batch_size", 1: "embedding_dim"}}

    # Initialize ONNX exporter.
    onnx_exporter = OnnxLLMExporter(
        onnx_model_dir=args.onnx_export_path,
        model=model,
        tokenizer=tokenizer,
    )

    # Export ONNX model.
    onnx_exporter.export(
        input_names=input_names,
        output_names=output_names,
        opset=args.onnx_opset,
        dynamic_axes_input=dynamic_axes_input,
        dynamic_axes_output=dynamic_axes_output,
        export_dtype="fp32",
    )

    # TensorRT optimization profiles: [min, opt, max] shapes per input.
    input_profiles = [
        {
            "input_ids": [[1, 3], [16, 128], [64, 256]],
            "attention_mask": [[1, 3], [16, 128], [64, 256]],
            "dimensions": [[1], [16], [64]],
        }
    ]

    # TensorRT builder flags.
    trt_builder_flags = None
    if args.trt_version_compatible:
        trt_builder_flags = [trt.BuilderFlag.VERSION_COMPATIBLE]

    # Model specific layers to override the precision to fp32.
    override_layers_to_fp32 = [
        "/model/norm/",
        "/pooling_module",
        "/ReduceL2",
        "/Div",
    ]
    # Model specific operation: whether to override layernorm precision or not.
    override_layernorm_precision_to_fp32 = True
    profiling_verbosity = "layer_names_only"

    # Export ONNX to TensorRT.
    onnx_exporter.export_onnx_to_trt(
        trt_model_dir=args.trt_model_path,
        profiles=input_profiles,
        override_layernorm_precision_to_fp32=override_layernorm_precision_to_fp32,
        override_layers_to_fp32=override_layers_to_fp32,
        profiling_verbosity=profiling_verbosity,
        trt_builder_flags=trt_builder_flags,
    )

    assert os.path.exists(args.trt_model_path), "TensorRT model path was not created."
    assert os.path.exists(args.onnx_export_path), "ONNX export path was not created."

    # Smoke-test the exported model on a tiny two-item batch.
    prompt = ["hello", "world"]
    prompt = onnx_exporter.get_tokenizer(prompt)
    prompt["dimensions"] = [[2]]

    output = onnx_exporter.forward(prompt)
    if output is None:
        # Fixed: was an f-string with no placeholders (ruff F541).
        logging.warning("Output is None because ONNX runtime is not installed.")


if __name__ == '__main__':
    export_onnx_trt(get_args())
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import shutil
16+
17+
import pytest
18+
19+
20+
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
@pytest.mark.parametrize("tensor_parallelism_size,pipeline_parallelism_size", [(2, 1), (1, 2)])
def test_nemo2_convert_to_safe_tensors(tensor_parallelism_size, pipeline_parallelism_size):
    """
    Test safe tensor exporter. This tests the whole nemo export until engine building.
    """
    from pathlib import Path

    from nemo.export.tensorrt_llm import TensorRTLLM

    model_dir = Path("/tmp/safe_tensor_test/")
    trt_llm_exporter = TensorRTLLM(model_dir=str(model_dir))
    try:
        trt_llm_exporter.convert_to_safe_tensors(
            nemo_checkpoint_path="/home/TestData/llm/models/llama32_1b_nemo2",
            model_type="llama",
            delete_existing_files=True,
            tensor_parallelism_size=tensor_parallelism_size,
            pipeline_parallelism_size=pipeline_parallelism_size,
            gpus_per_node=2,
            use_parallel_embedding=False,
            use_embedding_sharing=False,
            dtype="bfloat16",
        )

        assert model_dir.exists(), "Safe tensors were not generated."
        assert (model_dir / "rank0.safetensors").exists(), "Safe tensors for rank0 were not generated."
        # rank1 only exists when weights are sharded across two TP ranks.
        if pipeline_parallelism_size == 1 and tensor_parallelism_size == 2:
            assert (model_dir / "rank1.safetensors").exists(), "Safe tensors for rank1 were not generated."
        # Fixed copy-paste message: the exporter writes config.json, not config.yaml.
        assert (model_dir / "config.json").exists(), "config.json was not generated."
    finally:
        # Clean up even when an assertion fails so /tmp does not accumulate state.
        shutil.rmtree(model_dir, ignore_errors=True)
51+
52+
53+
@pytest.mark.run_only_on('GPU')
@pytest.mark.unit
def test_nemo2_convert_to_export():
    """
    Test the full TRT-LLM export path: NeMo checkpoint -> engine build -> inference.
    """
    from pathlib import Path

    from nemo.export.tensorrt_llm import TensorRTLLM

    model_dir = "/tmp/safe_tensor_test_2/"
    trt_llm_exporter = TensorRTLLM(model_dir=model_dir)
    try:
        trt_llm_exporter.export(
            nemo_checkpoint_path="/home/TestData/llm/models/llama32_1b_nemo2",
            model_type="llama",
            delete_existing_files=True,
            tensor_parallelism_size=1,
            pipeline_parallelism_size=1,
            gpus_per_node=None,
            max_input_len=1024,
            max_output_len=256,
            max_batch_size=4,
            max_prompt_embedding_table_size=None,
            use_parallel_embedding=False,
            use_embedding_sharing=False,
            paged_kv_cache=True,
            remove_input_padding=True,
            paged_context_fmha=False,
            dtype=None,
            load_model=True,
            use_lora_plugin=None,
            lora_target_modules=None,
            max_lora_rank=64,
            max_num_tokens=None,
            opt_num_tokens=None,
            max_seq_len=512,
            multiple_profiles=False,
            gpt_attention_plugin="auto",
            gemm_plugin="auto",
            use_mcore_path=True,
            reduce_fusion=True,
            fp8_quantized=None,
            fp8_kvcache=None,
            gather_context_logits=True,
            gather_generation_logits=True,
            build_rank=None,
        )

        output = trt_llm_exporter.forward(
            input_texts=["Tell me the capitol of France "],
            max_output_len=16,
            top_k=1,
            top_p=0.0,
            temperature=0.1,
            stop_words_list=None,
            bad_words_list=None,
            no_repeat_ngram_size=None,
            task_ids=None,
            lora_uids=None,
            prompt_embeddings_table=None,
            prompt_embeddings_checkpoint_path=None,
            streaming=False,
            output_log_probs=False,
            output_context_logits=False,
            output_generation_logits=False,
        )

        print(output)

        # Fixed copy-paste messages: these assertions check the built TRT-LLM
        # engine artifacts, not safetensors (and config.json, not config.yaml).
        engine_dir = Path(model_dir) / "trtllm_engine"
        assert engine_dir.exists(), "TRT-LLM engine directory was not generated."
        assert (engine_dir / "rank0.engine").exists(), "TRT-LLM engine for rank0 was not generated."
        assert (engine_dir / "config.json").exists(), "config.json was not generated."
    finally:
        # Clean up even when the export or an assertion fails.
        shutil.rmtree(model_dir, ignore_errors=True)

0 commit comments

Comments
 (0)