6 changes: 6 additions & 0 deletions examples/nvidia_runtime/Dockerfile
@@ -0,0 +1,6 @@
FROM nvcr.io/nvidia/pytorch:25.04-py3

# Add the test to the entrypoint scripts. Docker Slim only traces/monitors processes started by the entrypoint.
RUN echo "pytest /opt/pytorch/pytorch/test/test_cuda.py::TestCuda::test_graph_cudnn_dropout" > /opt/nvidia/entrypoint.d/99-trace.sh
RUN chmod +x /opt/nvidia/entrypoint.d/99-trace.sh

19 changes: 19 additions & 0 deletions examples/nvidia_runtime/README.md
@@ -0,0 +1,19 @@

# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
As a prerequisite, install the NVIDIA Container Toolkit and register the nvidia runtime with Docker. You can then translate the usual OCI/Docker flags such as `--runtime=nvidia --gpus all` into the docker-slim equivalents `--cro-device-request '{"Count":-1, "Capabilities":[["gpu"]]}' --cro-runtime nvidia`.
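
A rough sketch of that prerequisite setup on a Debian/Ubuntu host, assuming the NVIDIA apt repository is already configured (the package name and the `nvidia-ctk` command follow NVIDIA's standard installation instructions; adjust for your distribution):

```bash
# Install the NVIDIA Container Toolkit and register the nvidia runtime with Docker
sudo apt-get install -y nvidia-container-toolkit
sudo nvidia-ctk runtime configure --runtime=docker
sudo systemctl restart docker

# Sanity check: the nvidia runtime should expose the GPUs inside a plain container
docker run --rm --runtime nvidia --gpus all ubuntu:24.04 nvidia-smi
```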

See `test_nvidia_smi.sh`, which slims ubuntu down to just the files needed to run the runtime-mounted nvidia-smi. Similarly, `test_nvidia_pytorch.sh` minimizes the nvidia-pytorch image down to what is needed to run a subset of the CUDA tests.
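
Assuming a GPU is available and the runtime is configured as above, both examples can be run directly from this directory:

```bash
cd examples/nvidia_runtime
bash test_nvidia_smi.sh        # slims ubuntu:24.04 down to what nvidia-smi needs
bash test_nvidia_pytorch.sh    # slims nvcr.io/nvidia/pytorch around one CUDA test
```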

97 changes: 97 additions & 0 deletions examples/nvidia_runtime/test_nvidia_pytorch.sh
@@ -0,0 +1,97 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Create host config file with ulimit settings and capabilities
cat > host-config.json <<'EOF'
{
  "IpcMode": "host",
  "CapAdd": ["SYS_ADMIN"],
  "Ulimits": [
    {
      "Name": "memlock",
      "Soft": -1,
      "Hard": -1
    },
    {
      "Name": "stack",
      "Soft": 67108864,
      "Hard": 67108864
    },
    {
      "Name": "nofile",
      "Soft": 1048576,
      "Hard": 1048576
    }
  ]
}
EOF
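
# Optional sanity check (not in the original script): fail fast if the host config is not
# valid JSON before handing it to docker-slim.
python3 -m json.tool host-config.json > /dev/null || { echo "host-config.json is not valid JSON"; exit 1; }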

# Build the custom image with the test in its entrypoint, then slim it.
# CAP_SYS_ADMIN is added via host-config.json for fanotify support (required for filesystem monitoring).
echo "Building custom test image with pytest in entrypoint..."
docker build -t nvcr.io/nvidia/pytorch:25.04-py3-test -f Dockerfile .

echo "Running docker-slim on the test image..."
docker-slim build \
    --target nvcr.io/nvidia/pytorch:25.04-py3-test \
    --tag nvcr.io/nvidia/pytorch:25.04-py3-slim \
    --cro-host-config-file host-config.json \
    --cro-shm-size 1200 \
    --cro-device-request '{"Count":-1, "Capabilities":[["gpu"]]}' \
    --cro-runtime nvidia \
    --http-probe=false \
    --continue-after 10 \
    --preserve-path /etc/ld.so.conf \
    --preserve-path /etc/ld.so.conf.d \
    .
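
# Optional (not part of the original flow): list the original and slimmed image sizes
# side by side to confirm the reduction; this is plain Docker CLI.
docker images nvcr.io/nvidia/pytorch --format '{{.Repository}}:{{.Tag}}  {{.Size}}'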

# Capture the output of the original and slim images in log files
echo "Running original image..."
docker run --rm --runtime nvidia --gpus all nvcr.io/nvidia/pytorch:25.04-py3-test > original_log.txt 2>&1
echo "Running slim image..."
docker run --rm --runtime nvidia --gpus all nvcr.io/nvidia/pytorch:25.04-py3-slim > slim_log.txt 2>&1

# Verify that both logs contain the pytest success message (ignoring timing)
echo "Checking test results..."

# Look for "X passed" pattern in both logs
original_passed=$(grep -oE "[0-9]+ passed" original_log.txt | head -1)
slim_passed=$(grep -oE "[0-9]+ passed" slim_log.txt | head -1)

if [ -z "$original_passed" ]; then
echo "Error: Original image test did not pass"
echo "Original log tail:"
tail -20 original_log.txt
exit 1
fi

if [ -z "$slim_passed" ]; then
echo "Error: Slim image test did not pass"
echo "Slim log tail:"
tail -20 slim_log.txt
exit 1
fi

echo "Original image: $original_passed"
echo "Slim image: $slim_passed"

if [ "$original_passed" = "$slim_passed" ]; then
echo "SUCCESS: Both images passed the same number of tests!"
else
echo "Warning: Different number of tests passed (original: $original_passed, slim: $slim_passed)"
fi

echo "Successfully minimized nvidia-pytorch to run a subset of the CUDA tests"
34 changes: 34 additions & 0 deletions examples/nvidia_runtime/test_nvidia_smi.sh
@@ -0,0 +1,34 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Build the slim image
docker-slim build \
    --target ubuntu:24.04 \
    --tag ubuntu:24.04-slim \
    --cro-shm-size 1200 \
    --cro-device-request '{"Count":-1, "Capabilities":[["gpu"]]}' \
    --cro-runtime nvidia \
    --http-probe=false \
    --exec "/usr/bin/nvidia-smi" \
    .

# Capture the output of the original and slim images in log files
docker run --rm --runtime nvidia --gpus all ubuntu:24.04 nvidia-smi > original_log.txt
docker run --rm --runtime nvidia --gpus all ubuntu:24.04-slim nvidia-smi > slim_log.txt

# Helper: assert that a log file contains the given string
assert_contains() {
    if ! grep -q "$1" "$2"; then
        echo "Error: '$1' not found in $2"
        exit 1
    fi
}

# verify that both logs include the nvidia-smi output with an assert
assert_contains "NVIDIA-SMI" original_log.txt
assert_contains "NVIDIA-SMI" slim_log.txt
