Skip to content

Commit

Permalink
fix #639 provide NCCL tests example
Browse files Browse the repository at this point in the history
Signed-off-by: Sam Stoelinga <[email protected]>
  • Loading branch information
samos123 committed May 1, 2024
1 parent 52cda2c commit 6dcc0f7
Show file tree
Hide file tree
Showing 2 changed files with 128 additions and 0 deletions.
60 changes: 60 additions & 0 deletions examples/v2beta1/nccl-tests/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Full CUDA release string used to select the base image tag.
ARG CUDA_VERSION_MINOR=12.4.1
ARG BASE_IMAGE=nvidia/cuda:${CUDA_VERSION_MINOR}-devel-ubuntu22.04
FROM ${BASE_IMAGE} AS base

# CUDA major.minor and NCCL release; together they form the apt version pin
# of the NCCL packages (<TARGET_NCCL_VERSION>+cuda<CUDA_VERSION_MAJOR>).
ARG CUDA_VERSION_MAJOR=12.4
ARG TARGET_NCCL_VERSION=2.21.5-1

# Declared as ARG (not ENV) so it only applies while building the image.
ARG DEBIAN_FRONTEND=noninteractive
# Build toolchain, Open MPI, SSH server, diagnostics and the pinned NCCL
# packages. --allow-change-held-packages / --allow-downgrades let apt move
# libnccl2/libnccl-dev to the pinned version even if another version is
# already installed or held (NOTE(review): presumably the case in the CUDA
# base image - confirm).
RUN apt-get -qq update \
    && apt-get -qq install -y \
        --allow-change-held-packages \
        --allow-downgrades \
        --no-install-recommends \
        autoconf \
        automake \
        autotools-dev \
        build-essential \
        ca-certificates \
        curl \
        datacenter-gpu-manager \
        environment-modules \
        g++ \
        git \
        iputils-ping \
        libnccl-dev=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
        libnccl2=${TARGET_NCCL_VERSION}+cuda${CUDA_VERSION_MAJOR} \
        libnuma1 \
        libopenmpi-dev \
        libpci-dev \
        libpmix-dev \
        libsubunit0 \
        libtool \
        net-tools \
        openmpi-bin \
        openssh-server \
        unzip \
        vim \
        wget \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# RDMA / InfiniBand userspace libraries and diagnostic tools.
# Each package is listed exactly once (ibverbs-utils was previously repeated).
RUN apt-get -qq update \
    && apt-get -qq install -y --no-install-recommends \
        ibverbs-utils \
        infiniband-diags \
        libibumad-dev \
        libibumad3 \
        libibverbs-dev \
        librdmacm-dev \
        rdmacm-utils \
    && rm -rf /var/lib/apt/lists/*

# Google Cloud SDK from Google's apt repository.
# The repository key is stored in a dedicated keyring referenced via
# "signed-by" instead of the deprecated `apt-key add`, so it is trusted for
# this repository only. The key is downloaded to a file with `curl -f` (not
# piped) so that a failed download aborts the build instead of being masked
# by the exit status of the last command in a pipeline.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        apt-transport-https \
        ca-certificates \
        curl \
        gnupg \
    && curl -fsSL -o /tmp/google-cloud-apt-key.asc \
        https://packages.cloud.google.com/apt/doc/apt-key.gpg \
    && gpg --dearmor -o /usr/share/keyrings/cloud.google.gpg /tmp/google-cloud-apt-key.asc \
    && rm -f /tmp/google-cloud-apt-key.asc \
    && echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" \
        > /etc/apt/sources.list.d/google-cloud-sdk.list \
    && apt-get update \
    && apt-get install -y google-cloud-sdk \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*



# NCCL Tests
# Source revision of NVIDIA/nccl-tests to build, and the nvcc code-generation
# flags (sm_90 machine code plus compute_90 PTX) passed as NVCC_GENCODE.
ENV NCCL_TESTS_COMMITISH=c6afef0
ENV CUDA12_GENCODE='-gencode=arch=compute_90,code=sm_90'
ENV CUDA12_PTX='-gencode=arch=compute_90,code=compute_90'
WORKDIR /opt/nccl-tests
# The tarball is downloaded to a file before extraction so that a failed
# download fails this step; in a `wget | tar` pipeline only tar's exit status
# would be checked. `mpicc -show` prints the wrapper's underlying compiler
# command into the build log. The parallelism follows the builder's CPU count
# instead of a fixed -j20. The /opt/nccl_tests symlink provides an
# underscore-spelled alias for the build directory.
RUN wget -q -O /tmp/nccl-tests.tar.gz \
        https://github.com/NVIDIA/nccl-tests/archive/${NCCL_TESTS_COMMITISH}.tar.gz \
    && tar --strip-components=1 -xzf /tmp/nccl-tests.tar.gz \
    && rm -f /tmp/nccl-tests.tar.gz \
    && mpicc -show \
    && export CXX=mpic++ \
    && make -j"$(nproc)" MPI=1 MPI_HOME=/usr/include/openmpi NVCC_GENCODE="$CUDA12_GENCODE $CUDA12_PTX" \
    && ln -s /opt/nccl-tests /opt/nccl_tests

# Refresh the dynamic linker cache after the library installs above.
RUN ldconfig

# SSH dependencies for MPI
# Client (ssh_config): turn off strict host key checking, do not persist
# known hosts, and use port 2222.
# Server (sshd_config): turn off StrictModes and listen on port 2222.
# /var/run/sshd is created for the SSH daemon.
RUN sed -i \
        -e 's/[ #]\(.*StrictHostKeyChecking \).*/ \1no/g' \
        -e 's/[ #]\(.*Port \).*/ \12222/g' \
        /etc/ssh/ssh_config \
    && echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config \
    && sed -i \
        -e 's/#\(StrictModes \).*/\1no/g' \
        -e 's/#\(Port \).*/\12222/g' \
        /etc/ssh/sshd_config \
    && mkdir -p /var/run/sshd
68 changes: 68 additions & 0 deletions examples/v2beta1/nccl-tests/nccl-tests.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# MPIJob that runs the NCCL all_reduce_perf benchmark across the workers.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: nccl-tests
spec:
  # One MPI slot per GPU requested by each worker (see the Worker limits).
  slotsPerWorker: 8
  runPolicy:
    cleanPodPolicy: Running
    activeDeadlineSeconds: 666
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          restartPolicy: OnFailure
          containers:
          - image: mpioperator/nccl-tests:latest
            name: nccl
            securityContext:
              privileged: true
            env:
            - name: OMPI_ALLOW_RUN_AS_ROOT
              value: "1"
            - name: OMPI_ALLOW_RUN_AS_ROOT_CONFIRM
              value: "1"
            - name: OMPI_MCA_orte_base_help_aggregate
              value: "0"
            command: ["/bin/bash", "-c"]
            args:
            - |
              set -xe
              export NCCL_DEBUG=INFO
              # Total MPI ranks = Worker replicas (2) x slotsPerWorker (8).
              # Keep in sync with those fields when resizing the job.
              NP=16
              # Retry until every rank can be launched and run nvidia-smi.
              until mpirun -np ${NP} -x LD_LIBRARY_PATH -bind-to none /usr/local/nvidia/bin/nvidia-smi; do sleep 5; done
              mpirun -np ${NP} -bind-to none \
                -x NCCL_DEBUG \
                /opt/nccl_tests/build/all_reduce_perf -c 0 -b 8 -e 16G \
                -f 4 -g 1 -n 10
            resources:
              requests:
                cpu: 50m
                memory: 128Mi
          enableServiceLinks: false
          automountServiceAccountToken: false
    Worker:
      replicas: 2
      template:
        metadata:
          annotations:
        spec:
          volumes:
          # Memory-backed emptyDir mounted at /dev/shm in the worker container.
          - name: shared-memory
            emptyDir:
              medium: "Memory"

          containers:
          - image: mpioperator/nccl-tests:latest
            name: nccl
            securityContext:
              privileged: true
            resources:
              limits:
                nvidia.com/gpu: 8
            volumeMounts:
            - name: shared-memory
              mountPath: /dev/shm

          enableServiceLinks: false
          automountServiceAccountToken: false

0 comments on commit 6dcc0f7

Please sign in to comment.