Skip to content

Commit bab681f

Browse files
committed
Add OSS GPU tests
ghstack-source-id: 093cf50 Pull Request resolved: #231
1 parent 932450a commit bab681f

File tree

4 files changed

+136
-20
lines changed

4 files changed

+136
-20
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
#!/usr/bin/env bash
#
# Install the NVIDIA driver and the nvidia-docker2 runtime on the current host.

# Abort on any failing command, unset variable, or failing pipeline stage.
set -euo pipefail

# Distribution tag: ID and VERSION_ID from os-release, concatenated
# (presumably "amzn2" on Amazon Linux 2 -- confirm on the target runner).
DISTRIBUTION=$(
    . /etc/os-release
    echo $ID$VERSION_ID
)

# Driver release this script expects to find or install.
DRIVER_VERSION="515.57"

# File name of the installer for DRIVER_VERSION.
DRIVER_FN="NVIDIA-Linux-x86_64-${DRIVER_VERSION}.run"

# Yum repo definition for nvidia-docker on this distribution.
YUM_REPO_URL="https://nvidia.github.io/nvidia-docker/${DISTRIBUTION}/nvidia-docker.repo"
10+
11+
# Add the NVIDIA yum repository (YUM_REPO_URL), install nvidia-docker2 from
# it, and restart the docker service.
#
# The function body is a subshell, so the `set -x` tracing enabled here does
# not leak into the caller.
install_nvidia_docker2_amzn2() (
    set -x

    # yum-utils is what provides yum-config-manager
    sudo yum install -y yum-utils

    sudo yum-config-manager --add-repo "${YUM_REPO_URL}"
    sudo yum install -y nvidia-docker2
    sudo systemctl restart docker
)
21+
22+
# Ensure NVIDIA driver ${DRIVER_VERSION} is installed and nvidia-smi works.
#
# Reads DRIVER_VERSION and DRIVER_FN from the enclosing script. The download
# and install are skipped when nvidia-smi already reports the expected
# version. Everything runs in a subshell, so `set -x` and the variables
# assigned here do not leak into the caller; the function's status is the
# subshell's status (non-zero when the install or the final nvidia-smi
# check fails).
install_nvidia_driver_amzn2() {
    (
        set -x

        # Purge any nvidia driver installed from RHEL repo
        sudo yum remove -y nvidia-driver-latest-dkms

        HAS_NVIDIA_DRIVER=0
        # Check if NVIDIA driver has already been installed
        if [ -x "$(command -v nvidia-smi)" ]; then
            # The driver exists, check its version next.
            #
            # --id=0 restricts the query to the first GPU: without it
            # nvidia-smi prints one line per GPU, and on multi-GPU hosts the
            # multi-line result never equals DRIVER_VERSION.
            #
            # The assignment is tested by `if` so that a failing nvidia-smi
            # (e.g. a broken existing driver) does not trip errexit and abort
            # the script; we fall through to a reinstall instead.
            if ! INSTALLED_DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader --id=0); then
                echo "Failed to query the installed NVIDIA driver version ($INSTALLED_DRIVER_VERSION). Continuing"
            elif [ "$INSTALLED_DRIVER_VERSION" != "$DRIVER_VERSION" ]; then
                echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has been installed, but we expect to have $DRIVER_VERSION instead. Continuing"
            else
                HAS_NVIDIA_DRIVER=1
                echo "NVIDIA driver ($INSTALLED_DRIVER_VERSION) has already been installed. Skipping NVIDIA driver installation"
            fi
        fi

        if [ "$HAS_NVIDIA_DRIVER" -eq 0 ]; then
            sudo yum groupinstall -y "Development Tools"
            # ensure our kernel install is the same as our underlying kernel,
            # groupinstall "Development Tools" has a habit of mismatching kernel headers
            sudo yum install -y "kernel-devel-uname-r == $(uname -r)"
            sudo modprobe backlight
            sudo curl -fsL -o /tmp/nvidia_driver "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN"
            # On installer failure, dump its log for debugging, then still fail.
            sudo /bin/bash /tmp/nvidia_driver -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false)
            sudo rm -fv /tmp/nvidia_driver
        fi

        (
            # errexit is disabled in this inner subshell so the nvidia-smi
            # exit status can be inspected instead of aborting immediately.
            set +e
            nvidia-smi
            status=$?
            # Allowable exit statuses for nvidia-smi, see: https://github.com/NVIDIA/gpu-operator/issues/285
            if [ "$status" -eq 0 ] || [ "$status" -eq 14 ]; then
                echo "INFO: Ignoring allowed status ${status}"
            else
                echo "ERROR: nvidia-smi exited with unresolved status ${status}"
                exit "${status}"
            fi
        )
    )
}
68+
69+
# Report that no installer exists for the detected distribution and abort
# the script with status 1.
unsupported_distribution() {
    echo "ERROR: Unknown distribution ${DISTRIBUTION}"
    exit 1
}

# Step 1: install the driver.
echo "== Installing nvidia driver ${DRIVER_FN} =="
if [[ "${DISTRIBUTION}" == amzn* ]]; then
    install_nvidia_driver_amzn2
else
    unsupported_distribution
fi

# Step 2: install container toolkit based on distribution.
echo "== Installing nvidia container toolkit for ${DISTRIBUTION} =="
if [[ "${DISTRIBUTION}" == amzn* ]]; then
    install_nvidia_docker2_amzn2
else
    unsupported_distribution
fi

.github/workflows/runtime_tests.yaml

+17-3
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,27 @@ jobs:
1313
matrix:
1414
python-major-version: [3]
1515
python-minor-version: [7,8,9,10]
16-
platform: [ubuntu-18.04]
16+
platform: [linux.4xlarge.nvidia.gpu]
1717
fail-fast: false
1818
runs-on: ${{ matrix.platform }}
1919
steps:
2020
- name: Checkout MultiPy
2121
uses: actions/checkout@v2
2222
with:
2323
submodules: true
24-
24+
- name: Clean up previous CUDA driver installations
25+
shell: bash
26+
run: |
27+
set -x
28+
yum list installed | grep nvidia || true
29+
yum list installed | grep cuda || true
30+
sudo yum remove -y cuda || true
31+
sudo yum remove -y cuda-drivers || true
32+
sudo yum remove -y "*nvidia*" || true
33+
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
34+
run: |
35+
bash .github/scripts/install_nvidia_utils_linux.sh || true
36+
echo "GPU_FLAG=--gpus all" >> "${GITHUB_ENV}"
2537
- name: Setup SSH (Click me for login details)
2638
uses: ./.github/actions/setup-ssh
2739
with:
@@ -30,11 +42,13 @@ jobs:
3042
- name: Build
3143
env:
3244
DOCKER_BUILDKIT: 1
33-
run: docker build -t multipy --progress=plain --build-arg PYTHON_MAJOR_VERSION=${{ matrix.python-major-version }} --build-arg PYTHON_MINOR_VERSION=${{ matrix.python-minor-version }} .
45+
run: nvidia-docker build -t multipy --progress=plain --build-arg PYTHON_MAJOR_VERSION=${{ matrix.python-major-version }} --build-arg PYTHON_MINOR_VERSION=${{ matrix.python-minor-version }} --build-arg BUILD_CUDA_TESTS=1 .
3446

3547
- name: Test
3648
run: |
3749
docker run --rm multipy bash -c "if [[ ${{ matrix.python-minor-version }} -lt 8 ]]; then source ~/venvs/multipy/bin/activate; fi && multipy/runtime/build/test_deploy"
50+
nvidia-docker run --rm multipy bash -c "if [[ ${{ matrix.python-minor-version }} -lt 8 ]]; then source ~/venvs/multipy/bin/activate; fi && multipy/runtime/build/test_deploy_gpu"
51+
3852
3953
- name: Examples
4054
run: |

Dockerfile

+14-13
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
ARG BASE_IMAGE=nvidia/cuda:11.3.1-devel-ubuntu18.04
1+
ARG BASE_IMAGE=nvidia/cuda:11.6.1-devel-ubuntu18.04
22

33
FROM ${BASE_IMAGE} as dev-base
44

@@ -59,13 +59,17 @@ COPY .git .git
5959
COPY .gitmodules .gitmodules
6060
COPY multipy multipy
6161
COPY compat-requirements.txt compat-requirements.txt
62+
COPY setup.py setup.py
63+
COPY README.md README.md
64+
COPY dev-requirements.txt dev-requirements.txt
6265

6366
RUN git submodule update --init --recursive --jobs 0
6467

6568
# Install conda/pyenv + necessary python dependencies
6669
FROM dev-base as conda-pyenv
6770
ARG PYTHON_MAJOR_VERSION=3
6871
ARG PYTHON_MINOR_VERSION=8
72+
ARG BUILD_CUDA_TESTS=0
6973
ENV PYTHON_MINOR_VERSION=${PYTHON_MINOR_VERSION}
7074
ENV PYTHON_VERSION=${PYTHON_MAJOR_VERSION}.${PYTHON_MINOR_VERSION}
7175
RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
@@ -75,7 +79,7 @@ RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
7579
rm ~/miniconda.sh && \
7680
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} mkl mkl-include conda-build pyyaml numpy ipython && \
7781
/opt/conda/bin/conda install -y -c conda-forge libpython-static=${PYTHON_VERSION} && \
78-
/opt/conda/bin/conda install -y pytorch torchvision torchaudio cudatoolkit=11.3 -c pytorch-nightly && \
82+
/opt/conda/bin/conda install -y pytorch torchvision torchaudio pytorch-cuda=11.6 -c pytorch-nightly -c nvidia && \
7983
/opt/conda/bin/conda clean -ya; \
8084
else \
8185
pip3 install virtualenv && \
@@ -84,29 +88,26 @@ RUN if [[ ${PYTHON_MINOR_VERSION} -gt 7 ]]; then \
8488
~/.pyenv/bin/pyenv install --force 3.7.10 && \
8589
virtualenv -p ~/.pyenv/versions/3.7.10/bin/python3 ~/venvs/multipy && \
8690
source ~/venvs/multipy/bin/activate && \
87-
pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu113; \
91+
pip3 install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cu116; \
8892
fi
8993

90-
# Build/Install pytorch with post-cxx11 ABI
9194
FROM conda-pyenv as build
92-
WORKDIR /opt/multipy/multipy/runtime/third-party/pytorch
9395
COPY --from=conda-pyenv /opt/conda* /opt/conda
9496
COPY --from=submodule-update /opt/multipy /opt/multipy
9597

9698
WORKDIR /opt/multipy
9799

98100
# Build Multipy
99-
RUN rm -r multipy/runtime/build; mkdir multipy/runtime/build && \
100-
cd multipy/runtime/build && \
101+
RUN ls && pwd && rm -rf multipy/runtime/build && \
101102
if [[ ${PYTHON_MINOR_VERSION} -lt 8 ]]; then \
102-
source ~/venvs/multipy/bin/activate && \
103-
cmake -DLEGACY_PYTHON_PRE_3_8=ON ..; \
103+
source ~/venvs/multipy/bin/activate; \
104+
fi && \
105+
if [[ ${BUILD_CUDA_TESTS} -eq 1 ]]; then \
106+
python -m pip install -e . --install-option="--cudatests"; \
104107
else \
105-
cmake -DLEGACY_PYTHON_PRE_3_8=OFF ..; \
108+
python -m pip install -e .; \
106109
fi && \
107-
cmake --build . --config Release -j && \
108-
cmake --install . --prefix "." && \
109-
cd ../example && python generate_examples.py
110+
python multipy/runtime/example/generate_examples.py
110111

111112
# Build examples
112113
COPY examples examples

setup.py

+15-4
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,11 @@ def get_cmake_version():
2828

2929

3030
class MultipyRuntimeCmake(object):
31-
user_options = [("cmakeoff", None, None), ("abicxx", None, None)]
31+
user_options = [
32+
("cmakeoff", None, None),
33+
("cudatests", None, None),
34+
("abicxx", None, None),
35+
]
3236

3337

3438
class MultipyRuntimeDevelop(MultipyRuntimeCmake, develop):
@@ -41,24 +45,29 @@ def initialize_options(self):
4145
# TODO(tristanr): remove once unused
4246
self.abicxx = None
4347

48+
self.cudatests = None
49+
4450
def finalize_options(self):
4551
develop.finalize_options(self)
4652
if self.cmakeoff is not None:
4753
self.distribution.get_command_obj("build_ext").cmake_off = True
54+
if self.cudatests is not None:
55+
self.distribution.get_command_obj("build_ext").cuda_tests_flag = "ON"
4856

4957

5058
class MultipyRuntimeBuild(MultipyRuntimeCmake, build_ext):
5159
user_options = build_ext.user_options + MultipyRuntimeCmake.user_options
5260
cmake_off = False
61+
cuda_tests_flag = "OFF"
5362

5463
def run(self):
5564
if self.cmake_off:
5665
return
5766
try:
5867
cmake_version_comps = get_cmake_version().split(".")
59-
if cmake_version_comps[0] < "3" or cmake_version_comps[1] < "19":
68+
if cmake_version_comps[0] < "3" or cmake_version_comps[1] < "12":
6069
raise RuntimeError(
61-
"CMake 3.19 or later required for multipy runtime installation."
70+
"CMake 3.12 or later required for multipy runtime installation."
6271
)
6372
except OSError:
6473
raise RuntimeError(
@@ -74,7 +83,9 @@ def run(self):
7483
print(f"-- Running multipy runtime makefile in dir {build_dir_abs}")
7584
try:
7685
subprocess.run(
77-
[f"cmake -DLEGACY_PYTHON_PRE_3_8={legacy_python_cmake_flag} .."],
86+
[
87+
f"cmake -DBUILD_CUDA_TESTS={self.cuda_tests_flag} -DLEGACY_PYTHON_PRE_3_8={legacy_python_cmake_flag} .."
88+
],
7889
cwd=build_dir_abs,
7990
shell=True,
8091
check=True,

0 commit comments

Comments
 (0)