Skip to content

Commit 7e2df12

Browse files
simonpintarelli, Mathieu Taillefumier
authored and
Mathieu Taillefumier
committed
GH200 CI (#148)
* sync spack recipes from spack@develop * ci on gh200 * remove handwritten compilers.yaml
1 parent 09fc581 commit 7e2df12

File tree

6 files changed

+237
-18
lines changed

6 files changed

+237
-18
lines changed

ci/baseimage.cuda.Dockerfile

Lines changed: 6 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
FROM ubuntu:22.04 as builder
22

3-
ARG CUDA_ARCH=60
3+
ARG CUDA_ARCH=90
44

55
ENV DEBIAN_FRONTEND noninteractive
66

@@ -10,7 +10,7 @@ ENV PATH="/spack/bin:${PATH}"
1010

1111
ENV MPICH_VERSION=3.4.3
1212

13-
ENV CMAKE_VERSION=3.27.9
13+
ENV CMAKE_VERSION=3.30.3
1414

1515
RUN apt-get -y update
1616

@@ -23,11 +23,12 @@ RUN apt-get install -y --no-install-recommends gcc g++ gfortran clang libomp-14-
2323
liblzma-dev libbz2-dev
2424

2525
# install CMake
26-
RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz -O cmake.tar.gz && \
26+
RUN wget https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-aarch64.tar.gz -O cmake.tar.gz && \
2727
tar zxvf cmake.tar.gz --strip-components=1 -C /usr
2828

29+
#
2930
# get a pinned release of spack (the tag is selected with `git clone -b`)
30-
RUN git clone -b v0.21.0 https://github.com/spack/spack.git
31+
RUN git clone -b v0.23.0 https://github.com/spack/spack.git
3132

3233
# set the location of packages built by spack
3334
RUN spack config add config:install_tree:root:/opt/local
@@ -45,13 +46,7 @@ RUN spack external find --all --exclude python
4546
RUN spack compiler find
4647

4748
# install yq (utility to manipulate the yaml files)
48-
RUN wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_386 && chmod a+x /usr/local/bin/yq
49-
50-
# change the fortran compilers: for gcc the gfortran is already properly set and the change has no effect; add it for clang
51-
RUN yq -i '.compilers[0].compiler.paths.f77 = "/usr/bin/gfortran"' /root/.spack/linux/compilers.yaml && \
52-
yq -i '.compilers[0].compiler.paths.fc = "/usr/bin/gfortran"' /root/.spack/linux/compilers.yaml && \
53-
yq -i '.compilers[1].compiler.paths.f77 = "/usr/bin/gfortran"' /root/.spack/linux/compilers.yaml && \
54-
yq -i '.compilers[1].compiler.paths.fc = "/usr/bin/gfortran"' /root/.spack/linux/compilers.yaml
49+
RUN wget -qO /usr/local/bin/yq https://github.com/mikefarah/yq/releases/latest/download/yq_linux_arm64 && chmod a+x /usr/local/bin/yq
5550

5651
# install MPICH
5752
RUN spack install mpich@${MPICH_VERSION} %gcc

ci/daint-alps.yml

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
include:
2+
- remote: 'https://gitlab.com/cscs-ci/recipes/-/raw/master/templates/v2/.ci-ext.yml'
3+
4+
stages:
5+
- baseimage
6+
- build
7+
- test
8+
9+
build base image:
10+
extends: [.dynamic-image-name, .container-builder-cscs-gh200]
11+
stage: baseimage
12+
timeout: 2h
13+
variables:
14+
SLURM_RESERVATION: 'NCCL'
15+
DOCKERFILE: ci/baseimage.cuda.Dockerfile
16+
WATCH_FILECHANGES: ci/baseimage.cuda.Dockerfile
17+
PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/base/cosma-ci
18+
19+
build tiled-mm:
20+
extends: .container-builder-cscs-gh200
21+
needs: ["build base image"]
22+
stage: build
23+
variables:
24+
SLURM_RESERVATION: 'NCCL'
25+
DOCKERFILE: ci/build.Dockerfile
26+
PERSIST_IMAGE_NAME: $CSCS_REGISTRY_PATH/cosma/cosma-ci:$CI_COMMIT_SHA
27+
ENVPATH: "/cosma-env-cuda"
28+
DOCKER_BUILD_ARGS: '["BASE_IMAGE=${BASE_IMAGE}", "ENVPATH=$ENVPATH"]'
29+
30+
.run_tests:
31+
extends: [.container-runner-todi-gh200]
32+
needs: ["build tiled-mm"]
33+
stage: test
34+
image: $CSCS_REGISTRY_PATH/cosma/cosma-ci:$CI_COMMIT_SHA
35+
variables:
36+
GIT_STRATEGY: none
37+
MPICH_MAX_THREAD_SAFETY: multiple
38+
CSCS_REGISTRY_LOGIN: 'YES'
39+
PULL_IMAGE: 'YES'
40+
SLURM_HINT: nomultithread
41+
SLURM_UNBUFFEREDIO: ''
42+
SLURM_RESERVATION: 'NCCL'
43+
SLURM_CPU_BIND: 'socket'
44+
SLURM_MPI: "pmi2"
45+
CRAY_CUDA_MPS: 'YES'
46+
# Workaround after update until hooks are fixed
47+
ENROOT_LIBRARY_PATH: /capstor/scratch/cscs/fmohamed/enrootlibn
48+
# SLURM_WAIT: 0
49+
COSMA_GPU_MAX_TILE_K: 100
50+
COSMA_GPU_MAX_TILE_M: 100
51+
COSMA_GPU_MAX_TILE_N: 100
52+
53+
mapper:
54+
extends: .run_tests
55+
stage: test
56+
script: /cosma-env-cuda/.spack-env/view/bin/test.mapper
57+
variables:
58+
SLURM_JOB_NUM_NODES: 1
59+
SLURM_NTASKS: 1
60+
USE_MPI: 'YES'
61+
62+
pdgemm:
63+
extends: .run_tests
64+
stage: test
65+
script: /cosma-env-cuda/.spack-env/view/bin/test.pdgemm
66+
variables:
67+
SLURM_JOB_NUM_NODES: 2
68+
SLURM_NTASKS: 16
69+
USE_MPI: 'YES'
70+
71+
multiply:
72+
extends: .run_tests
73+
stage: test
74+
script: /cosma-env-cuda/.spack-env/view/bin/test.multiply
75+
variables:
76+
SLURM_JOB_NUM_NODES: 2
77+
SLURM_NTASKS: 16
78+
USE_MPI: 'YES'
79+
80+
scalar_matmul:
81+
extends: .run_tests
82+
stage: test
83+
script: /cosma-env-cuda/.spack-env/view/bin/test.scalar_matmul
84+
variables:
85+
SLURM_JOB_NUM_NODES: 1
86+
SLURM_NTASKS: 8
87+
USE_MPI: 'YES'
88+
89+
multiply_using_layout:
90+
extends: .run_tests
91+
stage: test
92+
script: /cosma-env-cuda/.spack-env/view/bin/test.multiply_using_layout
93+
variables:
94+
SLURM_JOB_NUM_NODES: 1
95+
SLURM_NTASKS: 4

ci/mps-wrapper.sh

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/bin/bash
2+
# Example mps-wrapper.sh usage:
3+
# > srun --cpu-bind=socket [...] mps-wrapper.sh <cmd>
4+
5+
export CUDA_MPS_PIPE_DIRECTORY=/tmp/nvidia-mps
6+
export CUDA_MPS_LOG_DIRECTORY=/tmp/nvidia-log
7+
# Launch MPS from a single rank per node
8+
if [ $SLURM_LOCALID -eq 0 ]; then
9+
CUDA_VISIBLE_DEVICES=0,1,2,3 nvidia-cuda-mps-control -d
10+
fi
11+
12+
# Select the CUDA device(s): use the physical NUMA node index(es) that intersect this process's CPU affinity mask
13+
numa_nodes=$(hwloc-calc --physical --intersect NUMAnode $(taskset -p $$ | awk '{print "0x"$6}'))
14+
export CUDA_VISIBLE_DEVICES=$numa_nodes
15+
# Run the command
16+
exec numactl --membind=$numa_nodes "$@"

spack/packages/cosma/fj-ssl2.patch

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
diff --git a/CMakeLists.txt b/CMakeLists.txt
2+
index 1fd1e55..41a041b 100644
3+
--- a/CMakeLists.txt
4+
+++ b/CMakeLists.txt
5+
@@ -19,7 +19,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS "YES") # always write compile_commands.json
6+
7+
set(COSMA_GPU_BACKENDS_LIST "CUDA" "ROCM")
8+
set(COSMA_SCALAPACK_LIST "OFF" "MKL" "CRAY_LIBSCI" "CUSTOM")
9+
-set(COSMA_BLAS_LIST "auto" "MKL" "OPENBLAS" "CRAY_LIBSCI" "CUSTOM" "BLIS" "ATLAS" "CUDA" "ROCM" "OFF")
10+
+set(COSMA_BLAS_LIST "auto" "MKL" "SSL2" "OPENBLAS" "CRAY_LIBSCI" "CUSTOM" "BLIS" "ATLAS" "CUDA" "ROCM" "OFF")
11+
option(COSMA_WITH_TESTS "Generate the test target." ON)
12+
option(COSMA_WITH_APPS "Generate the miniapp targets." ON)
13+
option(COSMA_WITH_BENCHMARKS "Generate the benchmark targets." ON)
14+
@@ -45,7 +45,7 @@ if (COSMA_BLAS MATCHES "CUDA|ROCM")
15+
set(COSMA_GPU_BACKEND ${COSMA_BLAS})
16+
else()
17+
if(COSMA_BLAS STREQUAL "OFF")
18+
- message(FATAL_ERROR "A Blas implementation is needed when running on CPU only: choices are : auto, MKL, OPENBLAS, CRAY_LIBSCI, CUSTOM, BLIS, ATLAS, FLEXIBLAS, ARMPL, GenericBLAS")
19+
+ message(FATAL_ERROR "A Blas implementation is needed when running on CPU only: choices are : auto, MKL, SSL2, OPENBLAS, CRAY_LIBSCI, CUSTOM, BLIS, ATLAS, FLEXIBLAS, ARMPL, GenericBLAS")
20+
else()
21+
set(COSMA_BLAS_VENDOR ${COSMA_BLAS})
22+
endif()
23+
@@ -190,6 +190,7 @@ install(FILES "${cosma_BINARY_DIR}/cosmaConfig.cmake"
24+
"${cosma_BINARY_DIR}/cosmaConfigVersion.cmake"
25+
"${cosma_BINARY_DIR}/cosmaConfigVersion.cmake"
26+
"${cosma_SOURCE_DIR}/cmake/FindMKL.cmake"
27+
+ "${cosma_SOURCE_DIR}/cmake/FindSSL2.cmake"
28+
"${cosma_SOURCE_DIR}/cmake/FindBlas.cmake"
29+
"${cosma_SOURCE_DIR}/cmake/FindSCALAPACK.cmake"
30+
"${cosma_SOURCE_DIR}/cmake/FindOPENBLAS.cmake"
31+
diff --git a/cmake/FindBlas.cmake b/cmake/FindBlas.cmake
32+
index aef956c..3c47561 100644
33+
--- a/cmake/FindBlas.cmake
34+
+++ b/cmake/FindBlas.cmake
35+
@@ -14,6 +14,7 @@ endif()
36+
set(COSMA_BLAS_VENDOR_LIST
37+
"auto"
38+
"MKL"
39+
+ "SSL2"
40+
"OPENBLAS"
41+
"FLEXIBLAS"
42+
"ARMPL"
43+
diff --git a/cmake/FindSSL2.cmake b/cmake/FindSSL2.cmake
44+
new file mode 100644
45+
index 0000000..f0e11bf
46+
--- /dev/null
47+
+++ b/cmake/FindSSL2.cmake
48+
@@ -0,0 +1,56 @@
49+
+#.rst:
50+
+# FindSSL2
51+
+# -----------
52+
+#
53+
+# This module tries to find the SSL2 library.
54+
+#
55+
+# The following variables are set
56+
+#
57+
+# ::
58+
+#
59+
+# SSL2_FOUND - True if ssl2 is found
60+
+# COSMA_SSL2_LINK_LIBRARIES - The required libraries
61+
+# COSMA_SSL2_INCLUDE_DIRS - The required include directory
62+
+#
63+
+# The following import target is created
64+
+#
65+
+# ::
66+
+#
67+
+# cosma::BLAS::SSL2::ssl2 (also aliased as cosma::BLAS::SSL2::blas)
68+
+
69+
+# Set the paths to search for the library from the ROOT variables. If the new policy (CMP0074) is set, find_library() uses them automatically.
70+
+# if(NOT POLICY CMP0074)
71+
+set(_SSL2_PATHS ${SSL2_ROOT}
72+
+ $ENV{SSL2_ROOT}
73+
+ $ENV{SSL2ROOT}
74+
+ $ENV{SSL2_DIR}
75+
+ $ENV{SSL2DIR})
76+
+# endif()
77+
+
78+
+find_library(
79+
+ COSMA_SSL2_LINK_LIBRARIES
80+
+ NAMES "fjlapackex"
81+
+ HINTS ${_SSL2_PATHS}
82+
+ PATH_SUFFIXES "lib64"
83+
+)
84+
+find_path(
85+
+ COSMA_SSL2_INCLUDE_DIRS
86+
+ NAMES "cblas.h"
87+
+ HINTS ${_SSL2_PATHS}
88+
+ PATH_SUFFIXES "include"
89+
+)
90+
+
91+
+# check if found
92+
+include(FindPackageHandleStandardArgs)
93+
+find_package_handle_standard_args(SSL2 REQUIRED_VARS COSMA_SSL2_INCLUDE_DIRS COSMA_SSL2_LINK_LIBRARIES)
94+
+
95+
+# add target to link against
96+
+if(NOT TARGET cosma::BLAS::SSL2::ssl2)
97+
+ add_library(cosma::BLAS::SSL2::ssl2 INTERFACE IMPORTED)
98+
+ add_library(cosma::BLAS::SSL2::blas ALIAS cosma::BLAS::SSL2::ssl2)
99+
+endif()
100+
+set_property(TARGET cosma::BLAS::SSL2::ssl2 PROPERTY INTERFACE_LINK_LIBRARIES ${COSMA_SSL2_LINK_LIBRARIES})
101+
+set_property(TARGET cosma::BLAS::SSL2::ssl2 PROPERTY INTERFACE_INCLUDE_DIRECTORIES ${COSMA_SSL2_INCLUDE_DIRS})
102+
+
103+
+# prevent clutter in cache
104+
+MARK_AS_ADVANCED(SSL2_FOUND SSL2_LIBRARIES SSL2_INCLUDE_DIRS)

spack/packages/cosma/package.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# Copyright 2013-2023 Lawrence Livermore National Security, LLC and other
1+
# Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
22
# Spack Project Developers. See the top-level COPYRIGHT file for details.
33
#
44
# SPDX-License-Identifier: (Apache-2.0 OR MIT)
@@ -17,13 +17,15 @@ class Cosma(CMakePackage):
1717
url = "https://github.com/eth-cscs/COSMA/archive/refs/tags/v2.6.6.tar.gz"
1818
git = "https://github.com/eth-cscs/COSMA.git"
1919

20+
license("BSD-3-Clause")
21+
2022
# note: The default archives produced by GitHub do not include the archives
2123
# of the submodules.
2224
version("master", branch="master", submodules=False)
2325
version("2.6.6", sha256="1604be101e77192fbcc5551236bc87888d336e402f5409bbdd9dea900401cc37")
2426
version("2.6.5", sha256="10d9b7ecc1ce44ec5b9e0c0bf89278a63029912ec3ea99661be8576b553ececf")
2527
version("2.6.4", sha256="6d7bd5e3005874af9542a329c93e7ccd29ca1a5573dae27618fac2704fa2b6ab")
26-
version("2.6.3", sha256="8ca96ca41458f1e9d0da70d524c5a03c677dba7238d23a578f852163b6d45ac9")
28+
version("2.6.3", sha256="c2a3735ea8f860930bea6706d968497d72a1be0498c689b5bc4a951ffc2d1146")
2729
version("2.6.2", sha256="2debb5123cc35aeebc5fd2f8a46cfd6356d1e27618c9bb57129ecd09aa400940")
2830
version("2.6.1", sha256="69aa6634a030674f0d9be61e7b0bf0dc17acf0fc9e7a90b40e3179e2254c8d67")
2931
version("2.5.1", sha256="085b7787597374244bbb1eb89bc69bf58c35f6c85be805e881e1c0b25166c3ce")
@@ -34,12 +36,14 @@ class Cosma(CMakePackage):
3436
version("2.0.7", sha256="8d70bfcbda6239b6a8fbeaca138790bbe58c0c3aa576879480d2632d4936cf7e")
3537
version("2.0.2", sha256="4f3354828bc718f3eef2f0098c3bdca3499297497a220da32db1acd57920c68d")
3638

39+
depends_on("cxx", type="build") # generated
40+
3741
# We just need the libraries of cuda and rocm, so no need to extend
3842
# CudaPackage or ROCmPackage.
3943
variant("cuda", default=False, description="Build with cuBLAS support")
4044
variant("rocm", default=False, description="Build with rocBLAS support")
4145
variant("scalapack", default=False, description="Build with ScaLAPACK API")
42-
variant("shared", default=False, description="Build the shared library version")
46+
variant("shared", default=True, description="Build the shared library version")
4347
variant("tests", default=False, description="Build tests")
4448
variant("apps", default=False, description="Build miniapp")
4549
variant("profiling", default=False, description="Enable profiling")
@@ -76,8 +80,10 @@ class Cosma(CMakePackage):
7680
depends_on("semiprof", when="+profiling")
7781
depends_on("costa+profiling", when="+profiling")
7882

83+
patch("fj-ssl2.patch", when="^fujitsu-ssl2")
84+
7985
def setup_build_environment(self, env):
80-
if "+cuda" in self.spec:
86+
if self.spec.satisfies("+cuda"):
8187
env.set("CUDA_PATH", self.spec["cuda"].prefix)
8288

8389
def cosma_blas_cmake_arg(self):
@@ -89,6 +95,7 @@ def cosma_blas_cmake_arg(self):
8995
("^cray-libsci", "CRAY_LIBSCI"),
9096
("^netlib-lapack", "CUSTOM"),
9197
("^openblas", "OPENBLAS"),
98+
("^fujitsu-ssl2", "SSL2"),
9299
]
93100

94101
if self.version >= Version("2.4.0"):
@@ -105,11 +112,11 @@ def cosma_blas_cmake_arg(self):
105112
def cosma_scalapack_cmake_arg(self):
106113
spec = self.spec
107114

108-
if "~scalapack" in spec:
115+
if spec.satisfies("~scalapack"):
109116
return "OFF"
110-
elif "^intel-mkl" in spec or "^intel-oneapi-mkl" in spec:
117+
elif spec.satisfies("^intel-mkl") or spec.satisfies("^intel-oneapi-mkl"):
111118
return "MKL"
112-
elif "^cray-libsci" in spec:
119+
elif spec.satisfies("^cray-libsci"):
113120
return "CRAY_LIBSCI"
114121

115122
return "CUSTOM"

spack/packages/tiled-mm/package.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ class TiledMm(CMakePackage, CudaPackage, ROCmPackage):
2525
version("2.2", sha256="6d0b49c9588ece744166822fd44a7bc5bec3dc666b836de8bf4bf1a7bb675aac")
2626
version("2.0", sha256="ea554aea8c53d7c8e40044e6d478c0e8137d7e8b09d7cb9650703430d92cf32e")
2727

28+
depends_on("cxx", type="build") # generated
29+
2830
variant("shared", default=True, description="Build shared libraries")
2931
variant("examples", default=False, description="Enable examples")
3032
variant("tests", default=False, description="Enable tests")

0 commit comments

Comments
 (0)