Skip to content

Commit 3c06934

Browse files
author
Ewan Crawford
committed
[BLAS] SYCL-Graph integration for native-command
In order to support applications calling the library with a sycl queue recording to a SYCL-Graph, check whether the `ext_codeplay_enqueue_native_command` command-group is being recorded to a graph object. If so, use the native stream-recording APIs to add the blas calls as nodes in the graph. In particular, this fixes the llama.cpp unit test `MUL_MAT(type_a=f16,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[2,1],per=[0,1,2,3],v=0)` on CUDA with SYCL-Graph enabled. Previously this would throw an error: ```sh $ GGML_SYCL_DISABLE_GRAPH=0 ./bin/test-backend-ops -b SYCL0 -o MUL_MAT -p type_a=f16,type_b=f32,m=16,n=1,k=256,bs=\\[1,1\\],nr=\\[2 UR CUDA ERROR: Value: 700 Name: CUDA_ERROR_ILLEGAL_ADDRESS Description: an illegal memory access was encountered Function: operator() Source Location: $HOME/dpcpp/unified-runtime/source/adapters/cuda/queue.cpp:154 Native API failed. Native API returns: 2147483646 (UR_RESULT_ERROR_UNKNOWN) Exception caught at file:$HOME/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp, line:3598, func:operator() SYCL error: CHECK_TRY_ERROR((stream)->wait()): Meet error in this line code! in function ggml_backend_sycl_synchronize at $HOME/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp:3598 $HOME/llama.cpp/ggml/src/ggml-sycl/../ggml-sycl/common.hpp:118: SYCL error Could not attach to process. If your uid matches the uid of the target process, check the setting of /proc/sys/kernel/yama/ptrace_scope, or try again as the root user. For more details, see /etc/sysctl.d/10-ptrace.conf ptrace: Operation not permitted. No stack. The program is not being run. ```
1 parent 4ad4dfb commit 3c06934

File tree

3 files changed: +73 −2 lines changed

src/blas/backends/cublas/cublas_batch.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -722,8 +722,9 @@ inline sycl::event gemm_batch_usm_impl(sycl::queue& queue, transpose* transa, tr
722722
auto handle = sc.get_handle(queue);
723723
int64_t offset = 0;
724724
cublasStatus_t err;
725-
for (int64_t i = 0; i < group_count; i++) {
726725
#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
726+
sc.begin_recording_if_graph(queue);
727+
for (int64_t i = 0; i < group_count; i++) {
727728
CUBLAS_ERROR_FUNC_T(
728729
"cublasGemmBatchedEx", cublasGemmBatchedEx, err, handle,
729730
get_cublas_operation(transa[i]), get_cublas_operation(transb[i]), (int)m[i],
@@ -732,7 +733,11 @@ inline sycl::event gemm_batch_usm_impl(sycl::queue& queue, transpose* transa, tr
732733
get_cublas_datatype<cuTypeB>(), (int)ldb[i], &beta[i],
733734
(void* const*)(c + offset), get_cublas_datatype<cuTypeC>(), (int)ldc[i],
734735
(int)group_size[i], get_cublas_datatype<cuTypeS>(), cublas_gemm_algo);
736+
offset += group_size[i];
737+
}
738+
sc.end_recording_if_graph(queue);
735739
#else
740+
for (int64_t i = 0; i < group_count; i++) {
736741
CUBLAS_ERROR_FUNC_T_SYNC(
737742
"cublasGemmBatchedEx", cublasGemmBatchedEx, err, handle,
738743
get_cublas_operation(transa[i]), get_cublas_operation(transb[i]), (int)m[i],
@@ -741,9 +746,9 @@ inline sycl::event gemm_batch_usm_impl(sycl::queue& queue, transpose* transa, tr
741746
get_cublas_datatype<cuTypeB>(), (int)ldb[i], &beta[i],
742747
(void *const *)(c + offset), get_cublas_datatype<cuTypeC>(), (int)ldc[i],
743748
(int)group_size[i], get_cublas_datatype<cuTypeS>(), cublas_gemm_algo);
744-
#endif
745749
offset += group_size[i];
746750
}
751+
#endif
747752
});
748753
});
749754
return done;

src/blas/backends/cublas/cublas_scope_handle.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,50 @@ cublasHandle_t CublasScopedContextHandler::get_handle(const sycl::queue& queue)
6060
return nativeHandle;
6161
}
6262

63+
void CublasScopedContextHandler::begin_recording_if_graph(const sycl::queue& queue) {
64+
if (!ih.ext_codeplay_has_graph()) {
65+
return;
66+
}
67+
68+
auto stream = get_stream(queue);
69+
CUresult err;
70+
#if CUDA_VERSION >= 12030
71+
// After CUDA 12.3 we can use cuStreamBeginCaptureToGraph to capture
72+
// the stream directly in the native graph, rather than needing to
73+
// instantiate the stream capture as a new graph.
74+
auto graph = ih.ext_codeplay_get_native_graph<sycl::backend::ext_oneapi_cuda>();
75+
CUDA_ERROR_FUNC(cuStreamBeginCaptureToGraph, err, stream, graph, nullptr, nullptr, 0,
76+
CU_STREAM_CAPTURE_MODE_GLOBAL);
77+
#else
78+
CUDA_ERROR_FUNC(cuStreamBeginCapture, err, stream, CU_STREAM_CAPTURE_MODE_GLOBAL);
79+
#endif // CUDA_VERSION
80+
}
81+
82+
void CublasScopedContextHandler::end_recording_if_graph(const sycl::queue& queue) {
83+
if (!ih.ext_codeplay_has_graph()) {
84+
return;
85+
}
86+
87+
auto graph = ih.ext_codeplay_get_native_graph<sycl::backend::ext_oneapi_cuda>();
88+
auto stream = get_stream(queue);
89+
CUresult err;
90+
#if CUDA_VERSION >= 12030
91+
CUDA_ERROR_FUNC(cuStreamEndCapture, err, stream, &graph);
92+
#else
93+
// cuStreamEndCapture returns a new graph, if we overwrite
94+
// "graph" it won't be picked up by the SYCL runtime, as
95+
// "ext_codeplay_get_native_graph" returns a passed-by-value pointer.
96+
CUgraph recorded_graph;
97+
CUDA_ERROR_FUNC(cuStreamEndCapture, err, stream, &recorded_graph);
98+
99+
// Add graph to native graph as a child node
100+
// Need to return a node object for the node to be created,
101+
// can't be nullptr.
102+
CUgraphNode node;
103+
CUDA_ERROR_FUNC(cuGraphAddChildGraphNode, err, &node, graph, nullptr, 0, recorded_graph);
104+
#endif // CUDA_VERSION
105+
}
106+
63107
CUstream CublasScopedContextHandler::get_stream(const sycl::queue& queue) {
64108
return sycl::get_native<sycl::backend::ext_oneapi_cuda>(queue);
65109
}

src/blas/backends/cublas/cublas_scope_handle.hpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,28 @@ class CublasScopedContextHandler {
6969
public:
7070
CublasScopedContextHandler(sycl::interop_handle& ih);
7171

72+
/**
73+
* @brief Start recording cuBlas calls to a graph.
74+
* @detail Checks if the command-group associated with \p ih is being added
75+
* to a graph, and if so, begin stream recording of the native CUDA stream
76+
* associated with \p queue to the native cuda-graph object.
77+
* @param queue The sycl queue to start stream recording on native stream
78+
* backing the queue.
79+
*/
80+
void begin_recording_if_graph(const sycl::queue& queue);
81+
82+
/**
83+
* @brief End recording cuBlas calls to a graph.
84+
* @detail Checks if the command-group associated with \p ih is being added
85+
* to a graph, and if so, ends stream recording of the native CUDA stream
86+
* associated with \p queue to the native cuda-graph object. Doing any
87+
* extra work to ensure that stream recorded calls get added as nodes to
88+
* the native graph object associated with \p ih.
89+
* @param queue The sycl queue to end stream recording on native stream
90+
* backing the queue.
91+
*/
92+
void end_recording_if_graph(const sycl::queue& queue);
93+
7294
/**
7395
* @brief get_handle: creates the handle by implicitly impose the advice
7496
* given by nvidia for creating a cublas_handle. (e.g. one cuStream per device

0 commit comments

Comments
 (0)