Skip to content

Commit 3c06934

Browse files
author
Ewan Crawford
committed
[BLAS] SYCL-Graph integration for native-command
In order to support applications calling the library with a sycl queue recording to a SYCL-Graph, check whether the `ext_codeplay_enqueue_native_command` command-group is being recorded to a graph object. If so, use the native stream-recording APIs to add the blas calls as nodes in the graph. In particular, this fixes the llama.cpp unit test `MUL_MAT(type_a=f16,type_b=f32,m=16,n=1,k=256,bs=[1,1],nr=[2,1],per=[0,1,2,3],v=0)` on CUDA with SYCL-Graph enabled. Previously this would throw an error: ```sh $ GGML_SYCL_DISABLE_GRAPH=0 ./bin/test-backend-ops -b SYCL0 -o MUL_MAT -p type_a=f16,type_b=f32,m=16,n=1,k=256,bs=\\[1,1\\],nr=\\[2 UR CUDA ERROR: Value: 700 Name: CUDA_ERROR_ILLEGAL_ADDRESS Description: an illegal memory access was encountered Function: operator() Source Location: $HOME/dpcpp/unified-runtime/source/adapters/cuda/queue.cpp:154 Native API failed. Native API returns: 2147483646 (UR_RESULT_ERROR_UNKNOWN) Exception caught at file:$HOME/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp, line:3598, func:operator() SYCL error: CHECK_TRY_ERROR((stream)->wait()): Meet error in this line code! in function ggml_backend_sycl_synchronize at $HOME/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp:3598 $HOME/llama.cpp/ggml/src/ggml-sycl/../ggml-sycl/common.hpp:118: SYCL error Could not attach to process. If your uid matches the uid of the target process, check the setting of /proc/sys/kernel/yama/ptrace_scope, or try again as the root user. For more details, see /etc/sysctl.d/10-ptrace.conf ptrace: Operation not permitted. No stack. The program is not being run. ```
1 parent 4ad4dfb commit 3c06934

File tree

3 files changed: +73 −2 lines changed

src/blas/backends/cublas/cublas_batch.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -722,8 +722,9 @@ inline sycl::event gemm_batch_usm_impl(sycl::queue& queue, transpose* transa, tr
722722
auto handle = sc.get_handle(queue);
723723
int64_t offset = 0;
724724
cublasStatus_t err;
725-
for (int64_t i = 0; i < group_count; i++) {
726725
#ifdef SYCL_EXT_ONEAPI_ENQUEUE_NATIVE_COMMAND
726+
sc.begin_recording_if_graph(queue);
727+
for (int64_t i = 0; i < group_count; i++) {
727728
CUBLAS_ERROR_FUNC_T(
728729
"cublasGemmBatchedEx", cublasGemmBatchedEx, err, handle,
729730
get_cublas_operation(transa[i]), get_cublas_operation(transb[i]), (int)m[i],
@@ -732,7 +733,11 @@ inline sycl::event gemm_batch_usm_impl(sycl::queue& queue, transpose* transa, tr
732733
get_cublas_datatype<cuTypeB>(), (int)ldb[i], &beta[i],
733734
(void* const*)(c + offset), get_cublas_datatype<cuTypeC>(), (int)ldc[i],
734735
(int)group_size[i], get_cublas_datatype<cuTypeS>(), cublas_gemm_algo);
736+
offset += group_size[i];
737+
}
738+
sc.end_recording_if_graph(queue);
735739
#else
740+
for (int64_t i = 0; i < group_count; i++) {
736741
CUBLAS_ERROR_FUNC_T_SYNC(
737742
"cublasGemmBatchedEx", cublasGemmBatchedEx, err, handle,
738743
get_cublas_operation(transa[i]), get_cublas_operation(transb[i]), (int)m[i],
@@ -741,9 +746,9 @@ inline sycl::event gemm_batch_usm_impl(sycl::queue& queue, transpose* transa, tr
741746
get_cublas_datatype<cuTypeB>(), (int)ldb[i], &beta[i],
742747
(void *const *)(c + offset), get_cublas_datatype<cuTypeC>(), (int)ldc[i],
743748
(int)group_size[i], get_cublas_datatype<cuTypeS>(), cublas_gemm_algo);
744-
#endif
745749
offset += group_size[i];
746750
}
751+
#endif
747752
});
748753
});
749754
return done;

src/blas/backends/cublas/cublas_scope_handle.cpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,50 @@ cublasHandle_t CublasScopedContextHandler::get_handle(const sycl::queue& queue)
6060
return nativeHandle;
6161
}
6262

63+
void CublasScopedContextHandler::begin_recording_if_graph(const sycl::queue& queue) {
64+
if (!ih.ext_codeplay_has_graph()) {
65+
return;
66+
}
67+
68+
auto stream = get_stream(queue);
69+
CUresult err;
70+
#if CUDA_VERSION >= 12030
71+
// After CUDA 12.3 we can use cuStreamBeginCaptureToGraph to capture
72+
// the stream directly in the native graph, rather than needing to
73+
// instantiate the stream capture as a new graph.
74+
auto graph = ih.ext_codeplay_get_native_graph<sycl::backend::ext_oneapi_cuda>();
75+
CUDA_ERROR_FUNC(cuStreamBeginCaptureToGraph, err, stream, graph, nullptr, nullptr, 0,
76+
CU_STREAM_CAPTURE_MODE_GLOBAL);
77+
#else
78+
CUDA_ERROR_FUNC(cuStreamBeginCapture, err, stream, CU_STREAM_CAPTURE_MODE_GLOBAL);
79+
#endif // CUDA_VERSION
80+
}
81+
82+
void CublasScopedContextHandler::end_recording_if_graph(const sycl::queue& queue) {
83+
if (!ih.ext_codeplay_has_graph()) {
84+
return;
85+
}
86+
87+
auto graph = ih.ext_codeplay_get_native_graph<sycl::backend::ext_oneapi_cuda>();
88+
auto stream = get_stream(queue);
89+
CUresult err;
90+
#if CUDA_VERSION >= 12030
91+
CUDA_ERROR_FUNC(cuStreamEndCapture, err, stream, &graph);
92+
#else
93+
// cuStreamEndCapture returns a new graph, if we overwrite
94+
// "graph" it won't be picked up by the SYCL runtime, as
95+
// "ext_codeplay_get_native_graph" returns a passed-by-value pointer.
96+
CUgraph recorded_graph;
97+
CUDA_ERROR_FUNC(cuStreamEndCapture, err, stream, &recorded_graph);
98+
99+
// Add graph to native graph as a child node
100+
// Need to return a node object for the node to be created,
101+
// can't be nullptr.
102+
CUgraphNode node;
103+
CUDA_ERROR_FUNC(cuGraphAddChildGraphNode, err, &node, graph, nullptr, 0, recorded_graph);
104+
#endif // CUDA_VERSION
105+
}
106+
63107
CUstream CublasScopedContextHandler::get_stream(const sycl::queue& queue) {
64108
return sycl::get_native<sycl::backend::ext_oneapi_cuda>(queue);
65109
}

src/blas/backends/cublas/cublas_scope_handle.hpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,28 @@ class CublasScopedContextHandler {
6969
public:
7070
CublasScopedContextHandler(sycl::interop_handle& ih);
7171

72+
/**
73+
* @brief Start recording cuBlas calls to a graph.
74+
* @detail Checks if the command-group associated with \p ih is being added
75+
* to a graph, and if so, begin stream recording of the native CUDA stream
76+
* associated with \p queue to the native cuda-graph object.
77+
* @param queue The sycl queue to start stream recording on native stream
78+
* backing the queue.
79+
*/
80+
void begin_recording_if_graph(const sycl::queue& queue);
81+
82+
/**
83+
* @brief End recording cuBlas calls to a graph.
84+
* @detail Checks if the command-group associated with \p ih is being added
85+
* to a graph, and if so, ends stream recording of the native CUDA stream
86+
* associated with \p queue to the native cuda-graph object. Doing any
87+
* extra work to ensure that stream recorded calls get added as nodes to
88+
* the native graph object associated with \p ih.
89+
* @param queue The sycl queue to end stream recording on native stream
90+
* backing the queue.
91+
*/
92+
void end_recording_if_graph(const sycl::queue& queue);
93+
7294
/**
7395
* @brief get_handle: creates the handle by implicitly impose the advice
7496
* given by nvidia for creating a cublas_handle. (e.g. one cuStream per device

0 commit comments

Comments
 (0)