From 090c9aa56bfec43cdc4e93e4921212cae321584b Mon Sep 17 00:00:00 2001 From: Maxime France-Pillois Date: Fri, 15 Mar 2024 15:19:58 +0000 Subject: [PATCH 1/2] [SYCL][Graph] Update design doc for copy optimization and add test - Update UR tag to include L0 command-buffer copy engine optimization - Add test which mixes copy and kernel commands - Update design doc to detail copy engine optimization --- sycl/doc/design/CommandGraph.md | 18 ++++ sycl/plugins/unified_runtime/CMakeLists.txt | 10 +- .../Graph/ValidUsage/linear_graph_copy.cpp | 102 ++++++++++++++++++ 3 files changed, 122 insertions(+), 8 deletions(-) create mode 100644 sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md index 9519067a00484..f36c40af07403 100644 --- a/sycl/doc/design/CommandGraph.md +++ b/sycl/doc/design/CommandGraph.md @@ -438,6 +438,24 @@ Level Zero: Future work will include exploring L0 API extensions to improve the mapping of UR command-buffer to L0 command-list. +#### Copy Engine + +For performance considerations, the Unified Runtime Level Zero adapter uses +different Level Zero command-queues to submit compute kernels and memory +operations when the device has a dedicated copy engine. To take advantage of the +copy engine when available, the graph workload can also be split between memory +operations and compute kernels. To achieve this, two graph workload +command-lists live simultaneously in a command-buffer. + +When the command-buffer is finalized, memory operations (e.g. buffer copy, +buffer fill, ...) are enqueued in the *copy* command-list while the other +commands are enqueued in the compute command-list. On submission, if not empty, +the *copy* command-list is sent to the main copy command-queue while the compute +command-list is sent to the compute command-queue. + +Both are executed concurrently. Synchronization between the command-lists is +handled by Level Zero events. + ### CUDA The SYCL Graph CUDA backend relies on the diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index 3cc77362ddbee..c13cb373fae1f 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -109,14 +109,8 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) set(UNIFIED_RUNTIME_TAG b13c5e1f85e01fef7de7568835092f8592ded6e4) fetch_adapter_source(level_zero - ${UNIFIED_RUNTIME_REPO} - # commit 2c86cd84a86761204f302a1c5148a8455561b8e6 - # Merge: f23ee23a a4617787 - # Author: Kenneth Benzie (Benie) - # Date: Fri Jun 14 10:54:08 2024 +0100 - # Merge pull request #1749 from nrspruit/fix_NonBlocking_LastCommand - # [L0] Maintain Lock of Queue while syncing the Last Command Event and update Last Command Event only if matching - 2c86cd84a86761204f302a1c5148a8455561b8e6 + "https://github.com/bensuo/unified-runtime.git" + "cmd-buf-copy-queue" ) fetch_adapter_source(opencl diff --git a/sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp b/sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp new file mode 100644 index 0000000000000..fee6ff18d94bf --- /dev/null +++ b/sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp @@ -0,0 +1,102 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// + +// Tests that the optimization to use the L0 Copy Engine for memory commands +// does not interfere with the linear graph optimization + +#include "../graph_common.hpp" + +#include + +int main() { + queue Queue{{sycl::property::queue::in_order{}}}; + + using T = int; + + const T ModValue = 7; + std::vector DataA(Size), DataB(Size), DataC(Size); + + std::iota(DataA.begin(), DataA.end(), 1); + std::iota(DataB.begin(), DataB.end(), 10); + std::iota(DataC.begin(), DataC.end(), 1000); + + // Create reference data for output + std::vector ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC); + for (size_t i = 0; i < Iterations; i++) { + for (size_t j = 0; j < Size; j++) { + ReferenceA[j] += ModValue; + ReferenceB[j] = ReferenceA[j]; + ReferenceB[j] -= ModValue; + ReferenceC[j] = ReferenceB[j]; + ReferenceC[j] += ModValue; + } + } + + ext::oneapi::experimental::command_graph Graph{Queue.get_context(), + Queue.get_device()}; + + T *PtrA = malloc_device(Size, Queue); + T *PtrB = malloc_device(Size, Queue); + T *PtrC = malloc_device(Size, Queue); + + Queue.copy(DataA.data(), PtrA, Size); + Queue.copy(DataB.data(), PtrB, Size); + Queue.copy(DataC.data(), PtrC, Size); + Queue.wait_and_throw(); + + Graph.begin_recording(Queue); + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + PtrA[LinID] += ModValue; + }); + }); + + Queue.submit([&](handler &CGH) { CGH.memcpy(PtrB, PtrA, Size * sizeof(T)); }); + + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + PtrB[LinID] -= ModValue; + }); + }); + + Queue.submit([&](handler &CGH) { CGH.memcpy(PtrC, PtrB, Size * sizeof(T)); }); + + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>(Size), [=](item<1> id) { + auto LinID = id.get_linear_id(); + PtrC[LinID] += ModValue; + }); + }); + + Graph.end_recording(); + + auto GraphExec = Graph.finalize(); + + event Event; + for (unsigned n = 0; n < Iterations; n++) { + Event = + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); }); + } + + Queue.copy(PtrA, DataA.data(), Size, Event); + Queue.copy(PtrB, DataB.data(), Size, Event); + Queue.copy(PtrC, DataC.data(), Size, Event); + Queue.wait_and_throw(); + + free(PtrA, Queue); + free(PtrB, Queue); + free(PtrC, Queue); + + for (size_t i = 0; i < Size; i++) { + assert(check_value(i, ReferenceA[i], DataA[i], "DataA")); + assert(check_value(i, ReferenceB[i], DataB[i], "DataB")); + assert(check_value(i, ReferenceC[i], DataC[i], "DataC")); + } +} From 01b1582bf46e5201d06716e73ff202daed012c49 Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Fri, 14 Jun 2024 14:28:47 +0100 Subject: [PATCH 2/2] Update sycl/plugins/unified_runtime/CMakeLists.txt Co-authored-by: Kenneth Benzie (Benie) --- sycl/plugins/unified_runtime/CMakeLists.txt | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index c13cb373fae1f..87dd3af4dd66e 100644 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -109,8 +109,14 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT) set(UNIFIED_RUNTIME_TAG b13c5e1f85e01fef7de7568835092f8592ded6e4) fetch_adapter_source(level_zero - "https://github.com/bensuo/unified-runtime.git" - "cmd-buf-copy-queue" + ${UNIFIED_RUNTIME_REPO} + # commit b8a1a3f232198bf2c3d8edd2bbc909bb2a9be555 + # Merge: 0cd127ad 30f8ac50 + # Author: Kenneth Benzie (Benie) + # Date: Fri Jun 14 14:26:17 2024 +0100 + # Merge pull request #1738 from Bensuo/cmd-buf-copy-queue + # [CMDBUF][L0] Use copy engine to optimize cmd-buffer usage + b8a1a3f232198bf2c3d8edd2bbc909bb2a9be555 ) fetch_adapter_source(opencl