From 090c9aa56bfec43cdc4e93e4921212cae321584b Mon Sep 17 00:00:00 2001
From: Maxime France-Pillois <maxime.francepillois@codeplay.com>
Date: Fri, 15 Mar 2024 15:19:58 +0000
Subject: [PATCH 1/2] [SYCL][Graph] Update design doc for copy optimization and
 add test

- Update UR tag to include L0 command-buffer copy engine optimization
- Add test which mixes copy and kernel commands
- Update design doc to detail copy engine optimization
---
 sycl/doc/design/CommandGraph.md               |  18 ++++
 sycl/plugins/unified_runtime/CMakeLists.txt   |  10 +-
 .../Graph/ValidUsage/linear_graph_copy.cpp    | 102 ++++++++++++++++++
 3 files changed, 122 insertions(+), 8 deletions(-)
 create mode 100644 sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp

diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md
index 9519067a00484..f36c40af07403 100644
--- a/sycl/doc/design/CommandGraph.md
+++ b/sycl/doc/design/CommandGraph.md
@@ -438,6 +438,24 @@ Level Zero:
 Future work will include exploring L0 API extensions to improve the mapping of
 UR command-buffer to L0 command-list.
 
+#### Copy Engine
+
+For performance considerations, the Unified Runtime Level Zero adapter uses
+different Level Zero command-queues to submit compute kernels and memory
+operations when the device has a dedicated copy engine. To take advantage of the
+copy engine when available, the graph workload can also be split between memory
+operations and compute kernels. To achieve this, two graph workload
+command-lists live simultaneously in a command-buffer.
+
+When the command-buffer is finalized, memory operations (e.g. buffer copy,
+buffer fill, ...) are enqueued in the *copy* command-list while the other
+commands are enqueued in the compute command-list. On submission, if not empty,
+the *copy* command-list is sent to the main copy command-queue while the compute
+command-list is sent to the compute command-queue.
+
+Both are executed concurrently. Synchronization between the command-lists is
+handled by Level Zero events.
+
 ### CUDA
 
 The SYCL Graph CUDA backend relies on the
diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
index 3cc77362ddbee..c13cb373fae1f 100644
--- a/sycl/plugins/unified_runtime/CMakeLists.txt
+++ b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -109,14 +109,8 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
   set(UNIFIED_RUNTIME_TAG b13c5e1f85e01fef7de7568835092f8592ded6e4)
 
   fetch_adapter_source(level_zero
-    ${UNIFIED_RUNTIME_REPO}
-    # commit 2c86cd84a86761204f302a1c5148a8455561b8e6
-    # Merge: f23ee23a a4617787
-    # Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
-    # Date:   Fri Jun 14 10:54:08 2024 +0100
-    #     Merge pull request #1749 from nrspruit/fix_NonBlocking_LastCommand
-    #     [L0] Maintain Lock of Queue while syncing the Last Command Event and update Last Command Event only if matching
-    2c86cd84a86761204f302a1c5148a8455561b8e6
+    "https://github.com/bensuo/unified-runtime.git"
+    "cmd-buf-copy-queue"
   )
 
   fetch_adapter_source(opencl
diff --git a/sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp b/sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp
new file mode 100644
index 0000000000000..fee6ff18d94bf
--- /dev/null
+++ b/sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp
@@ -0,0 +1,102 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+//
+
+// Tests that the optimization to use the L0 Copy Engine for memory commands
+// does not interfere with the linear graph optimization
+
+#include "../graph_common.hpp"
+
+#include <sycl/properties/queue_properties.hpp>
+
+int main() {
+  queue Queue{{sycl::property::queue::in_order{}}};
+
+  using T = int;
+
+  const T ModValue = 7;
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  // Create reference data for output
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  for (size_t i = 0; i < Iterations; i++) {
+    for (size_t j = 0; j < Size; j++) {
+      ReferenceA[j] += ModValue;
+      ReferenceB[j] = ReferenceA[j];
+      ReferenceB[j] -= ModValue;
+      ReferenceC[j] = ReferenceB[j];
+      ReferenceC[j] += ModValue;
+    }
+  }
+
+  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
+                                                 Queue.get_device()};
+
+  T *PtrA = malloc_device<T>(Size, Queue);
+  T *PtrB = malloc_device<T>(Size, Queue);
+  T *PtrC = malloc_device<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+  Queue.wait_and_throw();
+
+  Graph.begin_recording(Queue);
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>(Size), [=](item<1> id) {
+      auto LinID = id.get_linear_id();
+      PtrA[LinID] += ModValue;
+    });
+  });
+
+  Queue.submit([&](handler &CGH) { CGH.memcpy(PtrB, PtrA, Size * sizeof(T)); });
+
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>(Size), [=](item<1> id) {
+      auto LinID = id.get_linear_id();
+      PtrB[LinID] -= ModValue;
+    });
+  });
+
+  Queue.submit([&](handler &CGH) { CGH.memcpy(PtrC, PtrB, Size * sizeof(T)); });
+
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>(Size), [=](item<1> id) {
+      auto LinID = id.get_linear_id();
+      PtrC[LinID] += ModValue;
+    });
+  });
+
+  Graph.end_recording();
+
+  auto GraphExec = Graph.finalize();
+
+  event Event;
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event =
+        Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+  }
+
+  Queue.copy(PtrA, DataA.data(), Size, Event);
+  Queue.copy(PtrB, DataB.data(), Size, Event);
+  Queue.copy(PtrC, DataC.data(), Size, Event);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  free(PtrC, Queue);
+
+  for (size_t i = 0; i < Size; i++) {
+    assert(check_value(i, ReferenceA[i], DataA[i], "DataA"));
+    assert(check_value(i, ReferenceB[i], DataB[i], "DataB"));
+    assert(check_value(i, ReferenceC[i], DataC[i], "DataC"));
+  }
+}

From 01b1582bf46e5201d06716e73ff202daed012c49 Mon Sep 17 00:00:00 2001
From: Ewan Crawford <ewan@codeplay.com>
Date: Fri, 14 Jun 2024 14:28:47 +0100
Subject: [PATCH 2/2] Update sycl/plugins/unified_runtime/CMakeLists.txt

Co-authored-by: Kenneth Benzie (Benie) <k.benzie83@gmail.com>
---
 sycl/plugins/unified_runtime/CMakeLists.txt | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
index c13cb373fae1f..87dd3af4dd66e 100644
--- a/sycl/plugins/unified_runtime/CMakeLists.txt
+++ b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -109,8 +109,14 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
   set(UNIFIED_RUNTIME_TAG b13c5e1f85e01fef7de7568835092f8592ded6e4)
 
   fetch_adapter_source(level_zero
-    "https://github.com/bensuo/unified-runtime.git"
-    "cmd-buf-copy-queue"
+    ${UNIFIED_RUNTIME_REPO}
+    # commit b8a1a3f232198bf2c3d8edd2bbc909bb2a9be555
+    # Merge: 0cd127ad 30f8ac50
+    # Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
+    # Date:   Fri Jun 14 14:26:17 2024 +0100
+    #     Merge pull request #1738 from Bensuo/cmd-buf-copy-queue
+    #     [CMDBUF][L0] Use copy engine to optimize cmd-buffer usage
+    b8a1a3f232198bf2c3d8edd2bbc909bb2a9be555
   )
 
   fetch_adapter_source(opencl