From 00d1cbe2d02819ad50b9d74d899211babacc0e82 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 24 Feb 2023 08:07:58 -0800
Subject: [PATCH 1/5] Added rough P2P usage examples.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/P2P/p2p_usm_access.cpp | 84 +++++++++++++++++++++++++++++++++++++
 SYCL/P2P/p2p_usm_memcpy.cpp | 57 +++++++++++++++++++++++++
 2 files changed, 141 insertions(+)
 create mode 100644 SYCL/P2P/p2p_usm_access.cpp
 create mode 100644 SYCL/P2P/p2p_usm_memcpy.cpp
diff --git a/SYCL/P2P/p2p_usm_access.cpp b/SYCL/P2P/p2p_usm_access.cpp
new file mode 100644
index 0000000000..2d763a1e9b
--- /dev/null
+++ b/SYCL/P2P/p2p_usm_access.cpp
@@ -0,0 +1,84 @@
+#include <cassert>
+#include <memory>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  int Data[10] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+  {
+
+    std::vector<sycl::device> Devs;
+
+    // Note that this code is temporary due to the temporary lack of multiple devices per sycl context in the nvidia backend.
+    ////////////////////////
+    for (const auto &plt : sycl::platform::get_platforms()) {
+
+      if (plt.get_backend() == sycl::backend::cuda)
+        Devs.push_back(plt.get_devices()[0]);
+    }
+    ////////////////////////
+
+    std::vector<sycl::queue> Queues;
+    std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues),
+                   [](const sycl::device &D) { return sycl::queue{D}; });
+
+    assert(Queues.size() > 1);
+
+    int N = 100;
+    int val = 5;
+    int *input = (int *)malloc(sizeof(int) * N);
+    for (int i = 0; i < N; i++) {
+      input[i] = val;
+    }
+
+    // note: practically it could also be good to provide clear directions to
+    // documentation showing users how to make sure they are constructing queues
+    // using distinct devices.
+
+    auto Dev0 = Queues[0].get_device();
+    auto Dev1 = Queues[1].get_device();
+
+    assert(Dev0 != Dev1);
+
+    int *arr0 = malloc<int>(N, Queues[0], usm::alloc::device);
+    int *arr1 = malloc<int>(N, Queues[1], usm::alloc::device);
+
+    // note: in real use would obviously load/set arr0/arr1 with meaningful
+    // data.
+
+    if (Dev0.ext_oneapi_can_access_peer(
+            Dev1, sycl::ext::oneapi::peer_access::access_supported)) {
+      // dev0 enables itself to access dev1
+      Dev0.ext_oneapi_enable_peer_access(Dev1);
+      // dev1 enables itself to access dev0
+      Dev1.ext_oneapi_enable_peer_access(Dev0);
+      // Dev0.ext_oneapi_disable_peer_access(Dev1);
+    }
+    
+    // access Device/Queue 1
+    Queues[0].submit([&](handler &cgh) {
+      auto myRange = range<1>(N);
+      auto myKernel = ([=](id<1> idx) {
+        arr0[idx] = idx[0];
+        arr1[idx] = idx[0];
+      });
+
+      cgh.parallel_for<class p2p_access>(myRange, myKernel);
+    }).wait();
+
+    sycl::free(arr0, Queues[0]);
+    sycl::free(arr1, Queues[1]);
+
+    if (Dev0.ext_oneapi_can_access_peer(
+            Dev1, sycl::ext::oneapi::peer_access::access_supported)) {
+      Dev0.ext_oneapi_disable_peer_access(Dev1);
+      Dev1.ext_oneapi_disable_peer_access(Dev0);
+    }
+  }
+
+  //TODO we should also test inter-device atomics
+  // Devs[0].ext_oneapi_can_access_peer(Devs[1], sycl::ext::oneapi::peer_access::atomics_supported);
+
+  return 0;
+}
diff --git a/SYCL/P2P/p2p_usm_memcpy.cpp b/SYCL/P2P/p2p_usm_memcpy.cpp
new file mode 100644
index 0000000000..538579ddea
--- /dev/null
+++ b/SYCL/P2P/p2p_usm_memcpy.cpp
@@ -0,0 +1,57 @@
+#include <cassert>
+#include <memory>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+  int Data[10] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+  {
+
+    std::vector<sycl::device> Devs;
+    
+    // Note that this code is temporary due to the temporary lack of multiple devices per sycl context in the nvidia backend.
+    ////////////////////////
+    for (const auto &plt : sycl::platform::get_platforms()) {
+
+      if (plt.get_backend() == sycl::backend::cuda)
+        Devs.push_back(plt.get_devices()[0]);
+    }
+    ////////////////////////
+    
+    ///// Enable bi-directional peer copies
+    Devs[0].ext_oneapi_enable_peer_access(Devs[1]);
+
+    std::vector<sycl::queue> Queues;
+    std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues),
+                   [](const sycl::device &D) { return sycl::queue{D}; });
+
+    assert(Queues.size() > 1);
+
+    int N = 100;
+    int val = 5;
+    int *input = (int *)malloc(sizeof(int) * N);
+    for (int i = 0; i < N; i++) {
+      input[i] = val;
+    }
+
+    int *arr0 = malloc<int>(N, Queues[0], usm::alloc::device);
+    Queues[0].memcpy(arr0, input, N * sizeof(int));
+    Queues[0].wait();
+
+    int *arr2 = malloc<int>(N, Queues[1], usm::alloc::device);
+    // Copy device usm allocated in devices/cuContexts
+    Queues[0].copy(arr1, arr0, N).wait();
+    int *out;
+    out = new int[N];
+    Queues[0].memcpy(out, arr1, N * sizeof(int)).wait();
+
+    sycl::free(arr0, Queues[0]);
+    sycl::free(arr1, Queues[1]);
+
+    std::cout << out[0];
+    delete[] out;
+  }
+
+  return 0;
+}

From 45a7859e628404b47d52b5626f75ff44e9580afe Mon Sep 17 00:00:00 2001
From: JackAKirk <chezjakirk@gmail.com>
Date: Tue, 28 Feb 2023 16:32:40 +0000
Subject: [PATCH 2/5] correct memcpy

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/P2P/p2p_usm_memcpy.cpp | 83 +++++++++++++++++++------------------
 1 file changed, 43 insertions(+), 40 deletions(-)

diff --git a/SYCL/P2P/p2p_usm_memcpy.cpp b/SYCL/P2P/p2p_usm_memcpy.cpp
index 538579ddea..98bcabac94 100644
--- a/SYCL/P2P/p2p_usm_memcpy.cpp
+++ b/SYCL/P2P/p2p_usm_memcpy.cpp
@@ -5,53 +5,56 @@
 using namespace sycl;
 
 int main() {
-  int Data[10] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
-  {
 
-    std::vector<sycl::device> Devs;
-    
-    // Note that this code is temporary due to the temporary lack of multiple devices per sycl context in the nvidia backend.
-    ////////////////////////
-    for (const auto &plt : sycl::platform::get_platforms()) {
+  std::vector<sycl::device> Devs;
 
-      if (plt.get_backend() == sycl::backend::cuda)
-        Devs.push_back(plt.get_devices()[0]);
-    }
-    ////////////////////////
-    
-    ///// Enable bi-directional peer copies
-    Devs[0].ext_oneapi_enable_peer_access(Devs[1]);
-
-    std::vector<sycl::queue> Queues;
-    std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues),
-                   [](const sycl::device &D) { return sycl::queue{D}; });
-
-    assert(Queues.size() > 1);
-
-    int N = 100;
-    int val = 5;
-    int *input = (int *)malloc(sizeof(int) * N);
-    for (int i = 0; i < N; i++) {
-      input[i] = val;
-    }
+  // Note that this code is temporary due to the temporary lack of multiple devices per sycl context in the nvidia backend.
+  ////////////////////////
+  for (const auto &plt : sycl::platform::get_platforms()) {
+
+    if (plt.get_backend() == sycl::backend::cuda)
+      Devs.push_back(plt.get_devices()[0]);
+  }
+  ////////////////////////
+
+  ///// Enable bi-directional peer copies
+  Devs[0].ext_oneapi_enable_peer_access(Devs[1]);
+
+  std::vector<sycl::queue> Queues;
+  std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues),
+      [](const sycl::device &D) { return sycl::queue{D}; });
 
-    int *arr0 = malloc<int>(N, Queues[0], usm::alloc::device);
-    Queues[0].memcpy(arr0, input, N * sizeof(int));
-    Queues[0].wait();
+  assert(Queues.size() > 1);
 
-    int *arr2 = malloc<int>(N, Queues[1], usm::alloc::device);
-    // Copy device usm allocated in devices/cuContexts
-    Queues[0].copy(arr1, arr0, N).wait();
-    int *out;
-    out = new int[N];
-    Queues[0].memcpy(out, arr1, N * sizeof(int)).wait();
+  int N = 100;
+  int *input = (int *)malloc(sizeof(int) * N);
+  for (int i = 0; i < N; i++) {
+    input[i] = i;
+  }
+
+  int *arr0 = malloc<int>(N, Queues[0], usm::alloc::device);
+  Queues[0].memcpy(arr0, input, N * sizeof(int)).wait();
+
+  int *arr1 = malloc<int>(N, Queues[1], usm::alloc::device);
+  Queues[1].copy(arr0, arr1, N).wait();
 
-    sycl::free(arr0, Queues[0]);
-    sycl::free(arr1, Queues[1]);
+  int *out;
+  out = new int[N];
+  Queues[1].copy(arr1, out, N).wait();
 
-    std::cout << out[0];
-    delete[] out;
+  sycl::free(arr0, Queues[0]);
+  sycl::free(arr1, Queues[1]);
+
+  bool ok = true;
+  for (int i = 0; i < N; i++) {
+    if (out[i] != input[i]) {
+      printf("%d %d\n", out[i], input[i]);
+      ok = false; //break;
+    }
   }
+  delete[] out;
+
+  printf("%s\n", ok ? "PASS" : "FAIL");
 
   return 0;
 }

From 78bee4e7a508d3917292c5f5fd3ea8cf923e65e1 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 3 Mar 2023 11:32:31 -0800
Subject: [PATCH 3/5] Updated and added atomics test.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/P2P/p2p_usm_access.cpp  | 84 ----------------------------------
 SYCL/P2P/p2p_usm_memcpy.cpp  | 60 -------------------------
 SYCL/USM/P2P/p2p_access.cpp  | 71 +++++++++++++++++++++++++++++
 SYCL/USM/P2P/p2p_atomics.cpp | 87 ++++++++++++++++++++++++++++++++++++
 SYCL/USM/P2P/p2p_memcpy.cpp  | 78 ++++++++++++++++++++++++++++++++
 5 files changed, 236 insertions(+), 144 deletions(-)
 delete mode 100644 SYCL/P2P/p2p_usm_access.cpp
 delete mode 100644 SYCL/P2P/p2p_usm_memcpy.cpp
 create mode 100644 SYCL/USM/P2P/p2p_access.cpp
 create mode 100644 SYCL/USM/P2P/p2p_atomics.cpp
 create mode 100644 SYCL/USM/P2P/p2p_memcpy.cpp

diff --git a/SYCL/P2P/p2p_usm_access.cpp b/SYCL/P2P/p2p_usm_access.cpp
deleted file mode 100644
index 2d763a1e9b..0000000000
--- a/SYCL/P2P/p2p_usm_access.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-#include <cassert>
-#include <memory>
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-
-int main() {
-  int Data[10] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
-  {
-
-    std::vector<sycl::device> Devs;
-
-    // Note that this code is temporary due to the temporary lack of multiple devices per sycl context in the nvidia backend.
-    ////////////////////////
-    for (const auto &plt : sycl::platform::get_platforms()) {
-
-      if (plt.get_backend() == sycl::backend::cuda)
-        Devs.push_back(plt.get_devices()[0]);
-    }
-    ////////////////////////
-
-    std::vector<sycl::queue> Queues;
-    std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues),
-                   [](const sycl::device &D) { return sycl::queue{D}; });
-
-    assert(Queues.size() > 1);
-
-    int N = 100;
-    int val = 5;
-    int *input = (int *)malloc(sizeof(int) * N);
-    for (int i = 0; i < N; i++) {
-      input[i] = val;
-    }
-
-    // note: practically it could also be good to provide clear directions to
-    // documentation showing users how to make sure they are constructing queues
-    // using distinct devices.
-
-    auto Dev0 = Queues[0].get_device();
-    auto Dev1 = Queues[1].get_device();
-
-    assert(Dev0 != Dev1);
-
-    int *arr0 = malloc<int>(N, Queues[0], usm::alloc::device);
-    int *arr1 = malloc<int>(N, Queues[1], usm::alloc::device);
-
-    // note: in real use would obviously load/set arr0/arr1 with meaningful
-    // data.
-
-    if (Dev0.ext_oneapi_can_access_peer(
-            Dev1, sycl::ext::oneapi::peer_access::access_supported)) {
-      // dev0 enables itself to access dev1
-      Dev0.ext_oneapi_enable_peer_access(Dev1);
-      // dev1 enables itself to access dev0
-      Dev1.ext_oneapi_enable_peer_access(Dev0);
-      // Dev0.ext_oneapi_disable_peer_access(Dev1);
-    }
-    
-    // access Device/Queue 1
-    Queues[0].submit([&](handler &cgh) {
-      auto myRange = range<1>(N);
-      auto myKernel = ([=](id<1> idx) {
-        arr0[idx] = idx[0];
-        arr1[idx] = idx[0];
-      });
-
-      cgh.parallel_for<class p2p_access>(myRange, myKernel);
-    }).wait();
-
-    sycl::free(arr0, Queues[0]);
-    sycl::free(arr1, Queues[1]);
-
-    if (Dev0.ext_oneapi_can_access_peer(
-            Dev1, sycl::ext::oneapi::peer_access::access_supported)) {
-      Dev0.ext_oneapi_disable_peer_access(Dev1);
-      Dev1.ext_oneapi_disable_peer_access(Dev0);
-    }
-  }
-
-  //TODO we should also test inter-device atomics
-  // Devs[0].ext_oneapi_can_access_peer(Devs[1], sycl::ext::oneapi::peer_access::atomics_supported);
-
-  return 0;
-}
diff --git a/SYCL/P2P/p2p_usm_memcpy.cpp b/SYCL/P2P/p2p_usm_memcpy.cpp
deleted file mode 100644
index 98bcabac94..0000000000
--- a/SYCL/P2P/p2p_usm_memcpy.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-#include <cassert>
-#include <memory>
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-
-int main() {
-
-  std::vector<sycl::device> Devs;
-
-  // Note that this code is temporary due to the temporary lack of multiple devices per sycl context in the nvidia backend.
-  ////////////////////////
-  for (const auto &plt : sycl::platform::get_platforms()) {
-
-    if (plt.get_backend() == sycl::backend::cuda)
-      Devs.push_back(plt.get_devices()[0]);
-  }
-  ////////////////////////
-
-  ///// Enable bi-directional peer copies
-  Devs[0].ext_oneapi_enable_peer_access(Devs[1]);
-
-  std::vector<sycl::queue> Queues;
-  std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues),
-      [](const sycl::device &D) { return sycl::queue{D}; });
-
-  assert(Queues.size() > 1);
-
-  int N = 100;
-  int *input = (int *)malloc(sizeof(int) * N);
-  for (int i = 0; i < N; i++) {
-    input[i] = i;
-  }
-
-  int *arr0 = malloc<int>(N, Queues[0], usm::alloc::device);
-  Queues[0].memcpy(arr0, input, N * sizeof(int)).wait();
-
-  int *arr1 = malloc<int>(N, Queues[1], usm::alloc::device);
-  Queues[1].copy(arr0, arr1, N).wait();
-
-  int *out;
-  out = new int[N];
-  Queues[1].copy(arr1, out, N).wait();
-
-  sycl::free(arr0, Queues[0]);
-  sycl::free(arr1, Queues[1]);
-
-  bool ok = true;
-  for (int i = 0; i < N; i++) {
-    if (out[i] != input[i]) {
-      printf("%d %d\n", out[i], input[i]);
-      ok = false; //break;
-    }
-  }
-  delete[] out;
-
-  printf("%s\n", ok ? "PASS" : "FAIL");
-
-  return 0;
-}
diff --git a/SYCL/USM/P2P/p2p_access.cpp b/SYCL/USM/P2P/p2p_access.cpp
new file mode 100644
index 0000000000..6e04f390d7
--- /dev/null
+++ b/SYCL/USM/P2P/p2p_access.cpp
@@ -0,0 +1,71 @@
+// REQUIRES: cuda
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %t.out
+
+#include <cassert>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+
+  // Note that this code will largely be removed: it is temporary due to the
+  // temporary lack of multiple devices per sycl context in the Nvidia backend.
+  // A portable implementation, using a single gpu platform, should be possible
+  // once the Nvidia context issues are resolved.
+  ////////////////////////////////////////////////////////////////////////
+  std::vector<sycl::device> Devs;
+  for (const auto &plt : sycl::platform::get_platforms()) {
+
+    if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
+      Devs.push_back(plt.get_devices()[0]);
+  }
+  if (Devs.size() < 2) {
+    std::cout << "Cannot test P2P capabilities, at least two devices are "
+                 "required, exiting."
+              << std::endl;
+    return 0;
+  }
+
+  std::vector<sycl::queue> Queues;
+  std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues),
+                 [](const sycl::device &D) { return sycl::queue{D}; });
+  ////////////////////////////////////////////////////////////////////////
+
+  if (!Devs[0].ext_oneapi_can_access_peer(
+          Devs[1], sycl::ext::oneapi::peer_access::access_supported)) {
+    std::cout << "P2P access is not supported by devices, exiting."
+              << std::endl;
+    return 0;
+  }
+
+  // Enables Devs[0] to access Devs[1] memory.
+  Devs[0].ext_oneapi_enable_peer_access(Devs[1]);
+
+  auto *arr1 = malloc<int>(2, Queues[1], usm::alloc::device);
+
+  // Calling fill on Devs[1] data with Devs[0] queue requires P2P enabled.
+  Queues[0].fill(arr1, 2, 2).wait();
+
+  // Access/write Devs[1] data with Devs[0] queue.
+  Queues[0]
+      .submit([&](handler &cgh) {
+        auto myRange = range<1>(1);
+        auto myKernel = ([=](id<1> idx) { arr1[0] *= 2; });
+
+        cgh.parallel_for<class p2p_access>(myRange, myKernel);
+      })
+      .wait();
+
+  int2 out;
+
+  Queues[0].memcpy(&out, arr1, 2 * sizeof(int)).wait();
+  assert(out[0] == 4);
+  assert(out[1] == 2);
+
+  sycl::free(arr1, Queues[1]);
+
+  Devs[0].ext_oneapi_disable_peer_access(Devs[1]);
+
+  return 0;
+}
diff --git a/SYCL/USM/P2P/p2p_atomics.cpp b/SYCL/USM/P2P/p2p_atomics.cpp
new file mode 100644
index 0000000000..457221da45
--- /dev/null
+++ b/SYCL/USM/P2P/p2p_atomics.cpp
@@ -0,0 +1,87 @@
+// REQUIRES: cuda
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %t.out
+
+#include <cassert>
+#include <numeric>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+using namespace sycl;
+
+// number of atomic operations
+constexpr size_t N = 512;
+
+int main() {
+
+  // Note that this code will largely be removed: it is temporary due to the
+  // temporary lack of multiple devices per sycl context in the Nvidia backend.
+  // A portable implementation, using a single gpu platform, should be possible
+  // once the Nvidia context issues are resolved.
+  ////////////////////////////////////////////////////////////////////////
+  std::vector<sycl::device> Devs;
+  for (const auto &plt : sycl::platform::get_platforms()) {
+
+    if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
+      Devs.push_back(plt.get_devices()[0]);
+  }
+  if (Devs.size() < 2) {
+    std::cout << "Cannot test P2P capabilities, at least two devices are "
+                 "required, exiting."
+              << std::endl;
+    return 0;
+  }
+
+  std::vector<sycl::queue> Queues;
+  std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues),
+                 [](const sycl::device &D) { return sycl::queue{D}; });
+  ////////////////////////////////////////////////////////////////////////
+
+  auto Dev0 = Queues[0].get_device();
+  auto Dev1 = Queues[1].get_device();
+
+  assert(Dev0 != Dev1);
+
+  if (!Devs[0].ext_oneapi_can_access_peer(
+          Devs[1], sycl::ext::oneapi::peer_access::atomics_supported)) {
+    std::cout << "P2P atomics are not supported by devices, exiting."
+              << std::endl;
+    return 0;
+  }
+
+  // Enables Dev1 to access Dev0 memory.
+  Dev1.ext_oneapi_enable_peer_access(Dev0);
+
+  std::vector<double> input(N);
+  std::iota(input.begin(), input.end(), 0);
+
+  double h_sum = 0.;
+  for (const auto &value : input) {
+    h_sum += value;
+  }
+
+  double *d_sum = malloc_shared<double>(1, Queues[0]);
+  double *d_in = malloc_device<double>(N, Queues[0]);
+
+  Queues[0].memcpy(d_in, &input[0], N * sizeof(double));
+  Queues[0].wait();
+
+  range global_range{N};
+
+  *d_sum = 0.;
+  Queues[1].submit([&](handler &h) {
+    h.parallel_for<class peer_atomic>(global_range, [=](id<1> i) {
+      sycl::atomic_ref<double, sycl::memory_order::relaxed,
+                       sycl::memory_scope::system,
+                       access::address_space::global_space>(*d_sum) += d_in[i];
+    });
+  });
+  Queues[1].wait();
+
+  assert(*d_sum == h_sum);
+
+  free(d_sum, Queues[0]);
+  free(d_in, Queues[0]);
+
+  return 0;
+}
diff --git a/SYCL/USM/P2P/p2p_memcpy.cpp b/SYCL/USM/P2P/p2p_memcpy.cpp
new file mode 100644
index 0000000000..09d62021ed
--- /dev/null
+++ b/SYCL/USM/P2P/p2p_memcpy.cpp
@@ -0,0 +1,78 @@
+// REQUIRES: cuda
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %t.out
+
+#include <cassert>
+#include <numeric>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+using namespace sycl;
+
+// Array size to copy
+constexpr int N = 100;
+
+int main() {
+
+  // Note that this code will largely be removed: it is temporary due to the
+  // temporary lack of multiple devices per sycl context in the Nvidia backend.
+  // A portable implementation, using a single gpu platform, should be possible
+  // once the Nvidia context issues are resolved.
+  ////////////////////////////////////////////////////////////////////////
+  std::vector<sycl::device> Devs;
+  for (const auto &plt : sycl::platform::get_platforms()) {
+
+    if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
+      Devs.push_back(plt.get_devices()[0]);
+  }
+  if (Devs.size() < 2) {
+    std::cout << "Cannot test P2P capabilities, at least two devices are "
+                 "required, exiting."
+              << std::endl;
+    return 0;
+  }
+
+  std::vector<sycl::queue> Queues;
+  std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues),
+                 [](const sycl::device &D) { return sycl::queue{D}; });
+  ////////////////////////////////////////////////////////////////////////
+
+  if (!Devs[0].ext_oneapi_can_access_peer(
+          Devs[1], sycl::ext::oneapi::peer_access::access_supported)) {
+    std::cout << "P2P access is not supported by devices, exiting."
+              << std::endl;
+    return 0;
+  }
+
+  // Enables Devs[0] to access Devs[1] memory.
+  Devs[0].ext_oneapi_enable_peer_access(Devs[1]);
+
+  std::vector<int> input(N);
+  std::iota(input.begin(), input.end(), 0);
+
+  int *arr0 = malloc<int>(N, Queues[0], usm::alloc::device);
+  Queues[0].memcpy(arr0, &input[0], N * sizeof(int));
+
+  int *arr1 = malloc<int>(N, Queues[1], usm::alloc::device);
+  // P2P copy performed here:
+  Queues[1].copy(arr0, arr1, N).wait();
+
+  int out[N];
+  Queues[1].copy(arr1, out, N).wait();
+
+  sycl::free(arr0, Queues[0]);
+  sycl::free(arr1, Queues[1]);
+
+  bool ok = true;
+  for (int i = 0; i < N; i++) {
+    if (out[i] != input[i]) {
+      printf("%d %d\n", out[i], input[i]);
+      ok = false;
+      break;
+    }
+  }
+
+  printf("%s\n", ok ? "PASS" : "FAIL");
+
+  return 0;
+}

From b2fe38e7b999962833243c620e7632edd63361eb Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Mon, 6 Mar 2023 08:14:58 -0800
Subject: [PATCH 4/5] Chose better test name.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/USM/P2P/p2p_atomics.cpp                  | 5 -----
 SYCL/USM/P2P/{p2p_memcpy.cpp => p2p_copy.cpp} | 0
 2 files changed, 5 deletions(-)
 rename SYCL/USM/P2P/{p2p_memcpy.cpp => p2p_copy.cpp} (100%)

diff --git a/SYCL/USM/P2P/p2p_atomics.cpp b/SYCL/USM/P2P/p2p_atomics.cpp
index 457221da45..ff38d45790 100644
--- a/SYCL/USM/P2P/p2p_atomics.cpp
+++ b/SYCL/USM/P2P/p2p_atomics.cpp
@@ -37,11 +37,6 @@ int main() {
                  [](const sycl::device &D) { return sycl::queue{D}; });
   ////////////////////////////////////////////////////////////////////////
 
-  auto Dev0 = Queues[0].get_device();
-  auto Dev1 = Queues[1].get_device();
-
-  assert(Dev0 != Dev1);
-
   if (!Devs[0].ext_oneapi_can_access_peer(
           Devs[1], sycl::ext::oneapi::peer_access::atomics_supported)) {
     std::cout << "P2P atomics are not supported by devices, exiting."
diff --git a/SYCL/USM/P2P/p2p_memcpy.cpp b/SYCL/USM/P2P/p2p_copy.cpp
similarity index 100%
rename from SYCL/USM/P2P/p2p_memcpy.cpp
rename to SYCL/USM/P2P/p2p_copy.cpp

From dfcd93adc90520e8741f0c148a9d10477aac8f65 Mon Sep 17 00:00:00 2001
From: JackAKirk <jack.kirk@codeplay.com>
Date: Fri, 10 Mar 2023 08:42:36 -0800
Subject: [PATCH 5/5] Added PI_TRACE tests for new pi peer calls.

Signed-off-by: JackAKirk <jack.kirk@codeplay.com>
---
 SYCL/USM/P2P/p2p_access.cpp  |  6 +++++-
 SYCL/USM/P2P/p2p_atomics.cpp | 15 +++++++++------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/SYCL/USM/P2P/p2p_access.cpp b/SYCL/USM/P2P/p2p_access.cpp
index 6e04f390d7..22e351aec3 100644
--- a/SYCL/USM/P2P/p2p_access.cpp
+++ b/SYCL/USM/P2P/p2p_access.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: cuda
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// RUN: %t.out
+// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
 
 #include <cassert>
 #include <sycl/sycl.hpp>
@@ -69,3 +69,7 @@ int main() {
 
   return 0;
 }
+
+// CHECK: ---> piextPeerAccessGetInfo(
+// CHECK: ---> piextEnablePeerAccess(
+// CHECK: ---> piextDisablePeerAccess(
diff --git a/SYCL/USM/P2P/p2p_atomics.cpp b/SYCL/USM/P2P/p2p_atomics.cpp
index ff38d45790..74651946fb 100644
--- a/SYCL/USM/P2P/p2p_atomics.cpp
+++ b/SYCL/USM/P2P/p2p_atomics.cpp
@@ -1,6 +1,6 @@
 // REQUIRES: cuda
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// RUN: %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_61 %s -o %t.out
+// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
 
 #include <cassert>
 #include <numeric>
@@ -37,15 +37,15 @@ int main() {
                  [](const sycl::device &D) { return sycl::queue{D}; });
   ////////////////////////////////////////////////////////////////////////
 
-  if (!Devs[0].ext_oneapi_can_access_peer(
-          Devs[1], sycl::ext::oneapi::peer_access::atomics_supported)) {
+  if (!Devs[1].ext_oneapi_can_access_peer(
+          Devs[0], sycl::ext::oneapi::peer_access::atomics_supported)) {
     std::cout << "P2P atomics are not supported by devices, exiting."
               << std::endl;
     return 0;
   }
 
-  // Enables Dev1 to access Dev0 memory.
-  Dev1.ext_oneapi_enable_peer_access(Dev0);
+  // Enables Devs[1] to access Devs[0] memory.
+  Devs[1].ext_oneapi_enable_peer_access(Devs[0]);
 
   std::vector<double> input(N);
   std::iota(input.begin(), input.end(), 0);
@@ -80,3 +80,6 @@ int main() {
 
   return 0;
 }
+
+// CHECK: ---> piextPeerAccessGetInfo(
+// CHECK: ---> piextEnablePeerAccess(