diff --git a/SYCL/USM/P2P/p2p_access.cpp b/SYCL/USM/P2P/p2p_access.cpp
new file mode 100644
index 0000000000..22e351aec3
--- /dev/null
+++ b/SYCL/USM/P2P/p2p_access.cpp
@@ -0,0 +1,75 @@
+// REQUIRES: cuda
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
+
+#include <cassert>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() {
+
+  // Note that this code will largely be removed: it is temporary due to the
+  // temporary lack of multiple devices per sycl context in the Nvidia backend.
+  // A portable implementation, using a single gpu platform, should be possible
+  // once the Nvidia context issues are resolved.
+  ////////////////////////////////////////////////////////////////////////
+  std::vector<sycl::device> Devs;
+  for (const auto &plt : sycl::platform::get_platforms()) {
+
+    if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
+      Devs.push_back(plt.get_devices()[0]);
+  }
+  if (Devs.size() < 2) {
+    std::cout << "Cannot test P2P capabilities, at least two devices are "
+                 "required, exiting."
+              << std::endl;
+    return 0;
+  }
+
+  std::vector<sycl::queue> Queues;
+  std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues),
+                 [](const sycl::device &D) { return sycl::queue{D}; });
+  ////////////////////////////////////////////////////////////////////////
+
+  if (!Devs[0].ext_oneapi_can_access_peer(
+          Devs[1], sycl::ext::oneapi::peer_access::access_supported)) {
+    std::cout << "P2P access is not supported by devices, exiting."
+              << std::endl;
+    return 0;
+  }
+
+  // Enables Devs[0] to access Devs[1] memory.
+  Devs[0].ext_oneapi_enable_peer_access(Devs[1]);
+
+  auto *arr1 = malloc<int>(2, Queues[1], usm::alloc::device);
+
+  // Calling fill on Devs[1] data with Devs[0] queue requires P2P enabled.
+  Queues[0].fill(arr1, 2, 2).wait();
+
+  // Access/write Devs[1] data with Devs[0] queue.
+  Queues[0]
+      .submit([&](handler &cgh) {
+        auto myRange = range<1>(1);
+        auto myKernel = ([=](id<1> idx) { arr1[0] *= 2; });
+
+        cgh.parallel_for<class p2p_access>(myRange, myKernel);
+      })
+      .wait();
+
+  int2 out;
+
+  Queues[0].memcpy(&out, arr1, 2 * sizeof(int)).wait();
+  assert(out[0] == 4);
+  assert(out[1] == 2);
+
+  sycl::free(arr1, Queues[1]);
+
+  Devs[0].ext_oneapi_disable_peer_access(Devs[1]);
+
+  return 0;
+}
+
+// CHECK: ---> piextPeerAccessGetInfo(
+// CHECK: ---> piextEnablePeerAccess(
+// CHECK: ---> piextDisablePeerAccess(
diff --git a/SYCL/USM/P2P/p2p_atomics.cpp b/SYCL/USM/P2P/p2p_atomics.cpp
new file mode 100644
index 0000000000..74651946fb
--- /dev/null
+++ b/SYCL/USM/P2P/p2p_atomics.cpp
@@ -0,0 +1,85 @@
+// REQUIRES: cuda
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_61 %s -o %t.out
+// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER
+
+#include <cassert>
+#include <numeric>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+using namespace sycl;
+
+// number of atomic operations
+constexpr size_t N = 512;
+
+int main() {
+
+  // Note that this code will largely be removed: it is temporary due to the
+  // temporary lack of multiple devices per sycl context in the Nvidia backend.
+  // A portable implementation, using a single gpu platform, should be possible
+  // once the Nvidia context issues are resolved.
+  ////////////////////////////////////////////////////////////////////////
+  std::vector<sycl::device> Devs;
+  for (const auto &plt : sycl::platform::get_platforms()) {
+
+    if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
+      Devs.push_back(plt.get_devices()[0]);
+  }
+  if (Devs.size() < 2) {
+    std::cout << "Cannot test P2P capabilities, at least two devices are "
+                 "required, exiting."
+              << std::endl;
+    return 0;
+  }
+
+  std::vector<sycl::queue> Queues;
+  std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues),
+                 [](const sycl::device &D) { return sycl::queue{D}; });
+  ////////////////////////////////////////////////////////////////////////
+
+  if (!Devs[1].ext_oneapi_can_access_peer(
+          Devs[0], sycl::ext::oneapi::peer_access::atomics_supported)) {
+    std::cout << "P2P atomics are not supported by devices, exiting."
+              << std::endl;
+    return 0;
+  }
+
+  // Enables Devs[1] to access Devs[0] memory.
+  Devs[1].ext_oneapi_enable_peer_access(Devs[0]);
+
+  std::vector<double> input(N);
+  std::iota(input.begin(), input.end(), 0);
+
+  double h_sum = 0.;
+  for (const auto &value : input) {
+    h_sum += value;
+  }
+
+  double *d_sum = malloc_shared<double>(1, Queues[0]);
+  double *d_in = malloc_device<double>(N, Queues[0]);
+
+  Queues[0].memcpy(d_in, &input[0], N * sizeof(double));
+  Queues[0].wait();
+
+  range global_range{N};
+
+  *d_sum = 0.;
+  Queues[1].submit([&](handler &h) {
+    h.parallel_for<class peer_atomic>(global_range, [=](id<1> i) {
+      sycl::atomic_ref<double, sycl::memory_order::relaxed,
+                       sycl::memory_scope::system,
+                       access::address_space::global_space>(*d_sum) += d_in[i];
+    });
+  });
+  Queues[1].wait();
+
+  assert(*d_sum == h_sum);
+
+  free(d_sum, Queues[0]);
+  free(d_in, Queues[0]);
+
+  return 0;
+}
+
+// CHECK: ---> piextPeerAccessGetInfo(
+// CHECK: ---> piextEnablePeerAccess(
diff --git a/SYCL/USM/P2P/p2p_copy.cpp b/SYCL/USM/P2P/p2p_copy.cpp
new file mode 100644
index 0000000000..09d62021ed
--- /dev/null
+++ b/SYCL/USM/P2P/p2p_copy.cpp
@@ -0,0 +1,78 @@
+// REQUIRES: cuda
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %t.out
+
+#include <cassert>
+#include <numeric>
+#include <sycl/sycl.hpp>
+#include <vector>
+
+using namespace sycl;
+
+// Array size to copy
+constexpr int N = 100;
+
+int main() {
+
+  // Note that this code will largely be removed: it is temporary due to the
+  // temporary lack of multiple devices per sycl context in the Nvidia backend.
+  // A portable implementation, using a single gpu platform, should be possible
+  // once the Nvidia context issues are resolved.
+  ////////////////////////////////////////////////////////////////////////
+  std::vector<sycl::device> Devs;
+  for (const auto &plt : sycl::platform::get_platforms()) {
+
+    if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
+      Devs.push_back(plt.get_devices()[0]);
+  }
+  if (Devs.size() < 2) {
+    std::cout << "Cannot test P2P capabilities, at least two devices are "
+                 "required, exiting."
+              << std::endl;
+    return 0;
+  }
+
+  std::vector<sycl::queue> Queues;
+  std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues),
+                 [](const sycl::device &D) { return sycl::queue{D}; });
+  ////////////////////////////////////////////////////////////////////////
+
+  if (!Devs[0].ext_oneapi_can_access_peer(
+          Devs[1], sycl::ext::oneapi::peer_access::access_supported)) {
+    std::cout << "P2P access is not supported by devices, exiting."
+              << std::endl;
+    return 0;
+  }
+
+  // Enables Devs[0] to access Devs[1] memory.
+  Devs[0].ext_oneapi_enable_peer_access(Devs[1]);
+
+  std::vector<int> input(N);
+  std::iota(input.begin(), input.end(), 0);
+
+  int *arr0 = malloc<int>(N, Queues[0], usm::alloc::device);
+  Queues[0].memcpy(arr0, &input[0], N * sizeof(int));
+
+  int *arr1 = malloc<int>(N, Queues[1], usm::alloc::device);
+  // P2P copy performed here:
+  Queues[1].copy(arr0, arr1, N).wait();
+
+  int out[N];
+  Queues[1].copy(arr1, out, N).wait();
+
+  sycl::free(arr0, Queues[0]);
+  sycl::free(arr1, Queues[1]);
+
+  bool ok = true;
+  for (int i = 0; i < N; i++) {
+    if (out[i] != input[i]) {
+      printf("%d %d\n", out[i], input[i]);
+      ok = false;
+      break;
+    }
+  }
+
+  printf("%s\n", ok ? "PASS" : "FAIL");
+
+  return 0;
+}