diff --git a/SYCL/USM/P2P/p2p_access.cpp b/SYCL/USM/P2P/p2p_access.cpp new file mode 100644 index 0000000000..22e351aec3 --- /dev/null +++ b/SYCL/USM/P2P/p2p_access.cpp @@ -0,0 +1,75 @@ +// REQUIRES: cuda +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER + +#include +#include + +using namespace sycl; + +int main() { + + // Note that this code will largely be removed: it is temporary due to the + // temporary lack of multiple devices per sycl context in the Nvidia backend. + // A portable implementation, using a single gpu platform, should be possible + // once the Nvidia context issues are resolved. + //////////////////////////////////////////////////////////////////////// + std::vector Devs; + for (const auto &plt : sycl::platform::get_platforms()) { + + if (plt.get_backend() == sycl::backend::ext_oneapi_cuda) + Devs.push_back(plt.get_devices()[0]); + } + if (Devs.size() < 2) { + std::cout << "Cannot test P2P capabilities, at least two devices are " + "required, exiting." + << std::endl; + return 0; + } + + std::vector Queues; + std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues), + [](const sycl::device &D) { return sycl::queue{D}; }); + //////////////////////////////////////////////////////////////////////// + + if (!Devs[0].ext_oneapi_can_access_peer( + Devs[1], sycl::ext::oneapi::peer_access::access_supported)) { + std::cout << "P2P access is not supported by devices, exiting." + << std::endl; + return 0; + } + + // Enables Devs[0] to access Devs[1] memory. + Devs[0].ext_oneapi_enable_peer_access(Devs[1]); + + auto *arr1 = malloc(2, Queues[1], usm::alloc::device); + + // Calling fill on Devs[1] data with Devs[0] queue requires P2P enabled. + Queues[0].fill(arr1, 2, 2).wait(); + + // Access/write Devs[1] data with Devs[0] queue. + Queues[0] + .submit([&](handler &cgh) { + auto myRange = range<1>(1); + auto myKernel = ([=](id<1> idx) { arr1[0] *= 2; }); + + cgh.parallel_for(myRange, myKernel); + }) + .wait(); + + int2 out; + + Queues[0].memcpy(&out, arr1, 2 * sizeof(int)).wait(); + assert(out[0] == 4); + assert(out[1] == 2); + + sycl::free(arr1, Queues[1]); + + Devs[0].ext_oneapi_disable_peer_access(Devs[1]); + + return 0; +} + +// CHECK: ---> piextPeerAccessGetInfo( +// CHECK: ---> piextEnablePeerAccess( +// CHECK: ---> piextDisablePeerAccess( diff --git a/SYCL/USM/P2P/p2p_atomics.cpp b/SYCL/USM/P2P/p2p_atomics.cpp new file mode 100644 index 0000000000..74651946fb --- /dev/null +++ b/SYCL/USM/P2P/p2p_atomics.cpp @@ -0,0 +1,85 @@ +// REQUIRES: cuda +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -Xsycl-target-backend --cuda-gpu-arch=sm_61 %s -o %t.out +// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out 2>&1 %GPU_CHECK_PLACEHOLDER + +#include +#include +#include +#include + +using namespace sycl; + +// number of atomic operations +constexpr size_t N = 512; + +int main() { + + // Note that this code will largely be removed: it is temporary due to the + // temporary lack of multiple devices per sycl context in the Nvidia backend. + // A portable implementation, using a single gpu platform, should be possible + // once the Nvidia context issues are resolved. + //////////////////////////////////////////////////////////////////////// + std::vector Devs; + for (const auto &plt : sycl::platform::get_platforms()) { + + if (plt.get_backend() == sycl::backend::ext_oneapi_cuda) + Devs.push_back(plt.get_devices()[0]); + } + if (Devs.size() < 2) { + std::cout << "Cannot test P2P capabilities, at least two devices are " + "required, exiting." + << std::endl; + return 0; + } + + std::vector Queues; + std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues), + [](const sycl::device &D) { return sycl::queue{D}; }); + //////////////////////////////////////////////////////////////////////// + + if (!Devs[1].ext_oneapi_can_access_peer( + Devs[0], sycl::ext::oneapi::peer_access::atomics_supported)) { + std::cout << "P2P atomics are not supported by devices, exiting." + << std::endl; + return 0; + } + + // Enables Devs[1] to access Devs[0] memory. + Devs[1].ext_oneapi_enable_peer_access(Devs[0]); + + std::vector input(N); + std::iota(input.begin(), input.end(), 0); + + double h_sum = 0.; + for (const auto &value : input) { + h_sum += value; + } + + double *d_sum = malloc_shared(1, Queues[0]); + double *d_in = malloc_device(N, Queues[0]); + + Queues[0].memcpy(d_in, &input[0], N * sizeof(double)); + Queues[0].wait(); + + range global_range{N}; + + *d_sum = 0.; + Queues[1].submit([&](handler &h) { + h.parallel_for(global_range, [=](id<1> i) { + sycl::atomic_ref(*d_sum) += d_in[i]; + }); + }); + Queues[1].wait(); + + assert(*d_sum == h_sum); + + free(d_sum, Queues[0]); + free(d_in, Queues[0]); + + return 0; +} + +// CHECK: ---> piextPeerAccessGetInfo( +// CHECK: ---> piextEnablePeerAccess( diff --git a/SYCL/USM/P2P/p2p_copy.cpp b/SYCL/USM/P2P/p2p_copy.cpp new file mode 100644 index 0000000000..09d62021ed --- /dev/null +++ b/SYCL/USM/P2P/p2p_copy.cpp @@ -0,0 +1,78 @@ +// REQUIRES: cuda +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %t.out + +#include +#include +#include +#include + +using namespace sycl; + +// Array size to copy +constexpr int N = 100; + +int main() { + + // Note that this code will largely be removed: it is temporary due to the + // temporary lack of multiple devices per sycl context in the Nvidia backend. + // A portable implementation, using a single gpu platform, should be possible + // once the Nvidia context issues are resolved. + //////////////////////////////////////////////////////////////////////// + std::vector Devs; + for (const auto &plt : sycl::platform::get_platforms()) { + + if (plt.get_backend() == sycl::backend::ext_oneapi_cuda) + Devs.push_back(plt.get_devices()[0]); + } + if (Devs.size() < 2) { + std::cout << "Cannot test P2P capabilities, at least two devices are " + "required, exiting." + << std::endl; + return 0; + } + + std::vector Queues; + std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues), + [](const sycl::device &D) { return sycl::queue{D}; }); + //////////////////////////////////////////////////////////////////////// + + if (!Devs[0].ext_oneapi_can_access_peer( + Devs[1], sycl::ext::oneapi::peer_access::access_supported)) { + std::cout << "P2P access is not supported by devices, exiting." + << std::endl; + return 0; + } + + // Enables Devs[0] to access Devs[1] memory. + Devs[0].ext_oneapi_enable_peer_access(Devs[1]); + + std::vector input(N); + std::iota(input.begin(), input.end(), 0); + + int *arr0 = malloc(N, Queues[0], usm::alloc::device); + Queues[0].memcpy(arr0, &input[0], N * sizeof(int)); + + int *arr1 = malloc(N, Queues[1], usm::alloc::device); + // P2P copy performed here: + Queues[1].copy(arr0, arr1, N).wait(); + + int out[N]; + Queues[1].copy(arr1, out, N).wait(); + + sycl::free(arr0, Queues[0]); + sycl::free(arr1, Queues[1]); + + bool ok = true; + for (int i = 0; i < N; i++) { + if (out[i] != input[i]) { + printf("%d %d\n", out[i], input[i]); + ok = false; + break; + } + } + + printf("%s\n", ok ? "PASS" : "FAIL"); + + return 0; +}