diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_peer_access.asciidoc b/sycl/doc/extensions/supported/sycl_ext_oneapi_peer_access.asciidoc similarity index 92% rename from sycl/doc/extensions/proposed/sycl_ext_oneapi_peer_access.asciidoc rename to sycl/doc/extensions/supported/sycl_ext_oneapi_peer_access.asciidoc index e31f974e38bb4..b089d9b853cbb 100644 --- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_peer_access.asciidoc +++ b/sycl/doc/extensions/supported/sycl_ext_oneapi_peer_access.asciidoc @@ -36,18 +36,19 @@ https://github.com/intel/llvm/issues == Dependencies -This extension is written against the SYCL 2020 revision 6 specification. All +This extension is written against the SYCL 2020 revision 7 specification. All references below to the "core SYCL specification" or to section numbers in the SYCL specification refer to that revision. == Status -This is a proposed extension specification, intended to gather community -feedback. Interfaces defined in this specification may not be implemented yet -or may be in a preliminary state. The specification itself may also change in -incompatible ways before it is finalized. *Shipping software products should -not rely on APIs defined in this specification.* +This extension is implemented and fully supported by DPC++. +== Backend support status + +This extension is currently implemented in DPC++ for all GPU devices and +backends, however, only the CUDA backend allows peer to peer memory access. +Other backends report false from the `ext_oneapi_can_access_peer` query. == Overview diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def index 0880b3c0788ba..5f92d495963ca 100644 --- a/sycl/include/sycl/detail/pi.def +++ b/sycl/include/sycl/detail/pi.def @@ -145,7 +145,6 @@ _PI_API(piPluginGetLastError) _PI_API(piTearDown) - _PI_API(piextUSMEnqueueFill2D) _PI_API(piextUSMEnqueueMemset2D) _PI_API(piextUSMEnqueueMemcpy2D) @@ -158,6 +157,10 @@ _PI_API(piextEnqueueDeviceGlobalVariableRead) _PI_API(piPluginGetBackendOption) +_PI_API(piextEnablePeerAccess) +_PI_API(piextDisablePeerAccess) +_PI_API(piextPeerAccessGetInfo) + // command-buffer Extension _PI_API(piextCommandBufferCreate) _PI_API(piextCommandBufferRetain) diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h index 7cdef9f980730..c029bf3fa0391 100644 --- a/sycl/include/sycl/detail/pi.h +++ b/sycl/include/sycl/detail/pi.h @@ -97,9 +97,11 @@ // 14.33 Added new parameter (memory object properties) to // piextKernelSetArgMemObj // 14.34 Added command-buffer extension methods +// 14.35 Added piextEnablePeerAccess, piextDisablePeerAccess, +// piextPeerAccessGetInfo, and pi_peer_attr enum. #define _PI_H_VERSION_MAJOR 14 -#define _PI_H_VERSION_MINOR 34 +#define _PI_H_VERSION_MINOR 35 #define _PI_STRING_HELPER(a) #a #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b) @@ -1030,7 +1032,17 @@ using pi_image_desc = _pi_image_desc; typedef enum { PI_MEM_CONTEXT = 0x1106, PI_MEM_SIZE = 0x1102 } _pi_mem_info; +typedef enum { + PI_PEER_ACCESS_SUPPORTED = + 0x0, ///< returns a uint32_t: 1 if P2P Access is supported + ///< otherwise P2P Access is not supported. + PI_PEER_ATOMICS_SUPPORTED = + 0x1 ///< returns a uint32_t: 1 if Atomic operations are supported over the + ///< P2P link, otherwise such operations are not supported. +} _pi_peer_attr; + using pi_mem_info = _pi_mem_info; +using pi_peer_attr = _pi_peer_attr; // // Following section contains SYCL RT Plugin Interface (PI) functions. @@ -1088,6 +1100,14 @@ __SYCL_EXPORT pi_result piDevicesGet(pi_platform platform, pi_uint32 num_entries, pi_device *devices, pi_uint32 *num_devices); +__SYCL_EXPORT pi_result piextEnablePeerAccess(pi_device command_device, + pi_device peer_device); +__SYCL_EXPORT pi_result piextDisablePeerAccess(pi_device command_device, + pi_device peer_device); +__SYCL_EXPORT pi_result piextPeerAccessGetInfo( + pi_device command_device, pi_device peer_device, pi_peer_attr attr, + size_t param_value_size, void *param_value, size_t *param_value_size_ret); + /// Returns requested info for provided native device /// Return PI_DEVICE_INFO_EXTENSION_DEVICELIB_ASSERT for /// PI_DEVICE_INFO_EXTENSIONS query when the device supports native asserts diff --git a/sycl/include/sycl/detail/pi.hpp b/sycl/include/sycl/detail/pi.hpp index 92c2019de7066..c954ce37510a3 100644 --- a/sycl/include/sycl/detail/pi.hpp +++ b/sycl/include/sycl/detail/pi.hpp @@ -155,6 +155,7 @@ using PiKernelCacheConfig = ::pi_kernel_cache_config; using PiExtSyncPoint = ::pi_ext_sync_point; using PiExtCommandBuffer = ::pi_ext_command_buffer; using PiExtCommandBufferDesc = ::pi_ext_command_buffer_desc; +using PiPeerAttr = ::pi_peer_attr; __SYCL_EXPORT void contextSetExtendedDeleter(const sycl::context &constext, pi_context_extended_deleter func, diff --git a/sycl/include/sycl/device.hpp b/sycl/include/sycl/device.hpp index cc3fecac2786c..d8361a9fcdaf4 100644 --- a/sycl/include/sycl/device.hpp +++ b/sycl/include/sycl/device.hpp @@ -41,6 +41,12 @@ enum class aspect; namespace ext::oneapi { // Forward declaration class filter_selector; + +enum class peer_access { + access_supported = 0x0, + atomics_supported = 0x1, +}; + } // namespace ext::oneapi /// The SYCL device class encapsulates a single SYCL device on which kernels @@ -90,6 +96,13 @@ class __SYCL_EXPORT device : public detail::OwnerLessBase { device &operator=(device &&rhs) = default; + void ext_oneapi_enable_peer_access(const device &peer); + void ext_oneapi_disable_peer_access(const device &peer); + bool + ext_oneapi_can_access_peer(const device &peer, + ext::oneapi::peer_access value = + ext::oneapi::peer_access::access_supported); + /// Get instance of device /// /// \return a valid cl_device_id instance in accordance with the requirements diff --git a/sycl/plugins/cuda/CMakeLists.txt b/sycl/plugins/cuda/CMakeLists.txt index e25856515f2f5..99f6f601ee46a 100644 --- a/sycl/plugins/cuda/CMakeLists.txt +++ b/sycl/plugins/cuda/CMakeLists.txt @@ -81,6 +81,7 @@ add_sycl_plugin(cuda "../unified_runtime/ur/adapters/cuda/usm.cpp" "../unified_runtime/ur/adapters/cuda/command_buffer.hpp" "../unified_runtime/ur/adapters/cuda/command_buffer.cpp" + "../unified_runtime/ur/adapters/cuda/usm_p2p.cpp" # --- "${sycl_inc_dir}/sycl/detail/pi.h" "${sycl_inc_dir}/sycl/detail/pi.hpp" diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index 074959f91c4d4..010ddd6333822 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -198,7 +198,10 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piextCommandBufferMemBufferCopyRect, pi2ur::piextCommandBufferMemBufferCopyRect) _PI_CL(piextEnqueueCommandBuffer, pi2ur::piextEnqueueCommandBuffer) - + // Peer to Peer + _PI_CL(piextEnablePeerAccess, pi2ur::piextEnablePeerAccess) + _PI_CL(piextDisablePeerAccess, pi2ur::piextDisablePeerAccess) + _PI_CL(piextPeerAccessGetInfo, pi2ur::piextPeerAccessGetInfo) #undef _PI_CL return PI_SUCCESS; diff --git a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp index c390102c2131f..ea0303da1562b 100644 --- a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp +++ b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp @@ -2241,6 +2241,43 @@ pi_result piPluginInit(pi_plugin *PluginInit) { return PI_SUCCESS; } +pi_result piextEnablePeerAccess(pi_device command_device, + pi_device peer_device) { + + std::ignore = command_device; + std::ignore = peer_device; + + setErrorMessage("piextEnablePeerAccess not " + "implemented in esimd_emulator backend", + PI_ERROR_PLUGIN_SPECIFIC_ERROR); + return PI_ERROR_PLUGIN_SPECIFIC_ERROR; +} + +pi_result piextDisablePeerAccess(pi_device command_device, + pi_device peer_device) { + + std::ignore = command_device; + std::ignore = peer_device; + + setErrorMessage("piextDisablePeerAccess not " + "implemented in esimd_emulator backend", + PI_ERROR_PLUGIN_SPECIFIC_ERROR); + return PI_ERROR_PLUGIN_SPECIFIC_ERROR; +} + +pi_result piextPeerAccessGetInfo(pi_device command_device, + pi_device peer_device, pi_peer_attr attr, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + std::ignore = command_device; + std::ignore = peer_device; + std::ignore = attr; + + ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet); + // Zero return value indicates that all of the queries currently return false. + return ReturnValue(pi_int32{0}); +} + #ifdef _WIN32 #define __SYCL_PLUGIN_DLL_NAME "pi_esimd_emulator.dll" #include "../common_win_pi_trace/common_win_pi_trace.hpp" diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp index 664e6e889a1f9..83e512d8b1def 100644 --- a/sycl/plugins/hip/pi_hip.cpp +++ b/sycl/plugins/hip/pi_hip.cpp @@ -5857,6 +5857,42 @@ pi_result hip_piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, return PI_SUCCESS; } +pi_result hip_piextEnablePeerAccess(pi_device command_device, + pi_device peer_device) { + + std::ignore = command_device; + std::ignore = peer_device; + + setErrorMessage("piextEnablePeerAccess not " + "implemented in hip backend", + PI_ERROR_PLUGIN_SPECIFIC_ERROR); + return PI_ERROR_PLUGIN_SPECIFIC_ERROR; +} + +pi_result hip_piextDisablePeerAccess(pi_device command_device, + pi_device peer_device) { + + std::ignore = command_device; + std::ignore = peer_device; + + setErrorMessage("piextDisablePeerAccess not " + "implemented in hip backend", + PI_ERROR_PLUGIN_SPECIFIC_ERROR); + return PI_ERROR_PLUGIN_SPECIFIC_ERROR; +} + +pi_result hip_piextPeerAccessGetInfo(pi_device command_device, + pi_device peer_device, pi_peer_attr attr, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + std::ignore = command_device; + std::ignore = peer_device; + std::ignore = attr; + // Zero return value indicates that all of the queries currently return false. + return getInfo(param_value_size, param_value, param_value_size_ret, + pi_int32{0}); +} + const char SupportedVersion[] = _PI_HIP_PLUGIN_VERSION_STRING; pi_result piPluginInit(pi_plugin *PluginInit) { @@ -6028,6 +6064,11 @@ pi_result piPluginInit(pi_plugin *PluginInit) { _PI_CL(piGetDeviceAndHostTimer, hip_piGetDeviceAndHostTimer) _PI_CL(piPluginGetBackendOption, hip_piPluginGetBackendOption) + // Peer to Peer + _PI_CL(piextEnablePeerAccess, hip_piextEnablePeerAccess) + _PI_CL(piextDisablePeerAccess, hip_piextDisablePeerAccess) + _PI_CL(piextPeerAccessGetInfo, hip_piextPeerAccessGetInfo) + #undef _PI_CL return PI_SUCCESS; diff --git a/sycl/plugins/level_zero/CMakeLists.txt b/sycl/plugins/level_zero/CMakeLists.txt index 995989eaf4c65..8c5a0d4f92c43 100755 --- a/sycl/plugins/level_zero/CMakeLists.txt +++ b/sycl/plugins/level_zero/CMakeLists.txt @@ -127,6 +127,7 @@ add_sycl_plugin(level_zero "../unified_runtime/ur/adapters/level_zero/queue.cpp" "../unified_runtime/ur/adapters/level_zero/sampler.cpp" "../unified_runtime/ur/adapters/level_zero/usm.cpp" + "../unified_runtime/ur/adapters/level_zero/usm_p2p.cpp" # Following are the PI Level-Zero Plugin only codes. "pi_level_zero.cpp" "pi_level_zero.hpp" diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 88891bc585f68..4ce400a3768b4 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -1226,6 +1226,28 @@ pi_result piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime, return pi2ur::piGetDeviceAndHostTimer(Device, DeviceTime, HostTime); } +pi_result piextEnablePeerAccess(pi_device command_device, + pi_device peer_device) { + + return pi2ur::piextEnablePeerAccess(command_device, peer_device); +} + +pi_result piextDisablePeerAccess(pi_device command_device, + pi_device peer_device) { + + return pi2ur::piextDisablePeerAccess(command_device, peer_device); +} + +pi_result piextPeerAccessGetInfo(pi_device command_device, + pi_device peer_device, pi_peer_attr attr, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + + return pi2ur::piextPeerAccessGetInfo(command_device, peer_device, attr, + ParamValueSize, ParamValue, + ParamValueSizeRet); +} + #ifdef _WIN32 #define __SYCL_PLUGIN_DLL_NAME "pi_level_zero.dll" #include "../common_win_pi_trace/common_win_pi_trace.hpp" diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt index f6d27744b99ef..88c631c3b6795 100755 --- a/sycl/plugins/unified_runtime/CMakeLists.txt +++ b/sycl/plugins/unified_runtime/CMakeLists.txt @@ -110,6 +110,7 @@ add_sycl_library("ur_adapter_level_zero" SHARED "ur/adapters/level_zero/queue.cpp" "ur/adapters/level_zero/sampler.cpp" "ur/adapters/level_zero/usm.cpp" + "ur/adapters/level_zero/usm_p2p.cpp" INCLUDE_DIRS ${sycl_inc_dir} LIBRARIES @@ -158,6 +159,7 @@ if ("cuda" IN_LIST SYCL_ENABLE_PLUGINS) "ur/adapters/cuda/usm.cpp" "ur/adapters/cuda/command_buffer.hpp" "ur/adapters/cuda/command_buffer.cpp" + "ur/adapters/cuda/usm_p2p.cpp" INCLUDE_DIRS ${sycl_inc_dir} LIBRARIES diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp index 79b6d44a4d0d8..1c86b1b075f5a 100644 --- a/sycl/plugins/unified_runtime/pi2ur.hpp +++ b/sycl/plugins/unified_runtime/pi2ur.hpp @@ -4322,4 +4322,59 @@ inline pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer, // Command-buffer extension /////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////// +// usm-p2p + +inline pi_result piextEnablePeerAccess(pi_device command_device, + pi_device peer_device) { + auto commandDevice = reinterpret_cast(command_device); + auto peerDevice = reinterpret_cast(peer_device); + + HANDLE_ERRORS(urUsmP2PEnablePeerAccessExp(commandDevice, peerDevice)); + + return PI_SUCCESS; +} + +inline pi_result piextDisablePeerAccess(pi_device command_device, + pi_device peer_device) { + auto commandDevice = reinterpret_cast(command_device); + auto peerDevice = reinterpret_cast(peer_device); + + HANDLE_ERRORS(urUsmP2PDisablePeerAccessExp(commandDevice, peerDevice)); + + return PI_SUCCESS; +} + +inline pi_result +piextPeerAccessGetInfo(pi_device command_device, pi_device peer_device, + pi_peer_attr attr, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) { + auto commandDevice = reinterpret_cast(command_device); + auto peerDevice = reinterpret_cast(peer_device); + + ur_exp_peer_info_t propName; + switch (attr) { + case PI_PEER_ACCESS_SUPPORTED: { + propName = UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORTED; + break; + } + case PI_PEER_ATOMICS_SUPPORTED: { + propName = UR_EXP_PEER_INFO_UR_PEER_ATOMICS_SUPPORTED; + break; + } + default: { + return PI_ERROR_INVALID_VALUE; + } + } + + HANDLE_ERRORS(urUsmP2PPeerAccessGetInfoExp( + commandDevice, peerDevice, propName, param_value_size, param_value, + param_value_size_ret)); + + return PI_SUCCESS; +} + +// usm-p2p +/////////////////////////////////////////////////////////////////////////////// + } // namespace pi2ur diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp index b4680e716ac01..3e6bd8c43dfa5 100644 --- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp +++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp @@ -1101,6 +1101,26 @@ __SYCL_EXPORT pi_result piPluginGetBackendOption(pi_platform platform, backend_option); } +__SYCL_EXPORT pi_result piextEnablePeerAccess(pi_device command_device, + pi_device peer_device) { + + return pi2ur::piextEnablePeerAccess(command_device, peer_device); +} + +__SYCL_EXPORT pi_result piextDisablePeerAccess(pi_device command_device, + pi_device peer_device) { + + return pi2ur::piextDisablePeerAccess(command_device, peer_device); +} + +__SYCL_EXPORT pi_result piextPeerAccessGetInfo( + pi_device command_device, pi_device peer_device, pi_peer_attr attr, + size_t ParamValueSize, void *ParamValue, size_t *ParamValueSizeRet) { + return pi2ur::piextPeerAccessGetInfo(command_device, peer_device, attr, + ParamValueSize, ParamValue, + ParamValueSizeRet); +} + // This interface is not in Unified Runtime currently __SYCL_EXPORT pi_result piTearDown(void *PluginParameter) { return pi2ur::piTearDown(PluginParameter); @@ -1251,6 +1271,11 @@ __SYCL_EXPORT pi_result piPluginInit(pi_plugin *PluginInit) { _PI_API(piSamplerRetain) _PI_API(piSamplerRelease) + // Peer to Peer + _PI_API(piextEnablePeerAccess) + _PI_API(piextDisablePeerAccess) + _PI_API(piextPeerAccessGetInfo) + _PI_API(piextPluginGetOpaqueData) _PI_API(piTearDown) diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp index 86975e5097257..392498973f768 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.cpp @@ -98,6 +98,20 @@ thread_local char ErrorMessage[MaxMessageSize]; ErrorMessageCode = ErrorCode; } +void setPluginSpecificMessage(CUresult cu_res) { + const char *error_string; + const char *error_name; + cuGetErrorName(cu_res, &error_name); + cuGetErrorString(cu_res, &error_string); + char *message = (char *)malloc(strlen(error_string) + strlen(error_name) + 2); + strcpy(message, error_name); + strcat(message, "\n"); + strcat(message, error_string); + + setErrorMessage(message, UR_RESULT_ERROR_ADAPTER_SPECIFIC); + free(message); +} + // Returns plugin specific error and warning messages; common implementation // that can be shared between adapters ur_result_t urGetLastResult(ur_platform_handle_t, const char **ppMessage) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp index 5cfa609018b29..1af46cbef00cd 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/common.hpp @@ -36,6 +36,8 @@ extern thread_local char ErrorMessage[MaxMessageSize]; [[maybe_unused]] void setErrorMessage(const char *pMessage, ur_result_t ErrorCode); +void setPluginSpecificMessage(CUresult cu_res); + /// ------ Error handling, matching OpenCL plugin semantics. namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp index 132c7775bbad5..9c5934c0ae9c1 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/ur_interface_loader.cpp @@ -288,6 +288,19 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( return retVal; } +UR_DLLEXPORT ur_result_t UR_APICALL urGetUsmP2PExpProcAddrTable( + ur_api_version_t version, ur_usm_p2p_exp_dditable_t *pDdiTable) { + auto retVal = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != retVal) { + return retVal; + } + pDdiTable->pfnEnablePeerAccessExp = urUsmP2PEnablePeerAccessExp; + pDdiTable->pfnDisablePeerAccessExp = urUsmP2PDisablePeerAccessExp; + pDdiTable->pfnPeerAccessGetInfoExp = urUsmP2PPeerAccessGetInfoExp; + + return retVal; +} + #if defined(__cplusplus) } // extern "C" #endif diff --git a/sycl/plugins/unified_runtime/ur/adapters/cuda/usm_p2p.cpp b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm_p2p.cpp new file mode 100644 index 0000000000000..b1b0255a94d1e --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/cuda/usm_p2p.cpp @@ -0,0 +1,69 @@ +//===--------- usm_p2p.cpp - CUDA Adapter---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------===// + +#include "common.hpp" +#include "context.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp( + ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) { + + ur_result_t result = UR_RESULT_SUCCESS; + try { + ScopedContext active(commandDevice->getContext()); + UR_CHECK_ERROR(cuCtxEnablePeerAccess(peerDevice->getContext(), 0)); + } catch (ur_result_t err) { + result = err; + } + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PDisablePeerAccessExp( + ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) { + + ur_result_t result = UR_RESULT_SUCCESS; + try { + ScopedContext active(commandDevice->getContext()); + UR_CHECK_ERROR(cuCtxDisablePeerAccess(peerDevice->getContext())); + } catch (ur_result_t err) { + result = err; + } + return result; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( + ur_device_handle_t commandDevice, ur_device_handle_t peerDevice, + ur_exp_peer_info_t propName, size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + int value; + CUdevice_P2PAttribute cu_attr; + try { + ScopedContext active(commandDevice->getContext()); + switch (propName) { + case UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORTED: { + cu_attr = CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED; + break; + } + case UR_EXP_PEER_INFO_UR_PEER_ATOMICS_SUPPORTED: { + cu_attr = CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED; + break; + } + default: { + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + } + + UR_CHECK_ERROR(cuDeviceGetP2PAttribute( + &value, cu_attr, commandDevice->get(), peerDevice->get())); + } catch (ur_result_t err) { + return err; + } + return ReturnValue(value); +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_interface_loader.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_interface_loader.cpp index 5cac5f3b99da5..d56448ca35e12 100644 --- a/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_interface_loader.cpp +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/ur_interface_loader.cpp @@ -330,3 +330,16 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( return retVal; } + +UR_DLLEXPORT ur_result_t UR_APICALL urGetUsmP2PExpProcAddrTable( + ur_api_version_t version, ur_usm_p2p_exp_dditable_t *pDdiTable) { + auto retVal = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != retVal) { + return retVal; + } + pDdiTable->pfnEnablePeerAccessExp = urUsmP2PEnablePeerAccessExp; + pDdiTable->pfnDisablePeerAccessExp = urUsmP2PDisablePeerAccessExp; + pDdiTable->pfnPeerAccessGetInfoExp = urUsmP2PPeerAccessGetInfoExp; + + return retVal; +} diff --git a/sycl/plugins/unified_runtime/ur/adapters/level_zero/usm_p2p.cpp b/sycl/plugins/unified_runtime/ur/adapters/level_zero/usm_p2p.cpp new file mode 100644 index 0000000000000..9ec958e2d3c9c --- /dev/null +++ b/sycl/plugins/unified_runtime/ur/adapters/level_zero/usm_p2p.cpp @@ -0,0 +1,43 @@ +//===----------- usm_p2p.cpp - L0 Adapter ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===-----------------------------------------------------------------===// + +#include "ur_level_zero.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PEnablePeerAccessExp( + ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) { + + std::ignore = commandDevice; + std::ignore = peerDevice; + + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PDisablePeerAccessExp( + ur_device_handle_t commandDevice, ur_device_handle_t peerDevice) { + + std::ignore = commandDevice; + std::ignore = peerDevice; + + urPrint("[UR][L0] %s function not implemented!\n", __FUNCTION__); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUsmP2PPeerAccessGetInfoExp( + ur_device_handle_t commandDevice, ur_device_handle_t peerDevice, + ur_exp_peer_info_t propName, size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { + + std::ignore = commandDevice; + std::ignore = peerDevice; + std::ignore = propName; + + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + // Zero return value indicates that all of the queries currently return false. + return ReturnValue(uint32_t{0}); +} diff --git a/sycl/source/device.cpp b/sycl/source/device.cpp index 9b87b69271743..fdc689e22bd22 100644 --- a/sycl/source/device.cpp +++ b/sycl/source/device.cpp @@ -208,6 +208,52 @@ pi_native_handle device::getNative() const { return impl->getNative(); } bool device::has(aspect Aspect) const { return impl->has(Aspect); } +void device::ext_oneapi_enable_peer_access(const device &peer) { + const sycl::detail::pi::PiDevice Device = impl->getHandleRef(); + const sycl::detail::pi::PiDevice Peer = peer.impl->getHandleRef(); + if (Device != Peer) { + auto Plugin = impl->getPlugin(); + Plugin->call(Device, Peer); + } +} + +void device::ext_oneapi_disable_peer_access(const device &peer) { + const sycl::detail::pi::PiDevice Device = impl->getHandleRef(); + const sycl::detail::pi::PiDevice Peer = peer.impl->getHandleRef(); + if (Device != Peer) { + auto Plugin = impl->getPlugin(); + Plugin->call(Device, Peer); + } +} + +bool device::ext_oneapi_can_access_peer(const device &peer, + ext::oneapi::peer_access attr) { + const sycl::detail::pi::PiDevice Device = impl->getHandleRef(); + const sycl::detail::pi::PiDevice Peer = peer.impl->getHandleRef(); + + if (Device == Peer) { + return true; + } + + size_t returnSize; + int value; + + sycl::detail::pi::PiPeerAttr PiAttr = [&]() { + switch (attr) { + case ext::oneapi::peer_access::access_supported: + return PI_PEER_ACCESS_SUPPORTED; + case ext::oneapi::peer_access::atomics_supported: + return PI_PEER_ATOMICS_SUPPORTED; + } + throw sycl::exception(make_error_code(errc::invalid), + "Unrecognized peer access attribute."); + }(); + auto Plugin = impl->getPlugin(); + Plugin->call( + Device, Peer, PiAttr, sizeof(int), &value, &returnSize); + + return value == 1; +} bool device::ext_oneapi_architecture_is( ext::oneapi::experimental::architecture arch) { return impl->extOneapiArchitectureIs(arch); diff --git a/sycl/test-e2e/USM/P2P/p2p_access.cpp b/sycl/test-e2e/USM/P2P/p2p_access.cpp new file mode 100644 index 0000000000000..d1b3c2f0b2f40 --- /dev/null +++ b/sycl/test-e2e/USM/P2P/p2p_access.cpp @@ -0,0 +1,71 @@ +// REQUIRES: cuda +// RUN: %{build} -o %t.out +// RUN: %if ext_oneapi_cuda %{ %{run} %t.out %} + +#include +#include + +using namespace sycl; + +int main() { + + // Note that this code will largely be removed: it is temporary due to the + // temporary lack of multiple devices per sycl context in the Nvidia backend. + // A portable implementation, using a single gpu platform, should be possible + // once the Nvidia context issues are resolved. + //////////////////////////////////////////////////////////////////////// + std::vector Devs; + for (const auto &plt : sycl::platform::get_platforms()) { + + if (plt.get_backend() == sycl::backend::ext_oneapi_cuda) + Devs.push_back(plt.get_devices()[0]); + } + if (Devs.size() < 2) { + std::cout << "Cannot test P2P capabilities, at least two devices are " + "required, exiting." + << std::endl; + return 0; + } + + std::vector Queues; + std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues), + [](const sycl::device &D) { return sycl::queue{D}; }); + //////////////////////////////////////////////////////////////////////// + + if (!Devs[0].ext_oneapi_can_access_peer( + Devs[1], sycl::ext::oneapi::peer_access::access_supported)) { + std::cout << "P2P access is not supported by devices, exiting." + << std::endl; + return 0; + } + + // Enables Devs[0] to access Devs[1] memory. + Devs[0].ext_oneapi_enable_peer_access(Devs[1]); + + auto *arr1 = malloc(2, Queues[1], usm::alloc::device); + + // Calling fill on Devs[1] data with Devs[0] queue requires P2P enabled. + Queues[0].fill(arr1, 2, 2).wait(); + + // Access/write Devs[1] data with Devs[0] queue. + Queues[0] + .submit([&](handler &cgh) { + auto myRange = range<1>(1); + auto myKernel = ([=](id<1> idx) { arr1[0] *= 2; }); + + cgh.parallel_for(myRange, myKernel); + }) + .wait(); + + int2 out; + + Queues[0].memcpy(&out, arr1, 2 * sizeof(int)).wait(); + assert(out[0] == 4); + assert(out[1] == 2); + + sycl::free(arr1, Queues[1]); + + Devs[0].ext_oneapi_disable_peer_access(Devs[1]); + std::cout << "PASS" << std::endl; + return 0; +} diff --git a/sycl/test-e2e/USM/P2P/p2p_atomics.cpp b/sycl/test-e2e/USM/P2P/p2p_atomics.cpp new file mode 100644 index 0000000000000..2f58c063c3c62 --- /dev/null +++ b/sycl/test-e2e/USM/P2P/p2p_atomics.cpp @@ -0,0 +1,82 @@ +// REQUIRES: cuda +// RUN: %if any-device-is-cuda %{ %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_61 -o %t.out %} +// RUN: %if ext_oneapi_cuda %{ %{run} %t.out %} + +#include +#include +#include +#include + +using namespace sycl; + +// number of atomic operations +constexpr size_t N = 512; + +int main() { + + // Note that this code will largely be removed: it is temporary due to the + // temporary lack of multiple devices per sycl context in the Nvidia backend. + // A portable implementation, using a single gpu platform, should be possible + // once the Nvidia context issues are resolved. + //////////////////////////////////////////////////////////////////////// + std::vector Devs; + for (const auto &plt : sycl::platform::get_platforms()) { + + if (plt.get_backend() == sycl::backend::ext_oneapi_cuda) + Devs.push_back(plt.get_devices()[0]); + } + if (Devs.size() < 2) { + std::cout << "Cannot test P2P capabilities, at least two devices are " + "required, exiting." + << std::endl; + return 0; + } + + std::vector Queues; + std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues), + [](const sycl::device &D) { return sycl::queue{D}; }); + //////////////////////////////////////////////////////////////////////// + + if (!Devs[1].ext_oneapi_can_access_peer( + Devs[0], sycl::ext::oneapi::peer_access::atomics_supported)) { + std::cout << "P2P atomics are not supported by devices, exiting." + << std::endl; + return 0; + } + + // Enables Devs[1] to access Devs[0] memory. + Devs[1].ext_oneapi_enable_peer_access(Devs[0]); + + std::vector input(N); + std::iota(input.begin(), input.end(), 0); + + double h_sum = 0.; + for (const auto &value : input) { + h_sum += value; + } + + double *d_sum = malloc_shared(1, Queues[0]); + double *d_in = malloc_device(N, Queues[0]); + + Queues[0].memcpy(d_in, &input[0], N * sizeof(double)); + Queues[0].wait(); + + range global_range{N}; + + *d_sum = 0.; + Queues[1].submit([&](handler &h) { + h.parallel_for(global_range, [=](id<1> i) { + sycl::atomic_ref(*d_sum) += d_in[i]; + }); + }); + Queues[1].wait(); + + assert(*d_sum == h_sum); + + free(d_sum, Queues[0]); + free(d_in, Queues[0]); + std::cout << "PASS" << std::endl; + return 0; +} diff --git a/sycl/test-e2e/USM/P2P/p2p_copy.cpp b/sycl/test-e2e/USM/P2P/p2p_copy.cpp new file mode 100644 index 0000000000000..99aab3e6c7d25 --- /dev/null +++ b/sycl/test-e2e/USM/P2P/p2p_copy.cpp @@ -0,0 +1,78 @@ +// REQUIRES: cuda +// RUN: %{build} -o %t.out +// RUN: %if ext_oneapi_cuda %{ %{run} %t.out %} + +#include +#include +#include +#include + +using namespace sycl; + +// Array size to copy +constexpr int N = 100; + +int main() { + + // Note that this code will largely be removed: it is temporary due to the + // temporary lack of multiple devices per sycl context in the Nvidia backend. + // A portable implementation, using a single gpu platform, should be possible + // once the Nvidia context issues are resolved. + //////////////////////////////////////////////////////////////////////// + std::vector Devs; + for (const auto &plt : sycl::platform::get_platforms()) { + + if (plt.get_backend() == sycl::backend::ext_oneapi_cuda) + Devs.push_back(plt.get_devices()[0]); + } + if (Devs.size() < 2) { + std::cout << "Cannot test P2P capabilities, at least two devices are " + "required, exiting." + << std::endl; + return 0; + } + + std::vector Queues; + std::transform(Devs.begin(), Devs.end(), std::back_inserter(Queues), + [](const sycl::device &D) { return sycl::queue{D}; }); + //////////////////////////////////////////////////////////////////////// + + if (!Devs[0].ext_oneapi_can_access_peer( + Devs[1], sycl::ext::oneapi::peer_access::access_supported)) { + std::cout << "P2P access is not supported by devices, exiting." + << std::endl; + return 0; + } + + // Enables Devs[0] to access Devs[1] memory. + Devs[0].ext_oneapi_enable_peer_access(Devs[1]); + + std::vector input(N); + std::iota(input.begin(), input.end(), 0); + + int *arr0 = malloc(N, Queues[0], usm::alloc::device); + Queues[0].memcpy(arr0, &input[0], N * sizeof(int)); + + int *arr1 = malloc(N, Queues[1], usm::alloc::device); + // P2P copy performed here: + Queues[1].copy(arr0, arr1, N).wait(); + + int out[N]; + Queues[1].copy(arr1, out, N).wait(); + + sycl::free(arr0, Queues[0]); + sycl::free(arr1, Queues[1]); + + bool ok = true; + for (int i = 0; i < N; i++) { + if (out[i] != input[i]) { + printf("%d %d\n", out[i], input[i]); + ok = false; + break; + } + } + + printf("%s\n", ok ? "PASS" : "FAIL"); + + return 0; +} diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump index dc977f5d51171..daeefdedb3066 100644 --- a/sycl/test/abi/pi_level_zero_symbol_check.dump +++ b/sycl/test/abi/pi_level_zero_symbol_check.dump @@ -136,3 +136,6 @@ piextUSMFree piextUSMGetMemAllocInfo piextUSMHostAlloc piextUSMSharedAlloc +piextEnablePeerAccess +piextDisablePeerAccess +piextPeerAccessGetInfo diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 6eb5ff8d147c7..0b83ae775746f 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -4458,6 +4458,9 @@ _ZNK4sycl3_V16device8get_infoINS0_4info6device8atomic64EEENS0_6detail19is_device _ZNK4sycl3_V16device8get_infoINS0_4info6device8platformEEENS0_6detail19is_device_info_descIT_E11return_typeEv _ZNK4sycl3_V16device8get_infoINS0_4info6device9vendor_idEEENS0_6detail19is_device_info_descIT_E11return_typeEv _ZNK4sycl3_V16device9getNativeEv +_ZN4sycl3_V16device26ext_oneapi_can_access_peerERKS1_NS0_3ext6oneapi11peer_accessE +_ZN4sycl3_V16device29ext_oneapi_enable_peer_accessERKS1_ +_ZN4sycl3_V16device30ext_oneapi_disable_peer_accessERKS1_ _ZNK4sycl3_V16kernel11get_backendEv _ZNK4sycl3_V16kernel11get_contextEv _ZNK4sycl3_V16kernel13getNativeImplEv diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 39ffa2245921f..8c9139bb365c7 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -942,7 +942,10 @@ ?ext_oneapi_architecture_is@device@_V1@sycl@@QEAA_NW4architecture@experimental@oneapi@ext@23@@Z ?ext_oneapi_barrier@handler@_V1@sycl@@QEAAXAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@@Z ?ext_oneapi_barrier@handler@_V1@sycl@@QEAAXXZ +?ext_oneapi_can_access_peer@device@_V1@sycl@@QEAA_NAEBV123@W4peer_access@oneapi@ext@23@@Z +?ext_oneapi_disable_peer_access@device@_V1@sycl@@QEAAXAEBV123@@Z ?ext_oneapi_empty@queue@_V1@sycl@@QEBA_NXZ +?ext_oneapi_enable_peer_access@device@_V1@sycl@@QEAAXAEBV123@@Z ?ext_oneapi_fill2d_impl@handler@_V1@sycl@@AEAAXPEAX_KPEBX111@Z ?ext_oneapi_get_default_context@platform@_V1@sycl@@QEBA?AVcontext@23@XZ ?ext_oneapi_graph@handler@_V1@sycl@@QEAAXV?$command_graph@$00@experimental@oneapi@ext@23@@Z diff --git a/sycl/unittests/Extensions/CMakeLists.txt b/sycl/unittests/Extensions/CMakeLists.txt index 85bcd63e874ed..869fe6ea8e4f2 100644 --- a/sycl/unittests/Extensions/CMakeLists.txt +++ b/sycl/unittests/Extensions/CMakeLists.txt @@ -8,5 +8,6 @@ add_sycl_unittest(ExtensionsTests OBJECT DeviceGlobal.cpp OneAPISubGroupMask.cpp CommandGraph.cpp + USMP2P.cpp ) diff --git a/sycl/unittests/Extensions/USMP2P.cpp b/sycl/unittests/Extensions/USMP2P.cpp new file mode 100644 index 0000000000000..ac44bb6ddd54e --- /dev/null +++ b/sycl/unittests/Extensions/USMP2P.cpp @@ -0,0 +1,85 @@ +//==------------------------- USMP2P.cpp -----------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +int check = 0; + +pi_result redefinedDevicesGet(pi_platform platform, pi_device_type device_type, + pi_uint32 num_entries, pi_device *devices, + pi_uint32 *num_devices) { + if (num_devices) + *num_devices = 2; + if (devices && num_entries > 0) { + devices[0] = reinterpret_cast(1); + devices[1] = reinterpret_cast(2); + } + return PI_SUCCESS; +} + +pi_result redefinedEnablePeerAccess(pi_device command_device, + pi_device peer_device) { + check = 3; + return PI_SUCCESS; +} + +pi_result redefinedDisablePeerAccess(pi_device command_device, + pi_device peer_device) { + check = 4; + return PI_SUCCESS; +} + +pi_result redefinedPeerAccessGetInfo(pi_device command_device, + pi_device peer_device, pi_peer_attr attr, + size_t param_value_size, void *param_value, + size_t *param_value_size_ret) { + + if (param_value) + *static_cast(param_value) = 1; + if (param_value_size_ret) + *param_value_size_ret = sizeof(pi_int32); + + if (attr == PI_PEER_ACCESS_SUPPORTED) { + check = 1; + } else if (attr == PI_PEER_ATOMICS_SUPPORTED) { + check = 2; + } + return PI_SUCCESS; +} + +TEST(USMP2PTest, USMP2PTest) { + + sycl::unittest::PiMock Mock; + + Mock.redefine(redefinedDevicesGet); + Mock.redefine( + redefinedEnablePeerAccess); + Mock.redefine( + redefinedDisablePeerAccess); + Mock.redefine( + redefinedPeerAccessGetInfo); + + sycl::platform Plt = Mock.getPlatform(); + + auto Dev1 = Plt.get_devices()[0]; + auto Dev2 = Plt.get_devices()[1]; + + ASSERT_TRUE(Dev1.ext_oneapi_can_access_peer( + Dev2, sycl::ext::oneapi::peer_access::access_supported)); + ASSERT_EQ(check, 1); + ASSERT_TRUE(Dev1.ext_oneapi_can_access_peer( + Dev2, sycl::ext::oneapi::peer_access::atomics_supported)); + ASSERT_EQ(check, 2); + + Dev1.ext_oneapi_enable_peer_access(Dev2); + ASSERT_EQ(check, 3); + Dev1.ext_oneapi_disable_peer_access(Dev2); + ASSERT_EQ(check, 4); +} diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp index fb62e3d4ffb67..a3285dc602fb1 100644 --- a/sycl/unittests/helpers/PiMockPlugin.hpp +++ b/sycl/unittests/helpers/PiMockPlugin.hpp @@ -1288,3 +1288,25 @@ inline pi_result mock_piextEnqueueWriteHostPipe( *event = createDummyHandle(); return PI_SUCCESS; } + +inline pi_result mock_piextEnablePeerAccess(pi_device command_device, + pi_device peer_device) { + return PI_SUCCESS; +} + +inline pi_result mock_piextDisablePeerAccess(pi_device command_device, + pi_device peer_device) { + return PI_SUCCESS; +} + +inline pi_result +mock_piextPeerAccessGetInfo(pi_device command_device, pi_device peer_device, + pi_peer_attr attr, size_t param_value_size, + void *param_value, size_t *param_value_size_ret) { + if (param_value) + *static_cast(param_value) = 1; + if (param_value_size_ret) + *param_value_size_ret = sizeof(pi_int32); + + return PI_SUCCESS; +}