diff --git a/include/ur_api.h b/include/ur_api.h index 60d6fc2f70..9feb774315 100644 --- a/include/ur_api.h +++ b/include/ur_api.h @@ -9530,6 +9530,7 @@ typedef enum ur_exp_launch_property_id_t { UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0, ///< The property has no effect UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1, ///< Whether to launch a cooperative kernel UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions + UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation /// @cond UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -9543,10 +9544,12 @@ typedef enum ur_exp_launch_property_id_t { /// _Analogues_ /// - **CUlaunchAttributeValue** typedef union ur_exp_launch_property_value_t { - uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each - ///< value must be a divisor of the corresponding global work-size - ///< dimension (in units of work-group). - int cooperative; ///< [in] non-zero value indicates a cooperative kernel + uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each + ///< value must be a divisor of the corresponding global work-size + ///< dimension (in units of work-group). + int cooperative; ///< [in] non-zero value indicates a cooperative kernel + size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to + ///< allocate } ur_exp_launch_property_value_t; @@ -9587,6 +9590,7 @@ typedef struct ur_exp_launch_property_t { /// + NULL == hQueue /// + NULL == hKernel /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` /// + `NULL == pGlobalWorkSize` /// + `NULL == launchPropList` /// + NULL == pGlobalWorkSize @@ -9615,6 +9619,8 @@ urEnqueueKernelLaunchCustomExp( ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items + const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item const size_t *pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel ///< function @@ -11441,6 +11447,7 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t { ur_queue_handle_t *phQueue; ur_kernel_handle_t *phKernel; uint32_t *pworkDim; + const size_t **ppGlobalWorkOffset; const size_t **ppGlobalWorkSize; const size_t **ppLocalWorkSize; uint32_t *pnumPropsInLaunchPropList; diff --git a/include/ur_ddi.h b/include/ur_ddi.h index 80a0003fca..dbcbb0fc91 100644 --- a/include/ur_ddi.h +++ b/include/ur_ddi.h @@ -1457,6 +1457,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)( uint32_t, const size_t *, const size_t *, + const size_t *, uint32_t, const ur_exp_launch_property_t *, uint32_t, diff --git a/include/ur_print.hpp b/include/ur_print.hpp index 09431d4352..1b8d2350d6 100644 --- a/include/ur_print.hpp +++ b/include/ur_print.hpp @@ -10246,6 +10246,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION"; break; + case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: + os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY"; + break; default: os << "unknown enumerator"; break; @@ -10282,6 +10285,13 @@ inline ur_result_t printUnion( os << (params.cooperative); + break; + case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: + + os << ".workgroup_mem_size = "; + + os << (params.workgroup_mem_size); + break; default: os << ""; @@ -14722,6 +14732,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct os << *(params->pworkDim); + os << ", "; + os << ".pGlobalWorkOffset = "; + + ur::details::printPtr(os, + *(params->ppGlobalWorkOffset)); + os << ", "; os << ".pGlobalWorkSize = "; diff --git a/scripts/core/exp-launch-properties.yml b/scripts/core/exp-launch-properties.yml index 9e66e9ea06..e2c4ef6317 100644 --- a/scripts/core/exp-launch-properties.yml +++ b/scripts/core/exp-launch-properties.yml @@ -29,6 +29,8 @@ etors: desc: "Whether to launch a cooperative kernel" - name: CLUSTER_DIMENSION desc: "work-group cluster dimensions" + - name: WORK_GROUP_MEMORY + desc: "Implicit work group memory allocation" --- #-------------------------------------------------------------------------- type: union desc: "Specifies a launch property value" @@ -45,6 +47,10 @@ members: name: cooperative desc: "[in] non-zero value indicates a cooperative kernel" tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE + - type: size_t + name: workgroup_mem_size + desc: "[in] non-zero value indicates the amount of work group memory to allocate" + tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY --- #-------------------------------------------------------------------------- type: struct desc: "Kernel launch property" @@ -82,6 +88,9 @@ params: - type: uint32_t name: workDim desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items" + - type: "const size_t*" + name: pGlobalWorkOffset + desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item" - type: const size_t* name: pGlobalWorkSize desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function" @@ -97,10 +106,10 @@ params: - type: uint32_t name: numEventsInWaitList desc: "[in] size of the event wait list" - - type: const ur_event_handle_t* + - type: const $x_event_handle_t* name: phEventWaitList desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. " - - type: ur_event_handle_t* + - type: $x_event_handle_t* name: phEvent desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array." returns: diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp index 0e00f680f6..c97487a928 100644 --- a/source/adapters/cuda/enqueue.cpp +++ b/source/adapters/cuda/enqueue.cpp @@ -414,11 +414,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait( phEventWaitList, phEvent); } -UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( - ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { +static ur_result_t +enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, + uint32_t workDim, const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, size_t WorkGroupMemory) { // Preconditions UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(), UR_RESULT_ERROR_INVALID_KERNEL); @@ -436,6 +438,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; size_t BlocksPerGrid[3] = {1u, 1u, 1u}; + // Set work group memory so we can compute the whole memory requirement + if (WorkGroupMemory) + hKernel->setWorkGroupMemory(WorkGroupMemory); uint32_t LocalSize = hKernel->getLocalSize(); CUfunction CuFunc = hKernel->get(); @@ -498,6 +503,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, phEvent, 0); +} + UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, @@ -508,8 +523,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE; coop_prop.value.cooperative = 1; return urEnqueueKernelLaunchCustomExp( - hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1, - &coop_prop, numEventsInWaitList, phEventWaitList, phEvent); + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList, + phEvent); } return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, @@ -518,16 +534,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp( UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - if (numPropsInLaunchPropList == 0) { - urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize, - pLocalWorkSize, numEventsInWaitList, phEventWaitList, - phEvent); + size_t WorkGroupMemory = [&]() -> size_t { + const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if( + launchPropList, launchPropList + numPropsInLaunchPropList, + [](const ur_exp_launch_property_t &Prop) { + return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY; + }); + if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList) + return WorkGroupMemoryProp->value.workgroup_mem_size; + return 0; + }(); + + if (numPropsInLaunchPropList == 0 || + (WorkGroupMemory && numPropsInLaunchPropList == 1)) { + return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset, + pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, phEvent, + WorkGroupMemory); } #if CUDA_VERSION >= 11080 // Preconditions @@ -540,7 +569,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( return UR_RESULT_ERROR_INVALID_NULL_POINTER; } - std::vector launch_attribute(numPropsInLaunchPropList); + std::vector launch_attribute; + launch_attribute.reserve(numPropsInLaunchPropList); // Early exit for zero size kernel if (*pGlobalWorkSize == 0) { @@ -553,17 +583,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( size_t ThreadsPerBlock[3] = {32u, 1u, 1u}; size_t BlocksPerGrid[3] = {1u, 1u, 1u}; + // Set work group memory so we can compute the whole memory requirement + if (WorkGroupMemory) + hKernel->setWorkGroupMemory(WorkGroupMemory); uint32_t LocalSize = hKernel->getLocalSize(); CUfunction CuFunc = hKernel->get(); for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) { switch (launchPropList[i].id) { case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: { + launch_attribute.push_back({}); launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE; break; } case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: { - + launch_attribute.push_back({}); launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION; // Note that cuda orders from right to left wrt SYCL dimensional order. if (workDim == 3) { @@ -595,11 +629,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( break; } case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: { + launch_attribute.push_back({}); launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE; launch_attribute[i].value.cooperative = launchPropList[i].value.cooperative; break; } + case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: { + break; + } default: { return UR_RESULT_ERROR_INVALID_ENUMERATION; } @@ -610,8 +648,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( // using the standard UR_CHECK_ERROR if (ur_result_t Ret = setKernelParams(hQueue->getContext(), hQueue->Device, workDim, - nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel, - CuFunc, ThreadsPerBlock, BlocksPerGrid); + pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid); Ret != UR_RESULT_SUCCESS) return Ret; @@ -659,7 +697,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( launch_config.sharedMemBytes = LocalSize; launch_config.hStream = CuStream; launch_config.attrs = &launch_attribute[0]; - launch_config.numAttrs = numPropsInLaunchPropList; + launch_config.numAttrs = launch_attribute.size(); UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc, const_cast(ArgIndices.data()), diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp index 7ad20a4f0e..2f1c511143 100644 --- a/source/adapters/cuda/kernel.hpp +++ b/source/adapters/cuda/kernel.hpp @@ -65,6 +65,8 @@ struct ur_kernel_handle_t_ { args_size_t ParamSizes; args_index_t Indices; args_size_t OffsetPerIndex; + size_t WorkGroupMemory = 0; + // A struct to keep track of memargs so that we can do dependency analysis // at urEnqueueKernelLaunch struct mem_obj_arg { @@ -105,22 +107,28 @@ struct ur_kernel_handle_t_ { OffsetPerIndex[Index] = LocalSize; } - void addLocalArg(size_t Index, size_t Size) { - size_t LocalOffset = this->getLocalSize(); - - // maximum required alignment is the size of the largest vector type - const size_t MaxAlignment = sizeof(double) * 16; + // maximum required alignment is the size of the largest vector type + static constexpr size_t MaxAlignment = sizeof(double) * 16; + static size_t alignMemoryAllocation(size_t Size, size_t Offset) { // for arguments smaller than the maximum alignment simply align to the // size of the argument const size_t Alignment = std::min(MaxAlignment, Size); // align the argument - size_t AlignedLocalOffset = LocalOffset; - size_t Pad = LocalOffset % Alignment; + size_t AlignedLocalOffset = Offset; + size_t Pad = Offset % Alignment; if (Pad != 0) { AlignedLocalOffset += Alignment - Pad; } + return AlignedLocalOffset; + } + + void addLocalArg(size_t Index, size_t Size) { + size_t LocalOffset = this->getLocalSize(); + + // align the argument + size_t AlignedLocalOffset = alignMemoryAllocation(Size, LocalOffset); addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset), Size + (AlignedLocalOffset - LocalOffset)); @@ -140,6 +148,24 @@ struct ur_kernel_handle_t_ { MemObjArgs.push_back(arguments::mem_obj_arg{hMem, Index, Flags}); } + void setWorkGroupMemory(size_t memSize) { + assert(WorkGroupMemory == 0 && + "Work Group Memory size can only be set once"); + // Ensure first offset is MaxAlignment aligned + WorkGroupMemory = alignMemoryAllocation(MaxAlignment, memSize); + + // Adjust local accessor setting + // the dynamic memory will start at offset 0 (allows us to keep access + // local memory as a GV) and accessors will use the rest of the range + for (size_t i = 0; i < OffsetPerIndex.size(); i++) { + // if offset is 0, that's it is not a local accessor argument. + if (!OffsetPerIndex[i]) + continue; + assert(ParamSizes[i] == sizeof(size_t) && "Offset should be a size_t"); + *reinterpret_cast(Indices[i]) += WorkGroupMemory; + } + } + void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) { assert(Size == sizeof(std::uint32_t) * 3); std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size); @@ -147,13 +173,15 @@ struct ur_kernel_handle_t_ { void clearLocalSize() { std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0); + WorkGroupMemory = 0; } const args_index_t &getIndices() const noexcept { return Indices; } uint32_t getLocalSize() const { return std::accumulate(std::begin(OffsetPerIndex), - std::end(OffsetPerIndex), 0); + std::end(OffsetPerIndex), 0) + + WorkGroupMemory; } } Args; @@ -238,6 +266,7 @@ struct ur_kernel_handle_t_ { return Args.getIndices(); } + void setWorkGroupMemory(size_t memSize) { Args.setWorkGroupMemory(memSize); } uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); } void clearLocalSize() { Args.clearLocalSize(); } diff --git a/source/adapters/level_zero/queue.cpp b/source/adapters/level_zero/queue.cpp index c4598f3472..95c8d026a7 100644 --- a/source/adapters/level_zero/queue.cpp +++ b/source/adapters/level_zero/queue.cpp @@ -902,14 +902,15 @@ ur_result_t urQueueFlush( ur_result_t urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { std::ignore = hQueue; std::ignore = hKernel; std::ignore = workDim; + std::ignore = pGlobalWorkOffset; std::ignore = pGlobalWorkSize; std::ignore = pLocalWorkSize; std::ignore = numPropsInLaunchPropList; diff --git a/source/adapters/level_zero/ur_interface_loader.hpp b/source/adapters/level_zero/ur_interface_loader.hpp index 1207f7776b..4d0f0c5b72 100644 --- a/source/adapters/level_zero/ur_interface_loader.hpp +++ b/source/adapters/level_zero/ur_interface_loader.hpp @@ -694,8 +694,8 @@ ur_result_t urEnqueueTimestampRecordingExp( const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); ur_result_t urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); diff --git a/source/adapters/level_zero/v2/queue_api.cpp b/source/adapters/level_zero/v2/queue_api.cpp index ea2e931bfe..bea90a7d4f 100644 --- a/source/adapters/level_zero/v2/queue_api.cpp +++ b/source/adapters/level_zero/v2/queue_api.cpp @@ -317,13 +317,13 @@ ur_result_t urEnqueueTimestampRecordingExp( } ur_result_t urEnqueueKernelLaunchCustomExp( ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, - uint32_t numPropsInLaunchPropList, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { return hQueue->enqueueKernelLaunchCustomExp( - hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, numPropsInLaunchPropList, launchPropList, numEventsInWaitList, phEventWaitList, phEvent); } diff --git a/source/adapters/level_zero/v2/queue_api.hpp b/source/adapters/level_zero/v2/queue_api.hpp index 577f6c5aba..284b21b0ce 100644 --- a/source/adapters/level_zero/v2/queue_api.hpp +++ b/source/adapters/level_zero/v2/queue_api.hpp @@ -144,9 +144,9 @@ struct ur_queue_handle_t_ { const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t enqueueKernelLaunchCustomExp( - ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, uint32_t, - const ur_exp_launch_property_t *, uint32_t, const ur_event_handle_t *, - ur_event_handle_t *) = 0; + ur_kernel_handle_t, uint32_t, const size_t *, const size_t *, + const size_t *, uint32_t, const ur_exp_launch_property_t *, uint32_t, + const ur_event_handle_t *, ur_event_handle_t *) = 0; virtual ur_result_t enqueueNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, uint32_t, const ur_mem_handle_t *, diff --git a/source/adapters/mock/ur_mockddi.cpp b/source/adapters/mock/ur_mockddi.cpp index dea28a4658..f77e5d0182 100644 --- a/source/adapters/mock/ur_mockddi.cpp +++ b/source/adapters/mock/ur_mockddi.cpp @@ -10126,6 +10126,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel @@ -10153,11 +10156,17 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( ur_result_t result = UR_RESULT_SUCCESS; ur_enqueue_kernel_launch_custom_exp_params_t params = { - &hQueue, &hKernel, - &workDim, &pGlobalWorkSize, - &pLocalWorkSize, &numPropsInLaunchPropList, - &launchPropList, &numEventsInWaitList, - &phEventWaitList, &phEvent}; + &hQueue, + &hKernel, + &workDim, + &pGlobalWorkOffset, + &pGlobalWorkSize, + &pLocalWorkSize, + &numPropsInLaunchPropList, + &launchPropList, + &numEventsInWaitList, + &phEventWaitList, + &phEvent}; auto beforeCallback = reinterpret_cast( mock::getCallbacks().get_before_callback( @@ -10176,6 +10185,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( result = replaceCallback(¶ms); } else { + // optional output handle + if (phEvent) { + *phEvent = mock::createDummyHandle(); + } result = UR_RESULT_SUCCESS; } diff --git a/source/loader/layers/tracing/ur_trcddi.cpp b/source/loader/layers/tracing/ur_trcddi.cpp index 9cc18c66c4..32a3622953 100644 --- a/source/loader/layers/tracing/ur_trcddi.cpp +++ b/source/loader/layers/tracing/ur_trcddi.cpp @@ -8698,6 +8698,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel @@ -8730,11 +8733,17 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( } ur_enqueue_kernel_launch_custom_exp_params_t params = { - &hQueue, &hKernel, - &workDim, &pGlobalWorkSize, - &pLocalWorkSize, &numPropsInLaunchPropList, - &launchPropList, &numEventsInWaitList, - &phEventWaitList, &phEvent}; + &hQueue, + &hKernel, + &workDim, + &pGlobalWorkOffset, + &pGlobalWorkSize, + &pLocalWorkSize, + &numPropsInLaunchPropList, + &launchPropList, + &numEventsInWaitList, + &phEventWaitList, + &phEvent}; uint64_t instance = getContext()->notify_begin(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP, "urEnqueueKernelLaunchCustomExp", ¶ms); @@ -8743,9 +8752,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( logger.info(" ---> urEnqueueKernelLaunchCustomExp\n"); ur_result_t result = pfnKernelLaunchCustomExp( - hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, numEventsInWaitList, - phEventWaitList, phEvent); + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); getContext()->notify_end(UR_FUNCTION_ENQUEUE_KERNEL_LAUNCH_CUSTOM_EXP, "urEnqueueKernelLaunchCustomExp", ¶ms, &result, diff --git a/source/loader/layers/validation/ur_valddi.cpp b/source/loader/layers/validation/ur_valddi.cpp index fdfce7951b..9dcc91ef63 100644 --- a/source/loader/layers/validation/ur_valddi.cpp +++ b/source/loader/layers/validation/ur_valddi.cpp @@ -9726,6 +9726,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel @@ -9766,6 +9769,10 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } + if (NULL == pGlobalWorkOffset) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + if (NULL == pGlobalWorkSize) { return UR_RESULT_ERROR_INVALID_NULL_POINTER; } @@ -9794,9 +9801,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( } ur_result_t result = pfnKernelLaunchCustomExp( - hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, numEventsInWaitList, - phEventWaitList, phEvent); + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); return result; } diff --git a/source/loader/ur_ldrddi.cpp b/source/loader/ur_ldrddi.cpp index a67879a9eb..7facb5982d 100644 --- a/source/loader/ur_ldrddi.cpp +++ b/source/loader/ur_ldrddi.cpp @@ -8866,6 +8866,9 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel @@ -8908,11 +8911,35 @@ __urdlllocal ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( // convert loader handle to platform handle hKernel = reinterpret_cast(hKernel)->handle; + // convert loader handles to platform handles + auto phEventWaitListLocal = + std::vector(numEventsInWaitList); + for (size_t i = 0; i < numEventsInWaitList; ++i) { + phEventWaitListLocal[i] = + reinterpret_cast(phEventWaitList[i])->handle; + } + // forward to device-platform - result = pfnKernelLaunchCustomExp(hQueue, hKernel, workDim, pGlobalWorkSize, - pLocalWorkSize, numPropsInLaunchPropList, - launchPropList, numEventsInWaitList, - phEventWaitList, phEvent); + result = pfnKernelLaunchCustomExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitListLocal.data(), phEvent); + + // In the event of ERROR_ADAPTER_SPECIFIC we should still attempt to wrap any output handles below. + if (UR_RESULT_SUCCESS != result && + UR_RESULT_ERROR_ADAPTER_SPECIFIC != result) { + return result; + } + try { + // convert platform handle to loader handle + if (nullptr != phEvent) { + *phEvent = reinterpret_cast( + context->factories.ur_event_factory.getInstance(*phEvent, + dditable)); + } + } catch (std::bad_alloc &) { + result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } return result; } diff --git a/source/loader/ur_libapi.cpp b/source/loader/ur_libapi.cpp index 9a8e4c2e12..91af0c22c7 100644 --- a/source/loader/ur_libapi.cpp +++ b/source/loader/ur_libapi.cpp @@ -8992,6 +8992,7 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( /// + NULL == hQueue /// + NULL == hKernel /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` /// + `NULL == pGlobalWorkSize` /// + `NULL == launchPropList` /// + NULL == pGlobalWorkSize @@ -9020,6 +9021,9 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel @@ -9050,10 +9054,10 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( return UR_RESULT_ERROR_UNINITIALIZED; } - return pfnKernelLaunchCustomExp(hQueue, hKernel, workDim, pGlobalWorkSize, - pLocalWorkSize, numPropsInLaunchPropList, - launchPropList, numEventsInWaitList, - phEventWaitList, phEvent); + return pfnKernelLaunchCustomExp( + hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numPropsInLaunchPropList, launchPropList, + numEventsInWaitList, phEventWaitList, phEvent); } catch (...) { return exceptionToResult(std::current_exception()); } diff --git a/source/ur_api.cpp b/source/ur_api.cpp index 92b02b7176..e805a2f9f0 100644 --- a/source/ur_api.cpp +++ b/source/ur_api.cpp @@ -7625,6 +7625,7 @@ ur_result_t UR_APICALL urEnqueueTimestampRecordingExp( /// + NULL == hQueue /// + NULL == hKernel /// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER +/// + `NULL == pGlobalWorkOffset` /// + `NULL == pGlobalWorkSize` /// + `NULL == launchPropList` /// + NULL == pGlobalWorkSize @@ -7653,6 +7654,9 @@ ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp( uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and ///< work-group work-items + const size_t * + pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the + ///< offset used to calculate the global ID of a work-item const size_t * pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the ///< number of global work-items in workDim that will execute the kernel diff --git a/test/conformance/exp_launch_properties/launch_properties.cpp b/test/conformance/exp_launch_properties/launch_properties.cpp index a54a44ecaf..d5b8f89c98 100644 --- a/test/conformance/exp_launch_properties/launch_properties.cpp +++ b/test/conformance/exp_launch_properties/launch_properties.cpp @@ -95,8 +95,8 @@ TEST_P(urEnqueueKernelLaunchCustomTest, Success) { AddPodArg(val); ASSERT_SUCCESS(urEnqueueKernelLaunchCustomExp( - queue, kernel, n_dimensions, &global_size, nullptr, 1, &props[0], 0, - nullptr, nullptr)); + queue, kernel, n_dimensions, nullptr, &global_size, nullptr, 1, + &props[0], 0, nullptr, nullptr)); ASSERT_SUCCESS(urQueueFinish(queue)); ValidateBuffer(buffer, sizeof(val) * global_size, val); }