Skip to content

Commit

Permalink
Add new launch property to support work_group_scratch_memory
Browse files Browse the repository at this point in the history
intel/llvm#15061 introduces a new property, work_group_scratch_memory, which allows the user to specify the amount of local memory to be used.

In order to pass this information to the adapter, the patch adds a new launch property to urEnqueueKernelLaunchCustomExp.

The patch also changes the signature of urEnqueueKernelLaunchCustomExp to add a global offset parameter, so that global offsets remain supported when using this extension.

Signed-off-by: Victor Lomuller <[email protected]>
  • Loading branch information
Naghasan committed Nov 13, 2024
1 parent cd92e72 commit 55f2d02
Show file tree
Hide file tree
Showing 17 changed files with 233 additions and 68 deletions.
15 changes: 11 additions & 4 deletions include/ur_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -9530,6 +9530,7 @@ typedef enum ur_exp_launch_property_id_t {
UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0, ///< The property has no effect
UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1, ///< Whether to launch a cooperative kernel
UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions
UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation
/// @cond
UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff
/// @endcond
Expand All @@ -9543,10 +9544,12 @@ typedef enum ur_exp_launch_property_id_t {
/// _Analogues_
/// - **CUlaunchAttributeValue**
typedef union ur_exp_launch_property_value_t {
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
///< value must be a divisor of the corresponding global work-size
///< dimension (in units of work-group).
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
///< value must be a divisor of the corresponding global work-size
///< dimension (in units of work-group).
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to
///< allocate

} ur_exp_launch_property_value_t;

Expand Down Expand Up @@ -9587,6 +9590,7 @@ typedef struct ur_exp_launch_property_t {
/// + NULL == hQueue
/// + NULL == hKernel
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
/// + `NULL == pGlobalWorkOffset`
/// + `NULL == pGlobalWorkSize`
/// + `NULL == launchPropList`
/// + NULL == pGlobalWorkSize
Expand Down Expand Up @@ -9615,6 +9619,8 @@ urEnqueueKernelLaunchCustomExp(
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
///< work-group work-items
const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
///< offset used to calculate the global ID of a work-item
const size_t *pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
///< number of global work-items in workDim that will execute the kernel
///< function
Expand Down Expand Up @@ -11441,6 +11447,7 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t {
ur_queue_handle_t *phQueue;
ur_kernel_handle_t *phKernel;
uint32_t *pworkDim;
const size_t **ppGlobalWorkOffset;
const size_t **ppGlobalWorkSize;
const size_t **ppLocalWorkSize;
uint32_t *pnumPropsInLaunchPropList;
Expand Down
1 change: 1 addition & 0 deletions include/ur_ddi.h
Original file line number Diff line number Diff line change
Expand Up @@ -1457,6 +1457,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)(
uint32_t,
const size_t *,
const size_t *,
const size_t *,
uint32_t,
const ur_exp_launch_property_t *,
uint32_t,
Expand Down
16 changes: 16 additions & 0 deletions include/ur_print.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10246,6 +10246,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION:
os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION";
break;
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY";
break;
default:
os << "unknown enumerator";
break;
Expand Down Expand Up @@ -10282,6 +10285,13 @@ inline ur_result_t printUnion(

os << (params.cooperative);

break;
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:

os << ".workgroup_mem_size = ";

os << (params.workgroup_mem_size);

break;
default:
os << "<unknown>";
Expand Down Expand Up @@ -14722,6 +14732,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct

os << *(params->pworkDim);

os << ", ";
os << ".pGlobalWorkOffset = ";

ur::details::printPtr(os,
*(params->ppGlobalWorkOffset));

os << ", ";
os << ".pGlobalWorkSize = ";

Expand Down
13 changes: 11 additions & 2 deletions scripts/core/exp-launch-properties.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ etors:
desc: "Whether to launch a cooperative kernel"
- name: CLUSTER_DIMENSION
desc: "work-group cluster dimensions"
- name: WORK_GROUP_MEMORY
desc: "Implicit work group memory allocation"
--- #--------------------------------------------------------------------------
type: union
desc: "Specifies a launch property value"
Expand All @@ -45,6 +47,10 @@ members:
name: cooperative
desc: "[in] non-zero value indicates a cooperative kernel"
tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE
- type: size_t
name: workgroup_mem_size
desc: "[in] non-zero value indicates the amount of work group memory to allocate"
tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY
--- #--------------------------------------------------------------------------
type: struct
desc: "Kernel launch property"
Expand Down Expand Up @@ -82,6 +88,9 @@ params:
- type: uint32_t
name: workDim
desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items"
- type: "const size_t*"
name: pGlobalWorkOffset
desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item"
- type: const size_t*
name: pGlobalWorkSize
desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function"
Expand All @@ -97,10 +106,10 @@ params:
- type: uint32_t
name: numEventsInWaitList
desc: "[in] size of the event wait list"
- type: const ur_event_handle_t*
- type: const $x_event_handle_t*
name: phEventWaitList
desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. "
- type: ur_event_handle_t*
- type: $x_event_handle_t*
name: phEvent
desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array."
returns:
Expand Down
74 changes: 56 additions & 18 deletions source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -414,11 +414,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
phEventWaitList, phEvent);
}

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
static ur_result_t
enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
uint32_t workDim, const size_t *pGlobalWorkOffset,
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent, size_t WorkGroupMemory) {
// Preconditions
UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
UR_RESULT_ERROR_INVALID_KERNEL);
Expand All @@ -436,6 +438,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};

// Set work group memory so we can compute the whole memory requirement
if (WorkGroupMemory)
hKernel->setWorkGroupMemory(WorkGroupMemory);
uint32_t LocalSize = hKernel->getLocalSize();
CUfunction CuFunc = hKernel->get();

Expand Down Expand Up @@ -498,6 +503,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
return UR_RESULT_SUCCESS;
}

/// Enqueues a kernel launch that requests no extra work-group memory.
///
/// Every argument is forwarded unchanged to the file-local
/// enqueueKernelLaunch helper; only the trailing work-group memory size is
/// supplied here, and it is always zero.
///
/// \return whatever enqueueKernelLaunch returns.
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
    const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
  // The plain launch entry point has no launch properties, hence no
  // work-group memory to reserve.
  constexpr size_t NoWorkGroupMemory = 0;

  const ur_result_t Result = enqueueKernelLaunch(
      hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
      pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent,
      NoWorkGroupMemory);
  return Result;
}

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
Expand All @@ -508,8 +523,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
coop_prop.value.cooperative = 1;
return urEnqueueKernelLaunchCustomExp(
hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1,
&coop_prop, numEventsInWaitList, phEventWaitList, phEvent);
hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList,
phEvent);
}
return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
pGlobalWorkSize, pLocalWorkSize,
Expand All @@ -518,16 +534,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
uint32_t numPropsInLaunchPropList,
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
const ur_exp_launch_property_t *launchPropList,
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
ur_event_handle_t *phEvent) {

if (numPropsInLaunchPropList == 0) {
urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize,
pLocalWorkSize, numEventsInWaitList, phEventWaitList,
phEvent);
size_t WorkGroupMemory = [&]() -> size_t {
const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if(
launchPropList, launchPropList + numPropsInLaunchPropList,
[](const ur_exp_launch_property_t &Prop) {
return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY;
});
if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList)
return WorkGroupMemoryProp->value.workgroup_mem_size;
return 0;
}();

if (numPropsInLaunchPropList == 0 ||
(WorkGroupMemory && numPropsInLaunchPropList == 1)) {
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
pGlobalWorkSize, pLocalWorkSize,
numEventsInWaitList, phEventWaitList, phEvent,
WorkGroupMemory);
}
#if CUDA_VERSION >= 11080
// Preconditions
Expand All @@ -540,7 +569,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
}

std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
std::vector<CUlaunchAttribute> launch_attribute;
launch_attribute.reserve(numPropsInLaunchPropList);

// Early exit for zero size kernel
if (*pGlobalWorkSize == 0) {
Expand All @@ -553,17 +583,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};

// Set work group memory so we can compute the whole memory requirement
if (WorkGroupMemory)
hKernel->setWorkGroupMemory(WorkGroupMemory);
uint32_t LocalSize = hKernel->getLocalSize();
CUfunction CuFunc = hKernel->get();

for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
switch (launchPropList[i].id) {
case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
launch_attribute.push_back({});
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
break;
}
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {

launch_attribute.push_back({});
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
// Note that cuda orders from right to left wrt SYCL dimensional order.
if (workDim == 3) {
Expand Down Expand Up @@ -595,11 +629,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
break;
}
case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
launch_attribute.push_back({});
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
launch_attribute[i].value.cooperative =
launchPropList[i].value.cooperative;
break;
}
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: {
break;
}
default: {
return UR_RESULT_ERROR_INVALID_ENUMERATION;
}
Expand All @@ -610,8 +648,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
// using the standard UR_CHECK_ERROR
if (ur_result_t Ret =
setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel,
CuFunc, ThreadsPerBlock, BlocksPerGrid);
pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
Ret != UR_RESULT_SUCCESS)
return Ret;

Expand Down Expand Up @@ -659,7 +697,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
launch_config.sharedMemBytes = LocalSize;
launch_config.hStream = CuStream;
launch_config.attrs = &launch_attribute[0];
launch_config.numAttrs = numPropsInLaunchPropList;
launch_config.numAttrs = launch_attribute.size();

UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
const_cast<void **>(ArgIndices.data()),
Expand Down
45 changes: 37 additions & 8 deletions source/adapters/cuda/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ struct ur_kernel_handle_t_ {
args_size_t ParamSizes;
args_index_t Indices;
args_size_t OffsetPerIndex;
size_t WorkGroupMemory = 0;

// A struct to keep track of memargs so that we can do dependency analysis
// at urEnqueueKernelLaunch
struct mem_obj_arg {
Expand Down Expand Up @@ -105,22 +107,28 @@ struct ur_kernel_handle_t_ {
OffsetPerIndex[Index] = LocalSize;
}

void addLocalArg(size_t Index, size_t Size) {
size_t LocalOffset = this->getLocalSize();

// maximum required alignment is the size of the largest vector type
const size_t MaxAlignment = sizeof(double) * 16;
// Maximum required alignment is the size of the largest vector type.
static constexpr size_t MaxAlignment = sizeof(double) * 16;

// Rounds Offset up to the alignment required by an allocation of Size bytes.
//
// Allocations smaller than MaxAlignment are aligned to their own size; larger
// ones are aligned to MaxAlignment. A zero-sized allocation imposes no
// alignment, so Offset is returned unchanged in that case.
//
// Size:   size in bytes of the allocation being placed.
// Offset: first free byte offset before alignment.
// Returns the smallest offset >= Offset that is a multiple of the alignment.
static size_t alignMemoryAllocation(size_t Size, size_t Offset) {
  // For arguments smaller than the maximum alignment simply align to the
  // size of the argument.
  const size_t Alignment = std::min(MaxAlignment, Size);

  // Guard against a zero-sized allocation: without it the modulo below would
  // divide by zero.
  if (Alignment == 0) {
    return Offset;
  }

  const size_t Pad = Offset % Alignment;
  return Pad == 0 ? Offset : Offset + (Alignment - Pad);
}

void addLocalArg(size_t Index, size_t Size) {
size_t LocalOffset = this->getLocalSize();

// align the argument
size_t AlignedLocalOffset = alignMemoryAllocation(Size, LocalOffset);

addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
Size + (AlignedLocalOffset - LocalOffset));
Expand All @@ -140,20 +148,40 @@ struct ur_kernel_handle_t_ {
MemObjArgs.push_back(arguments::mem_obj_arg{hMem, Index, Flags});
}

// Reserves memSize bytes of work-group memory in front of the local accessor
// arguments.
//
// The reserved size is rounded up to a multiple of MaxAlignment and stored in
// WorkGroupMemory. The reserved region starts at offset 0 (so it can still be
// accessed as a global variable) and the local accessors use the rest of the
// range, so every local accessor argument has its stored offset moved forward
// by the reserved size.
//
// Must only be called while WorkGroupMemory is still zero (asserted).
void setWorkGroupMemory(size_t memSize) {
  assert(WorkGroupMemory == 0 &&
         "Work Group Memory size can only be set once");

  // Rounding up to MaxAlignment keeps the first accessor offset aligned.
  WorkGroupMemory = alignMemoryAllocation(MaxAlignment, memSize);

  const size_t NumArgs = OffsetPerIndex.size();
  for (size_t ArgIdx = 0; ArgIdx != NumArgs; ++ArgIdx) {
    // Only local accessor arguments carry a non-zero entry here; all other
    // arguments are left untouched.
    if (OffsetPerIndex[ArgIdx]) {
      assert(ParamSizes[ArgIdx] == sizeof(size_t) &&
             "Offset should be a size_t");
      size_t *StoredOffset = reinterpret_cast<size_t *>(Indices[ArgIdx]);
      *StoredOffset += WorkGroupMemory;
    }
  }
}

// Copies Size bytes from ImplicitOffset into ImplicitOffsetArgs.
// Size must be exactly the size of three std::uint32_t values (asserted).
void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) {
  assert(Size == sizeof(std::uint32_t) * 3);
  std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
}

// Resets all local-memory bookkeeping: the work-group memory reservation and
// every per-argument entry in OffsetPerIndex are set back to zero.
void clearLocalSize() {
  WorkGroupMemory = 0;
  for (auto &Entry : OffsetPerIndex) {
    Entry = 0;
  }
}

// Read-only access to the Indices container; never throws.
const args_index_t &getIndices() const noexcept { return Indices; }

uint32_t getLocalSize() const {
return std::accumulate(std::begin(OffsetPerIndex),
std::end(OffsetPerIndex), 0);
std::end(OffsetPerIndex), 0) +
WorkGroupMemory;
}
} Args;

Expand Down Expand Up @@ -238,6 +266,7 @@ struct ur_kernel_handle_t_ {
return Args.getIndices();
}

// Forwards the requested work-group memory size to Args.setWorkGroupMemory.
void setWorkGroupMemory(size_t memSize) { Args.setWorkGroupMemory(memSize); }
// Returns the local size reported by Args.getLocalSize().
uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); }

// Resets the local-size bookkeeping held in Args (see Args.clearLocalSize).
void clearLocalSize() { Args.clearLocalSize(); }
Expand Down
Loading

0 comments on commit 55f2d02

Please sign in to comment.