Add new launch property to support work_group_scratch_memory

intel/llvm#15061 introduces a new property, work_group_scratch_memory, which allows the user to request a given amount of local memory for a kernel launch. In order to pass this information to the adapter, the patch adds a new launch property to urEnqueueKernelLaunchCustomExp. The patch also changes the signature of urEnqueueKernelLaunchCustomExp to add a global offset parameter, so that global offsets remain available when using this extension. Signed-off-by: Victor Lomuller <[email protected]>
Naghasan · Nov 13, 2024 · 55f2d02 · 55f2d02
1 parent cd92e72
commit 55f2d02
Show file tree

Hide file tree

Showing 17 changed files with 233 additions and 68 deletions.
diff --git a/include/ur_api.h b/include/ur_api.h
@@ -9530,6 +9530,7 @@ typedef enum ur_exp_launch_property_id_t {
     UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0,            ///< The property has no effect
     UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1,       ///< Whether to launch a cooperative kernel
     UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions
+    UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation
     /// @cond
     UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff
     /// @endcond
@@ -9543,10 +9544,12 @@ typedef enum ur_exp_launch_property_id_t {
 ///   _Analogues_
 ///     - **CUlaunchAttributeValue**
 typedef union ur_exp_launch_property_value_t {
-    uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
-                            ///< value must be a divisor of the corresponding global work-size
-                            ///< dimension (in units of work-group).
-    int cooperative;        ///< [in] non-zero value indicates a cooperative kernel
+    uint32_t clusterDim[3];    ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
+                               ///< value must be a divisor of the corresponding global work-size
+                               ///< dimension (in units of work-group).
+    int cooperative;           ///< [in] non-zero value indicates a cooperative kernel
+    size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to
+                               ///< allocate
 
 } ur_exp_launch_property_value_t;
 
@@ -9587,6 +9590,7 @@ typedef struct ur_exp_launch_property_t {
 ///         + NULL == hQueue
 ///         + NULL == hKernel
 ///     - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
+///         + `NULL == pGlobalWorkOffset`
 ///         + `NULL == pGlobalWorkSize`
 ///         + `NULL == launchPropList`
 ///         + NULL == pGlobalWorkSize
@@ -9615,6 +9619,8 @@ urEnqueueKernelLaunchCustomExp(
     ur_kernel_handle_t hKernel,                     ///< [in] handle of the kernel object
     uint32_t workDim,                               ///< [in] number of dimensions, from 1 to 3, to specify the global and
                                                     ///< work-group work-items
+    const size_t *pGlobalWorkOffset,                ///< [in] pointer to an array of workDim unsigned values that specify the
+                                                    ///< offset used to calculate the global ID of a work-item
     const size_t *pGlobalWorkSize,                  ///< [in] pointer to an array of workDim unsigned values that specify the
                                                     ///< number of global work-items in workDim that will execute the kernel
                                                     ///< function
@@ -11441,6 +11447,7 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t {
     ur_queue_handle_t *phQueue;
     ur_kernel_handle_t *phKernel;
     uint32_t *pworkDim;
+    const size_t **ppGlobalWorkOffset;
     const size_t **ppGlobalWorkSize;
     const size_t **ppLocalWorkSize;
     uint32_t *pnumPropsInLaunchPropList;

diff --git a/include/ur_ddi.h b/include/ur_ddi.h
@@ -1457,6 +1457,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)(
     uint32_t,
     const size_t *,
     const size_t *,
+    const size_t *,
     uint32_t,
     const ur_exp_launch_property_t *,
     uint32_t,

diff --git a/include/ur_print.hpp b/include/ur_print.hpp
@@ -10246,6 +10246,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id
     case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION:
         os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION";
         break;
+    case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
+        os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY";
+        break;
     default:
         os << "unknown enumerator";
         break;
@@ -10282,6 +10285,13 @@ inline ur_result_t printUnion(
 
         os << (params.cooperative);
 
+        break;
+    case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
+
+        os << ".workgroup_mem_size = ";
+
+        os << (params.workgroup_mem_size);
+
         break;
     default:
         os << "<unknown>";
@@ -14722,6 +14732,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
 
     os << *(params->pworkDim);
 
+    os << ", ";
+    os << ".pGlobalWorkOffset = ";
+
+    ur::details::printPtr(os,
+                          *(params->ppGlobalWorkOffset));
+
     os << ", ";
     os << ".pGlobalWorkSize = ";
 

diff --git a/scripts/core/exp-launch-properties.yml b/scripts/core/exp-launch-properties.yml
@@ -29,6 +29,8 @@ etors:
       desc: "Whether to launch a cooperative kernel"
     - name: CLUSTER_DIMENSION
       desc: "work-group cluster dimensions"
+    - name: WORK_GROUP_MEMORY
+      desc: "Implicit work group memory allocation"
 --- #--------------------------------------------------------------------------
 type: union
 desc: "Specifies a launch property value"
@@ -45,6 +47,10 @@ members:
       name: cooperative
       desc: "[in] non-zero value indicates a cooperative kernel"
       tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE
+    - type: size_t
+      name: workgroup_mem_size
+      desc: "[in] non-zero value indicates the amount of work group memory to allocate"
+      tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY
 --- #--------------------------------------------------------------------------
 type: struct
 desc: "Kernel launch property"
@@ -82,6 +88,9 @@ params:
     - type: uint32_t
       name: workDim
       desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items"
+    - type: "const size_t*"
+      name: pGlobalWorkOffset
+      desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item"
     - type: const size_t*
       name: pGlobalWorkSize
       desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function"
@@ -97,10 +106,10 @@ params:
     - type: uint32_t
       name: numEventsInWaitList
       desc: "[in] size of the event wait list"
-    - type: const ur_event_handle_t*
+    - type: const $x_event_handle_t*
       name: phEventWaitList
       desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. "
-    - type: ur_event_handle_t*
+    - type: $x_event_handle_t*
       name: phEvent
       desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array."
 returns:

diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
@@ -414,11 +414,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
                                         phEventWaitList, phEvent);
 }
 
-UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
-    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
-    const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
-    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+static ur_result_t
+enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
+                    uint32_t workDim, const size_t *pGlobalWorkOffset,
+                    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
+                    uint32_t numEventsInWaitList,
+                    const ur_event_handle_t *phEventWaitList,
+                    ur_event_handle_t *phEvent, size_t WorkGroupMemory) {
   // Preconditions
   UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
             UR_RESULT_ERROR_INVALID_KERNEL);
@@ -436,6 +438,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
   size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
+  // Set work group memory so we can compute the whole memory requirement
+  if (WorkGroupMemory)
+    hKernel->setWorkGroupMemory(WorkGroupMemory);
   uint32_t LocalSize = hKernel->getLocalSize();
   CUfunction CuFunc = hKernel->get();
 
@@ -498,6 +503,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   return UR_RESULT_SUCCESS;
 }
 
+/// Standard kernel launch entry point: hands every argument unchanged to
+/// enqueueKernelLaunch and passes 0 as the work-group memory size.
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  // This entry point takes no launch properties, hence a size of 0.
+  constexpr size_t NoWorkGroupMemory = 0;
+  const ur_result_t Result = enqueueKernelLaunch(
+      hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+      pLocalWorkSize, numEventsInWaitList, phEventWaitList, phEvent,
+      NoWorkGroupMemory);
+  return Result;
+}
+
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
     const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -508,8 +523,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
     coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
     coop_prop.value.cooperative = 1;
     return urEnqueueKernelLaunchCustomExp(
-        hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1,
-        &coop_prop, numEventsInWaitList, phEventWaitList, phEvent);
+        hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
+        pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList,
+        phEvent);
   }
   return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
                                pGlobalWorkSize, pLocalWorkSize,
@@ -518,16 +534,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
-    const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
-    uint32_t numPropsInLaunchPropList,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
     const ur_exp_launch_property_t *launchPropList,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
 
-  if (numPropsInLaunchPropList == 0) {
-    urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize,
-                          pLocalWorkSize, numEventsInWaitList, phEventWaitList,
-                          phEvent);
+  size_t WorkGroupMemory = [&]() -> size_t {
+    const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if(
+        launchPropList, launchPropList + numPropsInLaunchPropList,
+        [](const ur_exp_launch_property_t &Prop) {
+          return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY;
+        });
+    if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList)
+      return WorkGroupMemoryProp->value.workgroup_mem_size;
+    return 0;
+  }();
+
+  if (numPropsInLaunchPropList == 0 ||
+      (WorkGroupMemory && numPropsInLaunchPropList == 1)) {
+    return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
+                               pGlobalWorkSize, pLocalWorkSize,
+                               numEventsInWaitList, phEventWaitList, phEvent,
+                               WorkGroupMemory);
   }
 #if CUDA_VERSION >= 11080
   // Preconditions
@@ -540,7 +569,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     return UR_RESULT_ERROR_INVALID_NULL_POINTER;
   }
 
-  std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
+  std::vector<CUlaunchAttribute> launch_attribute;
+  launch_attribute.reserve(numPropsInLaunchPropList);
 
   // Early exit for zero size kernel
   if (*pGlobalWorkSize == 0) {
@@ -553,17 +583,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
   size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
   size_t BlocksPerGrid[3] = {1u, 1u, 1u};
 
+  // Set work group memory so we can compute the whole memory requirement
+  if (WorkGroupMemory)
+    hKernel->setWorkGroupMemory(WorkGroupMemory);
   uint32_t LocalSize = hKernel->getLocalSize();
   CUfunction CuFunc = hKernel->get();
 
   for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
     switch (launchPropList[i].id) {
     case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
+      launch_attribute.push_back({});
       launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
       break;
     }
     case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {
-
+      launch_attribute.push_back({});
       launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
       // Note that cuda orders from right to left wrt SYCL dimensional order.
       if (workDim == 3) {
@@ -595,11 +629,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
       break;
     }
     case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
+      launch_attribute.push_back({});
       launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
       launch_attribute[i].value.cooperative =
           launchPropList[i].value.cooperative;
       break;
     }
+    case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: {
+      break;
+    }
     default: {
       return UR_RESULT_ERROR_INVALID_ENUMERATION;
     }
@@ -610,8 +648,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
   // using the standard UR_CHECK_ERROR
   if (ur_result_t Ret =
           setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
-                          nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel,
-                          CuFunc, ThreadsPerBlock, BlocksPerGrid);
+                          pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
+                          hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
       Ret != UR_RESULT_SUCCESS)
     return Ret;
 
@@ -659,7 +697,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
     launch_config.sharedMemBytes = LocalSize;
     launch_config.hStream = CuStream;
     launch_config.attrs = &launch_attribute[0];
-    launch_config.numAttrs = numPropsInLaunchPropList;
+    launch_config.numAttrs = launch_attribute.size();
 
     UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
                                     const_cast<void **>(ArgIndices.data()),

diff --git a/source/adapters/cuda/kernel.hpp b/source/adapters/cuda/kernel.hpp
@@ -65,6 +65,8 @@ struct ur_kernel_handle_t_ {
     args_size_t ParamSizes;
     args_index_t Indices;
     args_size_t OffsetPerIndex;
+    size_t WorkGroupMemory = 0;
+
     // A struct to keep track of memargs so that we can do dependency analysis
     // at urEnqueueKernelLaunch
     struct mem_obj_arg {
@@ -105,22 +107,28 @@ struct ur_kernel_handle_t_ {
       OffsetPerIndex[Index] = LocalSize;
     }
 
-    void addLocalArg(size_t Index, size_t Size) {
-      size_t LocalOffset = this->getLocalSize();
-
-      // maximum required alignment is the size of the largest vector type
-      const size_t MaxAlignment = sizeof(double) * 16;
+    // maximum required alignment is the size of the largest vector type
+    static constexpr size_t MaxAlignment = sizeof(double) * 16;
 
+    /// Round Offset up to the next multiple of the alignment chosen for an
+    /// allocation of Size bytes, which is min(MaxAlignment, Size).
+    /// Returns Offset unchanged when Size is 0.
+    static size_t alignMemoryAllocation(size_t Size, size_t Offset) {
       // for arguments smaller than the maximum alignment simply align to the
       // size of the argument
       const size_t Alignment = std::min(MaxAlignment, Size);
 
+      // A zero-sized allocation yields a zero alignment; return early so the
+      // modulo below never divides by zero.
+      if (Alignment == 0) {
+        return Offset;
+      }
+
       // align the argument
-      size_t AlignedLocalOffset = LocalOffset;
-      size_t Pad = LocalOffset % Alignment;
+      size_t AlignedLocalOffset = Offset;
+      size_t Pad = Offset % Alignment;
       if (Pad != 0) {
         AlignedLocalOffset += Alignment - Pad;
       }
+      return AlignedLocalOffset;
+    }
+
+    void addLocalArg(size_t Index, size_t Size) {
+      size_t LocalOffset = this->getLocalSize();
+
+      // align the argument
+      size_t AlignedLocalOffset = alignMemoryAllocation(Size, LocalOffset);
 
       addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
              Size + (AlignedLocalOffset - LocalOffset));
@@ -140,20 +148,40 @@ struct ur_kernel_handle_t_ {
       MemObjArgs.push_back(arguments::mem_obj_arg{hMem, Index, Flags});
     }
 
+    /// Record the work-group memory size and shift local argument offsets.
+    ///
+    /// memSize is rounded up to a multiple of MaxAlignment and stored in
+    /// WorkGroupMemory. Every argument with a non-zero OffsetPerIndex entry
+    /// then has the size_t stored behind its Indices pointer increased by
+    /// that rounded size.
+    void setWorkGroupMemory(size_t memSize) {
+      assert(WorkGroupMemory == 0 &&
+             "Work Group Memory size can only be set once");
+      // Round up so the first offset after this block is MaxAlignment aligned
+      WorkGroupMemory = alignMemoryAllocation(MaxAlignment, memSize);
+
+      // The dynamic memory starts at offset 0 (which allows local memory to
+      // keep being accessed as a GV); local accessors use the rest of the
+      // range, so their stored offsets are moved past this block.
+      const size_t NumArgs = OffsetPerIndex.size();
+      for (size_t ArgIdx = 0; ArgIdx != NumArgs; ++ArgIdx) {
+        // A zero entry means this is not a local accessor argument.
+        if (OffsetPerIndex[ArgIdx] != 0) {
+          assert(ParamSizes[ArgIdx] == sizeof(size_t) &&
+                 "Offset should be a size_t");
+          size_t *StoredOffset = reinterpret_cast<size_t *>(Indices[ArgIdx]);
+          *StoredOffset += WorkGroupMemory;
+        }
+      }
+    }
+
     void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) {
       assert(Size == sizeof(std::uint32_t) * 3);
       std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
     }
 
     void clearLocalSize() {
+      // Zero every per-argument local size and the work-group memory size, so
+      // getLocalSize() reports 0 afterwards.
+      // NOTE(review): offsets already shifted by setWorkGroupMemory are not
+      // shifted back here - confirm local arguments are re-added before the
+      // next launch.
       std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0);
+      WorkGroupMemory = 0;
     }
 
     const args_index_t &getIndices() const noexcept { return Indices; }
 
     uint32_t getLocalSize() const {
-      return std::accumulate(std::begin(OffsetPerIndex),
-                             std::end(OffsetPerIndex), 0);
+      // Total local memory: the sum of every OffsetPerIndex entry plus
+      // WorkGroupMemory. The sum uses a size_t initial value because an int
+      // literal would make std::accumulate narrow every partial sum to int.
+      const size_t ArgsLocalSize = std::accumulate(
+          std::begin(OffsetPerIndex), std::end(OffsetPerIndex), size_t{0});
+      // Narrow once, explicitly, to the declared return type.
+      return static_cast<uint32_t>(ArgsLocalSize + WorkGroupMemory);
     }
   } Args;
 
@@ -238,6 +266,7 @@ struct ur_kernel_handle_t_ {
     return Args.getIndices();
   }
 
+  /// Forwards memSize to Args.setWorkGroupMemory.
+  void setWorkGroupMemory(size_t memSize) { Args.setWorkGroupMemory(memSize); }
   uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); }
 
   void clearLocalSize() { Args.clearLocalSize(); }