Add support for configuring dynamic shared memory in the CUDA backend #767

Open: wants to merge 1 commit into base: development
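This PR lets a native CUDA kernel built through OCCA request dynamic shared memory by adding a "sharedMemBytes" entry to its build properties; the CUDA backend reads the value, raises the function's opt-in limit, and forwards it to the launch. A minimal usage sketch mirroring the example added below (d_data and d_result are assumed to be previously allocated occa::memory buffers):

occa::device device({{"mode", "CUDA"}, {"device_id", 0}});
// 32 * 4 bytes = one float per thread of a 32-thread block
occa::json props({{"okl/enabled", false}, {"sharedMemBytes", 32 * 4}});
occa::kernel reduce = device.buildKernel("sum_reductiom_dynamic_shm.cu", "reduce", props);
reduce.setRunDims(32, 32);   // 32 blocks of 32 threads
reduce(d_data, d_result);    // launches with 128 bytes of dynamic shared memory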
7 changes: 7 additions & 0 deletions examples/cpp/32_native_cuda_kernels_reduction/CMakeLists.txt
@@ -0,0 +1,7 @@
if (WITH_CUDA)
compile_cpp_example(native_cuda_kernels_reduction main.cpp)

add_custom_target(cpp_example_native_cuda_kernels_reduction_dynamic_shm_cu ALL COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/sum_reductiom_dynamic_shm.cu sum_reductiom_dynamic_shm.cu)
add_dependencies(examples_cpp_native_cuda_kernels cpp_example_native_cuda_kernels_reduction_dynamic_shm_cu)

endif()
26 changes: 26 additions & 0 deletions examples/cpp/32_native_cuda_kernels_reduction/Makefile
@@ -0,0 +1,26 @@
PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
ifndef OCCA_DIR
include $(PROJ_DIR)/../../../scripts/build/Makefile
else
include ${OCCA_DIR}/scripts/build/Makefile
endif

#---[ COMPILATION ]-------------------------------
headers = $(wildcard $(incPath)/*.hpp) $(wildcard $(incPath)/*.tpp)
sources = $(wildcard $(srcPath)/*.cpp)

objects = $(subst $(srcPath)/,$(objPath)/,$(sources:.cpp=.o))

${PROJ_DIR}/main: $(objects) $(headers) ${PROJ_DIR}/main.cpp
$(compiler) $(compilerFlags) -o ${PROJ_DIR}/main $(flags) $(objects) ${PROJ_DIR}/main.cpp $(paths) $(linkerFlags)
@if which install_name_tool > /dev/null 2>&1; then \
install_name_tool -add_rpath "${OCCA_DIR}/lib" ${PROJ_DIR}/main; \
fi

$(objPath)/%.o:$(srcPath)/%.cpp $(wildcard $(subst $(srcPath)/,$(incPath)/,$(<:.cpp=.hpp))) $(wildcard $(subst $(srcPath)/,$(incPath)/,$(<:.cpp=.tpp)))
$(compiler) $(compilerFlags) -o $@ $(flags) -c $(paths) $<

clean:
rm -f $(objPath)/*;
rm -f $(PROJ_DIR)/main
#=================================================
84 changes: 84 additions & 0 deletions examples/cpp/32_native_cuda_kernels_reduction/main.cpp
@@ -0,0 +1,84 @@
#include <iostream>

#include <occa.hpp>

//---[ Internal Tools ]-----------------
// Note: These headers are not officially supported
// Please don't rely on them outside of the occa examples
#include <occa/internal/utils/cli.hpp>
#include <occa/internal/utils/testing.hpp>
//======================================

occa::json parseArgs(int argc, const char **argv);

int main(int argc, const char **argv) {
occa::json args = parseArgs(argc, argv);

int entries = 32 * 32;

float *h_data = new float[entries];
float h_result = 0;
float ref_result = 0;

for (int i = 0; i < entries; ++i) {
h_data[i] = i;
}

// Setup the platform and device IDs
occa::device device({{"mode", "CUDA"},
{"device_id", (int)args["options/device-id"]}});

// Allocate memory on the device
occa::memory d_data = device.malloc<float>(entries);
occa::memory d_result = device.malloc<float>(1);

// Compile a regular CUDA kernel at run-time
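// Request 32 * 4 = 128 bytes of dynamic shared memory via the sharedMemBytes property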
occa::json kernelProps({{"okl/enabled", false}, {"sharedMemBytes", 32 * 4}});

occa::kernel reduce =
device.buildKernel("sum_reductiom_dynamic_shm.cu", "reduce", kernelProps);

// Copy memory to the device
d_data.copyFrom(h_data);
d_result.copyFrom(&h_result);

// Set the kernel dimensions
reduce.setRunDims(32, 32);

// Launch device kernel
reduce(d_data, d_result);

// Copy result to the host
d_result.copyTo(&h_result);

// Calculate reference
for (int i = 0; i < entries; ++i) {
ref_result += h_data[i];
}

// Assert values
printf("Ref result: %f, GPU result: %f\n", ref_result, h_result);

// Free host memory
delete[] h_data;

return 0;
}

occa::json parseArgs(int argc, const char **argv) {
occa::cli::parser parser;
parser
.withDescription(
"Example of using a regular CUDA kernel instead of an OCCA kernel")
.addOption(
occa::cli::option('d', "device-id", "CUDA device ID (default: 0)")
.withArg()
.withDefaultValue(0))
.addOption(
occa::cli::option('v', "verbose", "Compile kernels in verbose mode"));

occa::json args = parser.parseArgs(argc, argv);
occa::settings()["kernel/verbose"] = args["options/verbose"];

return args;
}
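For orientation, the OCCA calls above map onto a plain CUDA runtime launch of the same kernel: setRunDims supplies the grid and block dimensions and sharedMemBytes fills the third launch parameter. A sketch only; d_idata and d_res stand in for the raw device pointers held by the occa::memory objects:

reduce<<<32 /*grid*/, 32 /*block*/, 32 * sizeof(float) /*dynamic shared memory*/>>>(d_idata, d_res);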
24 changes: 24 additions & 0 deletions examples/cpp/32_native_cuda_kernels_reduction/sum_reductiom_dynamic_shm.cu
@@ -0,0 +1,24 @@
#define block_size 32
extern "C" __global__ __launch_bounds__(block_size) void reduce(
float *g_idata,
float *res)
{
{
int bid = (0) + blockIdx.x;
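// Dynamically allocated shared memory; its size is the sharedMemBytes value given at launch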
extern __shared__ float sdata[];
{
int tid = (0) + threadIdx.x;
int i = bid * block_size + tid;
sdata[tid] = g_idata[i];
__syncthreads();
for (unsigned int s = block_size / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if (tid == 0)
atomicAdd(res, sdata[0]);
}
}
}
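Note that sdata is indexed by the thread id, so the kernel needs block_size floats of dynamic shared memory: 32 * sizeof(float) = 32 * 4 = 128 bytes, which is exactly the sharedMemBytes value set in main.cpp.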
1 change: 1 addition & 0 deletions examples/cpp/CMakeLists.txt
@@ -17,6 +17,7 @@ add_subdirectory(18_nonblocking_streams)
add_subdirectory(19_stream_tags)
add_subdirectory(20_native_dpcpp_kernel)
add_subdirectory(30_device_function)
add_subdirectory(32_native_cuda_kernels_reduction)

# Don't force-compile OpenGL examples
# add_subdirectory(16_finite_difference)
17 changes: 12 additions & 5 deletions src/occa/internal/modes/cuda/kernel.cpp
@@ -14,7 +14,9 @@ namespace occa {
const occa::json &properties_) :
occa::launchedModeKernel_t(modeDevice_, name_, sourceFilename_, properties_),
cuModule(cuModule_),
cuFunction(NULL) {}
cuFunction(NULL) {
sharedMemBytes = properties_.get("sharedMemBytes", 0);
}

kernel::kernel(modeDevice_t *modeDevice_,
const std::string &name_,
@@ -23,7 +25,9 @@ namespace occa {
const occa::json &properties_) :
occa::launchedModeKernel_t(modeDevice_, name_, sourceFilename_, properties_),
cuModule(NULL),
cuFunction(cuFunction_) {}
cuFunction(cuFunction_) {
sharedMemBytes = properties_.get("sharedMemBytes", 0);
}

kernel::kernel(modeDevice_t *modeDevice_,
const std::string &name_,
@@ -33,7 +37,9 @@ namespace occa {
const occa::json &properties_) :
occa::launchedModeKernel_t(modeDevice_, name_, sourceFilename_, properties_),
cuModule(cuModule_),
cuFunction(cuFunction_) {}
cuFunction(cuFunction_) {
sharedMemBytes = properties_.get("sharedMemBytes", 0);
}

kernel::~kernel() {
if (cuModule) {
@@ -92,12 +98,13 @@

devicePtr->setCudaContext();

OCCA_CUDA_ERROR("Set max dynamic shm", cuFuncSetAttribute(cuFunction, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, sharedMemBytes));
OCCA_CUDA_ERROR("Launching Kernel",
cuLaunchKernel(cuFunction,
outerDims.x, outerDims.y, outerDims.z,
innerDims.x, innerDims.y, innerDims.z,
0, getCuStream(),
&(vArgs[0]), 0));
sharedMemBytes, getCuStream(),
&(vArgs[0]), NULL));
}
}
}
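The backend change follows the standard CUDA driver-API pattern for dynamic shared memory: first raise the function's opt-in limit with cuFuncSetAttribute (required whenever the request exceeds the default per-block carve-out, 48 KB on most recent architectures), then pass the same byte count as the sharedMemBytes launch argument. A standalone sketch against the driver API, independent of the OCCA code above (fn, stream, and args are assumed to already exist):

// Allow this function to use up to sharedMemBytes of dynamic shared memory.
cuFuncSetAttribute(fn, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, sharedMemBytes);
// Forward the same byte count as the sharedMemBytes launch parameter.
cuLaunchKernel(fn,
               gridX, gridY, gridZ,      // grid dimensions
               blockX, blockY, blockZ,   // block dimensions
               sharedMemBytes, stream,
               args, NULL);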
3 changes: 3 additions & 0 deletions src/occa/internal/modes/cuda/kernel.hpp
@@ -19,6 +19,9 @@ namespace occa {

mutable std::vector<void*> vArgs;

// Dynamic shared memory size
int sharedMemBytes = 0;

public:
kernel(modeDevice_t *modeDevice_,
const std::string &name_,
21 changes: 18 additions & 3 deletions src/occa/internal/modes/cuda/polyfill.hpp
@@ -21,7 +21,6 @@ namespace occa {
typedef struct _CUdeviceptr* CUdeviceptr;
typedef struct _CUevent* CUevent;
typedef struct _CUfunction* CUfunction;
typedef struct _CUfunction_attribute* CUfunction_attribute;
typedef struct _CUmodule* CUmodule;
typedef struct _CUstream* CUstream;

@@ -35,8 +34,6 @@
static const int CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 0;
static const int CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 0;

static const CUfunction_attribute CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = NULL;

enum CUresult {
CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE,
@@ -98,6 +95,20 @@
OCCA_CUDA_IS_NOT_ENABLED
};

enum CUfunction_attribute {
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7,
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9,
CU_FUNC_ATTRIBUTE_MAX
};

//---[ Methods ]----------------------
inline CUresult cuInit(unsigned int Flags) {
return OCCA_CUDA_IS_NOT_ENABLED;
@@ -107,6 +118,10 @@
return OCCA_CUDA_IS_NOT_ENABLED;
}

inline CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) {
return OCCA_CUDA_IS_NOT_ENABLED;
}

inline CUresult cuLaunchKernel(CUfunction f,
unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,