From 17b198ae4640b6075ced6f8cd0a304cc6a5c7ff7 Mon Sep 17 00:00:00 2001
From: Pavlo Hilei <pavlo.hilei@shell.com>
Date: Mon, 7 Oct 2024 17:37:16 +0200
Subject: [PATCH] Add possibility to configure dynamic shared memory on CUDA
 backend.

It can be configured via 'sharedMemBytes' field of type int in the kernel properties.
Also example of dynamic shared memory usage is added.
Note that transpiler doesn't support this feature, so it is usable with only native cuda kernels for now.
---
 .../CMakeLists.txt                            |  7 ++
 .../32_native_cuda_kernels_reduction/Makefile | 26 ++++++
 .../32_native_cuda_kernels_reduction/main.cpp | 84 +++++++++++++++++++
 .../sum_reductiom_dynamic_shm.cu              | 24 ++++++
 examples/cpp/CMakeLists.txt                   |  1 +
 src/occa/internal/modes/cuda/kernel.cpp       | 17 ++--
 src/occa/internal/modes/cuda/kernel.hpp       |  3 +
 src/occa/internal/modes/cuda/polyfill.hpp     | 21 ++++-
 8 files changed, 175 insertions(+), 8 deletions(-)
 create mode 100644 examples/cpp/32_native_cuda_kernels_reduction/CMakeLists.txt
 create mode 100644 examples/cpp/32_native_cuda_kernels_reduction/Makefile
 create mode 100644 examples/cpp/32_native_cuda_kernels_reduction/main.cpp
 create mode 100644 examples/cpp/32_native_cuda_kernels_reduction/sum_reductiom_dynamic_shm.cu
diff --git a/examples/cpp/32_native_cuda_kernels_reduction/CMakeLists.txt b/examples/cpp/32_native_cuda_kernels_reduction/CMakeLists.txt
new file mode 100644
index 000000000..d214fcad5
--- /dev/null
+++ b/examples/cpp/32_native_cuda_kernels_reduction/CMakeLists.txt
@@ -0,0 +1,7 @@
+if (WITH_CUDA)
+  compile_cpp_example(native_cuda_kernels_reduction main.cpp)
+
+  add_custom_target(cpp_example_native_cuda_kernels_reduction_dynamic_shm_cu ALL COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/sum_reductiom_dynamic_shm.cu sum_reductiom_dynamic_shm.cu)
+  add_dependencies(examples_cpp_native_cuda_kernels cpp_example_native_cuda_kernels_reduction_dynamic_shm_cu)
+
+endif()
diff --git a/examples/cpp/32_native_cuda_kernels_reduction/Makefile b/examples/cpp/32_native_cuda_kernels_reduction/Makefile
new file mode 100644
index 000000000..af0b6c1cc
--- /dev/null
+++ b/examples/cpp/32_native_cuda_kernels_reduction/Makefile
@@ -0,0 +1,26 @@
+PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
+ifndef OCCA_DIR
+  include $(PROJ_DIR)/../../../scripts/build/Makefile
+else
+  include ${OCCA_DIR}/scripts/build/Makefile
+endif
+
+#---[ COMPILATION ]-------------------------------
+headers = $(wildcard $(incPath)/*.hpp) $(wildcard $(incPath)/*.tpp)
+sources = $(wildcard $(srcPath)/*.cpp)
+
+objects = $(subst $(srcPath)/,$(objPath)/,$(sources:.cpp=.o))
+
+${PROJ_DIR}/main: $(objects) $(headers) ${PROJ_DIR}/main.cpp
+	$(compiler) $(compilerFlags) -o ${PROJ_DIR}/main $(flags) $(objects) ${PROJ_DIR}/main.cpp $(paths) $(linkerFlags)
+	@if which install_name_tool > /dev/null 2>&1; then \
+		install_name_tool -add_rpath "${OCCA_DIR}/lib" ${PROJ_DIR}/main; \
+	fi
+
+$(objPath)/%.o:$(srcPath)/%.cpp $(wildcard $(subst $(srcPath)/,$(incPath)/,$(<:.cpp=.hpp))) $(wildcard $(subst $(srcPath)/,$(incPath)/,$(<:.cpp=.tpp)))
+	$(compiler) $(compilerFlags) -o $@ $(flags) -c $(paths) $<
+
+clean:
+	rm -f $(objPath)/*;
+	rm -f $(PROJ_DIR)/main
+#=================================================
diff --git a/examples/cpp/32_native_cuda_kernels_reduction/main.cpp b/examples/cpp/32_native_cuda_kernels_reduction/main.cpp
new file mode 100644
index 000000000..106a28ea1
--- /dev/null
+++ b/examples/cpp/32_native_cuda_kernels_reduction/main.cpp
@@ -0,0 +1,84 @@
+#include <iostream>
+
+#include <occa.hpp>
+
+//---[ Internal Tools ]-----------------
+// Note: These headers are not officially supported
+//       Please don't rely on it outside of the occa examples
+#include <occa/internal/utils/cli.hpp>
+#include <occa/internal/utils/testing.hpp>
+//======================================
+
+occa::json parseArgs(int argc, const char **argv);
+
+int main(int argc, const char **argv) {
+  occa::json args = parseArgs(argc, argv);
+
+  int entries = 32 * 32;
+
+  float *h_data = new float[entries];
+  float h_result = 0;
+  float ref_result = 0;
+
+  for (int i = 0; i < entries; ++i) {
+    h_data[i] = i;
+  }
+
+  // Setup the platform and device IDs
+  occa::device device({{"mode", "CUDA"},
+                       {"device_id", (int)args["options/device-id"]}});
+
+  // Allocate memory on the device
+  occa::memory d_data = device.malloc<float>(entries);
+  occa::memory d_result = device.malloc<float>(1);
+
+  // Compile a regular CUDA kernel at run-time
+  occa::json kernelProps({{"okl/enabled", false}, {"sharedMemBytes", 32 * 4}});
+
+  occa::kernel reduce =
+      device.buildKernel("sum_reductiom_dynamic_shm.cu", "reduce", kernelProps);
+
+  // Copy memory to the device
+  d_data.copyFrom(h_data);
+  d_result.copyFrom(&h_result);
+
+  // Set the kernel dimensions
+  reduce.setRunDims(32, 32);
+
+  // Launch device kernel
+  reduce(d_data, d_result);
+
+  // Copy result to the host
+  d_result.copyTo(&h_result);
+
+  // Calculate reference
+  for (int i = 0; i < entries; ++i) {
+    ref_result += h_data[i];
+  }
+
+  // Assert values
+  printf("Ref result: %f, GPU result: %f\n", ref_result, h_result);
+
+  // Free host memory
+  delete[] h_data;
+
+  return 0;
+}
+
+occa::json parseArgs(int argc, const char **argv) {
+  occa::cli::parser parser;
+  parser
+      .withDescription(
+          "Example of using a regular CUDA kernel instead of an OCCA kernel")
+      .addOption(
+          occa::cli::option('d', "device-id", "OpenCL device ID (default: 0)")
+              .withArg()
+              .withDefaultValue(0))
+      .addOption(
+          occa::cli::option('v', "verbose", "Compile kernels in verbose mode"));
+
+  occa::json args = parser.parseArgs(argc, argv);
+  occa::settings()["kernel/verbose"] = args["options/verbose"];
+
+  return args;
+}
diff --git a/examples/cpp/32_native_cuda_kernels_reduction/sum_reductiom_dynamic_shm.cu b/examples/cpp/32_native_cuda_kernels_reduction/sum_reductiom_dynamic_shm.cu
new file mode 100644
index 000000000..0e798a57f
--- /dev/null
+++ b/examples/cpp/32_native_cuda_kernels_reduction/sum_reductiom_dynamic_shm.cu
@@ -0,0 +1,24 @@
+#define block_size 32
+extern "C" __global__ __launch_bounds__(block_size) void reduce(
+  float *g_idata,
+  float *res)
+{
+ {
+   int bid = (0) + blockIdx.x;
+   extern __shared__ float sdata[];
+   {
+     int tid = (0) + threadIdx.x;
+     int i = bid * block_size + tid;
+     sdata[tid] = g_idata[i];
+     __syncthreads();
+     for (unsigned int s = block_size / 2; s > 0; s >>= 1) {
+       if (tid < s) {
+         sdata[tid] += sdata[tid + s];
+       }
+       __syncthreads();
+     }
+     if (tid == 0)
+       atomicAdd(res, sdata[0]);
+   }
+ }
+}
\ No newline at end of file
diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt
index 6c51ab1e9..ff1f78c86 100644
--- a/examples/cpp/CMakeLists.txt
+++ b/examples/cpp/CMakeLists.txt
@@ -17,6 +17,7 @@ add_subdirectory(18_nonblocking_streams)
 add_subdirectory(19_stream_tags)
 add_subdirectory(20_native_dpcpp_kernel)
 add_subdirectory(30_device_function)
+add_subdirectory(32_native_cuda_kernels_reduction)
 
 # Don't force-compile OpenGL examples
 # add_subdirectory(16_finite_difference)
diff --git a/src/occa/internal/modes/cuda/kernel.cpp b/src/occa/internal/modes/cuda/kernel.cpp
index b90a7b454..5aed24f29 100644
--- a/src/occa/internal/modes/cuda/kernel.cpp
+++ b/src/occa/internal/modes/cuda/kernel.cpp
@@ -14,7 +14,9 @@ namespace occa {
                    const occa::json &properties_) :
       occa::launchedModeKernel_t(modeDevice_, name_, sourceFilename_, properties_),
       cuModule(cuModule_),
-      cuFunction(NULL) {}
+      cuFunction(NULL) {
+        sharedMemBytes = properties_.get("sharedMemBytes", 0);
+      }
 
     kernel::kernel(modeDevice_t *modeDevice_,
                    const std::string &name_,
@@ -23,7 +25,9 @@ namespace occa {
                    const occa::json &properties_) :
       occa::launchedModeKernel_t(modeDevice_, name_, sourceFilename_, properties_),
       cuModule(NULL),
-      cuFunction(cuFunction_) {}
+      cuFunction(cuFunction_) {
+        sharedMemBytes = properties_.get("sharedMemBytes", 0);
+      }
 
     kernel::kernel(modeDevice_t *modeDevice_,
                    const std::string &name_,
@@ -33,7 +37,9 @@ namespace occa {
                    const occa::json &properties_) :
       occa::launchedModeKernel_t(modeDevice_, name_, sourceFilename_, properties_),
       cuModule(cuModule_),
-      cuFunction(cuFunction_) {}
+      cuFunction(cuFunction_) {
+        sharedMemBytes = properties_.get("sharedMemBytes", 0);
+      }
 
     kernel::~kernel() {
       if (cuModule) {
@@ -92,12 +98,13 @@ namespace occa {
 
       devicePtr->setCudaContext();
 
+      OCCA_CUDA_ERROR("Set max dynamic shm", cuFuncSetAttribute(cuFunction, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, sharedMemBytes));
       OCCA_CUDA_ERROR("Launching Kernel",
                       cuLaunchKernel(cuFunction,
                                      outerDims.x, outerDims.y, outerDims.z,
                                      innerDims.x, innerDims.y, innerDims.z,
-                                     0, getCuStream(),
-                                     &(vArgs[0]), 0));
+                                     sharedMemBytes, getCuStream(),
+                                     &(vArgs[0]), NULL));
     }
   }
 }
diff --git a/src/occa/internal/modes/cuda/kernel.hpp b/src/occa/internal/modes/cuda/kernel.hpp
index 5424712e1..c51501606 100644
--- a/src/occa/internal/modes/cuda/kernel.hpp
+++ b/src/occa/internal/modes/cuda/kernel.hpp
@@ -19,6 +19,9 @@ namespace occa {
 
       mutable std::vector<void*> vArgs;
 
+      // Dynamic shared memory size
+      int sharedMemBytes = 0;
+
     public:
       kernel(modeDevice_t *modeDevice_,
              const std::string &name_,
diff --git a/src/occa/internal/modes/cuda/polyfill.hpp b/src/occa/internal/modes/cuda/polyfill.hpp
index ac664ff41..2a0c9e991 100644
--- a/src/occa/internal/modes/cuda/polyfill.hpp
+++ b/src/occa/internal/modes/cuda/polyfill.hpp
@@ -21,7 +21,6 @@ namespace occa {
   typedef struct _CUdeviceptr*          CUdeviceptr;
   typedef struct _CUevent*              CUevent;
   typedef struct _CUfunction*           CUfunction;
-  typedef struct _CUfunction_attribute* CUfunction_attribute;
   typedef struct _CUmodule*             CUmodule;
   typedef struct _CUstream*             CUstream;
 
@@ -35,8 +34,6 @@ namespace occa {
   static const int CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 0;
   static const int CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 0;
 
-  static const CUfunction_attribute CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = NULL;
-
   enum CUresult {
     CUDA_SUCCESS = 0,
     CUDA_ERROR_INVALID_VALUE,
@@ -98,6 +95,20 @@ namespace occa {
     OCCA_CUDA_IS_NOT_ENABLED
   };
 
+  enum CUfunction_attribute {
+      CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+      CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
+      CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
+      CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
+      CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
+      CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
+      CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
+      CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7,
+      CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
+      CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9,
+      CU_FUNC_ATTRIBUTE_MAX
+  };
+
   //---[ Methods ]----------------------
   inline CUresult cuInit(unsigned int Flags) {
     return OCCA_CUDA_IS_NOT_ENABLED;
@@ -107,6 +118,10 @@ namespace occa {
     return OCCA_CUDA_IS_NOT_ENABLED;
   }
 
+  inline CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) {
+    return OCCA_CUDA_IS_NOT_ENABLED;
+  }
+
   inline CUresult cuLaunchKernel(CUfunction f,
                                  unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
                                  unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,