Add support for configuring dynamic shared memory in the CUDA backend #767

Open: wants to merge 1 commit into base: development
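This PR lets a native CUDA kernel built through OCCA request dynamic shared memory by adding a "sharedMemBytes" entry to its build properties; the CUDA backend reads the value, raises the function's opt-in limit, and forwards it to the launch. A minimal usage sketch mirroring the example added below (d_data and d_result are assumed to be previously allocated occa::memory buffers):

occa::device device({{"mode", "CUDA"}, {"device_id", 0}});
// 32 * 4 bytes = one float per thread of a 32-thread block
occa::json props({{"okl/enabled", false}, {"sharedMemBytes", 32 * 4}});
occa::kernel reduce = device.buildKernel("sum_reductiom_dynamic_shm.cu", "reduce", props);
reduce.setRunDims(32, 32);   // 32 blocks of 32 threads
reduce(d_data, d_result);    // launches with 128 bytes of dynamic shared memory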
7 changes: 7 additions & 0 deletions examples/cpp/32_native_cuda_kernels_reduction/CMakeLists.txt
@@ -0,0 +1,7 @@
if (WITH_CUDA)
compile_cpp_example(native_cuda_kernels_reduction main.cpp)

add_custom_target(cpp_example_native_cuda_kernels_reduction_dynamic_shm_cu ALL COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/sum_reductiom_dynamic_shm.cu sum_reductiom_dynamic_shm.cu)
add_dependencies(examples_cpp_native_cuda_kernels cpp_example_native_cuda_kernels_reduction_dynamic_shm_cu)

endif()
26 changes: 26 additions & 0 deletions examples/cpp/32_native_cuda_kernels_reduction/Makefile
@@ -0,0 +1,26 @@
PROJ_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
ifndef OCCA_DIR
include $(PROJ_DIR)/../../../scripts/build/Makefile
else
include ${OCCA_DIR}/scripts/build/Makefile
endif

#---[ COMPILATION ]-------------------------------
headers = $(wildcard $(incPath)/*.hpp) $(wildcard $(incPath)/*.tpp)
sources = $(wildcard $(srcPath)/*.cpp)

objects = $(subst $(srcPath)/,$(objPath)/,$(sources:.cpp=.o))

${PROJ_DIR}/main: $(objects) $(headers) ${PROJ_DIR}/main.cpp
$(compiler) $(compilerFlags) -o ${PROJ_DIR}/main $(flags) $(objects) ${PROJ_DIR}/main.cpp $(paths) $(linkerFlags)
@if which install_name_tool > /dev/null 2>&1; then \
install_name_tool -add_rpath "${OCCA_DIR}/lib" ${PROJ_DIR}/main; \
fi

$(objPath)/%.o:$(srcPath)/%.cpp $(wildcard $(subst $(srcPath)/,$(incPath)/,$(<:.cpp=.hpp))) $(wildcard $(subst $(srcPath)/,$(incPath)/,$(<:.cpp=.tpp)))
$(compiler) $(compilerFlags) -o $@ $(flags) -c $(paths) $<

clean:
rm -f $(objPath)/*;
rm -f $(PROJ_DIR)/main
#=================================================
84 changes: 84 additions & 0 deletions examples/cpp/32_native_cuda_kernels_reduction/main.cpp
@@ -0,0 +1,84 @@
#include <iostream>

#include <occa.hpp>

//---[ Internal Tools ]-----------------
// Note: These headers are not officially supported
// Please don't rely on them outside of the occa examples
#include <occa/internal/utils/cli.hpp>
#include <occa/internal/utils/testing.hpp>
//======================================

occa::json parseArgs(int argc, const char **argv);

int main(int argc, const char **argv) {
occa::json args = parseArgs(argc, argv);

int entries = 32 * 32;

float *h_data = new float[entries];
float h_result = 0;
float ref_result = 0;

for (int i = 0; i < entries; ++i) {
h_data[i] = i;
}

// Setup the platform and device IDs
occa::device device({{"mode", "CUDA"},
{"device_id", (int)args["options/device-id"]}});

// Allocate memory on the device
occa::memory d_data = device.malloc<float>(entries);
occa::memory d_result = device.malloc<float>(1);

// Compile a regular CUDA kernel at run-time
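// Request 32 * 4 = 128 bytes of dynamic shared memory via the sharedMemBytes property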
occa::json kernelProps({{"okl/enabled", false}, {"sharedMemBytes", 32 * 4}});

occa::kernel reduce =
device.buildKernel("sum_reductiom_dynamic_shm.cu", "reduce", kernelProps);

// Copy memory to the device
d_data.copyFrom(h_data);
d_result.copyFrom(&h_result);

// Set the kernel dimensions
reduce.setRunDims(32, 32);

// Launch device kernel
reduce(d_data, d_result);

// Copy result to the host
d_result.copyTo(&h_result);

// Calculate reference
for (int i = 0; i < entries; ++i) {
ref_result += h_data[i];
}

// Assert values
printf("Ref result: %f, GPU result: %f\n", ref_result, h_result);

// Free host memory
delete[] h_data;

return 0;
}

occa::json parseArgs(int argc, const char **argv) {
occa::cli::parser parser;
parser
.withDescription(
"Example of using a regular CUDA kernel instead of an OCCA kernel")
.addOption(
occa::cli::option('d', "device-id", "CUDA device ID (default: 0)")
.withArg()
.withDefaultValue(0))
.addOption(
occa::cli::option('v', "verbose", "Compile kernels in verbose mode"));

occa::json args = parser.parseArgs(argc, argv);
occa::settings()["kernel/verbose"] = args["options/verbose"];

return args;
}
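For orientation, the OCCA calls above map onto a plain CUDA runtime launch of the same kernel: setRunDims supplies the grid and block dimensions and sharedMemBytes fills the third launch parameter. A sketch only; d_idata and d_res stand in for the raw device pointers held by the occa::memory objects:

reduce<<<32 /*grid*/, 32 /*block*/, 32 * sizeof(float) /*dynamic shared memory*/>>>(d_idata, d_res);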
24 changes: 24 additions & 0 deletions examples/cpp/32_native_cuda_kernels_reduction/sum_reductiom_dynamic_shm.cu
@@ -0,0 +1,24 @@
#define block_size 32
extern "C" __global__ __launch_bounds__(block_size) void reduce(
float *g_idata,
float *res)
{
{
int bid = (0) + blockIdx.x;
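// Dynamically allocated shared memory; its size is the sharedMemBytes value given at launch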
extern __shared__ float sdata[];
{
int tid = (0) + threadIdx.x;
int i = bid * block_size + tid;
sdata[tid] = g_idata[i];
__syncthreads();
for (unsigned int s = block_size / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
if (tid == 0)
atomicAdd(res, sdata[0]);
}
}
}
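Note that sdata is indexed by the thread id, so the kernel needs block_size floats of dynamic shared memory: 32 * sizeof(float) = 32 * 4 = 128 bytes, which is exactly the sharedMemBytes value set in main.cpp.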
1 change: 1 addition & 0 deletions examples/cpp/CMakeLists.txt
@@ -17,6 +17,7 @@ add_subdirectory(18_nonblocking_streams)
add_subdirectory(19_stream_tags)
add_subdirectory(20_native_dpcpp_kernel)
add_subdirectory(30_device_function)
add_subdirectory(32_native_cuda_kernels_reduction)

# Don't force-compile OpenGL examples
# add_subdirectory(16_finite_difference)
17 changes: 12 additions & 5 deletions src/occa/internal/modes/cuda/kernel.cpp
@@ -14,7 +14,9 @@ namespace occa {
const occa::json &properties_) :
occa::launchedModeKernel_t(modeDevice_, name_, sourceFilename_, properties_),
cuModule(cuModule_),
cuFunction(NULL) {}
cuFunction(NULL) {
sharedMemBytes = properties_.get("sharedMemBytes", 0);
}

kernel::kernel(modeDevice_t *modeDevice_,
const std::string &name_,
@@ -23,7 +25,9 @@ namespace occa {
const occa::json &properties_) :
occa::launchedModeKernel_t(modeDevice_, name_, sourceFilename_, properties_),
cuModule(NULL),
cuFunction(cuFunction_) {}
cuFunction(cuFunction_) {
sharedMemBytes = properties_.get("sharedMemBytes", 0);
}

kernel::kernel(modeDevice_t *modeDevice_,
const std::string &name_,
@@ -33,7 +37,9 @@ namespace occa {
const occa::json &properties_) :
occa::launchedModeKernel_t(modeDevice_, name_, sourceFilename_, properties_),
cuModule(cuModule_),
cuFunction(cuFunction_) {}
cuFunction(cuFunction_) {
sharedMemBytes = properties_.get("sharedMemBytes", 0);
}

kernel::~kernel() {
if (cuModule) {
@@ -92,12 +98,13 @@

devicePtr->setCudaContext();

OCCA_CUDA_ERROR("Set max dynamic shm", cuFuncSetAttribute(cuFunction, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, sharedMemBytes));
OCCA_CUDA_ERROR("Launching Kernel",
cuLaunchKernel(cuFunction,
outerDims.x, outerDims.y, outerDims.z,
innerDims.x, innerDims.y, innerDims.z,
0, getCuStream(),
&(vArgs[0]), 0));
sharedMemBytes, getCuStream(),
&(vArgs[0]), NULL));
}
}
}
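The backend change follows the standard CUDA driver-API pattern for dynamic shared memory: first raise the function's opt-in limit with cuFuncSetAttribute (required whenever the request exceeds the default per-block carve-out, 48 KB on most recent architectures), then pass the same byte count as the sharedMemBytes launch argument. A standalone sketch against the driver API, independent of the OCCA code above (fn, stream, and args are assumed to already exist):

// Allow this function to use up to sharedMemBytes of dynamic shared memory.
cuFuncSetAttribute(fn, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, sharedMemBytes);
// Forward the same byte count as the sharedMemBytes launch parameter.
cuLaunchKernel(fn,
               gridX, gridY, gridZ,      // grid dimensions
               blockX, blockY, blockZ,   // block dimensions
               sharedMemBytes, stream,
               args, NULL);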
3 changes: 3 additions & 0 deletions src/occa/internal/modes/cuda/kernel.hpp
@@ -19,6 +19,9 @@ namespace occa {

mutable std::vector<void*> vArgs;

// Dynamic shared memory size
int sharedMemBytes = 0;

public:
kernel(modeDevice_t *modeDevice_,
const std::string &name_,
21 changes: 18 additions & 3 deletions src/occa/internal/modes/cuda/polyfill.hpp
@@ -21,7 +21,6 @@ namespace occa {
typedef struct _CUdeviceptr* CUdeviceptr;
typedef struct _CUevent* CUevent;
typedef struct _CUfunction* CUfunction;
typedef struct _CUfunction_attribute* CUfunction_attribute;
typedef struct _CUmodule* CUmodule;
typedef struct _CUstream* CUstream;

@@ -35,8 +34,6 @@
static const int CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 0;
static const int CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 0;

static const CUfunction_attribute CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = NULL;

enum CUresult {
CUDA_SUCCESS = 0,
CUDA_ERROR_INVALID_VALUE,
@@ -98,6 +95,20 @@
OCCA_CUDA_IS_NOT_ENABLED
};

enum CUfunction_attribute {
CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7,
CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9,
CU_FUNC_ATTRIBUTE_MAX
};

//---[ Methods ]----------------------
inline CUresult cuInit(unsigned int Flags) {
return OCCA_CUDA_IS_NOT_ENABLED;
@@ -107,6 +118,10 @@
return OCCA_CUDA_IS_NOT_ENABLED;
}

inline CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) {
return OCCA_CUDA_IS_NOT_ENABLED;
}

inline CUresult cuLaunchKernel(CUfunction f,
unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,