Skip to content

Commit 3187e8c

Browse files
committed
Add support for struct kernel parameters
1 parent 2ed31f1 commit 3187e8c

File tree

11 files changed

+412
-5
lines changed

11 files changed

+412
-5
lines changed

Diff for: driverapi/src/cmdqueue.cpp

+15-1
Original file line numberDiff line numberDiff line change
@@ -545,7 +545,21 @@ NvCommandQueue::launchFunction(LibreCUFunction function,
545545
kernargs_buf[j++] = param_value;
546546
break;
547547
}
548-
default: LIBRECUDA_FAIL(LIBRECUDA_ERROR_INVALID_VALUE)
548+
default: {
549+
if (param_size % sizeof(NvU32) != 0) {
550+
// CUDA encodes kernel arguments as 32-bit words. A parameter whose size is not a multiple of 4 bytes is
551+
// highly implausible, given that C compilers typically pad struct sizes to a multiple of 4 bytes anyway,
552+
// so we reject such a size instead of guessing how CUDA would encode it.
553+
LIBRECUDA_DEBUG("Encountered kernel with array parameter with size % 4 != 0! This should not be possible");
554+
LIBRECUDA_FAIL(LIBRECUDA_ERROR_INVALID_VALUE);
555+
}
556+
auto *param_ptr = reinterpret_cast<NvU32 *>(params[i]);
557+
size_t num_words = param_size / sizeof(NvU32);
558+
for (size_t k = 0; k < num_words; k++) {
559+
kernargs_buf[j++] = param_ptr[k];
560+
}
561+
break;
562+
}
549563
}
550564
}
551565
}

Diff for: tests/CMakeLists.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@ add_subdirectory(memcopy)
33
add_subdirectory(dynamic_shared_mem)
44
add_subdirectory(compute_chronological_consistency)
55
add_subdirectory(test_async_kernels)
6-
add_subdirectory(dma_chronological_consistency)
6+
add_subdirectory(dma_chronological_consistency)
7+
add_subdirectory(kernel_struct_param)

Diff for: tests/compute_chronological_consistency/CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@ target_link_libraries(
88
driverapi
99
)
1010

11-
configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/write_float COPYONLY)
11+
configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/compute_chronological_consistency COPYONLY)

Diff for: tests/dynamic_shared_mem/CMakeLists.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,4 +8,4 @@ target_link_libraries(
88
driverapi
99
)
1010

11-
configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/write_float COPYONLY)
11+
configure_file("${CMAKE_CURRENT_LIST_DIR}/write_float.cubin" ${CMAKE_BINARY_DIR}/tests/dynamic_shared_mem COPYONLY)

Diff for: tests/kernel_struct_param/CMakeLists.txt

+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Test executable that launches a kernel taking a struct parameter by value.
add_executable(test_kernel_struct_param main.cpp)
target_link_libraries(test_kernel_struct_param PRIVATE driverapi)

# Stage the cubin under the build tree's tests/kernel_struct_param path;
# main.cpp opens "read_from_struct.cubin" by relative name.
# COPYONLY copies the file verbatim, without variable substitution.
configure_file(
        "${CMAKE_CURRENT_LIST_DIR}/read_from_struct.cubin"
        ${CMAKE_BINARY_DIR}/tests/kernel_struct_param
        COPYONLY
)

Diff for: tests/kernel_struct_param/main.cpp

+126
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
#include <librecuda.h>
2+
3+
#include <iostream>
4+
#include <iomanip>
5+
#include <vector>
6+
#include <fstream>
7+
#include <cstring>
8+
9+
inline void cudaCheck(libreCudaStatus_t error, const char *file, int line) {
10+
if (error != LIBRECUDA_SUCCESS) {
11+
const char *error_string;
12+
libreCuGetErrorString(error, &error_string);
13+
printf("[CUDA ERROR] at file %s:%d: %s\n", file, line, error_string);
14+
exit(EXIT_FAILURE);
15+
}
16+
};
17+
#define CUDA_CHECK(err) (cudaCheck(err, __FILE__, __LINE__))
18+
19+
// Host-side definition of the struct that main() passes by value as the
// first kernel parameter.
struct struct_t {
    int x;
    int y;
    int z;
    int w;
    int h;
    int d;
    char str[32];
    // Single trailing byte: the members sum to 57 bytes, so the compiler adds
    // tail padding to reach the size asserted below.
    char me_ugly;
};
// 6 * 4 + 32 + 1 = 57 bytes of members, padded to 60. A size that is a
// multiple of 4 is what the driver's word-wise parameter copy requires.
static_assert(sizeof(struct_t) == 60);
26+
27+
int main() {
28+
CUDA_CHECK(libreCuInit(0));
29+
30+
int device_count{};
31+
CUDA_CHECK(libreCuDeviceGetCount(&device_count));
32+
std::cout << "Device count: " + std::to_string(device_count) << std::endl;
33+
34+
LibreCUdevice device{};
35+
CUDA_CHECK(libreCuDeviceGet(&device, 0));
36+
37+
LibreCUcontext ctx{};
38+
CUDA_CHECK(libreCuCtxCreate_v2(&ctx, CU_CTX_SCHED_YIELD, device));
39+
40+
char name_buffer[256] = {};
41+
libreCuDeviceGetName(name_buffer, 256, device);
42+
std::cout << "Device Name: " + std::string(name_buffer) << std::endl;
43+
LibreCUmodule module{};
44+
45+
// read cubin file
46+
uint8_t *image;
47+
size_t n_bytes;
48+
{
49+
std::ifstream input("read_from_struct.cubin", std::ios::binary);
50+
std::vector<uint8_t> bytes(
51+
(std::istreambuf_iterator<char>(input)),
52+
(std::istreambuf_iterator<char>()));
53+
input.close();
54+
image = new uint8_t[bytes.size()];
55+
memcpy(image, bytes.data(), bytes.size());
56+
n_bytes = bytes.size();
57+
}
58+
CUDA_CHECK(libreCuModuleLoadData(&module, image, n_bytes));
59+
60+
// read functions
61+
uint32_t num_funcs{};
62+
CUDA_CHECK(libreCuModuleGetFunctionCount(&num_funcs, module));
63+
std::cout << "Num functions: " << num_funcs << std::endl;
64+
65+
auto *functions = new LibreCUFunction[num_funcs];
66+
CUDA_CHECK(libreCuModuleEnumerateFunctions(functions, num_funcs, module));
67+
68+
for (size_t i = 0; i < num_funcs; i++) {
69+
LibreCUFunction func = functions[i];
70+
const char *func_name{};
71+
CUDA_CHECK(libreCuFuncGetName(&func_name, func));
72+
std::cout << " function \"" << func_name << "\"" << std::endl;
73+
}
74+
75+
delete[] functions;
76+
77+
// find function
78+
LibreCUFunction func{};
79+
CUDA_CHECK(libreCuModuleGetFunction(&func, module, "read_from_struct"));
80+
// create stream
81+
LibreCUstream stream{};
82+
CUDA_CHECK(libreCuStreamCreate(&stream, 0));
83+
84+
void *w_dst_va{};
85+
CUDA_CHECK(libreCuMemAlloc(&w_dst_va, sizeof(int), true));
86+
87+
struct_t s = {
88+
.w=64,
89+
};
90+
91+
void *params[] = {
92+
&s, // struct
93+
&w_dst_va, // dst
94+
};
95+
96+
CUDA_CHECK(
97+
libreCuLaunchKernel(func,
98+
1, 1, 1,
99+
1, 1, 1,
100+
8192,
101+
stream,
102+
params, sizeof(params) / sizeof(void *),
103+
nullptr
104+
)
105+
);
106+
107+
// dispatch built up command buffer to GPU
108+
CUDA_CHECK(libreCuStreamCommence(stream));
109+
110+
// wait for work to complete
111+
CUDA_CHECK(libreCuStreamAwait(stream));
112+
std::cout << "Dst value (post exec): " << *(int *) (w_dst_va) << std::endl;
113+
114+
// free memory
115+
CUDA_CHECK(libreCuMemFree(w_dst_va));
116+
117+
// destroy stream
118+
CUDA_CHECK(libreCuStreamDestroy(stream));
119+
120+
// unload module
121+
CUDA_CHECK(libreCuModuleUnload(module));
122+
123+
// destroy ctx
124+
CUDA_CHECK(libreCuCtxDestroy(ctx));
125+
return 0;
126+
}

0 commit comments

Comments
 (0)