#include <librecuda.h>

#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>
8+
9+ inline void cudaCheck (libreCudaStatus_t error, const char *file, int line) {
10+ if (error != LIBRECUDA_SUCCESS) {
11+ const char *error_string;
12+ libreCuGetErrorString (error, &error_string);
13+ printf (" [CUDA ERROR] at file %s:%d: %s\n " , file, line, error_string);
14+ exit (EXIT_FAILURE);
15+ }
16+ };
// Wraps a LibreCUDA call and forwards its status, together with the call
// site, to cudaCheck. There must be no whitespace between the macro name and
// its parameter list, otherwise the preprocessor treats it as an object-like
// macro. The argument is parenthesized so any expression can be passed.
#define CUDA_CHECK(err) (cudaCheck((err), __FILE__, __LINE__))
18+
19+ int main () {
20+ CUDA_CHECK (libreCuInit (0 ));
21+
22+ int device_count{};
23+ CUDA_CHECK (libreCuDeviceGetCount (&device_count));
24+ std::cout << " Device count: " + std::to_string (device_count) << std::endl;
25+
26+ LibreCUdevice device{};
27+ CUDA_CHECK (libreCuDeviceGet (&device, 0 ));
28+
29+ LibreCUcontext ctx{};
30+ CUDA_CHECK (libreCuCtxCreate_v2 (&ctx, CU_CTX_SCHED_YIELD, device));
31+
32+ char name_buffer[256 ] = {};
33+ libreCuDeviceGetName (name_buffer, 256 , device);
34+ std::cout << " Device Name: " + std::string (name_buffer) << std::endl;
35+
36+ LibreCUmodule module {};
37+
38+ // read cubin file
39+ uint8_t *image;
40+ size_t n_bytes;
41+ {
42+ std::ifstream input (" write_float.cubin" , std::ios::binary);
43+ std::vector<uint8_t > bytes (
44+ (std::istreambuf_iterator<char >(input)),
45+ (std::istreambuf_iterator<char >()));
46+ input.close ();
47+ image = new uint8_t [bytes.size ()];
48+ memcpy (image, bytes.data (), bytes.size ());
49+ n_bytes = bytes.size ();
50+ }
51+ CUDA_CHECK (libreCuModuleLoadData (&module , image, n_bytes));
52+
53+ // read functions
54+ uint32_t num_funcs{};
55+ CUDA_CHECK (libreCuModuleGetFunctionCount (&num_funcs, module ));
56+ std::cout << " Num functions: " << num_funcs << std::endl;
57+
58+ auto *functions = new LibreCUFunction[num_funcs];
59+ CUDA_CHECK (libreCuModuleEnumerateFunctions (functions, num_funcs, module ));
60+
61+ for (size_t i = 0 ; i < num_funcs; i++) {
62+ LibreCUFunction func = functions[i];
63+ const char *func_name{};
64+ CUDA_CHECK (libreCuFuncGetName (&func_name, func));
65+ std::cout << " function \" " << func_name << " \" " << std::endl;
66+ }
67+
68+ delete[] functions;
69+
70+ // find function
71+ LibreCUFunction func{};
72+ CUDA_CHECK (libreCuModuleGetFunction (&func, module , " write_float" ));
73+
74+ // create stream
75+ LibreCUstream stream{};
76+ CUDA_CHECK (libreCuStreamCreate (&stream, 0 ));
77+
78+ void *float_dst_compute_va{};
79+ void *float_dst_dma_va{};
80+ CUDA_CHECK (libreCuMemAlloc (&float_dst_compute_va, sizeof (float ), true ));
81+ CUDA_CHECK (libreCuMemAlloc (&float_dst_dma_va, sizeof (float ), true ));
82+ *(float *) float_dst_compute_va = 0 .0f ;
83+ *(float *) float_dst_dma_va = 0 .0f ;
84+
85+ // first time execution of the kernel
86+ auto start = std::chrono::high_resolution_clock::now ();
87+ {
88+ void *params[] = {
89+ &float_dst_compute_va, &float_dst_dma_va
90+ };
91+ CUDA_CHECK (
92+ libreCuLaunchKernel (func,
93+ 1 , 1 , 1 ,
94+ 1 , 1 , 1 ,
95+ 0 ,
96+ stream,
97+ params, sizeof (params) / sizeof (void *),
98+ nullptr ,
99+ false
100+ )
101+ );
102+ }
103+ CUDA_CHECK (libreCuStreamCommence (stream));
104+ CUDA_CHECK (libreCuStreamAwait (stream));
105+ auto end = std::chrono::high_resolution_clock::now ();
106+ std::cout << " Single kernel took: " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count ()
107+ << " ms" << std::endl;
108+
109+ start = std::chrono::high_resolution_clock::now ();
110+ {
111+ void *params[] = {
112+ &float_dst_compute_va, &float_dst_dma_va
113+ };
114+ for (int i = 0 ; i < 5 ; i++) {
115+ CUDA_CHECK (
116+ libreCuLaunchKernel (func,
117+ 1 , 1 , 1 ,
118+ 1 , 1 , 1 ,
119+ 0 ,
120+ stream,
121+ params, sizeof (params) / sizeof (void *),
122+ nullptr ,
123+ true
124+ )
125+ );
126+ }
127+ }
128+ CUDA_CHECK (libreCuStreamCommence (stream));
129+ CUDA_CHECK (libreCuStreamAwait (stream));
130+ end = std::chrono::high_resolution_clock::now ();
131+ std::cout << " 5xParallel kernel took: "
132+ << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count ()
133+ << " ms" << std::endl;
134+
135+ // free memory
136+ CUDA_CHECK (libreCuMemFree (float_dst_compute_va));
137+ CUDA_CHECK (libreCuMemFree (float_dst_dma_va));
138+
139+ // destroy stream
140+ CUDA_CHECK (libreCuStreamDestroy (stream));
141+
142+ // unload module
143+ CUDA_CHECK (libreCuModuleUnload (module ));
144+
145+ // destroy ctx
146+ CUDA_CHECK (libreCuCtxDestroy (ctx));
147+ return 0 ;
148+ }
0 commit comments