|
| 1 | +# Dealing with function pointers - when you really have to |
| 2 | + |
| 3 | +Function pointers are something to be avoided when writing Kokkos code (or any portable code for that matter). |
| 4 | +However, if you really have to it can be made to work - with some effort. |
| 5 | +Here we will give some explanation of why it is complicated, and how to work around the hurdles. |
| 6 | + |
| 7 | +### The naive approach and why it doesn't work. |
| 8 | + |
| 9 | +Let's start with some simple thing, where `SomeClass` contains a function pointer which you want to use inside a `KOKKOS_FUNCTION` marked function: |
| 10 | + |
| 11 | +```c++ |
| 12 | +struct SomeClass { |
| 13 | + void (*bar)(); |
| 14 | + KOKKOS_FUNCTION void print() const { |
| 15 | + bar(); |
| 16 | + } |
| 17 | +}; |
| 18 | +``` |
| 19 | +
|
| 20 | +Going forward we will use a simple examplar function: |
| 21 | +
|
| 22 | +```c++ |
| 23 | +KOKKOS_INLINE_FUNCTION void foo() { |
| 24 | + KOKKOS_IF_ON_HOST(printf("foo called from host\n");) |
| 25 | + KOKKOS_IF_ON_DEVICE(printf("foo called from device\n");) |
| 26 | +} |
| 27 | +``` |
| 28 | + |
| 29 | +This function leverages the `KOKKOS_IF_ON_HOST` and `KOKKOS_IF_ON_DEVICE` macros so we can tell which version we got. |
| 30 | + |
| 31 | +Putting it all together into a fully self contained source and lets try to call it both on host and on device: |
| 32 | + |
| 33 | +```c++ |
| 34 | +#include <Kokkos_Core.hpp> |
| 35 | + |
| 36 | +KOKKOS_INLINE_FUNCTION void foo() { |
| 37 | + KOKKOS_IF_ON_HOST(printf("foo called from host\n");) |
| 38 | + KOKKOS_IF_ON_DEVICE(printf("foo called from device\n");) |
| 39 | +} |
| 40 | + |
| 41 | +struct SomeClass { |
| 42 | + void (*bar)(); |
| 43 | + KOKKOS_FUNCTION void print() const { |
| 44 | + bar(); |
| 45 | + } |
| 46 | +}; |
| 47 | + |
| 48 | +int main(int argc, char* argv[]) { |
| 49 | + Kokkos::initialize(argc, argv); |
| 50 | + { |
| 51 | + SomeClass A; |
| 52 | + A.bar = &foo; |
| 53 | + // Call it plain on host |
| 54 | + A.print(); |
| 55 | + |
| 56 | + // Call it inside a host parallel for |
| 57 | + printf("I can use the function pointer in a host parallel_for!\n"); |
| 58 | + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0,1), KOKKOS_LAMBDA(int) { |
| 59 | + A.print(); |
| 60 | + }); |
| 61 | + Kokkos::fence(); |
| 62 | + printf("Now I will crash if we compiled for CUDA/HIP\n"); |
| 63 | + // Try to call it on device |
| 64 | + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0,1), KOKKOS_LAMBDA(int) { |
| 65 | + A.print(); |
| 66 | + }); |
| 67 | + Kokkos::fence(); |
| 68 | + printf("Never got here in CUDA/HIP\n"); |
| 69 | + |
| 70 | + } |
| 71 | + Kokkos::finalize(); |
| 72 | +} |
| 73 | +``` |
| 74 | +
|
| 75 | +This worked on host (both inside and outside a `parallel_for` but crashes inside a device kernel. |
| 76 | +Here is the output: |
| 77 | +
|
| 78 | +``` |
| 79 | +foo called from host |
| 80 | +I can use the function pointer in a host parallel_for! |
| 81 | +foo called from host |
| 82 | +Now I will crash if we compiled for CUDA/HIP |
| 83 | +cudaDeviceSynchronize() error( cudaErrorIllegalAddress): an illegal memory access was encountered /home/crtrott/Kokkos/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp:153 |
| 84 | +Backtrace: |
| 85 | +[0x4422e3] |
| 86 | +[0x4249d5] |
| 87 | +[0x43085d] |
| 88 | +[0x430a5e] |
| 89 | +[0x429855] |
| 90 | +[0x40843e] |
| 91 | +[0x7f7fca5ba7e5] __libc_start_main |
| 92 | +[0x409cce] |
| 93 | +Aborted (core dumped) |
| 94 | +``` |
| 95 | +
|
| 96 | +*The function pointer we created was for a host function: you can not use it on device same as you can't dereference data pointers to host data!* |
| 97 | +
|
| 98 | +### Getting a device function pointer |
| 99 | +
|
| 100 | +We actually can get the pointer to the device version of our function `foo`. But we can only do that *on the device*! |
| 101 | +That means we need to run a little Kokkos kernel get the function pointer there, and copy it somehow back to the host so we can set the host object that way. |
| 102 | +
|
| 103 | +To do this we need a device `View` of a `SomeClass` instance, set the function pointer in the device code, and `deep_copy` it back into our host instance. |
| 104 | +
|
| 105 | +```c++ |
| 106 | + SomeClass A; |
| 107 | + Kokkos::View<SomeClass> A_v("A_v"); |
| 108 | + // Now init the function pointer on device: |
| 109 | + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0,1), KOKKOS_LAMBDA(int) { |
| 110 | + A_v().bar = &foo; |
| 111 | + }); |
| 112 | + // copy the device instance back to the host |
| 113 | + Kokkos::deep_copy(A, A_v); |
| 114 | +``` |
| 115 | + |
| 116 | +We are leveraging here the fact that `deep_copy` allows you to copy from and to a host scalar value, from a `View` of rank-0. |
| 117 | + |
| 118 | +If we do this `A` will contain a function pointer to a device function. That means we can capture `A` into a parallel region and execute it on the device, but now it will crash on the host. |
| 119 | + |
| 120 | +```c++ |
| 121 | +#include <Kokkos_Core.hpp> |
| 122 | +#include <cmath> |
| 123 | + |
| 124 | +KOKKOS_INLINE_FUNCTION void foo() { |
| 125 | + KOKKOS_IF_ON_HOST(printf("foo called from host\n");) |
| 126 | + KOKKOS_IF_ON_DEVICE(printf("foo called from device\n");) |
| 127 | +} |
| 128 | + |
| 129 | +struct SomeClass { |
| 130 | + void (*bar)(); |
| 131 | + KOKKOS_FUNCTION void print() const { |
| 132 | + bar(); |
| 133 | + } |
| 134 | +}; |
| 135 | + |
| 136 | +int main(int argc, char* argv[]) { |
| 137 | + Kokkos::initialize(argc, argv); |
| 138 | + { |
| 139 | + SomeClass A; |
| 140 | + Kokkos::View<SomeClass> A_v("A_v"); |
| 141 | + // Now init the function pointer on device: |
| 142 | + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0,1), KOKKOS_LAMBDA(int) { |
| 143 | + A_v().bar = &foo; |
| 144 | + }); |
| 145 | + // copy the device instance back to the host |
| 146 | + Kokkos::deep_copy(A, A_v); |
| 147 | + |
| 148 | + printf("Now I can capture A in a device kernel and use it there!\n"); |
| 149 | + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0,1), KOKKOS_LAMBDA(int) { |
| 150 | + A.print(); |
| 151 | + }); |
| 152 | + Kokkos::fence(); |
| 153 | + printf("But now I will crash on the host :-(\n"); |
| 154 | + A.print(); |
| 155 | + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0,1), KOKKOS_LAMBDA(int) { |
| 156 | + A.print(); |
| 157 | + }); |
| 158 | + Kokkos::fence(); |
| 159 | + printf("Never got here in CUDA/HIP builds\n"); |
| 160 | + } |
| 161 | + Kokkos::finalize(); |
| 162 | +} |
| 163 | +``` |
| 164 | +
|
| 165 | +If you run this with a CUDA build you get this output: |
| 166 | +
|
| 167 | +``` |
| 168 | +Now I can capture A in a device kernel and use it there! |
| 169 | +foo called from device |
| 170 | +But now I will crash on the host :-( |
| 171 | +Segmentation fault (core dumped) |
| 172 | +``` |
| 173 | +
|
| 174 | +### Creating a dual function pointer object |
| 175 | +
|
| 176 | +To do better than this we need something which contains both the device function pointer and the host function pointer. |
| 177 | +And it should call the right thing depending on where you call it from. |
| 178 | +We can in-fact write such a class: |
| 179 | +
|
| 180 | +```c++ |
| 181 | +template<class FPtr> |
| 182 | +struct DualFunctionPtr { |
| 183 | + FPtr h; |
| 184 | + FPtr d; |
| 185 | + template<class ... Args> |
| 186 | + KOKKOS_FUNCTION |
| 187 | + auto operator()(Args...args) const { |
| 188 | + KOKKOS_IF_ON_HOST( return h(args...); ) |
| 189 | + KOKKOS_IF_ON_DEVICE( return d(args...); ) |
| 190 | + } |
| 191 | +}; |
| 192 | +``` |
| 193 | + |
| 194 | +This class is templated on the function pointer type, contains a pointer for both the host and the device version, and has a templated operator that forwards all the arguments, and calls the appropriate function pointer depending on call site. |
| 195 | + |
| 196 | +We can use this class similar to `std::function` inside `SomeClass`: |
| 197 | + |
| 198 | +```c++ |
| 199 | +struct SomeClass { |
| 200 | + DualFunctionPtr<decltype(&foo)> bar; |
| 201 | + KOKKOS_FUNCTION void print() const { |
| 202 | + bar(); |
| 203 | + } |
| 204 | +}; |
| 205 | +``` |
| 206 | +
|
| 207 | +However we still need to initialize the two function pointers, using the device initialization approach from before, and also initializing the host side. Note: the order of initialization and `deep_copy` matters, because `deep_copy` will overwrite both member poitners in our wrapper class: |
| 208 | +
|
| 209 | +```c++ |
| 210 | + SomeClass A; |
| 211 | + Kokkos::View<SomeClass> A_v("A_v"); |
| 212 | + // Now init the function pointer on device: |
| 213 | + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0,1), KOKKOS_LAMBDA(int) { |
| 214 | + A_v().bar.d = &foo; |
| 215 | + }); |
| 216 | + // copy the device instance back to the host |
| 217 | + Kokkos::deep_copy(A, A_v); |
| 218 | + // Now init the host ptr |
| 219 | + A.bar.h = &foo; |
| 220 | +``` |
| 221 | + |
| 222 | +With that we have a fully working code where we can use our instance `A` of `SomeClass` on the host and on the device: |
| 223 | + |
| 224 | +```c++ |
| 225 | +#include <Kokkos_Core.hpp> |
| 226 | + |
| 227 | +KOKKOS_INLINE_FUNCTION void foo() { |
| 228 | + KOKKOS_IF_ON_HOST(printf("foo called from host\n");) |
| 229 | + KOKKOS_IF_ON_DEVICE(printf("foo called from device\n");) |
| 230 | +} |
| 231 | + |
| 232 | +template<class FPtr> |
| 233 | +struct DualFunctionPtr { |
| 234 | + FPtr h; |
| 235 | + FPtr d; |
| 236 | + template<class ... Args> |
| 237 | + KOKKOS_FUNCTION |
| 238 | + auto operator()(Args...args) const { |
| 239 | + KOKKOS_IF_ON_HOST( return h(args...); ) |
| 240 | + KOKKOS_IF_ON_DEVICE( return d(args...); ) |
| 241 | + } |
| 242 | +}; |
| 243 | + |
| 244 | +struct SomeClass { |
| 245 | + DualFunctionPtr<decltype(&foo)> bar; |
| 246 | + KOKKOS_FUNCTION void print() const { |
| 247 | + bar(); |
| 248 | + } |
| 249 | +}; |
| 250 | + |
| 251 | +int main(int argc, char* argv[]) { |
| 252 | + Kokkos::initialize(argc, argv); |
| 253 | + { |
| 254 | + SomeClass A; |
| 255 | + Kokkos::View<SomeClass> A_v("A_v"); |
| 256 | + // Now init the function pointer on device: |
| 257 | + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0,1), KOKKOS_LAMBDA(int) { |
| 258 | + A_v().bar.d = &foo; |
| 259 | + }); |
| 260 | + // copy the device instance back to the host |
| 261 | + Kokkos::deep_copy(A, A_v); |
| 262 | + // Now init the host ptr |
| 263 | + A.bar.h = &foo; |
| 264 | + |
| 265 | + printf("Now I can capture A in a device kernel and use it there!\n"); |
| 266 | + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0,1), KOKKOS_LAMBDA(int) { |
| 267 | + A.print(); |
| 268 | + }); |
| 269 | + Kokkos::fence(); |
| 270 | + printf("And I can capture A on the host and use it there\n"); |
| 271 | + A.print(); |
| 272 | + Kokkos::parallel_for(Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0,1), KOKKOS_LAMBDA(int) { |
| 273 | + A.print(); |
| 274 | + }); |
| 275 | + Kokkos::fence(); |
| 276 | + printf("I didn't crash whoohoo!!\n"); |
| 277 | + } |
| 278 | + Kokkos::finalize(); |
| 279 | +} |
| 280 | +``` |
| 281 | +
|
| 282 | +While this generally works (and note this code will also work if you simply compile for host-only) there may be architectures in the future where this fails. |
| 283 | +Also we in principle strongly discourage the use of function pointers - the amount of possible problems you run into is fairly significant. |
| 284 | +Furthermore, the visibility of the functions is important. In the example above all functions were in the same translation unit. |
| 285 | +You may need to use *relocatable device code* if that is not the case. |
| 286 | +
|
0 commit comments