Skip to content

Commit 0b8b5db

Browse files
chore: trying atomics and tree reduction for CUDA reducer kernels (#3123)
* chore: trying atomics and tree reduction for CUDA reducer kernels * chore: add prod, min, max * fix: some fixes * fix: handle block boundaries * chore: add argmin and argmax * chore: add sum and max complex * chore: add sum and prod bool * chore: add count kernels * chore: add sum int32 and int64 bool kernels * chore: add sum and prod complex --------- Co-authored-by: Jim Pivarski <[email protected]>
1 parent c941e24 commit 0b8b5db

17 files changed

+1434
-0
lines changed
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import cupy as cp
2+
3+
cuda_kernel = """
4+
extern "C" {
5+
__global__ void awkward_reduce_argmax_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
6+
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
7+
8+
if (thread_id < outlength) {
9+
toptr[thread_id] = -1;
10+
}
11+
}
12+
}
13+
14+
extern "C" {
15+
__global__ void awkward_reduce_argmax_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
16+
extern __shared__ int shared[];
17+
18+
int idx = threadIdx.x;
19+
int thread_id = blockIdx.x * blockDim.x + idx;
20+
21+
if (thread_id < lenparents) {
22+
shared[idx] = thread_id;
23+
} else {
24+
shared[idx] = -1;
25+
}
26+
__syncthreads();
27+
28+
for (int stride = 1; stride < blockDim.x; stride *= 2) {
29+
int index = -1;
30+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
31+
index = shared[idx - stride];
32+
}
33+
if (index != -1 && (shared[idx] == -1 || fromptr[index] > fromptr[shared[idx]])) {
34+
shared[idx] = index;
35+
}
36+
__syncthreads();
37+
}
38+
39+
if (thread_id < lenparents) {
40+
int parent = parents[thread_id];
41+
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
42+
partial[blockIdx.x * outlength + parent] = shared[idx];
43+
}
44+
}
45+
}
46+
}
47+
48+
extern "C" {
49+
__global__ void awkward_reduce_argmax_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
50+
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
51+
52+
if (thread_id < outlength) {
53+
int max_index = -1;
54+
int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
55+
for (int i = 0; i < blocks; ++i) {
56+
int index = partial[i * outlength + thread_id];
57+
if (index != -1 && (max_index == -1 || fromptr[index] > fromptr[max_index])) {
58+
max_index = index;
59+
}
60+
}
61+
toptr[thread_id] = max_index;
62+
}
63+
}
64+
}
65+
"""
66+
67+
parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
68+
fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32)
69+
lenparents = len(parents)
70+
outlength = int(cp.max(parents)) + 1
71+
toptr = cp.full(outlength, -1, dtype=cp.int32)
72+
73+
74+
block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
75+
for i in range (len(block_size)):
76+
partial = cp.full((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), -1, dtype=cp.int32)
77+
grid_size = (lenparents + block_size[i] - 1) // block_size[i]
78+
shared_mem_size = block_size[i] * cp.int32().nbytes
79+
80+
raw_module = cp.RawModule(code=cuda_kernel)
81+
82+
awkward_reduce_argmax_a = raw_module.get_function('awkward_reduce_argmax_a')
83+
awkward_reduce_argmax_b = raw_module.get_function('awkward_reduce_argmax_b')
84+
awkward_reduce_argmax_c = raw_module.get_function('awkward_reduce_argmax_c')
85+
86+
awkward_reduce_argmax_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
87+
awkward_reduce_argmax_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
88+
awkward_reduce_argmax_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
89+
90+
assert cp.array_equal(toptr, cp.array([0, 2, 8, -1, -1, 9]))
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import cupy as cp
2+
3+
cuda_kernel = """
4+
extern "C" {
5+
__global__ void awkward_reduce_argmin_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
6+
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
7+
8+
if (thread_id < outlength) {
9+
toptr[thread_id] = -1;
10+
}
11+
}
12+
}
13+
14+
extern "C" {
15+
__global__ void awkward_reduce_argmin_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
16+
extern __shared__ int shared[];
17+
18+
int idx = threadIdx.x;
19+
int thread_id = blockIdx.x * blockDim.x + idx;
20+
21+
if (thread_id < lenparents) {
22+
shared[idx] = thread_id;
23+
} else {
24+
shared[idx] = -1;
25+
}
26+
__syncthreads();
27+
28+
for (int stride = 1; stride < blockDim.x; stride *= 2) {
29+
int index = -1;
30+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
31+
index = shared[idx - stride];
32+
}
33+
if (index != -1 && (shared[idx] == -1 || fromptr[index] < fromptr[shared[idx]])) {
34+
shared[idx] = index;
35+
}
36+
__syncthreads();
37+
}
38+
39+
if (thread_id < lenparents) {
40+
int parent = parents[thread_id];
41+
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
42+
partial[blockIdx.x * outlength + parent] = shared[idx];
43+
}
44+
}
45+
}
46+
}
47+
48+
extern "C" {
49+
__global__ void awkward_reduce_argmin_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
50+
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
51+
52+
if (thread_id < outlength) {
53+
int min_index = -1;
54+
int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
55+
for (int i = 0; i < blocks; ++i) {
56+
int index = partial[i * outlength + thread_id];
57+
if (index != -1 && (min_index == -1 || fromptr[index] < fromptr[min_index])) {
58+
min_index = index;
59+
}
60+
}
61+
toptr[thread_id] = min_index;
62+
}
63+
}
64+
}
65+
"""
66+
67+
parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
68+
fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32)
69+
lenparents = len(parents)
70+
outlength = int(cp.max(parents)) + 1
71+
toptr = cp.full(outlength, -1, dtype=cp.int32)
72+
73+
74+
block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
75+
for i in range (len(block_size)):
76+
partial = cp.full((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), -1, dtype=cp.int32)
77+
grid_size = (lenparents + block_size[i] - 1) // block_size[i]
78+
shared_mem_size = block_size[i] * cp.int32().nbytes
79+
80+
raw_module = cp.RawModule(code=cuda_kernel)
81+
82+
awkward_reduce_argmin_a = raw_module.get_function('awkward_reduce_argmin_a')
83+
awkward_reduce_argmin_b = raw_module.get_function('awkward_reduce_argmin_b')
84+
awkward_reduce_argmin_c = raw_module.get_function('awkward_reduce_argmin_c')
85+
86+
awkward_reduce_argmin_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
87+
awkward_reduce_argmin_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
88+
awkward_reduce_argmin_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
89+
90+
assert cp.array_equal(toptr, cp.array([0, 1, 3, -1, -1, 9]))
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import cupy as cp
2+
3+
cuda_kernel = """
4+
extern "C" {
5+
__global__ void awkward_reduce_countnonzero_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
6+
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
7+
8+
if (thread_id < outlength) {
9+
toptr[thread_id] = 0;
10+
}
11+
}
12+
}
13+
14+
extern "C" {
15+
__global__ void awkward_reduce_countnonzero_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
16+
extern __shared__ int shared[];
17+
18+
int idx = threadIdx.x;
19+
int thread_id = blockIdx.x * blockDim.x + idx;
20+
21+
if (thread_id < lenparents) {
22+
shared[idx] = 1;
23+
} else {
24+
shared[idx] = 0;
25+
}
26+
__syncthreads();
27+
28+
for (int stride = 1; stride < blockDim.x; stride *= 2) {
29+
int val = 0;
30+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
31+
val = shared[idx - stride];
32+
}
33+
shared[idx] += val;
34+
__syncthreads();
35+
}
36+
37+
if (thread_id < lenparents) {
38+
int parent = parents[thread_id];
39+
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
40+
partial[blockIdx.x * outlength + parent] = shared[idx];
41+
}
42+
}
43+
}
44+
}
45+
46+
extern "C" {
47+
__global__ void awkward_reduce_countnonzero_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
48+
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
49+
50+
if (thread_id < outlength) {
51+
int countnonzero = 0;
52+
int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
53+
for (int i = 0; i < blocks; ++i) {
54+
countnonzero += partial[i * outlength + thread_id];
55+
}
56+
toptr[thread_id] = countnonzero;
57+
}
58+
}
59+
}
60+
"""
61+
62+
parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
63+
fromptr = cp.array([1, 2, 3, 0, 5, 6, 0, 8, 9, 0], dtype=cp.int32)
64+
lenparents = len(parents)
65+
outlength = int(cp.max(parents)) + 1
66+
toptr = cp.zeros(outlength, dtype=cp.int32)
67+
68+
block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
69+
for i in range (len(block_size)):
70+
partial = cp.zeros((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32)
71+
grid_size = (lenparents + block_size[i] - 1) // block_size[i]
72+
shared_mem_size = block_size[i] * cp.int32().nbytes
73+
74+
raw_module = cp.RawModule(code=cuda_kernel)
75+
76+
awkward_reduce_countnonzero_a = raw_module.get_function('awkward_reduce_countnonzero_a')
77+
awkward_reduce_countnonzero_b = raw_module.get_function('awkward_reduce_countnonzero_b')
78+
awkward_reduce_countnonzero_c = raw_module.get_function('awkward_reduce_countnonzero_c')
79+
80+
awkward_reduce_countnonzero_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
81+
awkward_reduce_countnonzero_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
82+
awkward_reduce_countnonzero_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
83+
84+
assert cp.array_equal(toptr, cp.array([1, 2, 6, 0, 0, 1]))
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import cupy as cp
2+
3+
cuda_kernel = """
4+
extern "C" {
5+
__global__ void awkward_reduce_countnonzero_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
6+
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
7+
8+
if (thread_id < outlength) {
9+
toptr[thread_id] = 0;
10+
}
11+
}
12+
}
13+
14+
extern "C" {
15+
__global__ void awkward_reduce_countnonzero_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
16+
extern __shared__ int shared[];
17+
18+
int idx = threadIdx.x;
19+
int thread_id = blockIdx.x * blockDim.x + idx;
20+
21+
if (thread_id < lenparents) {
22+
shared[idx] = (fromptr[thread_id] != 0) ? 1 : 0;
23+
} else {
24+
shared[idx] = 0;
25+
}
26+
__syncthreads();
27+
28+
for (int stride = 1; stride < blockDim.x; stride *= 2) {
29+
int val = 0;
30+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
31+
val = shared[idx - stride];
32+
}
33+
shared[idx] += val;
34+
__syncthreads();
35+
}
36+
37+
if (thread_id < lenparents) {
38+
int parent = parents[thread_id];
39+
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
40+
partial[blockIdx.x * outlength + parent] = shared[idx];
41+
}
42+
}
43+
}
44+
}
45+
46+
extern "C" {
47+
__global__ void awkward_reduce_countnonzero_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
48+
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
49+
50+
if (thread_id < outlength) {
51+
int countnonzero = 0;
52+
int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
53+
for (int i = 0; i < blocks; ++i) {
54+
countnonzero += partial[i * outlength + thread_id];
55+
}
56+
toptr[thread_id] = countnonzero;
57+
}
58+
}
59+
}
60+
"""
61+
62+
parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
63+
fromptr = cp.array([1, 2, 3, 0, 5, 6, 0, 8, 9, 0], dtype=cp.int32)
64+
lenparents = len(parents)
65+
outlength = int(cp.max(parents)) + 1
66+
toptr = cp.zeros(outlength, dtype=cp.int32)
67+
68+
block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
69+
for i in range (len(block_size)):
70+
partial = cp.zeros((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32)
71+
grid_size = (lenparents + block_size[i] - 1) // block_size[i]
72+
shared_mem_size = block_size[i] * cp.int32().nbytes
73+
74+
raw_module = cp.RawModule(code=cuda_kernel)
75+
76+
awkward_reduce_countnonzero_a = raw_module.get_function('awkward_reduce_countnonzero_a')
77+
awkward_reduce_countnonzero_b = raw_module.get_function('awkward_reduce_countnonzero_b')
78+
awkward_reduce_countnonzero_c = raw_module.get_function('awkward_reduce_countnonzero_c')
79+
80+
awkward_reduce_countnonzero_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
81+
awkward_reduce_countnonzero_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
82+
awkward_reduce_countnonzero_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
83+
84+
assert cp.array_equal(toptr, cp.array([1, 2, 4, 0, 0, 0]))

0 commit comments

Comments
 (0)