Skip to content

Commit a1da072

Browse files
ManasviGoyal and ianna authored
test: reducer CUDA kernel tests (#3162)
* feat: add tree reduction implementation of argmin and argmax * feat: add awkward_ListOffsetArray_reduce_local_outoffsets_64 kernel * test: integration tests for cuda * test: some more integration tests for cuda * feat: add awkward_reduce_count_64 kernel * fix: indexing and indentation * feat: add awkward_reduce_countnonzero kernel * feat: add reduce sum, min and max kernels * feat: add reduce prod and sum_int_bool * feat: add sum_bool and prod_bool kernels * fix: use cpt.assert_allclose * test: reducer integration tests * fix: type conversion * fix: use atomic to avoid race conditions * fix: remove unnecessary variable * fix: minor fixes * fix: all reducer for atomics * fix: missing template * fix: remove complex * fix: atomicMin() for float 32 and indentation * fix: pass correct dtype of identity * fix: remove combinations test * fix: manage resources and disable failing test * fix: uncomment fixed test for slicing * fix: correctly interpret typetracer array for cuda backend * fix: tests-spec error for bool * fix: check for the backend of head * test: reducer CUDA kernel tests * test: add more reducer tests * test: add more reducer tests 2 * fix: error for EmptyArray * test: generic_reducer_operation and block_boundary * Update dev/generate-tests.py Co-authored-by: Ianna Osborne <[email protected]> --------- Co-authored-by: Ianna Osborne <[email protected]>
1 parent ae1ba10 commit a1da072

15 files changed

+1411
-121
lines changed

src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -59,17 +59,17 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b(
5959
}
6060
__syncthreads();
6161

62-
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
63-
int64_t val = 0;
64-
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
65-
val = temp[thread_id - stride];
62+
if (thread_id < lenparents) {
63+
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
64+
int64_t val = 0;
65+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
66+
val = temp[thread_id - stride];
67+
}
68+
__syncthreads();
69+
temp[thread_id] += val;
70+
__syncthreads();
6671
}
67-
__syncthreads();
68-
temp[thread_id] += val;
69-
__syncthreads();
70-
}
7172

72-
if (thread_id < lenparents) {
7373
int64_t parent = parents[thread_id];
7474
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
7575
atomicAdd(&scan_in_array[parent], temp[thread_id]);

src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -59,19 +59,19 @@ awkward_reduce_argmax_b(
5959
}
6060
__syncthreads();
6161

62-
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
63-
int64_t index = -1;
64-
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
65-
index = temp[thread_id - stride];
66-
}
67-
if (index != -1 && (temp[thread_id] == -1 || fromptr[index] > fromptr[temp[thread_id]] ||
68-
(fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
69-
temp[thread_id] = index;
62+
if (thread_id < lenparents) {
63+
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
64+
int64_t index = -1;
65+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
66+
index = temp[thread_id - stride];
67+
}
68+
if (index != -1 && (temp[thread_id] == -1 || fromptr[index] > fromptr[temp[thread_id]] ||
69+
(fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
70+
temp[thread_id] = index;
71+
}
72+
__syncthreads();
7073
}
71-
__syncthreads();
72-
}
7374

74-
if (thread_id < lenparents) {
7575
int64_t parent = parents[thread_id];
7676
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
7777
atomicExch(&atomic_toptr[parent], temp[thread_id]);

src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmin.cu

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -59,19 +59,19 @@ awkward_reduce_argmin_b(
5959
}
6060
__syncthreads();
6161

62-
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
63-
int64_t index = -1;
64-
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
65-
index = temp[thread_id - stride];
66-
}
67-
if (index != -1 && (temp[thread_id] == -1 || fromptr[index] < fromptr[temp[thread_id]] ||
68-
(fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
69-
temp[thread_id] = index;
62+
if (thread_id < lenparents) {
63+
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
64+
int64_t index = -1;
65+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
66+
index = temp[thread_id - stride];
67+
}
68+
if (index != -1 && (temp[thread_id] == -1 || fromptr[index] < fromptr[temp[thread_id]] ||
69+
(fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
70+
temp[thread_id] = index;
71+
}
72+
__syncthreads();
7073
}
71-
__syncthreads();
72-
}
7374

74-
if (thread_id < lenparents) {
7575
int64_t parent = parents[thread_id];
7676
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
7777
atomicExch(&atomic_toptr[parent], temp[thread_id]);

src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_count_64.cu

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -52,17 +52,17 @@ awkward_reduce_count_64_b(
5252
}
5353
__syncthreads();
5454

55-
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
56-
int64_t val = 0;
57-
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
58-
val = temp[thread_id - stride];
55+
if (thread_id < lenparents) {
56+
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
57+
int64_t val = 0;
58+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
59+
val = temp[thread_id - stride];
60+
}
61+
__syncthreads();
62+
temp[thread_id] += val;
63+
__syncthreads();
5964
}
60-
__syncthreads();
61-
temp[thread_id] += val;
62-
__syncthreads();
63-
}
6465

65-
if (thread_id < lenparents) {
6666
int64_t parent = parents[thread_id];
6767
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
6868
atomicAdd(&toptr[parent], temp[thread_id]);

src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_countnonzero.cu

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,17 @@ awkward_reduce_countnonzero_b(
5454
}
5555
__syncthreads();
5656

57-
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
58-
int64_t val = 0;
59-
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
60-
val = temp[thread_id - stride];
57+
if (thread_id < lenparents) {
58+
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
59+
int64_t val = 0;
60+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
61+
val = temp[thread_id - stride];
62+
}
63+
__syncthreads();
64+
temp[thread_id] += val;
65+
__syncthreads();
6166
}
62-
__syncthreads();
63-
temp[thread_id] += val;
64-
__syncthreads();
65-
}
6667

67-
if (thread_id < lenparents) {
6868
int64_t parent = parents[thread_id];
6969
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
7070
atomicAdd(&toptr[parent], temp[thread_id]);

src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_max.cu

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -55,18 +55,18 @@ awkward_reduce_max_b(
5555
}
5656
__syncthreads();
5757

58-
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
59-
T val = identity;
58+
if (thread_id < lenparents) {
59+
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
60+
T val = identity;
6061

61-
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
62-
val = temp[idx - stride];
62+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
63+
val = temp[idx - stride];
64+
}
65+
__syncthreads();
66+
temp[thread_id] = val > temp[thread_id] ? val : temp[thread_id];
67+
__syncthreads();
6368
}
64-
__syncthreads();
65-
temp[thread_id] = val > temp[thread_id] ? val : temp[thread_id];
66-
__syncthreads();
67-
}
6869

69-
if (thread_id < lenparents) {
7070
int64_t parent = parents[thread_id];
7171
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
7272
atomicMax(&toptr[parent], temp[thread_id]);

src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_min.cu

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -56,17 +56,17 @@ awkward_reduce_min_b(
5656
}
5757
__syncthreads();
5858

59-
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
60-
T val = identity;
61-
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
62-
val = temp[thread_id - stride];
59+
if (thread_id < lenparents) {
60+
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
61+
T val = identity;
62+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
63+
val = temp[thread_id - stride];
64+
}
65+
__syncthreads();
66+
temp[thread_id] = val < temp[thread_id] ? val : temp[thread_id];
67+
__syncthreads();
6368
}
64-
__syncthreads();
65-
temp[thread_id] = val < temp[thread_id] ? val : temp[thread_id];
66-
__syncthreads();
67-
}
6869

69-
if (thread_id < lenparents) {
7070
int64_t parent = parents[thread_id];
7171
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
7272
atomicMin(&toptr[parent], temp[thread_id]);

src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod.cu

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -59,17 +59,17 @@ awkward_reduce_prod_b(
5959
}
6060
__syncthreads();
6161

62-
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
63-
T val = 1;
64-
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
65-
val = temp[thread_id - stride];
62+
if (thread_id < lenparents) {
63+
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
64+
T val = 1;
65+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
66+
val = temp[thread_id - stride];
67+
}
68+
__syncthreads();
69+
temp[thread_id] *= val;
70+
__syncthreads();
6671
}
67-
__syncthreads();
68-
temp[thread_id] *= val;
69-
__syncthreads();
70-
}
7172

72-
if (thread_id < lenparents) {
7373
int64_t parent = parents[thread_id];
7474
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
7575
atomicMul(&atomic_toptr[parent], temp[thread_id]);

src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_prod_bool.cu

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -59,17 +59,17 @@ awkward_reduce_prod_bool_b(
5959
}
6060
__syncthreads();
6161

62-
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
63-
T val = 1;
64-
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
65-
val = temp[thread_id - stride];
62+
if (thread_id < lenparents) {
63+
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
64+
T val = 1;
65+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
66+
val = temp[thread_id - stride];
67+
}
68+
__syncthreads();
69+
temp[thread_id] &= (val != 0);
70+
__syncthreads();
6671
}
67-
__syncthreads();
68-
temp[thread_id] &= (val != 0);
69-
__syncthreads();
70-
}
7172

72-
if (thread_id < lenparents) {
7373
int64_t parent = parents[thread_id];
7474
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
7575
atomicAnd(&atomic_toptr[parent], temp[thread_id]);

src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum.cu

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,17 @@ awkward_reduce_sum_b(
5454
}
5555
__syncthreads();
5656

57-
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
58-
T val = 0;
59-
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
60-
val = temp[thread_id - stride];
57+
if (thread_id < lenparents) {
58+
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
59+
T val = 0;
60+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
61+
val = temp[thread_id - stride];
62+
}
63+
__syncthreads();
64+
temp[thread_id] += val;
65+
__syncthreads();
6166
}
62-
__syncthreads();
63-
temp[thread_id] += val;
64-
__syncthreads();
65-
}
6667

67-
if (thread_id < lenparents) {
6868
int64_t parent = parents[thread_id];
6969
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
7070
atomicAdd(&toptr[parent], temp[thread_id]);

src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_bool.cu

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -59,17 +59,17 @@ awkward_reduce_sum_bool_b(
5959
}
6060
__syncthreads();
6161

62-
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
63-
T val = 0;
64-
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
65-
val = temp[thread_id - stride];
62+
if (thread_id < lenparents) {
63+
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
64+
T val = 0;
65+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
66+
val = temp[thread_id - stride];
67+
}
68+
__syncthreads();
69+
temp[thread_id] |= (val != 0);
70+
__syncthreads();
6671
}
67-
__syncthreads();
68-
temp[thread_id] |= (val != 0);
69-
__syncthreads();
70-
}
7172

72-
if (thread_id < lenparents) {
7373
int64_t parent = parents[thread_id];
7474
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
7575
atomicOr(&atomic_toptr[parent], temp[thread_id]);

src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int32_bool_64.cu

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,17 @@ awkward_reduce_sum_int32_bool_64_b(
5454
}
5555
__syncthreads();
5656

57-
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
58-
T val = 0;
59-
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
60-
val = temp[thread_id - stride];
57+
if (thread_id < lenparents) {
58+
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
59+
T val = 0;
60+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
61+
val = temp[thread_id - stride];
62+
}
63+
__syncthreads();
64+
temp[thread_id] += val;
65+
__syncthreads();
6166
}
62-
__syncthreads();
63-
temp[thread_id] += val;
64-
__syncthreads();
65-
}
6667

67-
if (thread_id < lenparents) {
6868
int64_t parent = parents[thread_id];
6969
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
7070
atomicAdd(&toptr[parent], temp[thread_id]);

src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_sum_int64_bool_64.cu

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,17 @@ awkward_reduce_sum_int64_bool_64_b(
5454
}
5555
__syncthreads();
5656

57-
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
58-
T val = 0;
59-
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
60-
val = temp[thread_id - stride];
57+
if (thread_id < lenparents) {
58+
for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
59+
T val = 0;
60+
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
61+
val = temp[thread_id - stride];
62+
}
63+
__syncthreads();
64+
temp[thread_id] += val;
65+
__syncthreads();
6166
}
62-
__syncthreads();
63-
temp[thread_id] += val;
64-
__syncthreads();
65-
}
6667

67-
if (thread_id < lenparents) {
6868
int64_t parent = parents[thread_id];
6969
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
7070
atomicAdd(&toptr[parent], temp[thread_id]);

0 commit comments

Comments
 (0)