File tree Expand file tree Collapse file tree 15 files changed +1411
-121
lines changed
src/awkward/_connect/cuda/cuda_kernels Expand file tree Collapse file tree 15 files changed +1411
-121
lines changed Original file line number Diff line number Diff line change @@ -59,17 +59,17 @@ awkward_ListOffsetArray_reduce_local_outoffsets_64_b(
59
59
}
60
60
__syncthreads ();
61
61
62
- for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
63
- int64_t val = 0 ;
64
- if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
65
- val = temp[thread_id - stride];
62
+ if (thread_id < lenparents) {
63
+ for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
64
+ int64_t val = 0 ;
65
+ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
66
+ val = temp[thread_id - stride];
67
+ }
68
+ __syncthreads ();
69
+ temp[thread_id] += val;
70
+ __syncthreads ();
66
71
}
67
- __syncthreads ();
68
- temp[thread_id] += val;
69
- __syncthreads ();
70
- }
71
72
72
- if (thread_id < lenparents) {
73
73
int64_t parent = parents[thread_id];
74
74
if (idx == blockDim .x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1 ]) {
75
75
atomicAdd (&scan_in_array[parent], temp[thread_id]);
Original file line number Diff line number Diff line change @@ -59,19 +59,19 @@ awkward_reduce_argmax_b(
59
59
}
60
60
__syncthreads ();
61
61
62
- for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
63
- int64_t index = -1 ;
64
- if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
65
- index = temp[thread_id - stride];
66
- }
67
- if (index != -1 && (temp[thread_id] == -1 || fromptr[index] > fromptr[temp[thread_id]] ||
68
- (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
69
- temp[thread_id] = index;
62
+ if (thread_id < lenparents) {
63
+ for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
64
+ int64_t index = -1 ;
65
+ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
66
+ index = temp[thread_id - stride];
67
+ }
68
+ if (index != -1 && (temp[thread_id] == -1 || fromptr[index] > fromptr[temp[thread_id]] ||
69
+ (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
70
+ temp[thread_id] = index;
71
+ }
72
+ __syncthreads ();
70
73
}
71
- __syncthreads ();
72
- }
73
74
74
- if (thread_id < lenparents) {
75
75
int64_t parent = parents[thread_id];
76
76
if (idx == blockDim .x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1 ]) {
77
77
atomicExch (&atomic_toptr[parent], temp[thread_id]);
Original file line number Diff line number Diff line change @@ -59,19 +59,19 @@ awkward_reduce_argmin_b(
59
59
}
60
60
__syncthreads ();
61
61
62
- for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
63
- int64_t index = -1 ;
64
- if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
65
- index = temp[thread_id - stride];
66
- }
67
- if (index != -1 && (temp[thread_id] == -1 || fromptr[index] < fromptr[temp[thread_id]] ||
68
- (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
69
- temp[thread_id] = index;
62
+ if (thread_id < lenparents) {
63
+ for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
64
+ int64_t index = -1 ;
65
+ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
66
+ index = temp[thread_id - stride];
67
+ }
68
+ if (index != -1 && (temp[thread_id] == -1 || fromptr[index] < fromptr[temp[thread_id]] ||
69
+ (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
70
+ temp[thread_id] = index;
71
+ }
72
+ __syncthreads ();
70
73
}
71
- __syncthreads ();
72
- }
73
74
74
- if (thread_id < lenparents) {
75
75
int64_t parent = parents[thread_id];
76
76
if (idx == blockDim .x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1 ]) {
77
77
atomicExch (&atomic_toptr[parent], temp[thread_id]);
Original file line number Diff line number Diff line change @@ -52,17 +52,17 @@ awkward_reduce_count_64_b(
52
52
}
53
53
__syncthreads ();
54
54
55
- for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
56
- int64_t val = 0 ;
57
- if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
58
- val = temp[thread_id - stride];
55
+ if (thread_id < lenparents) {
56
+ for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
57
+ int64_t val = 0 ;
58
+ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
59
+ val = temp[thread_id - stride];
60
+ }
61
+ __syncthreads ();
62
+ temp[thread_id] += val;
63
+ __syncthreads ();
59
64
}
60
- __syncthreads ();
61
- temp[thread_id] += val;
62
- __syncthreads ();
63
- }
64
65
65
- if (thread_id < lenparents) {
66
66
int64_t parent = parents[thread_id];
67
67
if (idx == blockDim .x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1 ]) {
68
68
atomicAdd (&toptr[parent], temp[thread_id]);
Original file line number Diff line number Diff line change @@ -54,17 +54,17 @@ awkward_reduce_countnonzero_b(
54
54
}
55
55
__syncthreads ();
56
56
57
- for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
58
- int64_t val = 0 ;
59
- if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
60
- val = temp[thread_id - stride];
57
+ if (thread_id < lenparents) {
58
+ for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
59
+ int64_t val = 0 ;
60
+ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
61
+ val = temp[thread_id - stride];
62
+ }
63
+ __syncthreads ();
64
+ temp[thread_id] += val;
65
+ __syncthreads ();
61
66
}
62
- __syncthreads ();
63
- temp[thread_id] += val;
64
- __syncthreads ();
65
- }
66
67
67
- if (thread_id < lenparents) {
68
68
int64_t parent = parents[thread_id];
69
69
if (idx == blockDim .x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1 ]) {
70
70
atomicAdd (&toptr[parent], temp[thread_id]);
Original file line number Diff line number Diff line change @@ -55,18 +55,18 @@ awkward_reduce_max_b(
55
55
}
56
56
__syncthreads ();
57
57
58
- for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
59
- T val = identity;
58
+ if (thread_id < lenparents) {
59
+ for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
60
+ T val = identity;
60
61
61
- if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
62
- val = temp[idx - stride];
62
+ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
63
+ val = temp[idx - stride];
64
+ }
65
+ __syncthreads ();
66
+ temp[thread_id] = val > temp[thread_id] ? val : temp[thread_id];
67
+ __syncthreads ();
63
68
}
64
- __syncthreads ();
65
- temp[thread_id] = val > temp[thread_id] ? val : temp[thread_id];
66
- __syncthreads ();
67
- }
68
69
69
- if (thread_id < lenparents) {
70
70
int64_t parent = parents[thread_id];
71
71
if (idx == blockDim .x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1 ]) {
72
72
atomicMax (&toptr[parent], temp[thread_id]);
Original file line number Diff line number Diff line change @@ -56,17 +56,17 @@ awkward_reduce_min_b(
56
56
}
57
57
__syncthreads ();
58
58
59
- for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
60
- T val = identity;
61
- if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
62
- val = temp[thread_id - stride];
59
+ if (thread_id < lenparents) {
60
+ for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
61
+ T val = identity;
62
+ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
63
+ val = temp[thread_id - stride];
64
+ }
65
+ __syncthreads ();
66
+ temp[thread_id] = val < temp[thread_id] ? val : temp[thread_id];
67
+ __syncthreads ();
63
68
}
64
- __syncthreads ();
65
- temp[thread_id] = val < temp[thread_id] ? val : temp[thread_id];
66
- __syncthreads ();
67
- }
68
69
69
- if (thread_id < lenparents) {
70
70
int64_t parent = parents[thread_id];
71
71
if (idx == blockDim .x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1 ]) {
72
72
atomicMin (&toptr[parent], temp[thread_id]);
Original file line number Diff line number Diff line change @@ -59,17 +59,17 @@ awkward_reduce_prod_b(
59
59
}
60
60
__syncthreads ();
61
61
62
- for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
63
- T val = 1 ;
64
- if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
65
- val = temp[thread_id - stride];
62
+ if (thread_id < lenparents) {
63
+ for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
64
+ T val = 1 ;
65
+ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
66
+ val = temp[thread_id - stride];
67
+ }
68
+ __syncthreads ();
69
+ temp[thread_id] *= val;
70
+ __syncthreads ();
66
71
}
67
- __syncthreads ();
68
- temp[thread_id] *= val;
69
- __syncthreads ();
70
- }
71
72
72
- if (thread_id < lenparents) {
73
73
int64_t parent = parents[thread_id];
74
74
if (idx == blockDim .x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1 ]) {
75
75
atomicMul (&atomic_toptr[parent], temp[thread_id]);
Original file line number Diff line number Diff line change @@ -59,17 +59,17 @@ awkward_reduce_prod_bool_b(
59
59
}
60
60
__syncthreads ();
61
61
62
- for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
63
- T val = 1 ;
64
- if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
65
- val = temp[thread_id - stride];
62
+ if (thread_id < lenparents) {
63
+ for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
64
+ T val = 1 ;
65
+ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
66
+ val = temp[thread_id - stride];
67
+ }
68
+ __syncthreads ();
69
+ temp[thread_id] &= (val != 0 );
70
+ __syncthreads ();
66
71
}
67
- __syncthreads ();
68
- temp[thread_id] &= (val != 0 );
69
- __syncthreads ();
70
- }
71
72
72
- if (thread_id < lenparents) {
73
73
int64_t parent = parents[thread_id];
74
74
if (idx == blockDim .x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1 ]) {
75
75
atomicAnd (&atomic_toptr[parent], temp[thread_id]);
Original file line number Diff line number Diff line change @@ -54,17 +54,17 @@ awkward_reduce_sum_b(
54
54
}
55
55
__syncthreads ();
56
56
57
- for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
58
- T val = 0 ;
59
- if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
60
- val = temp[thread_id - stride];
57
+ if (thread_id < lenparents) {
58
+ for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
59
+ T val = 0 ;
60
+ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
61
+ val = temp[thread_id - stride];
62
+ }
63
+ __syncthreads ();
64
+ temp[thread_id] += val;
65
+ __syncthreads ();
61
66
}
62
- __syncthreads ();
63
- temp[thread_id] += val;
64
- __syncthreads ();
65
- }
66
67
67
- if (thread_id < lenparents) {
68
68
int64_t parent = parents[thread_id];
69
69
if (idx == blockDim .x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1 ]) {
70
70
atomicAdd (&toptr[parent], temp[thread_id]);
Original file line number Diff line number Diff line change @@ -59,17 +59,17 @@ awkward_reduce_sum_bool_b(
59
59
}
60
60
__syncthreads ();
61
61
62
- for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
63
- T val = 0 ;
64
- if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
65
- val = temp[thread_id - stride];
62
+ if (thread_id < lenparents) {
63
+ for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
64
+ T val = 0 ;
65
+ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
66
+ val = temp[thread_id - stride];
67
+ }
68
+ __syncthreads ();
69
+ temp[thread_id] |= (val != 0 );
70
+ __syncthreads ();
66
71
}
67
- __syncthreads ();
68
- temp[thread_id] |= (val != 0 );
69
- __syncthreads ();
70
- }
71
72
72
- if (thread_id < lenparents) {
73
73
int64_t parent = parents[thread_id];
74
74
if (idx == blockDim .x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1 ]) {
75
75
atomicOr (&atomic_toptr[parent], temp[thread_id]);
Original file line number Diff line number Diff line change @@ -54,17 +54,17 @@ awkward_reduce_sum_int32_bool_64_b(
54
54
}
55
55
__syncthreads ();
56
56
57
- for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
58
- T val = 0 ;
59
- if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
60
- val = temp[thread_id - stride];
57
+ if (thread_id < lenparents) {
58
+ for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
59
+ T val = 0 ;
60
+ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
61
+ val = temp[thread_id - stride];
62
+ }
63
+ __syncthreads ();
64
+ temp[thread_id] += val;
65
+ __syncthreads ();
61
66
}
62
- __syncthreads ();
63
- temp[thread_id] += val;
64
- __syncthreads ();
65
- }
66
67
67
- if (thread_id < lenparents) {
68
68
int64_t parent = parents[thread_id];
69
69
if (idx == blockDim .x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1 ]) {
70
70
atomicAdd (&toptr[parent], temp[thread_id]);
Original file line number Diff line number Diff line change @@ -54,17 +54,17 @@ awkward_reduce_sum_int64_bool_64_b(
54
54
}
55
55
__syncthreads ();
56
56
57
- for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
58
- T val = 0 ;
59
- if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
60
- val = temp[thread_id - stride];
57
+ if (thread_id < lenparents) {
58
+ for (int64_t stride = 1 ; stride < blockDim .x ; stride *= 2 ) {
59
+ T val = 0 ;
60
+ if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
61
+ val = temp[thread_id - stride];
62
+ }
63
+ __syncthreads ();
64
+ temp[thread_id] += val;
65
+ __syncthreads ();
61
66
}
62
- __syncthreads ();
63
- temp[thread_id] += val;
64
- __syncthreads ();
65
- }
66
67
67
- if (thread_id < lenparents) {
68
68
int64_t parent = parents[thread_id];
69
69
if (idx == blockDim .x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1 ]) {
70
70
atomicAdd (&toptr[parent], temp[thread_id]);
You can’t perform that action at this time.
0 commit comments