fix: runtime for ak.combinations with cuda backend (#3502)

ianna · pre-commit-ci[bot] · web-flow · commit 914e012502dc · 2025-05-12T22:45:38.000+02:00
* optimize awkward_ListArray_combinations_length.cu

* optimize awkward_ListArray_combinations.cu

* update awkward_ListArray_combinations.cu

* update awkward_ListArray_combinations_length.cu

* cleanup

* style: pre-commit fixes

* Update awkward_ListArray_combinations.cu

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_combinations.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_combinations.cu
@@ -5,24 +5,21 @@
 //     (tocarry, toindex, fromindex, n, replacement, starts, stops, length, invocation_index, err_code) = args
 //     scan_in_array_offsets = cupy.zeros(length + 1, dtype=cupy.int64)
 //     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_combinations_a", tocarry[0].dtype, toindex.dtype, fromindex.dtype, starts.dtype, stops.dtype]))(grid, block, (tocarry, toindex, fromindex, n, replacement, starts, stops, length, scan_in_array_offsets, invocation_index, err_code))
-//     scan_in_array_offsets = cupy.cumsum(scan_in_array_offsets)
-//     scan_in_array_parents = cupy.zeros(int(scan_in_array_offsets[length]), dtype=cupy.int64)
-//     scan_in_array_local_indices = cupy.zeros(int(scan_in_array_offsets[length]), dtype=cupy.int64)
-//     for i in range(1, length + 1):
-//         scan_in_array_parents[scan_in_array_offsets[i - 1]:scan_in_array_offsets[i]] = i - 1
-//     if int(scan_in_array_offsets[length]) < 1024:
-//         block_size = int(scan_in_array_offsets[length])
-//     else:
-//         block_size = 1024
-//     if block_size > 0:
-//         grid_size = math.floor((int(scan_in_array_offsets[length]) + block_size - 1) / block_size)
-//     else:
-//         grid_size = 1
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_combinations_b", tocarry[0].dtype, toindex.dtype, fromindex.dtype, starts.dtype, stops.dtype]))((grid_size,), (block_size,), (tocarry, toindex, fromindex, n, replacement, starts, stops, length, scan_in_array_offsets, scan_in_array_parents, scan_in_array_local_indices, invocation_index, err_code))
+//     cupy.cumsum(scan_in_array_offsets, out = scan_in_array_offsets)
+//     totallen=int(scan_in_array_offsets[length])
+//     if totallen == 0:
+//         return  # Nothing to do if no combinations, skip the rest
+//     block_size = min(1024, totallen)
+//     grid_size = (totallen + block_size - 1)//block_size
+//     scan_in_array_parents = cupy.zeros(totallen, dtype=cupy.int64)
+//     scan_in_array_local_indices = cupy.zeros(totallen, dtype=cupy.int64)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_combinations_b", tocarry[0].dtype, toindex.dtype, fromindex.dtype, starts.dtype, stops.dtype]))((grid_size,), (block_size,), (tocarry, toindex, fromindex, n, replacement, starts, stops, length, scan_in_array_offsets, scan_in_array_parents, invocation_index, err_code))
 //     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_combinations_c", tocarry[0].dtype, toindex.dtype, fromindex.dtype, starts.dtype, stops.dtype]))((grid_size,), (block_size,), (tocarry, toindex, fromindex, n, replacement, starts, stops, length, scan_in_array_offsets, scan_in_array_parents, scan_in_array_local_indices, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_combinations_d", tocarry[0].dtype, toindex.dtype, fromindex.dtype, starts.dtype, stops.dtype]))((grid_size,), (block_size,), (tocarry, toindex, fromindex, n, replacement, starts, stops, length, scan_in_array_offsets, scan_in_array_parents, scan_in_array_local_indices, invocation_index, err_code))
 // out["awkward_ListArray_combinations_a", {dtype_specializations}] = None
 // out["awkward_ListArray_combinations_b", {dtype_specializations}] = None
 // out["awkward_ListArray_combinations_c", {dtype_specializations}] = None
+// out["awkward_ListArray_combinations_d", {dtype_specializations}] = None
 // END PYTHON
 
 enum class LISTARRAY_COMBINATIONS_ERRORS {
@@ -43,25 +40,61 @@ awkward_ListArray_combinations_a(
     int64_t* scan_in_array_offsets,
     uint64_t invocation_index,
     uint64_t* err_code) {
-  if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    if (thread_id < length) {
-      if (n != 2) {
-        RAISE_ERROR(LISTARRAY_COMBINATIONS_ERRORS::N_NOT_IMPLEMENTED)
-      }
-      int64_t counts = stops[thread_id] - starts[thread_id];
-      if (replacement) {
-        scan_in_array_offsets[thread_id + 1] = counts * (counts + 1) / 2;
-      } else {
-        scan_in_array_offsets[thread_id + 1] = counts * (counts - 1) / 2;
-      }
+  if (err_code[0] != NO_ERROR) {
+    return;
+  }
+
+  // For now only n==2 supported
+  if (n != 2) {
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+      RAISE_ERROR(LISTARRAY_COMBINATIONS_ERRORS::N_NOT_IMPLEMENTED)
     }
+    return;
+  }
+
+  int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Grid-stride loop for general scalability
+  for (int64_t i = thread_id; i < length; i += gridDim.x * blockDim.x) {
+    V start = starts[i];
+    W stop  = stops[i];
+    int64_t counts = stop - start;
+    int64_t result = replacement
+                        ? counts * (counts + 1) / 2
+                        : counts * (counts - 1) / 2;
+    scan_in_array_offsets[i + 1] = result;
   }
 }
 
 template <typename T, typename C, typename U, typename V, typename W>
 __global__ void
 awkward_ListArray_combinations_b(
+    T** tocarry,
+    C* toindex,
+    U* fromindex,
+    int64_t n,
+    bool replacement,
+    const V* starts,
+    const W* stops,
+    int64_t length,
+    const int64_t* __restrict__ scan_in_array_offsets,
+    int64_t* __restrict__ scan_in_array_parents,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
+  int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  if (thread_id >= length) return;
+
+  int64_t start = scan_in_array_offsets[thread_id];
+  int64_t stop  = scan_in_array_offsets[thread_id + 1];
+
+  for (int64_t i = start; i < stop; i++) {
+    scan_in_array_parents[i] = thread_id;
+  }
+}
+
+template <typename T, typename C, typename U, typename V, typename W>
+__global__ void
+awkward_ListArray_combinations_c(
     T** tocarry,
     C* toindex,
     U* fromindex,
@@ -75,22 +108,31 @@ awkward_ListArray_combinations_b(
     int64_t* scan_in_array_local_indices,
     uint64_t invocation_index,
     uint64_t* err_code) {
-  if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    int64_t offsetslength = scan_in_array_offsets[length];
-
-    if (thread_id < offsetslength) {
-      if (n != 2) {
-        RAISE_ERROR(LISTARRAY_COMBINATIONS_ERRORS::N_NOT_IMPLEMENTED)
-      }
-      scan_in_array_local_indices[thread_id] = thread_id - scan_in_array_offsets[scan_in_array_parents[thread_id]];
+  if (err_code[0] != NO_ERROR) {
+    return;
+  }
+
+  // For now only n==2 supported
+  if (n != 2) {
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+      RAISE_ERROR(LISTARRAY_COMBINATIONS_ERRORS::N_NOT_IMPLEMENTED)
     }
+    return;
+  }
+
+  int64_t offsetslength = scan_in_array_offsets[length];
+  int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Grid-stride loop
+  for (int64_t i = thread_id; i < offsetslength; i += gridDim.x * blockDim.x) {
+    int64_t parent_offset = scan_in_array_offsets[scan_in_array_parents[i]];
+    scan_in_array_local_indices[i] = i - parent_offset;
   }
 }
 
 template <typename T, typename C, typename U, typename V, typename W>
 __global__ void
-awkward_ListArray_combinations_c(
+awkward_ListArray_combinations_d(
     T** tocarry,
     C* toindex,
     U* fromindex,
@@ -104,38 +146,55 @@ awkward_ListArray_combinations_c(
     int64_t* scan_in_array_local_indices,
     uint64_t invocation_index,
     uint64_t* err_code) {
-  if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    int64_t offsetslength = scan_in_array_offsets[length];
-    int64_t i = 0;
-    int64_t j = 0;
-
-    if (thread_id < offsetslength) {
-      if (n != 2) {
-        RAISE_ERROR(LISTARRAY_COMBINATIONS_ERRORS::N_NOT_IMPLEMENTED)
-      }
-
-      int64_t n = stops[scan_in_array_parents[thread_id]] - starts[scan_in_array_parents[thread_id]];
-
-      if (replacement) {
-        int64_t b = 2 * n + 1;
-        float discriminant = sqrtf(b * b - 8 * scan_in_array_local_indices[thread_id]);
-        i = (int64_t)((b - discriminant) / 2);
-        j = scan_in_array_local_indices[thread_id] + i * (i - b + 2) / 2;
-      } else {
-        int64_t b = 2 * n - 1;
-        float discriminant = sqrtf(b * b - 8 * scan_in_array_local_indices[thread_id]);
-        i = (int64_t)((b - discriminant) / 2);
-        j = scan_in_array_local_indices[thread_id] + i * (i - b + 2) / 2 + 1;
-      }
-
-      i += starts[scan_in_array_parents[thread_id]];
-      j += starts[scan_in_array_parents[thread_id]];
-
-      tocarry[0][thread_id] = i;
-      tocarry[1][thread_id] = j;
-      toindex[0] = offsetslength;
-      toindex[1] = offsetslength;
+  if (err_code[0] != NO_ERROR) {
+    return;
+  }
+
+  // For now only n==2 supported
+  if (n != 2) {
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+      RAISE_ERROR(LISTARRAY_COMBINATIONS_ERRORS::N_NOT_IMPLEMENTED)
     }
+    return;
+  }
+
+  int64_t offsetslength = scan_in_array_offsets[length];
+  int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Grid-stride loop
+  for (int64_t idx = thread_id; idx < offsetslength; idx += gridDim.x * blockDim.x) {
+
+    int64_t parent = scan_in_array_parents[idx];
+    V start = starts[parent];
+    W stop  = stops[parent];
+    int64_t count = stop - start;
+    int64_t local_index = scan_in_array_local_indices[idx];
+
+    float discriminant;
+    int64_t i, j;
+
+    if (replacement) {
+      int64_t b = 2 * count + 1;
+      discriminant = sqrtf(float(b * b - 8 * local_index));
+      i = (int64_t)((b - discriminant) / 2.0f);
+      j = local_index + i * (i - b + 2) / 2;
+    } else {
+      int64_t b = 2 * count - 1;
+      discriminant = sqrtf(float(b * b - 8 * local_index));
+      i = (int64_t)((b - discriminant) / 2.0f);
+      j = local_index + i * (i - b + 2) / 2 + 1;
+    }
+
+    i += start;
+    j += start;
+
+    tocarry[0][idx] = i;
+    tocarry[1][idx] = j;
+  }
+
+  // Set toindex[0] and [1] only once per kernel call (thread 0 of block 0)
+  if (threadIdx.x == 0 && blockIdx.x == 0) {
+    toindex[0] = offsetslength;
+    toindex[1] = offsetslength;
   }
 }
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_combinations_length.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListArray_combinations_length.cu
@@ -3,12 +3,10 @@
 // BEGIN PYTHON
 // def f(grid, block, args):
 //     (totallen, tooffsets, n, replacement, starts, stops, length, invocation_index, err_code) = args
-//     scan_in_array_totallen = cupy.zeros(length, dtype=cupy.int64)
-//     scan_in_array_tooffsets = cupy.zeros(length, dtype=cupy.int64)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_combinations_length_a", totallen.dtype, tooffsets.dtype, starts.dtype, stops.dtype]))(grid, block, (totallen, tooffsets, n, replacement, starts, stops, length, scan_in_array_totallen, scan_in_array_tooffsets, invocation_index, err_code))
-//     scan_in_array_totallen = cupy.cumsum(scan_in_array_totallen)
-//     scan_in_array_tooffsets = cupy.cumsum(scan_in_array_tooffsets)
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_combinations_length_b", totallen.dtype, tooffsets.dtype, starts.dtype, stops.dtype]))(grid, block, (totallen, tooffsets, n, replacement, starts, stops, length, scan_in_array_totallen, scan_in_array_tooffsets,  invocation_index, err_code))
+//     scan_out = cupy.zeros(length, dtype=cupy.int64)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_combinations_length_a", totallen.dtype, tooffsets.dtype, starts.dtype, stops.dtype]))(grid, block, (totallen, tooffsets, n, replacement, starts, stops, length, scan_out, invocation_index, err_code))
+//     cupy.cumsum(scan_out, out=scan_out)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListArray_combinations_length_b", totallen.dtype, tooffsets.dtype, starts.dtype, stops.dtype]))(grid, block, (totallen, tooffsets, n, replacement, starts, stops, length, scan_out, invocation_index, err_code))
 // out["awkward_ListArray_combinations_length_a", {dtype_specializations}] = None
 // out["awkward_ListArray_combinations_length_b", {dtype_specializations}] = None
 // END PYTHON
@@ -23,40 +21,42 @@ awkward_ListArray_combinations_length_a(
     const U* starts,
     const V* stops,
     int64_t length,
-    int64_t* scan_in_array_totallen,
-    int64_t* scan_in_array_tooffsets,
+    int64_t* scan_out,
     uint64_t invocation_index,
     uint64_t* err_code) {
-  if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  if (err_code[0] != NO_ERROR) {
+    return;
+  }
+
+  int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  if (thread_id >= length) {
+    return;
+  }
+
+  int64_t size = stops[thread_id] - starts[thread_id];
+  int64_t combinationslen = 0;
+
+  if (replacement) {
+    size += (n - 1);
+  }
+
+  if (n > size) {
+    combinationslen = 0;
+  }
+  else if (n == size) {
+    combinationslen = 1;
+  }
+  else {
+    // Choose the smaller of n and size - n for fewer multiplications
+    int64_t k = (n * 2 > size) ? (size - n) : n;
 
-    if (thread_id < length) {
-      int64_t size = (int64_t)(stops[thread_id] - starts[thread_id]);
-      if (replacement) {
-        size += (n - 1);
-      }
-      int64_t thisn = n;
-      int64_t combinationslen;
-      if (thisn > size) {
-        combinationslen = 0;
-      }
-      else if (thisn == size) {
-        combinationslen = 1;
-      }
-      else {
-        if (thisn * 2 > size) {
-          thisn = size - thisn;
-        }
-        combinationslen = size;
-        for (int64_t j = 2 + threadIdx.y; j <= thisn; j += blockDim.y) {
-          combinationslen *= (size - j + 1);
-          combinationslen /= j;
-        }
-      }
-      scan_in_array_totallen[thread_id] = combinationslen;
-      scan_in_array_tooffsets[thread_id] = combinationslen;
+    combinationslen = 1;
+    for (int64_t j = 1; j <= k; ++j) {
+      combinationslen = (combinationslen * (size - j + 1)) / j;
     }
   }
+
+  scan_out[thread_id] = combinationslen;
 }
 
 template <typename T, typename C, typename U, typename V>
@@ -69,17 +69,24 @@ awkward_ListArray_combinations_length_b(
     const U* starts,
     const V* stops,
     int64_t length,
-    int64_t* scan_in_array_totallen,
-    int64_t* scan_in_array_tooffsets,
+    int64_t* scan_out,
     uint64_t invocation_index,
     uint64_t* err_code) {
-  if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-    *totallen = length > 0 ? scan_in_array_totallen[length - 1] : 0;
+
+  if (err_code[0] != NO_ERROR) {
+    return;
+  }
+
+  int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Let a single thread handle totallen and tooffsets[0]
+  if (thread_id == 0) {
+    *totallen = (length > 0) ? scan_out[length - 1] : 0;
     tooffsets[0] = 0;
+  }
 
-    if (thread_id < length) {
-      tooffsets[thread_id + 1] = scan_in_array_tooffsets[thread_id];
-    }
+  // Copy scan_out values into tooffsets (shifted by 1)
+  if (thread_id < length) {
+    tooffsets[thread_id + 1] = scan_out[thread_id];
   }
 }