scikit-hep
diff --git a/dev/generate-kernel-signatures.py b/dev/generate-kernel-signatures.py
Lines changed: 4 additions & 0 deletions
diff --git a/dev/generate-tests.py b/dev/generate-tests.py
Lines changed: 7 additions & 3 deletions
diff --git a/kernel-test-data.json b/kernel-test-data.json
Lines changed: 946 additions & 154 deletions
diff --git a/src/awkward/_connect/cuda/__init__.py b/src/awkward/_connect/cuda/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_ListOffsetArray_reduce_local_outoffsets_64.cu
Lines changed: 100 additions & 0 deletions
diff --git a/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu b/src/awkward/_connect/cuda/cuda_kernels/awkward_reduce_argmax.cu
Lines changed: 58 additions & 7 deletions
@@ -108,6 +108,7 @@
     "awkward_ListOffsetArray_drop_none_indexes",
     "awkward_ListOffsetArray_reduce_local_nextparents_64",
     "awkward_ListOffsetArray_reduce_nonlocal_maxcount_offsetscopy_64",
+    "awkward_ListOffsetArray_reduce_local_outoffsets_64",
     "awkward_UnionArray_flatten_length",
     "awkward_UnionArray_flatten_combine",
     "awkward_UnionArray_nestedfill_tags_index",
@@ -123,6 +124,7 @@
     "awkward_reduce_sum_int32_bool_64",
     "awkward_reduce_sum_int64_bool_64",
     "awkward_reduce_sum_bool",
+    "awkward_reduce_prod",
     "awkward_reduce_prod_bool",
     "awkward_reduce_countnonzero",
     "awkward_sorting_ranges",
@@ -381,6 +383,8 @@ def kernel_signatures_cuda_py(specification):
 from awkward._connect.cuda import fetch_specialization
 from awkward._connect.cuda import import_cupy
 
+import math
+
 cupy = import_cupy("Awkward Arrays with CUDA")
 """
         )
 
@@ -424,7 +424,6 @@ def genspectests(specdict):
 
 """
             )
-            f.write("import pytest\nimport kernels\n\n")
             f.write("import pytest\nimport numpy as np\nimport kernels\n\n")
             num = 1
             if spec.tests == []:
@@ -894,6 +893,7 @@ def gencpuunittests(specdict):
     "awkward_ListOffsetArray_drop_none_indexes",
     "awkward_ListOffsetArray_reduce_local_nextparents_64",
     "awkward_ListOffsetArray_reduce_nonlocal_maxcount_offsetscopy_64",
+    "awkward_ListOffsetArray_reduce_local_outoffsets_64",
     "awkward_UnionArray_flatten_length",
     "awkward_UnionArray_flatten_combine",
     "awkward_UnionArray_nestedfill_tags_index",
@@ -909,6 +909,7 @@ def gencpuunittests(specdict):
     "awkward_reduce_sum_int32_bool_64",
     "awkward_reduce_sum_int64_bool_64",
     "awkward_reduce_sum_bool",
+    "awkward_reduce_prod",
     "awkward_reduce_prod_bool",
     "awkward_reduce_countnonzero",
     "awkward_sorting_ranges",
@@ -959,6 +960,8 @@ def gencudakerneltests(specdict):
 
                 f.write(
                     "import cupy\n"
+                    "import cupy.testing as cpt\n"
+                    "import numpy as np\n"
                     "import pytest\n\n"
                     "import awkward as ak\n"
                     "import awkward._connect.cuda as ak_cu\n"
@@ -1028,7 +1031,7 @@ def gencudakerneltests(specdict):
                             if isinstance(val, list):
                                 f.write(
                                     " " * 4
-                                    + f"assert cupy.array_equal({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n"
+                                    + f"cpt.assert_allclose({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n"
                                 )
                             else:
                                 f.write(" " * 4 + f"assert {arg} == pytest_{arg}\n")
@@ -1088,6 +1091,7 @@ def gencudaunittests(specdict):
                 f.write(
                     "import re\n"
                     "import cupy\n"
+                    "import cupy.testing as cpt\n"
                     "import pytest\n\n"
                     "import awkward as ak\n"
                     "import awkward._connect.cuda as ak_cu\n"
@@ -1224,7 +1228,7 @@ def gencudaunittests(specdict):
                                 if isinstance(val, list):
                                     f.write(
                                         " " * 4
-                                        + f"assert cupy.array_equal({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n"
+                                        + f"cpt.assert_allclose({arg}[:len(pytest_{arg})], cupy.array(pytest_{arg}))\n"
                                     )
                                 else:
                                     f.write(" " * 4 + f"assert {arg} == pytest_{arg}\n")
 
@@ -108,6 +108,7 @@ def fetch_template_specializations(kernel_dict):
         "awkward_ListArray_rpad_axis1",
         "awkward_ListOffsetArray_drop_none_indexes",
         "awkward_ListOffsetArray_reduce_nonlocal_maxcount_offsetscopy_64",
+        "awkward_ListOffsetArray_reduce_local_outoffsets_64",
         "awkward_UnionArray_regular_index",
         "awkward_ListOffsetArray_reduce_nonlocal_nextstarts_64",
         "awkward_ListOffsetArray_rpad_axis1",
@@ -119,6 +120,7 @@ def fetch_template_specializations(kernel_dict):
         "awkward_reduce_sum_int32_bool_64",
         "awkward_reduce_sum_int64_bool_64",
         "awkward_reduce_sum_bool",
+        "awkward_reduce_prod",
         "awkward_reduce_prod_bool",
         "awkward_reduce_argmax",
         "awkward_reduce_argmin",
 
@@ -0,0 +1,100 @@
+// BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE
+
+// BEGIN PYTHON
+// def f(grid, block, args):
+//     # Host-side launcher for the three kernels in this file (_a, _b, _c).
+//     # The incoming `grid` is not used; grid sizes are derived from `block`
+//     # and from the two data lengths unpacked from `args`.
+//     (outoffsets, parents, lenparents, outlength, invocation_index, err_code) = args
+//     if block[0] > 0:
+//         # _a and _c index their data by outlength, _b by lenparents, so each
+//         # group gets a grid that covers its own index range. A grid sized
+//         # only from lenparents can leave the tail of outoffsets unwritten
+//         # when outlength is larger than the number of launched threads.
+//         # The output grid is at least one block so that outoffsets[0] is
+//         # always stored by _c.
+//         # NOTE(review): math.floor(x / y) is kept as in the original code
+//         # rather than the floor-division operator - presumably because these
+//         # lines are extracted from C++ comments; confirm before changing.
+//         grid_size_out = max(1, math.floor((outlength + block[0] - 1) / block[0]))
+//         grid_size_in = math.floor((lenparents + block[0] - 1) / block[0])
+//     else:
+//         grid_size_out = 1
+//         grid_size_in = 1
+//     # temp: one int64 slot per parents entry, used by _b as scan scratch space.
+//     temp = cupy.zeros(lenparents, dtype=cupy.int64)
+//     # scan_in_array: one uint64 counter per output list, accumulated by _b.
+//     scan_in_array = cupy.zeros(outlength, dtype=cupy.uint64)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size_out,), block, (outoffsets, parents, lenparents, outlength, scan_in_array, temp, invocation_index, err_code))
+//     # With no parents there is nothing to count, so _b is not launched.
+//     if lenparents > 0:
+//         cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size_in,), block, (outoffsets, parents, lenparents, outlength, scan_in_array, temp, invocation_index, err_code))
+//     # Turn the per-list counts into running totals before _c copies them out.
+//     scan_in_array = cupy.cumsum(scan_in_array)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", cupy.dtype(outoffsets.dtype).type, parents.dtype]))((grid_size_out,), block, (outoffsets, parents, lenparents, outlength, scan_in_array, temp, invocation_index, err_code))
+// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_a", {dtype_specializations}] = None
+// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_b", {dtype_specializations}] = None
+// out["awkward_ListOffsetArray_reduce_local_outoffsets_64_c", {dtype_specializations}] = None
+// END PYTHON
+
+// Kernel _a: stores 0 into outoffsets[i] for every i in [0, outlength),
+// one element per thread. The remaining parameters are accepted but not
+// read by this kernel.
+template <typename T, typename C>
+__global__ void
+awkward_ListOffsetArray_reduce_local_outoffsets_64_a(
+    T* outoffsets,
+    const C* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    uint64_t* scan_in_array,
+    int64_t* temp,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
+  // Do nothing unless err_code[0] is NO_ERROR.
+  if (err_code[0] != NO_ERROR) {
+    return;
+  }
+
+  const int64_t slot = blockIdx.x * blockDim.x + threadIdx.x;
+  if (slot >= outlength) {
+    return;
+  }
+  outoffsets[slot] = 0;
+}
+
+// Kernel _b: counts, per block, how many consecutive entries of `parents`
+// share the same value, and adds each run's count to scan_in_array[parent].
+//
+//   parents        input, read at indices [0, lenparents)
+//   temp           scratch, one int64 per parents entry (indices [0, lenparents))
+//   scan_in_array  output counters, updated with atomicAdd so that runs split
+//                  across blocks still accumulate into the same slot
+//
+// outoffsets, outlength and invocation_index are not read here.
+template <typename T, typename C>
+__global__ void
+awkward_ListOffsetArray_reduce_local_outoffsets_64_b(
+    T* outoffsets,
+    const C* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    uint64_t* scan_in_array,
+    int64_t* temp,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
+  if (err_code[0] == NO_ERROR) {
+    int64_t idx = threadIdx.x;
+    int64_t thread_id = blockIdx.x * blockDim.x + idx;
+    // Threads past the end of `parents` must still reach every barrier below,
+    // but must never touch temp[] or parents[].
+    const bool in_range = thread_id < lenparents;
+
+    if (in_range) {
+      temp[thread_id] = 1;
+    }
+    __syncthreads();
+
+    // Doubling-stride scan restricted to neighbours with the same parent;
+    // `idx >= stride` keeps the look-back inside the current block.
+    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+      int64_t val = 0;
+      if (in_range && idx >= stride && parents[thread_id] == parents[thread_id - stride]) {
+        val = temp[thread_id - stride];
+      }
+      // All reads of this round finish before any thread updates temp.
+      __syncthreads();
+      // Guarded: the original updated temp[thread_id] for every thread, which
+      // reads and writes past the end of temp when thread_id >= lenparents.
+      if (in_range) {
+        temp[thread_id] += val;
+      }
+      __syncthreads();
+    }
+
+    if (in_range) {
+      int64_t parent = parents[thread_id];
+      // Only the last element of a run within this block publishes its count:
+      // last thread of the block, last element overall, or the next element
+      // has a different parent (checked last so thread_id + 1 stays in range).
+      if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+        atomicAdd(&scan_in_array[parent], temp[thread_id]);
+      }
+    }
+  }
+}
+
+// Kernel _c: writes outoffsets[0] = 0 and copies scan_in_array[i] into
+// outoffsets[i + 1] for every i in [0, outlength), one element per thread.
+// The host launcher in this file replaces scan_in_array with its cumulative
+// sum before launching this kernel. parents, lenparents, temp and
+// invocation_index are not read here.
+template <typename T, typename C>
+__global__ void
+awkward_ListOffsetArray_reduce_local_outoffsets_64_c(
+    T* outoffsets,
+    const C* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    uint64_t* scan_in_array,
+    int64_t* temp,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
+  // Do nothing unless err_code[0] is NO_ERROR.
+  if (err_code[0] != NO_ERROR) {
+    return;
+  }
+
+  const int64_t slot = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Every launched thread stores the leading zero (same value, same address).
+  outoffsets[0] = 0;
+
+  if (slot < outlength) {
+    outoffsets[slot + 1] = static_cast<T>(scan_in_array[slot]);
+  }
+}
@@ -3,10 +3,18 @@
 // BEGIN PYTHON
 // def f(grid, block, args):
 //     (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code) = args
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code))
-//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", toptr.dtype, fromptr.dtype, parents.dtype]))(grid, block, (toptr, fromptr, parents, lenparents, outlength, invocation_index, err_code))
+//     if block[0] > 0:
+//         grid_size = math.floor((lenparents + block[0] - 1) / block[0])
+//     else:
+//         grid_size = 1
+//     atomic_toptr = cupy.array(toptr, dtype=cupy.uint64)
+//     temp = cupy.zeros(lenparents, dtype=toptr.dtype)
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_a", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_b", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
+//     cuda_kernel_templates.get_function(fetch_specialization(["awkward_reduce_argmax_c", cupy.dtype(toptr.dtype).type, cupy.dtype(fromptr.dtype).type, parents.dtype]))((grid_size,), block, (toptr, fromptr, parents, lenparents, outlength, atomic_toptr, temp, invocation_index, err_code))
 // out["awkward_reduce_argmax_a", {dtype_specializations}] = None
 // out["awkward_reduce_argmax_b", {dtype_specializations}] = None
+// out["awkward_reduce_argmax_c", {dtype_specializations}] = None
 // END PYTHON
 
 template <typename T, typename C, typename U>
@@ -17,12 +25,15 @@ awkward_reduce_argmax_a(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
+    uint64_t* atomic_toptr,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
     int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
     if (thread_id < outlength) {
-      toptr[thread_id] = -1;
+      atomic_toptr[thread_id] = -1;
     }
   }
 }
@@ -35,17 +46,57 @@ awkward_reduce_argmax_b(
     const U* parents,
     int64_t lenparents,
     int64_t outlength,
+    uint64_t* atomic_toptr,
+    T* temp,
     uint64_t invocation_index,
     uint64_t* err_code) {
   if (err_code[0] == NO_ERROR) {
-    int64_t thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t idx = threadIdx.x;
+    int64_t thread_id = blockIdx.x * blockDim.x + idx;
+
+    if (thread_id < lenparents) {
+      temp[thread_id] = thread_id;
+    }
+    __syncthreads();
+
+    for (int64_t stride = 1; stride < blockDim.x; stride *= 2) {
+      int64_t index = -1;
+      if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
+        index = temp[thread_id - stride];
+      }
+      if (index != -1 && (temp[thread_id] == -1 || fromptr[index] > fromptr[temp[thread_id]] ||
+         (fromptr[index] == fromptr[temp[thread_id]] && index < temp[thread_id]))) {
+        temp[thread_id] = index;
+      }
+      __syncthreads();
+    }
 
     if (thread_id < lenparents) {
       int64_t parent = parents[thread_id];
-      if (toptr[parent] == -1 ||
-          (fromptr[thread_id] > (fromptr[toptr[parent]]))) {
-        toptr[parent] = thread_id; // we need the last parent filled, thread random order problem, find max arg at that index
+      if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
+        atomicExch(&atomic_toptr[parent], temp[thread_id]);
       }
     }
   }
 }
+
+// Kernel _c: copies atomic_toptr[i], converted to T, into toptr[i] for every
+// i in [0, outlength), one element per thread. fromptr, parents, lenparents,
+// temp and invocation_index are not read here.
+template <typename T, typename C, typename U>
+__global__ void
+awkward_reduce_argmax_c(
+    T* toptr,
+    const C* fromptr,
+    const U* parents,
+    int64_t lenparents,
+    int64_t outlength,
+    uint64_t* atomic_toptr,
+    T* temp,
+    uint64_t invocation_index,
+    uint64_t* err_code) {
+  // Do nothing unless err_code[0] is NO_ERROR.
+  if (err_code[0] != NO_ERROR) {
+    return;
+  }
+
+  const int64_t slot = blockIdx.x * blockDim.x + threadIdx.x;
+  if (slot >= outlength) {
+    return;
+  }
+  toptr[slot] = static_cast<T>(atomic_toptr[slot]);
+}