
Commit 7ff6bf6

Env-based reduce API
1 parent 01a0b2a commit 7ff6bf6

9 files changed, +1267 -2 lines changed

cub/cub/detail/launcher/cuda_runtime.cuh

Lines changed: 4 additions & 0 deletions
@@ -69,6 +69,10 @@ struct TripleChevronFactory
  }
};

+#ifndef CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER
+#  define CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER detail::TripleChevronFactory
+#endif
+
} // namespace detail

CUB_NAMESPACE_END
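
The #ifndef guard above turns the kernel launcher into an override point: a translation unit that defines CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER before including CUB headers keeps its own factory, and the TripleChevron default only fills the gap otherwise. A minimal sketch of that pattern, assuming a hypothetical my_launcher_factory type that mirrors the detail::TripleChevronFactory interface (the name is not part of this commit):

    // Hypothetical override; my_launcher_factory is assumed to expose the same
    // interface as cub::detail::TripleChevronFactory and to be visible where the
    // macro expands.
    #define CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER my_launcher_factory
    #include <cub/device/device_reduce.cuh> // the #ifndef above leaves the override in place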

cub/cub/device/device_reduce.cuh

Lines changed: 274 additions & 0 deletions
@@ -45,11 +45,18 @@
#include <cub/detail/choose_offset.cuh>
#include <cub/device/dispatch/dispatch_reduce.cuh>
#include <cub/device/dispatch/dispatch_reduce_by_key.cuh>
+#include <cub/device/dispatch/dispatch_reduce_deterministic.cuh>
#include <cub/device/dispatch/dispatch_streaming_reduce.cuh>
#include <cub/util_type.cuh>

#include <thrust/iterator/tabulate_output_iterator.h>

+#include <cuda/__execution/determinism.h>
+#include <cuda/__execution/require.h>
+#include <cuda/__execution/tune.h>
+#include <cuda/__memory_resource/get_memory_resource.h>
+#include <cuda/__stream/get_stream.h>
+#include <cuda/std/__execution/env.h>
#include <cuda/std/limits>

CUB_NAMESPACE_BEGIN
@@ -58,6 +65,31 @@ namespace detail
{
namespace reduce
{
+
+struct get_reduce_tuning_query_t
+{};
+
+template <class Derived>
+struct tuning
+{
+  [[nodiscard]] _CCCL_TRIVIAL_API constexpr auto query(const get_reduce_tuning_query_t&) const noexcept -> Derived
+  {
+    return static_cast<const Derived&>(*this);
+  }
+};
+
+struct default_tuning : tuning<default_tuning>
+{
+  template <class AccumT, class Offset, class OpT>
+  using fn = policy_hub<AccumT, Offset, OpT>;
+};
+
+struct default_rfa_tuning : tuning<default_tuning>
+{
+  template <class AccumT, class Offset, class OpT>
+  using fn = detail::rfa::policy_hub<AccumT, Offset, OpT>;
+};
+
template <typename ExtremumOutIteratorT, typename IndexOutIteratorT>
struct unzip_and_write_arg_extremum_op
{
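
The structs above define the tuning protocol used by the env-based API: a tuning object answers get_reduce_tuning_query_t with itself, and its fn alias names the policy hub that the dispatch layer instantiates. A hedged sketch of a custom tuning that satisfies this protocol; everything under cub::detail is internal and subject to change, my_reduce_tuning is hypothetical, and it simply reuses the default policy_hub to stay self-contained:

    #include <cub/device/device_reduce.cuh>

    // Hypothetical custom tuning. Inheriting tuning<my_reduce_tuning> provides the
    // query(get_reduce_tuning_query_t) member; `fn` selects the policy hub that the
    // dispatch layer instantiates for a given accumulator/offset/operator triple.
    struct my_reduce_tuning : cub::detail::reduce::tuning<my_reduce_tuning>
    {
      template <class AccumT, class Offset, class OpT>
      using fn = cub::detail::reduce::policy_hub<AccumT, Offset, OpT>;
    };

How such a tuning reaches the algorithm is the __get_tuning_t query on the execution environment (see the tuning_t alias in the new Reduce overload below); this commit does not show a public spelling for attaching it, so that part is omitted here.
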
@@ -72,6 +104,41 @@ struct unzip_and_write_arg_extremum_op
  }
};
} // namespace reduce
+
+// TODO(gevtushenko): move cudax `device_memory_resource` to `cuda::__device_memory_resource` and use it here
+struct device_memory_resource
+{
+  void* allocate(size_t bytes, size_t /* alignment */)
+  {
+    void* ptr{nullptr};
+    _CCCL_TRY_CUDA_API(::cudaMalloc, "allocate failed to allocate with cudaMalloc", &ptr, bytes);
+    return ptr;
+  }
+
+  void deallocate(void* ptr, size_t /* bytes */)
+  {
+    _CCCL_ASSERT_CUDA_API(::cudaFree, "deallocate failed", ptr);
+  }
+
+  void* allocate_async(size_t bytes, size_t /* alignment */, ::cuda::stream_ref stream)
+  {
+    return allocate_async(bytes, stream);
+  }
+
+  void* allocate_async(size_t bytes, ::cuda::stream_ref stream)
+  {
+    void* ptr{nullptr};
+    _CCCL_TRY_CUDA_API(
+      ::cudaMallocAsync, "allocate_async failed to allocate with cudaMallocAsync", &ptr, bytes, stream.get());
+    return ptr;
+  }
+
+  void deallocate_async(void* ptr, size_t /* bytes */, const ::cuda::stream_ref stream)
+  {
+    _CCCL_ASSERT_CUDA_API(::cudaFreeAsync, "deallocate_async failed", ptr, stream.get());
+  }
+};
+
} // namespace detail

//! @rst
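
device_memory_resource is the fallback the env-based Reduce overload below uses when the environment does not carry a memory resource: temporary storage becomes a stream-ordered cudaMallocAsync/cudaFreeAsync pair instead of the caller-managed d_temp_storage buffer of the classic API. A small illustration of that contract (this is an internal type, shown only for orientation; the size and stream here are placeholders):

    #include <cub/device/device_reduce.cuh>

    #include <cuda/stream_ref>

    int main()
    {
      cudaStream_t raw{};
      cudaStreamCreate(&raw);

      cub::detail::device_memory_resource mr{};
      void* tmp = mr.allocate_async(1024, cuda::stream_ref{raw}); // cudaMallocAsync under the hood
      // ... temporary storage would be consumed by kernels enqueued on `raw` ...
      mr.deallocate_async(tmp, 1024, cuda::stream_ref{raw});      // cudaFreeAsync under the hood

      cudaStreamDestroy(raw);
    }
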
@@ -102,6 +169,85 @@ struct unzip_and_write_arg_extremum_op
//! @endrst
struct DeviceReduce
{
+private:
+  // TODO(gevtushenko): dispatch to atomic reduce once merged
+  template <typename TuningEnvT,
+            typename InputIteratorT,
+            typename OutputIteratorT,
+            typename ReductionOpT,
+            typename T,
+            typename NumItemsT,
+            ::cuda::execution::determinism::__determinism_t Determinism>
+  CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    InputIteratorT d_in,
+    OutputIteratorT d_out,
+    NumItemsT num_items,
+    ReductionOpT reduction_op,
+    T init,
+    ::cuda::execution::determinism::__determinism_holder_t<Determinism>,
+    cudaStream_t stream)
+  {
+    using offset_t    = detail::choose_offset_t<NumItemsT>;
+    using accum_t     = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;
+    using transform_t = ::cuda::std::identity;
+    using reduce_tuning_t = ::cuda::std::execution::
+      __query_result_or_t<TuningEnvT, detail::reduce::get_reduce_tuning_query_t, detail::reduce::default_tuning>;
+    using policy_t = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
+    using dispatch_t =
+      DispatchReduce<InputIteratorT, OutputIteratorT, offset_t, ReductionOpT, T, accum_t, transform_t, policy_t>;
+
+    return dispatch_t::Dispatch(
+      d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<offset_t>(num_items), reduction_op, init, stream);
+  }
+
+  template <typename TuningEnvT,
+            typename InputIteratorT,
+            typename OutputIteratorT,
+            typename ReductionOpT,
+            typename T,
+            typename NumItemsT>
+  CUB_RUNTIME_FUNCTION static cudaError_t reduce_impl(
+    void* d_temp_storage,
+    size_t& temp_storage_bytes,
+    InputIteratorT d_in,
+    OutputIteratorT d_out,
+    NumItemsT num_items,
+    ReductionOpT,
+    T init,
+    ::cuda::execution::determinism::gpu_to_gpu_t,
+    cudaStream_t stream)
+  {
+    using offset_t = detail::choose_offset_t<NumItemsT>;
+    using accum_t  = ::cuda::std::__accumulator_t<ReductionOpT, detail::it_value_t<InputIteratorT>, T>;
+
+    // RFA is only supported for float and double accumulators
+    constexpr bool is_float_or_double = _CUDA_VSTD::is_same_v<accum_t, float> || _CUDA_VSTD::is_same_v<accum_t, double>;
+    constexpr bool is_sum             = _CUDA_VSTD::is_same_v<ReductionOpT, ::cuda::std::plus<>>;
+    constexpr bool is_supported       = is_float_or_double && is_sum;
+
+    static_assert(is_supported, "gpu-to-gpu deterministic reduction supports only float and double sum.");
+
+    if constexpr (is_supported)
+    {
+      using transform_t     = ::cuda::std::identity;
+      using reduce_tuning_t = ::cuda::std::execution::
+        __query_result_or_t<TuningEnvT, detail::reduce::get_reduce_tuning_query_t, detail::reduce::default_rfa_tuning>;
+      using policy_t   = typename reduce_tuning_t::template fn<accum_t, offset_t, ReductionOpT>;
+      using dispatch_t =
+        detail::DispatchReduceDeterministic<InputIteratorT, OutputIteratorT, offset_t, T, accum_t, transform_t, policy_t>;
+
+      return dispatch_t::Dispatch(
+        d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<offset_t>(num_items), init, stream);
+    }
+    else
+    {
+      return cudaErrorNotSupported;
+    }
+  }
+
+public:
  //! @rst
  //! Computes a device-wide reduction using the specified binary ``reduction_op`` functor and initial value ``init``.
  //!
@@ -225,6 +371,134 @@ struct DeviceReduce
      d_temp_storage, temp_storage_bytes, d_in, d_out, static_cast<OffsetT>(num_items), reduction_op, init, stream);
  }

+  //! @rst
+  //! Computes a device-wide reduction using the specified binary ``reduction_op`` functor and initial value ``init``.
+  //!
+  //! - Does not support binary reduction operators that are non-commutative.
+  //! - By default, provides "run-to-run" determinism for pseudo-associative reduction
+  //!   (e.g., addition of floating point types) on the same GPU device.
+  //!   However, results for pseudo-associative reduction may be inconsistent
+  //!   from one device to another device of a different compute-capability
+  //!   because CUB can employ different tile-sizing for different architectures.
+  //!   To request "gpu-to-gpu" determinism, pass `cuda::execution::require(cuda::execution::determinism::gpu_to_gpu)`
+  //!   as the `env` parameter.
+  //! - The range ``[d_in, d_in + num_items)`` shall not overlap ``d_out``.
+  //!
+  //! Snippet
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! The code snippet below illustrates a user-defined min-reduction of a
+  //! device vector of ``int`` data elements.
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_reduce_env_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin reduce-env-determinism
+  //!     :end-before: example-end reduce-env-determinism
+  //!
+  //! @endrst
+  //!
+  //! @tparam InputIteratorT
+  //!   **[inferred]** Random-access input iterator type for reading input items @iterator
+  //!
+  //! @tparam OutputIteratorT
+  //!   **[inferred]** Output iterator type for recording the reduced aggregate @iterator
+  //!
+  //! @tparam ReductionOpT
+  //!   **[inferred]** Binary reduction functor type having member `T operator()(const T &a, const T &b)`
+  //!
+  //! @tparam T
+  //!   **[inferred]** Data element type that is convertible to the `value` type of `InputIteratorT`
+  //!
+  //! @tparam NumItemsT
+  //!   **[inferred]** Type of num_items
+  //!
+  //! @tparam EnvT
+  //!   **[inferred]** Execution environment type. Default is `cuda::std::execution::env<>`.
+  //!
+  //! @param[in] d_in
+  //!   Pointer to the input sequence of data items
+  //!
+  //! @param[out] d_out
+  //!   Pointer to the output aggregate
+  //!
+  //! @param[in] num_items
+  //!   Total number of input items (i.e., length of `d_in`)
+  //!
+  //! @param[in] reduction_op
+  //!   Binary reduction functor
+  //!
+  //! @param[in] init
+  //!   Initial value of the reduction
+  //!
+  //! @param[in] env
+  //!   @rst
+  //!   **[optional]** Execution environment. Default is `cuda::std::execution::env{}`.
+  //!   @endrst
+  template <typename InputIteratorT,
+            typename OutputIteratorT,
+            typename ReductionOpT,
+            typename T,
+            typename NumItemsT,
+            typename EnvT = ::cuda::std::execution::env<>>
+  CUB_RUNTIME_FUNCTION static cudaError_t Reduce(
+    InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, ReductionOpT reduction_op, T init, EnvT env = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceReduce::Reduce");
+
+    static_assert(!_CUDA_STD_EXEC::__queryable_with<EnvT, _CUDA_EXEC::determinism::__get_determinism_t>,
+                  "Determinism should be used inside requires to have an effect.");
+    using requirements_t =
+      _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_requirements_t, _CUDA_STD_EXEC::env<>>;
+    using determinism_t =
+      _CUDA_STD_EXEC::__query_result_or_t<requirements_t, //
+                                          _CUDA_EXEC::determinism::__get_determinism_t,
+                                          _CUDA_EXEC::determinism::run_to_run_t>;
+
+    // Query relevant properties from the environment
+    auto stream = _CUDA_STD_EXEC::__query_or(env, ::cuda::get_stream, ::cuda::stream_ref{});
+    auto mr     = _CUDA_STD_EXEC::__query_or(env, ::cuda::mr::__get_memory_resource, detail::device_memory_resource{});
+
+    void* d_temp_storage      = nullptr;
+    size_t temp_storage_bytes = 0;
+
+    using tuning_t = _CUDA_STD_EXEC::__query_result_or_t<EnvT, _CUDA_EXEC::__get_tuning_t, _CUDA_STD_EXEC::env<>>;
+
+    // Query the required temporary storage size
+    cudaError_t error = reduce_impl<tuning_t>(
+      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, determinism_t{}, stream.get());
+    if (error != cudaSuccess)
+    {
+      return error;
+    }
+
+    NV_IF_ELSE_TARGET(
+      NV_IS_HOST,
+      (
+        try { d_temp_storage = mr.allocate_async(temp_storage_bytes, stream); } catch (...) {
+          return cudaErrorMemoryAllocation;
+        }),
+      (d_temp_storage = mr.allocate_async(temp_storage_bytes, stream);));
+
+    // Run the algorithm
+    error = reduce_impl<tuning_t>(
+      d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, reduction_op, init, determinism_t{}, stream.get());
+    if (error != cudaSuccess)
+    {
+      return error;
+    }
+
+    NV_IF_ELSE_TARGET(
+      NV_IS_HOST,
+      (
+        try { mr.deallocate_async(d_temp_storage, temp_storage_bytes, stream); } catch (...) {
+          return cudaErrorMemoryAllocation;
+        }),
+      (mr.deallocate_async(d_temp_storage, temp_storage_bytes, stream);));
+
+    return cudaSuccess;
+  }
+
  //! @rst
  //! Computes a device-wide sum using the addition (``+``) operator.
  //!
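
Taken together, the overload added above drops the two-phase temp-storage protocol of the classic API: the implementation sizes the temporary storage, allocates it through the environment's memory resource (or the cudaMallocAsync-based fallback) on the environment's stream, runs the reduction, and frees the storage before returning. A usage sketch that follows the call pattern documented in this hunk; the literal snippet lives in cub/test/catch2_test_device_reduce_env_api.cu and is not reproduced here, so the buffer setup below is an assumption:

    #include <cub/device/device_reduce.cuh>

    #include <thrust/device_vector.h>

    #include <cuda/std/functional>

    int main()
    {
      thrust::device_vector<float> in(1 << 20, 1.0f);
      thrust::device_vector<float> out(1);

      // Default environment: run-to-run determinism, default stream, fallback
      // cudaMallocAsync-based temporary storage. No temp-storage query phase.
      cub::DeviceReduce::Reduce(
        thrust::raw_pointer_cast(in.data()), thrust::raw_pointer_cast(out.data()), in.size(), cuda::std::plus<>{}, 0.0f);

      // Request gpu-to-gpu determinism through the environment, as documented above.
      // This path accepts only float/double sums (see the gpu_to_gpu reduce_impl overload).
      auto env = cuda::execution::require(cuda::execution::determinism::gpu_to_gpu);
      cub::DeviceReduce::Reduce(
        thrust::raw_pointer_cast(in.data()), thrust::raw_pointer_cast(out.data()), in.size(), cuda::std::plus<>{}, 0.0f, env);
    }

A stream or memory resource can be supplied through the same environment object (the get_stream and get_memory_resource queries in the implementation); when absent, the defaults queried above apply.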
