diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 68768fe96..1ae09b3c8 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -46,63 +46,7 @@ endfunction(ConfigureBench) ### benchmark sources ############################################################################# ################################################################################################### -################################################################################################### -# - static_set benchmarks ------------------------------------------------------------------------- -ConfigureBench(STATIC_SET_BENCH - static_set/contains_bench.cu - static_set/find_bench.cu - static_set/insert_bench.cu - static_set/retrieve_bench.cu - static_set/retrieve_all_bench.cu - static_set/size_bench.cu - static_set/rehash_bench.cu) - -################################################################################################### -# - static_map benchmarks ------------------------------------------------------------------------- -ConfigureBench(STATIC_MAP_BENCH - static_map/insert_bench.cu - static_map/find_bench.cu - static_map/contains_bench.cu - static_map/erase_bench.cu - static_map/insert_or_apply_bench.cu) - -################################################################################################### -# - static_multiset benchmarks -------------------------------------------------------------------- -ConfigureBench(STATIC_MULTISET_BENCH - static_multiset/contains_bench.cu - static_multiset/retrieve_bench.cu - static_multiset/count_bench.cu - static_multiset/find_bench.cu - static_multiset/insert_bench.cu) - ################################################################################################### # - static_multimap benchmarks -------------------------------------------------------------------- ConfigureBench(STATIC_MULTIMAP_BENCH - static_multimap/insert_bench.cu - static_multimap/retrieve_bench.cu - static_multimap/query_bench.cu static_multimap/count_bench.cu) - -################################################################################################### -# - dynamic_map benchmarks ------------------------------------------------------------------------ -ConfigureBench(DYNAMIC_MAP_BENCH - dynamic_map/insert_bench.cu - dynamic_map/find_bench.cu - dynamic_map/contains_bench.cu - dynamic_map/erase_bench.cu) - -################################################################################################### -# - hash function benchmarks ---------------------------------------------------------------------- -ConfigureBench(HASH_FUNCTION_BENCH - hash_function/hash_function_bench.cu) - -################################################################################################### -# - hyperloglog benchmarks ----------------------------------------------------------- -ConfigureBench(HYPERLOGLOG_BENCH - hyperloglog/hyperloglog_bench.cu) - -################################################################################################### -# - bloom_filter benchmarks ----------------------------------------------------------------------- -ConfigureBench(BLOOM_FILTER_BENCH - bloom_filter/add_bench.cu - bloom_filter/contains_bench.cu) diff --git a/include/cuco/detail/extent/extent.inl b/include/cuco/detail/extent/extent.inl index bb5145c70..5359e7adf 100644 --- a/include/cuco/detail/extent/extent.inl +++ b/include/cuco/detail/extent/extent.inl @@ -99,7 +99,7 @@ template return bucket_extent{static_cast( *cuco::detail::lower_bound( cuco::detail::primes.begin(), cuco::detail::primes.end(), static_cast(size)) * - CGSize)}; + CGSize * BucketSize)}; } if constexpr (N != dynamic_extent) { return bucket_extent const& group, Value const& value) noexcept { - auto const val = this->heterogeneous_value(value); - auto const key = this->extract_key(val); - auto probing_iter = probing_scheme_(group, key, storage_ref_.bucket_extent()); - auto const init_idx = *probing_iter; + auto const val = this->heterogeneous_value(value); + auto const key = this->extract_key(val); + auto probing_iter = probing_scheme_(group, key, storage_ref_.bucket_extent()); + auto* data = reinterpret_cast(storage_ref_.data()); while (true) { - auto const bucket_slots = storage_ref_[*probing_iter]; + value_type bucket_slots[2]; + auto const tmp = + *reinterpret_cast(data + *probing_iter * sizeof(value_type) * 2); + memcpy(&bucket_slots[0], &tmp, 2 * sizeof(value_type)); - auto const [state, intra_bucket_index] = [&]() { - for (auto i = 0; i < bucket_size; ++i) { - switch ( - this->predicate_.operator()(key, this->extract_key(bucket_slots[i]))) { - case detail::equal_result::AVAILABLE: - return bucket_probing_results{detail::equal_result::AVAILABLE, i}; - case detail::equal_result::EQUAL: { - if constexpr (allows_duplicates) { - continue; - } else { - return bucket_probing_results{detail::equal_result::EQUAL, i}; - } - } - default: continue; - } - } - // returns dummy index `-1` for UNEQUAL - return bucket_probing_results{detail::equal_result::UNEQUAL, -1}; - }(); + auto const first_slot_is_empty = + detail::bitwise_compare(bucket_slots[0].first, this->empty_key_sentinel()); + auto const second_slot_is_empty = + detail::bitwise_compare(bucket_slots[1].first, this->empty_key_sentinel()); - if constexpr (not allows_duplicates) { - // If the key is already in the container, return false - if (group.any(state == detail::equal_result::EQUAL)) { return false; } - } + auto const bucket_contains_empty = group.ballot(first_slot_is_empty or second_slot_is_empty); - auto const group_contains_available = group.ballot(state == detail::equal_result::AVAILABLE); - if (group_contains_available) { - auto const src_lane = __ffs(group_contains_available) - 1; + if (bucket_contains_empty) { + auto const src_lane = __ffs(bucket_contains_empty) - 1; auto status = insert_result::CONTINUE; if (group.thread_rank() == src_lane) { - if constexpr (SupportsErase) { - status = - attempt_insert((storage_ref_.data() + *probing_iter)->data() + intra_bucket_index, - bucket_slots[intra_bucket_index], - val); - } else { - status = - attempt_insert((storage_ref_.data() + *probing_iter)->data() + intra_bucket_index, - this->empty_slot_sentinel(), - val); - } + status = attempt_insert(bucket_slots, this->empty_slot_sentinel(), val); } - switch (group.shfl(status, src_lane)) { - case insert_result::SUCCESS: return true; - case insert_result::DUPLICATE: { - if constexpr (allows_duplicates) { - [[fallthrough]]; - } else { - return false; - } - } - default: continue; - } + if (group.any(status == insert_result::SUCCESS)) { return true; } } else { ++probing_iter; - if (*probing_iter == init_idx) { return false; } } } } @@ -990,27 +953,35 @@ class open_addressing_ref_impl { [[nodiscard]] __device__ size_type count( cooperative_groups::thread_block_tile const& group, ProbeKey const& key) const noexcept { - auto probing_iter = probing_scheme_(group, key, storage_ref_.bucket_extent()); - auto const init_idx = *probing_iter; - size_type count = 0; + auto probing_iter = probing_scheme_(group, key, storage_ref_.bucket_extent()); + size_type count = 0; - while (true) { - auto const bucket_slots = storage_ref_[*probing_iter]; + auto* data = reinterpret_cast(storage_ref_.data()); - auto const state = [&]() { - auto res = detail::equal_result::UNEQUAL; - for (auto& slot : bucket_slots) { - res = this->predicate_.operator()(key, this->extract_key(slot)); - if (res == detail::equal_result::EMPTY) { return res; } - count += static_cast(res); - } - return res; - }(); + if constexpr (has_payload) { + while (true) { + value_type bucket_slots[2]; + auto const tmp = + *reinterpret_cast(data + *probing_iter * sizeof(value_type) * 2); + memcpy(&bucket_slots[0], &tmp, 2 * sizeof(value_type)); - if (group.any(state == detail::equal_result::EMPTY)) { return count; } - ++probing_iter; - if (*probing_iter == init_idx) { return count; } + auto const first_slot_is_empty = + detail::bitwise_compare(bucket_slots[0].first, this->empty_key_sentinel()); + auto const second_slot_is_empty = + detail::bitwise_compare(bucket_slots[1].first, this->empty_key_sentinel()); + auto const first_equals = + (not first_slot_is_empty and predicate_.equal_(key, bucket_slots[0].first)); + auto const second_equals = + (not second_slot_is_empty and predicate_.equal_(key, bucket_slots[1].first)); + + count += (first_equals + second_equals); + + if (group.any(first_slot_is_empty or second_slot_is_empty)) { return count; } + + ++probing_iter; + } } + return count; } /** diff --git a/include/cuco/detail/storage/kernels.cuh b/include/cuco/detail/storage/kernels.cuh index b2f425071..b18fcb08c 100644 --- a/include/cuco/detail/storage/kernels.cuh +++ b/include/cuco/detail/storage/kernels.cuh @@ -51,5 +51,18 @@ CUCO_KERNEL void initialize(BucketT* buckets, } } +template +CUCO_KERNEL void initialize(BucketT* buckets, cuco::detail::index_type n, BucketT value) +{ + auto const loop_stride = cuco::detail::grid_stride(); + auto idx = cuco::detail::global_thread_id(); + + while (idx < n) { + auto& slot = *(buckets + idx); + slot = value; + idx += loop_stride; + } +} + } // namespace detail } // namespace cuco diff --git a/include/cuco/detail/storage/storage.cuh b/include/cuco/detail/storage/storage.cuh index 33c866390..f0f1a54b3 100644 --- a/include/cuco/detail/storage/storage.cuh +++ b/include/cuco/detail/storage/storage.cuh @@ -17,6 +17,7 @@ #pragma once #include +#include namespace cuco { namespace detail { @@ -60,5 +61,38 @@ class storage : StorageImpl::template impl { } }; +template +class slot_storage : StorageImpl::template impl { + public: + /// Storage implementation type + using impl_type = typename StorageImpl::template impl; + using ref_type = typename impl_type::ref_type; ///< Storage ref type + using value_type = typename impl_type::value_type; ///< Storage value type + using allocator_type = typename impl_type::allocator_type; ///< Storage value type + + /// Number of elements per bucket + static constexpr int bucket_size = impl_type::bucket_size; + + using impl_type::allocator; + using impl_type::bucket_extent; + using impl_type::capacity; + using impl_type::data; + using impl_type::initialize; + using impl_type::initialize_async; + using impl_type::num_buckets; + using impl_type::ref; + + /** + * @brief Constructs storage. + * + * @param size Number of slots to (de)allocate + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr slot_storage(Extent size, Allocator const& allocator) + : impl_type{size, allocator} + { + } +}; + } // namespace detail } // namespace cuco diff --git a/include/cuco/flat_storage.cuh b/include/cuco/flat_storage.cuh new file mode 100644 index 000000000..c0ac959a3 --- /dev/null +++ b/include/cuco/flat_storage.cuh @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2022-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +#include + +#include +#include +#include +#include + +namespace cuco { +/** + * @brief Non-owning array of buckets storage reference type. + * + * @tparam T Storage element type + * @tparam BucketSize Number of slots in each bucket + * @tparam Extent Type of extent denoting storage capacity + */ +template > +class flat_storage_ref : public detail::storage_base { + public: + /// Array of buckets base class type + using base_type = detail::storage_base; + + static int32_t constexpr bucket_size = BucketSize; + + using extent_type = typename base_type::extent_type; ///< Storage extent type + using size_type = typename base_type::size_type; ///< Storage size type + using value_type = T; ///< Slot type + using bucket_type = value_type; + + using base_type::capacity; + using base_type::extent; + + /** + * @brief Constructor of AoS storage ref. + * + * @param size Number of buckets + * @param buckets Pointer to the buckets array + */ + __host__ __device__ explicit constexpr flat_storage_ref(Extent size, value_type* slots) noexcept + : base_type{size}, slots_{slots} + { + } + + using iterator = value_type*; + using const_iterator = iterator const; ///< Const forward iterator type + + /** + * @brief Returns an iterator to one past the last slot. + * + * This is provided for convenience for those familiar with checking + * an iterator returned from `find()` against the `end()` iterator. + * + * @return An iterator to one past the last slot + */ + [[nodiscard]] __device__ constexpr iterator end() noexcept { this->data() + this->capacity(); } + + /** + * @brief Returns a const_iterator to one past the last slot. + * + * This is provided for convenience for those familiar with checking + * an iterator returned from `find()` against the `end()` iterator. + * + * @return A const_iterator to one past the last slot + */ + [[nodiscard]] __device__ constexpr const_iterator end() const noexcept + { + this->data() + this->capacity(); + } + + /** + * @brief Gets buckets array. + * + * @return Pointer to the first bucket + */ + [[nodiscard]] __device__ constexpr value_type* data() noexcept { return slots_; } + + /** + * @brief Gets bucket array. + * + * @return Pointer to the first bucket + */ + [[nodiscard]] __device__ constexpr value_type* data() const noexcept { return slots_; } + + /** + * @brief Returns an array of slots (or a bucket) for a given index. + * + * @param index Index of the bucket + * @return An array of slots + */ + [[nodiscard]] __device__ constexpr value_type operator[](size_type index) const noexcept + { + *(this->data() + index); + } + + [[nodiscard]] __host__ __device__ constexpr size_type num_buckets() const noexcept + { + return this->capacity() / bucket_size; + } + + [[nodiscard]] __host__ __device__ constexpr auto bucket_extent() const noexcept + { + return cuco::extent{this->capacity() / bucket_size}; + } + + private: + value_type* slots_; ///< Pointer to the buckets array +}; + +/** + * @brief Array of buckets open addressing storage class. + * + * @tparam T Slot type + * @tparam BucketSize Number of slots in each bucket + * @tparam Extent Type of extent denoting number of buckets + * @tparam Allocator Type of allocator used for device storage (de)allocation + */ +template , + typename Allocator = cuco::cuda_allocator> +class flat_storage : public detail::storage_base { + public: + /// Array of buckets base class type + using base_type = detail::storage_base; + + static int32_t constexpr bucket_size = BucketSize; + + using extent_type = typename base_type::extent_type; ///< Storage extent type + using size_type = typename base_type::size_type; ///< Storage size type + using value_type = T; ///< Slot type + using bucket_type = value_type; + + using base_type::capacity; + using base_type::extent; + + /// Type of the allocator to (de)allocate buckets + using allocator_type = + typename std::allocator_traits::template rebind_alloc; + using slot_deleter_type = + detail::custom_deleter; ///< Type of bucket deleter + using ref_type = flat_storage_ref; ///< Storage ref type + + /** + * @brief Constructor of bucket storage. + * + * @note The input `size` should be exclusively determined by the return value of + * `make_flat_extent` since it depends on the requested low-bound value, the probing scheme, and + * the storage. + * + * @param size Number of buckets to (de)allocate + * @param allocator Allocator used for (de)allocating device storage + */ + explicit constexpr flat_storage(Extent size, Allocator const& allocator = {}) + : base_type{size}, + allocator_{allocator}, + slot_deleter_{capacity(), allocator_}, + slots_{allocator_.allocate(capacity()), slot_deleter_} + { + } + + flat_storage(flat_storage&&) = default; ///< Move constructor + /** + * @brief Replaces the contents of the storage with another storage. + * + * @return Reference of the current storage object + */ + flat_storage& operator=(flat_storage&&) = default; + ~flat_storage() = default; ///< Destructor + + flat_storage(flat_storage const&) = delete; + flat_storage& operator=(flat_storage const&) = delete; + + /** + * @brief Gets buckets array. + * + * @return Pointer to the first bucket + */ + [[nodiscard]] constexpr value_type* data() const noexcept { return slots_.get(); } + + /** + * @brief Gets the storage allocator. + * + * @return The storage allocator + */ + [[nodiscard]] constexpr allocator_type allocator() const noexcept { return allocator_; } + + /** + * @brief Gets bucket storage reference. + * + * @return Reference of bucket storage + */ + [[nodiscard]] constexpr ref_type ref() const noexcept + { + return ref_type{this->extent(), this->data()}; + } + + /** + * @brief Initializes each slot in the bucket storage to contain `key`. + * + * @param key Key to which all keys in `slots` are initialized + * @param stream Stream used for executing the kernel + */ + void initialize(value_type key, cuda::stream_ref stream = {}) + { + this->initialize_async(key, stream); + stream.wait(); + } + + /** + * @brief Asynchronously initializes each slot in the bucket storage to contain `key`. + * + * @param key Key to which all keys in `slots` are initialized + * @param stream Stream used for executing the kernel + */ + void initialize_async(value_type key, cuda::stream_ref stream = {}) noexcept + { + if (this->capacity() == 0) { return; } + + auto constexpr cg_size = 1; + auto constexpr stride = 4; + auto const grid_size = cuco::detail::grid_size(this->capacity(), cg_size, stride); + + detail::initialize<<>>( + this->data(), this->capacity(), key); + } + + [[nodiscard]] constexpr size_type num_buckets() const noexcept + { + return this->capacity() / bucket_size; + } + + [[nodiscard]] constexpr auto bucket_extent() const noexcept + { + return cuco::extent{this->capacity() / bucket_size}; + } + + private: + allocator_type allocator_; ///< Allocator used to (de)allocate buckets + slot_deleter_type slot_deleter_; ///< Custom buckets deleter + /// Pointer to the bucket storage + std::unique_ptr slots_; +}; +} // namespace cuco diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh index 9cd5de812..e24c25c08 100644 --- a/include/cuco/static_multimap.cuh +++ b/include/cuco/static_multimap.cuh @@ -96,7 +96,7 @@ template >, class Allocator = cuco::cuda_allocator>, - class Storage = cuco::storage<2>> + class Storage = cuco::slot_storage<2>> class static_multimap { static_assert(sizeof(Key) <= 8, "Container does not support key types larger than 8 bytes."); diff --git a/include/cuco/storage.cuh b/include/cuco/storage.cuh index c9da5ca3c..bbd7191d5 100644 --- a/include/cuco/storage.cuh +++ b/include/cuco/storage.cuh @@ -45,4 +45,15 @@ class storage { using impl = bucket_storage; }; +template +class slot_storage { + public: + /// Number of slots per bucket storage + static constexpr int32_t bucket_size = BucketSize; + + /// Type of implementation details + template + using impl = flat_storage; +}; + } // namespace cuco