Global stream pool #13922

Merged: 146 commits, Sep 13, 2023

Changes from 101 commits

Commits (146)
5e9cf26
add DELTA_BINARY_PACKED decoder
etseidl Jun 26, 2023
9326321
start merging in changes from #13622
etseidl Jun 27, 2023
ee7511d
get reduce working on device
etseidl Jun 27, 2023
2cafe62
use functor for transform iterator
etseidl Jun 27, 2023
219ff0b
change filter functors to use kernel_mask
etseidl Jun 27, 2023
0e181a8
pull in changes from #13622
etseidl Jun 28, 2023
129d9ab
Merge branch 'rapidsai:branch-23.08' into feature/delta_binary
etseidl Jun 28, 2023
2b5a25d
Merge branch 'rapidsai:branch-23.08' into feature/delta_binary
etseidl Jun 28, 2023
b58e55c
use less shared memory for delta binary decoder
etseidl Jun 28, 2023
c63e503
Merge branch 'feature/delta_binary' of github.com:etseidl/cudf into f…
etseidl Jun 28, 2023
2c5e087
spelling
etseidl Jun 28, 2023
996893e
change encoding to unsupported type
etseidl Jun 29, 2023
f1f74dc
add python test of delta parser
etseidl Jun 29, 2023
639b8ab
test delta with nulls
etseidl Jun 29, 2023
02cd2be
Merge branch 'rapidsai:branch-23.08' into feature/delta_binary
etseidl Jun 29, 2023
9871d66
Merge branch 'rapidsai:branch-23.08' into feature/delta_binary
etseidl Jun 29, 2023
7debd29
Merge branch 'rapidsai:branch-23.08' into feature/delta_binary
etseidl Jun 29, 2023
c1bbb84
add comments to skip_values and decode_batch
etseidl Jun 30, 2023
8e66a08
revert east volatile changes
etseidl Jun 30, 2023
7b09c4f
update doc string
etseidl Jul 7, 2023
5e05872
Merge branch 'feature/delta_binary' of github.com:etseidl/cudf into f…
etseidl Jul 7, 2023
6576aa3
Merge branch 'rapidsai:branch-23.08' into feature/delta_binary
etseidl Jul 7, 2023
1d6efbc
Merge branch 'branch-23.08' into feature/delta_binary
etseidl Jul 12, 2023
334fb53
Merge branch 'branch-23.08' into feature/delta_binary
vuule Jul 13, 2023
d18dabf
Merge branch 'rapidsai:branch-23.08' into feature/delta_binary
etseidl Jul 17, 2023
05eb40f
Merge branch 'rapidsai:branch-23.08' into feature/delta_binary
etseidl Jul 18, 2023
a957ecc
Merge branch 'branch-23.08' into feature/delta_binary
etseidl Jul 19, 2023
9b636c7
fix for header location
etseidl Jul 19, 2023
6a53d43
Merge branch 'rapidsai:branch-23.08' into feature/delta_binary
etseidl Jul 19, 2023
ceb22ab
fix some short-circuit logic
etseidl Jul 19, 2023
c00be0b
Merge branch 'branch-23.10' into feature/delta_binary
etseidl Jul 25, 2023
e30aa11
Merge branch 'branch-23.10' into feature/delta_binary
etseidl Jul 26, 2023
7ef4be2
Merge branch 'branch-23.10' into feature/delta_binary
etseidl Jul 26, 2023
2410e47
Merge remote-tracking branch 'origin/branch-23.10' into feature/delta…
etseidl Jul 26, 2023
4aa783d
Merge branch 'branch-23.10' into feature/delta_binary
etseidl Jul 28, 2023
94afb8d
Merge branch 'branch-23.10' into feature/delta_binary
etseidl Jul 28, 2023
12b9bab
Merge branch 'rapidsai:branch-23.10' into feature/delta_binary
etseidl Jul 31, 2023
a6f7957
Merge branch 'rapidsai:branch-23.10' into feature/delta_binary
etseidl Aug 1, 2023
b7dbf47
Merge branch 'branch-23.10' into feature/delta_binary
etseidl Aug 2, 2023
ea49a23
rename function
etseidl Aug 2, 2023
3a9f186
clean up kernel_mask_for_page()
etseidl Aug 2, 2023
d7671d7
remove TODO
etseidl Aug 2, 2023
7084d89
add some documentation to kernel_mask_bits
etseidl Aug 2, 2023
f60bc1c
Merge branch 'branch-23.10' into feature/delta_binary
etseidl Aug 2, 2023
9ebeca9
use rand_dataframe() to produce test data
etseidl Aug 3, 2023
07e73ac
Merge branch 'rapidsai:branch-23.10' into feature/delta_binary
etseidl Aug 3, 2023
410ab67
Merge remote-tracking branch 'origin/branch-23.10' into feature/delta…
etseidl Aug 3, 2023
f779455
Merge branch 'branch-23.10' into feature/delta_binary
vuule Aug 3, 2023
761393f
formatting
etseidl Aug 4, 2023
ced688d
Merge branch 'rapidsai:branch-23.10' into feature/delta_binary
etseidl Aug 4, 2023
4846032
implement suggestion from review
etseidl Aug 4, 2023
d615625
more suggestions from review
etseidl Aug 4, 2023
c1ce34c
Merge remote-tracking branch 'origin/branch-23.10' into feature/delta…
etseidl Aug 4, 2023
835e866
restore old unrolled loop for testing
etseidl Aug 5, 2023
6353a4a
add note to revisit bit unpacker with delta_byte_array
etseidl Aug 7, 2023
9ffea01
Merge branch 'rapidsai:branch-23.10' into feature/delta_binary
etseidl Aug 7, 2023
0904418
Merge branch 'branch-23.10' into feature/delta_binary
etseidl Aug 7, 2023
616d3fc
Merge branch 'branch-23.10' into feature/delta_binary
etseidl Aug 8, 2023
e924a97
fix and test int8 and int16 handling
etseidl Aug 8, 2023
d0bf0cd
fix for single row files
etseidl Aug 8, 2023
d3b0c09
Merge branch 'rapidsai:branch-23.10' into feature/delta_binary
etseidl Aug 9, 2023
3c83b89
Merge branch 'branch-23.10' into feature/delta_binary
vuule Aug 9, 2023
5ac20a0
clean up some docstrings
etseidl Aug 9, 2023
9914e1f
Merge remote-tracking branch 'origin/branch-23.10' into feature/delta…
etseidl Aug 14, 2023
7d077f7
Merge branch 'rapidsai:branch-23.10' into feature/delta_binary
etseidl Aug 14, 2023
62e0493
Merge branch 'rapidsai:branch-23.10' into feature/delta_binary
etseidl Aug 15, 2023
5725c61
Merge branch 'branch-23.10' into feature/delta_binary
etseidl Aug 16, 2023
0ab8c15
Merge branch 'branch-23.10' into feature/delta_binary
etseidl Aug 17, 2023
b404de7
Merge branch 'rapidsai:branch-23.10' into feature/delta_binary
etseidl Aug 17, 2023
1d137ec
Apply suggestions from code review
etseidl Aug 17, 2023
6cd3e00
fix docstring
etseidl Aug 17, 2023
a774ac1
need to pass num_threads as template param to make constexpr
etseidl Aug 17, 2023
60b45b3
Merge branch 'rapidsai:branch-23.10' into feature/delta_binary
etseidl Aug 17, 2023
fb44e80
Merge branch 'rapidsai:branch-23.10' into feature/delta_binary
etseidl Aug 18, 2023
16532e4
Merge branch 'rapidsai:branch-23.10' into draft_stream_pool
etseidl Aug 18, 2023
d7112a5
refactor stream pool
etseidl Aug 18, 2023
637019e
Merge branch 'draft_stream_pool' of github.com:etseidl/cudf into draf…
etseidl Aug 18, 2023
6b48f80
Merge remote-tracking branch 'origin/branch-23.10' into draft_stream_…
etseidl Aug 23, 2023
6384f89
Merge branch 'branch-23.10' into draft_stream_pool
etseidl Aug 23, 2023
71ece73
checkpoint
etseidl Aug 24, 2023
db1d08d
remove comment
etseidl Aug 24, 2023
f233870
compiles now
etseidl Aug 24, 2023
384c7ee
Merge branch 'branch-23.10' into draft_stream_pool
etseidl Aug 24, 2023
301e596
update some comments
etseidl Aug 24, 2023
e3cfa89
implement Vukasin's idea for making the pool extensible
etseidl Aug 24, 2023
e74149f
more api jiggering
etseidl Aug 24, 2023
0d946e8
clean up some
etseidl Aug 24, 2023
0c1faed
stub in docstring
etseidl Aug 24, 2023
a31056c
move get_stream_pool_size into object
etseidl Aug 24, 2023
576230a
forgot some overrides
etseidl Aug 24, 2023
21b4443
pass host_span to fork/join_streams
etseidl Aug 24, 2023
3f3c5b5
add to TODO
etseidl Aug 24, 2023
40d53e7
Merge branch 'branch-23.10' into draft_stream_pool
etseidl Aug 25, 2023
10cb2b4
Merge branch 'rapidsai:branch-23.10' into draft_stream_pool
etseidl Aug 25, 2023
e0ba2a9
Merge branch 'rapidsai:branch-23.10' into draft_stream_pool
etseidl Aug 26, 2023
83d8710
Merge branch 'rapidsai:branch-23.10' into draft_stream_pool
etseidl Aug 26, 2023
b8fddcc
Merge branch 'rapidsai:branch-23.10' into draft_stream_pool
etseidl Aug 27, 2023
3059d94
use static events and disable timing per suggestion from review
etseidl Aug 29, 2023
1ad44db
Merge branch 'branch-23.10' into draft_stream_pool
etseidl Aug 29, 2023
bc62b92
add Vukasin's per-thread-default-event implementation
etseidl Aug 29, 2023
4c2c17b
remove static from fork/join events
etseidl Aug 29, 2023
06d2a75
replace event map with thread_local event struct
etseidl Aug 29, 2023
ddfc118
move stream pool to cudf::detail
etseidl Aug 29, 2023
4b00031
Merge branch 'branch-23.10' into draft_stream_pool
etseidl Aug 29, 2023
ac55b7e
start cleaning up docstrings
etseidl Aug 30, 2023
10643c4
Merge branch 'branch-23.10' into draft_stream_pool
etseidl Aug 30, 2023
4455230
use new stream pool in multibyte_split
etseidl Aug 30, 2023
274d4f3
Merge remote-tracking branch 'origin/branch-23.10' into draft_stream_…
etseidl Aug 30, 2023
13cd266
Merge branch 'rapidsai:branch-23.10' into draft_stream_pool
etseidl Aug 30, 2023
b5c55bb
forgot to get rid of rmm stream pool
etseidl Aug 30, 2023
f5ff4d0
add more documentation
etseidl Aug 31, 2023
1dc75d6
fix formatting
etseidl Aug 31, 2023
9774635
Merge branch 'branch-23.10' into draft_stream_pool
etseidl Sep 1, 2023
1ceaedf
remove mutex from get_streams()
etseidl Sep 1, 2023
8b316ba
change fork_streams as suggested in review
etseidl Sep 1, 2023
d538be9
hide the actual stream pool
etseidl Sep 2, 2023
f8af2b6
add TODO
etseidl Sep 2, 2023
05edba5
Merge branch 'branch-23.10' into draft_stream_pool
etseidl Sep 4, 2023
2346cb1
Merge branch 'branch-23.10' into draft_stream_pool
etseidl Sep 5, 2023
d7d30f9
rename fork_stream
etseidl Sep 5, 2023
64b4e42
add some documentation
etseidl Sep 5, 2023
9826366
can use std::for_each again
etseidl Sep 5, 2023
2a20067
fix typo
etseidl Sep 6, 2023
4ed8082
add alias stream_id_t
etseidl Sep 6, 2023
272d883
change stream count to size_t
etseidl Sep 6, 2023
8b4f15d
wrap cudaEventDestroy in a debug-only assert
etseidl Sep 6, 2023
457c6fb
modify event_for_thread() to take into account multiple devices
etseidl Sep 6, 2023
04dbba5
per-device stream pools
etseidl Sep 6, 2023
ded5900
use size_t for fork_stream too
etseidl Sep 6, 2023
1c5ae32
rename stream_id_t to stream_id_type
etseidl Sep 6, 2023
5ffc75a
add some more docstrings
etseidl Sep 6, 2023
0a7035f
implement suggestion from review
etseidl Sep 6, 2023
9fb0958
more docstring cleanup
etseidl Sep 6, 2023
6644b48
Merge branch 'rapidsai:branch-23.10' into draft_stream_pool
etseidl Sep 7, 2023
113e66f
Merge branch 'branch-23.10' into draft_stream_pool
etseidl Sep 7, 2023
2629a79
fork_streams is back on the menu
etseidl Sep 7, 2023
87626b6
add nodiscard to fork_streams
etseidl Sep 7, 2023
f81d13e
Merge branch 'branch-23.10' into draft_stream_pool
etseidl Sep 8, 2023
614f352
Merge branch 'rapidsai:branch-23.10' into draft_stream_pool
etseidl Sep 8, 2023
560c03c
Apply suggestions from code review
etseidl Sep 8, 2023
80ab1f5
Merge remote-tracking branch 'origin/branch-23.10' into draft_stream_…
etseidl Sep 8, 2023
46c98f4
Merge branch 'rapidsai:branch-23.10' into draft_stream_pool
etseidl Sep 9, 2023
57bd1b3
Merge branch 'branch-23.10' into draft_stream_pool
etseidl Sep 11, 2023
afd71f9
add consts per review comments
etseidl Sep 12, 2023
a449ab5
Merge branch 'branch-23.10' into draft_stream_pool
etseidl Sep 12, 2023
d107e32
Merge branch 'branch-23.10' into draft_stream_pool
harrism Sep 12, 2023
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
@@ -402,6 +402,7 @@ add_library(
src/io/parquet/reader_impl.cpp
src/io/parquet/reader_impl_helpers.cpp
src/io/parquet/reader_impl_preprocess.cu
src/io/parquet/stream_pool.cpp
src/io/parquet/writer_impl.cu
src/io/statistics/orc_column_statistics.cu
src/io/statistics/parquet_column_statistics.cu
44 changes: 14 additions & 30 deletions cpp/src/io/parquet/reader_impl.cpp
@@ -15,34 +15,18 @@
*/

#include "reader_impl.hpp"
#include "stream_pool.hpp"

#include <cudf/detail/stream_compaction.hpp>
#include <cudf/detail/transform.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <rmm/cuda_stream_pool.hpp>

#include <bitset>
#include <numeric>

namespace cudf::io::detail::parquet {

namespace {

int constexpr NUM_DECODERS = 3; // how many decode kernels are there to run
int constexpr APPROX_NUM_THREADS = 4; // guestimate from DaveB
int constexpr STREAM_POOL_SIZE = NUM_DECODERS * APPROX_NUM_THREADS;

auto& get_stream_pool()
{
// TODO: creating this on the heap because there were issues with trying to call the
// stream pool destructor during cuda shutdown that lead to a segmentation fault in
// nvbench. this allocation is being deliberately leaked to avoid the above, but still
// results in non-fatal warnings when running nvbench in cuda-gdb.
static auto pool = new rmm::cuda_stream_pool{STREAM_POOL_SIZE};
return *pool;
}

} // namespace

void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
{
auto& chunks = _file_itm_data.chunks;
@@ -178,34 +162,34 @@ void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows)
chunks.host_to_device_async(_stream);
chunk_nested_valids.host_to_device_async(_stream);
chunk_nested_data.host_to_device_async(_stream);
_stream.synchronize();

auto const level_type_size = _file_itm_data.level_type_size;
// get the number of streams we need from the pool and tell them to wait on the H2D copies
int nkernels = std::bitset<32>(kernel_mask).count();
auto streams = global_cuda_stream_pool().get_streams(nkernels);
fork_streams(streams, _stream);

// vector of launched streams
std::vector<rmm::cuda_stream_view> streams;
auto const level_type_size = _file_itm_data.level_type_size;

// launch string decoder
int s_idx = 0;
if (has_strings) {
streams.push_back(get_stream_pool().get_stream());
chunk_nested_str_data.host_to_device_async(streams.back());
gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, level_type_size, streams.back());
auto& stream = streams[s_idx++];
chunk_nested_str_data.host_to_device_async(stream);
gpu::DecodeStringPageData(pages, chunks, num_rows, skip_rows, level_type_size, stream);
}

// launch delta binary decoder
if ((kernel_mask & gpu::KERNEL_MASK_DELTA_BINARY) != 0) {
streams.push_back(get_stream_pool().get_stream());
gpu::DecodeDeltaBinary(pages, chunks, num_rows, skip_rows, level_type_size, streams.back());
gpu::DecodeDeltaBinary(pages, chunks, num_rows, skip_rows, level_type_size, streams[s_idx++]);
}

// launch the catch-all page decoder
if ((kernel_mask & gpu::KERNEL_MASK_GENERAL) != 0) {
streams.push_back(get_stream_pool().get_stream());
gpu::DecodePageData(pages, chunks, num_rows, skip_rows, level_type_size, streams.back());
gpu::DecodePageData(pages, chunks, num_rows, skip_rows, level_type_size, streams[s_idx++]);
}

// synchronize the streams
std::for_each(streams.begin(), streams.end(), [](auto& stream) { stream.synchronize(); });
join_streams(streams, _stream);

pages.device_to_host_async(_stream);
page_nesting.device_to_host_async(_stream);
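
For readers skimming the interleaved diff above, the net change to decode_page_data is: enqueue the H2D copies on _stream, hand each decode kernel its own pooled stream, and join those streams back into _stream before the D2H copies. A condensed sketch of the new flow (not a verbatim excerpt; kernel_mask, _stream, and the decode kernels are as in reader_impl.cpp):

// One pooled stream per decode kernel selected by kernel_mask.
int const nkernels = std::bitset<32>(kernel_mask).count();
auto streams = global_cuda_stream_pool().get_streams(nkernels);

// Pooled streams wait on the H2D copies already enqueued on _stream.
fork_streams(streams, _stream);

// ... DecodeStringPageData / DecodeDeltaBinary / DecodePageData are
// launched here, one per pooled stream ...

// _stream waits on every pooled stream before the D2H copies that follow.
join_streams(streams, _stream);
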
148 changes: 148 additions & 0 deletions cpp/src/io/parquet/stream_pool.cpp
@@ -0,0 +1,148 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <mutex>

#include "stream_pool.hpp"

#include <cudf/detail/utilities/logger.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>

namespace cudf::io::detail::parquet {

namespace {

// TODO: what is a good number here. what's the penalty for making it larger?
Inline review comment (Contributor):

Streams are pretty cheap to make, but I'm not sure what number we'd want here. It should not be lower than the expected number of threads in a multi-threaded application, since that would effectively lower the total number of threads.

Another idea for the backlog: make this number configurable (env var?).
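
A minimal sketch of that env-var idea, assuming a hypothetical LIBCUDF_STREAM_POOL_SIZE variable (this helper is not part of the PR):

// Hypothetical sketch: override the pool size via an environment variable,
// falling back to the compile-time default below. Requires <algorithm>,
// <cstdlib>, and <string>; note std::stoul throws on malformed input.
std::size_t stream_pool_size()
{
  std::size_t constexpr default_size = 32;  // same default as STREAM_POOL_SIZE
  if (char const* env = std::getenv("LIBCUDF_STREAM_POOL_SIZE")) {
    return std::max<std::size_t>(1, std::stoul(env));
  }
  return default_size;
}
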

// Dave Baranec rule of thumb was max_streams_needed * num_concurrent_threads,
// where num_concurrent_threads was estimated to be 4. so using 32 will allow
// for 8 streams per thread, which should be plenty (decoding will be up to 4
// kernels when delta_byte_array decoding is added). rmm::cuda_stream_pool
// defaults to 16.
std::size_t constexpr STREAM_POOL_SIZE = 32;

class rmm_cuda_stream_pool : public cuda_stream_pool {
rmm::cuda_stream_pool _pool;

public:
rmm_cuda_stream_pool() : _pool{STREAM_POOL_SIZE} {}
rmm::cuda_stream_view get_stream() override { return _pool.get_stream(); }
rmm::cuda_stream_view get_stream(std::size_t stream_id) override
{
return _pool.get_stream(stream_id);
}

std::vector<rmm::cuda_stream_view> get_streams(uint32_t count) override
{
static std::mutex stream_pool_mutex;

if (count > STREAM_POOL_SIZE) {
CUDF_LOG_WARN("get_streams called with count ({}) > pool size ({})", count, STREAM_POOL_SIZE);
}
auto streams = std::vector<rmm::cuda_stream_view>();
std::lock_guard<std::mutex> lock(stream_pool_mutex);
for (uint32_t i = 0; i < count; i++) {
streams.emplace_back(_pool.get_stream());
}
return streams;
}

std::size_t get_stream_pool_size() const override { return STREAM_POOL_SIZE; }
};

class debug_cuda_stream_pool : public cuda_stream_pool {
public:
rmm::cuda_stream_view get_stream() override { return cudf::get_default_stream(); }
rmm::cuda_stream_view get_stream(std::size_t stream_id) override
{
return cudf::get_default_stream();
}

std::vector<rmm::cuda_stream_view> get_streams(uint32_t count) override
{
return std::vector<rmm::cuda_stream_view>(count, cudf::get_default_stream());
}

std::size_t get_stream_pool_size() const override { return 1UL; }
};

cuda_stream_pool* create_global_cuda_stream_pool()
{
if (getenv("LIBCUDF_USE_DEBUG_STREAM_POOL")) return new debug_cuda_stream_pool();

return new rmm_cuda_stream_pool();
}

// implementation of per-thread-default-event.
class cuda_event_map {
public:
cuda_event_map() {}

cudaEvent_t find(std::thread::id thread_id)
{
std::lock_guard<std::mutex> lock(map_mutex_);
auto it = event_map_.find(thread_id);
if (it != event_map_.end()) {
return it->second;
} else {
cudaEvent_t event;
CUDF_CUDA_TRY(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
event_map_[thread_id] = event;
return event;
}
}

cuda_event_map(cuda_event_map const&) = delete;
void operator=(cuda_event_map const&) = delete;

private:
std::unordered_map<std::thread::id, cudaEvent_t> event_map_;
std::mutex map_mutex_;
};

cudaEvent_t event_for_thread()
{
static cuda_event_map instance;
return instance.find(std::this_thread::get_id());
}

} // anonymous namespace

cuda_stream_pool& global_cuda_stream_pool()
{
static cuda_stream_pool* pool = create_global_cuda_stream_pool();
return *pool;
}

void fork_streams(host_span<rmm::cuda_stream_view> streams, rmm::cuda_stream_view stream)
{
cudaEvent_t event = event_for_thread();
CUDF_CUDA_TRY(cudaEventRecord(event, stream));
for (auto& strm : streams) {
CUDF_CUDA_TRY(cudaStreamWaitEvent(strm, event, 0));
}
}

void join_streams(host_span<rmm::cuda_stream_view> streams, rmm::cuda_stream_view stream)
{
cudaEvent_t event = event_for_thread();
for (auto& strm : streams) {
CUDF_CUDA_TRY(cudaEventRecord(event, strm));
CUDF_CUDA_TRY(cudaStreamWaitEvent(stream, event, 0));
}
}

} // namespace cudf::io::detail::parquet
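
Note that the implementation is chosen once, inside a function-local static, so the LIBCUDF_USE_DEBUG_STREAM_POOL switch only takes effect if it is set before the first call to global_cuda_stream_pool(). A small illustrative sketch (the wrapper function is hypothetical):

// Illustrative only: select the debug pool before its first use. With the
// debug pool every returned view aliases the default stream, which
// serializes the decode kernels; useful when bisecting stream-ordering bugs.
#include <cstdlib>

void force_debug_stream_pool()
{
  setenv("LIBCUDF_USE_DEBUG_STREAM_POOL", "1", 1);  // POSIX
  auto& pool = cudf::io::detail::parquet::global_cuda_stream_pool();
  auto streams = pool.get_streams(3);  // all three alias the default stream
}
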
101 changes: 101 additions & 0 deletions cpp/src/io/parquet/stream_pool.hpp
@@ -0,0 +1,101 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/utilities/span.hpp>

#include <rmm/cuda_stream_pool.hpp>

namespace cudf::io::detail::parquet {

/**
* @brief A pool of CUDA stream objects
*
* Meant to provide efficient on-demand access to CUDA streams.
*
* TODO: better docs!
*/
class cuda_stream_pool {
public:
virtual ~cuda_stream_pool() = default;

/**
* @brief Get a `cuda_stream_view` of a stream in the pool.
*
* This function is thread safe with respect to other calls to the same function.
*
* @return Stream view.
*/
virtual rmm::cuda_stream_view get_stream() = 0;

/**
* @brief Get a `cuda_stream_view` of the stream associated with `stream_id`.
*
* Equivalent values of `stream_id` return a stream_view to the same underlying stream.
* This function is thread safe with respect to other calls to the same function.
*
* @param stream_id Unique identifier for the desired stream
* @return Requested stream view.
*/
virtual rmm::cuda_stream_view get_stream(std::size_t stream_id) = 0;

/**
* @brief Get a set of `cuda_stream_view` objects from the pool.
*
* This function is thread safe with respect to other calls to the same function.
*
* @param count The number of stream views to return.
* @return Vector containing `count` stream views.
*/
virtual std::vector<rmm::cuda_stream_view> get_streams(uint32_t count) = 0;

/**
* @brief Get the number of streams in the pool.
*
* This function is thread safe with respect to other calls to the same function.
*
* @return the number of streams in the pool
*/
virtual std::size_t get_stream_pool_size() const = 0;
};

/**
* @brief Return the global cuda_stream_pool object.
*
* TODO: document how to control the implementation
*
* @return The cuda_stream_pool singleton.
*/
cuda_stream_pool& global_cuda_stream_pool();

/**
* @brief Synchronize a set of streams to an event on another stream.
*
* @param streams Streams to be forked; each will wait on an event recorded on `stream`.
* @param stream Stream to synchronize the forked streams to, usually the default stream.
*/
void fork_streams(host_span<rmm::cuda_stream_view> streams, rmm::cuda_stream_view stream);

/**
* @brief Synchronize a stream to an event on a set of streams.
*
* @param streams Streams to wait on.
* @param stream Joined stream that synchronizes with the waited-on streams.
*/
void join_streams(host_span<rmm::cuda_stream_view> streams, rmm::cuda_stream_view stream);

} // namespace cudf::io::detail::parquet
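
Putting the pieces together, the intended usage pattern looks like this (a sketch; my_kernel_a/my_kernel_b and the stream count are placeholders):

// Sketch of the fork/join pattern this header is built around.
#include "stream_pool.hpp"

#include <rmm/cuda_stream_view.hpp>

void decode_on_pool(rmm::cuda_stream_view user_stream)
{
  using namespace cudf::io::detail::parquet;

  // Acquire pooled streams and make them wait on the work already
  // enqueued on user_stream (fork records a single event on user_stream).
  auto streams = global_cuda_stream_pool().get_streams(2);
  fork_streams(streams, user_stream);

  // Launch independent work, one kernel per pooled stream, e.g.:
  //   my_kernel_a<<<grid, block, 0, streams[0].value()>>>(...);
  //   my_kernel_b<<<grid, block, 0, streams[1].value()>>>(...);

  // user_stream waits on both pooled streams; anything enqueued on it
  // afterwards is ordered after the kernels above.
  join_streams(streams, user_stream);
}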