From 5039d043a08e7ea7e5656bab60a6fced4dfa2f1d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 10 Oct 2023 15:06:24 -0400 Subject: [PATCH] Expose stream parameter in public strings APIs (#14260) Add stream parameter to public APIs: - `cudf::strings::strip()` - `cudf::strings::slice_strings()` - `cudf::strings::pad()` - `cudf::strings::zfill()` - `cudf::strings::wrap()` Also cleaned up some of the doxygen comments and added stream-tests. Reference #13744 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14260 --- cpp/include/cudf/strings/padding.hpp | 14 ++-- cpp/include/cudf/strings/slice.hpp | 30 +++++---- cpp/include/cudf/strings/strip.hpp | 4 +- cpp/include/cudf/strings/wrap.hpp | 14 ++-- cpp/src/strings/padding.cu | 6 +- cpp/src/strings/slice.cu | 7 +- cpp/src/strings/strip.cu | 5 +- cpp/src/strings/wrap.cu | 8 +-- cpp/tests/CMakeLists.txt | 4 +- cpp/tests/streams/strings/strings_tests.cpp | 71 +++++++++++++++++++++ 10 files changed, 125 insertions(+), 38 deletions(-) create mode 100644 cpp/tests/streams/strings/strings_tests.cpp diff --git a/cpp/include/cudf/strings/padding.hpp b/cpp/include/cudf/strings/padding.hpp index 7699159fbea..f0cb351eeda 100644 --- a/cpp/include/cudf/strings/padding.hpp +++ b/cpp/include/cudf/strings/padding.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -51,6 +51,7 @@ namespace strings { * Default is pad right (left justify) * @param fill_char Single UTF-8 character to use for padding; * Default is the space character + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return New column with padded strings */ @@ -59,6 +60,7 @@ std::unique_ptr pad( size_type width, side_type side = side_type::RIGHT, std::string_view fill_char = " ", + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -79,14 +81,16 @@ std::unique_ptr pad( * r is now ['001234','-09876','+00.34','-342567', '0002+2'] * @endcode * - * @param input Strings instance for this operation. - * @param width The minimum number of characters for each string. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column of strings. + * @param input Strings instance for this operation + * @param width The minimum number of characters for each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of strings */ std::unique_ptr zfill( strings_column_view const& input, size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/slice.hpp b/cpp/include/cudf/strings/slice.hpp index 5f2c71725eb..f106663be9b 100644 --- a/cpp/include/cudf/strings/slice.hpp +++ b/cpp/include/cudf/strings/slice.hpp @@ -50,18 +50,20 @@ namespace strings { * r2 is now ["lo","ob"] * @endcode * - * @param strings Strings column for this operation. - * @param start First character position to begin the substring. 
- * @param stop Last character position (exclusive) to end the substring. - * @param step Distance between input characters retrieved. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with sorted elements of this instance. + * @param input Strings column for this operation + * @param start First character position to begin the substring + * @param stop Last character position (exclusive) to end the substring + * @param step Distance between input characters retrieved + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column containing the requested substrings */ std::unique_ptr slice_strings( - strings_column_view const& strings, + strings_column_view const& input, numeric_scalar const& start = numeric_scalar(0, false), numeric_scalar const& stop = numeric_scalar(0, false), numeric_scalar const& step = numeric_scalar(1), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -95,16 +97,18 @@ std::unique_ptr slice_strings( * @throw cudf::logic_error if starts and stops are not same integer type. * @throw cudf::logic_error if starts or stops contains nulls. * - * @param strings Strings column for this operation. - * @param starts First character positions to begin the substring. - * @param stops Last character (exclusive) positions to end the substring. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New strings column with sorted elements of this instance.
+ * @param input Strings column for this operation + * @param starts First character positions to begin the substring + * @param stops Last character (exclusive) positions to end the substring + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New strings column containing the requested substrings */ std::unique_ptr slice_strings( - strings_column_view const& strings, + strings_column_view const& input, column_view const& starts, column_view const& stops, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/strip.hpp b/cpp/include/cudf/strings/strip.hpp index adf3b291144..556d6805ac3 100644 --- a/cpp/include/cudf/strings/strip.hpp +++ b/cpp/include/cudf/strings/strip.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,6 +57,7 @@ namespace strings { * string; Default is both * @param to_strip UTF-8 encoded characters to strip from each string; * Default is empty string which indicates strip whitespace characters + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column.
*/ @@ -64,6 +65,7 @@ std::unique_ptr strip( strings_column_view const& input, side_type side = side_type::BOTH, string_scalar const& to_strip = string_scalar(""), + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/include/cudf/strings/wrap.hpp b/cpp/include/cudf/strings/wrap.hpp index 8d2d43c7f0f..efdc3e62aff 100644 --- a/cpp/include/cudf/strings/wrap.hpp +++ b/cpp/include/cudf/strings/wrap.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -57,14 +57,16 @@ namespace strings { * wrapped_string_tbl = ["the quick\nbrown fox\njumped over\nthe lazy\nbrown dog", "hello, world"] * ``` * - * @param[in] strings String column. - * @param[in] width Maximum character width of a line within each string. - * @param[in] mr Device memory resource used to allocate the returned column's device memory - * @return Column of wrapped strings. 
+ * @param input String column + * @param width Maximum character width of a line within each string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Column of wrapped strings */ std::unique_ptr wrap( - strings_column_view const& strings, + strings_column_view const& input, size_type width, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of doxygen group diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu index c501a8bf7b4..850ccaa4535 100644 --- a/cpp/src/strings/padding.cu +++ b/cpp/src/strings/padding.cu @@ -168,18 +168,20 @@ std::unique_ptr pad(strings_column_view const& input, size_type width, side_type side, std::string_view fill_char, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::pad(input, width, side, fill_char, cudf::get_default_stream(), mr); + return detail::pad(input, width, side, fill_char, stream, mr); } std::unique_ptr zfill(strings_column_view const& input, size_type width, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::zfill(input, width, cudf::get_default_stream(), mr); + return detail::zfill(input, width, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu index cce6a19a5a6..5a1fee92c7d 100644 --- a/cpp/src/strings/slice.cu +++ b/cpp/src/strings/slice.cu @@ -248,20 +248,21 @@ std::unique_ptr slice_strings(strings_column_view const& strings, numeric_scalar const& start, numeric_scalar const& stop, numeric_scalar const& step, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings(strings, start, stop, step, cudf::get_default_stream(), mr); + return 
detail::slice_strings(strings, start, stop, step, stream, mr); } std::unique_ptr slice_strings(strings_column_view const& strings, column_view const& starts_column, column_view const& stops_column, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::slice_strings( - strings, starts_column, stops_column, cudf::get_default_stream(), mr); + return detail::slice_strings(strings, starts_column, stops_column, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu index 6fb7c671a87..26df76850f7 100644 --- a/cpp/src/strings/strip.cu +++ b/cpp/src/strings/strip.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -86,10 +86,11 @@ std::unique_ptr strip(strings_column_view const& input, std::unique_ptr strip(strings_column_view const& input, side_type side, string_scalar const& to_strip, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::strip(input, side, to_strip, cudf::get_default_stream(), mr); + return detail::strip(input, side, to_strip, stream, mr); } } // namespace strings diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu index 335908d65d1..aa87a663964 100644 --- a/cpp/src/strings/wrap.cu +++ b/cpp/src/strings/wrap.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * Copyright (c) 2020-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -19,10 +19,9 @@ #include #include #include -#include -#include #include #include +#include #include #include @@ -133,10 +132,11 @@ std::unique_ptr wrap(strings_column_view const& strings, std::unique_ptr wrap(strings_column_view const& strings, size_type width, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::wrap(strings, width, cudf::get_default_stream(), mr); + return detail::wrap(strings, width, stream, mr); } } // namespace strings diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index ffaba7d6fa7..b15a6c41d39 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -633,8 +633,8 @@ ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest( - STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE - testing + STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp + streams/strings/strings_tests.cpp STREAM_MODE testing ) ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/strings/strings_tests.cpp b/cpp/tests/streams/strings/strings_tests.cpp new file mode 100644 index 00000000000..0db467a6895 --- /dev/null +++ b/cpp/tests/streams/strings/strings_tests.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +class StringsTest : public cudf::test::BaseFixture {}; + +TEST_F(StringsTest, Strip) +{ + auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"}); + auto view = cudf::strings_column_view(input); + + auto const strip = cudf::string_scalar(" ", true, cudf::test::get_default_stream()); + auto const side = cudf::strings::side_type::BOTH; + cudf::strings::strip(view, side, strip, cudf::test::get_default_stream()); +} + +TEST_F(StringsTest, Pad) +{ + auto input = cudf::test::strings_column_wrapper({"333", "", "4444", "1"}); + auto view = cudf::strings_column_view(input); + + auto const side = cudf::strings::side_type::BOTH; + cudf::strings::pad(view, 6, side, " ", cudf::test::get_default_stream()); + cudf::strings::zfill(view, 6, cudf::test::get_default_stream()); +} + +TEST_F(StringsTest, Wrap) +{ + auto input = cudf::test::strings_column_wrapper({"the quick brown fox jumped"}); + auto view = cudf::strings_column_view(input); + + cudf::strings::wrap(view, 6, cudf::test::get_default_stream()); +} + +TEST_F(StringsTest, Slice) +{ + auto input = cudf::test::strings_column_wrapper({"hello", "these", "are test strings"}); + auto view = cudf::strings_column_view(input); + + auto start = cudf::numeric_scalar(2, true, cudf::test::get_default_stream()); + auto stop = cudf::numeric_scalar(5, true, cudf::test::get_default_stream()); + auto step = cudf::numeric_scalar(1, true, cudf::test::get_default_stream()); + cudf::strings::slice_strings(view, 
start, stop, step, cudf::test::get_default_stream()); + + auto starts = cudf::test::fixed_width_column_wrapper({1, 2, 3}); + auto stops = cudf::test::fixed_width_column_wrapper({4, 5, 6}); + cudf::strings::slice_strings(view, starts, stops, cudf::test::get_default_stream()); +}