Skip to content

Commit

Permalink
Expose stream parameter in public strings filter APIs (#14293)
Browse files Browse the repository at this point in the history
Add stream parameter to public APIs:

- `cudf::strings::translate()`
- `cudf::strings::filter_characters()`
- `cudf::strings::filter_characters_of_type()`
- `cudf::strings::all_characters_of_type()`
- `cudf::strings::reverse()`

Also cleaned up some of the doxygen comments.

Reference #13744

Authors:
  - David Wendt (https://github.com/davidwendt)
  - https://github.com/shrshi

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - https://github.com/shrshi

URL: #14293
  • Loading branch information
davidwendt authored Oct 27, 2023
1 parent 7d6c377 commit 52f7d5c
Show file tree
Hide file tree
Showing 10 changed files with 162 additions and 35 deletions.
26 changes: 15 additions & 11 deletions cpp/include/cudf/strings/char_types/char_types.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -53,18 +53,20 @@ namespace strings {
*
* Any null row results in a null entry for that row in the output column.
*
* @param strings Strings instance for this operation.
* @param types The character types to check in each string.
* @param input Strings instance for this operation
* @param types The character types to check in each string
* @param verify_types Only verify against these character types.
* Default `ALL_TYPES` means return `true`
* iff all characters match `types`.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of boolean results for each string.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column of boolean results for each string
*/
std::unique_ptr<column> all_characters_of_type(
strings_column_view const& strings,
strings_column_view const& input,
string_character_types types,
string_character_types verify_types = string_character_types::ALL_TYPES,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -96,20 +98,22 @@ std::unique_ptr<column> all_characters_of_type(
* @throw cudf::logic_error if neither or both `types_to_remove` and
* `types_to_keep` are set to `ALL_TYPES`.
*
* @param strings Strings instance for this operation.
* @param input Strings instance for this operation
* @param types_to_remove The character types to check in each string.
* Use `ALL_TYPES` here to specify `types_to_keep` instead.
* @param replacement The replacement character to use when removing characters.
* @param replacement The replacement character to use when removing characters
* @param types_to_keep Default `ALL_TYPES` means all characters of
* `types_to_remove` will be filtered.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column of boolean results for each string.
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column with the characters of the specified types filtered out
*/
std::unique_ptr<column> filter_characters_of_type(
strings_column_view const& strings,
strings_column_view const& input,
string_character_types types_to_remove,
string_scalar const& replacement = string_scalar(""),
string_character_types types_to_keep = string_character_types::ALL_TYPES,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
Expand Down
4 changes: 3 additions & 1 deletion cpp/include/cudf/strings/reverse.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -42,10 +42,12 @@ namespace strings {
*
* @param input Strings column for this operation
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New strings column
*/
std::unique_ptr<column> reverse(
strings_column_view const& input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
Expand Down
30 changes: 17 additions & 13 deletions cpp/include/cudf/strings/translate.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -47,14 +47,16 @@ namespace strings {
* r is now ["AA", "", "cccc", "AcQ"]
* @endcode
*
* @param strings Strings instance for this operation.
* @param chars_table Table of UTF-8 character mappings.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column with padded strings.
* @param input Strings instance for this operation
* @param chars_table Table of UTF-8 character mappings
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column with translated strings
*/
std::unique_ptr<column> translate(
strings_column_view const& strings,
strings_column_view const& input,
std::vector<std::pair<char_utf8, char_utf8>> const& chars_table,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
Expand Down Expand Up @@ -87,19 +89,21 @@ enum class filter_type : bool {
*
* @throw cudf::logic_error if `replacement` is invalid
*
* @param strings Strings instance for this operation.
* @param characters_to_filter Table of character ranges to filter on.
* @param input Strings instance for this operation
* @param characters_to_filter Table of character ranges to filter on
* @param keep_characters If true, the `characters_to_filter` are retained and all other characters
* are removed.
* @param replacement Optional replacement string for each character removed.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New column with filtered strings.
* are removed
* @param replacement Optional replacement string for each character removed
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned column's device memory
* @return New column with filtered strings
*/
std::unique_ptr<column> filter_characters(
strings_column_view const& strings,
strings_column_view const& input,
std::vector<std::pair<cudf::char_utf8, cudf::char_utf8>> characters_to_filter,
filter_type keep_characters = filter_type::KEEP,
string_scalar const& replacement = string_scalar(""),
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
Expand Down
11 changes: 6 additions & 5 deletions cpp/src/strings/char_types/char_types.cu
Original file line number Diff line number Diff line change
Expand Up @@ -214,25 +214,26 @@ std::unique_ptr<column> filter_characters_of_type(strings_column_view const& str

// external API

std::unique_ptr<column> all_characters_of_type(strings_column_view const& strings,
std::unique_ptr<column> all_characters_of_type(strings_column_view const& input,
string_character_types types,
string_character_types verify_types,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::all_characters_of_type(
strings, types, verify_types, cudf::get_default_stream(), mr);
return detail::all_characters_of_type(input, types, verify_types, stream, mr);
}

std::unique_ptr<column> filter_characters_of_type(strings_column_view const& strings,
std::unique_ptr<column> filter_characters_of_type(strings_column_view const& input,
string_character_types types_to_remove,
string_scalar const& replacement,
string_character_types types_to_keep,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::filter_characters_of_type(
strings, types_to_remove, replacement, types_to_keep, cudf::get_default_stream(), mr);
input, types_to_remove, replacement, types_to_keep, stream, mr);
}

} // namespace strings
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/strings/filter_chars.cu
Original file line number Diff line number Diff line change
Expand Up @@ -154,15 +154,16 @@ std::unique_ptr<column> filter_characters(
* @copydoc cudf::strings::filter_characters
*/
std::unique_ptr<column> filter_characters(
strings_column_view const& strings,
strings_column_view const& input,
std::vector<std::pair<cudf::char_utf8, cudf::char_utf8>> characters_to_filter,
filter_type keep_characters,
string_scalar const& replacement,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::filter_characters(
strings, characters_to_filter, keep_characters, replacement, cudf::get_default_stream(), mr);
input, characters_to_filter, keep_characters, replacement, stream, mr);
}

} // namespace strings
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/strings/reverse.cu
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,11 @@ std::unique_ptr<column> reverse(strings_column_view const& input,
} // namespace detail

std::unique_ptr<column> reverse(strings_column_view const& input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::reverse(input, cudf::get_default_stream(), mr);
return detail::reverse(input, stream, mr);
}

} // namespace strings
Expand Down
5 changes: 3 additions & 2 deletions cpp/src/strings/translate.cu
Original file line number Diff line number Diff line change
Expand Up @@ -124,12 +124,13 @@ std::unique_ptr<column> translate(strings_column_view const& strings,

// external APIs

std::unique_ptr<column> translate(strings_column_view const& strings,
std::unique_ptr<column> translate(strings_column_view const& input,
std::vector<std::pair<uint32_t, uint32_t>> const& chars_table,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
CUDF_FUNC_RANGE();
return detail::translate(strings, chars_table, cudf::get_default_stream(), mr);
return detail::translate(input, chars_table, stream, mr);
}

} // namespace strings
Expand Down
2 changes: 2 additions & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -641,8 +641,10 @@ ConfigureTest(
streams/strings/contains_test.cpp
streams/strings/convert_test.cpp
streams/strings/extract_test.cpp
streams/strings/filter_test.cpp
streams/strings/find_test.cpp
streams/strings/replace_test.cpp
streams/strings/reverse_test.cpp
streams/strings/split_test.cpp
streams/strings/strings_tests.cpp
STREAM_MODE
Expand Down
77 changes: 77 additions & 0 deletions cpp/tests/streams/strings/filter_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/strings/char_types/char_types.hpp>
#include <cudf/strings/translate.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

#include <string>
#include <vector>

// Fixture for the tests below, which pass an explicit stream to the strings
// translate and filter APIs.
class StringsFilterTest : public cudf::test::BaseFixture {};

static std::pair<cudf::char_utf8, cudf::char_utf8> make_entry(char const* from, char const* to)
{
cudf::char_utf8 in = 0;
cudf::char_utf8 out = 0;
cudf::strings::detail::to_char_utf8(from, in);
if (to) cudf::strings::detail::to_char_utf8(to, out);
return std::pair(in, out);
}

// Calls cudf::strings::translate() with an explicitly supplied stream.
// The returned column is discarded; no output values are asserted here.
TEST_F(StringsFilterTest, Translate)
{
  auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"});
  auto view  = cudf::strings_column_view(input);

  // A null `to` argument leaves the second element of the entry as 0
  // (see make_entry); use nullptr rather than a literal 0 for the pointer.
  std::vector<std::pair<cudf::char_utf8, cudf::char_utf8>> translate_table{
    make_entry("b", nullptr), make_entry("a", "A"), make_entry(" ", "_")};
  cudf::strings::translate(view, translate_table, cudf::test::get_default_stream());
}

// Calls cudf::strings::filter_characters() with an explicitly supplied stream.
// The replacement scalar is also constructed on the test stream. The returned
// column is discarded; no output values are asserted here.
TEST_F(StringsFilterTest, Filter)
{
  auto input = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"});
  auto view  = cudf::strings_column_view(input);

  // A null second argument leaves that element of the entry as 0
  // (see make_entry); use nullptr rather than a literal 0 for the pointer.
  std::vector<std::pair<cudf::char_utf8, cudf::char_utf8>> filter_table{
    make_entry("b", nullptr), make_entry("a", "A"), make_entry(" ", "_")};

  auto const repl = cudf::string_scalar("X", true, cudf::test::get_default_stream());
  auto const keep = cudf::strings::filter_type::KEEP;
  cudf::strings::filter_characters(
    view, filter_table, keep, repl, cudf::test::get_default_stream());
}

// Calls all_characters_of_type() and filter_characters_of_type() with an
// explicitly supplied stream. Both results are discarded; no output values are
// asserted here.
TEST_F(StringsFilterTest, FilterTypes)
{
  using cudf::strings::string_character_types;

  auto column       = cudf::test::strings_column_wrapper({" aBc ", " ", "aaaa ", "\tb"});
  auto strings_view = cudf::strings_column_view(column);

  // Passed as `types` with ALL_TYPES as `verify_types`.
  auto const alpha_types = string_character_types::LOWER | string_character_types::UPPER;
  auto const every_type  = string_character_types::ALL_TYPES;
  cudf::strings::all_characters_of_type(
    strings_view, alpha_types, every_type, cudf::test::get_default_stream());

  // ALL_TYPES as `types_to_remove`, SPACE as `types_to_keep`.
  auto const replacement = cudf::string_scalar("X", true, cudf::test::get_default_stream());
  auto const kept_types  = string_character_types::SPACE;
  cudf::strings::filter_characters_of_type(
    strings_view, every_type, replacement, kept_types, cudf::test::get_default_stream());
}
34 changes: 34 additions & 0 deletions cpp/tests/streams/strings/reverse_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/strings/reverse.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/default_stream.hpp>

#include <string>
#include <vector>

// Fixture for the test below, which passes an explicit stream to
// cudf::strings::reverse().
class StringsReverseTest : public cudf::test::BaseFixture {};

// Calls cudf::strings::reverse() with an explicitly supplied stream.
// The returned column is discarded; no output values are asserted here.
TEST_F(StringsReverseTest, Reverse)
{
  auto column       = cudf::test::strings_column_wrapper({"aBcdef", " ", "12345"});
  auto strings_view = cudf::strings_column_view(column);

  auto const stream = cudf::test::get_default_stream();
  cudf::strings::reverse(strings_view, stream);
}

0 comments on commit 52f7d5c

Please sign in to comment.