From dd51eb3ee1d2bbaefe1b1271752a288bc511fd6a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 24 Jan 2025 16:22:55 -0500 Subject: [PATCH 01/21] Add new nvtext::normalize_characters API --- cpp/include/nvtext/normalize.hpp | 103 ++++++++- cpp/src/text/normalize.cu | 280 ++++++++++++++++++++++-- cpp/src/text/normalize.cuh | 106 +++++++++ cpp/src/text/subword/data_normalizer.cu | 76 +------ cpp/tests/text/normalize_tests.cpp | 72 ++++-- 5 files changed, 521 insertions(+), 116 deletions(-) create mode 100644 cpp/src/text/normalize.cuh diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 74325f4a406..10797164945 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -107,5 +107,106 @@ std::unique_ptr normalize_characters( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Normalizer object to be used with nvtext::normalize_characters + * + * Use nvtext::create_normalizer to create this object. + * + * This normalizer includes: + * + * - adding padding around punctuation (unicode category starts with "P") + * as well as certain ASCII symbols like "^" and "$" + * - adding padding around the [CJK Unicode block + * characters](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)) + * - changing whitespace (e.g. `"\t", "\n", "\r"`) to just space `" "` + * - removing control characters (unicode categories "Cc" and "Cf") + * + * The padding process here adds a single space before and after the character. 
+ * Details on _unicode category_ can be found here: + * https://unicodebook.readthedocs.io/unicode.html#categories + * + * If `do_lower_case = true`, lower-casing also removes the accents. The + * accents cannot be removed from upper-case characters without lower-casing + * and lower-casing cannot be performed without also removing accents. + * However, if the accented character is already lower-case, then only the + * accent is removed. + */ +struct character_normalizer { + /** + * @brief Normalizer object constructor + * + * This initializes and holds the character normalizing tables and settings. + * + * @param do_lower_case If true, upper-case characters are converted to + * lower-case and accents are stripped from those characters. + * If false, accented and upper-case characters are not transformed. + * @param allow_special_tokens If true, the following character sequences are not + * normalized if encountered in the input: + * `[BOS] [EOS] [UNK] [SEP] [PAD] [CLS] [MASK]` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ + character_normalizer(bool do_lower_case, + bool allow_special_tokens = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + ~character_normalizer(); + + struct character_normalizer_impl; + character_normalizer_impl* _impl{}; +}; + +/** + * @brief Create a normalizer object + * + * Creates a normalizer object which can be reused on multiple calls to + * nvtext::normalize_characters + * + * @see nvtext::character_normalizer + * + * @param do_lower_case If true, upper-case characters are converted to + * lower-case and accents are stripped from those characters. + * If false, accented and upper-case characters are not transformed. 
+ * @param allow_special_tokens If true, the following character sequences are not + * normalized if encountered in the input: + * `[BOS] [EOS] [UNK] [SEP] [PAD] [CLS] [MASK]` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Object to be used with nvtext::normalize_characters + */ +std::unique_ptr create_character_normalizer( + bool do_lower_case, + bool allow_special_tokens = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Normalizes strings characters for tokenizing + * + * @see nvtext::character_normalizer for details on the normalizer behavior + * + * @code{.pseudo} + * s = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + * s1 = normalize_characters(s,true) + * s1 is now ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + * s2 = normalize_characters(s,false) + * s2 is now ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + * @endcode + * + * A null input element at row `i` produces a corresponding null entry + * for row `i` in the output column. 
+ * + * @param input The input strings to normalize + * @param normalizer Normalizer to use for this function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to allocate any returned objects + * @return Normalized strings column + */ +std::unique_ptr normalize_characters( + cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 7e2b766862d..2d1ce290aec 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "text/normalize.cuh" #include "text/subword/detail/data_normalizer.hpp" #include "text/subword/detail/tokenizer_utils.cuh" #include "text/utilities/tokenize_ops.cuh" @@ -38,9 +39,11 @@ #include +#include #include #include #include +#include #include #include @@ -103,6 +106,35 @@ constexpr uint32_t UTF8_1BYTE = 0x0080; constexpr uint32_t UTF8_2BYTE = 0x0800; constexpr uint32_t UTF8_3BYTE = 0x01'0000; +__device__ int8_t cp_to_utf8(uint32_t codepoint, char* out) +{ + auto utf8 = cudf::strings::detail::codepoint_to_utf8(codepoint); + return cudf::strings::detail::from_char_utf8(utf8, out); +#if 0 + auto out_ptr = out; + if (codepoint < UTF8_1BYTE) // ASCII range + *out_ptr++ = static_cast(codepoint); + else if (codepoint < UTF8_2BYTE) { // create two-byte UTF-8 + // b00001xxx:byyyyyyyy => b110xxxyy:b10yyyyyy + *out_ptr++ = static_cast((((codepoint << 2) & 0x00'1F00) | 0x00'C000) >> 8); + *out_ptr++ = static_cast((codepoint & 0x3F) | 0x0080); + } else if (codepoint < UTF8_3BYTE) { // create three-byte UTF-8 + // bxxxxxxxx:byyyyyyyy => b1110xxxx:b10xxxxyy:b10yyyyyy + *out_ptr++ = static_cast((((codepoint << 4) & 0x0F'0000) | 0x00E0'0000) >> 16); + *out_ptr++ = static_cast((((codepoint << 2) & 0x00'3F00) | 0x00'8000) >> 8); + *out_ptr++ = static_cast((codepoint & 0x3F) | 0x0080); + } else { // create four-byte UTF-8 + // maximum code-point value is 0x0011'0000 + // b000xxxxx:byyyyyyyy:bzzzzzzzz => b11110xxx:b10xxyyyy:b10yyyyzz:b10zzzzzz + *out_ptr++ = static_cast((((codepoint << 6) & 0x0700'0000u) | 0xF000'0000u) >> 24); + *out_ptr++ = static_cast((((codepoint << 4) & 0x003F'0000u) | 0x0080'0000u) >> 16); + *out_ptr++ = static_cast((((codepoint << 2) & 0x00'3F00u) | 0x00'8000u) >> 8); + *out_ptr++ = static_cast((codepoint & 0x3F) | 0x0080); + } + return static_cast(thrust::distance(out, out_ptr)); +#endif +} + /** * @brief Convert code-point arrays into UTF-8 bytes for each string. 
*/ @@ -148,26 +180,8 @@ struct codepoint_to_utf8_fn { // convert each code-point to 1-4 UTF-8 encoded bytes char* out_ptr = d_chars + d_offsets[idx]; for (uint32_t jdx = 0; jdx < count; ++jdx) { - uint32_t code_point = *str_cps++; - if (code_point < UTF8_1BYTE) // ASCII range - *out_ptr++ = static_cast(code_point); - else if (code_point < UTF8_2BYTE) { // create two-byte UTF-8 - // b00001xxx:byyyyyyyy => b110xxxyy:b10yyyyyy - *out_ptr++ = static_cast((((code_point << 2) & 0x00'1F00) | 0x00'C000) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } else if (code_point < UTF8_3BYTE) { // create three-byte UTF-8 - // bxxxxxxxx:byyyyyyyy => b1110xxxx:b10xxxxyy:b10yyyyyy - *out_ptr++ = static_cast((((code_point << 4) & 0x0F'0000) | 0x00E0'0000) >> 16); - *out_ptr++ = static_cast((((code_point << 2) & 0x00'3F00) | 0x00'8000) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } else { // create four-byte UTF-8 - // maximum code-point value is 0x0011'0000 - // b000xxxxx:byyyyyyyy:bzzzzzzzz => b11110xxx:b10xxyyyy:b10yyyyzz:b10zzzzzz - *out_ptr++ = static_cast((((code_point << 6) & 0x0700'0000u) | 0xF000'0000u) >> 24); - *out_ptr++ = static_cast((((code_point << 4) & 0x003F'0000u) | 0x0080'0000u) >> 16); - *out_ptr++ = static_cast((((code_point << 2) & 0x00'3F00u) | 0x00'8000u) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } + uint32_t codepoint = *str_cps++; + out_ptr += cp_to_utf8(codepoint, out_ptr); } } }; @@ -261,4 +275,228 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con return detail::normalize_characters(input, do_lower_case, stream, mr); } +struct character_normalizer::character_normalizer_impl { + rmm::device_uvector cp_metadata; + rmm::device_uvector aux_table; + bool do_lower_case; + bool special_tokens; + + character_normalizer_impl(rmm::device_uvector&& cp_metadata, + rmm::device_uvector&& aux_table, + bool do_lower_case, + bool special_tokens) + : cp_metadata(std::move(cp_metadata)), + 
aux_table(std::move(aux_table)), + do_lower_case{do_lower_case}, + special_tokens{special_tokens} + { + } +}; + +character_normalizer::character_normalizer(bool do_lower_case, + bool special_tokens, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref) +{ + auto cp_metadata = nvtext::detail::get_codepoint_metadata(stream); + auto aux_table = nvtext::detail::get_aux_codepoint_data(stream); + + _impl = new character_normalizer_impl( + std::move(cp_metadata), std::move(aux_table), do_lower_case, special_tokens); +} +character_normalizer::~character_normalizer() { delete _impl; } + +std::unique_ptr create_character_normalizer(bool do_lower_case, + bool allow_special_tokens, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return std::make_unique(do_lower_case, allow_special_tokens, stream, mr); +} + +namespace detail { +namespace { +CUDF_KERNEL void normalizer_kernel(char const* d_chars, + int64_t total_bytes, + codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case, + bool, // allow_special_tokens, + uint32_t* d_output, + int8_t* chars_per_thread) +{ + uint32_t replacement[MAX_NEW_CHARS] = {0}; + + auto const idx = cudf::detail::grid_1d::global_thread_id(); + int8_t num_new_chars = 0; + + if ((idx < total_bytes) && cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { + auto const cp = [utf8 = d_chars + idx] { + cudf::char_utf8 ch_utf8; + auto const ch_size = cudf::strings::detail::to_char_utf8(utf8, ch_utf8); + return cudf::strings::detail::utf8_to_codepoint(ch_utf8); + }(); + auto const metadata = cp_metadata[cp]; + + if (!should_remove_cp(metadata, do_lower_case)) { + num_new_chars = 1; + // Apply lower cases and accent stripping if necessary + auto const new_cp = do_lower_case || always_replace(metadata) ? get_first_cp(metadata) : cp; + replacement[0] = new_cp == 0 ? 
cp : new_cp; + + if (do_lower_case && is_multi_char_transform(metadata)) { + auto const next_cps = aux_table[cp]; + replacement[1] = static_cast(next_cps >> 32); + replacement[2] = static_cast(next_cps & 0xFFFFFFFF); + num_new_chars = 2 + (replacement[2] != 0); + } + + // check for possible special tokens here before checking add-spaces? + + if (should_add_spaces(metadata, do_lower_case) && (num_new_chars == 1)) { + // Need to shift all existing code-points up one. + // This is a rotate right. There is no thrust equivalent at this time. + // for (int loc = num_new_chars; loc > 0; --loc) { + // replacement[loc] = replacement[loc - 1]; + //} + // Write the required spaces at each end + replacement[1] = replacement[0]; + replacement[0] = SPACE_CODE_POINT; + replacement[2] = SPACE_CODE_POINT; + num_new_chars = 3; + } + + // convert back to UTF-8 + for (int k = 0; k < num_new_chars; ++k) { + auto const new_cp = replacement[k]; + if (new_cp) { cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); } + } + } + } + + if (idx < total_bytes) { chars_per_thread[idx] = num_new_chars; } + + using BlockStore = + cub::BlockStore; + __shared__ typename BlockStore::TempStorage temp_storage; + + // Now we perform coalesced writes back to global memory using cub. 
+ auto output_offset = blockIdx.x * blockDim.x * MAX_NEW_CHARS; + auto block_base = d_output + output_offset; + auto valid_items = + min(static_cast(total_bytes - output_offset), static_cast(blockDim.x)); + BlockStore(temp_storage).Store(block_base, replacement, valid_items); +} + +template +rmm::device_uvector compute_sizes(int8_t const* sizes, + OffsetType offsets, + int64_t offset, + cudf::size_type size, + rmm::cuda_stream_view stream) +{ + auto output_sizes = rmm::device_uvector(size, stream); + + auto d_in = sizes; + auto d_out = output_sizes.begin(); + std::size_t temp = 0; + nvtxRangePushA("segmented_reduce"); + if (offset == 0) { + cub::DeviceSegmentedReduce::Sum( + nullptr, temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); + auto d_temp = rmm::device_buffer{temp, stream}; + cub::DeviceSegmentedReduce::Sum( + d_temp.data(), temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); + } else { + // offsets need to be normalized for segmented-reduce to work efficiently + auto d_offsets = rmm::device_uvector(size + 1, stream); + thrust::transform(rmm::exec_policy_nosync(stream), + offsets, + offsets + size + 1, + d_offsets.begin(), + [offset] __device__(auto o) { return o - offset; }); + auto const offsets_itr = d_offsets.begin(); + cub::DeviceSegmentedReduce::Sum( + nullptr, temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value()); + auto d_temp = rmm::device_buffer{temp, stream}; + cub::DeviceSegmentedReduce::Sum( + d_temp.data(), temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value()); + } + stream.synchronize(); + nvtxRangePop(); + + return output_sizes; +} +} // namespace +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + if (input.is_empty()) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); } + + auto const first_offset = (input.offset() == 
0) ? 0 + : cudf::strings::detail::get_offset_value( + input.offsets(), input.offset(), stream); + auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1) + ? input.chars_size(stream) + : cudf::strings::detail::get_offset_value( + input.offsets(), input.size() + input.offset(), stream); + auto const chars_size = last_offset - first_offset; + auto const d_input_chars = input.chars_begin(stream) + first_offset; + + if (chars_size == 0) { return std::make_unique(input.parent(), stream, mr); } + + constexpr int64_t block_size = 64; + cudf::detail::grid_1d grid{chars_size, block_size}; + auto const max_new_char_total = MAX_NEW_CHARS * chars_size; + + auto d_code_points = rmm::device_uvector(max_new_char_total, stream); + auto d_sizes = rmm::device_uvector(chars_size, stream); + normalizer_kernel<<>>( + d_input_chars, + chars_size, + normalizer._impl->cp_metadata.data(), + normalizer._impl->aux_table.data(), + normalizer._impl->do_lower_case, + normalizer._impl->special_tokens, + d_code_points.data(), + d_sizes.data()); + + auto const input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + + // use segmented-reduce with input_offsets over d_sizes to get the size of the output rows + auto output_sizes = + compute_sizes(d_sizes.data(), input_offsets, first_offset, input.size(), stream); + + // convert the sizes to offsets + auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( + output_sizes.begin(), output_sizes.end(), stream, mr); + + // create output chars and use remove-copy(0) on d_code_points + rmm::device_uvector chars(total_size, stream, mr); + auto begin = reinterpret_cast(d_code_points.begin()); + auto end = reinterpret_cast(d_code_points.end()); + thrust::remove_copy(rmm::exec_policy_nosync(stream), begin, end, chars.data(), 0); + + return cudf::make_strings_column(input.size(), + std::move(offsets), + chars.release(), + input.null_count(), + 
cudf::detail::copy_bitmask(input.parent(), stream, mr)); +} + +} // namespace detail + +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::normalize_characters(input, normalizer, stream, mr); +} + } // namespace nvtext diff --git a/cpp/src/text/normalize.cuh b/cpp/src/text/normalize.cuh new file mode 100644 index 00000000000..8b1f4408ee8 --- /dev/null +++ b/cpp/src/text/normalize.cuh @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2020-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "text/subword/detail/cp_data.h" + +#include +#include +#include + +#include + +namespace nvtext { +namespace detail { + +/** + * @brief Bit used to filter out invalid code points. + * + * When normalizing characters to code point values, if this bit is set, + * the code point should be filtered out before returning from the normalizer. + */ +constexpr uint32_t FILTER_BIT = 22; + +/** + * @brief Retrieve new code point from metadata value. + * + * @param metadata Value from the codepoint_metadata table. + * @return The replacement character if appropriate. + */ +__device__ constexpr uint32_t get_first_cp(uint32_t metadata) { return metadata & NEW_CP_MASK; } + +/** + * @brief Retrieve token category from the metadata value. 
+ * + * Category values are 0-5: + * 0 - character should be padded + * 1 - pad character if lower-case + * 2 - character should be removed + * 3 - remove character if lower-case + * 4 - whitespace character -- always replace + * 5 - uncategorized + * + * @param metadata Value from the codepoint_metadata table. + * @return Category value. + */ +__device__ constexpr uint32_t extract_token_cat(uint32_t metadata) +{ + return (metadata >> TOKEN_CAT_SHIFT) & TOKEN_CAT_MASK; +} + +/** + * @brief Return true if category of metadata value specifies the character should be removed. + */ +__device__ constexpr bool should_remove_cp(uint32_t metadata, bool lower_case) +{ + auto const cat = extract_token_cat(metadata); + return (cat == TOKEN_CAT_REMOVE_CHAR) || (lower_case && (cat == TOKEN_CAT_REMOVE_CHAR_IF_LOWER)); +} + +/** + * @brief Return true if category of metadata value specifies the character should be padded. + */ +__device__ constexpr bool should_add_spaces(uint32_t metadata, bool lower_case) +{ + auto const cat = extract_token_cat(metadata); + return (cat == TOKEN_CAT_ADD_SPACE) || (lower_case && (cat == TOKEN_CAT_ADD_SPACE_IF_LOWER)); +} + +/** + * @brief Return true if category of metadata value specifies the character should be replaced. + */ +__device__ constexpr bool always_replace(uint32_t metadata) +{ + return extract_token_cat(metadata) == TOKEN_CAT_ALWAYS_REPLACE; +} + +/** + * @brief Returns true if metadata value includes a multi-character transform bit equal to 1. + */ +__device__ constexpr bool is_multi_char_transform(uint32_t metadata) +{ + return (metadata >> MULTICHAR_SHIFT) & MULTICHAR_MASK; +} + +/** + * @brief Returns true if the byte passed in could be a valid head byte for + * a utf8 character. 
That is, not binary `10xxxxxx` + */ +__device__ constexpr bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } + +} // namespace detail +} // namespace nvtext diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 7a39199011e..4c54409c41a 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "text/normalize.cuh" #include "text/subword/detail/data_normalizer.hpp" #include "text/subword/detail/tokenizer_utils.cuh" @@ -38,81 +39,6 @@ namespace nvtext { namespace detail { namespace { -/** - * @brief Bit used to filter out invalid code points. - * - * When normalizing characters to code point values, if this bit is set, - * the code point should be filtered out before returning from the normalizer. - */ -constexpr uint32_t FILTER_BIT = 22; - -/** - * @brief Retrieve new code point from metadata value. - * - * @param metadata Value from the codepoint_metadata table. - * @return The replacement character if appropriate. - */ -__device__ uint32_t get_first_cp(uint32_t metadata) { return metadata & NEW_CP_MASK; } - -/** - * @brief Retrieve token category from the metadata value. - * - * Category values are 0-5: - * 0 - character should be padded - * 1 - pad character if lower-case - * 2 - character should be removed - * 3 - remove character if lower-case - * 4 - whitespace character -- always replace - * 5 - uncategorized - * - * @param metadata Value from the codepoint_metadata table. - * @return Category value. - */ -__device__ uint32_t extract_token_cat(uint32_t metadata) -{ - return (metadata >> TOKEN_CAT_SHIFT) & TOKEN_CAT_MASK; -} - -/** - * @brief Return true if category of metadata value specifies the character should be replaced. 
- */ -__device__ bool should_remove_cp(uint32_t metadata, bool lower_case) -{ - auto const cat = extract_token_cat(metadata); - return (cat == TOKEN_CAT_REMOVE_CHAR) || (lower_case && (cat == TOKEN_CAT_REMOVE_CHAR_IF_LOWER)); -} - -/** - * @brief Return true if category of metadata value specifies the character should be padded. - */ -__device__ bool should_add_spaces(uint32_t metadata, bool lower_case) -{ - auto const cat = extract_token_cat(metadata); - return (cat == TOKEN_CAT_ADD_SPACE) || (lower_case && (cat == TOKEN_CAT_ADD_SPACE_IF_LOWER)); -} - -/** - * @brief Return true if category of metadata value specifies the character should be replaced. - */ -__device__ bool always_replace(uint32_t metadata) -{ - return extract_token_cat(metadata) == TOKEN_CAT_ALWAYS_REPLACE; -} - -/** - * @brief Returns true if metadata value includes a multi-character transform bit equal to 1. - */ -__device__ bool is_multi_char_transform(uint32_t metadata) -{ - return (metadata >> MULTICHAR_SHIFT) & MULTICHAR_MASK; -} - -/** - * @brief Returns true if the byte passed in could be a valid head byte for - * a utf8 character. That is, not binary `10xxxxxx` - */ -__device__ bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } - /** * @brief Converts a UTF-8 character into a unicode code point value. * diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 2515cc917fa..d8f87f78278 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -98,22 +99,53 @@ TEST_F(TextNormalizeTest, SomeNullStrings) TEST_F(TextNormalizeTest, NormalizeCharacters) { // These include punctuation, accents, whitespace, and CJK characters - std::vector h_strings{"abc£def", - nullptr, - "éè â îô\taeio", - "\tĂĆĖÑ Ü", - "ACEN U", - "P^NP", - "$41.07", - "[a,b]", - "丏丟", - ""}; - auto validity = - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); - cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); - cudf::strings_column_view strings_view(strings); + auto input = cudf::test::strings_column_wrapper( + {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + auto sv = cudf::strings_column_view(input); + { + auto results = nvtext::normalize_characters(sv, true); + cudf::test::strings_column_wrapper expected({"abc£def", + "", + "ee a io aeio", + " acen u", + "acen u", + "p ^ np", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + auto results = nvtext::normalize_characters(sv, false); + cudf::test::strings_column_wrapper expected({"abc£def", + "", + "éè â îô aeio", + " ĂĆĖÑ Ü", + "ACEN U", + "P ^ NP", + " $ 41 . 
07", + " [ a , b ] ", + " 丏 丟 ", + ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + +TEST_F(TextNormalizeTest, NormalizeCharactersNew) +{ + // These include punctuation, accents, whitespace, and CJK characters + auto input = cudf::test::strings_column_wrapper( + {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + auto sv = cudf::strings_column_view(input); { - auto results = nvtext::normalize_characters(strings_view, true); + auto normalizer = nvtext::create_character_normalizer(true); + auto results = nvtext::normalize_characters(sv, *normalizer); + cudf::test::print(results->view()); cudf::test::strings_column_wrapper expected({"abc£def", "", "ee a io aeio", @@ -124,11 +156,13 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) " [ a , b ] ", " 丏 丟 ", ""}, - validity); + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = nvtext::normalize_characters(strings_view, false); + auto normalizer = nvtext::create_character_normalizer(false); + auto results = nvtext::normalize_characters(sv, *normalizer); + cudf::test::print(results->view()); cudf::test::strings_column_wrapper expected({"abc£def", "", "éè â îô aeio", @@ -139,7 +173,7 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) " [ a , b ] ", " 丏 丟 ", ""}, - validity); + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } From 2dd819a33cb81b4d34fd44870f2efe87b046550c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 27 Jan 2025 15:37:58 -0500 Subject: [PATCH 02/21] add special-tokens column --- cpp/include/nvtext/normalize.hpp | 16 +-- cpp/src/text/normalize.cu | 178 ++++++++++++++++++++++------- cpp/tests/text/normalize_tests.cpp | 45 +++++++- 3 files changed, 184 insertions(+), 55 deletions(-) diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 
10797164945..be1d28248de 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -130,6 +130,9 @@ std::unique_ptr normalize_characters( * and lower-casing cannot be performed without also removing accents. * However, if the accented character is already lower-case, then only the * accent is removed. + * + * If `special_tokens` are included the padding around the `[]` is not + * enforced if the character between them match one of the given tokens. */ struct character_normalizer { /** @@ -140,14 +143,13 @@ struct character_normalizer { * @param do_lower_case If true, upper-case characters are converted to * lower-case and accents are stripped from those characters. * If false, accented and upper-case characters are not transformed. - * @param allow_special_tokens If true, the following character sequences are not - * normalized if encountered in the input: - * `[BOS] [EOS] [UNK] [SEP] [PAD] [CLS] [MASK]` + * @param special_tokens Individual sequences including `[]` brackets. + * For example: `[BOS]`, `[EOS]`, `[UNK]`, `[SEP]`, `[PAD]`, `[CLS]`, `[MASK]` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory */ character_normalizer(bool do_lower_case, - bool allow_special_tokens = true, + cudf::strings_column_view const& special_tokens, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); ~character_normalizer(); @@ -167,16 +169,14 @@ struct character_normalizer { * @param do_lower_case If true, upper-case characters are converted to * lower-case and accents are stripped from those characters. * If false, accented and upper-case characters are not transformed. 
- * @param allow_special_tokens If true, the following character sequences are not - * normalized if encountered in the input: - * `[BOS] [EOS] [UNK] [SEP] [PAD] [CLS] [MASK]` + * @param special_tokens Individual sequences including `[]` brackets * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Object to be used with nvtext::tokenize_with_vocabulary */ std::unique_ptr create_character_normalizer( bool do_lower_case, - bool allow_special_tokens = true, + cudf::strings_column_view const& special_tokens, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 2d1ce290aec..63047edd376 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include #include #include @@ -279,68 +281,147 @@ struct character_normalizer::character_normalizer_impl { rmm::device_uvector cp_metadata; rmm::device_uvector aux_table; bool do_lower_case; - bool special_tokens; + std::unique_ptr special_tokens; + rmm::device_uvector special_tokens_view; + cudf::size_type min_token_width; + cudf::size_type max_token_width; + + cudf::device_span get_special_tokens() const + { + return special_tokens_view; + } character_normalizer_impl(rmm::device_uvector&& cp_metadata, rmm::device_uvector&& aux_table, bool do_lower_case, - bool special_tokens) + std::unique_ptr&& special_tokens, + rmm::device_uvector&& special_tokens_view, + cudf::size_type min_token_width, + cudf::size_type max_token_width) : cp_metadata(std::move(cp_metadata)), aux_table(std::move(aux_table)), do_lower_case{do_lower_case}, - special_tokens{special_tokens} + special_tokens{std::move(special_tokens)}, + special_tokens_view{std::move(special_tokens_view)}, + 
min_token_width{min_token_width}, + max_token_width{max_token_width} { } }; +// lambdas not allowed in constructors +struct size_bytes_fn { + cudf::string_view const* d_tokens; + __device__ cudf::size_type operator()(cudf::size_type idx) { return d_tokens[idx].size_bytes(); } +}; + character_normalizer::character_normalizer(bool do_lower_case, - bool special_tokens, + cudf::strings_column_view const& special_tokens, rmm::cuda_stream_view stream, rmm::device_async_resource_ref) { auto cp_metadata = nvtext::detail::get_codepoint_metadata(stream); auto aux_table = nvtext::detail::get_aux_codepoint_data(stream); + CUDF_EXPECTS( + !special_tokens.has_nulls(), "special tokens should not have nulls", std::invalid_argument); - _impl = new character_normalizer_impl( - std::move(cp_metadata), std::move(aux_table), do_lower_case, special_tokens); + auto sorted = std::move( + cudf::sort(cudf::table_view({special_tokens.parent()}), {}, {}, stream)->release().front()); + if (do_lower_case) { + sorted = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); + } + auto tokens_view = cudf::strings::detail::create_string_vector_from_column( + cudf::strings_column_view(sorted->view()), stream, cudf::get_current_device_resource_ref()); + auto const begin = + cudf::detail::make_counting_transform_iterator(0, size_bytes_fn{tokens_view.data()}); + auto const end = begin + tokens_view.size(); + auto const min_width = thrust::reduce(rmm::exec_policy(stream), + begin, + end, + std::numeric_limits::max(), + thrust::minimum{}); + auto const max_width = + thrust::reduce(rmm::exec_policy(stream), begin, end, 0, thrust::maximum{}); + printf("%ld special tokens, min=%d, max=%d\n", tokens_view.size(), min_width, max_width); + CUDF_EXPECTS(tokens_view.is_empty() || (min_width > 2 && max_width <= 6), + "Expected special tokens to be between 3 and 6 bytes including [] brackets", + std::invalid_argument); + + _impl = new character_normalizer_impl(std::move(cp_metadata), + 
std::move(aux_table), + do_lower_case, + std::move(sorted), + std::move(tokens_view), + min_width, + max_width); } character_normalizer::~character_normalizer() { delete _impl; } -std::unique_ptr create_character_normalizer(bool do_lower_case, - bool allow_special_tokens, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr create_character_normalizer( + bool do_lower_case, + cudf::strings_column_view const& special_tokens, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return std::make_unique(do_lower_case, allow_special_tokens, stream, mr); + return std::make_unique(do_lower_case, special_tokens, stream, mr); } namespace detail { namespace { + +__device__ bool check_special_token(cudf::device_span special_tokens, + char const* d_chars, + int64_t idx, + int64_t total_bytes) +{ + char const ch = d_chars[idx]; + auto begin = (ch == '[' || idx <= 6) ? d_chars + idx : d_chars + idx - 6L; + auto end = ch == ']' ? d_chars + idx + 1 : d_chars + idx + std::min(6L, total_bytes - idx); + char const mch = ch == '[' ? 
']' : '['; + // printf("(%p,%ld),%c,(%p,%p)\n", d_chars, total_bytes, ch, begin, end); + auto fnd = thrust::find(thrust::seq, begin, end, mch); + if (fnd == end) { return false; } + if (mch == ']') { + end = fnd + 1; // include ']' + } else { + begin = fnd; + } + auto const size = static_cast(thrust::distance(begin, end)); + auto const token = cudf::string_view(begin, size); + char p[8] = {0}; + memcpy(p, begin, min(size, 7)); + p[min(size, 7)] = 0; + auto rtn = thrust::find(thrust::seq, special_tokens.begin(), special_tokens.end(), token) != + special_tokens.end(); + printf("(%p,%d)=%s: %c/%d\n", begin, size, p, ch, int(rtn)); + return rtn; +} + CUDF_KERNEL void normalizer_kernel(char const* d_chars, int64_t total_bytes, codepoint_metadata_type const* cp_metadata, aux_codepoint_data_type const* aux_table, bool do_lower_case, - bool, // allow_special_tokens, + cudf::device_span special_tokens, uint32_t* d_output, int8_t* chars_per_thread) { uint32_t replacement[MAX_NEW_CHARS] = {0}; auto const idx = cudf::detail::grid_1d::global_thread_id(); - int8_t num_new_chars = 0; + int8_t num_new_bytes = 0; if ((idx < total_bytes) && cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { auto const cp = [utf8 = d_chars + idx] { cudf::char_utf8 ch_utf8; - auto const ch_size = cudf::strings::detail::to_char_utf8(utf8, ch_utf8); + cudf::strings::detail::to_char_utf8(utf8, ch_utf8); return cudf::strings::detail::utf8_to_codepoint(ch_utf8); }(); auto const metadata = cp_metadata[cp]; if (!should_remove_cp(metadata, do_lower_case)) { - num_new_chars = 1; + int8_t num_new_chars = 1; // Apply lower cases and accent stripping if necessary auto const new_cp = do_lower_case || always_replace(metadata) ? get_first_cp(metadata) : cp; replacement[0] = new_cp == 0 ? cp : new_cp; @@ -352,41 +433,48 @@ CUDF_KERNEL void normalizer_kernel(char const* d_chars, num_new_chars = 2 + (replacement[2] != 0); } - // check for possible special tokens here before checking add-spaces? 
- if (should_add_spaces(metadata, do_lower_case) && (num_new_chars == 1)) { - // Need to shift all existing code-points up one. - // This is a rotate right. There is no thrust equivalent at this time. - // for (int loc = num_new_chars; loc > 0; --loc) { - // replacement[loc] = replacement[loc - 1]; - //} - // Write the required spaces at each end + // check for possible special tokens here + auto const ch = d_chars[idx]; + auto sp_found = !special_tokens.empty() && (ch == '[' || ch == ']') && + check_special_token(special_tokens, d_chars, idx, total_bytes); + + // write the required spaces at each end replacement[1] = replacement[0]; - replacement[0] = SPACE_CODE_POINT; - replacement[2] = SPACE_CODE_POINT; + replacement[0] = sp_found && ch == ']' ? 0 : SPACE_CODE_POINT; + replacement[2] = sp_found && ch == '[' ? 0 : SPACE_CODE_POINT; num_new_chars = 3; } // convert back to UTF-8 for (int k = 0; k < num_new_chars; ++k) { auto const new_cp = replacement[k]; - if (new_cp) { cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); } + if (new_cp) { + num_new_bytes += cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); + // printf("%ld: k=%d, ncp=0x%04x, u8=0x%04x, bytes=%d\n", + // idx, + // k, + // new_cp, + // replacement[k], + // num_new_bytes); + } } } } - if (idx < total_bytes) { chars_per_thread[idx] = num_new_chars; } + if (idx < total_bytes) { + chars_per_thread[idx] = num_new_bytes; + // printf("%ld: nb=%d\n", idx, (int)num_new_bytes); - using BlockStore = - cub::BlockStore; - __shared__ typename BlockStore::TempStorage temp_storage; - - // Now we perform coalesced writes back to global memory using cub. 
- auto output_offset = blockIdx.x * blockDim.x * MAX_NEW_CHARS; - auto block_base = d_output + output_offset; - auto valid_items = - min(static_cast(total_bytes - output_offset), static_cast(blockDim.x)); - BlockStore(temp_storage).Store(block_base, replacement, valid_items); + d_output += idx * MAX_NEW_CHARS; +#pragma unroll + for (int k = 0; k < MAX_NEW_CHARS; ++k) { + *d_output++ = replacement[k]; + } + // printf( + // "%ld: r0=0x%04x,r1=0x%04x,r2=0x%04x\n", idx, replacement[0], replacement[1], + // replacement[2]); + } } template @@ -452,15 +540,17 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con cudf::detail::grid_1d grid{chars_size, block_size}; auto const max_new_char_total = MAX_NEW_CHARS * chars_size; + auto const parameters = normalizer._impl; + auto d_code_points = rmm::device_uvector(max_new_char_total, stream); auto d_sizes = rmm::device_uvector(chars_size, stream); normalizer_kernel<<>>( d_input_chars, chars_size, - normalizer._impl->cp_metadata.data(), - normalizer._impl->aux_table.data(), - normalizer._impl->do_lower_case, - normalizer._impl->special_tokens, + parameters->cp_metadata.data(), + parameters->aux_table.data(), + parameters->do_lower_case, + parameters->get_special_tokens(), d_code_points.data(), d_sizes.data()); @@ -475,10 +565,16 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( output_sizes.begin(), output_sizes.end(), stream, mr); + // printf("total_size=%ld\n", total_size); + // create output chars and use remove-copy(0) on d_code_points rmm::device_uvector chars(total_size, stream, mr); auto begin = reinterpret_cast(d_code_points.begin()); auto end = reinterpret_cast(d_code_points.end()); + // thrust::for_each_n(rmm::exec_policy(stream), + // thrust::make_counting_iterator(0), + // d_code_points.size(), + // [begin] __device__(auto idx) { printf("0x%02x\n", (int)begin[idx]); }); 
thrust::remove_copy(rmm::exec_policy_nosync(stream), begin, end, chars.data(), 0); return cudf::make_strings_column(input.size(), diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index d8f87f78278..d774a357fa4 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -138,12 +138,23 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) TEST_F(TextNormalizeTest, NormalizeCharactersNew) { // These include punctuation, accents, whitespace, and CJK characters - auto input = cudf::test::strings_column_wrapper( - {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); - auto sv = cudf::strings_column_view(input); + auto input = cudf::test::strings_column_wrapper({"abc£def", // 0 + "", // 8 + "éè â îô\taeio", // 25 + "\tĂĆĖÑ Ü", // + "ACEN U", + "P^NP", + "$41.07", + "[a,b]", + "丏丟", + ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + + auto sv = cudf::strings_column_view(input); + auto special_tokens = cudf::test::strings_column_wrapper(); + auto stv = cudf::strings_column_view(special_tokens); { - auto normalizer = nvtext::create_character_normalizer(true); + auto normalizer = nvtext::create_character_normalizer(true, stv); auto results = nvtext::normalize_characters(sv, *normalizer); cudf::test::print(results->view()); cudf::test::strings_column_wrapper expected({"abc£def", @@ -160,7 +171,7 @@ TEST_F(TextNormalizeTest, NormalizeCharactersNew) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto normalizer = nvtext::create_character_normalizer(false); + auto normalizer = nvtext::create_character_normalizer(false, stv); auto results = nvtext::normalize_characters(sv, *normalizer); cudf::test::print(results->view()); cudf::test::strings_column_wrapper expected({"abc£def", @@ -178,6 +189,28 @@ TEST_F(TextNormalizeTest, NormalizeCharactersNew) } } +TEST_F(TextNormalizeTest, NormalizeCharactersNewSpecialTokens) +{ + // These include 
punctuation, accents, whitespace, and CJK characters + auto input = + cudf::test::strings_column_wrapper({"[BOS]Some strings with [PAD] special[SEP]tokens[EOS]", + "[bos]these should[sep]work too[eos]"}); + + auto sv = cudf::strings_column_view(input); + auto special_tokens = cudf::test::strings_column_wrapper({"[BOS]", "[EOS]", "[SEP]", "[PAD]"}); + auto stv = cudf::strings_column_view(special_tokens); + { + auto normalizer = nvtext::create_character_normalizer(true, stv); + auto results = nvtext::normalize_characters(sv, *normalizer); + cudf::test::print(results->view()); + } + { + auto normalizer = nvtext::create_character_normalizer(false, stv); + auto results = nvtext::normalize_characters(sv, *normalizer); + cudf::test::print(results->view()); + } +} + TEST_F(TextNormalizeTest, NormalizeSlicedColumn) { cudf::test::strings_column_wrapper strings( From a29a40514c2a066239dba54a0602cb6396065b44 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 27 Jan 2025 18:55:43 -0500 Subject: [PATCH 03/21] add remove_copy_safe --- cpp/benchmarks/text/normalize.cpp | 10 ++- cpp/src/text/normalize.cu | 75 +++++++++--------- cpp/tests/text/normalize_tests.cpp | 120 +++++++++++++++++------------ 3 files changed, 117 insertions(+), 88 deletions(-) diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 594dc0de28a..91738a83adb 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -48,8 +49,11 @@ static void bench_normalize(nvbench::state& state) [&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); }); } else { bool const to_lower = (normalize_type == "to_lower"); + auto spt = cudf::strings_column_view(cudf::make_empty_column(cudf::type_id::STRING)->view()); + auto normalizer = nvtext::create_character_normalizer(to_lower, spt); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = nvtext::normalize_characters(input, to_lower); + // auto result = nvtext::normalize_characters(input, to_lower); + auto result = nvtext::normalize_characters(input, *normalizer); }); } } @@ -57,6 +61,6 @@ static void bench_normalize(nvbench::state& state) NVBENCH_BENCH(bench_normalize) .set_name("normalize") .add_int64_axis("min_width", {0}) - .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("max_width", {128, 256}) .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"spaces", "characters", "to_lower"}); diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 63047edd376..24a0c5ea811 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -328,21 +329,19 @@ character_normalizer::character_normalizer(bool do_lower_case, auto sorted = std::move( cudf::sort(cudf::table_view({special_tokens.parent()}), {}, {}, stream)->release().front()); if (do_lower_case) { - sorted = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); + auto lower = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); + auto upper = cudf::strings::to_upper(cudf::strings_column_view(sorted->view()), stream); + sorted = cudf::concatenate(std::vector{lower->view(), upper->view()}, stream); } auto tokens_view = cudf::strings::detail::create_string_vector_from_column( 
cudf::strings_column_view(sorted->view()), stream, cudf::get_current_device_resource_ref()); auto const begin = cudf::detail::make_counting_transform_iterator(0, size_bytes_fn{tokens_view.data()}); - auto const end = begin + tokens_view.size(); - auto const min_width = thrust::reduce(rmm::exec_policy(stream), - begin, - end, - std::numeric_limits::max(), - thrust::minimum{}); - auto const max_width = - thrust::reduce(rmm::exec_policy(stream), begin, end, 0, thrust::maximum{}); - printf("%ld special tokens, min=%d, max=%d\n", tokens_view.size(), min_width, max_width); + auto const end = begin + tokens_view.size(); + auto const init_min = std::numeric_limits::max(); + auto const min_width = + thrust::reduce(rmm::exec_policy(stream), begin, end, init_min, thrust::minimum{}); + auto const max_width = thrust::reduce(rmm::exec_policy(stream), begin, end, 0, thrust::maximum{}); CUDF_EXPECTS(tokens_view.is_empty() || (min_width > 2 && max_width <= 6), "Expected special tokens to be between 3 and 6 bytes including [] brackets", std::invalid_argument); @@ -376,11 +375,10 @@ __device__ bool check_special_token(cudf::device_span s int64_t total_bytes) { char const ch = d_chars[idx]; - auto begin = (ch == '[' || idx <= 6) ? d_chars + idx : d_chars + idx - 6L; + auto begin = ch == '[' ? d_chars + idx : d_chars + std::max(0L, idx - 6); auto end = ch == ']' ? d_chars + idx + 1 : d_chars + idx + std::min(6L, total_bytes - idx); char const mch = ch == '[' ? 
']' : '['; - // printf("(%p,%ld),%c,(%p,%p)\n", d_chars, total_bytes, ch, begin, end); - auto fnd = thrust::find(thrust::seq, begin, end, mch); + auto fnd = thrust::find(thrust::seq, begin, end, mch); if (fnd == end) { return false; } if (mch == ']') { end = fnd + 1; // include ']' @@ -389,12 +387,8 @@ __device__ bool check_special_token(cudf::device_span s } auto const size = static_cast(thrust::distance(begin, end)); auto const token = cudf::string_view(begin, size); - char p[8] = {0}; - memcpy(p, begin, min(size, 7)); - p[min(size, 7)] = 0; auto rtn = thrust::find(thrust::seq, special_tokens.begin(), special_tokens.end(), token) != special_tokens.end(); - printf("(%p,%d)=%s: %c/%d\n", begin, size, p, ch, int(rtn)); return rtn; } @@ -451,12 +445,6 @@ CUDF_KERNEL void normalizer_kernel(char const* d_chars, auto const new_cp = replacement[k]; if (new_cp) { num_new_bytes += cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); - // printf("%ld: k=%d, ncp=0x%04x, u8=0x%04x, bytes=%d\n", - // idx, - // k, - // new_cp, - // replacement[k], - // num_new_bytes); } } } @@ -464,16 +452,11 @@ CUDF_KERNEL void normalizer_kernel(char const* d_chars, if (idx < total_bytes) { chars_per_thread[idx] = num_new_bytes; - // printf("%ld: nb=%d\n", idx, (int)num_new_bytes); - d_output += idx * MAX_NEW_CHARS; #pragma unroll for (int k = 0; k < MAX_NEW_CHARS; ++k) { *d_output++ = replacement[k]; } - // printf( - // "%ld: r0=0x%04x,r1=0x%04x,r2=0x%04x\n", idx, replacement[0], replacement[1], - // replacement[2]); } } @@ -516,6 +499,27 @@ rmm::device_uvector compute_sizes(int8_t const* sizes, return output_sizes; } + +template +OutputIterator remove_copy_safe(InputIterator first, + InputIterator last, + OutputIterator result, + T const& value, + rmm::cuda_stream_view stream) +{ + auto const copy_size = std::min(static_cast(std::distance(first, last)), + static_cast(std::numeric_limits::max())); + + auto itr = first; + while (itr != last) { + auto const copy_end = + 
static_cast(std::distance(itr, last)) <= copy_size ? last : itr + copy_size; + result = thrust::remove_copy(rmm::exec_policy(stream), itr, copy_end, result, value); + itr = copy_end; + } + return result; +} + } // namespace std::unique_ptr normalize_characters(cudf::strings_column_view const& input, character_normalizer const& normalizer, @@ -544,6 +548,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto d_code_points = rmm::device_uvector(max_new_char_total, stream); auto d_sizes = rmm::device_uvector(chars_size, stream); + nvtxRangePushA("normalizer_kernel"); normalizer_kernel<<>>( d_input_chars, chars_size, @@ -553,6 +558,8 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con parameters->get_special_tokens(), d_code_points.data(), d_sizes.data()); + stream.synchronize(); + nvtxRangePop(); auto const input_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); @@ -565,17 +572,15 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( output_sizes.begin(), output_sizes.end(), stream, mr); - // printf("total_size=%ld\n", total_size); - // create output chars and use remove-copy(0) on d_code_points rmm::device_uvector chars(total_size, stream, mr); auto begin = reinterpret_cast(d_code_points.begin()); auto end = reinterpret_cast(d_code_points.end()); - // thrust::for_each_n(rmm::exec_policy(stream), - // thrust::make_counting_iterator(0), - // d_code_points.size(), - // [begin] __device__(auto idx) { printf("0x%02x\n", (int)begin[idx]); }); - thrust::remove_copy(rmm::exec_policy_nosync(stream), begin, end, chars.data(), 0); + nvtxRangePushA("remove_copy"); + // thrust::remove_copy(rmm::exec_policy_nosync(stream), begin, end, chars.data(), 0); + remove_copy_safe(begin, end, chars.data(), 0, stream); + stream.synchronize(); + nvtxRangePop(); return cudf::make_strings_column(input.size(), 
std::move(offsets), diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index d774a357fa4..3f27b5a4eb2 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -135,46 +135,37 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) } } -TEST_F(TextNormalizeTest, NormalizeCharactersNew) +TEST_F(TextNormalizeTest, WithNormalizer) { // These include punctuation, accents, whitespace, and CJK characters - auto input = cudf::test::strings_column_wrapper({"abc£def", // 0 - "", // 8 - "éè â îô\taeio", // 25 - "\tĂĆĖÑ Ü", // - "ACEN U", - "P^NP", - "$41.07", - "[a,b]", - "丏丟", - ""}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + auto input = cudf::test::strings_column_wrapper( + {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); - auto sv = cudf::strings_column_view(input); + auto const sv = cudf::strings_column_view(input); auto special_tokens = cudf::test::strings_column_wrapper(); - auto stv = cudf::strings_column_view(special_tokens); - { - auto normalizer = nvtext::create_character_normalizer(true, stv); - auto results = nvtext::normalize_characters(sv, *normalizer); - cudf::test::print(results->view()); - cudf::test::strings_column_wrapper expected({"abc£def", - "", - "ee a io aeio", - " acen u", - "acen u", - "p ^ np", - " $ 41 . 
07", - " [ a , b ] ", - " 丏 丟 ", - ""}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - } - { - auto normalizer = nvtext::create_character_normalizer(false, stv); - auto results = nvtext::normalize_characters(sv, *normalizer); - cudf::test::print(results->view()); - cudf::test::strings_column_wrapper expected({"abc£def", + auto const stv = cudf::strings_column_view(special_tokens); + + auto normalizer = nvtext::create_character_normalizer(true, stv); + auto results = nvtext::normalize_characters(sv, *normalizer); + // cudf::test::print(results->view()); + auto expected = cudf::test::strings_column_wrapper({"abc£def", + "", + "ee a io aeio", + " acen u", + "acen u", + "p ^ np", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false, stv); + results = nvtext::normalize_characters(sv, *normalizer); + // cudf::test::print(results->view()); + expected = cudf::test::strings_column_wrapper({"abc£def", "", "éè â îô aeio", " ĂĆĖÑ Ü", @@ -185,30 +176,37 @@ TEST_F(TextNormalizeTest, NormalizeCharactersNew) " 丏 丟 ", ""}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - } + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(TextNormalizeTest, NormalizeCharactersNewSpecialTokens) +TEST_F(TextNormalizeTest, SpecialTokens) { // These include punctuation, accents, whitespace, and CJK characters auto input = cudf::test::strings_column_wrapper({"[BOS]Some strings with [PAD] special[SEP]tokens[EOS]", - "[bos]these should[sep]work too[eos]"}); + "[bos]these should[sep]work too[eos]", + "some[non]tokens[eol]too"}); auto sv = cudf::strings_column_view(input); auto special_tokens = cudf::test::strings_column_wrapper({"[BOS]", "[EOS]", "[SEP]", "[PAD]"}); auto stv = cudf::strings_column_view(special_tokens); - { - auto normalizer = 
nvtext::create_character_normalizer(true, stv); - auto results = nvtext::normalize_characters(sv, *normalizer); - cudf::test::print(results->view()); - } - { - auto normalizer = nvtext::create_character_normalizer(false, stv); - auto results = nvtext::normalize_characters(sv, *normalizer); - cudf::test::print(results->view()); - } + auto normalizer = nvtext::create_character_normalizer(true, stv); + auto results = nvtext::normalize_characters(sv, *normalizer); + // cudf::test::print(results->view()); + auto expected = cudf::test::strings_column_wrapper( + {" [bos] some strings with [pad] special [sep] tokens [eos] ", + " [bos] these should [sep] work too [eos] ", + "some [ non ] tokens [ eol ] too"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false, stv); + results = nvtext::normalize_characters(sv, *normalizer); + // cudf::test::print(results->view()); + expected = cudf::test::strings_column_wrapper( + {" [BOS] Some strings with [PAD] special [SEP] tokens [EOS] ", + " [ bos ] these should [ sep ] work too [ eos ] ", + "some [ non ] tokens [ eol ] too"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(TextNormalizeTest, NormalizeSlicedColumn) @@ -225,3 +223,25 @@ TEST_F(TextNormalizeTest, NormalizeSlicedColumn) cudf::test::strings_column_wrapper expected2({" $ 41 . 
07", " [ a , b ] ", " 丏 丟 "}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); } + +TEST_F(TextNormalizeTest, SlicedColumn) +{ + auto input = cudf::test::strings_column_wrapper( + {"abc£def", "éè â îô\taeio", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟"}); + + auto special_tokens = cudf::test::strings_column_wrapper(); + auto stv = cudf::strings_column_view(special_tokens); + + std::vector sliced = cudf::split(input, {4}); + auto normalizer = nvtext::create_character_normalizer(true, stv); + auto results = + nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer); + auto expected = + cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false, stv); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), *normalizer); + expected = cudf::test::strings_column_wrapper({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} From 5b21c0a226b2ca8ff30ef5c1abf2585b1314e6d3 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 28 Jan 2025 13:34:23 -0500 Subject: [PATCH 04/21] add special_tokens_kernel --- cpp/src/text/normalize.cu | 138 ++++++++++++----------------- cpp/tests/text/normalize_tests.cpp | 1 - 2 files changed, 56 insertions(+), 83 deletions(-) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 24a0c5ea811..d64b26ac599 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -113,29 +113,6 @@ __device__ int8_t cp_to_utf8(uint32_t codepoint, char* out) { auto utf8 = cudf::strings::detail::codepoint_to_utf8(codepoint); return cudf::strings::detail::from_char_utf8(utf8, out); -#if 0 - auto out_ptr = out; - if (codepoint < UTF8_1BYTE) // ASCII range - *out_ptr++ = static_cast(codepoint); - else if (codepoint < UTF8_2BYTE) { // create two-byte UTF-8 - // b00001xxx:byyyyyyyy => b110xxxyy:b10yyyyyy - 
*out_ptr++ = static_cast((((codepoint << 2) & 0x00'1F00) | 0x00'C000) >> 8); - *out_ptr++ = static_cast((codepoint & 0x3F) | 0x0080); - } else if (codepoint < UTF8_3BYTE) { // create three-byte UTF-8 - // bxxxxxxxx:byyyyyyyy => b1110xxxx:b10xxxxyy:b10yyyyyy - *out_ptr++ = static_cast((((codepoint << 4) & 0x0F'0000) | 0x00E0'0000) >> 16); - *out_ptr++ = static_cast((((codepoint << 2) & 0x00'3F00) | 0x00'8000) >> 8); - *out_ptr++ = static_cast((codepoint & 0x3F) | 0x0080); - } else { // create four-byte UTF-8 - // maximum code-point value is 0x0011'0000 - // b000xxxxx:byyyyyyyy:bzzzzzzzz => b11110xxx:b10xxyyyy:b10yyyyzz:b10zzzzzz - *out_ptr++ = static_cast((((codepoint << 6) & 0x0700'0000u) | 0xF000'0000u) >> 24); - *out_ptr++ = static_cast((((codepoint << 4) & 0x003F'0000u) | 0x0080'0000u) >> 16); - *out_ptr++ = static_cast((((codepoint << 2) & 0x00'3F00u) | 0x00'8000u) >> 8); - *out_ptr++ = static_cast((codepoint & 0x3F) | 0x0080); - } - return static_cast(thrust::distance(out, out_ptr)); -#endif } /** @@ -284,8 +261,6 @@ struct character_normalizer::character_normalizer_impl { bool do_lower_case; std::unique_ptr special_tokens; rmm::device_uvector special_tokens_view; - cudf::size_type min_token_width; - cudf::size_type max_token_width; cudf::device_span get_special_tokens() const { @@ -296,16 +271,12 @@ struct character_normalizer::character_normalizer_impl { rmm::device_uvector&& aux_table, bool do_lower_case, std::unique_ptr&& special_tokens, - rmm::device_uvector&& special_tokens_view, - cudf::size_type min_token_width, - cudf::size_type max_token_width) + rmm::device_uvector&& special_tokens_view) : cp_metadata(std::move(cp_metadata)), aux_table(std::move(aux_table)), do_lower_case{do_lower_case}, special_tokens{std::move(special_tokens)}, - special_tokens_view{std::move(special_tokens_view)}, - min_token_width{min_token_width}, - max_token_width{max_token_width} + special_tokens_view{std::move(special_tokens_view)} { } }; @@ -329,31 +300,20 @@ 
character_normalizer::character_normalizer(bool do_lower_case, auto sorted = std::move( cudf::sort(cudf::table_view({special_tokens.parent()}), {}, {}, stream)->release().front()); if (do_lower_case) { - auto lower = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); - auto upper = cudf::strings::to_upper(cudf::strings_column_view(sorted->view()), stream); - sorted = cudf::concatenate(std::vector{lower->view(), upper->view()}, stream); + // lower case the tokens so they will match the normalized input + sorted = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); } + auto tokens_view = cudf::strings::detail::create_string_vector_from_column( cudf::strings_column_view(sorted->view()), stream, cudf::get_current_device_resource_ref()); - auto const begin = - cudf::detail::make_counting_transform_iterator(0, size_bytes_fn{tokens_view.data()}); - auto const end = begin + tokens_view.size(); - auto const init_min = std::numeric_limits::max(); - auto const min_width = - thrust::reduce(rmm::exec_policy(stream), begin, end, init_min, thrust::minimum{}); - auto const max_width = thrust::reduce(rmm::exec_policy(stream), begin, end, 0, thrust::maximum{}); - CUDF_EXPECTS(tokens_view.is_empty() || (min_width > 2 && max_width <= 6), - "Expected special tokens to be between 3 and 6 bytes including [] brackets", - std::invalid_argument); _impl = new character_normalizer_impl(std::move(cp_metadata), std::move(aux_table), do_lower_case, std::move(sorted), - std::move(tokens_view), - min_width, - max_width); + std::move(tokens_view)); } + character_normalizer::~character_normalizer() { delete _impl; } std::unique_ptr create_character_normalizer( @@ -369,27 +329,40 @@ std::unique_ptr create_character_normalizer( namespace detail { namespace { -__device__ bool check_special_token(cudf::device_span special_tokens, - char const* d_chars, - int64_t idx, - int64_t total_bytes) +CUDF_KERNEL void special_tokens_kernel(uint32_t* d_utf8chars, + 
int64_t total_count, + cudf::device_span special_tokens, + int8_t* chars_per_thread) { - char const ch = d_chars[idx]; - auto begin = ch == '[' ? d_chars + idx : d_chars + std::max(0L, idx - 6); - auto end = ch == ']' ? d_chars + idx + 1 : d_chars + idx + std::min(6L, total_bytes - idx); - char const mch = ch == '[' ? ']' : '['; - auto fnd = thrust::find(thrust::seq, begin, end, mch); - if (fnd == end) { return false; } - if (mch == ']') { - end = fnd + 1; // include ']' - } else { - begin = fnd; + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= total_count) { return; } + auto const begin = d_utf8chars + (idx * MAX_NEW_CHARS) + 1; + if (*begin != '[') { return; } + auto const end = begin + std::min(6L, total_count - idx) * MAX_NEW_CHARS; + auto const match = thrust::find(thrust::seq, begin, end, static_cast(']')); + if (match == end) { return; } + char candidate[8]; + auto itr = thrust::transform_iterator(begin, [](auto v) { return static_cast(v); }); + auto eitr = itr + thrust::distance(begin, match + 1); + auto last = + thrust::copy_if(thrust::seq, itr, eitr, candidate, [](auto c) { return c != 0 && c != ' '; }); + *last = 0; // only needed for debug + + auto const size = static_cast(thrust::distance(candidate, last)); + auto const token = cudf::string_view(candidate, size); + if (thrust::find(thrust::seq, special_tokens.begin(), special_tokens.end(), token) == + special_tokens.end()) { + return; } - auto const size = static_cast(thrust::distance(begin, end)); - auto const token = cudf::string_view(begin, size); - auto rtn = thrust::find(thrust::seq, special_tokens.begin(), special_tokens.end(), token) != - special_tokens.end(); - return rtn; + + // fix up chars to remove the extra spaces + *(begin + 1) = 0; + *(match - 1) = 0; + + auto const match_idx = idx + (thrust::distance(begin, match) / MAX_NEW_CHARS); + + chars_per_thread[idx] = 2; // leading space plus '[' + chars_per_thread[match_idx] = 2; // ']' plus trailing space } 
CUDF_KERNEL void normalizer_kernel(char const* d_chars, @@ -397,7 +370,6 @@ CUDF_KERNEL void normalizer_kernel(char const* d_chars, codepoint_metadata_type const* cp_metadata, aux_codepoint_data_type const* aux_table, bool do_lower_case, - cudf::device_span special_tokens, uint32_t* d_output, int8_t* chars_per_thread) { @@ -428,15 +400,10 @@ CUDF_KERNEL void normalizer_kernel(char const* d_chars, } if (should_add_spaces(metadata, do_lower_case) && (num_new_chars == 1)) { - // check for possible special tokens here - auto const ch = d_chars[idx]; - auto sp_found = !special_tokens.empty() && (ch == '[' || ch == ']') && - check_special_token(special_tokens, d_chars, idx, total_bytes); - // write the required spaces at each end replacement[1] = replacement[0]; - replacement[0] = sp_found && ch == ']' ? 0 : SPACE_CODE_POINT; - replacement[2] = sp_found && ch == '[' ? 0 : SPACE_CODE_POINT; + replacement[0] = SPACE_CODE_POINT; + replacement[2] = SPACE_CODE_POINT; num_new_chars = 3; } @@ -481,13 +448,9 @@ rmm::device_uvector compute_sizes(int8_t const* sizes, d_temp.data(), temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); } else { // offsets need to be normalized for segmented-reduce to work efficiently - auto d_offsets = rmm::device_uvector(size + 1, stream); - thrust::transform(rmm::exec_policy_nosync(stream), - offsets, - offsets + size + 1, - d_offsets.begin(), - [offset] __device__(auto o) { return o - offset; }); - auto const offsets_itr = d_offsets.begin(); + auto offsets_itr = thrust::transform_iterator( + offsets, + cuda::proclaim_return_type([offset] __device__(auto o) { return o - offset; })); cub::DeviceSegmentedReduce::Sum( nullptr, temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value()); auto d_temp = rmm::device_buffer{temp, stream}; @@ -555,12 +518,23 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con parameters->cp_metadata.data(), parameters->aux_table.data(), parameters->do_lower_case, - 
parameters->get_special_tokens(), d_code_points.data(), d_sizes.data()); stream.synchronize(); nvtxRangePop(); + // This removes space adding around any special tokens in the form of [ttt]. + // An alternate approach is to do a multi-replace of '[ ttt ]' with '[ttt]' right + // before returning the output strings column. + auto const special_tokens = parameters->get_special_tokens(); + if (!special_tokens.empty()) { + nvtxRangePushA("special_tokens_kernel"); + special_tokens_kernel<<>>( + d_code_points.data(), chars_size, special_tokens, d_sizes.data()); + stream.synchronize(); + nvtxRangePop(); + } + auto const input_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 3f27b5a4eb2..67c6ab79eb9 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -198,7 +198,6 @@ TEST_F(TextNormalizeTest, SpecialTokens) " [bos] these should [sep] work too [eos] ", "some [ non ] tokens [ eol ] too"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - normalizer = nvtext::create_character_normalizer(false, stv); results = nvtext::normalize_characters(sv, *normalizer); // cudf::test::print(results->view()); From e37bf3f15bbe5e28217b7adcbc396af5cab150dd Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 28 Jan 2025 19:32:59 -0500 Subject: [PATCH 05/21] add python and pylibcudf interfaces --- cpp/benchmarks/text/normalize.cpp | 4 +- cpp/include/nvtext/normalize.hpp | 8 ++- cpp/tests/text/normalize_tests.cpp | 15 ++-- python/cudf/cudf/core/character_normalizer.py | 45 ++++++++++++ python/cudf/cudf/core/column/string.py | 17 ++++- .../pylibcudf/libcudf/nvtext/normalize.pxd | 15 +++- .../pylibcudf/pylibcudf/nvtext/normalize.pxd | 13 +++- .../pylibcudf/pylibcudf/nvtext/normalize.pyi | 10 ++- .../pylibcudf/pylibcudf/nvtext/normalize.pyx | 70 ++++++++++++++++--- 9 files changed, 164 insertions(+), 33 
deletions(-) create mode 100644 python/cudf/cudf/core/character_normalizer.py diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 91738a83adb..405b099fa10 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -17,7 +17,6 @@ #include #include -#include #include #include #include @@ -49,8 +48,7 @@ static void bench_normalize(nvbench::state& state) [&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); }); } else { bool const to_lower = (normalize_type == "to_lower"); - auto spt = cudf::strings_column_view(cudf::make_empty_column(cudf::type_id::STRING)->view()); - auto normalizer = nvtext::create_character_normalizer(to_lower, spt); + auto normalizer = nvtext::create_character_normalizer(to_lower); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { // auto result = nvtext::normalize_characters(input, to_lower); auto result = nvtext::normalize_characters(input, *normalizer); diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index be1d28248de..6d256fec231 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -176,9 +177,10 @@ struct character_normalizer { */ std::unique_ptr create_character_normalizer( bool do_lower_case, - cudf::strings_column_view const& special_tokens, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + cudf::strings_column_view const& special_tokens = cudf::strings_column_view(cudf::column_view{ + cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0}), + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** * @brief Normalizes strings characters for tokenizing diff --git a/cpp/tests/text/normalize_tests.cpp 
b/cpp/tests/text/normalize_tests.cpp index 67c6ab79eb9..37b6d51fcbd 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -142,11 +142,9 @@ TEST_F(TextNormalizeTest, WithNormalizer) {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); - auto const sv = cudf::strings_column_view(input); - auto special_tokens = cudf::test::strings_column_wrapper(); - auto const stv = cudf::strings_column_view(special_tokens); + auto const sv = cudf::strings_column_view(input); - auto normalizer = nvtext::create_character_normalizer(true, stv); + auto normalizer = nvtext::create_character_normalizer(true); auto results = nvtext::normalize_characters(sv, *normalizer); // cudf::test::print(results->view()); auto expected = cudf::test::strings_column_wrapper({"abc£def", @@ -162,7 +160,7 @@ TEST_F(TextNormalizeTest, WithNormalizer) {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - normalizer = nvtext::create_character_normalizer(false, stv); + normalizer = nvtext::create_character_normalizer(false); results = nvtext::normalize_characters(sv, *normalizer); // cudf::test::print(results->view()); expected = cudf::test::strings_column_wrapper({"abc£def", @@ -228,18 +226,15 @@ TEST_F(TextNormalizeTest, SlicedColumn) auto input = cudf::test::strings_column_wrapper( {"abc£def", "éè â îô\taeio", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟"}); - auto special_tokens = cudf::test::strings_column_wrapper(); - auto stv = cudf::strings_column_view(special_tokens); - std::vector sliced = cudf::split(input, {4}); - auto normalizer = nvtext::create_character_normalizer(true, stv); + auto normalizer = nvtext::create_character_normalizer(true); auto results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer); auto expected = cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - normalizer = nvtext::create_character_normalizer(false, stv); + normalizer = nvtext::create_character_normalizer(false); results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), *normalizer); expected = cudf::test::strings_column_wrapper({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); diff --git a/python/cudf/cudf/core/character_normalizer.py b/python/cudf/cudf/core/character_normalizer.py new file mode 100644 index 00000000000..ffb694d2f73 --- /dev/null +++ b/python/cudf/cudf/core/character_normalizer.py @@ -0,0 +1,45 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from __future__ import annotations + +import pylibcudf as plc + +import cudf + + +class CharacterNormalizer: + """ + A normalizer object used to normalize input text. + + Parameters + ---------- + do_lower_case : bool + Set to True if the normalizer should also lower-case + while normalizing. + special_tokens : cudf string series + Strings column of special tokens. + """ + + def __init__( + self, + d_lower_case: bool, + special_tokens: cudf.Series = cudf.Series([], dtype="object"), + ) -> None: + self.normalizer = plc.nvtext.normalize.CharacterNormalizer( + d_lower_case, special_tokens._column.to_pylibcudf(mode="read") + ) + + def normalize(self, text: cudf.Series) -> cudf.Series: + """ + Parameters + ---------- + text : cudf string series + The strings to be normalized. 
+ + Returns + ------- + Normalized strings + """ + result = text._column.normalize_characters(self.normalizer) + + return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 54b42b1f6de..fe35e400410 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4708,7 +4708,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: dtype: object """ return self._return_or_inplace( - self._column.normalize_characters(do_lower) + self._column.characters_normalize(do_lower) ) def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: @@ -6237,14 +6237,25 @@ def normalize_spaces(self) -> Self: ) @acquire_spill_lock() - def normalize_characters(self, do_lower: bool = True) -> Self: + def characters_normalize(self, do_lower: bool = True) -> Self: return Column.from_pylibcudf( # type: ignore[return-value] - plc.nvtext.normalize.normalize_characters( + plc.nvtext.normalize.characters_normalize( self.to_pylibcudf(mode="read"), do_lower, ) ) + @acquire_spill_lock() + def normalize_characters( + self, normalizer: plc.nvtext.normalize.CharacterNormalizer + ) -> Self: + return Column.from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_characters( + self.to_pylibcudf(mode="read"), + normalizer, + ) + ) + @acquire_spill_lock() def replace_tokens( self, targets: Self, replacements: Self, delimiter: plc.Scalar diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd index f8b082c8429..2cf2bfb8ac9 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -16,3 +16,16 @@ cdef extern from "nvtext/normalize.hpp" namespace "nvtext" nogil: const column_view & strings, bool do_lower_case ) except +libcudf_exception_handler + + cdef struct character_normalizer "nvtext::character_normalizer": + pass + + cdef unique_ptr[character_normalizer] create_character_normalizer( + bool do_lower_case, + const column_view & strings + ) except +libcudf_exception_handler + + cdef unique_ptr[column] normalize_characters( + const column_view & strings, + const character_normalizer & normalizer + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd index 90676145afa..e6688e19762 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd @@ -1,9 +1,18 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp cimport bool +from libcpp.memory cimport unique_ptr from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.normalize cimport character_normalizer +cdef class CharacterNormalizer: + cdef unique_ptr[character_normalizer] c_obj cpdef Column normalize_spaces(Column input) -cpdef Column normalize_characters(Column input, bool do_lower_case) +cpdef Column characters_normalize(Column input, bool do_lower_case) + +cpdef Column normalize_characters( + Column input, + CharacterNormalizer normalizer +) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi index 1d90a5a8960..d722ef6c79e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi @@ -1,6 +1,12 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from pylibcudf.column import Column +class CharacterNormalizer: + def __init__(self, do_lower_case: bool, special_tokens: Column): ... + def normalize_spaces(input: Column) -> Column: ... -def normalize_characters(input: Column, do_lower_case: bool) -> Column: ... +def characters_normalize(input: Column, do_lower_case: bool) -> Column: ... +def normalize_characters( + input: Column, normalizer: CharacterNormalizer +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx index b259ccaefa6..ada361c5e79 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -1,16 +1,36 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. +from cython.operator cimport dereference from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.nvtext.normalize cimport ( - normalize_characters as cpp_normalize_characters, - normalize_spaces as cpp_normalize_spaces, -) +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext cimport normalize as cpp_normalize -__all__ = ["normalize_characters", "normalize_spaces"] +__all__ = [ + "CharacterNormalizer" + "normalize_characters", + "normalize_spaces", + "characters_normalize"] + +cdef class CharacterNormalizer: + """The normalizer object to be used with ``normalize_characters``. + + For details, see :cpp:class:`cudf::nvtext::character_normalizer`. 
+ """ + def __cinit__(self, bool do_lower_case, Column tokens): + cdef column_view c_tokens = tokens.view() + with nogil: + self.c_obj = move( + cpp_normalize.create_character_normalizer( + do_lower_case, + c_tokens + ) + ) + + __hash__ = None cpdef Column normalize_spaces(Column input): """ @@ -32,12 +52,12 @@ cpdef Column normalize_spaces(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = cpp_normalize_spaces(input.view()) + c_result = cpp_normalize.normalize_spaces(input.view()) return Column.from_libcudf(move(c_result)) -cpdef Column normalize_characters(Column input, bool do_lower_case): +cpdef Column characters_normalize(Column input, bool do_lower_case): """ Normalizes strings characters for tokenizing. @@ -60,6 +80,38 @@ cpdef Column normalize_characters(Column input, bool do_lower_case): cdef unique_ptr[column] c_result with nogil: - c_result = cpp_normalize_characters(input.view(), do_lower_case) + c_result = cpp_normalize.normalize_characters( + input.view(), + do_lower_case + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column normalize_characters(Column input, CharacterNormalizer normalizer): + """ + Normalizes strings characters for tokenizing. 
+ + For details, see :cpp:func:`normalize_characters` + + Parameters + ---------- + input : Column + Input strings + normalizer : CharacterNormalizer + Normalizer object used for modifying the input column text + + Returns + ------- + Column + Normalized strings column + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_normalize.normalize_characters( + input.view(), + dereference(normalizer.c_obj.get()) + ) return Column.from_libcudf(move(c_result)) From ac4c5a8a4ef69949e55642af1a63cabe57fed6a6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 29 Jan 2025 16:39:51 -0500 Subject: [PATCH 06/21] add block-store --- cpp/src/text/normalize.cu | 64 +++++++++++++------ cpp/src/text/subword/data_normalizer.cu | 12 ++++ python/cudf/cudf/core/character_normalizer.py | 6 +- 3 files changed, 61 insertions(+), 21 deletions(-) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index d64b26ac599..8e2b1d9a44e 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -200,12 +200,15 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); // create the normalizer and call it + nvtxRangePushA("o_load_normalize_tables"); auto result = [&] { auto const cp_metadata = get_codepoint_metadata(stream); auto const aux_table = get_aux_codepoint_data(stream); auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case); return normalizer.normalize(strings, stream); }(); + stream.synchronize(); + nvtxRangePop(); CUDF_EXPECTS( result.first->size() < static_cast(std::numeric_limits::max()), @@ -221,8 +224,11 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto d_strings = cudf::column_device_view::create(strings.parent(), stream); // build offsets and children using the codepoint_to_utf8_fn + nvtxRangePushA("o_codepoint_to_utf8"); auto [offsets_column, chars] = 
cudf::strings::detail::make_strings_children( codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, strings.size(), stream, mr); + stream.synchronize(); + nvtxRangePop(); return cudf::make_strings_column(strings.size(), std::move(offsets_column), @@ -419,16 +425,22 @@ CUDF_KERNEL void normalizer_kernel(char const* d_chars, if (idx < total_bytes) { chars_per_thread[idx] = num_new_bytes; - d_output += idx * MAX_NEW_CHARS; -#pragma unroll - for (int k = 0; k < MAX_NEW_CHARS; ++k) { - *d_output++ = replacement[k]; - } + // d_output += idx * MAX_NEW_CHARS; + // // #pragma unroll + // for (int k = 0; k < MAX_NEW_CHARS; ++k) { + // *d_output++ = replacement[k]; + // } } + + // BLOCK_STORE_WARP_TRANSPOSE + using block_store = cub::BlockStore; + __shared__ typename block_store::TempStorage bs_stg; + auto block_base = d_output + blockIdx.x * blockDim.x * MAX_NEW_CHARS; + block_store(bs_stg).Store(block_base, replacement); } template -rmm::device_uvector compute_sizes(int8_t const* sizes, +rmm::device_uvector compute_sizes(cudf::device_span sizes, OffsetType offsets, int64_t offset, cudf::size_type size, @@ -436,10 +448,10 @@ rmm::device_uvector compute_sizes(int8_t const* sizes, { auto output_sizes = rmm::device_uvector(size, stream); - auto d_in = sizes; + auto d_in = sizes.data(); auto d_out = output_sizes.begin(); std::size_t temp = 0; - nvtxRangePushA("segmented_reduce"); + nvtxRangePushA("compute_sizes_segred"); if (offset == 0) { cub::DeviceSegmentedReduce::Sum( nullptr, temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); @@ -457,7 +469,7 @@ rmm::device_uvector compute_sizes(int8_t const* sizes, cub::DeviceSegmentedReduce::Sum( d_temp.data(), temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value()); } - stream.synchronize(); + // stream.synchronize(); nvtxRangePop(); return output_sizes; @@ -483,6 +495,22 @@ OutputIterator remove_copy_safe(InputIterator first, return result; } +template +Iterator remove_safe(Iterator first, Iterator last, T 
const& value, rmm::cuda_stream_view stream) +{ + auto const size = std::min(static_cast(std::distance(first, last)), + static_cast(std::numeric_limits::max())); + + auto result = first; + auto itr = first; + while (itr != last) { + auto end = static_cast(std::distance(itr, last)) <= size ? last : itr + size; + result = thrust::remove(rmm::exec_policy(stream), itr, end, value); + itr = end; + } + return result; +} + } // namespace std::unique_ptr normalize_characters(cudf::strings_column_view const& input, character_normalizer const& normalizer, @@ -503,14 +531,14 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con if (chars_size == 0) { return std::make_unique(input.parent(), stream, mr); } - constexpr int64_t block_size = 64; + constexpr int64_t block_size = 256; cudf::detail::grid_1d grid{chars_size, block_size}; - auto const max_new_char_total = MAX_NEW_CHARS * chars_size; + auto const max_new_char_total = cudf::util::round_up_safe(chars_size, block_size) * MAX_NEW_CHARS; auto const parameters = normalizer._impl; - auto d_code_points = rmm::device_uvector(max_new_char_total, stream); auto d_sizes = rmm::device_uvector(chars_size, stream); + auto d_code_points = rmm::device_uvector(max_new_char_total, stream); nvtxRangePushA("normalizer_kernel"); normalizer_kernel<<>>( d_input_chars, @@ -520,7 +548,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con parameters->do_lower_case, d_code_points.data(), d_sizes.data()); - stream.synchronize(); + // stream.synchronize(); nvtxRangePop(); // This removes space adding around any special tokens in the form of [ttt]. 
@@ -539,8 +567,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); // use segmented-reduce with input_offsets over d_sizes to get the size of the output rows - auto output_sizes = - compute_sizes(d_sizes.data(), input_offsets, first_offset, input.size(), stream); + auto output_sizes = compute_sizes(d_sizes, input_offsets, first_offset, input.size(), stream); // convert the sizes to offsets auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( @@ -549,11 +576,12 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // create output chars and use remove-copy(0) on d_code_points rmm::device_uvector chars(total_size, stream, mr); auto begin = reinterpret_cast(d_code_points.begin()); - auto end = reinterpret_cast(d_code_points.end()); + // auto end = reinterpret_cast(d_code_points.end()); nvtxRangePushA("remove_copy"); - // thrust::remove_copy(rmm::exec_policy_nosync(stream), begin, end, chars.data(), 0); + auto end = reinterpret_cast( + remove_safe(d_code_points.begin(), d_code_points.end(), 0, stream)); remove_copy_safe(begin, end, chars.data(), 0, stream); - stream.synchronize(); + // stream.synchronize(); nvtxRangePop(); return cudf::make_strings_column(input.size(), diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 4c54409c41a..559695e00d3 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -238,6 +238,7 @@ uvector_pair data_normalizer::normalize(cudf::strings_column_view const& input, rmm::device_uvector d_chars_per_thread(threads_on_device, stream); auto const d_strings = input.chars_begin(stream) + cudf::strings::detail::get_offset_value( input.offsets(), input.offset(), stream); + nvtxRangePushA("o_kernel_data_normalizer"); kernel_data_normalizer<<>>( reinterpret_cast(d_strings), bytes_count, @@ -246,26 +247,37 @@ 
uvector_pair data_normalizer::normalize(cudf::strings_column_view const& input, do_lower_case, d_code_points->data(), d_chars_per_thread.data()); + stream.synchronize(); + nvtxRangePop(); // Remove the 'empty' code points from the vector + nvtxRangePushA("o_remove"); thrust::remove(rmm::exec_policy(stream), d_code_points->begin(), d_code_points->end(), uint32_t{1 << FILTER_BIT}); + stream.synchronize(); + nvtxRangePop(); // We also need to prefix sum the number of characters up to an including // the current character in order to get the new strings lengths. + nvtxRangePushA("o_inclusive_scan"); thrust::inclusive_scan(rmm::exec_policy(stream), d_chars_per_thread.begin(), d_chars_per_thread.end(), d_chars_per_thread.begin()); + stream.synchronize(); + nvtxRangePop(); // This will reset the offsets to the new generated code point values + nvtxRangePushA("o_update_strings_lengths"); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(1), input.size(), update_strings_lengths_fn{d_chars_per_thread.data(), d_strings_offsets->data()}); + stream.synchronize(); + nvtxRangePop(); auto const num_chars = d_strings_offsets->element(input.size(), stream); d_code_points->resize(num_chars, stream); // should be smaller than original allocated size diff --git a/python/cudf/cudf/core/character_normalizer.py b/python/cudf/cudf/core/character_normalizer.py index ffb694d2f73..00ee43598f6 100644 --- a/python/cudf/cudf/core/character_normalizer.py +++ b/python/cudf/cudf/core/character_normalizer.py @@ -13,7 +13,7 @@ class CharacterNormalizer: Parameters ---------- - do_lower_case : bool + do_lower : bool Set to True if the normalizer should also lower-case while normalizing. 
special_tokens : cudf string series @@ -22,11 +22,11 @@ class CharacterNormalizer: def __init__( self, - d_lower_case: bool, + do_lower: bool, special_tokens: cudf.Series = cudf.Series([], dtype="object"), ) -> None: self.normalizer = plc.nvtext.normalize.CharacterNormalizer( - d_lower_case, special_tokens._column.to_pylibcudf(mode="read") + do_lower, special_tokens._column.to_pylibcudf(mode="read") ) def normalize(self, text: cudf.Series) -> cudf.Series: From 0df89afc69b5117b36f1fd9c5bf5dbed8f00245e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 29 Jan 2025 20:19:22 -0500 Subject: [PATCH 07/21] fix block-store algo type --- cpp/src/text/normalize.cu | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 8e2b1d9a44e..fcb84c1f220 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -386,8 +386,8 @@ CUDF_KERNEL void normalizer_kernel(char const* d_chars, if ((idx < total_bytes) && cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { auto const cp = [utf8 = d_chars + idx] { - cudf::char_utf8 ch_utf8; - cudf::strings::detail::to_char_utf8(utf8, ch_utf8); + cudf::char_utf8 ch_utf8 = *utf8; + if (ch_utf8 > 0x7F) { cudf::strings::detail::to_char_utf8(utf8, ch_utf8); } return cudf::strings::detail::utf8_to_codepoint(ch_utf8); }(); auto const metadata = cp_metadata[cp]; @@ -426,14 +426,15 @@ CUDF_KERNEL void normalizer_kernel(char const* d_chars, if (idx < total_bytes) { chars_per_thread[idx] = num_new_bytes; // d_output += idx * MAX_NEW_CHARS; - // // #pragma unroll + // #pragma unroll // for (int k = 0; k < MAX_NEW_CHARS; ++k) { // *d_output++ = replacement[k]; // } } - // BLOCK_STORE_WARP_TRANSPOSE - using block_store = cub::BlockStore; + // BLOCK_STORE_TRANSPOSE + using block_store = + cub::BlockStore; __shared__ typename block_store::TempStorage bs_stg; auto block_base = d_output + blockIdx.x * blockDim.x * MAX_NEW_CHARS; 
block_store(bs_stg).Store(block_base, replacement); @@ -469,7 +470,7 @@ rmm::device_uvector compute_sizes(cudf::device_span normalize_characters(cudf::strings_column_view con parameters->do_lower_case, d_code_points.data(), d_sizes.data()); - // stream.synchronize(); + stream.synchronize(); nvtxRangePop(); // This removes space adding around any special tokens in the form of [ttt]. @@ -576,12 +577,12 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // create output chars and use remove-copy(0) on d_code_points rmm::device_uvector chars(total_size, stream, mr); auto begin = reinterpret_cast(d_code_points.begin()); - // auto end = reinterpret_cast(d_code_points.end()); + auto end = reinterpret_cast(d_code_points.end()); nvtxRangePushA("remove_copy"); - auto end = reinterpret_cast( - remove_safe(d_code_points.begin(), d_code_points.end(), 0, stream)); + // auto end = reinterpret_cast( + // remove_safe(d_code_points.begin(), d_code_points.end(), 0, stream)); remove_copy_safe(begin, end, chars.data(), 0, stream); - // stream.synchronize(); + stream.synchronize(); nvtxRangePop(); return cudf::make_strings_column(input.size(), From 51cae79465a4b7a6dbcf099dcb00d98b116040b6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 30 Jan 2025 09:20:17 -0500 Subject: [PATCH 08/21] update nvtx range names --- cpp/src/text/normalize.cu | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index fcb84c1f220..be92086a663 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -200,15 +200,15 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); // create the normalizer and call it - nvtxRangePushA("o_load_normalize_tables"); auto result = [&] { + nvtxRangePushA("o_load_normalize_tables"); auto const cp_metadata = 
get_codepoint_metadata(stream); auto const aux_table = get_aux_codepoint_data(stream); auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case); + stream.synchronize(); + nvtxRangePop(); return normalizer.normalize(strings, stream); }(); - stream.synchronize(); - nvtxRangePop(); CUDF_EXPECTS( result.first->size() < static_cast(std::numeric_limits::max()), @@ -371,13 +371,13 @@ CUDF_KERNEL void special_tokens_kernel(uint32_t* d_utf8chars, chars_per_thread[match_idx] = 2; // ']' plus trailing space } -CUDF_KERNEL void normalizer_kernel(char const* d_chars, - int64_t total_bytes, - codepoint_metadata_type const* cp_metadata, - aux_codepoint_data_type const* aux_table, - bool do_lower_case, - uint32_t* d_output, - int8_t* chars_per_thread) +CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, + int64_t total_bytes, + codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case, + uint32_t* d_output, + int8_t* chars_per_thread) { uint32_t replacement[MAX_NEW_CHARS] = {0}; @@ -452,7 +452,7 @@ rmm::device_uvector compute_sizes(cudf::device_span normalize_characters(cudf::strings_column_view con auto d_sizes = rmm::device_uvector(chars_size, stream); auto d_code_points = rmm::device_uvector(max_new_char_total, stream); - nvtxRangePushA("normalizer_kernel"); - normalizer_kernel<<>>( + nvtxRangePushA("n_normalizer_kernel"); + data_normalizer_kernel<<>>( d_input_chars, chars_size, parameters->cp_metadata.data(), @@ -557,7 +557,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // before returning the output strings column. 
auto const special_tokens = parameters->get_special_tokens(); if (!special_tokens.empty()) { - nvtxRangePushA("special_tokens_kernel"); + nvtxRangePushA("n_special_tokens_kernel"); special_tokens_kernel<<>>( d_code_points.data(), chars_size, special_tokens, d_sizes.data()); stream.synchronize(); @@ -578,7 +578,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con rmm::device_uvector chars(total_size, stream, mr); auto begin = reinterpret_cast(d_code_points.begin()); auto end = reinterpret_cast(d_code_points.end()); - nvtxRangePushA("remove_copy"); + nvtxRangePushA("n_remove_copy"); // auto end = reinterpret_cast( // remove_safe(d_code_points.begin(), d_code_points.end(), 0, stream)); remove_copy_safe(begin, end, chars.data(), 0, stream); From 091cb84494a3b0a8a8fa7a43a20dd7cbf0f4cd5d Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 30 Jan 2025 15:57:42 -0500 Subject: [PATCH 09/21] replace d_sizes with transform-iterator --- cpp/benchmarks/text/normalize.cpp | 5 +- cpp/include/nvtext/normalize.hpp | 26 +++++--- cpp/src/text/normalize.cu | 89 ++++++++++--------------- cpp/src/text/subword/data_normalizer.cu | 12 ---- cpp/tests/text/normalize_tests.cpp | 72 ++++++++++---------- 5 files changed, 90 insertions(+), 114 deletions(-) diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 405b099fa10..494d5722ae4 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -48,9 +48,10 @@ static void bench_normalize(nvbench::state& state) [&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); }); } else { bool const to_lower = (normalize_type == "to_lower"); - auto normalizer = nvtext::create_character_normalizer(to_lower); + // we expect the normalizer to be created once and re-used + // so creating it is not measured + auto normalizer = nvtext::create_character_normalizer(to_lower); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - // auto result 
= nvtext::normalize_characters(input, to_lower); auto result = nvtext::normalize_characters(input, *normalizer); }); } diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 6d256fec231..4bd257b7d45 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -122,18 +122,22 @@ std::unique_ptr normalize_characters( * - changing whitespace (e.g. `"\t", "\n", "\r"`) to just space `" "` * - removing control characters (unicode categories "Cc" and "Cf") * - * The padding process here adds a single space before and after the character. + * The padding process adds a single space before and after the character. * Details on _unicode category_ can be found here: * https://unicodebook.readthedocs.io/unicode.html#categories * - * If `do_lower_case = true`, lower-casing also removes the accents. The + * If `do_lower_case = true`, lower-casing also removes any accents. The * accents cannot be removed from upper-case characters without lower-casing * and lower-casing cannot be performed without also removing accents. * However, if the accented character is already lower-case, then only the * accent is removed. * - * If `special_tokens` are included the padding around the `[]` is not - * enforced if the character between them match one of the given tokens. + * If `special_tokens` are included the padding after `[` and before `]` is not + * inserted if the character between them match one of the given tokens. + * If `do_lower_case = true` the `special_tokens` are expected to contain + * lower-case characters. + * Also, the `special_tokens` are expected to include the `[]` characters + * at the beginning of and end of each string respectively. */ struct character_normalizer { /** @@ -170,10 +174,11 @@ struct character_normalizer { * @param do_lower_case If true, upper-case characters are converted to * lower-case and accents are stripped from those characters. 
* If false, accented and upper-case characters are not transformed. - * @param special_tokens Individual sequences including `[]` brackets + * @param special_tokens Individual sequences including `[]` brackets. + * Default is no special tokens. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return Object to be used with nvtext::tokenize_with_vocabulary + * @return Object to be used with nvtext::normalize_characters */ std::unique_ptr create_character_normalizer( bool do_lower_case, @@ -183,15 +188,18 @@ std::unique_ptr create_character_normalizer( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * @brief Normalizes strings characters for tokenizing + * @brief Normalizes the text in input strings column * * @see nvtext::character_normalizer for details on the normalizer behavior * * @code{.pseudo} + * cn = create_character_normalizer(true) * s = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] - * s1 = normalize_characters(s,true) + * s1 = normalize_characters(s,cn) * s1 is now ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] - * s2 = normalize_characters(s,false) + * + * cn = create_character_normalizer(false) + * s2 = normalize_characters(s,cn) * s2 is now ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 
08", " [ a , bb ] "] * @endcode * diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index be92086a663..b4606243775 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -201,12 +201,9 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // create the normalizer and call it auto result = [&] { - nvtxRangePushA("o_load_normalize_tables"); auto const cp_metadata = get_codepoint_metadata(stream); auto const aux_table = get_aux_codepoint_data(stream); auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case); - stream.synchronize(); - nvtxRangePop(); return normalizer.normalize(strings, stream); }(); @@ -224,11 +221,8 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto d_strings = cudf::column_device_view::create(strings.parent(), stream); // build offsets and children using the codepoint_to_utf8_fn - nvtxRangePushA("o_codepoint_to_utf8"); auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, strings.size(), stream, mr); - stream.synchronize(); - nvtxRangePop(); return cudf::make_strings_column(strings.size(), std::move(offsets_column), @@ -337,8 +331,7 @@ namespace { CUDF_KERNEL void special_tokens_kernel(uint32_t* d_utf8chars, int64_t total_count, - cudf::device_span special_tokens, - int8_t* chars_per_thread) + cudf::device_span special_tokens) { auto const idx = cudf::detail::grid_1d::global_thread_id(); if (idx >= total_count) { return; } @@ -364,11 +357,6 @@ CUDF_KERNEL void special_tokens_kernel(uint32_t* d_utf8chars, // fix up chars to remove the extra spaces *(begin + 1) = 0; *(match - 1) = 0; - - auto const match_idx = idx + (thrust::distance(begin, match) / MAX_NEW_CHARS); - - chars_per_thread[idx] = 2; // leading space plus '[' - chars_per_thread[match_idx] = 2; // ']' plus trailing space } CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, @@ -376,8 +364,7 @@ 
CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, codepoint_metadata_type const* cp_metadata, aux_codepoint_data_type const* aux_table, bool do_lower_case, - uint32_t* d_output, - int8_t* chars_per_thread) + uint32_t* d_output) { uint32_t replacement[MAX_NEW_CHARS] = {0}; @@ -423,16 +410,7 @@ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, } } - if (idx < total_bytes) { - chars_per_thread[idx] = num_new_bytes; - // d_output += idx * MAX_NEW_CHARS; - // #pragma unroll - // for (int k = 0; k < MAX_NEW_CHARS; ++k) { - // *d_output++ = replacement[k]; - // } - } - - // BLOCK_STORE_TRANSPOSE + // alternate algorithm BLOCK_STORE_TRANSPOSE using block_store = cub::BlockStore; __shared__ typename block_store::TempStorage bs_stg; @@ -441,7 +419,7 @@ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, } template -rmm::device_uvector compute_sizes(cudf::device_span sizes, +rmm::device_uvector compute_sizes(cudf::device_span d_codepoints, OffsetType offsets, int64_t offset, cudf::size_type size, @@ -449,10 +427,23 @@ rmm::device_uvector compute_sizes(cudf::device_span(size, stream); - auto d_in = sizes.data(); - auto d_out = output_sizes.begin(); - std::size_t temp = 0; - nvtxRangePushA("n_compute_sizes_segred"); + auto d_cps = d_codepoints.data(); + + // counts the non-zero bytes in the d_cps arrays + auto d_in = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([d_cps] __device__(auto idx) { + idx = idx * MAX_NEW_CHARS; + auto size = cudf::size_type{0}; + for (int k = 0; k < MAX_NEW_CHARS; ++k) { + auto const v = d_cps[idx + k]; + size += + ((v & 0xFF) > 0) + ((v & 0xFF00) > 0) + ((v & 0xFF0000) > 0) + ((v & 0xFF000000) > 0); + } + return size; + })); + + auto d_out = output_sizes.begin(); + auto temp = std::size_t{0}; if (offset == 0) { cub::DeviceSegmentedReduce::Sum( nullptr, temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); @@ -470,12 +461,11 @@ rmm::device_uvector 
compute_sizes(cudf::device_span OutputIterator remove_copy_safe(InputIterator first, InputIterator last, @@ -496,6 +486,7 @@ OutputIterator remove_copy_safe(InputIterator first, return result; } +// handles ranges above int32 max template Iterator remove_safe(Iterator first, Iterator last, T const& value, rmm::cuda_stream_view stream) { @@ -538,52 +529,42 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto const parameters = normalizer._impl; - auto d_sizes = rmm::device_uvector(chars_size, stream); - auto d_code_points = rmm::device_uvector(max_new_char_total, stream); - nvtxRangePushA("n_normalizer_kernel"); + auto d_codepoints = rmm::device_uvector(max_new_char_total, stream); data_normalizer_kernel<<>>( d_input_chars, chars_size, parameters->cp_metadata.data(), parameters->aux_table.data(), parameters->do_lower_case, - d_code_points.data(), - d_sizes.data()); - stream.synchronize(); - nvtxRangePop(); + d_codepoints.data()); - // This removes space adding around any special tokens in the form of [ttt]. + // This removes space added around any special tokens in the form of [ttt]. // An alternate approach is to do a multi-replace of '[ ttt ]' with '[ttt]' right // before returning the output strings column. 
auto const special_tokens = parameters->get_special_tokens(); if (!special_tokens.empty()) { - nvtxRangePushA("n_special_tokens_kernel"); special_tokens_kernel<<>>( - d_code_points.data(), chars_size, special_tokens, d_sizes.data()); - stream.synchronize(); - nvtxRangePop(); + d_codepoints.data(), chars_size, special_tokens); } auto const input_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); - // use segmented-reduce with input_offsets over d_sizes to get the size of the output rows - auto output_sizes = compute_sizes(d_sizes, input_offsets, first_offset, input.size(), stream); + // Use segmented-reduce over the non-zero codepoints to get the size of the output rows + auto output_sizes = + compute_sizes(d_codepoints, input_offsets, first_offset, input.size(), stream); // convert the sizes to offsets auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( output_sizes.begin(), output_sizes.end(), stream, mr); - // create output chars and use remove-copy(0) on d_code_points + // create output chars by calling remove_copy(0) on the bytes in d_codepoints rmm::device_uvector chars(total_size, stream, mr); - auto begin = reinterpret_cast(d_code_points.begin()); - auto end = reinterpret_cast(d_code_points.end()); - nvtxRangePushA("n_remove_copy"); - // auto end = reinterpret_cast( - // remove_safe(d_code_points.begin(), d_code_points.end(), 0, stream)); + auto begin = reinterpret_cast(d_codepoints.begin()); + // the first remove() here speeds up the remove_copy() by roughly 10% + auto end = + reinterpret_cast(remove_safe(d_codepoints.begin(), d_codepoints.end(), 0, stream)); remove_copy_safe(begin, end, chars.data(), 0, stream); - stream.synchronize(); - nvtxRangePop(); return cudf::make_strings_column(input.size(), std::move(offsets), diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 559695e00d3..4c54409c41a 100644 --- 
a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -238,7 +238,6 @@ uvector_pair data_normalizer::normalize(cudf::strings_column_view const& input, rmm::device_uvector d_chars_per_thread(threads_on_device, stream); auto const d_strings = input.chars_begin(stream) + cudf::strings::detail::get_offset_value( input.offsets(), input.offset(), stream); - nvtxRangePushA("o_kernel_data_normalizer"); kernel_data_normalizer<<>>( reinterpret_cast(d_strings), bytes_count, @@ -247,37 +246,26 @@ uvector_pair data_normalizer::normalize(cudf::strings_column_view const& input, do_lower_case, d_code_points->data(), d_chars_per_thread.data()); - stream.synchronize(); - nvtxRangePop(); // Remove the 'empty' code points from the vector - nvtxRangePushA("o_remove"); thrust::remove(rmm::exec_policy(stream), d_code_points->begin(), d_code_points->end(), uint32_t{1 << FILTER_BIT}); - stream.synchronize(); - nvtxRangePop(); // We also need to prefix sum the number of characters up to an including // the current character in order to get the new strings lengths. 
- nvtxRangePushA("o_inclusive_scan"); thrust::inclusive_scan(rmm::exec_policy(stream), d_chars_per_thread.begin(), d_chars_per_thread.end(), d_chars_per_thread.begin()); - stream.synchronize(); - nvtxRangePop(); // This will reset the offsets to the new generated code point values - nvtxRangePushA("o_update_strings_lengths"); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(1), input.size(), update_strings_lengths_fn{d_chars_per_thread.data(), d_strings_offsets->data()}); - stream.synchronize(); - nvtxRangePop(); auto const num_chars = d_strings_offsets->element(input.size(), stream); d_code_points->resize(num_chars, stream); // should be smaller than original allocated size diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 37b6d51fcbd..0223bcc428a 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -146,34 +145,32 @@ TEST_F(TextNormalizeTest, WithNormalizer) auto normalizer = nvtext::create_character_normalizer(true); auto results = nvtext::normalize_characters(sv, *normalizer); - // cudf::test::print(results->view()); - auto expected = cudf::test::strings_column_wrapper({"abc£def", - "", - "ee a io aeio", - " acen u", - "acen u", - "p ^ np", - " $ 41 . 07", - " [ a , b ] ", - " 丏 丟 ", - ""}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + auto expected = cudf::test::strings_column_wrapper({"abc£def", + "", + "ee a io aeio", + " acen u", + "acen u", + "p ^ np", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); normalizer = nvtext::create_character_normalizer(false); results = nvtext::normalize_characters(sv, *normalizer); - // cudf::test::print(results->view()); - expected = cudf::test::strings_column_wrapper({"abc£def", - "", - "éè â îô aeio", - " ĂĆĖÑ Ü", - "ACEN U", - "P ^ NP", - " $ 41 . 
07", - " [ a , b ] ", - " 丏 丟 ", - ""}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + expected = cudf::test::strings_column_wrapper({"abc£def", + "", + "éè â îô aeio", + " ĂĆĖÑ Ü", + "ACEN U", + "P ^ NP", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -188,21 +185,21 @@ TEST_F(TextNormalizeTest, SpecialTokens) auto sv = cudf::strings_column_view(input); auto special_tokens = cudf::test::strings_column_wrapper({"[BOS]", "[EOS]", "[SEP]", "[PAD]"}); auto stv = cudf::strings_column_view(special_tokens); - auto normalizer = nvtext::create_character_normalizer(true, stv); - auto results = nvtext::normalize_characters(sv, *normalizer); - // cudf::test::print(results->view()); - auto expected = cudf::test::strings_column_wrapper( + + auto normalizer = nvtext::create_character_normalizer(true, stv); + auto results = nvtext::normalize_characters(sv, *normalizer); + auto expected = cudf::test::strings_column_wrapper( {" [bos] some strings with [pad] special [sep] tokens [eos] ", - " [bos] these should [sep] work too [eos] ", - "some [ non ] tokens [ eol ] too"}); + " [bos] these should [sep] work too [eos] ", + "some [ non ] tokens [ eol ] too"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + normalizer = nvtext::create_character_normalizer(false, stv); results = nvtext::normalize_characters(sv, *normalizer); - // cudf::test::print(results->view()); - expected = cudf::test::strings_column_wrapper( + expected = cudf::test::strings_column_wrapper( {" [BOS] Some strings with [PAD] special [SEP] tokens [EOS] ", - " [ bos ] these should [ sep ] work too [ eos ] ", - "some [ non ] tokens [ eol ] too"}); + " [ bos ] these should [ sep ] work too [ eos ] ", + "some [ non ] tokens [ eol ] too"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -226,8 +223,9 @@ TEST_F(TextNormalizeTest, SlicedColumn) auto input = cudf::test::strings_column_wrapper( {"abc£def", "éè â îô\taeio", "ACEN 
U", "P^NP", "$41.07", "[a,b]", "丏丟"}); - std::vector sliced = cudf::split(input, {4}); - auto normalizer = nvtext::create_character_normalizer(true); + auto sliced = cudf::split(input, {4}); + + auto normalizer = nvtext::create_character_normalizer(true); auto results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer); auto expected = From deb33ac3ec98d04fb22e1e096bbadddc3180a263 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 30 Jan 2025 18:34:53 -0500 Subject: [PATCH 10/21] fix pylibcudf normalize-characters pytest --- python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py index 25b6d1389ec..78fb4164a61 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
import pyarrow as pa import pytest @@ -29,7 +29,7 @@ def test_normalize_spaces(norm_spaces_input_data): @pytest.mark.parametrize("do_lower", [True, False]) def test_normalize_characters(norm_chars_input_data, do_lower): - result = plc.nvtext.normalize.normalize_characters( + result = plc.nvtext.normalize.characters_normalize( plc.interop.from_arrow(norm_chars_input_data), do_lower, ) From f2f35a616edb859fc27633f07b3a4c37f769a7f7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 3 Feb 2025 17:23:08 -0500 Subject: [PATCH 11/21] add more gtests and pytests --- cpp/tests/text/normalize_tests.cpp | 36 +++---- .../pylibcudf/tests/test_nvtext_normalize.py | 93 ++++++++++++++++++- 2 files changed, 107 insertions(+), 22 deletions(-) diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 0223bcc428a..59cf611babb 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -74,6 +74,10 @@ TEST_F(TextNormalizeTest, NormalizeEmptyTest) EXPECT_EQ(results->size(), 0); results = nvtext::normalize_characters(strings_view, false); EXPECT_EQ(results->size(), 0); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + EXPECT_EQ(results->size(), 0); } TEST_F(TextNormalizeTest, AllNullStrings) @@ -84,6 +88,10 @@ TEST_F(TextNormalizeTest, AllNullStrings) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); results = nvtext::normalize_characters(strings_view, false); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); } TEST_F(TextNormalizeTest, SomeNullStrings) @@ -93,6 +101,10 @@ TEST_F(TextNormalizeTest, SomeNullStrings) auto results = nvtext::normalize_characters(strings_view, false); cudf::test::strings_column_wrapper expected({"", " . 
", "a"}, {false, true, true}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(TextNormalizeTest, NormalizeCharacters) @@ -176,7 +188,6 @@ TEST_F(TextNormalizeTest, WithNormalizer) TEST_F(TextNormalizeTest, SpecialTokens) { - // These include punctuation, accents, whitespace, and CJK characters auto input = cudf::test::strings_column_wrapper({"[BOS]Some strings with [PAD] special[SEP]tokens[EOS]", "[bos]these should[sep]work too[eos]", @@ -210,26 +221,17 @@ TEST_F(TextNormalizeTest, NormalizeSlicedColumn) std::vector sliced = cudf::split(strings, {4}); auto results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), true); - cudf::test::strings_column_wrapper expected({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + auto expected = + cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false); - cudf::test::strings_column_wrapper expected2({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); -} - -TEST_F(TextNormalizeTest, SlicedColumn) -{ - auto input = cudf::test::strings_column_wrapper( - {"abc£def", "éè â îô\taeio", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟"}); - - auto sliced = cudf::split(input, {4}); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false); + expected = cudf::test::strings_column_wrapper({" $ 41 . 
07", " [ a , b ] ", " 丏 丟 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); auto normalizer = nvtext::create_character_normalizer(true); - auto results = - nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer); - auto expected = - cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer); + expected = cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); normalizer = nvtext::create_character_normalizer(false); diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py index 78fb4164a61..47bbb191be6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py @@ -15,7 +15,7 @@ def norm_spaces_input_data(): @pytest.fixture(scope="module") def norm_chars_input_data(): - arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]", "[pad]"] return pa.array(arr) @@ -33,11 +33,94 @@ def test_normalize_characters(norm_chars_input_data, do_lower): plc.interop.from_arrow(norm_chars_input_data), do_lower, ) - expected = pa.array( - ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + else: + expected = pa.array( + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 
08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalizer(norm_chars_input_data, do_lower): + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + plc.nvtext.normalize.CharacterNormalizer( + do_lower, + plc.column_factories.make_empty_column(plc.types.TypeId.STRING), + ), ) - if not do_lower: + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + else: + expected = pa.array( + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalizer_with_special_tokens(norm_chars_input_data, do_lower): + special_tokens = pa.array(["[pad]"]) + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + plc.nvtext.normalize.CharacterNormalizer( + do_lower, plc.interop.from_arrow(special_tokens) + ), + ) + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [pad] ", + ] + ) + else: expected = pa.array( - ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 
08", + " [ a , bb ] ", + " [pad] ", + ] ) assert_column_eq(result, expected) From fc86efa17ab34a38a1d93fe5af94fe80b11be6e5 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 4 Feb 2025 13:33:56 -0500 Subject: [PATCH 12/21] add longer strings to gtests --- cpp/src/text/normalize.cu | 132 ++++++++++++++++++----------- cpp/tests/text/normalize_tests.cpp | 52 +++++++++--- 2 files changed, 124 insertions(+), 60 deletions(-) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index b4606243775..e37da41c8c3 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -23,8 +23,6 @@ #include #include #include -#include -#include #include #include #include @@ -43,6 +41,7 @@ #include #include +#include #include #include #include @@ -281,12 +280,6 @@ struct character_normalizer::character_normalizer_impl { } }; -// lambdas not allowed in constructors -struct size_bytes_fn { - cudf::string_view const* d_tokens; - __device__ cudf::size_type operator()(cudf::size_type idx) { return d_tokens[idx].size_bytes(); } -}; - character_normalizer::character_normalizer(bool do_lower_case, cudf::strings_column_view const& special_tokens, rmm::cuda_stream_view stream, @@ -300,7 +293,7 @@ character_normalizer::character_normalizer(bool do_lower_case, auto sorted = std::move( cudf::sort(cudf::table_view({special_tokens.parent()}), {}, {}, stream)->release().front()); if (do_lower_case) { - // lower case the tokens so they will match the normalized input + // lower-case the tokens so they will match the normalized input sorted = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); } @@ -329,36 +322,65 @@ std::unique_ptr create_character_normalizer( namespace detail { namespace { -CUDF_KERNEL void special_tokens_kernel(uint32_t* d_utf8chars, +/** + * @brief Kernel handles fixing up the normalized data to account for any special tokens + * + * This undoes the padding added around the `[]` for patterns matching the strings in the + * 
special_tokens array. + * + * Launched as a thread per input byte (total_count). + * + * @param d_normalized The normalized set of UTF-8 characters; 3 uints per input byte + * @param total_count Number of bytes represented by d_normalized; len(d_normalized)/3 + * @param special_tokens Tokens to check against + */ +CUDF_KERNEL void special_tokens_kernel(uint32_t* d_normalized, int64_t total_count, cudf::device_span special_tokens) { auto const idx = cudf::detail::grid_1d::global_thread_id(); if (idx >= total_count) { return; } - auto const begin = d_utf8chars + (idx * MAX_NEW_CHARS) + 1; + auto const begin = d_normalized + (idx * MAX_NEW_CHARS) + 1; if (*begin != '[') { return; } auto const end = begin + std::min(6L, total_count - idx) * MAX_NEW_CHARS; auto const match = thrust::find(thrust::seq, begin, end, static_cast(']')); if (match == end) { return; } char candidate[8]; - auto itr = thrust::transform_iterator(begin, [](auto v) { return static_cast(v); }); - auto eitr = itr + thrust::distance(begin, match + 1); - auto last = - thrust::copy_if(thrust::seq, itr, eitr, candidate, [](auto c) { return c != 0 && c != ' '; }); + auto const ch_begin = + thrust::transform_iterator(begin, [](auto v) { return static_cast(v); }); + auto const ch_end = ch_begin + thrust::distance(begin, match + 1); + auto last = thrust::copy_if( + thrust::seq, ch_begin, ch_end, candidate, [](auto c) { return c != 0 && c != ' '; }); *last = 0; // only needed for debug auto const size = static_cast(thrust::distance(candidate, last)); auto const token = cudf::string_view(candidate, size); - if (thrust::find(thrust::seq, special_tokens.begin(), special_tokens.end(), token) == - special_tokens.end()) { + // the binary_search expects the special_tokens to be sorted + if (!thrust::binary_search(thrust::seq, special_tokens.begin(), special_tokens.end(), token)) { return; } // fix up chars to remove the extra spaces - *(begin + 1) = 0; - *(match - 1) = 0; + *(begin + 1) = 0; // removes space after 
'[' + *(match - 1) = 0; // removes space before ']' } +/** + * @brief The normalizer kernel + * + * Launched as a thread per input byte (total_bytes). + * + * Converts the input d_chars into codepoints to lookup in the provided tables. + * Once processed, the d_output contains 3 uints per input byte each encoded + * as output UTF-8. Any zero values are to removed by a subsequent kernel call. + * + * @param d_chars The characters for the input strings column to normalize + * @param total_bytes The number of bytes in the d_chars + * @param cp_metadata First lookup table for codepoint metadata + * @param aux_table Second lookup table containing possible replacement characters + * @param do_lower_case True if the normalization includes lower-casing characters + * @param d_output The output of the normalization (UTF-8 encoded) + */ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, int64_t total_bytes, codepoint_metadata_type const* cp_metadata, @@ -381,7 +403,7 @@ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, if (!should_remove_cp(metadata, do_lower_case)) { int8_t num_new_chars = 1; - // Apply lower cases and accent stripping if necessary + // retrieve the normalized value for cp auto const new_cp = do_lower_case || always_replace(metadata) ? get_first_cp(metadata) : cp; replacement[0] = new_cp == 0 ? 
cp : new_cp; @@ -393,14 +415,13 @@ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, } if (should_add_spaces(metadata, do_lower_case) && (num_new_chars == 1)) { - // write the required spaces at each end replacement[1] = replacement[0]; - replacement[0] = SPACE_CODE_POINT; + replacement[0] = SPACE_CODE_POINT; // add spaces around the new codepoint replacement[2] = SPACE_CODE_POINT; num_new_chars = 3; } - // convert back to UTF-8 + // convert codepoints back to UTF-8 in-place for (int k = 0; k < num_new_chars; ++k) { auto const new_cp = replacement[k]; if (new_cp) { @@ -410,7 +431,7 @@ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, } } - // alternate algorithm BLOCK_STORE_TRANSPOSE + // employ an optimized coalesced writer to output replacement as a block of transposed data using block_store = cub::BlockStore; __shared__ typename block_store::TempStorage bs_stg; @@ -418,8 +439,20 @@ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, block_store(bs_stg).Store(block_base, replacement); } +/** + * @brief Computes the output sizes for each row + * + * The input offsets are used with segmented-reduce to count the number of + * non-zero values for each output row. 
+ * + * @param d_normalized The UTF-8 encoded normalized values + * @param offsets These identify the row boundaries + * @param offset Only non-zero if the input column has been sliced + * @param size The number of output rows (sames as the number of input rows) + * @param stream Stream used for allocating device memory and launching kernels + */ template -rmm::device_uvector compute_sizes(cudf::device_span d_codepoints, +rmm::device_uvector compute_sizes(cudf::device_span d_normalized, OffsetType offsets, int64_t offset, cudf::size_type size, @@ -427,21 +460,23 @@ rmm::device_uvector compute_sizes(cudf::device_span(size, stream); - auto d_cps = d_codepoints.data(); + auto d_data = d_normalized.data(); - // counts the non-zero bytes in the d_cps arrays + // counts the non-zero bytes in the d_data array auto d_in = cudf::detail::make_counting_transform_iterator( - 0, cuda::proclaim_return_type([d_cps] __device__(auto idx) { - idx = idx * MAX_NEW_CHARS; - auto size = cudf::size_type{0}; - for (int k = 0; k < MAX_NEW_CHARS; ++k) { - auto const v = d_cps[idx + k]; - size += - ((v & 0xFF) > 0) + ((v & 0xFF00) > 0) + ((v & 0xFF0000) > 0) + ((v & 0xFF000000) > 0); - } - return size; + 0, cuda::proclaim_return_type([d_data] __device__(auto idx) { + idx = idx * MAX_NEW_CHARS; + // transform function counts number of non-zero bytes in uint32_t value + auto tfn = [](uint32_t v) -> cudf::size_type { + return ((v & 0xFF) > 0) + ((v & 0xFF00) > 0) + ((v & 0xFF0000) > 0) + + ((v & 0xFF000000) > 0); + }; + auto const begin = d_data + idx; + auto const end = begin + MAX_NEW_CHARS; + return thrust::transform_reduce(thrust::seq, begin, end, tfn, 0, thrust::plus{}); })); + // DeviceSegmentedReduce is used to compute the size of each output row auto d_out = output_sizes.begin(); auto temp = std::size_t{0}; if (offset == 0) { @@ -502,8 +537,8 @@ Iterator remove_safe(Iterator first, Iterator last, T const& value, rmm::cuda_st } return result; } - } // namespace + std::unique_ptr 
normalize_characters(cudf::strings_column_view const& input, character_normalizer const& normalizer, rmm::cuda_stream_view stream, @@ -529,14 +564,14 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto const parameters = normalizer._impl; - auto d_codepoints = rmm::device_uvector(max_new_char_total, stream); + auto d_normalized = rmm::device_uvector(max_new_char_total, stream); data_normalizer_kernel<<>>( d_input_chars, chars_size, parameters->cp_metadata.data(), parameters->aux_table.data(), parameters->do_lower_case, - d_codepoints.data()); + d_normalized.data()); // This removes space added around any special tokens in the form of [ttt]. // An alternate approach is to do a multi-replace of '[ ttt ]' with '[ttt]' right @@ -544,26 +579,25 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto const special_tokens = parameters->get_special_tokens(); if (!special_tokens.empty()) { special_tokens_kernel<<>>( - d_codepoints.data(), chars_size, special_tokens); + d_normalized.data(), chars_size, special_tokens); } + // Use segmented-reduce over the non-zero codepoints to get the size of the output rows auto const input_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); - - // Use segmented-reduce over the non-zero codepoints to get the size of the output rows auto output_sizes = - compute_sizes(d_codepoints, input_offsets, first_offset, input.size(), stream); + compute_sizes(d_normalized, input_offsets, first_offset, input.size(), stream); // convert the sizes to offsets auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( output_sizes.begin(), output_sizes.end(), stream, mr); - // create output chars by calling remove_copy(0) on the bytes in d_codepoints - rmm::device_uvector chars(total_size, stream, mr); - auto begin = reinterpret_cast(d_codepoints.begin()); - // the first remove() here speeds up the remove_copy() by roughly 10% - auto end = - 
reinterpret_cast(remove_safe(d_codepoints.begin(), d_codepoints.end(), 0, stream)); + // create output chars by calling remove_copy(0) on the bytes in d_normalized + auto chars = rmm::device_uvector(total_size, stream, mr); + auto const begin = reinterpret_cast(d_normalized.begin()); + // the remove() above speeds up the remove_copy() by roughly 10% + auto const end = + reinterpret_cast(remove_safe(d_normalized.begin(), d_normalized.end(), 0, stream)); remove_copy_safe(begin, end, chars.data(), 0, stream); return cudf::make_strings_column(input.size(), diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 59cf611babb..fc2a0ef1660 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -148,10 +148,23 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) TEST_F(TextNormalizeTest, WithNormalizer) { - // These include punctuation, accents, whitespace, and CJK characters - auto input = cudf::test::strings_column_wrapper( - {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + auto long_row = + "this entry is intended to pad out past 256 bytes which is currently the block size"; + // the following include punctuation, accents, whitespace, and CJK characters + auto input = cudf::test::strings_column_wrapper({"abc£def", + "", + "éè â îô\taeio", + "\tĂĆĖÑ Ü", + "ACEN U", + "P^NP", + "$41.07", + "[a,b]", + "丏丟", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); auto const sv = cudf::strings_column_view(input); @@ -166,8 +179,11 @@ TEST_F(TextNormalizeTest, WithNormalizer) " $ 41 . 
07", " [ a , b ] ", " 丏 丟 ", - ""}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); normalizer = nvtext::create_character_normalizer(false); @@ -181,17 +197,25 @@ TEST_F(TextNormalizeTest, WithNormalizer) " $ 41 . 07", " [ a , b ] ", " 丏 丟 ", - ""}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(TextNormalizeTest, SpecialTokens) { + auto long_row = + "this entry is intended to pad out past 256 bytes which is currently the block size"; auto input = cudf::test::strings_column_wrapper({"[BOS]Some strings with [PAD] special[SEP]tokens[EOS]", "[bos]these should[sep]work too[eos]", - "some[non]tokens[eol]too"}); + "some[non]tokens[eol]too", + long_row, + long_row, + long_row}); auto sv = cudf::strings_column_view(input); auto special_tokens = cudf::test::strings_column_wrapper({"[BOS]", "[EOS]", "[SEP]", "[PAD]"}); @@ -202,7 +226,10 @@ TEST_F(TextNormalizeTest, SpecialTokens) auto expected = cudf::test::strings_column_wrapper( {" [bos] some strings with [pad] special [sep] tokens [eos] ", " [bos] these should [sep] work too [eos] ", - "some [ non ] tokens [ eol ] too"}); + "some [ non ] tokens [ eol ] too", + long_row, + long_row, + long_row}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); normalizer = nvtext::create_character_normalizer(false, stv); @@ -210,7 +237,10 @@ TEST_F(TextNormalizeTest, SpecialTokens) expected = cudf::test::strings_column_wrapper( {" [BOS] Some strings with [PAD] special [SEP] tokens [EOS] ", " [ bos ] these should [ sep ] work too [ eos ] ", - "some [ non ] tokens [ eol ] too"}); + "some [ non ] tokens [ eol ] too", + long_row, + long_row, + long_row}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } From 756fa6da3928ce9cf79b153632208b99d76c648f Mon Sep 17 00:00:00 2001 From: 
David Wendt Date: Tue, 4 Feb 2025 15:41:15 -0500 Subject: [PATCH 13/21] fix typos --- cpp/include/nvtext/normalize.hpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 4bd257b7d45..4f55a5f7b8e 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -133,11 +133,9 @@ std::unique_ptr normalize_characters( * accent is removed. * * If `special_tokens` are included the padding after `[` and before `]` is not - * inserted if the character between them match one of the given tokens. - * If `do_lower_case = true` the `special_tokens` are expected to contain - * lower-case characters. + * inserted if the characters between them match one of the given tokens. * Also, the `special_tokens` are expected to include the `[]` characters - * at the beginning of and end of each string respectively. + * at the beginning of and end of each string appropriately. */ struct character_normalizer { /** @@ -148,7 +146,7 @@ struct character_normalizer { * @param do_lower_case If true, upper-case characters are converted to * lower-case and accents are stripped from those characters. * If false, accented and upper-case characters are not transformed. - * @param special_tokens Individual sequences including `[]` brackets. + * @param special_tokens Each row is a token including the `[]` brackets. * For example: `[BOS]`, `[EOS]`, `[UNK]`, `[SEP]`, `[PAD]`, `[CLS]`, `[MASK]` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory @@ -174,7 +172,7 @@ struct character_normalizer { * @param do_lower_case If true, upper-case characters are converted to * lower-case and accents are stripped from those characters. * If false, accented and upper-case characters are not transformed. - * @param special_tokens Individual sequences including `[]` brackets. 
+ * @param special_tokens Individual tokens including `[]` brackets. * Default is no special tokens. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory From 2a2b9da135f7f08e2365648d6f96ca2de59b72e6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 4 Feb 2025 15:41:39 -0500 Subject: [PATCH 14/21] remove unneeded includes --- cpp/src/text/normalize.cuh | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/cpp/src/text/normalize.cuh b/cpp/src/text/normalize.cuh index 8b1f4408ee8..3972726d536 100644 --- a/cpp/src/text/normalize.cuh +++ b/cpp/src/text/normalize.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2025, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,12 +18,6 @@ #include "text/subword/detail/cp_data.h" -#include -#include -#include - -#include - namespace nvtext { namespace detail { From c205062e620ee6f01d010ccf2212a92d06ec046c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 4 Feb 2025 15:41:58 -0500 Subject: [PATCH 15/21] remove unneeded variable --- cpp/src/text/normalize.cu | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index e37da41c8c3..501034766ec 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -390,8 +390,7 @@ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, { uint32_t replacement[MAX_NEW_CHARS] = {0}; - auto const idx = cudf::detail::grid_1d::global_thread_id(); - int8_t num_new_bytes = 0; + auto const idx = cudf::detail::grid_1d::global_thread_id(); if ((idx < total_bytes) && cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { auto const cp = [utf8 = d_chars + idx] { @@ -424,9 +423,7 @@ CUDF_KERNEL void 
data_normalizer_kernel(char const* d_chars, // convert codepoints back to UTF-8 in-place for (int k = 0; k < num_new_chars; ++k) { auto const new_cp = replacement[k]; - if (new_cp) { - num_new_bytes += cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); - } + if (new_cp) { cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); } } } } @@ -450,6 +447,7 @@ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, * @param offset Only non-zero if the input column has been sliced * @param size The number of output rows (sames as the number of input rows) * @param stream Stream used for allocating device memory and launching kernels + * @return The sizes of each output row */ template rmm::device_uvector compute_sizes(cudf::device_span d_normalized, From b46df42b96e32edd49a26c1e790d4a1a052a4d39 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 7 Feb 2025 14:04:28 -0500 Subject: [PATCH 16/21] fix comment formatting in python source --- python/cudf/cudf/core/character_normalizer.py | 11 ++++++----- python/pylibcudf/pylibcudf/nvtext/normalize.pyx | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/character_normalizer.py b/python/cudf/cudf/core/character_normalizer.py index 00ee43598f6..1240c0e1eb7 100644 --- a/python/cudf/cudf/core/character_normalizer.py +++ b/python/cudf/cudf/core/character_normalizer.py @@ -14,10 +14,10 @@ class CharacterNormalizer: Parameters ---------- do_lower : bool - Set to True if the normalizer should also lower-case + If True, the normalizer should also lower-case while normalizing. - special_tokens : cudf string series - Strings column of special tokens. + special_tokens : cudf.Series + Series of special tokens. """ def __init__( @@ -33,12 +33,13 @@ def normalize(self, text: cudf.Series) -> cudf.Series: """ Parameters ---------- - text : cudf string series + text : cudf.Series The strings to be normalized. 
Returns ------- - Normalized strings + cudf.Series + Normalized strings """ result = text._column.normalize_characters(self.normalizer) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx index ada361c5e79..6a18c205841 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -13,7 +13,8 @@ __all__ = [ "CharacterNormalizer" "normalize_characters", "normalize_spaces", - "characters_normalize"] + "characters_normalize" +] cdef class CharacterNormalizer: """The normalizer object to be used with ``normalize_characters``. From 2d92791bd1c06f00051aad6ef4fb44609e3ea4bb Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 10 Feb 2025 18:59:17 -0500 Subject: [PATCH 17/21] change min() to cuda::std::min() --- cpp/src/text/normalize.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 501034766ec..f20a4102a6e 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -41,6 +41,7 @@ #include #include +#include #include #include #include @@ -342,7 +343,7 @@ CUDF_KERNEL void special_tokens_kernel(uint32_t* d_normalized, if (idx >= total_count) { return; } auto const begin = d_normalized + (idx * MAX_NEW_CHARS) + 1; if (*begin != '[') { return; } - auto const end = begin + std::min(6L, total_count - idx) * MAX_NEW_CHARS; + auto const end = begin + cuda::std::min(6L, total_count - idx) * MAX_NEW_CHARS; auto const match = thrust::find(thrust::seq, begin, end, static_cast(']')); if (match == end) { return; } char candidate[8]; From 10e3e4b05095dbf6ee913d4b452b71f3279c98f8 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 13 Feb 2025 14:44:09 -0500 Subject: [PATCH 18/21] add get_first_and_last_offset utility --- cpp/include/cudf/strings/detail/utilities.hpp | 14 +++++++++++++- cpp/src/strings/utilities.cu | 14 +++++++++++++- cpp/src/text/normalize.cu | 9 ++------- 
cpp/tests/text/normalize_tests.cpp | 8 ++++++++ 4 files changed, 36 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index d276c5df7dc..8fb1f30f961 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -96,5 +96,17 @@ int64_t get_offset_value(cudf::column_view const& offsets, size_type index, rmm::cuda_stream_view stream); +/** + * @brief Return the first and last offset in the given strings column + * + * This accounts for sliced input columns as well. + * + * @param input Strings column + * @param stream CUDA stream used for device memory operations and kernel launches + * @return First and last offset values + */ +std::pair get_first_and_last_offset(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream); + } // namespace strings::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 45bd4615435..c5d46598d4a 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -180,6 +180,18 @@ int64_t get_offset_value(cudf::column_view const& offsets, : cudf::detail::get_value(offsets, index, stream); } +std::pair get_first_and_last_offset(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream) +{ + if (input.is_empty()) { return {0L, 0L}; } + auto const first_offset = (input.offset() == 0) ? 
0 + : cudf::strings::detail::get_offset_value( + input.offsets(), input.offset(), stream); + auto const last_offset = + cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream); + return {first_offset, last_offset}; +} + } // namespace detail rmm::device_uvector create_string_vector_from_column( diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index f20a4102a6e..931ed2de4bb 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -545,13 +545,8 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con { if (input.is_empty()) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); } - auto const first_offset = (input.offset() == 0) ? 0 - : cudf::strings::detail::get_offset_value( - input.offsets(), input.offset(), stream); - auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1) - ? input.chars_size(stream) - : cudf::strings::detail::get_offset_value( - input.offsets(), input.size() + input.offset(), stream); + auto [first_offset, last_offset] = + cudf::strings::detail::get_first_and_last_offset(input, stream); auto const chars_size = last_offset - first_offset; auto const d_input_chars = input.chars_begin(stream) + first_offset; diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index fc2a0ef1660..530148eb654 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -185,6 +185,8 @@ TEST_F(TextNormalizeTest, WithNormalizer) long_row}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // test normalizer re-use + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); normalizer = nvtext::create_character_normalizer(false); results = nvtext::normalize_characters(sv, *normalizer); @@ -203,6 +205,8 @@ TEST_F(TextNormalizeTest, WithNormalizer) long_row}, {1, 0, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(TextNormalizeTest, SpecialTokens) @@ -231,6 +235,8 @@ TEST_F(TextNormalizeTest, SpecialTokens) long_row, long_row}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // and again + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); normalizer = nvtext::create_character_normalizer(false, stv); results = nvtext::normalize_characters(sv, *normalizer); @@ -242,6 +248,8 @@ TEST_F(TextNormalizeTest, SpecialTokens) long_row, long_row}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // and again + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(TextNormalizeTest, NormalizeSlicedColumn) From 29bee66157b2860eeb57f32c2e81d7c463ce79e7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 18 Feb 2025 13:39:08 -0500 Subject: [PATCH 19/21] change _impl to smart-pointer --- cpp/include/nvtext/normalize.hpp | 2 +- cpp/src/text/normalize.cu | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 4f55a5f7b8e..70ee7891ad7 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -158,7 +158,7 @@ struct character_normalizer { ~character_normalizer(); struct character_normalizer_impl; - character_normalizer_impl* _impl{}; + std::unique_ptr _impl; }; /** diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 931ed2de4bb..0e680e98ec5 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -301,14 +301,14 @@ character_normalizer::character_normalizer(bool do_lower_case, auto tokens_view = cudf::strings::detail::create_string_vector_from_column( cudf::strings_column_view(sorted->view()), stream, 
cudf::get_current_device_resource_ref()); - _impl = new character_normalizer_impl(std::move(cp_metadata), - std::move(aux_table), - do_lower_case, - std::move(sorted), - std::move(tokens_view)); + _impl = std::make_unique(std::move(cp_metadata), + std::move(aux_table), + do_lower_case, + std::move(sorted), + std::move(tokens_view)); } -character_normalizer::~character_normalizer() { delete _impl; } +character_normalizer::~character_normalizer() {} std::unique_ptr create_character_normalizer( bool do_lower_case, @@ -556,7 +556,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con cudf::detail::grid_1d grid{chars_size, block_size}; auto const max_new_char_total = cudf::util::round_up_safe(chars_size, block_size) * MAX_NEW_CHARS; - auto const parameters = normalizer._impl; + auto const& parameters = normalizer._impl; auto d_normalized = rmm::device_uvector(max_new_char_total, stream); data_normalizer_kernel<<>>( From e3fdb2d8e38cdb808a2e77f4d4d0b714dd3a30ab Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 19 Feb 2025 07:24:46 -0500 Subject: [PATCH 20/21] fix docstring --- python/cudf/cudf/core/column/string.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 37615347a47..818a6d9ad8a 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4679,8 +4679,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" Normalizes strings characters for tokenizing. 
- This uses the normalizer that is built into the - subword_tokenize function which includes: + The normalizer function includes: - adding padding around punctuation (unicode category starts with "P") as well as certain ASCII symbols like "^" and "$" From 56041b5de2482846080c22229483e06510506ee3 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 21 Feb 2025 11:24:37 -0500 Subject: [PATCH 21/21] add deprecation warning --- python/cudf/cudf/core/column/string.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 818a6d9ad8a..c0ad33ec7d6 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4679,6 +4679,9 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" Normalizes strings characters for tokenizing. + .. deprecated:: 25.04 + Use `CharacterNormalizer` instead. + The normalizer function includes: - adding padding around punctuation (unicode category starts with @@ -4719,6 +4722,11 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: 2 $ 99 dtype: object """ + warnings.warn( + "normalize_characters is deprecated and will be removed in a future " + "version. Use CharacterNormalizer instead.", + FutureWarning, + ) return self._return_or_inplace( self._column.characters_normalize(do_lower) )