diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 594dc0de28a..494d5722ae4 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -48,8 +48,11 @@ static void bench_normalize(nvbench::state& state) [&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); }); } else { bool const to_lower = (normalize_type == "to_lower"); + // we expect the normalizer to be created once and re-used + // so creating it is not measured + auto normalizer = nvtext::create_character_normalizer(to_lower); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = nvtext::normalize_characters(input, to_lower); + auto result = nvtext::normalize_characters(input, *normalizer); }); } } @@ -57,6 +60,6 @@ static void bench_normalize(nvbench::state& state) NVBENCH_BENCH(bench_normalize) .set_name("normalize") .add_int64_axis("min_width", {0}) - .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("max_width", {128, 256}) .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"spaces", "characters", "to_lower"}); diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index d276c5df7dc..8fb1f30f961 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -96,5 +96,17 @@ int64_t get_offset_value(cudf::column_view const& offsets, size_type index, rmm::cuda_stream_view stream); +/** + * @brief Return the first and last offset in the given strings column + * + * This accounts for sliced input columns as well. + * + * @param input Strings column + * @param stream CUDA stream used for device memory operations and kernel launches + * @return First and last offset values + */ +std::pair get_first_and_last_offset(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream); + } // namespace strings::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 74325f4a406..70ee7891ad7 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -107,5 +108,113 @@ std::unique_ptr normalize_characters( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Normalizer object to be used with nvtext::normalize_characters + * + * Use nvtext::create_character_normalizer to create this object. + * + * This normalizer includes: + * + * - adding padding around punctuation (unicode category starts with "P") + * as well as certain ASCII symbols like "^" and "$" + * - adding padding around the [CJK Unicode block + * characters](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)) + * - changing whitespace (e.g. `"\t", "\n", "\r"`) to just space `" "` + * - removing control characters (unicode categories "Cc" and "Cf") + * + * The padding process adds a single space before and after the character. 
+ * Details on _unicode category_ can be found here: + * https://unicodebook.readthedocs.io/unicode.html#categories + * + * If `do_lower_case = true`, lower-casing also removes any accents. The + * accents cannot be removed from upper-case characters without lower-casing + * and lower-casing cannot be performed without also removing accents. + * However, if the accented character is already lower-case, then only the + * accent is removed. + * + * If `special_tokens` are included the padding after `[` and before `]` is not + * inserted if the characters between them match one of the given tokens. + * Also, the `special_tokens` are expected to include the `[]` characters + * at the beginning of and end of each string appropriately. + */ +struct character_normalizer { + /** + * @brief Normalizer object constructor + * + * This initializes and holds the character normalizing tables and settings. + * + * @param do_lower_case If true, upper-case characters are converted to + * lower-case and accents are stripped from those characters. + * If false, accented and upper-case characters are not transformed. + * @param special_tokens Each row is a token including the `[]` brackets. 
+ * For example: `[BOS]`, `[EOS]`, `[UNK]`, `[SEP]`, `[PAD]`, `[CLS]`, `[MASK]` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ + character_normalizer(bool do_lower_case, + cudf::strings_column_view const& special_tokens, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + ~character_normalizer(); + + struct character_normalizer_impl; + std::unique_ptr _impl; +}; + +/** + * @brief Create a normalizer object + * + * Creates a normalizer object which can be reused on multiple calls to + * nvtext::normalize_characters + * + * @see nvtext::character_normalizer + * + * @param do_lower_case If true, upper-case characters are converted to + * lower-case and accents are stripped from those characters. + * If false, accented and upper-case characters are not transformed. + * @param special_tokens Individual tokens including `[]` brackets. + * Default is no special tokens. 
+ * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Object to be used with nvtext::normalize_characters + */ +std::unique_ptr create_character_normalizer( + bool do_lower_case, + cudf::strings_column_view const& special_tokens = cudf::strings_column_view(cudf::column_view{ + cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0}), + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Normalizes the text in input strings column + * + * @see nvtext::character_normalizer for details on the normalizer behavior + * + * @code{.pseudo} + * cn = create_character_normalizer(true) + * s = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + * s1 = normalize_characters(s,cn) + * s1 is now ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + * + * cn = create_character_normalizer(false) + * s2 = normalize_characters(s,cn) + * s2 is now ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + * @endcode + * + * A null input element at row `i` produces a corresponding null entry + * for row `i` in the output column. 
+ * + * @param input The input strings to normalize + * @param normalizer Normalizer to use for this function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to allocate any returned objects + * @return Normalized strings column + */ +std::unique_ptr normalize_characters( + cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 45bd4615435..c5d46598d4a 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -180,6 +180,18 @@ int64_t get_offset_value(cudf::column_view const& offsets, : cudf::detail::get_value(offsets, index, stream); } +std::pair get_first_and_last_offset(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream) +{ + if (input.is_empty()) { return {0L, 0L}; } + auto const first_offset = (input.offset() == 0) ? 0 + : cudf::strings::detail::get_offset_value( + input.offsets(), input.offset(), stream); + auto const last_offset = + cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream); + return {first_offset, last_offset}; +} + } // namespace detail rmm::device_uvector create_string_vector_from_column( diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 7e2b766862d..0e680e98ec5 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "text/normalize.cuh" #include "text/subword/detail/data_normalizer.hpp" #include "text/subword/detail/tokenizer_utils.cuh" #include "text/utilities/tokenize_ops.cuh" @@ -22,10 +23,11 @@ #include #include #include -#include #include #include #include +#include +#include #include #include #include @@ -38,9 +40,13 @@ #include +#include +#include +#include #include #include #include +#include #include #include @@ -103,6 +109,12 @@ constexpr uint32_t UTF8_1BYTE = 0x0080; constexpr uint32_t UTF8_2BYTE = 0x0800; constexpr uint32_t UTF8_3BYTE = 0x01'0000; +__device__ int8_t cp_to_utf8(uint32_t codepoint, char* out) +{ + auto utf8 = cudf::strings::detail::codepoint_to_utf8(codepoint); + return cudf::strings::detail::from_char_utf8(utf8, out); +} + /** * @brief Convert code-point arrays into UTF-8 bytes for each string. 
*/ @@ -148,26 +160,8 @@ struct codepoint_to_utf8_fn { // convert each code-point to 1-4 UTF-8 encoded bytes char* out_ptr = d_chars + d_offsets[idx]; for (uint32_t jdx = 0; jdx < count; ++jdx) { - uint32_t code_point = *str_cps++; - if (code_point < UTF8_1BYTE) // ASCII range - *out_ptr++ = static_cast(code_point); - else if (code_point < UTF8_2BYTE) { // create two-byte UTF-8 - // b00001xxx:byyyyyyyy => b110xxxyy:b10yyyyyy - *out_ptr++ = static_cast((((code_point << 2) & 0x00'1F00) | 0x00'C000) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } else if (code_point < UTF8_3BYTE) { // create three-byte UTF-8 - // bxxxxxxxx:byyyyyyyy => b1110xxxx:b10xxxxyy:b10yyyyyy - *out_ptr++ = static_cast((((code_point << 4) & 0x0F'0000) | 0x00E0'0000) >> 16); - *out_ptr++ = static_cast((((code_point << 2) & 0x00'3F00) | 0x00'8000) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } else { // create four-byte UTF-8 - // maximum code-point value is 0x0011'0000 - // b000xxxxx:byyyyyyyy:bzzzzzzzz => b11110xxx:b10xxyyyy:b10yyyyzz:b10zzzzzz - *out_ptr++ = static_cast((((code_point << 6) & 0x0700'0000u) | 0xF000'0000u) >> 24); - *out_ptr++ = static_cast((((code_point << 4) & 0x003F'0000u) | 0x0080'0000u) >> 16); - *out_ptr++ = static_cast((((code_point << 2) & 0x00'3F00u) | 0x00'8000u) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } + uint32_t codepoint = *str_cps++; + out_ptr += cp_to_utf8(codepoint, out_ptr); } } }; @@ -261,4 +255,361 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con return detail::normalize_characters(input, do_lower_case, stream, mr); } +struct character_normalizer::character_normalizer_impl { + rmm::device_uvector cp_metadata; + rmm::device_uvector aux_table; + bool do_lower_case; + std::unique_ptr special_tokens; + rmm::device_uvector special_tokens_view; + + cudf::device_span get_special_tokens() const + { + return special_tokens_view; + } + + 
character_normalizer_impl(rmm::device_uvector&& cp_metadata, + rmm::device_uvector&& aux_table, + bool do_lower_case, + std::unique_ptr&& special_tokens, + rmm::device_uvector&& special_tokens_view) + : cp_metadata(std::move(cp_metadata)), + aux_table(std::move(aux_table)), + do_lower_case{do_lower_case}, + special_tokens{std::move(special_tokens)}, + special_tokens_view{std::move(special_tokens_view)} + { + } +}; + +character_normalizer::character_normalizer(bool do_lower_case, + cudf::strings_column_view const& special_tokens, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref) +{ + auto cp_metadata = nvtext::detail::get_codepoint_metadata(stream); + auto aux_table = nvtext::detail::get_aux_codepoint_data(stream); + CUDF_EXPECTS( + !special_tokens.has_nulls(), "special tokens should not have nulls", std::invalid_argument); + + auto sorted = std::move( + cudf::sort(cudf::table_view({special_tokens.parent()}), {}, {}, stream)->release().front()); + if (do_lower_case) { + // lower-case the tokens so they will match the normalized input + sorted = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); + } + + auto tokens_view = cudf::strings::detail::create_string_vector_from_column( + cudf::strings_column_view(sorted->view()), stream, cudf::get_current_device_resource_ref()); + + _impl = std::make_unique(std::move(cp_metadata), + std::move(aux_table), + do_lower_case, + std::move(sorted), + std::move(tokens_view)); +} + +character_normalizer::~character_normalizer() {} + +std::unique_ptr create_character_normalizer( + bool do_lower_case, + cudf::strings_column_view const& special_tokens, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return std::make_unique(do_lower_case, special_tokens, stream, mr); +} + +namespace detail { +namespace { + +/** + * @brief Kernel handles fixing up the normalized data to account for any special tokens + * + * This undoes the padding added around the 
`[]` for patterns matching the strings in the + * special_tokens array. + * + * Launched as a thread per input byte (total_count). + * + * @param d_normalized The normalized set of UTF-8 characters; 3 uints per input byte + * @param total_count Number of bytes represented by d_normalized; len(d_normalized)/3 + * @param special_tokens Tokens to check against + */ +CUDF_KERNEL void special_tokens_kernel(uint32_t* d_normalized, + int64_t total_count, + cudf::device_span special_tokens) +{ + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= total_count) { return; } + auto const begin = d_normalized + (idx * MAX_NEW_CHARS) + 1; + if (*begin != '[') { return; } + auto const end = begin + cuda::std::min(6L, total_count - idx) * MAX_NEW_CHARS; + auto const match = thrust::find(thrust::seq, begin, end, static_cast(']')); + if (match == end) { return; } + char candidate[8]; + auto const ch_begin = + thrust::transform_iterator(begin, [](auto v) { return static_cast(v); }); + auto const ch_end = ch_begin + thrust::distance(begin, match + 1); + auto last = thrust::copy_if( + thrust::seq, ch_begin, ch_end, candidate, [](auto c) { return c != 0 && c != ' '; }); + *last = 0; // only needed for debug + + auto const size = static_cast(thrust::distance(candidate, last)); + auto const token = cudf::string_view(candidate, size); + // the binary_search expects the special_tokens to be sorted + if (!thrust::binary_search(thrust::seq, special_tokens.begin(), special_tokens.end(), token)) { + return; + } + + // fix up chars to remove the extra spaces + *(begin + 1) = 0; // removes space after '[' + *(match - 1) = 0; // removes space before ']' +} + +/** + * @brief The normalizer kernel + * + * Launched as a thread per input byte (total_bytes). + * + * Converts the input d_chars into codepoints to lookup in the provided tables. + * Once processed, the d_output contains 3 uints per input byte each encoded + * as output UTF-8. 
Any zero values are to be removed by a subsequent kernel call. + * + * @param d_chars The characters for the input strings column to normalize + * @param total_bytes The number of bytes in the d_chars + * @param cp_metadata First lookup table for codepoint metadata + * @param aux_table Second lookup table containing possible replacement characters + * @param do_lower_case True if the normalization includes lower-casing characters + * @param d_output The output of the normalization (UTF-8 encoded) + */ +CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, + int64_t total_bytes, + codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case, + uint32_t* d_output) +{ + uint32_t replacement[MAX_NEW_CHARS] = {0}; + + auto const idx = cudf::detail::grid_1d::global_thread_id(); + + if ((idx < total_bytes) && cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { + auto const cp = [utf8 = d_chars + idx] { + cudf::char_utf8 ch_utf8 = *utf8; + if (ch_utf8 > 0x7F) { cudf::strings::detail::to_char_utf8(utf8, ch_utf8); } + return cudf::strings::detail::utf8_to_codepoint(ch_utf8); + }(); + auto const metadata = cp_metadata[cp]; + + if (!should_remove_cp(metadata, do_lower_case)) { + int8_t num_new_chars = 1; + // retrieve the normalized value for cp + auto const new_cp = do_lower_case || always_replace(metadata) ? get_first_cp(metadata) : cp; + replacement[0] = new_cp == 0 ? 
cp : new_cp; + + if (do_lower_case && is_multi_char_transform(metadata)) { + auto const next_cps = aux_table[cp]; + replacement[1] = static_cast(next_cps >> 32); + replacement[2] = static_cast(next_cps & 0xFFFFFFFF); + num_new_chars = 2 + (replacement[2] != 0); + } + + if (should_add_spaces(metadata, do_lower_case) && (num_new_chars == 1)) { + replacement[1] = replacement[0]; + replacement[0] = SPACE_CODE_POINT; // add spaces around the new codepoint + replacement[2] = SPACE_CODE_POINT; + num_new_chars = 3; + } + + // convert codepoints back to UTF-8 in-place + for (int k = 0; k < num_new_chars; ++k) { + auto const new_cp = replacement[k]; + if (new_cp) { cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); } + } + } + } + + // employ an optimized coalesced writer to output replacement as a block of transposed data + using block_store = + cub::BlockStore; + __shared__ typename block_store::TempStorage bs_stg; + auto block_base = d_output + blockIdx.x * blockDim.x * MAX_NEW_CHARS; + block_store(bs_stg).Store(block_base, replacement); +} + +/** + * @brief Computes the output sizes for each row + * + * The input offsets are used with segmented-reduce to count the number of + * non-zero values for each output row. 
+ * + * @param d_normalized The UTF-8 encoded normalized values + * @param offsets These identify the row boundaries + * @param offset Only non-zero if the input column has been sliced + * @param size The number of output rows (same as the number of input rows) + * @param stream Stream used for allocating device memory and launching kernels + * @return The sizes of each output row + */ +template +rmm::device_uvector compute_sizes(cudf::device_span d_normalized, + OffsetType offsets, + int64_t offset, + cudf::size_type size, + rmm::cuda_stream_view stream) +{ + auto output_sizes = rmm::device_uvector(size, stream); + + auto d_data = d_normalized.data(); + + // counts the non-zero bytes in the d_data array + auto d_in = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([d_data] __device__(auto idx) { + idx = idx * MAX_NEW_CHARS; + // transform function counts number of non-zero bytes in uint32_t value + auto tfn = [](uint32_t v) -> cudf::size_type { + return ((v & 0xFF) > 0) + ((v & 0xFF00) > 0) + ((v & 0xFF0000) > 0) + + ((v & 0xFF000000) > 0); + }; + auto const begin = d_data + idx; + auto const end = begin + MAX_NEW_CHARS; + return thrust::transform_reduce(thrust::seq, begin, end, tfn, 0, thrust::plus{}); + })); + + // DeviceSegmentedReduce is used to compute the size of each output row + auto d_out = output_sizes.begin(); + auto temp = std::size_t{0}; + if (offset == 0) { + cub::DeviceSegmentedReduce::Sum( + nullptr, temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); + auto d_temp = rmm::device_buffer{temp, stream}; + cub::DeviceSegmentedReduce::Sum( + d_temp.data(), temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); + } else { + // offsets need to be normalized for segmented-reduce to work efficiently + auto offsets_itr = thrust::transform_iterator( + offsets, + cuda::proclaim_return_type([offset] __device__(auto o) { return o - offset; })); + cub::DeviceSegmentedReduce::Sum( + nullptr, temp, d_in, d_out, 
size, offsets_itr, offsets_itr + 1, stream.value()); + auto d_temp = rmm::device_buffer{temp, stream}; + cub::DeviceSegmentedReduce::Sum( + d_temp.data(), temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value()); + } + + return output_sizes; +} + +// handles ranges above int32 max +template +OutputIterator remove_copy_safe(InputIterator first, + InputIterator last, + OutputIterator result, + T const& value, + rmm::cuda_stream_view stream) +{ + auto const copy_size = std::min(static_cast(std::distance(first, last)), + static_cast(std::numeric_limits::max())); + + auto itr = first; + while (itr != last) { + auto const copy_end = + static_cast(std::distance(itr, last)) <= copy_size ? last : itr + copy_size; + result = thrust::remove_copy(rmm::exec_policy(stream), itr, copy_end, result, value); + itr = copy_end; + } + return result; +} + +// handles ranges above int32 max +template +Iterator remove_safe(Iterator first, Iterator last, T const& value, rmm::cuda_stream_view stream) +{ + auto const size = std::min(static_cast(std::distance(first, last)), + static_cast(std::numeric_limits::max())); + + auto result = first; + auto itr = first; + while (itr != last) { + auto end = static_cast(std::distance(itr, last)) <= size ? 
last : itr + size; + result = thrust::remove(rmm::exec_policy(stream), itr, end, value); + itr = end; + } + return result; +} +} // namespace + +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + if (input.is_empty()) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); } + + auto [first_offset, last_offset] = + cudf::strings::detail::get_first_and_last_offset(input, stream); + auto const chars_size = last_offset - first_offset; + auto const d_input_chars = input.chars_begin(stream) + first_offset; + + if (chars_size == 0) { return std::make_unique(input.parent(), stream, mr); } + + constexpr int64_t block_size = 256; + cudf::detail::grid_1d grid{chars_size, block_size}; + auto const max_new_char_total = cudf::util::round_up_safe(chars_size, block_size) * MAX_NEW_CHARS; + + auto const& parameters = normalizer._impl; + + auto d_normalized = rmm::device_uvector(max_new_char_total, stream); + data_normalizer_kernel<<>>( + d_input_chars, + chars_size, + parameters->cp_metadata.data(), + parameters->aux_table.data(), + parameters->do_lower_case, + d_normalized.data()); + + // This removes space added around any special tokens in the form of [ttt]. + // An alternate approach is to do a multi-replace of '[ ttt ]' with '[ttt]' right + // before returning the output strings column. 
+ auto const special_tokens = parameters->get_special_tokens(); + if (!special_tokens.empty()) { + special_tokens_kernel<<>>( + d_normalized.data(), chars_size, special_tokens); + } + + // Use segmented-reduce over the non-zero codepoints to get the size of the output rows + auto const input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + auto output_sizes = + compute_sizes(d_normalized, input_offsets, first_offset, input.size(), stream); + + // convert the sizes to offsets + auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( + output_sizes.begin(), output_sizes.end(), stream, mr); + + // create output chars by calling remove_copy(0) on the bytes in d_normalized + auto chars = rmm::device_uvector(total_size, stream, mr); + auto const begin = reinterpret_cast(d_normalized.begin()); + // the remove() above speeds up the remove_copy() by roughly 10% + auto const end = + reinterpret_cast(remove_safe(d_normalized.begin(), d_normalized.end(), 0, stream)); + remove_copy_safe(begin, end, chars.data(), 0, stream); + + return cudf::make_strings_column(input.size(), + std::move(offsets), + chars.release(), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr)); +} + +} // namespace detail + +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::normalize_characters(input, normalizer, stream, mr); +} + } // namespace nvtext diff --git a/cpp/src/text/normalize.cuh b/cpp/src/text/normalize.cuh new file mode 100644 index 00000000000..3972726d536 --- /dev/null +++ b/cpp/src/text/normalize.cuh @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "text/subword/detail/cp_data.h" + +namespace nvtext { +namespace detail { + +/** + * @brief Bit used to filter out invalid code points. + * + * When normalizing characters to code point values, if this bit is set, + * the code point should be filtered out before returning from the normalizer. + */ +constexpr uint32_t FILTER_BIT = 22; + +/** + * @brief Retrieve new code point from metadata value. + * + * @param metadata Value from the codepoint_metadata table. + * @return The replacement character if appropriate. + */ +__device__ constexpr uint32_t get_first_cp(uint32_t metadata) { return metadata & NEW_CP_MASK; } + +/** + * @brief Retrieve token category from the metadata value. + * + * Category values are 0-5: + * 0 - character should be padded + * 1 - pad character if lower-case + * 2 - character should be removed + * 3 - remove character if lower-case + * 4 - whitespace character -- always replace + * 5 - uncategorized + * + * @param metadata Value from the codepoint_metadata table. + * @return Category value. + */ +__device__ constexpr uint32_t extract_token_cat(uint32_t metadata) +{ + return (metadata >> TOKEN_CAT_SHIFT) & TOKEN_CAT_MASK; +} + +/** + * @brief Return true if category of metadata value specifies the character should be removed. 
+ */ +__device__ constexpr bool should_remove_cp(uint32_t metadata, bool lower_case) +{ + auto const cat = extract_token_cat(metadata); + return (cat == TOKEN_CAT_REMOVE_CHAR) || (lower_case && (cat == TOKEN_CAT_REMOVE_CHAR_IF_LOWER)); +} + +/** + * @brief Return true if category of metadata value specifies the character should be padded. + */ +__device__ constexpr bool should_add_spaces(uint32_t metadata, bool lower_case) +{ + auto const cat = extract_token_cat(metadata); + return (cat == TOKEN_CAT_ADD_SPACE) || (lower_case && (cat == TOKEN_CAT_ADD_SPACE_IF_LOWER)); +} + +/** + * @brief Return true if category of metadata value specifies the character should be replaced. + */ +__device__ constexpr bool always_replace(uint32_t metadata) +{ + return extract_token_cat(metadata) == TOKEN_CAT_ALWAYS_REPLACE; +} + +/** + * @brief Returns true if metadata value includes a multi-character transform bit equal to 1. + */ +__device__ constexpr bool is_multi_char_transform(uint32_t metadata) +{ + return (metadata >> MULTICHAR_SHIFT) & MULTICHAR_MASK; +} + +/** + * @brief Returns true if the byte passed in could be a valid head byte for + * a utf8 character. That is, not binary `10xxxxxx` + */ +__device__ constexpr bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } + +} // namespace detail +} // namespace nvtext diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 7a39199011e..4c54409c41a 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "text/normalize.cuh" #include "text/subword/detail/data_normalizer.hpp" #include "text/subword/detail/tokenizer_utils.cuh" @@ -38,81 +39,6 @@ namespace nvtext { namespace detail { namespace { -/** - * @brief Bit used to filter out invalid code points. 
- * - * When normalizing characters to code point values, if this bit is set, - * the code point should be filtered out before returning from the normalizer. - */ -constexpr uint32_t FILTER_BIT = 22; - -/** - * @brief Retrieve new code point from metadata value. - * - * @param metadata Value from the codepoint_metadata table. - * @return The replacement character if appropriate. - */ -__device__ uint32_t get_first_cp(uint32_t metadata) { return metadata & NEW_CP_MASK; } - -/** - * @brief Retrieve token category from the metadata value. - * - * Category values are 0-5: - * 0 - character should be padded - * 1 - pad character if lower-case - * 2 - character should be removed - * 3 - remove character if lower-case - * 4 - whitespace character -- always replace - * 5 - uncategorized - * - * @param metadata Value from the codepoint_metadata table. - * @return Category value. - */ -__device__ uint32_t extract_token_cat(uint32_t metadata) -{ - return (metadata >> TOKEN_CAT_SHIFT) & TOKEN_CAT_MASK; -} - -/** - * @brief Return true if category of metadata value specifies the character should be replaced. - */ -__device__ bool should_remove_cp(uint32_t metadata, bool lower_case) -{ - auto const cat = extract_token_cat(metadata); - return (cat == TOKEN_CAT_REMOVE_CHAR) || (lower_case && (cat == TOKEN_CAT_REMOVE_CHAR_IF_LOWER)); -} - -/** - * @brief Return true if category of metadata value specifies the character should be padded. - */ -__device__ bool should_add_spaces(uint32_t metadata, bool lower_case) -{ - auto const cat = extract_token_cat(metadata); - return (cat == TOKEN_CAT_ADD_SPACE) || (lower_case && (cat == TOKEN_CAT_ADD_SPACE_IF_LOWER)); -} - -/** - * @brief Return true if category of metadata value specifies the character should be replaced. 
- */ -__device__ bool always_replace(uint32_t metadata) -{ - return extract_token_cat(metadata) == TOKEN_CAT_ALWAYS_REPLACE; -} - -/** - * @brief Returns true if metadata value includes a multi-character transform bit equal to 1. - */ -__device__ bool is_multi_char_transform(uint32_t metadata) -{ - return (metadata >> MULTICHAR_SHIFT) & MULTICHAR_MASK; -} - -/** - * @brief Returns true if the byte passed in could be a valid head byte for - * a utf8 character. That is, not binary `10xxxxxx` - */ -__device__ bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } - /** * @brief Converts a UTF-8 character into a unicode code point value. * diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 2515cc917fa..530148eb654 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -74,6 +74,10 @@ TEST_F(TextNormalizeTest, NormalizeEmptyTest) EXPECT_EQ(results->size(), 0); results = nvtext::normalize_characters(strings_view, false); EXPECT_EQ(results->size(), 0); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + EXPECT_EQ(results->size(), 0); } TEST_F(TextNormalizeTest, AllNullStrings) @@ -84,6 +88,10 @@ TEST_F(TextNormalizeTest, AllNullStrings) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); results = nvtext::normalize_characters(strings_view, false); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); } TEST_F(TextNormalizeTest, SomeNullStrings) @@ -93,27 +101,21 @@ TEST_F(TextNormalizeTest, SomeNullStrings) auto results = nvtext::normalize_characters(strings_view, false); cudf::test::strings_column_wrapper expected({"", " . 
", "a"}, {false, true, true}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(TextNormalizeTest, NormalizeCharacters) { // These include punctuation, accents, whitespace, and CJK characters - std::vector h_strings{"abc£def", - nullptr, - "éè â îô\taeio", - "\tĂĆĖÑ Ü", - "ACEN U", - "P^NP", - "$41.07", - "[a,b]", - "丏丟", - ""}; - auto validity = - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); - cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); - cudf::strings_column_view strings_view(strings); + auto input = cudf::test::strings_column_wrapper( + {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + auto sv = cudf::strings_column_view(input); { - auto results = nvtext::normalize_characters(strings_view, true); + auto results = nvtext::normalize_characters(sv, true); cudf::test::strings_column_wrapper expected({"abc£def", "", "ee a io aeio", @@ -124,11 +126,11 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) " [ a , b ] ", " 丏 丟 ", ""}, - validity); + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = nvtext::normalize_characters(strings_view, false); + auto results = nvtext::normalize_characters(sv, false); cudf::test::strings_column_wrapper expected({"abc£def", "", "éè â îô aeio", @@ -139,11 +141,117 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) " [ a , b ] ", " 丏 丟 ", ""}, - validity); + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } +TEST_F(TextNormalizeTest, WithNormalizer) +{ + auto long_row = + "this entry is intended to pad out past 256 bytes which is currently the block size"; + // the following 
include punctuation, accents, whitespace, and CJK characters + auto input = cudf::test::strings_column_wrapper({"abc£def", + "", + "éè â îô\taeio", + "\tĂĆĖÑ Ü", + "ACEN U", + "P^NP", + "$41.07", + "[a,b]", + "丏丟", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + + auto const sv = cudf::strings_column_view(input); + + auto normalizer = nvtext::create_character_normalizer(true); + auto results = nvtext::normalize_characters(sv, *normalizer); + auto expected = cudf::test::strings_column_wrapper({"abc£def", + "", + "ee a io aeio", + " acen u", + "acen u", + "p ^ np", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // test normalizer re-use + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false); + results = nvtext::normalize_characters(sv, *normalizer); + expected = cudf::test::strings_column_wrapper({"abc£def", + "", + "éè â îô aeio", + " ĂĆĖÑ Ü", + "ACEN U", + "P ^ NP", + " $ 41 . 
07", + " [ a , b ] ", + " 丏 丟 ", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + +TEST_F(TextNormalizeTest, SpecialTokens) +{ + auto long_row = + "this entry is intended to pad out past 256 bytes which is currently the block size"; + auto input = + cudf::test::strings_column_wrapper({"[BOS]Some strings with [PAD] special[SEP]tokens[EOS]", + "[bos]these should[sep]work too[eos]", + "some[non]tokens[eol]too", + long_row, + long_row, + long_row}); + + auto sv = cudf::strings_column_view(input); + auto special_tokens = cudf::test::strings_column_wrapper({"[BOS]", "[EOS]", "[SEP]", "[PAD]"}); + auto stv = cudf::strings_column_view(special_tokens); + + auto normalizer = nvtext::create_character_normalizer(true, stv); + auto results = nvtext::normalize_characters(sv, *normalizer); + auto expected = cudf::test::strings_column_wrapper( + {" [bos] some strings with [pad] special [sep] tokens [eos] ", + " [bos] these should [sep] work too [eos] ", + "some [ non ] tokens [ eol ] too", + long_row, + long_row, + long_row}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // and again + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false, stv); + results = nvtext::normalize_characters(sv, *normalizer); + expected = cudf::test::strings_column_wrapper( + {" [BOS] Some strings with [PAD] special [SEP] tokens [EOS] ", + " [ bos ] these should [ sep ] work too [ eos ] ", + "some [ non ] tokens [ eol ] too", + long_row, + long_row, + long_row}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // and again + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} + TEST_F(TextNormalizeTest,
NormalizeSlicedColumn) { cudf::test::strings_column_wrapper strings( @@ -151,10 +259,21 @@ TEST_F(TextNormalizeTest, NormalizeSlicedColumn) std::vector sliced = cudf::split(strings, {4}); auto results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), true); - cudf::test::strings_column_wrapper expected({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + auto expected = + cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false); + expected = cudf::test::strings_column_wrapper({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer); + expected = cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false); - cudf::test::strings_column_wrapper expected2({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); + normalizer = nvtext::create_character_normalizer(false); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), *normalizer); + expected = cudf::test::strings_column_wrapper({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } diff --git a/python/cudf/cudf/core/character_normalizer.py b/python/cudf/cudf/core/character_normalizer.py new file mode 100644 index 00000000000..1240c0e1eb7 --- /dev/null +++ b/python/cudf/cudf/core/character_normalizer.py @@ -0,0 +1,46 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +from __future__ import annotations + +import pylibcudf as plc + +import cudf + + +class CharacterNormalizer: + """ + A normalizer object used to normalize input text. + + Parameters + ---------- + do_lower : bool + If True, the normalizer should also lower-case + while normalizing. + special_tokens : cudf.Series, optional + Series of special tokens; an empty Series is used when omitted. + """ + + def __init__( + self, do_lower: bool, special_tokens: cudf.Series | None = None + ) -> None: + if special_tokens is None: + special_tokens = cudf.Series([], dtype="object") + self.normalizer = plc.nvtext.normalize.CharacterNormalizer( + do_lower, special_tokens._column.to_pylibcudf(mode="read") + ) + + def normalize(self, text: cudf.Series) -> cudf.Series: + """ + Parameters + ---------- + text : cudf.Series + The strings to be normalized. + + Returns + ------- + cudf.Series + Normalized strings + """ + result = text._column.normalize_characters(self.normalizer) + + return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 04a72017c33..c0ad33ec7d6 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4679,8 +4679,10 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" Normalizes strings characters for tokenizing. - This uses the normalizer that is built into the - subword_tokenize function which includes: + .. deprecated:: 25.04 + Use `CharacterNormalizer` instead. + + The normalizer function includes: - adding padding around punctuation (unicode category starts with "P") as well as certain ASCII symbols like "^" and "$" @@ -4720,8 +4722,13 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: 2 $ 99 dtype: object """ + warnings.warn( + "normalize_characters is deprecated and will be removed in a future " + "version. 
Use CharacterNormalizer instead.", + FutureWarning, + ) return self._return_or_inplace( - self._column.normalize_characters(do_lower) + self._column.characters_normalize(do_lower) ) def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: @@ -6256,14 +6263,25 @@ def normalize_spaces(self) -> Self: ) @acquire_spill_lock() - def normalize_characters(self, do_lower: bool = True) -> Self: + def characters_normalize(self, do_lower: bool = True) -> Self: return ColumnBase.from_pylibcudf( # type: ignore[return-value] - plc.nvtext.normalize.normalize_characters( + plc.nvtext.normalize.characters_normalize( self.to_pylibcudf(mode="read"), do_lower, ) ) + @acquire_spill_lock() + def normalize_characters( + self, normalizer: plc.nvtext.normalize.CharacterNormalizer + ) -> Self: + return ColumnBase.from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_characters( + self.to_pylibcudf(mode="read"), + normalizer, + ) + ) + @acquire_spill_lock() def replace_tokens( self, targets: Self, replacements: Self, delimiter: plc.Scalar diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd index f8b082c8429..2cf2bfb8ac9 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -16,3 +16,16 @@ cdef extern from "nvtext/normalize.hpp" namespace "nvtext" nogil: const column_view & strings, bool do_lower_case ) except +libcudf_exception_handler + + cdef struct character_normalizer "nvtext::character_normalizer": + pass + + cdef unique_ptr[character_normalizer] create_character_normalizer( + bool do_lower_case, + const column_view & strings + ) except +libcudf_exception_handler + + cdef unique_ptr[column] normalize_characters( + const column_view & strings, + const character_normalizer & normalizer + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd index 90676145afa..e6688e19762 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd @@ -1,9 +1,18 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp cimport bool +from libcpp.memory cimport unique_ptr from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.normalize cimport character_normalizer +cdef class CharacterNormalizer: + cdef unique_ptr[character_normalizer] c_obj cpdef Column normalize_spaces(Column input) -cpdef Column normalize_characters(Column input, bool do_lower_case) +cpdef Column characters_normalize(Column input, bool do_lower_case) + +cpdef Column normalize_characters( + Column input, + CharacterNormalizer normalizer +) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi index 1d90a5a8960..d722ef6c79e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi @@ -1,6 +1,12 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from pylibcudf.column import Column +class CharacterNormalizer: + def __init__(self, do_lower_case: bool, special_tokens: Column): ... + def normalize_spaces(input: Column) -> Column: ... -def normalize_characters(input: Column, do_lower_case: bool) -> Column: ... +def characters_normalize(input: Column, do_lower_case: bool) -> Column: ... +def normalize_characters( + input: Column, normalizer: CharacterNormalizer +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx index b259ccaefa6..6a18c205841 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -1,16 +1,37 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. +from cython.operator cimport dereference from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.nvtext.normalize cimport ( - normalize_characters as cpp_normalize_characters, - normalize_spaces as cpp_normalize_spaces, -) +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext cimport normalize as cpp_normalize -__all__ = ["normalize_characters", "normalize_spaces"] +__all__ = [ + "CharacterNormalizer", + "normalize_characters", + "normalize_spaces", + "characters_normalize", +] + +cdef class CharacterNormalizer: + """The normalizer object to be used with ``normalize_characters``. + + For details, see :cpp:class:`nvtext::character_normalizer`. 
+ """ + def __cinit__(self, bool do_lower_case, Column tokens): + cdef column_view c_tokens = tokens.view() + with nogil: + self.c_obj = move( + cpp_normalize.create_character_normalizer( + do_lower_case, + c_tokens + ) + ) + + __hash__ = None cpdef Column normalize_spaces(Column input): """ @@ -32,12 +53,12 @@ cpdef Column normalize_spaces(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = cpp_normalize_spaces(input.view()) + c_result = cpp_normalize.normalize_spaces(input.view()) return Column.from_libcudf(move(c_result)) -cpdef Column normalize_characters(Column input, bool do_lower_case): +cpdef Column characters_normalize(Column input, bool do_lower_case): """ Normalizes strings characters for tokenizing. @@ -60,6 +81,38 @@ cpdef Column normalize_characters(Column input, bool do_lower_case): cdef unique_ptr[column] c_result with nogil: - c_result = cpp_normalize_characters(input.view(), do_lower_case) + c_result = cpp_normalize.normalize_characters( + input.view(), + do_lower_case + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column normalize_characters(Column input, CharacterNormalizer normalizer): + """ + Normalizes strings characters for tokenizing. 
+ + For details, see :cpp:func:`normalize_characters` + + Parameters + ---------- + input : Column + Input strings + normalizer : CharacterNormalizer + Normalizer object used for modifying the input column text + + Returns + ------- + Column + Normalized strings column + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_normalize.normalize_characters( + input.view(), + dereference(normalizer.c_obj.get()) + ) return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py index 25b6d1389ec..47bbb191be6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import pyarrow as pa import pytest @@ -15,7 +15,7 @@ def norm_spaces_input_data(): @pytest.fixture(scope="module") def norm_chars_input_data(): - arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]", "[pad]"] return pa.array(arr) @@ -29,15 +29,98 @@ def test_normalize_spaces(norm_spaces_input_data): @pytest.mark.parametrize("do_lower", [True, False]) def test_normalize_characters(norm_chars_input_data, do_lower): - result = plc.nvtext.normalize.normalize_characters( + result = plc.nvtext.normalize.characters_normalize( plc.interop.from_arrow(norm_chars_input_data), do_lower, ) - expected = pa.array( - ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + else: + expected = pa.array( + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 
08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalizer(norm_chars_input_data, do_lower): + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + plc.nvtext.normalize.CharacterNormalizer( + do_lower, + plc.column_factories.make_empty_column(plc.types.TypeId.STRING), + ), + ) + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + else: + expected = pa.array( + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalizer_with_special_tokens(norm_chars_input_data, do_lower): + special_tokens = pa.array(["[pad]"]) + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + plc.nvtext.normalize.CharacterNormalizer( + do_lower, plc.interop.from_arrow(special_tokens) + ), ) - if not do_lower: + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [pad] ", + ] + ) + else: expected = pa.array( - ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 08", + " [ a , bb ] ", + " [pad] ", + ] ) assert_column_eq(result, expected)