From dd51eb3ee1d2bbaefe1b1271752a288bc511fd6a Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 24 Jan 2025 16:22:55 -0500 Subject: [PATCH 01/21] Add new nvtext::normalize_characters API --- cpp/include/nvtext/normalize.hpp | 103 ++++++++- cpp/src/text/normalize.cu | 280 ++++++++++++++++++++++-- cpp/src/text/normalize.cuh | 106 +++++++++ cpp/src/text/subword/data_normalizer.cu | 76 +------ cpp/tests/text/normalize_tests.cpp | 72 ++++-- 5 files changed, 521 insertions(+), 116 deletions(-) create mode 100644 cpp/src/text/normalize.cuh diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 74325f4a406..10797164945 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -107,5 +107,106 @@ std::unique_ptr normalize_characters( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Normalizer object to be used with nvtext::normalize_characters + * + * Use nvtext::create_normalizer to create this object. + * + * This normalizer includes: + * + * - adding padding around punctuation (unicode category starts with "P") + * as well as certain ASCII symbols like "^" and "$" + * - adding padding around the [CJK Unicode block + * characters](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)) + * - changing whitespace (e.g. `"\t", "\n", "\r"`) to just space `" "` + * - removing control characters (unicode categories "Cc" and "Cf") + * + * The padding process here adds a single space before and after the character. 
+ * Details on _unicode category_ can be found here: + * https://unicodebook.readthedocs.io/unicode.html#categories + * + * If `do_lower_case = true`, lower-casing also removes the accents. The + * accents cannot be removed from upper-case characters without lower-casing + * and lower-casing cannot be performed without also removing accents. + * However, if the accented character is already lower-case, then only the + * accent is removed. + */ +struct character_normalizer { + /** + * @brief Normalizer object constructor + * + * This initializes and holds the character normalizing tables and settings. + * + * @param do_lower_case If true, upper-case characters are converted to + * lower-case and accents are stripped from those characters. + * If false, accented and upper-case characters are not transformed. + * @param allow_special_tokens If true, the following character sequences are not + * normalized if encountered in the input: + * `[BOS] [EOS] [UNK] [SEP] [PAD] [CLS] [MASK]` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + */ + character_normalizer(bool do_lower_case, + bool allow_special_tokens = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + ~character_normalizer(); + + struct character_normalizer_impl; + character_normalizer_impl* _impl{}; +}; + +/** + * @brief Create a normalizer object + * + * Creates a normalizer object which can be reused on multiple calls to + * nvtext::normalize_characters + * + * @see nvtext::character_normalizer + * + * @param do_lower_case If true, upper-case characters are converted to + * lower-case and accents are stripped from those characters. + * If false, accented and upper-case characters are not transformed. 
+ * @param allow_special_tokens If true, the following character sequences are not + * normalized if encountered in the input: + * `[BOS] [EOS] [UNK] [SEP] [PAD] [CLS] [MASK]` + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Object to be used with nvtext::normalize_characters + */ +std::unique_ptr create_character_normalizer( + bool do_lower_case, + bool allow_special_tokens = true, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + +/** + * @brief Normalizes strings characters for tokenizing + * + * @see nvtext::character_normalizer for details on the normalizer behavior + * + * @code{.pseudo} + * s = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + * s1 = normalize_characters(s,true) + * s1 is now ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + * s2 = normalize_characters(s,false) + * s2 is now ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + * @endcode + * + * A null input element at row `i` produces a corresponding null entry + * for row `i` in the output column. 
+ * + * @param input The input strings to normalize + * @param normalizer Normalizer to use for this function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Memory resource to allocate any returned objects + * @return Normalized strings column + */ +std::unique_ptr normalize_characters( + cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** @} */ // end of group } // namespace CUDF_EXPORT nvtext diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 7e2b766862d..2d1ce290aec 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ * limitations under the License. 
*/ +#include "text/normalize.cuh" #include "text/subword/detail/data_normalizer.hpp" #include "text/subword/detail/tokenizer_utils.cuh" #include "text/utilities/tokenize_ops.cuh" @@ -38,9 +39,11 @@ #include +#include #include #include #include +#include #include #include @@ -103,6 +106,35 @@ constexpr uint32_t UTF8_1BYTE = 0x0080; constexpr uint32_t UTF8_2BYTE = 0x0800; constexpr uint32_t UTF8_3BYTE = 0x01'0000; +__device__ int8_t cp_to_utf8(uint32_t codepoint, char* out) +{ + auto utf8 = cudf::strings::detail::codepoint_to_utf8(codepoint); + return cudf::strings::detail::from_char_utf8(utf8, out); +#if 0 + auto out_ptr = out; + if (codepoint < UTF8_1BYTE) // ASCII range + *out_ptr++ = static_cast(codepoint); + else if (codepoint < UTF8_2BYTE) { // create two-byte UTF-8 + // b00001xxx:byyyyyyyy => b110xxxyy:b10yyyyyy + *out_ptr++ = static_cast((((codepoint << 2) & 0x00'1F00) | 0x00'C000) >> 8); + *out_ptr++ = static_cast((codepoint & 0x3F) | 0x0080); + } else if (codepoint < UTF8_3BYTE) { // create three-byte UTF-8 + // bxxxxxxxx:byyyyyyyy => b1110xxxx:b10xxxxyy:b10yyyyyy + *out_ptr++ = static_cast((((codepoint << 4) & 0x0F'0000) | 0x00E0'0000) >> 16); + *out_ptr++ = static_cast((((codepoint << 2) & 0x00'3F00) | 0x00'8000) >> 8); + *out_ptr++ = static_cast((codepoint & 0x3F) | 0x0080); + } else { // create four-byte UTF-8 + // maximum code-point value is 0x0011'0000 + // b000xxxxx:byyyyyyyy:bzzzzzzzz => b11110xxx:b10xxyyyy:b10yyyyzz:b10zzzzzz + *out_ptr++ = static_cast((((codepoint << 6) & 0x0700'0000u) | 0xF000'0000u) >> 24); + *out_ptr++ = static_cast((((codepoint << 4) & 0x003F'0000u) | 0x0080'0000u) >> 16); + *out_ptr++ = static_cast((((codepoint << 2) & 0x00'3F00u) | 0x00'8000u) >> 8); + *out_ptr++ = static_cast((codepoint & 0x3F) | 0x0080); + } + return static_cast(thrust::distance(out, out_ptr)); +#endif +} + /** * @brief Convert code-point arrays into UTF-8 bytes for each string. 
*/ @@ -148,26 +180,8 @@ struct codepoint_to_utf8_fn { // convert each code-point to 1-4 UTF-8 encoded bytes char* out_ptr = d_chars + d_offsets[idx]; for (uint32_t jdx = 0; jdx < count; ++jdx) { - uint32_t code_point = *str_cps++; - if (code_point < UTF8_1BYTE) // ASCII range - *out_ptr++ = static_cast(code_point); - else if (code_point < UTF8_2BYTE) { // create two-byte UTF-8 - // b00001xxx:byyyyyyyy => b110xxxyy:b10yyyyyy - *out_ptr++ = static_cast((((code_point << 2) & 0x00'1F00) | 0x00'C000) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } else if (code_point < UTF8_3BYTE) { // create three-byte UTF-8 - // bxxxxxxxx:byyyyyyyy => b1110xxxx:b10xxxxyy:b10yyyyyy - *out_ptr++ = static_cast((((code_point << 4) & 0x0F'0000) | 0x00E0'0000) >> 16); - *out_ptr++ = static_cast((((code_point << 2) & 0x00'3F00) | 0x00'8000) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } else { // create four-byte UTF-8 - // maximum code-point value is 0x0011'0000 - // b000xxxxx:byyyyyyyy:bzzzzzzzz => b11110xxx:b10xxyyyy:b10yyyyzz:b10zzzzzz - *out_ptr++ = static_cast((((code_point << 6) & 0x0700'0000u) | 0xF000'0000u) >> 24); - *out_ptr++ = static_cast((((code_point << 4) & 0x003F'0000u) | 0x0080'0000u) >> 16); - *out_ptr++ = static_cast((((code_point << 2) & 0x00'3F00u) | 0x00'8000u) >> 8); - *out_ptr++ = static_cast((code_point & 0x3F) | 0x0080); - } + uint32_t codepoint = *str_cps++; + out_ptr += cp_to_utf8(codepoint, out_ptr); } } }; @@ -261,4 +275,228 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con return detail::normalize_characters(input, do_lower_case, stream, mr); } +struct character_normalizer::character_normalizer_impl { + rmm::device_uvector cp_metadata; + rmm::device_uvector aux_table; + bool do_lower_case; + bool special_tokens; + + character_normalizer_impl(rmm::device_uvector&& cp_metadata, + rmm::device_uvector&& aux_table, + bool do_lower_case, + bool special_tokens) + : cp_metadata(std::move(cp_metadata)), + 
aux_table(std::move(aux_table)), + do_lower_case{do_lower_case}, + special_tokens{special_tokens} + { + } +}; + +character_normalizer::character_normalizer(bool do_lower_case, + bool special_tokens, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref) +{ + auto cp_metadata = nvtext::detail::get_codepoint_metadata(stream); + auto aux_table = nvtext::detail::get_aux_codepoint_data(stream); + + _impl = new character_normalizer_impl( + std::move(cp_metadata), std::move(aux_table), do_lower_case, special_tokens); +} +character_normalizer::~character_normalizer() { delete _impl; } + +std::unique_ptr create_character_normalizer(bool do_lower_case, + bool allow_special_tokens, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return std::make_unique(do_lower_case, allow_special_tokens, stream, mr); +} + +namespace detail { +namespace { +CUDF_KERNEL void normalizer_kernel(char const* d_chars, + int64_t total_bytes, + codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case, + bool, // allow_special_tokens, + uint32_t* d_output, + int8_t* chars_per_thread) +{ + uint32_t replacement[MAX_NEW_CHARS] = {0}; + + auto const idx = cudf::detail::grid_1d::global_thread_id(); + int8_t num_new_chars = 0; + + if ((idx < total_bytes) && cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { + auto const cp = [utf8 = d_chars + idx] { + cudf::char_utf8 ch_utf8; + auto const ch_size = cudf::strings::detail::to_char_utf8(utf8, ch_utf8); + return cudf::strings::detail::utf8_to_codepoint(ch_utf8); + }(); + auto const metadata = cp_metadata[cp]; + + if (!should_remove_cp(metadata, do_lower_case)) { + num_new_chars = 1; + // Apply lower cases and accent stripping if necessary + auto const new_cp = do_lower_case || always_replace(metadata) ? get_first_cp(metadata) : cp; + replacement[0] = new_cp == 0 ? 
cp : new_cp; + + if (do_lower_case && is_multi_char_transform(metadata)) { + auto const next_cps = aux_table[cp]; + replacement[1] = static_cast(next_cps >> 32); + replacement[2] = static_cast(next_cps & 0xFFFFFFFF); + num_new_chars = 2 + (replacement[2] != 0); + } + + // check for possible special tokens here before checking add-spaces? + + if (should_add_spaces(metadata, do_lower_case) && (num_new_chars == 1)) { + // Need to shift all existing code-points up one. + // This is a rotate right. There is no thrust equivalent at this time. + // for (int loc = num_new_chars; loc > 0; --loc) { + // replacement[loc] = replacement[loc - 1]; + //} + // Write the required spaces at each end + replacement[1] = replacement[0]; + replacement[0] = SPACE_CODE_POINT; + replacement[2] = SPACE_CODE_POINT; + num_new_chars = 3; + } + + // convert back to UTF-8 + for (int k = 0; k < num_new_chars; ++k) { + auto const new_cp = replacement[k]; + if (new_cp) { cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); } + } + } + } + + if (idx < total_bytes) { chars_per_thread[idx] = num_new_chars; } + + using BlockStore = + cub::BlockStore; + __shared__ typename BlockStore::TempStorage temp_storage; + + // Now we perform coalesced writes back to global memory using cub. 
+ auto output_offset = blockIdx.x * blockDim.x * MAX_NEW_CHARS; + auto block_base = d_output + output_offset; + auto valid_items = + min(static_cast(total_bytes - output_offset), static_cast(blockDim.x)); + BlockStore(temp_storage).Store(block_base, replacement, valid_items); +} + +template +rmm::device_uvector compute_sizes(int8_t const* sizes, + OffsetType offsets, + int64_t offset, + cudf::size_type size, + rmm::cuda_stream_view stream) +{ + auto output_sizes = rmm::device_uvector(size, stream); + + auto d_in = sizes; + auto d_out = output_sizes.begin(); + std::size_t temp = 0; + nvtxRangePushA("segmented_reduce"); + if (offset == 0) { + cub::DeviceSegmentedReduce::Sum( + nullptr, temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); + auto d_temp = rmm::device_buffer{temp, stream}; + cub::DeviceSegmentedReduce::Sum( + d_temp.data(), temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); + } else { + // offsets need to be normalized for segmented-reduce to work efficiently + auto d_offsets = rmm::device_uvector(size + 1, stream); + thrust::transform(rmm::exec_policy_nosync(stream), + offsets, + offsets + size + 1, + d_offsets.begin(), + [offset] __device__(auto o) { return o - offset; }); + auto const offsets_itr = d_offsets.begin(); + cub::DeviceSegmentedReduce::Sum( + nullptr, temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value()); + auto d_temp = rmm::device_buffer{temp, stream}; + cub::DeviceSegmentedReduce::Sum( + d_temp.data(), temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value()); + } + stream.synchronize(); + nvtxRangePop(); + + return output_sizes; +} +} // namespace +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + if (input.is_empty()) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); } + + auto const first_offset = (input.offset() == 
0) ? 0 + : cudf::strings::detail::get_offset_value( + input.offsets(), input.offset(), stream); + auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1) + ? input.chars_size(stream) + : cudf::strings::detail::get_offset_value( + input.offsets(), input.size() + input.offset(), stream); + auto const chars_size = last_offset - first_offset; + auto const d_input_chars = input.chars_begin(stream) + first_offset; + + if (chars_size == 0) { return std::make_unique(input.parent(), stream, mr); } + + constexpr int64_t block_size = 64; + cudf::detail::grid_1d grid{chars_size, block_size}; + auto const max_new_char_total = MAX_NEW_CHARS * chars_size; + + auto d_code_points = rmm::device_uvector(max_new_char_total, stream); + auto d_sizes = rmm::device_uvector(chars_size, stream); + normalizer_kernel<<>>( + d_input_chars, + chars_size, + normalizer._impl->cp_metadata.data(), + normalizer._impl->aux_table.data(), + normalizer._impl->do_lower_case, + normalizer._impl->special_tokens, + d_code_points.data(), + d_sizes.data()); + + auto const input_offsets = + cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); + + // use segmented-reduce with input_offsets over d_sizes to get the size of the output rows + auto output_sizes = + compute_sizes(d_sizes.data(), input_offsets, first_offset, input.size(), stream); + + // convert the sizes to offsets + auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( + output_sizes.begin(), output_sizes.end(), stream, mr); + + // create output chars and use remove-copy(0) on d_code_points + rmm::device_uvector chars(total_size, stream, mr); + auto begin = reinterpret_cast(d_code_points.begin()); + auto end = reinterpret_cast(d_code_points.end()); + thrust::remove_copy(rmm::exec_policy_nosync(stream), begin, end, chars.data(), 0); + + return cudf::make_strings_column(input.size(), + std::move(offsets), + chars.release(), + input.null_count(), + 
cudf::detail::copy_bitmask(input.parent(), stream, mr)); +} + +} // namespace detail + +std::unique_ptr normalize_characters(cudf::strings_column_view const& input, + character_normalizer const& normalizer, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::normalize_characters(input, normalizer, stream, mr); +} + } // namespace nvtext diff --git a/cpp/src/text/normalize.cuh b/cpp/src/text/normalize.cuh new file mode 100644 index 00000000000..8b1f4408ee8 --- /dev/null +++ b/cpp/src/text/normalize.cuh @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2020-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "text/subword/detail/cp_data.h" + +#include +#include +#include + +#include + +namespace nvtext { +namespace detail { + +/** + * @brief Bit used to filter out invalid code points. + * + * When normalizing characters to code point values, if this bit is set, + * the code point should be filtered out before returning from the normalizer. + */ +constexpr uint32_t FILTER_BIT = 22; + +/** + * @brief Retrieve new code point from metadata value. + * + * @param metadata Value from the codepoint_metadata table. + * @return The replacement character if appropriate. + */ +__device__ constexpr uint32_t get_first_cp(uint32_t metadata) { return metadata & NEW_CP_MASK; } + +/** + * @brief Retrieve token category from the metadata value. 
+ * + * Category values are 0-5: + * 0 - character should be padded + * 1 - pad character if lower-case + * 2 - character should be removed + * 3 - remove character if lower-case + * 4 - whitespace character -- always replace + * 5 - uncategorized + * + * @param metadata Value from the codepoint_metadata table. + * @return Category value. + */ +__device__ constexpr uint32_t extract_token_cat(uint32_t metadata) +{ + return (metadata >> TOKEN_CAT_SHIFT) & TOKEN_CAT_MASK; +} + +/** + * @brief Return true if category of metadata value specifies the character should be removed. + */ +__device__ constexpr bool should_remove_cp(uint32_t metadata, bool lower_case) +{ + auto const cat = extract_token_cat(metadata); + return (cat == TOKEN_CAT_REMOVE_CHAR) || (lower_case && (cat == TOKEN_CAT_REMOVE_CHAR_IF_LOWER)); +} + +/** + * @brief Return true if category of metadata value specifies the character should be padded. + */ +__device__ constexpr bool should_add_spaces(uint32_t metadata, bool lower_case) +{ + auto const cat = extract_token_cat(metadata); + return (cat == TOKEN_CAT_ADD_SPACE) || (lower_case && (cat == TOKEN_CAT_ADD_SPACE_IF_LOWER)); +} + +/** + * @brief Return true if category of metadata value specifies the character should be replaced. + */ +__device__ constexpr bool always_replace(uint32_t metadata) +{ + return extract_token_cat(metadata) == TOKEN_CAT_ALWAYS_REPLACE; +} + +/** + * @brief Returns true if metadata value includes a multi-character transform bit equal to 1. + */ +__device__ constexpr bool is_multi_char_transform(uint32_t metadata) +{ + return (metadata >> MULTICHAR_SHIFT) & MULTICHAR_MASK; +} + +/** + * @brief Returns true if the byte passed in could be a valid head byte for + * a utf8 character. 
That is, not binary `10xxxxxx` + */ +__device__ constexpr bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } + +} // namespace detail +} // namespace nvtext diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 7a39199011e..4c54409c41a 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -14,6 +14,7 @@ * limitations under the License. */ +#include "text/normalize.cuh" #include "text/subword/detail/data_normalizer.hpp" #include "text/subword/detail/tokenizer_utils.cuh" @@ -38,81 +39,6 @@ namespace nvtext { namespace detail { namespace { -/** - * @brief Bit used to filter out invalid code points. - * - * When normalizing characters to code point values, if this bit is set, - * the code point should be filtered out before returning from the normalizer. - */ -constexpr uint32_t FILTER_BIT = 22; - -/** - * @brief Retrieve new code point from metadata value. - * - * @param metadata Value from the codepoint_metadata table. - * @return The replacement character if appropriate. - */ -__device__ uint32_t get_first_cp(uint32_t metadata) { return metadata & NEW_CP_MASK; } - -/** - * @brief Retrieve token category from the metadata value. - * - * Category values are 0-5: - * 0 - character should be padded - * 1 - pad character if lower-case - * 2 - character should be removed - * 3 - remove character if lower-case - * 4 - whitespace character -- always replace - * 5 - uncategorized - * - * @param metadata Value from the codepoint_metadata table. - * @return Category value. - */ -__device__ uint32_t extract_token_cat(uint32_t metadata) -{ - return (metadata >> TOKEN_CAT_SHIFT) & TOKEN_CAT_MASK; -} - -/** - * @brief Return true if category of metadata value specifies the character should be replaced. 
- */ -__device__ bool should_remove_cp(uint32_t metadata, bool lower_case) -{ - auto const cat = extract_token_cat(metadata); - return (cat == TOKEN_CAT_REMOVE_CHAR) || (lower_case && (cat == TOKEN_CAT_REMOVE_CHAR_IF_LOWER)); -} - -/** - * @brief Return true if category of metadata value specifies the character should be padded. - */ -__device__ bool should_add_spaces(uint32_t metadata, bool lower_case) -{ - auto const cat = extract_token_cat(metadata); - return (cat == TOKEN_CAT_ADD_SPACE) || (lower_case && (cat == TOKEN_CAT_ADD_SPACE_IF_LOWER)); -} - -/** - * @brief Return true if category of metadata value specifies the character should be replaced. - */ -__device__ bool always_replace(uint32_t metadata) -{ - return extract_token_cat(metadata) == TOKEN_CAT_ALWAYS_REPLACE; -} - -/** - * @brief Returns true if metadata value includes a multi-character transform bit equal to 1. - */ -__device__ bool is_multi_char_transform(uint32_t metadata) -{ - return (metadata >> MULTICHAR_SHIFT) & MULTICHAR_MASK; -} - -/** - * @brief Returns true if the byte passed in could be a valid head byte for - * a utf8 character. That is, not binary `10xxxxxx` - */ -__device__ bool is_head_byte(unsigned char utf8_byte) { return (utf8_byte >> 6) != 2; } - /** * @brief Converts a UTF-8 character into a unicode code point value. * diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 2515cc917fa..d8f87f78278 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -98,22 +99,53 @@ TEST_F(TextNormalizeTest, SomeNullStrings) TEST_F(TextNormalizeTest, NormalizeCharacters) { // These include punctuation, accents, whitespace, and CJK characters - std::vector h_strings{"abc£def", - nullptr, - "éè â îô\taeio", - "\tĂĆĖÑ Ü", - "ACEN U", - "P^NP", - "$41.07", - "[a,b]", - "丏丟", - ""}; - auto validity = - thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }); - cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity); - cudf::strings_column_view strings_view(strings); + auto input = cudf::test::strings_column_wrapper( + {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + auto sv = cudf::strings_column_view(input); + { + auto results = nvtext::normalize_characters(sv, true); + cudf::test::strings_column_wrapper expected({"abc£def", + "", + "ee a io aeio", + " acen u", + "acen u", + "p ^ np", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } + { + auto results = nvtext::normalize_characters(sv, false); + cudf::test::strings_column_wrapper expected({"abc£def", + "", + "éè â îô aeio", + " ĂĆĖÑ Ü", + "ACEN U", + "P ^ NP", + " $ 41 . 
07", + " [ a , b ] ", + " 丏 丟 ", + ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + } +} + +TEST_F(TextNormalizeTest, NormalizeCharactersNew) +{ + // These include punctuation, accents, whitespace, and CJK characters + auto input = cudf::test::strings_column_wrapper( + {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + auto sv = cudf::strings_column_view(input); { - auto results = nvtext::normalize_characters(strings_view, true); + auto normalizer = nvtext::create_character_normalizer(true); + auto results = nvtext::normalize_characters(sv, *normalizer); + cudf::test::print(results->view()); cudf::test::strings_column_wrapper expected({"abc£def", "", "ee a io aeio", @@ -124,11 +156,13 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) " [ a , b ] ", " 丏 丟 ", ""}, - validity); + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto results = nvtext::normalize_characters(strings_view, false); + auto normalizer = nvtext::create_character_normalizer(false); + auto results = nvtext::normalize_characters(sv, *normalizer); + cudf::test::print(results->view()); cudf::test::strings_column_wrapper expected({"abc£def", "", "éè â îô aeio", @@ -139,7 +173,7 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) " [ a , b ] ", " 丏 丟 ", ""}, - validity); + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } } From 2dd819a33cb81b4d34fd44870f2efe87b046550c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 27 Jan 2025 15:37:58 -0500 Subject: [PATCH 02/21] add special-tokens column --- cpp/include/nvtext/normalize.hpp | 16 +-- cpp/src/text/normalize.cu | 178 ++++++++++++++++++++++------- cpp/tests/text/normalize_tests.cpp | 45 +++++++- 3 files changed, 184 insertions(+), 55 deletions(-) diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 
10797164945..be1d28248de 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -130,6 +130,9 @@ std::unique_ptr normalize_characters( * and lower-casing cannot be performed without also removing accents. * However, if the accented character is already lower-case, then only the * accent is removed. + * + * If `special_tokens` are included the padding around the `[]` is not + * enforced if the character between them match one of the given tokens. */ struct character_normalizer { /** @@ -140,14 +143,13 @@ struct character_normalizer { * @param do_lower_case If true, upper-case characters are converted to * lower-case and accents are stripped from those characters. * If false, accented and upper-case characters are not transformed. - * @param allow_special_tokens If true, the following character sequences are not - * normalized if encountered in the input: - * `[BOS] [EOS] [UNK] [SEP] [PAD] [CLS] [MASK]` + * @param special_tokens Individual sequences including `[]` brackets. + * For example: `[BOS]`, `[EOS]`, `[UNK]`, `[SEP]`, `[PAD]`, `[CLS]`, `[MASK]` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory */ character_normalizer(bool do_lower_case, - bool allow_special_tokens = true, + cudf::strings_column_view const& special_tokens, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); ~character_normalizer(); @@ -167,16 +169,14 @@ struct character_normalizer { * @param do_lower_case If true, upper-case characters are converted to * lower-case and accents are stripped from those characters. * If false, accented and upper-case characters are not transformed. 
- * @param allow_special_tokens If true, the following character sequences are not - * normalized if encountered in the input: - * `[BOS] [EOS] [UNK] [SEP] [PAD] [CLS] [MASK]` + * @param special_tokens Individual sequences including `[]` brackets * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return Object to be used with nvtext::tokenize_with_vocabulary */ std::unique_ptr create_character_normalizer( bool do_lower_case, - bool allow_special_tokens = true, + cudf::strings_column_view const& special_tokens, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 2d1ce290aec..63047edd376 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include #include #include @@ -279,68 +281,147 @@ struct character_normalizer::character_normalizer_impl { rmm::device_uvector cp_metadata; rmm::device_uvector aux_table; bool do_lower_case; - bool special_tokens; + std::unique_ptr special_tokens; + rmm::device_uvector special_tokens_view; + cudf::size_type min_token_width; + cudf::size_type max_token_width; + + cudf::device_span get_special_tokens() const + { + return special_tokens_view; + } character_normalizer_impl(rmm::device_uvector&& cp_metadata, rmm::device_uvector&& aux_table, bool do_lower_case, - bool special_tokens) + std::unique_ptr&& special_tokens, + rmm::device_uvector&& special_tokens_view, + cudf::size_type min_token_width, + cudf::size_type max_token_width) : cp_metadata(std::move(cp_metadata)), aux_table(std::move(aux_table)), do_lower_case{do_lower_case}, - special_tokens{special_tokens} + special_tokens{std::move(special_tokens)}, + special_tokens_view{std::move(special_tokens_view)}, + 
min_token_width{min_token_width}, + max_token_width{max_token_width} { } }; +// lambdas not allowed in constructors +struct size_bytes_fn { + cudf::string_view const* d_tokens; + __device__ cudf::size_type operator()(cudf::size_type idx) { return d_tokens[idx].size_bytes(); } +}; + character_normalizer::character_normalizer(bool do_lower_case, - bool special_tokens, + cudf::strings_column_view const& special_tokens, rmm::cuda_stream_view stream, rmm::device_async_resource_ref) { auto cp_metadata = nvtext::detail::get_codepoint_metadata(stream); auto aux_table = nvtext::detail::get_aux_codepoint_data(stream); + CUDF_EXPECTS( + !special_tokens.has_nulls(), "special tokens should not have nulls", std::invalid_argument); - _impl = new character_normalizer_impl( - std::move(cp_metadata), std::move(aux_table), do_lower_case, special_tokens); + auto sorted = std::move( + cudf::sort(cudf::table_view({special_tokens.parent()}), {}, {}, stream)->release().front()); + if (do_lower_case) { + sorted = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); + } + auto tokens_view = cudf::strings::detail::create_string_vector_from_column( + cudf::strings_column_view(sorted->view()), stream, cudf::get_current_device_resource_ref()); + auto const begin = + cudf::detail::make_counting_transform_iterator(0, size_bytes_fn{tokens_view.data()}); + auto const end = begin + tokens_view.size(); + auto const min_width = thrust::reduce(rmm::exec_policy(stream), + begin, + end, + std::numeric_limits::max(), + thrust::minimum{}); + auto const max_width = + thrust::reduce(rmm::exec_policy(stream), begin, end, 0, thrust::maximum{}); + printf("%ld special tokens, min=%d, max=%d\n", tokens_view.size(), min_width, max_width); + CUDF_EXPECTS(tokens_view.is_empty() || (min_width > 2 && max_width <= 6), + "Expected special tokens to be between 3 and 6 bytes including [] brackets", + std::invalid_argument); + + _impl = new character_normalizer_impl(std::move(cp_metadata), + 
std::move(aux_table), + do_lower_case, + std::move(sorted), + std::move(tokens_view), + min_width, + max_width); } character_normalizer::~character_normalizer() { delete _impl; } -std::unique_ptr create_character_normalizer(bool do_lower_case, - bool allow_special_tokens, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::unique_ptr create_character_normalizer( + bool do_lower_case, + cudf::strings_column_view const& special_tokens, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - return std::make_unique(do_lower_case, allow_special_tokens, stream, mr); + return std::make_unique(do_lower_case, special_tokens, stream, mr); } namespace detail { namespace { + +__device__ bool check_special_token(cudf::device_span special_tokens, + char const* d_chars, + int64_t idx, + int64_t total_bytes) +{ + char const ch = d_chars[idx]; + auto begin = (ch == '[' || idx <= 6) ? d_chars + idx : d_chars + idx - 6L; + auto end = ch == ']' ? d_chars + idx + 1 : d_chars + idx + std::min(6L, total_bytes - idx); + char const mch = ch == '[' ? 
']' : '['; + // printf("(%p,%ld),%c,(%p,%p)\n", d_chars, total_bytes, ch, begin, end); + auto fnd = thrust::find(thrust::seq, begin, end, mch); + if (fnd == end) { return false; } + if (mch == ']') { + end = fnd + 1; // include ']' + } else { + begin = fnd; + } + auto const size = static_cast(thrust::distance(begin, end)); + auto const token = cudf::string_view(begin, size); + char p[8] = {0}; + memcpy(p, begin, min(size, 7)); + p[min(size, 7)] = 0; + auto rtn = thrust::find(thrust::seq, special_tokens.begin(), special_tokens.end(), token) != + special_tokens.end(); + printf("(%p,%d)=%s: %c/%d\n", begin, size, p, ch, int(rtn)); + return rtn; +} + CUDF_KERNEL void normalizer_kernel(char const* d_chars, int64_t total_bytes, codepoint_metadata_type const* cp_metadata, aux_codepoint_data_type const* aux_table, bool do_lower_case, - bool, // allow_special_tokens, + cudf::device_span special_tokens, uint32_t* d_output, int8_t* chars_per_thread) { uint32_t replacement[MAX_NEW_CHARS] = {0}; auto const idx = cudf::detail::grid_1d::global_thread_id(); - int8_t num_new_chars = 0; + int8_t num_new_bytes = 0; if ((idx < total_bytes) && cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { auto const cp = [utf8 = d_chars + idx] { cudf::char_utf8 ch_utf8; - auto const ch_size = cudf::strings::detail::to_char_utf8(utf8, ch_utf8); + cudf::strings::detail::to_char_utf8(utf8, ch_utf8); return cudf::strings::detail::utf8_to_codepoint(ch_utf8); }(); auto const metadata = cp_metadata[cp]; if (!should_remove_cp(metadata, do_lower_case)) { - num_new_chars = 1; + int8_t num_new_chars = 1; // Apply lower cases and accent stripping if necessary auto const new_cp = do_lower_case || always_replace(metadata) ? get_first_cp(metadata) : cp; replacement[0] = new_cp == 0 ? cp : new_cp; @@ -352,41 +433,48 @@ CUDF_KERNEL void normalizer_kernel(char const* d_chars, num_new_chars = 2 + (replacement[2] != 0); } - // check for possible special tokens here before checking add-spaces? 
- if (should_add_spaces(metadata, do_lower_case) && (num_new_chars == 1)) { - // Need to shift all existing code-points up one. - // This is a rotate right. There is no thrust equivalent at this time. - // for (int loc = num_new_chars; loc > 0; --loc) { - // replacement[loc] = replacement[loc - 1]; - //} - // Write the required spaces at each end + // check for possible special tokens here + auto const ch = d_chars[idx]; + auto sp_found = !special_tokens.empty() && (ch == '[' || ch == ']') && + check_special_token(special_tokens, d_chars, idx, total_bytes); + + // write the required spaces at each end replacement[1] = replacement[0]; - replacement[0] = SPACE_CODE_POINT; - replacement[2] = SPACE_CODE_POINT; + replacement[0] = sp_found && ch == ']' ? 0 : SPACE_CODE_POINT; + replacement[2] = sp_found && ch == '[' ? 0 : SPACE_CODE_POINT; num_new_chars = 3; } // convert back to UTF-8 for (int k = 0; k < num_new_chars; ++k) { auto const new_cp = replacement[k]; - if (new_cp) { cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); } + if (new_cp) { + num_new_bytes += cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); + // printf("%ld: k=%d, ncp=0x%04x, u8=0x%04x, bytes=%d\n", + // idx, + // k, + // new_cp, + // replacement[k], + // num_new_bytes); + } } } } - if (idx < total_bytes) { chars_per_thread[idx] = num_new_chars; } + if (idx < total_bytes) { + chars_per_thread[idx] = num_new_bytes; + // printf("%ld: nb=%d\n", idx, (int)num_new_bytes); - using BlockStore = - cub::BlockStore; - __shared__ typename BlockStore::TempStorage temp_storage; - - // Now we perform coalesced writes back to global memory using cub. 
- auto output_offset = blockIdx.x * blockDim.x * MAX_NEW_CHARS; - auto block_base = d_output + output_offset; - auto valid_items = - min(static_cast(total_bytes - output_offset), static_cast(blockDim.x)); - BlockStore(temp_storage).Store(block_base, replacement, valid_items); + d_output += idx * MAX_NEW_CHARS; +#pragma unroll + for (int k = 0; k < MAX_NEW_CHARS; ++k) { + *d_output++ = replacement[k]; + } + // printf( + // "%ld: r0=0x%04x,r1=0x%04x,r2=0x%04x\n", idx, replacement[0], replacement[1], + // replacement[2]); + } } template @@ -452,15 +540,17 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con cudf::detail::grid_1d grid{chars_size, block_size}; auto const max_new_char_total = MAX_NEW_CHARS * chars_size; + auto const parameters = normalizer._impl; + auto d_code_points = rmm::device_uvector(max_new_char_total, stream); auto d_sizes = rmm::device_uvector(chars_size, stream); normalizer_kernel<<>>( d_input_chars, chars_size, - normalizer._impl->cp_metadata.data(), - normalizer._impl->aux_table.data(), - normalizer._impl->do_lower_case, - normalizer._impl->special_tokens, + parameters->cp_metadata.data(), + parameters->aux_table.data(), + parameters->do_lower_case, + parameters->get_special_tokens(), d_code_points.data(), d_sizes.data()); @@ -475,10 +565,16 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( output_sizes.begin(), output_sizes.end(), stream, mr); + // printf("total_size=%ld\n", total_size); + // create output chars and use remove-copy(0) on d_code_points rmm::device_uvector chars(total_size, stream, mr); auto begin = reinterpret_cast(d_code_points.begin()); auto end = reinterpret_cast(d_code_points.end()); + // thrust::for_each_n(rmm::exec_policy(stream), + // thrust::make_counting_iterator(0), + // d_code_points.size(), + // [begin] __device__(auto idx) { printf("0x%02x\n", (int)begin[idx]); }); 
thrust::remove_copy(rmm::exec_policy_nosync(stream), begin, end, chars.data(), 0); return cudf::make_strings_column(input.size(), diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index d8f87f78278..d774a357fa4 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -138,12 +138,23 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) TEST_F(TextNormalizeTest, NormalizeCharactersNew) { // These include punctuation, accents, whitespace, and CJK characters - auto input = cudf::test::strings_column_wrapper( - {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); - auto sv = cudf::strings_column_view(input); + auto input = cudf::test::strings_column_wrapper({"abc£def", // 0 + "", // 8 + "éè â îô\taeio", // 25 + "\tĂĆĖÑ Ü", // + "ACEN U", + "P^NP", + "$41.07", + "[a,b]", + "丏丟", + ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + + auto sv = cudf::strings_column_view(input); + auto special_tokens = cudf::test::strings_column_wrapper(); + auto stv = cudf::strings_column_view(special_tokens); { - auto normalizer = nvtext::create_character_normalizer(true); + auto normalizer = nvtext::create_character_normalizer(true, stv); auto results = nvtext::normalize_characters(sv, *normalizer); cudf::test::print(results->view()); cudf::test::strings_column_wrapper expected({"abc£def", @@ -160,7 +171,7 @@ TEST_F(TextNormalizeTest, NormalizeCharactersNew) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } { - auto normalizer = nvtext::create_character_normalizer(false); + auto normalizer = nvtext::create_character_normalizer(false, stv); auto results = nvtext::normalize_characters(sv, *normalizer); cudf::test::print(results->view()); cudf::test::strings_column_wrapper expected({"abc£def", @@ -178,6 +189,28 @@ TEST_F(TextNormalizeTest, NormalizeCharactersNew) } } +TEST_F(TextNormalizeTest, NormalizeCharactersNewSpecialTokens) +{ + // These include 
punctuation, accents, whitespace, and CJK characters + auto input = + cudf::test::strings_column_wrapper({"[BOS]Some strings with [PAD] special[SEP]tokens[EOS]", + "[bos]these should[sep]work too[eos]"}); + + auto sv = cudf::strings_column_view(input); + auto special_tokens = cudf::test::strings_column_wrapper({"[BOS]", "[EOS]", "[SEP]", "[PAD]"}); + auto stv = cudf::strings_column_view(special_tokens); + { + auto normalizer = nvtext::create_character_normalizer(true, stv); + auto results = nvtext::normalize_characters(sv, *normalizer); + cudf::test::print(results->view()); + } + { + auto normalizer = nvtext::create_character_normalizer(false, stv); + auto results = nvtext::normalize_characters(sv, *normalizer); + cudf::test::print(results->view()); + } +} + TEST_F(TextNormalizeTest, NormalizeSlicedColumn) { cudf::test::strings_column_wrapper strings( From a29a40514c2a066239dba54a0602cb6396065b44 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 27 Jan 2025 18:55:43 -0500 Subject: [PATCH 03/21] add remove_copy_safe --- cpp/benchmarks/text/normalize.cpp | 10 ++- cpp/src/text/normalize.cu | 75 +++++++++--------- cpp/tests/text/normalize_tests.cpp | 120 +++++++++++++++++------------ 3 files changed, 117 insertions(+), 88 deletions(-) diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 594dc0de28a..91738a83adb 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -48,8 +49,11 @@ static void bench_normalize(nvbench::state& state) [&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); }); } else { bool const to_lower = (normalize_type == "to_lower"); + auto spt = cudf::strings_column_view(cudf::make_empty_column(cudf::type_id::STRING)->view()); + auto normalizer = nvtext::create_character_normalizer(to_lower, spt); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto result = nvtext::normalize_characters(input, to_lower); + // auto result = nvtext::normalize_characters(input, to_lower); + auto result = nvtext::normalize_characters(input, *normalizer); }); } } @@ -57,6 +61,6 @@ static void bench_normalize(nvbench::state& state) NVBENCH_BENCH(bench_normalize) .set_name("normalize") .add_int64_axis("min_width", {0}) - .add_int64_axis("max_width", {32, 64, 128, 256}) + .add_int64_axis("max_width", {128, 256}) .add_int64_axis("num_rows", {32768, 262144, 2097152}) .add_string_axis("type", {"spaces", "characters", "to_lower"}); diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 63047edd376..24a0c5ea811 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -328,21 +329,19 @@ character_normalizer::character_normalizer(bool do_lower_case, auto sorted = std::move( cudf::sort(cudf::table_view({special_tokens.parent()}), {}, {}, stream)->release().front()); if (do_lower_case) { - sorted = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); + auto lower = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); + auto upper = cudf::strings::to_upper(cudf::strings_column_view(sorted->view()), stream); + sorted = cudf::concatenate(std::vector{lower->view(), upper->view()}, stream); } auto tokens_view = cudf::strings::detail::create_string_vector_from_column( 
cudf::strings_column_view(sorted->view()), stream, cudf::get_current_device_resource_ref()); auto const begin = cudf::detail::make_counting_transform_iterator(0, size_bytes_fn{tokens_view.data()}); - auto const end = begin + tokens_view.size(); - auto const min_width = thrust::reduce(rmm::exec_policy(stream), - begin, - end, - std::numeric_limits::max(), - thrust::minimum{}); - auto const max_width = - thrust::reduce(rmm::exec_policy(stream), begin, end, 0, thrust::maximum{}); - printf("%ld special tokens, min=%d, max=%d\n", tokens_view.size(), min_width, max_width); + auto const end = begin + tokens_view.size(); + auto const init_min = std::numeric_limits::max(); + auto const min_width = + thrust::reduce(rmm::exec_policy(stream), begin, end, init_min, thrust::minimum{}); + auto const max_width = thrust::reduce(rmm::exec_policy(stream), begin, end, 0, thrust::maximum{}); CUDF_EXPECTS(tokens_view.is_empty() || (min_width > 2 && max_width <= 6), "Expected special tokens to be between 3 and 6 bytes including [] brackets", std::invalid_argument); @@ -376,11 +375,10 @@ __device__ bool check_special_token(cudf::device_span s int64_t total_bytes) { char const ch = d_chars[idx]; - auto begin = (ch == '[' || idx <= 6) ? d_chars + idx : d_chars + idx - 6L; + auto begin = ch == '[' ? d_chars + idx : d_chars + std::max(0L, idx - 6); auto end = ch == ']' ? d_chars + idx + 1 : d_chars + idx + std::min(6L, total_bytes - idx); char const mch = ch == '[' ? 
']' : '['; - // printf("(%p,%ld),%c,(%p,%p)\n", d_chars, total_bytes, ch, begin, end); - auto fnd = thrust::find(thrust::seq, begin, end, mch); + auto fnd = thrust::find(thrust::seq, begin, end, mch); if (fnd == end) { return false; } if (mch == ']') { end = fnd + 1; // include ']' @@ -389,12 +387,8 @@ __device__ bool check_special_token(cudf::device_span s } auto const size = static_cast(thrust::distance(begin, end)); auto const token = cudf::string_view(begin, size); - char p[8] = {0}; - memcpy(p, begin, min(size, 7)); - p[min(size, 7)] = 0; auto rtn = thrust::find(thrust::seq, special_tokens.begin(), special_tokens.end(), token) != special_tokens.end(); - printf("(%p,%d)=%s: %c/%d\n", begin, size, p, ch, int(rtn)); return rtn; } @@ -451,12 +445,6 @@ CUDF_KERNEL void normalizer_kernel(char const* d_chars, auto const new_cp = replacement[k]; if (new_cp) { num_new_bytes += cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); - // printf("%ld: k=%d, ncp=0x%04x, u8=0x%04x, bytes=%d\n", - // idx, - // k, - // new_cp, - // replacement[k], - // num_new_bytes); } } } @@ -464,16 +452,11 @@ CUDF_KERNEL void normalizer_kernel(char const* d_chars, if (idx < total_bytes) { chars_per_thread[idx] = num_new_bytes; - // printf("%ld: nb=%d\n", idx, (int)num_new_bytes); - d_output += idx * MAX_NEW_CHARS; #pragma unroll for (int k = 0; k < MAX_NEW_CHARS; ++k) { *d_output++ = replacement[k]; } - // printf( - // "%ld: r0=0x%04x,r1=0x%04x,r2=0x%04x\n", idx, replacement[0], replacement[1], - // replacement[2]); } } @@ -516,6 +499,27 @@ rmm::device_uvector compute_sizes(int8_t const* sizes, return output_sizes; } + +template +OutputIterator remove_copy_safe(InputIterator first, + InputIterator last, + OutputIterator result, + T const& value, + rmm::cuda_stream_view stream) +{ + auto const copy_size = std::min(static_cast(std::distance(first, last)), + static_cast(std::numeric_limits::max())); + + auto itr = first; + while (itr != last) { + auto const copy_end = + 
static_cast(std::distance(itr, last)) <= copy_size ? last : itr + copy_size; + result = thrust::remove_copy(rmm::exec_policy(stream), itr, copy_end, result, value); + itr = copy_end; + } + return result; +} + } // namespace std::unique_ptr normalize_characters(cudf::strings_column_view const& input, character_normalizer const& normalizer, @@ -544,6 +548,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto d_code_points = rmm::device_uvector(max_new_char_total, stream); auto d_sizes = rmm::device_uvector(chars_size, stream); + nvtxRangePushA("normalizer_kernel"); normalizer_kernel<<>>( d_input_chars, chars_size, @@ -553,6 +558,8 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con parameters->get_special_tokens(), d_code_points.data(), d_sizes.data()); + stream.synchronize(); + nvtxRangePop(); auto const input_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); @@ -565,17 +572,15 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( output_sizes.begin(), output_sizes.end(), stream, mr); - // printf("total_size=%ld\n", total_size); - // create output chars and use remove-copy(0) on d_code_points rmm::device_uvector chars(total_size, stream, mr); auto begin = reinterpret_cast(d_code_points.begin()); auto end = reinterpret_cast(d_code_points.end()); - // thrust::for_each_n(rmm::exec_policy(stream), - // thrust::make_counting_iterator(0), - // d_code_points.size(), - // [begin] __device__(auto idx) { printf("0x%02x\n", (int)begin[idx]); }); - thrust::remove_copy(rmm::exec_policy_nosync(stream), begin, end, chars.data(), 0); + nvtxRangePushA("remove_copy"); + // thrust::remove_copy(rmm::exec_policy_nosync(stream), begin, end, chars.data(), 0); + remove_copy_safe(begin, end, chars.data(), 0, stream); + stream.synchronize(); + nvtxRangePop(); return cudf::make_strings_column(input.size(), 
std::move(offsets), diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index d774a357fa4..3f27b5a4eb2 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -135,46 +135,37 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) } } -TEST_F(TextNormalizeTest, NormalizeCharactersNew) +TEST_F(TextNormalizeTest, WithNormalizer) { // These include punctuation, accents, whitespace, and CJK characters - auto input = cudf::test::strings_column_wrapper({"abc£def", // 0 - "", // 8 - "éè â îô\taeio", // 25 - "\tĂĆĖÑ Ü", // - "ACEN U", - "P^NP", - "$41.07", - "[a,b]", - "丏丟", - ""}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + auto input = cudf::test::strings_column_wrapper( + {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); - auto sv = cudf::strings_column_view(input); + auto const sv = cudf::strings_column_view(input); auto special_tokens = cudf::test::strings_column_wrapper(); - auto stv = cudf::strings_column_view(special_tokens); - { - auto normalizer = nvtext::create_character_normalizer(true, stv); - auto results = nvtext::normalize_characters(sv, *normalizer); - cudf::test::print(results->view()); - cudf::test::strings_column_wrapper expected({"abc£def", - "", - "ee a io aeio", - " acen u", - "acen u", - "p ^ np", - " $ 41 . 
07", - " [ a , b ] ", - " 丏 丟 ", - ""}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - } - { - auto normalizer = nvtext::create_character_normalizer(false, stv); - auto results = nvtext::normalize_characters(sv, *normalizer); - cudf::test::print(results->view()); - cudf::test::strings_column_wrapper expected({"abc£def", + auto const stv = cudf::strings_column_view(special_tokens); + + auto normalizer = nvtext::create_character_normalizer(true, stv); + auto results = nvtext::normalize_characters(sv, *normalizer); + // cudf::test::print(results->view()); + auto expected = cudf::test::strings_column_wrapper({"abc£def", + "", + "ee a io aeio", + " acen u", + "acen u", + "p ^ np", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false, stv); + results = nvtext::normalize_characters(sv, *normalizer); + // cudf::test::print(results->view()); + expected = cudf::test::strings_column_wrapper({"abc£def", "", "éè â îô aeio", " ĂĆĖÑ Ü", @@ -185,30 +176,37 @@ TEST_F(TextNormalizeTest, NormalizeCharactersNew) " 丏 丟 ", ""}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - } + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } -TEST_F(TextNormalizeTest, NormalizeCharactersNewSpecialTokens) +TEST_F(TextNormalizeTest, SpecialTokens) { // These include punctuation, accents, whitespace, and CJK characters auto input = cudf::test::strings_column_wrapper({"[BOS]Some strings with [PAD] special[SEP]tokens[EOS]", - "[bos]these should[sep]work too[eos]"}); + "[bos]these should[sep]work too[eos]", + "some[non]tokens[eol]too"}); auto sv = cudf::strings_column_view(input); auto special_tokens = cudf::test::strings_column_wrapper({"[BOS]", "[EOS]", "[SEP]", "[PAD]"}); auto stv = cudf::strings_column_view(special_tokens); - { - auto normalizer = 
nvtext::create_character_normalizer(true, stv); - auto results = nvtext::normalize_characters(sv, *normalizer); - cudf::test::print(results->view()); - } - { - auto normalizer = nvtext::create_character_normalizer(false, stv); - auto results = nvtext::normalize_characters(sv, *normalizer); - cudf::test::print(results->view()); - } + auto normalizer = nvtext::create_character_normalizer(true, stv); + auto results = nvtext::normalize_characters(sv, *normalizer); + // cudf::test::print(results->view()); + auto expected = cudf::test::strings_column_wrapper( + {" [bos] some strings with [pad] special [sep] tokens [eos] ", + " [bos] these should [sep] work too [eos] ", + "some [ non ] tokens [ eol ] too"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false, stv); + results = nvtext::normalize_characters(sv, *normalizer); + // cudf::test::print(results->view()); + expected = cudf::test::strings_column_wrapper( + {" [BOS] Some strings with [PAD] special [SEP] tokens [EOS] ", + " [ bos ] these should [ sep ] work too [ eos ] ", + "some [ non ] tokens [ eol ] too"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(TextNormalizeTest, NormalizeSlicedColumn) @@ -225,3 +223,25 @@ TEST_F(TextNormalizeTest, NormalizeSlicedColumn) cudf::test::strings_column_wrapper expected2({" $ 41 . 
07", " [ a , b ] ", " 丏 丟 "}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); } + +TEST_F(TextNormalizeTest, SlicedColumn) +{ + auto input = cudf::test::strings_column_wrapper( + {"abc£def", "éè â îô\taeio", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟"}); + + auto special_tokens = cudf::test::strings_column_wrapper(); + auto stv = cudf::strings_column_view(special_tokens); + + std::vector sliced = cudf::split(input, {4}); + auto normalizer = nvtext::create_character_normalizer(true, stv); + auto results = + nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer); + auto expected = + cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + normalizer = nvtext::create_character_normalizer(false, stv); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), *normalizer); + expected = cudf::test::strings_column_wrapper({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); +} From 5b21c0a226b2ca8ff30ef5c1abf2585b1314e6d3 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 28 Jan 2025 13:34:23 -0500 Subject: [PATCH 04/21] add special_tokens_kernel --- cpp/src/text/normalize.cu | 138 ++++++++++++----------------- cpp/tests/text/normalize_tests.cpp | 1 - 2 files changed, 56 insertions(+), 83 deletions(-) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 24a0c5ea811..d64b26ac599 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -113,29 +113,6 @@ __device__ int8_t cp_to_utf8(uint32_t codepoint, char* out) { auto utf8 = cudf::strings::detail::codepoint_to_utf8(codepoint); return cudf::strings::detail::from_char_utf8(utf8, out); -#if 0 - auto out_ptr = out; - if (codepoint < UTF8_1BYTE) // ASCII range - *out_ptr++ = static_cast(codepoint); - else if (codepoint < UTF8_2BYTE) { // create two-byte UTF-8 - // b00001xxx:byyyyyyyy => b110xxxyy:b10yyyyyy - 
*out_ptr++ = static_cast((((codepoint << 2) & 0x00'1F00) | 0x00'C000) >> 8); - *out_ptr++ = static_cast((codepoint & 0x3F) | 0x0080); - } else if (codepoint < UTF8_3BYTE) { // create three-byte UTF-8 - // bxxxxxxxx:byyyyyyyy => b1110xxxx:b10xxxxyy:b10yyyyyy - *out_ptr++ = static_cast((((codepoint << 4) & 0x0F'0000) | 0x00E0'0000) >> 16); - *out_ptr++ = static_cast((((codepoint << 2) & 0x00'3F00) | 0x00'8000) >> 8); - *out_ptr++ = static_cast((codepoint & 0x3F) | 0x0080); - } else { // create four-byte UTF-8 - // maximum code-point value is 0x0011'0000 - // b000xxxxx:byyyyyyyy:bzzzzzzzz => b11110xxx:b10xxyyyy:b10yyyyzz:b10zzzzzz - *out_ptr++ = static_cast((((codepoint << 6) & 0x0700'0000u) | 0xF000'0000u) >> 24); - *out_ptr++ = static_cast((((codepoint << 4) & 0x003F'0000u) | 0x0080'0000u) >> 16); - *out_ptr++ = static_cast((((codepoint << 2) & 0x00'3F00u) | 0x00'8000u) >> 8); - *out_ptr++ = static_cast((codepoint & 0x3F) | 0x0080); - } - return static_cast(thrust::distance(out, out_ptr)); -#endif } /** @@ -284,8 +261,6 @@ struct character_normalizer::character_normalizer_impl { bool do_lower_case; std::unique_ptr special_tokens; rmm::device_uvector special_tokens_view; - cudf::size_type min_token_width; - cudf::size_type max_token_width; cudf::device_span get_special_tokens() const { @@ -296,16 +271,12 @@ struct character_normalizer::character_normalizer_impl { rmm::device_uvector&& aux_table, bool do_lower_case, std::unique_ptr&& special_tokens, - rmm::device_uvector&& special_tokens_view, - cudf::size_type min_token_width, - cudf::size_type max_token_width) + rmm::device_uvector&& special_tokens_view) : cp_metadata(std::move(cp_metadata)), aux_table(std::move(aux_table)), do_lower_case{do_lower_case}, special_tokens{std::move(special_tokens)}, - special_tokens_view{std::move(special_tokens_view)}, - min_token_width{min_token_width}, - max_token_width{max_token_width} + special_tokens_view{std::move(special_tokens_view)} { } }; @@ -329,31 +300,20 @@ 
character_normalizer::character_normalizer(bool do_lower_case, auto sorted = std::move( cudf::sort(cudf::table_view({special_tokens.parent()}), {}, {}, stream)->release().front()); if (do_lower_case) { - auto lower = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); - auto upper = cudf::strings::to_upper(cudf::strings_column_view(sorted->view()), stream); - sorted = cudf::concatenate(std::vector{lower->view(), upper->view()}, stream); + // lower case the tokens so they will match the normalized input + sorted = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); } + auto tokens_view = cudf::strings::detail::create_string_vector_from_column( cudf::strings_column_view(sorted->view()), stream, cudf::get_current_device_resource_ref()); - auto const begin = - cudf::detail::make_counting_transform_iterator(0, size_bytes_fn{tokens_view.data()}); - auto const end = begin + tokens_view.size(); - auto const init_min = std::numeric_limits::max(); - auto const min_width = - thrust::reduce(rmm::exec_policy(stream), begin, end, init_min, thrust::minimum{}); - auto const max_width = thrust::reduce(rmm::exec_policy(stream), begin, end, 0, thrust::maximum{}); - CUDF_EXPECTS(tokens_view.is_empty() || (min_width > 2 && max_width <= 6), - "Expected special tokens to be between 3 and 6 bytes including [] brackets", - std::invalid_argument); _impl = new character_normalizer_impl(std::move(cp_metadata), std::move(aux_table), do_lower_case, std::move(sorted), - std::move(tokens_view), - min_width, - max_width); + std::move(tokens_view)); } + character_normalizer::~character_normalizer() { delete _impl; } std::unique_ptr create_character_normalizer( @@ -369,27 +329,40 @@ std::unique_ptr create_character_normalizer( namespace detail { namespace { -__device__ bool check_special_token(cudf::device_span special_tokens, - char const* d_chars, - int64_t idx, - int64_t total_bytes) +CUDF_KERNEL void special_tokens_kernel(uint32_t* d_utf8chars, + 
int64_t total_count, + cudf::device_span special_tokens, + int8_t* chars_per_thread) { - char const ch = d_chars[idx]; - auto begin = ch == '[' ? d_chars + idx : d_chars + std::max(0L, idx - 6); - auto end = ch == ']' ? d_chars + idx + 1 : d_chars + idx + std::min(6L, total_bytes - idx); - char const mch = ch == '[' ? ']' : '['; - auto fnd = thrust::find(thrust::seq, begin, end, mch); - if (fnd == end) { return false; } - if (mch == ']') { - end = fnd + 1; // include ']' - } else { - begin = fnd; + auto const idx = cudf::detail::grid_1d::global_thread_id(); + if (idx >= total_count) { return; } + auto const begin = d_utf8chars + (idx * MAX_NEW_CHARS) + 1; + if (*begin != '[') { return; } + auto const end = begin + std::min(6L, total_count - idx) * MAX_NEW_CHARS; + auto const match = thrust::find(thrust::seq, begin, end, static_cast(']')); + if (match == end) { return; } + char candidate[8]; + auto itr = thrust::transform_iterator(begin, [](auto v) { return static_cast(v); }); + auto eitr = itr + thrust::distance(begin, match + 1); + auto last = + thrust::copy_if(thrust::seq, itr, eitr, candidate, [](auto c) { return c != 0 && c != ' '; }); + *last = 0; // only needed for debug + + auto const size = static_cast(thrust::distance(candidate, last)); + auto const token = cudf::string_view(candidate, size); + if (thrust::find(thrust::seq, special_tokens.begin(), special_tokens.end(), token) == + special_tokens.end()) { + return; } - auto const size = static_cast(thrust::distance(begin, end)); - auto const token = cudf::string_view(begin, size); - auto rtn = thrust::find(thrust::seq, special_tokens.begin(), special_tokens.end(), token) != - special_tokens.end(); - return rtn; + + // fix up chars to remove the extra spaces + *(begin + 1) = 0; + *(match - 1) = 0; + + auto const match_idx = idx + (thrust::distance(begin, match) / MAX_NEW_CHARS); + + chars_per_thread[idx] = 2; // leading space plus '[' + chars_per_thread[match_idx] = 2; // ']' plus trailing space } 
CUDF_KERNEL void normalizer_kernel(char const* d_chars, @@ -397,7 +370,6 @@ CUDF_KERNEL void normalizer_kernel(char const* d_chars, codepoint_metadata_type const* cp_metadata, aux_codepoint_data_type const* aux_table, bool do_lower_case, - cudf::device_span special_tokens, uint32_t* d_output, int8_t* chars_per_thread) { @@ -428,15 +400,10 @@ CUDF_KERNEL void normalizer_kernel(char const* d_chars, } if (should_add_spaces(metadata, do_lower_case) && (num_new_chars == 1)) { - // check for possible special tokens here - auto const ch = d_chars[idx]; - auto sp_found = !special_tokens.empty() && (ch == '[' || ch == ']') && - check_special_token(special_tokens, d_chars, idx, total_bytes); - // write the required spaces at each end replacement[1] = replacement[0]; - replacement[0] = sp_found && ch == ']' ? 0 : SPACE_CODE_POINT; - replacement[2] = sp_found && ch == '[' ? 0 : SPACE_CODE_POINT; + replacement[0] = SPACE_CODE_POINT; + replacement[2] = SPACE_CODE_POINT; num_new_chars = 3; } @@ -481,13 +448,9 @@ rmm::device_uvector compute_sizes(int8_t const* sizes, d_temp.data(), temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); } else { // offsets need to be normalized for segmented-reduce to work efficiently - auto d_offsets = rmm::device_uvector(size + 1, stream); - thrust::transform(rmm::exec_policy_nosync(stream), - offsets, - offsets + size + 1, - d_offsets.begin(), - [offset] __device__(auto o) { return o - offset; }); - auto const offsets_itr = d_offsets.begin(); + auto offsets_itr = thrust::transform_iterator( + offsets, + cuda::proclaim_return_type([offset] __device__(auto o) { return o - offset; })); cub::DeviceSegmentedReduce::Sum( nullptr, temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value()); auto d_temp = rmm::device_buffer{temp, stream}; @@ -555,12 +518,23 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con parameters->cp_metadata.data(), parameters->aux_table.data(), parameters->do_lower_case, - 
parameters->get_special_tokens(), d_code_points.data(), d_sizes.data()); stream.synchronize(); nvtxRangePop(); + // This removes space adding around any special tokens in the form of [ttt]. + // An alternate approach is to do a multi-replace of '[ ttt ]' with '[ttt]' right + // before returning the output strings column. + auto const special_tokens = parameters->get_special_tokens(); + if (!special_tokens.empty()) { + nvtxRangePushA("special_tokens_kernel"); + special_tokens_kernel<<>>( + d_code_points.data(), chars_size, special_tokens, d_sizes.data()); + stream.synchronize(); + nvtxRangePop(); + } + auto const input_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 3f27b5a4eb2..67c6ab79eb9 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -198,7 +198,6 @@ TEST_F(TextNormalizeTest, SpecialTokens) " [bos] these should [sep] work too [eos] ", "some [ non ] tokens [ eol ] too"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - normalizer = nvtext::create_character_normalizer(false, stv); results = nvtext::normalize_characters(sv, *normalizer); // cudf::test::print(results->view()); From e37bf3f15bbe5e28217b7adcbc396af5cab150dd Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 28 Jan 2025 19:32:59 -0500 Subject: [PATCH 05/21] add python and pylibcudf interfaces --- cpp/benchmarks/text/normalize.cpp | 4 +- cpp/include/nvtext/normalize.hpp | 8 ++- cpp/tests/text/normalize_tests.cpp | 15 ++-- python/cudf/cudf/core/character_normalizer.py | 45 ++++++++++++ python/cudf/cudf/core/column/string.py | 17 ++++- .../pylibcudf/libcudf/nvtext/normalize.pxd | 15 +++- .../pylibcudf/pylibcudf/nvtext/normalize.pxd | 13 +++- .../pylibcudf/pylibcudf/nvtext/normalize.pyi | 10 ++- .../pylibcudf/pylibcudf/nvtext/normalize.pyx | 70 ++++++++++++++++--- 9 files changed, 164 insertions(+), 33 
deletions(-) create mode 100644 python/cudf/cudf/core/character_normalizer.py diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 91738a83adb..405b099fa10 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -17,7 +17,6 @@ #include #include -#include #include #include #include @@ -49,8 +48,7 @@ static void bench_normalize(nvbench::state& state) [&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); }); } else { bool const to_lower = (normalize_type == "to_lower"); - auto spt = cudf::strings_column_view(cudf::make_empty_column(cudf::type_id::STRING)->view()); - auto normalizer = nvtext::create_character_normalizer(to_lower, spt); + auto normalizer = nvtext::create_character_normalizer(to_lower); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { // auto result = nvtext::normalize_characters(input, to_lower); auto result = nvtext::normalize_characters(input, *normalizer); diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index be1d28248de..6d256fec231 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -176,9 +177,10 @@ struct character_normalizer { */ std::unique_ptr create_character_normalizer( bool do_lower_case, - cudf::strings_column_view const& special_tokens, - rmm::cuda_stream_view stream = cudf::get_default_stream(), - rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + cudf::strings_column_view const& special_tokens = cudf::strings_column_view(cudf::column_view{ + cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0}), + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** * @brief Normalizes strings characters for tokenizing diff --git a/cpp/tests/text/normalize_tests.cpp 
b/cpp/tests/text/normalize_tests.cpp index 67c6ab79eb9..37b6d51fcbd 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -142,11 +142,9 @@ TEST_F(TextNormalizeTest, WithNormalizer) {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); - auto const sv = cudf::strings_column_view(input); - auto special_tokens = cudf::test::strings_column_wrapper(); - auto const stv = cudf::strings_column_view(special_tokens); + auto const sv = cudf::strings_column_view(input); - auto normalizer = nvtext::create_character_normalizer(true, stv); + auto normalizer = nvtext::create_character_normalizer(true); auto results = nvtext::normalize_characters(sv, *normalizer); // cudf::test::print(results->view()); auto expected = cudf::test::strings_column_wrapper({"abc£def", @@ -162,7 +160,7 @@ TEST_F(TextNormalizeTest, WithNormalizer) {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - normalizer = nvtext::create_character_normalizer(false, stv); + normalizer = nvtext::create_character_normalizer(false); results = nvtext::normalize_characters(sv, *normalizer); // cudf::test::print(results->view()); expected = cudf::test::strings_column_wrapper({"abc£def", @@ -228,18 +226,15 @@ TEST_F(TextNormalizeTest, SlicedColumn) auto input = cudf::test::strings_column_wrapper( {"abc£def", "éè â îô\taeio", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟"}); - auto special_tokens = cudf::test::strings_column_wrapper(); - auto stv = cudf::strings_column_view(special_tokens); - std::vector sliced = cudf::split(input, {4}); - auto normalizer = nvtext::create_character_normalizer(true, stv); + auto normalizer = nvtext::create_character_normalizer(true); auto results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer); auto expected = cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - normalizer = nvtext::create_character_normalizer(false, stv); + normalizer = nvtext::create_character_normalizer(false); results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), *normalizer); expected = cudf::test::strings_column_wrapper({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); diff --git a/python/cudf/cudf/core/character_normalizer.py b/python/cudf/cudf/core/character_normalizer.py new file mode 100644 index 00000000000..ffb694d2f73 --- /dev/null +++ b/python/cudf/cudf/core/character_normalizer.py @@ -0,0 +1,45 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +from __future__ import annotations + +import pylibcudf as plc + +import cudf + + +class CharacterNormalizer: + """ + A normalizer object used to normalize input text. + + Parameters + ---------- + do_lower_case : bool + Set to True if the normalizer should also lower-case + while normalizing. + special_tokens : cudf string series + Strings column of special tokens. + """ + + def __init__( + self, + d_lower_case: bool, + special_tokens: cudf.Series = cudf.Series([], dtype="object"), + ) -> None: + self.normalizer = plc.nvtext.normalize.CharacterNormalizer( + d_lower_case, special_tokens._column.to_pylibcudf(mode="read") + ) + + def normalize(self, text: cudf.Series) -> cudf.Series: + """ + Parameters + ---------- + text : cudf string series + The strings to be normalized. 
+ + Returns + ------- + Normalized strings + """ + result = text._column.normalize_characters(self.normalizer) + + return cudf.Series._from_column(result) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 54b42b1f6de..fe35e400410 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4708,7 +4708,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: dtype: object """ return self._return_or_inplace( - self._column.normalize_characters(do_lower) + self._column.characters_normalize(do_lower) ) def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: @@ -6237,14 +6237,25 @@ def normalize_spaces(self) -> Self: ) @acquire_spill_lock() - def normalize_characters(self, do_lower: bool = True) -> Self: + def characters_normalize(self, do_lower: bool = True) -> Self: return Column.from_pylibcudf( # type: ignore[return-value] - plc.nvtext.normalize.normalize_characters( + plc.nvtext.normalize.characters_normalize( self.to_pylibcudf(mode="read"), do_lower, ) ) + @acquire_spill_lock() + def normalize_characters( + self, normalizer: plc.nvtext.normalize.CharacterNormalizer + ) -> Self: + return Column.from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_characters( + self.to_pylibcudf(mode="read"), + normalizer, + ) + ) + @acquire_spill_lock() def replace_tokens( self, targets: Self, replacements: Self, delimiter: plc.Scalar diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd index f8b082c8429..2cf2bfb8ac9 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from libcpp cimport bool from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler @@ -16,3 +16,16 @@ cdef extern from "nvtext/normalize.hpp" namespace "nvtext" nogil: const column_view & strings, bool do_lower_case ) except +libcudf_exception_handler + + cdef struct character_normalizer "nvtext::character_normalizer": + pass + + cdef unique_ptr[character_normalizer] create_character_normalizer( + bool do_lower_case, + const column_view & strings + ) except +libcudf_exception_handler + + cdef unique_ptr[column] normalize_characters( + const column_view & strings, + const character_normalizer & normalizer + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd index 90676145afa..e6688e19762 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd @@ -1,9 +1,18 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libcpp cimport bool +from libcpp.memory cimport unique_ptr from pylibcudf.column cimport Column +from pylibcudf.libcudf.nvtext.normalize cimport character_normalizer +cdef class CharacterNormalizer: + cdef unique_ptr[character_normalizer] c_obj cpdef Column normalize_spaces(Column input) -cpdef Column normalize_characters(Column input, bool do_lower_case) +cpdef Column characters_normalize(Column input, bool do_lower_case) + +cpdef Column normalize_characters( + Column input, + CharacterNormalizer normalizer +) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi index 1d90a5a8960..d722ef6c79e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi @@ -1,6 +1,12 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
from pylibcudf.column import Column +class CharacterNormalizer: + def __init__(self, do_lower_case: bool, special_tokens: Column): ... + def normalize_spaces(input: Column) -> Column: ... -def normalize_characters(input: Column, do_lower_case: bool) -> Column: ... +def characters_normalize(input: Column, do_lower_case: bool) -> Column: ... +def normalize_characters( + input: Column, normalizer: CharacterNormalizer +) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx index b259ccaefa6..ada361c5e79 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -1,16 +1,36 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. +from cython.operator cimport dereference from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move from pylibcudf.column cimport Column from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.nvtext.normalize cimport ( - normalize_characters as cpp_normalize_characters, - normalize_spaces as cpp_normalize_spaces, -) +from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.nvtext cimport normalize as cpp_normalize -__all__ = ["normalize_characters", "normalize_spaces"] +__all__ = [ + "CharacterNormalizer" + "normalize_characters", + "normalize_spaces", + "characters_normalize"] + +cdef class CharacterNormalizer: + """The normalizer object to be used with ``normalize_characters``. + + For details, see :cpp:class:`cudf::nvtext::character_normalizer`. 
+ """ + def __cinit__(self, bool do_lower_case, Column tokens): + cdef column_view c_tokens = tokens.view() + with nogil: + self.c_obj = move( + cpp_normalize.create_character_normalizer( + do_lower_case, + c_tokens + ) + ) + + __hash__ = None cpdef Column normalize_spaces(Column input): """ @@ -32,12 +52,12 @@ cpdef Column normalize_spaces(Column input): cdef unique_ptr[column] c_result with nogil: - c_result = cpp_normalize_spaces(input.view()) + c_result = cpp_normalize.normalize_spaces(input.view()) return Column.from_libcudf(move(c_result)) -cpdef Column normalize_characters(Column input, bool do_lower_case): +cpdef Column characters_normalize(Column input, bool do_lower_case): """ Normalizes strings characters for tokenizing. @@ -60,6 +80,38 @@ cpdef Column normalize_characters(Column input, bool do_lower_case): cdef unique_ptr[column] c_result with nogil: - c_result = cpp_normalize_characters(input.view(), do_lower_case) + c_result = cpp_normalize.normalize_characters( + input.view(), + do_lower_case + ) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column normalize_characters(Column input, CharacterNormalizer normalizer): + """ + Normalizes strings characters for tokenizing. 
+ + For details, see :cpp:func:`normalize_characters` + + Parameters + ---------- + input : Column + Input strings + normalizer : CharacterNormalizer + Normalizer object used for modifying the input column text + + Returns + ------- + Column + Normalized strings column + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = cpp_normalize.normalize_characters( + input.view(), + dereference(normalizer.c_obj.get()) + ) return Column.from_libcudf(move(c_result)) From ac4c5a8a4ef69949e55642af1a63cabe57fed6a6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 29 Jan 2025 16:39:51 -0500 Subject: [PATCH 06/21] add block-store --- cpp/src/text/normalize.cu | 64 +++++++++++++------ cpp/src/text/subword/data_normalizer.cu | 12 ++++ python/cudf/cudf/core/character_normalizer.py | 6 +- 3 files changed, 61 insertions(+), 21 deletions(-) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index d64b26ac599..8e2b1d9a44e 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -200,12 +200,15 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); // create the normalizer and call it + nvtxRangePushA("o_load_normalize_tables"); auto result = [&] { auto const cp_metadata = get_codepoint_metadata(stream); auto const aux_table = get_aux_codepoint_data(stream); auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case); return normalizer.normalize(strings, stream); }(); + stream.synchronize(); + nvtxRangePop(); CUDF_EXPECTS( result.first->size() < static_cast(std::numeric_limits::max()), @@ -221,8 +224,11 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto d_strings = cudf::column_device_view::create(strings.parent(), stream); // build offsets and children using the codepoint_to_utf8_fn + nvtxRangePushA("o_codepoint_to_utf8"); auto [offsets_column, chars] = 
cudf::strings::detail::make_strings_children( codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, strings.size(), stream, mr); + stream.synchronize(); + nvtxRangePop(); return cudf::make_strings_column(strings.size(), std::move(offsets_column), @@ -419,16 +425,22 @@ CUDF_KERNEL void normalizer_kernel(char const* d_chars, if (idx < total_bytes) { chars_per_thread[idx] = num_new_bytes; - d_output += idx * MAX_NEW_CHARS; -#pragma unroll - for (int k = 0; k < MAX_NEW_CHARS; ++k) { - *d_output++ = replacement[k]; - } + // d_output += idx * MAX_NEW_CHARS; + // // #pragma unroll + // for (int k = 0; k < MAX_NEW_CHARS; ++k) { + // *d_output++ = replacement[k]; + // } } + + // BLOCK_STORE_WARP_TRANSPOSE + using block_store = cub::BlockStore; + __shared__ typename block_store::TempStorage bs_stg; + auto block_base = d_output + blockIdx.x * blockDim.x * MAX_NEW_CHARS; + block_store(bs_stg).Store(block_base, replacement); } template -rmm::device_uvector compute_sizes(int8_t const* sizes, +rmm::device_uvector compute_sizes(cudf::device_span sizes, OffsetType offsets, int64_t offset, cudf::size_type size, @@ -436,10 +448,10 @@ rmm::device_uvector compute_sizes(int8_t const* sizes, { auto output_sizes = rmm::device_uvector(size, stream); - auto d_in = sizes; + auto d_in = sizes.data(); auto d_out = output_sizes.begin(); std::size_t temp = 0; - nvtxRangePushA("segmented_reduce"); + nvtxRangePushA("compute_sizes_segred"); if (offset == 0) { cub::DeviceSegmentedReduce::Sum( nullptr, temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); @@ -457,7 +469,7 @@ rmm::device_uvector compute_sizes(int8_t const* sizes, cub::DeviceSegmentedReduce::Sum( d_temp.data(), temp, d_in, d_out, size, offsets_itr, offsets_itr + 1, stream.value()); } - stream.synchronize(); + // stream.synchronize(); nvtxRangePop(); return output_sizes; @@ -483,6 +495,22 @@ OutputIterator remove_copy_safe(InputIterator first, return result; } +template +Iterator remove_safe(Iterator first, Iterator last, T 
const& value, rmm::cuda_stream_view stream) +{ + auto const size = std::min(static_cast(std::distance(first, last)), + static_cast(std::numeric_limits::max())); + + auto result = first; + auto itr = first; + while (itr != last) { + auto end = static_cast(std::distance(itr, last)) <= size ? last : itr + size; + result = thrust::remove(rmm::exec_policy(stream), itr, end, value); + itr = end; + } + return result; +} + } // namespace std::unique_ptr normalize_characters(cudf::strings_column_view const& input, character_normalizer const& normalizer, @@ -503,14 +531,14 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con if (chars_size == 0) { return std::make_unique(input.parent(), stream, mr); } - constexpr int64_t block_size = 64; + constexpr int64_t block_size = 256; cudf::detail::grid_1d grid{chars_size, block_size}; - auto const max_new_char_total = MAX_NEW_CHARS * chars_size; + auto const max_new_char_total = cudf::util::round_up_safe(chars_size, block_size) * MAX_NEW_CHARS; auto const parameters = normalizer._impl; - auto d_code_points = rmm::device_uvector(max_new_char_total, stream); auto d_sizes = rmm::device_uvector(chars_size, stream); + auto d_code_points = rmm::device_uvector(max_new_char_total, stream); nvtxRangePushA("normalizer_kernel"); normalizer_kernel<<>>( d_input_chars, @@ -520,7 +548,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con parameters->do_lower_case, d_code_points.data(), d_sizes.data()); - stream.synchronize(); + // stream.synchronize(); nvtxRangePop(); // This removes space adding around any special tokens in the form of [ttt]. 
@@ -539,8 +567,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); // use segmented-reduce with input_offsets over d_sizes to get the size of the output rows - auto output_sizes = - compute_sizes(d_sizes.data(), input_offsets, first_offset, input.size(), stream); + auto output_sizes = compute_sizes(d_sizes, input_offsets, first_offset, input.size(), stream); // convert the sizes to offsets auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( @@ -549,11 +576,12 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // create output chars and use remove-copy(0) on d_code_points rmm::device_uvector chars(total_size, stream, mr); auto begin = reinterpret_cast(d_code_points.begin()); - auto end = reinterpret_cast(d_code_points.end()); + // auto end = reinterpret_cast(d_code_points.end()); nvtxRangePushA("remove_copy"); - // thrust::remove_copy(rmm::exec_policy_nosync(stream), begin, end, chars.data(), 0); + auto end = reinterpret_cast( + remove_safe(d_code_points.begin(), d_code_points.end(), 0, stream)); remove_copy_safe(begin, end, chars.data(), 0, stream); - stream.synchronize(); + // stream.synchronize(); nvtxRangePop(); return cudf::make_strings_column(input.size(), diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 4c54409c41a..559695e00d3 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -238,6 +238,7 @@ uvector_pair data_normalizer::normalize(cudf::strings_column_view const& input, rmm::device_uvector d_chars_per_thread(threads_on_device, stream); auto const d_strings = input.chars_begin(stream) + cudf::strings::detail::get_offset_value( input.offsets(), input.offset(), stream); + nvtxRangePushA("o_kernel_data_normalizer"); kernel_data_normalizer<<>>( reinterpret_cast(d_strings), bytes_count, @@ -246,26 +247,37 @@ 
uvector_pair data_normalizer::normalize(cudf::strings_column_view const& input, do_lower_case, d_code_points->data(), d_chars_per_thread.data()); + stream.synchronize(); + nvtxRangePop(); // Remove the 'empty' code points from the vector + nvtxRangePushA("o_remove"); thrust::remove(rmm::exec_policy(stream), d_code_points->begin(), d_code_points->end(), uint32_t{1 << FILTER_BIT}); + stream.synchronize(); + nvtxRangePop(); // We also need to prefix sum the number of characters up to an including // the current character in order to get the new strings lengths. + nvtxRangePushA("o_inclusive_scan"); thrust::inclusive_scan(rmm::exec_policy(stream), d_chars_per_thread.begin(), d_chars_per_thread.end(), d_chars_per_thread.begin()); + stream.synchronize(); + nvtxRangePop(); // This will reset the offsets to the new generated code point values + nvtxRangePushA("o_update_strings_lengths"); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(1), input.size(), update_strings_lengths_fn{d_chars_per_thread.data(), d_strings_offsets->data()}); + stream.synchronize(); + nvtxRangePop(); auto const num_chars = d_strings_offsets->element(input.size(), stream); d_code_points->resize(num_chars, stream); // should be smaller than original allocated size diff --git a/python/cudf/cudf/core/character_normalizer.py b/python/cudf/cudf/core/character_normalizer.py index ffb694d2f73..00ee43598f6 100644 --- a/python/cudf/cudf/core/character_normalizer.py +++ b/python/cudf/cudf/core/character_normalizer.py @@ -13,7 +13,7 @@ class CharacterNormalizer: Parameters ---------- - do_lower_case : bool + do_lower : bool Set to True if the normalizer should also lower-case while normalizing. 
special_tokens : cudf string series @@ -22,11 +22,11 @@ class CharacterNormalizer: def __init__( self, - d_lower_case: bool, + do_lower: bool, special_tokens: cudf.Series = cudf.Series([], dtype="object"), ) -> None: self.normalizer = plc.nvtext.normalize.CharacterNormalizer( - d_lower_case, special_tokens._column.to_pylibcudf(mode="read") + do_lower, special_tokens._column.to_pylibcudf(mode="read") ) def normalize(self, text: cudf.Series) -> cudf.Series: From 0df89afc69b5117b36f1fd9c5bf5dbed8f00245e Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 29 Jan 2025 20:19:22 -0500 Subject: [PATCH 07/21] fix block-store algo type --- cpp/src/text/normalize.cu | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 8e2b1d9a44e..fcb84c1f220 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -386,8 +386,8 @@ CUDF_KERNEL void normalizer_kernel(char const* d_chars, if ((idx < total_bytes) && cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { auto const cp = [utf8 = d_chars + idx] { - cudf::char_utf8 ch_utf8; - cudf::strings::detail::to_char_utf8(utf8, ch_utf8); + cudf::char_utf8 ch_utf8 = *utf8; + if (ch_utf8 > 0x7F) { cudf::strings::detail::to_char_utf8(utf8, ch_utf8); } return cudf::strings::detail::utf8_to_codepoint(ch_utf8); }(); auto const metadata = cp_metadata[cp]; @@ -426,14 +426,15 @@ CUDF_KERNEL void normalizer_kernel(char const* d_chars, if (idx < total_bytes) { chars_per_thread[idx] = num_new_bytes; // d_output += idx * MAX_NEW_CHARS; - // // #pragma unroll + // #pragma unroll // for (int k = 0; k < MAX_NEW_CHARS; ++k) { // *d_output++ = replacement[k]; // } } - // BLOCK_STORE_WARP_TRANSPOSE - using block_store = cub::BlockStore; + // BLOCK_STORE_TRANSPOSE + using block_store = + cub::BlockStore; __shared__ typename block_store::TempStorage bs_stg; auto block_base = d_output + blockIdx.x * blockDim.x * MAX_NEW_CHARS; 
block_store(bs_stg).Store(block_base, replacement); @@ -469,7 +470,7 @@ rmm::device_uvector compute_sizes(cudf::device_span normalize_characters(cudf::strings_column_view con parameters->do_lower_case, d_code_points.data(), d_sizes.data()); - // stream.synchronize(); + stream.synchronize(); nvtxRangePop(); // This removes space adding around any special tokens in the form of [ttt]. @@ -576,12 +577,12 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // create output chars and use remove-copy(0) on d_code_points rmm::device_uvector chars(total_size, stream, mr); auto begin = reinterpret_cast(d_code_points.begin()); - // auto end = reinterpret_cast(d_code_points.end()); + auto end = reinterpret_cast(d_code_points.end()); nvtxRangePushA("remove_copy"); - auto end = reinterpret_cast( - remove_safe(d_code_points.begin(), d_code_points.end(), 0, stream)); + // auto end = reinterpret_cast( + // remove_safe(d_code_points.begin(), d_code_points.end(), 0, stream)); remove_copy_safe(begin, end, chars.data(), 0, stream); - // stream.synchronize(); + stream.synchronize(); nvtxRangePop(); return cudf::make_strings_column(input.size(), From 51cae79465a4b7a6dbcf099dcb00d98b116040b6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 30 Jan 2025 09:20:17 -0500 Subject: [PATCH 08/21] update nvtx range names --- cpp/src/text/normalize.cu | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index fcb84c1f220..be92086a663 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -200,15 +200,15 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); // create the normalizer and call it - nvtxRangePushA("o_load_normalize_tables"); auto result = [&] { + nvtxRangePushA("o_load_normalize_tables"); auto const cp_metadata = 
get_codepoint_metadata(stream); auto const aux_table = get_aux_codepoint_data(stream); auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case); + stream.synchronize(); + nvtxRangePop(); return normalizer.normalize(strings, stream); }(); - stream.synchronize(); - nvtxRangePop(); CUDF_EXPECTS( result.first->size() < static_cast(std::numeric_limits::max()), @@ -371,13 +371,13 @@ CUDF_KERNEL void special_tokens_kernel(uint32_t* d_utf8chars, chars_per_thread[match_idx] = 2; // ']' plus trailing space } -CUDF_KERNEL void normalizer_kernel(char const* d_chars, - int64_t total_bytes, - codepoint_metadata_type const* cp_metadata, - aux_codepoint_data_type const* aux_table, - bool do_lower_case, - uint32_t* d_output, - int8_t* chars_per_thread) +CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, + int64_t total_bytes, + codepoint_metadata_type const* cp_metadata, + aux_codepoint_data_type const* aux_table, + bool do_lower_case, + uint32_t* d_output, + int8_t* chars_per_thread) { uint32_t replacement[MAX_NEW_CHARS] = {0}; @@ -452,7 +452,7 @@ rmm::device_uvector compute_sizes(cudf::device_span normalize_characters(cudf::strings_column_view con auto d_sizes = rmm::device_uvector(chars_size, stream); auto d_code_points = rmm::device_uvector(max_new_char_total, stream); - nvtxRangePushA("normalizer_kernel"); - normalizer_kernel<<>>( + nvtxRangePushA("n_normalizer_kernel"); + data_normalizer_kernel<<>>( d_input_chars, chars_size, parameters->cp_metadata.data(), @@ -557,7 +557,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // before returning the output strings column. 
auto const special_tokens = parameters->get_special_tokens(); if (!special_tokens.empty()) { - nvtxRangePushA("special_tokens_kernel"); + nvtxRangePushA("n_special_tokens_kernel"); special_tokens_kernel<<>>( d_code_points.data(), chars_size, special_tokens, d_sizes.data()); stream.synchronize(); @@ -578,7 +578,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con rmm::device_uvector chars(total_size, stream, mr); auto begin = reinterpret_cast(d_code_points.begin()); auto end = reinterpret_cast(d_code_points.end()); - nvtxRangePushA("remove_copy"); + nvtxRangePushA("n_remove_copy"); // auto end = reinterpret_cast( // remove_safe(d_code_points.begin(), d_code_points.end(), 0, stream)); remove_copy_safe(begin, end, chars.data(), 0, stream); From 091cb84494a3b0a8a8fa7a43a20dd7cbf0f4cd5d Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 30 Jan 2025 15:57:42 -0500 Subject: [PATCH 09/21] replace d_sizes with transform-iterator --- cpp/benchmarks/text/normalize.cpp | 5 +- cpp/include/nvtext/normalize.hpp | 26 +++++--- cpp/src/text/normalize.cu | 89 ++++++++++--------------- cpp/src/text/subword/data_normalizer.cu | 12 ---- cpp/tests/text/normalize_tests.cpp | 72 ++++++++++---------- 5 files changed, 90 insertions(+), 114 deletions(-) diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index 405b099fa10..494d5722ae4 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -48,9 +48,10 @@ static void bench_normalize(nvbench::state& state) [&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); }); } else { bool const to_lower = (normalize_type == "to_lower"); - auto normalizer = nvtext::create_character_normalizer(to_lower); + // we expect the normalizer to be created once and re-used + // so creating it is not measured + auto normalizer = nvtext::create_character_normalizer(to_lower); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - // auto result 
= nvtext::normalize_characters(input, to_lower); auto result = nvtext::normalize_characters(input, *normalizer); }); } diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 6d256fec231..4bd257b7d45 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -122,18 +122,22 @@ std::unique_ptr normalize_characters( * - changing whitespace (e.g. `"\t", "\n", "\r"`) to just space `" "` * - removing control characters (unicode categories "Cc" and "Cf") * - * The padding process here adds a single space before and after the character. + * The padding process adds a single space before and after the character. * Details on _unicode category_ can be found here: * https://unicodebook.readthedocs.io/unicode.html#categories * - * If `do_lower_case = true`, lower-casing also removes the accents. The + * If `do_lower_case = true`, lower-casing also removes any accents. The * accents cannot be removed from upper-case characters without lower-casing * and lower-casing cannot be performed without also removing accents. * However, if the accented character is already lower-case, then only the * accent is removed. * - * If `special_tokens` are included the padding around the `[]` is not - * enforced if the character between them match one of the given tokens. + * If `special_tokens` are included the padding after `[` and before `]` is not + * inserted if the character between them match one of the given tokens. + * If `do_lower_case = true` the `special_tokens` are expected to contain + * lower-case characters. + * Also, the `special_tokens` are expected to include the `[]` characters + * at the beginning of and end of each string respectively. */ struct character_normalizer { /** @@ -170,10 +174,11 @@ struct character_normalizer { * @param do_lower_case If true, upper-case characters are converted to * lower-case and accents are stripped from those characters. 
* If false, accented and upper-case characters are not transformed. - * @param special_tokens Individual sequences including `[]` brackets + * @param special_tokens Individual sequences including `[]` brackets. + * Default is no special tokens. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory - * @return Object to be used with nvtext::tokenize_with_vocabulary + * @return Object to be used with nvtext::normalize_characters */ std::unique_ptr create_character_normalizer( bool do_lower_case, @@ -183,15 +188,18 @@ std::unique_ptr create_character_normalizer( rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); /** - * @brief Normalizes strings characters for tokenizing + * @brief Normalizes the text in input strings column * * @see nvtext::character_normalizer for details on the normalizer behavior * * @code{.pseudo} + * cn = create_character_normalizer(true) * s = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] - * s1 = normalize_characters(s,true) + * s1 = normalize_characters(s,cn) * s1 is now ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] - * s2 = normalize_characters(s,false) + * + * cn = create_character_normalizer(false) + * s2 = normalize_characters(s,cn) * s2 is now ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 
08", " [ a , bb ] "] * @endcode * diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index be92086a663..b4606243775 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -201,12 +201,9 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con // create the normalizer and call it auto result = [&] { - nvtxRangePushA("o_load_normalize_tables"); auto const cp_metadata = get_codepoint_metadata(stream); auto const aux_table = get_aux_codepoint_data(stream); auto const normalizer = data_normalizer(cp_metadata.data(), aux_table.data(), do_lower_case); - stream.synchronize(); - nvtxRangePop(); return normalizer.normalize(strings, stream); }(); @@ -224,11 +221,8 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto d_strings = cudf::column_device_view::create(strings.parent(), stream); // build offsets and children using the codepoint_to_utf8_fn - nvtxRangePushA("o_codepoint_to_utf8"); auto [offsets_column, chars] = cudf::strings::detail::make_strings_children( codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, strings.size(), stream, mr); - stream.synchronize(); - nvtxRangePop(); return cudf::make_strings_column(strings.size(), std::move(offsets_column), @@ -337,8 +331,7 @@ namespace { CUDF_KERNEL void special_tokens_kernel(uint32_t* d_utf8chars, int64_t total_count, - cudf::device_span special_tokens, - int8_t* chars_per_thread) + cudf::device_span special_tokens) { auto const idx = cudf::detail::grid_1d::global_thread_id(); if (idx >= total_count) { return; } @@ -364,11 +357,6 @@ CUDF_KERNEL void special_tokens_kernel(uint32_t* d_utf8chars, // fix up chars to remove the extra spaces *(begin + 1) = 0; *(match - 1) = 0; - - auto const match_idx = idx + (thrust::distance(begin, match) / MAX_NEW_CHARS); - - chars_per_thread[idx] = 2; // leading space plus '[' - chars_per_thread[match_idx] = 2; // ']' plus trailing space } CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, @@ -376,8 +364,7 @@ 
CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, codepoint_metadata_type const* cp_metadata, aux_codepoint_data_type const* aux_table, bool do_lower_case, - uint32_t* d_output, - int8_t* chars_per_thread) + uint32_t* d_output) { uint32_t replacement[MAX_NEW_CHARS] = {0}; @@ -423,16 +410,7 @@ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, } } - if (idx < total_bytes) { - chars_per_thread[idx] = num_new_bytes; - // d_output += idx * MAX_NEW_CHARS; - // #pragma unroll - // for (int k = 0; k < MAX_NEW_CHARS; ++k) { - // *d_output++ = replacement[k]; - // } - } - - // BLOCK_STORE_TRANSPOSE + // alternate algorithm BLOCK_STORE_TRANSPOSE using block_store = cub::BlockStore; __shared__ typename block_store::TempStorage bs_stg; @@ -441,7 +419,7 @@ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, } template -rmm::device_uvector compute_sizes(cudf::device_span sizes, +rmm::device_uvector compute_sizes(cudf::device_span d_codepoints, OffsetType offsets, int64_t offset, cudf::size_type size, @@ -449,10 +427,23 @@ rmm::device_uvector compute_sizes(cudf::device_span(size, stream); - auto d_in = sizes.data(); - auto d_out = output_sizes.begin(); - std::size_t temp = 0; - nvtxRangePushA("n_compute_sizes_segred"); + auto d_cps = d_codepoints.data(); + + // counts the non-zero bytes in the d_cps arrays + auto d_in = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([d_cps] __device__(auto idx) { + idx = idx * MAX_NEW_CHARS; + auto size = cudf::size_type{0}; + for (int k = 0; k < MAX_NEW_CHARS; ++k) { + auto const v = d_cps[idx + k]; + size += + ((v & 0xFF) > 0) + ((v & 0xFF00) > 0) + ((v & 0xFF0000) > 0) + ((v & 0xFF000000) > 0); + } + return size; + })); + + auto d_out = output_sizes.begin(); + auto temp = std::size_t{0}; if (offset == 0) { cub::DeviceSegmentedReduce::Sum( nullptr, temp, d_in, d_out, size, offsets, offsets + 1, stream.value()); @@ -470,12 +461,11 @@ rmm::device_uvector 
compute_sizes(cudf::device_span OutputIterator remove_copy_safe(InputIterator first, InputIterator last, @@ -496,6 +486,7 @@ OutputIterator remove_copy_safe(InputIterator first, return result; } +// handles ranges above int32 max template Iterator remove_safe(Iterator first, Iterator last, T const& value, rmm::cuda_stream_view stream) { @@ -538,52 +529,42 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto const parameters = normalizer._impl; - auto d_sizes = rmm::device_uvector(chars_size, stream); - auto d_code_points = rmm::device_uvector(max_new_char_total, stream); - nvtxRangePushA("n_normalizer_kernel"); + auto d_codepoints = rmm::device_uvector(max_new_char_total, stream); data_normalizer_kernel<<>>( d_input_chars, chars_size, parameters->cp_metadata.data(), parameters->aux_table.data(), parameters->do_lower_case, - d_code_points.data(), - d_sizes.data()); - stream.synchronize(); - nvtxRangePop(); + d_codepoints.data()); - // This removes space adding around any special tokens in the form of [ttt]. + // This removes space added around any special tokens in the form of [ttt]. // An alternate approach is to do a multi-replace of '[ ttt ]' with '[ttt]' right // before returning the output strings column. 
auto const special_tokens = parameters->get_special_tokens(); if (!special_tokens.empty()) { - nvtxRangePushA("n_special_tokens_kernel"); special_tokens_kernel<<>>( - d_code_points.data(), chars_size, special_tokens, d_sizes.data()); - stream.synchronize(); - nvtxRangePop(); + d_codepoints.data(), chars_size, special_tokens); } auto const input_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); - // use segmented-reduce with input_offsets over d_sizes to get the size of the output rows - auto output_sizes = compute_sizes(d_sizes, input_offsets, first_offset, input.size(), stream); + // Use segmented-reduce over the non-zero codepoints to get the size of the output rows + auto output_sizes = + compute_sizes(d_codepoints, input_offsets, first_offset, input.size(), stream); // convert the sizes to offsets auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( output_sizes.begin(), output_sizes.end(), stream, mr); - // create output chars and use remove-copy(0) on d_code_points + // create output chars by calling remove_copy(0) on the bytes in d_codepoints rmm::device_uvector chars(total_size, stream, mr); - auto begin = reinterpret_cast(d_code_points.begin()); - auto end = reinterpret_cast(d_code_points.end()); - nvtxRangePushA("n_remove_copy"); - // auto end = reinterpret_cast( - // remove_safe(d_code_points.begin(), d_code_points.end(), 0, stream)); + auto begin = reinterpret_cast(d_codepoints.begin()); + // the first remove() here speeds up the remove_copy() by roughly 10% + auto end = + reinterpret_cast(remove_safe(d_codepoints.begin(), d_codepoints.end(), 0, stream)); remove_copy_safe(begin, end, chars.data(), 0, stream); - stream.synchronize(); - nvtxRangePop(); return cudf::make_strings_column(input.size(), std::move(offsets), diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index 559695e00d3..4c54409c41a 100644 --- 
a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -238,7 +238,6 @@ uvector_pair data_normalizer::normalize(cudf::strings_column_view const& input, rmm::device_uvector d_chars_per_thread(threads_on_device, stream); auto const d_strings = input.chars_begin(stream) + cudf::strings::detail::get_offset_value( input.offsets(), input.offset(), stream); - nvtxRangePushA("o_kernel_data_normalizer"); kernel_data_normalizer<<>>( reinterpret_cast(d_strings), bytes_count, @@ -247,37 +246,26 @@ uvector_pair data_normalizer::normalize(cudf::strings_column_view const& input, do_lower_case, d_code_points->data(), d_chars_per_thread.data()); - stream.synchronize(); - nvtxRangePop(); // Remove the 'empty' code points from the vector - nvtxRangePushA("o_remove"); thrust::remove(rmm::exec_policy(stream), d_code_points->begin(), d_code_points->end(), uint32_t{1 << FILTER_BIT}); - stream.synchronize(); - nvtxRangePop(); // We also need to prefix sum the number of characters up to an including // the current character in order to get the new strings lengths. 
- nvtxRangePushA("o_inclusive_scan"); thrust::inclusive_scan(rmm::exec_policy(stream), d_chars_per_thread.begin(), d_chars_per_thread.end(), d_chars_per_thread.begin()); - stream.synchronize(); - nvtxRangePop(); // This will reset the offsets to the new generated code point values - nvtxRangePushA("o_update_strings_lengths"); thrust::for_each_n( rmm::exec_policy(stream), thrust::make_counting_iterator(1), input.size(), update_strings_lengths_fn{d_chars_per_thread.data(), d_strings_offsets->data()}); - stream.synchronize(); - nvtxRangePop(); auto const num_chars = d_strings_offsets->element(input.size(), stream); d_code_points->resize(num_chars, stream); // should be smaller than original allocated size diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 37b6d51fcbd..0223bcc428a 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -146,34 +145,32 @@ TEST_F(TextNormalizeTest, WithNormalizer) auto normalizer = nvtext::create_character_normalizer(true); auto results = nvtext::normalize_characters(sv, *normalizer); - // cudf::test::print(results->view()); - auto expected = cudf::test::strings_column_wrapper({"abc£def", - "", - "ee a io aeio", - " acen u", - "acen u", - "p ^ np", - " $ 41 . 07", - " [ a , b ] ", - " 丏 丟 ", - ""}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + auto expected = cudf::test::strings_column_wrapper({"abc£def", + "", + "ee a io aeio", + " acen u", + "acen u", + "p ^ np", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); normalizer = nvtext::create_character_normalizer(false); results = nvtext::normalize_characters(sv, *normalizer); - // cudf::test::print(results->view()); - expected = cudf::test::strings_column_wrapper({"abc£def", - "", - "éè â îô aeio", - " ĂĆĖÑ Ü", - "ACEN U", - "P ^ NP", - " $ 41 . 
07", - " [ a , b ] ", - " 丏 丟 ", - ""}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + expected = cudf::test::strings_column_wrapper({"abc£def", + "", + "éè â îô aeio", + " ĂĆĖÑ Ü", + "ACEN U", + "P ^ NP", + " $ 41 . 07", + " [ a , b ] ", + " 丏 丟 ", + ""}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -188,21 +185,21 @@ TEST_F(TextNormalizeTest, SpecialTokens) auto sv = cudf::strings_column_view(input); auto special_tokens = cudf::test::strings_column_wrapper({"[BOS]", "[EOS]", "[SEP]", "[PAD]"}); auto stv = cudf::strings_column_view(special_tokens); - auto normalizer = nvtext::create_character_normalizer(true, stv); - auto results = nvtext::normalize_characters(sv, *normalizer); - // cudf::test::print(results->view()); - auto expected = cudf::test::strings_column_wrapper( + + auto normalizer = nvtext::create_character_normalizer(true, stv); + auto results = nvtext::normalize_characters(sv, *normalizer); + auto expected = cudf::test::strings_column_wrapper( {" [bos] some strings with [pad] special [sep] tokens [eos] ", - " [bos] these should [sep] work too [eos] ", - "some [ non ] tokens [ eol ] too"}); + " [bos] these should [sep] work too [eos] ", + "some [ non ] tokens [ eol ] too"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + normalizer = nvtext::create_character_normalizer(false, stv); results = nvtext::normalize_characters(sv, *normalizer); - // cudf::test::print(results->view()); - expected = cudf::test::strings_column_wrapper( + expected = cudf::test::strings_column_wrapper( {" [BOS] Some strings with [PAD] special [SEP] tokens [EOS] ", - " [ bos ] these should [ sep ] work too [ eos ] ", - "some [ non ] tokens [ eol ] too"}); + " [ bos ] these should [ sep ] work too [ eos ] ", + "some [ non ] tokens [ eol ] too"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } @@ -226,8 +223,9 @@ TEST_F(TextNormalizeTest, SlicedColumn) auto input = cudf::test::strings_column_wrapper( {"abc£def", "éè â îô\taeio", "ACEN 
U", "P^NP", "$41.07", "[a,b]", "丏丟"}); - std::vector sliced = cudf::split(input, {4}); - auto normalizer = nvtext::create_character_normalizer(true); + auto sliced = cudf::split(input, {4}); + + auto normalizer = nvtext::create_character_normalizer(true); auto results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer); auto expected = From deb33ac3ec98d04fb22e1e096bbadddc3180a263 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 30 Jan 2025 18:34:53 -0500 Subject: [PATCH 10/21] fix pylibcudf normalize-characters pytest --- python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py index 25b6d1389ec..78fb4164a61 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. 
import pyarrow as pa import pytest @@ -29,7 +29,7 @@ def test_normalize_spaces(norm_spaces_input_data): @pytest.mark.parametrize("do_lower", [True, False]) def test_normalize_characters(norm_chars_input_data, do_lower): - result = plc.nvtext.normalize.normalize_characters( + result = plc.nvtext.normalize.characters_normalize( plc.interop.from_arrow(norm_chars_input_data), do_lower, ) From f2f35a616edb859fc27633f07b3a4c37f769a7f7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 3 Feb 2025 17:23:08 -0500 Subject: [PATCH 11/21] add more gtests and pytests --- cpp/tests/text/normalize_tests.cpp | 36 +++---- .../pylibcudf/tests/test_nvtext_normalize.py | 93 ++++++++++++++++++- 2 files changed, 107 insertions(+), 22 deletions(-) diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 0223bcc428a..59cf611babb 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -74,6 +74,10 @@ TEST_F(TextNormalizeTest, NormalizeEmptyTest) EXPECT_EQ(results->size(), 0); results = nvtext::normalize_characters(strings_view, false); EXPECT_EQ(results->size(), 0); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + EXPECT_EQ(results->size(), 0); } TEST_F(TextNormalizeTest, AllNullStrings) @@ -84,6 +88,10 @@ TEST_F(TextNormalizeTest, AllNullStrings) CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); results = nvtext::normalize_characters(strings_view, false); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings); } TEST_F(TextNormalizeTest, SomeNullStrings) @@ -93,6 +101,10 @@ TEST_F(TextNormalizeTest, SomeNullStrings) auto results = nvtext::normalize_characters(strings_view, false); cudf::test::strings_column_wrapper expected({"", " . 
", "a"}, {false, true, true}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + + auto normalizer = nvtext::create_character_normalizer(true); + results = nvtext::normalize_characters(strings_view, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(TextNormalizeTest, NormalizeCharacters) @@ -176,7 +188,6 @@ TEST_F(TextNormalizeTest, WithNormalizer) TEST_F(TextNormalizeTest, SpecialTokens) { - // These include punctuation, accents, whitespace, and CJK characters auto input = cudf::test::strings_column_wrapper({"[BOS]Some strings with [PAD] special[SEP]tokens[EOS]", "[bos]these should[sep]work too[eos]", @@ -210,26 +221,17 @@ TEST_F(TextNormalizeTest, NormalizeSlicedColumn) std::vector sliced = cudf::split(strings, {4}); auto results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), true); - cudf::test::strings_column_wrapper expected({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + auto expected = + cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); - results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false); - cudf::test::strings_column_wrapper expected2({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2); -} - -TEST_F(TextNormalizeTest, SlicedColumn) -{ - auto input = cudf::test::strings_column_wrapper( - {"abc£def", "éè â îô\taeio", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟"}); - - auto sliced = cudf::split(input, {4}); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false); + expected = cudf::test::strings_column_wrapper({" $ 41 . 
07", " [ a , b ] ", " 丏 丟 "}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); auto normalizer = nvtext::create_character_normalizer(true); - auto results = - nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer); - auto expected = - cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); + results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer); + expected = cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); normalizer = nvtext::create_character_normalizer(false); diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py index 78fb4164a61..47bbb191be6 100644 --- a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py +++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py @@ -15,7 +15,7 @@ def norm_spaces_input_data(): @pytest.fixture(scope="module") def norm_chars_input_data(): - arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"] + arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]", "[pad]"] return pa.array(arr) @@ -33,11 +33,94 @@ def test_normalize_characters(norm_chars_input_data, do_lower): plc.interop.from_arrow(norm_chars_input_data), do_lower, ) - expected = pa.array( - ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "] + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + else: + expected = pa.array( + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 
08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalizer(norm_chars_input_data, do_lower): + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + plc.nvtext.normalize.CharacterNormalizer( + do_lower, + plc.column_factories.make_empty_column(plc.types.TypeId.STRING), + ), ) - if not do_lower: + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + else: + expected = pa.array( + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 08", + " [ a , bb ] ", + " [ pad ] ", + ] + ) + assert_column_eq(result, expected) + + +@pytest.mark.parametrize("do_lower", [True, False]) +def test_normalizer_with_special_tokens(norm_chars_input_data, do_lower): + special_tokens = pa.array(["[pad]"]) + result = plc.nvtext.normalize.normalize_characters( + plc.interop.from_arrow(norm_chars_input_data), + plc.nvtext.normalize.CharacterNormalizer( + do_lower, plc.interop.from_arrow(special_tokens) + ), + ) + if do_lower: + expected = pa.array( + [ + "eaio eaio", + "acenu", + "acenu", + " $ 24 . 08", + " [ a , bb ] ", + " [pad] ", + ] + ) + else: expected = pa.array( - ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "] + [ + "éâîô eaio", + "ĂĆĖÑÜ", + "ACENU", + " $ 24 . 
08", + " [ a , bb ] ", + " [pad] ", + ] ) assert_column_eq(result, expected) From fc86efa17ab34a38a1d93fe5af94fe80b11be6e5 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 4 Feb 2025 13:33:56 -0500 Subject: [PATCH 12/21] add longer strings to gtests --- cpp/src/text/normalize.cu | 132 ++++++++++++++++++----------- cpp/tests/text/normalize_tests.cpp | 52 +++++++++--- 2 files changed, 124 insertions(+), 60 deletions(-) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index b4606243775..e37da41c8c3 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -23,8 +23,6 @@ #include #include #include -#include -#include #include #include #include @@ -43,6 +41,7 @@ #include #include +#include #include #include #include @@ -281,12 +280,6 @@ struct character_normalizer::character_normalizer_impl { } }; -// lambdas not allowed in constructors -struct size_bytes_fn { - cudf::string_view const* d_tokens; - __device__ cudf::size_type operator()(cudf::size_type idx) { return d_tokens[idx].size_bytes(); } -}; - character_normalizer::character_normalizer(bool do_lower_case, cudf::strings_column_view const& special_tokens, rmm::cuda_stream_view stream, @@ -300,7 +293,7 @@ character_normalizer::character_normalizer(bool do_lower_case, auto sorted = std::move( cudf::sort(cudf::table_view({special_tokens.parent()}), {}, {}, stream)->release().front()); if (do_lower_case) { - // lower case the tokens so they will match the normalized input + // lower-case the tokens so they will match the normalized input sorted = cudf::strings::to_lower(cudf::strings_column_view(sorted->view()), stream); } @@ -329,36 +322,65 @@ std::unique_ptr create_character_normalizer( namespace detail { namespace { -CUDF_KERNEL void special_tokens_kernel(uint32_t* d_utf8chars, +/** + * @brief Kernel handles fixing up the normalized data to account for any special tokens + * + * This undoes the padding added around the `[]` for patterns matching the strings in the + * 
special_tokens array. + * + * Launched as a thread per input byte (total_count). + * + * @param d_normalized The normalized set of UTF-8 characters; 3 uints per input byte + * @param total_count Number of bytes represented by d_normalized; len(d_normalized)/3 + * @param special_tokens Tokens to check against + */ +CUDF_KERNEL void special_tokens_kernel(uint32_t* d_normalized, int64_t total_count, cudf::device_span special_tokens) { auto const idx = cudf::detail::grid_1d::global_thread_id(); if (idx >= total_count) { return; } - auto const begin = d_utf8chars + (idx * MAX_NEW_CHARS) + 1; + auto const begin = d_normalized + (idx * MAX_NEW_CHARS) + 1; if (*begin != '[') { return; } auto const end = begin + std::min(6L, total_count - idx) * MAX_NEW_CHARS; auto const match = thrust::find(thrust::seq, begin, end, static_cast(']')); if (match == end) { return; } char candidate[8]; - auto itr = thrust::transform_iterator(begin, [](auto v) { return static_cast(v); }); - auto eitr = itr + thrust::distance(begin, match + 1); - auto last = - thrust::copy_if(thrust::seq, itr, eitr, candidate, [](auto c) { return c != 0 && c != ' '; }); + auto const ch_begin = + thrust::transform_iterator(begin, [](auto v) { return static_cast(v); }); + auto const ch_end = ch_begin + thrust::distance(begin, match + 1); + auto last = thrust::copy_if( + thrust::seq, ch_begin, ch_end, candidate, [](auto c) { return c != 0 && c != ' '; }); *last = 0; // only needed for debug auto const size = static_cast(thrust::distance(candidate, last)); auto const token = cudf::string_view(candidate, size); - if (thrust::find(thrust::seq, special_tokens.begin(), special_tokens.end(), token) == - special_tokens.end()) { + // the binary_search expects the special_tokens to be sorted + if (!thrust::binary_search(thrust::seq, special_tokens.begin(), special_tokens.end(), token)) { return; } // fix up chars to remove the extra spaces - *(begin + 1) = 0; - *(match - 1) = 0; + *(begin + 1) = 0; // removes space after 
'[' + *(match - 1) = 0; // removes space before ']' } +/** + * @brief The normalizer kernel + * + * Launched as a thread per input byte (total_bytes). + * + * Converts the input d_chars into codepoints to lookup in the provided tables. + * Once processed, the d_output contains 3 uints per input byte each encoded + * as output UTF-8. Any zero values are to removed by a subsequent kernel call. + * + * @param d_chars The characters for the input strings column to normalize + * @param total_bytes The number of bytes in the d_chars + * @param cp_metadata First lookup table for codepoint metadata + * @param aux_table Second lookup table containing possible replacement characters + * @param do_lower_case True if the normalization includes lower-casing characters + * @param d_output The output of the normalization (UTF-8 encoded) + */ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, int64_t total_bytes, codepoint_metadata_type const* cp_metadata, @@ -381,7 +403,7 @@ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, if (!should_remove_cp(metadata, do_lower_case)) { int8_t num_new_chars = 1; - // Apply lower cases and accent stripping if necessary + // retrieve the normalized value for cp auto const new_cp = do_lower_case || always_replace(metadata) ? get_first_cp(metadata) : cp; replacement[0] = new_cp == 0 ? 
cp : new_cp; @@ -393,14 +415,13 @@ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, } if (should_add_spaces(metadata, do_lower_case) && (num_new_chars == 1)) { - // write the required spaces at each end replacement[1] = replacement[0]; - replacement[0] = SPACE_CODE_POINT; + replacement[0] = SPACE_CODE_POINT; // add spaces around the new codepoint replacement[2] = SPACE_CODE_POINT; num_new_chars = 3; } - // convert back to UTF-8 + // convert codepoints back to UTF-8 in-place for (int k = 0; k < num_new_chars; ++k) { auto const new_cp = replacement[k]; if (new_cp) { @@ -410,7 +431,7 @@ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, } } - // alternate algorithm BLOCK_STORE_TRANSPOSE + // employ an optimized coalesced writer to output replacement as a block of transposed data using block_store = cub::BlockStore; __shared__ typename block_store::TempStorage bs_stg; @@ -418,8 +439,20 @@ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, block_store(bs_stg).Store(block_base, replacement); } +/** + * @brief Computes the output sizes for each row + * + * The input offsets are used with segmented-reduce to count the number of + * non-zero values for each output row. 
+ * + * @param d_normalized The UTF-8 encoded normalized values + * @param offsets These identify the row boundaries + * @param offset Only non-zero if the input column has been sliced + * @param size The number of output rows (sames as the number of input rows) + * @param stream Stream used for allocating device memory and launching kernels + */ template -rmm::device_uvector compute_sizes(cudf::device_span d_codepoints, +rmm::device_uvector compute_sizes(cudf::device_span d_normalized, OffsetType offsets, int64_t offset, cudf::size_type size, @@ -427,21 +460,23 @@ rmm::device_uvector compute_sizes(cudf::device_span(size, stream); - auto d_cps = d_codepoints.data(); + auto d_data = d_normalized.data(); - // counts the non-zero bytes in the d_cps arrays + // counts the non-zero bytes in the d_data array auto d_in = cudf::detail::make_counting_transform_iterator( - 0, cuda::proclaim_return_type([d_cps] __device__(auto idx) { - idx = idx * MAX_NEW_CHARS; - auto size = cudf::size_type{0}; - for (int k = 0; k < MAX_NEW_CHARS; ++k) { - auto const v = d_cps[idx + k]; - size += - ((v & 0xFF) > 0) + ((v & 0xFF00) > 0) + ((v & 0xFF0000) > 0) + ((v & 0xFF000000) > 0); - } - return size; + 0, cuda::proclaim_return_type([d_data] __device__(auto idx) { + idx = idx * MAX_NEW_CHARS; + // transform function counts number of non-zero bytes in uint32_t value + auto tfn = [](uint32_t v) -> cudf::size_type { + return ((v & 0xFF) > 0) + ((v & 0xFF00) > 0) + ((v & 0xFF0000) > 0) + + ((v & 0xFF000000) > 0); + }; + auto const begin = d_data + idx; + auto const end = begin + MAX_NEW_CHARS; + return thrust::transform_reduce(thrust::seq, begin, end, tfn, 0, thrust::plus{}); })); + // DeviceSegmentedReduce is used to compute the size of each output row auto d_out = output_sizes.begin(); auto temp = std::size_t{0}; if (offset == 0) { @@ -502,8 +537,8 @@ Iterator remove_safe(Iterator first, Iterator last, T const& value, rmm::cuda_st } return result; } - } // namespace + std::unique_ptr 
normalize_characters(cudf::strings_column_view const& input, character_normalizer const& normalizer, rmm::cuda_stream_view stream, @@ -529,14 +564,14 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto const parameters = normalizer._impl; - auto d_codepoints = rmm::device_uvector(max_new_char_total, stream); + auto d_normalized = rmm::device_uvector(max_new_char_total, stream); data_normalizer_kernel<<>>( d_input_chars, chars_size, parameters->cp_metadata.data(), parameters->aux_table.data(), parameters->do_lower_case, - d_codepoints.data()); + d_normalized.data()); // This removes space added around any special tokens in the form of [ttt]. // An alternate approach is to do a multi-replace of '[ ttt ]' with '[ttt]' right @@ -544,26 +579,25 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con auto const special_tokens = parameters->get_special_tokens(); if (!special_tokens.empty()) { special_tokens_kernel<<>>( - d_codepoints.data(), chars_size, special_tokens); + d_normalized.data(), chars_size, special_tokens); } + // Use segmented-reduce over the non-zero codepoints to get the size of the output rows auto const input_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()); - - // Use segmented-reduce over the non-zero codepoints to get the size of the output rows auto output_sizes = - compute_sizes(d_codepoints, input_offsets, first_offset, input.size(), stream); + compute_sizes(d_normalized, input_offsets, first_offset, input.size(), stream); // convert the sizes to offsets auto [offsets, total_size] = cudf::strings::detail::make_offsets_child_column( output_sizes.begin(), output_sizes.end(), stream, mr); - // create output chars by calling remove_copy(0) on the bytes in d_codepoints - rmm::device_uvector chars(total_size, stream, mr); - auto begin = reinterpret_cast(d_codepoints.begin()); - // the first remove() here speeds up the remove_copy() by roughly 10% - auto end = - 
reinterpret_cast(remove_safe(d_codepoints.begin(), d_codepoints.end(), 0, stream)); + // create output chars by calling remove_copy(0) on the bytes in d_normalized + auto chars = rmm::device_uvector(total_size, stream, mr); + auto const begin = reinterpret_cast(d_normalized.begin()); + // the remove() above speeds up the remove_copy() by roughly 10% + auto const end = + reinterpret_cast(remove_safe(d_normalized.begin(), d_normalized.end(), 0, stream)); remove_copy_safe(begin, end, chars.data(), 0, stream); return cudf::make_strings_column(input.size(), diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index 59cf611babb..fc2a0ef1660 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -148,10 +148,23 @@ TEST_F(TextNormalizeTest, NormalizeCharacters) TEST_F(TextNormalizeTest, WithNormalizer) { - // These include punctuation, accents, whitespace, and CJK characters - auto input = cudf::test::strings_column_wrapper( - {"abc£def", "", "éè â îô\taeio", "\tĂĆĖÑ Ü", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟", ""}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + auto long_row = + "this entry is intended to pad out past 256 bytes which is currently the block size"; + // the following include punctuation, accents, whitespace, and CJK characters + auto input = cudf::test::strings_column_wrapper({"abc£def", + "", + "éè â îô\taeio", + "\tĂĆĖÑ Ü", + "ACEN U", + "P^NP", + "$41.07", + "[a,b]", + "丏丟", + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); auto const sv = cudf::strings_column_view(input); @@ -166,8 +179,11 @@ TEST_F(TextNormalizeTest, WithNormalizer) " $ 41 . 
07", " [ a , b ] ", " 丏 丟 ", - ""}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); normalizer = nvtext::create_character_normalizer(false); @@ -181,17 +197,25 @@ TEST_F(TextNormalizeTest, WithNormalizer) " $ 41 . 07", " [ a , b ] ", " 丏 丟 ", - ""}, - {1, 0, 1, 1, 1, 1, 1, 1, 1, 1}); + "", + long_row, + long_row, + long_row}, + {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(TextNormalizeTest, SpecialTokens) { + auto long_row = + "this entry is intended to pad out past 256 bytes which is currently the block size"; auto input = cudf::test::strings_column_wrapper({"[BOS]Some strings with [PAD] special[SEP]tokens[EOS]", "[bos]these should[sep]work too[eos]", - "some[non]tokens[eol]too"}); + "some[non]tokens[eol]too", + long_row, + long_row, + long_row}); auto sv = cudf::strings_column_view(input); auto special_tokens = cudf::test::strings_column_wrapper({"[BOS]", "[EOS]", "[SEP]", "[PAD]"}); @@ -202,7 +226,10 @@ TEST_F(TextNormalizeTest, SpecialTokens) auto expected = cudf::test::strings_column_wrapper( {" [bos] some strings with [pad] special [sep] tokens [eos] ", " [bos] these should [sep] work too [eos] ", - "some [ non ] tokens [ eol ] too"}); + "some [ non ] tokens [ eol ] too", + long_row, + long_row, + long_row}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); normalizer = nvtext::create_character_normalizer(false, stv); @@ -210,7 +237,10 @@ TEST_F(TextNormalizeTest, SpecialTokens) expected = cudf::test::strings_column_wrapper( {" [BOS] Some strings with [PAD] special [SEP] tokens [EOS] ", " [ bos ] these should [ sep ] work too [ eos ] ", - "some [ non ] tokens [ eol ] too"}); + "some [ non ] tokens [ eol ] too", + long_row, + long_row, + long_row}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } From 756fa6da3928ce9cf79b153632208b99d76c648f Mon Sep 17 00:00:00 2001 From: 
David Wendt Date: Tue, 4 Feb 2025 15:41:15 -0500 Subject: [PATCH 13/21] fix typos --- cpp/include/nvtext/normalize.hpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 4bd257b7d45..4f55a5f7b8e 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -133,11 +133,9 @@ std::unique_ptr normalize_characters( * accent is removed. * * If `special_tokens` are included the padding after `[` and before `]` is not - * inserted if the character between them match one of the given tokens. - * If `do_lower_case = true` the `special_tokens` are expected to contain - * lower-case characters. + * inserted if the characters between them match one of the given tokens. * Also, the `special_tokens` are expected to include the `[]` characters - * at the beginning of and end of each string respectively. + * at the beginning of and end of each string appropriately. */ struct character_normalizer { /** @@ -148,7 +146,7 @@ struct character_normalizer { * @param do_lower_case If true, upper-case characters are converted to * lower-case and accents are stripped from those characters. * If false, accented and upper-case characters are not transformed. - * @param special_tokens Individual sequences including `[]` brackets. + * @param special_tokens Each row is a token including the `[]` brackets. * For example: `[BOS]`, `[EOS]`, `[UNK]`, `[SEP]`, `[PAD]`, `[CLS]`, `[MASK]` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory @@ -174,7 +172,7 @@ struct character_normalizer { * @param do_lower_case If true, upper-case characters are converted to * lower-case and accents are stripped from those characters. * If false, accented and upper-case characters are not transformed. - * @param special_tokens Individual sequences including `[]` brackets. 
+ * @param special_tokens Individual tokens including `[]` brackets. * Default is no special tokens. * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory From 2a2b9da135f7f08e2365648d6f96ca2de59b72e6 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 4 Feb 2025 15:41:39 -0500 Subject: [PATCH 14/21] remove unneeded includes --- cpp/src/text/normalize.cuh | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/cpp/src/text/normalize.cuh b/cpp/src/text/normalize.cuh index 8b1f4408ee8..3972726d536 100644 --- a/cpp/src/text/normalize.cuh +++ b/cpp/src/text/normalize.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2025, NVIDIA CORPORATION. + * Copyright (c) 2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,12 +18,6 @@ #include "text/subword/detail/cp_data.h" -#include -#include -#include - -#include - namespace nvtext { namespace detail { From c205062e620ee6f01d010ccf2212a92d06ec046c Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 4 Feb 2025 15:41:58 -0500 Subject: [PATCH 15/21] remove unneeded variable --- cpp/src/text/normalize.cu | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index e37da41c8c3..501034766ec 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -390,8 +390,7 @@ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, { uint32_t replacement[MAX_NEW_CHARS] = {0}; - auto const idx = cudf::detail::grid_1d::global_thread_id(); - int8_t num_new_bytes = 0; + auto const idx = cudf::detail::grid_1d::global_thread_id(); if ((idx < total_bytes) && cudf::strings::detail::is_begin_utf8_char(d_chars[idx])) { auto const cp = [utf8 = d_chars + idx] { @@ -424,9 +423,7 @@ CUDF_KERNEL void 
data_normalizer_kernel(char const* d_chars, // convert codepoints back to UTF-8 in-place for (int k = 0; k < num_new_chars; ++k) { auto const new_cp = replacement[k]; - if (new_cp) { - num_new_bytes += cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); - } + if (new_cp) { cp_to_utf8(new_cp, reinterpret_cast(replacement + k)); } } } } @@ -450,6 +447,7 @@ CUDF_KERNEL void data_normalizer_kernel(char const* d_chars, * @param offset Only non-zero if the input column has been sliced * @param size The number of output rows (sames as the number of input rows) * @param stream Stream used for allocating device memory and launching kernels + * @return The sizes of each output row */ template rmm::device_uvector compute_sizes(cudf::device_span d_normalized, From b46df42b96e32edd49a26c1e790d4a1a052a4d39 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 7 Feb 2025 14:04:28 -0500 Subject: [PATCH 16/21] fix comment formatting in python source --- python/cudf/cudf/core/character_normalizer.py | 11 ++++++----- python/pylibcudf/pylibcudf/nvtext/normalize.pyx | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/character_normalizer.py b/python/cudf/cudf/core/character_normalizer.py index 00ee43598f6..1240c0e1eb7 100644 --- a/python/cudf/cudf/core/character_normalizer.py +++ b/python/cudf/cudf/core/character_normalizer.py @@ -14,10 +14,10 @@ class CharacterNormalizer: Parameters ---------- do_lower : bool - Set to True if the normalizer should also lower-case + If True, the normalizer should also lower-case while normalizing. - special_tokens : cudf string series - Strings column of special tokens. + special_tokens : cudf.Series + Series of special tokens. """ def __init__( @@ -33,12 +33,13 @@ def normalize(self, text: cudf.Series) -> cudf.Series: """ Parameters ---------- - text : cudf string series + text : cudf.Series The strings to be normalized. 
Returns ------- - Normalized strings + cudf.Series + Normalized strings """ result = text._column.normalize_characters(self.normalizer) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx index ada361c5e79..6a18c205841 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -13,7 +13,8 @@ __all__ = [ "CharacterNormalizer" "normalize_characters", "normalize_spaces", - "characters_normalize"] + "characters_normalize" +] cdef class CharacterNormalizer: """The normalizer object to be used with ``normalize_characters``. From 2d92791bd1c06f00051aad6ef4fb44609e3ea4bb Mon Sep 17 00:00:00 2001 From: David Wendt Date: Mon, 10 Feb 2025 18:59:17 -0500 Subject: [PATCH 17/21] change min() to cuda::std::min() --- cpp/src/text/normalize.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 501034766ec..f20a4102a6e 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -41,6 +41,7 @@ #include #include +#include #include #include #include @@ -342,7 +343,7 @@ CUDF_KERNEL void special_tokens_kernel(uint32_t* d_normalized, if (idx >= total_count) { return; } auto const begin = d_normalized + (idx * MAX_NEW_CHARS) + 1; if (*begin != '[') { return; } - auto const end = begin + std::min(6L, total_count - idx) * MAX_NEW_CHARS; + auto const end = begin + cuda::std::min(6L, total_count - idx) * MAX_NEW_CHARS; auto const match = thrust::find(thrust::seq, begin, end, static_cast(']')); if (match == end) { return; } char candidate[8]; From 10e3e4b05095dbf6ee913d4b452b71f3279c98f8 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Thu, 13 Feb 2025 14:44:09 -0500 Subject: [PATCH 18/21] add get_first_and_last_offset utility --- cpp/include/cudf/strings/detail/utilities.hpp | 14 +++++++++++++- cpp/src/strings/utilities.cu | 14 +++++++++++++- cpp/src/text/normalize.cu | 9 ++------- 
cpp/tests/text/normalize_tests.cpp | 8 ++++++++ 4 files changed, 36 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index d276c5df7dc..8fb1f30f961 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -96,5 +96,17 @@ int64_t get_offset_value(cudf::column_view const& offsets, size_type index, rmm::cuda_stream_view stream); +/** + * @brief Return the first and last offset in the given strings column + * + * This accounts for sliced input columns as well. + * + * @param input Strings column + * @param stream CUDA stream used for device memory operations and kernel launches + * @return First and last offset values + */ +std::pair get_first_and_last_offset(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream); + } // namespace strings::detail } // namespace CUDF_EXPORT cudf diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu index 45bd4615435..c5d46598d4a 100644 --- a/cpp/src/strings/utilities.cu +++ b/cpp/src/strings/utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -180,6 +180,18 @@ int64_t get_offset_value(cudf::column_view const& offsets, : cudf::detail::get_value(offsets, index, stream); } +std::pair get_first_and_last_offset(cudf::strings_column_view const& input, + rmm::cuda_stream_view stream) +{ + if (input.is_empty()) { return {0L, 0L}; } + auto const first_offset = (input.offset() == 0) ? 
0 + : cudf::strings::detail::get_offset_value( + input.offsets(), input.offset(), stream); + auto const last_offset = + cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream); + return {first_offset, last_offset}; +} + } // namespace detail rmm::device_uvector create_string_vector_from_column( diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index f20a4102a6e..931ed2de4bb 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -545,13 +545,8 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con { if (input.is_empty()) { return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING}); } - auto const first_offset = (input.offset() == 0) ? 0 - : cudf::strings::detail::get_offset_value( - input.offsets(), input.offset(), stream); - auto const last_offset = (input.offset() == 0 && input.size() == input.offsets().size() - 1) - ? input.chars_size(stream) - : cudf::strings::detail::get_offset_value( - input.offsets(), input.size() + input.offset(), stream); + auto [first_offset, last_offset] = + cudf::strings::detail::get_first_and_last_offset(input, stream); auto const chars_size = last_offset - first_offset; auto const d_input_chars = input.chars_begin(stream) + first_offset; diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp index fc2a0ef1660..530148eb654 100644 --- a/cpp/tests/text/normalize_tests.cpp +++ b/cpp/tests/text/normalize_tests.cpp @@ -185,6 +185,8 @@ TEST_F(TextNormalizeTest, WithNormalizer) long_row}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // test normalizer re-use + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); normalizer = nvtext::create_character_normalizer(false); results = nvtext::normalize_characters(sv, *normalizer); @@ -203,6 +205,8 @@ TEST_F(TextNormalizeTest, WithNormalizer) long_row}, {1, 0, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(TextNormalizeTest, SpecialTokens) @@ -231,6 +235,8 @@ TEST_F(TextNormalizeTest, SpecialTokens) long_row, long_row}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // and again + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); normalizer = nvtext::create_character_normalizer(false, stv); results = nvtext::normalize_characters(sv, *normalizer); @@ -242,6 +248,8 @@ TEST_F(TextNormalizeTest, SpecialTokens) long_row, long_row}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); + results = nvtext::normalize_characters(sv, *normalizer); // and again + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); } TEST_F(TextNormalizeTest, NormalizeSlicedColumn) From 29bee66157b2860eeb57f32c2e81d7c463ce79e7 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Tue, 18 Feb 2025 13:39:08 -0500 Subject: [PATCH 19/21] change _impl to smart-pointer --- cpp/include/nvtext/normalize.hpp | 2 +- cpp/src/text/normalize.cu | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp index 4f55a5f7b8e..70ee7891ad7 100644 --- a/cpp/include/nvtext/normalize.hpp +++ b/cpp/include/nvtext/normalize.hpp @@ -158,7 +158,7 @@ struct character_normalizer { ~character_normalizer(); struct character_normalizer_impl; - character_normalizer_impl* _impl{}; + std::unique_ptr _impl; }; /** diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu index 931ed2de4bb..0e680e98ec5 100644 --- a/cpp/src/text/normalize.cu +++ b/cpp/src/text/normalize.cu @@ -301,14 +301,14 @@ character_normalizer::character_normalizer(bool do_lower_case, auto tokens_view = cudf::strings::detail::create_string_vector_from_column( cudf::strings_column_view(sorted->view()), stream, 
cudf::get_current_device_resource_ref()); - _impl = new character_normalizer_impl(std::move(cp_metadata), - std::move(aux_table), - do_lower_case, - std::move(sorted), - std::move(tokens_view)); + _impl = std::make_unique(std::move(cp_metadata), + std::move(aux_table), + do_lower_case, + std::move(sorted), + std::move(tokens_view)); } -character_normalizer::~character_normalizer() { delete _impl; } +character_normalizer::~character_normalizer() {} std::unique_ptr create_character_normalizer( bool do_lower_case, @@ -556,7 +556,7 @@ std::unique_ptr normalize_characters(cudf::strings_column_view con cudf::detail::grid_1d grid{chars_size, block_size}; auto const max_new_char_total = cudf::util::round_up_safe(chars_size, block_size) * MAX_NEW_CHARS; - auto const parameters = normalizer._impl; + auto const& parameters = normalizer._impl; auto d_normalized = rmm::device_uvector(max_new_char_total, stream); data_normalizer_kernel<<>>( From e3fdb2d8e38cdb808a2e77f4d4d0b714dd3a30ab Mon Sep 17 00:00:00 2001 From: David Wendt Date: Wed, 19 Feb 2025 07:24:46 -0500 Subject: [PATCH 20/21] fix docstring --- python/cudf/cudf/core/column/string.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 37615347a47..818a6d9ad8a 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4679,8 +4679,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" Normalizes strings characters for tokenizing. 
- This uses the normalizer that is built into the - subword_tokenize function which includes: + The normalizer function includes: - adding padding around punctuation (unicode category starts with "P") as well as certain ASCII symbols like "^" and "$" From 56041b5de2482846080c22229483e06510506ee3 Mon Sep 17 00:00:00 2001 From: David Wendt Date: Fri, 21 Feb 2025 11:24:37 -0500 Subject: [PATCH 21/21] add deprecation warning --- python/cudf/cudf/core/column/string.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 818a6d9ad8a..c0ad33ec7d6 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -4679,6 +4679,9 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" Normalizes strings characters for tokenizing. + .. deprecated:: 25.04 + Use `CharacterNormalizer` instead. + The normalizer function includes: - adding padding around punctuation (unicode category starts with @@ -4719,6 +4722,11 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: 2 $ 99 dtype: object """ + warnings.warn( + "normalize_characters is deprecated and will be removed in a future " + "version. Use CharacterNormalizer instead.", + FutureWarning, + ) return self._return_or_inplace( self._column.characters_normalize(do_lower) )