rapidsai · davidwendt · Jan 24, 2025 · Jan 24, 2025 · Jan 27, 2025 · Jan 27, 2025
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -48,15 +48,18 @@ static void bench_normalize(nvbench::state& state)
                [&](nvbench::launch& launch) { auto result = nvtext::normalize_spaces(input); });
   } else {
     bool const to_lower = (normalize_type == "to_lower");
+    // we expect the normalizer to be created once and re-used
+    // so creating it is not measured
+    auto normalizer = nvtext::create_character_normalizer(to_lower);
     state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-      auto result = nvtext::normalize_characters(input, to_lower);
+      auto result = nvtext::normalize_characters(input, *normalizer);
     });
   }
 }
 
 NVBENCH_BENCH(bench_normalize)
   .set_name("normalize")
   .add_int64_axis("min_width", {0})
-  .add_int64_axis("max_width", {32, 64, 128, 256})
+  .add_int64_axis("max_width", {128, 256})
   .add_int64_axis("num_rows", {32768, 262144, 2097152})
   .add_string_axis("type", {"spaces", "characters", "to_lower"});
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -96,5 +96,17 @@ int64_t get_offset_value(cudf::column_view const& offsets,
                          size_type index,
                          rmm::cuda_stream_view stream);
 
+/**
+ * @brief Return the first and last offset in the given strings column
+ *
+ * Uses the offsets at `input.offset()` and `input.offset() + input.size()` to handle slices.
+ *
+ * @param input Strings column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return First and last offset values, or `{0, 0}` if `input` is empty
+ */
+std::pair<int64_t, int64_t> get_first_and_last_offset(cudf::strings_column_view const& input,
+                                                      rmm::cuda_stream_view stream);
+
 }  // namespace strings::detail
 }  // namespace CUDF_EXPORT cudf
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/export.hpp>
 #include <cudf/utilities/memory_resource.hpp>
@@ -107,5 +108,113 @@ std::unique_ptr<cudf::column> normalize_characters(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Normalizer object to be used with nvtext::normalize_characters
+ *
+ * Use nvtext::create_character_normalizer to create this object.
+ *
+ * This normalizer includes:
+ *
+ * - adding padding around punctuation (unicode category starts with "P")
+ *   as well as certain ASCII symbols like "^" and "$"
+ * - adding padding around the [CJK Unicode block
+ * characters](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block))
+ * - changing whitespace (e.g. `"\t", "\n", "\r"`) to just space `" "`
+ * - removing control characters (unicode categories "Cc" and "Cf")
+ *
+ * The padding process adds a single space before and after the character.
+ * Details on _unicode category_ can be found here:
+ * https://unicodebook.readthedocs.io/unicode.html#categories
+ *
+ * If `do_lower_case = true`, lower-casing also removes any accents. The
+ * accents cannot be removed from upper-case characters without lower-casing
+ * and lower-casing cannot be performed without also removing accents.
+ * However, if the accented character is already lower-case, then only the
+ * accent is removed.
+ *
+ * If `special_tokens` are included, the padding after `[` and before `]` is not
+ * inserted when the characters between them match one of the given tokens.
+ * Also, each of the `special_tokens` is expected to include the `[` and `]`
+ * characters at the beginning and end of the string respectively.
+ */
+struct character_normalizer {
+  /**
+   * @brief Normalizer object constructor
+   *
+   * This initializes and holds the character normalizing tables and settings.
+   *
+   * @param do_lower_case If true, upper-case characters are converted to
+   *        lower-case and accents are stripped from those characters.
+   *        If false, accented and upper-case characters are not transformed.
+   * @param special_tokens Each row is a token including the `[]` brackets.
+   *        For example: `[BOS]`, `[EOS]`, `[UNK]`, `[SEP]`, `[PAD]`, `[CLS]`, `[MASK]`
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @param mr Device memory resource used to allocate device memory held by this object
+   */
+  character_normalizer(bool do_lower_case,
+                       cudf::strings_column_view const& special_tokens,
+                       rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                       rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+  ~character_normalizer();
+
+  struct character_normalizer_impl;
+  std::unique_ptr<character_normalizer_impl> _impl;
+};
+
+/**
+ * @brief Create a normalizer object
+ *
+ * Creates a normalizer object which can be reused on multiple calls to
+ * nvtext::normalize_characters.
+ *
+ * @see nvtext::character_normalizer
+ *
+ * @param do_lower_case If true, upper-case characters are converted to
+ *        lower-case and accents are stripped from those characters.
+ *        If false, accented and upper-case characters are not transformed.
+ * @param special_tokens Individual tokens including `[]` brackets.
+ *        Default is no special tokens.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned object's device memory
+ * @return Object to be used with nvtext::normalize_characters
+ */
+std::unique_ptr<character_normalizer> create_character_normalizer(
+  bool do_lower_case,
+  cudf::strings_column_view const& special_tokens = cudf::strings_column_view(cudf::column_view{
+    cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0}),
+  rmm::cuda_stream_view stream                    = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr               = cudf::get_current_device_resource_ref());
+
+/**
+ * @brief Normalizes the text in the input strings column
+ *
+ * @see nvtext::character_normalizer for details on the normalizer behavior
+ *
+ * @code{.pseudo}
+ * cn = create_character_normalizer(true)
+ * s = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"]
+ * s1 = normalize_characters(s, cn)
+ * s1 is now ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "]
+ *
+ * cn = create_character_normalizer(false)
+ * s2 = normalize_characters(s, cn)
+ * s2 is now ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "]
+ * @endcode
+ *
+ * A null input element at row `i` produces a corresponding null entry
+ * for row `i` in the output column.
+ *
+ * @param input The input strings to normalize
+ * @param normalizer Normalizer created by nvtext::create_character_normalizer
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Memory resource to allocate any returned objects
+ * @return Normalized strings column
+ */
+std::unique_ptr<cudf::column> normalize_characters(
+  cudf::strings_column_view const& input,
+  character_normalizer const& normalizer,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -180,6 +180,18 @@ int64_t get_offset_value(cudf::column_view const& offsets,
                                 : cudf::detail::get_value<int32_t>(offsets, index, stream);
 }
 
+std::pair<int64_t, int64_t> get_first_and_last_offset(cudf::strings_column_view const& input,
+                                                      rmm::cuda_stream_view stream)
+{
+  if (input.is_empty()) { return {0L, 0L}; }  // empty column: no offsets are read
+  auto const offsets = input.offsets();
+  auto const begin   = input.offset();  // non-zero when the column is a slice
+  // an unsliced column is taken to start at 0, so that device read is skipped
+  int64_t first = 0;
+  if (begin != 0) { first = cudf::strings::detail::get_offset_value(offsets, begin, stream); }
+  return {first, cudf::strings::detail::get_offset_value(offsets, begin + input.size(), stream)};
+}
+
 }  // namespace detail
 
 rmm::device_uvector<string_view> create_string_vector_from_column(