Add more gtests and pytests for the nvtext character normalizer

rapidsai · Feb 3, 2025 · f2f35a6 · f2f35a6
1 parent e96bc21
commit f2f35a6
Show file tree

Hide file tree

Showing 2 changed files with 107 additions and 22 deletions.
diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp
@@ -74,6 +74,10 @@ TEST_F(TextNormalizeTest, NormalizeEmptyTest)
   EXPECT_EQ(results->size(), 0);
   results = nvtext::normalize_characters(strings_view, false);
   EXPECT_EQ(results->size(), 0);
+
+  auto normalizer = nvtext::create_character_normalizer(true);
+  results         = nvtext::normalize_characters(strings_view, *normalizer);
+  EXPECT_EQ(results->size(), 0);
 }
 
 TEST_F(TextNormalizeTest, AllNullStrings)
@@ -84,6 +88,10 @@ TEST_F(TextNormalizeTest, AllNullStrings)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings);
   results = nvtext::normalize_characters(strings_view, false);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings);
+
+  auto normalizer = nvtext::create_character_normalizer(true);
+  results         = nvtext::normalize_characters(strings_view, *normalizer);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings);
 }
 
 TEST_F(TextNormalizeTest, SomeNullStrings)
@@ -93,6 +101,10 @@ TEST_F(TextNormalizeTest, SomeNullStrings)
   auto results = nvtext::normalize_characters(strings_view, false);
   cudf::test::strings_column_wrapper expected({"", " . ", "a"}, {false, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  auto normalizer = nvtext::create_character_normalizer(true);
+  results         = nvtext::normalize_characters(strings_view, *normalizer);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
 TEST_F(TextNormalizeTest, NormalizeCharacters)
@@ -176,7 +188,6 @@ TEST_F(TextNormalizeTest, WithNormalizer)
 
 TEST_F(TextNormalizeTest, SpecialTokens)
 {
-  // These include punctuation, accents, whitespace, and CJK characters
   auto input =
     cudf::test::strings_column_wrapper({"[BOS]Some strings with [PAD] special[SEP]tokens[EOS]",
                                         "[bos]these should[sep]work too[eos]",
@@ -210,26 +221,17 @@ TEST_F(TextNormalizeTest, NormalizeSlicedColumn)
 
   std::vector<cudf::column_view> sliced = cudf::split(strings, {4});
   auto results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), true);
-  cudf::test::strings_column_wrapper expected({"abc£def", "ee a io aeio", "acen u", "p ^ np"});
+  auto expected =
+    cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
-  results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false);
-  cudf::test::strings_column_wrapper expected2({" $ 41 . 07", " [ a , b ] ", " 丏  丟 "});
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2);
-}
-
-TEST_F(TextNormalizeTest, SlicedColumn)
-{
-  auto input = cudf::test::strings_column_wrapper(
-    {"abc£def", "éè â îô\taeio", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟"});
-
-  auto sliced = cudf::split(input, {4});
+  results  = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false);
+  expected = cudf::test::strings_column_wrapper({" $ 41 . 07", " [ a , b ] ", " 丏  丟 "});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
   auto normalizer = nvtext::create_character_normalizer(true);
-  auto results =
-    nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer);
-  auto expected =
-    cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"});
+  results  = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer);
+  expected = cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
   normalizer = nvtext::create_character_normalizer(false);

diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
@@ -15,7 +15,7 @@ def norm_spaces_input_data():
 
 @pytest.fixture(scope="module")
 def norm_chars_input_data():
-    arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]"]
+    arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]", "[pad]"]
     return pa.array(arr)
 
 
@@ -33,11 +33,94 @@ def test_normalize_characters(norm_chars_input_data, do_lower):
         plc.interop.from_arrow(norm_chars_input_data),
         do_lower,
     )
-    expected = pa.array(
-        ["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "]
+    if do_lower:
+        expected = pa.array(
+            [
+                "eaio eaio",
+                "acenu",
+                "acenu",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [ pad ] ",
+            ]
+        )
+    else:
+        expected = pa.array(
+            [
+                "éâîô eaio",
+                "ĂĆĖÑÜ",
+                "ACENU",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [ pad ] ",
+            ]
+        )
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize("do_lower", [True, False])
+def test_normalizer(norm_chars_input_data, do_lower):
+    result = plc.nvtext.normalize.normalize_characters(
+        plc.interop.from_arrow(norm_chars_input_data),
+        plc.nvtext.normalize.CharacterNormalizer(
+            do_lower,
+            plc.column_factories.make_empty_column(plc.types.TypeId.STRING),
+        ),
     )
-    if not do_lower:
+    if do_lower:
+        expected = pa.array(
+            [
+                "eaio eaio",
+                "acenu",
+                "acenu",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [ pad ] ",
+            ]
+        )
+    else:
+        expected = pa.array(
+            [
+                "éâîô eaio",
+                "ĂĆĖÑÜ",
+                "ACENU",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [ pad ] ",
+            ]
+        )
+    assert_column_eq(result, expected)
+
+
+@pytest.mark.parametrize("do_lower", [True, False])
+def test_normalizer_with_special_tokens(norm_chars_input_data, do_lower):
+    special_tokens = pa.array(["[pad]"])
+    result = plc.nvtext.normalize.normalize_characters(
+        plc.interop.from_arrow(norm_chars_input_data),
+        plc.nvtext.normalize.CharacterNormalizer(
+            do_lower, plc.interop.from_arrow(special_tokens)
+        ),
+    )
+    if do_lower:
+        expected = pa.array(
+            [
+                "eaio eaio",
+                "acenu",
+                "acenu",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [pad] ",
+            ]
+        )
+    else:
         expected = pa.array(
-            ["éâîô eaio", "ĂĆĖÑÜ", "ACENU", " $ 24 . 08", " [ a , bb ] "]
+            [
+                "éâîô eaio",
+                "ĂĆĖÑÜ",
+                "ACENU",
+                " $ 24 . 08",
+                " [ a , bb ] ",
+                " [pad] ",
+            ]
         )
     assert_column_eq(result, expected)