Skip to content

Commit

Permalink
add more gtests and pytests
Browse files Browse the repository at this point in the history
  • Loading branch information
davidwendt committed Feb 3, 2025
1 parent e96bc21 commit f2f35a6
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 22 deletions.
36 changes: 19 additions & 17 deletions cpp/tests/text/normalize_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ TEST_F(TextNormalizeTest, NormalizeEmptyTest)
EXPECT_EQ(results->size(), 0);
results = nvtext::normalize_characters(strings_view, false);
EXPECT_EQ(results->size(), 0);

auto normalizer = nvtext::create_character_normalizer(true);
results = nvtext::normalize_characters(strings_view, *normalizer);
EXPECT_EQ(results->size(), 0);
}

TEST_F(TextNormalizeTest, AllNullStrings)
Expand All @@ -84,6 +88,10 @@ TEST_F(TextNormalizeTest, AllNullStrings)
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings);
results = nvtext::normalize_characters(strings_view, false);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings);

auto normalizer = nvtext::create_character_normalizer(true);
results = nvtext::normalize_characters(strings_view, *normalizer);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings);
}

TEST_F(TextNormalizeTest, SomeNullStrings)
Expand All @@ -93,6 +101,10 @@ TEST_F(TextNormalizeTest, SomeNullStrings)
auto results = nvtext::normalize_characters(strings_view, false);
cudf::test::strings_column_wrapper expected({"", " . ", "a"}, {false, true, true});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

auto normalizer = nvtext::create_character_normalizer(true);
results = nvtext::normalize_characters(strings_view, *normalizer);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
}

TEST_F(TextNormalizeTest, NormalizeCharacters)
Expand Down Expand Up @@ -176,7 +188,6 @@ TEST_F(TextNormalizeTest, WithNormalizer)

TEST_F(TextNormalizeTest, SpecialTokens)
{
// These include punctuation, accents, whitespace, and CJK characters
auto input =
cudf::test::strings_column_wrapper({"[BOS]Some strings with [PAD] special[SEP]tokens[EOS]",
"[bos]these should[sep]work too[eos]",
Expand Down Expand Up @@ -210,26 +221,17 @@ TEST_F(TextNormalizeTest, NormalizeSlicedColumn)

std::vector<cudf::column_view> sliced = cudf::split(strings, {4});
auto results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), true);
cudf::test::strings_column_wrapper expected({"abc£def", "ee a io aeio", "acen u", "p ^ np"});
auto expected =
cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false);
cudf::test::strings_column_wrapper expected2({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2);
}

TEST_F(TextNormalizeTest, SlicedColumn)
{
auto input = cudf::test::strings_column_wrapper(
{"abc£def", "éè â îô\taeio", "ACEN U", "P^NP", "$41.07", "[a,b]", "丏丟"});

auto sliced = cudf::split(input, {4});
results = nvtext::normalize_characters(cudf::strings_column_view(sliced[1]), false);
expected = cudf::test::strings_column_wrapper({" $ 41 . 07", " [ a , b ] ", " 丏 丟 "});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

auto normalizer = nvtext::create_character_normalizer(true);
auto results =
nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer);
auto expected =
cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"});
results = nvtext::normalize_characters(cudf::strings_column_view(sliced.front()), *normalizer);
expected = cudf::test::strings_column_wrapper({"abc£def", "ee a io aeio", "acen u", "p ^ np"});
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

normalizer = nvtext::create_character_normalizer(false);
Expand Down
93 changes: 88 additions & 5 deletions python/pylibcudf/pylibcudf/tests/test_nvtext_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def norm_spaces_input_data():

@pytest.fixture(scope="module")
def norm_chars_input_data():
    """Module-scoped Arrow string array fed to the character-normalization tests.

    The rows are: accented lowercase text containing a tab, accented
    uppercase text, plain uppercase ASCII, digits mixed with punctuation,
    bracketed text, and a bracketed ``[pad]`` entry.
    """
    arr = ["éâîô\teaio", "ĂĆĖÑÜ", "ACENU", "$24.08", "[a,bb]", "[pad]"]
    return pa.array(arr)


Expand All @@ -33,11 +33,94 @@ def test_normalize_characters(norm_chars_input_data, do_lower):
plc.interop.from_arrow(norm_chars_input_data),
do_lower,
)
expected = pa.array(
["eaio eaio", "acenu", "acenu", " $ 24 . 08", " [ a , bb ] "]
if do_lower:
expected = pa.array(
[
"eaio eaio",
"acenu",
"acenu",
" $ 24 . 08",
" [ a , bb ] ",
" [ pad ] ",
]
)
else:
expected = pa.array(
[
"éâîô eaio",
"ĂĆĖÑÜ",
"ACENU",
" $ 24 . 08",
" [ a , bb ] ",
" [ pad ] ",
]
)
assert_column_eq(result, expected)


@pytest.mark.parametrize("do_lower", [True, False])
def test_normalizer(norm_chars_input_data, do_lower):
    """Check normalize_characters when driven by a CharacterNormalizer object.

    The normalizer is built from ``do_lower`` and an empty STRING column,
    so the ``[pad]`` input row is expected to come back as ``" [ pad ] "``,
    with its brackets split out like the other bracketed row.
    """
    result = plc.nvtext.normalize.normalize_characters(
        plc.interop.from_arrow(norm_chars_input_data),
        plc.nvtext.normalize.CharacterNormalizer(
            do_lower,
            plc.column_factories.make_empty_column(plc.types.TypeId.STRING),
        ),
    )
    if do_lower:
        # Lower-casing path: accents are stripped and letters are lowered.
        expected = pa.array(
            [
                "eaio eaio",
                "acenu",
                "acenu",
                " $ 24 . 08",
                " [ a , bb ] ",
                " [ pad ] ",
            ]
        )
    else:
        # Case-preserving path: accented and uppercase letters are kept.
        expected = pa.array(
            [
                "éâîô eaio",
                "ĂĆĖÑÜ",
                "ACENU",
                " $ 24 . 08",
                " [ a , bb ] ",
                " [ pad ] ",
            ]
        )
    assert_column_eq(result, expected)


@pytest.mark.parametrize("do_lower", [True, False])
def test_normalizer_with_special_tokens(norm_chars_input_data, do_lower):
    """Check normalize_characters with a CharacterNormalizer given ``[pad]``.

    ``[pad]`` is passed to the normalizer as a special token, so the
    matching input row is expected to come back as ``" [pad] "`` with its
    brackets left attached, while the other bracketed row is still split.
    """
    special_tokens = pa.array(["[pad]"])
    result = plc.nvtext.normalize.normalize_characters(
        plc.interop.from_arrow(norm_chars_input_data),
        plc.nvtext.normalize.CharacterNormalizer(
            do_lower, plc.interop.from_arrow(special_tokens)
        ),
    )
    if do_lower:
        # Lower-casing path: accents are stripped and letters are lowered.
        expected = pa.array(
            [
                "eaio eaio",
                "acenu",
                "acenu",
                " $ 24 . 08",
                " [ a , bb ] ",
                " [pad] ",
            ]
        )
    else:
        # Case-preserving path: accented and uppercase letters are kept.
        expected = pa.array(
            [
                "éâîô eaio",
                "ĂĆĖÑÜ",
                "ACENU",
                " $ 24 . 08",
                " [ a , bb ] ",
                " [pad] ",
            ]
        )
    assert_column_eq(result, expected)

0 comments on commit f2f35a6

Please sign in to comment.