Skip to content

Commit 4565b61

Browse files
authored
GH-45735: [C++] Broken tests for extract_regex compute funcion (#45900)
### Rationale for this change I fix two broken tests of extract_regex and add the similar tests to extract_regex_span. ### What changes are included in this PR? 1. Modify the ResolveOutput Method for extract_regex and extract_regex_span compute functions to be able to accept non-utf8 regex. 2. Fix broken tests for extract_regex. 3. Add similar tests for extract_regex_span. ### Are these changes tested? I run the relevant unit tests. ### Are there any user-facing changes? No. * GitHub Issue: #45735 Lead-authored-by: Antoine Pitrou <[email protected]> Co-authored-by: arash andishgar <[email protected]> Co-authored-by: Arash Andishgar <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent eaeb64b commit 4565b61

File tree

2 files changed

+64
-38
lines changed

2 files changed

+64
-38
lines changed

Diff for: cpp/src/arrow/compute/kernels/scalar_string_ascii.cc

+21-19
Original file line numberDiff line numberDiff line change
@@ -2209,28 +2209,20 @@ struct BaseExtractRegexData {
22092209
std::vector<std::string> group_names;
22102210

22112211
protected:
2212-
explicit BaseExtractRegexData(const std::string& pattern, bool is_utf8 = true)
2212+
explicit BaseExtractRegexData(const std::string& pattern, bool is_utf8)
22132213
: regex(new RE2(pattern, MakeRE2Options(is_utf8))) {}
22142214
};
22152215

22162216
// TODO cache this once per ExtractRegexOptions
22172217
struct ExtractRegexData : public BaseExtractRegexData {
2218-
static Result<ExtractRegexData> Make(const ExtractRegexOptions& options,
2219-
bool is_utf8 = true) {
2218+
static Result<ExtractRegexData> Make(const ExtractRegexOptions& options, bool is_utf8) {
22202219
ExtractRegexData data(options.pattern, is_utf8);
22212220
ARROW_RETURN_NOT_OK(data.Init());
22222221
return data;
22232222
}
22242223

22252224
Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types) const {
22262225
const DataType* input_type = types[0].type;
2227-
if (input_type == nullptr) {
2228-
// No input type specified
2229-
return nullptr;
2230-
}
2231-
// Input type is either [Large]Binary or [Large]String and is also the type
2232-
// of each field in the output struct type.
2233-
DCHECK(is_base_binary_like(input_type->id()));
22342226
FieldVector fields;
22352227
fields.reserve(num_groups());
22362228
std::shared_ptr<DataType> owned_type = input_type->GetSharedPtr();
@@ -2240,14 +2232,21 @@ struct ExtractRegexData : public BaseExtractRegexData {
22402232
}
22412233

22422234
private:
2243-
explicit ExtractRegexData(const std::string& pattern, bool is_utf8 = true)
2235+
explicit ExtractRegexData(const std::string& pattern, bool is_utf8)
22442236
: BaseExtractRegexData(pattern, is_utf8) {}
22452237
};
22462238

22472239
Result<TypeHolder> ResolveExtractRegexOutput(KernelContext* ctx,
22482240
const std::vector<TypeHolder>& types) {
2241+
auto input_type = types[0].type;
2242+
if (input_type == nullptr) {
2243+
// No input type specified
2244+
return nullptr;
2245+
}
2246+
DCHECK(is_base_binary_like(input_type->id()));
2247+
auto is_utf8 = is_string(input_type->id());
22492248
ExtractRegexOptions options = ExtractRegexState::Get(ctx);
2250-
ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options));
2249+
ARROW_ASSIGN_OR_RAISE(auto data, ExtractRegexData::Make(options, is_utf8));
22512250
return data.ResolveOutputType(types);
22522251
}
22532252

@@ -2359,19 +2358,14 @@ void AddAsciiStringExtractRegex(FunctionRegistry* registry) {
23592358
}
23602359

23612360
struct ExtractRegexSpanData : public BaseExtractRegexData {
2362-
static Result<ExtractRegexSpanData> Make(const std::string& pattern,
2363-
bool is_utf8 = true) {
2361+
static Result<ExtractRegexSpanData> Make(const std::string& pattern, bool is_utf8) {
23642362
auto data = ExtractRegexSpanData(pattern, is_utf8);
23652363
ARROW_RETURN_NOT_OK(data.Init());
23662364
return data;
23672365
}
23682366

23692367
Result<TypeHolder> ResolveOutputType(const std::vector<TypeHolder>& types) const {
23702368
const DataType* input_type = types[0].type;
2371-
if (input_type == nullptr) {
2372-
return nullptr;
2373-
}
2374-
DCHECK(is_base_binary_like(input_type->id()));
23752369
FieldVector fields;
23762370
fields.reserve(num_groups());
23772371
auto index_type = is_binary_like(input_type->id()) ? int32() : int64();
@@ -2474,8 +2468,16 @@ const FunctionDoc extract_regex_span_doc(
24742468

24752469
Result<TypeHolder> ResolveExtractRegexSpanOutputType(
24762470
KernelContext* ctx, const std::vector<TypeHolder>& types) {
2471+
auto input_type = types[0].type;
2472+
if (input_type == nullptr) {
2473+
// No input type specified
2474+
return nullptr;
2475+
}
2476+
DCHECK(is_base_binary_like(input_type->id()));
2477+
auto is_utf8 = is_string(input_type->id());
24772478
auto options = OptionsWrapper<ExtractRegexSpanOptions>::Get(*ctx->state());
2478-
ARROW_ASSIGN_OR_RAISE(auto span, ExtractRegexSpanData::Make(options.pattern));
2479+
2480+
ARROW_ASSIGN_OR_RAISE(auto span, ExtractRegexSpanData::Make(options.pattern, is_utf8));
24792481
return span.ResolveOutputType(types);
24802482
}
24812483

Diff for: cpp/src/arrow/compute/kernels/scalar_string_test.cc

+43-19
Original file line numberDiff line numberDiff line change
@@ -314,15 +314,27 @@ TYPED_TEST(TestBinaryKernels, NonUtf8Regex) {
314314
this->MakeArray({"\xfc\x40", "this \xfc\x40 that \xfc\x40"}),
315315
this->MakeArray({"bazz", "this bazz that \xfc\x40"}), &options);
316316
}
317-
// TODO the following test is broken (GH-45735)
318317
{
319-
ExtractRegexOptions options("(?P<letter>[\\xfc])(?P<digit>\\d)");
320-
auto null_bitmap = std::make_shared<Buffer>("0");
321-
auto output = StructArray::Make(
322-
{this->MakeArray({"\xfc", "1"}), this->MakeArray({"\xfc", "2"})},
323-
{field("letter", this->type()), field("digit", this->type())}, null_bitmap);
324-
this->CheckUnary("extract_regex", this->MakeArray({"foo\xfc 1bar", "\x02\xfc\x40"}),
325-
std::static_pointer_cast<Array>(*output), &options);
318+
ExtractRegexOptions options("(?P<letter>[\xfc])(?P<digit>\\d+)");
319+
ASSERT_OK_AND_ASSIGN(
320+
auto output,
321+
StructArray::Make(
322+
{this->MakeArray({"\xfc", "\xfc"}), this->MakeArray({"14", "2"})},
323+
{field("letter", this->type()), field("digit", this->type())}));
324+
this->CheckUnary("extract_regex",
325+
this->MakeArray({"foo\xfc\x31\x34 bar", "\x02\xfc\x32"}), output,
326+
&options);
327+
}
328+
{
329+
ExtractRegexSpanOptions options("(?P<letter>[\xfc])(?P<digit>\\d+)");
330+
auto offset_type = is_binary_like(this->type()->id()) ? int32() : int64();
331+
auto out_type = struct_({field("letter", fixed_size_list(offset_type, 2)),
332+
field("digit", fixed_size_list(offset_type, 2))});
333+
this->CheckUnary("extract_regex_span",
334+
this->MakeArray({"foo\xfc\x31\x34 bar", "\x02\xfc\x32"}), out_type,
335+
R"([{"letter": [3, 1], "digit": [4, 2]},
336+
{"letter": [1, 1], "digit": [2, 1]}])",
337+
&options);
326338
}
327339
}
328340

@@ -371,18 +383,30 @@ TYPED_TEST(TestBinaryKernels, NonUtf8WithNullRegex) {
371383
this->template MakeArray<std::string>({{"\x00\x40", 2}}),
372384
this->type(), R"(["bazz"])", &options);
373385
}
374-
// TODO the following test is broken (GH-45735)
375386
{
376-
ExtractRegexOptions options("(?P<null>[\\x00])(?P<digit>\\d)");
377-
auto null_bitmap = std::make_shared<Buffer>("0");
378-
auto output = StructArray::Make(
379-
{this->template MakeArray<std::string>({{"\x00", 1}, {"1", 1}}),
380-
this->template MakeArray<std::string>({{"\x00", 1}, {"2", 1}})},
381-
{field("null", this->type()), field("digit", this->type())}, null_bitmap);
382-
this->CheckUnary(
383-
"extract_regex",
384-
this->template MakeArray<std::string>({{"foo\x00 1bar", 9}, {"\x02\x00\x40", 3}}),
385-
std::static_pointer_cast<Array>(*output), &options);
387+
ExtractRegexOptions options(std::string("(?P<null>[\x00])(?P<digit>\\d+)", 27));
388+
ASSERT_OK_AND_ASSIGN(
389+
auto output,
390+
StructArray::Make(
391+
{this->template MakeArray<std::string>({{"\x00", 1}, {"\x00", 1}}),
392+
this->template MakeArray<std::string>({"14", "2"})},
393+
{field("null", this->type()), field("digit", this->type())}));
394+
this->CheckUnary("extract_regex",
395+
this->template MakeArray<std::string>(
396+
{{"foo\x00\x31\x34 bar", 10}, {"\x02\x00\x32", 3}}),
397+
output, &options);
398+
}
399+
{
400+
ExtractRegexSpanOptions options(std::string("(?P<null>[\x00])(?P<digit>\\d+)", 27));
401+
auto offset_type = is_binary_like(this->type()->id()) ? int32() : int64();
402+
auto out_type = struct_({field("null", fixed_size_list(offset_type, 2)),
403+
field("digit", fixed_size_list(offset_type, 2))});
404+
this->CheckUnary("extract_regex_span",
405+
this->template MakeArray<std::string>(
406+
{{"foo\x00\x31\x34 bar", 10}, {"\x02\x00\x32", 3}}),
407+
out_type, R"([{"null": [3, 1], "digit": [4, 2]},
408+
{"null": [1, 1], "digit": [2, 1]}])",
409+
&options);
386410
}
387411
{
388412
ReplaceSliceOptions options(1, 2, std::string("\x00\x40", 2));

0 commit comments

Comments
 (0)