Skip to content

Commit 3d321c2

Browse files
authored
Allow REGEX for arbitrary expressions (not just a variable) (#1576)
So far, the `REGEX` function was only implemented for the (frequent) special case where the first argument is either a variable (like `?x`) or `STR` of a variable (like `STR(?x)`). Now `REGEX` works for arbitrary expressions. Use the occasion to clean up the code a little bit and improve the documentation.
1 parent e528480 commit 3d321c2

File tree

3 files changed

+186
-104
lines changed

3 files changed

+186
-104
lines changed

src/engine/sparqlExpressions/RegexExpression.cpp

Lines changed: 110 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -72,19 +72,21 @@ std::optional<std::string> getPrefixRegex(std::string regex) {
7272
} // namespace sparqlExpression::detail
7373

7474
namespace sparqlExpression {
75+
7576
// ___________________________________________________________________________
7677
RegexExpression::RegexExpression(
7778
SparqlExpression::Ptr child, SparqlExpression::Ptr regex,
7879
std::optional<SparqlExpression::Ptr> optionalFlags)
7980
: child_{std::move(child)} {
81+
// If we have a `STR()` expression, remove the `STR()` and remember that it
82+
// was there.
8083
if (child_->isStrExpression()) {
8184
child_ = std::move(std::move(*child_).moveChildrenOut().at(0));
8285
childIsStrExpression_ = true;
8386
}
84-
if (!dynamic_cast<const VariableExpression*>(child_.get())) {
85-
throw std::runtime_error(
86-
"REGEX expressions are currently supported only on variables.");
87-
}
87+
88+
// Get the regex string, which must be a string literal without a datatype or
89+
// language tag.
8890
std::string regexString;
8991
if (auto regexPtr =
9092
dynamic_cast<const StringLiteralExpression*>(regex.get())) {
@@ -100,6 +102,9 @@ RegexExpression::RegexExpression(
100102
"The second argument to the REGEX function must be a "
101103
"string literal (which contains the regular expression)");
102104
}
105+
106+
// Parse the flags. The optional argument for that must, again, be a
107+
// string literal without a datatype or language tag.
103108
if (optionalFlags.has_value()) {
104109
if (auto flagsPtr = dynamic_cast<const StringLiteralExpression*>(
105110
optionalFlags.value().get())) {
@@ -131,19 +136,18 @@ RegexExpression::RegexExpression(
131136
}
132137
}
133138

139+
// Create RE2 object from the regex string. If it is a simple prefix regex,
140+
// store the prefix in `prefixRegex_` (otherwise that becomes `std::nullopt`).
134141
regexAsString_ = regexString;
135-
if (auto opt = detail::getPrefixRegex(regexString)) {
136-
regex_ = std::move(opt.value());
137-
} else {
138-
regex_.emplace<RE2>(regexString, RE2::Quiet);
139-
const auto& r = std::get<RE2>(regex_);
140-
if (r.error_code() != RE2::NoError) {
141-
throw std::runtime_error{absl::StrCat(
142-
"The regex \"", regexString,
143-
"\" is not supported by QLever (which uses Google's RE2 library). "
144-
"Error from RE2 is: ",
145-
r.error())};
146-
}
142+
prefixRegex_ = detail::getPrefixRegex(regexString);
143+
regex_.emplace(regexString, RE2::Quiet);
144+
const auto& r = regex_.value();
145+
if (r.error_code() != RE2::NoError) {
146+
throw std::runtime_error{absl::StrCat(
147+
"The regex \"", regexString,
148+
"\" is not supported by QLever (which uses Google's RE2 library); "
149+
"the error from RE2 is: ",
150+
r.error())};
147151
}
148152
}
149153

@@ -163,17 +167,27 @@ std::span<SparqlExpression::Ptr> RegexExpression::childrenImpl() {
163167
ExpressionResult RegexExpression::evaluatePrefixRegex(
164168
const Variable& variable,
165169
sparqlExpression::EvaluationContext* context) const {
166-
std::string prefixRegex = std::get<std::string>(regex_);
170+
// This function must only be called if we have a simple prefix regex.
171+
AD_CORRECTNESS_CHECK(prefixRegex_.has_value());
172+
std::string prefixRegex = prefixRegex_.value();
173+
174+
// If the expression is enclosed in `STR()`, we have two ranges: for the
175+
// prefix with a leading `"` (literals) and with a leading `<` (IRIs).
176+
//
177+
// TODO<joka921> prefix filters currently have false negatives when the prefix
178+
// is not in the vocabulary, and there exist local vocab entries in the input
179+
// that are between the prefix and the next local vocab entry. This is
180+
// non-trivial to fix as it involves fiddling with Unicode prefix encodings.
181+
//
182+
// TODO<joka921> prefix filters currently never find numbers or other
183+
// datatypes that are encoded directly inside the IDs.
167184
std::vector<std::string> actualPrefixes;
168185
actualPrefixes.push_back("\"" + prefixRegex);
169-
// If the STR function was applied, we also look for prefix matches for IRIs.
170-
// TODO<joka921> prefix filters currently never find numbers or local vocab
171-
// entries, numbers, or other datatypes that are encoded directly inside the
172-
// IDs.
173186
if (childIsStrExpression_) {
174187
actualPrefixes.push_back("<" + prefixRegex);
175188
}
176-
std::vector<ad_utility::SetOfIntervals> resultSetOfIntervals;
189+
190+
// Compute the (one or two) ranges.
177191
std::vector<std::pair<Id, Id>> lowerAndUpperIds;
178192
lowerAndUpperIds.reserve(actualPrefixes.size());
179193
for (const auto& prefix : actualPrefixes) {
@@ -184,12 +198,21 @@ ExpressionResult RegexExpression::evaluatePrefixRegex(
184198
}
185199
}
186200
checkCancellation(context);
201+
202+
// Begin and end of the input (for each row of which we want to
203+
// evaluate the regex).
187204
auto beg = context->_inputTable.begin() + context->_beginIndex;
188205
auto end = context->_inputTable.begin() + context->_endIndex;
189206
AD_CONTRACT_CHECK(end <= context->_inputTable.end());
207+
208+
// In this function, the expression is a simple variable. If the input is
209+
// sorted by that variable, the result can be computed by a constant number
210+
// of binary searches and the result is a set of intervals.
211+
std::vector<ad_utility::SetOfIntervals> resultSetOfIntervals;
190212
if (context->isResultSortedBy(variable)) {
191213
auto column = context->getColumnIndexForVariable(variable);
192214
for (auto [lowerId, upperId] : lowerAndUpperIds) {
215+
// Two binary searches to find the lower and upper bounds of the range.
193216
auto lower = std::lower_bound(
194217
beg, end, nullptr,
195218
[column, lowerId = lowerId](const auto& l, const auto&) {
@@ -200,7 +223,6 @@ ExpressionResult RegexExpression::evaluatePrefixRegex(
200223
[column, upperId = upperId](const auto& l, const auto&) {
201224
return l[column] < upperId;
202225
});
203-
204226
// Return the empty result as an empty `SetOfIntervals` instead of as an
205227
// empty range.
206228
if (lower != upper) {
@@ -212,47 +234,58 @@ ExpressionResult RegexExpression::evaluatePrefixRegex(
212234
return std::reduce(resultSetOfIntervals.begin(), resultSetOfIntervals.end(),
213235
ad_utility::SetOfIntervals{},
214236
ad_utility::SetOfIntervals::Union{});
215-
} else {
216-
auto resultSize = context->size();
217-
VectorWithMemoryLimit<Id> result{context->_allocator};
218-
result.reserve(resultSize);
219-
for (auto id : detail::makeGenerator(variable, resultSize, context)) {
220-
result.push_back(Id::makeFromBool(
221-
std::ranges::any_of(lowerAndUpperIds, [&](const auto& lowerUpper) {
222-
return !valueIdComparators::compareByBits(id, lowerUpper.first) &&
223-
valueIdComparators::compareByBits(id, lowerUpper.second);
224-
})));
225-
checkCancellation(context);
226-
}
227-
return result;
228237
}
238+
239+
// If the input is not sorted by the variable, we have to check each row
240+
// individually (by checking inclusion in the ranges).
241+
auto resultSize = context->size();
242+
VectorWithMemoryLimit<Id> result{context->_allocator};
243+
result.reserve(resultSize);
244+
for (auto id : detail::makeGenerator(variable, resultSize, context)) {
245+
result.push_back(Id::makeFromBool(
246+
std::ranges::any_of(lowerAndUpperIds, [&](const auto& lowerUpper) {
247+
return !valueIdComparators::compareByBits(id, lowerUpper.first) &&
248+
valueIdComparators::compareByBits(id, lowerUpper.second);
249+
})));
250+
checkCancellation(context);
251+
}
252+
return result;
229253
}
230254

231255
// ___________________________________________________________________________
232-
ExpressionResult RegexExpression::evaluateNonPrefixRegex(
233-
const Variable& variable,
234-
sparqlExpression::EvaluationContext* context) const {
235-
AD_CONTRACT_CHECK(std::holds_alternative<RE2>(regex_));
256+
template <SingleExpressionResult T>
257+
ExpressionResult RegexExpression::evaluateGeneralCase(
258+
T&& input, sparqlExpression::EvaluationContext* context) const {
259+
// We have one result for each row of the input.
236260
auto resultSize = context->size();
237261
VectorWithMemoryLimit<Id> result{context->_allocator};
238262
result.reserve(resultSize);
263+
AD_CORRECTNESS_CHECK(regex_.has_value());
239264

240-
auto impl = [&]<typename ValueGetter>(const ValueGetter& getter) {
241-
for (auto id : detail::makeGenerator(variable, resultSize, context)) {
242-
auto str = getter(id, context);
243-
if (!str.has_value()) {
244-
result.push_back(Id::makeUndefined());
245-
} else {
246-
result.push_back(Id::makeFromBool(
247-
RE2::PartialMatch(str.value(), std::get<RE2>(regex_))));
248-
}
249-
checkCancellation(context);
250-
}
265+
// Compute the result using the given value getter. If the getter returns
266+
// `std::nullopt` for a row, the result is `UNDEF`. Otherwise, we have a
267+
// string and evaluate the regex on it.
268+
auto computeResult = [&]<typename ValueGetter>(const ValueGetter& getter) {
269+
std::ranges::for_each(
270+
detail::makeGenerator(AD_FWD(input), resultSize, context),
271+
[&getter, &context, &result, this](const auto& id) {
272+
auto str = getter(id, context);
273+
if (!str.has_value()) {
274+
result.push_back(Id::makeUndefined());
275+
} else {
276+
result.push_back(Id::makeFromBool(
277+
RE2::PartialMatch(str.value(), regex_.value())));
278+
}
279+
checkCancellation(context);
280+
});
251281
};
282+
283+
// Compute the result with the correct value getter (depending on whether the
284+
// expression is enclosed in `STR()` or not), and return it.
252285
if (childIsStrExpression_) {
253-
impl(detail::StringValueGetter{});
286+
computeResult(detail::StringValueGetter{});
254287
} else {
255-
impl(detail::LiteralFromIdGetter{});
288+
computeResult(detail::LiteralFromIdGetter{});
256289
}
257290
return result;
258291
}
@@ -262,51 +295,57 @@ ExpressionResult RegexExpression::evaluate(
262295
sparqlExpression::EvaluationContext* context) const {
263296
auto resultAsVariant = child_->evaluate(context);
264297
auto variablePtr = std::get_if<Variable>(&resultAsVariant);
265-
AD_CONTRACT_CHECK(variablePtr);
266298

267-
if (std::holds_alternative<std::string>(regex_)) {
299+
if (prefixRegex_.has_value() && variablePtr != nullptr) {
268300
return evaluatePrefixRegex(*variablePtr, context);
269301
} else {
270-
return evaluateNonPrefixRegex(*variablePtr, context);
302+
return std::visit(
303+
[this, context](auto&& input) {
304+
return evaluateGeneralCase(AD_FWD(input), context);
305+
},
306+
std::move(resultAsVariant));
271307
}
272308
}
273309

274310
// ____________________________________________________________________________
275311
bool RegexExpression::isPrefixExpression() const {
276-
return std::holds_alternative<std::string>(regex_);
312+
return prefixRegex_.has_value();
277313
}
278314

279315
// ____________________________________________________________________________
280316
auto RegexExpression::getEstimatesForFilterExpression(
281317
uint64_t inputSize,
282318
const std::optional<Variable>& firstSortedVariable) const -> Estimates {
319+
// If we have a simple prefix regex, assume that only 10^-k entries remain,
320+
// where k is the length of the prefix.
283321
if (isPrefixExpression()) {
284-
// Assume that only 10^-k entries remain, where k is the length of the
285-
// prefix. The reason for the -2 is that at this point, _rhs always
286-
// starts with ^"
287322
double reductionFactor = std::pow(
288-
10, std::max(
289-
0, static_cast<int>(std::get<std::string>(regex_).size()) - 2));
323+
10, std::max(0, static_cast<int>(prefixRegex_.value().size())));
290324
// Cap to reasonable minimal and maximal values to prevent numerical
291325
// stability problems.
292326
reductionFactor = std::min(100000000.0, reductionFactor);
293327
reductionFactor = std::max(1.0, reductionFactor);
294328
size_t sizeEstimate = inputSize / static_cast<size_t>(reductionFactor);
295329
auto varPtr = dynamic_cast<VariableExpression*>(child_.get());
296-
AD_CONTRACT_CHECK(varPtr);
297-
size_t costEstimate = firstSortedVariable == varPtr->value()
330+
size_t costEstimate = (varPtr && firstSortedVariable == varPtr->value())
298331
? sizeEstimate
299332
: sizeEstimate + inputSize;
300333

301-
return {sizeEstimate, costEstimate};
302-
} else { // Not a prefix filter.
303-
size_t sizeEstimate = inputSize / 2;
304-
// We assume that checking a REGEX for an element is 10 times more
305-
// expensive than an "ordinary" filter check.
306-
size_t costEstimate = sizeEstimate + 10 * inputSize;
307-
308334
return {sizeEstimate, costEstimate};
309335
}
336+
337+
// For the general case, we make two assumptions.
338+
//
339+
// 1. Half of the entries remain after the filter. This is a very simple
340+
// and arbitrary heuristic.
341+
//
342+
// 2. Checking a REGEX for an element is 10 times more expensive than a
343+
// "simple" filter check. This is reasonable because regex evaluations are
344+
// expensive, but the fixed factor disregards that it depends on the
345+
// complexity of the regex how expensive it is.
346+
size_t sizeEstimate = inputSize / 2;
347+
size_t costEstimate = sizeEstimate + 10 * inputSize;
348+
return {sizeEstimate, costEstimate};
310349
}
311350

312351
// ____________________________________________________________________________

src/engine/sparqlExpressions/RegexExpression.h

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
// Copyright 2022, University of Freiburg,
2-
// Chair of Algorithms and Data Structures.
3-
// Author: Johannes Kalmbach <[email protected]>
1+
// Copyright 2022 - 2024, University of Freiburg
2+
// Chair of Algorithms and Data Structures
3+
// Author: Johannes Kalmbach <[email protected]>
44

55
#pragma once
66

@@ -11,22 +11,27 @@
1111
#include "re2/re2.h"
1212

1313
namespace sparqlExpression {
14+
// Class implementing the REGEX function, which takes two mandatory arguments
15+
// (an expression and a regex) and one optional argument (a string of flags).
1416
class RegexExpression : public SparqlExpression {
1517
private:
1618
SparqlExpression::Ptr child_;
17-
// If this variant holds a string, we consider this string as the prefix of a
18-
// prefix regex.
19-
std::variant<std::string, RE2> regex_;
19+
// The regular expression. It needs to be a `std::optional` because `RE2`
20+
// objects do not have a default constructor.
21+
std::optional<RE2> regex_;
22+
// If this `std::optional` holds a string, we have a simple prefix regex
23+
// (which translates to a range search) and this string holds the prefix.
24+
std::optional<std::string> prefixRegex_;
2025
// The regex as a string, used for the cache key.
2126
std::string regexAsString_;
2227

23-
// True if the STR() function is to be applied on the child before evaluating
24-
// the regex.
28+
// True iff the expression is enclosed in `STR()`.
2529
bool childIsStrExpression_ = false;
2630

2731
public:
28-
// `child` must be a `VariableExpression` and `regex` must be a
29-
// `LiteralExpression` that stores a string, else an exception will be thrown.
32+
// The `child` can be an arbitrary expression, and `regex` must be a
33+
// `LiteralExpression` that stores a string, otherwise an exception will be
34+
// thrown.
3035
RegexExpression(SparqlExpression::Ptr child, SparqlExpression::Ptr regex,
3136
std::optional<SparqlExpression::Ptr> optionalFlags);
3237

@@ -46,17 +51,21 @@ class RegexExpression : public SparqlExpression {
4651

4752
private:
4853
std::span<SparqlExpression::Ptr> childrenImpl() override;
49-
// Internal implementations that are called by `evaluate`.
54+
55+
// Evaluate for the special case, where the expression is a variable and we
56+
// have a simple prefix regex (in which case the regex match translates to a
57+
// simple range check).
5058
ExpressionResult evaluatePrefixRegex(
5159
const Variable& variable,
5260
sparqlExpression::EvaluationContext* context) const;
53-
ExpressionResult evaluateNonPrefixRegex(
54-
const Variable& variable,
55-
sparqlExpression::EvaluationContext* context) const;
5661

57-
/// Helper function to check if the `CancellationHandle` of the passed
58-
/// `EvaluationContext` has been cancelled and throw an exception if this is
59-
/// the case.
62+
// Evaluate for the general case.
63+
template <SingleExpressionResult T>
64+
ExpressionResult evaluateGeneralCase(
65+
T&& input, sparqlExpression::EvaluationContext* context) const;
66+
67+
// Check if the `CancellationHandle` of `context` has been cancelled and throw
68+
// an exception if this is the case.
6069
static void checkCancellation(
6170
const sparqlExpression::EvaluationContext* context,
6271
ad_utility::source_location location =

0 commit comments

Comments
 (0)