@@ -72,19 +72,21 @@ std::optional<std::string> getPrefixRegex(std::string regex) {
72
72
} // namespace sparqlExpression::detail
73
73
74
74
namespace sparqlExpression {
75
+
75
76
// ___________________________________________________________________________
76
77
RegexExpression::RegexExpression (
77
78
SparqlExpression::Ptr child, SparqlExpression::Ptr regex,
78
79
std::optional<SparqlExpression::Ptr> optionalFlags)
79
80
: child_{std::move (child)} {
81
+ // If we have a `STR()` expression, remove the `STR()` and remember that it
82
+ // was there.
80
83
if (child_->isStrExpression ()) {
81
84
child_ = std::move (std::move (*child_).moveChildrenOut ().at (0 ));
82
85
childIsStrExpression_ = true ;
83
86
}
84
- if (!dynamic_cast <const VariableExpression*>(child_.get ())) {
85
- throw std::runtime_error (
86
- " REGEX expressions are currently supported only on variables." );
87
- }
87
+
88
+ // Get the regex string, which must be a string literal without a datatype or
89
+ // language tag.
88
90
std::string regexString;
89
91
if (auto regexPtr =
90
92
dynamic_cast <const StringLiteralExpression*>(regex.get ())) {
@@ -100,6 +102,9 @@ RegexExpression::RegexExpression(
100
102
" The second argument to the REGEX function must be a "
101
103
" string literal (which contains the regular expression)" );
102
104
}
105
+
106
+ // Parse the flags. The optional argument for that must, again, be a
107
+ // string literal without a datatype or language tag.
103
108
if (optionalFlags.has_value ()) {
104
109
if (auto flagsPtr = dynamic_cast <const StringLiteralExpression*>(
105
110
optionalFlags.value ().get ())) {
@@ -131,19 +136,18 @@ RegexExpression::RegexExpression(
131
136
}
132
137
}
133
138
139
+ // Create RE2 object from the regex string. If it is a simple prefix regex,
140
+ // store the prefix in `prefixRegex_` (otherwise that becomes `std::nullopt`).
134
141
regexAsString_ = regexString;
135
- if (auto opt = detail::getPrefixRegex (regexString)) {
136
- regex_ = std::move (opt.value ());
137
- } else {
138
- regex_.emplace <RE2>(regexString, RE2::Quiet);
139
- const auto & r = std::get<RE2>(regex_);
140
- if (r.error_code () != RE2::NoError) {
141
- throw std::runtime_error{absl::StrCat (
142
- " The regex \" " , regexString,
143
- " \" is not supported by QLever (which uses Google's RE2 library). "
144
- " Error from RE2 is: " ,
145
- r.error ())};
146
- }
142
+ prefixRegex_ = detail::getPrefixRegex (regexString);
143
+ regex_.emplace (regexString, RE2::Quiet);
144
+ const auto & r = regex_.value ();
145
+ if (r.error_code () != RE2::NoError) {
146
+ throw std::runtime_error{absl::StrCat (
147
+ " The regex \" " , regexString,
148
+ " \" is not supported by QLever (which uses Google's RE2 library); "
149
+ " the error from RE2 is: " ,
150
+ r.error ())};
147
151
}
148
152
}
149
153
@@ -163,17 +167,27 @@ std::span<SparqlExpression::Ptr> RegexExpression::childrenImpl() {
163
167
ExpressionResult RegexExpression::evaluatePrefixRegex (
164
168
const Variable& variable,
165
169
sparqlExpression::EvaluationContext* context) const {
166
- std::string prefixRegex = std::get<std::string>(regex_);
170
+ // This function must only be called if we have a simple prefix regex.
171
+ AD_CORRECTNESS_CHECK (prefixRegex_.has_value ());
172
+ std::string prefixRegex = prefixRegex_.value ();
173
+
174
+ // If the expression is enclosed in `STR()`, we have two ranges: for the
175
+ // prefix with and without leading "<".
176
+ //
177
+ // TODO<joka921> prefix filters currently have false negatives when the prefix
178
+ // is not in the vocabulary, and there exist local vocab entries in the input
179
+ // that are between the prefix and the next local vocab entry. This is
180
+ // non-trivial to fix as it involves fiddling with Unicode prefix encodings.
181
+ //
182
+ // TODO<joka921> prefix filters currently never find numbers or other
183
+ // datatypes that are encoded directly inside the IDs.
167
184
std::vector<std::string> actualPrefixes;
168
185
actualPrefixes.push_back (" \" " + prefixRegex);
169
- // If the STR function was applied, we also look for prefix matches for IRIs.
170
- // TODO<joka921> prefix filters currently never find numbers or local vocab
171
- // entries, numbers, or other datatypes that are encoded directly inside the
172
- // IDs.
173
186
if (childIsStrExpression_) {
174
187
actualPrefixes.push_back (" <" + prefixRegex);
175
188
}
176
- std::vector<ad_utility::SetOfIntervals> resultSetOfIntervals;
189
+
190
+ // Compute the (one or two) ranges.
177
191
std::vector<std::pair<Id, Id>> lowerAndUpperIds;
178
192
lowerAndUpperIds.reserve (actualPrefixes.size ());
179
193
for (const auto & prefix : actualPrefixes) {
@@ -184,12 +198,21 @@ ExpressionResult RegexExpression::evaluatePrefixRegex(
184
198
}
185
199
}
186
200
checkCancellation (context);
201
+
202
+ // Begin and end of the input (for each row of which we want to
203
+ // evaluate the regex).
187
204
auto beg = context->_inputTable .begin () + context->_beginIndex ;
188
205
auto end = context->_inputTable .begin () + context->_endIndex ;
189
206
AD_CONTRACT_CHECK (end <= context->_inputTable .end ());
207
+
208
+ // In this function, the expression is a simple variable. If the input is
209
+ // sorted by that variable, the result can be computed by a constant number
210
+ // of binary searches and the result is a set of intervals.
211
+ std::vector<ad_utility::SetOfIntervals> resultSetOfIntervals;
190
212
if (context->isResultSortedBy (variable)) {
191
213
auto column = context->getColumnIndexForVariable (variable);
192
214
for (auto [lowerId, upperId] : lowerAndUpperIds) {
215
+ // Two binary searches to find the lower and upper bounds of the range.
193
216
auto lower = std::lower_bound (
194
217
beg, end, nullptr ,
195
218
[column, lowerId = lowerId](const auto & l, const auto &) {
@@ -200,7 +223,6 @@ ExpressionResult RegexExpression::evaluatePrefixRegex(
200
223
[column, upperId = upperId](const auto & l, const auto &) {
201
224
return l[column] < upperId;
202
225
});
203
-
204
226
// Return the empty result as an empty `SetOfIntervals` instead of as an
205
227
// empty range.
206
228
if (lower != upper) {
@@ -212,47 +234,58 @@ ExpressionResult RegexExpression::evaluatePrefixRegex(
212
234
return std::reduce (resultSetOfIntervals.begin (), resultSetOfIntervals.end (),
213
235
ad_utility::SetOfIntervals{},
214
236
ad_utility::SetOfIntervals::Union{});
215
- } else {
216
- auto resultSize = context->size ();
217
- VectorWithMemoryLimit<Id> result{context->_allocator };
218
- result.reserve (resultSize);
219
- for (auto id : detail::makeGenerator (variable, resultSize, context)) {
220
- result.push_back (Id::makeFromBool (
221
- std::ranges::any_of (lowerAndUpperIds, [&](const auto & lowerUpper) {
222
- return !valueIdComparators::compareByBits (id, lowerUpper.first ) &&
223
- valueIdComparators::compareByBits (id, lowerUpper.second );
224
- })));
225
- checkCancellation (context);
226
- }
227
- return result;
228
237
}
238
+
239
+ // If the input is not sorted by the variable, we have to check each row
240
+ // individually (by checking inclusion in the ranges).
241
+ auto resultSize = context->size ();
242
+ VectorWithMemoryLimit<Id> result{context->_allocator };
243
+ result.reserve (resultSize);
244
+ for (auto id : detail::makeGenerator (variable, resultSize, context)) {
245
+ result.push_back (Id::makeFromBool (
246
+ std::ranges::any_of (lowerAndUpperIds, [&](const auto & lowerUpper) {
247
+ return !valueIdComparators::compareByBits (id, lowerUpper.first ) &&
248
+ valueIdComparators::compareByBits (id, lowerUpper.second );
249
+ })));
250
+ checkCancellation (context);
251
+ }
252
+ return result;
229
253
}
230
254
231
255
// ___________________________________________________________________________
232
- ExpressionResult RegexExpression::evaluateNonPrefixRegex (
233
- const Variable& variable,
234
- sparqlExpression::EvaluationContext* context) const {
235
- AD_CONTRACT_CHECK (std::holds_alternative<RE2>(regex_));
256
+ template <SingleExpressionResult T>
257
+ ExpressionResult RegexExpression::evaluateGeneralCase (
258
+ T&& input, sparqlExpression::EvaluationContext* context) const {
259
+ // We have one result for each row of the input.
236
260
auto resultSize = context->size ();
237
261
VectorWithMemoryLimit<Id> result{context->_allocator };
238
262
result.reserve (resultSize);
263
+ AD_CORRECTNESS_CHECK (regex_.has_value ());
239
264
240
- auto impl = [&]<typename ValueGetter>(const ValueGetter& getter) {
241
- for (auto id : detail::makeGenerator (variable, resultSize, context)) {
242
- auto str = getter (id, context);
243
- if (!str.has_value ()) {
244
- result.push_back (Id::makeUndefined ());
245
- } else {
246
- result.push_back (Id::makeFromBool (
247
- RE2::PartialMatch (str.value (), std::get<RE2>(regex_))));
248
- }
249
- checkCancellation (context);
250
- }
265
+ // Compute the result using the given value getter. If the getter returns
266
+ // `std::nullopt` for a row, the result is `UNDEF`. Otherwise, we have a
267
+ // string and evaluate the regex on it.
268
+ auto computeResult = [&]<typename ValueGetter>(const ValueGetter& getter) {
269
+ std::ranges::for_each (
270
+ detail::makeGenerator (AD_FWD (input), resultSize, context),
271
+ [&getter, &context, &result, this ](const auto & id) {
272
+ auto str = getter (id, context);
273
+ if (!str.has_value ()) {
274
+ result.push_back (Id::makeUndefined ());
275
+ } else {
276
+ result.push_back (Id::makeFromBool (
277
+ RE2::PartialMatch (str.value (), regex_.value ())));
278
+ }
279
+ checkCancellation (context);
280
+ });
251
281
};
282
+
283
+ // Compute the result with the correct value getter (depending on whether the
284
+ // expression is enclosed in `STR()` or not), and return it.
252
285
if (childIsStrExpression_) {
253
- impl (detail::StringValueGetter{});
286
+ computeResult (detail::StringValueGetter{});
254
287
} else {
255
- impl (detail::LiteralFromIdGetter{});
288
+ computeResult (detail::LiteralFromIdGetter{});
256
289
}
257
290
return result;
258
291
}
@@ -262,51 +295,57 @@ ExpressionResult RegexExpression::evaluate(
262
295
sparqlExpression::EvaluationContext* context) const {
263
296
auto resultAsVariant = child_->evaluate (context);
264
297
auto variablePtr = std::get_if<Variable>(&resultAsVariant);
265
- AD_CONTRACT_CHECK (variablePtr);
266
298
267
- if (std::holds_alternative<std::string>(regex_) ) {
299
+ if (prefixRegex_. has_value () && variablePtr != nullptr ) {
268
300
return evaluatePrefixRegex (*variablePtr, context);
269
301
} else {
270
- return evaluateNonPrefixRegex (*variablePtr, context);
302
+ return std::visit (
303
+ [this , context](auto && input) {
304
+ return evaluateGeneralCase (AD_FWD (input), context);
305
+ },
306
+ std::move (resultAsVariant));
271
307
}
272
308
}
273
309
274
310
// ____________________________________________________________________________
275
311
bool RegexExpression::isPrefixExpression () const {
276
- return std::holds_alternative<std::string>(regex_ );
312
+ return prefixRegex_. has_value ( );
277
313
}
278
314
279
315
// ____________________________________________________________________________
280
316
auto RegexExpression::getEstimatesForFilterExpression (
281
317
uint64_t inputSize,
282
318
const std::optional<Variable>& firstSortedVariable) const -> Estimates {
319
+ // If we have a simple prefix regex, assume that a fraction 10^-k remains,
320
+ // where k is the length of the prefix.
283
321
if (isPrefixExpression ()) {
284
- // Assume that only 10^-k entries remain, where k is the length of the
285
- // prefix. The reason for the -2 is that at this point, _rhs always
286
- // starts with ^"
287
322
double reductionFactor = std::pow (
288
- 10 , std::max (
289
- 0 , static_cast <int >(std::get<std::string>(regex_).size ()) - 2 ));
323
+ 10 , std::max (0 , static_cast <int >(prefixRegex_.value ().size ())));
290
324
// Cap to reasonable minimal and maximal values to prevent numerical
291
325
// stability problems.
292
326
reductionFactor = std::min (100000000.0 , reductionFactor);
293
327
reductionFactor = std::max (1.0 , reductionFactor);
294
328
size_t sizeEstimate = inputSize / static_cast <size_t >(reductionFactor);
295
329
auto varPtr = dynamic_cast <VariableExpression*>(child_.get ());
296
- AD_CONTRACT_CHECK (varPtr);
297
- size_t costEstimate = firstSortedVariable == varPtr->value ()
330
+ size_t costEstimate = (varPtr && firstSortedVariable == varPtr->value ())
298
331
? sizeEstimate
299
332
: sizeEstimate + inputSize;
300
333
301
- return {sizeEstimate, costEstimate};
302
- } else { // Not a prefix filter.
303
- size_t sizeEstimate = inputSize / 2 ;
304
- // We assume that checking a REGEX for an element is 10 times more
305
- // expensive than an "ordinary" filter check.
306
- size_t costEstimate = sizeEstimate + 10 * inputSize;
307
-
308
334
return {sizeEstimate, costEstimate};
309
335
}
336
+
337
+ // For the general case, we make two assumptions.
338
+ //
339
+ // 1. Half of the entries remain after the filter. This is a very simple
340
+ // and arbitrary heuristic.
341
+ //
342
+ // 2. Checking a REGEX for an element is 10 times more expensive than a
343
+ // "simple" filter check. This is reasonable because regex evaluations are
344
+ // expensive, but the fixed factor disregards that it depends on the
345
+ // complexity of the regex how expensive it is.
346
+ size_t sizeEstimate = inputSize / 2 ;
347
+ size_t costEstimate = sizeEstimate + 10 * inputSize;
348
+ return {sizeEstimate, costEstimate};
310
349
}
311
350
312
351
// ____________________________________________________________________________
0 commit comments