Skip to content

Commit

Permalink
Allow REGEX for arbitrary expressions (not just a variable) (#1576)
Browse files Browse the repository at this point in the history
So far, the `REGEX` function was only implemented for the (frequent) special case, where the first argument is either a variable (like `?x`) or `STR` of a variable (like `STR(?x)`). Now `REGEX` works for arbitrary expressions. Use the occasion to clean up the code a little bit and improve the documentation.
  • Loading branch information
joka921 authored Nov 5, 2024
1 parent e528480 commit 3d321c2
Show file tree
Hide file tree
Showing 3 changed files with 186 additions and 104 deletions.
181 changes: 110 additions & 71 deletions src/engine/sparqlExpressions/RegexExpression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,19 +72,21 @@ std::optional<std::string> getPrefixRegex(std::string regex) {
} // namespace sparqlExpression::detail

namespace sparqlExpression {

// ___________________________________________________________________________
RegexExpression::RegexExpression(
SparqlExpression::Ptr child, SparqlExpression::Ptr regex,
std::optional<SparqlExpression::Ptr> optionalFlags)
: child_{std::move(child)} {
// If we have a `STR()` expression, remove the `STR()` and remember that it
// was there.
if (child_->isStrExpression()) {
child_ = std::move(std::move(*child_).moveChildrenOut().at(0));
childIsStrExpression_ = true;
}
if (!dynamic_cast<const VariableExpression*>(child_.get())) {
throw std::runtime_error(
"REGEX expressions are currently supported only on variables.");
}

// Get the regex string, which must be a string literal without a datatype or
// language tag.
std::string regexString;
if (auto regexPtr =
dynamic_cast<const StringLiteralExpression*>(regex.get())) {
Expand All @@ -100,6 +102,9 @@ RegexExpression::RegexExpression(
"The second argument to the REGEX function must be a "
"string literal (which contains the regular expression)");
}

// Parse the flags. The optional argument for that must, again, be a
// string literal without a datatype or language tag.
if (optionalFlags.has_value()) {
if (auto flagsPtr = dynamic_cast<const StringLiteralExpression*>(
optionalFlags.value().get())) {
Expand Down Expand Up @@ -131,19 +136,18 @@ RegexExpression::RegexExpression(
}
}

// Create RE2 object from the regex string. If it is a simple prefix regex,
// store the prefix in `prefixRegex_` (otherwise that becomes `std::nullopt`).
regexAsString_ = regexString;
if (auto opt = detail::getPrefixRegex(regexString)) {
regex_ = std::move(opt.value());
} else {
regex_.emplace<RE2>(regexString, RE2::Quiet);
const auto& r = std::get<RE2>(regex_);
if (r.error_code() != RE2::NoError) {
throw std::runtime_error{absl::StrCat(
"The regex \"", regexString,
"\" is not supported by QLever (which uses Google's RE2 library). "
"Error from RE2 is: ",
r.error())};
}
prefixRegex_ = detail::getPrefixRegex(regexString);
regex_.emplace(regexString, RE2::Quiet);
const auto& r = regex_.value();
if (r.error_code() != RE2::NoError) {
throw std::runtime_error{absl::StrCat(
"The regex \"", regexString,
"\" is not supported by QLever (which uses Google's RE2 library); "
"the error from RE2 is: ",
r.error())};
}
}

Expand All @@ -163,17 +167,27 @@ std::span<SparqlExpression::Ptr> RegexExpression::childrenImpl() {
ExpressionResult RegexExpression::evaluatePrefixRegex(
const Variable& variable,
sparqlExpression::EvaluationContext* context) const {
std::string prefixRegex = std::get<std::string>(regex_);
// This function must only be called if we have a simple prefix regex.
AD_CORRECTNESS_CHECK(prefixRegex_.has_value());
std::string prefixRegex = prefixRegex_.value();

// If the expression is enclosed in `STR()`, we have two ranges: one for the
// prefix with a leading `"` (literals) and one with a leading `<` (IRIs).
//
// TODO<joka921> prefix filters currently have false negatives when the prefix
// is not in the vocabulary, and there exist local vocab entries in the input
// that are between the prefix and the next local vocab entry. This is
// non-trivial to fix as it involves fiddling with Unicode prefix encodings.
//
// TODO<joka921> prefix filters currently never find numbers or other
// datatypes that are encoded directly inside the IDs.
std::vector<std::string> actualPrefixes;
actualPrefixes.push_back("\"" + prefixRegex);
// If the STR function was applied, we also look for prefix matches for IRIs.
// TODO<joka921> prefix filters currently never find local vocab entries,
// numbers, or other datatypes that are encoded directly inside the IDs.
if (childIsStrExpression_) {
actualPrefixes.push_back("<" + prefixRegex);
}
std::vector<ad_utility::SetOfIntervals> resultSetOfIntervals;

// Compute the (one or two) ranges.
std::vector<std::pair<Id, Id>> lowerAndUpperIds;
lowerAndUpperIds.reserve(actualPrefixes.size());
for (const auto& prefix : actualPrefixes) {
Expand All @@ -184,12 +198,21 @@ ExpressionResult RegexExpression::evaluatePrefixRegex(
}
}
checkCancellation(context);

// Begin and end of the input (for each row of which we want to
// evaluate the regex).
auto beg = context->_inputTable.begin() + context->_beginIndex;
auto end = context->_inputTable.begin() + context->_endIndex;
AD_CONTRACT_CHECK(end <= context->_inputTable.end());

// In this function, the expression is a simple variable. If the input is
// sorted by that variable, the result can be computed by a constant number
// of binary searches and the result is a set of intervals.
std::vector<ad_utility::SetOfIntervals> resultSetOfIntervals;
if (context->isResultSortedBy(variable)) {
auto column = context->getColumnIndexForVariable(variable);
for (auto [lowerId, upperId] : lowerAndUpperIds) {
// Two binary searches to find the lower and upper bounds of the range.
auto lower = std::lower_bound(
beg, end, nullptr,
[column, lowerId = lowerId](const auto& l, const auto&) {
Expand All @@ -200,7 +223,6 @@ ExpressionResult RegexExpression::evaluatePrefixRegex(
[column, upperId = upperId](const auto& l, const auto&) {
return l[column] < upperId;
});

// Return the empty result as an empty `SetOfIntervals` instead of as an
// empty range.
if (lower != upper) {
Expand All @@ -212,47 +234,58 @@ ExpressionResult RegexExpression::evaluatePrefixRegex(
return std::reduce(resultSetOfIntervals.begin(), resultSetOfIntervals.end(),
ad_utility::SetOfIntervals{},
ad_utility::SetOfIntervals::Union{});
} else {
auto resultSize = context->size();
VectorWithMemoryLimit<Id> result{context->_allocator};
result.reserve(resultSize);
for (auto id : detail::makeGenerator(variable, resultSize, context)) {
result.push_back(Id::makeFromBool(
std::ranges::any_of(lowerAndUpperIds, [&](const auto& lowerUpper) {
return !valueIdComparators::compareByBits(id, lowerUpper.first) &&
valueIdComparators::compareByBits(id, lowerUpper.second);
})));
checkCancellation(context);
}
return result;
}

// If the input is not sorted by the variable, we have to check each row
// individually (by checking inclusion in the ranges).
auto resultSize = context->size();
VectorWithMemoryLimit<Id> result{context->_allocator};
result.reserve(resultSize);
for (auto id : detail::makeGenerator(variable, resultSize, context)) {
result.push_back(Id::makeFromBool(
std::ranges::any_of(lowerAndUpperIds, [&](const auto& lowerUpper) {
return !valueIdComparators::compareByBits(id, lowerUpper.first) &&
valueIdComparators::compareByBits(id, lowerUpper.second);
})));
checkCancellation(context);
}
return result;
}

// ___________________________________________________________________________
ExpressionResult RegexExpression::evaluateNonPrefixRegex(
const Variable& variable,
sparqlExpression::EvaluationContext* context) const {
AD_CONTRACT_CHECK(std::holds_alternative<RE2>(regex_));
template <SingleExpressionResult T>
ExpressionResult RegexExpression::evaluateGeneralCase(
T&& input, sparqlExpression::EvaluationContext* context) const {
// We have one result for each row of the input.
auto resultSize = context->size();
VectorWithMemoryLimit<Id> result{context->_allocator};
result.reserve(resultSize);
AD_CORRECTNESS_CHECK(regex_.has_value());

auto impl = [&]<typename ValueGetter>(const ValueGetter& getter) {
for (auto id : detail::makeGenerator(variable, resultSize, context)) {
auto str = getter(id, context);
if (!str.has_value()) {
result.push_back(Id::makeUndefined());
} else {
result.push_back(Id::makeFromBool(
RE2::PartialMatch(str.value(), std::get<RE2>(regex_))));
}
checkCancellation(context);
}
// Compute the result using the given value getter. If the getter returns
// `std::nullopt` for a row, the result is `UNDEF`. Otherwise, we have a
// string and evaluate the regex on it.
auto computeResult = [&]<typename ValueGetter>(const ValueGetter& getter) {
std::ranges::for_each(
detail::makeGenerator(AD_FWD(input), resultSize, context),
[&getter, &context, &result, this](const auto& id) {
auto str = getter(id, context);
if (!str.has_value()) {
result.push_back(Id::makeUndefined());
} else {
result.push_back(Id::makeFromBool(
RE2::PartialMatch(str.value(), regex_.value())));
}
checkCancellation(context);
});
};

// Compute the result with the correct value getter (depending on whether the
// expression is enclosed in `STR()` or not), and return it.
if (childIsStrExpression_) {
impl(detail::StringValueGetter{});
computeResult(detail::StringValueGetter{});
} else {
impl(detail::LiteralFromIdGetter{});
computeResult(detail::LiteralFromIdGetter{});
}
return result;
}
Expand All @@ -262,51 +295,57 @@ ExpressionResult RegexExpression::evaluate(
sparqlExpression::EvaluationContext* context) const {
auto resultAsVariant = child_->evaluate(context);
auto variablePtr = std::get_if<Variable>(&resultAsVariant);
AD_CONTRACT_CHECK(variablePtr);

if (std::holds_alternative<std::string>(regex_)) {
if (prefixRegex_.has_value() && variablePtr != nullptr) {
return evaluatePrefixRegex(*variablePtr, context);
} else {
return evaluateNonPrefixRegex(*variablePtr, context);
return std::visit(
[this, context](auto&& input) {
return evaluateGeneralCase(AD_FWD(input), context);
},
std::move(resultAsVariant));
}
}

// ____________________________________________________________________________
bool RegexExpression::isPrefixExpression() const {
return std::holds_alternative<std::string>(regex_);
return prefixRegex_.has_value();
}

// ____________________________________________________________________________
auto RegexExpression::getEstimatesForFilterExpression(
uint64_t inputSize,
const std::optional<Variable>& firstSortedVariable) const -> Estimates {
// If we have a simple prefix regex, assume that only a fraction of 10^-k of
// the entries remains, where k is the length of the prefix.
if (isPrefixExpression()) {
// Assume that only 10^-k entries remain, where k is the length of the
// prefix. The reason for the -2 is that at this point, _rhs always
// starts with ^"
double reductionFactor = std::pow(
10, std::max(
0, static_cast<int>(std::get<std::string>(regex_).size()) - 2));
10, std::max(0, static_cast<int>(prefixRegex_.value().size())));
// Cap to reasonable minimal and maximal values to prevent numerical
// stability problems.
reductionFactor = std::min(100000000.0, reductionFactor);
reductionFactor = std::max(1.0, reductionFactor);
size_t sizeEstimate = inputSize / static_cast<size_t>(reductionFactor);
auto varPtr = dynamic_cast<VariableExpression*>(child_.get());
AD_CONTRACT_CHECK(varPtr);
size_t costEstimate = firstSortedVariable == varPtr->value()
size_t costEstimate = (varPtr && firstSortedVariable == varPtr->value())
? sizeEstimate
: sizeEstimate + inputSize;

return {sizeEstimate, costEstimate};
} else { // Not a prefix filter.
size_t sizeEstimate = inputSize / 2;
// We assume that checking a REGEX for an element is 10 times more
// expensive than an "ordinary" filter check.
size_t costEstimate = sizeEstimate + 10 * inputSize;

return {sizeEstimate, costEstimate};
}

// For the general case, we make two assumptions.
//
// 1. Half of the entries remain after the filter. This is a very simple
// and arbitrary heuristic.
//
// 2. Checking a REGEX for an element is 10 times more expensive than a
// "simple" filter check. This is reasonable because regex evaluations are
// expensive, but the fixed factor disregards that the actual cost depends
// on the complexity of the regex.
size_t sizeEstimate = inputSize / 2;
size_t costEstimate = sizeEstimate + 10 * inputSize;
return {sizeEstimate, costEstimate};
}

// ____________________________________________________________________________
Expand Down
43 changes: 26 additions & 17 deletions src/engine/sparqlExpressions/RegexExpression.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// Copyright 2022, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Johannes Kalmbach <[email protected]>
// Copyright 2022 - 2024, University of Freiburg
// Chair of Algorithms and Data Structures
// Author: Johannes Kalmbach <[email protected]>

#pragma once

Expand All @@ -11,22 +11,27 @@
#include "re2/re2.h"

namespace sparqlExpression {
// Class implementing the REGEX function, which takes two mandatory arguments
// (an expression and a regex) and one optional argument (a string of flags).
class RegexExpression : public SparqlExpression {
private:
SparqlExpression::Ptr child_;
// If this variant holds a string, we consider this string as the prefix of a
// prefix regex.
std::variant<std::string, RE2> regex_;
// The regular expression. It needs to be a `std::optional` because `RE2`
// objects do not have a default constructor.
std::optional<RE2> regex_;
// If this `std::optional` holds a string, we have a simple prefix regex
// (which translates to a range search) and this string holds the prefix.
std::optional<std::string> prefixRegex_;
// The regex as a string, used for the cache key.
std::string regexAsString_;

// True if the STR() function is to be applied on the child before evaluating
// the regex.
// True iff the expression is enclosed in `STR()`.
bool childIsStrExpression_ = false;

public:
// `child` must be a `VariableExpression` and `regex` must be a
// `LiteralExpression` that stores a string, else an exception will be thrown.
// The `child` can be an arbitrary expression. The `regex` must be a
// `LiteralExpression` that stores a string, otherwise an exception will be
// thrown.
RegexExpression(SparqlExpression::Ptr child, SparqlExpression::Ptr regex,
std::optional<SparqlExpression::Ptr> optionalFlags);

Expand All @@ -46,17 +51,21 @@ class RegexExpression : public SparqlExpression {

private:
std::span<SparqlExpression::Ptr> childrenImpl() override;
// Internal implementations that are called by `evaluate`.

// Evaluate for the special case, where the expression is a variable and we
// have a simple prefix regex (in which case the regex match translates to a
// simple range check).
ExpressionResult evaluatePrefixRegex(
const Variable& variable,
sparqlExpression::EvaluationContext* context) const;
ExpressionResult evaluateNonPrefixRegex(
const Variable& variable,
sparqlExpression::EvaluationContext* context) const;

/// Helper function to check if the `CancellationHandle` of the passed
/// `EvaluationContext` has been cancelled and throw an exception if this is
/// the case.
// Evaluate for the general case.
template <SingleExpressionResult T>
ExpressionResult evaluateGeneralCase(
T&& input, sparqlExpression::EvaluationContext* context) const;

// Check if the `CancellationHandle` of `context` has been cancelled and throw
// an exception if this is the case.
static void checkCancellation(
const sparqlExpression::EvaluationContext* context,
ad_utility::source_location location =
Expand Down
Loading

0 comments on commit 3d321c2

Please sign in to comment.