diff --git a/e2e/scientists_queries.yaml b/e2e/scientists_queries.yaml index 0b6dc1418c..29ee2481fc 100644 --- a/e2e/scientists_queries.yaml +++ b/e2e/scientists_queries.yaml @@ -5,16 +5,16 @@ queries: - query: relativ-star-scientists type: text sparql: | - SELECT ?x ?t ?ql_textscore_t WHERE { + SELECT ?x ?t ?ql_score_t_var_x WHERE { ?x . ?t ql:contains-entity ?x . ?t ql:contains-word "relati*" } - ORDER BY DESC(?ql_textscore_t) + ORDER BY DESC(?ql_score_t_var_x) checks: - num_cols: 3 - num_rows: 4285 - - selected: [ "?x", "?t", "?ql_textscore_t"] + - selected: [ "?x", "?t", "?ql_score_t_var_x"] - contains_row: - "" - "He realized, however, that the principle of relativity could also be extended @@ -23,30 +23,27 @@ queries: - null - contains_row: [ "", null, null ] # null cells are ignored - contains_row: [ "", null, null ] # Test Unicode - - order_numeric: {"dir" : "DESC", "var": "?ql_textscore_t"} + - order_numeric: {"dir" : "DESC", "var": "?ql_score_t_var_x"} - - query: relativ-star-scientists-from-ulm # should use TextOperationWithFilter + - query: relativ-star-scientists-from-ulm type: text sparql: | - SELECT ?x ?t ?ql_textscore_t WHERE { + SELECT ?x ?t WHERE { ?x . ?x . ?t ql:contains-entity ?x . ?t ql:contains-word "relati*" } - ORDER BY DESC(?ql_textscore_t) - TEXTLIMIT 1 checks: - - num_cols: 3 - - num_rows: 1 - - selected: [ "?x", "?t", "?ql_textscore_t" ] + - num_cols: 2 + - num_rows: 172 + - selected: [ "?x", "?t"] - contains_row: - "" - "He realized, however, that the principle of relativity could also be extended to gravitational fields, and with his subsequent theory of gravitation in 1916, he published a paper on general relativity." - - null - query: relat-star-Physikalische-real-star-scientists-from-ulm type: text @@ -55,11 +52,11 @@ queries: ?x . ?x . ?t ql:contains-entity ?x . 
- ?t ql:contains-word "relat* Physikalische rela*" + ?t ql:contains-word "RElaT* phySIKalische rela*" } checks: - num_cols: 5 - - selected: [ "?x", "?ql_textscore_t", "?t", "?ql_matchingword_t_relat", "?ql_matchingword_t_rela" ] + - selected: [ "?x", "?ql_score_t_var_x", "?t", "?ql_matchingword_t_relat", "?ql_matchingword_t_rela" ] - contains_row: - "" - null @@ -88,26 +85,26 @@ queries: - query: algo-star-female-scientists type: text sparql: | - SELECT ?x ?ql_textscore_t WHERE { + SELECT ?x ?ql_score_t_var_x WHERE { ?x . ?x . ?t ql:contains-entity ?x . ?t ql:contains-word "algo*" } - ORDER BY DESC(?ql_textscore_t) + ORDER BY DESC(?ql_score_t_var_x) checks: - num_cols: 2 - num_rows: 27 - - selected: [ "?x", "?ql_textscore_t" ] + - selected: [ "?x", "?ql_score_t_var_x" ] - contains_row: [ "", null ] - - order_numeric: {"dir": "DESC", "var" : "?ql_textscore_t"} + - order_numeric: {"dir": "DESC", "var" : "?ql_score_t_var_x"} - - query: algor-start-female-born-before-1940 + - query: algor-star-female-born-before-1940 type: text sparql: | PREFIX xsd: - SELECT ?x ?date ?t ?ql_textscore_t ?ql_matchingword_t_algor WHERE { + SELECT ?x ?date ?t ?ql_matchingword_t_algor WHERE { ?x . ?x ?date . ?x . @@ -115,29 +112,75 @@ queries: ?t ql:contains-word "algor*" . FILTER (?date < "1940-01-01"^^xsd:date) } - ORDER BY DESC(?ql_textscore_t) checks: - - num_cols: 5 + - num_cols: 4 - num_rows: 4 - contains_row: - "" - "1901-03-02" - "Hermann's algorithm for primary decomposition is still in use now." - - null - "algorithm" - contains_row: - "" - "1815-12-10" - "Her notes on the engine include what is recognised as the first algorithm intended to be carried out by a machine." - - null - "algorithm" - - order_numeric: {"dir": "DESC", "var" : "?ql_textscore_t"} - - query: algorithm-hermann-start-female-born-before-1940 + - query: algor-star-female-fixedEntity-ada-ordered + type: text + sparql: | + SELECT * WHERE { + ?scientist . + ?scientist . + ?text ql:contains-entity ?scientist . 
+ ?text ql:contains-entity . + ?text ql:contains-word "rela*" . + } + ORDER BY DESC(?ql_score_text_fixedEntity__60_Ada_95_Lovelace_62_) + checks: + - num_cols: 5 + - num_rows: 7 + - contains_row: + - "" + - null + - "As a teenager, her mathematical talents led her to an ongoing + working relationship and friendship with fellow British mathematician + Charles Babbage, also known as' the father of computers', and in + particular, Babbage's work on the Analytical Engine." + - null + - "relationship" + - order_numeric: {"dir": "DESC", + "var" : "?ql_score_text_fixedEntity__60_Ada_95_Lovelace_62_"} + + - query: algor-star-female-fixedEntity-ada-fixed-Entity-mary + type: text + sparql: | + SELECT * WHERE { + ?scientist . + ?scientist . + ?text ql:contains-entity ?scientist . + ?text ql:contains-entity . + ?text ql:contains-entity . + ?text ql:contains-word "rela*" . + } + checks: + - num_cols: 6 + - num_rows: 2 + - contains_row: + - "" + - null + - "She became fascinated with the machine and used her relationship + with Somerville to visit Babbage as often as she could." + - null + - null + - "relationship" + + + - query: algorithm-hermann-star-female-born-before-1940 type: text sparql: | PREFIX xsd: - SELECT ?x ?date ?t ?ql_textscore_t WHERE { + SELECT ?x ?date ?t WHERE { ?x . ?x ?date . ?x . @@ -145,16 +188,13 @@ queries: ?t ql:contains-word "algorithm hermann" . FILTER (?date < "1940-01-01"^^xsd:date) } - ORDER BY DESC(?ql_textscore_t) checks: - - num_cols: 4 + - num_cols: 3 - num_rows: 1 - contains_row: - "" - "1901-03-02" - "Hermann's algorithm for primary decomposition is still in use now." - - null - - order_numeric: {"dir": "DESC", "var" : "?ql_textscore_t"} - query: people-born-in-1901 type: no-text @@ -1239,11 +1279,11 @@ queries: ?x . ?t ql:contains-entity ?x . 
?t ql:contains-word "algo* herm* primary" - } TEXTLIMIT 1 + } checks: - num_cols: 5 - num_rows: 1 - - selected: [ "?x", "?ql_textscore_t", "?t", "?ql_matchingword_t_algo", "?ql_matchingword_t_herm" ] + - selected: [ "?x", "?ql_score_t_var_x", "?t", "?ql_matchingword_t_algo", "?ql_matchingword_t_herm" ] - contains_row: [ "",null,"Hermann's algorithm for primary decomposition is still in use now.","algorithm","hermann" ] diff --git a/src/engine/CMakeLists.txt b/src/engine/CMakeLists.txt index 8e37590511..b68c739d24 100644 --- a/src/engine/CMakeLists.txt +++ b/src/engine/CMakeLists.txt @@ -10,6 +10,6 @@ add_library(engine Union.cpp MultiColumnJoin.cpp TransitivePath.cpp Service.cpp Values.cpp Bind.cpp Minus.cpp RuntimeInformation.cpp CheckUsePatternTrick.cpp VariableToColumnMap.cpp ExportQueryExecutionTrees.cpp - CartesianProductJoin.cpp + CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp idTable/CompressedExternalIdTable.h) qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams) diff --git a/src/engine/QueryExecutionTree.cpp b/src/engine/QueryExecutionTree.cpp index b7a13a6af3..b372a76bbf 100644 --- a/src/engine/QueryExecutionTree.cpp +++ b/src/engine/QueryExecutionTree.cpp @@ -28,6 +28,8 @@ #include "engine/OrderBy.h" #include "engine/Service.h" #include "engine/Sort.h" +#include "engine/TextIndexScanForEntity.h" +#include "engine/TextIndexScanForWord.h" #include "engine/TextOperationWithFilter.h" #include "engine/TextOperationWithoutFilter.h" #include "engine/TransitivePath.h" @@ -176,6 +178,10 @@ void QueryExecutionTree::setOperation(std::shared_ptr operation) { type_ = TEXT_WITH_FILTER; } else if constexpr (std::is_same_v) { type_ = TEXT_WITHOUT_FILTER; + } else if constexpr (std::is_same_v) { + type_ = TEXT_INDEX_SCAN_FOR_WORD; + } else if constexpr (std::is_same_v) { + type_ = TEXT_INDEX_SCAN_FOR_ENTITY; } else if constexpr (std::is_same_v) { type_ = 
COUNT_AVAILABLE_PREDICATES; } else if constexpr (std::is_same_v) { @@ -217,6 +223,10 @@ template void QueryExecutionTree::setOperation( std::shared_ptr); template void QueryExecutionTree::setOperation( std::shared_ptr); +template void QueryExecutionTree::setOperation( + std::shared_ptr); +template void QueryExecutionTree::setOperation( + std::shared_ptr); template void QueryExecutionTree::setOperation( std::shared_ptr); template void QueryExecutionTree::setOperation(std::shared_ptr); diff --git a/src/engine/QueryExecutionTree.h b/src/engine/QueryExecutionTree.h index 8a533ce91c..f612cbf32b 100644 --- a/src/engine/QueryExecutionTree.h +++ b/src/engine/QueryExecutionTree.h @@ -45,6 +45,8 @@ class QueryExecutionTree { DISTINCT, TEXT_WITHOUT_FILTER, TEXT_WITH_FILTER, + TEXT_INDEX_SCAN_FOR_WORD, + TEXT_INDEX_SCAN_FOR_ENTITY, OPTIONAL_JOIN, COUNT_AVAILABLE_PREDICATES, GROUP_BY, diff --git a/src/engine/QueryPlanner.cpp b/src/engine/QueryPlanner.cpp index 4fb1b66c5c..20d2267329 100644 --- a/src/engine/QueryPlanner.cpp +++ b/src/engine/QueryPlanner.cpp @@ -4,35 +4,38 @@ // 2015-2017 Björn Buchhold (buchhold@informatik.uni-freiburg.de) // 2018- Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de) -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "engine/QueryPlanner.h" #include #include +#include "engine/Bind.h" +#include "engine/CartesianProductJoin.h" +#include "engine/CheckUsePatternTrick.h" +#include "engine/CountAvailablePredicates.h" +#include "engine/Distinct.h" +#include "engine/Filter.h" +#include "engine/GroupBy.h" +#include "engine/HasPredicateScan.h" +#include "engine/IndexScan.h" +#include "engine/Join.h" +#include "engine/Minus.h" +#include "engine/MultiColumnJoin.h" +#include "engine/NeutralElementOperation.h" +#include "engine/OptionalJoin.h" 
+#include "engine/OrderBy.h" +#include "engine/Service.h" +#include "engine/Sort.h" +#include "engine/TextIndexScanForEntity.h" +#include "engine/TextIndexScanForWord.h" +#include "engine/TextOperationWithFilter.h" +#include "engine/TextOperationWithoutFilter.h" +#include "engine/TransitivePath.h" +#include "engine/Union.h" +#include "engine/Values.h" +#include "parser/Alias.h" +#include "parser/SparqlParserHelpers.h" + namespace p = parsedQuery; namespace { @@ -188,9 +191,6 @@ std::vector QueryPlanner::optimize( auto optimizeCommutativ = [this](const auto& triples, const auto& plans, const auto& filters) { auto tg = createTripleGraph(&triples); - LOG(TRACE) << "Collapse text cliques..." << std::endl; - tg.collapseTextCliques(); - LOG(TRACE) << "Collapse text cliques done." << std::endl; // always apply all filters to be safe. // TODO it could be possible, to allow the DpTab to leave // results unfiltered and add the filters later, but this has to be @@ -464,9 +464,6 @@ std::vector QueryPlanner::optimize( // joinCandidates lambda; if (candidatePlans.size() > 1 || !candidateTriples._triples.empty()) { auto tg = createTripleGraph(&candidateTriples); - LOG(TRACE) << "Collapse text cliques..." << std::endl; - tg.collapseTextCliques(); - LOG(TRACE) << "Collapse text cliques done." << std::endl; auto lastRow = fillDpTab(tg, rootPattern->_filters, candidatePlans).back(); candidateTriples._triples.clear(); candidatePlans.clear(); @@ -646,36 +643,184 @@ vector QueryPlanner::getOrderByRow( return added; } +void QueryPlanner::addNodeToTripleGraph(const TripleGraph::Node& node, + QueryPlanner::TripleGraph& tg) const { + // TODO This needs quite some refactoring: The IDs of the nodes have + // to be ascending as an invariant, so we can store all the nodes in a + // vector or even a plain vector. 
+ tg._nodeStorage.emplace_back(node); + auto& addedNode = tg._nodeStorage.back(); + tg._nodeMap[addedNode.id_] = &addedNode; + tg._adjLists.emplace_back(); + AD_CORRECTNESS_CHECK(tg._adjLists.size() == tg._nodeStorage.size()); + AD_CORRECTNESS_CHECK(tg._adjLists.size() == addedNode.id_ + 1); + // Now add an edge between the added node and every node sharing a var. + for (auto& addedNodevar : addedNode._variables) { + for (size_t i = 0; i < addedNode.id_; ++i) { + auto& otherNode = *tg._nodeMap[i]; + if (otherNode._variables.contains(addedNodevar)) { + // There is an edge between `otherNode` and `addedNode`. + tg._adjLists[addedNode.id_].push_back(otherNode.id_); + tg._adjLists[otherNode.id_].push_back(addedNode.id_); + } + } + } +} + // _____________________________________________________________________________ QueryPlanner::TripleGraph QueryPlanner::createTripleGraph( const p::BasicGraphPattern* pattern) const { TripleGraph tg; - if (pattern->_triples.size() > 64) { - AD_THROW("At most 64 triples allowed at the moment."); - } + size_t numNodesInTripleGraph = 0; + ad_utility::HashMap optTermForCvar; + ad_utility::HashMap> potentialTermsForCvar; + vector entityTriples; + // Add one or more nodes for each triple. for (auto& t : pattern->_triples) { - // Add a node for the triple. - tg._nodeStorage.emplace_back(TripleGraph::Node(tg._nodeStorage.size(), t)); - auto& addedNode = tg._nodeStorage.back(); - tg._nodeMap[addedNode._id] = &tg._nodeStorage.back(); - tg._adjLists.emplace_back(vector()); - assert(tg._adjLists.size() == tg._nodeStorage.size()); - assert(tg._adjLists.size() == addedNode._id + 1); - // Now add an edge between the added node and every node sharing a var. - for (auto& addedNodevar : addedNode._variables) { - for (size_t i = 0; i < addedNode._id; ++i) { - auto& otherNode = *tg._nodeMap[i]; - if (otherNode._variables.count(addedNodevar) > 0) { - // There is an edge between *it->second and the node with id "id". 
- tg._adjLists[addedNode._id].push_back(otherNode._id); - tg._adjLists[otherNode._id].push_back(addedNode._id); - } + if (t._p._iri == CONTAINS_WORD_PREDICATE) { + std::string buffer = t._o.toString(); + std::string_view sv{buffer}; + // Add one node for each word + for (const auto& term : + absl::StrSplit(sv.substr(1, sv.size() - 2), ' ')) { + std::string s{ad_utility::utf8ToLower(term)}; + potentialTermsForCvar[t._s.getVariable()].push_back(s); + addNodeToTripleGraph( + TripleGraph::Node(tg._nodeStorage.size(), t._s.getVariable(), s, t), + tg); + numNodesInTripleGraph++; } + } else if (t._p._iri == CONTAINS_ENTITY_PREDICATE) { + entityTriples.push_back(&t); + } else { + addNodeToTripleGraph(TripleGraph::Node(tg._nodeStorage.size(), t), tg); + numNodesInTripleGraph++; } } + for (const auto& [cvar, terms] : potentialTermsForCvar) { + optTermForCvar[cvar] = + terms[_qec->getIndex().getIndexOfBestSuitedElTerm(terms)]; + } + for (const SparqlTriple* t : entityTriples) { + Variable currentVar = t->_s.getVariable(); + if (!optTermForCvar.contains(currentVar)) { + AD_THROW( + "Missing ql:contains-word statement. A ql:contains-entity " + "statement always also needs corresponding ql:contains-word " + "statement."); + } + addNodeToTripleGraph(TripleGraph::Node(tg._nodeStorage.size(), currentVar, + optTermForCvar[currentVar], *t), + tg); + numNodesInTripleGraph++; + } + if (numNodesInTripleGraph > 64) { + AD_THROW("At most 64 triples allowed at the moment."); + } return tg; } +// _____________________________________________________________________________ +template +void QueryPlanner::indexScanSingleVarCase( + const TripleGraph::Node& node, const PushPlanFunction& pushPlan, + const AddedIndexScanFunction& addIndexScan) { + using enum Permutation::Enum; + + // TODO: The case where the same variable appears in subject + predicate or + // object + predicate is missing here and leads to an assertion failure. 
+ if (isVariable(node.triple_._s) && isVariable(node.triple_._o) && + node.triple_._s == node.triple_._o) { + if (isVariable(node.triple_._p._iri)) { + AD_THROW("Triple with one variable repeated three times"); + } + LOG(DEBUG) << "Subject variable same as object variable" << std::endl; + // Need to handle this as IndexScan with a new unique + // variable + Filter. Works in both directions + Variable filterVar = generateUniqueVarName(); + auto scanTriple = node.triple_; + scanTriple._o = filterVar; + auto scanTree = makeExecutionTree(_qec, PSO, scanTriple); + // The simplest way to set up the filtering expression is to use the + // parser. + std::string filterString = + absl::StrCat("FILTER (", scanTriple._s.getVariable().name(), "=", + filterVar.name(), ")"); + auto filter = sparqlParserHelpers::ParserAndVisitor{filterString} + .parseTypesafe(&SparqlAutomaticParser::filterR) + .resultOfParse_; + auto plan = + makeSubtreePlan(_qec, scanTree, std::move(filter.expression_)); + pushPlan(std::move(plan)); + } else if (isVariable(node.triple_._s)) { + addIndexScan(POS); + } else if (isVariable(node.triple_._o)) { + addIndexScan(PSO); + } else { + AD_CONTRACT_CHECK(isVariable(node.triple_._p)); + addIndexScan(SOP); + } +} + +// _____________________________________________________________________________ +template +void QueryPlanner::indexScanTwoVarsCase( + const TripleGraph::Node& node, + const AddedIndexScanFunction& addIndexScan) const { + using enum Permutation::Enum; + + // TODO: The case that the same variable appears in more than one position + // leads (as in indexScanSingleVarCase) to an assertion. 
+ if (!isVariable(node.triple_._p._iri)) { + addIndexScan(PSO); + addIndexScan(POS); + } else if (!isVariable(node.triple_._s)) { + addIndexScan(SPO); + addIndexScan(SOP); + } else if (!isVariable(node.triple_._o)) { + addIndexScan(OSP); + addIndexScan(OPS); + } +} + +// _____________________________________________________________________________ +template +void QueryPlanner::indexScanThreeVarsCase( + const TripleGraph::Node& node, + const AddedIndexScanFunction& addIndexScan) const { + using enum Permutation::Enum; + + if (!_qec || _qec->getIndex().hasAllPermutations()) { + // Add plans for all six permutations. + addIndexScan(OPS); + addIndexScan(OSP); + addIndexScan(PSO); + addIndexScan(POS); + addIndexScan(SPO); + addIndexScan(SOP); + } else { + AD_THROW( + "With only 2 permutations registered (no -a option), " + "triples should have at most two variables. " + "Not the case in: " + + node.triple_.asString()); + } +} + +// _____________________________________________________________________________ +template +void QueryPlanner::seedFromOrdinaryTriple( + const TripleGraph::Node& node, const PushPlanFunction& pushPlan, + const AddedIndexScanFunction& addIndexScan) { + if (node._variables.size() == 1) { + indexScanSingleVarCase(node, pushPlan, addIndexScan); + } else if (node._variables.size() == 2) { + indexScanTwoVarsCase(node, addIndexScan); + } else { + indexScanThreeVarsCase(node, addIndexScan); + } +} + // _____________________________________________________________________________ vector QueryPlanner::seedWithScansAndText( const QueryPlanner::TripleGraph& tg, @@ -696,30 +841,30 @@ vector QueryPlanner::seedWithScansAndText( for (size_t i = 0; i < tg._nodeMap.size(); ++i) { const TripleGraph::Node& node = *tg._nodeMap.find(i)->second; - auto pushPlan = [&](SubtreePlan plan) { + auto pushPlan = [&seeds, i](SubtreePlan plan) { plan._idsOfIncludedNodes = (uint64_t(1) << i); seeds.push_back(std::move(plan)); }; - auto addIndexScan = [&](Permutation::Enum 
permutation) { - pushPlan(makeSubtreePlan(_qec, permutation, node._triple)); + auto addIndexScan = [this, pushPlan, node](Permutation::Enum permutation) { + pushPlan(makeSubtreePlan(_qec, permutation, node.triple_)); }; using enum Permutation::Enum; - if (node._cvar.has_value()) { + if (node.isTextNode()) { seeds.push_back(getTextLeafPlan(node)); continue; } if (node._variables.empty()) { AD_THROW("Triples should have at least one variable. Not the case in: " + - node._triple.asString()); + node.triple_.asString()); } // If the predicate is a property path, we have to recursively set up the // index scans. - if (node._triple._p._operation != PropertyPath::Operation::IRI) { - for (SubtreePlan& plan : seedFromPropertyPathTriple(node._triple)) { + if (node.triple_._p._operation != PropertyPath::Operation::IRI) { + for (SubtreePlan& plan : seedFromPropertyPathTriple(node.triple_)) { pushPlan(std::move(plan)); } continue; @@ -728,7 +873,7 @@ vector QueryPlanner::seedWithScansAndText( // At this point, we know that the predicate is a simple IRI or a variable. if (_qec && !_qec->getIndex().hasAllPermutations() && - isVariable(node._triple._p._iri)) { + isVariable(node.triple_._p._iri)) { AD_THROW( "The query contains a predicate variable, but only the PSO " "and POS permutations were loaded. Rerun the server without " @@ -736,74 +881,12 @@ vector QueryPlanner::seedWithScansAndText( "necessary also rebuild the index."); } - if (node._triple._p._iri == HAS_PREDICATE_PREDICATE) { - pushPlan(makeSubtreePlan(_qec, node._triple)); + if (node.triple_._p._iri == HAS_PREDICATE_PREDICATE) { + pushPlan(makeSubtreePlan(_qec, node.triple_)); continue; } - if (node._variables.size() == 1) { - // There is exactly one variable in the triple (may occur twice). 
- if (isVariable(node._triple._s) && isVariable(node._triple._o) && - node._triple._s == node._triple._o) { - if (isVariable(node._triple._p._iri)) { - AD_THROW("Triple with one variable repeated three times"); - } - LOG(DEBUG) << "Subject variable same as object variable" << std::endl; - // Need to handle this as IndexScan with a new unique - // variable + Filter. Works in both directions - Variable filterVar = generateUniqueVarName(); - auto scanTriple = node._triple; - scanTriple._o = filterVar; - auto scanTree = makeExecutionTree(_qec, PSO, scanTriple); - // The simplest way to set up the filtering expression is to use the - // parser. - std::string filterString = - absl::StrCat("FILTER (", scanTriple._s.getVariable().name(), "=", - filterVar.name(), ")"); - auto filter = sparqlParserHelpers::ParserAndVisitor{filterString} - .parseTypesafe(&SparqlAutomaticParser::filterR) - .resultOfParse_; - auto plan = makeSubtreePlan(_qec, scanTree, - std::move(filter.expression_)); - pushPlan(std::move(plan)); - } else if (isVariable(node._triple._s)) { - addIndexScan(POS); - } else if (isVariable(node._triple._o)) { - addIndexScan(PSO); - } else { - AD_CONTRACT_CHECK(isVariable(node._triple._p)); - addIndexScan(SOP); - } - } else if (node._variables.size() == 2) { - // Add plans for both possible scan directions. - if (!isVariable(node._triple._p._iri)) { - addIndexScan(PSO); - addIndexScan(POS); - } else if (!isVariable(node._triple._s)) { - addIndexScan(SPO); - addIndexScan(SOP); - } else if (!isVariable(node._triple._o)) { - addIndexScan(OSP); - addIndexScan(OPS); - } - } else { - // The current triple contains three distinct variables. - if (!_qec || _qec->getIndex().hasAllPermutations()) { - // Add plans for all six permutations. 
- addIndexScan(OPS); - addIndexScan(OSP); - addIndexScan(PSO); - addIndexScan(POS); - addIndexScan(SPO); - addIndexScan(SOP); - } else { - AD_THROW( - "With only 2 permutations registered (no -a option), " - "triples should have at most two variables. " - "Not the case in: " + - node._triple.asString()); - } - } + seedFromOrdinaryTriple(node, pushPlan, addIndexScan); } return seeds; } @@ -973,11 +1056,29 @@ Variable QueryPlanner::generateUniqueVarName() { // _____________________________________________________________________________ QueryPlanner::SubtreePlan QueryPlanner::getTextLeafPlan( const QueryPlanner::TripleGraph::Node& node) const { + AD_CONTRACT_CHECK(node.wordPart_.has_value()); + string word = node.wordPart_.value(); SubtreePlan plan(_qec); - plan._idsOfIncludedNodes |= (size_t(1) << node._id); - AD_CONTRACT_CHECK(node._wordPart.has_value()); - plan._qet = makeExecutionTree( - _qec, node._wordPart.value(), node._variables, node._cvar.value()); + if (node.triple_._p._iri == CONTAINS_ENTITY_PREDICATE) { + if (node._variables.size() == 2) { + // TODO: This is not nice, refactor the whole TripleGraph class + // to make these checks more explicit. + Variable evar = *(node._variables.begin()) == node.cvar_.value() + ? 
*(++node._variables.begin()) + : *(node._variables.begin()); + plan = makeSubtreePlan(_qec, node.cvar_.value(), + evar, word); + } else { + // Fixed entity case + AD_CORRECTNESS_CHECK(node._variables.size() == 1); + plan = makeSubtreePlan( + _qec, node.cvar_.value(), node.triple_._o.toString(), word); + } + } else { + plan = + makeSubtreePlan(_qec, node.cvar_.value(), word); + } + plan._idsOfIncludedNodes |= (size_t(1) << node.id_); return plan; } @@ -1043,13 +1144,12 @@ vector QueryPlanner::merge( string QueryPlanner::TripleGraph::asString() const { std::ostringstream os; for (size_t i = 0; i < _adjLists.size(); ++i) { - if (!_nodeMap.find(i)->second->_cvar.has_value()) { - os << i << " " << _nodeMap.find(i)->second->_triple.asString() << " : ("; + if (!_nodeMap.find(i)->second->cvar_.has_value()) { + os << i << " " << _nodeMap.find(i)->second->triple_.asString() << " : ("; } else { os << i << " {TextOP for " - << _nodeMap.find(i)->second->_cvar.value().name() << ", wordPart: \"" - << absl::StrJoin(_nodeMap.find(i)->second->_wordPart.value(), " ") - << "\"} : ("; + << _nodeMap.find(i)->second->cvar_.value().name() << ", wordPart: \"" + << _nodeMap.find(i)->second->wordPart_.value() << "\"} : ("; } for (size_t j = 0; j < _adjLists[i].size(); ++j) { @@ -1286,24 +1386,9 @@ vector> QueryPlanner::fillDpTab( // _____________________________________________________________________________ bool QueryPlanner::TripleGraph::isTextNode(size_t i) const { return _nodeMap.count(i) > 0 && - (_nodeMap.find(i)->second->_triple._p._iri == + (_nodeMap.find(i)->second->triple_._p._iri == CONTAINS_ENTITY_PREDICATE || - _nodeMap.find(i)->second->_triple._p._iri == CONTAINS_WORD_PREDICATE); -} - -// _____________________________________________________________________________ -ad_utility::HashMap> -QueryPlanner::TripleGraph::identifyTextCliques() const { - ad_utility::HashMap> contextVarToTextNodesIds; - // Fill contextVar -> triples map - for (size_t i = 0; i < _adjLists.size(); ++i) { 
- if (isTextNode(i)) { - auto& triple = _nodeMap.find(i)->second->_triple; - auto& cvar = triple._s; - contextVarToTextNodesIds[cvar.getVariable()].push_back(i); - } - } - return contextVarToTextNodesIds; + _nodeMap.find(i)->second->triple_._p._iri == CONTAINS_WORD_PREDICATE); } // _____________________________________________________________________________ @@ -1432,7 +1517,7 @@ QueryPlanner::TripleGraph::TripleGraph( const std::vector>>& init) { for (const std::pair>& p : init) { _nodeStorage.push_back(p.first); - _nodeMap[p.first._id] = &_nodeStorage.back(); + _nodeMap[p.first.id_] = &_nodeStorage.back(); _adjLists.push_back(p.second); } } @@ -1451,7 +1536,7 @@ QueryPlanner::TripleGraph::TripleGraph(const QueryPlanner::TripleGraph& other, if (keep.count(i) > 0) { _nodeStorage.push_back(*other._nodeMap.find(i)->second); idChange[i] = _nodeMap.size(); - _nodeStorage.back()._id = _nodeMap.size(); + _nodeStorage.back().id_ = _nodeMap.size(); _nodeMap[idChange[i]] = &_nodeStorage.back(); } } @@ -1493,123 +1578,6 @@ QueryPlanner::TripleGraph& QueryPlanner::TripleGraph::operator=( QueryPlanner::TripleGraph::TripleGraph() : _adjLists(), _nodeMap(), _nodeStorage() {} -// ___________________________________________________________________________ -namespace { - -// Remove the quotation marks around an enquoted literal and convert it to lower -// case. This is only used in the `collapseTextCliques` function. -string stripAndLowercaseLiteral(std::string_view lit) { - AD_CORRECTNESS_CHECK(lit.size() >= 2 && lit.starts_with('"') && - lit.ends_with('"')); - lit.remove_prefix(1); - lit.remove_suffix(1); - return ad_utility::utf8ToLower(lit); -} -} // namespace - -// _____________________________________________________________________________ -void QueryPlanner::TripleGraph::collapseTextCliques() { - // TODO: Could use more refactoring. - - // Create a map from context var to triples it occurs in (the cliques). 
- ad_utility::HashMap> cvarsToTextNodes( - identifyTextCliques()); - if (cvarsToTextNodes.empty()) { - return; - } - // Now turn each such clique into a new node the represents that whole - // text operation clique. - size_t id = 0; - vector textNodes; - ad_utility::HashMap removedNodeIds; - vector> tnAdjSetsToOldIds; - for (auto& cvarsToTextNode : cvarsToTextNodes) { - auto& cvar = cvarsToTextNode.first; - std::vector words; - vector trips; - tnAdjSetsToOldIds.emplace_back(); - auto& adjNodes = tnAdjSetsToOldIds.back(); - for (auto nid : cvarsToTextNode.second) { - removedNodeIds[nid] = id; - adjNodes.insert(_adjLists[nid].begin(), _adjLists[nid].end()); - auto& triple = _nodeMap[nid]->_triple; - trips.push_back(triple); - // TODO I think the check "is the predicate ql:contains_word" is - // missing. Verify this. - if (triple._s == cvar && triple._o.isLiteral()) { - std::vector newWords = absl::StrSplit( - stripAndLowercaseLiteral( - triple._o.getLiteral().normalizedLiteralContent().get()), - ' '); - words.insert(words.end(), newWords.begin(), newWords.end()); - } - } - textNodes.emplace_back(id, cvar, std::move(words), trips); - ++id; - assert(tnAdjSetsToOldIds.size() == id); - } - - // Finally update the graph (node ids and adj lists). - vector> oldAdjLists = _adjLists; - std::list oldNodeStorage = _nodeStorage; - _nodeStorage.clear(); - _nodeMap.clear(); - _adjLists.clear(); - ad_utility::HashMap idMapOldToNew; - ad_utility::HashMap idMapNewToOld; - - // Storage and ids. - for (auto& tn : textNodes) { - _nodeStorage.push_back(tn); - _nodeMap[tn._id] = &_nodeStorage.back(); - } - - for (auto& n : oldNodeStorage) { - if (removedNodeIds.count(n._id) == 0) { - idMapOldToNew[n._id] = id; - idMapNewToOld[id] = n._id; - n._id = id++; - _nodeStorage.push_back(n); - _nodeMap[n._id] = &_nodeStorage.back(); - } - } - - // Adj lists - // First for newly created text nodes. 
- for (size_t i = 0; i < tnAdjSetsToOldIds.size(); ++i) { - const auto& nodes = tnAdjSetsToOldIds[i]; - std::set adjNodes; - for (auto nid : nodes) { - if (removedNodeIds.count(nid) == 0) { - adjNodes.insert(idMapOldToNew[nid]); - } else if (removedNodeIds[nid] != i) { - adjNodes.insert(removedNodeIds[nid]); - } - } - vector adjList; - adjList.insert(adjList.begin(), adjNodes.begin(), adjNodes.end()); - _adjLists.emplace_back(adjList); - } - assert(_adjLists.size() == textNodes.size()); - assert(_adjLists.size() == tnAdjSetsToOldIds.size()); - // Then for remaining (regular) nodes. - for (size_t i = textNodes.size(); i < _nodeMap.size(); ++i) { - const Node& node = *_nodeMap[i]; - const auto& oldAdjList = oldAdjLists[idMapNewToOld[node._id]]; - std::set adjNodes; - for (auto nid : oldAdjList) { - if (removedNodeIds.count(nid) == 0) { - adjNodes.insert(idMapOldToNew[nid]); - } else { - adjNodes.insert(removedNodeIds[nid]); - } - } - vector adjList; - adjList.insert(adjList.begin(), adjNodes.begin(), adjNodes.end()); - _adjLists.emplace_back(adjList); - } -} - // _____________________________________________________________________________ bool QueryPlanner::TripleGraph::isSimilar( const QueryPlanner::TripleGraph& other) const { @@ -1629,8 +1597,8 @@ bool QueryPlanner::TripleGraph::isSimilar( bool hasMatch = false; for (const Node& n2 : other._nodeStorage) { if (n.isSimilar(n2)) { - id_map[n._id] = n2._id; - id_map_reverse[n2._id] = n._id; + id_map[n.id_] = n2.id_; + id_map_reverse[n2.id_] = n.id_; hasMatch = true; break; } else { diff --git a/src/engine/QueryPlanner.h b/src/engine/QueryPlanner.h index e09be794ad..85adb09fb9 100644 --- a/src/engine/QueryPlanner.h +++ b/src/engine/QueryPlanner.h @@ -36,40 +36,22 @@ class QueryPlanner { TripleGraph(const TripleGraph& other, vector keepNodes); struct Node { - Node(size_t id, SparqlTriple t) : _id(id), _triple(std::move(t)) { - if (isVariable(_triple._s)) { - _variables.insert(_triple._s.getVariable()); + Node(size_t id, 
SparqlTriple t) : id_(id), triple_(std::move(t)) { + if (isVariable(triple_._s)) { + _variables.insert(triple_._s.getVariable()); } - if (isVariable(_triple._p)) { - _variables.insert(Variable{_triple._p._iri}); + if (isVariable(triple_._p)) { + _variables.insert(Variable{triple_._p._iri}); } - if (isVariable(_triple._o)) { - _variables.insert(_triple._o.getVariable()); + if (isVariable(triple_._o)) { + _variables.insert(triple_._o.getVariable()); } } - Node(size_t id, const Variable& cvar, std::vector words, - const vector& trips) - : _id(id), - // TODO What is this triple used for? If it is just a - // dummy, then we can replace it by a `variant`. - _triple(cvar, PropertyPath::fromIri(INTERNAL_TEXT_MATCH_PREDICATE), - TripleComponent::UNDEF{}), - _cvar(cvar), - _wordPart(std::move(words)) { - _variables.insert(cvar); - for (const auto& t : trips) { - if (isVariable(t._s)) { - _variables.insert(t._s.getVariable()); - } - if (isVariable(t._p)) { - _variables.insert(Variable{t._p._iri}); - } - if (isVariable(t._o)) { - _variables.insert(t._o.getVariable()); - } - } + Node(size_t id, Variable cvar, std::string word, SparqlTriple t) + : Node(id, std::move(t)) { + cvar_ = std::move(cvar); + wordPart_ = std::move(word); } Node(const Node& other) = default; @@ -79,30 +61,32 @@ class QueryPlanner { // Returns true if the two nodes equal apart from the id // and the order of variables bool isSimilar(const Node& other) const { - return _triple == other._triple && _cvar == other._cvar && - _wordPart == other._wordPart && _variables == other._variables; + return triple_ == other.triple_ && cvar_ == other.cvar_ && + wordPart_ == other.wordPart_ && _variables == other._variables; } + bool isTextNode() const { return cvar_.has_value(); } + friend std::ostream& operator<<(std::ostream& out, const Node& n) { - out << "id: " << n._id << " triple: " << n._triple.asString() + out << "id: " << n.id_ << " triple: " << n.triple_.asString() << " vars_ "; for (const auto& s : 
n._variables) { out << s.name() << ", "; } // TODO Should the `cvar` and the `wordPart` be stored // together? - if (n._cvar.has_value()) { - out << " cvar " << n._cvar.value().name() << " wordPart " - << absl::StrJoin(n._wordPart.value(), " "); + if (n.cvar_.has_value()) { + out << " cvar " << n.cvar_.value().name() << " wordPart " + << n.wordPart_.value(); } return out; } - size_t _id; - SparqlTriple _triple; + size_t id_; + SparqlTriple triple_; ad_utility::HashSet _variables; - std::optional _cvar = std::nullopt; - std::optional> _wordPart = std::nullopt; + std::optional cvar_ = std::nullopt; + std::optional wordPart_ = std::nullopt; }; // Allows for manually building triple graphs for testing @@ -119,13 +103,9 @@ class QueryPlanner { ad_utility::HashMap _nodeMap; std::list _nodeStorage; - ad_utility::HashMap> identifyTextCliques() const; - vector bfsLeaveOut(size_t startNode, ad_utility::HashSet leaveOut) const; - void collapseTextCliques(); - private: vector>> splitAtContextVars( const vector& origFilters, @@ -219,6 +199,8 @@ class QueryPlanner { [[nodiscard]] TripleGraph createTripleGraph( const parsedQuery::BasicGraphPattern* pattern) const; + void addNodeToTripleGraph(const TripleGraph::Node&, TripleGraph&) const; + void setEnablePatternTrick(bool enablePatternTrick); // Create a set of possible execution trees for the given parsed query. The @@ -242,6 +224,30 @@ class QueryPlanner { [[nodiscard]] std::vector optimize( ParsedQuery::GraphPattern* rootPattern); + // Add all the possible index scans for the triple represented by the node. + // The triple is "ordinary" in the sense that it is neither a text triple with + // ql:contains-word nor a special pattern trick triple. 
+ template + void seedFromOrdinaryTriple(const TripleGraph::Node& node, + const PushPlanFunction& pushPlan, + const AddedIndexScanFunction& addIndexScan); + + // Helper function used by the seedFromOrdinaryTriple function + template + void indexScanSingleVarCase(const TripleGraph::Node& node, + const PushPlanFunction& pushPlan, + const AddedIndexScanFunction& addIndexScan); + + // Helper function used by the seedFromOrdinaryTriple function + template + void indexScanTwoVarsCase(const TripleGraph::Node& node, + const AddedIndexScanFunction& addIndexScan) const; + + // Helper function used by the seedFromOrdinaryTriple function + template + void indexScanThreeVarsCase(const TripleGraph::Node& node, + const AddedIndexScanFunction& addIndexScan) const; + /** * @brief Fills children with all operations that are associated with a single * node in the triple graph (e.g. IndexScans). diff --git a/src/engine/TextIndexScanForEntity.cpp b/src/engine/TextIndexScanForEntity.cpp new file mode 100644 index 0000000000..352ecde5e1 --- /dev/null +++ b/src/engine/TextIndexScanForEntity.cpp @@ -0,0 +1,110 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Nick Göckel + +#include "engine/TextIndexScanForEntity.h" + +// _____________________________________________________________________________ +TextIndexScanForEntity::TextIndexScanForEntity( + QueryExecutionContext* qec, Variable textRecordVar, + std::variant entity, string word) + : Operation(qec), + textRecordVar_(std::move(textRecordVar)), + varOrFixed_(qec, std::move(entity)), + word_(std::move(word)) {} + +// _____________________________________________________________________________ +ResultTable TextIndexScanForEntity::computeResult() { + IdTable idTable = getExecutionContext()->getIndex().getEntityMentionsForWord( + word_, getExecutionContext()->getAllocator()); + + if (hasFixedEntity()) { + auto beginErase = std::ranges::remove_if(idTable, [this](const auto& row) { + return row[1].getVocabIndex() != getVocabIndexOfFixedEntity(); + }); + idTable.erase(beginErase.begin(), idTable.end()); + idTable.setColumnSubset(std::vector{0, 2}); + } + + // Add details to the runtimeInfo. This has no effect on the result. 
+ if (hasFixedEntity()) { + runtimeInfo().addDetail("fixed entity: ", fixedEntity()); + } else { + runtimeInfo().addDetail("entity var: ", entityVariable().name()); + } + runtimeInfo().addDetail("word: ", word_); + + return {std::move(idTable), resultSortedOn(), LocalVocab{}}; +} + +// _____________________________________________________________________________ +VariableToColumnMap TextIndexScanForEntity::computeVariableToColumnMap() const { + VariableToColumnMap vcmap; + auto addDefinedVar = [&vcmap, + index = ColumnIndex{0}](const Variable& var) mutable { + vcmap[var] = makeAlwaysDefinedColumn(index); + ++index; + }; + addDefinedVar(textRecordVar_); + if (hasFixedEntity()) { + addDefinedVar(textRecordVar_.getScoreVariable(fixedEntity())); + } else { + addDefinedVar(entityVariable()); + addDefinedVar(textRecordVar_.getScoreVariable(entityVariable())); + } + return vcmap; +} + +// _____________________________________________________________________________ +size_t TextIndexScanForEntity::getResultWidth() const { + return 2 + (hasFixedEntity() ? 
0 : 1); +} + +// _____________________________________________________________________________ +size_t TextIndexScanForEntity::getCostEstimate() { + if (hasFixedEntity()) { + // We currently have to first materialize and then filter the complete list + // for the fixed entity + return 2 * getExecutionContext()->getIndex().getSizeOfTextBlockForEntities( + word_); + } else { + return getExecutionContext()->getIndex().getSizeOfTextBlockForEntities( + word_); + } +} + +// _____________________________________________________________________________ +uint64_t TextIndexScanForEntity::getSizeEstimateBeforeLimit() { + if (hasFixedEntity()) { + return static_cast( + getExecutionContext()->getIndex().getAverageNofEntityContexts()); + } else { + return getExecutionContext()->getIndex().getSizeOfTextBlockForEntities( + word_); + } +} + +// _____________________________________________________________________________ +bool TextIndexScanForEntity::knownEmptyResult() { + return getExecutionContext()->getIndex().getSizeOfTextBlockForEntities( + word_) == 0; +} + +// _____________________________________________________________________________ +vector TextIndexScanForEntity::resultSortedOn() const { + return {ColumnIndex(0)}; +} + +// _____________________________________________________________________________ +string TextIndexScanForEntity::getDescriptor() const { + return absl::StrCat("TextIndexScanForEntity on ", textRecordVar_.name()); +} + +// _____________________________________________________________________________ +string TextIndexScanForEntity::getCacheKeyImpl() const { + std::ostringstream os; + os << "ENTITY INDEX SCAN FOR WORD: " + << " with word: \"" << word_ << "\" and fixed-entity: \"" + << (hasFixedEntity() ? 
fixedEntity() : "no fixed-entity") << " \""; + return std::move(os).str(); +} diff --git a/src/engine/TextIndexScanForEntity.h b/src/engine/TextIndexScanForEntity.h new file mode 100644 index 0000000000..155a962f12 --- /dev/null +++ b/src/engine/TextIndexScanForEntity.h @@ -0,0 +1,111 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Nick Göckel + +#pragma once + +#include + +#include "./Operation.h" + +// This operation retrieves all text records and their corresponding +// entities from the fulltext index that contain a certain word or prefix. +// The entities are saved to the entityVar_. If the operation is called on a +// fixed entity instead, it only returns entries that contain this entity. +class TextIndexScanForEntity : public Operation { + using FixedEntity = std::pair; + + struct VarOrFixedEntity { + std::variant entity_; + + static std::variant makeEntityVariant( + const QueryExecutionContext* qec, + std::variant entity) { + if (std::holds_alternative(entity)) { + VocabIndex index; + std::string fixedEntity = std::move(std::get(entity)); + bool success = qec->getIndex().getVocab().getId(fixedEntity, &index); + if (!success) { + throw std::runtime_error( + "The entity " + fixedEntity + + " is not part of the underlying knowledge graph and can " + "therefore not be used as the object of ql:contains-entity"); + } + return FixedEntity(std::move(fixedEntity), std::move(index)); + } else { + return std::get(entity); + } + }; + + VarOrFixedEntity(const QueryExecutionContext* qec, + std::variant entity) + : entity_(makeEntityVariant(qec, std::move(entity))) {} + + ~VarOrFixedEntity() = default; + + bool hasFixedEntity() const { + return std::holds_alternative(entity_); + } + }; + + const Variable textRecordVar_; + const VarOrFixedEntity varOrFixed_; + const string word_; + + public: + TextIndexScanForEntity(QueryExecutionContext* qec, Variable textRecordVar, + std::variant entity, + string word); + 
~TextIndexScanForEntity() override = default; + + bool hasFixedEntity() const { return varOrFixed_.hasFixedEntity(); } + + const std::string& fixedEntity() const { + AD_CONTRACT_CHECK(hasFixedEntity()); + return std::get(varOrFixed_.entity_).first; + } + + const Variable& entityVariable() const { + AD_CONTRACT_CHECK(!hasFixedEntity()); + return std::get(varOrFixed_.entity_); + } + + const Variable& textRecordVar() const { return textRecordVar_; } + + const std::string& word() const { return word_; } + + string getCacheKeyImpl() const override; + + string getDescriptor() const override; + + size_t getResultWidth() const override; + + void setTextLimit(size_t) override { + // TODO: implement textLimit + } + + size_t getCostEstimate() override; + + uint64_t getSizeEstimateBeforeLimit() override; + + float getMultiplicity(size_t col) override { + (void)col; + return 1; + } + + bool knownEmptyResult() override; + + vector resultSortedOn() const override; + + VariableToColumnMap computeVariableToColumnMap() const override; + + private: + const VocabIndex& getVocabIndexOfFixedEntity() const { + AD_CONTRACT_CHECK(hasFixedEntity()); + return std::get(varOrFixed_.entity_).second; + } + + ResultTable computeResult() override; + + vector getChildren() override { return {}; } +}; diff --git a/src/engine/TextIndexScanForWord.cpp b/src/engine/TextIndexScanForWord.cpp new file mode 100644 index 0000000000..c490a88c6f --- /dev/null +++ b/src/engine/TextIndexScanForWord.cpp @@ -0,0 +1,82 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Nick Göckel + +#include "engine/TextIndexScanForWord.h" + +// _____________________________________________________________________________ +TextIndexScanForWord::TextIndexScanForWord(QueryExecutionContext* qec, + Variable textRecordVar, string word) + : Operation(qec), + textRecordVar_(std::move(textRecordVar)), + word_(std::move(word)), + isPrefix_(word_.ends_with('*')) {} + +// _____________________________________________________________________________ +ResultTable TextIndexScanForWord::computeResult() { + IdTable idTable = getExecutionContext()->getIndex().getWordPostingsForTerm( + word_, getExecutionContext()->getAllocator()); + + if (!isPrefix_) { + IdTable smallIdTable{getExecutionContext()->getAllocator()}; + smallIdTable.setNumColumns(1); + smallIdTable.resize(idTable.numRows()); + std::ranges::copy(idTable.getColumn(0), smallIdTable.getColumn(0).begin()); + + return {std::move(smallIdTable), resultSortedOn(), LocalVocab{}}; + } + + // Add details to the runtimeInfo. This has no effect on the result. + runtimeInfo().addDetail("word: ", word_); + + return {std::move(idTable), resultSortedOn(), LocalVocab{}}; +} + +// _____________________________________________________________________________ +VariableToColumnMap TextIndexScanForWord::computeVariableToColumnMap() const { + VariableToColumnMap vcmap; + auto addDefinedVar = [&vcmap, + index = ColumnIndex{0}](const Variable& var) mutable { + vcmap[var] = makeAlwaysDefinedColumn(index); + ++index; + }; + addDefinedVar(textRecordVar_); + if (isPrefix_) { + addDefinedVar(textRecordVar_.getMatchingWordVariable( + std::string_view(word_).substr(0, word_.size() - 1))); + } + return vcmap; +} + +// _____________________________________________________________________________ +size_t TextIndexScanForWord::getResultWidth() const { + return 1 + (isPrefix_ ? 
1 : 0); +} + +// _____________________________________________________________________________ +size_t TextIndexScanForWord::getCostEstimate() { + return getExecutionContext()->getIndex().getSizeOfTextBlockForWord(word_); +} + +// _____________________________________________________________________________ +uint64_t TextIndexScanForWord::getSizeEstimateBeforeLimit() { + return getExecutionContext()->getIndex().getSizeOfTextBlockForWord(word_); +} + +// _____________________________________________________________________________ +vector TextIndexScanForWord::resultSortedOn() const { + return {ColumnIndex(0)}; +} + +// _____________________________________________________________________________ +string TextIndexScanForWord::getDescriptor() const { + return absl::StrCat("TextIndexScanForWord on ", textRecordVar_.name()); +} + +// _____________________________________________________________________________ +string TextIndexScanForWord::getCacheKeyImpl() const { + std::ostringstream os; + os << "WORD INDEX SCAN: " + << " with word: \"" << word_ << "\""; + return std::move(os).str(); +} diff --git a/src/engine/TextIndexScanForWord.h b/src/engine/TextIndexScanForWord.h new file mode 100644 index 0000000000..53b3f56757 --- /dev/null +++ b/src/engine/TextIndexScanForWord.h @@ -0,0 +1,60 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Nick Göckel + +#pragma once + +#include + +#include "./Operation.h" + +// This operation retrieves all text records from the fulltext index that +// contain a certain word or prefix. 
+class TextIndexScanForWord : public Operation { + private: + const Variable textRecordVar_; + const string word_; + bool isPrefix_ = false; + + public: + TextIndexScanForWord(QueryExecutionContext* qec, Variable textRecordVar, + string word); + + ~TextIndexScanForWord() override = default; + + const Variable& textRecordVar() const { return textRecordVar_; } + + const std::string& word() const { return word_; } + + string getCacheKeyImpl() const override; + + string getDescriptor() const override; + + size_t getResultWidth() const override; + + void setTextLimit(size_t) override { + // TODO: implement textLimit + } + + size_t getCostEstimate() override; + + uint64_t getSizeEstimateBeforeLimit() override; + + float getMultiplicity(size_t col) override { + (void)col; + return 1; + } + + bool knownEmptyResult() override { return getSizeEstimateBeforeLimit() == 0; } + + vector resultSortedOn() const override; + + VariableToColumnMap computeVariableToColumnMap() const override; + + private: + // Returns a ResultTable containing an IdTable with the columns being + // the text variable and the completed word (if it was prefixed) + ResultTable computeResult() override; + + vector getChildren() override { return {}; } +}; diff --git a/src/global/Constants.h b/src/global/Constants.h index e74407d327..7584c3a028 100644 --- a/src/global/Constants.h +++ b/src/global/Constants.h @@ -71,6 +71,8 @@ static const std::string INTERNAL_VARIABLE_PREFIX = "?_QLever_internal_variable_"; static constexpr std::string_view TEXTSCORE_VARIABLE_PREFIX = "?ql_textscore_"; +static constexpr std::string_view ENTITY_VARIABLE_PREFIX = "?ql_entity_"; +static constexpr std::string_view SCORE_VARIABLE_PREFIX = "?ql_score_"; static constexpr std::string_view MATCHINGWORD_VARIABLE_PREFIX = "?ql_matchingword_"; diff --git a/src/index/FTSAlgorithms.cpp b/src/index/FTSAlgorithms.cpp index 1698622600..30552b1b63 100644 --- a/src/index/FTSAlgorithms.cpp +++ b/src/index/FTSAlgorithms.cpp @@ -16,7 +16,7 @@ 
using std::pair; // _____________________________________________________________________________ -Index::WordEntityPostings FTSAlgorithms::filterByRange( +Index::WordEntityPostings FTSAlgorithms::filterByRangeWep( const IdRange& idRange, const WordEntityPostings& wepPreFilter) { AD_CONTRACT_CHECK(wepPreFilter.wids_.size() == 1); @@ -63,6 +63,46 @@ Index::WordEntityPostings FTSAlgorithms::filterByRange( return wepResult; } +// _____________________________________________________________________________ +IdTable FTSAlgorithms::filterByRange(const IdRange& idRange, + const IdTable& idTablePreFilter) { + AD_CONTRACT_CHECK(idTablePreFilter.numColumns() == 2); + LOG(DEBUG) << "Filtering " << idTablePreFilter.getColumn(0).size() + << " elements by ID range...\n"; + + IdTable idTableResult{idTablePreFilter.getAllocator()}; + idTableResult.setNumColumns(2); + idTableResult.resize(idTablePreFilter.getColumn(0).size()); + + decltype(auto) resultCidColumn = idTableResult.getColumn(0); + decltype(auto) resultWidColumn = idTableResult.getColumn(1); + size_t nofResultElements = 0; + decltype(auto) preFilterCidColumn = idTablePreFilter.getColumn(0); + decltype(auto) preFilterWidColumn = idTablePreFilter.getColumn(1); + // TODO Use views::zip. + for (size_t i = 0; i < preFilterWidColumn.size(); ++i) { + // TODO proper Ids for the text stuff. + // The mapping from words that appear in text records to `WordIndex`es is + // stored in a `Vocabulary` that stores `VocabIndex`es, so we have to + // convert between those two types. + // TODO Can we make the returned `IndexType` a template parameter + // of the vocabulary, s.t. we have a vocabulary that stores `WordIndex`es + // directly? 
+ if (preFilterWidColumn[i].getWordVocabIndex() >= idRange.first() && + preFilterWidColumn[i].getWordVocabIndex() <= idRange.last()) { + resultCidColumn[nofResultElements] = preFilterCidColumn[i]; + resultWidColumn[nofResultElements] = preFilterWidColumn[i]; + nofResultElements++; + } + } + + idTableResult.resize(nofResultElements); + + LOG(DEBUG) << "Filtering by ID range done. Result has " + << idTableResult.numRows() << " elements.\n"; + return idTableResult; +} + // _____________________________________________________________________________ Index::WordEntityPostings FTSAlgorithms::crossIntersect( const WordEntityPostings& matchingContextsWep, diff --git a/src/index/FTSAlgorithms.h b/src/index/FTSAlgorithms.h index 73701919ae..fc753eb2cf 100644 --- a/src/index/FTSAlgorithms.h +++ b/src/index/FTSAlgorithms.h @@ -25,10 +25,15 @@ class FTSAlgorithms { public: // Filters all wep entries out where the wid does not lay inside the // idRange. - static WordEntityPostings filterByRange( + static WordEntityPostings filterByRangeWep( const IdRange& idRange, const WordEntityPostings& wepPreFilter); + // Filters all IdTable entries out where the WordIndex does not lay inside the + // idRange. + static IdTable filterByRange(const IdRange& idRange, + const IdTable& idPreFilter); + // Intersects matchingContextsWep and eBlockWep on the cids_ attribute. If // there are multiple matches for the same cid then we calculate every // possible combination of eids and wids. 
diff --git a/src/index/Index.cpp b/src/index/Index.cpp index 29db81f326..4aabb49e03 100644 --- a/src/index/Index.cpp +++ b/src/index/Index.cpp @@ -122,6 +122,16 @@ std::string_view Index::wordIdToString(WordIndex wordIndex) const { return pimpl_->wordIdToString(wordIndex); } +// ____________________________________________________________________________ +size_t Index::getSizeOfTextBlockForWord(const std::string& word) const { + return pimpl_->getSizeOfTextBlockForWord(word); +} + +// ____________________________________________________________________________ +size_t Index::getSizeOfTextBlockForEntities(const std::string& word) const { + return pimpl_->getSizeOfTextBlockForEntities(word); +} + // ____________________________________________________________________________ size_t Index::getSizeEstimate(const std::string& words) const { return pimpl_->getSizeEstimate(words); @@ -170,9 +180,10 @@ Index::WordEntityPostings Index::getContextEntityScoreListsForWords( } // ____________________________________________________________________________ -Index::WordEntityPostings Index::getWordPostingsForTerm( - const std::string& term) const { - return pimpl_->getWordPostingsForTerm(term); +IdTable Index::getWordPostingsForTerm( + const std::string& term, + const ad_utility::AllocatorWithLimit& allocator) const { + return pimpl_->getWordPostingsForTerm(term, allocator); } // ____________________________________________________________________________ @@ -181,6 +192,18 @@ Index::WordEntityPostings Index::getEntityPostingsForTerm( return pimpl_->getEntityPostingsForTerm(term); } +// ____________________________________________________________________________ +IdTable Index::getEntityMentionsForWord( + const string& term, + const ad_utility::AllocatorWithLimit& allocator) const { + return pimpl_->getEntityMentionsForWord(term, allocator); +} + +// ____________________________________________________________________________ +size_t Index::getIndexOfBestSuitedElTerm(const vector& 
terms) const { + return pimpl_->getIndexOfBestSuitedElTerm(terms); +} + // ____________________________________________________________________________ std::string Index::getTextExcerpt(TextRecordIndex cid) const { return pimpl_->getTextExcerpt(cid); diff --git a/src/index/Index.h b/src/index/Index.h index 58d13d5eab..1adfcc6c57 100644 --- a/src/index/Index.h +++ b/src/index/Index.h @@ -157,6 +157,11 @@ class Index { // -------------------------------------------------------------------------- [[nodiscard]] std::string_view wordIdToString(WordIndex wordIndex) const; + [[nodiscard]] size_t getSizeOfTextBlockForWord(const std::string& word) const; + + [[nodiscard]] size_t getSizeOfTextBlockForEntities( + const std::string& word) const; + [[nodiscard]] size_t getSizeEstimate(const std::string& words) const; void getContextListForWords(const std::string& words, IdTable* result) const; @@ -184,10 +189,18 @@ class Index { WordEntityPostings getContextEntityScoreListsForWords( const std::string& words) const; - WordEntityPostings getWordPostingsForTerm(const std::string& term) const; + IdTable getWordPostingsForTerm( + const std::string& term, + const ad_utility::AllocatorWithLimit& allocator) const; WordEntityPostings getEntityPostingsForTerm(const std::string& term) const; + IdTable getEntityMentionsForWord( + const string& term, + const ad_utility::AllocatorWithLimit& allocator) const; + + size_t getIndexOfBestSuitedElTerm(const vector& terms) const; + [[nodiscard]] std::string getTextExcerpt(TextRecordIndex cid) const; // Only for debug reasons and external encoding tests. 
diff --git a/src/index/IndexImpl.Text.cpp b/src/index/IndexImpl.Text.cpp index 870f1405f5..0654bc024d 100644 --- a/src/index/IndexImpl.Text.cpp +++ b/src/index/IndexImpl.Text.cpp @@ -747,12 +747,12 @@ void IndexImpl::getContextListForWords(const string& words, if (!term.ends_with('*')) { skipColumns.push_back(i); } - wepVecs.push_back(getWordPostingsForTerm(term)); + wepVecs.push_back(getWordPostingsForTermWep(term)); i++; } wep = FTSAlgorithms::crossIntersectKWay(wepVecs, nullptr); } else { - wep = getWordPostingsForTerm(terms[0]); + wep = getWordPostingsForTermWep(terms[0]); } AD_CONTRACT_CHECK(wep.wids_.size() >= terms.size()); @@ -787,7 +787,7 @@ void IndexImpl::getContextListForWords(const string& words, } // _____________________________________________________________________________ -Index::WordEntityPostings IndexImpl::readWordCl( +Index::WordEntityPostings IndexImpl::readWordClWep( const TextBlockMetaData& tbmd) const { Index::WordEntityPostings wep; wep.cids_ = readGapComprList( @@ -804,7 +804,30 @@ Index::WordEntityPostings IndexImpl::readWordCl( } // _____________________________________________________________________________ -Index::WordEntityPostings IndexImpl::readWordEntityCl( +IdTable IndexImpl::readWordCl( + const TextBlockMetaData& tbmd, + const ad_utility::AllocatorWithLimit& allocator) const { + IdTable idTable{2, allocator}; + vector cids = readGapComprList( + tbmd._cl._nofElements, tbmd._cl._startContextlist, + static_cast(tbmd._cl._startWordlist - tbmd._cl._startContextlist), + &TextRecordIndex::make); + idTable.resize(cids.size()); + std::ranges::transform(cids, idTable.getColumn(0).begin(), + &Id::makeFromTextRecordIndex); + std::ranges::transform( + readFreqComprList( + tbmd._cl._nofElements, tbmd._cl._startWordlist, + static_cast(tbmd._cl._startScorelist - + tbmd._cl._startWordlist)), + idTable.getColumn(1).begin(), [](WordIndex id) { + return Id::makeFromWordVocabIndex(WordVocabIndex::make(id)); + }); + return idTable; +} + +// 
_____________________________________________________________________________ +Index::WordEntityPostings IndexImpl::readWordEntityClWep( const TextBlockMetaData& tbmd) const { Index::WordEntityPostings wep; wep.cids_ = readGapComprList( @@ -825,7 +848,36 @@ Index::WordEntityPostings IndexImpl::readWordEntityCl( } // _____________________________________________________________________________ -Index::WordEntityPostings IndexImpl::getWordPostingsForTerm( +IdTable IndexImpl::readWordEntityCl( + const TextBlockMetaData& tbmd, + const ad_utility::AllocatorWithLimit& allocator) const { + IdTable idTable{3, allocator}; + vector cids = readGapComprList( + tbmd._entityCl._nofElements, tbmd._entityCl._startContextlist, + static_cast(tbmd._entityCl._startWordlist - + tbmd._entityCl._startContextlist), + &TextRecordIndex::make); + idTable.resize(cids.size()); + std::ranges::transform(cids, idTable.getColumn(0).begin(), + &Id::makeFromTextRecordIndex); + std::ranges::copy( + readFreqComprList(tbmd._entityCl._nofElements, + tbmd._entityCl._startWordlist, + static_cast(tbmd._entityCl._startScorelist - + tbmd._entityCl._startWordlist), + &Id::fromBits), + idTable.getColumn(1).begin()); + std::ranges::transform( + readFreqComprList( + tbmd._entityCl._nofElements, tbmd._entityCl._startScorelist, + static_cast(tbmd._entityCl._lastByte + 1 - + tbmd._entityCl._startScorelist)), + idTable.getColumn(2).begin(), &Id::makeFromInt); + return idTable; +} + +// _____________________________________________________________________________ +Index::WordEntityPostings IndexImpl::getWordPostingsForTermWep( const string& term) const { LOG(DEBUG) << "Getting word postings for term: " << term << '\n'; Index::WordEntityPostings wep; @@ -834,9 +886,9 @@ Index::WordEntityPostings IndexImpl::getWordPostingsForTerm( return wep; } const auto& tbmd = optionalTbmd.value().tbmd_; - wep = readWordCl(tbmd); + wep = readWordClWep(tbmd); if (optionalTbmd.value().hasToBeFiltered_) { - wep = 
FTSAlgorithms::filterByRange(optionalTbmd.value().idRange_, wep); + wep = FTSAlgorithms::filterByRangeWep(optionalTbmd.value().idRange_, wep); } LOG(DEBUG) << "Word postings for term: " << term << ": cids: " << wep.cids_.size() << " scores " @@ -844,6 +896,27 @@ Index::WordEntityPostings IndexImpl::getWordPostingsForTerm( return wep; } +// _____________________________________________________________________________ +IdTable IndexImpl::getWordPostingsForTerm( + const string& term, + const ad_utility::AllocatorWithLimit& allocator) const { + LOG(DEBUG) << "Getting word postings for term: " << term << '\n'; + IdTable idTable{allocator}; + auto optionalTbmd = getTextBlockMetadataForWordOrPrefix(term); + if (!optionalTbmd.has_value()) { + return idTable; + } + const auto& tbmd = optionalTbmd.value().tbmd_; + idTable = readWordCl(tbmd, allocator); + if (optionalTbmd.value().hasToBeFiltered_) { + idTable = + FTSAlgorithms::filterByRange(optionalTbmd.value().idRange_, idTable); + } + LOG(DEBUG) << "Word postings for term: " << term + << ": cids: " << idTable.getColumn(0).size() << '\n'; + return idTable; +} + // _____________________________________________________________________________ Index::WordEntityPostings IndexImpl::getContextEntityScoreListsForWords( const string& words) const { @@ -871,7 +944,7 @@ Index::WordEntityPostings IndexImpl::getContextEntityScoreListsForWords( skipColumns.push_back(i); } if (i != useElFromTerm) { - wepVecs.push_back(getWordPostingsForTerm(terms[i])); + wepVecs.push_back(getWordPostingsForTermWep(terms[i])); } } wepVecs.push_back(getEntityPostingsForTerm(terms[useElFromTerm])); @@ -1030,14 +1103,27 @@ Index::WordEntityPostings IndexImpl::getEntityPostingsForTerm( return resultWep; } const auto& tbmd = optTbmd.value().tbmd_; - Index::WordEntityPostings matchingContextsWep = getWordPostingsForTerm(term); + Index::WordEntityPostings matchingContextsWep = + getWordPostingsForTermWep(term); // Read the full lists - Index::WordEntityPostings 
eBlockWep = readWordEntityCl(tbmd); + Index::WordEntityPostings eBlockWep = readWordEntityClWep(tbmd); resultWep = FTSAlgorithms::crossIntersect(matchingContextsWep, eBlockWep); return resultWep; } +// _____________________________________________________________________________ +IdTable IndexImpl::getEntityMentionsForWord( + const string& term, + const ad_utility::AllocatorWithLimit& allocator) const { + auto optTbmd = getTextBlockMetadataForWordOrPrefix(term); + if (!optTbmd.has_value()) { + return IdTable{allocator}; + } + const auto& tbmd = optTbmd.value().tbmd_; + return readWordEntityCl(tbmd, allocator); +} + // _____________________________________________________________________________ template vector IndexImpl::readGapComprList(size_t nofElements, off_t from, @@ -1380,6 +1466,30 @@ size_t IndexImpl::getIndexOfBestSuitedElTerm( return std::get<0>(toBeSorted[0]); } +// _____________________________________________________________________________ +size_t IndexImpl::getSizeOfTextBlockForEntities(const string& word) const { + if (word.empty()) { + return 0; + } + auto optTbmd = getTextBlockMetadataForWordOrPrefix(word); + if (!optTbmd.has_value()) { + return 0; + } + return optTbmd.value().tbmd_._entityCl._nofElements; +} + +// _____________________________________________________________________________ +size_t IndexImpl::getSizeOfTextBlockForWord(const string& word) const { + if (word.empty()) { + return 0; + } + auto optTbmd = getTextBlockMetadataForWordOrPrefix(word); + if (!optTbmd.has_value()) { + return 0; + } + return optTbmd.value().tbmd_._cl._nofElements; +} + // _____________________________________________________________________________ size_t IndexImpl::getSizeEstimate(const string& words) const { // TODO vector can be of type std::string_view if called functions @@ -1408,10 +1518,12 @@ auto IndexImpl::getTextBlockMetadataForWordOrPrefix(const std::string& word) AD_CORRECTNESS_CHECK(!word.empty()); IdRange idRange; if 
(word.ends_with(PREFIX_CHAR)) { - if (!textVocab_.getIdRangeForFullTextPrefix(word, &idRange)) { + auto idRangeOpt = textVocab_.getIdRangeForFullTextPrefix(word); + if (!idRangeOpt.has_value()) { LOG(INFO) << "Prefix: " << word << " not in vocabulary\n"; return std::nullopt; } + idRange = idRangeOpt.value(); } else { WordVocabIndex idx; if (!textVocab_.getId(word, &idx)) { diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 8800d670a4..7bea6fe050 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -305,6 +305,17 @@ class IndexImpl { // -------------------------------------------------------------------------- std::string_view wordIdToString(WordIndex wordIndex) const; + size_t getSizeOfTextBlockForEntities(const string& words) const; + + // Returns the size of the whole textblock. If the word is very long or not + // prefixed then only a small number of words actually match. So the final + // result is much smaller. + // Note that as a cost estimate the estimation is correct. Because we always + // have to read the complete block and then filter by the actually needed + // words. + // TODO: improve size estimate by adding a correction factor. + size_t getSizeOfTextBlockForWord(const string& words) const; + size_t getSizeEstimate(const string& words) const; void callFixedGetContextListForWords(const string& words, @@ -335,15 +346,46 @@ class IndexImpl { Index::WordEntityPostings getContextEntityScoreListsForWords( const string& words) const; - Index::WordEntityPostings getWordPostingsForTerm(const string& term) const; + // Does the same as getWordPostingsForTerm but returns a + // WordEntityPosting. Sorted by textRecord. + Index::WordEntityPostings getWordPostingsForTermWep( + const string& wordOrPrefix) const; + + // Returns a set of [textRecord, term] pairs where the term is contained in + // the textRecord. The term can be either the wordOrPrefix itself or a word + // that has wordOrPrefix as a prefix. 
Returned IdTable has columns: + // textRecord, word. Sorted by textRecord. + IdTable getWordPostingsForTerm( + const string& wordOrPrefix, + const ad_utility::AllocatorWithLimit& allocator) const; Index::WordEntityPostings getEntityPostingsForTerm(const string& term) const; - Index::WordEntityPostings readWordCl(const TextBlockMetaData& tbmd) const; + // Returns a set of textRecords and their corresponding entities and + // scores. Each textRecord contains its corresponding entity and the term. + // Returned IdTable has columns: textRecord, entity, score. Sorted by + // textRecord. + // NOTE: This returns a superset because it contains the whole block and + // unfitting words are filtered out later by the join with the + // TextIndexScanForWords operation. + IdTable getEntityMentionsForWord( + const string& term, + const ad_utility::AllocatorWithLimit& allocator) const; + + size_t getIndexOfBestSuitedElTerm(const vector& terms) const; + + Index::WordEntityPostings readWordClWep(const TextBlockMetaData& tbmd) const; + + IdTable readWordCl(const TextBlockMetaData& tbmd, + const ad_utility::AllocatorWithLimit& allocator) const; - Index::WordEntityPostings readWordEntityCl( + Index::WordEntityPostings readWordEntityClWep( const TextBlockMetaData& tbmd) const; + IdTable readWordEntityCl( + const TextBlockMetaData& tbmd, + const ad_utility::AllocatorWithLimit& allocator) const; + string getTextExcerpt(TextRecordIndex cid) const { if (cid.get() >= docsDB_._size) { return ""; @@ -550,8 +592,6 @@ class IndexImpl { size_t nofElements, off_t from, size_t nofBytes, MakeFromUint64t makeFromUint = MakeFromUint64t{}) const; - size_t getIndexOfBestSuitedElTerm(const vector& terms) const; - // Get the metadata for the block from the text index that contains the // `word`. Also works for prefixes that are terminated with `PREFIX_CHAR` like // "astro*". 
Returns `nullopt` if no suitable block was found because no diff --git a/src/index/Vocabulary.cpp b/src/index/Vocabulary.cpp index 414619e632..13699dfcaf 100644 --- a/src/index/Vocabulary.cpp +++ b/src/index/Vocabulary.cpp @@ -183,18 +183,21 @@ void Vocabulary::initializeInternalizedLangs(const StringRange& s) { // ___________________________________________________________________________ template -bool Vocabulary::getIdRangeForFullTextPrefix( - const string& word, IdRange* range) const { +std::optional> Vocabulary::getIdRangeForFullTextPrefix( + const string& word) const { AD_CONTRACT_CHECK(word[word.size() - 1] == PREFIX_CHAR); + IdRange range; auto prefixRange = prefix_range(word.substr(0, word.size() - 1)); bool success = prefixRange.second > prefixRange.first; - *range = IdRange{prefixRange.first, prefixRange.second.decremented()}; if (success) { - AD_CONTRACT_CHECK(range->first().get() < internalVocabulary_.size()); - AD_CONTRACT_CHECK(range->last().get() < internalVocabulary_.size()); + range = IdRange{prefixRange.first, prefixRange.second.decremented()}; + AD_CONTRACT_CHECK(range.first().get() < internalVocabulary_.size()); + AD_CONTRACT_CHECK(range.last().get() < internalVocabulary_.size()); + + return range; } - return success; + return std::nullopt; } // _______________________________________________________________ diff --git a/src/index/Vocabulary.h b/src/index/Vocabulary.h index d9f19a515f..6953c9b254 100644 --- a/src/index/Vocabulary.h +++ b/src/index/Vocabulary.h @@ -148,13 +148,13 @@ class Vocabulary { bool getId(const string& word, IndexType* idx) const; //! Get an Id range that matches a prefix. - //! Return value signals if something was found at all. + //! Return value also signals if something was found at all. //! CAVEAT! TODO: This is only used for the text index, //! and uses a range, where the last index is still within the range which is //! against C++ conventions! // consider using the prefixRange function. 
- bool getIdRangeForFullTextPrefix(const string& word, - IdRange* range) const; + std::optional> getIdRangeForFullTextPrefix( + const string& word) const; ad_utility::HashMap> getRangesForDatatypes() const; diff --git a/src/parser/data/Variable.h b/src/parser/data/Variable.h index ef35eb9f0c..7667cebdf3 100644 --- a/src/parser/data/Variable.h +++ b/src/parser/data/Variable.h @@ -7,6 +7,7 @@ #include #include #include +#include // Forward declaration because of cyclic dependencies // TODO The coupling of the `Variable` with its `evaluate` methods @@ -39,6 +40,15 @@ class Variable { // Convert `?someVariable` into `?ql_textscore_someVariable` Variable getTextScoreVariable() const; + // Converts `?someTextVar` and `?someEntityVar` into + // `?ql_score_someTextVar_var_someEntityVar`. + // Converts `?someTextVar` and `someFixedEntity` into + // `?ql_score_someTextVar_fixedEntity_someFixedEntity`. + // Note that every non-alphabetic character of the fixed entity is replaced + // by its numeric character code, enclosed in underscores. + Variable getScoreVariable( + const std::variant& varOrEntity) const; + // Convert `?someVariable` into `?ql_matchingword_someVariable_someTerm` Variable getMatchingWordVariable(std::string_view term) const; diff --git a/src/parser/data/VariableToColumnMapPrinters.cpp b/src/parser/data/VariableToColumnMapPrinters.cpp index fbf7808533..f23c0aeb23 100644 --- a/src/parser/data/VariableToColumnMapPrinters.cpp +++ b/src/parser/data/VariableToColumnMapPrinters.cpp @@ -15,7 +15,7 @@ Variable::Variable(std::string name) : _name{std::move(name)} { // verify variable name starts with ? or $ and continues without any // special characters. This is weaker than the SPARQL grammar, // but it is close enough so that it will likely never cause issues. 
- AD_CONTRACT_CHECK(ctre::match<"[$?]\\w+">(_name)); + AD_CONTRACT_CHECK(ctre::match<"[$?][\\w]+">(_name)); // normalize notation for consistency _name[0] = '?'; } @@ -58,6 +58,30 @@ Variable Variable::getTextScoreVariable() const { return Variable{absl::StrCat(TEXTSCORE_VARIABLE_PREFIX, name().substr(1))}; } +// _____________________________________________________________________________ +Variable Variable::getScoreVariable( + const std::variant& varOrEntity) const { + std::string_view type; + std::string entity; + if (std::holds_alternative(varOrEntity)) { + type = "_var_"; + entity = std::get(varOrEntity).name().substr(1); + } else { + type = "_fixedEntity_"; + // Converts input string to unambiguous result string not containing any + // special characters. "_" is used as an escaping character. + for (char c : std::get(varOrEntity)) { + if (isalpha(static_cast(c))) { + entity += c; + } else { + absl::StrAppend(&entity, "_", std::to_string(c), "_"); + } + } + } + return Variable{ + absl::StrCat(SCORE_VARIABLE_PREFIX, name().substr(1), type, entity)}; +} + // _____________________________________________________________________________ Variable Variable::getMatchingWordVariable(std::string_view term) const { return Variable{ diff --git a/src/parser/sparqlParser/SparqlQleverVisitor.cpp b/src/parser/sparqlParser/SparqlQleverVisitor.cpp index a129b34fbc..0e4b39638b 100644 --- a/src/parser/sparqlParser/SparqlQleverVisitor.cpp +++ b/src/parser/sparqlParser/SparqlQleverVisitor.cpp @@ -1034,32 +1034,37 @@ vector Visitor::visit( // Similarly if a triple `?var ql:contains-word "words"` is contained in the // query, then the variable `ql_matchingword_var` is implicitly created and // visible in the query body. 
- auto setMatchingWordAndTextscoreVisibleIfPresent = [this, ctx]( - VarOrTerm& subject, - VarOrPath& predicate, - VarOrTerm& object) { - if (auto* var = std::get_if(&subject)) { - if (auto* propertyPath = std::get_if(&predicate)) { - if (propertyPath->asString() == CONTAINS_WORD_PREDICATE) { - addVisibleVariable(var->getTextScoreVariable()); - string name = object.toSparql(); - if (!((name.starts_with('"') && name.ends_with('"')) || - (name.starts_with('\'') && name.ends_with('\'')))) { - reportError( - ctx, - "ql:contains-word has to be followed by a string in quotes"); - } - for (std::string_view s : std::vector( - absl::StrSplit(name.substr(1, name.size() - 2), ' '))) { - if (!s.ends_with('*')) { - continue; - } - addVisibleVariable( - var->getMatchingWordVariable(s.substr(0, s.size() - 1))); - } - } else if (propertyPath->asString() == CONTAINS_ENTITY_PREDICATE) { - addVisibleVariable(var->getTextScoreVariable()); + auto setMatchingWordAndScoreVisibleIfPresent = [this, ctx]( + VarOrTerm& subject, + VarOrPath& predicate, + VarOrTerm& object) { + auto* var = std::get_if(&subject); + auto* propertyPath = std::get_if(&predicate); + + if (!var || !propertyPath) { + return; + } + + if (propertyPath->asString() == CONTAINS_WORD_PREDICATE) { + string name = object.toSparql(); + if (!((name.starts_with('"') && name.ends_with('"')) || + (name.starts_with('\'') && name.ends_with('\'')))) { + reportError( + ctx, "ql:contains-word has to be followed by a string in quotes"); + } + for (std::string_view s : std::vector( + absl::StrSplit(name.substr(1, name.size() - 2), ' '))) { + if (!s.ends_with('*')) { + continue; } + addVisibleVariable(var->getMatchingWordVariable( + ad_utility::utf8ToLower(s.substr(0, s.size() - 1)))); + } + } else if (propertyPath->asString() == CONTAINS_ENTITY_PREDICATE) { + if (const auto* entVar = std::get_if(&object)) { + addVisibleVariable(var->getScoreVariable(*entVar)); + } else if (const auto* fixedEntity = std::get_if(&object)) { + 
addVisibleVariable(var->getScoreVariable(fixedEntity->toSparql())); } } }; @@ -1069,7 +1074,7 @@ vector Visitor::visit( auto subject = visit(ctx->varOrTerm()); auto tuples = visit(ctx->propertyListPathNotEmpty()); for (auto& [predicate, object] : tuples) { - setMatchingWordAndTextscoreVisibleIfPresent(subject, predicate, object); + setMatchingWordAndScoreVisibleIfPresent(subject, predicate, object); triples.emplace_back(subject, std::move(predicate), std::move(object)); } return triples; diff --git a/test/FTSAlgorithmsTest.cpp b/test/FTSAlgorithmsTest.cpp index 0718441894..f80374a424 100644 --- a/test/FTSAlgorithmsTest.cpp +++ b/test/FTSAlgorithmsTest.cpp @@ -29,7 +29,7 @@ TEST(FTSAlgorithmsTest, filterByRangeTest) { Index::WordEntityPostings resultWep; // Empty - resultWep = FTSAlgorithms::filterByRange(idRange, wep); + resultWep = FTSAlgorithms::filterByRangeWep(idRange, wep); ASSERT_EQ(0u, resultWep.cids_.size()); // None @@ -37,7 +37,7 @@ TEST(FTSAlgorithmsTest, filterByRangeTest) { wep.wids_ = {{2}}; wep.scores_ = {1}; - resultWep = FTSAlgorithms::filterByRange(idRange, wep); + resultWep = FTSAlgorithms::filterByRangeWep(idRange, wep); ASSERT_EQ(0u, resultWep.cids_.size()); // Match @@ -45,7 +45,7 @@ TEST(FTSAlgorithmsTest, filterByRangeTest) { wep.wids_ = {{2, 5, 7, 5, 6}}; wep.scores_ = {1, 1, 1, 1, 1}; - resultWep = FTSAlgorithms::filterByRange(idRange, wep); + resultWep = FTSAlgorithms::filterByRangeWep(idRange, wep); EXPECT_THAT(resultWep.cids_, ::testing::ElementsAre(TRID(0), TRID(1), TRID(2), TRID(3))); EXPECT_THAT(resultWep.eids_, ::testing::ElementsAre()); @@ -57,7 +57,7 @@ TEST(FTSAlgorithmsTest, filterByRangeTest) { wep.scores_ = {1, 1, 1, 1, 1, 1}; // Partial - resultWep = FTSAlgorithms::filterByRange(idRange, wep); + resultWep = FTSAlgorithms::filterByRangeWep(idRange, wep); EXPECT_THAT(resultWep.cids_, ::testing::ElementsAre(TRID(0), TRID(1), TRID(2), TRID(3))); EXPECT_THAT(resultWep.eids_, ::testing::ElementsAre()); diff --git 
a/test/IndexTestHelpers.h b/test/IndexTestHelpers.h index ad219cf523..45765556c8 100644 --- a/test/IndexTestHelpers.h +++ b/test/IndexTestHelpers.h @@ -43,7 +43,8 @@ Index makeTestIndex(const std::string& indexBasename, std::optional turtleInput = std::nullopt, bool loadAllPermutations = true, bool usePatterns = true, bool usePrefixCompression = true, - ad_utility::MemorySize blocksizePermutations = 16_B); + ad_utility::MemorySize blocksizePermutations = 16_B, + bool createTextIndex = false); // Return a static `QueryExecutionContext` that refers to an index that was // build using `makeTestIndex` (see above). The index (most notably its @@ -53,7 +54,8 @@ QueryExecutionContext* getQec( std::optional turtleInput = std::nullopt, bool loadAllPermutations = true, bool usePatterns = true, bool usePrefixCompression = true, - ad_utility::MemorySize blocksizePermutations = 16_B); + ad_utility::MemorySize blocksizePermutations = 16_B, + bool createTextIndex = false); // Return a lambda that takes a string and converts it into an ID by looking // it up in the vocabulary of `index`. An `AD_CONTRACT_CHECK` will fail if the diff --git a/test/QueryPlannerTest.cpp b/test/QueryPlannerTest.cpp index aec5d83b9d..fdb549a733 100644 --- a/test/QueryPlannerTest.cpp +++ b/test/QueryPlannerTest.cpp @@ -14,10 +14,6 @@ namespace h = queryPlannerTestHelpers; using Var = Variable; -namespace { -auto lit = ad_utility::testing::tripleComponentLiteral; -} - TEST(QueryPlannerTest, createTripleGraph) { using TripleGraph = QueryPlanner::TripleGraph; using Node = QueryPlanner::TripleGraph::Node; @@ -201,321 +197,6 @@ TEST(QueryPlannerTest, testBFSLeaveOut) { } } -TEST(QueryPlannerTest, testcollapseTextCliques) { - using TripleGraph = QueryPlanner::TripleGraph; - using Node = QueryPlanner::TripleGraph::Node; - using std::vector; - { - { - ParsedQuery pq = SparqlParser::parseQuery( - "SELECT ?x WHERE {?x

. ?c ql:contains-entity ?x. ?c " - "ql:contains-word \"abc\"}"); - QueryPlanner qp(nullptr); - auto tg = qp.createTripleGraph(&pq.children()[0].getBasic()); - ASSERT_EQ( - "0 {s: ?x, p:

, o: } : (1)\n" - "1 {s: ?c, p: " - ", o: ?x} : " - "(0, 2)\n" - "2 {s: ?c, p: " - ", " - "o: \"abc\"} " - ": " - "(1)", - tg.asString()); - tg.collapseTextCliques(); - TripleGraph expected = - TripleGraph(std::vector>>( - {std::make_pair>( - QueryPlanner::TripleGraph::Node( - 0, Var{"?c"}, {"abc"}, - { - SparqlTriple(Var{"?c"}, - "", - Var{"?x"}), - SparqlTriple(Var{"?c"}, - "", - lit("\"abc\"")), - }), - {1}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 1, SparqlTriple(Var{"?x"}, "

", "")), - {0})})); - ASSERT_TRUE(tg.isSimilar(expected)); - } - { - ParsedQuery pq = SparqlParser::parseQuery( - "SELECT ?x WHERE {?x

. ?c " - " ?x. ?c " - " " - "\"abc\" . ?c " - "ql:contains-entity ?y}"); - QueryPlanner qp(nullptr); - auto tg = qp.createTripleGraph(&pq.children()[0].getBasic()); - ASSERT_EQ( - "0 {s: ?x, p:

, o: } : (1)\n" - "1 {s: ?c, p: " - ", o: ?x} : " - "(0, 2, 3)\n" - "2 {s: ?c, p: " - ", " - "o: \"abc\"} " - ": " - "(1, 3)\n" - "3 {s: ?c, p: " - ", o: ?y} : " - "(1, 2)", - tg.asString()); - tg.collapseTextCliques(); - TripleGraph expected = - TripleGraph(std::vector>>( - {std::make_pair>( - QueryPlanner::TripleGraph::Node( - 0, Var{"?c"}, {"abc"}, - {SparqlTriple(Var{"?c"}, - "", - Var{"?x"}), - SparqlTriple(Var{"?c"}, - "", - lit("\"abc\"")), - SparqlTriple(Var{"?c"}, - "", - Var{"?y"})}), - {1}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 1, SparqlTriple(Var{"?x"}, "

", "")), - {0})})); - ASSERT_TRUE(tg.isSimilar(expected)); - } - { - ParsedQuery pq = SparqlParser::parseQuery( - "SELECT ?x WHERE {?x

. ?c ql:contains-entity ?x. ?c " - "ql:contains-word \"abc\" . ?c ql:contains-entity ?y. ?y " - "}"); - QueryPlanner qp(nullptr); - auto tg = qp.createTripleGraph(&pq.children()[0].getBasic()); - ASSERT_EQ( - "0 {s: ?x, p:

, o: } : (1)\n" - "1 {s: ?c, p: " - ", o: ?x} : " - "(0, 2, 3)\n" - "2 {s: ?c, p: " - ", " - "o: \"abc\"} " - ": " - "(1, 3)\n" - "3 {s: ?c, p: " - ", o: ?y} : " - "(1, 2, 4)\n" - "4 {s: ?y, p: , o: } : (3)", - tg.asString()); - tg.collapseTextCliques(); - TripleGraph expected = - TripleGraph(std::vector>>( - {std::make_pair>( - QueryPlanner::TripleGraph::Node( - 0, Var{"?c"}, {"abc"}, - {SparqlTriple(Var{"?c"}, - "", - Var{"?x"}), - SparqlTriple(Var{"?c"}, - "", - lit("\"abc\"")), - SparqlTriple(Var{"?c"}, - "", - Var{"?y"})}), - {1, 2}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 1, SparqlTriple(Var{"?x"}, "

", "")), - {0}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 2, SparqlTriple(Var{"?y"}, "", "")), - {0})})); - ASSERT_TRUE(tg.isSimilar(expected)); - } - { - ParsedQuery pq = SparqlParser::parseQuery( - "SELECT ?x WHERE {?x

. ?c ql:contains-entity ?x. ?c " - "ql:contains-word \"abc\" . ?c ql:contains-entity ?y. ?c2 " - "ql:contains-entity ?y. ?c2 ql:contains-word \"xx\"}"); - QueryPlanner qp(nullptr); - auto tg = qp.createTripleGraph(&pq.children()[0].getBasic()); - TripleGraph expected = - TripleGraph(std::vector>>( - {std::make_pair>( - QueryPlanner::TripleGraph::Node( - 0, SparqlTriple(Var{"?x"}, "

", "")), - {1}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 1, SparqlTriple(Var{"?c"}, - "", - Var{"?x"})), - {0, 2, 3}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 2, SparqlTriple(Var{"?c"}, - "", - lit("\"abc\""))), - {1, 3}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 3, SparqlTriple(Var{"?c"}, - "", - Var{"?y"})), - {1, 2, 4}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 4, SparqlTriple(Var{"?c2"}, - "", - Var{"?y"})), - {3, 5}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 5, SparqlTriple(Var{"?c2"}, - "", - lit("\"xx\""))), - {4})})); - - ASSERT_TRUE(tg.isSimilar(expected)); - tg.collapseTextCliques(); - TripleGraph expected2 = - TripleGraph(std::vector>>( - {std::make_pair>( - QueryPlanner::TripleGraph::Node( - 0, Var{"?c"}, {"abc"}, - {SparqlTriple(Var{"?c"}, - "", - Var{"?x"}), - SparqlTriple(Var{"?c"}, - "", - lit("\"abc\"")), - SparqlTriple(Var{"?c"}, - "", - Var{"?y"})}), - {1, 2}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 1, Var{"?c2"}, {"xx"}, - {SparqlTriple(Var{"?c2"}, - "", - Var{"?y"}), - SparqlTriple(Var{"?c2"}, - "", - lit("\"xx\""))}), - {0}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 2, SparqlTriple(Var{"?x"}, "

", "")), - {0})})); - ASSERT_TRUE(tg.isSimilar(expected2)); - } - { - ParsedQuery pq = SparqlParser::parseQuery( - "SELECT ?x WHERE {?x

. ?c ql:contains-entity ?x. ?c " - "ql:contains-word \"abc\" . ?c ql:contains-entity ?y. ?c2 " - "ql:contains-entity ?y. ?c2 ql:contains-word \"xx\". ?y " - "}"); - QueryPlanner qp(nullptr); - auto tg = qp.createTripleGraph(&pq.children()[0].getBasic()); - ASSERT_EQ( - "0 {s: ?x, p:

, o: } : (1)\n" - "1 {s: ?c, p: " - ", o: ?x} : " - "(0, 2, 3)\n" - "2 {s: ?c, p: " - ", " - "o: \"abc\"} " - ": " - "(1, 3)\n" - "3 {s: ?c, p: " - ", o: ?y} : " - "(1, 2, 4, 6)\n" - "4 {s: ?c2, p: " - ", o: ?y} " - ": (3, 5, 6)\n" - "5 {s: ?c2, p: " - ", " - "o: \"xx\"} " - ": " - "(4)\n" - "6 {s: ?y, p: , o: } : (3, 4)", - tg.asString()); - tg.collapseTextCliques(); - TripleGraph expected2 = - TripleGraph(std::vector>>( - {std::make_pair>( - QueryPlanner::TripleGraph::Node( - 0, Var{"?c"}, {"abc"}, - {SparqlTriple(Var{"?c"}, - "", - Var{"?x"}), - SparqlTriple(Var{"?c"}, - "", - "abc"), - SparqlTriple(Var{"?c"}, - "", - Var{"?y"})}), - {1, 2, 3}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 1, Var{"?c2"}, {"xx"}, - {SparqlTriple(Var{"?c2"}, - "", - Var{"?y"}), - SparqlTriple(Var{"?c2"}, - "", - lit("\"xx\""))}), - {0, 3}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 2, SparqlTriple(Var{"?x"}, "

", "")), - {0}), - std::make_pair>( - QueryPlanner::TripleGraph::Node( - 3, SparqlTriple(Var{"?y"}, "", "")), - {0, 1})})); - ASSERT_TRUE(tg.isSimilar(expected2)); - } - } -} - TEST(QueryPlannerTest, indexScanOneVariable) { auto scan = h::IndexScanFromStrings; using enum Permutation::Enum; @@ -698,96 +379,40 @@ TEST(QueryExecutionTreeTest, testBooksGermanAwardNomAuth) { scan("?x", "", "?y"), scan("?y", "", ""))); } -/* TEST(QueryExecutionTreeTest, testPlantsEdibleLeaves) { - ParsedQuery pq = SparqlParser::parseQuery( - "SELECT ?a \n " - "WHERE {?a . ?c ql:contains-entity ?a. " - "?c ql:contains-word \"edible leaves\"} TEXTLIMIT 5"); - QueryPlanner qp(nullptr); - QueryPlanner::TripleGraph tg = - qp.createTripleGraph(&pq.children()[0].getBasic()); - ASSERT_EQ(1u, tg._nodeMap.find(0)->second->_variables.size()); - QueryExecutionTree qet = qp.createExecutionTree(pq); - ASSERT_EQ( - "{\n TEXT OPERATION WITH FILTER: co-occurrence with words: " - "\"edible leaves\" and 1 variables with textLimit = 5 " - "filtered by\n {\n SCAN POS with P = \"\", " - "O = \"\"\n qet-width: 1 \n }\n filtered on " - "column 0\n qet-width: 3 \n}", - qet.getCacheKey()); -} - -TEST(QueryExecutionTreeTest, testTextQuerySE) { - ParsedQuery pq = SparqlParser::parseQuery( - "SELECT ?c \n " - "WHERE {?c ql:contains-word \"search engine\"}"); - QueryPlanner qp(nullptr); - QueryExecutionTree qet = qp.createExecutionTree(pq); - ASSERT_EQ(absl::StrCat( - "{\n TEXT OPERATION WITHOUT FILTER: co-occurrence with words:", - " \"search engine\" and 0 variables with textLimit = ", - TEXT_LIMIT_DEFAULT, "\n", " qet-width: 2 \n}"), - qet.getCacheKey()); -} - -TEST(QueryExecutionTreeTest, testBornInEuropeOwCocaine) { - ParsedQuery pq = SparqlParser::parseQuery( - "PREFIX : <>\n" - "SELECT ?x ?y ?c\n " - "WHERE \t {" - "?x :Place_of_birth ?y ." - "?y :Contained_by :Europe ." - "?c ql:contains-entity ?x ." - "?c ql:contains-word \"cocaine\" ." 
- "} TEXTLIMIT 1"); - QueryPlanner qp(nullptr); - QueryExecutionTree qet = qp.createExecutionTree(pq); - ASSERT_EQ( - "{\n TEXT OPERATION WITH FILTER: co-occurrence with words: " - "\"cocaine\" and 1 variables with textLimit = 1 filtered by\n " - "{\n JOIN\n {\n SCAN POS with P = \"\", " - "O = \"\"\n qet-width: 1 \n } join-column: [0]\n" - " |X|\n {\n SCAN POS with P = \"\"\n" - " qet-width: 2 \n } join-column: [0]\n qet-width: 2 \n" - " }\n filtered on column 1\n qet-width: 4 \n}", - qet.getCacheKey()); - auto c = Variable{"?c"}; - ASSERT_EQ(0u, qet.getVariableColumn(c)); - ASSERT_EQ(1u, qet.getVariableColumn(c.getTextScoreVariable())); - ASSERT_EQ(2u, qet.getVariableColumn(Variable{"?y"})); + auto scan = h::IndexScanFromStrings; + auto wordScan = h::TextIndexScanForWord; + auto entityScan = h::TextIndexScanForEntity; + h::expect( + "SELECT ?a WHERE {?a . ?c ql:contains-entity ?a. ?c " + "ql:contains-word \"edible leaves\"}", + h::UnorderedJoins(scan("?a", "", ""), + wordScan(Var{"?c"}, "edible"), + wordScan(Var{"?c"}, "leaves"), + entityScan(Var{"?c"}, Var{"?a"}, "edible"))); } TEST(QueryExecutionTreeTest, testCoOccFreeVar) { - ParsedQuery pq = SparqlParser::parseQuery( - "PREFIX : <>" - "SELECT ?x ?y WHERE {" - "?x :is-a :Politician ." - "?c ql:contains-entity ?x ." - "?c ql:contains-word \"friend*\" ." - "?c ql:contains-entity ?y ." 
- "} TEXTLIMIT 1"); - QueryPlanner qp(nullptr); - QueryExecutionTree qet = qp.createExecutionTree(pq); - ASSERT_EQ( - "{\n TEXT OPERATION WITH FILTER: co-occurrence with words: " - "\"friend*\" and 2 variables with textLimit = 1 filtered by\n" - " {\n SCAN POS with P = \"\", O = \"" - "\"\n qet-width: 1 \n }\n filtered on column 0\n " - " qet-width: 5 \n}", - qet.getCacheKey()); - auto c = Variable{"?c"}; - ASSERT_EQ(0u, qet.getVariableColumn(c)); - ASSERT_EQ(1u, qet.getVariableColumn(c.getTextScoreVariable())); - ASSERT_EQ(2u, qet.getVariableColumn(Variable{"?y"})); - ASSERT_EQ(3u, qet.getVariableColumn(Variable{"?x"})); - ASSERT_EQ(4u, qet.getVariableColumn(c.getMatchingWordVariable("friend"))); + auto scan = h::IndexScanFromStrings; + auto wordScan = h::TextIndexScanForWord; + auto entityScan = h::TextIndexScanForEntity; + h::expect( + "PREFIX : <> SELECT ?x ?y WHERE { ?x :is-a :Politician . ?c " + "ql:contains-entity ?x . ?c ql:contains-word \"friend*\" . ?c " + "ql:contains-entity ?y }", + h::UnorderedJoins(scan("?x", "", ""), + entityScan(Var{"?c"}, Var{"?x"}, "friend*"), + wordScan(Var{"?c"}, "friend*"), + entityScan(Var{"?c"}, Var{"?y"}, "friend*"))); } TEST(QueryExecutionTreeTest, testPoliticiansFriendWithScieManHatProj) { - ParsedQuery pq = SparqlParser::parseQuery( - "SELECT ?p ?s \n " + auto scan = h::IndexScanFromStrings; + auto wordScan = h::TextIndexScanForWord; + auto entityScan = h::TextIndexScanForEntity; + h::expect( + "SELECT ?p ?s" "WHERE {" "?a . " "?c ql:contains-entity ?a ." @@ -795,24 +420,16 @@ TEST(QueryExecutionTreeTest, testPoliticiansFriendWithScieManHatProj) { "?c ql:contains-entity ?s ." "?s ." "?c2 ql:contains-entity ?s ." 
- "?c2 ql:contains-word \"manhattan project\"} TEXTLIMIT 1"); - QueryPlanner qp(nullptr); - QueryExecutionTree qet = qp.createExecutionTree(pq); - ASSERT_EQ( - "{\n TEXT OPERATION WITH FILTER: co-occurrence with words: \"manhattan " - "project\" and 1 variables with textLimit = 1 filtered by\n {\n " - "JOIN\n {\n SORT(internal) on columns:asc(2) \n {\n " - "TEXT OPERATION WITH FILTER: co-occurrence with words: \"friend*\" and 2 " - "variables with textLimit = 1 filtered by\n {\n SCAN POS " - "with P = \"\", O = \"\"\n qet-width: 1 \n " - " }\n filtered on column 0\n qet-width: 5 \n }\n " - " qet-width: 5 \n } join-column: [2]\n |X|\n {\n SCAN " - "POS with P = \"\", O = \"\"\n qet-width: 1 \n " - "} join-column: [0]\n qet-width: 5 \n }\n filtered on column 2\n " - "qet-width: 7 \n}", - qet.getCacheKey()); + "?c2 ql:contains-word \"manhattan project\"}", + h::UnorderedJoins(scan("?a", "", ""), + entityScan(Var{"?c"}, Var{"?a"}, "friend*"), + wordScan(Var{"?c"}, "friend*"), + entityScan(Var{"?c"}, Var{"?s"}, "friend*"), + scan("?s", "", ""), + entityScan(Var{"?c2"}, Var{"?s"}, "manhattan"), + wordScan(Var{"?c2"}, "manhattan"), + wordScan(Var{"?c2"}, "project"))); } - */ TEST(QueryExecutionTreeTest, testCyclicQuery) { ParsedQuery pq = SparqlParser::parseQuery( @@ -1158,3 +775,103 @@ TEST(QueryPlanner, BindAtBeginningOfQuery) { " BIND (3 + 5 AS ?x) }", h::Bind(h::NeutralElementOperation(), "3 + 5", Variable{"?x"})); } + +// __________________________________________________________________________ +TEST(QueryPlannerTest, TextIndexScanForWord) { + auto qec = ad_utility::testing::getQec( + "

\"this text contains some words and is part of the test\" . " + "

\"testEntity\" .

\"picking the right text can be a hard " + "test\" .

\"sentence for multiple words tests\" . " + "

\"testing and picking\"", + true, true, true, 16_B, true); + auto wordScan = h::TextIndexScanForWord; + + h::expect("SELECT * WHERE { ?text ql:contains-word \"test*\" }", + wordScan(Var{"?text"}, "test*"), qec); + + h::expect("SELECT * WHERE { ?text2 ql:contains-word \"test\" }", + wordScan(Var{"?text2"}, "test"), qec); + + h::expect( + "SELECT * WHERE { ?text2 ql:contains-word \"multiple words* test\" }", + h::UnorderedJoins(wordScan(Var{"?text2"}, "test"), + wordScan(Var{"?text2"}, "words*"), + wordScan(Var{"?text2"}, "multiple")), + qec); + + AD_EXPECT_THROW_WITH_MESSAGE( + SparqlParser::parseQuery( + "SELECT * WHERE { ?text ql:contains-word . }"), + ::testing::ContainsRegex( + "ql:contains-word has to be followed by a string in quotes")); +} + +// __________________________________________________________________________ +TEST(QueryPlannerTest, TextIndexScanForEntity) { + auto qec = ad_utility::testing::getQec( + "

\"this text contains some words and is part of the test\" . " + "

.

\"picking the right text can be a hard " + "test\" .

\"only this text contains the word opti \" . " + "

\"testing and picking\"", + true, true, true, 16_B, true); + + auto wordScan = h::TextIndexScanForWord; + auto entityScan = h::TextIndexScanForEntity; + h::expect( + "SELECT * WHERE { ?text ql:contains-entity ?scientist . ?text " + "ql:contains-word \"test*\" }", + h::Join(wordScan(Var{"?text"}, "test*"), + entityScan(Var{"?text"}, Var{"?scientist"}, "test*")), + qec); + + h::expect( + "SELECT * WHERE { ?text ql:contains-entity . ?text " + "ql:contains-word \"test\" }", + h::Join(wordScan(Var{"?text"}, "test"), + entityScan(Var{"?text"}, "", "test")), + qec); + + // Test case sensitivity + h::expect( + "SELECT * WHERE { ?text ql:contains-entity . ?text " + "ql:contains-word \"TeST\" }", + h::Join(wordScan(Var{"?text"}, "test"), + entityScan(Var{"?text"}, "", "test")), + qec); + + // NOTE: It is important that the TextIndexScanForEntity uses "opti", because + // we also want to test here if the QueryPlanner assigns the optimal word to + // the Operation. + h::expect( + "SELECT * WHERE { ?text ql:contains-word \"picking*\" . ?text " + "ql:contains-entity . ?text ql:contains-word " + "\"opti\" . ?text ql:contains-word \"testi*\"}", + h::UnorderedJoins(entityScan(Var{"?text"}, "", "opti"), + wordScan(Var{"?text"}, "testi*"), + wordScan(Var{"?text"}, "opti"), + wordScan(Var{"?text"}, "picking*")), + qec); + + ParsedQuery pq = SparqlParser::parseQuery( + "SELECT * WHERE { ?text ql:contains-entity ?scientist . }"); + QueryPlanner qp(nullptr); + AD_EXPECT_THROW_WITH_MESSAGE( + qp.createExecutionTree(pq), + ::testing::ContainsRegex( + "Missing ql:contains-word statement. A ql:contains-entity statement " + "always also needs corresponding ql:contains-word statement.")); +} + +// __________________________________________________________________________ +TEST(QueryPlannerTest, TooManyTriples) { + std::string query = "SELECT * WHERE {"; + for (size_t i = 0; i < 65; i++) { + query = absl::StrCat(query, " ?x

?y ."); + } + query = absl::StrCat(query, "}"); + ParsedQuery pq = SparqlParser::parseQuery(query); + QueryPlanner qp(nullptr); + AD_EXPECT_THROW_WITH_MESSAGE( + qp.createExecutionTree(pq), + ::testing::ContainsRegex("At most 64 triples allowed at the moment.")); +} diff --git a/test/QueryPlannerTestHelpers.h b/test/QueryPlannerTestHelpers.h index 51cc962d7b..0b785e69e7 100644 --- a/test/QueryPlannerTestHelpers.h +++ b/test/QueryPlannerTestHelpers.h @@ -15,6 +15,8 @@ #include "engine/QueryExecutionTree.h" #include "engine/QueryPlanner.h" #include "engine/Sort.h" +#include "engine/TextIndexScanForEntity.h" +#include "engine/TextIndexScanForWord.h" #include "engine/TransitivePath.h" #include "gmock/gmock-matchers.h" #include "gmock/gmock.h" @@ -86,6 +88,41 @@ inline auto IndexScan = AD_PROPERTY(IndexScan, getObject, Eq(object)))); }; +inline auto TextIndexScanForWord = [](Variable textRecordVar, + string word) -> QetMatcher { + return RootOperation<::TextIndexScanForWord>(AllOf( + AD_PROPERTY(::TextIndexScanForWord, getResultWidth, + Eq(1 + word.ends_with('*'))), + AD_PROPERTY(::TextIndexScanForWord, textRecordVar, Eq(textRecordVar)), + AD_PROPERTY(::TextIndexScanForWord, word, word))); +}; + +inline auto TextIndexScanForEntity = + [](Variable textRecordVar, std::variant entity, + string word) -> QetMatcher { + // TODO: Implement AD_THROWING_PROPERTY(..., Exception matcher) and use it + // here to test the contract-checks in entityVariable() and fixedEntity(). 
+ if (std::holds_alternative(entity)) { + return RootOperation<::TextIndexScanForEntity>(AllOf( + AD_PROPERTY(::TextIndexScanForEntity, getResultWidth, + Eq(2 + std::holds_alternative(entity))), + AD_PROPERTY(::TextIndexScanForEntity, textRecordVar, Eq(textRecordVar)), + AD_PROPERTY(::TextIndexScanForEntity, entityVariable, + std::get(entity)), + AD_PROPERTY(::TextIndexScanForEntity, word, word), + AD_PROPERTY(::TextIndexScanForEntity, hasFixedEntity, false))); + } else { + return RootOperation<::TextIndexScanForEntity>(AllOf( + AD_PROPERTY(::TextIndexScanForEntity, getResultWidth, + Eq(2 + std::holds_alternative(entity))), + AD_PROPERTY(::TextIndexScanForEntity, textRecordVar, Eq(textRecordVar)), + AD_PROPERTY(::TextIndexScanForEntity, fixedEntity, + std::get(entity)), + AD_PROPERTY(::TextIndexScanForEntity, word, word), + AD_PROPERTY(::TextIndexScanForEntity, hasFixedEntity, true))); + } +}; + inline auto Bind = [](const QetMatcher& childMatcher, std::string_view expression, Variable target) -> QetMatcher { diff --git a/test/SparqlParserTest.cpp b/test/SparqlParserTest.cpp index 22f130e455..f15e444e21 100644 --- a/test/SparqlParserTest.cpp +++ b/test/SparqlParserTest.cpp @@ -841,22 +841,22 @@ TEST(ParserTest, testSolutionModifiers) { { auto pq = SparqlParser::parseQuery( - "SELECT DISTINCT ?x ?ql_textscore_x ?y WHERE \t {?x " + "SELECT DISTINCT ?x ?ql_score_x_var_y ?y WHERE \t {?x " "ql:contains-entity ?y}\n" - "ORDER BY ASC(?y) DESC(?ql_textscore_x) LIMIT 10 OFFSET 15"); + "ORDER BY ASC(?y) DESC(?ql_score_x_var_y) LIMIT 10 OFFSET 15"); ASSERT_TRUE(pq.hasSelectClause()); const auto& selectClause = pq.selectClause(); ASSERT_EQ(1u, pq.children().size()); const auto& c = pq.children()[0].getBasic(); ASSERT_EQ(3u, selectClause.getSelectedVariables().size()); - ASSERT_EQ(Var{"?ql_textscore_x"}, selectClause.getSelectedVariables()[1]); + ASSERT_EQ(Var{"?ql_score_x_var_y"}, selectClause.getSelectedVariables()[1]); ASSERT_EQ(1u, c._triples.size()); ASSERT_EQ(10u, 
pq._limitOffset._limit); ASSERT_EQ(15u, pq._limitOffset._offset); ASSERT_EQ(size_t(2), pq._orderBy.size()); ASSERT_EQ(Var{"?y"}, pq._orderBy[0].variable_); ASSERT_FALSE(pq._orderBy[0].isDescending_); - ASSERT_EQ(Var{"?ql_textscore_x"}, pq._orderBy[1].variable_); + ASSERT_EQ(Var{"?ql_score_x_var_y"}, pq._orderBy[1].variable_); ASSERT_TRUE(pq._orderBy[1].isDescending_); ASSERT_TRUE(selectClause.distinct_); ASSERT_FALSE(selectClause.reduced_); diff --git a/test/VocabularyTest.cpp b/test/VocabularyTest.cpp index c00693d6a1..34c69e01f4 100644 --- a/test/VocabularyTest.cpp +++ b/test/VocabularyTest.cpp @@ -48,28 +48,32 @@ TEST(VocabularyTest, getIdRangeForFullTextPrefixTest) { v.createFromSet(s); uint64_t word0 = 0; - IdRange retVal; // Match exactly one - ASSERT_TRUE(v.getIdRangeForFullTextPrefix("wordA1*", &retVal)); - ASSERT_EQ(word0 + 1, retVal.first().get()); - ASSERT_EQ(word0 + 1, retVal.last().get()); + auto retVal = v.getIdRangeForFullTextPrefix("wordA1*"); + ASSERT_TRUE(retVal.has_value()); + ASSERT_EQ(word0 + 1, retVal.value().first().get()); + ASSERT_EQ(word0 + 1, retVal.value().last().get()); // Match all - ASSERT_TRUE(v.getIdRangeForFullTextPrefix("word*", &retVal)); - ASSERT_EQ(word0, retVal.first().get()); - ASSERT_EQ(word0 + 4, retVal.last().get()); + retVal = v.getIdRangeForFullTextPrefix("word*"); + ASSERT_TRUE(retVal.has_value()); + ASSERT_EQ(word0, retVal.value().first().get()); + ASSERT_EQ(word0 + 4, retVal.value().last().get()); // Match first two - ASSERT_TRUE(v.getIdRangeForFullTextPrefix("wordA*", &retVal)); - ASSERT_EQ(word0, retVal.first().get()); - ASSERT_EQ(word0 + 1, retVal.last().get()); + retVal = v.getIdRangeForFullTextPrefix("wordA*"); + ASSERT_TRUE(retVal.has_value()); + ASSERT_EQ(word0, retVal.value().first().get()); + ASSERT_EQ(word0 + 1, retVal.value().last().get()); // Match last three - ASSERT_TRUE(v.getIdRangeForFullTextPrefix("wordB*", &retVal)); - ASSERT_EQ(word0 + 2, retVal.first().get()); - ASSERT_EQ(word0 + 4, 
retVal.last().get()); + retVal = v.getIdRangeForFullTextPrefix("wordB*"); + ASSERT_TRUE(retVal.has_value()); + ASSERT_EQ(word0 + 2, retVal.value().first().get()); + ASSERT_EQ(word0 + 4, retVal.value().last().get()); - ASSERT_FALSE(v.getIdRangeForFullTextPrefix("foo*", &retVal)); + retVal = v.getIdRangeForFullTextPrefix("foo*"); + ASSERT_FALSE(retVal.has_value()); } TEST(VocabularyTest, readWriteTest) { diff --git a/test/engine/CMakeLists.txt b/test/engine/CMakeLists.txt index 2a9dfabac0..62a36c12ec 100644 --- a/test/engine/CMakeLists.txt +++ b/test/engine/CMakeLists.txt @@ -1,3 +1,5 @@ add_subdirectory(idTable) addLinkAndDiscoverTest(IndexScanTest engine) addLinkAndDiscoverTest(CartesianProductJoinTest engine) +addLinkAndDiscoverTest(TextIndexScanForWordTest engine) +addLinkAndDiscoverTest(TextIndexScanForEntityTest engine) diff --git a/test/engine/TextIndexScanForEntityTest.cpp b/test/engine/TextIndexScanForEntityTest.cpp new file mode 100644 index 0000000000..9ab86cb8d7 --- /dev/null +++ b/test/engine/TextIndexScanForEntityTest.cpp @@ -0,0 +1,155 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Nick Göckel + +#include + +#include "../IndexTestHelpers.h" +#include "../util/GTestHelpers.h" +#include "../util/IdTableHelpers.h" +#include "./TextIndexScanTestHelpers.h" +#include "engine/IndexScan.h" +#include "engine/TextIndexScanForEntity.h" +#include "parser/ParsedQuery.h" + +using namespace ad_utility::testing; +using ad_utility::source_location; +namespace h = textIndexScanTestHelpers; + +namespace { +std::string kg = + "

\"he failed the test\" .

\"testing can help\" .

" + "\"some other sentence\" .

\"the test on friday was really hard\" " + ". . ."; + +TEST(TextIndexScanForEntity, EntityScanBasic) { + auto qec = getQec(kg, true, true, true, 16_B, true); + + TextIndexScanForEntity s1{qec, Variable{"?text"}, Variable{"?entityVar"}, + "test*"}; + TextIndexScanForEntity s2{qec, Variable{"?text2"}, Variable{"?entityVar2"}, + "test*"}; + ASSERT_EQ(s1.getResultWidth(), 3); + + auto result = s1.computeResultOnlyForTesting(); + ASSERT_EQ(result.width(), 3); + ASSERT_EQ(result.size(), 3); + + // NOTE: because of the way the graph above is constructed, the entities are + // texts + ASSERT_EQ("\"he failed the test\"", + h::getEntityFromResultTable(qec, result, 0)); + ASSERT_EQ("\"testing can help\"", + h::getEntityFromResultTable(qec, result, 1)); + ASSERT_EQ("\"the test on friday was really hard\"", + h::getEntityFromResultTable(qec, result, 2)); + + using enum ColumnIndexAndTypeInfo::UndefStatus; + VariableToColumnMap expectedVariables{ + {Variable{"?text2"}, {0, AlwaysDefined}}, + {Variable{"?entityVar2"}, {1, AlwaysDefined}}, + {Variable{"?ql_score_text2_var_entityVar2"}, {2, AlwaysDefined}}}; + EXPECT_THAT(s2.getExternallyVisibleVariableColumns(), + ::testing::UnorderedElementsAreArray(expectedVariables)); +} + +TEST(TextIndexScanForEntity, FixedEntityScan) { + auto qec = getQec(kg, true, true, true, 16_B, true); + + string fixedEntity = "\"some other sentence\""; + TextIndexScanForEntity s3{qec, Variable{"?text3"}, fixedEntity, "sentence"}; + + auto result = s3.computeResultOnlyForTesting(); + ASSERT_EQ(s3.getResultWidth(), 2); + ASSERT_EQ(result.width(), 2); + ASSERT_EQ(result.size(), 1); + + using enum ColumnIndexAndTypeInfo::UndefStatus; + VariableToColumnMap expectedVariables = { + {Variable{"?text3"}, {0, AlwaysDefined}}, + {Variable{ + "?ql_score_text3_fixedEntity__34_some_32_other_32_sentence_34_"}, + {1, AlwaysDefined}}}; + EXPECT_THAT(s3.getExternallyVisibleVariableColumns(), + ::testing::UnorderedElementsAreArray(expectedVariables)); + + 
ASSERT_EQ(fixedEntity, h::getTextRecordFromResultTable(qec, result, 0)); + + fixedEntity = "\"he failed the test\""; + TextIndexScanForEntity s4{qec, Variable{"?text4"}, fixedEntity, "test*"}; + result = s4.computeResultOnlyForTesting(); + ASSERT_EQ(result.width(), 2); + ASSERT_EQ(result.size(), 1); + + ASSERT_EQ(fixedEntity, h::getTextRecordFromResultTable(qec, result, 0)); +} + +TEST(TextIndexScanForEntity, CacheKeys) { + auto qec = getQec(kg, true, true, true, 16_B, true); + + TextIndexScanForEntity s1{qec, Variable{"?text"}, Variable{"?entityVar"}, + "test*"}; + TextIndexScanForEntity s2{qec, Variable{"?text2"}, Variable{"?entityVar2"}, + "test*"}; + // Different text vars, different entity vars, same word (both with prefix) + ASSERT_EQ(s1.getCacheKeyImpl(), s2.getCacheKeyImpl()); + + TextIndexScanForEntity s3{qec, Variable{"?text3"}, Variable{"?entityVar"}, + "test"}; + // Different text vars, same entity var, different words (one with, one + // without prefix) + ASSERT_NE(s1.getCacheKeyImpl(), s3.getCacheKeyImpl()); + + TextIndexScanForEntity s4{qec, Variable{"?text4"}, Variable{"?entityVar"}, + "sentence*"}; + // Different text vars, same entity var, different words (both with prefix) + ASSERT_NE(s1.getCacheKeyImpl(), s4.getCacheKeyImpl()); + + // fixed entity case + string fixedEntity = "\"some other sentence\""; + TextIndexScanForEntity s5{qec, Variable{"?text3"}, fixedEntity, "sentence"}; + // Same text var, different entities (one entity var, one fixed entity), same + // word + ASSERT_NE(s3.getCacheKeyImpl(), s5.getCacheKeyImpl()); + + TextIndexScanForEntity s6{qec, Variable{"?text6"}, fixedEntity, "sentence"}; + // Different text vars, same fixed entity, same word + ASSERT_EQ(s5.getCacheKeyImpl(), s6.getCacheKeyImpl()); + + string newFixedEntity = "\"he failed the test\""; + TextIndexScanForEntity s7{qec, Variable{"?text7"}, newFixedEntity, + "sentence"}; + // Different text vars, different fixed entities, same word + ASSERT_NE(s5.getCacheKeyImpl(), 
s7.getCacheKeyImpl()); + + TextIndexScanForEntity s8{qec, Variable{"?text7"}, newFixedEntity, + "sentences"}; + // Same text var, same fixed entitiy, different words + ASSERT_NE(s7.getCacheKeyImpl(), s8.getCacheKeyImpl()); +} + +TEST(TextIndexScanForEntity, KnownEmpty) { + auto qec = getQec(kg, true, true, true, 16_B, true); + + TextIndexScanForEntity s1{qec, Variable{"?text"}, Variable{"?entityVar"}, + "nonExistentWord*"}; + ASSERT_TRUE(s1.knownEmptyResult()); + + string fixedEntity = "\"non existent entity\""; + AD_EXPECT_THROW_WITH_MESSAGE( + TextIndexScanForEntity(qec, Variable{"?text"}, fixedEntity, "test*"), + ::testing::ContainsRegex(absl::StrCat( + "The entity ", fixedEntity, + " is not part of the underlying knowledge graph and can therefore " + "not be used as the object of ql:contains-entity"))); + + TextIndexScanForEntity s2{qec, Variable{"?text"}, Variable{"?entityVar"}, + "test*"}; + ASSERT_TRUE(!s2.knownEmptyResult()); + + TextIndexScanForEntity s3{qec, Variable{"?text"}, Variable{"?entityVar"}, + "test"}; + ASSERT_TRUE(!s3.knownEmptyResult()); +} + +} // namespace diff --git a/test/engine/TextIndexScanForWordTest.cpp b/test/engine/TextIndexScanForWordTest.cpp new file mode 100644 index 0000000000..5f1741d955 --- /dev/null +++ b/test/engine/TextIndexScanForWordTest.cpp @@ -0,0 +1,127 @@ +// Copyright 2023, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Nick Göckel + +#include +#include + +#include "../IndexTestHelpers.h" +#include "../util/GTestHelpers.h" +#include "../util/IdTableHelpers.h" +#include "./TextIndexScanTestHelpers.h" +#include "engine/IndexScan.h" +#include "engine/TextIndexScanForWord.h" +#include "parser/ParsedQuery.h" + +using namespace ad_utility::testing; +using ad_utility::source_location; +namespace h = textIndexScanTestHelpers; + +namespace { +std::string kg = + "

\"he failed the test\" .

\"testing can help\" .

" + "\"some other sentence\" .

\"the test on friday was really hard\" " + ". . ."; + +TEST(TextIndexScanForWord, WordScanPrefix) { + auto qec = getQec(kg, true, true, true, 16_B, true); + + TextIndexScanForWord s1{qec, Variable{"?text1"}, "test*"}; + TextIndexScanForWord s2{qec, Variable{"?text2"}, "test*"}; + + ASSERT_EQ(s1.getResultWidth(), 2); + + auto result = s1.computeResultOnlyForTesting(); + ASSERT_EQ(result.width(), 2); + ASSERT_EQ(result.size(), 3); + s2.getExternallyVisibleVariableColumns(); + + using enum ColumnIndexAndTypeInfo::UndefStatus; + VariableToColumnMap expectedVariables{ + {Variable{"?text2"}, {0, AlwaysDefined}}, + {Variable{"?ql_matchingword_text2_test"}, {1, AlwaysDefined}}}; + EXPECT_THAT(s2.getExternallyVisibleVariableColumns(), + ::testing::UnorderedElementsAreArray(expectedVariables)); + + ASSERT_EQ(h::combineToString("\"he failed the test\"", "test"), + h::combineToString(h::getTextRecordFromResultTable(qec, result, 0), + h::getWordFromResultTable(qec, result, 0))); + ASSERT_EQ(h::combineToString("\"testing can help\"", "testing"), + h::combineToString(h::getTextRecordFromResultTable(qec, result, 1), + h::getWordFromResultTable(qec, result, 1))); + ASSERT_EQ( + h::combineToString("\"the test on friday was really hard\"", "test"), + h::combineToString(h::getTextRecordFromResultTable(qec, result, 2), + h::getWordFromResultTable(qec, result, 2))); +} + +TEST(TextIndexScanForWord, WordScanBasic) { + auto qec = getQec(kg, true, true, true, 16_B, true); + + TextIndexScanForWord s1{qec, Variable{"?text1"}, "test"}; + + ASSERT_EQ(s1.getResultWidth(), 1); + + auto result = s1.computeResultOnlyForTesting(); + ASSERT_EQ(result.width(), 1); + ASSERT_EQ(result.size(), 2); + + ASSERT_EQ("\"he failed the test\"", + h::getTextRecordFromResultTable(qec, result, 0)); + ASSERT_EQ("\"the test on friday was really hard\"", + h::getTextRecordFromResultTable(qec, result, 1)); + + TextIndexScanForWord s2{qec, Variable{"?text1"}, "testing"}; + + ASSERT_EQ(s2.getResultWidth(), 1); + + 
result = s2.computeResultOnlyForTesting(); + ASSERT_EQ(result.width(), 1); + ASSERT_EQ(result.size(), 1); + + ASSERT_EQ("\"testing can help\"", + h::getTextRecordFromResultTable(qec, result, 0)); +} + +TEST(TextIndexScanForWord, CacheKey) { + auto qec = getQec(kg, true, true, true, 16_B, true); + + TextIndexScanForWord s1{qec, Variable{"?text1"}, "test*"}; + TextIndexScanForWord s2{qec, Variable{"?text2"}, "test*"}; + // Different text variables, same word (both with prefix) + ASSERT_EQ(s1.getCacheKeyImpl(), s2.getCacheKeyImpl()); + + TextIndexScanForWord s3{qec, Variable{"?text1"}, "test"}; + // Same text variable, different words (one with, one without prefix) + ASSERT_NE(s1.getCacheKeyImpl(), s3.getCacheKeyImpl()); + + TextIndexScanForWord s4{qec, Variable{"?text1"}, "tests"}; + // Same text variable, different words (both without prefix) + ASSERT_NE(s3.getCacheKeyImpl(), s4.getCacheKeyImpl()); + + TextIndexScanForWord s5{qec, Variable{"?text2"}, "tests"}; + // Different text variables, different words (both without prefix) + ASSERT_NE(s3.getCacheKeyImpl(), s5.getCacheKeyImpl()); + // Different text variables, same words (both without prefix) + ASSERT_EQ(s4.getCacheKeyImpl(), s5.getCacheKeyImpl()); +} + +TEST(TextIndexScanForWord, KnownEmpty) { + auto qec = getQec(kg, true, true, true, 16_B, true); + + TextIndexScanForWord s1{qec, Variable{"?text1"}, "nonExistentWord*"}; + ASSERT_TRUE(s1.knownEmptyResult()); + + TextIndexScanForWord s2{qec, Variable{"?text1"}, "nonExistentWord"}; + ASSERT_TRUE(s2.knownEmptyResult()); + + TextIndexScanForWord s3{qec, Variable{"?text1"}, "test"}; + ASSERT_TRUE(!s3.knownEmptyResult()); + + TextIndexScanForWord s4{qec, Variable{"?text1"}, "test*"}; + ASSERT_TRUE(!s4.knownEmptyResult()); + + TextIndexScanForWord s5{qec, Variable{"?text1"}, "testing"}; + ASSERT_TRUE(!s5.knownEmptyResult()); +} +} // namespace diff --git a/test/engine/TextIndexScanTestHelpers.h b/test/engine/TextIndexScanTestHelpers.h new file mode 100644 index 
0000000000..25ff7f3aaf --- /dev/null +++ b/test/engine/TextIndexScanTestHelpers.h @@ -0,0 +1,43 @@ +// Copyright 2024, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Nick Göckel + +#pragma once + +namespace textIndexScanTestHelpers { +// NOTE: this function exploits a "lucky accident" that allows us to +// obtain the textRecord using idToOptionalString. +// TODO: Implement a more elegant/stable version +inline string getTextRecordFromResultTable(const QueryExecutionContext* qec, + const ResultTable& result, + const size_t& rowIndex) { + return qec->getIndex() + .idToOptionalString( + result.idTable().getColumn(0)[rowIndex].getVocabIndex()) + .value(); +} + +inline string getEntityFromResultTable(const QueryExecutionContext* qec, + const ResultTable& result, + const size_t& rowIndex) { + return qec->getIndex() + .idToOptionalString( + result.idTable().getColumn(1)[rowIndex].getVocabIndex()) + .value(); +} + +inline string getWordFromResultTable(const QueryExecutionContext* qec, + const ResultTable& result, + const size_t& rowIndex) { + return qec->getIndex() + .idToOptionalString( + result.idTable().getColumn(1)[rowIndex].getWordVocabIndex()) + .value(); +} + +inline string combineToString(const string& text, const string& word) { + std::stringstream ss; + ss << "Text: " << text << ", Word: " << word << std::endl; + return ss.str(); +} +} // namespace textIndexScanTestHelpers diff --git a/test/util/IndexTestHelpers.cpp b/test/util/IndexTestHelpers.cpp index 44c69e1864..4bff3a5137 100644 --- a/test/util/IndexTestHelpers.cpp +++ b/test/util/IndexTestHelpers.cpp @@ -123,7 +123,8 @@ Index makeTestIndex(const std::string& indexBasename, std::optional turtleInput, bool loadAllPermutations, bool usePatterns, bool usePrefixCompression, - ad_utility::MemorySize blocksizePermutations) { + ad_utility::MemorySize blocksizePermutations, + bool createTextIndex) { // Ignore the (irrelevant) log output of the index building and loading during // 
these tests. static std::ostringstream ignoreLogStream; @@ -155,6 +156,9 @@ Index makeTestIndex(const std::string& indexBasename, index.setPrefixCompression(usePrefixCompression); index.loadAllPermutations() = loadAllPermutations; index.createFromFile(inputFilename); + if (createTextIndex) { + index.addTextFromContextFile("", true); + } } if (!usePatterns || !loadAllPermutations) { // If we have no patterns, or only two permutations, then check the graceful @@ -172,6 +176,9 @@ Index makeTestIndex(const std::string& indexBasename, index.usePatterns() = usePatterns; index.loadAllPermutations() = loadAllPermutations; index.createFromOnDiskIndex(indexBasename); + if (createTextIndex) { + index.addTextFromOnDiskIndex(); + } ad_utility::setGlobalLoggingStream(&std::cout); if (usePatterns && loadAllPermutations) { @@ -184,7 +191,8 @@ Index makeTestIndex(const std::string& indexBasename, QueryExecutionContext* getQec(std::optional turtleInput, bool loadAllPermutations, bool usePatterns, bool usePrefixCompression, - ad_utility::MemorySize blocksizePermutations) { + ad_utility::MemorySize blocksizePermutations, + bool createTextIndex) { // Similar to `absl::Cleanup`. Calls the `callback_` in the destructor, but // the callback is stored as a `std::function`, which allows to store // different types of callbacks in the same wrapper type. @@ -230,20 +238,20 @@ QueryExecutionContext* getQec(std::optional turtleInput, std::string testIndexBasename = "_staticGlobalTestIndex" + std::to_string(contextMap.size()); contextMap.emplace( - key, - Context{TypeErasedCleanup{[testIndexBasename]() { - for (const std::string& indexFilename : - getAllIndexFilenames(testIndexBasename)) { - // Don't log when a file can't be deleted, - // because the logging might already be - // destroyed. 
- ad_utility::deleteFile(indexFilename, false); - } - }}, - std::make_unique(makeTestIndex( - testIndexBasename, turtleInput, loadAllPermutations, - usePatterns, usePrefixCompression, blocksizePermutations)), - std::make_unique()}); + key, Context{TypeErasedCleanup{[testIndexBasename]() { + for (const std::string& indexFilename : + getAllIndexFilenames(testIndexBasename)) { + // Don't log when a file can't be deleted, + // because the logging might already be + // destroyed. + ad_utility::deleteFile(indexFilename, false); + } + }}, + std::make_unique(makeTestIndex( + testIndexBasename, turtleInput, loadAllPermutations, + usePatterns, usePrefixCompression, + blocksizePermutations, createTextIndex)), + std::make_unique()}); } return contextMap.at(key).qec_.get(); }