Skip to content

Commit 8f9b13a

Browse files
authored
Completely refactor the fulltext operations (#1093)
As of this commit, the fulltext index (triggered by `ql:contains-word` and `ql:contains-entity`) uses two basic operations: 1. `TextIndexScanForWord`: For a given word or prefix, return all text records that contain the word (possibly together with the matched word in the case of a prefix, and the score of the match). 2. `TextIndexScanForEntity`: For a given word or prefix, return a superset of all pairs of `(text, entity)` where the entity is contained in the text according to `ql:contains-entity` and the text contains the `word`. For technical reasons this is a superset: We always have to scan the complete block from the half-inverted index, which might belong to a shorter prefix. The general processing is then as follows: * For each word or prefix that appears as part of the object of a `ql:contains-word` triple, a `TextIndexScanForWord` is created. * For each entity or variable that appears as the object of a `ql:contains-entity` triple, a `TextIndexScanForEntity` is created. * The rest of the query processing is handled by the "ordinary" query planner using the normal operations like JOIN that are also used to process standard SPARQL queries. This is much cleaner than the old `TextOperationWith[out]Filter` operations, which combined the functionality of the above scan operations with JOIN operations. The old approach led to a lot of code duplication (the code for a join of two tables was duplicated for the fulltext module). The new approach also makes queries easier to optimize and to reason about, because the runtime information trees become much clearer if the scans and joins are represented separately.
1 parent f7c2c32 commit 8f9b13a

33 files changed

+1624
-863
lines changed

e2e/scientists_queries.yaml

Lines changed: 73 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,16 @@ queries:
55
- query: relativ-star-scientists
66
type: text
77
sparql: |
8-
SELECT ?x ?t ?ql_textscore_t WHERE {
8+
SELECT ?x ?t ?ql_score_t_var_x WHERE {
99
?x <is-a> <Scientist> .
1010
?t ql:contains-entity ?x .
1111
?t ql:contains-word "relati*"
1212
}
13-
ORDER BY DESC(?ql_textscore_t)
13+
ORDER BY DESC(?ql_score_t_var_x)
1414
checks:
1515
- num_cols: 3
1616
- num_rows: 4285
17-
- selected: [ "?x", "?t", "?ql_textscore_t"]
17+
- selected: [ "?x", "?t", "?ql_score_t_var_x"]
1818
- contains_row:
1919
- "<Albert_Einstein>"
2020
- "He realized, however, that the principle of relativity could also be extended
@@ -23,30 +23,27 @@ queries:
2323
- null
2424
- contains_row: [ "<Albert_Einstein>", null, null ] # null cells are ignored
2525
- contains_row: [ "<Luís_Lindley_Cintra>", null, null ] # Test Unicode
26-
- order_numeric: {"dir" : "DESC", "var": "?ql_textscore_t"}
26+
- order_numeric: {"dir" : "DESC", "var": "?ql_score_t_var_x"}
2727

2828

29-
- query: relativ-star-scientists-from-ulm # should use TextOperationWithFilter
29+
- query: relativ-star-scientists-from-ulm
3030
type: text
3131
sparql: |
32-
SELECT ?x ?t ?ql_textscore_t WHERE {
32+
SELECT ?x ?t WHERE {
3333
?x <is-a> <Scientist> .
3434
?x <Place_of_birth> <Ulm> .
3535
?t ql:contains-entity ?x .
3636
?t ql:contains-word "relati*"
3737
}
38-
ORDER BY DESC(?ql_textscore_t)
39-
TEXTLIMIT 1
4038
checks:
41-
- num_cols: 3
42-
- num_rows: 1
43-
- selected: [ "?x", "?t", "?ql_textscore_t" ]
39+
- num_cols: 2
40+
- num_rows: 172
41+
- selected: [ "?x", "?t"]
4442
- contains_row:
4543
- "<Albert_Einstein>"
4644
- "He realized, however, that the principle of relativity could also be extended
4745
to gravitational fields, and with his subsequent theory of gravitation in 1916,
4846
he published a paper on general relativity."
49-
- null
5047

5148
- query: relat-star-Physikalische-real-star-scientists-from-ulm
5249
type: text
@@ -55,11 +52,11 @@ queries:
5552
?x <is-a> <Scientist> .
5653
?x <Place_of_birth> <Ulm> .
5754
?t ql:contains-entity ?x .
58-
?t ql:contains-word "relat* Physikalische rela*"
55+
?t ql:contains-word "RElaT* phySIKalische rela*"
5956
}
6057
checks:
6158
- num_cols: 5
62-
- selected: [ "?x", "?ql_textscore_t", "?t", "?ql_matchingword_t_relat", "?ql_matchingword_t_rela" ]
59+
- selected: [ "?x", "?ql_score_t_var_x", "?t", "?ql_matchingword_t_relat", "?ql_matchingword_t_rela" ]
6360
- contains_row:
6461
- "<Albert_Einstein>"
6562
- null
@@ -88,73 +85,116 @@ queries:
8885
- query: algo-star-female-scientists
8986
type: text
9087
sparql: |
91-
SELECT ?x ?ql_textscore_t WHERE {
88+
SELECT ?x ?ql_score_t_var_x WHERE {
9289
?x <is-a> <Scientist> .
9390
?x <Gender> <Female> .
9491
?t ql:contains-entity ?x .
9592
?t ql:contains-word "algo*"
9693
}
97-
ORDER BY DESC(?ql_textscore_t)
94+
ORDER BY DESC(?ql_score_t_var_x)
9895
checks:
9996
- num_cols: 2
10097
- num_rows: 27
101-
- selected: [ "?x", "?ql_textscore_t" ]
98+
- selected: [ "?x", "?ql_score_t_var_x" ]
10299
- contains_row: [ "<Grete_Hermann>", null ]
103-
- order_numeric: {"dir": "DESC", "var" : "?ql_textscore_t"}
100+
- order_numeric: {"dir": "DESC", "var" : "?ql_score_t_var_x"}
104101

105102

106-
- query: algor-start-female-born-before-1940
103+
- query: algor-star-female-born-before-1940
107104
type: text
108105
sparql: |
109106
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
110-
SELECT ?x ?date ?t ?ql_textscore_t ?ql_matchingword_t_algor WHERE {
107+
SELECT ?x ?date ?t ?ql_matchingword_t_algor WHERE {
111108
?x <is-a> <Scientist> .
112109
?x <Date_of_birth> ?date .
113110
?x <Gender> <Female> .
114111
?t ql:contains-entity ?x .
115112
?t ql:contains-word "algor*" .
116113
FILTER (?date < "1940-01-01"^^xsd:date)
117114
}
118-
ORDER BY DESC(?ql_textscore_t)
119115
checks:
120-
- num_cols: 5
116+
- num_cols: 4
121117
- num_rows: 4
122118
- contains_row:
123119
- "<Grete_Hermann>"
124120
- "1901-03-02"
125121
- "Hermann's algorithm for primary decomposition is still in use now."
126-
- null
127122
- "algorithm"
128123
- contains_row:
129124
- "<Ada_Lovelace>"
130125
- "1815-12-10"
131126
- "Her notes on the engine include what is recognised as the first algorithm intended to be carried out by a machine."
132-
- null
133127
- "algorithm"
134-
- order_numeric: {"dir": "DESC", "var" : "?ql_textscore_t"}
135128

136-
- query: algorithm-hermann-start-female-born-before-1940
129+
- query: algor-star-female-fixedEntity-ada-ordered
130+
type: text
131+
sparql: |
132+
SELECT * WHERE {
133+
?scientist <is-a> <Scientist> .
134+
?scientist <Gender> <Female> .
135+
?text ql:contains-entity ?scientist .
136+
?text ql:contains-entity <Ada_Lovelace> .
137+
?text ql:contains-word "rela*" .
138+
}
139+
ORDER BY DESC(?ql_score_text_fixedEntity__60_Ada_95_Lovelace_62_)
140+
checks:
141+
- num_cols: 5
142+
- num_rows: 7
143+
- contains_row:
144+
- "<Ada_Lovelace>"
145+
- null
146+
- "As a teenager, her mathematical talents led her to an ongoing
147+
working relationship and friendship with fellow British mathematician
148+
Charles Babbage, also known as' the father of computers', and in
149+
particular, Babbage's work on the Analytical Engine."
150+
- null
151+
- "relationship"
152+
- order_numeric: {"dir": "DESC",
153+
"var" : "?ql_score_text_fixedEntity__60_Ada_95_Lovelace_62_"}
154+
155+
- query: algor-star-female-fixedEntity-ada-fixed-Entity-mary
156+
type: text
157+
sparql: |
158+
SELECT * WHERE {
159+
?scientist <is-a> <Scientist> .
160+
?scientist <Gender> <Female> .
161+
?text ql:contains-entity ?scientist .
162+
?text ql:contains-entity <Ada_Lovelace> .
163+
?text ql:contains-entity <Mary_Somerville> .
164+
?text ql:contains-word "rela*" .
165+
}
166+
checks:
167+
- num_cols: 6
168+
- num_rows: 2
169+
- contains_row:
170+
- "<Ada_Lovelace>"
171+
- null
172+
- "She became fascinated with the machine and used her relationship
173+
with Somerville to visit Babbage as often as she could."
174+
- null
175+
- null
176+
- "relationship"
177+
178+
179+
- query: algorithm-hermann-star-female-born-before-1940
137180
type: text
138181
sparql: |
139182
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
140-
SELECT ?x ?date ?t ?ql_textscore_t WHERE {
183+
SELECT ?x ?date ?t WHERE {
141184
?x <is-a> <Scientist> .
142185
?x <Date_of_birth> ?date .
143186
?x <Gender> <Female> .
144187
?t ql:contains-entity ?x .
145188
?t ql:contains-word "algorithm hermann" .
146189
FILTER (?date < "1940-01-01"^^xsd:date)
147190
}
148-
ORDER BY DESC(?ql_textscore_t)
149191
checks:
150-
- num_cols: 4
192+
- num_cols: 3
151193
- num_rows: 1
152194
- contains_row:
153195
- "<Grete_Hermann>"
154196
- "1901-03-02"
155197
- "Hermann's algorithm for primary decomposition is still in use now."
156-
- null
157-
- order_numeric: {"dir": "DESC", "var" : "?ql_textscore_t"}
158198

159199
- query: people-born-in-1901
160200
type: no-text
@@ -1239,11 +1279,11 @@ queries:
12391279
?x <Gender> <Female> .
12401280
?t ql:contains-entity ?x .
12411281
?t ql:contains-word "algo* herm* primary"
1242-
} TEXTLIMIT 1
1282+
}
12431283
checks:
12441284
- num_cols: 5
12451285
- num_rows: 1
1246-
- selected: [ "?x", "?ql_textscore_t", "?t", "?ql_matchingword_t_algo", "?ql_matchingword_t_herm" ]
1286+
- selected: [ "?x", "?ql_score_t_var_x", "?t", "?ql_matchingword_t_algo", "?ql_matchingword_t_herm" ]
12471287
- contains_row: [ "<Grete_Hermann>",null,"Hermann's algorithm for primary decomposition is still in use now.","algorithm","hermann" ]
12481288

12491289

src/engine/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,6 @@ add_library(engine
1010
Union.cpp MultiColumnJoin.cpp TransitivePath.cpp Service.cpp
1111
Values.cpp Bind.cpp Minus.cpp RuntimeInformation.cpp CheckUsePatternTrick.cpp
1212
VariableToColumnMap.cpp ExportQueryExecutionTrees.cpp
13-
CartesianProductJoin.cpp
13+
CartesianProductJoin.cpp TextIndexScanForWord.cpp TextIndexScanForEntity.cpp
1414
idTable/CompressedExternalIdTable.h)
1515
qlever_target_link_libraries(engine util index parser sparqlExpressions http SortPerformanceEstimator Boost::iostreams)

src/engine/QueryExecutionTree.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
#include "engine/OrderBy.h"
2929
#include "engine/Service.h"
3030
#include "engine/Sort.h"
31+
#include "engine/TextIndexScanForEntity.h"
32+
#include "engine/TextIndexScanForWord.h"
3133
#include "engine/TextOperationWithFilter.h"
3234
#include "engine/TextOperationWithoutFilter.h"
3335
#include "engine/TransitivePath.h"
@@ -176,6 +178,10 @@ void QueryExecutionTree::setOperation(std::shared_ptr<Op> operation) {
176178
type_ = TEXT_WITH_FILTER;
177179
} else if constexpr (std::is_same_v<Op, TextOperationWithoutFilter>) {
178180
type_ = TEXT_WITHOUT_FILTER;
181+
} else if constexpr (std::is_same_v<Op, TextIndexScanForWord>) {
182+
type_ = TEXT_INDEX_SCAN_FOR_WORD;
183+
} else if constexpr (std::is_same_v<Op, TextIndexScanForEntity>) {
184+
type_ = TEXT_INDEX_SCAN_FOR_ENTITY;
179185
} else if constexpr (std::is_same_v<Op, CountAvailablePredicates>) {
180186
type_ = COUNT_AVAILABLE_PREDICATES;
181187
} else if constexpr (std::is_same_v<Op, Minus>) {
@@ -217,6 +223,10 @@ template void QueryExecutionTree::setOperation(
217223
std::shared_ptr<TextOperationWithFilter>);
218224
template void QueryExecutionTree::setOperation(
219225
std::shared_ptr<TextOperationWithoutFilter>);
226+
template void QueryExecutionTree::setOperation(
227+
std::shared_ptr<TextIndexScanForWord>);
228+
template void QueryExecutionTree::setOperation(
229+
std::shared_ptr<TextIndexScanForEntity>);
220230
template void QueryExecutionTree::setOperation(
221231
std::shared_ptr<CountAvailablePredicates>);
222232
template void QueryExecutionTree::setOperation(std::shared_ptr<Minus>);

src/engine/QueryExecutionTree.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@ class QueryExecutionTree {
4545
DISTINCT,
4646
TEXT_WITHOUT_FILTER,
4747
TEXT_WITH_FILTER,
48+
TEXT_INDEX_SCAN_FOR_WORD,
49+
TEXT_INDEX_SCAN_FOR_ENTITY,
4850
OPTIONAL_JOIN,
4951
COUNT_AVAILABLE_PREDICATES,
5052
GROUP_BY,

0 commit comments

Comments
 (0)