Skip to content

Commit f7c2c32

Browse files
authored
Add triples of ql:has-pattern predicate to PSO and POS (#1226)
The PSO and POS permutations now also contain the triples of the internal ql:has-pattern predicate. These will be used as a fallback for the new pattern implementation (which will come with one of the next commits). Note that we don't need these triples in the other four permutations, so the pair PSO&POS now has more triples than the pairs SPO&SOP and OSP&OPS.
1 parent 0bd2b6c commit f7c2c32

File tree

13 files changed

+232
-74
lines changed

13 files changed

+232
-74
lines changed

src/engine/idTable/CompressedExternalIdTable.h

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,10 @@ class CompressedExternalIdTableBase {
316316
CompressedExternalIdTableWriter writer_;
317317
std::future<void> compressAndWriteFuture_;
318318

319+
// Store whether this table has already been iterated over (in
320+
// which case this member becomes `false`).
321+
std::atomic<bool> isFirstIteration_ = true;
322+
319323
[[no_unique_address]] BlockTransformation blockTransformation_{};
320324

321325
public:
@@ -364,6 +368,7 @@ class CompressedExternalIdTableBase {
364368
}
365369
writer_.clear();
366370
numBlocksPushed_ = 0;
371+
isFirstIteration_ = true;
367372
}
368373

369374
protected:
@@ -401,6 +406,9 @@ class CompressedExternalIdTableBase {
401406
// until the pushing is actually finished, and return `true`. Using this
402407
// function allows for an efficient usage of this class for very small inputs.
403408
bool transformAndPushLastBlock() {
409+
if (!isFirstIteration_) {
410+
return numBlocksPushed_ != 0;
411+
}
404412
// If we have pushed at least one (complete) block, then the last future
405413
// from pushing a block is still in flight. If we have never pushed a block,
406414
// then also the future cannot be valid.
@@ -549,6 +557,9 @@ class CompressedExternalIdTableSorter
549557
// output phase.
550558
int numBufferedOutputBlocks_ = 4;
551559

560+
// See the `moveResultOnMerge()` getter function for documentation.
561+
bool moveResultOnMerge_ = true;
562+
552563
public:
553564
// Constructor.
554565
CompressedExternalIdTableSorter(
@@ -579,6 +590,18 @@ class CompressedExternalIdTableSorter
579590
// within this class.
580591
using Base::push;
581592

593+
// If set to `false` then the sorted result can be extracted multiple times.
594+
// If set to `true` then the result is moved out and unusable after the first
595+
// merge. In that case an exception will be thrown at the start of the second
596+
// merge.
597+
// Note: This mechanism gives a performance advantage for very small inputs
598+
// that can be completely sorted in RAM. In that case we can avoid a copy of
599+
// the sorted result.
600+
bool& moveResultOnMerge() {
601+
AD_CONTRACT_CHECK(this->isFirstIteration_);
602+
return moveResultOnMerge_;
603+
}
604+
582605
// Transition from the input phase, where `push()` can be called, to the
583606
// output phase and return a generator that yields the sorted elements one by
584607
// one. Either this function or the following function must be called exactly
@@ -594,6 +617,8 @@ class CompressedExternalIdTableSorter
594617
requires(N == NumStaticCols || N == 0)
595618
cppcoro::generator<IdTableStatic<N>> getSortedBlocks(
596619
std::optional<size_t> blocksize = std::nullopt) {
620+
// If we move the result out, there must only be a single merge phase.
621+
AD_CONTRACT_CHECK(this->isFirstIteration_ || !this->moveResultOnMerge_);
597622
mergeIsActive_.store(true);
598623
// Explanation for the second argument: One block is buffered by this
599624
// generator, one block is buffered inside the `sortedBlocks` generator, so
@@ -604,6 +629,7 @@ class CompressedExternalIdTableSorter
604629
std::max(1, numBufferedOutputBlocks_ - 2))) {
605630
co_yield block;
606631
}
632+
this->isFirstIteration_ = false;
607633
mergeIsActive_.store(false);
608634
}
609635

@@ -637,8 +663,15 @@ class CompressedExternalIdTableSorter
637663
auto& block = this->currentBlock_;
638664
const auto blocksizeOutput = blocksize.value_or(block.numRows());
639665
if (block.numRows() <= blocksizeOutput) {
640-
co_yield std::move(this->currentBlock_).template toStatic<N>();
666+
if (this->moveResultOnMerge_) {
667+
co_yield std::move(this->currentBlock_).template toStatic<N>();
668+
} else {
669+
auto blockAsStatic = IdTableStatic<N>(
670+
this->currentBlock_.clone().template toStatic<N>());
671+
co_yield blockAsStatic;
672+
}
641673
} else {
674+
// TODO<C++23> Use `std::views::chunk`.
642675
for (size_t i = 0; i < block.numRows(); i += blocksizeOutput) {
643676
size_t upper = std::min(i + blocksizeOutput, block.numRows());
644677
auto curBlock = IdTableStatic<NumStaticCols>(

src/engine/idTable/IdTable.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,8 @@ class IdTable {
124124
static constexpr bool columnsAreAllocatable =
125125
std::is_constructible_v<ColumnStorage, size_t, Allocator>;
126126

127-
using value_type = T;
127+
// The type of a single entry in a row.
128+
using single_value_type = T;
128129
// Because of the column-major layout, the `row_type` (a value type that
129130
// stores the values of a single row) and the `row_reference` (a type that
130131
// refers to a specific row of a specific `IdTable`) are different. They are
@@ -135,6 +136,11 @@ class IdTable {
135136
using row_reference = RowReference<IdTable, ad_utility::IsConst::False>;
136137
using const_row_reference = RowReference<IdTable, ad_utility::IsConst::True>;
137138

139+
// This alias is required to make the `IdTable` class work with advanced GTest
140+
// features, because GTest uses `Container::value_type` directly instead of
141+
// using `std::iterator_traits`.
142+
using value_type = row_type;
143+
138144
private:
139145
// Assign shorter aliases for some types that are important for the correct
140146
// handling of the proxy reference, but that are not visible to the outside.
@@ -526,14 +532,18 @@ class IdTable {
526532
// numColumns()` implies that the function applies a permutation to the table.
527533
// For example `setColumnSubset({1, 2, 0})` rotates the columns of a table
528534
// with three columns left by one element.
529-
void setColumnSubset(std::span<const ColumnIndex> subset) requires isDynamic {
535+
void setColumnSubset(std::span<const ColumnIndex> subset) {
530536
// First check that the `subset` is indeed a subset of the column
531537
// indices.
532538
std::vector<ColumnIndex> check{subset.begin(), subset.end()};
533539
std::ranges::sort(check);
534540
AD_CONTRACT_CHECK(std::unique(check.begin(), check.end()) == check.end());
535541
AD_CONTRACT_CHECK(!subset.empty() && subset.back() < numColumns());
536542

543+
// If the number of columns is statically fixed, then only a permutation of
544+
// the columns and not a real subset is allowed.
545+
AD_CONTRACT_CHECK(isDynamic || subset.size() == NumColumns);
546+
537547
Data newData;
538548
newData.reserve(subset.size());
539549
std::ranges::for_each(subset, [this, &newData](ColumnIndex colIdx) {

src/engine/idTable/IdTableRow.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,14 @@ class Row {
8585
friend void swap(Row& a, Row& b) { std::swap(a.data_, b.data_); }
8686

8787
bool operator==(const Row& other) const = default;
88+
89+
// Convert from a static `Row` to a `std::array` (makes a copy).
90+
explicit operator std::array<T, numStaticColumns>() const
91+
requires(numStaticColumns != 0) {
92+
std::array<T, numStaticColumns> result;
93+
std::ranges::copy(*this, result.begin());
94+
return result;
95+
}
8896
};
8997

9098
// The following two classes store a reference to a row in the underlying
@@ -120,7 +128,7 @@ class RowReferenceImpl {
120128
public:
121129
static constexpr bool isConst = isConstTag == ad_utility::IsConst::True;
122130
using TablePtr = std::conditional_t<isConst, const Table*, Table*>;
123-
using T = typename Table::value_type;
131+
using T = typename Table::single_value_type;
124132
static constexpr int numStaticColumns = Table::numStaticColumns;
125133

126134
// Grant the `IdTable` class access to the internal details.

src/global/SpecialIds.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,16 @@ static const inline ad_utility::HashMap<std::string, Id> specialIds = []() {
3131
AD_CORRECTNESS_CHECK(uniqueIds.size() == result.size());
3232
return result;
3333
}();
34+
35+
// Return the half-open range `[lowerBound, upperBound)` of the special Ids.
36+
// This range can be used to filter them out in cases where we want to ignore
37+
// triples that were added by QLever for internal reasons.
38+
static constexpr std::pair<Id, Id> getBoundsForSpecialIds() {
39+
constexpr auto upperBound = Id::makeFromBool(false);
40+
static_assert(static_cast<int>(Datatype::Undefined) == 0);
41+
static_assert(upperBound.getBits() == 1UL << Id::numDataBits);
42+
return {Id::fromBits(1), upperBound};
43+
}
3444
} // namespace qlever
3545

3646
#endif // QLEVER_SPECIALIDS_H

src/index/IndexImpl.cpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,9 @@ std::unique_ptr<ExternalSorter<SortByPSO, 5>> IndexImpl::buildOspWithPatterns(
192192
auto isQleverInternalId) {
193193
auto&& [hasPatternPredicateSortedByPSO, secondSorter] =
194194
sortersFromPatternCreator;
195+
// We need the patterns twice: once for the additional column, and once for
196+
// the additional permutation.
197+
hasPatternPredicateSortedByPSO->moveResultOnMerge() = false;
195198
// The column with index 1 always is `has-predicate` and is not needed here.
196199
// Note that the order of the columns during index building is always `SPO`,
197200
// but the sorting might be different (PSO in this case).
@@ -259,6 +262,19 @@ std::unique_ptr<ExternalSorter<SortByPSO, 5>> IndexImpl::buildOspWithPatterns(
259262
makeSorterPtr<ThirdPermutation, NumColumnsIndexBuilding + 2>("third");
260263
createSecondPermutationPair(NumColumnsIndexBuilding + 2, isQleverInternalId,
261264
std::move(blockGenerator), *thirdSorter);
265+
// Add the `ql:has-pattern` predicate to the sorter such that it will become
266+
// part of the PSO and POS permutations.
267+
LOG(INFO) << "Adding " << hasPatternPredicateSortedByPSO->size()
268+
<< " additional triples to the POS and PSO permutation for the "
269+
"`ql:has-pattern` predicate ..."
270+
<< std::endl;
271+
auto noPattern = Id::makeFromInt(NO_PATTERN);
272+
static_assert(NumColumnsIndexBuilding == 3);
273+
for (const auto& row : hasPatternPredicateSortedByPSO->sortedView()) {
274+
// The repetition of the pattern index (`row[2]`) for the fourth column is
275+
// useful for generic unit testing, but not needed otherwise.
276+
thirdSorter->push(std::array{row[0], row[1], row[2], row[2], noPattern});
277+
}
262278
return thirdSorter;
263279
}
264280
// _____________________________________________________________________________
@@ -282,7 +298,10 @@ void IndexImpl::createFromFile(const string& filename) {
282298
writeConfiguration();
283299

284300
auto isQleverInternalId = [&indexBuilderData](const auto& id) {
285-
return indexBuilderData.vocabularyMetaData_.isQleverInternalId(id);
301+
// The special internal IDs like `ql:has-pattern` (see `SpecialIds.h`)
302+
// have the datatype `Datatype::Undefined`.
303+
return indexBuilderData.vocabularyMetaData_.isQleverInternalId(id) ||
304+
id.getDatatype() == Datatype::Undefined;
286305
};
287306

288307
// For the first permutation, perform a unique.
@@ -754,6 +773,7 @@ void IndexImpl::createFromOnDiskIndex(const string& onDiskBase) {
754773
totalVocabularySize_ = vocab_.size() + vocab_.getExternalVocab().size();
755774
LOG(DEBUG) << "Number of words in internal and external vocabulary: "
756775
<< totalVocabularySize_ << std::endl;
776+
757777
pso_.loadFromDisk(onDiskBase_);
758778
pos_.loadFromDisk(onDiskBase_);
759779

src/index/IndexImpl.h

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#include <engine/ResultTable.h>
99
#include <global/Pattern.h>
10+
#include <global/SpecialIds.h>
1011
#include <index/CompressedRelation.h>
1112
#include <index/ConstantsIndexBuilding.h>
1213
#include <index/DocsDB.h>
@@ -668,35 +669,43 @@ class IndexImpl {
668669
// index scan) and `GroupBy.cpp`.
669670
auto getIgnoredIdRanges(const Permutation::Enum permutation) const {
670671
std::vector<std::pair<Id, Id>> ignoredRanges;
672+
ignoredRanges.emplace_back(qlever::getBoundsForSpecialIds());
671673

672674
auto literalRange = getVocab().prefix_range("\"");
673675
auto taggedPredicatesRange = getVocab().prefix_range("@");
674676
auto internalEntitiesRange =
675677
getVocab().prefix_range(INTERNAL_ENTITIES_URI_PREFIX);
676-
ignoredRanges.emplace_back(
677-
Id::makeFromVocabIndex(internalEntitiesRange.first),
678-
Id::makeFromVocabIndex(internalEntitiesRange.second));
679678

679+
auto pushIgnoredRange = [&ignoredRanges](const auto& range) {
680+
ignoredRanges.emplace_back(Id::makeFromVocabIndex(range.first),
681+
Id::makeFromVocabIndex(range.second));
682+
};
683+
pushIgnoredRange(internalEntitiesRange);
680684
using enum Permutation::Enum;
681685
if (permutation == SPO || permutation == SOP) {
682-
ignoredRanges.push_back({Id::makeFromVocabIndex(literalRange.first),
683-
Id::makeFromVocabIndex(literalRange.second)});
686+
pushIgnoredRange(literalRange);
684687
} else if (permutation == PSO || permutation == POS) {
685-
ignoredRanges.push_back(
686-
{Id::makeFromVocabIndex(taggedPredicatesRange.first),
687-
Id::makeFromVocabIndex(taggedPredicatesRange.second)});
688+
pushIgnoredRange(taggedPredicatesRange);
688689
}
689690

690-
auto isIllegalPredicateId = [=](Id predicateId) {
691+
// A lambda that checks whether the `predicateId` is an internal ID like
692+
// `ql:has-pattern` or `@en@rdfs:label`.
693+
auto isInternalPredicateId = [internalEntitiesRange,
694+
taggedPredicatesRange](Id predicateId) {
695+
if (predicateId.getDatatype() == Datatype::Undefined) {
696+
return true;
697+
}
698+
AD_CORRECTNESS_CHECK(predicateId.getDatatype() == Datatype::VocabIndex);
691699
auto idx = predicateId.getVocabIndex();
692-
return (idx >= internalEntitiesRange.first &&
693-
idx < internalEntitiesRange.second) ||
694-
(idx >= taggedPredicatesRange.first &&
695-
idx < taggedPredicatesRange.second);
700+
auto isInRange = [idx](const auto& range) {
701+
return range.first <= idx && idx < range.second;
702+
};
703+
return (isInRange(internalEntitiesRange) ||
704+
isInRange(taggedPredicatesRange));
696705
};
697706

698707
auto isTripleIgnored = [permutation,
699-
isIllegalPredicateId](const auto& triple) {
708+
isInternalPredicateId](const auto& triple) {
700709
// TODO<joka921, everybody in the future>:
701710
// A lot of code (especially for statistical queries in `GroupBy.cpp` and
702711
// the pattern trick) relies on this function being a noop for the `PSO`
@@ -707,9 +716,9 @@ class IndexImpl {
707716
// be thoroughly reviewed.
708717
if (permutation == SPO || permutation == OPS) {
709718
// Predicates are always entities from the vocabulary.
710-
return isIllegalPredicateId(triple[1]);
719+
return isInternalPredicateId(triple[1]);
711720
} else if (permutation == SOP || permutation == OSP) {
712-
return isIllegalPredicateId(triple[2]);
721+
return isInternalPredicateId(triple[2]);
713722
}
714723
return false;
715724
};

src/parser/TripleComponent.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "engine/LocalVocab.h"
1515
#include "global/Constants.h"
1616
#include "global/Id.h"
17+
#include "global/SpecialIds.h"
1718
#include "parser/RdfEscaping.h"
1819
#include "parser/data/Variable.h"
1920
#include "util/Date.h"
@@ -232,6 +233,8 @@ class TripleComponent {
232233
isString() ? getString() : getLiteral().rawContent();
233234
if (vocabulary.getId(content, &idx)) {
234235
return Id::makeFromVocabIndex(idx);
236+
} else if (qlever::specialIds.contains(content)) {
237+
return qlever::specialIds.at(content);
235238
} else {
236239
return std::nullopt;
237240
}

test/IdTableTest.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -974,6 +974,28 @@ TEST(IdTable, setColumnSubset) {
974974
ASSERT_ANY_THROW(t.setColumnSubset(std::vector<ColumnIndex>{1, 2}));
975975
}
976976

977+
TEST(IdTableStatic, setColumnSubset) {
978+
using IntTable = columnBasedIdTable::IdTable<int, 3>;
979+
IntTable t;
980+
t.push_back({0, 10, 20});
981+
t.push_back({1, 11, 21});
982+
t.push_back({2, 12, 22});
983+
t.setColumnSubset(std::array{ColumnIndex(2), ColumnIndex(0), ColumnIndex(1)});
984+
ASSERT_EQ(3, t.numColumns());
985+
ASSERT_EQ(3, t.numRows());
986+
ASSERT_THAT(t.getColumn(0), ::testing::ElementsAre(20, 21, 22));
987+
ASSERT_THAT(t.getColumn(1), ::testing::ElementsAre(0, 1, 2));
988+
ASSERT_THAT(t.getColumn(2), ::testing::ElementsAre(10, 11, 12));
989+
990+
// Duplicate columns are not allowed.
991+
ASSERT_ANY_THROW(t.setColumnSubset(std::vector<ColumnIndex>{0, 0, 1}));
992+
// A column index is out of range.
993+
ASSERT_ANY_THROW(t.setColumnSubset(std::vector<ColumnIndex>{1, 2, 3}));
994+
995+
// For static tables, we need a permutation, a real subset is not allowed.
996+
ASSERT_ANY_THROW(t.setColumnSubset(std::vector<ColumnIndex>{1, 2}));
997+
}
998+
977999
TEST(IdTable, cornerCases) {
9781000
using Dynamic = columnBasedIdTable::IdTable<int, 0>;
9791001
{

0 commit comments

Comments
 (0)