From 5b89eb14d8593b67feea5facbc6df2e2730903c5 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Wed, 1 Oct 2025 13:38:41 +0200 Subject: [PATCH 01/41] First draft to rebuild the index --- src/engine/Server.cpp | 5 +++++ src/index/DeltaTriples.cpp | 42 ++++++++++++++++++++++++++++++++++++++ src/index/DeltaTriples.h | 2 ++ src/index/IndexImpl.cpp | 8 ++++++++ src/index/IndexImpl.h | 16 ++++++++++++++- 5 files changed, 72 insertions(+), 1 deletion(-) diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index a28c31e03a..fab85c3a7c 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -431,6 +431,11 @@ CPP_template_def(typename RequestT, typename ResponseT)( json[nlohmann::json(key)] = std::move(value); } response = createJsonResponse(json, request); + } else if (auto cmd = checkParameter("cmd", "rebuild-index")) { + index_.deltaTriplesManager().modify( + [](DeltaTriples& deltaTriples) { deltaTriples.materializeToIndex(); }, + false); + response = createOkResponse("Done writing", request, MediaType::textPlain); } // Ping with or without message. 
diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index 2d9194a13f..b32d0d0093 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -390,3 +390,45 @@ void DeltaTriplesManager::setFilenameForPersistentUpdatesAndReadFromDisk( }, false); } + +// _____________________________________________________________________________ +void DeltaTriples::materializeToIndex() { + ScanSpecification scanSpec{std::nullopt, std::nullopt, std::nullopt}; + auto snapshot = getSnapshot(); + CancellationHandle cancellationHandle = + std::make_shared(); + IndexImpl newIndex{index_.allocator()}; + newIndex.setOnDiskBase("tmp_index"); + newIndex.setKbName(index_.getKbName()); + newIndex.blocksizePermutationPerColumn() = + index_.blocksizePermutationPerColumn(); + for (auto permutation : Permutation::ALL) { + // Only process half the permutations (the rest is done by the pairs). + if (static_cast(permutation) % 2 != 0) { + continue; + } + Permutation::ScanSpecAndBlocks scanSpecAndBlocks{ + scanSpec, BlockMetadataRanges( + newIndex.getPermutation(permutation) + .getAugmentedMetadataForPermutation(*snapshot))}; + auto fullScan = + index_.getPermutation(permutation) + .lazyScan(scanSpecAndBlocks, std::nullopt, + std::array{ADDITIONAL_COLUMN_GRAPH_ID}, + cancellationHandle, *snapshot, LimitOffsetClause{}); + [[maybe_unused]] auto distinct = newIndex.createPermutationPairPublic( + 4, + ad_utility::InputRangeTypeErased{ad_utility::CachingTransformInputRange{ + std::move(fullScan), + [permutation](IdTable& idTable) { + auto keyOrder = Permutation::toKeyOrder(permutation); + std::vector columnIndices{keyOrder.keys().begin(), + keyOrder.keys().end()}; + idTable.setColumnSubset(columnIndices); + return IdTableStatic<0>{std::move(idTable)}; + }}}, + newIndex.getPermutation(permutation), + newIndex.getPermutation( + static_cast(static_cast(permutation) + 1))); + } +} diff --git a/src/index/DeltaTriples.h b/src/index/DeltaTriples.h index c1f08d98be..fcea3a01be 
100644 --- a/src/index/DeltaTriples.h +++ b/src/index/DeltaTriples.h @@ -200,6 +200,8 @@ class DeltaTriples { Permutation::Enum permutation, std::shared_ptr> metadata); + void materializeToIndex(); + private: // Find the position of the given triple in the given permutation and add it // to each of the six `LocatedTriplesPerBlock` maps (one per permutation). diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 1fcf2be46f..bcfdbb2f48 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -877,6 +877,14 @@ IndexImpl::createPermutations(size_t numColumns, T&& sortedTriples, return metaData; } +// ________________________________________________________________________ +size_t IndexImpl::createPermutationPairPublic( + size_t numColumns, + ad_utility::InputRangeTypeErased>&& sortedTriples, + const Permutation& p1, const Permutation& p2) { + return createPermutationPair(numColumns, AD_FWD(sortedTriples), p1, p2); +} + // ________________________________________________________________________ template size_t IndexImpl::createPermutationPair(size_t numColumns, diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index a60e247a88..263035d44d 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -39,7 +39,6 @@ #include "util/File.h" #include "util/Forward.h" #include "util/MemorySize/MemorySize.h" -#include "util/MmapVector.h" #include "util/json.h" template @@ -261,6 +260,10 @@ class IndexImpl { const auto& getScoreData() const { return scoreData_; } + const ad_utility::AllocatorWithLimit& allocator() const { + return allocator_; + }; + ad_utility::BlankNodeManager* getBlankNodeManager() const; DeltaTriplesManager& deltaTriplesManager() { return deltaTriples_.value(); } @@ -447,6 +450,10 @@ class IndexImpl { return blocksizePermutationPerColumn_; } + const ad_utility::MemorySize& blocksizePermutationPerColumn() const { + return blocksizePermutationPerColumn_; + } + void setOnDiskBase(const std::string& onDiskBase); void 
setSettingsFile(const std::string& filename); @@ -557,6 +564,13 @@ class IndexImpl { Permutation::KeyOrder permutation, Callbacks&&... perTripleCallbacks); + public: + [[nodiscard]] size_t createPermutationPairPublic( + size_t numColumns, + ad_utility::InputRangeTypeErased>&& sortedTriples, + const Permutation& p1, const Permutation& p2); + + protected: // _______________________________________________________________________ // Create a pair of permutations. Only works for valid pairs (PSO-POS, // OSP-OPS, SPO-SOP). First creates the permutation and then exchanges the From daec8e9c3d8861235bbf9a4c525e46cc88ec0926 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Wed, 1 Oct 2025 17:44:19 +0200 Subject: [PATCH 02/41] First attempt to transform the values --- src/index/DeltaTriples.cpp | 71 +++++++++++++++++++++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index b32d0d0093..241722d8da 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -393,6 +393,57 @@ void DeltaTriplesManager::setFilenameForPersistentUpdatesAndReadFromDisk( // _____________________________________________________________________________ void DeltaTriples::materializeToIndex() { + auto& vocab = index_.getVocab(); + + size_t newWordCount = 0; + std::vector> insertInfo; + insertInfo.reserve(localVocab_.size()); + + ad_utility::HashMap localVocabMapping; + + for (const LocalVocabEntry& entry : + const_cast(localVocab_).primaryWordSet()) { + const auto& [lower, upper] = entry.positionInVocab(); + if (lower == upper) { + localVocabMapping.emplace( + Id::makeFromLocalVocabIndex(&entry), + Id::makeFromVocabIndex(VocabIndex::make(lower.get()))); + continue; + } + insertInfo.emplace_back(VocabIndex::make(upper.get()), + entry.asLiteralOrIri().toStringRepresentation(), + Id::makeFromLocalVocabIndex(&entry)); + } + ql::ranges::sort(insertInfo, [](const auto& 
tupleA, const auto& tupleB) { + return std::tie(std::get(tupleA).get(), + std::get(tupleA)) < + std::tie(std::get(tupleB).get(), + std::get(tupleB)); + }); + + auto vocabWriter = vocab.makeWordWriterPtr("tmp_vocab"); + for (size_t vocabIndex = 0; vocabIndex < vocab.size(); ++vocabIndex) { + auto actualIndex = VocabIndex::make(vocabIndex); + while (insertInfo.size() > newWordCount && + std::get(insertInfo.at(newWordCount)) == actualIndex) { + auto word = std::get(insertInfo.at(newWordCount)); + auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); + localVocabMapping.emplace( + std::get(insertInfo.at(newWordCount)), + Id::makeFromVocabIndex(VocabIndex::make(newIndex))); + newWordCount++; + } + auto word = vocab[actualIndex]; + auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); + AD_EXPENSIVE_CHECK(newIndex == vocabIndex + newWordCount); + } + + for (const auto& [_, word, id] : insertInfo | ql::views::drop(newWordCount)) { + auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); + localVocabMapping.emplace( + id, Id::makeFromVocabIndex(VocabIndex::make(newIndex))); + } + ScanSpecification scanSpec{std::nullopt, std::nullopt, std::nullopt}; auto snapshot = getSnapshot(); CancellationHandle cancellationHandle = @@ -420,11 +471,29 @@ void DeltaTriples::materializeToIndex() { 4, ad_utility::InputRangeTypeErased{ad_utility::CachingTransformInputRange{ std::move(fullScan), - [permutation](IdTable& idTable) { + [permutation, &localVocabMapping, &insertInfo](IdTable& idTable) { auto keyOrder = Permutation::toKeyOrder(permutation); std::vector columnIndices{keyOrder.keys().begin(), keyOrder.keys().end()}; idTable.setColumnSubset(columnIndices); + for (auto col : idTable.getColumns()) { + ql::ranges::for_each(col, [&localVocabMapping, + &insertInfo](Id& id) { + if (id.getDatatype() == Datatype::LocalVocabIndex) { + id = localVocabMapping.at(id); + } else if (id.getDatatype() == Datatype::VocabIndex) { + size_t offset = 
ql::ranges::distance( + insertInfo.begin(), + ql::ranges::upper_bound(insertInfo, id.getVocabIndex(), + std::less{}, + [](const auto& tuple) { + return std::get<0>(tuple); + })); + id = Id::makeFromVocabIndex( + VocabIndex::make(id.getVocabIndex().get() + offset)); + } + }); + } return IdTableStatic<0>{std::move(idTable)}; }}}, newIndex.getPermutation(permutation), From 6e83fac7740199b3cb4d51768aeb6eff567dcb0b Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 2 Oct 2025 10:44:56 +0200 Subject: [PATCH 03/41] Fix some assertion errors and segfaults in the code --- src/index/DeltaTriples.cpp | 25 +++++++++++-------------- src/index/IndexImpl.cpp | 7 +++++-- src/index/IndexImpl.h | 3 ++- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index 241722d8da..db60fb8a80 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -404,28 +404,25 @@ void DeltaTriples::materializeToIndex() { for (const LocalVocabEntry& entry : const_cast(localVocab_).primaryWordSet()) { const auto& [lower, upper] = entry.positionInVocab(); - if (lower == upper) { - localVocabMapping.emplace( - Id::makeFromLocalVocabIndex(&entry), - Id::makeFromVocabIndex(VocabIndex::make(lower.get()))); - continue; - } - insertInfo.emplace_back(VocabIndex::make(upper.get()), + AD_CORRECTNESS_CHECK(lower == upper); + Id id = Id::fromBits(upper.get()); + AD_CORRECTNESS_CHECK(id.getDatatype() == Datatype::VocabIndex); + insertInfo.emplace_back(id.getVocabIndex(), entry.asLiteralOrIri().toStringRepresentation(), Id::makeFromLocalVocabIndex(&entry)); } ql::ranges::sort(insertInfo, [](const auto& tupleA, const auto& tupleB) { - return std::tie(std::get(tupleA).get(), - std::get(tupleA)) < - std::tie(std::get(tupleB).get(), - std::get(tupleB)); + return std::tie(std::get(tupleA).get(), std::get(tupleA)) < + std::tie(std::get(tupleB).get(), std::get(tupleB)); }); - auto vocabWriter = 
vocab.makeWordWriterPtr("tmp_vocab"); + auto vocabWriter = vocab.makeWordWriterPtr("tmp_index.vocabulary"); for (size_t vocabIndex = 0; vocabIndex < vocab.size(); ++vocabIndex) { auto actualIndex = VocabIndex::make(vocabIndex); while (insertInfo.size() > newWordCount && std::get(insertInfo.at(newWordCount)) == actualIndex) { + AD_CORRECTNESS_CHECK(std::get(insertInfo.at(newWordCount)) < + Id::makeFromVocabIndex(actualIndex)); auto word = std::get(insertInfo.at(newWordCount)); auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); localVocabMapping.emplace( @@ -435,7 +432,7 @@ void DeltaTriples::materializeToIndex() { } auto word = vocab[actualIndex]; auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); - AD_EXPENSIVE_CHECK(newIndex == vocabIndex + newWordCount); + AD_CORRECTNESS_CHECK(newIndex == vocabIndex + newWordCount); } for (const auto& [_, word, id] : insertInfo | ql::views::drop(newWordCount)) { @@ -448,7 +445,7 @@ void DeltaTriples::materializeToIndex() { auto snapshot = getSnapshot(); CancellationHandle cancellationHandle = std::make_shared(); - IndexImpl newIndex{index_.allocator()}; + IndexImpl newIndex{index_.allocator(), false}; newIndex.setOnDiskBase("tmp_index"); newIndex.setKbName(index_.getKbName()); newIndex.blocksizePermutationPerColumn() = diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index bcfdbb2f48..f61f4e9a0f 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -43,9 +43,12 @@ using namespace ad_utility::memory_literals; static constexpr size_t NUM_EXTERNAL_SORTERS_AT_SAME_TIME = 2u; // _____________________________________________________________________________ -IndexImpl::IndexImpl(ad_utility::AllocatorWithLimit allocator) +IndexImpl::IndexImpl(ad_utility::AllocatorWithLimit allocator, + bool registerSingleton) : allocator_{std::move(allocator)} { - globalSingletonIndex_ = this; + if (registerSingleton) { + globalSingletonIndex_ = this; + } 
deltaTriples_.emplace(*this); }; diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 263035d44d..ddff9f1ed9 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -195,7 +195,8 @@ class IndexImpl { std::optional deltaTriples_; public: - explicit IndexImpl(ad_utility::AllocatorWithLimit allocator); + explicit IndexImpl(ad_utility::AllocatorWithLimit allocator, + bool registerSingleton = true); // Forbid copying. IndexImpl& operator=(const IndexImpl&) = delete; From b802ad32d6aa98ab1d58d9ed5180957311edf472 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 9 Oct 2025 13:06:13 +0200 Subject: [PATCH 04/41] Also re-write internal triples --- src/index/DeltaTriples.cpp | 89 +++++++++++++++++++++++--------------- src/index/IndexImpl.cpp | 23 ++++++++-- src/index/IndexImpl.h | 3 ++ src/index/Permutation.h | 5 +++ 4 files changed, 80 insertions(+), 40 deletions(-) diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index db60fb8a80..c2a273be7b 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -391,6 +391,53 @@ void DeltaTriplesManager::setFilenameForPersistentUpdatesAndReadFromDisk( false); } +namespace { + +ad_utility::InputRangeTypeErased> readIndexAndRemap( + const Permutation& permutation, ScanSpecification scanSpec, + const LocatedTriplesSnapshot& snapshot, + const ad_utility::HashMap& localVocabMapping, + const std::vector>& insertInfo, + const ad_utility::SharedCancellationHandle& cancellationHandle) { + Permutation::ScanSpecAndBlocks scanSpecAndBlocks{ + std::move(scanSpec), + BlockMetadataRanges( + permutation.getAugmentedMetadataForPermutation(snapshot))}; + auto fullScan = permutation.lazyScan( + scanSpecAndBlocks, std::nullopt, std::array{ADDITIONAL_COLUMN_GRAPH_ID}, + cancellationHandle, snapshot, LimitOffsetClause{}); + return ad_utility::InputRangeTypeErased{ + ad_utility::CachingTransformInputRange{ + std::move(fullScan), + [&permutation, 
&localVocabMapping, &insertInfo](IdTable& idTable) { + auto keyOrder = Permutation::toKeyOrder(permutation.permutation()); + std::vector columnIndices{keyOrder.keys().begin(), + keyOrder.keys().end()}; + idTable.setColumnSubset(columnIndices); + for (auto col : idTable.getColumns()) { + ql::ranges::for_each( + col, [&localVocabMapping, &insertInfo](Id& id) { + if (id.getDatatype() == Datatype::LocalVocabIndex) { + id = localVocabMapping.at(id); + } else if (id.getDatatype() == Datatype::VocabIndex) { + size_t offset = ql::ranges::distance( + insertInfo.begin(), + ql::ranges::upper_bound( + insertInfo, id.getVocabIndex(), std::less{}, + [](const auto& tuple) { + return std::get<0>(tuple); + })); + id = Id::makeFromVocabIndex( + VocabIndex::make(id.getVocabIndex().get() + offset)); + } + }); + } + return IdTableStatic<0>{std::move(idTable)}; + }}}; +} + +} // namespace + // _____________________________________________________________________________ void DeltaTriples::materializeToIndex() { auto& vocab = index_.getVocab(); @@ -455,46 +502,16 @@ void DeltaTriples::materializeToIndex() { if (static_cast(permutation) % 2 != 0) { continue; } - Permutation::ScanSpecAndBlocks scanSpecAndBlocks{ - scanSpec, BlockMetadataRanges( - newIndex.getPermutation(permutation) - .getAugmentedMetadataForPermutation(*snapshot))}; - auto fullScan = - index_.getPermutation(permutation) - .lazyScan(scanSpecAndBlocks, std::nullopt, - std::array{ADDITIONAL_COLUMN_GRAPH_ID}, - cancellationHandle, *snapshot, LimitOffsetClause{}); [[maybe_unused]] auto distinct = newIndex.createPermutationPairPublic( 4, - ad_utility::InputRangeTypeErased{ad_utility::CachingTransformInputRange{ - std::move(fullScan), - [permutation, &localVocabMapping, &insertInfo](IdTable& idTable) { - auto keyOrder = Permutation::toKeyOrder(permutation); - std::vector columnIndices{keyOrder.keys().begin(), - keyOrder.keys().end()}; - idTable.setColumnSubset(columnIndices); - for (auto col : idTable.getColumns()) { - 
ql::ranges::for_each(col, [&localVocabMapping, - &insertInfo](Id& id) { - if (id.getDatatype() == Datatype::LocalVocabIndex) { - id = localVocabMapping.at(id); - } else if (id.getDatatype() == Datatype::VocabIndex) { - size_t offset = ql::ranges::distance( - insertInfo.begin(), - ql::ranges::upper_bound(insertInfo, id.getVocabIndex(), - std::less{}, - [](const auto& tuple) { - return std::get<0>(tuple); - })); - id = Id::makeFromVocabIndex( - VocabIndex::make(id.getVocabIndex().get() + offset)); - } - }); - } - return IdTableStatic<0>{std::move(idTable)}; - }}}, + readIndexAndRemap(index_.getPermutation(permutation), scanSpec, + *snapshot, localVocabMapping, insertInfo, + cancellationHandle), newIndex.getPermutation(permutation), newIndex.getPermutation( static_cast(static_cast(permutation) + 1))); } + newIndex.createInternalPSOandPOSFromRange(readIndexAndRemap( + index_.getPermutation(Permutation::Enum::PSO).internalPermutation(), + scanSpec, *snapshot, localVocabMapping, insertInfo, cancellationHandle)); } diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index f61f4e9a0f..7503974892 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -288,10 +288,8 @@ std::pair IndexImpl::createInternalPSOandPOS( auto configurationJsonBackup = configurationJson_; onDiskBase_.append(QLEVER_INTERNAL_INDEX_INFIX); - // TODO As soon as `uniqueBlockView` is no longer a `generator` the - // explicit `BlocksOfTriples` constructor can be removed again. 
- auto internalTriplesUnique = BlocksOfTriples{ad_utility::uniqueBlockView( - internalTriplesPsoSorter.template getSortedBlocks<0>())}; + auto internalTriplesUnique = ad_utility::uniqueBlockView( + internalTriplesPsoSorter.template getSortedBlocks<0>()); createPSOAndPOSImpl(NumColumnsIndexBuilding, std::move(internalTriplesUnique), false); onDiskBase_ = std::move(onDiskBaseBackup); @@ -307,6 +305,23 @@ std::pair IndexImpl::createInternalPSOandPOS( return {numTriplesInternal, numPredicatesInternal}; } +// _____________________________________________________________________________ +namespace { +struct SortedBlocksWrapper { + ad_utility::InputRangeTypeErased> sortedBlocks_; + template + ad_utility::InputRangeTypeErased> getSortedBlocks() { + static_assert(N == 0); + return std::move(sortedBlocks_); + } +}; +} // namespace +// _____________________________________________________________________________ +std::pair IndexImpl::createInternalPSOandPOSFromRange( + ad_utility::InputRangeTypeErased> sortedBlocks) { + return createInternalPSOandPOS(SortedBlocksWrapper{std::move(sortedBlocks)}); +} + // _____________________________________________________________________________ void IndexImpl::updateInputFileSpecificationsAndLog( std::vector& spec, diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index ddff9f1ed9..fa67be87bd 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -745,6 +745,9 @@ class IndexImpl { std::pair createInternalPSOandPOS( InternalTriplePsoSorter&& internalTriplesPsoSorter); + std::pair createInternalPSOandPOSFromRange( + ad_utility::InputRangeTypeErased> sortedBlocks); + // Set up one of the permutation sorters with the appropriate memory limit. // The `permutationName` is used to determine the filename and must be unique // for each call during one index build. 
diff --git a/src/index/Permutation.h b/src/index/Permutation.h index 73992fb155..f0256d4167 100644 --- a/src/index/Permutation.h +++ b/src/index/Permutation.h @@ -185,6 +185,11 @@ class Permutation { Enum permutation() const { return permutation_; } + const Permutation& internalPermutation() const { + AD_CONTRACT_CHECK(internalPermutation_ != nullptr); + return *internalPermutation_; + } + private: // Readable name for this permutation, e.g., `POS`. std::string readableName_; From f9419f4edf9d07df8fa52cdf32828001509a6b71 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 9 Oct 2025 17:20:05 +0200 Subject: [PATCH 05/41] Properly write metadata json --- src/index/DeltaTriples.cpp | 28 +++++++++++++++------------- src/index/IndexImpl.cpp | 26 ++++++++++++++++++++------ src/index/IndexImpl.h | 8 +++++++- 3 files changed, 42 insertions(+), 20 deletions(-) diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index c2a273be7b..f46c644a2a 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -493,16 +493,19 @@ void DeltaTriples::materializeToIndex() { CancellationHandle cancellationHandle = std::make_shared(); IndexImpl newIndex{index_.allocator(), false}; - newIndex.setOnDiskBase("tmp_index"); - newIndex.setKbName(index_.getKbName()); - newIndex.blocksizePermutationPerColumn() = - index_.blocksizePermutationPerColumn(); - for (auto permutation : Permutation::ALL) { - // Only process half the permutations (the rest is done by the pairs). 
- if (static_cast(permutation) % 2 != 0) { - continue; - } - [[maybe_unused]] auto distinct = newIndex.createPermutationPairPublic( + newIndex.loadConfigFromOldIndex("tmp_index", index_); + + auto [numTriplesInternal, numPredicatesInternal] = + newIndex.createInternalPSOandPOSFromRange(readIndexAndRemap( + index_.getPermutation(Permutation::Enum::PSO).internalPermutation(), + scanSpec, *snapshot, localVocabMapping, insertInfo, + cancellationHandle)); + newIndex.createPSOAndPOSImplPublic( + 4, readIndexAndRemap(index_.getPermutation(Permutation::Enum::PSO), + scanSpec, *snapshot, localVocabMapping, insertInfo, + cancellationHandle)); + for (auto permutation : {Permutation::Enum::SPO, Permutation::Enum::OPS}) { + newIndex.createPermutationPairPublic( 4, readIndexAndRemap(index_.getPermutation(permutation), scanSpec, *snapshot, localVocabMapping, insertInfo, @@ -511,7 +514,6 @@ void DeltaTriples::materializeToIndex() { newIndex.getPermutation( static_cast(static_cast(permutation) + 1))); } - newIndex.createInternalPSOandPOSFromRange(readIndexAndRemap( - index_.getPermutation(Permutation::Enum::PSO).internalPermutation(), - scanSpec, *snapshot, localVocabMapping, insertInfo, cancellationHandle)); + newIndex.addInternalStatisticsToConfiguration(numTriplesInternal, + numPredicatesInternal); } diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 7503974892..716df3b91c 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -896,11 +896,12 @@ IndexImpl::createPermutations(size_t numColumns, T&& sortedTriples, } // ________________________________________________________________________ -size_t IndexImpl::createPermutationPairPublic( +void IndexImpl::createPermutationPairPublic( size_t numColumns, ad_utility::InputRangeTypeErased>&& sortedTriples, const Permutation& p1, const Permutation& p2) { - return createPermutationPair(numColumns, AD_FWD(sortedTriples), p1, p2); + [[maybe_unused]] auto value = + createPermutationPair(numColumns, 
AD_FWD(sortedTriples), p1, p2); } // ________________________________________________________________________ @@ -1729,9 +1730,7 @@ CPP_template_def(typename... NextSorter)(requires( 1)) void IndexImpl::createPSOAndPOSImpl(size_t numColumns, BlocksOfTriples sortedTriples, bool doWriteConfiguration, - NextSorter&&... nextSorter) - -{ + NextSorter&&... nextSorter) { size_t numTriplesNormal = 0; size_t numTriplesTotal = 0; auto countTriplesNormal = [&numTriplesNormal, &numTriplesTotal]( @@ -1753,7 +1752,13 @@ CPP_template_def(typename... NextSorter)(requires( if (doWriteConfiguration) { writeConfiguration(); } -}; +} + +// _____________________________________________________________________________ +void IndexImpl::createPSOAndPOSImplPublic(size_t numColumns, + BlocksOfTriples sortedTriples) { + createPSOAndPOSImpl(numColumns, std::move(sortedTriples), false); +} // _____________________________________________________________________________ CPP_template_def(typename... NextSorter)( @@ -1876,3 +1881,12 @@ void IndexImpl::setPrefixesForEncodedValues( encodedIriManager_ = EncodedIriManager{std::move(prefixesWithoutAngleBrackets)}; } + +// _____________________________________________________________________________ +void IndexImpl::loadConfigFromOldIndex(const std::string& newName, + const IndexImpl& other) { + setOnDiskBase(newName); + setKbName(other.getKbName()); + blocksizePermutationPerColumn() = other.blocksizePermutationPerColumn(); + configurationJson_ = other.configurationJson_; +} diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index fa67be87bd..51d31788bf 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -566,7 +566,7 @@ class IndexImpl { Callbacks&&... 
perTripleCallbacks); public: - [[nodiscard]] size_t createPermutationPairPublic( + void createPermutationPairPublic( size_t numColumns, ad_utility::InputRangeTypeErased>&& sortedTriples, const Permutation& p1, const Permutation& p2); @@ -732,6 +732,9 @@ class IndexImpl { BlocksOfTriples sortedTriples, bool doWriteConfiguration, NextSorter&&... nextSorter); + + void createPSOAndPOSImplPublic(size_t numColumns, + BlocksOfTriples sortedTriples); // Call `createPSOAndPOSImpl` with the given arguments and with // `doWriteConfiguration` set to `true` (see above). CPP_template(typename... NextSorter)(requires( @@ -821,6 +824,9 @@ class IndexImpl { void storeTextScoringParamsInConfiguration(TextScoringMetric scoringMetric, float b, float k); + + void loadConfigFromOldIndex(const std::string& newName, + const IndexImpl& other); }; #endif // QLEVER_SRC_INDEX_INDEXIMPL_H From 9853e4bb1f291e478d40a925e04ecdf1f417d4e4 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 9 Oct 2025 17:59:29 +0200 Subject: [PATCH 06/41] Also transfer additional columns --- src/index/DeltaTriples.cpp | 63 +++++++++++++++++++++++++++++--------- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index f46c644a2a..3648c9dc53 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -392,27 +392,31 @@ void DeltaTriplesManager::setFilenameForPersistentUpdatesAndReadFromDisk( } namespace { - ad_utility::InputRangeTypeErased> readIndexAndRemap( const Permutation& permutation, ScanSpecification scanSpec, + BlockMetadataRanges blockMetadataRanges, const LocatedTriplesSnapshot& snapshot, const ad_utility::HashMap& localVocabMapping, const std::vector>& insertInfo, - const ad_utility::SharedCancellationHandle& cancellationHandle) { + const ad_utility::SharedCancellationHandle& cancellationHandle, + ql::span additionalColumns) { Permutation::ScanSpecAndBlocks scanSpecAndBlocks{ - 
std::move(scanSpec), - BlockMetadataRanges( - permutation.getAugmentedMetadataForPermutation(snapshot))}; - auto fullScan = permutation.lazyScan( - scanSpecAndBlocks, std::nullopt, std::array{ADDITIONAL_COLUMN_GRAPH_ID}, - cancellationHandle, snapshot, LimitOffsetClause{}); + std::move(scanSpec), std::move(blockMetadataRanges)}; + auto fullScan = + permutation.lazyScan(scanSpecAndBlocks, std::nullopt, additionalColumns, + cancellationHandle, snapshot, LimitOffsetClause{}); + auto keyOrder = Permutation::toKeyOrder(permutation.permutation()); + std::vector columnIndices{keyOrder.keys().begin(), + keyOrder.keys().end()}; + while (columnIndices.size() < additionalColumns.size() + 3) { + columnIndices.emplace_back(columnIndices.size()); + } return ad_utility::InputRangeTypeErased{ ad_utility::CachingTransformInputRange{ std::move(fullScan), - [&permutation, &localVocabMapping, &insertInfo](IdTable& idTable) { - auto keyOrder = Permutation::toKeyOrder(permutation.permutation()); - std::vector columnIndices{keyOrder.keys().begin(), - keyOrder.keys().end()}; + [columnIndices = std::move(columnIndices), &localVocabMapping, + &insertInfo](IdTable& idTable) { + AD_CORRECTNESS_CHECK(idTable.numColumns() == columnIndices.size()); idTable.setColumnSubset(columnIndices); for (auto col : idTable.getColumns()) { ql::ranges::for_each( @@ -436,6 +440,18 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( }}}; } +ad_utility::InputRangeTypeErased> readIndexAndRemap( + const Permutation& permutation, ScanSpecification scanSpec, + const LocatedTriplesSnapshot& snapshot, + const ad_utility::HashMap& localVocabMapping, + const std::vector>& insertInfo, + const ad_utility::SharedCancellationHandle& cancellationHandle) { + return readIndexAndRemap( + permutation, std::move(scanSpec), + permutation.getAugmentedMetadataForPermutation(snapshot), snapshot, + localVocabMapping, insertInfo, cancellationHandle, + std::array{ADDITIONAL_COLUMN_GRAPH_ID}); +} } // namespace // 
_____________________________________________________________________________ @@ -500,10 +516,27 @@ void DeltaTriples::materializeToIndex() { index_.getPermutation(Permutation::Enum::PSO).internalPermutation(), scanSpec, *snapshot, localVocabMapping, insertInfo, cancellationHandle)); + + const auto& psoPermutation = index_.getPermutation(Permutation::Enum::PSO); + auto blockMetadataRanges = + psoPermutation.getAugmentedMetadataForPermutation(*snapshot); + size_t numColumns = + blockMetadataRanges.at(0).at(0).offsetsAndCompressedSize_.size(); + std::vector additionalColumns; + additionalColumns.push_back(ADDITIONAL_COLUMN_GRAPH_ID); + for (ColumnIndex col : {ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN, + ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN}) { + if (additionalColumns.size() >= numColumns - 3) { + break; + } + additionalColumns.push_back(col); + } + AD_CORRECTNESS_CHECK(additionalColumns.size() == numColumns - 3); newIndex.createPSOAndPOSImplPublic( - 4, readIndexAndRemap(index_.getPermutation(Permutation::Enum::PSO), - scanSpec, *snapshot, localVocabMapping, insertInfo, - cancellationHandle)); + numColumns, readIndexAndRemap(psoPermutation, scanSpec, + std::move(blockMetadataRanges), *snapshot, + localVocabMapping, insertInfo, + cancellationHandle, additionalColumns)); for (auto permutation : {Permutation::Enum::SPO, Permutation::Enum::OPS}) { newIndex.createPermutationPairPublic( 4, From 3dc0a5271e3984659f3cee3f1028da8267eeeba5 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Fri, 10 Oct 2025 15:29:33 +0200 Subject: [PATCH 07/41] Avoid compilation errors --- src/index/DeltaTriples.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index 3648c9dc53..f81ee3c473 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -452,6 +452,16 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( localVocabMapping, insertInfo, 
cancellationHandle, std::array{ADDITIONAL_COLUMN_GRAPH_ID}); } + +size_t getNumColumns(const BlockMetadataRanges& blockMetadataRanges) { + if (!blockMetadataRanges.empty()) { + const auto& first = blockMetadataRanges.at(0); + if (!first.empty()) { + return first[0].offsetsAndCompressedSize_.size(); + } + } + return 4; +} } // namespace // _____________________________________________________________________________ @@ -520,8 +530,7 @@ void DeltaTriples::materializeToIndex() { const auto& psoPermutation = index_.getPermutation(Permutation::Enum::PSO); auto blockMetadataRanges = psoPermutation.getAugmentedMetadataForPermutation(*snapshot); - size_t numColumns = - blockMetadataRanges.at(0).at(0).offsetsAndCompressedSize_.size(); + size_t numColumns = getNumColumns(blockMetadataRanges); std::vector additionalColumns; additionalColumns.push_back(ADDITIONAL_COLUMN_GRAPH_ID); for (ColumnIndex col : {ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN, From 7ed05535f44db652ff9d62380ec75b0c83245856 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Fri, 10 Oct 2025 15:29:47 +0200 Subject: [PATCH 08/41] Also copy patterns if they exist --- src/index/DeltaTriples.cpp | 7 +++++++ src/index/IndexImpl.cpp | 3 +++ src/index/IndexImpl.h | 2 ++ 3 files changed, 12 insertions(+) diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index f81ee3c473..45f6240a30 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -14,6 +14,8 @@ #include +#include + #include "backports/algorithm.h" #include "engine/ExecuteUpdate.h" #include "index/Index.h" @@ -558,4 +560,9 @@ void DeltaTriples::materializeToIndex() { } newIndex.addInternalStatisticsToConfiguration(numTriplesInternal, numPredicatesInternal); + if (index_.usePatterns()) { + std::filesystem::copy(index_.getOnDiskBase() + ".index.patterns", + newIndex.getOnDiskBase() + ".index.patterns", + std::filesystem::copy_options::overwrite_existing); + } } diff --git 
a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 716df3b91c..cce579df31 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1078,6 +1078,9 @@ void IndexImpl::setKeepTempFiles(bool keepTempFiles) { // _____________________________________________________________________________ bool& IndexImpl::usePatterns() { return usePatterns_; } +// _____________________________________________________________________________ +bool IndexImpl::usePatterns() const { return usePatterns_; } + // _____________________________________________________________________________ bool& IndexImpl::loadAllPermutations() { return loadAllPermutations_; } diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 51d31788bf..96058369a5 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -431,6 +431,8 @@ class IndexImpl { bool& usePatterns(); + bool usePatterns() const; + bool& loadAllPermutations(); void setKeepTempFiles(bool keepTempFiles); From d28cf93bb0c2b30ee82b22ff2915d6773ef091f9 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 16 Oct 2025 13:53:37 +0200 Subject: [PATCH 09/41] Fix build on macOS --- src/index/DeltaTriples.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index 45f6240a30..a38dd4c314 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -452,7 +452,7 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( permutation, std::move(scanSpec), permutation.getAugmentedMetadataForPermutation(snapshot), snapshot, localVocabMapping, insertInfo, cancellationHandle, - std::array{ADDITIONAL_COLUMN_GRAPH_ID}); + std::array{static_cast(ADDITIONAL_COLUMN_GRAPH_ID)}); } size_t getNumColumns(const BlockMetadataRanges& blockMetadataRanges) { From 9ef9ecac10258ff61a1e83fa4795c4387b64715c Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Fri, 24 Oct 2025 11:47:33 
+0200 Subject: [PATCH 10/41] Improve index building by correct order & honoring missing permutations --- src/index/DeltaTriples.cpp | 34 +++++++++++++++++----------------- src/index/IndexImpl.cpp | 33 +++++++++++++++++---------------- src/index/IndexImpl.h | 11 +++++------ 3 files changed, 39 insertions(+), 39 deletions(-) diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index a38dd4c314..b26c8b1411 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -14,8 +14,6 @@ #include -#include - #include "backports/algorithm.h" #include "engine/ExecuteUpdate.h" #include "index/Index.h" @@ -523,6 +521,18 @@ void DeltaTriples::materializeToIndex() { IndexImpl newIndex{index_.allocator(), false}; newIndex.loadConfigFromOldIndex("tmp_index", index_); + if (index_.hasAllPermutations()) { + // TODO Figure out how to respect patterns here properly. + newIndex.createSPOAndSOPPublic( + 4, readIndexAndRemap(index_.getPermutation(Permutation::Enum::SPO), + scanSpec, *snapshot, localVocabMapping, insertInfo, + cancellationHandle)); + newIndex.createOSPAndOPSPublic( + 4, readIndexAndRemap(index_.getPermutation(Permutation::Enum::OPS), + scanSpec, *snapshot, localVocabMapping, insertInfo, + cancellationHandle)); + } + auto [numTriplesInternal, numPredicatesInternal] = newIndex.createInternalPSOandPOSFromRange(readIndexAndRemap( index_.getPermutation(Permutation::Enum::PSO).internalPermutation(), @@ -548,21 +558,11 @@ void DeltaTriples::materializeToIndex() { std::move(blockMetadataRanges), *snapshot, localVocabMapping, insertInfo, cancellationHandle, additionalColumns)); - for (auto permutation : {Permutation::Enum::SPO, Permutation::Enum::OPS}) { - newIndex.createPermutationPairPublic( - 4, - readIndexAndRemap(index_.getPermutation(permutation), scanSpec, - *snapshot, localVocabMapping, insertInfo, - cancellationHandle), - newIndex.getPermutation(permutation), - newIndex.getPermutation( - static_cast(static_cast(permutation) + 1))); - } + + // 
TODO explicitly set these two + // newIndex.configurationJson_["has-all-permutations"] = + // index_.hasAllPermutations(); + // newIndex.configurationJson_["num-blank-nodes-total"] = TBD; newIndex.addInternalStatisticsToConfiguration(numTriplesInternal, numPredicatesInternal); - if (index_.usePatterns()) { - std::filesystem::copy(index_.getOnDiskBase() + ".index.patterns", - newIndex.getOnDiskBase() + ".index.patterns", - std::filesystem::copy_options::overwrite_existing); - } } diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index cce579df31..94685a859f 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -404,11 +404,8 @@ void IndexImpl::createFromFiles( createInternalPSOandPOS(*indexBuilderData.sorter_.internalTriplesPso_); }; - // TODO: this will become ad_utility::InputRangeErased so no conversion - // will be needed after https://github.com/ad-freiburg/qlever/pull/2208 - // For the first permutation, perform a unique. - auto firstSorterWithUnique{ad_utility::InputRangeTypeErased{ - ad_utility::uniqueBlockView(firstSorter.getSortedOutput())}}; + auto firstSorterWithUnique = + ad_utility::uniqueBlockView(firstSorter.getSortedOutput()); if (!loadAllPermutations_) { createInternalPsoAndPosAndSetMetadata(); @@ -895,15 +892,6 @@ IndexImpl::createPermutations(size_t numColumns, T&& sortedTriples, return metaData; } -// ________________________________________________________________________ -void IndexImpl::createPermutationPairPublic( - size_t numColumns, - ad_utility::InputRangeTypeErased>&& sortedTriples, - const Permutation& p1, const Permutation& p2) { - [[maybe_unused]] auto value = - createPermutationPair(numColumns, AD_FWD(sortedTriples), p1, p2); -} - // ________________________________________________________________________ template size_t IndexImpl::createPermutationPair(size_t numColumns, @@ -1822,7 +1810,13 @@ CPP_template_def(typename... 
NextSorter)(requires(sizeof...(NextSorter) <= 1)) numSubjectsNormal, numSubjectsTotal); writeConfiguration(); return result; -}; +} + +// _____________________________________________________________________________ +void IndexImpl::createSPOAndSOPPublic(size_t numColumns, + BlocksOfTriples sortedTriples) { + createSPOAndSOP(numColumns, std::move(sortedTriples)); +} // _____________________________________________________________________________ CPP_template_def(typename... NextSorter)( @@ -1841,7 +1835,13 @@ CPP_template_def(typename... NextSorter)( numObjectsNormal, numObjectsTotal); configurationJson_["has-all-permutations"] = true; writeConfiguration(); -}; +} + +// _____________________________________________________________________________ +void IndexImpl::createOSPAndOPSPublic(size_t numColumns, + BlocksOfTriples sortedTriples) { + createOSPAndOPS(numColumns, std::move(sortedTriples)); +} // _____________________________________________________________________________ template @@ -1892,4 +1892,5 @@ void IndexImpl::loadConfigFromOldIndex(const std::string& newName, setKbName(other.getKbName()); blocksizePermutationPerColumn() = other.blocksizePermutationPerColumn(); configurationJson_ = other.configurationJson_; + usePatterns_ = other.usePatterns_; } diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 96058369a5..a27ab386fa 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -567,12 +567,6 @@ class IndexImpl { Permutation::KeyOrder permutation, Callbacks&&... perTripleCallbacks); - public: - void createPermutationPairPublic( - size_t numColumns, - ad_utility::InputRangeTypeErased>&& sortedTriples, - const Permutation& p1, const Permutation& p2); - protected: // _______________________________________________________________________ // Create a pair of permutations. 
Only works for valid pairs (PSO-POS, @@ -716,6 +710,9 @@ class IndexImpl { std::optional createSPOAndSOP( size_t numColumns, BlocksOfTriples sortedTriples, NextSorter&&... nextSorter); + + void createSPOAndSOPPublic(size_t numColumns, BlocksOfTriples sortedTriples); + // Create the OSP and OPS permutations. Additionally, count the number of // distinct objects and write it to the metadata. CPP_template(typename... NextSorter)(requires( @@ -723,6 +720,8 @@ class IndexImpl { 1)) void createOSPAndOPS(size_t numColumns, BlocksOfTriples sortedTriples, NextSorter&&... nextSorter); + void createOSPAndOPSPublic(size_t numColumns, BlocksOfTriples sortedTriples); + // Create the PSO and POS permutations. Additionally, count the number of // distinct predicates and the number of actual triples and write them to the // metadata. The meta-data JSON file for the index statistics will only be From ebba3fc187dd650d0dd1953542e9397711661a98 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Fri, 24 Oct 2025 11:57:50 +0200 Subject: [PATCH 11/41] Properly run index-rebuild on worker thread --- src/engine/Server.cpp | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index 6c0f08a4e3..46e01c8121 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -449,9 +449,24 @@ CPP_template_def(typename RequestT, typename ResponseT)( } response = createJsonResponse(json, request); } else if (auto cmd = checkParameter("cmd", "rebuild-index")) { - index_.deltaTriplesManager().modify( - [](DeltaTriples& deltaTriples) { deltaTriples.materializeToIndex(); }, - false); + requireValidAccessToken("rebuild-index"); + logCommand(cmd, "rebuilding index"); + // The function requires a SharedCancellationHandle, but the operation is + // not cancellable. 
+ auto handle = std::make_shared>(); + // We don't directly `co_await` because of lifetime issues (bugs) in the + // Conan setup. + auto coroutine = computeInNewThread( + updateThreadPool_, + [this] { + index_.deltaTriplesManager().modify( + [](DeltaTriples& deltaTriples) { + deltaTriples.materializeToIndex(); + }, + false); + }, + handle); + co_await std::move(coroutine); response = createOkResponse("Done writing", request, MediaType::textPlain); } From fb64328cd4472dc882d2ea08b08e26c80b67cfd3 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Fri, 24 Oct 2025 12:40:36 +0200 Subject: [PATCH 12/41] Separate functions a little better --- src/engine/Server.cpp | 12 +++-- src/index/DeltaTriples.cpp | 107 ++++++++++++++++++++----------------- src/index/DeltaTriples.h | 6 ++- 3 files changed, 69 insertions(+), 56 deletions(-) diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index 46e01c8121..e9c902bf08 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -451,17 +451,19 @@ CPP_template_def(typename RequestT, typename ResponseT)( } else if (auto cmd = checkParameter("cmd", "rebuild-index")) { requireValidAccessToken("rebuild-index"); logCommand(cmd, "rebuilding index"); - // The function requires a SharedCancellationHandle, but the operation is - // not cancellable. + // There is no mechanism to actually cancel the handle. auto handle = std::make_shared>(); // We don't directly `co_await` because of lifetime issues (bugs) in the // Conan setup. auto coroutine = computeInNewThread( updateThreadPool_, - [this] { + [this, &handle] { index_.deltaTriplesManager().modify( - [](DeltaTriples& deltaTriples) { - deltaTriples.materializeToIndex(); + [&handle](DeltaTriples& deltaTriples) { + // TODO Ideally acquire a snapshot of the delta triples + // to then build the new index based on this snapshot without + // holding the lock any longer. 
+ deltaTriples.materializeToIndex(handle); }, false); }, diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index b26c8b1411..da20b9a5d3 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -391,6 +391,58 @@ void DeltaTriplesManager::setFilenameForPersistentUpdatesAndReadFromDisk( false); } +std::pair>, + ad_utility::HashMap> +DeltaTriples::materializeLocalVocab() const { + auto& vocab = index_.getVocab(); + + size_t newWordCount = 0; + std::vector> insertInfo; + insertInfo.reserve(localVocab_.size()); + + ad_utility::HashMap localVocabMapping; + + for (const LocalVocabEntry& entry : localVocab_.primaryWordSet()) { + const auto& [lower, upper] = entry.positionInVocab(); + AD_CORRECTNESS_CHECK(lower == upper); + Id id = Id::fromBits(upper.get()); + AD_CORRECTNESS_CHECK(id.getDatatype() == Datatype::VocabIndex); + insertInfo.emplace_back(id.getVocabIndex(), + entry.asLiteralOrIri().toStringRepresentation(), + Id::makeFromLocalVocabIndex(&entry)); + } + ql::ranges::sort(insertInfo, [](const auto& tupleA, const auto& tupleB) { + return std::tie(std::get(tupleA).get(), std::get(tupleA)) < + std::tie(std::get(tupleB).get(), std::get(tupleB)); + }); + + auto vocabWriter = vocab.makeWordWriterPtr("tmp_index.vocabulary"); + for (size_t vocabIndex = 0; vocabIndex < vocab.size(); ++vocabIndex) { + auto actualIndex = VocabIndex::make(vocabIndex); + while (insertInfo.size() > newWordCount && + std::get(insertInfo.at(newWordCount)) == actualIndex) { + AD_CORRECTNESS_CHECK(std::get(insertInfo.at(newWordCount)) < + Id::makeFromVocabIndex(actualIndex)); + auto word = std::get(insertInfo.at(newWordCount)); + auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); + localVocabMapping.emplace( + std::get(insertInfo.at(newWordCount)), + Id::makeFromVocabIndex(VocabIndex::make(newIndex))); + newWordCount++; + } + auto word = vocab[actualIndex]; + auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); + 
AD_CORRECTNESS_CHECK(newIndex == vocabIndex + newWordCount); + } + + for (const auto& [_, word, id] : insertInfo | ql::views::drop(newWordCount)) { + auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); + localVocabMapping.emplace( + id, Id::makeFromVocabIndex(VocabIndex::make(newIndex))); + } + return std::pair{std::move(insertInfo), std::move(localVocabMapping)}; +} + namespace { ad_utility::InputRangeTypeErased> readIndexAndRemap( const Permutation& permutation, ScanSpecification scanSpec, @@ -465,59 +517,14 @@ size_t getNumColumns(const BlockMetadataRanges& blockMetadataRanges) { } // namespace // _____________________________________________________________________________ -void DeltaTriples::materializeToIndex() { - auto& vocab = index_.getVocab(); - - size_t newWordCount = 0; - std::vector> insertInfo; - insertInfo.reserve(localVocab_.size()); - - ad_utility::HashMap localVocabMapping; - - for (const LocalVocabEntry& entry : - const_cast(localVocab_).primaryWordSet()) { - const auto& [lower, upper] = entry.positionInVocab(); - AD_CORRECTNESS_CHECK(lower == upper); - Id id = Id::fromBits(upper.get()); - AD_CORRECTNESS_CHECK(id.getDatatype() == Datatype::VocabIndex); - insertInfo.emplace_back(id.getVocabIndex(), - entry.asLiteralOrIri().toStringRepresentation(), - Id::makeFromLocalVocabIndex(&entry)); - } - ql::ranges::sort(insertInfo, [](const auto& tupleA, const auto& tupleB) { - return std::tie(std::get(tupleA).get(), std::get(tupleA)) < - std::tie(std::get(tupleB).get(), std::get(tupleB)); - }); - - auto vocabWriter = vocab.makeWordWriterPtr("tmp_index.vocabulary"); - for (size_t vocabIndex = 0; vocabIndex < vocab.size(); ++vocabIndex) { - auto actualIndex = VocabIndex::make(vocabIndex); - while (insertInfo.size() > newWordCount && - std::get(insertInfo.at(newWordCount)) == actualIndex) { - AD_CORRECTNESS_CHECK(std::get(insertInfo.at(newWordCount)) < - Id::makeFromVocabIndex(actualIndex)); - auto word = 
std::get(insertInfo.at(newWordCount)); - auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); - localVocabMapping.emplace( - std::get(insertInfo.at(newWordCount)), - Id::makeFromVocabIndex(VocabIndex::make(newIndex))); - newWordCount++; - } - auto word = vocab[actualIndex]; - auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); - AD_CORRECTNESS_CHECK(newIndex == vocabIndex + newWordCount); - } - - for (const auto& [_, word, id] : insertInfo | ql::views::drop(newWordCount)) { - auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); - localVocabMapping.emplace( - id, Id::makeFromVocabIndex(VocabIndex::make(newIndex))); - } +void DeltaTriples::materializeToIndex( + const CancellationHandle& cancellationHandle) { + const auto& [insertInfo, localVocabMapping] = materializeLocalVocab(); + // TODO Move much of this logic to `IndexImpl`. This way the "public" + // wrappers can be avoided. ScanSpecification scanSpec{std::nullopt, std::nullopt, std::nullopt}; auto snapshot = getSnapshot(); - CancellationHandle cancellationHandle = - std::make_shared(); IndexImpl newIndex{index_.allocator(), false}; newIndex.loadConfigFromOldIndex("tmp_index", index_); diff --git a/src/index/DeltaTriples.h b/src/index/DeltaTriples.h index aa5a06be33..7f35574a8a 100644 --- a/src/index/DeltaTriples.h +++ b/src/index/DeltaTriples.h @@ -202,7 +202,11 @@ class DeltaTriples { Permutation::Enum permutation, std::shared_ptr> metadata); - void materializeToIndex(); + std::pair>, + ad_utility::HashMap> + materializeLocalVocab() const; + + void materializeToIndex(const CancellationHandle& cancellationHandle); private: // Find the position of the given triple in the given permutation and add it From 9fc0a57001354e9ce3e99af56abece441451b314 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Fri, 24 Oct 2025 17:49:30 +0200 Subject: [PATCH 13/41] Fix regression in code --- src/index/DeltaTriples.cpp | 35 
++++++++++++++++++++++------------- src/index/IndexImpl.cpp | 13 +++++++++++++ src/index/IndexImpl.h | 6 ++++++ 3 files changed, 41 insertions(+), 13 deletions(-) diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index da20b9a5d3..5b86f8dfef 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -444,6 +444,19 @@ DeltaTriples::materializeLocalVocab() const { } namespace { +Id remapVocabId(Id original, + const std::vector>& + insertInfo) { + AD_CONTRACT_CHECK(original.getDatatype() == Datatype::VocabIndex); + size_t offset = ql::ranges::distance( + insertInfo.begin(), + ql::ranges::upper_bound( + insertInfo, original.getVocabIndex(), std::less{}, + [](const auto& tuple) { return std::get<0>(tuple); })); + return Id::makeFromVocabIndex( + VocabIndex::make(original.getVocabIndex().get() + offset)); +} + ad_utility::InputRangeTypeErased> readIndexAndRemap( const Permutation& permutation, ScanSpecification scanSpec, BlockMetadataRanges blockMetadataRanges, @@ -476,15 +489,7 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( if (id.getDatatype() == Datatype::LocalVocabIndex) { id = localVocabMapping.at(id); } else if (id.getDatatype() == Datatype::VocabIndex) { - size_t offset = ql::ranges::distance( - insertInfo.begin(), - ql::ranges::upper_bound( - insertInfo, id.getVocabIndex(), std::less{}, - [](const auto& tuple) { - return std::get<0>(tuple); - })); - id = Id::makeFromVocabIndex( - VocabIndex::make(id.getVocabIndex().get() + offset)); + id = remapVocabId(id, insertInfo); } }); } @@ -534,10 +539,14 @@ void DeltaTriples::materializeToIndex( 4, readIndexAndRemap(index_.getPermutation(Permutation::Enum::SPO), scanSpec, *snapshot, localVocabMapping, insertInfo, cancellationHandle)); - newIndex.createOSPAndOPSPublic( - 4, readIndexAndRemap(index_.getPermutation(Permutation::Enum::OPS), - scanSpec, *snapshot, localVocabMapping, insertInfo, - cancellationHandle)); + // TODO Find out why we can't use createOSPAndOPSPublic here. 
+ newIndex.createPermutationPairPublic( + 4, + readIndexAndRemap(index_.getPermutation(Permutation::Enum::OPS), + scanSpec, *snapshot, localVocabMapping, insertInfo, + cancellationHandle), + newIndex.getPermutation(Permutation::Enum::OPS), + newIndex.getPermutation(Permutation::Enum::OSP)); } auto [numTriplesInternal, numPredicatesInternal] = diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 94685a859f..44f9a080fe 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -892,6 +892,15 @@ IndexImpl::createPermutations(size_t numColumns, T&& sortedTriples, return metaData; } +// ________________________________________________________________________ +void IndexImpl::createPermutationPairPublic( + size_t numColumns, + ad_utility::InputRangeTypeErased>&& sortedTriples, + const Permutation& p1, const Permutation& p2) { + [[maybe_unused]] auto value = + createPermutationPair(numColumns, AD_FWD(sortedTriples), p1, p2); +} + // ________________________________________________________________________ template size_t IndexImpl::createPermutationPair(size_t numColumns, @@ -1893,4 +1902,8 @@ void IndexImpl::loadConfigFromOldIndex(const std::string& newName, blocksizePermutationPerColumn() = other.blocksizePermutationPerColumn(); configurationJson_ = other.configurationJson_; usePatterns_ = other.usePatterns_; + idOfHasPatternDuringIndexBuilding_ = + qlever::specialIds().at(HAS_PATTERN_PREDICATE); + idOfInternalGraphDuringIndexBuilding_ = + qlever::specialIds().at(QLEVER_INTERNAL_GRAPH_IRI); } diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index a27ab386fa..e3aa254af4 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -567,6 +567,12 @@ class IndexImpl { Permutation::KeyOrder permutation, Callbacks&&... 
perTripleCallbacks); + public: + void createPermutationPairPublic( + size_t numColumns, + ad_utility::InputRangeTypeErased>&& sortedTriples, + const Permutation& p1, const Permutation& p2); + protected: // _______________________________________________________________________ // Create a pair of permutations. Only works for valid pairs (PSO-POS, From 609e666b499a51fc773c8ec34ba5053b114eb735 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Fri, 21 Nov 2025 17:14:44 +0100 Subject: [PATCH 14/41] Fix broken patterns & skip remap for additional cols --- src/global/Pattern.h | 15 +++++++++++---- src/index/DeltaTriples.cpp | 22 ++++++++++++++++++++-- src/index/IndexImpl.cpp | 38 +++++++++++++++++++++++++++++--------- src/index/IndexImpl.h | 6 ++++++ 4 files changed, 66 insertions(+), 15 deletions(-) diff --git a/src/global/Pattern.h b/src/global/Pattern.h index 2521556032..2062cdd3e1 100644 --- a/src/global/Pattern.h +++ b/src/global/Pattern.h @@ -116,7 +116,7 @@ class CompactVectorOfStrings { /** * @brief operator [] * @param i - * @return A std::pair containing a pointer to the data, and the number of + * @return A `value_type` containing a pointer to the data, and the number of * elements stored at the pointers target. */ const value_type operator[](size_t i) const { @@ -126,9 +126,16 @@ class CompactVectorOfStrings { return {ptr, size}; } - // Forward iterator for a `CompactVectorOfStrings` that reads directly from - // disk without buffering the whole `Vector`. 
- static cppcoro::generator diskIterator(std::string filename); + template + CompactVectorOfStrings cloneAndRemap(Func mappingFunction) const { + CompactVectorOfStrings clone; + clone.offsets_ = offsets_; + clone.data_.reserve(data_.size()); + for (const data_type& element : data_) { + clone.data_.push_back(std::invoke(mappingFunction, element)); + } + return clone; + } using Iterator = ad_utility::IteratorForAccessOperator< CompactVectorOfStrings, ad_utility::AccessViaBracketOperator, diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index 47c7fdedd1..1b04d77e87 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -496,7 +496,12 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( &insertInfo](IdTable& idTable) { AD_CORRECTNESS_CHECK(idTable.numColumns() == columnIndices.size()); idTable.setColumnSubset(columnIndices); - for (auto col : idTable.getColumns()) { + auto allCols = idTable.getColumns(); + // Extra columns beyond the graph column only contain integers (or + // undefined for triples added via UPDATE) and thus don't need to be + // remapped. 
+ constexpr size_t REGULAR_COLUMNS = 4; + for (auto col : allCols | ::ranges::views::take(REGULAR_COLUMNS)) { ql::ranges::for_each( col, [&localVocabMapping, &insertInfo](Id& id) { if (id.getDatatype() == Datatype::LocalVocabIndex) { @@ -506,6 +511,13 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( } }); } + AD_EXPENSIVE_CHECK(ql::ranges::all_of( + allCols | ::ranges::views::drop(REGULAR_COLUMNS), [](auto col) { + return ql::ranges::all_of(col, [](Id id) { + return id.getDatatype() == Datatype::Int || + id.isUndefined(); + }); + })); return IdTableStatic<0>{std::move(idTable)}; }}}; } @@ -546,8 +558,14 @@ void DeltaTriples::materializeToIndex( IndexImpl newIndex{index_.allocator(), false}; newIndex.loadConfigFromOldIndex("tmp_index", index_); + if (index_.usePatterns()) { + newIndex.getPatterns() = + index_.getPatterns().cloneAndRemap([&insertInfo](const Id& oldId) { + return remapVocabId(oldId, insertInfo); + }); + } + if (index_.hasAllPermutations()) { - // TODO Figure out how to respect patterns here properly. 
newIndex.createSPOAndSOPPublic( 4, readIndexAndRemap(index_.getPermutation(Permutation::Enum::SPO), scanSpec, *snapshot, localVocabMapping, insertInfo, diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 2d1ed444d1..1123a456e4 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1012,7 +1012,7 @@ void IndexImpl::createFromOnDiskIndex(const std::string& onDiskBase, if (usePatterns_) { try { PatternCreator::readPatternsFromFile( - onDiskBase_ + ".index.patterns", avgNumDistinctSubjectsPerPredicate_, + getPatternFilename(), avgNumDistinctSubjectsPerPredicate_, avgNumDistinctPredicatesPerSubject_, numDistinctSubjectPredicatePairs_, patterns_); } catch (const std::exception& e) { @@ -1046,6 +1046,9 @@ const CompactVectorOfStrings& IndexImpl::getPatterns() const { return patterns_; } +// _____________________________________________________________________________ +CompactVectorOfStrings& IndexImpl::getPatterns() { return patterns_; } + // _____________________________________________________________________________ double IndexImpl::getAvgNumDistinctPredicatesPerSubject() const { throwExceptionIfNoPatterns(); @@ -1722,6 +1725,11 @@ void IndexImpl::deleteTemporaryFile(const std::string& path) { } } +// _____________________________________________________________________________ +std::string IndexImpl::getPatternFilename() const { + return onDiskBase_ + ".index.patterns"; +} + namespace { // Return a lambda that is called repeatedly with triples that are sorted by the @@ -1802,8 +1810,7 @@ CPP_template_def(typename... NextSorter)(requires(sizeof...(NextSorter) <= 1)) // For now (especially for testing) We build the new pattern format as well // as the old one to see that they match. 
PatternCreator patternCreator{ - onDiskBase_ + ".index.patterns", - idOfHasPatternDuringIndexBuilding_.value(), + getPatternFilename(), idOfHasPatternDuringIndexBuilding_.value(), memoryLimitIndexBuilding() / NUM_EXTERNAL_SORTERS_AT_SAME_TIME}; auto pushTripleToPatterns = [&patternCreator](const auto& triple) { bool ignoreForPatterns = false; @@ -1824,7 +1831,6 @@ CPP_template_def(typename... NextSorter)(requires(sizeof...(NextSorter) <= 1)) writeConfiguration(); result = std::move(patternCreator).getTripleSorter(); } else { - AD_CORRECTNESS_CHECK(sizeof...(nextSorter) == 1); numSubjectsTotal = createPermutationPair( numColumns, AD_FWD(sortedTriples), spo_, sop_, nextSorter.makePushCallback()..., std::ref(numSubjectCounter)); @@ -1918,9 +1924,23 @@ void IndexImpl::loadConfigFromOldIndex(const std::string& newName, setKbName(other.getKbName()); blocksizePermutationPerColumn() = other.blocksizePermutationPerColumn(); configurationJson_ = other.configurationJson_; - usePatterns_ = other.usePatterns_; - idOfHasPatternDuringIndexBuilding_ = - qlever::specialIds().at(HAS_PATTERN_PREDICATE); - idOfInternalGraphDuringIndexBuilding_ = - qlever::specialIds().at(QLEVER_INTERNAL_GRAPH_IRI); +} + +// _____________________________________________________________________________ +void IndexImpl::writePatternsToFile() const { + // Write the subjectToPatternMap. + ad_utility::serialization::FileWriteSerializer patternWriter{ + getPatternFilename()}; + + // Write the statistics and the patterns. 
+ PatternStatistics statistics; + + statistics.numDistinctSubjectPredicatePairs_ = + numDistinctSubjectPredicatePairs_; + statistics.avgNumDistinctSubjectsPerPredicate_ = + avgNumDistinctSubjectsPerPredicate_; + statistics.avgNumDistinctPredicatesPerSubject_ = + avgNumDistinctPredicatesPerSubject_; + patternWriter << statistics; + patternWriter << patterns_; } diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index e3aa254af4..f538ae4177 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -317,6 +317,8 @@ class IndexImpl { Index::Vocab::PrefixRanges prefixRanges(std::string_view prefix) const; const CompactVectorOfStrings& getPatterns() const; + + CompactVectorOfStrings& getPatterns(); /** * @return The multiplicity of the Entities column (0) of the full * has-relation relation after unrolling the patterns. @@ -693,6 +695,8 @@ class IndexImpl { */ void deleteTemporaryFile(const std::string& path); + std::string getPatternFilename() const; + public: // Count the number of "QLever-internal" triples (predicate ql:langtag or // predicate starts with @) and all other triples (that were actually part of @@ -834,6 +838,8 @@ class IndexImpl { void loadConfigFromOldIndex(const std::string& newName, const IndexImpl& other); + + void writePatternsToFile() const; }; #endif // QLEVER_SRC_INDEX_INDEXIMPL_H From ce95ec17bb0bd35d8ea23df3441d056adcaad393 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Fri, 21 Nov 2025 18:52:03 +0100 Subject: [PATCH 15/41] Add missing call to actually write the patterns --- src/index/DeltaTriples.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index 1b04d77e87..12c5f808e1 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -563,6 +563,7 @@ void DeltaTriples::materializeToIndex( index_.getPatterns().cloneAndRemap([&insertInfo](const Id& oldId) { return remapVocabId(oldId, insertInfo); }); + 
newIndex.writePatternsToFile(); } if (index_.hasAllPermutations()) { From 2ed78908f9abfa931865ced8ee7b1d5aa0725343 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Tue, 16 Dec 2025 12:43:38 +0100 Subject: [PATCH 16/41] Move rebuilder to dedicated file and make rebuild non-blocking --- src/engine/Server.cpp | 46 ++++--- src/engine/Server.h | 4 + src/index/CMakeLists.txt | 2 +- src/index/DeltaTriples.cpp | 223 +++----------------------------- src/index/DeltaTriples.h | 13 +- src/index/IndexRebuilder.cpp | 243 +++++++++++++++++++++++++++++++++++ src/index/IndexRebuilder.h | 25 ++++ 7 files changed, 326 insertions(+), 230 deletions(-) create mode 100644 src/index/IndexRebuilder.cpp create mode 100644 src/index/IndexRebuilder.h diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index c01271480f..13abc197d3 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -26,6 +26,7 @@ #include "engine/SparqlProtocol.h" #include "global/RuntimeParameters.h" #include "index/IndexImpl.h" +#include "index/IndexRebuilder.h" #include "parser/SparqlParser.h" #include "util/AsioHelpers.h" #include "util/MemorySize/MemorySize.h" @@ -456,26 +457,31 @@ CPP_template_def(typename RequestT, typename ResponseT)( response = createJsonResponse(json, request); } else if (auto cmd = checkParameter("cmd", "rebuild-index")) { requireValidAccessToken("rebuild-index"); - logCommand(cmd, "rebuilding index"); - // There is no mechanism to actually cancel the handle. - auto handle = std::make_shared>(); - // We don't directly `co_await` because of lifetime issues (bugs) in the - // Conan setup. - auto coroutine = computeInNewThread( - updateThreadPool_, - [this, &handle] { - index_.deltaTriplesManager().modify( - [&handle](DeltaTriples& deltaTriples) { - // TODO Ideally acquire a snapshot of the delta triples - // to then build the new index based on this snapshot without - // holding the lock any longer. 
- deltaTriples.materializeToIndex(handle); - }, - false); - }, - handle); - co_await std::move(coroutine); - response = createOkResponse("Done writing", request, MediaType::textPlain); + + if (rebuildInProgress_.exchange(true)) { + response = createHttpResponseFromString( + "Another rebuild is currently in progress!", + http::status::too_many_requests, request, MediaType::textPlain); + } else { + absl::Cleanup cleanup{[this]() { rebuildInProgress_.store(false); }}; + logCommand(cmd, "rebuilding index"); + // There is no mechanism to actually cancel the handle. + auto handle = std::make_shared>(); + // We don't directly `co_await` because of lifetime issues (bugs) in the + // Conan setup. + auto coroutine = computeInNewThread( + queryThreadPool_, + [this, &handle] { + auto [currentSnapshot, localVocabCopy] = + index_.deltaTriplesManager().getCurrentSnapshotWithVocab(); + qlever::materializeToIndex(index_.getImpl(), "tmp_index", + localVocabCopy, currentSnapshot, handle); + }, + handle); + co_await std::move(coroutine); + response = + createOkResponse("Done writing", request, MediaType::textPlain); + } } // Ping with or without message. diff --git a/src/engine/Server.h b/src/engine/Server.h index dd77038316..518ce48605 100644 --- a/src/engine/Server.h +++ b/src/engine/Server.h @@ -103,6 +103,10 @@ class Server { /// Executor with a single thread that is used to run timers asynchronously. boost::asio::static_thread_pool timerExecutor_{1}; + // Indicates if an index rebuild is currently in progress so that we prevent + // triggering this twice. 
+ std::atomic_bool rebuildInProgress_{false}; + template using Awaitable = boost::asio::awaitable; diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt index 577af95874..da796a20f2 100644 --- a/src/index/CMakeLists.txt +++ b/src/index/CMakeLists.txt @@ -7,5 +7,5 @@ add_library(index PrefixHeuristic.cpp CompressedRelation.cpp PatternCreator.cpp ScanSpecification.cpp DeltaTriples.cpp LocalVocabEntry.cpp TextScoring.cpp TextScoringEnum.cpp TextIndexReadWrite.cpp - TextIndexBuilder.cpp GraphFilter.cpp) + TextIndexBuilder.cpp GraphFilter.cpp IndexRebuilder.cpp) qlever_target_link_libraries(index util parser vocabulary global) diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index 83e780e1fc..1d30ec24f3 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -387,6 +387,16 @@ SharedLocatedTriplesSnapshot DeltaTriplesManager::getCurrentSnapshot() const { return *currentLocatedTriplesSnapshot_.rlock(); } +// _____________________________________________________________________________ +std::pair>> +DeltaTriplesManager::getCurrentSnapshotWithVocab() const { + return deltaTriples_.withReadLock([this](const DeltaTriples& deltaTriples) { + return std::make_pair(*currentLocatedTriplesSnapshot_.rlock(), + deltaTriples.deepCloneLocalVocab()); + }); +} + // _____________________________________________________________________________ void DeltaTriples::setOriginalMetadata( Permutation::Enum permutation, @@ -482,213 +492,18 @@ void DeltaTriplesManager::setFilenameForPersistentUpdatesAndReadFromDisk( false); } -std::pair>, - ad_utility::HashMap> -DeltaTriples::materializeLocalVocab() const { - auto& vocab = index_.getVocab(); - - size_t newWordCount = 0; - std::vector> insertInfo; - insertInfo.reserve(localVocab_.size()); +// _____________________________________________________________________________ +std::vector> +DeltaTriples::deepCloneLocalVocab() const { + std::vector> entries; + entries.reserve(localVocab_.size()); 
ad_utility::HashMap localVocabMapping; + // TODO it is probably sufficient to just hand out the pointers + a + // lifetime extender. for (const LocalVocabEntry& entry : localVocab_.primaryWordSet()) { - const auto& [lower, upper] = entry.positionInVocab(); - AD_CORRECTNESS_CHECK(lower == upper); - Id id = Id::fromBits(upper.get()); - AD_CORRECTNESS_CHECK(id.getDatatype() == Datatype::VocabIndex); - insertInfo.emplace_back(id.getVocabIndex(), - entry.asLiteralOrIri().toStringRepresentation(), - Id::makeFromLocalVocabIndex(&entry)); - } - ql::ranges::sort(insertInfo, [](const auto& tupleA, const auto& tupleB) { - return std::tie(std::get(tupleA).get(), std::get(tupleA)) < - std::tie(std::get(tupleB).get(), std::get(tupleB)); - }); - - auto vocabWriter = vocab.makeWordWriterPtr("tmp_index.vocabulary"); - for (size_t vocabIndex = 0; vocabIndex < vocab.size(); ++vocabIndex) { - auto actualIndex = VocabIndex::make(vocabIndex); - while (insertInfo.size() > newWordCount && - std::get(insertInfo.at(newWordCount)) == actualIndex) { - AD_CORRECTNESS_CHECK(std::get(insertInfo.at(newWordCount)) < - Id::makeFromVocabIndex(actualIndex)); - auto word = std::get(insertInfo.at(newWordCount)); - auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); - localVocabMapping.emplace( - std::get(insertInfo.at(newWordCount)), - Id::makeFromVocabIndex(VocabIndex::make(newIndex))); - newWordCount++; - } - auto word = vocab[actualIndex]; - auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); - AD_CORRECTNESS_CHECK(newIndex == vocabIndex + newWordCount); - } - - for (const auto& [_, word, id] : insertInfo | ql::views::drop(newWordCount)) { - auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); - localVocabMapping.emplace( - id, Id::makeFromVocabIndex(VocabIndex::make(newIndex))); - } - return std::pair{std::move(insertInfo), std::move(localVocabMapping)}; -} - -namespace { -Id remapVocabId(Id original, - const std::vector>& - insertInfo) { - 
AD_CONTRACT_CHECK(original.getDatatype() == Datatype::VocabIndex); - size_t offset = ql::ranges::distance( - insertInfo.begin(), - ql::ranges::upper_bound( - insertInfo, original.getVocabIndex(), std::less{}, - [](const auto& tuple) { return std::get<0>(tuple); })); - return Id::makeFromVocabIndex( - VocabIndex::make(original.getVocabIndex().get() + offset)); -} - -ad_utility::InputRangeTypeErased> readIndexAndRemap( - const Permutation& permutation, ScanSpecification scanSpec, - BlockMetadataRanges blockMetadataRanges, - const LocatedTriplesSnapshot& snapshot, - const ad_utility::HashMap& localVocabMapping, - const std::vector>& insertInfo, - const ad_utility::SharedCancellationHandle& cancellationHandle, - ql::span additionalColumns) { - Permutation::ScanSpecAndBlocks scanSpecAndBlocks{ - std::move(scanSpec), std::move(blockMetadataRanges)}; - auto fullScan = - permutation.lazyScan(scanSpecAndBlocks, std::nullopt, additionalColumns, - cancellationHandle, snapshot, LimitOffsetClause{}); - auto keyOrder = Permutation::toKeyOrder(permutation.permutation()); - std::vector columnIndices{keyOrder.keys().begin(), - keyOrder.keys().end()}; - while (columnIndices.size() < additionalColumns.size() + 3) { - columnIndices.emplace_back(columnIndices.size()); - } - return ad_utility::InputRangeTypeErased{ - ad_utility::CachingTransformInputRange{ - std::move(fullScan), - [columnIndices = std::move(columnIndices), &localVocabMapping, - &insertInfo](IdTable& idTable) { - AD_CORRECTNESS_CHECK(idTable.numColumns() == columnIndices.size()); - idTable.setColumnSubset(columnIndices); - auto allCols = idTable.getColumns(); - // Extra columns beyond the graph column only contain integers (or - // undefined for triples added via UPDATE) and thus don't need to be - // remapped. 
- constexpr size_t REGULAR_COLUMNS = 4; - for (auto col : allCols | ::ranges::views::take(REGULAR_COLUMNS)) { - ql::ranges::for_each( - col, [&localVocabMapping, &insertInfo](Id& id) { - if (id.getDatatype() == Datatype::LocalVocabIndex) { - id = localVocabMapping.at(id); - } else if (id.getDatatype() == Datatype::VocabIndex) { - id = remapVocabId(id, insertInfo); - } - }); - } - AD_EXPENSIVE_CHECK(ql::ranges::all_of( - allCols | ::ranges::views::drop(REGULAR_COLUMNS), [](auto col) { - return ql::ranges::all_of(col, [](Id id) { - return id.getDatatype() == Datatype::Int || - id.isUndefined(); - }); - })); - return IdTableStatic<0>{std::move(idTable)}; - }}}; -} - -ad_utility::InputRangeTypeErased> readIndexAndRemap( - const Permutation& permutation, ScanSpecification scanSpec, - const LocatedTriplesSnapshot& snapshot, - const ad_utility::HashMap& localVocabMapping, - const std::vector>& insertInfo, - const ad_utility::SharedCancellationHandle& cancellationHandle) { - return readIndexAndRemap( - permutation, std::move(scanSpec), - permutation.getAugmentedMetadataForPermutation(snapshot), snapshot, - localVocabMapping, insertInfo, cancellationHandle, - std::array{static_cast(ADDITIONAL_COLUMN_GRAPH_ID)}); -} - -size_t getNumColumns(const BlockMetadataRanges& blockMetadataRanges) { - if (!blockMetadataRanges.empty()) { - const auto& first = blockMetadataRanges.at(0); - if (!first.empty()) { - return first[0].offsetsAndCompressedSize_.size(); - } - } - return 4; -} -} // namespace - -// _____________________________________________________________________________ -void DeltaTriples::materializeToIndex( - const CancellationHandle& cancellationHandle) { - const auto& [insertInfo, localVocabMapping] = materializeLocalVocab(); - - // TODO Move much of this logic to `IndexImpl`. This way the "public" - // wrappers can be avoided. 
- ScanSpecification scanSpec{std::nullopt, std::nullopt, std::nullopt}; - auto snapshot = getSnapshot(); - IndexImpl newIndex{index_.allocator(), false}; - newIndex.loadConfigFromOldIndex("tmp_index", index_); - - if (index_.usePatterns()) { - newIndex.getPatterns() = - index_.getPatterns().cloneAndRemap([&insertInfo](const Id& oldId) { - return remapVocabId(oldId, insertInfo); - }); - newIndex.writePatternsToFile(); - } - - if (index_.hasAllPermutations()) { - newIndex.createSPOAndSOPPublic( - 4, readIndexAndRemap(index_.getPermutation(Permutation::Enum::SPO), - scanSpec, *snapshot, localVocabMapping, insertInfo, - cancellationHandle)); - // TODO Find out why we can't use createOSPAndOPSPublic here. - newIndex.createPermutationPairPublic( - 4, - readIndexAndRemap(index_.getPermutation(Permutation::Enum::OPS), - scanSpec, *snapshot, localVocabMapping, insertInfo, - cancellationHandle), - newIndex.getPermutation(Permutation::Enum::OPS), - newIndex.getPermutation(Permutation::Enum::OSP)); - } - - auto [numTriplesInternal, numPredicatesInternal] = - newIndex.createInternalPSOandPOSFromRange(readIndexAndRemap( - index_.getPermutation(Permutation::Enum::PSO).internalPermutation(), - scanSpec, *snapshot, localVocabMapping, insertInfo, - cancellationHandle)); - - const auto& psoPermutation = index_.getPermutation(Permutation::Enum::PSO); - auto blockMetadataRanges = - psoPermutation.getAugmentedMetadataForPermutation(*snapshot); - size_t numColumns = getNumColumns(blockMetadataRanges); - std::vector additionalColumns; - additionalColumns.push_back(ADDITIONAL_COLUMN_GRAPH_ID); - for (ColumnIndex col : {ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN, - ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN}) { - if (additionalColumns.size() >= numColumns - 3) { - break; - } - additionalColumns.push_back(col); + entries.push_back(std::make_pair(entry, &entry)); } - AD_CORRECTNESS_CHECK(additionalColumns.size() == numColumns - 3); - newIndex.createPSOAndPOSImplPublic( - numColumns, 
readIndexAndRemap(psoPermutation, scanSpec, - std::move(blockMetadataRanges), *snapshot, - localVocabMapping, insertInfo, - cancellationHandle, additionalColumns)); - - // TODO explicitly set these two - // newIndex.configurationJson_["has-all-permutations"] = - // index_.hasAllPermutations(); - // newIndex.configurationJson_["num-blank-nodes-total"] = TBD; - newIndex.addInternalStatisticsToConfiguration(numTriplesInternal, - numPredicatesInternal); + return entries; } diff --git a/src/index/DeltaTriples.h b/src/index/DeltaTriples.h index 42fee83db4..5c163d32e9 100644 --- a/src/index/DeltaTriples.h +++ b/src/index/DeltaTriples.h @@ -242,11 +242,10 @@ class DeltaTriples { // Update the block metadata. void updateAugmentedMetadata(); - std::pair>, - ad_utility::HashMap> - materializeLocalVocab() const; - - void materializeToIndex(const CancellationHandle& cancellationHandle); + // Create a deep clone of the local vocab such that it can be processed + // without holding the lock. + std::vector> deepCloneLocalVocab() + const; private: // The the proper state according to the template parameter. This will either @@ -342,6 +341,10 @@ class DeltaTriplesManager { // Return a shared pointer to a deep copy of the current snapshot. This can // be safely used to execute a query without interfering with future updates. SharedLocatedTriplesSnapshot getCurrentSnapshot() const; + + std::pair>> + getCurrentSnapshotWithVocab() const; }; #endif // QLEVER_SRC_INDEX_DELTATRIPLES_H diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp new file mode 100644 index 0000000000..a14d038905 --- /dev/null +++ b/src/index/IndexRebuilder.cpp @@ -0,0 +1,243 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Robin Textor-Falconi + +#include "index/IndexRebuilder.h" + +#include +#include +#include +#include +#include +#include + +#include "backports/algorithm.h" +#include "engine/idTable/IdTable.h" +#include "global/Id.h" +#include "index/IndexImpl.h" +#include "index/LocalVocabEntry.h" +#include "index/Permutation.h" +#include "util/CancellationHandle.h" +#include "util/Exception.h" +#include "util/HashMap.h" +#include "util/InputRangeUtils.h" + +namespace { +using CancellationHandle = ad_utility::SharedCancellationHandle; + +// _____________________________________________________________________________ +std::pair>, + ad_utility::HashMap> +materializeLocalVocab( + const std::vector>& entries, + const Index::Vocab& vocab, const std::string& newIndexName) { + size_t newWordCount = 0; + std::vector> insertInfo; + insertInfo.reserve(entries.size()); + + ad_utility::HashMap localVocabMapping; + + for (const auto& [entry, originalIndex] : entries) { + const auto& [lower, upper] = entry.positionInVocab(); + AD_CORRECTNESS_CHECK(lower == upper); + Id id = Id::fromBits(upper.get()); + AD_CORRECTNESS_CHECK(id.getDatatype() == Datatype::VocabIndex); + insertInfo.emplace_back(id.getVocabIndex(), + entry.asLiteralOrIri().toStringRepresentation(), + Id::makeFromLocalVocabIndex(originalIndex)); + } + ql::ranges::sort(insertInfo, [](const auto& tupleA, const auto& tupleB) { + return std::tie(std::get(tupleA).get(), std::get(tupleA)) < + std::tie(std::get(tupleB).get(), std::get(tupleB)); + }); + + auto vocabWriter = vocab.makeWordWriterPtr(newIndexName + ".vocabulary"); + for (size_t vocabIndex = 0; vocabIndex < vocab.size(); ++vocabIndex) { + auto actualIndex = VocabIndex::make(vocabIndex); + while (insertInfo.size() > newWordCount && + std::get(insertInfo.at(newWordCount)) == actualIndex) { + AD_CORRECTNESS_CHECK(std::get(insertInfo.at(newWordCount)) < + Id::makeFromVocabIndex(actualIndex)); + auto word = std::get(insertInfo.at(newWordCount)); + auto newIndex = 
(*vocabWriter)(word, vocab.shouldBeExternalized(word)); + localVocabMapping.emplace( + std::get(insertInfo.at(newWordCount)), + Id::makeFromVocabIndex(VocabIndex::make(newIndex))); + newWordCount++; + } + auto word = vocab[actualIndex]; + auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); + AD_CORRECTNESS_CHECK(newIndex == vocabIndex + newWordCount); + } + + for (const auto& [_, word, id] : insertInfo | ql::views::drop(newWordCount)) { + auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); + localVocabMapping.emplace( + id, Id::makeFromVocabIndex(VocabIndex::make(newIndex))); + } + return std::pair{std::move(insertInfo), std::move(localVocabMapping)}; +} + +// _____________________________________________________________________________ +Id remapVocabId(Id original, + const std::vector>& + insertInfo) { + AD_CONTRACT_CHECK(original.getDatatype() == Datatype::VocabIndex); + size_t offset = ql::ranges::distance( + insertInfo.begin(), + ql::ranges::upper_bound( + insertInfo, original.getVocabIndex(), std::less{}, + [](const auto& tuple) { return std::get<0>(tuple); })); + return Id::makeFromVocabIndex( + VocabIndex::make(original.getVocabIndex().get() + offset)); +} + +// _____________________________________________________________________________ +ad_utility::InputRangeTypeErased> readIndexAndRemap( + const Permutation& permutation, ScanSpecification scanSpec, + const BlockMetadataRanges& blockMetadataRanges, + const LocatedTriplesSnapshot& snapshot, + const ad_utility::HashMap& localVocabMapping, + const std::vector>& insertInfo, + const ad_utility::SharedCancellationHandle& cancellationHandle, + ql::span additionalColumns) { + Permutation::ScanSpecAndBlocks scanSpecAndBlocks{std::move(scanSpec), + blockMetadataRanges}; + auto fullScan = + permutation.lazyScan(scanSpecAndBlocks, std::nullopt, additionalColumns, + cancellationHandle, snapshot, LimitOffsetClause{}); + auto keyOrder = 
Permutation::toKeyOrder(permutation.permutation()); + std::vector columnIndices{keyOrder.keys().begin(), + keyOrder.keys().end()}; + while (columnIndices.size() < additionalColumns.size() + 3) { + columnIndices.emplace_back(columnIndices.size()); + } + return ad_utility::InputRangeTypeErased{ + ad_utility::CachingTransformInputRange{ + std::move(fullScan), + [columnIndices = std::move(columnIndices), &localVocabMapping, + &insertInfo](IdTable& idTable) { + AD_CORRECTNESS_CHECK(idTable.numColumns() == columnIndices.size()); + idTable.setColumnSubset(columnIndices); + auto allCols = idTable.getColumns(); + // Extra columns beyond the graph column only contain integers (or + // undefined for triples added via UPDATE) and thus don't need to be + // remapped. + constexpr size_t REGULAR_COLUMNS = 4; + for (auto col : allCols | ::ranges::views::take(REGULAR_COLUMNS)) { + ql::ranges::for_each( + col, [&localVocabMapping, &insertInfo](Id& id) { + if (id.getDatatype() == Datatype::LocalVocabIndex) { + id = localVocabMapping.at(id); + } else if (id.getDatatype() == Datatype::VocabIndex) { + id = remapVocabId(id, insertInfo); + } + }); + } + AD_EXPENSIVE_CHECK(ql::ranges::all_of( + allCols | ::ranges::views::drop(REGULAR_COLUMNS), [](auto col) { + return ql::ranges::all_of(col, [](Id id) { + return id.getDatatype() == Datatype::Int || + id.isUndefined(); + }); + })); + return IdTableStatic<0>{std::move(idTable)}; + }}}; +} + +ad_utility::InputRangeTypeErased> readIndexAndRemap( + const Permutation& permutation, ScanSpecification scanSpec, + const LocatedTriplesSnapshot& snapshot, + const ad_utility::HashMap& localVocabMapping, + const std::vector>& insertInfo, + const ad_utility::SharedCancellationHandle& cancellationHandle) { + return readIndexAndRemap( + permutation, std::move(scanSpec), + permutation.getAugmentedMetadataForPermutation(snapshot), snapshot, + localVocabMapping, insertInfo, cancellationHandle, + std::array{static_cast(ADDITIONAL_COLUMN_GRAPH_ID)}); +} + 
+size_t getNumColumns(const BlockMetadataRanges& blockMetadataRanges) { + if (!blockMetadataRanges.empty()) { + const auto& first = blockMetadataRanges.at(0); + if (!first.empty()) { + return first[0].offsetsAndCompressedSize_.size(); + } + } + return 4; +} +} // namespace + +// _____________________________________________________________________________ +namespace qlever { +void materializeToIndex( + const IndexImpl& index, const std::string& newIndexName, + const std::vector>& entries, + const SharedLocatedTriplesSnapshot& snapshot, + const CancellationHandle& cancellationHandle) { + const auto& [insertInfo, localVocabMapping] = + materializeLocalVocab(entries, index.getVocab(), newIndexName); + + ScanSpecification scanSpec{std::nullopt, std::nullopt, std::nullopt}; + IndexImpl newIndex{index.allocator(), false}; + newIndex.loadConfigFromOldIndex(newIndexName, index); + + if (index.usePatterns()) { + newIndex.getPatterns() = + index.getPatterns().cloneAndRemap([&insertInfo](const Id& oldId) { + return remapVocabId(oldId, insertInfo); + }); + newIndex.writePatternsToFile(); + } + + if (index.hasAllPermutations()) { + newIndex.createSPOAndSOPPublic( + 4, readIndexAndRemap(index.getPermutation(Permutation::Enum::SPO), + scanSpec, *snapshot, localVocabMapping, insertInfo, + cancellationHandle)); + // TODO Find out why we can't use createOSPAndOPSPublic here. 
+ newIndex.createPermutationPairPublic( + 4, + readIndexAndRemap(index.getPermutation(Permutation::Enum::OPS), + scanSpec, *snapshot, localVocabMapping, insertInfo, + cancellationHandle), + newIndex.getPermutation(Permutation::Enum::OPS), + newIndex.getPermutation(Permutation::Enum::OSP)); + } + + auto [numTriplesInternal, numPredicatesInternal] = + newIndex.createInternalPSOandPOSFromRange(readIndexAndRemap( + index.getPermutation(Permutation::Enum::PSO).internalPermutation(), + scanSpec, *snapshot, localVocabMapping, insertInfo, + cancellationHandle)); + + const auto& psoPermutation = index.getPermutation(Permutation::Enum::PSO); + auto blockMetadataRanges = + psoPermutation.getAugmentedMetadataForPermutation(*snapshot); + size_t numColumns = getNumColumns(blockMetadataRanges); + std::vector additionalColumns; + additionalColumns.push_back(ADDITIONAL_COLUMN_GRAPH_ID); + for (ColumnIndex col : {ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN, + ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN}) { + if (additionalColumns.size() >= numColumns - 3) { + break; + } + additionalColumns.push_back(col); + } + AD_CORRECTNESS_CHECK(additionalColumns.size() == numColumns - 3); + newIndex.createPSOAndPOSImplPublic( + numColumns, + readIndexAndRemap(psoPermutation, scanSpec, blockMetadataRanges, + *snapshot, localVocabMapping, insertInfo, + cancellationHandle, additionalColumns)); + + // TODO explicitly set these two + // newIndex.configurationJson_["has-all-permutations"] = + // index_.hasAllPermutations(); + // newIndex.configurationJson_["num-blank-nodes-total"] = TBD; + newIndex.addInternalStatisticsToConfiguration(numTriplesInternal, + numPredicatesInternal); +} + +} // namespace qlever diff --git a/src/index/IndexRebuilder.h b/src/index/IndexRebuilder.h new file mode 100644 index 0000000000..42cc93e471 --- /dev/null +++ b/src/index/IndexRebuilder.h @@ -0,0 +1,25 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Robin Textor-Falconi + +#ifndef QLEVER_SRC_INDEX_INDEXREBUILDER_H +#define QLEVER_SRC_INDEX_INDEXREBUILDER_H + +#include +#include + +#include "index/IndexImpl.h" +#include "util/CancellationHandle.h" + +namespace qlever { + +// Build a new index based on this data. +void materializeToIndex( + const IndexImpl& index, const std::string& newIndexName, + const std::vector>& entries, + const SharedLocatedTriplesSnapshot& snapshot, + const ad_utility::SharedCancellationHandle& cancellationHandle); + +} // namespace qlever + +#endif // QLEVER_SRC_INDEX_INDEXREBUILDER_H From 1292865bb2bf899f76e40df162c71939bd1f4789 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Tue, 16 Dec 2025 23:40:49 +0100 Subject: [PATCH 17/41] Avoid redundant string copies --- src/index/DeltaTriples.cpp | 14 +++++--------- src/index/DeltaTriples.h | 15 ++++++++------- src/index/IndexRebuilder.cpp | 23 +++++++++++------------ src/index/IndexRebuilder.h | 2 +- 4 files changed, 25 insertions(+), 29 deletions(-) diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index 1d30ec24f3..c25cbd09b7 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -388,12 +388,11 @@ SharedLocatedTriplesSnapshot DeltaTriplesManager::getCurrentSnapshot() const { } // _____________________________________________________________________________ -std::pair>> +std::pair> DeltaTriplesManager::getCurrentSnapshotWithVocab() const { return deltaTriples_.withReadLock([this](const DeltaTriples& deltaTriples) { return std::make_pair(*currentLocatedTriplesSnapshot_.rlock(), - deltaTriples.deepCloneLocalVocab()); + deltaTriples.copyLocalVocab()); }); } @@ -493,17 +492,14 @@ void DeltaTriplesManager::setFilenameForPersistentUpdatesAndReadFromDisk( } // _____________________________________________________________________________ -std::vector> -DeltaTriples::deepCloneLocalVocab() const { - std::vector> entries; +std::vector 
DeltaTriples::copyLocalVocab() const { + std::vector entries; entries.reserve(localVocab_.size()); ad_utility::HashMap localVocabMapping; - // TODO it is probably sufficient to just hand out the pointers + a - // lifetime extender. for (const LocalVocabEntry& entry : localVocab_.primaryWordSet()) { - entries.push_back(std::make_pair(entry, &entry)); + entries.push_back(&entry); } return entries; } diff --git a/src/index/DeltaTriples.h b/src/index/DeltaTriples.h index 5c163d32e9..bdca9f3c33 100644 --- a/src/index/DeltaTriples.h +++ b/src/index/DeltaTriples.h @@ -242,13 +242,13 @@ class DeltaTriples { // Update the block metadata. void updateAugmentedMetadata(); - // Create a deep clone of the local vocab such that it can be processed - // without holding the lock. - std::vector> deepCloneLocalVocab() - const; + // Create a copy of the local vocab such that it can be processed + // without holding the lock. You have to make sure separately that the + // pointers are still valid. + std::vector copyLocalVocab() const; private: - // The the proper state according to the template parameter. This will either + // The proper state according to the template parameter. This will either // return a reference to `triplesToHandlesInternal_` or // `triplesToHandlesNormal_`. template @@ -342,8 +342,9 @@ class DeltaTriplesManager { // be safely used to execute a query without interfering with future updates. SharedLocatedTriplesSnapshot getCurrentSnapshot() const; - std::pair>> + // In addition to a simple snapshot, also acquire a copy of the local vocab + // indices. 
+ std::pair> getCurrentSnapshotWithVocab() const; }; diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp index a14d038905..c38efde5b5 100644 --- a/src/index/IndexRebuilder.cpp +++ b/src/index/IndexRebuilder.cpp @@ -28,23 +28,23 @@ using CancellationHandle = ad_utility::SharedCancellationHandle; // _____________________________________________________________________________ std::pair>, ad_utility::HashMap> -materializeLocalVocab( - const std::vector>& entries, - const Index::Vocab& vocab, const std::string& newIndexName) { +materializeLocalVocab(const std::vector& entries, + const Index::Vocab& vocab, + const std::string& newIndexName) { size_t newWordCount = 0; std::vector> insertInfo; insertInfo.reserve(entries.size()); ad_utility::HashMap localVocabMapping; - for (const auto& [entry, originalIndex] : entries) { - const auto& [lower, upper] = entry.positionInVocab(); + for (auto* entry : entries) { + const auto& [lower, upper] = entry->positionInVocab(); AD_CORRECTNESS_CHECK(lower == upper); Id id = Id::fromBits(upper.get()); AD_CORRECTNESS_CHECK(id.getDatatype() == Datatype::VocabIndex); insertInfo.emplace_back(id.getVocabIndex(), - entry.asLiteralOrIri().toStringRepresentation(), - Id::makeFromLocalVocabIndex(originalIndex)); + entry->asLiteralOrIri().toStringRepresentation(), + Id::makeFromLocalVocabIndex(entry)); } ql::ranges::sort(insertInfo, [](const auto& tupleA, const auto& tupleB) { return std::tie(std::get(tupleA).get(), std::get(tupleA)) < @@ -171,11 +171,10 @@ size_t getNumColumns(const BlockMetadataRanges& blockMetadataRanges) { // _____________________________________________________________________________ namespace qlever { -void materializeToIndex( - const IndexImpl& index, const std::string& newIndexName, - const std::vector>& entries, - const SharedLocatedTriplesSnapshot& snapshot, - const CancellationHandle& cancellationHandle) { +void materializeToIndex(const IndexImpl& index, const std::string& newIndexName, + const 
std::vector& entries, + const SharedLocatedTriplesSnapshot& snapshot, + const CancellationHandle& cancellationHandle) { const auto& [insertInfo, localVocabMapping] = materializeLocalVocab(entries, index.getVocab(), newIndexName); diff --git a/src/index/IndexRebuilder.h b/src/index/IndexRebuilder.h index 42cc93e471..cbcf59fe1e 100644 --- a/src/index/IndexRebuilder.h +++ b/src/index/IndexRebuilder.h @@ -16,7 +16,7 @@ namespace qlever { // Build a new index based on this data. void materializeToIndex( const IndexImpl& index, const std::string& newIndexName, - const std::vector>& entries, + const std::vector& entries, const SharedLocatedTriplesSnapshot& snapshot, const ad_utility::SharedCancellationHandle& cancellationHandle); From 13824e23abfe6c0a228b7226f7e06ec61d593f22 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Wed, 17 Dec 2025 12:05:12 +0100 Subject: [PATCH 18/41] Make index name configurable --- src/engine/Server.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index 13abc197d3..084ca07e00 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -465,16 +465,18 @@ CPP_template_def(typename RequestT, typename ResponseT)( } else { absl::Cleanup cleanup{[this]() { rebuildInProgress_.store(false); }}; logCommand(cmd, "rebuilding index"); + auto fileName = + checkParameter("index-name", std::nullopt).value_or("new_index"); // There is no mechanism to actually cancel the handle. auto handle = std::make_shared>(); // We don't directly `co_await` because of lifetime issues (bugs) in the // Conan setup. 
auto coroutine = computeInNewThread( queryThreadPool_, - [this, &handle] { + [this, &handle, fileName = std::move(fileName)] { auto [currentSnapshot, localVocabCopy] = index_.deltaTriplesManager().getCurrentSnapshotWithVocab(); - qlever::materializeToIndex(index_.getImpl(), "tmp_index", + qlever::materializeToIndex(index_.getImpl(), fileName, localVocabCopy, currentSnapshot, handle); }, handle); From d0a1fcbc0fb12600a4a350d8ee20d830abe9b4c3 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 18 Dec 2025 12:15:52 +0100 Subject: [PATCH 19/41] Start implementing stats recompute --- src/global/Pattern.h | 2 + src/index/IndexImpl.cpp | 114 +++++++++++++++++++++++++++++++++-- src/index/IndexImpl.h | 8 ++- src/index/IndexRebuilder.cpp | 7 +-- 4 files changed, 120 insertions(+), 11 deletions(-) diff --git a/src/global/Pattern.h b/src/global/Pattern.h index 2062cdd3e1..891221329b 100644 --- a/src/global/Pattern.h +++ b/src/global/Pattern.h @@ -126,6 +126,8 @@ class CompactVectorOfStrings { return {ptr, size}; } + // Copy this class and apply the transformation `mappingFunction` to its + // elements. template CompactVectorOfStrings cloneAndRemap(Func mappingFunction) const { CompactVectorOfStrings clone; diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 0cce214ce4..4d335a8d32 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1839,10 +1839,8 @@ CPP_template_def(typename... 
NextSorter)(requires(sizeof...(NextSorter) <= 1)) configurationJson_["num-subjects"] = NumNormalAndInternal::fromNormalAndTotal(numSubjectsNormal, numSubjectsTotal); + writeConfiguration(); } - configurationJson_["num-subjects"] = NumNormalAndInternal::fromNormalAndTotal( - numSubjectsNormal, numSubjectsTotal); - writeConfiguration(); return result; } @@ -1921,11 +1919,17 @@ void IndexImpl::setPrefixesForEncodedValues( // _____________________________________________________________________________ void IndexImpl::loadConfigFromOldIndex(const std::string& newName, - const IndexImpl& other) { + const IndexImpl& other, + const nlohmann::json& newStats) { setOnDiskBase(newName); setKbName(other.getKbName()); blocksizePermutationPerColumn() = other.blocksizePermutationPerColumn(); - configurationJson_ = other.configurationJson_; + configurationJson_ = newStats; + numTriples_ = static_cast(newStats["num-triples"]); + numPredicates_ = + static_cast(newStats["num-predicates"]); + numSubjects_ = static_cast(newStats["num-subjects"]); + numObjects_ = static_cast(newStats["num-objects"]); } // _____________________________________________________________________________ @@ -1946,3 +1950,103 @@ void IndexImpl::writePatternsToFile() const { patternWriter << statistics; patternWriter << patterns_; } + +namespace { +void countDistinct(std::optional& lastId, size_t& counter, + const IdTable& table) { + if (!table.empty()) { + auto col = table.getColumn(0); + counter += + ql::ranges::distance(col | ::ranges::views::unique([](Id a, Id b) { + return a.getBits() == b.getBits(); + })); + if (lastId != col.at(0)) { + lastId = col.at(0); + } else { + // Avoid double counting in case the last id of the previous block is the + // same as the first id of this block. 
+ counter--; + } + } +} +} // namespace + +// _____________________________________________________________________________ +nlohmann::json IndexImpl::recomputeStatistics( + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { + size_t numTriples = 0; + size_t numSubjects = 0; + size_t numPredicates = 0; + size_t numObjects = 0; + uint64_t nextBlankNode = 0; + { + auto cancellationHandle = + std::make_shared(); + ScanSpecification scanSpec{std::nullopt, std::nullopt, std::nullopt}; + std::vector tasks; + + tasks.push_back(ad_utility::JThread{ + [this, &numTriples, &numPredicates, &nextBlankNode, &scanSpec, + &locatedTriplesSnapshot, &cancellationHandle]() { + auto tables = pso_->lazyScan( + pso_->getScanSpecAndBlocks(scanSpec, locatedTriplesSnapshot), + std::nullopt, CompressedRelationReader::ColumnIndicesRef{}, + cancellationHandle, locatedTriplesSnapshot); + std::optional lastPredicate = std::nullopt; + for (const auto& table : tables) { + numTriples += table.numRows(); + for (auto col : table.getColumns()) { + for (auto id : col) { + if (id.getDatatype() == Datatype::BlankNodeIndex) { + nextBlankNode = + std::max(nextBlankNode, id.getBlankNodeIndex().get() + 1); + } + } + } + countDistinct(lastPredicate, numPredicates, table); + } + }}); + + if (hasAllPermutations()) { + tasks.push_back( + ad_utility::JThread{[this, &numSubjects, &scanSpec, + &locatedTriplesSnapshot, &cancellationHandle]() { + auto tables = spo_->lazyScan( + spo_->getScanSpecAndBlocks(scanSpec, locatedTriplesSnapshot), + std::nullopt, CompressedRelationReader::ColumnIndicesRef{}, + cancellationHandle, locatedTriplesSnapshot); + std::optional lastSubject = std::nullopt; + for (const auto& table : tables) { + countDistinct(lastSubject, numSubjects, table); + } + }}); + + tasks.push_back( + ad_utility::JThread{[this, &numObjects, &scanSpec, + &locatedTriplesSnapshot, &cancellationHandle]() { + auto tables = osp_->lazyScan( + osp_->getScanSpecAndBlocks(scanSpec, locatedTriplesSnapshot), + 
std::nullopt, CompressedRelationReader::ColumnIndicesRef{}, + cancellationHandle, locatedTriplesSnapshot); + std::optional lastObject = std::nullopt; + for (const auto& table : tables) { + countDistinct(lastObject, numObjects, table); + } + }}); + } + } + // TODO find out what the internal counts do + auto configuration = configurationJson_; + configuration["num-triples"] = + NumNormalAndInternal{numTriples, numTriples_.internal}; + configuration["num-predicates"] = + NumNormalAndInternal{numPredicates, numPredicates_.internal}; + if (hasAllPermutations()) { + configuration["num-subjects"] = + NumNormalAndInternal{numSubjects, numSubjects_.internal}; + configuration["num-objects"] = + NumNormalAndInternal{numObjects, numObjects_.internal}; + } + configuration["num-blank-nodes-total"] = nextBlankNode; + return configuration; +} diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 0bf61b54aa..0abab87085 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -847,9 +847,15 @@ class IndexImpl { float b, float k); void loadConfigFromOldIndex(const std::string& newName, - const IndexImpl& other); + const IndexImpl& other, + const nlohmann::json& newStats); + // Write the stored in-memory patterns to a pattern file. void writePatternsToFile() const; + + // Recompute the statistics about the index based on the passed snapshot. 
+ nlohmann::json recomputeStatistics( + const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; }; #endif // QLEVER_SRC_INDEX_INDEXIMPL_H diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp index c38efde5b5..d89af133a8 100644 --- a/src/index/IndexRebuilder.cpp +++ b/src/index/IndexRebuilder.cpp @@ -177,10 +177,11 @@ void materializeToIndex(const IndexImpl& index, const std::string& newIndexName, const CancellationHandle& cancellationHandle) { const auto& [insertInfo, localVocabMapping] = materializeLocalVocab(entries, index.getVocab(), newIndexName); + auto newStats = index.recomputeStatistics(*snapshot); ScanSpecification scanSpec{std::nullopt, std::nullopt, std::nullopt}; IndexImpl newIndex{index.allocator(), false}; - newIndex.loadConfigFromOldIndex(newIndexName, index); + newIndex.loadConfigFromOldIndex(newIndexName, index, newStats); if (index.usePatterns()) { newIndex.getPatterns() = @@ -231,10 +232,6 @@ void materializeToIndex(const IndexImpl& index, const std::string& newIndexName, *snapshot, localVocabMapping, insertInfo, cancellationHandle, additionalColumns)); - // TODO explicitly set these two - // newIndex.configurationJson_["has-all-permutations"] = - // index_.hasAllPermutations(); - // newIndex.configurationJson_["num-blank-nodes-total"] = TBD; newIndex.addInternalStatisticsToConfiguration(numTriplesInternal, numPredicatesInternal); } From 7fd1c9341e82491006d0e50d539fa04c497abfac Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 18 Dec 2025 16:44:20 +0100 Subject: [PATCH 20/41] Correctly recompute internal statistics --- src/index/IndexImpl.cpp | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 4d335a8d32..94ddef26c7 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1975,8 +1975,10 @@ void countDistinct(std::optional& lastId, size_t& counter, nlohmann::json 
IndexImpl::recomputeStatistics( const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { size_t numTriples = 0; + size_t numTriplesInternal = 0; size_t numSubjects = 0; size_t numPredicates = 0; + size_t numPredicatesInternal = 0; size_t numObjects = 0; uint64_t nextBlankNode = 0; { @@ -2007,6 +2009,21 @@ nlohmann::json IndexImpl::recomputeStatistics( } }}); + tasks.push_back(ad_utility::JThread{ + [this, &numTriplesInternal, &numPredicatesInternal, &scanSpec, + &locatedTriplesSnapshot, &cancellationHandle]() { + auto tables = pso_->internalPermutation().lazyScan( + pso_->internalPermutation().getScanSpecAndBlocks( + scanSpec, locatedTriplesSnapshot), + std::nullopt, CompressedRelationReader::ColumnIndicesRef{}, + cancellationHandle, locatedTriplesSnapshot); + std::optional lastPredicate = std::nullopt; + for (const auto& table : tables) { + numTriplesInternal += table.numRows(); + countDistinct(lastPredicate, numPredicatesInternal, table); + } + }}); + if (hasAllPermutations()) { tasks.push_back( ad_utility::JThread{[this, &numSubjects, &scanSpec, @@ -2035,17 +2052,17 @@ nlohmann::json IndexImpl::recomputeStatistics( }}); } } - // TODO find out what the internal counts do auto configuration = configurationJson_; configuration["num-triples"] = - NumNormalAndInternal{numTriples, numTriples_.internal}; + NumNormalAndInternal{numTriples, numTriplesInternal}; configuration["num-predicates"] = - NumNormalAndInternal{numPredicates, numPredicates_.internal}; + NumNormalAndInternal{numPredicates, numPredicatesInternal}; if (hasAllPermutations()) { - configuration["num-subjects"] = - NumNormalAndInternal{numSubjects, numSubjects_.internal}; - configuration["num-objects"] = - NumNormalAndInternal{numObjects, numObjects_.internal}; + // These are unused. 
+ AD_CORRECTNESS_CHECK(numSubjects_.internal == 0); + AD_CORRECTNESS_CHECK(numObjects_.internal == 0); + configuration["num-subjects"] = NumNormalAndInternal{numSubjects, 0}; + configuration["num-objects"] = NumNormalAndInternal{numObjects, 0}; } configuration["num-blank-nodes-total"] = nextBlankNode; return configuration; From 739af376e4968205d8b988f5d07076aaf2856f85 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 18 Dec 2025 17:58:45 +0100 Subject: [PATCH 21/41] Write permutations individually --- src/index/IndexImpl.cpp | 106 +++++++++++++++++++---------------- src/index/IndexImpl.h | 25 ++++----- src/index/IndexRebuilder.cpp | 86 +++++++++++++--------------- 3 files changed, 108 insertions(+), 109 deletions(-) diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 94ddef26c7..62239b8fa0 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -322,23 +322,6 @@ std::pair IndexImpl::createInternalPSOandPOS( return {numTriplesInternal, numPredicatesInternal}; } -// _____________________________________________________________________________ -namespace { -struct SortedBlocksWrapper { - ad_utility::InputRangeTypeErased> sortedBlocks_; - template - ad_utility::InputRangeTypeErased> getSortedBlocks() { - static_assert(N == 0); - return std::move(sortedBlocks_); - } -}; -} // namespace -// _____________________________________________________________________________ -std::pair IndexImpl::createInternalPSOandPOSFromRange( - ad_utility::InputRangeTypeErased> sortedBlocks) { - return createInternalPSOandPOS(SortedBlocksWrapper{std::move(sortedBlocks)}); -} - // _____________________________________________________________________________ void IndexImpl::updateInputFileSpecificationsAndLog( std::vector& spec, @@ -838,6 +821,18 @@ auto IndexImpl::convertPartialToGlobalIds( return {std::move(resultPtr), std::move(internalTriplesPtr)}; } +// 
_____________________________________________________________________________ + +namespace { +// Lift a callback that works on single elements to a callback that works on +// blocks. +auto liftCallback(auto callback) { + return [callback](const auto& block) mutable { + ql::ranges::for_each(block, callback); + }; +} +} // namespace + // _____________________________________________________________________________ template std::tuple +IndexImpl::createPermutationImpl( + size_t numColumns, const std::string& fileName, + ad_utility::InputRangeTypeErased> sortedTriples) { + using MetaData = IndexMetaDataMmapDispatcher::WriteType; + MetaData metaData; + static_assert(MetaData::isMmapBased_); + metaData.setup(fileName + MMAP_FILE_SUFFIX, ad_utility::CreateTag{}); + + CompressedRelationWriter writer{numColumns, ad_utility::File(fileName, "w"), + blocksizePermutationPerColumn_}; + + auto callback = + liftCallback([&metaData](const auto& md) { metaData.add(md); }); + + // We can always supply the tables with the correct permutation. No need to + // re-order everything. + auto [numDistinctCol0, blockData] = + CompressedRelationWriter::createPermutation( + {writer, callback}, std::move(sortedTriples), {0, 1, 2, 3}, {}); + metaData.blockData() = std::move(blockData); + + return {numDistinctCol0, std::move(metaData)}; +} + // ________________________________________________________________________ template std::tuple>&& sortedTriples, - const Permutation& p1, const Permutation& p2) { - [[maybe_unused]] auto value = - createPermutationPair(numColumns, AD_FWD(sortedTriples), p1, p2); + ad_utility::InputRangeTypeErased> sortedTriples, + const Permutation& permutation, bool internal) { + AD_LOG_INFO << "Creating permutation " << permutation.readableName() << " ..." + << std::endl; + std::string fileName = + absl::StrCat(onDiskBase_, internal ? 
QLEVER_INTERNAL_INDEX_INFIX : "", + ".index", permutation.fileSuffix()); + auto metaData = + createPermutationImpl(numColumns, fileName, std::move(sortedTriples)); + + auto& [numDistinctCol0, meta] = metaData; + meta.calculateStatistics(numDistinctCol0); + AD_LOG_INFO << "Statistics for " << permutation.readableName() << ": " + << meta.statistics() << std::endl; + + meta.setName(getKbName()); + ad_utility::File f{fileName, "r+"}; + meta.appendToFile(&f); + return numDistinctCol0; } // ________________________________________________________________________ @@ -1781,12 +1810,6 @@ CPP_template_def(typename... NextSorter)(requires( } } -// _____________________________________________________________________________ -void IndexImpl::createPSOAndPOSImplPublic(size_t numColumns, - BlocksOfTriples sortedTriples) { - createPSOAndPOSImpl(numColumns, std::move(sortedTriples), false); -} - // _____________________________________________________________________________ CPP_template_def(typename... NextSorter)( requires(sizeof...(NextSorter) <= @@ -1844,12 +1867,6 @@ CPP_template_def(typename... NextSorter)(requires(sizeof...(NextSorter) <= 1)) return result; } -// _____________________________________________________________________________ -void IndexImpl::createSPOAndSOPPublic(size_t numColumns, - BlocksOfTriples sortedTriples) { - createSPOAndSOP(numColumns, std::move(sortedTriples)); -} - // _____________________________________________________________________________ CPP_template_def(typename... NextSorter)( requires(sizeof...(NextSorter) <= @@ -1869,12 +1886,6 @@ CPP_template_def(typename... 
NextSorter)( writeConfiguration(); } -// _____________________________________________________________________________ -void IndexImpl::createOSPAndOPSPublic(size_t numColumns, - BlocksOfTriples sortedTriples) { - createOSPAndOPS(numColumns, std::move(sortedTriples)); -} - // _____________________________________________________________________________ template auto IndexImpl::makeSorterImpl(std::string_view permutationName) const { @@ -1930,6 +1941,7 @@ void IndexImpl::loadConfigFromOldIndex(const std::string& newName, static_cast(newStats["num-predicates"]); numSubjects_ = static_cast(newStats["num-subjects"]); numObjects_ = static_cast(newStats["num-objects"]); + writeConfiguration(); } // _____________________________________________________________________________ diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 0abab87085..8e4a403c2a 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -579,11 +579,10 @@ class IndexImpl { Permutation::KeyOrder permutation, Callbacks&&... perTripleCallbacks); - public: - void createPermutationPairPublic( - size_t numColumns, - ad_utility::InputRangeTypeErased>&& sortedTriples, - const Permutation& p1, const Permutation& p2); + std::tuple + createPermutationImpl( + size_t numColumns, const std::string& fileName, + ad_utility::InputRangeTypeErased> sortedTriples); protected: // _______________________________________________________________________ @@ -622,6 +621,13 @@ class IndexImpl { const Permutation& p1, const Permutation& p2, Callbacks&&... perTripleCallbacks); + public: + size_t createPermutation( + size_t numColumns, + ad_utility::InputRangeTypeErased> sortedTriples, + const Permutation& permutation, bool internal = false); + + protected: void openTextFileHandle(); // Get the metadata for the block from the text index that contains the @@ -731,8 +737,6 @@ class IndexImpl { size_t numColumns, BlocksOfTriples sortedTriples, NextSorter&&... 
nextSorter); - void createSPOAndSOPPublic(size_t numColumns, BlocksOfTriples sortedTriples); - // Create the OSP and OPS permutations. Additionally, count the number of // distinct objects and write it to the metadata. CPP_template(typename... NextSorter)(requires( @@ -740,8 +744,6 @@ class IndexImpl { 1)) void createOSPAndOPS(size_t numColumns, BlocksOfTriples sortedTriples, NextSorter&&... nextSorter); - void createOSPAndOPSPublic(size_t numColumns, BlocksOfTriples sortedTriples); - // Create the PSO and POS permutations. Additionally, count the number of // distinct predicates and the number of actual triples and write them to the // metadata. The meta-data JSON file for the index statistics will only be @@ -754,8 +756,6 @@ class IndexImpl { bool doWriteConfiguration, NextSorter&&... nextSorter); - void createPSOAndPOSImplPublic(size_t numColumns, - BlocksOfTriples sortedTriples); // Call `createPSOAndPOSImpl` with the given arguments and with // `doWriteConfiguration` set to `true` (see above). CPP_template(typename... NextSorter)(requires( @@ -769,9 +769,6 @@ class IndexImpl { std::pair createInternalPSOandPOS( InternalTriplePsoSorter&& internalTriplesPsoSorter); - std::pair createInternalPSOandPOSFromRange( - ad_utility::InputRangeTypeErased> sortedBlocks); - // Set up one of the permutation sorters with the appropriate memory limit. // The `permutationName` is used to determine the filename and must be unique // for each call during one index build. 
diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp index d89af133a8..f124cfca9e 100644 --- a/src/index/IndexRebuilder.cpp +++ b/src/index/IndexRebuilder.cpp @@ -106,19 +106,11 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( auto fullScan = permutation.lazyScan(scanSpecAndBlocks, std::nullopt, additionalColumns, cancellationHandle, snapshot, LimitOffsetClause{}); - auto keyOrder = Permutation::toKeyOrder(permutation.permutation()); - std::vector columnIndices{keyOrder.keys().begin(), - keyOrder.keys().end()}; - while (columnIndices.size() < additionalColumns.size() + 3) { - columnIndices.emplace_back(columnIndices.size()); - } + return ad_utility::InputRangeTypeErased{ ad_utility::CachingTransformInputRange{ std::move(fullScan), - [columnIndices = std::move(columnIndices), &localVocabMapping, - &insertInfo](IdTable& idTable) { - AD_CORRECTNESS_CHECK(idTable.numColumns() == columnIndices.size()); - idTable.setColumnSubset(columnIndices); + [&localVocabMapping, &insertInfo](IdTable& idTable) { auto allCols = idTable.getColumns(); // Extra columns beyond the graph column only contain integers (or // undefined for triples added via UPDATE) and thus don't need to be @@ -192,48 +184,46 @@ void materializeToIndex(const IndexImpl& index, const std::string& newIndexName, } if (index.hasAllPermutations()) { - newIndex.createSPOAndSOPPublic( - 4, readIndexAndRemap(index.getPermutation(Permutation::Enum::SPO), - scanSpec, *snapshot, localVocabMapping, insertInfo, - cancellationHandle)); - // TODO Find out why we can't use createOSPAndOPSPublic here. 
- newIndex.createPermutationPairPublic( - 4, - readIndexAndRemap(index.getPermutation(Permutation::Enum::OPS), - scanSpec, *snapshot, localVocabMapping, insertInfo, - cancellationHandle), - newIndex.getPermutation(Permutation::Enum::OPS), - newIndex.getPermutation(Permutation::Enum::OSP)); + using enum Permutation::Enum; + for (auto permutation : {SPO, SOP, OPS, OSP}) { + const auto& actualPermutation = index.getPermutation(permutation); + newIndex.createPermutation( + 4, + readIndexAndRemap(actualPermutation, scanSpec, *snapshot, + localVocabMapping, insertInfo, cancellationHandle), + actualPermutation); + } } - auto [numTriplesInternal, numPredicatesInternal] = - newIndex.createInternalPSOandPOSFromRange(readIndexAndRemap( - index.getPermutation(Permutation::Enum::PSO).internalPermutation(), - scanSpec, *snapshot, localVocabMapping, insertInfo, - cancellationHandle)); - - const auto& psoPermutation = index.getPermutation(Permutation::Enum::PSO); - auto blockMetadataRanges = - psoPermutation.getAugmentedMetadataForPermutation(*snapshot); - size_t numColumns = getNumColumns(blockMetadataRanges); - std::vector additionalColumns; - additionalColumns.push_back(ADDITIONAL_COLUMN_GRAPH_ID); - for (ColumnIndex col : {ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN, - ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN}) { - if (additionalColumns.size() >= numColumns - 3) { - break; + for (auto permutation : Permutation::INTERNAL) { + const auto& actualPermutation = index.getPermutation(permutation); + const auto& internalPermutation = actualPermutation.internalPermutation(); + newIndex.createPermutation( + 4, + readIndexAndRemap(internalPermutation, scanSpec, *snapshot, + localVocabMapping, insertInfo, cancellationHandle), + internalPermutation, true); + + auto blockMetadataRanges = + actualPermutation.getAugmentedMetadataForPermutation(*snapshot); + size_t numColumns = getNumColumns(blockMetadataRanges); + std::vector additionalColumns; + 
additionalColumns.push_back(ADDITIONAL_COLUMN_GRAPH_ID); + for (ColumnIndex col : {ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN, + ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN}) { + if (additionalColumns.size() >= numColumns - 3) { + break; + } + additionalColumns.push_back(col); } - additionalColumns.push_back(col); + AD_CORRECTNESS_CHECK(additionalColumns.size() == numColumns - 3); + newIndex.createPermutation( + numColumns, + readIndexAndRemap(actualPermutation, scanSpec, blockMetadataRanges, + *snapshot, localVocabMapping, insertInfo, + cancellationHandle, additionalColumns), + actualPermutation); } - AD_CORRECTNESS_CHECK(additionalColumns.size() == numColumns - 3); - newIndex.createPSOAndPOSImplPublic( - numColumns, - readIndexAndRemap(psoPermutation, scanSpec, blockMetadataRanges, - *snapshot, localVocabMapping, insertInfo, - cancellationHandle, additionalColumns)); - - newIndex.addInternalStatisticsToConfiguration(numTriplesInternal, - numPredicatesInternal); } } // namespace qlever From a503da4877b234af14d6e1ae1c4ea00a1da254f7 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 18 Dec 2025 19:50:09 +0100 Subject: [PATCH 22/41] Parallelize processing --- src/index/IndexImpl.cpp | 2 ++ src/index/IndexRebuilder.cpp | 64 ++++++++++++++++++++++++------------ 2 files changed, 45 insertions(+), 21 deletions(-) diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 62239b8fa0..2c97413176 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1986,6 +1986,8 @@ void countDistinct(std::optional& lastId, size_t& counter, // _____________________________________________________________________________ nlohmann::json IndexImpl::recomputeStatistics( const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { + // TODO Wrap threads in try statements to avoid termination on + // exception and propagate it. 
size_t numTriples = 0; size_t numTriplesInternal = 0; size_t numSubjects = 0; diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp index f124cfca9e..7cce6d915c 100644 --- a/src/index/IndexRebuilder.cpp +++ b/src/index/IndexRebuilder.cpp @@ -111,6 +111,7 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( ad_utility::CachingTransformInputRange{ std::move(fullScan), [&localVocabMapping, &insertInfo](IdTable& idTable) { + // TODO process columns in parallel. auto allCols = idTable.getColumns(); // Extra columns beyond the graph column only contain integers (or // undefined for triples added via UPDATE) and thus don't need to be @@ -174,35 +175,49 @@ void materializeToIndex(const IndexImpl& index, const std::string& newIndexName, ScanSpecification scanSpec{std::nullopt, std::nullopt, std::nullopt}; IndexImpl newIndex{index.allocator(), false}; newIndex.loadConfigFromOldIndex(newIndexName, index, newStats); + // TODO Make sure any exceptions are properly handled and propagated. 
+ std::vector tasks; if (index.usePatterns()) { - newIndex.getPatterns() = - index.getPatterns().cloneAndRemap([&insertInfo](const Id& oldId) { - return remapVocabId(oldId, insertInfo); - }); - newIndex.writePatternsToFile(); + tasks.push_back(ad_utility::JThread{[&newIndex, &index, &insertInfo]() { + newIndex.getPatterns() = + index.getPatterns().cloneAndRemap([&insertInfo](const Id& oldId) { + return remapVocabId(oldId, insertInfo); + }); + newIndex.writePatternsToFile(); + }}); } if (index.hasAllPermutations()) { using enum Permutation::Enum; for (auto permutation : {SPO, SOP, OPS, OSP}) { const auto& actualPermutation = index.getPermutation(permutation); - newIndex.createPermutation( - 4, - readIndexAndRemap(actualPermutation, scanSpec, *snapshot, - localVocabMapping, insertInfo, cancellationHandle), - actualPermutation); + tasks.push_back(ad_utility::JThread{ + [&newIndex, &actualPermutation, &scanSpec, &snapshot, + &localVocabMapping, &insertInfo, &cancellationHandle]() { + newIndex.createPermutation( + 4, + readIndexAndRemap(actualPermutation, scanSpec, *snapshot, + localVocabMapping, insertInfo, + cancellationHandle), + actualPermutation); + }}); } } for (auto permutation : Permutation::INTERNAL) { const auto& actualPermutation = index.getPermutation(permutation); const auto& internalPermutation = actualPermutation.internalPermutation(); - newIndex.createPermutation( - 4, - readIndexAndRemap(internalPermutation, scanSpec, *snapshot, - localVocabMapping, insertInfo, cancellationHandle), - internalPermutation, true); + tasks.push_back(ad_utility::JThread{ + [&newIndex, &internalPermutation, &scanSpec, &snapshot, + &localVocabMapping, &insertInfo, &cancellationHandle]() { + newIndex.createPermutation( + 4, + readIndexAndRemap(internalPermutation, scanSpec, *snapshot, + localVocabMapping, insertInfo, + cancellationHandle), + internalPermutation, true); + }}); auto blockMetadataRanges = actualPermutation.getAugmentedMetadataForPermutation(*snapshot); @@ -217,12 
+232,19 @@ void materializeToIndex(const IndexImpl& index, const std::string& newIndexName, additionalColumns.push_back(col); } AD_CORRECTNESS_CHECK(additionalColumns.size() == numColumns - 3); - newIndex.createPermutation( - numColumns, - readIndexAndRemap(actualPermutation, scanSpec, blockMetadataRanges, - *snapshot, localVocabMapping, insertInfo, - cancellationHandle, additionalColumns), - actualPermutation); + tasks.push_back(ad_utility::JThread{ + [&newIndex, &actualPermutation, &scanSpec, &snapshot, + &localVocabMapping, &insertInfo, &cancellationHandle, numColumns, + blockMetadataRanges = std::move(blockMetadataRanges), + additionalColumns = std::move(additionalColumns)]() { + newIndex.createPermutation( + numColumns, + readIndexAndRemap(actualPermutation, scanSpec, + blockMetadataRanges, *snapshot, + localVocabMapping, insertInfo, + cancellationHandle, additionalColumns), + actualPermutation); + }}); } } From bcc533a2d121bfdf6adac7a96e4173a19cd0370b Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Thu, 18 Dec 2025 20:08:19 +0100 Subject: [PATCH 23/41] Fix compilation --- src/index/IndexImpl.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 2c97413176..00cb3bf8bc 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1972,8 +1972,8 @@ void countDistinct(std::optional& lastId, size_t& counter, ql::ranges::distance(col | ::ranges::views::unique([](Id a, Id b) { return a.getBits() == b.getBits(); })); - if (lastId != col.at(0)) { - lastId = col.at(0); + if (lastId != col[0]) { + lastId = col[0]; } else { // Avoid double counting in case the last id of the previous block is the // same as the first id of this block. 
From c9cc426a5eab07594a09555f8fad0674b4667054 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Fri, 26 Dec 2025 04:06:34 +0100 Subject: [PATCH 24/41] Separate log file `.rebuild-index-log.txt` Not as fine-grained as the log file for a full index build, but still useful, and, importantly, separate from the server log file. --- src/engine/Server.cpp | 4 +++- src/index/IndexRebuilder.cpp | 35 ++++++++++++++++++++++++++++++++++- src/index/IndexRebuilder.h | 3 ++- 3 files changed, 39 insertions(+), 3 deletions(-) diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index 9bba17897f..91c6a9d208 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -480,10 +480,12 @@ CPP_template_def(typename RequestT, typename ResponseT)( auto coroutine = computeInNewThread( queryThreadPool_, [this, &handle, fileName = std::move(fileName)] { + auto logFileName = fileName + ".rebuild-index-log.txt"; auto [currentSnapshot, localVocabCopy] = index_.deltaTriplesManager().getCurrentSnapshotWithVocab(); qlever::materializeToIndex(index_.getImpl(), fileName, - localVocabCopy, currentSnapshot, handle); + localVocabCopy, currentSnapshot, handle, + logFileName); }, handle); co_await std::move(coroutine); diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp index 7cce6d915c..0cf84d696b 100644 --- a/src/index/IndexRebuilder.cpp +++ b/src/index/IndexRebuilder.cpp @@ -6,6 +6,8 @@ #include #include +#include +#include #include #include #include @@ -21,6 +23,7 @@ #include "util/Exception.h" #include "util/HashMap.h" #include "util/InputRangeUtils.h" +#include "util/Log.h" namespace { using CancellationHandle = ad_utility::SharedCancellationHandle; @@ -167,14 +170,37 @@ namespace qlever { void materializeToIndex(const IndexImpl& index, const std::string& newIndexName, const std::vector& entries, const SharedLocatedTriplesSnapshot& snapshot, - const CancellationHandle& cancellationHandle) { + const CancellationHandle& cancellationHandle, + const std::string& 
logFileName) { + AD_CONTRACT_CHECK(!logFileName.empty(), "Log file name must not be empty"); + + // Set up logging to file + auto logFile = std::make_unique(logFileName); + AD_CORRECTNESS_CHECK(logFile->is_open(), + "Failed to open log file: " + logFileName); + + // Macro for rebuild-specific logging with the same syntax as AD_LOG_INFO +#define REBUILD_LOG_INFO \ + *logFile << ad_utility::Log::getTimeStamp() << " - INFO: " + + REBUILD_LOG_INFO << "Rebuilding index from current data (including updates)" + << std::endl; + + REBUILD_LOG_INFO << "Writing new vocabulary ..." << std::endl; + const auto& [insertInfo, localVocabMapping] = materializeLocalVocab(entries, index.getVocab(), newIndexName); + + REBUILD_LOG_INFO << "Recomputing statistics ..." << std::endl; + auto newStats = index.recomputeStatistics(*snapshot); ScanSpecification scanSpec{std::nullopt, std::nullopt, std::nullopt}; IndexImpl newIndex{index.allocator(), false}; newIndex.loadConfigFromOldIndex(newIndexName, index, newStats); + + REBUILD_LOG_INFO << "Writing new permutations ..." << std::endl; + // TODO Make sure any exceptions are properly handled and propagated. 
std::vector tasks; @@ -246,6 +272,13 @@ void materializeToIndex(const IndexImpl& index, const std::string& newIndexName, actualPermutation); }}); } + + // Explicitly wait for all threads to complete before logging completion + tasks.clear(); + + REBUILD_LOG_INFO << "Index rebuild completed" << std::endl; + +#undef REBUILD_LOG_INFO } } // namespace qlever diff --git a/src/index/IndexRebuilder.h b/src/index/IndexRebuilder.h index cbcf59fe1e..020df29d11 100644 --- a/src/index/IndexRebuilder.h +++ b/src/index/IndexRebuilder.h @@ -18,7 +18,8 @@ void materializeToIndex( const IndexImpl& index, const std::string& newIndexName, const std::vector& entries, const SharedLocatedTriplesSnapshot& snapshot, - const ad_utility::SharedCancellationHandle& cancellationHandle); + const ad_utility::SharedCancellationHandle& cancellationHandle, + const std::string& logFileName); } // namespace qlever From b4e0d45cdabf6b88afae26264edc0ef18bf7f9f4 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Wed, 7 Jan 2026 17:15:07 +0100 Subject: [PATCH 25/41] Use 6 columns for dummy block --- src/index/LocatedTriples.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp index 51638175de..64c7488940 100644 --- a/src/index/LocatedTriples.cpp +++ b/src/index/LocatedTriples.cpp @@ -346,10 +346,10 @@ void LocatedTriplesPerBlock::updateAugmentedMetadata() { using O = CompressedBlockMetadata::OffsetAndCompressedSize; O emptyBlock{0, 0}; - // TODO We need the appropriate number of columns here, or we need - // to make the reading code work regardless of the number of columns. + // Note: This code assumes that a single permutation will never contain more + // than 6 columns. Downstream code removes these unecessary columns again. 
CompressedBlockMetadataNoBlockIndex lastBlockN{ - std::vector(4, emptyBlock), + std::vector(6, emptyBlock), 0, firstTriple, lastTriple, From f9a6a62bedcdf3032c8071fc3b80e173b2094d18 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Wed, 7 Jan 2026 17:17:08 +0100 Subject: [PATCH 26/41] Avoid use of unique pointer --- src/index/IndexRebuilder.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp index 0cf84d696b..df9cae1f75 100644 --- a/src/index/IndexRebuilder.cpp +++ b/src/index/IndexRebuilder.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -175,13 +174,13 @@ void materializeToIndex(const IndexImpl& index, const std::string& newIndexName, AD_CONTRACT_CHECK(!logFileName.empty(), "Log file name must not be empty"); // Set up logging to file - auto logFile = std::make_unique(logFileName); - AD_CORRECTNESS_CHECK(logFile->is_open(), + std::ofstream logFile{logFileName}; + AD_CORRECTNESS_CHECK(logFile.is_open(), "Failed to open log file: " + logFileName); // Macro for rebuild-specific logging with the same syntax as AD_LOG_INFO #define REBUILD_LOG_INFO \ - *logFile << ad_utility::Log::getTimeStamp() << " - INFO: " + logFile << ad_utility::Log::getTimeStamp() << " - INFO: " REBUILD_LOG_INFO << "Rebuilding index from current data (including updates)" << std::endl; From c01becf3996c6fd28f7f4df27020522d3ec42c9e Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Wed, 7 Jan 2026 18:30:23 +0100 Subject: [PATCH 27/41] Fix typo --- src/index/LocatedTriples.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp index 64c7488940..10c241c7b8 100644 --- a/src/index/LocatedTriples.cpp +++ b/src/index/LocatedTriples.cpp @@ -347,7 +347,7 @@ void LocatedTriplesPerBlock::updateAugmentedMetadata() { O emptyBlock{0, 0}; 
// Note: This code assumes that a single permutation will never contain more - // than 6 columns. Downstream code removes these unecessary columns again. + // than 6 columns. Downstream code removes these unnecessary columns again. CompressedBlockMetadataNoBlockIndex lastBlockN{ std::vector(6, emptyBlock), 0, From 8a097a65eee41483acccb33d3d582736a5c35724 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Wed, 7 Jan 2026 21:44:35 +0100 Subject: [PATCH 28/41] Change test to new size --- test/LocatedTriplesTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/LocatedTriplesTest.cpp b/test/LocatedTriplesTest.cpp index 0dde6ec7fe..72311f439e 100644 --- a/test/LocatedTriplesTest.cpp +++ b/test/LocatedTriplesTest.cpp @@ -895,7 +895,7 @@ TEST_F(LocatedTriplesTest, augmentedMetadataGraphInfo) { // The automatically added metadata for the last block also has the correct // block index and number of columns, so we have to properly initialize it. 
expectedAugmentedMetadata.back().blockIndex_ = 2; - expectedAugmentedMetadata.back().offsetsAndCompressedSize_.resize(4, + expectedAugmentedMetadata.back().offsetsAndCompressedSize_.resize(6, {0, 0}); // All the blocks have updates, so their value of `containsDuplicates..` is From 4015e1b52f738aca4a92c626506866cc54d5edf7 Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Thu, 8 Jan 2026 02:11:53 +0100 Subject: [PATCH 29/41] Resolve conflicts from merging `origin/master` The merge of #2500 changed the way "snapshots" are obtained + they are no longer called "locatedTriplesSnapshot" but "locatedTriplesSharedState" --- src/engine/Server.cpp | 3 ++- src/index/DeltaTriples.cpp | 6 +++--- src/index/DeltaTriples.h | 8 ++++---- src/index/IndexImpl.cpp | 26 ++++++++++++------------ src/index/IndexImpl.h | 5 +++-- src/index/IndexRebuilder.cpp | 39 +++++++++++++++++++----------------- src/index/IndexRebuilder.h | 2 +- 7 files changed, 47 insertions(+), 42 deletions(-) diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index 913131c727..9a9f9cf5a7 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -482,7 +482,8 @@ CPP_template_def(typename RequestT, typename ResponseT)( [this, &handle, fileName = std::move(fileName)] { auto logFileName = fileName + ".rebuild-index-log.txt"; auto [currentSnapshot, localVocabCopy] = - index_.deltaTriplesManager().getCurrentSnapshotWithVocab(); + index_.deltaTriplesManager() + .getCurrentLocatedTriplesSharedStateWithVocab(); qlever::materializeToIndex(index_.getImpl(), fileName, localVocabCopy, currentSnapshot, handle, logFileName); diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index 302b8f3e72..e74354a318 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -508,10 +508,10 @@ DeltaTriplesManager::getCurrentLocatedTriplesSharedState() const { } // _____________________________________________________________________________ -std::pair> 
-DeltaTriplesManager::getCurrentSnapshotWithVocab() const { +std::pair> +DeltaTriplesManager::getCurrentLocatedTriplesSharedStateWithVocab() const { return deltaTriples_.withReadLock([this](const DeltaTriples& deltaTriples) { - return std::make_pair(*currentLocatedTriplesSnapshot_.rlock(), + return std::make_pair(*currentLocatedTriplesSharedState_.rlock(), deltaTriples.copyLocalVocab()); }); } diff --git a/src/index/DeltaTriples.h b/src/index/DeltaTriples.h index 9074611481..7ce1d2d69c 100644 --- a/src/index/DeltaTriples.h +++ b/src/index/DeltaTriples.h @@ -371,10 +371,10 @@ class DeltaTriplesManager { // updates. LocatedTriplesSharedState getCurrentLocatedTriplesSharedState() const; - // In addition to a simple snapshot, also acquire a copy of the local vocab - // indices. - std::pair> - getCurrentSnapshotWithVocab() const; + // In addition to the located triples shared state, also acquire a copy of the + // local vocab indices. + std::pair> + getCurrentLocatedTriplesSharedStateWithVocab() const; }; #endif // QLEVER_SRC_INDEX_DELTATRIPLES_H diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 48fc0d6278..ffa996fe71 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -1944,7 +1944,7 @@ void countDistinct(std::optional& lastId, size_t& counter, // _____________________________________________________________________________ nlohmann::json IndexImpl::recomputeStatistics( - const LocatedTriplesSnapshot& locatedTriplesSnapshot) const { + const LocatedTriplesSharedState& locatedTriplesSharedState) const { // TODO Wrap threads in try statements to avoid termination on // exception and propagate it. 
size_t numTriples = 0; @@ -1962,11 +1962,11 @@ nlohmann::json IndexImpl::recomputeStatistics( tasks.push_back(ad_utility::JThread{ [this, &numTriples, &numPredicates, &nextBlankNode, &scanSpec, - &locatedTriplesSnapshot, &cancellationHandle]() { + &locatedTriplesSharedState, &cancellationHandle]() { auto tables = pso_->lazyScan( - pso_->getScanSpecAndBlocks(scanSpec, locatedTriplesSnapshot), + pso_->getScanSpecAndBlocks(scanSpec, *locatedTriplesSharedState), std::nullopt, CompressedRelationReader::ColumnIndicesRef{}, - cancellationHandle, locatedTriplesSnapshot); + cancellationHandle, *locatedTriplesSharedState); std::optional lastPredicate = std::nullopt; for (const auto& table : tables) { numTriples += table.numRows(); @@ -1984,12 +1984,12 @@ nlohmann::json IndexImpl::recomputeStatistics( tasks.push_back(ad_utility::JThread{ [this, &numTriplesInternal, &numPredicatesInternal, &scanSpec, - &locatedTriplesSnapshot, &cancellationHandle]() { + &locatedTriplesSharedState, &cancellationHandle]() { auto tables = pso_->internalPermutation().lazyScan( pso_->internalPermutation().getScanSpecAndBlocks( - scanSpec, locatedTriplesSnapshot), + scanSpec, *locatedTriplesSharedState), std::nullopt, CompressedRelationReader::ColumnIndicesRef{}, - cancellationHandle, locatedTriplesSnapshot); + cancellationHandle, *locatedTriplesSharedState); std::optional lastPredicate = std::nullopt; for (const auto& table : tables) { numTriplesInternal += table.numRows(); @@ -2000,11 +2000,11 @@ nlohmann::json IndexImpl::recomputeStatistics( if (hasAllPermutations()) { tasks.push_back( ad_utility::JThread{[this, &numSubjects, &scanSpec, - &locatedTriplesSnapshot, &cancellationHandle]() { + &locatedTriplesSharedState, &cancellationHandle]() { auto tables = spo_->lazyScan( - spo_->getScanSpecAndBlocks(scanSpec, locatedTriplesSnapshot), + spo_->getScanSpecAndBlocks(scanSpec, *locatedTriplesSharedState), std::nullopt, CompressedRelationReader::ColumnIndicesRef{}, - cancellationHandle, 
locatedTriplesSnapshot); + cancellationHandle, *locatedTriplesSharedState); std::optional lastSubject = std::nullopt; for (const auto& table : tables) { countDistinct(lastSubject, numSubjects, table); @@ -2013,11 +2013,11 @@ nlohmann::json IndexImpl::recomputeStatistics( tasks.push_back( ad_utility::JThread{[this, &numObjects, &scanSpec, - &locatedTriplesSnapshot, &cancellationHandle]() { + &locatedTriplesSharedState, &cancellationHandle]() { auto tables = osp_->lazyScan( - osp_->getScanSpecAndBlocks(scanSpec, locatedTriplesSnapshot), + osp_->getScanSpecAndBlocks(scanSpec, *locatedTriplesSharedState), std::nullopt, CompressedRelationReader::ColumnIndicesRef{}, - cancellationHandle, locatedTriplesSnapshot); + cancellationHandle, *locatedTriplesSharedState); std::optional lastObject = std::nullopt; for (const auto& table : tables) { countDistinct(lastObject, numObjects, table); diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index bf5a0b03fc..3523080c96 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -835,9 +835,10 @@ class IndexImpl { // Write the stored in-memory patterns to a pattern file. void writePatternsToFile() const; - // Recompute the statistics about the index based on the passed snapshot. + // Recompute the statistics about the index based on the passed located + // triples shared state. 
nlohmann::json recomputeStatistics( - const LocatedTriplesSnapshot& locatedTriplesSnapshot) const; + const LocatedTriplesSharedState& locatedTriplesSharedState) const; }; #endif // QLEVER_SRC_INDEX_INDEXIMPL_H diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp index df9cae1f75..00c850ccbc 100644 --- a/src/index/IndexRebuilder.cpp +++ b/src/index/IndexRebuilder.cpp @@ -98,7 +98,7 @@ Id remapVocabId(Id original, ad_utility::InputRangeTypeErased> readIndexAndRemap( const Permutation& permutation, ScanSpecification scanSpec, const BlockMetadataRanges& blockMetadataRanges, - const LocatedTriplesSnapshot& snapshot, + const LocatedTriplesSharedState& locatedTriplesSharedState, const ad_utility::HashMap& localVocabMapping, const std::vector>& insertInfo, const ad_utility::SharedCancellationHandle& cancellationHandle, @@ -107,7 +107,8 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( blockMetadataRanges}; auto fullScan = permutation.lazyScan(scanSpecAndBlocks, std::nullopt, additionalColumns, - cancellationHandle, snapshot, LimitOffsetClause{}); + cancellationHandle, *locatedTriplesSharedState, + LimitOffsetClause{}); return ad_utility::InputRangeTypeErased{ ad_utility::CachingTransformInputRange{ @@ -142,14 +143,15 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( ad_utility::InputRangeTypeErased> readIndexAndRemap( const Permutation& permutation, ScanSpecification scanSpec, - const LocatedTriplesSnapshot& snapshot, + const LocatedTriplesSharedState& locatedTriplesSharedState, const ad_utility::HashMap& localVocabMapping, const std::vector>& insertInfo, const ad_utility::SharedCancellationHandle& cancellationHandle) { return readIndexAndRemap( permutation, std::move(scanSpec), - permutation.getAugmentedMetadataForPermutation(snapshot), snapshot, - localVocabMapping, insertInfo, cancellationHandle, + permutation.getAugmentedMetadataForPermutation(*locatedTriplesSharedState), + locatedTriplesSharedState, localVocabMapping, insertInfo, + 
cancellationHandle, std::array{static_cast(ADDITIONAL_COLUMN_GRAPH_ID)}); } @@ -168,7 +170,7 @@ size_t getNumColumns(const BlockMetadataRanges& blockMetadataRanges) { namespace qlever { void materializeToIndex(const IndexImpl& index, const std::string& newIndexName, const std::vector& entries, - const SharedLocatedTriplesSnapshot& snapshot, + const LocatedTriplesSharedState& locatedTriplesSharedState, const CancellationHandle& cancellationHandle, const std::string& logFileName) { AD_CONTRACT_CHECK(!logFileName.empty(), "Log file name must not be empty"); @@ -192,7 +194,7 @@ void materializeToIndex(const IndexImpl& index, const std::string& newIndexName, REBUILD_LOG_INFO << "Recomputing statistics ..." << std::endl; - auto newStats = index.recomputeStatistics(*snapshot); + auto newStats = index.recomputeStatistics(locatedTriplesSharedState); ScanSpecification scanSpec{std::nullopt, std::nullopt, std::nullopt}; IndexImpl newIndex{index.allocator(), false}; @@ -218,13 +220,13 @@ void materializeToIndex(const IndexImpl& index, const std::string& newIndexName, for (auto permutation : {SPO, SOP, OPS, OSP}) { const auto& actualPermutation = index.getPermutation(permutation); tasks.push_back(ad_utility::JThread{ - [&newIndex, &actualPermutation, &scanSpec, &snapshot, + [&newIndex, &actualPermutation, &scanSpec, &locatedTriplesSharedState, &localVocabMapping, &insertInfo, &cancellationHandle]() { newIndex.createPermutation( 4, - readIndexAndRemap(actualPermutation, scanSpec, *snapshot, - localVocabMapping, insertInfo, - cancellationHandle), + readIndexAndRemap(actualPermutation, scanSpec, + locatedTriplesSharedState, localVocabMapping, + insertInfo, cancellationHandle), actualPermutation); }}); } @@ -234,18 +236,19 @@ void materializeToIndex(const IndexImpl& index, const std::string& newIndexName, const auto& actualPermutation = index.getPermutation(permutation); const auto& internalPermutation = actualPermutation.internalPermutation(); tasks.push_back(ad_utility::JThread{ 
- [&newIndex, &internalPermutation, &scanSpec, &snapshot, + [&newIndex, &internalPermutation, &scanSpec, &locatedTriplesSharedState, &localVocabMapping, &insertInfo, &cancellationHandle]() { newIndex.createPermutation( 4, - readIndexAndRemap(internalPermutation, scanSpec, *snapshot, - localVocabMapping, insertInfo, - cancellationHandle), + readIndexAndRemap(internalPermutation, scanSpec, + locatedTriplesSharedState, localVocabMapping, + insertInfo, cancellationHandle), internalPermutation, true); }}); auto blockMetadataRanges = - actualPermutation.getAugmentedMetadataForPermutation(*snapshot); + actualPermutation.getAugmentedMetadataForPermutation( + *locatedTriplesSharedState); size_t numColumns = getNumColumns(blockMetadataRanges); std::vector additionalColumns; additionalColumns.push_back(ADDITIONAL_COLUMN_GRAPH_ID); @@ -258,14 +261,14 @@ void materializeToIndex(const IndexImpl& index, const std::string& newIndexName, } AD_CORRECTNESS_CHECK(additionalColumns.size() == numColumns - 3); tasks.push_back(ad_utility::JThread{ - [&newIndex, &actualPermutation, &scanSpec, &snapshot, + [&newIndex, &actualPermutation, &scanSpec, &locatedTriplesSharedState, &localVocabMapping, &insertInfo, &cancellationHandle, numColumns, blockMetadataRanges = std::move(blockMetadataRanges), additionalColumns = std::move(additionalColumns)]() { newIndex.createPermutation( numColumns, readIndexAndRemap(actualPermutation, scanSpec, - blockMetadataRanges, *snapshot, + blockMetadataRanges, locatedTriplesSharedState, localVocabMapping, insertInfo, cancellationHandle, additionalColumns), actualPermutation); diff --git a/src/index/IndexRebuilder.h b/src/index/IndexRebuilder.h index 020df29d11..4f56366d25 100644 --- a/src/index/IndexRebuilder.h +++ b/src/index/IndexRebuilder.h @@ -17,7 +17,7 @@ namespace qlever { void materializeToIndex( const IndexImpl& index, const std::string& newIndexName, const std::vector& entries, - const SharedLocatedTriplesSnapshot& snapshot, + const 
LocatedTriplesSharedState& locatedTriplesSharedState, const ad_utility::SharedCancellationHandle& cancellationHandle, const std::string& logFileName); From 2ee4404a2cb99d2945c9a95b6763763e7840cbaf Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Mon, 12 Jan 2026 10:51:46 +0100 Subject: [PATCH 30/41] Add documentation --- src/index/DeltaTriples.cpp | 2 -- src/index/IndexImpl.h | 20 ++++++++++++++++++++ src/index/IndexRebuilder.cpp | 20 ++++++++++++++++---- 3 files changed, 36 insertions(+), 6 deletions(-) diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index e74354a318..7c208a364c 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -617,8 +617,6 @@ std::vector DeltaTriples::copyLocalVocab() const { std::vector entries; entries.reserve(localVocab_.size()); - ad_utility::HashMap localVocabMapping; - for (const LocalVocabEntry& entry : localVocab_.primaryWordSet()) { entries.push_back(&entry); } diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 4c09f70f12..3970189e56 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -563,6 +563,13 @@ class IndexImpl { Permutation::KeyOrder permutation, Callbacks&&... perTripleCallbacks); + // Write a single permutation to disk. `numColumns` specifies the number of + // columns in the relation (usually 4, sometimes 6 with patterns). + // `fileName` is the base name of the files to write to (without suffixes). + // `sortedTriples` is an input range that provides the triples in the correct + // order. + // Return the number of triples written and the metadata for the written + // permutation. std::tuple createPermutationImpl( size_t numColumns, const std::string& fileName, @@ -606,6 +613,16 @@ class IndexImpl { Callbacks&&... perTripleCallbacks); public: + // Write a single permutation to disk. `numColumns` specifies the number of + // columns in the relation (usually 4, sometimes 6 with patterns). 
+ // `sortedTriples` is an input range that provides the triples in the correct + // order. + // `permutation` specifies which permutation to write. + // `internal` specifies whether this is an internal permutation and adjusts + // the filename of the generated file on disk accordingly. + // Return the number of distinct values on the first column of the written + // permutation. (Predicates for PSO/POS, Subjects for SPO/SOP, Objects for + // OSP/OPS). size_t createPermutation( size_t numColumns, ad_utility::InputRangeTypeErased> sortedTriples, @@ -828,6 +845,9 @@ class IndexImpl { void storeTextScoringParamsInConfiguration(TextScoringMetric scoringMetric, float b, float k); + // Overwrite the config of this instance of `IndexImpl` with the config of + // `other`, adjusting the name to `newName` and the statistics to + // `newStats`. void loadConfigFromOldIndex(const std::string& newName, const IndexImpl& other, const nlohmann::json& newStats); diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp index 8ab6efdb17..4d6e8d5620 100644 --- a/src/index/IndexRebuilder.cpp +++ b/src/index/IndexRebuilder.cpp @@ -27,7 +27,12 @@ namespace { using CancellationHandle = ad_utility::SharedCancellationHandle; -// _____________________________________________________________________________ +// Write a new vocabulary that contains all words from `vocab` plus all +// entries in `entries`. Returns a pair consisting of a vector of tuples +// containing information about the inserted entries (the `VocabIndex` of their +// position in the old `vocab`, the string representation of the newly added +// value, and the original `Id`) and a mapping from old local vocab `Id`s to +// new vocab `Id`s. 
std::pair>, ad_utility::HashMap> materializeLocalVocab(const std::vector& entries, @@ -77,10 +82,10 @@ materializeLocalVocab(const std::vector& entries, localVocabMapping.emplace( id, Id::makeFromVocabIndex(VocabIndex::make(newIndex))); } - return std::pair{std::move(insertInfo), std::move(localVocabMapping)}; + return std::make_pair(std::move(insertInfo), std::move(localVocabMapping)); } -// _____________________________________________________________________________ +// Map old vocab `Id`s to new vocab `Id`s according to the given `insertInfo`. Id remapVocabId(Id original, const std::vector>& insertInfo) { @@ -94,7 +99,10 @@ Id remapVocabId(Id original, VocabIndex::make(original.getVocabIndex().get() + offset)); } -// _____________________________________________________________________________ +// Create a copy of the given `permutation` scanned according to `scanSpec`, +// where all local vocab `Id`s are remapped according to `localVocabMapping` +// and all vocab `Id`s are remapped according to `insertInfo` to create a new +// index where all of these values are all vocab `Id`s in the new vocabulary. ad_utility::InputRangeTypeErased> readIndexAndRemap( const Permutation& permutation, ScanSpecification scanSpec, const BlockMetadataRanges& blockMetadataRanges, @@ -140,6 +148,8 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( }}}; } +// Overload that automatically retrieves the block metadata ranges for the given +// `permutation` and passes the graph ID as an additional column to be read. ad_utility::InputRangeTypeErased> readIndexAndRemap( const Permutation& permutation, ScanSpecification scanSpec, const LocatedTriplesSharedState& locatedTriplesSharedState, @@ -155,6 +165,8 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( std::array{static_cast(ADDITIONAL_COLUMN_GRAPH_ID)}); } +// Get the number of columns in the given `blockMetadataRanges`. If this cannot +// be determined, return 4 as a safe default. 
size_t getNumColumns(const BlockMetadataRanges& blockMetadataRanges) { if (!blockMetadataRanges.empty()) { const auto& first = blockMetadataRanges.at(0); From 1d85ecd8cb931008b18b19b5d52f875bd827a94d Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Mon, 12 Jan 2026 10:58:34 +0100 Subject: [PATCH 31/41] Make lookup vector more compact --- src/index/IndexRebuilder.cpp | 55 ++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp index 4d6e8d5620..59facaeb39 100644 --- a/src/index/IndexRebuilder.cpp +++ b/src/index/IndexRebuilder.cpp @@ -28,13 +28,10 @@ namespace { using CancellationHandle = ad_utility::SharedCancellationHandle; // Write a new vocabulary that contains all words from `vocab` plus all -// entries in `entries`. Returns a pair consisting of a vector of tuples -// containing information about the inserted entries (the `VocabIndex` of their -// position in the old `vocab`, the string representation of the newly added -// value, and the original `Id`) and a mapping from old local vocab `Id`s to -// new vocab `Id`s. -std::pair>, - ad_utility::HashMap> +// entries in `entries`. Returns a pair consisting of a vector insertion +// positions (the `VocabIndex` of the `LocalVocabEntry`s position in the old +// `vocab`) and a mapping from old local vocab `Id`s to new vocab `Id`s. 
+std::pair, ad_utility::HashMap> materializeLocalVocab(const std::vector& entries, const Index::Vocab& vocab, const std::string& newIndexName) { @@ -82,19 +79,24 @@ materializeLocalVocab(const std::vector& entries, localVocabMapping.emplace( id, Id::makeFromVocabIndex(VocabIndex::make(newIndex))); } - return std::make_pair(std::move(insertInfo), std::move(localVocabMapping)); + std::vector insertionPositions; + insertionPositions.reserve(insertInfo.size()); + for (const auto& [vocabIndex, _, __] : insertInfo) { + insertionPositions.push_back(vocabIndex); + } + return std::make_pair(std::move(insertionPositions), + std::move(localVocabMapping)); } -// Map old vocab `Id`s to new vocab `Id`s according to the given `insertInfo`. +// Map old vocab `Id`s to new vocab `Id`s according to the given +// `insertionPositions`. Id remapVocabId(Id original, - const std::vector>& - insertInfo) { + const std::vector& insertionPositions) { AD_CONTRACT_CHECK(original.getDatatype() == Datatype::VocabIndex); size_t offset = ql::ranges::distance( - insertInfo.begin(), - ql::ranges::upper_bound( - insertInfo, original.getVocabIndex(), std::less{}, - [](const auto& tuple) { return std::get<0>(tuple); })); + insertionPositions.begin(), + ql::ranges::upper_bound(insertionPositions, original.getVocabIndex(), + std::less{})); return Id::makeFromVocabIndex( VocabIndex::make(original.getVocabIndex().get() + offset)); } @@ -108,7 +110,7 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( const BlockMetadataRanges& blockMetadataRanges, const LocatedTriplesSharedState& locatedTriplesSharedState, const ad_utility::HashMap& localVocabMapping, - const std::vector>& insertInfo, + const std::vector& insertionPositions, const ad_utility::SharedCancellationHandle& cancellationHandle, ql::span additionalColumns) { Permutation::ScanSpecAndBlocks scanSpecAndBlocks{std::move(scanSpec), @@ -120,7 +122,7 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( return ad_utility::InputRangeTypeErased{ 
ad_utility::CachingTransformInputRange{ std::move(fullScan), - [&localVocabMapping, &insertInfo](IdTable& idTable) { + [&localVocabMapping, &insertionPositions](IdTable& idTable) { // TODO process columns in parallel. auto allCols = idTable.getColumns(); // Extra columns beyond the graph column only contain integers (or @@ -128,14 +130,13 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( // remapped. constexpr size_t REGULAR_COLUMNS = 4; for (auto col : allCols | ::ranges::views::take(REGULAR_COLUMNS)) { - ql::ranges::for_each( - col, [&localVocabMapping, &insertInfo](Id& id) { - if (id.getDatatype() == Datatype::LocalVocabIndex) { - id = localVocabMapping.at(id); - } else if (id.getDatatype() == Datatype::VocabIndex) { - id = remapVocabId(id, insertInfo); - } - }); + for (Id& id : col) { + if (id.getDatatype() == Datatype::LocalVocabIndex) { + id = localVocabMapping.at(id); + } else if (id.getDatatype() == Datatype::VocabIndex) { + id = remapVocabId(id, insertionPositions); + } + } } AD_EXPENSIVE_CHECK(ql::ranges::all_of( allCols | ::ranges::views::drop(REGULAR_COLUMNS), [](auto col) { @@ -154,13 +155,13 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( const Permutation& permutation, ScanSpecification scanSpec, const LocatedTriplesSharedState& locatedTriplesSharedState, const ad_utility::HashMap& localVocabMapping, - const std::vector>& insertInfo, + const std::vector& insertionPositions, const ad_utility::SharedCancellationHandle& cancellationHandle) { return readIndexAndRemap( permutation, std::move(scanSpec), permutation.getAugmentedMetadataForPermutation( *locatedTriplesSharedState), - locatedTriplesSharedState, localVocabMapping, insertInfo, + locatedTriplesSharedState, localVocabMapping, insertionPositions, cancellationHandle, std::array{static_cast(ADDITIONAL_COLUMN_GRAPH_ID)}); } From f295b13f9b09f04e6f39c7ffbcae708ad87f02fb Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Mon, 12 Jan 2026 11:26:53 +0100 
Subject: [PATCH 32/41] Perform some optimizations to speed up rebuilding --- src/index/IndexRebuilder.cpp | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp index 59facaeb39..c3611a8ee9 100644 --- a/src/index/IndexRebuilder.cpp +++ b/src/index/IndexRebuilder.cpp @@ -30,8 +30,9 @@ using CancellationHandle = ad_utility::SharedCancellationHandle; // Write a new vocabulary that contains all words from `vocab` plus all // entries in `entries`. Returns a pair consisting of a vector insertion // positions (the `VocabIndex` of the `LocalVocabEntry`s position in the old -// `vocab`) and a mapping from old local vocab `Id`s to new vocab `Id`s. -std::pair, ad_utility::HashMap> +// `vocab`) and a mapping from old local vocab `Id`s bit representation (for +// cheaper hash functions) to new vocab `Id`s. +std::pair, ad_utility::HashMap> materializeLocalVocab(const std::vector& entries, const Index::Vocab& vocab, const std::string& newIndexName) { @@ -39,7 +40,7 @@ materializeLocalVocab(const std::vector& entries, std::vector> insertInfo; insertInfo.reserve(entries.size()); - ad_utility::HashMap localVocabMapping; + ad_utility::HashMap localVocabMapping; for (auto* entry : entries) { const auto& [lower, upper] = entry->positionInVocab(); @@ -65,7 +66,7 @@ materializeLocalVocab(const std::vector& entries, auto word = std::get(insertInfo.at(newWordCount)); auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); localVocabMapping.emplace( - std::get(insertInfo.at(newWordCount)), + std::get(insertInfo.at(newWordCount)).getBits(), Id::makeFromVocabIndex(VocabIndex::make(newIndex))); newWordCount++; } @@ -77,7 +78,7 @@ materializeLocalVocab(const std::vector& entries, for (const auto& [_, word, id] : insertInfo | ql::views::drop(newWordCount)) { auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); localVocabMapping.emplace( - id, 
Id::makeFromVocabIndex(VocabIndex::make(newIndex))); + id.getBits(), Id::makeFromVocabIndex(VocabIndex::make(newIndex))); } std::vector insertionPositions; insertionPositions.reserve(insertInfo.size()); @@ -89,10 +90,13 @@ materializeLocalVocab(const std::vector& entries, } // Map old vocab `Id`s to new vocab `Id`s according to the given -// `insertionPositions`. -Id remapVocabId(Id original, - const std::vector& insertionPositions) { - AD_CONTRACT_CHECK(original.getDatatype() == Datatype::VocabIndex); +// `insertionPositions`. This is the most performance critical code of the +// rebuild. +AD_ALWAYS_INLINE Id +remapVocabId(Id original, const std::vector& insertionPositions) { + AD_EXPENSIVE_CHECK( + original.getDatatype() == Datatype::VocabIndex, + "Only ids resembling a vocab index can be remapped with this function."); size_t offset = ql::ranges::distance( insertionPositions.begin(), ql::ranges::upper_bound(insertionPositions, original.getVocabIndex(), @@ -109,7 +113,7 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( const Permutation& permutation, ScanSpecification scanSpec, const BlockMetadataRanges& blockMetadataRanges, const LocatedTriplesSharedState& locatedTriplesSharedState, - const ad_utility::HashMap& localVocabMapping, + const ad_utility::HashMap& localVocabMapping, const std::vector& insertionPositions, const ad_utility::SharedCancellationHandle& cancellationHandle, ql::span additionalColumns) { @@ -131,10 +135,10 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( constexpr size_t REGULAR_COLUMNS = 4; for (auto col : allCols | ::ranges::views::take(REGULAR_COLUMNS)) { for (Id& id : col) { - if (id.getDatatype() == Datatype::LocalVocabIndex) { - id = localVocabMapping.at(id); - } else if (id.getDatatype() == Datatype::VocabIndex) { + if (id.getDatatype() == Datatype::VocabIndex) [[likely]] { id = remapVocabId(id, insertionPositions); + } else if (id.getDatatype() == Datatype::LocalVocabIndex) { + id = localVocabMapping.at(id.getBits()); } } 
} @@ -154,7 +158,7 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( ad_utility::InputRangeTypeErased> readIndexAndRemap( const Permutation& permutation, ScanSpecification scanSpec, const LocatedTriplesSharedState& locatedTriplesSharedState, - const ad_utility::HashMap& localVocabMapping, + const ad_utility::HashMap& localVocabMapping, const std::vector& insertionPositions, const ad_utility::SharedCancellationHandle& cancellationHandle) { return readIndexAndRemap( From 0a198d15710fc174e49fffdb91c892c9e4cd0763 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Mon, 12 Jan 2026 12:05:21 +0100 Subject: [PATCH 33/41] Properly handle exceptions in parallel execution --- src/index/IndexImpl.cpp | 120 +++++++++++++++++------------------ src/index/IndexRebuilder.cpp | 15 ++--- src/util/ParallelExecutor.h | 36 +++++++++++ 3 files changed, 102 insertions(+), 69 deletions(-) create mode 100644 src/util/ParallelExecutor.h diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 2b824323d9..151d157c60 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -27,6 +27,7 @@ #include "util/InputRangeUtils.h" #include "util/Iterators.h" #include "util/JoinAlgorithms/JoinAlgorithms.h" +#include "util/ParallelExecutor.h" #include "util/ProgressBar.h" #include "util/ThreadSafeQueue.h" #include "util/Timer.h" @@ -1910,8 +1911,6 @@ void countDistinct(std::optional& lastId, size_t& counter, // _____________________________________________________________________________ nlohmann::json IndexImpl::recomputeStatistics( const LocatedTriplesSharedState& locatedTriplesSharedState) const { - // TODO Wrap threads in try statements to avoid termination on - // exception and propagate it. 
size_t numTriples = 0; size_t numTriplesInternal = 0; size_t numSubjects = 0; @@ -1919,77 +1918,76 @@ nlohmann::json IndexImpl::recomputeStatistics( size_t numPredicatesInternal = 0; size_t numObjects = 0; uint64_t nextBlankNode = 0; - { - auto cancellationHandle = - std::make_shared(); - ScanSpecification scanSpec{std::nullopt, std::nullopt, std::nullopt}; - std::vector tasks; - - tasks.push_back(ad_utility::JThread{ - [this, &numTriples, &numPredicates, &nextBlankNode, &scanSpec, - &locatedTriplesSharedState, &cancellationHandle]() { - auto tables = pso_->lazyScan( - pso_->getScanSpecAndBlocks(scanSpec, *locatedTriplesSharedState), - std::nullopt, CompressedRelationReader::ColumnIndicesRef{}, - cancellationHandle, *locatedTriplesSharedState); - std::optional lastPredicate = std::nullopt; - for (const auto& table : tables) { - numTriples += table.numRows(); - for (auto col : table.getColumns()) { - for (auto id : col) { - if (id.getDatatype() == Datatype::BlankNodeIndex) { - nextBlankNode = - std::max(nextBlankNode, id.getBlankNodeIndex().get() + 1); - } - } - } - countDistinct(lastPredicate, numPredicates, table); - } - }}); - - tasks.push_back(ad_utility::JThread{ - [this, &numTriplesInternal, &numPredicatesInternal, &scanSpec, - &locatedTriplesSharedState, &cancellationHandle]() { - auto tables = pso_->internalPermutation().lazyScan( - pso_->internalPermutation().getScanSpecAndBlocks( - scanSpec, *locatedTriplesSharedState), - std::nullopt, CompressedRelationReader::ColumnIndicesRef{}, - cancellationHandle, *locatedTriplesSharedState); - std::optional lastPredicate = std::nullopt; - for (const auto& table : tables) { - numTriplesInternal += table.numRows(); - countDistinct(lastPredicate, numPredicatesInternal, table); - } - }}); - - if (hasAllPermutations()) { - tasks.push_back(ad_utility::JThread{[this, &numSubjects, &scanSpec, - &locatedTriplesSharedState, - &cancellationHandle]() { - auto tables = spo_->lazyScan( - spo_->getScanSpecAndBlocks(scanSpec, 
*locatedTriplesSharedState), + auto cancellationHandle = + std::make_shared(); + ScanSpecification scanSpec{std::nullopt, std::nullopt, std::nullopt}; + std::vector> tasks; + + tasks.push_back(std::packaged_task{ + [this, &numTriples, &numPredicates, &nextBlankNode, &scanSpec, + &locatedTriplesSharedState, &cancellationHandle]() { + auto tables = pso_->lazyScan( + pso_->getScanSpecAndBlocks(scanSpec, *locatedTriplesSharedState), std::nullopt, CompressedRelationReader::ColumnIndicesRef{}, cancellationHandle, *locatedTriplesSharedState); - std::optional lastSubject = std::nullopt; + std::optional lastPredicate = std::nullopt; for (const auto& table : tables) { - countDistinct(lastSubject, numSubjects, table); + numTriples += table.numRows(); + for (auto col : table.getColumns()) { + for (auto id : col) { + if (id.getDatatype() == Datatype::BlankNodeIndex) { + nextBlankNode = + std::max(nextBlankNode, id.getBlankNodeIndex().get() + 1); + } + } + } + countDistinct(lastPredicate, numPredicates, table); } }}); - tasks.push_back(ad_utility::JThread{[this, &numObjects, &scanSpec, - &locatedTriplesSharedState, - &cancellationHandle]() { - auto tables = osp_->lazyScan( - osp_->getScanSpecAndBlocks(scanSpec, *locatedTriplesSharedState), + tasks.push_back(std::packaged_task{ + [this, &numTriplesInternal, &numPredicatesInternal, &scanSpec, + &locatedTriplesSharedState, &cancellationHandle]() { + auto tables = pso_->internalPermutation().lazyScan( + pso_->internalPermutation().getScanSpecAndBlocks( + scanSpec, *locatedTriplesSharedState), std::nullopt, CompressedRelationReader::ColumnIndicesRef{}, cancellationHandle, *locatedTriplesSharedState); - std::optional lastObject = std::nullopt; + std::optional lastPredicate = std::nullopt; for (const auto& table : tables) { - countDistinct(lastObject, numObjects, table); + numTriplesInternal += table.numRows(); + countDistinct(lastPredicate, numPredicatesInternal, table); } }}); - } + + if (hasAllPermutations()) { + tasks.push_back( + 
std::packaged_task{[this, &numSubjects, &scanSpec, + &locatedTriplesSharedState, &cancellationHandle]() { + auto tables = spo_->lazyScan( + spo_->getScanSpecAndBlocks(scanSpec, *locatedTriplesSharedState), + std::nullopt, CompressedRelationReader::ColumnIndicesRef{}, + cancellationHandle, *locatedTriplesSharedState); + std::optional lastSubject = std::nullopt; + for (const auto& table : tables) { + countDistinct(lastSubject, numSubjects, table); + } + }}); + + tasks.push_back( + std::packaged_task{[this, &numObjects, &scanSpec, + &locatedTriplesSharedState, &cancellationHandle]() { + auto tables = osp_->lazyScan( + osp_->getScanSpecAndBlocks(scanSpec, *locatedTriplesSharedState), + std::nullopt, CompressedRelationReader::ColumnIndicesRef{}, + cancellationHandle, *locatedTriplesSharedState); + std::optional lastObject = std::nullopt; + for (const auto& table : tables) { + countDistinct(lastObject, numObjects, table); + } + }}); } + ad_utility::runTasksInParallel(tasks); auto configuration = configurationJson_; configuration["num-triples"] = NumNormalAndInternal{numTriples, numTriplesInternal}; diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp index c3611a8ee9..883d71e26a 100644 --- a/src/index/IndexRebuilder.cpp +++ b/src/index/IndexRebuilder.cpp @@ -23,6 +23,7 @@ #include "util/HashMap.h" #include "util/InputRangeUtils.h" #include "util/Log.h" +#include "util/ParallelExecutor.h" namespace { using CancellationHandle = ad_utility::SharedCancellationHandle; @@ -223,11 +224,10 @@ void materializeToIndex( REBUILD_LOG_INFO << "Writing new permutations ..." << std::endl; - // TODO Make sure any exceptions are properly handled and propagated. 
- std::vector tasks; + std::vector> tasks; if (index.usePatterns()) { - tasks.push_back(ad_utility::JThread{[&newIndex, &index, &insertInfo]() { + tasks.push_back(std::packaged_task{[&newIndex, &index, &insertInfo]() { newIndex.getPatterns() = index.getPatterns().cloneAndRemap([&insertInfo](const Id& oldId) { return remapVocabId(oldId, insertInfo); @@ -240,7 +240,7 @@ void materializeToIndex( using enum Permutation::Enum; for (auto permutation : {SPO, SOP, OPS, OSP}) { const auto& actualPermutation = index.getPermutation(permutation); - tasks.push_back(ad_utility::JThread{ + tasks.push_back(std::packaged_task{ [&newIndex, &actualPermutation, &scanSpec, &locatedTriplesSharedState, &localVocabMapping, &insertInfo, &cancellationHandle]() { newIndex.createPermutation( @@ -256,7 +256,7 @@ void materializeToIndex( for (auto permutation : Permutation::INTERNAL) { const auto& actualPermutation = index.getPermutation(permutation); const auto& internalPermutation = actualPermutation.internalPermutation(); - tasks.push_back(ad_utility::JThread{ + tasks.push_back(std::packaged_task{ [&newIndex, &internalPermutation, &scanSpec, &locatedTriplesSharedState, &localVocabMapping, &insertInfo, &cancellationHandle]() { newIndex.createPermutation( @@ -281,7 +281,7 @@ void materializeToIndex( additionalColumns.push_back(col); } AD_CORRECTNESS_CHECK(additionalColumns.size() == numColumns - 3); - tasks.push_back(ad_utility::JThread{ + tasks.push_back(std::packaged_task{ [&newIndex, &actualPermutation, &scanSpec, &locatedTriplesSharedState, &localVocabMapping, &insertInfo, &cancellationHandle, numColumns, blockMetadataRanges = std::move(blockMetadataRanges), @@ -296,8 +296,7 @@ void materializeToIndex( }}); } - // Explicitly wait for all threads to complete before logging completion - tasks.clear(); + ad_utility::runTasksInParallel(tasks); REBUILD_LOG_INFO << "Index rebuild completed" << std::endl; diff --git a/src/util/ParallelExecutor.h b/src/util/ParallelExecutor.h new file mode 100644 
index 0000000000..909c0615d2 --- /dev/null +++ b/src/util/ParallelExecutor.h @@ -0,0 +1,36 @@ +// Copyright 2026 The QLever Authors, in particular: +// +// 2026 Robin Textor-Falconi , UFR +// +// UFR = University of Freiburg, Chair of Algorithms and Data Structures + +#ifndef QLEVER_SRC_UTIL_PARALLELEXECUTOR_H +#define QLEVER_SRC_UTIL_PARALLELEXECUTOR_H + +#include +#include + +#include "util/jthread.h" + +namespace ad_utility { +// Run the given tasks in parallel and wait for their completion. This function +// will spawn a new thread for each task. If one of the tasks throws an +// exception, this exception will be rethrown in the main thread. If multiple +// tasks throw exceptions, only the first one will be rethrown. +inline void runTasksInParallel(std::vector>& tasks) { + std::vector> futures; + futures.reserve(tasks.size()); + std::vector threads; + threads.reserve(tasks.size()); + for (auto& task : tasks) { + futures.push_back(task.get_future()); + threads.push_back(JThread{std::move(task)}); + } + // Wait for completion. 
+ for (auto& future : futures) { + future.get(); + } +} +} // namespace ad_utility + +#endif // QLEVER_SRC_UTIL_PARALLELEXECUTOR_H From dee6cdbc1a30209b0d373eb3023e24b02af6aacd Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Wed, 14 Jan 2026 16:08:54 +0100 Subject: [PATCH 34/41] Unify code and fix issue with missing patterns for OSP/OPS --- src/index/IndexImpl.h | 2 +- src/index/IndexRebuilder.cpp | 131 +++++++++++++++-------------------- 2 files changed, 58 insertions(+), 75 deletions(-) diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 3970189e56..85ecdf6c99 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -626,7 +626,7 @@ class IndexImpl { size_t createPermutation( size_t numColumns, ad_utility::InputRangeTypeErased> sortedTriples, - const Permutation& permutation, bool internal = false); + const Permutation& permutation, bool internal); protected: void openTextFileHandle(); diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp index 883d71e26a..24cd90ee83 100644 --- a/src/index/IndexRebuilder.cpp +++ b/src/index/IndexRebuilder.cpp @@ -154,23 +154,6 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( }}}; } -// Overload that automatically retrieves the block metadata ranges for the given -// `permutation` and passes the graph ID as an additional column to be read. 
-ad_utility::InputRangeTypeErased> readIndexAndRemap( - const Permutation& permutation, ScanSpecification scanSpec, - const LocatedTriplesSharedState& locatedTriplesSharedState, - const ad_utility::HashMap& localVocabMapping, - const std::vector& insertionPositions, - const ad_utility::SharedCancellationHandle& cancellationHandle) { - return readIndexAndRemap( - permutation, std::move(scanSpec), - permutation.getAugmentedMetadataForPermutation( - *locatedTriplesSharedState), - locatedTriplesSharedState, localVocabMapping, insertionPositions, - cancellationHandle, - std::array{static_cast(ADDITIONAL_COLUMN_GRAPH_ID)}); -} - // Get the number of columns in the given `blockMetadataRanges`. If this cannot // be determined, return 4 as a safe default. size_t getNumColumns(const BlockMetadataRanges& blockMetadataRanges) { @@ -185,6 +168,44 @@ size_t getNumColumns(const BlockMetadataRanges& blockMetadataRanges) { } return 4; } + +// Create a `std::packaged_task` that writes a new permutation according to the +// settings of `newIndex`, based on the data of the current index. 
+std::packaged_task createPermutationWriterTask( + IndexImpl& newIndex, const Permutation& permutation, bool isInternal, + const LocatedTriplesSharedState& locatedTriplesSharedState, + const ad_utility::HashMap& localVocabMapping, + const std::vector& insertionPositions, + const ad_utility::SharedCancellationHandle& cancellationHandle) { + auto blockMetadataRanges = permutation.getAugmentedMetadataForPermutation( + *locatedTriplesSharedState); + size_t numColumns = getNumColumns(blockMetadataRanges); + std::vector additionalColumns; + additionalColumns.push_back(ADDITIONAL_COLUMN_GRAPH_ID); + for (ColumnIndex col : {ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN, + ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN}) { + if (additionalColumns.size() >= numColumns - 3) { + break; + } + additionalColumns.push_back(col); + } + AD_CORRECTNESS_CHECK(additionalColumns.size() == numColumns - 3); + return std::packaged_task{ + [numColumns, blockMetadataRanges = std::move(blockMetadataRanges), + &newIndex, &permutation, isInternal, &locatedTriplesSharedState, + &localVocabMapping, &insertionPositions, &cancellationHandle, + additionalColumns = std::move(additionalColumns)]() { + newIndex.createPermutation( + numColumns, + readIndexAndRemap( + permutation, + ScanSpecification{std::nullopt, std::nullopt, std::nullopt}, + blockMetadataRanges, locatedTriplesSharedState, + localVocabMapping, insertionPositions, cancellationHandle, + additionalColumns), + permutation, isInternal); + }}; +} } // namespace // _____________________________________________________________________________ @@ -211,14 +232,14 @@ void materializeToIndex( REBUILD_LOG_INFO << "Writing new vocabulary ..." << std::endl; - const auto& [insertInfo, localVocabMapping] = + const auto& [insertionPositions, localVocabMapping] = materializeLocalVocab(entries, index.getVocab(), newIndexName); REBUILD_LOG_INFO << "Recomputing statistics ..." 
<< std::endl; auto newStats = index.recomputeStatistics(locatedTriplesSharedState); - ScanSpecification scanSpec{std::nullopt, std::nullopt, std::nullopt}; + ; IndexImpl newIndex{index.allocator(), false}; newIndex.loadConfigFromOldIndex(newIndexName, index, newStats); @@ -227,73 +248,35 @@ void materializeToIndex( std::vector> tasks; if (index.usePatterns()) { - tasks.push_back(std::packaged_task{[&newIndex, &index, &insertInfo]() { - newIndex.getPatterns() = - index.getPatterns().cloneAndRemap([&insertInfo](const Id& oldId) { - return remapVocabId(oldId, insertInfo); - }); - newIndex.writePatternsToFile(); - }}); + tasks.push_back( + std::packaged_task{[&newIndex, &index, &insertionPositions]() { + newIndex.getPatterns() = index.getPatterns().cloneAndRemap( + [&insertionPositions](const Id& oldId) { + return remapVocabId(oldId, insertionPositions); + }); + newIndex.writePatternsToFile(); + }}); } if (index.hasAllPermutations()) { using enum Permutation::Enum; for (auto permutation : {SPO, SOP, OPS, OSP}) { const auto& actualPermutation = index.getPermutation(permutation); - tasks.push_back(std::packaged_task{ - [&newIndex, &actualPermutation, &scanSpec, &locatedTriplesSharedState, - &localVocabMapping, &insertInfo, &cancellationHandle]() { - newIndex.createPermutation( - 4, - readIndexAndRemap(actualPermutation, scanSpec, - locatedTriplesSharedState, localVocabMapping, - insertInfo, cancellationHandle), - actualPermutation); - }}); + tasks.push_back(createPermutationWriterTask( + newIndex, actualPermutation, false, locatedTriplesSharedState, + localVocabMapping, insertionPositions, cancellationHandle)); } } for (auto permutation : Permutation::INTERNAL) { const auto& actualPermutation = index.getPermutation(permutation); const auto& internalPermutation = actualPermutation.internalPermutation(); - tasks.push_back(std::packaged_task{ - [&newIndex, &internalPermutation, &scanSpec, &locatedTriplesSharedState, - &localVocabMapping, &insertInfo, &cancellationHandle]() 
{ - newIndex.createPermutation( - 4, - readIndexAndRemap(internalPermutation, scanSpec, - locatedTriplesSharedState, localVocabMapping, - insertInfo, cancellationHandle), - internalPermutation, true); - }}); - - auto blockMetadataRanges = - actualPermutation.getAugmentedMetadataForPermutation( - *locatedTriplesSharedState); - size_t numColumns = getNumColumns(blockMetadataRanges); - std::vector additionalColumns; - additionalColumns.push_back(ADDITIONAL_COLUMN_GRAPH_ID); - for (ColumnIndex col : {ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN, - ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN}) { - if (additionalColumns.size() >= numColumns - 3) { - break; - } - additionalColumns.push_back(col); - } - AD_CORRECTNESS_CHECK(additionalColumns.size() == numColumns - 3); - tasks.push_back(std::packaged_task{ - [&newIndex, &actualPermutation, &scanSpec, &locatedTriplesSharedState, - &localVocabMapping, &insertInfo, &cancellationHandle, numColumns, - blockMetadataRanges = std::move(blockMetadataRanges), - additionalColumns = std::move(additionalColumns)]() { - newIndex.createPermutation( - numColumns, - readIndexAndRemap(actualPermutation, scanSpec, - blockMetadataRanges, locatedTriplesSharedState, - localVocabMapping, insertInfo, - cancellationHandle, additionalColumns), - actualPermutation); - }}); + tasks.push_back(createPermutationWriterTask( + newIndex, internalPermutation, true, locatedTriplesSharedState, + localVocabMapping, insertionPositions, cancellationHandle)); + tasks.push_back(createPermutationWriterTask( + newIndex, actualPermutation, false, locatedTriplesSharedState, + localVocabMapping, insertionPositions, cancellationHandle)); } ad_utility::runTasksInParallel(tasks); From c7036af3b91638f114920c20323df08fa9254cf8 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Wed, 14 Jan 2026 16:20:31 +0100 Subject: [PATCH 35/41] Add future task --- src/index/IndexRebuilder.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp index 24cd90ee83..a6c8f8c60d 100644 --- a/src/index/IndexRebuilder.cpp +++ b/src/index/IndexRebuilder.cpp @@ -136,6 +136,9 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( constexpr size_t REGULAR_COLUMNS = 4; for (auto col : allCols | ::ranges::views::take(REGULAR_COLUMNS)) { for (Id& id : col) { + // TODO Experiment with caching the last remapped id + // and reusing it if the same id appears again. See if that + // improves performance or if it makes it worse. if (id.getDatatype() == Datatype::VocabIndex) [[likely]] { id = remapVocabId(id, insertionPositions); } else if (id.getDatatype() == Datatype::LocalVocabIndex) { From 93243f63ee4f82958e148c44fb9a93b21e6e1077 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Wed, 14 Jan 2026 16:20:44 +0100 Subject: [PATCH 36/41] Simplify function signature --- src/index/IndexRebuilder.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp index a6c8f8c60d..7a611ab204 100644 --- a/src/index/IndexRebuilder.cpp +++ b/src/index/IndexRebuilder.cpp @@ -111,15 +111,16 @@ remapVocabId(Id original, const std::vector& insertionPositions) { // and all vocab `Id`s are remapped according to `insertInfo` to create a new // index where all of these values are all vocab `Id`s in the new vocabulary. 
ad_utility::InputRangeTypeErased> readIndexAndRemap( - const Permutation& permutation, ScanSpecification scanSpec, + const Permutation& permutation, const BlockMetadataRanges& blockMetadataRanges, const LocatedTriplesSharedState& locatedTriplesSharedState, const ad_utility::HashMap& localVocabMapping, const std::vector& insertionPositions, const ad_utility::SharedCancellationHandle& cancellationHandle, ql::span additionalColumns) { - Permutation::ScanSpecAndBlocks scanSpecAndBlocks{std::move(scanSpec), - blockMetadataRanges}; + Permutation::ScanSpecAndBlocks scanSpecAndBlocks{ + ScanSpecification{std::nullopt, std::nullopt, std::nullopt}, + blockMetadataRanges}; auto fullScan = permutation.lazyScan( scanSpecAndBlocks, std::nullopt, additionalColumns, cancellationHandle, *locatedTriplesSharedState, LimitOffsetClause{}); @@ -200,12 +201,10 @@ std::packaged_task createPermutationWriterTask( additionalColumns = std::move(additionalColumns)]() { newIndex.createPermutation( numColumns, - readIndexAndRemap( - permutation, - ScanSpecification{std::nullopt, std::nullopt, std::nullopt}, - blockMetadataRanges, locatedTriplesSharedState, - localVocabMapping, insertionPositions, cancellationHandle, - additionalColumns), + readIndexAndRemap(permutation, blockMetadataRanges, + locatedTriplesSharedState, localVocabMapping, + insertionPositions, cancellationHandle, + additionalColumns), permutation, isInternal); }}; } From 96b96f83a61e373cbd897b3d6f8ffa290b1defdd Mon Sep 17 00:00:00 2001 From: Hannah Bast Date: Thu, 15 Jan 2026 22:03:35 +0100 Subject: [PATCH 37/41] Add runtime parameter `disable-update-graph-metadata` The parameter does what it says, and is `false` by default --- src/global/RuntimeParameters.cpp | 1 + src/global/RuntimeParameters.h | 4 ++++ src/index/LocatedTriples.cpp | 6 ++++++ 3 files changed, 11 insertions(+) diff --git a/src/global/RuntimeParameters.cpp b/src/global/RuntimeParameters.cpp index 5a409a2c32..b23b79a4c4 100644 --- 
a/src/global/RuntimeParameters.cpp +++ b/src/global/RuntimeParameters.cpp @@ -51,6 +51,7 @@ RuntimeParameters::RuntimeParameters() { add(materializedViewWriterMemory_); add(defaultQueryTimeout_); add(sortInMemoryThreshold_); + add(disableUpdateGraphMetadata_); defaultQueryTimeout_.setParameterConstraint( [](std::chrono::seconds value, std::string_view parameterName) { diff --git a/src/global/RuntimeParameters.h b/src/global/RuntimeParameters.h index 67cdfbf3f8..24901addfe 100644 --- a/src/global/RuntimeParameters.h +++ b/src/global/RuntimeParameters.h @@ -135,6 +135,10 @@ struct RuntimeParameters { MemorySizeParameter sortInMemoryThreshold_{ ad_utility::MemorySize::gigabytes(5), "sort-in-memory-threshold"}; + // If set to `true`, skip updating graph metadata for delta triples. + // This can improve performance when graph metadata is not needed. + Bool disableUpdateGraphMetadata_{false, "disable-update-graph-metadata"}; + // ___________________________________________________________________________ // IMPORTANT NOTE: IF YOU ADD PARAMETERS ABOVE, ALSO REGISTER THEM IN THE // CONSTRUCTOR, S.T. THEY CAN ALSO BE ACCESSED VIA THE RUNTIME INTERFACE. diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp index 4666d856ff..4d61f5f903 100644 --- a/src/index/LocatedTriples.cpp +++ b/src/index/LocatedTriples.cpp @@ -11,6 +11,7 @@ #include "index/LocatedTriples.h" #include "backports/algorithm.h" +#include "global/RuntimeParameters.h" #include "index/CompressedRelation.h" #include "index/ConstantsIndexBuilding.h" #include "util/ChunkedForLoop.h" @@ -276,6 +277,11 @@ void LocatedTriplesPerBlock::setOriginalMetadata( // the graph info is set to `nullopt`, which means that there is no info. static auto updateGraphMetadata(CompressedBlockMetadata& blockMetadata, const LocatedTriples& locatedTriples) { + // Early return if graph metadata updates are disabled. 
+ if (getRuntimeParameter<&RuntimeParameters::disableUpdateGraphMetadata_>()) { + return; + } + // We do not know anything about the triples contained in the block, so we // also cannot know if the `locatedTriples` introduces duplicates. We thus // have to be conservative and assume that there are duplicates. From fcdc22d70c8e297e5a2d9a8ed510102a618f3bd6 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Mon, 19 Jan 2026 18:28:20 +0100 Subject: [PATCH 38/41] Fix blank node remapping --- src/engine/Server.cpp | 6 +-- src/index/DeltaTriples.cpp | 19 ++++++-- src/index/DeltaTriples.h | 12 +++-- src/index/IndexImpl.cpp | 23 ++++----- src/index/IndexRebuilder.cpp | 93 ++++++++++++++++++++++++++++-------- src/index/IndexRebuilder.h | 8 +++- test/IndexTest.cpp | 2 - 7 files changed, 115 insertions(+), 48 deletions(-) diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index 9a9f9cf5a7..c08593f7f8 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -481,12 +481,12 @@ CPP_template_def(typename RequestT, typename ResponseT)( queryThreadPool_, [this, &handle, fileName = std::move(fileName)] { auto logFileName = fileName + ".rebuild-index-log.txt"; - auto [currentSnapshot, localVocabCopy] = + auto [currentSnapshot, localVocabCopy, ownedBlocks] = index_.deltaTriplesManager() .getCurrentLocatedTriplesSharedStateWithVocab(); qlever::materializeToIndex(index_.getImpl(), fileName, - localVocabCopy, currentSnapshot, handle, - logFileName); + currentSnapshot, localVocabCopy, + ownedBlocks, handle, logFileName); }, handle); co_await std::move(coroutine); diff --git a/src/index/DeltaTriples.cpp b/src/index/DeltaTriples.cpp index 7c208a364c..585f2debf3 100644 --- a/src/index/DeltaTriples.cpp +++ b/src/index/DeltaTriples.cpp @@ -508,11 +508,15 @@ DeltaTriplesManager::getCurrentLocatedTriplesSharedState() const { } // _____________________________________________________________________________ -std::pair> +std::tuple< + 
LocatedTriplesSharedState, std::vector, + std::vector< + ad_utility::BlankNodeManager::LocalBlankNodeManager::OwnedBlocksEntry>> DeltaTriplesManager::getCurrentLocatedTriplesSharedStateWithVocab() const { return deltaTriples_.withReadLock([this](const DeltaTriples& deltaTriples) { - return std::make_pair(*currentLocatedTriplesSharedState_.rlock(), - deltaTriples.copyLocalVocab()); + auto [indices, ownedBlocks] = deltaTriples.copyLocalVocab(); + return std::make_tuple(*currentLocatedTriplesSharedState_.rlock(), + std::move(indices), std::move(ownedBlocks)); }); } @@ -613,12 +617,17 @@ void DeltaTriplesManager::setFilenameForPersistentUpdatesAndReadFromDisk( } // _____________________________________________________________________________ -std::vector DeltaTriples::copyLocalVocab() const { +std::pair< + std::vector, + std::vector< + ad_utility::BlankNodeManager::LocalBlankNodeManager::OwnedBlocksEntry>> +DeltaTriples::copyLocalVocab() const { std::vector entries; entries.reserve(localVocab_.size()); for (const LocalVocabEntry& entry : localVocab_.primaryWordSet()) { entries.push_back(&entry); } - return entries; + return std::make_pair(std::move(entries), + localVocab_.getOwnedLocalBlankNodeBlocks()); } diff --git a/src/index/DeltaTriples.h b/src/index/DeltaTriples.h index 7ce1d2d69c..08562b1ebc 100644 --- a/src/index/DeltaTriples.h +++ b/src/index/DeltaTriples.h @@ -278,7 +278,10 @@ class DeltaTriples { // Create a copy of the local vocab such that it can be processed // without holding the lock. You have to make sure separately that the // pointers are still valid. - std::vector copyLocalVocab() const; + std::pair, + std::vector> + copyLocalVocab() const; private: // The proper state according to the template parameter. This will either @@ -372,8 +375,11 @@ class DeltaTriplesManager { LocatedTriplesSharedState getCurrentLocatedTriplesSharedState() const; // In addition to the located triples shared state, also acquire a copy of the - // local vocab indices. 
- std::pair> + // local vocab indices and the local blank node blocks owned by the local + // vocab. + std::tuple, + std::vector> getCurrentLocatedTriplesSharedStateWithVocab() const; }; diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 71d46e0fbf..57971bb1d4 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -43,6 +43,11 @@ using namespace ad_utility::memory_literals; // sorter. static constexpr size_t NUM_EXTERNAL_SORTERS_AT_SAME_TIME = 2u; +// The name of this JSON property no longer holds up as soon as blank nodes are +// added or removed via updates. For backwards compatibility we keep the name. +constexpr std::string_view BLANK_NODE_ALLOCATION_START = + "num-blank-nodes-total"; + // _____________________________________________________________________________ IndexImpl::IndexImpl(ad_utility::AllocatorWithLimit allocator, bool registerSingleton) @@ -447,7 +452,7 @@ void IndexImpl::createFromFiles( configurationJson_["has-all-permutations"] = true; } - configurationJson_["num-blank-nodes-total"] = + configurationJson_[BLANK_NODE_ALLOCATION_START] = indexBuilderData.vocabularyMetaData_.getNextBlankNodeIndex(); addInternalStatisticsToConfiguration(numTriplesInternal, @@ -1284,7 +1289,7 @@ void IndexImpl::readConfiguration() { // Initialize BlankNodeManager uint64_t numBlankNodesTotal; - loadDataMember("num-blank-nodes-total", numBlankNodesTotal); + loadDataMember(BLANK_NODE_ALLOCATION_START, numBlankNodesTotal); blankNodeManager_ = std::make_unique(numBlankNodesTotal); @@ -1974,7 +1979,6 @@ nlohmann::json IndexImpl::recomputeStatistics( size_t numPredicates = 0; size_t numPredicatesInternal = 0; size_t numObjects = 0; - uint64_t nextBlankNode = 0; std::vector> tasks; @@ -1987,17 +1991,7 @@ nlohmann::json IndexImpl::recomputeStatistics( tasks.push_back(getCounterTask( numPredicates, *pso_, - [&numTriples, &nextBlankNode](const IdTable& table) { - numTriples += table.numRows(); - for (auto col : table.getColumns()) { - for 
(auto id : col) { - if (id.getDatatype() == Datatype::BlankNodeIndex) { - nextBlankNode = - std::max(nextBlankNode, id.getBlankNodeIndex().get() + 1); - } - } - } - })); + [&numTriples](const IdTable& table) { numTriples += table.numRows(); })); tasks.push_back(getCounterTask(numPredicatesInternal, pso_->internalPermutation(), @@ -2022,6 +2016,5 @@ nlohmann::json IndexImpl::recomputeStatistics( configuration["num-subjects"] = NumNormalAndInternal{numSubjects, 0}; configuration["num-objects"] = NumNormalAndInternal{numObjects, 0}; } - configuration["num-blank-nodes-total"] = nextBlankNode; return configuration; } diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp index 2cf0551648..3b29db4444 100644 --- a/src/index/IndexRebuilder.cpp +++ b/src/index/IndexRebuilder.cpp @@ -33,10 +33,14 @@ using CancellationHandle = ad_utility::SharedCancellationHandle; // positions (the `VocabIndex` of the `LocalVocabEntry`s position in the old // `vocab`) and a mapping from old local vocab `Id`s bit representation (for // cheaper hash functions) to new vocab `Id`s. 
-std::pair, ad_utility::HashMap> -materializeLocalVocab(const std::vector& entries, - const Index::Vocab& vocab, - const std::string& newIndexName) { +std::tuple, ad_utility::HashMap, + std::vector> +materializeLocalVocab( + const std::vector& entries, + const std::vector< + ad_utility::BlankNodeManager::LocalBlankNodeManager::OwnedBlocksEntry>& + ownedBlocks, + const Index::Vocab& vocab, const std::string& newIndexName) { size_t newWordCount = 0; std::vector> insertInfo; insertInfo.reserve(entries.size()); @@ -86,8 +90,15 @@ materializeLocalVocab(const std::vector& entries, for (const auto& [vocabIndex, _, __] : insertInfo) { insertionPositions.push_back(vocabIndex); } - return std::make_pair(std::move(insertionPositions), - std::move(localVocabMapping)); + std::vector flatBlockIndices; + for (const auto& ownedBlockEntry : ownedBlocks) { + ql::ranges::copy(ownedBlockEntry.blockIndices_, + std::back_inserter(flatBlockIndices)); + } + ql::ranges::sort(flatBlockIndices); + return std::make_tuple(std::move(insertionPositions), + std::move(localVocabMapping), + std::move(flatBlockIndices)); } // Map old vocab `Id`s to new vocab `Id`s according to the given @@ -106,6 +117,29 @@ remapVocabId(Id original, const std::vector& insertionPositions) { VocabIndex::make(original.getVocabIndex().get() + offset)); } +// Remaps a blank node `Id` to another id that's more dense. 
+Id remapBlankNodeId(Id original, const std::vector& blankNodeBlocks, + uint64_t minBlankNodeIndex) { + AD_EXPENSIVE_CHECK( + original.getDatatype() == Datatype::BlankNodeIndex, + "Only ids resembling a blank node index can be remapped with this " + "function."); + auto rawId = original.getBlankNodeIndex().get(); + if (rawId < minBlankNodeIndex) { + return original; + } + auto normalizedId = rawId - minBlankNodeIndex; + auto blockIndex = normalizedId / ad_utility::BlankNodeManager::blockSize_; + auto it = ql::ranges::lower_bound(blankNodeBlocks, blockIndex); + AD_EXPENSIVE_CHECK(it != blankNodeBlocks.end() && *it == blockIndex, + "Could not find block index of blank node."); + return Id::makeFromBlankNodeIndex(BlankNodeIndex::make( + (normalizedId % ad_utility::BlankNodeManager::blockSize_) + + ql::ranges::distance(blankNodeBlocks.begin(), it) * + ad_utility::BlankNodeManager::blockSize_ + + minBlankNodeIndex)); +} + // Create a copy of the given `permutation` scanned according to `scanSpec`, // where all local vocab `Id`s are remapped according to `localVocabMapping` // and all vocab `Id`s are remapped according to `insertInfo` to create a new @@ -116,8 +150,11 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( const LocatedTriplesSharedState& locatedTriplesSharedState, const ad_utility::HashMap& localVocabMapping, const std::vector& insertionPositions, + const std::vector& blankNodeBlocks, uint64_t minBlankNodeIndex, const ad_utility::SharedCancellationHandle& cancellationHandle, ql::span additionalColumns) { + AD_CORRECTNESS_CHECK(ql::ranges::is_sorted(insertionPositions)); + AD_CORRECTNESS_CHECK(ql::ranges::is_sorted(blankNodeBlocks)); Permutation::ScanSpecAndBlocks scanSpecAndBlocks{ ScanSpecification{std::nullopt, std::nullopt, std::nullopt}, blockMetadataRanges}; @@ -128,7 +165,8 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( return ad_utility::InputRangeTypeErased{ ad_utility::CachingTransformInputRange{ std::move(fullScan), - 
[&localVocabMapping, &insertionPositions](IdTable& idTable) { + [&localVocabMapping, &insertionPositions, &blankNodeBlocks, + minBlankNodeIndex](IdTable& idTable) { // TODO process columns in parallel. auto allCols = idTable.getColumns(); // Extra columns beyond the graph column only contain integers (or @@ -144,6 +182,8 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( id = remapVocabId(id, insertionPositions); } else if (id.getDatatype() == Datatype::LocalVocabIndex) { id = localVocabMapping.at(id.getBits()); + } else if (id.getDatatype() == Datatype::BlankNodeIndex) { + id = remapBlankNodeId(id, blankNodeBlocks, minBlankNodeIndex); } } } @@ -180,6 +220,7 @@ std::packaged_task createPermutationWriterTask( const LocatedTriplesSharedState& locatedTriplesSharedState, const ad_utility::HashMap& localVocabMapping, const std::vector& insertionPositions, + const std::vector& blankNodeBlocks, uint64_t minBlankNodeIndex, const ad_utility::SharedCancellationHandle& cancellationHandle) { auto blockMetadataRanges = permutation.getAugmentedMetadataForPermutation( *locatedTriplesSharedState); @@ -197,14 +238,15 @@ std::packaged_task createPermutationWriterTask( return std::packaged_task{ [numColumns, blockMetadataRanges = std::move(blockMetadataRanges), &newIndex, &permutation, isInternal, &locatedTriplesSharedState, - &localVocabMapping, &insertionPositions, &cancellationHandle, + &localVocabMapping, &insertionPositions, &blankNodeBlocks, + minBlankNodeIndex, &cancellationHandle, additionalColumns = std::move(additionalColumns)]() { newIndex.createPermutation( numColumns, - readIndexAndRemap(permutation, blockMetadataRanges, - locatedTriplesSharedState, localVocabMapping, - insertionPositions, cancellationHandle, - additionalColumns), + readIndexAndRemap( + permutation, blockMetadataRanges, locatedTriplesSharedState, + localVocabMapping, insertionPositions, blankNodeBlocks, + minBlankNodeIndex, cancellationHandle, additionalColumns), permutation, isInternal); }}; } @@ 
-214,8 +256,11 @@ std::packaged_task createPermutationWriterTask( namespace qlever { void materializeToIndex( const IndexImpl& index, const std::string& newIndexName, - const std::vector& entries, const LocatedTriplesSharedState& locatedTriplesSharedState, + const std::vector& entries, + const std::vector< + ad_utility::BlankNodeManager::LocalBlankNodeManager::OwnedBlocksEntry>& + ownedBlocks, const CancellationHandle& cancellationHandle, const std::string& logFileName) { AD_CONTRACT_CHECK(!logFileName.empty(), "Log file name must not be empty"); @@ -234,14 +279,21 @@ void materializeToIndex( REBUILD_LOG_INFO << "Writing new vocabulary ..." << std::endl; - const auto& [insertionPositions, localVocabMapping] = - materializeLocalVocab(entries, index.getVocab(), newIndexName); + const auto& [insertionPositions, localVocabMapping, blankNodeBlocks] = + materializeLocalVocab(entries, ownedBlocks, index.getVocab(), + newIndexName); REBUILD_LOG_INFO << "Recomputing statistics ..." << std::endl; auto newStats = index.recomputeStatistics(locatedTriplesSharedState); - ; + auto minBlankNodeIndex = index.getBlankNodeManager()->minIndex_; + + // Set newer lower bound for dynamic blank node indices. 
+ newStats["num-blank-nodes-total"] = + minBlankNodeIndex + + blankNodeBlocks.size() * ad_utility::BlankNodeManager::blockSize_; + IndexImpl newIndex{index.allocator(), false}; newIndex.loadConfigFromOldIndex(newIndexName, index, newStats); @@ -266,7 +318,8 @@ void materializeToIndex( const auto& actualPermutation = index.getPermutation(permutation); tasks.push_back(createPermutationWriterTask( newIndex, actualPermutation, false, locatedTriplesSharedState, - localVocabMapping, insertionPositions, cancellationHandle)); + localVocabMapping, insertionPositions, blankNodeBlocks, + minBlankNodeIndex, cancellationHandle)); } } @@ -275,10 +328,12 @@ void materializeToIndex( const auto& internalPermutation = actualPermutation.internalPermutation(); tasks.push_back(createPermutationWriterTask( newIndex, internalPermutation, true, locatedTriplesSharedState, - localVocabMapping, insertionPositions, cancellationHandle)); + localVocabMapping, insertionPositions, blankNodeBlocks, + minBlankNodeIndex, cancellationHandle)); tasks.push_back(createPermutationWriterTask( newIndex, actualPermutation, false, locatedTriplesSharedState, - localVocabMapping, insertionPositions, cancellationHandle)); + localVocabMapping, insertionPositions, blankNodeBlocks, + minBlankNodeIndex, cancellationHandle)); } ad_utility::runTasksInParallel(std::move(tasks)); diff --git a/src/index/IndexRebuilder.h b/src/index/IndexRebuilder.h index 4f56366d25..e5dc102368 100644 --- a/src/index/IndexRebuilder.h +++ b/src/index/IndexRebuilder.h @@ -8,7 +8,10 @@ #include #include +#include "global/IndexTypes.h" +#include "index/DeltaTriples.h" #include "index/IndexImpl.h" +#include "util/BlankNodeManager.h" #include "util/CancellationHandle.h" namespace qlever { @@ -16,8 +19,11 @@ namespace qlever { // Build a new index based on this data. 
void materializeToIndex( const IndexImpl& index, const std::string& newIndexName, - const std::vector& entries, const LocatedTriplesSharedState& locatedTriplesSharedState, + const std::vector& entries, + const std::vector< + ad_utility::BlankNodeManager::LocalBlankNodeManager::OwnedBlocksEntry>& + ownedBlocks, const ad_utility::SharedCancellationHandle& cancellationHandle, const std::string& logFileName); diff --git a/test/IndexTest.cpp b/test/IndexTest.cpp index 2311d35db8..93f891a0ff 100644 --- a/test/IndexTest.cpp +++ b/test/IndexTest.cpp @@ -722,8 +722,6 @@ TEST(IndexImpl, recomputeStatistics) { EXPECT_EQ(newStats["num-subjects"], NNAI(0, 0)); EXPECT_EQ(newStats["num-objects"], NNAI(0, 0)); } - // Blank node ids are remapped, so we cannot predict the exact number. - EXPECT_NE(newStats["num-blank-nodes-total"], 0); } } From 98c56809c1aab1dda005d8f9d7edf5d2451cbc31 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Mon, 2 Feb 2026 12:05:28 +0100 Subject: [PATCH 39/41] Revert "Add runtime parameter `disable-update-graph-metadata`" This reverts commit 96b96f83a61e373cbd897b3d6f8ffa290b1defdd. 
--- src/global/RuntimeParameters.cpp | 1 - src/global/RuntimeParameters.h | 4 ---- src/index/LocatedTriples.cpp | 6 ------ 3 files changed, 11 deletions(-) diff --git a/src/global/RuntimeParameters.cpp b/src/global/RuntimeParameters.cpp index b23b79a4c4..5a409a2c32 100644 --- a/src/global/RuntimeParameters.cpp +++ b/src/global/RuntimeParameters.cpp @@ -51,7 +51,6 @@ RuntimeParameters::RuntimeParameters() { add(materializedViewWriterMemory_); add(defaultQueryTimeout_); add(sortInMemoryThreshold_); - add(disableUpdateGraphMetadata_); defaultQueryTimeout_.setParameterConstraint( [](std::chrono::seconds value, std::string_view parameterName) { diff --git a/src/global/RuntimeParameters.h b/src/global/RuntimeParameters.h index 24901addfe..67cdfbf3f8 100644 --- a/src/global/RuntimeParameters.h +++ b/src/global/RuntimeParameters.h @@ -135,10 +135,6 @@ struct RuntimeParameters { MemorySizeParameter sortInMemoryThreshold_{ ad_utility::MemorySize::gigabytes(5), "sort-in-memory-threshold"}; - // If set to `true`, skip updating graph metadata for delta triples. - // This can improve performance when graph metadata is not needed. - Bool disableUpdateGraphMetadata_{false, "disable-update-graph-metadata"}; - // ___________________________________________________________________________ // IMPORTANT NOTE: IF YOU ADD PARAMETERS ABOVE, ALSO REGISTER THEM IN THE // CONSTRUCTOR, S.T. THEY CAN ALSO BE ACCESSED VIA THE RUNTIME INTERFACE. diff --git a/src/index/LocatedTriples.cpp b/src/index/LocatedTriples.cpp index 4d61f5f903..4666d856ff 100644 --- a/src/index/LocatedTriples.cpp +++ b/src/index/LocatedTriples.cpp @@ -11,7 +11,6 @@ #include "index/LocatedTriples.h" #include "backports/algorithm.h" -#include "global/RuntimeParameters.h" #include "index/CompressedRelation.h" #include "index/ConstantsIndexBuilding.h" #include "util/ChunkedForLoop.h" @@ -277,11 +276,6 @@ void LocatedTriplesPerBlock::setOriginalMetadata( // the graph info is set to `nullopt`, which means that there is no info. 
static auto updateGraphMetadata(CompressedBlockMetadata& blockMetadata, const LocatedTriples& locatedTriples) { - // Early return if graph metadata updates are disabled. - if (getRuntimeParameter<&RuntimeParameters::disableUpdateGraphMetadata_>()) { - return; - } - // We do not know anything about the triples contained in the block, so we // also cannot know if the `locatedTriples` introduces duplicates. We thus // have to be conservative and assume that there are duplicates. From 7720cdef42aef3768edc7b66906da2503e7b0a7b Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Mon, 2 Feb 2026 12:47:09 +0100 Subject: [PATCH 40/41] Implement unit tests for `loadConfigFromOldIndex` --- src/index/IndexImpl.h | 1 + test/IndexTest.cpp | 44 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 42603b86e5..97a8f70268 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -709,6 +709,7 @@ class IndexImpl { friend class CreatePatternsFixture_createPatterns_Test; FRIEND_TEST(IndexImpl, recomputeStatistics); FRIEND_TEST(IndexImpl, writePatternsToFile); + FRIEND_TEST(IndexImpl, loadConfigFromOldIndex); bool isLiteral(std::string_view object) const; diff --git a/test/IndexTest.cpp b/test/IndexTest.cpp index 506be61ee1..cbfa55a059 100644 --- a/test/IndexTest.cpp +++ b/test/IndexTest.cpp @@ -14,7 +14,9 @@ #include "./util/IdTableHelpers.h" #include "./util/IdTestHelpers.h" #include "./util/TripleComponentTestHelpers.h" +#include "CompilationInfo.h" #include "index/Index.h" +#include "index/IndexFormatVersion.h" #include "index/IndexImpl.h" #include "util/IndexTestHelpers.h" @@ -862,3 +864,45 @@ TEST(IndexImpl, writePatternsToFile) { EXPECT_TRUE(ql::ranges::equal(CompactVectorOfStrings{data}, result, ql::ranges::equal)); } + +// _____________________________________________________________________________ +TEST(IndexImpl, loadConfigFromOldIndex) { + 
auto [directory, cleanup] = makeTemporaryDirectory("loadConfigFromOldIndex"); + auto onDiskBase = directory + "/index"; + IndexImpl other{ad_utility::makeUnlimitedAllocator()}; + other.blocksizePermutationPerColumn() = 1337_B; + nlohmann::json stats; + + Index::NumNormalAndInternal numTriples{42, 1337}; + Index::NumNormalAndInternal numPredicates{9999, 1010}; + Index::NumNormalAndInternal numSubjects{8888, 2020}; + Index::NumNormalAndInternal numObjects{7777, 3030}; + + stats["num-triples"] = numTriples; + stats["num-predicates"] = numPredicates; + stats["num-subjects"] = numSubjects; + stats["num-objects"] = numObjects; + stats["i-just-invented-this"] = "🤠"; + + IndexImpl index{ad_utility::makeUnlimitedAllocator()}; + index.loadConfigFromOldIndex(onDiskBase, other, stats); + EXPECT_EQ(index.getOnDiskBase(), onDiskBase); + EXPECT_EQ(index.getKbName(), other.getKbName()); + EXPECT_EQ(index.numTriples(), numTriples); + EXPECT_EQ(index.numDistinctPredicates(), numPredicates); + EXPECT_EQ(index.numSubjects_, numSubjects); + EXPECT_EQ(index.numObjects_, numObjects); + EXPECT_EQ(index.blocksizePermutationPerColumn(), + other.blocksizePermutationPerColumn()); + EXPECT_EQ(index.configurationJson_, stats); + + // The version written to disk will also have these fields. 
+ stats["git-hash"] = *qlever::version::gitShortHashWithoutLinking.wlock(); + stats["index-format-version"] = qlever::indexFormatVersion; + + std::string jsonFile = onDiskBase + CONFIGURATION_FILE; + std::ifstream in{jsonFile}; + nlohmann::json jsonFromFile; + in >> jsonFromFile; + EXPECT_EQ(stats, jsonFromFile); +} From e50da2ff7b9490a82b5dc53ac5087bbb8799b7d9 Mon Sep 17 00:00:00 2001 From: RobinTF <83676088+RobinTF@users.noreply.github.com> Date: Mon, 2 Feb 2026 18:35:54 +0100 Subject: [PATCH 41/41] Start implementing unit tests for index rebuilds --- src/index/IndexRebuilder.cpp | 35 ++++------ src/index/IndexRebuilderImpl.h | 76 ++++++++++++++++++++++ test/index/CMakeLists.txt | 1 + test/index/IndexRebuilderTest.cpp | 103 ++++++++++++++++++++++++++++++ 4 files changed, 193 insertions(+), 22 deletions(-) create mode 100644 src/index/IndexRebuilderImpl.h create mode 100644 test/index/IndexRebuilderTest.cpp diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp index 3b29db4444..50cd704f27 100644 --- a/src/index/IndexRebuilder.cpp +++ b/src/index/IndexRebuilder.cpp @@ -16,6 +16,7 @@ #include "engine/idTable/IdTable.h" #include "global/Id.h" #include "index/IndexImpl.h" +#include "index/IndexRebuilderImpl.h" #include "index/LocalVocabEntry.h" #include "index/Permutation.h" #include "util/CancellationHandle.h" @@ -25,14 +26,8 @@ #include "util/Log.h" #include "util/ParallelExecutor.h" -namespace { -using CancellationHandle = ad_utility::SharedCancellationHandle; - -// Write a new vocabulary that contains all words from `vocab` plus all -// entries in `entries`. Returns a pair consisting of a vector insertion -// positions (the `VocabIndex` of the `LocalVocabEntry`s position in the old -// `vocab`) and a mapping from old local vocab `Id`s bit representation (for -// cheaper hash functions) to new vocab `Id`s. 
+namespace qlever::indexRebuilder { +// _____________________________________________________________________________ std::tuple, ad_utility::HashMap, std::vector> materializeLocalVocab( @@ -101,9 +96,7 @@ materializeLocalVocab( std::move(flatBlockIndices)); } -// Map old vocab `Id`s to new vocab `Id`s according to the given -// `insertionPositions`. This is the most performance critical code of the -// rebuild. +// _____________________________________________________________________________ AD_ALWAYS_INLINE Id remapVocabId(Id original, const std::vector& insertionPositions) { AD_EXPENSIVE_CHECK( @@ -117,7 +110,7 @@ remapVocabId(Id original, const std::vector& insertionPositions) { VocabIndex::make(original.getVocabIndex().get() + offset)); } -// Remaps a blank node `Id` to another id that's more dense. +// _____________________________________________________________________________ Id remapBlankNodeId(Id original, const std::vector& blankNodeBlocks, uint64_t minBlankNodeIndex) { AD_EXPENSIVE_CHECK( @@ -140,10 +133,7 @@ Id remapBlankNodeId(Id original, const std::vector& blankNodeBlocks, minBlankNodeIndex)); } -// Create a copy of the given `permutation` scanned according to `scanSpec`, -// where all local vocab `Id`s are remapped according to `localVocabMapping` -// and all vocab `Id`s are remapped according to `insertInfo` to create a new -// index where all of these values are all vocab `Id`s in the new vocabulary. +// _____________________________________________________________________________ ad_utility::InputRangeTypeErased> readIndexAndRemap( const Permutation& permutation, const BlockMetadataRanges& blockMetadataRanges, @@ -198,8 +188,7 @@ ad_utility::InputRangeTypeErased> readIndexAndRemap( }}}; } -// Get the number of columns in the given `blockMetadataRanges`. If this cannot -// be determined, return 4 as a safe default. 
+// _____________________________________________________________________________ size_t getNumColumns(const BlockMetadataRanges& blockMetadataRanges) { if (!blockMetadataRanges.empty()) { const auto& first = blockMetadataRanges.at(0); @@ -213,8 +202,7 @@ size_t getNumColumns(const BlockMetadataRanges& blockMetadataRanges) { return 4; } -// Create a `std::packaged_task` that writes a new permutation according to the -// settings of `newIndex`, based on the data of the current index. +// _____________________________________________________________________________ std::packaged_task createPermutationWriterTask( IndexImpl& newIndex, const Permutation& permutation, bool isInternal, const LocatedTriplesSharedState& locatedTriplesSharedState, @@ -241,6 +229,8 @@ std::packaged_task createPermutationWriterTask( &localVocabMapping, &insertionPositions, &blankNodeBlocks, minBlankNodeIndex, &cancellationHandle, additionalColumns = std::move(additionalColumns)]() { + // TODO exchange the multiplicities of col1 and col2 for + // matching permutations before writing the metadata. 
newIndex.createPermutation( numColumns, readIndexAndRemap( @@ -250,7 +240,7 @@ std::packaged_task createPermutationWriterTask( permutation, isInternal); }}; } -} // namespace +} // namespace qlever::indexRebuilder // _____________________________________________________________________________ namespace qlever { @@ -261,8 +251,9 @@ void materializeToIndex( const std::vector< ad_utility::BlankNodeManager::LocalBlankNodeManager::OwnedBlocksEntry>& ownedBlocks, - const CancellationHandle& cancellationHandle, + const ad_utility::SharedCancellationHandle& cancellationHandle, const std::string& logFileName) { + using namespace indexRebuilder; AD_CONTRACT_CHECK(!logFileName.empty(), "Log file name must not be empty"); // Set up logging to file diff --git a/src/index/IndexRebuilderImpl.h b/src/index/IndexRebuilderImpl.h new file mode 100644 index 0000000000..b11fd6090e --- /dev/null +++ b/src/index/IndexRebuilderImpl.h @@ -0,0 +1,76 @@ +// Copyright 2026 The QLever Authors, in particular: +// +// 2026 Robin Textor-Falconi , UFR +// +// UFR = University of Freiburg, Chair of Algorithms and Data Structures + +#ifndef QLEVER_SRC_INDEX_INDEXREBUILDERIMPL_H +#define QLEVER_SRC_INDEX_INDEXREBUILDERIMPL_H + +#include +#include +#include + +#include "engine/idTable/IdTable.h" +#include "global/Id.h" +#include "index/IndexRebuilder.h" +#include "util/CancellationHandle.h" +#include "util/HashMap.h" +#include "util/InputRangeUtils.h" + +namespace qlever::indexRebuilder { + +// Write a new vocabulary that contains all words from `vocab` plus all +// entries in `entries`. Returns a pair consisting of a vector insertion +// positions (the `VocabIndex` of the `LocalVocabEntry`s position in the old +// `vocab`) and a mapping from old local vocab `Id`s bit representation (for +// cheaper hash functions) to new vocab `Id`s. 
+std::tuple, ad_utility::HashMap, + std::vector> +materializeLocalVocab( + const std::vector& entries, + const std::vector< + ad_utility::BlankNodeManager::LocalBlankNodeManager::OwnedBlocksEntry>& + ownedBlocks, + const Index::Vocab& vocab, const std::string& newIndexName); + +// Map old vocab `Id`s to new vocab `Id`s according to the given +// `insertionPositions`. This is the most performance critical code of the +// rebuild. +Id remapVocabId(Id original, const std::vector& insertionPositions); + +// Remaps a blank node `Id` to another id that's more dense. +Id remapBlankNodeId(Id original, const std::vector& blankNodeBlocks, + uint64_t minBlankNodeIndex); + +// Create a copy of the given `permutation` scanned according to `scanSpec`, +// where all local vocab `Id`s are remapped according to `localVocabMapping` +// and all vocab `Id`s are remapped according to `insertInfo` to create a new +// index where all of these values are all vocab `Id`s in the new vocabulary. +ad_utility::InputRangeTypeErased> readIndexAndRemap( + const Permutation& permutation, + const BlockMetadataRanges& blockMetadataRanges, + const LocatedTriplesSharedState& locatedTriplesSharedState, + const ad_utility::HashMap& localVocabMapping, + const std::vector& insertionPositions, + const std::vector& blankNodeBlocks, uint64_t minBlankNodeIndex, + const ad_utility::SharedCancellationHandle& cancellationHandle, + ql::span additionalColumns); + +// Get the number of columns in the given `blockMetadataRanges`. If this cannot +// be determined, return 4 as a safe default. +size_t getNumColumns(const BlockMetadataRanges& blockMetadataRanges); + +// Create a `std::packaged_task` that writes a new permutation according to the +// settings of `newIndex`, based on the data of the current index. 
+std::packaged_task createPermutationWriterTask( + IndexImpl& newIndex, const Permutation& permutation, bool isInternal, + const LocatedTriplesSharedState& locatedTriplesSharedState, + const ad_utility::HashMap& localVocabMapping, + const std::vector& insertionPositions, + const std::vector& blankNodeBlocks, uint64_t minBlankNodeIndex, + const ad_utility::SharedCancellationHandle& cancellationHandle); + +} // namespace qlever::indexRebuilder + +#endif // QLEVER_SRC_INDEX_INDEXREBUILDERIMPL_H diff --git a/test/index/CMakeLists.txt b/test/index/CMakeLists.txt index db9ee415da..89c26b733b 100644 --- a/test/index/CMakeLists.txt +++ b/test/index/CMakeLists.txt @@ -3,3 +3,4 @@ addLinkAndDiscoverTest(PatternCreatorTest index) addLinkAndDiscoverTestSerial(ScanSpecificationTest index) addLinkAndDiscoverTestNoLibs(KeyOrderTest) addLinkAndDiscoverTestNoLibs(EncodedIriManagerTest) +addLinkAndDiscoverTest(IndexRebuilderTest index) diff --git a/test/index/IndexRebuilderTest.cpp b/test/index/IndexRebuilderTest.cpp new file mode 100644 index 0000000000..5fee9832b7 --- /dev/null +++ b/test/index/IndexRebuilderTest.cpp @@ -0,0 +1,103 @@ +// Copyright 2026 The QLever Authors, in particular: +// +// 2026 Robin Textor-Falconi , UFR +// +// UFR = University of Freiburg, Chair of Algorithms and Data Structures + +#include + +#include "../util/IndexTestHelpers.h" +#include "../util/TripleComponentTestHelpers.h" +#include "index/IndexRebuilder.h" +#include "index/IndexRebuilderImpl.h" + +using namespace qlever::indexRebuilder; + +// _____________________________________________________________________________ +TEST(IndexRebuilder, materializeLocalVocab) { + auto oldIndex = ad_utility::testing::makeTestIndex( + "materializeLocalVocab", " . ."); + std::string vocabPrefix = "/tmp/materializeLocalVocab"; + // TODO Cleanup generated test files. 
+ + auto makeVocabEntry = [](std::string_view str) { + return LocalVocabEntry{ad_utility::testing::iri(str)}; + }; + + auto getId = ad_utility::testing::makeGetId(oldIndex); + auto b = makeVocabEntry(""); + auto c = getId(""); + auto d = makeVocabEntry(""); + auto e = getId(""); + auto f = makeVocabEntry(""); + auto g = getId(""); + auto h = makeVocabEntry(""); + auto j = makeVocabEntry(""); + auto k = getId(""); + auto l = makeVocabEntry(""); + std::vector entries{&b, &d, &f, &h, &j, &l}; + using OBE = + ad_utility::BlankNodeManager::LocalBlankNodeManager::OwnedBlocksEntry; + std::vector ownedBlocks{OBE{{}, {4, 42}}, OBE{{}, {7, 77}}}; + + auto [insertionPositions, localVocabMapping, flatBlockIndices] = + materializeLocalVocab(entries, ownedBlocks, oldIndex.getVocab(), + vocabPrefix); + EXPECT_THAT( + insertionPositions, + ::testing::ElementsAre( + c.getVocabIndex(), e.getVocabIndex(), g.getVocabIndex(), + Id::fromBits(h.positionInVocab().upperBound_.get()).getVocabIndex(), + k.getVocabIndex(), + Id::fromBits(l.positionInVocab().upperBound_.get()).getVocabIndex())); + auto toBits = [](const LocalVocabEntry& entry) { + return Id::makeFromLocalVocabIndex(&entry).getBits(); + }; + EXPECT_THAT(localVocabMapping, + ::testing::UnorderedElementsAre( + std::make_pair(toBits(b), + Id::makeFromVocabIndex(VocabIndex::make(1))), + std::make_pair(toBits(d), + Id::makeFromVocabIndex(VocabIndex::make(3))), + std::make_pair(toBits(f), + Id::makeFromVocabIndex(VocabIndex::make(5))), + std::make_pair(toBits(h), + Id::makeFromVocabIndex(VocabIndex::make(7))), + std::make_pair(toBits(j), + Id::makeFromVocabIndex(VocabIndex::make(14))), + std::make_pair(toBits(l), Id::makeFromVocabIndex( + VocabIndex::make(16))))); + EXPECT_THAT(flatBlockIndices, ::testing::ElementsAre(4, 7, 42, 77)); + + // TODO Add tests that the created vocabulary on disk is correct +} + +// _____________________________________________________________________________ +TEST(IndexRebuilder, remapVocabId) { + // TODO 
Add unit tests +} + +// _____________________________________________________________________________ +TEST(IndexRebuilder, remapBlankNodeId) { + // TODO Add unit tests +} + +// _____________________________________________________________________________ +TEST(IndexRebuilder, readIndexAndRemap) { + // TODO Add unit tests +} + +// _____________________________________________________________________________ +TEST(IndexRebuilder, getNumColumns) { + // TODO Add unit tests +} + +// _____________________________________________________________________________ +TEST(IndexRebuilder, createPermutationWriterTask) { + // TODO Add unit tests +} + +// _____________________________________________________________________________ +TEST(IndexRebuilder, materializeToIndex) { + // TODO Add unit tests +}