diff --git a/src/engine/Server.cpp b/src/engine/Server.cpp index 452fa5b49d..a6e2213912 100644 --- a/src/engine/Server.cpp +++ b/src/engine/Server.cpp @@ -28,6 +28,7 @@ #include "engine/SparqlProtocol.h" #include "global/RuntimeParameters.h" #include "index/IndexImpl.h" +#include "index/IndexRebuilder.h" #include "parser/SparqlParser.h" #include "util/AsioHelpers.h" #include "util/Exception.h" @@ -474,6 +475,38 @@ CPP_template_def(typename RequestT, typename ResponseT)( json[nlohmann::json(key)] = std::move(value); } response = createJsonResponse(json, request); + } else if (auto cmd = checkParameter("cmd", "rebuild-index")) { + requireValidAccessToken("rebuild-index"); + + if (rebuildInProgress_.exchange(true)) { + response = createHttpResponseFromString( + "Another rebuild is currently in progress!", + http::status::too_many_requests, request, MediaType::textPlain); + } else { + absl::Cleanup cleanup{[this]() { rebuildInProgress_.store(false); }}; + logCommand(cmd, "rebuilding index"); + auto fileName = + checkParameter("index-name", std::nullopt).value_or("new_index"); + // There is no mechanism to actually cancel the handle. + auto handle = std::make_shared>(); + // We don't directly `co_await` because of lifetime issues (bugs) in the + // Conan setup. 
+ auto coroutine = computeInNewThread( + queryThreadPool_, + [this, &handle, fileName = std::move(fileName)] { + auto logFileName = fileName + ".rebuild-index-log.txt"; + auto [currentSnapshot, localVocabCopy, ownedBlocks] = + index_.deltaTriplesManager() + .getCurrentLocatedTriplesSharedStateWithVocab(); + qlever::materializeToIndex(index_.getImpl(), fileName, + currentSnapshot, localVocabCopy, + ownedBlocks, handle, logFileName); + }, + handle); + co_await std::move(coroutine); + response = + createOkResponse("Done writing", request, MediaType::textPlain); + } } else if (auto cmd = checkParameter("cmd", "write-materialized-view")) { requireValidAccessToken("write-materialized-view"); logCommand(cmd, "write materialized view"); diff --git a/src/engine/Server.h b/src/engine/Server.h index 49adaf8815..33c2f37ee3 100644 --- a/src/engine/Server.h +++ b/src/engine/Server.h @@ -113,6 +113,10 @@ class Server { /// Executor with a single thread that is used to run timers asynchronously. boost::asio::static_thread_pool timerExecutor_{1}; + // Indicates if an index rebuild is currently in progress so that we prevent + // triggering this twice. 
+ std::atomic_bool rebuildInProgress_{false}; + template using Awaitable = boost::asio::awaitable; diff --git a/src/index/CMakeLists.txt b/src/index/CMakeLists.txt index 577af95874..da796a20f2 100644 --- a/src/index/CMakeLists.txt +++ b/src/index/CMakeLists.txt @@ -7,5 +7,5 @@ add_library(index PrefixHeuristic.cpp CompressedRelation.cpp PatternCreator.cpp ScanSpecification.cpp DeltaTriples.cpp LocalVocabEntry.cpp TextScoring.cpp TextScoringEnum.cpp TextIndexReadWrite.cpp - TextIndexBuilder.cpp GraphFilter.cpp) + TextIndexBuilder.cpp GraphFilter.cpp IndexRebuilder.cpp) qlever_target_link_libraries(index util parser vocabulary global) diff --git a/src/index/IndexImpl.cpp b/src/index/IndexImpl.cpp index 71b0681f87..c7c963db03 100644 --- a/src/index/IndexImpl.cpp +++ b/src/index/IndexImpl.cpp @@ -49,9 +49,12 @@ constexpr std::string_view BLANK_NODE_ALLOCATION_START = "num-blank-nodes-total"; // _____________________________________________________________________________ -IndexImpl::IndexImpl(ad_utility::AllocatorWithLimit allocator) +IndexImpl::IndexImpl(ad_utility::AllocatorWithLimit allocator, + bool registerSingleton) : allocator_{std::move(allocator)} { - globalSingletonIndex_ = this; + if (registerSingleton) { + globalSingletonIndex_ = this; + } deltaTriples_.emplace(*this); } @@ -1138,6 +1141,9 @@ void IndexImpl::setKeepTempFiles(bool keepTempFiles) { // _____________________________________________________________________________ bool& IndexImpl::usePatterns() { return usePatterns_; } +// _____________________________________________________________________________ +bool IndexImpl::usePatterns() const { return usePatterns_; } + // _____________________________________________________________________________ bool& IndexImpl::loadAllPermutations() { return loadAllPermutations_; } @@ -1900,6 +1906,22 @@ void IndexImpl::writePatternsToFile() const { statistics); } +// _____________________________________________________________________________ +void 
IndexImpl::loadConfigFromOldIndex(const std::string& newName, + const IndexImpl& other, + const nlohmann::json& newStats) { + setOnDiskBase(newName); + setKbName(other.getKbName()); + blocksizePermutationPerColumn() = other.blocksizePermutationPerColumn(); + configurationJson_ = newStats; + numTriples_ = static_cast(newStats["num-triples"]); + numPredicates_ = + static_cast(newStats["num-predicates"]); + numSubjects_ = static_cast(newStats["num-subjects"]); + numObjects_ = static_cast(newStats["num-objects"]); + writeConfiguration(); +} + // _____________________________________________________________________________ void IndexImpl::countDistinct(std::optional& lastId, size_t& counter, const IdTable& table) { diff --git a/src/index/IndexImpl.h b/src/index/IndexImpl.h index 49343fbf38..97a8f70268 100644 --- a/src/index/IndexImpl.h +++ b/src/index/IndexImpl.h @@ -208,7 +208,8 @@ class IndexImpl { std::optional deltaTriples_; public: - explicit IndexImpl(ad_utility::AllocatorWithLimit allocator); + explicit IndexImpl(ad_utility::AllocatorWithLimit allocator, + bool registerSingleton = true); // Forbid copying. 
IndexImpl& operator=(const IndexImpl&) = delete; @@ -273,6 +274,10 @@ class IndexImpl { const auto& getScoreData() const { return scoreData_; } + const ad_utility::AllocatorWithLimit& allocator() const { + return allocator_; + }; + ad_utility::BlankNodeManager* getBlankNodeManager() const; DeltaTriplesManager& deltaTriplesManager() { return deltaTriples_.value(); } @@ -442,6 +447,8 @@ class IndexImpl { bool& usePatterns(); + bool usePatterns() const; + bool& loadAllPermutations(); bool& doNotLoadPermutations(); @@ -464,6 +471,10 @@ class IndexImpl { return blocksizePermutationPerColumn_; } + const ad_utility::MemorySize& blocksizePermutationPerColumn() const { + return blocksizePermutationPerColumn_; + } + void setOnDiskBase(const std::string& onDiskBase); void setSettingsFile(const std::string& filename); @@ -698,6 +709,7 @@ class IndexImpl { friend class CreatePatternsFixture_createPatterns_Test; FRIEND_TEST(IndexImpl, recomputeStatistics); FRIEND_TEST(IndexImpl, writePatternsToFile); + FRIEND_TEST(IndexImpl, loadConfigFromOldIndex); bool isLiteral(std::string_view object) const; @@ -862,6 +874,13 @@ class IndexImpl { void storeTextScoringParamsInConfiguration(TextScoringMetric scoringMetric, float b, float k); + // Overwrite the config of this instance of `IndexImpl` with the config of + // `other`, adjusting the name to `newName` and the statistics to + // `newStats`. + void loadConfigFromOldIndex(const std::string& newName, + const IndexImpl& other, + const nlohmann::json& newStats); + // Write the stored in-memory patterns to a pattern file. void writePatternsToFile() const; diff --git a/src/index/IndexRebuilder.cpp b/src/index/IndexRebuilder.cpp new file mode 100644 index 0000000000..50cd704f27 --- /dev/null +++ b/src/index/IndexRebuilder.cpp @@ -0,0 +1,337 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. 
+// Author: Robin Textor-Falconi + +#include "index/IndexRebuilder.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "backports/algorithm.h" +#include "engine/idTable/IdTable.h" +#include "global/Id.h" +#include "index/IndexImpl.h" +#include "index/IndexRebuilderImpl.h" +#include "index/LocalVocabEntry.h" +#include "index/Permutation.h" +#include "util/CancellationHandle.h" +#include "util/Exception.h" +#include "util/HashMap.h" +#include "util/InputRangeUtils.h" +#include "util/Log.h" +#include "util/ParallelExecutor.h" + +namespace qlever::indexRebuilder { +// _____________________________________________________________________________ +std::tuple, ad_utility::HashMap, + std::vector> +materializeLocalVocab( + const std::vector& entries, + const std::vector< + ad_utility::BlankNodeManager::LocalBlankNodeManager::OwnedBlocksEntry>& + ownedBlocks, + const Index::Vocab& vocab, const std::string& newIndexName) { + size_t newWordCount = 0; + std::vector> insertInfo; + insertInfo.reserve(entries.size()); + + ad_utility::HashMap localVocabMapping; + + for (auto* entry : entries) { + const auto& [lower, upper] = entry->positionInVocab(); + AD_CORRECTNESS_CHECK(lower == upper); + Id id = Id::fromBits(upper.get()); + AD_CORRECTNESS_CHECK(id.getDatatype() == Datatype::VocabIndex); + insertInfo.emplace_back(id.getVocabIndex(), + entry->asLiteralOrIri().toStringRepresentation(), + Id::makeFromLocalVocabIndex(entry)); + } + ql::ranges::sort(insertInfo, [](const auto& tupleA, const auto& tupleB) { + return std::tie(std::get(tupleA).get(), std::get(tupleA)) < + std::tie(std::get(tupleB).get(), std::get(tupleB)); + }); + + auto vocabWriter = vocab.makeWordWriterPtr(newIndexName + ".vocabulary"); + for (size_t vocabIndex = 0; vocabIndex < vocab.size(); ++vocabIndex) { + auto actualIndex = VocabIndex::make(vocabIndex); + while (insertInfo.size() > newWordCount && + std::get(insertInfo.at(newWordCount)) == actualIndex) { + 
AD_CORRECTNESS_CHECK(std::get(insertInfo.at(newWordCount)) < + Id::makeFromVocabIndex(actualIndex)); + auto word = std::get(insertInfo.at(newWordCount)); + auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); + localVocabMapping.emplace( + std::get(insertInfo.at(newWordCount)).getBits(), + Id::makeFromVocabIndex(VocabIndex::make(newIndex))); + newWordCount++; + } + auto word = vocab[actualIndex]; + auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); + AD_CORRECTNESS_CHECK(newIndex == vocabIndex + newWordCount); + } + + for (const auto& [_, word, id] : insertInfo | ql::views::drop(newWordCount)) { + auto newIndex = (*vocabWriter)(word, vocab.shouldBeExternalized(word)); + localVocabMapping.emplace( + id.getBits(), Id::makeFromVocabIndex(VocabIndex::make(newIndex))); + } + std::vector insertionPositions; + insertionPositions.reserve(insertInfo.size()); + for (const auto& [vocabIndex, _, __] : insertInfo) { + insertionPositions.push_back(vocabIndex); + } + std::vector flatBlockIndices; + for (const auto& ownedBlockEntry : ownedBlocks) { + ql::ranges::copy(ownedBlockEntry.blockIndices_, + std::back_inserter(flatBlockIndices)); + } + ql::ranges::sort(flatBlockIndices); + return std::make_tuple(std::move(insertionPositions), + std::move(localVocabMapping), + std::move(flatBlockIndices)); +} + +// _____________________________________________________________________________ +AD_ALWAYS_INLINE Id +remapVocabId(Id original, const std::vector& insertionPositions) { + AD_EXPENSIVE_CHECK( + original.getDatatype() == Datatype::VocabIndex, + "Only ids resembling a vocab index can be remapped with this function."); + size_t offset = ql::ranges::distance( + insertionPositions.begin(), + ql::ranges::upper_bound(insertionPositions, original.getVocabIndex(), + std::less{})); + return Id::makeFromVocabIndex( + VocabIndex::make(original.getVocabIndex().get() + offset)); +} + +// 
_____________________________________________________________________________ +Id remapBlankNodeId(Id original, const std::vector& blankNodeBlocks, + uint64_t minBlankNodeIndex) { + AD_EXPENSIVE_CHECK( + original.getDatatype() == Datatype::BlankNodeIndex, + "Only ids resembling a blank node index can be remapped with this " + "function."); + auto rawId = original.getBlankNodeIndex().get(); + if (rawId < minBlankNodeIndex) { + return original; + } + auto normalizedId = rawId - minBlankNodeIndex; + auto blockIndex = normalizedId / ad_utility::BlankNodeManager::blockSize_; + auto it = ql::ranges::lower_bound(blankNodeBlocks, blockIndex); + AD_EXPENSIVE_CHECK(it != blankNodeBlocks.end() && *it == blockIndex, + "Could not find block index of blank node."); + return Id::makeFromBlankNodeIndex(BlankNodeIndex::make( + (normalizedId % ad_utility::BlankNodeManager::blockSize_) + + ql::ranges::distance(blankNodeBlocks.begin(), it) * + ad_utility::BlankNodeManager::blockSize_ + + minBlankNodeIndex)); +} + +// _____________________________________________________________________________ +ad_utility::InputRangeTypeErased> readIndexAndRemap( + const Permutation& permutation, + const BlockMetadataRanges& blockMetadataRanges, + const LocatedTriplesSharedState& locatedTriplesSharedState, + const ad_utility::HashMap& localVocabMapping, + const std::vector& insertionPositions, + const std::vector& blankNodeBlocks, uint64_t minBlankNodeIndex, + const ad_utility::SharedCancellationHandle& cancellationHandle, + ql::span additionalColumns) { + AD_CORRECTNESS_CHECK(ql::ranges::is_sorted(insertionPositions)); + AD_CORRECTNESS_CHECK(ql::ranges::is_sorted(blankNodeBlocks)); + Permutation::ScanSpecAndBlocks scanSpecAndBlocks{ + ScanSpecification{std::nullopt, std::nullopt, std::nullopt}, + blockMetadataRanges}; + auto fullScan = permutation.lazyScan( + scanSpecAndBlocks, std::nullopt, additionalColumns, cancellationHandle, + *locatedTriplesSharedState, LimitOffsetClause{}); + + return 
ad_utility::InputRangeTypeErased{ + ad_utility::CachingTransformInputRange{ + std::move(fullScan), + [&localVocabMapping, &insertionPositions, &blankNodeBlocks, + minBlankNodeIndex](IdTable& idTable) { + // TODO process columns in parallel. + auto allCols = idTable.getColumns(); + // Extra columns beyond the graph column only contain integers (or + // undefined for triples added via UPDATE) and thus don't need to be + // remapped. + constexpr size_t REGULAR_COLUMNS = 4; + for (auto col : allCols | ::ranges::views::take(REGULAR_COLUMNS)) { + for (Id& id : col) { + // TODO Experiment with caching the last remapped id + // and reusing it if the same id appears again. See if that + // improves performance or if it makes it worse. + if (id.getDatatype() == Datatype::VocabIndex) [[likely]] { + id = remapVocabId(id, insertionPositions); + } else if (id.getDatatype() == Datatype::LocalVocabIndex) { + id = localVocabMapping.at(id.getBits()); + } else if (id.getDatatype() == Datatype::BlankNodeIndex) { + id = remapBlankNodeId(id, blankNodeBlocks, minBlankNodeIndex); + } + } + } + AD_EXPENSIVE_CHECK(ql::ranges::all_of( + allCols | ::ranges::views::drop(REGULAR_COLUMNS), [](auto col) { + return ql::ranges::all_of(col, [](Id id) { + return id.getDatatype() == Datatype::Int || + id.isUndefined(); + }); + })); + return IdTableStatic<0>{std::move(idTable)}; + }}}; +} + +// _____________________________________________________________________________ +size_t getNumColumns(const BlockMetadataRanges& blockMetadataRanges) { + if (!blockMetadataRanges.empty()) { + const auto& first = blockMetadataRanges.at(0); + if (!first.empty()) { + const auto& offsets = first[0].offsetsAndCompressedSize_; + if (offsets.has_value()) { + return offsets.value().size(); + } + } + } + return 4; +} + +// _____________________________________________________________________________ +std::packaged_task createPermutationWriterTask( + IndexImpl& newIndex, const Permutation& permutation, bool isInternal, + 
const LocatedTriplesSharedState& locatedTriplesSharedState, + const ad_utility::HashMap& localVocabMapping, + const std::vector& insertionPositions, + const std::vector& blankNodeBlocks, uint64_t minBlankNodeIndex, + const ad_utility::SharedCancellationHandle& cancellationHandle) { + auto blockMetadataRanges = permutation.getAugmentedMetadataForPermutation( + *locatedTriplesSharedState); + size_t numColumns = getNumColumns(blockMetadataRanges); + std::vector additionalColumns; + additionalColumns.push_back(ADDITIONAL_COLUMN_GRAPH_ID); + for (ColumnIndex col : {ADDITIONAL_COLUMN_INDEX_SUBJECT_PATTERN, + ADDITIONAL_COLUMN_INDEX_OBJECT_PATTERN}) { + if (additionalColumns.size() >= numColumns - 3) { + break; + } + additionalColumns.push_back(col); + } + AD_CORRECTNESS_CHECK(additionalColumns.size() == numColumns - 3); + return std::packaged_task{ + [numColumns, blockMetadataRanges = std::move(blockMetadataRanges), + &newIndex, &permutation, isInternal, &locatedTriplesSharedState, + &localVocabMapping, &insertionPositions, &blankNodeBlocks, + minBlankNodeIndex, &cancellationHandle, + additionalColumns = std::move(additionalColumns)]() { + // TODO exchange the multiplicities of col1 and col2 for + // matching permutations before writing the metadata. 
+ newIndex.createPermutation( + numColumns, + readIndexAndRemap( + permutation, blockMetadataRanges, locatedTriplesSharedState, + localVocabMapping, insertionPositions, blankNodeBlocks, + minBlankNodeIndex, cancellationHandle, additionalColumns), + permutation, isInternal); + }}; +} +} // namespace qlever::indexRebuilder + +// _____________________________________________________________________________ +namespace qlever { +void materializeToIndex( + const IndexImpl& index, const std::string& newIndexName, + const LocatedTriplesSharedState& locatedTriplesSharedState, + const std::vector& entries, + const std::vector< + ad_utility::BlankNodeManager::LocalBlankNodeManager::OwnedBlocksEntry>& + ownedBlocks, + const ad_utility::SharedCancellationHandle& cancellationHandle, + const std::string& logFileName) { + using namespace indexRebuilder; + AD_CONTRACT_CHECK(!logFileName.empty(), "Log file name must not be empty"); + + // Set up logging to file + std::ofstream logFile{logFileName}; + AD_CORRECTNESS_CHECK(logFile.is_open(), + "Failed to open log file: " + logFileName); + + // Macro for rebuild-specific logging with the same syntax as AD_LOG_INFO +#define REBUILD_LOG_INFO \ + logFile << ad_utility::Log::getTimeStamp() << " - INFO: " + + REBUILD_LOG_INFO << "Rebuilding index from current data (including updates)" + << std::endl; + + REBUILD_LOG_INFO << "Writing new vocabulary ..." << std::endl; + + const auto& [insertionPositions, localVocabMapping, blankNodeBlocks] = + materializeLocalVocab(entries, ownedBlocks, index.getVocab(), + newIndexName); + + REBUILD_LOG_INFO << "Recomputing statistics ..." << std::endl; + + auto newStats = index.recomputeStatistics(locatedTriplesSharedState); + + auto minBlankNodeIndex = index.getBlankNodeManager()->minIndex_; + + // Set newer lower bound for dynamic blank node indices. 
+ newStats["num-blank-nodes-total"] = + minBlankNodeIndex + + blankNodeBlocks.size() * ad_utility::BlankNodeManager::blockSize_; + + IndexImpl newIndex{index.allocator(), false}; + newIndex.loadConfigFromOldIndex(newIndexName, index, newStats); + + REBUILD_LOG_INFO << "Writing new permutations ..." << std::endl; + + std::vector> tasks; + + if (index.usePatterns()) { + tasks.push_back( + std::packaged_task{[&newIndex, &index, &insertionPositions]() { + newIndex.getPatterns() = index.getPatterns().cloneAndRemap( + [&insertionPositions](const Id& oldId) { + return remapVocabId(oldId, insertionPositions); + }); + newIndex.writePatternsToFile(); + }}); + } + + if (index.hasAllPermutations()) { + using enum Permutation::Enum; + for (auto permutation : {SPO, SOP, OPS, OSP}) { + const auto& actualPermutation = index.getPermutation(permutation); + tasks.push_back(createPermutationWriterTask( + newIndex, actualPermutation, false, locatedTriplesSharedState, + localVocabMapping, insertionPositions, blankNodeBlocks, + minBlankNodeIndex, cancellationHandle)); + } + } + + for (auto permutation : Permutation::INTERNAL) { + const auto& actualPermutation = index.getPermutation(permutation); + const auto& internalPermutation = actualPermutation.internalPermutation(); + tasks.push_back(createPermutationWriterTask( + newIndex, internalPermutation, true, locatedTriplesSharedState, + localVocabMapping, insertionPositions, blankNodeBlocks, + minBlankNodeIndex, cancellationHandle)); + tasks.push_back(createPermutationWriterTask( + newIndex, actualPermutation, false, locatedTriplesSharedState, + localVocabMapping, insertionPositions, blankNodeBlocks, + minBlankNodeIndex, cancellationHandle)); + } + + ad_utility::runTasksInParallel(std::move(tasks)); + + REBUILD_LOG_INFO << "Index rebuild completed" << std::endl; + +#undef REBUILD_LOG_INFO +} + +} // namespace qlever diff --git a/src/index/IndexRebuilder.h b/src/index/IndexRebuilder.h new file mode 100644 index 0000000000..e5dc102368 --- 
/dev/null +++ b/src/index/IndexRebuilder.h @@ -0,0 +1,32 @@ +// Copyright 2025, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Robin Textor-Falconi + +#ifndef QLEVER_SRC_INDEX_INDEXREBUILDER_H +#define QLEVER_SRC_INDEX_INDEXREBUILDER_H + +#include +#include + +#include "global/IndexTypes.h" +#include "index/DeltaTriples.h" +#include "index/IndexImpl.h" +#include "util/BlankNodeManager.h" +#include "util/CancellationHandle.h" + +namespace qlever { + +// Write a new index `newIndexName` from `index` and its updates; log to file. +void materializeToIndex( + const IndexImpl& index, const std::string& newIndexName, + const LocatedTriplesSharedState& locatedTriplesSharedState, + const std::vector& entries, + const std::vector< + ad_utility::BlankNodeManager::LocalBlankNodeManager::OwnedBlocksEntry>& + ownedBlocks, + const ad_utility::SharedCancellationHandle& cancellationHandle, + const std::string& logFileName); + +} // namespace qlever + +#endif // QLEVER_SRC_INDEX_INDEXREBUILDER_H diff --git a/src/index/IndexRebuilderImpl.h b/src/index/IndexRebuilderImpl.h new file mode 100644 index 0000000000..b11fd6090e --- /dev/null +++ b/src/index/IndexRebuilderImpl.h @@ -0,0 +1,76 @@ +// Copyright 2026 The QLever Authors, in particular: +// +// 2026 Robin Textor-Falconi , UFR +// +// UFR = University of Freiburg, Chair of Algorithms and Data Structures + +#ifndef QLEVER_SRC_INDEX_INDEXREBUILDERIMPL_H +#define QLEVER_SRC_INDEX_INDEXREBUILDERIMPL_H + +#include +#include +#include + +#include "engine/idTable/IdTable.h" +#include "global/Id.h" +#include "index/IndexRebuilder.h" +#include "util/CancellationHandle.h" +#include "util/HashMap.h" +#include "util/InputRangeUtils.h" + +namespace qlever::indexRebuilder { + +// Write a new vocabulary that contains all words from `vocab` plus all +// entries in `entries`.
Returns a tuple of: the sorted insertion positions +// (the `VocabIndex` of each `LocalVocabEntry`'s position in the old `vocab`), +// a mapping from the bit representation of old local vocab `Id`s (for cheaper +// hash functions) to new vocab `Id`s, and the sorted blank node block indices. +std::tuple, ad_utility::HashMap, + std::vector> +materializeLocalVocab( + const std::vector& entries, + const std::vector< + ad_utility::BlankNodeManager::LocalBlankNodeManager::OwnedBlocksEntry>& + ownedBlocks, + const Index::Vocab& vocab, const std::string& newIndexName); + +// Map old vocab `Id`s to new vocab `Id`s according to the given +// `insertionPositions`. This is the most performance critical code of the +// rebuild. +Id remapVocabId(Id original, const std::vector& insertionPositions); + +// Remaps a blank node `Id` to another id that's more dense. +Id remapBlankNodeId(Id original, const std::vector& blankNodeBlocks, + uint64_t minBlankNodeIndex); + +// Return a range over the tables of a full scan of `permutation` (limited to +// `blockMetadataRanges`), where local vocab `Id`s are remapped according to +// `localVocabMapping`, vocab `Id`s according to `insertionPositions` and +// blank node `Id`s according to `blankNodeBlocks` and `minBlankNodeIndex`. +ad_utility::InputRangeTypeErased> readIndexAndRemap( + const Permutation& permutation, + const BlockMetadataRanges& blockMetadataRanges, + const LocatedTriplesSharedState& locatedTriplesSharedState, + const ad_utility::HashMap& localVocabMapping, + const std::vector& insertionPositions, + const std::vector& blankNodeBlocks, uint64_t minBlankNodeIndex, + const ad_utility::SharedCancellationHandle& cancellationHandle, + ql::span additionalColumns); + +// Get the number of columns in the given `blockMetadataRanges`. If this cannot +// be determined, return 4 as a safe default.
+size_t getNumColumns(const BlockMetadataRanges& blockMetadataRanges); + +// Create a `std::packaged_task` that writes a new permutation according to the +// settings of `newIndex`, based on the data of the current index. +std::packaged_task createPermutationWriterTask( + IndexImpl& newIndex, const Permutation& permutation, bool isInternal, + const LocatedTriplesSharedState& locatedTriplesSharedState, + const ad_utility::HashMap& localVocabMapping, + const std::vector& insertionPositions, + const std::vector& blankNodeBlocks, uint64_t minBlankNodeIndex, + const ad_utility::SharedCancellationHandle& cancellationHandle); + +} // namespace qlever::indexRebuilder + +#endif // QLEVER_SRC_INDEX_INDEXREBUILDERIMPL_H diff --git a/test/IndexTest.cpp b/test/IndexTest.cpp index 506be61ee1..cbfa55a059 100644 --- a/test/IndexTest.cpp +++ b/test/IndexTest.cpp @@ -14,7 +14,9 @@ #include "./util/IdTableHelpers.h" #include "./util/IdTestHelpers.h" #include "./util/TripleComponentTestHelpers.h" +#include "CompilationInfo.h" #include "index/Index.h" +#include "index/IndexFormatVersion.h" #include "index/IndexImpl.h" #include "util/IndexTestHelpers.h" @@ -862,3 +864,45 @@ TEST(IndexImpl, writePatternsToFile) { EXPECT_TRUE(ql::ranges::equal(CompactVectorOfStrings{data}, result, ql::ranges::equal)); } + +// _____________________________________________________________________________ +TEST(IndexImpl, loadConfigFromOldIndex) { + auto [directory, cleanup] = makeTemporaryDirectory("loadConfigFromOldIndex"); + auto onDiskBase = directory + "/index"; + IndexImpl other{ad_utility::makeUnlimitedAllocator()}; + other.blocksizePermutationPerColumn() = 1337_B; + nlohmann::json stats; + + Index::NumNormalAndInternal numTriples{42, 1337}; + Index::NumNormalAndInternal numPredicates{9999, 1010}; + Index::NumNormalAndInternal numSubjects{8888, 2020}; + Index::NumNormalAndInternal numObjects{7777, 3030}; + + stats["num-triples"] = numTriples; + stats["num-predicates"] = numPredicates; + 
stats["num-subjects"] = numSubjects; + stats["num-objects"] = numObjects; + stats["i-just-invented-this"] = "🤠"; + + IndexImpl index{ad_utility::makeUnlimitedAllocator()}; + index.loadConfigFromOldIndex(onDiskBase, other, stats); + EXPECT_EQ(index.getOnDiskBase(), onDiskBase); + EXPECT_EQ(index.getKbName(), other.getKbName()); + EXPECT_EQ(index.numTriples(), numTriples); + EXPECT_EQ(index.numDistinctPredicates(), numPredicates); + EXPECT_EQ(index.numSubjects_, numSubjects); + EXPECT_EQ(index.numObjects_, numObjects); + EXPECT_EQ(index.blocksizePermutationPerColumn(), + other.blocksizePermutationPerColumn()); + EXPECT_EQ(index.configurationJson_, stats); + + // The version written to disk will also have these fields. + stats["git-hash"] = *qlever::version::gitShortHashWithoutLinking.wlock(); + stats["index-format-version"] = qlever::indexFormatVersion; + + std::string jsonFile = onDiskBase + CONFIGURATION_FILE; + std::ifstream in{jsonFile}; + nlohmann::json jsonFromFile; + in >> jsonFromFile; + EXPECT_EQ(stats, jsonFromFile); +} diff --git a/test/index/CMakeLists.txt b/test/index/CMakeLists.txt index db9ee415da..89c26b733b 100644 --- a/test/index/CMakeLists.txt +++ b/test/index/CMakeLists.txt @@ -3,3 +3,4 @@ addLinkAndDiscoverTest(PatternCreatorTest index) addLinkAndDiscoverTestSerial(ScanSpecificationTest index) addLinkAndDiscoverTestNoLibs(KeyOrderTest) addLinkAndDiscoverTestNoLibs(EncodedIriManagerTest) +addLinkAndDiscoverTest(IndexRebuilderTest index) diff --git a/test/index/IndexRebuilderTest.cpp b/test/index/IndexRebuilderTest.cpp new file mode 100644 index 0000000000..5fee9832b7 --- /dev/null +++ b/test/index/IndexRebuilderTest.cpp @@ -0,0 +1,103 @@ +// Copyright 2026 The QLever Authors, in particular: +// +// 2026 Robin Textor-Falconi , UFR +// +// UFR = University of Freiburg, Chair of Algorithms and Data Structures + +#include + +#include "../util/IndexTestHelpers.h" +#include "../util/TripleComponentTestHelpers.h" +#include "index/IndexRebuilder.h" 
+#include "index/IndexRebuilderImpl.h" + +using namespace qlever::indexRebuilder; + +// _____________________________________________________________________________ +TEST(IndexRebuilder, materializeLocalVocab) { + auto oldIndex = ad_utility::testing::makeTestIndex( + "materializeLocalVocab", " . ."); + std::string vocabPrefix = "/tmp/materializeLocalVocab"; + // TODO Cleanup generated test files. + + auto makeVocabEntry = [](std::string_view str) { + return LocalVocabEntry{ad_utility::testing::iri(str)}; + }; + + auto getId = ad_utility::testing::makeGetId(oldIndex); + auto b = makeVocabEntry(""); + auto c = getId(""); + auto d = makeVocabEntry(""); + auto e = getId(""); + auto f = makeVocabEntry(""); + auto g = getId(""); + auto h = makeVocabEntry(""); + auto j = makeVocabEntry(""); + auto k = getId(""); + auto l = makeVocabEntry(""); + std::vector entries{&b, &d, &f, &h, &j, &l}; + using OBE = + ad_utility::BlankNodeManager::LocalBlankNodeManager::OwnedBlocksEntry; + std::vector ownedBlocks{OBE{{}, {4, 42}}, OBE{{}, {7, 77}}}; + + auto [insertionPositions, localVocabMapping, flatBlockIndices] = + materializeLocalVocab(entries, ownedBlocks, oldIndex.getVocab(), + vocabPrefix); + EXPECT_THAT( + insertionPositions, + ::testing::ElementsAre( + c.getVocabIndex(), e.getVocabIndex(), g.getVocabIndex(), + Id::fromBits(h.positionInVocab().upperBound_.get()).getVocabIndex(), + k.getVocabIndex(), + Id::fromBits(l.positionInVocab().upperBound_.get()).getVocabIndex())); + auto toBits = [](const LocalVocabEntry& entry) { + return Id::makeFromLocalVocabIndex(&entry).getBits(); + }; + EXPECT_THAT(localVocabMapping, + ::testing::UnorderedElementsAre( + std::make_pair(toBits(b), + Id::makeFromVocabIndex(VocabIndex::make(1))), + std::make_pair(toBits(d), + Id::makeFromVocabIndex(VocabIndex::make(3))), + std::make_pair(toBits(f), + Id::makeFromVocabIndex(VocabIndex::make(5))), + std::make_pair(toBits(h), + Id::makeFromVocabIndex(VocabIndex::make(7))), + std::make_pair(toBits(j), + 
Id::makeFromVocabIndex(VocabIndex::make(14))), + std::make_pair(toBits(l), Id::makeFromVocabIndex( + VocabIndex::make(16))))); + EXPECT_THAT(flatBlockIndices, ::testing::ElementsAre(4, 7, 42, 77)); + + // TODO Add tests that the created vocabulary on disk is correct +} + +// _____________________________________________________________________________ +TEST(IndexRebuilder, remapVocabId) { + // TODO Add unit tests +} + +// _____________________________________________________________________________ +TEST(IndexRebuilder, remapBlankNodeId) { + // TODO Add unit tests +} + +// _____________________________________________________________________________ +TEST(IndexRebuilder, readIndexAndRemap) { + // TODO Add unit tests +} + +// _____________________________________________________________________________ +TEST(IndexRebuilder, getNumColumns) { + // TODO Add unit tests +} + +// _____________________________________________________________________________ +TEST(IndexRebuilder, createPermutationWriterTask) { + // TODO Add unit tests +} + +// _____________________________________________________________________________ +TEST(IndexRebuilder, materializeToIndex) { + // TODO Add unit tests +}