diff --git a/src/engine/AddCombinedRowToTable.h b/src/engine/AddCombinedRowToTable.h index e7a4fc2e4f..9437cd950b 100644 --- a/src/engine/AddCombinedRowToTable.h +++ b/src/engine/AddCombinedRowToTable.h @@ -259,6 +259,14 @@ class AddCombinedRowToIdTable { LocalVocab& localVocab() { return mergedVocab_; } + // Move both the result table and local vocab out as an IdTableVocabPair. + // This is a convenience method for the common pattern of moving both out. + auto toIdTableVocabPair() && { + flush(); + return Result::IdTableVocabPair{std::move(resultTable_), + std::move(mergedVocab_)}; + } + // Disable copying and moving, it is currently not needed and makes it harder // to reason about AddCombinedRowToIdTable(const AddCombinedRowToIdTable&) = delete; diff --git a/src/engine/IndexScan.cpp b/src/engine/IndexScan.cpp index 9dad45f671..f6e90efa89 100644 --- a/src/engine/IndexScan.cpp +++ b/src/engine/IndexScan.cpp @@ -744,6 +744,46 @@ std::pair IndexScan::prefilterTables( createPrefilteredIndexScanSide(state)}; } +// _____________________________________________________________________________ +Result::LazyResult IndexScan::createPrefilteredJoinSideForOptional( + std::shared_ptr innerState) { + using LoopControl = ad_utility::LoopControl; + + auto range = ad_utility::InputRangeFromLoopControlGet{ + [state = std::move(innerState)]() mutable { + // For OPTIONAL, we always re-yield ALL input, never filter anything + // This is the key difference from regular JOIN + if (!state->iterator_.has_value()) { + state->iterator_ = state->generator_.begin(); + } + + // Just pass through the entire input stream + return LoopControl::breakWithYieldAll(ql::ranges::subrange( + state->iterator_.value(), state->generator_.end())); + }}; + return Result::LazyResult{std::move(range)}; +} + +// _____________________________________________________________________________ +std::pair +IndexScan::prefilterTablesForOptional(Result::LazyResult input, + ColumnIndex joinColumn) { + AD_CORRECTNESS_CHECK(numVariables_ <= 3 && numVariables_ > 0); + auto metaBlocks = getMetadataForScan(); + + if (!metaBlocks.has_value()) { + // Return empty results + return {Result::LazyResult{}, Result::LazyResult{}}; + } + + auto state = std::make_shared(SharedGeneratorState{ + std::move(input), joinColumn, std::move(metaBlocks.value())}); + // Use the OPTIONAL version for the left side (never filters) + // and the regular version for the right side (still prefilters) + return {createPrefilteredJoinSideForOptional(state), + createPrefilteredIndexScanSide(state)}; +} + // _____________________________________________________________________________ std::unique_ptr IndexScan::cloneImpl() const { return std::make_unique( diff --git a/src/engine/IndexScan.h b/src/engine/IndexScan.h index ed3f318a2e..220d31f998 100644 --- a/src/engine/IndexScan.h +++ b/src/engine/IndexScan.h @@ -130,6 +130,14 @@ class IndexScan final : public Operation { std::pair prefilterTables( Result::LazyResult input, ColumnIndex joinColumn); + // Similar to `prefilterTables`, but for OPTIONAL semantics: The first + // generator re-yields ALL input (never skips any), while the second generator + // still yields only the matching prefiltered blocks. This ensures that + // OPTIONAL joins produce output for all left rows, even when they don't + // match. + std::pair prefilterTablesForOptional( + Result::LazyResult input, ColumnIndex joinColumn); + private: // Implementation detail that allows to consume a lazy range from two other // cooperating ranges. Needs to be forward declared as it is used by @@ -147,6 +155,11 @@ class IndexScan final : public Operation { Result::LazyResult createPrefilteredIndexScanSide( std::shared_ptr innerState); + // Helper function for OPTIONAL semantics: creates a lazy range that re-yields + // ALL input without filtering (even inputs that don't have matching blocks). + static Result::LazyResult createPrefilteredJoinSideForOptional( + std::shared_ptr innerState); + // TODO Make the `getSizeEstimateBeforeLimit()` function `const` for // ALL the `Operations`. uint64_t getSizeEstimateBeforeLimit() override { return sizeEstimate_; } @@ -208,6 +221,17 @@ class IndexScan final : public Operation { // Retrieve the `Permutation` entity for this `IndexScan`. const Permutation& permutation() const; + // Access the metadata and blocks for this scan. Used by join operations to + // perform block-level prefiltering. Returns `std::nullopt` if the scan + // doesn't have metadata (e.g., for very small relations). + std::optional getMetadataForScan() const; + + // Get a lazy scan that only reads the specified blocks. Used by join + // operations to scan only the prefiltered blocks. + CompressedRelationReader::IdTableGeneratorInputRange getLazyScan( + std::optional> blocks = + std::nullopt) const; + private: std::unique_ptr cloneImpl() const override; @@ -253,13 +277,6 @@ class IndexScan final : public Operation { // `Permutation` class. ScanSpecAndBlocks getScanSpecAndBlocks() const; - // Helper functions for the public `getLazyScanFor...` methods and - // `chunkedIndexScan` (see above). - CompressedRelationReader::IdTableGeneratorInputRange getLazyScan( - std::optional> blocks = - std::nullopt) const; - std::optional getMetadataForScan() const; - // If the `varsToKeep_` member is set, meaning that this `IndexScan` only // returns a subset of this actual columns, return the subset of columns that // has to be applied to the "full" result (without any columns stripped) to diff --git a/src/engine/JoinHelpers.h b/src/engine/JoinHelpers.h index 7387b6f4dd..5382f0ad9a 100644 --- a/src/engine/JoinHelpers.h +++ b/src/engine/JoinHelpers.h @@ -29,6 +29,9 @@ static constexpr size_t CHUNK_SIZE = 100'000; using namespace ad_utility; +// Forward declaration for getRowAdderForJoin +class Operation; + using OptionalPermutation = std::optional>; // _____________________________________________________________________________ @@ -123,6 +126,39 @@ CPP_template_2(typename ActionT)( }); } +// Helper function to create a Result from an action, either lazy or +// materialized depending on the requestLaziness parameter. The action is +// expected to be a callable that takes a callback and returns an +// IdTableVocabPair. An optional permutation can be applied to the result. +template +inline Result createResultFromAction(bool requestLaziness, Action&& action, + GetSortedOn&& getSortedOn, + OptionalPermutation permutation = {}) { + if (requestLaziness) { + return {runLazyJoinAndConvertToGenerator(std::forward(action), + std::move(permutation)), + getSortedOn()}; + } else { + auto [idTable, localVocab] = action(ad_utility::noop); + applyPermutation(idTable, permutation); + return {std::move(idTable), getSortedOn(), std::move(localVocab)}; + } +} + +// Helper function to create an AddCombinedRowToIdTable for join operations. +// This encapsulates the common pattern of constructing the row adder with +// parameters derived from the operation. +inline auto getRowAdderForJoin( + const Operation& op, size_t numJoinColumns, bool keepJoinColumns, + AddCombinedRowToIdTable::BlockwiseCallback yieldTable) { + return AddCombinedRowToIdTable{numJoinColumns, + IdTable{op.getResultWidth(), op.allocator()}, + op.cancellationHandle_, + keepJoinColumns, + CHUNK_SIZE, + std::move(yieldTable)}; +} + // Helper function to check if the join of two columns propagate the value // returned by `Operation::columnOriginatesFromGraphOrUndef`. inline bool doesJoinProduceGuaranteedGraphValuesOrUndef( diff --git a/src/engine/JoinWithIndexScanHelpers.h b/src/engine/JoinWithIndexScanHelpers.h new file mode 100644 index 0000000000..fc01ba6ba7 --- /dev/null +++ b/src/engine/JoinWithIndexScanHelpers.h @@ -0,0 +1,207 @@ +// Copyright 2025, University of Freiburg +// Chair of Algorithms and Data Structures +// Author: Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de) + +#ifndef QLEVER_SRC_ENGINE_JOINWITHINDEXSCANHELPERS_H +#define QLEVER_SRC_ENGINE_JOINWITHINDEXSCANHELPERS_H + +#include "engine/AddCombinedRowToTable.h" +#include "engine/IndexScan.h" +#include "engine/Result.h" +#include "index/CompressedRelation.h" +#include "util/Iterators.h" +#include "util/JoinAlgorithms/JoinAlgorithms.h" +#include "util/JoinAlgorithms/JoinColumnMapping.h" +#include "util/MemoryHelpers.h" + +namespace qlever::joinWithIndexScanHelpers { + +// Tag types to indicate the join semantics +struct InnerJoinTag {}; +struct OptionalJoinTag {}; +struct MinusTag {}; + +// Helper to convert generators to the format expected by join algorithms +using IteratorWithSingleCol = + ad_utility::InputRangeTypeErased>; + +inline IteratorWithSingleCol convertGenerator( + CompressedRelationReader::IdTableGeneratorInputRange&& gen, + IndexScan& scan) { + // Store the generator in a wrapper so we can access its details after moving + auto generatorStorage = + std::make_shared( + std::move(gen)); + + using SendPriority = RuntimeInformation::SendPriority; + + auto range = ad_utility::CachingTransformInputRange( + *generatorStorage, + [generatorStorage, &scan, + sendPriority = SendPriority::Always](auto& table) mutable { + scan.updateRuntimeInfoForLazyScan(generatorStorage->details(), + sendPriority); + sendPriority = SendPriority::IfDue; + // IndexScans don't have a local vocabulary, so we can just use an empty + // one. + return ad_utility::IdTableAndFirstCol{std::move(table), LocalVocab{}}; + }); + + return IteratorWithSingleCol{std::move(range)}; +} + +// Helper to get blocks for join based on number of join columns +inline std::array +getBlocksForJoinOfTwoScans(const IndexScan& s1, const IndexScan& s2, + size_t numJoinColumns) { + AD_CONTRACT_CHECK(s1.numVariables() <= 3 && s2.numVariables() <= 3); + AD_CONTRACT_CHECK(s1.numVariables() >= 1 && s2.numVariables() >= 1); + + auto metaBlocks1 = s1.getMetadataForScan(); + auto metaBlocks2 = s2.getMetadataForScan(); + + if (!metaBlocks1.has_value() || !metaBlocks2.has_value()) { + return {{}}; + } + + std::array, 2> blocks; + if (numJoinColumns == 1) { + blocks = CompressedRelationReader::getBlocksForJoin(metaBlocks1.value(), + metaBlocks2.value()); + } else { + blocks = CompressedRelationReader::getBlocksForJoinMultiColumn( + metaBlocks1.value(), metaBlocks2.value(), numJoinColumns); + } + + std::array result{ + s1.getLazyScan(blocks[0]), s2.getLazyScan(blocks[1])}; + result[0].details().numBlocksAll_ = metaBlocks1.value().sizeBlockMetadata_; + result[1].details().numBlocksAll_ = metaBlocks2.value().sizeBlockMetadata_; + return result; +} + +// Helper to check if the first row of any of the specified columns contains +// UNDEF values. Returns true if any join column in the first row is undefined, +// false otherwise. Returns false if the table is empty. +inline bool firstRowHasUndef( + const IdTable& table, + const std::vector>& joinColumns, + size_t sideIndex) { + if (table.empty()) { + return false; + } + for (const auto& jc : joinColumns) { + if (table.at(0, jc[sideIndex]).isUndefined()) { + return true; + } + } + return false; +} + +// Helper to get blocks for join of a column with a scan (multi-column version) +inline CompressedRelationReader::IdTableGeneratorInputRange +getBlocksForJoinOfColumnsWithScan( + const IdTable& idTable, + const std::vector>& joinColumns, + const IndexScan& scan, ColumnIndex scanJoinColIndex) { + AD_EXPENSIVE_CHECK(ql::ranges::is_sorted( + idTable.getColumn(joinColumns[scanJoinColIndex][0]))); + AD_CORRECTNESS_CHECK(scan.numVariables() <= 3 && scan.numVariables() > 0); + + auto metaBlocks = scan.getMetadataForScan(); + if (!metaBlocks.has_value()) { + return {}; + } + + // Cannot prefilter if first row has UNDEF in any join column + if (firstRowHasUndef(idTable, joinColumns, 0)) { + return {}; + } + + CompressedRelationReader::GetBlocksForJoinResult blocksResult; + + if (joinColumns.size() == 1) { + auto joinColumn = idTable.getColumn(joinColumns[0][0]); + blocksResult = CompressedRelationReader::getBlocksForJoin( + joinColumn, metaBlocks.value()); + } else if (joinColumns.size() == 2) { + auto col1 = idTable.getColumn(joinColumns[0][0]); + auto col2 = idTable.getColumn(joinColumns[1][0]); + blocksResult = CompressedRelationReader::getBlocksForJoinMultiColumn( + col1, col2, metaBlocks.value()); + } else if (joinColumns.size() == 3) { + auto col1 = idTable.getColumn(joinColumns[0][0]); + auto col2 = idTable.getColumn(joinColumns[1][0]); + auto col3 = idTable.getColumn(joinColumns[2][0]); + blocksResult = CompressedRelationReader::getBlocksForJoinMultiColumn( + col1, col2, col3, metaBlocks.value()); + } else { + AD_FAIL(); + } + + auto result = scan.getLazyScan(std::move(blocksResult.matchingBlocks_)); + result.details().numBlocksAll_ = metaBlocks.value().sizeBlockMetadata_; + return result; +} + +// Helper to convert prefiltered lazy generators to the format expected by +// zipperJoinForBlocksWithPotentialUndef. Takes the left and right generators +// from prefilterTablesForOptional and converts them to ranges of +// IdTableAndFirstCol with appropriate column permutations applied. +inline auto convertPrefilteredGenerators( + std::shared_ptr leftGenerator, + std::shared_ptr rightGenerator, size_t leftWidth, + ColumnIndex rightJoinColumn) { + // Create identity permutation for left (all columns in order) + std::vector identityPerm(leftWidth); + std::iota(identityPerm.begin(), identityPerm.end(), 0); + + auto leftRange = ad_utility::CachingTransformInputRange( + std::move(*leftGenerator), [identityPerm](auto& pair) { + return ad_utility::IdTableAndFirstCol{ + pair.idTable_.asColumnSubsetView(identityPerm), + std::move(pair.localVocab_)}; + }); + + // Right permutation puts the join column first + std::vector rightPerm = {rightJoinColumn}; + auto rightRange = ad_utility::CachingTransformInputRange( + std::move(*rightGenerator), [rightPerm](auto& pair) { + return ad_utility::IdTableAndFirstCol{ + pair.idTable_.asColumnSubsetView(rightPerm), + std::move(pair.localVocab_)}; + }); + + return std::pair{std::move(leftRange), std::move(rightRange)}; +} + +// Helper to set scan status to lazily completed (variadic, accepts 1+ scans) +template +inline void setScanStatusToLazilyCompleted(Scans&... scans) { + (void(scans.runtimeInfo().status_ = + RuntimeInformation::Status::lazilyMaterializedCompleted), + ...); +} + +// Helper to get unfiltered blocks for the left scan and filtered blocks for +// the right scan. Returns shared_ptrs ready for use in action lambdas. +// Used by OptionalJoin and Minus where the left side must be complete and only +// the right side can be prefiltered. +inline auto getUnfilteredLeftAndFilteredRightSideFromIndexScans( + IndexScan& leftScan, IndexScan& rightScan, size_t numJoinColumns) { + auto leftMetaBlocks = leftScan.getMetadataForScan(); + + auto leftBlocks = leftScan.getLazyScan(std::nullopt); + leftBlocks.details().numBlocksAll_ = + leftMetaBlocks.value().sizeBlockMetadata_; + + auto rightBlocks = + getBlocksForJoinOfTwoScans(leftScan, rightScan, numJoinColumns); + + return std::pair{ad_utility::toSharedPtr(std::move(leftBlocks)), + ad_utility::toSharedPtr(std::move(rightBlocks[1]))}; +} + +} // namespace qlever::joinWithIndexScanHelpers + +#endif // QLEVER_SRC_ENGINE_JOINWITHINDEXSCANHELPERS_H diff --git a/src/engine/Minus.cpp b/src/engine/Minus.cpp index d06be34b23..6ad7711e2f 100644 --- a/src/engine/Minus.cpp +++ b/src/engine/Minus.cpp @@ -4,15 +4,21 @@ #include "engine/Minus.h" +#include + #include "engine/CallFixedSize.h" +#include "engine/IndexScan.h" #include "engine/JoinHelpers.h" +#include "engine/JoinWithIndexScanHelpers.h" #include "engine/MinusRowHandler.h" #include "engine/Service.h" #include "engine/Sort.h" +#include "global/RuntimeParameters.h" #include "util/Algorithm.h" #include "util/Exception.h" #include "util/JoinAlgorithms/IndexNestedLoopJoin.h" #include "util/JoinAlgorithms/JoinAlgorithms.h" +#include "util/MemoryHelpers.h" using std::endl; using std::string; @@ -38,6 +44,156 @@ string Minus::getCacheKeyImpl() const { // _____________________________________________________________________________ string Minus::getDescriptor() const { return "Minus"; } +// _____________________________________________________________________________ +Result Minus::computeResultForTwoIndexScans(bool requestLaziness, + IndexScan& leftScan, + IndexScan& rightScan) const { + using namespace qlever::joinWithIndexScanHelpers; + + // For MINUS, only the right child can be prefiltered. + // Get unfiltered blocks for left, filtered blocks for right. + auto [leftBlocksPtr, rightBlocksPtr] = + getUnfilteredLeftAndFilteredRightSideFromIndexScans( + leftScan, rightScan, _matchedColumns.size()); + + std::vector permutation; + permutation.resize(_left->getResultWidth()); + ql::ranges::copy(ad_utility::integerRange(permutation.size()), + permutation.begin()); + ColumnIndex leftJoinColumn = _matchedColumns.at(0).at(0); + std::swap(permutation.at(0), permutation.at(leftJoinColumn)); + + auto action = [this, leftBlocksPtr, rightBlocksPtr, &leftScan, &rightScan, + permutation]( + std::function yieldTable) { + using namespace qlever::joinWithIndexScanHelpers; + + ad_utility::MinusRowHandler rowAdder{ + _matchedColumns.size(), IdTable{getResultWidth(), allocator()}, + cancellationHandle_, std::move(yieldTable)}; + auto leftConverted = convertGenerator(std::move(*leftBlocksPtr), leftScan); + auto rightConverted = + convertGenerator(std::move(*rightBlocksPtr), rightScan); + ad_utility::zipperJoinForBlocksWithPotentialUndef( + leftConverted, rightConverted, std::less{}, rowAdder, {}, {}, + ad_utility::MinusJoinTag{}); + return std::move(rowAdder).toIdTableVocabPair(); + }; + + return qlever::joinHelpers::createResultFromAction( + requestLaziness, std::move(action), [this] { return resultSortedOn(); }, + permutation); +} + +// _____________________________________________________________________________ +Result Minus::computeResultForIndexScanOnRight( + bool requestLaziness, std::shared_ptr leftRes, + IndexScan& rightScan) const { + using namespace qlever::joinWithIndexScanHelpers; + + AD_CORRECTNESS_CHECK(leftRes->isFullyMaterialized()); + + const IdTable& leftTable = leftRes->idTable(); + + // Get filtered blocks for right based on left's data. + auto rightBlocks = getBlocksForJoinOfColumnsWithScan( + leftTable, _matchedColumns, rightScan, _matchedColumns.at(0).at(1)); + + std::vector permutation; + permutation.resize(_left->getResultWidth()); + ql::ranges::copy(ad_utility::integerRange(permutation.size()), + permutation.begin()); + ColumnIndex leftJoinColumn = _matchedColumns.at(0).at(0); + std::swap(permutation.at(0), permutation.at(leftJoinColumn)); + + auto action = [this, leftRes = std::move(leftRes), + rightBlocks = std::move(rightBlocks), &rightScan, permutation]( + std::function yieldTable) { + using namespace qlever::joinWithIndexScanHelpers; + + ad_utility::MinusRowHandler rowAdder{ + _matchedColumns.size(), IdTable{getResultWidth(), allocator()}, + cancellationHandle_, std::move(yieldTable)}; + + // Create view of left table for the join + const IdTable& leftTable = leftRes->idTable(); + std::vector identityPerm(leftTable.numColumns()); + std::iota(identityPerm.begin(), identityPerm.end(), 0); + auto leftBlock = std::array{ad_utility::IdTableAndFirstCol{ + leftTable.asColumnSubsetView(identityPerm), + leftRes->getCopyOfLocalVocab()}}; + + auto rightConverted = convertGenerator(std::move(rightBlocks), rightScan); + ad_utility::zipperJoinForBlocksWithPotentialUndef( + leftBlock, rightConverted, std::less{}, rowAdder, {}, {}, + ad_utility::MinusJoinTag{}); + return std::move(rowAdder).toIdTableVocabPair(); + }; + + return qlever::joinHelpers::createResultFromAction( + requestLaziness, std::move(action), [this] { return resultSortedOn(); }, + permutation); +} + +// _____________________________________________________________________________ +Result Minus::computeResultForIndexScanOnRightLazy( + bool requestLaziness, std::shared_ptr leftRes, + IndexScan& rightScan) const { + using namespace qlever::joinWithIndexScanHelpers; + + AD_CORRECTNESS_CHECK(!leftRes->isFullyMaterialized()); + + // Only support single join column for now + if (_matchedColumns.size() != 1) { + return lazyMinusJoin(std::move(leftRes), + const_cast(rightScan).getResult( + true, ComputationMode::LAZY_IF_SUPPORTED), + requestLaziness); + } + + // For MINUS semantics, we must process ALL left input (similar to OPTIONAL). + // We use prefilterTablesForOptional which passes through all left rows + // while still prefiltering the right IndexScan. + auto [leftSide, rightSide] = rightScan.prefilterTablesForOptional( + leftRes->idTables(), _matchedColumns.at(0).at(0)); + + std::vector permutation; + permutation.resize(_left->getResultWidth()); + ql::ranges::copy(ad_utility::integerRange(permutation.size()), + permutation.begin()); + ColumnIndex leftJoinColumn = _matchedColumns.at(0).at(0); + std::swap(permutation.at(0), permutation.at(leftJoinColumn)); + + auto action = [this, leftSide = std::move(leftSide), + rightSide = std::move(rightSide), &rightScan, permutation]( + std::function yieldTable) { + using namespace qlever::joinWithIndexScanHelpers; + + ad_utility::MinusRowHandler rowAdder{ + _matchedColumns.size(), IdTable{getResultWidth(), allocator()}, + cancellationHandle_, std::move(yieldTable)}; + + // Convert generators to the right format + auto leftSidePtr = ad_utility::toSharedPtr(std::move(leftSide)); + auto rightSidePtr = ad_utility::toSharedPtr(std::move(rightSide)); + auto [leftRange, rightRange] = convertPrefilteredGenerators( + leftSidePtr, rightSidePtr, _left->getResultWidth(), + _matchedColumns.at(0).at(1)); + + ad_utility::zipperJoinForBlocksWithPotentialUndef( + leftRange, rightRange, std::less{}, rowAdder, {}, {}, + ad_utility::MinusJoinTag{}); + + setScanStatusToLazilyCompleted(rightScan); + + return std::move(rowAdder).toIdTableVocabPair(); + }; + + return qlever::joinHelpers::createResultFromAction( + requestLaziness, std::move(action), [this] { return resultSortedOn(); }, + permutation); +} + // _____________________________________________________________________________ Result Minus::computeResult(bool requestLaziness) { AD_LOG_DEBUG << "Minus result computation..." << endl; @@ -52,6 +208,35 @@ Result Minus::computeResult(bool requestLaziness) { return std::move(res).value(); } + // Check for IndexScan children to enable prefiltering. + auto leftIndexScan = + std::dynamic_pointer_cast(_left->getRootOperation()); + auto rightIndexScan = + std::dynamic_pointer_cast(_right->getRootOperation()); + + // Case 1: Both children are IndexScans + if (leftIndexScan && rightIndexScan && _matchedColumns.size() == 1) { + return computeResultForTwoIndexScans(requestLaziness, *leftIndexScan, + *rightIndexScan); + } + + // Case 2: Only right child is IndexScan + if (rightIndexScan && _matchedColumns.size() == 1) { + // The lazy minus implementation does only work if there's just a single + // join column. This might be extended in the future. + bool lazyJoinIsSupported = _matchedColumns.size() == 1; + auto leftResult = _left->getResult(lazyJoinIsSupported); + + if (leftResult->isFullyMaterialized()) { + return computeResultForIndexScanOnRight( + requestLaziness, std::move(leftResult), *rightIndexScan); + } else { + return computeResultForIndexScanOnRightLazy( + requestLaziness, std::move(leftResult), *rightIndexScan); + } + } + + // Fall back to regular minus computation // The lazy minus implementation does only work if there's just a single // join column. This might be extended in the future. bool lazyJoinIsSupported = _matchedColumns.size() == 1; @@ -269,7 +454,7 @@ bool Minus::columnOriginatesFromGraphOrUndef(const Variable& variable) const { // _____________________________________________________________________________ Result Minus::lazyMinusJoin(std::shared_ptr left, std::shared_ptr right, - bool requestLaziness) { + bool requestLaziness) const { // If both inputs are fully materialized, we can join them more // efficiently. AD_CONTRACT_CHECK(!left->isFullyMaterialized() || diff --git a/src/engine/Minus.h b/src/engine/Minus.h index a7a8a0e4b7..328140c242 100644 --- a/src/engine/Minus.h +++ b/src/engine/Minus.h @@ -11,6 +11,9 @@ #include "engine/Operation.h" #include "engine/QueryExecutionTree.h" +// Forward declarations +class IndexScan; + class Minus : public Operation { private: std::shared_ptr _left; @@ -93,7 +96,7 @@ class Minus : public Operation { // single join column, otherwise this function will throw. Result lazyMinusJoin(std::shared_ptr left, std::shared_ptr right, - bool requestLaziness); + bool requestLaziness) const; Result computeResult(bool requestLaziness) override; @@ -102,6 +105,26 @@ class Minus : public Operation { std::optional> makeTreeWithStrippedColumns( const std::set& variables) const override; + + // Specialized implementations for joins involving IndexScans (prefiltering). + // These methods are similar to those in OptionalJoin but adapted for MINUS + // semantics (only the right child can be prefiltered). + + // When both children are IndexScans. Filter blocks on the right based on + // the left's block ranges. + Result computeResultForTwoIndexScans(bool requestLaziness, + IndexScan& leftScan, + IndexScan& rightScan) const; + + // When the right child is an IndexScan and the left is fully materialized. + Result computeResultForIndexScanOnRight(bool requestLaziness, + std::shared_ptr leftRes, + IndexScan& rightScan) const; + + // When the right child is an IndexScan and the left is lazy. + Result computeResultForIndexScanOnRightLazy( + bool requestLaziness, std::shared_ptr leftRes, + IndexScan& rightScan) const; }; #endif // QLEVER_SRC_ENGINE_MINUS_H diff --git a/src/engine/MinusRowHandler.h b/src/engine/MinusRowHandler.h index 1607581b5c..9832cb8f24 100644 --- a/src/engine/MinusRowHandler.h +++ b/src/engine/MinusRowHandler.h @@ -130,6 +130,14 @@ class MinusRowHandler { // Get the output `LocalVocab`. LocalVocab& localVocab() { return mergedVocab_; } + // Move both the result table and local vocab out as an IdTableVocabPair. + // This is a convenience method for the common pattern of moving both out. + auto toIdTableVocabPair() && { + flush(); + return Result::IdTableVocabPair{std::move(resultTable_), + std::move(mergedVocab_)}; + } + // Disable copying and moving, it is currently not needed and makes it harder // to reason about MinusRowHandler(const MinusRowHandler&) = delete; diff --git a/src/engine/MultiColumnJoin.cpp b/src/engine/MultiColumnJoin.cpp index 4d5cd2b48e..5a343797a8 100644 --- a/src/engine/MultiColumnJoin.cpp +++ b/src/engine/MultiColumnJoin.cpp @@ -5,10 +5,15 @@ #include "engine/MultiColumnJoin.h" +#include + #include "engine/AddCombinedRowToTable.h" #include "engine/CallFixedSize.h" #include "engine/Engine.h" +#include "engine/IndexScan.h" #include "engine/JoinHelpers.h" +#include "engine/JoinWithIndexScanHelpers.h" +#include "global/RuntimeParameters.h" #include "util/JoinAlgorithms/JoinAlgorithms.h" using std::endl; @@ -58,10 +63,249 @@ string MultiColumnJoin::getDescriptor() const { return "MultiColumnJoin on " + joinVars; } +// _____________________________________________________________________________ +Result MultiColumnJoin::computeResultForTwoIndexScans( + bool requestLaziness, IndexScan& leftScan, IndexScan& rightScan) const { + using namespace qlever::joinWithIndexScanHelpers; + + ad_utility::Timer timer{ad_utility::timer::Timer::InitialStatus::Started}; + + // Get filtered blocks for both sides + auto blocks = + getBlocksForJoinOfTwoScans(leftScan, rightScan, _joinColumns.size()); + + runtimeInfo().addDetail("time-for-filtering-blocks", timer.msecs()); + + // Create result generator + // Wrap generators in shared_ptr to allow const lambda capture + auto leftBlocksPtr = + std::make_shared( + std::move(blocks[0])); + auto rightBlocksPtr = + std::make_shared( + std::move(blocks[1])); + + auto action = [this, leftBlocksPtr, rightBlocksPtr, &leftScan, &rightScan]( + std::function yieldTable) { + auto rowAdder = ad_utility::AddCombinedRowToIdTable{ + _joinColumns.size(), + IdTable{getResultWidth(), allocator()}, + cancellationHandle_, + true, // keepJoinColumns (for multi-column joins, we always keep them) + qlever::joinHelpers::CHUNK_SIZE, + std::move(yieldTable)}; + + auto leftConverted = convertGenerator(std::move(*leftBlocksPtr), leftScan); + auto rightConverted = + convertGenerator(std::move(*rightBlocksPtr), rightScan); + + ad_utility::zipperJoinForBlocksWithPotentialUndef( + leftConverted, rightConverted, std::less{}, rowAdder, {}, {}); + + setScanStatusToLazilyCompleted(leftScan, rightScan); + + auto localVocab = std::move(rowAdder.localVocab()); + return Result::IdTableVocabPair{std::move(rowAdder).resultTable(), + std::move(localVocab)}; + }; + + if (requestLaziness) { + return {qlever::joinHelpers::runLazyJoinAndConvertToGenerator( + std::move(action), {}), + resultSortedOn()}; + } else { + auto [idTable, localVocab] = action(ad_utility::noop); + return {std::move(idTable), resultSortedOn(), std::move(localVocab)}; + } +} + +// _____________________________________________________________________________ +template +Result MultiColumnJoin::computeResultForIndexScanAndIdTable( + bool requestLaziness, std::shared_ptr resultWithIdTable, + std::shared_ptr scan) const { + using namespace qlever::joinWithIndexScanHelpers; + + AD_CORRECTNESS_CHECK(resultWithIdTable->isFullyMaterialized()); + + ad_utility::Timer timer{ad_utility::timer::Timer::InitialStatus::Started}; + + const IdTable& idTable = resultWithIdTable->idTable(); + + // Check if IdTable has UNDEF in join columns + bool idTableHasUndef = false; + for (const auto& [leftCol, rightCol] : _joinColumns) { + auto col = idTableIsRightInput ? rightCol : leftCol; + if (!idTable.empty() && idTable.at(0, col).isUndefined()) { + idTableHasUndef = true; + break; + } + } + + // Get prefiltered blocks from the IndexScan + CompressedRelationReader::IdTableGeneratorInputRange scanBlocks; + if (!idTableHasUndef) { + scanBlocks = getBlocksForJoinOfColumnsWithScan(idTable, _joinColumns, *scan, + idTableIsRightInput ? 1 : 0); + } else { + // Cannot prefilter with UNDEF, scan everything + scanBlocks = scan->getLazyScan(std::nullopt); + auto metaBlocks = scan->getMetadataForScan(); + if (metaBlocks.has_value()) { + scanBlocks.details().numBlocksAll_ = + metaBlocks.value().sizeBlockMetadata_; + } + } + + runtimeInfo().addDetail("time-for-filtering-blocks", timer.msecs()); + + // Wrap generator in shared_ptr + auto scanBlocksPtr = + std::make_shared( + std::move(scanBlocks)); + + auto action = [this, resultWithIdTable = std::move(resultWithIdTable), + scanBlocksPtr, + scan](std::function yieldTable) { + auto rowAdder = ad_utility::AddCombinedRowToIdTable{ + _joinColumns.size(), + IdTable{getResultWidth(), allocator()}, + cancellationHandle_, + true, // keepJoinColumns (for multi-column joins, we always keep them) + qlever::joinHelpers::CHUNK_SIZE, + std::move(yieldTable)}; + + // Create view of idTable + const IdTable& table = resultWithIdTable->idTable(); + std::vector identityPerm(table.numColumns()); + std::iota(identityPerm.begin(), identityPerm.end(), 0); + auto idTableBlock = std::array{ad_utility::IdTableAndFirstCol{ + table.asColumnSubsetView(identityPerm), + resultWithIdTable->getCopyOfLocalVocab()}}; + auto scanConverted = convertGenerator(std::move(*scanBlocksPtr), *scan); + + if constexpr (idTableIsRightInput) { + ad_utility::zipperJoinForBlocksWithPotentialUndef( + scanConverted, idTableBlock, std::less{}, rowAdder, {}, {}); + } else { + ad_utility::zipperJoinForBlocksWithPotentialUndef( + idTableBlock, scanConverted, std::less{}, rowAdder, {}, {}); + } + + setScanStatusToLazilyCompleted(*scan); + + auto localVocab = std::move(rowAdder.localVocab()); + return Result::IdTableVocabPair{std::move(rowAdder).resultTable(), + std::move(localVocab)}; + }; + + if (requestLaziness) { + return {qlever::joinHelpers::runLazyJoinAndConvertToGenerator( + std::move(action), {}), + resultSortedOn()}; + } else { + auto [idTable, localVocab] = action(ad_utility::noop); + return {std::move(idTable), resultSortedOn(), std::move(localVocab)}; + } +} + +// Explicit template instantiation +template Result MultiColumnJoin::computeResultForIndexScanAndIdTable( + bool, std::shared_ptr, std::shared_ptr) const; +template Result MultiColumnJoin::computeResultForIndexScanAndIdTable( + bool, std::shared_ptr, std::shared_ptr) const; + +// _____________________________________________________________________________ +Result MultiColumnJoin::computeResultForIndexScanAndLazyOperation( + bool requestLaziness, std::shared_ptr lazyResult, + std::shared_ptr scan) const { + // For lazy input with IndexScan, we cannot use prefiltering efficiently + // TODO: Implement proper lazy prefiltering similar to Join + // For now, signal to fall back to regular path by returning empty result + (void)requestLaziness; + (void)lazyResult; + (void)scan; + + // Return empty result to signal fallback to regular computation path + return {IdTable{getResultWidth(), allocator()}, resultSortedOn(), + LocalVocab{}}; +} + // _____________________________________________________________________________ Result MultiColumnJoin::computeResult([[maybe_unused]] bool requestLaziness) { AD_LOG_DEBUG << "MultiColumnJoin result computation..." << endl; + // Try prefiltering with IndexScans + auto leftIndexScan = + std::dynamic_pointer_cast(_left->getRootOperation()); + auto rightIndexScan = + std::dynamic_pointer_cast(_right->getRootOperation()); + + // Case 1: Both children are IndexScans + if (leftIndexScan && rightIndexScan) { + return computeResultForTwoIndexScans(requestLaziness, *leftIndexScan, + *rightIndexScan); + } + + // Case 2: One child is IndexScan, try to use prefiltering + if (leftIndexScan || rightIndexScan) { + bool leftIsSmall = + _left->getRootOperation()->getSizeEstimate() < + getRuntimeParameter< + &RuntimeParameters::lazyIndexScanMaxSizeMaterialization_>(); + bool rightIsSmall = + _right->getRootOperation()->getSizeEstimate() < + getRuntimeParameter< + &RuntimeParameters::lazyIndexScanMaxSizeMaterialization_>(); + + auto leftResIfCached = _left->getRootOperation()->getResult( + false, leftIsSmall ? ComputationMode::FULLY_MATERIALIZED + : ComputationMode::ONLY_IF_CACHED); + auto rightResIfCached = _right->getRootOperation()->getResult( + false, rightIsSmall ? ComputationMode::FULLY_MATERIALIZED + : ComputationMode::ONLY_IF_CACHED); + + if (leftIndexScan && rightResIfCached && + rightResIfCached->isFullyMaterialized()) { + return computeResultForIndexScanAndIdTable( + requestLaziness, std::move(rightResIfCached), leftIndexScan); + } + + if (rightIndexScan && leftResIfCached && + leftResIfCached->isFullyMaterialized()) { + return computeResultForIndexScanAndIdTable( + requestLaziness, std::move(leftResIfCached), rightIndexScan); + } + + // Try getting the full results + auto leftResult = + leftResIfCached ? leftResIfCached : _left->getResult(true); + auto rightResult = + rightResIfCached ? rightResIfCached : _right->getResult(true); + + if (leftIndexScan && rightResult->isFullyMaterialized()) { + return computeResultForIndexScanAndIdTable( + requestLaziness, std::move(rightResult), leftIndexScan); + } + + if (rightIndexScan && leftResult->isFullyMaterialized()) { + return computeResultForIndexScanAndIdTable( + requestLaziness, std::move(leftResult), rightIndexScan); + } + + // Handle lazy cases + if (leftIndexScan && !rightResult->isFullyMaterialized()) { + return computeResultForIndexScanAndLazyOperation( + requestLaziness, std::move(rightResult), leftIndexScan); + } + + if (rightIndexScan && !leftResult->isFullyMaterialized()) { + return computeResultForIndexScanAndLazyOperation( + requestLaziness, std::move(leftResult), rightIndexScan); + } + } + + // Regular path: no IndexScan optimization IdTable idTable{getExecutionContext()->getAllocator()}; idTable.setNumColumns(getResultWidth()); diff --git a/src/engine/MultiColumnJoin.h b/src/engine/MultiColumnJoin.h index f1eaf8fba1..47caf926be 100644 --- a/src/engine/MultiColumnJoin.h +++ b/src/engine/MultiColumnJoin.h @@ -11,6 +11,9 @@ #include "engine/Operation.h" #include "engine/QueryExecutionTree.h" +// Forward declarations +class IndexScan; + class MultiColumnJoin : public Operation { private: std::shared_ptr _left; @@ -77,6 +80,26 @@ class MultiColumnJoin : public Operation { VariableToColumnMap computeVariableToColumnMap() const override; void computeSizeEstimateAndMultiplicities(); + + // Specialized implementations for joins involving IndexScans (prefiltering). + // These methods are similar to those in Join but support multiple join + // columns. + + // When both children are IndexScans. Filter blocks on both sides. + Result computeResultForTwoIndexScans(bool requestLaziness, + IndexScan& leftScan, + IndexScan& rightScan) const; + + // When one child is an IndexScan and the other is fully materialized. + template + Result computeResultForIndexScanAndIdTable( + bool requestLaziness, std::shared_ptr resultWithIdTable, + std::shared_ptr scan) const; + + // When one child is an IndexScan and the other is lazy. + Result computeResultForIndexScanAndLazyOperation( + bool requestLaziness, std::shared_ptr lazyResult, + std::shared_ptr scan) const; }; #endif // QLEVER_SRC_ENGINE_MULTICOLUMNJOIN_H diff --git a/src/engine/OptionalJoin.cpp b/src/engine/OptionalJoin.cpp index e4a25ef301..76a16ae65b 100644 --- a/src/engine/OptionalJoin.cpp +++ b/src/engine/OptionalJoin.cpp @@ -5,15 +5,20 @@ #include "engine/OptionalJoin.h" +#include + #include "engine/AddCombinedRowToTable.h" #include "engine/CallFixedSize.h" #include "engine/Engine.h" #include "engine/JoinHelpers.h" +#include "engine/JoinWithIndexScanHelpers.h" #include "engine/Service.h" #include "engine/Sort.h" +#include "global/RuntimeParameters.h" #include "util/Algorithm.h" #include "util/JoinAlgorithms/IndexNestedLoopJoin.h" #include "util/JoinAlgorithms/JoinAlgorithms.h" +#include "util/MemoryHelpers.h" using namespace qlever::joinHelpers; @@ -101,6 +106,158 @@ string OptionalJoin::getDescriptor() const { return "OptionalJoin on " + joinVars; } +// _____________________________________________________________________________ +Result OptionalJoin::computeResultForTwoIndexScans(bool requestLaziness, + IndexScan& leftScan, + IndexScan& rightScan) const { + using namespace qlever::joinWithIndexScanHelpers; + + // For OPTIONAL joins, we cannot prefilter the left side (it must be + // complete). We can only prefilter the right side based on the left's block + // ranges. + + ad_utility::Timer timer{ad_utility::timer::Timer::InitialStatus::Started}; + + // Get unfiltered blocks for left and filtered blocks for right + auto [leftBlocksPtr, rightBlocksPtr] = + getUnfilteredLeftAndFilteredRightSideFromIndexScans(leftScan, rightScan, + _joinColumns.size()); + + runtimeInfo().addDetail("time-for-filtering-blocks", timer.msecs()); + + auto action = [this, leftBlocksPtr, rightBlocksPtr, &leftScan, &rightScan]( + std::function yieldTable) { + using namespace qlever::joinWithIndexScanHelpers; + + auto rowAdder = getRowAdderForJoin(*this, _joinColumns.size(), + keepJoinColumns_, std::move(yieldTable)); + + auto leftConverted = convertGenerator(std::move(*leftBlocksPtr), leftScan); + auto rightConverted = + convertGenerator(std::move(*rightBlocksPtr), rightScan); + + ad_utility::zipperJoinForBlocksWithPotentialUndef( + leftConverted, rightConverted, std::less{}, rowAdder, {}, {}, + ad_utility::OptionalJoinTag{}); + + setScanStatusToLazilyCompleted(leftScan, rightScan); + + return std::move(rowAdder).toIdTableVocabPair(); + }; + + return createResultFromAction(requestLaziness, std::move(action), + [this] { return resultSortedOn(); }); +} + +// _____________________________________________________________________________ +Result OptionalJoin::computeResultForIndexScanOnRight( + bool requestLaziness, std::shared_ptr leftRes, + IndexScan& rightScan) const { + using namespace qlever::joinWithIndexScanHelpers; + + AD_CORRECTNESS_CHECK(leftRes->isFullyMaterialized()); + + ad_utility::Timer timer{ad_utility::timer::Timer::InitialStatus::Started}; + + const IdTable& leftTable = leftRes->idTable(); + + // Get prefiltered blocks from the right IndexScan + CompressedRelationReader::IdTableGeneratorInputRange rightBlocks; + if (!firstRowHasUndef(leftTable, _joinColumns, 0)) { + rightBlocks = getBlocksForJoinOfColumnsWithScan(leftTable, _joinColumns, + rightScan, 0); + } else { + // Cannot prefilter with UNDEF, scan everything + rightBlocks = rightScan.getLazyScan(std::nullopt); + auto metaBlocks = rightScan.getMetadataForScan(); + if (metaBlocks.has_value()) { + rightBlocks.details().numBlocksAll_ = + metaBlocks.value().sizeBlockMetadata_; + } + } + + runtimeInfo().addDetail("time-for-filtering-blocks", timer.msecs()); + + auto action = [this, leftRes = std::move(leftRes), + rightBlocks = std::move(rightBlocks), &rightScan]( + std::function yieldTable) { + using namespace qlever::joinWithIndexScanHelpers; + + auto rowAdder = getRowAdderForJoin(*this, _joinColumns.size(), + keepJoinColumns_, std::move(yieldTable)); + + // Create view of left table for the join + const IdTable& leftTable = leftRes->idTable(); + std::vector identityPerm(leftTable.numColumns()); + std::iota(identityPerm.begin(), identityPerm.end(), 0); + auto leftBlock = std::array{ad_utility::IdTableAndFirstCol{ + leftTable.asColumnSubsetView(identityPerm), + leftRes->getCopyOfLocalVocab()}}; + auto rightConverted = convertGenerator(std::move(rightBlocks), rightScan); + + ad_utility::zipperJoinForBlocksWithPotentialUndef( + leftBlock, rightConverted, std::less{}, rowAdder, {}, {}, + ad_utility::OptionalJoinTag{}); + + setScanStatusToLazilyCompleted(rightScan); + + return std::move(rowAdder).toIdTableVocabPair(); + }; + + return createResultFromAction(requestLaziness, std::move(action), + [this] { return resultSortedOn(); }); +} + +// _____________________________________________________________________________ +Result OptionalJoin::computeResultForIndexScanOnRightLazy( + bool requestLaziness, std::shared_ptr leftRes, + IndexScan& rightScan) const { + using namespace qlever::joinWithIndexScanHelpers; + + AD_CORRECTNESS_CHECK(!leftRes->isFullyMaterialized()); + + // Only support single join column for now + if (_joinColumns.size() != 1) { + return lazyOptionalJoin(std::move(leftRes), + const_cast(rightScan).getResult( + true, ComputationMode::LAZY_IF_SUPPORTED), + requestLaziness); + } + + // For OPTIONAL semantics, we must re-yield ALL left input (never filter it). + // We use prefilterTables which gives us filtered right blocks, but we need + // to ensure the left side always re-yields everything. + auto [leftSide, rightSide] = rightScan.prefilterTablesForOptional( + leftRes->idTables(), _joinColumns.at(0).at(0)); + + auto action = [this, leftSide = std::move(leftSide), + rightSide = std::move(rightSide), &rightScan]( + std::function yieldTable) { + using namespace qlever::joinWithIndexScanHelpers; + + auto rowAdder = getRowAdderForJoin(*this, _joinColumns.size(), + keepJoinColumns_, std::move(yieldTable)); + + // Convert generators to the right format + auto leftSidePtr = ad_utility::toSharedPtr(std::move(leftSide)); + auto rightSidePtr = ad_utility::toSharedPtr(std::move(rightSide)); + auto [leftRange, rightRange] = convertPrefilteredGenerators( + leftSidePtr, rightSidePtr, _left->getResultWidth(), + _joinColumns.at(0).at(1)); + + ad_utility::zipperJoinForBlocksWithPotentialUndef( + leftRange, rightRange, std::less{}, rowAdder, {}, {}, + ad_utility::OptionalJoinTag{}); + + setScanStatusToLazilyCompleted(rightScan); + + return std::move(rowAdder).toIdTableVocabPair(); + }; + + return createResultFromAction(requestLaziness, std::move(action), + [this] { return resultSortedOn(); }); +} + // _____________________________________________________________________________ Result OptionalJoin::computeResult(bool requestLaziness) { AD_LOG_DEBUG << "OptionalJoin result computation..." << endl; @@ -115,6 +272,58 @@ Result OptionalJoin::computeResult(bool requestLaziness) { return std::move(res).value(); } + // Check if the right child is an IndexScan (prefiltering optimization) + auto rightIndexScan = + std::dynamic_pointer_cast(_right->getRootOperation()); + + // Try prefiltering with IndexScans + if (rightIndexScan) { + auto leftIndexScan = + std::dynamic_pointer_cast(_left->getRootOperation()); + + // Case 1: Both children are IndexScans + if (leftIndexScan) { + if (auto res = computeResultForTwoIndexScans( + requestLaziness, *leftIndexScan, *rightIndexScan); + !res.idTable().empty() || res.idTable().numColumns() > 0) { + return res; + } + // If prefiltering failed (e.g., no metadata), fall through to regular + // path + } + + // Case 2: Right is IndexScan, left might be materialized or lazy + // Try to get left result (prefer cached/small) + bool leftIsSmall = + _left->getRootOperation()->getSizeEstimate() < + getRuntimeParameter< + &RuntimeParameters::lazyIndexScanMaxSizeMaterialization_>(); + auto leftResIfCached = _left->getRootOperation()->getResult( + false, leftIsSmall ? ComputationMode::FULLY_MATERIALIZED + : ComputationMode::ONLY_IF_CACHED); + + if (leftResIfCached && leftResIfCached->isFullyMaterialized()) { + // Left is materialized, use prefiltering + return computeResultForIndexScanOnRight( + requestLaziness, std::move(leftResIfCached), *rightIndexScan); + } + + // Get the full left result (might be lazy) + bool lazyJoinIsSupported = _joinColumns.size() == 1; + auto leftResult = _left->getResult(lazyJoinIsSupported); + + if (leftResult->isFullyMaterialized()) { + // Left became materialized, use prefiltering + return computeResultForIndexScanOnRight( + requestLaziness, std::move(leftResult), *rightIndexScan); + } else { + // Left is lazy, use lazy prefiltering + return computeResultForIndexScanOnRightLazy( + requestLaziness, std::move(leftResult), *rightIndexScan); + } + } + + // Regular path: no IndexScan optimization possible IdTable idTable{getResultWidth(), getExecutionContext()->getAllocator()}; AD_CONTRACT_CHECK(idTable.numColumns() >= _joinColumns.size() || @@ -450,7 +659,7 @@ void OptionalJoin::optionalJoin( // _____________________________________________________________________________ Result OptionalJoin::lazyOptionalJoin(std::shared_ptr left, std::shared_ptr right, - bool requestLaziness) { + bool requestLaziness) const { // If both inputs are fully materialized, we can join them more // efficiently. AD_CONTRACT_CHECK(!left->isFullyMaterialized() || diff --git a/src/engine/OptionalJoin.h b/src/engine/OptionalJoin.h index 428951ebc6..7512b7d1bd 100644 --- a/src/engine/OptionalJoin.h +++ b/src/engine/OptionalJoin.h @@ -9,6 +9,9 @@ #include "engine/Operation.h" #include "engine/QueryExecutionTree.h" +// Forward declarations +class IndexScan; + class OptionalJoin : public Operation { private: std::shared_ptr _left; @@ -79,7 +82,7 @@ class OptionalJoin : public Operation { // value `Id::makeUndefined()` for any entries marked as optional. Result lazyOptionalJoin(std::shared_ptr left, std::shared_ptr right, - bool requestLaziness); + bool requestLaziness) const; private: std::unique_ptr cloneImpl() const override; @@ -107,6 +110,26 @@ class OptionalJoin : public Operation { static Implementation computeImplementationFromIdTables( const IdTable& left, const IdTable& right, const std::vector>&); + + // Specialized implementations for joins involving IndexScans (prefiltering). + // These methods are similar to those in Join but adapted for OPTIONAL + // semantics (only the right child can be prefiltered). + + // When both children are IndexScans. Filter blocks on the right based on + // the left's block ranges. + Result computeResultForTwoIndexScans(bool requestLaziness, + IndexScan& leftScan, + IndexScan& rightScan) const; + + // When the right child is an IndexScan and the left is fully materialized. + Result computeResultForIndexScanOnRight(bool requestLaziness, + std::shared_ptr leftRes, + IndexScan& rightScan) const; + + // When the right child is an IndexScan and the left is lazy. + Result computeResultForIndexScanOnRightLazy( + bool requestLaziness, std::shared_ptr leftRes, + IndexScan& rightScan) const; }; #endif // QLEVER_SRC_ENGINE_OPTIONALJOIN_H diff --git a/src/index/CompressedRelation.cpp b/src/index/CompressedRelation.cpp index 906e3aa6c9..11a1addd7e 100644 --- a/src/index/CompressedRelation.cpp +++ b/src/index/CompressedRelation.cpp @@ -733,6 +733,243 @@ CompressedRelationReader::getBlocksForJoin( findMatchingBlocks(blocksWithFirstAndLastId2, blocksWithFirstAndLastId1)}; } +// _____________________________________________________________________________ +// Helper function to extract up to 3 relevant IDs from a PermutedTriple based +// on the scan specification. Returns a tuple of IDs for multi-column +// comparison. +namespace { +std::array getRelevantIdsFromTriple( + const CompressedBlockMetadata::PermutedTriple& triple, + const CompressedRelationReader::ScanSpecAndBlocksAndBounds& + metadataAndBlocks) { + const auto& scanSpec = metadataAndBlocks.scanSpec_; + + // Determine which columns are variable (not fixed by the scan spec) + std::array result{Id::makeUndefined(), Id::makeUndefined(), + Id::makeUndefined()}; + size_t idx = 0; + + if (!scanSpec.col0Id().has_value()) { + result[idx++] = triple.col0Id_; + } + if (!scanSpec.col1Id().has_value()) { + result[idx++] = triple.col1Id_; + } + if (!scanSpec.col2Id().has_value()) { + result[idx++] = triple.col2Id_; + } + + return result; +} +} // namespace + +// _____________________________________________________________________________ +auto CompressedRelationReader::getBlocksForJoinMultiColumn( + ql::span joinColumn1, ql::span joinColumn2, + const ScanSpecAndBlocksAndBounds& metadataAndBlocks) + -> GetBlocksForJoinResult { + if (joinColumn1.empty() || joinColumn2.empty() || + metadataAndBlocks.getBlockMetadataView().empty()) { + return {}; + } + + AD_CONTRACT_CHECK(joinColumn1.size() == joinColumn2.size(), + "Join columns must have the same size"); + + // For 2-column comparison: compare tuples (col1[i], col2[i]) with block + // ranges + auto tupleLessThanBlock = [&metadataAndBlocks]( + const Id& id1, const Id& id2, + const CompressedBlockMetadata& block) { + auto blockIds = + getRelevantIdsFromTriple(block.firstTriple_, metadataAndBlocks); + return std::tie(id1, id2) < std::tie(blockIds[0], blockIds[1]); + }; + + auto blockLessThanTuple = [&metadataAndBlocks]( + const CompressedBlockMetadata& block, + const Id& id1, const Id& id2) { + auto blockIds = + getRelevantIdsFromTriple(block.lastTriple_, metadataAndBlocks); + return std::tie(blockIds[0], blockIds[1]) < std::tie(id1, id2); + }; + + const auto& mdView = metadataAndBlocks.getBlockMetadataView(); + auto [blockIt, blockEnd] = getBeginAndEnd(mdView); + GetBlocksForJoinResult res; + auto& blockIdx = res.numHandledBlocks; + + // Iterate through join column tuples + for (size_t i = 0; i < joinColumn1.size(); ++i) { + Id id1 = joinColumn1[i]; + Id id2 = joinColumn2[i]; + + // Skip to first block that might contain this tuple + while (blockIt != blockEnd && blockLessThanTuple(*blockIt, id1, id2)) { + ++blockIt; + ++blockIdx; + } + if (blockIt == blockEnd) { + return res; + } + + // Add all blocks that might contain this tuple + auto currentBlockIt = blockIt; + while (currentBlockIt != blockEnd && + !tupleLessThanBlock(id1, id2, *currentBlockIt)) { + // Only add if not already added (avoid duplicates) + if (res.matchingBlocks_.empty() || + !(res.matchingBlocks_.back() == *currentBlockIt)) { + res.matchingBlocks_.push_back(*currentBlockIt); + } + ++currentBlockIt; + } + } + + return res; +} + +// _____________________________________________________________________________ +auto CompressedRelationReader::getBlocksForJoinMultiColumn( + ql::span joinColumn1, ql::span joinColumn2, + ql::span joinColumn3, + const ScanSpecAndBlocksAndBounds& metadataAndBlocks) + -> GetBlocksForJoinResult { + if (joinColumn1.empty() || joinColumn2.empty() || joinColumn3.empty() || + metadataAndBlocks.getBlockMetadataView().empty()) { + return {}; + } + + AD_CONTRACT_CHECK(joinColumn1.size() == joinColumn2.size() && + joinColumn1.size() == joinColumn3.size(), + "Join columns must have the same size"); + + // For 3-column comparison: compare tuples (col1[i], col2[i], col3[i]) + auto tupleLessThanBlock = [&metadataAndBlocks]( + const Id& id1, const Id& id2, const Id& id3, + const CompressedBlockMetadata& block) { + auto blockIds = + getRelevantIdsFromTriple(block.firstTriple_, metadataAndBlocks); + return std::tie(id1, id2, id3) < + std::tie(blockIds[0], blockIds[1], blockIds[2]); + }; + + auto blockLessThanTuple = [&metadataAndBlocks]( + const CompressedBlockMetadata& block, + const Id& id1, const Id& id2, const Id& id3) { + auto blockIds = + getRelevantIdsFromTriple(block.lastTriple_, metadataAndBlocks); + return std::tie(blockIds[0], blockIds[1], blockIds[2]) < + std::tie(id1, id2, id3); + }; + + const auto& mdView = metadataAndBlocks.getBlockMetadataView(); + auto [blockIt, blockEnd] = getBeginAndEnd(mdView); + GetBlocksForJoinResult res; + auto& blockIdx = res.numHandledBlocks; + + // Iterate through join column tuples + for (size_t i = 0; i < joinColumn1.size(); ++i) { + Id id1 = joinColumn1[i]; + Id id2 = joinColumn2[i]; + Id id3 = joinColumn3[i]; + + // Skip to first block that might contain this tuple + while (blockIt != blockEnd && blockLessThanTuple(*blockIt, id1, id2, id3)) { + ++blockIt; + ++blockIdx; + } + if (blockIt == blockEnd) { + return res; + } + + // Add all blocks that might contain this tuple + auto currentBlockIt = blockIt; + while (currentBlockIt != blockEnd && + !tupleLessThanBlock(id1, id2, id3, *currentBlockIt)) { + // Only add if not already added (avoid duplicates) + if (res.matchingBlocks_.empty() || + !(res.matchingBlocks_.back() == *currentBlockIt)) { + res.matchingBlocks_.push_back(*currentBlockIt); + } + ++currentBlockIt; + } + } + + return res; +} + +// _____________________________________________________________________________ +std::array, 2> +CompressedRelationReader::getBlocksForJoinMultiColumn( + const ScanSpecAndBlocksAndBounds& metadataAndBlocks1, + const ScanSpecAndBlocksAndBounds& metadataAndBlocks2, + size_t numJoinColumns) { + AD_CONTRACT_CHECK(numJoinColumns >= 1 && numJoinColumns <= 3); + + // Helper struct to store block with extracted IDs for all columns + struct BlockWithIds { + const CompressedBlockMetadata& block_; + std::array firstIds_; + std::array lastIds_; + }; + + // Compare blocks based on numJoinColumns + auto blockLessThanBlock = [numJoinColumns](const BlockWithIds& block1, + const BlockWithIds& block2) { + if (numJoinColumns == 1) { + return block1.lastIds_[0] < block2.firstIds_[0]; + } else if (numJoinColumns == 2) { + return std::tie(block1.lastIds_[0], block1.lastIds_[1]) < + std::tie(block2.firstIds_[0], block2.firstIds_[1]); + } else { // numJoinColumns == 3 + return std::tie(block1.lastIds_[0], block1.lastIds_[1], + block1.lastIds_[2]) < std::tie(block2.firstIds_[0], + block2.firstIds_[1], + block2.firstIds_[2]); + } + }; + + // Transform blocks to BlockWithIds + auto getBlocksWithIds = + [&blockLessThanBlock]( + const ScanSpecAndBlocksAndBounds& metadataAndBlocks) { + auto getSingleBlock = + [&metadataAndBlocks]( + const CompressedBlockMetadata& block) -> BlockWithIds { + return { + block, + getRelevantIdsFromTriple(block.firstTriple_, metadataAndBlocks), + getRelevantIdsFromTriple(block.lastTriple_, metadataAndBlocks)}; + }; + auto result = metadataAndBlocks.getBlockMetadataView() | + ql::views::transform(getSingleBlock); + AD_CORRECTNESS_CHECK(ql::ranges::is_sorted(result, blockLessThanBlock)); + return result; + }; + + auto blocksWithIds1 = getBlocksWithIds(metadataAndBlocks1); + auto blocksWithIds2 = getBlocksWithIds(metadataAndBlocks2); + + // Find matching blocks using binary search + auto findMatchingBlocks = [&blockLessThanBlock](const auto& blocks, + const auto& otherBlocks) { + std::vector result; + for (const auto& block : blocks) { + if (!ql::ranges::equal_range(otherBlocks, block, blockLessThanBlock) + .empty()) { + result.push_back(block.block_); + } + } + AD_CORRECTNESS_CHECK(std::unique(result.begin(), result.end()) == + result.end()); + return result; + }; + + return {findMatchingBlocks(blocksWithIds1, blocksWithIds2), + findMatchingBlocks(blocksWithIds2, blocksWithIds1)}; +} + // _____________________________________________________________________________ IdTable CompressedRelationReader::scan( const ScanSpecAndBlocks& scanSpecAndBlocks, diff --git a/src/index/CompressedRelation.h b/src/index/CompressedRelation.h index 3ba72d5181..cb18018123 100644 --- a/src/index/CompressedRelation.h +++ b/src/index/CompressedRelation.h @@ -740,6 +740,32 @@ class CompressedRelationReader { const ScanSpecAndBlocksAndBounds& metadataAndBlocks, const ScanSpecAndBlocksAndBounds& metadataAndBlocks2); + // Multi-column versions of getBlocksForJoin that compare blocks based on + // multiple columns (up to 3) to provide more aggressive filtering. These + // methods extract the relevant column IDs from the block's firstTriple and + // lastTriple based on the scan specification and compare them as tuples. + + // Get blocks where the relevant columns (determined by the scan spec) can + // match one of the tuples in joinColumns. `numColumns` indicates how many + // columns to compare (2 or 3). For example, if the scan has col0Id fixed, + // we compare (col1Id, col2Id) pairs from blocks against the joinColumns. + static GetBlocksForJoinResult getBlocksForJoinMultiColumn( + ql::span joinColumn1, ql::span joinColumn2, + const ScanSpecAndBlocksAndBounds& metadataAndBlocks); + + static GetBlocksForJoinResult getBlocksForJoinMultiColumn( + ql::span joinColumn1, ql::span joinColumn2, + ql::span joinColumn3, + const ScanSpecAndBlocksAndBounds& metadataAndBlocks); + + // For joining two index scans with multiple join columns, get the blocks + // from both sides that can potentially match. Compares up to 3 columns. + static std::array, 2> + getBlocksForJoinMultiColumn( + const ScanSpecAndBlocksAndBounds& metadataAndBlocks1, + const ScanSpecAndBlocksAndBounds& metadataAndBlocks2, + size_t numJoinColumns); + /** * @brief For a permutation XYZ, retrieve all Z for given X and Y (if `col1Id` * is set) or all YZ for a given X (if `col1Id` is `std::nullopt`. diff --git a/src/util/MemoryHelpers.h b/src/util/MemoryHelpers.h new file mode 100644 index 0000000000..f27f6f9405 --- /dev/null +++ b/src/util/MemoryHelpers.h @@ -0,0 +1,22 @@ +// Copyright 2026, University of Freiburg, +// Chair of Algorithms and Data Structures. +// Author: Johannes Kalmbach + +#ifndef QLEVER_MEMORYHELPERS_H +#define QLEVER_MEMORYHELPERS_H + +#include +#include + +namespace ad_utility { + +// Helper to create a shared_ptr with automatic type deduction. +// Usage: auto ptr = toSharedPtr(std::move(myObject)); +template +auto toSharedPtr(T&& element) { + return std::make_shared>(std::forward(element)); +} + +} // namespace ad_utility + +#endif // QLEVER_MEMORYHELPERS_H diff --git a/test/MinusTest.cpp b/test/MinusTest.cpp index 3fb67fac70..28923cf6a7 100644 --- a/test/MinusTest.cpp +++ b/test/MinusTest.cpp @@ -18,10 +18,14 @@ #include "util/IdTableHelpers.h" #include "util/IndexTestHelpers.h" #include "util/OperationTestHelpers.h" +#include "util/RuntimeParametersTestHelpers.h" namespace { auto V = ad_utility::testing::VocabId; constexpr auto U = Id::makeUndefined(); +auto iri = [](std::string_view s) { + return TripleComponent::Iri::fromIriref(s); +}; // Helper function to test minus implementations. void testMinus(std::vector leftTables, @@ -650,3 +654,88 @@ TEST(Minus, MinusRowHandlerKeepsLeftLocalVocabAfterFlush) { ::testing::ElementsAre(testLiteral)); EXPECT_TRUE(std::move(handler).resultTable().empty()); } + +// _____________________________________________________________________________ +TEST(Minus, prefilteringWithTwoIndexScans) { + // Create a dataset where some subjects from p1 also appear in p2. + // MINUS should remove those subjects from the result. + // This tests that the right IndexScan is prefiltered based on left's data. + std::string kg; + for (size_t i = 0; i < 20; ++i) { + kg += absl::StrCat(" .\n"); + } + // Subjects s5-s14 also appear in p2 (these should be removed) + for (size_t i = 5; i < 15; ++i) { + kg += absl::StrCat(" .\n"); + } + + auto qec = ad_utility::testing::getQec(kg); + auto cleanup = setRuntimeParameterForTest< + &RuntimeParameters::lazyIndexScanMaxSizeMaterialization_>(1); + qec->getQueryTreeCache().clearAll(); + + using V = Variable; + auto scan1 = ad_utility::makeExecutionTree( + qec, Permutation::PSO, + SparqlTripleSimple{V{"?s"}, iri(""), V{"?o1"}}); + auto scan2 = ad_utility::makeExecutionTree( + qec, Permutation::PSO, + SparqlTripleSimple{V{"?s"}, iri(""), V{"?o2"}}); + + auto minusOp = ad_utility::makeExecutionTree(qec, scan1, scan2); + + auto result = minusOp->getResult(); + + // Verify result correctness: 10 rows (20 - 10 removed) + ASSERT_TRUE(result->isFullyMaterialized()); + EXPECT_EQ(result->idTable().size(), 10); + + // Verify that the operation was recognized as using IndexScans by checking + // runtime info exists + const auto& scan1Rti = scan1->getRootOperation()->getRuntimeInfoPointer(); + const auto& scan2Rti = scan2->getRootOperation()->getRuntimeInfoPointer(); + ASSERT_NE(scan1Rti, nullptr); + ASSERT_NE(scan2Rti, nullptr); +} + +// _____________________________________________________________________________ +TEST(Minus, prefilteringWithLazyLeftAndIndexScanRight) { + // Create a dataset where some subjects from p1 also appear in p2. + // MINUS should remove those subjects from the result. + std::string kg; + for (size_t i = 0; i < 20; ++i) { + kg += absl::StrCat(" .\n"); + } + // Subjects s5-s14 also appear in p2 (these should be removed by MINUS) + for (size_t i = 5; i < 15; ++i) { + kg += absl::StrCat(" .\n"); + } + + auto qec = ad_utility::testing::getQec(kg); + // Set threshold to force lazy execution + auto cleanup = setRuntimeParameterForTest< + &RuntimeParameters::lazyIndexScanMaxSizeMaterialization_>(1); + qec->getQueryTreeCache().clearAll(); + + using V = Variable; + auto scan1 = ad_utility::makeExecutionTree( + qec, Permutation::PSO, + SparqlTripleSimple{V{"?s"}, iri(""), V{"?o1"}}); + auto scan2 = ad_utility::makeExecutionTree( + qec, Permutation::PSO, + SparqlTripleSimple{V{"?s"}, iri(""), V{"?o2"}}); + + auto minusOp = ad_utility::makeExecutionTree(qec, scan1, scan2); + auto result = minusOp->getResult(); + + // Verify result correctness: 10 rows (s0-s4 and s15-s19, excluding s5-s14) + ASSERT_TRUE(result->isFullyMaterialized()); + EXPECT_EQ(result->idTable().size(), 10); + + // All rows should be defined (no UNDEFs in MINUS results) + const auto& table = result->idTable(); + for (size_t i = 0; i < table.size(); ++i) { + EXPECT_FALSE(table(i, 0).isUndefined()); + EXPECT_FALSE(table(i, 1).isUndefined()); + } +} diff --git a/test/MultiColumnJoinTest.cpp b/test/MultiColumnJoinTest.cpp index b3478703a2..be13aa8001 100644 --- a/test/MultiColumnJoinTest.cpp +++ b/test/MultiColumnJoinTest.cpp @@ -16,11 +16,15 @@ #include "util/IdTestHelpers.h" #include "util/IndexTestHelpers.h" #include "util/OperationTestHelpers.h" +#include "util/RuntimeParametersTestHelpers.h" using ad_utility::testing::makeAllocator; namespace { auto V = ad_utility::testing::VocabId; -} +auto iri = [](std::string_view s) { + return TripleComponent::Iri::fromIriref(s); +}; +} // namespace TEST(EngineTest, multiColumnJoinTest) { using std::array; @@ -179,3 +183,44 @@ TEST(MultiColumnJoin, columnOriginatesFromGraphOrUndef) { testWithTrees(values2, values3, false, false, false); testWithTrees(values2, values1, false, false, false); } + +// _____________________________________________________________________________ +TEST(MultiColumnJoin, prefilteringWithTwoIndexScans) { + // Create a dataset with overlap in subjects between two predicates. + // This tests that both IndexScans can be prefiltered when joining. + std::string kg; + for (size_t i = 0; i < 15; ++i) { + kg += absl::StrCat(" .\n"); + } + for (size_t i = 5; i < 20; ++i) { + kg += absl::StrCat(" .\n"); + } + + auto qec = ad_utility::testing::getQec(kg); + auto cleanup = setRuntimeParameterForTest< + &RuntimeParameters::lazyIndexScanMaxSizeMaterialization_>(1); + qec->getQueryTreeCache().clearAll(); + + using V = Variable; + auto scan1 = ad_utility::makeExecutionTree( + qec, Permutation::PSO, + SparqlTripleSimple{V{"?s"}, iri(""), V{"?o1"}}); + auto scan2 = ad_utility::makeExecutionTree( + qec, Permutation::PSO, + SparqlTripleSimple{V{"?s"}, iri(""), V{"?o2"}}); + + auto join = ad_utility::makeExecutionTree(qec, scan1, scan2); + + auto result = join->getResult(); + + // Verify result correctness: only subjects s5-s14 appear in both (10 rows) + ASSERT_TRUE(result->isFullyMaterialized()); + EXPECT_EQ(result->idTable().size(), 10); + + // Verify that the operation was recognized as using IndexScans by checking + // runtime info exists + const auto& scan1Rti = scan1->getRootOperation()->getRuntimeInfoPointer(); + const auto& scan2Rti = scan2->getRootOperation()->getRuntimeInfoPointer(); + ASSERT_NE(scan1Rti, nullptr); + ASSERT_NE(scan2Rti, nullptr); +} diff --git a/test/engine/OptionalJoinTest.cpp b/test/engine/OptionalJoinTest.cpp index 43faed3ccc..e117dce60b 100644 --- a/test/engine/OptionalJoinTest.cpp +++ b/test/engine/OptionalJoinTest.cpp @@ -10,6 +10,7 @@ #include "../util/IdTestHelpers.h" #include "../util/IndexTestHelpers.h" #include "../util/OperationTestHelpers.h" +#include "../util/RuntimeParametersTestHelpers.h" #include "./ValuesForTesting.h" #include "engine/CallFixedSize.h" #include "engine/IndexScan.h" @@ -23,6 +24,9 @@ using ad_utility::testing::makeAllocator; using namespace ad_utility::testing; namespace { auto V = VocabId; +auto iri = [](std::string_view s) { + return TripleComponent::Iri::fromIriref(s); +}; constexpr auto U = Id::makeUndefined(); using JoinColumns = std::vector>; @@ -769,3 +773,116 @@ TEST(OptionalJoin, columnOriginatesFromGraphOrUndef) { testWithTrees(index3, index2, true, true, true); testWithTrees(index3, values1, false, false, true); } + +// _____________________________________________________________________________ +TEST(OptionalJoin, prefilteringWithTwoIndexScans) { + // Create a dataset where not all subjects from p1 appear in p2. + // This tests that the right IndexScan is prefiltered based on left's data. + std::string kg; + for (size_t i = 0; i < 20; ++i) { + kg += absl::StrCat(" .\n"); + } + // Only subjects s5-s14 appear in p2 (10 out of 20) + for (size_t i = 5; i < 15; ++i) { + kg += absl::StrCat(" .\n"); + } + + auto qec = ad_utility::testing::getQec(kg); + auto cleanup = setRuntimeParameterForTest< + &RuntimeParameters::lazyIndexScanMaxSizeMaterialization_>(1); + qec->getQueryTreeCache().clearAll(); + + using V = Variable; + auto scan1 = ad_utility::makeExecutionTree( + qec, Permutation::PSO, + SparqlTripleSimple{V{"?s"}, iri(""), V{"?o1"}}); + auto scan2 = ad_utility::makeExecutionTree( + qec, Permutation::PSO, + SparqlTripleSimple{V{"?s"}, iri(""), V{"?o2"}}); + + auto optJoin = ad_utility::makeExecutionTree(qec, scan1, scan2); + + auto result = optJoin->getResult(); + + // Verify result correctness: 20 rows (all from left) + ASSERT_TRUE(result->isFullyMaterialized()); + EXPECT_EQ(result->idTable().size(), 20); + + const auto& table = result->idTable(); + + // Count how many rows have defined vs undefined values in the o2 column + size_t definedCount = 0; + size_t undefCount = 0; + for (size_t i = 0; i < table.size(); ++i) { + if (table(i, 2).isUndefined()) { + undefCount++; + } else { + definedCount++; + } + } + + // We expect 10 subjects to match (s5-s14) and 10 to not match + EXPECT_EQ(definedCount, 10); + EXPECT_EQ(undefCount, 10); + + // Verify that the operation was recognized as using IndexScans by checking + // runtime info exists + const auto& scan1Rti = scan1->getRootOperation()->getRuntimeInfoPointer(); + const auto& scan2Rti = scan2->getRootOperation()->getRuntimeInfoPointer(); + ASSERT_NE(scan1Rti, nullptr); + ASSERT_NE(scan2Rti, nullptr); +} + +// _____________________________________________________________________________ +TEST(OptionalJoin, prefilteringWithLazyLeftAndIndexScanRight) { + // Create a dataset where not all subjects from p1 appear in p2. + // This tests that the right IndexScan is prefiltered based on lazy left's + // data. + std::string kg; + for (size_t i = 0; i < 20; ++i) { + kg += absl::StrCat(" .\n"); + } + // Only subjects s5-s14 appear in p2 (10 out of 20) + for (size_t i = 5; i < 15; ++i) { + kg += absl::StrCat(" .\n"); + } + + auto qec = ad_utility::testing::getQec(kg); + // Set threshold to force lazy execution + auto cleanup = setRuntimeParameterForTest< + &RuntimeParameters::lazyIndexScanMaxSizeMaterialization_>(1); + qec->getQueryTreeCache().clearAll(); + + using V = Variable; + auto scan1 = ad_utility::makeExecutionTree( + qec, Permutation::PSO, + SparqlTripleSimple{V{"?s"}, iri(""), V{"?o1"}}); + auto scan2 = ad_utility::makeExecutionTree( + qec, Permutation::PSO, + SparqlTripleSimple{V{"?s"}, iri(""), V{"?o2"}}); + + auto optJoin = ad_utility::makeExecutionTree(qec, scan1, scan2); + + auto result = optJoin->getResult(); + + // Verify result correctness: 20 rows (all from left) + ASSERT_TRUE(result->isFullyMaterialized()); + EXPECT_EQ(result->idTable().size(), 20); + + const auto& table = result->idTable(); + + // Count how many rows have defined vs undefined values in the o2 column + size_t definedCount = 0; + size_t undefCount = 0; + for (size_t i = 0; i < table.size(); ++i) { + if (table(i, 2).isUndefined()) { + undefCount++; + } else { + definedCount++; + } + } + + // We expect 10 subjects to match (s5-s14) and 10 to not match + EXPECT_EQ(definedCount, 10); + EXPECT_EQ(undefCount, 10); +}