Skip to content
5 changes: 4 additions & 1 deletion src/global/Constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,9 +145,12 @@ constexpr inline std::string_view MATCHINGWORD_VARIABLE_PREFIX =

// Local names of QLever-internal predicates. They are kept as named constants
// so that they can be passed as template arguments to
// `makeQleverInternalIriConst` below.
namespace constants::details::strings {
// Local name of the predicate behind `LANGUAGE_PREDICATE`.
constexpr inline std::string_view langtag{"langtag"};
// Local name of the predicate behind `HAS_WORD_PREDICATE`.
constexpr inline std::string_view hasWord{"has-word"};
}  // namespace constants::details::strings
constexpr inline std::string_view LANGUAGE_PREDICATE =
makeQleverInternalIriConst<constants::details::strings::langtag>();
constexpr inline std::string_view HAS_WORD_PREDICATE =
makeQleverInternalIriConst<constants::details::strings::hasWord>();

// TODO<joka921> Move them to their own file, make them strings, remove
// duplications, etc.
Expand Down
3 changes: 3 additions & 0 deletions src/index/Index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,9 @@ bool& Index::usePatterns() { return pimpl_->usePatterns(); }
// ____________________________________________________________________________
// Forward to the implementation object and hand back its `bool&`, so that
// callers can read the setting as well as assign to it.
bool& Index::loadAllPermutations() {
  bool& setting = pimpl_->loadAllPermutations();
  return setting;
}

// ____________________________________________________________________________
// Forward to the implementation object and hand back its `bool&`, so that
// callers can read the `ql:has-word` setting as well as assign to it.
bool& Index::addHasWordTriples() {
  bool& setting = pimpl_->addHasWordTriples();
  return setting;
}

// ____________________________________________________________________________
void Index::setKeepTempFiles(bool keepTempFiles) {
return pimpl_->setKeepTempFiles(keepTempFiles);
Expand Down
2 changes: 2 additions & 0 deletions src/index/Index.h
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,8 @@ class Index {

bool& loadAllPermutations();

// Mutable access to the flag that controls whether `ql:has-word` triples are
// added for the words in literals. The call is forwarded to the
// implementation object.
bool& addHasWordTriples();

void setKeepTempFiles(bool keepTempFiles);

ad_utility::MemorySize& memoryLimitIndexBuilding();
Expand Down
3 changes: 3 additions & 0 deletions src/index/IndexBuilderMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,9 @@ int main(int argc, char** argv) {
po::bool_switch(&config.onlyPsoAndPos_),
"Only build the PSO and POS permutations. This is faster, but then "
"queries with predicate variables are not supported");
add("add-has-word-triples", po::bool_switch(&config.addHasWordTriples_),
"Add `ql:has-word` triples for each word in each literal. This enables "
"keyword search in literals via `?literal ql:has-word \"word\"`.");
auto msg = absl::StrCat(
"The vocabulary implementation for strings in qlever, can be any of ",
ad_utility::VocabularyType::getListOfSupportedValues());
Expand Down
281 changes: 168 additions & 113 deletions src/index/IndexBuilderTypes.h

Large diffs are not rendered by default.

33 changes: 26 additions & 7 deletions src/index/IndexImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <absl/strings/str_join.h>

#include <atomic>
#include <cstdio>
#include <future>
#include <numeric>
Expand All @@ -20,6 +21,7 @@
#include "index/IndexFormatVersion.h"
#include "index/VocabularyMerger.h"
#include "parser/ParallelParseBuffer.h"
#include "parser/WordsAndDocsFileParser.h"
#include "util/BatchedPipeline.h"
#include "util/CachingMemoryResource.h"
#include "util/Generator.h"
Expand Down Expand Up @@ -513,6 +515,8 @@ IndexBuilderDataAsExternalVector IndexImpl::passFileForVocabulary(

ad_utility::CachingMemoryResource cachingMemoryResource;
ItemAlloc itemAlloc(&cachingMemoryResource);
// Counter for the number of ql:has-word triples created.
std::atomic<size_t> numHasWordTriples = 0;
while (!parserExhausted) {
size_t actualCurrentPartialSize = 0;

Expand All @@ -533,9 +537,9 @@ IndexBuilderDataAsExternalVector IndexImpl::passFileForVocabulary(
// get the Ids for the original triple and the possibly added language
// Tag triples using the provided HashMaps via itemArray. See
// documentation of the function for more details
getIdMapLambdas<NUM_PARALLEL_ITEM_MAPS>(&itemArray, linesPerPartial,
&(vocab_.getCaseComparator()),
this, itemAlloc));
getIdMapLambdas<NUM_PARALLEL_ITEM_MAPS>(
&itemArray, linesPerPartial, &(vocab_.getCaseComparator()), this,
itemAlloc, addHasWordTriples_ ? &numHasWordTriples : nullptr));

while (auto opt = p.getNextValue()) {
numTriplesParsedTimer.cont();
Expand Down Expand Up @@ -601,6 +605,10 @@ IndexBuilderDataAsExternalVector IndexImpl::passFileForVocabulary(
AD_LOG_INFO << "Number of triples created (including QLever-internal ones): "
<< (*idTriples.wlock())->size() << " [may contain duplicates]"
<< std::endl;
if (addHasWordTriples_) {
AD_LOG_INFO << "Number of `ql:has-word` triples created: "
<< numHasWordTriples.load() << std::endl;
}
AD_LOG_INFO << "Number of partial vocabularies created: " << numFiles
<< std::endl;

Expand Down Expand Up @@ -1052,6 +1060,9 @@ bool& IndexImpl::usePatterns() { return usePatterns_; }
// _____________________________________________________________________________
bool& IndexImpl::loadAllPermutations() { return loadAllPermutations_; }

// _____________________________________________________________________________
bool& IndexImpl::addHasWordTriples() { return addHasWordTriples_; }

// ____________________________________________________________________________
void IndexImpl::setSettingsFile(const std::string& filename) {
settingsFileName_ = filename;
Expand Down Expand Up @@ -1210,19 +1221,27 @@ void IndexImpl::readConfiguration() {
}

// ___________________________________________________________________________
LangtagAndTriple IndexImpl::tripleToInternalRepresentation(
TurtleTriple&& triple) const {
LangtagAndTriple result{"", {}};
ProcessedTriple IndexImpl::processTriple(TurtleTriple&& triple) const {
ProcessedTriple result{{}, "", {}};
auto& resultTriple = result.triple_;
if (triple.object_.isLiteral()) {
const auto& lit = triple.object_.getLiteral();
if (lit.hasLanguageTag()) {
result.langtag_ = std::string(asStringViewUnsafe(lit.getLanguageTag()));
}
// Extract words from the literal content for ql:has-word triples and
// count their term frequencies.
if (addHasWordTriples_) {
std::string_view content = asStringViewUnsafe(lit.getContent());
for (auto&& word :
tokenizeAndNormalizeText(content, vocab_.getLocaleManager())) {
++result.wordFrequencies_[std::move(word)];
}
}
}

// The following lambda deals with triple elements that might be strings
// (literals or IRIs) as well as values that can be decoded into the IRI
// (literals or IRIs) as well as values that can be encoded into the IRI
// directly. These currently are the object and the graph ID of the triple.
// The `index` is the index of the element within the triple. For example if
// the `getter` is `subject_` then the index has to be `0`.
Expand Down
13 changes: 12 additions & 1 deletion src/index/IndexImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,9 @@ class IndexImpl {
double avgNumDistinctSubjectsPerPredicate_;
uint64_t numDistinctSubjectPredicatePairs_;

// If true, add `ql:has-word` triples for each word in each literal.
// NOTE: The in-class default is `true`; the value can be changed through the
// reference returned by `addHasWordTriples()`.
bool addHasWordTriples_ = true;

size_t parserBatchSize_ = PARSER_BATCH_SIZE;
size_t numTriplesPerBatch_ = NUM_TRIPLES_PER_PARTIAL_VOCAB;

Expand Down Expand Up @@ -439,6 +442,8 @@ class IndexImpl {

bool& loadAllPermutations();

// Mutable access to `addHasWordTriples_`, the flag that controls whether
// `ql:has-word` triples are added for the words in literals.
bool& addHasWordTriples();

void setKeepTempFiles(bool keepTempFiles);

ad_utility::MemorySize& memoryLimitIndexBuilding() {
Expand Down Expand Up @@ -649,7 +654,13 @@ class IndexImpl {
bool isLiteral(std::string_view object) const;

public:
LangtagAndTriple tripleToInternalRepresentation(TurtleTriple&& triple) const;
// Process the given parsed triple in a number of ways:
//
// 1. If the object is a literal with a language tag, extract and store the
//    language tag.
// 2. If the object is a literal and `addHasWordTriples_` is set, tokenize and
//    normalize its content and store the distinct words together with their
//    term frequencies. Without that flag, no words are extracted.
// 3. If the IRI or literal can be encoded directly into an `Id`, do so.
ProcessedTriple processTriple(TurtleTriple&& triple) const;

protected:
/**
Expand Down
1 change: 1 addition & 0 deletions src/libqlever/Qlever.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ void Qlever::buildIndex(IndexBuilderConfig config) {
index.setKeepTempFiles(config.keepTemporaryFiles_);
index.setSettingsFile(config.settingsFile_);
index.loadAllPermutations() = !config.onlyPsoAndPos_;
index.addHasWordTriples() = config.addHasWordTriples_;
index.getImpl().setVocabularyTypeForIndexBuilding(config.vocabType_);
index.getImpl().setPrefixesForEncodedValues(config.prefixesForIdEncodedIris_);

Expand Down
7 changes: 7 additions & 0 deletions src/libqlever/Qlever.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,13 @@ struct CommonConfig {
// TODO: We have not tested this mode in a while. In particular, it is
// unlikely to work when updates are involved.
bool onlyPsoAndPos_ = false;

// Option to add `ql:has-word` triples for each word in each literal. For
// each literal, a triple `<literal> ql:has-word "word"` is added for each
// word in the literal. This is useful for keyword search in literals.
// NOTE: While testing, this defaults to true. Eventually, it should default
// to false.
bool addHasWordTriples_ = true;
Comment on lines +68 to +70
Copy link

Copilot AI Jan 9, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comment indicates this defaults to true "while testing" and "eventually, it should default to false." This suggests a temporary state during development. Consider whether the default should be changed to false before merging, or if a TODO/issue should track this decision. Having a default of true could unexpectedly increase index sizes for users who don't explicitly set this option.

Suggested change
// NOTE: While testing, this defaults to true. Eventually, it should default
// to false.
bool addHasWordTriples_ = true;
// NOTE: This defaults to false to avoid increasing index sizes
// unexpectedly. Enable explicitly if keyword search in literals is needed.
bool addHasWordTriples_ = false;

Copilot uses AI. Check for mistakes.
};

// Additional configuration used for building an index for a given dataset.
Expand Down
10 changes: 5 additions & 5 deletions src/parser/TripleComponent.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,11 @@ CPP_concept MoveAssignableWith =

} // namespace ad_utility::detail

/// A wrapper around a `std::variant` that can hold the different types that the
/// subject, predicate, or object of a triple can have in the Turtle Parser.
/// Those currently are `double` (xsd:double and xsd:decimal), `int64_t`
/// (xsd:int and xsd:integer) and `std::string` (variables, IRIs, and literals
/// of any other type).
// A wrapper around a `std::variant` that can hold the different types that the
// subject, predicate, or object of a triple can have in the Turtle Parser.
// Those currently are `double` (xsd:double and xsd:decimal), `int64_t`
// (xsd:int and xsd:integer) and `std::string` (variables, IRIs, and literals
// of any other type).
class TripleComponent {
public:
using Literal = ad_utility::triple_component::Literal;
Expand Down
48 changes: 24 additions & 24 deletions test/IndexTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -426,19 +426,18 @@ auto IsPossiblyExternalString = [](TripleComponent content, bool isExternal) {
::testing::Eq(isExternal))));
};

TEST(IndexTest, TripleToInternalRepresentation) {
TEST(IndexTest, processTriple) {
{
IndexImpl index{ad_utility::makeUnlimitedAllocator<Id>()};
TurtleTriple turtleTriple{iri("<subject>"), iri("<predicate>"),
lit("\"literal\"")};
LangtagAndTriple res =
index.tripleToInternalRepresentation(std::move(turtleTriple));
EXPECT_TRUE(res.langtag_.empty());
EXPECT_THAT(res.triple_[0],
ProcessedTriple result = index.processTriple(std::move(turtleTriple));
EXPECT_TRUE(result.langtag_.empty());
EXPECT_THAT(result.triple_[0],
IsPossiblyExternalString(iri("<subject>"), true));
EXPECT_THAT(res.triple_[1],
EXPECT_THAT(result.triple_[1],
IsPossiblyExternalString(iri("<predicate>"), true));
EXPECT_THAT(res.triple_[2],
EXPECT_THAT(result.triple_[2],
IsPossiblyExternalString(lit("\"literal\""), true));
}
{
Expand All @@ -447,23 +446,21 @@ TEST(IndexTest, TripleToInternalRepresentation) {
std::vector{"<subj"s});
TurtleTriple turtleTriple{iri("<subject>"), iri("<predicate>"),
lit("\"literal\"", "@fr")};
LangtagAndTriple res =
index.tripleToInternalRepresentation(std::move(turtleTriple));
EXPECT_EQ(res.langtag_, "fr");
EXPECT_THAT(res.triple_[0],
ProcessedTriple result = index.processTriple(std::move(turtleTriple));
EXPECT_EQ(result.langtag_, "fr");
EXPECT_THAT(result.triple_[0],
IsPossiblyExternalString(iri("<subject>"), true));
EXPECT_THAT(res.triple_[1],
EXPECT_THAT(result.triple_[1],
IsPossiblyExternalString(iri("<predicate>"), false));
// By default all languages other than English are externalized.
EXPECT_THAT(res.triple_[2],
EXPECT_THAT(result.triple_[2],
IsPossiblyExternalString(lit("\"literal\"", "@fr"), true));
}
{
IndexImpl index{ad_utility::makeUnlimitedAllocator<Id>()};
TurtleTriple turtleTriple{iri("<subject>"), iri("<predicate>"), 42.0};
LangtagAndTriple res =
index.tripleToInternalRepresentation(std::move(turtleTriple));
EXPECT_EQ(Id::makeFromDouble(42.0), std::get<Id>(res.triple_[2]));
ProcessedTriple result = index.processTriple(std::move(turtleTriple));
EXPECT_EQ(Id::makeFromDouble(42.0), std::get<Id>(result.triple_[2]));
}
}

Expand All @@ -473,7 +470,9 @@ TEST(IndexTest, NumDistinctEntities) {
"<x> "
"<label> \"Beta\". <x> <is-a> <y>. <y> <is-a> <x>. <z> <label> "
"\"zz\"@en";
const auto& qec = *getQec(turtleInput);
TestIndexConfig config{turtleInput};
config.addHasWordTriples = true;
const auto& qec = *getQec(config);
const IndexImpl& index = qec.getIndex().getImpl();
// Note: Those numbers might change as the triples of the test index in
// `IndexTestHelpers.cpp` change.
Expand All @@ -487,10 +486,10 @@ TEST(IndexTest, NumDistinctEntities) {

auto numPredicates = index.numDistinctPredicates();
EXPECT_EQ(numPredicates.normal, 2);
// The added numPredicates are `ql:has-pattern`, `ql:langtag`, and one added
// predicate for each combination of predicate+language that is actually used
// (e.g. `@en@label`).
EXPECT_EQ(numPredicates.internal, 3);
// The added numPredicates are `ql:has-pattern`, `ql:langtag`, `ql:has-word`,
// and one added predicate for each combination of predicate+language that is
// actually used (e.g. `@en@label`).
EXPECT_EQ(numPredicates.internal, 4);
EXPECT_EQ(numPredicates, index.numDistinctCol0(Permutation::PSO));
EXPECT_EQ(numPredicates, index.numDistinctCol0(Permutation::POS));

Expand All @@ -501,9 +500,10 @@ TEST(IndexTest, NumDistinctEntities) {

auto numTriples = index.numTriples();
EXPECT_EQ(numTriples.normal, 7);
// Two added triples for each triple that has an object with a language tag
// and one triple per subject for the pattern.
EXPECT_EQ(numTriples.internal, 5);
// Two added triples for each triple that has an object with a language tag,
// one triple per subject for the pattern, and one ql:has-word triple per
// word in the literals (5 literals with 1 word each = 5 word triples).
EXPECT_EQ(numTriples.internal, 10);

auto multiplicities =
index.getMultiplicities(index.getPermutation(Permutation::SPO));
Expand Down
3 changes: 3 additions & 0 deletions test/util/IndexTestHelpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ Index makeIndexWithTestSettings(ad_utility::MemorySize parserBufferSize) {
parserBufferSize; // Note that the default value remains unchanged, but
// some tests (i.e. polygon testing in Spatial Joins)
// require a larger buffer size
// By default, don't add ql:has-word triples in test indices.
index.addHasWordTriples() = false;
return index;
}

Expand Down Expand Up @@ -197,6 +199,7 @@ Index makeTestIndex(const std::string& indexBasename, TestIndexConfig c) {
index.usePatterns() = c.usePatterns;
index.setSettingsFile(inputFilename + ".settings.json");
index.loadAllPermutations() = c.loadAllPermutations;
index.addHasWordTriples() = c.addHasWordTriples;
qlever::InputFileSpecification spec{inputFilename, c.indexType,
std::nullopt};
// randomly choose one of the vocabulary implementations
Expand Down
8 changes: 6 additions & 2 deletions test/util/IndexTestHelpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ struct TestIndexConfig {
qlever::Filetype indexType = qlever::Filetype::Turtle;
std::optional<VocabularyType> vocabularyType = std::nullopt;
std::optional<EncodedIriManager> encodedIriManager = std::nullopt;
// If true, add `ql:has-word` triples for each word in each literal during
// index building.
bool addHasWordTriples = false;

// A very typical use case is to only specify the turtle input, and leave all
// the other members as the default. We therefore have a dedicated constructor
Expand All @@ -83,13 +86,14 @@ struct TestIndexConfig {
c.blocksizePermutations, c.createTextIndex,
c.addWordsFromLiterals, c.contentsOfWordsFileAndDocsfile,
c.parserBufferSize, c.scoringMetric, c.bAndKParam,
c.indexType, c.encodedIriManager);
c.indexType, c.encodedIriManager, c.addHasWordTriples);
}
QL_DEFINE_DEFAULTED_EQUALITY_OPERATOR_LOCAL(
TestIndexConfig, turtleInput, loadAllPermutations, usePatterns,
usePrefixCompression, blocksizePermutations, createTextIndex,
addWordsFromLiterals, contentsOfWordsFileAndDocsfile, parserBufferSize,
scoringMetric, bAndKParam, indexType, vocabularyType, encodedIriManager)
scoringMetric, bAndKParam, indexType, vocabularyType, encodedIriManager,
addHasWordTriples)
};

// Create a test index at the given `indexBasename` and with the given `config`.
Expand Down