Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parallel parsing, fix logging and deprecation warnings #1620

Merged
merged 5 commits into from
Nov 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion src/index/IndexBuilderMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -305,12 +305,14 @@ int main(int argc, char** argv) {
}

bool parseInParallel = getParameterValue(i, parseParallel, false);
bool parseInParallelSetExplicitly = i < parseParallel.size();
auto& filename = inputFile.at(i);
if (filename == "-") {
filename = "/dev/stdin";
}
fileSpecs.emplace_back(filename, getFiletype(type, filename),
std::move(defaultGraph), parseInParallel);
std::move(defaultGraph), parseInParallel,
parseInParallelSetExplicitly);
}
return fileSpecs;
};
Expand Down
51 changes: 29 additions & 22 deletions src/index/IndexImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -291,36 +291,43 @@ std::pair<size_t, size_t> IndexImpl::createInternalPSOandPOS(
void IndexImpl::updateInputFileSpecificationsAndLog(
std::vector<Index::InputFileSpecification>& spec,
std::optional<bool> parallelParsingSpecifiedViaJson) {
if (spec.size() == 1) {
LOG(INFO) << "Processing triples from " << spec.at(0).filename_ << " ..."
<< std::endl;
} else {
LOG(INFO) << "Processing triples from " << spec.size()
<< " input streams ..." << std::endl;
}
if (parallelParsingSpecifiedViaJson.value_or(false) == true) {
std::string pleaseUseParallelParsingOption =
"please use the command-line option --parse-parallel or -p";
// Parallel parsing specified in the `settings.json` file. This is deprecated
// for a single input stream and forbidden for multiple input streams.
if (parallelParsingSpecifiedViaJson.has_value()) {
if (spec.size() == 1) {
LOG(WARN) << "Parallel parsing set to `true` in the `.settings.json` "
"file; this is deprecated, please use the command-line "
" option --parse-parallel or -p instead"
<< std::endl;
spec.at(0).parseInParallel_ = true;
LOG(WARN) << "Parallel parsing set in the `.settings.json` file; this is "
"deprecated, "
<< pleaseUseParallelParsingOption << std::endl;
spec.at(0).parseInParallel_ = parallelParsingSpecifiedViaJson.value();
} else {
throw std::runtime_error{
"For more than one input file, the parallel parsing must not be "
"specified via the `.settings.json` file, but has to be specified "
" via the command-line option --parse-parallel or -p"};
throw std::runtime_error{absl::StrCat(
"Parallel parsing set in the `.settings.json` file; this is "
"forbidden for multiple input streams, ",
pleaseUseParallelParsingOption)};
}
}

if (spec.size() == 1 && !parallelParsingSpecifiedViaJson.has_value()) {
// For a single input stream, if parallel parsing is not specified explicitly
// on the command line, we set if implicitly for backward compatibility.
if (!parallelParsingSpecifiedViaJson.has_value() && spec.size() == 1 &&
!spec.at(0).parseInParallelSetExplicitly_) {
LOG(WARN) << "Implicitly using the parallel parser for a single input file "
"for reasons of backward compatibility; this is deprecated, "
"please use the command-line option --parse-parallel or -p "
"instead"
<< std::endl;
<< pleaseUseParallelParsingOption << std::endl;
spec.at(0).parseInParallel_ = true;
}
// For a single input stream, show the name and whether we parse in parallel.
// For multiple input streams, only show the number of streams.
if (spec.size() == 1) {
LOG(INFO) << "Parsing triples from single input stream "
<< spec.at(0).filename_ << " (parallel = "
<< (spec.at(0).parseInParallel_ ? "true" : "false") << ") ..."
<< std::endl;
} else {
LOG(INFO) << "Processing triples from " << spec.size()
<< " input streams ..." << std::endl;
}
}

// _____________________________________________________________________________
Expand Down
12 changes: 8 additions & 4 deletions src/index/InputFileSpecification.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Copyright 2024, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Johannes Kalmbach(joka921) <[email protected]>
//
// Copyright 2024, University of Freiburg
// Chair of Algorithms and Data Structures
// Author: Johannes Kalmbach <[email protected]>

#pragma once

#include <optional>
Expand All @@ -27,6 +27,10 @@ struct InputFileSpecification {
// Turtle files with all prefixes at the beginning and no multiline literals.
bool parseInParallel_ = false;

// Remember if the value for parallel parsing was set explicitly (via the
// command line).
bool parseInParallelSetExplicitly_ = false;

bool operator==(const InputFileSpecification&) const = default;
};
} // namespace qlever
88 changes: 61 additions & 27 deletions test/IndexTest.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
// Copyright 2015, University of Freiburg,
// Copyright 2015 - 2024, University of Freiburg,
// Chair of Algorithms and Data Structures.
// Author: Björn Buchhold ([email protected])
// Authors: Björn Buchhold <[email protected]> [2015 - 2017]
// Johannes Kalmbach <[email protected]>
// Hannah Bast <[email protected]>

#include <gmock/gmock.h>
#include <gtest/gtest.h>
Expand Down Expand Up @@ -527,50 +529,82 @@ TEST(IndexTest, trivialGettersAndSetters) {

TEST(IndexTest, updateInputFileSpecificationsAndLog) {
using enum qlever::Filetype;
std::vector<qlever::InputFileSpecification> files{
{"singleFile.ttl", Turtle, std::nullopt, false}};
std::vector<qlever::InputFileSpecification> singleFileSpec = {
{"singleFile.ttl", Turtle, std::nullopt}};
std::vector<qlever::InputFileSpecification> twoFilesSpec = {
{"firstFile.ttl", Turtle, std::nullopt},
{"secondFile.ttl", Turtle, std::nullopt}};
using namespace ::testing;

// Parallel parsing not specified anywhere. For a single input stream, we then
// default to `true` for reasons of backwards compatibility, but this is
// deprecated. For multiple input streams, we default to `false` and this is
// normal behavior.
{
singleFileSpec.at(0).parseInParallelSetExplicitly_ = false;
testing::internal::CaptureStdout();
IndexImpl::updateInputFileSpecificationsAndLog(files, false);
IndexImpl::updateInputFileSpecificationsAndLog(singleFileSpec,
std::nullopt);
EXPECT_THAT(testing::internal::GetCapturedStdout(),
AllOf(HasSubstr("singleFile.ttl"), HasSubstr("deprecated")));
EXPECT_TRUE(singleFileSpec.at(0).parseInParallel_);
}
{
twoFilesSpec.at(0).parseInParallelSetExplicitly_ = false;
twoFilesSpec.at(1).parseInParallelSetExplicitly_ = false;
testing::internal::CaptureStdout();
IndexImpl::updateInputFileSpecificationsAndLog(twoFilesSpec, std::nullopt);
EXPECT_THAT(
testing::internal::GetCapturedStdout(),
AllOf(HasSubstr("from singleFile.ttl"), Not(HasSubstr("parallel"))));
EXPECT_FALSE(files.at(0).parseInParallel_);
AllOf(HasSubstr("from 2 input streams"), Not(HasSubstr("deprecated"))));
EXPECT_FALSE(twoFilesSpec.at(0).parseInParallel_);
EXPECT_FALSE(twoFilesSpec.at(1).parseInParallel_);
}

{
// Parallel parsing specified on the command line and not in the
// `settings.json`. This is normal behavior (no deprecation warning).
for (auto parallelParsing : {true, false}) {
singleFileSpec.at(0).parseInParallel_ = parallelParsing;
singleFileSpec.at(0).parseInParallelSetExplicitly_ = true;
testing::internal::CaptureStdout();
IndexImpl::updateInputFileSpecificationsAndLog(files, true);
EXPECT_THAT(testing::internal::GetCapturedStdout(),
AllOf(HasSubstr("from singleFile.ttl"),
HasSubstr("settings.json"), HasSubstr("deprecated")));
EXPECT_TRUE(files.at(0).parseInParallel_);
IndexImpl::updateInputFileSpecificationsAndLog(singleFileSpec,
std::nullopt);
EXPECT_THAT(
testing::internal::GetCapturedStdout(),
AllOf(HasSubstr("singleFile.ttl"), Not(HasSubstr("deprecated"))));
EXPECT_EQ(singleFileSpec.at(0).parseInParallel_, parallelParsing);
}
{
twoFilesSpec.at(0).parseInParallel_ = true;
twoFilesSpec.at(1).parseInParallel_ = false;
twoFilesSpec.at(0).parseInParallelSetExplicitly_ = true;
twoFilesSpec.at(1).parseInParallelSetExplicitly_ = true;
testing::internal::CaptureStdout();
IndexImpl::updateInputFileSpecificationsAndLog(files, std::nullopt);
EXPECT_THAT(testing::internal::GetCapturedStdout(),
AllOf(HasSubstr("from singleFile.ttl"),
HasSubstr("single input"), HasSubstr("deprecated")));
EXPECT_TRUE(files.at(0).parseInParallel_);
IndexImpl::updateInputFileSpecificationsAndLog(twoFilesSpec, std::nullopt);
EXPECT_THAT(
testing::internal::GetCapturedStdout(),
AllOf(HasSubstr("from 2 input streams"), Not(HasSubstr("deprecated"))));
EXPECT_TRUE(twoFilesSpec.at(0).parseInParallel_);
EXPECT_FALSE(twoFilesSpec.at(1).parseInParallel_);
}

// Parallel parsing not specified on the command line, but explicitly set in
// the `settings.json` file. This is deprecated for a single input
// stream and forbidden for multiple input streams.
{
files.emplace_back("secondFile.ttl", Turtle, std::nullopt, false);
auto filesCopy = files;
singleFileSpec.at(0).parseInParallelSetExplicitly_ = false;
testing::internal::CaptureStdout();
IndexImpl::updateInputFileSpecificationsAndLog(files, false);
IndexImpl::updateInputFileSpecificationsAndLog(singleFileSpec, true);
EXPECT_THAT(testing::internal::GetCapturedStdout(),
AllOf(HasSubstr("from 2 input streams"),
Not(HasSubstr("is deprecated"))));
EXPECT_EQ(files, filesCopy);
AllOf(HasSubstr("singleFile.ttl"), HasSubstr("deprecated")));
EXPECT_TRUE(singleFileSpec.at(0).parseInParallel_);
}

{
twoFilesSpec.at(0).parseInParallelSetExplicitly_ = false;
twoFilesSpec.at(1).parseInParallelSetExplicitly_ = false;
AD_EXPECT_THROW_WITH_MESSAGE(
IndexImpl::updateInputFileSpecificationsAndLog(files, true),
HasSubstr("but has to be specified"));
IndexImpl::updateInputFileSpecificationsAndLog(twoFilesSpec, true),
AllOf(Not(HasSubstr("from 2 input streams")), HasSubstr("forbidden")));
}
}

Expand Down
Loading