From 25a4757c70fcec1b6dfc8f163d36c45365850311 Mon Sep 17 00:00:00 2001 From: ferdymercury <ferdymercury@users.noreply.github.com> Date: Fri, 6 Dec 2024 20:31:53 +0100 Subject: [PATCH 1/7] [DF] Check column types in GetColumnReadersImpl() Fixes https://github.com/root-project/root/issues/10749 Commit by jblomer, just rebased here --- tree/dataframe/inc/ROOT/RNTupleDS.hxx | 2 ++ tree/dataframe/src/RNTupleDS.cxx | 39 ++++++++++++++++++++--- tree/dataframe/test/datasource_ntuple.cxx | 17 +++++++++- 3 files changed, 52 insertions(+), 6 deletions(-) diff --git a/tree/dataframe/inc/ROOT/RNTupleDS.hxx b/tree/dataframe/inc/ROOT/RNTupleDS.hxx index b121454c9450d..4c437a0a649cf 100644 --- a/tree/dataframe/inc/ROOT/RNTupleDS.hxx +++ b/tree/dataframe/inc/ROOT/RNTupleDS.hxx @@ -92,6 +92,8 @@ class RNTupleDS final : public ROOT::RDF::RDataSource { std::unordered_map<ROOT::Experimental::DescriptorId_t, std::string> fFieldId2QualifiedName; std::vector<std::string> fColumnNames; std::vector<std::string> fColumnTypes; + /// Applies TClassEdit::GetNormalizedName to fColumnTypes + std::vector<std::string> fNormalizedColumnTypes; /// List of column readers returned by GetColumnReaders() organized by slot. Used to reconnect readers /// to new page sources when the files in the chain change. std::vector<std::vector<Internal::RNTupleColumnReader *>> fActiveColumnReaders; diff --git a/tree/dataframe/src/RNTupleDS.cxx b/tree/dataframe/src/RNTupleDS.cxx index c18c3709e3dcc..dc57ed452c287 100644 --- a/tree/dataframe/src/RNTupleDS.cxx +++ b/tree/dataframe/src/RNTupleDS.cxx @@ -277,7 +277,11 @@ void RNTupleDS::AddField(const RNTupleDescriptor &desc, std::string_view colName auto cardinalityField = std::make_unique<ROOT::Experimental::Internal::RRDFCardinalityField>(); cardinalityField->SetOnDiskId(fieldId); fColumnNames.emplace_back("R_rdf_sizeof_" + std::string(colName)); - fColumnTypes.emplace_back(cardinalityField->GetTypeName()); + const auto typeName = cardinalityField->GetTypeName(); + fColumnTypes.emplace_back(typeName); + std::string normalized; + TClassEdit::GetNormalizedName(normalized, typeName); + fNormalizedColumnTypes.emplace_back(normalized); fProtoFields.emplace_back(std::move(cardinalityField)); for (const auto &f : desc.GetFieldIterable(fieldDesc.GetId())) { @@ -359,13 +363,21 @@ void RNTupleDS::AddField(const RNTupleDescriptor &desc, std::string_view colName if (cardinalityField) { fColumnNames.emplace_back("R_rdf_sizeof_" + std::string(colName)); - fColumnTypes.emplace_back(cardinalityField->GetTypeName()); + const auto typeName = cardinalityField->GetTypeName(); + fColumnTypes.emplace_back(typeName); + std::string normalized; + TClassEdit::GetNormalizedName(normalized, typeName); + fNormalizedColumnTypes.emplace_back(normalized); fProtoFields.emplace_back(std::move(cardinalityField)); } fieldInfos.emplace_back(fieldId, nRepetitions); fColumnNames.emplace_back(colName); - fColumnTypes.emplace_back(valueField->GetTypeName()); + const auto typeName = valueField->GetTypeName(); + fColumnTypes.emplace_back(typeName); + std::string normalized; + TClassEdit::GetNormalizedName(normalized, typeName); + fNormalizedColumnTypes.emplace_back(normalized); fProtoFields.emplace_back(std::move(valueField)); } @@ -431,13 +443,30 @@ RDF::RDataSource::Record_t RNTupleDS::GetColumnReadersImpl(std::string_view /* n } std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase> -RNTupleDS::GetColumnReaders(unsigned int slot, std::string_view name, const std::type_info & /*tid*/) +RNTupleDS::GetColumnReaders(unsigned int slot, std::string_view name, const std::type_info & tid) { // At this point we can assume that `name` will be found in fColumnNames - // TODO(jblomer): check incoming type const auto index = std::distance(fColumnNames.begin(), std::find(fColumnNames.begin(), fColumnNames.end(), name)); auto field = fProtoFields[index].get(); + std::string demangled = ROOT::Internal::RDF::DemangleTypeIdName(ti); + std::string normalized; + TClassEdit::GetNormalizedName(normalized, demangled.c_str()); + if (normalized != fNormalizedColumnTypes[index]) { + std::string err = "The type of column \""; + err += name; + err += "\" is "; + err += fColumnTypes[index]; + if (fColumnTypes[index] != fNormalizedColumnTypes[index]) + err += " (= " + fNormalizedColumnTypes[index] + ")"; + err += " but "; + err += demangled; + if (demangled != normalized) + err += " (= " + normalized + ")"; + err += " has been selected"; + throw std::runtime_error(err); + } + // Map the field's and subfields' IDs to qualified names so that we can later connect the fields to // other page sources from the chain fFieldId2QualifiedName[field->GetOnDiskId()] = fPrincipalDescriptor->GetQualifiedFieldName(field->GetOnDiskId()); diff --git a/tree/dataframe/test/datasource_ntuple.cxx b/tree/dataframe/test/datasource_ntuple.cxx index df2955a74ce80..742c2573820b0 100644 --- a/tree/dataframe/test/datasource_ntuple.cxx +++ b/tree/dataframe/test/datasource_ntuple.cxx @@ -51,6 +51,7 @@ class RNTupleDSTest : public ::testing::Test { void SetUp() override { auto model = RNTupleModel::Create(); + *model->MakeField<std::uint32_t>("nevent", 1); *model->MakeField<float>("pt") = 42; *model->MakeField<float>("energy") = 7; *model->MakeField<std::string>("tag") = "xyz"; @@ -81,8 +82,9 @@ TEST_F(RNTupleDSTest, ColTypeNames) RNTupleDS ds(fNtplName, fFileName); auto colNames = ds.GetColumnNames(); - ASSERT_EQ(15, colNames.size()); + ASSERT_EQ(16, colNames.size()); + EXPECT_TRUE(ds.HasColumn("nevent")); EXPECT_TRUE(ds.HasColumn("pt")); EXPECT_TRUE(ds.HasColumn("energy")); EXPECT_TRUE(ds.HasColumn("rvec")); @@ -132,6 +134,19 @@ TEST_F(RNTupleDSTest, CardinalityColumn) EXPECT_EQ(3, *max_rvec2); } +// TODO(jblomer): this test will change once collections are read as RVecs in RNTupleDS +TEST_F(RNTupleDSTest, ReadRVec) +{ + auto df = ROOT::Experimental::MakeNTupleDataFrame(fNtplName, fFileName); + // jets is currently exposed as std::vector<float> and thus not usable as ROOT::RVec<float> + EXPECT_THROW(df.Sum<ROOT::RVec<float>>("jets"), std::runtime_error); + // Allow use of float and Float_t interchangibly + EXPECT_DOUBLE_EQ(3.0, *df.Sum<std::vector<Float_t>>("jets")); + // Allow use of std int types and ROOT int types interchangibly + EXPECT_EQ(1U, df.Take<std::uint32_t>("nevent").GetValue()[0]); + EXPECT_EQ(1U, df.Take<UInt_t>("nevent").GetValue()[0]); +} + static void ReadTest(const std::string &name, const std::string &fname) { auto df = ROOT::RDF::Experimental::FromRNTuple(name, fname); From 41a28949df0e9c985ae396f01ef9155dd6a1d9b3 Mon Sep 17 00:00:00 2001 From: ferdymercury <ferdymercury@users.noreply.github.com> Date: Fri, 6 Dec 2024 22:56:04 +0100 Subject: [PATCH 2/7] [tree] fix typo varname --- tree/dataframe/src/RNTupleDS.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tree/dataframe/src/RNTupleDS.cxx b/tree/dataframe/src/RNTupleDS.cxx index dc57ed452c287..80ebbd00c013b 100644 --- a/tree/dataframe/src/RNTupleDS.cxx +++ b/tree/dataframe/src/RNTupleDS.cxx @@ -449,7 +449,7 @@ RNTupleDS::GetColumnReaders(unsigned int slot, std::string_view name, const std: const auto index = std::distance(fColumnNames.begin(), std::find(fColumnNames.begin(), fColumnNames.end(), name)); auto field = fProtoFields[index].get(); - std::string demangled = ROOT::Internal::RDF::DemangleTypeIdName(ti); + std::string demangled = ROOT::Internal::RDF::DemangleTypeIdName(tid); std::string normalized; TClassEdit::GetNormalizedName(normalized, demangled.c_str()); if (normalized != fNormalizedColumnTypes[index]) { From 1c0a5aac5b8f4b8114e6819d9f1940703eae4b83 Mon Sep 17 00:00:00 2001 From: ferdymercury <ferdymercury@users.noreply.github.com> Date: Fri, 6 Dec 2024 23:04:04 +0100 Subject: [PATCH 3/7] Adapt to new interface --- tree/dataframe/test/datasource_ntuple.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tree/dataframe/test/datasource_ntuple.cxx b/tree/dataframe/test/datasource_ntuple.cxx index 742c2573820b0..df12c043f4976 100644 --- a/tree/dataframe/test/datasource_ntuple.cxx +++ b/tree/dataframe/test/datasource_ntuple.cxx @@ -51,7 +51,7 @@ class RNTupleDSTest : public ::testing::Test { void SetUp() override { auto model = RNTupleModel::Create(); - *model->MakeField<std::uint32_t>("nevent", 1); + *model->MakeField<std::uint32_t>("nevent") = 1; *model->MakeField<float>("pt") = 42; *model->MakeField<float>("energy") = 7; *model->MakeField<std::string>("tag") = "xyz"; From 9057dd7cc016240d803079201579d829e0617066 Mon Sep 17 00:00:00 2001 From: ferdymercury <ferdymercury@users.noreply.github.com> Date: Sat, 7 Dec 2024 09:04:49 +0100 Subject: [PATCH 4/7] Fix namespace --- tree/dataframe/test/datasource_ntuple.cxx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tree/dataframe/test/datasource_ntuple.cxx b/tree/dataframe/test/datasource_ntuple.cxx index df12c043f4976..0a02970d98c78 100644 --- a/tree/dataframe/test/datasource_ntuple.cxx +++ b/tree/dataframe/test/datasource_ntuple.cxx @@ -137,7 +137,8 @@ TEST_F(RNTupleDSTest, CardinalityColumn) // TODO(jblomer): this test will change once collections are read as RVecs in RNTupleDS TEST_F(RNTupleDSTest, ReadRVec) { - auto df = ROOT::Experimental::MakeNTupleDataFrame(fNtplName, fFileName); + auto df = ROOT::RDF::Experimental::FromRNTuple(fNtplName, fFileName); + // jets is currently exposed as std::vector<float> and thus not usable as ROOT::RVec<float> EXPECT_THROW(df.Sum<ROOT::RVec<float>>("jets"), std::runtime_error); // Allow use of float and Float_t interchangibly From 44e42970ecce9974eb030812c9f46f1ea1aaad05 Mon Sep 17 00:00:00 2001 From: ferdymercury <ferdymercury@users.noreply.github.com> Date: Sat, 7 Dec 2024 10:09:53 +0100 Subject: [PATCH 5/7] [ntuple] fix column type of filter it was detected in CI with new checking code of normalized types --- tutorials/io/ntuple/ntpl011_global_temperatures.C | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tutorials/io/ntuple/ntpl011_global_temperatures.C b/tutorials/io/ntuple/ntpl011_global_temperatures.C index b884d8cfd2aa7..d35836fa0fa91 100644 --- a/tutorials/io/ntuple/ntpl011_global_temperatures.C +++ b/tutorials/io/ntuple/ntpl011_global_temperatures.C @@ -146,10 +146,10 @@ void Analyze() auto max_value = df.Max("AverageTemperature"); // Functions to filter by each season from date formatted "1944-12-01." - auto fnWinter = [](int month) { return month == 12 || month == 1 || month == 2; }; - auto fnSpring = [](int month) { return month == 3 || month == 4 || month == 5; }; - auto fnSummer = [](int month) { return month == 6 || month == 7 || month == 8; }; - auto fnFall = [](int month) { return month == 9 || month == 10 || month == 11; }; + auto fnWinter = [](std::uint32_t month) { return month == 12 || month == 1 || month == 2; }; + auto fnSpring = [](std::uint32_t month) { return month == 3 || month == 4 || month == 5; }; + auto fnSummer = [](std::uint32_t month) { return month == 6 || month == 7 || month == 8; }; + auto fnFall = [](std::uint32_t month) { return month == 9 || month == 10 || month == 11; }; // Create a RDataFrame per season. auto dfWinter = df.Filter(fnWinter, {"Month"}); From 956478cb8a025b7b57c0fab6d74a1bf07c0fce75 Mon Sep 17 00:00:00 2001 From: ferdymercury <ferdymercury@users.noreply.github.com> Date: Sat, 7 Dec 2024 10:50:31 +0100 Subject: [PATCH 6/7] fix year filter column type --- tutorials/io/ntuple/ntpl011_global_temperatures.C | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tutorials/io/ntuple/ntpl011_global_temperatures.C b/tutorials/io/ntuple/ntpl011_global_temperatures.C index d35836fa0fa91..1b2367df4f2c3 100644 --- a/tutorials/io/ntuple/ntpl011_global_temperatures.C +++ b/tutorials/io/ntuple/ntpl011_global_temperatures.C @@ -164,8 +164,8 @@ void Analyze() auto fallCount = dfFall.Count(); // Functions to filter for the time period between 2003-2013, and 1993-2002. - auto fn1993_to_2002 = [](int year) { return year >= 1993 && year <= 2002; }; - auto fn2003_to_2013 = [](int year) { return year >= 2003 && year <= 2013; }; + auto fn1993_to_2002 = [](std::uint32_t year) { return year >= 1993 && year <= 2002; }; + auto fn2003_to_2013 = [](std::uint32_t year) { return year >= 2003 && year <= 2013; }; // Create a RDataFrame for decades 1993_to_2002 & 2003_to_2013. auto df1993_to_2002 = df.Filter(fn1993_to_2002, {"Year"}); From 2d2066e272644197f2ff63f783ff07fa114f3f9e Mon Sep 17 00:00:00 2001 From: ferdymercury <ferdymercury@users.noreply.github.com> Date: Sun, 8 Dec 2024 12:41:03 +0100 Subject: [PATCH 7/7] [df] try to fix gtest not forwarding the exception --- tree/dataframe/test/datasource_ntuple.cxx | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tree/dataframe/test/datasource_ntuple.cxx b/tree/dataframe/test/datasource_ntuple.cxx index 0a02970d98c78..52672aed7e6e1 100644 --- a/tree/dataframe/test/datasource_ntuple.cxx +++ b/tree/dataframe/test/datasource_ntuple.cxx @@ -139,13 +139,14 @@ TEST_F(RNTupleDSTest, ReadRVec) { auto df = ROOT::RDF::Experimental::FromRNTuple(fNtplName, fFileName); - // jets is currently exposed as std::vector<float> and thus not usable as ROOT::RVec<float> - EXPECT_THROW(df.Sum<ROOT::RVec<float>>("jets"), std::runtime_error); // Allow use of float and Float_t interchangibly EXPECT_DOUBLE_EQ(3.0, *df.Sum<std::vector<Float_t>>("jets")); // Allow use of std int types and ROOT int types interchangibly EXPECT_EQ(1U, df.Take<std::uint32_t>("nevent").GetValue()[0]); EXPECT_EQ(1U, df.Take<UInt_t>("nevent").GetValue()[0]); + // jets is currently exposed as std::vector<float> and thus not usable as ROOT::RVec<float> + EXPECT_ANY_THROW(df.Sum<ROOT::RVec<float>>("jets")); + // EXPECT_THROW(df.Sum<ROOT::RVec<float>>("jets"), std::runtime_error); // This does not work directly, maybe due to jitting ? } static void ReadTest(const std::string &name, const std::string &fname)