From 25a4757c70fcec1b6dfc8f163d36c45365850311 Mon Sep 17 00:00:00 2001
From: ferdymercury <ferdymercury@users.noreply.github.com>
Date: Fri, 6 Dec 2024 20:31:53 +0100
Subject: [PATCH 1/7] [DF] Check column types in GetColumnReadersImpl()

Fixes https://github.com/root-project/root/issues/10749

Commit by jblomer, just rebased here
---
 tree/dataframe/inc/ROOT/RNTupleDS.hxx     |  2 ++
 tree/dataframe/src/RNTupleDS.cxx          | 39 ++++++++++++++++++++---
 tree/dataframe/test/datasource_ntuple.cxx | 17 +++++++++-
 3 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/tree/dataframe/inc/ROOT/RNTupleDS.hxx b/tree/dataframe/inc/ROOT/RNTupleDS.hxx
index b121454c9450d..4c437a0a649cf 100644
--- a/tree/dataframe/inc/ROOT/RNTupleDS.hxx
+++ b/tree/dataframe/inc/ROOT/RNTupleDS.hxx
@@ -92,6 +92,8 @@ class RNTupleDS final : public ROOT::RDF::RDataSource {
    std::unordered_map<ROOT::Experimental::DescriptorId_t, std::string> fFieldId2QualifiedName;
    std::vector<std::string> fColumnNames;
    std::vector<std::string> fColumnTypes;
+   /// Applies TClassEdit::GetNormalizedName to fColumnTypes
+   std::vector<std::string> fNormalizedColumnTypes;
    /// List of column readers returned by GetColumnReaders() organized by slot. Used to reconnect readers
    /// to new page sources when the files in the chain change.
    std::vector<std::vector<Internal::RNTupleColumnReader *>> fActiveColumnReaders;
diff --git a/tree/dataframe/src/RNTupleDS.cxx b/tree/dataframe/src/RNTupleDS.cxx
index c18c3709e3dcc..dc57ed452c287 100644
--- a/tree/dataframe/src/RNTupleDS.cxx
+++ b/tree/dataframe/src/RNTupleDS.cxx
@@ -277,7 +277,11 @@ void RNTupleDS::AddField(const RNTupleDescriptor &desc, std::string_view colName
          auto cardinalityField = std::make_unique<ROOT::Experimental::Internal::RRDFCardinalityField>();
          cardinalityField->SetOnDiskId(fieldId);
          fColumnNames.emplace_back("R_rdf_sizeof_" + std::string(colName));
-         fColumnTypes.emplace_back(cardinalityField->GetTypeName());
+         const auto typeName = cardinalityField->GetTypeName();
+         fColumnTypes.emplace_back(typeName);
+         std::string normalized;
+         TClassEdit::GetNormalizedName(normalized, typeName);
+         fNormalizedColumnTypes.emplace_back(normalized);
          fProtoFields.emplace_back(std::move(cardinalityField));
 
          for (const auto &f : desc.GetFieldIterable(fieldDesc.GetId())) {
@@ -359,13 +363,21 @@ void RNTupleDS::AddField(const RNTupleDescriptor &desc, std::string_view colName
 
    if (cardinalityField) {
       fColumnNames.emplace_back("R_rdf_sizeof_" + std::string(colName));
-      fColumnTypes.emplace_back(cardinalityField->GetTypeName());
+      const auto typeName = cardinalityField->GetTypeName();
+      fColumnTypes.emplace_back(typeName);
+      std::string normalized;
+      TClassEdit::GetNormalizedName(normalized, typeName);
+      fNormalizedColumnTypes.emplace_back(normalized);
       fProtoFields.emplace_back(std::move(cardinalityField));
    }
 
    fieldInfos.emplace_back(fieldId, nRepetitions);
    fColumnNames.emplace_back(colName);
-   fColumnTypes.emplace_back(valueField->GetTypeName());
+   const auto typeName = valueField->GetTypeName();
+   fColumnTypes.emplace_back(typeName);
+   std::string normalized;
+   TClassEdit::GetNormalizedName(normalized, typeName);
+   fNormalizedColumnTypes.emplace_back(normalized);
    fProtoFields.emplace_back(std::move(valueField));
 }
 
@@ -431,13 +443,30 @@ RDF::RDataSource::Record_t RNTupleDS::GetColumnReadersImpl(std::string_view /* n
 }
 
 std::unique_ptr<ROOT::Detail::RDF::RColumnReaderBase>
-RNTupleDS::GetColumnReaders(unsigned int slot, std::string_view name, const std::type_info & /*tid*/)
+RNTupleDS::GetColumnReaders(unsigned int slot, std::string_view name, const std::type_info & tid)
 {
    // At this point we can assume that `name` will be found in fColumnNames
-   // TODO(jblomer): check incoming type
    const auto index = std::distance(fColumnNames.begin(), std::find(fColumnNames.begin(), fColumnNames.end(), name));
    auto field = fProtoFields[index].get();
 
+   std::string demangled = ROOT::Internal::RDF::DemangleTypeIdName(ti);
+   std::string normalized;
+   TClassEdit::GetNormalizedName(normalized, demangled.c_str());
+   if (normalized != fNormalizedColumnTypes[index]) {
+      std::string err = "The type of column \"";
+      err += name;
+      err += "\" is ";
+      err += fColumnTypes[index];
+      if (fColumnTypes[index] != fNormalizedColumnTypes[index])
+         err += " (= " + fNormalizedColumnTypes[index] + ")";
+      err += " but ";
+      err += demangled;
+      if (demangled != normalized)
+         err += " (= " + normalized + ")";
+      err += " has been selected";
+      throw std::runtime_error(err);
+   }
+
    // Map the field's and subfields' IDs to qualified names so that we can later connect the fields to
    // other page sources from the chain
    fFieldId2QualifiedName[field->GetOnDiskId()] = fPrincipalDescriptor->GetQualifiedFieldName(field->GetOnDiskId());
diff --git a/tree/dataframe/test/datasource_ntuple.cxx b/tree/dataframe/test/datasource_ntuple.cxx
index df2955a74ce80..742c2573820b0 100644
--- a/tree/dataframe/test/datasource_ntuple.cxx
+++ b/tree/dataframe/test/datasource_ntuple.cxx
@@ -51,6 +51,7 @@ class RNTupleDSTest : public ::testing::Test {
 
    void SetUp() override {
       auto model = RNTupleModel::Create();
+      *model->MakeField<std::uint32_t>("nevent", 1);
       *model->MakeField<float>("pt") = 42;
       *model->MakeField<float>("energy") = 7;
       *model->MakeField<std::string>("tag") = "xyz";
@@ -81,8 +82,9 @@ TEST_F(RNTupleDSTest, ColTypeNames)
    RNTupleDS ds(fNtplName, fFileName);
 
    auto colNames = ds.GetColumnNames();
-   ASSERT_EQ(15, colNames.size());
+   ASSERT_EQ(16, colNames.size());
 
+   EXPECT_TRUE(ds.HasColumn("nevent"));
    EXPECT_TRUE(ds.HasColumn("pt"));
    EXPECT_TRUE(ds.HasColumn("energy"));
    EXPECT_TRUE(ds.HasColumn("rvec"));
@@ -132,6 +134,19 @@ TEST_F(RNTupleDSTest, CardinalityColumn)
    EXPECT_EQ(3, *max_rvec2);
 }
 
+// TODO(jblomer): this test will change once collections are read as RVecs in RNTupleDS
+TEST_F(RNTupleDSTest, ReadRVec)
+{
+   auto df = ROOT::Experimental::MakeNTupleDataFrame(fNtplName, fFileName);
+   // jets is currently exposed as std::vector<float> and thus not usable as ROOT::RVec<float>
+   EXPECT_THROW(df.Sum<ROOT::RVec<float>>("jets"), std::runtime_error);
+   // Allow use of float and Float_t interchangibly
+   EXPECT_DOUBLE_EQ(3.0, *df.Sum<std::vector<Float_t>>("jets"));
+   // Allow use of std int types and ROOT int types interchangibly
+   EXPECT_EQ(1U, df.Take<std::uint32_t>("nevent").GetValue()[0]);
+   EXPECT_EQ(1U, df.Take<UInt_t>("nevent").GetValue()[0]);
+}
+
 static void ReadTest(const std::string &name, const std::string &fname)
 {
    auto df = ROOT::RDF::Experimental::FromRNTuple(name, fname);

From 41a28949df0e9c985ae396f01ef9155dd6a1d9b3 Mon Sep 17 00:00:00 2001
From: ferdymercury <ferdymercury@users.noreply.github.com>
Date: Fri, 6 Dec 2024 22:56:04 +0100
Subject: [PATCH 2/7] [tree] fix typo varname

---
 tree/dataframe/src/RNTupleDS.cxx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tree/dataframe/src/RNTupleDS.cxx b/tree/dataframe/src/RNTupleDS.cxx
index dc57ed452c287..80ebbd00c013b 100644
--- a/tree/dataframe/src/RNTupleDS.cxx
+++ b/tree/dataframe/src/RNTupleDS.cxx
@@ -449,7 +449,7 @@ RNTupleDS::GetColumnReaders(unsigned int slot, std::string_view name, const std:
    const auto index = std::distance(fColumnNames.begin(), std::find(fColumnNames.begin(), fColumnNames.end(), name));
    auto field = fProtoFields[index].get();
 
-   std::string demangled = ROOT::Internal::RDF::DemangleTypeIdName(ti);
+   std::string demangled = ROOT::Internal::RDF::DemangleTypeIdName(tid);
    std::string normalized;
    TClassEdit::GetNormalizedName(normalized, demangled.c_str());
    if (normalized != fNormalizedColumnTypes[index]) {

From 1c0a5aac5b8f4b8114e6819d9f1940703eae4b83 Mon Sep 17 00:00:00 2001
From: ferdymercury <ferdymercury@users.noreply.github.com>
Date: Fri, 6 Dec 2024 23:04:04 +0100
Subject: [PATCH 3/7] Adapt to new interface

---
 tree/dataframe/test/datasource_ntuple.cxx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tree/dataframe/test/datasource_ntuple.cxx b/tree/dataframe/test/datasource_ntuple.cxx
index 742c2573820b0..df12c043f4976 100644
--- a/tree/dataframe/test/datasource_ntuple.cxx
+++ b/tree/dataframe/test/datasource_ntuple.cxx
@@ -51,7 +51,7 @@ class RNTupleDSTest : public ::testing::Test {
 
    void SetUp() override {
       auto model = RNTupleModel::Create();
-      *model->MakeField<std::uint32_t>("nevent", 1);
+      *model->MakeField<std::uint32_t>("nevent") = 1;
       *model->MakeField<float>("pt") = 42;
       *model->MakeField<float>("energy") = 7;
       *model->MakeField<std::string>("tag") = "xyz";

From 9057dd7cc016240d803079201579d829e0617066 Mon Sep 17 00:00:00 2001
From: ferdymercury <ferdymercury@users.noreply.github.com>
Date: Sat, 7 Dec 2024 09:04:49 +0100
Subject: [PATCH 4/7] Fix namespace

---
 tree/dataframe/test/datasource_ntuple.cxx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tree/dataframe/test/datasource_ntuple.cxx b/tree/dataframe/test/datasource_ntuple.cxx
index df12c043f4976..0a02970d98c78 100644
--- a/tree/dataframe/test/datasource_ntuple.cxx
+++ b/tree/dataframe/test/datasource_ntuple.cxx
@@ -137,7 +137,8 @@ TEST_F(RNTupleDSTest, CardinalityColumn)
 // TODO(jblomer): this test will change once collections are read as RVecs in RNTupleDS
 TEST_F(RNTupleDSTest, ReadRVec)
 {
-   auto df = ROOT::Experimental::MakeNTupleDataFrame(fNtplName, fFileName);
+   auto df = ROOT::RDF::Experimental::FromRNTuple(fNtplName, fFileName);
+
    // jets is currently exposed as std::vector<float> and thus not usable as ROOT::RVec<float>
    EXPECT_THROW(df.Sum<ROOT::RVec<float>>("jets"), std::runtime_error);
    // Allow use of float and Float_t interchangibly

From 44e42970ecce9974eb030812c9f46f1ea1aaad05 Mon Sep 17 00:00:00 2001
From: ferdymercury <ferdymercury@users.noreply.github.com>
Date: Sat, 7 Dec 2024 10:09:53 +0100
Subject: [PATCH 5/7] [ntuple] fix column type of filter

it was detected in CI with new checking code of normalized types
---
 tutorials/io/ntuple/ntpl011_global_temperatures.C | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tutorials/io/ntuple/ntpl011_global_temperatures.C b/tutorials/io/ntuple/ntpl011_global_temperatures.C
index b884d8cfd2aa7..d35836fa0fa91 100644
--- a/tutorials/io/ntuple/ntpl011_global_temperatures.C
+++ b/tutorials/io/ntuple/ntpl011_global_temperatures.C
@@ -146,10 +146,10 @@ void Analyze()
    auto max_value = df.Max("AverageTemperature");
 
    // Functions to filter by each season from date formatted "1944-12-01."
-   auto fnWinter = [](int month) { return month == 12 || month == 1 || month == 2; };
-   auto fnSpring = [](int month) { return month == 3 || month == 4 || month == 5; };
-   auto fnSummer = [](int month) { return month == 6 || month == 7 || month == 8; };
-   auto fnFall = [](int month) { return month == 9 || month == 10 || month == 11; };
+   auto fnWinter = [](std::uint32_t month) { return month == 12 || month == 1 || month == 2; };
+   auto fnSpring = [](std::uint32_t month) { return month == 3 || month == 4 || month == 5; };
+   auto fnSummer = [](std::uint32_t month) { return month == 6 || month == 7 || month == 8; };
+   auto fnFall = [](std::uint32_t month) { return month == 9 || month == 10 || month == 11; };
 
    // Create a RDataFrame per season.
    auto dfWinter = df.Filter(fnWinter, {"Month"});

From 956478cb8a025b7b57c0fab6d74a1bf07c0fce75 Mon Sep 17 00:00:00 2001
From: ferdymercury <ferdymercury@users.noreply.github.com>
Date: Sat, 7 Dec 2024 10:50:31 +0100
Subject: [PATCH 6/7] fix year filter column type

---
 tutorials/io/ntuple/ntpl011_global_temperatures.C | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tutorials/io/ntuple/ntpl011_global_temperatures.C b/tutorials/io/ntuple/ntpl011_global_temperatures.C
index d35836fa0fa91..1b2367df4f2c3 100644
--- a/tutorials/io/ntuple/ntpl011_global_temperatures.C
+++ b/tutorials/io/ntuple/ntpl011_global_temperatures.C
@@ -164,8 +164,8 @@ void Analyze()
    auto fallCount = dfFall.Count();
 
    // Functions to filter for the time period between 2003-2013, and 1993-2002.
-   auto fn1993_to_2002 = [](int year) { return year >= 1993 && year <= 2002; };
-   auto fn2003_to_2013 = [](int year) { return year >= 2003 && year <= 2013; };
+   auto fn1993_to_2002 = [](std::uint32_t year) { return year >= 1993 && year <= 2002; };
+   auto fn2003_to_2013 = [](std::uint32_t year) { return year >= 2003 && year <= 2013; };
 
    // Create a RDataFrame for decades 1993_to_2002 & 2003_to_2013.
    auto df1993_to_2002 = df.Filter(fn1993_to_2002, {"Year"});

From 2d2066e272644197f2ff63f783ff07fa114f3f9e Mon Sep 17 00:00:00 2001
From: ferdymercury <ferdymercury@users.noreply.github.com>
Date: Sun, 8 Dec 2024 12:41:03 +0100
Subject: [PATCH 7/7] [df] try to fix gtest not forwarding the exception

---
 tree/dataframe/test/datasource_ntuple.cxx | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tree/dataframe/test/datasource_ntuple.cxx b/tree/dataframe/test/datasource_ntuple.cxx
index 0a02970d98c78..52672aed7e6e1 100644
--- a/tree/dataframe/test/datasource_ntuple.cxx
+++ b/tree/dataframe/test/datasource_ntuple.cxx
@@ -139,13 +139,14 @@ TEST_F(RNTupleDSTest, ReadRVec)
 {
    auto df = ROOT::RDF::Experimental::FromRNTuple(fNtplName, fFileName);
 
-   // jets is currently exposed as std::vector<float> and thus not usable as ROOT::RVec<float>
-   EXPECT_THROW(df.Sum<ROOT::RVec<float>>("jets"), std::runtime_error);
    // Allow use of float and Float_t interchangibly
    EXPECT_DOUBLE_EQ(3.0, *df.Sum<std::vector<Float_t>>("jets"));
    // Allow use of std int types and ROOT int types interchangibly
    EXPECT_EQ(1U, df.Take<std::uint32_t>("nevent").GetValue()[0]);
    EXPECT_EQ(1U, df.Take<UInt_t>("nevent").GetValue()[0]);
+   // jets is currently exposed as std::vector<float> and thus not usable as ROOT::RVec<float>
+   EXPECT_ANY_THROW(df.Sum<ROOT::RVec<float>>("jets"));
+   // EXPECT_THROW(df.Sum<ROOT::RVec<float>>("jets"), std::runtime_error); // This does not work directly, maybe due to jitting ?
 }
 
 static void ReadTest(const std::string &name, const std::string &fname)