Skip to content

Commit 7dce919

Browse files
committed
[tmva][sofie] Change UnidirectionalBroadcast to take output as raw array
This commit refactors the `UnidirectionalBroadcast` implementation to take the output buffer as a raw C-style array. This makes it easier to use in code generation, as the shape doesn't need to be recomputed to be hardcoded in the construction of the output span, nor are we forced to use a `std::vector` that already encodes the size information. The `std::vector` argument is the real motivation for this change, as it's better to avoid taking vectors as output buffers for intermediate tensors. Forcing the use of `std::vector` prevents some memory optimizations where we take offsetted pointers into a larger memory buffer, and also makes it more difficult to emit code that is differentiable by Clad.
1 parent 4b82f3b commit 7dce919

File tree

7 files changed

+35
-40
lines changed

7 files changed

+35
-40
lines changed

tmva/sofie/inc/TMVA/ROperator_BasicBinary.hxx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ public:
151151
fNBroadcastedA = "Broadcasted" + fNA + "to" + fNY;
152152
auto data = model.GetInitializedTensorData(fNA);
153153
std::shared_ptr<void> broadcastedData(
154-
UTILITY::UnidirectionalBroadcast<T>(static_cast<T *>(data.get()), fShapeA, fShapeY),
154+
UTILITY::UnidirectionalBroadcast(static_cast<T *>(data.get()), fShapeA, fShapeY),
155155
std::default_delete<T[]>());
156156
if (model.Verbose())
157157
std::cout << "broadcasted data A " << ConvertShapeToString(fShapeY) << " : "
@@ -172,7 +172,7 @@ public:
172172
<< ConvertValuesToString(ConvertShapeToLength(fShapeB), static_cast<T *>(data.get()))
173173
<< std::endl;
174174
std::shared_ptr<void> broadcastedData(
175-
UTILITY::UnidirectionalBroadcast<T>(static_cast<T *>(data.get()), fShapeB, fShapeY),
175+
UTILITY::UnidirectionalBroadcast(static_cast<T *>(data.get()), fShapeB, fShapeY),
176176
std::default_delete<T[]>());
177177
// do not update tensor B but add broadcasted one (since it can be input to some other operators)
178178
if (model.Verbose())

tmva/sofie/inc/TMVA/ROperator_Comparision.hxx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ public:
143143
data1 = static_cast<T *>(model.GetInitializedTensorData(fNX1).get());
144144
if (broadcastX1) {
145145
broadcastedData1 = std::unique_ptr<T>(
146-
UTILITY::UnidirectionalBroadcast<T>(data1, fShapeX1, fShapeY));
146+
UTILITY::UnidirectionalBroadcast(data1, fShapeX1, fShapeY));
147147
data1 = broadcastedData1.get();
148148
}
149149

@@ -154,7 +154,7 @@ public:
154154
data2 = static_cast<T *>(model.GetInitializedTensorData(fNX2).get());
155155
if (broadcastX2) {
156156
broadcastedData2 = std::unique_ptr<T>(
157-
UTILITY::UnidirectionalBroadcast<T>(data2, fShapeX2, fShapeY));
157+
UTILITY::UnidirectionalBroadcast(data2, fShapeX2, fShapeY));
158158
data2 = broadcastedData2.get();
159159
}
160160
} else if (model.IsShapeTensor(fNX2)) {

tmva/sofie/inc/TMVA/ROperator_Conv.hxx

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ public:
289289
shape[0] = fShapeB[0];
290290
auto intTargetShape = ConvertShapeToInt(targetShape);
291291
std::shared_ptr<void> new_data_ptr(
292-
UTILITY::UnidirectionalBroadcast<float>(static_cast<float *>(original_data.get()), shape, intTargetShape),
292+
UTILITY::UnidirectionalBroadcast(static_cast<float *>(original_data.get()), shape, intTargetShape),
293293
std::default_delete<float[]>());
294294
model.UpdateInitializedTensor(fNB, model.GetTensorType(fNB), intTargetShape, new_data_ptr);
295295
fShapeB = model.GetTensorShape(fNB);
@@ -347,7 +347,7 @@ public:
347347
out << SP << "if (" << length << " > " << ConvertShapeToLength(shape) << ") {\n";
348348
else
349349
out << SP << "{\n";
350-
out << SP << SP << "float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<float>(tensor_"
350+
out << SP << SP << "float * data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_"
351351
<< fNB << ", " << ConvertShapeToString(shape) << ", " << ConvertShapeToString(fShapeY) << ");\n";
352352
out << SP << SP << "fTensor_" << fNB << ".resize(" << length << ");\n";
353353
out << SP << SP << "std::copy(data, data + " << length << ", fTensor_" << fNB << ".begin());\n";

tmva/sofie/inc/TMVA/ROperator_Expand.hxx

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ public:
9494
auto data = model.GetInitializedTensorData(fNX);
9595
if (fInitBroadcast) {
9696
std::shared_ptr<void> broadcastedData(
97-
UTILITY::UnidirectionalBroadcast<T>(static_cast<T *>(data.get()), shapeX, shapeY),
97+
UTILITY::UnidirectionalBroadcast(static_cast<T *>(data.get()), shapeX, shapeY),
9898
std::default_delete<T[]>());
9999
// Update the data and the shape of X
100100
model.UpdateInitializedTensor(fNX, model.GetTensorType(fNX), shapeY, broadcastedData);
@@ -153,8 +153,8 @@ public:
153153
// No need to broadcast A if it's an initialized tensor or shapes are the same
154154
if (!fInitialized && fShapeX != fShapeY) {
155155
out << SP << "// Broadcasting uninitialized tensor " << fNX << "\n";
156-
out << SP << "TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<" << fType << ">(tensor_" << fNX << ", " << ConvertShapeToString(fShapeX) << ", " << ConvertShapeToString(fShapeY)
157-
<< ", std::span<"<<fType<<">(tensor_"<<fNY<<", "<<ConvertDimShapeToLength(fShapeY)<<"));\n";
156+
out << SP << "TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" << fNX << ", " << ConvertShapeToString(fShapeX) << ", " << ConvertShapeToString(fShapeY)
157+
<< ", tensor_"<<fNY<<");\n";
158158
}
159159
return out.str();
160160
}

tmva/sofie/inc/TMVA/ROperator_LayerNormalization.hxx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ public:
177177
if (!fNBroadcastedB.empty()) {
178178
out << SP << "// Broadcasting the bias of LayerNormalization op\n";
179179
out << SP << "{\n";
180-
out << SP << SP << "float* data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<float>(tensor_";
180+
out << SP << SP << "float* data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_";
181181
out << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeX) << ");\n";
182182
out << SP << "std::copy(data, data + " << fLength << ", tensor_" << fNBroadcastedB << ");\n";
183183
out << SP << "delete[] data;\n";

tmva/sofie/inc/TMVA/ROperator_Where.hxx

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ public:
105105
if (model.IsInitializedTensor(fNA)) {
106106
auto data = model.GetInitializedTensorData(fNA);
107107
std::shared_ptr<void> broadcastedData(
108-
UTILITY::UnidirectionalBroadcast<T>(static_cast<T *>(data.get()), fShapeA, fShapeY),
108+
UTILITY::UnidirectionalBroadcast(static_cast<T *>(data.get()), fShapeA, fShapeY),
109109
std::default_delete<T[]>());
110110
// Update the data and the shape of A
111111
model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData);
@@ -121,7 +121,7 @@ public:
121121
if (model.IsInitializedTensor(fNB)) {
122122
auto data = model.GetInitializedTensorData(fNB);
123123
std::shared_ptr<void> broadcastedData(
124-
UTILITY::UnidirectionalBroadcast<T>(static_cast<T *>(data.get()), fShapeB, fShapeY),
124+
UTILITY::UnidirectionalBroadcast(static_cast<T *>(data.get()), fShapeB, fShapeY),
125125
std::default_delete<T[]>());
126126
// do not update tensor B but add broadcasted one (since it can be input to some other operators)
127127
model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData);
@@ -137,7 +137,7 @@ public:
137137
if (model.IsInitializedTensor(fNC)) {
138138
auto data = model.GetInitializedTensorData(fNC);
139139
std::shared_ptr<void> broadcastedData(
140-
UTILITY::UnidirectionalBroadcast<T>(static_cast<T *>(data.get()), fShapeC, fShapeY),
140+
UTILITY::UnidirectionalBroadcast(static_cast<T *>(data.get()), fShapeC, fShapeY),
141141
std::default_delete<T[]>());
142142
// do not update tensor C but add broadcasted one (since it can be input to some other operators)
143143
model.AddConstantTensor(fNBroadcastedC, model.GetTensorType(fNC), fShapeY, broadcastedData);
@@ -256,34 +256,34 @@ public:
256256
if (fShapeA != fShapeY) {
257257
out << SP << "// Broadcasting uninitialized tensor " << fNA << "\n";
258258
//out << SP << "{\n";
259-
out << SP << "TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNA << ", " << ConvertShapeToString(fShapeA) << ", " << ConvertShapeToString(fShapeY)
260-
<< ", fTensor_" << fNBroadcastedA << ");\n";
259+
out << SP << "TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" << fNA << ", " << ConvertShapeToString(fShapeA) << ", " << ConvertShapeToString(fShapeY)
260+
<< ", tensor_" << fNBroadcastedA << ");\n";
261261
}
262262
// Broadcast B if it's uninitialized
263263
if (fShapeB != fShapeY) {
264264
out << SP << "// Broadcasting uninitialized tensor " << fNB << "\n";
265265
//out << SP << "{\n";
266-
out << SP << "TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeY)
267-
<< ", fTensor_" << fNBroadcastedB << ");\n";
266+
out << SP << "TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeY)
267+
<< ", tensor_" << fNBroadcastedB << ");\n";
268268
}
269269
// Broadcast C if it's uninitialized
270270
if (fShapeC != fShapeY) {
271271
// special case if C is an input tensor
272272
if (fIsInputBoolTensor) {
273273
size_t inputLength = ConvertShapeToLength(fShapeC);
274-
out << SP << "std::vector<std::uint8_t> fTensor_" << fNC << "(tensor_" << fNC << ", tensor_" << fNC << " + " << inputLength << ");\n";
274+
out << SP << "std::vector<std::uint8_t> tmp_tensor_" << fNC << "(tensor_" << fNC << ", tensor_" << fNC << " + " << inputLength << ");\n";
275275
}
276276
out << SP << "// Broadcasting uninitialized tensor " << fNC << "\n";
277277
//out << SP << "{\n";
278-
out << SP << "TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast<std::uint8_t>(fTensor_" << fNC << ".data(), " << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY)
279-
<< ", fTensor_" << fNBroadcastedC << ");\n";
278+
out << SP << "TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tmp_tensor_" << fNC << ".data(), " << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY)
279+
<< ", tensor_" << fNBroadcastedC << ");\n";
280280
}
281281
std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA;
282282
std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB;
283283
std::string nameC = fNBroadcastedC.empty()? fNC : fNBroadcastedC;
284284
out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n";
285285
// get output tensor applying condition
286-
out << SP << SP << "tensor_" << fNY << "[id] = " << "(fTensor_" << nameC << "[id]) ? tensor_"
286+
out << SP << SP << "tensor_" << fNY << "[id] = " << "tensor_" << nameC << "[id] ? tensor_"
287287
<< nameA << "[id] : tensor_" + nameB + "[id];\n";
288288
out << SP << "}\n";
289289
return out.str();

tmva/sofie/inc/TMVA/SOFIE_common.hxx

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -416,14 +416,12 @@ T* BroadcastConvBias(const T* data, const size_t channel, const std::vector<size
416416
// Broadcast a tensor from shape to targetShape according to numpy broadcasting rules
417417
// See more at https://numpy.org/doc/stable/user/basics.broadcasting.html
418418
// and https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md .
419-
template<typename T, class ConstContT = std::span<const T>, class ContT = std::span<T> >
420-
void BroadcastTensor(ConstContT data, const std::vector<size_t>& shape, const std::vector<size_t>& targetShape, ContT broadcastedData) {
419+
template<typename T, class ConstContT = std::span<const T>>
420+
void BroadcastTensor(ConstContT data, const std::vector<size_t>& shape, const std::vector<size_t>& targetShape, T *broadcastedData) {
421421
// Size of the shapes (tensor input here have shapes with same sizes, we have already added the needed ones )
422422
size_t size = shape.size();
423423
// Current length of the broadcasted tensor
424424
size_t curLength = data.size();
425-
size_t targetLength = broadcastedData.size();
426-
assert(ConvertShapeToLength(targetShape) == targetLength);
427425
// special case when broadcasting last dimensions (initial shapes must be the same)
428426
if (size > 1 && shape.front() == targetShape.front() && shape.back() == 1) {
429427
size_t bsize = targetShape.back();
@@ -433,16 +431,16 @@ void BroadcastTensor(ConstContT data, const std::vector<size_t>& shape, const st
433431
bsize *= targetShape[k];
434432
}
435433
for (size_t i = 0; i < curLength; i++) {
436-
std::fill(broadcastedData.begin() + i*bsize, broadcastedData.begin() + (i+1)*bsize , data[i]);
434+
std::fill(broadcastedData + i*bsize, broadcastedData + (i+1)*bsize , data[i]);
437435
}
438436
return;
439437
}
440438

441-
std::copy(data.begin(), data.end(), broadcastedData.begin());
439+
std::copy(data.begin(), data.end(), broadcastedData);
442440
// Product of the previous dimensions of targetShape
443441
size_t arrayNum = 1;
444442
// New broadcasted data: is this needed?
445-
std::vector<T> newData(targetLength);
443+
std::vector<T> newData(ConvertShapeToLength(targetShape));
446444

447445
for (size_t idx = 0; idx < size; idx++) {
448446
size_t dim = shape[idx];
@@ -458,8 +456,8 @@ void BroadcastTensor(ConstContT data, const std::vector<size_t>& shape, const st
458456
for (size_t arrayIdx = 0; arrayIdx < arrayNum; arrayIdx++) {
459457
for (size_t targetIdx = 0; targetIdx < targetDim; targetIdx++) {
460458
size_t offset = arrayIdx * arrayLength * targetDim + targetIdx * arrayLength;
461-
std::copy(broadcastedData.begin() + arrayIdx * arrayLength,
462-
broadcastedData.begin() + (arrayIdx + 1) * arrayLength,
459+
std::copy(broadcastedData + arrayIdx * arrayLength,
460+
broadcastedData + (arrayIdx + 1) * arrayLength,
463461
newData.begin() + offset);
464462
}
465463
}
@@ -473,23 +471,20 @@ void BroadcastTensor(ConstContT data, const std::vector<size_t>& shape, const st
473471
// Update current length
474472
curLength = newLength;
475473
// Update broadcasted data
476-
std::copy(newData.begin(), newData.begin() + newLength, broadcastedData.begin());
474+
std::copy(newData.begin(), newData.begin() + newLength, broadcastedData);
477475
}
478476
// Update the number of arrays
479477
arrayNum *= targetDim;
480478
}
481-
//return broadcastedData;
482479
}
483480

484481
// interface where we allocate a new array for broadcasted data
485482
template<typename T>
486483
T* CreateBroadcastTensor(const T* data, const std::vector<size_t>& shape, const std::vector<size_t>& targetShape, size_t targetLength) {
487484
// newShape is an array of size equal to dimension along which we are broadcasting the tensor
488485
T* broadcastedData = new T[targetLength];
489-
std::span<T> bData(broadcastedData, broadcastedData+targetLength);
490486
size_t curLength = ConvertShapeToLength(shape);
491-
std::span<const T> inData(data, curLength);
492-
BroadcastTensor<T, std::span<const T>, std::span<T>>(inData, shape, targetShape, bData);
487+
BroadcastTensor<T>({data, curLength}, shape, targetShape, broadcastedData);
493488
return broadcastedData;
494489
}
495490
// Unidirectional broadcasting of shape to targetShape.
// In unidirectional broadcast only tensor B can have its shape changed, not tensor A
@@ -502,14 +497,14 @@ T* UnidirectionalBroadcast(const T* data, const std::vector<size_t>& shape, cons
502497
std::vector<size_t> newShape(targetSize, 1);
503498
size_t offset = targetSize - shape.size();
504499
std::copy(shape.begin(), shape.end(), newShape.begin() + offset);
505-
return CreateBroadcastTensor<T>(data, newShape, targetShape, ConvertShapeToLength(targetShape));
500+
return CreateBroadcastTensor(data, newShape, targetShape, ConvertShapeToLength(targetShape));
506501
}
507-
return CreateBroadcastTensor<T>(data, shape, targetShape, ConvertShapeToLength(targetShape));
502+
return CreateBroadcastTensor(data, shape, targetShape, ConvertShapeToLength(targetShape));
508503
}
509504

510505
// Unidirectional broadcasting shape to targetShape using a passed vector to avoid allocations
511506
template<typename T>
512-
void UnidirectionalBroadcast(const T* data, const std::vector<size_t>& shape, const std::vector<size_t>& targetShape, std::span<T> broadcastedData) {
507+
void UnidirectionalBroadcast(const T* data, const std::vector<size_t>& shape, const std::vector<size_t>& targetShape, T *broadcastedData) {
513508
size_t curLength = ConvertShapeToLength(shape);
514509
std::span<T> inData(const_cast<T*>(data), curLength);
515510
// Prepend shape with ones
@@ -518,9 +513,9 @@ void UnidirectionalBroadcast(const T* data, const std::vector<size_t>& shape, co
518513
std::vector<size_t> newShape(targetSize, 1);
519514
size_t offset = targetSize - shape.size();
520515
std::copy(shape.begin(), shape.end(), newShape.begin() + offset);
521-
BroadcastTensor<T>(inData, newShape, targetShape, broadcastedData);
516+
BroadcastTensor(inData, newShape, targetShape, broadcastedData);
522517
}
523-
BroadcastTensor<T, std::span<T>>(inData, shape, targetShape, broadcastedData);
518+
BroadcastTensor(inData, shape, targetShape, broadcastedData);
524519
}
525520

526521
/// compute stride of a tensor given its shape (assume layout is row-major)

0 commit comments

Comments
 (0)