#include "caffe2/operators/lengths_reducer_ops.h"
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/segment_reduction_op.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
// Use _STR option because the schema is declared using _STR version too in
// generic fashion. Otherwise it'd break schema declaration check.
// TODO(dzhulgakov): remove _STR when all lengths ops are off generic version.
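// The trailing 0/1 template arguments of CPUSparseLengthsReductionOp are
// boolean flags (hence the NOLINT suppressions below); judging by the aliases
// they appear to select the weighted and mean variants, with a third
// positional-weight flag used further down for
// SparseLengthsPositionalWeightedSum.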
using SparseLengthsSumOp =
    // NOLINTNEXTLINE(modernize-use-bool-literals)
    CPUSparseLengthsReductionOp<float, TensorTypes<float, at::Half>, 0, 0>;
using SparseLengthsWeightedSumOp =
    // NOLINTNEXTLINE(modernize-use-bool-literals)
    CPUSparseLengthsReductionOp<float, TensorTypes<float, at::Half>, 1, 0>;
using SparseLengthsMeanOp =
    // NOLINTNEXTLINE(modernize-use-bool-literals)
    CPUSparseLengthsReductionOp<float, TensorTypes<float, at::Half>, 0, 1>;
REGISTER_CPU_OPERATOR(SparseLengthsSum, SparseLengthsSumOp);
REGISTER_CPU_OPERATOR(SparseLengthsWeightedSum, SparseLengthsWeightedSumOp);
REGISTER_CPU_OPERATOR(SparseLengthsMean, SparseLengthsMeanOp);
OPERATOR_SCHEMA(SparseLengthsPositionalWeightedSum)
    .NumInputs(4)
    .NumOutputs(1)
    .SetDoc(R"DOC(
Variation of the SparseLengthsWeightedSum operator, where, for each row,
weights are accessed by positional indices [0..L-1], where L is the length of
the given row. This is essentially a fused LengthsRangeFill + Gather +
SparseLengthsWeightedSum operator.
)DOC")
    .Input(
        0,
        "DATA",
        "uint8 tensor obtained with "
        "operator FloatToRowwiseQuantized8Bits")
    .Input(
        1,
        "WEIGHT",
        "Scalar multipliers for the input slices. Must "
        "be a vector with the length matching the length of DATA")
    .Input(
        2,
        "INDICES",
        "Integer vector containing indices of the first "
        "dimension of DATA for the slices that are being aggregated")
    .Input(
        3,
        "LENGTHS",
        "Vector with the same sum of elements as the first dimension of DATA")
    .Output(0, "output", "output");
REGISTER_CPU_OPERATOR_STR(
    "SparseLengthsPositionalWeightedSum",
    CPUSparseLengthsReductionOp<float, TensorTypes<float, at::Half>, 1, 0, 1>);
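// Substitutes the {op} and {op_doc} placeholders in a reducer definition's doc
// template; the final enforce checks that no {extra} placeholder remains
// (c10::ReplaceAll returns the number of replacements it made).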
template <typename Def>
string FormatDoc() {
  string doc = Def::doc;
  c10::ReplaceAll(doc, "{op}", Def::OpDef::name);
  c10::ReplaceAll(doc, "{op_doc}", Def::OpDef::doc);
  auto replaced = c10::ReplaceAll(doc, "{extra}", "");
  CAFFE_ENFORCE_EQ(replaced, 0);
  return doc;
}
using SparseLengthsSumDef = AbstractSparseLengthsDef<
    float,
    int,
    CPUContext,
    SumReducerDef,
    true /*GradientNeedIndices*/>;
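// AbstractSparseLengthsDef (segment_reduction_op.h) bundles the forward op,
// backward op, schema populator and gradient maker for a reducer;
// GradientNeedIndices=true presumably hands INDICES to the gradient op so it
// can scatter gradients back onto the rows that were gathered.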
OPERATOR_SCHEMA(SparseLengthsSum)
    .NumInputs(SparseLengthsSumDef::ForwardOp::kNumInputs)
    .NumOutputs(1)
    .ValueKeyLengthInputFillers(
        SparseLengthsSumOp::DATA,
        SparseLengthsSumOp::INDICES,
        SparseLengthsSumOp::LENGTHS)
    .SetDoc(FormatDoc<SparseLengthsSumDef>())
    .Output(0, "OUTPUT", "Aggregated tensor")
    .FillUsing(SparseLengthsSumDef::PopulateSchema)
    .InheritOnnxSchema();
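// Illustrative example (not part of the schema): with
//   DATA    = [[1, 2], [3, 4], [5, 6]]
//   INDICES = [0, 2, 1]
//   LENGTHS = [2, 1]
// the first output row is DATA[0] + DATA[2] = [6, 8] and the second output
// row is DATA[1] = [3, 4].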
REGISTER_CPU_OPERATOR(
    SparseLengthsSumGradient,
    SparseLengthsSumDef::BackwardOp);
OPERATOR_SCHEMA(SparseLengthsSumGradient)
    .NumInputs(SparseLengthsSumDef::BackwardOp::kNumInputs)
    .NumOutputs(1)
    .DisallowInputFillers();
REGISTER_GRADIENT(SparseLengthsSum, SparseLengthsSumDef::GetGradient)
REGISTER_CPU_OPERATOR(
    TTSparseLengthsSum,
    TTSparseLengthsSumOp<float, CPUContext>);
REGISTER_CPU_OPERATOR(
    TTSparseLengthsSumGradient,
    TTSparseLengthsSumGradientOp<float, CPUContext>);
OPERATOR_SCHEMA(TTSparseLengthsSum)
    .NumInputs(5)
    .NumOutputs(4)
    .SetDoc(R"DOC(
This operator introduces a new, parameter-efficient embedding layer, termed TT embedding, which
can be plugged into any model and trained end-to-end. The benefits of our compressed TT layer
are twofold. Firstly, instead of storing a huge embedding matrix, it stores a sequence of much smaller
2-dimensional and 3-dimensional tensors, necessary for reconstructing the required embeddings,
which allows compressing the model significantly at the cost of a negligible performance drop.
Secondly, the overall number of parameters can be kept relatively small (and constant) during the whole
training stage, which makes it possible to use larger batches or train efficiently with limited resources.
)DOC")
    .Arg("factor_i", "vector<int>: factorization of vocabulary size")
    .Arg("factor_j", "vector<int>: factorization of embedding size")
    .Arg("ranks", "vector<int>: ranks of the TT cores")
    .Arg("emb_size", "int: the size of each embedding entry")
    .Input(0, "core0", "tensor core 0")
    .Input(1, "core1", "tensor core 1")
    .Input(2, "core2", "tensor core 2")
    .Input(3, "index", "index for embedding")
    .Input(4, "lengths", "segment lengths")
    .Output(0, "OUTPUT", "Aggregated tensor")
    .Output(
        1,
        "core0_output",
        "intermediate mm result from core0 for backward path")
    .Output(
        2,
        "core1_output",
        "intermediate mm result from core1 for backward path")
    .Output(3, "indices", "the index for each core");
using SparseLengthsWeightedSumDef = AbstractSparseLengthsDef<
    float,
    int,
    CPUContext,
    WeightedSumReducerDef,
    true /*GradientNeedIndices*/>;
OPERATOR_SCHEMA(SparseLengthsWeightedSum)
    .NumInputs(SparseLengthsWeightedSumDef::ForwardOp::kNumInputs)
    .NumOutputs(1)
    .WeightedValueKeyLengthInputFillers(
        SparseLengthsWeightedSumOp::DATA,
        SparseLengthsWeightedSumOp::INDICES,
        SparseLengthsWeightedSumOp::LENGTHS,
        SparseLengthsWeightedSumOp::WEIGHT)
    .SetDoc(FormatDoc<SparseLengthsWeightedSumDef>())
    .Output(0, "OUTPUT", "Aggregated tensor")
    .FillUsing(SparseLengthsWeightedSumDef::PopulateSchema)
    .InheritOnnxSchema();
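// Illustrative example (not part of the schema): with DATA, INDICES and
// LENGTHS as in the SparseLengthsSum example above and per-index weights
// [0.5, 2.0, 1.0], the first output row is
// 0.5 * DATA[0] + 2.0 * DATA[2] = [10.5, 13] and the second is
// 1.0 * DATA[1] = [3, 4].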
REGISTER_CPU_OPERATOR(
    SparseLengthsWeightedSumGradient,
    SparseLengthsWeightedSumDef::BackwardOp);
OPERATOR_SCHEMA(SparseLengthsWeightedSumGradient)
    .NumInputs(SparseLengthsWeightedSumDef::BackwardOp::kNumInputs)
    .NumOutputs(1)
    .DisallowInputFillers();
REGISTER_GRADIENT(
    SparseLengthsWeightedSum,
    SparseLengthsWeightedSumDef::GetGradient)
using SparseLengthsMeanDef = AbstractSparseLengthsDef<
    float,
    int,
    CPUContext,
    MeanReducerDef,
    true /*GradientNeedIndices*/>;
OPERATOR_SCHEMA(SparseLengthsMean)
    .NumInputs(SparseLengthsMeanDef::ForwardOp::kNumInputs)
    .NumOutputs(1)
    .ValueKeyLengthInputFillers(
        SparseLengthsMeanOp::DATA,
        SparseLengthsMeanOp::INDICES,
        SparseLengthsMeanOp::LENGTHS)
    .SetDoc(FormatDoc<SparseLengthsMeanDef>())
    .Output(0, "OUTPUT", "Aggregated tensor")
    .FillUsing(SparseLengthsMeanDef::PopulateSchema);
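// Illustrative example (not part of the schema): with the same DATA, INDICES
// and LENGTHS as above, the first output row is the mean of DATA[0] and
// DATA[2] = [3, 4] and the second is DATA[1] = [3, 4].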
REGISTER_CPU_OPERATOR(
    SparseLengthsMeanGradient,
    SparseLengthsMeanDef::BackwardOp);
OPERATOR_SCHEMA(SparseLengthsMeanGradient)
    .NumInputs(SparseLengthsMeanDef::BackwardOp::kNumInputs)
    .NumOutputs(1)
    .DisallowInputFillers();
REGISTER_GRADIENT(SparseLengthsMean, SparseLengthsMeanDef::GetGradient)
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables,cppcoreguidelines-avoid-magic-numbers)
OPERATOR_SCHEMA(TTSparseLengthsSumGradient).NumInputs(8).NumOutputs(3);
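// The 8 inputs / 3 outputs of TTSparseLengthsSumGradient match the blob lists
// assembled by GetTTSparseLengthsGradient below.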
class GetTTSparseLengthsGradient : public GradientMakerBase {
  using GradientMakerBase::GradientMakerBase;
  vector<OperatorDef> GetGradientDefs() override {
    // set up the input and output
    return SingleGradientDef(
        "TTSparseLengthsSumGradient",
        "",
        // CORE0, CORE1, CORE2, LENGTHS, CORE0_output, CORE1_output,
        // indices, dY
        vector<string>{
            I(0), I(1), I(2), I(4), O(1), O(2), O(3), GO(0)},
        // dCore0, dCore1, dCore2
        vector<string>{GI(0), GI(1), GI(2)});
  }
};
REGISTER_GRADIENT(TTSparseLengthsSum, GetTTSparseLengthsGradient)
} // namespace caffe2