gather_fused_8bit_rowwise_op.h
#pragma once

#include "caffe2/core/operator.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"

namespace caffe2 {

template <class Context>
class GatherFused8BitRowwiseOp : public Operator<Context> {
 public:
  USE_OPERATOR_CONTEXT_FUNCTIONS;
  USE_SIMPLE_CTOR_DTOR(GatherFused8BitRowwiseOp);

  bool RunOnDevice() override {
    // Dispatch on the integral type of INDICES (int32_t or int64_t).
    return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
        this, this->template Input<Tensor>(INDICES, CPU));
  }

  template <typename Index>
  bool DoRunWithType() {
    const auto& data = Input(DATA);
    const auto& indices = Input(INDICES);

    CAFFE_ENFORCE_EQ(data.dim(), 2, "DATA must be a matrix");
    CAFFE_ENFORCE_EQ(indices.dim(), 1, "INDICES must be a vector");
    CAFFE_ENFORCE_GT(data.size(1), 8, "DATA must have more than 8 columns");

    // Subtract 8 from the #columns of data for the 4 bytes for scale and 4
    // bytes for bias that we use in the fused representation (per row).
    const std::vector<int64_t> shape = {indices.size(0), data.size(1) - 8};
    auto* output = Output(0, shape, at::dtype<float>());

    auto block_bytesize = data.size_from_dim(1) * data.dtype().itemsize();
    int N = indices.numel();

    const uint8_t* src_base = data.template data<uint8_t>();
    const Index* idxs = indices.template data<Index>();
    auto out = output->template mutable_data<float>();

    for (const auto i : c10::irange(N)) {
      auto idx = idxs[i];
      CAFFE_ENFORCE(
          0 <= idx && idx < data.size(0),
          "INDICES element is out of DATA bounds, id=",
          idx,
          " data_dim=",
          data.size(0));
      const uint8_t* src = src_base + idx * block_bytesize;
      ConstEigenVectorArrayMap<uint8_t> input_row_values(src, shape[1]);
      // The per-row scale and bias are stored as two floats immediately after
      // the quantized values of that row.
      ConstEigenVectorArrayMap<float> input_row_scale_bias(
          reinterpret_cast<const float*>(src + shape[1]), 2);
      EigenVectorArrayMap<float> output_row(out + i * shape[1], shape[1]);

      // Dequantize the gathered row: value * scale + bias.
      output_row = input_row_values.cast<float>() * input_row_scale_bias(0) +
          input_row_scale_bias(1);
    }
    return true;
  }

  INPUT_TAGS(DATA, INDICES);
};

} // namespace caffe2
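
The operator above gathers rows stored in a fused 8-bit rowwise layout, where each DATA row holds the quantized uint8 values followed by 8 bytes containing a per-row float scale and float bias. As a standalone illustration (not part of this header), the sketch below packs one float row into that layout and then dequantizes it with the same value * scale + bias math used in DoRunWithType. The quantization step shown here (scale = (max - min) / 255, bias = min) is an assumption about how such rows are typically produced; the operator itself only performs the gather and dequantization.

// Standalone sketch of the fused 8-bit rowwise layout assumed by the operator.
// The encode side (scale/bias choice) is illustrative only.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  const std::vector<float> row = {0.0f, 0.5f, 1.0f, 2.0f};
  const float min = *std::min_element(row.begin(), row.end());
  const float max = *std::max_element(row.begin(), row.end());
  const float scale = (max - min) / 255.0f;
  const float bias = min;

  // Pack: quantized values, then 4 bytes of scale and 4 bytes of bias.
  std::vector<uint8_t> fused(row.size() + 8);
  for (size_t i = 0; i < row.size(); ++i) {
    fused[i] = static_cast<uint8_t>((row[i] - bias) / scale + 0.5f);
  }
  std::memcpy(fused.data() + row.size(), &scale, sizeof(float));
  std::memcpy(fused.data() + row.size() + 4, &bias, sizeof(float));

  // Unpack: read scale and bias from the last 8 bytes, then apply
  // value * scale + bias, mirroring the operator's per-row dequantization.
  float stored_scale, stored_bias;
  std::memcpy(&stored_scale, fused.data() + row.size(), sizeof(float));
  std::memcpy(&stored_bias, fused.data() + row.size() + 4, sizeof(float));
  for (size_t i = 0; i < row.size(); ++i) {
    const float dequantized = fused[i] * stored_scale + stored_bias;
    std::printf("%zu: %f -> %f\n", i, row[i], dequantized);
  }
  return 0;
}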