#include "caffe2/operators/fused_rowwise_random_quantization_ops.h"
#include <c10/util/Registry.h>
#include "caffe2/utils/math.h"
namespace caffe2 {
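
// Runtime check for little-endian byte order: the fused row format stores the
// per-row min and max as raw 4-byte floats, so a consistent byte order is
// required when encoding and decoding.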
#define IS_LITTLE_ENDIAN                                      \
  [] {                                                        \
    const int32_t kValue = 1;                                 \
    return reinterpret_cast<const uint8_t*>(&kValue)[0] == 1; \
  }()

template <class Context>
bool FloatToFusedRandRowwiseQuantizedOp<Context>::RunOnDevice() {
  CAFFE_ENFORCE(IS_LITTLE_ENDIAN, "Unsupported endianness");

  const auto& input = Input(DATA_FLOAT);
  CAFFE_ENFORCE_EQ(
      input.dim(),
      2,
      "Expect the input to be a matrix. Reshape the input tensor to a matrix before use.");
  const auto input_rows = input.size(0);
  const auto input_columns = input.size(1);
// The "fused" representation stores the [bitwidth][tail][min][max]
// with the row-wise quantized data in one tensor. Since we store 8/bitwidth
// quantized data in one byte, the last buckets of some bytes may have
// unused bits. There are totally tail buckets are unused.
// We encode *bitwidth* and *tail* at the beginning of
// each row, following by 32-bit floating data respresenting min and max.
// | bitwidth | tail | min | max | ... int8 data ... |
// | 1B | 1B | 4B | 4B | ...output_data....|
// In output_data: the b-th bucket of the i-th byte stores
// the i-th data of the b-th segment of input row
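  // Worked example: with bitwidth_ = 2 and input_columns = 5, each byte holds
  // 8 / 2 = 4 values, so segment_size = ceil(5 / 4) = 2 data bytes, the output
  // row is 10 + 2 = 12 bytes wide, and tail = 2 * 4 - 5 = 3 buckets go unused.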
  size_t data_per_byte = 8 / bitwidth_;
  // Number of data bytes per output row: ceil(input_columns / data_per_byte).
  size_t segment_size = (input_columns + data_per_byte - 1) / data_per_byte;
  const std::vector<int64_t> output_dimensions = {
      input_rows, 10 + static_cast<int64_t>(segment_size)};
  auto* output =
      Output(DATA_FUSED_QUANTIZED, output_dimensions, at::dtype<uint8_t>());
  const auto* input_data = input.template data<float>();
  auto* output_data = output->template mutable_data<uint8_t>();
  const size_t output_columns = static_cast<size_t>(output->size(1));
  memset(output_data, 0, output->numel());

  if (random_) {
    random_buffer_.resize(input_columns);
  }
  // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
  for (size_t row = 0; row < input_rows; ++row) {
    if (random_) {
#ifdef FUSED_ROWWISE_RANDOM_QUANTIZATION_USE_MKL
      int status = vsRngUniform(
          VSL_RNG_METHOD_UNIFORM_STD,
          vslStream_,
          input_columns,
          random_buffer_.data(),
          0.0f,
          1.0f);
      if (status != VSL_ERROR_OK) {
        LOG(WARNING) << "vsRngUniform returns " << status;
      }
#else
      for (int i = 0; i < input_columns; ++i) {
        random_buffer_[i] = (*dis_)(gen_);
      }
#endif
    }
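    // quantize_and_compress fills the entire fused row (header plus packed
    // data). When random_ is set, random_buffer_ supplies one uniform random
    // number per element to drive the stochastic rounding decision.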
    math::quantize_and_compress(
        input_data + row * input_columns,
        output_data + row * output_columns,
        input_columns,
        bitwidth_,
        random_,
        random_buffer_.data());
  }
  return true;
}

template <class Context>
bool FusedRandRowwiseQuantizedToFloatOp<Context>::RunOnDevice() {
  CAFFE_ENFORCE(IS_LITTLE_ENDIAN, "Unsupported endianness");

  const auto& input = Input(DATA_FUSED_QUANTIZED);
  CAFFE_ENFORCE_EQ(input.dim(), 2, "Expect input to be a matrix.");
  CAFFE_ENFORCE_GE(
      input.numel(),
      4,
      "Expect input to have size greater than or equal to 4.");
  const auto input_rows = input.size(0);
  const auto input_columns = input.size(1);
  const auto* input_data = input.template data<uint8_t>();
  const size_t bitwidth = input_data[0];
  CAFFE_ENFORCE(
      bitwidth == 1 || bitwidth == 2 || bitwidth == 4 || bitwidth == 8,
      "Unsupported bitwidth");
  const size_t tail = input_data[1];
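
  // Recover the original row width: each of the (input_columns - 10) data
  // bytes holds 8 / bitwidth values, and the last bytes leave `tail` buckets
  // unused.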
  const size_t output_columns = (input_columns - 10) * (8 / bitwidth) - tail;
  const std::vector<int64_t> output_dimensions = {
      input_rows, static_cast<int64_t>(output_columns)};
  auto* output = Output(DATA_FLOAT, output_dimensions, at::dtype<float>());
  auto* output_data = output->template mutable_data<float>();

  // NOLINTNEXTLINE(clang-diagnostic-sign-compare)
  for (size_t row = 0; row < input_rows; ++row) {
    math::decompress_and_dequantize(
        input_data + row * input_columns,
        output_data + row * output_columns,
        input_columns);
  }
  return true;
}

#undef IS_LITTLE_ENDIAN

REGISTER_CPU_OPERATOR(
    FloatToFusedRandRowwiseQuantized,
    FloatToFusedRandRowwiseQuantizedOp<CPUContext>);
OPERATOR_SCHEMA(FloatToFusedRandRowwiseQuantized)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& def,
                                const vector<TensorShape>& in) {
      ArgumentHelper helper(def);
      auto bitwidth = helper.GetSingleArgument<int32_t>("bitwidth", 8);
      size_t data_per_byte = 8 / bitwidth;
      vector<TensorShape> out;
      TensorShape X = in[0];
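      // One output row = 10 header bytes + ceil(cols / data_per_byte) data
      // bytes, mirroring the layout built in RunOnDevice above.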
      X.set_dims(1, 10 + (X.dims(1) + data_per_byte - 1) / data_per_byte);
      out.push_back(std::move(X));
      out[0].set_data_type(TensorProto_DataType_UINT8);
      return out;
    })
.SetDoc(R"DOC(
Applies row-wise stochastic/random quantization by determining the range of
each row in the input matrix, and then quantize each element to one of two
closest discrete levels by randomly drawing Bernoulli distribution.
The method is extended from TernGrad [1],
which randomly quantizes gradients to three levels to reduce communication in distributed training.
The format of each row (x) in the output matrix is [bitwidth][tail][min][max][data]:
bitwidth[1 Byte]: bitwidth per data [1, 2, 4 or 8];
tail[1 Byte]: the number of unused buckets [1-8] (One byte is split to 8/bitwidth buckets and each bucket stores one low-precision data in bitwidth bits);
min[4 Bytes]: the minimum floating value min(x);
max[4 Bytes]: the maximum floating value max(x);
data: quantized data.
The quantization is uniform with levels q = min + (max-min)/(2^bitwidth - 1)*[0:1:2^bitwidth].
During stochastic/random quantization x'=Quantize(x), for q_j < x_i <= q_{j+1}, we draw quantization x'_i from Bernoulli distributions with
P(x'_i = q_{j+1}) = (x_i - q_j)/(q_{j+1} - q_j), and
P(x'_i = q_j) = (q_{j+1} - x_i)/(q_{j+1} - q_j) where x'_i is the quantized value of x_i.
[1] proved E{x'_i}=x_i, which is an unbiased approximation. More details are in the paper.
For example, suppose targeted bitwidth = 2 and x = [0.3, -1.4, -0.6, 0.9, 1.0],
then tail = 3, min = -1.4, max = 1.0 and q = [-1.4, -0.6, 0.2, 1.0].
x_1 = 0.3 will be quantized to x'_1 = 0.2 with probability 7/8 and to x'_1 = 1.0 with probability 1/8.
The storage format of quantized data is: [x'_1|x'_3|x'_5|xxx]-[x'_2|x'_4|xxx|xxx].
In general, a input row is split to multiple segments. One segment is a continuous subarray of the row,
and its length is the number of bytes storing quantized data in the output matrix.
The b-th bucket of the i-th byte stores the i-th data of the b-th segment of input row.
[1] Wen, Wei, Cong Xu, Feng Yan, Chunpeng Wu, Yandan Wang, Yiran Chen, and Hai Li.
"Terngrad: Ternary gradients to reduce communication in distributed deep learning."
In Advances in Neural Information Processing Systems, pp. 1508-1518. 2017.
)DOC")
.Input(0, "input", "Float32 input data")
.Output(0, "output", "Fused bitwidth, tail, min, max and quantized data")
.Arg("bitwidth", "How many bits to quantize per data (defaults to 8).")
.Arg("random", "random or not (True). False is set up for unittest.");
NO_GRADIENT(FloatToFusedRandRowwiseQuantized);

REGISTER_CPU_OPERATOR(
    FusedRandRowwiseQuantizedToFloat,
    FusedRandRowwiseQuantizedToFloatOp<CPUContext>);
OPERATOR_SCHEMA(FusedRandRowwiseQuantizedToFloat)
    .NumInputs(1)
    .NumOutputs(1)
    .TensorInferenceFunction([](const OperatorDef& def,
                                const vector<TensorShape>&) {
      vector<TensorShape> out;
      for (int i = 0; i < def.output_size(); i++) {
        TensorShape ts;
        ts.set_unknown_shape(true);
        ts.set_data_type(TensorProto_DataType_FLOAT);
        out.push_back(ts);
      }
      return out;
    })
.SetDoc(R"DOC(
De-quantizes the result of the FloatToFusedRandRowwiseQuantized operator.
Refer FloatToFusedRandRowwiseQuantized operator for details.
)DOC")
    .Input(
        0,
        "quantized_input",
        "Fused bitwidth, tail, min, max and quantized data")
    .Output(0, "float_input", "Float32 data");
NO_GRADIENT(FusedRandRowwiseQuantizedToFloat);

} // namespace caffe2
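
// A minimal usage sketch (not part of the original file): it round-trips a
// float matrix through both operators using standard Caffe2 APIs (Workspace,
// OperatorDef, MakeArgument, CreateOperator). The blob names "X", "X_q", and
// "X_dq" are made up for illustration, and "X" is assumed to already hold a
// 2-D float tensor.
//
//   caffe2::Workspace ws;
//   // ... fill blob "X" with a 2-D float CPU tensor ...
//
//   caffe2::OperatorDef quant_def;
//   quant_def.set_type("FloatToFusedRandRowwiseQuantized");
//   quant_def.add_input("X");
//   quant_def.add_output("X_q");
//   quant_def.add_arg()->CopyFrom(caffe2::MakeArgument<int>("bitwidth", 2));
//   caffe2::CreateOperator(quant_def, &ws)->Run();
//
//   caffe2::OperatorDef dequant_def;
//   dequant_def.set_type("FusedRandRowwiseQuantizedToFloat");
//   dequant_def.add_input("X_q");
//   dequant_def.add_output("X_dq");  // same shape as "X", stochastically rounded
//   caffe2::CreateOperator(dequant_def, &ws)->Run();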