weighted_sample_op.cu
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/weighted_sample_op.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace {
__global__ void WeightedSampleKernel(
const int batch_size,
const int weights_dim,
const float* in_weights_data,
const float* in_val_data,
float* samples,
int* out_idx_data,
float* out_val_data) {
CUDA_1D_KERNEL_LOOP(i, batch_size) {
int offset = i * weights_dim;
float sum = 0.0;
for (int j = 0; j < weights_dim; j++) {
sum += in_weights_data[offset + j];
}
samples[i] *= sum;
float cum_sum = 0.0;
int j = 0;
for (; j < weights_dim; j++) {
cum_sum += in_weights_data[offset + j];
if (cum_sum >= samples[i]) {
break;
}
}
out_idx_data[i] = min(j, weights_dim - 1);
if (out_val_data) {
out_val_data[i] = in_val_data[offset + out_idx_data[i]];
}
}
}
} // namespace
template <>
bool WeightedSampleOp<float, CUDAContext>::RunOnDevice() {
  CAFFE_ENFORCE_EQ(
      InputSize(),
      OutputSize(),
      "The number of tensors of the input and the output must be the same.");

  auto& in_weights = Input(0);
  int batch_size = in_weights.dim(0);
  int weights_dim = in_weights.dim(1);

  if (batch_size > 0 && weights_dim > 0) {
    auto* out_idx = Output(0, {batch_size, 1}, at::dtype<int>());
    ReinitializeTensor(
        &unif_samples_, {batch_size}, at::dtype<float>().device(CUDA));
    const float* in_weights_data = in_weights.data<float>();
    const float* in_val_data = nullptr;
    int* out_idx_data = out_idx->template mutable_data<int>();
    float* out_val_data = nullptr;

    if (OutputSize() == 2) {
      auto& in_val = Input(1);
      CAFFE_ENFORCE_EQ(
          in_weights.sizes(),
          in_val.sizes(),
          "The sampling weights tensor and the sampling values tensor must have the same dimensions.");
      in_val_data = in_val.data<float>();

      auto* out_val = Output(1, {batch_size, 1}, at::dtype<float>());
      out_val_data = out_val->template mutable_data<float>();
    }

    // Draw one uniform sample per row with cuRAND; the kernel rescales each
    // sample by its row's weight sum before the cumulative-sum scan.
    float* unif_samples_data = unif_samples_.mutable_data<float>();
    CURAND_ENFORCE(curandGenerateUniform(
        context_.curand_generator(), unif_samples_data, batch_size));

    WeightedSampleKernel<<<
        CAFFE_GET_BLOCKS(batch_size),
        CAFFE_CUDA_NUM_THREADS,
        0,
        context_.cuda_stream()>>>(
        batch_size,
        weights_dim,
        in_weights_data,
        in_val_data,
        unif_samples_data,
        out_idx_data,
        out_val_data);
    C10_CUDA_KERNEL_LAUNCH_CHECK();
  } else {
    // Empty input: produce empty outputs and return.
    /* out_idx = */ Output(0, {0}, at::dtype<int>());
    if (OutputSize() == 2) {
      /* out_val = */ Output(1, {0}, at::dtype<float>());
    }
  }
  return true;
}

REGISTER_CUDA_OPERATOR(WeightedSample, WeightedSampleOp<float, CUDAContext>);

} // namespace caffe2
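
For reference, the per-row sampling rule in WeightedSampleKernel is standard inverse-CDF sampling over unnormalized weights. The host-side C++ sketch below illustrates the same rule in isolation; the helper name and the use of <random> in place of cuRAND are illustrative assumptions, not part of the operator.

#include <algorithm>
#include <random>
#include <vector>

// Illustrative host-side equivalent of one kernel iteration: draw a uniform
// sample, scale it by the row's weight sum, and return the first index whose
// cumulative weight reaches that target.
int WeightedSampleRow(const std::vector<float>& weights, std::mt19937& rng) {
  float sum = 0.f;
  for (float w : weights) {
    sum += w;
  }
  std::uniform_real_distribution<float> unif(0.f, 1.f);
  const float target = unif(rng) * sum; // mirrors `samples[i] *= sum`

  float cum_sum = 0.f;
  int j = 0;
  for (; j < static_cast<int>(weights.size()); ++j) {
    cum_sum += weights[j];
    if (cum_sum >= target) {
      break;
    }
  }
  // Clamp to the last index, exactly as the kernel does.
  return std::min(j, static_cast<int>(weights.size()) - 1);
}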