forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsparse_to_dense_op.cu
79 lines (63 loc) · 2.41 KB
/
sparse_to_dense_op.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#include "caffe2/operators/sparse_to_dense_op.h"
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/utils/GpuAtomics.cuh"
namespace caffe2 {
template <typename TInd, typename TData>
__global__ void SparseToDenseKernel(
size_t N, int64_t block_nitems, const TInd* indices, const TData* vals, TData* dst) {
CUDA_1D_KERNEL_LOOP(i, N) {
int idx = indices[i / block_nitems];
int dst_idx = block_nitems * idx + i % block_nitems;
gpu_atomic_add(&dst[dst_idx], vals[i]);
}
}
template <>
bool SparseToDenseOp<CUDAContext>::RunOnDevice() {
return DispatchHelper<TensorTypes<int32_t>>::call(
this, Input(INDICES));
}
template <>
template <typename TInd>
bool SparseToDenseOp<CUDAContext>::DoRunWithType() {
return DispatchHelper<
TensorTypes2<
float,
int32_t>,
TInd>::call(this, Input(VALUES));
}
template <>
template <typename TInd, typename TData>
bool SparseToDenseOp<CUDAContext>::DoRunWithType2() {
auto& sparse_indices = Input(INDICES);
CAFFE_ENFORCE_EQ(sparse_indices.dim(), 1);
auto& sparse_values = Input(VALUES);
CAFFE_ENFORCE_GE(sparse_values.dim(), 1);
CAFFE_ENFORCE_EQ(sparse_indices.numel(), sparse_values.dim(0));
const TInd* sparse_indices_vec = sparse_indices.template data<TInd>();
const int32_t sparse_indices_len = sparse_indices.dim32(0);
const int output_first_dim =
GetOutputFirstDim(sparse_indices_vec, sparse_indices_len);
auto shape = sparse_values.sizes().vec();
shape[0] = output_first_dim;
auto* output = Output(0, shape, at::dtype<TData>());
TData* output_data = output->template mutable_data<TData>();
math::Set<TData>(output->numel(), TData(0), output_data, &context_);
const auto block_nitems = sparse_values.size_from_dim(1);
const TData* sparse_values_vec = sparse_values.template data<TData>();
size_t N = block_nitems * sparse_indices_len;
CAFFE_ENFORCE_EQ(output->numel(), output_first_dim * block_nitems);
SparseToDenseKernel<TInd, TData><<<
CAFFE_GET_BLOCKS(N), CAFFE_CUDA_NUM_THREADS, 0,
context_.cuda_stream()>>>(
N,
block_nitems,
sparse_indices_vec,
sparse_values_vec,
output_data
);
C10_CUDA_KERNEL_LAUNCH_CHECK();
return true;
}
REGISTER_CUDA_OPERATOR(SparseToDense, SparseToDenseOp<CUDAContext>);
} // namespace caffe2