forked from pytorch/pytorch
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathspace_batch_op_gpu.cu
181 lines (166 loc) · 4.8 KB
/
space_batch_op_gpu.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#include "caffe2/operators/space_batch_op.h"
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
__global__ void SpaceToBatch(
int N,
int output_batch,
int output_depth,
int output_height,
int output_width,
int input_batch,
int input_depth,
int input_height,
int input_width,
const int pad_l,
const int pad_t,
int block_size,
const float* input,
float* output) {
CUDA_1D_KERNEL_LOOP(i, N) {
// Recall:
// const auto output_offset =
// ((out_b * output_depth + d) * output_height + out_h) * output_width +
// out_w;
const int out_w = i % output_width;
const int i_2 = i / output_width;
const int out_h = i_2 % output_height;
const int i_3 = i_2 / output_height;
const int d = i_3 % output_depth;
const int out_b = i_3 / output_depth;
const int in_b = out_b % input_batch;
const int offset_w = (out_b / input_batch) % block_size;
const int offset_h = (out_b / input_batch) / block_size;
const int in_h = out_h * block_size + offset_h - pad_t;
const int in_w = out_w * block_size + offset_w - pad_l;
if (in_h >= 0 && in_w >= 0 && in_h < input_height && in_w < input_width) {
const auto input_offset =
((in_b * input_depth + d) * input_height + in_h) * input_width +
in_w;
output[i] = input[input_offset];
} else {
output[i] = 0.0;
}
}
}
template <>
void spaceToBatch<CUDAContext>(
const Tensor& input,
int pad_t,
int pad_l,
int block_size,
Tensor* output,
CUDAContext* context) {
const int output_batch = output->dim32(0);
const int output_depth = output->dim32(1);
const int output_height = output->dim32(2);
const int output_width = output->dim32(3);
const int input_batch = input.dim32(0);
const int input_depth = input.dim32(1);
const int input_height = input.dim32(2);
const int input_width = input.dim32(3);
const int N = output->numel();
SpaceToBatch<<<
CAFFE_GET_BLOCKS(N),
CAFFE_CUDA_NUM_THREADS,
0,
context->cuda_stream()>>>(
N,
output_batch,
output_depth,
output_height,
output_width,
input_batch,
input_depth,
input_height,
input_width,
pad_l,
pad_t,
block_size,
input.data<float>(),
output->template mutable_data<float>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
__global__ void BatchToSpace(
int N,
int output_batch,
int output_depth,
int output_height,
int output_width,
int input_batch,
int input_depth,
int input_height,
int input_width,
const int pad_l,
const int pad_t,
int block_size,
const float* input,
float* output) {
CUDA_1D_KERNEL_LOOP(i, N) {
// Recall:
// const auto input_offset = ((in_b * input_depth + d) *
// input_height + in_h) * input_width + in_w;
const int in_w = i % input_width;
const int i_2 = i / input_width;
const int in_h = i_2 % input_height;
const int i_3 = i_2 / input_height;
const int d = i_3 % input_depth;
const int in_b = i_3 / input_depth;
const int out_b = in_b % output_batch;
const int offset_w = (in_b / output_batch) % block_size;
const int offset_h = (in_b / output_batch) / block_size;
const int out_h = in_h * block_size + offset_h - pad_t;
const int out_w = in_w * block_size + offset_w - pad_l;
if (out_h >= 0 && out_w >= 0 && out_h < output_height &&
out_w < output_width) {
const auto output_offset =
((out_b * output_depth + d) * output_height + out_h) *
output_width +
out_w;
output[output_offset] = input[i];
}
}
}
template <>
void batchToSpace(
const Tensor& input,
int pad_t,
int pad_l,
int block_size,
Tensor* output,
CUDAContext* context) {
CAFFE_ENFORCE(input.dim() == 4);
CAFFE_ENFORCE(output->dim() == 4);
const int output_batch = output->dim32(0);
const int output_depth = output->dim32(1);
const int output_height = output->dim32(2);
const int output_width = output->dim32(3);
const int input_batch = input.dim32(0);
const int input_depth = input.dim32(1);
const int input_height = input.dim32(2);
const int input_width = input.dim32(3);
const int N = input.numel();
BatchToSpace<<<
CAFFE_GET_BLOCKS(N),
CAFFE_CUDA_NUM_THREADS,
0,
context->cuda_stream()>>>(
N,
output_batch,
output_depth,
output_height,
output_width,
input_batch,
input_depth,
input_height,
input_width,
pad_l,
pad_t,
block_size,
input.data<float>(),
output->template mutable_data<float>());
C10_CUDA_KERNEL_LAUNCH_CHECK();
}
REGISTER_CUDA_OPERATOR(SpaceToBatch, SpaceToBatchOp<CUDAContext>);
REGISTER_CUDA_OPERATOR(BatchToSpace, BatchToSpaceOp<CUDAContext>);
}