summarize_op.cu
#include <cmath>   // std::sqrt
#include <limits>  // std::numeric_limits

#include <cuda.h>
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <thrust/system/cuda/execution_policy.h>

#include "caffe2/operators/summarize_op.h"
#include "caffe2/core/context_gpu.h"

namespace caffe2 {
namespace {
// Structure used to accumulate the moments and other statistical properties
// encountered so far.
template <typename T>
struct SummaryStatsData {
  T n;
  T min;
  T max;
  T mean;
  T M2;

  // Initialize to the identity element of the merge operation below.
  void initialize() {
    n = mean = M2 = 0;
    min = std::numeric_limits<T>::max();
    // lowest(), not min(): for floating-point T, numeric_limits<T>::min() is
    // the smallest positive value, which would break max-tracking whenever
    // every input is negative.
    max = std::numeric_limits<T>::lowest();
  }

  T variance() { return (n == 1 ? 0 : M2 / (n - 1)); }
};
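
// Worked example of this representation (illustrative numbers, not from the
// original source): accumulating the values {1, 2, 3} yields n = 3, mean = 2,
// and M2 = (1-2)^2 + (2-2)^2 + (3-2)^2 = 2, so variance() returns
// M2 / (n - 1) = 2 / 2 = 1.
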
// summary_stats_unary_op is a functor that takes in a value x and
// returns a SummaryStatsData describing the single observation x.
template <typename T>
struct summary_stats_unary_op {
  __host__ __device__ SummaryStatsData<T> operator()(const T& x) const {
    SummaryStatsData<T> result;
    result.n = 1;
    result.min = x;
    result.max = x;
    result.mean = x;
    result.M2 = 0;
    return result;
  }
};
// summary_stats_binary_op is a functor that accepts two SummaryStatsData
// structs and returns a new SummaryStatsData that is an approximation to
// the summary stats for all values aggregated so far.
template <typename T>
struct summary_stats_binary_op
    : public thrust::binary_function<const SummaryStatsData<T>&,
                                     const SummaryStatsData<T>&,
                                     SummaryStatsData<T> > {
  __host__ __device__ SummaryStatsData<T> operator()(
      const SummaryStatsData<T>& x, const SummaryStatsData<T>& y) const {
    SummaryStatsData<T> result;
    T n = x.n + y.n;
    T delta = y.mean - x.mean;
    T delta2 = delta * delta;
    result.n = n;
    result.min = thrust::min(x.min, y.min);
    result.max = thrust::max(x.max, y.max);
    result.mean = x.mean + delta * y.n / n;
    result.M2 = x.M2 + y.M2;
    result.M2 += delta2 * x.n * y.n / n;
    return result;
  }
};
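
// Worked example of the merge (this is Chan et al.'s parallel variance
// update; the numbers are illustrative): combining x = {1, 2}
// (n = 2, mean = 1.5, M2 = 0.5) with y = {3} (n = 1, mean = 3, M2 = 0)
// gives n = 3, delta = 1.5, mean = 1.5 + 1.5 * 1 / 3 = 2, and
// M2 = 0.5 + 0 + 1.5^2 * 2 * 1 / 3 = 2, matching a sequential pass over
// {1, 2, 3}.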
} // namespace
template <>
bool SummarizeOp<float, CUDAContext>::RunOnDevice() {
  auto& X = Input(0);
  const int N = X.numel();
  TORCH_DCHECK_GT(N, 0);
  // TODO(Yangqing): Any better way to avoid having to const cast?
  thrust::device_ptr<float> Xdata(const_cast<float*>(X.data<float>()));
  summary_stats_unary_op<float> unary_op;
  summary_stats_binary_op<float> binary_op;
  SummaryStatsData<float> init;
  init.initialize();
  // Compute the summary statistics in a single fused map-reduce pass.
  SummaryStatsData<float> result = thrust::transform_reduce(
#if THRUST_VERSION >= 100800
      thrust::cuda::par.on(context_.cuda_stream()),
#endif // THRUST_VERSION >= 100800
      Xdata, Xdata + N, unary_op, init, binary_op);
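  // Note on the version guard above: Thrust 1.8.0 (THRUST_VERSION 100800)
  // introduced per-call execution policies, so newer Thrust runs the
  // reduction on this operator's CUDA stream; older versions fall back to
  // Thrust's default stream behavior.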
  float standard_deviation = std::sqrt(result.variance());
  if (to_file_) {
    (*log_file_) << result.min << " " << result.max << " " << result.mean
                 << " " << standard_deviation << std::endl;
  }
  if (OutputSize()) {
    auto* Y = Output(0, {4}, at::dtype<float>());
    float output_buffer[NUM_STATS] = {
        result.min, result.max, result.mean, standard_deviation};
    context_.CopyFromCPU<float>(
        NUM_STATS, output_buffer, Y->template mutable_data<float>());
  }
  return true;
}
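
// Usage sketch (assumed from the operator's schema in summarize_op.h, not
// stated in this file): a "Summarize" op takes one input tensor; with the
// to_file argument set it appends "min max mean stddev" to a log file, and
// when an output is requested it also fills a 4-element tensor with
// {min, max, mean, stddev} in that order.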
REGISTER_CUDA_OPERATOR(Summarize, SummarizeOp<float, CUDAContext>);
} // namespace caffe2