@@ -36,26 +36,32 @@ namespace dnnl {
3636namespace impl {
3737namespace cpu {
3838
39+ namespace {
40+ template <typename gates_t , typename acc_t >
41+ // The loop body needs to be put in a function as some versions of icc have
42+ // an issue with lambdas & macros inside omp simd loops
43+ inline void body_loop (int i, int k, const gates_t *ws_gates, acc_t *diff_bias,
44+ const rnn_utils::rnn_conf_t &rnn) {
45+ for (int j = 0 ; j < rnn.mb ; j++)
46+ diff_bias[i * rnn.dhc + k]
47+ += ws_gates[j * rnn.scratch_gates_ld + i * rnn.dhc + k];
48+ }
49+ } // namespace
50+
3951template <typename gates_t , typename acc_t >
4052void gates_reduction (const rnn_utils::rnn_conf_t &rnn, const gates_t *ws_gates_,
4153 acc_t *diff_bias_) {
4254
43- // The loop body needs to be inlined as some versions of icc have
44- // an issue with lambdas inside omp simd loops
45- #define body_loop (i, k ) \
46- for (int j = 0 ; j < rnn.mb ; j++) \
47- diff_bias_[i * rnn.dhc + k] \
48- += ws_gates_[j * rnn.scratch_gates_ld + i * rnn.dhc + k];
49-
5055 // @todo block k on simd-width to enable vectorization in
5156 // parallel_nd path
5257#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_OMP && _OPENMP >= 201307
5358#pragma omp parallel for simd collapse(2)
5459 for (int i = 0 ; i < rnn.n_gates ; i++)
5560 for (int k = 0 ; k < rnn.dhc ; k++)
56- body_loop (i, k);
61+ body_loop (i, k, ws_gates_, diff_bias_, rnn );
5762#else
58- parallel_nd (rnn.n_gates , rnn.dhc , [&](int i, int k) { body_loop (i, k); });
63+ parallel_nd (rnn.n_gates , rnn.dhc ,
64+ [&](int i, int k) { body_loop (i, k, ws_gates_, diff_bias_, rnn); });
5965#endif
6066
6167#undef body_loop
0 commit comments