@@ -39,23 +39,26 @@ namespace cpu {
3939template <typename src_data_t , typename acc_data_t >
4040void gates_reduction (const rnn_utils::rnn_conf_t &rnn,
4141 const src_data_t *ws_gates_, acc_data_t *diff_bias_) {
42- auto body = [&](int i, int k) {
43- for (int j = 0 ; j < rnn.mb ; j++)
44- diff_bias_[i * rnn.dic + k]
45- += ws_gates_[j * rnn.gates_ws_ld + i * rnn.dic + k];
46- };
4742
48- // @todo block k on simd-width
49- #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_OMP \
50- && _OPENMP >= 201307 /* icc 17.0 has a problem with simd collapse */ \
51- && !((defined __INTEL_COMPILER) && (__INTEL_COMPILER == 1700 ))
43+ // The loop body needs to be inlined as some versions of icc have
44+ // an issue with lambdas inside omp simd loops
45+ #define body_loop (i, k ) \
46+ for (int j = 0 ; j < rnn.mb ; j++) \
47+ diff_bias_[i * rnn.dic + k] \
48+ += ws_gates_[j * rnn.gates_ws_ld + i * rnn.dic + k];
49+
50+ // @todo block k on simd-width to enable vectorization in
51+ // parallel_nd path
52+ #if DNNL_CPU_RUNTIME == DNNL_RUNTIME_OMP && _OPENMP >= 201307
5253#pragma omp parallel for simd collapse(2)
5354 for (int i = 0 ; i < rnn.n_gates ; i++)
5455 for (int k = 0 ; k < rnn.dic ; k++)
55- body (i, k);
56+ body_loop (i, k);
5657#else
57- parallel_nd (rnn.n_gates , rnn.dic , body );
58+ parallel_nd (rnn.n_gates , rnn.dic , [&]( int i, int k) { body_loop (i, k); } );
5859#endif
60+
61+ #undef body_loop
5962}
6063
6164template <prop_kind_t aprop, impl::data_type_t src_type,
0 commit comments