diff --git a/kernels/portable/cpu/op_amax.cpp b/kernels/portable/cpu/op_amax.cpp
index 6030221d883..4ad409d4820 100644
--- a/kernels/portable/cpu/op_amax.cpp
+++ b/kernels/portable/cpu/op_amax.cpp
@@ -46,13 +46,17 @@ Tensor& amax_out(
   ReduceOverDimListPlan plan(in, dim_list);
   ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "amax.out", CTYPE, [&]() {
     CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
-    for (const auto out_ix : c10::irange(out.numel())) {
-      out_data[out_ix] = plan.execute<CTYPE>(
-          [](CTYPE v, CTYPE max_v) {
-            return std::isnan(v) || v > max_v ? v : max_v;
-          },
-          out_ix);
-    }
+    const bool success = parallel_for_each_reduce_over_dim_list_output_index(
+        in, dim_list, out, [&](const auto begin, const auto end) {
+          for (const auto out_ix : c10::irange(begin, end)) {
+            out_data[out_ix] = plan.execute<CTYPE>(
+                [](CTYPE v, CTYPE max_v) {
+                  return std::isnan(v) || v > max_v ? v : max_v;
+                },
+                out_ix);
+          }
+        });
+    ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
   });
 
   return out;
diff --git a/kernels/portable/cpu/op_amin.cpp b/kernels/portable/cpu/op_amin.cpp
index e4979390a5d..396cb6c016d 100644
--- a/kernels/portable/cpu/op_amin.cpp
+++ b/kernels/portable/cpu/op_amin.cpp
@@ -45,13 +45,17 @@ Tensor& amin_out(
   ReduceOverDimListPlan plan(in, dim_list);
   ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "amin.out", CTYPE, [&]() {
     CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
-    for (const auto out_ix : c10::irange(out.numel())) {
-      out_data[out_ix] = plan.execute<CTYPE>(
-          [](CTYPE v, CTYPE min_v) {
-            return std::isnan(v) || v < min_v ? v : min_v;
-          },
-          out_ix);
-    }
+    const bool success = parallel_for_each_reduce_over_dim_list_output_index(
+        in, dim_list, out, [&](const auto begin, const auto end) {
+          for (const auto out_ix : c10::irange(begin, end)) {
+            out_data[out_ix] = plan.execute<CTYPE>(
+                [](CTYPE v, CTYPE min_v) {
+                  return std::isnan(v) || v < min_v ? v : min_v;
+                },
+                out_ix);
+          }
+        });
+    ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
   });
 
   return out;
diff --git a/kernels/portable/cpu/op_any.cpp b/kernels/portable/cpu/op_any.cpp
index a368226db80..ee9e54fc0c3 100644
--- a/kernels/portable/cpu/op_any.cpp
+++ b/kernels/portable/cpu/op_any.cpp
@@ -96,16 +96,21 @@ Tensor& any_dims_out(
               static_cast<CTYPE_OUT>(static_cast<bool>(in_data[out_ix]));
         }
       } else {
-        for (const auto out_ix : c10::irange(out.numel())) {
-          bool any = false;
-          if (in_not_empty) {
-            any = plan->execute<CTYPE_IN, bool>(
-                [](CTYPE_IN v) { return static_cast<bool>(v); },
-                [](bool outv, bool acc) { return acc || outv; },
-                out_ix);
-          }
-          out_data[out_ix] = static_cast<CTYPE_OUT>(any);
-        }
+        const bool success =
+            parallel_for_each_reduce_over_dim_list_output_index(
+                in, dim_list, out, [&](const auto begin, const auto end) {
+                  for (const auto out_ix : c10::irange(begin, end)) {
+                    bool any = false;
+                    if (in_not_empty) {
+                      any = plan->execute<CTYPE_IN, bool>(
+                          [](CTYPE_IN v) { return static_cast<bool>(v); },
+                          [](bool outv, bool acc) { return acc || outv; },
+                          out_ix);
+                    }
+                    out_data[out_ix] = static_cast<CTYPE_OUT>(any);
+                  }
+                });
+        ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
       }
     });
   });
diff --git a/kernels/portable/cpu/op_mean.cpp b/kernels/portable/cpu/op_mean.cpp
index c13e2a09937..423c2564232 100644
--- a/kernels/portable/cpu/op_mean.cpp
+++ b/kernels/portable/cpu/op_mean.cpp
@@ -46,22 +46,27 @@ Tensor& mean_dim_out(
       out);
 
   MapReduceOverDimListPlan plan(in, dim_list);
-  ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "mean.out", CTYPE_IN, [&] {
-    ET_SWITCH_FLOATHBF16_TYPES(
-        out.scalar_type(), ctx, "mean.out", CTYPE_OUT, [&] {
-          CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
-          const size_t num = get_reduced_dim_product(in, dim_list);
-          for (const auto out_ix : c10::irange(out.numel())) {
-            CTYPE_OUT sum = 0;
-            if (in.numel() > 0) {
-              sum = plan.execute<CTYPE_IN, CTYPE_OUT>(
-                  [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
-                  [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
-                  out_ix);
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "mean.out";
+  ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
+    ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] {
+      CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
+      const size_t num = get_reduced_dim_product(in, dim_list);
+      const bool success = parallel_for_each_reduce_over_dim_list_output_index(
+          in, dim_list, out, [&](const auto begin, const auto end) {
+            for (const auto out_ix : c10::irange(begin, end)) {
+              CTYPE_OUT sum = 0;
+              if (in.numel() > 0) {
+                sum = plan.execute<CTYPE_IN, CTYPE_OUT>(
+                    [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
+                    [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
+                    out_ix);
+              }
+              out_data[out_ix] = sum / static_cast<float>(num);
             }
-            out_data[out_ix] = sum / static_cast<float>(num);
-          }
-        });
+          });
+      ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
+    });
   });
 
   return out;
diff --git a/kernels/portable/cpu/op_sum.cpp b/kernels/portable/cpu/op_sum.cpp
index f58773a6769..550f6b9572f 100644
--- a/kernels/portable/cpu/op_sum.cpp
+++ b/kernels/portable/cpu/op_sum.cpp
@@ -50,23 +50,27 @@ Tensor& sum_dim_out(
   if (in.numel() > 0) {
     plan.emplace(in, dim_list);
   }
-  ET_SWITCH_REALHBBF16_TYPES(
-      in.scalar_type(), ctx, "sum.IntList_out", CTYPE_IN, [&] {
-        ET_SWITCH_REALHBBF16_TYPES(
"sum.IntList_out", CTYPE_OUT, [&] { - CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>(); - for (const auto out_ix : c10::irange(out.numel())) { - CTYPE_OUT sum = 0; - if (plan.has_value()) { - sum = plan->execute<CTYPE_IN, CTYPE_OUT>( - [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - out_ix); - } - out_data[out_ix] = sum; + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "sum.IntList_out"; + ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] { + ET_SWITCH_REALHBBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] { + CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>(); + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT sum = 0; + if (plan.has_value()) { + sum = plan->execute<CTYPE_IN, CTYPE_OUT>( + [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + out_ix); } - }); - }); + out_data[out_ix] = sum; + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); + }); + }); return out; } diff --git a/kernels/portable/cpu/op_var.cpp b/kernels/portable/cpu/op_var.cpp index c5be3fdad62..f09f1d92bc9 100644 --- a/kernels/portable/cpu/op_var.cpp +++ b/kernels/portable/cpu/op_var.cpp @@ -21,6 +21,7 @@ namespace { template <typename CTYPE_IN, typename CTYPE_OUT> void compute_variance( + KernelRuntimeContext& ctx, const Tensor& in, Tensor& out, optional<ArrayRef<int64_t>> dim_list, @@ -33,22 +34,26 @@ void compute_variance( } } else { MapReduceOverDimListPlan plan(in, dim_list); - for (const auto out_ix : c10::irange(out.numel())) { - CTYPE_OUT sum = plan.execute<CTYPE_IN, CTYPE_OUT>( - [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - out_ix); - CTYPE_OUT mean = sum / static_cast<CTYPE_OUT>(num); - CTYPE_OUT sum2 = plan.execute<CTYPE_IN, CTYPE_OUT>( - [mean](CTYPE_IN v) { - return ( - (static_cast<CTYPE_OUT>(v) - mean) * - (static_cast<CTYPE_OUT>(v) - mean)); - }, - [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, - out_ix); - out_data[out_ix] = sum2 / denominator; - } + const bool success = parallel_for_each_reduce_over_dim_list_output_index( + in, dim_list, out, [&](const auto begin, const auto end) { + for (const auto out_ix : c10::irange(begin, end)) { + CTYPE_OUT sum = plan.execute<CTYPE_IN, CTYPE_OUT>( + [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + out_ix); + CTYPE_OUT mean = sum / static_cast<CTYPE_OUT>(num); + CTYPE_OUT sum2 = plan.execute<CTYPE_IN, CTYPE_OUT>( + [mean](CTYPE_IN v) { + return ( + (static_cast<CTYPE_OUT>(v) - mean) * + (static_cast<CTYPE_OUT>(v) - mean)); + }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + out_ix); + out_data[out_ix] = sum2 / denominator; + } + }); + ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed"); } } @@ -90,7 +95,7 @@ Tensor& var_out( ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] { ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] { - compute_variance<CTYPE_IN, CTYPE_OUT>(in, out, dim_list, num, denom); + compute_variance<CTYPE_IN, CTYPE_OUT>(ctx, in, out, dim_list, num, denom); }); }); @@ -135,7 +140,7 @@ Tensor& var_correction_out( 
   ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] {
     ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] {
-      compute_variance<CTYPE_IN, CTYPE_OUT>(in, out, dim_list, num, denom);
+      compute_variance<CTYPE_IN, CTYPE_OUT>(ctx, in, out, dim_list, num, denom);
     });
   });
 
diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h
index 1c6a6de4101..ff7589af4f5 100644
--- a/kernels/portable/cpu/util/reduce_util.h
+++ b/kernels/portable/cpu/util/reduce_util.h
@@ -823,11 +823,15 @@ template <typename Func>
     executorch::aten::optional<int64_t> dim,
     const Tensor& out,
     const Func& func) {
+#ifdef ET_USE_THREADPOOL
   const ssize_t reduction_size = get_reduced_dim_product(in, dim);
   const auto grain_size = std::max(
       static_cast<ssize_t>(1),
       static_cast<ssize_t>(executorch::extension::internal::GRAIN_SIZE) /
           reduction_size);
+#else // ET_USE_THREADPOOL
+  const auto grain_size = 1;
+#endif // ET_USE_THREADPOOL
   return executorch::extension::parallel_for(0, out.numel(), grain_size, func);
 }
 
@@ -842,11 +846,15 @@ template <typename Func>
     optional<ArrayRef<int64_t>> dim_list,
     const Tensor& out,
     const Func& func) {
+#ifdef ET_USE_THREADPOOL
   const ssize_t reduction_size = get_reduced_dim_product(in, dim_list);
   const auto grain_size = std::max(
       static_cast<ssize_t>(1),
       static_cast<ssize_t>(executorch::extension::internal::GRAIN_SIZE) /
           reduction_size);
+#else // ET_USE_THREADPOOL
+  const auto grain_size = 1;
+#endif // ET_USE_THREADPOOL
   return executorch::extension::parallel_for(0, out.numel(), grain_size, func);
 }
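
Note: every kernel above follows the same shape. The reduction plan is built once, parallel_for_each_reduce_over_dim_list_output_index hands each worker a [begin, end) slice of output indices, and each output slot is written by exactly one slice, so no synchronization is needed; the grain size is GRAIN_SIZE divided by the per-output reduction size so larger reductions get smaller chunks. The standalone sketch below only illustrates that chunking pattern in plain C++. The serial parallel_for stand-in, the toy 4x3 input, and the fixed grain size of 2 are assumptions for illustration, not part of the ExecuTorch API.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative stand-in for executorch::extension::parallel_for: invokes the
// functor on [i, i + grain_size) chunks of the index range, serially here.
template <typename Func>
bool parallel_for(int64_t begin, int64_t end, int64_t grain_size, const Func& f) {
  for (int64_t i = begin; i < end; i += grain_size) {
    f(i, std::min(i + grain_size, end));
  }
  return true;
}

int main() {
  // Toy data: a flattened 4x3 input reduced over its last dim, so there are
  // 4 output slots and each slot reduces over 3 contiguous input values.
  const std::vector<float> in = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
  const int64_t reduction_size = 3;
  std::vector<float> out(4, 0.0f);

  // Upstream this would be GRAIN_SIZE / reduction_size (at least 1); a fixed
  // value is used here purely for the example.
  const int64_t grain_size = 2;

  // Same shape as the kernels in the diff: each chunk of output indices is
  // handled independently, so no two workers ever write the same output slot.
  const bool success = parallel_for(
      0, static_cast<int64_t>(out.size()), grain_size,
      [&](int64_t begin, int64_t end) {
        for (int64_t out_ix = begin; out_ix < end; ++out_ix) {
          float max_v = in[out_ix * reduction_size];  // amax over the reduced dim
          for (int64_t j = 1; j < reduction_size; ++j) {
            max_v = std::max(max_v, in[out_ix * reduction_size + j]);
          }
          out[out_ix] = max_v;
        }
      });

  std::cout << (success ? "ok:" : "failed:");
  for (float v : out) {
    std::cout << ' ' << v;  // expect: 3 6 9 12
  }
  std::cout << '\n';
  return 0;
}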