diff --git a/kernels/portable/cpu/op_amax.cpp b/kernels/portable/cpu/op_amax.cpp
index 6030221d883..4ad409d4820 100644
--- a/kernels/portable/cpu/op_amax.cpp
+++ b/kernels/portable/cpu/op_amax.cpp
@@ -46,13 +46,17 @@ Tensor& amax_out(
   ReduceOverDimListPlan plan(in, dim_list);
   ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "amax.out", CTYPE, [&]() {
     CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
-    for (const auto out_ix : c10::irange(out.numel())) {
-      out_data[out_ix] = plan.execute<CTYPE>(
-          [](CTYPE v, CTYPE max_v) {
-            return std::isnan(v) || v > max_v ? v : max_v;
-          },
-          out_ix);
-    }
+    const bool success = parallel_for_each_reduce_over_dim_list_output_index(
+        in, dim_list, out, [&](const auto begin, const auto end) {
+          for (const auto out_ix : c10::irange(begin, end)) {
+            out_data[out_ix] = plan.execute<CTYPE>(
+                [](CTYPE v, CTYPE max_v) {
+                  return std::isnan(v) || v > max_v ? v : max_v;
+                },
+                out_ix);
+          }
+        });
+    ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
   });
 
   return out;
diff --git a/kernels/portable/cpu/op_amin.cpp b/kernels/portable/cpu/op_amin.cpp
index e4979390a5d..396cb6c016d 100644
--- a/kernels/portable/cpu/op_amin.cpp
+++ b/kernels/portable/cpu/op_amin.cpp
@@ -45,13 +45,17 @@ Tensor& amin_out(
   ReduceOverDimListPlan plan(in, dim_list);
   ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "amin.out", CTYPE, [&]() {
     CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
-    for (const auto out_ix : c10::irange(out.numel())) {
-      out_data[out_ix] = plan.execute<CTYPE>(
-          [](CTYPE v, CTYPE min_v) {
-            return std::isnan(v) || v < min_v ? v : min_v;
-          },
-          out_ix);
-    }
+    const bool success = parallel_for_each_reduce_over_dim_list_output_index(
+        in, dim_list, out, [&](const auto begin, const auto end) {
+          for (const auto out_ix : c10::irange(begin, end)) {
+            out_data[out_ix] = plan.execute<CTYPE>(
+                [](CTYPE v, CTYPE min_v) {
+                  return std::isnan(v) || v < min_v ? v : min_v;
+                },
+                out_ix);
+          }
+        });
+    ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
   });
 
   return out;
diff --git a/kernels/portable/cpu/op_any.cpp b/kernels/portable/cpu/op_any.cpp
index a368226db80..ee9e54fc0c3 100644
--- a/kernels/portable/cpu/op_any.cpp
+++ b/kernels/portable/cpu/op_any.cpp
@@ -96,16 +96,21 @@ Tensor& any_dims_out(
               static_cast<CTYPE_OUT>(static_cast<bool>(in_data[out_ix]));
         }
       } else {
-        for (const auto out_ix : c10::irange(out.numel())) {
-          bool any = false;
-          if (in_not_empty) {
-            any = plan->execute<CTYPE_IN, bool>(
-                [](CTYPE_IN v) { return static_cast<bool>(v); },
-                [](bool outv, bool acc) { return acc || outv; },
-                out_ix);
-          }
-          out_data[out_ix] = static_cast<CTYPE_OUT>(any);
-        }
+        const bool success =
+            parallel_for_each_reduce_over_dim_list_output_index(
+                in, dim_list, out, [&](const auto begin, const auto end) {
+                  for (const auto out_ix : c10::irange(begin, end)) {
+                    bool any = false;
+                    if (in_not_empty) {
+                      any = plan->execute<CTYPE_IN, bool>(
+                          [](CTYPE_IN v) { return static_cast<bool>(v); },
+                          [](bool outv, bool acc) { return acc || outv; },
+                          out_ix);
+                    }
+                    out_data[out_ix] = static_cast<CTYPE_OUT>(any);
+                  }
+                });
+        ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
       }
     });
   });
diff --git a/kernels/portable/cpu/op_mean.cpp b/kernels/portable/cpu/op_mean.cpp
index c13e2a09937..423c2564232 100644
--- a/kernels/portable/cpu/op_mean.cpp
+++ b/kernels/portable/cpu/op_mean.cpp
@@ -46,22 +46,27 @@ Tensor& mean_dim_out(
       out);
 
   MapReduceOverDimListPlan plan(in, dim_list);
-  ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, "mean.out", CTYPE_IN, [&] {
-    ET_SWITCH_FLOATHBF16_TYPES(
-        out.scalar_type(), ctx, "mean.out", CTYPE_OUT, [&] {
-          CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
-          const size_t num = get_reduced_dim_product(in, dim_list);
-          for (const auto out_ix : c10::irange(out.numel())) {
-            CTYPE_OUT sum = 0;
-            if (in.numel() > 0) {
-              sum = plan.execute<CTYPE_IN, CTYPE_OUT>(
-                  [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
-                  [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
-                  out_ix);
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "mean.out";
+  ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
+    ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] {
+      CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
+      const size_t num = get_reduced_dim_product(in, dim_list);
+      const bool success = parallel_for_each_reduce_over_dim_list_output_index(
+          in, dim_list, out, [&](const auto begin, const auto end) {
+            for (const auto out_ix : c10::irange(begin, end)) {
+              CTYPE_OUT sum = 0;
+              if (in.numel() > 0) {
+                sum = plan.execute<CTYPE_IN, CTYPE_OUT>(
+                    [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
+                    [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
+                    out_ix);
+              }
+              out_data[out_ix] = sum / static_cast<float>(num);
             }
-            out_data[out_ix] = sum / static_cast<float>(num);
-          }
-        });
+          });
+      ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
+    });
   });
 
   return out;
diff --git a/kernels/portable/cpu/op_sum.cpp b/kernels/portable/cpu/op_sum.cpp
index f58773a6769..550f6b9572f 100644
--- a/kernels/portable/cpu/op_sum.cpp
+++ b/kernels/portable/cpu/op_sum.cpp
@@ -50,23 +50,27 @@ Tensor& sum_dim_out(
   if (in.numel() > 0) {
     plan.emplace(in, dim_list);
   }
-  ET_SWITCH_REALHBBF16_TYPES(
-      in.scalar_type(), ctx, "sum.IntList_out", CTYPE_IN, [&] {
-        ET_SWITCH_REALHBBF16_TYPES(
-            out.scalar_type(), ctx, "sum.IntList_out", CTYPE_OUT, [&] {
-              CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
-              for (const auto out_ix : c10::irange(out.numel())) {
-                CTYPE_OUT sum = 0;
-                if (plan.has_value()) {
-                  sum = plan->execute<CTYPE_IN, CTYPE_OUT>(
-                      [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
-                      [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
-                      out_ix);
-                }
-                out_data[out_ix] = sum;
+  // @lint-ignore CLANGTIDY facebook-hte-CArray
+  static constexpr const char op_name[] = "sum.IntList_out";
+  ET_SWITCH_REALHBBF16_TYPES(in.scalar_type(), ctx, op_name, CTYPE_IN, [&] {
+    ET_SWITCH_REALHBBF16_TYPES(out.scalar_type(), ctx, op_name, CTYPE_OUT, [&] {
+      CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
+      const bool success = parallel_for_each_reduce_over_dim_list_output_index(
+          in, dim_list, out, [&](const auto begin, const auto end) {
+            for (const auto out_ix : c10::irange(begin, end)) {
+              CTYPE_OUT sum = 0;
+              if (plan.has_value()) {
+                sum = plan->execute<CTYPE_IN, CTYPE_OUT>(
+                    [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
+                    [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
+                    out_ix);
               }
-            });
-      });
+              out_data[out_ix] = sum;
+            }
+          });
+      ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
+    });
+  });
 
   return out;
 }
diff --git a/kernels/portable/cpu/op_var.cpp b/kernels/portable/cpu/op_var.cpp
index c5be3fdad62..f09f1d92bc9 100644
--- a/kernels/portable/cpu/op_var.cpp
+++ b/kernels/portable/cpu/op_var.cpp
@@ -21,6 +21,7 @@ namespace {
 
 template <typename CTYPE_IN, typename CTYPE_OUT>
 void compute_variance(
+    KernelRuntimeContext& ctx,
     const Tensor& in,
     Tensor& out,
     optional<ArrayRef<int64_t>> dim_list,
@@ -33,22 +34,26 @@ void compute_variance(
     }
   } else {
     MapReduceOverDimListPlan plan(in, dim_list);
-    for (const auto out_ix : c10::irange(out.numel())) {
-      CTYPE_OUT sum = plan.execute<CTYPE_IN, CTYPE_OUT>(
-          [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
-          [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
-          out_ix);
-      CTYPE_OUT mean = sum / static_cast<CTYPE_OUT>(num);
-      CTYPE_OUT sum2 = plan.execute<CTYPE_IN, CTYPE_OUT>(
-          [mean](CTYPE_IN v) {
-            return (
-                (static_cast<CTYPE_OUT>(v) - mean) *
-                (static_cast<CTYPE_OUT>(v) - mean));
-          },
-          [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
-          out_ix);
-      out_data[out_ix] = sum2 / denominator;
-    }
+    const bool success = parallel_for_each_reduce_over_dim_list_output_index(
+        in, dim_list, out, [&](const auto begin, const auto end) {
+          for (const auto out_ix : c10::irange(begin, end)) {
+            CTYPE_OUT sum = plan.execute<CTYPE_IN, CTYPE_OUT>(
+                [](CTYPE_IN v) { return static_cast<CTYPE_OUT>(v); },
+                [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
+                out_ix);
+            CTYPE_OUT mean = sum / static_cast<CTYPE_OUT>(num);
+            CTYPE_OUT sum2 = plan.execute<CTYPE_IN, CTYPE_OUT>(
+                [mean](CTYPE_IN v) {
+                  return (
+                      (static_cast<CTYPE_OUT>(v) - mean) *
+                      (static_cast<CTYPE_OUT>(v) - mean));
+                },
+                [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; },
+                out_ix);
+            out_data[out_ix] = sum2 / denominator;
+          }
+        });
+    ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
   }
 }
 
@@ -90,7 +95,7 @@ Tensor& var_out(
 
   ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] {
     ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] {
-      compute_variance<CTYPE_IN, CTYPE_OUT>(in, out, dim_list, num, denom);
+      compute_variance<CTYPE_IN, CTYPE_OUT>(ctx, in, out, dim_list, num, denom);
     });
   });
 
@@ -135,7 +140,7 @@ Tensor& var_correction_out(
 
   ET_SWITCH_FLOATHBF16_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] {
     ET_SWITCH_FLOATHBF16_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] {
-      compute_variance<CTYPE_IN, CTYPE_OUT>(in, out, dim_list, num, denom);
+      compute_variance<CTYPE_IN, CTYPE_OUT>(ctx, in, out, dim_list, num, denom);
     });
   });
 
diff --git a/kernels/portable/cpu/util/reduce_util.h b/kernels/portable/cpu/util/reduce_util.h
index 1c6a6de4101..ff7589af4f5 100644
--- a/kernels/portable/cpu/util/reduce_util.h
+++ b/kernels/portable/cpu/util/reduce_util.h
@@ -823,11 +823,17 @@ template <typename Func>
     executorch::aten::optional<int64_t> dim,
     const Tensor& out,
     const Func& func) {
+#ifdef ET_USE_THREADPOOL
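+  // Each output index reduces over reduction_size input elements, so shrink
+  // the grain size to keep about GRAIN_SIZE input elements per parallel task.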
   const ssize_t reduction_size = get_reduced_dim_product(in, dim);
   const auto grain_size = std::max(
       static_cast<ssize_t>(1),
       static_cast<ssize_t>(executorch::extension::internal::GRAIN_SIZE) /
           reduction_size);
+#else // ET_USE_THREADPOOL
+  const auto grain_size = 1;
+#endif // ET_USE_THREADPOOL
   return executorch::extension::parallel_for(0, out.numel(), grain_size, func);
 }
 
@@ -842,11 +848,17 @@ template <typename Func>
     optional<ArrayRef<int64_t>> dim_list,
     const Tensor& out,
     const Func& func) {
+#ifdef ET_USE_THREADPOOL
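+  // Each output index reduces over reduction_size input elements, so shrink
+  // the grain size to keep about GRAIN_SIZE input elements per parallel task.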
   const ssize_t reduction_size = get_reduced_dim_product(in, dim_list);
   const auto grain_size = std::max(
       static_cast<ssize_t>(1),
       static_cast<ssize_t>(executorch::extension::internal::GRAIN_SIZE) /
           reduction_size);
+#else // ET_USE_THREADPOOL
+  const auto grain_size = 1;
+#endif // ET_USE_THREADPOOL
   return executorch::extension::parallel_for(0, out.numel(), grain_size, func);
 }