use numel() when appropriate in get_reduced_dim_product #9142

Merged Mar 12, 2025

Changes from all commits: 111 commits by swolchok, Mar 4 to Mar 12, 2025, all titled "Update".

4 changes: 4 additions & 0 deletions build/cmake_deps.toml
@@ -89,6 +89,7 @@ deps = [
"executorch",
"executorch_core",
"extension_threadpool",
"optimized_cpublas",
"portable_kernels",
]

@@ -146,6 +147,7 @@ deps = [
"executorch_core",
"executorch",
"extension_threadpool",
"optimized_cpublas",
"portable_kernels",
]
# ---------------------------------- core end ----------------------------------
@@ -413,6 +415,7 @@ excludes = [
deps = [
"executorch",
"executorch_core",
"optimized_cpublas",
"optimized_kernels",
"extension_threadpool",
"reduce_util",
@@ -452,6 +455,7 @@ deps = [
"extension_data_loader",
"extension_module",
"extension_threadpool",
"optimized_cpublas",
"portable_kernels",
"quantized_kernels",
"xnnpack_backend",
22 changes: 21 additions & 1 deletion build/executorch-config.cmake
@@ -89,6 +89,7 @@ set(lib_list
pthreadpool
vulkan_backend
optimized_kernels
optimized_portable_kernels
cpublas
eigen_blas
optimized_ops_lib
@@ -132,7 +133,26 @@ endforeach()
# target_compile_options/target_compile_definitions for everything.
if(TARGET cpublas)
set_target_properties(
cpublas PROPERTIES INTERFACE_LINK_LIBRARIES extension_threadpool
cpublas PROPERTIES INTERFACE_LINK_LIBRARIES
"extension_threadpool;eigen_blas"
)
endif()
if(TARGET optimized_kernels)
set_target_properties(
optimized_kernels PROPERTIES INTERFACE_LINK_LIBRARIES
"executorch_core;cpublas;extension_threadpool"
)
endif()
if(TARGET optimized_native_cpu_ops_lib)
if(TARGET optimized_portable_kernels)
set(_maybe_optimized_portable_kernels_lib optimized_portable_kernels)
else()
set(_maybe_optimized_portable_kernels_lib portable_kernels)
endif()
set_target_properties(
optimized_native_cpu_ops_lib
PROPERTIES INTERFACE_LINK_LIBRARIES
"optimized_kernels;${_maybe_optimized_portable_kernels_lib}"
)
endif()
if(TARGET extension_threadpool)
7 changes: 6 additions & 1 deletion configurations/CMakeLists.txt
@@ -47,12 +47,17 @@ if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
message("Generated files ${gen_command_sources}")

# optimized_native_cpu_ops_lib: Register optimized op kernels into the runtime
if(TARGET optimized_portable_kernels)
set(_optimized_native_cpu_ops_lib_portable_kernels_lib optimized_portable_kernels)
else()
set(_optimized_native_cpu_ops_lib_portable_kernels_lib portable_kernels)
endif()
gen_operators_lib(
LIB_NAME
"optimized_native_cpu_ops_lib"
KERNEL_LIBS
portable_kernels
optimized_kernels
${_optimized_native_cpu_ops_lib_portable_kernels_lib}
DEPS
executorch
)
4 changes: 0 additions & 4 deletions extension/android/CMakeLists.txt
@@ -84,10 +84,6 @@ if(TARGET optimized_native_cpu_ops_lib)
APPEND
link_libraries
optimized_native_cpu_ops_lib
optimized_kernels
portable_kernels
cpublas
eigen_blas
)
target_link_options_shared_lib(optimized_native_cpu_ops_lib)
else()
16 changes: 16 additions & 0 deletions kernels/portable/CMakeLists.txt
@@ -63,6 +63,22 @@ gen_operators_lib(
LIB_NAME "portable_ops_lib" KERNEL_LIBS portable_kernels DEPS executorch
)

# Portable kernels support optional parallelization (and, in the
# future, perhaps other performance features). If support is present,
# produce an optimized version.
set(BUILD_OPTIMIZED_PORTABLE_KERNELS EXECUTORCH_BUILD_PTHREADPOOL)

if(BUILD_OPTIMIZED_PORTABLE_KERNELS)
add_library(optimized_portable_kernels ${_portable_kernels__srcs})
target_link_libraries(optimized_portable_kernels PRIVATE executorch)
target_link_libraries(optimized_portable_kernels PUBLIC extension_threadpool)
target_compile_options(optimized_portable_kernels PUBLIC ${_common_compile_options})
install(
TARGETS optimized_portable_kernels
DESTINATION lib
)
endif()

install(
TARGETS portable_kernels portable_ops_lib
DESTINATION lib
62 changes: 38 additions & 24 deletions kernels/portable/cpu/op_argmin.cpp
@@ -12,6 +12,7 @@

#include <executorch/kernels/portable/cpu/util/reduce_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/kernel/thread_parallel_interface.h>
#include <executorch/runtime/platform/assert.h>

namespace torch {
@@ -47,30 +48,43 @@ Tensor& argmin_out(
ET_SWITCH_REALHBF16_TYPES(in.scalar_type(), ctx, "argmin.out", CTYPE, [&] {
long* out_data = out.mutable_data_ptr<long>();

for (const auto out_ix : c10::irange(out.numel())) {
std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
[](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
// the below condition as written is equivalent to !isnan(acc_val) &&
// (isnan(v) || v < acc_val). cases:
// - if neither acc_val nor v is NaN, !(v >= acc_val) is
//   trivially equivalent to v < acc_val.
// - if acc_val is NaN, the whole thing is trivially false.
// - if acc_val is not NaN and v is NaN, then v >= acc_val
//   is false because all comparisons involving NaN are
//   false, so the result is true. The result is trivially
//   true for the above condition that uses isnan(v) as
//   well.
if (!std::isnan(acc_val) && !(v >= acc_val)) {
acc_val = v;
acc_ix = ix;
}
return std::tuple<CTYPE, long>{acc_val, acc_ix};
},
in,
dim,
out_ix);
out_data[out_ix] = std::get<1>(acc);
}
// REVIEW: this is the parallelization strategy ATen uses
// specifically when the reduction is along the last dimension and
// that dimension is contiguous. Is there any particular reason we
// shouldn't just always use this strategy since we aren't
// otherwise capable of parallelizing reductions?
const int64_t reduction_size = get_reduced_dim_product(in, dim);
const auto grain_size = std::max(
static_cast<int64_t>(1),
executorch::extension::internal::GRAIN_SIZE / reduction_size);
const bool success = executorch::extension::parallel_for(
0, out.numel(), grain_size, [&](const auto begin, const auto end) {
for (const auto out_ix : c10::irange(begin, end)) {
std::tuple<CTYPE, long> acc = reduce_over_dim<CTYPE>(
[](CTYPE v, long ix, CTYPE acc_val, long acc_ix) {
// the below condition as written is equivalent to
// !isnan(acc_val) && (isnan(v) || v < acc_val). cases:
// - if neither acc_val nor v is NaN, !(v >= acc_val) is
//   trivially equivalent to v < acc_val.
// - if acc_val is NaN, the whole thing is trivially false.
// - if acc_val is not NaN and v is NaN, then v >= acc_val
//   is false because all comparisons involving NaN are
//   false, so the result is true. The result is trivially
//   true for the above condition that uses isnan(v) as
//   well.
if (!std::isnan(acc_val) && !(v >= acc_val)) {
acc_val = v;
acc_ix = ix;
}
return std::tuple<CTYPE, long>{acc_val, acc_ix};
},
in,
dim,
out_ix);
out_data[out_ix] = std::get<1>(acc);
}
});
ET_KERNEL_CHECK_MSG(ctx, success, Internal, , "parallel_for failed");
});

return out;
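
As a reading aid only, here is a minimal, self-contained C++ sketch of the two ideas in the new kernel body above. It is not the ExecuTorch code; the names argmin_nan_aware and grain_size_for, plus the sample inputs and the 32768 grain target, are made up for illustration. It shows why !std::isnan(acc_val) && !(v >= acc_val) yields NaN-propagating argmin semantics, and how dividing a grain target by the reduction size keeps each parallel task at roughly the same amount of work.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// NaN-aware argmin over a flat vector. !(v >= acc_val) is true when
// v < acc_val or v is NaN; !std::isnan(acc_val) makes a NaN accumulator
// sticky, so the first NaN encountered wins, matching argmin semantics.
long argmin_nan_aware(const std::vector<float>& vals) {
  if (vals.empty()) {
    return -1;
  }
  float acc_val = vals[0];
  long acc_ix = 0;
  for (long ix = 1; ix < static_cast<long>(vals.size()); ++ix) {
    const float v = vals[ix];
    if (!std::isnan(acc_val) && !(v >= acc_val)) {
      acc_val = v;
      acc_ix = ix;
    }
  }
  return acc_ix;
}

// Chunk size for splitting the output range: aim for roughly grain_target
// scalar reads per task, but never less than one output element per task.
int64_t grain_size_for(int64_t grain_target, int64_t reduction_size) {
  return std::max(static_cast<int64_t>(1), grain_target / reduction_size);
}

int main() {
  std::printf("%ld\n", argmin_nan_aware({3.f, 1.f, 2.f}));  // 1
  std::printf("%ld\n", argmin_nan_aware({3.f, NAN, 2.f}));  // 1 (NaN becomes the min)
  std::printf("%ld\n", argmin_nan_aware({NAN, 1.f, 2.f}));  // 0 (NaN stays the min)
  // Example grain target of 32768 with a reduction of length 1000: 32.
  std::printf("%lld\n", static_cast<long long>(grain_size_for(32768, 1000)));
  return 0;
}
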
16 changes: 4 additions & 12 deletions kernels/portable/cpu/util/reduce_util.cpp
@@ -83,12 +83,8 @@ size_t get_reduced_dim_product(
if (in.dim() == 0) {
return 1;
}
size_t dim_product = 1;
if (!dim.has_value()) {
for (size_t i = 0; i < static_cast<size_t>(in.dim()); ++i) {
dim_product *= in.size(i);
}
return dim_product;
return in.numel();
}
const size_t d = _normalize_non_neg_d(dim.value(), in.dim());
return in.size(d);
@@ -104,16 +100,12 @@ size_t get_reduced_dim_product(
if (in.dim() == 0) {
return 1;
}
size_t dim_product = 1;
const size_t in_dim = in.dim();
if (!dim_list.has_value() || dim_list.value().size() == 0) {
for (size_t i = 0; i < static_cast<size_t>(in.dim()); ++i) {
dim_product *= in.size(i);
}
return dim_product;
return in.numel();
}
size_t dim_product = 1;
for (const auto& d : dim_list.value()) {
const size_t non_neg_d = _normalize_non_neg_d(d, in_dim);
const size_t non_neg_d = _normalize_non_neg_d(d, in.dim());
dim_product *= in.size(non_neg_d);
}
return dim_product;
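
The change above leans on the invariant that numel() is the product of all dimension sizes, so when no dim (or an empty dim list) is given, the old hand-rolled loop was recomputing numel(). Below is a self-contained sketch of that equivalence under stated assumptions; ShapeOnly and the helper names are stand-ins for illustration, not ExecuTorch types.

#include <cstddef>
#include <cstdio>
#include <vector>

// Stand-in type: just sizes plus a numel() that multiplies them together,
// mirroring the invariant the PR relies on.
struct ShapeOnly {
  std::vector<size_t> sizes;
  size_t numel() const {
    size_t n = 1;
    for (const size_t s : sizes) {
      n *= s;
    }
    return n;
  }
};

// Before: explicit product over every dimension when no dim was given.
size_t reduced_dim_product_loop(const ShapeOnly& t) {
  size_t dim_product = 1;
  for (size_t i = 0; i < t.sizes.size(); ++i) {
    dim_product *= t.sizes[i];
  }
  return dim_product;
}

// After: defer to numel(), which already carries that product.
size_t reduced_dim_product_numel(const ShapeOnly& t) {
  return t.numel();
}

int main() {
  const ShapeOnly t{{2, 3, 4}};
  // Both print 24: the loop and numel() agree for any shape.
  std::printf("%zu %zu\n", reduced_dim_product_loop(t), reduced_dim_product_numel(t));
  return 0;
}
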
36 changes: 25 additions & 11 deletions kernels/test/CMakeLists.txt
@@ -23,11 +23,11 @@ foreach(kernel ${_kernels})
"${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/test"
)
set(_wrapper_path "${_wrapper_dir}/FunctionHeaderWrapper.h")
set(_functions_include "#include <executorch/kernels/${kernel}/Functions.h>")
add_custom_command(
OUTPUT "${_wrapper_path}"
COMMAND mkdir -p ${_wrapper_dir}
COMMAND echo "#include <executorch/kernels/${kernel}/Functions.h>" >
"${_wrapper_path}"
COMMAND echo ${_functions_include} > "${_wrapper_path}"
DEPENDS
"${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/Functions.h"
"${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/NativeFunctions.h"
@@ -53,7 +53,17 @@ foreach(kernel ${_kernels})
COMMENT "Generating ${_wrapper_dir}/supported_features.cpp and header"
VERBATIM
)

if(${kernel} STREQUAL "optimized")
set(_kernel_ops_lib "optimized_native_cpu_ops_lib")
set(_kernel_ops_lib_path
"${CMAKE_CURRENT_BINARY_DIR}/../../configurations/optimized_native_cpu_ops_lib"
)
else()
set(_kernel_ops_lib "${kernel}_ops_lib")
set(_kernel_ops_lib_path
"${CMAKE_CURRENT_BINARY_DIR}/../../kernels/${kernel}/${kernel}_ops_lib"
)
endif()
add_custom_command(
OUTPUT
"${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/Functions.h"
@@ -63,10 +73,9 @@ foreach(kernel ${_kernels})
mkdir -p
"${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/"
COMMAND
cp
"${CMAKE_CURRENT_BINARY_DIR}/../../kernels/${kernel}/${kernel}_ops_lib/*.h"
cp "${_kernel_ops_lib_path}/*.h"
"${CMAKE_CURRENT_BINARY_DIR}/include/${kernel}/executorch/kernels/${kernel}/"
DEPENDS "${kernel}_ops_lib"
DEPENDS ${_kernel_ops_lib}
)
endforeach()

@@ -280,23 +289,28 @@ set(_optimized_kernels_test_sources
${CMAKE_CURRENT_BINARY_DIR}/include/optimized/executorch/kernels/test/supported_features.cpp
)

if(TARGET optimized_portable_kernels)
list(APPEND _optimized_kernels_test_sources ${all_test_sources})
list(REMOVE_DUPLICATES _optimized_kernels_test_sources)
endif()

et_cxx_test(
optimized_kernels_test
SOURCES
${_optimized_kernels_test_sources}
EXTRA_LIBS
cpuinfo
extension_threadpool
optimized_kernels
optimized_ops_lib
portable_kernels
optimized_native_cpu_ops_lib
pthreadpool
eigen_blas
)
add_dependencies(optimized_kernels_test generate_wrapper)
target_include_directories(
optimized_kernels_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/include/optimized"
"${CMAKE_INSTALL_PREFIX}/include"
optimized_kernels_test
PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/include/optimized"
"${CMAKE_CURRENT_BINARY_DIR}/include/portable"
"${CMAKE_INSTALL_PREFIX}/include"
)

if(TARGET quantized_kernels)
@@ -284,6 +284,7 @@ ATEN_OPS = (
name = "op_argmin",
deps = [
"//executorch/kernels/portable/cpu/util:reduce_util",
"//executorch/runtime/kernel:thread_parallel_interface",
],
),
op_target(