diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt
index 8b3bf3d91c1..ed0128f93f1 100644
--- a/backends/xnnpack/CMakeLists.txt
+++ b/backends/xnnpack/CMakeLists.txt
@@ -37,6 +37,19 @@ option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE
 # Keeping this OFF by default due to regressions in decode and model load with
 # kleidi kernels
 option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI "Enable Arm Kleidi kernels" OFF)
+
+# Turning this ON caches weights across partitions and methods. If weights
+# are shared across methods/partitions, this can reduce load time and
+# memory usage.
+
+# Keeping this OFF maintains the existing behavior. Turning this ON
+# serializes execution and initialization of delegates; to be revisited.
+option(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE
+       "Enable weights cache to cache and manage all packed weights" OFF)
+
+if(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE)
+  add_definitions(-DENABLE_XNNPACK_WEIGHTS_CACHE)
+endif()
 if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE)
   add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE)
 endif()
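
Note: with the option above, the cache is enabled at configure time via `cmake -DEXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE=ON`; Buck builds get the same `ENABLE_XNNPACK_WEIGHTS_CACHE` define through the `executorch.xnnpack_weights_cache` config added in `targets.bzl` at the end of this diff.
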
diff --git a/backends/xnnpack/runtime/XNNCompiler.cpp b/backends/xnnpack/runtime/XNNCompiler.cpp
index 6a93ab73a2e..c0204831c07 100644
--- a/backends/xnnpack/runtime/XNNCompiler.cpp
+++ b/backends/xnnpack/runtime/XNNCompiler.cpp
@@ -11,7 +11,9 @@
 #include <executorch/backends/xnnpack/serialization/schema_generated.h>
 #include <executorch/extension/threadpool/threadpool.h>
 #include <executorch/runtime/executor/pte_data_map.h>
+#include <string>
 #include <unordered_map>
+#include <vector>
 
 #pragma clang diagnostic ignored "-Wmissing-prototypes"
 #pragma clang diagnostic ignored "-Wglobal-constructors"
@@ -167,7 +169,8 @@ const uint8_t* getConstantDataPtr(
     GraphPtr flatbuffer_graph,
     const uint8_t* constant_data_ptr,
     const NamedDataMap* named_data_map,
-    std::vector<FreeableBuffer>& loaded_buffers_from_map) {
+    std::vector<FreeableBuffer>& freeable_buffers,
+    XNNWeightsCache* weights_cache) {
   auto buffer_idx = tensor_value->constant_buffer_idx();
   if (buffer_idx) {
     if (!constant_data_ptr) {
@@ -187,6 +190,15 @@ const uint8_t* getConstantDataPtr(
       return constant_data_ptr + offset;
     } else {
       const std::string& data_name = constant_data_offset->named_key()->str();
+#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
+      Result<const uint8_t*> data_ptr =
+          weights_cache->load_unpacked_data(data_name);
+      if (!data_ptr.ok()) {
+        ET_LOG(Error, "Failed to load weights from cache");
+        return nullptr;
+      }
+      return data_ptr.get();
+#else
       Result<FreeableBuffer> buffer =
           named_data_map->get_data(data_name.c_str());
       if (!buffer.ok()) {
@@ -198,8 +210,9 @@ const uint8_t* getConstantDataPtr(
       }
       const uint8_t* data_ptr =
           static_cast<const uint8_t*>(buffer.get().data());
-      loaded_buffers_from_map.push_back(std::move(buffer.get()));
+      freeable_buffers.push_back(std::move(buffer.get()));
       return data_ptr;
+#endif
     }
   }
 }
@@ -222,7 +235,8 @@ Error defineTensor(
     std::vector<uint32_t>& output_ids,
     CompileAllocator& allocator,
     const NamedDataMap* named_data_map,
-    std::vector<FreeableBuffer>& loaded_buffers_from_map) {
+    std::vector<FreeableBuffer>& freeable_buffers,
+    XNNWeightsCache* weights_cache) {
   const fb_xnnpack::XNNTensorValue* tensor_value = nullptr;
   const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr;
 
@@ -264,7 +278,8 @@ Error defineTensor(
         flatbuffer_graph,
         constant_data_ptr,
         named_data_map,
-        loaded_buffers_from_map);
+        freeable_buffers,
+        weights_cache);
 
     xnn_status status;
     // The type we might have to convert to
@@ -1999,9 +2014,9 @@ ET_NODISCARD Error XNNCompiler::compileModel(
     const void* buffer_pointer,
     size_t num_bytes,
     XNNExecutor* executor,
-    MemoryAllocator* runtime_allocator,
-    const NamedDataMap* named_data_map,
-    xnn_workspace_t workspace) {
+    XNNWeightsCache* weights_cache,
+    xnn_workspace_t workspace,
+    const NamedDataMap* named_data_map) {
   Result<XNNHeader> header = XNNHeader::Parse(buffer_pointer, num_bytes);
   const uint8_t* flatbuffer_data = nullptr;
   const uint8_t* constant_data = nullptr;
@@ -2065,11 +2080,14 @@
   // Invalid ids do not need to be remapped
   remapped_ids.emplace(XNN_INVALID_VALUE_ID, XNN_INVALID_VALUE_ID);
 
+  // If the weights cache is not enabled, we hold onto all the unpacked
+  // buffers here and free them at the end.
+  std::vector<FreeableBuffer> unpacked_buffers;
+
   // External Ids for inputs and outputs
   std::vector<uint32_t> input_ids;
   std::vector<uint32_t> output_ids;
   Error err = Error::Ok;
-  std::vector<FreeableBuffer> loaded_buffers_from_map;
   for (auto value : *flatbuffer_graph->xvalues()) {
     err = defineTensor(
         subgraph.get(),
@@ -2081,7 +2099,8 @@
         output_ids,
         compile_allocator,
         named_data_map,
-        loaded_buffers_from_map);
+        unpacked_buffers,
+        weights_cache);
 
     if (err != Error::Ok) {
       return err;
@@ -2103,12 +2122,26 @@
 
   xnn_runtime_t runtime_ptr = nullptr;
 
+  // If the weights cache is not enabled, then XNNWeightsCache just manages
+  // the unpacked weights until the runtime is created.
+#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
+  ET_CHECK_OR_RETURN_ERROR(
+      unpacked_buffers.size() == 0,
+      Internal,
+      "Weight Cache is enabled, which means unpacked buffers should be owned by the cache");
+  xnn_weights_cache_t weights_cache_ptr =
+      weights_cache->get_num_unpacked_data() > 0 ? weights_cache->get()
+                                                 : nullptr;
+#else
+  xnn_weights_cache_t weights_cache_ptr = nullptr;
+#endif
+
 #ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
   ET_CHECK_OR_RETURN_ERROR(
       workspace != nullptr, Internal, "Failed to initialize XNNPACK workspace");
 
   status = xnn_create_runtime_v4(
       subgraph.get(),
-      /*weight_cache=*/nullptr, // TODO - support weight cache
+      weights_cache_ptr,
       workspace,
       ::executorch::extension::threadpool::get_pthreadpool(),
       runtime_flags,
@@ -2116,7 +2149,7 @@
 #else
   status = xnn_create_runtime_v3(
       subgraph.get(),
-      /*weight_cache=*/nullptr, // TODO - support weight cache
+      weights_cache_ptr,
      ::executorch::extension::threadpool::get_pthreadpool(),
       runtime_flags,
       &runtime_ptr);
@@ -2128,10 +2161,25 @@
       "XNN Runtime creation failed with code: %s",
       xnn_status_to_string(status));
 
+#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
+  auto packed_weights_names = weights_cache->finalize_for_runtime();
+  ET_CHECK_OR_RETURN_ERROR(
+      packed_weights_names.ok(),
+      Internal,
+      "Failed to finalize weights cache after creating the xnn runtime");
+#else
+  for (auto& buffer : unpacked_buffers) {
+    buffer.Free();
+  }
+  Result<std::vector<std::string>> packed_weights_names =
+      std::vector<std::string>();
+#endif
+
   err = executor->initialize( // NOLINT: runtime_ptr is non-null
       runtime_ptr,
       std::move(input_ids),
-      std::move(output_ids));
+      std::move(output_ids),
+      std::move(packed_weights_names.get()));
 
   return err;
 };
diff --git a/backends/xnnpack/runtime/XNNCompiler.h b/backends/xnnpack/runtime/XNNCompiler.h
index 3ea621a4d59..bcc87351d7d 100644
--- a/backends/xnnpack/runtime/XNNCompiler.h
+++ b/backends/xnnpack/runtime/XNNCompiler.h
@@ -9,11 +9,9 @@
 #pragma once
 
 #include <executorch/backends/xnnpack/runtime/XNNExecutor.h>
+#include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
 #include <executorch/runtime/platform/compiler.h>
-
 #include <xnnpack.h>
-#include <memory>
-#include <vector>
 
 namespace executorch {
 namespace backends {
@@ -29,9 +27,9 @@ class XNNCompiler {
       const void* buffer_pointer,
       size_t num_bytes,
       XNNExecutor* executor,
-      executorch::runtime::MemoryAllocator* runtime_allocator,
-      const executorch::runtime::NamedDataMap* named_data_map,
-      xnn_workspace_t workspace);
+      XNNWeightsCache* weights_cache,
+      xnn_workspace_t workspace,
+      const NamedDataMap* named_data_map);
 };
 
 } // namespace delegate
diff --git a/backends/xnnpack/runtime/XNNExecutor.cpp b/backends/xnnpack/runtime/XNNExecutor.cpp
index 1ba549bb8d7..ae7c0d66ecb 100644
--- a/backends/xnnpack/runtime/XNNExecutor.cpp
+++ b/backends/xnnpack/runtime/XNNExecutor.cpp
@@ -30,7 +30,8 @@ using executorch::runtime::kTensorDimensionLimit;
 ET_NODISCARD Error XNNExecutor::initialize(
     xnn_runtime_t runtime,
     std::vector<uint32_t>&& input_ids,
-    std::vector<uint32_t>&& output_ids) {
+    std::vector<uint32_t>&& output_ids,
+    std::vector<std::string>&& packed_data_names) {
   runtime_ = std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)>(
       runtime, xnn_delete_runtime);
 
@@ -51,6 +52,7 @@ ET_NODISCARD Error XNNExecutor::initialize(
   std::sort(output_ids_.begin(), output_ids_.end());
 
   externals_.resize(input_ids_.size() + output_ids_.size());
+  packed_data_names_ = std::move(packed_data_names);
 
   return Error::Ok;
 }
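
The compiler-side changes above exercise the entire weights-cache surface. For easier review, the sketch below collects those call sites into one declaration. It is reconstructed purely from this diff; the real declarations live in `backends/xnnpack/runtime/XNNWeightsCache.h`, which is not part of this diff, so every signature (and each stand-in type) should be treated as an assumption.

```cpp
// Inferred surface of XNNWeightsCache, reconstructed from the call sites in
// this diff. Return types of initialize_for_runtime() and
// delete_packed_data() are not visible here and are assumed.
#include <cstddef>
#include <cstdint>
#include <string>
#include <vector>

struct MemoryAllocator; // stand-in for executorch::runtime::MemoryAllocator
struct NamedDataMap; // stand-in for executorch::runtime::NamedDataMap
template <typename T>
struct Result; // stand-in for executorch::runtime::Result
enum class Error; // stand-in for executorch::runtime::Error
typedef struct xnn_weights_cache* xnn_weights_cache_t; // as in <xnnpack.h>

class XNNWeightsCache {
 public:
  // XnnpackBackend::init(): point the cache at this method's runtime
  // allocator and NamedDataMap before any tensors are defined.
  Error initialize_for_runtime(
      MemoryAllocator* runtime_allocator, const NamedDataMap* named_data_map);

  // getConstantDataPtr(): fetch (and take ownership of) the unpacked bytes
  // for a named constant tensor.
  Result<const uint8_t*> load_unpacked_data(const std::string& name);

  // XNNCompiler::compileModel(): the raw handle handed to
  // xnn_create_runtime_v3/v4 so XNNPACK packs weights through the cache.
  xnn_weights_cache_t get();
  std::size_t get_num_unpacked_data();

  // After runtime creation: drop the unpacked staging buffers and report
  // which packed blobs this runtime references (recorded by XNNExecutor).
  Result<std::vector<std::string>> finalize_for_runtime();

  // XnnpackBackend::destroy(): release this executor's packed blobs.
  void delete_packed_data(const std::vector<std::string>& packed_data_names);
};
```

In the non-cache build, the same code path degenerates to the previous behavior: `getConstantDataPtr()` reads from the `NamedDataMap` into `unpacked_buffers`, which `compileModel()` frees once `xnn_create_runtime_*` has packed the weights.
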
diff --git a/backends/xnnpack/runtime/XNNExecutor.h b/backends/xnnpack/runtime/XNNExecutor.h
index 68ee18609e3..b98c902f44f 100644
--- a/backends/xnnpack/runtime/XNNExecutor.h
+++ b/backends/xnnpack/runtime/XNNExecutor.h
@@ -34,6 +34,7 @@ class XNNExecutor {
   std::vector<uint32_t> input_ids_;
   std::vector<uint32_t> output_ids_;
   std::vector<xnn_external_value> externals_;
+  std::vector<std::string> packed_data_names_;
 
  public:
   XNNExecutor() = default;
@@ -46,6 +47,10 @@ class XNNExecutor {
     return output_ids_.size();
   }
 
+  inline std::vector<std::string> get_packed_data_names() {
+    return packed_data_names_;
+  }
+
   /**
    * Initialize the XNNExecutor with a given runtime and input/output ids.
    * The input/output ids are expected to be sorted in order of their
@@ -54,7 +59,8 @@ class XNNExecutor {
   ET_NODISCARD executorch::runtime::Error initialize(
       xnn_runtime_t runtime,
       std::vector<uint32_t>&& input_ids,
-      std::vector<uint32_t>&& output_ids);
+      std::vector<uint32_t>&& output_ids,
+      std::vector<std::string>&& packed_data_names);
 
   /**
    * Prepares the arguments for runtime graph execution.
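
The backend changes below are where the sharing pays off: a single cache instance is owned by the backend, so packed weights produced while compiling one method or partition can be reused by another, and `destroy()` releases them by name via `get_packed_data_names()`. As a rough mental model only (plain C++, not the actual `XNNWeightsCache` implementation), the cache behaves like a name-keyed, reference-counted store of packed buffers:

```cpp
// Toy model of the caching behavior, for intuition only. It assumes nothing
// about the real implementation beyond what the diff shows: packed data is
// keyed by name, shared across delegate instances, and deleted by name.
#include <cstdint>
#include <cstdio>
#include <map>
#include <memory>
#include <string>
#include <vector>

class ToyWeightsCache {
 public:
  // Return the packed buffer for `name`, packing (here: just copying) only
  // on the first request; later requests share the same buffer.
  std::shared_ptr<std::vector<uint8_t>> get_or_pack(
      const std::string& name, const std::vector<uint8_t>& unpacked) {
    auto it = cache_.find(name);
    if (it != cache_.end()) {
      return it->second; // cache hit: no repacking, no extra memory
    }
    auto packed = std::make_shared<std::vector<uint8_t>>(unpacked);
    cache_.emplace(name, packed);
    return packed;
  }

  // Called with the names a destroyed runtime was using; evict entries that
  // no other runtime still references.
  void delete_packed_data(const std::vector<std::string>& names) {
    for (const auto& name : names) {
      auto it = cache_.find(name);
      // use_count() == 1 means the cache itself is the only remaining owner.
      if (it != cache_.end() && it->second.use_count() == 1) {
        cache_.erase(it);
      }
    }
  }

 private:
  std::map<std::string, std::shared_ptr<std::vector<uint8_t>>> cache_;
};

int main() {
  ToyWeightsCache cache;
  std::vector<uint8_t> linear_weight = {1, 2, 3, 4};
  auto method_a = cache.get_or_pack("linear.weight", linear_weight);
  auto method_b = cache.get_or_pack("linear.weight", linear_weight);
  // Both "methods" share one packed copy.
  std::printf("shared: %s\n", method_a.get() == method_b.get() ? "yes" : "no");
  cache.delete_packed_data({"linear.weight"}); // kept: a and b still hold it
  return 0;
}
```

The eviction-by-name step mirrors what `delete_packed_data()` must do, assuming the real cache reference-counts shared entries; the diff itself only shows deletion by name.
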
diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp
index a01ba2da704..1e2f07bd905 100644
--- a/backends/xnnpack/runtime/XNNPACKBackend.cpp
+++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp
@@ -7,6 +7,7 @@
  */
 
 #include <executorch/backends/xnnpack/runtime/XNNCompiler.h>
+#include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
 #include <executorch/runtime/backend/interface.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/evalue.h>
@@ -20,6 +21,7 @@
 namespace executorch {
 namespace backends {
 
+using executorch::backends::xnnpack::delegate::XNNWeightsCache;
 using executorch::runtime::ArrayRef;
 using executorch::runtime::Backend;
 using executorch::runtime::BackendExecutionContext;
@@ -81,13 +83,18 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
     }
 
     const NamedDataMap* named_data_map = context.get_named_data_map();
-
-#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
     // This is needed to serialize access to xnn_create_runtime, which is not
    // thread-safe. This can happen when multiple threads call init() on
     // the same backend instance.
+#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
     const std::lock_guard<std::mutex> lock(workspace_mutex_);
 #endif
+
+#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
+    const std::lock_guard<std::mutex> lock_weight_cache(weights_cache_mutex_);
+    weights_cache_->initialize_for_runtime(
+        context.get_runtime_allocator(), named_data_map);
+#endif
+
     // Executor has been allocated but not constructed, ensure that runtime_ is
     // nullptr by constructing it in place here. NOTE: Since we use placement
     // new and since this type is not trivially destructible, we must call the
@@ -97,9 +104,9 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
         processed->data(),
         processed->size(),
         executor,
-        context.get_runtime_allocator(),
-        named_data_map,
-        workspace_.get());
+        weights_cache_.get(),
+        workspace_.get(),
+        named_data_map);
 
     // This backend does not need its processed data after compiling the model.
     processed->Free();
@@ -125,6 +132,10 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
     const std::lock_guard<std::mutex> lock(workspace_mutex_);
 #endif
 
+#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
+    const std::lock_guard<std::mutex> lock_weights_cache(weights_cache_mutex_);
+#endif
+
     // Prepare Inputs/Outputs and Propagate Input Shapes
     Error err = executor->prepare_args(args);
     if (err != Error::Ok) {
@@ -145,16 +156,24 @@
 
   void destroy(DelegateHandle* handle) const override {
     if (handle != nullptr) {
-#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
       // This is needed to serialize access to xnn_delete_runtime, which is not
       // thread-safe. This can happen when multiple threads call destroy() on
       // the same backend instance.
+#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
       const std::lock_guard<std::mutex> lock(workspace_mutex_);
 #endif
+
       auto executor = static_cast<xnnpack::delegate::XNNExecutor*>(handle);
+
 #ifdef ENABLE_XNNPACK_PROFILING
       executor->print_avg_op_timings();
 #endif
+
+#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
+      const std::lock_guard<std::mutex> lock_weights_cache(
+          weights_cache_mutex_);
+      weights_cache_->delete_packed_data(executor->get_packed_data_names());
+#endif
       // XNNExecutor is not trivially destructible. Since this was constructed
       // manually in init(), we must destroy it manually here.
       executor->~XNNExecutor();
@@ -167,6 +186,15 @@
   std::unique_ptr<xnn_workspace, decltype(&xnn_release_workspace)> workspace_{
       nullptr,
       &xnn_release_workspace};
+
+  // The weights cache is global to all delegate instances.
+  mutable std::mutex weights_cache_mutex_;
+  std::unique_ptr<XNNWeightsCache> weights_cache_ =
+      std::make_unique<XNNWeightsCache>();
+
+  // Lock hierarchy for mutexes:
+  //   workspace_mutex_
+  //   weights_cache_mutex_
 };
 
 namespace {
diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl
index 3fd9c433372..e97f1941ff7 100644
--- a/backends/xnnpack/targets.bzl
+++ b/backends/xnnpack/targets.bzl
@@ -6,11 +6,15 @@ def _get_preprocessor_flags():
     Disable if someone explicitly specified a config option, else
     enable otherwise.
     """
-    if native.read_config("executorch", "xnnpack_workspace_sharing", "0") == "0":
-        return []
+    preprocessor_flags = []
+    if native.read_config("executorch", "xnnpack_workspace_sharing", "0") != "0":
+        preprocessor_flags.append("-DENABLE_XNNPACK_SHARED_WORKSPACE")
+
+    if native.read_config("executorch", "xnnpack_weights_cache", "0") != "0":
+        preprocessor_flags.append("-DENABLE_XNNPACK_WEIGHTS_CACHE")
 
     # Enable if not disabled through config
-    return ["-DENABLE_XNNPACK_SHARED_WORKSPACE"]
+    return preprocessor_flags
 
 def define_common_targets():
     runtime.cxx_library(
diff --git a/backends/xnnpack/test/runtime/test_xnnexecutor.cpp b/backends/xnnpack/test/runtime/test_xnnexecutor.cpp
index a5a26004b49..42d925c1253 100644
--- a/backends/xnnpack/test/runtime/test_xnnexecutor.cpp
+++ b/backends/xnnpack/test/runtime/test_xnnexecutor.cpp
@@ -74,7 +74,8 @@ TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) {
           },
           {
               1,
-          }),
+          },
+          {}),
       Error::Ok);
   TensorFactory<executorch::aten::ScalarType::Int> tf;
   auto input_tensor = tf.make({1, 1, 1, 1, 1, 1, 1, 1, 1}, {42});
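
Note on concurrency: with the cache enabled, `init()`, `execute()`, and `destroy()` all serialize on `weights_cache_mutex_`, which matches the CMakeLists.txt caveat that the option "serializes execution and initialization of delegates." The lock-hierarchy comment (`workspace_mutex_` before `weights_cache_mutex_`) documents the acquisition order that `init()` and `destroy()` already follow, which is what keeps the two mutexes deadlock-free when both `ENABLE_XNNPACK_SHARED_WORKSPACE` and `ENABLE_XNNPACK_WEIGHTS_CACHE` are defined.
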