
[XNNPACK][Weights Cache] Enable in XNNPACK #9155

Merged · 9 commits · Mar 14, 2025
13 changes: 13 additions & 0 deletions backends/xnnpack/CMakeLists.txt
@@ -37,6 +37,19 @@ option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE
# Keeping this OFF by default due to regressions in decode and model load with
# kleidi kernels
option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI "Enable Arm Kleidi kernels" OFF)

# Turning this on caches weights between partitions and methods. If weights
# are shared across methods/partitions, this can reduce load time and
# memory usage.

# Keeping this off maintains existing behavior. Turning this on serializes
# execution and initialization of delegates; to be revisited.
option(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE
"Enable weights cache to cache and manage all packed weights" OFF)

if(EXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE)
add_definitions(-DENABLE_XNNPACK_WEIGHTS_CACHE)
endif()
if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE)
add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE)
endif()
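
With this option in place, a typical ExecuTorch CMake configure should be able to opt in by passing -DEXECUTORCH_XNNPACK_ENABLE_WEIGHT_CACHE=ON at configure time, which defines ENABLE_XNNPACK_WEIGHTS_CACHE for the backend sources via the add_definitions call above.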
72 changes: 60 additions & 12 deletions backends/xnnpack/runtime/XNNCompiler.cpp
@@ -11,7 +11,9 @@
#include <executorch/backends/xnnpack/serialization/schema_generated.h>
#include <executorch/extension/threadpool/threadpool.h>
#include <executorch/runtime/executor/pte_data_map.h>
#include <string>
#include <unordered_map>
#include <vector>

#pragma clang diagnostic ignored "-Wmissing-prototypes"
#pragma clang diagnostic ignored "-Wglobal-constructors"
@@ -167,7 +169,8 @@ const uint8_t* getConstantDataPtr(
GraphPtr flatbuffer_graph,
const uint8_t* constant_data_ptr,
const NamedDataMap* named_data_map,
std::vector<FreeableBuffer>& loaded_buffers_from_map) {
std::vector<FreeableBuffer>& freeable_buffers,
XNNWeightsCache* weights_cache) {
auto buffer_idx = tensor_value->constant_buffer_idx();
if (buffer_idx) {
if (!constant_data_ptr) {
@@ -187,6 +190,15 @@ const uint8_t* getConstantDataPtr(
return constant_data_ptr + offset;
} else {
const std::string& data_name = constant_data_offset->named_key()->str();
#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
Result<const uint8_t*> data_ptr =
weights_cache->load_unpacked_data(data_name);
if (!data_ptr.ok()) {
ET_LOG(Error, "Failed to load weights from cache");
return nullptr;
}
return data_ptr.get();
#else
Result<FreeableBuffer> buffer =
named_data_map->get_data(data_name.c_str());
if (!buffer.ok()) {
@@ -198,8 +210,9 @@
}
const uint8_t* data_ptr =
static_cast<const uint8_t*>(buffer.get().data());
loaded_buffers_from_map.push_back(std::move(buffer.get()));
freeable_buffers.push_back(std::move(buffer.get()));
return data_ptr;
#endif
}
}
}
@@ -222,7 +235,8 @@ Error defineTensor(
std::vector<uint32_t>& output_ids,
CompileAllocator& allocator,
const NamedDataMap* named_data_map,
std::vector<FreeableBuffer>& loaded_buffers_from_map) {
std::vector<FreeableBuffer>& freeable_buffers,
XNNWeightsCache* weights_cache) {
const fb_xnnpack::XNNTensorValue* tensor_value = nullptr;
const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr;

@@ -264,7 +278,8 @@ Error defineTensor(
flatbuffer_graph,
constant_data_ptr,
named_data_map,
loaded_buffers_from_map);
freeable_buffers,
weights_cache);

xnn_status status;
// The type we might have to convert to
@@ -1999,9 +2014,9 @@ ET_NODISCARD Error XNNCompiler::compileModel(
const void* buffer_pointer,
size_t num_bytes,
XNNExecutor* executor,
MemoryAllocator* runtime_allocator,
const NamedDataMap* named_data_map,
xnn_workspace_t workspace) {
XNNWeightsCache* weights_cache,
xnn_workspace_t workspace,
const NamedDataMap* named_data_map) {
Result<XNNHeader> header = XNNHeader::Parse(buffer_pointer, num_bytes);
const uint8_t* flatbuffer_data = nullptr;
const uint8_t* constant_data = nullptr;
@@ -2065,11 +2080,14 @@ ET_NODISCARD Error XNNCompiler::compileModel(
// Invalid ids do not need to be remapped
remapped_ids.emplace(XNN_INVALID_VALUE_ID, XNN_INVALID_VALUE_ID);

// If the weight cache is not on, we hold onto all the unpacked buffers
// and free them at the end
std::vector<FreeableBuffer> unpacked_buffers;

// External Ids for inputs and outputs
std::vector<uint32_t> input_ids;
std::vector<uint32_t> output_ids;
Error err = Error::Ok;
std::vector<FreeableBuffer> loaded_buffers_from_map;
for (auto value : *flatbuffer_graph->xvalues()) {
err = defineTensor(
subgraph.get(),
@@ -2081,7 +2099,8 @@
output_ids,
compile_allocator,
named_data_map,
loaded_buffers_from_map);
unpacked_buffers,
weights_cache);

if (err != Error::Ok) {
return err;
@@ -2103,20 +2122,34 @@

xnn_runtime_t runtime_ptr = nullptr;

// If the weights cache is not enabled, then XNNWeightsCache just manages
// the unpacked weights until the runtime is created.
#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
ET_CHECK_OR_RETURN_ERROR(
unpacked_buffers.size() == 0,
Internal,
"Weight Cache is enabled, which means unpacked buffers should be owned by the cache");
xnn_weights_cache_t weights_cache_ptr =
weights_cache->get_num_unpacked_data() > 0 ? weights_cache->get()
: nullptr;
#else
xnn_weights_cache_t weights_cache_ptr = nullptr;
#endif

#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
ET_CHECK_OR_RETURN_ERROR(
workspace != nullptr, Internal, "Failed to initialize XNNPACK workspace");
status = xnn_create_runtime_v4(
subgraph.get(),
/*weight_cache=*/nullptr, // TODO - support weight cache
weights_cache_ptr,
workspace,
::executorch::extension::threadpool::get_pthreadpool(),
runtime_flags,
&runtime_ptr);
#else
status = xnn_create_runtime_v3(
subgraph.get(),
/*weight_cache=*/nullptr, // TODO - support weight cache
weights_cache_ptr,
::executorch::extension::threadpool::get_pthreadpool(),
runtime_flags,
&runtime_ptr);
@@ -2128,10 +2161,25 @@
"XNN Runtime creation failed with code: %s",
xnn_status_to_string(status));

#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
auto packed_weights_names = weights_cache->finalize_for_runtime();
ET_CHECK_OR_RETURN_ERROR(
packed_weights_names.ok(),
Internal,
"Failed to finalize weights cache after creating the xnn runtime")
#else
for (auto& buffer : unpacked_buffers) {
buffer.Free();
}
Result<std::vector<std::string>> packed_weights_names =
std::vector<std::string>();
#endif

err = executor->initialize( // NOLINT: runtime_ptr is non-null
runtime_ptr,
std::move(input_ids),
std::move(output_ids));
std::move(output_ids),
std::move(packed_weights_names.get()));

return err;
};
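
Taken together, these compiler changes give named weights a load → finalize → release lifecycle: getConstantDataPtr() stages unpacked data through the cache, runtime creation packs it, finalize_for_runtime() drops the staging copies and reports the packed entry names, and the executor keeps those names so destroy() can release them later. The toy class below is a minimal, self-contained sketch of that ownership handoff; ToyWeightsCache and everything in it are hypothetical stand-ins, not the real XNNWeightsCache API (which also implements XNNPACK's xnn_weights_cache_t so packing routes through the cache).

#include <cstdint>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

// Toy stand-in for the ownership handoff described above. All names and
// signatures here are hypothetical; only the lifecycle mirrors the PR.
class ToyWeightsCache {
 public:
  // Like load_unpacked_data(): the cache, not the caller, keeps the
  // unpacked bytes alive until the runtime has packed them.
  const uint8_t* load_unpacked_data(
      const std::string& name,
      std::vector<uint8_t> bytes) {
    auto& slot = unpacked_[name];
    slot = std::move(bytes);
    return slot.data();
  }

  // Like finalize_for_runtime(): once the runtime has packed the weights,
  // drop the unpacked staging copies and hand back the names of the packed
  // entries so the caller can release them at destroy() time.
  std::vector<std::string> finalize_for_runtime() {
    std::vector<std::string> packed_names;
    packed_names.reserve(unpacked_.size());
    for (const auto& entry : unpacked_) {
      packed_names.push_back(entry.first);
    }
    unpacked_.clear();
    return packed_names;
  }

 private:
  std::unordered_map<std::string, std::vector<uint8_t>> unpacked_;
};

int main() {
  ToyWeightsCache cache;
  const uint8_t* data = cache.load_unpacked_data("linear.weight", {1, 2, 3});
  (void)data;  // in the real flow, XNNPACK packs weights from this pointer
  return cache.finalize_for_runtime().size() == 1 ? 0 : 1;
}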
10 changes: 4 additions & 6 deletions backends/xnnpack/runtime/XNNCompiler.h
@@ -9,11 +9,9 @@
#pragma once

#include <executorch/backends/xnnpack/runtime/XNNExecutor.h>
#include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
#include <executorch/runtime/platform/compiler.h>

#include <xnnpack.h>
#include <memory>
#include <vector>

namespace executorch {
namespace backends {
@@ -29,9 +27,9 @@ class XNNCompiler {
const void* buffer_pointer,
size_t num_bytes,
XNNExecutor* executor,
executorch::runtime::MemoryAllocator* runtime_allocator,
const executorch::runtime::NamedDataMap* named_data_map,
xnn_workspace_t workspace);
XNNWeightsCache* weights_cache,
xnn_workspace_t workspace,
const NamedDataMap* named_data_map);
};

} // namespace delegate
4 changes: 3 additions & 1 deletion backends/xnnpack/runtime/XNNExecutor.cpp
@@ -30,7 +30,8 @@ using executorch::runtime::kTensorDimensionLimit;
ET_NODISCARD Error XNNExecutor::initialize(
xnn_runtime_t runtime,
std::vector<uint32_t>&& input_ids,
std::vector<uint32_t>&& output_ids) {
std::vector<uint32_t>&& output_ids,
std::vector<std::string>&& packed_data_names) {
runtime_ = std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)>(
runtime, xnn_delete_runtime);

@@ -51,6 +52,7 @@ ET_NODISCARD Error XNNExecutor::initialize(
std::sort(output_ids_.begin(), output_ids_.end());

externals_.resize(input_ids_.size() + output_ids_.size());
packed_data_names_ = std::move(packed_data_names);

return Error::Ok;
}
8 changes: 7 additions & 1 deletion backends/xnnpack/runtime/XNNExecutor.h
@@ -34,6 +34,7 @@ class XNNExecutor {
std::vector<uint32_t> input_ids_;
std::vector<uint32_t> output_ids_;
std::vector<xnn_external_value> externals_;
std::vector<std::string> packed_data_names_;

public:
XNNExecutor() = default;
@@ -46,6 +47,10 @@
return output_ids_.size();
}

inline std::vector<std::string> get_packed_data_names() {
return packed_data_names_;
}

/**
* Initialize the XNNExecutor with a given runtime and input/output ids.
* The input/output ids are expected to be sorted in order of their
@@ -54,7 +59,8 @@
ET_NODISCARD executorch::runtime::Error initialize(
xnn_runtime_t runtime,
std::vector<uint32_t>&& input_ids,
std::vector<uint32_t>&& output_ids);
std::vector<uint32_t>&& output_ids,
std::vector<std::string>&& packed_data_names);

/**
* Prepares the arguments for runtime graph execution.
42 changes: 35 additions & 7 deletions backends/xnnpack/runtime/XNNPACKBackend.cpp
@@ -7,6 +7,7 @@
*/

#include <executorch/backends/xnnpack/runtime/XNNCompiler.h>
#include <executorch/backends/xnnpack/runtime/XNNWeightsCache.h>
#include <executorch/runtime/backend/interface.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/core/evalue.h>
@@ -20,6 +21,7 @@
namespace executorch {
namespace backends {

using executorch::backends::xnnpack::delegate::XNNWeightsCache;
using executorch::runtime::ArrayRef;
using executorch::runtime::Backend;
using executorch::runtime::BackendExecutionContext;
@@ -81,13 +83,18 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
}

const NamedDataMap* named_data_map = context.get_named_data_map();

#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
// This is needed to serialize access to xnn_create_runtime which is not
// thread safe. This can happen when multiple threads call init() on
// the same backend instance.
#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
const std::lock_guard<std::mutex> lock(workspace_mutex_);
#endif

#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
const std::lock_guard<std::mutex> lock_weight_cache(weights_cache_mutex_);
weights_cache_->initialize_for_runtime(
context.get_runtime_allocator(), named_data_map);
#endif

// Executor has been allocated but not constructed, ensure that runtime_ is
// nullptr by constructing it in place here. NOTE: Since we use placement
// new and since this type is not trivially destructible, we must call the
@@ -97,9 +104,9 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
processed->data(),
processed->size(),
executor,
context.get_runtime_allocator(),
named_data_map,
workspace_.get());
weights_cache_.get(),
workspace_.get(),
named_data_map);
// This backend does not need its processed data after compiling the model.
processed->Free();

@@ -125,6 +132,10 @@
const std::lock_guard<std::mutex> lock(workspace_mutex_);
#endif

#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
const std::lock_guard<std::mutex> lock_weights_cache(weights_cache_mutex_);
#endif

// Prepare Inputs/Outputs and Propagate Input Shapes
Error err = executor->prepare_args(args);
if (err != Error::Ok) {
@@ -145,16 +156,24 @@

void destroy(DelegateHandle* handle) const override {
if (handle != nullptr) {
#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
// This is needed to serialize access to xnn_delete_runtime which is not
// thread safe. This can happen when multiple threads call destroy() on
// the same backend instance.
#ifdef ENABLE_XNNPACK_SHARED_WORKSPACE
const std::lock_guard<std::mutex> lock(workspace_mutex_);
#endif

auto executor = static_cast<xnnpack::delegate::XNNExecutor*>(handle);

#ifdef ENABLE_XNNPACK_PROFILING
executor->print_avg_op_timings();
#endif

#ifdef ENABLE_XNNPACK_WEIGHTS_CACHE
const std::lock_guard<std::mutex> lock_weights_cache(
weights_cache_mutex_);
weights_cache_->delete_packed_data(executor->get_packed_data_names());
#endif
// XNNExecutor is not trivially destructible. Since this was constructed
// manually in init(), we must destroy it manually here.
executor->~XNNExecutor();
@@ -167,6 +186,15 @@ class XnnpackBackend final : public ::executorch::runtime::BackendInterface {
std::unique_ptr<xnn_workspace, decltype(&xnn_release_workspace)> workspace_{
nullptr,
&xnn_release_workspace};

// Weights cache is global to all delegate instances.
mutable std::mutex weights_cache_mutex_;
std::unique_ptr<XNNWeightsCache> weights_cache_ =
std::make_unique<XNNWeightsCache>();

// Lock hierarchy for mutexes:
// workspace_mutex_
// weights_cache_mutex_
};

namespace {
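The lock hierarchy comment above is worth illustrating: init(), execute(), and destroy() each take workspace_mutex_ before weights_cache_mutex_, and one consistent ordering is what rules out deadlock between threads that share the global workspace and cache. Below is a minimal, self-contained sketch of the ordering rule; the names are stand-ins, not the backend's actual members.

#include <mutex>

// Stand-ins for the backend's workspace_mutex_ and weights_cache_mutex_.
std::mutex workspace_mutex;
std::mutex weights_cache_mutex;

// Every path that needs both locks acquires them in the same order:
// workspace first, weights cache second, matching init()/execute()/destroy()
// above. Two threads can then never hold the locks in opposite orders,
// which is the classic deadlock scenario a lock hierarchy prevents.
void init_like_critical_section() {
  const std::lock_guard<std::mutex> workspace_guard(workspace_mutex);
  const std::lock_guard<std::mutex> weights_cache_guard(weights_cache_mutex);
  // ... create the XNNPACK runtime and initialize the weights cache ...
}

int main() {
  init_like_critical_section();
  return 0;
}
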
10 changes: 7 additions & 3 deletions backends/xnnpack/targets.bzl
@@ -6,11 +6,15 @@ def _get_preprocessor_flags():
Disable if someone explicitly specified a config option;
enable otherwise.
"""
if native.read_config("executorch", "xnnpack_workspace_sharing", "0") == "0":
return []
preprocessor_flags = []
if native.read_config("executorch", "xnnpack_workspace_sharing", "0") != "0":
preprocessor_flags.append("-DENABLE_XNNPACK_SHARED_WORKSPACE")

if native.read_config("executorch", "xnnpack_weights_cache", "0") != "0":
preprocessor_flags.append("-DENABLE_XNNPACK_WEIGHTS_CACHE")

# Enable if not disabled through config
return ["-DENABLE_XNNPACK_SHARED_WORKSPACE"]
return preprocessor_flags

def define_common_targets():
runtime.cxx_library(
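On the Buck side, the new knob mirrors the existing workspace-sharing flag: setting the config value, for example with -c executorch.xnnpack_weights_cache=1 (any value other than "0"), should add -DENABLE_XNNPACK_WEIGHTS_CACHE to the preprocessor flags; the exact invocation depends on your Buck setup.
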
3 changes: 2 additions & 1 deletion backends/xnnpack/test/runtime/test_xnnexecutor.cpp
@@ -74,7 +74,8 @@ TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) {
},
{
1,
}),
},
{}),
Error::Ok);
TensorFactory<executorch::aten::ScalarType::Int> tf;
auto input_tensor = tf.make({1, 1, 1, 1, 1, 1, 1, 1, 1}, {42});