google
diff --git a/bench/subgraph/BUILD
Lines changed: 27 additions & 3 deletions b/bench/subgraph/BUILD
Lines changed: 27 additions & 3 deletions
diff --git a/bench/subgraph/CMakeLists.txt
Lines changed: 13 additions & 0 deletions b/bench/subgraph/CMakeLists.txt
Lines changed: 13 additions & 0 deletions
diff --git a/bench/subgraph/benchmark.cc
Lines changed: 26 additions & 142 deletions b/bench/subgraph/benchmark.cc
Lines changed: 26 additions & 142 deletions
@@ -10,6 +10,20 @@ load(
     "xnnpack_slow_benchmark_tags",
 )
 
+xnnpack_cxx_library(
+    name = "model_runtime",
+    srcs = ["model_runtime.cc"],
+    hdrs = ["model_runtime.h"],
+    deps = [
+        "//:XNNPACK",
+        "//:allocator",
+        "//:subgraph_h",
+        "//bench:bench_utils",
+        "@com_google_benchmark//:benchmark",
+        "@pthreadpool",
+    ],
+)
+
 xnnpack_cxx_library(
     name = "models",
     testonly = 1,
@@ -42,11 +56,21 @@ xnnpack_benchmark(
     srcs = ["benchmark.cc"],
     tags = xnnpack_slow_benchmark_tags(),
     deps = [
+        ":model_runtime",
         ":models",
-        "//:allocator",
-        "//:subgraph",
         "//:xnnpack_h",
         "//bench:bench_utils",
-        "@pthreadpool",
+    ],
+)
+
+xnnpack_benchmark(
+    name = "fully_connected",
+    srcs = ["fully_connected.cc"],
+    tags = xnnpack_slow_benchmark_tags(),
+    deps = [
+        ":model_runtime",
+        "//:xnnpack_h",
+        "//bench:bench_utils",
+        "//test:next_prime",
     ],
 )
@@ -26,10 +26,23 @@ IF(XNNPACK_BUILD_LIBRARY)
   SET_TARGET_PROPERTIES(models PROPERTIES CXX_EXTENSIONS YES)
   TARGET_LINK_LIBRARIES(models PRIVATE XNNPACK)
 
+  ADD_LIBRARY(model_runtime STATIC
+    model_runtime.cc)
+  SET_TARGET_PROPERTIES(model_runtime PROPERTIES CXX_EXTENSIONS YES)
+  TARGET_LINK_LIBRARIES(model_runtime PRIVATE XNNPACK)
+
   ADD_EXECUTABLE(bench-models benchmark.cc)
   TARGET_LINK_LIBRARIES(bench-models PRIVATE
     bench-utils
     benchmark::benchmark
     models
+    model_runtime
+    XNNPACK)
+
+  ADD_EXECUTABLE(bench-fully-connected fully_connected.cc)
+  TARGET_LINK_LIBRARIES(bench-fully-connected PRIVATE
+    bench-utils
+    benchmark::benchmark
+    model_runtime
     XNNPACK)
 ENDIF()
@@ -6,140 +6,24 @@
 #include <benchmark/benchmark.h>
 
 #include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <cstdlib>
-#include <cstring>
 #include <functional>
-#include <memory>
 #include <vector>
 
+#include "bench/subgraph/model_runtime.h"
 #include "bench/subgraph/models.h"
 #include "bench/utils.h"
 #include "include/xnnpack.h"
-#include "src/xnnpack/allocator.h"
-#include "src/xnnpack/subgraph.h"
-#include <pthreadpool.h>
-
-struct ModelRuntime {
-  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> model;
-  pthreadpool_t threadpool = nullptr;
-  xnn_runtime_t runtime = nullptr;
-  std::vector<xnn_external_value> external_values;
-
-  explicit ModelRuntime(int num_threads) : model(nullptr, xnn_delete_subgraph) {
-    xnn_delete_runtime(runtime);
-    threadpool = pthreadpool_create(num_threads);
-  }
-
-  ~ModelRuntime() {
-    if (runtime) {
-      xnn_delete_runtime(runtime);
-    }
-    if (threadpool) {
-      pthreadpool_destroy(threadpool);
-    }
-    for (xnn_external_value& i : external_values) {
-      xnn_release_simd_memory(i.data);
-    }
-  }
-
-  bool CreateModel(std::function<xnn_subgraph_t()> model_factory) {
-    model.reset(model_factory());
-    if (!model) {
-      return false;
-    }
-    for (uint32_t i = 0; i < model->num_values; ++i) {
-      if ((model->values[i].flags & (XNN_VALUE_FLAG_EXTERNAL_INPUT |
-                                     XNN_VALUE_FLAG_EXTERNAL_OUTPUT)) == 0) {
-        continue;
-      }
-      // Make a buffer for this external value.
-      size_t size = xnn_tensor_get_size(&model->values[i]) + XNN_EXTRA_BYTES;
-      external_values.push_back(
-          xnn_external_value{i, xnn_allocate_zero_simd_memory(size)});
-    }
-    return model != nullptr;
-  }
-
-  bool CreateRuntime(uint32_t flags) {
-    assert(!runtime);
-    return xnn_status_success == xnn_create_runtime_v4(model.get(), nullptr,
-                                                       nullptr, threadpool,
-                                                       flags, &runtime);
-  }
-  bool ReshapeRuntime() {
-    return xnn_status_success == xnn_reshape_runtime(runtime);
-  }
-
-  bool SetupRuntime() {
-    return xnn_status_success == xnn_setup_runtime_v2(runtime,
-                                                      external_values.size(),
-                                                      external_values.data());
-  }
-
-  bool Invoke() { return xnn_status_success == xnn_invoke_runtime(runtime); }
-};
-
-static void BenchmarkInvoke(benchmark::State& state,
-                            std::function<xnn_subgraph_t()> model_factory,
-                            uint32_t extra_flags = 0) {
-  if (xnn_initialize(nullptr /* allocator */) != xnn_status_success) {
-    state.SkipWithError("failed to initialize XNNPACK");
-    return;
-  }
-
-  ModelRuntime model_runtime(FLAGS_num_threads);
-  if (!model_runtime.CreateModel(model_factory)) {
-    state.SkipWithError("failed to create model");
-    return;
-  }
-
-  // TODO(dsharlet): We should have benchmarks of these steps too.
-  if (!model_runtime.CreateRuntime(FLAGS_xnn_runtime_flags | extra_flags)) {
-    state.SkipWithError("failed to create runtime");
-    return;
-  }
-
-  if (!model_runtime.ReshapeRuntime()) {
-    state.SkipWithError("failed to reshape runtime");
-    return;
-  }
-
-  if (!model_runtime.SetupRuntime()) {
-    state.SkipWithError("failed to setup runtime");
-    return;
-  }
-
-  int num_iters = FLAGS_benchmark_min_iters;
-  while (state.KeepRunningBatch(num_iters)) {
-    for (int iter = 0; iter < num_iters; iter++) {
-      benchmark::utils::WipePthreadpoolL2Caches(state,
-                                                model_runtime.threadpool);
-      if (!model_runtime.Invoke()) {
-        state.SkipWithError("failed to invoke runtime");
-        return;
-      }
-    }
-    num_iters = 1;
-  }
-
-  const uint64_t cpu_frequency = benchmark::utils::GetCurrentCpuFrequency();
-  if (cpu_frequency != 0) {
-    state.counters["cpufreq"] = cpu_frequency;
-  }
-}
 
 static void FP32Attention(benchmark::State& state) {
-  BenchmarkInvoke(state, [&state]() {
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state]() {
     return models::FP32Attention(FLAGS_batch_size, state.range(0),
                                  state.range(1), state.range(2),
                                  state.range(3));
   });
 }
 
 static void FP16Attention(benchmark::State& state) {
-  BenchmarkInvoke(
+  xnnpack::ModelRuntime::BenchmarkInvoke(
       state,
       [&state]() {
         return models::FP32Attention(FLAGS_batch_size, state.range(0),
@@ -150,101 +34,101 @@ static void FP16Attention(benchmark::State& state) {
 }
 
 static void FP32MobileNetV1(benchmark::State& state) {
-  BenchmarkInvoke(state, models::FP32MobileNetV1);
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV1);
 }
 
 static void FP32MobileNetV2(benchmark::State& state) {
-  BenchmarkInvoke(state, models::FP32MobileNetV2);
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV2);
 }
 
 static void FP32MobileNetV3Large(benchmark::State& state) {
-  BenchmarkInvoke(state, models::FP32MobileNetV3Large);
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV3Large);
 }
 
 static void FP32MobileNetV3Small(benchmark::State& state) {
-  BenchmarkInvoke(state, models::FP32MobileNetV3Small);
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV3Small);
 }
 
 static void FP16MobileNetV1(benchmark::State& state) {
-  BenchmarkInvoke(state, models::FP32MobileNetV1,
-                  XNN_FLAG_FORCE_FP16_INFERENCE);
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV1,
+                                         XNN_FLAG_FORCE_FP16_INFERENCE);
 }
 
 static void FP16MobileNetV2(benchmark::State& state) {
-  BenchmarkInvoke(state, models::FP32MobileNetV2,
-                  XNN_FLAG_FORCE_FP16_INFERENCE);
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV2,
+                                         XNN_FLAG_FORCE_FP16_INFERENCE);
 }
 
 static void FP16MobileNetV3Large(benchmark::State& state) {
-  BenchmarkInvoke(state, models::FP32MobileNetV3Large,
-                  XNN_FLAG_FORCE_FP16_INFERENCE);
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV3Large,
+                                         XNN_FLAG_FORCE_FP16_INFERENCE);
 }
 
 static void FP16MobileNetV3Small(benchmark::State& state) {
-  BenchmarkInvoke(state, models::FP32MobileNetV3Small,
-                  XNN_FLAG_FORCE_FP16_INFERENCE);
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, models::FP32MobileNetV3Small,
+                                         XNN_FLAG_FORCE_FP16_INFERENCE);
 }
 
 static void QD8Attention(benchmark::State& state) {
   models::QD8AttentionWeights weights;
-  BenchmarkInvoke(state, [&state, &weights]() {
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state, &weights]() {
     return models::QD8Attention(FLAGS_batch_size, state.range(0),
                                 state.range(1), state.range(2), state.range(3),
                                 weights);
   });
 }
 
 static void QS8MobileNetV2(benchmark::State& state) {
-  BenchmarkInvoke(state, models::QS8MobileNetV2);
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, models::QS8MobileNetV2);
 }
 
 static void FP32Elementwise(benchmark::State& state) {
-  BenchmarkInvoke(state, [&state]() {
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state]() {
     return models::FP32Elementwise(/*batch_size=*/state.range(0),
                                    /*num_elements=*/state.range(1),
                                    /*depth=*/state.range(2));
   });
 }
 
 static void FP32LayerNorm(benchmark::State& state) {
-  BenchmarkInvoke(state, [&state]() {
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state]() {
     return models::FP32LayerNorm(state.range(0), state.range(1), state.range(2),
                                  state.range(3));
   });
 }
 
 static void FP32L2Norm(benchmark::State& state) {
-  BenchmarkInvoke(state, [&state]() {
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state]() {
     return models::FP32L2Norm(state.range(0), state.range(1), state.range(2),
                                  state.range(3));
   });
 }
 
 static void FP32SoftmaxDecomp(benchmark::State& state) {
-  BenchmarkInvoke(state, [&state]() {
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state]() {
     return models::FP32Softmax(state.range(0), state.range(1), state.range(2),
                                state.range(3), /*use_softmax=*/false);
   });
 }
 
 static void FP32SoftmaxFused(benchmark::State& state) {
-  BenchmarkInvoke(state, [&state]() {
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state]() {
     return models::FP32Softmax(state.range(0), state.range(1), state.range(2),
                                state.range(3), /*use_softmax=*/true);
   });
 }
 
 static void FP32DepthwiseSeparable(benchmark::State& state) {
   models::FP32DepthwiseSeparableWeights weights;
-  BenchmarkInvoke(state, [&state, &weights]() {
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state, &weights]() {
     return models::FP32DepthwiseSeparable(state.range(0), state.range(1),
                                           state.range(2), state.range(3),
                                           state.range(4), weights);
   });
 }
 
 static void QD8TransformerBlock(benchmark::State& state) {
-  BenchmarkInvoke(state, [&state]() {
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state]() {
     return models::QD8TransformerBlock(
         FLAGS_batch_size, /*sequence_length=*/state.range(0),
         /*embedding_dim=*/state.range(1), /*num_heads=*/state.range(2),
@@ -253,7 +137,7 @@ static void QD8TransformerBlock(benchmark::State& state) {
 }
 
 static void FP32TransformerBlock(benchmark::State& state) {
-  BenchmarkInvoke(state, [&state]() {
+  xnnpack::ModelRuntime::BenchmarkInvoke(state, [&state]() {
     return models::FP32TransformerBlock(
         FLAGS_batch_size, /*sequence_length=*/state.range(0),
         /*embedding_dim=*/state.range(1), /*num_heads=*/state.range(2),
@@ -262,7 +146,7 @@ static void FP32TransformerBlock(benchmark::State& state) {
 }
 
 static void FP16TransformerBlock(benchmark::State& state) {
-  BenchmarkInvoke(
+  xnnpack::ModelRuntime::BenchmarkInvoke(
       state,
       [&state]() {
         return models::FP32TransformerBlock(