Skip to content

Commit 49bc3e9

Browse files
authored
Merge pull request #49 from JDAI-CV/support_arbitrarily_channels
Support arbitrary channels
2 parents c08695a + ce6e843 commit 49bc3e9

37 files changed

+467
-8715
lines changed

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,6 @@
1313
[submodule "third_party/protobuf"]
1414
path = third_party/protobuf
1515
url = https://github.com/protocolbuffers/protobuf
16+
[submodule "third_party/flatbuffers"]
17+
path = third_party/flatbuffers
18+
url = https://github.com/google/flatbuffers

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ include(cmake/system.cmake)
4242
include(cmake/glog.cmake)
4343
configure_glog()
4444

45+
include(cmake/flatbuffers.cmake)
46+
configure_flatbuffers()
47+
4548
add_compile_options("-DEIGEN_MPL2_ONLY")
4649
if (${BNN_NET_BENCHMARK})
4750
add_compile_options("-DBNN_BENCHMARK")

benchmark/benchmark.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,17 +14,17 @@
1414
#include <dabnn/net.h>
1515

1616
// Benchmark the 64-bit packing routine on a small tensor: packs a
// 1x32x32x128 float Mat into a binary (bit) Mat on every iteration.
static void BM_pack_mat_64_small(benchmark::State &state) {
    const bnn::Mat a(1, 32, 32, 128, bnn::DataType::Float, false);
    bnn::Mat b(1, 32, 32, 128, bnn::DataType::Bit, false);
    for (auto _ : state) {
        pack_mat_64(a, b);
    }
}
2323

2424
#ifdef __aarch64__
2525
static void BM_pack_mat_128_small(benchmark::State &state) {
26-
const bnn::Mat a(1, 32, 32, 128, bnn::DataType::Float, 0);
27-
bnn::Mat b(1, 32, 32, 128, bnn::DataType::Bit, 0);
26+
const bnn::Mat a(1, 32, 32, 128, bnn::DataType::Float, false);
27+
bnn::Mat b(1, 32, 32, 128, bnn::DataType::Bit, false);
2828
for (auto _ : state) {
2929
pack_mat_128(a, b);
3030
}

cmake/flatbuffers.cmake

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Configure the bundled FlatBuffers submodule and add it to the build.
# Only the flatbuffers library itself is built; tests/samples, flathash,
# and the flatc compiler are all switched off.
function(configure_flatbuffers)
    option(FLATBUFFERS_BUILD_TESTS "Enable the build of tests and samples." OFF)
    option(FLATBUFFERS_BUILD_FLATHASH "Enable the build of flathash" OFF)
    option(FLATBUFFERS_BUILD_FLATC "Enable the build of the flatbuffers compiler" OFF)
    option(FLATBUFFERS_BUILD_FLATLIB "Enable the build of the flatbuffers library" ON)
    add_subdirectory(third_party/flatbuffers)
endfunction()

common/baseline.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,8 @@ inline void baseline_bconv(const Mat &input, const Mat &weight,
8080
const int stride_w, const int dilation_h,
8181
const int dilation_w, const int output_channels,
8282
Mat &output) {
83+
BNN_ASSERT(weight.total() % weight.n == 0, "");
84+
const auto HWC = weight.total() / weight.n;
8385
int input_y = 0;
8486
FORZ(th, output.h) {
8587
int input_x = 0;
@@ -91,7 +93,7 @@ inline void baseline_bconv(const Mat &input, const Mat &weight,
9193
FORZ(ww, kernel_w) {
9294
int x = input_x - pad_w + ww * dilation_w;
9395
FORZ(wc, input.c) {
94-
int idx = tc * kernel_h * kernel_w * input.c +
96+
int idx = tc * HWC +
9597
wh * kernel_w * input.c + ww * input.c +
9698
wc;
9799
const auto w_value =

common/common_bitpack.h

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,24 @@
99

1010
#include <common/helper.h>
1111

12-
inline void pack_64_bitset(const float *fptr, uint64_t *buf) {
12+
inline void pack_64_bitset(const float *fptr, uint64_t *buf,
13+
const size_t eff_bits = 64) {
14+
/**
 * The eff_bits parameter supports channel counts that are not a
 * multiple of 128. In that case, we need to pad the tensor so that
 * the channels are aligned to 128.
 */
19+
// BNN_ASSERT(eff_bits == 64, eff_bits);
1320
const size_t UNIT_LEN = 64;
21+
BNN_ASSERT(eff_bits <= UNIT_LEN, "The eff_bits ", eff_bits,
22+
" must be smaller than UNIT_LEN ", UNIT_LEN);
1423
std::bitset<UNIT_LEN> bits;
1524
for (size_t i = 0; i < UNIT_LEN; i++) {
16-
bits[i] = (*(fptr + i) > 0);
25+
if (i < eff_bits) {
26+
bits[i] = (*(fptr + i) > 0);
27+
} else {
28+
bits[i] = 0;
29+
}
1730
}
1831
static_assert(sizeof(decltype(bits.to_ullong())) * CHAR_BIT == 64,
1932
"bits.to_ullong() must return a 64-bit element");

common/dab.fbs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ table Tensor {
1010
float32_data: [float32];
1111
shape: [uint32];
1212
name: string;
13+
align_hwc_to_128: bool;
1314
}
1415

1516
table Input {

0 commit comments

Comments
 (0)