Xilinx · mdanilow · Jul 2, 2024 · Jul 4, 2024 · Jul 10, 2024 · Jul 11, 2024
diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py
@@ -145,7 +145,13 @@ def load_external_weights(self):
                 # weight_buf.sync_to_device()
                 weight_buf.flush()
 
-                self.external_weights += [(iwdma, weight_buf, idma_name)]
+                input_shape = self._io_shape_dict["external_weights_input_shapes"][idma_name]
+                # 4-D shape is assumed to be NHWC: scale the weight transfer count by H * W
+                if len(input_shape) == 4:
+                    num_repeats = input_shape[1] * input_shape[2]
+                else:
+                    num_repeats = 1
+                self.external_weights += [(iwdma, weight_buf, idma_name, num_repeats)]
 
         if "number_of_external_weights" in self._io_shape_dict:
             hw_ext_weights = self._io_shape_dict["number_of_external_weights"]
@@ -351,9 +357,9 @@ def execute_on_buffers(self, asynch=False, batch_size=None):
             for o in range(self.num_outputs):
                 assert self.odma[o].read(0x00) & 0x4 != 0, "Output DMA %d is not idle" % (o)
             # manually launch IODMAs since signatures are missing
-            for iwdma, iwbuf, iwdma_name in self.external_weights:
+            for iwdma, iwbuf, iwdma_name, num_repeats in self.external_weights:
                 iwdma.write(0x10, iwbuf.device_address)
-                iwdma.write(0x1C, batch_size)
+                iwdma.write(0x1C, batch_size * num_repeats)
                 iwdma.write(0x00, 1)
             for o in range(self.num_outputs):
                 self.odma[o].write(0x10, self.obuf_packed_device[o].device_address)
@@ -368,8 +374,8 @@ def execute_on_buffers(self, asynch=False, batch_size=None):
                 assert self.odma_handle[o] is None, "Output DMA %d is already running" % o
             for i in range(self.num_inputs):
                 self.idma[i].start(self.ibuf_packed_device[i], batch_size)
-            for iwdma, iwbuf, iwdma_name in self.external_weights:
-                iwdma.start(iwbuf, batch_size)
+            for iwdma, iwbuf, iwdma_name, num_repeats in self.external_weights:
+                iwdma.start(iwbuf, batch_size * num_repeats)
             for o in range(self.num_outputs):
                 self.odma_handle[o] = self.odma[o].start(self.obuf_packed_device[o], batch_size)
         else:
@@ -437,9 +443,9 @@ def throughput_test(self):
         for o in range(self.num_outputs):
             total_out += np.prod(self.oshape_packed(o))
         res["DRAM_out_bandwidth[MB/s]"] = total_out * 0.000001 / runtime
-        for iwdma, iwbuf, iwdma_name in self.external_weights:
+        for iwdma, iwbuf, iwdma_name, num_repeats in self.external_weights:
             res["DRAM_extw_%s_bandwidth[MB/s]" % iwdma_name] = (
-                self.batch_size * np.prod(iwbuf.shape) * 0.000001 / runtime
+                self.batch_size * np.prod(iwbuf.shape) * num_repeats * 0.000001 / runtime
             )
         if self.platform == "zynq-iodma":
             res["fclk[mhz]"] = Clocks.fclk0_mhz

diff --git a/src/finn/qnn-data/templates/driver/validate.py b/src/finn/qnn-data/templates/driver/validate.py
@@ -98,9 +98,9 @@
         driver.execute_on_buffers()
         obuf_normal = np.empty_like(driver.obuf_packed_device[0])
         driver.copy_output_data_from_device(obuf_normal)
-        ret = np.bincount(obuf_normal.flatten() == exp.flatten())
-        nok += ret[0]
-        ok += ret[1]
+        batch_ok = (obuf_normal.flatten() == exp.flatten()).sum()
+        ok += batch_ok
+        nok += bsize - batch_ok
         print("batch %d / %d : total OK %d NOK %d" % (i + 1, n_batches, ok, nok))
 
     acc = 100.0 * ok / (total)

diff --git a/src/finn/qnn-data/test_ext_weights/cnv-w2a2-extw.json b/src/finn/qnn-data/test_ext_weights/cnv-w2a2-extw.json
@@ -0,0 +1,116 @@
+{
+  "Defaults": {},
+  "Thresholding_rtl_0": {
+    "PE": 3
+  },
+  "ConvolutionInputGenerator_rtl_0": {
+    "SIMD": 3,
+    "ram_style": "distributed"
+  },
+  "MVAU_hls_0": {
+    "PE": 8,
+    "SIMD": 3,
+    "ram_style": "auto",
+    "mem_mode": "internal_decoupled"
+  },
+  "Thresholding_rtl_1": {
+    "PE": 1
+  },
+  "ConvolutionInputGenerator_rtl_1": {
+    "SIMD": 16,
+    "ram_style": "distributed"
+  },
+  "MVAU_hls_1": {
+    "PE": 16,
+    "SIMD": 16,
+    "mem_mode": "external"
+  },
+  "Thresholding_rtl_2": {
+    "PE": 1
+  },
+  "StreamingMaxPool_hls_0": {
+    "PE": 1
+  },
+  "ConvolutionInputGenerator_rtl_2": {
+    "SIMD": 16,
+    "ram_style": "distributed"
+  },
+  "MVAU_hls_2": {
+    "PE": 8,
+    "SIMD": 16,
+    "ram_style": "auto",
+    "mem_mode": "internal_decoupled"
+  },
+  "Thresholding_rtl_3": {
+    "PE": 1
+  },
+  "ConvolutionInputGenerator_rtl_3": {
+    "SIMD": 16,
+    "ram_style": "distributed"
+  },
+  "MVAU_hls_3": {
+    "PE": 8,
+    "SIMD": 16,
+    "ram_style": "auto",
+    "mem_mode": "internal_decoupled"
+  },
+  "Thresholding_rtl_4": {
+    "PE": 1
+  },
+  "StreamingMaxPool_hls_1": {
+    "PE": 1
+  },
+  "ConvolutionInputGenerator_rtl_4": {
+    "SIMD": 8,
+    "ram_style": "distributed"
+  },
+  "MVAU_hls_4": {
+    "PE": 4,
+    "SIMD": 8,
+    "ram_style": "auto",
+    "mem_mode": "internal_decoupled"
+  },
+  "Thresholding_rtl_5": {
+    "PE": 1
+  },
+  "ConvolutionInputGenerator_rtl_5": {
+    "SIMD": 8,
+    "ram_style": "distributed"
+  },
+  "MVAU_hls_5": {
+    "PE": 1,
+    "SIMD": 8,
+    "ram_style": "auto",
+    "mem_mode": "internal_decoupled"
+  },
+  "Thresholding_rtl_6": {
+    "PE": 1
+  },
+  "MVAU_hls_6": {
+    "PE": 1,
+    "SIMD": 2,
+    "ram_style": "block",
+    "mem_mode": "internal_decoupled"
+  },
+  "Thresholding_rtl_7": {
+    "PE": 1
+  },
+  "MVAU_hls_7": {
+    "PE": 2,
+    "SIMD": 2,
+    "ram_style": "auto",
+    "mem_mode": "internal_decoupled"
+  },
+  "Thresholding_rtl_8": {
+    "PE": 1
+  },
+  "MVAU_hls_8": {
+    "PE": 5,
+    "SIMD": 1,
+    "ram_style": "distributed",
+    "mem_mode": "internal_decoupled"
+  },
+  "LabelSelect_hls_0": {
+    "PE": 1
+  }
+}
diff --git a/src/finn/qnn-data/test_ext_weights/specialize_layers_config_cnv.json b/src/finn/qnn-data/test_ext_weights/specialize_layers_config_cnv.json
@@ -0,0 +1,84 @@
+{
+  "Defaults": {},
+  "Thresholding_0": {
+    "preferred_impl_style": ""
+  },
+  "ConvolutionInputGenerator_0": {
+    "preferred_impl_style": ""
+  },
+  "MVAU_0": {
+    "preferred_impl_style": "hls"
+  },
+  "Thresholding_1": {
+    "preferred_impl_style": ""
+  },
+  "ConvolutionInputGenerator_1": {
+    "preferred_impl_style": ""
+  },
+  "MVAU_1": {
+    "preferred_impl_style": "hls"
+  },
+  "Thresholding_2": {
+    "preferred_impl_style": ""
+  },
+  "StreamingMaxPool_0": {
+    "preferred_impl_style": ""
+  },
+  "ConvolutionInputGenerator_2": {
+    "preferred_impl_style": ""
+  },
+  "MVAU_2": {
+    "preferred_impl_style": "hls"
+  },
+  "Thresholding_3": {
+    "preferred_impl_style": ""
+  },
+  "ConvolutionInputGenerator_3": {
+    "preferred_impl_style": ""
+  },
+  "MVAU_3": {
+    "preferred_impl_style": "hls"
+  },
+  "Thresholding_4": {
+    "preferred_impl_style": ""
+  },
+  "StreamingMaxPool_1": {
+    "preferred_impl_style": ""
+  },
+  "ConvolutionInputGenerator_4": {
+    "preferred_impl_style": ""
+  },
+  "MVAU_4": {
+    "preferred_impl_style": "hls"
+  },
+  "Thresholding_5": {
+    "preferred_impl_style": ""
+  },
+  "ConvolutionInputGenerator_5": {
+    "preferred_impl_style": ""
+  },
+  "MVAU_5": {
+    "preferred_impl_style": "hls"
+  },
+  "Thresholding_6": {
+    "preferred_impl_style": ""
+  },
+  "MVAU_6": {
+    "preferred_impl_style": "hls"
+  },
+  "Thresholding_7": {
+    "preferred_impl_style": ""
+  },
+  "MVAU_7": {
+    "preferred_impl_style": "hls"
+  },
+  "Thresholding_8": {
+    "preferred_impl_style": ""
+  },
+  "MVAU_8": {
+    "preferred_impl_style": "hls"
+  },
+  "LabelSelect_0": {
+    "preferred_impl_style": ""
+  }
+}
diff --git a/src/finn/qnn-data/test_ext_weights/specialize_layers_config.json b/src/finn/qnn-data/test_ext_weights/specialize_layers_config_tfc.json
@@ -4,7 +4,7 @@
     "preferred_impl_style": "rtl"
   },
   "MVAU_0": {
-    "preferred_impl_style": "rtl"
+    "preferred_impl_style": "hls"
   },
   "Thresholding_1": {
     "preferred_impl_style": "rtl"
@@ -16,13 +16,13 @@
     "preferred_impl_style": "rtl"
   },
   "MVAU_2": {
-    "preferred_impl_style": "rtl"
+    "preferred_impl_style": "hls"
   },
   "Thresholding_3": {
     "preferred_impl_style": "rtl"
   },
   "MVAU_3": {
-    "preferred_impl_style": "rtl"
+    "preferred_impl_style": "hls"
   },
   "LabelSelect_0": {
     "preferred_impl_style": "hls"

diff --git a/src/finn/qnn-data/test_ext_weights/tfc-w2a2-extw.json b/src/finn/qnn-data/test_ext_weights/tfc-w2a2-extw.json
@@ -3,31 +3,31 @@
     "Thresholding_rtl_0": {
       "PE": 49
     },
-    "MVAU_rtl_0": {
+    "MVAU_hls_0": {
       "PE": 16,
       "SIMD": 49,
       "ram_style": "block"
     },
     "Thresholding_rtl_1": {
       "PE": 16
     },
-    "MVAU_hls_0": {
+    "MVAU_hls_1": {
       "PE": 8,
       "SIMD": 8,
       "mem_mode": "external"
     },
     "Thresholding_rtl_2": {
       "PE": 8
     },
-    "MVAU_rtl_1": {
+    "MVAU_hls_2": {
       "PE": 8,
       "SIMD": 8,
       "mem_mode": "external"
     },
     "Thresholding_rtl_3": {
       "PE": 8
     },
-    "MVAU_rtl_2": {
+    "MVAU_hls_3": {
       "PE": 10,
       "SIMD": 8,
       "ram_style": "distributed"

diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py
@@ -216,6 +216,7 @@ def apply(self, model):
         os.makedirs(weights_dir)
         idma_idx = 0
         ext_weight_dma_cnt = 0
+        ext_weight_shapes_dict = {}
 
         for node in model.graph.node:
             assert (
@@ -236,6 +237,15 @@ def apply(self, model):
                 assert df_model.graph.node[0].op_type == "IODMA_hls"
                 iodma_node = getCustomOp(df_model.graph.node[0])
                 if iodma_node.get_nodeattr("burstMode") == "wrap":  # input weights dma?
+                    dma_sdp_output = sdp_inst.onnx_node.output[0]
+                    dma_target_sdp = getCustomOp(model.find_consumer(dma_sdp_output))
+                    dma_target_model = ModelWrapper(dma_target_sdp.get_nodeattr("model"))
+                    iodma_output_tensor = iodma_node.onnx_node.output[0]
+                    dma_consumer = dma_target_model.find_consumer(iodma_output_tensor)
+                    ext_weight_shapes_dict[idma_name] = dma_target_model.get_tensor_shape(
+                        dma_consumer.output[0]
+                    )
+
                     init_tensor = df_model.get_initializer(iodma_node.onnx_node.input[0])
                     ext_weight_dma_cnt += 1
                     w_dtype = df_model.get_tensor_datatype(iodma_node.onnx_node.input[0])
@@ -261,6 +271,7 @@ def apply(self, model):
         driver = driver.replace("$NUM_INPUTS$", str(len(idma_names)))
         driver = driver.replace("$NUM_OUTPUTS$", str(len(odma_names)))
         driver = driver.replace("$EXT_WEIGHT_NUM$", str(ext_weight_dma_cnt))
+        driver = driver.replace("$EXT_WEIGHT_INPUT_SHAPES$", str(ext_weight_shapes_dict))
 
         with open(driver_py, "w") as f:
             f.write(driver)

diff --git a/src/finn/transformation/fpgadataflow/template_driver.py b/src/finn/transformation/fpgadataflow/template_driver.py
@@ -82,6 +82,7 @@
     "input_dma_name" : $INPUT_DMA_NAME$,
     "output_dma_name" : $OUTPUT_DMA_NAME$,
     "number_of_external_weights": $EXT_WEIGHT_NUM$,
+    "external_weights_input_shapes": $EXT_WEIGHT_INPUT_SHAPES$,
     "num_inputs" : $NUM_INPUTS$,
     "num_outputs" : $NUM_OUTPUTS$,
 }