Xilinx
diff --git a/‎run-docker.sh‎
100755100644
Lines changed: 0 additions & 1 deletion b/‎run-docker.sh‎
100755100644
Lines changed: 0 additions & 1 deletion
diff --git a/‎src/finn/analysis/fpgadataflow/dataflow_performance.py‎
Lines changed: 82 additions & 0 deletions b/‎src/finn/analysis/fpgadataflow/dataflow_performance.py‎
Lines changed: 82 additions & 0 deletions
diff --git a/‎src/finn/builder/build_dataflow_config.py‎
Lines changed: 46 additions & 3 deletions b/‎src/finn/builder/build_dataflow_config.py‎
Lines changed: 46 additions & 3 deletions
diff --git a/‎src/finn/builder/build_dataflow_steps.py‎
Lines changed: 89 additions & 15 deletions b/‎src/finn/builder/build_dataflow_steps.py‎
Lines changed: 89 additions & 15 deletions
diff --git a/‎src/finn/custom_op/fpgadataflow/addstreams.py‎
Lines changed: 16 additions & 2 deletions b/‎src/finn/custom_op/fpgadataflow/addstreams.py‎
Lines changed: 16 additions & 2 deletions
diff --git a/‎src/finn/custom_op/fpgadataflow/channelwise_op.py‎
Lines changed: 11 additions & 0 deletions b/‎src/finn/custom_op/fpgadataflow/channelwise_op.py‎
Lines changed: 11 additions & 0 deletions
@@ -300,4 +300,3 @@ else
 fi
 
 echo $CMD_TO_RUN
-$CMD_TO_RUN
@@ -29,6 +29,7 @@
 
 from qonnx.custom_op.registry import getCustomOp
 
+from finn.util.basic import decompress_string_to_numpy
 from finn.util.fpgadataflow import is_hls_node, is_rtl_node
 
 
@@ -76,3 +77,84 @@ def dataflow_performance(model):
         "max_cycles": int(max_cycles),
         "max_cycles_node_name": max_node_name,
     }
+
+
+def max_period(model):
+    """Return the largest characterized period among the nodes of the graph.
+
+    Only HLS/RTL nodes are considered, and nodes whose op_type is
+    AddStreams_hls, DuplicateStreams_hls, StreamingFIFO_hls or
+    StreamingFIFO_rtl are skipped. For every remaining node the period is
+    half the length of the first element of the decompressed "io_chrc_in"
+    and "io_chrc_out" node attributes, whichever of the two is larger.
+
+    Preconditions:
+    - the considered nodes carry the "io_chrc_in" and "io_chrc_out" node
+      attributes in the format understood by decompress_string_to_numpy
+
+    Returns a dict with a single entry:
+    - max_cycles : largest period found, 0 if no node was considered
+    """
+    # op types that are left out of the period computation
+    skipped_op_types = (
+        "AddStreams_hls",
+        "DuplicateStreams_hls",
+        "StreamingFIFO_hls",
+        "StreamingFIFO_rtl",
+    )
+    max_cycles = 0
+
+    for node in model.graph.node:
+        if node is None or node.op_type in skipped_op_types:
+            continue
+        if not (is_hls_node(node) or is_rtl_node(node)):
+            continue
+        inst = getCustomOp(node)
+        # NOTE(review): the halving presumably reflects vectors that span two
+        # periods -- confirm against the code that writes these attributes
+        node_cycles = max(
+            len(decompress_string_to_numpy(inst.get_nodeattr(attr))[0]) // 2
+            for attr in ("io_chrc_in", "io_chrc_out")
+        )
+        max_cycles = max(max_cycles, node_cycles)
+
+    return {
+        "max_cycles": int(max_cycles),
+    }
+
+
+def max_remaining_period(model, node):
+    """Return the largest characterized period among `node` and the nodes after it.
+
+    "After" refers to the position in model.graph.node: the scan starts at the
+    index of `node` and runs to the end of the node list. Only HLS/RTL nodes
+    are considered, and nodes whose op_type is AddStreams_hls,
+    DuplicateStreams_hls, StreamingFIFO_hls or StreamingFIFO_rtl are skipped.
+    For every remaining node the period is half the length of the first
+    element of the decompressed "io_chrc_in" and "io_chrc_out" node
+    attributes, whichever of the two is larger.
+
+    Preconditions:
+    - `node` is contained in model.graph.node
+    - the considered nodes carry the "io_chrc_in" and "io_chrc_out" node
+      attributes in the format understood by decompress_string_to_numpy
+
+    Returns a dict with a single entry:
+    - max_cycles : largest period found, 0 if no node was considered
+
+    Raises ValueError if `node` is not part of model.graph.node.
+    """
+    # op types that are left out of the period computation
+    skipped_op_types = (
+        "AddStreams_hls",
+        "DuplicateStreams_hls",
+        "StreamingFIFO_hls",
+        "StreamingFIFO_rtl",
+    )
+    max_cycles = 0
+    start_index = list(model.graph.node).index(node)
+
+    # use a distinct loop name so the `node` argument is not shadowed
+    for candidate in model.graph.node[start_index:]:
+        if candidate is None or candidate.op_type in skipped_op_types:
+            continue
+        if not (is_hls_node(candidate) or is_rtl_node(candidate)):
+            continue
+        inst = getCustomOp(candidate)
+        # NOTE(review): the halving presumably reflects vectors that span two
+        # periods -- confirm against the code that writes these attributes
+        node_cycles = max(
+            len(decompress_string_to_numpy(inst.get_nodeattr(attr))[0]) // 2
+            for attr in ("io_chrc_in", "io_chrc_out")
+        )
+        max_cycles = max(max_cycles, node_cycles)
+
+    return {
+        "max_cycles": int(max_cycles),
+    }
@@ -40,11 +40,30 @@
 
 class AutoFIFOSizingMethod(str, Enum):
     "Select the type of automatic FIFO sizing strategy."
-
-    CHARACTERIZE = "characterize"
+    ANALYTIC = "analytical"
     LARGEFIFO_RTLSIM = "largefifo_rtlsim"
 
 
+class TAVGenerationMethod(str, Enum):
+    "Select the strategy for constructing token access vectors of an operator."
+    # perform rtlsim for each node to deduce the token access vectors empirically
+    RTLSIM = "rtlsim"
+    # use the tree model of an operator if available, avoiding the generation
+    # of IP cores
+    TREE_MODEL = "tree_model"
+
+
+class TAVUtilizationMethod(str, Enum):
+    """Select the strategy for utilizing token access vectors of an operator
+    for buffer sizing.
+
+    Each member selects a different degree of relaxation; see the comment
+    above each member.
+    """
+
+    # worst-case ratio of data rates between a consumer and producer
+    CONSERVATIVE_RELAXATION = "conservative_relaxation"
+
+    # average-case ratio of data rates between a consumer and producer
+    AGGRESSIVE_RELAXATION = "aggressive_relaxation"
+
+    # no relaxation, use the token access vectors as-is
+    NO_RELAXATION = "no_relaxation"
+
+
 class ShellFlowType(str, Enum):
     """For builds that produce a bitfile, select the shell flow that will integrate
     the FINN-generated accelerator."""
@@ -117,9 +136,9 @@ class VerificationStepType(str, Enum):
     "step_apply_folding_config",
     "step_minimize_bit_width",
     "step_generate_estimate_reports",
+    "step_set_fifo_depths",
     "step_hw_codegen",
     "step_hw_ipgen",
-    "step_set_fifo_depths",
     "step_create_stitched_ip",
     "step_measure_rtlsim_performance",
     "step_out_of_context_synthesis",
@@ -266,6 +285,10 @@ class DataflowBuildConfig:
     #: for each FIFO.
     auto_fifo_depths: Optional[bool] = True
 
+    #: Whether synthesis should be performed in the fifo sizing step
+    #: in case a node does not have an rtlsim prepared to generate TAVs
+    just_in_time_synthesis: Optional[bool] = True
+
     #: Whether FIFO nodes with depth larger than 32768 will be split.
     #: Allow to configure very large FIFOs in the folding_config_file.
     split_large_fifos: Optional[bool] = False
@@ -274,6 +297,26 @@ class DataflowBuildConfig:
     #: setting the FIFO sizes.
     auto_fifo_strategy: Optional[AutoFIFOSizingMethod] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM
 
+    #: Which strategy will be used for token access vector generation for FIFO sizing.
+    #: RTLSIM will result in performing RTLSIM for each node
+    #: to deduce the token access vectors empirically
+    #: TREE_MODEL will use the tree model of an operator if available, avoiding the generation
+    #: of IP cores.
+    tav_generation_strategy: Optional[TAVGenerationMethod] = TAVGenerationMethod.RTLSIM
+
+    #: Which strategy will be used for utilizing token access vectors for FIFO sizing.
+    #: CONSERVATIVE_RELAXATION uses the worst-case ratio of data rates
+    #: between a consumer and producer.
+    #: AGGRESSIVE_RELAXATION uses the average-case ratio of data rates
+    #: between a consumer and producer.
+    #: NO_RELAXATION applies no relaxation and uses the token access vectors as-is.
+    tav_utilization_strategy: Optional[
+        TAVUtilizationMethod
+    ] = TAVUtilizationMethod.CONSERVATIVE_RELAXATION
+
+    #: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test
+    #: if set to True, always using Python instead
+    force_python_rtlsim: Optional[bool] = False
+
     #: Memory resource type for large FIFOs
     #: Only relevant when `auto_fifo_depths = True`
     large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO
 
@@ -55,7 +55,10 @@
 
 import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw
 import finn.transformation.streamline.absorb as absorb
-from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
+from finn.analysis.fpgadataflow.dataflow_performance import (
+    dataflow_performance,
+    max_period,
+)
 from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
 from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
 from finn.analysis.fpgadataflow.op_and_param_counts import (
@@ -82,8 +85,13 @@
 )
 from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
 from finn.transformation.fpgadataflow.derive_characteristic import (
-    DeriveCharacteristic,
+    DelayCharacteristicFunctions,
     DeriveFIFOSizes,
+    DeriveTokenAccessVectors,
+    HandleBranches,
+    JustInTimeSynthesize,
+    LocalStretchCharacteristicFunctions,
+    ProducerDelayCharacteristicFunctions,
 )
 from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
 from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
@@ -104,6 +112,7 @@
 )
 from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
 from finn.transformation.fpgadataflow.set_fifo_depths import (
+    CapConvolutionFIFODepths,
     InsertAndSetFIFODepths,
     RemoveShallowFIFOs,
     SplitLargeFIFOs,
@@ -548,29 +557,92 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
     """
 
     if cfg.auto_fifo_depths:
-        if cfg.auto_fifo_strategy == "characterize":
-            model = model.transform(InsertDWC())
-            model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
-            model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(InsertDWC())
+        model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
+        model = model.transform(GiveUniqueNodeNames())
+        model = model.transform(AnnotateCycles())
+
+        if cfg.auto_fifo_strategy == "analytical":
+            if cfg.just_in_time_synthesis:
+                if cfg.tav_generation_strategy == "tree_model":
+                    only_jit_nodes_without_tree = True
+                else:
+                    only_jit_nodes_without_tree = False
+                model = model.transform(
+                    JustInTimeSynthesize(
+                        cfg._resolve_fpga_part(),
+                        cfg._resolve_hls_clk_period(),
+                        only_jit_nodes_without_tree,
+                    )
+                )
+            # model.save(f"{cfg.output_dir}/intermediate_models/step_rtl_generated_unsized.onnx")
+
+            period = int(model.analysis(dataflow_performance)["max_cycles"])
             model = model.transform(
-                PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
+                DeriveTokenAccessVectors(
+                    model, period, cfg.tav_generation_strategy, cfg._resolve_fpga_part(), 10.0
+                )
+            )
+
+            # model.save("rtlsim-derived_model.onnx")
+
+            period = int(model.analysis(dataflow_performance)["max_cycles"])
+            model = model.transform(
+                LocalStretchCharacteristicFunctions(
+                    1,
+                    period,
+                    nodes_to_ignore=[],
+                )
+            )
+
+            period = int(model.analysis(dataflow_performance)["max_cycles"])
+
+            model = model.transform(HandleBranches(model, period))
+
+            period = int(model.analysis(dataflow_performance)["max_cycles"])
+            model = model.transform(
+                DelayCharacteristicFunctions(
+                    1,
+                    period,
+                    nodes_to_ignore=[],
+                )
+            )
+
+            period = int(model.analysis(dataflow_performance)["max_cycles"])
+
+            model = model.transform(
+                ProducerDelayCharacteristicFunctions(
+                    1,
+                    period,
+                    nodes_to_ignore=[],
+                )
+            )
+
+            period = int(model.analysis(max_period)["max_cycles"])
+
+            model = model.transform(
+                DeriveFIFOSizes(
+                    period=period,
+                    nodes_to_ignore=[],
+                    global_offset_correction=True,
+                    tav_utilization_strategy=cfg.tav_utilization_strategy,
+                )
             )
-            model = model.transform(HLSSynthIP())
-            model = model.transform(PrepareRTLSim())
-            model = model.transform(AnnotateCycles())
-            period = model.analysis(dataflow_performance)["max_cycles"] + 10
-            model = model.transform(DeriveCharacteristic(period))
-            model = model.transform(DeriveFIFOSizes())
+
             model = model.transform(
                 InsertFIFO(
                     vivado_ram_style=cfg.large_fifo_mem_style,
                     max_qsrl_depth=256,
                     create_shallow_fifos=True,
                 )
             )
+
             model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
             model = model.transform(GiveUniqueNodeNames())
             model = model.transform(GiveReadableTensorNames())
+            if cfg.default_swg_exception:
+                model = model.transform(CapConvolutionFIFODepths(max_qsrl_depth=256))
+
         elif cfg.auto_fifo_strategy == "largefifo_rtlsim":
             if cfg.fifosim_save_waveform:
                 report_dir = cfg.output_dir + "/report"
@@ -620,6 +692,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
         "depth_trigger_uram",
         "depth_trigger_bram",
     ]
+
     extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs)
 
     # perform FIFO splitting and shallow FIFO removal only after the final config
@@ -631,8 +704,9 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
 
     # after FIFOs are ready to go, call PrepareIP and HLSSynthIP again
     # this will only run for the new nodes (e.g. FIFOs and DWCs)
-    model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
-    model = model.transform(HLSSynthIP())
+    # model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
+    # model = model.transform(HLSSynthIP())
+
     return model
 
 
 
@@ -32,6 +32,7 @@
 from qonnx.core.datatype import DataType
 
 from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+from finn.util.basic import Characteristic_Node
 
 
 class AddStreams(HWCustomOp):
@@ -149,7 +150,17 @@ def execute_node(self, context, graph):
         result = inp0_values + inp1_values
         context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)
 
-    def derive_characteristic_fxns(self, period):
+    def prepare_tree_model(self):
+        """Build the tree model of this node and return its top-level phase.
+
+        The inner phase is repeated once per element of the folded output
+        shape, excluding its first and last dimension; the top-level phase
+        wraps that inner phase exactly once.
+        """
+        folded_oshape = self.get_folded_output_shape()
+        n_repeats = np.prod(folded_oshape[1:-1])
+
+        # NOTE(review): the meaning of the [1, 1] entry and of the trailing
+        # boolean is defined by Characteristic_Node -- confirm there
+        inner_phase = Characteristic_Node(
+            "passing addstreams layer", [(n_repeats, [1, 1])], True
+        )
+        return Characteristic_Node("compute addstreams", [(1, inner_phase)], False)
+
+    def derive_token_access_vectors(
+        self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None
+    ):
         n_inps = np.prod(self.get_folded_input_shape()[:-1])
         io_dict = {
             "inputs": {
@@ -158,4 +169,7 @@ def derive_characteristic_fxns(self, period):
             },
             "outputs": {"out0": []},
         }
-        super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
+
+        super().derive_token_access_vectors(
+            model, period, strategy, fpga_part, clk_period, op_type, override_dict=io_dict
+        )
@@ -34,6 +34,7 @@
 from qonnx.util.basic import qonnx_make_model
 
 from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+from finn.util.basic import Characteristic_Node
 
 # ONNX i/o tensor shape assumptions for channelwise ops:
 # input 0 is the input tensor, shape (..., NumChannels)
@@ -243,3 +244,13 @@ def execute_node(self, context, graph):
         sess = rt.InferenceSession(model_func.SerializeToString())
         result = sess.run(None, idict)
         context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)
+
+    def get_tree_model(self):
+        """Build the tree model of this node and return its top-level phase.
+
+        The inner phase is repeated once per element of the folded output
+        shape, excluding its first and last dimension; the top-level phase
+        wraps that inner phase exactly once.
+        """
+        folded_oshape = self.get_folded_output_shape()
+        n_repeats = np.prod(folded_oshape[1:-1])
+
+        # NOTE(review): the meaning of the [1, 1] entry and of the trailing
+        # boolean is defined by Characteristic_Node -- confirm there
+        inner_phase = Characteristic_Node(
+            "passing channelwise layer", [(n_repeats, [1, 1])], True
+        )
+        # NOTE(review): the label "compute pool" looks copied from a pooling
+        # op -- confirm whether it should name the channelwise op instead
+        return Characteristic_Node("compute pool", [(1, inner_phase)], False)
Original file line number	Diff line number	Diff line change
`@@ -300,4 +300,3 @@ else`
`300`	`300`	`fi`
`301`	`301`
`302`	`302`	`echo $CMD_TO_RUN`
`303`		`-$CMD_TO_RUN`