Skip to content

Commit 8f875cd

Browse files
author
Lukas Stasytis
committed
Analytical FIFO sizing with a set of tests for generation trees and fifo derivation transformations. Swapping fifo sizing step to before stitch currently breaks stitching
1 parent 54ede86 commit 8f875cd

27 files changed

+3540
-269
lines changed

run-docker.sh

100755100644
Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -300,4 +300,3 @@ else
300300
fi
301301

302302
echo $CMD_TO_RUN
303-
$CMD_TO_RUN

src/finn/analysis/fpgadataflow/dataflow_performance.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929

3030
from qonnx.custom_op.registry import getCustomOp
3131

32+
from finn.util.basic import decompress_string_to_numpy
3233
from finn.util.fpgadataflow import is_hls_node, is_rtl_node
3334

3435

@@ -76,3 +77,84 @@ def dataflow_performance(model):
7677
"max_cycles": int(max_cycles),
7778
"max_cycles_node_name": max_node_name,
7879
}
80+
81+
82+
def max_period(model):
    """Extract the longest characterized period among all nodes in the graph.

    For every HLS/RTL node that is not an AddStreams, DuplicateStreams or
    StreamingFIFO node, the node period is half the length of the first entry
    of the decompressed ``io_chrc_in`` / ``io_chrc_out`` node attribute,
    whichever of the two is larger.

    Preconditions:
    - model consists of HLS/RTL nodes
    - every considered node carries the ``io_chrc_in`` and ``io_chrc_out``
      attributes (presumably written by the token access vector derivation
      step -- confirm against derive_characteristic)

    Returns a dict with:
    - max_cycles : largest node period found, 0 if no node qualifies
    """
    # built once up front instead of once per visited node
    skipped_op_types = (
        "AddStreams_hls",
        "DuplicateStreams_hls",
        "StreamingFIFO_hls",
        "StreamingFIFO_rtl",
    )
    max_cycles = 0

    for node in model.graph.node:
        if node is None or node.op_type in skipped_op_types:
            continue
        if not (is_hls_node(node) or is_rtl_node(node)):
            continue
        inst = getCustomOp(node)
        # input attribute is read before the output one, as before
        node_cycles = max(
            len(decompress_string_to_numpy(inst.get_nodeattr(attr))[0]) // 2
            for attr in ("io_chrc_in", "io_chrc_out")
        )
        max_cycles = max(max_cycles, node_cycles)

    return {
        "max_cycles": int(max_cycles),
    }
121+
122+
123+
def max_remaining_period(model, node):
    """Extract the longest characterized period from ``node`` to the end of the graph.

    Only ``node`` itself and the nodes listed after it in ``model.graph.node``
    are inspected. Among those, every HLS/RTL node that is not an AddStreams,
    DuplicateStreams or StreamingFIFO node contributes half the length of the
    first entry of its decompressed ``io_chrc_in`` / ``io_chrc_out``
    attribute, whichever of the two is larger.

    Preconditions:
    - model consists of HLS/RTL nodes
    - node is an element of model.graph.node
    - every considered node carries the ``io_chrc_in`` and ``io_chrc_out``
      attributes (presumably written by the token access vector derivation
      step -- confirm against derive_characteristic)

    Returns a dict with:
    - max_cycles : largest node period found, 0 if no node qualifies

    Raises ValueError if node is not part of model.graph.node.
    """
    skipped_op_types = (
        "AddStreams_hls",
        "DuplicateStreams_hls",
        "StreamingFIFO_hls",
        "StreamingFIFO_rtl",
    )
    graph_nodes = list(model.graph.node)
    start_index = graph_nodes.index(node)
    max_cycles = 0

    # a separate loop name keeps the ``node`` argument from being overwritten
    for candidate in graph_nodes[start_index:]:
        if candidate is None or candidate.op_type in skipped_op_types:
            continue
        if not (is_hls_node(candidate) or is_rtl_node(candidate)):
            continue
        inst = getCustomOp(candidate)
        # NOTE(review): the previous version also read "io_chrc_period" here but
        # overwrote the value before using it; that dead read has been dropped.
        node_cycles = max(
            len(decompress_string_to_numpy(inst.get_nodeattr(attr))[0]) // 2
            for attr in ("io_chrc_in", "io_chrc_out")
        )
        max_cycles = max(max_cycles, node_cycles)

    return {
        "max_cycles": int(max_cycles),
    }

src/finn/builder/build_dataflow_config.py

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,30 @@
4040

4141
class AutoFIFOSizingMethod(str, Enum):
4242
"Select the type of automatic FIFO sizing strategy."
43-
44-
CHARACTERIZE = "characterize"
43+
ANALYTIC = "analytical"
4544
LARGEFIFO_RTLSIM = "largefifo_rtlsim"
4645

4746

47+
class TAVGenerationMethod(str, Enum):
    """Strategy used to construct the token access vectors of an operator."""

    # deduce the vectors empirically by running rtlsim for each node
    RTLSIM = "rtlsim"
    # use the tree model of an operator if available, avoiding IP generation
    TREE_MODEL = "tree_model"
51+
52+
53+
class TAVUtilizationMethod(str, Enum):
    """Strategy for turning an operator's token access vectors into buffer sizes."""

    # relax using the worst-case ratio of data rates between a consumer
    # and a producer
    CONSERVATIVE_RELAXATION = "conservative_relaxation"

    # relax using the average-case ratio of data rates between a consumer
    # and a producer
    AGGRESSIVE_RELAXATION = "aggressive_relaxation"

    # apply no relaxation: the token access vectors are used as-is
    NO_RELAXATION = "no_relaxation"
65+
66+
4867
class ShellFlowType(str, Enum):
4968
"""For builds that produce a bitfile, select the shell flow that will integrate
5069
the FINN-generated accelerator."""
@@ -117,9 +136,9 @@ class VerificationStepType(str, Enum):
117136
"step_apply_folding_config",
118137
"step_minimize_bit_width",
119138
"step_generate_estimate_reports",
139+
"step_set_fifo_depths",
120140
"step_hw_codegen",
121141
"step_hw_ipgen",
122-
"step_set_fifo_depths",
123142
"step_create_stitched_ip",
124143
"step_measure_rtlsim_performance",
125144
"step_out_of_context_synthesis",
@@ -266,6 +285,10 @@ class DataflowBuildConfig:
266285
#: for each FIFO.
267286
auto_fifo_depths: Optional[bool] = True
268287

288+
#: Whether synthesis should be performed in the fifo sizing step
289+
#: in case a node does not have an rtlsim prepared to generate TAVs
290+
just_in_time_synthesis: Optional[bool] = True
291+
269292
#: Whether FIFO nodes with depth larger than 32768 will be split.
270293
#: Allow to configure very large FIFOs in the folding_config_file.
271294
split_large_fifos: Optional[bool] = False
@@ -274,6 +297,26 @@ class DataflowBuildConfig:
274297
#: setting the FIFO sizes.
275298
auto_fifo_strategy: Optional[AutoFIFOSizingMethod] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM
276299

300+
#: Which strategy will be used for token access vector generation for FIFO sizing.
301+
#: RTLSIM will result in performing RTLSIM for each node
302+
#: to deduce the token access vectors empirically
303+
#: TREE_MODEL will use the tree model of an operator if available, avoiding the generation
304+
#: of IP cores.
305+
tav_generation_strategy: Optional[TAVGenerationMethod] = TAVGenerationMethod.RTLSIM
306+
307+
#: Which strategy will be used for utilizing token access vectors for FIFO sizing.
308+
#: CONSERVATIVE_RELAXATION uses the worst-case ratio of data rates
309+
#: between a consumer and producer,
310+
#: AGGRESSIVE_RELAXATION uses the average-case ratio of data rates and
311+
#: NO_RELAXATION uses the token access vectors as-is.
312+
tav_utilization_strategy: Optional[
313+
TAVUtilizationMethod
314+
] = TAVUtilizationMethod.CONSERVATIVE_RELAXATION
315+
316+
#: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test
317+
#: if set to True, always using Python instead
318+
force_python_rtlsim: Optional[bool] = False
319+
277320
#: Memory resource type for large FIFOs
278321
#: Only relevant when `auto_fifo_depths = True`
279322
large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO

src/finn/builder/build_dataflow_steps.py

Lines changed: 89 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,10 @@
5555

5656
import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw
5757
import finn.transformation.streamline.absorb as absorb
58-
from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
58+
from finn.analysis.fpgadataflow.dataflow_performance import (
59+
dataflow_performance,
60+
max_period,
61+
)
5962
from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
6063
from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation
6164
from finn.analysis.fpgadataflow.op_and_param_counts import (
@@ -82,8 +85,13 @@
8285
)
8386
from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP
8487
from finn.transformation.fpgadataflow.derive_characteristic import (
85-
DeriveCharacteristic,
88+
DelayCharacteristicFunctions,
8689
DeriveFIFOSizes,
90+
DeriveTokenAccessVectors,
91+
HandleBranches,
92+
JustInTimeSynthesize,
93+
LocalStretchCharacteristicFunctions,
94+
ProducerDelayCharacteristicFunctions,
8795
)
8896
from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP
8997
from finn.transformation.fpgadataflow.insert_dwc import InsertDWC
@@ -104,6 +112,7 @@
104112
)
105113
from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode
106114
from finn.transformation.fpgadataflow.set_fifo_depths import (
115+
CapConvolutionFIFODepths,
107116
InsertAndSetFIFODepths,
108117
RemoveShallowFIFOs,
109118
SplitLargeFIFOs,
@@ -548,29 +557,92 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
548557
"""
549558

550559
if cfg.auto_fifo_depths:
551-
if cfg.auto_fifo_strategy == "characterize":
552-
model = model.transform(InsertDWC())
553-
model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
554-
model = model.transform(GiveUniqueNodeNames())
560+
model = model.transform(InsertDWC())
561+
model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
562+
model = model.transform(GiveUniqueNodeNames())
563+
model = model.transform(AnnotateCycles())
564+
565+
if cfg.auto_fifo_strategy == "analytical":
566+
if cfg.just_in_time_synthesis:
567+
if cfg.tav_generation_strategy == "tree_model":
568+
only_jit_nodes_without_tree = True
569+
else:
570+
only_jit_nodes_without_tree = False
571+
model = model.transform(
572+
JustInTimeSynthesize(
573+
cfg._resolve_fpga_part(),
574+
cfg._resolve_hls_clk_period(),
575+
only_jit_nodes_without_tree,
576+
)
577+
)
578+
# model.save(f"{cfg.output_dir}/intermediate_models/step_rtl_generated_unsized.onnx")
579+
580+
period = int(model.analysis(dataflow_performance)["max_cycles"])
555581
model = model.transform(
556-
PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
582+
DeriveTokenAccessVectors(
583+
model, period, cfg.tav_generation_strategy, cfg._resolve_fpga_part(), 10.0
584+
)
585+
)
586+
587+
# model.save("rtlsim-derived_model.onnx")
588+
589+
period = int(model.analysis(dataflow_performance)["max_cycles"])
590+
model = model.transform(
591+
LocalStretchCharacteristicFunctions(
592+
1,
593+
period,
594+
nodes_to_ignore=[],
595+
)
596+
)
597+
598+
period = int(model.analysis(dataflow_performance)["max_cycles"])
599+
600+
model = model.transform(HandleBranches(model, period))
601+
602+
period = int(model.analysis(dataflow_performance)["max_cycles"])
603+
model = model.transform(
604+
DelayCharacteristicFunctions(
605+
1,
606+
period,
607+
nodes_to_ignore=[],
608+
)
609+
)
610+
611+
period = int(model.analysis(dataflow_performance)["max_cycles"])
612+
613+
model = model.transform(
614+
ProducerDelayCharacteristicFunctions(
615+
1,
616+
period,
617+
nodes_to_ignore=[],
618+
)
619+
)
620+
621+
period = int(model.analysis(max_period)["max_cycles"])
622+
623+
model = model.transform(
624+
DeriveFIFOSizes(
625+
period=period,
626+
nodes_to_ignore=[],
627+
global_offset_correction=True,
628+
tav_utilization_strategy=cfg.tav_utilization_strategy,
629+
)
557630
)
558-
model = model.transform(HLSSynthIP())
559-
model = model.transform(PrepareRTLSim())
560-
model = model.transform(AnnotateCycles())
561-
period = model.analysis(dataflow_performance)["max_cycles"] + 10
562-
model = model.transform(DeriveCharacteristic(period))
563-
model = model.transform(DeriveFIFOSizes())
631+
564632
model = model.transform(
565633
InsertFIFO(
566634
vivado_ram_style=cfg.large_fifo_mem_style,
567635
max_qsrl_depth=256,
568636
create_shallow_fifos=True,
569637
)
570638
)
639+
571640
model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
572641
model = model.transform(GiveUniqueNodeNames())
573642
model = model.transform(GiveReadableTensorNames())
643+
if cfg.default_swg_exception:
644+
model = model.transform(CapConvolutionFIFODepths(max_qsrl_depth=256))
645+
574646
elif cfg.auto_fifo_strategy == "largefifo_rtlsim":
575647
if cfg.fifosim_save_waveform:
576648
report_dir = cfg.output_dir + "/report"
@@ -620,6 +692,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
620692
"depth_trigger_uram",
621693
"depth_trigger_bram",
622694
]
695+
623696
extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs)
624697

625698
# perform FIFO splitting and shallow FIFO removal only after the final config
@@ -631,8 +704,9 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
631704

632705
# after FIFOs are ready to go, call PrepareIP and HLSSynthIP again
633706
# this will only run for the new nodes (e.g. FIFOs and DWCs)
634-
model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
635-
model = model.transform(HLSSynthIP())
707+
# model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
708+
# model = model.transform(HLSSynthIP())
709+
636710
return model
637711

638712

src/finn/custom_op/fpgadataflow/addstreams.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from qonnx.core.datatype import DataType
3333

3434
from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
35+
from finn.util.basic import Characteristic_Node
3536

3637

3738
class AddStreams(HWCustomOp):
@@ -149,7 +150,17 @@ def execute_node(self, context, graph):
149150
result = inp0_values + inp1_values
150151
context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)
151152

152-
def derive_characteristic_fxns(self, period):
153+
def prepare_tree_model(self):
    """Build the characteristic tree describing this AddStreams node.

    Returns the top-level Characteristic_Node, which wraps a single
    "passing addstreams layer" phase.
    """
    # NOTE(review): ChannelwiseOp exposes the equivalent hook as
    # get_tree_model -- confirm which method name callers expect.

    # product of the folded output shape without its first and last dimension
    n_steps = np.prod(self.get_folded_output_shape()[1:-1])

    # presumably [1, 1] stands for one read and one write per step -- confirm
    # against Characteristic_Node
    leaf_phase = Characteristic_Node("passing addstreams layer", [(n_steps, [1, 1])], True)

    # top level phase of this node
    return Characteristic_Node("compute addstreams", [(1, leaf_phase)], False)
160+
161+
def derive_token_access_vectors(
162+
self, model, period, strategy, fpga_part, clk_period, op_type, override_dict=None
163+
):
153164
n_inps = np.prod(self.get_folded_input_shape()[:-1])
154165
io_dict = {
155166
"inputs": {
@@ -158,4 +169,7 @@ def derive_characteristic_fxns(self, period):
158169
},
159170
"outputs": {"out0": []},
160171
}
161-
super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
172+
173+
super().derive_token_access_vectors(
174+
model, period, strategy, fpga_part, clk_period, op_type, override_dict=io_dict
175+
)

src/finn/custom_op/fpgadataflow/channelwise_op.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
from qonnx.util.basic import qonnx_make_model
3535

3636
from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
37+
from finn.util.basic import Characteristic_Node
3738

3839
# ONNX i/o tensor shape assumptions for channelwise ops:
3940
# input 0 is the input tensor, shape (..., NumChannels)
@@ -243,3 +244,13 @@ def execute_node(self, context, graph):
243244
sess = rt.InferenceSession(model_func.SerializeToString())
244245
result = sess.run(None, idict)
245246
context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)
247+
248+
def get_tree_model(self):
    """Build the characteristic tree describing this channelwise node.

    Returns the top-level Characteristic_Node, which wraps a single
    "passing channelwise layer" phase.
    """
    # key parameter: product of the folded output shape without its first
    # and last dimension
    n_steps = np.prod(self.get_folded_output_shape()[1:-1])

    # presumably [1, 1] stands for one read and one write per step -- confirm
    # against Characteristic_Node
    leaf_phase = Characteristic_Node("passing channelwise layer", [(n_steps, [1, 1])], True)

    # NOTE(review): the "compute pool" label looks copied from a pooling
    # operator; it is kept unchanged because it is a runtime string -- confirm
    # whether it should name the channelwise op instead.
    # top level phase of this node
    return Characteristic_Node("compute pool", [(1, leaf_phase)], False)

0 commit comments

Comments
 (0)