diff --git a/fetch-repos.sh b/fetch-repos.sh index a4fc124fa4..ece06523ea 100755 --- a/fetch-repos.sh +++ b/fetch-repos.sh @@ -32,7 +32,7 @@ FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851" BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4" PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" -HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3" +HLSLIB_COMMIT="2e4adf1f34d71cc76a87da189e7eefcff94ebd9f" OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a" AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b" XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e" @@ -45,7 +45,7 @@ FINN_EXP_URL="https://github.com/Xilinx/finn-experimental.git" BREVITAS_URL="https://github.com/Xilinx/brevitas.git" PYVERILATOR_URL="https://github.com/maltanar/pyverilator.git" CNPY_URL="https://github.com/rogersce/cnpy.git" -HLSLIB_URL="https://github.com/Xilinx/finn-hlslib.git" +HLSLIB_URL="https://github.com/mdanilow/finn-hlslib.git" OMX_URL="https://github.com/maltanar/oh-my-xilinx.git" AVNET_BDF_URL="https://github.com/Avnet/bdf.git" XIL_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git" diff --git a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py index 05d26eddb2..cc8c161d73 100644 --- a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py +++ b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py @@ -26,10 +26,12 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import os import numpy as np from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour +from finn.custom_op.fpgadataflow import templates from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -58,48 +60,40 @@ def global_includes(self): def defines(self, var): self.code_gen_dict["$DEFINES$"] = [] - ifm_ch = self.get_nodeattr("NumChannels") - self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)] + HI = self.get_nodeattr("HI") + self.code_gen_dict["$DEFINES$"] += ["#define HI {}".format(HI)] - ibits = self.get_input_datatype().bitwidth() - self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)] + WI = self.get_nodeattr("WI") + self.code_gen_dict["$DEFINES$"] += ["#define WI {}".format(WI)] - idim = self.get_nodeattr("IFMDim") - self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)] + HO = self.get_nodeattr("HO") + self.code_gen_dict["$DEFINES$"] += ["#define HO {}".format(HO)] - odim = self.get_nodeattr("OFMDim") - self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)] + WO = self.get_nodeattr("WO") + self.code_gen_dict["$DEFINES$"] += ["#define WO {}".format(WO)] - batch_size = self.get_nodeattr("numInputVectors") - self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] + SIMD = self.get_nodeattr("SIMD") + self.code_gen_dict["$DEFINES$"] += ["#define SIMD {}".format(SIMD)] + + CF = self.get_nodeattr("NumChannels") // SIMD + self.code_gen_dict["$DEFINES$"] += ["#define CF {}".format(CF)] def docompute(self): - is_2d = self.get_nodeattr("DimMode") == 0 - batch = self.get_nodeattr("numInputVectors") - if is_2d: - self.code_gen_dict["$DOCOMPUTE$"] = [ - """UpsampleNearestNeighbour_Batch > (in0_%s, out_%s, numReps);""" - % (self.hls_sname(), self.hls_sname()) - ] - else: - assert batch == 1, "1D upsampler currently needs numReps=1" - self.code_gen_dict["$DOCOMPUTE$"] = [ 
- """UpsampleNearestNeighbour_1D > (in0_%s, out_%s);""" - % (self.hls_sname(), self.hls_sname()) - ] + self.code_gen_dict["$DOCOMPUTE$"] = [ + """upsample_nn(in0_%s, out_%s);""" + % (self.hls_sname(), self.hls_sname()) + ] def blackboxfunction(self): - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits + input_elem_hls_type = self.get_input_datatype().get_hls_datatype_str() + output_elem_hls_type = self.get_output_datatype().get_hls_datatype_str() self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + "void %s(hls::stream<hls::vector<%s, SIMD>> &in0_%s, hls::stream<hls::vector<%s, SIMD>> &out_%s)" % ( self.onnx_node.name, - packed_hls_type, + input_elem_hls_type, self.hls_sname(), - packed_hls_type, + output_elem_hls_type, self.hls_sname(), ) ] @@ -109,7 +103,6 @@ def execute_node(self, context, graph): node = self.onnx_node exp_ishape = self.get_normal_input_shape() exp_oshape = self.get_normal_output_shape() - folded_oshape = self.get_folded_output_shape() if mode == "cppsim": code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") @@ -138,7 +131,7 @@ def execute_node(self, context, graph): # load output npy file super().npy_to_dynamic_output(context) assert ( - context[node.output[0]].shape == folded_oshape + context[node.output[0]].shape == exp_oshape ), "cppsim did not produce expected folded output shape" context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) elif mode == "rtlsim": @@ -173,3 +166,97 @@ def execute_node(self, context, graph): context[node.output[0]].shape == exp_oshape ), """Output shape doesn't match expected shape (1, OutputDim, OutputDim, NumChannels).""" + + # def code_generation_cppsim(self, model): + # """Generates c++ code for simulation (cppsim).""" + # node = self.onnx_node + # path = self.get_nodeattr("code_gen_dir_cppsim") + # self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())] + # self.generate_params(model, path) + # self.global_includes() + # 
self.defines("cppsim") + # self.read_npy_data() + # self.strm_decl() + # self.pragmas() + # self.docompute() + # self.dataoutstrm() + # self.save_as_npy() + # self.timeout_value() + # self.timeout_condition() + # self.timeout_read_stream() + + # template = templates.docompute_template_timeout + + # for key in self.code_gen_dict: + # # transform list into long string separated by '\n' + # code_gen_line = "\n".join(self.code_gen_dict[key]) + # template = template.replace(key, code_gen_line) + # code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + # f = open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w") + # f.write(template) + # f.close() + # self.code_gen_dict.clear() + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + npy_type = "float" + self.code_gen_dict["$READNPYDATA$"] = [] + input_elem_hls_type = self.get_input_datatype().get_hls_datatype_str() + npy_in = "%s/input_0.npy" % (code_gen_dir) + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2vectorstream<%s, %s, SIMD>("%s", in0_%s);' + % ( + input_elem_hls_type, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def dataoutstrm(self): + npy_type = "float" + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + npy_out = "%s/output.npy" % code_gen_dir + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'vectorstream2npy<%s, %s, SIMD>(out_%s, %s, "%s");' + % ( + self.get_output_datatype().get_hls_datatype_str(), + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<hls::vector<{}, SIMD>> in0_{} ("in0_{}");'.format( + self.get_input_datatype().get_hls_datatype_str(), + self.hls_sname(), + self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<hls::vector<{}, SIMD>> out_{} 
("out_{}");'.format( + self.get_output_datatype().get_hls_datatype_str(), + self.hls_sname(), + self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<hls::vector<{}, SIMD>> debug_out_{} ("debug_out_{}");'.format( + self.get_output_datatype().get_hls_datatype_str(), + self.hls_sname(), + self.hls_sname() + ) + ) + + def pragmas(self): + super().pragmas() + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS aggregate variable=in0_%s compact=bit" % self.hls_sname()) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS aggregate variable=out_%s compact=bit" % self.hls_sname()) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS dataflow disable_start_propagation") + \ No newline at end of file diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py index d8210fd684..c03a9029db 100644 --- a/src/finn/custom_op/fpgadataflow/hlsbackend.py +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -474,3 +474,17 @@ def get_ap_int_max_w(self): ret = max([instream, outstream]) assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 8191" % ret return ret + + def timeout_value(self): + """Set timeout value for HLS functions defined for one clock cycle""" + self.code_gen_dict["$TIMEOUT_VALUE$"] = ["100"] + + def timeout_condition(self): + """Set timeout condition for HLS functions defined for one clock cycle""" + self.code_gen_dict["$TIMEOUT_CONDITION$"] = ["out_{}.empty()".format(self.hls_sname())] + + def timeout_read_stream(self): + """Set reading output stream procedure for HLS functions defined for one clock cycle""" + self.code_gen_dict["$TIMEOUT_READ_STREAM$"] = [ + "debug_out_{} << out_{}.read();".format(self.hls_sname(), self.hls_sname()) + ] diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index 3d89a0ab23..d2100a7516 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ 
-32,6 +32,7 @@ #define AP_INT_MAX_W $AP_INT_MAX_W$ #include "cnpy.h" #include "npy2apintstream.hpp" +#include "npy2vectorstream.hpp" #include #include "bnn-library.h" @@ -58,6 +59,51 @@ """ +# template for single node execution with timeout (for single clock hls operations) +docompute_template_timeout = """ +#define AP_INT_MAX_W $AP_INT_MAX_W$ +#include "cnpy.h" +#include "npy2apintstream.hpp" +#include "npy2vectorstream.hpp" +#include +#include "bnn-library.h" + +// includes for network parameters +$GLOBALS$ + +// defines for network parameters +$DEFINES$ + +int main(){ +$PRAGMAS$ + +$STREAMDECLARATIONS$ + +$READNPYDATA$ + +unsigned timeout = 0; +while(timeout < $TIMEOUT_VALUE$){ + +$DOCOMPUTE$ + +if($TIMEOUT_CONDITION$){ +timeout++; +} + +else{ +$TIMEOUT_READ_STREAM$ +timeout = 0; +} +} + +$DATAOUTSTREAM$ + +$SAVEASCNPY$ + +} + +""" + # templates for single node ip generation # cpp file diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py index 3348394e05..25da51f7a7 100644 --- a/src/finn/custom_op/fpgadataflow/upsampler.py +++ b/src/finn/custom_op/fpgadataflow/upsampler.py @@ -44,62 +44,53 @@ def __init__(self, onnx_node, **kwargs): def get_nodeattr_types(self): my_attrs = { - # Size of the output feature map - "OFMDim": ("i", True, 0), - # Size of the input feature map - "IFMDim": ("i", True, 0), + "SIMD": ("i", True, 0), + # Height, width of the output feature map + "HO": ("i", True, 0), + "WO": ("i", True, 0), + # Height, width of the input feature map + "HI": ("i", True, 0), + "WI": ("i", True, 0), # Amount of channels of the input feature map "NumChannels": ("i", True, 0), # FINN input datatype "inputDataType": ("s", True, ""), # Batch size - "numInputVectors": ("i", False, 1), - # Dimensionality mode: 0 = 2D square, 1 = 1D in H dim - "DimMode": ("i", False, 0), + "batchSize": ("i", False, 1), } my_attrs.update(super().get_nodeattr_types()) return my_attrs def get_exp_cycles(self): - OFMDim = 
self.get_nodeattr("OFMDim") - batch_size = self.get_nodeattr("numInputVectors") - is_2d = self.get_nodeattr("DimMode") == 0 - reps = 1 - if is_2d: - OFMDim = OFMDim * OFMDim - reps = batch_size - exp_cycles = OFMDim * reps - return int(exp_cycles) + return int(np.prod(self.get_folded_output_shape()[:-1])) def get_normal_input_shape(self, ind=0): - IFMDim = self.get_nodeattr("IFMDim") + batch = self.get_nodeattr("batchSize") + HI = self.get_nodeattr("HI") + WI = self.get_nodeattr("WI") num_ch = self.get_nodeattr("NumChannels") - batch = self.get_nodeattr("numInputVectors") - is_2d = self.get_nodeattr("DimMode") == 0 - if is_2d: - ishape = (batch, IFMDim, IFMDim, num_ch) - else: - ishape = (batch, IFMDim, 1, num_ch) + ishape = (batch, HI, WI, num_ch) return ishape def get_normal_output_shape(self, ind=0): - OFMDim = self.get_nodeattr("OFMDim") + batch = self.get_nodeattr("batchSize") + HO = self.get_nodeattr("HO") + WO = self.get_nodeattr("WO") num_ch = self.get_nodeattr("NumChannels") - batch = self.get_nodeattr("numInputVectors") - is_2d = self.get_nodeattr("DimMode") == 0 - if is_2d: - oshape = (batch, OFMDim, OFMDim, num_ch) - else: - oshape = (batch, OFMDim, 1, num_ch) + oshape = (batch, HO, WO, num_ch) return oshape def get_folded_input_shape(self, ind=0): - normal_ishape = list(self.get_normal_input_shape()) - return tuple(normal_ishape) + spatial_shape = list(self.get_normal_input_shape())[:-1] + simd = self.get_nodeattr("SIMD") + folds = self.get_nodeattr("NumChannels") // simd + return tuple(spatial_shape + [folds, simd]) def get_folded_output_shape(self, ind=0): - normal_oshape = list(self.get_normal_output_shape()) - return tuple(normal_oshape) + spatial_shape = list(self.get_normal_output_shape())[:-1] + simd = self.get_nodeattr("SIMD") + folds = self.get_nodeattr("NumChannels") // simd + return tuple(spatial_shape + [folds, simd]) def make_shape_compatible_op(self, model): exp_ishape = self.get_normal_input_shape() @@ -136,13 +127,13 @@ def 
get_output_datatype(self, ind=0): def get_instream_width(self, ind=0): ibits = self.get_input_datatype().bitwidth() - ifm_ch = self.get_nodeattr("NumChannels") - return ibits * ifm_ch + simd = self.get_nodeattr("SIMD") + return ibits * simd def get_outstream_width(self, ind=0): obits = self.get_output_datatype().bitwidth() - ifm_ch = self.get_nodeattr("NumChannels") - return obits * ifm_ch + simd = self.get_nodeattr("SIMD") + return obits * simd def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() @@ -153,17 +144,11 @@ def execute_node(self, context, graph): node = self.onnx_node inp_values = context[node.input[0]] ishape = inp_values.shape - odim = self.get_nodeattr("OFMDim") - idim = self.get_nodeattr("IFMDim") - if ishape[1] == ishape[2]: - scales_val = [1, int(round(odim / idim)), int(round(odim / idim)), 1] - elif ishape[1] > 1 and ishape[2] == 1: - scales_val = [1, int(round(odim / idim)), 1, 1] - else: - warnings.warn( - """HW abstraction layer for Upsample cannot be executed. 
- Upsampling only supported for 1D H, or 2D square scaling""" - ) + HO = self.get_nodeattr("HO") + WO = self.get_nodeattr("WO") + HI = self.get_nodeattr("HI") + WI = self.get_nodeattr("WI") + scales_val = [1, int(round(HO / HI)), int(round(WO / WI)), 1] oshape = context[node.output[0]].shape inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) scales = helper.make_tensor_value_info("scales", TensorProto.FLOAT, [4]) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index b02bc89db8..5935d237e6 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -319,33 +319,19 @@ def apply(self, model): ) # Assumes nhwc layout for scales and input - is_scale_square_2d = scales[1] == scales[2] - is_scale_1d = scales[1] > 1 and scales[2] == 1 - assert is_scale_square_2d or is_scale_1d, ( - "%s: Upsampling only supported for 1D H, or 2D square scaling" % n.name - ) assert scales[0] == scales[3] == 1, ( n.name + ": Upsampling is only supported for scales with " "the first and last dimensions being 1 in NHWC." ) - spatial_scale = scales[1] - assert spatial_scale == int(spatial_scale), ( - "%s: Upsampling is only supported for integer scales." % n.name - ) - is_shape_square_2d = in_shape[1] == in_shape[2] - is_shape_1d = in_shape[1] > 1 and in_shape[2] == 1 - - assert is_shape_square_2d or is_shape_1d, ( - "%s: Upsampling is only supported for 1D H or 2D square inputs." 
% n.name - ) # Extract information for HW node - IFMDim = in_shape[1] - OFMDim = int(round(in_shape[1] * spatial_scale)) + HI = in_shape[1] + WI = in_shape[2] + HO = int(round(HI * scales[1])) + WO = int(round(WI * scales[2])) NumChannels = in_shape[-1] - numInputVectors = in_shape[0] + batchSize = in_shape[0] inputDataType = dt.name - dim_mode = 0 if is_shape_square_2d else 1 # Insert the HWCustomOp node Upsample_HW_node = helper.make_node( @@ -354,12 +340,14 @@ def apply(self, model): [n.output[0]], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - OFMDim=OFMDim, - IFMDim=IFMDim, + SIMD=1, + HO=HO, + WO=WO, + HI=HI, + WI=WI, NumChannels=NumChannels, inputDataType=inputDataType, - numInputVectors=numInputVectors, - DimMode=dim_mode, + batchSize=batchSize, name="UpsampleNearestNeighbour_" + n.name, ) diff --git a/tests/fpgadataflow/test_fpgadataflow_upsampler.py b/tests/fpgadataflow/test_fpgadataflow_upsampler.py index 4539917878..c9a8d4379d 100644 --- a/tests/fpgadataflow/test_fpgadataflow_upsampler.py +++ b/tests/fpgadataflow/test_fpgadataflow_upsampler.py @@ -43,6 +43,7 @@ from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.make_input_chanlast import MakeInputChannelsLast from qonnx.util.cleanup import cleanup as qonnx_cleanup +from qonnx.custom_op.registry import getCustomOp from torch import nn import finn.core.onnx_exec as oxe @@ -101,26 +102,28 @@ def forward(self, x): # param datatype @pytest.mark.parametrize("dt", [DataType["INT8"]]) # spatial dim input feature map -@pytest.mark.parametrize("IFMDim", [3, 5]) +@pytest.mark.parametrize("IFMDim", [(6, 10), (12, 20)]) # upscaling factor -@pytest.mark.parametrize("scale", [2, 3]) +@pytest.mark.parametrize("scale", [2]) # Number of input/output channels -@pytest.mark.parametrize("NumChannels", [4]) +@pytest.mark.parametrize("NumChannels", [128, 256]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) # whether to use 1D or 2D square 
testcases -@pytest.mark.parametrize("is_1d", [False, True]) +@pytest.mark.parametrize("is_1d", [False]) +# parallelization level +@pytest.mark.parametrize("SIMD", [1, 16]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow -def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d): +def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d, SIMD): tmpdir = make_build_dir("upsample_export_") atol = 1e-3 if is_1d: - input_shape = (1, NumChannels, IFMDim, 1) + input_shape = (1, NumChannels, IFMDim[0], 1) upscale_factor = (scale, 1) else: - input_shape = (1, NumChannels, IFMDim, IFMDim) + input_shape = (1, NumChannels, IFMDim[0], IFMDim[1]) upscale_factor = (scale, scale) # Create the test model and inputs for it torch_model = PyTorchTestModel(upscale_factor=upscale_factor) @@ -165,6 +168,8 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d for n in model.get_finn_nodes(): node_check = n.op_type == "UpsampleNearestNeighbour" assert node_check, "All nodes should be UpsampleNearestNeighbour nodes." + inst = getCustomOp(n) + inst.set_nodeattr("SIMD", SIMD) test_in_transposed = test_in.numpy().transpose(_to_chan_last_args) input_dict = {model.graph.input[0].name: test_in_transposed}