Softmax #1439

2 changes: 2 additions & 0 deletions docker/finn_entrypoint.sh
@@ -151,6 +151,8 @@ else
echo "See https://docs.xilinx.com/r/en-US/ug835-vivado-tcl-commands/Tcl-Initialization-Scripts"
fi

export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$HLS_PATH/lnx64/tools/fpo_v7_1"

export PATH=$PATH:$HOME/.local/bin
# execute the provided command(s) as root
exec "$@"
2 changes: 1 addition & 1 deletion fetch-repos.sh
@@ -32,7 +32,7 @@ QONNX_COMMIT="0630ceaee17799096d1750abcfb5bbe0a2877888"
FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
BREVITAS_COMMIT="4617f7bd136e96fa21c7f76e3c7e2e37fe563837"
CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
HLSLIB_COMMIT="5dde96382b84979c6caa6f34cdad2ac72fa28489"
HLSLIB_COMMIT="6c493c8f6f4302f6a84403d48e23cb8a912949a0"
OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a"
AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
2 changes: 2 additions & 0 deletions src/finn/custom_op/fpgadataflow/__init__.py
@@ -62,6 +62,7 @@ def register_custom_op(cls):
from finn.custom_op.fpgadataflow.fmpadding import FMPadding
from finn.custom_op.fpgadataflow.fmpadding_pixel import FMPadding_Pixel
from finn.custom_op.fpgadataflow.globalaccpool import GlobalAccPool
from finn.custom_op.fpgadataflow.hwsoftmax import HWSoftmax
from finn.custom_op.fpgadataflow.labelselect import LabelSelect
from finn.custom_op.fpgadataflow.lookup import Lookup
from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU
@@ -102,3 +103,4 @@ def register_custom_op(cls):
custom_op["StreamingDataWidthConverter"] = StreamingDataWidthConverter
custom_op["StreamingEltwise"] = StreamingEltwise
custom_op["UpsampleNearestNeighbour"] = UpsampleNearestNeighbour
custom_op["HWSoftmax"] = HWSoftmax
2 changes: 2 additions & 0 deletions src/finn/custom_op/fpgadataflow/hls/__init__.py
@@ -60,6 +60,7 @@ def register_custom_op(cls):
from finn.custom_op.fpgadataflow.hls.duplicatestreams_hls import DuplicateStreams_hls
from finn.custom_op.fpgadataflow.hls.fmpadding_pixel_hls import FMPadding_Pixel_hls
from finn.custom_op.fpgadataflow.hls.globalaccpool_hls import GlobalAccPool_hls
from finn.custom_op.fpgadataflow.hls.hwsoftmax_hls import HWSoftmax_hls
from finn.custom_op.fpgadataflow.hls.iodma_hls import IODMA_hls
from finn.custom_op.fpgadataflow.hls.labelselect_hls import LabelSelect_hls
from finn.custom_op.fpgadataflow.hls.lookup_hls import Lookup_hls
@@ -96,3 +97,4 @@ def register_custom_op(cls):
custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls
custom_op["MVAU_hls"] = MVAU_hls
custom_op["VVAU_hls"] = VVAU_hls
custom_op["HWSoftmax_hls"] = HWSoftmax_hls
126 changes: 126 additions & 0 deletions src/finn/custom_op/fpgadataflow/hls/hwsoftmax_hls.py
@@ -0,0 +1,126 @@
############################################################################
# Copyright (C) 2025, Advanced Micro Devices, Inc.
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#
# @author Shane T. Fleming <[email protected]>
############################################################################

import numpy as np
import os

from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
from finn.custom_op.fpgadataflow.hwsoftmax import HWSoftmax
from finn.util.basic import CppBuilder


class HWSoftmax_hls(HWSoftmax, HLSBackend):
    def __init__(self, onnx_node, **kwargs):
        super().__init__(onnx_node, **kwargs)
        self.set_nodeattr("hls_style", "freerunning")
        self.set_nodeattr("cpp_interface", "hls_vector")

    def get_nodeattr_types(self):
        my_attrs = {}
        my_attrs.update(HWSoftmax.get_nodeattr_types(self))
        my_attrs.update(HLSBackend.get_nodeattr_types(self))
        return my_attrs

    def global_includes(self):
        self.code_gen_dict["$GLOBALS$"] = [
            "#include <hls_vector.h>",
            '#include "softmax.hpp"',
            '#include "utils.hpp"',
        ]

    def defines(self, var):
        simd = self.get_nodeattr("SIMD")
        idtype = self.get_input_datatype()
        w = self.get_nodeattr("ifm_dim")[-1]
        self.code_gen_dict["$DEFINES$"] = [
            f"""
            constexpr unsigned SIMD = {simd};
            constexpr unsigned W = {w};
            using TI = {idtype.get_hls_datatype_str()};
            using F = float;
            """
        ]

    def docompute(self):
        self.code_gen_dict["$DOCOMPUTE$"] = [
            """
            static hls::stream<hls::vector<TI,SIMD>> src0;
            static hls::stream<hls::vector<float,SIMD>> dst0;

            move(in0_V, src0);
            static SoftMax<TI, float, W, SIMD> sm_inst;
            sm_inst.execute(src0, dst0);
            move(dst0, out0_V);
            """
        ]

    def blackboxfunction(self):
        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
            f"""
            void {self.onnx_node.name}(
                hls::stream<hls::vector<TI,SIMD>> &in0_V,
                hls::stream<hls::vector<float,SIMD>> &out0_V
            )
            """
        ]

    def pragmas(self):
        self.code_gen_dict["$PRAGMAS$"] = [
            """
            #pragma HLS interface AXIS port=in0_V
            #pragma HLS interface AXIS port=out0_V
            #pragma HLS aggregate variable=in0_V compact=bit
            #pragma HLS aggregate variable=out0_V compact=bit

            #pragma HLS interface ap_ctrl_none port=return
            #pragma HLS dataflow disable_start_propagation
            """
        ]

    def execute_node(self, context, graph):
        HLSBackend.execute_node(self, context, graph)

    def compile_singlenode_code(self):
        """Builds the bash script for compilation using the CppBuilder from
        finn.util.basic and executes the script to produce the executable."""
        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
        builder = CppBuilder()
        # to enable additional debug features, please uncomment the next line
        # builder.append_includes("-DDEBUG")
        builder.append_includes("-I$FINN_ROOT/src/finn/qnn-data/cpp")
        builder.append_includes("-I$FINN_ROOT/deps/cnpy/")
        builder.append_includes("-I$FINN_ROOT/deps/finn-hlslib")
        builder.append_includes("-I{}/include".format(os.environ["HLS_PATH"]))
        builder.append_includes("-I{}/include".format(os.environ["VITIS_PATH"]))
        builder.append_includes("--std=c++14")
        builder.append_includes("-O3")
        builder.append_sources(code_gen_dir + "/*.cpp")
        builder.append_sources("$FINN_ROOT/deps/cnpy/cnpy.cpp")
        builder.append_includes("-lz")
        builder.append_includes("-fno-builtin -fno-inline")
        builder.append_includes('-Wl,-rpath,"$HLS_PATH/lnx64/lib/csim"')
        builder.append_includes("-L$HLS_PATH/lnx64/lib/csim -lhlsmc++-GCC46")
        builder.append_includes('-Wl,-rpath,"$HLS_PATH/lnx64/tools/fpo_v7_1"')
        builder.append_includes("-L$HLS_PATH/lnx64/tools/fpo_v7_1 -lgmp -lmpfr")
        builder.append_includes("-lIp_floating_point_v7_1_bitacc_cmodel")
        builder.set_executable_path(code_gen_dir + "/node_model")
        builder.build(code_gen_dir)
        self.set_nodeattr("executable_path", builder.executable_path)

    def ipgen_extra_includes(self):
        """Add kernel-specific include paths."""
        kernel_dir = os.path.dirname(os.path.abspath(__file__))
        utils_dir = os.path.join(os.path.dirname(kernel_dir), "utils")
        return f"-I{kernel_dir} -I{utils_dir}"

    def timeout_value(self):
        """Set the simulation timeout to the total number of input elements,
        assuming one element is processed per clock cycle."""
        self.code_gen_dict["$TIMEOUT_VALUE$"] = [str(np.prod(self.get_normal_input_shape()))]
111 changes: 111 additions & 0 deletions src/finn/custom_op/fpgadataflow/hwsoftmax.py
@@ -0,0 +1,111 @@
############################################################################
# Copyright (C) 2025, Advanced Micro Devices, Inc.
# All rights reserved.
#
# SPDX-License-Identifier: BSD-3-Clause
#
# @author Shane T. Fleming <[email protected]>
############################################################################
import numpy as np
import warnings
from onnx.helper import make_node
from qonnx.core.datatype import DataType
from scipy.special import softmax

from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp


class HWSoftmax(HWCustomOp):
    """Abstraction layer for HW implementation of SoftMax layers."""

    def __init__(self, onnx_node, **kwargs):
        super().__init__(onnx_node, **kwargs)

    def get_nodeattr_types(self):
        my_attrs = {
            "ifm_dim": ("ints", True, []),
            "SIMD": ("i", False, 1),
            # FINN DataTypes for inputs, weights, outputs
            "input_data_type": ("s", True, ""),
            "NumChannels": ("i", False, 128),
        }
        my_attrs.update(super().get_nodeattr_types())
        return my_attrs

    def get_normal_input_shape(self, ind=0):
        return self.get_nodeattr("ifm_dim")

    def get_normal_output_shape(self, ind=0):
        return self.get_normal_input_shape()

    def get_number_output_values(self):
        folded_oshape = self.get_folded_output_shape()
        return np.prod(folded_oshape[:-1])

    def execute_node(self, context, graph):
        node = self.onnx_node
        input_data = context[node.input[0]]
        output_data = softmax(input_data, axis=-1)
        context[node.output[0]] = output_data

    def get_input_datatype(self, ind=0):
        """Returns FINN DataType of input."""
        data_type = DataType[self.get_nodeattr("input_data_type")]
        # the hlslib op always pads with zeros, so ensure that the DataType
        # is able to represent zeros
        assert data_type.allowed(0), "DataType must support zero"
        return data_type

    def make_shape_compatible_op(self, model):
        # Softmax preserves the input shape, so a plain ONNX Softmax node
        # can stand in for this op during shape inference
        return make_node(
            "Softmax",
            inputs=[self.onnx_node.input[0]],
            outputs=[self.onnx_node.output[0]],
        )

    def infer_node_datatype(self, model):
        node = self.onnx_node
        idt = model.get_tensor_datatype(node.input[0])
        if idt != self.get_input_datatype():
            warn_str = "input_data_type changing for %s: %s -> %s " % (
                node.name,
                str(self.get_input_datatype()),
                str(idt),
            )
            warnings.warn(warn_str)
            self.set_nodeattr("input_data_type", idt.name)

        # set output datatype from property
        odt = self.get_output_datatype()
        model.set_tensor_datatype(node.output[0], odt)

    def verify_node(self):
        raise NotImplementedError

    def get_instream_width(self, ind=0):
        ibits = self.get_input_datatype().bitwidth()
        simd = self.get_nodeattr("SIMD")
        return ibits * simd

    def get_outstream_width(self, ind=0):
        obits = self.get_output_datatype().bitwidth()
        simd = self.get_nodeattr("SIMD")
        return obits * simd

    def get_output_datatype(self, ind=0):
        """Returns FINN DataType of output."""
        return DataType["FLOAT32"]

    def get_folded_output_shape(self, ind=0):
        return self.get_folded_input_shape()

    def get_folded_input_shape(self, ind=0):
        normal_ishape = list(self.get_normal_input_shape())
        simd = self.get_nodeattr("SIMD")
        assert normal_ishape[-1] % simd == 0, "SIMD must divide the innermost input dimension"
        fold = normal_ishape[-1] // simd
        folded_ishape = normal_ishape[:-1] + [fold, simd]
        return tuple(folded_ishape)
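The folding arithmetic is easiest to see with concrete numbers; a worked sketch with an illustrative `ifm_dim` and `SIMD`:

```python
# worked example of get_folded_input_shape / get_number_output_values
# for an illustrative ifm_dim of [1, 8, 8, 128] and SIMD=4
import numpy as np

normal_ishape = [1, 8, 8, 128]
simd = 4
assert normal_ishape[-1] % simd == 0        # SIMD must divide the channel dim
fold = normal_ishape[-1] // simd            # 32 SIMD-wide words per row
folded = normal_ishape[:-1] + [fold, simd]  # [1, 8, 8, 32, 4]
n_outputs = int(np.prod(folded[:-1]))       # 2048 stream transactions
print(folded, n_outputs)
```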
41 changes: 41 additions & 0 deletions src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -1741,6 +1741,47 @@ def apply(self, model):
        return (model, graph_modified)


class InferHWSoftmax(Transformation):
    """Converts a regular Softmax node into an HWSoftmax node, without
    merging the preceding MultiThreshold into it, i.e. the softmax is not
    set up to perform the quantisation itself."""

    def __init__(self):
        super().__init__()

    def apply(self, model):
        graph = model.graph
        node_ind = 0
        graph_modified = False
        for n in graph.node:
            node_ind += 1
            if n.op_type == "Softmax":
                input_shape = model.get_tensor_shape(n.input[0])
                idt0 = model.get_tensor_datatype(n.input[0])
                odt0 = model.get_tensor_datatype(n.output[0])
                new_node = helper.make_node(
                    "HWSoftmax",
                    [n.input[0]],  # input tensor(s)
                    [n.output[0]],  # output tensor(s)
                    domain="finn.custom_op.fpgadataflow",
                    backend="fpgadataflow",
                    ifm_dim=input_shape,
                    input_data_type=idt0.name,
                    output_data_type=odt0.name,
                    name=n.name,
                    SIMD=1,
                    NumChannels=input_shape[-1],
                )
                # insert after the current position, matching the usual
                # FINN convert-to-hw pattern
                graph.node.insert(node_ind, new_node)
                graph.node.remove(n)
                graph_modified = True

        if graph_modified:
            model = model.transform(InferShapes())
            model = model.transform(InferDataTypes())
        return (model, graph_modified)
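Applying the transformation follows the usual FINN pattern; a sketch, assuming a model that still contains a standard ONNX Softmax node:

```python
# sketch: convert Softmax -> HWSoftmax on a QONNX model
# ("model_with_softmax.onnx" is a hypothetical file name)
from qonnx.core.modelwrapper import ModelWrapper
from finn.transformation.fpgadataflow.convert_to_hw_layers import InferHWSoftmax

model = ModelWrapper("model_with_softmax.onnx")
model = model.transform(InferHWSoftmax())
assert any(n.op_type == "HWSoftmax" for n in model.graph.node)
```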


# Lifts scalar to rank-1 tensor
def lift_to_rank1(name: str, model: ModelWrapper):
    # Scalars have a shape of length zero