diff --git a/fetch-repos.sh b/fetch-repos.sh
index a4fc124fa4..ece06523ea 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -32,7 +32,7 @@ FINN_EXP_COMMIT="0724be21111a21f0d81a072fccc1c446e053f851"
 BREVITAS_COMMIT="d4834bd2a0fad3c1fbc0ff7e1346d5dcb3797ea4"
 PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
 CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3"
+HLSLIB_COMMIT="2e4adf1f34d71cc76a87da189e7eefcff94ebd9f"
 OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a"
 AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
 XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
@@ -45,7 +45,7 @@ FINN_EXP_URL="https://github.com/Xilinx/finn-experimental.git"
 BREVITAS_URL="https://github.com/Xilinx/brevitas.git"
 PYVERILATOR_URL="https://github.com/maltanar/pyverilator.git"
 CNPY_URL="https://github.com/rogersce/cnpy.git"
-HLSLIB_URL="https://github.com/Xilinx/finn-hlslib.git"
+HLSLIB_URL="https://github.com/mdanilow/finn-hlslib.git"
 OMX_URL="https://github.com/maltanar/oh-my-xilinx.git"
 AVNET_BDF_URL="https://github.com/Avnet/bdf.git"
 XIL_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git"
diff --git a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py
index 05d26eddb2..cc8c161d73 100644
--- a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py
+++ b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py
@@ -26,10 +26,12 @@
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
+import os
 import numpy as np
 
 from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
 from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour
+from finn.custom_op.fpgadataflow import templates
 from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
 
 
@@ -58,48 +60,40 @@ def global_includes(self):
     def defines(self, var):
+        """Emit the HI, WI, HO, WO, SIMD and CF #define lines into $DEFINES$ (var is unused)."""
         self.code_gen_dict["$DEFINES$"] = []
 
-        ifm_ch = self.get_nodeattr("NumChannels")
-        self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)]
+        self.code_gen_dict["$DEFINES$"] += ["#define HI {}".format(self.get_nodeattr("HI"))]
 
-        ibits = self.get_input_datatype().bitwidth()
-        self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)]
+        self.code_gen_dict["$DEFINES$"] += ["#define WI {}".format(self.get_nodeattr("WI"))]
 
-        idim = self.get_nodeattr("IFMDim")
-        self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)]
+        self.code_gen_dict["$DEFINES$"] += ["#define HO {}".format(self.get_nodeattr("HO"))]
 
-        odim = self.get_nodeattr("OFMDim")
-        self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)]
+        self.code_gen_dict["$DEFINES$"] += ["#define WO {}".format(self.get_nodeattr("WO"))]
 
-        batch_size = self.get_nodeattr("numInputVectors")
-        self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)]
+        SIMD = self.get_nodeattr("SIMD")
+        num_ch = self.get_nodeattr("NumChannels")
+        # CF is the number of SIMD-wide channel folds. It is computed with floor
+        # division, so a SIMD that does not divide NumChannels would silently
+        # drop the remaining channels from the generated code.
+        assert num_ch % SIMD == 0, "SIMD must divide NumChannels"
+        self.code_gen_dict["$DEFINES$"] += ["#define SIMD {}".format(SIMD)]
+        self.code_gen_dict["$DEFINES$"] += ["#define CF {}".format(num_ch // SIMD)]
 
     def docompute(self):
-        is_2d = self.get_nodeattr("DimMode") == 0
-        batch = self.get_nodeattr("numInputVectors")
-        if is_2d:
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """UpsampleNearestNeighbour_Batch<OFMDim, IFMDim, IFMChannels,
-                ap_uint<Input_precision> > (in0_%s, out_%s, numReps);"""
-                % (self.hls_sname(), self.hls_sname())
-            ]
-        else:
-            assert batch == 1, "1D upsampler currently needs numReps=1"
-            self.code_gen_dict["$DOCOMPUTE$"] = [
-                """UpsampleNearestNeighbour_1D<OFMDim, IFMDim, IFMChannels,
-                ap_uint<Input_precision> > (in0_%s, out_%s);"""
-                % (self.hls_sname(), self.hls_sname())
-            ]
+        """Emit the upsample_nn<HI, WI, HO, WO, CF> call on the in0_/out_ streams."""
+        sname = self.hls_sname()
+        call = "upsample_nn<HI, WI, HO, WO, CF>(in0_%s, out_%s);" % (sname, sname)
+        self.code_gen_dict["$DOCOMPUTE$"] = [call]
 
     def blackboxfunction(self):
-        packed_bits = self.get_instream_width()
-        packed_hls_type = "ap_uint<%d>" % packed_bits
+        """Emit the top-level function signature with hls::vector<elem, SIMD> stream ports."""
         self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
-            "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)"
+            "void %s(hls::stream<hls::vector<%s, SIMD>> &in0_%s, "
+            "hls::stream<hls::vector<%s, SIMD>> &out_%s)"
             % (
                 self.onnx_node.name,
-                packed_hls_type,
+                self.get_input_datatype().get_hls_datatype_str(),
                 self.hls_sname(),
-                packed_hls_type,
+                self.get_output_datatype().get_hls_datatype_str(),
                 self.hls_sname(),
             )
         ]
@@ -109,7 +103,6 @@ def execute_node(self, context, graph):
         node = self.onnx_node
         exp_ishape = self.get_normal_input_shape()
         exp_oshape = self.get_normal_output_shape()
-        folded_oshape = self.get_folded_output_shape()
 
         if mode == "cppsim":
             code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
@@ -138,7 +131,7 @@ def execute_node(self, context, graph):
             # load output npy file
             super().npy_to_dynamic_output(context)
             assert (
-                context[node.output[0]].shape == folded_oshape
+                context[node.output[0]].shape == exp_oshape
             ), "cppsim did not produce expected folded output shape"
             context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
         elif mode == "rtlsim":
@@ -173,3 +166,97 @@ def execute_node(self, context, graph):
             context[node.output[0]].shape == exp_oshape
         ), """Output shape doesn't match expected shape
             (1, OutputDim, OutputDim, NumChannels)."""
+
+    # def code_generation_cppsim(self, model):
+    #     """Generates c++ code for simulation (cppsim)."""
+    #     node = self.onnx_node
+    #     path = self.get_nodeattr("code_gen_dir_cppsim")
+    #     self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
+    #     self.generate_params(model, path)
+    #     self.global_includes()
+    #     self.defines("cppsim")
+    #     self.read_npy_data()
+    #     self.strm_decl()
+    #     self.pragmas()
+    #     self.docompute()
+    #     self.dataoutstrm()
+    #     self.save_as_npy()
+    #     self.timeout_value()
+    #     self.timeout_condition()
+    #     self.timeout_read_stream()
+
+    #     template = templates.docompute_template_timeout
+
+    #     for key in self.code_gen_dict:
+    #         # transform list into long string separated by '\n'
+    #         code_gen_line = "\n".join(self.code_gen_dict[key])
+    #         template = template.replace(key, code_gen_line)
+    #     code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+    #     f = open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w")
+    #     f.write(template)
+    #     f.close()
+    #     self.code_gen_dict.clear()
+
+    def read_npy_data(self):
+        """Generate the cppsim statement that loads the input .npy file.
+
+        Emits an npy2vectorstream call reading input_0.npy into in0_<sname>.
+        """
+        elem_type = self.get_input_datatype().get_hls_datatype_str()
+        npy_path = "%s/input_0.npy" % self.get_nodeattr("code_gen_dir_cppsim")
+        # "float" is the npy-side type template argument
+        read_stmt = 'npy2vectorstream<%s, %s, SIMD>("%s", in0_%s);' % (
+            elem_type,
+            "float",
+            npy_path,
+            self.hls_sname(),
+        )
+        self.code_gen_dict["$READNPYDATA$"] = [read_stmt]
+
+    def dataoutstrm(self):
+        """Generate the cppsim statement that dumps the output stream to output.npy.
+
+        The folded output shape is passed to vectorstream2npy as a C++ brace list.
+        """
+        out_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        # "(a, b, c)" -> "{a, b, c}"
+        shape_init = str(self.get_folded_output_shape()).translate(str.maketrans("()", "{}"))
+        write_stmt = 'vectorstream2npy<%s, %s, SIMD>(out_%s, %s, "%s");' % (
+            self.get_output_datatype().get_hls_datatype_str(),
+            "float",
+            self.hls_sname(),
+            shape_init,
+            "%s/output.npy" % out_dir,
+        )
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [write_stmt]
+
+    def strm_decl(self):
+        """Declare the C++ streams used by the generated code.
+
+        Three hls::stream<hls::vector<elem, SIMD>> objects are declared, in
+        this order, where <sname> is the value of self.hls_sname():
+          - in0_<sname>, carrying the input element type
+          - out_<sname>, carrying the output element type
+          - debug_out_<sname>, carrying the output element type
+        """
+        sname = self.hls_sname()
+        in_type = self.get_input_datatype().get_hls_datatype_str()
+        out_type = self.get_output_datatype().get_hls_datatype_str()
+
+        # {0}: element type, {1}: stream name prefix, {2}: stream name suffix
+        decl = 'hls::stream<hls::vector<{0}, SIMD>> {1}_{2} ("{1}_{2}");'
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = [
+            decl.format(elem_type, prefix, sname)
+            for prefix, elem_type in (
+                ("in0", in_type),
+                ("out", out_type),
+                ("debug_out", out_type),
+            )
+        ]
+
+    def pragmas(self):
+        """Extend the base-class pragmas with stream aggregate and dataflow directives."""
+        super().pragmas()
+        agg = "#pragma HLS aggregate variable=%s_%s compact=bit"
+        new = [agg % ("in0", self.hls_sname()), agg % ("out", self.hls_sname())]
+        self.code_gen_dict["$PRAGMAS$"] += new + ["#pragma HLS dataflow disable_start_propagation"]
diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py
index d8210fd684..c03a9029db 100644
--- a/src/finn/custom_op/fpgadataflow/hlsbackend.py
+++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py
@@ -474,3 +474,17 @@ def get_ap_int_max_w(self):
         ret = max([instream, outstream])
         assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 8191" % ret
         return ret
+
+    def timeout_value(self):
+        """Set $TIMEOUT_VALUE$, the loop bound of docompute_template_timeout, to "100"."""
+        self.code_gen_dict["$TIMEOUT_VALUE$"] = ["100"]
+
+    def timeout_condition(self):
+        """Set $TIMEOUT_CONDITION$ to the C++ test for an empty out_<sname> stream."""
+        self.code_gen_dict["$TIMEOUT_CONDITION$"] = ["out_%s.empty()" % self.hls_sname()]
+
+    def timeout_read_stream(self):
+        """Set $TIMEOUT_READ_STREAM$ to C++ that moves one element from out_ to debug_out_."""
+        sname = self.hls_sname()
+        read_stmt = "debug_out_%s << out_%s.read();" % (sname, sname)
+        self.code_gen_dict["$TIMEOUT_READ_STREAM$"] = [read_stmt]
diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py
index 3d89a0ab23..d2100a7516 100644
--- a/src/finn/custom_op/fpgadataflow/templates.py
+++ b/src/finn/custom_op/fpgadataflow/templates.py
@@ -32,6 +32,7 @@
 #define AP_INT_MAX_W $AP_INT_MAX_W$
 #include "cnpy.h"
 #include "npy2apintstream.hpp"
+#include "npy2vectorstream.hpp"
 #include <vector>
 #include "bnn-library.h"
 
@@ -58,6 +59,51 @@
 
 """
 
+# template for single node execution with a timeout loop (for single-clock-cycle HLS operations)
+docompute_template_timeout = """
+#define AP_INT_MAX_W $AP_INT_MAX_W$
+#include "cnpy.h"
+#include "npy2apintstream.hpp"
+#include "npy2vectorstream.hpp"
+#include <vector>
+#include "bnn-library.h"
+
+// includes for network parameters
+$GLOBALS$
+
+// defines for network parameters
+$DEFINES$
+
+int main(){
+$PRAGMAS$
+
+$STREAMDECLARATIONS$
+
+$READNPYDATA$
+
+unsigned timeout = 0;
+while(timeout < $TIMEOUT_VALUE$){
+
+$DOCOMPUTE$
+
+if($TIMEOUT_CONDITION$){
+timeout++;
+}
+
+else{
+$TIMEOUT_READ_STREAM$
+timeout = 0;
+}
+}
+
+$DATAOUTSTREAM$
+
+$SAVEASCNPY$
+
+}
+
+"""
+
 # templates for single node ip generation
 
 # cpp file
diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py
index 3348394e05..25da51f7a7 100644
--- a/src/finn/custom_op/fpgadataflow/upsampler.py
+++ b/src/finn/custom_op/fpgadataflow/upsampler.py
@@ -44,62 +44,53 @@ def __init__(self, onnx_node, **kwargs):
 
     def get_nodeattr_types(self):
         my_attrs = {
-            # Size of the output feature map
-            "OFMDim": ("i", True, 0),
-            # Size of the input feature map
-            "IFMDim": ("i", True, 0),
+            "SIMD": ("i", True, 0),  # channel elements per stream word (parallelism)
+            # Height, width of the output feature map
+            "HO": ("i", True, 0),
+            "WO": ("i", True, 0),
+            # Height, width of the input feature map
+            "HI": ("i", True, 0),
+            "WI": ("i", True, 0),
             # Amount of channels of the input feature map
             "NumChannels": ("i", True, 0),
             # FINN input datatype
             "inputDataType": ("s", True, ""),
             # Batch size
-            "numInputVectors": ("i", False, 1),
-            # Dimensionality mode: 0 = 2D square, 1 = 1D in H dim
-            "DimMode": ("i", False, 0),
+            "batchSize": ("i", False, 1),
         }
         my_attrs.update(super().get_nodeattr_types())
         return my_attrs
 
     def get_exp_cycles(self):
-        OFMDim = self.get_nodeattr("OFMDim")
-        batch_size = self.get_nodeattr("numInputVectors")
-        is_2d = self.get_nodeattr("DimMode") == 0
-        reps = 1
-        if is_2d:
-            OFMDim = OFMDim * OFMDim
-            reps = batch_size
-        exp_cycles = OFMDim * reps
-        return int(exp_cycles)
+        return int(np.prod(self.get_folded_output_shape()[:-1]))  # builtin int, not a NumPy scalar
 
     def get_normal_input_shape(self, ind=0):
-        IFMDim = self.get_nodeattr("IFMDim")
+        """Return the unfolded input shape (batchSize, HI, WI, NumChannels)."""
+        n = self.get_nodeattr("batchSize")
+        h, w = self.get_nodeattr("HI"), self.get_nodeattr("WI")
         num_ch = self.get_nodeattr("NumChannels")
-        batch = self.get_nodeattr("numInputVectors")
-        is_2d = self.get_nodeattr("DimMode") == 0
-        if is_2d:
-            ishape = (batch, IFMDim, IFMDim, num_ch)
-        else:
-            ishape = (batch, IFMDim, 1, num_ch)
+        ishape = (n, h, w, num_ch)
         return ishape
 
     def get_normal_output_shape(self, ind=0):
-        OFMDim = self.get_nodeattr("OFMDim")
+        """Return the unfolded output shape (batchSize, HO, WO, NumChannels)."""
+        n = self.get_nodeattr("batchSize")
+        h, w = self.get_nodeattr("HO"), self.get_nodeattr("WO")
         num_ch = self.get_nodeattr("NumChannels")
-        batch = self.get_nodeattr("numInputVectors")
-        is_2d = self.get_nodeattr("DimMode") == 0
-        if is_2d:
-            oshape = (batch, OFMDim, OFMDim, num_ch)
-        else:
-            oshape = (batch, OFMDim, 1, num_ch)
+        oshape = (n, h, w, num_ch)
         return oshape
 
     def get_folded_input_shape(self, ind=0):
-        normal_ishape = list(self.get_normal_input_shape())
-        return tuple(normal_ishape)
+        """Return (batchSize, HI, WI, NumChannels // SIMD, SIMD); SIMD must divide NumChannels."""
+        simd, num_ch = self.get_nodeattr("SIMD"), self.get_nodeattr("NumChannels")
+        assert num_ch % simd == 0, "SIMD must divide NumChannels"
+        return (*self.get_normal_input_shape()[:-1], num_ch // simd, simd)
 
     def get_folded_output_shape(self, ind=0):
-        normal_oshape = list(self.get_normal_output_shape())
-        return tuple(normal_oshape)
+        """Return (batchSize, HO, WO, NumChannels // SIMD, SIMD); SIMD must divide NumChannels."""
+        simd, num_ch = self.get_nodeattr("SIMD"), self.get_nodeattr("NumChannels")
+        assert num_ch % simd == 0, "SIMD must divide NumChannels"
+        return (*self.get_normal_output_shape()[:-1], num_ch // simd, simd)
 
     def make_shape_compatible_op(self, model):
         exp_ishape = self.get_normal_input_shape()
@@ -136,13 +127,13 @@ def get_output_datatype(self, ind=0):
 
     def get_instream_width(self, ind=0):
+        """Return the input stream width in bits: input datatype bitwidth times SIMD."""
         ibits = self.get_input_datatype().bitwidth()
-        ifm_ch = self.get_nodeattr("NumChannels")
-        return ibits * ifm_ch
+        return ibits * self.get_nodeattr("SIMD")
 
     def get_outstream_width(self, ind=0):
+        """Return the output stream width in bits: output datatype bitwidth times SIMD."""
         obits = self.get_output_datatype().bitwidth()
-        ifm_ch = self.get_nodeattr("NumChannels")
-        return obits * ifm_ch
+        return obits * self.get_nodeattr("SIMD")
 
     def get_number_output_values(self):
         folded_oshape = self.get_folded_output_shape()
@@ -153,17 +144,11 @@ def execute_node(self, context, graph):
         node = self.onnx_node
         inp_values = context[node.input[0]]
         ishape = inp_values.shape
-        odim = self.get_nodeattr("OFMDim")
-        idim = self.get_nodeattr("IFMDim")
-        if ishape[1] == ishape[2]:
-            scales_val = [1, int(round(odim / idim)), int(round(odim / idim)), 1]
-        elif ishape[1] > 1 and ishape[2] == 1:
-            scales_val = [1, int(round(odim / idim)), 1, 1]
-        else:
-            warnings.warn(
-                """HW abstraction layer for Upsample cannot be executed.
-            Upsampling only supported for 1D H, or 2D square scaling"""
-            )
+        HO = self.get_nodeattr("HO")
+        WO = self.get_nodeattr("WO")
+        HI = self.get_nodeattr("HI")
+        WI = self.get_nodeattr("WI")
+        scales_val = [1, int(round(HO / HI)), int(round(WO / WI)), 1]
         oshape = context[node.output[0]].shape
         inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape)
         scales = helper.make_tensor_value_info("scales", TensorProto.FLOAT, [4])
diff --git a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
index b02bc89db8..5935d237e6 100644
--- a/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
+++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py
@@ -319,33 +319,19 @@ def apply(self, model):
                 )
 
                 # Assumes nhwc layout for scales and input
-                is_scale_square_2d = scales[1] == scales[2]
-                is_scale_1d = scales[1] > 1 and scales[2] == 1
-                assert is_scale_square_2d or is_scale_1d, (
-                    "%s: Upsampling only supported for 1D H, or 2D square scaling" % n.name
-                )
                 assert scales[0] == scales[3] == 1, (
                     n.name + ": Upsampling is only supported for scales with "
                     "the first and last dimensions being 1 in NHWC."
                 )
-                spatial_scale = scales[1]
-                assert spatial_scale == int(spatial_scale), (
-                    "%s: Upsampling is only supported for integer scales." % n.name
-                )
-                is_shape_square_2d = in_shape[1] == in_shape[2]
-                is_shape_1d = in_shape[1] > 1 and in_shape[2] == 1
-
-                assert is_shape_square_2d or is_shape_1d, (
-                    "%s: Upsampling is only supported for 1D H or 2D square inputs." % n.name
-                )
 
                 # Extract information for HW node
-                IFMDim = in_shape[1]
-                OFMDim = int(round(in_shape[1] * spatial_scale))
+                HI = in_shape[1]
+                WI = in_shape[2]
+                HO = int(round(HI * scales[1]))
+                WO = int(round(WI * scales[2]))
                 NumChannels = in_shape[-1]
-                numInputVectors = in_shape[0]
+                batchSize = in_shape[0]
                 inputDataType = dt.name
-                dim_mode = 0 if is_shape_square_2d else 1
 
                 # Insert the HWCustomOp node
                 Upsample_HW_node = helper.make_node(
@@ -354,12 +340,14 @@ def apply(self, model):
                     [n.output[0]],
                     domain="finn.custom_op.fpgadataflow",
                     backend="fpgadataflow",
-                    OFMDim=OFMDim,
-                    IFMDim=IFMDim,
+                    SIMD=1,
+                    HO=HO,
+                    WO=WO,
+                    HI=HI,
+                    WI=WI,
                     NumChannels=NumChannels,
                     inputDataType=inputDataType,
-                    numInputVectors=numInputVectors,
-                    DimMode=dim_mode,
+                    batchSize=batchSize,
                     name="UpsampleNearestNeighbour_" + n.name,
                 )
 
diff --git a/tests/fpgadataflow/test_fpgadataflow_upsampler.py b/tests/fpgadataflow/test_fpgadataflow_upsampler.py
index 4539917878..c9a8d4379d 100644
--- a/tests/fpgadataflow/test_fpgadataflow_upsampler.py
+++ b/tests/fpgadataflow/test_fpgadataflow_upsampler.py
@@ -43,6 +43,7 @@
 from qonnx.transformation.infer_shapes import InferShapes
 from qonnx.transformation.make_input_chanlast import MakeInputChannelsLast
 from qonnx.util.cleanup import cleanup as qonnx_cleanup
+from qonnx.custom_op.registry import getCustomOp
 from torch import nn
 
 import finn.core.onnx_exec as oxe
@@ -101,26 +102,28 @@ def forward(self, x):
 # param datatype
 @pytest.mark.parametrize("dt", [DataType["INT8"]])
 # spatial dim input feature map
-@pytest.mark.parametrize("IFMDim", [3, 5])
+@pytest.mark.parametrize("IFMDim", [(6, 10), (12, 20)])
 # upscaling factor
-@pytest.mark.parametrize("scale", [2, 3])
+@pytest.mark.parametrize("scale", [2])
 # Number of input/output channels
-@pytest.mark.parametrize("NumChannels", [4])
+@pytest.mark.parametrize("NumChannels", [128, 256])
 # execution mode
 @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"])
 # whether to use 1D or 2D square testcases
-@pytest.mark.parametrize("is_1d", [False, True])
+@pytest.mark.parametrize("is_1d", [False])
+# parallelization level
+@pytest.mark.parametrize("SIMD", [1, 16])
 @pytest.mark.fpgadataflow
 @pytest.mark.vivado
 @pytest.mark.slow
-def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d):
+def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d, SIMD):
     tmpdir = make_build_dir("upsample_export_")
     atol = 1e-3
     if is_1d:
-        input_shape = (1, NumChannels, IFMDim, 1)
+        input_shape = (1, NumChannels, IFMDim[0], 1)
         upscale_factor = (scale, 1)
     else:
-        input_shape = (1, NumChannels, IFMDim, IFMDim)
+        input_shape = (1, NumChannels, IFMDim[0], IFMDim[1])
         upscale_factor = (scale, scale)
     # Create the test model and inputs for it
     torch_model = PyTorchTestModel(upscale_factor=upscale_factor)
@@ -165,6 +168,8 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d
     for n in model.get_finn_nodes():
         node_check = n.op_type == "UpsampleNearestNeighbour"
         assert node_check, "All nodes should be UpsampleNearestNeighbour nodes."
+        inst = getCustomOp(n)
+        inst.set_nodeattr("SIMD", SIMD)
 
     test_in_transposed = test_in.numpy().transpose(_to_chan_last_args)
     input_dict = {model.graph.input[0].name: test_in_transposed}