From 115c3d16c9932b2a94231f695e8260c05419c614 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Tue, 4 Nov 2025 14:24:22 +0100
Subject: [PATCH 1/7] Move example

---
 examples/cpp/DetectionNetwork/CMakeLists.txt                    | 2 +-
 .../cpp/DetectionNetwork/{RVC4 => }/detection_and_keypoints.cpp | 2 +-
 .../DetectionNetwork/{RVC4 => }/detection_and_keypoints.py      | 2 +-
 tests/CMakeLists.txt                                            | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)
 rename examples/cpp/DetectionNetwork/{RVC4 => }/detection_and_keypoints.cpp (97%)
 rename examples/python/DetectionNetwork/{RVC4 => }/detection_and_keypoints.py (96%)

diff --git a/examples/cpp/DetectionNetwork/CMakeLists.txt b/examples/cpp/DetectionNetwork/CMakeLists.txt
index 9a3adc4e6..8193faeb9 100644
--- a/examples/cpp/DetectionNetwork/CMakeLists.txt
+++ b/examples/cpp/DetectionNetwork/CMakeLists.txt
@@ -26,5 +26,5 @@ dai_set_example_test_labels(detection_network_remap ondevice rvc2_all rvc4 ci)
 dai_add_example(detection_and_segmentation RVC4/detection_and_segmentation.cpp ON OFF)
 dai_set_example_test_labels(detection_and_segmentation rvc4)
 
-dai_add_example(detection_and_keypoints RVC4/detection_and_keypoints.cpp ON OFF)
+dai_add_example(detection_and_keypoints detection_and_keypoints.cpp ON OFF)
 dai_set_example_test_labels(detection_and_keypoints rvc4)
\ No newline at end of file
diff --git a/examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp
similarity index 97%
rename from examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp
rename to examples/cpp/DetectionNetwork/detection_and_keypoints.cpp
index bc8dca07c..667151bb0 100644
--- a/examples/cpp/DetectionNetwork/RVC4/detection_and_keypoints.cpp
+++ b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp
@@ -22,7 +22,7 @@ int main() {
     auto detectionNetwork = pipeline.create<dai::node::DetectionNetwork>();
     dai::NNModelDescription modelDescription;
-    modelDescription.model = "luxonis/yolov8-large-pose-estimation:coco-640x352:1868e39";
+    modelDescription.model = "luxonis/yolov8-nano-pose-estimation:coco-512x288";
     detectionNetwork->build(cameraNode, modelDescription);
     auto labelMap = detectionNetwork->getClasses();
diff --git a/examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py b/examples/python/DetectionNetwork/detection_and_keypoints.py
similarity index 96%
rename from examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py
rename to examples/python/DetectionNetwork/detection_and_keypoints.py
index c62987701..431679544 100644
--- a/examples/python/DetectionNetwork/RVC4/detection_and_keypoints.py
+++ b/examples/python/DetectionNetwork/detection_and_keypoints.py
@@ -9,7 +9,7 @@
 # Create pipeline
 with dai.Pipeline() as pipeline:
     cameraNode = pipeline.create(dai.node.Camera).build()
-    detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-large-pose-estimation:coco-640x352:1868e39"))
+    detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-nano-pose-estimation:coco-512x288"))
     labelMap = detectionNetwork.getClasses()
     qRgb = detectionNetwork.passthrough.createOutputQueue()
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index e88884733..6ab38e604 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -389,7 +389,7 @@ dai_set_test_labels(nndata_test onhost ci)
 
 #ImgDetections tests
 dai_add_test(imgdetections_test src/onhost_tests/pipeline/datatype/imgdetections_test.cpp)
-dai_set_test_labels(imgdetections_test onhost ci) +dai_set_test_labels(imgdetections_test ondevice rvc2 rvc4 onhost ci) # Model description tests dai_add_test(model_slug_test src/onhost_tests/model_slug_test.cpp) From bb3204ebd1abeaa5a6748e77a1d4cf9ed075c04a Mon Sep 17 00:00:00 2001 From: aljazkonec1 Date: Tue, 4 Nov 2025 16:59:00 +0100 Subject: [PATCH 2/7] Add host parsing option --- CMakeLists.txt | 1 + .../RVC4/detection_and_segmentation.py | 3 +- .../depthai/pipeline/node/DetectionParser.hpp | 29 +- src/pipeline/node/DetectionParser.cpp | 184 ++++ .../DetectionParser/DetectionParserUtils.cpp | 897 ++++++++++++++++++ .../DetectionParser/DetectionParserUtils.hpp | 85 ++ src/pipeline/utilities/NNDataViewer.hpp | 163 ++++ 7 files changed, 1360 insertions(+), 2 deletions(-) create mode 100644 src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp create mode 100644 src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp create mode 100644 src/pipeline/utilities/NNDataViewer.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index ae50f4e25..54150150f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -311,6 +311,7 @@ set(TARGET_CORE_SOURCES src/pipeline/node/ImageAlign.cpp src/pipeline/node/ToF.cpp src/pipeline/node/DetectionParser.cpp + src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp src/pipeline/node/test/MyProducer.cpp src/pipeline/node/test/MyConsumer.cpp src/pipeline/node/UVC.cpp diff --git a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py b/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py index fcbbbfd2f..650f90f2f 100644 --- a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py +++ b/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py @@ -9,7 +9,8 @@ # Create pipeline with dai.Pipeline() as pipeline: cameraNode = pipeline.create(dai.node.Camera).build() - detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-instance-segmentation-large:coco-640x480")) + detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-instance-segmentation-nano:coco-512x288")) + # detectionNetwork.detectionParser.runOnHost(True) labelMap = detectionNetwork.getClasses() qRgb = detectionNetwork.passthrough.createOutputQueue() diff --git a/include/depthai/pipeline/node/DetectionParser.hpp b/include/depthai/pipeline/node/DetectionParser.hpp index 78bb8ce8e..4b50a75b3 100644 --- a/include/depthai/pipeline/node/DetectionParser.hpp +++ b/include/depthai/pipeline/node/DetectionParser.hpp @@ -15,6 +15,8 @@ #include #include "depthai/common/YoloDecodingFamily.hpp" +#include "depthai/pipeline/datatype/ImgDetections.hpp" +#include "depthai/pipeline/datatype/NNData.hpp" namespace dai { namespace node { @@ -23,7 +25,7 @@ namespace node { * @brief DetectionParser node. Parses detection results from different neural networks and is being used internally by MobileNetDetectionNetwork and * YoloDetectionNetwork. 
 */
-class DetectionParser : public DeviceNodeCRTP<DeviceNode, DetectionParser, DetectionParserProperties> {
+class DetectionParser : public DeviceNodeCRTP<DeviceNode, DetectionParser, DetectionParserProperties>, public HostRunnable {
    public:
     constexpr static const char* NAME = "DetectionParser";
     using DeviceNodeCRTP::DeviceNodeCRTP;
@@ -177,7 +179,23 @@ class DetectionParser : public DeviceNodeCRTP<DeviceNode, DetectionParser, DetectionParserProperties>
+    /**
+     * Specify whether the parser should run on host or on device
+     */
+    void setRunOnHost(bool runOnHost);
+
+    /**
+     * Check if the node is set to run on host
+     */
+    bool runOnHost() const override;
+
+    void run() override;
+
+    std::vector<ImgDetection> decodeMobilenet(std::shared_ptr<NNData> nnData, float confidenceThr);
+
    private:
+    bool runOnHostVar = false;
     void setNNArchiveBlob(const NNArchive& nnArchive);
     void setNNArchiveSuperblob(const NNArchive& nnArchive, int numShaves);
     void setNNArchiveOther(const NNArchive& nnArchive);
@@ -185,6 +203,15 @@ class DetectionParser : public DeviceNodeCRTP<DeviceNode, DetectionParser, DetectionParserProperties>
+    // host runnable requirements
+    void buildStage1() override;
+    void decodeYolo(std::shared_ptr<NNData> nnData, std::shared_ptr<ImgDetections> outDetections);
+    std::vector<TensorInfo> inTensorInfo;
+    uint32_t imgWidth;
+    uint32_t imgHeight;
+    bool imgSizesSet = false;
+    //
+
     std::optional<NNArchive> mArchive;
 
     std::optional<NNArchiveVersionedConfig> archiveConfig;
diff --git a/src/pipeline/node/DetectionParser.cpp b/src/pipeline/node/DetectionParser.cpp
index a03b64633..2c0e07b9a 100644
--- a/src/pipeline/node/DetectionParser.cpp
+++ b/src/pipeline/node/DetectionParser.cpp
@@ -13,6 +13,8 @@
 #include "nn_archive/NNArchive.hpp"
 #include "nn_archive/v1/Head.hpp"
 #include "pipeline/ThreadedNodeImpl.hpp"
+#include "pipeline/datatype/NNData.hpp"
+#include "pipeline/utilities/DetectionParser/DetectionParserUtils.hpp"
 #include "spdlog/fmt/fmt.h"
 
 // internal headers
@@ -349,5 +351,187 @@ std::vector<int> DetectionParser::getStrides() const {
     return properties.parser.strides;
 }
 
+void DetectionParser::setRunOnHost(bool runOnHost) {
+    if(runOnHost) {
+        pimpl->logger->warn("Detection parser set to run on host.");
+    }
+    runOnHostVar = runOnHost;
+}
+
+/**
+ * Check if the node is set to run on host
+ */
+bool DetectionParser::runOnHost() const {
+    return runOnHostVar;
+}
+
+void DetectionParser::run() {
+    auto& logger = pimpl->logger;
+    logger->info("Detection parser running on host.");
+
+    using namespace std::chrono;
+    while(isRunning()) {
+        auto tAbsoluteBeginning = steady_clock::now();
+        std::shared_ptr<NNData> inputData = input.get<NNData>();
+        if(!inputData) {
+            logger->error("Error while receiving NN frame.");
+            continue;
+        }
+        auto tAfterMessageBeginning = steady_clock::now();
+
+        if(!imgSizesSet) {
+            const bool containsTransformation = inputData->transformation.has_value();
+            if(containsTransformation) {
+                std::tie(imgWidth, imgHeight) = inputData->transformation->getSize();
+            } else {
+                logger->warn("No image size provided for detection parser. Skipping processing.");
+                continue;
+            }
+
+            imgSizesSet = true;
+        }
+
+        auto outDetections = std::make_shared<ImgDetections>();
+
+        switch(properties.parser.nnFamily) {
+            case DetectionNetworkType::YOLO: {
+                decodeYolo(inputData, outDetections);
+                break;
+            }
+            case DetectionNetworkType::MOBILENET: {
+                auto dets = decodeMobilenet(inputData, properties.parser.confidenceThreshold);  // TODO (aljaz) update to shared pointer
+                outDetections->detections = dets;
+                break;
+            }
+            default: {
+                logger->error("Unknown NN family. 'YOLO' and 'MOBILENET' are supported.");
+                break;
+            }
+        }
+
+        auto tBeforeSend = steady_clock::now();
+
+        // Copy over seq and ts
+        outDetections->setSequenceNum(inputData->getSequenceNum());
+        outDetections->setTimestamp(inputData->getTimestamp());
+        outDetections->setTimestampDevice(inputData->getTimestampDevice());
+        outDetections->transformation = inputData->transformation;
+        // Send detections
+        out.send(outDetections);
+
+        auto tAbsoluteEnd = steady_clock::now();
+        logger->debug("Detection parser total took {}ms, processing {}ms, getting_frames {}ms, sending_frames {}ms",
+                      duration_cast<microseconds>(tAbsoluteEnd - tAbsoluteBeginning).count() / 1000,
+                      duration_cast<microseconds>(tBeforeSend - tAfterMessageBeginning).count() / 1000,
+                      duration_cast<microseconds>(tAfterMessageBeginning - tAbsoluteBeginning).count() / 1000,
+                      duration_cast<microseconds>(tAbsoluteEnd - tBeforeSend).count() / 1000);
+    }
+}
+
+void DetectionParser::buildStage1() {
+    auto& logger = pimpl->logger;
+
+    // Grab dimensions from input tensor info
+    if(properties.networkInputs.size() > 0) {
+        if(properties.networkInputs.size() > 1) {
+            logger->warn("Detection parser supports only single input networks, assuming first input");
+        }
+        for(const auto& kv : properties.networkInputs) {
+            const dai::TensorInfo& tensorInfo = kv.second;
+            inTensorInfo.push_back(tensorInfo);
+        }
+    }
+    if(inTensorInfo.size() > 0) {
+        int numDimensions = inTensorInfo[0].numDimensions;
+        if(numDimensions < 2) {
+            logger->error("Number of input dimensions is less than 2");
+        } else {
+            imgSizesSet = true;
+            imgWidth = inTensorInfo[0].dims[numDimensions - 1];
+            imgHeight = inTensorInfo[0].dims[numDimensions - 2];
+        }
+    } else {
+        logger->info("Unable to read input tensor height and width from static inputs. The node will try to get input sizes at runtime.");
+    }
+}
+
+std::vector<ImgDetection> DetectionParser::decodeMobilenet(std::shared_ptr<NNData> nnData, float confidenceThr) {
+    auto& logger = pimpl->logger;
+
+    if(!nnData) {
+        return {};
+    }
+    std::vector<ImgDetection> detections;
+    std::string tensorName;
+    for(const auto& tensor : nnData->getAllLayers()) {
+        if(tensor.offset == 0) {
+            tensorName = tensor.name;
+        }
+    }
+
+    auto tensorData = nnData->getTensor<float>(tensorName);
+    const int maxDetections = static_cast<int>(tensorData.size()) / 7;
+    if(static_cast<int>(tensorData.size()) < maxDetections * 7) {
+        logger->error("Error while parsing Mobilenet. Vector not long enough, expected size: {}, real size {}", maxDetections * 7, tensorData.size());
+        return {};
+    }
+
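+    // A sketch of the assumed record layout: the output tensor is a flat sequence of
+    // 7-float records [header(image_id), label, confidence, xmin, ymin, xmax, ymax] with
+    // normalized box coordinates, and header == -1 marks the first unused record.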
+    struct raw_Detection {  // need to update it to include more
+        float header;
+        float label;
+        float confidence;
+        float xmin;
+        float ymin;
+        float xmax;
+        float ymax;
+    };
+
+    float* rawPtr = tensorData.data();
+    for(int i = 0; i < maxDetections; i++) {
+        raw_Detection temp;
+        // TODO This is likely unnecessary optimisation
+        memcpy(&temp, &rawPtr[i * 7], sizeof(raw_Detection));
+
+        // if header == -1, stop sooner
+        if(temp.header == -1.0f) break;
+
+        float currentConfidence = temp.confidence;
+        if(currentConfidence >= confidenceThr) {
+            dai::ImgDetection d;
+            d.label = temp.label;
+
+            d.confidence = currentConfidence;
+
+            d.xmin = temp.xmin;
+            d.ymin = temp.ymin;
+            d.xmax = temp.xmax;
+            d.ymax = temp.ymax;
+
+            detections.push_back(d);
+        }
+    }
+    return detections;
+}
+
+void DetectionParser::decodeYolo(std::shared_ptr<NNData> nnData, std::shared_ptr<ImgDetections> outDetections) {
+    auto& logger = pimpl->logger;
+    switch(properties.parser.decodingFamily) {
+        case YoloDecodingFamily::R1AF:  // anchor free: yolo v6r1
+            utilities::DetectionParserUtils::decodeR1AF(nnData, outDetections, properties, logger);
+            break;
+        case YoloDecodingFamily::v3AB:  // anchor based yolo v3 v3-Tiny
+            utilities::DetectionParserUtils::decodeV3AB(nnData, outDetections, properties, logger);
+            break;
+        case YoloDecodingFamily::v5AB:  // anchor based yolo v5, v7, P
+            utilities::DetectionParserUtils::decodeV5AB(nnData, outDetections, properties, logger);
+            break;
+        case YoloDecodingFamily::TLBR:  // top left bottom right anchor free: yolo v6r2, v8 v10 v11
+            utilities::DetectionParserUtils::decodeTLBR(nnData, outDetections, properties, logger);
+            break;
+    }
+}
+
 }  // namespace node
 }  // namespace dai
diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
new file mode 100644
index 000000000..c1809e847
--- /dev/null
+++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
@@ -0,0 +1,897 @@
+#include "DetectionParserUtils.hpp"
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "depthai/common/KeypointsList.hpp"
+#include "depthai/common/RotatedRect.hpp"
+#include "depthai/common/TensorInfo.hpp"
+#include "depthai/pipeline/datatype/ImgDetections.hpp"
+#include "depthai/pipeline/datatype/NNData.hpp"
+#include "depthai/properties/DetectionParserProperties.hpp"
+#include "pipeline/utilities/NNDataViewer.hpp"
+
+namespace dai {
+namespace utilities {
+namespace DetectionParserUtils {
+
+// yolo v6 r1 - anchor free
+void decodeR1AF(std::shared_ptr<NNData> nnData,
+                std::shared_ptr<ImgDetections> outDetections,
+                DetectionParserProperties properties,
+                std::shared_ptr<spdlog::logger> logger) {
+    auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames);
+
+    const std::vector<int> strides = properties.parser.strides;
+    if(strides.size() != layerNames.size()) {
+        std::string errorMsg = fmt::format(
+            "Number of strides does not match number of output layers. 
Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + std::vector detectionCandidates; + detectionCandidates.reserve(250); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + auto tensorInfo = nnData->getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + const float score = outputData.get(4, row, col); + if(score < confidenceThr) { + continue; + } + + int bestC = 0; + float bestConf = 0.0f; + for(int c = 0; c < numClasses; ++c) { + float candidateProb = outputData.get(c + 5, row, col); + if(candidateProb > bestConf) { + bestConf = candidateProb; + bestC = c; + } + } + if(bestConf * score < confidenceThr) { + continue; + } + + float cx = outputData.get(0, row, col); + float cy = outputData.get(1, row, col); + float w = outputData.get(2, row, col); + float h = outputData.get(3, row, col); + + float xmin = cx - w * 0.5f; + float ymin = cy - h * 0.5f; + float xmax = cx + w * 0.5f; + float ymax = cy + h * 0.5f; + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) { + logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping"); + continue; + } + DetectionCandidate candidate = DetectionCandidate{ + xmin, + ymin, + xmax, + ymax, + bestConf * score, + bestC, + strideIdx, + row, + col, + std::nullopt, + }; + + if(!properties.parser.classNames->empty()) { + candidate.labelName = (*properties.parser.classNames)[bestC]; + } + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, properties, 
logger); + } +} + +/* +Decode anchor based yolo v3 and v3-Tiny +*/ +void decodeV3AB(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + + if(properties.parser.anchorsV2.size() != layerNames.size()) { + logger->error("Number of anchor sets does not match number of output layers. Anchor sets size: {}, output layers size: {}", + properties.parser.anchorsV2.size(), + layerNames.size()); + return; + } + + std::vector detectionCandidates; + detectionCandidates.reserve(250); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData->getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + int layerChannels = tensorInfo->getChannels(); + + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + std::vector>& anchors = properties.parser.anchorsV2[strideIdx]; + int numAnchors = anchors.size(); + int block = 5 + numClasses; + int expectedC = numAnchors * block; + + if(layerChannels != expectedC) { + std::string errorMsg = fmt::format("Layer {} channels mismatch. 
Expected {}, got {}", layerName, expectedC, layerChannels); + throw std::runtime_error(errorMsg); + } + + auto sigmoid = [](float x) -> float { return 1.f / (1.f + std::exp(-x)); }; + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + for(int a = 0; a < numAnchors; ++a) { + const int ch0 = a * block; + const float tx = sigmoid(outputData.get(ch0 + 0, row, col)); + const float ty = sigmoid(outputData.get(ch0 + 1, row, col)); + const float tw = outputData.get(ch0 + 2, row, col); + const float th = outputData.get(ch0 + 3, row, col); + const float obj = sigmoid(outputData.get(ch0 + 4, row, col)); + if(obj < confidenceThr) continue; + + int bestC = 0; + float clsProb = 0.0f; + for(int c = 0; c < numClasses; ++c) { + const float prob = outputData.get(ch0 + 5 + c, row, col); + if(prob > clsProb) { + clsProb = prob; + bestC = c; + } + } + const float conf = obj * 1.f / (1.f + std::exp(-clsProb)); + if(conf < confidenceThr) continue; + + // YOLOv3 decode + const float cx = (static_cast(col) + tx) * static_cast(stride); + const float cy = (static_cast(row) + ty) * static_cast(stride); + const float w_exp = std::exp(tw); + const float h_exp = std::exp(th); + const float w = w_exp * anchors[a][0]; + const float h = h_exp * anchors[a][1]; + + float xmin = cx - 0.5f * w; + float ymin = cy - 0.5f * h; + float xmax = cx + 0.5f * w; + float ymax = cy + 0.5f * h; + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) { + logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping"); + continue; + } + + DetectionCandidate candidate = DetectionCandidate{ + xmin, + ymin, + xmax, + ymax, + conf, + bestC, + strideIdx, + row, + col, + std::nullopt, + }; + + if(!properties.parser.classNames->empty()) { + candidate.labelName = (*properties.parser.classNames)[bestC]; + } + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + // +} + +/* +Decode anchor based networks, e.g., yolo v5, v7, P +*/ +void decodeV5AB(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. 
Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + + if(properties.parser.anchorsV2.size() != layerNames.size()) { + logger->error("Number of anchor sets does not match number of output layers. Anchor sets size: {}, output layers size: {}", + properties.parser.anchorsV2.size(), + layerNames.size()); + return; + } + + std::vector detectionCandidates; + detectionCandidates.reserve(250); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData->getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + int layerChannels = tensorInfo->getChannels(); + + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + std::vector>& anchors = properties.parser.anchorsV2[strideIdx]; + int numAnchors = anchors.size(); + int block = 5 + numClasses; + int expectedC = numAnchors * block; + + if(layerChannels != expectedC) { + logger->error("Layer {} channels mismatch. 
Expected {}, got {}", layerName, expectedC, layerChannels); + return; + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + for(int a = 0; a < numAnchors; ++a) { + const int ch0 = a * block; + + const float tx = outputData.get(ch0 + 0, row, col); + const float ty = outputData.get(ch0 + 1, row, col); + const float tw = outputData.get(ch0 + 2, row, col); + const float th = outputData.get(ch0 + 3, row, col); + const float obj = outputData.get(ch0 + 4, row, col); + if(obj < confidenceThr) continue; + + int bestC = 0; + float bestConf = 0.0f; + for(int c = 0; c < numClasses; ++c) { + const float prob = outputData.get(ch0 + 5 + c, row, col); + if(prob > bestConf) { + bestConf = prob; + bestC = c; + } + } + const float conf = obj * bestConf; + if(conf < confidenceThr) continue; + + // YOLOv5 decode + const float cx = ((tx * 2.0f - 0.5f) + static_cast(col)) * static_cast(stride); + const float cy = ((ty * 2.0f - 0.5f) + static_cast(row)) * static_cast(stride); + + const float w = tw * tw * 4.0f * anchors[a][0]; + const float h = th * th * 4.0f * anchors[a][1]; + + float xmin = cx - 0.5f * w; + float ymin = cy - 0.5f * h; + float xmax = cx + 0.5f * w; + float ymax = cy + 0.5f * h; + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) continue; + DetectionCandidate candidate = DetectionCandidate{ + xmin, + ymin, + xmax, + ymax, + conf, + bestC, + strideIdx, + row, + col, + std::nullopt, + }; + + if(!properties.parser.classNames->empty()) { + candidate.labelName = (*properties.parser.classNames)[bestC]; + } + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, properties, logger); + } +} + +/* +Decode TLBR (top left bottom right) style networks, e.g., yolo v6r2, v8, v10, v11 +*/ +void decodeTLBR(std::shared_ptr nnData, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + auto layerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames); + + const std::vector strides = properties.parser.strides; + if(strides.size() != layerNames.size()) { + std::string errorMsg = fmt::format( + "Number of strides does not match number of output layers. 
Strides size: {}, output layers size: {}", strides.size(), layerNames.size()); + throw std::runtime_error(errorMsg); + } + const float confidenceThr = properties.parser.confidenceThreshold; + const float iouThr = properties.parser.iouThreshold; + const int numClasses = properties.parser.classes; + int inputWidth; + int inputHeight; + std::tie(inputWidth, inputHeight) = nnData->transformation->getSize(); + + if(inputWidth <= 0 || inputHeight <= 0) { + throw std::runtime_error("Invalid input dimensions retrieved from NNData transformation."); + } + + std::vector detectionCandidates; + detectionCandidates.reserve(250); + + for(int strideIdx = 0; strideIdx < static_cast(layerNames.size()); ++strideIdx) { + std::string layerName = layerNames[strideIdx]; + int stride = strides[strideIdx]; + auto tensorInfo = nnData->getTensorInfo(layerName); + if(!tensorInfo) { + std::string errorMsg = fmt::format("Tensor info for layer {} is null", layerName); + throw std::runtime_error(errorMsg); + } + + if(!isTensorOrderValid(*tensorInfo, properties, logger)) { + logger->error("Tensor order for layer {} is invalid, skipping this layer", layerName); + continue; + } + + int layerHeight = tensorInfo->getHeight(); + int layerWidth = tensorInfo->getWidth(); + NNDataViewer outputData = NNDataViewer(*tensorInfo, nnData->data, logger); + if(!outputData.build()) { + std::string errorMsg = fmt::format("Failed to build NNDataViewer for layer {}", layerName); + throw std::runtime_error(errorMsg); + } + + for(int row = 0; row < layerHeight; ++row) { + for(int col = 0; col < layerWidth; ++col) { + const float score = outputData.get(4, row, col); + if(score < confidenceThr) { + continue; + } + + int bestC = 0; + float bestConf = 0.0f; + for(int c = 0; c < numClasses; ++c) { + float candidateProb = outputData.get(c + 5, row, col); + if(candidateProb > bestConf) { + bestConf = candidateProb; + bestC = c; + } + } + float xmin = (col - outputData.get(0, row, col) + 0.5f) * stride; + float ymin = (row - outputData.get(1, row, col) + 0.5f) * stride; + float xmax = (col + outputData.get(2, row, col) + 0.5f) * stride; + float ymax = (row + outputData.get(3, row, col) + 0.5f) * stride; + + if(bestConf < confidenceThr) { + continue; + } + + xmin = std::max(0.0f, std::min(xmin, float(inputWidth))); + ymin = std::max(0.0f, std::min(ymin, float(inputHeight))); + xmax = std::max(0.0f, std::min(xmax, float(inputWidth))); + ymax = std::max(0.0f, std::min(ymax, float(inputHeight))); + + if(xmax <= xmin || ymax <= ymin) { + logger->info("Invalid box with xmax <= xmin or ymax <= ymin, skipping"); + continue; + } + + DetectionCandidate candidate = DetectionCandidate{ + xmin, + ymin, + xmax, + ymax, + bestConf, + bestC, + strideIdx, + row, + col, + std::nullopt, + + }; + + if(!properties.parser.classNames->empty()) { + candidate.labelName = (*properties.parser.classNames)[bestC]; + } + detectionCandidates.emplace_back(std::move(candidate)); + } + } + } + + std::vector keepCandidates = nonMaximumSuppression(detectionCandidates, iouThr); + if(keepCandidates.size() == 0) { + logger->trace("No detections after NMS, skipping overlay."); + return; + } + + createImgDetections(keepCandidates, outDetections, inputWidth, inputHeight); + + if(properties.parser.decodeSegmentation) { + logger->trace("Segmentation decoding."); + segmentationDecode(nnData, keepCandidates, outDetections, properties, logger); + } + + if(properties.parser.decodeKeypoints) { + logger->trace("Keypoints decoding."); + keypointDecode(nnData, keepCandidates, outDetections, 
properties, logger); + } +} + +bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr logger) { + // Fix the channel order for Yolo - this is hacky and would be best to be fixed in the actual models and make it consistent + + int anchorMultiplier = properties.parser.anchorsV2.empty() ? 1 : static_cast(properties.parser.anchorsV2.size()); + int channelSize = anchorMultiplier * (properties.parser.classes + properties.parser.coordinates + 1); + + auto checkAndFixOrder = [&](int channelDimIndex, int alternativeDimIndex, dai::TensorInfo::StorageOrder alternativeOrder) -> bool { + // Check that the dims size is big enough + if(static_cast(tensorInfo.dims.size()) <= channelDimIndex || static_cast(tensorInfo.dims.size()) <= alternativeDimIndex) { + logger->error("Invalid tensor dims size. Skipping."); + return false; + } + + if(tensorInfo.dims[channelDimIndex] != uint32_t(channelSize)) { + // Check if the channel size would match the alternative storage order + if(tensorInfo.dims[alternativeDimIndex] == uint32_t(channelSize)) { + logger->trace("Invalid channel size for the tensor. Expected {}, got {}, switching", channelSize, tensorInfo.dims[channelDimIndex]); + tensorInfo.order = alternativeOrder; + } else { + logger->error("Invalid channel size for the tensor. Expected {}, got {}. Skipping.", channelSize, tensorInfo.dims[channelDimIndex]); + return false; + } + } + return true; + }; + + switch(tensorInfo.order) { + case dai::TensorInfo::StorageOrder::CHW: + if(!checkAndFixOrder(0, 2, dai::TensorInfo::StorageOrder::HWC)) return false; + break; + case dai::TensorInfo::StorageOrder::HWC: + if(!checkAndFixOrder(2, 0, dai::TensorInfo::StorageOrder::CHW)) return false; + break; + case dai::TensorInfo::StorageOrder::NCHW: + if(!checkAndFixOrder(1, 3, dai::TensorInfo::StorageOrder::NHWC)) return false; + break; + case dai::TensorInfo::StorageOrder::NHWC: + if(!checkAndFixOrder(3, 1, dai::TensorInfo::StorageOrder::NCHW)) return false; + break; + case dai::TensorInfo::StorageOrder::NHCW: + case dai::TensorInfo::StorageOrder::WHC: + case dai::TensorInfo::StorageOrder::WCH: + case dai::TensorInfo::StorageOrder::HCW: + case dai::TensorInfo::StorageOrder::CWH: + case dai::TensorInfo::StorageOrder::NC: + case dai::TensorInfo::StorageOrder::CN: + case dai::TensorInfo::StorageOrder::C: + case dai::TensorInfo::StorageOrder::H: + case dai::TensorInfo::StorageOrder::W: + default: + logger->error("Invalid storage order for the tensor. 
Skipping."); + return false; + } + + return true; +} + +std::vector getSortedDetectionLayerNames(std::shared_ptr nnData, std::string searchTerm, std::vector outputNames) { + if(outputNames.empty()) { + outputNames = nnData->getAllLayerNames(); + } + + std::vector layerNames; + for(const auto& name : outputNames) { + // if yolo in the name, push it to layerNames + if(name.find(searchTerm) != std::string::npos) { + layerNames.push_back(name); + } + } + + std::sort(layerNames.begin(), layerNames.end()); + return layerNames; +} + +float YoloIntersectionOverUnion(const DetectionCandidate& box1, const DetectionCandidate& box2) { + float width_of_overlap_area = fmin(box1.xmax, box2.xmax) - fmax(box1.xmin, box2.xmin); + float height_of_overlap_area = fmin(box1.ymax, box2.ymax) - fmax(box1.ymin, box2.ymin); + float area_of_overlap; + if(width_of_overlap_area < 0 || height_of_overlap_area < 0) + area_of_overlap = 0; + else + area_of_overlap = width_of_overlap_area * height_of_overlap_area; + float box_1_area = (box1.ymax - box1.ymin) * (box1.xmax - box1.xmin); + float box_2_area = (box2.ymax - box2.ymin) * (box2.xmax - box2.xmin); + float area_of_union = box_1_area + box_2_area - area_of_overlap; + return area_of_overlap / area_of_union; +} + +std::vector nonMaximumSuppression(std::vector& detectionCandidates, float iouThr) { + std::sort( + detectionCandidates.begin(), detectionCandidates.end(), [](const DetectionCandidate& a, const DetectionCandidate& b) { return a.score > b.score; }); + + std::vector keep(detectionCandidates.size(), 1); + std::vector keepIndices; + keepIndices.reserve(detectionCandidates.size()); + + for(size_t i = 0; i < detectionCandidates.size(); ++i) { + if(!keep[i]) continue; + keepIndices.push_back(i); + + for(size_t j = i + 1; j < detectionCandidates.size(); ++j) { + if(!keep[j]) continue; + if(YoloIntersectionOverUnion(detectionCandidates[i], detectionCandidates[j]) >= iouThr) { + keep[j] = 0; + } + } + } + + std::vector keepCandidates; + keepCandidates.reserve(keepIndices.size()); + for(size_t idx : keepIndices) keepCandidates.push_back(detectionCandidates[idx]); + + return keepCandidates; +} + +void createImgDetections(const std::vector& detectionCandidates, + std::shared_ptr outDetections, + unsigned int width, + unsigned int height) { + for(const auto& det : detectionCandidates) { + dai::ImgDetection detection; + dai::RotatedRect rotatedRect(dai::Rect(dai::Point2f(det.xmin, det.ymin), dai::Point2f(det.xmax, det.ymax)), 0.0f); + detection.setBoundingBox(rotatedRect.normalize(width, height)); + detection.confidence = det.score; + detection.label = det.label; + if(det.labelName) { + detection.labelName = *det.labelName; + } + outDetections->detections.push_back(std::move(detection)); + } +} + +void segmentationDecode(std::shared_ptr nnData, + std::vector& detectionCandidates, + std::shared_ptr outDetections, + DetectionParserProperties properties, + std::shared_ptr logger) { + auto maskFromCoeffs = [](NNDataViewer& protos, const float* coeffs, int width, int height) -> cv::Mat { + cv::Mat maskLow(height, width, CV_32F); + for(int y = 0; y < maskLow.rows; ++y) { + float* row = maskLow.ptr(y); + for(int x = 0; x < maskLow.cols; ++x) { + float sum = 0.f; + for(int c = 0; c < 32; ++c) sum += protos.get(c, y, x) * coeffs[c]; + row[x] = 1.f / (1.f + std::exp(-sum)); // sigmoid + } + } + return maskLow; + }; + + std::pair inputSize = nnData->transformation->getSize(); + int inputWidth = inputSize.first; + int inputHeight = inputSize.second; + + cv::Mat indexMask(inputHeight, 
+
+    cv::Mat maskLow, maskUp;
+
+    auto maskLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "masks", std::vector<std::string>{});
+    if(properties.parser.strides.size() != maskLayerNames.size()) {
+        logger->error(
+            "Number of strides does not match number of mask output layers. Strides size: {}, mask output layers size: {}. Skipping segmentation decoding.",
+            properties.parser.strides.size(),
+            maskLayerNames.size());
+        return;
+    }
+    auto protoLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "proto", std::vector<std::string>{});
+    if(protoLayerNames.size() == 0) {
+        logger->error("Expecting proto output layer, found no layer with proto label. Skipping segmentation decoding.");
+        return;
+    }
+
+    NNDataViewer protoValues = NNDataViewer(*nnData->getTensorInfo(protoLayerNames[0]), nnData->data, logger);
+    if(!protoValues.build()) {
+        logger->error("Failed to build NNDataViewer for proto layer {}. Skipping segmentation decoding.", protoLayerNames[0]);
+        return;
+    }
+
+    std::map<int, NNDataViewer> maskValues;
+    for(int strideIdx = 0; strideIdx < static_cast<int>(maskLayerNames.size()); ++strideIdx) {
+        maskValues.try_emplace(strideIdx, *nnData->getTensorInfo(maskLayerNames[strideIdx]), nnData->data, logger);
+        if(!maskValues.at(strideIdx).build()) {
+            logger->error("Failed to build NNDataViewer for mask layer {}. Skipping segmentation decoding.", maskLayerNames[strideIdx]);
+            return;
+        }
+    }
+
+    for(size_t i = 0; i < detectionCandidates.size(); ++i) {  // loop over all detections
+        const auto& c = detectionCandidates[i];
+        const int detIdx = static_cast<int>(i);  // index in outDetections list
+
+        NNDataViewer mask = maskValues.at(c.headIndex);
+        std::array<float, 32> coeff;
+        for(int k = 0; k < 32; ++k) {
+            coeff[k] = mask.get(k, c.rowIndex, c.columnIndex);
+        }
+
+        TensorInfo protoInfo = *nnData->getTensorInfo(protoLayerNames[0]);
+        int protoWidth = protoInfo.getWidth();
+        int protoHeight = protoInfo.getHeight();
+        maskLow = maskFromCoeffs(protoValues, coeff.data(), protoWidth, protoHeight);
+
+        cv::resize(maskLow, maskUp, cv::Size(inputWidth, inputHeight), 0, 0, cv::INTER_LINEAR);
+        // ROI clamp
+        int x0 = std::clamp(static_cast<int>(std::floor(c.xmin)), 0, inputWidth - 1);
+        int y0 = std::clamp(static_cast<int>(std::floor(c.ymin)), 0, inputHeight - 1);
+        int x1 = std::clamp(static_cast<int>(std::ceil(c.xmax)), 0, inputWidth);
+        int y1 = std::clamp(static_cast<int>(std::ceil(c.ymax)), 0, inputHeight);
+
+        if(x1 <= x0 || y1 <= y0) continue;
+        const cv::Rect roi(x0, y0, x1 - x0, y1 - y0);
+
+        // Threshold & paint only unassigned pixels
+        cv::Mat roiProb = maskUp(roi);
+        cv::Mat roiBin;
+        cv::compare(roiProb, 0.5, roiBin, cv::CMP_GT);
+        cv::Mat roiOut = indexMask(roi);
+        cv::Mat unassigned;
+        cv::compare(roiOut, 255, unassigned, cv::CMP_EQ);
+        cv::Mat paintMask;
+        cv::bitwise_and(roiBin, unassigned, paintMask);
+
+        const uint8_t value = static_cast<uint8_t>(std::min(detIdx, 254));
+        roiOut.setTo(value, paintMask);
+    }
+
+    outDetections->setSegmentationMask(indexMask);
+}
+
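+/*
+Decode keypoints for the detections that survived NMS. Each candidate remembers the output head
+and grid cell it came from (headIndex, rowIndex, columnIndex); the matching "kpt_output" tensor
+is then sampled at that flattened cell for nKeypoints (x, y, confidence) triplets.
+*/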
+void keypointDecode(std::shared_ptr<NNData> nnData,
+                    std::vector<DetectionCandidate>& detectionCandidates,
+                    std::shared_ptr<ImgDetections> outDetections,
+                    DetectionParserProperties properties,
+                    std::shared_ptr<spdlog::logger> logger) {
+    int inputWidth;
+    int inputHeight;
+    std::tie(inputWidth, inputHeight) = nnData->transformation->getSize();
+
+    auto yoloLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "yolo", properties.parser.outputNames);
+    std::vector<int> featureMapWidths;
+    for(int i = 0; i < static_cast<int>(yoloLayerNames.size()); ++i) {
+        auto tensorInfo = nnData->getTensorInfo(yoloLayerNames[i]);
+        if(!tensorInfo) {
+            logger->error("Tensor info for layer {} is null. Skipping keypoints decoding.", yoloLayerNames[i]);
+            return;
+        }
+        featureMapWidths.push_back(tensorInfo->getWidth());
+    }
+
+    auto kptsLayerNames = DetectionParserUtils::getSortedDetectionLayerNames(nnData, "kpt_output", std::vector<std::string>{});
+    if(properties.parser.strides.size() != kptsLayerNames.size()) {
+        logger->error(
+            "Number of strides does not match number of keypoints output layers. Strides size: {}, keypoints output layers size: {}. Skipping keypoints "
+            "decoding.",
+            properties.parser.strides.size(),
+            kptsLayerNames.size());
+        return;
+    }
+
+    // TODO (aljaz) move to a function
+    std::map<int, NNDataViewer> keypointValues;
+    for(int strideIdx = 0; strideIdx < static_cast<int>(kptsLayerNames.size()); ++strideIdx) {
+        keypointValues.try_emplace(strideIdx, *nnData->getTensorInfo(kptsLayerNames[strideIdx]), nnData->data, logger);
+        if(!keypointValues.at(strideIdx).build()) {
+            logger->error("Failed to build NNDataViewer for keypoints layer {}. Skipping keypoints decoding.", kptsLayerNames[strideIdx]);
+            return;
+        }
+    }
+
+    if(outDetections->detections.size() != detectionCandidates.size()) {
+        logger->error(
+            "Number of detections in ImgDetections does not match number of detection candidates. ImgDetections size: {}, detection candidates size: {}. "
+            "Skipping keypoints decoding.",
+            outDetections->detections.size(),
+            detectionCandidates.size());
+        return;
+    }
+
+    const int numKeypoints = properties.parser.nKeypoints.value_or(0);
+
+    for(size_t i = 0; i < detectionCandidates.size(); ++i) {  // loop over all detections
+        const auto& c = detectionCandidates[i];
+        int flattenedIndex = c.rowIndex * featureMapWidths[c.headIndex] + c.columnIndex;
+
+        std::vector<dai::Keypoint> keypoints;
+        keypoints.reserve(numKeypoints);
+        NNDataViewer keypointMask = keypointValues.at(c.headIndex);
+
+        for(int k = 0; k < numKeypoints; ++k) {
+            int base = 3 * k;
+
+            // keypointValues tensor storage order HWC
+            // H == 0
+            // W == 51 == 17 * 3 (x, y, conf for each keypoint)
+            // C == flattened spatial dimensions of row x col of the feature map
+            float x = std::clamp(keypointMask.get(flattenedIndex, 0, base + 0) / inputWidth, 0.0f, 1.0f);
+            float y = std::clamp(keypointMask.get(flattenedIndex, 0, base + 1) / inputHeight, 0.0f, 1.0f);
+            float conf = 1.f / (1.f + std::exp(-(keypointMask.get(flattenedIndex, 0, base + 2))));
+
+            keypoints.push_back(dai::Keypoint{dai::Point2f(x, y), conf});
+        }
+
+        outDetections->detections[i].keypoints = KeypointsList(keypoints);
+    }
+}
+
+}  // namespace DetectionParserUtils
+}  // namespace utilities
+}  // namespace dai
\ No newline at end of file
diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp
new file mode 100644
index 000000000..85b5a234f
--- /dev/null
+++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.hpp
@@ -0,0 +1,85 @@
+#pragma once
+
+#include
+
+#include
+
+#include "depthai/pipeline/datatype/ImgDetections.hpp"
+#include "depthai/pipeline/datatype/NNData.hpp"
+#include "depthai/properties/DetectionParserProperties.hpp"
+
+namespace dai {
+namespace utilities {
+namespace DetectionParserUtils {
+
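+// One raw box candidate prior to NMS. headIndex, rowIndex and columnIndex record which output
+// head and grid cell produced the box, so segmentation and keypoint decoding can later look up
+// the per-cell mask coefficients and keypoint triplets for the candidates that survive NMS.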
+struct DetectionCandidate {
+    float xmin, ymin, xmax, ymax, score;
+    int label, headIndex, rowIndex, columnIndex;
+    std::optional<std::string> labelName;
+};
+
+/*
+Decode anchor free yolo v6r1 with sigmoid assisted center detection
+*/
+void decodeR1AF(std::shared_ptr<NNData> nnData,
+                std::shared_ptr<ImgDetections> outDetections,
+                DetectionParserProperties properties,
+                std::shared_ptr<spdlog::logger> logger);
+
+/*
+Decode anchor based yolo v3 and v3-Tiny
+*/
+void decodeV3AB(std::shared_ptr<NNData> nnData,
+                std::shared_ptr<ImgDetections> outDetections,
+                DetectionParserProperties properties,
+                std::shared_ptr<spdlog::logger> logger);
+
+/*
+Decode anchor based networks, e.g., yolo v5, v7, P
+*/
+void decodeV5AB(std::shared_ptr<NNData> nnData,
+                std::shared_ptr<ImgDetections> outDetections,
+                DetectionParserProperties properties,
+                std::shared_ptr<spdlog::logger> logger);
+
+/*
+Decode anchor free top-left-bottom-right (TLBR) style networks, e.g., yolo v6r2, v8, v10, v11
+*/
+void decodeTLBR(std::shared_ptr<NNData> nnData,
+                std::shared_ptr<ImgDetections> outDetections,
+                DetectionParserProperties properties,
+                std::shared_ptr<spdlog::logger> logger);
+
+std::vector<std::string> getSortedDetectionLayerNames(std::shared_ptr<NNData> nnData, std::string searchTerm, std::vector<std::string> outputNames);
+
+float YoloIntersectionOverUnion(const DetectionCandidate& box1, const DetectionCandidate& box2);
+
+bool isTensorOrderValid(dai::TensorInfo& tensorInfo, DetectionParserProperties properties, std::shared_ptr<spdlog::logger> logger);
+
+std::vector<DetectionCandidate> nonMaximumSuppression(std::vector<DetectionCandidate>& detectionCandidates, float iouThr);
+
+void createImgDetections(const std::vector<DetectionCandidate>& detectionCandidates,
+                         std::shared_ptr<ImgDetections> outDetections,
+                         unsigned int width,
+                         unsigned int height);
+
+void segmentationDecode(std::shared_ptr<NNData> nnData,
+                        std::vector<DetectionCandidate>& detectionCandidates,
+                        std::shared_ptr<ImgDetections> outDetections,
+                        DetectionParserProperties properties,
+                        std::shared_ptr<spdlog::logger> logger);
+
+void keypointDecode(std::shared_ptr<NNData> nnData,
+                    std::vector<DetectionCandidate>& detectionCandidates,
+                    std::shared_ptr<ImgDetections> outDetections,
+                    DetectionParserProperties properties,
+                    std::shared_ptr<spdlog::logger> logger);
+
+}  // namespace DetectionParserUtils
+}  // namespace utilities
+}  // namespace dai
\ No newline at end of file
diff --git a/src/pipeline/utilities/NNDataViewer.hpp b/src/pipeline/utilities/NNDataViewer.hpp
new file mode 100644
index 000000000..94ab12cda
--- /dev/null
+++ b/src/pipeline/utilities/NNDataViewer.hpp
@@ -0,0 +1,163 @@
+#pragma once
+#include
+
+#include "depthai/common/TensorInfo.hpp"
+#include "depthai/pipeline/datatype/NNData.hpp"
+#include "fp16/fp16.h"
+namespace dai {
+class NNDataViewer {
+   public:
+    std::shared_ptr<Memory> data;
+    dai::TensorInfo tensor;
+    std::shared_ptr<spdlog::logger> logger;
+
+    // Byte strides to multiply each (c, h, w) index with before summing into a flat offset
+    struct FactorsBefore {
+        int32_t h;
+        int32_t w;
+        int32_t c;
+    };
+
+    FactorsBefore factorsBefore;
+
+    NNDataViewer(dai::TensorInfo tensor, std::shared_ptr<Memory> data, std::shared_ptr<spdlog::logger> logger)
+        : data{data}, tensor{tensor}, logger{logger} {};
+    bool build() {
+        if(tensor.strides.size() < 2) {
+            logger->error("Tensor doesn't have enough strides. Number of strides: {}, expected at least {}", tensor.strides.size(), 2);
+            return false;
+        }
+        if(tensor.strides[0] == 0 || tensor.strides[1] == 0) {
+            logger->error("Tensor strides should not be set to zero. Strides are {} {}", tensor.strides[0], tensor.strides[1]);
+            return false;
+        }
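+        // Map the logical (c, h, w) axes onto byte strides for this tensor's storage order: the
+        // innermost, contiguous axis advances by the element size, the outer axes by the reported
+        // strides, so get(c, h, w) can compute one flat byte offset regardless of layout.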
It is {}", tensor.dims[0]); + return false; + } + if(tensor.strides.size() != 4) { + logger->error("Invalid number of strides: {}, expected: {}", tensor.strides.size(), 4); + } + factorsBefore.c = tensor.strides[1]; + factorsBefore.h = tensor.strides[2]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::NHWC: + if(tensor.dims[0] != 1) { + logger->error("NHWC is only supported in Detection Parser if N is 1. It is {}", tensor.dims[0]); + return false; + } + if(tensor.strides.size() != 4) { + logger->error("Invalid number of strides: {}, expected: {}", tensor.strides.size(), 4); + } + factorsBefore.h = tensor.strides[1]; + factorsBefore.w = tensor.strides[2]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::HCW: + factorsBefore.h = tensor.strides[0]; + factorsBefore.c = tensor.strides[1]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::HWC: + factorsBefore.h = tensor.strides[0]; + factorsBefore.w = tensor.strides[1]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::CHW: + factorsBefore.c = tensor.strides[0]; + factorsBefore.h = tensor.strides[1]; + factorsBefore.w = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::CWH: + factorsBefore.c = tensor.strides[0]; + factorsBefore.w = tensor.strides[1]; + factorsBefore.h = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::WCH: + factorsBefore.w = tensor.strides[0]; + factorsBefore.c = tensor.strides[1]; + factorsBefore.h = tensor.getDataTypeSize(); + break; + + case TensorInfo::StorageOrder::WHC: + factorsBefore.w = tensor.strides[0]; + factorsBefore.h = tensor.strides[1]; + factorsBefore.c = tensor.getDataTypeSize(); + break; + case TensorInfo::StorageOrder::NHCW: + case TensorInfo::StorageOrder::NC: + case TensorInfo::StorageOrder::CN: + case TensorInfo::StorageOrder::H: + case TensorInfo::StorageOrder::W: + case TensorInfo::StorageOrder::C: + default: + logger->error("Storage order not supported in NNDataViewer"); + return false; + } + return sanity_check(); + } + + bool sanity_check() { + if(data->getSize() < (tensor.offset + (tensor.dims[0] * tensor.strides[0]))) { + logger->error( + "Underlying data does not hold enough data for the tensor to be contained.\ + Tensor size: {}, Tensor offset: {}, Data type size: {}, Data size: {} ", + tensor.dims[0] * tensor.strides[0], + tensor.offset, + tensor.getDataTypeSize(), + data->getSize()); + return false; + } + if(tensor.dims.size() < 2) { + logger->error("Number of dimensions for the input tensor is expected to be at least 2. It is {}", tensor.dims.size()); + return false; + } + return true; + }; + + inline float get(int c, int h, int w) { + // If this turns out to be slow, use a function pointer instead and point to the right getter at build time + int32_t index = tensor.offset + factorsBefore.h * h + factorsBefore.w * w + factorsBefore.c * c; +#ifdef DEPTHAI_SAFE_NN_DATA_ACCESS + logger->trace("Offset {}, fbH {}, fbW {}, fbC {}, h {}, w {}, c{}", tensor.offset, factorsBefore.h, factorsBefore.w, factorsBefore.c, h, w, c); + if(index > data->getSize()) { + logger->error("Out of bound access. 
+    inline float get(int c, int h, int w) {
+        // If this turns out to be slow, use a function pointer instead and point to the right getter at build time
+        int32_t index = tensor.offset + factorsBefore.h * h + factorsBefore.w * w + factorsBefore.c * c;
+#ifdef DEPTHAI_SAFE_NN_DATA_ACCESS
+        logger->trace("Offset {}, fbH {}, fbW {}, fbC {}, h {}, w {}, c {}", tensor.offset, factorsBefore.h, factorsBefore.w, factorsBefore.c, h, w, c);
+        if(index < 0 || static_cast<size_t>(index) >= data->getSize()) {
+            logger->error("Out of bound access. Size is {}, index is {}", data->getSize(), index);
+            return 0.0f;
+        }
+#endif
+
+        switch(tensor.dataType) {
+            case TensorInfo::DataType::U8F: {
+                uint8_t dataOut = data->getData()[index];
+                return (static_cast<float>(dataOut) - tensor.qpZp) * tensor.qpScale;
+            }
+            case TensorInfo::DataType::I8: {
+                int8_t dataOut = static_cast<int8_t>(data->getData()[index]);
+                return (static_cast<float>(dataOut) - tensor.qpZp) * tensor.qpScale;
+            }
+            case TensorInfo::DataType::INT: {
+                int32_t dataOut = reinterpret_cast<const int32_t*>(data->getData().data())[index / sizeof(int32_t)];
+                return (static_cast<float>(dataOut) - tensor.qpZp) * tensor.qpScale;
+            }
+            case TensorInfo::DataType::FP16: {
+                int16_t dataOut = reinterpret_cast<const int16_t*>(data->getData().data())[index / sizeof(int16_t)];
+                return (fp16_ieee_to_fp32_value(dataOut) - tensor.qpZp) * tensor.qpScale;
+            }
+            case TensorInfo::DataType::FP32: {
+                float dataOut = reinterpret_cast<const float*>(data->getData().data())[index / sizeof(float)];
+                return (dataOut - tensor.qpZp) * tensor.qpScale;
+            }
+            case TensorInfo::DataType::FP64:
+            default: {
+                return 0.0f;
+            }
+        }
+    }
+};
+}  // namespace dai

From 96a92f58cac13c433300945107e82fb1dcf03ab0 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Wed, 5 Nov 2025 10:47:23 +0100
Subject: [PATCH 3/7] Add host side implementation

---
 .../pipeline/node/DetectionParserBindings.cpp |  2 ++
 examples/cpp/DetectionNetwork/CMakeLists.txt  |  2 +-
 .../detection_and_keypoints.cpp               |  1 -
 .../{RVC4 => }/detection_and_segmentation.cpp | 35 ++++++++++++-------
 .../{RVC4 => }/detection_and_segmentation.py  | 14 ++++++--
 tests/CMakeLists.txt                          |  2 +-
 6 files changed, 38 insertions(+), 18 deletions(-)
 rename examples/cpp/DetectionNetwork/{RVC4 => }/detection_and_segmentation.cpp (84%)
 rename examples/python/DetectionNetwork/{RVC4 => }/detection_and_segmentation.py (92%)

diff --git a/bindings/python/src/pipeline/node/DetectionParserBindings.cpp b/bindings/python/src/pipeline/node/DetectionParserBindings.cpp
index 7e5a50c4f..eab544ed4 100644
--- a/bindings/python/src/pipeline/node/DetectionParserBindings.cpp
+++ b/bindings/python/src/pipeline/node/DetectionParserBindings.cpp
@@ -65,11 +65,13 @@ void bind_detectionparser(pybind11::module& m, void* pCallstack) {
             DOC(dai, node, DetectionParser, setAnchors, 2))
         .def("setAnchorMasks", &DetectionParser::setAnchorMasks, py::arg("anchorMasks"), DOC(dai, node, DetectionParser, setAnchorMasks))
         .def("setIouThreshold", &DetectionParser::setIouThreshold, py::arg("thresh"), DOC(dai, node, DetectionParser, setIouThreshold))
+        .def("setRunOnHost", &DetectionParser::setRunOnHost, py::arg("runOnHost"), DOC(dai, node, DetectionParser, setRunOnHost))
         .def("getNumClasses", &DetectionParser::getNumClasses, DOC(dai, node, DetectionParser, getNumClasses))
         .def("getCoordinateSize", &DetectionParser::getCoordinateSize, DOC(dai, node, DetectionParser, getCoordinateSize))
         .def("getAnchors", &DetectionParser::getAnchors, DOC(dai, node, DetectionParser, getAnchors))
         .def("getAnchorMasks", &DetectionParser::getAnchorMasks, DOC(dai, node, DetectionParser, getAnchorMasks))
         .def("getIouThreshold", &DetectionParser::getIouThreshold, DOC(dai, node, DetectionParser, getIouThreshold))
+        .def("runOnHost", &DetectionParser::runOnHost, DOC(dai, node, DetectionParser, runOnHost))
         .def("build", &DetectionParser::build, DOC(dai, node, DetectionParser, build));
     daiNodeModule.attr("DetectionParser").attr("Properties") = detectionParserProperties;
 }
diff --git a/examples/cpp/DetectionNetwork/CMakeLists.txt b/examples/cpp/DetectionNetwork/CMakeLists.txt
index 8193faeb9..8c3ba6ecf 100644
--- a/examples/cpp/DetectionNetwork/CMakeLists.txt
+++ b/examples/cpp/DetectionNetwork/CMakeLists.txt
@@ -23,7 +23,7 @@ dai_set_example_test_labels(detection_network ondevice rvc2_all rvc4 rvc4rgb ci)
 dai_add_example(detection_network_remap detection_network_remap.cpp ON OFF)
 dai_set_example_test_labels(detection_network_remap ondevice rvc2_all rvc4 ci)
 
-dai_add_example(detection_and_segmentation RVC4/detection_and_segmentation.cpp ON OFF)
+dai_add_example(detection_and_segmentation detection_and_segmentation.cpp ON OFF)
 dai_set_example_test_labels(detection_and_segmentation rvc4)
 
 dai_add_example(detection_and_keypoints detection_and_keypoints.cpp ON OFF)
diff --git a/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp
index 667151bb0..f374bdca1 100644
--- a/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp
+++ b/examples/cpp/DetectionNetwork/detection_and_keypoints.cpp
@@ -86,7 +86,6 @@ int main() {
 
             auto currentTime = std::chrono::steady_clock::now();
             float fps = counter / std::chrono::duration<float>(currentTime - startTime).count();
-            std::cout << "FPS: " << fps << std::endl;
         }
 
         if(cv::waitKey(1) == 'q') {
diff --git a/examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
similarity index 84%
rename from examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp
rename to examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
index 4912d04c6..e3e81dcbf 100644
--- a/examples/cpp/DetectionNetwork/RVC4/detection_and_segmentation.cpp
+++ b/examples/cpp/DetectionNetwork/detection_and_segmentation.cpp
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -16,8 +17,16 @@ cv::Rect frameNorm(const cv::Mat& frame, const dai::Point2f& topLeft, const dai::Point2f& bottomRight) {
 }
 
 int main() {
+    std::string modelName = "luxonis/yolov8-instance-segmentation-large:coco-640x352";
+    bool setRunOnHost = false;
+    auto device = std::make_shared<dai::Device>();
+
+    if(device->getPlatformAsString() == "RVC2") {
+        modelName = "luxonis/yolov8-instance-segmentation-nano:coco-512x288";
+        setRunOnHost = true;
+    }
     // Create pipeline
-    dai::Pipeline pipeline;
+    dai::Pipeline pipeline{device};
 
     // Create and configure camera node
     auto cameraNode = pipeline.create<dai::node::Camera>();
@@ -27,8 +36,10 @@ int main() {
 
     auto detectionNetwork = pipeline.create<dai::node::DetectionNetwork>();
     dai::NNModelDescription modelDescription;
-    modelDescription.model = "luxonis/yolov8-instance-segmentation-large:coco-640x480";
+
+    modelDescription.model = modelName;
     detectionNetwork->build(cameraNode, modelDescription);
+    detectionNetwork->detectionParser->setRunOnHost(setRunOnHost);
     auto labelMap = detectionNetwork->getClasses();
 
     // Create output queues
@@ -121,16 +132,18 @@ int main() {
                 detections.begin(), detections.end(), [filteredLabel](const dai::ImgDetection& det) { return det.label != filteredLabel; }),
                 detections.end());
         }
+            if(!segmentationMask.empty()) {
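+                // Visualization only: the LUT below maps detection index k to gray level 25 * k,
+                // so roughly ten instances get visually distinct colors after applyColorMap,
+                // while 255 stays reserved for the untouched background.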
-        cv::Mat scaledMask;
-        cv::LUT(segmentationMask, lut, scaledMask);
+            cv::Mat scaledMask;
+            cv::LUT(segmentationMask, lut, scaledMask);
 
-        cv::Mat coloredMask;
-        cv::applyColorMap(scaledMask, coloredMask, cv::COLORMAP_JET);
-        frame.copyTo(coloredMask, (scaledMask == 255));
-        cv::addWeighted(frame, 0.7, coloredMask, 0.3, 0, frame);
+            cv::Mat coloredMask;
+            cv::applyColorMap(scaledMask, coloredMask, cv::COLORMAP_JET);
+            frame.copyTo(coloredMask, (scaledMask == 255));
+            cv::addWeighted(frame, 0.7, coloredMask, 0.3, 0, frame);
+        }
 
         // Display detections
         for(const auto& detection : detections) {
@@ -157,8 +170,6 @@ int main() {
             cv::imshow("rgb", frame);
 
             auto currentTime = std::chrono::steady_clock::now();
-            float fps = counter / std::chrono::duration<float>(currentTime - startTime).count();
-            std::cout << "FPS: " << fps << std::endl;
         }
     }
 }
diff --git a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py b/examples/python/DetectionNetwork/detection_and_segmentation.py
similarity index 92%
rename from examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py
rename to examples/python/DetectionNetwork/detection_and_segmentation.py
index 650f90f2f..a8ecc74a6 100644
--- a/examples/python/DetectionNetwork/RVC4/detection_and_segmentation.py
+++ b/examples/python/DetectionNetwork/detection_and_segmentation.py
@@ -6,11 +6,19 @@
 import numpy as np
 import time
 
+model_name = "luxonis/yolov8-instance-segmentation-large:coco-640x480"
+setRunOnHost = False
+device = dai.Device()
+if device.getPlatformAsString() == "RVC2":
+    model_name = "luxonis/yolov8-instance-segmentation-nano:coco-512x288"
+    setRunOnHost = True
+
 # Create pipeline
-with dai.Pipeline() as pipeline:
+with dai.Pipeline(device) as pipeline:
     cameraNode = pipeline.create(dai.node.Camera).build()
-    detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-instance-segmentation-nano:coco-512x288"))
-    # detectionNetwork.detectionParser.runOnHost(True)
+
+    detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription(model_name))
+    detectionNetwork.detectionParser.setRunOnHost(setRunOnHost)
     labelMap = detectionNetwork.getClasses()
 
     qRgb = detectionNetwork.passthrough.createOutputQueue()
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index d3823f1a8..f4b0776b2 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -524,7 +524,7 @@ FIRE_VIDEO="${fire_video}"
 KITCHEN_IMAGE_PATH="${kitchen_image}"
 YOLO_V8_INSTANCE_SEGMENTATION_LARGE_COCO_640x352_KITCHEN_SEGMENTATION_GROUND_TRUTH="${yolo_v8_instance_segmentation_large_coco_640x352_kitchen_segmentation_gt}"
 )
-dai_set_test_labels(detection_parser_test ondevice rvc4 ci)
+dai_set_test_labels(detection_parser_test ondevice rvc4 ci onhost)
 
 # Spatial detection network test
 dai_add_test(spatial_detection_network_test src/ondevice_tests/pipeline/node/spatial_detection_network_test.cpp)

From a0dd29a87dcf460a720332847689570829a99707 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Wed, 5 Nov 2025 10:52:36 +0100
Subject: [PATCH 4/7] bump device

---
 cmake/Depthai/DepthaiDeviceSideConfig.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/Depthai/DepthaiDeviceSideConfig.cmake b/cmake/Depthai/DepthaiDeviceSideConfig.cmake
index b0a270a1d..7c6bb3df4 100644
--- a/cmake/Depthai/DepthaiDeviceSideConfig.cmake
+++ b/cmake/Depthai/DepthaiDeviceSideConfig.cmake
@@ -2,7 +2,7 @@
 set(DEPTHAI_DEVICE_SIDE_MATURITY "snapshot")
 
 # "full commit hash of device side binary"
-set(DEPTHAI_DEVICE_SIDE_COMMIT "e658b28655820c649b3bbed9f44865d00139094d")
+set(DEPTHAI_DEVICE_SIDE_COMMIT "8741ce89206d2a5299acc3382c7496e1ee205fcb")
 
 # "version if applicable"
 set(DEPTHAI_DEVICE_SIDE_VERSION "")

From 33752f1f44726f7e977dac9e1f6bd1ff79228da1 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Wed, 5 Nov 2025 12:54:19 +0100
Subject: [PATCH 5/7] bump rvc4

---
 cmake/Depthai/DepthaiDeviceRVC4Config.cmake            | 2 +-
 .../pipeline/node/spatial_location_calculator_test.cpp | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
index 43d640f5e..f6ae0d22b 100644
--- a/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
+++ b/cmake/Depthai/DepthaiDeviceRVC4Config.cmake
@@ -3,4 +3,4 @@
 set(DEPTHAI_DEVICE_RVC4_MATURITY "snapshot")
 
 # "version if applicable"
-set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+53bd364bc4c519e9aa6230b3de4d78a78d073373")
+set(DEPTHAI_DEVICE_RVC4_VERSION "0.0.1+62ce59c3c4a4a53a9b0773fe83dabbecdc4553e9")
diff --git a/tests/src/ondevice_tests/pipeline/node/spatial_location_calculator_test.cpp b/tests/src/ondevice_tests/pipeline/node/spatial_location_calculator_test.cpp
index 1316e8566..0a8ca09b7 100644
--- a/tests/src/ondevice_tests/pipeline/node/spatial_location_calculator_test.cpp
+++ b/tests/src/ondevice_tests/pipeline/node/spatial_location_calculator_test.cpp
@@ -1,6 +1,5 @@
-#include
-
 #include
+#include
 #include
 #include
 #include
@@ -106,7 +105,6 @@ TEST_CASE("SpatialLocationCalculator synthetic depth data test") {
     auto outputQueue = spatial->out.createOutputQueue();
     auto passthroughQueue = spatial->passthroughDepth.createOutputQueue();
-
     std::vector<uint16_t> depthPixels(width * height, 1000);
 
     auto setRegionDepth = [&](const RoiSpec& spec) {
         const int x0 = static_cast<int>(spec.roi.x);
@@ -120,7 +118,7 @@ TEST_CASE("SpatialLocationCalculator synthetic depth data test") {
     for(const auto& spec : roiSpecs) {
         setRegionDepth(spec);
     }
-    
+    // Prepare synthetic depth frame
     auto depthFrame = std::make_shared<dai::ImgFrame>();
     depthFrame->setType(dai::ImgFrame::Type::RAW16);

From 255a8824da8078e10cf8b8d19623a8cdc649daa7 Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Tue, 11 Nov 2025 16:59:50 +0100
Subject: [PATCH 6/7] update parser

---
 .../utilities/DetectionParser/DetectionParserUtils.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
index c1809e847..a9455e551 100644
--- a/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
+++ b/src/pipeline/utilities/DetectionParser/DetectionParserUtils.cpp
@@ -10,7 +10,7 @@
 #include
 #include
 
-#include "depthai/common/KeypointsList.hpp"
+#include "depthai/common/KeypointsListT.hpp"
 #include "depthai/common/RotatedRect.hpp"
 #include "depthai/common/TensorInfo.hpp"
 #include "depthai/pipeline/datatype/ImgDetections.hpp"
@@ -888,7 +888,7 @@ void keypointDecode(std::shared_ptr<NNData> nnData,
             keypoints.push_back(dai::Keypoint{dai::Point2f(x, y), conf});
         }
 
-        outDetections->detections[i].keypoints = KeypointsList(keypoints);
+        outDetections->detections[i].keypoints = KeypointsList(keypoints, properties.parser.keypointEdges);
     }
 }

From 831232c1ac7faf4cb2e181bc74d5067f9aba220f Mon Sep 17 00:00:00 2001
From: aljazkonec1
Date: Tue, 11 Nov 2025 17:57:55 +0100
Subject: [PATCH 7/7] update example

---
 examples/python/DetectionNetwork/detection_and_keypoints.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/examples/python/DetectionNetwork/detection_and_keypoints.py b/examples/python/DetectionNetwork/detection_and_keypoints.py
index b61c41fc9..4459be138 100644
--- a/examples/python/DetectionNetwork/detection_and_keypoints.py
+++ b/examples/python/DetectionNetwork/detection_and_keypoints.py
@@ -7,7 +7,7 @@
 
 # Create pipeline
 with dai.Pipeline() as pipeline:
-    cameraNode = pipeline.create(dai.node.Camera).build()
+    cameraNode = pipeline.create(dai.node.Camera).build(sensorFps=12)
     detectionNetwork = pipeline.create(dai.node.DetectionNetwork).build(cameraNode, dai.NNModelDescription("luxonis/yolov8-nano-pose-estimation:coco-512x288"))
     labelMap = detectionNetwork.getClasses()
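
Reviewer note (not part of the patches themselves): the series exposes host-side YOLO decoding through DetectionParser::setRunOnHost. A minimal Python sketch of the intended usage, relying only on the API introduced in PATCH 3; the platform check and model slugs mirror the updated detection_and_segmentation.py example:

    import depthai as dai

    # Choose the model and parser placement based on the connected platform,
    # as the updated segmentation example does.
    device = dai.Device()
    model_name = "luxonis/yolov8-instance-segmentation-large:coco-640x480"
    run_on_host = False
    if device.getPlatformAsString() == "RVC2":
        model_name = "luxonis/yolov8-instance-segmentation-nano:coco-512x288"
        run_on_host = True  # decode the YOLO head on the host instead of the device

    with dai.Pipeline(device) as pipeline:
        camera = pipeline.create(dai.node.Camera).build()
        nn = pipeline.create(dai.node.DetectionNetwork).build(
            camera, dai.NNModelDescription(model_name))
        nn.detectionParser.setRunOnHost(run_on_host)  # new setter from PATCH 3
        detections = nn.out.createOutputQueue()
        pipeline.start()
        dets = detections.get()  # dai.ImgDetections, decoded on host or device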
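Reviewer note (illustration only, not library code): the NNDataViewer helper added in PATCH 2 reads every supported tensor type through the same affine dequantization, value = (raw - qpZp) * qpScale, with FP16 first widened to FP32. A self-contained numpy sketch of that rule, using a hypothetical helper name:

    import numpy as np

    def dequantize(raw: np.ndarray, zero_point: float, scale: float) -> np.ndarray:
        # Same rule the getValue() switch applies per element:
        # (raw - zeroPoint) * scale, after widening to float32.
        return (raw.astype(np.float32) - zero_point) * scale

    # Example: an INT8 tensor with zero point 3 and scale 0.05.
    raw = np.array([-5, 3, 40], dtype=np.int8)
    print(dequantize(raw, zero_point=3.0, scale=0.05))  # -> [-0.4   0.    1.85]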