From 81d1039b92193f424e5666adb5e766089171cfed Mon Sep 17 00:00:00 2001
From: Stephen Neuendorffer <stephen.neuendorffer@amd.com>
Date: Tue, 23 Jan 2024 08:50:42 -0800
Subject: [PATCH 1/3] dos2unix

---
 .../vision_kernels/addWeighted.cc             | 248 +++----
 .../vision_kernels/bitwiseAND.cc              | 178 ++---
 .../vision_kernels/bitwiseOR.cc               | 178 ++---
 .../vision_kernels/filter2d.cc                | 446 ++++++-------
 .../vision_kernels/gray2rgba.cc               | 164 ++---
 .../vision_kernels/passThrough.cc             | 174 ++---
 .../vision_kernels/rgba2gray.cc               | 214 +++---
 .../vision_kernels/rgba2hue.cc                | 362 +++++-----
 .../vision_kernels/threshold.cc               | 626 +++++++++---------
 9 files changed, 1295 insertions(+), 1295 deletions(-)
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc
index 24d6fb7f25..c6868aad3a 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc
@@ -1,124 +1,124 @@
-//===- add_weighted.cc -------------------------------------------------*- C++
-//-*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2022, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-// #define __AIENGINE__ 1
-#define NOCPP
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define REL_WRITE 0
-#define REL_READ 1
-
-// #include <imgproc/xf_addweighted_aie.hpp> // NOTE: use of float2fix not
-//  supported in aie2
-#include <aie_api/aie.hpp>
-
-const int32_t SRS_SHIFT = 14;
-
-template <typename T, int N, int MAX>
-void addweighted_aie_scalar(const T *in1, const T *in2, T *out,
-                            const int32_t width, const int32_t height,
-                            const int16_t alpha, const int16_t beta,
-                            const T gamma) {
-  for (int i = 0; i < height; i++)
-    for (int j = 0; j < width; j++) {
-      int tmpIn1 = in1[i * width + j] * alpha;
-      int tmpIn2 = in2[i * width + j] * beta;
-      int tmp =
-          ((tmpIn1 + tmpIn2 + (1 << (SRS_SHIFT - 1))) >> SRS_SHIFT) + gamma;
-      tmp = (tmp > MAX) ? MAX : (tmp < 0) ? 0 : tmp; // saturate
-      out[i * width + j] = (T)tmp;
-    }
-}
-
-template <typename T, int N, int MAX>
-void addweighted_aie(const T *src1, const T *src2, T *dst, const int32_t width,
-                     const int32_t height, const int16_t alphaFixedPoint,
-                     const int16_t betaFixedPoint, const T gamma) {
-
-  ::aie::set_saturation(
-      aie::saturation_mode::saturate); // Needed to saturate properly to uint8
-
-  ::aie::vector<int16_t, N> coeff(alphaFixedPoint, betaFixedPoint);
-  ::aie::vector<T, N> gamma_coeff;
-  ::aie::accum<acc32, N> gamma_acc;
-  for (int i = 0; i < N; i++) {
-    gamma_coeff[i] = gamma;
-  }
-  gamma_acc.template from_vector(gamma_coeff, 0);
-  for (int j = 0; j < width * height; j += N) // 16 samples per loop
-    chess_prepare_for_pipelining chess_loop_range(
-        14, ) // loop_range(14) - loop : 1 cycle
-    {
-      ::aie::vector<T, N> data_buf1 = ::aie::load_v<N>(src1);
-      src1 += N;
-      ::aie::vector<T, N> data_buf2 = ::aie::load_v<N>(src2);
-      src2 += N;
-      ::aie::accum<acc32, N> acc = ::aie::accumulate<N>(
-          gamma_acc, coeff, 0, data_buf1,
-          data_buf2); // weight[0] * data_buf1 + weight[1] * data_buf2
-      ::aie::store_v(dst, acc.template to_vector<T>(SRS_SHIFT));
-      dst += N;
-    }
-}
-
-extern "C" {
-
-#if BIT_WIDTH == 8
-void addWeightedLine(uint8_t *in1, uint8_t *in2, uint8_t *out,
-                     int32_t lineWidth, int16_t alpha, int16_t beta,
-                     uint8_t gamma) {
-  addweighted_aie<uint8_t, 32, UINT8_MAX>(in1, in2, out, lineWidth, 1, alpha,
-                                          beta, gamma);
-}
-
-void addWeightedTile(uint8_t *in1, uint8_t *in2, uint8_t *out,
-                     int32_t tileHeight, int32_t tileWidth, int16_t alpha,
-                     int16_t beta, uint8_t gamma) {
-  addweighted_aie<uint8_t, 32, UINT8_MAX>(in1, in2, out, tileWidth, tileHeight,
-                                          alpha, beta, gamma);
-}
-
-#elif BIT_WIDTH == 16
-void addWeightedLine(int16_t *in1, int16_t *in2, int16_t *out,
-                     int32_t lineWidth, int16_t alpha, int16_t beta,
-                     int16_t gamma) {
-  addweighted_aie<int16_t, 16, INT16_MAX>(in1, in2, out, lineWidth, 1, alpha,
-                                          beta, gamma);
-}
-
-void addWeightedTile(int16_t *in1, int16_t *in2, int16_t *out,
-                     int32_t tileHeight, int32_t tileWidth, int16_t alpha,
-                     int16_t beta, int16_t gamma) {
-  addweighted_aie<int16_t, 16, INT16_MAX>(in1, in2, out, tileWidth, tileHeight,
-                                          alpha, beta, gamma);
-}
-
-#else // 32
-
-void addWeightedLine(int32_t *in1, int32_t *in2, int32_t *out,
-                     int32_t lineWidth, int16_t alpha, int16_t beta,
-                     int32_t gamma) {
-  addweighted_aie<int32_t, 16, INT32_MAX>(in1, in2, out, lineWidth, 1, alpha,
-                                          beta, gamma);
-}
-
-void addWeightedTile(int32_t *in1, int32_t *in2, int32_t *out,
-                     int32_t tileHeight, int32_t tileWidth, int16_t alpha,
-                     int16_t beta, int32_t gamma) {
-  addweighted_aie<int32_t, 16, INT32_MAX>(in1, in2, out, tileWidth, tileHeight,
-                                          alpha, beta, gamma);
-}
-
-#endif
-} // extern "C"
+//===- add_weighted.cc -------------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// #define __AIENGINE__ 1
+#define NOCPP
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+// #include <imgproc/xf_addweighted_aie.hpp> // NOTE: use of float2fix not
+//  supported in aie2
+#include <aie_api/aie.hpp>
+
+const int32_t SRS_SHIFT = 14;
+
+template <typename T, int N, int MAX>
+void addweighted_aie_scalar(const T *in1, const T *in2, T *out,
+                            const int32_t width, const int32_t height,
+                            const int16_t alpha, const int16_t beta,
+                            const T gamma) { // scalar version; N is unused
+  for (int i = 0; i < height; i++) // rows
+    for (int j = 0; j < width; j++) { // columns, row-major indexing
+      int tmpIn1 = in1[i * width + j] * alpha; // product widened to int
+      int tmpIn2 = in2[i * width + j] * beta;
+      int tmp = // add half an LSB to round, drop SRS_SHIFT bits, add gamma
+          ((tmpIn1 + tmpIn2 + (1 << (SRS_SHIFT - 1))) >> SRS_SHIFT) + gamma;
+      tmp = (tmp > MAX) ? MAX : (tmp < 0) ? 0 : tmp; // saturate to [0, MAX]
+      out[i * width + j] = (T)tmp;
+    }
+}
+
+template <typename T, int N, int MAX>
+void addweighted_aie(const T *src1, const T *src2, T *dst, const int32_t width,
+                     const int32_t height, const int16_t alphaFixedPoint,
+                     const int16_t betaFixedPoint, const T gamma) {
+  // Vectorized weighted sum of src1 and src2, N elements per iteration.
+  ::aie::set_saturation(
+      aie::saturation_mode::saturate); // Needed to saturate properly to uint8
+
+  ::aie::vector<int16_t, N> coeff(alphaFixedPoint, betaFixedPoint);
+  ::aie::vector<T, N> gamma_coeff;
+  ::aie::accum<acc32, N> gamma_acc;
+  for (int i = 0; i < N; i++) {
+    gamma_coeff[i] = gamma; // same gamma in every lane
+  }
+  gamma_acc.template from_vector(gamma_coeff, 0); // NOTE(review): shift 0, output >> SRS_SHIFT; confirm
+  for (int j = 0; j < width * height; j += N) // N samples per loop
+    chess_prepare_for_pipelining chess_loop_range(
+        14, ) // loop_range(14) - loop : 1 cycle
+    {
+      ::aie::vector<T, N> data_buf1 = ::aie::load_v<N>(src1);
+      src1 += N;
+      ::aie::vector<T, N> data_buf2 = ::aie::load_v<N>(src2);
+      src2 += N;
+      ::aie::accum<acc32, N> acc = ::aie::accumulate<N>(
+          gamma_acc, coeff, 0, data_buf1,
+          data_buf2); // gamma_acc + coeff[0] * data_buf1 + coeff[1] * data_buf2
+      ::aie::store_v(dst, acc.template to_vector<T>(SRS_SHIFT));
+      dst += N;
+    }
+}
+
+extern "C" {
+
+#if BIT_WIDTH == 8
+void addWeightedLine(uint8_t *in1, uint8_t *in2, uint8_t *out,
+                     int32_t lineWidth, int16_t alpha, int16_t beta,
+                     uint8_t gamma) {
+  addweighted_aie<uint8_t, 32, UINT8_MAX>(in1, in2, out, lineWidth, 1, alpha,
+                                          beta, gamma);
+}
+
+void addWeightedTile(uint8_t *in1, uint8_t *in2, uint8_t *out,
+                     int32_t tileHeight, int32_t tileWidth, int16_t alpha,
+                     int16_t beta, uint8_t gamma) {
+  addweighted_aie<uint8_t, 32, UINT8_MAX>(in1, in2, out, tileWidth, tileHeight,
+                                          alpha, beta, gamma);
+}
+
+#elif BIT_WIDTH == 16
+void addWeightedLine(int16_t *in1, int16_t *in2, int16_t *out,
+                     int32_t lineWidth, int16_t alpha, int16_t beta,
+                     int16_t gamma) {
+  addweighted_aie<int16_t, 16, INT16_MAX>(in1, in2, out, lineWidth, 1, alpha,
+                                          beta, gamma);
+}
+
+void addWeightedTile(int16_t *in1, int16_t *in2, int16_t *out,
+                     int32_t tileHeight, int32_t tileWidth, int16_t alpha,
+                     int16_t beta, int16_t gamma) {
+  addweighted_aie<int16_t, 16, INT16_MAX>(in1, in2, out, tileWidth, tileHeight,
+                                          alpha, beta, gamma);
+}
+
+#else // 32
+
+void addWeightedLine(int32_t *in1, int32_t *in2, int32_t *out,
+                     int32_t lineWidth, int16_t alpha, int16_t beta,
+                     int32_t gamma) {
+  addweighted_aie<int32_t, 16, INT32_MAX>(in1, in2, out, lineWidth, 1, alpha,
+                                          beta, gamma);
+}
+
+void addWeightedTile(int32_t *in1, int32_t *in2, int32_t *out,
+                     int32_t tileHeight, int32_t tileWidth, int16_t alpha,
+                     int16_t beta, int32_t gamma) {
+  addweighted_aie<int32_t, 16, INT32_MAX>(in1, in2, out, tileWidth, tileHeight,
+                                          alpha, beta, gamma);
+}
+
+#endif
+} // extern "C"
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc
index 577ce10659..e3cd871195 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc
@@ -1,89 +1,89 @@
-//===- bitwisaAND.cc --------------------------------------------*- C++ -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2023, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-// #define __AIENGINE__ 1
-#define NOCPP
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define REL_WRITE 0
-#define REL_READ 1
-
-#include <aie_api/aie.hpp>
-
-template <typename T, int N>
-void bitwiseAND_aie_scalar(const T *in1, const T *in2, T *out,
-                           const int32_t width, const int32_t height) {
-  for (int i = 0; i < height; i++)
-    for (int j = 0; j < width; j++)
-      out[i * width + j] = in1[i * width + j] & in2[i * width + j];
-}
-
-template <typename T, int N>
-void bitwiseAND_aie(const T *src1, const T *src2, T *dst, const int32_t width,
-                    const int32_t height) {
-
-  for (int j = 0; j < width * height; j += N)
-    chess_prepare_for_pipelining chess_loop_range(
-        14, ) // loop_range(14) - loop : 1 cycle
-    {
-      ::aie::vector<T, N> in1 = ::aie::load_v<N>(src1);
-      src1 += N;
-      ::aie::vector<T, N> in2 = ::aie::load_v<N>(src2);
-      src2 += N;
-      ::aie::vector<T, N> out;
-
-      out = ::aie::bit_and(in1, in2);
-
-      ::aie::store_v(dst, out);
-      dst += N;
-    }
-}
-
-extern "C" {
-
-#if BIT_WIDTH == 8
-void bitwiseANDLine(uint8_t *in1, uint8_t *in2, uint8_t *out,
-                    int32_t lineWidth) {
-  bitwiseAND_aie<uint8_t, 64>(in1, in2, out, lineWidth, 1);
-}
-
-void bitwiseANDTile(uint8_t *in1, uint8_t *in2, uint8_t *out,
-                    int32_t tileHeight, int32_t tileWidth) {
-  bitwiseAND_aie<uint8_t, 64>(in1, in2, out, tileWidth, tileHeight);
-}
-
-#elif BIT_WIDTH == 16
-void bitwiseANDLine(int16_t *in1, int16_t *in2, int16_t *out,
-                    int32_t lineWidth) {
-  bitwiseAND_aie<int16_t, 32>(in1, in2, out, lineWidth, 1);
-}
-
-void bitwiseANDTile(int16_t *in1, int16_t *in2, int16_t *out,
-                    int32_t tileHeight, int32_t tileWidth) {
-  bitwiseAND_aie<int16_t, 32>(in1, in2, out, tileWidth, tileHeight);
-}
-
-#else // 32
-
-void bitwiseANDLine(int32_t *in1, int32_t *in2, int32_t *out,
-                    int32_t lineWidth) {
-  bitwiseAND_aie<int32_t, 16>(in1, in2, out, lineWidth);
-}
-
-void bitwiseANDTile(int32_t *in1, int32_t *in2, int32_t *out,
-                    int32_t tileHeight, int32_t tileWidth) {
-  bitwiseAND_aie<int32_t, 16>(in1, in2, out, tileWidth, tileHeight);
-}
-
-#endif
-} // extern "C"
+//===- bitwiseAND.cc --------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// #define __AIENGINE__ 1
+#define NOCPP
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#include <aie_api/aie.hpp>
+
+template <typename T, int N>
+void bitwiseAND_aie_scalar(const T *in1, const T *in2, T *out,
+                           const int32_t width, const int32_t height) {
+  for (int i = 0; i < height; i++)
+    for (int j = 0; j < width; j++)
+      out[i * width + j] = in1[i * width + j] & in2[i * width + j];
+}
+
+template <typename T, int N>
+void bitwiseAND_aie(const T *src1, const T *src2, T *dst, const int32_t width,
+                    const int32_t height) {
+
+  for (int j = 0; j < width * height; j += N)
+    chess_prepare_for_pipelining chess_loop_range(
+        14, ) // loop_range(14) - loop : 1 cycle
+    {
+      ::aie::vector<T, N> in1 = ::aie::load_v<N>(src1);
+      src1 += N;
+      ::aie::vector<T, N> in2 = ::aie::load_v<N>(src2);
+      src2 += N;
+      ::aie::vector<T, N> out;
+
+      out = ::aie::bit_and(in1, in2);
+
+      ::aie::store_v(dst, out);
+      dst += N;
+    }
+}
+
+extern "C" {
+
+#if BIT_WIDTH == 8
+void bitwiseANDLine(uint8_t *in1, uint8_t *in2, uint8_t *out,
+                    int32_t lineWidth) {
+  bitwiseAND_aie<uint8_t, 64>(in1, in2, out, lineWidth, 1);
+}
+
+void bitwiseANDTile(uint8_t *in1, uint8_t *in2, uint8_t *out,
+                    int32_t tileHeight, int32_t tileWidth) {
+  bitwiseAND_aie<uint8_t, 64>(in1, in2, out, tileWidth, tileHeight);
+}
+
+#elif BIT_WIDTH == 16
+void bitwiseANDLine(int16_t *in1, int16_t *in2, int16_t *out,
+                    int32_t lineWidth) {
+  bitwiseAND_aie<int16_t, 32>(in1, in2, out, lineWidth, 1);
+}
+
+void bitwiseANDTile(int16_t *in1, int16_t *in2, int16_t *out,
+                    int32_t tileHeight, int32_t tileWidth) {
+  bitwiseAND_aie<int16_t, 32>(in1, in2, out, tileWidth, tileHeight);
+}
+
+#else // 32
+
+void bitwiseANDLine(int32_t *in1, int32_t *in2, int32_t *out,
+                    int32_t lineWidth) { // AND one row of lineWidth elements
+  bitwiseAND_aie<int32_t, 16>(in1, in2, out, lineWidth, 1); // height = 1
+}
+
+void bitwiseANDTile(int32_t *in1, int32_t *in2, int32_t *out,
+                    int32_t tileHeight, int32_t tileWidth) {
+  bitwiseAND_aie<int32_t, 16>(in1, in2, out, tileWidth, tileHeight);
+}
+
+#endif
+} // extern "C"
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc
index 520efbaf70..e02ec472f5 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc
@@ -1,89 +1,89 @@
-//===- bitwiseOR.cc ---------------------------------------------*- C++ -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2023, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-// #define __AIENGINE__ 1
-#define NOCPP
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define REL_WRITE 0
-#define REL_READ 1
-
-#include <aie_api/aie.hpp>
-
-template <typename T, int N>
-void bitwiseOR_aie_scalar(const T *in1, const T *in2, T *out,
-                          const int32_t width, const int32_t height) {
-  for (int i = 0; i < height; i++)
-    for (int j = 0; j < width; j++)
-      out[i * width + j] = in1[i * width + j] | in2[i * width + j];
-}
-
-template <typename T, int N>
-void bitwiseOR_aie(const T *src1, const T *src2, T *dst, const int32_t width,
-                   const int32_t height) {
-
-  for (int j = 0; j < width * height; j += N)
-    chess_prepare_for_pipelining chess_loop_range(
-        14, ) // loop_range(14) - loop : 1 cycle
-    {
-      ::aie::vector<T, N> in1 = ::aie::load_v<N>(src1);
-      src1 += N;
-      ::aie::vector<T, N> in2 = ::aie::load_v<N>(src2);
-      src2 += N;
-      ::aie::vector<T, N> out;
-
-      out = ::aie::bit_or(in1, in2);
-
-      ::aie::store_v(dst, out);
-      dst += N;
-    }
-}
-
-extern "C" {
-
-#if BIT_WIDTH == 8
-void bitwiseORLine(uint8_t *in1, uint8_t *in2, uint8_t *out,
-                   int32_t lineWidth) {
-  bitwiseOR_aie<uint8_t, 64>(in1, in2, out, lineWidth, 1);
-}
-
-void bitwiseORTile(uint8_t *in1, uint8_t *in2, uint8_t *out, int32_t tileHeight,
-                   int32_t tileWidth) {
-  bitwiseOR_aie<uint8_t, 64>(in1, in2, out, tileWidth, tileHeight);
-}
-
-#elif BIT_WIDTH == 16
-void bitwiseORLine(int16_t *in1, int16_t *in2, int16_t *out,
-                   int32_t lineWidth) {
-  bitwiseOR_aie<int16_t, 32>(in1, in2, out, lineWidth, 1);
-}
-
-void bitwiseORTile(int16_t *in1, int16_t *in2, int16_t *out, int32_t tileHeight,
-                   int32_t tileWidth) {
-  bitwiseOR_aie<int16_t, 32>(in1, in2, out, tileWidth, tileHeight);
-}
-
-#else // 32
-
-void bitwiseORLine(int32_t *in1, int32_t *in2, int32_t *out,
-                   int32_t lineWidth) {
-  bitwiseOR_aie<int32_t, 16>(in1, in2, out, lineWidth);
-}
-
-void bitwiseORTile(int32_t *in1, int32_t *in2, int32_t *out, int32_t tileHeight,
-                   int32_t tileWidth) {
-  bitwiseOR_aie<int32_t, 16>(in1, in2, out, tileWidth, tileHeight);
-}
-
-#endif
-} // extern "C"
+//===- bitwiseOR.cc ---------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// #define __AIENGINE__ 1
+#define NOCPP
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#include <aie_api/aie.hpp>
+
+template <typename T, int N>
+void bitwiseOR_aie_scalar(const T *in1, const T *in2, T *out,
+                          const int32_t width, const int32_t height) {
+  for (int i = 0; i < height; i++)
+    for (int j = 0; j < width; j++)
+      out[i * width + j] = in1[i * width + j] | in2[i * width + j];
+}
+
+template <typename T, int N>
+void bitwiseOR_aie(const T *src1, const T *src2, T *dst, const int32_t width,
+                   const int32_t height) {
+
+  for (int j = 0; j < width * height; j += N)
+    chess_prepare_for_pipelining chess_loop_range(
+        14, ) // loop_range(14) - loop : 1 cycle
+    {
+      ::aie::vector<T, N> in1 = ::aie::load_v<N>(src1);
+      src1 += N;
+      ::aie::vector<T, N> in2 = ::aie::load_v<N>(src2);
+      src2 += N;
+      ::aie::vector<T, N> out;
+
+      out = ::aie::bit_or(in1, in2);
+
+      ::aie::store_v(dst, out);
+      dst += N;
+    }
+}
+
+extern "C" {
+
+#if BIT_WIDTH == 8
+void bitwiseORLine(uint8_t *in1, uint8_t *in2, uint8_t *out,
+                   int32_t lineWidth) {
+  bitwiseOR_aie<uint8_t, 64>(in1, in2, out, lineWidth, 1);
+}
+
+void bitwiseORTile(uint8_t *in1, uint8_t *in2, uint8_t *out, int32_t tileHeight,
+                   int32_t tileWidth) {
+  bitwiseOR_aie<uint8_t, 64>(in1, in2, out, tileWidth, tileHeight);
+}
+
+#elif BIT_WIDTH == 16
+void bitwiseORLine(int16_t *in1, int16_t *in2, int16_t *out,
+                   int32_t lineWidth) {
+  bitwiseOR_aie<int16_t, 32>(in1, in2, out, lineWidth, 1);
+}
+
+void bitwiseORTile(int16_t *in1, int16_t *in2, int16_t *out, int32_t tileHeight,
+                   int32_t tileWidth) {
+  bitwiseOR_aie<int16_t, 32>(in1, in2, out, tileWidth, tileHeight);
+}
+
+#else // 32
+
+void bitwiseORLine(int32_t *in1, int32_t *in2, int32_t *out,
+                   int32_t lineWidth) { // OR one row of lineWidth elements
+  bitwiseOR_aie<int32_t, 16>(in1, in2, out, lineWidth, 1); // height = 1
+}
+
+void bitwiseORTile(int32_t *in1, int32_t *in2, int32_t *out, int32_t tileHeight,
+                   int32_t tileWidth) {
+  bitwiseOR_aie<int32_t, 16>(in1, in2, out, tileWidth, tileHeight);
+}
+
+#endif
+} // extern "C"
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc
index 1bb7302982..ccbbb87c6f 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc
@@ -1,223 +1,223 @@
-//===- filter2d.cc ----------------------------------------------*- C++ -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2022, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-// #define __AIENGINE__ 1
-#define NOCPP
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define REL_WRITE 0
-#define REL_READ 1
-
-#define THRESH_TYPE XF_THRESHOLD_TYPE_BINARY
-
-#include <aie_api/aie.hpp>
-
-const int32_t SRS_SHIFT = 12;
-
-void filter2d_3lines_aie_scalar(uint8_t *lineIn0, uint8_t *lineIn1,
-                                uint8_t *lineIn2, uint8_t *output,
-                                const int32_t width, int16_t *kernel) {
-
-  int32_t acc;
-
-  // left of line, border extension by mirroring
-  acc = 0;
-  acc += ((int32_t)lineIn0[0]) * kernel[0 * 3 + 0];
-  acc += ((int32_t)lineIn1[0]) * kernel[1 * 3 + 0];
-  acc += ((int32_t)lineIn2[0]) * kernel[2 * 3 + 0];
-
-  for (int ki = 1; ki < 3; ki++) {
-    acc += ((int32_t)lineIn0[0 + ki - 1]) * kernel[0 * 3 + ki];
-    acc += ((int32_t)lineIn1[0 + ki - 1]) * kernel[1 * 3 + ki];
-    acc += ((int32_t)lineIn2[0 + ki - 1]) * kernel[2 * 3 + ki];
-  }
-  acc = ((acc + (1 << (SRS_SHIFT - 1))) >> SRS_SHIFT);
-  acc = (acc > UINT8_MAX) ? UINT8_MAX : (acc < 0) ? 0 : acc; // saturate
-  output[0] = (uint8_t)acc;
-
-  // middle of line, no border extension needed
-  for (int i = 1; i < width - 1; i++) {
-    acc = 0;
-    for (int ki = 0; ki < 3; ki++) {
-      acc += ((int32_t)lineIn0[i + ki - 1]) * kernel[0 * 3 + ki];
-      acc += ((int32_t)lineIn1[i + ki - 1]) * kernel[1 * 3 + ki];
-      acc += ((int32_t)lineIn2[i + ki - 1]) * kernel[2 * 3 + ki];
-    }
-    acc = ((acc + (1 << (SRS_SHIFT - 1))) >> SRS_SHIFT);
-    acc = (acc > UINT8_MAX) ? UINT8_MAX : (acc < 0) ? 0 : acc; // saturate
-    output[i] = (uint8_t)acc;
-  }
-
-  // right of line, border extension by mirroring
-  acc = 0;
-  for (int ki = 0; ki < 2; ki++) {
-    acc += ((int32_t)lineIn0[width + ki - 2]) * kernel[0 * 3 + ki];
-    acc += ((int32_t)lineIn1[width + ki - 2]) * kernel[1 * 3 + ki];
-    acc += ((int32_t)lineIn2[width + ki - 2]) * kernel[2 * 3 + ki];
-  }
-
-  acc += ((int32_t)lineIn0[width - 1]) * kernel[0 * 3 + 2];
-  acc += ((int32_t)lineIn1[width - 1]) * kernel[1 * 3 + 2];
-  acc += ((int32_t)lineIn2[width - 1]) * kernel[2 * 3 + 2];
-  acc = ((acc + (1 << (SRS_SHIFT - 1))) >> SRS_SHIFT);
-  acc = (acc > UINT8_MAX) ? UINT8_MAX : (acc < 0) ? 0 : acc; // saturate
-  output[width - 1] = (uint8_t)acc;
-}
-
-#define KERNEL_WIDTH 3
-
-constexpr unsigned VecFactor = 32;
-
-constexpr unsigned Lanes = 32; // Parallel vector output lanes
-constexpr unsigned Points = 8; // Columns where data in summed togther
-constexpr unsigned CoeffStep = 1;
-constexpr unsigned DataStepXY = 1;
-
-using mul_ops =
-    aie::sliding_mul_xy_ops<Lanes, Points, CoeffStep, DataStepXY, int8, uint8>;
-
-void filter2d_3lines_aie(uint8_t *lineIn0, uint8_t *lineIn1, uint8_t *lineIn2,
-                         uint8_t *output, const int32_t width,
-                         int16_t *kernel) {
-
-  set_sat(); // Needed for int16 to saturate properly to uint8
-
-  aie::vector<uint8, 64> data_buf1, data_buf2, data_buf3;
-  aie::vector<uint8, 64> prev_buf1, prev_buf2, prev_buf3;
-  aie::vector<uint8, 64> zero_buf = ::aie::zeros<uint8, 64>();
-  aie::vector<int8, 32> kernel_vec;
-
-  const uint32_t kernel_side = KERNEL_WIDTH / 2;
-
-  for (int j = 0; j < KERNEL_WIDTH; j++) {
-    for (int i = 0; i < KERNEL_WIDTH; i++) {
-      kernel_vec[j * Points + i] =
-          (int8_t)((*kernel) >> 8); // int16 to int8 shift
-      kernel++;
-    }
-    for (int i2 = 0; i2 < Points - KERNEL_WIDTH; i2++) {
-      kernel_vec[j * Points + KERNEL_WIDTH + i2] = 0;
-    }
-  }
-
-  // left of line, border extension by mirroring
-  // first kernel row
-  data_buf1.insert(0, aie::load_v<32>(lineIn0));
-  lineIn0 += VecFactor;
-  data_buf1.insert(1, aie::load_v<32>(lineIn0));
-  prev_buf1.insert(1, data_buf1.template extract<32>(0));
-  data_buf1 = ::aie::shuffle_up_replicate(data_buf1, kernel_side);
-  auto acc = mul_ops::mul(kernel_vec, 0, data_buf1, 0);
-
-  // second kernel row
-  data_buf2.insert(0, aie::load_v<32>(lineIn1));
-  lineIn1 += VecFactor;
-  data_buf2.insert(1, aie::load_v<32>(lineIn1));
-  prev_buf2.insert(1, data_buf2.template extract<32>(0));
-  data_buf2 = ::aie::shuffle_up_replicate(data_buf2, kernel_side);
-  acc = mul_ops::mac(acc, kernel_vec, Points, data_buf2, 0);
-
-  // third kernel row
-  data_buf3.insert(0, aie::load_v<32>(lineIn2));
-  lineIn2 += VecFactor;
-  data_buf3.insert(1, aie::load_v<32>(lineIn2));
-  prev_buf3.insert(1, data_buf3.template extract<32>(0));
-  data_buf3 = ::aie::shuffle_up_replicate(data_buf3, kernel_side);
-  acc = mul_ops::mac(acc, kernel_vec, 2 * Points, data_buf3, 0);
-
-  // Store result
-  ::aie::store_v(output, acc.to_vector<uint8>(SRS_SHIFT - 8));
-  output += VecFactor;
-
-  // middle of line, no border extension needed
-  for (int i = 2 * VecFactor; i < width - 1; i += VecFactor) {
-    // first kernel row
-    data_buf1.insert(0, aie::load_v<32>(lineIn0));
-    lineIn0 += VecFactor;
-    data_buf1.insert(1, aie::load_v<32>(lineIn0));
-    data_buf1 = ::aie::shuffle_up_fill(data_buf1, prev_buf1, kernel_side);
-    prev_buf1.insert(1, data_buf1.template extract<32>(0));
-    acc = mul_ops::mul(kernel_vec, 0, data_buf1, 0);
-
-    // second kernel row
-    data_buf2.insert(0, aie::load_v<32>(lineIn1));
-    lineIn1 += VecFactor;
-    data_buf2.insert(1, aie::load_v<32>(lineIn1));
-    data_buf2 = ::aie::shuffle_up_fill(data_buf2, prev_buf2, kernel_side);
-    prev_buf2.insert(1, data_buf2.template extract<32>(0));
-    acc = mul_ops::mac(acc, kernel_vec, Points, data_buf2, 0);
-
-    // third kernel row
-    data_buf3.insert(0, aie::load_v<32>(lineIn2));
-    lineIn2 += VecFactor;
-    data_buf3.insert(1, aie::load_v<32>(lineIn2));
-    data_buf3 = ::aie::shuffle_up_fill(data_buf3, prev_buf3, kernel_side);
-    prev_buf3.insert(1, data_buf3.template extract<32>(0));
-    acc = mul_ops::mac(acc, kernel_vec, 2 * Points, data_buf3, 0);
-
-    // Store result
-    ::aie::store_v(output, acc.to_vector<uint8>(SRS_SHIFT - 8));
-    output += VecFactor;
-  }
-
-  // right of line, border extension by mirroring
-  // first kernel row
-  data_buf1.insert(1, aie::load_v<32>(lineIn0));
-  data_buf1 = ::aie::shuffle_down_replicate(data_buf1, 32);
-  data_buf1 = ::aie::shuffle_up_fill(data_buf1, prev_buf1, kernel_side);
-  acc = mul_ops::mul(kernel_vec, 0, data_buf1, 0);
-
-  // second kernel row
-  data_buf2.insert(1, aie::load_v<32>(lineIn1));
-  data_buf2 = ::aie::shuffle_down_replicate(data_buf2, 32);
-  data_buf2 = ::aie::shuffle_up_fill(data_buf2, prev_buf2, kernel_side);
-  acc = mul_ops::mac(acc, kernel_vec, Points, data_buf2, 0);
-
-  // third kernel row
-  data_buf3.insert(1, aie::load_v<32>(lineIn2));
-  lineIn2 += VecFactor;
-  data_buf3 = ::aie::shuffle_down_replicate(data_buf3, 32);
-  data_buf3 = ::aie::shuffle_up_fill(data_buf3, prev_buf3, kernel_side);
-  acc = mul_ops::mac(acc, kernel_vec, 2 * Points, data_buf3, 0);
-
-  // Store result
-  ::aie::store_v(output, acc.to_vector<uint8>(SRS_SHIFT - 8));
-  output += VecFactor;
-}
-
-extern "C" {
-
-// #if BIT_WIDTH == 8
-
-void filter2dLine(uint8_t *lineIn0, uint8_t *lineIn1, uint8_t *lineIn2,
-                  uint8_t *out, int32_t lineWidth, int16_t *filterKernel) {
-  filter2d_3lines_aie(lineIn0, lineIn1, lineIn2, out, lineWidth, filterKernel);
-}
-
-/* #elif BIT_WIDTH == 16
-
-void filter2dLine(int16_t *in, int16_t *out, int32_t lineWidth, int16_t
-filter2dValue, int16_t maxValue) { filter2d_3lines_aie<int16_t, 32>(in, out,
-lineWidth, 1, filter2dValue, maxValue);
-}
-
-#else // 32
-
-void filter2dLine(int32_t *in, int32_t *out, int32_t lineWidth, int32_t
-filter2dValue, int32_t maxValue) { filter2d_3lines_aie<int32_t, 16>(in, out,
-lineWidth, 1, filter2dValue, maxValue);
-}
-
-#endif */
-
-} // extern "C"
+//===- filter2d.cc ----------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// #define __AIENGINE__ 1
+#define NOCPP
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#define THRESH_TYPE XF_THRESHOLD_TYPE_BINARY
+
+#include <aie_api/aie.hpp>
+
+const int32_t SRS_SHIFT = 12;
+
+void filter2d_3lines_aie_scalar(uint8_t *lineIn0, uint8_t *lineIn1,
+                                uint8_t *lineIn2, uint8_t *output,
+                                const int32_t width, int16_t *kernel) {
+
+  int32_t acc;
+
+  // left of line, border extension by mirroring
+  acc = 0;
+  acc += ((int32_t)lineIn0[0]) * kernel[0 * 3 + 0];
+  acc += ((int32_t)lineIn1[0]) * kernel[1 * 3 + 0];
+  acc += ((int32_t)lineIn2[0]) * kernel[2 * 3 + 0];
+
+  for (int ki = 1; ki < 3; ki++) {
+    acc += ((int32_t)lineIn0[0 + ki - 1]) * kernel[0 * 3 + ki];
+    acc += ((int32_t)lineIn1[0 + ki - 1]) * kernel[1 * 3 + ki];
+    acc += ((int32_t)lineIn2[0 + ki - 1]) * kernel[2 * 3 + ki];
+  }
+  acc = ((acc + (1 << (SRS_SHIFT - 1))) >> SRS_SHIFT);
+  acc = (acc > UINT8_MAX) ? UINT8_MAX : (acc < 0) ? 0 : acc; // saturate
+  output[0] = (uint8_t)acc;
+
+  // middle of line, no border extension needed
+  for (int i = 1; i < width - 1; i++) {
+    acc = 0;
+    for (int ki = 0; ki < 3; ki++) {
+      acc += ((int32_t)lineIn0[i + ki - 1]) * kernel[0 * 3 + ki];
+      acc += ((int32_t)lineIn1[i + ki - 1]) * kernel[1 * 3 + ki];
+      acc += ((int32_t)lineIn2[i + ki - 1]) * kernel[2 * 3 + ki];
+    }
+    acc = ((acc + (1 << (SRS_SHIFT - 1))) >> SRS_SHIFT);
+    acc = (acc > UINT8_MAX) ? UINT8_MAX : (acc < 0) ? 0 : acc; // saturate
+    output[i] = (uint8_t)acc;
+  }
+
+  // right of line, border extension by mirroring
+  acc = 0;
+  for (int ki = 0; ki < 2; ki++) {
+    acc += ((int32_t)lineIn0[width + ki - 2]) * kernel[0 * 3 + ki];
+    acc += ((int32_t)lineIn1[width + ki - 2]) * kernel[1 * 3 + ki];
+    acc += ((int32_t)lineIn2[width + ki - 2]) * kernel[2 * 3 + ki];
+  }
+
+  acc += ((int32_t)lineIn0[width - 1]) * kernel[0 * 3 + 2];
+  acc += ((int32_t)lineIn1[width - 1]) * kernel[1 * 3 + 2];
+  acc += ((int32_t)lineIn2[width - 1]) * kernel[2 * 3 + 2];
+  acc = ((acc + (1 << (SRS_SHIFT - 1))) >> SRS_SHIFT);
+  acc = (acc > UINT8_MAX) ? UINT8_MAX : (acc < 0) ? 0 : acc; // saturate
+  output[width - 1] = (uint8_t)acc;
+}
+
+#define KERNEL_WIDTH 3
+
+constexpr unsigned VecFactor = 32;
+
+constexpr unsigned Lanes = 32; // Parallel vector output lanes
+constexpr unsigned Points = 8; // Columns where data in summed togther
+constexpr unsigned CoeffStep = 1;
+constexpr unsigned DataStepXY = 1;
+
+using mul_ops =
+    aie::sliding_mul_xy_ops<Lanes, Points, CoeffStep, DataStepXY, int8, uint8>;
+
+void filter2d_3lines_aie(uint8_t *lineIn0, uint8_t *lineIn1, uint8_t *lineIn2,
+                         uint8_t *output, const int32_t width,
+                         int16_t *kernel) {
+
+  set_sat(); // Needed for int16 to saturate properly to uint8
+
+  aie::vector<uint8, 64> data_buf1, data_buf2, data_buf3;
+  aie::vector<uint8, 64> prev_buf1, prev_buf2, prev_buf3;
+  aie::vector<uint8, 64> zero_buf = ::aie::zeros<uint8, 64>();
+  aie::vector<int8, 32> kernel_vec;
+
+  const uint32_t kernel_side = KERNEL_WIDTH / 2;
+
+  for (int j = 0; j < KERNEL_WIDTH; j++) {
+    for (int i = 0; i < KERNEL_WIDTH; i++) {
+      kernel_vec[j * Points + i] =
+          (int8_t)((*kernel) >> 8); // int16 to int8 shift
+      kernel++;
+    }
+    for (int i2 = 0; i2 < Points - KERNEL_WIDTH; i2++) {
+      kernel_vec[j * Points + KERNEL_WIDTH + i2] = 0;
+    }
+  }
+
+  // left of line, border extension by mirroring
+  // first kernel row
+  data_buf1.insert(0, aie::load_v<32>(lineIn0));
+  lineIn0 += VecFactor;
+  data_buf1.insert(1, aie::load_v<32>(lineIn0));
+  prev_buf1.insert(1, data_buf1.template extract<32>(0));
+  data_buf1 = ::aie::shuffle_up_replicate(data_buf1, kernel_side);
+  auto acc = mul_ops::mul(kernel_vec, 0, data_buf1, 0);
+
+  // second kernel row
+  data_buf2.insert(0, aie::load_v<32>(lineIn1));
+  lineIn1 += VecFactor;
+  data_buf2.insert(1, aie::load_v<32>(lineIn1));
+  prev_buf2.insert(1, data_buf2.template extract<32>(0));
+  data_buf2 = ::aie::shuffle_up_replicate(data_buf2, kernel_side);
+  acc = mul_ops::mac(acc, kernel_vec, Points, data_buf2, 0);
+
+  // third kernel row
+  data_buf3.insert(0, aie::load_v<32>(lineIn2));
+  lineIn2 += VecFactor;
+  data_buf3.insert(1, aie::load_v<32>(lineIn2));
+  prev_buf3.insert(1, data_buf3.template extract<32>(0));
+  data_buf3 = ::aie::shuffle_up_replicate(data_buf3, kernel_side);
+  acc = mul_ops::mac(acc, kernel_vec, 2 * Points, data_buf3, 0);
+
+  // Store result
+  ::aie::store_v(output, acc.to_vector<uint8>(SRS_SHIFT - 8));
+  output += VecFactor;
+
+  // middle of line, no border extension needed
+  for (int i = 2 * VecFactor; i < width - 1; i += VecFactor) {
+    // first kernel row
+    data_buf1.insert(0, aie::load_v<32>(lineIn0));
+    lineIn0 += VecFactor;
+    data_buf1.insert(1, aie::load_v<32>(lineIn0));
+    data_buf1 = ::aie::shuffle_up_fill(data_buf1, prev_buf1, kernel_side);
+    prev_buf1.insert(1, data_buf1.template extract<32>(0));
+    acc = mul_ops::mul(kernel_vec, 0, data_buf1, 0);
+
+    // second kernel row
+    data_buf2.insert(0, aie::load_v<32>(lineIn1));
+    lineIn1 += VecFactor;
+    data_buf2.insert(1, aie::load_v<32>(lineIn1));
+    data_buf2 = ::aie::shuffle_up_fill(data_buf2, prev_buf2, kernel_side);
+    prev_buf2.insert(1, data_buf2.template extract<32>(0));
+    acc = mul_ops::mac(acc, kernel_vec, Points, data_buf2, 0);
+
+    // third kernel row
+    data_buf3.insert(0, aie::load_v<32>(lineIn2));
+    lineIn2 += VecFactor;
+    data_buf3.insert(1, aie::load_v<32>(lineIn2));
+    data_buf3 = ::aie::shuffle_up_fill(data_buf3, prev_buf3, kernel_side);
+    prev_buf3.insert(1, data_buf3.template extract<32>(0));
+    acc = mul_ops::mac(acc, kernel_vec, 2 * Points, data_buf3, 0);
+
+    // Store result
+    ::aie::store_v(output, acc.to_vector<uint8>(SRS_SHIFT - 8));
+    output += VecFactor;
+  }
+
+  // right of line, border extension by mirroring
+  // first kernel row
+  data_buf1.insert(1, aie::load_v<32>(lineIn0));
+  data_buf1 = ::aie::shuffle_down_replicate(data_buf1, 32);
+  data_buf1 = ::aie::shuffle_up_fill(data_buf1, prev_buf1, kernel_side);
+  acc = mul_ops::mul(kernel_vec, 0, data_buf1, 0);
+
+  // second kernel row
+  data_buf2.insert(1, aie::load_v<32>(lineIn1));
+  data_buf2 = ::aie::shuffle_down_replicate(data_buf2, 32);
+  data_buf2 = ::aie::shuffle_up_fill(data_buf2, prev_buf2, kernel_side);
+  acc = mul_ops::mac(acc, kernel_vec, Points, data_buf2, 0);
+
+  // third kernel row
+  data_buf3.insert(1, aie::load_v<32>(lineIn2));
+  lineIn2 += VecFactor;
+  data_buf3 = ::aie::shuffle_down_replicate(data_buf3, 32);
+  data_buf3 = ::aie::shuffle_up_fill(data_buf3, prev_buf3, kernel_side);
+  acc = mul_ops::mac(acc, kernel_vec, 2 * Points, data_buf3, 0);
+
+  // Store result
+  ::aie::store_v(output, acc.to_vector<uint8>(SRS_SHIFT - 8));
+  output += VecFactor;
+}
+
+extern "C" {
+
+// #if BIT_WIDTH == 8
+
+void filter2dLine(uint8_t *lineIn0, uint8_t *lineIn1, uint8_t *lineIn2,
+                  uint8_t *out, int32_t lineWidth, int16_t *filterKernel) {
+  filter2d_3lines_aie(lineIn0, lineIn1, lineIn2, out, lineWidth, filterKernel);
+}
+
+/* #elif BIT_WIDTH == 16
+
+void filter2dLine(int16_t *in, int16_t *out, int32_t lineWidth, int16_t
+filter2dValue, int16_t maxValue) { filter2d_3lines_aie<int16_t, 32>(in, out,
+lineWidth, 1, filter2dValue, maxValue);
+}
+
+#else // 32
+
+void filter2dLine(int32_t *in, int32_t *out, int32_t lineWidth, int32_t
+filter2dValue, int32_t maxValue) { filter2d_3lines_aie<int32_t, 16>(in, out,
+lineWidth, 1, filter2dValue, maxValue);
+}
+
+#endif */
+
+} // extern "C"
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc
index d0c7a9e3b1..351b3331b6 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc
@@ -1,82 +1,82 @@
-//===- gray2rgba.cc -------------------------------------------*- C++ -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2022, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-#define NOCPP
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define REL_WRITE 0
-#define REL_READ 1
-
-#include <aie_api/aie.hpp>
-
-::aie::vector<uint8, 64> vector_broadcast(::aie::vector<uint8, 16> e) {
-  v64uint8 lli = e.template grow<64>();
-  lli = shuffle(lli, lli, T8_2x64_lo);
-  lli = shuffle(lli, lli, T8_2x64_lo);
-  return ::aie::vector<uint8, 64>(lli);
-}
-
-void gray2rgba_aie(uint8_t *y_in, uint8_t *rgba_out, const int32_t height,
-                   const int32_t width) {
-  // Initialize alpha vector
-  ::aie::vector<uint8, 64> alpha255 = ::aie::zeros<uint8, 64>();
-  for (int i = 0; i < 16; i++) {
-    alpha255[i * 4 + 3] = 255;
-  }
-
-  for (int i = 0; i < height; i++)
-    for (int j = 0; j < width; j += 16) {
-      ::aie::vector<uint8, 16> data_buf = ::aie::load_v<16>(y_in);
-      y_in += 16;
-
-      // vector shuffle
-      ::aie::vector<uint8, 64> out = vector_broadcast(data_buf);
-
-      // bitwise OR with alpha value
-      v64uint8 fout = bor(out, alpha255);
-
-      ::aie::store_v(rgba_out, ::aie::vector<uint8, 64>(fout));
-      rgba_out += 64;
-    }
-
-  return;
-  ;
-}
-
-void gray2rgba_aie_scalar(uint8_t *y_in, uint8_t *rgba_out,
-                          const int32_t height, const int32_t width) {
-  for (int i = 0; i < height; i++)
-    for (int j = 0; j < width; j++) {
-      uint8_t value = y_in[i * width + j];
-      rgba_out[i * width * 4 + j * 4] = value;
-      rgba_out[i * width * 4 + j * 4 + 1] = value;
-      rgba_out[i * width * 4 + j * 4 + 2] = value;
-      rgba_out[i * width * 4 + j * 4 + 3] = 255;
-    }
-
-  return;
-  ;
-}
-
-extern "C" {
-
-void gray2rgbaLine(uint8_t *in, uint8_t *out, int32_t lineWidth) {
-  gray2rgba_aie(in, out, 1, lineWidth);
-}
-
-void gray2rgbaTile(uint8_t *in, uint8_t *out, int32_t tileHeight,
-                   int32_t tileWidth) {
-  gray2rgba_aie(in, out, tileHeight, tileWidth);
-}
-
-} // extern "C"
+//===- gray2rgba.cc -------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#define NOCPP
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#include <aie_api/aie.hpp>
+
+::aie::vector<uint8, 64> vector_broadcast(::aie::vector<uint8, 16> e) {
+  v64uint8 lli = e.template grow<64>();
+  lli = shuffle(lli, lli, T8_2x64_lo);
+  lli = shuffle(lli, lli, T8_2x64_lo);
+  return ::aie::vector<uint8, 64>(lli);
+}
+
+void gray2rgba_aie(uint8_t *y_in, uint8_t *rgba_out, const int32_t height,
+                   const int32_t width) {
+  // Initialize alpha vector
+  ::aie::vector<uint8, 64> alpha255 = ::aie::zeros<uint8, 64>();
+  for (int i = 0; i < 16; i++) {
+    alpha255[i * 4 + 3] = 255;
+  }
+
+  for (int i = 0; i < height; i++)
+    for (int j = 0; j < width; j += 16) {
+      ::aie::vector<uint8, 16> data_buf = ::aie::load_v<16>(y_in);
+      y_in += 16;
+
+      // vector shuffle
+      ::aie::vector<uint8, 64> out = vector_broadcast(data_buf);
+
+      // bitwise OR with alpha value
+      v64uint8 fout = bor(out, alpha255);
+
+      ::aie::store_v(rgba_out, ::aie::vector<uint8, 64>(fout));
+      rgba_out += 64;
+    }
+
+  return;
+  ;
+}
+
+void gray2rgba_aie_scalar(uint8_t *y_in, uint8_t *rgba_out,
+                          const int32_t height, const int32_t width) {
+  for (int i = 0; i < height; i++)
+    for (int j = 0; j < width; j++) {
+      uint8_t value = y_in[i * width + j];
+      rgba_out[i * width * 4 + j * 4] = value;
+      rgba_out[i * width * 4 + j * 4 + 1] = value;
+      rgba_out[i * width * 4 + j * 4 + 2] = value;
+      rgba_out[i * width * 4 + j * 4 + 3] = 255;
+    }
+
+  return;
+  ;
+}
+
+extern "C" {
+
+void gray2rgbaLine(uint8_t *in, uint8_t *out, int32_t lineWidth) {
+  gray2rgba_aie(in, out, 1, lineWidth);
+}
+
+void gray2rgbaTile(uint8_t *in, uint8_t *out, int32_t tileHeight,
+                   int32_t tileWidth) {
+  gray2rgba_aie(in, out, tileHeight, tileWidth);
+}
+
+} // extern "C"
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc
index 0928af33f0..4725d09ba6 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc
@@ -1,87 +1,87 @@
-//===- passThrough.cc -------------------------------------------*- C++ -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2022, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-// #define __AIENGINE__ 1
-#define NOCPP
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define REL_WRITE 0
-#define REL_READ 1
-
-#include <aie_api/aie.hpp>
-
-template <typename T, int N>
-__attribute__((noinline)) void passThrough_aie(T *restrict in, T *restrict out,
-                                               const int32_t height,
-                                               const int32_t width) {
-  //::aie::vector<T, N> data_out;
-  //::aie::mask<N> temp_val;
-  v64uint8 *restrict outPtr = (v64uint8 *)out;
-  v64uint8 *restrict inPtr = (v64uint8 *)in;
-
-  for (int j = 0; j < (height * width); j += N) // Nx samples per loop
-    chess_prepare_for_pipelining chess_loop_range(6, ) {
-      //::aie::vector<T, N> tmpVector = ::aie::load_v(in);
-      //::aie::store_v(out, tmpVector);
-
-      *outPtr++ = *inPtr++;
-
-      // in += N;
-      // out += N;
-    }
-}
-
-extern "C" {
-
-#if BIT_WIDTH == 8
-
-void passThroughLine(uint8_t *in, uint8_t *out, int32_t lineWidth) {
-  printf("passThroughLine BIT_WIDTH\n");
-  passThrough_aie<uint8_t, 64>(in, out, 1, lineWidth);
-}
-
-void passThroughTile(uint8_t *in, uint8_t *out, int32_t tileHeight,
-                     int32_t tileWidth) {
-  printf("passThroughTile BIT_WIDTH\n");
-  passThrough_aie<uint8_t, 64>(in, out, tileHeight, tileWidth);
-}
-
-#elif BIT_WIDTH == 16
-
-void passThroughLine(int16_t *in, int16_t *out, int32_t lineWidth) {
-  printf("passThroughLine BIT_WIDTH\n");
-  passThrough_aie<int16_t, 32>(in, out, 1, lineWidth);
-}
-
-void passThroughTile(int16_t *in, int16_t *out, int32_t tileHeight,
-                     int32_t tileWidth) {
-  printf("passThroughTile BIT_WIDTH\n");
-  passThrough_aie<int16_t, 32>(in, out, tileHeight, tileWidth);
-}
-
-#else // 32
-
-void passThroughLine(int32_t *in, int32_t *out, int32_t lineWidth) {
-  printf("passThroughLine BIT_WIDTH\n");
-  passThrough_aie<int32_t, 16>(in, out, 1, lineWidth);
-}
-
-void passThroughTile(int32_t *in, int32_t *out, int32_t tileHeight,
-                     int32_t tileWidth) {
-  printf("passThroughTile BIT_WIDTH\n");
-  passThrough_aie<int32_t, 16>(in, out, tileHeight, tileWidth);
-}
-
-#endif
-
-} // extern "C"
+//===- passThrough.cc -------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// #define __AIENGINE__ 1
+#define NOCPP
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#include <aie_api/aie.hpp>
+
+template <typename T, int N>
+__attribute__((noinline)) void passThrough_aie(T *restrict in, T *restrict out,
+                                               const int32_t height,
+                                               const int32_t width) {
+  //::aie::vector<T, N> data_out;
+  //::aie::mask<N> temp_val;
+  v64uint8 *restrict outPtr = (v64uint8 *)out;
+  v64uint8 *restrict inPtr = (v64uint8 *)in;
+
+  for (int j = 0; j < (height * width); j += N) // Nx samples per loop
+    chess_prepare_for_pipelining chess_loop_range(6, ) {
+      //::aie::vector<T, N> tmpVector = ::aie::load_v(in);
+      //::aie::store_v(out, tmpVector);
+
+      *outPtr++ = *inPtr++;
+
+      // in += N;
+      // out += N;
+    }
+}
+
+extern "C" {
+
+#if BIT_WIDTH == 8
+
+void passThroughLine(uint8_t *in, uint8_t *out, int32_t lineWidth) {
+  printf("passThroughLine BIT_WIDTH\n");
+  passThrough_aie<uint8_t, 64>(in, out, 1, lineWidth);
+}
+
+void passThroughTile(uint8_t *in, uint8_t *out, int32_t tileHeight,
+                     int32_t tileWidth) {
+  printf("passThroughTile BIT_WIDTH\n");
+  passThrough_aie<uint8_t, 64>(in, out, tileHeight, tileWidth);
+}
+
+#elif BIT_WIDTH == 16
+
+void passThroughLine(int16_t *in, int16_t *out, int32_t lineWidth) {
+  printf("passThroughLine BIT_WIDTH\n");
+  passThrough_aie<int16_t, 32>(in, out, 1, lineWidth);
+}
+
+void passThroughTile(int16_t *in, int16_t *out, int32_t tileHeight,
+                     int32_t tileWidth) {
+  printf("passThroughTile BIT_WIDTH\n");
+  passThrough_aie<int16_t, 32>(in, out, tileHeight, tileWidth);
+}
+
+#else // 32
+
+void passThroughLine(int32_t *in, int32_t *out, int32_t lineWidth) {
+  printf("passThroughLine BIT_WIDTH\n");
+  passThrough_aie<int32_t, 16>(in, out, 1, lineWidth);
+}
+
+void passThroughTile(int32_t *in, int32_t *out, int32_t tileHeight,
+                     int32_t tileWidth) {
+  printf("passThroughTile BIT_WIDTH\n");
+  passThrough_aie<int32_t, 16>(in, out, tileHeight, tileWidth);
+}
+
+#endif
+
+} // extern "C"
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc
index 175e4f4a33..988c4a0e4c 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc
@@ -1,107 +1,107 @@
-//===- rgba2gray.cc -------------------------------------------*- C++ -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2022, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-#define NOCPP
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define REL_WRITE 0
-#define REL_READ 1
-
-#include <aie_api/aie.hpp>
-
-const int32_t SRS_SHIFT = 15;
-__attribute__((inline)) void xf_extract_rgb(uint8_t *ptr_rgba,
-                                            ::aie::vector<uint8_t, 32> &r,
-                                            ::aie::vector<uint8_t, 32> &g,
-                                            ::aie::vector<uint8_t, 32> &b) {
-  ::aie::vector<uint8_t, 32> rgba_channel0, rgba_channel1, rgba_channel3,
-      rgba_channel2;
-  rgba_channel0 = ::aie::load_v<32>(ptr_rgba);
-  ptr_rgba += 32;
-  rgba_channel1 = ::aie::load_v<32>(ptr_rgba);
-  ptr_rgba += 32;
-  rgba_channel2 = ::aie::load_v<32>(ptr_rgba);
-  ptr_rgba += 32;
-  rgba_channel3 = ::aie::load_v<32>(ptr_rgba);
-  ptr_rgba += 32;
-
-  // Unzip the interleaved channels
-  auto [rg_temp, ba_temp] =
-      ::aie::interleave_unzip(::aie::concat(rgba_channel0, rgba_channel1),
-                              ::aie::concat(rgba_channel2, rgba_channel3), 2);
-  r = ::aie::filter_even(rg_temp, 1);
-  g = ::aie::filter_odd(rg_temp, 1);
-  b = ::aie::filter_even(ba_temp, 1);
-}
-
-__attribute__((noinline)) void rgba2gray_aie(uint8_t *rgba_in, uint8_t *y_out,
-                                             const int32_t height,
-                                             const int32_t width) {
-  //::aie::vector<int16_t, 16> WT(66, 129, 25, 128); //Y=0.299*R + 0.587*G +
-  //: 0.114*B (BT.470) :aie::vector<int16_t, 16> WT(25, 129, 66, 128);
-  //://Y=0.299*R + 0.587*G + 0.114*B (BT.470)
-  ::aie::vector<int16_t, 16> WT(
-      (int16_t)round(0.299 * (1 << SRS_SHIFT)),
-      (int16_t)round(0.587 * (1 << SRS_SHIFT)),
-      (int16_t)round(0.114 * (1 << SRS_SHIFT)),
-      (1 << (SRS_SHIFT - 1))); // Y=0.299*R + 0.587*G + 0.114*B (BT.470)
-  ::aie::vector<uint8_t, 32> c1 = ::aie::broadcast<uint8_t, 32>(1);
-  ::aie::vector<uint8_t, 32> r, g, b;
-  ::aie::vector<uint8_t, 32> y;
-
-  for (int j = 0; (j < (width * height) / 32); j += 1)
-    chess_prepare_for_pipelining {
-      xf_extract_rgb(rgba_in, r, g, b);
-
-      ::aie::accum<acc32, 32> acc;
-      acc = ::aie::accumulate<32>(WT, 0, r, g, b, c1);
-      y = acc.template to_vector<uint8_t>(SRS_SHIFT);
-
-      ::aie::store_v(y_out, y);
-      rgba_in += 128;
-      y_out += 32;
-    }
-}
-
-void rgba2gray_aie_scalar(uint8_t *rgba_in, uint8_t *y_out,
-                          const int32_t height, const int32_t width) {
-  /// Y=0.299*R + 0.587*G + 0.114*B (BT.470)
-  const int colorMatrix[4] = {(int)round(0.299 * 65536),
-                              (int)round(0.587 * 65536),
-                              (int)round(0.114 * 65536), (65536 / 2)};
-  for (int i = 0; i < height; i++)
-    for (int j = 0; j < width; j++) {
-      int r = (int)rgba_in[i * width * 4 + j * 4];
-      int g = (int)rgba_in[i * width * 4 + j * 4 + 1];
-      int b = (int)rgba_in[i * width * 4 + j * 4 + 2];
-      int tmpSum = (colorMatrix[0] * r + colorMatrix[1] * g +
-                    colorMatrix[2] * b + colorMatrix[3]) >>
-                   16;
-      y_out[i * width + j] = (uint8_t)tmpSum;
-    }
-
-  return;
-}
-
-extern "C" {
-
-void rgba2grayLine(uint8_t *in, uint8_t *out, int32_t lineWidth) {
-  rgba2gray_aie(in, out, 1, lineWidth);
-}
-
-void rgba2grayTile(uint8_t *in, uint8_t *out, int32_t tileHeight,
-                   int32_t tileWidth) {
-  rgba2gray_aie(in, out, tileHeight, tileWidth);
-}
-
-} // extern "C"
+//===- rgba2gray.cc -------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#define NOCPP
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+#include <aie_api/aie.hpp>
+
+const int32_t SRS_SHIFT = 15;
+__attribute__((inline)) void xf_extract_rgb(uint8_t *ptr_rgba,
+                                            ::aie::vector<uint8_t, 32> &r,
+                                            ::aie::vector<uint8_t, 32> &g,
+                                            ::aie::vector<uint8_t, 32> &b) {
+  ::aie::vector<uint8_t, 32> rgba_channel0, rgba_channel1, rgba_channel3,
+      rgba_channel2;
+  rgba_channel0 = ::aie::load_v<32>(ptr_rgba);
+  ptr_rgba += 32;
+  rgba_channel1 = ::aie::load_v<32>(ptr_rgba);
+  ptr_rgba += 32;
+  rgba_channel2 = ::aie::load_v<32>(ptr_rgba);
+  ptr_rgba += 32;
+  rgba_channel3 = ::aie::load_v<32>(ptr_rgba);
+  ptr_rgba += 32;
+
+  // Unzip the interleaved channels
+  auto [rg_temp, ba_temp] =
+      ::aie::interleave_unzip(::aie::concat(rgba_channel0, rgba_channel1),
+                              ::aie::concat(rgba_channel2, rgba_channel3), 2);
+  r = ::aie::filter_even(rg_temp, 1);
+  g = ::aie::filter_odd(rg_temp, 1);
+  b = ::aie::filter_even(ba_temp, 1);
+}
+
+__attribute__((noinline)) void rgba2gray_aie(uint8_t *rgba_in, uint8_t *y_out,
+                                             const int32_t height,
+                                             const int32_t width) {
+  //::aie::vector<int16_t, 16> WT(66, 129, 25, 128); //Y=0.299*R + 0.587*G +
+  //: 0.114*B (BT.470) :aie::vector<int16_t, 16> WT(25, 129, 66, 128);
+  //://Y=0.299*R + 0.587*G + 0.114*B (BT.470)
+  ::aie::vector<int16_t, 16> WT(
+      (int16_t)round(0.299 * (1 << SRS_SHIFT)),
+      (int16_t)round(0.587 * (1 << SRS_SHIFT)),
+      (int16_t)round(0.114 * (1 << SRS_SHIFT)),
+      (1 << (SRS_SHIFT - 1))); // Y=0.299*R + 0.587*G + 0.114*B (BT.470)
+  ::aie::vector<uint8_t, 32> c1 = ::aie::broadcast<uint8_t, 32>(1);
+  ::aie::vector<uint8_t, 32> r, g, b;
+  ::aie::vector<uint8_t, 32> y;
+
+  for (int j = 0; (j < (width * height) / 32); j += 1)
+    chess_prepare_for_pipelining {
+      xf_extract_rgb(rgba_in, r, g, b);
+
+      ::aie::accum<acc32, 32> acc;
+      acc = ::aie::accumulate<32>(WT, 0, r, g, b, c1);
+      y = acc.template to_vector<uint8_t>(SRS_SHIFT);
+
+      ::aie::store_v(y_out, y);
+      rgba_in += 128;
+      y_out += 32;
+    }
+}
+
+void rgba2gray_aie_scalar(uint8_t *rgba_in, uint8_t *y_out,
+                          const int32_t height, const int32_t width) {
+  /// Y=0.299*R + 0.587*G + 0.114*B (BT.470)
+  const int colorMatrix[4] = {(int)round(0.299 * 65536),
+                              (int)round(0.587 * 65536),
+                              (int)round(0.114 * 65536), (65536 / 2)};
+  for (int i = 0; i < height; i++)
+    for (int j = 0; j < width; j++) {
+      int r = (int)rgba_in[i * width * 4 + j * 4];
+      int g = (int)rgba_in[i * width * 4 + j * 4 + 1];
+      int b = (int)rgba_in[i * width * 4 + j * 4 + 2];
+      int tmpSum = (colorMatrix[0] * r + colorMatrix[1] * g +
+                    colorMatrix[2] * b + colorMatrix[3]) >>
+                   16;
+      y_out[i * width + j] = (uint8_t)tmpSum;
+    }
+
+  return;
+}
+
+extern "C" {
+
+void rgba2grayLine(uint8_t *in, uint8_t *out, int32_t lineWidth) {
+  rgba2gray_aie(in, out, 1, lineWidth);
+}
+
+void rgba2grayTile(uint8_t *in, uint8_t *out, int32_t tileHeight,
+                   int32_t tileWidth) {
+  rgba2gray_aie(in, out, tileHeight, tileWidth);
+}
+
+} // extern "C"
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc
index c49cd97655..3f20bf514d 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc
@@ -1,181 +1,181 @@
-//===- rgba2hue.cc ----------------------------------------------*- C++ -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2023, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-#define NOCPP
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define REL_WRITE 0
-#define REL_READ 1
-
-// clang-format off
-#include <aie_api/aie.hpp>
-#include "lut_inv_8b.h"
-// clang-format on
-
-const int32_t SRS_SHIFT = 12;
-
-__attribute__((inline)) void xf_extract_rgb(uint8_t *ptr_rgba,
-                                            ::aie::vector<uint8_t, 32> &r,
-                                            ::aie::vector<uint8_t, 32> &g,
-                                            ::aie::vector<uint8_t, 32> &b) {
-  ::aie::vector<uint8_t, 32> rgba_channel0, rgba_channel1, rgba_channel3,
-      rgba_channel2;
-  rgba_channel0 = ::aie::load_v<32>(ptr_rgba);
-  ptr_rgba += 32;
-  rgba_channel1 = ::aie::load_v<32>(ptr_rgba);
-  ptr_rgba += 32;
-  rgba_channel2 = ::aie::load_v<32>(ptr_rgba);
-  ptr_rgba += 32;
-  rgba_channel3 = ::aie::load_v<32>(ptr_rgba);
-  ptr_rgba += 32;
-
-  // Unzip the interleaved channels
-  auto [rg_temp, ba_temp] =
-      ::aie::interleave_unzip(::aie::concat(rgba_channel0, rgba_channel1),
-                              ::aie::concat(rgba_channel2, rgba_channel3), 2);
-  r = ::aie::filter_even(rg_temp, 1);
-  g = ::aie::filter_odd(rg_temp, 1);
-  b = ::aie::filter_even(ba_temp, 1);
-}
-
-__attribute__((inline)) void
-comp_divisor_16b(::aie::vector<uint8_t, 32> divisor,
-                 ::aie::vector<uint16_t, 32> &divisor_select) {
-  const int step = 0;
-  using lut_type_uint16 = aie::lut<4, uint16, uint16>;
-  lut_type_uint16 inv_lut_16b(num_entries_lut_inv_16b, lut_inv_16b_ab,
-                              lut_inv_16b_cd);
-  aie::parallel_lookup<uint8, lut_type_uint16, aie::lut_oor_policy::truncate>
-      lookup_inv_16b(inv_lut_16b, step);
-
-  aie::vector<uint8, 16> input1, input2;
-  aie::vector<uint16, 16> res1, res2;
-  input1 = divisor.extract<16>(0);
-  input2 = divisor.extract<16>(1);
-  res1 = lookup_inv_16b.fetch(input1.cast_to<uint8>());
-  res2 = lookup_inv_16b.fetch(input2.cast_to<uint8>());
-  divisor_select = aie::concat(res1, res2);
-}
-
-__attribute__((noinline)) void rgba2hue_aie(uint8_t *rgba_in, uint8_t *hue_out,
-                                            const int32_t height,
-                                            const int32_t width) {
-  ::aie::vector<uint8_t, 32> r, g, b;
-  ::aie::vector<uint8_t, 32> hue;
-
-  ::aie::vector<uint8_t, 32> rgbMin, rgbMax;
-
-  ::aie::vector<uint8_t, 32> zero32 = aie::zeros<uint8_t, 32>();
-
-  ::aie::vector<int16_t, 32> eightFive = aie::zeros<int16_t, 32>();
-  eightFive[0] = 85;
-  eightFive[1] = -85;
-  ::aie::vector<int16_t, 32> one = aie::broadcast<int16_t, 32>(1);
-  ::aie::vector<int16_t, 32> twoEightFive =
-      aie::broadcast<int16_t, 32>(171); // 170 + 1
-  ::aie::vector<int16_t, 32> fourEightFive =
-      aie::broadcast<int16_t, 32>(341); // 340 + 1
-
-  for (int j = 0; (j < (width * height) / 32); j += 1)
-    chess_prepare_for_pipelining {
-      xf_extract_rgb(rgba_in, r, g, b);
-
-      // Get rgbMin and rgbMax
-      rgbMin = ::aie::min(::aie::min(r, g), b);
-      rgbMax = ::aie::max(::aie::max(r, g), b);
-
-      // Get divisor and select the fixed point divisor to multiply by
-      auto divisor = ::aie::sub(rgbMax, rgbMin);
-      ::aie::vector<uint16, 32> divisor_sel;
-      comp_divisor_16b(divisor, divisor_sel);
-
-      // Initialize accum with value since 340 is larger than uint8
-      aie::accum<acc32, 32> hr_partial(one, 9);
-      aie::accum<acc32, 32> hg_partial(twoEightFive, 9);
-      aie::accum<acc32, 32> hb_partial(fourEightFive, 9);
-
-      // Performa uin8*int16 vector multiply
-      hr_partial = aie::mac(hr_partial, g, divisor_sel);
-      hg_partial = aie::mac(hg_partial, b, divisor_sel);
-      hb_partial = aie::mac(hb_partial, r, divisor_sel);
-
-      hr_partial = aie::msc(hr_partial, b, divisor_sel);
-      hg_partial = aie::msc(hg_partial, r, divisor_sel);
-      hb_partial = aie::msc(hb_partial, g, divisor_sel);
-
-      auto hr = hr_partial.to_vector<uint8>(10); // Q7.9 shift + 1 (div 2)
-      auto hg = hg_partial.to_vector<uint8>(10); // Q7.9 shift + 1 (div 2)
-      auto hb = hb_partial.to_vector<uint8>(10); // Q7.9 shift + 1 (div 2)
-
-      aie::mask<32> sel1 = aie::eq(rgbMax, r);
-      auto tmp1 = aie::select(hb, hr, sel1);
-      aie::mask<32> sel2 = aie::eq(rgbMax, g);
-      auto tmp2 = aie::select(tmp1, hg, sel2);
-      aie::mask<32> sel3 = aie::eq(divisor, zero32);
-      hue = aie::select(tmp2, zero32, sel3);
-
-      ::aie::store_v(hue_out, hue);
-      rgba_in += 128;
-      hue_out += 32;
-    }
-}
-
-void rgba2hue_aie_scalar(uint8_t *rgba_in, uint8_t *hue_out,
-                         const int32_t height, const int32_t width) {
-  for (int i = 0; i < height; i++)
-    for (int j = 0; j < width; j++) {
-      int r = (int)rgba_in[i * (width * 4) + (j * 4)];
-      int g = (int)rgba_in[i * (width * 4) + (j * 4) + 1];
-      int b = (int)rgba_in[i * (width * 4) + (j * 4) + 2];
-      int h;
-      uint8_t rgbMin, rgbMax;
-
-      rgbMin = r < g ? (r < b ? r : b) : (g < b ? g : b);
-      rgbMax = r > g ? (r > b ? r : b) : (g > b ? g : b);
-
-      if (rgbMax == 0 || rgbMax == rgbMin)
-        h = 0;
-      else if (rgbMax == r)
-        h = 0 +
-            85 * (g - b) /
-                (rgbMax - rgbMin); // h = 0 + 42.5*(g - b) / (rgbMax - rgbMin);
-      else if (rgbMax == g)
-        h = 85 * 2 +
-            85 * (b - r) /
-                (rgbMax - rgbMin); // h = 85 + 42.5*(b - r) / (rgbMax - rgbMin);
-      else
-        h = 170 * 2 +
-            85 * (r - g) /
-                (rgbMax -
-                 rgbMin); // h = 170 + 42.5*(r - g) / (rgbMax - rgbMin);
-
-      h = (h + 1) >> 1;
-      hue_out[i * width + j] = (uint8_t)h;
-    }
-
-  return;
-}
-
-extern "C" {
-
-void rgba2hueLine(uint8_t *in, uint8_t *out, int32_t lineWidth) {
-  // rgba2hue_aie_scalar(in, out, 1, lineWidth);
-  rgba2hue_aie(in, out, 1, lineWidth);
-}
-
-void rgba2hueTile(uint8_t *in, uint8_t *out, int32_t tileHeight,
-                  int32_t tileWidth) {
-  rgba2hue_aie_scalar(in, out, tileHeight, tileWidth);
-}
-
-} // extern "C"
+//===- rgba2hue.cc ----------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#define NOCPP
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+// clang-format off
+#include <aie_api/aie.hpp>
+#include "lut_inv_8b.h"
+// clang-format on
+
+const int32_t SRS_SHIFT = 12;
+
+__attribute__((inline)) void xf_extract_rgb(uint8_t *ptr_rgba,
+                                            ::aie::vector<uint8_t, 32> &r,
+                                            ::aie::vector<uint8_t, 32> &g,
+                                            ::aie::vector<uint8_t, 32> &b) {
+  ::aie::vector<uint8_t, 32> rgba_channel0, rgba_channel1, rgba_channel3,
+      rgba_channel2;
+  rgba_channel0 = ::aie::load_v<32>(ptr_rgba);
+  ptr_rgba += 32;
+  rgba_channel1 = ::aie::load_v<32>(ptr_rgba);
+  ptr_rgba += 32;
+  rgba_channel2 = ::aie::load_v<32>(ptr_rgba);
+  ptr_rgba += 32;
+  rgba_channel3 = ::aie::load_v<32>(ptr_rgba);
+  ptr_rgba += 32;
+
+  // Unzip the interleaved channels
+  auto [rg_temp, ba_temp] =
+      ::aie::interleave_unzip(::aie::concat(rgba_channel0, rgba_channel1),
+                              ::aie::concat(rgba_channel2, rgba_channel3), 2);
+  r = ::aie::filter_even(rg_temp, 1);
+  g = ::aie::filter_odd(rg_temp, 1);
+  b = ::aie::filter_even(ba_temp, 1);
+}
+
+__attribute__((inline)) void
+comp_divisor_16b(::aie::vector<uint8_t, 32> divisor,
+                 ::aie::vector<uint16_t, 32> &divisor_select) {
+  const int step = 0;
+  using lut_type_uint16 = aie::lut<4, uint16, uint16>;
+  lut_type_uint16 inv_lut_16b(num_entries_lut_inv_16b, lut_inv_16b_ab,
+                              lut_inv_16b_cd);
+  aie::parallel_lookup<uint8, lut_type_uint16, aie::lut_oor_policy::truncate>
+      lookup_inv_16b(inv_lut_16b, step);
+
+  aie::vector<uint8, 16> input1, input2;
+  aie::vector<uint16, 16> res1, res2;
+  input1 = divisor.extract<16>(0);
+  input2 = divisor.extract<16>(1);
+  res1 = lookup_inv_16b.fetch(input1.cast_to<uint8>());
+  res2 = lookup_inv_16b.fetch(input2.cast_to<uint8>());
+  divisor_select = aie::concat(res1, res2);
+}
+
+__attribute__((noinline)) void rgba2hue_aie(uint8_t *rgba_in, uint8_t *hue_out,
+                                            const int32_t height,
+                                            const int32_t width) {
+  ::aie::vector<uint8_t, 32> r, g, b;
+  ::aie::vector<uint8_t, 32> hue;
+
+  ::aie::vector<uint8_t, 32> rgbMin, rgbMax;
+
+  ::aie::vector<uint8_t, 32> zero32 = aie::zeros<uint8_t, 32>();
+
+  ::aie::vector<int16_t, 32> eightFive = aie::zeros<int16_t, 32>();
+  eightFive[0] = 85;
+  eightFive[1] = -85;
+  ::aie::vector<int16_t, 32> one = aie::broadcast<int16_t, 32>(1);
+  ::aie::vector<int16_t, 32> twoEightFive =
+      aie::broadcast<int16_t, 32>(171); // 170 + 1
+  ::aie::vector<int16_t, 32> fourEightFive =
+      aie::broadcast<int16_t, 32>(341); // 340 + 1
+
+  for (int j = 0; (j < (width * height) / 32); j += 1)
+    chess_prepare_for_pipelining {
+      xf_extract_rgb(rgba_in, r, g, b);
+
+      // Get rgbMin and rgbMax
+      rgbMin = ::aie::min(::aie::min(r, g), b);
+      rgbMax = ::aie::max(::aie::max(r, g), b);
+
+      // Get divisor and select the fixed point divisor to multiply by
+      auto divisor = ::aie::sub(rgbMax, rgbMin);
+      ::aie::vector<uint16, 32> divisor_sel;
+      comp_divisor_16b(divisor, divisor_sel);
+
+      // Initialize accum with value since 340 is larger than uint8
+      aie::accum<acc32, 32> hr_partial(one, 9);
+      aie::accum<acc32, 32> hg_partial(twoEightFive, 9);
+      aie::accum<acc32, 32> hb_partial(fourEightFive, 9);
+
+      // Performa uin8*int16 vector multiply
+      hr_partial = aie::mac(hr_partial, g, divisor_sel);
+      hg_partial = aie::mac(hg_partial, b, divisor_sel);
+      hb_partial = aie::mac(hb_partial, r, divisor_sel);
+
+      hr_partial = aie::msc(hr_partial, b, divisor_sel);
+      hg_partial = aie::msc(hg_partial, r, divisor_sel);
+      hb_partial = aie::msc(hb_partial, g, divisor_sel);
+
+      auto hr = hr_partial.to_vector<uint8>(10); // Q7.9 shift + 1 (div 2)
+      auto hg = hg_partial.to_vector<uint8>(10); // Q7.9 shift + 1 (div 2)
+      auto hb = hb_partial.to_vector<uint8>(10); // Q7.9 shift + 1 (div 2)
+
+      aie::mask<32> sel1 = aie::eq(rgbMax, r);
+      auto tmp1 = aie::select(hb, hr, sel1);
+      aie::mask<32> sel2 = aie::eq(rgbMax, g);
+      auto tmp2 = aie::select(tmp1, hg, sel2);
+      aie::mask<32> sel3 = aie::eq(divisor, zero32);
+      hue = aie::select(tmp2, zero32, sel3);
+
+      ::aie::store_v(hue_out, hue);
+      rgba_in += 128;
+      hue_out += 32;
+    }
+}
+
+void rgba2hue_aie_scalar(uint8_t *rgba_in, uint8_t *hue_out,
+                         const int32_t height, const int32_t width) {
+  for (int i = 0; i < height; i++)
+    for (int j = 0; j < width; j++) {
+      int r = (int)rgba_in[i * (width * 4) + (j * 4)];
+      int g = (int)rgba_in[i * (width * 4) + (j * 4) + 1];
+      int b = (int)rgba_in[i * (width * 4) + (j * 4) + 2];
+      int h;
+      uint8_t rgbMin, rgbMax;
+
+      rgbMin = r < g ? (r < b ? r : b) : (g < b ? g : b);
+      rgbMax = r > g ? (r > b ? r : b) : (g > b ? g : b);
+
+      if (rgbMax == 0 || rgbMax == rgbMin)
+        h = 0;
+      else if (rgbMax == r)
+        h = 0 +
+            85 * (g - b) /
+                (rgbMax - rgbMin); // h = 0 + 42.5*(g - b) / (rgbMax - rgbMin);
+      else if (rgbMax == g)
+        h = 85 * 2 +
+            85 * (b - r) /
+                (rgbMax - rgbMin); // h = 85 + 42.5*(b - r) / (rgbMax - rgbMin);
+      else
+        h = 170 * 2 +
+            85 * (r - g) /
+                (rgbMax -
+                 rgbMin); // h = 170 + 42.5*(r - g) / (rgbMax - rgbMin);
+
+      h = (h + 1) >> 1;
+      hue_out[i * width + j] = (uint8_t)h;
+    }
+
+  return;
+}
+
+extern "C" {
+
+void rgba2hueLine(uint8_t *in, uint8_t *out, int32_t lineWidth) {
+  // rgba2hue_aie_scalar(in, out, 1, lineWidth);
+  rgba2hue_aie(in, out, 1, lineWidth);
+}
+
+void rgba2hueTile(uint8_t *in, uint8_t *out, int32_t tileHeight,
+                  int32_t tileWidth) {
+  rgba2hue_aie_scalar(in, out, tileHeight, tileWidth);
+}
+
+} // extern "C"
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/threshold.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/threshold.cc
index 2e42d85829..e1ffb38479 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/threshold.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/threshold.cc
@@ -1,313 +1,313 @@
-//===- threshold.cc ----------------------------------------------*- C++
-//-*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2022, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-// #define __AIENGINE__ 1
-#define NOCPP
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-
-#define REL_WRITE 0
-#define REL_READ 1
-
-enum _threshold_type {
-  XF_THRESHOLD_TYPE_BINARY = 0,
-  XF_THRESHOLD_TYPE_BINARY_INV = 1,
-  XF_THRESHOLD_TYPE_TRUNC = 2,
-  XF_THRESHOLD_TYPE_TOZERO = 3,
-  XF_THRESHOLD_TYPE_TOZERO_INV = 4,
-};
-
-// #define THRESH_TYPE XF_THRESHOLD_TYPE_BINARY
-
-#include <aie_api/aie.hpp>
-
-template <typename T, int N>
-__attribute__((noinline)) void
-threshold_aie(T *img_in, T *img_out, const int32_t img_width,
-              const int32_t img_height, const T &thresh_val, const T &max_val,
-              const uint8_t thresholdType) {
-  ::aie::vector<T, N> constants;
-  ::aie::vector<T, N> data_out;
-  ::aie::mask<N> temp_val;
-  constants[0] = 0;          // updating constant zero_val value
-  constants[1] = thresh_val; // updating constant threshold value
-  constants[2] = max_val;    // updating constant max_val value
-
-  switch (thresholdType) {
-  case XF_THRESHOLD_TYPE_TRUNC:
-    for (int j = 0; j < (img_height * img_width);
-         j += N) // 16x samples per loop
-      chess_prepare_for_pipelining chess_loop_range(14, ) {
-        ::aie::vector<T, N> data_buf1 =
-            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
-        img_in += N;
-        data_out = ::aie::min(constants[1], data_buf1);
-        ::aie::store_v(img_out, data_out);
-        img_out += N;
-      }
-    break;
-  case XF_THRESHOLD_TYPE_BINARY:
-    for (int j = 0; j < (img_height * img_width);
-         j += N) // 16x samples per loop
-      chess_prepare_for_pipelining chess_loop_range(14, ) {
-        ::aie::vector<T, N> data_buf1 =
-            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
-        img_in += N;
-        temp_val = ::aie::lt(constants[1], data_buf1);
-        data_out = ::aie::select(constants[0], constants[2], temp_val);
-        ::aie::store_v(img_out, data_out);
-        img_out += N;
-      }
-    break;
-  case XF_THRESHOLD_TYPE_BINARY_INV:
-    for (int j = 0; j < (img_height * img_width);
-         j += N) // 16x samples per loop
-      chess_prepare_for_pipelining chess_loop_range(14, ) {
-        ::aie::vector<T, N> data_buf1 =
-            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
-        img_in += N;
-        temp_val = ::aie::lt(constants[1], data_buf1);
-        data_out = ::aie::select(constants[2], constants[0], temp_val);
-        ::aie::store_v(img_out, data_out);
-        img_out += N;
-      }
-    break;
-  case XF_THRESHOLD_TYPE_TOZERO:
-    for (int j = 0; j < (img_height * img_width);
-         j += N) // 16x samples per loop
-      chess_prepare_for_pipelining chess_loop_range(14, ) {
-        ::aie::vector<T, N> data_buf1 =
-            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
-        img_in += N;
-        temp_val = ::aie::lt(constants[1], data_buf1);
-        data_out = ::aie::select(constants[0], data_buf1, temp_val);
-        ::aie::store_v(img_out, data_out);
-        img_out += N;
-      }
-    break;
-  case XF_THRESHOLD_TYPE_TOZERO_INV:
-    for (int j = 0; j < (img_height * img_width);
-         j += N) // 16x samples per loop
-      chess_prepare_for_pipelining chess_loop_range(14, ) {
-        ::aie::vector<T, N> data_buf1 =
-            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
-        img_in += N;
-        temp_val = ::aie::lt(constants[1], data_buf1);
-        data_out = ::aie::select(data_buf1, constants[0], temp_val);
-        ::aie::store_v(img_out, data_out);
-        img_out += N;
-      }
-    break;
-  default:
-    for (int j = 0; j < (img_height * img_width);
-         j += N) // 16x samples per loop
-      chess_prepare_for_pipelining chess_loop_range(14, ) {
-        ::aie::vector<T, N> data_buf1 =
-            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
-        img_in += N;
-        data_out = ::aie::min(constants[1], data_buf1);
-        ::aie::store_v(img_out, data_out);
-        img_out += N;
-      }
-  }
-}
-
-template <typename T, int N>
-__attribute__((noinline)) void threshold4Ch_aie(
-    T *img_in, T *img_out, const int32_t img_width, const int32_t img_height,
-    const T &thresh_val1, const T &thresh_val2, const T &thresh_val3,
-    const T &thresh_val4, const T &max_val1, const T &max_val2,
-    const T &max_val3, const T &max_val4, const uint8_t thresholdType) {
-  ::aie::vector<T, N> constants;
-  ::aie::vector<T, N> data_out;
-  ::aie::mask<N> temp_val;
-  // constants[0] = 0;          // updating constant zero_val value
-  // constants[1] = thresh_val; // updating constant threshold value
-  // constants[2] = max_val;    // updating constant max_val value
-
-  ::aie::vector<T, N> mask_zeros = ::aie::zeros<T, N>();
-  ::aie::vector<T, N> mask_thresh;
-  ::aie::vector<T, N> mask_max;
-  for (int i = 0; i < N / 4; i++) {
-    mask_thresh[i * 4] = thresh_val1;
-    mask_thresh[i * 4 + 1] = thresh_val2;
-    mask_thresh[i * 4 + 2] = thresh_val3;
-    mask_thresh[i * 4 + 3] = thresh_val4;
-    mask_max[i * 4] = max_val1;
-    mask_max[i * 4 + 1] = max_val2;
-    mask_max[i * 4 + 2] = max_val3;
-    mask_max[i * 4 + 3] = max_val4;
-  }
-
-  switch (thresholdType) {
-  case XF_THRESHOLD_TYPE_TRUNC:
-    for (int j = 0; j < (img_height * img_width);
-         j += N) // 16x samples per loop
-      chess_prepare_for_pipelining chess_loop_range(14, ) {
-        ::aie::vector<T, N> data_buf1 =
-            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
-        img_in += N;
-        data_out = ::aie::min(mask_thresh, data_buf1);
-        ::aie::store_v(img_out, data_out);
-        img_out += N;
-      }
-    break;
-  case XF_THRESHOLD_TYPE_BINARY:
-    for (int j = 0; j < (img_height * img_width);
-         j += N) // 16x samples per loop
-      chess_prepare_for_pipelining chess_loop_range(14, ) {
-        ::aie::vector<T, N> data_buf1 =
-            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
-        img_in += N;
-        temp_val = ::aie::lt(mask_thresh, data_buf1);
-        data_out = ::aie::select(mask_zeros, mask_max, temp_val);
-        ::aie::store_v(img_out, data_out);
-        img_out += N;
-      }
-    break;
-  case XF_THRESHOLD_TYPE_BINARY_INV:
-    for (int j = 0; j < (img_height * img_width);
-         j += N) // 16x samples per loop
-      chess_prepare_for_pipelining chess_loop_range(14, ) {
-        ::aie::vector<T, N> data_buf1 =
-            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
-        img_in += N;
-        temp_val = ::aie::lt(mask_thresh, data_buf1);
-        data_out = ::aie::select(mask_max, mask_zeros, temp_val);
-        ::aie::store_v(img_out, data_out);
-        img_out += N;
-      }
-    break;
-  case XF_THRESHOLD_TYPE_TOZERO:
-    for (int j = 0; j < (img_height * img_width);
-         j += N) // 16x samples per loop
-      chess_prepare_for_pipelining chess_loop_range(14, ) {
-        ::aie::vector<T, N> data_buf1 =
-            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
-        img_in += N;
-        temp_val = ::aie::lt(mask_thresh, data_buf1);
-        data_out = ::aie::select(mask_zeros, data_buf1, temp_val);
-        ::aie::store_v(img_out, data_out);
-        img_out += N;
-      }
-    break;
-  case XF_THRESHOLD_TYPE_TOZERO_INV:
-    for (int j = 0; j < (img_height * img_width);
-         j += N) // 16x samples per loop
-      chess_prepare_for_pipelining chess_loop_range(14, ) {
-        ::aie::vector<T, N> data_buf1 =
-            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
-        img_in += N;
-        temp_val = ::aie::lt(mask_thresh, data_buf1);
-        data_out = ::aie::select(data_buf1, mask_zeros, temp_val);
-        ::aie::store_v(img_out, data_out);
-        img_out += N;
-      }
-    break;
-  default:
-    for (int j = 0; j < (img_height * img_width);
-         j += N) // 16x samples per loop
-      chess_prepare_for_pipelining chess_loop_range(14, ) {
-        ::aie::vector<T, N> data_buf1 =
-            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
-        img_in += N;
-        data_out = ::aie::min(mask_thresh, data_buf1);
-        ::aie::store_v(img_out, data_out);
-        img_out += N;
-      }
-  }
-}
-
-extern "C" {
-
-#if BIT_WIDTH == 8
-
-void threshold(uint8_t *img_in, uint8_t *img_out, int32_t thresh_val,
-               int32_t max_val, int32_t img_width, int32_t img_height) {
-  threshold_aie<uint8_t, 64>(img_in, img_out, img_width, img_height, thresh_val,
-                             max_val, XF_THRESHOLD_TYPE_BINARY);
-}
-
-void thresholdTile(uint8_t *in, uint8_t *out, int32_t tileHeight,
-                   int32_t tileWidth, uint8_t thresholdValue, uint8_t maxValue,
-                   uint8_t thresholdType) {
-  threshold_aie<uint8_t, 64>(in, out, tileWidth, tileHeight, thresholdValue,
-                             maxValue, thresholdType);
-}
-
-void thresholdLine(uint8_t *in, uint8_t *out, int32_t lineWidth,
-                   uint8_t thresholdValue, uint8_t maxValue,
-                   uint8_t thresholdType) {
-  threshold_aie<uint8_t, 64>(in, out, lineWidth, 1, thresholdValue, maxValue,
-                             thresholdType);
-}
-
-void threshold4ChLine(uint8_t *in, uint8_t *out, int32_t lineWidth,
-                      uint8_t thresholdValue1, uint8_t thresholdValue2,
-                      uint8_t thresholdValue3, uint8_t thresholdValue4,
-                      uint8_t maxValue1, uint8_t maxValue2, uint8_t maxValue3,
-                      uint8_t maxValue4, uint8_t thresholdType) {
-  threshold4Ch_aie<uint8_t, 64>(in, out, lineWidth, 1, thresholdValue1,
-                                thresholdValue2, thresholdValue3,
-                                thresholdValue4, maxValue1, maxValue2,
-                                maxValue3, maxValue4, thresholdType);
-}
-
-#elif BIT_WIDTH == 16
-
-void threshold(int16_t *img_in, int16_t *img_out, int32_t thresh_val,
-               int32_t max_val, int32_t img_width, int32_t img_height) {
-  threshold_aie<int16_t, 32>(img_in, img_out, img_width, img_height, thresh_val,
-                             max_va, XF_THRESHOLD_TYPE_BINARY);
-}
-
-void thresholdTile(int16_t *in, int16_t *out, int32_t tileHeight,
-                   int32_t tileWidth, int16_t thresholdValue, int16_t maxValue,
-                   uint8_t thresholdType) {
-  threshold_aie<int16_t, 32>(in, out, tileWidth, tileHeight, thresholdValue,
-                             maxValue),
-      thresholdType;
-}
-
-void thresholdLine(int16_t *in, int16_t *out, int32_t lineWidth,
-                   int16_t thresholdValue, int16_t maxValue,
-                   uint8_t thresholdType) {
-  threshold_aie<int16_t, 32>(in, out, lineWidth, 1, thresholdValue, maxValue,
-                             thresholdType);
-}
-
-#else // 32
-
-void threshold(int32_t *img_in, int32_t *img_out, int32_t thresh_val,
-               int32_t max_val, int32_t img_width, int32_t img_height) {
-  threshold_aie<int32_t, 16>(img_in, img_out, img_width, img_height, thresh_val,
-                             max_val, XF_THRESHOLD_TYPE_BINARY);
-}
-
-void thresholdTile(int32_t *in, int32_t *out, int32_t tileHeight,
-                   int32_t tileWidth, int32_t thresholdValue, int32_t maxValue,
-                   uint8_t thresholdType) {
-  threshold_aie<int32_t, 16>(in, out, tileWidth, tileHeight, thresholdValue,
-                             maxValue, thresholdType);
-}
-
-void thresholdLine(int32_t *in, int32_t *out, int32_t lineWidth,
-                   int32_t thresholdValue, int32_t maxValue,
-                   uint8_t thresholdType) {
-  threshold_aie<int32_t, 16>(in, out, lineWidth, 1, thresholdValue, maxValue,
-                             thresholdType);
-}
-
-#endif
-
-} // extern "C"
+//===- threshold.cc ----------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2022, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// #define __AIENGINE__ 1
+#define NOCPP
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define REL_WRITE 0
+#define REL_READ 1
+
+enum _threshold_type {
+  XF_THRESHOLD_TYPE_BINARY = 0,
+  XF_THRESHOLD_TYPE_BINARY_INV = 1,
+  XF_THRESHOLD_TYPE_TRUNC = 2,
+  XF_THRESHOLD_TYPE_TOZERO = 3,
+  XF_THRESHOLD_TYPE_TOZERO_INV = 4,
+};
+
+// #define THRESH_TYPE XF_THRESHOLD_TYPE_BINARY
+
+#include <aie_api/aie.hpp>
+
+template <typename T, int N>
+__attribute__((noinline)) void
+threshold_aie(T *img_in, T *img_out, const int32_t img_width,
+              const int32_t img_height, const T &thresh_val, const T &max_val,
+              const uint8_t thresholdType) {
+  ::aie::vector<T, N> constants;
+  ::aie::vector<T, N> data_out;
+  ::aie::mask<N> temp_val;
+  constants[0] = 0;          // updating constant zero_val value
+  constants[1] = thresh_val; // updating constant threshold value
+  constants[2] = max_val;    // updating constant max_val value
+
+  switch (thresholdType) {
+  case XF_THRESHOLD_TYPE_TRUNC:
+    for (int j = 0; j < (img_height * img_width);
+         j += N) // 16x samples per loop
+      chess_prepare_for_pipelining chess_loop_range(14, ) {
+        ::aie::vector<T, N> data_buf1 =
+            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
+        img_in += N;
+        data_out = ::aie::min(constants[1], data_buf1);
+        ::aie::store_v(img_out, data_out);
+        img_out += N;
+      }
+    break;
+  case XF_THRESHOLD_TYPE_BINARY:
+    for (int j = 0; j < (img_height * img_width);
+         j += N) // 16x samples per loop
+      chess_prepare_for_pipelining chess_loop_range(14, ) {
+        ::aie::vector<T, N> data_buf1 =
+            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
+        img_in += N;
+        temp_val = ::aie::lt(constants[1], data_buf1);
+        data_out = ::aie::select(constants[0], constants[2], temp_val);
+        ::aie::store_v(img_out, data_out);
+        img_out += N;
+      }
+    break;
+  case XF_THRESHOLD_TYPE_BINARY_INV:
+    for (int j = 0; j < (img_height * img_width);
+         j += N) // 16x samples per loop
+      chess_prepare_for_pipelining chess_loop_range(14, ) {
+        ::aie::vector<T, N> data_buf1 =
+            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
+        img_in += N;
+        temp_val = ::aie::lt(constants[1], data_buf1);
+        data_out = ::aie::select(constants[2], constants[0], temp_val);
+        ::aie::store_v(img_out, data_out);
+        img_out += N;
+      }
+    break;
+  case XF_THRESHOLD_TYPE_TOZERO:
+    for (int j = 0; j < (img_height * img_width);
+         j += N) // 16x samples per loop
+      chess_prepare_for_pipelining chess_loop_range(14, ) {
+        ::aie::vector<T, N> data_buf1 =
+            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
+        img_in += N;
+        temp_val = ::aie::lt(constants[1], data_buf1);
+        data_out = ::aie::select(constants[0], data_buf1, temp_val);
+        ::aie::store_v(img_out, data_out);
+        img_out += N;
+      }
+    break;
+  case XF_THRESHOLD_TYPE_TOZERO_INV:
+    for (int j = 0; j < (img_height * img_width);
+         j += N) // 16x samples per loop
+      chess_prepare_for_pipelining chess_loop_range(14, ) {
+        ::aie::vector<T, N> data_buf1 =
+            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
+        img_in += N;
+        temp_val = ::aie::lt(constants[1], data_buf1);
+        data_out = ::aie::select(data_buf1, constants[0], temp_val);
+        ::aie::store_v(img_out, data_out);
+        img_out += N;
+      }
+    break;
+  default:
+    for (int j = 0; j < (img_height * img_width);
+         j += N) // 16x samples per loop
+      chess_prepare_for_pipelining chess_loop_range(14, ) {
+        ::aie::vector<T, N> data_buf1 =
+            ::aie::load_v(img_in); // in:00++15|_________|_________|_________
+        img_in += N;
+        data_out = ::aie::min(constants[1], data_buf1);
+        ::aie::store_v(img_out, data_out);
+        img_out += N;
+      }
+  }
+}
+
+// Thresholds interleaved 4-channel data: element i * 4 + c is tested against
+// thresh_val<c+1>, and the BINARY modes write max_val<c+1> or zero. Handles
+// img_width * img_height elements in whole N-element vectors, so that count
+// should be a multiple of N (there is no tail handling).
+template <typename T, int N>
+__attribute__((noinline)) void threshold4Ch_aie(
+    T *img_in, T *img_out, const int32_t img_width, const int32_t img_height,
+    const T &thresh_val1, const T &thresh_val2, const T &thresh_val3,
+    const T &thresh_val4, const T &max_val1, const T &max_val2,
+    const T &max_val3, const T &max_val4, const uint8_t thresholdType) {
+  ::aie::vector<T, N> data_out;
+  ::aie::mask<N> temp_val;
+
+  ::aie::vector<T, N> mask_zeros = ::aie::zeros<T, N>();
+  ::aie::vector<T, N> mask_thresh;
+  ::aie::vector<T, N> mask_max;
+  for (int i = 0; i < N / 4; i++) { // tile the 4 per-channel constants
+    mask_thresh[i * 4] = thresh_val1;
+    mask_thresh[i * 4 + 1] = thresh_val2;
+    mask_thresh[i * 4 + 2] = thresh_val3;
+    mask_thresh[i * 4 + 3] = thresh_val4;
+    mask_max[i * 4] = max_val1;
+    mask_max[i * 4 + 1] = max_val2;
+    mask_max[i * 4 + 2] = max_val3;
+    mask_max[i * 4 + 3] = max_val4;
+  }
+
+  switch (thresholdType) {
+  case XF_THRESHOLD_TYPE_TRUNC: // out = min(in, thresh)
+    for (int j = 0; j < (img_height * img_width);
+         j += N) // N samples per iteration
+      chess_prepare_for_pipelining chess_loop_range(14, ) {
+        ::aie::vector<T, N> data_buf1 =
+            ::aie::load_v(img_in); // next N elements of the input
+        img_in += N;
+        data_out = ::aie::min(mask_thresh, data_buf1);
+        ::aie::store_v(img_out, data_out);
+        img_out += N;
+      }
+    break;
+  case XF_THRESHOLD_TYPE_BINARY: // out = in > thresh ? max : 0
+    for (int j = 0; j < (img_height * img_width);
+         j += N) // N samples per iteration
+      chess_prepare_for_pipelining chess_loop_range(14, ) {
+        ::aie::vector<T, N> data_buf1 =
+            ::aie::load_v(img_in); // next N elements of the input
+        img_in += N;
+        temp_val = ::aie::lt(mask_thresh, data_buf1);
+        data_out = ::aie::select(mask_zeros, mask_max, temp_val);
+        ::aie::store_v(img_out, data_out);
+        img_out += N;
+      }
+    break;
+  case XF_THRESHOLD_TYPE_BINARY_INV: // out = in > thresh ? 0 : max
+    for (int j = 0; j < (img_height * img_width);
+         j += N) // N samples per iteration
+      chess_prepare_for_pipelining chess_loop_range(14, ) {
+        ::aie::vector<T, N> data_buf1 =
+            ::aie::load_v(img_in); // next N elements of the input
+        img_in += N;
+        temp_val = ::aie::lt(mask_thresh, data_buf1);
+        data_out = ::aie::select(mask_max, mask_zeros, temp_val);
+        ::aie::store_v(img_out, data_out);
+        img_out += N;
+      }
+    break;
+  case XF_THRESHOLD_TYPE_TOZERO: // out = in > thresh ? in : 0
+    for (int j = 0; j < (img_height * img_width);
+         j += N) // N samples per iteration
+      chess_prepare_for_pipelining chess_loop_range(14, ) {
+        ::aie::vector<T, N> data_buf1 =
+            ::aie::load_v(img_in); // next N elements of the input
+        img_in += N;
+        temp_val = ::aie::lt(mask_thresh, data_buf1);
+        data_out = ::aie::select(mask_zeros, data_buf1, temp_val);
+        ::aie::store_v(img_out, data_out);
+        img_out += N;
+      }
+    break;
+  case XF_THRESHOLD_TYPE_TOZERO_INV: // out = in > thresh ? 0 : in
+    for (int j = 0; j < (img_height * img_width);
+         j += N) // N samples per iteration
+      chess_prepare_for_pipelining chess_loop_range(14, ) {
+        ::aie::vector<T, N> data_buf1 =
+            ::aie::load_v(img_in); // next N elements of the input
+        img_in += N;
+        temp_val = ::aie::lt(mask_thresh, data_buf1);
+        data_out = ::aie::select(data_buf1, mask_zeros, temp_val);
+        ::aie::store_v(img_out, data_out);
+        img_out += N;
+      }
+    break;
+  default: // any other type value: same operation as TRUNC
+    for (int j = 0; j < (img_height * img_width);
+         j += N) // N samples per iteration
+      chess_prepare_for_pipelining chess_loop_range(14, ) {
+        ::aie::vector<T, N> data_buf1 =
+            ::aie::load_v(img_in); // next N elements of the input
+        img_in += N;
+        data_out = ::aie::min(mask_thresh, data_buf1);
+        ::aie::store_v(img_out, data_out);
+        img_out += N;
+      }
+  }
+}
+
+extern "C" {
+// Unmangled wrappers over the kernel templates; BIT_WIDTH fixes T and N.
+#if BIT_WIDTH == 8 // uint8_t, N = 64
+// threshold() is BINARY-only; the *Line variants pass a height of 1.
+void threshold(uint8_t *img_in, uint8_t *img_out, int32_t thresh_val,
+               int32_t max_val, int32_t img_width, int32_t img_height) {
+  threshold_aie<uint8_t, 64>(img_in, img_out, img_width, img_height, thresh_val,
+                             max_val, XF_THRESHOLD_TYPE_BINARY);
+}
+
+void thresholdTile(uint8_t *in, uint8_t *out, int32_t tileHeight,
+                   int32_t tileWidth, uint8_t thresholdValue, uint8_t maxValue,
+                   uint8_t thresholdType) {
+  threshold_aie<uint8_t, 64>(in, out, tileWidth, tileHeight, thresholdValue,
+                             maxValue, thresholdType);
+}
+
+void thresholdLine(uint8_t *in, uint8_t *out, int32_t lineWidth,
+                   uint8_t thresholdValue, uint8_t maxValue,
+                   uint8_t thresholdType) {
+  threshold_aie<uint8_t, 64>(in, out, lineWidth, 1, thresholdValue, maxValue,
+                             thresholdType);
+}
+
+void threshold4ChLine(uint8_t *in, uint8_t *out, int32_t lineWidth,
+                      uint8_t thresholdValue1, uint8_t thresholdValue2,
+                      uint8_t thresholdValue3, uint8_t thresholdValue4,
+                      uint8_t maxValue1, uint8_t maxValue2, uint8_t maxValue3,
+                      uint8_t maxValue4, uint8_t thresholdType) {
+  threshold4Ch_aie<uint8_t, 64>(in, out, lineWidth, 1, thresholdValue1,
+                                thresholdValue2, thresholdValue3,
+                                thresholdValue4, maxValue1, maxValue2,
+                                maxValue3, maxValue4, thresholdType);
+}
+
+#elif BIT_WIDTH == 16 // int16_t, N = 32
+
+void threshold(int16_t *img_in, int16_t *img_out, int32_t thresh_val,
+               int32_t max_val, int32_t img_width, int32_t img_height) {
+  threshold_aie<int16_t, 32>(img_in, img_out, img_width, img_height, thresh_val,
+                             max_val, XF_THRESHOLD_TYPE_BINARY);
+}
+
+void thresholdTile(int16_t *in, int16_t *out, int32_t tileHeight,
+                   int32_t tileWidth, int16_t thresholdValue, int16_t maxValue,
+                   uint8_t thresholdType) {
+  // Parameters arrive as (height, width); the kernel takes (width, height).
+  threshold_aie<int16_t, 32>(in, out, tileWidth, tileHeight, thresholdValue,
+                             maxValue, thresholdType);
+}
+
+void thresholdLine(int16_t *in, int16_t *out, int32_t lineWidth,
+                   int16_t thresholdValue, int16_t maxValue,
+                   uint8_t thresholdType) {
+  threshold_aie<int16_t, 32>(in, out, lineWidth, 1, thresholdValue, maxValue,
+                             thresholdType);
+}
+
+#else // 32-bit: int32_t, N = 16
+
+void threshold(int32_t *img_in, int32_t *img_out, int32_t thresh_val,
+               int32_t max_val, int32_t img_width, int32_t img_height) {
+  threshold_aie<int32_t, 16>(img_in, img_out, img_width, img_height, thresh_val,
+                             max_val, XF_THRESHOLD_TYPE_BINARY);
+}
+
+void thresholdTile(int32_t *in, int32_t *out, int32_t tileHeight,
+                   int32_t tileWidth, int32_t thresholdValue, int32_t maxValue,
+                   uint8_t thresholdType) {
+  threshold_aie<int32_t, 16>(in, out, tileWidth, tileHeight, thresholdValue,
+                             maxValue, thresholdType);
+}
+
+void thresholdLine(int32_t *in, int32_t *out, int32_t lineWidth,
+                   int32_t thresholdValue, int32_t maxValue,
+                   uint8_t thresholdType) {
+  threshold_aie<int32_t, 16>(in, out, lineWidth, 1, thresholdValue, maxValue,
+                             thresholdType);
+}
+
+#endif
+
+} // extern "C"

From a2d9254ced87b814216f5f125c6d2d6419e1c59c Mon Sep 17 00:00:00 2001
From: Stephen Neuendorffer <stephen.neuendorffer@amd.com>
Date: Tue, 23 Jan 2024 09:14:00 -0800
Subject: [PATCH 2/3] [vision_pipelines] remove extra defines.

These defines are actually in aie_api.h, and aren't used in the code
anyway.
---
 .../ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc     | 3 ---
 .../ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc      | 3 ---
 .../ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc       | 3 ---
 .../ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc        | 3 ---
 .../ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc       | 3 ---
 .../ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc     | 3 ---
 .../ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc       | 3 ---
 .../ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc        | 3 ---
 .../ipu-xrt/vision_pipelines/vision_kernels/threshold.cc       | 3 ---
 9 files changed, 27 deletions(-)

diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc
index c6868aad3a..849e47a551 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc
@@ -16,9 +16,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#define REL_WRITE 0
-#define REL_READ 1
-
 // #include <imgproc/xf_addweighted_aie.hpp> // NOTE: use of float2fix not
 //  supported in aie2
 #include <aie_api/aie.hpp>
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc
index e3cd871195..57aec65470 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc
@@ -15,9 +15,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#define REL_WRITE 0
-#define REL_READ 1
-
 #include <aie_api/aie.hpp>
 
 template <typename T, int N>
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc
index e02ec472f5..cd6e0d1027 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc
@@ -15,9 +15,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#define REL_WRITE 0
-#define REL_READ 1
-
 #include <aie_api/aie.hpp>
 
 template <typename T, int N>
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc
index ccbbb87c6f..59d11d303f 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc
@@ -15,9 +15,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#define REL_WRITE 0
-#define REL_READ 1
-
 #define THRESH_TYPE XF_THRESHOLD_TYPE_BINARY
 
 #include <aie_api/aie.hpp>
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc
index 351b3331b6..48d1c6acba 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc
@@ -14,9 +14,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#define REL_WRITE 0
-#define REL_READ 1
-
 #include <aie_api/aie.hpp>
 
 ::aie::vector<uint8, 64> vector_broadcast(::aie::vector<uint8, 16> e) {
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc
index 4725d09ba6..9d5d6188b0 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc
@@ -15,9 +15,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#define REL_WRITE 0
-#define REL_READ 1
-
 #include <aie_api/aie.hpp>
 
 template <typename T, int N>
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc
index 988c4a0e4c..1f1bc23d4c 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc
@@ -14,9 +14,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#define REL_WRITE 0
-#define REL_READ 1
-
 #include <aie_api/aie.hpp>
 
 const int32_t SRS_SHIFT = 15;
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc
index 3f20bf514d..39a2894495 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc
@@ -14,9 +14,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#define REL_WRITE 0
-#define REL_READ 1
-
 // clang-format off
 #include <aie_api/aie.hpp>
 #include "lut_inv_8b.h"
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/threshold.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/threshold.cc
index e1ffb38479..ef4d9ee7c1 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/threshold.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/threshold.cc
@@ -16,9 +16,6 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#define REL_WRITE 0
-#define REL_READ 1
-
 enum _threshold_type {
   XF_THRESHOLD_TYPE_BINARY = 0,
   XF_THRESHOLD_TYPE_BINARY_INV = 1,

From b22002635db963b225548adb7e53fdb4382dd486 Mon Sep 17 00:00:00 2001
From: Stephen Neuendorffer <stephen.neuendorffer@amd.com>
Date: Tue, 23 Jan 2024 09:58:24 -0800
Subject: [PATCH 3/3] Update ipu edgeDetect demo to compile and link with peano

This change also updates test.cpp to properly support 'live' mode.
With no arguments: runs a single image and attempts to compare
against OpenCV.
With -l: runs continuously using the webcam.
With -v: runs continuously using a video file.
---
 reference_designs/ipu-xrt/makefile-common     |   7 +
 .../vision_pipelines/edge_detect/Makefile     |  51 ++-
 .../edge_detect/aie2_edgeDetect.py            |   4 +-
 .../vision_pipelines/edge_detect/test.cpp     | 393 ++++++++----------
 .../vision_kernels/rgba2gray.cc               |   2 +-
 5 files changed, 237 insertions(+), 220 deletions(-)

diff --git a/reference_designs/ipu-xrt/makefile-common b/reference_designs/ipu-xrt/makefile-common
index d9a0a69015..ed851b0e29 100644
--- a/reference_designs/ipu-xrt/makefile-common
+++ b/reference_designs/ipu-xrt/makefile-common
@@ -6,8 +6,15 @@ VITIS_AIETOOLS_DIR ?= ${VITIS_ROOT}/aietools
 VITIS_AIE_INCLUDE_DIR ?= ${VITIS_ROOT}/aietools/data/versal_prod/lib
 VITIS_AIE2_INCLUDE_DIR ?= ${VITIS_ROOT}/aietools/data/aie_ml/lib
 
+export XILINX_VITIS_AIETOOLS = ${VITIS_AIETOOLS_DIR}
+
 CHESSCC1_FLAGS = -f -p me -P ${VITIS_AIE_INCLUDE_DIR} -I ${VITIS_AIETOOLS_DIR}/include
 CHESSCC2_FLAGS = -f -p me -P ${VITIS_AIE2_INCLUDE_DIR} -I ${VITIS_AIETOOLS_DIR}/include -D__AIENGINE__=2 -D__AIEARCH__=20
+
+# Compile with peano, link with xbridge
+CLANG_CHESSLIBC_FLAGS = --target=aie2 -I ${VITIS_AIETOOLS_DIR}/include -I ${VITIS_AIETOOLS_DIR}/include/adf -I ${VITIS_AIETOOLS_DIR}/include/aie_api -D_LIBCPP_HAS_THREAD_API_PTHREAD --std=c++2a -O2 -fno-jump-tables -Wno-deprecated-declarations
+# Compile with peano, link with lld
+CLANG_FLAGS = --target=aie2-none-elf -I ${VITIS_AIETOOLS_DIR}/include -D_LIBCPP_HAS_THREAD_API_PTHREAD --std=c++2a -O2 -fno-jump-tables -Wno-deprecated-declarations
 CHESS_FLAGS = -P ${VITIS_AIE_INCLUDE_DIR}
 
 CHESSCCWRAP1_FLAGS = aie -I ${VITIS_AIETOOLS_DIR}/include 
diff --git a/reference_designs/ipu-xrt/vision_pipelines/edge_detect/Makefile b/reference_designs/ipu-xrt/vision_pipelines/edge_detect/Makefile
index a0d50006e1..1557420dca 100755
--- a/reference_designs/ipu-xrt/vision_pipelines/edge_detect/Makefile
+++ b/reference_designs/ipu-xrt/vision_pipelines/edge_detect/Makefile
@@ -21,13 +21,19 @@ EDGEDETECT_HEIGHT = 1080
 
 targetname = edgeDetect
 
-all: build/final_${EDGEDETECT_WIDTH}.xclbin
+default: peano
+
+all: peano xchesscc peanoxbridge
 
 mlir: build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir
+peano: build/final_${EDGEDETECT_WIDTH}.xclbin
+xchesscc: build_xchesscc/final_${EDGEDETECT_WIDTH}.xclbin
+peanoxbridge: build_peanoxbridge/final_${EDGEDETECT_WIDTH}.xclbin
 
+PEANO = /root/acdc/build/install/peano/bin/clang++
 build/%.cc.o: %.cc
-	mkdir -p ${@D}
-	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F}
+	@mkdir -p ${@D}
+	cd ${@D} && ${PEANO} ${CLANG_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F}
 
 build/combined_gray2rgba_addWeighted.a: build/gray2rgba.cc.o build/addWeighted.cc.o
 	mkdir -p ${@D}
@@ -42,6 +48,45 @@ build/final_${EDGEDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${EDGEDETECT_WID
 	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
 		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
 
+XCHESSCC_OBJS = build_xchesscc/rgba2gray.cc.o \
+				build_xchesscc/gray2rgba.cc.o \
+				build_xchesscc/filter2d.cc.o \
+				build_xchesscc/threshold.cc.o \
+				build_xchesscc/addWeighted.cc.o \
+				build_xchesscc/combined_gray2rgba_addWeighted.a
+
+build_xchesscc/%.cc.o: %.cc
+	@mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} +w ${@F}.work -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F}
+
+build_xchesscc/combined_gray2rgba_addWeighted.a: build_xchesscc/gray2rgba.cc.o build_xchesscc/addWeighted.cc.o
+	mkdir -p ${@D}
+	ar rvs $@ $< $(word 2,$^)
+
+build_xchesscc/final_${EDGEDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir ${XCHESSCC_OBJS}
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --xchesscc --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host \
+		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+
+PEANOXBRIDGE_OBJS = build_peanoxbridge/rgba2gray.cc.o \
+				build_peanoxbridge/gray2rgba.cc.o \
+				build_peanoxbridge/filter2d.cc.o \
+				build_peanoxbridge/threshold.cc.o \
+				build_peanoxbridge/addWeighted.cc.o \
+				build_peanoxbridge/combined_gray2rgba_addWeighted.a
+build_peanoxbridge/%.cc.o: %.cc
+	@mkdir -p ${@D}
+	cd ${@D} && ${PEANO} ${CLANG_CHESSLIBC_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F}
+
+build_peanoxbridge/combined_gray2rgba_addWeighted.a: build_peanoxbridge/gray2rgba.cc.o build_peanoxbridge/addWeighted.cc.o
+	mkdir -p ${@D}
+	ar rvs $@ $< $(word 2,$^)
+
+build_peanoxbridge/final_${EDGEDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir ${PEANOXBRIDGE_OBJS}
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host \
+		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+
 ${targetname}.exe: test.cpp
 	rm -rf _build
 	mkdir -p _build
diff --git a/reference_designs/ipu-xrt/vision_pipelines/edge_detect/aie2_edgeDetect.py b/reference_designs/ipu-xrt/vision_pipelines/edge_detect/aie2_edgeDetect.py
index 48682e4011..5a32a38c9f 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/edge_detect/aie2_edgeDetect.py
+++ b/reference_designs/ipu-xrt/vision_pipelines/edge_detect/aie2_edgeDetect.py
@@ -178,10 +178,12 @@ def core_body():
                     objectfifo_release(ObjectFifoPort.Produce, "OF_2to3", 1)
                     yield_([])
 
+            kernel = Buffer(ComputeTile3, [3, 3], T.i16(), "kernel")
+
             # Compute tile 3
             @core(ComputeTile3, "filter2d.cc.o")
             def core_body():
-                kernel = memref.alloc([3, 3], T.i16())
+                # kernel = memref.alloca([3, 3], T.i16())
                 v0 = arith.constant(0, T.i16())
                 v1 = arith.constant(4096, T.i16())
                 v_minus4 = arith.constant(-16384, T.i16())
diff --git a/reference_designs/ipu-xrt/vision_pipelines/edge_detect/test.cpp b/reference_designs/ipu-xrt/vision_pipelines/edge_detect/test.cpp
index 57389fffbf..0fa2e80294 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/edge_detect/test.cpp
+++ b/reference_designs/ipu-xrt/vision_pipelines/edge_detect/test.cpp
@@ -105,18 +105,156 @@ int main(int argc, const char *argv[]) {
   std::cout << "Running edgeDetect for resolution: " << testImageWidth << "x"
             << testImageHeight << std::endl;
 
-  if (vm.count("live")) {
-    std::cout << "Using live webcam input" << std::endl;
+  /*
+  ****************************************************************************
+  * Read the input image or generate random one if no input file argument
+  * provided
+  ****************************************************************************
+  */
+  cv::Mat inImage, inImageRGBA;
+  cv::String fileIn;
+  if (vm.count("image")) {
+    fileIn = vm["image"].as<std::string>();
+    //"/group/xrlabs/imagesAndVideos/images/minion128x128.jpg";
+    initializeSingleImageTest(fileIn, inImage);
+  } else {
+    fileIn = "RANDOM";
+    inImage = cv::Mat(testImageHeight, testImageWidth, CV_8UC3);
+    cv::randu(inImage, cv::Scalar(0, 0, 0), cv::Scalar(255, 255, 255));
+  }
+
+  cv::String fileOut =
+      vm["outfile"].as<std::string>(); //"edgeDetectOut_test.jpg";
+  printf("Load input image %s and run edgeDetect\n", fileIn.c_str());
+
+  cv::resize(inImage, inImage, cv::Size(testImageWidth, testImageHeight));
+  cv::cvtColor(inImage, inImageRGBA, cv::COLOR_BGR2RGBA);
+
+  /*
+   ****************************************************************************
+   * Calculate OpenCV reference for edgeDetect
+   ****************************************************************************
+   */
+
+  cv::Mat outImageReference, outImageTestBGR;
+  edgeDetect(inImage, outImageReference);
+
+  cv::cvtColor(outImageReference, outImageReference, cv::COLOR_BGR2RGBA);
+  cv::Mat outImageTest(testImageHeight, testImageWidth, CV_8UC4);
+
+  /*
+   ****************************************************************************
+   * Load instruction sequence
+   ****************************************************************************
+   */
+  std::vector<uint32_t> instr_v =
+      load_instr_sequence(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  /*
+   ****************************************************************************
+   * Start the XRT context and load the kernel
+   ****************************************************************************
+   */
+  xrt::device device;
+  xrt::kernel kernel;
+
+  initXrtLoadKernel(device, kernel, verbosity, vm["xclbin"].as<std::string>(),
+                    vm["kernel"].as<std::string>());
+
+  /*
+   ****************************************************************************
+   * Set up the buffer objects
+   ****************************************************************************
+   */
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+  auto bo_inA = xrt::bo(device, inImageRGBA.total() * inImageRGBA.elemSize(),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+  auto bo_inB = xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_out =
+      xrt::bo(device, (outImageTest.total() * outImageTest.elemSize()),
+              XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+
+  if (verbosity >= 1)
+    std::cout << "Writing data into buffer objects.\n";
+
+  uint8_t *bufInA = bo_inA.map<uint8_t *>();
+
+  // Copy cv::Mat input image to xrt buffer object
+  memcpy(bufInA, inImageRGBA.data,
+         (inImageRGBA.total() * inImageRGBA.elemSize()));
+
+  // Copy instruction stream to xrt buffer object
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  // Sync host to device memories
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inB.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  // Execute the kernel and wait to finish
+  if (verbosity >= 1)
+    std::cout << "Running Kernel.\n";
+  auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+  run.wait();
+
+  // Sync device to host memories
+  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  // Store result in cv::Mat
+  uint8_t *bufOut = bo_out.map<uint8_t *>();
+  memcpy(outImageTest.data, bufOut,
+         (outImageTest.total() * outImageTest.elemSize()));
+
+  /*
+   ****************************************************************************
+   * Compare to OpenCV reference
+   ****************************************************************************
+   */
+  int numberOfDifferences = 0;
+  double errorPerPixel = 0;
+  imageCompare(outImageTest, outImageReference, numberOfDifferences,
+               errorPerPixel, true, false);
+  printf("Number of differences: %d, average L1 error: %f\n",
+         numberOfDifferences, errorPerPixel);
+
+  cv::cvtColor(outImageTest, outImageTestBGR, cv::COLOR_RGBA2BGR);
+  cv::imwrite(fileOut, outImageTestBGR);
+
+  // Print Pass/Fail result of our test
+  int res = 0;
+  if (errorPerPixel < epsilon) {
+    printf("PASS!\n");
+    res = 0;
+  } else {
+    printf("Fail!\n");
+    res = -1;
+  }
+
+  if (vm.count("live") || vm.count("video")) {
+    if (vm.count("live"))
+      std::cout << "Using live webcam input" << std::endl;
+    else
+      std::cout << "Reading movie file " << vm["video"].as<std::string>()
+                << std::endl;
 
     cv::VideoCapture cap;
     try {
-      initializeVideoCapture(cap);
+      if (vm.count("live"))
+        initializeVideoCapture(cap);
+      else
+        initializeVideoFile(cap, vm["video"].as<std::string>());
     } catch (const std::exception &ex) {
       std::cerr << ex.what() << "\n\n";
       return 1;
     }
 
-    //--- GRAB AND SHOW LOOP
+    //--- frame grab + process
     std::cout << "Start grabbing" << std::endl
               << "Press any key to terminate" << std::endl;
     cv::Mat frame;
@@ -129,222 +267,47 @@ int main(int argc, const char *argv[]) {
         break;
       }
 
-      cv::Mat edgeFrame;
-      edgeDetect(frame, edgeFrame);
+      // cv::Mat edgeFrame;
+      // edgeDetect(frame,edgeFrame);
 
-      // show live and wait for a key with timeout long enough to show images
-      cv::imshow("Live", edgeFrame);
-      if (cv::waitKey(5) >= 0)
-        break;
-    }
-  }
+      cv::resize(frame, inImage, cv::Size(testImageWidth, testImageHeight));
+      cv::cvtColor(inImage, inImageRGBA, cv::COLOR_BGR2RGBA);
+      // Copy cv::Mat input image to xrt buffer object
+      memcpy(bufInA, inImageRGBA.data,
+             (inImageRGBA.total() * inImageRGBA.elemSize()));
 
-  else {
-
-    /*
-    ****************************************************************************
-    * Read the input image or generate random one if no input file argument
-    * provided
-    ****************************************************************************
-    */
-    cv::Mat inImage, inImageRGBA;
-    cv::String fileIn;
-    if (vm.count("image")) {
-      fileIn = vm["image"].as<std::string>();
-      //"/group/xrlabs/imagesAndVideos/images/minion128x128.jpg";
-      initializeSingleImageTest(fileIn, inImage);
-    } else {
-      fileIn = "RANDOM";
-      inImage = cv::Mat(testImageHeight, testImageWidth, CV_8UC3);
-      cv::randu(inImage, cv::Scalar(0, 0, 0), cv::Scalar(255, 255, 255));
-    }
+      // Copy instruction stream to xrt buffer object
+      void *bufInstr = bo_instr.map<void *>();
+      memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
 
-    cv::String fileOut =
-        vm["outfile"].as<std::string>(); //"edgeDetectOut_test.jpg";
-    printf("Load input image %s and run edgeDetect\n", fileIn.c_str());
-
-    cv::resize(inImage, inImage, cv::Size(testImageWidth, testImageHeight));
-    cv::cvtColor(inImage, inImageRGBA, cv::COLOR_BGR2RGBA);
-
-    /*
-     ****************************************************************************
-     * Calculate OpenCV referennce for edgeDetect
-     ****************************************************************************
-     */
-
-    cv::Mat outImageReference, outImageTestBGR;
-    edgeDetect(inImage, outImageReference);
-
-    cv::cvtColor(outImageReference, outImageReference, cv::COLOR_BGR2RGBA);
-    cv::Mat outImageTest(testImageHeight, testImageWidth, CV_8UC4);
-
-    /*
-     ****************************************************************************
-     * Load instruction sequence
-     ****************************************************************************
-     */
-    std::vector<uint32_t> instr_v =
-        load_instr_sequence(vm["instr"].as<std::string>());
-
-    int verbosity = vm["verbosity"].as<int>();
-    if (verbosity >= 1)
-      std::cout << "Sequence instr count: " << instr_v.size() << "\n";
-
-    /*
-     ****************************************************************************
-     * Start the XRT context and load the kernel
-     ****************************************************************************
-     */
-    xrt::device device;
-    xrt::kernel kernel;
-
-    initXrtLoadKernel(device, kernel, verbosity, vm["xclbin"].as<std::string>(),
-                      vm["kernel"].as<std::string>());
-
-    /*
-     ****************************************************************************
-     * Set up the buffer objects
-     ****************************************************************************
-     */
-    auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
-                            XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
-    auto bo_inA = xrt::bo(device, inImageRGBA.total() * inImageRGBA.elemSize(),
-                          XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
-    auto bo_inB =
-        xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
-    auto bo_out =
-        xrt::bo(device, (outImageTest.total() * outImageTest.elemSize()),
-                XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
-
-    if (verbosity >= 1)
-      std::cout << "Writing data into buffer objects.\n";
-
-    uint8_t *bufInA = bo_inA.map<uint8_t *>();
-
-    // Copy cv::Mat input image to xrt buffer object
-    memcpy(bufInA, inImageRGBA.data,
-           (inImageRGBA.total() * inImageRGBA.elemSize()));
-
-    // Copy instruction stream to xrt buffer object
-    void *bufInstr = bo_instr.map<void *>();
-    memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
-
-    // Sync host to device memories
-    bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-    bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-    bo_inB.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-
-    // Execute the kernel and wait to finish
-    if (verbosity >= 1)
-      std::cout << "Running Kernel.\n";
-    auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
-    run.wait();
-
-    // Sync device to host memories
-    bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-
-    // Store result in cv::Mat
-    uint8_t *bufOut = bo_out.map<uint8_t *>();
-    memcpy(outImageTest.data, bufOut,
-           (outImageTest.total() * outImageTest.elemSize()));
-
-    /*
-     ****************************************************************************
-     * Compare to OpenCV reference
-     ****************************************************************************
-     */
-    int numberOfDifferences = 0;
-    double errorPerPixel = 0;
-    imageCompare(outImageTest, outImageReference, numberOfDifferences,
-                 errorPerPixel, true, false);
-    printf("Number of differences: %d, average L1 error: %f\n",
-           numberOfDifferences, errorPerPixel);
-
-    cv::cvtColor(outImageTest, outImageTestBGR, cv::COLOR_RGBA2BGR);
-    cv::imwrite(fileOut, outImageTestBGR);
-
-    // Print Pass/Fail result of our test
-    int res = 0;
-    if (errorPerPixel < epsilon) {
-      printf("PASS!\n");
-      res = 0;
-    } else {
-      printf("Fail!\n");
-      res = -1;
-    }
+      // Sync host to device memories
+      bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+      bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+      bo_inB.sync(XCL_BO_SYNC_BO_TO_DEVICE);
 
-    if (vm.count("live") || vm.count("video")) {
-      if (vm.count("live"))
-        std::cout << "Using live webcam input" << std::endl;
-      else
-        std::cout << "Reading movie file " << vm["video"].as<std::string>()
-                  << std::endl;
-
-      cv::VideoCapture cap;
-      try {
-        if (vm.count("live"))
-          initializeVideoCapture(cap);
-        else
-          initializeVideoFile(cap, vm["video"].as<std::string>());
-      } catch (const std::exception &ex) {
-        std::cerr << ex.what() << "\n\n";
-        return 1;
-      }
+      // Execute the kernel and wait to finish
+      if (verbosity >= 1)
+        std::cout << "Running Kernel.\n";
 
-      //--- frame grab + process
-      std::cout << "Start grabbing" << std::endl
-                << "Press any key to terminate" << std::endl;
-      cv::Mat frame;
-      for (;;) {
-        // wait for a new frame from camera and store it into 'frame'
-        cap.read(frame);
-        // check if we succeeded
-        if (frame.empty()) {
-          std::cerr << "ERROR! blank frame grabbed\n";
-          break;
-        }
-
-        // cv::Mat edgeFrame;
-        // edgeDetect(frame,edgeFrame);
-
-        cv::resize(frame, inImage, cv::Size(testImageWidth, testImageHeight));
-        cv::cvtColor(inImage, inImageRGBA, cv::COLOR_BGR2RGBA);
-        // Copy cv::Mat input image to xrt buffer object
-        memcpy(bufInA, inImageRGBA.data,
-               (inImageRGBA.total() * inImageRGBA.elemSize()));
-
-        // Copy instruction stream to xrt buffer object
-        void *bufInstr = bo_instr.map<void *>();
-        memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
-
-        // Sync host to device memories
-        bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-        bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-        bo_inB.sync(XCL_BO_SYNC_BO_TO_DEVICE);
-
-        // Execute the kernel and wait to finish
-        if (verbosity >= 1)
-          std::cout << "Running Kernel.\n";
-        auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
-        run.wait();
-
-        // Sync device to host memories
-        bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-
-        // Store result in cv::Mat
-        uint8_t *bufOut = bo_out.map<uint8_t *>();
-        memcpy(outImageTest.data, bufOut,
-               (outImageTest.total() * outImageTest.elemSize()));
-
-        // show live and wait for a key with timeout long enough to show images
-        cv::cvtColor(outImageTest, outImageTestBGR, cv::COLOR_RGBA2BGR);
-        cv::imshow("Edge AIE", outImageTestBGR);
-        if (cv::waitKey(5) >= 0)
-          break;
-      }
-    }
+      auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+      run.wait();
 
-    printf("Testing edgeDetect done!\n");
-    return res;
+      // Sync device to host memories
+      bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+      // Store result in cv::Mat
+      uint8_t *bufOut = bo_out.map<uint8_t *>();
+      memcpy(outImageTest.data, bufOut,
+             (outImageTest.total() * outImageTest.elemSize()));
+
+      // show live and wait for a key with timeout long enough to show images
+      cv::cvtColor(outImageTest, outImageTestBGR, cv::COLOR_RGBA2BGR);
+      cv::imshow("Edge AIE", outImageTestBGR);
+      if (cv::waitKey(5) >= 0)
+        break;
+    }
   }
+
+  printf("Testing edgeDetect done!\n");
+  return res;
 }
diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc
index 1f1bc23d4c..b3ff661be4 100644
--- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc
+++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc
@@ -56,7 +56,7 @@ __attribute__((noinline)) void rgba2gray_aie(uint8_t *rgba_in, uint8_t *y_out,
   ::aie::vector<uint8_t, 32> r, g, b;
   ::aie::vector<uint8_t, 32> y;
 
-  for (int j = 0; (j < (width * height) / 32); j += 1)
+  for (int j = 0; (j < (width * height) >> 5); j += 1)
     chess_prepare_for_pipelining {
       xf_extract_rgb(rgba_in, r, g, b);