From 81d1039b92193f424e5666adb5e766089171cfed Mon Sep 17 00:00:00 2001 From: Stephen Neuendorffer Date: Tue, 23 Jan 2024 08:50:42 -0800 Subject: [PATCH 1/3] dos2unix --- .../vision_kernels/addWeighted.cc | 248 +++---- .../vision_kernels/bitwiseAND.cc | 178 ++--- .../vision_kernels/bitwiseOR.cc | 178 ++--- .../vision_kernels/filter2d.cc | 446 ++++++------- .../vision_kernels/gray2rgba.cc | 164 ++--- .../vision_kernels/passThrough.cc | 174 ++--- .../vision_kernels/rgba2gray.cc | 214 +++--- .../vision_kernels/rgba2hue.cc | 362 +++++----- .../vision_kernels/threshold.cc | 626 +++++++++--------- 9 files changed, 1295 insertions(+), 1295 deletions(-) diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc index 24d6fb7f25..c6868aad3a 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc @@ -1,124 +1,124 @@ -//===- add_weighted.cc -------------------------------------------------*- C++ -//-*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2022, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -// #define __AIENGINE__ 1 -#define NOCPP - -#include -#include -#include - -#define REL_WRITE 0 -#define REL_READ 1 - -// #include // NOTE: use of float2fix not -// supported in aie2 -#include - -const int32_t SRS_SHIFT = 14; - -template -void addweighted_aie_scalar(const T *in1, const T *in2, T *out, - const int32_t width, const int32_t height, - const int16_t alpha, const int16_t beta, - const T gamma) { - for (int i = 0; i < height; i++) - for (int j = 0; j < width; j++) { - int tmpIn1 = in1[i * width + j] * alpha; - int tmpIn2 = in2[i * width + j] * beta; - int tmp = - ((tmpIn1 + tmpIn2 + (1 << (SRS_SHIFT - 1))) >> SRS_SHIFT) + gamma; - tmp = (tmp > MAX) ? MAX : (tmp < 0) ? 0 : tmp; // saturate - out[i * width + j] = (T)tmp; - } -} - -template -void addweighted_aie(const T *src1, const T *src2, T *dst, const int32_t width, - const int32_t height, const int16_t alphaFixedPoint, - const int16_t betaFixedPoint, const T gamma) { - - ::aie::set_saturation( - aie::saturation_mode::saturate); // Needed to saturate properly to uint8 - - ::aie::vector coeff(alphaFixedPoint, betaFixedPoint); - ::aie::vector gamma_coeff; - ::aie::accum gamma_acc; - for (int i = 0; i < N; i++) { - gamma_coeff[i] = gamma; - } - gamma_acc.template from_vector(gamma_coeff, 0); - for (int j = 0; j < width * height; j += N) // 16 samples per loop - chess_prepare_for_pipelining chess_loop_range( - 14, ) // loop_range(14) - loop : 1 cycle - { - ::aie::vector data_buf1 = ::aie::load_v(src1); - src1 += N; - ::aie::vector data_buf2 = ::aie::load_v(src2); - src2 += N; - ::aie::accum acc = ::aie::accumulate( - gamma_acc, coeff, 0, data_buf1, - data_buf2); // weight[0] * data_buf1 + weight[1] * data_buf2 - ::aie::store_v(dst, acc.template to_vector(SRS_SHIFT)); - dst += N; - } -} - -extern "C" { - -#if BIT_WIDTH == 8 -void addWeightedLine(uint8_t *in1, uint8_t *in2, uint8_t *out, - int32_t lineWidth, int16_t alpha, int16_t beta, - uint8_t gamma) { - addweighted_aie(in1, in2, out, lineWidth, 1, alpha, - beta, gamma); -} - -void addWeightedTile(uint8_t *in1, uint8_t *in2, uint8_t *out, - int32_t tileHeight, int32_t tileWidth, int16_t alpha, - int16_t beta, uint8_t gamma) { - addweighted_aie(in1, in2, out, tileWidth, tileHeight, - alpha, beta, gamma); -} - -#elif BIT_WIDTH == 16 -void addWeightedLine(int16_t *in1, int16_t *in2, int16_t *out, - int32_t lineWidth, int16_t alpha, int16_t beta, - int16_t gamma) { - addweighted_aie(in1, in2, out, lineWidth, 1, alpha, - beta, gamma); -} - -void addWeightedTile(int16_t *in1, int16_t *in2, int16_t *out, - int32_t tileHeight, int32_t tileWidth, int16_t alpha, - int16_t beta, int16_t gamma) { - addweighted_aie(in1, in2, out, tileWidth, tileHeight, - alpha, beta, gamma); -} - -#else // 32 - -void addWeightedLine(int32_t *in1, int32_t *in2, int32_t *out, - int32_t lineWidth, int16_t alpha, int16_t beta, - int32_t gamma) { - addweighted_aie(in1, in2, out, lineWidth, 1, alpha, - beta, gamma); -} - -void addWeightedTile(int32_t *in1, int32_t *in2, int32_t *out, - int32_t tileHeight, int32_t tileWidth, int16_t alpha, - int16_t beta, int32_t gamma) { - addweighted_aie(in1, in2, out, tileWidth, tileHeight, - alpha, beta, gamma); -} - -#endif -} // extern "C" +//===- add_weighted.cc -------------------------------------------------*- C++ +//-*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2022, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// #define __AIENGINE__ 1 +#define NOCPP + +#include +#include +#include + +#define REL_WRITE 0 +#define REL_READ 1 + +// #include // NOTE: use of float2fix not +// supported in aie2 +#include + +const int32_t SRS_SHIFT = 14; + +template +void addweighted_aie_scalar(const T *in1, const T *in2, T *out, + const int32_t width, const int32_t height, + const int16_t alpha, const int16_t beta, + const T gamma) { + for (int i = 0; i < height; i++) + for (int j = 0; j < width; j++) { + int tmpIn1 = in1[i * width + j] * alpha; + int tmpIn2 = in2[i * width + j] * beta; + int tmp = + ((tmpIn1 + tmpIn2 + (1 << (SRS_SHIFT - 1))) >> SRS_SHIFT) + gamma; + tmp = (tmp > MAX) ? MAX : (tmp < 0) ? 0 : tmp; // saturate + out[i * width + j] = (T)tmp; + } +} + +template +void addweighted_aie(const T *src1, const T *src2, T *dst, const int32_t width, + const int32_t height, const int16_t alphaFixedPoint, + const int16_t betaFixedPoint, const T gamma) { + + ::aie::set_saturation( + aie::saturation_mode::saturate); // Needed to saturate properly to uint8 + + ::aie::vector coeff(alphaFixedPoint, betaFixedPoint); + ::aie::vector gamma_coeff; + ::aie::accum gamma_acc; + for (int i = 0; i < N; i++) { + gamma_coeff[i] = gamma; + } + gamma_acc.template from_vector(gamma_coeff, 0); + for (int j = 0; j < width * height; j += N) // 16 samples per loop + chess_prepare_for_pipelining chess_loop_range( + 14, ) // loop_range(14) - loop : 1 cycle + { + ::aie::vector data_buf1 = ::aie::load_v(src1); + src1 += N; + ::aie::vector data_buf2 = ::aie::load_v(src2); + src2 += N; + ::aie::accum acc = ::aie::accumulate( + gamma_acc, coeff, 0, data_buf1, + data_buf2); // weight[0] * data_buf1 + weight[1] * data_buf2 + ::aie::store_v(dst, acc.template to_vector(SRS_SHIFT)); + dst += N; + } +} + +extern "C" { + +#if BIT_WIDTH == 8 +void addWeightedLine(uint8_t *in1, uint8_t *in2, uint8_t *out, + int32_t lineWidth, int16_t alpha, int16_t beta, + uint8_t gamma) { + addweighted_aie(in1, in2, out, lineWidth, 1, alpha, + beta, gamma); +} + +void addWeightedTile(uint8_t *in1, uint8_t *in2, uint8_t *out, + int32_t tileHeight, int32_t tileWidth, int16_t alpha, + int16_t beta, uint8_t gamma) { + addweighted_aie(in1, in2, out, tileWidth, tileHeight, + alpha, beta, gamma); +} + +#elif BIT_WIDTH == 16 +void addWeightedLine(int16_t *in1, int16_t *in2, int16_t *out, + int32_t lineWidth, int16_t alpha, int16_t beta, + int16_t gamma) { + addweighted_aie(in1, in2, out, lineWidth, 1, alpha, + beta, gamma); +} + +void addWeightedTile(int16_t *in1, int16_t *in2, int16_t *out, + int32_t tileHeight, int32_t tileWidth, int16_t alpha, + int16_t beta, int16_t gamma) { + addweighted_aie(in1, in2, out, tileWidth, tileHeight, + alpha, beta, gamma); +} + +#else // 32 + +void addWeightedLine(int32_t *in1, int32_t *in2, int32_t *out, + int32_t lineWidth, int16_t alpha, int16_t beta, + int32_t gamma) { + addweighted_aie(in1, in2, out, lineWidth, 1, alpha, + beta, gamma); +} + +void addWeightedTile(int32_t *in1, int32_t *in2, int32_t *out, + int32_t tileHeight, int32_t tileWidth, int16_t alpha, + int16_t beta, int32_t gamma) { + addweighted_aie(in1, in2, out, tileWidth, tileHeight, + alpha, beta, gamma); +} + +#endif +} // extern "C" diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc index 577ce10659..e3cd871195 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc @@ -1,89 +1,89 @@ -//===- bitwisaAND.cc --------------------------------------------*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2023, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -// #define __AIENGINE__ 1 -#define NOCPP - -#include -#include -#include - -#define REL_WRITE 0 -#define REL_READ 1 - -#include - -template -void bitwiseAND_aie_scalar(const T *in1, const T *in2, T *out, - const int32_t width, const int32_t height) { - for (int i = 0; i < height; i++) - for (int j = 0; j < width; j++) - out[i * width + j] = in1[i * width + j] & in2[i * width + j]; -} - -template -void bitwiseAND_aie(const T *src1, const T *src2, T *dst, const int32_t width, - const int32_t height) { - - for (int j = 0; j < width * height; j += N) - chess_prepare_for_pipelining chess_loop_range( - 14, ) // loop_range(14) - loop : 1 cycle - { - ::aie::vector in1 = ::aie::load_v(src1); - src1 += N; - ::aie::vector in2 = ::aie::load_v(src2); - src2 += N; - ::aie::vector out; - - out = ::aie::bit_and(in1, in2); - - ::aie::store_v(dst, out); - dst += N; - } -} - -extern "C" { - -#if BIT_WIDTH == 8 -void bitwiseANDLine(uint8_t *in1, uint8_t *in2, uint8_t *out, - int32_t lineWidth) { - bitwiseAND_aie(in1, in2, out, lineWidth, 1); -} - -void bitwiseANDTile(uint8_t *in1, uint8_t *in2, uint8_t *out, - int32_t tileHeight, int32_t tileWidth) { - bitwiseAND_aie(in1, in2, out, tileWidth, tileHeight); -} - -#elif BIT_WIDTH == 16 -void bitwiseANDLine(int16_t *in1, int16_t *in2, int16_t *out, - int32_t lineWidth) { - bitwiseAND_aie(in1, in2, out, lineWidth, 1); -} - -void bitwiseANDTile(int16_t *in1, int16_t *in2, int16_t *out, - int32_t tileHeight, int32_t tileWidth) { - bitwiseAND_aie(in1, in2, out, tileWidth, tileHeight); -} - -#else // 32 - -void bitwiseANDLine(int32_t *in1, int32_t *in2, int32_t *out, - int32_t lineWidth) { - bitwiseAND_aie(in1, in2, out, lineWidth); -} - -void bitwiseANDTile(int32_t *in1, int32_t *in2, int32_t *out, - int32_t tileHeight, int32_t tileWidth) { - bitwiseAND_aie(in1, in2, out, tileWidth, tileHeight); -} - -#endif -} // extern "C" +//===- bitwisaAND.cc --------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// #define __AIENGINE__ 1 +#define NOCPP + +#include +#include +#include + +#define REL_WRITE 0 +#define REL_READ 1 + +#include + +template +void bitwiseAND_aie_scalar(const T *in1, const T *in2, T *out, + const int32_t width, const int32_t height) { + for (int i = 0; i < height; i++) + for (int j = 0; j < width; j++) + out[i * width + j] = in1[i * width + j] & in2[i * width + j]; +} + +template +void bitwiseAND_aie(const T *src1, const T *src2, T *dst, const int32_t width, + const int32_t height) { + + for (int j = 0; j < width * height; j += N) + chess_prepare_for_pipelining chess_loop_range( + 14, ) // loop_range(14) - loop : 1 cycle + { + ::aie::vector in1 = ::aie::load_v(src1); + src1 += N; + ::aie::vector in2 = ::aie::load_v(src2); + src2 += N; + ::aie::vector out; + + out = ::aie::bit_and(in1, in2); + + ::aie::store_v(dst, out); + dst += N; + } +} + +extern "C" { + +#if BIT_WIDTH == 8 +void bitwiseANDLine(uint8_t *in1, uint8_t *in2, uint8_t *out, + int32_t lineWidth) { + bitwiseAND_aie(in1, in2, out, lineWidth, 1); +} + +void bitwiseANDTile(uint8_t *in1, uint8_t *in2, uint8_t *out, + int32_t tileHeight, int32_t tileWidth) { + bitwiseAND_aie(in1, in2, out, tileWidth, tileHeight); +} + +#elif BIT_WIDTH == 16 +void bitwiseANDLine(int16_t *in1, int16_t *in2, int16_t *out, + int32_t lineWidth) { + bitwiseAND_aie(in1, in2, out, lineWidth, 1); +} + +void bitwiseANDTile(int16_t *in1, int16_t *in2, int16_t *out, + int32_t tileHeight, int32_t tileWidth) { + bitwiseAND_aie(in1, in2, out, tileWidth, tileHeight); +} + +#else // 32 + +void bitwiseANDLine(int32_t *in1, int32_t *in2, int32_t *out, + int32_t lineWidth) { + bitwiseAND_aie(in1, in2, out, lineWidth); +} + +void bitwiseANDTile(int32_t *in1, int32_t *in2, int32_t *out, + int32_t tileHeight, int32_t tileWidth) { + bitwiseAND_aie(in1, in2, out, tileWidth, tileHeight); +} + +#endif +} // extern "C" diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc index 520efbaf70..e02ec472f5 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc @@ -1,89 +1,89 @@ -//===- bitwiseOR.cc ---------------------------------------------*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2023, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -// #define __AIENGINE__ 1 -#define NOCPP - -#include -#include -#include - -#define REL_WRITE 0 -#define REL_READ 1 - -#include - -template -void bitwiseOR_aie_scalar(const T *in1, const T *in2, T *out, - const int32_t width, const int32_t height) { - for (int i = 0; i < height; i++) - for (int j = 0; j < width; j++) - out[i * width + j] = in1[i * width + j] | in2[i * width + j]; -} - -template -void bitwiseOR_aie(const T *src1, const T *src2, T *dst, const int32_t width, - const int32_t height) { - - for (int j = 0; j < width * height; j += N) - chess_prepare_for_pipelining chess_loop_range( - 14, ) // loop_range(14) - loop : 1 cycle - { - ::aie::vector in1 = ::aie::load_v(src1); - src1 += N; - ::aie::vector in2 = ::aie::load_v(src2); - src2 += N; - ::aie::vector out; - - out = ::aie::bit_or(in1, in2); - - ::aie::store_v(dst, out); - dst += N; - } -} - -extern "C" { - -#if BIT_WIDTH == 8 -void bitwiseORLine(uint8_t *in1, uint8_t *in2, uint8_t *out, - int32_t lineWidth) { - bitwiseOR_aie(in1, in2, out, lineWidth, 1); -} - -void bitwiseORTile(uint8_t *in1, uint8_t *in2, uint8_t *out, int32_t tileHeight, - int32_t tileWidth) { - bitwiseOR_aie(in1, in2, out, tileWidth, tileHeight); -} - -#elif BIT_WIDTH == 16 -void bitwiseORLine(int16_t *in1, int16_t *in2, int16_t *out, - int32_t lineWidth) { - bitwiseOR_aie(in1, in2, out, lineWidth, 1); -} - -void bitwiseORTile(int16_t *in1, int16_t *in2, int16_t *out, int32_t tileHeight, - int32_t tileWidth) { - bitwiseOR_aie(in1, in2, out, tileWidth, tileHeight); -} - -#else // 32 - -void bitwiseORLine(int32_t *in1, int32_t *in2, int32_t *out, - int32_t lineWidth) { - bitwiseOR_aie(in1, in2, out, lineWidth); -} - -void bitwiseORTile(int32_t *in1, int32_t *in2, int32_t *out, int32_t tileHeight, - int32_t tileWidth) { - bitwiseOR_aie(in1, in2, out, tileWidth, tileHeight); -} - -#endif -} // extern "C" +//===- bitwiseOR.cc ---------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// #define __AIENGINE__ 1 +#define NOCPP + +#include +#include +#include + +#define REL_WRITE 0 +#define REL_READ 1 + +#include + +template +void bitwiseOR_aie_scalar(const T *in1, const T *in2, T *out, + const int32_t width, const int32_t height) { + for (int i = 0; i < height; i++) + for (int j = 0; j < width; j++) + out[i * width + j] = in1[i * width + j] | in2[i * width + j]; +} + +template +void bitwiseOR_aie(const T *src1, const T *src2, T *dst, const int32_t width, + const int32_t height) { + + for (int j = 0; j < width * height; j += N) + chess_prepare_for_pipelining chess_loop_range( + 14, ) // loop_range(14) - loop : 1 cycle + { + ::aie::vector in1 = ::aie::load_v(src1); + src1 += N; + ::aie::vector in2 = ::aie::load_v(src2); + src2 += N; + ::aie::vector out; + + out = ::aie::bit_or(in1, in2); + + ::aie::store_v(dst, out); + dst += N; + } +} + +extern "C" { + +#if BIT_WIDTH == 8 +void bitwiseORLine(uint8_t *in1, uint8_t *in2, uint8_t *out, + int32_t lineWidth) { + bitwiseOR_aie(in1, in2, out, lineWidth, 1); +} + +void bitwiseORTile(uint8_t *in1, uint8_t *in2, uint8_t *out, int32_t tileHeight, + int32_t tileWidth) { + bitwiseOR_aie(in1, in2, out, tileWidth, tileHeight); +} + +#elif BIT_WIDTH == 16 +void bitwiseORLine(int16_t *in1, int16_t *in2, int16_t *out, + int32_t lineWidth) { + bitwiseOR_aie(in1, in2, out, lineWidth, 1); +} + +void bitwiseORTile(int16_t *in1, int16_t *in2, int16_t *out, int32_t tileHeight, + int32_t tileWidth) { + bitwiseOR_aie(in1, in2, out, tileWidth, tileHeight); +} + +#else // 32 + +void bitwiseORLine(int32_t *in1, int32_t *in2, int32_t *out, + int32_t lineWidth) { + bitwiseOR_aie(in1, in2, out, lineWidth); +} + +void bitwiseORTile(int32_t *in1, int32_t *in2, int32_t *out, int32_t tileHeight, + int32_t tileWidth) { + bitwiseOR_aie(in1, in2, out, tileWidth, tileHeight); +} + +#endif +} // extern "C" diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc index 1bb7302982..ccbbb87c6f 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc @@ -1,223 +1,223 @@ -//===- filter2d.cc ----------------------------------------------*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2022, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -// #define __AIENGINE__ 1 -#define NOCPP - -#include -#include -#include - -#define REL_WRITE 0 -#define REL_READ 1 - -#define THRESH_TYPE XF_THRESHOLD_TYPE_BINARY - -#include - -const int32_t SRS_SHIFT = 12; - -void filter2d_3lines_aie_scalar(uint8_t *lineIn0, uint8_t *lineIn1, - uint8_t *lineIn2, uint8_t *output, - const int32_t width, int16_t *kernel) { - - int32_t acc; - - // left of line, border extension by mirroring - acc = 0; - acc += ((int32_t)lineIn0[0]) * kernel[0 * 3 + 0]; - acc += ((int32_t)lineIn1[0]) * kernel[1 * 3 + 0]; - acc += ((int32_t)lineIn2[0]) * kernel[2 * 3 + 0]; - - for (int ki = 1; ki < 3; ki++) { - acc += ((int32_t)lineIn0[0 + ki - 1]) * kernel[0 * 3 + ki]; - acc += ((int32_t)lineIn1[0 + ki - 1]) * kernel[1 * 3 + ki]; - acc += ((int32_t)lineIn2[0 + ki - 1]) * kernel[2 * 3 + ki]; - } - acc = ((acc + (1 << (SRS_SHIFT - 1))) >> SRS_SHIFT); - acc = (acc > UINT8_MAX) ? UINT8_MAX : (acc < 0) ? 0 : acc; // saturate - output[0] = (uint8_t)acc; - - // middle of line, no border extension needed - for (int i = 1; i < width - 1; i++) { - acc = 0; - for (int ki = 0; ki < 3; ki++) { - acc += ((int32_t)lineIn0[i + ki - 1]) * kernel[0 * 3 + ki]; - acc += ((int32_t)lineIn1[i + ki - 1]) * kernel[1 * 3 + ki]; - acc += ((int32_t)lineIn2[i + ki - 1]) * kernel[2 * 3 + ki]; - } - acc = ((acc + (1 << (SRS_SHIFT - 1))) >> SRS_SHIFT); - acc = (acc > UINT8_MAX) ? UINT8_MAX : (acc < 0) ? 0 : acc; // saturate - output[i] = (uint8_t)acc; - } - - // right of line, border extension by mirroring - acc = 0; - for (int ki = 0; ki < 2; ki++) { - acc += ((int32_t)lineIn0[width + ki - 2]) * kernel[0 * 3 + ki]; - acc += ((int32_t)lineIn1[width + ki - 2]) * kernel[1 * 3 + ki]; - acc += ((int32_t)lineIn2[width + ki - 2]) * kernel[2 * 3 + ki]; - } - - acc += ((int32_t)lineIn0[width - 1]) * kernel[0 * 3 + 2]; - acc += ((int32_t)lineIn1[width - 1]) * kernel[1 * 3 + 2]; - acc += ((int32_t)lineIn2[width - 1]) * kernel[2 * 3 + 2]; - acc = ((acc + (1 << (SRS_SHIFT - 1))) >> SRS_SHIFT); - acc = (acc > UINT8_MAX) ? UINT8_MAX : (acc < 0) ? 0 : acc; // saturate - output[width - 1] = (uint8_t)acc; -} - -#define KERNEL_WIDTH 3 - -constexpr unsigned VecFactor = 32; - -constexpr unsigned Lanes = 32; // Parallel vector output lanes -constexpr unsigned Points = 8; // Columns where data in summed togther -constexpr unsigned CoeffStep = 1; -constexpr unsigned DataStepXY = 1; - -using mul_ops = - aie::sliding_mul_xy_ops; - -void filter2d_3lines_aie(uint8_t *lineIn0, uint8_t *lineIn1, uint8_t *lineIn2, - uint8_t *output, const int32_t width, - int16_t *kernel) { - - set_sat(); // Needed for int16 to saturate properly to uint8 - - aie::vector data_buf1, data_buf2, data_buf3; - aie::vector prev_buf1, prev_buf2, prev_buf3; - aie::vector zero_buf = ::aie::zeros(); - aie::vector kernel_vec; - - const uint32_t kernel_side = KERNEL_WIDTH / 2; - - for (int j = 0; j < KERNEL_WIDTH; j++) { - for (int i = 0; i < KERNEL_WIDTH; i++) { - kernel_vec[j * Points + i] = - (int8_t)((*kernel) >> 8); // int16 to int8 shift - kernel++; - } - for (int i2 = 0; i2 < Points - KERNEL_WIDTH; i2++) { - kernel_vec[j * Points + KERNEL_WIDTH + i2] = 0; - } - } - - // left of line, border extension by mirroring - // first kernel row - data_buf1.insert(0, aie::load_v<32>(lineIn0)); - lineIn0 += VecFactor; - data_buf1.insert(1, aie::load_v<32>(lineIn0)); - prev_buf1.insert(1, data_buf1.template extract<32>(0)); - data_buf1 = ::aie::shuffle_up_replicate(data_buf1, kernel_side); - auto acc = mul_ops::mul(kernel_vec, 0, data_buf1, 0); - - // second kernel row - data_buf2.insert(0, aie::load_v<32>(lineIn1)); - lineIn1 += VecFactor; - data_buf2.insert(1, aie::load_v<32>(lineIn1)); - prev_buf2.insert(1, data_buf2.template extract<32>(0)); - data_buf2 = ::aie::shuffle_up_replicate(data_buf2, kernel_side); - acc = mul_ops::mac(acc, kernel_vec, Points, data_buf2, 0); - - // third kernel row - data_buf3.insert(0, aie::load_v<32>(lineIn2)); - lineIn2 += VecFactor; - data_buf3.insert(1, aie::load_v<32>(lineIn2)); - prev_buf3.insert(1, data_buf3.template extract<32>(0)); - data_buf3 = ::aie::shuffle_up_replicate(data_buf3, kernel_side); - acc = mul_ops::mac(acc, kernel_vec, 2 * Points, data_buf3, 0); - - // Store result - ::aie::store_v(output, acc.to_vector(SRS_SHIFT - 8)); - output += VecFactor; - - // middle of line, no border extension needed - for (int i = 2 * VecFactor; i < width - 1; i += VecFactor) { - // first kernel row - data_buf1.insert(0, aie::load_v<32>(lineIn0)); - lineIn0 += VecFactor; - data_buf1.insert(1, aie::load_v<32>(lineIn0)); - data_buf1 = ::aie::shuffle_up_fill(data_buf1, prev_buf1, kernel_side); - prev_buf1.insert(1, data_buf1.template extract<32>(0)); - acc = mul_ops::mul(kernel_vec, 0, data_buf1, 0); - - // second kernel row - data_buf2.insert(0, aie::load_v<32>(lineIn1)); - lineIn1 += VecFactor; - data_buf2.insert(1, aie::load_v<32>(lineIn1)); - data_buf2 = ::aie::shuffle_up_fill(data_buf2, prev_buf2, kernel_side); - prev_buf2.insert(1, data_buf2.template extract<32>(0)); - acc = mul_ops::mac(acc, kernel_vec, Points, data_buf2, 0); - - // third kernel row - data_buf3.insert(0, aie::load_v<32>(lineIn2)); - lineIn2 += VecFactor; - data_buf3.insert(1, aie::load_v<32>(lineIn2)); - data_buf3 = ::aie::shuffle_up_fill(data_buf3, prev_buf3, kernel_side); - prev_buf3.insert(1, data_buf3.template extract<32>(0)); - acc = mul_ops::mac(acc, kernel_vec, 2 * Points, data_buf3, 0); - - // Store result - ::aie::store_v(output, acc.to_vector(SRS_SHIFT - 8)); - output += VecFactor; - } - - // right of line, border extension by mirroring - // first kernel row - data_buf1.insert(1, aie::load_v<32>(lineIn0)); - data_buf1 = ::aie::shuffle_down_replicate(data_buf1, 32); - data_buf1 = ::aie::shuffle_up_fill(data_buf1, prev_buf1, kernel_side); - acc = mul_ops::mul(kernel_vec, 0, data_buf1, 0); - - // second kernel row - data_buf2.insert(1, aie::load_v<32>(lineIn1)); - data_buf2 = ::aie::shuffle_down_replicate(data_buf2, 32); - data_buf2 = ::aie::shuffle_up_fill(data_buf2, prev_buf2, kernel_side); - acc = mul_ops::mac(acc, kernel_vec, Points, data_buf2, 0); - - // third kernel row - data_buf3.insert(1, aie::load_v<32>(lineIn2)); - lineIn2 += VecFactor; - data_buf3 = ::aie::shuffle_down_replicate(data_buf3, 32); - data_buf3 = ::aie::shuffle_up_fill(data_buf3, prev_buf3, kernel_side); - acc = mul_ops::mac(acc, kernel_vec, 2 * Points, data_buf3, 0); - - // Store result - ::aie::store_v(output, acc.to_vector(SRS_SHIFT - 8)); - output += VecFactor; -} - -extern "C" { - -// #if BIT_WIDTH == 8 - -void filter2dLine(uint8_t *lineIn0, uint8_t *lineIn1, uint8_t *lineIn2, - uint8_t *out, int32_t lineWidth, int16_t *filterKernel) { - filter2d_3lines_aie(lineIn0, lineIn1, lineIn2, out, lineWidth, filterKernel); -} - -/* #elif BIT_WIDTH == 16 - -void filter2dLine(int16_t *in, int16_t *out, int32_t lineWidth, int16_t -filter2dValue, int16_t maxValue) { filter2d_3lines_aie(in, out, -lineWidth, 1, filter2dValue, maxValue); -} - -#else // 32 - -void filter2dLine(int32_t *in, int32_t *out, int32_t lineWidth, int32_t -filter2dValue, int32_t maxValue) { filter2d_3lines_aie(in, out, -lineWidth, 1, filter2dValue, maxValue); -} - -#endif */ - -} // extern "C" +//===- filter2d.cc ----------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2022, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// #define __AIENGINE__ 1 +#define NOCPP + +#include +#include +#include + +#define REL_WRITE 0 +#define REL_READ 1 + +#define THRESH_TYPE XF_THRESHOLD_TYPE_BINARY + +#include + +const int32_t SRS_SHIFT = 12; + +void filter2d_3lines_aie_scalar(uint8_t *lineIn0, uint8_t *lineIn1, + uint8_t *lineIn2, uint8_t *output, + const int32_t width, int16_t *kernel) { + + int32_t acc; + + // left of line, border extension by mirroring + acc = 0; + acc += ((int32_t)lineIn0[0]) * kernel[0 * 3 + 0]; + acc += ((int32_t)lineIn1[0]) * kernel[1 * 3 + 0]; + acc += ((int32_t)lineIn2[0]) * kernel[2 * 3 + 0]; + + for (int ki = 1; ki < 3; ki++) { + acc += ((int32_t)lineIn0[0 + ki - 1]) * kernel[0 * 3 + ki]; + acc += ((int32_t)lineIn1[0 + ki - 1]) * kernel[1 * 3 + ki]; + acc += ((int32_t)lineIn2[0 + ki - 1]) * kernel[2 * 3 + ki]; + } + acc = ((acc + (1 << (SRS_SHIFT - 1))) >> SRS_SHIFT); + acc = (acc > UINT8_MAX) ? UINT8_MAX : (acc < 0) ? 0 : acc; // saturate + output[0] = (uint8_t)acc; + + // middle of line, no border extension needed + for (int i = 1; i < width - 1; i++) { + acc = 0; + for (int ki = 0; ki < 3; ki++) { + acc += ((int32_t)lineIn0[i + ki - 1]) * kernel[0 * 3 + ki]; + acc += ((int32_t)lineIn1[i + ki - 1]) * kernel[1 * 3 + ki]; + acc += ((int32_t)lineIn2[i + ki - 1]) * kernel[2 * 3 + ki]; + } + acc = ((acc + (1 << (SRS_SHIFT - 1))) >> SRS_SHIFT); + acc = (acc > UINT8_MAX) ? UINT8_MAX : (acc < 0) ? 0 : acc; // saturate + output[i] = (uint8_t)acc; + } + + // right of line, border extension by mirroring + acc = 0; + for (int ki = 0; ki < 2; ki++) { + acc += ((int32_t)lineIn0[width + ki - 2]) * kernel[0 * 3 + ki]; + acc += ((int32_t)lineIn1[width + ki - 2]) * kernel[1 * 3 + ki]; + acc += ((int32_t)lineIn2[width + ki - 2]) * kernel[2 * 3 + ki]; + } + + acc += ((int32_t)lineIn0[width - 1]) * kernel[0 * 3 + 2]; + acc += ((int32_t)lineIn1[width - 1]) * kernel[1 * 3 + 2]; + acc += ((int32_t)lineIn2[width - 1]) * kernel[2 * 3 + 2]; + acc = ((acc + (1 << (SRS_SHIFT - 1))) >> SRS_SHIFT); + acc = (acc > UINT8_MAX) ? UINT8_MAX : (acc < 0) ? 0 : acc; // saturate + output[width - 1] = (uint8_t)acc; +} + +#define KERNEL_WIDTH 3 + +constexpr unsigned VecFactor = 32; + +constexpr unsigned Lanes = 32; // Parallel vector output lanes +constexpr unsigned Points = 8; // Columns where data in summed togther +constexpr unsigned CoeffStep = 1; +constexpr unsigned DataStepXY = 1; + +using mul_ops = + aie::sliding_mul_xy_ops; + +void filter2d_3lines_aie(uint8_t *lineIn0, uint8_t *lineIn1, uint8_t *lineIn2, + uint8_t *output, const int32_t width, + int16_t *kernel) { + + set_sat(); // Needed for int16 to saturate properly to uint8 + + aie::vector data_buf1, data_buf2, data_buf3; + aie::vector prev_buf1, prev_buf2, prev_buf3; + aie::vector zero_buf = ::aie::zeros(); + aie::vector kernel_vec; + + const uint32_t kernel_side = KERNEL_WIDTH / 2; + + for (int j = 0; j < KERNEL_WIDTH; j++) { + for (int i = 0; i < KERNEL_WIDTH; i++) { + kernel_vec[j * Points + i] = + (int8_t)((*kernel) >> 8); // int16 to int8 shift + kernel++; + } + for (int i2 = 0; i2 < Points - KERNEL_WIDTH; i2++) { + kernel_vec[j * Points + KERNEL_WIDTH + i2] = 0; + } + } + + // left of line, border extension by mirroring + // first kernel row + data_buf1.insert(0, aie::load_v<32>(lineIn0)); + lineIn0 += VecFactor; + data_buf1.insert(1, aie::load_v<32>(lineIn0)); + prev_buf1.insert(1, data_buf1.template extract<32>(0)); + data_buf1 = ::aie::shuffle_up_replicate(data_buf1, kernel_side); + auto acc = mul_ops::mul(kernel_vec, 0, data_buf1, 0); + + // second kernel row + data_buf2.insert(0, aie::load_v<32>(lineIn1)); + lineIn1 += VecFactor; + data_buf2.insert(1, aie::load_v<32>(lineIn1)); + prev_buf2.insert(1, data_buf2.template extract<32>(0)); + data_buf2 = ::aie::shuffle_up_replicate(data_buf2, kernel_side); + acc = mul_ops::mac(acc, kernel_vec, Points, data_buf2, 0); + + // third kernel row + data_buf3.insert(0, aie::load_v<32>(lineIn2)); + lineIn2 += VecFactor; + data_buf3.insert(1, aie::load_v<32>(lineIn2)); + prev_buf3.insert(1, data_buf3.template extract<32>(0)); + data_buf3 = ::aie::shuffle_up_replicate(data_buf3, kernel_side); + acc = mul_ops::mac(acc, kernel_vec, 2 * Points, data_buf3, 0); + + // Store result + ::aie::store_v(output, acc.to_vector(SRS_SHIFT - 8)); + output += VecFactor; + + // middle of line, no border extension needed + for (int i = 2 * VecFactor; i < width - 1; i += VecFactor) { + // first kernel row + data_buf1.insert(0, aie::load_v<32>(lineIn0)); + lineIn0 += VecFactor; + data_buf1.insert(1, aie::load_v<32>(lineIn0)); + data_buf1 = ::aie::shuffle_up_fill(data_buf1, prev_buf1, kernel_side); + prev_buf1.insert(1, data_buf1.template extract<32>(0)); + acc = mul_ops::mul(kernel_vec, 0, data_buf1, 0); + + // second kernel row + data_buf2.insert(0, aie::load_v<32>(lineIn1)); + lineIn1 += VecFactor; + data_buf2.insert(1, aie::load_v<32>(lineIn1)); + data_buf2 = ::aie::shuffle_up_fill(data_buf2, prev_buf2, kernel_side); + prev_buf2.insert(1, data_buf2.template extract<32>(0)); + acc = mul_ops::mac(acc, kernel_vec, Points, data_buf2, 0); + + // third kernel row + data_buf3.insert(0, aie::load_v<32>(lineIn2)); + lineIn2 += VecFactor; + data_buf3.insert(1, aie::load_v<32>(lineIn2)); + data_buf3 = ::aie::shuffle_up_fill(data_buf3, prev_buf3, kernel_side); + prev_buf3.insert(1, data_buf3.template extract<32>(0)); + acc = mul_ops::mac(acc, kernel_vec, 2 * Points, data_buf3, 0); + + // Store result + ::aie::store_v(output, acc.to_vector(SRS_SHIFT - 8)); + output += VecFactor; + } + + // right of line, border extension by mirroring + // first kernel row + data_buf1.insert(1, aie::load_v<32>(lineIn0)); + data_buf1 = ::aie::shuffle_down_replicate(data_buf1, 32); + data_buf1 = ::aie::shuffle_up_fill(data_buf1, prev_buf1, kernel_side); + acc = mul_ops::mul(kernel_vec, 0, data_buf1, 0); + + // second kernel row + data_buf2.insert(1, aie::load_v<32>(lineIn1)); + data_buf2 = ::aie::shuffle_down_replicate(data_buf2, 32); + data_buf2 = ::aie::shuffle_up_fill(data_buf2, prev_buf2, kernel_side); + acc = mul_ops::mac(acc, kernel_vec, Points, data_buf2, 0); + + // third kernel row + data_buf3.insert(1, aie::load_v<32>(lineIn2)); + lineIn2 += VecFactor; + data_buf3 = ::aie::shuffle_down_replicate(data_buf3, 32); + data_buf3 = ::aie::shuffle_up_fill(data_buf3, prev_buf3, kernel_side); + acc = mul_ops::mac(acc, kernel_vec, 2 * Points, data_buf3, 0); + + // Store result + ::aie::store_v(output, acc.to_vector(SRS_SHIFT - 8)); + output += VecFactor; +} + +extern "C" { + +// #if BIT_WIDTH == 8 + +void filter2dLine(uint8_t *lineIn0, uint8_t *lineIn1, uint8_t *lineIn2, + uint8_t *out, int32_t lineWidth, int16_t *filterKernel) { + filter2d_3lines_aie(lineIn0, lineIn1, lineIn2, out, lineWidth, filterKernel); +} + +/* #elif BIT_WIDTH == 16 + +void filter2dLine(int16_t *in, int16_t *out, int32_t lineWidth, int16_t +filter2dValue, int16_t maxValue) { filter2d_3lines_aie(in, out, +lineWidth, 1, filter2dValue, maxValue); +} + +#else // 32 + +void filter2dLine(int32_t *in, int32_t *out, int32_t lineWidth, int32_t +filter2dValue, int32_t maxValue) { filter2d_3lines_aie(in, out, +lineWidth, 1, filter2dValue, maxValue); +} + +#endif */ + +} // extern "C" diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc index d0c7a9e3b1..351b3331b6 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc @@ -1,82 +1,82 @@ -//===- gray2rgba.cc -------------------------------------------*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2022, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -#define NOCPP - -#include -#include -#include - -#define REL_WRITE 0 -#define REL_READ 1 - -#include - -::aie::vector vector_broadcast(::aie::vector e) { - v64uint8 lli = e.template grow<64>(); - lli = shuffle(lli, lli, T8_2x64_lo); - lli = shuffle(lli, lli, T8_2x64_lo); - return ::aie::vector(lli); -} - -void gray2rgba_aie(uint8_t *y_in, uint8_t *rgba_out, const int32_t height, - const int32_t width) { - // Initialize alpha vector - ::aie::vector alpha255 = ::aie::zeros(); - for (int i = 0; i < 16; i++) { - alpha255[i * 4 + 3] = 255; - } - - for (int i = 0; i < height; i++) - for (int j = 0; j < width; j += 16) { - ::aie::vector data_buf = ::aie::load_v<16>(y_in); - y_in += 16; - - // vector shuffle - ::aie::vector out = vector_broadcast(data_buf); - - // bitwise OR with alpha value - v64uint8 fout = bor(out, alpha255); - - ::aie::store_v(rgba_out, ::aie::vector(fout)); - rgba_out += 64; - } - - return; - ; -} - -void gray2rgba_aie_scalar(uint8_t *y_in, uint8_t *rgba_out, - const int32_t height, const int32_t width) { - for (int i = 0; i < height; i++) - for (int j = 0; j < width; j++) { - uint8_t value = y_in[i * width + j]; - rgba_out[i * width * 4 + j * 4] = value; - rgba_out[i * width * 4 + j * 4 + 1] = value; - rgba_out[i * width * 4 + j * 4 + 2] = value; - rgba_out[i * width * 4 + j * 4 + 3] = 255; - } - - return; - ; -} - -extern "C" { - -void gray2rgbaLine(uint8_t *in, uint8_t *out, int32_t lineWidth) { - gray2rgba_aie(in, out, 1, lineWidth); -} - -void gray2rgbaTile(uint8_t *in, uint8_t *out, int32_t tileHeight, - int32_t tileWidth) { - gray2rgba_aie(in, out, tileHeight, tileWidth); -} - -} // extern "C" +//===- gray2rgba.cc -------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2022, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#define NOCPP + +#include +#include +#include + +#define REL_WRITE 0 +#define REL_READ 1 + +#include + +::aie::vector vector_broadcast(::aie::vector e) { + v64uint8 lli = e.template grow<64>(); + lli = shuffle(lli, lli, T8_2x64_lo); + lli = shuffle(lli, lli, T8_2x64_lo); + return ::aie::vector(lli); +} + +void gray2rgba_aie(uint8_t *y_in, uint8_t *rgba_out, const int32_t height, + const int32_t width) { + // Initialize alpha vector + ::aie::vector alpha255 = ::aie::zeros(); + for (int i = 0; i < 16; i++) { + alpha255[i * 4 + 3] = 255; + } + + for (int i = 0; i < height; i++) + for (int j = 0; j < width; j += 16) { + ::aie::vector data_buf = ::aie::load_v<16>(y_in); + y_in += 16; + + // vector shuffle + ::aie::vector out = vector_broadcast(data_buf); + + // bitwise OR with alpha value + v64uint8 fout = bor(out, alpha255); + + ::aie::store_v(rgba_out, ::aie::vector(fout)); + rgba_out += 64; + } + + return; + ; +} + +void gray2rgba_aie_scalar(uint8_t *y_in, uint8_t *rgba_out, + const int32_t height, const int32_t width) { + for (int i = 0; i < height; i++) + for (int j = 0; j < width; j++) { + uint8_t value = y_in[i * width + j]; + rgba_out[i * width * 4 + j * 4] = value; + rgba_out[i * width * 4 + j * 4 + 1] = value; + rgba_out[i * width * 4 + j * 4 + 2] = value; + rgba_out[i * width * 4 + j * 4 + 3] = 255; + } + + return; + ; +} + +extern "C" { + +void gray2rgbaLine(uint8_t *in, uint8_t *out, int32_t lineWidth) { + gray2rgba_aie(in, out, 1, lineWidth); +} + +void gray2rgbaTile(uint8_t *in, uint8_t *out, int32_t tileHeight, + int32_t tileWidth) { + gray2rgba_aie(in, out, tileHeight, tileWidth); +} + +} // extern "C" diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc index 0928af33f0..4725d09ba6 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc @@ -1,87 +1,87 @@ -//===- passThrough.cc -------------------------------------------*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2022, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -// #define __AIENGINE__ 1 -#define NOCPP - -#include -#include -#include - -#define REL_WRITE 0 -#define REL_READ 1 - -#include - -template -__attribute__((noinline)) void passThrough_aie(T *restrict in, T *restrict out, - const int32_t height, - const int32_t width) { - //::aie::vector data_out; - //::aie::mask temp_val; - v64uint8 *restrict outPtr = (v64uint8 *)out; - v64uint8 *restrict inPtr = (v64uint8 *)in; - - for (int j = 0; j < (height * width); j += N) // Nx samples per loop - chess_prepare_for_pipelining chess_loop_range(6, ) { - //::aie::vector tmpVector = ::aie::load_v(in); - //::aie::store_v(out, tmpVector); - - *outPtr++ = *inPtr++; - - // in += N; - // out += N; - } -} - -extern "C" { - -#if BIT_WIDTH == 8 - -void passThroughLine(uint8_t *in, uint8_t *out, int32_t lineWidth) { - printf("passThroughLine BIT_WIDTH\n"); - passThrough_aie(in, out, 1, lineWidth); -} - -void passThroughTile(uint8_t *in, uint8_t *out, int32_t tileHeight, - int32_t tileWidth) { - printf("passThroughTile BIT_WIDTH\n"); - passThrough_aie(in, out, tileHeight, tileWidth); -} - -#elif BIT_WIDTH == 16 - -void passThroughLine(int16_t *in, int16_t *out, int32_t lineWidth) { - printf("passThroughLine BIT_WIDTH\n"); - passThrough_aie(in, out, 1, lineWidth); -} - -void passThroughTile(int16_t *in, int16_t *out, int32_t tileHeight, - int32_t tileWidth) { - printf("passThroughTile BIT_WIDTH\n"); - passThrough_aie(in, out, tileHeight, tileWidth); -} - -#else // 32 - -void passThroughLine(int32_t *in, int32_t *out, int32_t lineWidth) { - printf("passThroughLine BIT_WIDTH\n"); - passThrough_aie(in, out, 1, lineWidth); -} - -void passThroughTile(int32_t *in, int32_t *out, int32_t tileHeight, - int32_t tileWidth) { - printf("passThroughTile BIT_WIDTH\n"); - passThrough_aie(in, out, tileHeight, tileWidth); -} - -#endif - -} // extern "C" +//===- passThrough.cc -------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2022, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// #define __AIENGINE__ 1 +#define NOCPP + +#include +#include +#include + +#define REL_WRITE 0 +#define REL_READ 1 + +#include + +template +__attribute__((noinline)) void passThrough_aie(T *restrict in, T *restrict out, + const int32_t height, + const int32_t width) { + //::aie::vector data_out; + //::aie::mask temp_val; + v64uint8 *restrict outPtr = (v64uint8 *)out; + v64uint8 *restrict inPtr = (v64uint8 *)in; + + for (int j = 0; j < (height * width); j += N) // Nx samples per loop + chess_prepare_for_pipelining chess_loop_range(6, ) { + //::aie::vector tmpVector = ::aie::load_v(in); + //::aie::store_v(out, tmpVector); + + *outPtr++ = *inPtr++; + + // in += N; + // out += N; + } +} + +extern "C" { + +#if BIT_WIDTH == 8 + +void passThroughLine(uint8_t *in, uint8_t *out, int32_t lineWidth) { + printf("passThroughLine BIT_WIDTH\n"); + passThrough_aie(in, out, 1, lineWidth); +} + +void passThroughTile(uint8_t *in, uint8_t *out, int32_t tileHeight, + int32_t tileWidth) { + printf("passThroughTile BIT_WIDTH\n"); + passThrough_aie(in, out, tileHeight, tileWidth); +} + +#elif BIT_WIDTH == 16 + +void passThroughLine(int16_t *in, int16_t *out, int32_t lineWidth) { + printf("passThroughLine BIT_WIDTH\n"); + passThrough_aie(in, out, 1, lineWidth); +} + +void passThroughTile(int16_t *in, int16_t *out, int32_t tileHeight, + int32_t tileWidth) { + printf("passThroughTile BIT_WIDTH\n"); + passThrough_aie(in, out, tileHeight, tileWidth); +} + +#else // 32 + +void passThroughLine(int32_t *in, int32_t *out, int32_t lineWidth) { + printf("passThroughLine BIT_WIDTH\n"); + passThrough_aie(in, out, 1, lineWidth); +} + +void passThroughTile(int32_t *in, int32_t *out, int32_t tileHeight, + int32_t tileWidth) { + printf("passThroughTile BIT_WIDTH\n"); + passThrough_aie(in, out, tileHeight, tileWidth); +} + +#endif + +} // extern "C" diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc index 175e4f4a33..988c4a0e4c 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc @@ -1,107 +1,107 @@ -//===- rgba2gray.cc -------------------------------------------*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2022, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -#define NOCPP - -#include -#include -#include - -#define REL_WRITE 0 -#define REL_READ 1 - -#include - -const int32_t SRS_SHIFT = 15; -__attribute__((inline)) void xf_extract_rgb(uint8_t *ptr_rgba, - ::aie::vector &r, - ::aie::vector &g, - ::aie::vector &b) { - ::aie::vector rgba_channel0, rgba_channel1, rgba_channel3, - rgba_channel2; - rgba_channel0 = ::aie::load_v<32>(ptr_rgba); - ptr_rgba += 32; - rgba_channel1 = ::aie::load_v<32>(ptr_rgba); - ptr_rgba += 32; - rgba_channel2 = ::aie::load_v<32>(ptr_rgba); - ptr_rgba += 32; - rgba_channel3 = ::aie::load_v<32>(ptr_rgba); - ptr_rgba += 32; - - // Unzip the interleaved channels - auto [rg_temp, ba_temp] = - ::aie::interleave_unzip(::aie::concat(rgba_channel0, rgba_channel1), - ::aie::concat(rgba_channel2, rgba_channel3), 2); - r = ::aie::filter_even(rg_temp, 1); - g = ::aie::filter_odd(rg_temp, 1); - b = ::aie::filter_even(ba_temp, 1); -} - -__attribute__((noinline)) void rgba2gray_aie(uint8_t *rgba_in, uint8_t *y_out, - const int32_t height, - const int32_t width) { - //::aie::vector WT(66, 129, 25, 128); //Y=0.299*R + 0.587*G + - //: 0.114*B (BT.470) :aie::vector WT(25, 129, 66, 128); - //://Y=0.299*R + 0.587*G + 0.114*B (BT.470) - ::aie::vector WT( - (int16_t)round(0.299 * (1 << SRS_SHIFT)), - (int16_t)round(0.587 * (1 << SRS_SHIFT)), - (int16_t)round(0.114 * (1 << SRS_SHIFT)), - (1 << (SRS_SHIFT - 1))); // Y=0.299*R + 0.587*G + 0.114*B (BT.470) - ::aie::vector c1 = ::aie::broadcast(1); - ::aie::vector r, g, b; - ::aie::vector y; - - for (int j = 0; (j < (width * height) / 32); j += 1) - chess_prepare_for_pipelining { - xf_extract_rgb(rgba_in, r, g, b); - - ::aie::accum acc; - acc = ::aie::accumulate<32>(WT, 0, r, g, b, c1); - y = acc.template to_vector(SRS_SHIFT); - - ::aie::store_v(y_out, y); - rgba_in += 128; - y_out += 32; - } -} - -void rgba2gray_aie_scalar(uint8_t *rgba_in, uint8_t *y_out, - const int32_t height, const int32_t width) { - /// Y=0.299*R + 0.587*G + 0.114*B (BT.470) - const int colorMatrix[4] = {(int)round(0.299 * 65536), - (int)round(0.587 * 65536), - (int)round(0.114 * 65536), (65536 / 2)}; - for (int i = 0; i < height; i++) - for (int j = 0; j < width; j++) { - int r = (int)rgba_in[i * width * 4 + j * 4]; - int g = (int)rgba_in[i * width * 4 + j * 4 + 1]; - int b = (int)rgba_in[i * width * 4 + j * 4 + 2]; - int tmpSum = (colorMatrix[0] * r + colorMatrix[1] * g + - colorMatrix[2] * b + colorMatrix[3]) >> - 16; - y_out[i * width + j] = (uint8_t)tmpSum; - } - - return; -} - -extern "C" { - -void rgba2grayLine(uint8_t *in, uint8_t *out, int32_t lineWidth) { - rgba2gray_aie(in, out, 1, lineWidth); -} - -void rgba2grayTile(uint8_t *in, uint8_t *out, int32_t tileHeight, - int32_t tileWidth) { - rgba2gray_aie(in, out, tileHeight, tileWidth); -} - -} // extern "C" +//===- rgba2gray.cc -------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2022, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#define NOCPP + +#include +#include +#include + +#define REL_WRITE 0 +#define REL_READ 1 + +#include + +const int32_t SRS_SHIFT = 15; +__attribute__((inline)) void xf_extract_rgb(uint8_t *ptr_rgba, + ::aie::vector &r, + ::aie::vector &g, + ::aie::vector &b) { + ::aie::vector rgba_channel0, rgba_channel1, rgba_channel3, + rgba_channel2; + rgba_channel0 = ::aie::load_v<32>(ptr_rgba); + ptr_rgba += 32; + rgba_channel1 = ::aie::load_v<32>(ptr_rgba); + ptr_rgba += 32; + rgba_channel2 = ::aie::load_v<32>(ptr_rgba); + ptr_rgba += 32; + rgba_channel3 = ::aie::load_v<32>(ptr_rgba); + ptr_rgba += 32; + + // Unzip the interleaved channels + auto [rg_temp, ba_temp] = + ::aie::interleave_unzip(::aie::concat(rgba_channel0, rgba_channel1), + ::aie::concat(rgba_channel2, rgba_channel3), 2); + r = ::aie::filter_even(rg_temp, 1); + g = ::aie::filter_odd(rg_temp, 1); + b = ::aie::filter_even(ba_temp, 1); +} + +__attribute__((noinline)) void rgba2gray_aie(uint8_t *rgba_in, uint8_t *y_out, + const int32_t height, + const int32_t width) { + //::aie::vector WT(66, 129, 25, 128); //Y=0.299*R + 0.587*G + + //: 0.114*B (BT.470) :aie::vector WT(25, 129, 66, 128); + //://Y=0.299*R + 0.587*G + 0.114*B (BT.470) + ::aie::vector WT( + (int16_t)round(0.299 * (1 << SRS_SHIFT)), + (int16_t)round(0.587 * (1 << SRS_SHIFT)), + (int16_t)round(0.114 * (1 << SRS_SHIFT)), + (1 << (SRS_SHIFT - 1))); // Y=0.299*R + 0.587*G + 0.114*B (BT.470) + ::aie::vector c1 = ::aie::broadcast(1); + ::aie::vector r, g, b; + ::aie::vector y; + + for (int j = 0; (j < (width * height) / 32); j += 1) + chess_prepare_for_pipelining { + xf_extract_rgb(rgba_in, r, g, b); + + ::aie::accum acc; + acc = ::aie::accumulate<32>(WT, 0, r, g, b, c1); + y = acc.template to_vector(SRS_SHIFT); + + ::aie::store_v(y_out, y); + rgba_in += 128; + y_out += 32; + } +} + +void rgba2gray_aie_scalar(uint8_t *rgba_in, uint8_t *y_out, + const int32_t height, const int32_t width) { + /// Y=0.299*R + 0.587*G + 0.114*B (BT.470) + const int colorMatrix[4] = {(int)round(0.299 * 65536), + (int)round(0.587 * 65536), + (int)round(0.114 * 65536), (65536 / 2)}; + for (int i = 0; i < height; i++) + for (int j = 0; j < width; j++) { + int r = (int)rgba_in[i * width * 4 + j * 4]; + int g = (int)rgba_in[i * width * 4 + j * 4 + 1]; + int b = (int)rgba_in[i * width * 4 + j * 4 + 2]; + int tmpSum = (colorMatrix[0] * r + colorMatrix[1] * g + + colorMatrix[2] * b + colorMatrix[3]) >> + 16; + y_out[i * width + j] = (uint8_t)tmpSum; + } + + return; +} + +extern "C" { + +void rgba2grayLine(uint8_t *in, uint8_t *out, int32_t lineWidth) { + rgba2gray_aie(in, out, 1, lineWidth); +} + +void rgba2grayTile(uint8_t *in, uint8_t *out, int32_t tileHeight, + int32_t tileWidth) { + rgba2gray_aie(in, out, tileHeight, tileWidth); +} + +} // extern "C" diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc index c49cd97655..3f20bf514d 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc @@ -1,181 +1,181 @@ -//===- rgba2hue.cc ----------------------------------------------*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2023, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -#define NOCPP - -#include -#include -#include - -#define REL_WRITE 0 -#define REL_READ 1 - -// clang-format off -#include -#include "lut_inv_8b.h" -// clang-format on - -const int32_t SRS_SHIFT = 12; - -__attribute__((inline)) void xf_extract_rgb(uint8_t *ptr_rgba, - ::aie::vector &r, - ::aie::vector &g, - ::aie::vector &b) { - ::aie::vector rgba_channel0, rgba_channel1, rgba_channel3, - rgba_channel2; - rgba_channel0 = ::aie::load_v<32>(ptr_rgba); - ptr_rgba += 32; - rgba_channel1 = ::aie::load_v<32>(ptr_rgba); - ptr_rgba += 32; - rgba_channel2 = ::aie::load_v<32>(ptr_rgba); - ptr_rgba += 32; - rgba_channel3 = ::aie::load_v<32>(ptr_rgba); - ptr_rgba += 32; - - // Unzip the interleaved channels - auto [rg_temp, ba_temp] = - ::aie::interleave_unzip(::aie::concat(rgba_channel0, rgba_channel1), - ::aie::concat(rgba_channel2, rgba_channel3), 2); - r = ::aie::filter_even(rg_temp, 1); - g = ::aie::filter_odd(rg_temp, 1); - b = ::aie::filter_even(ba_temp, 1); -} - -__attribute__((inline)) void -comp_divisor_16b(::aie::vector divisor, - ::aie::vector &divisor_select) { - const int step = 0; - using lut_type_uint16 = aie::lut<4, uint16, uint16>; - lut_type_uint16 inv_lut_16b(num_entries_lut_inv_16b, lut_inv_16b_ab, - lut_inv_16b_cd); - aie::parallel_lookup - lookup_inv_16b(inv_lut_16b, step); - - aie::vector input1, input2; - aie::vector res1, res2; - input1 = divisor.extract<16>(0); - input2 = divisor.extract<16>(1); - res1 = lookup_inv_16b.fetch(input1.cast_to()); - res2 = lookup_inv_16b.fetch(input2.cast_to()); - divisor_select = aie::concat(res1, res2); -} - -__attribute__((noinline)) void rgba2hue_aie(uint8_t *rgba_in, uint8_t *hue_out, - const int32_t height, - const int32_t width) { - ::aie::vector r, g, b; - ::aie::vector hue; - - ::aie::vector rgbMin, rgbMax; - - ::aie::vector zero32 = aie::zeros(); - - ::aie::vector eightFive = aie::zeros(); - eightFive[0] = 85; - eightFive[1] = -85; - ::aie::vector one = aie::broadcast(1); - ::aie::vector twoEightFive = - aie::broadcast(171); // 170 + 1 - ::aie::vector fourEightFive = - aie::broadcast(341); // 340 + 1 - - for (int j = 0; (j < (width * height) / 32); j += 1) - chess_prepare_for_pipelining { - xf_extract_rgb(rgba_in, r, g, b); - - // Get rgbMin and rgbMax - rgbMin = ::aie::min(::aie::min(r, g), b); - rgbMax = ::aie::max(::aie::max(r, g), b); - - // Get divisor and select the fixed point divisor to multiply by - auto divisor = ::aie::sub(rgbMax, rgbMin); - ::aie::vector divisor_sel; - comp_divisor_16b(divisor, divisor_sel); - - // Initialize accum with value since 340 is larger than uint8 - aie::accum hr_partial(one, 9); - aie::accum hg_partial(twoEightFive, 9); - aie::accum hb_partial(fourEightFive, 9); - - // Performa uin8*int16 vector multiply - hr_partial = aie::mac(hr_partial, g, divisor_sel); - hg_partial = aie::mac(hg_partial, b, divisor_sel); - hb_partial = aie::mac(hb_partial, r, divisor_sel); - - hr_partial = aie::msc(hr_partial, b, divisor_sel); - hg_partial = aie::msc(hg_partial, r, divisor_sel); - hb_partial = aie::msc(hb_partial, g, divisor_sel); - - auto hr = hr_partial.to_vector(10); // Q7.9 shift + 1 (div 2) - auto hg = hg_partial.to_vector(10); // Q7.9 shift + 1 (div 2) - auto hb = hb_partial.to_vector(10); // Q7.9 shift + 1 (div 2) - - aie::mask<32> sel1 = aie::eq(rgbMax, r); - auto tmp1 = aie::select(hb, hr, sel1); - aie::mask<32> sel2 = aie::eq(rgbMax, g); - auto tmp2 = aie::select(tmp1, hg, sel2); - aie::mask<32> sel3 = aie::eq(divisor, zero32); - hue = aie::select(tmp2, zero32, sel3); - - ::aie::store_v(hue_out, hue); - rgba_in += 128; - hue_out += 32; - } -} - -void rgba2hue_aie_scalar(uint8_t *rgba_in, uint8_t *hue_out, - const int32_t height, const int32_t width) { - for (int i = 0; i < height; i++) - for (int j = 0; j < width; j++) { - int r = (int)rgba_in[i * (width * 4) + (j * 4)]; - int g = (int)rgba_in[i * (width * 4) + (j * 4) + 1]; - int b = (int)rgba_in[i * (width * 4) + (j * 4) + 2]; - int h; - uint8_t rgbMin, rgbMax; - - rgbMin = r < g ? (r < b ? r : b) : (g < b ? g : b); - rgbMax = r > g ? (r > b ? r : b) : (g > b ? g : b); - - if (rgbMax == 0 || rgbMax == rgbMin) - h = 0; - else if (rgbMax == r) - h = 0 + - 85 * (g - b) / - (rgbMax - rgbMin); // h = 0 + 42.5*(g - b) / (rgbMax - rgbMin); - else if (rgbMax == g) - h = 85 * 2 + - 85 * (b - r) / - (rgbMax - rgbMin); // h = 85 + 42.5*(b - r) / (rgbMax - rgbMin); - else - h = 170 * 2 + - 85 * (r - g) / - (rgbMax - - rgbMin); // h = 170 + 42.5*(r - g) / (rgbMax - rgbMin); - - h = (h + 1) >> 1; - hue_out[i * width + j] = (uint8_t)h; - } - - return; -} - -extern "C" { - -void rgba2hueLine(uint8_t *in, uint8_t *out, int32_t lineWidth) { - // rgba2hue_aie_scalar(in, out, 1, lineWidth); - rgba2hue_aie(in, out, 1, lineWidth); -} - -void rgba2hueTile(uint8_t *in, uint8_t *out, int32_t tileHeight, - int32_t tileWidth) { - rgba2hue_aie_scalar(in, out, tileHeight, tileWidth); -} - -} // extern "C" +//===- rgba2hue.cc ----------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#define NOCPP + +#include +#include +#include + +#define REL_WRITE 0 +#define REL_READ 1 + +// clang-format off +#include +#include "lut_inv_8b.h" +// clang-format on + +const int32_t SRS_SHIFT = 12; + +__attribute__((inline)) void xf_extract_rgb(uint8_t *ptr_rgba, + ::aie::vector &r, + ::aie::vector &g, + ::aie::vector &b) { + ::aie::vector rgba_channel0, rgba_channel1, rgba_channel3, + rgba_channel2; + rgba_channel0 = ::aie::load_v<32>(ptr_rgba); + ptr_rgba += 32; + rgba_channel1 = ::aie::load_v<32>(ptr_rgba); + ptr_rgba += 32; + rgba_channel2 = ::aie::load_v<32>(ptr_rgba); + ptr_rgba += 32; + rgba_channel3 = ::aie::load_v<32>(ptr_rgba); + ptr_rgba += 32; + + // Unzip the interleaved channels + auto [rg_temp, ba_temp] = + ::aie::interleave_unzip(::aie::concat(rgba_channel0, rgba_channel1), + ::aie::concat(rgba_channel2, rgba_channel3), 2); + r = ::aie::filter_even(rg_temp, 1); + g = ::aie::filter_odd(rg_temp, 1); + b = ::aie::filter_even(ba_temp, 1); +} + +__attribute__((inline)) void +comp_divisor_16b(::aie::vector divisor, + ::aie::vector &divisor_select) { + const int step = 0; + using lut_type_uint16 = aie::lut<4, uint16, uint16>; + lut_type_uint16 inv_lut_16b(num_entries_lut_inv_16b, lut_inv_16b_ab, + lut_inv_16b_cd); + aie::parallel_lookup + lookup_inv_16b(inv_lut_16b, step); + + aie::vector input1, input2; + aie::vector res1, res2; + input1 = divisor.extract<16>(0); + input2 = divisor.extract<16>(1); + res1 = lookup_inv_16b.fetch(input1.cast_to()); + res2 = lookup_inv_16b.fetch(input2.cast_to()); + divisor_select = aie::concat(res1, res2); +} + +__attribute__((noinline)) void rgba2hue_aie(uint8_t *rgba_in, uint8_t *hue_out, + const int32_t height, + const int32_t width) { + ::aie::vector r, g, b; + ::aie::vector hue; + + ::aie::vector rgbMin, rgbMax; + + ::aie::vector zero32 = aie::zeros(); + + ::aie::vector eightFive = aie::zeros(); + eightFive[0] = 85; + eightFive[1] = -85; + ::aie::vector one = aie::broadcast(1); + ::aie::vector twoEightFive = + aie::broadcast(171); // 170 + 1 + ::aie::vector fourEightFive = + aie::broadcast(341); // 340 + 1 + + for (int j = 0; (j < (width * height) / 32); j += 1) + chess_prepare_for_pipelining { + xf_extract_rgb(rgba_in, r, g, b); + + // Get rgbMin and rgbMax + rgbMin = ::aie::min(::aie::min(r, g), b); + rgbMax = ::aie::max(::aie::max(r, g), b); + + // Get divisor and select the fixed point divisor to multiply by + auto divisor = ::aie::sub(rgbMax, rgbMin); + ::aie::vector divisor_sel; + comp_divisor_16b(divisor, divisor_sel); + + // Initialize accum with value since 340 is larger than uint8 + aie::accum hr_partial(one, 9); + aie::accum hg_partial(twoEightFive, 9); + aie::accum hb_partial(fourEightFive, 9); + + // Performa uin8*int16 vector multiply + hr_partial = aie::mac(hr_partial, g, divisor_sel); + hg_partial = aie::mac(hg_partial, b, divisor_sel); + hb_partial = aie::mac(hb_partial, r, divisor_sel); + + hr_partial = aie::msc(hr_partial, b, divisor_sel); + hg_partial = aie::msc(hg_partial, r, divisor_sel); + hb_partial = aie::msc(hb_partial, g, divisor_sel); + + auto hr = hr_partial.to_vector(10); // Q7.9 shift + 1 (div 2) + auto hg = hg_partial.to_vector(10); // Q7.9 shift + 1 (div 2) + auto hb = hb_partial.to_vector(10); // Q7.9 shift + 1 (div 2) + + aie::mask<32> sel1 = aie::eq(rgbMax, r); + auto tmp1 = aie::select(hb, hr, sel1); + aie::mask<32> sel2 = aie::eq(rgbMax, g); + auto tmp2 = aie::select(tmp1, hg, sel2); + aie::mask<32> sel3 = aie::eq(divisor, zero32); + hue = aie::select(tmp2, zero32, sel3); + + ::aie::store_v(hue_out, hue); + rgba_in += 128; + hue_out += 32; + } +} + +void rgba2hue_aie_scalar(uint8_t *rgba_in, uint8_t *hue_out, + const int32_t height, const int32_t width) { + for (int i = 0; i < height; i++) + for (int j = 0; j < width; j++) { + int r = (int)rgba_in[i * (width * 4) + (j * 4)]; + int g = (int)rgba_in[i * (width * 4) + (j * 4) + 1]; + int b = (int)rgba_in[i * (width * 4) + (j * 4) + 2]; + int h; + uint8_t rgbMin, rgbMax; + + rgbMin = r < g ? (r < b ? r : b) : (g < b ? g : b); + rgbMax = r > g ? (r > b ? r : b) : (g > b ? g : b); + + if (rgbMax == 0 || rgbMax == rgbMin) + h = 0; + else if (rgbMax == r) + h = 0 + + 85 * (g - b) / + (rgbMax - rgbMin); // h = 0 + 42.5*(g - b) / (rgbMax - rgbMin); + else if (rgbMax == g) + h = 85 * 2 + + 85 * (b - r) / + (rgbMax - rgbMin); // h = 85 + 42.5*(b - r) / (rgbMax - rgbMin); + else + h = 170 * 2 + + 85 * (r - g) / + (rgbMax - + rgbMin); // h = 170 + 42.5*(r - g) / (rgbMax - rgbMin); + + h = (h + 1) >> 1; + hue_out[i * width + j] = (uint8_t)h; + } + + return; +} + +extern "C" { + +void rgba2hueLine(uint8_t *in, uint8_t *out, int32_t lineWidth) { + // rgba2hue_aie_scalar(in, out, 1, lineWidth); + rgba2hue_aie(in, out, 1, lineWidth); +} + +void rgba2hueTile(uint8_t *in, uint8_t *out, int32_t tileHeight, + int32_t tileWidth) { + rgba2hue_aie_scalar(in, out, tileHeight, tileWidth); +} + +} // extern "C" diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/threshold.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/threshold.cc index 2e42d85829..e1ffb38479 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/threshold.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/threshold.cc @@ -1,313 +1,313 @@ -//===- threshold.cc ----------------------------------------------*- C++ -//-*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2022, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -// #define __AIENGINE__ 1 -#define NOCPP - -#include -#include -#include - -#define REL_WRITE 0 -#define REL_READ 1 - -enum _threshold_type { - XF_THRESHOLD_TYPE_BINARY = 0, - XF_THRESHOLD_TYPE_BINARY_INV = 1, - XF_THRESHOLD_TYPE_TRUNC = 2, - XF_THRESHOLD_TYPE_TOZERO = 3, - XF_THRESHOLD_TYPE_TOZERO_INV = 4, -}; - -// #define THRESH_TYPE XF_THRESHOLD_TYPE_BINARY - -#include - -template -__attribute__((noinline)) void -threshold_aie(T *img_in, T *img_out, const int32_t img_width, - const int32_t img_height, const T &thresh_val, const T &max_val, - const uint8_t thresholdType) { - ::aie::vector constants; - ::aie::vector data_out; - ::aie::mask temp_val; - constants[0] = 0; // updating constant zero_val value - constants[1] = thresh_val; // updating constant threshold value - constants[2] = max_val; // updating constant max_val value - - switch (thresholdType) { - case XF_THRESHOLD_TYPE_TRUNC: - for (int j = 0; j < (img_height * img_width); - j += N) // 16x samples per loop - chess_prepare_for_pipelining chess_loop_range(14, ) { - ::aie::vector data_buf1 = - ::aie::load_v(img_in); // in:00++15|_________|_________|_________ - img_in += N; - data_out = ::aie::min(constants[1], data_buf1); - ::aie::store_v(img_out, data_out); - img_out += N; - } - break; - case XF_THRESHOLD_TYPE_BINARY: - for (int j = 0; j < (img_height * img_width); - j += N) // 16x samples per loop - chess_prepare_for_pipelining chess_loop_range(14, ) { - ::aie::vector data_buf1 = - ::aie::load_v(img_in); // in:00++15|_________|_________|_________ - img_in += N; - temp_val = ::aie::lt(constants[1], data_buf1); - data_out = ::aie::select(constants[0], constants[2], temp_val); - ::aie::store_v(img_out, data_out); - img_out += N; - } - break; - case XF_THRESHOLD_TYPE_BINARY_INV: - for (int j = 0; j < (img_height * img_width); - j += N) // 16x samples per loop - chess_prepare_for_pipelining chess_loop_range(14, ) { - ::aie::vector data_buf1 = - ::aie::load_v(img_in); // in:00++15|_________|_________|_________ - img_in += N; - temp_val = ::aie::lt(constants[1], data_buf1); - data_out = ::aie::select(constants[2], constants[0], temp_val); - ::aie::store_v(img_out, data_out); - img_out += N; - } - break; - case XF_THRESHOLD_TYPE_TOZERO: - for (int j = 0; j < (img_height * img_width); - j += N) // 16x samples per loop - chess_prepare_for_pipelining chess_loop_range(14, ) { - ::aie::vector data_buf1 = - ::aie::load_v(img_in); // in:00++15|_________|_________|_________ - img_in += N; - temp_val = ::aie::lt(constants[1], data_buf1); - data_out = ::aie::select(constants[0], data_buf1, temp_val); - ::aie::store_v(img_out, data_out); - img_out += N; - } - break; - case XF_THRESHOLD_TYPE_TOZERO_INV: - for (int j = 0; j < (img_height * img_width); - j += N) // 16x samples per loop - chess_prepare_for_pipelining chess_loop_range(14, ) { - ::aie::vector data_buf1 = - ::aie::load_v(img_in); // in:00++15|_________|_________|_________ - img_in += N; - temp_val = ::aie::lt(constants[1], data_buf1); - data_out = ::aie::select(data_buf1, constants[0], temp_val); - ::aie::store_v(img_out, data_out); - img_out += N; - } - break; - default: - for (int j = 0; j < (img_height * img_width); - j += N) // 16x samples per loop - chess_prepare_for_pipelining chess_loop_range(14, ) { - ::aie::vector data_buf1 = - ::aie::load_v(img_in); // in:00++15|_________|_________|_________ - img_in += N; - data_out = ::aie::min(constants[1], data_buf1); - ::aie::store_v(img_out, data_out); - img_out += N; - } - } -} - -template -__attribute__((noinline)) void threshold4Ch_aie( - T *img_in, T *img_out, const int32_t img_width, const int32_t img_height, - const T &thresh_val1, const T &thresh_val2, const T &thresh_val3, - const T &thresh_val4, const T &max_val1, const T &max_val2, - const T &max_val3, const T &max_val4, const uint8_t thresholdType) { - ::aie::vector constants; - ::aie::vector data_out; - ::aie::mask temp_val; - // constants[0] = 0; // updating constant zero_val value - // constants[1] = thresh_val; // updating constant threshold value - // constants[2] = max_val; // updating constant max_val value - - ::aie::vector mask_zeros = ::aie::zeros(); - ::aie::vector mask_thresh; - ::aie::vector mask_max; - for (int i = 0; i < N / 4; i++) { - mask_thresh[i * 4] = thresh_val1; - mask_thresh[i * 4 + 1] = thresh_val2; - mask_thresh[i * 4 + 2] = thresh_val3; - mask_thresh[i * 4 + 3] = thresh_val4; - mask_max[i * 4] = max_val1; - mask_max[i * 4 + 1] = max_val2; - mask_max[i * 4 + 2] = max_val3; - mask_max[i * 4 + 3] = max_val4; - } - - switch (thresholdType) { - case XF_THRESHOLD_TYPE_TRUNC: - for (int j = 0; j < (img_height * img_width); - j += N) // 16x samples per loop - chess_prepare_for_pipelining chess_loop_range(14, ) { - ::aie::vector data_buf1 = - ::aie::load_v(img_in); // in:00++15|_________|_________|_________ - img_in += N; - data_out = ::aie::min(mask_thresh, data_buf1); - ::aie::store_v(img_out, data_out); - img_out += N; - } - break; - case XF_THRESHOLD_TYPE_BINARY: - for (int j = 0; j < (img_height * img_width); - j += N) // 16x samples per loop - chess_prepare_for_pipelining chess_loop_range(14, ) { - ::aie::vector data_buf1 = - ::aie::load_v(img_in); // in:00++15|_________|_________|_________ - img_in += N; - temp_val = ::aie::lt(mask_thresh, data_buf1); - data_out = ::aie::select(mask_zeros, mask_max, temp_val); - ::aie::store_v(img_out, data_out); - img_out += N; - } - break; - case XF_THRESHOLD_TYPE_BINARY_INV: - for (int j = 0; j < (img_height * img_width); - j += N) // 16x samples per loop - chess_prepare_for_pipelining chess_loop_range(14, ) { - ::aie::vector data_buf1 = - ::aie::load_v(img_in); // in:00++15|_________|_________|_________ - img_in += N; - temp_val = ::aie::lt(mask_thresh, data_buf1); - data_out = ::aie::select(mask_max, mask_zeros, temp_val); - ::aie::store_v(img_out, data_out); - img_out += N; - } - break; - case XF_THRESHOLD_TYPE_TOZERO: - for (int j = 0; j < (img_height * img_width); - j += N) // 16x samples per loop - chess_prepare_for_pipelining chess_loop_range(14, ) { - ::aie::vector data_buf1 = - ::aie::load_v(img_in); // in:00++15|_________|_________|_________ - img_in += N; - temp_val = ::aie::lt(mask_thresh, data_buf1); - data_out = ::aie::select(mask_zeros, data_buf1, temp_val); - ::aie::store_v(img_out, data_out); - img_out += N; - } - break; - case XF_THRESHOLD_TYPE_TOZERO_INV: - for (int j = 0; j < (img_height * img_width); - j += N) // 16x samples per loop - chess_prepare_for_pipelining chess_loop_range(14, ) { - ::aie::vector data_buf1 = - ::aie::load_v(img_in); // in:00++15|_________|_________|_________ - img_in += N; - temp_val = ::aie::lt(mask_thresh, data_buf1); - data_out = ::aie::select(data_buf1, mask_zeros, temp_val); - ::aie::store_v(img_out, data_out); - img_out += N; - } - break; - default: - for (int j = 0; j < (img_height * img_width); - j += N) // 16x samples per loop - chess_prepare_for_pipelining chess_loop_range(14, ) { - ::aie::vector data_buf1 = - ::aie::load_v(img_in); // in:00++15|_________|_________|_________ - img_in += N; - data_out = ::aie::min(mask_thresh, data_buf1); - ::aie::store_v(img_out, data_out); - img_out += N; - } - } -} - -extern "C" { - -#if BIT_WIDTH == 8 - -void threshold(uint8_t *img_in, uint8_t *img_out, int32_t thresh_val, - int32_t max_val, int32_t img_width, int32_t img_height) { - threshold_aie(img_in, img_out, img_width, img_height, thresh_val, - max_val, XF_THRESHOLD_TYPE_BINARY); -} - -void thresholdTile(uint8_t *in, uint8_t *out, int32_t tileHeight, - int32_t tileWidth, uint8_t thresholdValue, uint8_t maxValue, - uint8_t thresholdType) { - threshold_aie(in, out, tileWidth, tileHeight, thresholdValue, - maxValue, thresholdType); -} - -void thresholdLine(uint8_t *in, uint8_t *out, int32_t lineWidth, - uint8_t thresholdValue, uint8_t maxValue, - uint8_t thresholdType) { - threshold_aie(in, out, lineWidth, 1, thresholdValue, maxValue, - thresholdType); -} - -void threshold4ChLine(uint8_t *in, uint8_t *out, int32_t lineWidth, - uint8_t thresholdValue1, uint8_t thresholdValue2, - uint8_t thresholdValue3, uint8_t thresholdValue4, - uint8_t maxValue1, uint8_t maxValue2, uint8_t maxValue3, - uint8_t maxValue4, uint8_t thresholdType) { - threshold4Ch_aie(in, out, lineWidth, 1, thresholdValue1, - thresholdValue2, thresholdValue3, - thresholdValue4, maxValue1, maxValue2, - maxValue3, maxValue4, thresholdType); -} - -#elif BIT_WIDTH == 16 - -void threshold(int16_t *img_in, int16_t *img_out, int32_t thresh_val, - int32_t max_val, int32_t img_width, int32_t img_height) { - threshold_aie(img_in, img_out, img_width, img_height, thresh_val, - max_va, XF_THRESHOLD_TYPE_BINARY); -} - -void thresholdTile(int16_t *in, int16_t *out, int32_t tileHeight, - int32_t tileWidth, int16_t thresholdValue, int16_t maxValue, - uint8_t thresholdType) { - threshold_aie(in, out, tileWidth, tileHeight, thresholdValue, - maxValue), - thresholdType; -} - -void thresholdLine(int16_t *in, int16_t *out, int32_t lineWidth, - int16_t thresholdValue, int16_t maxValue, - uint8_t thresholdType) { - threshold_aie(in, out, lineWidth, 1, thresholdValue, maxValue, - thresholdType); -} - -#else // 32 - -void threshold(int32_t *img_in, int32_t *img_out, int32_t thresh_val, - int32_t max_val, int32_t img_width, int32_t img_height) { - threshold_aie(img_in, img_out, img_width, img_height, thresh_val, - max_val, XF_THRESHOLD_TYPE_BINARY); -} - -void thresholdTile(int32_t *in, int32_t *out, int32_t tileHeight, - int32_t tileWidth, int32_t thresholdValue, int32_t maxValue, - uint8_t thresholdType) { - threshold_aie(in, out, tileWidth, tileHeight, thresholdValue, - maxValue, thresholdType); -} - -void thresholdLine(int32_t *in, int32_t *out, int32_t lineWidth, - int32_t thresholdValue, int32_t maxValue, - uint8_t thresholdType) { - threshold_aie(in, out, lineWidth, 1, thresholdValue, maxValue, - thresholdType); -} - -#endif - -} // extern "C" +//===- threshold.cc ----------------------------------------------*- C++ +//-*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2022, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// #define __AIENGINE__ 1 +#define NOCPP + +#include +#include +#include + +#define REL_WRITE 0 +#define REL_READ 1 + +enum _threshold_type { + XF_THRESHOLD_TYPE_BINARY = 0, + XF_THRESHOLD_TYPE_BINARY_INV = 1, + XF_THRESHOLD_TYPE_TRUNC = 2, + XF_THRESHOLD_TYPE_TOZERO = 3, + XF_THRESHOLD_TYPE_TOZERO_INV = 4, +}; + +// #define THRESH_TYPE XF_THRESHOLD_TYPE_BINARY + +#include + +template +__attribute__((noinline)) void +threshold_aie(T *img_in, T *img_out, const int32_t img_width, + const int32_t img_height, const T &thresh_val, const T &max_val, + const uint8_t thresholdType) { + ::aie::vector constants; + ::aie::vector data_out; + ::aie::mask temp_val; + constants[0] = 0; // updating constant zero_val value + constants[1] = thresh_val; // updating constant threshold value + constants[2] = max_val; // updating constant max_val value + + switch (thresholdType) { + case XF_THRESHOLD_TYPE_TRUNC: + for (int j = 0; j < (img_height * img_width); + j += N) // 16x samples per loop + chess_prepare_for_pipelining chess_loop_range(14, ) { + ::aie::vector data_buf1 = + ::aie::load_v(img_in); // in:00++15|_________|_________|_________ + img_in += N; + data_out = ::aie::min(constants[1], data_buf1); + ::aie::store_v(img_out, data_out); + img_out += N; + } + break; + case XF_THRESHOLD_TYPE_BINARY: + for (int j = 0; j < (img_height * img_width); + j += N) // 16x samples per loop + chess_prepare_for_pipelining chess_loop_range(14, ) { + ::aie::vector data_buf1 = + ::aie::load_v(img_in); // in:00++15|_________|_________|_________ + img_in += N; + temp_val = ::aie::lt(constants[1], data_buf1); + data_out = ::aie::select(constants[0], constants[2], temp_val); + ::aie::store_v(img_out, data_out); + img_out += N; + } + break; + case XF_THRESHOLD_TYPE_BINARY_INV: + for (int j = 0; j < (img_height * img_width); + j += N) // 16x samples per loop + chess_prepare_for_pipelining chess_loop_range(14, ) { + ::aie::vector data_buf1 = + ::aie::load_v(img_in); // in:00++15|_________|_________|_________ + img_in += N; + temp_val = ::aie::lt(constants[1], data_buf1); + data_out = ::aie::select(constants[2], constants[0], temp_val); + ::aie::store_v(img_out, data_out); + img_out += N; + } + break; + case XF_THRESHOLD_TYPE_TOZERO: + for (int j = 0; j < (img_height * img_width); + j += N) // 16x samples per loop + chess_prepare_for_pipelining chess_loop_range(14, ) { + ::aie::vector data_buf1 = + ::aie::load_v(img_in); // in:00++15|_________|_________|_________ + img_in += N; + temp_val = ::aie::lt(constants[1], data_buf1); + data_out = ::aie::select(constants[0], data_buf1, temp_val); + ::aie::store_v(img_out, data_out); + img_out += N; + } + break; + case XF_THRESHOLD_TYPE_TOZERO_INV: + for (int j = 0; j < (img_height * img_width); + j += N) // 16x samples per loop + chess_prepare_for_pipelining chess_loop_range(14, ) { + ::aie::vector data_buf1 = + ::aie::load_v(img_in); // in:00++15|_________|_________|_________ + img_in += N; + temp_val = ::aie::lt(constants[1], data_buf1); + data_out = ::aie::select(data_buf1, constants[0], temp_val); + ::aie::store_v(img_out, data_out); + img_out += N; + } + break; + default: + for (int j = 0; j < (img_height * img_width); + j += N) // 16x samples per loop + chess_prepare_for_pipelining chess_loop_range(14, ) { + ::aie::vector data_buf1 = + ::aie::load_v(img_in); // in:00++15|_________|_________|_________ + img_in += N; + data_out = ::aie::min(constants[1], data_buf1); + ::aie::store_v(img_out, data_out); + img_out += N; + } + } +} + +template +__attribute__((noinline)) void threshold4Ch_aie( + T *img_in, T *img_out, const int32_t img_width, const int32_t img_height, + const T &thresh_val1, const T &thresh_val2, const T &thresh_val3, + const T &thresh_val4, const T &max_val1, const T &max_val2, + const T &max_val3, const T &max_val4, const uint8_t thresholdType) { + ::aie::vector constants; + ::aie::vector data_out; + ::aie::mask temp_val; + // constants[0] = 0; // updating constant zero_val value + // constants[1] = thresh_val; // updating constant threshold value + // constants[2] = max_val; // updating constant max_val value + + ::aie::vector mask_zeros = ::aie::zeros(); + ::aie::vector mask_thresh; + ::aie::vector mask_max; + for (int i = 0; i < N / 4; i++) { + mask_thresh[i * 4] = thresh_val1; + mask_thresh[i * 4 + 1] = thresh_val2; + mask_thresh[i * 4 + 2] = thresh_val3; + mask_thresh[i * 4 + 3] = thresh_val4; + mask_max[i * 4] = max_val1; + mask_max[i * 4 + 1] = max_val2; + mask_max[i * 4 + 2] = max_val3; + mask_max[i * 4 + 3] = max_val4; + } + + switch (thresholdType) { + case XF_THRESHOLD_TYPE_TRUNC: + for (int j = 0; j < (img_height * img_width); + j += N) // 16x samples per loop + chess_prepare_for_pipelining chess_loop_range(14, ) { + ::aie::vector data_buf1 = + ::aie::load_v(img_in); // in:00++15|_________|_________|_________ + img_in += N; + data_out = ::aie::min(mask_thresh, data_buf1); + ::aie::store_v(img_out, data_out); + img_out += N; + } + break; + case XF_THRESHOLD_TYPE_BINARY: + for (int j = 0; j < (img_height * img_width); + j += N) // 16x samples per loop + chess_prepare_for_pipelining chess_loop_range(14, ) { + ::aie::vector data_buf1 = + ::aie::load_v(img_in); // in:00++15|_________|_________|_________ + img_in += N; + temp_val = ::aie::lt(mask_thresh, data_buf1); + data_out = ::aie::select(mask_zeros, mask_max, temp_val); + ::aie::store_v(img_out, data_out); + img_out += N; + } + break; + case XF_THRESHOLD_TYPE_BINARY_INV: + for (int j = 0; j < (img_height * img_width); + j += N) // 16x samples per loop + chess_prepare_for_pipelining chess_loop_range(14, ) { + ::aie::vector data_buf1 = + ::aie::load_v(img_in); // in:00++15|_________|_________|_________ + img_in += N; + temp_val = ::aie::lt(mask_thresh, data_buf1); + data_out = ::aie::select(mask_max, mask_zeros, temp_val); + ::aie::store_v(img_out, data_out); + img_out += N; + } + break; + case XF_THRESHOLD_TYPE_TOZERO: + for (int j = 0; j < (img_height * img_width); + j += N) // 16x samples per loop + chess_prepare_for_pipelining chess_loop_range(14, ) { + ::aie::vector data_buf1 = + ::aie::load_v(img_in); // in:00++15|_________|_________|_________ + img_in += N; + temp_val = ::aie::lt(mask_thresh, data_buf1); + data_out = ::aie::select(mask_zeros, data_buf1, temp_val); + ::aie::store_v(img_out, data_out); + img_out += N; + } + break; + case XF_THRESHOLD_TYPE_TOZERO_INV: + for (int j = 0; j < (img_height * img_width); + j += N) // 16x samples per loop + chess_prepare_for_pipelining chess_loop_range(14, ) { + ::aie::vector data_buf1 = + ::aie::load_v(img_in); // in:00++15|_________|_________|_________ + img_in += N; + temp_val = ::aie::lt(mask_thresh, data_buf1); + data_out = ::aie::select(data_buf1, mask_zeros, temp_val); + ::aie::store_v(img_out, data_out); + img_out += N; + } + break; + default: + for (int j = 0; j < (img_height * img_width); + j += N) // 16x samples per loop + chess_prepare_for_pipelining chess_loop_range(14, ) { + ::aie::vector data_buf1 = + ::aie::load_v(img_in); // in:00++15|_________|_________|_________ + img_in += N; + data_out = ::aie::min(mask_thresh, data_buf1); + ::aie::store_v(img_out, data_out); + img_out += N; + } + } +} + +extern "C" { + +#if BIT_WIDTH == 8 + +void threshold(uint8_t *img_in, uint8_t *img_out, int32_t thresh_val, + int32_t max_val, int32_t img_width, int32_t img_height) { + threshold_aie(img_in, img_out, img_width, img_height, thresh_val, + max_val, XF_THRESHOLD_TYPE_BINARY); +} + +void thresholdTile(uint8_t *in, uint8_t *out, int32_t tileHeight, + int32_t tileWidth, uint8_t thresholdValue, uint8_t maxValue, + uint8_t thresholdType) { + threshold_aie(in, out, tileWidth, tileHeight, thresholdValue, + maxValue, thresholdType); +} + +void thresholdLine(uint8_t *in, uint8_t *out, int32_t lineWidth, + uint8_t thresholdValue, uint8_t maxValue, + uint8_t thresholdType) { + threshold_aie(in, out, lineWidth, 1, thresholdValue, maxValue, + thresholdType); +} + +void threshold4ChLine(uint8_t *in, uint8_t *out, int32_t lineWidth, + uint8_t thresholdValue1, uint8_t thresholdValue2, + uint8_t thresholdValue3, uint8_t thresholdValue4, + uint8_t maxValue1, uint8_t maxValue2, uint8_t maxValue3, + uint8_t maxValue4, uint8_t thresholdType) { + threshold4Ch_aie(in, out, lineWidth, 1, thresholdValue1, + thresholdValue2, thresholdValue3, + thresholdValue4, maxValue1, maxValue2, + maxValue3, maxValue4, thresholdType); +} + +#elif BIT_WIDTH == 16 + +void threshold(int16_t *img_in, int16_t *img_out, int32_t thresh_val, + int32_t max_val, int32_t img_width, int32_t img_height) { + threshold_aie(img_in, img_out, img_width, img_height, thresh_val, + max_va, XF_THRESHOLD_TYPE_BINARY); +} + +void thresholdTile(int16_t *in, int16_t *out, int32_t tileHeight, + int32_t tileWidth, int16_t thresholdValue, int16_t maxValue, + uint8_t thresholdType) { + threshold_aie(in, out, tileWidth, tileHeight, thresholdValue, + maxValue), + thresholdType; +} + +void thresholdLine(int16_t *in, int16_t *out, int32_t lineWidth, + int16_t thresholdValue, int16_t maxValue, + uint8_t thresholdType) { + threshold_aie(in, out, lineWidth, 1, thresholdValue, maxValue, + thresholdType); +} + +#else // 32 + +void threshold(int32_t *img_in, int32_t *img_out, int32_t thresh_val, + int32_t max_val, int32_t img_width, int32_t img_height) { + threshold_aie(img_in, img_out, img_width, img_height, thresh_val, + max_val, XF_THRESHOLD_TYPE_BINARY); +} + +void thresholdTile(int32_t *in, int32_t *out, int32_t tileHeight, + int32_t tileWidth, int32_t thresholdValue, int32_t maxValue, + uint8_t thresholdType) { + threshold_aie(in, out, tileWidth, tileHeight, thresholdValue, + maxValue, thresholdType); +} + +void thresholdLine(int32_t *in, int32_t *out, int32_t lineWidth, + int32_t thresholdValue, int32_t maxValue, + uint8_t thresholdType) { + threshold_aie(in, out, lineWidth, 1, thresholdValue, maxValue, + thresholdType); +} + +#endif + +} // extern "C" From a2d9254ced87b814216f5f125c6d2d6419e1c59c Mon Sep 17 00:00:00 2001 From: Stephen Neuendorffer Date: Tue, 23 Jan 2024 09:14:00 -0800 Subject: [PATCH 2/3] [vision_pipelines] remove extra defines. These defines are actually in aie_api.h, and aren't used in the code anyway. --- .../ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc | 3 --- .../ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc | 3 --- .../ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc | 3 --- .../ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc | 3 --- .../ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc | 3 --- .../ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc | 3 --- .../ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc | 3 --- .../ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc | 3 --- .../ipu-xrt/vision_pipelines/vision_kernels/threshold.cc | 3 --- 9 files changed, 27 deletions(-) diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc index c6868aad3a..849e47a551 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/addWeighted.cc @@ -16,9 +16,6 @@ #include #include -#define REL_WRITE 0 -#define REL_READ 1 - // #include // NOTE: use of float2fix not // supported in aie2 #include diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc index e3cd871195..57aec65470 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseAND.cc @@ -15,9 +15,6 @@ #include #include -#define REL_WRITE 0 -#define REL_READ 1 - #include template diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc index e02ec472f5..cd6e0d1027 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/bitwiseOR.cc @@ -15,9 +15,6 @@ #include #include -#define REL_WRITE 0 -#define REL_READ 1 - #include template diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc index ccbbb87c6f..59d11d303f 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/filter2d.cc @@ -15,9 +15,6 @@ #include #include -#define REL_WRITE 0 -#define REL_READ 1 - #define THRESH_TYPE XF_THRESHOLD_TYPE_BINARY #include diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc index 351b3331b6..48d1c6acba 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/gray2rgba.cc @@ -14,9 +14,6 @@ #include #include -#define REL_WRITE 0 -#define REL_READ 1 - #include ::aie::vector vector_broadcast(::aie::vector e) { diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc index 4725d09ba6..9d5d6188b0 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/passThrough.cc @@ -15,9 +15,6 @@ #include #include -#define REL_WRITE 0 -#define REL_READ 1 - #include template diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc index 988c4a0e4c..1f1bc23d4c 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc @@ -14,9 +14,6 @@ #include #include -#define REL_WRITE 0 -#define REL_READ 1 - #include const int32_t SRS_SHIFT = 15; diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc index 3f20bf514d..39a2894495 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2hue.cc @@ -14,9 +14,6 @@ #include #include -#define REL_WRITE 0 -#define REL_READ 1 - // clang-format off #include #include "lut_inv_8b.h" diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/threshold.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/threshold.cc index e1ffb38479..ef4d9ee7c1 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/threshold.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/threshold.cc @@ -16,9 +16,6 @@ #include #include -#define REL_WRITE 0 -#define REL_READ 1 - enum _threshold_type { XF_THRESHOLD_TYPE_BINARY = 0, XF_THRESHOLD_TYPE_BINARY_INV = 1, From b22002635db963b225548adb7e53fdb4382dd486 Mon Sep 17 00:00:00 2001 From: Stephen Neuendorffer Date: Tue, 23 Jan 2024 09:58:24 -0800 Subject: [PATCH 3/3] Update ipu edgeDetect demo to compile and link with peano This change also updates test.cpp to properly support 'live' mode. With no arguments: runs a single image and attempts to compare against openCV. with -l: runs continuously using the webcam with -v: runs continuously using a video file. --- reference_designs/ipu-xrt/makefile-common | 7 + .../vision_pipelines/edge_detect/Makefile | 51 ++- .../edge_detect/aie2_edgeDetect.py | 4 +- .../vision_pipelines/edge_detect/test.cpp | 393 ++++++++---------- .../vision_kernels/rgba2gray.cc | 2 +- 5 files changed, 237 insertions(+), 220 deletions(-) diff --git a/reference_designs/ipu-xrt/makefile-common b/reference_designs/ipu-xrt/makefile-common index d9a0a69015..ed851b0e29 100644 --- a/reference_designs/ipu-xrt/makefile-common +++ b/reference_designs/ipu-xrt/makefile-common @@ -6,8 +6,15 @@ VITIS_AIETOOLS_DIR ?= ${VITIS_ROOT}/aietools VITIS_AIE_INCLUDE_DIR ?= ${VITIS_ROOT}/aietools/data/versal_prod/lib VITIS_AIE2_INCLUDE_DIR ?= ${VITIS_ROOT}/aietools/data/aie_ml/lib +export XILINX_VITIS_AIETOOLS = ${VITIS_AIETOOLS_DIR} + CHESSCC1_FLAGS = -f -p me -P ${VITIS_AIE_INCLUDE_DIR} -I ${VITIS_AIETOOLS_DIR}/include CHESSCC2_FLAGS = -f -p me -P ${VITIS_AIE2_INCLUDE_DIR} -I ${VITIS_AIETOOLS_DIR}/include -D__AIENGINE__=2 -D__AIEARCH__=20 + +# Compile with peano, link with xbridge +CLANG_CHESSLIBC_FLAGS = --target=aie2 -I ${VITIS_AIETOOLS_DIR}/include -I ${VITIS_AIETOOLS_DIR}/include/adf -I ${VITIS_AIETOOLS_DIR}/include/aie_api -D_LIBCPP_HAS_THREAD_API_PTHREAD --std=c++2a -O2 -fno-jump-tables -Wno-deprecated-declarations +# Compile with peano, link with lld +CLANG_FLAGS = --target=aie2-none-elf -I ${VITIS_AIETOOLS_DIR}/include -D_LIBCPP_HAS_THREAD_API_PTHREAD --std=c++2a -O2 -fno-jump-tables -Wno-deprecated-declarations CHESS_FLAGS = -P ${VITIS_AIE_INCLUDE_DIR} CHESSCCWRAP1_FLAGS = aie -I ${VITIS_AIETOOLS_DIR}/include diff --git a/reference_designs/ipu-xrt/vision_pipelines/edge_detect/Makefile b/reference_designs/ipu-xrt/vision_pipelines/edge_detect/Makefile index a0d50006e1..1557420dca 100755 --- a/reference_designs/ipu-xrt/vision_pipelines/edge_detect/Makefile +++ b/reference_designs/ipu-xrt/vision_pipelines/edge_detect/Makefile @@ -21,13 +21,19 @@ EDGEDETECT_HEIGHT = 1080 targetname = edgeDetect -all: build/final_${EDGEDETECT_WIDTH}.xclbin +default: peano + +all: peano xchesscc peanoxbridge mlir: build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir +peano: build/final_${EDGEDETECT_WIDTH}.xclbin +xchesscc: build_xchesscc/final_${EDGEDETECT_WIDTH}.xclbin +peanoxbridge: build_peanoxbridge/final_${EDGEDETECT_WIDTH}.xclbin +PEANO = /root/acdc/build/install/peano/bin/clang++ build/%.cc.o: %.cc - mkdir -p ${@D} - cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F} + @mkdir -p ${@D} + cd ${@D} && ${PEANO} ${CLANG_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F} build/combined_gray2rgba_addWeighted.a: build/gray2rgba.cc.o build/addWeighted.cc.o mkdir -p ${@D} @@ -42,6 +48,45 @@ build/final_${EDGEDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${EDGEDETECT_WID cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%) +XCHESSCC_OBJS = build_xchesscc/rgba2gray.cc.o \ + build_xchesscc/gray2rgba.cc.o \ + build_xchesscc/filter2d.cc.o \ + build_xchesscc/threshold.cc.o \ + build_xchesscc/addWeighted.cc.o \ + build_xchesscc/combined_gray2rgba_addWeighted.a + +build_xchesscc/%.cc.o: %.cc + @mkdir -p ${@D} + cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} +w ${@F}.work -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F} + +build_xchesscc/combined_gray2rgba_addWeighted.a: build_xchesscc/gray2rgba.cc.o build_xchesscc/addWeighted.cc.o + mkdir -p ${@D} + ar rvs $@ $< $(word 2,$^) + +build_xchesscc/final_${EDGEDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir ${XCHESSCC_OBJS} + mkdir -p ${@D} + cd ${@D} && aiecc.py --xchesscc --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host \ + --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%) + +PEANOXBRIDGE_OBJS = build_peanoxbridge/rgba2gray.cc.o \ + build_peanoxbridge/gray2rgba.cc.o \ + build_peanoxbridge/filter2d.cc.o \ + build_peanoxbridge/threshold.cc.o \ + build_peanoxbridge/addWeighted.cc.o \ + build_peanoxbridge/combined_gray2rgba_addWeighted.a +build_peanoxbridge/%.cc.o: %.cc + @mkdir -p ${@D} + cd ${@D} && ${PEANO} ${CLANG_CHESSLIBC_FLAGS} -DBIT_WIDTH=8 -c $(<:%=../%) -o ${@F} + +build_peanoxbridge/combined_gray2rgba_addWeighted.a: build_peanoxbridge/gray2rgba.cc.o build_peanoxbridge/addWeighted.cc.o + mkdir -p ${@D} + ar rvs $@ $< $(word 2,$^) + +build_peanoxbridge/final_${EDGEDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir ${PEANOXBRIDGE_OBJS} + mkdir -p ${@D} + cd ${@D} && aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host \ + --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%) + ${targetname}.exe: test.cpp rm -rf _build mkdir -p _build diff --git a/reference_designs/ipu-xrt/vision_pipelines/edge_detect/aie2_edgeDetect.py b/reference_designs/ipu-xrt/vision_pipelines/edge_detect/aie2_edgeDetect.py index 48682e4011..5a32a38c9f 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/edge_detect/aie2_edgeDetect.py +++ b/reference_designs/ipu-xrt/vision_pipelines/edge_detect/aie2_edgeDetect.py @@ -178,10 +178,12 @@ def core_body(): objectfifo_release(ObjectFifoPort.Produce, "OF_2to3", 1) yield_([]) + kernel = Buffer(ComputeTile3, [3, 3], T.i16(), "kernel") + # Compute tile 3 @core(ComputeTile3, "filter2d.cc.o") def core_body(): - kernel = memref.alloc([3, 3], T.i16()) + # kernel = memref.alloca([3, 3], T.i16()) v0 = arith.constant(0, T.i16()) v1 = arith.constant(4096, T.i16()) v_minus4 = arith.constant(-16384, T.i16()) diff --git a/reference_designs/ipu-xrt/vision_pipelines/edge_detect/test.cpp b/reference_designs/ipu-xrt/vision_pipelines/edge_detect/test.cpp index 57389fffbf..0fa2e80294 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/edge_detect/test.cpp +++ b/reference_designs/ipu-xrt/vision_pipelines/edge_detect/test.cpp @@ -105,18 +105,156 @@ int main(int argc, const char *argv[]) { std::cout << "Running edgeDetect for resolution: " << testImageWidth << "x" << testImageHeight << std::endl; - if (vm.count("live")) { - std::cout << "Using live webcam input" << std::endl; + /* + **************************************************************************** + * Read the input image or generate random one if no input file argument + * provided + **************************************************************************** + */ + cv::Mat inImage, inImageRGBA; + cv::String fileIn; + if (vm.count("image")) { + fileIn = vm["image"].as(); + //"/group/xrlabs/imagesAndVideos/images/minion128x128.jpg"; + initializeSingleImageTest(fileIn, inImage); + } else { + fileIn = "RANDOM"; + inImage = cv::Mat(testImageHeight, testImageWidth, CV_8UC3); + cv::randu(inImage, cv::Scalar(0, 0, 0), cv::Scalar(255, 255, 255)); + } + + cv::String fileOut = + vm["outfile"].as(); //"edgeDetectOut_test.jpg"; + printf("Load input image %s and run edgeDetect\n", fileIn.c_str()); + + cv::resize(inImage, inImage, cv::Size(testImageWidth, testImageHeight)); + cv::cvtColor(inImage, inImageRGBA, cv::COLOR_BGR2RGBA); + + /* + **************************************************************************** + * Calculate OpenCV referennce for edgeDetect + **************************************************************************** + */ + + cv::Mat outImageReference, outImageTestBGR; + edgeDetect(inImage, outImageReference); + + cv::cvtColor(outImageReference, outImageReference, cv::COLOR_BGR2RGBA); + cv::Mat outImageTest(testImageHeight, testImageWidth, CV_8UC4); + + /* + **************************************************************************** + * Load instruction sequence + **************************************************************************** + */ + std::vector instr_v = + load_instr_sequence(vm["instr"].as()); + + int verbosity = vm["verbosity"].as(); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + /* + **************************************************************************** + * Start the XRT context and load the kernel + **************************************************************************** + */ + xrt::device device; + xrt::kernel kernel; + + initXrtLoadKernel(device, kernel, verbosity, vm["xclbin"].as(), + vm["kernel"].as()); + + /* + **************************************************************************** + * Set up the buffer objects + **************************************************************************** + */ + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + auto bo_inA = xrt::bo(device, inImageRGBA.total() * inImageRGBA.elemSize(), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_inB = xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_out = + xrt::bo(device, (outImageTest.total() * outImageTest.elemSize()), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects.\n"; + + uint8_t *bufInA = bo_inA.map(); + + // Copy cv::Mat input image to xrt buffer object + memcpy(bufInA, inImageRGBA.data, + (inImageRGBA.total() * inImageRGBA.elemSize())); + + // Copy instruction stream to xrt buffer object + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + // Sync host to device memories + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inB.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + // Execute the kernel and wait to finish + if (verbosity >= 1) + std::cout << "Running Kernel.\n"; + auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + run.wait(); + + // Sync device to host memories + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + // Store result in cv::Mat + uint8_t *bufOut = bo_out.map(); + memcpy(outImageTest.data, bufOut, + (outImageTest.total() * outImageTest.elemSize())); + + /* + **************************************************************************** + * Compare to OpenCV reference + **************************************************************************** + */ + int numberOfDifferences = 0; + double errorPerPixel = 0; + imageCompare(outImageTest, outImageReference, numberOfDifferences, + errorPerPixel, true, false); + printf("Number of differences: %d, average L1 error: %f\n", + numberOfDifferences, errorPerPixel); + + cv::cvtColor(outImageTest, outImageTestBGR, cv::COLOR_RGBA2BGR); + cv::imwrite(fileOut, outImageTestBGR); + + // Print Pass/Fail result of our test + int res = 0; + if (errorPerPixel < epsilon) { + printf("PASS!\n"); + res = 0; + } else { + printf("Fail!\n"); + res = -1; + } + + if (vm.count("live") || vm.count("video")) { + if (vm.count("live")) + std::cout << "Using live webcam input" << std::endl; + else + std::cout << "Reading movie file " << vm["video"].as() + << std::endl; cv::VideoCapture cap; try { - initializeVideoCapture(cap); + if (vm.count("live")) + initializeVideoCapture(cap); + else + initializeVideoFile(cap, vm["video"].as()); } catch (const std::exception &ex) { std::cerr << ex.what() << "\n\n"; return 1; } - //--- GRAB AND SHOW LOOP + //--- frame grab + process std::cout << "Start grabbing" << std::endl << "Press any key to terminate" << std::endl; cv::Mat frame; @@ -129,222 +267,47 @@ int main(int argc, const char *argv[]) { break; } - cv::Mat edgeFrame; - edgeDetect(frame, edgeFrame); + // cv::Mat edgeFrame; + // edgeDetect(frame,edgeFrame); - // show live and wait for a key with timeout long enough to show images - cv::imshow("Live", edgeFrame); - if (cv::waitKey(5) >= 0) - break; - } - } + cv::resize(frame, inImage, cv::Size(testImageWidth, testImageHeight)); + cv::cvtColor(inImage, inImageRGBA, cv::COLOR_BGR2RGBA); + // Copy cv::Mat input image to xrt buffer object + memcpy(bufInA, inImageRGBA.data, + (inImageRGBA.total() * inImageRGBA.elemSize())); - else { - - /* - **************************************************************************** - * Read the input image or generate random one if no input file argument - * provided - **************************************************************************** - */ - cv::Mat inImage, inImageRGBA; - cv::String fileIn; - if (vm.count("image")) { - fileIn = vm["image"].as(); - //"/group/xrlabs/imagesAndVideos/images/minion128x128.jpg"; - initializeSingleImageTest(fileIn, inImage); - } else { - fileIn = "RANDOM"; - inImage = cv::Mat(testImageHeight, testImageWidth, CV_8UC3); - cv::randu(inImage, cv::Scalar(0, 0, 0), cv::Scalar(255, 255, 255)); - } + // Copy instruction stream to xrt buffer object + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); - cv::String fileOut = - vm["outfile"].as(); //"edgeDetectOut_test.jpg"; - printf("Load input image %s and run edgeDetect\n", fileIn.c_str()); - - cv::resize(inImage, inImage, cv::Size(testImageWidth, testImageHeight)); - cv::cvtColor(inImage, inImageRGBA, cv::COLOR_BGR2RGBA); - - /* - **************************************************************************** - * Calculate OpenCV referennce for edgeDetect - **************************************************************************** - */ - - cv::Mat outImageReference, outImageTestBGR; - edgeDetect(inImage, outImageReference); - - cv::cvtColor(outImageReference, outImageReference, cv::COLOR_BGR2RGBA); - cv::Mat outImageTest(testImageHeight, testImageWidth, CV_8UC4); - - /* - **************************************************************************** - * Load instruction sequence - **************************************************************************** - */ - std::vector instr_v = - load_instr_sequence(vm["instr"].as()); - - int verbosity = vm["verbosity"].as(); - if (verbosity >= 1) - std::cout << "Sequence instr count: " << instr_v.size() << "\n"; - - /* - **************************************************************************** - * Start the XRT context and load the kernel - **************************************************************************** - */ - xrt::device device; - xrt::kernel kernel; - - initXrtLoadKernel(device, kernel, verbosity, vm["xclbin"].as(), - vm["kernel"].as()); - - /* - **************************************************************************** - * Set up the buffer objects - **************************************************************************** - */ - auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); - auto bo_inA = xrt::bo(device, inImageRGBA.total() * inImageRGBA.elemSize(), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inB = - xrt::bo(device, 1, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_out = - xrt::bo(device, (outImageTest.total() * outImageTest.elemSize()), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); - - if (verbosity >= 1) - std::cout << "Writing data into buffer objects.\n"; - - uint8_t *bufInA = bo_inA.map(); - - // Copy cv::Mat input image to xrt buffer object - memcpy(bufInA, inImageRGBA.data, - (inImageRGBA.total() * inImageRGBA.elemSize())); - - // Copy instruction stream to xrt buffer object - void *bufInstr = bo_instr.map(); - memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); - - // Sync host to device memories - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inB.sync(XCL_BO_SYNC_BO_TO_DEVICE); - - // Execute the kernel and wait to finish - if (verbosity >= 1) - std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); - run.wait(); - - // Sync device to host memories - bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); - - // Store result in cv::Mat - uint8_t *bufOut = bo_out.map(); - memcpy(outImageTest.data, bufOut, - (outImageTest.total() * outImageTest.elemSize())); - - /* - **************************************************************************** - * Compare to OpenCV reference - **************************************************************************** - */ - int numberOfDifferences = 0; - double errorPerPixel = 0; - imageCompare(outImageTest, outImageReference, numberOfDifferences, - errorPerPixel, true, false); - printf("Number of differences: %d, average L1 error: %f\n", - numberOfDifferences, errorPerPixel); - - cv::cvtColor(outImageTest, outImageTestBGR, cv::COLOR_RGBA2BGR); - cv::imwrite(fileOut, outImageTestBGR); - - // Print Pass/Fail result of our test - int res = 0; - if (errorPerPixel < epsilon) { - printf("PASS!\n"); - res = 0; - } else { - printf("Fail!\n"); - res = -1; - } + // Sync host to device memories + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inB.sync(XCL_BO_SYNC_BO_TO_DEVICE); - if (vm.count("live") || vm.count("video")) { - if (vm.count("live")) - std::cout << "Using live webcam input" << std::endl; - else - std::cout << "Reading movie file " << vm["video"].as() - << std::endl; - - cv::VideoCapture cap; - try { - if (vm.count("live")) - initializeVideoCapture(cap); - else - initializeVideoFile(cap, vm["video"].as()); - } catch (const std::exception &ex) { - std::cerr << ex.what() << "\n\n"; - return 1; - } + // Execute the kernel and wait to finish + if (verbosity >= 1) + std::cout << "Running Kernel.\n"; - //--- frame grab + process - std::cout << "Start grabbing" << std::endl - << "Press any key to terminate" << std::endl; - cv::Mat frame; - for (;;) { - // wait for a new frame from camera and store it into 'frame' - cap.read(frame); - // check if we succeeded - if (frame.empty()) { - std::cerr << "ERROR! blank frame grabbed\n"; - break; - } - - // cv::Mat edgeFrame; - // edgeDetect(frame,edgeFrame); - - cv::resize(frame, inImage, cv::Size(testImageWidth, testImageHeight)); - cv::cvtColor(inImage, inImageRGBA, cv::COLOR_BGR2RGBA); - // Copy cv::Mat input image to xrt buffer object - memcpy(bufInA, inImageRGBA.data, - (inImageRGBA.total() * inImageRGBA.elemSize())); - - // Copy instruction stream to xrt buffer object - void *bufInstr = bo_instr.map(); - memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); - - // Sync host to device memories - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inB.sync(XCL_BO_SYNC_BO_TO_DEVICE); - - // Execute the kernel and wait to finish - if (verbosity >= 1) - std::cout << "Running Kernel.\n"; - auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); - run.wait(); - - // Sync device to host memories - bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); - - // Store result in cv::Mat - uint8_t *bufOut = bo_out.map(); - memcpy(outImageTest.data, bufOut, - (outImageTest.total() * outImageTest.elemSize())); - - // show live and wait for a key with timeout long enough to show images - cv::cvtColor(outImageTest, outImageTestBGR, cv::COLOR_RGBA2BGR); - cv::imshow("Edge AIE", outImageTestBGR); - if (cv::waitKey(5) >= 0) - break; - } - } + auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + run.wait(); - printf("Testing edgeDetect done!\n"); - return res; + // Sync device to host memories + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + // Store result in cv::Mat + uint8_t *bufOut = bo_out.map(); + memcpy(outImageTest.data, bufOut, + (outImageTest.total() * outImageTest.elemSize())); + + // show live and wait for a key with timeout long enough to show images + cv::cvtColor(outImageTest, outImageTestBGR, cv::COLOR_RGBA2BGR); + cv::imshow("Edge AIE", outImageTestBGR); + if (cv::waitKey(5) >= 0) + break; + } } + + printf("Testing edgeDetect done!\n"); + return res; } diff --git a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc index 1f1bc23d4c..b3ff661be4 100644 --- a/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc +++ b/reference_designs/ipu-xrt/vision_pipelines/vision_kernels/rgba2gray.cc @@ -56,7 +56,7 @@ __attribute__((noinline)) void rgba2gray_aie(uint8_t *rgba_in, uint8_t *y_out, ::aie::vector r, g, b; ::aie::vector y; - for (int j = 0; (j < (width * height) / 32); j += 1) + for (int j = 0; (j < (width * height) >> 5); j += 1) chess_prepare_for_pipelining { xf_extract_rgb(rgba_in, r, g, b);