From c61bd3387403b76d618915ccebf5e9585f52a071 Mon Sep 17 00:00:00 2001 From: Jenkins Date: Wed, 25 Sep 2024 16:09:05 +0000 Subject: [PATCH] Compute Library v24.09 --- Android.bp | 5 +- CMakeLists.txt | 21 +- README.md | 24 +- SConscript | 4 +- SConstruct | 8 +- arm_compute/core/utils/DataTypeUtils.h | 3 + arm_compute/runtime/CL/CLScheduler.h | 10 +- arm_compute/runtime/CL/CLTensorAllocator.h | 11 +- arm_compute/runtime/ITensorAllocator.h | 14 +- .../NEON/functions/NEArithmeticSubtraction.h | 1 - .../functions/NEPixelWiseMultiplication.h | 3 +- .../runtime/NEON/functions/NEReverse.h | 15 +- arm_compute/runtime/TensorAllocator.h | 10 +- .../runtime/experimental/operators/CpuMul.h | 24 +- .../experimental/operators/CpuSoftmax.h | 104 +++++ docs/Doxyfile | 2 +- .../how_to_build_and_run_examples.dox | 4 +- docs/user_guide/operator_list.dox | 6 +- filelist.json | 11 +- src/BUILD.bazel | 4 + src/CMakeLists.txt | 4 + src/common/cpuinfo/CpuInfo.cpp | 16 +- src/core/CL/CLCommandBuffer.h | 163 ------- src/core/CL/CLCompatCommandBuffer.cpp | 108 ----- src/core/CL/CLCompatCommandBuffer.h | 94 ---- src/core/CL/CLMutableCommandBuffer.cpp | 151 ------- src/core/CL/CLMutableCommandBuffer.h | 85 ---- src/core/CL/cl_kernels/nchw/pooling_layer.cl | 15 +- .../nhwc/pooling_3d_layer_quantized.cl | 11 +- .../nhwc/pooling_layer_quantized.cl | 13 +- src/core/CL/kernels/CLReverseKernel.cpp | 9 +- src/core/NEON/NEMath.h | 15 +- src/core/NEON/NEMath.inl | 75 +++- src/core/NEON/kernels/NEReverseKernel.cpp | 11 +- src/core/Utils.cpp | 6 +- src/core/helpers/LUTManager.cpp | 58 ++- src/core/helpers/LUTManager.h | 45 +- src/core/helpers/MemoryHelpers.h | 40 +- src/core/helpers/PoolingHelpers.h | 27 +- src/cpu/kernels/CpuActivationKernel.cpp | 143 +------ src/cpu/kernels/CpuActivationKernel.h | 24 +- src/cpu/kernels/CpuAddKernel.cpp | 30 +- .../CpuDepthwiseConv2dNativeKernel.cpp | 3 +- src/cpu/kernels/CpuKernelSelectionTypes.h | 1 + src/cpu/kernels/CpuMulKernel.cpp | 10 + src/cpu/kernels/CpuSoftmaxKernel.h | 4 +- src/cpu/kernels/CpuSubKernel.cpp | 19 +- .../CpuActivationKernelHeuristics.cpp | 251 +++++++++++ .../CpuActivationKernelHeuristics.h | 112 +++++ .../kernels/add/generic/sme2/impl.cpp} | 48 +-- src/cpu/kernels/add/generic/sme2/impl.h | 46 ++ .../add/generic/sme2/qasymm8_signed.cpp | 403 ++++++++++++++++++ src/cpu/kernels/add/list.h | 10 +- ...puDepthwiseConv2dAssemblyWrapperKernel.cpp | 3 +- src/cpu/kernels/pool2d/neon/quantized.h | 41 +- src/cpu/kernels/pool3d/neon/quantized.h | 40 +- src/cpu/kernels/softmax/generic/neon/fp16.cpp | 6 +- src/cpu/kernels/softmax/generic/neon/fp32.cpp | 6 +- src/cpu/kernels/softmax/generic/neon/impl.cpp | 74 +++- .../kernels/softmax/generic/neon/qasymm8.cpp | 6 +- .../softmax/generic/neon/qasymm8_signed.cpp | 6 +- src/cpu/kernels/softmax/generic/sme2/fp16.cpp | 9 +- src/cpu/kernels/softmax/generic/sme2/fp32.cpp | 9 +- .../kernels/softmax/generic/sme2/qasymm8.cpp | 6 +- .../softmax/generic/sme2/qasymm8_signed.cpp | 7 +- src/cpu/kernels/softmax/list.h | 10 +- src/cpu/kernels/sub/neon/impl.h | 49 ++- src/cpu/kernels/sub/neon/qsymm16.cpp | 36 +- src/cpu/operators/CpuConv2d.cpp | 9 +- src/cpu/operators/CpuDepthwiseConv2d.cpp | 8 +- src/runtime/CL/CLScheduler.cpp | 20 +- src/runtime/CL/CLTensorAllocator.cpp | 7 +- .../CL/functions/CLConvolutionLayer.cpp | 30 +- .../CL/functions/CLFullyConnectedLayer.cpp | 46 +- src/runtime/CL/functions/CLGEMM.cpp | 7 +- .../CL/functions/CLGEMMConvolutionLayer.cpp | 9 +- .../CLGEMMLowpMatrixMultiplyCore.cpp | 9 +- .../functions/CLWinogradConvolutionLayer.cpp 
| 15 +- .../NEON/functions/NEConvolutionLayer.cpp | 30 +- .../functions/NEDepthwiseConvolutionLayer.cpp | 5 +- .../NEON/functions/NEFullyConnectedLayer.cpp | 7 +- src/runtime/NEON/functions/NEGEMM.cpp | 7 +- src/runtime/NEON/functions/NEGEMMConv2d.cpp | 7 +- .../NEON/functions/NEGEMMConvolutionLayer.cpp | 20 +- .../NEGEMMLowpMatrixMultiplyCore.cpp | 22 +- .../functions/NEWinogradConvolutionLayer.cpp | 6 +- src/runtime/OMP/OMPScheduler.cpp | 8 +- src/runtime/TensorAllocator.cpp | 7 +- .../experimental/operators/CpuSoftmax.cpp | 75 ++++ support/Bfloat16.h | 43 +- tests/AssetsLibrary.cpp | 7 +- tests/AssetsLibrary.h | 18 +- tests/datasets/DatatypeDataset.h | 58 ++- tests/datasets/ShapeDatasets.h | 24 ++ tests/framework/Framework.cpp | 16 +- tests/framework/Framework.h | 34 +- tests/framework/Macros.h | 32 +- tests/main.cpp | 24 +- tests/validation/CL/Pooling3dLayer.cpp | 46 +- tests/validation/CL/PoolingLayer.cpp | 59 ++- tests/validation/CL/Reverse.cpp | 226 +++++++++- tests/validation/CMakeLists.txt | 4 +- tests/validation/CPP/LUT.cpp | 151 +++++++ tests/validation/Helpers.cpp | 38 +- tests/validation/Helpers.h | 9 + tests/validation/NEON/ActivationLayer.cpp | 42 -- tests/validation/NEON/ArithmeticAddition.cpp | 53 ++- .../validation/NEON/ArithmeticSubtraction.cpp | 138 +++++- .../NEON/DepthwiseConvolutionLayer.cpp | 65 +++ tests/validation/NEON/LogSoftmaxLayer.cpp | 37 ++ .../NEON/PixelWiseMultiplication.cpp | 123 +++++- tests/validation/NEON/Pooling3dLayer.cpp | 34 ++ tests/validation/NEON/PoolingLayer.cpp | 34 ++ tests/validation/NEON/Reverse.cpp | 185 ++++++-- tests/validation/fixtures/CpuSoftmaxFixture.h | 143 +++++++ tests/validation/fixtures/ReverseFixture.h | 18 +- tests/validation/reference/Reverse.cpp | 14 +- .../runtime/experimental/operators/CpuAdd.cpp | 4 +- .../experimental/operators/CpuElementwise.cpp | 8 +- .../experimental/operators/CpuGemm.cpp | 55 ++- .../runtime/experimental/operators/CpuMul.cpp | 2 + .../experimental/operators/CpuSoftmax.cpp | 172 ++++++++ .../runtime/experimental/operators/CpuSub.cpp | 4 +- utils/Utils.h | 35 +- 124 files changed, 3554 insertions(+), 1397 deletions(-) create mode 100644 arm_compute/runtime/experimental/operators/CpuSoftmax.h delete mode 100644 src/core/CL/CLCommandBuffer.h delete mode 100644 src/core/CL/CLCompatCommandBuffer.cpp delete mode 100644 src/core/CL/CLCompatCommandBuffer.h delete mode 100644 src/core/CL/CLMutableCommandBuffer.cpp delete mode 100644 src/core/CL/CLMutableCommandBuffer.h create mode 100644 src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp create mode 100644 src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.h rename src/{core/CL/CLCommandBuffer.cpp => cpu/kernels/add/generic/sme2/impl.cpp} (55%) create mode 100644 src/cpu/kernels/add/generic/sme2/impl.h create mode 100644 src/cpu/kernels/add/generic/sme2/qasymm8_signed.cpp create mode 100644 src/runtime/experimental/operators/CpuSoftmax.cpp create mode 100644 tests/validation/CPP/LUT.cpp create mode 100644 tests/validation/fixtures/CpuSoftmaxFixture.h create mode 100644 tests/validation/runtime/experimental/operators/CpuSoftmax.cpp diff --git a/Android.bp b/Android.bp index fd59ef524e..d6516fec72 100644 --- a/Android.bp +++ b/Android.bp @@ -202,12 +202,9 @@ cc_library_static { "src/core/AccessWindowAutoPadding.cpp", "src/core/AccessWindowStatic.cpp", "src/core/AccessWindowTranspose.cpp", - "src/core/CL/CLCommandBuffer.cpp", - "src/core/CL/CLCompatCommandBuffer.cpp", "src/core/CL/CLCompileContext.cpp", 
"src/core/CL/CLHelpers.cpp", "src/core/CL/CLKernelLibrary.cpp", - "src/core/CL/CLMutableCommandBuffer.cpp", "src/core/CL/CLUtils.cpp", "src/core/CL/DefaultLWSHeuristics.cpp", "src/core/CL/ICLKernel.cpp", @@ -466,6 +463,7 @@ cc_library_static { "src/cpu/kernels/activation/generic/neon/qasymm8.cpp", "src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp", "src/cpu/kernels/activation/generic/neon/qsymm16.cpp", + "src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp", "src/cpu/kernels/add/generic/neon/fp16.cpp", "src/cpu/kernels/add/generic/neon/fp32.cpp", "src/cpu/kernels/add/generic/neon/impl.cpp", @@ -1032,6 +1030,7 @@ cc_library_static { "src/runtime/experimental/operators/CpuGemmConv2d.cpp", "src/runtime/experimental/operators/CpuGemmDirectConv2d.cpp", "src/runtime/experimental/operators/CpuMul.cpp", + "src/runtime/experimental/operators/CpuSoftmax.cpp", "src/runtime/experimental/operators/CpuSub.cpp", "src/runtime/experimental/operators/CpuTranspose.cpp", "src/runtime/experimental/operators/CpuWinogradConv2d.cpp", diff --git a/CMakeLists.txt b/CMakeLists.txt index cb99dee99e..321a83bfbb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,7 +28,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR) list(APPEND CMAKE_MESSAGE_CONTEXT ArmCompute) project( ArmCompute - VERSION 41.0.0 + VERSION 42.0.0 DESCRIPTION "The Arm Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A CPU and Arm® Mali™ GPU architectures" LANGUAGES C CXX ASM) @@ -138,11 +138,12 @@ if(ARM_COMPUTE_OPENMP) endif() # --------------------------------------------------------------------- -# SVE Library +# SVE Object Library -add_library(arm_compute_sve "") +add_library(arm_compute_sve OBJECT "") target_compile_options(arm_compute_sve - PRIVATE "-march=armv8.2-a+sve+fp16+dotprod") + PRIVATE "-march=armv8.2-a+sve+fp16+dotprod" + PRIVATE "-fPIC") target_compile_definitions(arm_compute_sve PRIVATE ARM_COMPUTE_ENABLE_BF16) target_compile_definitions(arm_compute_sve PRIVATE ENABLE_SVE) target_compile_definitions(arm_compute_sve PRIVATE ARM_COMPUTE_ENABLE_SVE) @@ -160,11 +161,12 @@ target_include_directories( src/core/NEON/kernels/arm_gemm/merges) # --------------------------------------------------------------------- -# SVE2 Library +# SVE2 Object Library -add_library(arm_compute_sve2 "") +add_library(arm_compute_sve2 OBJECT "") target_compile_options(arm_compute_sve2 - PRIVATE "-march=armv8.6-a+sve2+fp16+dotprod") + PRIVATE "-march=armv8.6-a+sve2+fp16+dotprod" + PRIVATE "-fPIC") target_compile_definitions(arm_compute_sve2 PRIVATE ARM_COMPUTE_ENABLE_SVE2) target_compile_definitions(arm_compute_sve2 PRIVATE ARM_COMPUTE_ENABLE_BF16) target_compile_definitions(arm_compute_sve2 PRIVATE ENABLE_SVE) @@ -205,8 +207,11 @@ target_include_directories( target_compile_options(arm_compute PUBLIC ${COMMON_CXX_FLAGS}) add_library(ArmCompute::Core ALIAS arm_compute) + +# arm_compute_sve and arm_compute_sve2 obj files will not be public in the arm_compute.so target_link_libraries( - arm_compute PUBLIC arm_compute_sve arm_compute_sve2) + arm_compute PRIVATE $ + PRIVATE $) # --------------------------------------------------------------------- # Graph Library diff --git a/README.md b/README.md index 64474b9890..97ffe318c4 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@

-# Compute Library ![](https://img.shields.io/badge/latest_release-24.08.1-green)
+# Compute Library ![](https://img.shields.io/badge/latest_release-24.09-green)
 The Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A, Arm® Neoverse® and Arm® Mali™ GPUs architectures.
@@ -37,7 +37,7 @@ Key Features:
## Documentation -[![Documentation](https://img.shields.io/badge/documentation-24.08.1-green)](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08.1/index.xhtml) +[![Documentation](https://img.shields.io/badge/documentation-24.09-green)](https://artificial-intelligence.sites.arm.com/computelibrary/v24.09/index.xhtml) > Note: The documentation includes the reference API, changelogs, build guide, contribution guide, errata, etc. @@ -50,22 +50,22 @@ All the binaries can be downloaded from [here](https://github.com/ARM-software/C | Platform | Operating System | Release archive (Download) | | -------------- | ---------------- | -------------------------- | -| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-armv7a-cpu-bin.tar.gz) | -| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-bin.tar.gz) | -| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-gpu-bin.tar.gz) | -| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-gpu-bin.tar.gz) | +| Raspberry Pi 4 | Linux® 32bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-armv7a-cpu-bin.tar.gz) | +| Raspberry Pi 4 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) | +| Odroid N2 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-gpu-bin.tar.gz) | +| HiKey960 | Linux® 64bit | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-gpu-bin.tar.gz) |
| Architecture | Operating System | Release archive (Download) | | ------------ | ---------------- | -------------------------- | -| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-armv7a-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-armv7a-cpu-gpu-bin.tar.gz) | -| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-android-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-android-aarch64-cpu-gpu-bin.tar.gz) | -| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-gpu-bin.tar.gz) | +| armv7 | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-armv7a-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-armv7a-cpu-gpu-bin.tar.gz) | +| arm64-v8a | Android™ | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-android-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-android-aarch64-cpu-gpu-bin.tar.gz) | +| arm64-v8a | Linux® | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.09/arm_compute-v24.09-linux-aarch64-cpu-gpu-bin.tar.gz) |
-Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.08.1-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.08.1) +Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.09-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.09) Pre-build binaries are generated with the following security / good coding practices related flags: > -Wall, -Wextra, -Wformat=2, -Winit-self, -Wstrict-overflow=2, -Wswitch-default, -Woverloaded-virtual, -Wformat-security, -Wctor-dtor-privacy, -Wsign-promo, -Weffc++, -pedantic, -fstack-protector-strong @@ -107,13 +107,13 @@ Pre-build binaries are generated with the following security / good coding pract ## Experimental builds -**⚠ Important** Bazel and CMake builds are experimental CPU only builds, please see the [documentation](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08.1/how_to_build.xhtml) for more details. +**⚠ Important** Bazel and CMake builds are experimental CPU only builds, please see the [documentation](https://artificial-intelligence.sites.arm.com/computelibrary/v24.09/how_to_build.xhtml) for more details.
## How to contribute -Contributions to the Compute Library are more than welcome. If you are interested on contributing, please have a look at our [how to contribute guidelines](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08.1/contribution_guidelines.xhtml). +Contributions to the Compute Library are more than welcome. If you are interested on contributing, please have a look at our [how to contribute guidelines](https://artificial-intelligence.sites.arm.com/computelibrary/v24.09/contribution_guidelines.xhtml). ### Developer Certificate of Origin (DCO) Before the Compute Library accepts your contribution, you need to certify its origin and give us your permission. To manage this process we use the Developer Certificate of Origin (DCO) V1.1 (https://developercertificate.org/) diff --git a/SConscript b/SConscript index bd8f034c9c..2aff67d8ca 100644 --- a/SConscript +++ b/SConscript @@ -33,8 +33,8 @@ import codecs import platform import SCons -VERSION = "v24.08.1" -LIBRARY_VERSION_MAJOR = 41 +VERSION = "v24.09" +LIBRARY_VERSION_MAJOR = 42 LIBRARY_VERSION_MINOR = 0 LIBRARY_VERSION_PATCH = 0 SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH) diff --git a/SConstruct b/SConstruct index 941f173d3d..c4bfef826d 100644 --- a/SConstruct +++ b/SConstruct @@ -281,8 +281,12 @@ if env['cppthreads']: if env['openmp']: env.Append(CPPDEFINES = [('ARM_COMPUTE_OPENMP_SCHEDULER', 1)]) - env.Append(CXXFLAGS = ['-fopenmp']) - env.Append(LINKFLAGS = ['-fopenmp']) + if not 'windows' in env['os']: + env.Append(CXXFLAGS = ['-fopenmp']) + env.Append(LINKFLAGS = ['-fopenmp']) + else: + env.Append(CXXFLAGS = ['-openmp']) + env.Append(LINKFLAGS = ['libomp.lib']) # Validate and define state if env['estate'] == 'auto': diff --git a/arm_compute/core/utils/DataTypeUtils.h b/arm_compute/core/utils/DataTypeUtils.h index 6fabb19b64..b19a3dd1e7 100644 --- a/arm_compute/core/utils/DataTypeUtils.h +++ b/arm_compute/core/utils/DataTypeUtils.h @@ -97,9 +97,12 @@ inline size_t element_size_from_data_type(DataType dt) case DataType::S32: case DataType::F32: return 4; + case DataType::F64: case DataType::U64: case DataType::S64: return 8; + case DataType::SIZET: + return sizeof(size_t); // portable default: ARM_COMPUTE_ERROR("Undefined element size for given data type"); return 0; diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h index b74fcb74ef..3b99cb40ac 100644 --- a/arm_compute/runtime/CL/CLScheduler.h +++ b/arm_compute/runtime/CL/CLScheduler.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_CLSCHEDULER_H -#define ARM_COMPUTE_CLSCHEDULER_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_CLSCHEDULER_H +#define ACL_ARM_COMPUTE_RUNTIME_CL_CLSCHEDULER_H #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLTypes.h" @@ -211,6 +211,8 @@ class CLScheduler final bool _job_chaining_enabled; int _job_chaining_size; int _job_chaining_count; + unsigned int _enqueue_count; + unsigned int _flush_count; }; } // namespace arm_compute -#endif /* ARM_COMPUTE_CLSCHEDULER_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_CL_CLSCHEDULER_H diff --git a/arm_compute/runtime/CL/CLTensorAllocator.h b/arm_compute/runtime/CL/CLTensorAllocator.h index fde8e9c43a..763a1e4b13 100644 --- a/arm_compute/runtime/CL/CLTensorAllocator.h +++ b/arm_compute/runtime/CL/CLTensorAllocator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_CLTENSORALLOCATOR_H -#define ARM_COMPUTE_CLTENSORALLOCATOR_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_CLTENSORALLOCATOR_H +#define ACL_ARM_COMPUTE_RUNTIME_CL_CLTENSORALLOCATOR_H #include "arm_compute/core/CL/CLTypes.h" #include "arm_compute/core/CL/OpenCL.h" @@ -106,6 +106,9 @@ class CLTensorAllocator : public ITensorAllocator * */ void free() override; + + bool is_allocated() const override; + /** Import an existing memory as a tensor's backing memory * * @warning memory should have been created under the same context that Compute Library uses. @@ -156,4 +159,4 @@ class CLTensorAllocator : public ITensorAllocator CLInt32Array _offset; /**< Offsets array in case of quantized per channel data type */ }; } // namespace arm_compute -#endif /* ARM_COMPUTE_CLTENSORALLOCATOR_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_CL_CLTENSORALLOCATOR_H diff --git a/arm_compute/runtime/ITensorAllocator.h b/arm_compute/runtime/ITensorAllocator.h index e2d3536169..2f77cd491c 100644 --- a/arm_compute/runtime/ITensorAllocator.h +++ b/arm_compute/runtime/ITensorAllocator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_ITENSORALLOCATOR_H -#define ARM_COMPUTE_ITENSORALLOCATOR_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_ITENSORALLOCATOR_H +#define ACL_ARM_COMPUTE_RUNTIME_ITENSORALLOCATOR_H #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" @@ -91,6 +91,12 @@ class ITensorAllocator */ virtual void free() = 0; + /** Return whether the tensor is currently allocated. + * + * @return true if the tensor is allocated, false otherwise. + */ + virtual bool is_allocated() const = 0; + protected: /** Interface to be implemented by the child class to lock the memory allocation for the CPU to access. 
* @@ -106,4 +112,4 @@ class ITensorAllocator size_t _alignment{}; /**< Tensor's alignment in bytes */ }; } // namespace arm_compute -#endif /*ARM_COMPUTE_ITENSORALLOCATOR_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_ITENSORALLOCATOR_H diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h index 3268781c65..46c83eb827 100644 --- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h +++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h @@ -67,7 +67,6 @@ class NEArithmeticSubtraction : public IFunction * |QASYMM8 |QASYMM8 |QASYMM8 | * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED | * |QSYMM16 |QSYMM16 |QASYMM16 | - * |QSYMM16 |QSYMM16 |S32 | * |U8 |U8 |U8 | * |S16 |S16 |S16 | * |S32 |S32 |S32 | diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h index bfb4bc83b5..f1f983b282 100644 --- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h +++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h @@ -69,8 +69,9 @@ class NEPixelWiseMultiplication : public IFunction * |U8 |S16 |S16 | * |S16 |U8 |S16 | * |S16 |S16 |S16 | + * |S32 |S32 |S32 | * |F16 |F16 |F16 | - * |F32 |S32 |F32 | + * |F32 |F32 |F32 | * * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. diff --git a/arm_compute/runtime/NEON/functions/NEReverse.h b/arm_compute/runtime/NEON/functions/NEReverse.h index 4ac397a980..aa44e39a0b 100644 --- a/arm_compute/runtime/NEON/functions/NEReverse.h +++ b/arm_compute/runtime/NEON/functions/NEReverse.h @@ -42,11 +42,11 @@ class NEReverse : public INESimpleFunctionNoBorder * - All * * Valid data type configurations: - * |src0 |src1 |dst | - * |:--------------|:--------------|:--------------| - * |All |U32, S32 |All | + * |src0 |src1 |dst | + * |:---------------------------|:--------------|:---------------------------| + * |All except SIZET <= 32-bits |U32, S32 |All except SIZET <= 32-bits | * - * @param[in] input Input tensor. Data types supported: All + * @param[in] input Input tensor. Data types supported: All except SIZET <= 32-bit data types * @param[out] output Output tensor. Data type supported: Same as @p input * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32 * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis @@ -60,12 +60,7 @@ class NEReverse : public INESimpleFunctionNoBorder void configure(const ITensor *input, ITensor *output, const ITensor *axis, const bool use_inverted_axis = false); /** Static function to check if given info will lead to a valid configuration of NEReverseKernel * - * @param[in] input Input tensor info. Data types supported: All - * @param[in] output Output tensor info. Data type supported: Same as @p input - * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. 
Data type supported: U32/S32 - * @param[in] use_inverted_axis Reverse ACL axis indices convention, if true, (inverted)axis = (tensor_rank - 1) - axis - * - * @return a status + * Similar to @ref NEReverse::configure() */ static Status validate(const ITensorInfo *input, const ITensorInfo *output, diff --git a/arm_compute/runtime/TensorAllocator.h b/arm_compute/runtime/TensorAllocator.h index d819931415..f25108d747 100644 --- a/arm_compute/runtime/TensorAllocator.h +++ b/arm_compute/runtime/TensorAllocator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2019 Arm Limited. + * Copyright (c) 2016-2019, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TENSORALLOCATOR_H -#define ARM_COMPUTE_TENSORALLOCATOR_H +#ifndef ACL_ARM_COMPUTE_RUNTIME_TENSORALLOCATOR_H +#define ACL_ARM_COMPUTE_RUNTIME_TENSORALLOCATOR_H #include "arm_compute/runtime/ITensorAllocator.h" #include "arm_compute/runtime/Memory.h" #include "arm_compute/runtime/MemoryGroup.h" @@ -84,6 +84,8 @@ class TensorAllocator : public ITensorAllocator */ void allocate() override; + bool is_allocated() const override; + /** Free allocated CPU memory. * * @note The tensor must have been allocated when calling this function. @@ -126,4 +128,4 @@ class TensorAllocator : public ITensorAllocator Memory _memory; /**< CPU memory */ }; } // namespace arm_compute -#endif /* ARM_COMPUTE_TENSORALLOCATOR_H */ +#endif // ACL_ARM_COMPUTE_RUNTIME_TENSORALLOCATOR_H diff --git a/arm_compute/runtime/experimental/operators/CpuMul.h b/arm_compute/runtime/experimental/operators/CpuMul.h index d5ef33d08b..10a9c40a46 100644 --- a/arm_compute/runtime/experimental/operators/CpuMul.h +++ b/arm_compute/runtime/experimental/operators/CpuMul.h @@ -56,27 +56,7 @@ class CpuMul : public INEOperator ~CpuMul() override; /** Initialise the kernel's inputs, dst and convertion policy. * - * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. - * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. - * - * @param[in, out] src1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32 - * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[in, out] src2 Second input tensor info. Data types supported: U8, QASYMM8 (only if @p src1 is QASYMM8), QASYMM8_SIGNED (only if @p src1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if @p src1 is QSYMM16), F16 (only if @p src1 is F16), F32 (only if @p src1 is F32). - * This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. - * @param[out] dst dst tensor info. Data types supported: - * - U8, only if both inputs are U8. - * - QASYMM8, only if both inputs are QASYMM8. - * - QASYMM8_SIGNED, only if @p src1 is QASYMM8_SIGNED. - * - S16. - * - QSYMM16, only if both inputs are QSYMM16. - * - S32, only if both inputs are S32 or both are QSYMM16. - * - F16, only if @p src1 is F16. - * - F32, only if both inputs are F32. - * @param[in] scale Scale to apply after multiplication. - * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. - * If both @p src1, @p src2 and @p dst are of datatype S32, scale cannot be 1/255 - * @param[in] overflow_policy Overflow policy. 
ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype - * @param[in] rounding_policy Rounding policy. @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. + * Similar to @ref NEPixelWiseMultiplication::configure() */ void configure(ITensorInfo *src1, ITensorInfo *src2, @@ -87,7 +67,7 @@ class CpuMul : public INEOperator const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * - * Similar to @ref CpuMul::configure() + * Similar to @ref NEPixelWiseMultiplication::validate() * * @return a status */ diff --git a/arm_compute/runtime/experimental/operators/CpuSoftmax.h b/arm_compute/runtime/experimental/operators/CpuSoftmax.h new file mode 100644 index 0000000000..1ac94288fd --- /dev/null +++ b/arm_compute/runtime/experimental/operators/CpuSoftmax.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2021-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUSOFTMAX_H +#define ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUSOFTMAX_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/IOperator.h" + +#include + +namespace arm_compute +{ +namespace experimental +{ +namespace op +{ +class CpuSoftmaxKernel; + +/* + * A shallow wrapper for arm_compute::cpu::CpuSoftmaxGeneric. + * Any new features should be added to arm_compute::cpu::CpuSoftmaxGeneric + * and arm_compute::experimental::op::CpuSoftmax should remain a shallow wrapper. + */ +class CpuSoftmax : public IOperator +{ +public: + /** Constructor **/ + CpuSoftmax(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CpuSoftmax(const CpuSoftmax &) = delete; + /** Prevent copy assignment */ + CpuSoftmax &operator=(const CpuSoftmax &) = delete; + /** Default move constructor */ + CpuSoftmax(CpuSoftmax &&) = default; + /** Default move assignment */ + CpuSoftmax &operator=(CpuSoftmax &&) = default; + /** Default destructor */ + ~CpuSoftmax() override; + /** Set the input and output tensors. + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------| + * |QASYMM8 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED | + * |F16 |F16 | + * |F32 |F32 | + * + * @param[in,out] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
+ * last value of each row to the nearest multiple. + * @param[out] dst Destination tensor ifo. Data types supported: same as @p input. + * @param[in] beta (Optional) A scaling factor for the exponent. + * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and + * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0 + * @param[in] is_log True if the operation is log-softmax + */ + void configure(const ITensorInfo *src, ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref CpuSoftmax::configure() + * + * @return a status + */ + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0, bool is_log = false); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + + // Unused + void prepare(ITensorPack &constants) override; + +private: + struct Impl; + std::unique_ptr impl_; +}; + +} // namespace op +} // namespace experimental +} // namespace arm_compute +#endif // ACL_ARM_COMPUTE_RUNTIME_EXPERIMENTAL_OPERATORS_CPUSOFTMAX_H diff --git a/docs/Doxyfile b/docs/Doxyfile index 57f15d0a78..d92a65f340 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -60,7 +60,7 @@ PROJECT_NAME = "Compute Library" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 24.08.1 +PROJECT_NUMBER = 24.09 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/docs/user_guide/how_to_build_and_run_examples.dox b/docs/user_guide/how_to_build_and_run_examples.dox index 88ccc3d5c8..39b3a2ed6b 100644 --- a/docs/user_guide/how_to_build_and_run_examples.dox +++ b/docs/user_guide/how_to_build_and_run_examples.dox @@ -322,11 +322,9 @@ In this case the first argument of LeNet (like all the graph examples) is the ta @section S1_4_macos Building for macOS -The library was successfully natively built for Apple Silicon under macOS 11.1 using clang v12.0.0. - To natively compile the library with accelerated CPU support: - scons Werror=1 -j8 neon=1 opencl=0 os=macos arch=armv8a build=native + scons Werror=1 -j8 neon=1 opencl=0 os=macos arch=armv8.2-a build=native @note Initial support disables feature discovery through HWCAPS and thread scheduling affinity controls diff --git a/docs/user_guide/operator_list.dox b/docs/user_guide/operator_list.dox index 8e828e88a4..f423260fb5 100644 --- a/docs/user_guide/operator_list.dox +++ b/docs/user_guide/operator_list.dox @@ -208,7 +208,6 @@ where N = batches, C = channels, H = height, W = width, D = depth QASYMM8QASYMM8QASYMM8 QASYMM8_SIGNEDQASYMM8_SIGNEDQASYMM8_SIGNED QSYMM16QSYMM16QASYMM16 - QSYMM16QSYMM16S32 U8U8U8 S16S16S16 S32S32S32 @@ -2319,8 +2318,9 @@ where N = batches, C = channels, H = height, W = width, D = depth U8S16S16 S16U8S16 S16S16S16 + S32S32S32 F16F16F16 - F32S32F32 + F32F32F32 CLPixelWiseMultiplication @@ -2752,7 +2752,7 @@ where N = batches, C = channels, H = height, W = width, D = depth
     <tr><th>src0<th>src1<th>dst
-    <tr><td>All<td>U32, S32<td>All
+    <tr><td>All except SIZET <= 32-bits<td>U32, S32<td>All except SIZET <= 32-bits
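
The NEReverse data-type table above pairs with the runtime API shown in the NEReverse.h hunk earlier in this patch. A minimal usage sketch follows; it assumes the usual Tensor/TensorInfo allocation flow, the shapes, data types and axis value are illustrative only, and only the configure() signature is taken from the header change above.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEReverse.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void reverse_example()
    {
        // Illustrative 8x4 F32 tensors; axis holds the indices of the dimensions to reverse (U32/S32).
        Tensor src, dst, axis;
        src.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        axis.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::U32));

        NEReverse reverse;
        reverse.configure(&src, &dst, &axis, /*use_inverted_axis=*/false);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        axis.allocator()->allocate();

        // ... fill src and write the dimension index to reverse (e.g. 0) into axis ...

        reverse.run();
    }
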
CLReverse diff --git a/filelist.json b/filelist.json index e1de9e0511..5b49a68692 100644 --- a/filelist.json +++ b/filelist.json @@ -120,10 +120,7 @@ ], "gpu": { "common": [ - "src/core/CL/CLCommandBuffer.cpp", - "src/core/CL/CLCompatCommandBuffer.cpp", "src/core/CL/CLCompileContext.cpp", - "src/core/CL/CLMutableCommandBuffer.cpp", "src/core/CL/DefaultLWSHeuristics.cpp", "src/core/CL/CLHelpers.cpp", "src/core/CL/CLKernelLibrary.cpp", @@ -899,6 +896,7 @@ "common": [ "src/cpu/operators/CpuActivation.cpp", "src/cpu/kernels/CpuActivationKernel.cpp", + "src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp", "src/runtime/NEON/functions/NEActivationLayer.cpp" ], "neon": { @@ -958,8 +956,10 @@ "fp16":["src/cpu/kernels/add/generic/sve/fp16.cpp"] }, "sve2": { + "common": ["src/cpu/kernels/add/generic/sme2/impl.cpp"], "qasymm8": [ "src/cpu/kernels/add/generic/sve2/qasymm8.cpp" ], - "qasymm8_signed": [ "src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp" ], + "qasymm8_signed": [ "src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp", + "src/cpu/kernels/add/generic/sme2/qasymm8_signed.cpp" ], "qsymm16": [ "src/cpu/kernels/add/generic/sve2/qsymm16.cpp" ] } } @@ -1586,7 +1586,7 @@ } }, "Gemm": { - "deps": [ "Quantize", "Add", "Sub"], + "deps": [ "Quantize", "Add", "Sub", "Softmax"], "files": { "common": [ "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp", @@ -1617,6 +1617,7 @@ "src/runtime/experimental/operators/CpuGemmConv2d.cpp", "src/runtime/experimental/operators/CpuGemmDirectConv2d.cpp", "src/runtime/experimental/operators/CpuMul.cpp", + "src/runtime/experimental/operators/CpuSoftmax.cpp", "src/runtime/experimental/operators/CpuSub.cpp", "src/runtime/experimental/operators/CpuTranspose.cpp", "src/runtime/experimental/operators/CpuWinogradConv2d.cpp" diff --git a/src/BUILD.bazel b/src/BUILD.bazel index bbfb463d54..4aa157efd5 100644 --- a/src/BUILD.bazel +++ b/src/BUILD.bazel @@ -110,6 +110,8 @@ filegroup( "cpu/kernels/activation/generic/sve2/qasymm8.cpp", "cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp", "cpu/kernels/activation/generic/sve2/qsymm16.cpp", + "cpu/kernels/add/generic/sme2/impl.cpp", + "cpu/kernels/add/generic/sme2/qasymm8_signed.cpp", "cpu/kernels/add/generic/sve2/qasymm8.cpp", "cpu/kernels/add/generic/sve2/qasymm8_signed.cpp", "cpu/kernels/add/generic/sve2/qsymm16.cpp", @@ -743,6 +745,7 @@ filegroup( "cpu/kernels/activation/generic/neon/qasymm8.cpp", "cpu/kernels/activation/generic/neon/qasymm8_signed.cpp", "cpu/kernels/activation/generic/neon/qsymm16.cpp", + "cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp", "cpu/kernels/add/generic/neon/fp16.cpp", "cpu/kernels/add/generic/neon/fp32.cpp", "cpu/kernels/add/generic/neon/impl.cpp", @@ -1044,6 +1047,7 @@ filegroup( "runtime/experimental/operators/CpuGemmConv2d.cpp", "runtime/experimental/operators/CpuGemmDirectConv2d.cpp", "runtime/experimental/operators/CpuMul.cpp", + "runtime/experimental/operators/CpuSoftmax.cpp", "runtime/experimental/operators/CpuSub.cpp", "runtime/experimental/operators/CpuTranspose.cpp", "runtime/experimental/operators/CpuWinogradConv2d.cpp"] + diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 22198050e4..58eca30847 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -337,6 +337,8 @@ target_sources( cpu/kernels/activation/generic/sve2/qasymm8.cpp cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp cpu/kernels/activation/generic/sve2/qsymm16.cpp + cpu/kernels/add/generic/sme2/impl.cpp + cpu/kernels/add/generic/sme2/qasymm8_signed.cpp 
cpu/kernels/add/generic/sve2/qasymm8.cpp cpu/kernels/add/generic/sve2/qasymm8_signed.cpp cpu/kernels/add/generic/sve2/qsymm16.cpp @@ -734,6 +736,7 @@ target_sources( cpu/kernels/activation/generic/neon/qasymm8.cpp cpu/kernels/activation/generic/neon/qasymm8_signed.cpp cpu/kernels/activation/generic/neon/qsymm16.cpp + cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp cpu/kernels/add/generic/neon/fp16.cpp cpu/kernels/add/generic/neon/fp32.cpp cpu/kernels/add/generic/neon/impl.cpp @@ -1035,6 +1038,7 @@ target_sources( runtime/experimental/operators/CpuGemmConv2d.cpp runtime/experimental/operators/CpuGemmDirectConv2d.cpp runtime/experimental/operators/CpuMul.cpp + runtime/experimental/operators/CpuSoftmax.cpp runtime/experimental/operators/CpuSub.cpp runtime/experimental/operators/CpuTranspose.cpp runtime/experimental/operators/CpuWinogradConv2d.cpp diff --git a/src/common/cpuinfo/CpuInfo.cpp b/src/common/cpuinfo/CpuInfo.cpp index d46d8d7773..2352e27a17 100644 --- a/src/common/cpuinfo/CpuInfo.cpp +++ b/src/common/cpuinfo/CpuInfo.cpp @@ -39,6 +39,12 @@ #if !defined(_WIN64) #include /* C++ std::regex takes up a lot of space in the standalone builds */ #include +#else /* !defined(_WIN64) */ +// clang-format off +#include +#include +#include +// clang-format on #endif /* !defined(_WIN64) */ #include @@ -411,7 +417,15 @@ CpuInfo CpuInfo::build() #elif defined(__aarch64__) && defined(_WIN64) /* #elif defined(__aarch64__) && defined(__APPLE__) */ CpuIsaInfo isainfo; isainfo.neon = true; - CpuInfo info(isainfo, {CpuModel::GENERIC}); + if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) + { + isainfo.dot = true; + } + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + const int ncpus = sysinfo.dwNumberOfProcessors; + std::vector cpus_model(ncpus); + CpuInfo info(isainfo, cpus_model); return info; #else /* #elif defined(__aarch64__) && defined(_WIN64) */ CpuInfo info(CpuIsaInfo(), {CpuModel::GENERIC}); diff --git a/src/core/CL/CLCommandBuffer.h b/src/core/CL/CLCommandBuffer.h deleted file mode 100644 index 90e434161e..0000000000 --- a/src/core/CL/CLCommandBuffer.h +++ /dev/null @@ -1,163 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef ACL_SRC_CORE_CL_CLCOMMANDBUFFER_H -#define ACL_SRC_CORE_CL_CLCOMMANDBUFFER_H - -#include "arm_compute/core/CL/OpenCL.h" - -#include -#include -#include - -namespace arm_compute -{ - -/** Command buffer contains a list of commands that is constructed once and later enqueued multiple times. - * - * To prepare a command buffer: - * - Construct a new command buffer targeting a command queue using @ref CLCommandBuffer::create. - * - Add kernel enqueue command to the buffer using @ref CLCommandBuffer::add_kernel. - * The kernel must be ready to be enqueued with all the arguments set. - * - Specify which kernel argument is mutable after the command buffer has been finalized. - * - When all the kernel enqueue commands have been added, call @ref CLCommandBuffer::finalize. - * After this point the command buffer is ready to be executed. - * - * To execute the command buffer: - * - Make any changes in the value which the mutable arguments are pointing to. - * - Call @ref CLCommandBuffer::update to apply the argument value changes. - * - Call @ref CLCommandBuffer::enqueue to enqueue the command buffer to execute. - */ -class CLCommandBuffer -{ -public: - /** Create a new command buffer targeting the specified command queue. - * - * @param[in] queue The command queue to execute the command buffer. - * - * @return A unique pointer to the newly created command buffer. - */ - static std::unique_ptr create(cl_command_queue queue); - - /** Constructor. */ - CLCommandBuffer(); - - /** Destructor. */ - virtual ~CLCommandBuffer(); - - /** Disallow copy constructor. */ - CLCommandBuffer(const CLCommandBuffer &) = delete; - - /** Disallow copy assignment. */ - CLCommandBuffer &operator=(const CLCommandBuffer &) = delete; - - /** Disallow move constructor. */ - CLCommandBuffer(CLCommandBuffer &&other) = delete; - - /** Disallow move assignment. */ - CLCommandBuffer &operator=(CLCommandBuffer &&other) = delete; - - /** Add a kernel enqueue command to the command queue. - * - * This function must be called before the command buffer has been finalized. - * - * @param[in] kernel The CL kernel. - * @param[in] offset The global work offset. - * @param[in] global The global work size. - * @param[in] local The local work size. - */ - virtual void - add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) = 0; - - /** Add the mutable argument to the current kernel enqueue command. - * - * This function must be called after @ref CLCommandBuffer::add_kernel but before the command buffer - * has been finalized. - * - * The pointer must be valid and it must point to the correct value at the time - * @ref CLCommandBuffer::update is called so that the value of the argument - * can be applied successfully to the kernel enqueue command. - * - * @param[in] arg_idx The index of the argument in the current kernel program. - * @param[in] value The pointer to the value of the argument. - */ - template ::value || std::is_pointer::value>> - void add_mutable_argument(cl_uint arg_idx, const T *value) - { - add_mutable_argument_generic(arg_idx, value, sizeof(T)); - } - - /** Finalize the command buffer. */ - virtual void finalize() = 0; - - /** Update the command buffer with new kernel argument values. - * - * This function must be called after the command buffer has been finalized. - * - * All the value pointed by the mutable argument will be applied to the command buffer. - */ - virtual void update() = 0; - - /** Enqueue the command buffer. 
- * - * This function must be called after the command buffer has been finalized. - */ - virtual void enqueue() = 0; - - /** Check if the command buffer has been finalized. - * - * @return true if the command buffer has been finalized. - */ - virtual bool is_finalized() const = 0; - -protected: - /** Add the mutable argument to the current kernel enqueue command. - * - * @see CLCommandBuffer::add_mutable_argument for more information. - */ - virtual void add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) = 0; - - /** The state of the command buffer. */ - enum class State : int32_t - { - /** The command buffer has been created and is being specified. */ - Created, - - /** The command buffer has been finalized and is ready to be executed. */ - Finalized, - }; - - /** Get the state of the command buffer. */ - State state() const; - - /** Set the state of the command buffer. */ - CLCommandBuffer &state(State state); - -private: - State _state{State::Created}; -}; - -} // namespace arm_compute - -#endif // ACL_SRC_CORE_CL_CLCOMMANDBUFFER_H diff --git a/src/core/CL/CLCompatCommandBuffer.cpp b/src/core/CL/CLCompatCommandBuffer.cpp deleted file mode 100644 index 242fd7719c..0000000000 --- a/src/core/CL/CLCompatCommandBuffer.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "src/core/CL/CLCompatCommandBuffer.h" - -#include "arm_compute/core/Error.h" - -#include "src/core/CL/CLUtils.h" - -namespace arm_compute -{ - -CLCompatCommandBuffer::CLCompatCommandBuffer(cl_command_queue queue) : _queue(queue) -{ -} - -CLCompatCommandBuffer::~CLCompatCommandBuffer() -{ -} - -void CLCompatCommandBuffer::add_kernel(cl_kernel kernel, - const cl::NDRange &offset, - const cl::NDRange &global, - const cl::NDRange &local) -{ - ARM_COMPUTE_ERROR_ON(state() != State::Created); - - _kernel_cmds.push_back(KernelCommand{kernel, offset, global, local, {}}); -} - -void CLCompatCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) -{ - ARM_COMPUTE_ERROR_ON(state() != State::Created); - ARM_COMPUTE_ERROR_ON(_kernel_cmds.empty()); - - _kernel_cmds.back().mutable_args.push_back(cl_mutable_dispatch_arg_khr{arg_idx, size, value}); -} - -void CLCompatCommandBuffer::finalize() -{ - ARM_COMPUTE_ERROR_ON(state() != State::Created); - - _kernel_cmds.shrink_to_fit(); - - for (auto &cmd : _kernel_cmds) - { - cmd.mutable_args.shrink_to_fit(); - } - - state(State::Finalized); -} - -void CLCompatCommandBuffer::update() -{ - ARM_COMPUTE_ERROR_ON(state() != State::Finalized); - - // Nothing to do here - The kernel arguments will be updated when each command is enqueued. -} - -void CLCompatCommandBuffer::enqueue() -{ - ARM_COMPUTE_ERROR_ON(state() != State::Finalized); - - for (const auto &cmd : _kernel_cmds) - { - for (const auto &arg : cmd.mutable_args) - { - const auto error = clSetKernelArg(cmd.kernel, arg.arg_index, arg.arg_size, arg.arg_value); - - handle_cl_error("clSetKernelArg", error); - } - - const auto error = - clEnqueueNDRangeKernel(_queue, cmd.kernel, static_cast(cmd.global.dimensions()), - cmd.offset.dimensions() != 0 ? cmd.offset.get() : nullptr, cmd.global.get(), - cmd.local.dimensions() != 0 ? cmd.local.get() : nullptr, 0, nullptr, nullptr); - - handle_cl_error("clEnqueueNDRangeKernel", error); - } -} - -bool CLCompatCommandBuffer::is_finalized() const -{ - return state() == State::Finalized; -} - -} // namespace arm_compute diff --git a/src/core/CL/CLCompatCommandBuffer.h b/src/core/CL/CLCompatCommandBuffer.h deleted file mode 100644 index d5df106425..0000000000 --- a/src/core/CL/CLCompatCommandBuffer.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#ifndef ACL_SRC_CORE_CL_CLCOMPATCOMMANDBUFFER_H -#define ACL_SRC_CORE_CL_CLCOMPATCOMMANDBUFFER_H - -#include "src/core/CL/CLCommandBuffer.h" - -#include - -namespace arm_compute -{ - -/** Command buffer implementation for platform without mutable dispatch command buffer extension. */ -class CLCompatCommandBuffer final : public CLCommandBuffer -{ -public: - /** Create a new command buffer targeting the specified command queue. - * - * @param[in] queue The command queue to execute the command buffer. - */ - CLCompatCommandBuffer(cl_command_queue queue); - - /** Destructor. */ - virtual ~CLCompatCommandBuffer(); - - /** Disallow copy constructor. */ - CLCompatCommandBuffer(const CLCompatCommandBuffer &) = delete; - - /** Disallow copy assignment. */ - CLCompatCommandBuffer &operator=(const CLCompatCommandBuffer &) = delete; - - /** Disallow move constructor. */ - CLCompatCommandBuffer(CLCompatCommandBuffer &&) = delete; - - /** Disallow move assignment. */ - CLCompatCommandBuffer &operator=(CLCompatCommandBuffer &&) = delete; - - void add_kernel(cl_kernel kernel, - const cl::NDRange &offset, - const cl::NDRange &global, - const cl::NDRange &local) override; - - void finalize() override; - - void update() override; - - void enqueue() override; - - bool is_finalized() const override; - -protected: - void add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) override; - -private: - struct KernelCommand - { - cl_kernel kernel; - cl::NDRange offset; - cl::NDRange global; - cl::NDRange local; - - std::vector mutable_args; - }; - -private: - cl_command_queue _queue{}; - std::vector _kernel_cmds{}; -}; - -} // namespace arm_compute - -#endif // ACL_SRC_CORE_CL_CLCOMPATCOMMANDBUFFER_H diff --git a/src/core/CL/CLMutableCommandBuffer.cpp b/src/core/CL/CLMutableCommandBuffer.cpp deleted file mode 100644 index 0e078d8416..0000000000 --- a/src/core/CL/CLMutableCommandBuffer.cpp +++ /dev/null @@ -1,151 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - -#include "src/core/CL/CLMutableCommandBuffer.h" - -#include "arm_compute/core/Error.h" - -#include "src/common/utils/Log.h" -#include "src/core/CL/CLUtils.h" - -namespace arm_compute -{ - -CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue) : CLCommandBuffer() -{ - cl_int status = CL_SUCCESS; - - cl_command_buffer_properties_khr properties[] = { - CL_COMMAND_BUFFER_FLAGS_KHR, - CL_COMMAND_BUFFER_MUTABLE_KHR, - 0, - }; - - _cb = clCreateCommandBufferKHR(1, &queue, properties, &status); - handle_cl_error("clCreateCommandBufferKHR", status); -} - -CLMutableCommandBuffer::~CLMutableCommandBuffer() -{ - const auto status = clReleaseCommandBufferKHR(_cb); - if (status != CL_SUCCESS) - { - const std::string error_message = "clReleaseCommandBufferKHR - Error code: " + std::to_string(status); - ARM_COMPUTE_LOG_ERROR_ACL(error_message); - } -} - -void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, - const cl::NDRange &offset, - const cl::NDRange &global, - const cl::NDRange &local) -{ - ARM_COMPUTE_ERROR_ON(state() != State::Created); - - cl_mutable_command_khr mutable_handle = nullptr; - - cl_ndrange_kernel_command_properties_khr properties[] = { - CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR, - CL_MUTABLE_DISPATCH_ARGUMENTS_KHR, - 0, - }; - - const auto error = clCommandNDRangeKernelKHR( - _cb, nullptr, properties, kernel, global.dimensions(), offset.dimensions() != 0 ? offset.get() : nullptr, - global.get(), local.dimensions() != 0 ? local.get() : nullptr, 0, nullptr, nullptr, &mutable_handle); - - handle_cl_error("clCommandNDRangeKernelKHR", error); - - cl_mutable_dispatch_config_khr mut_dispatch_cfg{}; - mut_dispatch_cfg.type = CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR; - mut_dispatch_cfg.command = mutable_handle; - - _mut_dispatch_cfgs.emplace_back(mut_dispatch_cfg); -} - -void CLMutableCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) -{ - ARM_COMPUTE_ERROR_ON(state() != State::Created); - - cl_mutable_dispatch_arg_khr cfg{}; - cfg.arg_index = arg_idx; - cfg.arg_size = size; - cfg.arg_value = value; - - _mut_arg_cfgs.emplace_back(cfg); - ++_mut_dispatch_cfgs.back().num_args; -} - -void CLMutableCommandBuffer::finalize() -{ - ARM_COMPUTE_ERROR_ON(state() != State::Created); - - const auto error = clFinalizeCommandBufferKHR(_cb); - handle_cl_error("clFinalizeCommandBufferKHR", error); - - state(State::Finalized); - - _mut_dispatch_cfgs.shrink_to_fit(); - _mut_arg_cfgs.shrink_to_fit(); - - size_t arg_no = 0; - - for (auto &mut_dispatch_cfg : _mut_dispatch_cfgs) - { - ARM_COMPUTE_ERROR_ON(arg_no >= _mut_arg_cfgs.size()); - mut_dispatch_cfg.arg_list = &_mut_arg_cfgs[arg_no]; - - arg_no += mut_dispatch_cfg.num_args; - } - - _mut_cfg.type = CL_STRUCTURE_TYPE_MUTABLE_BASE_CONFIG_KHR; - _mut_cfg.next = nullptr; - _mut_cfg.num_mutable_dispatch = _mut_dispatch_cfgs.size(); - _mut_cfg.mutable_dispatch_list = &_mut_dispatch_cfgs[0]; -} - -void CLMutableCommandBuffer::update() -{ - ARM_COMPUTE_ERROR_ON(state() != State::Finalized); - - const auto error = clUpdateMutableCommandsKHR(_cb, &_mut_cfg); - - handle_cl_error("clUpdateMutableCommandsKHR", error); -} - -void CLMutableCommandBuffer::enqueue() -{ - ARM_COMPUTE_ERROR_ON(state() != State::Finalized); - - const auto error = clEnqueueCommandBufferKHR(0, nullptr, _cb, 0, nullptr, nullptr); - - handle_cl_error("clEnqueueCommandBufferKHR", error); -} - -bool CLMutableCommandBuffer::is_finalized() const -{ - return state() == State::Finalized; -} - -} // namespace arm_compute diff 
--git a/src/core/CL/CLMutableCommandBuffer.h b/src/core/CL/CLMutableCommandBuffer.h deleted file mode 100644 index 8997d7d1fd..0000000000 --- a/src/core/CL/CLMutableCommandBuffer.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2023 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef ACL_SRC_CORE_CL_CLMUTABLECOMMANDBUFFER_H -#define ACL_SRC_CORE_CL_CLMUTABLECOMMANDBUFFER_H - -#include "src/core/CL/CLCommandBuffer.h" - -#include - -namespace arm_compute -{ - -/** Command buffer implementaton based on CL mutable dispatch command buffer extension. */ -class CLMutableCommandBuffer : public CLCommandBuffer -{ -public: - /** Create a new mutable dispatch command buffer targeting the specified command queue. - * - * @param[in] queue The command queue to execute the command buffer. - */ - CLMutableCommandBuffer(cl_command_queue queue); - - /** Destructor. */ - virtual ~CLMutableCommandBuffer(); - - /** Disallow copy constructor. */ - CLMutableCommandBuffer(const CLMutableCommandBuffer &) = delete; - - /** Disallow copy assignment. */ - CLMutableCommandBuffer &operator=(const CLMutableCommandBuffer &) = delete; - - /** Disallow move constructor. */ - CLMutableCommandBuffer(CLMutableCommandBuffer &&) = delete; - - /** Disallow move assignment. */ - CLMutableCommandBuffer &operator=(CLMutableCommandBuffer &&) = delete; - - void add_kernel(cl_kernel kernel, - const cl::NDRange &offset, - const cl::NDRange &global, - const cl::NDRange &local) override; - - void finalize() override; - - void update() override; - - void enqueue() override; - - bool is_finalized() const override; - -protected: - void add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size) override; - -private: - cl_command_buffer_khr _cb{}; - cl_mutable_base_config_khr _mut_cfg{}; - std::vector _mut_dispatch_cfgs{}; - std::vector _mut_arg_cfgs{}; -}; - -} // namespace arm_compute - -#endif // ACL_SRC_CORE_CL_CLMUTABLECOMMANDBUFFER_H diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer.cl b/src/core/CL/cl_kernels/nchw/pooling_layer.cl index 15ad116289..bd59e61ef8 100644 --- a/src/core/CL/cl_kernels/nchw/pooling_layer.cl +++ b/src/core/CL/cl_kernels/nchw/pooling_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -39,7 +39,9 @@ #define POW2_OP(x, vec_size) (x) #endif /* defined(POOL_L2) */ +// Compatible with Cpu backend: Round to nearest ties to even #define DIV_OP(x, y) (x * (1.f / y)) +#define DIV_INT(x, y) convert_int_rte(DIV_OP(x, y)) #define SQRT_OP(x) sqrt((x)) #if defined(FP_MIXED_PRECISION) || defined(QUANTIZED) @@ -132,7 +134,7 @@ __kernel void pooling_layer_MxN_nchw( src_x = clamp(src_x, 0, SRC_WIDTH - 1); VEC_DATA_TYPE(ACC_DATA_TYPE, 8) data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)(src_addr + src_x.s0 * sizeof(DATA_TYPE) + y * src_stride_y)); -#endif // defined(POOL_AVG) || defined(POOL_L2 +#endif // defined(POOL_AVG) || defined(POOL_L2) #if defined(POOL_L2) // Raise to power of 2 for L2 Pooling @@ -176,7 +178,12 @@ __kernel void pooling_layer_MxN_nchw( #if defined(POOL_AVG) || defined(POOL_L2) // Divide by pool region in case of average pooling - res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); + const ACC_DATA_TYPE avg_scale = calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#if defined(QUANTIZED) + res = DIV_INT(res, avg_scale); +#else // defined(QUANTIZED) + res = DIV_OP(res, avg_scale); +#endif // defined(QUANTIZED) #endif /* defined(POOL_AVG) || defined(POOL_L2) */ #if defined(QUANTIZED) @@ -282,4 +289,4 @@ __kernel void pooling_layer_2_nchw_indices( *(__global uint *)(indices_ptr + indices_offset_first_element_in_bytes + id0 * sizeof(uint) + id1 * indices_stride_y + id2 * indices_stride_z) = index; #endif // defined(SRC_BATCH) -} \ No newline at end of file +} diff --git a/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl index abf0db9d07..a7b1ffd08e 100644 --- a/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl +++ b/src/core/CL/cl_kernels/nhwc/pooling_3d_layer_quantized.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -32,14 +32,16 @@ #define SQRT_OP(x) sqrt((x)) +#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE) +#define CONVERT_RTE_STR(x, type) (convert_##type##_rte((x))) +#define CONVERT_RTE(x, type) CONVERT_RTE_STR(x, type) + #if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_DEPTH) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) #if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) && defined(POOL_SIZE_Z) #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) #define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE) -#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) #define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) #define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \ { \ @@ -169,7 +171,8 @@ __kernel void pooling_3d_layer_MxN_ndhwc_quantized( } #if defined(POOL_AVG) - res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size; + // Compatible with Cpu backend: Round to nearest ties to even + res0 = CONVERT_RTE(CONVERT(res0, VEC_DATA_TYPE(float, VEC_SIZE)) / filter_size, VEC_INT(VEC_SIZE)); #endif // defined(POOL_AVG) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) diff --git a/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl index 46268a4a88..42899e1e50 100644 --- a/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl +++ b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -26,10 +26,12 @@ #if defined(DATA_TYPE) && defined(INITIAL_VALUE) #define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE) +#define CONVERT_RTE_STR(x, type) (convert_##type##_rte((x))) +#define CONVERT_RTE(x, type) CONVERT_RTE_STR(x, type) + #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) #define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE) -#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) #define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) #define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \ { \ @@ -148,7 +150,8 @@ __kernel void pooling_layer_MxN_quantized_nhwc( } #if defined(POOL_AVG) - res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size; + // Compatible with Cpu backend: Round to nearest ties to even + res0 = CONVERT_RTE(CONVERT(res0, VEC_DATA_TYPE(float, VEC_SIZE)) / filter_size, VEC_INT(VEC_SIZE)); #endif // defined(POOL_AVG) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) @@ -161,4 +164,4 @@ __kernel void pooling_layer_MxN_quantized_nhwc( STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0)); } #endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) -#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE) \ No newline at end of file +#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE) diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp index 00241b161b..9722441bdb 100644 --- a/src/core/CL/kernels/CLReverseKernel.cpp +++ b/src/core/CL/kernels/CLReverseKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, 2023 Arm Limited. + * Copyright (c) 2018-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -45,7 +45,12 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT { ARM_COMPUTE_UNUSED(use_inverted_axis); ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis); - ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + +#ifndef __aarch64__ + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->element_size() > 4, + "Only 32-bit and lower data types are supported in 32-bit builds"); +#endif // __aarch64__ + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor"); diff --git a/src/core/NEON/NEMath.h b/src/core/NEON/NEMath.h index 9e81c38ad8..e03d6d537d 100644 --- a/src/core/NEON/NEMath.h +++ b/src/core/NEON/NEMath.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2022 Arm Limited. + * Copyright (c) 2016-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
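For reference, the pooling changes above swap the old truncating/add-half integer division for round-to-nearest-ties-to-even (convert_int_rte / CONVERT_RTE), so the CL quantized average-pooling kernels agree with the CPU backend. A minimal standalone C++ sketch of the two behaviours (illustrative helpers, not library code):

#include <cfenv>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Old behaviour: add half the divisor, then truncate (halves round away from zero).
int32_t avg_add_half(int32_t sum, int32_t filter_size)
{
    return (sum + (filter_size >> 1)) / filter_size;
}

// New behaviour: divide in floating point and round to nearest, ties to even,
// matching what convert_int_rte() produces in OpenCL C.
int32_t avg_rte(int32_t sum, int32_t filter_size)
{
    std::fesetround(FE_TONEAREST); // ties-to-even; usually already the default mode
    return static_cast<int32_t>(std::lrintf(static_cast<float>(sum) / static_cast<float>(filter_size)));
}

int main()
{
    // 2/4 = 0.5: add-half gives 1, ties-to-even gives 0; 6/4 = 1.5: both give 2.
    std::printf("%d %d\n", avg_add_half(2, 4), avg_rte(2, 4));
    return 0;
}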
*/ -#ifndef ARM_COMPUTE_NEMATH_H -#define ARM_COMPUTE_NEMATH_H +#ifndef ACL_SRC_CORE_NEON_NEMATH_H +#define ACL_SRC_CORE_NEON_NEMATH_H + +#include "arm_compute/core/Rounding.h" #include #include @@ -204,6 +206,7 @@ void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x * @param[in] in Vector of float to be converted * @param[out] out Converted vector of uint8 to store the result */ +template void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out); /** Converts from float32x4x4_t to just one int8x16_t @@ -211,9 +214,13 @@ void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out); * @param[in] in Vector of float to be converted * @param[out] out Converted vector of uint8 to store the result */ +template void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out); /** Converts from float vector to integer vector + * + * @note: Default rounding mode is "Round to Nearest with Ties to Even" + * if __aarch64__ is defined else "Round towards Zero" * * @param[in] in Float vector to converted * @@ -353,4 +360,4 @@ float16_t vreduce(const float16x8_t &v); #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } // namespace arm_compute #include "src/core/NEON/NEMath.inl" -#endif /* ARM_COMPUTE_NEMATH_H */ +#endif // ACL_SRC_CORE_NEON_NEMATH_H diff --git a/src/core/NEON/NEMath.inl b/src/core/NEON/NEMath.inl index a5aba0bf23..d995b6e2fc 100644 --- a/src/core/NEON/NEMath.inl +++ b/src/core/NEON/NEMath.inl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2023 Arm Limited. + * Copyright (c) 2016-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,6 +22,11 @@ * SOFTWARE. */ +#ifndef ACL_SRC_CORE_NEON_NEMATH_INL +#define ACL_SRC_CORE_NEON_NEMATH_INL + +#include "arm_compute/core/Error.h" + #include "src/core/utils/Math.h" #include "support/ToolchainSupport.h" @@ -492,25 +497,71 @@ inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const flo out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])), vqmovn_u32(vcvtq_u32_f32(in2.val[2])))); } +template +inline uint32x4_t vconvert_to_uint(float32x4_t in) +{ + switch (policy) + { + case RoundingPolicy::TO_ZERO: + return vcvtq_u32_f32(in); +#ifdef __aarch64__ + case RoundingPolicy::TO_NEAREST_EVEN: + return vcvtnq_u32_f32(in); + case RoundingPolicy::TO_NEAREST_UP: + return vcvtaq_u32_f32(in); +#endif // __aarch64__ + default: + ARM_COMPUTE_ERROR("Unsupported Rounding Policy"); + } +} + +template +inline int32x4_t vconvert_to_int(float32x4_t in) +{ + switch (policy) + { + case RoundingPolicy::TO_ZERO: + return vcvtq_s32_f32(in); +#ifdef __aarch64__ + case RoundingPolicy::TO_NEAREST_EVEN: + return vcvtnq_s32_f32(in); + case RoundingPolicy::TO_NEAREST_UP: + return vcvtaq_s32_f32(in); +#endif // __aarch64__ + default: + ARM_COMPUTE_ERROR("Unsupported Rounding Policy"); + } +} + +template inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out) { - const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])), vqmovn_u32(vcvtq_u32_f32(in.val[1]))); - const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])), vqmovn_u32(vcvtq_u32_f32(in.val[3]))); - out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); + const auto low = + vcombine_u16(vqmovn_u32(vconvert_to_uint(in.val[0])), vqmovn_u32(vconvert_to_uint(in.val[1]))); + const auto high = + vcombine_u16(vqmovn_u32(vconvert_to_uint(in.val[2])), vqmovn_u32(vconvert_to_uint(in.val[3]))); + out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high)); } +template inline 
void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out) { - const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])), vqmovn_s32(vcvtq_s32_f32(in.val[1]))); - const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])), vqmovn_s32(vcvtq_s32_f32(in.val[3]))); - out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); + const auto low = + vcombine_s16(vqmovn_s32(vconvert_to_int(in.val[0])), vqmovn_s32(vconvert_to_int(in.val[1]))); + const auto high = + vcombine_s16(vqmovn_s32(vconvert_to_int(in.val[2])), vqmovn_s32(vconvert_to_int(in.val[3]))); + out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high)); } template <> inline uint8x16_t convert_float_to_int(const float32x4x4_t &in) { uint8x16_t out; - convert_float32x4x4_to_uint8x16(in, out); +#ifdef __aarch64__ + convert_float32x4x4_to_uint8x16(in, out); +#else // __aarch64__ + convert_float32x4x4_to_uint8x16(in, out); +#endif // __aarch64__ return out; } @@ -524,7 +575,11 @@ template <> inline int8x16_t convert_float_to_int(const float32x4x4_t &in) { int8x16_t out; - convert_float32x4x4_to_int8x16(in, out); +#ifdef __aarch64__ + convert_float32x4x4_to_int8x16(in, out); +#else // __aarch64__ + convert_float32x4x4_to_int8x16(in, out); +#endif // __aarch64__ return out; } @@ -730,3 +785,5 @@ inline float16_t vreduce(const float16x8_t &v) #endif /* DOXYGEN_SKIP_THIS */ #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ } // namespace arm_compute + +#endif // ACL_SRC_CORE_NEON_NEMATH_INL diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp index b3710555df..88a9fa5a90 100644 --- a/src/core/NEON/kernels/NEReverseKernel.cpp +++ b/src/core/NEON/kernels/NEReverseKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021, 2023 Arm Limited. + * Copyright (c) 2018-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -40,8 +40,13 @@ validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const IT { ARM_COMPUTE_UNUSED(use_inverted_axis); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis); - //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions. - ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); + + // No need to check for fp16 or bf16 support in the cpu as this kernel will only use unsigned integer data types + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->element_size() > 4, "Only 32-bit and lower data types are supported"); + + // size_t is not a portable type + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN || input->data_type() == DataType::SIZET); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index 532d08de92..5dfb5c0306 100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2023 Arm Limited. + * Copyright (c) 2016-2024 Arm Limited. 
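As a reference for the templated vconvert_to_int/vconvert_to_uint helpers above, the AArch64 intrinsics differ only in rounding mode: vcvtq_s32_f32 truncates towards zero, vcvtnq_s32_f32 rounds to nearest with ties to even, and vcvtaq_s32_f32 rounds ties away from zero. A small standalone sketch (assumes an AArch64 toolchain with <arm_neon.h>):

#include <cstdio>
#ifdef __aarch64__
#include <arm_neon.h>
#endif

int main()
{
#ifdef __aarch64__
    const float       data[4] = {1.5f, 2.5f, -1.5f, 0.7f};
    const float32x4_t in      = vld1q_f32(data);

    const int32x4_t to_zero    = vcvtq_s32_f32(in);  // RoundingPolicy::TO_ZERO:         1, 2, -1, 0
    const int32x4_t to_nearest = vcvtnq_s32_f32(in); // RoundingPolicy::TO_NEAREST_EVEN: 2, 2, -2, 1
    const int32x4_t to_away    = vcvtaq_s32_f32(in); // RoundingPolicy::TO_NEAREST_UP:   2, 3, -2, 1

    // The 2.5f lane (index 1) shows where the three modes diverge: prints "2 2 3".
    std::printf("%d %d %d\n", vgetq_lane_s32(to_zero, 1), vgetq_lane_s32(to_nearest, 1),
                vgetq_lane_s32(to_away, 1));
#endif
    return 0;
}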
* * SPDX-License-Identifier: MIT * @@ -423,7 +423,7 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool // Note: Output quantization info for softmax should always have // * Softmax with QASYMM8: scale = 1/256, offset = 0 // * Softmax with QASYMM8_SIGNED: scale = 1/256, offset = -128 - // * LogSoftmax with QASYMM8: scale = 1/256, offset = 0 + // * LogSoftmax with QASYMM8: scale = 16/256, offset = 255 // * LogSoftmax with QASYMM8_SIGNED: scale = 16/256, offset = 127 if (is_data_type_quantized_asymmetric_signed(input_type)) { @@ -436,7 +436,7 @@ QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool return QuantizationInfo(1.f / 256, -128); } } - return QuantizationInfo(1.f / 256, 0); + return is_log ? QuantizationInfo(16.f / 256, 255) : QuantizationInfo(1.f / 256, 0); } std::pair get_quantized_activation_min_max(const ActivationLayerInfo &act_info, diff --git a/src/core/helpers/LUTManager.cpp b/src/core/helpers/LUTManager.cpp index 2effffbe92..62ad2bab6d 100644 --- a/src/core/helpers/LUTManager.cpp +++ b/src/core/helpers/LUTManager.cpp @@ -24,13 +24,22 @@ #include "src/core/helpers/LUTManager.h" +#include "src/common/utils/Validate.h" +#include "support/Bfloat16.h" + namespace arm_compute { #ifdef __aarch64__ namespace { -float16_t activation(float16_t x, const LUTInfo &info) +union Element +{ + uint16_t i = 0; + float16_t fp; +}; + +inline float16_t activation(float16_t x, const LUTInfo &info) { float16_t out = 0.f; switch (info.act) @@ -50,26 +59,51 @@ float16_t activation(float16_t x, const LUTInfo &info) return out; } -void init_lut_fp16(ActivationLayerInfo::LookupTable65536 *lut, const LUTInfo &info) +// Read bf16 value as u16, convert to fp32. +// Calculate exp in fp32, return as bf16 +inline uint16_t exponential(uint16_t x, const LUTInfo &info) { - union Element - { - uint16_t i = 0; - float16_t fp; - } item; + float fp = bf16_to_float(x); + fp = std::exp(fp * info.beta * -1); + return float_to_bf16(fp); +} - // Fill lut by iterating over all 16 bit values using the union. +void init_lut_16bit(LookupTable65536 *lut, const LUTInfo &info) +{ + // assert lut is valid config. + ARM_COMPUTE_ASSERT((info.type == LUTType::Activation && info.dt == DataType::F16) || + (info.type == LUTType::Exponential && info.dt == DataType::BFLOAT16)); + + Element item = {0}; // Fill lut by iterating over all 16 bit values using the union. + Element bf16 = {0}; // Temporary object used to store bf16 values as fp16 in lut while (true) { - (*lut)[item.i] = activation(item.fp, info); + switch (info.type) + { + case LUTType::Activation: + { + (*lut)[item.i] = activation(item.fp, info); + break; + } + case LUTType::Exponential: + { + bf16.i = exponential(item.i, info); + (*lut)[item.i] = bf16.fp; + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported Activation for 16-bit LUT table"); + break; + } if (item.i == 65535) break; item.i++; } } + } // namespace -std::shared_ptr LUTManager::get_lut_table(LUTInfo info) +std::shared_ptr LUTManager::get_lut_table(LUTInfo info) { const auto itr = map_fp16.find(info); auto s_ptr = (itr != map_fp16.end()) ? itr->second.lock() : nullptr; // nullptr if invalid or not found. 
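The quantization-info fix above gives LogSoftmax with QASYMM8 output a scale of 16/256 and an offset of 255, mirroring the QASYMM8_SIGNED case (16/256, 127). Since log-softmax values are never positive, this places the representable range at roughly [-15.94, 0]. A small illustrative sketch of the mapping (helper name is made up, not library code):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Quantize a (non-positive) log-softmax value with the QASYMM8 output info above:
// scale = 16/256 = 0.0625, offset = 255.
uint8_t quantize_logsoftmax_qasymm8(float x)
{
    const float scale  = 16.f / 256.f;
    const int   offset = 255;
    const int   q      = static_cast<int>(std::lrintf(x / scale)) + offset;
    return static_cast<uint8_t>(std::min(std::max(q, 0), 255));
}

int main()
{
    std::printf("%d %d\n",
                static_cast<int>(quantize_logsoftmax_qasymm8(0.f)),        // 255
                static_cast<int>(quantize_logsoftmax_qasymm8(-15.9375f))); // 0
    return 0;
}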
@@ -82,8 +116,8 @@ std::shared_ptr LUTManager::get_lut_table { // Not found, or pointer not valid // We do not use make_shared to prevent the weak_ptr keeping the control block alive - std::shared_ptr ptr(new ActivationLayerInfo::LookupTable65536); - init_lut_fp16(ptr.get(), info); + std::shared_ptr ptr(new LookupTable65536); + init_lut_16bit(ptr.get(), info); map_fp16[info] = ptr; return ptr; } diff --git a/src/core/helpers/LUTManager.h b/src/core/helpers/LUTManager.h index f3f4bf2832..226f44f360 100644 --- a/src/core/helpers/LUTManager.h +++ b/src/core/helpers/LUTManager.h @@ -34,28 +34,51 @@ namespace arm_compute { +#ifdef __aarch64__ +using LookupTable256 = std::array; +using LookupTable65536 = std::array; +#endif // __aarch64__ + +enum class LUTType +{ + Activation, // Determined by activation type + Exponential, // e^x +}; struct LUTInfo { - ActivationLayerInfo::ActivationFunction act; - float alpha; - float beta; - DataType dt; - UniformQuantizationInfo qinfo; + // For exponential lookup + LUTInfo(LUTType lut, float b, DataType type, UniformQuantizationInfo info) + : act(), alpha(1.0f), beta(b), dt(type), qinfo(info), type(lut) + { + } + + // For activation functions + LUTInfo(ActivationFunction func, float a, float b, DataType type, UniformQuantizationInfo info) + : act(func), alpha(a), beta(b), dt(type), qinfo(info), type(LUTType::Activation) + { + } // Operators enable use of map with Lutinfo as key friend bool operator<(const LUTInfo &l, const LUTInfo &r) { - const auto l_tup = std::make_tuple(l.act, l.alpha, l.beta, l.dt, l.qinfo.scale, l.qinfo.offset); - const auto r_tup = std::make_tuple(r.act, r.alpha, r.beta, r.dt, r.qinfo.scale, r.qinfo.offset); + const auto l_tup = std::make_tuple(l.type, l.act, l.alpha, l.beta, l.dt, l.qinfo.scale, l.qinfo.offset); + const auto r_tup = std::make_tuple(r.type, r.act, r.alpha, r.beta, r.dt, r.qinfo.scale, r.qinfo.offset); return l_tup < r_tup; } bool operator==(const LUTInfo &l) const { - return this->act == l.act && this->alpha == l.alpha && this->beta == l.beta && this->dt == l.dt && - this->qinfo == l.qinfo; + return this->type == l.type && this->act == l.act && this->alpha == l.alpha && this->beta == l.beta && + this->dt == l.dt && this->qinfo == l.qinfo; } + + ActivationLayerInfo::ActivationFunction act; + float alpha; + float beta; + DataType dt; + UniformQuantizationInfo qinfo; + LUTType type; // Default is Activation. }; /* Class to handle getting look up table */ @@ -66,10 +89,10 @@ class LUTManager static LUTManager &get_instance(); #ifdef __aarch64__ - std::shared_ptr get_lut_table(LUTInfo info); + std::shared_ptr get_lut_table(LUTInfo info); private: - std::map> map_fp16{}; + std::map> map_fp16{}; #endif // __aarch64__ }; diff --git a/src/core/helpers/MemoryHelpers.h b/src/core/helpers/MemoryHelpers.h index dd094b414c..2182ec925b 100644 --- a/src/core/helpers/MemoryHelpers.h +++ b/src/core/helpers/MemoryHelpers.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
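The exponential LUT above is filled by treating every 16-bit index as a bf16 value, computing exp(-beta * x) in fp32, and storing the bf16 result bits back through the table's fp16 slot. A rough scalar sketch of one entry, using a simple truncating bf16 conversion as a stand-in for the library's bfloat16 helpers (which may round):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Stand-ins for bf16_to_float()/float_to_bf16(); truncation is good enough for a sketch.
float bf16_bits_to_float(uint16_t bits)
{
    const uint32_t u = static_cast<uint32_t>(bits) << 16;
    float          f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}

uint16_t float_to_bf16_bits(float f)
{
    uint32_t u;
    std::memcpy(&u, &f, sizeof(u));
    return static_cast<uint16_t>(u >> 16);
}

// One LUT entry: index bits -> bf16 value -> exp(-beta * x) in fp32 -> bf16 result bits.
uint16_t exp_lut_entry(uint16_t index_bits, float beta)
{
    const float x = bf16_bits_to_float(index_bits);
    return float_to_bf16_bits(std::exp(-beta * x));
}

int main()
{
    // bf16(1.0f) has bit pattern 0x3F80; the entry holds the bf16 bits of exp(-1.0).
    std::printf("0x%04x\n", exp_lut_entry(0x3F80, 1.f));
    return 0;
}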
*/ -#ifndef SRC_COMMON_MEMORY_HELPERS_H -#define SRC_COMMON_MEMORY_HELPERS_H +#ifndef ACL_SRC_CORE_HELPERS_MEMORYHELPERS_H +#define ACL_SRC_CORE_HELPERS_MEMORYHELPERS_H #include "arm_compute/core/experimental/Types.h" #include "arm_compute/core/ITensorPack.h" @@ -63,7 +63,8 @@ template WorkspaceData manage_workspace(const experimental::MemoryRequirements &mem_reqs, MemoryGroup &mgroup, ITensorPack &run_pack, - ITensorPack &prep_pack) + ITensorPack &prep_pack, + bool allocate_now = true) { WorkspaceData workspace_memory; for (const auto &req : mem_reqs) @@ -94,8 +95,11 @@ WorkspaceData manage_workspace(const experimental::MemoryRequirement for (auto &mem : workspace_memory) { - auto tensor = mem.tensor.get(); - tensor->allocator()->allocate(); + if (allocate_now || mem.lifetime == experimental::MemoryLifetime::Temporary) + { + auto tensor = mem.tensor.get(); + tensor->allocator()->allocate(); + } } return workspace_memory; @@ -117,6 +121,28 @@ void release_prepare_tensors(WorkspaceData &workspace, ITensorPack & workspace.end()); } +/** Allocate all tensors with Persistent or Prepare lifetime if not already allocated */ +template +void allocate_tensors(const experimental::MemoryRequirements &mem_reqs, WorkspaceData &workspace) +{ + for (auto &ws : workspace) + { + const int slot = ws.slot; + for (auto &m : mem_reqs) + { + if (m.slot == slot && m.lifetime != experimental::MemoryLifetime::Temporary) + { + auto tensor = ws.tensor.get(); + if (!tensor->allocator()->is_allocated()) + { + tensor->allocator()->allocate(); + } + break; + } + } + } +} + /** Utility function to release tensors with lifetime marked as Prepare */ template void release_temporaries(const experimental::MemoryRequirements &mem_reqs, WorkspaceData &workspace) @@ -136,4 +162,4 @@ void release_temporaries(const experimental::MemoryRequirements &mem_reqs, Works } } } // namespace arm_compute -#endif /* SRC_COMMON_MEMORY_HELPERS_H */ +#endif // ACL_SRC_CORE_HELPERS_MEMORYHELPERS_H diff --git a/src/core/helpers/PoolingHelpers.h b/src/core/helpers/PoolingHelpers.h index 9ef045f472..1bd570e4af 100644 --- a/src/core/helpers/PoolingHelpers.h +++ b/src/core/helpers/PoolingHelpers.h @@ -1,5 +1,5 @@ /* -* Copyright (c) 2022 Arm Limited. +* Copyright (c) 2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
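The new allocate_now flag and allocate_tensors() helper above let a function defer allocation of Persistent/Prepare workspace tensors until prepare() time, while Temporary tensors are still allocated immediately. A hedged usage sketch of that pattern, assuming WorkspaceData is instantiated with the backend tensor class (the function names here are illustrative, not library API):

#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"

#include "src/core/helpers/MemoryHelpers.h"

using namespace arm_compute;

// Illustrative configure-time step: defer Persistent/Prepare allocations.
WorkspaceData<Tensor> configure_workspace_deferred(const experimental::MemoryRequirements &aux_mem,
                                                   MemoryGroup                            &memory_group,
                                                   ITensorPack                            &run_pack,
                                                   ITensorPack                            &prep_pack)
{
    // Temporary tensors are allocated now; everything else waits for prepare().
    return manage_workspace<Tensor>(aux_mem, memory_group, run_pack, prep_pack, /* allocate_now = */ false);
}

// Illustrative prepare-time step: allocate whatever was skipped, once, before the first run.
void prepare_workspace_deferred(const experimental::MemoryRequirements &aux_mem, WorkspaceData<Tensor> &workspace)
{
    allocate_tensors<Tensor>(aux_mem, workspace);
}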
*/ -#ifndef SRC_CORE_HELPERS_POOLINGHELPERS_H -#define SRC_CORE_HELPERS_POOLINGHELPERS_H +#ifndef ACL_SRC_CORE_HELPERS_POOLINGHELPERS_H +#define ACL_SRC_CORE_HELPERS_POOLINGHELPERS_H #include "src/core/NEON/NEAsymm.h" @@ -122,6 +122,25 @@ inline int32x4_t vcvtq_q32_f32(float32x4_t values) return vcvtq_s32_f32(values); } +#ifdef __aarch64__ + +template +inline T vcvtnq_q32_f32(float32x4_t values); + +template <> +inline uint32x4_t vcvtnq_q32_f32(float32x4_t values) +{ + return vcvtnq_u32_f32(values); +} + +template <> +inline int32x4_t vcvtnq_q32_f32(float32x4_t values) +{ + return vcvtnq_s32_f32(values); +} + +#endif // __aarch64__ + template inline float32x4_t vcvtq_f32_q32(T values); @@ -216,4 +235,4 @@ inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo } // namespace } // namespace cpu } // namespace arm_compute -#endif /* SRC_CORE_HELPERS_POOLINGHELPERS_H */ +#endif // ACL_SRC_CORE_HELPERS_POOLINGHELPERS_H diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp index 555705bd45..c02691d5db 100644 --- a/src/cpu/kernels/CpuActivationKernel.cpp +++ b/src/cpu/kernels/CpuActivationKernel.cpp @@ -31,6 +31,7 @@ #include "src/core/CPP/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.h" #include "src/cpu/kernels/activation/list.h" #include "src/cpu/kernels/logistic/list.h" @@ -45,87 +46,6 @@ namespace kernels namespace { -bool is_fp16_lut_supported(ActivationLayerInfo::ActivationFunction func) -{ - return func == ActivationLayerInfo::ActivationFunction::LOGISTIC || - func == ActivationLayerInfo::ActivationFunction::TANH; -} - -static const std::vector available_kernels = { -#ifdef ARM_COMPUTE_ENABLE_SVE - {"sve2_q8_activation_lut", - [](const ActivationDataTypeISASelectorData &data) - { - return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && - data.cpumodel == CPUModel::A510 && data.isa.sve2 && - data.f != ActivationLayerInfo::ActivationFunction::RELU; - }, - REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)}, -#endif // ARM_COMPUTE_ENABLE_SVE -#ifdef __aarch64__ - {// Neon LUT implementantion takes precedence - "neon_q8_activation_lut", - [](const ActivationDataTypeISASelectorData &data) - { - return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && - data.f != ActivationLayerInfo::ActivationFunction::RELU; - }, - REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)}, -#endif // __aarch64__ - {"sme2_fp32_logistic", - [](const ActivationDataTypeISASelectorData &data) { - return data.dt == DataType::F32 && data.f == ActivationLayerInfo::ActivationFunction::LOGISTIC && - data.isa.sme2; - }, - REGISTER_FP32_SME2(arm_compute::cpu::sme2_fp32_logistic)}, - {"sve2_qu8_activation", - [](const ActivationDataTypeISASelectorData &data) { - return data.dt == DataType::QASYMM8 && data.isa.sve2 && - data.f != ActivationLayerInfo::ActivationFunction::GELU; - }, - REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)}, - {"sve2_qs8_activation", - [](const ActivationDataTypeISASelectorData &data) - { - return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && - data.f != ActivationLayerInfo::ActivationFunction::GELU; - }, - REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)}, - {"sve2_qs16_activation", - [](const ActivationDataTypeISASelectorData &data) { - return data.dt == DataType::QSYMM16 && 
data.isa.sve2 && - data.f != ActivationLayerInfo::ActivationFunction::GELU; - }, - REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)}, - {"sve_fp16_activation_lut", - [](const ActivationDataTypeISASelectorData &data) - { return data.dt == DataType::F16 && data.isa.fp16 && data.isa.sve && is_fp16_lut_supported(data.f); }, - REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation_lut)}, - {"sve_fp16_activation", - [](const ActivationDataTypeISASelectorData &data) - { - return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && - data.f != ActivationLayerInfo::ActivationFunction::GELU; - }, - REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)}, - {"sve_fp32_activation", - [](const ActivationDataTypeISASelectorData &data) - { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, - REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)}, - {"neon_fp16_activation", - [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; }, - REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)}, - {"neon_fp32_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)}, - {"neon_qu8_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)}, - {"neon_qs8_activation", - [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)}, - {"neon_qs16_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QSYMM16; }, - REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)}, -}; - /* Supported activation in the 8-bit integer domain */ static const std::array qasymm8_activations = { ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, @@ -144,8 +64,8 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32); - const auto *uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ - src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()}); + heuristics::CpuActivationKernelHeuristics heuristics(src, dst, activation_info); + const auto *uk = heuristics.kernel(); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); const DataType data_type = src->data_type(); @@ -193,19 +113,6 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const return Status{}; } -std::pair validate_and_configure_window(const ITensorInfo *src, ITensorInfo *dst) -{ - // Configure kernel window - Window win = calculate_max_window(*src, Steps()); - - if (dst != nullptr) - { - // dst auto inizialitation if not yet initialized - auto_init_if_empty(*dst, *src->clone()); - } - - return std::make_pair(Status{}, win); -} #ifdef __aarch64__ void init_lut(ActivationLayerInfo::ActivationFunction act_func, DataType data_type, @@ -281,20 +188,21 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac { ARM_COMPUTE_UNUSED(dst); ARM_COMPUTE_ERROR_ON_NULLPTR(src); - 
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info)); + ARM_COMPUTE_ERROR_THROW_ON(CpuActivationKernel::validate(src, dst, activation_info)); + + heuristics::CpuActivationKernelHeuristics heuristics(src, dst, activation_info); + _heuristics = std::move(heuristics); - const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ - src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()}); if (dst != nullptr) { // dst auto inizialitation if not yet initialized auto_init_if_empty(*dst, *src->clone()); } + const auto *uk = heuristics.kernel(); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); - _run_method = uk->ukernel; - _name = std::string("CpuActivationKernel").append("/").append(uk->name); + _name = std::string("CpuActivationKernel").append("/").append(uk->name); #ifdef __aarch64__ // Initialise lut_manager @@ -312,6 +220,7 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac if (std::string(uk->name) == "sve_fp16_activation_lut") { + // Create info using init list. const LUTInfo info = {activation_info.activation(), activation_info.a(), activation_info.b(), src->data_type(), src->quantization_info().uniform()}; activation_info.setLookupTable65536((lut_manager.get_lut_table(info))); @@ -319,16 +228,7 @@ void CpuActivationKernel::configure(const ITensorInfo *src, ITensorInfo *dst, Ac #endif // __aarch64__ _act_info = activation_info; - Window win; - - // Use squashed window - std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*src); - // Collapse window with SME kernels in Y-Dim - if (std::string(uk->name) == "sme2_fp32_logistic") - { - win = win.collapse(win, Window::DimY); - } - ICPPKernel::configure(win); + ICPPKernel::configure(heuristics.window()); } Status @@ -336,8 +236,6 @@ CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, co { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR( - validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first); return Status{}; } @@ -347,13 +245,7 @@ size_t CpuActivationKernel::get_mws(const CPUInfo &platform, size_t thread_count ARM_COMPUTE_UNUSED(thread_count); ARM_COMPUTE_UNUSED(platform); - if (_split_dimension == Window::DimX) - { - // Don't split the work load too small if the tensor has been reinterpreted as 1D. - // This number is loosely chosen as threading overhead in each platform varies wildly. 
- return 1536; - } - return default_mws; + return _heuristics.mws(); } void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) @@ -369,23 +261,20 @@ void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, con ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); ARM_COMPUTE_ERROR_ON(tensors.empty()); - ARM_COMPUTE_ERROR_ON(_run_method == nullptr); + + ActivationKernelPtr run_method = _heuristics.kernel()->ukernel; + ARM_COMPUTE_ERROR_ON(run_method == nullptr); const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC); ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - _run_method(src, dst, _act_info, window); + run_method(src, dst, _act_info, window); } const char *CpuActivationKernel::name() const { return _name.c_str(); } - -const std::vector &CpuActivationKernel::get_available_kernels() -{ - return available_kernels; -} } // namespace kernels } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h index c1487499d6..946d539b17 100644 --- a/src/cpu/kernels/CpuActivationKernel.h +++ b/src/cpu/kernels/CpuActivationKernel.h @@ -29,6 +29,7 @@ #include "src/core/common/Macros.h" #include "src/core/helpers/LUTManager.h" #include "src/cpu/ICpuKernel.h" +#include "src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.h" namespace arm_compute { @@ -37,11 +38,10 @@ namespace cpu namespace kernels { /** Interface for the activation kernel */ -class CpuActivationKernel : public ICpuKernel +class CpuActivationKernel : public ICPPKernel { private: - using ActivationKernelPtr = - std::add_pointer::type; + using ActivationKernelPtr = heuristics::CpuActivationKernelHeuristics::KernelPtr; public: CpuActivationKernel() = default; @@ -83,23 +83,13 @@ class CpuActivationKernel : public ICpuKernel */ size_t get_split_dimension_hint() const { - return _split_dimension; + return _heuristics.scheduler_hint().split_dimension(); } - struct ActivationKernel - { - const char *name; - const ActivationDataTypeISASelectorDataPtr is_selected; - ActivationKernelPtr ukernel; - }; - - static const std::vector &get_available_kernels(); - private: - ActivationLayerInfo _act_info{}; - ActivationKernelPtr _run_method{nullptr}; - size_t _split_dimension{Window::DimY}; - std::string _name{}; + ActivationLayerInfo _act_info{}; + std::string _name{}; + heuristics::CpuActivationKernelHeuristics _heuristics{}; }; } // namespace kernels } // namespace cpu diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp index a990aa4715..d86504054f 100644 --- a/src/cpu/kernels/CpuAddKernel.cpp +++ b/src/cpu/kernels/CpuAddKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2022, 2024 Arm Limited. 
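After this refactor the activation kernel no longer owns kernel selection: CpuActivationKernelHeuristics decides the micro-kernel, execution window, split dimension and minimum workload size in one place, and the kernel simply queries it. A condensed sketch of that interaction, following the configure()/run_op() changes above (error handling and LUT setup omitted; not the full implementation):

#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"

#include "src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.h"

using arm_compute::cpu::kernels::heuristics::CpuActivationKernelHeuristics;

// configure(): build the heuristics once and adopt its window as the kernel window.
void configure_sketch(const arm_compute::ITensorInfo         *src,
                      arm_compute::ITensorInfo               *dst,
                      const arm_compute::ActivationLayerInfo &act_info,
                      CpuActivationKernelHeuristics          &heuristics,
                      arm_compute::Window                    &kernel_window)
{
    heuristics    = CpuActivationKernelHeuristics(src, dst, act_info); // move-assign, the class is move-only
    kernel_window = heuristics.window();
}

// run_op(): fetch the selected micro-kernel and execute it over the scheduled window.
void run_sketch(CpuActivationKernelHeuristics          &heuristics,
                const arm_compute::ITensor             *src,
                arm_compute::ITensor                   *dst,
                const arm_compute::ActivationLayerInfo &act_info,
                const arm_compute::Window              &window)
{
    auto run_method = heuristics.kernel()->ukernel;
    run_method(src, dst, act_info, window);
}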
* * SPDX-License-Identifier: MIT * @@ -52,6 +52,12 @@ namespace kernels namespace { static const std::vector available_kernels = { + {"sme2_qs8_add_fixedpoint", + [](const CpuAddKernelDataTypeISASelectorData &data) { + return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sme2 && data.can_use_fixedpoint && + data.can_use_sme2_impl; + }, + REGISTER_QASYMM8_SIGNED_SME2(arm_compute::cpu::add_qasymm8_signed_sme2)}, {"neon_qu8_add_fixedpoint", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.can_use_fixedpoint; }, @@ -134,8 +140,15 @@ validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITens } const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(&src0, &src1, &dst); - const auto uk = CpuAddKernel::get_implementation( - CpuAddKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); +#ifdef ARM_COMPUTE_ENABLE_SME2 + const auto can_use_sme2_impl = add_q8_sme2_fixedpoint_possible(&src0, &src1, &dst); +#else /* ARM_COMPUTE_ENABLE_SME2 */ + const auto can_use_sme2_impl = false; +#endif /* ARM_COMPUTE_ENABLE_SME2 */ + const auto uk = + CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{ + src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint, can_use_sme2_impl}); + ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); return Status{}; @@ -148,8 +161,14 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy)); const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(src0, src1, dst); - const auto uk = CpuAddKernel::get_implementation( - CpuAddKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); +#ifdef ARM_COMPUTE_ENABLE_SME2 + const auto can_use_sme2_impl = add_q8_sme2_fixedpoint_possible(src0, src1, dst); +#else /* ARM_COMPUTE_ENABLE_SME2 */ + const auto can_use_sme2_impl = false; +#endif /* ARM_COMPUTE_ENABLE_SME2 */ + const auto uk = + CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{ + src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint, can_use_sme2_impl}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); @@ -191,7 +210,6 @@ void CpuAddKernel::run_op(ITensorPack &tensors, const Window &window, const Thre const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); ITensor *dst = tensors.get_tensor(TensorType::ACL_DST); - _run_method(src0, src1, dst, _policy, window); } diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp index 82e3a5ce00..9acdd9b6c2 100644 --- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp +++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023 Arm Limited. + * Copyright (c) 2019-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -92,6 +92,7 @@ Status validate_arguments(const ITensorInfo *src, { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); } else { diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h index 96ddad9d19..7e3a6fcc4b 100644 --- a/src/cpu/kernels/CpuKernelSelectionTypes.h +++ b/src/cpu/kernels/CpuKernelSelectionTypes.h @@ -90,6 +90,7 @@ struct CpuAddKernelDataTypeISASelectorData DataType dt; cpuinfo::CpuIsaInfo isa; bool can_use_fixedpoint; + bool can_use_sme2_impl; }; struct ScaleKernelDataTypeISASelectorData diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp index d7a3a77d51..bba5cc105f 100644 --- a/src/cpu/kernels/CpuMulKernel.cpp +++ b/src/cpu/kernels/CpuMulKernel.cpp @@ -128,6 +128,16 @@ inline Status validate_arguments(const ITensorInfo *src1, "Scale value not supported (Should be 1/(2^n) or 1/255"); } + // Certain data types do not support x-dimension broadcasting + const bool broadcast_x = src1->tensor_shape().x() != src2->tensor_shape().x(); + if (broadcast_x) + { + const DataType dtype1 = src1->data_type(); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dtype1 == DataType::QSYMM16 || dtype1 == DataType::U8 || + dtype1 == DataType::S16, + "X-broadcasting is not supported in certain data type configurations."); + } + return Status{}; } diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h index 676e79782b..becaa42835 100644 --- a/src/cpu/kernels/CpuSoftmaxKernel.h +++ b/src/cpu/kernels/CpuSoftmaxKernel.h @@ -37,8 +37,8 @@ namespace kernels class CpuSoftmaxKernel : public ICpuKernel { private: - using SoftmaxKernelPtr = std::add_pointer::type; + using SoftmaxKernelPtr = + std::add_pointer::type; public: CpuSoftmaxKernel() = default; diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp index c8706ff651..0604691b81 100644 --- a/src/cpu/kernels/CpuSubKernel.cpp +++ b/src/cpu/kernels/CpuSubKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Arm Limited. + * Copyright (c) 2021-2024 Arm Limited. 
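The extra CpuMulKernel check above rejects x-dimension broadcasting when the inputs are QSYMM16, U8 or S16; broadcasting along higher dimensions, and x-broadcast for the other supported types, is unchanged. A condensed restatement of just that rule (sketch only, not the full validation):

#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

// True if the x-broadcast configuration is acceptable under the new check.
bool mul_x_broadcast_supported(const ITensorInfo &src1, const ITensorInfo &src2)
{
    const bool broadcast_x = src1.tensor_shape().x() != src2.tensor_shape().x();
    if (!broadcast_x)
    {
        return true; // no x-broadcast, nothing to reject
    }
    const DataType dt = src1.data_type();
    return dt != DataType::QSYMM16 && dt != DataType::U8 && dt != DataType::S16;
}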
* * SPDX-License-Identifier: MIT * @@ -92,9 +92,12 @@ validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITens DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1); - const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(&src0, &src1, &dst); - const auto uk = CpuSubKernel::get_implementation( - CpuSubKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); + const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(&src0, &src1, &dst); + const auto can_use_sme2_add_impl = false; + + const auto uk = + CpuSubKernel::get_implementation(CpuSubKernelDataTypeISASelectorData{ + src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint, can_use_sme2_add_impl}); ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr); @@ -126,9 +129,11 @@ void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I set_shape_if_empty(*dst, out_shape); set_data_type_if_unknown(*dst, src0->data_type()); - const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(src0, src1, dst); - const auto uk = CpuSubKernel::get_implementation( - CpuSubKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint}); + const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(src0, src1, dst); + const auto can_use_sme2_add_impl = false; + const auto uk = + CpuSubKernel::get_implementation(CpuSubKernelDataTypeISASelectorData{ + src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint, can_use_sme2_add_impl}); ARM_COMPUTE_ERROR_ON_NULLPTR(uk); diff --git a/src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp b/src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp new file mode 100644 index 0000000000..76aa759dd1 --- /dev/null +++ b/src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.cpp @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2017-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.h" + +#include "src/core/common/Registrars.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/cpu/kernels/activation/list.h" +#include "src/cpu/kernels/logistic/list.h" + +#include +#include + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace heuristics +{ +namespace +{ + +bool is_fp16_lut_supported(ActivationLayerInfo::ActivationFunction func) +{ + return func == ActivationLayerInfo::ActivationFunction::LOGISTIC || + func == ActivationLayerInfo::ActivationFunction::TANH; +} + +using KernelList = std::vector; +using KernelMap = std::map; + +static const KernelList fp32_kernels = { + {"sme2_fp32_logistic", + [](const ActivationDataTypeISASelectorData &data) + { return data.f == ActivationLayerInfo::ActivationFunction::LOGISTIC && data.isa.sme2; }, + REGISTER_FP32_SME2(arm_compute::cpu::sme2_fp32_logistic)}, + {"sve_fp32_activation", + [](const ActivationDataTypeISASelectorData &data) + { return data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, + REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)}, + {"neon_fp32_activation", + [](const ActivationDataTypeISASelectorData &data) + { + ARM_COMPUTE_UNUSED(data); + return true; + }, + REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)}, +}; + +static const KernelList fp16_kernels = { + {"sve_fp16_activation_lut", + [](const ActivationDataTypeISASelectorData &data) + { return data.isa.fp16 && data.isa.sve && is_fp16_lut_supported(data.f); }, + REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation_lut)}, + {"sve_fp16_activation", + [](const ActivationDataTypeISASelectorData &data) + { return data.isa.sve && data.isa.fp16 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, + REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)}, + {"neon_fp16_activation", [](const ActivationDataTypeISASelectorData &data) { return data.isa.fp16; }, + REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)}, +}; + +static const KernelList qasymm8_kernels = { + {"sve2_q8_activation_lut", + [](const ActivationDataTypeISASelectorData &data) { + return data.cpumodel == CPUModel::A510 && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::RELU; + }, + REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)}, +#ifdef __aarch64__ + {// Neon LUT implementantion takes precedence + "neon_q8_activation_lut", + [](const ActivationDataTypeISASelectorData &data) + { return data.f != ActivationLayerInfo::ActivationFunction::RELU; }, + REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)}, +#endif // __aarch64__ + {"sve2_qu8_activation", + [](const ActivationDataTypeISASelectorData &data) + { return data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, + REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)}, + {"neon_qu8_activation", + [](const ActivationDataTypeISASelectorData &data) + { + ARM_COMPUTE_UNUSED(data); + return true; + }, + REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)}, +}; + +static const KernelList qasymm8_signed_kernels = { + {"sve2_q8_activation_lut", + [](const ActivationDataTypeISASelectorData &data) { + return data.cpumodel == CPUModel::A510 && data.isa.sve2 && + data.f != ActivationLayerInfo::ActivationFunction::RELU; + }, + REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_q8_activation_lut)}, +#ifdef __aarch64__ + {// Neon LUT implementantion takes precedence + "neon_q8_activation_lut", + 
[](const ActivationDataTypeISASelectorData &data) + { return data.f != ActivationLayerInfo::ActivationFunction::RELU; }, + REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)}, +#endif // __aarch64__ + {"sve2_qs8_activation", + [](const ActivationDataTypeISASelectorData &data) + { return data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, + REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)}, + {"neon_qs8_activation", + [](const ActivationDataTypeISASelectorData &data) + { + ARM_COMPUTE_UNUSED(data); + return true; + }, + REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)}, +}; + +static const KernelList qsymm16_kernels = { + {"sve2_qs16_activation", + [](const ActivationDataTypeISASelectorData &data) + { return data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; }, + REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)}, + {"neon_qs16_activation", + [](const ActivationDataTypeISASelectorData &data) + { + ARM_COMPUTE_UNUSED(data); + return true; + }, + REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)}, +}; + +static const KernelMap kernels = {{DataType::F32, fp32_kernels}, + {DataType::F16, fp16_kernels}, + {DataType::QASYMM8, qasymm8_kernels}, + {DataType::QASYMM8_SIGNED, qasymm8_signed_kernels}, + {DataType::QSYMM16, qsymm16_kernels}}; + +} // namespace + +void CpuActivationKernelHeuristics::choose_kernel(ActivationDataTypeISASelectorData &selector) +{ + const auto &klist = kernels.find(selector.dt); + if (klist == kernels.end()) + { + return; + } + + for (const auto &uk : klist->second) + { + if (uk.is_selected(selector) && uk.ukernel != nullptr) + { + _kernel = &uk; + return; + } + } +} + +CpuActivationKernelHeuristics::CpuActivationKernelHeuristics(const ITensorInfo *src, + const ITensorInfo *dst, + const ActivationLayerInfo &activation_info) +{ + ARM_COMPUTE_UNUSED(dst); + + // Set kernel + const DataType dtype = src->data_type(); + ActivationDataTypeISASelectorData selector{dtype, CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), + activation_info.activation()}; + choose_kernel(selector); + + // Set window and scheduling hint + int split_dim; + std::tie(_window, split_dim) = calculate_squashed_or_max_window(*src); + + // Collapse window with SME kernels in Y-Dim + if (std::string(_kernel->name) == "sme2_fp32_logistic") + { + _window = _window.collapse(_window, Window::DimY); + } + + _hint = IScheduler::Hints(split_dim); + + // Set minimum workload size + if (split_dim == Window::DimX) + { + // Don't split the work load too small if the tensor has been reinterpreted as 1D. + // This number is loosely chosen as threading overhead in each platform varies wildly. + _mws = 1536; + } +} + +/** Return minimum workload size + * + * @return Minimum workload size for requested configuration. + */ +size_t CpuActivationKernelHeuristics::mws() const +{ + return _mws; +} + +/** Return kernel's execution window + * + * @return The execution window + */ +const Window &CpuActivationKernelHeuristics::window() const +{ + return _window; +} + +/** Return the kernel to run + * + * @return The function pointer to the chosen kernel + */ +const CpuActivationKernelHeuristics::ActivationKernel *CpuActivationKernelHeuristics::kernel() +{ + return _kernel; +} + +/** Return the scheduling hint e.g. 
dimension(s) to split + * + * @return an instance of @ref IScheduler::Hints to describe the scheduling hints + */ +const IScheduler::Hints &CpuActivationKernelHeuristics::scheduler_hint() const +{ + return _hint; +} +} // namespace heuristics +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.h b/src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.h new file mode 100644 index 0000000000..1e08680ee7 --- /dev/null +++ b/src/cpu/kernels/activation/heuristics/CpuActivationKernelHeuristics.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2017-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CPU_KERNELS_ACTIVATION_HEURISTICS_CPUACTIVATIONKERNELHEURISTICS_H +#define ACL_SRC_CPU_KERNELS_ACTIVATION_HEURISTICS_CPUACTIVATIONKERNELHEURISTICS_H + +#include "arm_compute/core/CPP/ICPPKernel.h" +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/function_info/ActivationLayerInfo.h" +#include "arm_compute/runtime/IScheduler.h" + +#include "src/core/common/Macros.h" +#include "src/cpu/kernels/CpuKernelSelectionTypes.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace heuristics +{ + +class CpuActivationKernelHeuristics +{ +public: + using KernelPtr = + std::add_pointer::type; + + struct ActivationKernel + { + const char *name{nullptr}; + const ActivationDataTypeISASelectorDataPtr is_selected{nullptr}; + KernelPtr ukernel{nullptr}; + }; + + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuActivationKernelHeuristics); + + // Default constructor and destructor + CpuActivationKernelHeuristics() noexcept {}; + ~CpuActivationKernelHeuristics() = default; + + /** Similar to @ref CpuActivationKernel::configure() */ + CpuActivationKernelHeuristics(const ITensorInfo *src, + const ITensorInfo *dst, + const ActivationLayerInfo &activation_info); + + /** Return minimum workload size + * + * @return Minimum workload size for requested configuration in size_t + */ + size_t mws() const; + + /** Return kernel's execution window + * + * @return a reference to the kernel execution window of type @ref Window + */ + const Window &window() const; + + /** Return the kernel to run + * + * @return The function pointer to the chosen kernel + */ + const ActivationKernel *kernel(); + + /** Return the scheduling hint e.g. 
dimension(s) to split + * + * @return an instance of @ref IScheduler::Hints to describe the scheduling hints + */ + const IScheduler::Hints &scheduler_hint() const; + +private: + /** Chooses a kernel to run and saves it into _kernel data member + * + * @param[in] selector Selector object based on input and device configuration + */ + void choose_kernel(ActivationDataTypeISASelectorData &selector); + +private: + size_t _mws{ICPPKernel::default_mws}; + Window _window{}; + const ActivationKernel *_kernel{nullptr}; + IScheduler::Hints _hint{Window::DimY}; +}; + +} // namespace heuristics +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_ACTIVATION_HEURISTICS_CPUACTIVATIONKERNELHEURISTICS_H diff --git a/src/core/CL/CLCommandBuffer.cpp b/src/cpu/kernels/add/generic/sme2/impl.cpp similarity index 55% rename from src/core/CL/CLCommandBuffer.cpp rename to src/cpu/kernels/add/generic/sme2/impl.cpp index d094dcdaea..acc00e490c 100644 --- a/src/core/CL/CLCommandBuffer.cpp +++ b/src/cpu/kernels/add/generic/sme2/impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023 Arm Limited. + * Copyright (c) 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,45 +22,33 @@ * SOFTWARE. */ -#include "src/core/CL/CLCommandBuffer.h" +#include "src/cpu/kernels/add/generic/sme2/impl.h" -#include "arm_compute/core/CL/CLHelpers.h" -#include "arm_compute/core/CL/CLKernelLibrary.h" - -#include "src/core/CL/CLCompatCommandBuffer.h" -#include "src/core/CL/CLMutableCommandBuffer.h" +#include "arm_compute/core/Helpers.h" namespace arm_compute { - -std::unique_ptr CLCommandBuffer::create(cl_command_queue queue) +namespace cpu { - const auto &cl_device = CLKernelLibrary::get().get_device(); - const auto has_mutable_dispatch = command_buffer_mutable_dispatch_supported(cl_device); - - if (has_mutable_dispatch) - { - return std::make_unique(queue); - } - else - { - return std::make_unique(queue); - } -} - -CLCommandBuffer::CLCommandBuffer() = default; -CLCommandBuffer::~CLCommandBuffer() = default; -CLCommandBuffer::State CLCommandBuffer::state() const +bool add_q8_sme2_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { - return _state; + return add_sub_q8_sme2_fixedpoint_possible(src0, src1, dst); } -CLCommandBuffer &CLCommandBuffer::state(CLCommandBuffer::State state) +bool add_sub_q8_sme2_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst) { - _state = state; - - return *this; + const auto &in0_shape = src0->tensor_shape(); + const auto &in1_shape = src1->tensor_shape(); + const unsigned int dst_dims = dst->num_dimensions(); + // Does not support broadcasting on x + // Does not support dims > 4D output, unless input shapes are identical (therefore collapsible) + if (in0_shape.x() == in1_shape.x() && (in0_shape == in1_shape || dst_dims <= 4)) + { + return true; + } + return false; } +} // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/add/generic/sme2/impl.h b/src/cpu/kernels/add/generic/sme2/impl.h new file mode 100644 index 0000000000..906b4f360c --- /dev/null +++ b/src/cpu/kernels/add/generic/sme2/impl.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ACL_SRC_CPU_KERNELS_ADD_GENERIC_SME2_IMPL_H +#define ACL_SRC_CPU_KERNELS_ADD_GENERIC_SME2_IMPL_H + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" // Needed for ConvertPolicy +#include "arm_compute/core/Window.h" + +namespace arm_compute +{ +namespace cpu +{ + +void add_qasymm8_signed_sme2( + const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); + +bool add_q8_sme2_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + +bool add_sub_q8_sme2_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst); + +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_ADD_GENERIC_SME2_IMPL_H diff --git a/src/cpu/kernels/add/generic/sme2/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/sme2/qasymm8_signed.cpp new file mode 100644 index 0000000000..4cf369b688 --- /dev/null +++ b/src/cpu/kernels/add/generic/sme2/qasymm8_signed.cpp @@ -0,0 +1,403 @@ +/* + * Copyright (c) 2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifdef ARM_COMPUTE_ENABLE_SME2 + +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Window.h" + +namespace arm_compute +{ +namespace cpu +{ + +// Add SME kernel +void sme2_q8_signed_add_kernel( // + const int8_t *src0, + const int8_t *src1, + int8_t *dst, + const float scale_0, + const float scale_1, + const float offset, + const uintptr_t win_shape[4], + const uintptr_t src_strides[4], + const uintptr_t wei_strides[4], + const uintptr_t dst_strides[4]) +{ + struct Args + { + uintptr_t shape1; + uintptr_t shape2; + uintptr_t shape3; + const int8_t *src_0; + const int8_t *src_1; + int8_t *dst; + int32_t scale_0_5p11; + int32_t scale_1_5p11; + int32_t offset_21p11; + } args; + + // Constant used to express values in the 21p11 and 5p11 fixed point format + constexpr float _2pow11 = 2048; + + args.shape1 = win_shape[1]; + args.shape2 = win_shape[2]; + args.shape3 = win_shape[3]; + args.src_0 = src0; + args.src_1 = src1; + args.dst = dst; + args.scale_0_5p11 = static_cast(static_cast(support::cpp11::lround(scale_0 * _2pow11))); + args.scale_1_5p11 = static_cast(static_cast(support::cpp11::lround(scale_1 * _2pow11))); + args.offset_21p11 = static_cast(support::cpp11::lround(offset * _2pow11)); + + // Precondition: + assert(src_strides[0] == sizeof(int8_t)); + assert(wei_strides[0] == sizeof(int8_t)); + assert(dst_strides[0] == sizeof(int8_t)); + __asm__ volatile( + R"( + .inst 0xd503477f // smstart + .inst 0x25207811 // ptrue pn9.b + ptrue p0.b + + // ================================================== + // 3D loop opening + // ================================================== + + // ---------------------------------------------------------------- x8: body_length = (length / vl) * vl + cntb x8, ALL, MUL #2 // x8 is vl (of 8 bit values) + udiv x9, %x[length], x8 // length/vl + mul x8, x8, x9 // x8 = vl * result + + ldr x10, [%[args_ptr], %[offset_shape_3]] + ldr x11, [%[args_ptr], %[offset_src_ptr]] + ldr x12, [%[args_ptr], %[offset_wei_ptr]] + ldr x13, [%[args_ptr], %[offset_dst_ptr]] + + // Could potentially be replaced with explicit loads. + ld1rw {z1.s}, p0/z, [%[args_ptr], %[scale_0_offset]] + ld1rw {z2.s}, p0/z, [%[args_ptr], %[scale_1_offset]] + ld1rw {z3.s}, p0/z, [%[args_ptr], %[offset_offset]] + +loop_3_start%=: + // for index_3 in shape_3 downto 1 + cmp x10, #0 + b.eq loop_3_end%= + sub x10, x10, #1 + + ldr x14, [%[args_ptr], %[offset_shape_2]] + mov x15, x11 + mov x16, x12 + mov x17, x13 + +loop_2_start%=: + // for index_2 in shape_2 downto 1 + cmp x14, #0 + b.eq loop_2_end%= + sub x14, x14, #1 + + ldr x7, [%[args_ptr], %[offset_shape_1]] + mov x20, x15 + mov x21, x16 + mov x22, x17 + +loop_1_start%=: + // for index_1 in shape_2 downto 1 + cmp x7, #0 + b.eq loop_1_end%= + sub x7, x7, #1 + + mov x9, #0 // x9: index/count + +inner_loop_body_start%=: + cmp x9, x8 + b.eq inner_loop_body_end%= + + /* + Two - instead of the maximal four - registers of each input are processed per loop iteration + due to the need for at least 32 registers just for the data processing which leaves no space + for the registers that contain the pre-loop loaded constants. + Once the would be 4 registers are expanded into 16 as the data goes from 8 to 32-bit, the + same number of registers (another 16) is needed to accumulate onto the offset constant for + each of those 16 lanes. One advantage of only processing two registers per loop is that more + of the elements to be processed will be in this vectorised loop instead of the left-over one. 
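+
+        Note on the arithmetic (it follows the Args setup above): both input scales were rounded to
+        5p11 fixed point and the combined offset to 21p11 (everything pre-scaled by 2^11), so once each
+        int8 lane has been widened to int32 the plain MLA instructions keep every intermediate value
+        with 11 fractional bits, and the final sqrshr #11 rescales, rounds and saturates back to int8
+        in a single instruction.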
+ */ + + // Load src0 + .inst 0xa0090684 // ld1b {z4.b-z5.b}, pn9/z, [x20, x9] + + // Widen src0 to 16 bits + .inst 0xc175e08c // sunpk {z12.h-z15.h}, {z4.b-z5.b} + + // Widen src0 to 32-bits + .inst 0xc1b5e184 // sunpk {z4.s-z7.s}, {z12.h-z13.h} + .inst 0xc1b5e1c8 // sunpk {z8.s-z11.s}, {z14.h-z15.h} + + // Duplicate the offset value into registers for all the values to be processed + mov z16.d, z3.d + mov z17.d, z3.d + mov z18.d, z3.d + mov z19.d, z3.d + mov z20.d, z3.d + mov z21.d, z3.d + mov z22.d, z3.d + mov z23.d, z3.d + + // MLA Fixed Point multiplication and accumulation integer + // Multiply src0 by scale_0 (z1) and add offset + mla z16.s, p0/m, z4.s, z1.s + mla z17.s, p0/m, z5.s, z1.s + mla z18.s, p0/m, z6.s, z1.s + mla z19.s, p0/m, z7.s, z1.s + mla z20.s, p0/m, z8.s, z1.s + mla z21.s, p0/m, z9.s, z1.s + mla z22.s, p0/m, z10.s, z1.s + mla z23.s, p0/m, z11.s, z1.s + + //Load src1 into the same registers that were used for src0 since they are no longer needed + .inst 0xa00906a4 // ld1b {z4.b-z5.b}, pn9/z, [x21, x9] + + // Widen src1 to 16 bits + .inst 0xc175e08c // sunpk {z12.h-z15.h}, {z4.b-z5.b} + + // Widen src1 32-bits + .inst 0xc1b5e184 // sunpk {z4.s-z7.s}, {z12.h-z13.h} + .inst 0xc1b5e1c8 // sunpk {z8.s-z11.s}, {z14.h-z15.h} + + // MLA Fixed Point multiplication and accumulation integer + // Multiply src1 by scale_1 (z2) and accumulate into registers containing src0*scale_0 + offset + mla z16.s, p0/m, z4.s, z2.s + mla z17.s, p0/m, z5.s, z2.s + mla z18.s, p0/m, z6.s, z2.s + mla z19.s, p0/m, z7.s, z2.s + mla z20.s, p0/m, z8.s, z2.s + mla z21.s, p0/m, z9.s, z2.s + mla z22.s, p0/m, z10.s, z2.s + mla z23.s, p0/m, z11.s, z2.s + + // Int32 to Int8 saturate + .inst 0xc175da85 // sqrshr z5.b, {z20.s-z23.s}, #11 + .inst 0xc175da04 // sqrshr z4.b, {z16.s-z19.s}, #11 + // Store + .inst 0xa02906c4 // st1b {z4.b-z5.b}, pn9, [x22, x9] + + incb x9, ALL, MUL #2 + b inner_loop_body_start%= +inner_loop_body_end%=: + +inner_loop_leftover_start%=: + whilelo p1.b, x9, %x[length] // While x9info(); + const auto *src1_info = src1->info(); + const auto *dst_info = dst->info(); + + const UniformQuantizationInfo src0_q_info = src0_info->quantization_info().uniform(); + const UniformQuantizationInfo src1_q_info = src1_info->quantization_info().uniform(); + const UniformQuantizationInfo dst_q_info = dst_info->quantization_info().uniform(); + + const auto &src0_strides_bytes = src0_info->strides_in_bytes(); + const auto &src1_strides_bytes = src1_info->strides_in_bytes(); + const auto &dst_strides_bytes = dst_info->strides_in_bytes(); + + // NOTE: This kernel does not support shapes above 4D (Unless excecution window has been collapsed) + assert(window.num_iterations(4) == 1 && window.num_iterations(5) == 1); + + // Note : The window is expected to handle broadcasting in higher axis than x by setting relevant strides to 0. + const uintptr_t shape[] = { + window.num_iterations(0), + window.num_iterations(1), + window.num_iterations(2), + window.num_iterations(3), + }; + + Window input0_win = window.broadcast_if_dimension_le_one(src0_info->tensor_shape()); + Window input1_win = window.broadcast_if_dimension_le_one(src1_info->tensor_shape()); + + // First dim is always datasize. If broadcasting in other dims, set stride to 0. + uintptr_t src0_strides[] = {src0_strides_bytes[0], (input0_win.is_broadcasted(1)) ? 0 : src0_strides_bytes[1], + (input0_win.is_broadcasted(2)) ? 0 : src0_strides_bytes[2], + (input0_win.is_broadcasted(3)) ? 
0 : src0_strides_bytes[3]}; + uintptr_t src1_strides[] = {src1_strides_bytes[0], (input1_win.is_broadcasted(1)) ? 0 : src1_strides_bytes[1], + (input1_win.is_broadcasted(2)) ? 0 : src1_strides_bytes[2], + (input1_win.is_broadcasted(3)) ? 0 : src1_strides_bytes[3]}; + + const uintptr_t dst_strides[] = { + dst_strides_bytes[0], + dst_strides_bytes[1], + dst_strides_bytes[2], + dst_strides_bytes[3], + }; + + const uintptr_t src0_offset = window[0].start() * src0_strides[0] + window[1].start() * src0_strides[1] + + window[2].start() * src0_strides[2] + window[3].start() * src0_strides[3] + + src0->info()->offset_first_element_in_bytes(); + const uintptr_t src1_offset = window[0].start() * src1_strides[0] + window[1].start() * src1_strides[1] + + window[2].start() * src1_strides[2] + window[3].start() * src1_strides[3] + + src1->info()->offset_first_element_in_bytes(); + const uintptr_t dst_offset = window[0].start() * dst_strides[0] + window[1].start() * dst_strides[1] + + window[2].start() * dst_strides[2] + window[3].start() * dst_strides[3] + + dst->info()->offset_first_element_in_bytes(); + + const auto *src0_ptr = reinterpret_cast(src0->buffer() + src0_offset); + const auto *src1_ptr = reinterpret_cast(src1->buffer() + src1_offset); + auto *dst_ptr = reinterpret_cast(dst->buffer() + dst_offset); + + // Calculate or retrieve necessary offsets/scale values. + const int32_t offset_a = src0_q_info.offset; + const int32_t offset_b = src1_q_info.offset; + const float scale0 = src0_q_info.scale / dst_q_info.scale; + const float scale1 = src1_q_info.scale / dst_q_info.scale; + const float offset = static_cast(dst_q_info.offset) - static_cast(offset_a) * scale0 - + static_cast(offset_b) * scale1; + + sme2_q8_signed_add_kernel(src0_ptr, src1_ptr, dst_ptr, scale0, scale1, offset, shape, src0_strides, src1_strides, + dst_strides); +} + +} // namespace cpu +} // namespace arm_compute + +#endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/cpu/kernels/add/list.h b/src/cpu/kernels/add/list.h index 1040c39a41..7a5dc5a176 100644 --- a/src/cpu/kernels/add/list.h +++ b/src/cpu/kernels/add/list.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2022 Arm Limited. + * Copyright (c) 2020-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,10 +21,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_KERNELS_ADD_LIST_H -#define SRC_CORE_KERNELS_ADD_LIST_H +#ifndef ACL_SRC_CPU_KERNELS_ADD_LIST_H +#define ACL_SRC_CPU_KERNELS_ADD_LIST_H #include "src/cpu/kernels/add/generic/neon/impl.h" +#include "src/cpu/kernels/add/generic/sme2/impl.h" #include "src/cpu/kernels/add/generic/sve/impl.h" namespace arm_compute @@ -51,9 +52,10 @@ DECLARE_ADD_KERNEL(add_s32_sve); DECLARE_ADD_KERNEL(add_qasymm8_sve2); DECLARE_ADD_KERNEL(add_qasymm8_signed_sve2); DECLARE_ADD_KERNEL(add_qsymm16_sve2); +DECLARE_ADD_KERNEL(add_qasymm8_signed_sme2); #undef DECLARE_ADD_KERNEL } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_KERNELS_ADD_LIST_H +#endif // ACL_SRC_CPU_KERNELS_ADD_LIST_H diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp index 296fe88791..e94c92b5ce 100644 --- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp +++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Arm Limited. + * Copyright (c) 2021-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -281,6 +281,7 @@ Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo * if (is_data_type_quantized_per_channel(weights->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size()); } else diff --git a/src/cpu/kernels/pool2d/neon/quantized.h b/src/cpu/kernels/pool2d/neon/quantized.h index 38f1b2f1f9..8dd43ad4b1 100644 --- a/src/cpu/kernels/pool2d/neon/quantized.h +++ b/src/cpu/kernels/pool2d/neon/quantized.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022 Arm Limited. + * Copyright (c) 2021-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_NEON_KERNELS_QUANTIZED_H -#define SRC_CORE_NEON_KERNELS_QUANTIZED_H +#ifndef ACL_SRC_CPU_KERNELS_POOL2D_NEON_QUANTIZED_H +#define ACL_SRC_CPU_KERNELS_POOL2D_NEON_QUANTIZED_H #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" @@ -80,13 +80,10 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right); const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom); - const float32x4_t half_scale_v = vdupq_n_f32(0.5f); - const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); + const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); - const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; - // "new_offset" doesn't have to consider the "half_scale_v" in its computation - // With a requantization performed in a single step there won't be uncertainties introduced + const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; const int32_t new_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); @@ -163,11 +160,18 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, else { const float32x4_t scale_v = vdupq_n_f32(scale); - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); + +#ifdef __aarch64__ + vres1 = vcvtnq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtnq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtnq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtnq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres4), scale_v)); +#else // __aarch64__ + vres1 = vcvtq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres4), scale_v)); +#endif // __aarch64__ 
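+
+            // Only the AArch64 build switches to vcvtnq_* (round to nearest, ties to even), mirroring
+            // the RoundingPolicy::TO_NEAREST_EVEN used in the scalar tail further down; the other
+            // build keeps the truncating vcvtq_* conversion, so the two paths can differ by one LSB
+            // whenever the rescaled value is not an exact integer (e.g. 25 * 0.1875 = 4.6875 stores 5
+            // with round-to-nearest but 4 with truncation).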
const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); @@ -268,8 +272,11 @@ void poolingMxN_q8_neon_nhwc(const ITensor *src, } else { - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - res = static_cast(0.5f + static_cast(res) * scale); +#ifdef __aarch64__ + res = arm_compute::round(static_cast(res) * scale, RoundingPolicy::TO_NEAREST_EVEN); +#else // __aarch64__ + res = arm_compute::round(static_cast(res) * scale, RoundingPolicy::TO_ZERO); +#endif // __aarch64__ // Store result *(reinterpret_cast(out.ptr()) + x_off) = res; @@ -829,4 +836,4 @@ void poolingMxN_quantized_neon_nchw(const ITensor *src, } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_QUANTIZED_H +#endif // ACL_SRC_CPU_KERNELS_POOL2D_NEON_QUANTIZED_H diff --git a/src/cpu/kernels/pool3d/neon/quantized.h b/src/cpu/kernels/pool3d/neon/quantized.h index 8819907901..fe7e1bc965 100644 --- a/src/cpu/kernels/pool3d/neon/quantized.h +++ b/src/cpu/kernels/pool3d/neon/quantized.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H -#define SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H +#ifndef ACL_SRC_CPU_KERNELS_POOL3D_NEON_QUANTIZED_H +#define ACL_SRC_CPU_KERNELS_POOL3D_NEON_QUANTIZED_H #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" @@ -83,13 +83,10 @@ void avg_poolingMxNxD_q8_neon_ndhwc( Iterator out(dst0, window_out); - const float32x4_t half_scale_v = vdupq_n_f32(0.5f); - const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); + const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); - const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; - // "new_offset" doesn't have to consider the "half_scale_v" in its computation - // With a requantization performed in a single step there won't be uncertainties introduced + const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; const int32_t new_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); @@ -171,12 +168,18 @@ void avg_poolingMxNxD_q8_neon_ndhwc( else { const float32x4_t scale_v = vdupq_n_f32(scale); - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); +#ifdef __aarch64__ + vres1 = vcvtnq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtnq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtnq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtnq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres4), scale_v)); +#else // __aarch64__ + vres1 = vcvtq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres2), scale_v)); + vres3 = 
vcvtq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32(wrapper::vmul(vcvtq_f32_q32(vres4), scale_v)); +#endif // __aarch64__ const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); // Store result @@ -217,8 +220,11 @@ void avg_poolingMxNxD_q8_neon_ndhwc( } else { - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - res = static_cast(0.5f + static_cast(res) * scale); +#ifdef __aarch64__ + res = arm_compute::round(static_cast(res) * scale, RoundingPolicy::TO_NEAREST_EVEN); +#else // __aarch64__ + res = arm_compute::round(static_cast(res) * scale, RoundingPolicy::TO_ZERO); +#endif // __aarch64__ // Store result *(reinterpret_cast(out.ptr()) + x_off) = res; @@ -396,4 +402,4 @@ void max_poolingMxNxD_q8_neon_ndhwc( } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H +#endif // ACL_SRC_CPU_KERNELS_POOL3D_NEON_QUANTIZED_H diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp index 425fcf7ac6..a364c1331e 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp @@ -39,7 +39,7 @@ void neon_fp16_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr) + const void *lut_ptr) { ARM_COMPUTE_UNUSED(lut_ptr); if (axis == 0) @@ -58,14 +58,14 @@ template void neon_fp16_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); template void neon_fp16_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp index a64946eb74..a4ded572fe 100644 --- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp @@ -37,7 +37,7 @@ void neon_fp32_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr) + const void *lut_ptr) { ARM_COMPUTE_UNUSED(lut_ptr); if (axis == 0) @@ -56,14 +56,14 @@ template void neon_fp32_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); template void neon_fp32_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp index 31baf8a9df..8448fb8088 100644 --- a/src/cpu/kernels/softmax/generic/neon/impl.cpp +++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp @@ -40,8 +40,9 @@ void neon_softmax_x_quantized( const int input_width = in->info()->valid_region().shape.x(); - const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; - const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta); + const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; + const float32x4_t scale_beta_vec = vdupq_n_f32(scale_beta); + const UniformQuantizationInfo out_qinfo = out->info()->quantization_info().uniform(); Iterator in_it(in, window); Iterator 
out_it(out, window); @@ -198,18 +199,22 @@ void neon_softmax_x_quantized( int x = 0; for (; x <= (input_width - vec_size); x += vec_size) { - using int_vec_type = wrapper::traits::neon_vector_t; - float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); - int_vec_type normalized_value{}; + using int_vec_type = wrapper::traits::neon_vector_t; + const float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); + int_vec_type normalized_value{}; if (IS_LOG) { - const float32x4x4_t sub = { - vsubq_f32(vec_in.val[0], sum_vec), - vsubq_f32(vec_in.val[1], sum_vec), - vsubq_f32(vec_in.val[2], sum_vec), - vsubq_f32(vec_in.val[3], sum_vec), + const float32x4_t out_offset = vdupq_n_f32(static_cast(out_qinfo.offset)); + const float32x4_t out_inv_scale = vdupq_n_f32(1.f / out_qinfo.scale); + + const float32x4x4_t normalized_value_f = { + vmlaq_f32(out_offset, vsubq_f32(vec_in.val[0], sum_vec), out_inv_scale), + vmlaq_f32(out_offset, vsubq_f32(vec_in.val[1], sum_vec), out_inv_scale), + vmlaq_f32(out_offset, vsubq_f32(vec_in.val[2], sum_vec), out_inv_scale), + vmlaq_f32(out_offset, vsubq_f32(vec_in.val[3], sum_vec), out_inv_scale), }; - normalized_value = convert_float_to_int(sub); + + normalized_value = convert_float_to_int(normalized_value_f); } else { @@ -238,7 +243,13 @@ void neon_softmax_x_quantized( { if (IS_LOG) { - out_ptr[x] = utils::cast::saturate_cast(tmp_ptr[x] - sum_transformed); + const float diff = tmp_ptr[x] - sum_transformed; +#ifdef __aarch64__ + constexpr auto policy = RoundingPolicy::TO_NEAREST_EVEN; +#else // __aarch64__ + constexpr auto policy = RoundingPolicy::TO_ZERO; +#endif // __aarch64__ + out_ptr[x] = Qasymm8QuantizationHelper::quantize(diff, out_qinfo, policy); } else { @@ -277,6 +288,8 @@ void neon_softmax_non_x_quantized( const int axis_width = in_info->dimension(axis); const int end_actual = std::min(window[0].end(), x_width); + const UniformQuantizationInfo out_qinfo = out->info()->quantization_info().uniform(); + execute_window_loop( window, [&](const Coordinates &winCoords) @@ -488,13 +501,21 @@ void neon_softmax_non_x_quantized( if (IS_LOG) { - const float32x4x4_t sub = { - vsubq_f32(vec_in.val[0], vec_sum_transformed.val[0]), - vsubq_f32(vec_in.val[1], vec_sum_transformed.val[1]), - vsubq_f32(vec_in.val[2], vec_sum_transformed.val[2]), - vsubq_f32(vec_in.val[3], vec_sum_transformed.val[3]), + const float32x4_t out_offset = vdupq_n_f32(static_cast(out_qinfo.offset)); + const float32x4_t out_inv_scale = vdupq_n_f32(1.f / out_qinfo.scale); + + const float32x4x4_t normalized_value_f = { + vmlaq_f32(out_offset, vsubq_f32(vec_in.val[0], vec_sum_transformed.val[0]), + out_inv_scale), + vmlaq_f32(out_offset, vsubq_f32(vec_in.val[1], vec_sum_transformed.val[1]), + out_inv_scale), + vmlaq_f32(out_offset, vsubq_f32(vec_in.val[2], vec_sum_transformed.val[2]), + out_inv_scale), + vmlaq_f32(out_offset, vsubq_f32(vec_in.val[3], vec_sum_transformed.val[3]), + out_inv_scale), }; - normalized_value = convert_float_to_int(sub); + + normalized_value = convert_float_to_int(normalized_value_f); } else { @@ -528,19 +549,28 @@ void neon_softmax_non_x_quantized( float *const base_ptr_tmp = (i * tmp_axis_stride) + reinterpret_cast(tmp_ptr); if (IS_LOG) { +#ifdef __aarch64__ + constexpr auto policy = RoundingPolicy::TO_NEAREST_EVEN; +#else // __aarch64__ + constexpr auto policy = RoundingPolicy::TO_ZERO; +#endif // __aarch64__ + for (int k = 0; k < num_remaining_full; ++k) { for (int j = 0; j < 4; ++j) { - *(base_ptr_out + (4 * k + j)) = utils::cast::saturate_cast( - (*(base_ptr_tmp + (4 * k + j)) - 
vec_sum_transformed.val[k][j])); + const float diff = *(base_ptr_tmp + (4 * k + j)) - vec_sum_transformed.val[k][j]; + + *(base_ptr_out + (4 * k + j)) = + Qasymm8QuantizationHelper::quantize(diff, out_qinfo, policy); } } for (int j = 0; j < num_remaining_partial; ++j) { + const float diff = *(base_ptr_tmp + (4 * num_remaining_full + j)) - + vec_sum_transformed.val[num_remaining_full][j]; *(base_ptr_out + (4 * num_remaining_full + j)) = - utils::cast::saturate_cast(*(base_ptr_tmp + (4 * num_remaining_full + j)) - - vec_sum_transformed.val[num_remaining_full][j]); + Qasymm8QuantizationHelper::quantize(diff, out_qinfo, policy); } } else diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp index 369f9bb005..e16eff3ac6 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp @@ -36,7 +36,7 @@ void neon_qasymm8_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr) + const void *lut_ptr) { ARM_COMPUTE_UNUSED(lut_ptr); if (axis == 0) @@ -55,14 +55,14 @@ template void neon_qasymm8_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); template void neon_qasymm8_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp index 594ceb7654..a2832dcca2 100644 --- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp @@ -36,7 +36,7 @@ void neon_qasymm8_signed_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr) + const void *lut_ptr) { ARM_COMPUTE_UNUSED(lut_ptr); if (axis == 0) @@ -55,14 +55,14 @@ template void neon_qasymm8_signed_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); template void neon_qasymm8_signed_softmax(const ITensor *in, void *const tmp, ITensor *out, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/kernels/softmax/generic/sme2/fp16.cpp b/src/cpu/kernels/softmax/generic/sme2/fp16.cpp index e70c9f4793..95550548bf 100644 --- a/src/cpu/kernels/softmax/generic/sme2/fp16.cpp +++ b/src/cpu/kernels/softmax/generic/sme2/fp16.cpp @@ -720,13 +720,8 @@ loop_3_end%=: ); } -void sme2_fp16_softmax(const ITensor *in, - void *const, - ITensor *out, - const float beta, - int axis, - const Window &window, - const float *lut_ptr) +void sme2_fp16_softmax( + const ITensor *in, void *const, ITensor *out, const float beta, int axis, const Window &window, const void *lut_ptr) { ARM_COMPUTE_UNUSED(lut_ptr); ARM_COMPUTE_UNUSED(axis); diff --git a/src/cpu/kernels/softmax/generic/sme2/fp32.cpp b/src/cpu/kernels/softmax/generic/sme2/fp32.cpp index 5e29d51746..d08bed7ad9 100644 --- a/src/cpu/kernels/softmax/generic/sme2/fp32.cpp +++ b/src/cpu/kernels/softmax/generic/sme2/fp32.cpp @@ -524,13 +524,8 @@ loop_3_end%=: ); } -void sme2_fp32_softmax(const ITensor *in, - void *const, - ITensor *out, - const float beta, - int axis, - const Window &window, - const float *lut_ptr) +void sme2_fp32_softmax( + const ITensor 
*in, void *const, ITensor *out, const float beta, int axis, const Window &window, const void *lut_ptr) { ARM_COMPUTE_UNUSED(lut_ptr); ARM_COMPUTE_UNUSED(axis); diff --git a/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp index 8bde7dc907..f3d443f9aa 100644 --- a/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp +++ b/src/cpu/kernels/softmax/generic/sme2/qasymm8.cpp @@ -566,10 +566,12 @@ void sme2_qasymm8_softmax_lut_512VL(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr) + const void *lut_ptr) { ARM_COMPUTE_UNUSED(axis); + auto lut_fp32_ptr = reinterpret_cast(lut_ptr); + const auto *src_info = in->info(); const auto *dst_info = out->info(); @@ -624,7 +626,7 @@ void sme2_qasymm8_softmax_lut_512VL(const ITensor *in, auto *k_tmp = reinterpret_cast(tmp_float_ptr + k_tmp_offset); auto *k_dst = reinterpret_cast(out->buffer() + k_dst_offset); - sme2_qasymm8_softmax_kernel_512VL(k_src, k_dst, beta, k_shape, k_src_strides, k_dst_strides, lut_ptr, k_tmp); + sme2_qasymm8_softmax_kernel_512VL(k_src, k_dst, beta, k_shape, k_src_strides, k_dst_strides, lut_fp32_ptr, k_tmp); } } // namespace cpu diff --git a/src/cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp index f3667d4ad8..4a71914006 100644 --- a/src/cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp +++ b/src/cpu/kernels/softmax/generic/sme2/qasymm8_signed.cpp @@ -587,10 +587,12 @@ void sme2_qasymm8_signed_softmax_lut_512VL(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr) + const void *lut_ptr) { ARM_COMPUTE_UNUSED(axis); + auto lut_fp32_ptr = reinterpret_cast(lut_ptr); + const auto *src_info = in->info(); const auto *dst_info = out->info(); @@ -645,7 +647,8 @@ void sme2_qasymm8_signed_softmax_lut_512VL(const ITensor *in, auto *k_tmp = reinterpret_cast(tmp_float_ptr + k_tmp_offset); auto *k_dst = reinterpret_cast(out->buffer() + k_dst_offset); - sme2_qasymm8_signed_softmax_kernel_512VL(k_src, k_dst, beta, k_shape, k_src_strides, k_dst_strides, lut_ptr, k_tmp); + sme2_qasymm8_signed_softmax_kernel_512VL(k_src, k_dst, beta, k_shape, k_src_strides, k_dst_strides, lut_fp32_ptr, + k_tmp); } } // namespace cpu diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h index 7bbb265022..9b11f1eaed 100644 --- a/src/cpu/kernels/softmax/list.h +++ b/src/cpu/kernels/softmax/list.h @@ -31,7 +31,7 @@ namespace cpu #define DECLARE_SOFTMAX_KERNEL(func_name) \ template \ void func_name(const ITensor *in, void *const tmp, ITensor *out, const float beta, int axis, const Window &window, \ - const float *lut_ptr) + const void *lut_ptr) DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax); DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax); @@ -46,7 +46,7 @@ void sme2_fp32_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); void sme2_fp16_softmax(const ITensor *in, void *const tmp, @@ -54,7 +54,7 @@ void sme2_fp16_softmax(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); void sme2_qasymm8_softmax_lut_512VL(const ITensor *in, void *const tmp, @@ -62,7 +62,7 @@ void sme2_qasymm8_softmax_lut_512VL(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); void sme2_qasymm8_signed_softmax_lut_512VL(const ITensor *in, void *const tmp, @@ -70,7 +70,7 @@ void 
sme2_qasymm8_signed_softmax_lut_512VL(const ITensor *in, const float beta, int axis, const Window &window, - const float *lut_ptr); + const void *lut_ptr); #endif // ARM_COMPUTE_ENABLE_SME2 diff --git a/src/cpu/kernels/sub/neon/impl.h b/src/cpu/kernels/sub/neon/impl.h index 6123f7e25a..d641ec4ec9 100644 --- a/src/cpu/kernels/sub/neon/impl.h +++ b/src/cpu/kernels/sub/neon/impl.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Arm Limited. + * Copyright (c) 2021-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -91,30 +91,45 @@ void sub_same_neon( // Compute S elements per iteration int x = window_start_x; - for (; x <= (window_end_x - window_step_x); x += window_step_x) + if (is_broadcast_input_2) { - const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) - : wrapper::vsub(broadcast_value_vec, non_broadcast_v); - if (is_broadcast_input_2) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - res = wrapper::vmul(res, wrapper::vdup_n(static_cast(-1), ExactTagType{})); + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::vqsub(non_broadcast_v, broadcast_value_vec) + : wrapper::vsub(non_broadcast_v, broadcast_value_vec); + wrapper::vstore(output_ptr + x, res); } - wrapper::vstore(output_ptr + x, res); - } - // Compute left-over elements - for (; x < window_end_x; ++x) + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::sub_sat(non_broadcast_v, broadcast_value) + : non_broadcast_v - broadcast_value; + + *(output_ptr + x) = res; + } + } + else { - const auto non_broadcast_v = *(non_broadcast_input_ptr + x); - auto res = - is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v; - if (is_broadcast_input_2) + for (; x <= (window_end_x - window_step_x); x += window_step_x) { - res = static_cast(-1) * res; + const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) + : wrapper::vsub(broadcast_value_vec, non_broadcast_v); + wrapper::vstore(output_ptr + x, res); } - *(output_ptr + x) = res; + // Compute left-over elements + for (; x < window_end_x; ++x) + { + const auto non_broadcast_v = *(non_broadcast_input_ptr + x); + auto res = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) + : broadcast_value - non_broadcast_v; + + *(output_ptr + x) = res; + } } }, broadcast_input, non_broadcast_input, output); diff --git a/src/cpu/kernels/sub/neon/qsymm16.cpp b/src/cpu/kernels/sub/neon/qsymm16.cpp index 23e4b03843..d6b9a73727 100644 --- a/src/cpu/kernels/sub/neon/qsymm16.cpp +++ b/src/cpu/kernels/sub/neon/qsymm16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021 Arm Limited. + * Copyright (c) 2021, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -55,8 +55,6 @@ void sub_qsymm16_neon( const UniformQuantizationInfo iq2_info = src1->info()->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform(); - const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); - const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale); if (is_broadcast_across_x) @@ -69,6 +67,9 @@ void sub_qsymm16_neon( const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform(); const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform(); + const float32x4_t vbroadcast_scale = vdupq_n_f32(broadcast_qinfo.scale); + const float32x4_t vnon_broadcast_scale = vdupq_n_f32(non_broadcast_qinfo.scale); + // Clear X Dimension on execution window as we handle manually non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); @@ -87,8 +88,8 @@ void sub_qsymm16_neon( const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value); const float32x4x2_t bf = {{ - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vbroadcast_scale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vbroadcast_scale), }}; const float bfs = static_cast(broadcast_value) * broadcast_qinfo.scale; @@ -98,24 +99,24 @@ void sub_qsymm16_neon( { const int16x8_t a = vld1q_s16(non_broadcast_input_ptr + x); const float32x4x2_t af = {{ - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1), - vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vnon_broadcast_scale), + vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vnon_broadcast_scale), }}; const int32x4x4_t rf = {{ #ifdef __aarch64__ - vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) - : vsubq_f32(af.val[0], bf.val[0]), + vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(af.val[0], bf.val[0]) + : vsubq_f32(bf.val[0], af.val[0]), invvscaleo)), - vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) - : vsubq_f32(af.val[1], bf.val[1]), + vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(af.val[1], bf.val[1]) + : vsubq_f32(bf.val[1], af.val[1]), invvscaleo)), #else //__aarch64__ - vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) - : vsubq_f32(af.val[0], bf.val[0]), + vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(af.val[0], bf.val[0]) + : vsubq_f32(bf.val[0], af.val[0]), invvscaleo)), - vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) - : vsubq_f32(af.val[1], bf.val[1]), + vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(af.val[1], bf.val[1]) + : vsubq_f32(bf.val[1], af.val[1]), invvscaleo)), #endif //__aarch64__ }}; @@ -128,13 +129,16 @@ void sub_qsymm16_neon( for (; x < window_end_x; ++x) { const float afs = static_cast(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale; - *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info); + *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? 
(afs - bfs) : (bfs - afs), oq_info); } }, broadcast_input, non_broadcast_input, output); } else { + const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale); + const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale); + // Clear X Dimension on execution window as we handle manually input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp index 26ca2ee783..23f51cda24 100644 --- a/src/cpu/operators/CpuConv2d.cpp +++ b/src/cpu/operators/CpuConv2d.cpp @@ -233,10 +233,9 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *i return ConvolutionMethod::GEMM; } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - // This heuristics only applies to F16 data type on A55r1 - if (NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && - input->data_type() == DataType::F16) +#if ARM_COMPUTE_ENABLE_FP16 + // This heuristics only applies to F16 + if (CPUInfo::get().has_fp16() && enable_fast_math && input->data_type() == DataType::F16) { // Exclude known bad winograd configs (and defaults to GEMM) const std::vector known_bad_winograd_f16_with_fastmath_configs = { @@ -270,7 +269,7 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *i return ConvolutionMethod::GEMM; } } -#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#endif // ARM_COMPUTE_ENABLE_FP16 // For 1x1 convolutions run the default GEMM if (weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1) diff --git a/src/cpu/operators/CpuDepthwiseConv2d.cpp b/src/cpu/operators/CpuDepthwiseConv2d.cpp index 54075f2afa..5fe91aa5d8 100644 --- a/src/cpu/operators/CpuDepthwiseConv2d.cpp +++ b/src/cpu/operators/CpuDepthwiseConv2d.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023 Arm Limited. + * Copyright (c) 2021-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -45,12 +45,6 @@ Status validate_arguments_optimized(const ITensorInfo *src, const ConvolutionInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::F16, DataType::F32); - if (!is_data_type_quantized_per_channel(weights->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - } ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1); const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp index 4d84c1b257..ae74ca8d64 100644 --- a/src/runtime/CL/CLScheduler.cpp +++ b/src/runtime/CL/CLScheduler.cpp @@ -105,7 +105,9 @@ CLScheduler::CLScheduler() _backend_type(CLBackendType::Native), _job_chaining_enabled(true), _job_chaining_size(1), - _job_chaining_count(0) + _job_chaining_count(0), + _enqueue_count(0), + _flush_count(0) { } @@ -201,14 +203,11 @@ void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool f void CLScheduler::flush_queue(bool flush) { - if (flush) - { - _queue.flush(); - _job_chaining_count = 0; - return; - } + _enqueue_count++; + _flush_count += flush; + const float flush_ratio = _flush_count / (float)_enqueue_count; - if (_job_chaining_enabled) + if (_enqueue_count > 100 && flush_ratio > 0.5f && _job_chaining_enabled) { ++_job_chaining_count; @@ -228,6 +227,11 @@ void CLScheduler::flush_queue(bool flush) _queue.flush(); } } + else if (flush) + { + _job_chaining_count = 0; + _queue.flush(); + } } void CLScheduler::enqueue(ICLKernel &kernel, bool flush) diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp index e6457218c7..b109288bfb 100644 --- a/src/runtime/CL/CLTensorAllocator.cpp +++ b/src/runtime/CL/CLTensorAllocator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2021 Arm Limited. + * Copyright (c) 2016-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -168,6 +168,11 @@ void CLTensorAllocator::free() info().set_is_resizable(true); } +bool CLTensorAllocator::is_allocated() const +{ + return _memory.region() != nullptr; +} + Status CLTensorAllocator::import_memory(cl::Buffer buffer) { ARM_COMPUTE_RETURN_ERROR_ON(buffer.get() == nullptr); diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp index 7767b45a01..bb3edb2323 100644 --- a/src/runtime/CL/functions/CLConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -49,6 +49,7 @@ struct CLConvolutionLayer::Impl WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; std::unique_ptr func{nullptr}; + bool is_prepared{false}; }; CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) @@ -126,9 +127,10 @@ void CLConvolutionLayer::configure(const CLCompileContext &compile_context, _impl->aux_mem_req = _impl->op->workspace(); _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; - _impl->workspace = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->prep_pack, /* allocate_now */ false); } + _impl->is_prepared = false; } Status CLConvolutionLayer::validate(const ITensorInfo *input, @@ -208,16 +210,22 @@ void CLConvolutionLayer::run() void CLConvolutionLayer::prepare() { - if (_impl->func) + if (!_impl->is_prepared) { - _impl->func->prepare(); - } - else - { - _impl->op->prepare(_impl->prep_pack); + if (_impl->func) + { + _impl->func->prepare(); + } + else + { + allocate_tensors(_impl->aux_mem_req, _impl->workspace); + _impl->op->prepare(_impl->prep_pack); + + // Release temporary tensors that are only used in prepare stage + release_temporaries(_impl->aux_mem_req, _impl->workspace); + } - // Release temporary tensors that are only used in prepare stage - release_temporaries(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp index b30f9e701f..6296b8e054 100644 --- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp +++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -97,8 +97,8 @@ void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, { _impl->aux_mem_req = _impl->op->workspace(); _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; - _impl->workspace = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->run_pack, /* allocate_now */ false); } else { @@ -121,10 +121,7 @@ Status CLFullyConnectedLayer::validate(const ITensorInfo *input, void CLFullyConnectedLayer::run() { - if (!_impl->dynamic_weights) - { - prepare(); - } + prepare(); MemoryGroupResourceScope scope_mg(_impl->memory_group); _impl->op->run(_impl->run_pack); @@ -134,26 +131,31 @@ void CLFullyConnectedLayer::prepare() { if (!_impl->is_prepared) { - _impl->op->prepare(_impl->run_pack); + allocate_tensors(_impl->aux_mem_req, _impl->workspace); + if (!_impl->dynamic_weights) + { + _impl->op->prepare(_impl->run_pack); - // Release temporary tensors that are only used in prepare stage - release_temporaries(_impl->aux_mem_req, _impl->workspace); - _impl->is_prepared = true; + // Release temporary tensors that are only used in prepare stage + release_temporaries(_impl->aux_mem_req, _impl->workspace); - // Handle weights managed infrastructure - if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights)) - { - // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare - // This is for cases where multiple functions share the same b (weights) - // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference - const ITensor *original_b = _impl->original_weights; - if (!original_b->is_used()) + // Handle weights managed infrastructure + if (_impl->weights_manager != nullptr && + _impl->weights_manager->are_weights_managed(_impl->original_weights)) { - _impl->weights_manager->pre_mark_as_unused(original_b); + // Ensure that b gets marked as unused (memory released) only after the last function which uses b also finishes its prepare + // This is for cases where multiple functions share the same b (weights) + // Therefore when a function marks original b as unused, we pre-mark it in weights manager, and mark it back to used so that it doesn't get released before its last reference + const ITensor *original_b = _impl->original_weights; + if (!original_b->is_used()) + { + _impl->weights_manager->pre_mark_as_unused(original_b); + } + _impl->original_weights->mark_as_used(); + _impl->weights_manager->release(_impl->original_weights); } - _impl->original_weights->mark_as_used(); - _impl->weights_manager->release(_impl->original_weights); } + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp index 871a1d6e27..bc66205af4 100644 --- a/src/runtime/CL/functions/CLGEMM.cpp +++ b/src/runtime/CL/functions/CLGEMM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2022, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -102,8 +102,8 @@ void CLGEMM::configure(const CLCompileContext &compile_context, _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_2, c}, {ACL_DST, output}}; _impl->prep_pack = {{ACL_SRC_1, _impl->b}}; - _impl->workspace_tensors = - manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace_tensors = manage_workspace( + _impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack, /* allocate_now */ false); } } @@ -131,6 +131,7 @@ void CLGEMM::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace_tensors); _impl->op->prepare(_impl->prep_pack); auto has_reshape = diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp index aef7cddd7a..5439129ab0 100644 --- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -109,9 +109,9 @@ void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_contex {TensorType::ACL_SRC_1, weights}, {TensorType::ACL_SRC_2, biases}, }; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->prep_pack, /* allocate_now */ false); } Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, @@ -139,6 +139,7 @@ void CLGEMMConvolutionLayer::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace_tensors); _impl->op->prepare(_impl->prep_pack); auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(), diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp index 8bad198658..d3a2e7dc09 100644 --- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021 Arm Limited. + * Copyright (c) 2017-2021, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -94,9 +94,9 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con } else { - _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, _impl->b}, {ACL_SRC_2, c}, {ACL_DST, output}}; - _impl->workspace_tensors = - manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, _impl->b}, {ACL_SRC_2, c}, {ACL_DST, output}}; + _impl->workspace_tensors = manage_workspace( + _impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack, /* allocate_now */ false); } } @@ -122,6 +122,7 @@ void CLGEMMLowpMatrixMultiplyCore::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace_tensors); _impl->op->prepare(_impl->run_pack); // Release temporary tensors that are only used in prepare stage diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp index 645f817030..bd7e23b980 100644 --- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp +++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2021, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -86,12 +86,12 @@ void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_co (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info, enable_fast_math); - _impl->run_pack = {{TensorType::ACL_SRC_0, _impl->src}, - {TensorType::ACL_SRC_1, _impl->weights}, - {TensorType::ACL_SRC_2, _impl->biases}, - {TensorType::ACL_DST, _impl->dst}}; - _impl->workspace_tensors = - manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, _impl->src}, + {TensorType::ACL_SRC_1, _impl->weights}, + {TensorType::ACL_SRC_2, _impl->biases}, + {TensorType::ACL_DST, _impl->dst}}; + _impl->workspace_tensors = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, + _impl->run_pack, /* allocate_now */ false); } Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, @@ -116,6 +116,7 @@ void CLWinogradConvolutionLayer::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->op->workspace(), _impl->workspace_tensors); _impl->op->prepare(_impl->run_pack); // Release Preparation tensors diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp index 8efebbbb1a..7107a6be7a 100644 --- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -51,6 +51,7 @@ struct NEConvolutionLayer::Impl WorkspaceData workspace{}; experimental::MemoryRequirements aux_mem_req{}; std::unique_ptr func{nullptr}; + bool is_prepared{false}; }; NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr memory_manager) : _impl(std::make_unique()) @@ -113,9 +114,10 @@ void NEConvolutionLayer::configure(ITensor *input, _impl->aux_mem_req = _impl->op->workspace(); _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; - _impl->workspace = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->prep_pack, /* allocate_now */ false); } + _impl->is_prepared = false; } Status NEConvolutionLayer::validate(const ITensorInfo *input, @@ -193,16 +195,22 @@ void NEConvolutionLayer::run() void NEConvolutionLayer::prepare() { - if (_impl->func) + if (!_impl->is_prepared) { - _impl->func->prepare(); - } - else - { - _impl->op->prepare(_impl->prep_pack); + if (_impl->func) + { + _impl->func->prepare(); + } + else + { + allocate_tensors(_impl->aux_mem_req, _impl->workspace); + _impl->op->prepare(_impl->prep_pack); + + // Release temporary tensors that are only used in prepare stage + release_temporaries(_impl->aux_mem_req, _impl->workspace); + } - // Release temporary tensors that are only used in prepare stage - release_temporaries(_impl->aux_mem_req, _impl->workspace); + _impl->is_prepared = true; } } } // namespace arm_compute diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp index 6c085645db..de291355ac 100644 --- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -244,6 +244,9 @@ void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure( _impl->is_nchw = input->info()->data_layout() == DataLayout::NCHW; _impl->is_prepared = !_impl->is_nchw; + _impl->permuted_input = {}; + _impl->permuted_weights = {}; + ITensor *input_to_use = input; const ITensor *weights_to_use = weights; ITensor *output_to_use = output; diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp index 2656d0fa0f..be451bcdeb 100644 --- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp +++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -91,8 +91,8 @@ void NEFullyConnectedLayer::configure(const ITensor *input, _impl->aux_mem_req = _impl->op->workspace(); _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; - _impl->workspace = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->run_pack, /* allocate_now */ false); _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights && !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights; @@ -135,6 +135,7 @@ void NEFullyConnectedLayer::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace); _impl->op->prepare(_impl->run_pack); // Release temporary tensors that are only used in prepare stage diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp index 934a8250cc..d26b819864 100644 --- a/src/runtime/NEON/functions/NEGEMM.cpp +++ b/src/runtime/NEON/functions/NEGEMM.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -92,8 +92,8 @@ void NEGEMM::configure(const ITensor *a, _impl->aux_mem_req = _impl->op->workspace(); _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_SRC_2, c}, {ACL_DST, d}}; _impl->prep_pack = {{ACL_SRC_1, b}, {ACL_SRC_2, c}}; - _impl->workspace = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->prep_pack, /* allocate_now */ false); } Status NEGEMM::validate(const ITensorInfo *a, @@ -139,6 +139,7 @@ void NEGEMM::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace); _impl->op->prepare(_impl->prep_pack); auto has_reshape = diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp index 6cca02eea9..b5cdd864ba 100644 --- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp +++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2021, 2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -68,8 +68,8 @@ void NEGEMMConv2d::configure( _impl->aux_mem_req = _impl->op->workspace(); _impl->run_pack = {{TensorType::ACL_SRC_0, input}, {TensorType::ACL_SRC_2, biases}, {TensorType::ACL_DST, output}}; _impl->prep_pack = {{TensorType::ACL_SRC_1, weights}, {TensorType::ACL_SRC_2, biases}}; - _impl->workspace = - manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace = manage_workspace(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, + _impl->prep_pack, /* allocate_now */ false); } Status NEGEMMConv2d::validate(const ITensorInfo *input, @@ -93,6 +93,7 @@ void NEGEMMConv2d::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace); _impl->op->prepare(_impl->prep_pack); auto has_reshape = diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp index be10121a56..03df5115f0 100644 --- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp @@ -69,18 +69,19 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, { ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); - _impl->weights = weights; - _impl->op = std::make_unique(); + _impl->is_prepared = false; + _impl->weights = weights; + _impl->op = std::make_unique(); _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups); - _impl->run_pack = {{TensorType::ACL_SRC_0, input}, - {TensorType::ACL_SRC_1, weights}, - {TensorType::ACL_SRC_2, biases}, - {TensorType::ACL_DST, output}}; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, input}, + {TensorType::ACL_SRC_1, weights}, + {TensorType::ACL_SRC_2, biases}, + {TensorType::ACL_DST, output}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->run_pack, /* allocate_now */ false); } Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, @@ -129,6 +130,7 @@ void NEGEMMConvolutionLayer::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace_tensors); _impl->op->prepare(_impl->run_pack); // Release temporary tensors that are only used in prepare stage diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp index b9cff8540d..6d172cef27 100644 --- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp +++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp @@ -73,18 +73,19 @@ void NEGEMMLowpMatrixMultiplyCore::configure( b_info_to_use->set_are_values_constant(false); } - _impl->b = b; - _impl->op = std::make_unique(); + _impl->is_prepared = false; + _impl->b = b; + _impl->op = std::make_unique(); _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? 
c->info() : nullptr), output->info(), gemm_info); - _impl->run_pack = {{TensorType::ACL_SRC_0, a}, - {TensorType::ACL_SRC_1, b}, - {TensorType::ACL_SRC_2, c}, - {TensorType::ACL_DST, output}}; - _impl->prep_pack = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}}; - _impl->aux_mem_req = _impl->op->workspace(); - _impl->workspace_tensors = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->run_pack = {{TensorType::ACL_SRC_0, a}, + {TensorType::ACL_SRC_1, b}, + {TensorType::ACL_SRC_2, c}, + {TensorType::ACL_DST, output}}; + _impl->prep_pack = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}}; + _impl->aux_mem_req = _impl->op->workspace(); + _impl->workspace_tensors = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->prep_pack, /* allocate_now */ false); } Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, @@ -142,6 +143,7 @@ void NEGEMMLowpMatrixMultiplyCore::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace_tensors); _impl->op->prepare(_impl->prep_pack); auto has_reshape = diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp index 7334be8456..b72aff577a 100644 --- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp +++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp @@ -69,6 +69,7 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ActivationLayerInfo &act_info, bool enable_fast_math) { + _impl->is_prepared = false; _impl->original_weights = weights; _impl->op = std::make_unique(); _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), @@ -77,8 +78,8 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, _impl->aux_mem_req = _impl->op->workspace(); _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}}; _impl->prep_pack = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}}; - _impl->workspace = - manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); + _impl->workspace = manage_workspace(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, + _impl->prep_pack, /* allocate_now */ false); } void NEWinogradConvolutionLayer::run() @@ -104,6 +105,7 @@ void NEWinogradConvolutionLayer::prepare() { if (!_impl->is_prepared) { + allocate_tensors(_impl->aux_mem_req, _impl->workspace); _impl->op->prepare(_impl->prep_pack); _impl->original_weights->mark_as_unused(); diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp index fe4dfdd474..55b25f9098 100644 --- a/src/runtime/OMP/OMPScheduler.cpp +++ b/src/runtime/OMP/OMPScheduler.cpp @@ -81,7 +81,13 @@ void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Win const Window &max_window = window; const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension()); - const unsigned int num_threads = std::min(num_iterations, _num_threads); + const unsigned int mws = kernel->get_mws(CPUInfo::get(), _num_threads); + + // Ensure each thread has mws amount of work to do (i.e. 
ceil(num_iterations / mws) threads) + const unsigned int candidate_num_threads = (num_iterations + mws - 1) / mws; + + // Cap the number of threads to be spawn with the size of the thread pool + const unsigned int num_threads = std::min(candidate_num_threads, _num_threads); if (!kernel->is_parallelisable() || num_threads == 1) { diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp index 372852bfea..b803f77522 100644 --- a/src/runtime/TensorAllocator.cpp +++ b/src/runtime/TensorAllocator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -151,6 +151,11 @@ void TensorAllocator::free() info().set_is_resizable(true); } +bool TensorAllocator::is_allocated() const +{ + return _memory.region() != nullptr; +} + Status TensorAllocator::import_memory(void *memory) { ARM_COMPUTE_RETURN_ERROR_ON(memory == nullptr); diff --git a/src/runtime/experimental/operators/CpuSoftmax.cpp b/src/runtime/experimental/operators/CpuSoftmax.cpp new file mode 100644 index 0000000000..8386fd36ef --- /dev/null +++ b/src/runtime/experimental/operators/CpuSoftmax.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2021, 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/runtime/experimental/operators/CpuSoftmax.h" + +#include "src/cpu/operators/CpuSoftmax.h" + +namespace arm_compute +{ +namespace experimental +{ +namespace op +{ + +struct CpuSoftmax::Impl +{ + std::unique_ptr op{nullptr}; +}; + +CpuSoftmax::CpuSoftmax() : impl_(std::make_unique()) +{ + impl_->op = std::make_unique(); +} + +CpuSoftmax::~CpuSoftmax() = default; + +void CpuSoftmax::configure(const ITensorInfo *src, ITensorInfo *dst, float beta, int32_t axis, bool is_log) +{ + impl_->op->configure(src, dst, beta, axis, is_log); +} + +Status CpuSoftmax::validate(const ITensorInfo *src, const ITensorInfo *dst, float beta, int32_t axis, bool is_log) +{ + return cpu::CpuSoftmaxGeneric::validate(src, dst, beta, axis, is_log); +} + +void CpuSoftmax::run(ITensorPack &tensor) +{ + impl_->op->run(tensor); +} + +experimental::MemoryRequirements CpuSoftmax::workspace() const +{ + return impl_->op->workspace(); +} + +void CpuSoftmax::prepare(ITensorPack &constants) +{ + ARM_COMPUTE_UNUSED(constants); +} + +} // namespace op +} // namespace experimental +} // namespace arm_compute diff --git a/support/Bfloat16.h b/support/Bfloat16.h index 02772898a8..7c5ef78848 100644 --- a/support/Bfloat16.h +++ b/support/Bfloat16.h @@ -31,6 +31,26 @@ namespace arm_compute { namespace { +/** Convert float to bfloat16 in a portable way that works on older hardware + * + * @param[in] v Floating-point value to convert to bfloat + * + * @return Converted value + */ +inline uint16_t portable_float_to_bf16(const float v) +{ + const uint32_t *fromptr = reinterpret_cast(&v); + uint16_t res = (*fromptr >> 16); + const uint16_t error = (*fromptr & 0x0000ffff); + uint16_t bf_l = res & 0x0001; + + if ((error > 0x8000) || ((error == 0x8000) && (bf_l != 0))) + { + res += 1; + } + return res; +} + /** Convert float to bfloat16 * * @param[in] v Floating-point value to convert to bfloat @@ -39,9 +59,9 @@ namespace */ inline uint16_t float_to_bf16(const float v) { - const uint32_t *fromptr = reinterpret_cast(&v); #if defined(ARM_COMPUTE_ENABLE_BF16) - uint16_t res; + const uint32_t *fromptr = reinterpret_cast(&v); + uint16_t res; __asm __volatile("ldr s0, [%[fromptr]]\n" ".inst 0x1e634000\n" // BFCVT h0, s0 @@ -49,16 +69,10 @@ inline uint16_t float_to_bf16(const float v) : : [fromptr] "r"(fromptr), [toptr] "r"(&res) : "v0", "memory"); + return res; #else /* defined(ARM_COMPUTE_ENABLE_BF16) */ - uint16_t res = (*fromptr >> 16); - const uint16_t error = (*fromptr & 0x0000ffff); - uint16_t bf_l = res & 0x0001; - if ((error > 0x8000) || ((error == 0x8000) && (bf_l != 0))) - { - res += 1; - } + return portable_float_to_bf16(v); #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ - return res; } /** Convert bfloat16 to float @@ -91,6 +105,15 @@ class bfloat16 final bfloat16(float v) : value(float_to_bf16(v)) { } + /** Constructor + * + * @param[in] v Floating-point value + * @param[in] portable bool to indicate the conversion is to be done in a backward compatible way + */ + bfloat16(float v, bool portable) : value(0) + { + value = portable ? portable_float_to_bf16(v) : float_to_bf16(v); + } /** Assignment operator * * @param[in] v Floating point value to assign diff --git a/tests/AssetsLibrary.cpp b/tests/AssetsLibrary.cpp index 571b55125b..5eb8179704 100644 --- a/tests/AssetsLibrary.cpp +++ b/tests/AssetsLibrary.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020, 2023 Arm Limited. + * Copyright (c) 2017-2020, 2023-2024 Arm Limited. 
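The portable_float_to_bf16() fallback above rounds the truncated mantissa to nearest, ties to even, rather than simply dropping the low 16 bits. The self-contained example below restates that rule with a hypothetical to_bf16() helper; the test values are chosen so the ties are exact.

#include <cassert>
#include <cstdint>
#include <cstring>

// Illustrative restatement of the rounding rule used by portable_float_to_bf16().
static uint16_t to_bf16(float v)
{
    uint32_t bits;
    std::memcpy(&bits, &v, sizeof(bits)); // same bit pattern, without pointer type-punning
    uint16_t res         = static_cast<uint16_t>(bits >> 16);
    const uint16_t error = static_cast<uint16_t>(bits & 0x0000ffffU);
    if ((error > 0x8000U) || ((error == 0x8000U) && ((res & 0x1U) != 0U)))
    {
        ++res; // round up above the halfway point, or on a tie when the kept mantissa is odd
    }
    return res;
}

int main()
{
    assert(to_bf16(1.0f) == 0x3f80);        // exact value, no rounding needed
    assert(to_bf16(1.00390625f) == 0x3f80); // halfway between 1.0 and 1.0078125 -> even neighbour 1.0
    assert(to_bf16(1.01171875f) == 0x3f82); // halfway between 1.0078125 and 1.015625 -> even neighbour
    return 0;
}

The uniform_real_distribution_16bit change further down selects this portable path (true /* portable */) when filling BFLOAT16 test tensors.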
* * SPDX-License-Identifier: MIT * @@ -267,6 +267,11 @@ std::random_device::result_type AssetsLibrary::seed() const return _seed; } +void AssetsLibrary::set_seed(std::random_device::result_type seed) +{ + _seed = seed; +} + void AssetsLibrary::fill(RawTensor &raw, const std::string &name, Format format) const { //FIXME: Should be done by swapping cached buffers diff --git a/tests/AssetsLibrary.h b/tests/AssetsLibrary.h index bd97cb7bd4..dedad5227f 100644 --- a/tests/AssetsLibrary.h +++ b/tests/AssetsLibrary.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2022 Arm Limited. + * Copyright (c) 2017-2022, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_TENSOR_LIBRARY_H -#define ARM_COMPUTE_TEST_TENSOR_LIBRARY_H +#ifndef ACL_TESTS_ASSETSLIBRARY_H +#define ACL_TESTS_ASSETSLIBRARY_H #include "arm_compute/core/Coordinates.h" #include "arm_compute/core/Error.h" @@ -76,6 +76,12 @@ class AssetsLibrary final */ std::string path() const; + /** Set the seed that is used to fill tensors with random values. + * + * @param[in] the initial random seed to set. + */ + void set_seed(std::random_device::result_type); + /** Seed that is used to fill tensors with random values. * * @return the initial random seed. @@ -588,7 +594,6 @@ void AssetsLibrary::fill_with_generator(T &&tensor, const GeneratorFunctionType< { const bool is_nhwc = tensor.data_layout() == DataLayout::NHWC; TensorShape shape(tensor.shape()); - if(is_nhwc) { // Ensure that the equivalent tensors will be filled for both data layouts @@ -739,6 +744,7 @@ void AssetsLibrary::fill_tensor_uniform(T &&tensor, std::random_device::result_t break; } case DataType::U16: + case DataType::QASYMM16: { std::uniform_int_distribution distribution_u16(std::numeric_limits::lowest(), std::numeric_limits::max()); fill(tensor, distribution_u16, seed_offset); @@ -778,7 +784,7 @@ void AssetsLibrary::fill_tensor_uniform(T &&tensor, std::random_device::result_t case DataType::BFLOAT16: { // It doesn't make sense to check [-inf, inf], so hard code it to a big number - arm_compute::utils::uniform_real_distribution_16bit distribution_bf16{ -1000.f, 1000.f }; + arm_compute::utils::uniform_real_distribution_16bit distribution_bf16{ -1000.f, 1000.f, true /* portable */ }; fill(tensor, distribution_bf16, seed_offset); break; } @@ -1057,4 +1063,4 @@ void AssetsLibrary::fill_tensor_value(T &&tensor, D value) const } } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_TENSOR_LIBRARY_H */ +#endif // ACL_TESTS_ASSETSLIBRARY_H diff --git a/tests/datasets/DatatypeDataset.h b/tests/datasets/DatatypeDataset.h index 4cce7bb375..b1928b5e1d 100644 --- a/tests/datasets/DatatypeDataset.h +++ b/tests/datasets/DatatypeDataset.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2020, 2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,10 +21,10 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#ifndef ARM_COMPUTE_TEST_DATATYPE_DATASET_H -#define ARM_COMPUTE_TEST_DATATYPE_DATASET_H +#ifndef ACL_TESTS_DATASETS_DATATYPEDATASET_H +#define ACL_TESTS_DATASETS_DATATYPEDATASET_H -#include "arm_compute/core/Types.h" +#include "arm_compute/core/CoreTypes.h" #include "tests/framework/datasets/ContainerDataset.h" #include @@ -35,6 +35,54 @@ namespace test { namespace datasets { +class AllDataTypes final : public framework::dataset::ContainerDataset> +{ +public: + AllDataTypes(const std::string &name) + : ContainerDataset(name, + { + DataType::QSYMM8, + DataType::QASYMM8, + DataType::QASYMM8_SIGNED, + DataType::QSYMM16, + DataType::U8, /**< unsigned 8-bit number */ + DataType::S8, /**< signed 8-bit number */ + DataType::QSYMM8_PER_CHANNEL, /**< quantized, symmetric per channel fixed-point 8-bit number */ + DataType::U16, /**< unsigned 16-bit number */ + DataType::S16, /**< signed 16-bit number */ + DataType::QSYMM16, /**< quantized, symmetric fixed-point 16-bit number */ + DataType::QASYMM16, /**< quantized, asymmetric fixed-point 16-bit number */ + DataType::U32, /**< unsigned 32-bit number */ + DataType::S32, /**< signed 32-bit number */ + DataType::U64, /**< unsigned 64-bit number */ + DataType::S64, /**< signed 64-bit number */ + DataType::BFLOAT16, /**< 16-bit brain floating-point number */ + DataType::F16, /**< 16-bit floating-point number */ + DataType::F32, /**< 32-bit floating-point number */ + DataType::F64, /**< 64-bit floating-point number */ + DataType::SIZET /**< size_t */ + }) + { + } +}; + +class CommonDataTypes final : public framework::dataset::ContainerDataset> +{ +public: + CommonDataTypes(const std::string &name) + : ContainerDataset(name, + { + DataType::QASYMM8, + DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, /**< quantized, symmetric per channel fixed-point 8-bit number */ + DataType::S32, /**< signed 32-bit number */ + DataType::BFLOAT16, /**< 16-bit brain floating-point number */ + DataType::F16, /**< 16-bit floating-point number */ + DataType::F32, /**< 32-bit floating-point number */ + }) + { + } +}; class QuantizedTypes final : public framework::dataset::ContainerDataset> { public: @@ -63,4 +111,4 @@ class QuantizedPerChannelTypes final : public framework::dataset::ContainerDatas } // namespace datasets } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_DATATYPE_DATASET_H */ +#endif // ACL_TESTS_DATASETS_DATATYPEDATASET_H diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h index d987f4f60b..4d419bbaf2 100644 --- a/tests/datasets/ShapeDatasets.h +++ b/tests/datasets/ShapeDatasets.h @@ -339,6 +339,30 @@ class SmallShapesBroadcast final : public framework::dataset::ZipDataset +{ +public: + SmallShapesNonXBroadcast() + : ZipDataset( + ShapeDataset("Shape0", + { + TensorShape{ 9U, 9U }, + TensorShape{ 128U, 1U, 5U, 3U }, + TensorShape{ 9U, 9U, 3U, 4U }, + TensorShape{ 1U, 16U, 10U, 2U, 128U } + }), + ShapeDataset("Shape1", + { + TensorShape{ 9U, 1U, 2U }, + TensorShape{ 128U, 64U, 1U, 3U }, + TensorShape{ 9U, 1U, 3U }, + TensorShape{ 1U, 1U, 1U, 1U, 128U } + })) + { + } +}; + class TemporaryLimitedSmallShapesBroadcast final : public framework::dataset::ZipDataset { public: diff --git a/tests/framework/Framework.cpp b/tests/framework/Framework.cpp index bfb955c525..755398b53d 100644 --- a/tests/framework/Framework.cpp +++ b/tests/framework/Framework.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -343,6 +343,10 @@ TestResult::Status Framework::run_test(const TestInfo &info, TestCaseFactory &te { profiler.start(); } + if (_prepare_function != nullptr) + { + _prepare_function(); + } test_case->do_run(); test_case->do_sync(); if(_num_iterations == 1 || i != 0) @@ -726,6 +730,16 @@ void Framework::set_new_fixture_call(bool val) { _new_fixture_call = val; } + +void Framework::set_prepare_function(const Framework::PrepareFunc &foo) +{ + _prepare_function = foo; +} + +void Framework::set_seed(unsigned int seed) +{ + _seed = seed; +} } // namespace framework } // namespace test } // namespace arm_compute diff --git a/tests/framework/Framework.h b/tests/framework/Framework.h index 2dded30038..1c5c9a89a1 100644 --- a/tests/framework/Framework.h +++ b/tests/framework/Framework.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,8 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_TEST_FRAMEWORK -#define ARM_COMPUTE_TEST_FRAMEWORK +#ifndef ACL_TESTS_FRAMEWORK_FRAMEWORK_H +#define ACL_TESTS_FRAMEWORK_FRAMEWORK_H #include "DatasetModes.h" #include "Exceptions.h" @@ -323,6 +323,31 @@ class Framework final */ void set_new_fixture_call(bool val); + /** Prepare functions is called before each test run. + * + * The difference between the prepare_function and on_setup() callback + * from TestCase is that the prepare_function is global for the framework, + * but the on_setup() is individual for each TestCase. + */ + using PrepareFunc = std::function; + + /** Set prepare function. + * + * The prepare_function is called before calling on_run() for each test case. + * The difference between the prepare function and on_setup() callback from + * TestCase is that the prepare function is global for the framework, but + * the on_setup() callback is individual for each TestCase. + * + * @param[in] prepare The prepare function. + */ + void set_prepare_function(const PrepareFunc &prepare); + + /** Set random seed reported by the framework. + * + * @param[in] seed Random seed reported by the framework. + */ + void set_seed(unsigned int seed); + private: Framework(); ~Framework() = default; @@ -360,6 +385,7 @@ class Framework final bool _new_fixture_call{ false }; bool _print_rerun_cmd{ false }; unsigned int _seed{ 0 }; + PrepareFunc _prepare_function{}; using create_function = std::unique_ptr(); std::map _available_instruments{}; @@ -391,4 +417,4 @@ inline void Framework::add_data_test_case(std::string test_name, DatasetMode mod } // namespace framework } // namespace test } // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_FRAMEWORK */ +#endif // ACL_TESTS_FRAMEWORK_FRAMEWORK_H diff --git a/tests/framework/Macros.h b/tests/framework/Macros.h index 09e01b0b0c..97389597f0 100644 --- a/tests/framework/Macros.h +++ b/tests/framework/Macros.h @@ -215,17 +215,31 @@ #define DISABLED_DATA_TEST_CASE(TEST_NAME, MODE, DATASET, ...) 
\ DATA_TEST_CASE_IMPL(TEST_NAME, MODE, arm_compute::test::framework::TestCaseFactory::Status::DISABLED, DATASET, __VA_ARGS__) +#define VALIDATION_FIXTURE_RUN() \ + void do_run() override \ + { \ + if (_iteration != 0) \ + { \ + do_setup(); \ + } \ + do_validate(); \ + ++_iteration; \ + } + #define FIXTURE_TEST_CASE_IMPL(TEST_NAME, FIXTURE, MODE, STATUS) \ class TEST_NAME : public arm_compute::test::framework::TestCase, public FIXTURE \ { \ public: \ TEST_CASE_CONSTRUCTOR(TEST_NAME) \ FIXTURE_SETUP(FIXTURE) \ - void do_run() override; \ + VALIDATION_FIXTURE_RUN() \ + void do_validate(); \ FIXTURE_TEARDOWN(FIXTURE) \ + private: \ + unsigned int _iteration {0}; \ }; \ TEST_REGISTRAR(TEST_NAME, MODE, STATUS); \ - void TEST_NAME::do_run() + void TEST_NAME::do_validate() #define FIXTURE_TEST_CASE(TEST_NAME, FIXTURE, MODE) \ FIXTURE_TEST_CASE_IMPL(TEST_NAME, FIXTURE, MODE, arm_compute::test::framework::TestCaseFactory::Status::ACTIVE) @@ -248,12 +262,15 @@ public: \ DATA_TEST_CASE_CONSTRUCTOR(TEST_NAME, DATASET) \ FIXTURE_DATA_SETUP(FIXTURE) \ - void do_run() override; \ + VALIDATION_FIXTURE_RUN() \ + void do_validate(); \ FIXTURE_TEARDOWN(FIXTURE) \ + private: \ + unsigned int _iteration {0}; \ }; \ DATA_TEST_REGISTRAR(TEST_NAME, MODE, STATUS, DATASET); \ template \ - void TEST_NAME>::do_run() + void TEST_NAME>::do_validate() #define FIXTURE_DATA_TEST_CASE(TEST_NAME, FIXTURE, MODE, DATASET) \ FIXTURE_DATA_TEST_CASE_IMPL(TEST_NAME, FIXTURE, MODE, arm_compute::test::framework::TestCaseFactory::Status::ACTIVE, DATASET) @@ -271,12 +288,15 @@ public: \ DATA_TEST_CASE_CONSTRUCTOR(TEST_NAME, DATASET) \ FIXTURE_DATA_SETUP_NEW(FIXTURE) \ - void do_run() override; \ + VALIDATION_FIXTURE_RUN() \ + void do_validate(); \ FIXTURE_TEARDOWN(FIXTURE) \ + private: \ + unsigned int _iteration {0}; \ }; \ DATA_TEST_REGISTRAR(TEST_NAME, MODE, STATUS, DATASET); \ template \ - void TEST_NAME>::do_run() + void TEST_NAME>::do_validate() #define FIXTURE_DATA_TEST_CASE_NEW(TEST_NAME, FIXTURE, MODE, DATASET) \ FIXTURE_DATA_TEST_CASE_NEW_IMPL(TEST_NAME, FIXTURE, MODE, arm_compute::test::framework::TestCaseFactory::Status::ACTIVE, DATASET) diff --git a/tests/main.cpp b/tests/main.cpp index e862c7627e..52301bf276 100644 --- a/tests/main.cpp +++ b/tests/main.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -118,8 +118,8 @@ int main(int argc, char **argv) filter_id->set_help("List of test ids. ... can be used to define a range."); auto stop_on_error = parser.add_option("stop-on-error"); stop_on_error->set_help("Stop execution after the first failed test (useful for debugging)"); - auto seed = parser.add_option>("seed", std::random_device()()); - seed->set_help("Global seed for random number generation"); + auto seed = parser.add_option>("seed"); + seed->set_help("Global seed for random number generation. When not set, each test iteration will use different random seed"); auto list_tests = parser.add_option("list-tests", false); list_tests->set_help("List all test names"); auto test_instruments = parser.add_option("test-instruments", false); @@ -220,13 +220,17 @@ int main(int argc, char **argv) } } + const std::random_device::result_type seed_value = (seed->is_set()) ? 
seed->value(): std::random_device()(); + const bool randomize_seeds = !seed->is_set() && (options.iterations->value() > 1); + if(options.log_level->value() >= framework::LogLevel::CONFIG) { for(auto &p : printers) { p->print_entry("Version", build_information()); p->print_entry("CommandLine", command_line(argc, argv)); - p->print_entry("Seed", support::cpp11::to_string(seed->value())); + auto seed_str = randomize_seeds ? "Dynamic" : support::cpp11::to_string(seed_value); + p->print_entry("Seed", seed_str); #ifdef ARM_COMPUTE_CL if(opencl_is_available()) { @@ -282,7 +286,7 @@ int main(int argc, char **argv) fconfig.cooldown_sec = cooldown_sec->value(); fconfig.configure_only = configure_only->value(); fconfig.print_rerun_cmd = print_rerun_command->value(); - fconfig.seed = seed->value(); + fconfig.seed = seed_value; framework.init(fconfig); for(auto &p : printers) @@ -292,6 +296,14 @@ int main(int argc, char **argv) framework.set_throw_errors(options.throw_errors->value()); framework.set_stop_on_error(stop_on_error->value()); framework.set_error_on_missing_assets(error_on_missing_assets->value()); + if (randomize_seeds) + { + framework.set_prepare_function([&] (){ + std::random_device::result_type seed = std::random_device()(); + library->set_seed(seed); + framework.set_seed(seed); + }); + } bool success = true; @@ -319,7 +331,7 @@ int main(int argc, char **argv) return 0; } - library = std::make_unique(assets->value(), seed->value()); + library = std::make_unique(assets->value(), seed_value); fixed_library = std::make_unique(assets->value(), fixed_seed); if(!parser.validate()) diff --git a/tests/validation/CL/Pooling3dLayer.cpp b/tests/validation/CL/Pooling3dLayer.cpp index 84d630e6cf..5f0c68c17b 100644 --- a/tests/validation/CL/Pooling3dLayer.cpp +++ b/tests/validation/CL/Pooling3dLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022 Arm Limited. + * Copyright (c) 2022, 2024 Arm Limited. 
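Taken together, the Framework, Macros.h and main.cpp changes above mean that when --seed is not supplied and more than one iteration is requested, each iteration re-runs the fixture's do_setup() and installs a freshly drawn seed through the framework's prepare function, so every pass exercises different random inputs. Consolidated, the wiring amounts to the sketch below; it relies on the surrounding main() (seed, options, framework, library) and is not standalone code.

const auto seed_value      = seed->is_set() ? seed->value() : std::random_device()();
const bool randomize_seeds = !seed->is_set() && (options.iterations->value() > 1);

if (randomize_seeds)
{
    framework.set_prepare_function([&]() {
        const std::random_device::result_type it_seed = std::random_device()();
        library->set_seed(it_seed);  // new random tensor fills for this iteration
        framework.set_seed(it_seed); // the reported seed matches the one actually used
    });
}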
* * SPDX-License-Identifier: MIT * @@ -77,6 +77,48 @@ constexpr AbsoluteTolerance tolerance_qasymm8(1); /**< Tolerance TEST_SUITE(CL) TEST_SUITE(Pooling3dLayer) +TEST_CASE(RoundToNearestInteger, framework::DatasetMode::ALL) +{ + const auto pool_info = Pooling3dLayerInfo(PoolingType::AVG, + Size3D(3,1,1), Size3D(1,1,1), Padding3D(), true /* exclude padding */); + + const auto shape = TensorShape(1U,3U,1U,1U); + const auto output_shape = TensorShape(1U,1U,1U,1U); + + const auto dtype = DataType::QASYMM8_SIGNED; + const auto layout = DataLayout::NDHWC; + const auto qinfo = QuantizationInfo(1.f, 0); + + CLTensor input = create_tensor(shape, dtype, 1, qinfo, layout); + CLTensor output = create_tensor(output_shape, dtype, 1, qinfo, layout); + + CLPooling3dLayer pool; + pool.configure(&input, &output, pool_info); + + input.allocator()->allocate(); + output.allocator()->allocate(); + + std::vector values = {-10, -10, -9}; + std::vector refs = {-10}; + + ARM_COMPUTE_EXPECT(values.size() == shape.total_size(), framework::LogLevel::ERRORS); + + library->fill_static_values(CLAccessor(input), values); + + pool.run(); + + output.map(true); + for(unsigned int i = 0; i < refs.size(); ++i) + { + const int8_t ref = refs[i]; + const int8_t target = reinterpret_cast(output.buffer())[i]; + + ARM_COMPUTE_EXPECT(ref == target, framework::LogLevel::ERRORS); + } + + output.unmap(); +} + // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( @@ -106,7 +148,7 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( TensorInfo(TensorShape(5U, 1U, 1U, 1U, 4U), 1, DataType::F32, DataLayout::NDHWC), TensorInfo(TensorShape(1U, 15U, 1U, 2U, 4U), 1, DataType::F32, DataLayout::NDHWC), // Output width larger than input TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U), 1, DataType::F32, DataLayout::NDHWC), - TensorInfo(TensorShape(5U, 6U, 6U, 2U, 2U), 1, DataType::F32, DataLayout::NDHWC), + TensorInfo(TensorShape(5U, 6U, 6U, 2U, 2U), 1, DataType::F32, DataLayout::NDHWC), TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U), 1, DataType::F32, DataLayout::NDHWC), TensorInfo(TensorShape(5U, 6U, 6U, 2U, 3U), 1, DataType::F32, DataLayout::NDHWC), })), diff --git a/tests/validation/CL/PoolingLayer.cpp b/tests/validation/CL/PoolingLayer.cpp index 9fe28c7acf..8aca7dd08b 100644 --- a/tests/validation/CL/PoolingLayer.cpp +++ b/tests/validation/CL/PoolingLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. 
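The RoundToNearestInteger case added above (and the NHWC/NCHW variants in PoolingLayer.cpp below) pins down the rounding behaviour of quantized average pooling: with scale 1 and zero offset, the mean of {-10, -10, -9} is -9.67, which must be written back as -10, whereas a truncating conversion toward zero would yield -9. The reference arithmetic, spelled out as a small standalone check:

#include <cassert>
#include <cmath>

int main()
{
    const float mean    = (-10.f + -10.f + -9.f) / 3.f;        // -9.666...
    const int rounded   = static_cast<int>(std::lround(mean)); // -10: the value the kernels must store
    const int truncated = static_cast<int>(mean);               // -9: what truncation toward zero gives
    assert(rounded == -10);
    assert(truncated == -9);
    return 0;
}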
* * SPDX-License-Identifier: MIT * @@ -85,11 +85,68 @@ const auto pool_data_layout_dataset = framework::datas const auto pool_fp_mixed_precision_dataset = framework::dataset::make("FpMixedPrecision", { true, false }); +void RoundToNearestIntegerPoolTestBody(const DataLayout layout, const TensorShape &shape, + const TensorShape &output_shape) +{ + const auto pool_info = PoolingLayerInfo(PoolingType::AVG, + Size2D(3,1), layout, PadStrideInfo(), true /* exclude padding */); + + const auto dtype = DataType::QASYMM8_SIGNED; + const auto qinfo = QuantizationInfo(1.f, 0); + + CLTensor input = create_tensor(shape, dtype, 1, qinfo, layout); + CLTensor output = create_tensor(output_shape, dtype, 1, qinfo, layout); + + CLPoolingLayer pool; + pool.configure(&input, &output, pool_info); + + input.allocator()->allocate(); + output.allocator()->allocate(); + + std::vector values = {-10, -10, -9}; + std::vector refs = {-10}; + + ARM_COMPUTE_EXPECT(values.size() == shape.total_size(), framework::LogLevel::ERRORS); + + library->fill_static_values(CLAccessor(input), values); + + pool.run(); + + output.map(true); + for(unsigned int i = 0; i < refs.size(); ++i) + { + const int8_t ref = refs[i]; + const int8_t target = reinterpret_cast(output.buffer())[i]; + + ARM_COMPUTE_EXPECT(ref == target, framework::LogLevel::ERRORS); + } + + output.unmap(); +} + } // namespace TEST_SUITE(CL) TEST_SUITE(PoolingLayer) +TEST_CASE(RoundToNearestIntegerNHWC, framework::DatasetMode::ALL) +{ + const auto layout = DataLayout::NHWC; + const auto shape = TensorShape(1U,3U,1U); + const auto output_shape = TensorShape(1U,1U,1U); + + RoundToNearestIntegerPoolTestBody(layout, shape, output_shape); +} + +TEST_CASE(RoundToNearestIntegerNCHW, framework::DatasetMode::ALL) +{ + const auto layout = DataLayout::NCHW; + const auto shape = TensorShape(3U,1U,1U); + const auto output_shape = TensorShape(1U,1U,1U); + + RoundToNearestIntegerPoolTestBody(layout, shape, output_shape); +} + // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( diff --git a/tests/validation/CL/Reverse.cpp b/tests/validation/CL/Reverse.cpp index 82effc2136..671eb94090 100644 --- a/tests/validation/CL/Reverse.cpp +++ b/tests/validation/CL/Reverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, 2023 Arm Limited. + * Copyright (c) 2018-2020, 2023-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -29,6 +29,7 @@ #include "tests/CL/CLAccessor.h" #include "tests/PaddingCalculator.h" #include "tests/datasets/ShapeDatasets.h" +#include "tests/datasets/DatatypeDataset.h" #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" @@ -47,10 +48,68 @@ namespace auto run_small_dataset = combine(datasets::Small3DShapes(), datasets::Tiny1DShapes()); auto run_large_dataset = combine(datasets::LargeShapes(), datasets::Tiny1DShapes()); +void validate_data_types(DataType input_dtype, DataType output_dtype, DataType axis_dtype) +{ + const auto input = TensorInfo(TensorShape(16U, 16U, 5U), 1, input_dtype); + const auto axis = TensorInfo(TensorShape(1U), 1, axis_dtype); + auto output = TensorInfo(TensorShape(16U, 16U, 5U), 1, output_dtype); + + const Status status = (CLReverse::validate(&input, &output, &axis, false /* use_inverted_axis */)); + const bool is_valid = static_cast(status); + + static const auto supported_dtypes = { + DataType::QSYMM8, + DataType::QASYMM8, + DataType::QASYMM8_SIGNED, + DataType::QSYMM16, + DataType::U8, + DataType::S8, + DataType::QSYMM8_PER_CHANNEL, + DataType::U16, + DataType::S16, + DataType::QSYMM16, + DataType::QASYMM16, + DataType::U32, + DataType::S32, + DataType::SIZET, + DataType::BFLOAT16, + DataType::F16, + DataType::F32, +#ifdef __aarch64__ + DataType::U64, + DataType::S64, + DataType::F64, +#endif // __aarch64__ + }; + + static std::vector> supports = {}; + for(DataType dtype : supported_dtypes) + { + supports.push_back(std::make_tuple(dtype, dtype, DataType::S32)); + supports.push_back(std::make_tuple(dtype, dtype, DataType::U32)); + } + + const auto config = std::make_tuple(input_dtype, output_dtype, axis_dtype); + const bool expected = (std::find(supports.begin(), supports.end(), config) != supports.end()); + + ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); +} + } // namespace TEST_SUITE(CL) TEST_SUITE(Reverse) +/// @note: Do not modify. Validating all data types is pretty fast. 
+DATA_TEST_CASE(ValidateAllDataTypes, framework::DatasetMode::ALL, + combine( + datasets::AllDataTypes("InputDataType"), + datasets::AllDataTypes("OutputDataType"), + datasets::AllDataTypes("AxisDataType")), + input_dtype, output_dtype, axis_dtype) +{ + validate_data_types(input_dtype, output_dtype, axis_dtype); +} + // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( @@ -90,6 +149,8 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( template using CLReverseFixture = ReverseValidationFixture; +/// @note: see tests/validation/NEON/Reverse.cpp for the Test Strategy + TEST_SUITE(Float) TEST_SUITE(F16) FIXTURE_DATA_TEST_CASE(RunSmall, @@ -119,7 +180,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, } TEST_SUITE_END() // F16 -TEST_SUITE(FP32) +TEST_SUITE(F32) FIXTURE_DATA_TEST_CASE(RunSmall, CLReverseFixture, framework::DatasetMode::PRECOMMIT, @@ -146,16 +207,150 @@ FIXTURE_DATA_TEST_CASE(RunLarge, validate(CLAccessor(_target), _reference); } TEST_SUITE_END() // F32 + +#ifdef __aarch64__ +TEST_SUITE(FP64) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + run_small_dataset, + make("DataType", DataType::F64), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // F64 +#endif // __aarch64__ + TEST_SUITE_END() // Float -TEST_SUITE(Quantized) -TEST_SUITE(QASYMM8) +TEST_SUITE(Integer) +TEST_SUITE(Int32) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::S32}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // Int32 + +#ifndef __x86_64__ +TEST_SUITE(SizeT) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::SIZET}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // SizeT +#endif // __x86_64__ + +TEST_SUITE(UInt32) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::U32}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // UInt32 + +#ifdef __aarch64__ +TEST_SUITE(Int64) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::S64}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // Int64 + +TEST_SUITE(UInt64) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::S64}), + 
make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // UInt64 +#endif // __aarch64__ + +TEST_SUITE(Int16) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::S16, DataType::QSYMM16}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // Int16 + +TEST_SUITE(UInt16) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::U16, DataType::QASYMM16}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // UInt16 + +TEST_SUITE(UInt8) FIXTURE_DATA_TEST_CASE(RunSmall, CLReverseFixture, framework::DatasetMode::PRECOMMIT, combine( run_small_dataset, - make("DataType", DataType::QASYMM8), + make("DataType", {DataType::QASYMM8, DataType::U8}), make("use_negative_axis", { true, false }), make("use_inverted_axis", { true, false }))) { @@ -175,8 +370,25 @@ FIXTURE_DATA_TEST_CASE(RunLarge, // Validate output validate(CLAccessor(_target), _reference); } -TEST_SUITE_END() // QASYMM8 -TEST_SUITE_END() // Quantized +TEST_SUITE_END() // UInt8 + +TEST_SUITE(Int8) +FIXTURE_DATA_TEST_CASE(RunSmall, + CLReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::QASYMM8_SIGNED, DataType::S8, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(CLAccessor(_target), _reference); +} +TEST_SUITE_END() // Int8 +TEST_SUITE_END() // Integer TEST_SUITE_END() // Reverse TEST_SUITE_END() // CL diff --git a/tests/validation/CMakeLists.txt b/tests/validation/CMakeLists.txt index c2b526817e..59cd4b0a88 100644 --- a/tests/validation/CMakeLists.txt +++ b/tests/validation/CMakeLists.txt @@ -152,7 +152,9 @@ if(ENABLE_NEON) runtime/experimental/operators/CpuGemmConv2d.cpp runtime/experimental/operators/CpuGemmDirectConv2d.cpp runtime/experimental/operators/CpuMul.cpp + runtime/experimental/operators/CpuSoftmax.cpp runtime/experimental/operators/CpuSub.cpp runtime/experimental/operators/CpuTranspose.cpp - runtime/experimental/operators/CpuWinogradConv2d.cpp) + runtime/experimental/operators/CpuWinogradConv2d.cpp + ) endif() diff --git a/tests/validation/CPP/LUT.cpp b/tests/validation/CPP/LUT.cpp new file mode 100644 index 0000000000..1874823d8d --- /dev/null +++ b/tests/validation/CPP/LUT.cpp @@ -0,0 +1,151 @@ +/* + * Copyright (c) 2024 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "tests/framework/Asserts.h" +#include "tests/framework/Macros.h" +#include "tests/validation/Validation.h" +#include "src/core/helpers/LUTManager.h" +#include "include/half/half.hpp" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +namespace +{ +#ifdef ARM_COMPUTE_ENABLE_FP16 + // Take fp16 value and output as uint16_t without changing bits. + inline uint16_t read_as_bf16(const float16_t tmp) + { + uint16_t out = 0; + memcpy(&out, &tmp, sizeof(tmp)); + return out; + } +#endif // ARM_COMPUTE_ENABLE_FP16 + + // Check if difference in values is within tolerance range + template + bool equal_values_relative(const U target, const U reference, const float tolerance) + { + if(are_equal_infs(target, reference)) + { + return true; + } + else if(target == reference) + { + return true; + } + else if(half_float::detail::builtin_isnan(target) && half_float::detail::builtin_isnan(reference)) // determine if nan values using existing function + { + return true; + } + + const U epsilon = (std::is_same::type>::value || (reference == 0)) ? static_cast(0.01) : static_cast(1e-05); + if(std::abs(static_cast(reference) - static_cast(target)) <= epsilon) + { + return true; + } + else + { + if(static_cast(reference) == 0.0f) + { + return false; // We have checked whether _reference and _target is close. If _reference is 0 but not close to _target, it should return false + } + const double relative_change = std::abs((static_cast(target) - static_cast(reference)) / reference); + return relative_change <= static_cast(tolerance); + } + } +} // namespace + +TEST_SUITE(LUTManager) +#ifdef ARM_COMPUTE_ENABLE_FP16 +TEST_SUITE(BF16) +TEST_CASE(LUTValueTest, framework::DatasetMode::ALL) +{ + // Define values for test + constexpr float beta = 1.0f; + constexpr float rel_tolerance = 0.01f; + constexpr int num_elements = 65536; + unsigned int num_mismatches = 0; + + // Create lutinfo, use to get lut + LUTInfo info = {LUTType::Exponential, beta, DataType::BFLOAT16, UniformQuantizationInfo()}; + LUTManager lman = LUTManager::get_instance(); + + if(CPUInfo::get().has_fp16()) + { + // Retrieve lut, Assert lut exists and is retrieved successfully. + std::shared_ptr lut = lman.get_lut_table(info); + ARM_COMPUTE_EXPECT(lut != nullptr, framework::LogLevel::ALL); + + // Check each value in lut + for(int i=0; i < num_elements; i++) + { + // Calculate reference in fp32. 
Convert lut value to fp32. + const float fref = std::exp(bf16_to_float(i) * beta * -1); + const uint16_t target_bf16 = read_as_bf16((*lut)[i]); + const float target = bf16_to_float(target_bf16); + + // Compare and increment mismatch count if needed. + if(!equal_values_relative(target, fref, rel_tolerance)) + { + ARM_COMPUTE_TEST_INFO("id = " << i); + ARM_COMPUTE_TEST_INFO("target = " << std::setprecision(5) << framework::make_printable(target)); + ARM_COMPUTE_TEST_INFO("reference = " << std::setprecision(5) << framework::make_printable(fref)); + ARM_COMPUTE_TEST_INFO("relative tolerance = " << std::setprecision(5) << framework::make_printable(rel_tolerance)); + framework::ARM_COMPUTE_PRINT_INFO(); + ++num_mismatches; + } + } + + if(num_mismatches != 0) + { + const float percent_mismatches = static_cast(num_mismatches) / num_elements * 100.f; + ARM_COMPUTE_TEST_INFO(num_mismatches << " values (" << std::fixed << std::setprecision(2) << percent_mismatches << "%) mismatched "); + } + + // Check if passed tests + ARM_COMPUTE_EXPECT(num_mismatches == 0, framework::LogLevel::ERRORS); + } +} + +TEST_CASE(CheckLutReuse, framework::DatasetMode::ALL) +{ + LUTInfo info = {LUTType::Exponential, 1.0f, DataType::BFLOAT16, UniformQuantizationInfo()}; + LUTManager lman = LUTManager::get_instance(); + auto first = lman.get_lut_table(info); + auto second = lman.get_lut_table(info); + ARM_COMPUTE_EXPECT(first == second, framework::LogLevel::ERRORS); +} + + +TEST_SUITE_END() // BF16 +#endif // ARM_COMPUTE_ENABLE_FP16 + +TEST_SUITE_END() // LUTManager + +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/Helpers.cpp b/tests/validation/Helpers.cpp index 560460fd33..d9c0418f35 100644 --- a/tests/validation/Helpers.cpp +++ b/tests/validation/Helpers.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,6 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
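The LUTValueTest above walks all 65536 bfloat16 bit patterns and checks each table entry against an fp32 reference of exp(-beta * x). The sketch below spells out that reference computation; bf16_bits_to_float and expected_lut_entry are illustrative helpers, not library functions.

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Decode a bfloat16 bit pattern into the float it represents:
// bf16 is the top half of an IEEE-754 binary32 value.
static float bf16_bits_to_float(uint16_t bits)
{
    const uint32_t f32_bits = static_cast<uint32_t>(bits) << 16;
    float out;
    std::memcpy(&out, &f32_bits, sizeof(out));
    return out;
}

// Value the exponential LUT is expected to approximate at index i,
// before conversion back to the table's 16-bit storage format.
static float expected_lut_entry(uint16_t i, float beta)
{
    return std::exp(-beta * bf16_bits_to_float(i));
}

int main()
{
    // Index 0x3f80 encodes 1.0f, so with beta = 1 the entry should be close to exp(-1) = 0.3679.
    std::printf("%f\n", expected_lut_entry(0x3f80, 1.0f));
    return 0;
}

CheckLutReuse then confirms that repeated get_lut_table() calls with the same LUTInfo return one shared table rather than rebuilding it.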
*/ +#include "arm_compute/core/CPP/CPPTypes.h" + #include "tests/validation/Helpers.h" #include "tests/framework/Asserts.h" @@ -571,6 +573,40 @@ QuantizationHint suggest_mac_dst_q_info_and_bias( return { c_q_info, min_bias, max_bias }; } +template +bool config_has_dtype(const std::initializer_list &types) +{ + bool dtype_exists = false; + for(DataType type : types) + { + dtype_exists |= (type == data_type); + } + return dtype_exists; +} + +bool cpu_supports_dtypes(const std::initializer_list &types) +{ + const bool cpu_has_bf16 = CPUInfo::get().has_bf16(); + const bool cpu_has_fp16 = CPUInfo::get().has_fp16(); + const bool config_has_fp16 = config_has_dtype(types); + const bool config_has_bf16 = config_has_dtype(types); + +#ifndef ARM_COMPUTE_ENABLE_FP16 + const bool fp16_enabled = false; +#else // ARM_COMPUTE_ENABLE_FP16 + const bool fp16_enabled = true; +#endif // ARM_COMPUTE_ENABLE_FP16 + +#ifndef ARM_COMPUTE_ENABLE_BF16 + const bool bf16_enabled = false; +#else // ARM_COMPUTE_ENABLE_BF16 + const bool bf16_enabled = true; +#endif // ARM_COMPUTE_ENABLE_BF16 + + return !(config_has_fp16 && (!cpu_has_fp16 || !fp16_enabled)) && + !(config_has_bf16 && (!cpu_has_bf16 || !bf16_enabled)); +} + template void get_tile(const SimpleTensor &in, SimpleTensor &roi, const Coordinates &coord); template void get_tile(const SimpleTensor &in, SimpleTensor &roi, const Coordinates &coord); template void get_tile(const SimpleTensor &in, SimpleTensor &roi, const Coordinates &coord); diff --git a/tests/validation/Helpers.h b/tests/validation/Helpers.h index e044620556..7bdbf5a855 100644 --- a/tests/validation/Helpers.h +++ b/tests/validation/Helpers.h @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -308,6 +309,14 @@ QuantizationHint suggest_mac_dst_q_info_and_bias(const QuantizationInfo &lhs_q_i DataType data_type, float bias_fraction, int num_sd = 2); + +/** Check if Cpu supports the vectoral operations for the data types in the parameters + * + * @param[in] types an initializeer list that contain data types + * + * @return true if the current cpu supports the vectoral operations for the data types + */ +bool cpu_supports_dtypes(const std::initializer_list &types); } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp index 51a2cecb78..119d51808a 100644 --- a/tests/validation/NEON/ActivationLayer.cpp +++ b/tests/validation/NEON/ActivationLayer.cpp @@ -285,48 +285,6 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); } -DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, concat(concat( - combine(framework::dataset::make("CpuExt", std::string("NEON")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - DataType::QASYMM8, - DataType::QASYMM8_SIGNED, - DataType::QSYMM16 - })), - combine(framework::dataset::make("CpuExt", std::string("SVE")), - framework::dataset::make("DataType", { DataType::F32, - DataType::F16, - }))), - combine(framework::dataset::make("CpuExt", std::string("SVE2")), - framework::dataset::make("DataType", { DataType::QASYMM8, - DataType::QASYMM8_SIGNED, - DataType::QSYMM16 - }))), - cpu_ext, data_type) -{ - using namespace cpu::kernels; - - cpuinfo::CpuIsaInfo cpu_isa{}; - cpu_isa.neon = (cpu_ext == "NEON"); - cpu_isa.sve = (cpu_ext == "SVE"); - cpu_isa.sve2 = (cpu_ext == "SVE2"); - cpu_isa.fp16 = (data_type == DataType::F16); - - 
const auto *selected_impl = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{data_type, CPUModel::GENERIC, cpu_isa,ActivationLayerInfo::ActivationFunction::BOUNDED_RELU}, cpu::KernelSelectionType::Preferred); - - ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); - std::string expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_activation"; - if( data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED) - { -#ifdef __aarch64__ - expected = "neon_q8_activation_lut"; -#else // __aarch64__ - expected = lower_string(cpu_ext) + "_" + cpu_impl_dt(data_type) + "_activation"; -#endif // __aarch64__ - } - std::string actual = selected_impl->name; - ARM_COMPUTE_EXPECT_EQUAL(expected, actual, framework::LogLevel::ERRORS); -} // clang-format on // *INDENT-ON* diff --git a/tests/validation/NEON/ArithmeticAddition.cpp b/tests/validation/NEON/ArithmeticAddition.cpp index 7a7aa52041..c0033daab0 100644 --- a/tests/validation/NEON/ArithmeticAddition.cpp +++ b/tests/validation/NEON/ArithmeticAddition.cpp @@ -44,12 +44,14 @@ namespace test { namespace validation { + +using framework::dataset::make; namespace { #if !defined(__aarch64__) || defined(ENABLE_SVE) constexpr AbsoluteTolerance tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ #else // !defined(__aarch64__) || defined(ENABLE_SVE) -constexpr AbsoluteTolerance tolerance_quant(0); +constexpr AbsoluteTolerance tolerance_quant(1); #endif // !defined(__aarch64__) || defined(ENABLE_SVE) const auto InPlaceDataSet = framework::dataset::make("InPlace", { false, true }); const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false }); @@ -125,7 +127,7 @@ DATA_TEST_CASE(KernelSelection, framework::DatasetMode::ALL, concat(concat( cpu_isa.sve2 = (cpu_ext == "SVE2"); cpu_isa.fp16 = (data_type == DataType::F16); - const auto *selected_impl = CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{data_type, cpu_isa, can_use_fixedpoint}, cpu::KernelSelectionType::Preferred); + const auto *selected_impl = CpuAddKernel::get_implementation(CpuAddKernelDataTypeISASelectorData{data_type, cpu_isa, can_use_fixedpoint, false /* can_use_sme2_impl */ }, cpu::KernelSelectionType::Preferred); ARM_COMPUTE_ERROR_ON_NULLPTR(selected_impl); @@ -298,12 +300,43 @@ TEST_SUITE(QASYMM8_SIGNED) FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticAdditionQuantizedFixture, framework::DatasetMode::ALL, - combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), - framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })), - framework::dataset::make("Src0QInfo", { QuantizationInfo(0.5f, 20) })), - framework::dataset::make("Src1QInfo", { QuantizationInfo(0.5f, 10) })), - framework::dataset::make("OutQInfo", { QuantizationInfo(0.5f, 5) })), - OutOfPlaceDataSet)) + combine(datasets::SmallShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + make("ConvertPolicy", { ConvertPolicy::SATURATE }), + make("Src0QInfo", { QuantizationInfo(0.45f, 20) }), + make("Src1QInfo", { QuantizationInfo(0.55f, 10) }), + make("OutQInfo", { QuantizationInfo(0.5f, 5) }), + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_quant); +} + +FIXTURE_DATA_TEST_CASE(RunSmall5d, + NEArithmeticAdditionQuantizedFixture, + framework::DatasetMode::ALL, + combine(datasets::Tiny5dShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + 
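// Why the quantized-addition cases above now use tolerance_quant(1) on every
// build, including aarch64 without SVE: the operand scales were changed to 0.45
// and 0.55 while the destination keeps scale 0.5, so the exact real-valued sum
// generally does not fall on the output grid and the kernel's fixed-point path
// may land one step away from the float reference. A small worked example with
// illustrative raw values (not taken from the dataset):
//   src0_q = 22  ->  (22 - 20) * 0.45 = 0.90
//   src1_q = 13  ->  (13 - 10) * 0.55 = 1.65
//   sum = 2.55   ->  2.55 / 0.5 + 5   = 10.1
// The reference rounds 10.1 to 10, while intermediate fixed-point rounding in
// the kernel can legitimately produce the neighbouring value, hence an absolute
// tolerance of one LSB.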
make("ConvertPolicy", { ConvertPolicy::SATURATE }), + make("Src0QInfo", { QuantizationInfo(0.45f, 20) }), + make("Src1QInfo", { QuantizationInfo(0.55f, 10) }), + make("OutQInfo", { QuantizationInfo(0.5f, 5) }), + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_quant); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, + NEArithmeticAdditionQuantizedFixture, + framework::DatasetMode::NIGHTLY, + combine(datasets::LargeShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + make("ConvertPolicy", { ConvertPolicy::SATURATE }), + make("Src0QInfo", { QuantizationInfo(0.45f, 20) }), + make("Src1QInfo", { QuantizationInfo(0.55f, 10) }), + make("OutQInfo", { QuantizationInfo(0.5f, 5) }), + OutOfPlaceDataSet)) { // Validate output validate(Accessor(_target), _reference, tolerance_quant); @@ -312,8 +345,8 @@ FIXTURE_DATA_TEST_CASE(RunSmall, FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticAdditionQuantizedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine( datasets::SmallShapesBroadcast(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })), - framework::dataset::make("Src0QInfo", { QuantizationInfo(0.5f, 20) })), - framework::dataset::make("Src1QInfo", { QuantizationInfo(0.5f, 10) })), + framework::dataset::make("Src0QInfo", { QuantizationInfo(0.45f, 20) })), + framework::dataset::make("Src1QInfo", { QuantizationInfo(0.55f, 10) })), framework::dataset::make("OutQInfo", { QuantizationInfo(0.5f, 5) })), OutOfPlaceDataSet)) { diff --git a/tests/validation/NEON/ArithmeticSubtraction.cpp b/tests/validation/NEON/ArithmeticSubtraction.cpp index 9a6032cd9e..cbc99ba78c 100644 --- a/tests/validation/NEON/ArithmeticSubtraction.cpp +++ b/tests/validation/NEON/ArithmeticSubtraction.cpp @@ -25,6 +25,7 @@ #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h" #include "arm_compute/runtime/Tensor.h" #include "arm_compute/runtime/TensorAllocator.h" + #include "tests/NEON/Accessor.h" #include "tests/PaddingCalculator.h" #include "tests/datasets/ConvertPolicyDataset.h" @@ -32,8 +33,10 @@ #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" +#include "tests/validation/Helpers.h" #include "tests/validation/Validation.h" #include "tests/validation/fixtures/ArithmeticOperationsFixture.h" +#include "tests/datasets/DatatypeDataset.h" namespace arm_compute { @@ -41,6 +44,9 @@ namespace test { namespace validation { + +using framework::dataset::make; + namespace { #ifdef __aarch64__ @@ -60,11 +66,47 @@ const auto ArithmeticSubtractionQuantizationInfoSignedDataset = combine(combine( const auto ArithmeticSubtractionQuantizationInfoSignedInPlaceDataset = combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(0.8f, 10) }), framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(0.8f, 10) })), framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(0.8f, 10) })); -const auto ArithmeticSubtractionQuantizationInfoSymmetric = combine(combine(framework::dataset::make("QuantizationInfoIn1", { QuantizationInfo(0.3f, 0) }), - framework::dataset::make("QuantizationInfoIn2", { QuantizationInfo(0.7f, 0) })), - framework::dataset::make("QuantizationInfoOut", { QuantizationInfo(0.2f, 0) })); +const auto ArithmeticSubtractionQuantizationInfo16bitSymmetric = + combine( + make("QuantizationInfoIn1", { QuantizationInfo(0.003f, 0) }), + 
make("QuantizationInfoIn2", { QuantizationInfo(0.007f, 0) }), + make("QuantizationInfoOut", { QuantizationInfo(0.2f, 0), + QuantizationInfo(0.002f, 0) /* for more saturation */ }) + ); + const auto InPlaceDataSet = framework::dataset::make("InPlace", { false, true }); const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false }); + +void validate_data_types(DataType input1_dtype, DataType input2_dtype, DataType output_dtype) +{ + const auto input1 = TensorInfo(TensorShape(27U, 13U, 2U), 1, input1_dtype); + const auto input2 = TensorInfo(TensorShape(27U, 13U, 2U), 1, input2_dtype); + auto output = TensorInfo(TensorShape(27U, 13U, 2U), 1, output_dtype); + + const bool is_valid = static_cast(NEArithmeticSubtraction::validate(&input1, &input2, &output, + ConvertPolicy::SATURATE)); + + const auto supports = { + std::make_tuple(DataType::F32,DataType::F32,DataType::F32), + std::make_tuple(DataType::F16,DataType::F16,DataType::F16), + std::make_tuple(DataType::U8,DataType::U8,DataType::U8), + std::make_tuple(DataType::S16,DataType::S16,DataType::S16), + std::make_tuple(DataType::S32,DataType::S32,DataType::S32), + std::make_tuple(DataType::QSYMM16,DataType::QSYMM16,DataType::QSYMM16), + std::make_tuple(DataType::QASYMM8,DataType::QASYMM8,DataType::QASYMM8), + std::make_tuple(DataType::QASYMM8_SIGNED,DataType::QASYMM8_SIGNED,DataType::QASYMM8_SIGNED) + }; + const auto config = std::make_tuple(input1_dtype, input2_dtype, output_dtype); + const std::initializer_list dtypes_list = {input1_dtype, input2_dtype, output_dtype}; + + bool expected = false; + if(cpu_supports_dtypes(dtypes_list)) + { + expected = (std::find(supports.begin(), supports.end(), config) != supports.end()); + } + ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); +} + } // namespace TEST_SUITE(NEON) @@ -72,6 +114,8 @@ TEST_SUITE(ArithmeticSubtraction) template using NEArithmeticSubtractionFixture = ArithmeticSubtractionValidationFixture; +template +using NEArithmeticSubtractionBroadcastFixture = ArithmeticSubtractionBroadcastValidationFixture; // *INDENT-OFF* // clang-format off @@ -160,6 +204,18 @@ TEST_CASE(InvalidBroadcastBoth, framework::DatasetMode::ALL) } TEST_SUITE_END() // InPlaceValidate +/// @note: Do not modify. Validating all data types is pretty fast. 
+DATA_TEST_CASE(ValidateAllDataTypes, framework::DatasetMode::ALL, + combine( + datasets::AllDataTypes("Input1DataType"), + datasets::AllDataTypes("Input2DataType"), + datasets::AllDataTypes("OutputDataType")), + input1_dtype, input2_dtype, output_dtype) +{ + validate_data_types(input1_dtype, input2_dtype, output_dtype); +} + + TEST_SUITE(U8) FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::U8)), @@ -169,13 +225,25 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture, framew // Validate output validate(Accessor(_target), _reference); } +FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionBroadcastFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SmallShapesBroadcast(), + make("DataType", DataType::U8), + make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP }), + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference); +} TEST_SUITE_END() // U8 using NEArithmeticSubtractionQASYMM8Fixture = ArithmeticSubtractionValidationQuantizedFixture; using NEArithmeticSubtractionQASYMM8SignedFixture = ArithmeticSubtractionValidationQuantizedFixture; -using NEArithmeticSubtractionQASYMM8SignedBroadcastFixture = ArithmeticSubtractionValidationQuantizedBroadcastFixture; using NEArithmeticSubtractionQSYMM16Fixture = ArithmeticSubtractionValidationQuantizedFixture; +template +using NEArithmeticSubtractionQuantizedBroadcastFixture = ArithmeticSubtractionValidationQuantizedBroadcastFixture; + TEST_SUITE(Quantized) TEST_SUITE(QASYMM8) FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", @@ -187,6 +255,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8Fixture, framewor // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } +FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQuantizedBroadcastFixture, framework::DatasetMode::ALL, + combine( + datasets::SmallShapesBroadcast(), + make("DataType", DataType::QASYMM8), + make("ConvertPolicy", { ConvertPolicy::SATURATE }), + ArithmeticSubtractionQuantizationInfoDataset, + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8); +} TEST_SUITE_END() // QASYMM8 TEST_SUITE(QASYMM8_SIGNED) @@ -199,7 +278,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8SignedFixture, fr // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQASYMM8SignedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine( +FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQuantizedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine( datasets::SmallShapesBroadcast(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })), @@ -209,7 +288,7 @@ FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQASYMM8SignedBr // Validate output validate(Accessor(_target), _reference, tolerance_qasymm8); } -FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEArithmeticSubtractionQASYMM8SignedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine( 
+FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEArithmeticSubtractionQuantizedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(combine( datasets::TinyShapesBroadcastInplace(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })), @@ -222,12 +301,24 @@ FIXTURE_DATA_TEST_CASE(RunTinyBroadcastInPlace, NEArithmeticSubtractionQASYMM8Si TEST_SUITE_END() // QASYMM8_SIGNED TEST_SUITE(QSYMM16) -FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQSYMM16Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine( +FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQSYMM16Fixture, framework::DatasetMode::ALL, + combine( datasets::SmallShapes(), - framework::dataset::make("DataType", DataType::QSYMM16)), - framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })), - ArithmeticSubtractionQuantizationInfoSymmetric), - OutOfPlaceDataSet)) + make("DataType", DataType::QSYMM16), + make("ConvertPolicy", { ConvertPolicy::SATURATE }), + ArithmeticSubtractionQuantizationInfo16bitSymmetric, + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qsymm16); +} +FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQuantizedBroadcastFixture, framework::DatasetMode::ALL, + combine( + datasets::SmallShapesBroadcast(), + make("DataType", DataType::QSYMM16), + make("ConvertPolicy", { ConvertPolicy::SATURATE }), + ArithmeticSubtractionQuantizationInfo16bitSymmetric, + OutOfPlaceDataSet)) { // Validate output validate(Accessor(_target), _reference, tolerance_qsymm16); @@ -245,6 +336,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture, framew validate(Accessor(_target), _reference); } +FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionBroadcastFixture, framework::DatasetMode::PRECOMMIT, + combine(datasets::SmallShapesBroadcast(), + make("DataType", + DataType::S16), + make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP }), + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference); +} + FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::S16)), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })), @@ -265,6 +367,17 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture, framew validate(Accessor(_target), _reference); } +FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionBroadcastFixture, framework::DatasetMode::PRECOMMIT, + combine( + datasets::SmallShapesBroadcast(), + make("DataType", DataType::S32), + make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP }), + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference); +} + FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::S32)), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })), @@ -316,9 +429,6 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEArithmeticSubtractionFixture, framewor validate(Accessor(_target), _reference); } -template -using NEArithmeticSubtractionBroadcastFixture = ArithmeticSubtractionBroadcastValidationFixture; - FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, 
NEArithmeticSubtractionBroadcastFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallShapesBroadcast(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE, ConvertPolicy::WRAP })), diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp index 2d948f3e32..4f4e0e5a6c 100644 --- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp +++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp @@ -35,6 +35,10 @@ #include "tests/framework/datasets/Datasets.h" #include "tests/validation/Validation.h" #include "tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h" +#include "tests/datasets/DatatypeDataset.h" + +#include +#include namespace arm_compute { @@ -206,6 +210,67 @@ DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); } +void validate_data_types(DataType input_dtype, DataType weight_dtype, DataType bias_dtype, DataType output_dtype) +{ + const int depth_multiplier = 1; + const auto dilation = Size2D(1U, 1U); + const auto conv_info = PadStrideInfo(1, 1, 0, 0); + + const auto input = TensorInfo(TensorShape(27U, 13U, 2U), 1, input_dtype); + std::vector scales(input.tensor_shape().z() * depth_multiplier); + + const auto weights = TensorInfo(TensorShape(3U, 3U, 2U), 1, weight_dtype, QuantizationInfo(scales)); + const auto bias = TensorInfo(TensorShape(2U), 1, bias_dtype); + auto output = TensorInfo(TensorShape(25U, 11U, 2U), 1, output_dtype); + + + bool is_valid = bool(NEDepthwiseConvolutionLayer::validate(&input.clone()->set_is_resizable(false), &weights.clone()->set_is_resizable(false), &bias.clone()->set_is_resizable(false), &output.clone()->set_is_resizable(false), + conv_info, depth_multiplier, ActivationLayerInfo(), dilation)); + + const auto supports = { + std::make_tuple(DataType::F32,DataType::F32,DataType::F32,DataType::F32), + std::make_tuple(DataType::F16,DataType::F16,DataType::F16,DataType::F16), + std::make_tuple(DataType::QASYMM8,DataType::QASYMM8,DataType::S32,DataType::QASYMM8), + std::make_tuple(DataType::QASYMM8,DataType::QSYMM8_PER_CHANNEL,DataType::S32,DataType::QASYMM8), + std::make_tuple(DataType::QASYMM8_SIGNED,DataType::QASYMM8_SIGNED,DataType::S32,DataType::QASYMM8_SIGNED), + std::make_tuple(DataType::QASYMM8_SIGNED,DataType::QSYMM8_PER_CHANNEL,DataType::S32,DataType::QASYMM8_SIGNED), + }; + + const auto config = std::make_tuple(input_dtype, weight_dtype, bias_dtype, output_dtype); + const std::initializer_list dtypes_list = {input_dtype, weight_dtype, bias_dtype, output_dtype}; + + bool expected = false; + if(cpu_supports_dtypes(dtypes_list)) + { + expected = (std::find(supports.begin(), supports.end(), config) != supports.end()); + } + + ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); +} + +/// @note: Do not modify. Validating all data types is pretty fast. 
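// In validate_data_types() above, the element type of `scales` has been lost
// from the patch text; it is presumably std::vector<float>, sized to one entry
// per output channel (input channels x depth_multiplier), which is the form of
// quantization information that QSYMM8_PER_CHANNEL weights expect. A hedged
// sketch of the intended construction, with the template argument restored:
//
//   const int depth_multiplier = 1;
//   // One dummy scale per output channel so per-channel configurations pass
//   // the quantization checks inside validate().
//   std::vector<float> scales(2 * depth_multiplier);
//   const auto weights = TensorInfo(TensorShape(3U, 3U, 2U), 1,
//                                   DataType::QSYMM8_PER_CHANNEL, QuantizationInfo(scales));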
+DATA_TEST_CASE(ValidateAllDataTypes, framework::DatasetMode::NIGHTLY, + combine( + datasets::AllDataTypes("InputDataType"), + datasets::AllDataTypes("WeightDataType"), + datasets::AllDataTypes("BiasDataType"), + datasets::AllDataTypes("OutputDataType")), + input_dtype, weight_dtype, bias_dtype, output_dtype) +{ + validate_data_types(input_dtype, weight_dtype, bias_dtype, output_dtype); +} + +DATA_TEST_CASE(ValidateCommonDataTypes, framework::DatasetMode::PRECOMMIT, + combine( + datasets::CommonDataTypes("InputDataType"), + datasets::CommonDataTypes("WeightDataType"), + datasets::CommonDataTypes("BiasDataType"), + datasets::CommonDataTypes("OutputDataType")), + input_dtype, weight_dtype, bias_dtype, output_dtype) +{ + validate_data_types(input_dtype, weight_dtype, bias_dtype, output_dtype); +} + DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(zip( make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Mismatching data type input/weights TensorInfo(TensorShape(27U, 13U, 3U), 1, DataType::F32), // Mismatching input feature maps diff --git a/tests/validation/NEON/LogSoftmaxLayer.cpp b/tests/validation/NEON/LogSoftmaxLayer.cpp index 6718597c6b..4d983d5763 100644 --- a/tests/validation/NEON/LogSoftmaxLayer.cpp +++ b/tests/validation/NEON/LogSoftmaxLayer.cpp @@ -40,6 +40,9 @@ namespace test { namespace validation { + +using framework::dataset::make; + namespace { /** Tolerance for float operations */ @@ -48,6 +51,7 @@ RelativeTolerance tolerance_f16(half(0.2)); /** Tolerance for quantized operations */ constexpr AbsoluteTolerance tolerance_qasymm8(1); +constexpr AbsoluteTolerance tolerance_qasymm8_signed(1); /** CNN data types */ const auto CNNDataTypes = framework::dataset::make("DataType", @@ -180,6 +184,39 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NELogSoftmaxLayerQuantizedFixture, fra validate(Accessor(_target), _reference, tolerance_qasymm8); } TEST_SUITE_END() //QASYMM8 + +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(RunSmall2D, NELogSoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine(datasets::SoftmaxLayerSmallShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + combine(make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f })), + make("Axis", { 0, 1 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8_signed); +} +FIXTURE_DATA_TEST_CASE(RunSmall4D, NELogSoftmaxLayerQuantizedFixture, framework::DatasetMode::ALL, + combine(datasets::Small4DShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + combine(make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f })), + make("Axis", { 0, -1, 1 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8_signed); +} +FIXTURE_DATA_TEST_CASE(RunLarge, NELogSoftmaxLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine(datasets::SoftmaxLayerLargeShapes(), + make("DataType", DataType::QASYMM8_SIGNED), + combine(make("QuantizationInfo", { QuantizationInfo(0.5f, -10) }), + make("Beta", { 1.0f, 2.f })), + make("Axis", { 0 }))) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qasymm8_signed); +} +TEST_SUITE_END() // QASYMM8_SIGNED TEST_SUITE_END() //Quantized TEST_SUITE_END() //LogSoftmaxLayer diff --git a/tests/validation/NEON/PixelWiseMultiplication.cpp b/tests/validation/NEON/PixelWiseMultiplication.cpp index f93bafcff6..f14100fbb5 100644 --- a/tests/validation/NEON/PixelWiseMultiplication.cpp +++ 
b/tests/validation/NEON/PixelWiseMultiplication.cpp @@ -30,6 +30,11 @@ #include "tests/framework/Macros.h" #include "tests/validation/Validation.h" #include "tests/validation/fixtures/PixelWiseMultiplicationFixture.h" +#include "tests/datasets/DatatypeDataset.h" +#include "tests/validation/Helpers.h" + +#include +#include namespace arm_compute { @@ -37,6 +42,9 @@ namespace test { namespace validation { + +using framework::dataset::make; + namespace { const float scale_unity = 1.f; @@ -87,7 +95,6 @@ const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false }); #define VALIDATE(TYPE, TOLERANCE) validate(Accessor(_target), _reference, AbsoluteTolerance(TOLERANCE), 0.f); #define WRAP_VALIDATE(TYPE, TOLERANCE) validate_wrap(Accessor(_target), _reference, AbsoluteTolerance(TOLERANCE), 0.f); -// *INDENT-OFF* // clang-format off #define PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(TEST_NAME, FIXTURE, MODE, SHAPES, DT1, DT2, DT3, SCALE, RP, INPLACE_DATASET, VALIDATE) \ FIXTURE_DATA_TEST_CASE(TEST_NAME, NEPixelWiseMultiplication##FIXTURE, framework::DatasetMode::MODE, \ @@ -114,8 +121,43 @@ const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", { false }); } \ } -// *INDENT-ON* // clang-format on + +void validate_data_types(DataType input1_dtype, DataType input2_dtype, DataType output_dtype) +{ + const auto input1 = TensorInfo(TensorShape(27U, 13U, 2U), 1, input1_dtype); + const auto input2 = TensorInfo(TensorShape(27U, 13U, 2U), 1, input2_dtype); + auto output = TensorInfo(TensorShape(27U, 13U, 2U), 1, output_dtype); + + bool is_valid = static_cast(NEPixelWiseMultiplication::validate(&input1, &input2, &output, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO)); + + const auto supports = { + std::make_tuple(DataType::F32,DataType::F32,DataType::F32), + std::make_tuple(DataType::F16,DataType::F16,DataType::F16), + std::make_tuple(DataType::U8,DataType::U8,DataType::U8), + std::make_tuple(DataType::U8,DataType::U8,DataType::S16), + std::make_tuple(DataType::U8,DataType::S16,DataType::S16), + std::make_tuple(DataType::S16,DataType::U8,DataType::S16), + std::make_tuple(DataType::S16,DataType::S16,DataType::S16), + std::make_tuple(DataType::S32,DataType::S32,DataType::S32), + std::make_tuple(DataType::QSYMM16,DataType::QSYMM16,DataType::QSYMM16), + std::make_tuple(DataType::QSYMM16,DataType::QSYMM16,DataType::S32), + std::make_tuple(DataType::QASYMM8,DataType::QASYMM8,DataType::QASYMM8), + std::make_tuple(DataType::QASYMM8_SIGNED,DataType::QASYMM8_SIGNED,DataType::QASYMM8_SIGNED) + }; + + const auto config = std::make_tuple(input1_dtype, input2_dtype, output_dtype); + const std::initializer_list dtypes_list = {input1_dtype, input2_dtype, output_dtype}; + + bool expected = false; + if(cpu_supports_dtypes(dtypes_list)) + { + expected = (std::find(supports.begin(), supports.end(), config) != supports.end()); + } + + ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); +} + } // namespace using NEPixelWiseMultiplicationQASYMM8Fixture = PixelWiseMultiplicationValidationQuantizedFixture; @@ -137,11 +179,14 @@ template using NEPixelWiseMultiplicationBroadcastFixture = PixelWiseMultiplicationBroadcastValidationFixture; using NEPixelWiseMultiplicationBroadcastQASYMM8Fixture = PixelWiseMultiplicationBroadcastValidationQuantizedFixture; using NEPixelWiseMultiplicationBroadcastQASYMM8SignedFixture = PixelWiseMultiplicationBroadcastValidationQuantizedFixture; +using NEPixelWiseMultiplicationBroadcastQSYMM16Fixture = 
PixelWiseMultiplicationBroadcastValidationQuantizedFixture; +using NEPixelWiseMultiplicationBroadcastQSYMM16ToS32Fixture = PixelWiseMultiplicationBroadcastValidationQuantizedFixture; +using NEPixelWiseMultiplicationBroadcastU8U8ToS16Fixture = PixelWiseMultiplicationBroadcastValidationFixture; +using NEPixelWiseMultiplicationBroadcastToS16Fixture = PixelWiseMultiplicationBroadcastValidationFixture; TEST_SUITE(NEON) TEST_SUITE(PixelWiseMultiplication) -// *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( framework::dataset::make("Input1Info", { TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::U8), //1 Ok @@ -227,7 +272,17 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip( ARM_COMPUTE_EXPECT(has_error == expected, framework::LogLevel::ERRORS); } // clang-format on -// *INDENT-ON* + +/// @note: Do not modify. Validating all data types is pretty fast. +DATA_TEST_CASE(ValidateAllDataTypes, framework::DatasetMode::ALL, + combine( + datasets::AllDataTypes("Input1DataType"), + datasets::AllDataTypes("Input2DataType"), + datasets::AllDataTypes("OutputDataType")), + input1_dtype, input2_dtype, output_dtype) +{ + validate_data_types(input1_dtype, input2_dtype, output_dtype); +} TEST_SUITE(InPlaceValidate) TEST_CASE(SingleTensor, framework::DatasetMode::ALL) @@ -455,7 +510,24 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16Fixture, framew // Validate output validate(Accessor(_target), _reference, tolerance_qsymm16); } + TEST_SUITE_END() // ScaleOther +TEST_SUITE(NonXBroadcast) +FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationBroadcastQSYMM16Fixture, + framework::DatasetMode::ALL, + combine(datasets::SmallShapesNonXBroadcast(), + make("DataTypeIn1", DataType::QSYMM16), + make("DataTypeIn2", DataType::QSYMM16), + make("DataTypeOut", DataType::QSYMM16), + make("Scale", { scale_unity }), + PixelWiseMultiplicationPolicySTZDataset, + PixelWiseMultiplicationQSYMM16QuantDataset, + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference, tolerance_qsymm16); +} +TEST_SUITE_END() // NonXBroadcast TEST_SUITE_END() // QSYMM16 TEST_SUITE(QSYMM16toS32) FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16ToS32Fixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), @@ -470,6 +542,21 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationQSYMM16ToS32Fixture, f // Validate output validate(Accessor(_target), _reference); } +TEST_SUITE(NonXBroadcast) +FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationBroadcastQSYMM16ToS32Fixture, framework::DatasetMode::ALL, + combine(datasets::SmallShapesNonXBroadcast(), + make("DataTypeIn1", DataType::QSYMM16), + make("DataTypeIn2", DataType::QSYMM16), + make("DataTypeOut", DataType::S32), + make("Scale", { scale_unity }), + PixelWiseMultiplicationPolicySTZDataset, + PixelWiseMultiplicationQSYMM16QuantDataset, + OutOfPlaceDataSet)) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // NonXBroadcast TEST_SUITE_END() // QSYMM16toS32 TEST_SUITE_END() // Quantized @@ -488,6 +575,22 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationU8U8ToS16Fixture, fram validate_wrap(Accessor(_target), _reference, AbsoluteTolerance(1), 0.f); } +TEST_SUITE(NonXBroadcast) +FIXTURE_DATA_TEST_CASE(RunSmall, NEPixelWiseMultiplicationBroadcastU8U8ToS16Fixture, framework::DatasetMode::PRECOMMIT, + 
combine(datasets::SmallShapesNonXBroadcast(), + framework::dataset::make("DataTypeIn1", DataType::U8), + framework::dataset::make("DataTypeIn2", DataType::U8), + framework::dataset::make("DataTypeOut", DataType::S16), + framework::dataset::make("Scale", { scale_255 }), + datasets::ConvertPolicies(), + framework::dataset::make("RoundingPolicy", RoundingPolicy::TO_NEAREST_UP), + OutOfPlaceDataSet)) +{ + // Validate output + validate_wrap(Accessor(_target), _reference, AbsoluteTolerance(1), 0.f); +} +TEST_SUITE_END() // NonXBroadcast + FIXTURE_DATA_TEST_CASE(RunSmall1, NEPixelWiseMultiplicationU8U8ToS16Fixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataTypeIn1", DataType::U8)), framework::dataset::make("DataTypeIn2", DataType::U8)), @@ -511,6 +614,9 @@ TEST_SUITE_END() // Scale255 TEST_SUITE(ScaleUnity) PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToU8Fixture, ALL, SmallShapes(), U8, U8, U8, scale_unity, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE) +TEST_SUITE(NonXBroadcast) +PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, BroadcastFixture, ALL, SmallShapesNonXBroadcast(), U8, U8, U8, scale_unity, TO_ZERO, OutOfPlaceDataSet, DEFAULT_VALIDATE) +TEST_SUITE_END() // NonXBroadcast TEST_SUITE_END() // ScaleUnity TEST_SUITE(ScaleOther) @@ -529,6 +635,11 @@ TEST_SUITE_END() // Scale255 TEST_SUITE(ScaleUnity) PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture, ALL, SmallShapes(), U8, S16, S16, scale_unity, TO_ZERO, OutOfPlaceDataSet, DEFAULT_VALIDATE) + +TEST_SUITE(NonXBroadcast) +PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, BroadcastToS16Fixture, ALL, SmallShapesNonXBroadcast(), U8, S16, S16, scale_unity, TO_ZERO, OutOfPlaceDataSet, + DEFAULT_VALIDATE) +TEST_SUITE_END() // NonXBroadcast TEST_SUITE_END() // ScaleUnity TEST_SUITE(ScaleOther) @@ -546,6 +657,10 @@ TEST_SUITE_END() // Scale255 TEST_SUITE(ScaleUnity) PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, ToS16Fixture, ALL, SmallShapes(), S16, S16, S16, scale_unity, TO_ZERO, InPlaceDataSet, DEFAULT_VALIDATE) +TEST_SUITE(NonXBroadcast) +PIXEL_WISE_MULTIPLICATION_FIXTURE_DATA_TEST_CASE(RunSmall, BroadcastFixture, ALL, SmallShapesNonXBroadcast(), S16, S16, S16, scale_unity, TO_ZERO, OutOfPlaceDataSet, DEFAULT_VALIDATE) +TEST_SUITE_END() // NonXBroadcast + TEST_SUITE_END() // ScaleUnity TEST_SUITE(ScaleOther) diff --git a/tests/validation/NEON/Pooling3dLayer.cpp b/tests/validation/NEON/Pooling3dLayer.cpp index 1b30023ca5..461f82da25 100644 --- a/tests/validation/NEON/Pooling3dLayer.cpp +++ b/tests/validation/NEON/Pooling3dLayer.cpp @@ -97,6 +97,40 @@ const auto qasymm8_signed_out_qinfo_dataset = framework::dataset::make("OutputQu TEST_SUITE(NEON) TEST_SUITE(Pooling3dLayer) +TEST_CASE(SimpleIntegerAvgPooling, framework::DatasetMode::ALL) +{ + const auto pool_info = Pooling3dLayerInfo(PoolingType::AVG, + Size3D(1,1,1), Size3D(1,1,1), Padding3D(), true /* exclude padding */); + const auto shape = TensorShape(18U,1U,1U,1U); // > 16 for channel dim. 
to stress vector and leftover loops + const auto dtype = DataType::QASYMM8_SIGNED; + const auto layout = DataLayout::NDHWC; + const auto qinfo = QuantizationInfo(1.f, 0); + + Tensor input = create_tensor(shape, dtype, 1, qinfo, layout); + Tensor output = create_tensor(shape, dtype, 1, qinfo, layout); + + NEPooling3dLayer pool; + pool.configure(&input, &output, pool_info); + + input.allocator()->allocate(); + output.allocator()->allocate(); + + std::vector values = {-9, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8}; + + ARM_COMPUTE_EXPECT(values.size() == shape.x(), framework::LogLevel::ERRORS); + + library->fill_static_values(Accessor(input), values); + + pool.run(); + for(unsigned int i = 0; i < values.size(); ++i) + { + const int8_t ref = values[i]; + const int8_t target = reinterpret_cast(output.buffer())[i]; + ARM_COMPUTE_EXPECT(ref == target, framework::LogLevel::ERRORS); + } +} + // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( diff --git a/tests/validation/NEON/PoolingLayer.cpp b/tests/validation/NEON/PoolingLayer.cpp index f635a63bbe..50118e8831 100644 --- a/tests/validation/NEON/PoolingLayer.cpp +++ b/tests/validation/NEON/PoolingLayer.cpp @@ -160,6 +160,40 @@ const auto PoolingLayerIndicesDatasetFPSmall = combine(combine(combine(framework const auto PoolingLayerKernelIndicesDatasetFPSmall = combine(combine(combine(framework::dataset::make("PoolType", { PoolingType::MAX }), framework::dataset::make("PoolingSize", { Size2D(2, 2), Size2D(3, 3), Size2D(7, 7) })), framework::dataset::make("PadStride", { PadStrideInfo(1, 1, 0, 0), PadStrideInfo(2, 1, 0, 0), PadStrideInfo(1, 1, 1, 1) })), framework::dataset::make("ExcludePadding", { false })); + +TEST_CASE(SimpleIntegerAvgPooling, framework::DatasetMode::ALL) +{ + const auto pool_info = PoolingLayerInfo(PoolingType::AVG, Size2D(1,1), DataLayout::NHWC); + const auto shape = TensorShape(18U,1U,1U); // > 16 for channel dim. 
to stress vector and leftover loops + const auto dtype = DataType::QASYMM8_SIGNED; + const auto layout = DataLayout::NHWC; + const auto qinfo = QuantizationInfo(1.f, 0); + + Tensor input = create_tensor(shape, dtype, 1, qinfo, layout); + Tensor output = create_tensor(shape, dtype, 1, qinfo, layout); + + NEPoolingLayer pool; + pool.configure(&input, &output, pool_info); + + input.allocator()->allocate(); + output.allocator()->allocate(); + + std::vector values = {-9, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 1, 2, 3, 4, 5, 6, 7, 8}; + + ARM_COMPUTE_EXPECT(values.size() == shape.x(), framework::LogLevel::ERRORS); + + library->fill_static_values(Accessor(input), values); + pool.run(); + + for(unsigned int i = 0; i < values.size(); ++i) + { + const int8_t ref = values[i]; + const int8_t target = reinterpret_cast(output.buffer())[i]; + ARM_COMPUTE_EXPECT(ref == target, framework::LogLevel::ERRORS); + } +} + TEST_SUITE(Float) TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunIndices, NEPoolingLayerIndicesFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(datasets::SmallNoneUnitShapes(), diff --git a/tests/validation/NEON/Reverse.cpp b/tests/validation/NEON/Reverse.cpp index 7d99bd614d..d390ed0e23 100644 --- a/tests/validation/NEON/Reverse.cpp +++ b/tests/validation/NEON/Reverse.cpp @@ -25,15 +25,15 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/NEON/functions/NEReverse.h" #include "arm_compute/runtime/Tensor.h" -#include "arm_compute/runtime/TensorAllocator.h" #include "tests/NEON/Accessor.h" -#include "tests/PaddingCalculator.h" #include "tests/datasets/ShapeDatasets.h" +#include "tests/datasets/DatatypeDataset.h" #include "tests/framework/Asserts.h" #include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" #include "tests/validation/Validation.h" #include "tests/validation/fixtures/ReverseFixture.h" +#include "tests/validation/Helpers.h" namespace arm_compute { @@ -44,13 +44,66 @@ namespace validation namespace { using framework::dataset::make; + auto run_small_dataset = combine(datasets::Small3DShapes(), datasets::Tiny1DShapes()); auto run_large_dataset = combine(datasets::LargeShapes(), datasets::Tiny1DShapes()); +void validate_data_types(DataType input_dtype, DataType output_dtype, DataType axis_dtype) +{ + const auto input = TensorInfo(TensorShape(16U, 16U, 5U), 1, input_dtype); + const auto axis = TensorInfo(TensorShape(1U), 1, axis_dtype); + auto output = TensorInfo(TensorShape(16U, 16U, 5U), 1, output_dtype); + + const Status status = (NEReverse::validate(&input, &output, &axis, false /* use_inverted_axis */)); + const bool is_valid = static_cast(status); + + static const auto supported_dtypes = { + DataType::QSYMM8, + DataType::QASYMM8, + DataType::QASYMM8_SIGNED, + DataType::QSYMM16, + DataType::U8, + DataType::S8, + DataType::QSYMM8_PER_CHANNEL, + DataType::U16, + DataType::S16, + DataType::QSYMM16, + DataType::QASYMM16, + DataType::U32, + DataType::S32, + DataType::BFLOAT16, + DataType::F16, + DataType::F32 + }; + + static std::vector> supports = {}; + for(DataType dtype : supported_dtypes) + { + supports.push_back(std::make_tuple(dtype, dtype, DataType::S32)); + supports.push_back(std::make_tuple(dtype, dtype, DataType::U32)); + } + + const auto config = std::make_tuple(input_dtype, output_dtype, axis_dtype); + const bool expected = (std::find(supports.begin(), supports.end(), config) != supports.end()); + + ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); +} + } // namespace TEST_SUITE(NEON) 
TEST_SUITE(Reverse) +/// @note: Do not modify. Validating all data types is pretty fast. +DATA_TEST_CASE(ValidateAllDataTypes, framework::DatasetMode::ALL, + combine( + datasets::AllDataTypes("InputDataType"), + datasets::AllDataTypes("OutputDataType"), + datasets::AllDataTypes("AxisDataType")), + input_dtype, output_dtype, axis_dtype) +{ + validate_data_types(input_dtype, output_dtype, axis_dtype); +} + // *INDENT-OFF* // clang-format off DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( @@ -92,29 +145,26 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip( template using NEReverseFixture = ReverseValidationFixture; -TEST_SUITE(Float) +/// @note: Test Strategy -- +/// The operator uses uint8_t, uint16_t and uint32_t under the hood depending +/// on the size of the input data type. Therefore, we do not extensively test +/// all the data types here. fp32/16 and qasymm8 has been thoroughly tested with +/// multiple shapes and configuration. Other data types are just smoke tested +/// with a very limited set of configurations, just to make sure they function +/// correctly. -#ifdef ARM_COMPUTE_ENABLE_FP16 +TEST_SUITE(Float) TEST_SUITE(F16) FIXTURE_DATA_TEST_CASE(RunSmall, NEReverseFixture, framework::DatasetMode::PRECOMMIT, combine( run_small_dataset, - make("DataType", DataType::F16), + make("DataType", {DataType::F16, DataType::BFLOAT16}), make("use_negative_axis", { true, false }), make("use_inverted_axis", { true, false }))) { - if(CPUInfo::get().has_fp16()) - { - // Validate output - validate(Accessor(_target), _reference); - } - else - { - ARM_COMPUTE_TEST_INFO("Device does not support fp16 vector operations. Test SKIPPED."); - framework::ARM_COMPUTE_PRINT_INFO(); - } + validate(Accessor(_target), _reference); } FIXTURE_DATA_TEST_CASE(RunLarge, @@ -126,21 +176,11 @@ FIXTURE_DATA_TEST_CASE(RunLarge, make("use_negative_axis", { true, false }), make("use_inverted_axis", { true, false }))) { - if(CPUInfo::get().has_fp16()) - { - // Validate output - validate(Accessor(_target), _reference); - } - else - { - ARM_COMPUTE_TEST_INFO("Device does not support fp16 vector operations. 
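// The test strategy note above relies on the kernel treating tensors as opaque
// 8/16/32-bit lanes: reversing never interprets the values, so one
// implementation per element size covers every data type of that width. A
// hypothetical sketch of that dispatch (the helper name run_reverse and the
// exact switch are illustrative, not the kernel's actual code):
//
//   switch(src->info()->element_size())
//   {
//       case 1: run_reverse<uint8_t>(src, dst, axis);  break; // U8, S8, QASYMM8, QASYMM8_SIGNED, ...
//       case 2: run_reverse<uint16_t>(src, dst, axis); break; // U16, S16, QSYMM16, F16, BFLOAT16, ...
//       case 4: run_reverse<uint32_t>(src, dst, axis); break; // U32, S32, F32
//       default: ARM_COMPUTE_ERROR("Element size not supported");
//   }
//
// This is why fp32/fp16 and QASYMM8 keep full coverage while the remaining
// types only need the smoke tests added below.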
Test SKIPPED."); - framework::ARM_COMPUTE_PRINT_INFO(); - } + validate(Accessor(_target), _reference); } TEST_SUITE_END() // F16 -#endif /* ARM_COMPUTE_ENABLE_FP16 */ -TEST_SUITE(FP32) +TEST_SUITE(F32) FIXTURE_DATA_TEST_CASE(RunSmall, NEReverseFixture, framework::DatasetMode::PRECOMMIT, @@ -169,14 +209,78 @@ FIXTURE_DATA_TEST_CASE(RunLarge, TEST_SUITE_END() // F32 TEST_SUITE_END() // Float -TEST_SUITE(Quantized) -TEST_SUITE(QASYMM8) +TEST_SUITE(Integer) +TEST_SUITE(Int32) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::S32}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // Int32 + +TEST_SUITE(UInt32) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::U32}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // UInt32 + +TEST_SUITE(Int16) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::S16, DataType::QSYMM16}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // Int16 + +TEST_SUITE(UInt16) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::U16, DataType::QASYMM16}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // UInt16 + +TEST_SUITE(UInt8) FIXTURE_DATA_TEST_CASE(RunSmall, NEReverseFixture, framework::DatasetMode::PRECOMMIT, combine( run_small_dataset, - make("DataType", DataType::QASYMM8), + make("DataType", {DataType::QASYMM8, DataType::U8}), make("use_negative_axis", { true, false }), make("use_inverted_axis", { true, false }))) { @@ -196,8 +300,25 @@ FIXTURE_DATA_TEST_CASE(RunLarge, // Validate output validate(Accessor(_target), _reference); } -TEST_SUITE_END() // QASYMM8 -TEST_SUITE_END() // Quantized +TEST_SUITE_END() // UInt8 + +TEST_SUITE(Int8) +FIXTURE_DATA_TEST_CASE(RunSmall, + NEReverseFixture, + framework::DatasetMode::PRECOMMIT, + combine( + make("InOutShape", TensorShape(18U, 5U, 5U)), + make("AxisShape", TensorShape(2U)), + make("DataType", {DataType::QASYMM8_SIGNED, DataType::S8, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL}), + make("use_negative_axis", { false }), + make("use_inverted_axis", { false }))) +{ + // Validate output + validate(Accessor(_target), _reference); +} +TEST_SUITE_END() // Int8 +TEST_SUITE_END() // Integer TEST_SUITE_END() // Reverse TEST_SUITE_END() // Neon diff --git a/tests/validation/fixtures/CpuSoftmaxFixture.h b/tests/validation/fixtures/CpuSoftmaxFixture.h new file mode 100644 index 0000000000..82938405b7 --- /dev/null +++ b/tests/validation/fixtures/CpuSoftmaxFixture.h @@ -0,0 +1,143 @@ +/* + * 
Copyright (c) 2017-2021, 2023-2024 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ACL_TESTS_VALIDATION_FIXTURES_CPUSOFTMAXFIXTURE_H +#define ACL_TESTS_VALIDATION_FIXTURES_CPUSOFTMAXFIXTURE_H + +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/Tensor.h" +#include "tests/AssetsLibrary.h" +#include "tests/Globals.h" +#include "tests/IAccessor.h" +#include "tests/framework/Asserts.h" +#include "tests/framework/Fixture.h" +#include "tests/validation/reference/SoftmaxLayer.h" + +#include + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +template +class CpuSoftmaxValidationFixture : public framework::Fixture +{ +public: + void setup(TensorShape shape, DataType data_type, float beta, size_t axis) + { + if(std::is_same::value && // Cpu + data_type == DataType::F16 && !CPUInfo::get().has_fp16()) + { + return; + } + + quantization_info_ = QuantizationInfo(); + + reference_ = compute_reference(shape, data_type, quantization_info_, beta, axis); + target_ = compute_target(shape, data_type, quantization_info_, beta, axis); + } + +protected: + template + void fill(U &&tensor) + { + if(tensor.data_type() == DataType::F32) + { + std::uniform_real_distribution distribution(-10.0f, 10.0f); + library->fill(tensor, distribution, 0); + } + else if(tensor.data_type() == DataType::F16) + { + arm_compute::utils::uniform_real_distribution_16bit distribution{ -10.0f, 10.0f }; + library->fill(tensor, distribution, 0); + } + else if(!is_data_type_quantized(tensor.data_type())) + { + std::uniform_int_distribution<> distribution(0, 100); + library->fill(tensor, distribution, 0); + } + else + { + library->fill_tensor_uniform(tensor, 0); + } + } + + TensorType compute_target(const TensorShape &shape, DataType data_type, + QuantizationInfo quantization_info, float beta, int32_t axis) + { + // Create tensors + TensorType src = create_tensor(shape, data_type, 1, quantization_info); + TensorType dst = create_tensor(shape, data_type, 1, get_softmax_output_quantization_info(data_type, IS_LOG)); + + // Create and configure function + FunctionType softmax; + softmax.configure(src.info(), dst.info(), beta, axis); + + ARM_COMPUTE_ASSERT(src.info()->is_resizable()); + ARM_COMPUTE_ASSERT(dst.info()->is_resizable()); + + // Allocate tensors + src.allocator()->allocate(); + dst.allocator()->allocate(); + + ARM_COMPUTE_ASSERT(!src.info()->is_resizable()); + 
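// What follows is the run contract of the experimental (stateless) operators:
// the operator was configured on bare ITensorInfo objects only, so at run time
// the caller packs the actual tensors into an ITensorPack under the
// ACL_SRC_*/ACL_DST slots, asks manage_workspace() to allocate whatever
// auxiliary memory the operator reported via workspace(), and then calls
// run(pack). Keeping memory outside the operator is what allows a single
// configured object to be reused with injected buffers, as the CpuGemm and
// CpuSoftmax memory-injection tests in this patch demonstrate.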
ARM_COMPUTE_ASSERT(!dst.info()->is_resizable()); + + // Fill tensors + fill(AccessorType(src)); + + ITensorPack run_pack{ { arm_compute::TensorType::ACL_SRC_0, &src }}; + run_pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst); + auto mg = MemoryGroup{}; + auto ws = manage_workspace(softmax.workspace(), mg, run_pack); + + // Compute function + softmax.run(run_pack); + + return dst; + } + + SimpleTensor compute_reference(const TensorShape &shape, DataType data_type, + QuantizationInfo quantization_info, float beta, int32_t axis) + { + // Create reference + SimpleTensor src{ shape, data_type, 1, quantization_info }; + + // Fill reference + fill(src); + + return reference::softmax_layer(src, beta, axis, IS_LOG); + } + + TensorType target_{}; + SimpleTensor reference_{}; + QuantizationInfo quantization_info_{}; +}; + +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif // ACL_TESTS_VALIDATION_FIXTURES_CPUSOFTMAXFIXTURE_H diff --git a/tests/validation/fixtures/ReverseFixture.h b/tests/validation/fixtures/ReverseFixture.h index 5bb8f876d2..58a108d637 100644 --- a/tests/validation/fixtures/ReverseFixture.h +++ b/tests/validation/fixtures/ReverseFixture.h @@ -47,12 +47,6 @@ class ReverseValidationFixture : public framework::Fixture public: void setup(TensorShape shape, TensorShape axis_shape, DataType data_type, bool use_negative_axis = false, bool use_inverted_axis = false) { - if(std::is_same::value && // Cpu - data_type == DataType::F16 && !CPUInfo::get().has_fp16()) - { - return; - } - _num_dims = shape.num_dimensions(); _target = compute_target(shape, axis_shape, data_type, use_negative_axis, use_inverted_axis); _reference = compute_reference(shape, axis_shape, data_type, use_negative_axis, use_inverted_axis); @@ -85,7 +79,17 @@ class ReverseValidationFixture : public framework::Fixture TensorType compute_target(const TensorShape &shape, const TensorShape &axis_shape, DataType data_type, bool use_negative_axis, bool use_inverted_axis = false) { // Create tensors - TensorType src = create_tensor(shape, data_type, 1); + QuantizationInfo qinfo = QuantizationInfo(); + if(data_type == DataType::QSYMM8_PER_CHANNEL) + { + // We need dummy scale and offset values for tensor buffer allocation + const std::vector scales(1); + const std::vector offsets(1); + + qinfo = QuantizationInfo(scales, offsets); + } + + TensorType src = create_tensor(shape, data_type, 1, qinfo); TensorType axis = create_tensor(axis_shape, DataType::U32, 1); TensorType dst; diff --git a/tests/validation/reference/Reverse.cpp b/tests/validation/reference/Reverse.cpp index 7924f900d1..1d7ed74c85 100644 --- a/tests/validation/reference/Reverse.cpp +++ b/tests/validation/reference/Reverse.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020, 2023 Arm Limited. + * Copyright (c) 2018-2020, 2023-2024 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -97,8 +97,20 @@ SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor } template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); + +#ifdef __aarch64__ +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +template SimpleTensor reverse(const SimpleTensor &src, const SimpleTensor &axis, bool use_inverted_axis); +#endif // __aarch64__ + } // namespace reference } // namespace validation } // namespace test diff --git a/tests/validation/runtime/experimental/operators/CpuAdd.cpp b/tests/validation/runtime/experimental/operators/CpuAdd.cpp index 97eaa9ce9e..5a3ec353d1 100644 --- a/tests/validation/runtime/experimental/operators/CpuAdd.cpp +++ b/tests/validation/runtime/experimental/operators/CpuAdd.cpp @@ -52,6 +52,7 @@ const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", {false}); } // namespace TEST_SUITE(NEON) +TEST_SUITE(OPERATORS) TEST_SUITE(CpuAdd) using CpuAddFixture = CpuArithmeticAdditionValidationFixture; @@ -71,7 +72,8 @@ FIXTURE_DATA_TEST_CASE( TEST_SUITE_END() // U8 TEST_SUITE_END() // CpuAdd -TEST_SUITE_END() // Neon +TEST_SUITE_END() // OPERATORS +TEST_SUITE_END() // NEON } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/runtime/experimental/operators/CpuElementwise.cpp b/tests/validation/runtime/experimental/operators/CpuElementwise.cpp index b2007ea22a..a41f8e6b07 100644 --- a/tests/validation/runtime/experimental/operators/CpuElementwise.cpp +++ b/tests/validation/runtime/experimental/operators/CpuElementwise.cpp @@ -54,6 +54,7 @@ const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", {false}); } // namespace TEST_SUITE(NEON) +TEST_SUITE(OPERATORS) TEST_SUITE(CpuElementwiseDivision) template @@ -72,7 +73,7 @@ FIXTURE_DATA_TEST_CASE(SmokeTest, } TEST_SUITE_END() // F32 TEST_SUITE_END() // Float -TEST_SUITE_END() // CpuElementwiseMin +TEST_SUITE_END() // CpuElementwiseDivision TEST_SUITE(CpuElementwiseMax) template @@ -91,7 +92,7 @@ FIXTURE_DATA_TEST_CASE(SmokeTest, } TEST_SUITE_END() // F32 TEST_SUITE_END() // Float -TEST_SUITE_END() // CpuElementwiseMin +TEST_SUITE_END() // CpuElementwiseMax TEST_SUITE(CpuElementwiseMin) @@ -113,7 +114,8 @@ TEST_SUITE_END() // F32 TEST_SUITE_END() // Float TEST_SUITE_END() // CpuElementwiseMin -TEST_SUITE_END() // Neon +TEST_SUITE_END() // OPERATORS +TEST_SUITE_END() // NEON } // namespace validation } // namespace test } // namespace arm_compute diff --git a/tests/validation/runtime/experimental/operators/CpuGemm.cpp b/tests/validation/runtime/experimental/operators/CpuGemm.cpp 
index 9d85f90712..75ad22a448 100644 --- a/tests/validation/runtime/experimental/operators/CpuGemm.cpp +++ b/tests/validation/runtime/experimental/operators/CpuGemm.cpp @@ -22,10 +22,11 @@ * SOFTWARE. */ #include "arm_compute/runtime/experimental/operators/CpuGemm.h" + #include "src/core/helpers/MemoryHelpers.h" -#include "tests/NEON/Accessor.h" -#include "tests/framework/Macros.h" #include "tests/framework/datasets/Datasets.h" +#include "tests/framework/Macros.h" +#include "tests/NEON/Accessor.h" #include "tests/validation/fixtures/GEMMFixture.h" /* @@ -42,19 +43,10 @@ namespace validation { using framework::dataset::make; -namespace -{ -/** CNN data types */ -const auto CNNDataTypes = make("DataType", -{ - DataType::F32, -}); -} // namespace - TEST_SUITE(NEON) TEST_SUITE(OPERATORS) -TEST_SUITE(CPUGEMM) +TEST_SUITE(CpuGemm) /** Test case for memory injection in @ref arm_compute::experimental::op::CpuGemm. * * Configure the operator once and inject memory at run-time in multiple executions. @@ -80,8 +72,8 @@ TEST_CASE(OpCpuGemmMemoryInjection, framework::DatasetMode::ALL) rhs.allocator()->allocate(); c.allocator()->allocate(); - ITensorPack run_pack{ { TensorType::ACL_SRC_0, &lhs }, { TensorType::ACL_SRC_1, &rhs }, { TensorType::ACL_SRC_2, &c } }; - ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &rhs }, { TensorType::ACL_SRC_2, &c } }; + ITensorPack run_pack{{TensorType::ACL_SRC_0, &lhs}, {TensorType::ACL_SRC_1, &rhs}, {TensorType::ACL_SRC_2, &c}}; + ITensorPack prep_pack{{TensorType::ACL_SRC_1, &rhs}, {TensorType::ACL_SRC_2, &c}}; auto mg = MemoryGroup{}; auto ws = manage_workspace(gemm->workspace(), mg, run_pack, prep_pack); @@ -102,22 +94,29 @@ TEST_CASE(OpCpuGemmMemoryInjection, framework::DatasetMode::ALL) }; auto result_0 = run_conv(); auto result_1 = run_conv(); - for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) + for (size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i) { - ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS); + ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], + framework::LogLevel::ERRORS); } } -DATA_TEST_CASE(OpCpuGemmValidateAccumulate, framework::DatasetMode::ALL, combine( - zip(make("In0",{ TensorShape(21U, 13U) }), - make("In1", { TensorShape(33U, 21U) }), - make("Dst", { TensorShape(33U, 13U) })), - zip( - make("alpha", { 1.0, 100.0, 1.0, 1.0 }), - make("beta", { 0.0, 0.0, 1.0, 1.0 }), - make("is_c_null", { false, false, false, true }), - make("Expected", { true, false, false, true }))), - shape_a, shape_b, shape_dst, alpha, beta, is_c_null, expected) +DATA_TEST_CASE(OpCpuGemmValidateAccumulate, + framework::DatasetMode::ALL, + combine(zip(make("In0", {TensorShape(21U, 13U)}), + make("In1", {TensorShape(33U, 21U)}), + make("Dst", {TensorShape(33U, 13U)})), + zip(make("alpha", {1.0, 100.0, 1.0, 1.0}), + make("beta", {0.0, 0.0, 1.0, 1.0}), + make("is_c_null", {false, false, false, true}), + make("Expected", {true, false, false, true}))), + shape_a, + shape_b, + shape_dst, + alpha, + beta, + is_c_null, + expected) { /* Accumulation test for GEMM kernels */ // Create tensors @@ -132,10 +131,10 @@ DATA_TEST_CASE(OpCpuGemmValidateAccumulate, framework::DatasetMode::ALL, combine // Validate accumulation arm_compute::experimental::op::CpuGemm gemm; Status status = gemm.validate(&in_a, &in_b, (is_c_null ? 
-    ARM_COMPUTE_EXPECT((expected == bool(status)), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT((expected == bool(status)), framework::LogLevel::ERRORS);
 }
 
-TEST_SUITE_END() // CPUGEMM
+TEST_SUITE_END() // CpuGemm
 TEST_SUITE_END() // OPERATORS
 TEST_SUITE_END() // NEON
 } // namespace validation
diff --git a/tests/validation/runtime/experimental/operators/CpuMul.cpp b/tests/validation/runtime/experimental/operators/CpuMul.cpp
index 8cad6210a1..3bff2e3b5a 100644
--- a/tests/validation/runtime/experimental/operators/CpuMul.cpp
+++ b/tests/validation/runtime/experimental/operators/CpuMul.cpp
@@ -62,6 +62,7 @@ using CpuMulU8U8toS16Fixture = CpuMulValidationFixture;
 
 TEST_SUITE(NEON)
+TEST_SUITE(OPERATORS)
 TEST_SUITE(CpuMul)
 
 TEST_SUITE(U8U8toS16)
@@ -101,6 +102,7 @@ FIXTURE_DATA_TEST_CASE(
 
 TEST_SUITE_END() // U8U8toS16
 TEST_SUITE_END() // CpuMul
+TEST_SUITE_END() // OPERATORS
 TEST_SUITE_END() // NEON
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/runtime/experimental/operators/CpuSoftmax.cpp b/tests/validation/runtime/experimental/operators/CpuSoftmax.cpp
new file mode 100644
index 0000000000..30eb1c31bb
--- /dev/null
+++ b/tests/validation/runtime/experimental/operators/CpuSoftmax.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2017-2020, 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/runtime/experimental/operators/CpuSoftmax.h"
+#include "arm_compute/core/Types.h"
+
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/CpuSoftmaxFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+using framework::dataset::make;
+
+/** Tolerance for float operations */
+constexpr AbsoluteTolerance tolerance_f32(0.000001f);
+} // namespace
+TEST_SUITE(NEON)
+TEST_SUITE(OPERATORS)
+
+TEST_SUITE(CpuSoftmax)
+
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(
+    make("InputInfo", { TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching data types
+                        TensorInfo(TensorShape(27U, 13U), 1, DataType::F32), // Mismatching shapes
+                        TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8, // Invalid output quantization info
+                                   QuantizationInfo(1.f/256, 12)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                   QuantizationInfo(1.f/256, 12)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis high
+                                   QuantizationInfo(1.f/256, 12)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8, //Invalid axis low
+                                   QuantizationInfo(1.f/256, 12)),
+                      }),
+    make("OutputInfo",{ TensorInfo(TensorShape(27U, 13U), 1, DataType::F16),
+                        TensorInfo(TensorShape(27U, 11U), 1, DataType::F32),
+                        TensorInfo(TensorShape(27U, 13U), 1, DataType::QASYMM8,
+                                   QuantizationInfo(1.f/256, 12)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                   QuantizationInfo(1.f/256, 0)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::F32),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                   QuantizationInfo(1.f/256, 0)),
+                        TensorInfo(TensorShape(32U, 13U), 1, DataType::QASYMM8,
+                                   QuantizationInfo(1.f/256, 0)),
+                      }),
+    make("beta", { 1.0,
+                   2.0,
+                   1.0,
+                   2.0,
+                   1.0,
+                   1.0,
+                   2.0,
+                   1.0,
+                 }),
+    make("axis", { 0,
+                   0,
+                   0,
+                   1,
+                   0,
+                   -1,
+                   2,
+                   -3,
+                 }),
+    make("Expected", { false, false, false, true, true, true, false, false })),
+    input_info, output_info, beta, axis, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(arm_compute::experimental::op::CpuSoftmax::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), beta, axis)) == expected, framework::LogLevel::ERRORS);
+}
+
+TEST_CASE(OpCpuSoftmaxMemoryInjection, framework::DatasetMode::ALL)
+{
+    auto       softmax  = std::make_unique();
+    const auto src_info = TensorInfo(TensorShape{ 1U, 9U }, 1, DataType::F32);
+    auto       dst_info = TensorInfo(TensorShape{ 1U, 9U }, 1, DataType::F32);
+
+    const float   beta   = (1.0F);
+    const int32_t axis   = 0;
+    const bool    is_log = false;
+
+    softmax->configure(&src_info, &dst_info, beta, axis, is_log);
+
+    // the lhs are newly created every call of this lambda function
+    auto src = create_tensor(src_info);
+    auto dst = create_tensor(dst_info);
+    src.allocator()->allocate();
+
+    ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src } };
+    auto mg = MemoryGroup{};
+    auto ws = manage_workspace(softmax->workspace(), mg, run_pack);
+
+    auto run_softmax = [&]() -> Tensor
+    {
+        auto dst = create_tensor(dst_info);
+        dst.allocator()->allocate();
+        run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+        library->fill_tensor_value(Accessor(src), 1.f);
+        // This operator is configured once and captured by this lambda.
+        softmax->run(run_pack);
+        return dst;
+    };
+    auto result_0 = run_softmax();
+    auto result_1 = run_softmax();
+    for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+    {
+        ARM_COMPUTE_EXPECT((reinterpret_cast(result_0.buffer()))[i] == (reinterpret_cast(result_1.buffer()))[i], framework::LogLevel::ERRORS);
+    }
+}
+
+template
+using CpuOpSoftmaxFixture = CpuSoftmaxValidationFixture;
+
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(SmokeTest, CpuOpSoftmaxFixture, framework::DatasetMode::PRECOMMIT,
+                       combine(
+                           datasets::SoftmaxLayerSmallShapes(),
+                           make("DataType", DataType::F32),
+                           make("Beta", { 1.0f, 2.0f }),
+                           make("Axis", { 0, -1 })))
+{
+    // Validate output
+    validate(Accessor(target_), reference_, tolerance_f32);
+}
+
+TEST_SUITE_END() //FP32
+TEST_SUITE_END() //CpuSoftmax
+TEST_SUITE_END() //OPERATORS
+TEST_SUITE_END() //NEON
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/runtime/experimental/operators/CpuSub.cpp b/tests/validation/runtime/experimental/operators/CpuSub.cpp
index 22f5ae8d7b..4736aafb2e 100644
--- a/tests/validation/runtime/experimental/operators/CpuSub.cpp
+++ b/tests/validation/runtime/experimental/operators/CpuSub.cpp
@@ -52,6 +52,7 @@ const auto OutOfPlaceDataSet = framework::dataset::make("InPlace", {false});
 } // namespace
 
 TEST_SUITE(NEON)
+TEST_SUITE(OPERATORS)
 TEST_SUITE(CpuSub)
 
 using CpuSubFixture = CpuArithmeticSubtractionValidationFixture;
@@ -71,7 +72,8 @@ FIXTURE_DATA_TEST_CASE(
 
 TEST_SUITE_END() // U8
 TEST_SUITE_END() // CpuSub
-TEST_SUITE_END() // Neon
+TEST_SUITE_END() // OPERATORS
+TEST_SUITE_END() // NEON
 } // namespace validation
 } // namespace test
 } // namespace arm_compute
diff --git a/utils/Utils.h b/utils/Utils.h
index 626cbcf07f..93dc2fa106 100644
--- a/utils/Utils.h
+++ b/utils/Utils.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2023 Arm Limited.
+ * Copyright (c) 2016-2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,9 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef __UTILS_UTILS_H__
-#define __UTILS_UTILS_H__
+
+#ifndef ACL_UTILS_UTILS_H
+#define ACL_UTILS_UTILS_H
 
 /** @dir .
  *  brief Boiler plate code used by examples. Various utilities to print types, load / store assets, etc.
@@ -261,10 +262,12 @@ class uniform_real_distribution_16bit
     using result_type = T;
     /** Constructor
      *
-     * @param[in] min Minimum value of the distribution
-     * @param[in] max Maximum value of the distribution
+     * @param[in] min      Minimum value of the distribution
+     * @param[in] max      Maximum value of the distribution
+     * @param[in] portable Boolean to indicate portable conversion in-between 16-bit and other data types
      */
-    explicit uniform_real_distribution_16bit(float min = 0.f, float max = 1.0) : dist(min, max)
+    explicit uniform_real_distribution_16bit(float min = 0.f, float max = 1.0, bool portable = false)
+        : _dist(min, max), _portable(portable)
     {
     }
 
@@ -274,11 +277,24 @@ class uniform_real_distribution_16bit
      */
     T operator()(std::mt19937 &gen)
    {
-        return T(dist(gen));
+        return convert(_dist(gen));
     }
 
 private:
-    std::uniform_real_distribution dist;
+    template
+    inline typename std::enable_if::value, bfloat16>::type convert(float x)
+    {
+        return bfloat16(x, _portable);
+    }
+
+    template
+    inline typename std::enable_if::value, T>::type convert(float x)
+    {
+        return T(x);
+    }
+
+    std::uniform_real_distribution _dist;
+    bool                           _portable;
 };
 
 /** Numpy data loader */
@@ -857,4 +873,5 @@ int compare_tensor(ITensor &tensor1, ITensor &tensor2, T tolerance)
 }
 } // namespace utils
 } // namespace arm_compute
-#endif /* __UTILS_UTILS_H__*/
+
+#endif // ACL_UTILS_UTILS_H
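
For reference, the calling pattern exercised by the new CpuSoftmax memory-injection test in this patch can be summarised as a small standalone sketch. This is illustrative only and not part of the patch itself; it assumes the experimental operator API as used by that test (configure() on tensor infos, workspace(), run() with an ITensorPack) and the manage_workspace<Tensor> helper from src/core/helpers/MemoryHelpers.h, with a caller-chosen function name.

// Illustrative sketch only: driving the stateless experimental CpuSoftmax
// operator the same way the memory-injection test above does.
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/experimental/operators/CpuSoftmax.h"

#include "src/core/helpers/MemoryHelpers.h"

using namespace arm_compute;

void run_softmax_once()
{
    const TensorInfo src_info(TensorShape(1U, 9U), 1, DataType::F32);
    TensorInfo       dst_info(TensorShape(1U, 9U), 1, DataType::F32);

    // The operator is configured once, against tensor metadata only...
    experimental::op::CpuSoftmax softmax;
    softmax.configure(&src_info, &dst_info, 1.0f /* beta */, 0 /* axis */, false /* is_log */);

    // ...and the actual tensors are supplied at run time through an ITensorPack.
    Tensor src;
    Tensor dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack run_pack{{TensorType::ACL_SRC_0, &src}, {TensorType::ACL_DST, &dst}};

    // Scratch memory requested by the kernels is owned by the caller and kept
    // alive in a workspace for as long as run() is being called.
    MemoryGroup mg;
    auto        ws = manage_workspace<Tensor>(softmax.workspace(), mg, run_pack);

    softmax.run(run_pack);
}

The same configure()/workspace()/run() pattern applies to the other experimental operators touched by this patch (CpuAdd, CpuSub, CpuMul, CpuGemm); only the number of ACL_SRC_* entries in the pack and the configure() arguments change per operator.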