Merge branch 'main' into blend_test

ARM-software · Jul 31, 2024 · 7338637 · 7338637
2 parents b8192c2 + 69bc17b
commit 7338637
Show file tree

Hide file tree

Showing 10 changed files with 1,124 additions and 1,030 deletions.
diff --git a/Source/UnitTest/test_simd.cpp b/Source/UnitTest/test_simd.cpp
diff --git a/Source/astcenc_decompress_symbolic.cpp b/Source/astcenc_decompress_symbolic.cpp
@@ -110,7 +110,7 @@ void unpack_weights(
 		{
 			vint summed_value(8);
 			vint weight_count(di.texel_weight_count + i);
-			int max_weight_count = hmax(weight_count).lane<0>();
+			int max_weight_count = hmax_s(weight_count);
 
 			promise(max_weight_count > 0);
 			for (int j = 0; j < max_weight_count; j++)
@@ -145,7 +145,7 @@ void unpack_weights(
 			vint sum_plane2(8);
 
 			vint weight_count(di.texel_weight_count + i);
-			int max_weight_count = hmax(weight_count).lane<0>();
+			int max_weight_count = hmax_s(weight_count);
 
 			promise(max_weight_count > 0);
 			for (int j = 0; j < max_weight_count; j++)

diff --git a/Source/astcenc_ideal_endpoints_and_weights.cpp b/Source/astcenc_ideal_endpoints_and_weights.cpp
@@ -889,7 +889,7 @@ void compute_ideal_weights_for_decimation(
 
 		// Accumulate error weighting of all the texels using this weight
 		vint weight_texel_count(di.weight_texel_count + i);
-		unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
+		unsigned int max_texel_count = hmax_s(weight_texel_count);
 		promise(max_texel_count > 0);
 
 		for (unsigned int j = 0; j < max_texel_count; j++)
@@ -947,7 +947,7 @@ void compute_ideal_weights_for_decimation(
 
 		// Accumulate error weighting of all the texels using this weight
 		vint weight_texel_count(di.weight_texel_count + i);
-		unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
+		unsigned int max_texel_count = hmax_s(weight_texel_count);
 		promise(max_texel_count > 0);
 
 		for (unsigned int j = 0; j < max_texel_count; j++)

diff --git a/Source/astcenc_pick_best_endpoint_format.cpp b/Source/astcenc_pick_best_endpoint_format.cpp
@@ -1,6 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // ----------------------------------------------------------------------------
-// Copyright 2011-2022 Arm Limited
+// Copyright 2011-2024 Arm Limited
 //
 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
 // use this file except in compliance with the License. You may obtain a copy
@@ -1306,8 +1306,8 @@ unsigned int compute_ideal_endpoint_formats(
 		// Pick best mode from the SIMD result, using lowest matching index to ensure invariance
 		vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error);
 		vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error);
-		vbest_error_index = hmin(vbest_error_index);
-		int best_error_index = vbest_error_index.lane<0>();
+
+		int best_error_index = hmin_s(vbest_error_index);
 
 		best_error_weights[i] = best_error_index;
 

diff --git a/Source/astcenc_vecmathlib_avx2_8.h b/Source/astcenc_vecmathlib_avx2_8.h
@@ -74,18 +74,6 @@ struct vfloat8
 		m = _mm256_set1_ps(a);
 	}
 
-	/**
-	 * @brief Construct from 8 scalar values.
-	 *
-	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
-	 */
-	ASTCENC_SIMD_INLINE explicit vfloat8(
-		float a, float b, float c, float d,
-		float e, float f, float g, float h)
-	{
-		m = _mm256_set_ps(h, g, f, e, d, c, b, a);
-	}
-
 	/**
 	 * @brief Construct from an existing SIMD register.
 	 */
@@ -94,20 +82,6 @@ struct vfloat8
 		m = a;
 	}
 
-	/**
-	 * @brief Get the scalar value of a single lane.
-	 */
-	template <int l> ASTCENC_SIMD_INLINE float lane() const
-	{
-	#if !defined(__clang__) && defined(_MSC_VER)
-		return m.m256_f32[l];
-	#else
-		union { __m256 m; float f[8]; } cvt;
-		cvt.m = m;
-		return cvt.f[l];
-	#endif
-	}
-
 	/**
 	 * @brief Factory that returns a vector of zeros.
 	 */
@@ -132,14 +106,6 @@ struct vfloat8
 		return vfloat8(_mm256_load_ps(p));
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat8 lane_id()
-	{
-		return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
-	}
-
 	/**
 	 * @brief The vector ...
 	 */
@@ -190,18 +156,6 @@ struct vint8
 		m = _mm256_set1_epi32(a);
 	}
 
-	/**
-	 * @brief Construct from 8 scalar values.
-	 *
-	 * The value of @c a is stored to lane 0 (LSB) in the SIMD register.
-	 */
-	ASTCENC_SIMD_INLINE explicit vint8(
-		int a, int b, int c, int d,
-		int e, int f, int g, int h)
-	{
-		m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
-	}
-
 	/**
 	 * @brief Construct from an existing SIMD register.
 	 */
@@ -210,20 +164,6 @@ struct vint8
 		m = a;
 	}
 
-	/**
-	 * @brief Get the scalar from a single lane.
-	 */
-	template <int l> ASTCENC_SIMD_INLINE int lane() const
-	{
-	#if !defined(__clang__) && defined(_MSC_VER)
-		return m.m256i_i32[l];
-	#else
-		union { __m256i m; int f[8]; } cvt;
-		cvt.m = m;
-		return cvt.f[l];
-	#endif
-	}
-
 	/**
 	 * @brief Factory that returns a vector of zeros.
 	 */
@@ -528,6 +468,14 @@ ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
 	return vmin;
 }
 
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
+{
+	return _mm256_cvtsi256_si32(hmin(a).m);
+}
+
 /**
  * @brief Return the horizontal maximum of a vector.
  */
@@ -543,6 +491,14 @@ ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
 	return vmax;
 }
 
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
+{
+	return _mm256_cvtsi256_si32(hmax(a).m);
+}
+
 /**
  * @brief Store a vector to a 16B aligned memory address.
  */
@@ -570,14 +526,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
 	_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
-{
-	return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */
@@ -786,19 +734,6 @@ ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
 	return a;
 }
 
-/**
- * @brief Return a clamped value between 0.0f and max.
- *
- * It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
- * be returned for that lane.
- */
-ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
-{
-	a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
-	a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
-	return a;
-}
-
 /**
  * @brief Return a clamped value between 0.0f and 1.0f.
  *
@@ -857,7 +792,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
  */
 ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
 {
-	return hmin(a).lane<0>();
+	return _mm256_cvtss_f32(hmin(a).m);
 }
 
 /**
@@ -887,7 +822,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
  */
 ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
 {
-	return hmax(a).lane<0>();
+	return _mm256_cvtss_f32(hmax(a).m);
 }
 
 /**
@@ -1146,7 +1081,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3
  * @brief Return a vector of interleaved RGBA data.
  *
  * Input vectors have the value stored in the bottom 8 bits of each lane,
- * with high  bits set to zero.
+ * with high bits set to zero.
  *
  * Output vector stores a single RGBA texel packed in each lane.
  */

diff --git a/Source/astcenc_vecmathlib_common_4.h b/Source/astcenc_vecmathlib_common_4.h
@@ -129,6 +129,22 @@ ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
 	return a.lane<0>() + a.lane<1>() + a.lane<2>();
 }
 
+/**
+ * @brief Return the horizontal minimum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
+{
+	return hmin(a).lane<0>();
+}
+
+/**
+ * @brief Return the horizontal maximum of a vector.
+ */
+ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
+{
+	return hmax(a).lane<0>();
+}
+
 // ============================================================================
 // vfloat4 operators and functions
 // ============================================================================
@@ -222,18 +238,6 @@ ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
 	return min(max(a, minv), maxv);
 }
 
-/**
- * @brief Return the clamped value between 0.0f and max.
- *
- * It is assumed that  @c max is not a NaN value. If @c a is NaN then zero will
- * be returned for that lane.
- */
-ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
-{
-	// Do not reorder - second operand will return if either is NaN
-	return min(max(a, vfloat4::zero()), maxv);
-}
-
 /**
  * @brief Return the clamped value between 0.0f and 1.0f.
  *

diff --git a/Source/astcenc_vecmathlib_neon_4.h b/Source/astcenc_vecmathlib_neon_4.h
@@ -134,15 +134,6 @@ struct vfloat4
 		return vfloat4(vld1q_f32(p));
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
-		return vfloat4(vld1q_f32(data));
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */
@@ -611,21 +602,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 	vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-	alignas(16) int idx[4];
-	storea(indices, idx);
-	alignas(16) int vals[4];
-	vals[0] = base[idx[0]];
-	vals[1] = base[idx[1]];
-	vals[2] = base[idx[2]];
-	vals[3] = base[idx[3]];
-	return vint4(vals);
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */

diff --git a/Source/astcenc_vecmathlib_none_4.h b/Source/astcenc_vecmathlib_none_4.h
@@ -139,14 +139,6 @@ struct vfloat4
 		return vfloat4(p);
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */
@@ -354,7 +346,7 @@ struct vmask4
 	/**
 	 * @brief Get the scalar value of a single lane.
 	 */
-	template <int l> ASTCENC_SIMD_INLINE float lane() const
+	template <int l> ASTCENC_SIMD_INLINE bool lane() const
 	{
 		return m[l] != 0;
 	}
@@ -684,17 +676,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 	std::memcpy(p, a.m, sizeof(uint8_t) * 4);
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-	return vint4(base[indices.m[0]],
-	             base[indices.m[1]],
-	             base[indices.m[2]],
-	             base[indices.m[3]]);
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */

diff --git a/Source/astcenc_vecmathlib_sse_4.h b/Source/astcenc_vecmathlib_sse_4.h
@@ -142,14 +142,6 @@ struct vfloat4
 		return vfloat4(_mm_load_ps(p));
 	}
 
-	/**
-	 * @brief Factory that returns a vector containing the lane IDs.
-	 */
-	static ASTCENC_SIMD_INLINE vfloat4 lane_id()
-	{
-		return vfloat4(_mm_set_ps(3, 2, 1, 0));
-	}
-
 	/**
 	 * @brief Return a swizzled float 2.
 	 */
@@ -663,20 +655,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
 	_mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
 }
 
-/**
- * @brief Gather N (vector width) indices from the array.
- */
-ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
-{
-#if ASTCENC_AVX >= 2
-	return vint4(_mm_i32gather_epi32(base, indices.m, 4));
-#else
-	alignas(16) int idx[4];
-	storea(indices, idx);
-	return vint4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);
-#endif
-}
-
 /**
  * @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
  */