Skip to content

Commit

Permalink
Merge branch 'main' into blend_test
Browse files Browse the repository at this point in the history
  • Loading branch information
solidpixel authored Jul 31, 2024
2 parents b8192c2 + 69bc17b commit 7338637
Show file tree
Hide file tree
Showing 10 changed files with 1,124 additions and 1,030 deletions.
1,940 changes: 1,080 additions & 860 deletions Source/UnitTest/test_simd.cpp

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions Source/astcenc_decompress_symbolic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ void unpack_weights(
{
vint summed_value(8);
vint weight_count(di.texel_weight_count + i);
int max_weight_count = hmax(weight_count).lane<0>();
int max_weight_count = hmax_s(weight_count);

promise(max_weight_count > 0);
for (int j = 0; j < max_weight_count; j++)
Expand Down Expand Up @@ -145,7 +145,7 @@ void unpack_weights(
vint sum_plane2(8);

vint weight_count(di.texel_weight_count + i);
int max_weight_count = hmax(weight_count).lane<0>();
int max_weight_count = hmax_s(weight_count);

promise(max_weight_count > 0);
for (int j = 0; j < max_weight_count; j++)
Expand Down
4 changes: 2 additions & 2 deletions Source/astcenc_ideal_endpoints_and_weights.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -889,7 +889,7 @@ void compute_ideal_weights_for_decimation(

// Accumulate error weighting of all the texels using this weight
vint weight_texel_count(di.weight_texel_count + i);
unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
unsigned int max_texel_count = hmax_s(weight_texel_count);
promise(max_texel_count > 0);

for (unsigned int j = 0; j < max_texel_count; j++)
Expand Down Expand Up @@ -947,7 +947,7 @@ void compute_ideal_weights_for_decimation(

// Accumulate error weighting of all the texels using this weight
vint weight_texel_count(di.weight_texel_count + i);
unsigned int max_texel_count = hmax(weight_texel_count).lane<0>();
unsigned int max_texel_count = hmax_s(weight_texel_count);
promise(max_texel_count > 0);

for (unsigned int j = 0; j < max_texel_count; j++)
Expand Down
6 changes: 3 additions & 3 deletions Source/astcenc_pick_best_endpoint_format.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: Apache-2.0
// ----------------------------------------------------------------------------
// Copyright 2011-2022 Arm Limited
// Copyright 2011-2024 Arm Limited
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy
Expand Down Expand Up @@ -1306,8 +1306,8 @@ unsigned int compute_ideal_endpoint_formats(
// Pick best mode from the SIMD result, using lowest matching index to ensure invariance
vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error);
vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error);
vbest_error_index = hmin(vbest_error_index);
int best_error_index = vbest_error_index.lane<0>();

int best_error_index = hmin_s(vbest_error_index);

best_error_weights[i] = best_error_index;

Expand Down
103 changes: 19 additions & 84 deletions Source/astcenc_vecmathlib_avx2_8.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,18 +74,6 @@ struct vfloat8
m = _mm256_set1_ps(a);
}

/**
* @brief Construct from 8 scalar values.
*
* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
*/
ASTCENC_SIMD_INLINE explicit vfloat8(
float a, float b, float c, float d,
float e, float f, float g, float h)
{
m = _mm256_set_ps(h, g, f, e, d, c, b, a);
}

/**
* @brief Construct from an existing SIMD register.
*/
Expand All @@ -94,20 +82,6 @@ struct vfloat8
m = a;
}

/**
* @brief Get the scalar value of a single lane.
*/
template <int l> ASTCENC_SIMD_INLINE float lane() const
{
#if !defined(__clang__) && defined(_MSC_VER)
return m.m256_f32[l];
#else
union { __m256 m; float f[8]; } cvt;
cvt.m = m;
return cvt.f[l];
#endif
}

/**
* @brief Factory that returns a vector of zeros.
*/
Expand All @@ -132,14 +106,6 @@ struct vfloat8
return vfloat8(_mm256_load_ps(p));
}

/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat8 lane_id()
{
return vfloat8(_mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0));
}

/**
* @brief The vector ...
*/
Expand Down Expand Up @@ -190,18 +156,6 @@ struct vint8
m = _mm256_set1_epi32(a);
}

/**
* @brief Construct from 8 scalar values.
*
* The value of @c a is stored to lane 0 (LSB) in the SIMD register.
*/
ASTCENC_SIMD_INLINE explicit vint8(
int a, int b, int c, int d,
int e, int f, int g, int h)
{
m = _mm256_set_epi32(h, g, f, e, d, c, b, a);
}

/**
* @brief Construct from an existing SIMD register.
*/
Expand All @@ -210,20 +164,6 @@ struct vint8
m = a;
}

/**
* @brief Get the scalar from a single lane.
*/
template <int l> ASTCENC_SIMD_INLINE int lane() const
{
#if !defined(__clang__) && defined(_MSC_VER)
return m.m256i_i32[l];
#else
union { __m256i m; int f[8]; } cvt;
cvt.m = m;
return cvt.f[l];
#endif
}

/**
* @brief Factory that returns a vector of zeros.
*/
Expand Down Expand Up @@ -528,6 +468,14 @@ ASTCENC_SIMD_INLINE vint8 hmin(vint8 a)
return vmin;
}

/**
* @brief Return the horizontal minimum of a vector.
*/
ASTCENC_SIMD_INLINE int hmin_s(vint8 a)
{
return _mm256_cvtsi256_si32(hmin(a).m);
}

/**
* @brief Return the horizontal maximum of a vector.
*/
Expand All @@ -543,6 +491,14 @@ ASTCENC_SIMD_INLINE vint8 hmax(vint8 a)
return vmax;
}

/**
* @brief Return the horizontal maximum of a vector.
*/
ASTCENC_SIMD_INLINE int hmax_s(vint8 a)
{
return _mm256_cvtsi256_si32(hmax(a).m);
}

/**
* @brief Store a vector to a 16B aligned memory address.
*/
Expand Down Expand Up @@ -570,14 +526,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint8 a, uint8_t* p)
_mm_storel_epi64(reinterpret_cast<__m128i*>(p), _mm256_extracti128_si256(a.m, 0));
}

/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint8 gatheri(const int* base, vint8 indices)
{
return vint8(_mm256_i32gather_epi32(base, indices.m, 4));
}

/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
Expand Down Expand Up @@ -786,19 +734,6 @@ ASTCENC_SIMD_INLINE vfloat8 clamp(float min, float max, vfloat8 a)
return a;
}

/**
* @brief Return a clamped value between 0.0f and max.
*
* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
* be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat8 clampz(float max, vfloat8 a)
{
a.m = _mm256_max_ps(a.m, _mm256_setzero_ps());
a.m = _mm256_min_ps(a.m, _mm256_set1_ps(max));
return a;
}

/**
* @brief Return a clamped value between 0.0f and 1.0f.
*
Expand Down Expand Up @@ -857,7 +792,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmin(vfloat8 a)
*/
ASTCENC_SIMD_INLINE float hmin_s(vfloat8 a)
{
return hmin(a).lane<0>();
return _mm256_cvtss_f32(hmin(a).m);
}

/**
Expand Down Expand Up @@ -887,7 +822,7 @@ ASTCENC_SIMD_INLINE vfloat8 hmax(vfloat8 a)
*/
ASTCENC_SIMD_INLINE float hmax_s(vfloat8 a)
{
return hmax(a).lane<0>();
return _mm256_cvtss_f32(hmax(a).m);
}

/**
Expand Down Expand Up @@ -1146,7 +1081,7 @@ ASTCENC_SIMD_INLINE vint8 vtable_8bt_32bi(vint8 t0, vint8 t1, vint8 t2, vint8 t3
* @brief Return a vector of interleaved RGBA data.
*
* Input vectors have the value stored in the bottom 8 bits of each lane,
* with high bits set to zero.
* with high bits set to zero.
*
* Output vector stores a single RGBA texel packed in each lane.
*/
Expand Down
28 changes: 16 additions & 12 deletions Source/astcenc_vecmathlib_common_4.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,22 @@ ASTCENC_SIMD_INLINE int hadd_rgb_s(vint4 a)
return a.lane<0>() + a.lane<1>() + a.lane<2>();
}

/**
* @brief Return the horizontal minimum of a vector.
*/
ASTCENC_SIMD_INLINE int hmin_s(vint4 a)
{
return hmin(a).lane<0>();
}

/**
* @brief Return the horizontal maximum of a vector.
*/
ASTCENC_SIMD_INLINE int hmax_s(vint4 a)
{
return hmax(a).lane<0>();
}

// ============================================================================
// vfloat4 operators and functions
// ============================================================================
Expand Down Expand Up @@ -222,18 +238,6 @@ ASTCENC_SIMD_INLINE vfloat4 clamp(float minv, float maxv, vfloat4 a)
return min(max(a, minv), maxv);
}

/**
* @brief Return the clamped value between 0.0f and max.
*
* It is assumed that @c max is not a NaN value. If @c a is NaN then zero will
* be returned for that lane.
*/
ASTCENC_SIMD_INLINE vfloat4 clampz(float maxv, vfloat4 a)
{
// Do not reorder - second operand will return if either is NaN
return min(max(a, vfloat4::zero()), maxv);
}

/**
* @brief Return the clamped value between 0.0f and 1.0f.
*
Expand Down
24 changes: 0 additions & 24 deletions Source/astcenc_vecmathlib_neon_4.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,15 +134,6 @@ struct vfloat4
return vfloat4(vld1q_f32(p));
}

/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
{
alignas(16) float data[4] { 0.0f, 1.0f, 2.0f, 3.0f };
return vfloat4(vld1q_f32(data));
}

/**
* @brief Return a swizzled float 2.
*/
Expand Down Expand Up @@ -611,21 +602,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
vst1q_lane_s32(reinterpret_cast<int32_t*>(p), a.m, 0);
}

/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
alignas(16) int idx[4];
storea(indices, idx);
alignas(16) int vals[4];
vals[0] = base[idx[0]];
vals[1] = base[idx[1]];
vals[2] = base[idx[2]];
vals[3] = base[idx[3]];
return vint4(vals);
}

/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
Expand Down
21 changes: 1 addition & 20 deletions Source/astcenc_vecmathlib_none_4.h
Original file line number Diff line number Diff line change
Expand Up @@ -139,14 +139,6 @@ struct vfloat4
return vfloat4(p);
}

/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
{
return vfloat4(0.0f, 1.0f, 2.0f, 3.0f);
}

/**
* @brief Return a swizzled float 2.
*/
Expand Down Expand Up @@ -354,7 +346,7 @@ struct vmask4
/**
* @brief Get the scalar value of a single lane.
*/
template <int l> ASTCENC_SIMD_INLINE float lane() const
template <int l> ASTCENC_SIMD_INLINE bool lane() const
{
return m[l] != 0;
}
Expand Down Expand Up @@ -684,17 +676,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
std::memcpy(p, a.m, sizeof(uint8_t) * 4);
}

/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
return vint4(base[indices.m[0]],
base[indices.m[1]],
base[indices.m[2]],
base[indices.m[3]]);
}

/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
Expand Down
22 changes: 0 additions & 22 deletions Source/astcenc_vecmathlib_sse_4.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,14 +142,6 @@ struct vfloat4
return vfloat4(_mm_load_ps(p));
}

/**
* @brief Factory that returns a vector containing the lane IDs.
*/
static ASTCENC_SIMD_INLINE vfloat4 lane_id()
{
return vfloat4(_mm_set_ps(3, 2, 1, 0));
}

/**
* @brief Return a swizzled float 2.
*/
Expand Down Expand Up @@ -663,20 +655,6 @@ ASTCENC_SIMD_INLINE void store_nbytes(vint4 a, uint8_t* p)
_mm_store_ss(reinterpret_cast<float*>(p), _mm_castsi128_ps(a.m));
}

/**
* @brief Gather N (vector width) indices from the array.
*/
ASTCENC_SIMD_INLINE vint4 gatheri(const int* base, vint4 indices)
{
#if ASTCENC_AVX >= 2
return vint4(_mm_i32gather_epi32(base, indices.m, 4));
#else
alignas(16) int idx[4];
storea(indices, idx);
return vint4(base[idx[0]], base[idx[1]], base[idx[2]], base[idx[3]]);
#endif
}

/**
* @brief Pack low 8 bits of N (vector width) lanes into bottom of vector.
*/
Expand Down
Loading

0 comments on commit 7338637

Please sign in to comment.