Skip to content

Commit 454b6de

Browse files
authored
Improve SIMD vtable API (#492)
1 parent 44e3b94 commit 454b6de

10 files changed

+544
-345
lines changed

Source/UnitTest/test_simd.cpp

Lines changed: 128 additions & 55 deletions
Original file line number | Diff line number | Diff line change
@@ -1947,43 +1947,78 @@ TEST(vmask4, not)
19471947
}
19481948

19491949
/** @brief Test vint4 table permute. */
1950-
TEST(vint4, vtable_8bt_32bi_32entry)
1950+
TEST(vint4, vtable4_16x8)
19511951
{
1952-
vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
1953-
vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);
1952+
uint8_t data[16] = {
1953+
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1954+
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
1955+
};
19541956

1955-
vint4 table0p, table1p;
1956-
vtable_prepare(table0, table1, table0p, table1p);
1957+
vtable4_16x8 table;
1958+
vtable_prepare(table, data);
19571959

1958-
vint4 index(0, 7, 4, 31);
1960+
vint4 index(0, 7, 4, 15);
19591961

1960-
vint4 result = vtable_8bt_32bi(table0p, table1p, index);
1962+
vint4 result = vtable_lookup_32bit(table, index);
19611963

1962-
EXPECT_EQ(result.lane<0>(), 3);
1963-
EXPECT_EQ(result.lane<1>(), 4);
1964-
EXPECT_EQ(result.lane<2>(), 7);
1965-
EXPECT_EQ(result.lane<3>(), 28);
1964+
EXPECT_EQ(result.lane<0>(), 0);
1965+
EXPECT_EQ(result.lane<1>(), 7);
1966+
EXPECT_EQ(result.lane<2>(), 4);
1967+
EXPECT_EQ(result.lane<3>(), 15);
19661968
}
19671969

19681970
/** @brief Test vint4 table permute. */
1969-
TEST(vint4, vtable_8bt_32bi_64entry)
1971+
TEST(vint4, vtable4_32x8)
19701972
{
1971-
vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
1972-
vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);
1973-
vint4 table2(0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f);
1974-
vint4 table3(0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
1973+
uint8_t data[32] = {
1974+
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1975+
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1976+
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
1977+
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
1978+
};
1979+
1980+
vtable4_32x8 table;
1981+
vtable_prepare(table, data);
1982+
1983+
vint4 index(0, 7, 4, 31);
1984+
1985+
vint4 result = vtable_lookup_32bit(table, index);
19751986

1976-
vint4 table0p, table1p, table2p, table3p;
1977-
vtable_prepare(table0, table1, table2, table3, table0p, table1p, table2p, table3p);
1987+
EXPECT_EQ(result.lane<0>(), 0);
1988+
EXPECT_EQ(result.lane<1>(), 7);
1989+
EXPECT_EQ(result.lane<2>(), 4);
1990+
EXPECT_EQ(result.lane<3>(), 31);
1991+
}
1992+
1993+
/** @brief Test vint4 table permute using a 64-entry table of 8-bit values. */
1994+
TEST(vint4, vtable4_64x8)
1995+
{
1996+
uint8_t data[64] = {
1997+
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
1998+
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1999+
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
2000+
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
2001+
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
2002+
0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
2003+
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
2004+
0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f
2005+
};
2006+
2007+
vtable4_64x8 table;
2008+
vtable_prepare(table, data);
19782009

19792010
vint4 index(0, 7, 38, 63);
19802011

1981-
vint4 result = vtable_8bt_32bi(table0p, table1p, table2p, table3p, index);
2012+
vint4 result = vtable_lookup_32bit(table, index);
2013+
2014+
uint8_t* hack = reinterpret_cast<uint8_t*>(&table);
2015+
std::cout << "38: " << hack[38] << "\n";
2016+
std::cout << "63: " << hack[63] << "\n";
19822017

1983-
EXPECT_EQ(result.lane<0>(), 3);
1984-
EXPECT_EQ(result.lane<1>(), 4);
1985-
EXPECT_EQ(result.lane<2>(), 37);
1986-
EXPECT_EQ(result.lane<3>(), 60);
2018+
EXPECT_EQ(result.lane<0>(), 0);
2019+
EXPECT_EQ(result.lane<1>(), 7);
2020+
EXPECT_EQ(result.lane<2>(), 38);
2021+
EXPECT_EQ(result.lane<3>(), 63);
19872022
}
19882023

19892024
/** @brief Test vint4 rgba byte interleave. */
@@ -3657,57 +3692,95 @@ TEST(vmask8, not)
36573692
}
36583693

36593694
/** @brief Test vint8 table permute. */
3660-
TEST(vint8, vtable_8bt_32bi_32entry)
3695+
TEST(vint8, vtable8_16x8)
36613696
{
3662-
vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
3663-
vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);
3697+
uint8_t data[16] = {
3698+
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
3699+
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
3700+
};
36643701

3665-
vint8 table0p, table1p;
3666-
vtable_prepare(table0, table1, table0p, table1p);
3702+
vtable8_16x8 table;
3703+
vtable_prepare(table, data);
36673704

3668-
vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 23, 31);
3705+
vint8 index = vint8_lit(0, 7, 4, 15, 1, 2, 14, 4);
36693706

3670-
vint8 result = vtable_8bt_32bi(table0p, table1p, index);
3707+
vint8 result = vtable_lookup_32bit(table, index);
36713708

36723709
alignas(32) int ra[8];
36733710
store(result, ra);
36743711

3675-
EXPECT_EQ(ra[0], 3);
3676-
EXPECT_EQ(ra[1], 4);
3677-
EXPECT_EQ(ra[2], 7);
3678-
EXPECT_EQ(ra[3], 12);
3679-
EXPECT_EQ(ra[4], 19);
3680-
EXPECT_EQ(ra[5], 23);
3681-
EXPECT_EQ(ra[6], 20);
3682-
EXPECT_EQ(ra[7], 28);
3712+
EXPECT_EQ(ra[0], 0);
3713+
EXPECT_EQ(ra[1], 7);
3714+
EXPECT_EQ(ra[2], 4);
3715+
EXPECT_EQ(ra[3], 15);
3716+
EXPECT_EQ(ra[4], 1);
3717+
EXPECT_EQ(ra[5], 2);
3718+
EXPECT_EQ(ra[6], 14);
3719+
EXPECT_EQ(ra[7], 4);
36833720
}
36843721

3685-
/** @brief Test vint4 table permute. */
3686-
TEST(vint8, vtable_8bt_32bi_64entry)
3722+
/** @brief Test vint8 table permute using a 32-entry table of 8-bit values. */
3723+
TEST(vint8, vtable8_32x8)
36873724
{
3688-
vint4 table0(0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f);
3689-
vint4 table1(0x10111213, 0x14151617, 0x18191a1b, 0x1c1d1e1f);
3690-
vint4 table2(0x20212223, 0x24252627, 0x28292a2b, 0x2c2d2e2f);
3691-
vint4 table3(0x30313233, 0x34353637, 0x38393a3b, 0x3c3d3e3f);
3725+
uint8_t data[32] = {
3726+
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
3727+
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
3728+
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
3729+
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
3730+
};
3731+
3732+
vtable8_32x8 table;
3733+
vtable_prepare(table, data);
3734+
3735+
vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 23, 31);
36923736

3693-
vint8 table0p, table1p, table2p, table3p;
3694-
vtable_prepare(table0, table1, table2, table3, table0p, table1p, table2p, table3p);
3737+
vint8 result = vtable_lookup_32bit(table, index);
3738+
3739+
alignas(32) int ra[8];
3740+
store(result, ra);
3741+
3742+
EXPECT_EQ(ra[0], 0);
3743+
EXPECT_EQ(ra[1], 7);
3744+
EXPECT_EQ(ra[2], 4);
3745+
EXPECT_EQ(ra[3], 15);
3746+
EXPECT_EQ(ra[4], 16);
3747+
EXPECT_EQ(ra[5], 20);
3748+
EXPECT_EQ(ra[6], 23);
3749+
EXPECT_EQ(ra[7], 31);
3750+
}
3751+
3752+
/** @brief Test vint8 table permute using a 64-entry table of 8-bit values. */
3753+
TEST(vint8, vtable8_64x8)
3754+
{
3755+
uint8_t data[64] = {
3756+
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
3757+
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
3758+
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
3759+
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
3760+
0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
3761+
0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
3762+
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
3763+
0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f
3764+
};
3765+
3766+
vtable8_64x8 table;
3767+
vtable_prepare(table, data);
36953768

36963769
vint8 index = vint8_lit(0, 7, 4, 15, 16, 20, 38, 63);
36973770

3698-
vint8 result = vtable_8bt_32bi(table0p, table1p, table2p, table3p, index);
3771+
vint8 result = vtable_lookup_32bit(table, index);
36993772

37003773
alignas(32) int ra[8];
37013774
store(result, ra);
37023775

3703-
EXPECT_EQ(ra[0], 3);
3704-
EXPECT_EQ(ra[1], 4);
3705-
EXPECT_EQ(ra[2], 7);
3706-
EXPECT_EQ(ra[3], 12);
3707-
EXPECT_EQ(ra[4], 19);
3708-
EXPECT_EQ(ra[5], 23);
3709-
EXPECT_EQ(ra[6], 37);
3710-
EXPECT_EQ(ra[7], 60);
3776+
EXPECT_EQ(ra[0], 0);
3777+
EXPECT_EQ(ra[1], 7);
3778+
EXPECT_EQ(ra[2], 4);
3779+
EXPECT_EQ(ra[3], 15);
3780+
EXPECT_EQ(ra[4], 16);
3781+
EXPECT_EQ(ra[5], 20);
3782+
EXPECT_EQ(ra[6], 38);
3783+
EXPECT_EQ(ra[7], 63);
37113784
}
37123785

37133786
#endif

Source/astcenc_decompress_symbolic.cpp

Lines changed: 9 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -98,13 +98,8 @@ void unpack_weights(
9898
if (!is_dual_plane)
9999
{
100100
// Build full 64-entry weight lookup table
101-
vint4 tab0 = vint4::load(scb.weights + 0);
102-
vint4 tab1 = vint4::load(scb.weights + 16);
103-
vint4 tab2 = vint4::load(scb.weights + 32);
104-
vint4 tab3 = vint4::load(scb.weights + 48);
105-
106-
vint tab0p, tab1p, tab2p, tab3p;
107-
vtable_prepare(tab0, tab1, tab2, tab3, tab0p, tab1p, tab2p, tab3p);
101+
vtable_64x8 table;
102+
vtable_prepare(table, scb.weights);
108103

109104
for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
110105
{
@@ -118,7 +113,7 @@ void unpack_weights(
118113
vint texel_weights(di.texel_weights_tr[j] + i);
119114
vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
120115

121-
summed_value += vtable_8bt_32bi(tab0p, tab1p, tab2p, tab3p, texel_weights) * texel_weights_int;
116+
summed_value += vtable_lookup_32bit(table, texel_weights) * texel_weights_int;
122117
}
123118

124119
store(lsr<4>(summed_value), weights_plane1 + i);
@@ -128,16 +123,12 @@ void unpack_weights(
128123
{
129124
// Build a 32-entry weight lookup table per plane
130125
// Plane 1
131-
vint4 tab0_plane1 = vint4::load(scb.weights + 0);
132-
vint4 tab1_plane1 = vint4::load(scb.weights + 16);
133-
vint tab0_plane1p, tab1_plane1p;
134-
vtable_prepare(tab0_plane1, tab1_plane1, tab0_plane1p, tab1_plane1p);
126+
vtable_32x8 tab_plane1;
127+
vtable_prepare(tab_plane1, scb.weights);
135128

136129
// Plane 2
137-
vint4 tab0_plane2 = vint4::load(scb.weights + 32);
138-
vint4 tab1_plane2 = vint4::load(scb.weights + 48);
139-
vint tab0_plane2p, tab1_plane2p;
140-
vtable_prepare(tab0_plane2, tab1_plane2, tab0_plane2p, tab1_plane2p);
130+
vtable_32x8 tab_plane2;
131+
vtable_prepare(tab_plane2, scb.weights + 32);
141132

142133
for (unsigned int i = 0; i < bsd.texel_count; i += ASTCENC_SIMD_WIDTH)
143134
{
@@ -153,8 +144,8 @@ void unpack_weights(
153144
vint texel_weights(di.texel_weights_tr[j] + i);
154145
vint texel_weights_int(di.texel_weight_contribs_int_tr[j] + i);
155146

156-
sum_plane1 += vtable_8bt_32bi(tab0_plane1p, tab1_plane1p, texel_weights) * texel_weights_int;
157-
sum_plane2 += vtable_8bt_32bi(tab0_plane2p, tab1_plane2p, texel_weights) * texel_weights_int;
147+
sum_plane1 += vtable_lookup_32bit(tab_plane1, texel_weights) * texel_weights_int;
148+
sum_plane2 += vtable_lookup_32bit(tab_plane2, texel_weights) * texel_weights_int;
158149
}
159150

160151
store(lsr<4>(sum_plane1), weights_plane1 + i);

Source/astcenc_ideal_endpoints_and_weights.cpp

Lines changed: 8 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1023,9 +1023,8 @@ void compute_quantized_weights_for_decimation(
10231023
// safe data in compute_ideal_weights_for_decimation and arrays are always 64 elements
10241024
if (get_quant_level(quant_level) <= 16)
10251025
{
1026-
vint4 tab0 = vint4::load(qat.quant_to_unquant);
1027-
vint tab0p;
1028-
vtable_prepare(tab0, tab0p);
1026+
vtable_16x8 table;
1027+
vtable_prepare(table, qat.quant_to_unquant);
10291028

10301029
for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
10311030
{
@@ -1038,8 +1037,8 @@ void compute_quantized_weights_for_decimation(
10381037
vint weightl = float_to_int(ix1);
10391038
vint weighth = min(weightl + vint(1), steps_m1);
10401039

1041-
vint ixli = vtable_8bt_32bi(tab0p, weightl);
1042-
vint ixhi = vtable_8bt_32bi(tab0p, weighth);
1040+
vint ixli = vtable_lookup_32bit(table, weightl);
1041+
vint ixhi = vtable_lookup_32bit(table, weighth);
10431042

10441043
vfloat ixl = int_to_float(ixli);
10451044
vfloat ixh = int_to_float(ixhi);
@@ -1055,10 +1054,8 @@ void compute_quantized_weights_for_decimation(
10551054
}
10561055
else
10571056
{
1058-
vint4 tab0 = vint4::load(qat.quant_to_unquant + 0);
1059-
vint4 tab1 = vint4::load(qat.quant_to_unquant + 16);
1060-
vint tab0p, tab1p;
1061-
vtable_prepare(tab0, tab1, tab0p, tab1p);
1057+
vtable_32x8 table;
1058+
vtable_prepare(table, qat.quant_to_unquant);
10621059

10631060
for (int i = 0; i < weight_count; i += ASTCENC_SIMD_WIDTH)
10641061
{
@@ -1071,8 +1068,8 @@ void compute_quantized_weights_for_decimation(
10711068
vint weightl = float_to_int(ix1);
10721069
vint weighth = min(weightl + vint(1), steps_m1);
10731070

1074-
vint ixli = vtable_8bt_32bi(tab0p, tab1p, weightl);
1075-
vint ixhi = vtable_8bt_32bi(tab0p, tab1p, weighth);
1071+
vint ixli = vtable_lookup_32bit(table, weightl);
1072+
vint ixhi = vtable_lookup_32bit(table, weighth);
10761073

10771074
vfloat ixl = int_to_float(ixli);
10781075
vfloat ixh = int_to_float(ixhi);

Source/astcenc_vecmathlib.h

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -96,6 +96,10 @@
9696
using vint = vint8;
9797
using vmask = vmask8;
9898

99+
using vtable_16x8 = vtable8_16x8;
100+
using vtable_32x8 = vtable8_32x8;
101+
using vtable_64x8 = vtable8_64x8;
102+
99103
constexpr auto loada = vfloat8::loada;
100104
constexpr auto load1 = vfloat8::load1;
101105

@@ -111,6 +115,10 @@
111115
using vint = vint4;
112116
using vmask = vmask4;
113117

118+
using vtable_16x8 = vtable4_16x8;
119+
using vtable_32x8 = vtable4_32x8;
120+
using vtable_64x8 = vtable4_64x8;
121+
114122
constexpr auto loada = vfloat4::loada;
115123
constexpr auto load1 = vfloat4::load1;
116124

@@ -138,6 +146,10 @@
138146
using vint = vint8;
139147
using vmask = vmask8;
140148

149+
using vtable_16x8 = vtable8_16x8;
150+
using vtable_32x8 = vtable8_32x8;
151+
using vtable_64x8 = vtable8_64x8;
152+
141153
constexpr auto loada = vfloat8::loada;
142154
constexpr auto load1 = vfloat8::load1;
143155

@@ -153,6 +165,10 @@
153165
using vint = vint4;
154166
using vmask = vmask4;
155167

168+
using vtable_16x8 = vtable4_16x8;
169+
using vtable_32x8 = vtable4_32x8;
170+
using vtable_64x8 = vtable4_64x8;
171+
156172
constexpr auto loada = vfloat4::loada;
157173
constexpr auto load1 = vfloat4::load1;
158174

@@ -185,6 +201,10 @@
185201
using vint = vint4;
186202
using vmask = vmask4;
187203

204+
using vtable_16x8 = vtable4_16x8;
205+
using vtable_32x8 = vtable4_32x8;
206+
using vtable_64x8 = vtable4_64x8;
207+
188208
constexpr auto loada = vfloat4::loada;
189209
constexpr auto load1 = vfloat4::load1;
190210
#endif

0 commit comments

Comments
 (0)