Skip to content

Commit 3d43eca

Browse files
committed
x86_64: Move WHT_WHT transforms to common lookup table
1 parent a0cf43f commit 3d43eca

File tree

1 file changed

+143
-63
lines changed

1 file changed

+143
-63
lines changed

src/asm/x86/transform/inverse.rs

Lines changed: 143 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -19,21 +19,6 @@ pub fn inverse_transform_add<T: Pixel>(
1919
input: &[T::Coeff], output: &mut PlaneRegionMut<'_, T>, eob: u16,
2020
tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
2121
) {
22-
if tx_type == TxType::WHT_WHT {
23-
debug_assert!(tx_size == TxSize::TX_4X4);
24-
match T::type_enum() {
25-
PixelType::U8 => {
26-
if let Some(func) = INV_TXFM_WHT_FN[cpu.as_index()] {
27-
return call_inverse_func(func, input, output, eob, 4, 4, bd);
28-
}
29-
}
30-
PixelType::U16 => {
31-
if let Some(func) = INV_TXFM_WHT_HBD_FN[cpu.as_index()] {
32-
return call_inverse_hbd_func(func, input, output, eob, 4, 4, bd);
33-
}
34-
}
35-
}
36-
}
3722
match T::type_enum() {
3823
PixelType::U8 => {
3924
if let Some(func) = INV_TXFM_FNS[cpu.as_index()][tx_size][tx_type] {
@@ -81,44 +66,6 @@ pub fn inverse_transform_add<T: Pixel>(
8166
rust::inverse_transform_add(input, output, eob, tx_size, tx_type, bd, cpu);
8267
}
8368

84-
extern {
85-
fn rav1e_inv_txfm_add_wht_wht_4x4_8bpc_avx2(
86-
dst: *mut u8, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
87-
);
88-
fn rav1e_inv_txfm_add_wht_wht_4x4_8bpc_sse2(
89-
dst: *mut u8, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
90-
);
91-
fn rav1e_inv_txfm_add_wht_wht_4x4_16bpc_avx2(
92-
dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
93-
bitdepth_max: i32,
94-
);
95-
fn rav1e_inv_txfm_add_wht_wht_4x4_16bpc_sse2(
96-
dst: *mut u16, dst_stride: libc::ptrdiff_t, coeff: *mut i16, eob: i32,
97-
bitdepth_max: i32,
98-
);
99-
}
100-
101-
const INV_TXFM_WHT_FN_AVX2: Option<InvTxfmFunc> =
102-
Some(rav1e_inv_txfm_add_wht_wht_4x4_8bpc_avx2 as _);
103-
const INV_TXFM_WHT_FN_SSE2: Option<InvTxfmFunc> =
104-
Some(rav1e_inv_txfm_add_wht_wht_4x4_8bpc_sse2 as _);
105-
const INV_TXFM_WHT_HBD_FN_AVX2: Option<InvTxfmHBDFunc> =
106-
Some(rav1e_inv_txfm_add_wht_wht_4x4_16bpc_avx2 as _);
107-
const INV_TXFM_WHT_HBD_FN_SSE2: Option<InvTxfmHBDFunc> =
108-
Some(rav1e_inv_txfm_add_wht_wht_4x4_16bpc_sse2 as _);
109-
110-
cpu_function_lookup_table!(
111-
INV_TXFM_WHT_FN: [Option<InvTxfmFunc>],
112-
default: None,
113-
[SSE2, AVX2]
114-
);
115-
116-
cpu_function_lookup_table!(
117-
INV_TXFM_WHT_HBD_FN: [Option<InvTxfmHBDFunc>],
118-
default: None,
119-
[SSE2, AVX2]
120-
);
121-
12269
macro_rules! decl_itx_fns {
12370
// Takes a 2d list of tx types for W and H
12471
([$([$(($ENUM:expr, $TYPE1:ident, $TYPE2:ident)),*]),*], $W:expr, $H:expr,
@@ -249,7 +196,7 @@ macro_rules! impl_itx_fns {
249196
};
250197

251198
($TYPES64:tt, $DIMS64:tt, $TYPES32:tt, $DIMS32:tt, $TYPES16:tt, $DIMS16:tt,
252-
$TYPES84:tt, $DIMS84:tt, $OPT:tt) => {
199+
$TYPES84:tt, $DIMS84:tt, $TYPES4:tt, $DIMS4:tt, $OPT:tt) => {
253200
// Make 2d list of tx types for each set of dimensions. Each set of
254201
// dimensions uses a superset of the previous set of tx types.
255202
impl_itx_fns!([$TYPES64], $DIMS64, $OPT);
@@ -258,15 +205,53 @@ macro_rules! impl_itx_fns {
258205
impl_itx_fns!(
259206
[$TYPES64, $TYPES32, $TYPES16, $TYPES84], $DIMS84, $OPT
260207
);
208+
impl_itx_fns!(
209+
[$TYPES64, $TYPES32, $TYPES16, $TYPES84, $TYPES4], $DIMS4, $OPT
210+
);
261211

262212
// Pool all of the dimensions together to create a table for each cpu
263213
// feature level.
264214
create_wxh_tables!(
265-
[$DIMS64, $DIMS32, $DIMS16, $DIMS84], $OPT
215+
[$DIMS64, $DIMS32, $DIMS16, $DIMS84, $DIMS4], $OPT
266216
);
267217
};
268218
}
269219

220+
impl_itx_fns!(
221+
// 64x
222+
[(TxType::DCT_DCT, dct, dct)],
223+
[(64, 64), (64, 32), (32, 64), (16, 64), (64, 16)],
224+
// 32x
225+
[(TxType::IDTX, identity, identity)],
226+
[(32, 32), (32, 16), (16, 32), (32, 8), (8, 32)],
227+
// 16x16
228+
[
229+
(TxType::DCT_ADST, dct, adst),
230+
(TxType::ADST_DCT, adst, dct),
231+
(TxType::DCT_FLIPADST, dct, flipadst),
232+
(TxType::FLIPADST_DCT, flipadst, dct),
233+
(TxType::V_DCT, dct, identity),
234+
(TxType::H_DCT, identity, dct),
235+
(TxType::ADST_ADST, adst, adst),
236+
(TxType::ADST_FLIPADST, adst, flipadst),
237+
(TxType::FLIPADST_ADST, flipadst, adst),
238+
(TxType::FLIPADST_FLIPADST, flipadst, flipadst)
239+
],
240+
[(16, 16)],
241+
// 8x, 4x and 16x (minus 16x16 and 4x4)
242+
[
243+
(TxType::V_ADST, adst, identity),
244+
(TxType::H_ADST, identity, adst),
245+
(TxType::V_FLIPADST, flipadst, identity),
246+
(TxType::H_FLIPADST, identity, flipadst)
247+
],
248+
[(16, 8), (8, 16), (16, 4), (4, 16), (8, 8), (8, 4), (4, 8)],
249+
// 4x4
250+
[(TxType::WHT_WHT, wht, wht)],
251+
[(4, 4)],
252+
[(avx2, AVX2)]
253+
);
254+
270255
impl_itx_fns!(
271256
// 64x
272257
[(TxType::DCT_DCT, dct, dct)],
@@ -296,13 +281,35 @@ impl_itx_fns!(
296281
(TxType::H_FLIPADST, identity, flipadst)
297282
],
298283
[(16, 8), (8, 16), (16, 4), (4, 16), (8, 8), (8, 4), (4, 8), (4, 4)],
299-
[(avx512icl, AVX512ICL), (avx2, AVX2), (ssse3, SSSE3)]
284+
// 4x4
285+
[],
286+
[],
287+
[(avx512icl, AVX512ICL), (ssse3, SSSE3)]
288+
);
289+
290+
impl_itx_fns!(
291+
// 64x
292+
[],
293+
[],
294+
// 32x
295+
[],
296+
[],
297+
// 16x16
298+
[],
299+
[],
300+
// 8x, 4x and 16x (minus 16x16 and 4x4)
301+
[],
302+
[],
303+
// 4x4
304+
[(TxType::WHT_WHT, wht, wht)],
305+
[(4, 4)],
306+
[(sse2, SSE2)]
300307
);
301308

302309
cpu_function_lookup_table!(
303310
INV_TXFM_FNS: [[[Option<InvTxfmFunc>; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL]],
304311
default: [[None; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL],
305-
[SSSE3, AVX2, AVX512ICL]
312+
[SSE2, SSSE3, AVX2, AVX512ICL]
306313
);
307314

308315
macro_rules! impl_itx_hbd_fns {
@@ -321,7 +328,7 @@ macro_rules! impl_itx_hbd_fns {
321328
};
322329

323330
($TYPES64:tt, $DIMS64:tt, $TYPES32:tt, $DIMS32:tt, $TYPES16:tt, $DIMS16:tt,
324-
$TYPES84:tt, $DIMS84:tt, $EXT:ident, $OPT:tt) => {
331+
$TYPES84:tt, $DIMS84:tt, $TYPES4:tt, $DIMS4:tt, $EXT:ident, $OPT:tt) => {
325332
// Make 2d list of tx types for each set of dimensions. Each set of
326333
// dimensions uses a superset of the previous set of tx types.
327334
impl_itx_hbd_fns!([$TYPES64], $DIMS64, $OPT);
@@ -330,11 +337,14 @@ macro_rules! impl_itx_hbd_fns {
330337
impl_itx_hbd_fns!(
331338
[$TYPES64, $TYPES32, $TYPES16, $TYPES84], $DIMS84, $OPT
332339
);
340+
impl_itx_hbd_fns!(
341+
[$TYPES64, $TYPES32, $TYPES16, $TYPES84, $TYPES4], $DIMS4, $OPT
342+
);
333343

334344
// Pool all of the dimensions together to create a table for each cpu
335345
// feature level.
336346
create_wxh_hbd_tables!(
337-
[$DIMS64, $DIMS32, $DIMS16, $DIMS84], $EXT, $OPT
347+
[$DIMS64, $DIMS32, $DIMS16, $DIMS84, $DIMS4], $EXT, $OPT
338348
);
339349
};
340350
}
@@ -368,6 +378,9 @@ impl_itx_hbd_fns!(
368378
(TxType::H_FLIPADST, identity, flipadst)
369379
],
370380
[(16, 8), (8, 16), (8, 8)],
381+
// 4x4
382+
[],
383+
[],
371384
_10,
372385
[(10, avx512icl, AVX512ICL)]
373386
);
@@ -401,7 +414,10 @@ impl_itx_hbd_fns!(
401414
(TxType::H_FLIPADST, identity, flipadst)
402415
],
403416
[(16, 8), (8, 16), (16, 4), (4, 16), (8, 8), (8, 4), (4, 8), (4, 4)],
404-
_10,
417+
// 4x4
418+
[],
419+
[],
420+
_10_,
405421
[(10, avx2, AVX2)]
406422
);
407423

@@ -434,14 +450,71 @@ impl_itx_hbd_fns!(
434450
(TxType::H_FLIPADST, identity, flipadst)
435451
],
436452
[(16, 8), (8, 16), (16, 4), (4, 16), (8, 8), (8, 4), (4, 8), (4, 4)],
453+
// 4x4
454+
[],
455+
[],
437456
_10,
438457
[(16, sse4, SSE4_1)]
439458
);
440459

460+
impl_itx_hbd_fns!(
461+
// 64x
462+
[],
463+
[],
464+
// 32x
465+
[],
466+
[],
467+
// 16x16
468+
[],
469+
[],
470+
// 8x, 4x and 16x (minus 16x16 and 4x4)
471+
[],
472+
[],
473+
// 4x4
474+
[(TxType::WHT_WHT, wht, wht)],
475+
[(4, 4)],
476+
_16,
477+
[(16, sse2, SSE2), (16, avx2, AVX2)]
478+
);
479+
480+
// The 10-bit and 12-bit SSE2 tables are plain copies of the `_16` SSE2 table:
// both constants below are initialised directly from `INV_TXFM_HBD_FNS_16_SSE2`.
// NOTE(review): that table is presumably the one produced by the
// `impl_itx_hbd_fns!` invocation with the `_16` suffix and `(16, sse2, SSE2)`,
// whose only entry is the 4x4 WHT_WHT transform - confirm against the macro.
const INV_TXFM_HBD_FNS_10_SSE2: [[Option<InvTxfmHBDFunc>; TX_TYPES_PLUS_LL];
481+
TxSize::TX_SIZES_ALL] = INV_TXFM_HBD_FNS_16_SSE2;
482+
const INV_TXFM_HBD_FNS_12_SSE2: [[Option<InvTxfmHBDFunc>; TX_TYPES_PLUS_LL];
483+
TxSize::TX_SIZES_ALL] = INV_TXFM_HBD_FNS_16_SSE2;
484+
485+
/// Combines two high-bit-depth inverse-transform function tables into one.
///
/// The result starts as a copy of `b`; every `[tx_size][tx_type]` slot in
/// which `a` holds `Some(..)` is then overwritten with `a`'s entry. In other
/// words `a` takes precedence, and `b` only supplies the slots that `a`
/// leaves as `None`.
///
/// This is a `const fn` so that the merged table can be used to initialise a
/// `const` item. The iteration uses `loop`/`break` with manual counters
/// rather than `for` (presumably because `for` loops are not usable inside a
/// `const fn`).
///
/// Both loops test their bound *after* running the body, so the table
/// dimensions `TxSize::TX_SIZES_ALL` and `TX_TYPES_PLUS_LL` are assumed to be
/// non-zero.
const fn merge_hbd_fns(
486+
a: [[Option<InvTxfmHBDFunc>; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL],
487+
b: [[Option<InvTxfmHBDFunc>; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL],
488+
) -> [[Option<InvTxfmHBDFunc>; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL] {
489+
// Seed the output with the lower-priority table.
let mut out = b;
490+
let mut tx_size = 0;
491+
// Outer loop: one pass per transform-size row.
loop {
492+
let mut tx_type = 0;
493+
// Inner loop: one pass per transform-type column of the current row.
loop {
494+
// Only populated entries of `a` override what came from `b`.
if a[tx_size][tx_type].is_some() {
495+
out[tx_size][tx_type] = a[tx_size][tx_type];
496+
}
497+
tx_type += 1;
498+
if tx_type == TX_TYPES_PLUS_LL {
499+
break;
500+
}
501+
}
502+
tx_size += 1;
503+
if tx_size == TxSize::TX_SIZES_ALL {
504+
break;
505+
}
506+
}
507+
out
508+
}
509+
510+
// Final 10-bit AVX2 table: entries of `INV_TXFM_HBD_FNS_10__AVX2` (first
// argument) take precedence, and `INV_TXFM_HBD_FNS_16_AVX2` fills every slot
// the first table leaves as `None`.
// NOTE(review): the `_16` table presumably contributes the 4x4 WHT_WHT entry
// declared with `(16, avx2, AVX2)` - confirm against the macro expansion.
const INV_TXFM_HBD_FNS_10_AVX2: [[Option<InvTxfmHBDFunc>; TX_TYPES_PLUS_LL];
511+
TxSize::TX_SIZES_ALL] =
512+
merge_hbd_fns(INV_TXFM_HBD_FNS_10__AVX2, INV_TXFM_HBD_FNS_16_AVX2);
513+
441514
cpu_function_lookup_table!(
442515
INV_TXFM_HBD_FNS_10: [[[Option<InvTxfmHBDFunc>; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL]],
443516
default: [[None; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL],
444-
[SSE4_1, AVX2, AVX512ICL]
517+
[SSE2, SSE4_1, AVX2, AVX512ICL]
445518
);
446519

447520
impl_itx_hbd_fns!(
@@ -472,12 +545,19 @@ impl_itx_hbd_fns!(
472545
(TxType::H_FLIPADST, identity, flipadst)
473546
],
474547
[(16, 8), (8, 16), (16, 4), (4, 16), (8, 8), (8, 4), (4, 8), (4, 4)],
475-
_12,
548+
// 4x4
549+
[],
550+
[],
551+
_12_,
476552
[(12, avx2, AVX2)]
477553
);
478554

555+
// Final 12-bit AVX2 table: entries of `INV_TXFM_HBD_FNS_12__AVX2` (first
// argument) take precedence, and `INV_TXFM_HBD_FNS_16_AVX2` fills every slot
// the first table leaves as `None`.
// NOTE(review): the `_16` table presumably contributes the 4x4 WHT_WHT entry
// declared with `(16, avx2, AVX2)` - confirm against the macro expansion.
const INV_TXFM_HBD_FNS_12_AVX2: [[Option<InvTxfmHBDFunc>; TX_TYPES_PLUS_LL];
556+
TxSize::TX_SIZES_ALL] =
557+
merge_hbd_fns(INV_TXFM_HBD_FNS_12__AVX2, INV_TXFM_HBD_FNS_16_AVX2);
558+
479559
cpu_function_lookup_table!(
480560
INV_TXFM_HBD_FNS_12: [[[Option<InvTxfmHBDFunc>; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL]],
481561
default: [[None; TX_TYPES_PLUS_LL]; TxSize::TX_SIZES_ALL],
482-
[AVX2]
562+
[SSE2, AVX2]
483563
);

0 commit comments

Comments
 (0)