From 8ceb4e9c8bb6f5ad702279c25ec58c876f6223aa Mon Sep 17 00:00:00 2001 From: Kornel Date: Sat, 30 Sep 2023 10:49:51 +0100 Subject: [PATCH 1/5] Don't use out params --- src/context/block_unit.rs | 3 +-- src/context/mod.rs | 11 ++++++----- src/context/transform_unit.rs | 8 +++++--- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/src/context/block_unit.rs b/src/context/block_unit.rs index 1d53b0db33..d426132a1d 100644 --- a/src/context/block_unit.rs +++ b/src/context/block_unit.rs @@ -1861,8 +1861,7 @@ impl<'a> ContextWriter<'a> { &mut self, eob: usize, tx_size: TxSize, tx_class: TxClass, txs_ctx: usize, plane_type: usize, w: &mut W, ) { - let mut eob_extra: u32 = 0; - let eob_pt = Self::get_eob_pos_token(eob, &mut eob_extra); + let (eob_pt, eob_extra) = Self::get_eob_pos_token(eob); let eob_multi_size: usize = tx_size.area_log2() - 4; let eob_multi_ctx: usize = usize::from(tx_class != TX_CLASS_2D); diff --git a/src/context/mod.rs b/src/context/mod.rs index 0ccf62fd59..1cfc92f59d 100644 --- a/src/context/mod.rs +++ b/src/context/mod.rs @@ -164,16 +164,18 @@ pub const fn mv_class_base(mv_class: usize) -> u32 { pub fn log_in_base_2(n: u32) -> u8 { 31 - cmp::min(31, n.leading_zeros() as u8) } + +/// Returns `(mv_class, offset)` #[inline(always)] -pub fn get_mv_class(z: u32, offset: &mut u32) -> usize { +pub fn get_mv_class(z: u32) -> (usize, u32) { let c = if z >= CLASS0_SIZE as u32 * 4096 { MV_CLASS_10 } else { log_in_base_2(z >> 3) as usize }; - *offset = z - mv_class_base(c); - c + let offset = z - mv_class_base(c); + (c, offset) } impl<'a> ContextWriter<'a> { @@ -186,10 +188,9 @@ impl<'a> ContextWriter<'a> { ) { assert!(comp != 0); assert!((MV_LOW..=MV_UPP).contains(&comp)); - let mut offset: u32 = 0; let sign: u32 = u32::from(comp < 0); let mag: u32 = if sign == 1 { -comp as u32 } else { comp as u32 }; - let mv_class = get_mv_class(mag - 1, &mut offset); + let (mv_class, offset) = get_mv_class(mag - 1); let d = offset >> 3; // int mv data let fr = (offset >> 1) & 3; // fractional mv data let hp = offset & 1; // high precision mv data diff --git a/src/context/transform_unit.rs b/src/context/transform_unit.rs index 8b8c2e97d6..5baf660657 100644 --- a/src/context/transform_unit.rs +++ b/src/context/transform_unit.rs @@ -798,11 +798,13 @@ impl<'a> ContextWriter<'a> { av1_get_coded_tx_size(tx_size).height_log2() } + /// Returns `(eob_pt, eob_extra)` + /// /// # Panics /// /// - If `eob` is prior to the start of the group #[inline] - pub fn get_eob_pos_token(eob: usize, extra: &mut u32) -> u32 { + pub fn get_eob_pos_token(eob: usize) -> (u32, u32) { let t = if eob < 33 { eob_to_pos_small[eob] as u32 } else { @@ -810,9 +812,9 @@ impl<'a> ContextWriter<'a> { eob_to_pos_large[e] as u32 }; assert!(eob as i32 >= k_eob_group_start[t as usize] as i32); - *extra = eob as u32 - k_eob_group_start[t as usize] as u32; + let extra = eob as u32 - k_eob_group_start[t as usize] as u32; - t + (t, extra) } pub fn get_nz_mag(levels: &[u8], bhl: usize, tx_class: TxClass) -> usize { From 9f5657da48264f62f321b566a0e600aa486b4584 Mon Sep 17 00:00:00 2001 From: Kornel Date: Sat, 30 Sep 2023 21:17:47 +0100 Subject: [PATCH 2/5] Reduce use of uninitialized arrays --- benches/mc.rs | 16 ++++++++-------- src/asm/shared/predict.rs | 16 ++++------------ src/util/align.rs | 10 ++++++++++ 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/benches/mc.rs b/benches/mc.rs index 4d3f81add7..88072d2abb 100644 --- a/benches/mc.rs +++ b/benches/mc.rs @@ -263,7 +263,7 @@ fn bench_prep_8tap_top_left_lbd(c: &mut Criterion) { let w = 640; let h = 480; let input_plane = new_plane::(&mut ra, w, h); - let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() }; + let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0); let (row_frac, col_frac, src) = get_params( &input_plane, @@ -294,7 +294,7 @@ fn bench_prep_8tap_top_lbd(c: &mut Criterion) { let w = 640; let h = 480; let input_plane = new_plane::(&mut ra, w, h); - let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() }; + let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0); let (row_frac, col_frac, src) = get_params( &input_plane, @@ -325,7 +325,7 @@ fn bench_prep_8tap_left_lbd(c: &mut Criterion) { let w = 640; let h = 480; let input_plane = new_plane::(&mut ra, w, h); - let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() }; + let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0); let (row_frac, col_frac, src) = get_params( &input_plane, @@ -356,7 +356,7 @@ fn bench_prep_8tap_center_lbd(c: &mut Criterion) { let w = 640; let h = 480; let input_plane = new_plane::(&mut ra, w, h); - let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() }; + let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0); let (row_frac, col_frac, src) = get_params( &input_plane, @@ -387,7 +387,7 @@ fn bench_prep_8tap_top_left_hbd(c: &mut Criterion) { let w = 640; let h = 480; let input_plane = new_plane::(&mut ra, w, h); - let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() }; + let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0); let (row_frac, col_frac, src) = get_params( &input_plane, @@ -418,7 +418,7 @@ fn bench_prep_8tap_top_hbd(c: &mut Criterion) { let w = 640; let h = 480; let input_plane = new_plane::(&mut ra, w, h); - let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() }; + let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0); let (row_frac, col_frac, src) = get_params( &input_plane, @@ -449,7 +449,7 @@ fn bench_prep_8tap_left_hbd(c: &mut Criterion) { let w = 640; let h = 480; let input_plane = new_plane::(&mut ra, w, h); - let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() }; + let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0); let (row_frac, col_frac, src) = get_params( &input_plane, @@ -480,7 +480,7 @@ fn bench_prep_8tap_center_hbd(c: &mut Criterion) { let w = 640; let h = 480; let input_plane = new_plane::(&mut ra, w, h); - let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() }; + let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0); let (row_frac, col_frac, src) = get_params( &input_plane, diff --git a/src/asm/shared/predict.rs b/src/asm/shared/predict.rs index 6d098bae5b..7b73157116 100644 --- a/src/asm/shared/predict.rs +++ b/src/asm/shared/predict.rs @@ -39,18 +39,10 @@ mod test { fn pred_matches_inner(cpu: CpuFeatureLevel, bit_depth: usize) { let tx_size = TxSize::TX_4X4; - // SAFETY: We write to the array below before reading from it. - let mut ac: Aligned<[i16; 32 * 32]> = unsafe { Aligned::uninitialized() }; - for i in 0..ac.data.len() { - ac.data[i] = i as i16 - 16 * 32; - } - // SAFETY: We write to the array below before reading from it. - let mut edge_buf: Aligned<[T; 4 * MAX_TX_SIZE + 1]> = - unsafe { Aligned::uninitialized() }; - for i in 0..edge_buf.data.len() { - edge_buf.data[i] = - T::cast_from(((i ^ 1) + 32).saturating_sub(2 * MAX_TX_SIZE)); - } + let ac: Aligned<[i16; 32 * 32]> = Aligned::from_fn(|i| i as i16 - 16 * 32); + let edge_buf: Aligned<[T; 4 * MAX_TX_SIZE + 1]> = Aligned::from_fn(|i| { + T::cast_from(((i ^ 1) + 32).saturating_sub(2 * MAX_TX_SIZE)) + }); let ief_params_all = [ None, diff --git a/src/util/align.rs b/src/util/align.rs index c86424e8b2..84f47d1bd9 100644 --- a/src/util/align.rs +++ b/src/util/align.rs @@ -29,6 +29,16 @@ pub struct Aligned { pub data: T, } +impl Aligned<[T; N]> { + #[inline(always)] + pub fn from_fn(cb: F) -> Self + where + F: FnMut(usize) -> T, + { + Aligned { _alignment: [], data: std::array::from_fn(cb) } + } +} + impl Aligned { pub const fn new(data: T) -> Self { Aligned { _alignment: [], data } From ce8281f36b048b6c85f56db78822ed6e98c13c37 Mon Sep 17 00:00:00 2001 From: Kornel Date: Mon, 2 Oct 2023 12:15:24 +0100 Subject: [PATCH 3/5] Eliminate panic path from diff() --- src/encoder.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/encoder.rs b/src/encoder.rs index a3ee7e32e8..993fcdcd4e 100644 --- a/src/encoder.rs +++ b/src/encoder.rs @@ -1358,11 +1358,13 @@ fn diff( dst: &mut [i16], src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, ) { debug_assert!(dst.len() % src1.rect().width == 0); + let width = src1.rect().width; + if width == 0 || width != src2.rect().width { + return; + } - for ((l, s1), s2) in dst - .chunks_exact_mut(src1.rect().width) - .zip(src1.rows_iter()) - .zip(src2.rows_iter()) + for ((l, s1), s2) in + dst.chunks_exact_mut(width).zip(src1.rows_iter()).zip(src2.rows_iter()) { for ((r, v1), v2) in l.iter_mut().zip(s1).zip(s2) { *r = i16::cast_from(*v1) - i16::cast_from(*v2); From 13e98f2ac3675fc2d7094f3e18e8aecdaf35d9cc Mon Sep 17 00:00:00 2001 From: Kornel Date: Mon, 2 Oct 2023 12:30:30 +0100 Subject: [PATCH 4/5] Eliminate extra length checks from diff() loop --- src/encoder.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/encoder.rs b/src/encoder.rs index 993fcdcd4e..ef63894320 100644 --- a/src/encoder.rs +++ b/src/encoder.rs @@ -1359,7 +1359,14 @@ fn diff( ) { debug_assert!(dst.len() % src1.rect().width == 0); let width = src1.rect().width; - if width == 0 || width != src2.rect().width { + let height = src1.rect().height; + + if width == 0 + || width != src2.rect().width + || height == 0 + || src1.rows_iter().len() != src2.rows_iter().len() + { + debug_assert!(false); return; } From 36da899c14c217dc1c60658b2a3bc7bf88cf7505 Mon Sep 17 00:00:00 2001 From: Kornel Date: Tue, 3 Oct 2023 18:24:46 +0100 Subject: [PATCH 5/5] Prefer MaybeUninit for write-only slices --- benches/transform.rs | 3 ++- src/asm/shared/transform/inverse.rs | 21 +++++++++++-------- src/asm/x86/quantize.rs | 20 ++++++++++-------- src/asm/x86/transform/forward.rs | 17 +++++++++------ src/encoder.rs | 25 +++++++++++++--------- src/quantize/mod.rs | 10 ++++++--- src/transform/forward.rs | 32 +++++++++++++++++------------ src/transform/mod.rs | 5 ++++- src/util/align.rs | 7 +++++++ 9 files changed, 90 insertions(+), 50 deletions(-) diff --git a/benches/transform.rs b/benches/transform.rs index 849ed9d152..c301b503f4 100644 --- a/benches/transform.rs +++ b/benches/transform.rs @@ -15,6 +15,7 @@ use rav1e::bench::transform; use rav1e::bench::transform::{ forward_transform, get_valid_txfm_types, TxSize, }; +use std::mem::MaybeUninit; fn init_buffers(size: usize) -> (Vec, Vec) { let mut ra = ChaChaRng::from_seed([0; 32]); @@ -96,7 +97,7 @@ pub fn bench_forward_transforms(c: &mut Criterion) { let input: Vec = (0..area).map(|_| rng.gen_range(-255..256)).collect(); - let mut output = vec![0i16; area]; + let mut output = vec![MaybeUninit::new(0i16); area]; for &tx_type in get_valid_txfm_types(tx_size) { group.bench_function( diff --git a/src/asm/shared/transform/inverse.rs b/src/asm/shared/transform/inverse.rs index b7ad108c64..fc11b4341a 100644 --- a/src/asm/shared/transform/inverse.rs +++ b/src/asm/shared/transform/inverse.rs @@ -9,6 +9,7 @@ use crate::tiling::PlaneRegionMut; use crate::util::*; +use std::mem::MaybeUninit; // Note: Input coeffs are mutable since the assembly uses them as a scratchpad pub type InvTxfmFunc = @@ -27,13 +28,13 @@ pub fn call_inverse_func( let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)]; // SAFETY: We write to the array below before reading from it. - let mut copied: Aligned<[T::Coeff; 32 * 32]> = + let mut copied: Aligned<[MaybeUninit; 32 * 32]> = unsafe { Aligned::uninitialized() }; // Convert input to 16-bits. // TODO: Remove by changing inverse assembly to not overwrite its input for (a, b) in copied.data.iter_mut().zip(input) { - *a = *b; + a.write(*b); } // perform the inverse transform @@ -57,13 +58,13 @@ pub fn call_inverse_hbd_func( let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)]; // SAFETY: We write to the array below before reading from it. - let mut copied: Aligned<[T::Coeff; 32 * 32]> = + let mut copied: Aligned<[MaybeUninit; 32 * 32]> = unsafe { Aligned::uninitialized() }; // Convert input to 16-bits. // TODO: Remove by changing inverse assembly to not overwrite its input for (a, b) in copied.data.iter_mut().zip(input) { - *a = *b; + a.write(*b); } // perform the inverse transform @@ -88,6 +89,7 @@ pub mod test { use crate::transform::TxSize::*; use crate::transform::*; use rand::{random, thread_rng, Rng}; + use std::mem::MaybeUninit; pub fn pick_eob( coeffs: &mut [T], tx_size: TxSize, tx_type: TxType, sub_h: usize, @@ -146,12 +148,11 @@ pub mod test { let mut src_storage = [T::zero(); 64 * 64]; let src = &mut src_storage[..tx_size.area()]; let mut dst = Plane::from_slice(&[T::zero(); 64 * 64], 64); - // SAFETY: We write to the array below before reading from it. - let mut res_storage: Aligned<[i16; 64 * 64]> = + let mut res_storage: Aligned<[MaybeUninit; 64 * 64]> = unsafe { Aligned::uninitialized() }; let res = &mut res_storage.data[..tx_size.area()]; // SAFETY: We write to the array below before reading from it. - let mut freq_storage: Aligned<[T::Coeff; 64 * 64]> = + let mut freq_storage: Aligned<[MaybeUninit; 64 * 64]> = unsafe { Aligned::uninitialized() }; let freq = &mut freq_storage.data[..tx_size.area()]; for ((r, s), d) in @@ -159,8 +160,10 @@ pub mod test { { *s = T::cast_from(random::() >> (16 - bit_depth)); *d = T::cast_from(random::() >> (16 - bit_depth)); - *r = i16::cast_from(*s) - i16::cast_from(*d); + r.write(i16::cast_from(*s) - i16::cast_from(*d)); } + // SAFETY: The loop just initialized res, and all three slices have the same length + let res = unsafe { slice_assume_init_mut(res) }; forward_transform( res, freq, @@ -170,6 +173,8 @@ pub mod test { bit_depth, CpuFeatureLevel::RUST, ); + // SAFETY: forward_transform initialized freq + let freq = unsafe { slice_assume_init_mut(freq) }; let eob: usize = pick_eob(freq, tx_size, tx_type, sub_h); let mut rust_dst = dst.clone(); diff --git a/src/asm/x86/quantize.rs b/src/asm/x86/quantize.rs index 28dbabedc7..b61f5ad392 100644 --- a/src/asm/x86/quantize.rs +++ b/src/asm/x86/quantize.rs @@ -17,6 +17,7 @@ use crate::cpu_features::CpuFeatureLevel; use crate::quantize::*; use crate::transform::TxSize; use crate::util::*; +use std::mem::MaybeUninit; type DequantizeFn = unsafe fn( qindex: u8, @@ -37,10 +38,11 @@ cpu_function_lookup_table!( #[inline(always)] pub fn dequantize( - qindex: u8, coeffs: &[T], eob: usize, rcoeffs: &mut [T], tx_size: TxSize, - bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, cpu: CpuFeatureLevel, + qindex: u8, coeffs: &[T], eob: usize, rcoeffs: &mut [MaybeUninit], + tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, + cpu: CpuFeatureLevel, ) { - let call_rust = |rcoeffs: &mut [T]| { + let call_rust = |rcoeffs: &mut [MaybeUninit]| { crate::quantize::rust::dequantize( qindex, coeffs, eob, rcoeffs, tx_size, bit_depth, dc_delta_q, ac_delta_q, cpu, @@ -48,10 +50,9 @@ pub fn dequantize( }; #[cfg(any(feature = "check_asm", test))] - let ref_rcoeffs = { + let mut ref_rcoeffs = { let area = av1_get_coded_tx_size(tx_size).area(); - let mut copy = vec![T::cast_from(0); area]; - copy[..].copy_from_slice(&rcoeffs[..area]); + let mut copy = vec![MaybeUninit::new(T::cast_from(0)); area]; call_rust(&mut copy); copy }; @@ -82,7 +83,9 @@ pub fn dequantize( #[cfg(any(feature = "check_asm", test))] { let area = av1_get_coded_tx_size(tx_size).area(); - assert_eq!(&rcoeffs[..area], &ref_rcoeffs[..]); + let rcoeffs = unsafe { assume_slice_init_mut(&mut rcoeffs[..area]) }; + let ref_rcoeffs = unsafe { assume_slice_init_mut(&mut ref_rcoeffs[..]) }; + assert_eq!(rcoeffs, ref_rcoeffs); } } @@ -157,6 +160,7 @@ mod test { use super::*; use rand::distributions::{Distribution, Uniform}; use rand::{thread_rng, Rng}; + use std::mem::MaybeUninit; #[test] fn dequantize_test() { @@ -190,7 +194,7 @@ mod test { for &eob in &eobs { let mut qcoeffs = Aligned::new([0i16; 32 * 32]); - let mut rcoeffs = Aligned::new([0i16; 32 * 32]); + let mut rcoeffs = Aligned::new([MaybeUninit::new(0i16); 32 * 32]); // Generate quantized coefficients up to the eob let between = Uniform::from(-i16::MAX..=i16::MAX); diff --git a/src/asm/x86/transform/forward.rs b/src/asm/x86/transform/forward.rs index 18b1171517..6946f2650c 100644 --- a/src/asm/x86/transform/forward.rs +++ b/src/asm/x86/transform/forward.rs @@ -333,8 +333,8 @@ fn cast_mut(x: &mut [T]) -> &mut [T; N] { #[allow(clippy::identity_op, clippy::erasing_op)] #[target_feature(enable = "avx2")] unsafe fn forward_transform_avx2( - input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize, - tx_type: TxType, bd: usize, + input: &[i16], output: &mut [MaybeUninit], stride: usize, + tx_size: TxSize, tx_type: TxType, bd: usize, ) { // Note when assigning txfm_size_col, we use the txfm_size from the // row configuration and vice versa. This is intentionally done to @@ -508,8 +508,8 @@ unsafe fn forward_transform_avx2( /// /// - If called with an invalid combination of `tx_size` and `tx_type` pub fn forward_transform( - input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize, - tx_type: TxType, bd: usize, cpu: CpuFeatureLevel, + input: &[i16], output: &mut [MaybeUninit], stride: usize, + tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel, ) { assert!(valid_av1_transform(tx_size, tx_type)); if cpu >= CpuFeatureLevel::AVX2 { @@ -526,7 +526,9 @@ pub fn forward_transform( mod test { use crate::cpu_features::*; use crate::transform::{forward_transform, get_valid_txfm_types, TxSize}; + use crate::util::assume_slice_init_mut; use rand::Rng; + use std::mem::MaybeUninit; // Ensure that the simd results match the rust code #[test] @@ -558,8 +560,8 @@ mod test { (0..area).map(|_| rng.gen_range(-255..256)).collect(); for &tx_type in get_valid_txfm_types(tx_size) { - let mut output_ref = vec![0i16; area]; - let mut output_simd = vec![0i16; area]; + let mut output_ref = vec![MaybeUninit::new(0i16); area]; + let mut output_simd = vec![MaybeUninit::new(0i16); area]; println!("Testing combination {:?}, {:?}", tx_size, tx_type); forward_transform( @@ -571,6 +573,7 @@ mod test { 8, CpuFeatureLevel::RUST, ); + let output_ref = unsafe { assume_slice_init_mut(&mut output_ref[..]) }; forward_transform( &input[..], &mut output_simd[..], @@ -580,6 +583,8 @@ mod test { 8, cpu, ); + let output_simd = + unsafe { assume_slice_init_mut(&mut output_simd[..]) }; assert_eq!(output_ref, output_simd) } } diff --git a/src/encoder.rs b/src/encoder.rs index ef63894320..9f4cb54187 100644 --- a/src/encoder.rs +++ b/src/encoder.rs @@ -1355,9 +1355,12 @@ fn write_key_frame_obus( /// Write into `dst` the difference between the blocks at `src1` and `src2` fn diff( - dst: &mut [i16], src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>, + dst: &mut [MaybeUninit], src1: &PlaneRegion<'_, T>, + src2: &PlaneRegion<'_, T>, ) { debug_assert!(dst.len() % src1.rect().width == 0); + debug_assert_eq!(src1.rows_iter().count(), src1.rect().height); + let width = src1.rect().width; let height = src1.rect().height; @@ -1374,7 +1377,7 @@ fn diff( dst.chunks_exact_mut(width).zip(src1.rows_iter()).zip(src2.rows_iter()) { for ((r, v1), v2) in l.iter_mut().zip(s1).zip(s2) { - *r = i16::cast_from(*v1) - i16::cast_from(*v2); + r.write(i16::cast_from(*v1) - i16::cast_from(*v2)); } } } @@ -1504,17 +1507,13 @@ pub fn encode_tx_block( } let coded_tx_area = av1_get_coded_tx_size(tx_size).area(); - // SAFETY: We write to the array below before reading from it. - let mut residual_storage: Aligned<[i16; 64 * 64]> = + let mut residual_storage: Aligned<[MaybeUninit; 64 * 64]> = unsafe { Aligned::uninitialized() }; - // SAFETY: We write to the array below before reading from it. - let mut coeffs_storage: Aligned<[T::Coeff; 64 * 64]> = + let mut coeffs_storage: Aligned<[MaybeUninit; 64 * 64]> = unsafe { Aligned::uninitialized() }; - // SAFETY: We write to the array below before reading from it. let mut qcoeffs_storage: Aligned<[MaybeUninit; 32 * 32]> = unsafe { Aligned::uninitialized() }; - // SAFETY: We write to the array below before reading from it. - let mut rcoeffs_storage: Aligned<[T::Coeff; 32 * 32]> = + let mut rcoeffs_storage: Aligned<[MaybeUninit; 32 * 32]> = unsafe { Aligned::uninitialized() }; let residual = &mut residual_storage.data[..tx_size.area()]; let coeffs = &mut coeffs_storage.data[..tx_size.area()]; @@ -1539,8 +1538,10 @@ pub fn encode_tx_block( &rec.subregion(area), ); } else { - residual.fill(0); + residual.fill(MaybeUninit::new(0)); } + // SAFETY: `diff()` inits `tx_size.area()` elements when it matches size of `subregion(area)` + let residual = unsafe { slice_assume_init_mut(residual) }; forward_transform( residual, @@ -1551,6 +1552,8 @@ pub fn encode_tx_block( fi.sequence.bit_depth, fi.cpu_feature_level, ); + // SAFETY: forward_transform initialized coeffs + let coeffs = unsafe { slice_assume_init_mut(coeffs) }; let eob = ts.qc.quantize(coeffs, qcoeffs, tx_size, tx_type); @@ -1596,6 +1599,8 @@ pub fn encode_tx_block( fi.ac_delta_q[p], fi.cpu_feature_level, ); + // SAFETY: dequantize initialized rcoeffs + let rcoeffs = unsafe { slice_assume_init_mut(rcoeffs) }; if eob == 0 { // All zero coefficients is a no-op diff --git a/src/quantize/mod.rs b/src/quantize/mod.rs index 7fb6414866..1555f58db9 100644 --- a/src/quantize/mod.rs +++ b/src/quantize/mod.rs @@ -359,10 +359,12 @@ impl QuantizationContext { pub mod rust { use super::*; use crate::cpu_features::CpuFeatureLevel; + use std::mem::MaybeUninit; pub fn dequantize( - qindex: u8, coeffs: &[T], _eob: usize, rcoeffs: &mut [T], tx_size: TxSize, - bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, _cpu: CpuFeatureLevel, + qindex: u8, coeffs: &[T], _eob: usize, rcoeffs: &mut [MaybeUninit], + tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, + _cpu: CpuFeatureLevel, ) { let log_tx_scale = get_log_tx_scale(tx_size) as i32; let offset = (1 << log_tx_scale) - 1; @@ -376,7 +378,9 @@ pub mod rust { .enumerate() { let quant = if i == 0 { dc_quant } else { ac_quant }; - *r = T::cast_from((c * quant + ((c >> 31) & offset)) >> log_tx_scale); + r.write(T::cast_from( + (c * quant + ((c >> 31) & offset)) >> log_tx_scale, + )); } } } diff --git a/src/transform/forward.rs b/src/transform/forward.rs index ac9f4e850b..85529de464 100644 --- a/src/transform/forward.rs +++ b/src/transform/forward.rs @@ -22,6 +22,7 @@ cfg_if::cfg_if! { pub mod rust { use super::*; + use std::mem::MaybeUninit; use crate::transform::forward_shared::*; use crate::transform::{av1_round_shift_array, valid_av1_transform, TxSize}; @@ -99,8 +100,8 @@ pub mod rust { /// - If called with an invalid combination of `tx_size` and `tx_type` #[cold_for_target_arch("x86_64")] pub fn forward_transform( - input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize, - tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel, + input: &[i16], output: &mut [MaybeUninit], stride: usize, + tx_size: TxSize, tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel, ) { assert!(valid_av1_transform(tx_size, tx_type)); @@ -114,7 +115,8 @@ pub mod rust { let txfm_size_row = tx_size.height(); // SAFETY: We write to the array below before reading from it. - let mut tmp: Aligned<[i32; 64 * 64]> = unsafe { Aligned::uninitialized() }; + let mut tmp: Aligned<[MaybeUninit; 64 * 64]> = + unsafe { Aligned::uninitialized() }; let buf = &mut tmp.data[..txfm_size_col * txfm_size_row]; let cfg = Txfm2DFlipCfg::fwd(tx_type, tx_size, bd); @@ -124,20 +126,22 @@ pub mod rust { // Columns for c in 0..txfm_size_col { - // SAFETY: We write to the array below before reading from it. - let mut col_coeffs_backing: Aligned<[i32; 64]> = + let mut col_coeffs_backing: Aligned<[MaybeUninit; 64]> = unsafe { Aligned::uninitialized() }; let col_coeffs = &mut col_coeffs_backing.data[..txfm_size_row]; if cfg.ud_flip { // flip upside down for r in 0..txfm_size_row { - col_coeffs[r] = (input[(txfm_size_row - r - 1) * stride + c]).into(); + col_coeffs[r] + .write((input[(txfm_size_row - r - 1) * stride + c]).into()); } } else { for r in 0..txfm_size_row { - col_coeffs[r] = (input[r * stride + c]).into(); + col_coeffs[r].write((input[r * stride + c]).into()); } } + // SAFETY: The loops above have initialized all txfm_size_row elements + let col_coeffs = unsafe { slice_assume_init_mut(col_coeffs) }; av1_round_shift_array(col_coeffs, txfm_size_row, -cfg.shift[0]); txfm_func_col(col_coeffs); @@ -145,18 +149,20 @@ pub mod rust { if cfg.lr_flip { for r in 0..txfm_size_row { // flip from left to right - buf[r * txfm_size_col + (txfm_size_col - c - 1)] = col_coeffs[r]; + buf[r * txfm_size_col + (txfm_size_col - c - 1)] + .write(col_coeffs[r]); } } else { for r in 0..txfm_size_row { - buf[r * txfm_size_col + c] = col_coeffs[r]; + buf[r * txfm_size_col + c].write(col_coeffs[r]); } } } + // SAFETY: The loops above have initialized the entire buf + let buf = unsafe { slice_assume_init_mut(buf) }; // Rows - for r in 0..txfm_size_row { - let row_coeffs = &mut buf[r * txfm_size_col..]; + for (r, row_coeffs) in buf.chunks_exact_mut(txfm_size_col).enumerate() { txfm_func_row(row_coeffs); av1_round_shift_array(row_coeffs, txfm_size_col, -cfg.shift[2]); @@ -181,8 +187,8 @@ pub mod rust { let output = &mut output[txfm_size_row * cg..]; for c in 0..txfm_size_col.min(32) { - output[c * output_stride + (r & 31)] = - T::cast_from(row_coeffs[c + cg]); + output[c * output_stride + (r & 31)] + .write(T::cast_from(row_coeffs[c + cg])); } } } diff --git a/src/transform/mod.rs b/src/transform/mod.rs index d14913e133..e4d38b47a4 100644 --- a/src/transform/mod.rs +++ b/src/transform/mod.rs @@ -449,6 +449,7 @@ mod test { use crate::cpu_features::CpuFeatureLevel; use crate::frame::*; use rand::random; + use std::mem::MaybeUninit; fn test_roundtrip( tx_size: TxSize, tx_type: TxType, tolerance: i16, @@ -465,7 +466,7 @@ mod test { ); let mut res_storage = [0i16; 64 * 64]; let res = &mut res_storage[..tx_size.area()]; - let mut freq_storage = [T::Coeff::cast_from(0); 64 * 64]; + let mut freq_storage = [MaybeUninit::uninit(); 64 * 64]; let freq = &mut freq_storage[..tx_size.area()]; for ((r, s), d) in res.iter_mut().zip(src.iter_mut()).zip(dst.data.iter_mut()) @@ -475,6 +476,8 @@ mod test { *r = i16::cast_from(*s) - i16::cast_from(*d); } forward_transform(res, freq, tx_size.width(), tx_size, tx_type, 8, cpu); + // SAFETY: forward_transform initialized freq + let freq = unsafe { slice_assume_init_mut(freq) }; inverse_transform_add( freq, &mut dst.as_region_mut(), diff --git a/src/util/align.rs b/src/util/align.rs index 84f47d1bd9..641902e50f 100644 --- a/src/util/align.rs +++ b/src/util/align.rs @@ -52,6 +52,13 @@ impl Aligned { } } +#[inline(always)] +pub unsafe fn slice_assume_init_mut( + slice: &mut [MaybeUninit], +) -> &mut [T] { + unsafe { &mut *(slice as *mut [MaybeUninit] as *mut [T]) } +} + /// An analog to a Box<[T]> where the underlying slice is aligned. /// Alignment is according to the architecture-specific SIMD constraints. pub struct AlignedBoxedSlice {