From 8ceb4e9c8bb6f5ad702279c25ec58c876f6223aa Mon Sep 17 00:00:00 2001
From: Kornel <kornel@geekhood.net>
Date: Sat, 30 Sep 2023 10:49:51 +0100
Subject: [PATCH 1/5] Don't use out params

---
 src/context/block_unit.rs     |  3 +--
 src/context/mod.rs            | 11 ++++++-----
 src/context/transform_unit.rs |  8 +++++---
 3 files changed, 12 insertions(+), 10 deletions(-)
diff --git a/src/context/block_unit.rs b/src/context/block_unit.rs
index 1d53b0db33..d426132a1d 100644
--- a/src/context/block_unit.rs
+++ b/src/context/block_unit.rs
@@ -1861,8 +1861,7 @@ impl<'a> ContextWriter<'a> {
     &mut self, eob: usize, tx_size: TxSize, tx_class: TxClass, txs_ctx: usize,
     plane_type: usize, w: &mut W,
   ) {
-    let mut eob_extra: u32 = 0;
-    let eob_pt = Self::get_eob_pos_token(eob, &mut eob_extra);
+    let (eob_pt, eob_extra) = Self::get_eob_pos_token(eob);
     let eob_multi_size: usize = tx_size.area_log2() - 4;
     let eob_multi_ctx: usize = usize::from(tx_class != TX_CLASS_2D);
 
diff --git a/src/context/mod.rs b/src/context/mod.rs
index 0ccf62fd59..1cfc92f59d 100644
--- a/src/context/mod.rs
+++ b/src/context/mod.rs
@@ -164,16 +164,18 @@ pub const fn mv_class_base(mv_class: usize) -> u32 {
 pub fn log_in_base_2(n: u32) -> u8 {
   31 - cmp::min(31, n.leading_zeros() as u8)
 }
+
+/// Returns `(mv_class, offset)`
 #[inline(always)]
-pub fn get_mv_class(z: u32, offset: &mut u32) -> usize {
+pub fn get_mv_class(z: u32) -> (usize, u32) {
   let c = if z >= CLASS0_SIZE as u32 * 4096 {
     MV_CLASS_10
   } else {
     log_in_base_2(z >> 3) as usize
   };
 
-  *offset = z - mv_class_base(c);
-  c
+  let offset = z - mv_class_base(c);
+  (c, offset)
 }
 
 impl<'a> ContextWriter<'a> {
@@ -186,10 +188,9 @@ impl<'a> ContextWriter<'a> {
   ) {
     assert!(comp != 0);
     assert!((MV_LOW..=MV_UPP).contains(&comp));
-    let mut offset: u32 = 0;
     let sign: u32 = u32::from(comp < 0);
     let mag: u32 = if sign == 1 { -comp as u32 } else { comp as u32 };
-    let mv_class = get_mv_class(mag - 1, &mut offset);
+    let (mv_class, offset) = get_mv_class(mag - 1);
     let d = offset >> 3; // int mv data
     let fr = (offset >> 1) & 3; // fractional mv data
     let hp = offset & 1; // high precision mv data
diff --git a/src/context/transform_unit.rs b/src/context/transform_unit.rs
index 8b8c2e97d6..5baf660657 100644
--- a/src/context/transform_unit.rs
+++ b/src/context/transform_unit.rs
@@ -798,11 +798,13 @@ impl<'a> ContextWriter<'a> {
     av1_get_coded_tx_size(tx_size).height_log2()
   }
 
+  /// Returns `(eob_pt, eob_extra)`
+  ///
   /// # Panics
   ///
   /// - If `eob` is prior to the start of the group
   #[inline]
-  pub fn get_eob_pos_token(eob: usize, extra: &mut u32) -> u32 {
+  pub fn get_eob_pos_token(eob: usize) -> (u32, u32) {
     let t = if eob < 33 {
       eob_to_pos_small[eob] as u32
     } else {
@@ -810,9 +812,9 @@ impl<'a> ContextWriter<'a> {
       eob_to_pos_large[e] as u32
     };
     assert!(eob as i32 >= k_eob_group_start[t as usize] as i32);
-    *extra = eob as u32 - k_eob_group_start[t as usize] as u32;
+    let extra = eob as u32 - k_eob_group_start[t as usize] as u32;
 
-    t
+    (t, extra)
   }
 
   pub fn get_nz_mag(levels: &[u8], bhl: usize, tx_class: TxClass) -> usize {

From 9f5657da48264f62f321b566a0e600aa486b4584 Mon Sep 17 00:00:00 2001
From: Kornel <kornel@geekhood.net>
Date: Sat, 30 Sep 2023 21:17:47 +0100
Subject: [PATCH 2/5] Reduce use of uninitialized arrays

---
 benches/mc.rs             | 16 ++++++++--------
 src/asm/shared/predict.rs | 16 ++++------------
 src/util/align.rs         | 10 ++++++++++
 3 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/benches/mc.rs b/benches/mc.rs
index 4d3f81add7..88072d2abb 100644
--- a/benches/mc.rs
+++ b/benches/mc.rs
@@ -263,7 +263,7 @@ fn bench_prep_8tap_top_left_lbd(c: &mut Criterion) {
   let w = 640;
   let h = 480;
   let input_plane = new_plane::<u8>(&mut ra, w, h);
-  let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() };
+  let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0);
 
   let (row_frac, col_frac, src) = get_params(
     &input_plane,
@@ -294,7 +294,7 @@ fn bench_prep_8tap_top_lbd(c: &mut Criterion) {
   let w = 640;
   let h = 480;
   let input_plane = new_plane::<u8>(&mut ra, w, h);
-  let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() };
+  let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0);
 
   let (row_frac, col_frac, src) = get_params(
     &input_plane,
@@ -325,7 +325,7 @@ fn bench_prep_8tap_left_lbd(c: &mut Criterion) {
   let w = 640;
   let h = 480;
   let input_plane = new_plane::<u8>(&mut ra, w, h);
-  let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() };
+  let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0);
 
   let (row_frac, col_frac, src) = get_params(
     &input_plane,
@@ -356,7 +356,7 @@ fn bench_prep_8tap_center_lbd(c: &mut Criterion) {
   let w = 640;
   let h = 480;
   let input_plane = new_plane::<u8>(&mut ra, w, h);
-  let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() };
+  let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0);
 
   let (row_frac, col_frac, src) = get_params(
     &input_plane,
@@ -387,7 +387,7 @@ fn bench_prep_8tap_top_left_hbd(c: &mut Criterion) {
   let w = 640;
   let h = 480;
   let input_plane = new_plane::<u16>(&mut ra, w, h);
-  let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() };
+  let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0);
 
   let (row_frac, col_frac, src) = get_params(
     &input_plane,
@@ -418,7 +418,7 @@ fn bench_prep_8tap_top_hbd(c: &mut Criterion) {
   let w = 640;
   let h = 480;
   let input_plane = new_plane::<u16>(&mut ra, w, h);
-  let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() };
+  let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0);
 
   let (row_frac, col_frac, src) = get_params(
     &input_plane,
@@ -449,7 +449,7 @@ fn bench_prep_8tap_left_hbd(c: &mut Criterion) {
   let w = 640;
   let h = 480;
   let input_plane = new_plane::<u16>(&mut ra, w, h);
-  let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() };
+  let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0);
 
   let (row_frac, col_frac, src) = get_params(
     &input_plane,
@@ -480,7 +480,7 @@ fn bench_prep_8tap_center_hbd(c: &mut Criterion) {
   let w = 640;
   let h = 480;
   let input_plane = new_plane::<u16>(&mut ra, w, h);
-  let mut dst = unsafe { Aligned::<[i16; 128 * 128]>::uninitialized() };
+  let mut dst = Aligned::<[i16; 128 * 128]>::from_fn(|_| 0);
 
   let (row_frac, col_frac, src) = get_params(
     &input_plane,
diff --git a/src/asm/shared/predict.rs b/src/asm/shared/predict.rs
index 6d098bae5b..7b73157116 100644
--- a/src/asm/shared/predict.rs
+++ b/src/asm/shared/predict.rs
@@ -39,18 +39,10 @@ mod test {
 
   fn pred_matches_inner<T: Pixel>(cpu: CpuFeatureLevel, bit_depth: usize) {
     let tx_size = TxSize::TX_4X4;
-    // SAFETY: We write to the array below before reading from it.
-    let mut ac: Aligned<[i16; 32 * 32]> = unsafe { Aligned::uninitialized() };
-    for i in 0..ac.data.len() {
-      ac.data[i] = i as i16 - 16 * 32;
-    }
-    // SAFETY: We write to the array below before reading from it.
-    let mut edge_buf: Aligned<[T; 4 * MAX_TX_SIZE + 1]> =
-      unsafe { Aligned::uninitialized() };
-    for i in 0..edge_buf.data.len() {
-      edge_buf.data[i] =
-        T::cast_from(((i ^ 1) + 32).saturating_sub(2 * MAX_TX_SIZE));
-    }
+    let ac: Aligned<[i16; 32 * 32]> = Aligned::from_fn(|i| i as i16 - 16 * 32);
+    let edge_buf: Aligned<[T; 4 * MAX_TX_SIZE + 1]> = Aligned::from_fn(|i| {
+      T::cast_from(((i ^ 1) + 32).saturating_sub(2 * MAX_TX_SIZE))
+    });
 
     let ief_params_all = [
       None,
diff --git a/src/util/align.rs b/src/util/align.rs
index c86424e8b2..84f47d1bd9 100644
--- a/src/util/align.rs
+++ b/src/util/align.rs
@@ -29,6 +29,16 @@ pub struct Aligned<T> {
   pub data: T,
 }
 
+impl<const N: usize, T> Aligned<[T; N]> {
+  #[inline(always)]
+  pub fn from_fn<F>(cb: F) -> Self
+  where
+    F: FnMut(usize) -> T,
+  {
+    Aligned { _alignment: [], data: std::array::from_fn(cb) }
+  }
+}
+
 impl<T> Aligned<T> {
   pub const fn new(data: T) -> Self {
     Aligned { _alignment: [], data }

From ce8281f36b048b6c85f56db78822ed6e98c13c37 Mon Sep 17 00:00:00 2001
From: Kornel <kornel@geekhood.net>
Date: Mon, 2 Oct 2023 12:15:24 +0100
Subject: [PATCH 3/5] Eliminate panic path from diff()

---
 src/encoder.rs | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/encoder.rs b/src/encoder.rs
index a3ee7e32e8..993fcdcd4e 100644
--- a/src/encoder.rs
+++ b/src/encoder.rs
@@ -1358,11 +1358,13 @@ fn diff<T: Pixel>(
   dst: &mut [i16], src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>,
 ) {
   debug_assert!(dst.len() % src1.rect().width == 0);
+  let width = src1.rect().width;
+  if width == 0 || width != src2.rect().width {
+    return;
+  }
 
-  for ((l, s1), s2) in dst
-    .chunks_exact_mut(src1.rect().width)
-    .zip(src1.rows_iter())
-    .zip(src2.rows_iter())
+  for ((l, s1), s2) in
+    dst.chunks_exact_mut(width).zip(src1.rows_iter()).zip(src2.rows_iter())
   {
     for ((r, v1), v2) in l.iter_mut().zip(s1).zip(s2) {
       *r = i16::cast_from(*v1) - i16::cast_from(*v2);

From 13e98f2ac3675fc2d7094f3e18e8aecdaf35d9cc Mon Sep 17 00:00:00 2001
From: Kornel <kornel@geekhood.net>
Date: Mon, 2 Oct 2023 12:30:30 +0100
Subject: [PATCH 4/5] Eliminate extra length checks from diff() loop

---
 src/encoder.rs | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/encoder.rs b/src/encoder.rs
index 993fcdcd4e..ef63894320 100644
--- a/src/encoder.rs
+++ b/src/encoder.rs
@@ -1359,7 +1359,14 @@ fn diff<T: Pixel>(
 ) {
   debug_assert!(dst.len() % src1.rect().width == 0);
   let width = src1.rect().width;
-  if width == 0 || width != src2.rect().width {
+  let height = src1.rect().height;
+
+  if width == 0
+    || width != src2.rect().width
+    || height == 0
+    || src1.rows_iter().len() != src2.rows_iter().len()
+  {
+    debug_assert!(false);
     return;
   }
 

From 36da899c14c217dc1c60658b2a3bc7bf88cf7505 Mon Sep 17 00:00:00 2001
From: Kornel <kornel@geekhood.net>
Date: Tue, 3 Oct 2023 18:24:46 +0100
Subject: [PATCH 5/5] Prefer MaybeUninit for write-only slices

---
 benches/transform.rs                |  3 ++-
 src/asm/shared/transform/inverse.rs | 21 +++++++++++--------
 src/asm/x86/quantize.rs             | 20 ++++++++++--------
 src/asm/x86/transform/forward.rs    | 17 +++++++++------
 src/encoder.rs                      | 25 +++++++++++++---------
 src/quantize/mod.rs                 | 10 ++++++---
 src/transform/forward.rs            | 32 +++++++++++++++++------------
 src/transform/mod.rs                |  5 ++++-
 src/util/align.rs                   |  7 +++++++
 9 files changed, 90 insertions(+), 50 deletions(-)

diff --git a/benches/transform.rs b/benches/transform.rs
index 849ed9d152..c301b503f4 100644
--- a/benches/transform.rs
+++ b/benches/transform.rs
@@ -15,6 +15,7 @@ use rav1e::bench::transform;
 use rav1e::bench::transform::{
   forward_transform, get_valid_txfm_types, TxSize,
 };
+use std::mem::MaybeUninit;
 
 fn init_buffers(size: usize) -> (Vec<i32>, Vec<i32>) {
   let mut ra = ChaChaRng::from_seed([0; 32]);
@@ -96,7 +97,7 @@ pub fn bench_forward_transforms(c: &mut Criterion) {
 
     let input: Vec<i16> =
       (0..area).map(|_| rng.gen_range(-255..256)).collect();
-    let mut output = vec![0i16; area];
+    let mut output = vec![MaybeUninit::new(0i16); area];
 
     for &tx_type in get_valid_txfm_types(tx_size) {
       group.bench_function(
diff --git a/src/asm/shared/transform/inverse.rs b/src/asm/shared/transform/inverse.rs
index b7ad108c64..fc11b4341a 100644
--- a/src/asm/shared/transform/inverse.rs
+++ b/src/asm/shared/transform/inverse.rs
@@ -9,6 +9,7 @@
 
 use crate::tiling::PlaneRegionMut;
 use crate::util::*;
+use std::mem::MaybeUninit;
 
 // Note: Input coeffs are mutable since the assembly uses them as a scratchpad
 pub type InvTxfmFunc =
@@ -27,13 +28,13 @@ pub fn call_inverse_func<T: Pixel>(
   let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];
 
   // SAFETY: We write to the array below before reading from it.
-  let mut copied: Aligned<[T::Coeff; 32 * 32]> =
+  let mut copied: Aligned<[MaybeUninit<T::Coeff>; 32 * 32]> =
     unsafe { Aligned::uninitialized() };
 
   // Convert input to 16-bits.
   // TODO: Remove by changing inverse assembly to not overwrite its input
   for (a, b) in copied.data.iter_mut().zip(input) {
-    *a = *b;
+    a.write(*b);
   }
 
   // perform the inverse transform
@@ -57,13 +58,13 @@ pub fn call_inverse_hbd_func<T: Pixel>(
   let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];
 
   // SAFETY: We write to the array below before reading from it.
-  let mut copied: Aligned<[T::Coeff; 32 * 32]> =
+  let mut copied: Aligned<[MaybeUninit<T::Coeff>; 32 * 32]> =
     unsafe { Aligned::uninitialized() };
 
   // Convert input to 16-bits.
   // TODO: Remove by changing inverse assembly to not overwrite its input
   for (a, b) in copied.data.iter_mut().zip(input) {
-    *a = *b;
+    a.write(*b);
   }
 
   // perform the inverse transform
@@ -88,6 +89,7 @@ pub mod test {
   use crate::transform::TxSize::*;
   use crate::transform::*;
   use rand::{random, thread_rng, Rng};
+  use std::mem::MaybeUninit;
 
   pub fn pick_eob<T: Coefficient>(
     coeffs: &mut [T], tx_size: TxSize, tx_type: TxType, sub_h: usize,
@@ -146,12 +148,11 @@ pub mod test {
       let mut src_storage = [T::zero(); 64 * 64];
       let src = &mut src_storage[..tx_size.area()];
       let mut dst = Plane::from_slice(&[T::zero(); 64 * 64], 64);
-      // SAFETY: We write to the array below before reading from it.
-      let mut res_storage: Aligned<[i16; 64 * 64]> =
+      let mut res_storage: Aligned<[MaybeUninit<i16>; 64 * 64]> =
         unsafe { Aligned::uninitialized() };
       let res = &mut res_storage.data[..tx_size.area()];
       // SAFETY: We write to the array below before reading from it.
-      let mut freq_storage: Aligned<[T::Coeff; 64 * 64]> =
+      let mut freq_storage: Aligned<[MaybeUninit<T::Coeff>; 64 * 64]> =
         unsafe { Aligned::uninitialized() };
       let freq = &mut freq_storage.data[..tx_size.area()];
       for ((r, s), d) in
@@ -159,8 +160,10 @@ pub mod test {
       {
         *s = T::cast_from(random::<u16>() >> (16 - bit_depth));
         *d = T::cast_from(random::<u16>() >> (16 - bit_depth));
-        *r = i16::cast_from(*s) - i16::cast_from(*d);
+        r.write(i16::cast_from(*s) - i16::cast_from(*d));
       }
+      // SAFETY: The loop just initialized res, and all three slices have the same length
+      let res = unsafe { slice_assume_init_mut(res) };
       forward_transform(
         res,
         freq,
@@ -170,6 +173,8 @@ pub mod test {
         bit_depth,
         CpuFeatureLevel::RUST,
       );
+      // SAFETY: forward_transform initialized freq
+      let freq = unsafe { slice_assume_init_mut(freq) };
 
       let eob: usize = pick_eob(freq, tx_size, tx_type, sub_h);
       let mut rust_dst = dst.clone();
diff --git a/src/asm/x86/quantize.rs b/src/asm/x86/quantize.rs
index 28dbabedc7..b61f5ad392 100644
--- a/src/asm/x86/quantize.rs
+++ b/src/asm/x86/quantize.rs
@@ -17,6 +17,7 @@ use crate::cpu_features::CpuFeatureLevel;
 use crate::quantize::*;
 use crate::transform::TxSize;
 use crate::util::*;
+use std::mem::MaybeUninit;
 
 type DequantizeFn = unsafe fn(
   qindex: u8,
@@ -37,10 +38,11 @@ cpu_function_lookup_table!(
 
 #[inline(always)]
 pub fn dequantize<T: Coefficient>(
-  qindex: u8, coeffs: &[T], eob: usize, rcoeffs: &mut [T], tx_size: TxSize,
-  bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, cpu: CpuFeatureLevel,
+  qindex: u8, coeffs: &[T], eob: usize, rcoeffs: &mut [MaybeUninit<T>],
+  tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
+  cpu: CpuFeatureLevel,
 ) {
-  let call_rust = |rcoeffs: &mut [T]| {
+  let call_rust = |rcoeffs: &mut [MaybeUninit<T>]| {
     crate::quantize::rust::dequantize(
       qindex, coeffs, eob, rcoeffs, tx_size, bit_depth, dc_delta_q,
       ac_delta_q, cpu,
@@ -48,10 +50,9 @@ pub fn dequantize<T: Coefficient>(
   };
 
   #[cfg(any(feature = "check_asm", test))]
-  let ref_rcoeffs = {
+  let mut ref_rcoeffs = {
     let area = av1_get_coded_tx_size(tx_size).area();
-    let mut copy = vec![T::cast_from(0); area];
-    copy[..].copy_from_slice(&rcoeffs[..area]);
+    let mut copy = vec![MaybeUninit::new(T::cast_from(0)); area];
     call_rust(&mut copy);
     copy
   };
@@ -82,7 +83,9 @@ pub fn dequantize<T: Coefficient>(
   #[cfg(any(feature = "check_asm", test))]
   {
     let area = av1_get_coded_tx_size(tx_size).area();
-    assert_eq!(&rcoeffs[..area], &ref_rcoeffs[..]);
+    let rcoeffs = unsafe { slice_assume_init_mut(&mut rcoeffs[..area]) };
+    let ref_rcoeffs = unsafe { slice_assume_init_mut(&mut ref_rcoeffs[..]) };
+    assert_eq!(rcoeffs, ref_rcoeffs);
   }
 }
 
@@ -157,6 +160,7 @@ mod test {
   use super::*;
   use rand::distributions::{Distribution, Uniform};
   use rand::{thread_rng, Rng};
+  use std::mem::MaybeUninit;
 
   #[test]
   fn dequantize_test() {
@@ -190,7 +194,7 @@ mod test {
 
       for &eob in &eobs {
         let mut qcoeffs = Aligned::new([0i16; 32 * 32]);
-        let mut rcoeffs = Aligned::new([0i16; 32 * 32]);
+        let mut rcoeffs = Aligned::new([MaybeUninit::new(0i16); 32 * 32]);
 
         // Generate quantized coefficients up to the eob
         let between = Uniform::from(-i16::MAX..=i16::MAX);
diff --git a/src/asm/x86/transform/forward.rs b/src/asm/x86/transform/forward.rs
index 18b1171517..6946f2650c 100644
--- a/src/asm/x86/transform/forward.rs
+++ b/src/asm/x86/transform/forward.rs
@@ -333,8 +333,8 @@ fn cast_mut<const N: usize, T>(x: &mut [T]) -> &mut [T; N] {
 #[allow(clippy::identity_op, clippy::erasing_op)]
 #[target_feature(enable = "avx2")]
 unsafe fn forward_transform_avx2<T: Coefficient>(
-  input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
-  tx_type: TxType, bd: usize,
+  input: &[i16], output: &mut [MaybeUninit<T>], stride: usize,
+  tx_size: TxSize, tx_type: TxType, bd: usize,
 ) {
   // Note when assigning txfm_size_col, we use the txfm_size from the
   // row configuration and vice versa. This is intentionally done to
@@ -508,8 +508,8 @@ unsafe fn forward_transform_avx2<T: Coefficient>(
 ///
 /// - If called with an invalid combination of `tx_size` and `tx_type`
 pub fn forward_transform<T: Coefficient>(
-  input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
-  tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
+  input: &[i16], output: &mut [MaybeUninit<T>], stride: usize,
+  tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
 ) {
   assert!(valid_av1_transform(tx_size, tx_type));
   if cpu >= CpuFeatureLevel::AVX2 {
@@ -526,7 +526,9 @@ pub fn forward_transform<T: Coefficient>(
 mod test {
   use crate::cpu_features::*;
   use crate::transform::{forward_transform, get_valid_txfm_types, TxSize};
+  use crate::util::slice_assume_init_mut;
   use rand::Rng;
+  use std::mem::MaybeUninit;
 
   // Ensure that the simd results match the rust code
   #[test]
@@ -558,8 +560,8 @@ mod test {
         (0..area).map(|_| rng.gen_range(-255..256)).collect();
 
       for &tx_type in get_valid_txfm_types(tx_size) {
-        let mut output_ref = vec![0i16; area];
-        let mut output_simd = vec![0i16; area];
+        let mut output_ref = vec![MaybeUninit::new(0i16); area];
+        let mut output_simd = vec![MaybeUninit::new(0i16); area];
 
         println!("Testing combination {:?}, {:?}", tx_size, tx_type);
         forward_transform(
@@ -571,6 +573,7 @@ mod test {
           8,
           CpuFeatureLevel::RUST,
         );
+        let output_ref = unsafe { slice_assume_init_mut(&mut output_ref[..]) };
         forward_transform(
           &input[..],
           &mut output_simd[..],
@@ -580,6 +583,8 @@ mod test {
           8,
           cpu,
         );
+        let output_simd =
+          unsafe { slice_assume_init_mut(&mut output_simd[..]) };
         assert_eq!(output_ref, output_simd)
       }
     }
diff --git a/src/encoder.rs b/src/encoder.rs
index ef63894320..9f4cb54187 100644
--- a/src/encoder.rs
+++ b/src/encoder.rs
@@ -1355,9 +1355,12 @@ fn write_key_frame_obus<T: Pixel>(
 
 /// Write into `dst` the difference between the blocks at `src1` and `src2`
 fn diff<T: Pixel>(
-  dst: &mut [i16], src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>,
+  dst: &mut [MaybeUninit<i16>], src1: &PlaneRegion<'_, T>,
+  src2: &PlaneRegion<'_, T>,
 ) {
   debug_assert!(dst.len() % src1.rect().width == 0);
+  debug_assert_eq!(src1.rows_iter().count(), src1.rect().height);
+
   let width = src1.rect().width;
   let height = src1.rect().height;
 
@@ -1374,7 +1377,7 @@ fn diff<T: Pixel>(
     dst.chunks_exact_mut(width).zip(src1.rows_iter()).zip(src2.rows_iter())
   {
     for ((r, v1), v2) in l.iter_mut().zip(s1).zip(s2) {
-      *r = i16::cast_from(*v1) - i16::cast_from(*v2);
+      r.write(i16::cast_from(*v1) - i16::cast_from(*v2));
     }
   }
 }
@@ -1504,17 +1507,13 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
   }
 
   let coded_tx_area = av1_get_coded_tx_size(tx_size).area();
-  // SAFETY: We write to the array below before reading from it.
-  let mut residual_storage: Aligned<[i16; 64 * 64]> =
+  let mut residual_storage: Aligned<[MaybeUninit<i16>; 64 * 64]> =
     unsafe { Aligned::uninitialized() };
-  // SAFETY: We write to the array below before reading from it.
-  let mut coeffs_storage: Aligned<[T::Coeff; 64 * 64]> =
+  let mut coeffs_storage: Aligned<[MaybeUninit<T::Coeff>; 64 * 64]> =
     unsafe { Aligned::uninitialized() };
-  // SAFETY: We write to the array below before reading from it.
   let mut qcoeffs_storage: Aligned<[MaybeUninit<T::Coeff>; 32 * 32]> =
     unsafe { Aligned::uninitialized() };
-  // SAFETY: We write to the array below before reading from it.
-  let mut rcoeffs_storage: Aligned<[T::Coeff; 32 * 32]> =
+  let mut rcoeffs_storage: Aligned<[MaybeUninit<T::Coeff>; 32 * 32]> =
     unsafe { Aligned::uninitialized() };
   let residual = &mut residual_storage.data[..tx_size.area()];
   let coeffs = &mut coeffs_storage.data[..tx_size.area()];
@@ -1539,8 +1538,10 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
       &rec.subregion(area),
     );
   } else {
-    residual.fill(0);
+    residual.fill(MaybeUninit::new(0));
   }
+  // SAFETY: `diff()` inits `tx_size.area()` elements when it matches size of `subregion(area)`
+  let residual = unsafe { slice_assume_init_mut(residual) };
 
   forward_transform(
     residual,
@@ -1551,6 +1552,8 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
     fi.sequence.bit_depth,
     fi.cpu_feature_level,
   );
+  // SAFETY: forward_transform initialized coeffs
+  let coeffs = unsafe { slice_assume_init_mut(coeffs) };
 
   let eob = ts.qc.quantize(coeffs, qcoeffs, tx_size, tx_type);
 
@@ -1596,6 +1599,8 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
     fi.ac_delta_q[p],
     fi.cpu_feature_level,
   );
+  // SAFETY: dequantize initialized rcoeffs
+  let rcoeffs = unsafe { slice_assume_init_mut(rcoeffs) };
 
   if eob == 0 {
     // All zero coefficients is a no-op
diff --git a/src/quantize/mod.rs b/src/quantize/mod.rs
index 7fb6414866..1555f58db9 100644
--- a/src/quantize/mod.rs
+++ b/src/quantize/mod.rs
@@ -359,10 +359,12 @@ impl QuantizationContext {
 pub mod rust {
   use super::*;
   use crate::cpu_features::CpuFeatureLevel;
+  use std::mem::MaybeUninit;
 
   pub fn dequantize<T: Coefficient>(
-    qindex: u8, coeffs: &[T], _eob: usize, rcoeffs: &mut [T], tx_size: TxSize,
-    bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, _cpu: CpuFeatureLevel,
+    qindex: u8, coeffs: &[T], _eob: usize, rcoeffs: &mut [MaybeUninit<T>],
+    tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
+    _cpu: CpuFeatureLevel,
   ) {
     let log_tx_scale = get_log_tx_scale(tx_size) as i32;
     let offset = (1 << log_tx_scale) - 1;
@@ -376,7 +378,9 @@ pub mod rust {
       .enumerate()
     {
       let quant = if i == 0 { dc_quant } else { ac_quant };
-      *r = T::cast_from((c * quant + ((c >> 31) & offset)) >> log_tx_scale);
+      r.write(T::cast_from(
+        (c * quant + ((c >> 31) & offset)) >> log_tx_scale,
+      ));
     }
   }
 }
diff --git a/src/transform/forward.rs b/src/transform/forward.rs
index ac9f4e850b..85529de464 100644
--- a/src/transform/forward.rs
+++ b/src/transform/forward.rs
@@ -22,6 +22,7 @@ cfg_if::cfg_if! {
 
 pub mod rust {
   use super::*;
+  use std::mem::MaybeUninit;
 
   use crate::transform::forward_shared::*;
   use crate::transform::{av1_round_shift_array, valid_av1_transform, TxSize};
@@ -99,8 +100,8 @@ pub mod rust {
   /// - If called with an invalid combination of `tx_size` and `tx_type`
   #[cold_for_target_arch("x86_64")]
   pub fn forward_transform<T: Coefficient>(
-    input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
-    tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel,
+    input: &[i16], output: &mut [MaybeUninit<T>], stride: usize,
+    tx_size: TxSize, tx_type: TxType, bd: usize, _cpu: CpuFeatureLevel,
   ) {
     assert!(valid_av1_transform(tx_size, tx_type));
 
@@ -114,7 +115,8 @@ pub mod rust {
     let txfm_size_row = tx_size.height();
 
     // SAFETY: We write to the array below before reading from it.
-    let mut tmp: Aligned<[i32; 64 * 64]> = unsafe { Aligned::uninitialized() };
+    let mut tmp: Aligned<[MaybeUninit<i32>; 64 * 64]> =
+      unsafe { Aligned::uninitialized() };
     let buf = &mut tmp.data[..txfm_size_col * txfm_size_row];
 
     let cfg = Txfm2DFlipCfg::fwd(tx_type, tx_size, bd);
@@ -124,20 +126,22 @@ pub mod rust {
 
     // Columns
     for c in 0..txfm_size_col {
-      // SAFETY: We write to the array below before reading from it.
-      let mut col_coeffs_backing: Aligned<[i32; 64]> =
+      let mut col_coeffs_backing: Aligned<[MaybeUninit<i32>; 64]> =
         unsafe { Aligned::uninitialized() };
       let col_coeffs = &mut col_coeffs_backing.data[..txfm_size_row];
       if cfg.ud_flip {
         // flip upside down
         for r in 0..txfm_size_row {
-          col_coeffs[r] = (input[(txfm_size_row - r - 1) * stride + c]).into();
+          col_coeffs[r]
+            .write((input[(txfm_size_row - r - 1) * stride + c]).into());
         }
       } else {
         for r in 0..txfm_size_row {
-          col_coeffs[r] = (input[r * stride + c]).into();
+          col_coeffs[r].write((input[r * stride + c]).into());
         }
       }
+      // SAFETY: The loops above have initialized all txfm_size_row elements
+      let col_coeffs = unsafe { slice_assume_init_mut(col_coeffs) };
 
       av1_round_shift_array(col_coeffs, txfm_size_row, -cfg.shift[0]);
       txfm_func_col(col_coeffs);
@@ -145,18 +149,20 @@ pub mod rust {
       if cfg.lr_flip {
         for r in 0..txfm_size_row {
           // flip from left to right
-          buf[r * txfm_size_col + (txfm_size_col - c - 1)] = col_coeffs[r];
+          buf[r * txfm_size_col + (txfm_size_col - c - 1)]
+            .write(col_coeffs[r]);
         }
       } else {
         for r in 0..txfm_size_row {
-          buf[r * txfm_size_col + c] = col_coeffs[r];
+          buf[r * txfm_size_col + c].write(col_coeffs[r]);
         }
       }
     }
+    // SAFETY: The loops above have initialized the entire buf
+    let buf = unsafe { slice_assume_init_mut(buf) };
 
     // Rows
-    for r in 0..txfm_size_row {
-      let row_coeffs = &mut buf[r * txfm_size_col..];
+    for (r, row_coeffs) in buf.chunks_exact_mut(txfm_size_col).enumerate() {
       txfm_func_row(row_coeffs);
       av1_round_shift_array(row_coeffs, txfm_size_col, -cfg.shift[2]);
 
@@ -181,8 +187,8 @@ pub mod rust {
         let output = &mut output[txfm_size_row * cg..];
 
         for c in 0..txfm_size_col.min(32) {
-          output[c * output_stride + (r & 31)] =
-            T::cast_from(row_coeffs[c + cg]);
+          output[c * output_stride + (r & 31)]
+            .write(T::cast_from(row_coeffs[c + cg]));
         }
       }
     }
diff --git a/src/transform/mod.rs b/src/transform/mod.rs
index d14913e133..e4d38b47a4 100644
--- a/src/transform/mod.rs
+++ b/src/transform/mod.rs
@@ -449,6 +449,7 @@ mod test {
   use crate::cpu_features::CpuFeatureLevel;
   use crate::frame::*;
   use rand::random;
+  use std::mem::MaybeUninit;
 
   fn test_roundtrip<T: Pixel>(
     tx_size: TxSize, tx_type: TxType, tolerance: i16,
@@ -465,7 +466,7 @@ mod test {
     );
     let mut res_storage = [0i16; 64 * 64];
     let res = &mut res_storage[..tx_size.area()];
-    let mut freq_storage = [T::Coeff::cast_from(0); 64 * 64];
+    let mut freq_storage = [MaybeUninit::uninit(); 64 * 64];
     let freq = &mut freq_storage[..tx_size.area()];
     for ((r, s), d) in
       res.iter_mut().zip(src.iter_mut()).zip(dst.data.iter_mut())
@@ -475,6 +476,8 @@ mod test {
       *r = i16::cast_from(*s) - i16::cast_from(*d);
     }
     forward_transform(res, freq, tx_size.width(), tx_size, tx_type, 8, cpu);
+    // SAFETY: forward_transform initialized freq
+    let freq = unsafe { slice_assume_init_mut(freq) };
     inverse_transform_add(
       freq,
       &mut dst.as_region_mut(),
diff --git a/src/util/align.rs b/src/util/align.rs
index 84f47d1bd9..641902e50f 100644
--- a/src/util/align.rs
+++ b/src/util/align.rs
@@ -52,6 +52,13 @@ impl<T> Aligned<T> {
   }
 }
 
+#[inline(always)]
+pub unsafe fn slice_assume_init_mut<T>(
+  slice: &mut [MaybeUninit<T>],
+) -> &mut [T] {
+  unsafe { &mut *(slice as *mut [MaybeUninit<T>] as *mut [T]) }
+}
+
 /// An analog to a Box<[T]> where the underlying slice is aligned.
 /// Alignment is according to the architecture-specific SIMD constraints.
 pub struct AlignedBoxedSlice<T> {