Skip to content

Commit

Permalink
struct Rav1dFrameContext_lf::level: Change [u8; 4]s to u8s and …
Browse files Browse the repository at this point in the history
…remove `unsafe`s and UB (#1273)

Previously, usages of `level` were UB. Accesses to the disjoint Y
(`[0..2]`) and UV (`[2..4]`) halves were done `unsafe`ly, since at the
whole-`[u8; 4]` element granularity they overlapped. But this meant the
other, safe `DisjointMut` usages of `level` went unchecked against
those `unsafe` writes, overlapped with them, and were thus UB.

This fixes that by making the elements just `u8`s instead of `[u8; 4]`s,
multiplying all previous indices/offsets by 4. Thus we can now index the
`[0..2]` Y and `[2..4]` UV elements independently with `DisjointMut`'s
safe APIs.

We can also remove `fn unaligned_lvl_slice`, since it now translates to
a plain offset by `y` (which, unlike the `x` indices, is not multiplied
by 4). We can also allocate only 3 extra bytes for asm, as the C code
does, rather than a whole extra element (4 bytes).

And finally, we can move the `DisjointMut` immutable indexing to `fn
loop_filter_sb128_rust`, where it can be fine-grained and not overlap
with other uses.
  • Loading branch information
kkysen authored Jul 1, 2024
2 parents 7e0a9d8 + c298743 commit 5cf49e0
Show file tree
Hide file tree
Showing 7 changed files with 66 additions and 169 deletions.
1 change: 0 additions & 1 deletion lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ pub mod src {
pub(crate) mod pixels;
pub(crate) mod relaxed_atomic;
pub(crate) mod strided;
mod unstable_extensions;
pub(crate) mod with_offset;
pub(crate) mod wrap_fn_ptr;
// TODO(kkysen) Temporarily `pub(crate)` due to a `pub use` until TAIT.
Expand Down
4 changes: 2 additions & 2 deletions src/decode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4520,10 +4520,10 @@ pub(crate) fn rav1d_decode_frame_init(c: &Rav1dContext, fc: &Rav1dFrameContext)
f.lf.mask.clear();
// TODO: Fallible allocation.
f.lf.mask.resize_with(num_sb128 as usize, Default::default);
// over-allocate one element (4 bytes) since some of the SIMD implementations
// over-allocate by 3 bytes since some of the SIMD implementations
// index this from the level type and can thus over-read by up to 3 bytes.
f.lf.level
.resize(num_sb128 as usize * 32 * 32 + 1, [0u8; 4]); // TODO: Fallible allocation
.resize_with(4 * num_sb128 as usize * 32 * 32 + 3, Default::default); // TODO: Fallible allocation
if c.fc.len() > 1 {
// TODO: Fallible allocation
f.frame_thread
Expand Down
2 changes: 1 addition & 1 deletion src/internal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -693,7 +693,7 @@ impl TxLpfRightEdge {
#[derive(Default)]
#[repr(C)]
pub struct Rav1dFrameContext_lf {
pub level: DisjointMut<Vec<[u8; 4]>>,
pub level: DisjointMut<Vec<u8>>,
pub mask: Vec<Av1Filter>, /* len = w*h */
pub lr_mask: Vec<Av1Restoration>,
pub lim_lut: Align16<Av1FilterLUT>,
Expand Down
68 changes: 26 additions & 42 deletions src/lf_apply.rs
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ pub(crate) fn rav1d_copy_lpf<BD: BitDepth>(
fn filter_plane_cols_y<BD: BitDepth>(
f: &Rav1dFrameData,
have_left: bool,
lvl: WithOffset<&[[u8; 4]]>,
lvl: WithOffset<&DisjointMut<Vec<u8>>>,
mask: &[[[RelaxedAtomic<u16>; 2]; 3]; 32],
y_dst: Rav1dPictureDataComponentOffset,
w: usize,
Expand Down Expand Up @@ -398,16 +398,16 @@ fn filter_plane_cols_y<BD: BitDepth>(
} else {
mask.each_ref().map(|[_, b]| b.get() as u32)
};
let lvl = lvl + x;
lf_sb.y.h.call::<BD>(f, y_dst(x), &hmask, lvl, 0, len);
let lvl = |y| lvl + (4 * x + y);
lf_sb.y.h.call::<BD>(f, y_dst(x), &hmask, lvl(0), len);
}
}

#[inline]
fn filter_plane_rows_y<BD: BitDepth>(
f: &Rav1dFrameData,
have_top: bool,
lvl: WithOffset<&[[u8; 4]]>,
lvl: WithOffset<&DisjointMut<Vec<u8>>>,
b4_stride: usize,
mask: &[[[RelaxedAtomic<u16>; 2]; 3]; 32],
y_dst: Rav1dPictureDataComponentOffset,
Expand All @@ -430,16 +430,16 @@ fn filter_plane_rows_y<BD: BitDepth>(
let vmask = mask[y % mask.len()] // To elide the bounds check.
.each_ref()
.map(|[a, b]| a.get() as u32 | ((b.get() as u32) << 16));
let lvl = lvl + i * b4_stride;
lf_sb.y.v.call::<BD>(f, y_dst(i), &vmask, lvl, 1, w);
let lvl = |y| lvl + (4 * i * b4_stride + y);
lf_sb.y.v.call::<BD>(f, y_dst(i), &vmask, lvl(1), w);
}
}

#[inline]
fn filter_plane_cols_uv<BD: BitDepth>(
f: &Rav1dFrameData,
have_left: bool,
lvl: WithOffset<&[[u8; 4]]>,
lvl: WithOffset<&DisjointMut<Vec<u8>>>,
mask: &[[[RelaxedAtomic<u16>; 2]; 2]; 32],
u_dst: Rav1dPictureDataComponentOffset,
v_dst: Rav1dPictureDataComponentOffset,
Expand Down Expand Up @@ -472,17 +472,17 @@ fn filter_plane_cols_uv<BD: BitDepth>(
mask.each_ref().map(|[_, b]| b.get() as u32)
};
let hmask = [hmask[0], hmask[1], 0];
let lvl = lvl + x;
lf_sb.uv.h.call::<BD>(f, u_dst(x), &hmask, lvl, 2, len);
lf_sb.uv.h.call::<BD>(f, v_dst(x), &hmask, lvl, 3, len);
let lvl = |y| lvl + (4 * x + y);
lf_sb.uv.h.call::<BD>(f, u_dst(x), &hmask, lvl(2), len);
lf_sb.uv.h.call::<BD>(f, v_dst(x), &hmask, lvl(3), len);
}
}

#[inline]
fn filter_plane_rows_uv<BD: BitDepth>(
f: &Rav1dFrameData,
have_top: bool,
lvl: WithOffset<&[[u8; 4]]>,
lvl: WithOffset<&DisjointMut<Vec<u8>>>,
b4_stride: usize,
mask: &[[[RelaxedAtomic<u16>; 2]; 2]; 32],
u_dst: Rav1dPictureDataComponentOffset,
Expand Down Expand Up @@ -510,9 +510,9 @@ fn filter_plane_rows_uv<BD: BitDepth>(
.each_ref()
.map(|[a, b]| a.get() as u32 | ((b.get() as u32) << (16 >> ss_hor)));
let vmask = [vmask[0], vmask[1], 0];
let lvl = lvl + i * b4_stride;
lf_sb.uv.v.call::<BD>(f, u_dst(i), &vmask, lvl, 2, w);
lf_sb.uv.v.call::<BD>(f, v_dst(i), &vmask, lvl, 3, w);
let lvl = |y| lvl + (4 * i * b4_stride + y);
lf_sb.uv.v.call::<BD>(f, u_dst(i), &vmask, lvl(2), w);
lf_sb.uv.v.call::<BD>(f, v_dst(i), &vmask, lvl(3), w);
}
}

Expand Down Expand Up @@ -624,20 +624,16 @@ pub(crate) fn rav1d_loopfilter_sbrow_cols<BD: BitDepth>(
}
}
let lflvl = &f.lf.mask[lflvl_offset..];
let lvl = &*f
.lf
.level
.index((f.b4_stride * sby as isize * sbsz as isize) as usize..);
let lvl = WithOffset {
data: lvl,
offset: 0,
data: &f.lf.level,
offset: 4 * f.b4_stride as usize * (sby * sbsz) as usize,
};
have_left = false;
for x in 0..f.sb128w as usize {
filter_plane_cols_y::<BD>(
f,
have_left,
lvl + x * 32,
lvl + 4 * x * 32,
&lflvl[x].filter_y[0],
py + x * 128,
cmp::min(32, f.w4 - x as c_int * 32) as usize,
Expand All @@ -649,20 +645,16 @@ pub(crate) fn rav1d_loopfilter_sbrow_cols<BD: BitDepth>(
if frame_hdr.loopfilter.level_u == 0 && frame_hdr.loopfilter.level_v == 0 {
return;
}
let lvl = &*f
.lf
.level
.index((f.b4_stride * (sby * sbsz >> ss_ver) as isize) as usize..);
let lvl = WithOffset {
data: lvl,
offset: 0,
data: &f.lf.level,
offset: 4 * f.b4_stride as usize * (sby * sbsz >> ss_ver) as usize,
};
have_left = false;
for x in 0..f.sb128w as usize {
filter_plane_cols_uv::<BD>(
f,
have_left,
lvl + x * (32 >> ss_hor),
lvl + 4 * x * (32 >> ss_hor),
&lflvl[x].filter_uv[0],
pu + x * (128 >> ss_hor),
pv + x * (128 >> ss_hor),
Expand Down Expand Up @@ -694,19 +686,15 @@ pub(crate) fn rav1d_loopfilter_sbrow_rows<BD: BitDepth>(
let endy4: c_uint = (starty4 + cmp::min(f.h4 - sby * sbsz, sbsz)) as c_uint;
let uv_endy4: c_uint = endy4.wrapping_add(ss_ver as c_uint) >> ss_ver;

let lvl = &*f
.lf
.level
.index((f.b4_stride * sby as isize * sbsz as isize) as usize..);
let lvl = WithOffset {
data: lvl,
offset: 0,
data: &f.lf.level,
offset: 4 * f.b4_stride as usize * (sby * sbsz) as usize,
};
for x in 0..f.sb128w as usize {
filter_plane_rows_y::<BD>(
f,
have_top,
lvl + x * 32,
lvl + 4 * x * 32,
f.b4_stride as usize,
&lflvl[x].filter_y[1],
p[0] + 128 * x,
Expand All @@ -721,20 +709,16 @@ pub(crate) fn rav1d_loopfilter_sbrow_rows<BD: BitDepth>(
return;
}

let lvl = &*f
.lf
.level
.index((f.b4_stride * (sby * sbsz >> ss_ver) as isize) as usize..);
let lvl = WithOffset {
data: lvl,
offset: 0,
data: &f.lf.level,
offset: 4 * f.b4_stride as usize * (sby * sbsz >> ss_ver) as usize,
};
let [_, pu, pv] = p;
for x in 0..f.sb128w as usize {
filter_plane_rows_uv::<BD>(
f,
have_top,
lvl + x * (32 >> ss_hor),
lvl + 4 * x * (32 >> ss_hor),
f.b4_stride as usize,
&lflvl[x].filter_uv[1],
pu + (x * 128 >> ss_hor),
Expand Down
64 changes: 22 additions & 42 deletions src/lf_mask.rs
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,7 @@ fn mask_edges_chroma(

pub(crate) fn rav1d_create_lf_mask_intra(
lflvl: &Av1Filter,
level_cache: &DisjointMut<Vec<[u8; 4]>>,
level_cache: &DisjointMut<Vec<u8>>,
b4_stride: ptrdiff_t,
filter_level: &Align16<[[[u8; 2]; 8]; 4]>,
b: Bxy,
Expand Down Expand Up @@ -416,16 +416,11 @@ pub(crate) fn rav1d_create_lf_mask_intra(
let mut level_cache_off = by * b4_stride + bx;
for _y in 0..bh4 {
for x in 0..bw4 {
let idx = level_cache_off + x;
assert!(idx < level_cache.len());
// SAFETY: The Y portion of this element (indices 0 and 1) is not
// concurrently accessed by any other threads and the assert above ensures
// that it is in bounds.
unsafe {
let cur = level_cache.as_mut_ptr().add(idx);
(*cur)[0] = filter_level[0][0][0];
(*cur)[1] = filter_level[1][0][0];
}
let idx = 4 * (level_cache_off + x);
// `0.., ..2` is for Y
let lvl = &mut *level_cache.index_mut((idx + 0.., ..2));
lvl[0] = filter_level[0][0][0];
lvl[1] = filter_level[1][0][0];
}
level_cache_off += b4_stride;
}
Expand Down Expand Up @@ -459,16 +454,11 @@ pub(crate) fn rav1d_create_lf_mask_intra(
let mut level_cache_off = (by >> ss_ver) * b4_stride + (bx >> ss_hor);
for _y in 0..cbh4 {
for x in 0..cbw4 {
let idx = level_cache_off + x;
assert!(idx < level_cache.len());
// SAFETY: The UV portion of this element (indices 2 and 3) is not concurrently
// accessed by any other threads and the assert above ensures that it is in
// bounds.
unsafe {
let cur = level_cache.as_mut_ptr().add(idx);
(*cur)[2] = filter_level[2][0][0];
(*cur)[3] = filter_level[3][0][0];
}
let idx = 4 * (level_cache_off + x);
// `2.., ..2` is for UV
let lvl = &mut *level_cache.index_mut((idx + 2.., ..2));
lvl[0] = filter_level[2][0][0];
lvl[1] = filter_level[3][0][0];
}
level_cache_off += b4_stride;
}
Expand All @@ -491,7 +481,7 @@ pub(crate) fn rav1d_create_lf_mask_intra(
#[inline(never)]
pub(crate) fn rav1d_create_lf_mask_inter(
lflvl: &Av1Filter,
level_cache: &DisjointMut<Vec<[u8; 4]>>,
level_cache: &DisjointMut<Vec<u8>>,
b4_stride: ptrdiff_t,
filter_level: &Align16<[[[u8; 2]; 8]; 4]>,
r#ref: usize,
Expand Down Expand Up @@ -524,16 +514,11 @@ pub(crate) fn rav1d_create_lf_mask_inter(
let mut level_cache_off = by * b4_stride + bx;
for _y in 0..bh4 {
for x in 0..bw4 {
let idx = level_cache_off + x;
assert!(idx < level_cache.len());
// SAFETY: The Y portion of this element (indices 0 and 1) is not
// concurrently accessed by any other threads and the assert above ensures
// that it is in bounds.
unsafe {
let cur = level_cache.as_mut_ptr().add(idx);
(*cur)[0] = filter_level[0][r#ref][is_gmv];
(*cur)[1] = filter_level[1][r#ref][is_gmv];
}
let idx = 4 * (level_cache_off + x);
// `0.., ..2` is for Y
let lvl = &mut *level_cache.index_mut((idx + 0.., ..2));
lvl[0] = filter_level[0][r#ref][is_gmv];
lvl[1] = filter_level[1][r#ref][is_gmv];
}
level_cache_off += b4_stride;
}
Expand Down Expand Up @@ -578,16 +563,11 @@ pub(crate) fn rav1d_create_lf_mask_inter(
let mut level_cache_off = (by >> ss_ver) * b4_stride + (bx >> ss_hor);
for _y in 0..cbh4 {
for x in 0..cbw4 {
let idx = level_cache_off + x;
assert!(idx < level_cache.len());
// SAFETY: The UV part of this element (indices 2 and 3) is not concurrently
// accessed by any other threads and the assert above ensures that it is in
// bounds.
unsafe {
let cur = level_cache.as_mut_ptr().add(idx);
(*cur)[2] = filter_level[2][r#ref][is_gmv];
(*cur)[3] = filter_level[3][r#ref][is_gmv];
}
let idx = 4 * (level_cache_off + x);
// `2.., ..2` is for UV
let lvl = &mut *level_cache.index_mut((idx + 2.., ..2));
lvl[0] = filter_level[2][r#ref][is_gmv];
lvl[1] = filter_level[3][r#ref][is_gmv];
}
level_cache_off += b4_stride;
}
Expand Down
Loading

0 comments on commit 5cf49e0

Please sign in to comment.