ChillFish8 · ChillFish8 · Jan 18, 2025 · Nov 9, 2024 · Nov 16, 2024 · Nov 16, 2024
diff --git a/cfavml/src/danger/core_simd_api.rs b/cfavml/src/danger/core_simd_api.rs
@@ -405,3 +405,19 @@ pub trait SimdRegister<T: Copy> {
         Self::write(mem.add(Self::elements_per_lane() * 7), lane.h);
     }
 }
+
+pub trait Hypot<T>: SimdRegister<T>
+where
+    T: Copy,
+{
+    /// SIMD Variant of the std hypot function. Computes the distance between the origin
+    /// and a point (`x`, `y`) on the Euclidean plane.
+    unsafe fn hypot(a: Self::Register, b: Self::Register) -> Self::Register;
+    #[inline(always)]
+    unsafe fn hypot_dense(
+        a: DenseLane<Self::Register>,
+        b: DenseLane<Self::Register>,
+    ) -> DenseLane<Self::Register> {
+        apply_dense!(Self::hypot, a, b)
+    }
+}
diff --git a/cfavml/src/danger/export_hypot.rs b/cfavml/src/danger/export_hypot.rs
@@ -0,0 +1,212 @@
+//! Common arithmetic operations
+//!
+//! I.e. Add, Sub, Mul, Div...
+
+use crate::buffer::WriteOnlyBuffer;
+use crate::danger::core_simd_api::Hypot;
+use crate::danger::{generic_hypot_vertical, SimdRegister};
+use crate::math::{AutoMath, Math, Numeric};
+use crate::mem_loader::{IntoMemLoader, MemLoader};
+
+macro_rules! define_hypot_impls {
+    (
+        $hypot_name:ident,
+        $imp:ident $(,)?
+        $(target_features = $($feat:expr $(,)?)+)?
+    ) => {
+        #[inline]
+        $(#[target_feature($(enable = $feat, )*)])*
+        #[doc = include_str!("../export_docs/arithmetic_hypot_vertical.md")]
+        $(
+
+            #[doc = concat!("- ", $("**`+", $feat, "`** ", )*)]
+            #[doc = "CPU features are available at runtime. Running on hardware _without_ this feature available will cause immediate UB."]
+        )*
+        pub unsafe fn $hypot_name<T, B1, B2, B3>(
+            a: B1,
+            b: B2,
+            result: &mut [B3],
+        )
+        where
+            T: Copy,
+            B1: IntoMemLoader<T>,
+            B1::Loader: MemLoader<Value = T>,
+            B2: IntoMemLoader<T>,
+            B2::Loader: MemLoader<Value = T>,
+            crate::danger::$imp: SimdRegister<T>+ Hypot<T>,
+            AutoMath: Math<T> + Numeric<T>,
+            for<'a> &'a mut [B3]: WriteOnlyBuffer<Item = T>,
+        {
+            generic_hypot_vertical::<T, crate::danger::$imp, AutoMath, B1, B2, B3>(
+                a,
+                b,
+                result,
+            )
+        }
+    };
+}
+
+define_hypot_impls!(generic_fallback_hypot_vertical, Fallback,);
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+define_hypot_impls!(generic_avx2_hypot_vertical, Avx2, target_features = "avx2");
+#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+define_hypot_impls!(
+    generic_avx2fma_hypot_vertical,
+    Avx2Fma,
+    target_features = "avx2",
+    "fma"
+);
+#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "nightly"))]
+define_hypot_impls!(
+    generic_avx512_hypot_vertical,
+    Avx512,
+    target_features = "avx512f",
+    "avx512bw"
+);
+#[cfg(target_arch = "aarch64")]
+define_hypot_impls!(generic_neon_hypot_vertical, Neon, target_features = "neon");
+
+#[cfg(test)]
+mod tests {
+    use num_traits::{Float, FloatConst};
+
+    use super::*;
+
+    macro_rules! define_inner_test {
+        ($variant:ident, op = $op:ident, ty = $t:ident) => {
+            paste::paste! {
+                #[test]
+                fn [< $variant _ $op _value_ $t >]() {
+                    let (l1, _) = crate::test_utils::get_sample_vectors::<$t>(533);
+
+                    let mut result = vec![$t::default(); 533];
+                    unsafe { [< $variant _ $op _vertical >](&l1, 2 as $t, &mut result) };
+
+                    let expected = l1.iter()
+                        .copied()
+                        .map(|v| AutoMath::$op(v, 2 as $t))
+                        .collect::<Vec<_>>();
+
+                    for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) {
+                        let ulps_diff = get_diff_ulps(*expected, *actual);
+                        assert!(
+                            ulps_diff.abs() <= 1,
+                            "result differs by more than 1 ULP:\n initial inputs: {}, 2\n  expected: {} actual: {}\nulps diff: {}",
+                            initial, expected, actual, ulps_diff
+                        );
+
+                    }
+
+                }
+
+                #[test]
+                fn [< $variant _ $op _vector_ $t >]() {
+                    let (l1, l2) = crate::test_utils::get_sample_vectors::<$t>(533);
+
+                    let mut result = vec![$t::default(); 533];
+                    unsafe { [< $variant _ $op _vertical >](&l1, &l2, &mut result) };
+
+                    let expected = l1.iter()
+                        .copied()
+                        .zip(l2.iter().copied())
+                        .map(|(a, b)| AutoMath::$op(a, b))
+                        .collect::<Vec<_>>();
+                    for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) {
+                        let ulps_diff = get_diff_ulps(*expected, *actual);
+                        assert!(
+                            ulps_diff.abs() <= 1,
+                            "result differs by more than 1 ULP:\n initial inputs: {}, 2\n  expected: {} actual: {}\nulps diff: {}",
+                            initial, expected, actual, ulps_diff
+                        );
+                    }
+                }
+                #[test]
+                fn [< $variant _ $op _subnormal_value_ $t >]() {
+                    let (l1, _) = crate::test_utils::get_subnormal_sample_vectors::<$t>(533);
+
+                    let mut result = vec![$t::default(); 533];
+                    unsafe { [< $variant _ $op _vertical >](&l1, 2 as $t, &mut result) };
+
+                    let expected = l1.iter()
+                        .copied()
+                        .map(|v| AutoMath::$op(v, 2 as $t))
+                        .collect::<Vec<_>>();
+
+                    for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) {
+                        let ulps_diff = get_diff_ulps(*expected, *actual);
+                        assert!(
+                            ulps_diff.abs() <= 1,
+                            "result differs by more than 1 ULP:\n initial inputs: {}, 2\n  expected: {} actual: {}\nulps diff: {}",
+                            initial, expected, actual, ulps_diff
+                        );
+
+                    }
+
+                }
+
+                #[test]
+                fn [< $variant _ $op _subnormal_vector_ $t >]() {
+                    let (l1, l2) = crate::test_utils::get_subnormal_sample_vectors::<$t>(533);
+
+                    let mut result = vec![$t::default(); 533];
+                    unsafe { [< $variant _ $op _vertical >](&l1, &l2, &mut result) };
+
+                    let expected = l1.iter()
+                        .copied()
+                        .zip(l2.iter().copied())
+                        .map(|(a, b)| AutoMath::$op(a, b))
+                        .collect::<Vec<_>>();
+                    for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) {
+                        let ulps_diff = get_diff_ulps(*expected, *actual);
+                        assert!(
+                            ulps_diff.abs() <= 1,
+                            "result differs by more than 1 ULP:\n initial inputs: {}, 2\n  expected: {} actual: {}\nulps diff: {}",
+                            initial, expected, actual, ulps_diff
+                        );
+                    }
+                }
+            }
+        };
+    }
+
+    fn get_diff_ulps<T>(a: T, b: T) -> i64
+    where
+        T: Float + FloatConst,
+    {
+        let (a_mant, a_exp, a_sign) = a.integer_decode();
+        let (b_mant, b_exp, b_sign) = b.integer_decode();
+        assert!(a_sign == b_sign);
+        assert!(a_exp == b_exp);
+        a_mant as i64 - b_mant as i64
+    }
+
+    macro_rules! define_numeric_test {
+        ($variant:ident, types = $($t:ident $(,)?)+) => {
+            $(
+                define_inner_test!($variant, op = hypot, ty = $t);
+
+            )*
+        };
+    }
+
+    define_numeric_test!(generic_fallback, types = f32, f64,);
+    #[cfg(all(
+        any(target_arch = "x86", target_arch = "x86_64"),
+        target_feature = "avx2"
+    ))]
+    define_numeric_test!(generic_avx2, types = f32, f64,);
+    #[cfg(all(
+        any(target_arch = "x86", target_arch = "x86_64"),
+        target_feature = "avx2",
+        target_feature = "fma"
+    ))]
+    define_numeric_test!(generic_avx2fma, types = f32, f64,);
+    #[cfg(all(
+        any(target_arch = "x86", target_arch = "x86_64"),
+        feature = "nightly",
+        target_feature = "avx512f"
+    ))]
+    define_numeric_test!(generic_avx512, types = f32, f64,);
+    #[cfg(target_arch = "aarch64")]
+    define_numeric_test!(generic_neon, types = f32, f64,);
+}
diff --git a/cfavml/src/danger/impl_avx2.rs b/cfavml/src/danger/impl_avx2.rs
@@ -5,7 +5,7 @@ use core::arch::x86_64::*;
 use core::iter::zip;
 use core::mem;
 
-use super::core_simd_api::{DenseLane, SimdRegister};
+use super::core_simd_api::{DenseLane, Hypot, SimdRegister};
 use crate::apply_dense;
 
 /// AVX2 enabled SIMD operations.
@@ -169,6 +169,59 @@ impl SimdRegister<f32> for Avx2 {
     }
 }
 
+impl Hypot<f32> for Avx2 {
+    //b' * sqrt((a*b_hat)^2 + (b*b_hat)^2), where |b| > |a|, b'=(2^n<b), b_hat=1/b'
+    #[inline(always)]
+    unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register {
+        // convert the inputs to absolute values
+        let (x, y) = (
+            _mm256_andnot_ps(_mm256_set1_ps(-0.0), x),
+            _mm256_andnot_ps(_mm256_set1_ps(-0.0), y),
+        );
+        // find the max and min of the two inputs
+        let (hi, lo) = (
+            <Avx2 as SimdRegister<f32>>::max(x, y),
+            <Avx2 as SimdRegister<f32>>::min(x, y),
+        );
+        //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa
+        let exponent_mask = _mm256_set1_ps(f32::INFINITY);
+        //The mantissa mask is the inverse of the exponent mask
+        let mantissa_mask = _mm256_sub_ps(
+            _mm256_set1_ps(f32::MIN_POSITIVE),
+            _mm256_set1_ps(f32::from_bits(1_u32)),
+        );
+        // round the hi values down to the nearest power of 2
+        let hi2p = _mm256_and_ps(hi, exponent_mask);
+        // we scale the values inside the root by the reciprocal of hi2p. since it's a power of 2,
+        // we can double it and xor it with the exponent mask
+        let scale = _mm256_xor_ps(_mm256_add_ps(hi2p, hi2p), exponent_mask);
+
+        // create a mask that matches the normal hi values
+        let mask = _mm256_cmp_ps::<_CMP_GT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE));
+        // replace the subnormal values of hi2p with the minimum positive normal value
+        let hi2p = _mm256_blendv_ps(_mm256_set1_ps(f32::MIN_POSITIVE), hi2p, mask);
+        // replace the subnormal values of scale with the reciprocal of the minimum positive normal value
+        let scale =
+            _mm256_blendv_ps(_mm256_set1_ps(1.0 / f32::MIN_POSITIVE), scale, mask);
+        // create a mask that matches the subnormal hi values
+        let mask = _mm256_cmp_ps::<_CMP_LT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE));
+        // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1
+        let hi_scaled =
+            _mm256_or_ps(_mm256_and_ps(hi, mantissa_mask), _mm256_set1_ps(1.0));
+        // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values
+        let hi_scaled = _mm256_blendv_ps(
+            hi_scaled,
+            _mm256_sub_ps(hi_scaled, _mm256_set1_ps(1.0)),
+            mask,
+        );
+
+        let hi_scaled = _mm256_mul_ps(hi_scaled, hi_scaled);
+        let lo_scaled = _mm256_mul_ps(lo, scale);
+        let lo_scaled = _mm256_mul_ps(lo_scaled, lo_scaled);
+        _mm256_mul_ps(hi2p, _mm256_sqrt_ps(_mm256_add_ps(lo_scaled, hi_scaled)))
+    }
+}
+
 impl SimdRegister<f64> for Avx2 {
     type Register = __m256d;
 
@@ -2724,3 +2777,55 @@ impl SimdRegister<u64> for Avx2 {
         _mm256_storeu_si256(mem.cast(), reg)
     }
 }
+
+impl Hypot<f64> for Avx2 {
+    #[inline(always)]
+    unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register {
+        // convert the inputs to absolute values
+        let (x, y) = (
+            _mm256_andnot_pd(_mm256_set1_pd(-0.0), x),
+            _mm256_andnot_pd(_mm256_set1_pd(-0.0), y),
+        );
+        // find the max and min of the two inputs
+        let (hi, lo) = (
+            <Avx2 as SimdRegister<f64>>::max(x, y),
+            <Avx2 as SimdRegister<f64>>::min(x, y),
+        );
+        //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa
+        let exponent_mask = _mm256_set1_pd(f64::INFINITY);
+        //The mantissa mask is the inverse of the exponent mask
+        let mantissa_mask = _mm256_sub_pd(
+            _mm256_set1_pd(f64::MIN_POSITIVE),
+            _mm256_set1_pd(f64::from_bits(1_u64)),
+        );
+        // round the hi values down to the nearest power of 2
+        let hi2p = _mm256_and_pd(hi, exponent_mask);
+        // we scale the values inside the root by the reciprocal of hi2p. since it's a power of 2,
+        // we can double it and xor it with the exponent mask
+        let scale = _mm256_xor_pd(_mm256_add_pd(hi2p, hi2p), exponent_mask);
+
+        // create a mask that matches the normal hi values
+        let mask = _mm256_cmp_pd::<_CMP_GT_OQ>(hi, _mm256_set1_pd(f64::MIN_POSITIVE));
+        // replace the subnormal values of hi2p with the minimum positive normal value
+        let hi2p = _mm256_blendv_pd(_mm256_set1_pd(f64::MIN_POSITIVE), hi2p, mask);
+        // replace the subnormal values of scale with the reciprocal of the minimum positive normal value
+        let scale =
+            _mm256_blendv_pd(_mm256_set1_pd(1.0 / f64::MIN_POSITIVE), scale, mask);
+        // create a mask that matches the subnormal hi values
+        let mask = _mm256_cmp_pd::<_CMP_LT_OQ>(hi, _mm256_set1_pd(f64::MIN_POSITIVE));
+        // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1
+        let hi_scaled =
+            _mm256_or_pd(_mm256_and_pd(hi, mantissa_mask), _mm256_set1_pd(1.0));
+        // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values
+        let hi_scaled = _mm256_blendv_pd(
+            hi_scaled,
+            _mm256_sub_pd(hi_scaled, _mm256_set1_pd(1.0)),
+            mask,
+        );
+
+        let hi_scaled = _mm256_mul_pd(hi_scaled, hi_scaled);
+        let lo_scaled = _mm256_mul_pd(lo, scale);
+        let lo_scaled = _mm256_mul_pd(lo_scaled, lo_scaled);
+        _mm256_mul_pd(hi2p, _mm256_sqrt_pd(_mm256_add_pd(lo_scaled, hi_scaled)))
+    }
+}