use sqrt intrinsic for fastmath, implemented Hypot for Neon

skewballfox · skewballfox · commit 11db4447e95e · 2024-11-16T13:48:26.000-06:00
diff --git a/cfavml/src/danger/impl_neon.rs b/cfavml/src/danger/impl_neon.rs
@@ -2,7 +2,7 @@ use core::arch::aarch64::*;
 use core::iter::zip;
 use core::mem;
 
-use crate::danger::{DenseLane, SimdRegister};
+use super::core_simd_api::{DenseLane, SimdRegister,Hypot};
 use crate::math::{AutoMath, Math};
 
 const BITS_8_CAPACITY: usize = 16;
@@ -146,11 +146,48 @@ impl SimdRegister<f32> for Neon {
         vst1q_f32(mem, reg)
     }
 }
-
+const EXPONENT_MASK_F32: u32 = 0x7F80_0000; // the 8 exponent bits of an f32 (same bits as +inf)
+const MANTISSA_MASK_F32: u32 = 0x007F_FFFF; // the 23 mantissa (fraction) bits of an f32
 impl Hypot<f32> for Neon {
     #[inline(always)]
-    unsafe fn hypot(l1: Self::Register, l2: Self::Register) -> Self::Register {
-        todo!()
+    unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register {
+        // Lane-wise sqrt(x^2 + y^2), computed on |x| and |y| rescaled by a power of 2 before squaring
+        let (x, y) = (vabsq_f32(x), vabsq_f32(y));
+
+        // hi = lane-wise max(|x|, |y|), lo = lane-wise min(|x|, |y|)
+        let (hi, lo) = (vmaxq_f32(x, y), vminq_f32(x, y));
+        let exponent_mask = vdupq_n_u32(EXPONENT_MASK_F32);
+        let mantissa_mask = vdupq_n_u32(MANTISSA_MASK_F32);
+
+        // keep only the exponent bits of hi, i.e. round hi down to a power of 2
+        let hi2p =
+            vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(hi), exponent_mask));
+        // the values under the root are scaled by 1 / hi2p. since hi2p is a power of 2, its
+        // reciprocal is obtained by doubling it and XOR-ing the bits with the exponent mask
+        let scale = vreinterpretq_f32_u32(veorq_u32(
+            vreinterpretq_u32_f32(vaddq_f32(hi2p, hi2p)),
+            exponent_mask,
+        ));
+        // mask of the lanes where hi > f32::MIN_POSITIVE (the normal hi values)
+        let mask = vcgtq_f32(hi, vdupq_n_f32(f32::MIN_POSITIVE));
+        // in the remaining lanes, replace hi2p with the minimum positive normal value
+        let hi2p = vbslq_f32(mask, hi2p, vdupq_n_f32(f32::MIN_POSITIVE));
+        // ... and replace scale with the reciprocal of the minimum positive normal value
+        let scale = vbslq_f32(mask, scale, vdupq_n_f32(1.0 / f32::MIN_POSITIVE));
+        // mask of the lanes where hi < f32::MIN_POSITIVE (the subnormal hi values, and zero)
+        let mask = vcltq_f32(hi, vdupq_n_f32(f32::MIN_POSITIVE));
+        // hi2p kept the exponent bits of hi, so hi / hi2p is hi's mantissa with the exponent of 1.0
+        let hi_scaled = vreinterpretq_f32_u32(vorrq_u32(
+            vandq_u32(vreinterpretq_u32_f32(hi), mantissa_mask),
+            vreinterpretq_u32_f32(vdupq_n_f32(1.0)),
+        ));
+        // subnormal lanes have no implicit leading 1, so remove the 1.0 that was OR-ed in above
+        let hi_scaled =
+            vbslq_f32(mask, vsubq_f32(hi_scaled, vdupq_n_f32(1.0)), hi_scaled);
+        // finally: hi2p * sqrt(hi_scaled^2 + lo_scaled^2); vfmaq takes the accumulator first (a + b * c)
+        let hi_scaled = vmulq_f32(hi_scaled, hi_scaled);
+        let lo_scaled = vmulq_f32(lo, scale);
+        vmulq_f32(hi2p, vsqrtq_f32(vfmaq_f32(hi_scaled, lo_scaled, lo_scaled)))
     }
 }
 
@@ -286,6 +323,49 @@ impl SimdRegister<f64> for Neon {
     }
 }
 
+impl Hypot<f64> for Neon {
+    #[inline(always)]
+    unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register {
+        // Lane-wise sqrt(x^2 + y^2), computed on |x| and |y| rescaled by a power of 2 before squaring
+        let (x, y) = (vabsq_f64(x), vabsq_f64(y));
+
+        // hi = lane-wise max(|x|, |y|), lo = lane-wise min(|x|, |y|)
+        let (hi, lo) = (vmaxq_f64(x, y), vminq_f64(x, y));
+        let exponent_mask = vdupq_n_u64(f64::INFINITY.to_bits());
+        let mantissa_mask = vdupq_n_u64(f64::MIN_POSITIVE.to_bits() - 1); // all 52 mantissa bits set
+
+        // keep only the exponent bits of hi, i.e. round hi down to a power of 2
+        let hi2p =
+            vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(hi), exponent_mask));
+        // the values under the root are scaled by 1 / hi2p. since hi2p is a power of 2, its
+        // reciprocal is obtained by doubling it and XOR-ing the bits with the exponent mask
+        let scale = vreinterpretq_f64_u64(veorq_u64(
+            vreinterpretq_u64_f64(vaddq_f64(hi2p, hi2p)),
+            exponent_mask,
+        ));
+        // mask of the lanes where hi > f64::MIN_POSITIVE (the normal hi values)
+        let mask = vcgtq_f64(hi, vdupq_n_f64(f64::MIN_POSITIVE));
+        // in the remaining lanes, replace hi2p with the minimum positive normal value
+        let hi2p = vbslq_f64(mask, hi2p, vdupq_n_f64(f64::MIN_POSITIVE));
+        // ... and replace scale with the reciprocal of the minimum positive normal value
+        let scale = vbslq_f64(mask, scale, vdupq_n_f64(1.0 / f64::MIN_POSITIVE));
+        // mask of the lanes where hi < f64::MIN_POSITIVE (the subnormal hi values, and zero)
+        let mask = vcltq_f64(hi, vdupq_n_f64(f64::MIN_POSITIVE));
+        // hi2p kept the exponent bits of hi, so hi / hi2p is hi's mantissa with the exponent of 1.0
+        let hi_scaled = vreinterpretq_f64_u64(vorrq_u64(
+            vandq_u64(vreinterpretq_u64_f64(hi), mantissa_mask),
+            vreinterpretq_u64_f64(vdupq_n_f64(1.0)),
+        ));
+        // subnormal lanes have no implicit leading 1, so remove the 1.0 that was OR-ed in above
+        let hi_scaled =
+            vbslq_f64(mask, vsubq_f64(hi_scaled, vdupq_n_f64(1.0)), hi_scaled);
+        // finally: hi2p * sqrt(hi_scaled^2 + lo_scaled^2); vfmaq takes the accumulator first (a + b * c)
+        let hi_scaled = vmulq_f64(hi_scaled, hi_scaled);
+        let lo_scaled = vmulq_f64(lo, scale);
+        vmulq_f64(hi2p, vsqrtq_f64(vfmaq_f64(hi_scaled, lo_scaled, lo_scaled)))
+    }
+}
+
 impl SimdRegister<i8> for Neon {
     type Register = int8x16_t;
 
diff --git a/cfavml/src/danger/impl_test.rs b/cfavml/src/danger/impl_test.rs
@@ -326,7 +326,7 @@ unsafe fn test_sample<T, R>(
     Standard: Distribution<T>,
 {
     {
-        let (_std_result, std_sum) = get_std_results(&sample1, &sample2);
+        let (std_result, std_sum) = get_std_results(&sample1, &sample2);
         let l1 = R::load(sample1.as_ptr());
         let l2 = R::load(sample2.as_ptr());
         let res = R::hypot(l1, l2);
@@ -335,9 +335,12 @@ unsafe fn test_sample<T, R>(
             AutoMath::is_close(std_sum, res_sum),
             "Hypot and sum test failed on single task"
         );
+        let mut res_vec = vec![T::zero(); R::elements_per_lane()];
+        R::write(res_vec.as_mut_ptr(), res);
+        test_diff_ulps(std_result, res_vec);
     }
     {
-        let (_std_result, std_sum) = get_std_results(&large_sample_l1, &large_sample_l2);
+        let (std_result, std_sum) = get_std_results(&large_sample_l1, &large_sample_l2);
         let l1 = R::load_dense(large_sample_l1.as_ptr());
         let l2 = R::load_dense(large_sample_l2.as_ptr());
         let res = R::hypot_dense(l1, l2);
@@ -348,6 +351,9 @@ unsafe fn test_sample<T, R>(
             AutoMath::is_close(std_sum, res_sum),
             "Hypot and sum test failed on dense task"
         );
+        let mut res_vec = vec![T::zero(); R::elements_per_dense()];
+        R::write_dense(res_vec.as_mut_ptr(), res);
+        test_diff_ulps(std_result, res_vec);
     }
 }
 
@@ -364,3 +370,17 @@ where
     let sum = std_result.iter().fold(AutoMath::zero(), |a, b| a + *b);
     (std_result, sum)
 }
+
+fn test_diff_ulps<T>(a: Vec<T>, b: Vec<T>) // panics unless a and b agree pairwise to within 1 ulp
+where
+    T: Float + FloatConst,
+{
+    a.iter().zip(b.iter()).for_each(|(a, b)| { // zip stops at the shorter input; lengths are not compared
+        let (a_mant, a_exp, a_sign) = a.integer_decode(); // presumably (mantissa, exponent, sign), going by the binding names
+        let (b_mant, b_exp, b_sign) = b.integer_decode();
+        assert!(a_sign == b_sign);
+        assert!(a_exp == b_exp); // NOTE(review): values 1 ulp apart across a power-of-two boundary differ in exponent and fail here
+        let dist = a_mant as i64 - b_mant as i64;
+        assert!(dist.abs() < 2, "Greater than 1 ulp difference: {dist}");
+    });
+}
diff --git a/cfavml/src/math/fast_math.rs b/cfavml/src/math/fast_math.rs
@@ -28,7 +28,7 @@ impl Math<f32> for FastMath {
 
     #[inline(always)]
     fn sqrt(a: f32) -> f32 {
-        StdMath::sqrt(a)
+        core::intrinsics::sqrtf32(a) // NOTE(review): core::intrinsics is nightly-only; confirm the crate enables `core_intrinsics`
     }
 
     #[inline(always)]
@@ -139,7 +139,7 @@ impl Math<f64> for FastMath {
 
     #[inline(always)]
     fn sqrt(a: f64) -> f64 {
-        StdMath::sqrt(a)
+        core::intrinsics::sqrtf64(a) // NOTE(review): core::intrinsics is nightly-only; confirm the crate enables `core_intrinsics`
     }
 
     #[inline(always)]

Original file line number	Diff line number	Diff line change
`@@ -28,7 +28,7 @@ impl Math<f32> for FastMath {`
`28`	`28`
`29`	`29`	`#[inline(always)]`
`30`	`30`	`fn sqrt(a: f32) -> f32 {`
`31`		`- StdMath::sqrt(a)`
	`31`	`+ core::intrinsics::sqrtf32(a)`
`32`	`32`	`}`
`33`	`33`
`34`	`34`	`#[inline(always)]`
`@@ -139,7 +139,7 @@ impl Math<f64> for FastMath {`
`139`	`139`
`140`	`140`	`#[inline(always)]`
`141`	`141`	`fn sqrt(a: f64) -> f64 {`
`142`		`- StdMath::sqrt(a)`
	`142`	`+ core::intrinsics::sqrtf64(a)`
`143`	`143`	`}`
`144`	`144`
`145`	`145`	`#[inline(always)]`