From a3433070578fbe1e4b02fe8f32c8a6264416c854 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 9 Nov 2024 16:47:23 -0600 Subject: [PATCH 01/13] hypotenuse function for f32, f64 --- cfavml/src/danger/core_simd_api.rs | 16 ++++ cfavml/src/danger/impl_avx2.rs | 107 ++++++++++++++++++++++++++- cfavml/src/danger/impl_avx2fma.rs | 112 +++++++++++++++++++++++++++- cfavml/src/danger/impl_avx512.rs | 115 ++++++++++++++++++++++++++++- cfavml/src/danger/impl_fallback.rs | 16 +++- cfavml/src/danger/impl_neon.rs | 7 ++ cfavml/src/danger/impl_test.rs | 89 +++++++++++++++++++++- cfavml/src/danger/test_suite.rs | 20 +++++ cfavml/src/math/default.rs | 21 ++++++ cfavml/src/math/mod.rs | 2 +- cfavml/src/test_utils.rs | 37 +++++++++- 11 files changed, 535 insertions(+), 7 deletions(-) diff --git a/cfavml/src/danger/core_simd_api.rs b/cfavml/src/danger/core_simd_api.rs index cbc344e..e2fa9c0 100644 --- a/cfavml/src/danger/core_simd_api.rs +++ b/cfavml/src/danger/core_simd_api.rs @@ -405,3 +405,19 @@ pub trait SimdRegister { Self::write(mem.add(Self::elements_per_lane() * 7), lane.h); } } + +pub trait Hypot: SimdRegister +where + T: Copy, +{ + /// SIMD Variant of the std hypot function. Computes the distance between the origin + /// and a point (`x`, `y`) on the Euclidean plane. + unsafe fn hypot(a: Self::Register, b: Self::Register) -> Self::Register; + #[inline(always)] + unsafe fn hypot_dense( + a: DenseLane, + b: DenseLane, + ) -> DenseLane { + apply_dense!(Self::hypot, a, b) + } +} diff --git a/cfavml/src/danger/impl_avx2.rs b/cfavml/src/danger/impl_avx2.rs index e8f7303..56ca639 100644 --- a/cfavml/src/danger/impl_avx2.rs +++ b/cfavml/src/danger/impl_avx2.rs @@ -5,7 +5,7 @@ use core::arch::x86_64::*; use core::iter::zip; use core::mem; -use super::core_simd_api::{DenseLane, SimdRegister}; +use super::core_simd_api::{DenseLane, Hypot, SimdRegister}; use crate::apply_dense; /// AVX2 enabled SIMD operations. @@ -169,6 +169,59 @@ impl SimdRegister for Avx2 { } } +impl Hypot for Avx2 { + //b' * sqrt((a*b_hat)^2 + (b*b_hat)^2), where |b| > |a|, b'=(2^n Self::Register { + // convert the inputs to absolute values + let (x, y) = ( + _mm256_andnot_ps(_mm256_set1_ps(-0.0), x), + _mm256_andnot_ps(_mm256_set1_ps(-0.0), y), + ); + // find the max and min of the two inputs + let (hi, lo) = ( + >::max(x, y), + >::min(x, y), + ); + //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa + let exponent_mask = _mm256_set1_ps(f32::INFINITY); + //The mantissa mask is the inverse of the exponent mask + let mantissa_mask = _mm256_sub_ps( + _mm256_set1_ps(f32::MIN_POSITIVE), + _mm256_set1_ps(core::mem::transmute(1_u32)), + ); + // round the hi values down to the nearest power of 2 + let hi2p = _mm256_and_ps(hi, exponent_mask); + // we scale the values inside the root by the reciprocal of hi2p. 
since it's a power of 2, + // we can double it and xor it with the exponent mask + let scale = _mm256_xor_ps(_mm256_add_ps(hi2p, hi2p), exponent_mask); + + // create a mask that matches the normal hi values + let mask = _mm256_cmp_ps::<_CMP_GT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE)); + // replace the subnormal values of hi2p with the minimum positive normal value + let hi2p = _mm256_blendv_ps(_mm256_set1_ps(f32::MIN_POSITIVE), hi2p, mask); + // replace the subnormal values of scale with the reciprocal of the minimum positive normal value + let scale = + _mm256_blendv_ps(_mm256_set1_ps(1.0 / f32::MIN_POSITIVE), scale, mask); + // create a mask that matches the subnormal hi values + let mask = _mm256_cmp_ps::<_CMP_LT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE)); + // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1 + let hi_scaled = + _mm256_or_ps(_mm256_and_ps(hi, mantissa_mask), _mm256_set1_ps(1.0)); + // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values + let hi_scaled = _mm256_blendv_ps( + hi_scaled, + _mm256_sub_ps(hi_scaled, _mm256_set1_ps(1.0)), + mask, + ); + + let hi_scaled = _mm256_mul_ps(hi_scaled, hi_scaled); + let lo_scaled = _mm256_mul_ps(lo, scale); + let lo_scaled = _mm256_mul_ps(lo_scaled, lo_scaled); + _mm256_mul_ps(hi2p, _mm256_sqrt_ps(_mm256_add_ps(lo_scaled, hi_scaled))) + } +} + impl SimdRegister for Avx2 { type Register = __m256d; @@ -2724,3 +2777,55 @@ impl SimdRegister for Avx2 { _mm256_storeu_si256(mem.cast(), reg) } } + +impl Hypot for Avx2 { + #[inline(always)] + unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { + // convert the inputs to absolute values + let (x, y) = ( + _mm256_andnot_pd(_mm256_set1_pd(-0.0), x), + _mm256_andnot_pd(_mm256_set1_pd(-0.0), y), + ); + // find the max and min of the two inputs + let (hi, lo) = ( + >::max(x, y), + >::min(x, y), + ); + //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa + let exponent_mask = _mm256_set1_pd(f64::INFINITY); + //The mantissa mask is the inverse of the exponent mask + let mantissa_mask = _mm256_sub_pd( + _mm256_set1_pd(f64::MIN_POSITIVE), + _mm256_set1_pd(core::mem::transmute(1_u64)), + ); + // round the hi values down to the nearest power of 2 + let hi2p = _mm256_and_pd(hi, exponent_mask); + // we scale the values inside the root by the reciprocal of hi2p. 
since it's a power of 2, + // we can double it and xor it with the exponent mask + let scale = _mm256_xor_pd(_mm256_add_pd(hi2p, hi2p), exponent_mask); + + // create a mask that matches the normal hi values + let mask = _mm256_cmp_pd::<_CMP_GT_OQ>(hi, _mm256_set1_pd(f64::MIN_POSITIVE)); + // replace the subnormal values of hi2p with the minimum positive normal value + let hi2p = _mm256_blendv_pd(_mm256_set1_pd(f64::MIN_POSITIVE), hi2p, mask); + // replace the subnormal values of scale with the reciprocal of the minimum positive normal value + let scale = + _mm256_blendv_pd(_mm256_set1_pd(1.0 / f64::MIN_POSITIVE), scale, mask); + // create a mask that matches the subnormal hi values + let mask = _mm256_cmp_pd::<_CMP_LT_OQ>(hi, _mm256_set1_pd(f64::MIN_POSITIVE)); + // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1 + let hi_scaled = + _mm256_or_pd(_mm256_and_pd(hi, mantissa_mask), _mm256_set1_pd(1.0)); + // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values + let hi_scaled = _mm256_blendv_pd( + hi_scaled, + _mm256_sub_pd(hi_scaled, _mm256_set1_pd(1.0)), + mask, + ); + + let hi_scaled = _mm256_mul_pd(hi_scaled, hi_scaled); + let lo_scaled = _mm256_mul_pd(lo, scale); + let lo_scaled = _mm256_mul_pd(lo_scaled, lo_scaled); + _mm256_mul_pd(hi2p, _mm256_sqrt_pd(_mm256_add_pd(lo_scaled, hi_scaled))) + } +} diff --git a/cfavml/src/danger/impl_avx2fma.rs b/cfavml/src/danger/impl_avx2fma.rs index f24cfcd..83da6a6 100644 --- a/cfavml/src/danger/impl_avx2fma.rs +++ b/cfavml/src/danger/impl_avx2fma.rs @@ -3,7 +3,7 @@ use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; -use super::core_simd_api::SimdRegister; +use super::core_simd_api::{Hypot, SimdRegister}; use super::impl_avx2::Avx2; /// AVX2 & FMA enabled SIMD operations. @@ -119,6 +119,61 @@ impl SimdRegister for Avx2Fma { } } +impl Hypot for Avx2Fma { + //b' * sqrt((a*b_hat)^2 + (b*b_hat)^2), where |b| > |a|, b'=(2^n Self::Register { + // convert the inputs to absolute values + let (x, y) = ( + _mm256_andnot_ps(_mm256_set1_ps(-0.0), x), + _mm256_andnot_ps(_mm256_set1_ps(-0.0), y), + ); + // find the max and min of the two inputs + let (hi, lo) = ( + >::max(x, y), + >::min(x, y), + ); + //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa + let exponent_mask = _mm256_set1_ps(f32::INFINITY); + //The mantissa mask is the inverse of the exponent mask + let mantissa_mask = _mm256_sub_ps( + _mm256_set1_ps(f32::MIN_POSITIVE), + _mm256_set1_ps(core::mem::transmute(1_u32)), + ); + // round the hi values down to the nearest power of 2 + let hi2p = _mm256_and_ps(hi, exponent_mask); + // we scale the values inside the root by the reciprocal of hi2p. 
since it's a power of 2, + // we can double it and xor it with the exponent mask + let scale = _mm256_xor_ps(_mm256_add_ps(hi2p, hi2p), exponent_mask); + + // create a mask that matches the normal hi values + let mask = _mm256_cmp_ps::<_CMP_GT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE)); + // replace the subnormal values of hi2p with the minimum positive normal value + let hi2p = _mm256_blendv_ps(_mm256_set1_ps(f32::MIN_POSITIVE), hi2p, mask); + // replace the subnormal values of scale with the reciprocal of the minimum positive normal value + let scale = + _mm256_blendv_ps(_mm256_set1_ps(1.0 / f32::MIN_POSITIVE), scale, mask); + // create a mask that matches the subnormal hi values + let mask = _mm256_cmp_ps::<_CMP_LT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE)); + // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1 + let hi_scaled = + _mm256_or_ps(_mm256_and_ps(hi, mantissa_mask), _mm256_set1_ps(1.0)); + // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values + let hi_scaled = _mm256_blendv_ps( + hi_scaled, + _mm256_sub_ps(hi_scaled, _mm256_set1_ps(1.0)), + mask, + ); + + let hi_scaled = _mm256_mul_ps(hi_scaled, hi_scaled); + let lo_scaled = _mm256_mul_ps(lo, scale); + _mm256_mul_ps( + hi2p, + _mm256_sqrt_ps(_mm256_fmadd_ps(lo_scaled, lo_scaled, hi_scaled)), + ) + } +} + impl SimdRegister for Avx2Fma { type Register = __m256d; @@ -225,3 +280,58 @@ impl SimdRegister for Avx2Fma { Avx2::write(mem, reg) } } + +impl Hypot for Avx2Fma { + #[inline(always)] + unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { + // convert the inputs to absolute values + let (x, y) = ( + _mm256_andnot_pd(_mm256_set1_pd(-0.0), x), + _mm256_andnot_pd(_mm256_set1_pd(-0.0), y), + ); + // find the max and min of the two inputs + let (hi, lo) = ( + >::max(x, y), + >::min(x, y), + ); + //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa + let exponent_mask = _mm256_set1_pd(f64::INFINITY); + //The mantissa mask is the inverse of the exponent mask + let mantissa_mask = _mm256_sub_pd( + _mm256_set1_pd(f64::MIN_POSITIVE), + _mm256_set1_pd(core::mem::transmute(1_u64)), + ); + // round the hi values down to the nearest power of 2 + let hi2p = _mm256_and_pd(hi, exponent_mask); + // we scale the values inside the root by the reciprocal of hi2p. 
since it's a power of 2, + // we can double it and xor it with the exponent mask + let scale = _mm256_xor_pd(_mm256_add_pd(hi2p, hi2p), exponent_mask); + + // create a mask that matches the normal hi values + let mask = _mm256_cmp_pd::<_CMP_GT_OQ>(hi, _mm256_set1_pd(f64::MIN_POSITIVE)); + // replace the subnormal values of hi2p with the minimum positive normal value + let hi2p = _mm256_blendv_pd(_mm256_set1_pd(f64::MIN_POSITIVE), hi2p, mask); + // replace the subnormal values of scale with the reciprocal of the minimum positive normal value + let scale = + _mm256_blendv_pd(_mm256_set1_pd(1.0 / f64::MIN_POSITIVE), scale, mask); + // create a mask that matches the subnormal hi values + let mask = _mm256_cmp_pd::<_CMP_LT_OQ>(hi, _mm256_set1_pd(f64::MIN_POSITIVE)); + // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1 + let hi_scaled = + _mm256_or_pd(_mm256_and_pd(hi, mantissa_mask), _mm256_set1_pd(1.0)); + // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values + let hi_scaled = _mm256_blendv_pd( + hi_scaled, + _mm256_sub_pd(hi_scaled, _mm256_set1_pd(1.0)), + mask, + ); + + let hi_scaled = _mm256_mul_pd(hi_scaled, hi_scaled); + let lo_scaled = _mm256_mul_pd(lo, scale); + + _mm256_mul_pd( + hi2p, + _mm256_sqrt_pd(_mm256_fmadd_pd(lo_scaled, lo_scaled, hi_scaled)), + ) + } +} diff --git a/cfavml/src/danger/impl_avx512.rs b/cfavml/src/danger/impl_avx512.rs index 726fe46..b6d34bf 100644 --- a/cfavml/src/danger/impl_avx512.rs +++ b/cfavml/src/danger/impl_avx512.rs @@ -5,7 +5,7 @@ use core::arch::x86_64::*; use core::iter::zip; use core::mem; -use super::core_simd_api::{DenseLane, SimdRegister}; +use super::core_simd_api::{DenseLane, Hypot, SimdRegister}; use super::impl_avx2::Avx2; use crate::apply_dense; @@ -133,6 +133,63 @@ impl SimdRegister for Avx512 { } } +impl Hypot for Avx512 { + //b' * sqrt((a*b_hat)^2 + (b*b_hat)^2), where |b| > |a|, b'=(2^n Self::Register { + // convert the inputs to absolute values + let (x, y) = ( + _mm512_andnot_ps(_mm512_set1_ps(-0.0), x), + _mm512_andnot_ps(_mm512_set1_ps(-0.0), y), + ); + // find the max and min of the two inputs + let (hi, lo) = ( + >::max(x, y), + >::min(x, y), + ); + //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa + let exponent_mask = _mm512_set1_ps(f32::INFINITY); + //The mantissa mask is the inverse of the exponent mask + let mantissa_mask = _mm512_sub_ps( + _mm512_set1_ps(f32::MIN_POSITIVE), + _mm512_set1_ps(core::mem::transmute(1_u32)), + ); + // round the hi values down to the nearest power of 2 + let hi2p = _mm512_and_ps(hi, exponent_mask); + // we scale the values inside the root by the reciprocal of hi2p. 
since it's a power of 2, + // we can double it and xor it with the exponent mask + let scale = _mm512_xor_ps(_mm512_add_ps(hi2p, hi2p), exponent_mask); + + // create a mask that matches the normal hi values + let mask = + _mm512_cmp_ps_mask::<_CMP_GT_OQ>(hi, _mm512_set1_ps(f32::MIN_POSITIVE)); + // replace the subnormal values of hi2p with the minimum positive normal value + let hi2p = _mm512_mask_blend_ps(mask, _mm512_set1_ps(f32::MIN_POSITIVE), hi2p); + // replace the subnormal values of scale with the reciprocal of the minimum positive normal value + let scale = + _mm512_mask_blend_ps(mask, _mm512_set1_ps(1.0 / f32::MIN_POSITIVE), scale); + // create a mask that matches the subnormal hi values + let mask = + _mm512_cmp_ps_mask::<_CMP_LT_OQ>(hi, _mm512_set1_ps(f32::MIN_POSITIVE)); + // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1 + let hi_scaled = + _mm512_or_ps(_mm512_and_ps(hi, mantissa_mask), _mm512_set1_ps(1.0)); + // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values + let hi_scaled = _mm512_mask_blend_ps( + mask, + hi_scaled, + _mm512_sub_ps(hi_scaled, _mm512_set1_ps(1.0)), + ); + + let hi_scaled = _mm512_mul_ps(hi_scaled, hi_scaled); + let lo_scaled = _mm512_mul_ps(lo, scale); + _mm512_mul_ps( + hi2p, + _mm512_sqrt_ps(_mm512_fmadd_ps(lo_scaled, lo_scaled, hi_scaled)), + ) + } +} + impl SimdRegister for Avx512 { type Register = __m512d; @@ -247,6 +304,62 @@ impl SimdRegister for Avx512 { } } +impl Hypot for Avx512 { + #[inline(always)] + unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { + // convert the inputs to absolute values + let (x, y) = ( + _mm512_andnot_pd(_mm512_set1_pd(-0.0), x), + _mm512_andnot_pd(_mm512_set1_pd(-0.0), y), + ); + // find the max and min of the two inputs + let (hi, lo) = ( + >::max(x, y), + >::min(x, y), + ); + //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa + let exponent_mask = _mm512_set1_pd(f64::INFINITY); + //The mantissa mask is the inverse of the exponent mask + let mantissa_mask = _mm512_sub_pd( + _mm512_set1_pd(f64::MIN_POSITIVE), + _mm512_set1_pd(core::mem::transmute(1_u64)), + ); + // round the hi values down to the nearest power of 2 + let hi2p = _mm512_and_pd(hi, exponent_mask); + // we scale the values inside the root by the reciprocal of hi2p. 
since it's a power of 2, + // we can double it and xor it with the exponent mask + let scale = _mm512_xor_pd(_mm512_add_pd(hi2p, hi2p), exponent_mask); + + // create a mask that matches the normal hi values + let mask = + _mm512_cmp_pd_mask::<_CMP_GT_OQ>(hi, _mm512_set1_pd(f64::MIN_POSITIVE)); + // replace the subnormal values of hi2p with the minimum positive normal value + let hi2p = _mm512_mask_blend_pd(mask, _mm512_set1_pd(f64::MIN_POSITIVE), hi2p); + // replace the subnormal values of scale with the reciprocal of the minimum positive normal value + let scale = + _mm512_mask_blend_pd(mask, _mm512_set1_pd(1.0 / f64::MIN_POSITIVE), scale); + // create a mask that matches the subnormal hi values + let mask = + _mm512_cmp_pd_mask::<_CMP_LT_OQ>(hi, _mm512_set1_pd(f64::MIN_POSITIVE)); + // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1 + let hi_scaled = + _mm512_or_pd(_mm512_and_pd(hi, mantissa_mask), _mm512_set1_pd(1.0)); + // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values + let hi_scaled = _mm512_mask_blend_pd( + mask, + hi_scaled, + _mm512_sub_pd(hi_scaled, _mm512_set1_pd(1.0)), + ); + + let hi_scaled = _mm512_mul_pd(hi_scaled, hi_scaled); + let lo_scaled = _mm512_mul_pd(lo, scale); + + _mm512_mul_pd( + hi2p, + _mm512_sqrt_pd(_mm512_fmadd_pd(lo_scaled, lo_scaled, hi_scaled)), + ) + } +} impl SimdRegister for Avx512 { type Register = __m512i; diff --git a/cfavml/src/danger/impl_fallback.rs b/cfavml/src/danger/impl_fallback.rs index 73a98a4..b4ebbeb 100644 --- a/cfavml/src/danger/impl_fallback.rs +++ b/cfavml/src/danger/impl_fallback.rs @@ -1,5 +1,6 @@ +use super::core_simd_api::Hypot; use crate::danger::{DenseLane, SimdRegister}; -use crate::math::{AutoMath, Math}; +use crate::math::{f32_hypot, AutoMath, Math}; /// Fallback SIMD-like operations. /// @@ -130,3 +131,16 @@ where AutoMath::cast_bool(!AutoMath::cmp_eq(l1, l2)) } } + +impl Hypot for Fallback { + #[inline(always)] + unsafe fn hypot(x: f32, y: f32) -> f32 { + f32_hypot(x, y) + } +} +impl Hypot for Fallback { + #[inline(always)] + unsafe fn hypot(x: f64, y: f64) -> f64 { + f64::hypot(x, y) + } +} diff --git a/cfavml/src/danger/impl_neon.rs b/cfavml/src/danger/impl_neon.rs index 3fcef3a..41252ce 100644 --- a/cfavml/src/danger/impl_neon.rs +++ b/cfavml/src/danger/impl_neon.rs @@ -147,6 +147,13 @@ impl SimdRegister for Neon { } } +impl Hypot for Neon { + #[inline(always)] + unsafe fn hypot(l1: Self::Register, l2: Self::Register) -> Self::Register { + todo!() + } +} + impl SimdRegister for Neon { type Register = float64x2_t; diff --git a/cfavml/src/danger/impl_test.rs b/cfavml/src/danger/impl_test.rs index 7bf42d9..8eaef9c 100644 --- a/cfavml/src/danger/impl_test.rs +++ b/cfavml/src/danger/impl_test.rs @@ -1,11 +1,14 @@ use std::fmt::Debug; use std::iter::zip; +use num_traits::{Float, FloatConst}; +use rand::distributions::uniform::SampleUniform; use rand::distributions::{Distribution, Standard}; +use super::core_simd_api::Hypot; use crate::danger::SimdRegister; use crate::math::{AutoMath, Math}; -use crate::test_utils::get_sample_vectors; +use crate::test_utils::{get_sample_vectors, get_subnormal_sample_vectors}; /// Runs a set of generic test suites to ensure a given impl is working correctly. 
pub(crate) unsafe fn test_suite_impl() @@ -277,3 +280,87 @@ where ); } } + +pub(crate) unsafe fn hypot_test_impl() +where + T: Float + Debug + FloatConst + SampleUniform, + R: SimdRegister + Hypot, + AutoMath: Math, + Standard: Distribution, +{ + let (small_sample_l1, small_sample_l2) = + get_sample_vectors::(R::elements_per_lane()); + + let (large_sample_l1, large_sample_l2) = + get_sample_vectors::(R::elements_per_dense()); + + let (small_subnormal_sample_l1, small_subnormal_sample_l2) = + get_subnormal_sample_vectors::(R::elements_per_lane()); + + let (large_subnormal_sample_l1, large_subnormal_sample_l2) = + get_subnormal_sample_vectors::(R::elements_per_dense()); + + test_sample::( + small_sample_l1, + small_sample_l2, + large_sample_l1, + large_sample_l2, + ); + test_sample::( + small_subnormal_sample_l1, + small_subnormal_sample_l2, + large_subnormal_sample_l1, + large_subnormal_sample_l2, + ); +} + +unsafe fn test_sample( + sample1: Vec, + sample2: Vec, + large_sample_l1: Vec, + large_sample_l2: Vec, +) where + T: Float + Debug + FloatConst, + R: SimdRegister + Hypot, + AutoMath: Math, + Standard: Distribution, +{ + { + let (_std_result, std_sum) = get_std_results(&sample1, &sample2); + let l1 = R::load(sample1.as_ptr()); + let l2 = R::load(sample2.as_ptr()); + let res = R::hypot(l1, l2); + let res_sum = R::sum_to_value(res); + assert!( + AutoMath::is_close(std_sum, res_sum), + "Hypot and sum test failed on single task" + ); + } + { + let (_std_result, std_sum) = get_std_results(&large_sample_l1, &large_sample_l2); + let l1 = R::load_dense(large_sample_l1.as_ptr()); + let l2 = R::load_dense(large_sample_l2.as_ptr()); + let res = R::hypot_dense(l1, l2); + let reg = R::sum_to_register(res); + let res_sum = R::sum_to_value(reg); + + assert!( + AutoMath::is_close(std_sum, res_sum), + "Hypot and sum test failed on dense task" + ); + } +} + +fn get_std_results(sample1: &Vec, sample2: &Vec) -> (Vec, T) +where + T: Float + Debug, + AutoMath: Math, +{ + let std_result = sample1 + .iter() + .zip(sample2.iter()) + .map(|(a, b)| a.hypot(*b)) + .collect::>(); + let sum = std_result.iter().fold(AutoMath::zero(), |a, b| a + *b); + (std_result, sum) +} diff --git a/cfavml/src/danger/test_suite.rs b/cfavml/src/danger/test_suite.rs index 887ef08..9671056 100644 --- a/cfavml/src/danger/test_suite.rs +++ b/cfavml/src/danger/test_suite.rs @@ -281,6 +281,23 @@ macro_rules! test_suite { }; } +macro_rules! hypot_tests { + ($im:ident) => { + paste::paste! { + #[test] + fn []() { + unsafe { crate::danger::impl_test::test_suite_impl::() } + } + } + paste::paste! 
{ + #[test] + fn []() { + unsafe { crate::danger::impl_test::test_suite_impl::() } + } + } + }; +} + fn test_arithmetic_value_all(l1: Vec, value: T) where T: Copy + PartialEq + Debug + IntoMemLoader, @@ -385,6 +402,7 @@ test_cosine_extra!(u64, Fallback); test_nan_sanity!(f32, Fallback); test_nan_sanity!(f64, Fallback); +hypot_tests!(Fallback); #[cfg(all(target_feature = "avx2", test))] mod avx2_tests { @@ -412,6 +430,7 @@ mod avx2_tests { test_nan_sanity!(f32, Avx2); test_nan_sanity!(f64, Avx2); + hypot_tests!(Avx2); } #[cfg(all(target_feature = "avx512f", feature = "nightly", test))] @@ -451,6 +470,7 @@ mod avx2fma_tests { test_cosine_extra!(f32, Avx2Fma); test_cosine_extra!(f64, Avx2Fma); + hypot_tests!(Avx2Fma); } #[cfg(all(target_feature = "neon", test))] diff --git a/cfavml/src/math/default.rs b/cfavml/src/math/default.rs index 902b068..8bc96d0 100644 --- a/cfavml/src/math/default.rs +++ b/cfavml/src/math/default.rs @@ -114,6 +114,27 @@ impl Math for StdMath { } } +pub(crate) fn f32_hypot(a: f32, b: f32) -> f32 { + #[cfg(feature = "std")] + { + f32::hypot(a, b) + } + #[cfg(not(feature = "std"))] + { + todo!() + } +} +pub(crate) fn f64_hypot(a: f64, b: f64) -> f64 { + #[cfg(feature = "std")] + { + f64::hypot(a, b) + } + #[cfg(not(feature = "std"))] + { + todo!() + } +} + impl Math for StdMath { #[inline(always)] fn zero() -> f64 { diff --git a/cfavml/src/math/mod.rs b/cfavml/src/math/mod.rs index 0d56be8..07df427 100644 --- a/cfavml/src/math/mod.rs +++ b/cfavml/src/math/mod.rs @@ -3,9 +3,9 @@ mod default; mod fast_math; pub use default::StdMath; +pub(crate) use default::{f32_hypot, f64_hypot}; #[cfg(feature = "nightly")] pub use fast_math::FastMath; - #[cfg(not(feature = "nightly"))] pub type AutoMath = StdMath; #[cfg(feature = "nightly")] diff --git a/cfavml/src/test_utils.rs b/cfavml/src/test_utils.rs index 2688e3f..440c81f 100644 --- a/cfavml/src/test_utils.rs +++ b/cfavml/src/test_utils.rs @@ -1,10 +1,13 @@ +use core::ops::Neg; + +use num_traits::{zero, Float, FloatConst}; +use rand::distributions::uniform::SampleUniform; use rand::distributions::{Distribution, Standard}; use rand::{Rng, SeedableRng}; use rand_chacha::ChaCha8Rng; use crate::danger::cosine; use crate::math::{AutoMath, Math}; - const SEED: u64 = 34535345353; pub fn get_sample_vectors(size: usize) -> (Vec, Vec) @@ -36,6 +39,38 @@ where (x, y) } +pub fn get_subnormal_sample_vectors(size: usize) -> (Vec, Vec) +where + T: Copy + Float + FloatConst + Neg + SampleUniform, + AutoMath: Math, + Standard: Distribution, +{ + let mut rng = ChaCha8Rng::seed_from_u64(SEED); + + let mut x = Vec::new(); + let mut y = Vec::new(); + for _ in 0..size { + let mut v1 = rng.gen_range(core::ops::Range { + start: zero(), + end: T::min_positive_value(), + }); + let mut v2 = rng.gen(); + + if AutoMath::cmp_eq(v1, AutoMath::zero()) { + v1 = AutoMath::one(); + } + + if AutoMath::cmp_eq(v2, AutoMath::zero()) { + v2 = AutoMath::one(); + } + + x.push(v1); + y.push(v2); + } + + (x, y) +} + pub fn simple_dot(x: &[T], y: &[T]) -> T where T: Copy, From 11db4447e95ef53620d7419f54fc5b1e20fa9e39 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 16 Nov 2024 13:48:26 -0600 Subject: [PATCH 02/13] use sqrt intrinsic for fastmath, implemented Hypot for Neon --- cfavml/src/danger/impl_neon.rs | 88 ++++++++++++++++++++++++++++++++-- cfavml/src/danger/impl_test.rs | 24 +++++++++- cfavml/src/math/fast_math.rs | 4 +- 3 files changed, 108 insertions(+), 8 deletions(-) diff --git a/cfavml/src/danger/impl_neon.rs b/cfavml/src/danger/impl_neon.rs index 
41252ce..4b98703 100644 --- a/cfavml/src/danger/impl_neon.rs +++ b/cfavml/src/danger/impl_neon.rs @@ -2,7 +2,7 @@ use core::arch::aarch64::*; use core::iter::zip; use core::mem; -use crate::danger::{DenseLane, SimdRegister}; +use super::core_simd_api::{DenseLane, SimdRegister,Hypot}; use crate::math::{AutoMath, Math}; const BITS_8_CAPACITY: usize = 16; @@ -146,11 +146,48 @@ impl SimdRegister for Neon { vst1q_f32(mem, reg) } } - +const EXPONENT_MASK_F32: u32 = 2139095040; +const MANTISSA_MASK_F32: u32 = 8388607; impl Hypot for Neon { #[inline(always)] - unsafe fn hypot(l1: Self::Register, l2: Self::Register) -> Self::Register { - todo!() + unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { + // Convert inputs to absolute values + let (x, y) = (vabsq_f32(x), vabsq_f32(y)); + + // Find the max and min of the two inputs + let (hi, lo) = (vmaxq_f32(x, y), vminq_f32(x, y)); + let exponent_mask = vdupq_n_u32(EXPONENT_MASK_F32); + let mantissa_mask = vdupq_n_u32(MANTISSA_MASK_F32); + + // round the hi values down to the nearest power of 2 + let hi2p = + vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(hi), exponent_mask)); + // we scale the values inside the root by the reciprocal of hi2p. since it's a power of 2, + // we can double it and xor it with the exponent mask + let scale = vreinterpretq_f32_u32(veorq_u32( + vreinterpretq_u32_f32(vaddq_f32(hi2p, hi2p)), + exponent_mask, + )); + // create a mask that matches the normal hi values + let mask = vcgtq_f32(hi, vdupq_n_f32(f32::MIN_POSITIVE)); + // replace the subnormal values of hi2p with the minimum positive normal value + let hi2p = vbslq_f32(mask, hi2p, vdupq_n_f32(f32::MIN_POSITIVE)); + // replace the subnormal values of scale with the reciprocal of the minimum positive normal value + let scale = vbslq_f32(mask, scale, vdupq_n_f32(1.0 / f32::MIN_POSITIVE)); + // create a mask that matches the subnormal hi values + let mask = vcltq_f32(hi, vdupq_n_f32(f32::MIN_POSITIVE)); + // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1 + let hi_scaled = vreinterpretq_f32_u32(vorrq_u32( + vandq_u32(vreinterpretq_u32_f32(hi), mantissa_mask), + vreinterpretq_u32_f32(vdupq_n_f32(1.0)), + )); + // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values + let hi_scaled = + vbslq_f32(mask, vsubq_f32(hi_scaled, vdupq_n_f32(1.0)), hi_scaled); + // finally, do the thing + let hi_scaled = vmulq_f32(hi_scaled, hi_scaled); + let lo_scaled = vmulq_f32(lo, scale); + vmulq_f32(hi2p, vsqrtq_f32(vfmaq_f32(lo_scaled, lo_scaled, hi_scaled))) } } @@ -286,6 +323,49 @@ impl SimdRegister for Neon { } } +impl Hypot for Neon { + #[inline(always)] + unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { + // Convert inputs to absolute values + let (x, y) = (vabsq_f64(x), vabsq_f64(y)); + + // Find the max and min of the two inputs + let (hi, lo) = (vmaxq_f64(x, y), vminq_f64(x, y)); + let exponent_mask = vdupq_n_u64(f64::INFINITY.to_bits()); + let mantissa_mask = vdupq_n_u64((f64::MIN_POSITIVE - mem::transmute::(1)).to_bits()); + + // round the hi values down to the nearest power of 2 + let hi2p = + vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(hi), exponent_mask)); + // we scale the values inside the root by the reciprocal of hi2p. 
since it's a power of 2, + // we can double it and xor it with the exponent mask + let scale = vreinterpretq_f64_u64(veorq_u64( + vreinterpretq_u64_f64(vaddq_f64(hi2p, hi2p)), + exponent_mask, + )); + // create a mask that matches the normal hi values + let mask = vcgtq_f64(hi, vdupq_n_f64(f64::MIN_POSITIVE)); + // replace the subnormal values of hi2p with the minimum positive normal value + let hi2p = vbslq_f64(mask, hi2p, vdupq_n_f64(f64::MIN_POSITIVE)); + // replace the subnormal values of scale with the reciprocal of the minimum positive normal value + let scale = vbslq_f64(mask, scale, vdupq_n_f64(1.0 / f64::MIN_POSITIVE)); + // create a mask that matches the subnormal hi values + let mask = vcltq_f64(hi, vdupq_n_f64(f64::MIN_POSITIVE)); + // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1 + let hi_scaled = vreinterpretq_f64_u64(vorrq_u64( + vandq_u64(vreinterpretq_u64_f64(hi), mantissa_mask), + vreinterpretq_u64_f64(vdupq_n_f64(1.0)), + )); + // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values + let hi_scaled = + vbslq_f64(mask, vsubq_f64(hi_scaled, vdupq_n_f64(1.0)), hi_scaled); + // finally, do the thing + let hi_scaled = vmulq_f64(hi_scaled, hi_scaled); + let lo_scaled = vmulq_f64(lo, scale); + vmulq_f64(hi2p, vsqrtq_f64(vfmaq_f64(lo_scaled, lo_scaled, hi_scaled))) + } +} + impl SimdRegister for Neon { type Register = int8x16_t; diff --git a/cfavml/src/danger/impl_test.rs b/cfavml/src/danger/impl_test.rs index 8eaef9c..d1d3494 100644 --- a/cfavml/src/danger/impl_test.rs +++ b/cfavml/src/danger/impl_test.rs @@ -326,7 +326,7 @@ unsafe fn test_sample( Standard: Distribution, { { - let (_std_result, std_sum) = get_std_results(&sample1, &sample2); + let (std_result, std_sum) = get_std_results(&sample1, &sample2); let l1 = R::load(sample1.as_ptr()); let l2 = R::load(sample2.as_ptr()); let res = R::hypot(l1, l2); @@ -335,9 +335,12 @@ unsafe fn test_sample( AutoMath::is_close(std_sum, res_sum), "Hypot and sum test failed on single task" ); + let mut res_vec = vec![T::zero(); R::elements_per_lane()]; + R::write(res_vec.as_mut_ptr(), res); + test_diff_ulps(std_result, res_vec); } { - let (_std_result, std_sum) = get_std_results(&large_sample_l1, &large_sample_l2); + let (std_result, std_sum) = get_std_results(&large_sample_l1, &large_sample_l2); let l1 = R::load_dense(large_sample_l1.as_ptr()); let l2 = R::load_dense(large_sample_l2.as_ptr()); let res = R::hypot_dense(l1, l2); @@ -348,6 +351,9 @@ unsafe fn test_sample( AutoMath::is_close(std_sum, res_sum), "Hypot and sum test failed on dense task" ); + let mut res_vec = vec![T::zero(); R::elements_per_dense()]; + R::write_dense(res_vec.as_mut_ptr(), res); + test_diff_ulps(std_result, res_vec); } } @@ -364,3 +370,17 @@ where let sum = std_result.iter().fold(AutoMath::zero(), |a, b| a + *b); (std_result, sum) } + +fn test_diff_ulps(a: Vec, b: Vec) +where + T: Float + FloatConst, +{ + a.iter().zip(b.iter()).for_each(|(a, b)| { + let (a_mant, a_exp, a_sign) = a.integer_decode(); + let (b_mant, b_exp, b_sign) = b.integer_decode(); + assert!(a_sign == b_sign); + assert!(a_exp == b_exp); + let dist = a_mant as i64 - b_mant as i64; + assert!(dist.abs() < 2, "Greater than 1 ulp difference: {dist}"); + }); +} diff --git a/cfavml/src/math/fast_math.rs b/cfavml/src/math/fast_math.rs index d56924f..de25c04 100644 --- a/cfavml/src/math/fast_math.rs +++ b/cfavml/src/math/fast_math.rs @@ -28,7 +28,7 @@ impl Math for FastMath { #[inline(always)] fn sqrt(a: f32) -> f32 { - StdMath::sqrt(a) + 
core::intrinsics::sqrtf32(a) } #[inline(always)] @@ -139,7 +139,7 @@ impl Math for FastMath { #[inline(always)] fn sqrt(a: f64) -> f64 { - StdMath::sqrt(a) + core::intrinsics::sqrtf64(a) } #[inline(always)] From b1e9599fbf2c4ee78dbaf56e7ea5d0dee4ca6c7d Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 16 Nov 2024 14:09:16 -0600 Subject: [PATCH 03/13] added hypot test for Neon, Axx512 --- cfavml/src/danger/test_suite.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cfavml/src/danger/test_suite.rs b/cfavml/src/danger/test_suite.rs index 9671056..2fbe053 100644 --- a/cfavml/src/danger/test_suite.rs +++ b/cfavml/src/danger/test_suite.rs @@ -281,7 +281,7 @@ macro_rules! test_suite { }; } -macro_rules! hypot_tests { +macro_rules! test_hypot { ($im:ident) => { paste::paste! { #[test] @@ -402,7 +402,7 @@ test_cosine_extra!(u64, Fallback); test_nan_sanity!(f32, Fallback); test_nan_sanity!(f64, Fallback); -hypot_tests!(Fallback); +test_hypot!(Fallback); #[cfg(all(target_feature = "avx2", test))] mod avx2_tests { @@ -430,7 +430,7 @@ mod avx2_tests { test_nan_sanity!(f32, Avx2); test_nan_sanity!(f64, Avx2); - hypot_tests!(Avx2); + test_hypot!(Avx2); } #[cfg(all(target_feature = "avx512f", feature = "nightly", test))] @@ -459,6 +459,7 @@ mod avx512_tests { test_nan_sanity!(f32, Avx512); test_nan_sanity!(f64, Avx512); + test_hypot!(Avx512); } #[cfg(all(target_feature = "avx2", target_feature = "fma", test))] @@ -470,7 +471,7 @@ mod avx2fma_tests { test_cosine_extra!(f32, Avx2Fma); test_cosine_extra!(f64, Avx2Fma); - hypot_tests!(Avx2Fma); + test_hypot!(Avx2Fma); } #[cfg(all(target_feature = "neon", test))] @@ -501,4 +502,5 @@ mod neon_tests { test_nan_sanity!(f32, Neon); test_nan_sanity!(f64, Neon); + test_hypot!(Neon); } From a1a2e279aea66d52ee6d12c71031fe2f37077b01 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 16 Nov 2024 17:01:07 -0600 Subject: [PATCH 04/13] revert sqrt change, fallback to direct calculation if no_std for now --- cfavml/src/danger/impl_fallback.rs | 6 ++-- cfavml/src/danger/impl_neon.rs | 5 ++-- cfavml/src/math/default.rs | 47 ++++++++++++++++++------------ cfavml/src/math/fast_math.rs | 18 ++++++++++-- cfavml/src/math/mod.rs | 5 +++- 5 files changed, 53 insertions(+), 28 deletions(-) diff --git a/cfavml/src/danger/impl_fallback.rs b/cfavml/src/danger/impl_fallback.rs index b4ebbeb..91b4a66 100644 --- a/cfavml/src/danger/impl_fallback.rs +++ b/cfavml/src/danger/impl_fallback.rs @@ -1,6 +1,6 @@ use super::core_simd_api::Hypot; use crate::danger::{DenseLane, SimdRegister}; -use crate::math::{f32_hypot, AutoMath, Math}; +use crate::math::{AutoMath, Math, Numeric}; /// Fallback SIMD-like operations. 
/// @@ -135,12 +135,12 @@ where impl Hypot for Fallback { #[inline(always)] unsafe fn hypot(x: f32, y: f32) -> f32 { - f32_hypot(x, y) + AutoMath::hypot(x, y) } } impl Hypot for Fallback { #[inline(always)] unsafe fn hypot(x: f64, y: f64) -> f64 { - f64::hypot(x, y) + AutoMath::hypot(x, y) } } diff --git a/cfavml/src/danger/impl_neon.rs b/cfavml/src/danger/impl_neon.rs index 4b98703..ef60095 100644 --- a/cfavml/src/danger/impl_neon.rs +++ b/cfavml/src/danger/impl_neon.rs @@ -2,7 +2,7 @@ use core::arch::aarch64::*; use core::iter::zip; use core::mem; -use super::core_simd_api::{DenseLane, SimdRegister,Hypot}; +use super::core_simd_api::{DenseLane, Hypot, SimdRegister}; use crate::math::{AutoMath, Math}; const BITS_8_CAPACITY: usize = 16; @@ -332,7 +332,8 @@ impl Hypot for Neon { // Find the max and min of the two inputs let (hi, lo) = (vmaxq_f64(x, y), vminq_f64(x, y)); let exponent_mask = vdupq_n_u64(f64::INFINITY.to_bits()); - let mantissa_mask = vdupq_n_u64((f64::MIN_POSITIVE - mem::transmute::(1)).to_bits()); + let mantissa_mask = + vdupq_n_u64((f64::MIN_POSITIVE - mem::transmute::(1)).to_bits()); // round the hi values down to the nearest power of 2 let hi2p = diff --git a/cfavml/src/math/default.rs b/cfavml/src/math/default.rs index 8bc96d0..c538f8e 100644 --- a/cfavml/src/math/default.rs +++ b/cfavml/src/math/default.rs @@ -1,4 +1,4 @@ -use super::Math; +use super::{Math, Numeric}; /// Standard math operations that apply no specialised handling. pub struct StdMath; @@ -114,24 +114,18 @@ impl Math for StdMath { } } -pub(crate) fn f32_hypot(a: f32, b: f32) -> f32 { - #[cfg(feature = "std")] - { - f32::hypot(a, b) - } - #[cfg(not(feature = "std"))] - { - todo!() - } -} -pub(crate) fn f64_hypot(a: f64, b: f64) -> f64 { - #[cfg(feature = "std")] - { - f64::hypot(a, b) - } - #[cfg(not(feature = "std"))] - { - todo!() +impl Numeric for StdMath { + #[inline(always)] + fn hypot(a: f32, b: f32) -> f32 { + #[cfg(feature = "std")] + { + f32::hypot(a, b) + } + #[cfg(not(feature = "std"))] + { + // Fall back to direct calculation + f32_sqrt_fast(a * a + b * b) + } } } @@ -246,6 +240,21 @@ impl Math for StdMath { } } +impl Numeric for StdMath { + #[inline(always)] + fn hypot(a: f64, b: f64) -> f64 { + #[cfg(feature = "std")] + { + f64::hypot(a, b) + } + #[cfg(not(feature = "std"))] + { + // Fall back to direct calculation + f32_sqrt_fast((a * a + b * b) as f32) as f64 + } + } +} + macro_rules! define_int_ops { ($t:ident) => { impl Math<$t> for StdMath { diff --git a/cfavml/src/math/fast_math.rs b/cfavml/src/math/fast_math.rs index de25c04..58e65d8 100644 --- a/cfavml/src/math/fast_math.rs +++ b/cfavml/src/math/fast_math.rs @@ -1,6 +1,6 @@ use core::intrinsics; -use super::{Math, StdMath}; +use super::{Math, Numeric, StdMath}; /// Basic math operations backed by fast-math intrinsics. pub struct FastMath; @@ -28,7 +28,7 @@ impl Math for FastMath { #[inline(always)] fn sqrt(a: f32) -> f32 { - core::intrinsics::sqrtf32(a) + StdMath::sqrt(a) } #[inline(always)] @@ -116,6 +116,12 @@ impl Math for FastMath { } } +impl Numeric for FastMath { + fn hypot(a: f32, b: f32) -> f32 { + StdMath::hypot(a as f64, b as f64) as f32 + } +} + impl Math for FastMath { #[inline(always)] fn zero() -> f64 { @@ -139,7 +145,7 @@ impl Math for FastMath { #[inline(always)] fn sqrt(a: f64) -> f64 { - core::intrinsics::sqrtf64(a) + StdMath::sqrt(a) } #[inline(always)] @@ -227,6 +233,12 @@ impl Math for FastMath { } } +impl Numeric for FastMath { + fn hypot(a: f64, b: f64) -> f64 { + StdMath::hypot(a, b) + } +} + macro_rules! 
define_int_ops { ($t:ident) => { impl Math<$t> for FastMath { diff --git a/cfavml/src/math/mod.rs b/cfavml/src/math/mod.rs index 07df427..f15fbfd 100644 --- a/cfavml/src/math/mod.rs +++ b/cfavml/src/math/mod.rs @@ -3,7 +3,6 @@ mod default; mod fast_math; pub use default::StdMath; -pub(crate) use default::{f32_hypot, f64_hypot}; #[cfg(feature = "nightly")] pub use fast_math::FastMath; #[cfg(not(feature = "nightly"))] @@ -79,3 +78,7 @@ pub trait Math { } } } + +pub trait Numeric: Math { + fn hypot(a: T, b: T) -> T; +} From c1b88c7a3971086fbbecb43059cd9f2773e6851f Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Wed, 20 Nov 2024 12:02:38 -0600 Subject: [PATCH 05/13] added export_hypot --- cfavml/src/danger/export_hypot.rs | 152 ++++++++++++++++++ cfavml/src/danger/impl_fallback.rs | 14 +- cfavml/src/danger/mod.rs | 3 + cfavml/src/danger/op_hypot.rs | 38 +++++ .../export_docs/arithmetic_hypot_vertical.md | 37 +++++ 5 files changed, 236 insertions(+), 8 deletions(-) create mode 100644 cfavml/src/danger/export_hypot.rs create mode 100644 cfavml/src/danger/op_hypot.rs create mode 100644 cfavml/src/export_docs/arithmetic_hypot_vertical.md diff --git a/cfavml/src/danger/export_hypot.rs b/cfavml/src/danger/export_hypot.rs new file mode 100644 index 0000000..96589cf --- /dev/null +++ b/cfavml/src/danger/export_hypot.rs @@ -0,0 +1,152 @@ +//! Common arithmetic operations +//! +//! I.e. Add, Sub, Mul, Div... + +use crate::buffer::WriteOnlyBuffer; +use crate::danger::core_simd_api::Hypot; +use crate::danger::{generic_hypot_vertical, SimdRegister}; +use crate::math::{AutoMath, Math, Numeric}; +use crate::mem_loader::{IntoMemLoader, MemLoader}; + +macro_rules! define_arithmetic_impls { + ( + $hypot_name:ident, + $imp:ident $(,)? + $(target_features = $($feat:expr $(,)?)+)? + ) => { + #[inline] + $(#[target_feature($(enable = $feat, )*)])* + #[doc = include_str!("../export_docs/arithmetic_hypot_vertical.md")] + $( + + #[doc = concat!("- ", $("**`+", $feat, "`** ", )*)] + #[doc = "CPU features are available at runtime. Running on hardware _without_ this feature available will cause immediate UB."] + )* + pub unsafe fn $hypot_name( + a: B1, + b: B2, + result: &mut [B3], + ) + where + T: Copy, + B1: IntoMemLoader, + B1::Loader: MemLoader, + B2: IntoMemLoader, + B2::Loader: MemLoader, + crate::danger::$imp: SimdRegister+ Hypot, + AutoMath: Math + Numeric, + for<'a> &'a mut [B3]: WriteOnlyBuffer, + { + generic_hypot_vertical::( + a, + b, + result, + ) + } + }; +} + +define_arithmetic_impls!(generic_fallback_hypot_vertical, Fallback,); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +define_arithmetic_impls!(generic_avx2_hypot_vertical, Avx2, target_features = "avx2"); +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "nightly"))] +define_arithmetic_impls!( + generic_avx512_hypot_vertical, + Avx512, + target_features = "avx512f", + "avx512bw" +); +#[cfg(target_arch = "aarch64")] +define_arithmetic_impls!(generic_neon_hypot_vertical, Neon, target_features = "neon"); + +#[cfg(test)] +mod tests { + use num_traits::{Float, FloatConst}; + + use super::*; + + macro_rules! define_inner_test { + ($variant:ident, op = $op:ident, ty = $t:ident) => { + paste::paste! 
{ + #[test] + fn [< $variant _ $op _value_ $t >]() { + let (l1, _) = crate::test_utils::get_sample_vectors::<$t>(533); + + let mut result = vec![$t::default(); 533]; + unsafe { [< $variant _ $op _vertical >](&l1, 2 as $t, &mut result) }; + + let expected = l1.iter() + .copied() + .map(|v| AutoMath::$op(v, 2 as $t)) + .collect::>(); + for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) { + let ulps_diff = get_diff_ulps(*expected, *actual); + assert!( + ulps_diff.abs() <= 1, + "result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}", + initial, expected, actual, ulps_diff + ); + } + + } + + #[test] + fn [< $variant _ $op _vector_ $t >]() { + let (l1, l2) = crate::test_utils::get_subnormal_sample_vectors::<$t>(533); + + let mut result = vec![$t::default(); 533]; + unsafe { [< $variant _ $op _vertical >](&l1, &l2, &mut result) }; + + let expected = l1.iter() + .copied() + .zip(l2.iter().copied()) + .map(|(a, b)| AutoMath::$op(a, b)) + .collect::>(); + for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) { + let ulps_diff = get_diff_ulps(*expected, *actual); + assert!( + ulps_diff.abs() <= 1, + "result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}", + initial, expected, actual, ulps_diff + ); + } + } + } + }; + } + + fn get_diff_ulps(a: T, b: T) -> i64 + where + T: Float + FloatConst, + { + let (a_mant, a_exp, a_sign) = a.integer_decode(); + let (b_mant, b_exp, b_sign) = b.integer_decode(); + assert!(a_sign == b_sign); + assert!(a_exp == b_exp); + a_mant as i64 - b_mant as i64 + } + + macro_rules! define_numeric_test { + ($variant:ident, types = $($t:ident $(,)?)+) => { + $( + define_inner_test!($variant, op = hypot, ty = $t); + + )* + }; + } + + define_numeric_test!(generic_fallback, types = f32, f64,); + #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx2" + ))] + define_numeric_test!(generic_avx2, types = f32, f64,); + #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + feature = "nightly", + target_feature = "avx512f" + ))] + define_numeric_test!(generic_avx512, types = f32, f64,); + #[cfg(target_arch = "aarch64")] + define_numeric_test!(generic_neon, types = f32, f64,); +} diff --git a/cfavml/src/danger/impl_fallback.rs b/cfavml/src/danger/impl_fallback.rs index 91b4a66..59f82c0 100644 --- a/cfavml/src/danger/impl_fallback.rs +++ b/cfavml/src/danger/impl_fallback.rs @@ -132,15 +132,13 @@ where } } -impl Hypot for Fallback { - #[inline(always)] - unsafe fn hypot(x: f32, y: f32) -> f32 { - AutoMath::hypot(x, y) - } -} -impl Hypot for Fallback { +impl Hypot for Fallback +where + T: Copy, + AutoMath: Math + Numeric, +{ #[inline(always)] - unsafe fn hypot(x: f64, y: f64) -> f64 { + unsafe fn hypot(x: T, y: T) -> T { AutoMath::hypot(x, y) } } diff --git a/cfavml/src/danger/mod.rs b/cfavml/src/danger/mod.rs index c16075a..cd0b26c 100644 --- a/cfavml/src/danger/mod.rs +++ b/cfavml/src/danger/mod.rs @@ -24,9 +24,11 @@ pub mod export_agg_ops; pub mod export_arithmetic_ops; pub mod export_cmp_ops; pub mod export_distance_ops; +pub mod export_hypot; #[cfg(test)] mod impl_test; mod op_cmp_vertical; +mod op_hypot; #[cfg(test)] mod test_suite; @@ -61,6 +63,7 @@ pub(crate) use self::op_cosine::cosine; pub use self::op_cosine::generic_cosine; pub use self::op_dot::generic_dot; pub use self::op_euclidean::generic_squared_euclidean; +pub use self::op_hypot::generic_hypot_vertical; pub use self::op_norm::generic_squared_norm; 
pub use self::op_sum::generic_sum; diff --git a/cfavml/src/danger/op_hypot.rs b/cfavml/src/danger/op_hypot.rs new file mode 100644 index 0000000..83335b9 --- /dev/null +++ b/cfavml/src/danger/op_hypot.rs @@ -0,0 +1,38 @@ +use super::core_routine_boilerplate::apply_vertical_kernel; +use super::core_simd_api::Hypot; +use crate::buffer::WriteOnlyBuffer; +use crate::danger::core_simd_api::SimdRegister; +use crate::math::{Math, Numeric}; +use crate::mem_loader::{IntoMemLoader, MemLoader}; + +#[inline(always)] +/// A generic vector hypot over one vector and single value. +/// +/// # Safety +/// +/// The sizes of `a`, `b` and `result` must be equal to `dims`, the safety requirements of +/// `M` definition the basic math operations and the requirements of `R` SIMD register +/// must also be followed. +pub unsafe fn generic_hypot_vertical( + a: B1, + b: B2, + result: &mut [B3], +) where + T: Copy, + R: SimdRegister + Hypot, + M: Math + Numeric, + B1: IntoMemLoader, + B1::Loader: MemLoader, + B2: IntoMemLoader, + B2::Loader: MemLoader, + for<'a> &'a mut [B3]: WriteOnlyBuffer, +{ + apply_vertical_kernel::( + a, + b, + result, + R::hypot_dense, + R::hypot, + M::hypot, + ) +} diff --git a/cfavml/src/export_docs/arithmetic_hypot_vertical.md b/cfavml/src/export_docs/arithmetic_hypot_vertical.md new file mode 100644 index 0000000..ef7f662 --- /dev/null +++ b/cfavml/src/export_docs/arithmetic_hypot_vertical.md @@ -0,0 +1,37 @@ +Performs an elementwise hypotenuse of input buffers `a` and `b` that can +be projected to the desired output size of `result`. Implementation is an appoximation that +_should_ match std::hypot in most cases. However, with some inputs it's been confirmed to be off by 1 ulp. + +### Projecting Vectors + +CFAVML allows for working over a wide variety of buffers for applications, projection is effectively +broadcasting of two input buffers implementing `IntoMemLoader`. + +By default, you can provide _two slices_, _one slice and a broadcast value_, or _two broadcast values_, +which exhibit the standard behaviour as you might expect. + +When providing two slices as inputs they cannot be projected to a buffer +that is larger their input sizes by default. This means providing two slices +of `128` elements in length must take a result buffer of `128` elements in length. + +### Implementation Pseudocode + +_This is the logic of the routine being called._ + +```ignore +result = [0; dims] + +for i in range(dims): + result[i] = a[i].hypot(b[i]) + +return result +``` + +# Panics + +If vectors `a` and `b` cannot be projected to the target size of `result`. +Note that the projection rules are tied to the `MemLoader` implementation. 
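+
+### Example Usage
+
+A minimal sketch of calling one of the generated routines (the fallback variant is shown
+here since it requires no particular CPU features; the accelerated variants share the
+same signature):
+
+```ignore
+let a = vec![3.0f32, 5.0, 8.0, 7.0];
+let b = vec![4.0f32, 12.0, 15.0, 24.0];
+let mut result = vec![0.0f32; 4];
+
+unsafe { generic_fallback_hypot_vertical(&a, &b, &mut result) };
+
+// result now holds approximately [5.0, 13.0, 17.0, 25.0],
+// matching `f32::hypot` to within 1 ulp per element.
+```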
+ +# Safety + +This routine assumes: From 5ba40f790de645651396cb839b7eb3d6c90f0782 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Wed, 20 Nov 2024 17:25:16 -0600 Subject: [PATCH 06/13] safe api implemented --- cfavml/src/danger/export_hypot.rs | 4 +- cfavml/src/lib.rs | 1 + cfavml/src/safe_trait_numeric_ops.rs | 85 ++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 cfavml/src/safe_trait_numeric_ops.rs diff --git a/cfavml/src/danger/export_hypot.rs b/cfavml/src/danger/export_hypot.rs index 96589cf..6e7b3a8 100644 --- a/cfavml/src/danger/export_hypot.rs +++ b/cfavml/src/danger/export_hypot.rs @@ -79,6 +79,7 @@ mod tests { .copied() .map(|v| AutoMath::$op(v, 2 as $t)) .collect::>(); + for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) { let ulps_diff = get_diff_ulps(*expected, *actual); assert!( @@ -86,13 +87,14 @@ mod tests { "result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}", initial, expected, actual, ulps_diff ); + } } #[test] fn [< $variant _ $op _vector_ $t >]() { - let (l1, l2) = crate::test_utils::get_subnormal_sample_vectors::<$t>(533); + let (l1, l2) = crate::test_utils::get_sample_vectors::<$t>(533); let mut result = vec![$t::default(); 533]; unsafe { [< $variant _ $op _vertical >](&l1, &l2, &mut result) }; diff --git a/cfavml/src/lib.rs b/cfavml/src/lib.rs index 813e9ec..19affdd 100644 --- a/cfavml/src/lib.rs +++ b/cfavml/src/lib.rs @@ -23,6 +23,7 @@ pub mod safe_trait_agg_ops; pub mod safe_trait_arithmetic_ops; pub mod safe_trait_cmp_ops; pub mod safe_trait_distance_ops; +pub mod safe_trait_numeric_ops; #[cfg(test)] mod test_utils; diff --git a/cfavml/src/safe_trait_numeric_ops.rs b/cfavml/src/safe_trait_numeric_ops.rs new file mode 100644 index 0000000..c442fc0 --- /dev/null +++ b/cfavml/src/safe_trait_numeric_ops.rs @@ -0,0 +1,85 @@ +//! Safe but somewhat low-level variants of the arithmetic operations in CFAVML. +//! +//! In general, I would recommend using the higher level generic functions api which provides +//! some syntax sugar over these traits. + +use crate::buffer::WriteOnlyBuffer; +use crate::danger::export_hypot; +use crate::mem_loader::{IntoMemLoader, MemLoader}; + +/// Various arithmetic operations over vectors. +pub trait NumericOps: Sized + Copy { + /// Performs an elementwise hypotenuse of input buffers `a` and `b` that can + /// be projected to the desired output size of `result`. Implementation is an appoximation that + /// _should_ match std::hypot in most cases. However, with some inputs it's been confirmed to be off by 1 ulp. + /// + /// See [cfavml::hypot_vertical](crate::hypot_vertical) for examples. + /// + /// ### Projecting Vectors + /// + /// CFAVML allows for working over a wide variety of buffers for applications, projection is effectively + /// broadcasting of two input buffers implementing `IntoMemLoader`. + /// + /// By default, you can provide _two slices_, _one slice and a broadcast value_, or _two broadcast values_, + /// which exhibit the standard behaviour as you might expect. + /// + /// When providing two slices as inputs they cannot be projected to a buffer + /// that is larger their input sizes by default. This means providing two slices + /// of `128` elements in length must take a result buffer of `128` elements in length. 
+ /// + /// You can wrap your inputs in a [Projected](crate::mem_loader::Projected) wrapper which + /// enables projecting of the input buffer to new sizes providing the new size is a + /// multiple of the original size. When this buffer is projected, it is effectively + /// repeated `N` times, where `N` is how many times the old size fits into the new size. + /// + /// ### Implementation Pseudocode + /// + /// ```ignore + /// result = [0; dims] + /// + /// for i in range(dims): + /// result[i] = a[i].hypot(b[i]) + /// + /// return result + /// ``` + /// + /// # Panics + /// + /// If vectors `a` and `b` cannot be projected to the target size of `result`. + /// Note that the projection rules are tied to the `MemLoader` implementation. + fn hypot_vertical(lhs: B1, rhs: B2, result: &mut [B3]) + where + B1: IntoMemLoader, + B1::Loader: MemLoader, + B2: IntoMemLoader, + B2::Loader: MemLoader, + for<'a> &'a mut [B3]: WriteOnlyBuffer; +} + +macro_rules! numeric_ops { + ($t:ty) => { + impl NumericOps for $t { + fn hypot_vertical(lhs: B1, rhs: B2, result: &mut [B3]) + where + B1: IntoMemLoader, + B1::Loader: MemLoader, + B2: IntoMemLoader, + B2::Loader: MemLoader, + for<'a> &'a mut [B3]: WriteOnlyBuffer, + { + unsafe { + crate::dispatch!( + avx512 = export_hypot::generic_avx512_hypot_vertical, + avx2 = export_hypot::generic_avx2_hypot_vertical, + neon = export_hypot::generic_neon_hypot_vertical, + fallback = export_hypot::generic_fallback_hypot_vertical, + args = (lhs, rhs, result) + ); + } + } + } + }; +} + +numeric_ops!(f32); +numeric_ops!(f64); From 7479b041ae9f8099efb26efc00b088289d519855 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 23 Nov 2024 10:51:51 -0600 Subject: [PATCH 07/13] fixed issue with hypot test, fixed clippy warnings --- cfavml/src/danger/impl_avx2.rs | 4 ++-- cfavml/src/danger/impl_avx2fma.rs | 13 +++++++------ cfavml/src/danger/impl_avx512.rs | 4 ++-- cfavml/src/danger/impl_test.rs | 8 ++++---- cfavml/src/danger/test_suite.rs | 4 ++-- 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/cfavml/src/danger/impl_avx2.rs b/cfavml/src/danger/impl_avx2.rs index 56ca639..e9ef479 100644 --- a/cfavml/src/danger/impl_avx2.rs +++ b/cfavml/src/danger/impl_avx2.rs @@ -188,7 +188,7 @@ impl Hypot for Avx2 { //The mantissa mask is the inverse of the exponent mask let mantissa_mask = _mm256_sub_ps( _mm256_set1_ps(f32::MIN_POSITIVE), - _mm256_set1_ps(core::mem::transmute(1_u32)), + _mm256_set1_ps(f32::from_bits(1_u32)), ); // round the hi values down to the nearest power of 2 let hi2p = _mm256_and_ps(hi, exponent_mask); @@ -2796,7 +2796,7 @@ impl Hypot for Avx2 { //The mantissa mask is the inverse of the exponent mask let mantissa_mask = _mm256_sub_pd( _mm256_set1_pd(f64::MIN_POSITIVE), - _mm256_set1_pd(core::mem::transmute(1_u64)), + _mm256_set1_pd(f64::from_bits(1_u64)), ); // round the hi values down to the nearest power of 2 let hi2p = _mm256_and_pd(hi, exponent_mask); diff --git a/cfavml/src/danger/impl_avx2fma.rs b/cfavml/src/danger/impl_avx2fma.rs index 83da6a6..7f68335 100644 --- a/cfavml/src/danger/impl_avx2fma.rs +++ b/cfavml/src/danger/impl_avx2fma.rs @@ -138,7 +138,7 @@ impl Hypot for Avx2Fma { //The mantissa mask is the inverse of the exponent mask let mantissa_mask = _mm256_sub_ps( _mm256_set1_ps(f32::MIN_POSITIVE), - _mm256_set1_ps(core::mem::transmute(1_u32)), + _mm256_set1_ps(f32::from_bits(1_u32)), ); // round the hi values down to the nearest power of 2 let hi2p = _mm256_and_ps(hi, exponent_mask); @@ -149,9 +149,10 @@ impl Hypot for Avx2Fma { // create 
a mask that matches the normal hi values let mask = _mm256_cmp_ps::<_CMP_GT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE)); // replace the subnormal values of hi2p with the minimum positive normal value - let hi2p = _mm256_blendv_ps(_mm256_set1_ps(f32::MIN_POSITIVE), hi2p, mask); + let outer_scale = + _mm256_blendv_ps(_mm256_set1_ps(f32::MIN_POSITIVE), hi2p, mask); // replace the subnormal values of scale with the reciprocal of the minimum positive normal value - let scale = + let inner_scale = _mm256_blendv_ps(_mm256_set1_ps(1.0 / f32::MIN_POSITIVE), scale, mask); // create a mask that matches the subnormal hi values let mask = _mm256_cmp_ps::<_CMP_LT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE)); @@ -166,9 +167,9 @@ impl Hypot for Avx2Fma { ); let hi_scaled = _mm256_mul_ps(hi_scaled, hi_scaled); - let lo_scaled = _mm256_mul_ps(lo, scale); + let lo_scaled = _mm256_mul_ps(lo, inner_scale); _mm256_mul_ps( - hi2p, + outer_scale, _mm256_sqrt_ps(_mm256_fmadd_ps(lo_scaled, lo_scaled, hi_scaled)), ) } @@ -299,7 +300,7 @@ impl Hypot for Avx2Fma { //The mantissa mask is the inverse of the exponent mask let mantissa_mask = _mm256_sub_pd( _mm256_set1_pd(f64::MIN_POSITIVE), - _mm256_set1_pd(core::mem::transmute(1_u64)), + _mm256_set1_pd(f64::from_bits(1_u64)), ); // round the hi values down to the nearest power of 2 let hi2p = _mm256_and_pd(hi, exponent_mask); diff --git a/cfavml/src/danger/impl_avx512.rs b/cfavml/src/danger/impl_avx512.rs index b6d34bf..f5567cd 100644 --- a/cfavml/src/danger/impl_avx512.rs +++ b/cfavml/src/danger/impl_avx512.rs @@ -152,7 +152,7 @@ impl Hypot for Avx512 { //The mantissa mask is the inverse of the exponent mask let mantissa_mask = _mm512_sub_ps( _mm512_set1_ps(f32::MIN_POSITIVE), - _mm512_set1_ps(core::mem::transmute(1_u32)), + _mm512_set1_ps(f32::from_bits(1_u32)), ); // round the hi values down to the nearest power of 2 let hi2p = _mm512_and_ps(hi, exponent_mask); @@ -322,7 +322,7 @@ impl Hypot for Avx512 { //The mantissa mask is the inverse of the exponent mask let mantissa_mask = _mm512_sub_pd( _mm512_set1_pd(f64::MIN_POSITIVE), - _mm512_set1_pd(core::mem::transmute(1_u64)), + _mm512_set1_pd(f64::from_bits(1_u64)), ); // round the hi values down to the nearest power of 2 let hi2p = _mm512_and_pd(hi, exponent_mask); diff --git a/cfavml/src/danger/impl_test.rs b/cfavml/src/danger/impl_test.rs index d1d3494..92a6591 100644 --- a/cfavml/src/danger/impl_test.rs +++ b/cfavml/src/danger/impl_test.rs @@ -337,7 +337,7 @@ unsafe fn test_sample( ); let mut res_vec = vec![T::zero(); R::elements_per_lane()]; R::write(res_vec.as_mut_ptr(), res); - test_diff_ulps(std_result, res_vec); + test_diff_ulps(&std_result, &res_vec); } { let (std_result, std_sum) = get_std_results(&large_sample_l1, &large_sample_l2); @@ -353,11 +353,11 @@ unsafe fn test_sample( ); let mut res_vec = vec![T::zero(); R::elements_per_dense()]; R::write_dense(res_vec.as_mut_ptr(), res); - test_diff_ulps(std_result, res_vec); + test_diff_ulps(&std_result, &res_vec); } } -fn get_std_results(sample1: &Vec, sample2: &Vec) -> (Vec, T) +fn get_std_results(sample1: &[T], sample2: &[T]) -> (Vec, T) where T: Float + Debug, AutoMath: Math, @@ -371,7 +371,7 @@ where (std_result, sum) } -fn test_diff_ulps(a: Vec, b: Vec) +fn test_diff_ulps(a: &[T], b: &[T]) where T: Float + FloatConst, { diff --git a/cfavml/src/danger/test_suite.rs b/cfavml/src/danger/test_suite.rs index 2fbe053..82b156a 100644 --- a/cfavml/src/danger/test_suite.rs +++ b/cfavml/src/danger/test_suite.rs @@ -286,13 +286,13 @@ macro_rules! 
test_hypot { paste::paste! { #[test] fn []() { - unsafe { crate::danger::impl_test::test_suite_impl::() } + unsafe { crate::danger::impl_test::hypot_test_impl::() } } } paste::paste! { #[test] fn []() { - unsafe { crate::danger::impl_test::test_suite_impl::() } + unsafe { crate::danger::impl_test::hypot_test_impl::() } } } }; From 408556154dbdfb1f7ed4c46172a6f192be399593 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sun, 24 Nov 2024 10:57:57 -0600 Subject: [PATCH 08/13] added export hypot test for avx2fma, subnormal floats --- cfavml/src/danger/export_hypot.rs | 68 ++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 5 deletions(-) diff --git a/cfavml/src/danger/export_hypot.rs b/cfavml/src/danger/export_hypot.rs index 6e7b3a8..f0ffb12 100644 --- a/cfavml/src/danger/export_hypot.rs +++ b/cfavml/src/danger/export_hypot.rs @@ -8,7 +8,7 @@ use crate::danger::{generic_hypot_vertical, SimdRegister}; use crate::math::{AutoMath, Math, Numeric}; use crate::mem_loader::{IntoMemLoader, MemLoader}; -macro_rules! define_arithmetic_impls { +macro_rules! define_hypot_impls { ( $hypot_name:ident, $imp:ident $(,)? @@ -46,18 +46,25 @@ macro_rules! define_arithmetic_impls { }; } -define_arithmetic_impls!(generic_fallback_hypot_vertical, Fallback,); +define_hypot_impls!(generic_fallback_hypot_vertical, Fallback,); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -define_arithmetic_impls!(generic_avx2_hypot_vertical, Avx2, target_features = "avx2"); +define_hypot_impls!(generic_avx2_hypot_vertical, Avx2, target_features = "avx2"); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +define_hypot_impls!( + generic_avx2fma_hypot_vertical, + Avx2Fma, + target_features = "avx2", + "fma" +); #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "nightly"))] -define_arithmetic_impls!( +define_hypot_impls!( generic_avx512_hypot_vertical, Avx512, target_features = "avx512f", "avx512bw" ); #[cfg(target_arch = "aarch64")] -define_arithmetic_impls!(generic_neon_hypot_vertical, Neon, target_features = "neon"); +define_hypot_impls!(generic_neon_hypot_vertical, Neon, target_features = "neon"); #[cfg(test)] mod tests { @@ -99,6 +106,51 @@ mod tests { let mut result = vec![$t::default(); 533]; unsafe { [< $variant _ $op _vertical >](&l1, &l2, &mut result) }; + let expected = l1.iter() + .copied() + .zip(l2.iter().copied()) + .map(|(a, b)| AutoMath::$op(a, b)) + .collect::>(); + for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) { + let ulps_diff = get_diff_ulps(*expected, *actual); + assert!( + ulps_diff.abs() <= 1, + "result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}", + initial, expected, actual, ulps_diff + ); + } + } + #[test] + fn [< $variant _ $op _subnormal_value_ $t >]() { + let (l1, _) = crate::test_utils::get_subnormal_sample_vectors::<$t>(533); + + let mut result = vec![$t::default(); 533]; + unsafe { [< $variant _ $op _vertical >](&l1, 2 as $t, &mut result) }; + + let expected = l1.iter() + .copied() + .map(|v| AutoMath::$op(v, 2 as $t)) + .collect::>(); + + for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) { + let ulps_diff = get_diff_ulps(*expected, *actual); + assert!( + ulps_diff.abs() <= 1, + "result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}", + initial, expected, actual, ulps_diff + ); + + } + + } + + #[test] + fn [< $variant _ $op _subnormal_vector_ $t >]() { + let (l1, l2) = 
crate::test_utils::get_subnormal_sample_vectors::<$t>(533); + + let mut result = vec![$t::default(); 533]; + unsafe { [< $variant _ $op _vertical >](&l1, &l2, &mut result) }; + let expected = l1.iter() .copied() .zip(l2.iter().copied()) @@ -143,6 +195,12 @@ mod tests { target_feature = "avx2" ))] define_numeric_test!(generic_avx2, types = f32, f64,); + #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx2", + target_feature = "fma" + ))] + define_numeric_test!(generic_avx2fma, types = f32, f64,); #[cfg(all( any(target_arch = "x86", target_arch = "x86_64"), feature = "nightly", From c8fca55ae0bdff1996895871247401ea51b9912c Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 7 Dec 2024 12:39:11 -0600 Subject: [PATCH 09/13] apply changes from code review --- cfavml/src/danger/core_simd_api.rs | 2 ++ cfavml/src/danger/export_hypot.rs | 2 +- cfavml/src/danger/impl_avx2.rs | 29 ++++++++----------- cfavml/src/danger/impl_avx2fma.rs | 24 +++++---------- cfavml/src/danger/op_hypot.rs | 4 ++- .../export_docs/arithmetic_hypot_vertical.md | 10 +++++-- cfavml/src/math/default.rs | 2 +- 7 files changed, 34 insertions(+), 39 deletions(-) diff --git a/cfavml/src/danger/core_simd_api.rs b/cfavml/src/danger/core_simd_api.rs index e2fa9c0..7afc243 100644 --- a/cfavml/src/danger/core_simd_api.rs +++ b/cfavml/src/danger/core_simd_api.rs @@ -414,6 +414,8 @@ where /// and a point (`x`, `y`) on the Euclidean plane. unsafe fn hypot(a: Self::Register, b: Self::Register) -> Self::Register; #[inline(always)] + + /// Perform a element wise hypot on two dense lanes. unsafe fn hypot_dense( a: DenseLane, b: DenseLane, diff --git a/cfavml/src/danger/export_hypot.rs b/cfavml/src/danger/export_hypot.rs index f0ffb12..8226492 100644 --- a/cfavml/src/danger/export_hypot.rs +++ b/cfavml/src/danger/export_hypot.rs @@ -207,6 +207,6 @@ mod tests { target_feature = "avx512f" ))] define_numeric_test!(generic_avx512, types = f32, f64,); - #[cfg(target_arch = "aarch64")] + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] define_numeric_test!(generic_neon, types = f32, f64,); } diff --git a/cfavml/src/danger/impl_avx2.rs b/cfavml/src/danger/impl_avx2.rs index e9ef479..363e392 100644 --- a/cfavml/src/danger/impl_avx2.rs +++ b/cfavml/src/danger/impl_avx2.rs @@ -174,15 +174,14 @@ impl Hypot for Avx2 { #[inline(always)] unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { // convert the inputs to absolute values - let (x, y) = ( - _mm256_andnot_ps(_mm256_set1_ps(-0.0), x), - _mm256_andnot_ps(_mm256_set1_ps(-0.0), y), - ); + let x = _mm256_andnot_ps(_mm256_set1_ps(-0.0), x); + + let y = _mm256_andnot_ps(_mm256_set1_ps(-0.0), y); + // find the max and min of the two inputs - let (hi, lo) = ( - >::max(x, y), - >::min(x, y), - ); + let hi = >::max(x, y); + let lo = >::min(x, y); + //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa let exponent_mask = _mm256_set1_ps(f32::INFINITY); //The mantissa mask is the inverse of the exponent mask @@ -2782,15 +2781,11 @@ impl Hypot for Avx2 { #[inline(always)] unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { // convert the inputs to absolute values - let (x, y) = ( - _mm256_andnot_pd(_mm256_set1_pd(-0.0), x), - _mm256_andnot_pd(_mm256_set1_pd(-0.0), y), - ); - // find the max and min of the two inputs - let (hi, lo) = ( - >::max(x, y), - >::min(x, y), - ); + + let x = _mm256_andnot_pd(_mm256_set1_pd(-0.0), x); + let y = _mm256_andnot_pd(_mm256_set1_pd(-0.0), y); + let hi = 
>::max(x, y); + let lo = >::min(x, y); //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa let exponent_mask = _mm256_set1_pd(f64::INFINITY); //The mantissa mask is the inverse of the exponent mask diff --git a/cfavml/src/danger/impl_avx2fma.rs b/cfavml/src/danger/impl_avx2fma.rs index 7f68335..573056f 100644 --- a/cfavml/src/danger/impl_avx2fma.rs +++ b/cfavml/src/danger/impl_avx2fma.rs @@ -124,15 +124,11 @@ impl Hypot for Avx2Fma { #[inline(always)] unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { // convert the inputs to absolute values - let (x, y) = ( - _mm256_andnot_ps(_mm256_set1_ps(-0.0), x), - _mm256_andnot_ps(_mm256_set1_ps(-0.0), y), - ); + let x = _mm256_andnot_ps(_mm256_set1_ps(-0.0), x); + let y = _mm256_andnot_ps(_mm256_set1_ps(-0.0), y); // find the max and min of the two inputs - let (hi, lo) = ( - >::max(x, y), - >::min(x, y), - ); + let hi = >::max(x, y); + let lo = >::min(x, y); //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa let exponent_mask = _mm256_set1_ps(f32::INFINITY); //The mantissa mask is the inverse of the exponent mask @@ -286,15 +282,11 @@ impl Hypot for Avx2Fma { #[inline(always)] unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { // convert the inputs to absolute values - let (x, y) = ( - _mm256_andnot_pd(_mm256_set1_pd(-0.0), x), - _mm256_andnot_pd(_mm256_set1_pd(-0.0), y), - ); + let x = _mm256_andnot_pd(_mm256_set1_pd(-0.0), x); + let y = _mm256_andnot_pd(_mm256_set1_pd(-0.0), y); // find the max and min of the two inputs - let (hi, lo) = ( - >::max(x, y), - >::min(x, y), - ); + let hi = >::max(x, y); + let lo = >::min(x, y); //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa let exponent_mask = _mm256_set1_pd(f64::INFINITY); //The mantissa mask is the inverse of the exponent mask diff --git a/cfavml/src/danger/op_hypot.rs b/cfavml/src/danger/op_hypot.rs index 83335b9..b2b440f 100644 --- a/cfavml/src/danger/op_hypot.rs +++ b/cfavml/src/danger/op_hypot.rs @@ -6,13 +6,15 @@ use crate::math::{Math, Numeric}; use crate::mem_loader::{IntoMemLoader, MemLoader}; #[inline(always)] -/// A generic vector hypot over one vector and single value. +/// A generic vector implementation of hypot over one vector and single value. /// /// # Safety /// +/// The safety of hypot relies on the safety of the implementation of SimdRegister /// The sizes of `a`, `b` and `result` must be equal to `dims`, the safety requirements of /// `M` definition the basic math operations and the requirements of `R` SIMD register /// must also be followed. + pub unsafe fn generic_hypot_vertical( a: B1, b: B2, diff --git a/cfavml/src/export_docs/arithmetic_hypot_vertical.md b/cfavml/src/export_docs/arithmetic_hypot_vertical.md index ef7f662..223abbd 100644 --- a/cfavml/src/export_docs/arithmetic_hypot_vertical.md +++ b/cfavml/src/export_docs/arithmetic_hypot_vertical.md @@ -1,6 +1,6 @@ Performs an elementwise hypotenuse of input buffers `a` and `b` that can be projected to the desired output size of `result`. Implementation is an appoximation that -_should_ match std::hypot in most cases. However, with some inputs it's been confirmed to be off by 1 ulp. +_should_ match std::hypot in most cases. However, with some inputs it's been confirmed to be off by 1 ulp. 
Note: for `no_std` builds the result will be off more significantly for fallback ### Projecting Vectors @@ -16,13 +16,17 @@ of `128` elements in length must take a result buffer of `128` elements in lengt ### Implementation Pseudocode -_This is the logic of the routine being called._ +_This is the (simplified) logic of the routine being called._ ```ignore result = [0; dims] +// assume |a|<|b| for all a,b for i in range(dims): - result[i] = a[i].hypot(b[i]) + let hi = a[i].abs() + let lo = b[i].abs() + let scale = 1/hi + result[i] = hi * sqrt(lo*scale + 1) return result ``` diff --git a/cfavml/src/math/default.rs b/cfavml/src/math/default.rs index c538f8e..6023462 100644 --- a/cfavml/src/math/default.rs +++ b/cfavml/src/math/default.rs @@ -250,7 +250,7 @@ impl Numeric for StdMath { #[cfg(not(feature = "std"))] { // Fall back to direct calculation - f32_sqrt_fast((a * a + b * b) as f32) as f64 + StdMath::sqrt(a * a + b * b) } } } From 511e1da0993bfdee2c8adeb74fcde6b3284743ae Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 7 Dec 2024 12:46:23 -0600 Subject: [PATCH 10/13] apply changes from code review --- cfavml/src/danger/impl_avx512.rs | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/cfavml/src/danger/impl_avx512.rs b/cfavml/src/danger/impl_avx512.rs index f5567cd..fc4ef33 100644 --- a/cfavml/src/danger/impl_avx512.rs +++ b/cfavml/src/danger/impl_avx512.rs @@ -138,15 +138,11 @@ impl Hypot for Avx512 { #[inline(always)] unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { // convert the inputs to absolute values - let (x, y) = ( - _mm512_andnot_ps(_mm512_set1_ps(-0.0), x), - _mm512_andnot_ps(_mm512_set1_ps(-0.0), y), - ); + let x = _mm512_andnot_ps(_mm512_set1_ps(-0.0), x); + let y = _mm512_andnot_ps(_mm512_set1_ps(-0.0), y); // find the max and min of the two inputs - let (hi, lo) = ( - >::max(x, y), - >::min(x, y), - ); + let hi = >::max(x, y); + let lo = >::min(x, y); //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa let exponent_mask = _mm512_set1_ps(f32::INFINITY); //The mantissa mask is the inverse of the exponent mask @@ -308,15 +304,11 @@ impl Hypot for Avx512 { #[inline(always)] unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { // convert the inputs to absolute values - let (x, y) = ( - _mm512_andnot_pd(_mm512_set1_pd(-0.0), x), - _mm512_andnot_pd(_mm512_set1_pd(-0.0), y), - ); + let x = _mm512_andnot_pd(_mm512_set1_pd(-0.0), x); + let y = _mm512_andnot_pd(_mm512_set1_pd(-0.0), y); // find the max and min of the two inputs - let (hi, lo) = ( - >::max(x, y), - >::min(x, y), - ); + let hi = >::max(x, y); + let lo = >::min(x, y); //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa let exponent_mask = _mm512_set1_pd(f64::INFINITY); //The mantissa mask is the inverse of the exponent mask From c1120d86483f6837be6118b34f661cada9336b43 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 7 Dec 2024 12:51:02 -0600 Subject: [PATCH 11/13] apply changes from code review --- cfavml/src/danger/impl_neon.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cfavml/src/danger/impl_neon.rs b/cfavml/src/danger/impl_neon.rs index ef60095..7f6309b 100644 --- a/cfavml/src/danger/impl_neon.rs +++ b/cfavml/src/danger/impl_neon.rs @@ -152,10 +152,12 @@ impl Hypot for Neon { #[inline(always)] unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { // Convert inputs to absolute values - let (x, y) = 
(vabsq_f32(x), vabsq_f32(y)); + let x = vabsq_f32(x); + let y = vabsq_f32(y); // Find the max and min of the two inputs - let (hi, lo) = (vmaxq_f32(x, y), vminq_f32(x, y)); + let hi = vmaxq_f32(x, y); + let lo = vminq_f32(x, y); let exponent_mask = vdupq_n_u32(EXPONENT_MASK_F32); let mantissa_mask = vdupq_n_u32(MANTISSA_MASK_F32); @@ -327,10 +329,12 @@ impl Hypot for Neon { #[inline(always)] unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { // Convert inputs to absolute values - let (x, y) = (vabsq_f64(x), vabsq_f64(y)); + let x = vabsq_f64(x); + let y = vabsq_f64(y); // Find the max and min of the two inputs - let (hi, lo) = (vmaxq_f64(x, y), vminq_f64(x, y)); + let hi = vmaxq_f64(x, y); + let lo = vminq_f64(x, y); let exponent_mask = vdupq_n_u64(f64::INFINITY.to_bits()); let mantissa_mask = vdupq_n_u64((f64::MIN_POSITIVE - mem::transmute::(1)).to_bits()); From dbd00fcb6a3aef32e1b6c2e505d7c7386d31d706 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 7 Dec 2024 14:13:43 -0600 Subject: [PATCH 12/13] changed name of numeric trait to MiscFloatOps --- cfavml/src/lib.rs | 2 +- ...t_numeric_ops.rs => safe_trait_misc_float_ops.rs} | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) rename cfavml/src/{safe_trait_numeric_ops.rs => safe_trait_misc_float_ops.rs} (94%) diff --git a/cfavml/src/lib.rs b/cfavml/src/lib.rs index 19affdd..1fd3bb8 100644 --- a/cfavml/src/lib.rs +++ b/cfavml/src/lib.rs @@ -23,7 +23,7 @@ pub mod safe_trait_agg_ops; pub mod safe_trait_arithmetic_ops; pub mod safe_trait_cmp_ops; pub mod safe_trait_distance_ops; -pub mod safe_trait_numeric_ops; +pub mod safe_trait_misc_float_ops; #[cfg(test)] mod test_utils; diff --git a/cfavml/src/safe_trait_numeric_ops.rs b/cfavml/src/safe_trait_misc_float_ops.rs similarity index 94% rename from cfavml/src/safe_trait_numeric_ops.rs rename to cfavml/src/safe_trait_misc_float_ops.rs index c442fc0..a805272 100644 --- a/cfavml/src/safe_trait_numeric_ops.rs +++ b/cfavml/src/safe_trait_misc_float_ops.rs @@ -7,8 +7,8 @@ use crate::buffer::WriteOnlyBuffer; use crate::danger::export_hypot; use crate::mem_loader::{IntoMemLoader, MemLoader}; -/// Various arithmetic operations over vectors. -pub trait NumericOps: Sized + Copy { +/// Various float operations over vectors. +pub trait MiscFloatOps: Sized + Copy { /// Performs an elementwise hypotenuse of input buffers `a` and `b` that can /// be projected to the desired output size of `result`. Implementation is an appoximation that /// _should_ match std::hypot in most cases. However, with some inputs it's been confirmed to be off by 1 ulp. @@ -56,9 +56,9 @@ pub trait NumericOps: Sized + Copy { for<'a> &'a mut [B3]: WriteOnlyBuffer; } -macro_rules! numeric_ops { +macro_rules! misc_float_ops { ($t:ty) => { - impl NumericOps for $t { + impl MiscFloatOps for $t { fn hypot_vertical(lhs: B1, rhs: B2, result: &mut [B3]) where B1: IntoMemLoader, @@ -81,5 +81,5 @@ macro_rules! 
numeric_ops { }; } -numeric_ops!(f32); -numeric_ops!(f64); +misc_float_ops!(f32); +misc_float_ops!(f64); From 446c6b6dd8f49cf9af5452c55aa6fce64fe3a329 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Tue, 17 Dec 2024 18:11:33 -0600 Subject: [PATCH 13/13] fixed issue with "out of order" arguments to neon fma functions --- cfavml/src/danger/impl_neon.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cfavml/src/danger/impl_neon.rs b/cfavml/src/danger/impl_neon.rs index 7f6309b..55c34f1 100644 --- a/cfavml/src/danger/impl_neon.rs +++ b/cfavml/src/danger/impl_neon.rs @@ -189,7 +189,7 @@ impl Hypot for Neon { // finally, do the thing let hi_scaled = vmulq_f32(hi_scaled, hi_scaled); let lo_scaled = vmulq_f32(lo, scale); - vmulq_f32(hi2p, vsqrtq_f32(vfmaq_f32(lo_scaled, lo_scaled, hi_scaled))) + vmulq_f32(hi2p, vsqrtq_f32(vfmaq_f32(hi_scaled, lo_scaled, lo_scaled))) } } @@ -367,7 +367,7 @@ impl Hypot for Neon { // finally, do the thing let hi_scaled = vmulq_f64(hi_scaled, hi_scaled); let lo_scaled = vmulq_f64(lo, scale); - vmulq_f64(hi2p, vsqrtq_f64(vfmaq_f64(lo_scaled, lo_scaled, hi_scaled))) + vmulq_f64(hi2p, vsqrtq_f64(vfmaq_f64(hi_scaled, lo_scaled, lo_scaled))) } }
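
A note on the final fix above: the NEON intrinsic `vfmaq_f32(acc, m1, m2)` computes `acc + m1 * m2`, taking the accumulator first, whereas `_mm256_fmadd_ps(m1, m2, acc)` computes `m1 * m2 + acc`. With the old argument order the NEON kernels evaluated `lo_scaled + lo_scaled * hi_scaled` instead of the intended `hi_scaled + lo_scaled * lo_scaled` (where `hi_scaled` has already been squared at that point). A minimal scalar sketch of the intended accumulation; the helper name is illustrative, not part of the crate:

```rust
/// `hi_sq` is the already-squared hi term; returns `lo_scaled^2 + hi_sq` with a
/// single rounding, mirroring `vfmaq_f32(hi_sq, lo_scaled, lo_scaled)`.
fn fused_sum_of_squares(hi_sq: f32, lo_scaled: f32) -> f32 {
    // f32::mul_add(self, a, b) = self * a + b, the same operand order as _mm256_fmadd_ps.
    lo_scaled.mul_add(lo_scaled, hi_sq)
}
```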
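
For reference, the scaling scheme shared by the AVX2, AVX-512 and NEON kernels can be written as a scalar routine. This is a sketch for illustration only: it mirrors the bit manipulation of the SIMD implementations but omits NaN/infinity handling, assumes default (non-flush-to-zero) subnormal arithmetic, and the function name is hypothetical rather than part of cfavml.

```rust
/// Scalar model of the SIMD hypot kernels (f32 shown).
fn hypot_scaled(x: f32, y: f32) -> f32 {
    let (x, y) = (x.abs(), y.abs());
    let (hi, lo) = if x > y { (x, y) } else { (y, x) };

    let exponent_mask = f32::INFINITY.to_bits(); // 0x7F80_0000
    let mantissa_mask = (f32::MIN_POSITIVE - f32::from_bits(1)).to_bits(); // 0x007F_FFFF

    // `hi` rounded down to a power of two: keep only its exponent bits.
    let hi2p = f32::from_bits(hi.to_bits() & exponent_mask);
    // Reciprocal of that power of two: double it, then flip the exponent bits.
    let scale = f32::from_bits((hi2p + hi2p).to_bits() ^ exponent_mask);

    // Subnormal `hi` has no exponent bits set, so fall back to the smallest normal.
    let normal = hi > f32::MIN_POSITIVE;
    let outer_scale = if normal { hi2p } else { f32::MIN_POSITIVE };
    let inner_scale = if normal { scale } else { 1.0 / f32::MIN_POSITIVE };

    // hi / outer_scale, built from the mantissa bits; subnormals need the -1.0 fix-up.
    let mut hi_scaled = f32::from_bits((hi.to_bits() & mantissa_mask) | 1.0f32.to_bits());
    if hi < f32::MIN_POSITIVE {
        hi_scaled -= 1.0;
    }

    let lo_scaled = lo * inner_scale;
    outer_scale * (hi_scaled * hi_scaled + lo_scaled * lo_scaled).sqrt()
}
```

The key point is that `hi2p` is an exact power of two, so dividing by it only shifts exponents and loses no precision; this keeps `lo_scaled` from underflowing (or `hi * hi` from overflowing) before the squares are summed under the square root.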
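
Finally, a usage sketch of the renamed safe trait. It assumes, as the export tests above do, that plain `Vec`/slice arguments satisfy the `IntoMemLoader` and `WriteOnlyBuffer` bounds; the input values are illustrative.

```rust
use cfavml::safe_trait_misc_float_ops::MiscFloatOps;

fn main() {
    let a = vec![3.0f32, 5.0, 8.0, 7.0];
    let b = vec![4.0f32, 12.0, 15.0, 24.0];
    let mut out = vec![0.0f32; 4];

    // Elementwise hypot; dispatch! selects the AVX-512/AVX2/NEON kernel or the fallback.
    f32::hypot_vertical(&a, &b, &mut out);

    // The result may differ from std::hypot by up to 1 ULP, so compare approximately.
    for (got, want) in out.iter().zip([5.0f32, 13.0, 17.0, 25.0]) {
        assert!((got - want).abs() <= want * f32::EPSILON);
    }
}
```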