From a3433070578fbe1e4b02fe8f32c8a6264416c854 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 9 Nov 2024 16:47:23 -0600 Subject: [PATCH 01/13] hypotenuse function for f32, f64 --- cfavml/src/danger/core_simd_api.rs | 16 ++++ cfavml/src/danger/impl_avx2.rs | 107 ++++++++++++++++++++++++++- cfavml/src/danger/impl_avx2fma.rs | 112 +++++++++++++++++++++++++++- cfavml/src/danger/impl_avx512.rs | 115 ++++++++++++++++++++++++++++- cfavml/src/danger/impl_fallback.rs | 16 +++- cfavml/src/danger/impl_neon.rs | 7 ++ cfavml/src/danger/impl_test.rs | 89 +++++++++++++++++++++- cfavml/src/danger/test_suite.rs | 20 +++++ cfavml/src/math/default.rs | 21 ++++++ cfavml/src/math/mod.rs | 2 +- cfavml/src/test_utils.rs | 37 +++++++++- 11 files changed, 535 insertions(+), 7 deletions(-) diff --git a/cfavml/src/danger/core_simd_api.rs b/cfavml/src/danger/core_simd_api.rs index cbc344e..e2fa9c0 100644 --- a/cfavml/src/danger/core_simd_api.rs +++ b/cfavml/src/danger/core_simd_api.rs @@ -405,3 +405,19 @@ pub trait SimdRegister { Self::write(mem.add(Self::elements_per_lane() * 7), lane.h); } } + +pub trait Hypot: SimdRegister +where + T: Copy, +{ + /// SIMD Variant of the std hypot function. Computes the distance between the origin + /// and a point (`x`, `y`) on the Euclidean plane. + unsafe fn hypot(a: Self::Register, b: Self::Register) -> Self::Register; + #[inline(always)] + unsafe fn hypot_dense( + a: DenseLane, + b: DenseLane, + ) -> DenseLane { + apply_dense!(Self::hypot, a, b) + } +} diff --git a/cfavml/src/danger/impl_avx2.rs b/cfavml/src/danger/impl_avx2.rs index e8f7303..56ca639 100644 --- a/cfavml/src/danger/impl_avx2.rs +++ b/cfavml/src/danger/impl_avx2.rs @@ -5,7 +5,7 @@ use core::arch::x86_64::*; use core::iter::zip; use core::mem; -use super::core_simd_api::{DenseLane, SimdRegister}; +use super::core_simd_api::{DenseLane, Hypot, SimdRegister}; use crate::apply_dense; /// AVX2 enabled SIMD operations. @@ -169,6 +169,59 @@ impl SimdRegister for Avx2 { } } +impl Hypot for Avx2 { + //b' * sqrt((a*b_hat)^2 + (b*b_hat)^2), where |b| > |a|, b'=(2^n Self::Register { + // convert the inputs to absolute values + let (x, y) = ( + _mm256_andnot_ps(_mm256_set1_ps(-0.0), x), + _mm256_andnot_ps(_mm256_set1_ps(-0.0), y), + ); + // find the max and min of the two inputs + let (hi, lo) = ( + >::max(x, y), + >::min(x, y), + ); + //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa + let exponent_mask = _mm256_set1_ps(f32::INFINITY); + //The mantissa mask is the inverse of the exponent mask + let mantissa_mask = _mm256_sub_ps( + _mm256_set1_ps(f32::MIN_POSITIVE), + _mm256_set1_ps(core::mem::transmute(1_u32)), + ); + // round the hi values down to the nearest power of 2 + let hi2p = _mm256_and_ps(hi, exponent_mask); + // we scale the values inside the root by the reciprocal of hi2p. 
since it's a power of 2, + // we can double it and xor it with the exponent mask + let scale = _mm256_xor_ps(_mm256_add_ps(hi2p, hi2p), exponent_mask); + + // create a mask that matches the normal hi values + let mask = _mm256_cmp_ps::<_CMP_GT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE)); + // replace the subnormal values of hi2p with the minimum positive normal value + let hi2p = _mm256_blendv_ps(_mm256_set1_ps(f32::MIN_POSITIVE), hi2p, mask); + // replace the subnormal values of scale with the reciprocal of the minimum positive normal value + let scale = + _mm256_blendv_ps(_mm256_set1_ps(1.0 / f32::MIN_POSITIVE), scale, mask); + // create a mask that matches the subnormal hi values + let mask = _mm256_cmp_ps::<_CMP_LT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE)); + // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1 + let hi_scaled = + _mm256_or_ps(_mm256_and_ps(hi, mantissa_mask), _mm256_set1_ps(1.0)); + // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values + let hi_scaled = _mm256_blendv_ps( + hi_scaled, + _mm256_sub_ps(hi_scaled, _mm256_set1_ps(1.0)), + mask, + ); + + let hi_scaled = _mm256_mul_ps(hi_scaled, hi_scaled); + let lo_scaled = _mm256_mul_ps(lo, scale); + let lo_scaled = _mm256_mul_ps(lo_scaled, lo_scaled); + _mm256_mul_ps(hi2p, _mm256_sqrt_ps(_mm256_add_ps(lo_scaled, hi_scaled))) + } +} + impl SimdRegister for Avx2 { type Register = __m256d; @@ -2724,3 +2777,55 @@ impl SimdRegister for Avx2 { _mm256_storeu_si256(mem.cast(), reg) } } + +impl Hypot for Avx2 { + #[inline(always)] + unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { + // convert the inputs to absolute values + let (x, y) = ( + _mm256_andnot_pd(_mm256_set1_pd(-0.0), x), + _mm256_andnot_pd(_mm256_set1_pd(-0.0), y), + ); + // find the max and min of the two inputs + let (hi, lo) = ( + >::max(x, y), + >::min(x, y), + ); + //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa + let exponent_mask = _mm256_set1_pd(f64::INFINITY); + //The mantissa mask is the inverse of the exponent mask + let mantissa_mask = _mm256_sub_pd( + _mm256_set1_pd(f64::MIN_POSITIVE), + _mm256_set1_pd(core::mem::transmute(1_u64)), + ); + // round the hi values down to the nearest power of 2 + let hi2p = _mm256_and_pd(hi, exponent_mask); + // we scale the values inside the root by the reciprocal of hi2p. 
since it's a power of 2, + // we can double it and xor it with the exponent mask + let scale = _mm256_xor_pd(_mm256_add_pd(hi2p, hi2p), exponent_mask); + + // create a mask that matches the normal hi values + let mask = _mm256_cmp_pd::<_CMP_GT_OQ>(hi, _mm256_set1_pd(f64::MIN_POSITIVE)); + // replace the subnormal values of hi2p with the minimum positive normal value + let hi2p = _mm256_blendv_pd(_mm256_set1_pd(f64::MIN_POSITIVE), hi2p, mask); + // replace the subnormal values of scale with the reciprocal of the minimum positive normal value + let scale = + _mm256_blendv_pd(_mm256_set1_pd(1.0 / f64::MIN_POSITIVE), scale, mask); + // create a mask that matches the subnormal hi values + let mask = _mm256_cmp_pd::<_CMP_LT_OQ>(hi, _mm256_set1_pd(f64::MIN_POSITIVE)); + // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1 + let hi_scaled = + _mm256_or_pd(_mm256_and_pd(hi, mantissa_mask), _mm256_set1_pd(1.0)); + // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values + let hi_scaled = _mm256_blendv_pd( + hi_scaled, + _mm256_sub_pd(hi_scaled, _mm256_set1_pd(1.0)), + mask, + ); + + let hi_scaled = _mm256_mul_pd(hi_scaled, hi_scaled); + let lo_scaled = _mm256_mul_pd(lo, scale); + let lo_scaled = _mm256_mul_pd(lo_scaled, lo_scaled); + _mm256_mul_pd(hi2p, _mm256_sqrt_pd(_mm256_add_pd(lo_scaled, hi_scaled))) + } +} diff --git a/cfavml/src/danger/impl_avx2fma.rs b/cfavml/src/danger/impl_avx2fma.rs index f24cfcd..83da6a6 100644 --- a/cfavml/src/danger/impl_avx2fma.rs +++ b/cfavml/src/danger/impl_avx2fma.rs @@ -3,7 +3,7 @@ use core::arch::x86::*; #[cfg(target_arch = "x86_64")] use core::arch::x86_64::*; -use super::core_simd_api::SimdRegister; +use super::core_simd_api::{Hypot, SimdRegister}; use super::impl_avx2::Avx2; /// AVX2 & FMA enabled SIMD operations. @@ -119,6 +119,61 @@ impl SimdRegister for Avx2Fma { } } +impl Hypot for Avx2Fma { + //b' * sqrt((a*b_hat)^2 + (b*b_hat)^2), where |b| > |a|, b'=(2^n Self::Register { + // convert the inputs to absolute values + let (x, y) = ( + _mm256_andnot_ps(_mm256_set1_ps(-0.0), x), + _mm256_andnot_ps(_mm256_set1_ps(-0.0), y), + ); + // find the max and min of the two inputs + let (hi, lo) = ( + >::max(x, y), + >::min(x, y), + ); + //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa + let exponent_mask = _mm256_set1_ps(f32::INFINITY); + //The mantissa mask is the inverse of the exponent mask + let mantissa_mask = _mm256_sub_ps( + _mm256_set1_ps(f32::MIN_POSITIVE), + _mm256_set1_ps(core::mem::transmute(1_u32)), + ); + // round the hi values down to the nearest power of 2 + let hi2p = _mm256_and_ps(hi, exponent_mask); + // we scale the values inside the root by the reciprocal of hi2p. 
since it's a power of 2, + // we can double it and xor it with the exponent mask + let scale = _mm256_xor_ps(_mm256_add_ps(hi2p, hi2p), exponent_mask); + + // create a mask that matches the normal hi values + let mask = _mm256_cmp_ps::<_CMP_GT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE)); + // replace the subnormal values of hi2p with the minimum positive normal value + let hi2p = _mm256_blendv_ps(_mm256_set1_ps(f32::MIN_POSITIVE), hi2p, mask); + // replace the subnormal values of scale with the reciprocal of the minimum positive normal value + let scale = + _mm256_blendv_ps(_mm256_set1_ps(1.0 / f32::MIN_POSITIVE), scale, mask); + // create a mask that matches the subnormal hi values + let mask = _mm256_cmp_ps::<_CMP_LT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE)); + // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1 + let hi_scaled = + _mm256_or_ps(_mm256_and_ps(hi, mantissa_mask), _mm256_set1_ps(1.0)); + // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values + let hi_scaled = _mm256_blendv_ps( + hi_scaled, + _mm256_sub_ps(hi_scaled, _mm256_set1_ps(1.0)), + mask, + ); + + let hi_scaled = _mm256_mul_ps(hi_scaled, hi_scaled); + let lo_scaled = _mm256_mul_ps(lo, scale); + _mm256_mul_ps( + hi2p, + _mm256_sqrt_ps(_mm256_fmadd_ps(lo_scaled, lo_scaled, hi_scaled)), + ) + } +} + impl SimdRegister for Avx2Fma { type Register = __m256d; @@ -225,3 +280,58 @@ impl SimdRegister for Avx2Fma { Avx2::write(mem, reg) } } + +impl Hypot for Avx2Fma { + #[inline(always)] + unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { + // convert the inputs to absolute values + let (x, y) = ( + _mm256_andnot_pd(_mm256_set1_pd(-0.0), x), + _mm256_andnot_pd(_mm256_set1_pd(-0.0), y), + ); + // find the max and min of the two inputs + let (hi, lo) = ( + >::max(x, y), + >::min(x, y), + ); + //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa + let exponent_mask = _mm256_set1_pd(f64::INFINITY); + //The mantissa mask is the inverse of the exponent mask + let mantissa_mask = _mm256_sub_pd( + _mm256_set1_pd(f64::MIN_POSITIVE), + _mm256_set1_pd(core::mem::transmute(1_u64)), + ); + // round the hi values down to the nearest power of 2 + let hi2p = _mm256_and_pd(hi, exponent_mask); + // we scale the values inside the root by the reciprocal of hi2p. 
since it's a power of 2, + // we can double it and xor it with the exponent mask + let scale = _mm256_xor_pd(_mm256_add_pd(hi2p, hi2p), exponent_mask); + + // create a mask that matches the normal hi values + let mask = _mm256_cmp_pd::<_CMP_GT_OQ>(hi, _mm256_set1_pd(f64::MIN_POSITIVE)); + // replace the subnormal values of hi2p with the minimum positive normal value + let hi2p = _mm256_blendv_pd(_mm256_set1_pd(f64::MIN_POSITIVE), hi2p, mask); + // replace the subnormal values of scale with the reciprocal of the minimum positive normal value + let scale = + _mm256_blendv_pd(_mm256_set1_pd(1.0 / f64::MIN_POSITIVE), scale, mask); + // create a mask that matches the subnormal hi values + let mask = _mm256_cmp_pd::<_CMP_LT_OQ>(hi, _mm256_set1_pd(f64::MIN_POSITIVE)); + // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1 + let hi_scaled = + _mm256_or_pd(_mm256_and_pd(hi, mantissa_mask), _mm256_set1_pd(1.0)); + // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values + let hi_scaled = _mm256_blendv_pd( + hi_scaled, + _mm256_sub_pd(hi_scaled, _mm256_set1_pd(1.0)), + mask, + ); + + let hi_scaled = _mm256_mul_pd(hi_scaled, hi_scaled); + let lo_scaled = _mm256_mul_pd(lo, scale); + + _mm256_mul_pd( + hi2p, + _mm256_sqrt_pd(_mm256_fmadd_pd(lo_scaled, lo_scaled, hi_scaled)), + ) + } +} diff --git a/cfavml/src/danger/impl_avx512.rs b/cfavml/src/danger/impl_avx512.rs index 726fe46..b6d34bf 100644 --- a/cfavml/src/danger/impl_avx512.rs +++ b/cfavml/src/danger/impl_avx512.rs @@ -5,7 +5,7 @@ use core::arch::x86_64::*; use core::iter::zip; use core::mem; -use super::core_simd_api::{DenseLane, SimdRegister}; +use super::core_simd_api::{DenseLane, Hypot, SimdRegister}; use super::impl_avx2::Avx2; use crate::apply_dense; @@ -133,6 +133,63 @@ impl SimdRegister for Avx512 { } } +impl Hypot for Avx512 { + //b' * sqrt((a*b_hat)^2 + (b*b_hat)^2), where |b| > |a|, b'=(2^n Self::Register { + // convert the inputs to absolute values + let (x, y) = ( + _mm512_andnot_ps(_mm512_set1_ps(-0.0), x), + _mm512_andnot_ps(_mm512_set1_ps(-0.0), y), + ); + // find the max and min of the two inputs + let (hi, lo) = ( + >::max(x, y), + >::min(x, y), + ); + //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa + let exponent_mask = _mm512_set1_ps(f32::INFINITY); + //The mantissa mask is the inverse of the exponent mask + let mantissa_mask = _mm512_sub_ps( + _mm512_set1_ps(f32::MIN_POSITIVE), + _mm512_set1_ps(core::mem::transmute(1_u32)), + ); + // round the hi values down to the nearest power of 2 + let hi2p = _mm512_and_ps(hi, exponent_mask); + // we scale the values inside the root by the reciprocal of hi2p. 
since it's a power of 2, + // we can double it and xor it with the exponent mask + let scale = _mm512_xor_ps(_mm512_add_ps(hi2p, hi2p), exponent_mask); + + // create a mask that matches the normal hi values + let mask = + _mm512_cmp_ps_mask::<_CMP_GT_OQ>(hi, _mm512_set1_ps(f32::MIN_POSITIVE)); + // replace the subnormal values of hi2p with the minimum positive normal value + let hi2p = _mm512_mask_blend_ps(mask, _mm512_set1_ps(f32::MIN_POSITIVE), hi2p); + // replace the subnormal values of scale with the reciprocal of the minimum positive normal value + let scale = + _mm512_mask_blend_ps(mask, _mm512_set1_ps(1.0 / f32::MIN_POSITIVE), scale); + // create a mask that matches the subnormal hi values + let mask = + _mm512_cmp_ps_mask::<_CMP_LT_OQ>(hi, _mm512_set1_ps(f32::MIN_POSITIVE)); + // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1 + let hi_scaled = + _mm512_or_ps(_mm512_and_ps(hi, mantissa_mask), _mm512_set1_ps(1.0)); + // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values + let hi_scaled = _mm512_mask_blend_ps( + mask, + hi_scaled, + _mm512_sub_ps(hi_scaled, _mm512_set1_ps(1.0)), + ); + + let hi_scaled = _mm512_mul_ps(hi_scaled, hi_scaled); + let lo_scaled = _mm512_mul_ps(lo, scale); + _mm512_mul_ps( + hi2p, + _mm512_sqrt_ps(_mm512_fmadd_ps(lo_scaled, lo_scaled, hi_scaled)), + ) + } +} + impl SimdRegister for Avx512 { type Register = __m512d; @@ -247,6 +304,62 @@ impl SimdRegister for Avx512 { } } +impl Hypot for Avx512 { + #[inline(always)] + unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { + // convert the inputs to absolute values + let (x, y) = ( + _mm512_andnot_pd(_mm512_set1_pd(-0.0), x), + _mm512_andnot_pd(_mm512_set1_pd(-0.0), y), + ); + // find the max and min of the two inputs + let (hi, lo) = ( + >::max(x, y), + >::min(x, y), + ); + //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa + let exponent_mask = _mm512_set1_pd(f64::INFINITY); + //The mantissa mask is the inverse of the exponent mask + let mantissa_mask = _mm512_sub_pd( + _mm512_set1_pd(f64::MIN_POSITIVE), + _mm512_set1_pd(core::mem::transmute(1_u64)), + ); + // round the hi values down to the nearest power of 2 + let hi2p = _mm512_and_pd(hi, exponent_mask); + // we scale the values inside the root by the reciprocal of hi2p. 
since it's a power of 2, + // we can double it and xor it with the exponent mask + let scale = _mm512_xor_pd(_mm512_add_pd(hi2p, hi2p), exponent_mask); + + // create a mask that matches the normal hi values + let mask = + _mm512_cmp_pd_mask::<_CMP_GT_OQ>(hi, _mm512_set1_pd(f64::MIN_POSITIVE)); + // replace the subnormal values of hi2p with the minimum positive normal value + let hi2p = _mm512_mask_blend_pd(mask, _mm512_set1_pd(f64::MIN_POSITIVE), hi2p); + // replace the subnormal values of scale with the reciprocal of the minimum positive normal value + let scale = + _mm512_mask_blend_pd(mask, _mm512_set1_pd(1.0 / f64::MIN_POSITIVE), scale); + // create a mask that matches the subnormal hi values + let mask = + _mm512_cmp_pd_mask::<_CMP_LT_OQ>(hi, _mm512_set1_pd(f64::MIN_POSITIVE)); + // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1 + let hi_scaled = + _mm512_or_pd(_mm512_and_pd(hi, mantissa_mask), _mm512_set1_pd(1.0)); + // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values + let hi_scaled = _mm512_mask_blend_pd( + mask, + hi_scaled, + _mm512_sub_pd(hi_scaled, _mm512_set1_pd(1.0)), + ); + + let hi_scaled = _mm512_mul_pd(hi_scaled, hi_scaled); + let lo_scaled = _mm512_mul_pd(lo, scale); + + _mm512_mul_pd( + hi2p, + _mm512_sqrt_pd(_mm512_fmadd_pd(lo_scaled, lo_scaled, hi_scaled)), + ) + } +} impl SimdRegister for Avx512 { type Register = __m512i; diff --git a/cfavml/src/danger/impl_fallback.rs b/cfavml/src/danger/impl_fallback.rs index 73a98a4..b4ebbeb 100644 --- a/cfavml/src/danger/impl_fallback.rs +++ b/cfavml/src/danger/impl_fallback.rs @@ -1,5 +1,6 @@ +use super::core_simd_api::Hypot; use crate::danger::{DenseLane, SimdRegister}; -use crate::math::{AutoMath, Math}; +use crate::math::{f32_hypot, AutoMath, Math}; /// Fallback SIMD-like operations. /// @@ -130,3 +131,16 @@ where AutoMath::cast_bool(!AutoMath::cmp_eq(l1, l2)) } } + +impl Hypot for Fallback { + #[inline(always)] + unsafe fn hypot(x: f32, y: f32) -> f32 { + f32_hypot(x, y) + } +} +impl Hypot for Fallback { + #[inline(always)] + unsafe fn hypot(x: f64, y: f64) -> f64 { + f64::hypot(x, y) + } +} diff --git a/cfavml/src/danger/impl_neon.rs b/cfavml/src/danger/impl_neon.rs index 3fcef3a..41252ce 100644 --- a/cfavml/src/danger/impl_neon.rs +++ b/cfavml/src/danger/impl_neon.rs @@ -147,6 +147,13 @@ impl SimdRegister for Neon { } } +impl Hypot for Neon { + #[inline(always)] + unsafe fn hypot(l1: Self::Register, l2: Self::Register) -> Self::Register { + todo!() + } +} + impl SimdRegister for Neon { type Register = float64x2_t; diff --git a/cfavml/src/danger/impl_test.rs b/cfavml/src/danger/impl_test.rs index 7bf42d9..8eaef9c 100644 --- a/cfavml/src/danger/impl_test.rs +++ b/cfavml/src/danger/impl_test.rs @@ -1,11 +1,14 @@ use std::fmt::Debug; use std::iter::zip; +use num_traits::{Float, FloatConst}; +use rand::distributions::uniform::SampleUniform; use rand::distributions::{Distribution, Standard}; +use super::core_simd_api::Hypot; use crate::danger::SimdRegister; use crate::math::{AutoMath, Math}; -use crate::test_utils::get_sample_vectors; +use crate::test_utils::{get_sample_vectors, get_subnormal_sample_vectors}; /// Runs a set of generic test suites to ensure a given impl is working correctly. 
pub(crate) unsafe fn test_suite_impl() @@ -277,3 +280,87 @@ where ); } } + +pub(crate) unsafe fn hypot_test_impl() +where + T: Float + Debug + FloatConst + SampleUniform, + R: SimdRegister + Hypot, + AutoMath: Math, + Standard: Distribution, +{ + let (small_sample_l1, small_sample_l2) = + get_sample_vectors::(R::elements_per_lane()); + + let (large_sample_l1, large_sample_l2) = + get_sample_vectors::(R::elements_per_dense()); + + let (small_subnormal_sample_l1, small_subnormal_sample_l2) = + get_subnormal_sample_vectors::(R::elements_per_lane()); + + let (large_subnormal_sample_l1, large_subnormal_sample_l2) = + get_subnormal_sample_vectors::(R::elements_per_dense()); + + test_sample::( + small_sample_l1, + small_sample_l2, + large_sample_l1, + large_sample_l2, + ); + test_sample::( + small_subnormal_sample_l1, + small_subnormal_sample_l2, + large_subnormal_sample_l1, + large_subnormal_sample_l2, + ); +} + +unsafe fn test_sample( + sample1: Vec, + sample2: Vec, + large_sample_l1: Vec, + large_sample_l2: Vec, +) where + T: Float + Debug + FloatConst, + R: SimdRegister + Hypot, + AutoMath: Math, + Standard: Distribution, +{ + { + let (_std_result, std_sum) = get_std_results(&sample1, &sample2); + let l1 = R::load(sample1.as_ptr()); + let l2 = R::load(sample2.as_ptr()); + let res = R::hypot(l1, l2); + let res_sum = R::sum_to_value(res); + assert!( + AutoMath::is_close(std_sum, res_sum), + "Hypot and sum test failed on single task" + ); + } + { + let (_std_result, std_sum) = get_std_results(&large_sample_l1, &large_sample_l2); + let l1 = R::load_dense(large_sample_l1.as_ptr()); + let l2 = R::load_dense(large_sample_l2.as_ptr()); + let res = R::hypot_dense(l1, l2); + let reg = R::sum_to_register(res); + let res_sum = R::sum_to_value(reg); + + assert!( + AutoMath::is_close(std_sum, res_sum), + "Hypot and sum test failed on dense task" + ); + } +} + +fn get_std_results(sample1: &Vec, sample2: &Vec) -> (Vec, T) +where + T: Float + Debug, + AutoMath: Math, +{ + let std_result = sample1 + .iter() + .zip(sample2.iter()) + .map(|(a, b)| a.hypot(*b)) + .collect::>(); + let sum = std_result.iter().fold(AutoMath::zero(), |a, b| a + *b); + (std_result, sum) +} diff --git a/cfavml/src/danger/test_suite.rs b/cfavml/src/danger/test_suite.rs index 887ef08..9671056 100644 --- a/cfavml/src/danger/test_suite.rs +++ b/cfavml/src/danger/test_suite.rs @@ -281,6 +281,23 @@ macro_rules! test_suite { }; } +macro_rules! hypot_tests { + ($im:ident) => { + paste::paste! { + #[test] + fn []() { + unsafe { crate::danger::impl_test::test_suite_impl::() } + } + } + paste::paste! 
{ + #[test] + fn []() { + unsafe { crate::danger::impl_test::test_suite_impl::() } + } + } + }; +} + fn test_arithmetic_value_all(l1: Vec, value: T) where T: Copy + PartialEq + Debug + IntoMemLoader, @@ -385,6 +402,7 @@ test_cosine_extra!(u64, Fallback); test_nan_sanity!(f32, Fallback); test_nan_sanity!(f64, Fallback); +hypot_tests!(Fallback); #[cfg(all(target_feature = "avx2", test))] mod avx2_tests { @@ -412,6 +430,7 @@ mod avx2_tests { test_nan_sanity!(f32, Avx2); test_nan_sanity!(f64, Avx2); + hypot_tests!(Avx2); } #[cfg(all(target_feature = "avx512f", feature = "nightly", test))] @@ -451,6 +470,7 @@ mod avx2fma_tests { test_cosine_extra!(f32, Avx2Fma); test_cosine_extra!(f64, Avx2Fma); + hypot_tests!(Avx2Fma); } #[cfg(all(target_feature = "neon", test))] diff --git a/cfavml/src/math/default.rs b/cfavml/src/math/default.rs index 902b068..8bc96d0 100644 --- a/cfavml/src/math/default.rs +++ b/cfavml/src/math/default.rs @@ -114,6 +114,27 @@ impl Math for StdMath { } } +pub(crate) fn f32_hypot(a: f32, b: f32) -> f32 { + #[cfg(feature = "std")] + { + f32::hypot(a, b) + } + #[cfg(not(feature = "std"))] + { + todo!() + } +} +pub(crate) fn f64_hypot(a: f64, b: f64) -> f64 { + #[cfg(feature = "std")] + { + f64::hypot(a, b) + } + #[cfg(not(feature = "std"))] + { + todo!() + } +} + impl Math for StdMath { #[inline(always)] fn zero() -> f64 { diff --git a/cfavml/src/math/mod.rs b/cfavml/src/math/mod.rs index 0d56be8..07df427 100644 --- a/cfavml/src/math/mod.rs +++ b/cfavml/src/math/mod.rs @@ -3,9 +3,9 @@ mod default; mod fast_math; pub use default::StdMath; +pub(crate) use default::{f32_hypot, f64_hypot}; #[cfg(feature = "nightly")] pub use fast_math::FastMath; - #[cfg(not(feature = "nightly"))] pub type AutoMath = StdMath; #[cfg(feature = "nightly")] diff --git a/cfavml/src/test_utils.rs b/cfavml/src/test_utils.rs index 2688e3f..440c81f 100644 --- a/cfavml/src/test_utils.rs +++ b/cfavml/src/test_utils.rs @@ -1,10 +1,13 @@ +use core::ops::Neg; + +use num_traits::{zero, Float, FloatConst}; +use rand::distributions::uniform::SampleUniform; use rand::distributions::{Distribution, Standard}; use rand::{Rng, SeedableRng}; use rand_chacha::ChaCha8Rng; use crate::danger::cosine; use crate::math::{AutoMath, Math}; - const SEED: u64 = 34535345353; pub fn get_sample_vectors(size: usize) -> (Vec, Vec) @@ -36,6 +39,38 @@ where (x, y) } +pub fn get_subnormal_sample_vectors(size: usize) -> (Vec, Vec) +where + T: Copy + Float + FloatConst + Neg + SampleUniform, + AutoMath: Math, + Standard: Distribution, +{ + let mut rng = ChaCha8Rng::seed_from_u64(SEED); + + let mut x = Vec::new(); + let mut y = Vec::new(); + for _ in 0..size { + let mut v1 = rng.gen_range(core::ops::Range { + start: zero(), + end: T::min_positive_value(), + }); + let mut v2 = rng.gen(); + + if AutoMath::cmp_eq(v1, AutoMath::zero()) { + v1 = AutoMath::one(); + } + + if AutoMath::cmp_eq(v2, AutoMath::zero()) { + v2 = AutoMath::one(); + } + + x.push(v1); + y.push(v2); + } + + (x, y) +} + pub fn simple_dot(x: &[T], y: &[T]) -> T where T: Copy, From 11db4447e95ef53620d7419f54fc5b1e20fa9e39 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 16 Nov 2024 13:48:26 -0600 Subject: [PATCH 02/13] use sqrt intrinsic for fastmath, implemented Hypot for Neon --- cfavml/src/danger/impl_neon.rs | 88 ++++++++++++++++++++++++++++++++-- cfavml/src/danger/impl_test.rs | 24 +++++++++- cfavml/src/math/fast_math.rs | 4 +- 3 files changed, 108 insertions(+), 8 deletions(-) diff --git a/cfavml/src/danger/impl_neon.rs b/cfavml/src/danger/impl_neon.rs index 
41252ce..4b98703 100644 --- a/cfavml/src/danger/impl_neon.rs +++ b/cfavml/src/danger/impl_neon.rs @@ -2,7 +2,7 @@ use core::arch::aarch64::*; use core::iter::zip; use core::mem; -use crate::danger::{DenseLane, SimdRegister}; +use super::core_simd_api::{DenseLane, SimdRegister,Hypot}; use crate::math::{AutoMath, Math}; const BITS_8_CAPACITY: usize = 16; @@ -146,11 +146,48 @@ impl SimdRegister for Neon { vst1q_f32(mem, reg) } } - +const EXPONENT_MASK_F32: u32 = 2139095040; +const MANTISSA_MASK_F32: u32 = 8388607; impl Hypot for Neon { #[inline(always)] - unsafe fn hypot(l1: Self::Register, l2: Self::Register) -> Self::Register { - todo!() + unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { + // Convert inputs to absolute values + let (x, y) = (vabsq_f32(x), vabsq_f32(y)); + + // Find the max and min of the two inputs + let (hi, lo) = (vmaxq_f32(x, y), vminq_f32(x, y)); + let exponent_mask = vdupq_n_u32(EXPONENT_MASK_F32); + let mantissa_mask = vdupq_n_u32(MANTISSA_MASK_F32); + + // round the hi values down to the nearest power of 2 + let hi2p = + vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(hi), exponent_mask)); + // we scale the values inside the root by the reciprocal of hi2p. since it's a power of 2, + // we can double it and xor it with the exponent mask + let scale = vreinterpretq_f32_u32(veorq_u32( + vreinterpretq_u32_f32(vaddq_f32(hi2p, hi2p)), + exponent_mask, + )); + // create a mask that matches the normal hi values + let mask = vcgtq_f32(hi, vdupq_n_f32(f32::MIN_POSITIVE)); + // replace the subnormal values of hi2p with the minimum positive normal value + let hi2p = vbslq_f32(mask, hi2p, vdupq_n_f32(f32::MIN_POSITIVE)); + // replace the subnormal values of scale with the reciprocal of the minimum positive normal value + let scale = vbslq_f32(mask, scale, vdupq_n_f32(1.0 / f32::MIN_POSITIVE)); + // create a mask that matches the subnormal hi values + let mask = vcltq_f32(hi, vdupq_n_f32(f32::MIN_POSITIVE)); + // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1 + let hi_scaled = vreinterpretq_f32_u32(vorrq_u32( + vandq_u32(vreinterpretq_u32_f32(hi), mantissa_mask), + vreinterpretq_u32_f32(vdupq_n_f32(1.0)), + )); + // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values + let hi_scaled = + vbslq_f32(mask, vsubq_f32(hi_scaled, vdupq_n_f32(1.0)), hi_scaled); + // finally, do the thing + let hi_scaled = vmulq_f32(hi_scaled, hi_scaled); + let lo_scaled = vmulq_f32(lo, scale); + vmulq_f32(hi2p, vsqrtq_f32(vfmaq_f32(lo_scaled, lo_scaled, hi_scaled))) } } @@ -286,6 +323,49 @@ impl SimdRegister for Neon { } } +impl Hypot for Neon { + #[inline(always)] + unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { + // Convert inputs to absolute values + let (x, y) = (vabsq_f64(x), vabsq_f64(y)); + + // Find the max and min of the two inputs + let (hi, lo) = (vmaxq_f64(x, y), vminq_f64(x, y)); + let exponent_mask = vdupq_n_u64(f64::INFINITY.to_bits()); + let mantissa_mask = vdupq_n_u64((f64::MIN_POSITIVE - mem::transmute::(1)).to_bits()); + + // round the hi values down to the nearest power of 2 + let hi2p = + vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(hi), exponent_mask)); + // we scale the values inside the root by the reciprocal of hi2p. 
since it's a power of 2, + // we can double it and xor it with the exponent mask + let scale = vreinterpretq_f64_u64(veorq_u64( + vreinterpretq_u64_f64(vaddq_f64(hi2p, hi2p)), + exponent_mask, + )); + // create a mask that matches the normal hi values + let mask = vcgtq_f64(hi, vdupq_n_f64(f64::MIN_POSITIVE)); + // replace the subnormal values of hi2p with the minimum positive normal value + let hi2p = vbslq_f64(mask, hi2p, vdupq_n_f64(f64::MIN_POSITIVE)); + // replace the subnormal values of scale with the reciprocal of the minimum positive normal value + let scale = vbslq_f64(mask, scale, vdupq_n_f64(1.0 / f64::MIN_POSITIVE)); + // create a mask that matches the subnormal hi values + let mask = vcltq_f64(hi, vdupq_n_f64(f64::MIN_POSITIVE)); + // since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1 + let hi_scaled = vreinterpretq_f64_u64(vorrq_u64( + vandq_u64(vreinterpretq_u64_f64(hi), mantissa_mask), + vreinterpretq_u64_f64(vdupq_n_f64(1.0)), + )); + // for the subnormal elements of hi, we need to subtract 1 from the scaled hi values + let hi_scaled = + vbslq_f64(mask, vsubq_f64(hi_scaled, vdupq_n_f64(1.0)), hi_scaled); + // finally, do the thing + let hi_scaled = vmulq_f64(hi_scaled, hi_scaled); + let lo_scaled = vmulq_f64(lo, scale); + vmulq_f64(hi2p, vsqrtq_f64(vfmaq_f64(lo_scaled, lo_scaled, hi_scaled))) + } +} + impl SimdRegister for Neon { type Register = int8x16_t; diff --git a/cfavml/src/danger/impl_test.rs b/cfavml/src/danger/impl_test.rs index 8eaef9c..d1d3494 100644 --- a/cfavml/src/danger/impl_test.rs +++ b/cfavml/src/danger/impl_test.rs @@ -326,7 +326,7 @@ unsafe fn test_sample( Standard: Distribution, { { - let (_std_result, std_sum) = get_std_results(&sample1, &sample2); + let (std_result, std_sum) = get_std_results(&sample1, &sample2); let l1 = R::load(sample1.as_ptr()); let l2 = R::load(sample2.as_ptr()); let res = R::hypot(l1, l2); @@ -335,9 +335,12 @@ unsafe fn test_sample( AutoMath::is_close(std_sum, res_sum), "Hypot and sum test failed on single task" ); + let mut res_vec = vec![T::zero(); R::elements_per_lane()]; + R::write(res_vec.as_mut_ptr(), res); + test_diff_ulps(std_result, res_vec); } { - let (_std_result, std_sum) = get_std_results(&large_sample_l1, &large_sample_l2); + let (std_result, std_sum) = get_std_results(&large_sample_l1, &large_sample_l2); let l1 = R::load_dense(large_sample_l1.as_ptr()); let l2 = R::load_dense(large_sample_l2.as_ptr()); let res = R::hypot_dense(l1, l2); @@ -348,6 +351,9 @@ unsafe fn test_sample( AutoMath::is_close(std_sum, res_sum), "Hypot and sum test failed on dense task" ); + let mut res_vec = vec![T::zero(); R::elements_per_dense()]; + R::write_dense(res_vec.as_mut_ptr(), res); + test_diff_ulps(std_result, res_vec); } } @@ -364,3 +370,17 @@ where let sum = std_result.iter().fold(AutoMath::zero(), |a, b| a + *b); (std_result, sum) } + +fn test_diff_ulps(a: Vec, b: Vec) +where + T: Float + FloatConst, +{ + a.iter().zip(b.iter()).for_each(|(a, b)| { + let (a_mant, a_exp, a_sign) = a.integer_decode(); + let (b_mant, b_exp, b_sign) = b.integer_decode(); + assert!(a_sign == b_sign); + assert!(a_exp == b_exp); + let dist = a_mant as i64 - b_mant as i64; + assert!(dist.abs() < 2, "Greater than 1 ulp difference: {dist}"); + }); +} diff --git a/cfavml/src/math/fast_math.rs b/cfavml/src/math/fast_math.rs index d56924f..de25c04 100644 --- a/cfavml/src/math/fast_math.rs +++ b/cfavml/src/math/fast_math.rs @@ -28,7 +28,7 @@ impl Math for FastMath { #[inline(always)] fn sqrt(a: f32) -> f32 { - StdMath::sqrt(a) + 
core::intrinsics::sqrtf32(a) } #[inline(always)] @@ -139,7 +139,7 @@ impl Math for FastMath { #[inline(always)] fn sqrt(a: f64) -> f64 { - StdMath::sqrt(a) + core::intrinsics::sqrtf64(a) } #[inline(always)] From b1e9599fbf2c4ee78dbaf56e7ea5d0dee4ca6c7d Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 16 Nov 2024 14:09:16 -0600 Subject: [PATCH 03/13] added hypot test for Neon, Axx512 --- cfavml/src/danger/test_suite.rs | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/cfavml/src/danger/test_suite.rs b/cfavml/src/danger/test_suite.rs index 9671056..2fbe053 100644 --- a/cfavml/src/danger/test_suite.rs +++ b/cfavml/src/danger/test_suite.rs @@ -281,7 +281,7 @@ macro_rules! test_suite { }; } -macro_rules! hypot_tests { +macro_rules! test_hypot { ($im:ident) => { paste::paste! { #[test] @@ -402,7 +402,7 @@ test_cosine_extra!(u64, Fallback); test_nan_sanity!(f32, Fallback); test_nan_sanity!(f64, Fallback); -hypot_tests!(Fallback); +test_hypot!(Fallback); #[cfg(all(target_feature = "avx2", test))] mod avx2_tests { @@ -430,7 +430,7 @@ mod avx2_tests { test_nan_sanity!(f32, Avx2); test_nan_sanity!(f64, Avx2); - hypot_tests!(Avx2); + test_hypot!(Avx2); } #[cfg(all(target_feature = "avx512f", feature = "nightly", test))] @@ -459,6 +459,7 @@ mod avx512_tests { test_nan_sanity!(f32, Avx512); test_nan_sanity!(f64, Avx512); + test_hypot!(Avx512); } #[cfg(all(target_feature = "avx2", target_feature = "fma", test))] @@ -470,7 +471,7 @@ mod avx2fma_tests { test_cosine_extra!(f32, Avx2Fma); test_cosine_extra!(f64, Avx2Fma); - hypot_tests!(Avx2Fma); + test_hypot!(Avx2Fma); } #[cfg(all(target_feature = "neon", test))] @@ -501,4 +502,5 @@ mod neon_tests { test_nan_sanity!(f32, Neon); test_nan_sanity!(f64, Neon); + test_hypot!(Neon); } From a1a2e279aea66d52ee6d12c71031fe2f37077b01 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 16 Nov 2024 17:01:07 -0600 Subject: [PATCH 04/13] revert sqrt change, fallback to direct calculation if no_std for now --- cfavml/src/danger/impl_fallback.rs | 6 ++-- cfavml/src/danger/impl_neon.rs | 5 ++-- cfavml/src/math/default.rs | 47 ++++++++++++++++++------------ cfavml/src/math/fast_math.rs | 18 ++++++++++-- cfavml/src/math/mod.rs | 5 +++- 5 files changed, 53 insertions(+), 28 deletions(-) diff --git a/cfavml/src/danger/impl_fallback.rs b/cfavml/src/danger/impl_fallback.rs index b4ebbeb..91b4a66 100644 --- a/cfavml/src/danger/impl_fallback.rs +++ b/cfavml/src/danger/impl_fallback.rs @@ -1,6 +1,6 @@ use super::core_simd_api::Hypot; use crate::danger::{DenseLane, SimdRegister}; -use crate::math::{f32_hypot, AutoMath, Math}; +use crate::math::{AutoMath, Math, Numeric}; /// Fallback SIMD-like operations. 
/// @@ -135,12 +135,12 @@ where impl Hypot for Fallback { #[inline(always)] unsafe fn hypot(x: f32, y: f32) -> f32 { - f32_hypot(x, y) + AutoMath::hypot(x, y) } } impl Hypot for Fallback { #[inline(always)] unsafe fn hypot(x: f64, y: f64) -> f64 { - f64::hypot(x, y) + AutoMath::hypot(x, y) } } diff --git a/cfavml/src/danger/impl_neon.rs b/cfavml/src/danger/impl_neon.rs index 4b98703..ef60095 100644 --- a/cfavml/src/danger/impl_neon.rs +++ b/cfavml/src/danger/impl_neon.rs @@ -2,7 +2,7 @@ use core::arch::aarch64::*; use core::iter::zip; use core::mem; -use super::core_simd_api::{DenseLane, SimdRegister,Hypot}; +use super::core_simd_api::{DenseLane, Hypot, SimdRegister}; use crate::math::{AutoMath, Math}; const BITS_8_CAPACITY: usize = 16; @@ -332,7 +332,8 @@ impl Hypot for Neon { // Find the max and min of the two inputs let (hi, lo) = (vmaxq_f64(x, y), vminq_f64(x, y)); let exponent_mask = vdupq_n_u64(f64::INFINITY.to_bits()); - let mantissa_mask = vdupq_n_u64((f64::MIN_POSITIVE - mem::transmute::(1)).to_bits()); + let mantissa_mask = + vdupq_n_u64((f64::MIN_POSITIVE - mem::transmute::(1)).to_bits()); // round the hi values down to the nearest power of 2 let hi2p = diff --git a/cfavml/src/math/default.rs b/cfavml/src/math/default.rs index 8bc96d0..c538f8e 100644 --- a/cfavml/src/math/default.rs +++ b/cfavml/src/math/default.rs @@ -1,4 +1,4 @@ -use super::Math; +use super::{Math, Numeric}; /// Standard math operations that apply no specialised handling. pub struct StdMath; @@ -114,24 +114,18 @@ impl Math for StdMath { } } -pub(crate) fn f32_hypot(a: f32, b: f32) -> f32 { - #[cfg(feature = "std")] - { - f32::hypot(a, b) - } - #[cfg(not(feature = "std"))] - { - todo!() - } -} -pub(crate) fn f64_hypot(a: f64, b: f64) -> f64 { - #[cfg(feature = "std")] - { - f64::hypot(a, b) - } - #[cfg(not(feature = "std"))] - { - todo!() +impl Numeric for StdMath { + #[inline(always)] + fn hypot(a: f32, b: f32) -> f32 { + #[cfg(feature = "std")] + { + f32::hypot(a, b) + } + #[cfg(not(feature = "std"))] + { + // Fall back to direct calculation + f32_sqrt_fast(a * a + b * b) + } } } @@ -246,6 +240,21 @@ impl Math for StdMath { } } +impl Numeric for StdMath { + #[inline(always)] + fn hypot(a: f64, b: f64) -> f64 { + #[cfg(feature = "std")] + { + f64::hypot(a, b) + } + #[cfg(not(feature = "std"))] + { + // Fall back to direct calculation + f32_sqrt_fast((a * a + b * b) as f32) as f64 + } + } +} + macro_rules! define_int_ops { ($t:ident) => { impl Math<$t> for StdMath { diff --git a/cfavml/src/math/fast_math.rs b/cfavml/src/math/fast_math.rs index de25c04..58e65d8 100644 --- a/cfavml/src/math/fast_math.rs +++ b/cfavml/src/math/fast_math.rs @@ -1,6 +1,6 @@ use core::intrinsics; -use super::{Math, StdMath}; +use super::{Math, Numeric, StdMath}; /// Basic math operations backed by fast-math intrinsics. pub struct FastMath; @@ -28,7 +28,7 @@ impl Math for FastMath { #[inline(always)] fn sqrt(a: f32) -> f32 { - core::intrinsics::sqrtf32(a) + StdMath::sqrt(a) } #[inline(always)] @@ -116,6 +116,12 @@ impl Math for FastMath { } } +impl Numeric for FastMath { + fn hypot(a: f32, b: f32) -> f32 { + StdMath::hypot(a as f64, b as f64) as f32 + } +} + impl Math for FastMath { #[inline(always)] fn zero() -> f64 { @@ -139,7 +145,7 @@ impl Math for FastMath { #[inline(always)] fn sqrt(a: f64) -> f64 { - core::intrinsics::sqrtf64(a) + StdMath::sqrt(a) } #[inline(always)] @@ -227,6 +233,12 @@ impl Math for FastMath { } } +impl Numeric for FastMath { + fn hypot(a: f64, b: f64) -> f64 { + StdMath::hypot(a, b) + } +} + macro_rules! 
define_int_ops { ($t:ident) => { impl Math<$t> for FastMath { diff --git a/cfavml/src/math/mod.rs b/cfavml/src/math/mod.rs index 07df427..f15fbfd 100644 --- a/cfavml/src/math/mod.rs +++ b/cfavml/src/math/mod.rs @@ -3,7 +3,6 @@ mod default; mod fast_math; pub use default::StdMath; -pub(crate) use default::{f32_hypot, f64_hypot}; #[cfg(feature = "nightly")] pub use fast_math::FastMath; #[cfg(not(feature = "nightly"))] @@ -79,3 +78,7 @@ pub trait Math { } } } + +pub trait Numeric: Math { + fn hypot(a: T, b: T) -> T; +} From c1b88c7a3971086fbbecb43059cd9f2773e6851f Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Wed, 20 Nov 2024 12:02:38 -0600 Subject: [PATCH 05/13] added export_hypot --- cfavml/src/danger/export_hypot.rs | 152 ++++++++++++++++++ cfavml/src/danger/impl_fallback.rs | 14 +- cfavml/src/danger/mod.rs | 3 + cfavml/src/danger/op_hypot.rs | 38 +++++ .../export_docs/arithmetic_hypot_vertical.md | 37 +++++ 5 files changed, 236 insertions(+), 8 deletions(-) create mode 100644 cfavml/src/danger/export_hypot.rs create mode 100644 cfavml/src/danger/op_hypot.rs create mode 100644 cfavml/src/export_docs/arithmetic_hypot_vertical.md diff --git a/cfavml/src/danger/export_hypot.rs b/cfavml/src/danger/export_hypot.rs new file mode 100644 index 0000000..96589cf --- /dev/null +++ b/cfavml/src/danger/export_hypot.rs @@ -0,0 +1,152 @@ +//! Common arithmetic operations +//! +//! I.e. Add, Sub, Mul, Div... + +use crate::buffer::WriteOnlyBuffer; +use crate::danger::core_simd_api::Hypot; +use crate::danger::{generic_hypot_vertical, SimdRegister}; +use crate::math::{AutoMath, Math, Numeric}; +use crate::mem_loader::{IntoMemLoader, MemLoader}; + +macro_rules! define_arithmetic_impls { + ( + $hypot_name:ident, + $imp:ident $(,)? + $(target_features = $($feat:expr $(,)?)+)? + ) => { + #[inline] + $(#[target_feature($(enable = $feat, )*)])* + #[doc = include_str!("../export_docs/arithmetic_hypot_vertical.md")] + $( + + #[doc = concat!("- ", $("**`+", $feat, "`** ", )*)] + #[doc = "CPU features are available at runtime. Running on hardware _without_ this feature available will cause immediate UB."] + )* + pub unsafe fn $hypot_name( + a: B1, + b: B2, + result: &mut [B3], + ) + where + T: Copy, + B1: IntoMemLoader, + B1::Loader: MemLoader, + B2: IntoMemLoader, + B2::Loader: MemLoader, + crate::danger::$imp: SimdRegister+ Hypot, + AutoMath: Math + Numeric, + for<'a> &'a mut [B3]: WriteOnlyBuffer, + { + generic_hypot_vertical::( + a, + b, + result, + ) + } + }; +} + +define_arithmetic_impls!(generic_fallback_hypot_vertical, Fallback,); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +define_arithmetic_impls!(generic_avx2_hypot_vertical, Avx2, target_features = "avx2"); +#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "nightly"))] +define_arithmetic_impls!( + generic_avx512_hypot_vertical, + Avx512, + target_features = "avx512f", + "avx512bw" +); +#[cfg(target_arch = "aarch64")] +define_arithmetic_impls!(generic_neon_hypot_vertical, Neon, target_features = "neon"); + +#[cfg(test)] +mod tests { + use num_traits::{Float, FloatConst}; + + use super::*; + + macro_rules! define_inner_test { + ($variant:ident, op = $op:ident, ty = $t:ident) => { + paste::paste! 
{ + #[test] + fn [< $variant _ $op _value_ $t >]() { + let (l1, _) = crate::test_utils::get_sample_vectors::<$t>(533); + + let mut result = vec![$t::default(); 533]; + unsafe { [< $variant _ $op _vertical >](&l1, 2 as $t, &mut result) }; + + let expected = l1.iter() + .copied() + .map(|v| AutoMath::$op(v, 2 as $t)) + .collect::>(); + for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) { + let ulps_diff = get_diff_ulps(*expected, *actual); + assert!( + ulps_diff.abs() <= 1, + "result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}", + initial, expected, actual, ulps_diff + ); + } + + } + + #[test] + fn [< $variant _ $op _vector_ $t >]() { + let (l1, l2) = crate::test_utils::get_subnormal_sample_vectors::<$t>(533); + + let mut result = vec![$t::default(); 533]; + unsafe { [< $variant _ $op _vertical >](&l1, &l2, &mut result) }; + + let expected = l1.iter() + .copied() + .zip(l2.iter().copied()) + .map(|(a, b)| AutoMath::$op(a, b)) + .collect::>(); + for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) { + let ulps_diff = get_diff_ulps(*expected, *actual); + assert!( + ulps_diff.abs() <= 1, + "result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}", + initial, expected, actual, ulps_diff + ); + } + } + } + }; + } + + fn get_diff_ulps(a: T, b: T) -> i64 + where + T: Float + FloatConst, + { + let (a_mant, a_exp, a_sign) = a.integer_decode(); + let (b_mant, b_exp, b_sign) = b.integer_decode(); + assert!(a_sign == b_sign); + assert!(a_exp == b_exp); + a_mant as i64 - b_mant as i64 + } + + macro_rules! define_numeric_test { + ($variant:ident, types = $($t:ident $(,)?)+) => { + $( + define_inner_test!($variant, op = hypot, ty = $t); + + )* + }; + } + + define_numeric_test!(generic_fallback, types = f32, f64,); + #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx2" + ))] + define_numeric_test!(generic_avx2, types = f32, f64,); + #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + feature = "nightly", + target_feature = "avx512f" + ))] + define_numeric_test!(generic_avx512, types = f32, f64,); + #[cfg(target_arch = "aarch64")] + define_numeric_test!(generic_neon, types = f32, f64,); +} diff --git a/cfavml/src/danger/impl_fallback.rs b/cfavml/src/danger/impl_fallback.rs index 91b4a66..59f82c0 100644 --- a/cfavml/src/danger/impl_fallback.rs +++ b/cfavml/src/danger/impl_fallback.rs @@ -132,15 +132,13 @@ where } } -impl Hypot for Fallback { - #[inline(always)] - unsafe fn hypot(x: f32, y: f32) -> f32 { - AutoMath::hypot(x, y) - } -} -impl Hypot for Fallback { +impl Hypot for Fallback +where + T: Copy, + AutoMath: Math + Numeric, +{ #[inline(always)] - unsafe fn hypot(x: f64, y: f64) -> f64 { + unsafe fn hypot(x: T, y: T) -> T { AutoMath::hypot(x, y) } } diff --git a/cfavml/src/danger/mod.rs b/cfavml/src/danger/mod.rs index c16075a..cd0b26c 100644 --- a/cfavml/src/danger/mod.rs +++ b/cfavml/src/danger/mod.rs @@ -24,9 +24,11 @@ pub mod export_agg_ops; pub mod export_arithmetic_ops; pub mod export_cmp_ops; pub mod export_distance_ops; +pub mod export_hypot; #[cfg(test)] mod impl_test; mod op_cmp_vertical; +mod op_hypot; #[cfg(test)] mod test_suite; @@ -61,6 +63,7 @@ pub(crate) use self::op_cosine::cosine; pub use self::op_cosine::generic_cosine; pub use self::op_dot::generic_dot; pub use self::op_euclidean::generic_squared_euclidean; +pub use self::op_hypot::generic_hypot_vertical; pub use self::op_norm::generic_squared_norm; 
pub use self::op_sum::generic_sum; diff --git a/cfavml/src/danger/op_hypot.rs b/cfavml/src/danger/op_hypot.rs new file mode 100644 index 0000000..83335b9 --- /dev/null +++ b/cfavml/src/danger/op_hypot.rs @@ -0,0 +1,38 @@ +use super::core_routine_boilerplate::apply_vertical_kernel; +use super::core_simd_api::Hypot; +use crate::buffer::WriteOnlyBuffer; +use crate::danger::core_simd_api::SimdRegister; +use crate::math::{Math, Numeric}; +use crate::mem_loader::{IntoMemLoader, MemLoader}; + +#[inline(always)] +/// A generic vector hypot over one vector and single value. +/// +/// # Safety +/// +/// The sizes of `a`, `b` and `result` must be equal to `dims`, the safety requirements of +/// `M` definition the basic math operations and the requirements of `R` SIMD register +/// must also be followed. +pub unsafe fn generic_hypot_vertical( + a: B1, + b: B2, + result: &mut [B3], +) where + T: Copy, + R: SimdRegister + Hypot, + M: Math + Numeric, + B1: IntoMemLoader, + B1::Loader: MemLoader, + B2: IntoMemLoader, + B2::Loader: MemLoader, + for<'a> &'a mut [B3]: WriteOnlyBuffer, +{ + apply_vertical_kernel::( + a, + b, + result, + R::hypot_dense, + R::hypot, + M::hypot, + ) +} diff --git a/cfavml/src/export_docs/arithmetic_hypot_vertical.md b/cfavml/src/export_docs/arithmetic_hypot_vertical.md new file mode 100644 index 0000000..ef7f662 --- /dev/null +++ b/cfavml/src/export_docs/arithmetic_hypot_vertical.md @@ -0,0 +1,37 @@ +Performs an elementwise hypotenuse of input buffers `a` and `b` that can +be projected to the desired output size of `result`. Implementation is an appoximation that +_should_ match std::hypot in most cases. However, with some inputs it's been confirmed to be off by 1 ulp. + +### Projecting Vectors + +CFAVML allows for working over a wide variety of buffers for applications, projection is effectively +broadcasting of two input buffers implementing `IntoMemLoader`. + +By default, you can provide _two slices_, _one slice and a broadcast value_, or _two broadcast values_, +which exhibit the standard behaviour as you might expect. + +When providing two slices as inputs they cannot be projected to a buffer +that is larger their input sizes by default. This means providing two slices +of `128` elements in length must take a result buffer of `128` elements in length. + +### Implementation Pseudocode + +_This is the logic of the routine being called._ + +```ignore +result = [0; dims] + +for i in range(dims): + result[i] = a[i].hypot(b[i]) + +return result +``` + +# Panics + +If vectors `a` and `b` cannot be projected to the target size of `result`. +Note that the projection rules are tied to the `MemLoader` implementation. 
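+
+### Example Usage
+
+A minimal sketch of calling one of the generated routines (the fallback variant is shown
+here since it requires no particular CPU features; the accelerated variants share the
+same signature):
+
+```ignore
+let a = vec![3.0f32, 5.0, 8.0, 7.0];
+let b = vec![4.0f32, 12.0, 15.0, 24.0];
+let mut result = vec![0.0f32; 4];
+
+unsafe { generic_fallback_hypot_vertical(&a, &b, &mut result) };
+
+// result now holds approximately [5.0, 13.0, 17.0, 25.0],
+// matching `f32::hypot` to within 1 ulp per element.
+```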
+ +# Safety + +This routine assumes: From 5ba40f790de645651396cb839b7eb3d6c90f0782 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Wed, 20 Nov 2024 17:25:16 -0600 Subject: [PATCH 06/13] safe api implemented --- cfavml/src/danger/export_hypot.rs | 4 +- cfavml/src/lib.rs | 1 + cfavml/src/safe_trait_numeric_ops.rs | 85 ++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 cfavml/src/safe_trait_numeric_ops.rs diff --git a/cfavml/src/danger/export_hypot.rs b/cfavml/src/danger/export_hypot.rs index 96589cf..6e7b3a8 100644 --- a/cfavml/src/danger/export_hypot.rs +++ b/cfavml/src/danger/export_hypot.rs @@ -79,6 +79,7 @@ mod tests { .copied() .map(|v| AutoMath::$op(v, 2 as $t)) .collect::>(); + for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) { let ulps_diff = get_diff_ulps(*expected, *actual); assert!( @@ -86,13 +87,14 @@ mod tests { "result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}", initial, expected, actual, ulps_diff ); + } } #[test] fn [< $variant _ $op _vector_ $t >]() { - let (l1, l2) = crate::test_utils::get_subnormal_sample_vectors::<$t>(533); + let (l1, l2) = crate::test_utils::get_sample_vectors::<$t>(533); let mut result = vec![$t::default(); 533]; unsafe { [< $variant _ $op _vertical >](&l1, &l2, &mut result) }; diff --git a/cfavml/src/lib.rs b/cfavml/src/lib.rs index 813e9ec..19affdd 100644 --- a/cfavml/src/lib.rs +++ b/cfavml/src/lib.rs @@ -23,6 +23,7 @@ pub mod safe_trait_agg_ops; pub mod safe_trait_arithmetic_ops; pub mod safe_trait_cmp_ops; pub mod safe_trait_distance_ops; +pub mod safe_trait_numeric_ops; #[cfg(test)] mod test_utils; diff --git a/cfavml/src/safe_trait_numeric_ops.rs b/cfavml/src/safe_trait_numeric_ops.rs new file mode 100644 index 0000000..c442fc0 --- /dev/null +++ b/cfavml/src/safe_trait_numeric_ops.rs @@ -0,0 +1,85 @@ +//! Safe but somewhat low-level variants of the arithmetic operations in CFAVML. +//! +//! In general, I would recommend using the higher level generic functions api which provides +//! some syntax sugar over these traits. + +use crate::buffer::WriteOnlyBuffer; +use crate::danger::export_hypot; +use crate::mem_loader::{IntoMemLoader, MemLoader}; + +/// Various arithmetic operations over vectors. +pub trait NumericOps: Sized + Copy { + /// Performs an elementwise hypotenuse of input buffers `a` and `b` that can + /// be projected to the desired output size of `result`. Implementation is an appoximation that + /// _should_ match std::hypot in most cases. However, with some inputs it's been confirmed to be off by 1 ulp. + /// + /// See [cfavml::hypot_vertical](crate::hypot_vertical) for examples. + /// + /// ### Projecting Vectors + /// + /// CFAVML allows for working over a wide variety of buffers for applications, projection is effectively + /// broadcasting of two input buffers implementing `IntoMemLoader`. + /// + /// By default, you can provide _two slices_, _one slice and a broadcast value_, or _two broadcast values_, + /// which exhibit the standard behaviour as you might expect. + /// + /// When providing two slices as inputs they cannot be projected to a buffer + /// that is larger their input sizes by default. This means providing two slices + /// of `128` elements in length must take a result buffer of `128` elements in length. 
+ /// + /// You can wrap your inputs in a [Projected](crate::mem_loader::Projected) wrapper which + /// enables projecting of the input buffer to new sizes providing the new size is a + /// multiple of the original size. When this buffer is projected, it is effectively + /// repeated `N` times, where `N` is how many times the old size fits into the new size. + /// + /// ### Implementation Pseudocode + /// + /// ```ignore + /// result = [0; dims] + /// + /// for i in range(dims): + /// result[i] = a[i].hypot(b[i]) + /// + /// return result + /// ``` + /// + /// # Panics + /// + /// If vectors `a` and `b` cannot be projected to the target size of `result`. + /// Note that the projection rules are tied to the `MemLoader` implementation. + fn hypot_vertical(lhs: B1, rhs: B2, result: &mut [B3]) + where + B1: IntoMemLoader, + B1::Loader: MemLoader, + B2: IntoMemLoader, + B2::Loader: MemLoader, + for<'a> &'a mut [B3]: WriteOnlyBuffer; +} + +macro_rules! numeric_ops { + ($t:ty) => { + impl NumericOps for $t { + fn hypot_vertical(lhs: B1, rhs: B2, result: &mut [B3]) + where + B1: IntoMemLoader, + B1::Loader: MemLoader, + B2: IntoMemLoader, + B2::Loader: MemLoader, + for<'a> &'a mut [B3]: WriteOnlyBuffer, + { + unsafe { + crate::dispatch!( + avx512 = export_hypot::generic_avx512_hypot_vertical, + avx2 = export_hypot::generic_avx2_hypot_vertical, + neon = export_hypot::generic_neon_hypot_vertical, + fallback = export_hypot::generic_fallback_hypot_vertical, + args = (lhs, rhs, result) + ); + } + } + } + }; +} + +numeric_ops!(f32); +numeric_ops!(f64); From 7479b041ae9f8099efb26efc00b088289d519855 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 23 Nov 2024 10:51:51 -0600 Subject: [PATCH 07/13] fixed issue with hypot test, fixed clippy warnings --- cfavml/src/danger/impl_avx2.rs | 4 ++-- cfavml/src/danger/impl_avx2fma.rs | 13 +++++++------ cfavml/src/danger/impl_avx512.rs | 4 ++-- cfavml/src/danger/impl_test.rs | 8 ++++---- cfavml/src/danger/test_suite.rs | 4 ++-- 5 files changed, 17 insertions(+), 16 deletions(-) diff --git a/cfavml/src/danger/impl_avx2.rs b/cfavml/src/danger/impl_avx2.rs index 56ca639..e9ef479 100644 --- a/cfavml/src/danger/impl_avx2.rs +++ b/cfavml/src/danger/impl_avx2.rs @@ -188,7 +188,7 @@ impl Hypot for Avx2 { //The mantissa mask is the inverse of the exponent mask let mantissa_mask = _mm256_sub_ps( _mm256_set1_ps(f32::MIN_POSITIVE), - _mm256_set1_ps(core::mem::transmute(1_u32)), + _mm256_set1_ps(f32::from_bits(1_u32)), ); // round the hi values down to the nearest power of 2 let hi2p = _mm256_and_ps(hi, exponent_mask); @@ -2796,7 +2796,7 @@ impl Hypot for Avx2 { //The mantissa mask is the inverse of the exponent mask let mantissa_mask = _mm256_sub_pd( _mm256_set1_pd(f64::MIN_POSITIVE), - _mm256_set1_pd(core::mem::transmute(1_u64)), + _mm256_set1_pd(f64::from_bits(1_u64)), ); // round the hi values down to the nearest power of 2 let hi2p = _mm256_and_pd(hi, exponent_mask); diff --git a/cfavml/src/danger/impl_avx2fma.rs b/cfavml/src/danger/impl_avx2fma.rs index 83da6a6..7f68335 100644 --- a/cfavml/src/danger/impl_avx2fma.rs +++ b/cfavml/src/danger/impl_avx2fma.rs @@ -138,7 +138,7 @@ impl Hypot for Avx2Fma { //The mantissa mask is the inverse of the exponent mask let mantissa_mask = _mm256_sub_ps( _mm256_set1_ps(f32::MIN_POSITIVE), - _mm256_set1_ps(core::mem::transmute(1_u32)), + _mm256_set1_ps(f32::from_bits(1_u32)), ); // round the hi values down to the nearest power of 2 let hi2p = _mm256_and_ps(hi, exponent_mask); @@ -149,9 +149,10 @@ impl Hypot for Avx2Fma { // create 
a mask that matches the normal hi values let mask = _mm256_cmp_ps::<_CMP_GT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE)); // replace the subnormal values of hi2p with the minimum positive normal value - let hi2p = _mm256_blendv_ps(_mm256_set1_ps(f32::MIN_POSITIVE), hi2p, mask); + let outer_scale = + _mm256_blendv_ps(_mm256_set1_ps(f32::MIN_POSITIVE), hi2p, mask); // replace the subnormal values of scale with the reciprocal of the minimum positive normal value - let scale = + let inner_scale = _mm256_blendv_ps(_mm256_set1_ps(1.0 / f32::MIN_POSITIVE), scale, mask); // create a mask that matches the subnormal hi values let mask = _mm256_cmp_ps::<_CMP_LT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE)); @@ -166,9 +167,9 @@ impl Hypot for Avx2Fma { ); let hi_scaled = _mm256_mul_ps(hi_scaled, hi_scaled); - let lo_scaled = _mm256_mul_ps(lo, scale); + let lo_scaled = _mm256_mul_ps(lo, inner_scale); _mm256_mul_ps( - hi2p, + outer_scale, _mm256_sqrt_ps(_mm256_fmadd_ps(lo_scaled, lo_scaled, hi_scaled)), ) } @@ -299,7 +300,7 @@ impl Hypot for Avx2Fma { //The mantissa mask is the inverse of the exponent mask let mantissa_mask = _mm256_sub_pd( _mm256_set1_pd(f64::MIN_POSITIVE), - _mm256_set1_pd(core::mem::transmute(1_u64)), + _mm256_set1_pd(f64::from_bits(1_u64)), ); // round the hi values down to the nearest power of 2 let hi2p = _mm256_and_pd(hi, exponent_mask); diff --git a/cfavml/src/danger/impl_avx512.rs b/cfavml/src/danger/impl_avx512.rs index b6d34bf..f5567cd 100644 --- a/cfavml/src/danger/impl_avx512.rs +++ b/cfavml/src/danger/impl_avx512.rs @@ -152,7 +152,7 @@ impl Hypot for Avx512 { //The mantissa mask is the inverse of the exponent mask let mantissa_mask = _mm512_sub_ps( _mm512_set1_ps(f32::MIN_POSITIVE), - _mm512_set1_ps(core::mem::transmute(1_u32)), + _mm512_set1_ps(f32::from_bits(1_u32)), ); // round the hi values down to the nearest power of 2 let hi2p = _mm512_and_ps(hi, exponent_mask); @@ -322,7 +322,7 @@ impl Hypot for Avx512 { //The mantissa mask is the inverse of the exponent mask let mantissa_mask = _mm512_sub_pd( _mm512_set1_pd(f64::MIN_POSITIVE), - _mm512_set1_pd(core::mem::transmute(1_u64)), + _mm512_set1_pd(f64::from_bits(1_u64)), ); // round the hi values down to the nearest power of 2 let hi2p = _mm512_and_pd(hi, exponent_mask); diff --git a/cfavml/src/danger/impl_test.rs b/cfavml/src/danger/impl_test.rs index d1d3494..92a6591 100644 --- a/cfavml/src/danger/impl_test.rs +++ b/cfavml/src/danger/impl_test.rs @@ -337,7 +337,7 @@ unsafe fn test_sample( ); let mut res_vec = vec![T::zero(); R::elements_per_lane()]; R::write(res_vec.as_mut_ptr(), res); - test_diff_ulps(std_result, res_vec); + test_diff_ulps(&std_result, &res_vec); } { let (std_result, std_sum) = get_std_results(&large_sample_l1, &large_sample_l2); @@ -353,11 +353,11 @@ unsafe fn test_sample( ); let mut res_vec = vec![T::zero(); R::elements_per_dense()]; R::write_dense(res_vec.as_mut_ptr(), res); - test_diff_ulps(std_result, res_vec); + test_diff_ulps(&std_result, &res_vec); } } -fn get_std_results(sample1: &Vec, sample2: &Vec) -> (Vec, T) +fn get_std_results(sample1: &[T], sample2: &[T]) -> (Vec, T) where T: Float + Debug, AutoMath: Math, @@ -371,7 +371,7 @@ where (std_result, sum) } -fn test_diff_ulps(a: Vec, b: Vec) +fn test_diff_ulps(a: &[T], b: &[T]) where T: Float + FloatConst, { diff --git a/cfavml/src/danger/test_suite.rs b/cfavml/src/danger/test_suite.rs index 2fbe053..82b156a 100644 --- a/cfavml/src/danger/test_suite.rs +++ b/cfavml/src/danger/test_suite.rs @@ -286,13 +286,13 @@ macro_rules! 
test_hypot { paste::paste! { #[test] fn []() { - unsafe { crate::danger::impl_test::test_suite_impl::() } + unsafe { crate::danger::impl_test::hypot_test_impl::() } } } paste::paste! { #[test] fn []() { - unsafe { crate::danger::impl_test::test_suite_impl::() } + unsafe { crate::danger::impl_test::hypot_test_impl::() } } } }; From 408556154dbdfb1f7ed4c46172a6f192be399593 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sun, 24 Nov 2024 10:57:57 -0600 Subject: [PATCH 08/13] added export hypot test for avx2fma, subnormal floats --- cfavml/src/danger/export_hypot.rs | 68 ++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 5 deletions(-) diff --git a/cfavml/src/danger/export_hypot.rs b/cfavml/src/danger/export_hypot.rs index 6e7b3a8..f0ffb12 100644 --- a/cfavml/src/danger/export_hypot.rs +++ b/cfavml/src/danger/export_hypot.rs @@ -8,7 +8,7 @@ use crate::danger::{generic_hypot_vertical, SimdRegister}; use crate::math::{AutoMath, Math, Numeric}; use crate::mem_loader::{IntoMemLoader, MemLoader}; -macro_rules! define_arithmetic_impls { +macro_rules! define_hypot_impls { ( $hypot_name:ident, $imp:ident $(,)? @@ -46,18 +46,25 @@ macro_rules! define_arithmetic_impls { }; } -define_arithmetic_impls!(generic_fallback_hypot_vertical, Fallback,); +define_hypot_impls!(generic_fallback_hypot_vertical, Fallback,); #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -define_arithmetic_impls!(generic_avx2_hypot_vertical, Avx2, target_features = "avx2"); +define_hypot_impls!(generic_avx2_hypot_vertical, Avx2, target_features = "avx2"); +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +define_hypot_impls!( + generic_avx2fma_hypot_vertical, + Avx2Fma, + target_features = "avx2", + "fma" +); #[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "nightly"))] -define_arithmetic_impls!( +define_hypot_impls!( generic_avx512_hypot_vertical, Avx512, target_features = "avx512f", "avx512bw" ); #[cfg(target_arch = "aarch64")] -define_arithmetic_impls!(generic_neon_hypot_vertical, Neon, target_features = "neon"); +define_hypot_impls!(generic_neon_hypot_vertical, Neon, target_features = "neon"); #[cfg(test)] mod tests { @@ -99,6 +106,51 @@ mod tests { let mut result = vec![$t::default(); 533]; unsafe { [< $variant _ $op _vertical >](&l1, &l2, &mut result) }; + let expected = l1.iter() + .copied() + .zip(l2.iter().copied()) + .map(|(a, b)| AutoMath::$op(a, b)) + .collect::>(); + for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) { + let ulps_diff = get_diff_ulps(*expected, *actual); + assert!( + ulps_diff.abs() <= 1, + "result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}", + initial, expected, actual, ulps_diff + ); + } + } + #[test] + fn [< $variant _ $op _subnormal_value_ $t >]() { + let (l1, _) = crate::test_utils::get_subnormal_sample_vectors::<$t>(533); + + let mut result = vec![$t::default(); 533]; + unsafe { [< $variant _ $op _vertical >](&l1, 2 as $t, &mut result) }; + + let expected = l1.iter() + .copied() + .map(|v| AutoMath::$op(v, 2 as $t)) + .collect::>(); + + for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) { + let ulps_diff = get_diff_ulps(*expected, *actual); + assert!( + ulps_diff.abs() <= 1, + "result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}", + initial, expected, actual, ulps_diff + ); + + } + + } + + #[test] + fn [< $variant _ $op _subnormal_vector_ $t >]() { + let (l1, l2) = 
crate::test_utils::get_subnormal_sample_vectors::<$t>(533); + + let mut result = vec![$t::default(); 533]; + unsafe { [< $variant _ $op _vertical >](&l1, &l2, &mut result) }; + let expected = l1.iter() .copied() .zip(l2.iter().copied()) @@ -143,6 +195,12 @@ mod tests { target_feature = "avx2" ))] define_numeric_test!(generic_avx2, types = f32, f64,); + #[cfg(all( + any(target_arch = "x86", target_arch = "x86_64"), + target_feature = "avx2", + target_feature = "fma" + ))] + define_numeric_test!(generic_avx2fma, types = f32, f64,); #[cfg(all( any(target_arch = "x86", target_arch = "x86_64"), feature = "nightly", From c8fca55ae0bdff1996895871247401ea51b9912c Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 7 Dec 2024 12:39:11 -0600 Subject: [PATCH 09/13] apply changes from code review --- cfavml/src/danger/core_simd_api.rs | 2 ++ cfavml/src/danger/export_hypot.rs | 2 +- cfavml/src/danger/impl_avx2.rs | 29 ++++++++----------- cfavml/src/danger/impl_avx2fma.rs | 24 +++++---------- cfavml/src/danger/op_hypot.rs | 4 ++- .../export_docs/arithmetic_hypot_vertical.md | 10 +++++-- cfavml/src/math/default.rs | 2 +- 7 files changed, 34 insertions(+), 39 deletions(-) diff --git a/cfavml/src/danger/core_simd_api.rs b/cfavml/src/danger/core_simd_api.rs index e2fa9c0..7afc243 100644 --- a/cfavml/src/danger/core_simd_api.rs +++ b/cfavml/src/danger/core_simd_api.rs @@ -414,6 +414,8 @@ where /// and a point (`x`, `y`) on the Euclidean plane. unsafe fn hypot(a: Self::Register, b: Self::Register) -> Self::Register; #[inline(always)] + + /// Perform a element wise hypot on two dense lanes. unsafe fn hypot_dense( a: DenseLane, b: DenseLane, diff --git a/cfavml/src/danger/export_hypot.rs b/cfavml/src/danger/export_hypot.rs index f0ffb12..8226492 100644 --- a/cfavml/src/danger/export_hypot.rs +++ b/cfavml/src/danger/export_hypot.rs @@ -207,6 +207,6 @@ mod tests { target_feature = "avx512f" ))] define_numeric_test!(generic_avx512, types = f32, f64,); - #[cfg(target_arch = "aarch64")] + #[cfg(all(target_arch = "aarch64", target_feature = "neon"))] define_numeric_test!(generic_neon, types = f32, f64,); } diff --git a/cfavml/src/danger/impl_avx2.rs b/cfavml/src/danger/impl_avx2.rs index e9ef479..363e392 100644 --- a/cfavml/src/danger/impl_avx2.rs +++ b/cfavml/src/danger/impl_avx2.rs @@ -174,15 +174,14 @@ impl Hypot for Avx2 { #[inline(always)] unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { // convert the inputs to absolute values - let (x, y) = ( - _mm256_andnot_ps(_mm256_set1_ps(-0.0), x), - _mm256_andnot_ps(_mm256_set1_ps(-0.0), y), - ); + let x = _mm256_andnot_ps(_mm256_set1_ps(-0.0), x); + + let y = _mm256_andnot_ps(_mm256_set1_ps(-0.0), y); + // find the max and min of the two inputs - let (hi, lo) = ( - >::max(x, y), - >::min(x, y), - ); + let hi = >::max(x, y); + let lo = >::min(x, y); + //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa let exponent_mask = _mm256_set1_ps(f32::INFINITY); //The mantissa mask is the inverse of the exponent mask @@ -2782,15 +2781,11 @@ impl Hypot for Avx2 { #[inline(always)] unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { // convert the inputs to absolute values - let (x, y) = ( - _mm256_andnot_pd(_mm256_set1_pd(-0.0), x), - _mm256_andnot_pd(_mm256_set1_pd(-0.0), y), - ); - // find the max and min of the two inputs - let (hi, lo) = ( - >::max(x, y), - >::min(x, y), - ); + + let x = _mm256_andnot_pd(_mm256_set1_pd(-0.0), x); + let y = _mm256_andnot_pd(_mm256_set1_pd(-0.0), y); + let hi = 
>::max(x, y); + let lo = >::min(x, y); //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa let exponent_mask = _mm256_set1_pd(f64::INFINITY); //The mantissa mask is the inverse of the exponent mask diff --git a/cfavml/src/danger/impl_avx2fma.rs b/cfavml/src/danger/impl_avx2fma.rs index 7f68335..573056f 100644 --- a/cfavml/src/danger/impl_avx2fma.rs +++ b/cfavml/src/danger/impl_avx2fma.rs @@ -124,15 +124,11 @@ impl Hypot for Avx2Fma { #[inline(always)] unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { // convert the inputs to absolute values - let (x, y) = ( - _mm256_andnot_ps(_mm256_set1_ps(-0.0), x), - _mm256_andnot_ps(_mm256_set1_ps(-0.0), y), - ); + let x = _mm256_andnot_ps(_mm256_set1_ps(-0.0), x); + let y = _mm256_andnot_ps(_mm256_set1_ps(-0.0), y); // find the max and min of the two inputs - let (hi, lo) = ( - >::max(x, y), - >::min(x, y), - ); + let hi = >::max(x, y); + let lo = >::min(x, y); //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa let exponent_mask = _mm256_set1_ps(f32::INFINITY); //The mantissa mask is the inverse of the exponent mask @@ -286,15 +282,11 @@ impl Hypot for Avx2Fma { #[inline(always)] unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { // convert the inputs to absolute values - let (x, y) = ( - _mm256_andnot_pd(_mm256_set1_pd(-0.0), x), - _mm256_andnot_pd(_mm256_set1_pd(-0.0), y), - ); + let x = _mm256_andnot_pd(_mm256_set1_pd(-0.0), x); + let y = _mm256_andnot_pd(_mm256_set1_pd(-0.0), y); // find the max and min of the two inputs - let (hi, lo) = ( - >::max(x, y), - >::min(x, y), - ); + let hi = >::max(x, y); + let lo = >::min(x, y); //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa let exponent_mask = _mm256_set1_pd(f64::INFINITY); //The mantissa mask is the inverse of the exponent mask diff --git a/cfavml/src/danger/op_hypot.rs b/cfavml/src/danger/op_hypot.rs index 83335b9..b2b440f 100644 --- a/cfavml/src/danger/op_hypot.rs +++ b/cfavml/src/danger/op_hypot.rs @@ -6,13 +6,15 @@ use crate::math::{Math, Numeric}; use crate::mem_loader::{IntoMemLoader, MemLoader}; #[inline(always)] -/// A generic vector hypot over one vector and single value. +/// A generic vector implementation of hypot over one vector and single value. /// /// # Safety /// +/// The safety of hypot relies on the safety of the implementation of SimdRegister /// The sizes of `a`, `b` and `result` must be equal to `dims`, the safety requirements of /// `M` definition the basic math operations and the requirements of `R` SIMD register /// must also be followed. + pub unsafe fn generic_hypot_vertical( a: B1, b: B2, diff --git a/cfavml/src/export_docs/arithmetic_hypot_vertical.md b/cfavml/src/export_docs/arithmetic_hypot_vertical.md index ef7f662..223abbd 100644 --- a/cfavml/src/export_docs/arithmetic_hypot_vertical.md +++ b/cfavml/src/export_docs/arithmetic_hypot_vertical.md @@ -1,6 +1,6 @@ Performs an elementwise hypotenuse of input buffers `a` and `b` that can be projected to the desired output size of `result`. Implementation is an appoximation that -_should_ match std::hypot in most cases. However, with some inputs it's been confirmed to be off by 1 ulp. +_should_ match std::hypot in most cases. However, with some inputs it's been confirmed to be off by 1 ulp. 
Note: for `no_std` builds the result will be off more significantly for fallback ### Projecting Vectors @@ -16,13 +16,17 @@ of `128` elements in length must take a result buffer of `128` elements in lengt ### Implementation Pseudocode -_This is the logic of the routine being called._ +_This is the (simplified) logic of the routine being called._ ```ignore result = [0; dims] +// assume |a|<|b| for all a,b for i in range(dims): - result[i] = a[i].hypot(b[i]) + let hi = a[i].abs() + let lo = b[i].abs() + let scale = 1/hi + result[i] = hi * sqrt(lo*scale + 1) return result ``` diff --git a/cfavml/src/math/default.rs b/cfavml/src/math/default.rs index c538f8e..6023462 100644 --- a/cfavml/src/math/default.rs +++ b/cfavml/src/math/default.rs @@ -250,7 +250,7 @@ impl Numeric for StdMath { #[cfg(not(feature = "std"))] { // Fall back to direct calculation - f32_sqrt_fast((a * a + b * b) as f32) as f64 + StdMath::sqrt(a * a + b * b) } } } From 511e1da0993bfdee2c8adeb74fcde6b3284743ae Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 7 Dec 2024 12:46:23 -0600 Subject: [PATCH 10/13] apply changes from code review --- cfavml/src/danger/impl_avx512.rs | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/cfavml/src/danger/impl_avx512.rs b/cfavml/src/danger/impl_avx512.rs index f5567cd..fc4ef33 100644 --- a/cfavml/src/danger/impl_avx512.rs +++ b/cfavml/src/danger/impl_avx512.rs @@ -138,15 +138,11 @@ impl Hypot for Avx512 { #[inline(always)] unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { // convert the inputs to absolute values - let (x, y) = ( - _mm512_andnot_ps(_mm512_set1_ps(-0.0), x), - _mm512_andnot_ps(_mm512_set1_ps(-0.0), y), - ); + let x = _mm512_andnot_ps(_mm512_set1_ps(-0.0), x); + let y = _mm512_andnot_ps(_mm512_set1_ps(-0.0), y); // find the max and min of the two inputs - let (hi, lo) = ( - >::max(x, y), - >::min(x, y), - ); + let hi = >::max(x, y); + let lo = >::min(x, y); //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa let exponent_mask = _mm512_set1_ps(f32::INFINITY); //The mantissa mask is the inverse of the exponent mask @@ -308,15 +304,11 @@ impl Hypot for Avx512 { #[inline(always)] unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { // convert the inputs to absolute values - let (x, y) = ( - _mm512_andnot_pd(_mm512_set1_pd(-0.0), x), - _mm512_andnot_pd(_mm512_set1_pd(-0.0), y), - ); + let x = _mm512_andnot_pd(_mm512_set1_pd(-0.0), x); + let y = _mm512_andnot_pd(_mm512_set1_pd(-0.0), y); // find the max and min of the two inputs - let (hi, lo) = ( - >::max(x, y), - >::min(x, y), - ); + let hi = >::max(x, y); + let lo = >::min(x, y); //Infinity is represented by all 1s in the exponent, and all 0s in the mantissa let exponent_mask = _mm512_set1_pd(f64::INFINITY); //The mantissa mask is the inverse of the exponent mask From c1120d86483f6837be6118b34f661cada9336b43 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 7 Dec 2024 12:51:02 -0600 Subject: [PATCH 11/13] apply changes from code review --- cfavml/src/danger/impl_neon.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cfavml/src/danger/impl_neon.rs b/cfavml/src/danger/impl_neon.rs index ef60095..7f6309b 100644 --- a/cfavml/src/danger/impl_neon.rs +++ b/cfavml/src/danger/impl_neon.rs @@ -152,10 +152,12 @@ impl Hypot for Neon { #[inline(always)] unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { // Convert inputs to absolute values - let (x, y) = 
(vabsq_f32(x), vabsq_f32(y)); + let x = vabsq_f32(x); + let y = vabsq_f32(y); // Find the max and min of the two inputs - let (hi, lo) = (vmaxq_f32(x, y), vminq_f32(x, y)); + let hi = vmaxq_f32(x, y); + let lo = vminq_f32(x, y); let exponent_mask = vdupq_n_u32(EXPONENT_MASK_F32); let mantissa_mask = vdupq_n_u32(MANTISSA_MASK_F32); @@ -327,10 +329,12 @@ impl Hypot for Neon { #[inline(always)] unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register { // Convert inputs to absolute values - let (x, y) = (vabsq_f64(x), vabsq_f64(y)); + let x = vabsq_f64(x); + let y = vabsq_f64(y); // Find the max and min of the two inputs - let (hi, lo) = (vmaxq_f64(x, y), vminq_f64(x, y)); + let hi = vmaxq_f64(x, y); + let lo = vminq_f64(x, y); let exponent_mask = vdupq_n_u64(f64::INFINITY.to_bits()); let mantissa_mask = vdupq_n_u64((f64::MIN_POSITIVE - mem::transmute::(1)).to_bits()); From dbd00fcb6a3aef32e1b6c2e505d7c7386d31d706 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Sat, 7 Dec 2024 14:13:43 -0600 Subject: [PATCH 12/13] changed name of numeric trait to MiscFloatOps --- cfavml/src/lib.rs | 2 +- ...t_numeric_ops.rs => safe_trait_misc_float_ops.rs} | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) rename cfavml/src/{safe_trait_numeric_ops.rs => safe_trait_misc_float_ops.rs} (94%) diff --git a/cfavml/src/lib.rs b/cfavml/src/lib.rs index 19affdd..1fd3bb8 100644 --- a/cfavml/src/lib.rs +++ b/cfavml/src/lib.rs @@ -23,7 +23,7 @@ pub mod safe_trait_agg_ops; pub mod safe_trait_arithmetic_ops; pub mod safe_trait_cmp_ops; pub mod safe_trait_distance_ops; -pub mod safe_trait_numeric_ops; +pub mod safe_trait_misc_float_ops; #[cfg(test)] mod test_utils; diff --git a/cfavml/src/safe_trait_numeric_ops.rs b/cfavml/src/safe_trait_misc_float_ops.rs similarity index 94% rename from cfavml/src/safe_trait_numeric_ops.rs rename to cfavml/src/safe_trait_misc_float_ops.rs index c442fc0..a805272 100644 --- a/cfavml/src/safe_trait_numeric_ops.rs +++ b/cfavml/src/safe_trait_misc_float_ops.rs @@ -7,8 +7,8 @@ use crate::buffer::WriteOnlyBuffer; use crate::danger::export_hypot; use crate::mem_loader::{IntoMemLoader, MemLoader}; -/// Various arithmetic operations over vectors. -pub trait NumericOps: Sized + Copy { +/// Various float operations over vectors. +pub trait MiscFloatOps: Sized + Copy { /// Performs an elementwise hypotenuse of input buffers `a` and `b` that can /// be projected to the desired output size of `result`. Implementation is an appoximation that /// _should_ match std::hypot in most cases. However, with some inputs it's been confirmed to be off by 1 ulp. @@ -56,9 +56,9 @@ pub trait NumericOps: Sized + Copy { for<'a> &'a mut [B3]: WriteOnlyBuffer; } -macro_rules! numeric_ops { +macro_rules! misc_float_ops { ($t:ty) => { - impl NumericOps for $t { + impl MiscFloatOps for $t { fn hypot_vertical(lhs: B1, rhs: B2, result: &mut [B3]) where B1: IntoMemLoader, @@ -81,5 +81,5 @@ macro_rules! 
numeric_ops { }; } -numeric_ops!(f32); -numeric_ops!(f64); +misc_float_ops!(f32); +misc_float_ops!(f64); From 446c6b6dd8f49cf9af5452c55aa6fce64fe3a329 Mon Sep 17 00:00:00 2001 From: Joshua Ferguson Date: Tue, 17 Dec 2024 18:11:33 -0600 Subject: [PATCH 13/13] fixed issue with "out of order" arguments to neon fma functions --- cfavml/src/danger/impl_neon.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cfavml/src/danger/impl_neon.rs b/cfavml/src/danger/impl_neon.rs index 7f6309b..55c34f1 100644 --- a/cfavml/src/danger/impl_neon.rs +++ b/cfavml/src/danger/impl_neon.rs @@ -189,7 +189,7 @@ impl Hypot for Neon { // finally, do the thing let hi_scaled = vmulq_f32(hi_scaled, hi_scaled); let lo_scaled = vmulq_f32(lo, scale); - vmulq_f32(hi2p, vsqrtq_f32(vfmaq_f32(lo_scaled, lo_scaled, hi_scaled))) + vmulq_f32(hi2p, vsqrtq_f32(vfmaq_f32(hi_scaled, lo_scaled, lo_scaled))) } } @@ -367,7 +367,7 @@ impl Hypot for Neon { // finally, do the thing let hi_scaled = vmulq_f64(hi_scaled, hi_scaled); let lo_scaled = vmulq_f64(lo, scale); - vmulq_f64(hi2p, vsqrtq_f64(vfmaq_f64(lo_scaled, lo_scaled, hi_scaled))) + vmulq_f64(hi2p, vsqrtq_f64(vfmaq_f64(hi_scaled, lo_scaled, lo_scaled))) } }
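
A note on the final fix above: the NEON intrinsic `vfmaq_f32(acc, m1, m2)` computes `acc + m1 * m2`, taking the accumulator first, whereas `_mm256_fmadd_ps(m1, m2, acc)` computes `m1 * m2 + acc`. With the old argument order the NEON kernels evaluated `lo_scaled + lo_scaled * hi_scaled` instead of the intended `hi_scaled + lo_scaled * lo_scaled` (where `hi_scaled` has already been squared at that point). A minimal scalar sketch of the intended accumulation; the helper name is illustrative, not part of the crate:

```rust
/// `hi_sq` is the already-squared hi term; returns `lo_scaled^2 + hi_sq` with a
/// single rounding, mirroring `vfmaq_f32(hi_sq, lo_scaled, lo_scaled)`.
fn fused_sum_of_squares(hi_sq: f32, lo_scaled: f32) -> f32 {
    // f32::mul_add(self, a, b) = self * a + b, the same operand order as _mm256_fmadd_ps.
    lo_scaled.mul_add(lo_scaled, hi_sq)
}
```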
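
For reference, the scaling scheme shared by the AVX2, AVX-512 and NEON kernels can be written as a scalar routine. This is a sketch for illustration only: it mirrors the bit manipulation of the SIMD implementations but omits NaN/infinity handling, assumes default (non-flush-to-zero) subnormal arithmetic, and the function name is hypothetical rather than part of cfavml.

```rust
/// Scalar model of the SIMD hypot kernels (f32 shown).
fn hypot_scaled(x: f32, y: f32) -> f32 {
    let (x, y) = (x.abs(), y.abs());
    let (hi, lo) = if x > y { (x, y) } else { (y, x) };

    let exponent_mask = f32::INFINITY.to_bits(); // 0x7F80_0000
    let mantissa_mask = (f32::MIN_POSITIVE - f32::from_bits(1)).to_bits(); // 0x007F_FFFF

    // `hi` rounded down to a power of two: keep only its exponent bits.
    let hi2p = f32::from_bits(hi.to_bits() & exponent_mask);
    // Reciprocal of that power of two: double it, then flip the exponent bits.
    let scale = f32::from_bits((hi2p + hi2p).to_bits() ^ exponent_mask);

    // Subnormal `hi` has no exponent bits set, so fall back to the smallest normal.
    let normal = hi > f32::MIN_POSITIVE;
    let outer_scale = if normal { hi2p } else { f32::MIN_POSITIVE };
    let inner_scale = if normal { scale } else { 1.0 / f32::MIN_POSITIVE };

    // hi / outer_scale, built from the mantissa bits; subnormals need the -1.0 fix-up.
    let mut hi_scaled = f32::from_bits((hi.to_bits() & mantissa_mask) | 1.0f32.to_bits());
    if hi < f32::MIN_POSITIVE {
        hi_scaled -= 1.0;
    }

    let lo_scaled = lo * inner_scale;
    outer_scale * (hi_scaled * hi_scaled + lo_scaled * lo_scaled).sqrt()
}
```

The key point is that `hi2p` is an exact power of two, so dividing by it only shifts exponents and loses no precision; this keeps `lo_scaled` from underflowing (or `hi * hi` from overflowing) before the squares are summed under the square root.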
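
Finally, a usage sketch of the renamed safe trait. It assumes, as the export tests above do, that plain `Vec`/slice arguments satisfy the `IntoMemLoader` and `WriteOnlyBuffer` bounds; the input values are illustrative.

```rust
use cfavml::safe_trait_misc_float_ops::MiscFloatOps;

fn main() {
    let a = vec![3.0f32, 5.0, 8.0, 7.0];
    let b = vec![4.0f32, 12.0, 15.0, 24.0];
    let mut out = vec![0.0f32; 4];

    // Elementwise hypot; dispatch! selects the AVX-512/AVX2/NEON kernel or the fallback.
    f32::hypot_vertical(&a, &b, &mut out);

    // The result may differ from std::hypot by up to 1 ULP, so compare approximately.
    for (got, want) in out.iter().zip([5.0f32, 13.0, 17.0, 25.0]) {
        assert!((got - want).abs() <= want * f32::EPSILON);
    }
}
```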