Skip to content

Commit e082947

Browse files
authored
Add hypotenuse function for f32, f64 (#15)
* hypotenuse function for f32, f64 * use sqrt intrinsic for fastmath, implemented Hypot for Neon * added hypot test for Neon, Axx512 * revert sqrt change, fallback to direct calculation if no_std for now * added export_hypot * safe api implemented * fixed issue with hypot test, fixed clippy warnings * added export hypot test for avx2fma, subnormal floats * changed name of numeric trait to MiscFloatOps * fixed issue with "out of order" arguments to neon fma functions
1 parent f99c448 commit e082947

18 files changed

+1031
-10
lines changed

cfavml/src/danger/core_simd_api.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,3 +405,21 @@ pub trait SimdRegister<T: Copy> {
405405
Self::write(mem.add(Self::elements_per_lane() * 7), lane.h);
406406
}
407407
}
408+
409+
pub trait Hypot<T>: SimdRegister<T>
410+
where
411+
T: Copy,
412+
{
413+
/// SIMD Variant of the std hypot function. Computes the distance between the origin
414+
/// and a point (`x`, `y`) on the Euclidean plane.
415+
unsafe fn hypot(a: Self::Register, b: Self::Register) -> Self::Register;
416+
#[inline(always)]
417+
418+
/// Perform a element wise hypot on two dense lanes.
419+
unsafe fn hypot_dense(
420+
a: DenseLane<Self::Register>,
421+
b: DenseLane<Self::Register>,
422+
) -> DenseLane<Self::Register> {
423+
apply_dense!(Self::hypot, a, b)
424+
}
425+
}

cfavml/src/danger/export_hypot.rs

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
//! Common arithmetic operations
2+
//!
3+
//! I.e. Add, Sub, Mul, Div...
4+
5+
use crate::buffer::WriteOnlyBuffer;
6+
use crate::danger::core_simd_api::Hypot;
7+
use crate::danger::{generic_hypot_vertical, SimdRegister};
8+
use crate::math::{AutoMath, Math, Numeric};
9+
use crate::mem_loader::{IntoMemLoader, MemLoader};
10+
11+
macro_rules! define_hypot_impls {
12+
(
13+
$hypot_name:ident,
14+
$imp:ident $(,)?
15+
$(target_features = $($feat:expr $(,)?)+)?
16+
) => {
17+
#[inline]
18+
$(#[target_feature($(enable = $feat, )*)])*
19+
#[doc = include_str!("../export_docs/arithmetic_hypot_vertical.md")]
20+
$(
21+
22+
#[doc = concat!("- ", $("**`+", $feat, "`** ", )*)]
23+
#[doc = "CPU features are available at runtime. Running on hardware _without_ this feature available will cause immediate UB."]
24+
)*
25+
pub unsafe fn $hypot_name<T, B1, B2, B3>(
26+
a: B1,
27+
b: B2,
28+
result: &mut [B3],
29+
)
30+
where
31+
T: Copy,
32+
B1: IntoMemLoader<T>,
33+
B1::Loader: MemLoader<Value = T>,
34+
B2: IntoMemLoader<T>,
35+
B2::Loader: MemLoader<Value = T>,
36+
crate::danger::$imp: SimdRegister<T>+ Hypot<T>,
37+
AutoMath: Math<T> + Numeric<T>,
38+
for<'a> &'a mut [B3]: WriteOnlyBuffer<Item = T>,
39+
{
40+
generic_hypot_vertical::<T, crate::danger::$imp, AutoMath, B1, B2, B3>(
41+
a,
42+
b,
43+
result,
44+
)
45+
}
46+
};
47+
}
48+
49+
define_hypot_impls!(generic_fallback_hypot_vertical, Fallback,);
50+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
51+
define_hypot_impls!(generic_avx2_hypot_vertical, Avx2, target_features = "avx2");
52+
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
53+
define_hypot_impls!(
54+
generic_avx2fma_hypot_vertical,
55+
Avx2Fma,
56+
target_features = "avx2",
57+
"fma"
58+
);
59+
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "nightly"))]
60+
define_hypot_impls!(
61+
generic_avx512_hypot_vertical,
62+
Avx512,
63+
target_features = "avx512f",
64+
"avx512bw"
65+
);
66+
#[cfg(target_arch = "aarch64")]
67+
define_hypot_impls!(generic_neon_hypot_vertical, Neon, target_features = "neon");
68+
69+
#[cfg(test)]
70+
mod tests {
71+
use num_traits::{Float, FloatConst};
72+
73+
use super::*;
74+
75+
macro_rules! define_inner_test {
76+
($variant:ident, op = $op:ident, ty = $t:ident) => {
77+
paste::paste! {
78+
#[test]
79+
fn [< $variant _ $op _value_ $t >]() {
80+
let (l1, _) = crate::test_utils::get_sample_vectors::<$t>(533);
81+
82+
let mut result = vec![$t::default(); 533];
83+
unsafe { [< $variant _ $op _vertical >](&l1, 2 as $t, &mut result) };
84+
85+
let expected = l1.iter()
86+
.copied()
87+
.map(|v| AutoMath::$op(v, 2 as $t))
88+
.collect::<Vec<_>>();
89+
90+
for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) {
91+
let ulps_diff = get_diff_ulps(*expected, *actual);
92+
assert!(
93+
ulps_diff.abs() <= 1,
94+
"result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}",
95+
initial, expected, actual, ulps_diff
96+
);
97+
98+
}
99+
100+
}
101+
102+
#[test]
103+
fn [< $variant _ $op _vector_ $t >]() {
104+
let (l1, l2) = crate::test_utils::get_sample_vectors::<$t>(533);
105+
106+
let mut result = vec![$t::default(); 533];
107+
unsafe { [< $variant _ $op _vertical >](&l1, &l2, &mut result) };
108+
109+
let expected = l1.iter()
110+
.copied()
111+
.zip(l2.iter().copied())
112+
.map(|(a, b)| AutoMath::$op(a, b))
113+
.collect::<Vec<_>>();
114+
for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) {
115+
let ulps_diff = get_diff_ulps(*expected, *actual);
116+
assert!(
117+
ulps_diff.abs() <= 1,
118+
"result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}",
119+
initial, expected, actual, ulps_diff
120+
);
121+
}
122+
}
123+
#[test]
124+
fn [< $variant _ $op _subnormal_value_ $t >]() {
125+
let (l1, _) = crate::test_utils::get_subnormal_sample_vectors::<$t>(533);
126+
127+
let mut result = vec![$t::default(); 533];
128+
unsafe { [< $variant _ $op _vertical >](&l1, 2 as $t, &mut result) };
129+
130+
let expected = l1.iter()
131+
.copied()
132+
.map(|v| AutoMath::$op(v, 2 as $t))
133+
.collect::<Vec<_>>();
134+
135+
for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) {
136+
let ulps_diff = get_diff_ulps(*expected, *actual);
137+
assert!(
138+
ulps_diff.abs() <= 1,
139+
"result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}",
140+
initial, expected, actual, ulps_diff
141+
);
142+
143+
}
144+
145+
}
146+
147+
#[test]
148+
fn [< $variant _ $op _subnormal_vector_ $t >]() {
149+
let (l1, l2) = crate::test_utils::get_subnormal_sample_vectors::<$t>(533);
150+
151+
let mut result = vec![$t::default(); 533];
152+
unsafe { [< $variant _ $op _vertical >](&l1, &l2, &mut result) };
153+
154+
let expected = l1.iter()
155+
.copied()
156+
.zip(l2.iter().copied())
157+
.map(|(a, b)| AutoMath::$op(a, b))
158+
.collect::<Vec<_>>();
159+
for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) {
160+
let ulps_diff = get_diff_ulps(*expected, *actual);
161+
assert!(
162+
ulps_diff.abs() <= 1,
163+
"result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}",
164+
initial, expected, actual, ulps_diff
165+
);
166+
}
167+
}
168+
}
169+
};
170+
}
171+
172+
fn get_diff_ulps<T>(a: T, b: T) -> i64
173+
where
174+
T: Float + FloatConst,
175+
{
176+
let (a_mant, a_exp, a_sign) = a.integer_decode();
177+
let (b_mant, b_exp, b_sign) = b.integer_decode();
178+
assert!(a_sign == b_sign);
179+
assert!(a_exp == b_exp);
180+
a_mant as i64 - b_mant as i64
181+
}
182+
183+
macro_rules! define_numeric_test {
184+
($variant:ident, types = $($t:ident $(,)?)+) => {
185+
$(
186+
define_inner_test!($variant, op = hypot, ty = $t);
187+
188+
)*
189+
};
190+
}
191+
192+
define_numeric_test!(generic_fallback, types = f32, f64,);
193+
#[cfg(all(
194+
any(target_arch = "x86", target_arch = "x86_64"),
195+
target_feature = "avx2"
196+
))]
197+
define_numeric_test!(generic_avx2, types = f32, f64,);
198+
#[cfg(all(
199+
any(target_arch = "x86", target_arch = "x86_64"),
200+
target_feature = "avx2",
201+
target_feature = "fma"
202+
))]
203+
define_numeric_test!(generic_avx2fma, types = f32, f64,);
204+
#[cfg(all(
205+
any(target_arch = "x86", target_arch = "x86_64"),
206+
feature = "nightly",
207+
target_feature = "avx512f"
208+
))]
209+
define_numeric_test!(generic_avx512, types = f32, f64,);
210+
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
211+
define_numeric_test!(generic_neon, types = f32, f64,);
212+
}

cfavml/src/danger/impl_avx2.rs

Lines changed: 101 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ use core::arch::x86_64::*;
55
use core::iter::zip;
66
use core::mem;
77

8-
use super::core_simd_api::{DenseLane, SimdRegister};
8+
use super::core_simd_api::{DenseLane, Hypot, SimdRegister};
99
use crate::apply_dense;
1010

1111
/// AVX2 enabled SIMD operations.
@@ -169,6 +169,58 @@ impl SimdRegister<f32> for Avx2 {
169169
}
170170
}
171171

172+
impl Hypot<f32> for Avx2 {
173+
//b' * sqrt((a*b_hat)^2 + (b*b_hat)^2), where |b| > |a|, b'=(2^n<b), b_hat=1/b'
174+
#[inline(always)]
175+
unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register {
176+
// convert the inputs to absolute values
177+
let x = _mm256_andnot_ps(_mm256_set1_ps(-0.0), x);
178+
179+
let y = _mm256_andnot_ps(_mm256_set1_ps(-0.0), y);
180+
181+
// find the max and min of the two inputs
182+
let hi = <Avx2 as SimdRegister<f32>>::max(x, y);
183+
let lo = <Avx2 as SimdRegister<f32>>::min(x, y);
184+
185+
//Infinity is represented by all 1s in the exponent, and all 0s in the mantissa
186+
let exponent_mask = _mm256_set1_ps(f32::INFINITY);
187+
//The mantissa mask is the inverse of the exponent mask
188+
let mantissa_mask = _mm256_sub_ps(
189+
_mm256_set1_ps(f32::MIN_POSITIVE),
190+
_mm256_set1_ps(f32::from_bits(1_u32)),
191+
);
192+
// round the hi values down to the nearest power of 2
193+
let hi2p = _mm256_and_ps(hi, exponent_mask);
194+
// we scale the values inside the root by the reciprocal of hi2p. since it's a power of 2,
195+
// we can double it and xor it with the exponent mask
196+
let scale = _mm256_xor_ps(_mm256_add_ps(hi2p, hi2p), exponent_mask);
197+
198+
// create a mask that matches the normal hi values
199+
let mask = _mm256_cmp_ps::<_CMP_GT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE));
200+
// replace the subnormal values of hi2p with the minimum positive normal value
201+
let hi2p = _mm256_blendv_ps(_mm256_set1_ps(f32::MIN_POSITIVE), hi2p, mask);
202+
// replace the subnormal values of scale with the reciprocal of the minimum positive normal value
203+
let scale =
204+
_mm256_blendv_ps(_mm256_set1_ps(1.0 / f32::MIN_POSITIVE), scale, mask);
205+
// create a mask that matches the subnormal hi values
206+
let mask = _mm256_cmp_ps::<_CMP_LT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE));
207+
// since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1
208+
let hi_scaled =
209+
_mm256_or_ps(_mm256_and_ps(hi, mantissa_mask), _mm256_set1_ps(1.0));
210+
// for the subnormal elements of hi, we need to subtract 1 from the scaled hi values
211+
let hi_scaled = _mm256_blendv_ps(
212+
hi_scaled,
213+
_mm256_sub_ps(hi_scaled, _mm256_set1_ps(1.0)),
214+
mask,
215+
);
216+
217+
let hi_scaled = _mm256_mul_ps(hi_scaled, hi_scaled);
218+
let lo_scaled = _mm256_mul_ps(lo, scale);
219+
let lo_scaled = _mm256_mul_ps(lo_scaled, lo_scaled);
220+
_mm256_mul_ps(hi2p, _mm256_sqrt_ps(_mm256_add_ps(lo_scaled, hi_scaled)))
221+
}
222+
}
223+
172224
impl SimdRegister<f64> for Avx2 {
173225
type Register = __m256d;
174226

@@ -2724,3 +2776,51 @@ impl SimdRegister<u64> for Avx2 {
27242776
_mm256_storeu_si256(mem.cast(), reg)
27252777
}
27262778
}
2779+
2780+
impl Hypot<f64> for Avx2 {
2781+
#[inline(always)]
2782+
unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register {
2783+
// convert the inputs to absolute values
2784+
2785+
let x = _mm256_andnot_pd(_mm256_set1_pd(-0.0), x);
2786+
let y = _mm256_andnot_pd(_mm256_set1_pd(-0.0), y);
2787+
let hi = <Avx2 as SimdRegister<f64>>::max(x, y);
2788+
let lo = <Avx2 as SimdRegister<f64>>::min(x, y);
2789+
//Infinity is represented by all 1s in the exponent, and all 0s in the mantissa
2790+
let exponent_mask = _mm256_set1_pd(f64::INFINITY);
2791+
//The mantissa mask is the inverse of the exponent mask
2792+
let mantissa_mask = _mm256_sub_pd(
2793+
_mm256_set1_pd(f64::MIN_POSITIVE),
2794+
_mm256_set1_pd(f64::from_bits(1_u64)),
2795+
);
2796+
// round the hi values down to the nearest power of 2
2797+
let hi2p = _mm256_and_pd(hi, exponent_mask);
2798+
// we scale the values inside the root by the reciprocal of hi2p. since it's a power of 2,
2799+
// we can double it and xor it with the exponent mask
2800+
let scale = _mm256_xor_pd(_mm256_add_pd(hi2p, hi2p), exponent_mask);
2801+
2802+
// create a mask that matches the normal hi values
2803+
let mask = _mm256_cmp_pd::<_CMP_GT_OQ>(hi, _mm256_set1_pd(f64::MIN_POSITIVE));
2804+
// replace the subnormal values of hi2p with the minimum positive normal value
2805+
let hi2p = _mm256_blendv_pd(_mm256_set1_pd(f64::MIN_POSITIVE), hi2p, mask);
2806+
// replace the subnormal values of scale with the reciprocal of the minimum positive normal value
2807+
let scale =
2808+
_mm256_blendv_pd(_mm256_set1_pd(1.0 / f64::MIN_POSITIVE), scale, mask);
2809+
// create a mask that matches the subnormal hi values
2810+
let mask = _mm256_cmp_pd::<_CMP_LT_OQ>(hi, _mm256_set1_pd(f64::MIN_POSITIVE));
2811+
// since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1
2812+
let hi_scaled =
2813+
_mm256_or_pd(_mm256_and_pd(hi, mantissa_mask), _mm256_set1_pd(1.0));
2814+
// for the subnormal elements of hi, we need to subtract 1 from the scaled hi values
2815+
let hi_scaled = _mm256_blendv_pd(
2816+
hi_scaled,
2817+
_mm256_sub_pd(hi_scaled, _mm256_set1_pd(1.0)),
2818+
mask,
2819+
);
2820+
2821+
let hi_scaled = _mm256_mul_pd(hi_scaled, hi_scaled);
2822+
let lo_scaled = _mm256_mul_pd(lo, scale);
2823+
let lo_scaled = _mm256_mul_pd(lo_scaled, lo_scaled);
2824+
_mm256_mul_pd(hi2p, _mm256_sqrt_pd(_mm256_add_pd(lo_scaled, hi_scaled)))
2825+
}
2826+
}

0 commit comments

Comments
 (0)