Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions cfavml/src/danger/core_simd_api.rs
Original file line number Diff line number Diff line change
Expand Up @@ -405,3 +405,21 @@ pub trait SimdRegister<T: Copy> {
Self::write(mem.add(Self::elements_per_lane() * 7), lane.h);
}
}

pub trait Hypot<T>: SimdRegister<T>
where
T: Copy,
{
/// SIMD Variant of the std hypot function. Computes the distance between the origin
/// and a point (`x`, `y`) on the Euclidean plane.
unsafe fn hypot(a: Self::Register, b: Self::Register) -> Self::Register;
#[inline(always)]

/// Perform a element wise hypot on two dense lanes.
unsafe fn hypot_dense(
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit, please add docs for this function as well, and just add a note under # safety that it depends on the safety requirements of SimdRegister<T>.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just add a note under # safety that it depends on the safety requirements of SimdRegister

You mean for the docs of op_hypot correct?

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, technically it depends on both but I guess if you satisfy op_hypot you satisfy the register requirements anyway.

a: DenseLane<Self::Register>,
b: DenseLane<Self::Register>,
) -> DenseLane<Self::Register> {
apply_dense!(Self::hypot, a, b)
}
}
212 changes: 212 additions & 0 deletions cfavml/src/danger/export_hypot.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
//! Common arithmetic operations
//!
//! I.e. Add, Sub, Mul, Div...
use crate::buffer::WriteOnlyBuffer;
use crate::danger::core_simd_api::Hypot;
use crate::danger::{generic_hypot_vertical, SimdRegister};
use crate::math::{AutoMath, Math, Numeric};
use crate::mem_loader::{IntoMemLoader, MemLoader};

macro_rules! define_hypot_impls {
(
$hypot_name:ident,
$imp:ident $(,)?
$(target_features = $($feat:expr $(,)?)+)?
) => {
#[inline]
$(#[target_feature($(enable = $feat, )*)])*
#[doc = include_str!("../export_docs/arithmetic_hypot_vertical.md")]
$(

#[doc = concat!("- ", $("**`+", $feat, "`** ", )*)]
#[doc = "CPU features are available at runtime. Running on hardware _without_ this feature available will cause immediate UB."]
)*
pub unsafe fn $hypot_name<T, B1, B2, B3>(
a: B1,
b: B2,
result: &mut [B3],
)
where
T: Copy,
B1: IntoMemLoader<T>,
B1::Loader: MemLoader<Value = T>,
B2: IntoMemLoader<T>,
B2::Loader: MemLoader<Value = T>,
crate::danger::$imp: SimdRegister<T>+ Hypot<T>,
AutoMath: Math<T> + Numeric<T>,
for<'a> &'a mut [B3]: WriteOnlyBuffer<Item = T>,
{
generic_hypot_vertical::<T, crate::danger::$imp, AutoMath, B1, B2, B3>(
a,
b,
result,
)
}
};
}

define_hypot_impls!(generic_fallback_hypot_vertical, Fallback,);
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
define_hypot_impls!(generic_avx2_hypot_vertical, Avx2, target_features = "avx2");
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
define_hypot_impls!(
generic_avx2fma_hypot_vertical,
Avx2Fma,
target_features = "avx2",
"fma"
);
#[cfg(all(any(target_arch = "x86", target_arch = "x86_64"), feature = "nightly"))]
define_hypot_impls!(
generic_avx512_hypot_vertical,
Avx512,
target_features = "avx512f",
"avx512bw"
);
#[cfg(target_arch = "aarch64")]
define_hypot_impls!(generic_neon_hypot_vertical, Neon, target_features = "neon");

#[cfg(test)]
mod tests {
use num_traits::{Float, FloatConst};

use super::*;

macro_rules! define_inner_test {
($variant:ident, op = $op:ident, ty = $t:ident) => {
paste::paste! {
#[test]
fn [< $variant _ $op _value_ $t >]() {
let (l1, _) = crate::test_utils::get_sample_vectors::<$t>(533);

let mut result = vec![$t::default(); 533];
unsafe { [< $variant _ $op _vertical >](&l1, 2 as $t, &mut result) };

let expected = l1.iter()
.copied()
.map(|v| AutoMath::$op(v, 2 as $t))
.collect::<Vec<_>>();

for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) {
let ulps_diff = get_diff_ulps(*expected, *actual);
assert!(
ulps_diff.abs() <= 1,
"result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}",
initial, expected, actual, ulps_diff
);

}

}

#[test]
fn [< $variant _ $op _vector_ $t >]() {
let (l1, l2) = crate::test_utils::get_sample_vectors::<$t>(533);

let mut result = vec![$t::default(); 533];
unsafe { [< $variant _ $op _vertical >](&l1, &l2, &mut result) };

let expected = l1.iter()
.copied()
.zip(l2.iter().copied())
.map(|(a, b)| AutoMath::$op(a, b))
.collect::<Vec<_>>();
for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) {
let ulps_diff = get_diff_ulps(*expected, *actual);
assert!(
ulps_diff.abs() <= 1,
"result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}",
initial, expected, actual, ulps_diff
);
}
}
#[test]
fn [< $variant _ $op _subnormal_value_ $t >]() {
let (l1, _) = crate::test_utils::get_subnormal_sample_vectors::<$t>(533);

let mut result = vec![$t::default(); 533];
unsafe { [< $variant _ $op _vertical >](&l1, 2 as $t, &mut result) };

let expected = l1.iter()
.copied()
.map(|v| AutoMath::$op(v, 2 as $t))
.collect::<Vec<_>>();

for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) {
let ulps_diff = get_diff_ulps(*expected, *actual);
assert!(
ulps_diff.abs() <= 1,
"result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}",
initial, expected, actual, ulps_diff
);

}

}

#[test]
fn [< $variant _ $op _subnormal_vector_ $t >]() {
let (l1, l2) = crate::test_utils::get_subnormal_sample_vectors::<$t>(533);

let mut result = vec![$t::default(); 533];
unsafe { [< $variant _ $op _vertical >](&l1, &l2, &mut result) };

let expected = l1.iter()
.copied()
.zip(l2.iter().copied())
.map(|(a, b)| AutoMath::$op(a, b))
.collect::<Vec<_>>();
for ((initial, expected), actual) in l1.iter().zip(&expected).zip(&result) {
let ulps_diff = get_diff_ulps(*expected, *actual);
assert!(
ulps_diff.abs() <= 1,
"result differs by more than 1 ULP:\n initial inputs: {}, 2\n expected: {} actual: {}\nulps diff: {}",
initial, expected, actual, ulps_diff
);
}
}
}
};
}

fn get_diff_ulps<T>(a: T, b: T) -> i64
where
T: Float + FloatConst,
{
let (a_mant, a_exp, a_sign) = a.integer_decode();
let (b_mant, b_exp, b_sign) = b.integer_decode();
assert!(a_sign == b_sign);
assert!(a_exp == b_exp);
a_mant as i64 - b_mant as i64
}

macro_rules! define_numeric_test {
($variant:ident, types = $($t:ident $(,)?)+) => {
$(
define_inner_test!($variant, op = hypot, ty = $t);

)*
};
}

define_numeric_test!(generic_fallback, types = f32, f64,);
#[cfg(all(
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "avx2"
))]
define_numeric_test!(generic_avx2, types = f32, f64,);
#[cfg(all(
any(target_arch = "x86", target_arch = "x86_64"),
target_feature = "avx2",
target_feature = "fma"
))]
define_numeric_test!(generic_avx2fma, types = f32, f64,);
#[cfg(all(
any(target_arch = "x86", target_arch = "x86_64"),
feature = "nightly",
target_feature = "avx512f"
))]
define_numeric_test!(generic_avx512, types = f32, f64,);
#[cfg(all(target_arch = "aarch64", target_feature = "neon"))]
define_numeric_test!(generic_neon, types = f32, f64,);
}
102 changes: 101 additions & 1 deletion cfavml/src/danger/impl_avx2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use core::arch::x86_64::*;
use core::iter::zip;
use core::mem;

use super::core_simd_api::{DenseLane, SimdRegister};
use super::core_simd_api::{DenseLane, Hypot, SimdRegister};
use crate::apply_dense;

/// AVX2 enabled SIMD operations.
Expand Down Expand Up @@ -169,6 +169,58 @@ impl SimdRegister<f32> for Avx2 {
}
}

impl Hypot<f32> for Avx2 {
//b' * sqrt((a*b_hat)^2 + (b*b_hat)^2), where |b| > |a|, b'=(2^n<b), b_hat=1/b'
#[inline(always)]
unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register {
// convert the inputs to absolute values
let x = _mm256_andnot_ps(_mm256_set1_ps(-0.0), x);

let y = _mm256_andnot_ps(_mm256_set1_ps(-0.0), y);

// find the max and min of the two inputs
let hi = <Avx2 as SimdRegister<f32>>::max(x, y);
let lo = <Avx2 as SimdRegister<f32>>::min(x, y);

//Infinity is represented by all 1s in the exponent, and all 0s in the mantissa
let exponent_mask = _mm256_set1_ps(f32::INFINITY);
//The mantissa mask is the inverse of the exponent mask
let mantissa_mask = _mm256_sub_ps(
_mm256_set1_ps(f32::MIN_POSITIVE),
_mm256_set1_ps(f32::from_bits(1_u32)),
);
// round the hi values down to the nearest power of 2
let hi2p = _mm256_and_ps(hi, exponent_mask);
// we scale the values inside the root by the reciprocal of hi2p. since it's a power of 2,
// we can double it and xor it with the exponent mask
let scale = _mm256_xor_ps(_mm256_add_ps(hi2p, hi2p), exponent_mask);

// create a mask that matches the normal hi values
let mask = _mm256_cmp_ps::<_CMP_GT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE));
// replace the subnormal values of hi2p with the minimum positive normal value
let hi2p = _mm256_blendv_ps(_mm256_set1_ps(f32::MIN_POSITIVE), hi2p, mask);
// replace the subnormal values of scale with the reciprocal of the minimum positive normal value
let scale =
_mm256_blendv_ps(_mm256_set1_ps(1.0 / f32::MIN_POSITIVE), scale, mask);
// create a mask that matches the subnormal hi values
let mask = _mm256_cmp_ps::<_CMP_LT_OQ>(hi, _mm256_set1_ps(f32::MIN_POSITIVE));
// since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1
let hi_scaled =
_mm256_or_ps(_mm256_and_ps(hi, mantissa_mask), _mm256_set1_ps(1.0));
// for the subnormal elements of hi, we need to subtract 1 from the scaled hi values
let hi_scaled = _mm256_blendv_ps(
hi_scaled,
_mm256_sub_ps(hi_scaled, _mm256_set1_ps(1.0)),
mask,
);

let hi_scaled = _mm256_mul_ps(hi_scaled, hi_scaled);
let lo_scaled = _mm256_mul_ps(lo, scale);
let lo_scaled = _mm256_mul_ps(lo_scaled, lo_scaled);
_mm256_mul_ps(hi2p, _mm256_sqrt_ps(_mm256_add_ps(lo_scaled, hi_scaled)))
}
}

impl SimdRegister<f64> for Avx2 {
type Register = __m256d;

Expand Down Expand Up @@ -2724,3 +2776,51 @@ impl SimdRegister<u64> for Avx2 {
_mm256_storeu_si256(mem.cast(), reg)
}
}

impl Hypot<f64> for Avx2 {
#[inline(always)]
unsafe fn hypot(x: Self::Register, y: Self::Register) -> Self::Register {
// convert the inputs to absolute values

let x = _mm256_andnot_pd(_mm256_set1_pd(-0.0), x);
let y = _mm256_andnot_pd(_mm256_set1_pd(-0.0), y);
let hi = <Avx2 as SimdRegister<f64>>::max(x, y);
let lo = <Avx2 as SimdRegister<f64>>::min(x, y);
//Infinity is represented by all 1s in the exponent, and all 0s in the mantissa
let exponent_mask = _mm256_set1_pd(f64::INFINITY);
//The mantissa mask is the inverse of the exponent mask
let mantissa_mask = _mm256_sub_pd(
_mm256_set1_pd(f64::MIN_POSITIVE),
_mm256_set1_pd(f64::from_bits(1_u64)),
);
// round the hi values down to the nearest power of 2
let hi2p = _mm256_and_pd(hi, exponent_mask);
// we scale the values inside the root by the reciprocal of hi2p. since it's a power of 2,
// we can double it and xor it with the exponent mask
let scale = _mm256_xor_pd(_mm256_add_pd(hi2p, hi2p), exponent_mask);

// create a mask that matches the normal hi values
let mask = _mm256_cmp_pd::<_CMP_GT_OQ>(hi, _mm256_set1_pd(f64::MIN_POSITIVE));
// replace the subnormal values of hi2p with the minimum positive normal value
let hi2p = _mm256_blendv_pd(_mm256_set1_pd(f64::MIN_POSITIVE), hi2p, mask);
// replace the subnormal values of scale with the reciprocal of the minimum positive normal value
let scale =
_mm256_blendv_pd(_mm256_set1_pd(1.0 / f64::MIN_POSITIVE), scale, mask);
// create a mask that matches the subnormal hi values
let mask = _mm256_cmp_pd::<_CMP_LT_OQ>(hi, _mm256_set1_pd(f64::MIN_POSITIVE));
// since hi2p was preserved the exponent bits of hi, the exponent of hi/hi2p is 1
let hi_scaled =
_mm256_or_pd(_mm256_and_pd(hi, mantissa_mask), _mm256_set1_pd(1.0));
// for the subnormal elements of hi, we need to subtract 1 from the scaled hi values
let hi_scaled = _mm256_blendv_pd(
hi_scaled,
_mm256_sub_pd(hi_scaled, _mm256_set1_pd(1.0)),
mask,
);

let hi_scaled = _mm256_mul_pd(hi_scaled, hi_scaled);
let lo_scaled = _mm256_mul_pd(lo, scale);
let lo_scaled = _mm256_mul_pd(lo_scaled, lo_scaled);
_mm256_mul_pd(hi2p, _mm256_sqrt_pd(_mm256_add_pd(lo_scaled, hi_scaled)))
}
}
Loading
Loading