Fix tests for native f16; raise the perf_counters kCacheRefs threshold

jan-wassenberg · copybara-github · commit 0fccdf8a61ce · 2025-03-21T08:39:46.000-07:00
PiperOrigin-RevId: 739192363
diff --git a/hwy/contrib/matvec/matvec-inl.h b/hwy/contrib/matvec/matvec-inl.h
@@ -22,6 +22,8 @@
 #define HIGHWAY_HWY_CONTRIB_MATVEC_MATVEC_INL_H_
 #endif
 
+#include <stddef.h>
+
 #include "hwy/cache_control.h"
 #include "hwy/contrib/thread_pool/thread_pool.h"
 #include "hwy/highway.h"
diff --git a/hwy/contrib/matvec/matvec_test.cc b/hwy/contrib/matvec/matvec_test.cc
@@ -24,6 +24,8 @@
 #include <stddef.h>
 #include <stdint.h>
 
+#include <cmath>  // std::abs
+
 #include "hwy/aligned_allocator.h"
 
 // clang-format off
@@ -52,21 +54,23 @@ HWY_NOINLINE void SimpleMatVecAdd(const MatT* HWY_RESTRICT mat,
                                   ThreadPool& pool) {
   if (add) {
     pool.Run(0, rows, [=](uint64_t r, size_t /*thread*/) {
-      T dot = ConvertScalarTo<T>(0);
+      double dot = 0.0;
       for (size_t c = 0; c < cols; c++) {
         // For reasons unknown, fp16 += does not compile on clang (Arm).
-        dot = ConvertScalarTo<T>(dot + mat[r * cols + c] * vec[c]);
+        dot += ConvertScalarTo<double>(mat[r * cols + c]) *
+               ConvertScalarTo<double>(vec[c]);
       }
-      out[r] = dot + add[r];
+      out[r] = ConvertScalarTo<T>(dot + ConvertScalarTo<double>(add[r]));
     });
   } else {
     pool.Run(0, rows, [=](uint64_t r, size_t /*thread*/) {
-      T dot = ConvertScalarTo<T>(0);
+      double dot = 0.0;
       for (size_t c = 0; c < cols; c++) {
         // For reasons unknown, fp16 += does not compile on clang (Arm).
-        dot = ConvertScalarTo<T>(dot + mat[r * cols + c] * vec[c]);
+        dot += ConvertScalarTo<double>(mat[r * cols + c]) *
+               ConvertScalarTo<double>(vec[c]);
       }
-      out[r] = dot;
+      out[r] = ConvertScalarTo<T>(dot);
     });
   }
 }
@@ -118,22 +122,33 @@ HWY_MAYBE_UNUSED HWY_NOINLINE void SimpleMatVecAdd(
   }
 }
 
+// Workaround for incorrect codegen on Arm, which results in values of `av`
+// >= 1E10. Can also be prevented by calling `Print(du, indices)`.
+#if HWY_ARCH_ARM && HWY_COMPILER_CLANG
+#define GENERATE_INLINE HWY_NOINLINE
+#else
+#define GENERATE_INLINE HWY_INLINE
+#endif
+
 struct GenerateMod {
   template <class D, HWY_IF_NOT_BF16_D(D), HWY_IF_LANES_GT_D(D, 1)>
-  Vec<D> operator()(D d, Vec<RebindToUnsigned<D>> indices) const {
+  GENERATE_INLINE Vec<D> operator()(D d,
+                                    Vec<RebindToUnsigned<D>> indices) const {
     const RebindToUnsigned<D> du;
     return Reverse2(d, ConvertTo(d, And(indices, Set(du, 0xF))));
   }
 
   template <class D, HWY_IF_NOT_BF16_D(D), HWY_IF_LANES_LE_D(D, 1)>
-  Vec<D> operator()(D d, Vec<RebindToUnsigned<D>> indices) const {
+  GENERATE_INLINE Vec<D> operator()(D d,
+                                    Vec<RebindToUnsigned<D>> indices) const {
     const RebindToUnsigned<D> du;
     return ConvertTo(d, And(indices, Set(du, 0xF)));
   }
 
   // Requires >= 4 bf16 lanes for float32 Reverse2.
   template <class D, HWY_IF_BF16_D(D), HWY_IF_LANES_GT_D(D, 2)>
-  Vec<D> operator()(D d, Vec<RebindToUnsigned<D>> indices) const {
+  GENERATE_INLINE Vec<D> operator()(D d,
+                                    Vec<RebindToUnsigned<D>> indices) const {
     const RebindToUnsigned<D> du;
     const RebindToSigned<D> di;
     const RepartitionToWide<decltype(di)> dw;
@@ -146,9 +161,10 @@ struct GenerateMod {
 
   // For one or two lanes, we don't have OrderedDemote2To nor Reverse2.
   template <class D, HWY_IF_BF16_D(D), HWY_IF_LANES_LE_D(D, 2)>
-  Vec<D> operator()(D d, Vec<RebindToUnsigned<D>> indices) const {
+  GENERATE_INLINE Vec<D> operator()(D d,
+                                    Vec<RebindToUnsigned<D>> indices) const {
     const Rebind<float, D> df;
-    return DemoteTo(d, Set(df, GetLane(indices)));
+    return DemoteTo(d, Set(df, static_cast<float>(GetLane(indices))));
   }
 };
 
@@ -194,15 +210,19 @@ class TestMatVecAdd {
       for (size_t i = 0; i < kRows; ++i) {
         const double exp = ConvertScalarTo<double>(expected[i]);
         const double act = ConvertScalarTo<double>(actual[i]);
-        const double tolerance =
-            exp * 20 * 1.0 /
-            (1ULL << HWY_MIN(MantissaBits<MatT>(), MantissaBits<VecT>()));
-        if (!(exp - tolerance <= act && act <= exp + tolerance)) {
+        const double epsilon =
+            1.0 / (1ULL << HWY_MIN(MantissaBits<MatT>(), MantissaBits<VecT>()));
+        const double tolerance = exp * 20.0 / epsilon;
+        const double l1 = std::abs(exp - act);
+        const double rel = exp == 0.0 ? 0.0 : l1 / exp;
+
+        if (l1 > tolerance && rel > epsilon) {
           fprintf(stderr,
-                  "%s/%s %zu x %zu, %s: mismatch at %zu %f %f; tol %f\n",
+                  "%s/%s %zu x %zu, %s: mismatch at %zu: %E != %E; "
+                  "tol %f l1 %f rel %E\n",
                   TypeName(MatT(), 1).c_str(), TypeName(VecT(), 1).c_str(),
                   kRows, kCols, (with_add ? "with add" : "without add"), i, exp,
-                  act, tolerance);
+                  act, tolerance, l1, rel);
           HWY_ASSERT(0);
         }
       }
diff --git a/hwy/contrib/thread_pool/topology_test.cc b/hwy/contrib/thread_pool/topology_test.cc
@@ -116,7 +116,8 @@ static void CheckCache(const Cache& c, size_t level) {
   HWY_ASSERT(32 <= c.bytes_per_line && c.bytes_per_line <= 1024);
 
   HWY_ASSERT(c.cores_sharing != 0);
-  HWY_ASSERT(c.cores_sharing <= TotalLogicalProcessors());
+  // +1 observed on RISC-V.
+  HWY_ASSERT(c.cores_sharing <= TotalLogicalProcessors() + 1);
 }
 
 TEST(TopologyTest, TestCaches) {
diff --git a/hwy/contrib/unroller/unroller_test.cc b/hwy/contrib/unroller/unroller_test.cc
@@ -14,6 +14,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <cmath>  // std::abs
 #include <vector>
 
 #include "hwy/base.h"
@@ -33,31 +34,31 @@ namespace HWY_NAMESPACE {
 namespace {
 
 template <typename T>
-T SimpleDot(const T* pa, const T* pb, size_t num) {
-  T sum = 0;
+T DoubleDot(const T* pa, const T* pb, size_t num) {
+  double sum = 0.0;
   for (size_t i = 0; i < num; ++i) {
     // For reasons unknown, fp16 += does not compile on clang (Arm).
-    sum = ConvertScalarTo<T>(sum + pa[i] * pb[i]);
+    sum += ConvertScalarTo<double>(pa[i]) * ConvertScalarTo<double>(pb[i]);
   }
-  return sum;
+  return ConvertScalarTo<T>(sum);
 }
 
 template <typename T>
-T SimpleAcc(const T* pa, size_t num) {
-  T sum = 0;
+T DoubleSum(const T* pa, size_t num) {
+  double sum = 0.0;
   for (size_t i = 0; i < num; ++i) {
-    sum += pa[i];
+    sum += ConvertScalarTo<double>(pa[i]);
   }
-  return sum;
+  return ConvertScalarTo<T>(sum);
 }
 
 template <typename T>
-T SimpleMin(const T* pa, size_t num) {
-  T min = HighestValue<T>();
+T DoubleMin(const T* pa, size_t num) {
+  double min = HighestValue<T>();
   for (size_t i = 0; i < num; ++i) {
-    if (min > pa[i]) min = pa[i];
+    min = HWY_MIN(min, ConvertScalarTo<double>(pa[i]));
   }
-  return min;
+  return ConvertScalarTo<T>(min);
 }
 
 template <typename T>
@@ -370,30 +371,35 @@ struct TestDot {
         b[i] = ConvertScalarTo<T>(random_t());
       }
 
-      const T expected_dot = SimpleDot(a, b, num);
+      const T expected_dot = DoubleDot(a, b, num);
+      const double expected_dot_f64 = ConvertScalarTo<double>(expected_dot);
       MultiplyUnit<T> multfn;
       Unroller(multfn, a, b, y, static_cast<ptrdiff_t>(num));
       AccumulateUnit<T> accfn;
       T dot_via_mul_acc;
       Unroller(accfn, y, &dot_via_mul_acc, static_cast<ptrdiff_t>(num));
       const double tolerance = 120.0 *
                                ConvertScalarTo<double>(hwy::Epsilon<T>()) *
-                               ScalarAbs(expected_dot);
-      HWY_ASSERT(ScalarAbs(expected_dot - dot_via_mul_acc) < tolerance);
+                               std::abs(expected_dot_f64);
+      HWY_ASSERT(std::abs(expected_dot_f64 - ConvertScalarTo<double>(
+                                                 dot_via_mul_acc)) < tolerance);
 
       DotUnit<T> dotfn;
       T dotr;
       Unroller(dotfn, a, b, &dotr, static_cast<ptrdiff_t>(num));
-      HWY_ASSERT(ConvertScalarTo<double>(ScalarAbs((expected_dot - dotr))) <
-                 tolerance);
+      const double dotr_f64 = ConvertScalarTo<double>(dotr);
+      HWY_ASSERT(std::abs(expected_dot_f64 - dotr_f64) < tolerance);
 
-      auto expected_min = SimpleMin(a, num);
+      const T expected_min = DoubleMin(a, num);
       MinUnit<T> minfn;
       T minr;
       Unroller(minfn, a, &minr, static_cast<ptrdiff_t>(num));
 
-      HWY_ASSERT(ConvertScalarTo<double>(ScalarAbs(expected_min - minr)) <
-                 1e-7);
+      const double l1 = std::abs(ConvertScalarTo<double>(expected_min) -
+                                 ConvertScalarTo<double>(minr));
+      // Unlike above, the tolerance is absolute: there should be no numerical
+      // differences between T and double because we only compute the min.
+      HWY_ASSERT(l1 < 1E-7);
     }
 #endif
   }
diff --git a/hwy/perf_counters_test.cc b/hwy/perf_counters_test.cc
@@ -139,7 +139,7 @@ TEST(PerfCountersTest, RunBranches) {
 
   HWY_ASSERT(values[PerfCounters::kL3Loads] < 1E8);       // 174K..12M
   HWY_ASSERT(values[PerfCounters::kL3Stores] < 1E7);      // 44K..1.8M
-  HWY_ASSERT(values[PerfCounters::kCacheRefs] < 1E8);     // 5M..27M
+  HWY_ASSERT(values[PerfCounters::kCacheRefs] < 1E9);     // 5M..104M
   HWY_ASSERT(values[PerfCounters::kCacheMisses] < 1E8);   // 500K..10M
   HWY_ASSERT(values[PerfCounters::kBusCycles] < 1E11);    // 1M..10B
   HWY_ASSERT(values[PerfCounters::kPageFaults] < 1E4);    // 0..1.1K (in SDE)
diff --git a/hwy/tests/float_test.cc b/hwy/tests/float_test.cc
@@ -144,8 +144,9 @@ struct TestApproximateReciprocal {
     double worst_expected = 0.0;
     double worst_actual = 0.0;
     for (size_t i = 0; i < N; ++i) {
-      const double expected = 1.0 / input[i];
-      const double l1 = ScalarAbs(expected - actual[i]);
+      const double expected = 1.0 / ConvertScalarTo<double>(input[i]);
+      const double l1 =
+          ScalarAbs(expected - ConvertScalarTo<double>(actual[i]));
       if (l1 > max_l1) {
         max_l1 = l1;
         worst_expected = expected;
@@ -187,11 +188,12 @@ struct TestMaskedApproximateReciprocal {
     double expected;
     for (size_t i = 0; i < N; ++i) {
       if (i < 3) {
-        expected = 1.0 / input[i];
+        expected = 1.0 / ConvertScalarTo<double>(input[i]);
       } else {
         expected = 0.0;
       }
-      const double l1 = ScalarAbs(expected - actual[i]);
+      const double l1 =
+          ScalarAbs(expected - ConvertScalarTo<double>(actual[i]));
       if (l1 > max_l1) {
         max_l1 = l1;
         worst_expected = expected;
@@ -254,7 +256,9 @@ struct TestReciprocalSquareRoot {
     Store(ApproximateReciprocalSqrt(v), d, lanes.get());
     for (size_t i = 0; i < N; ++i) {
       T err = ConvertScalarTo<T>(ConvertScalarTo<float>(lanes[i]) - 0.090166f);
-      if (err < ConvertScalarTo<T>(0)) err = -err;
+      if (err < ConvertScalarTo<T>(0)) {
+        err = ConvertScalarTo<T>(-ConvertScalarTo<float>(err));
+      }
       if (static_cast<double>(err) >= 4E-4) {
         HWY_ABORT("Lane %d (%d): actual %f err %f\n", static_cast<int>(i),
                   static_cast<int>(N), static_cast<double>(lanes[i]),
diff --git a/hwy/tests/swizzle_test.cc b/hwy/tests/swizzle_test.cc
@@ -258,6 +258,11 @@ struct TestInsertLane {
     DoTestInsertLaneWithConstAmt(d, lanes.get());
 #endif
 
+// TODO(janwas): file compiler bug report
+#if HWY_COMPILER_CLANG && (HWY_COMPILER_CLANG < 2000) && HWY_ARCH_ARM
+    if (IsSpecialFloat<T>()) return;
+#endif
+
     V v2 = Zero(d);
     for (size_t i = 0; i < N; ++i) {
       lanes[i] = ConvertScalarTo<T>(i + 1);

Original file line number	Diff line number	Diff line change
`@@ -116,7 +116,8 @@ static void CheckCache(const Cache& c, size_t level) {`
`116`	`116`	`HWY_ASSERT(32 <= c.bytes_per_line && c.bytes_per_line <= 1024);`
`117`	`117`
`118`	`118`	`HWY_ASSERT(c.cores_sharing != 0);`
`119`		`- HWY_ASSERT(c.cores_sharing <= TotalLogicalProcessors());`
	`119`	`+ // +1 observed on RISC-V.`
	`120`	`+ HWY_ASSERT(c.cores_sharing <= TotalLogicalProcessors() + 1);`
`120`	`121`	`}`
`121`	`122`
`122`	`123`	`TEST(TopologyTest, TestCaches) {`