Skip to content

Commit 0fccdf8

Browse files
jan-wassenberg authored and copybara-github committed
test fixes for native f16, plus update perf_counters threshold
PiperOrigin-RevId: 739192363
1 parent 43d3099 commit 0fccdf8

File tree

7 files changed

+82
-44
lines changed

7 files changed

+82
-44
lines changed

hwy/contrib/matvec/matvec-inl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#define HIGHWAY_HWY_CONTRIB_MATVEC_MATVEC_INL_H_
2323
#endif
2424

25+
#include <stddef.h>
26+
2527
#include "hwy/cache_control.h"
2628
#include "hwy/contrib/thread_pool/thread_pool.h"
2729
#include "hwy/highway.h"

hwy/contrib/matvec/matvec_test.cc

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
#include <stddef.h>
2525
#include <stdint.h>
2626

27+
#include <cmath> // std::abs
28+
2729
#include "hwy/aligned_allocator.h"
2830

2931
// clang-format off
@@ -52,21 +54,23 @@ HWY_NOINLINE void SimpleMatVecAdd(const MatT* HWY_RESTRICT mat,
5254
ThreadPool& pool) {
5355
if (add) {
5456
pool.Run(0, rows, [=](uint64_t r, size_t /*thread*/) {
55-
T dot = ConvertScalarTo<T>(0);
57+
double dot = 0.0;
5658
for (size_t c = 0; c < cols; c++) {
5759
// For reasons unknown, fp16 += does not compile on clang (Arm).
58-
dot = ConvertScalarTo<T>(dot + mat[r * cols + c] * vec[c]);
60+
dot += ConvertScalarTo<double>(mat[r * cols + c]) *
61+
ConvertScalarTo<double>(vec[c]);
5962
}
60-
out[r] = dot + add[r];
63+
out[r] = ConvertScalarTo<T>(dot + ConvertScalarTo<double>(add[r]));
6164
});
6265
} else {
6366
pool.Run(0, rows, [=](uint64_t r, size_t /*thread*/) {
64-
T dot = ConvertScalarTo<T>(0);
67+
double dot = 0.0;
6568
for (size_t c = 0; c < cols; c++) {
6669
// For reasons unknown, fp16 += does not compile on clang (Arm).
67-
dot = ConvertScalarTo<T>(dot + mat[r * cols + c] * vec[c]);
70+
dot += ConvertScalarTo<double>(mat[r * cols + c]) *
71+
ConvertScalarTo<double>(vec[c]);
6872
}
69-
out[r] = dot;
73+
out[r] = ConvertScalarTo<T>(dot);
7074
});
7175
}
7276
}
@@ -118,22 +122,33 @@ HWY_MAYBE_UNUSED HWY_NOINLINE void SimpleMatVecAdd(
118122
}
119123
}
120124

125+
// Workaround for incorrect codegen on Arm, which results in values of `av`
126+
// >= 1E10. Can also be prevented by calling `Print(du, indices)`.
127+
#if HWY_ARCH_ARM && HWY_COMPILER_CLANG
128+
#define GENERATE_INLINE HWY_NOINLINE
129+
#else
130+
#define GENERATE_INLINE HWY_INLINE
131+
#endif
132+
121133
struct GenerateMod {
122134
template <class D, HWY_IF_NOT_BF16_D(D), HWY_IF_LANES_GT_D(D, 1)>
123-
Vec<D> operator()(D d, Vec<RebindToUnsigned<D>> indices) const {
135+
GENERATE_INLINE Vec<D> operator()(D d,
136+
Vec<RebindToUnsigned<D>> indices) const {
124137
const RebindToUnsigned<D> du;
125138
return Reverse2(d, ConvertTo(d, And(indices, Set(du, 0xF))));
126139
}
127140

128141
template <class D, HWY_IF_NOT_BF16_D(D), HWY_IF_LANES_LE_D(D, 1)>
129-
Vec<D> operator()(D d, Vec<RebindToUnsigned<D>> indices) const {
142+
GENERATE_INLINE Vec<D> operator()(D d,
143+
Vec<RebindToUnsigned<D>> indices) const {
130144
const RebindToUnsigned<D> du;
131145
return ConvertTo(d, And(indices, Set(du, 0xF)));
132146
}
133147

134148
// Requires >= 4 bf16 lanes for float32 Reverse2.
135149
template <class D, HWY_IF_BF16_D(D), HWY_IF_LANES_GT_D(D, 2)>
136-
Vec<D> operator()(D d, Vec<RebindToUnsigned<D>> indices) const {
150+
GENERATE_INLINE Vec<D> operator()(D d,
151+
Vec<RebindToUnsigned<D>> indices) const {
137152
const RebindToUnsigned<D> du;
138153
const RebindToSigned<D> di;
139154
const RepartitionToWide<decltype(di)> dw;
@@ -146,9 +161,10 @@ struct GenerateMod {
146161

147162
// For one or two lanes, we don't have OrderedDemote2To nor Reverse2.
148163
template <class D, HWY_IF_BF16_D(D), HWY_IF_LANES_LE_D(D, 2)>
149-
Vec<D> operator()(D d, Vec<RebindToUnsigned<D>> indices) const {
164+
GENERATE_INLINE Vec<D> operator()(D d,
165+
Vec<RebindToUnsigned<D>> indices) const {
150166
const Rebind<float, D> df;
151-
return DemoteTo(d, Set(df, GetLane(indices)));
167+
return DemoteTo(d, Set(df, static_cast<float>(GetLane(indices))));
152168
}
153169
};
154170

@@ -194,15 +210,19 @@ class TestMatVecAdd {
194210
for (size_t i = 0; i < kRows; ++i) {
195211
const double exp = ConvertScalarTo<double>(expected[i]);
196212
const double act = ConvertScalarTo<double>(actual[i]);
197-
const double tolerance =
198-
exp * 20 * 1.0 /
199-
(1ULL << HWY_MIN(MantissaBits<MatT>(), MantissaBits<VecT>()));
200-
if (!(exp - tolerance <= act && act <= exp + tolerance)) {
213+
const double epsilon =
214+
1.0 / (1ULL << HWY_MIN(MantissaBits<MatT>(), MantissaBits<VecT>()));
215+
// Absolute tolerance: 20x the relative precision `epsilon`, scaled by the
// expected value. Must multiply (not divide) by `epsilon`; dividing inflates
// the bound by 2^(2*mantissa_bits) and the check below can never fire.
const double tolerance = exp * 20.0 * epsilon;
216+
const double l1 = std::abs(exp - act);
217+
const double rel = exp == 0.0 ? 0.0 : l1 / exp;
218+
219+
if (l1 > tolerance && rel > epsilon) {
201220
fprintf(stderr,
202-
"%s/%s %zu x %zu, %s: mismatch at %zu %f %f; tol %f\n",
221+
"%s/%s %zu x %zu, %s: mismatch at %zu: %E != %E; "
222+
"tol %f l1 %f rel %E\n",
203223
TypeName(MatT(), 1).c_str(), TypeName(VecT(), 1).c_str(),
204224
kRows, kCols, (with_add ? "with add" : "without add"), i, exp,
205-
act, tolerance);
225+
act, tolerance, l1, rel);
206226
HWY_ASSERT(0);
207227
}
208228
}

hwy/contrib/thread_pool/topology_test.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,8 @@ static void CheckCache(const Cache& c, size_t level) {
116116
HWY_ASSERT(32 <= c.bytes_per_line && c.bytes_per_line <= 1024);
117117

118118
HWY_ASSERT(c.cores_sharing != 0);
119-
HWY_ASSERT(c.cores_sharing <= TotalLogicalProcessors());
119+
// +1 observed on RISC-V.
120+
HWY_ASSERT(c.cores_sharing <= TotalLogicalProcessors() + 1);
120121
}
121122

122123
TEST(TopologyTest, TestCaches) {

hwy/contrib/unroller/unroller_test.cc

Lines changed: 26 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
// See the License for the specific language governing permissions and
1515
// limitations under the License.
1616

17+
#include <cmath> // std::abs
1718
#include <vector>
1819

1920
#include "hwy/base.h"
@@ -33,31 +34,31 @@ namespace HWY_NAMESPACE {
3334
namespace {
3435

3536
template <typename T>
36-
T SimpleDot(const T* pa, const T* pb, size_t num) {
37-
T sum = 0;
37+
T DoubleDot(const T* pa, const T* pb, size_t num) {
38+
double sum = 0.0;
3839
for (size_t i = 0; i < num; ++i) {
3940
// For reasons unknown, fp16 += does not compile on clang (Arm).
40-
sum = ConvertScalarTo<T>(sum + pa[i] * pb[i]);
41+
sum += ConvertScalarTo<double>(pa[i]) * ConvertScalarTo<double>(pb[i]);
4142
}
42-
return sum;
43+
return ConvertScalarTo<T>(sum);
4344
}
4445

4546
template <typename T>
46-
T SimpleAcc(const T* pa, size_t num) {
47-
T sum = 0;
47+
T DoubleSum(const T* pa, size_t num) {
48+
double sum = 0.0;
4849
for (size_t i = 0; i < num; ++i) {
49-
sum += pa[i];
50+
sum += ConvertScalarTo<double>(pa[i]);
5051
}
51-
return sum;
52+
return ConvertScalarTo<T>(sum);
5253
}
5354

5455
template <typename T>
55-
T SimpleMin(const T* pa, size_t num) {
56-
T min = HighestValue<T>();
56+
T DoubleMin(const T* pa, size_t num) {
57+
double min = HighestValue<T>();
5758
for (size_t i = 0; i < num; ++i) {
58-
if (min > pa[i]) min = pa[i];
59+
min = HWY_MIN(min, ConvertScalarTo<double>(pa[i]));
5960
}
60-
return min;
61+
return ConvertScalarTo<T>(min);
6162
}
6263

6364
template <typename T>
@@ -370,30 +371,35 @@ struct TestDot {
370371
b[i] = ConvertScalarTo<T>(random_t());
371372
}
372373

373-
const T expected_dot = SimpleDot(a, b, num);
374+
const T expected_dot = DoubleDot(a, b, num);
375+
const double expected_dot_f64 = ConvertScalarTo<double>(expected_dot);
374376
MultiplyUnit<T> multfn;
375377
Unroller(multfn, a, b, y, static_cast<ptrdiff_t>(num));
376378
AccumulateUnit<T> accfn;
377379
T dot_via_mul_acc;
378380
Unroller(accfn, y, &dot_via_mul_acc, static_cast<ptrdiff_t>(num));
379381
const double tolerance = 120.0 *
380382
ConvertScalarTo<double>(hwy::Epsilon<T>()) *
381-
ScalarAbs(expected_dot);
382-
HWY_ASSERT(ScalarAbs(expected_dot - dot_via_mul_acc) < tolerance);
383+
std::abs(expected_dot_f64);
384+
HWY_ASSERT(std::abs(expected_dot_f64 - ConvertScalarTo<double>(
385+
dot_via_mul_acc)) < tolerance);
383386

384387
DotUnit<T> dotfn;
385388
T dotr;
386389
Unroller(dotfn, a, b, &dotr, static_cast<ptrdiff_t>(num));
387-
HWY_ASSERT(ConvertScalarTo<double>(ScalarAbs((expected_dot - dotr))) <
388-
tolerance);
390+
const double dotr_f64 = ConvertScalarTo<double>(dotr);
391+
HWY_ASSERT(std::abs(expected_dot_f64 - dotr_f64) < tolerance);
389392

390-
auto expected_min = SimpleMin(a, num);
393+
const T expected_min = DoubleMin(a, num);
391394
MinUnit<T> minfn;
392395
T minr;
393396
Unroller(minfn, a, &minr, static_cast<ptrdiff_t>(num));
394397

395-
HWY_ASSERT(ConvertScalarTo<double>(ScalarAbs(expected_min - minr)) <
396-
1e-7);
398+
const double l1 = std::abs(ConvertScalarTo<double>(expected_min) -
399+
ConvertScalarTo<double>(minr));
400+
// Unlike above, tolerance is absolute, there should be no numerical
401+
// differences between T and double because we just compute the min.
402+
HWY_ASSERT(l1 < 1E-7);
397403
}
398404
#endif
399405
}

hwy/perf_counters_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ TEST(PerfCountersTest, RunBranches) {
139139

140140
HWY_ASSERT(values[PerfCounters::kL3Loads] < 1E8); // 174K..12M
141141
HWY_ASSERT(values[PerfCounters::kL3Stores] < 1E7); // 44K..1.8M
142-
HWY_ASSERT(values[PerfCounters::kCacheRefs] < 1E8); // 5M..27M
142+
HWY_ASSERT(values[PerfCounters::kCacheRefs] < 1E9); // 5M..104M
143143
HWY_ASSERT(values[PerfCounters::kCacheMisses] < 1E8); // 500K..10M
144144
HWY_ASSERT(values[PerfCounters::kBusCycles] < 1E11); // 1M..10B
145145
HWY_ASSERT(values[PerfCounters::kPageFaults] < 1E4); // 0..1.1K (in SDE)

hwy/tests/float_test.cc

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -144,8 +144,9 @@ struct TestApproximateReciprocal {
144144
double worst_expected = 0.0;
145145
double worst_actual = 0.0;
146146
for (size_t i = 0; i < N; ++i) {
147-
const double expected = 1.0 / input[i];
148-
const double l1 = ScalarAbs(expected - actual[i]);
147+
const double expected = 1.0 / ConvertScalarTo<double>(input[i]);
148+
const double l1 =
149+
ScalarAbs(expected - ConvertScalarTo<double>(actual[i]));
149150
if (l1 > max_l1) {
150151
max_l1 = l1;
151152
worst_expected = expected;
@@ -187,11 +188,12 @@ struct TestMaskedApproximateReciprocal {
187188
double expected;
188189
for (size_t i = 0; i < N; ++i) {
189190
if (i < 3) {
190-
expected = 1.0 / input[i];
191+
expected = 1.0 / ConvertScalarTo<double>(input[i]);
191192
} else {
192193
expected = 0.0;
193194
}
194-
const double l1 = ScalarAbs(expected - actual[i]);
195+
const double l1 =
196+
ScalarAbs(expected - ConvertScalarTo<double>(actual[i]));
195197
if (l1 > max_l1) {
196198
max_l1 = l1;
197199
worst_expected = expected;
@@ -254,7 +256,9 @@ struct TestReciprocalSquareRoot {
254256
Store(ApproximateReciprocalSqrt(v), d, lanes.get());
255257
for (size_t i = 0; i < N; ++i) {
256258
T err = ConvertScalarTo<T>(ConvertScalarTo<float>(lanes[i]) - 0.090166f);
257-
if (err < ConvertScalarTo<T>(0)) err = -err;
259+
if (err < ConvertScalarTo<T>(0)) {
260+
err = ConvertScalarTo<T>(-ConvertScalarTo<float>(err));
261+
}
258262
if (static_cast<double>(err) >= 4E-4) {
259263
HWY_ABORT("Lane %d (%d): actual %f err %f\n", static_cast<int>(i),
260264
static_cast<int>(N), static_cast<double>(lanes[i]),

hwy/tests/swizzle_test.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,11 @@ struct TestInsertLane {
258258
DoTestInsertLaneWithConstAmt(d, lanes.get());
259259
#endif
260260

261+
// TODO(janwas): file compiler bug report
262+
#if HWY_COMPILER_CLANG && (HWY_COMPILER_CLANG < 2000) && HWY_ARCH_ARM
263+
if (IsSpecialFloat<T>()) return;
264+
#endif
265+
261266
V v2 = Zero(d);
262267
for (size_t i = 0; i < N; ++i) {
263268
lanes[i] = ConvertScalarTo<T>(i + 1);

0 commit comments

Comments
 (0)