Skip to content

Commit edd3d56

Browse files
add intrinsic clz()
... and fix wrong datatype for CUDA/HIP `ffs()`
1 parent 66540b9 commit edd3d56

File tree

6 files changed

+180
-2
lines changed

6 files changed

+180
-2
lines changed

include/alpaka/api/syclGeneric/intrinsic.hpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,28 @@ namespace alpaka::internal::intrinsic
5757
ALPAKA_UNREACHABLE(int{});
5858
}
5959
};
60+
61+
template<typename T_Arg>
62+
struct Clz::Op<alpaka::internal::SyclIntrinsic, T_Arg>
63+
{
64+
constexpr auto operator()(alpaka::internal::SyclIntrinsic const, T_Arg const& val) const
65+
{
66+
if constexpr(sizeof(T_Arg) == 4u)
67+
{
68+
auto value = std::bit_cast<unsigned int>(val);
69+
return sycl::clz(value);
70+
}
71+
else if constexpr(sizeof(T_Arg) == 8u)
72+
{
73+
auto value = std::bit_cast<unsigned long long>(val);
74+
return sycl::clz(value);
75+
}
76+
else
77+
static_assert(!sizeof(T_Arg), "Unsupported data type, sizeof() must be 4 or 8");
78+
79+
ALPAKA_UNREACHABLE(int{});
80+
}
81+
};
6082
} // namespace alpaka::internal::intrinsic
6183

6284
#endif

include/alpaka/api/unifiedCudaHip/intrinsic.hpp

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,31 @@ namespace alpaka::internal::intrinsic
4343
{
4444
if constexpr(sizeof(T_Arg) == 4u)
4545
{
46-
return __ffs(std::bit_cast<unsigned int>(val));
46+
return __ffs(std::bit_cast<int>(val));
4747
}
4848
else if constexpr(sizeof(T_Arg) == 8u)
4949
{
50-
return __ffsll(std::bit_cast<unsigned long long>(val));
50+
return __ffsll(std::bit_cast<long long int>(val));
51+
}
52+
else
53+
static_assert(!sizeof(T_Arg), "Unsupported data type, sizeof() must be 4 or 8");
54+
55+
ALPAKA_UNREACHABLE(int{});
56+
}
57+
};
58+
59+
template<typename T_Arg>
60+
struct Clz::Op<alpaka::internal::CudaHipIntrinsic, T_Arg>
61+
{
62+
inline __device__ auto operator()(alpaka::internal::CudaHipIntrinsic const, T_Arg const& val) const
63+
{
64+
if constexpr(sizeof(T_Arg) == 4u)
65+
{
66+
return __clz(std::bit_cast<int>(val));
67+
}
68+
else if constexpr(sizeof(T_Arg) == 8u)
69+
{
70+
return __clzll(std::bit_cast<long long int>(val));
5171
}
5272
else
5373
static_assert(!sizeof(T_Arg), "Unsupported data type, sizeof() must be 4 or 8");

include/alpaka/internal/intrinsic.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,13 @@ namespace alpaka::internal::intrinsic
2525
int32_t operator()(T_IntrinsicImpl const, T_Arg const& val) const;
2626
};
2727
};
28+
29+
struct Clz
30+
{
31+
template<typename T_IntrinsicImpl, typename T_Arg>
32+
struct Op
33+
{
34+
int32_t operator()(T_IntrinsicImpl const, T_Arg const& val) const;
35+
};
36+
};
2837
} // namespace alpaka::internal::intrinsic

include/alpaka/internal/stlIntrinsic.hpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,4 +54,24 @@ namespace alpaka::internal::intrinsic
5454
ALPAKA_UNREACHABLE(int{});
5555
}
5656
};
57+
58+
template<typename T_Arg>
59+
struct Clz::Op<alpaka::internal::StlIntrinsic, T_Arg>
60+
{
61+
constexpr auto operator()(alpaka::internal::StlIntrinsic const, T_Arg const& val) const
62+
{
63+
if constexpr(sizeof(T_Arg) == 4u)
64+
{
65+
return std::countl_zero(std::bit_cast<unsigned int>(val));
66+
}
67+
else if constexpr(sizeof(T_Arg) == 8u)
68+
{
69+
return std::countl_zero(std::bit_cast<unsigned long long>(val));
70+
}
71+
else
72+
static_assert(!sizeof(T_Arg), "Unsupported data type, sizeof() must be 4 or 8");
73+
74+
ALPAKA_UNREACHABLE(int{});
75+
}
76+
};
5777
} // namespace alpaka::internal::intrinsic

include/alpaka/intrinsic.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,4 +33,15 @@ namespace alpaka
3333
constexpr auto intrinsicImpl = trait::getIntrinsicImpl(thisApi());
3434
return internal::intrinsic::Ffs::Op<ALPAKA_TYPEOF(intrinsicImpl), ALPAKA_TYPEOF(arg)>{}(intrinsicImpl, arg);
3535
}
36+
37+
/* Return the number of most significant zero bits
38+
*
39+
* @return number consecutive most significant zero bits, zero for input value 0.
40+
*/
41+
constexpr int32_t clz(auto const& arg)
42+
requires(sizeof(ALPAKA_TYPEOF(arg)) == 4u || sizeof(ALPAKA_TYPEOF(arg)) == 8u)
43+
{
44+
constexpr auto intrinsicImpl = alpaka::trait::getIntrinsicImpl(thisApi());
45+
return internal::intrinsic::Clz::Op<ALPAKA_TYPEOF(intrinsicImpl), ALPAKA_TYPEOF(arg)>{}(intrinsicImpl, arg);
46+
}
3647
} // namespace alpaka

tests/unit/intrinsic/clz.cpp

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
/*
2+
* Copyright 2025 The alpaka team
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <alpaka/alpaka.hpp>
18+
#include <alpaka/onHost/example/executors.hpp>
19+
#include <alpaka/onHost/executeForEach.hpp>
20+
21+
#include <catch2/catch_template_test_macros.hpp>
22+
23+
#include <vector>
24+
25+
using namespace alpaka;
26+
27+
using TestBackends
28+
= std::decay_t<decltype(onHost::allBackends(onHost::enabledApis, onHost::example::enabledExecutors))>;
29+
30+
struct PopcountKernel
31+
{
32+
template<typename TAcc>
33+
ALPAKA_FN_ACC void operator()(TAcc const& acc, concepts::IMdSpan auto output, concepts::IMdSpan auto const input)
34+
const
35+
{
36+
for(auto [index] : onAcc::makeIdxMap(acc, alpaka::onAcc::worker::threadsInGrid, IdxRange{input.getExtents()}))
37+
{
38+
output[index] = clz(input[index]);
39+
}
40+
}
41+
};
42+
43+
TEMPLATE_LIST_TEST_CASE("clz", "[intrinsic][clz]", TestBackends)
44+
{
45+
using Backend = TestType;
46+
auto cfg = Backend::makeDict();
47+
auto deviceSpec = cfg[object::deviceSpec];
48+
auto computeExec = cfg[object::exec];
49+
50+
// Select a device
51+
auto devSelector = alpaka::onHost::makeDeviceSelector(deviceSpec);
52+
if(!devSelector.isAvailable())
53+
{
54+
return;
55+
}
56+
alpaka::onHost::Device devAcc = devSelector.makeDevice(0);
57+
58+
// Create a queue on the device
59+
alpaka::onHost::Queue queue = devAcc.makeQueue();
60+
61+
// Input data
62+
std::vector<uint64_t> hostInput = {0, 1, 2, 1llu << 32, 1llu << 63};
63+
size_t const size = hostInput.size();
64+
65+
// Allocate device memory
66+
auto devInput = alpaka::onHost::alloc<uint64_t>(devAcc, hostInput.size());
67+
auto devOutput = alpaka::onHost::alloc<int>(devAcc, hostInput.size());
68+
69+
// Copy data from host to device
70+
alpaka::onHost::memcpy(queue, devInput, hostInput);
71+
72+
// Define execution parameters
73+
auto const frameSpec = alpaka::onHost::getFrameSpec<uint64_t>(devAcc, devInput.getExtents());
74+
75+
// Create kernel
76+
PopcountKernel kernel;
77+
auto const taskKernel = alpaka::KernelBundle{kernel, devOutput, devInput};
78+
79+
// Execute the kernel
80+
queue.enqueue(computeExec, frameSpec, taskKernel);
81+
82+
// Copy data from device to host
83+
std::vector<int> hostOutput(size);
84+
alpaka::onHost::memcpy(queue, hostOutput, devOutput);
85+
86+
// Wait for the queue to finish
87+
alpaka::onHost::wait(queue);
88+
89+
// Verification
90+
for(size_t i = 0; i < size; ++i)
91+
{
92+
auto val = hostInput[i];
93+
int expected = std::countl_zero(val);
94+
CHECK(hostOutput[i] == expected);
95+
}
96+
}

0 commit comments

Comments
 (0)