forked from WojciechMula/sse-popcount
-
Notifications
You must be signed in to change notification settings - Fork 0
/
popcnt-bit-parallel-scalar32.cpp
68 lines (46 loc) · 1.78 KB
/
popcnt-bit-parallel-scalar32.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
std::uint64_t popcnt_parallel_32bit_naive(const uint8_t* data, const size_t n) {
uint32_t result = 0;
size_t i = 0;
#define ITER { \
const uint32_t t1 = *reinterpret_cast<const uint32_t*>(data + i); \
const uint32_t t2 = (t1 & 0x55555555llu) + ((t1 >> 1) & 0x55555555llu); \
const uint32_t t3 = (t2 & 0x33333333llu) + ((t2 >> 2) & 0x33333333llu); \
const uint32_t t4 = (t3 & 0x0f0f0f0fllu) + ((t3 >> 4) & 0x0f0f0f0fllu); \
const uint32_t t5 = (t4 & 0x00ff00ffllu) + ((t4 >> 8) & 0x00ff00ffllu); \
const uint32_t t6 = (t5 & 0x0000ffffllu) + ((t5 >> 16) & 0x0000ffffllu); \
result += t6; \
i += 4; \
}
while (i + 4*4 <= n) {
ITER ITER ITER ITER
}
#undef ITER
for (/**/; i < n; i++) {
result += lookup8bit[data[i]];
}
return result;
}
std::uint64_t popcnt_parallel_32bit_optimized(const uint8_t* data, const size_t n) {
uint32_t result = 0;
size_t i = 0;
while (i + 4*4 <= n) {
uint32_t partial = 0; // packed_byte
#define ITER { \
const uint32_t t1 = *reinterpret_cast<const uint32_t*>(data + i); \
const uint32_t t2 = (t1 & 0x55555555llu) + ((t1 >> 1) & 0x55555555llu); \
const uint32_t t3 = (t2 & 0x33333333llu) + ((t2 >> 2) & 0x33333333llu); \
const uint32_t t4 = (t3 & 0x0f0f0f0fllu) + ((t3 >> 4) & 0x0f0f0f0fllu); \
partial += t4; \
i += 4; \
}
ITER ITER ITER ITER
#undef ITER
const uint32_t t5 = (partial & 0x00ff00ffllu) + ((partial >> 8) & 0x00ff00ffllu);
const uint32_t t6 = (t5 & 0x0000ffffllu) + ((t5 >> 16) & 0x0000ffffllu);
result += t6;
}
for (/**/; i < n; i++) {
result += lookup8bit[data[i]];
}
return result;
}