Commit c219556

Optimizations for Armv8-A

These changes apply only to the AArch64 execution state.

1 parent 81fe3d9


picohttpparser.c

Lines changed: 90 additions & 2 deletions
@@ -34,6 +34,9 @@
 #include <x86intrin.h>
 #endif
 #endif
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
 #include "picohttpparser.h"
 
 #if __GNUC__ >= 3
@@ -71,9 +74,8 @@
 #define ADVANCE_TOKEN(tok, toklen) \
     do { \
         const char *tok_start = buf; \
-        static const char ALIGNED(16) ranges2[16] = "\000\040\177\177"; \
         int found2; \
-        buf = findchar_fast(buf, buf_end, ranges2, 4, &found2); \
+        buf = findchar_nonprintable_fast(buf, buf_end, &found2); \
         if (!found2) { \
             CHECK_EOF(); \
         } \
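
A note on what the new call preserves: `ranges2` feeds findchar_fast, whose SSE4.2 path treats consecutive bytes of the table as inclusive (lo, hi) pairs, with the accompanying count (4 here) giving the number of significant bytes. A minimal scalar sketch of that interpretation, with a hypothetical helper name not taken from the commit:

#include <assert.h>
#include <stddef.h>

/* Scalar model of a (lo, hi)-pair ranges table, as read by the SSE4.2 path. */
static int in_ranges(unsigned char c, const char *ranges, size_t n)
{
    for (size_t i = 0; i < n; i += 2)
        if ((unsigned char)ranges[i] <= c && c <= (unsigned char)ranges[i + 1])
            return 1;
    return 0;
}

int main(void)
{
    /* "\000\040\177\177" covers [0x00, 0x20] plus {0x7F}: exactly the set the
     * NEON replacement tests with c < '\041' || c == '\177'. */
    for (unsigned c = 0; c < 256; c++)
        assert(in_ranges((unsigned char)c, "\000\040\177\177", 4) == (c <= 040 || c == 0177));
    return 0;
}

Since both predicates stop on the same bytes, ADVANCE_TOKEN's behaviour is unchanged; only the scan strategy differs per architecture.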
@@ -131,6 +133,46 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const char *ranges, size_t ranges_size, int *found)
     return buf;
 }
 
+static const char *findchar_nonprintable_fast(const char *buf, const char *buf_end, int *found)
+{
+#if defined(__ARM_64BIT_STATE) && defined(__ARM_FEATURE_UNALIGNED) && !defined(__ARM_BIG_ENDIAN)
+    *found = 0;
+
+    for (size_t i = (buf_end - buf) / sizeof(uint8x16_t); i; i--) {
+        // This mask makes it possible to pack the comparison result into half a vector,
+        // which has the same size as uint64_t.
+        const uint8x16_t mask = vreinterpretq_u8_u16(vmovq_n_u16(0x8008));
+        uint8x16_t v = vld1q_u8((const uint8_t *) buf);
+
+        v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')), vceqq_u8(v, vmovq_n_u8('\177')));
+        // After masking, a byte in the result does not have the same bits set as any of its neighbours.
+        v = vandq_u8(v, mask);
+        // Pack the comparison result into 64 bits.
+        v = vpaddq_u8(v, v);
+
+        uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v), 0);
+
+        if (offset) {
+            *found = 1;
+            __asm__ ("rbit %x0, %x0" : "+r" (offset));
+            static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
+                          "Need the number of leading 0-bits in uint64_t.");
+            // offset uses 4 bits per byte of input.
+            buf += __builtin_clzll(offset) / 4;
+            break;
+        }
+
+        buf += sizeof(v);
+    }
+
+    return buf;
+#else
+    static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";
+
+    return findchar_fast(buf, buf_end, ranges2, 4, found);
+#endif
+}
+
 static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret)
 {
     const char *token_start = buf;
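
The offset arithmetic above is compact enough to deserve a gloss. After the AND with the alternating 0x08/0x80 mask and one pairwise add, nibble k of the 64-bit lane is 0x8 exactly when input byte k matched, so the first hit at byte k sets bit 4k + 3; rbit followed by clz is the usual AArch64 idiom for count-trailing-zeros, and dividing by 4 recovers k. A runnable scalar model of that packing, with hypothetical helper names not taken from the commit:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* The predicate the vcltq/vceqq pair implements: c <= 0x20, or DEL. */
static int is_nonprintable(unsigned char c)
{
    return c < '\041' || c == '\177';
}

/* Models vandq with the 0x8008 mask plus vpaddq: a hit at input byte k
 * lands as 0x8 in nibble k, i.e. bit 4k + 3 of the 64-bit lane. */
static uint64_t pack_matches(const unsigned char *p)
{
    uint64_t bits = 0;
    for (size_t k = 0; k < 16; k++)
        if (is_nonprintable(p[k]))
            bits |= (uint64_t)0x8 << (4 * k);
    return bits;
}

int main(void)
{
    const unsigned char chunk[16] = "GET /index.htm\r\n";
    uint64_t offset = pack_matches(chunk);

    /* First hit is ' ' at index 3: ctz = 4 * 3 + 3 = 15, and 15 / 4 = 3. */
    assert(__builtin_ctzll(offset) / 4 == 3); /* GCC/Clang builtin */
    return 0;
}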
@@ -143,6 +185,52 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret)
     buf = findchar_fast(buf, buf_end, ranges1, 6, &found);
     if (found)
         goto FOUND_CTL;
+#elif defined(__ARM_64BIT_STATE) && defined(__ARM_FEATURE_UNALIGNED) && !defined(__ARM_BIG_ENDIAN)
+    for (size_t i = (buf_end - buf) / (2 * sizeof(uint8x16_t)); i; i--) {
+        const uint8x16_t space = vmovq_n_u8('\040');
+        const uint8x16_t threshold = vmovq_n_u8(0137u);
+        const uint8x16_t v1 = vld1q_u8((const uint8_t *) buf);
+        const uint8x16_t v2 = vld1q_u8((const uint8_t *) buf + sizeof(v1));
+        uint8x16_t v3 = vcgeq_u8(vsubq_u8(v1, space), threshold);
+        uint8x16_t v4 = vcgeq_u8(vsubq_u8(v2, space), threshold);
+
+        v3 = vorrq_u8(v3, v4);
+        // Pack the comparison result into half a vector, i.e. 64 bits; the result will still be
+        // non-zero even if any adjacent bytes are the same (either 0 or 0xFF).
+        v3 = vpaddq_u8(v3, v3);
+
+        if (vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0)) {
+            const uint8x16_t del = vmovq_n_u8('\177');
+            // This mask makes it possible to pack the comparison results into half a vector,
+            // which has the same size as uint64_t.
+            const uint8x16_t mask = vreinterpretq_u8_u32(vmovq_n_u32(0x40100401));
+            const uint8x16_t tab = vmovq_n_u8('\011');
+
+            v3 = vbicq_u8(vcltq_u8(v1, space), vceqq_u8(v1, tab));
+            v4 = vbicq_u8(vcltq_u8(v2, space), vceqq_u8(v2, tab));
+            v3 = vorrq_u8(v3, vceqq_u8(v1, del));
+            v4 = vorrq_u8(v4, vceqq_u8(v2, del));
+            // After masking, four consecutive bytes in the results do not have the same bits set.
+            v3 = vandq_u8(v3, mask);
+            v4 = vandq_u8(v4, mask);
+            // Pack the comparison results into 128, and then 64 bits.
+            v3 = vpaddq_u8(v3, v4);
+            v3 = vpaddq_u8(v3, v3);
+
+            uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0);
+
+            if (offset) {
+                __asm__ ("rbit %x0, %x0" : "+r" (offset));
+                static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
+                              "Need the number of leading 0-bits in uint64_t.");
+                // offset uses 2 bits per byte of input.
+                buf += __builtin_clzll(offset) / 2;
+                goto FOUND_CTL;
+            }
+        }
+
+        buf += sizeof(v1) + sizeof(v2);
+    }
 #else
     /* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
     while (likely(buf_end - buf >= 8)) {
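
Two tricks in this hunk merit a gloss. The cheap pre-filter vcgeq_u8(vsubq_u8(v, space), threshold) relies on unsigned wraparound: (c - 0x20) >= 0x5F flags every byte outside 0x20..0x7E with a single subtract-and-compare, and the exact pass inside the branch then re-admits TAB and bytes 0x80 and above. A runnable scalar sketch of how the two predicates relate, with hypothetical function names:

#include <assert.h>

/* The vectorized pre-filter: c < 0x20 wraps above 0xE0, so one unsigned
 * compare catches control bytes and 0x7F..0xFF alike. */
static int coarse(unsigned char c)
{
    return (unsigned char)(c - '\040') >= 0137u;
}

/* The exact predicate built with vcltq/vceqq/vbicq/vorrq:
 * CTL characters except TAB, plus DEL. */
static int precise(unsigned char c)
{
    return (c < '\040' && c != '\011') || c == '\177';
}

int main(void)
{
    /* coarse() never misses a byte precise() would stop on ... */
    for (unsigned c = 0; c < 256; c++)
        if (precise((unsigned char)c))
            assert(coarse((unsigned char)c));
    /* ... but it over-approximates: TAB and high bytes also trip it,
     * which is why the exact pass runs only on coarse hits. */
    assert(coarse('\011') && coarse(0x80) && !precise('\011') && !precise(0x80));
    return 0;
}

The second trick doubles the density of the packing used in findchar_nonprintable_fast: the 0x40100401 mask gives each of four consecutive bytes its own 2-bit slot, so the two vpaddq folds leave input byte k at bit 2k of the 64-bit lane, and the rbit+clz count divided by 2 recovers k.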
