Skip to content

Commit a638f26

Browse files
committed
Optimizations for Armv8-A
These changes apply only to the AArch64 execution state.
1 parent f8d0513 commit a638f26

File tree

1 file changed

+143
-2
lines changed

1 file changed

+143
-2
lines changed

picohttpparser.c

Lines changed: 143 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@
3434
#include <x86intrin.h>
3535
#endif
3636
#endif
37+
#ifdef __ARM_FEATURE_SVE
38+
#include <arm_sve.h>
39+
#endif
40+
#ifdef __ARM_NEON
41+
#include <arm_neon.h>
42+
#endif
3743
#include "picohttpparser.h"
3844

3945
#if __GNUC__ >= 3
@@ -71,9 +77,8 @@
7177
#define ADVANCE_TOKEN(tok, toklen) \
7278
do { \
7379
const char *tok_start = buf; \
74-
static const char ALIGNED(16) ranges2[16] = "\000\040\177\177"; \
7580
int found2; \
76-
buf = findchar_fast(buf, buf_end, ranges2, 4, &found2); \
81+
buf = findchar_nonprintable_fast(buf, buf_end, &found2); \
7782
if (!found2) { \
7883
CHECK_EOF(); \
7984
} \
@@ -131,6 +136,67 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const cha
131136
return buf;
132137
}
133138

139+
static const char *findchar_nonprintable_fast(const char *buf, const char *buf_end, int *found)
140+
{
141+
#ifdef __ARM_FEATURE_SVE
142+
*found = 0;
143+
144+
for (uint64_t i = 0;; i = svqincb(i, 1)) {
145+
const uint64_t len = buf_end - buf;
146+
const svbool_t pg = svwhilelt_b8(i, len);
147+
148+
if (!svptest_first(svptrue_b8(), pg)) {
149+
buf = buf_end;
150+
break;
151+
}
152+
153+
const svuint8_t v = svld1(pg, (const uint8_t *)buf + i);
154+
svbool_t c = svcmplt(pg, v, '\041');
155+
156+
c = svorr_z(pg, c, svcmpeq(pg, v, '\177'));
157+
158+
if (svptest_any(pg, c)) {
159+
*found = 1;
160+
c = svbrkb_z(pg, c);
161+
buf += i + svcntp_b8(pg, c);
162+
break;
163+
}
164+
}
165+
166+
return buf;
167+
#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
168+
*found = 0;
169+
170+
const size_t block_size = sizeof(uint8x16_t) - 1;
171+
const char *const end = (size_t)(buf_end - buf) >= block_size ? buf_end - block_size : buf;
172+
173+
for (; buf < end; buf += sizeof(uint8x16_t)) {
174+
uint8x16_t v = vld1q_u8((const uint8_t *)buf);
175+
176+
v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')), vceqq_u8(v, vmovq_n_u8('\177')));
177+
178+
/* Pack the comparison result into 64 bits. */
179+
const uint8x8_t rv = vshrn_n_u16(vreinterpretq_u16_u8(v), 4);
180+
uint64_t offset = vget_lane_u64(vreinterpret_u64_u8(rv), 0);
181+
182+
if (offset) {
183+
*found = 1;
184+
__asm__("rbit %x0, %x0" : "+r"(offset));
185+
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "Need the number of leading 0-bits in uint64_t.");
186+
/* offset uses 4 bits per byte of input. */
187+
buf += __builtin_clzll(offset) / 4;
188+
break;
189+
}
190+
}
191+
192+
return buf;
193+
#else
194+
static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";
195+
196+
return findchar_fast(buf, buf_end, ranges2, 4, found);
197+
#endif
198+
}
199+
134200
static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret)
135201
{
136202
const char *token_start = buf;
@@ -143,6 +209,81 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const
143209
buf = findchar_fast(buf, buf_end, ranges1, 6, &found);
144210
if (found)
145211
goto FOUND_CTL;
212+
#elif defined(__ARM_FEATURE_SVE)
213+
for (uint64_t i = 0;; i = svqincb(i, 1)) {
214+
const uint64_t len = buf_end - buf;
215+
const svbool_t pg = svwhilelt_b8(i, len);
216+
217+
if (!svptest_first(svptrue_b8(), pg)) {
218+
buf = buf_end;
219+
break;
220+
}
221+
222+
const svuint8_t v = svld1(pg, (const uint8_t *)buf + i);
223+
const uint8_t space = '\040';
224+
svbool_t c = svcmpge(pg, svsub_x(pg, v, space), 0137u);
225+
226+
if (svptest_any(pg, c)) {
227+
c = svcmplt(pg, v, space);
228+
c = svcmpne(c, v, '\011');
229+
c = svorr_z(pg, c, svcmpeq(pg, v, '\177'));
230+
231+
if (svptest_any(pg, c)) {
232+
c = svbrkb_z(pg, c);
233+
buf += i + svcntp_b8(pg, c);
234+
goto FOUND_CTL;
235+
}
236+
}
237+
}
238+
#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
239+
const size_t block_size = 2 * sizeof(uint8x16_t) - 1;
240+
const char *const end = (size_t)(buf_end - buf) >= block_size ? buf_end - block_size : buf;
241+
242+
for (; buf < end; buf += 2 * sizeof(uint8x16_t)) {
243+
const uint8x16_t space = vmovq_n_u8('\040');
244+
const uint8x16_t threshold = vmovq_n_u8(0137u);
245+
const uint8x16_t v1 = vld1q_u8((const uint8_t *)buf);
246+
const uint8x16_t v2 = vld1q_u8((const uint8_t *)buf + sizeof(v1));
247+
uint8x16_t v3 = vsubq_u8(v1, space);
248+
uint8x16_t v4 = vsubq_u8(v2, space);
249+
250+
v3 = vcgeq_u8(v3, threshold);
251+
v4 = vcgeq_u8(v4, threshold);
252+
v3 = vorrq_u8(v3, v4);
253+
/* Pack the comparison result into half a vector, i.e. 64 bits. */
254+
v3 = vpmaxq_u8(v3, v3);
255+
256+
if (vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0)) {
257+
const uint8x16_t del = vmovq_n_u8('\177');
258+
/* This mask makes it possible to pack the comparison results into half a vector,
259+
* which has the same size as uint64_t. */
260+
const uint8x16_t mask = vreinterpretq_u8_u32(vmovq_n_u32(0x40100401));
261+
const uint8x16_t tab = vmovq_n_u8('\011');
262+
263+
v3 = vcltq_u8(v1, space);
264+
v4 = vcltq_u8(v2, space);
265+
v3 = vbicq_u8(v3, vceqq_u8(v1, tab));
266+
v4 = vbicq_u8(v4, vceqq_u8(v2, tab));
267+
v3 = vorrq_u8(v3, vceqq_u8(v1, del));
268+
v4 = vorrq_u8(v4, vceqq_u8(v2, del));
269+
/* After masking, four consecutive bytes in the results do not have the same bits set. */
270+
v3 = vandq_u8(v3, mask);
271+
v4 = vandq_u8(v4, mask);
272+
/* Pack the comparison results into 128, and then 64 bits. */
273+
v3 = vpaddq_u8(v3, v4);
274+
v3 = vpaddq_u8(v3, v3);
275+
276+
uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0);
277+
278+
if (offset) {
279+
__asm__("rbit %x0, %x0" : "+r"(offset));
280+
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "Need the number of leading 0-bits in uint64_t.");
281+
/* offset uses 2 bits per byte of input. */
282+
buf += __builtin_clzll(offset) / 2;
283+
goto FOUND_CTL;
284+
}
285+
}
286+
}
146287
#else
147288
/* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
148289
while (likely(buf_end - buf >= 8)) {

0 commit comments

Comments
 (0)