Skip to content

Commit c3967fc

Browse files
committed
Optimizations for Armv8-A
These changes apply only to the AArch64 execution state. They also add arm64 testing with Travis CI.
1 parent 066d2b1 commit c3967fc

File tree

2 files changed

+139
-2
lines changed

2 files changed

+139
-2
lines changed

.travis.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
11
language: c
2+
arch:
3+
- amd64
4+
- arm64
25
compiler:
36
- gcc
47
- clang

picohttpparser.c

Lines changed: 136 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@
3434
#include <x86intrin.h>
3535
#endif
3636
#endif
37+
#ifdef __ARM_FEATURE_SVE
38+
#include <arm_sve.h>
39+
#endif
40+
#ifdef __ARM_NEON
41+
#include <arm_neon.h>
42+
#endif
3743
#include "picohttpparser.h"
3844

3945
#if __GNUC__ >= 3
@@ -71,9 +77,8 @@
7177
#define ADVANCE_TOKEN(tok, toklen) \
7278
do { \
7379
const char *tok_start = buf; \
74-
static const char ALIGNED(16) ranges2[16] = "\000\040\177\177"; \
7580
int found2; \
76-
buf = findchar_fast(buf, buf_end, ranges2, 4, &found2); \
81+
buf = findchar_nonprintable_fast(buf, buf_end, &found2); \
7782
if (!found2) { \
7883
CHECK_EOF(); \
7984
} \
@@ -131,6 +136,65 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const cha
131136
return buf;
132137
}
133138

139+
/* Scans [buf, buf_end) for the first "non-printable" byte, i.e. any byte in
 * the ranges [0x00, 0x20] or {0x7f} — the same set encoded by the scalar
 * fallback's ranges2 table "\000\040\177\177".
 *
 * Returns a pointer to the first matching byte and sets *found = 1; if no
 * match is located in the region this implementation scans, returns where the
 * scan stopped with *found = 0 (the NEON path may stop before buf_end; the
 * caller is expected to finish the tail with scalar code).
 */
static const char *findchar_nonprintable_fast(const char *buf, const char *buf_end, int *found)
{
#ifdef __ARM_FEATURE_SVE
    /* Arm Scalable Vector Extension path: a predicated loop that needs no
     * scalar tail — svwhilelt_b8 produces a partial predicate for the final,
     * short iteration. */
    *found = 0;

    /* i advances by the hardware vector length (in bytes) each iteration. */
    for (uint64_t i = 0;; i = svqincb(i, 1)) {
        const uint64_t len = buf_end - buf;
        /* Predicate is true for lanes i, i+1, ... while they are < len. */
        const svbool_t pg = svwhilelt_b8(i, len);

        /* First lane inactive => i >= len: the whole input was scanned. */
        if (!svptest_first(svptrue_b8(), pg)) {
            buf = buf_end;
            break;
        }

        const svuint8_t v = svld1(pg, (const uint8_t *) buf + i);
        /* Match bytes <= 0x20 ('\041' is '!', the first printable we accept)
         * or equal to DEL (0x7f). */
        const svbool_t c = svorr_z(pg, svcmplt(pg, v, '\041'), svcmpeq(pg, v, '\177'));

        if (svptest_any(pg, c)) {
            *found = 1;
            /* svbrkb_z keeps only the lanes BEFORE the first match, so the
             * active-lane count is the match's offset within this vector. */
            buf += i + svcntp_b8(pg, svbrkb_z(pg, c));
            break;
        }
    }

    return buf;
#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
    /* AArch64 NEON path: processes 16 bytes per iteration; stops once fewer
     * than 16 bytes remain (tail left to the caller), hence *found may stay 0
     * with buf short of buf_end. */
    *found = 0;

    const size_t block_size = sizeof(uint8x16_t) - 1;
    /* Last position from which a full 16-byte load stays in bounds. */
    const char * const end = (size_t) (buf_end - buf) >= block_size ? buf_end - block_size : buf;

    for (; buf < end; buf += sizeof(uint8x16_t)) {
        uint8x16_t v = vld1q_u8((const uint8_t *) buf);

        /* Per-byte 0xff/0x00 mask: byte <= 0x20 or byte == 0x7f. */
        v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')), vceqq_u8(v, vmovq_n_u8('\177')));

        // Pack the comparison result into 64 bits.
        /* vshrn narrows each 16-bit lane (>> 4), yielding 4 mask bits per
         * input byte — a cheap substitute for x86's movemask. */
        const uint8x8_t rv = vshrn_n_u16(vreinterpretq_u16_u8(v), 4);
        uint64_t offset = vget_lane_u64(vreinterpret_u64_u8(rv), 0);

        if (offset) {
            *found = 1;
            /* rbit + clz == count-trailing-zeros: index of the lowest set
             * bit, i.e. the first matching byte. */
            __asm__ ("rbit %x0, %x0" : "+r" (offset));
            static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
                          "Need the number of leading 0-bits in uint64_t.");
            // offset uses 4 bits per byte of input.
            buf += __builtin_clzll(offset) / 4;
            break;
        }
    }

    return buf;
#else
    /* Portable fallback: delegate to the generic range scanner with the
     * equivalent [0x00-0x20], [0x7f-0x7f] range table. */
    static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";

    return findchar_fast(buf, buf_end, ranges2, 4, found);
#endif
}
197+
134198
static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret)
135199
{
136200
const char *token_start = buf;
@@ -143,6 +207,76 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const
143207
buf = findchar_fast(buf, buf_end, ranges1, 6, &found);
144208
if (found)
145209
goto FOUND_CTL;
210+
#elif defined(__ARM_FEATURE_SVE)
211+
for (uint64_t i = 0;; i = svqincb(i, 1)) {
212+
const uint64_t len = buf_end - buf;
213+
const svbool_t pg = svwhilelt_b8(i, len);
214+
215+
if (!svptest_first(svptrue_b8(), pg)) {
216+
buf = buf_end;
217+
break;
218+
}
219+
220+
const svuint8_t v = svld1(pg, (const uint8_t *) buf + i);
221+
const uint8_t space = '\040';
222+
223+
if (svptest_any(pg, svcmpge(pg, svsub_x(pg, v, space), 0137u))) {
224+
svbool_t c = svcmpne(svcmplt(pg, v, space), v, '\011');
225+
226+
c = svorr_z(pg, c, svcmpeq(pg, v, '\177'));
227+
228+
if (svptest_any(pg, c)) {
229+
buf += i + svcntp_b8(pg, svbrkb_z(pg, c));
230+
goto FOUND_CTL;
231+
}
232+
}
233+
}
234+
#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
235+
const size_t block_size = 2 * sizeof(uint8x16_t) - 1;
236+
const char * const end = (size_t) (buf_end - buf) >= block_size ? buf_end - block_size : buf;
237+
238+
for (; buf < end; buf += 2 * sizeof(uint8x16_t)) {
239+
const uint8x16_t space = vmovq_n_u8('\040');
240+
const uint8x16_t threshold = vmovq_n_u8(0137u);
241+
const uint8x16_t v1 = vld1q_u8((const uint8_t *) buf);
242+
const uint8x16_t v2 = vld1q_u8((const uint8_t *) buf + sizeof(v1));
243+
uint8x16_t v3 = vcgeq_u8(vsubq_u8(v1, space), threshold);
244+
uint8x16_t v4 = vcgeq_u8(vsubq_u8(v2, space), threshold);
245+
246+
v3 = vorrq_u8(v3, v4);
247+
// Pack the comparison result into half a vector, i.e. 64 bits.
248+
v3 = vpmaxq_u8(v3, v3);
249+
250+
if (vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0)) {
251+
const uint8x16_t del = vmovq_n_u8('\177');
252+
// This mask makes it possible to pack the comparison results into half a vector,
253+
// which has the same size as uint64_t.
254+
const uint8x16_t mask = vreinterpretq_u8_u32(vmovq_n_u32(0x40100401));
255+
const uint8x16_t tab = vmovq_n_u8('\011');
256+
257+
v3 = vbicq_u8(vcltq_u8(v1, space), vceqq_u8(v1, tab));
258+
v4 = vbicq_u8(vcltq_u8(v2, space), vceqq_u8(v2, tab));
259+
v3 = vorrq_u8(v3, vceqq_u8(v1, del));
260+
v4 = vorrq_u8(v4, vceqq_u8(v2, del));
261+
// After masking, four consecutive bytes in the results do not have the same bits set.
262+
v3 = vandq_u8(v3, mask);
263+
v4 = vandq_u8(v4, mask);
264+
// Pack the comparison results into 128, and then 64 bits.
265+
v3 = vpaddq_u8(v3, v4);
266+
v3 = vpaddq_u8(v3, v3);
267+
268+
uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0);
269+
270+
if (offset) {
271+
__asm__ ("rbit %x0, %x0" : "+r" (offset));
272+
static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
273+
"Need the number of leading 0-bits in uint64_t.");
274+
// offset uses 2 bits per byte of input.
275+
buf += __builtin_clzll(offset) / 2;
276+
goto FOUND_CTL;
277+
}
278+
}
279+
}
146280
#else
147281
/* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
148282
while (likely(buf_end - buf >= 8)) {

0 commit comments

Comments (0)