3434#include <x86intrin.h>
3535#endif
3636#endif
37+ #ifdef __ARM_FEATURE_SVE
38+ #include <arm_sve.h>
39+ #endif
40+ #ifdef __ARM_NEON
41+ #include <arm_neon.h>
42+ #endif
3743#include "picohttpparser.h"
3844
3945#if __GNUC__ >= 3
7177#define ADVANCE_TOKEN (tok , toklen ) \
7278 do { \
7379 const char *tok_start = buf; \
74- static const char ALIGNED(16) ranges2[16] = "\000\040\177\177"; \
7580 int found2; \
76- buf = findchar_fast (buf, buf_end, ranges2, 4, &found2); \
81+ buf = findchar_nonprintable_fast (buf, buf_end, &found2); \
7782 if (!found2) { \
7883 CHECK_EOF(); \
7984 } \
@@ -131,6 +136,69 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const cha
131136 return buf ;
132137}
133138
139+ static const char * findchar_nonprintable_fast (const char * buf , const char * buf_end , int * found )
140+ {
141+ #ifdef __ARM_FEATURE_SVE
142+ * found = 0 ;
143+
144+ for (uint64_t i = 0 ;; i = svqincb (i , 1 )) {
145+ const uint64_t len = buf_end - buf ;
146+ const svbool_t pg = svwhilelt_b8 (i , len );
147+
148+ if (!svptest_first (svptrue_b8 (), pg )) {
149+ buf = buf_end ;
150+ break ;
151+ }
152+
153+ const svuint8_t v = svld1 (pg , (const uint8_t * ) buf + i );
154+ const svbool_t c = svorr_z (pg , svcmplt (pg , v , '\041' ), svcmpeq (pg , v , '\177' ));
155+
156+ if (svptest_any (pg , c )) {
157+ * found = 1 ;
158+ buf += i + svcntp_b8 (pg , svbrkb_z (pg , c ));
159+ break ;
160+ }
161+ }
162+
163+ return buf ;
164+ #elif defined(__ARM_NEON ) && defined(__ARM_64BIT_STATE )
165+ * found = 0 ;
166+
167+ const size_t block_size = sizeof (uint8x16_t ) - 1 ;
168+ const char * const end = (size_t ) (buf_end - buf ) >= block_size ? buf_end - block_size : buf ;
169+
170+ for (; buf < end ; buf += sizeof (uint8x16_t )) {
171+ // This mask makes it possible to pack the comparison result into half a vector,
172+ // which has the same size as uint64_t.
173+ const uint16x8_t mask = vmovq_n_u16 (0x0f00 );
174+ uint8x16_t v = vld1q_u8 ((const uint8_t * ) buf );
175+
176+ v = vorrq_u8 (vcltq_u8 (v , vmovq_n_u8 ('\041' )), vceqq_u8 (v , vmovq_n_u8 ('\177' )));
177+ v = vreinterpretq_u8_u16 (vbicq_u16 (vreinterpretq_u16_u8 (v ), mask ));
178+ // Pack the comparison result into 64 bits.
179+ v = vpmaxq_u8 (v , v );
180+
181+ uint64_t offset = vgetq_lane_u64 (vreinterpretq_u64_u8 (v ), 0 );
182+
183+ if (offset ) {
184+ * found = 1 ;
185+ __asm__ ("rbit %x0, %x0" : "+r" (offset ));
186+ static_assert (sizeof (unsigned long long ) == sizeof (uint64_t ),
187+ "Need the number of leading 0-bits in uint64_t." );
188+ // offset uses 4 bits per byte of input.
189+ buf += __builtin_clzll (offset ) / 4 ;
190+ break ;
191+ }
192+ }
193+
194+ return buf ;
195+ #else
196+ static const char ALIGNED (16 ) ranges2 [16 ] = "\000\040\177\177" ;
197+
198+ return findchar_fast (buf , buf_end , ranges2 , 4 , found );
199+ #endif
200+ }
201+
134202static const char * get_token_to_eol (const char * buf , const char * buf_end , const char * * token , size_t * token_len , int * ret )
135203{
136204 const char * token_start = buf ;
@@ -143,6 +211,76 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const
143211 buf = findchar_fast (buf , buf_end , ranges1 , 6 , & found );
144212 if (found )
145213 goto FOUND_CTL ;
214+ #elif defined(__ARM_FEATURE_SVE )
215+ for (uint64_t i = 0 ;; i = svqincb (i , 1 )) {
216+ const uint64_t len = buf_end - buf ;
217+ const svbool_t pg = svwhilelt_b8 (i , len );
218+
219+ if (!svptest_first (svptrue_b8 (), pg )) {
220+ buf = buf_end ;
221+ break ;
222+ }
223+
224+ const svuint8_t v = svld1 (pg , (const uint8_t * ) buf + i );
225+ const uint8_t space = '\040' ;
226+
227+ if (svptest_any (pg , svcmpge (pg , svsub_x (pg , v , space ), 0137u ))) {
228+ svbool_t c = svcmpne (svcmplt (pg , v , space ), v , '\011' );
229+
230+ c = svorr_z (pg , c , svcmpeq (pg , v , '\177' ));
231+
232+ if (svptest_any (pg , c )) {
233+ buf += i + svcntp_b8 (pg , svbrkb_z (pg , c ));
234+ goto FOUND_CTL ;
235+ }
236+ }
237+ }
238+ #elif defined(__ARM_NEON ) && defined(__ARM_64BIT_STATE )
239+ const size_t block_size = 2 * sizeof (uint8x16_t ) - 1 ;
240+ const char * const end = (size_t ) (buf_end - buf ) >= block_size ? buf_end - block_size : buf ;
241+
242+ for (; buf < end ; buf += 2 * sizeof (uint8x16_t )) {
243+ const uint8x16_t space = vmovq_n_u8 ('\040' );
244+ const uint8x16_t threshold = vmovq_n_u8 (0137u );
245+ const uint8x16_t v1 = vld1q_u8 ((const uint8_t * ) buf );
246+ const uint8x16_t v2 = vld1q_u8 ((const uint8_t * ) buf + sizeof (v1 ));
247+ uint8x16_t v3 = vcgeq_u8 (vsubq_u8 (v1 , space ), threshold );
248+ uint8x16_t v4 = vcgeq_u8 (vsubq_u8 (v2 , space ), threshold );
249+
250+ v3 = vorrq_u8 (v3 , v4 );
251+ // Pack the comparison result into half a vector, i.e. 64 bits.
252+ v3 = vpmaxq_u8 (v3 , v3 );
253+
254+ if (vgetq_lane_u64 (vreinterpretq_u64_u8 (v3 ), 0 )) {
255+ const uint8x16_t del = vmovq_n_u8 ('\177' );
256+ // This mask makes it possible to pack the comparison results into half a vector,
257+ // which has the same size as uint64_t.
258+ const uint8x16_t mask = vreinterpretq_u8_u32 (vmovq_n_u32 (0x40100401 ));
259+ const uint8x16_t tab = vmovq_n_u8 ('\011' );
260+
261+ v3 = vbicq_u8 (vcltq_u8 (v1 , space ), vceqq_u8 (v1 , tab ));
262+ v4 = vbicq_u8 (vcltq_u8 (v2 , space ), vceqq_u8 (v2 , tab ));
263+ v3 = vorrq_u8 (v3 , vceqq_u8 (v1 , del ));
264+ v4 = vorrq_u8 (v4 , vceqq_u8 (v2 , del ));
265+ // After masking, four consecutive bytes in the results do not have the same bits set.
266+ v3 = vandq_u8 (v3 , mask );
267+ v4 = vandq_u8 (v4 , mask );
268+ // Pack the comparison results into 128, and then 64 bits.
269+ v3 = vpaddq_u8 (v3 , v4 );
270+ v3 = vpaddq_u8 (v3 , v3 );
271+
272+ uint64_t offset = vgetq_lane_u64 (vreinterpretq_u64_u8 (v3 ), 0 );
273+
274+ if (offset ) {
275+ __asm__ ("rbit %x0, %x0" : "+r" (offset ));
276+ static_assert (sizeof (unsigned long long ) == sizeof (uint64_t ),
277+ "Need the number of leading 0-bits in uint64_t." );
278+ // offset uses 2 bits per byte of input.
279+ buf += __builtin_clzll (offset ) / 2 ;
280+ goto FOUND_CTL ;
281+ }
282+ }
283+ }
146284#else
147285 /* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
148286 while (likely (buf_end - buf >= 8 )) {
0 commit comments