 #include <x86intrin.h>
 #endif
 #endif
+#ifdef __ARM_FEATURE_SVE
+#include <arm_sve.h>
+#endif
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
 #include "picohttpparser.h"

 #if __GNUC__ >= 3
 #define ADVANCE_TOKEN(tok, toklen) \
     do { \
         const char *tok_start = buf; \
-        static const char ALIGNED(16) ranges2[16] = "\000\040\177\177"; \
         int found2; \
-        buf = findchar_fast(buf, buf_end, ranges2, 4, &found2); \
+        buf = findchar_nonprintable_fast(buf, buf_end, &found2); \
         if (!found2) { \
             CHECK_EOF(); \
         } \
@@ -131,6 +136,65 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const cha
     return buf;
 }

+static const char *findchar_nonprintable_fast(const char *buf, const char *buf_end, int *found)
+{
+#ifdef __ARM_FEATURE_SVE
+    *found = 0;
+
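+    // A note on the SVE idiom used below: svwhilelt_b8(i, len) builds a predicate
+    // covering only the bytes still in range, so the final partial vector is handled
+    // by predication instead of a scalar tail loop, and svqincb(i, 1) advances i by
+    // the hardware vector length in bytes.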
+    for (uint64_t i = 0;; i = svqincb(i, 1)) {
+        const uint64_t len = buf_end - buf;
+        const svbool_t pg = svwhilelt_b8(i, len);
+
+        if (!svptest_first(svptrue_b8(), pg)) {
+            buf = buf_end;
+            break;
+        }
+
+        const svuint8_t v = svld1(pg, (const uint8_t *)buf + i);
+        const svbool_t c = svorr_z(pg, svcmplt(pg, v, '\041'), svcmpeq(pg, v, '\177'));
+
+        if (svptest_any(pg, c)) {
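+            // svbrkb_z keeps only the lanes before the first match, so svcntp_b8
+            // counts them, i.e. yields the offset of the first non-printable byte.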
+            *found = 1;
+            buf += i + svcntp_b8(pg, svbrkb_z(pg, c));
+            break;
+        }
+    }
+
+    return buf;
+#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
+    *found = 0;
+
+    const size_t block_size = sizeof(uint8x16_t) - 1;
+    const char *const end = (size_t)(buf_end - buf) >= block_size ? buf_end - block_size : buf;
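+    // The loop stops sizeof(uint8x16_t) - 1 bytes early so the full 16-byte load
+    // below never reads past buf_end; the leftover tail is left to the caller's
+    // byte-at-a-time path.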
+
+    for (; buf < end; buf += sizeof(uint8x16_t)) {
+        uint8x16_t v = vld1q_u8((const uint8_t *)buf);
+
+        v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')), vceqq_u8(v, vmovq_n_u8('\177')));
+
+        // Pack the comparison result into 64 bits.
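+        // NEON has no movemask instruction; shifting each 16-bit lane right by 4
+        // and narrowing yields 4 mask bits per input byte (0x0 or 0xF), in order.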
+        const uint8x8_t rv = vshrn_n_u16(vreinterpretq_u16_u8(v), 4);
+        uint64_t offset = vget_lane_u64(vreinterpret_u64_u8(rv), 0);
+
+        if (offset) {
+            *found = 1;
+            __asm__("rbit %x0, %x0" : "+r"(offset));
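+            // AArch64 has no count-trailing-zeros instruction, so rbit reverses the
+            // bit order and the leading-zero count below acts as a trailing-zero count.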
+            static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
+                          "Need the number of leading 0-bits in uint64_t.");
+            // offset uses 4 bits per byte of input.
+            buf += __builtin_clzll(offset) / 4;
+            break;
+        }
+    }
+
+    return buf;
+#else
+    static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";
+
+    return findchar_fast(buf, buf_end, ranges2, 4, found);
+#endif
+}
+
 static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret)
 {
     const char *token_start = buf;
@@ -143,6 +207,76 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const
     buf = findchar_fast(buf, buf_end, ranges1, 6, &found);
     if (found)
         goto FOUND_CTL;
+#elif defined(__ARM_FEATURE_SVE)
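+    // Same predicated SVE scan as in findchar_nonprintable_fast, but with a cheap
+    // range pre-check before the exact (and more expensive) CTL test.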
+    for (uint64_t i = 0;; i = svqincb(i, 1)) {
+        const uint64_t len = buf_end - buf;
+        const svbool_t pg = svwhilelt_b8(i, len);
+
+        if (!svptest_first(svptrue_b8(), pg)) {
+            buf = buf_end;
+            break;
+        }
+
+        const svuint8_t v = svld1(pg, (const uint8_t *)buf + i);
+        const uint8_t space = '\040';
+
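+        // Subtracting '\040' makes the printable range [0x20, 0x7E] wrap around to
+        // [0x00, 0x5E], so a single unsigned compare against 0137 (0x5F) flags every
+        // byte that is a control character, '\177', or >= 0x80.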
+        if (svptest_any(pg, svcmpge(pg, svsub_x(pg, v, space), 0137u))) {
+            svbool_t c = svcmpne(svcmplt(pg, v, space), v, '\011');
+
+            c = svorr_z(pg, c, svcmpeq(pg, v, '\177'));
+
+            if (svptest_any(pg, c)) {
+                buf += i + svcntp_b8(pg, svbrkb_z(pg, c));
+                goto FOUND_CTL;
+            }
+        }
+    }
+#elif defined(__ARM_NEON) && defined(__ARM_64BIT_STATE)
+    const size_t block_size = 2 * sizeof(uint8x16_t) - 1;
+    const char *const end = (size_t)(buf_end - buf) >= block_size ? buf_end - block_size : buf;
+
+    for (; buf < end; buf += 2 * sizeof(uint8x16_t)) {
+        const uint8x16_t space = vmovq_n_u8('\040');
+        const uint8x16_t threshold = vmovq_n_u8(0137u);
+        const uint8x16_t v1 = vld1q_u8((const uint8_t *)buf);
+        const uint8x16_t v2 = vld1q_u8((const uint8_t *)buf + sizeof(v1));
+        uint8x16_t v3 = vcgeq_u8(vsubq_u8(v1, space), threshold);
+        uint8x16_t v4 = vcgeq_u8(vsubq_u8(v2, space), threshold);
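+        // Same wraparound range check as the SVE path: after subtracting '\040',
+        // anything >= 0137 is a control byte, '\177', or a byte >= 0x80.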
+
+        v3 = vorrq_u8(v3, v4);
+        // Pack the comparison result into half a vector, i.e. 64 bits.
+        v3 = vpmaxq_u8(v3, v3);
+
+        if (vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0)) {
+            const uint8x16_t del = vmovq_n_u8('\177');
+            // This mask makes it possible to pack the comparison results into half a vector,
+            // which has the same size as uint64_t.
+            const uint8x16_t mask = vreinterpretq_u8_u32(vmovq_n_u32(0x40100401));
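+            // 0x40100401 gives the four bytes of each 32-bit word the masks 0x01,
+            // 0x04, 0x10 and 0x40; since each comparison byte is 0x00 or 0xFF, the
+            // masked bits are disjoint and the two pairwise additions below combine
+            // them without carries, leaving 2 result bits per input byte.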
+            const uint8x16_t tab = vmovq_n_u8('\011');
+
+            v3 = vbicq_u8(vcltq_u8(v1, space), vceqq_u8(v1, tab));
+            v4 = vbicq_u8(vcltq_u8(v2, space), vceqq_u8(v2, tab));
+            v3 = vorrq_u8(v3, vceqq_u8(v1, del));
+            v4 = vorrq_u8(v4, vceqq_u8(v2, del));
+            // After masking, four consecutive bytes in the results do not have the same bits set.
+            v3 = vandq_u8(v3, mask);
+            v4 = vandq_u8(v4, mask);
+            // Pack the comparison results into 128, and then 64 bits.
+            v3 = vpaddq_u8(v3, v4);
+            v3 = vpaddq_u8(v3, v3);
+
+            uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0);
+
+            if (offset) {
+                __asm__("rbit %x0, %x0" : "+r"(offset));
+                static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
+                              "Need the number of leading 0-bits in uint64_t.");
+                // offset uses 2 bits per byte of input.
+                buf += __builtin_clzll(offset) / 2;
+                goto FOUND_CTL;
+            }
+        }
+    }
 #else
     /* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
     while (likely(buf_end - buf >= 8)) {