3434#include <x86intrin.h>
3535#endif
3636#endif
37+ #ifdef __ARM_FEATURE_SVE
38+ #include <arm_sve.h>
39+ #endif
40+ #ifdef __ARM_NEON
41+ #include <arm_neon.h>
42+ #endif
3743#include "picohttpparser.h"
3844
3945#if __GNUC__ >= 3
7177#define ADVANCE_TOKEN (tok , toklen ) \
7278 do { \
7379 const char *tok_start = buf; \
74- static const char ALIGNED(16) ranges2[16] = "\000\040\177\177"; \
7580 int found2; \
76- buf = findchar_fast (buf, buf_end, ranges2, 4, &found2); \
81+ buf = findchar_nonprintable_fast (buf, buf_end, &found2); \
7782 if (!found2) { \
7883 CHECK_EOF(); \
7984 } \
@@ -131,6 +136,67 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const cha
131136 return buf ;
132137}
133138
139+ static const char * findchar_nonprintable_fast (const char * buf , const char * buf_end , int * found )
140+ {
141+ #ifdef __ARM_FEATURE_SVE
142+ * found = 0 ;
143+
144+ for (uint64_t i = 0 ;; i = svqincb (i , 1 )) {
145+ const uint64_t len = buf_end - buf ;
146+ const svbool_t pg = svwhilelt_b8 (i , len );
147+
148+ if (!svptest_first (svptrue_b8 (), pg )) {
149+ buf = buf_end ;
150+ break ;
151+ }
152+
153+ const svuint8_t v = svld1 (pg , (const uint8_t * )buf + i );
154+ svbool_t c = svcmplt (pg , v , '\041' );
155+
156+ c = svorr_z (pg , c , svcmpeq (pg , v , '\177' ));
157+
158+ if (svptest_any (pg , c )) {
159+ * found = 1 ;
160+ c = svbrkb_z (pg , c );
161+ buf += i + svcntp_b8 (pg , c );
162+ break ;
163+ }
164+ }
165+
166+ return buf ;
167+ #elif defined(__ARM_NEON ) && defined(__ARM_64BIT_STATE )
168+ * found = 0 ;
169+
170+ const size_t block_size = sizeof (uint8x16_t ) - 1 ;
171+ const char * const end = (size_t )(buf_end - buf ) >= block_size ? buf_end - block_size : buf ;
172+
173+ for (; buf < end ; buf += sizeof (uint8x16_t )) {
174+ uint8x16_t v = vld1q_u8 ((const uint8_t * )buf );
175+
176+ v = vorrq_u8 (vcltq_u8 (v , vmovq_n_u8 ('\041' )), vceqq_u8 (v , vmovq_n_u8 ('\177' )));
177+
178+ /* Pack the comparison result into 64 bits. */
179+ const uint8x8_t rv = vshrn_n_u16 (vreinterpretq_u16_u8 (v ), 4 );
180+ uint64_t offset = vget_lane_u64 (vreinterpret_u64_u8 (rv ), 0 );
181+
182+ if (offset ) {
183+ * found = 1 ;
184+ __asm__("rbit %x0, %x0" : "+r" (offset ));
185+ static_assert (sizeof (unsigned long long ) == sizeof (uint64_t ), "Need the number of leading 0-bits in uint64_t." );
186+ /* offset uses 4 bits per byte of input. */
187+ buf += __builtin_clzll (offset ) / 4 ;
188+ break ;
189+ }
190+ }
191+
192+ return buf ;
193+ #else
194+ static const char ALIGNED (16 ) ranges2 [16 ] = "\000\040\177\177" ;
195+
196+ return findchar_fast (buf , buf_end , ranges2 , 4 , found );
197+ #endif
198+ }
199+
134200static const char * get_token_to_eol (const char * buf , const char * buf_end , const char * * token , size_t * token_len , int * ret )
135201{
136202 const char * token_start = buf ;
@@ -143,6 +209,81 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const
143209 buf = findchar_fast (buf , buf_end , ranges1 , 6 , & found );
144210 if (found )
145211 goto FOUND_CTL ;
212+ #elif defined(__ARM_FEATURE_SVE )
213+ for (uint64_t i = 0 ;; i = svqincb (i , 1 )) {
214+ const uint64_t len = buf_end - buf ;
215+ const svbool_t pg = svwhilelt_b8 (i , len );
216+
217+ if (!svptest_first (svptrue_b8 (), pg )) {
218+ buf = buf_end ;
219+ break ;
220+ }
221+
222+ const svuint8_t v = svld1 (pg , (const uint8_t * )buf + i );
223+ const uint8_t space = '\040' ;
224+ svbool_t c = svcmpge (pg , svsub_x (pg , v , space ), 0137u );
225+
226+ if (svptest_any (pg , c )) {
227+ c = svcmplt (pg , v , space );
228+ c = svcmpne (c , v , '\011' );
229+ c = svorr_z (pg , c , svcmpeq (pg , v , '\177' ));
230+
231+ if (svptest_any (pg , c )) {
232+ c = svbrkb_z (pg , c );
233+ buf += i + svcntp_b8 (pg , c );
234+ goto FOUND_CTL ;
235+ }
236+ }
237+ }
238+ #elif defined(__ARM_NEON ) && defined(__ARM_64BIT_STATE )
239+ const size_t block_size = 2 * sizeof (uint8x16_t ) - 1 ;
240+ const char * const end = (size_t )(buf_end - buf ) >= block_size ? buf_end - block_size : buf ;
241+
242+ for (; buf < end ; buf += 2 * sizeof (uint8x16_t )) {
243+ const uint8x16_t space = vmovq_n_u8 ('\040' );
244+ const uint8x16_t threshold = vmovq_n_u8 (0137u );
245+ const uint8x16_t v1 = vld1q_u8 ((const uint8_t * )buf );
246+ const uint8x16_t v2 = vld1q_u8 ((const uint8_t * )buf + sizeof (v1 ));
247+ uint8x16_t v3 = vsubq_u8 (v1 , space );
248+ uint8x16_t v4 = vsubq_u8 (v2 , space );
249+
250+ v3 = vcgeq_u8 (v3 , threshold );
251+ v4 = vcgeq_u8 (v4 , threshold );
252+ v3 = vorrq_u8 (v3 , v4 );
253+ /* Pack the comparison result into half a vector, i.e. 64 bits. */
254+ v3 = vpmaxq_u8 (v3 , v3 );
255+
256+ if (vgetq_lane_u64 (vreinterpretq_u64_u8 (v3 ), 0 )) {
257+ const uint8x16_t del = vmovq_n_u8 ('\177' );
258+ /* This mask makes it possible to pack the comparison results into half a vector,
259+ * which has the same size as uint64_t. */
260+ const uint8x16_t mask = vreinterpretq_u8_u32 (vmovq_n_u32 (0x40100401 ));
261+ const uint8x16_t tab = vmovq_n_u8 ('\011' );
262+
263+ v3 = vcltq_u8 (v1 , space );
264+ v4 = vcltq_u8 (v2 , space );
265+ v3 = vbicq_u8 (v3 , vceqq_u8 (v1 , tab ));
266+ v4 = vbicq_u8 (v4 , vceqq_u8 (v2 , tab ));
267+ v3 = vorrq_u8 (v3 , vceqq_u8 (v1 , del ));
268+ v4 = vorrq_u8 (v4 , vceqq_u8 (v2 , del ));
269+ /* After masking, four consecutive bytes in the results do not have the same bits set. */
270+ v3 = vandq_u8 (v3 , mask );
271+ v4 = vandq_u8 (v4 , mask );
272+ /* Pack the comparison results into 128, and then 64 bits. */
273+ v3 = vpaddq_u8 (v3 , v4 );
274+ v3 = vpaddq_u8 (v3 , v3 );
275+
276+ uint64_t offset = vgetq_lane_u64 (vreinterpretq_u64_u8 (v3 ), 0 );
277+
278+ if (offset ) {
279+ __asm__("rbit %x0, %x0" : "+r" (offset ));
280+ static_assert (sizeof (unsigned long long ) == sizeof (uint64_t ), "Need the number of leading 0-bits in uint64_t." );
281+ /* offset uses 2 bits per byte of input. */
282+ buf += __builtin_clzll (offset ) / 2 ;
283+ goto FOUND_CTL ;
284+ }
285+ }
286+ }
146287#else
147288 /* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
148289 while (likely (buf_end - buf >= 8 )) {
0 commit comments