 #include <x86intrin.h>
 #endif
 #endif
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif
 #include "picohttpparser.h"

 #if __GNUC__ >= 3
 #define ADVANCE_TOKEN(tok, toklen)                                                        \
     do {                                                                                  \
         const char *tok_start = buf;                                                      \
-        static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";                   \
         int found2;                                                                       \
-        buf = findchar_fast(buf, buf_end, ranges2, 4, &found2);                           \
+        buf = findchar_nonprintable_fast(buf, buf_end, &found2);                          \
         if (!found2) {                                                                    \
             CHECK_EOF();                                                                  \
         }                                                                                 \
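For reference, the ranges2 table removed above feeds the SSE4.2 _mm_cmpestri path of findchar_fast, which (with _SIDD_CMP_RANGES) interprets its argument as pairs of inclusive byte ranges; "\000\040\177\177" therefore matches [0x00, 0x20] and [0x7F, 0x7F]. A scalar sketch of the predicate the new helper has to reproduce (hypothetical name, illustration only, not part of the patch):

/* A token byte ends at whitespace/controls ([0x00, 0x20]) or DEL (0x7F). */
static int is_token_end(unsigned char c)
{
    return c <= '\040' || c == '\177';
}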
@@ -131,6 +133,46 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const cha
     return buf;
 }

+static const char *findchar_nonprintable_fast(const char *buf, const char *buf_end, int *found)
+{
+#if defined(__ARM_64BIT_STATE) && defined(__ARM_FEATURE_UNALIGNED) && !defined(__ARM_BIG_ENDIAN)
+    *found = 0;
+
+    for (size_t i = (buf_end - buf) / sizeof(uint8x16_t); i; i--) {
+        // This mask makes it possible to pack the comparison result into half a vector,
+        // which has the same size as uint64_t.
+        const uint8x16_t mask = vreinterpretq_u8_u16(vmovq_n_u16(0x8008));
+        uint8x16_t v = vld1q_u8((const uint8_t *)buf);
+
+        v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')), vceqq_u8(v, vmovq_n_u8('\177')));
+        // After masking, a byte in the result does not have the same bits set as any of its neighbours.
+        v = vandq_u8(v, mask);
+        // Pack the comparison result into 64 bits.
+        v = vpaddq_u8(v, v);
+
+        uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v), 0);
+
+        if (offset) {
+            *found = 1;
+            __asm__("rbit %x0, %x0" : "+r"(offset));
+            static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
+                          "Need the number of leading 0-bits in uint64_t.");
+            // offset uses 4 bits per byte of input.
+            buf += __builtin_clzll(offset) / 4;
+            break;
+        }
+
+        buf += sizeof(v);
+    }
+
+    return buf;
+#else
+    static const char ALIGNED(16) ranges2[16] = "\000\040\177\177";
+
+    return findchar_fast(buf, buf_end, ranges2, 4, found);
+#endif
+}
+
 static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret)
 {
     const char *token_start = buf;
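The rbit-then-__builtin_clzll pair in the new helper is a count-trailing-zeros in disguise: rather than count trailing zeros directly, the patch reverses the bit order and counts leading zeros. Since vpaddq_u8 leaves 4 result bits per input byte, the index of the first non-printable byte is ctz(offset) / 4. A standalone sketch of that arithmetic, using the portable GCC/Clang builtin instead of the inline asm (illustrative, not part of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    /* Pretend byte 5 of a 16-byte chunk matched: after the vandq/vpaddq
       packing, the match lands in nibble 5 of the 64-bit lane. */
    uint64_t offset = (uint64_t)0x8 << (5 * 4);

    /* Same result as the patch's rbit + __builtin_clzll sequence. */
    printf("%d\n", (int)(__builtin_ctzll(offset) / 4)); /* prints 5 */
    return 0;
}

Note also that the vector loop only walks whole 16-byte chunks; any tail bytes fall through with *found == 0, and the caller (ADVANCE_TOKEN) handles them byte-by-byte, mirroring how the SSE4.2 fallback behaves.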
@@ -143,6 +185,52 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, const
     buf = findchar_fast(buf, buf_end, ranges1, 6, &found);
     if (found)
         goto FOUND_CTL;
+#elif defined(__ARM_64BIT_STATE) && defined(__ARM_FEATURE_UNALIGNED) && !defined(__ARM_BIG_ENDIAN)
+    for (size_t i = (buf_end - buf) / (2 * sizeof(uint8x16_t)); i; i--) {
+        const uint8x16_t space = vmovq_n_u8('\040');
+        const uint8x16_t threshold = vmovq_n_u8(0137u);
+        const uint8x16_t v1 = vld1q_u8((const uint8_t *)buf);
+        const uint8x16_t v2 = vld1q_u8((const uint8_t *)buf + sizeof(v1));
+        uint8x16_t v3 = vcgeq_u8(vsubq_u8(v1, space), threshold);
+        uint8x16_t v4 = vcgeq_u8(vsubq_u8(v2, space), threshold);
+
+        v3 = vorrq_u8(v3, v4);
+        // Pack the comparison result into half a vector, i.e. 64 bits; even two adjacent 0xFF
+        // bytes only wrap to 0xFE, so the result stays non-zero whenever any byte matched.
+        v3 = vpaddq_u8(v3, v3);
+
+        if (vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0)) {
+            const uint8x16_t del = vmovq_n_u8('\177');
+            // This mask makes it possible to pack the comparison results into half a vector,
+            // which has the same size as uint64_t.
+            const uint8x16_t mask = vreinterpretq_u8_u32(vmovq_n_u32(0x40100401));
+            const uint8x16_t tab = vmovq_n_u8('\011');
+
+            v3 = vbicq_u8(vcltq_u8(v1, space), vceqq_u8(v1, tab));
+            v4 = vbicq_u8(vcltq_u8(v2, space), vceqq_u8(v2, tab));
+            v3 = vorrq_u8(v3, vceqq_u8(v1, del));
+            v4 = vorrq_u8(v4, vceqq_u8(v2, del));
+            // After masking, four consecutive bytes in the results do not have the same bits set.
+            v3 = vandq_u8(v3, mask);
+            v4 = vandq_u8(v4, mask);
+            // Pack the comparison results into 128, and then 64 bits.
+            v3 = vpaddq_u8(v3, v4);
+            v3 = vpaddq_u8(v3, v3);
+
+            uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0);
+
+            if (offset) {
+                __asm__("rbit %x0, %x0" : "+r"(offset));
+                static_assert(sizeof(unsigned long long) == sizeof(uint64_t),
+                              "Need the number of leading 0-bits in uint64_t.");
+                // offset uses 2 bits per byte of input.
+                buf += __builtin_clzll(offset) / 2;
+                goto FOUND_CTL;
+            }
+        }
+
+        buf += sizeof(v1) + sizeof(v2);
+    }
 #else
     /* find non-printable char within the next 8 bytes, this is the hottest code; manually inlined */
     while (likely(buf_end - buf >= 8)) {
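One way to see that the two-stage check in the second hunk is sound: the cheap prefilter, subtract '\040' then one unsigned compare against 0137 (octal for 95), flags exactly the bytes outside [0x20, 0x7E] thanks to unsigned wrap-around. That is a superset of what the precise pass reports (controls except tab, plus DEL): tab and bytes >= 0x80 also trip the prefilter, and the inner if (offset) re-check is what lets them pass through as ordinary data. A small self-contained verification of the superset property (illustrative only, not part of the patch):

#include <stdio.h>

int main(void)
{
    for (int c = 0; c < 256; ++c) {
        /* Prefilter: one vcgeq_u8 per 16 bytes in the patch. */
        int prefilter = (unsigned char)(c - 0x20) >= 0137; /* c < 0x20 or c >= 0x7F */
        /* Precise pass: controls except tab, plus DEL. */
        int terminator = (c < 0x20 && c != '\t') || c == 0x7f;
        if (terminator && !prefilter)
            printf("missed 0x%02x\n", c); /* never prints */
    }
    return 0;
}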