@@ -200,67 +200,119 @@ static inline const char *scan_string_noSIMD(const char *str, const char *end) {
200200}
201201
#ifdef HAVE_SIMD_SSE4_2
// Optimized SIMD string scanner using SSE4.2 PCMPESTRI.
//
// Returns a pointer to the first '\0', '\\', or '"' in [str, end); if none is
// found in the SIMD-scannable prefix, defers the final < 16 bytes to
// scan_string_noSIMD().
//
// Scans 64 bytes per iteration with four independent 16-byte compares to
// expose instruction-level parallelism, then drains remaining 16-byte chunks.
static inline const char *scan_string_SSE42(const char *str, const char *end) {
    // Needle: '\0', '\\', '"'. The explicit length 3 passed to PCMPESTRI makes
    // the embedded NUL a real needle character rather than a terminator.
    static const char chars[16] = "\x00\\\"";
    const __m128i terminate = _mm_loadu_si128((const __m128i *)&chars[0]);

    // Compare the remaining length instead of precomputing "end - 64":
    // forming a pointer before the start of the buffer (when end - str < 64)
    // is undefined behavior even if the pointer is never dereferenced.
    while ((size_t)(end - str) >= 64) {
        // Prefetch the next block. _mm_prefetch is portable across MSVC, GCC,
        // and Clang, unlike __builtin_prefetch.
        _mm_prefetch((const char *)(str + 64), _MM_HINT_NTA);

        // Four independent loads/compares let the CPU overlap their latency.
        const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
        const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(str + 16));
        const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(str + 32));
        const __m128i chunk3 = _mm_loadu_si128((const __m128i *)(str + 48));

        // PCMPESTRI returns 16 when no needle byte is present in the chunk.
        const int r0 = _mm_cmpestri(terminate, 3, chunk0, 16,
            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
        if (r0 != 16) return str + r0;

        const int r1 = _mm_cmpestri(terminate, 3, chunk1, 16,
            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
        if (r1 != 16) return str + 16 + r1;

        const int r2 = _mm_cmpestri(terminate, 3, chunk2, 16,
            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
        if (r2 != 16) return str + 32 + r2;

        const int r3 = _mm_cmpestri(terminate, 3, chunk3, 16,
            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
        if (r3 != 16) return str + 48 + r3;

        str += 64;
    }

    // Drain any remaining whole 16-byte chunks.
    while ((size_t)(end - str) >= 16) {
        const __m128i string = _mm_loadu_si128((const __m128i *)str);
        const int r = _mm_cmpestri(terminate, 3, string, 16,
            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
        if (r != 16) return str + r;
        str += 16;
    }

    // Scalar scan for the last < 16 bytes.
    return scan_string_noSIMD(str, end);
}
#endif
225253
#ifdef HAVE_SIMD_SSE2
// Index of the lowest set bit in mask (mask must be non-zero).
// Portable across MSVC (_BitScanForward) and GCC/Clang (__builtin_ctz);
// restores the MSVC support the scalar-tail version of this scanner had.
static inline int scan_string_first_match_index(int mask) {
#ifdef _MSC_VER
    unsigned long pos;
    _BitScanForward(&pos, (unsigned long)mask);
    return (int)pos;
#else
    return __builtin_ctz((unsigned)mask);
#endif
}

// Optimized SSE2 string scanner (fallback for older x86_64 CPUs).
//
// Returns a pointer to the first '\0', '\\', or '"' in [str, end); if none is
// found in the SIMD-scannable prefix, defers the final < 16 bytes to
// scan_string_noSIMD().
//
// Processes 64 bytes per iteration (four independent 16-byte compare chains)
// for throughput, then drains remaining 16-byte chunks.
static inline const char *scan_string_SSE2(const char *str, const char *end) {
    // Splat each special character across a 16-byte vector.
    const __m128i null_char = _mm_setzero_si128();
    const __m128i backslash = _mm_set1_epi8('\\');
    const __m128i quote     = _mm_set1_epi8('"');

    // Compare the remaining length instead of precomputing "end - 64":
    // forming a pointer before the start of the buffer (when end - str < 64)
    // is undefined behavior even if the pointer is never dereferenced.
    while ((size_t)(end - str) >= 64) {
        // Prefetch the next block. _mm_prefetch is portable across MSVC, GCC,
        // and Clang, unlike __builtin_prefetch.
        _mm_prefetch((const char *)(str + 64), _MM_HINT_NTA);

        const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
        const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(str + 16));
        const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(str + 32));
        const __m128i chunk3 = _mm_loadu_si128((const __m128i *)(str + 48));

        // Independent compare chains let the CPU overlap their execution.
        const __m128i cmp0 = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(chunk0, null_char),
                                                       _mm_cmpeq_epi8(chunk0, backslash)),
                                          _mm_cmpeq_epi8(chunk0, quote));
        const __m128i cmp1 = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(chunk1, null_char),
                                                       _mm_cmpeq_epi8(chunk1, backslash)),
                                          _mm_cmpeq_epi8(chunk1, quote));
        const __m128i cmp2 = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(chunk2, null_char),
                                                       _mm_cmpeq_epi8(chunk2, backslash)),
                                          _mm_cmpeq_epi8(chunk2, quote));
        const __m128i cmp3 = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(chunk3, null_char),
                                                       _mm_cmpeq_epi8(chunk3, backslash)),
                                          _mm_cmpeq_epi8(chunk3, quote));

        // A set mask bit marks the byte position of the first special char.
        const int mask0 = _mm_movemask_epi8(cmp0);
        if (mask0 != 0) return str + scan_string_first_match_index(mask0);

        const int mask1 = _mm_movemask_epi8(cmp1);
        if (mask1 != 0) return str + 16 + scan_string_first_match_index(mask1);

        const int mask2 = _mm_movemask_epi8(cmp2);
        if (mask2 != 0) return str + 32 + scan_string_first_match_index(mask2);

        const int mask3 = _mm_movemask_epi8(cmp3);
        if (mask3 != 0) return str + 48 + scan_string_first_match_index(mask3);

        str += 64;
    }

    // Drain any remaining whole 16-byte chunks.
    while ((size_t)(end - str) >= 16) {
        const __m128i chunk = _mm_loadu_si128((const __m128i *)str);
        const __m128i matches = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(chunk, null_char),
                                                          _mm_cmpeq_epi8(chunk, backslash)),
                                             _mm_cmpeq_epi8(chunk, quote));
        const int mask = _mm_movemask_epi8(matches);
        if (mask != 0) return str + scan_string_first_match_index(mask);
        str += 16;
    }

    // Scalar scan for the last < 16 bytes.
    return scan_string_noSIMD(str, end);
}
#endif
0 commit comments