Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions ext/oj/extconf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,12 @@

dflags['OJ_DEBUG'] = true unless ENV['OJ_DEBUG'].nil?

# Enable SIMD optimizations - try SSE4.2 on x86_64 for best performance.
# Falls back to SSE2, then to compiler defaults, if the flag is not accepted.
# The compiler flag makes GCC/Clang define __SSE4_2__ / __SSE2__, which
# ext/oj/simd.h uses to select the matching HAVE_SIMD_* implementation.
if try_cflags('-msse4.2')
  $CPPFLAGS += ' -msse4.2'
elsif try_cflags('-msse2')
  $CPPFLAGS += ' -msse2'
end

if enable_config('trace-log', false)
Expand Down
131 changes: 113 additions & 18 deletions ext/oj/parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,9 @@
#include "mem.h"
#include "oj.h"
#include "rxclass.h"
#include "simd.h"
#include "val_stack.h"

#ifdef OJ_USE_SSE4_2
#include <nmmintrin.h>
#endif

// Workaround in case INFINITY is not defined in math.h or if the OS is CentOS
#define OJ_INFINITY (1.0 / 0.0)

Expand Down Expand Up @@ -202,23 +199,118 @@ static inline const char *scan_string_noSIMD(const char *str, const char *end) {
return str;
}

#ifdef HAVE_SIMD_SSE4_2
// Optimized SIMD string scanner using SSE4.2 string instructions.
// Returns a pointer to the first byte in [str, end) that is one of the three
// string terminators '\0', '\\' or '"'; the unscanned tail (fewer than 16
// bytes) is handed to scan_string_noSIMD, which does the same search scalar.
// Uses prefetching and processes four 16-byte chunks per iteration to reduce
// pipeline stalls and improve instruction-level parallelism.
// NOTE(review): end - 64 / end - 16 on a buffer shorter than that is
// technically out-of-range pointer arithmetic; benign on mainstream targets,
// but worth confirming callers always pass reasonably sized buffers.
static inline const char *scan_string_SSE42(const char *str, const char *end) {
    // Needle for _mm_cmpestri: only the first 3 bytes ('\0', '\\', '"') are
    // used (length argument below is 3).
    static const char chars[16] = "\x00\\\"";

    const __m128i terminate   = _mm_loadu_si128((const __m128i *)&chars[0]);
    const char   *safe_end_64 = end - 64;
    const char   *safe_end_16 = end - 16;

    // Process 64 bytes at a time with parallel SIMD operations.
    while (str <= safe_end_64) {
        // Prefetch the next cache line for better memory throughput.
        __builtin_prefetch(str + 64, 0, 0);

        // Load and compare 4 chunks; independent loads let the CPU overlap them.
        const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
        const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(str + 16));
        const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(str + 32));
        const __m128i chunk3 = _mm_loadu_si128((const __m128i *)(str + 48));

        // _mm_cmpestri returns 16 when no needle byte is found in the chunk.
        const int r0 = _mm_cmpestri(terminate, 3, chunk0, 16,
                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
        if (__builtin_expect(r0 != 16, 0)) return str + r0;

        const int r1 = _mm_cmpestri(terminate, 3, chunk1, 16,
                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
        if (__builtin_expect(r1 != 16, 0)) return str + 16 + r1;

        const int r2 = _mm_cmpestri(terminate, 3, chunk2, 16,
                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
        if (__builtin_expect(r2 != 16, 0)) return str + 32 + r2;

        const int r3 = _mm_cmpestri(terminate, 3, chunk3, 16,
                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
        if (__builtin_expect(r3 != 16, 0)) return str + 48 + r3;

        str += 64;
    }

    // Handle remaining full 16-byte chunks.
    for (; str <= safe_end_16; str += 16) {
        const __m128i string = _mm_loadu_si128((const __m128i *)str);
        const int     r      = _mm_cmpestri(terminate, 3, string, 16,
                                            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
        if (r != 16) return str + r;
    }

    // Scalar fallback for the final sub-16-byte tail.
    return scan_string_noSIMD(str, end);
}
#endif

#ifdef HAVE_SIMD_SSE2
// Optimized SSE2 string scanner (fallback for older x86_64 CPUs).
// Uses SSE2 instructions with prefetching and parallel processing.
// Contract: returns a pointer to the first byte in [str, end) equal to one of
// '\0', '\\' or '"'; any sub-16-byte tail is handed to scan_string_noSIMD,
// which performs the same search byte-by-byte.
// NOTE(review): computing end - 64 / end - 16 for buffers shorter than that
// is technically out-of-range pointer arithmetic — benign on mainstream
// targets, but confirm callers pass adequately sized buffers.
static inline const char *scan_string_SSE2(const char *str, const char *end) {
    const char *safe_end_64 = end - 64;
    const char *safe_end_16 = end - 16;

    // Create comparison vectors for our three special characters
    const __m128i null_char = _mm_setzero_si128();
    const __m128i backslash = _mm_set1_epi8('\\');
    const __m128i quote = _mm_set1_epi8('"');

    // Process 64 bytes at a time for better throughput
    while (str <= safe_end_64) {
        // Prefetch the next cache line; read-only, low temporal locality.
        __builtin_prefetch(str + 64, 0, 0);

        // Load 4 chunks
        const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
        const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(str + 16));
        const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(str + 32));
        const __m128i chunk3 = _mm_loadu_si128((const __m128i *)(str + 48));

        // Compare all chunks (allows CPU to parallelize); each cmp byte is
        // 0xFF where the chunk byte matches one of the three terminators.
        const __m128i cmp0 = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(chunk0, null_char),
                                                       _mm_cmpeq_epi8(chunk0, backslash)),
                                          _mm_cmpeq_epi8(chunk0, quote));
        const __m128i cmp1 = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(chunk1, null_char),
                                                       _mm_cmpeq_epi8(chunk1, backslash)),
                                          _mm_cmpeq_epi8(chunk1, quote));
        const __m128i cmp2 = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(chunk2, null_char),
                                                       _mm_cmpeq_epi8(chunk2, backslash)),
                                          _mm_cmpeq_epi8(chunk2, quote));
        const __m128i cmp3 = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(chunk3, null_char),
                                                       _mm_cmpeq_epi8(chunk3, backslash)),
                                          _mm_cmpeq_epi8(chunk3, quote));

        // Convert to masks; __builtin_ctz gives the offset of the first match.
        int mask0 = _mm_movemask_epi8(cmp0);
        if (__builtin_expect(mask0 != 0, 0)) return str + __builtin_ctz(mask0);

        int mask1 = _mm_movemask_epi8(cmp1);
        if (__builtin_expect(mask1 != 0, 0)) return str + 16 + __builtin_ctz(mask1);

        int mask2 = _mm_movemask_epi8(cmp2);
        if (__builtin_expect(mask2 != 0, 0)) return str + 32 + __builtin_ctz(mask2);

        int mask3 = _mm_movemask_epi8(cmp3);
        if (__builtin_expect(mask3 != 0, 0)) return str + 48 + __builtin_ctz(mask3);

        str += 64;
    }

    // Handle remaining 16-byte chunks
    for (; str <= safe_end_16; str += 16) {
        const __m128i chunk = _mm_loadu_si128((const __m128i *)str);
        const __m128i matches = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(chunk, null_char),
                                                          _mm_cmpeq_epi8(chunk, backslash)),
                                             _mm_cmpeq_epi8(chunk, quote));
        int mask = _mm_movemask_epi8(matches);
        if (mask != 0) return str + __builtin_ctz(mask);
    }

    // Scalar fallback for the final sub-16-byte tail.
    return scan_string_noSIMD(str, end);
Expand All @@ -228,9 +320,12 @@ static inline const char *scan_string_SIMD(const char *str, const char *end) {
static const char *(*scan_func)(const char *str, const char *end) = scan_string_noSIMD;

// Install the best string scanner the build supports. scan_func defaults to
// the portable scan_string_noSIMD, so when no SIMD level was detected this
// function intentionally leaves it untouched.
void oj_scanner_init(void) {
#if defined(HAVE_SIMD_SSE4_2)
    scan_func = scan_string_SSE42;
#elif defined(HAVE_SIMD_SSE2)
    scan_func = scan_string_SSE2;
#endif
    // An ARM NEON scanner would be dispatched here once implemented.
}

// entered at /
Expand Down
37 changes: 37 additions & 0 deletions ext/oj/simd.h
Original file line number Diff line number Diff line change
@@ -1,10 +1,47 @@
#ifndef OJ_SIMD_H
#define OJ_SIMD_H

// SIMD architecture detection and configuration.
// This header provides unified SIMD support across different CPU
// architectures: it defines HAVE_SIMD_* feature macros and includes the
// matching intrinsics header, so translation units only include "simd.h".

// x86/x86_64 SIMD detection
#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
#define HAVE_SIMD_X86 1

// SSE4.2 support (Intel Core i7+, AMD Bulldozer+)
// Enabled automatically when compiler has -msse4.2 flag (GCC/Clang then
// define __SSE4_2__; see the try_cflags probe in extconf.rb).
#if defined(__SSE4_2__)
#define HAVE_SIMD_SSE4_2 1
#include <nmmintrin.h>
#endif

// SSE2 support (fallback for older x86_64 CPUs - all x86_64 CPUs support SSE2)
// NOTE(review): MSVC defines neither __SSE4_2__ nor __SSE2__ on x64, so both
// paths stay disabled under MSVC; the scanners rely on GCC/Clang __builtin_*
// intrinsics anyway, so this looks intentional — confirm.
#if defined(__SSE2__) && !defined(HAVE_SIMD_SSE4_2)
#define HAVE_SIMD_SSE2 1
#include <emmintrin.h>
#endif

#endif // x86/x86_64

// ARM NEON detection
// NOTE(review): SIMD_MINIMUM_THRESHOLD is only defined on NEON targets and no
// NEON scanner is selected in oj_scanner_init yet — forward provisioning.
#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
#define HAVE_SIMD_NEON 1
#define SIMD_MINIMUM_THRESHOLD 6
#include <arm_neon.h>
#endif

// Define which SIMD implementation to use (priority order: SSE4.2 > NEON > SSE2)
// Exactly one SIMD_TYPE string is defined; HAVE_SIMD_STRING_SCAN is set only
// when some SIMD implementation is available.
#if defined(HAVE_SIMD_SSE4_2)
#define HAVE_SIMD_STRING_SCAN 1
#define SIMD_TYPE "SSE4.2"
#elif defined(HAVE_SIMD_NEON)
#define HAVE_SIMD_STRING_SCAN 1
#define SIMD_TYPE "NEON"
#elif defined(HAVE_SIMD_SSE2)
#define HAVE_SIMD_STRING_SCAN 1
#define SIMD_TYPE "SSE2"
#else
#define SIMD_TYPE "none"
#endif

#endif /* OJ_SIMD_H */
Loading