Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions ext/oj/extconf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -35,13 +35,12 @@

dflags['OJ_DEBUG'] = true unless ENV['OJ_DEBUG'].nil?

# Enable SIMD optimizations - try SSE4.2 on x86_64 for best performance.
# Falls back to SSE2, then to compiler defaults, if the flag is not accepted.
# The compiler flag makes GCC/Clang define __SSE4_2__ / __SSE2__, which
# ext/oj/simd.h uses to select the matching HAVE_SIMD_* implementation.
if try_cflags('-msse4.2')
  $CPPFLAGS += ' -msse4.2'
elsif try_cflags('-msse2')
  $CPPFLAGS += ' -msse2'
end

if enable_config('trace-log', false)
Expand Down
131 changes: 113 additions & 18 deletions ext/oj/parse.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,9 @@
#include "mem.h"
#include "oj.h"
#include "rxclass.h"
#include "simd.h"
#include "val_stack.h"

#ifdef OJ_USE_SSE4_2
#include <nmmintrin.h>
#endif

// Workaround in case INFINITY is not defined in math.h or if the OS is CentOS
#define OJ_INFINITY (1.0 / 0.0)

Expand Down Expand Up @@ -202,23 +199,118 @@ static inline const char *scan_string_noSIMD(const char *str, const char *end) {
return str;
}

#ifdef HAVE_SIMD_SSE4_2
// Optimized SIMD string scanner using SSE4.2 string instructions.
// Returns a pointer to the first byte in [str, end) that is one of the three
// string terminators '\0', '\\' or '"'; the unscanned tail (fewer than 16
// bytes) is handed to scan_string_noSIMD, which does the same search scalar.
// Uses prefetching and processes four 16-byte chunks per iteration to reduce
// pipeline stalls and improve instruction-level parallelism.
// NOTE(review): end - 64 / end - 16 on a buffer shorter than that is
// technically out-of-range pointer arithmetic; benign on mainstream targets,
// but worth confirming callers always pass reasonably sized buffers.
static inline const char *scan_string_SSE42(const char *str, const char *end) {
    // Needle for _mm_cmpestri: only the first 3 bytes ('\0', '\\', '"') are
    // used (length argument below is 3).
    static const char chars[16] = "\x00\\\"";

    const __m128i terminate   = _mm_loadu_si128((const __m128i *)&chars[0]);
    const char   *safe_end_64 = end - 64;
    const char   *safe_end_16 = end - 16;

    // Process 64 bytes at a time with parallel SIMD operations.
    while (str <= safe_end_64) {
        // Prefetch the next cache line for better memory throughput.
        __builtin_prefetch(str + 64, 0, 0);

        // Load and compare 4 chunks; independent loads let the CPU overlap them.
        const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
        const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(str + 16));
        const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(str + 32));
        const __m128i chunk3 = _mm_loadu_si128((const __m128i *)(str + 48));

        // _mm_cmpestri returns 16 when no needle byte is found in the chunk.
        const int r0 = _mm_cmpestri(terminate, 3, chunk0, 16,
                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
        if (__builtin_expect(r0 != 16, 0)) return str + r0;

        const int r1 = _mm_cmpestri(terminate, 3, chunk1, 16,
                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
        if (__builtin_expect(r1 != 16, 0)) return str + 16 + r1;

        const int r2 = _mm_cmpestri(terminate, 3, chunk2, 16,
                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
        if (__builtin_expect(r2 != 16, 0)) return str + 32 + r2;

        const int r3 = _mm_cmpestri(terminate, 3, chunk3, 16,
                                    _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
        if (__builtin_expect(r3 != 16, 0)) return str + 48 + r3;

        str += 64;
    }

    // Handle remaining full 16-byte chunks.
    for (; str <= safe_end_16; str += 16) {
        const __m128i string = _mm_loadu_si128((const __m128i *)str);
        const int     r      = _mm_cmpestri(terminate, 3, string, 16,
                                            _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
        if (r != 16) return str + r;
    }

    // Scalar fallback for the final sub-16-byte tail.
    return scan_string_noSIMD(str, end);
}
#endif

#ifdef HAVE_SIMD_SSE2
// Optimized SSE2 string scanner (fallback for older x86_64 CPUs).
// Uses SSE2 instructions with prefetching and parallel processing.
// Contract: returns a pointer to the first byte in [str, end) equal to one of
// '\0', '\\' or '"'; any sub-16-byte tail is handed to scan_string_noSIMD,
// which performs the same search byte-by-byte.
// NOTE(review): computing end - 64 / end - 16 for buffers shorter than that
// is technically out-of-range pointer arithmetic — benign on mainstream
// targets, but confirm callers pass adequately sized buffers.
static inline const char *scan_string_SSE2(const char *str, const char *end) {
    const char *safe_end_64 = end - 64;
    const char *safe_end_16 = end - 16;

    // Create comparison vectors for our three special characters
    const __m128i null_char = _mm_setzero_si128();
    const __m128i backslash = _mm_set1_epi8('\\');
    const __m128i quote = _mm_set1_epi8('"');

    // Process 64 bytes at a time for better throughput
    while (str <= safe_end_64) {
        // Prefetch the next cache line; read-only, low temporal locality.
        __builtin_prefetch(str + 64, 0, 0);

        // Load 4 chunks
        const __m128i chunk0 = _mm_loadu_si128((const __m128i *)(str));
        const __m128i chunk1 = _mm_loadu_si128((const __m128i *)(str + 16));
        const __m128i chunk2 = _mm_loadu_si128((const __m128i *)(str + 32));
        const __m128i chunk3 = _mm_loadu_si128((const __m128i *)(str + 48));

        // Compare all chunks (allows CPU to parallelize); each cmp byte is
        // 0xFF where the chunk byte matches one of the three terminators.
        const __m128i cmp0 = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(chunk0, null_char),
                                                       _mm_cmpeq_epi8(chunk0, backslash)),
                                          _mm_cmpeq_epi8(chunk0, quote));
        const __m128i cmp1 = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(chunk1, null_char),
                                                       _mm_cmpeq_epi8(chunk1, backslash)),
                                          _mm_cmpeq_epi8(chunk1, quote));
        const __m128i cmp2 = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(chunk2, null_char),
                                                       _mm_cmpeq_epi8(chunk2, backslash)),
                                          _mm_cmpeq_epi8(chunk2, quote));
        const __m128i cmp3 = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(chunk3, null_char),
                                                       _mm_cmpeq_epi8(chunk3, backslash)),
                                          _mm_cmpeq_epi8(chunk3, quote));

        // Convert to masks; __builtin_ctz gives the offset of the first match.
        int mask0 = _mm_movemask_epi8(cmp0);
        if (__builtin_expect(mask0 != 0, 0)) return str + __builtin_ctz(mask0);

        int mask1 = _mm_movemask_epi8(cmp1);
        if (__builtin_expect(mask1 != 0, 0)) return str + 16 + __builtin_ctz(mask1);

        int mask2 = _mm_movemask_epi8(cmp2);
        if (__builtin_expect(mask2 != 0, 0)) return str + 32 + __builtin_ctz(mask2);

        int mask3 = _mm_movemask_epi8(cmp3);
        if (__builtin_expect(mask3 != 0, 0)) return str + 48 + __builtin_ctz(mask3);

        str += 64;
    }

    // Handle remaining 16-byte chunks
    for (; str <= safe_end_16; str += 16) {
        const __m128i chunk = _mm_loadu_si128((const __m128i *)str);
        const __m128i matches = _mm_or_si128(_mm_or_si128(_mm_cmpeq_epi8(chunk, null_char),
                                                          _mm_cmpeq_epi8(chunk, backslash)),
                                             _mm_cmpeq_epi8(chunk, quote));
        int mask = _mm_movemask_epi8(matches);
        if (mask != 0) return str + __builtin_ctz(mask);
    }

    // Scalar fallback for the final sub-16-byte tail.
    return scan_string_noSIMD(str, end);
Expand All @@ -228,9 +320,12 @@ static inline const char *scan_string_SIMD(const char *str, const char *end) {
static const char *(*scan_func)(const char *str, const char *end) = scan_string_noSIMD;

// Install the best string scanner the build supports. scan_func defaults to
// the portable scan_string_noSIMD, so when no SIMD level was detected this
// function intentionally leaves it untouched.
void oj_scanner_init(void) {
#if defined(HAVE_SIMD_SSE4_2)
    scan_func = scan_string_SSE42;
#elif defined(HAVE_SIMD_SSE2)
    scan_func = scan_string_SSE2;
#endif
    // An ARM NEON scanner would be dispatched here once implemented.
}

// entered at /
Expand Down
37 changes: 37 additions & 0 deletions ext/oj/simd.h
Original file line number Diff line number Diff line change
@@ -1,10 +1,47 @@
#ifndef OJ_SIMD_H
#define OJ_SIMD_H

// SIMD architecture detection and configuration.
// This header provides unified SIMD support across different CPU
// architectures: it defines HAVE_SIMD_* feature macros and includes the
// matching intrinsics header, so translation units only include "simd.h".

// x86/x86_64 SIMD detection
#if defined(__x86_64__) || defined(__i386__) || defined(_M_IX86) || defined(_M_X64)
#define HAVE_SIMD_X86 1

// SSE4.2 support (Intel Core i7+, AMD Bulldozer+)
// Enabled automatically when compiler has -msse4.2 flag (GCC/Clang then
// define __SSE4_2__; see the try_cflags probe in extconf.rb).
#if defined(__SSE4_2__)
#define HAVE_SIMD_SSE4_2 1
#include <nmmintrin.h>
#endif

// SSE2 support (fallback for older x86_64 CPUs - all x86_64 CPUs support SSE2)
// NOTE(review): MSVC defines neither __SSE4_2__ nor __SSE2__ on x64, so both
// paths stay disabled under MSVC; the scanners rely on GCC/Clang __builtin_*
// intrinsics anyway, so this looks intentional — confirm.
#if defined(__SSE2__) && !defined(HAVE_SIMD_SSE4_2)
#define HAVE_SIMD_SSE2 1
#include <emmintrin.h>
#endif

#endif // x86/x86_64

// ARM NEON detection
// NOTE(review): SIMD_MINIMUM_THRESHOLD is only defined on NEON targets and no
// NEON scanner is selected in oj_scanner_init yet — forward provisioning.
#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(__aarch64__) || defined(_M_ARM64)
#define HAVE_SIMD_NEON 1
#define SIMD_MINIMUM_THRESHOLD 6
#include <arm_neon.h>
#endif

// Define which SIMD implementation to use (priority order: SSE4.2 > NEON > SSE2)
// Exactly one SIMD_TYPE string is defined; HAVE_SIMD_STRING_SCAN is set only
// when some SIMD implementation is available.
#if defined(HAVE_SIMD_SSE4_2)
#define HAVE_SIMD_STRING_SCAN 1
#define SIMD_TYPE "SSE4.2"
#elif defined(HAVE_SIMD_NEON)
#define HAVE_SIMD_STRING_SCAN 1
#define SIMD_TYPE "NEON"
#elif defined(HAVE_SIMD_SSE2)
#define HAVE_SIMD_STRING_SCAN 1
#define SIMD_TYPE "SSE2"
#else
#define SIMD_TYPE "none"
#endif

#endif /* OJ_SIMD_H */
Loading