Skip to content
Open
157 changes: 148 additions & 9 deletions ext/oj/dump.c
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,50 @@ void initialize_neon(void) {
}
#endif

#ifdef OJ_USE_SSE4_2

static size_t (*hibit_friendly_size_simd)(const uint8_t *str, size_t len) = NULL;
static __m128i hibit_friendly_chars_sse42[8];

#define SIMD_TARGET __attribute__((target("sse4.2,ssse3")))

// From: https://stackoverflow.com/questions/36998538/fastest-way-to-horizontally-sum-sse-unsigned-byte-vector
inline uint32_t _mm_sum_epu8(const __m128i v) {
__m128i vsum = _mm_sad_epu8(v, _mm_setzero_si128());
return _mm_cvtsi128_si32(vsum) + _mm_extract_epi16(vsum, 4);
}

inline static SIMD_TARGET size_t hibit_friendly_size_sse42(const uint8_t *str, size_t len) {
size_t size = 0;
size_t i = 0;

for (; i + sizeof(__m128i) <= len; i += sizeof(__m128i), str += sizeof(__m128i)) {
size += sizeof(__m128i);

__m128i chunk = _mm_loadu_si128((__m128i *)str);
__m128i tmp = vector_lookup_sse42(chunk, hibit_friendly_chars_sse42, 8);
size += _mm_sum_epu8(tmp);
}
size_t total = size + calculate_string_size(str, len - i, hibit_friendly_chars);
return total;
}

void SIMD_TARGET initialize_sse42(void) {
hibit_friendly_size_simd = hibit_friendly_size_sse42;

for (int i = 0; i < 8; i++) {
hibit_friendly_chars_sse42[i] = _mm_sub_epi8(
_mm_loadu_si128((__m128i *)(hibit_friendly_chars + i * sizeof(__m128i))),
_mm_set1_epi8('1'));
}
}

#else

#define SIMD_TARGET

#endif /* OJ_USE_SSE4_2 */

inline static size_t hibit_friendly_size(const uint8_t *str, size_t len) {
#ifdef HAVE_SIMD_NEON
size_t size = 0;
Expand All @@ -220,6 +264,13 @@ inline static size_t hibit_friendly_size(const uint8_t *str, size_t len) {

size_t total = size + calculate_string_size(str, len - i, hibit_friendly_chars);
return total;
#elif defined(OJ_USE_SSE4_2)
if (len >= sizeof(__m128i)) {
if (hibit_friendly_size_simd != NULL) {
return hibit_friendly_size_simd(str, len);
}
}
return calculate_string_size(str, len, hibit_friendly_chars);
#else
return calculate_string_size(str, len, hibit_friendly_chars);
#endif
Expand Down Expand Up @@ -944,6 +995,34 @@ neon_update(const char *str, uint8x16x4_t *cmap_neon, int neon_table_size, bool
return result;
}

#elif defined(OJ_USE_SSE4_2)
typedef struct _sse42_match_result {
__m128i actions;
bool needs_escape;
int escape_mask;
bool has_some_hibit;
bool do_unicode_validation;
} sse42_match_result;

static inline SIMD_TARGET sse42_match_result
sse42_update(const char *str, __m128i *cmap_sse42, int sse42_tab_size, bool do_unicode_validation, bool has_hi) {
sse42_match_result result = {.has_some_hibit = false, .do_unicode_validation = false};

__m128i chunk = _mm_loadu_si128((__m128i *)str);
__m128i actions = vector_lookup_sse42(chunk, cmap_sse42, sse42_tab_size);
__m128i needs_escape = _mm_xor_si128(_mm_cmpeq_epi8(actions, _mm_setzero_si128()), _mm_set1_epi8(0xFF));
result.actions = _mm_add_epi8(actions, _mm_set1_epi8('1'));

result.escape_mask = _mm_movemask_epi8(needs_escape);
result.needs_escape = result.escape_mask != 0;
if (has_hi && do_unicode_validation) {
__m128i has_some_hibit = _mm_and_si128(chunk, _mm_set1_epi8(0x80));
result.has_some_hibit = _mm_movemask_epi8(has_some_hibit) != 0;
result.do_unicode_validation = has_hi && do_unicode_validation && result.has_some_hibit;
}
return result;
}

#endif /* HAVE_SIMD_NEON */

static inline FORCE_INLINE const char *process_character(char action,
Expand Down Expand Up @@ -1023,6 +1102,9 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
#ifdef HAVE_SIMD_NEON
uint8x16x4_t *cmap_neon = NULL;
int neon_table_size;
#elif defined(OJ_USE_SSE4_2)
__m128i *cmap_sse42 = NULL;
int sse42_tab_size;
#endif /* HAVE_SIMD_NEON */
const char *orig = str;
bool has_hi = false;
Expand Down Expand Up @@ -1091,6 +1173,9 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
#ifdef HAVE_SIMD_NEON
cmap_neon = hibit_friendly_chars_neon;
neon_table_size = 2;
#elif defined(OJ_USE_SSE4_2)
cmap_sse42 = hibit_friendly_chars_sse42;
sse42_tab_size = 8;
#endif /* HAVE_NEON_SIMD */
size = hibit_friendly_size((uint8_t *)str, cnt);
}
Expand Down Expand Up @@ -1118,21 +1203,29 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
if (is_sym) {
*out->cur++ = ':';
}
#ifdef HAVE_SIMD_NEON
const char *chunk_start;
const char *chunk_end;
const char *cursor = str;
bool use_neon = (cmap_neon != NULL && cnt >= (sizeof(uint8x16_t))) ? true : false;
char matches[16];

#if defined(HAVE_SIMD_NEON) || defined(OJ_USE_SSE4_2)

#define SEARCH_FLUSH \
if (str > cursor) { \
APPEND_CHARS(out->cur, cursor, str - cursor); \
cursor = str; \
}

#endif /* HAVE_SIMD_NEON */
const char *chunk_start;
const char *chunk_end;
const char *cursor = str;
char matches[16];
#endif /* HAVE_SIMD_NEON || OJ_USE_SSE4_2 */

#if defined(HAVE_SIMD_NEON)
bool use_simd = (cmap_neon != NULL && cnt >= (sizeof(uint8x16_t))) ? true : false;
#elif defined(OJ_USE_SSE4_2)
bool use_simd = (cmap_sse42 != NULL && cnt >= (sizeof(__m128i))) ? true : false;
#endif

#ifdef HAVE_SIMD_NEON
if (use_neon) {
if (use_simd) {
while (str < end) {
const char *chunk_ptr = NULL;
if (str + sizeof(uint8x16_t) <= end) {
Expand Down Expand Up @@ -1195,7 +1288,53 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
}
SEARCH_FLUSH;
}
#endif /* HAVE_SIMD_NEON */
#endif

#ifdef OJ_USE_SSE4_2
if (use_simd) {
while (str < end) {
const char *chunk_ptr = NULL;
if (str + sizeof(__m128i) <= end) {
chunk_ptr = str;
chunk_start = str;
chunk_end = str + sizeof(__m128i);
} else if ((end - str) >= SIMD_MINIMUM_THRESHOLD) {
memset(out->cur, 'A', sizeof(__m128i));
memcpy(out->cur, str, (end - str));
chunk_ptr = out->cur;
chunk_start = str;
chunk_end = end;
} else {
break;
}
sse42_match_result result = sse42_update(chunk_ptr,
cmap_sse42,
sse42_tab_size,
do_unicode_validation,
has_hi);
if ((result.do_unicode_validation) || result.needs_escape) {
SEARCH_FLUSH;
_mm_storeu_si128((__m128i *)matches, result.actions);
while (str < chunk_end) {
long match_index = str - chunk_start;
str = process_character(matches[match_index],
str,
end,
out,
orig,
do_unicode_validation,
&check_start);
str++;
}
cursor = str;
continue;
}
str = chunk_end;
}
SEARCH_FLUSH;
}
#endif /* OJ_USE_SSE4_2 */

for (; str < end; str++) {
str = process_character(cmap[(uint8_t)*str], str, end, out, orig, do_unicode_validation, &check_start);
}
Expand Down
4 changes: 4 additions & 0 deletions ext/oj/oj.c
Original file line number Diff line number Diff line change
Expand Up @@ -2086,4 +2086,8 @@ void Init_oj(void) {
#ifdef HAVE_SIMD_NEON
initialize_neon();
#endif /* HAVE_SIMD_NEON */

#ifdef OJ_USE_SSE4_2
initialize_sse42();
#endif /* OJ_USE_SSE4_2 */
}
37 changes: 37 additions & 0 deletions ext/oj/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,41 @@
#include <arm_neon.h>
#endif

#if defined(__x86_64__) && (defined(__GNUC__) || defined(__clang__))

#if defined(OJ_USE_SSE4_2)
#define SIMD_MINIMUM_THRESHOLD 6
#include <nmmintrin.h>

extern void initialize_sse42(void);

static inline __attribute__((target("sse4.2,ssse3"))) __m128i vector_lookup_sse42(__m128i input,
__m128i *lookup_table,
int tab_size) {
// Extract high 4 bits to determine which 16-byte chunk (0-15)
__m128i hi_index = _mm_and_si128(_mm_srli_epi32(input, 4), _mm_set1_epi8(0x0F));

// Extract low 4 bits for index within the chunk (0-15)
__m128i low_index = _mm_and_si128(input, _mm_set1_epi8(0x0F));

// Perform lookups in all 16 tables
__m128i results[16];
for (int i = 0; i < tab_size; i++) {
results[i] = _mm_shuffle_epi8(lookup_table[i], low_index);
}

// Create masks for each chunk and blend results
__m128i final_result = _mm_setzero_si128();

for (int i = 0; i < tab_size; i++) {
__m128i mask = _mm_cmpeq_epi8(hi_index, _mm_set1_epi8(i));
__m128i masked_result = _mm_and_si128(mask, results[i]);
final_result = _mm_or_si128(final_result, masked_result);
}

return final_result;
}

#endif /* defined(HAVE_X86INTRIN_H) && defined(HAVE_TYPE___M128I) */
#endif
#endif /* OJ_SIMD_H */