Skip to content

Commit 6589cbe

Browse files
mmozeikoryanfleury
authored andcommitted
update third_party/martins_hash with latest code
1 parent 4bc6240 commit 6589cbe

File tree

4 files changed

+843
-251
lines changed

4 files changed

+843
-251
lines changed

src/third_party/martins_hash/md5.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,9 @@ static inline void md5_finish(md5_ctx* ctx, uint8_t digest[MD5_DIGEST_SIZE]);
4646
#endif
4747

4848
#if defined(_MSC_VER)
49-
# define MD5_GET32LE(ptr) *((const _UNALIGNED uint32_t*)(ptr))
50-
# define MD5_SET32LE(ptr,x) *((_UNALIGNED uint32_t*)(ptr)) = (x)
51-
# define MD5_SET64LE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = (x)
49+
# define MD5_GET32LE(ptr) *((const __unaligned uint32_t*)(ptr))
50+
# define MD5_SET32LE(ptr,x) *((__unaligned uint32_t*)(ptr)) = (x)
51+
# define MD5_SET64LE(ptr,x) *((__unaligned uint64_t*)(ptr)) = (x)
5252
#else
5353
# define MD5_GET32LE(ptr) \
5454
( \
@@ -431,5 +431,5 @@ void md5_finish(md5_ctx* ctx, uint8_t digest[MD5_DIGEST_SIZE])
431431
}
432432

433433
#if defined(__clang__)
434-
#pragma clang diagnostic pop
434+
# pragma clang diagnostic pop
435435
#endif

src/third_party/martins_hash/sha1.h

Lines changed: 232 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -50,9 +50,9 @@ static inline void sha1_finish(sha1_ctx* ctx, uint8_t digest[SHA1_DIGEST_SIZE]);
5050

5151
#if defined(_MSC_VER)
5252
# include <stdlib.h>
53-
# define SHA1_GET32BE(ptr) _byteswap_ulong( *((const _UNALIGNED uint32_t*)(ptr)) )
54-
# define SHA1_SET32BE(ptr,x) *((_UNALIGNED uint32_t*)(ptr)) = _byteswap_ulong(x)
55-
# define SHA1_SET64BE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = _byteswap_uint64(x)
53+
# define SHA1_GET32BE(ptr) _byteswap_ulong( *((const __unaligned uint32_t*)(ptr)) )
54+
# define SHA1_SET32BE(ptr,x) *((__unaligned uint32_t*)(ptr)) = _byteswap_ulong(x)
55+
# define SHA1_SET64BE(ptr,x) *((__unaligned uint64_t*)(ptr)) = _byteswap_uint64(x)
5656
#else
5757
# define SHA1_GET32BE(ptr) \
5858
( \
@@ -137,36 +137,86 @@ static inline int sha1_cpuid(void)
137137
SHA1_TARGET("ssse3,sha")
138138
static void sha1_process_shani(uint32_t* state, const uint8_t* block, size_t count)
139139
{
140-
const __m128i* buffer = (const __m128i*)block;
140+
// in SHA1 each round has two parts:
141+
// 1) calculate message schedule dwords in w[i]
142+
// 2) do round functions to update a/b/c/d/e state values using w[i]
141143

142-
// for performing two operations in one:
143-
// 1) dwords need to be loaded as big-endian
144-
// 2) order of dwords need to be reversed for sha instructions: [0,1,2,3] -> [3,2,1,0]
145-
const __m128i bswap = _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
144+
// w[i] in first 16 rounds is just loaded from block bytes, as 32-bit big-endian load
145+
146+
// for next rounds it is done as:
147+
// w[i] = ROL(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16])
148+
// where ROL(x) = 32-bit rotate left by 1
149+
150+
// this means it is possible to keep just the last 16 of w's in circular buffer
151+
// and every new w calculated will need to update 1 to 3 previous w's
152+
153+
// unrolling round calculations by 4 we get:
154+
// w[i+0] = ROL(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16])
155+
// w[i+1] = ROL(w[i-2] ^ w[i-7] ^ w[i-13] ^ w[i-15])
156+
// w[i+2] = ROL(w[i-1] ^ w[i-6] ^ w[i-12] ^ w[i-14])
157+
// w[i+3] = ROL(w[i+0] ^ w[i-5] ^ w[i-11] ^ w[i-13])
158+
159+
// now if you store 4 w[..] values in 128-bit SSE register, then
160+
// W(i) = ROL( r0 ^ r1 ^ r2 ^ r3 )
161+
// with caveat that r0 lane 3 depends on W(i) lane 0
162+
163+
// [3] [2] [1] [0] // lanes
164+
// r0 = [ special, w[i-1], w[i-2], w[i-3] ]
165+
// r1 = [ w[i-5], w[i-6], w[i-7], w[i-8] ]
166+
// r2 = [ w[i-11], w[i-12], w[i-13], w[i-14] ]
167+
// r3 = [ w[i-13], w[i-14], w[i-15], w[i-16] ]
168+
169+
// in each 4-round i'th step it is possible to incrementally update new W(..) value when
170+
// keeping W(i) values in 4 xmm element circular buffer
171+
172+
// rounds i>0: W(i-1) = r2 ^ r3 = _mm_sha1msg1_epu32(W(i-1), W(i))
173+
// rounds i>1: W(i-2) = W(i-2) ^ r1 = _mm_xor_si128 (W(i-2), W(i))
174+
// rounds i>2: W(i-3) = ROL(W(i-3) ^ r0) = _mm_sha1msg2_epu32(W(i-3), W(i))
175+
// then the new W(i) can be used in round function calculations
176+
// _mm_sha1msg2_epu32 correctly handles r0 lane 3 dependency on W(i) lane 0
177+
178+
// to perform round functions on two SIMD registers with state as:
179+
// abcd = [a,b,c,d]
180+
// e0 = [e,0,0,0]
181+
// use the following code to get next abcd/e0 state 4 rounds at a time:
182+
183+
// tmp = _mm_sha1nexte_epu32(e0, W(i)) // rotates e0 and adds message dwords
184+
// abcd_next = _mm_sha1rnds4_epu32(abcd, tmp, Fn) // with Fn = 0..3 round function selection
185+
// e0_next = abcd
186+
187+
// sha1nexte is not needed on first round, just regular add32(e0, W(i)) should be used
188+
// after last round need to do extra rotation, which sha1nexte takes care when adding to last_e0
146189

147190
#define W(i) w[(i)%4]
148191

149192
// 4 wide round calculations
150193
#define QROUND(i) do { \
151-
/* first four rounds loads input message */ \
194+
/* first 4 rounds load input block */ \
152195
if (i < 4) W(i) = _mm_shuffle_epi8(_mm_loadu_si128(&buffer[i]), bswap); \
153-
/* update previous message dwords for next rounds */ \
196+
/* update message schedule */ \
154197
if (i > 0 && i < 17) W(i-1) = _mm_sha1msg1_epu32(W(i-1), W(i)); \
155-
if (i > 1 && i < 18) W(i-2) = _mm_xor_si128(W(i-2), W(i)); \
198+
if (i > 1 && i < 18) W(i-2) = _mm_xor_si128 (W(i-2), W(i)); \
156199
if (i > 2 && i < 19) W(i-3) = _mm_sha1msg2_epu32(W(i-3), W(i)); \
157-
/* calculate E from message dwords */ \
158-
if (i == 0) tmp = _mm_add_epi32(e0, W(i)); \
200+
/* calculate E plus message schedule */ \
201+
if (i == 0) tmp = _mm_add_epi32 (e0, W(i)); \
159202
if (i != 0) tmp = _mm_sha1nexte_epu32(e0, W(i)); \
160-
/* round function */ \
203+
/* 4 round functions */ \
161204
e0 = abcd; \
162-
abcd = _mm_sha1rnds4_epu32(abcd, tmp, (i/5)%4); \
205+
abcd = _mm_sha1rnds4_epu32(abcd, tmp, i/5); \
163206
} while(0)
164207

208+
const __m128i* buffer = (const __m128i*)block;
209+
210+
// for performing two operations in one:
211+
// 1) dwords need to be loaded as big-endian
212+
// 2) order of dwords need to be reversed for sha1 instructions: [0,1,2,3] -> [3,2,1,0]
213+
const __m128i bswap = _mm_setr_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0);
214+
165215
// load initial state
166216
__m128i abcd = _mm_loadu_si128((const __m128i*)state); // [d,c,b,a]
167217
__m128i e0 = _mm_loadu_si32(&state[4]); // [0,0,0,e]
168218

169-
// change dword order
219+
// flip dword order, to what sha1 instructions use
170220
abcd = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(0,1,2,3)); // [a,b,c,d] where a is in the top lane
171221
e0 = _mm_slli_si128(e0, 12); // [e,0,0,0] where e is in top lane
172222

@@ -183,16 +233,19 @@ static void sha1_process_shani(uint32_t* state, const uint8_t* block, size_t cou
183233
QROUND(2);
184234
QROUND(3);
185235
QROUND(4);
236+
186237
QROUND(5);
187238
QROUND(6);
188239
QROUND(7);
189240
QROUND(8);
190241
QROUND(9);
242+
191243
QROUND(10);
192244
QROUND(11);
193245
QROUND(12);
194246
QROUND(13);
195247
QROUND(14);
248+
196249
QROUND(15);
197250
QROUND(16);
198251
QROUND(17);
@@ -221,6 +274,159 @@ static void sha1_process_shani(uint32_t* state, const uint8_t* block, size_t cou
221274

222275
#endif // defined(__x86_64__) || defined(_M_AMD64)
223276

277+
278+
#if defined(__aarch64__) || defined(_M_ARM64)
279+
280+
#if defined(__clang__)
281+
# define SHA1_TARGET __attribute__((target("sha2")))
282+
#elif defined(__GNUC__)
283+
# define SHA1_TARGET __attribute__((target("+sha2")))
284+
#elif defined(_MSC_VER)
285+
# define SHA1_TARGET
286+
#endif
287+
288+
#include <arm_neon.h>
289+
290+
#if defined(_WIN32)
291+
# include <windows.h>
292+
#elif defined(__linux__)
293+
# include <sys/auxv.h>
294+
# include <asm/hwcap.h>
295+
#elif defined(__APPLE__)
296+
# include <sys/sysctl.h>
297+
#endif
298+
299+
#define SHA1_CPUID_INIT (1 << 0)
300+
#define SHA1_CPUID_ARM64 (1 << 1)
301+
302+
static inline int sha1_cpuid(void)
303+
{
304+
#if defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_SHA2)
305+
int result = SHA1_CPUID_ARM64;
306+
#else
307+
static int cpuid;
308+
309+
int result = cpuid;
310+
if (result == 0)
311+
{
312+
#if defined(_WIN32)
313+
int has_arm64 = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
314+
#elif defined(__linux__)
315+
unsigned long hwcap = getauxval(AT_HWCAP);
316+
int has_arm64 = hwcap & HWCAP_SHA1;
317+
#elif defined(__APPLE__)
318+
int value = 0;
319+
size_t valuelen = sizeof(value);
320+
int has_arm64 = sysctlbyname("hw.optional.arm.FEAT_SHA1", &value, &valuelen, NULL, 0) == 0 && value != 0;
321+
#else
322+
#error unknown platform
323+
#endif
324+
result |= SHA1_CPUID_INIT;
325+
if (has_arm64)
326+
{
327+
result |= SHA1_CPUID_ARM64;
328+
}
329+
330+
cpuid = result;
331+
}
332+
#endif
333+
334+
#if defined(SHA1_CPUID_MASK)
335+
result &= SHA1_CPUID_MASK;
336+
#endif
337+
338+
return result;
339+
}
340+
341+
SHA1_TARGET
342+
static void sha1_process_arm64(uint32_t* state, const uint8_t* block, size_t count)
343+
{
344+
// code here is similar to x64 shani implementation
345+
346+
// message array is 16 element circular buffer
347+
// each iteration updates 4 rounds at the same time
348+
349+
#define W(i) w[(i)%4]
350+
351+
#define QROUND(i,F,k) do { \
352+
/* update message schedule */ \
353+
if (i >= 4) W(i) = vsha1su0q_u32(W(i), W(i-3), W(i-2)); \
354+
if (i >= 4) W(i) = vsha1su1q_u32(W(i), W(i-1)); \
355+
/* add round constant */ \
356+
uint32x4_t tmp = vaddq_u32(W(i), k); \
357+
/* 4 round functions */ \
358+
uint32_t x = e0; \
359+
e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0)); \
360+
abcd = F(abcd, x, tmp); \
361+
} while (0)
362+
363+
const uint32x4_t k0 = vdupq_n_u32(0x5a827999);
364+
const uint32x4_t k1 = vdupq_n_u32(0x6ed9eba1);
365+
const uint32x4_t k2 = vdupq_n_u32(0x8f1bbcdc);
366+
const uint32x4_t k3 = vdupq_n_u32(0xca62c1d6);
367+
368+
// load state - a,b,c,d,e
369+
uint32x4_t abcd = vld1q_u32(state);
370+
uint32_t e0 = state[4];
371+
372+
do
373+
{
374+
// remember current state
375+
uint32x4_t last_abcd = abcd;
376+
uint32_t last_e0 = e0;
377+
378+
// load 64-byte block and advance pointer to next block
379+
uint8x16x4_t msg = vld1q_u8_x4(block);
380+
block += SHA1_BLOCK_SIZE;
381+
382+
uint32x4_t w[4];
383+
384+
// for first 16 w's reverse the byte order in each 32-bit lane
385+
W(0) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[0]));
386+
W(1) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[1]));
387+
W(2) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[2]));
388+
W(3) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[3]));
389+
390+
QROUND( 0, vsha1cq_u32, k0);
391+
QROUND( 1, vsha1cq_u32, k0);
392+
QROUND( 2, vsha1cq_u32, k0);
393+
QROUND( 3, vsha1cq_u32, k0);
394+
QROUND( 4, vsha1cq_u32, k0);
395+
396+
QROUND( 5, vsha1pq_u32, k1);
397+
QROUND( 6, vsha1pq_u32, k1);
398+
QROUND( 7, vsha1pq_u32, k1);
399+
QROUND( 8, vsha1pq_u32, k1);
400+
QROUND( 9, vsha1pq_u32, k1);
401+
402+
QROUND(10, vsha1mq_u32, k2);
403+
QROUND(11, vsha1mq_u32, k2);
404+
QROUND(12, vsha1mq_u32, k2);
405+
QROUND(13, vsha1mq_u32, k2);
406+
QROUND(14, vsha1mq_u32, k2);
407+
408+
QROUND(15, vsha1pq_u32, k3);
409+
QROUND(16, vsha1pq_u32, k3);
410+
QROUND(17, vsha1pq_u32, k3);
411+
QROUND(18, vsha1pq_u32, k3);
412+
QROUND(19, vsha1pq_u32, k3);
413+
414+
// update next state
415+
abcd = vaddq_u32(abcd, last_abcd);
416+
e0 += last_e0;
417+
}
418+
while (--count);
419+
420+
// save state
421+
vst1q_u32(state, abcd);
422+
state[4] = e0;
423+
424+
#undef QROUND
425+
#undef W
426+
}
427+
428+
#endif // defined(__aarch64__) || defined(_M_ARM64)
429+
224430
static void sha1_process(uint32_t* state, const uint8_t* block, size_t count)
225431
{
226432
#if defined(__x86_64__) || defined(_M_AMD64)
@@ -232,12 +438,21 @@ static void sha1_process(uint32_t* state, const uint8_t* block, size_t count)
232438
}
233439
#endif
234440

441+
#if defined(__aarch64__) || defined(_M_ARM64)
442+
int cpuid = sha1_cpuid();
443+
if (cpuid & SHA1_CPUID_ARM64)
444+
{
445+
sha1_process_arm64(state, block, count);
446+
return;
447+
}
448+
#endif
449+
235450
#define F1(x,y,z) (0x5a827999 + ((x & (y ^ z)) ^ z))
236451
#define F2(x,y,z) (0x6ed9eba1 + (x ^ y ^ z))
237452
#define F3(x,y,z) (0x8f1bbcdc + ((x & y) | (z & (x | y))))
238453
#define F4(x,y,z) (0xca62c1d6 + (x ^ y ^ z))
239454

240-
#define W(i) w[(i+16)%16]
455+
#define W(i) w[(i)%16]
241456

242457
#define ROUND(i,a,b,c,d,e,F) do \
243458
{ \

0 commit comments

Comments
 (0)