@@ -50,9 +50,9 @@ static inline void sha1_finish(sha1_ctx* ctx, uint8_t digest[SHA1_DIGEST_SIZE]);
 
 #if defined(_MSC_VER)
 # include <stdlib.h>
-# define SHA1_GET32BE(ptr)   _byteswap_ulong(*((const _UNALIGNED uint32_t*)(ptr)))
-# define SHA1_SET32BE(ptr,x) *((_UNALIGNED uint32_t*)(ptr)) = _byteswap_ulong(x)
-# define SHA1_SET64BE(ptr,x) *((_UNALIGNED uint64_t*)(ptr)) = _byteswap_uint64(x)
+# define SHA1_GET32BE(ptr)   _byteswap_ulong(*((const __unaligned uint32_t*)(ptr)))
+# define SHA1_SET32BE(ptr,x) *((__unaligned uint32_t*)(ptr)) = _byteswap_ulong(x)
+# define SHA1_SET64BE(ptr,x) *((__unaligned uint64_t*)(ptr)) = _byteswap_uint64(x)
 #else
 # define SHA1_GET32BE(ptr) \
     ( \
@@ -137,36 +137,86 @@ static inline int sha1_cpuid(void)
 SHA1_TARGET("ssse3,sha")
 static void sha1_process_shani(uint32_t* state, const uint8_t* block, size_t count)
 {
-    const __m128i* buffer = (const __m128i*)block;
+    // in SHA1 each round has two parts:
+    // 1) calculate message schedule dwords in w[i]
+    // 2) do round functions to update the a/b/c/d/e state values using w[i]
 
-    // for performing two operations in one:
-    // 1) dwords need to be loaded as big-endian
-    // 2) order of dwords need to be reversed for sha instructions: [0,1,2,3] -> [3,2,1,0]
-    const __m128i bswap = _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
+    // in the first 16 rounds w[i] is just loaded from the block bytes, as a 32-bit big-endian load
+
+    // for the following rounds it is calculated as:
+    // w[i] = ROL(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16])
+    // where ROL(x) = 32-bit rotate left by 1
+
+    // this means it is enough to keep just the last 16 w's in a circular buffer,
+    // and every newly calculated w needs 1 to 3 of the previous w's to be updated
+
+    // unrolling the round calculations by 4 gives:
+    // w[i+0] = ROL(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16])
+    // w[i+1] = ROL(w[i-2] ^ w[i-7] ^ w[i-13] ^ w[i-15])
+    // w[i+2] = ROL(w[i-1] ^ w[i-6] ^ w[i-12] ^ w[i-14])
+    // w[i+3] = ROL(w[i+0] ^ w[i-5] ^ w[i-11] ^ w[i-13])
+
+    // now if you store 4 w[..] values in a 128-bit SSE register, then
+    // W(i) = ROL(r0 ^ r1 ^ r2 ^ r3)
+    // with the caveat that r0 lane 3 depends on W(i) lane 0
+
+    //        [3]      [2]      [1]      [0]      // lanes
+    // r0 = [ special, w[i-1],  w[i-2],  w[i-3]  ]
+    // r1 = [ w[i-5],  w[i-6],  w[i-7],  w[i-8]  ]
+    // r2 = [ w[i-11], w[i-12], w[i-13], w[i-14] ]
+    // r3 = [ w[i-13], w[i-14], w[i-15], w[i-16] ]
+
+    // in each 4-round step i it is possible to build the new W(..) value incrementally while
+    // keeping the W(i) values in a 4-element xmm circular buffer
+
+    // rounds i>0: W(i-1) = r2 ^ r3          = _mm_sha1msg1_epu32(W(i-1), W(i))
+    // rounds i>1: W(i-2) = W(i-2) ^ r1      = _mm_xor_si128(W(i-2), W(i))
+    // rounds i>2: W(i-3) = ROL(W(i-3) ^ r0) = _mm_sha1msg2_epu32(W(i-3), W(i))
+    // then the new W(i) can be used in the round function calculations
+    // _mm_sha1msg2_epu32 correctly handles the r0 lane 3 dependency on W(i) lane 0
+
+    // to perform the round functions on two SIMD registers holding the state as:
+    // abcd = [a,b,c,d]
+    // e0   = [e,0,0,0]
+    // use the following to get the next abcd/e0 state 4 rounds at a time:
+
+    // tmp       = _mm_sha1nexte_epu32(e0, W(i))      // rotates e0 and adds message dwords
+    // abcd_next = _mm_sha1rnds4_epu32(abcd, tmp, Fn) // with Fn = 0..3 round function selection
+    // e0_next   = abcd
+
+    // sha1nexte is not needed on the first round, a regular add32(e0, W(i)) is used there instead
+    // after the last round an extra rotation is needed, which sha1nexte takes care of when adding to last_e0
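+
+    // for reference, a minimal scalar sketch of the same message schedule, assuming all 80 dwords
+    // were kept in a flat w[80] array instead of the circular buffers used below:
+    //
+    //     for (int i = 16; i < 80; i++) {
+    //         uint32_t x = w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16];
+    //         w[i] = (x << 1) | (x >> 31); // ROL by 1
+    //     }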
 
     #define W(i) w[(i)%4]
 
     // 4 wide round calculations
     #define QROUND(i) do { \
-        /* first four rounds loads input message */ \
+        /* first 4 rounds load input block */ \
         if (i < 4) W(i) = _mm_shuffle_epi8(_mm_loadu_si128(&buffer[i]), bswap); \
-        /* update previous message dwords for next rounds */ \
+        /* update message schedule */ \
         if (i > 0 && i < 17) W(i-1) = _mm_sha1msg1_epu32(W(i-1), W(i)); \
-        if (i > 1 && i < 18) W(i-2) = _mm_xor_si128(W(i-2), W(i)); \
+        if (i > 1 && i < 18) W(i-2) = _mm_xor_si128(W(i-2), W(i)); \
         if (i > 2 && i < 19) W(i-3) = _mm_sha1msg2_epu32(W(i-3), W(i)); \
-        /* calculate E from message dwords */ \
-        if (i == 0) tmp = _mm_add_epi32(e0, W(i)); \
+        /* calculate E plus message schedule */ \
+        if (i == 0) tmp = _mm_add_epi32(e0, W(i)); \
         if (i != 0) tmp = _mm_sha1nexte_epu32(e0, W(i)); \
-        /* round function */ \
+        /* 4 round functions */ \
         e0 = abcd; \
-        abcd = _mm_sha1rnds4_epu32(abcd, tmp, (i/5)%4); \
+        abcd = _mm_sha1rnds4_epu32(abcd, tmp, i/5); \
     } while (0)
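+
+    // note: the immediate (0..3) passed to _mm_sha1rnds4_epu32 selects which of the four SHA1
+    // round functions and constants the 4 rounds use; QROUND is invoked 20 times below, so i/5
+    // maps each group of five invocations (20 rounds each) to selectors 0, 1, 2 and 3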
 
+    const __m128i* buffer = (const __m128i*)block;
+
+    // for performing two operations in one:
+    // 1) dwords need to be loaded as big-endian
+    // 2) the order of dwords needs to be reversed for the sha1 instructions: [0,1,2,3] -> [3,2,1,0]
+    const __m128i bswap = _mm_setr_epi8(15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0);
+
     // load initial state
     __m128i abcd = _mm_loadu_si128((const __m128i*)state); // [d,c,b,a]
     __m128i e0 = _mm_loadu_si32(&state[4]);                // [0,0,0,e]
 
-    // change dword order
+    // flip dword order to what the sha1 instructions use
     abcd = _mm_shuffle_epi32(abcd, _MM_SHUFFLE(0,1,2,3)); // [a,b,c,d] where a is in the top lane
     e0 = _mm_slli_si128(e0, 12);                          // [e,0,0,0] where e is in the top lane
 
@@ -183,16 +233,19 @@ static void sha1_process_shani(uint32_t* state, const uint8_t* block, size_t count)
         QROUND(2);
         QROUND(3);
         QROUND(4);
+
         QROUND(5);
         QROUND(6);
         QROUND(7);
         QROUND(8);
         QROUND(9);
+
         QROUND(10);
         QROUND(11);
         QROUND(12);
         QROUND(13);
         QROUND(14);
+
         QROUND(15);
         QROUND(16);
         QROUND(17);
@@ -221,6 +274,159 @@ static void sha1_process_shani(uint32_t* state, const uint8_t* block, size_t count)
 
 #endif // defined(__x86_64__) || defined(_M_AMD64)
 
+
+#if defined(__aarch64__) || defined(_M_ARM64)
+
+#if defined(__clang__)
+# define SHA1_TARGET __attribute__((target("sha2")))
+#elif defined(__GNUC__)
+# define SHA1_TARGET __attribute__((target("+sha2")))
+#elif defined(_MSC_VER)
+# define SHA1_TARGET
+#endif
+
+#include <arm_neon.h>
+
+#if defined(_WIN32)
+# include <windows.h>
+#elif defined(__linux__)
+# include <sys/auxv.h>
+# include <asm/hwcap.h>
+#elif defined(__APPLE__)
+# include <sys/sysctl.h>
+#endif
+
+#define SHA1_CPUID_INIT  (1 << 0)
+#define SHA1_CPUID_ARM64 (1 << 1)
+
+static inline int sha1_cpuid(void)
+{
+#if defined(__ARM_FEATURE_CRYPTO) || defined(__ARM_FEATURE_SHA2)
+    int result = SHA1_CPUID_ARM64;
+#else
+    static int cpuid;
+
+    int result = cpuid;
+    if (result == 0)
+    {
+#if defined(_WIN32)
+        int has_arm64 = IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE);
+#elif defined(__linux__)
+        unsigned long hwcap = getauxval(AT_HWCAP);
+        int has_arm64 = hwcap & HWCAP_SHA1;
+#elif defined(__APPLE__)
+        int value = 0;
+        size_t valuelen = sizeof(value);
+        int has_arm64 = sysctlbyname("hw.optional.arm.FEAT_SHA1", &value, &valuelen, NULL, 0) == 0 && value != 0;
+#else
+#error unknown platform
+#endif
+        result |= SHA1_CPUID_INIT;
+        if (has_arm64)
+        {
+            result |= SHA1_CPUID_ARM64;
+        }
+
+        cpuid = result;
+    }
+#endif
+
+#if defined(SHA1_CPUID_MASK)
+    result &= SHA1_CPUID_MASK;
+#endif
+
+    return result;
+}
+
+SHA1_TARGET
+static void sha1_process_arm64(uint32_t* state, const uint8_t* block, size_t count)
+{
+    // the code here is similar to the x64 shani implementation
+
+    // the message array is a 16-dword circular buffer (four uint32x4_t registers)
+    // each iteration updates 4 rounds at the same time
+
+    #define W(i) w[(i)%4]
+
+    #define QROUND(i,F,k) do { \
+        /* update message schedule */ \
+        if (i >= 4) W(i) = vsha1su0q_u32(W(i), W(i-3), W(i-2)); \
+        if (i >= 4) W(i) = vsha1su1q_u32(W(i), W(i-1)); \
+        /* add round constant */ \
+        uint32x4_t tmp = vaddq_u32(W(i), k); \
+        /* 4 round functions */ \
+        uint32_t x = e0; \
+        e0 = vsha1h_u32(vgetq_lane_u32(abcd, 0)); \
+        abcd = F(abcd, x, tmp); \
+    } while (0)
+
+    const uint32x4_t k0 = vdupq_n_u32(0x5a827999);
+    const uint32x4_t k1 = vdupq_n_u32(0x6ed9eba1);
+    const uint32x4_t k2 = vdupq_n_u32(0x8f1bbcdc);
+    const uint32x4_t k3 = vdupq_n_u32(0xca62c1d6);
+
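+    // the four constants above and the round intrinsics below follow the standard SHA1 stage layout:
+    //   rounds  0..19: vsha1cq_u32 ("choose" rounds)   with k0
+    //   rounds 20..39: vsha1pq_u32 ("parity" rounds)   with k1
+    //   rounds 40..59: vsha1mq_u32 ("majority" rounds) with k2
+    //   rounds 60..79: vsha1pq_u32 ("parity" rounds)   with k3
+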
+    // load state - a,b,c,d,e
+    uint32x4_t abcd = vld1q_u32(state);
+    uint32_t e0 = state[4];
+
+    do
+    {
+        // remember current state
+        uint32x4_t last_abcd = abcd;
+        uint32_t last_e0 = e0;
+
+        // load 64-byte block and advance pointer to next block
+        uint8x16x4_t msg = vld1q_u8_x4(block);
+        block += SHA1_BLOCK_SIZE;
+
+        uint32x4_t w[4];
+
+        // for the first 16 w's reverse the byte order in each 32-bit lane
+        W(0) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[0]));
+        W(1) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[1]));
+        W(2) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[2]));
+        W(3) = vreinterpretq_u32_u8(vrev32q_u8(msg.val[3]));
+
+        QROUND( 0, vsha1cq_u32, k0);
+        QROUND( 1, vsha1cq_u32, k0);
+        QROUND( 2, vsha1cq_u32, k0);
+        QROUND( 3, vsha1cq_u32, k0);
+        QROUND( 4, vsha1cq_u32, k0);
+
+        QROUND( 5, vsha1pq_u32, k1);
+        QROUND( 6, vsha1pq_u32, k1);
+        QROUND( 7, vsha1pq_u32, k1);
+        QROUND( 8, vsha1pq_u32, k1);
+        QROUND( 9, vsha1pq_u32, k1);
+
+        QROUND(10, vsha1mq_u32, k2);
+        QROUND(11, vsha1mq_u32, k2);
+        QROUND(12, vsha1mq_u32, k2);
+        QROUND(13, vsha1mq_u32, k2);
+        QROUND(14, vsha1mq_u32, k2);
+
+        QROUND(15, vsha1pq_u32, k3);
+        QROUND(16, vsha1pq_u32, k3);
+        QROUND(17, vsha1pq_u32, k3);
+        QROUND(18, vsha1pq_u32, k3);
+        QROUND(19, vsha1pq_u32, k3);
+
+        // update next state
+        abcd = vaddq_u32(abcd, last_abcd);
+        e0 += last_e0;
+    }
+    while (--count);
+
+    // save state
+    vst1q_u32(state, abcd);
+    state[4] = e0;
+
+    #undef QROUND
+    #undef W
+}
+
+#endif // defined(__aarch64__) || defined(_M_ARM64)
+
 static void sha1_process(uint32_t* state, const uint8_t* block, size_t count)
 {
 #if defined(__x86_64__) || defined(_M_AMD64)
@@ -232,12 +438,21 @@ static void sha1_process(uint32_t* state, const uint8_t* block, size_t count)
     }
 #endif
 
+#if defined(__aarch64__) || defined(_M_ARM64)
+    int cpuid = sha1_cpuid();
+    if (cpuid & SHA1_CPUID_ARM64)
+    {
+        sha1_process_arm64(state, block, count);
+        return;
+    }
+#endif
+
     #define F1(x,y,z) (0x5a827999 + ((x & (y ^ z)) ^ z))
     #define F2(x,y,z) (0x6ed9eba1 + (x ^ y ^ z))
     #define F3(x,y,z) (0x8f1bbcdc + ((x & y) | (z & (x | y))))
     #define F4(x,y,z) (0xca62c1d6 + (x ^ y ^ z))
 
-    #define W(i) w[(i+16)%16]
+    #define W(i) w[(i)%16]
 
     #define ROUND(i,a,b,c,d,e,F) do \
     { \