Skip to content

Commit ba03b2f

Browse files
Optimize x86/aarch64 MD5 implementation
(Equivalent to openssl/openssl@ebe34f9) As suggested in https://github.com/animetosho/md5-optimisation?tab=readme-ov-file#dependency-shortcut-in-g-function, we can delay the dependency on 'x' by recognizing that ((x & z) | (y & ~z)) is equivalent to ((x & z) + (y & ~z)) in this scenario: the two terms are masked by z and ~z respectively, so they never have a set bit in common and the addition cannot produce a carry. We can therefore perform those additions independently, leaving our dependency on x to the final addition. This speeds it up around 5% on both platforms.
1 parent 81f138a commit ba03b2f

File tree

2 files changed

+66
-67
lines changed

2 files changed

+66
-67
lines changed

Diff for: crypto/fipsmodule/md5/asm/md5-armv8.pl

+64-64
Original file line numberDiff line numberDiff line change
@@ -216,165 +216,165 @@
216216
add w9, w9, w13 // Add constant 0x49b40821
217217
add w9, w9, w6 // Add aux function result
218218
ror w9, w9, #10 // Rotate left s=22 bits
219-
bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
219+
bic x6, x8, x17 // Aux function round 2 (~z & y)
220220
add w9, w8, w9 // Add X parameter round 1 B=FF(B, C, D, A, 0x49b40821, s=22, M[15])
221-
and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
222-
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
223221
movz x13, #0x2562 // Load lower half of constant 0xf61e2562
224222
movk x13, #0xf61e, lsl #16 // Load upper half of constant 0xf61e2562
225223
add w4, w4, w20 // Add dest value
226224
add w4, w4, w13 // Add constant 0xf61e2562
227-
add w4, w4, w6 // Add aux function result
225+
and x13, x9, x17 // Aux function round 2 (x & z)
226+
add w4, w4, w6 // Add (~z & y)
227+
add w4, w4, w13 // Add (x & z)
228228
ror w4, w4, #27 // Rotate left s=5 bits
229-
bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
229+
bic x6, x9, x8 // Aux function round 2 (~z & y)
230230
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xf61e2562, s=5, M[1])
231-
and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
232-
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
233231
movz x13, #0xb340 // Load lower half of constant 0xc040b340
234232
movk x13, #0xc040, lsl #16 // Load upper half of constant 0xc040b340
235233
add w17, w17, w7 // Add dest value
236234
add w17, w17, w13 // Add constant 0xc040b340
237-
add w17, w17, w6 // Add aux function result
235+
and x13, x4, x8 // Aux function round 2 (x & z)
236+
add w17, w17, w6 // Add (~z & y)
237+
add w17, w17, w13 // Add (x & z)
238238
ror w17, w17, #23 // Rotate left s=9 bits
239-
bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
239+
bic x6, x4, x9 // Aux function round 2 (~z & y)
240240
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc040b340, s=9, M[6])
241-
and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
242-
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
243241
movz x13, #0x5a51 // Load lower half of constant 0x265e5a51
244242
movk x13, #0x265e, lsl #16 // Load upper half of constant 0x265e5a51
245243
add w8, w8, w25 // Add dest value
246244
add w8, w8, w13 // Add constant 0x265e5a51
247-
add w8, w8, w6 // Add aux function result
245+
and x13, x17, x9 // Aux function round 2 (x & z)
246+
add w8, w8, w6 // Add (~z & y)
247+
add w8, w8, w13 // Add (x & z)
248248
ror w8, w8, #18 // Rotate left s=14 bits
249-
bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
249+
bic x6, x17, x4 // Aux function round 2 (~z & y)
250250
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x265e5a51, s=14, M[11])
251-
and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
252-
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
253251
movz x13, #0xc7aa // Load lower half of constant 0xe9b6c7aa
254252
movk x13, #0xe9b6, lsl #16 // Load upper half of constant 0xe9b6c7aa
255253
add w9, w9, w15 // Add dest value
256254
add w9, w9, w13 // Add constant 0xe9b6c7aa
257-
add w9, w9, w6 // Add aux function result
255+
and x13, x8, x4 // Aux function round 2 (x & z)
256+
add w9, w9, w6 // Add (~z & y)
257+
add w9, w9, w13 // Add (x & z)
258258
ror w9, w9, #12 // Rotate left s=20 bits
259-
bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
259+
bic x6, x8, x17 // Aux function round 2 (~z & y)
260260
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe9b6c7aa, s=20, M[0])
261-
and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
262-
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
263261
movz x13, #0x105d // Load lower half of constant 0xd62f105d
264262
movk x13, #0xd62f, lsl #16 // Load upper half of constant 0xd62f105d
265263
add w4, w4, w22 // Add dest value
266264
add w4, w4, w13 // Add constant 0xd62f105d
267-
add w4, w4, w6 // Add aux function result
265+
and x13, x9, x17 // Aux function round 2 (x & z)
266+
add w4, w4, w6 // Add (~z & y)
267+
add w4, w4, w13 // Add (x & z)
268268
ror w4, w4, #27 // Rotate left s=5 bits
269-
bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
269+
bic x6, x9, x8 // Aux function round 2 (~z & y)
270270
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xd62f105d, s=5, M[5])
271-
and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
272-
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
273271
movz x13, #0x1453 // Load lower half of constant 0x2441453
274272
movk x13, #0x244, lsl #16 // Load upper half of constant 0x2441453
275273
add w17, w17, w16 // Add dest value
276274
add w17, w17, w13 // Add constant 0x2441453
277-
add w17, w17, w6 // Add aux function result
275+
and x13, x4, x8 // Aux function round 2 (x & z)
276+
add w17, w17, w6 // Add (~z & y)
277+
add w17, w17, w13 // Add (x & z)
278278
ror w17, w17, #23 // Rotate left s=9 bits
279-
bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
279+
bic x6, x4, x9 // Aux function round 2 (~z & y)
280280
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0x2441453, s=9, M[10])
281-
and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
282-
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
283281
movz x13, #0xe681 // Load lower half of constant 0xd8a1e681
284282
movk x13, #0xd8a1, lsl #16 // Load upper half of constant 0xd8a1e681
285283
add w8, w8, w27 // Add dest value
286284
add w8, w8, w13 // Add constant 0xd8a1e681
287-
add w8, w8, w6 // Add aux function result
285+
and x13, x17, x9 // Aux function round 2 (x & z)
286+
add w8, w8, w6 // Add (~z & y)
287+
add w8, w8, w13 // Add (x & z)
288288
ror w8, w8, #18 // Rotate left s=14 bits
289-
bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
289+
bic x6, x17, x4 // Aux function round 2 (~z & y)
290290
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xd8a1e681, s=14, M[15])
291-
and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
292-
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
293291
movz x13, #0xfbc8 // Load lower half of constant 0xe7d3fbc8
294292
movk x13, #0xe7d3, lsl #16 // Load upper half of constant 0xe7d3fbc8
295293
add w9, w9, w14 // Add dest value
296294
add w9, w9, w13 // Add constant 0xe7d3fbc8
297-
add w9, w9, w6 // Add aux function result
295+
and x13, x8, x4 // Aux function round 2 (x & z)
296+
add w9, w9, w6 // Add (~z & y)
297+
add w9, w9, w13 // Add (x & z)
298298
ror w9, w9, #12 // Rotate left s=20 bits
299-
bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
299+
bic x6, x8, x17 // Aux function round 2 (~z & y)
300300
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0xe7d3fbc8, s=20, M[4])
301-
and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
302-
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
303301
movz x13, #0xcde6 // Load lower half of constant 0x21e1cde6
304302
movk x13, #0x21e1, lsl #16 // Load upper half of constant 0x21e1cde6
305303
add w4, w4, w24 // Add dest value
306304
add w4, w4, w13 // Add constant 0x21e1cde6
307-
add w4, w4, w6 // Add aux function result
305+
and x13, x9, x17 // Aux function round 2 (x & z)
306+
add w4, w4, w6 // Add (~z & y)
307+
add w4, w4, w13 // Add (x & z)
308308
ror w4, w4, #27 // Rotate left s=5 bits
309-
bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
309+
bic x6, x9, x8 // Aux function round 2 (~z & y)
310310
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0x21e1cde6, s=5, M[9])
311-
and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
312-
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
313311
movz x13, #0x7d6 // Load lower half of constant 0xc33707d6
314312
movk x13, #0xc337, lsl #16 // Load upper half of constant 0xc33707d6
315313
add w17, w17, w12 // Add dest value
316314
add w17, w17, w13 // Add constant 0xc33707d6
317-
add w17, w17, w6 // Add aux function result
315+
and x13, x4, x8 // Aux function round 2 (x & z)
316+
add w17, w17, w6 // Add (~z & y)
317+
add w17, w17, w13 // Add (x & z)
318318
ror w17, w17, #23 // Rotate left s=9 bits
319-
bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
319+
bic x6, x4, x9 // Aux function round 2 (~z & y)
320320
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xc33707d6, s=9, M[14])
321-
and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
322-
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
323321
movz x13, #0xd87 // Load lower half of constant 0xf4d50d87
324322
movk x13, #0xf4d5, lsl #16 // Load upper half of constant 0xf4d50d87
325323
add w8, w8, w21 // Add dest value
326324
add w8, w8, w13 // Add constant 0xf4d50d87
327-
add w8, w8, w6 // Add aux function result
325+
and x13, x17, x9 // Aux function round 2 (x & z)
326+
add w8, w8, w6 // Add (~z & y)
327+
add w8, w8, w13 // Add (x & z)
328328
ror w8, w8, #18 // Rotate left s=14 bits
329-
bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
329+
bic x6, x17, x4 // Aux function round 2 (~z & y)
330330
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0xf4d50d87, s=14, M[3])
331-
and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
332-
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
333331
movz x13, #0x14ed // Load lower half of constant 0x455a14ed
334332
movk x13, #0x455a, lsl #16 // Load upper half of constant 0x455a14ed
335333
add w9, w9, w5 // Add dest value
336334
add w9, w9, w13 // Add constant 0x455a14ed
337-
add w9, w9, w6 // Add aux function result
335+
and x13, x8, x4 // Aux function round 2 (x & z)
336+
add w9, w9, w6 // Add (~z & y)
337+
add w9, w9, w13 // Add (x & z)
338338
ror w9, w9, #12 // Rotate left s=20 bits
339-
bic x6, x8, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
339+
bic x6, x8, x17 // Aux function round 2 (~z & y)
340340
add w9, w8, w9 // Add X parameter round 2 B=GG(B, C, D, A, 0x455a14ed, s=20, M[8])
341-
and x13, x9, x17 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
342-
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
343341
movz x13, #0xe905 // Load lower half of constant 0xa9e3e905
344342
movk x13, #0xa9e3, lsl #16 // Load upper half of constant 0xa9e3e905
345343
add w4, w4, w26 // Add dest value
346344
add w4, w4, w13 // Add constant 0xa9e3e905
347-
add w4, w4, w6 // Add aux function result
345+
and x13, x9, x17 // Aux function round 2 (x & z)
346+
add w4, w4, w6 // Add (~z & y)
347+
add w4, w4, w13 // Add (x & z)
348348
ror w4, w4, #27 // Rotate left s=5 bits
349-
bic x6, x9, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
349+
bic x6, x9, x8 // Aux function round 2 (~z & y)
350350
add w4, w9, w4 // Add X parameter round 2 A=GG(A, B, C, D, 0xa9e3e905, s=5, M[13])
351-
and x13, x4, x8 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
352-
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
353351
movz x13, #0xa3f8 // Load lower half of constant 0xfcefa3f8
354352
movk x13, #0xfcef, lsl #16 // Load upper half of constant 0xfcefa3f8
355353
add w17, w17, w3 // Add dest value
356354
add w17, w17, w13 // Add constant 0xfcefa3f8
357-
add w17, w17, w6 // Add aux function result
355+
and x13, x4, x8 // Aux function round 2 (x & z)
356+
add w17, w17, w6 // Add (~z & y)
357+
add w17, w17, w13 // Add (x & z)
358358
ror w17, w17, #23 // Rotate left s=9 bits
359-
bic x6, x4, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
359+
bic x6, x4, x9 // Aux function round 2 (~z & y)
360360
add w17, w4, w17 // Add X parameter round 2 D=GG(D, A, B, C, 0xfcefa3f8, s=9, M[2])
361-
and x13, x17, x9 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
362-
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
363361
movz x13, #0x2d9 // Load lower half of constant 0x676f02d9
364362
movk x13, #0x676f, lsl #16 // Load upper half of constant 0x676f02d9
365363
add w8, w8, w23 // Add dest value
366364
add w8, w8, w13 // Add constant 0x676f02d9
367-
add w8, w8, w6 // Add aux function result
365+
and x13, x17, x9 // Aux function round 2 (x & z)
366+
add w8, w8, w6 // Add (~z & y)
367+
add w8, w8, w13 // Add (x & z)
368368
ror w8, w8, #18 // Rotate left s=14 bits
369-
bic x6, x17, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
369+
bic x6, x17, x4 // Aux function round 2 (~z & y)
370370
add w8, w17, w8 // Add X parameter round 2 C=GG(C, D, A, B, 0x676f02d9, s=14, M[7])
371-
and x13, x8, x4 // Aux function round 2 G(x,y,z)=((x&z)|(~z&y))
372-
orr x6, x6, x13 // End aux function round 2 G(x,y,z)=((x&z)|(~z&y))
373371
movz x13, #0x4c8a // Load lower half of constant 0x8d2a4c8a
374372
movk x13, #0x8d2a, lsl #16 // Load upper half of constant 0x8d2a4c8a
375373
add w9, w9, w11 // Add dest value
376374
add w9, w9, w13 // Add constant 0x8d2a4c8a
377-
add w9, w9, w6 // Add aux function result
375+
and x13, x8, x4 // Aux function round 2 (x & z)
376+
add w9, w9, w6 // Add (~z & y)
377+
add w9, w9, w13 // Add (x & z)
378378
eor x6, x8, x17 // Begin aux function round 3 H(x,y,z)=(x^y^z)
379379
ror w9, w9, #12 // Rotate left s=20 bits
380380
movz x10, #0x3942 // Load lower half of constant 0xfffa3942

Diff for: crypto/fipsmodule/md5/asm/md5-x86_64.pl

+2-3
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@ sub round1_step
3939
# %r10d = X[k_next]
4040
# %r11d = z' (copy of z for the next step)
4141
# %r12d = z' (copy of z for the next step)
42-
# Each round2_step() takes about 5.4 clocks (11 instructions, 2.0 IPC)
4342
sub round2_step
4443
{
4544
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
@@ -52,9 +51,9 @@ sub round2_step
5251
and $x, %r12d /* x & z */
5352
and $y, %r11d /* y & (not z) */
5453
mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */
55-
or %r11d, %r12d /* (y & (not z)) | (x & z) */
54+
add %r11d, $dst /* dst += (y & (not z)) */
5655
mov $y, %r11d /* (NEXT STEP) z' = $y */
57-
add %r12d, $dst /* dst += ... */
56+
add %r12d, $dst /* dst += (x & z) */
5857
mov $y, %r12d /* (NEXT STEP) z' = $y */
5958
rol \$$s, $dst /* dst <<< s */
6059
add $x, $dst /* dst += x */

0 commit comments

Comments
 (0)