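/*
 * SHA-1 block transform for x86_64 using AVX2 (ymm registers) together with
 * the BMI2 rorx/andn instructions. Two 64-byte blocks are processed per pass:
 * the message schedule for the next pair of blocks is precalculated in 256-bit
 * registers while the scalar rounds of the current pair run
 * (see SHA1_PIPELINED_MAIN_BODY below).
 */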
#include <linux/linkage.h>

#define CTX	%rdi	/* arg1 */
#define BUF	%rsi	/* arg2 */
#define CNT	%rdx	/* arg3 */

/* Scalar round state: 32-bit working registers and their 64-bit aliases. */
#define REG_A	%ecx
#define REG_B	%esi
#define REG_C	%edi
#define REG_D	%eax
#define REG_E	%edx
#define REG_TB	%ebx
#define REG_TA	%r12d
#define REG_RA	%rcx
#define REG_RB	%rsi
#define REG_RC	%rdi
#define REG_RD	%rax
#define REG_RE	%rdx
#define REG_RTA	%r12
#define REG_RTB	%rbx
#define REG_T1	%r11d
#define xmm_mov	vmovups
#define avx2_zeroupper	vzeroupper
#define RND_F1	1
#define RND_F2	2
#define RND_F3	3

/* Bind the symbolic state names to their initial register assignment. */
.macro REGALLOC
	.set A, REG_A
	.set B, REG_B
	.set C, REG_C
	.set D, REG_D
	.set E, REG_E
	.set TB, REG_TB
	.set TA, REG_TA

	.set RA, REG_RA
	.set RB, REG_RB
	.set RC, REG_RC
	.set RD, REG_RD
	.set RE, REG_RE

	.set RTA, REG_RTA
	.set RTB, REG_RTB

	.set T1, REG_T1
.endm
119
120 #define HASH_PTR %r9
121 #define BLOCKS_CTR %r8
122 #define BUFFER_PTR %r10
123 #define BUFFER_PTR2 %r13
124
125 #define PRECALC_BUF %r14
126 #define WK_BUF %r15
127
128 #define W_TMP %xmm0
129 #define WY_TMP %ymm0
130 #define WY_TMP2 %ymm9
131
132
133 #define WY0 %ymm3
134 #define WY4 %ymm5
135 #define WY08 %ymm7
136 #define WY12 %ymm8
137 #define WY16 %ymm12
138 #define WY20 %ymm13
139 #define WY24 %ymm14
140 #define WY28 %ymm15
141
142 #define YMM_SHUFB_BSWAP %ymm10
143
144
145
146
147
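
/*
 * Two blocks' worth of pre-calculated W[i]+K values (80 dwords per block)
 * are kept on the stack, interleaved: each group of four rounds occupies
 * 32 bytes, the low 16 bytes for the first block and the high 16 bytes for
 * the second. WK(t) indexes the buffer being consumed for round t, while
 * PRECALC_WK(t) addresses the buffer being filled for the following pair.
 */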
#define W_SIZE		(80*2*2 +16)

#define WK(t)		((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)

/* Add a working value into its saved hash word (val += *hash; *hash = val). */
.macro UPDATE_HASH hash, val
	add	\hash, \val
	mov	\val, \hash
.endm
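
/*
 * The message schedule is kept in a rolling window of eight ymm registers.
 * Each register holds four consecutive schedule words for both blocks of the
 * current pair (low lane = first block, high lane = second block).
 * PRECALC_ROTATE_WY renames the window after every group of four words, so
 * WY always names the group being computed and WY_minus_NN the group
 * computed NN words earlier.
 */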
.macro PRECALC_RESET_WY
	.set WY_00, WY0
	.set WY_04, WY4
	.set WY_08, WY08
	.set WY_12, WY12
	.set WY_16, WY16
	.set WY_20, WY20
	.set WY_24, WY24
	.set WY_28, WY28
	.set WY_32, WY_00
.endm

.macro PRECALC_ROTATE_WY
	/* Rotate the window */
	.set WY_32, WY_28
	.set WY_28, WY_24
	.set WY_24, WY_20
	.set WY_20, WY_16
	.set WY_16, WY_12
	.set WY_12, WY_08
	.set WY_08, WY_04
	.set WY_04, WY_00
	.set WY_00, WY_32

	/* Define the "minus" aliases used by the precalc macros */
	.set WY, WY_00
	.set WY_minus_04, WY_04
	.set WY_minus_08, WY_08
	.set WY_minus_12, WY_12
	.set WY_minus_16, WY_16
	.set WY_minus_20, WY_20
	.set WY_minus_24, WY_24
	.set WY_minus_28, WY_28
	.set WY_minus_32, WY
.endm
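
/*
 * Message scheduling for rounds 0-15: load sixteen message bytes from each
 * of the two blocks, byte-swap them to big-endian, add the round constant
 * and store the result into the precalc buffer. The work for each group of
 * four words is spread across eight consecutive round slots (i & 7) so it
 * interleaves with the scalar rounds.
 */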
.macro PRECALC_00_15
	.if (i == 0) # Initialize and rotate registers
		PRECALC_RESET_WY
		PRECALC_ROTATE_WY
	.endif

	.if ((i & 7) == 0)
		/* 16 bytes of the first block */
		vmovdqu	(i * 2)(BUFFER_PTR), W_TMP
	.elseif ((i & 7) == 1)
		/* 16 bytes of the second block into the high lane */
		vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
			 WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		/* Byte-swap the message words to big-endian */
		vpshufb	YMM_SHUFB_BSWAP, WY_TMP, WY
	.elseif ((i & 7) == 4)
		/* Add the round constant */
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
	.elseif ((i & 7) == 7)
		/* Store W[i]+K for both blocks */
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
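
/*
 * Message scheduling for rounds 16-31:
 *   w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * computed four words at a time per block. The in-group dependency of the
 * fourth word on the first is resolved with an extra correction term kept
 * in WY_TMP2. As in PRECALC_00_15, the work for one group is spread over
 * eight round slots so it interleaves with the scalar rounds.
 */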
.macro PRECALC_16_31
	.if ((i & 7) == 0)
		/* w[i-14..i-11] and (w[i-3..i-1], 0) */
		vpalignr $8, WY_minus_16, WY_minus_12, WY
		vpsrldq	$4, WY_minus_04, WY_TMP
	.elseif ((i & 7) == 1)
		vpxor	WY_minus_08, WY, WY
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 2)
		vpxor	WY_TMP, WY, WY
		vpslldq	$12, WY, WY_TMP2
	.elseif ((i & 7) == 3)
		/* rol 1 */
		vpslld	$1, WY, WY_TMP
		vpsrld	$31, WY, WY
	.elseif ((i & 7) == 4)
		vpor	WY, WY_TMP, WY_TMP
		vpslld	$2, WY_TMP2, WY
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY_TMP2, WY_TMP2
		vpxor	WY, WY_TMP, WY_TMP
	.elseif ((i & 7) == 7)
		/* Fold in the correction term, add K and store */
		vpxor	WY_TMP2, WY_TMP, WY
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
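
/*
 * Message scheduling for rounds 32-79, using the equivalent recurrence
 *   w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which has no dependency inside a four-word group and therefore needs
 * fewer instructions per group than PRECALC_16_31.
 */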
.macro PRECALC_32_79
	.if ((i & 7) == 0)
		/* w[i-6] term */
		vpalignr $8, WY_minus_08, WY_minus_04, WY_TMP
	.elseif ((i & 7) == 1)
		/* WY enters the group still holding the w[i-32] values */
		vpxor	WY_minus_28, WY, WY
	.elseif ((i & 7) == 2)
		vpxor	WY_minus_16, WY_TMP, WY_TMP
	.elseif ((i & 7) == 3)
		vpxor	WY_TMP, WY, WY
	.elseif ((i & 7) == 4)
		/* rol 2 */
		vpslld	$2, WY, WY_TMP
	.elseif ((i & 7) == 5)
		vpsrld	$30, WY, WY
		vpor	WY, WY_TMP, WY
	.elseif ((i & 7) == 7)
		/* Add K and store */
		vpaddd	K_XMM + K_XMM_AR(%rip), WY, WY_TMP
		vmovdqu	WY_TMP, PRECALC_WK(i&~7)

		PRECALC_ROTATE_WY
	.endif
.endm
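
/*
 * Dispatch one message-precalc step. Each round of the two-block pair takes
 * two steps, so \r runs from 0 to 159: steps below 32 use the rounds 0-15
 * path, below 64 the rounds 16-31 path, and the rest the rounds 32-79 path.
 * K_XMM selects the 32-byte slice of K_XMM_AR holding the round constant
 * for the current range.
 */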
.macro PRECALC r, s
	.set i, \r

	.if (i < 40)
		.set K_XMM, 32*0
	.elseif (i < 80)
		.set K_XMM, 32*1
	.elseif (i < 120)
		.set K_XMM, 32*2
	.else
		.set K_XMM, 32*3
	.endif

	.if (i < 32)
		PRECALC_00_15 \s
	.elseif (i < 64)
		PRECALC_16_31 \s
	.elseif (i < 160)
		PRECALC_32_79 \s
	.endif
.endm
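
/*
 * Advance the working state by renaming symbols instead of moving data: the
 * register that accumulated the new "a" value becomes A, and the remaining
 * names shift down accordingly (for both the 32- and 64-bit views).
 */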
.macro ROTATE_STATE
	.set T_REG, E
	.set E, D
	.set D, C
	.set C, B
	.set B, TB
	.set TB, A
	.set A, T_REG

	.set T_REG, RE
	.set RE, RD
	.set RD, RC
	.set RC, RB
	.set RB, RTB
	.set RTB, RA
	.set RA, T_REG
.endm
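
/*
 * RR expands two SHA-1 rounds at a time. ROUND_FUNC starts as RND_F1 and is
 * switched after the pairs containing rounds 18, 38 and 58, so that rounds
 * 20-39 use F2, 40-59 use F3 and 60-79 use F2 (parity) again. On the very
 * first round the boolean function is seeded directly, since there is no
 * previous round to have computed it.
 */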
.macro RND_FUN f, r
	.if (\f == RND_F1)
		ROUND_F1	\r
	.elseif (\f == RND_F2)
		ROUND_F2	\r
	.elseif (\f == RND_F3)
		ROUND_F3	\r
	.endif
.endm

.macro RR r
	.set round_id, (\r % 80)

	.if (round_id == 0)		/* Precalculate F for the first round */
		.set ROUND_FUNC, RND_F1
		mov	B, TB

		rorx	$(32-30), B, B	/* b >>> 2 */
		andn	D, TB, T1
		and	C, TB
		xor	T1, TB
	.endif

	RND_FUN ROUND_FUNC, \r
	ROTATE_STATE

	.if (round_id == 18)
		.set ROUND_FUNC, RND_F2
	.elseif (round_id == 38)
		.set ROUND_FUNC, RND_F3
	.elseif (round_id == 58)
		.set ROUND_FUNC, RND_F2
	.endif

	.set round_id, ( (\r+1) % 80)

	RND_FUN ROUND_FUNC, (\r+1)
	ROTATE_STATE
.endm
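
/*
 * Round bodies. Each one adds W[r]+K and the previously prepared boolean
 * function into E, computes A<<<5 and A>>>2 with rorx, prepares the boolean
 * function for the following round, and interleaves one step of message
 * precalculation for the next pair of blocks via PRECALC.
 */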
.macro ROUND_F1 r
	add	WK(\r), E

	andn	C, A, T1		/* ~b&d for the next round */
	lea	(RE,RTB), E		/* Add F from the previous round */

	rorx	$(32-5), A, TA		/* TA = A <<< 5 */
	rorx	$(32-30), A, TB		/* A >>> 2, the pre-rotated b for the next round */

	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	/*
	 * Calculate F for the next round: (b & c) ^ (~b & d)
	 */
	and	B, A			/* b & c */
	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */

	lea	(RE,RTA), E		/* E += A <<< 5 */
.endm

.macro ROUND_F2 r
	add	WK(\r), E
	lea	(RE,RTB), E		/* Add F from the previous round */

	rorx	$(32-5), A, TA		/* TA = A <<< 5 */
	.if ((round_id) < 79)
		rorx	$(32-30), A, TB	/* A >>> 2, the pre-rotated b for the next round */
	.endif
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	/* Calculate F for the next round: b ^ c ^ d */
	.if ((round_id) < 79)
		xor	B, A
	.endif

	add	TA, E			/* E += A <<< 5 */

	.if ((round_id) < 79)
		xor	C, A
	.endif
.endm

.macro ROUND_F3 r
	add	WK(\r), E
	PRECALC	(\r)			/* msg scheduling for next 2 blocks */

	lea	(RE,RTB), E		/* Add F from the previous round */

	mov	B, T1
	or	A, T1

	rorx	$(32-5), A, TA		/* TA = A <<< 5 */
	rorx	$(32-30), A, TB		/* A >>> 2, the pre-rotated b for the next round */

	/* Calculate F for the next round: (b and c) or (d and (b or c)) */
	and	C, T1
	and	B, A
	or	T1, A

	add	TA, E			/* E += A <<< 5 */
.endm
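
/*
 * ADD_IF_GE: advance pointer \a by \d bytes, but only if \b >= \c.
 * Used to step the block pointers without running past the end of the
 * input on the final block(s).
 */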
.macro ADD_IF_GE a, b, c, d
	mov	\a, RTA
	add	$\d, RTA
	cmp	$\c, \b
	cmovge	RTA, \a
.endm
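
/*
 * 80 rounds of SHA-1 for two blocks at a time, software pipelined: the
 * W[i]+K values for the current pair were stored by the previous iteration
 * (or by the start-up precalc below), and while the scalar rounds consume
 * them, the PRECALC steps embedded in every round prepare the schedule for
 * the following pair. PRECALC_BUF and WK_BUF are swapped once per pair.
 */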
.macro SHA1_PIPELINED_MAIN_BODY

	REGALLOC

	mov	(HASH_PTR), A
	mov	4(HASH_PTR), B
	mov	8(HASH_PTR), C
	mov	12(HASH_PTR), D
	mov	16(HASH_PTR), E

	mov	%rsp, PRECALC_BUF
	lea	(2*4*80+32)(%rsp), WK_BUF

	/* Precalculate WK for the first two blocks */
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
	.set i, 0
	.rept 160
		PRECALC i
		.set i, i + 1
	.endr

	/* Move to the next pair of blocks only if there is one */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
	xchg	WK_BUF, PRECALC_BUF

	.align 32
_loop:
	/* Exit once no blocks remain */
	test	BLOCKS_CTR, BLOCKS_CTR
	jnz	_begin
	.align 32
	jmp	_end
	.align 32
_begin:

	/* Process the first block of the pair: rounds 0-9 */
	.set j, 0
	.rept 5
		RR	j
		.set j, j+2
	.endr

	jmp	_loop0
_loop0:

	/* rounds 10-59 */
	.rept 25
		RR	j
		.set j, j+2
	.endr

	/* Update the block counter */
	sub	$1, BLOCKS_CTR

	/* Move to the next block only if needed */
	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128

	/* rounds 60-79 */
	.rept 10
		RR	j
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	test	BLOCKS_CTR, BLOCKS_CTR
	jz	_loop

	mov	TB, B

	/* Process the second block of the pair (its W values are at WK offset +80): rounds 0-19 */
	.set j, 0
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	_loop1
_loop1:

	/* rounds 20-39 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	jmp	_loop2
_loop2:

	/* rounds 40-59 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	/* Update the block counter */
	sub	$1, BLOCKS_CTR

	/* Move to the next block only if needed */
	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128

	jmp	_loop3
_loop3:

	/* rounds 60-79 */
	.rept 10
		RR	j+80
		.set j, j+2
	.endr

	UPDATE_HASH	(HASH_PTR), A
	UPDATE_HASH	4(HASH_PTR), TB
	UPDATE_HASH	8(HASH_PTR), C
	UPDATE_HASH	12(HASH_PTR), D
	UPDATE_HASH	16(HASH_PTR), E

	/* Move the state into the registers expected by REGALLOC */
	mov	A, TA
	mov	TB, A
	mov	C, TB
	mov	E, C
	mov	D, B
	mov	TA, D

	REGALLOC

	/* Swap the consumed and freshly precalculated WK buffers */
	xchg	WK_BUF, PRECALC_BUF

	jmp	_loop

	.align 32
_end:

.endm
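
/*
 * Entry point: CTX (arg1) points to the five 32-bit SHA-1 state words,
 * BUF (arg2) to the input data, and CNT (arg3) is the number of 64-byte
 * blocks to process. The stack is aligned to 32 bytes and W_SIZE*4 bytes
 * are reserved for the two W[i]+K buffers.
 */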
.macro SHA1_VECTOR_ASM name
ENTRY(\name)

	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	RESERVE_STACK = (W_SIZE*4 + 8+24)

	/* Align the stack and remember the original %rsp */
	mov	%rsp, %rbx
	and	$~(0x20-1), %rsp
	push	%rbx
	sub	$RESERVE_STACK, %rsp

	avx2_zeroupper

	/* Set up initial values */
	mov	CTX, HASH_PTR
	mov	BUF, BUFFER_PTR

	mov	BUF, BUFFER_PTR2
	mov	CNT, BLOCKS_CTR

	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP

	SHA1_PIPELINED_MAIN_BODY

	avx2_zeroupper

	add	$RESERVE_STACK, %rsp
	/* Restore the original, unaligned stack pointer */
	pop	%rsp

	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbx

	ret

ENDPROC(\name)
.endm

.section .rodata
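
/*
 * SHA-1 round constants, each replicated eight times so one 256-bit load
 * covers four schedule words for both blocks, followed by the vpshufb mask
 * used to convert the input message words to big-endian.
 */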
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

.align 128
K_XMM_AR:
	.long K1, K1, K1, K1
	.long K1, K1, K1, K1
	.long K2, K2, K2, K2
	.long K2, K2, K2, K2
	.long K3, K3, K3, K3
	.long K3, K3, K3, K3
	.long K4, K4, K4, K4
	.long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
	.long 0x00010203
	.long 0x04050607
	.long 0x08090a0b
	.long 0x0c0d0e0f
.text
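
/* Instantiate the transform under its exported name. */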
SHA1_VECTOR_ASM sha1_transform_avx2