/*
 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
 */
#include <linux/linkage.h>

.section .rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003
        .octa 0x0e0d0c0f0a09080b0605040702010003

.section .rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
        .octa 0x0d0c0f0e09080b0a0504070601000302

.section .rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC: .octa 0x00000003000000020000000100000000
        .octa 0x00000007000000060000000500000004

.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL: .octa 0x00000000000000000000000000000000
        .octa 0x00000000000000000000000000000001

.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL: .octa 0x00000000000000000000000000000002
        .octa 0x00000000000000000000000000000003

.text

ENTRY(chacha_2block_xor_avx2)
        # %rdi: Input state matrix, s
        # %rsi: up to 2 data blocks output, o
        # %rdx: up to 2 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds
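
        # Presumed C-level prototype, for reference only (an assumption;
        # the authoritative declaration lives in the C glue code):
        #   void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
        #                               unsigned int len, int nrounds);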

        # This function encrypts two ChaCha blocks by loading the state
        # matrix twice across four AVX registers. It performs matrix
        # operations on four words of both blocks in parallel, but requires
        # shuffling to rearrange the words after each round.

        vzeroupper

        # x0..3[0-1] = s0..3: broadcast each 16-byte state row into both
        # 128-bit lanes
        vbroadcasti128 0x00(%rdi),%ymm0
        vbroadcasti128 0x10(%rdi),%ymm1
        vbroadcasti128 0x20(%rdi),%ymm2
        vbroadcasti128 0x30(%rdi),%ymm3

        # add counter values 0 and 1 for the two blocks
        vpaddd CTR2BL(%rip),%ymm3,%ymm3

        # save the initial state for the final addition
        vmovdqa %ymm0,%ymm8
        vmovdqa %ymm1,%ymm9
        vmovdqa %ymm2,%ymm10
        vmovdqa %ymm3,%ymm11

        # byte-shuffle masks for the 8- and 16-bit rotations
        vmovdqa ROT8(%rip),%ymm4
        vmovdqa ROT16(%rip),%ymm5

        # keep the byte count around for the tail handling
        mov %rcx,%rax

.Ldoubleround:

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm5,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm6
        vpslld $12,%ymm6,%ymm6
        vpsrld $20,%ymm1,%ymm1
        vpor %ymm6,%ymm1,%ymm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm4,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm7
        vpslld $7,%ymm7,%ymm7
        vpsrld $25,%ymm1,%ymm1
        vpor %ymm7,%ymm1,%ymm1

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm1,%ymm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm3,%ymm3

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm5,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm6
        vpslld $12,%ymm6,%ymm6
        vpsrld $20,%ymm1,%ymm1
        vpor %ymm6,%ymm1,%ymm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm4,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm7
        vpslld $7,%ymm7,%ymm7
        vpsrld $25,%ymm1,%ymm1
        vpor %ymm7,%ymm1,%ymm1

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        vpshufd $0x93,%ymm1,%ymm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd $0x4e,%ymm2,%ymm2
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        vpshufd $0x39,%ymm3,%ymm3

        sub $2,%r8d
        jnz .Ldoubleround

        # o0 = i0 ^ (x0 + s0), first block
        vpaddd %ymm8,%ymm0,%ymm7
        cmp $0x10,%rax
        jl .Lxorpart2
        vpxor 0x00(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x00(%rsi)
        vextracti128 $1,%ymm7,%xmm0

        # o1 = i1 ^ (x1 + s1), first block
        vpaddd %ymm9,%ymm1,%ymm7
        cmp $0x20,%rax
        jl .Lxorpart2
        vpxor 0x10(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x10(%rsi)
        vextracti128 $1,%ymm7,%xmm1

        # o2 = i2 ^ (x2 + s2), first block
        vpaddd %ymm10,%ymm2,%ymm7
        cmp $0x30,%rax
        jl .Lxorpart2
        vpxor 0x20(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x20(%rsi)
        vextracti128 $1,%ymm7,%xmm2

        # o3 = i3 ^ (x3 + s3), first block
        vpaddd %ymm11,%ymm3,%ymm7
        cmp $0x40,%rax
        jl .Lxorpart2
        vpxor 0x30(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x30(%rsi)
        vextracti128 $1,%ymm7,%xmm3

        # xor and write the second block, taken from the high 128-bit lanes
        vmovdqa %xmm0,%xmm7
        cmp $0x50,%rax
        jl .Lxorpart2
        vpxor 0x40(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x40(%rsi)

        vmovdqa %xmm1,%xmm7
        cmp $0x60,%rax
        jl .Lxorpart2
        vpxor 0x50(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x50(%rsi)

        vmovdqa %xmm2,%xmm7
        cmp $0x70,%rax
        jl .Lxorpart2
        vpxor 0x60(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x60(%rsi)

        vmovdqa %xmm3,%xmm7
        cmp $0x80,%rax
        jl .Lxorpart2
        vpxor 0x70(%rdx),%xmm7,%xmm6
        vmovdqu %xmm6,0x70(%rsi)

.Ldone2:
        vzeroupper
        ret

.Lxorpart2:
        # xor the remaining bytes from the partial keystream register %xmm7
        # into the output
        mov %rax,%r9
        and $0x0f,%r9
        jz .Ldone2
        and $~0x0f,%rax

        mov %rsi,%r11

        # set up a 16-byte, 32-byte-aligned scratch buffer on the stack
        lea 8(%rsp),%r10
        sub $0x10,%rsp
        and $~31,%rsp

        # copy the remaining input bytes into the scratch buffer
        lea (%rdx,%rax),%rsi
        mov %rsp,%rdi
        mov %r9,%rcx
        rep movsb

        # xor them with the keystream
        vpxor 0x00(%rsp),%xmm7,%xmm7
        vmovdqa %xmm7,0x00(%rsp)

        # copy the result to the output
        mov %rsp,%rsi
        lea (%r11,%rax),%rdi
        mov %r9,%rcx
        rep movsb

        lea -8(%r10),%rsp
        jmp .Ldone2

ENDPROC(chacha_2block_xor_avx2)

ENTRY(chacha_4block_xor_avx2)
        # %rdi: Input state matrix, s
        # %rsi: up to 4 data blocks output, o
        # %rdx: up to 4 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds
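
        # Presumed C-level prototype, for reference only (an assumption;
        # the authoritative declaration lives in the C glue code):
        #   void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
        #                               unsigned int len, int nrounds);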

        # This function encrypts four ChaCha blocks by loading the state
        # matrix four times across eight AVX registers. It performs matrix
        # operations on four words of two blocks in parallel, sequentially
        # to the operations on the other two blocks, and requires shuffling
        # to rearrange the words after each round.

        vzeroupper

        # x0..3[0-1] = s0..3 for blocks 0/1
        vbroadcasti128 0x00(%rdi),%ymm0
        vbroadcasti128 0x10(%rdi),%ymm1
        vbroadcasti128 0x20(%rdi),%ymm2
        vbroadcasti128 0x30(%rdi),%ymm3

        # x4..7 = x0..3 for blocks 2/3
        vmovdqa %ymm0,%ymm4
        vmovdqa %ymm1,%ymm5
        vmovdqa %ymm2,%ymm6
        vmovdqa %ymm3,%ymm7

        # add counter values 0/1 to blocks 0/1 and 2/3 to blocks 2/3
        vpaddd CTR2BL(%rip),%ymm3,%ymm3
        vpaddd CTR4BL(%rip),%ymm7,%ymm7

        # save the initial state for the final addition
        vmovdqa %ymm0,%ymm11
        vmovdqa %ymm1,%ymm12
        vmovdqa %ymm2,%ymm13
        vmovdqa %ymm3,%ymm14
        vmovdqa %ymm7,%ymm15

        # byte-shuffle masks for the 8- and 16-bit rotations
        vmovdqa ROT8(%rip),%ymm8
        vmovdqa ROT16(%rip),%ymm9

        # keep the byte count around for the tail handling
        mov %rcx,%rax

.Ldoubleround4:

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm9,%ymm3,%ymm3

        # x4 += x5, x7 = rotl32(x7 ^ x4, 16)
        vpaddd %ymm5,%ymm4,%ymm4
        vpxor %ymm4,%ymm7,%ymm7
        vpshufb %ymm9,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm10
        vpslld $12,%ymm10,%ymm10
        vpsrld $20,%ymm1,%ymm1
        vpor %ymm10,%ymm1,%ymm1

        # x6 += x7, x5 = rotl32(x5 ^ x6, 12)
        vpaddd %ymm7,%ymm6,%ymm6
        vpxor %ymm6,%ymm5,%ymm5
        vmovdqa %ymm5,%ymm10
        vpslld $12,%ymm10,%ymm10
        vpsrld $20,%ymm5,%ymm5
        vpor %ymm10,%ymm5,%ymm5

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm8,%ymm3,%ymm3

        # x4 += x5, x7 = rotl32(x7 ^ x4, 8)
        vpaddd %ymm5,%ymm4,%ymm4
        vpxor %ymm4,%ymm7,%ymm7
        vpshufb %ymm8,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm10
        vpslld $7,%ymm10,%ymm10
        vpsrld $25,%ymm1,%ymm1
        vpor %ymm10,%ymm1,%ymm1

        # x6 += x7, x5 = rotl32(x5 ^ x6, 7)
        vpaddd %ymm7,%ymm6,%ymm6
        vpxor %ymm6,%ymm5,%ymm5
        vmovdqa %ymm5,%ymm10
        vpslld $7,%ymm10,%ymm10
        vpsrld $25,%ymm5,%ymm5
        vpor %ymm10,%ymm5,%ymm5

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1)), same for x5
        vpshufd $0x39,%ymm1,%ymm1
        vpshufd $0x39,%ymm5,%ymm5
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2)), same for x6
        vpshufd $0x4e,%ymm2,%ymm2
        vpshufd $0x4e,%ymm6,%ymm6
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3)), same for x7
        vpshufd $0x93,%ymm3,%ymm3
        vpshufd $0x93,%ymm7,%ymm7

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm9,%ymm3,%ymm3

        # x4 += x5, x7 = rotl32(x7 ^ x4, 16)
        vpaddd %ymm5,%ymm4,%ymm4
        vpxor %ymm4,%ymm7,%ymm7
        vpshufb %ymm9,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm10
        vpslld $12,%ymm10,%ymm10
        vpsrld $20,%ymm1,%ymm1
        vpor %ymm10,%ymm1,%ymm1

        # x6 += x7, x5 = rotl32(x5 ^ x6, 12)
        vpaddd %ymm7,%ymm6,%ymm6
        vpxor %ymm6,%ymm5,%ymm5
        vmovdqa %ymm5,%ymm10
        vpslld $12,%ymm10,%ymm10
        vpsrld $20,%ymm5,%ymm5
        vpor %ymm10,%ymm5,%ymm5

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd %ymm1,%ymm0,%ymm0
        vpxor %ymm0,%ymm3,%ymm3
        vpshufb %ymm8,%ymm3,%ymm3

        # x4 += x5, x7 = rotl32(x7 ^ x4, 8)
        vpaddd %ymm5,%ymm4,%ymm4
        vpxor %ymm4,%ymm7,%ymm7
        vpshufb %ymm8,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd %ymm3,%ymm2,%ymm2
        vpxor %ymm2,%ymm1,%ymm1
        vmovdqa %ymm1,%ymm10
        vpslld $7,%ymm10,%ymm10
        vpsrld $25,%ymm1,%ymm1
        vpor %ymm10,%ymm1,%ymm1

        # x6 += x7, x5 = rotl32(x5 ^ x6, 7)
        vpaddd %ymm7,%ymm6,%ymm6
        vpxor %ymm6,%ymm5,%ymm5
        vmovdqa %ymm5,%ymm10
        vpslld $7,%ymm10,%ymm10
        vpsrld $25,%ymm5,%ymm5
        vpor %ymm10,%ymm5,%ymm5

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3)), same for x5
        vpshufd $0x93,%ymm1,%ymm1
        vpshufd $0x93,%ymm5,%ymm5
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2)), same for x6
        vpshufd $0x4e,%ymm2,%ymm2
        vpshufd $0x4e,%ymm6,%ymm6
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1)), same for x7
        vpshufd $0x39,%ymm3,%ymm3
        vpshufd $0x39,%ymm7,%ymm7

        sub $2,%r8d
        jnz .Ldoubleround4

        # o0 = i0 ^ (x0 + s0), first block
        vpaddd %ymm11,%ymm0,%ymm10
        cmp $0x10,%rax
        jl .Lxorpart4
        vpxor 0x00(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x00(%rsi)
        vextracti128 $1,%ymm10,%xmm0

        # o1 = i1 ^ (x1 + s1), first block
        vpaddd %ymm12,%ymm1,%ymm10
        cmp $0x20,%rax
        jl .Lxorpart4
        vpxor 0x10(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x10(%rsi)
        vextracti128 $1,%ymm10,%xmm1

        # o2 = i2 ^ (x2 + s2), first block
        vpaddd %ymm13,%ymm2,%ymm10
        cmp $0x30,%rax
        jl .Lxorpart4
        vpxor 0x20(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x20(%rsi)
        vextracti128 $1,%ymm10,%xmm2

        # o3 = i3 ^ (x3 + s3), first block
        vpaddd %ymm14,%ymm3,%ymm10
        cmp $0x40,%rax
        jl .Lxorpart4
        vpxor 0x30(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x30(%rsi)
        vextracti128 $1,%ymm10,%xmm3

        # xor and write the second block, taken from the high 128-bit lanes
        vmovdqa %xmm0,%xmm10
        cmp $0x50,%rax
        jl .Lxorpart4
        vpxor 0x40(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x40(%rsi)

        vmovdqa %xmm1,%xmm10
        cmp $0x60,%rax
        jl .Lxorpart4
        vpxor 0x50(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x50(%rsi)

        vmovdqa %xmm2,%xmm10
        cmp $0x70,%rax
        jl .Lxorpart4
        vpxor 0x60(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x60(%rsi)

        vmovdqa %xmm3,%xmm10
        cmp $0x80,%rax
        jl .Lxorpart4
        vpxor 0x70(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x70(%rsi)

        # o0 = i0 ^ (x4 + s0), third block
        vpaddd %ymm11,%ymm4,%ymm10
        cmp $0x90,%rax
        jl .Lxorpart4
        vpxor 0x80(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x80(%rsi)
        vextracti128 $1,%ymm10,%xmm4

        # o1 = i1 ^ (x5 + s1), third block
        vpaddd %ymm12,%ymm5,%ymm10
        cmp $0xa0,%rax
        jl .Lxorpart4
        vpxor 0x90(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0x90(%rsi)
        vextracti128 $1,%ymm10,%xmm5

        # o2 = i2 ^ (x6 + s2), third block
        vpaddd %ymm13,%ymm6,%ymm10
        cmp $0xb0,%rax
        jl .Lxorpart4
        vpxor 0xa0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xa0(%rsi)
        vextracti128 $1,%ymm10,%xmm6

        # o3 = i3 ^ (x7 + s3), third block
        vpaddd %ymm15,%ymm7,%ymm10
        cmp $0xc0,%rax
        jl .Lxorpart4
        vpxor 0xb0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xb0(%rsi)
        vextracti128 $1,%ymm10,%xmm7

        # xor and write the fourth block, taken from the high 128-bit lanes
        vmovdqa %xmm4,%xmm10
        cmp $0xd0,%rax
        jl .Lxorpart4
        vpxor 0xc0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xc0(%rsi)

        vmovdqa %xmm5,%xmm10
        cmp $0xe0,%rax
        jl .Lxorpart4
        vpxor 0xd0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xd0(%rsi)

        vmovdqa %xmm6,%xmm10
        cmp $0xf0,%rax
        jl .Lxorpart4
        vpxor 0xe0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xe0(%rsi)

        vmovdqa %xmm7,%xmm10
        cmp $0x100,%rax
        jl .Lxorpart4
        vpxor 0xf0(%rdx),%xmm10,%xmm9
        vmovdqu %xmm9,0xf0(%rsi)

.Ldone4:
        vzeroupper
        ret

.Lxorpart4:
        # xor the remaining bytes from the partial keystream register %xmm10
        # into the output
        mov %rax,%r9
        and $0x0f,%r9
        jz .Ldone4
        and $~0x0f,%rax

        mov %rsi,%r11

        # set up a 16-byte, 32-byte-aligned scratch buffer on the stack
        lea 8(%rsp),%r10
        sub $0x10,%rsp
        and $~31,%rsp

        # copy the remaining input bytes into the scratch buffer
        lea (%rdx,%rax),%rsi
        mov %rsp,%rdi
        mov %r9,%rcx
        rep movsb

        # xor them with the keystream
        vpxor 0x00(%rsp),%xmm10,%xmm10
        vmovdqa %xmm10,0x00(%rsp)

        # copy the result to the output
        mov %rsp,%rsi
        lea (%r11,%rax),%rdi
        mov %r9,%rcx
        rep movsb

        lea -8(%r10),%rsp
        jmp .Ldone4

ENDPROC(chacha_4block_xor_avx2)

ENTRY(chacha_8block_xor_avx2)
        # %rdi: Input state matrix, s
        # %rsi: up to 8 data blocks output, o
        # %rdx: up to 8 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds
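
        # Presumed C-level prototype, for reference only (an assumption;
        # the authoritative declaration lives in the C glue code):
        #   void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
        #                               unsigned int len, int nrounds);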

        # This function encrypts eight consecutive ChaCha blocks by loading
        # the state matrix into AVX registers eight times (one register per
        # state word). As some scratch registers are needed, the first four
        # state rows are kept on the stack. The algorithm performs each
        # operation on the corresponding word of each state matrix, hence
        # requires no word shuffling. For the final XORing step the matrix is
        # transposed by interleaving 32-, 64- and then 128-bit words, which
        # allows XORing in full AVX registers. 8/16-bit word rotation is done
        # with byte shuffling (vpshufb);
        # 7/12-bit word rotation uses traditional shift+OR.
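        #
        # For reference, the shift+OR rotation below computes, e.g.:
        #   rotl32(x, 12) = (x << 12) | (x >> 20)
        #   rotl32(x,  7) = (x <<  7) | (x >> 25)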

        vzeroupper
        # 4 * 32 byte stack, 32-byte aligned
        lea 8(%rsp),%r10
        and $~31, %rsp
        sub $0x80, %rsp
        mov %rcx,%rax

        # x0..15[0-7] = s[0..15]: broadcast each state word across all
        # eight blocks
        vpbroadcastd 0x00(%rdi),%ymm0
        vpbroadcastd 0x04(%rdi),%ymm1
        vpbroadcastd 0x08(%rdi),%ymm2
        vpbroadcastd 0x0c(%rdi),%ymm3
        vpbroadcastd 0x10(%rdi),%ymm4
        vpbroadcastd 0x14(%rdi),%ymm5
        vpbroadcastd 0x18(%rdi),%ymm6
        vpbroadcastd 0x1c(%rdi),%ymm7
        vpbroadcastd 0x20(%rdi),%ymm8
        vpbroadcastd 0x24(%rdi),%ymm9
        vpbroadcastd 0x28(%rdi),%ymm10
        vpbroadcastd 0x2c(%rdi),%ymm11
        vpbroadcastd 0x30(%rdi),%ymm12
        vpbroadcastd 0x34(%rdi),%ymm13
        vpbroadcastd 0x38(%rdi),%ymm14
        vpbroadcastd 0x3c(%rdi),%ymm15

        # x0..3 are kept on the stack to free up scratch registers
        vmovdqa %ymm0,0x00(%rsp)
        vmovdqa %ymm1,0x20(%rsp)
        vmovdqa %ymm2,0x40(%rsp)
        vmovdqa %ymm3,0x60(%rsp)

        # counter increments and rotation masks
        vmovdqa CTRINC(%rip),%ymm1
        vmovdqa ROT8(%rip),%ymm2
        vmovdqa ROT16(%rip),%ymm3

        # x12 += counter values 0-7
        vpaddd %ymm1,%ymm12,%ymm12

.Ldoubleround8:
        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        vpaddd 0x00(%rsp),%ymm4,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpxor %ymm0,%ymm12,%ymm12
        vpshufb %ymm3,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        vpaddd 0x20(%rsp),%ymm5,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpxor %ymm0,%ymm13,%ymm13
        vpshufb %ymm3,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        vpaddd 0x40(%rsp),%ymm6,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpxor %ymm0,%ymm14,%ymm14
        vpshufb %ymm3,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        vpaddd 0x60(%rsp),%ymm7,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpxor %ymm0,%ymm15,%ymm15
        vpshufb %ymm3,%ymm15,%ymm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        vpaddd %ymm12,%ymm8,%ymm8
        vpxor %ymm8,%ymm4,%ymm4
        vpslld $12,%ymm4,%ymm0
        vpsrld $20,%ymm4,%ymm4
        vpor %ymm0,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        vpaddd %ymm13,%ymm9,%ymm9
        vpxor %ymm9,%ymm5,%ymm5
        vpslld $12,%ymm5,%ymm0
        vpsrld $20,%ymm5,%ymm5
        vpor %ymm0,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        vpaddd %ymm14,%ymm10,%ymm10
        vpxor %ymm10,%ymm6,%ymm6
        vpslld $12,%ymm6,%ymm0
        vpsrld $20,%ymm6,%ymm6
        vpor %ymm0,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        vpaddd %ymm15,%ymm11,%ymm11
        vpxor %ymm11,%ymm7,%ymm7
        vpslld $12,%ymm7,%ymm0
        vpsrld $20,%ymm7,%ymm7
        vpor %ymm0,%ymm7,%ymm7

        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        vpaddd 0x00(%rsp),%ymm4,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpxor %ymm0,%ymm12,%ymm12
        vpshufb %ymm2,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        vpaddd 0x20(%rsp),%ymm5,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpxor %ymm0,%ymm13,%ymm13
        vpshufb %ymm2,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        vpaddd 0x40(%rsp),%ymm6,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpxor %ymm0,%ymm14,%ymm14
        vpshufb %ymm2,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        vpaddd 0x60(%rsp),%ymm7,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpxor %ymm0,%ymm15,%ymm15
        vpshufb %ymm2,%ymm15,%ymm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        vpaddd %ymm12,%ymm8,%ymm8
        vpxor %ymm8,%ymm4,%ymm4
        vpslld $7,%ymm4,%ymm0
        vpsrld $25,%ymm4,%ymm4
        vpor %ymm0,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        vpaddd %ymm13,%ymm9,%ymm9
        vpxor %ymm9,%ymm5,%ymm5
        vpslld $7,%ymm5,%ymm0
        vpsrld $25,%ymm5,%ymm5
        vpor %ymm0,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        vpaddd %ymm14,%ymm10,%ymm10
        vpxor %ymm10,%ymm6,%ymm6
        vpslld $7,%ymm6,%ymm0
        vpsrld $25,%ymm6,%ymm6
        vpor %ymm0,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        vpaddd %ymm15,%ymm11,%ymm11
        vpxor %ymm11,%ymm7,%ymm7
        vpslld $7,%ymm7,%ymm0
        vpsrld $25,%ymm7,%ymm7
        vpor %ymm0,%ymm7,%ymm7

        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        vpaddd 0x00(%rsp),%ymm5,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpxor %ymm0,%ymm15,%ymm15
        vpshufb %ymm3,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        vpaddd 0x20(%rsp),%ymm6,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpxor %ymm0,%ymm12,%ymm12
        vpshufb %ymm3,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        vpaddd 0x40(%rsp),%ymm7,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpxor %ymm0,%ymm13,%ymm13
        vpshufb %ymm3,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        vpaddd 0x60(%rsp),%ymm4,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpxor %ymm0,%ymm14,%ymm14
        vpshufb %ymm3,%ymm14,%ymm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        vpaddd %ymm15,%ymm10,%ymm10
        vpxor %ymm10,%ymm5,%ymm5
        vpslld $12,%ymm5,%ymm0
        vpsrld $20,%ymm5,%ymm5
        vpor %ymm0,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        vpaddd %ymm12,%ymm11,%ymm11
        vpxor %ymm11,%ymm6,%ymm6
        vpslld $12,%ymm6,%ymm0
        vpsrld $20,%ymm6,%ymm6
        vpor %ymm0,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        vpaddd %ymm13,%ymm8,%ymm8
        vpxor %ymm8,%ymm7,%ymm7
        vpslld $12,%ymm7,%ymm0
        vpsrld $20,%ymm7,%ymm7
        vpor %ymm0,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        vpaddd %ymm14,%ymm9,%ymm9
        vpxor %ymm9,%ymm4,%ymm4
        vpslld $12,%ymm4,%ymm0
        vpsrld $20,%ymm4,%ymm4
        vpor %ymm0,%ymm4,%ymm4

        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        vpaddd 0x00(%rsp),%ymm5,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpxor %ymm0,%ymm15,%ymm15
        vpshufb %ymm2,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        vpaddd 0x20(%rsp),%ymm6,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpxor %ymm0,%ymm12,%ymm12
        vpshufb %ymm2,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        vpaddd 0x40(%rsp),%ymm7,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpxor %ymm0,%ymm13,%ymm13
        vpshufb %ymm2,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        vpaddd 0x60(%rsp),%ymm4,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpxor %ymm0,%ymm14,%ymm14
        vpshufb %ymm2,%ymm14,%ymm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        vpaddd %ymm15,%ymm10,%ymm10
        vpxor %ymm10,%ymm5,%ymm5
        vpslld $7,%ymm5,%ymm0
        vpsrld $25,%ymm5,%ymm5
        vpor %ymm0,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        vpaddd %ymm12,%ymm11,%ymm11
        vpxor %ymm11,%ymm6,%ymm6
        vpslld $7,%ymm6,%ymm0
        vpsrld $25,%ymm6,%ymm6
        vpor %ymm0,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        vpaddd %ymm13,%ymm8,%ymm8
        vpxor %ymm8,%ymm7,%ymm7
        vpslld $7,%ymm7,%ymm0
        vpsrld $25,%ymm7,%ymm7
        vpor %ymm0,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        vpaddd %ymm14,%ymm9,%ymm9
        vpxor %ymm9,%ymm4,%ymm4
        vpslld $7,%ymm4,%ymm0
        vpsrld $25,%ymm4,%ymm4
        vpor %ymm0,%ymm4,%ymm4

        sub $2,%r8d
        jnz .Ldoubleround8

        # x0..15[0-7] += s[0..15]
        vpbroadcastd 0x00(%rdi),%ymm0
        vpaddd 0x00(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x00(%rsp)
        vpbroadcastd 0x04(%rdi),%ymm0
        vpaddd 0x20(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x20(%rsp)
        vpbroadcastd 0x08(%rdi),%ymm0
        vpaddd 0x40(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x40(%rsp)
        vpbroadcastd 0x0c(%rdi),%ymm0
        vpaddd 0x60(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x60(%rsp)
        vpbroadcastd 0x10(%rdi),%ymm0
        vpaddd %ymm0,%ymm4,%ymm4
        vpbroadcastd 0x14(%rdi),%ymm0
        vpaddd %ymm0,%ymm5,%ymm5
        vpbroadcastd 0x18(%rdi),%ymm0
        vpaddd %ymm0,%ymm6,%ymm6
        vpbroadcastd 0x1c(%rdi),%ymm0
        vpaddd %ymm0,%ymm7,%ymm7
        vpbroadcastd 0x20(%rdi),%ymm0
        vpaddd %ymm0,%ymm8,%ymm8
        vpbroadcastd 0x24(%rdi),%ymm0
        vpaddd %ymm0,%ymm9,%ymm9
        vpbroadcastd 0x28(%rdi),%ymm0
        vpaddd %ymm0,%ymm10,%ymm10
        vpbroadcastd 0x2c(%rdi),%ymm0
        vpaddd %ymm0,%ymm11,%ymm11
        vpbroadcastd 0x30(%rdi),%ymm0
        vpaddd %ymm0,%ymm12,%ymm12
        vpbroadcastd 0x34(%rdi),%ymm0
        vpaddd %ymm0,%ymm13,%ymm13
        vpbroadcastd 0x38(%rdi),%ymm0
        vpaddd %ymm0,%ymm14,%ymm14
        vpbroadcastd 0x3c(%rdi),%ymm0
        vpaddd %ymm0,%ymm15,%ymm15

        # x12 += counter values 0-7
        vpaddd %ymm1,%ymm12,%ymm12

        # interleave 32-bit words in state n, n+1
        vmovdqa 0x00(%rsp),%ymm0
        vmovdqa 0x20(%rsp),%ymm1
        vpunpckldq %ymm1,%ymm0,%ymm2
        vpunpckhdq %ymm1,%ymm0,%ymm1
        vmovdqa %ymm2,0x00(%rsp)
        vmovdqa %ymm1,0x20(%rsp)
        vmovdqa 0x40(%rsp),%ymm0
        vmovdqa 0x60(%rsp),%ymm1
        vpunpckldq %ymm1,%ymm0,%ymm2
        vpunpckhdq %ymm1,%ymm0,%ymm1
        vmovdqa %ymm2,0x40(%rsp)
        vmovdqa %ymm1,0x60(%rsp)
        vmovdqa %ymm4,%ymm0
        vpunpckldq %ymm5,%ymm0,%ymm4
        vpunpckhdq %ymm5,%ymm0,%ymm5
        vmovdqa %ymm6,%ymm0
        vpunpckldq %ymm7,%ymm0,%ymm6
        vpunpckhdq %ymm7,%ymm0,%ymm7
        vmovdqa %ymm8,%ymm0
        vpunpckldq %ymm9,%ymm0,%ymm8
        vpunpckhdq %ymm9,%ymm0,%ymm9
        vmovdqa %ymm10,%ymm0
        vpunpckldq %ymm11,%ymm0,%ymm10
        vpunpckhdq %ymm11,%ymm0,%ymm11
        vmovdqa %ymm12,%ymm0
        vpunpckldq %ymm13,%ymm0,%ymm12
        vpunpckhdq %ymm13,%ymm0,%ymm13
        vmovdqa %ymm14,%ymm0
        vpunpckldq %ymm15,%ymm0,%ymm14
        vpunpckhdq %ymm15,%ymm0,%ymm15

        # interleave 64-bit words in state n, n+2
        vmovdqa 0x00(%rsp),%ymm0
        vmovdqa 0x40(%rsp),%ymm2
        vpunpcklqdq %ymm2,%ymm0,%ymm1
        vpunpckhqdq %ymm2,%ymm0,%ymm2
        vmovdqa %ymm1,0x00(%rsp)
        vmovdqa %ymm2,0x40(%rsp)
        vmovdqa 0x20(%rsp),%ymm0
        vmovdqa 0x60(%rsp),%ymm2
        vpunpcklqdq %ymm2,%ymm0,%ymm1
        vpunpckhqdq %ymm2,%ymm0,%ymm2
        vmovdqa %ymm1,0x20(%rsp)
        vmovdqa %ymm2,0x60(%rsp)
        vmovdqa %ymm4,%ymm0
        vpunpcklqdq %ymm6,%ymm0,%ymm4
        vpunpckhqdq %ymm6,%ymm0,%ymm6
        vmovdqa %ymm5,%ymm0
        vpunpcklqdq %ymm7,%ymm0,%ymm5
        vpunpckhqdq %ymm7,%ymm0,%ymm7
        vmovdqa %ymm8,%ymm0
        vpunpcklqdq %ymm10,%ymm0,%ymm8
        vpunpckhqdq %ymm10,%ymm0,%ymm10
        vmovdqa %ymm9,%ymm0
        vpunpcklqdq %ymm11,%ymm0,%ymm9
        vpunpckhqdq %ymm11,%ymm0,%ymm11
        vmovdqa %ymm12,%ymm0
        vpunpcklqdq %ymm14,%ymm0,%ymm12
        vpunpckhqdq %ymm14,%ymm0,%ymm14
        vmovdqa %ymm13,%ymm0
        vpunpcklqdq %ymm15,%ymm0,%ymm13
        vpunpckhqdq %ymm15,%ymm0,%ymm15

        # interleave 128-bit words in state n, n+4
        # xor/write the first four blocks
        vmovdqa 0x00(%rsp),%ymm1
        vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
        cmp $0x0020,%rax
        jl .Lxorpart8
        vpxor 0x0000(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0000(%rsi)
        vperm2i128 $0x31,%ymm4,%ymm1,%ymm4

        vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
        cmp $0x0040,%rax
        jl .Lxorpart8
        vpxor 0x0020(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0020(%rsi)
        vperm2i128 $0x31,%ymm12,%ymm8,%ymm12

        vmovdqa 0x40(%rsp),%ymm1
        vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
        cmp $0x0060,%rax
        jl .Lxorpart8
        vpxor 0x0040(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0040(%rsi)
        vperm2i128 $0x31,%ymm6,%ymm1,%ymm6

        vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
        cmp $0x0080,%rax
        jl .Lxorpart8
        vpxor 0x0060(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0060(%rsi)
        vperm2i128 $0x31,%ymm14,%ymm10,%ymm14

        vmovdqa 0x20(%rsp),%ymm1
        vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
        cmp $0x00a0,%rax
        jl .Lxorpart8
        vpxor 0x0080(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0080(%rsi)
        vperm2i128 $0x31,%ymm5,%ymm1,%ymm5

        vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
        cmp $0x00c0,%rax
        jl .Lxorpart8
        vpxor 0x00a0(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x00a0(%rsi)
        vperm2i128 $0x31,%ymm13,%ymm9,%ymm13

        vmovdqa 0x60(%rsp),%ymm1
        vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
        cmp $0x00e0,%rax
        jl .Lxorpart8
        vpxor 0x00c0(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x00c0(%rsi)
        vperm2i128 $0x31,%ymm7,%ymm1,%ymm7

        vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
        cmp $0x0100,%rax
        jl .Lxorpart8
        vpxor 0x00e0(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x00e0(%rsi)
        vperm2i128 $0x31,%ymm15,%ymm11,%ymm15

        # xor the remaining blocks, write to output
        vmovdqa %ymm4,%ymm0
        cmp $0x0120,%rax
        jl .Lxorpart8
        vpxor 0x0100(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0100(%rsi)

        vmovdqa %ymm12,%ymm0
        cmp $0x0140,%rax
        jl .Lxorpart8
        vpxor 0x0120(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0120(%rsi)

        vmovdqa %ymm6,%ymm0
        cmp $0x0160,%rax
        jl .Lxorpart8
        vpxor 0x0140(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0140(%rsi)

        vmovdqa %ymm14,%ymm0
        cmp $0x0180,%rax
        jl .Lxorpart8
        vpxor 0x0160(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0160(%rsi)

        vmovdqa %ymm5,%ymm0
        cmp $0x01a0,%rax
        jl .Lxorpart8
        vpxor 0x0180(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x0180(%rsi)

        vmovdqa %ymm13,%ymm0
        cmp $0x01c0,%rax
        jl .Lxorpart8
        vpxor 0x01a0(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x01a0(%rsi)

        vmovdqa %ymm7,%ymm0
        cmp $0x01e0,%rax
        jl .Lxorpart8
        vpxor 0x01c0(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x01c0(%rsi)

        vmovdqa %ymm15,%ymm0
        cmp $0x0200,%rax
        jl .Lxorpart8
        vpxor 0x01e0(%rdx),%ymm0,%ymm0
        vmovdqu %ymm0,0x01e0(%rsi)

.Ldone8:
        vzeroupper
        lea -8(%r10),%rsp
        ret

.Lxorpart8:
        # xor the remaining bytes from the partial keystream register %ymm0
        # into the output
        mov %rax,%r9
        and $0x1f,%r9
        jz .Ldone8
        and $~0x1f,%rax

        mov %rsi,%r11

        # copy the remaining input bytes into the scratch area on the stack
        lea (%rdx,%rax),%rsi
        mov %rsp,%rdi
        mov %r9,%rcx
        rep movsb

        # xor them with the keystream
        vpxor 0x00(%rsp),%ymm0,%ymm0
        vmovdqa %ymm0,0x00(%rsp)

        # copy the result to the output
        mov %rsp,%rsi
        lea (%r11,%rax),%rdi
        mov %r9,%rcx
        rep movsb

        jmp .Ldone8

ENDPROC(chacha_8block_xor_avx2)