/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
 *
 * Copyright (C) 2018 Martin Willi
 */

#include <linux/linkage.h>

.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004
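
# CTR2BL adds the block-counter increments 0 and 1 (and CTR4BL adds 2 and 3)
# to the low dword of each 128-bit state copy, i.e. to ChaCha state word 12;
# CTR8BL adds 0..7 across the eight dword lanes holding the broadcast counter
# word in the 8-block variant.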

.text

ENTRY(chacha_2block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
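
	# Equivalent C prototype, assumed to match the kernel glue code's
	# declaration (the asmlinkage convention follows the register
	# comments above):
	#
	#   asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst,
	#					       const u8 *src,
	#					       unsigned int len,
	#					       int nrounds);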

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix
	# operations on four words in each matrix in parallel, but requires
	# shuffling to rearrange the words after each round.

	vzeroupper

	# x0..3[0-1] = s0..3, with the block counter bumped for block 1
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vpaddd		CTR2BL(%rip),%ymm3,%ymm3

	# save initial state for the final feed-forward addition
	vmovdqa		%ymm0,%ymm8
	vmovdqa		%ymm1,%ymm9
	vmovdqa		%ymm2,%ymm10
	vmovdqa		%ymm3,%ymm11

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3

	sub		$2,%r8d
	jnz		.Ldoubleround
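
	# Each pass through .Ldoubleround covers two ChaCha rounds, so
	# nrounds must be even (20 for ChaCha20, 12 for ChaCha12).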

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd		%ymm8,%ymm0,%ymm7
	cmp		$0x10,%rcx
	jl		.Lxorpart2
	vpxord		0x00(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0

	# o1 = i1 ^ (x1 + s1), first block
	vpaddd		%ymm9,%ymm1,%ymm7
	cmp		$0x20,%rcx
	jl		.Lxorpart2
	vpxord		0x10(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1

	# o2 = i2 ^ (x2 + s2), first block
	vpaddd		%ymm10,%ymm2,%ymm7
	cmp		$0x30,%rcx
	jl		.Lxorpart2
	vpxord		0x20(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2

	# o3 = i3 ^ (x3 + s3), first block
	vpaddd		%ymm11,%ymm3,%ymm7
	cmp		$0x40,%rcx
	jl		.Lxorpart2
	vpxord		0x30(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3

	# xor and write the second block, kept in the high lanes
	# extracted above
	vmovdqa		%xmm0,%xmm7
	cmp		$0x50,%rcx
	jl		.Lxorpart2
	vpxord		0x40(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x40(%rsi)

	vmovdqa		%xmm1,%xmm7
	cmp		$0x60,%rcx
	jl		.Lxorpart2
	vpxord		0x50(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x50(%rsi)

	vmovdqa		%xmm2,%xmm7
	cmp		$0x70,%rcx
	jl		.Lxorpart2
	vpxord		0x60(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x60(%rsi)

	vmovdqa		%xmm3,%xmm7
	cmp		$0x80,%rcx
	jl		.Lxorpart2
	vpxord		0x70(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	ret

.Lxorpart2:
	# xor remaining bytes from partial register into output
	mov		%rcx,%rax
	and		$0xf,%rcx
	jz		.Ldone2
	mov		%rax,%r9
	and		$~0xf,%r9

	# build a byte mask for the tail: %rax = (1 << %cl) - 1
	mov		$1,%rax
	shld		%cl,%rax,%rax
	sub		$1,%rax
	kmovq		%rax,%k1

	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord		%xmm7,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}
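
	# Worked example (hypothetical length, for illustration): for a
	# 0x45-byte request, %rcx = 5 after the 'and' and %r9 = 0x40, so the
	# sequence above computes %rax = (1 << 5) - 1 = 0x1f, and %k1 masks
	# the load, xor and store down to the final five bytes at offset
	# 0x40.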

	jmp		.Ldone2

ENDPROC(chacha_2block_xor_avx512vl)

ENTRY(chacha_4block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
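
	# C-level shape (assumed, as above): same prototype as
	# chacha_2block_xor_avx512vl, handling up to four blocks.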

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, sequentially
	# to the operations on the four words of the other two matrices.
	# The word shuffling has rather high latency, so the arithmetic on
	# two matrix pairs can be interleaved without much slowdown.

	vzeroupper

	# x0..3[0-1] = s0..3 for blocks 0/1, x4..7[0-1] = s0..3 for blocks 2/3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vmovdqa		%ymm0,%ymm4
	vmovdqa		%ymm1,%ymm5
	vmovdqa		%ymm2,%ymm6
	vmovdqa		%ymm3,%ymm7

	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
	vpaddd		CTR4BL(%rip),%ymm7,%ymm7

	# save initial state for the final feed-forward addition
	vmovdqa		%ymm0,%ymm11
	vmovdqa		%ymm1,%ymm12
	vmovdqa		%ymm2,%ymm13
	vmovdqa		%ymm3,%ymm14
	vmovdqa		%ymm7,%ymm15

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxord		%ymm4,%ymm7,%ymm7
	vprold		$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxord		%ymm6,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxord		%ymm4,%ymm7,%ymm7
	vprold		$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxord		%ymm6,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	vpshufd		$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3
	vpshufd		$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxord		%ymm4,%ymm7,%ymm7
	vprold		$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxord		%ymm6,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxord		%ymm4,%ymm7,%ymm7
	vprold		$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxord		%ymm6,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	vpshufd		$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3
	vpshufd		$0x39,%ymm7,%ymm7

	sub		$2,%r8d
	jnz		.Ldoubleround4

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd		%ymm11,%ymm0,%ymm10
	cmp		$0x10,%rcx
	jl		.Lxorpart4
	vpxord		0x00(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x00(%rsi)
	vextracti128	$1,%ymm10,%xmm0

	# o1 = i1 ^ (x1 + s1), first block
	vpaddd		%ymm12,%ymm1,%ymm10
	cmp		$0x20,%rcx
	jl		.Lxorpart4
	vpxord		0x10(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1

	# o2 = i2 ^ (x2 + s2), first block
	vpaddd		%ymm13,%ymm2,%ymm10
	cmp		$0x30,%rcx
	jl		.Lxorpart4
	vpxord		0x20(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2

	# o3 = i3 ^ (x3 + s3), first block
	vpaddd		%ymm14,%ymm3,%ymm10
	cmp		$0x40,%rcx
	jl		.Lxorpart4
	vpxord		0x30(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3

	# xor and write the second block, extracted above
	vmovdqa		%xmm0,%xmm10
	cmp		$0x50,%rcx
	jl		.Lxorpart4
	vpxord		0x40(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x40(%rsi)

	vmovdqa		%xmm1,%xmm10
	cmp		$0x60,%rcx
	jl		.Lxorpart4
	vpxord		0x50(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x50(%rsi)

	vmovdqa		%xmm2,%xmm10
	cmp		$0x70,%rcx
	jl		.Lxorpart4
	vpxord		0x60(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x60(%rsi)

	vmovdqa		%xmm3,%xmm10
	cmp		$0x80,%rcx
	jl		.Lxorpart4
	vpxord		0x70(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x70(%rsi)

	# xor and write the third block, computed from x4..7
	vpaddd		%ymm11,%ymm4,%ymm10
	cmp		$0x90,%rcx
	jl		.Lxorpart4
	vpxord		0x80(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x80(%rsi)
	vextracti128	$1,%ymm10,%xmm4

	vpaddd		%ymm12,%ymm5,%ymm10
	cmp		$0xa0,%rcx
	jl		.Lxorpart4
	vpxord		0x90(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5

	vpaddd		%ymm13,%ymm6,%ymm10
	cmp		$0xb0,%rcx
	jl		.Lxorpart4
	vpxord		0xa0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6

	vpaddd		%ymm15,%ymm7,%ymm10
	cmp		$0xc0,%rcx
	jl		.Lxorpart4
	vpxord		0xb0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write the fourth block, extracted above
	vmovdqa		%xmm4,%xmm10
	cmp		$0xd0,%rcx
	jl		.Lxorpart4
	vpxord		0xc0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xc0(%rsi)

	vmovdqa		%xmm5,%xmm10
	cmp		$0xe0,%rcx
	jl		.Lxorpart4
	vpxord		0xd0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xd0(%rsi)

	vmovdqa		%xmm6,%xmm10
	cmp		$0xf0,%rcx
	jl		.Lxorpart4
	vpxord		0xe0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xe0(%rsi)

	vmovdqa		%xmm7,%xmm10
	cmp		$0x100,%rcx
	jl		.Lxorpart4
	vpxord		0xf0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	ret

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov		%rcx,%rax
	and		$0xf,%rcx
	jz		.Ldone4
	mov		%rax,%r9
	and		$~0xf,%r9

	# byte mask for the tail, as in .Lxorpart2 above
	mov		$1,%rax
	shld		%cl,%rax,%rax
	sub		$1,%rax
	kmovq		%rax,%k1

	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord		%xmm10,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

	jmp		.Ldone4

ENDPROC(chacha_4block_xor_avx512vl)

ENTRY(chacha_8block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
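
	# C-level shape (assumed, as above): same prototype as
	# chacha_2block_xor_avx512vl, handling up to eight blocks.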

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix into AVX registers eight times, one 32-bit state
	# word per dword lane. It benefits from the AVX-512VL rotate
	# instructions and the additional ymm16-31 registers.

	vzeroupper

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15

	# x12 += counter values 0-7
	vpaddd		CTR8BL(%rip),%ymm12,%ymm12

	# save initial state for the final feed-forward addition
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm1,%ymm17
	vmovdqa64	%ymm2,%ymm18
	vmovdqa64	%ymm3,%ymm19
	vmovdqa64	%ymm4,%ymm20
	vmovdqa64	%ymm5,%ymm21
	vmovdqa64	%ymm6,%ymm22
	vmovdqa64	%ymm7,%ymm23
	vmovdqa64	%ymm8,%ymm24
	vmovdqa64	%ymm9,%ymm25
	vmovdqa64	%ymm10,%ymm26
	vmovdqa64	%ymm11,%ymm27
	vmovdqa64	%ymm12,%ymm28
	vmovdqa64	%ymm13,%ymm29
	vmovdqa64	%ymm14,%ymm30
	vmovdqa64	%ymm15,%ymm31

.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd		%ymm0,%ymm4,%ymm0
	vpxord		%ymm0,%ymm12,%ymm12
	vprold		$16,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd		%ymm1,%ymm5,%ymm1
	vpxord		%ymm1,%ymm13,%ymm13
	vprold		$16,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd		%ymm2,%ymm6,%ymm2
	vpxord		%ymm2,%ymm14,%ymm14
	vprold		$16,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd		%ymm3,%ymm7,%ymm3
	vpxord		%ymm3,%ymm15,%ymm15
	vprold		$16,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxord		%ymm8,%ymm4,%ymm4
	vprold		$12,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxord		%ymm9,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxord		%ymm10,%ymm6,%ymm6
	vprold		$12,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxord		%ymm11,%ymm7,%ymm7
	vprold		$12,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd		%ymm0,%ymm4,%ymm0
	vpxord		%ymm0,%ymm12,%ymm12
	vprold		$8,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd		%ymm1,%ymm5,%ymm1
	vpxord		%ymm1,%ymm13,%ymm13
	vprold		$8,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd		%ymm2,%ymm6,%ymm2
	vpxord		%ymm2,%ymm14,%ymm14
	vprold		$8,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd		%ymm3,%ymm7,%ymm3
	vpxord		%ymm3,%ymm15,%ymm15
	vprold		$8,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxord		%ymm8,%ymm4,%ymm4
	vprold		$7,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxord		%ymm9,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxord		%ymm10,%ymm6,%ymm6
	vprold		$7,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxord		%ymm11,%ymm7,%ymm7
	vprold		$7,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd		%ymm0,%ymm5,%ymm0
	vpxord		%ymm0,%ymm15,%ymm15
	vprold		$16,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd		%ymm1,%ymm6,%ymm1
	vpxord		%ymm1,%ymm12,%ymm12
	vprold		$16,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd		%ymm2,%ymm7,%ymm2
	vpxord		%ymm2,%ymm13,%ymm13
	vprold		$16,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd		%ymm3,%ymm4,%ymm3
	vpxord		%ymm3,%ymm14,%ymm14
	vprold		$16,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxord		%ymm10,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxord		%ymm11,%ymm6,%ymm6
	vprold		$12,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxord		%ymm8,%ymm7,%ymm7
	vprold		$12,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxord		%ymm9,%ymm4,%ymm4
	vprold		$12,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd		%ymm0,%ymm5,%ymm0
	vpxord		%ymm0,%ymm15,%ymm15
	vprold		$8,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd		%ymm1,%ymm6,%ymm1
	vpxord		%ymm1,%ymm12,%ymm12
	vprold		$8,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd		%ymm2,%ymm7,%ymm2
	vpxord		%ymm2,%ymm13,%ymm13
	vprold		$8,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd		%ymm3,%ymm4,%ymm3
	vpxord		%ymm3,%ymm14,%ymm14
	vprold		$8,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxord		%ymm10,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxord		%ymm11,%ymm6,%ymm6
	vprold		$7,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxord		%ymm8,%ymm7,%ymm7
	vprold		$7,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxord		%ymm9,%ymm4,%ymm4
	vprold		$7,%ymm4,%ymm4

	sub		$2,%r8d
	jnz		.Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpaddd		%ymm16,%ymm0,%ymm0
	vpaddd		%ymm17,%ymm1,%ymm1
	vpaddd		%ymm18,%ymm2,%ymm2
	vpaddd		%ymm19,%ymm3,%ymm3
	vpaddd		%ymm20,%ymm4,%ymm4
	vpaddd		%ymm21,%ymm5,%ymm5
	vpaddd		%ymm22,%ymm6,%ymm6
	vpaddd		%ymm23,%ymm7,%ymm7
	vpaddd		%ymm24,%ymm8,%ymm8
	vpaddd		%ymm25,%ymm9,%ymm9
	vpaddd		%ymm26,%ymm10,%ymm10
	vpaddd		%ymm27,%ymm11,%ymm11
	vpaddd		%ymm28,%ymm12,%ymm12
	vpaddd		%ymm29,%ymm13,%ymm13
	vpaddd		%ymm30,%ymm14,%ymm14
	vpaddd		%ymm31,%ymm15,%ymm15

	# interleave 32-bit words in state n, n+1
	vpunpckldq	%ymm1,%ymm0,%ymm16
	vpunpckhdq	%ymm1,%ymm0,%ymm17
	vpunpckldq	%ymm3,%ymm2,%ymm18
	vpunpckhdq	%ymm3,%ymm2,%ymm19
	vpunpckldq	%ymm5,%ymm4,%ymm20
	vpunpckhdq	%ymm5,%ymm4,%ymm21
	vpunpckldq	%ymm7,%ymm6,%ymm22
	vpunpckhdq	%ymm7,%ymm6,%ymm23
	vpunpckldq	%ymm9,%ymm8,%ymm24
	vpunpckhdq	%ymm9,%ymm8,%ymm25
	vpunpckldq	%ymm11,%ymm10,%ymm26
	vpunpckhdq	%ymm11,%ymm10,%ymm27
	vpunpckldq	%ymm13,%ymm12,%ymm28
	vpunpckhdq	%ymm13,%ymm12,%ymm29
	vpunpckldq	%ymm15,%ymm14,%ymm30
	vpunpckhdq	%ymm15,%ymm14,%ymm31

	# interleave 64-bit words in state n, n+2
	vpunpcklqdq	%ymm18,%ymm16,%ymm0
	vpunpcklqdq	%ymm19,%ymm17,%ymm1
	vpunpckhqdq	%ymm18,%ymm16,%ymm2
	vpunpckhqdq	%ymm19,%ymm17,%ymm3
	vpunpcklqdq	%ymm22,%ymm20,%ymm4
	vpunpcklqdq	%ymm23,%ymm21,%ymm5
	vpunpckhqdq	%ymm22,%ymm20,%ymm6
	vpunpckhqdq	%ymm23,%ymm21,%ymm7
	vpunpcklqdq	%ymm26,%ymm24,%ymm8
	vpunpcklqdq	%ymm27,%ymm25,%ymm9
	vpunpckhqdq	%ymm26,%ymm24,%ymm10
	vpunpckhqdq	%ymm27,%ymm25,%ymm11
	vpunpcklqdq	%ymm30,%ymm28,%ymm12
	vpunpcklqdq	%ymm31,%ymm29,%ymm13
	vpunpckhqdq	%ymm30,%ymm28,%ymm14
	vpunpckhqdq	%ymm31,%ymm29,%ymm15
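
	# Together, the dword/qword unpacks above and the 128-bit permutes
	# below transpose the 16x8 matrix of state dwords: each output ymm
	# then holds 32 contiguous keystream bytes, with the low lanes
	# forming blocks 0-3 and the high lanes blocks 4-7.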

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa64	%ymm0,%ymm16
	vperm2i128	$0x20,%ymm4,%ymm0,%ymm0
	cmp		$0x0020,%rcx
	jl		.Lxorpart8
	vpxord		0x0000(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0000(%rsi)
	vmovdqa64	%ymm16,%ymm0
	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	cmp		$0x0040,%rcx
	jl		.Lxorpart8
	vpxord		0x0020(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vperm2i128	$0x20,%ymm6,%ymm2,%ymm0
	cmp		$0x0060,%rcx
	jl		.Lxorpart8
	vpxord		0x0040(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm2,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	cmp		$0x0080,%rcx
	jl		.Lxorpart8
	vpxord		0x0060(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	cmp		$0x00a0,%rcx
	jl		.Lxorpart8
	vpxord		0x0080(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	cmp		$0x00c0,%rcx
	jl		.Lxorpart8
	vpxord		0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vperm2i128	$0x20,%ymm7,%ymm3,%ymm0
	cmp		$0x00e0,%rcx
	jl		.Lxorpart8
	vpxord		0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm3,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	cmp		$0x0100,%rcx
	jl		.Lxorpart8
	vpxord		0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa64	%ymm4,%ymm0
	cmp		$0x0120,%rcx
	jl		.Lxorpart8
	vpxord		0x0100(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0100(%rsi)

	vmovdqa64	%ymm12,%ymm0
	cmp		$0x0140,%rcx
	jl		.Lxorpart8
	vpxord		0x0120(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0120(%rsi)

	vmovdqa64	%ymm6,%ymm0
	cmp		$0x0160,%rcx
	jl		.Lxorpart8
	vpxord		0x0140(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0140(%rsi)

	vmovdqa64	%ymm14,%ymm0
	cmp		$0x0180,%rcx
	jl		.Lxorpart8
	vpxord		0x0160(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0160(%rsi)

	vmovdqa64	%ymm5,%ymm0
	cmp		$0x01a0,%rcx
	jl		.Lxorpart8
	vpxord		0x0180(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0180(%rsi)

	vmovdqa64	%ymm13,%ymm0
	cmp		$0x01c0,%rcx
	jl		.Lxorpart8
	vpxord		0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01a0(%rsi)

	vmovdqa64	%ymm7,%ymm0
	cmp		$0x01e0,%rcx
	jl		.Lxorpart8
	vpxord		0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01c0(%rsi)

	vmovdqa64	%ymm15,%ymm0
	cmp		$0x0200,%rcx
	jl		.Lxorpart8
	vpxord		0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	ret

.Lxorpart8:
	# xor remaining bytes from partial register into output
	mov		%rcx,%rax
	and		$0x1f,%rcx
	jz		.Ldone8
	mov		%rax,%r9
	and		$~0x1f,%r9

	# byte mask for the tail, as in .Lxorpart2 above
	mov		$1,%rax
	shld		%cl,%rax,%rax
	sub		$1,%rax
	kmovq		%rax,%k1

	vmovdqu8	(%rdx,%r9),%ymm1{%k1}{z}
	vpxord		%ymm0,%ymm1,%ymm1
	vmovdqu8	%ymm1,(%rsi,%r9){%k1}

	jmp		.Ldone8

ENDPROC(chacha_8block_xor_avx512vl)