1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 #include <linux/linkage.h>
22 #include <asm/assembler.h>
23 #include <asm/cache.h>
24
25 .text
26 .align 6
27
28 // chacha_permute - run the ChaCha rounds over one block, in place
29 //
30 // The state is held as four rows of four 32-bit words in v0-v3. The
31 // column quarter-rounds are computed on all four lanes at once; rows 1-3
32 // are then rotated with ext so the diagonal quarter-rounds can reuse the
33 // same lane-wise code, and rotated back at the end of each pass.
34 //
35 // In: v0-v3 = state rows, w3 = number of rounds. Each loop pass performs
36 // two rounds and the loop exits only when w3 reaches exactly zero.
37 // Out: v0-v3 = permuted state (no feed-forward addition is done here).
38 // Clobbers: w3, x10, v4, v12, condition flags.
39 chacha_permute:
40
41 adr_l x10, ROT8
42 ld1 {v12.4s}, [x10] // v12 = byte-permutation indices for rotl32 by 8
43
44 .Ldoubleround:
45 // x0 += x1, x3 = rotl32(x3 ^ x0, 16); rev32 on halfwords swaps the two 16-bit halves of each word
46 add v0.4s, v0.4s, v1.4s
47 eor v3.16b, v3.16b, v0.16b
48 rev32 v3.8h, v3.8h
49
50 // x2 += x3, x1 = rotl32(x1 ^ x2, 12); shl #12 then sri #20 inserts the wrapped-around bits
51 add v2.4s, v2.4s, v3.4s
52 eor v4.16b, v1.16b, v2.16b
53 shl v1.4s, v4.4s, #12
54 sri v1.4s, v4.4s, #20
55
56 // x0 += x1, x3 = rotl32(x3 ^ x0, 8); done as a byte shuffle through the ROT8 table
57 add v0.4s, v0.4s, v1.4s
58 eor v3.16b, v3.16b, v0.16b
59 tbl v3.16b, {v3.16b}, v12.16b
60
61 // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
62 add v2.4s, v2.4s, v3.4s
63 eor v4.16b, v1.16b, v2.16b
64 shl v1.4s, v4.4s, #7
65 sri v1.4s, v4.4s, #25
66
67 // Rotate row 1 by one word, row 2 by two and row 3 by three so that the diagonals line up as columns
68 ext v1.16b, v1.16b, v1.16b, #4
69
70 ext v2.16b, v2.16b, v2.16b, #8
71
72 ext v3.16b, v3.16b, v3.16b, #12
73
74 // Second (diagonal) round: x0 += x1, x3 = rotl32(x3 ^ x0, 16)
75 add v0.4s, v0.4s, v1.4s
76 eor v3.16b, v3.16b, v0.16b
77 rev32 v3.8h, v3.8h
78
79 // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
80 add v2.4s, v2.4s, v3.4s
81 eor v4.16b, v1.16b, v2.16b
82 shl v1.4s, v4.4s, #12
83 sri v1.4s, v4.4s, #20
84
85 // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
86 add v0.4s, v0.4s, v1.4s
87 eor v3.16b, v3.16b, v0.16b
88 tbl v3.16b, {v3.16b}, v12.16b
89
90 // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
91 add v2.4s, v2.4s, v3.4s
92 eor v4.16b, v1.16b, v2.16b
93 shl v1.4s, v4.4s, #7
94 sri v1.4s, v4.4s, #25
95
96 // Undo the row rotations (12 + 4, 8 + 8 and 4 + 12 bytes each make a full 16-byte turn)
97 ext v1.16b, v1.16b, v1.16b, #12
98
99 ext v2.16b, v2.16b, v2.16b, #8
100
101 ext v3.16b, v3.16b, v3.16b, #4
102
103 subs w3, w3, #2 // two rounds consumed per pass
104 b.ne .Ldoubleround
105
106 ret
107 ENDPROC(chacha_permute)
108
109 ENTRY(chacha_block_xor_neon)
110 // x0: input state matrix, s (64 bytes are read)
111 // x1: output, o (one 64-byte block is written)
112 // x2: input, i (one 64-byte block is read)
113 // w3: number of rounds; passed through untouched to chacha_permute
114 // NOTE(review): v8-v12 are overwritten without being saved — presumably the caller owns the NEON register file; confirm
115 stp x29, x30, [sp, #-16]!
116 mov x29, sp
117
118 // v0-v3 = working state x0..3, v8-v11 = untouched copy s0..3 for the feed-forward addition
119 ld1 {v0.4s-v3.4s}, [x0]
120 ld1 {v8.4s-v11.4s}, [x0]
121
122 bl chacha_permute
123
124 ld1 {v4.16b-v7.16b}, [x2] // loaded after the call because chacha_permute clobbers v4
125
126 // o0 = i0 ^ (x0 + s0)
127 add v0.4s, v0.4s, v8.4s
128 eor v0.16b, v0.16b, v4.16b
129
130 // o1 = i1 ^ (x1 + s1)
131 add v1.4s, v1.4s, v9.4s
132 eor v1.16b, v1.16b, v5.16b
133
134 // o2 = i2 ^ (x2 + s2)
135 add v2.4s, v2.4s, v10.4s
136 eor v2.16b, v2.16b, v6.16b
137
138 // o3 = i3 ^ (x3 + s3)
139 add v3.4s, v3.4s, v11.4s
140 eor v3.16b, v3.16b, v7.16b
141
142 st1 {v0.16b-v3.16b}, [x1]
143
144 ldp x29, x30, [sp], #16
145 ret
146 ENDPROC(chacha_block_xor_neon)
147
148 ENTRY(hchacha_block_neon)
149 // x0: input state matrix (64 bytes are read)
150 // x1: output, 32 bytes: rows 0 and 3 of the permuted state
151 // w2: number of rounds
152
153 stp x29, x30, [sp, #-16]!
154 mov x29, sp
155
156 ld1 {v0.4s-v3.4s}, [x0]
157
158 mov w3, w2 // chacha_permute takes its round count in w3
159 bl chacha_permute
160 // Unlike chacha_block_xor_neon, the original state is not added back; only words 0-3 and 12-15 are written out
161 st1 {v0.4s}, [x1], #16
162 st1 {v3.4s}, [x1]
163
164 ldp x29, x30, [sp], #16
165 ret
166 ENDPROC(hchacha_block_neon)
167 // Scalar aliases used by chacha_4block_xor_neon: aN holds state word N of the block computed in general-purpose registers. w18 is skipped.
168 a0 .req w12
169 a1 .req w13
170 a2 .req w14
171 a3 .req w15
172 a4 .req w16
173 a5 .req w17
174 a6 .req w19 // a6-a15 live in w19-w28, which are callee-saved under AAPCS64
175 a7 .req w20
176 a8 .req w21
177 a9 .req w22
178 a10 .req w23
179 a11 .req w24
180 a12 .req w25
181 a13 .req w26
182 a14 .req w27
183 a15 .req w28
184
185 .align 6
186 ENTRY(chacha_4block_xor_neon)
187 frame_push 10
188 // NOTE(review): frame_push/frame_pop come from <asm/assembler.h>; presumably they save/restore x29/x30 plus the ten registers x19-x28 behind a6-a15 — confirm
189 // x0: input state matrix, s (sixteen 32-bit words)
190 // x1: output buffer, o
191 // x2: input buffer, i
192 // w3: number of rounds (two per loop pass; the loop exits only when it reaches exactly 0)
193 // x4: byte count. NOTE(review): the code below appears to assume 64 <= x4 <= 320 — verify against the caller
194
195 adr_l x10, .Lpermute
196 and x5, x4, #63
197 add x10, x10, x5
198 add x11, x10, #64
199
200 // x10 = .Lpermute + (x4 % 64) and x11 = x10 + 64 address two index
201 // vectors that are only loaded by the tail code at labels 0-3 below.
202 //
203 // Five blocks are computed per call. Four run in NEON registers with
204 // state word N replicated across the lanes of vN (one lane per block),
205 // so no shuffling is needed between column and diagonal rounds. A fifth
206 // block is computed in the scalar registers a0-a15, interleaved with the
207 // NEON instructions. In the comments below xN means state word N, i.e.
208 // vN for the NEON blocks and aN for the scalar block.
209 //
210 // v30 = CTRINC and v31 = ROT8: the two tables are adjacent in .rodata and are fetched by one 32-byte load.
211 adr_l x9, CTRINC
212 ld1 {v30.4s-v31.4s}, [x9]
213
214 // x0..15[0-3] = s0..15: each ld4r reads four words and replicates each one into every lane of its register
215 add x8, x0, #16
216 ld4r { v0.4s- v3.4s}, [x0]
217 ld4r { v4.4s- v7.4s}, [x8], #16
218 ld4r { v8.4s-v11.4s}, [x8], #16
219 ld4r {v12.4s-v15.4s}, [x8]
220 // Scalar block: copy lane 0 of every vector before v12 is offset below, so a12 keeps the unmodified s12
221 mov a0, v0.s[0]
222 mov a1, v1.s[0]
223 mov a2, v2.s[0]
224 mov a3, v3.s[0]
225 mov a4, v4.s[0]
226 mov a5, v5.s[0]
227 mov a6, v6.s[0]
228 mov a7, v7.s[0]
229 mov a8, v8.s[0]
230 mov a9, v9.s[0]
231 mov a10, v10.s[0]
232 mov a11, v11.s[0]
233 mov a12, v12.s[0]
234 mov a13, v13.s[0]
235 mov a14, v14.s[0]
236 mov a15, v15.s[0]
237
238 // x12[lane] += 1, 2, 3, 4 so the four NEON blocks use consecutive values of word 12 following the scalar block
239 add v12.4s, v12.4s, v30.4s
240
241 .Ldoubleround4:
242 // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
243 // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
244 // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
245 // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
246 add v0.4s, v0.4s, v4.4s
247 add a0, a0, a4
248 add v1.4s, v1.4s, v5.4s
249 add a1, a1, a5
250 add v2.4s, v2.4s, v6.4s
251 add a2, a2, a6
252 add v3.4s, v3.4s, v7.4s
253 add a3, a3, a7
254
255 eor v12.16b, v12.16b, v0.16b
256 eor a12, a12, a0
257 eor v13.16b, v13.16b, v1.16b
258 eor a13, a13, a1
259 eor v14.16b, v14.16b, v2.16b
260 eor a14, a14, a2
261 eor v15.16b, v15.16b, v3.16b
262 eor a15, a15, a3
263 // NEON rotates by 16 by swapping halfwords; the scalar side uses ror #16
264 rev32 v12.8h, v12.8h
265 ror a12, a12, #16
266 rev32 v13.8h, v13.8h
267 ror a13, a13, #16
268 rev32 v14.8h, v14.8h
269 ror a14, a14, #16
270 rev32 v15.8h, v15.8h
271 ror a15, a15, #16
272
273 // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
274 // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
275 // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
276 // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
277 add v8.4s, v8.4s, v12.4s
278 add a8, a8, a12
279 add v9.4s, v9.4s, v13.4s
280 add a9, a9, a13
281 add v10.4s, v10.4s, v14.4s
282 add a10, a10, a14
283 add v11.4s, v11.4s, v15.4s
284 add a11, a11, a15
285
286 eor v16.16b, v4.16b, v8.16b
287 eor a4, a4, a8
288 eor v17.16b, v5.16b, v9.16b
289 eor a5, a5, a9
290 eor v18.16b, v6.16b, v10.16b
291 eor a6, a6, a10
292 eor v19.16b, v7.16b, v11.16b
293 eor a7, a7, a11
294 // NEON rotl by 12: shl #12 into the destination, then sri #20 inserts the wrapped bits; scalar: ror #20
295 shl v4.4s, v16.4s, #12
296 shl v5.4s, v17.4s, #12
297 shl v6.4s, v18.4s, #12
298 shl v7.4s, v19.4s, #12
299
300 sri v4.4s, v16.4s, #20
301 ror a4, a4, #20
302 sri v5.4s, v17.4s, #20
303 ror a5, a5, #20
304 sri v6.4s, v18.4s, #20
305 ror a6, a6, #20
306 sri v7.4s, v19.4s, #20
307 ror a7, a7, #20
308
309 // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
310 // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
311 // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
312 // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
313 add v0.4s, v0.4s, v4.4s
314 add a0, a0, a4
315 add v1.4s, v1.4s, v5.4s
316 add a1, a1, a5
317 add v2.4s, v2.4s, v6.4s
318 add a2, a2, a6
319 add v3.4s, v3.4s, v7.4s
320 add a3, a3, a7
321
322 eor v12.16b, v12.16b, v0.16b
323 eor a12, a12, a0
324 eor v13.16b, v13.16b, v1.16b
325 eor a13, a13, a1
326 eor v14.16b, v14.16b, v2.16b
327 eor a14, a14, a2
328 eor v15.16b, v15.16b, v3.16b
329 eor a15, a15, a3
330 // NEON rotl by 8 is a byte shuffle through the ROT8 indices in v31; scalar: ror #24
331 tbl v12.16b, {v12.16b}, v31.16b
332 ror a12, a12, #24
333 tbl v13.16b, {v13.16b}, v31.16b
334 ror a13, a13, #24
335 tbl v14.16b, {v14.16b}, v31.16b
336 ror a14, a14, #24
337 tbl v15.16b, {v15.16b}, v31.16b
338 ror a15, a15, #24
339
340 // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
341 // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
342 // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
343 // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
344 add v8.4s, v8.4s, v12.4s
345 add a8, a8, a12
346 add v9.4s, v9.4s, v13.4s
347 add a9, a9, a13
348 add v10.4s, v10.4s, v14.4s
349 add a10, a10, a14
350 add v11.4s, v11.4s, v15.4s
351 add a11, a11, a15
352
353 eor v16.16b, v4.16b, v8.16b
354 eor a4, a4, a8
355 eor v17.16b, v5.16b, v9.16b
356 eor a5, a5, a9
357 eor v18.16b, v6.16b, v10.16b
358 eor a6, a6, a10
359 eor v19.16b, v7.16b, v11.16b
360 eor a7, a7, a11
361
362 shl v4.4s, v16.4s, #7
363 shl v5.4s, v17.4s, #7
364 shl v6.4s, v18.4s, #7
365 shl v7.4s, v19.4s, #7
366
367 sri v4.4s, v16.4s, #25
368 ror a4, a4, #25
369 sri v5.4s, v17.4s, #25
370 ror a5, a5, #25
371 sri v6.4s, v18.4s, #25
372 ror a6, a6, #25
373 sri v7.4s, v19.4s, #25
374 ror a7, a7, #25
375
376 // Diagonal round: x0 += x5, x15 = rotl32(x15 ^ x0, 16)
377 // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
378 // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
379 // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
380 add v0.4s, v0.4s, v5.4s
381 add a0, a0, a5
382 add v1.4s, v1.4s, v6.4s
383 add a1, a1, a6
384 add v2.4s, v2.4s, v7.4s
385 add a2, a2, a7
386 add v3.4s, v3.4s, v4.4s
387 add a3, a3, a4
388
389 eor v15.16b, v15.16b, v0.16b
390 eor a15, a15, a0
391 eor v12.16b, v12.16b, v1.16b
392 eor a12, a12, a1
393 eor v13.16b, v13.16b, v2.16b
394 eor a13, a13, a2
395 eor v14.16b, v14.16b, v3.16b
396 eor a14, a14, a3
397
398 rev32 v15.8h, v15.8h
399 ror a15, a15, #16
400 rev32 v12.8h, v12.8h
401 ror a12, a12, #16
402 rev32 v13.8h, v13.8h
403 ror a13, a13, #16
404 rev32 v14.8h, v14.8h
405 ror a14, a14, #16
406
407 // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
408 // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
409 // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
410 // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
411 add v10.4s, v10.4s, v15.4s
412 add a10, a10, a15
413 add v11.4s, v11.4s, v12.4s
414 add a11, a11, a12
415 add v8.4s, v8.4s, v13.4s
416 add a8, a8, a13
417 add v9.4s, v9.4s, v14.4s
418 add a9, a9, a14
419
420 eor v16.16b, v5.16b, v10.16b
421 eor a5, a5, a10
422 eor v17.16b, v6.16b, v11.16b
423 eor a6, a6, a11
424 eor v18.16b, v7.16b, v8.16b
425 eor a7, a7, a8
426 eor v19.16b, v4.16b, v9.16b
427 eor a4, a4, a9
428
429 shl v5.4s, v16.4s, #12
430 shl v6.4s, v17.4s, #12
431 shl v7.4s, v18.4s, #12
432 shl v4.4s, v19.4s, #12
433
434 sri v5.4s, v16.4s, #20
435 ror a5, a5, #20
436 sri v6.4s, v17.4s, #20
437 ror a6, a6, #20
438 sri v7.4s, v18.4s, #20
439 ror a7, a7, #20
440 sri v4.4s, v19.4s, #20
441 ror a4, a4, #20
442
443 // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
444 // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
445 // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
446 // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
447 add v0.4s, v0.4s, v5.4s
448 add a0, a0, a5
449 add v1.4s, v1.4s, v6.4s
450 add a1, a1, a6
451 add v2.4s, v2.4s, v7.4s
452 add a2, a2, a7
453 add v3.4s, v3.4s, v4.4s
454 add a3, a3, a4
455
456 eor v15.16b, v15.16b, v0.16b
457 eor a15, a15, a0
458 eor v12.16b, v12.16b, v1.16b
459 eor a12, a12, a1
460 eor v13.16b, v13.16b, v2.16b
461 eor a13, a13, a2
462 eor v14.16b, v14.16b, v3.16b
463 eor a14, a14, a3
464
465 tbl v15.16b, {v15.16b}, v31.16b
466 ror a15, a15, #24
467 tbl v12.16b, {v12.16b}, v31.16b
468 ror a12, a12, #24
469 tbl v13.16b, {v13.16b}, v31.16b
470 ror a13, a13, #24
471 tbl v14.16b, {v14.16b}, v31.16b
472 ror a14, a14, #24
473
474 // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
475 // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
476 // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
477 // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
478 add v10.4s, v10.4s, v15.4s
479 add a10, a10, a15
480 add v11.4s, v11.4s, v12.4s
481 add a11, a11, a12
482 add v8.4s, v8.4s, v13.4s
483 add a8, a8, a13
484 add v9.4s, v9.4s, v14.4s
485 add a9, a9, a14
486
487 eor v16.16b, v5.16b, v10.16b
488 eor a5, a5, a10
489 eor v17.16b, v6.16b, v11.16b
490 eor a6, a6, a11
491 eor v18.16b, v7.16b, v8.16b
492 eor a7, a7, a8
493 eor v19.16b, v4.16b, v9.16b
494 eor a4, a4, a9
495
496 shl v5.4s, v16.4s, #7
497 shl v6.4s, v17.4s, #7
498 shl v7.4s, v18.4s, #7
499 shl v4.4s, v19.4s, #7
500
501 sri v5.4s, v16.4s, #25
502 ror a5, a5, #25
503 sri v6.4s, v17.4s, #25
504 ror a6, a6, #25
505 sri v7.4s, v18.4s, #25
506 ror a7, a7, #25
507 sri v4.4s, v19.4s, #25
508 ror a4, a4, #25
509
510 subs w3, w3, #2 // one column round and one diagonal round done
511 b.ne .Ldoubleround4
512 // Feed-forward: reload the original state words (replicated) and add them to the permuted state. x0 is advanced as it is read.
513 ld4r {v16.4s-v19.4s}, [x0], #16
514 ld4r {v20.4s-v23.4s}, [x0], #16
515
516 // x12 += 1, 2, 3, 4 per lane again: s12 is added back unmodified below, so the per-lane offsets must be re-applied here (v30 is still CTRINC)
517 add v12.4s, v12.4s, v30.4s
518
519 // x0[0-3] += s0
520 // x1[0-3] += s1
521 // x2[0-3] += s2
522 // x3[0-3] += s3 (w6-w9 carry the same words to the scalar block)
523 add v0.4s, v0.4s, v16.4s
524 mov w6, v16.s[0]
525 mov w7, v17.s[0]
526 add v1.4s, v1.4s, v17.4s
527 mov w8, v18.s[0]
528 mov w9, v19.s[0]
529 add v2.4s, v2.4s, v18.4s
530 add a0, a0, w6
531 add a1, a1, w7
532 add v3.4s, v3.4s, v19.4s
533 add a2, a2, w8
534 add a3, a3, w9
535 CPU_BE( rev a0, a0 )
536 CPU_BE( rev a1, a1 )
537 CPU_BE( rev a2, a2 )
538 CPU_BE( rev a3, a3 )
539 // NOTE(review): CPU_BE() is a macro from outside this file; presumably it emits the byte swap only on big-endian builds, since the scalar words are XORed with data loaded by ldp — confirm
540 ld4r {v24.4s-v27.4s}, [x0], #16
541 ld4r {v28.4s-v31.4s}, [x0]
542 // v30/v31 now hold s14/s15; the CTRINC and ROT8 tables are no longer needed
543 // x4[0-3] += s4
544 // x5[0-3] += s5
545 // x6[0-3] += s6
546 // x7[0-3] += s7
547 add v4.4s, v4.4s, v20.4s
548 mov w6, v20.s[0]
549 mov w7, v21.s[0]
550 add v5.4s, v5.4s, v21.4s
551 mov w8, v22.s[0]
552 mov w9, v23.s[0]
553 add v6.4s, v6.4s, v22.4s
554 add a4, a4, w6
555 add a5, a5, w7
556 add v7.4s, v7.4s, v23.4s
557 add a6, a6, w8
558 add a7, a7, w9
559 CPU_BE( rev a4, a4 )
560 CPU_BE( rev a5, a5 )
561 CPU_BE( rev a6, a6 )
562 CPU_BE( rev a7, a7 )
563
564 // x8[0-3] += s8
565 // x9[0-3] += s9
566 // x10[0-3] += s10
567 // x11[0-3] += s11
568 add v8.4s, v8.4s, v24.4s
569 mov w6, v24.s[0]
570 mov w7, v25.s[0]
571 add v9.4s, v9.4s, v25.4s
572 mov w8, v26.s[0]
573 mov w9, v27.s[0]
574 add v10.4s, v10.4s, v26.4s
575 add a8, a8, w6
576 add a9, a9, w7
577 add v11.4s, v11.4s, v27.4s
578 add a10, a10, w8
579 add a11, a11, w9
580 CPU_BE( rev a8, a8 )
581 CPU_BE( rev a9, a9 )
582 CPU_BE( rev a10, a10 )
583 CPU_BE( rev a11, a11 )
584
585 // x12[0-3] += s12
586 // x13[0-3] += s13
587 // x14[0-3] += s14
588 // x15[0-3] += s15
589 add v12.4s, v12.4s, v28.4s
590 mov w6, v28.s[0]
591 mov w7, v29.s[0]
592 add v13.4s, v13.4s, v29.4s
593 mov w8, v30.s[0]
594 mov w9, v31.s[0]
595 add v14.4s, v14.4s, v30.4s
596 add a12, a12, w6
597 add a13, a13, w7
598 add v15.4s, v15.4s, v31.4s
599 add a14, a14, w8
600 add a15, a15, w9
601 CPU_BE( rev a12, a12 )
602 CPU_BE( rev a13, a13 )
603 CPU_BE( rev a14, a14 )
604 CPU_BE( rev a15, a15 )
605
606 // Scalar block: XOR its 16 keystream words with the first 64 input bytes (x2 is advanced by 64 up front), interleaved with the first stage of the NEON transpose (32-bit zips)
607 ldp w6, w7, [x2], #64
608 zip1 v16.4s, v0.4s, v1.4s
609 ldp w8, w9, [x2, #-56]
610 eor a0, a0, w6
611 zip2 v17.4s, v0.4s, v1.4s
612 eor a1, a1, w7
613 zip1 v18.4s, v2.4s, v3.4s
614 eor a2, a2, w8
615 zip2 v19.4s, v2.4s, v3.4s
616 eor a3, a3, w9
617 ldp w6, w7, [x2, #-48]
618 zip1 v20.4s, v4.4s, v5.4s
619 ldp w8, w9, [x2, #-40]
620 eor a4, a4, w6
621 zip2 v21.4s, v4.4s, v5.4s
622 eor a5, a5, w7
623 zip1 v22.4s, v6.4s, v7.4s
624 eor a6, a6, w8
625 zip2 v23.4s, v6.4s, v7.4s
626 eor a7, a7, w9
627 ldp w6, w7, [x2, #-32]
628 zip1 v24.4s, v8.4s, v9.4s
629 ldp w8, w9, [x2, #-24]
630 eor a8, a8, w6
631 zip2 v25.4s, v8.4s, v9.4s
632 eor a9, a9, w7
633 zip1 v26.4s, v10.4s, v11.4s
634 eor a10, a10, w8
635 zip2 v27.4s, v10.4s, v11.4s
636 eor a11, a11, w9
637 ldp w6, w7, [x2, #-16]
638 zip1 v28.4s, v12.4s, v13.4s
639 ldp w8, w9, [x2, #-8]
640 eor a12, a12, w6
641 zip2 v29.4s, v12.4s, v13.4s
642 eor a13, a13, w7
643 zip1 v30.4s, v14.4s, v15.4s
644 eor a14, a14, w8
645 zip2 v31.4s, v14.4s, v15.4s
646 eor a15, a15, w9
647 // Input pointer clamping: x3 is the post-increment used by the 64-byte loads below. If fewer than 128 bytes were requested, x3 becomes 0 and x2 is moved to the last 64 bytes of the input so no load runs past its end.
648 mov x3, #64
649 subs x5, x4, #128 // x5 = bytes left after the scalar block and NEON block 0; negative means block 0 is partial
650 add x6, x5, x2 // x2 is already input + 64, so x6 = input + x4 - 64
651 csel x3, x3, xzr, ge
652 csel x2, x2, x6, ge
653
654 // Second transpose stage (64-bit zips): after all four groups, v0-v3 / v4-v7 / v8-v11 / v12-v15 hold the keystream of NEON blocks 0 / 1 / 2 / 3. The scalar block's output words are stored in between; x1 is advanced by 64 by the first stp.
655 zip1 v0.2d, v16.2d, v18.2d
656 zip2 v4.2d, v16.2d, v18.2d
657 stp a0, a1, [x1], #64
658 zip1 v8.2d, v17.2d, v19.2d
659 zip2 v12.2d, v17.2d, v19.2d
660 stp a2, a3, [x1, #-56]
661 ld1 {v16.16b-v19.16b}, [x2], x3 // input for NEON block 0 (or the final 64 input bytes if clamped)
662
663 subs x6, x4, #192 // x6 = bytes left after NEON block 1
664 ccmp x3, xzr, #4, lt // if x6 < 0: compare x3 with 0; otherwise force Z=1 ("eq")
665 add x7, x6, x2
666 csel x3, x3, xzr, eq // "ne" only when this block is the first one to run short: clamp now
667 csel x2, x2, x7, eq
668
669 zip1 v1.2d, v20.2d, v22.2d
670 zip2 v5.2d, v20.2d, v22.2d
671 stp a4, a5, [x1, #-48]
672 zip1 v9.2d, v21.2d, v23.2d
673 zip2 v13.2d, v21.2d, v23.2d
674 stp a6, a7, [x1, #-40]
675 ld1 {v20.16b-v23.16b}, [x2], x3 // input for NEON block 1
676
677 subs x7, x4, #256 // x7 = bytes left after NEON block 2
678 ccmp x3, xzr, #4, lt
679 add x8, x7, x2
680 csel x3, x3, xzr, eq
681 csel x2, x2, x8, eq
682
683 zip1 v2.2d, v24.2d, v26.2d
684 zip2 v6.2d, v24.2d, v26.2d
685 stp a8, a9, [x1, #-32]
686 zip1 v10.2d, v25.2d, v27.2d
687 zip2 v14.2d, v25.2d, v27.2d
688 stp a10, a11, [x1, #-24]
689 ld1 {v24.16b-v27.16b}, [x2], x3 // input for NEON block 2
690
691 subs x8, x4, #320 // x8 = bytes left after NEON block 3
692 ccmp x3, xzr, #4, lt
693 add x9, x8, x2
694 csel x2, x2, x9, eq // x3 is not updated: the final load below does not post-increment
695
696 zip1 v3.2d, v28.2d, v30.2d
697 zip2 v7.2d, v28.2d, v30.2d
698 stp a12, a13, [x1, #-16]
699 zip1 v11.2d, v29.2d, v31.2d
700 zip2 v15.2d, v29.2d, v31.2d
701 stp a14, a15, [x1, #-8]
702 ld1 {v28.16b-v31.16b}, [x2] // input for NEON block 3
703
704 // XOR each NEON block with its input and store it. Before each block the sign bit of x5..x8 is tested: negative means this block is partial (handled at 0:-3:); after each store, zero means the request is complete.
705 tbnz x5, #63, 0f
706 eor v16.16b, v16.16b, v0.16b
707 eor v17.16b, v17.16b, v1.16b
708 eor v18.16b, v18.16b, v2.16b
709 eor v19.16b, v19.16b, v3.16b
710 st1 {v16.16b-v19.16b}, [x1], #64
711 cbz x5, .Lout
712
713 tbnz x6, #63, 1f
714 eor v20.16b, v20.16b, v4.16b
715 eor v21.16b, v21.16b, v5.16b
716 eor v22.16b, v22.16b, v6.16b
717 eor v23.16b, v23.16b, v7.16b
718 st1 {v20.16b-v23.16b}, [x1], #64
719 cbz x6, .Lout
720
721 tbnz x7, #63, 2f
722 eor v24.16b, v24.16b, v8.16b
723 eor v25.16b, v25.16b, v9.16b
724 eor v26.16b, v26.16b, v10.16b
725 eor v27.16b, v27.16b, v11.16b
726 st1 {v24.16b-v27.16b}, [x1], #64
727 cbz x7, .Lout
728
729 tbnz x8, #63, 3f
730 eor v28.16b, v28.16b, v12.16b
731 eor v29.16b, v29.16b, v13.16b
732 eor v30.16b, v30.16b, v14.16b
733 eor v31.16b, v31.16b, v15.16b
734 st1 {v28.16b-v31.16b}, [x1]
735
736 .Lout: frame_pop
737 ret
738
739 // Tail, fewer than 128 bytes: NEON block 0 is partial. The final 64 output bytes are rebuilt and stored at (end of output - 64), overlapping the scalar block's output. With r = x4 % 64: v8 (+16 per step) shifts the keystream so that its first r bytes land in the last r byte positions and every other position reads as 0 (tbl yields 0 for out-of-range indices); v9 (+16 per step) pulls the last 64 - r bytes of the previous block's output into the leading positions and leaves the remaining bytes of the destination, which hold input data, untouched (tbx keeps the destination for out-of-range indices).
740 0: ld1 {v8.16b}, [x10]
741 ld1 {v9.16b}, [x11]
742 movi v10.16b, #16
743 sub x2, x1, #64 // x2 = start of the scalar block's output, just written above
744 add x1, x1, x5 // x5 is negative: x1 = end of output - 64
745 ld1 {v16.16b-v19.16b}, [x2] // read that output back as the "previous block"
746 tbl v4.16b, {v0.16b-v3.16b}, v8.16b
747 tbx v20.16b, {v16.16b-v19.16b}, v9.16b
748 add v8.16b, v8.16b, v10.16b
749 add v9.16b, v9.16b, v10.16b
750 tbl v5.16b, {v0.16b-v3.16b}, v8.16b
751 tbx v21.16b, {v16.16b-v19.16b}, v9.16b
752 add v8.16b, v8.16b, v10.16b
753 add v9.16b, v9.16b, v10.16b
754 tbl v6.16b, {v0.16b-v3.16b}, v8.16b
755 tbx v22.16b, {v16.16b-v19.16b}, v9.16b
756 add v8.16b, v8.16b, v10.16b
757 add v9.16b, v9.16b, v10.16b
758 tbl v7.16b, {v0.16b-v3.16b}, v8.16b
759 tbx v23.16b, {v16.16b-v19.16b}, v9.16b
760 // The shifted keystream is 0 wherever previous-output bytes were merged in, so those bytes pass through the XOR unchanged
761 eor v20.16b, v20.16b, v4.16b
762 eor v21.16b, v21.16b, v5.16b
763 eor v22.16b, v22.16b, v6.16b
764 eor v23.16b, v23.16b, v7.16b
765 st1 {v20.16b-v23.16b}, [x1]
766 b .Lout
767
768 // Tail, fewer than 192 bytes: NEON block 1 is partial. Same scheme; keystream is v4-v7, previous output is NEON block 0 (still in v16-v19), data is v20-v23.
769 1: ld1 {v8.16b}, [x10]
770 ld1 {v9.16b}, [x11]
771 movi v10.16b, #16
772 add x1, x1, x6
773 tbl v0.16b, {v4.16b-v7.16b}, v8.16b
774 tbx v20.16b, {v16.16b-v19.16b}, v9.16b
775 add v8.16b, v8.16b, v10.16b
776 add v9.16b, v9.16b, v10.16b
777 tbl v1.16b, {v4.16b-v7.16b}, v8.16b
778 tbx v21.16b, {v16.16b-v19.16b}, v9.16b
779 add v8.16b, v8.16b, v10.16b
780 add v9.16b, v9.16b, v10.16b
781 tbl v2.16b, {v4.16b-v7.16b}, v8.16b
782 tbx v22.16b, {v16.16b-v19.16b}, v9.16b
783 add v8.16b, v8.16b, v10.16b
784 add v9.16b, v9.16b, v10.16b
785 tbl v3.16b, {v4.16b-v7.16b}, v8.16b
786 tbx v23.16b, {v16.16b-v19.16b}, v9.16b
787
788 eor v20.16b, v20.16b, v0.16b
789 eor v21.16b, v21.16b, v1.16b
790 eor v22.16b, v22.16b, v2.16b
791 eor v23.16b, v23.16b, v3.16b
792 st1 {v20.16b-v23.16b}, [x1]
793 b .Lout
794
795 // Tail, fewer than 256 bytes: NEON block 2 is partial. Keystream is v8-v11, previous output is v20-v23, data is v24-v27; v4-v6 are free to hold the index vectors because block 1 is already stored.
796 2: ld1 {v4.16b}, [x10]
797 ld1 {v5.16b}, [x11]
798 movi v6.16b, #16
799 add x1, x1, x7
800 tbl v0.16b, {v8.16b-v11.16b}, v4.16b
801 tbx v24.16b, {v20.16b-v23.16b}, v5.16b
802 add v4.16b, v4.16b, v6.16b
803 add v5.16b, v5.16b, v6.16b
804 tbl v1.16b, {v8.16b-v11.16b}, v4.16b
805 tbx v25.16b, {v20.16b-v23.16b}, v5.16b
806 add v4.16b, v4.16b, v6.16b
807 add v5.16b, v5.16b, v6.16b
808 tbl v2.16b, {v8.16b-v11.16b}, v4.16b
809 tbx v26.16b, {v20.16b-v23.16b}, v5.16b
810 add v4.16b, v4.16b, v6.16b
811 add v5.16b, v5.16b, v6.16b
812 tbl v3.16b, {v8.16b-v11.16b}, v4.16b
813 tbx v27.16b, {v20.16b-v23.16b}, v5.16b
814
815 eor v24.16b, v24.16b, v0.16b
816 eor v25.16b, v25.16b, v1.16b
817 eor v26.16b, v26.16b, v2.16b
818 eor v27.16b, v27.16b, v3.16b
819 st1 {v24.16b-v27.16b}, [x1]
820 b .Lout
821
822 // Tail, fewer than 320 bytes: NEON block 3 is partial. Keystream is v12-v15, previous output is v24-v27, data is v28-v31.
823 3: ld1 {v4.16b}, [x10]
824 ld1 {v5.16b}, [x11]
825 movi v6.16b, #16
826 add x1, x1, x8
827 tbl v0.16b, {v12.16b-v15.16b}, v4.16b
828 tbx v28.16b, {v24.16b-v27.16b}, v5.16b
829 add v4.16b, v4.16b, v6.16b
830 add v5.16b, v5.16b, v6.16b
831 tbl v1.16b, {v12.16b-v15.16b}, v4.16b
832 tbx v29.16b, {v24.16b-v27.16b}, v5.16b
833 add v4.16b, v4.16b, v6.16b
834 add v5.16b, v5.16b, v6.16b
835 tbl v2.16b, {v12.16b-v15.16b}, v4.16b
836 tbx v30.16b, {v24.16b-v27.16b}, v5.16b
837 add v4.16b, v4.16b, v6.16b
838 add v5.16b, v5.16b, v6.16b
839 tbl v3.16b, {v12.16b-v15.16b}, v4.16b
840 tbx v31.16b, {v24.16b-v27.16b}, v5.16b
841
842 eor v28.16b, v28.16b, v0.16b
843 eor v29.16b, v29.16b, v1.16b
844 eor v30.16b, v30.16b, v2.16b
845 eor v31.16b, v31.16b, v3.16b
846 st1 {v28.16b-v31.16b}, [x1]
847 b .Lout
848 ENDPROC(chacha_4block_xor_neon)
849
850 .section ".rodata", "a", %progbits
851 .align L1_CACHE_SHIFT
852 .Lpermute: // 192 bytes holding the values -64 .. 127, used as tbl/tbx index vectors by the tail code of chacha_4block_xor_neon
853 .set .Li, 0
854 .rept 192
855 .byte (.Li - 64)
856 .set .Li, .Li + 1
857 .endr
858 // CTRINC must stay immediately before ROT8: chacha_4block_xor_neon loads both with a single 32-byte ld1 from CTRINC
859 CTRINC: .word 1, 2, 3, 4 // per-lane increments added to state word 12
860 ROT8: .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f // tbl indices: byte i of each word takes source byte (i + 3) % 4, i.e. rotl32 by 8