1
2
3
4
5
6
7
8 #include <linux/linkage.h>
9 #include <asm/assembler.h>
10
11 .text
12 .arch armv8-a
13 .fpu crypto-neon-fp-armv8
14 .align 3
15
@ enc_round: one full AES encryption round on register "state" using the
@ round key in register "key". aese.8 does AddRoundKey + SubBytes +
@ ShiftRows, aesmc.8 does MixColumns. The pair is kept back to back and
@ works in place on the same register.
16 .macro enc_round, state, key
17 aese.8 \state, \key
18 aesmc.8 \state, \state
19 .endm
20
@ dec_round: one full AES decryption round on register "state" using the
@ round key in register "key". aesd.8 does AddRoundKey + InvSubBytes +
@ InvShiftRows, aesimc.8 does InvMixColumns, both in place.
21 .macro dec_round, state, key
22 aesd.8 \state, \key
23 aesimc.8 \state, \state
24 .endm
25
@ enc_dround: two consecutive encryption rounds on the single block held in
@ q0, first with key1 and then with key2.
26 .macro enc_dround, key1, key2
27 enc_round q0, \key1
28 enc_round q0, \key2
29 .endm
30
@ dec_dround: two consecutive decryption rounds on the single block held in
@ q0, first with key1 and then with key2.
31 .macro dec_dround, key1, key2
32 dec_round q0, \key1
33 dec_round q0, \key2
34 .endm
35
@ enc_fround: closing sequence of an encryption on the block in q0.
@ key1 drives one more full round, key2 drives the last round (which has no
@ MixColumns step, hence the bare aese.8), and key3 is xored in as the final
@ AddRoundKey.
36 .macro enc_fround, key1, key2, key3
37 enc_round q0, \key1
38 aese.8 q0, \key2
39 veor q0, q0, \key3
40 .endm
41
@ dec_fround: closing sequence of a decryption on the block in q0.
@ key1 drives one more full round, key2 drives the last round (no
@ InvMixColumns, hence the bare aesd.8), and key3 is xored in as the final
@ AddRoundKey.
42 .macro dec_fround, key1, key2, key3
43 dec_round q0, \key1
44 aesd.8 q0, \key2
45 veor q0, q0, \key3
46 .endm
47
@ enc_dround_4x: two encryption rounds applied to the four blocks in q0-q3.
@ All four blocks go through the key1 round before any of them starts the
@ key2 round, so the four streams are interleaved.
48 .macro enc_dround_4x, key1, key2
49 enc_round q0, \key1
50 enc_round q1, \key1
51 enc_round q2, \key1
52 enc_round q3, \key1
53 enc_round q0, \key2
54 enc_round q1, \key2
55 enc_round q2, \key2
56 enc_round q3, \key2
57 .endm
58
@ dec_dround_4x: two decryption rounds applied to the four blocks in q0-q3.
@ All four blocks go through the key1 round before any of them starts the
@ key2 round, so the four streams are interleaved.
59 .macro dec_dround_4x, key1, key2
60 dec_round q0, \key1
61 dec_round q1, \key1
62 dec_round q2, \key1
63 dec_round q3, \key1
64 dec_round q0, \key2
65 dec_round q1, \key2
66 dec_round q2, \key2
67 dec_round q3, \key2
68 .endm
69
@ enc_fround_4x: closing sequence of an encryption for the four blocks in
@ q0-q3: one full round with key1, the MixColumns-free last round with key2,
@ then the final AddRoundKey xor with key3. Each step is applied to all four
@ blocks before the next step begins.
70 .macro enc_fround_4x, key1, key2, key3
71 enc_round q0, \key1
72 enc_round q1, \key1
73 enc_round q2, \key1
74 enc_round q3, \key1
75 aese.8 q0, \key2
76 aese.8 q1, \key2
77 aese.8 q2, \key2
78 aese.8 q3, \key2
79 veor q0, q0, \key3
80 veor q1, q1, \key3
81 veor q2, q2, \key3
82 veor q3, q3, \key3
83 .endm
84
@ dec_fround_4x: closing sequence of a decryption for the four blocks in
@ q0-q3: one full round with key1, the InvMixColumns-free last round with
@ key2, then the final AddRoundKey xor with key3. Each step is applied to
@ all four blocks before the next step begins.
85 .macro dec_fround_4x, key1, key2, key3
86 dec_round q0, \key1
87 dec_round q1, \key1
88 dec_round q2, \key1
89 dec_round q3, \key1
90 aesd.8 q0, \key2
91 aesd.8 q1, \key2
92 aesd.8 q2, \key2
93 aesd.8 q3, \key2
94 veor q0, q0, \key3
95 veor q1, q1, \key3
96 veor q2, q2, \key3
97 veor q3, q3, \key3
98 .endm
99
@ do_block: body of a complete AES transform, parameterised by the
@ double-round macro (dround) and the final-round macro (fround) to expand.
@ Expected on entry:
@   r3     = number of rounds (compared against 12 to pick the key size)
@   ip     = address of the 3rd round key
@   q8, q9 = first two round keys
@   q14    = last round key
@   lr     = return address; every path ends in bx lr
@ Round keys are streamed from [ip] alternately into q10-q11 and q12-q13 so
@ that each load is issued before the rounds that consume the previous pair.
@ The flags set by the initial cmp stay live until the blo/beq below; none
@ of the loads or round macros in between writes the flags.
@ Clobbers q10-q13, ip and the condition flags.
100 .macro do_block, dround, fround
101 cmp r3, #12 @ which key size?
102 vld1.32 {q10-q11}, [ip]!
103 \dround q8, q9
104 vld1.32 {q12-q13}, [ip]!
105 \dround q10, q11
106 vld1.32 {q10-q11}, [ip]!
107 \dround q12, q13
108 vld1.32 {q12-q13}, [ip]!
109 \dround q10, q11
110 blo 0f @ AES-128: 10 rounds
111 vld1.32 {q10-q11}, [ip]!
112 \dround q12, q13
113 beq 1f @ AES-192: 12 rounds
114 vld1.32 {q12-q13}, [ip] @ more than 12 rounds: one more key pair
115 \dround q10, q11
116 0: \fround q12, q13, q14
117 bx lr
118
119 1: \fround q10, q11, q14
120 bx lr
121 .endm
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
@ aes_encrypt: file-internal helper that encrypts the single block in q0.
@ It does not follow the standard calling convention; it relies on:
@   r2     = address of the round key array
@   r3     = number of rounds
@   q8, q9 = first two round keys, q14 = last round key (see prepare_key)
@ Result is left in q0. The do_block expansion clobbers q10-q13, ip and the
@ flags, and returns with bx lr.
@ .Laes_encrypt_tweak is a second entry point that skips the ip setup; it is
@ branched to by ce_aes_xts_init, which points ip at a different key first.
137 .align 6
138 aes_encrypt:
139 add ip, r2, #32 @ 3rd round key
140 .Laes_encrypt_tweak:
141 do_block enc_dround, enc_fround
142 ENDPROC(aes_encrypt)
143
@ aes_decrypt: file-internal helper that decrypts the single block in q0.
@ It does not follow the standard calling convention; it relies on:
@   r2     = address of the round key array
@   r3     = number of rounds
@   q8, q9 = first two round keys, q14 = last round key (see prepare_key)
@ Result is left in q0. The do_block expansion clobbers q10-q13, ip and the
@ flags, and returns with bx lr.
144 .align 6
145 aes_decrypt:
146 add ip, r2, #32 @ 3rd round key
147 do_block dec_dround, dec_fround
148 ENDPROC(aes_decrypt)
149
@ aes_encrypt_4x: file-internal helper that encrypts the four blocks in
@ q0-q3 in parallel. Register contract is the same as for the single-block
@ helper: r2 = round key array, r3 = number of rounds, q8/q9 = first two
@ round keys, q14 = last round key. Results are left in q0-q3. Clobbers
@ q10-q13, ip and the flags; returns with bx lr.
150 .align 6
151 aes_encrypt_4x:
152 add ip, r2, #32 @ 3rd round key
153 do_block enc_dround_4x, enc_fround_4x
154 ENDPROC(aes_encrypt_4x)
155
@ aes_decrypt_4x: file-internal helper that decrypts the four blocks in
@ q0-q3 in parallel. Register contract is the same as for the single-block
@ helper: r2 = round key array, r3 = number of rounds, q8/q9 = first two
@ round keys, q14 = last round key. Results are left in q0-q3. Clobbers
@ q10-q13, ip and the flags; returns with bx lr.
156 .align 6
157 aes_decrypt_4x:
158 add ip, r2, #32 @ 3rd round key
159 do_block dec_dround_4x, dec_fround_4x
160 ENDPROC(aes_decrypt_4x)
161
@ prepare_key: preload the round keys that the core transforms expect to
@ find in registers.
@   rk     = register holding the address of the round key array
@   rounds = register holding the number of rounds
@ Leaves q8/q9 = round keys 0 and 1, q14 = round key at rk + 16 * rounds
@ (the last one). Clobbers ip.
162 .macro prepare_key, rk, rounds
163 add ip, \rk, \rounds, lsl #4
164 vld1.32 {q8-q9}, [\rk] @ load first 2 round keys
165 vld1.32 {q14}, [ip] @ load last round key
166 .endm
167
168
169
170
171
172
173
@ ce_aes_ecb_encrypt: ECB-encrypt a run of 16-byte blocks.
@   r0   = output pointer (advanced by every store)
@   r1   = input pointer (advanced by every load)
@   r2   = round key array
@   r3   = number of rounds
@   first stack argument = number of blocks (read at sp + 8 after the push)
@ Blocks are handled four at a time while at least four remain, then one at
@ a time. A block count of zero falls through to the exit without touching
@ memory.
174 ENTRY(ce_aes_ecb_encrypt)
175 push {r4, lr}
176 ldr r4, [sp, #8] @ r4 = number of blocks
177 prepare_key r2, r3
178 .Lecbencloop4x:
179 subs r4, r4, #4
180 bmi .Lecbenc1x @ fewer than 4 blocks left
181 vld1.8 {q0-q1}, [r1]!
182 vld1.8 {q2-q3}, [r1]!
183 bl aes_encrypt_4x
184 vst1.8 {q0-q1}, [r0]!
185 vst1.8 {q2-q3}, [r0]!
186 b .Lecbencloop4x
187 .Lecbenc1x:
188 adds r4, r4, #4 @ undo the bias; r4 = 0..3 leftover blocks
189 beq .Lecbencout
190 .Lecbencloop:
191 vld1.8 {q0}, [r1]!
192 bl aes_encrypt
193 vst1.8 {q0}, [r0]!
194 subs r4, r4, #1
195 bne .Lecbencloop
196 .Lecbencout:
197 pop {r4, pc}
198 ENDPROC(ce_aes_ecb_encrypt)
199
@ ce_aes_ecb_decrypt: ECB-decrypt a run of 16-byte blocks.
@   r0   = output pointer (advanced by every store)
@   r1   = input pointer (advanced by every load)
@   r2   = round key array
@   r3   = number of rounds
@   first stack argument = number of blocks (read at sp + 8 after the push)
@ Blocks are handled four at a time while at least four remain, then one at
@ a time. A block count of zero falls through to the exit without touching
@ memory.
200 ENTRY(ce_aes_ecb_decrypt)
201 push {r4, lr}
202 ldr r4, [sp, #8] @ r4 = number of blocks
203 prepare_key r2, r3
204 .Lecbdecloop4x:
205 subs r4, r4, #4
206 bmi .Lecbdec1x @ fewer than 4 blocks left
207 vld1.8 {q0-q1}, [r1]!
208 vld1.8 {q2-q3}, [r1]!
209 bl aes_decrypt_4x
210 vst1.8 {q0-q1}, [r0]!
211 vst1.8 {q2-q3}, [r0]!
212 b .Lecbdecloop4x
213 .Lecbdec1x:
214 adds r4, r4, #4 @ undo the bias; r4 = 0..3 leftover blocks
215 beq .Lecbdecout
216 .Lecbdecloop:
217 vld1.8 {q0}, [r1]!
218 bl aes_decrypt
219 vst1.8 {q0}, [r0]!
220 subs r4, r4, #1
221 bne .Lecbdecloop
222 .Lecbdecout:
223 pop {r4, pc}
224 ENDPROC(ce_aes_ecb_decrypt)
225
226
227
228
229
230
231
@ ce_aes_cbc_encrypt: CBC-encrypt a run of 16-byte blocks, one at a time
@ (each block depends on the previous ciphertext, so there is no 4x path).
@   r0 = output pointer, r1 = input pointer (both advanced per block)
@   r2 = round key array, r3 = number of rounds
@   stack arguments (at sp + 16 after the push): number of blocks, then the
@   address of the 16-byte iv
@ q0 carries the chaining value: the iv at first, then each ciphertext
@ block. The last ciphertext block is written back to the iv buffer.
@ The loop tests the count only after a block has been processed, so the
@ caller must pass at least one block.
232 ENTRY(ce_aes_cbc_encrypt)
233 push {r4-r6, lr}
234 ldrd r4, r5, [sp, #16] @ r4 = blocks, r5 = iv pointer
235 vld1.8 {q0}, [r5]
236 prepare_key r2, r3
237 .Lcbcencloop:
238 vld1.8 {q1}, [r1]! @ get next pt block
239 veor q0, q0, q1 @ ..and xor with iv
240 bl aes_encrypt
241 vst1.8 {q0}, [r0]!
242 subs r4, r4, #1
243 bne .Lcbcencloop
244 vst1.8 {q0}, [r5] @ last ct block becomes the next iv
245 pop {r4-r6, pc}
246 ENDPROC(ce_aes_cbc_encrypt)
247
@ ce_aes_cbc_decrypt: CBC-decrypt a run of 16-byte blocks.
@   r0 = output pointer, r1 = input pointer (both advanced per block)
@   r2 = round key array, r3 = number of rounds
@   stack arguments (at sp + 16 after the push): number of blocks, then the
@   address of the 16-byte iv
@ q15 always holds the previous ciphertext block (initially the iv) and is
@ written back to the iv buffer on exit.
@ 4x path: the four ciphertext blocks are copied to q4-q7 before the
@ transform, because block n of plaintext needs ciphertext n-1 afterwards.
@ 1x path: the previous ciphertext is folded into the last round key (q14),
@ so the final xor inside the decrypt helper applies the last AddRoundKey
@ and the CBC xor at once. The unmodified last round key is parked in q6.
248 ENTRY(ce_aes_cbc_decrypt)
249 push {r4-r6, lr}
250 ldrd r4, r5, [sp, #16] @ r4 = blocks, r5 = iv pointer
251 vld1.8 {q15}, [r5] @ keep iv in q15
252 prepare_key r2, r3
253 .Lcbcdecloop4x:
254 subs r4, r4, #4
255 bmi .Lcbcdec1x @ fewer than 4 blocks left
256 vld1.8 {q0-q1}, [r1]!
257 vld1.8 {q2-q3}, [r1]!
258 vmov q4, q0 @ keep the ct blocks for chaining
259 vmov q5, q1
260 vmov q6, q2
261 vmov q7, q3
262 bl aes_decrypt_4x
263 veor q0, q0, q15 @ xor each block with the preceding ct
264 veor q1, q1, q4
265 veor q2, q2, q5
266 veor q3, q3, q6
267 vmov q15, q7 @ last ct block chains into the next round
268 vst1.8 {q0-q1}, [r0]!
269 vst1.8 {q2-q3}, [r0]!
270 b .Lcbcdecloop4x
271 .Lcbcdec1x:
272 adds r4, r4, #4 @ undo the bias; r4 = 0..3 leftover blocks
273 beq .Lcbcdecout
274 vmov q6, q14 @ preserve last round key
275 .Lcbcdecloop:
276 vld1.8 {q0}, [r1]! @ get next ct block
277 veor q14, q15, q6 @ combine prev ct with last key
278 vmov q15, q0
279 bl aes_decrypt
280 vst1.8 {q0}, [r0]!
281 subs r4, r4, #1
282 bne .Lcbcdecloop
283 .Lcbcdecout:
284 vst1.8 {q15}, [r5] @ return last ct block as the next iv
285 pop {r4-r6, pc}
286 ENDPROC(ce_aes_cbc_decrypt)
287
288
289
290
291
292
293
294
295
@ ce_aes_cbc_cts_encrypt: encrypt the final two blocks of a CBC message with
@ ciphertext stealing, where the second block may be partial.
@   r0 = output pointer, r1 = input pointer
@   r2 = round key array, r3 = number of rounds
@   stack arguments (at sp + 16 after the push): total byte count, then the
@   address of the 16-byte iv
@ After subtracting 16, r4 is the length of the final block. It indexes the
@ 48-byte permute table, so the lookups stay inside the table only for
@ lengths 0..16 (byte counts 16..32).
@ Both input and output are accessed as two overlapping 16-byte vectors:
@ one at offset 0 and one ending at the last byte of the buffer.
@ The iv buffer is only read here, never written back.
296 ENTRY(ce_aes_cbc_cts_encrypt)
297 push {r4-r6, lr}
298 ldrd r4, r5, [sp, #16] @ r4 = byte count, r5 = iv pointer
299
300 movw ip, :lower16:.Lcts_permute_table
301 movt ip, :upper16:.Lcts_permute_table
302 sub r4, r4, #16 @ r4 = length of the final block
303 add lr, ip, #32
304 add ip, ip, r4
305 sub lr, lr, r4
306 vld1.8 {q5}, [ip] @ q5: moves the leading r4 bytes to the tail
307 vld1.8 {q6}, [lr] @ q6: moves the trailing r4 bytes to the head
308
309 add ip, r1, r4
310 vld1.8 {q0}, [r1] @ overlapping loads
311 vld1.8 {q3}, [ip]
312
313 vld1.8 {q1}, [r5] @ get iv
314 prepare_key r2, r3
315
316 veor q0, q0, q1 @ xor with iv
317 bl aes_encrypt
318
319 vtbl.8 d4, {d0-d1}, d10 @ q2 = truncated ct, shifted to the tail
320 vtbl.8 d5, {d0-d1}, d11
321 vtbl.8 d2, {d6-d7}, d12 @ q1 = final pt bytes, zero padded
322 vtbl.8 d3, {d6-d7}, d13
323
324 veor q0, q0, q1
325 bl aes_encrypt
326
327 add r4, r0, r4
328 vst1.8 {q2}, [r4] @ overlapping stores
329 vst1.8 {q0}, [r0]
330
331 pop {r4-r6, pc}
332 ENDPROC(ce_aes_cbc_cts_encrypt)
333
@ ce_aes_cbc_cts_decrypt: decrypt the final two blocks of a CBC message that
@ was encrypted with ciphertext stealing.
@   r0 = output pointer, r1 = input pointer
@   r2 = round key array, r3 = number of rounds
@   stack arguments (at sp + 16 after the push): total byte count, then the
@   address of the 16-byte iv
@ After subtracting 16, r4 is the length of the final block. It indexes the
@ 48-byte permute table, so the lookups stay inside the table only for
@ lengths 0..16 (byte counts 16..32).
@ Both input and output are accessed as two overlapping 16-byte vectors:
@ one at offset 0 and one ending at the last byte of the buffer.
@ The iv buffer is only read here, never written back.
334 ENTRY(ce_aes_cbc_cts_decrypt)
335 push {r4-r6, lr}
336 ldrd r4, r5, [sp, #16] @ r4 = byte count, r5 = iv pointer
337
338 movw ip, :lower16:.Lcts_permute_table
339 movt ip, :upper16:.Lcts_permute_table
340 sub r4, r4, #16 @ r4 = length of the final block
341 add lr, ip, #32
342 add ip, ip, r4
343 sub lr, lr, r4
344 vld1.8 {q5}, [ip] @ q5: moves the leading r4 bytes to the tail
345 vld1.8 {q6}, [lr] @ q6: moves the trailing r4 bytes to the head
346
347 add ip, r1, r4
348 vld1.8 {q0}, [r1] @ overlapping loads
349 vld1.8 {q1}, [ip]
350
351 vld1.8 {q3}, [r5] @ get iv
352 prepare_key r2, r3
353
354 bl aes_decrypt
355
356 vtbl.8 d4, {d0-d1}, d10 @ q2 = leading bytes of q0, shifted to the tail
357 vtbl.8 d5, {d0-d1}, d11
358 vtbx.8 d0, {d2-d3}, d12 @ overwrite head of q0 with the final ct bytes;
359 vtbx.8 d1, {d2-d3}, d13 @ vtbx keeps the other (stolen) bytes as they are
360
361 veor q1, q1, q2 @ final pt bytes now sit at the tail of q1
362 bl aes_decrypt
363 veor q0, q0, q3 @ xor with iv
364
365 add r4, r0, r4
366 vst1.8 {q1}, [r4] @ overlapping stores
367 vst1.8 {q0}, [r0]
368
369 pop {r4-r6, pc}
370 ENDPROC(ce_aes_cbc_cts_decrypt)
371
372
373
374
375
376
@ ce_aes_ctr_encrypt: AES in counter mode with a 128-bit big-endian counter.
@   r0 = output pointer, r1 = input pointer (both advanced per block)
@   r2 = round key array, r3 = number of rounds
@   stack arguments (at sp + 16 after the push): number of blocks, then the
@   address of the 16-byte counter block
@ q7 holds the counter block; r6 mirrors its least significant 32-bit word
@ in CPU byte order so that it can be incremented with plain integer adds.
@ If adding the block count to r6 carries out of 32 bits, the whole request
@ is handled by the one-block loop, which propagates the carry into the
@ upper counter words. Otherwise the 4x loop is used and only the low word
@ ever changes.
@ In the one-block loop, a block count that drops below zero selects the
@ tail-block path: the raw key stream block is stored to [r0] instead of
@ being xored with input. That store carries a :64 alignment qualifier, so
@ r0 must be 8-byte aligned on that path.
@ The updated counter is written back to the counter buffer on exit.
377 ENTRY(ce_aes_ctr_encrypt)
378 push {r4-r6, lr}
379 ldrd r4, r5, [sp, #16] @ r4 = blocks, r5 = counter pointer
380 vld1.8 {q7}, [r5] @ load ctr
381 prepare_key r2, r3
382 vmov r6, s31 @ keep swabbed ctr in r6
383 rev r6, r6
384 cmn r6, r4 @ 32 bit overflow?
385 bcs .Lctrloop
386 .Lctrloop4x:
387 subs r4, r4, #4
388 bmi .Lctr1x
@
@ NOTE: Cortex-A57 (erratum 1742098) and Cortex-A72 (erratum 1655431) may
@ produce an incorrect result for an AESE/AESMC instruction pair whose
@ input register was last modified by a write to a single 32-bit lane.
@ The lanes of q0-q3 are therefore never written individually. Each
@ counter value is produced by updating lane 3 of q7 and then copying the
@ full 128-bit register. lr is free as a scratch register here: it was
@ saved on entry and is overwritten by the bl below in any case.
@
389 add ip, r6, #1
390 vmov q0, q7 @ q0 = ctr
391 rev ip, ip
392 add lr, r6, #2
393 vmov s31, ip @ set lane 3 of q1 via q7
394 add ip, r6, #3
395 rev lr, lr
396 vmov q1, q7 @ q1 = ctr + 1
397 vmov s31, lr @ set lane 3 of q2 via q7
398 rev ip, ip
399 vmov q2, q7 @ q2 = ctr + 2
400 vmov s31, ip @ set lane 3 of q3 via q7
401 add r6, r6, #4
402 vmov q3, q7 @ q3 = ctr + 3
403 vld1.8 {q4-q5}, [r1]!
404 vld1.8 {q6}, [r1]!
405 vld1.8 {q15}, [r1]!
406 bl aes_encrypt_4x
407 veor q0, q0, q4
408 veor q1, q1, q5
409 veor q2, q2, q6
410 veor q3, q3, q15
411 rev ip, r6
412 vst1.8 {q0-q1}, [r0]!
413 vst1.8 {q2-q3}, [r0]!
414 vmov s31, ip @ q7 = ctr + 4 for the next iteration
415 b .Lctrloop4x
416 .Lctr1x:
417 adds r4, r4, #4 @ undo the bias; r4 = 0..3 leftover blocks
418 beq .Lctrout
419 .Lctrloop:
420 vmov q0, q7
421 bl aes_encrypt
422
423 adds r6, r6, #1 @ increment BE ctr
424 rev ip, r6
425 vmov s31, ip
426 bcs .Lctrcarry @ flags still come from the adds above
427
428 .Lctrcarrydone:
429 subs r4, r4, #1
430 bmi .Lctrtailblock @ blocks < 0 means tail block
431 vld1.8 {q3}, [r1]!
432 veor q3, q0, q3
433 vst1.8 {q3}, [r0]!
434 bne .Lctrloop @ flags still come from the subs above
435
436 .Lctrout:
437 vst1.8 {q7}, [r5] @ return next CTR value
438 pop {r4-r6, pc}
439
440 .Lctrtailblock:
441 vst1.8 {q0}, [r0, :64] @ return the key stream
442 b .Lctrout
443
444 .Lctrcarry:
445 .irp sreg, s30, s29, s28
446 vmov ip, \sreg @ load next word of ctr
447 rev ip, ip @ ... to handle the carry
448 adds ip, ip, #1
449 rev ip, ip
450 vmov \sreg, ip
451 bcc .Lctrcarrydone
452 .endr
453 b .Lctrcarrydone
454 ENDPROC(ce_aes_ctr_encrypt)
455
456
457
458
459
460
461
462
@ next_tweak: compute the next XTS tweak, i.e. multiply the 128-bit value
@ in register "in" by x in GF(2^128) and place the result in "out".
@   const = mask vector with 1 in the low 64-bit lane and 0x87 in the high
@           lane (as composed by ce_aes_xts_init)
@   tmp   = scratch vector register, clobbered
@ Each 64-bit lane is doubled on its own. The top bit of each lane is turned
@ into an all-ones mask, masked with const, and the two halves are swapped,
@ so that the low lane receives 0x87 when bit 127 was set and the high lane
@ receives the carry when bit 63 was set.
463 .macro next_tweak, out, in, const, tmp
464 vshr.s64 \tmp, \in, #63
465 vand \tmp, \tmp, \const
466 vadd.u64 \out, \in, \in
467 vext.8 \tmp, \tmp, \tmp, #8
468 veor \out, \out, \tmp
469 .endm
470
@ ce_aes_xts_init: prologue shared by the XTS encrypt and decrypt routines.
@ It is entered with bl right after the caller has pushed four registers,
@ so the stack arguments of the caller are found from sp + 16 onwards.
@ On return:
@   q15 = tweak mask vector (1 in the low 64-bit lane, 0x87 in the high)
@   r4  = stack argument at sp + 16 (used by the callers as a byte count)
@   r5  = stack argument at sp + 20 (address of the 16-byte iv / tweak)
@   q0  = iv, encrypted with key 2 when the flag at sp + 28 equals 1
@   r6  = 0 when that flag was 0; the non-zero key 2 address when it was 1
@ The encryption path reloads q8, q9 and q14 with key 2 material, so the
@ callers run prepare_key again for key 1 afterwards. It also clobbers
@ q10-q13, ip and the flags through the shared encryption body.
471 ce_aes_xts_init:
472 vmov.i32 d30, #0x87 @ compose tweak mask vector
473 vmovl.u32 q15, d30
474 vshr.u64 d30, d31, #7
475
476 ldrd r4, r5, [sp, #16] @ load args
477 ldr r6, [sp, #28]
478 vld1.8 {q0}, [r5] @ load iv
479 teq r6, #1 @ start of a block?
480 bxne lr
481
482 @ Encrypt the IV in q0 with the second AES key. This should only
483 @ be done at the start of a block.
484 ldr r6, [sp, #24] @ load AES key 2
485 prepare_key r6, r3
486 add ip, r6, #32 @ 3rd round key of key 2
487 b .Laes_encrypt_tweak @ tail call
488 ENDPROC(ce_aes_xts_init)
489
@ ce_aes_xts_encrypt: XTS encryption with ciphertext stealing for a final
@ partial block.
@   r0 = output pointer, r1 = input pointer
@   r2 = round key array of key 1, r3 = number of rounds
@   stack arguments, in order: byte count, address of the 16-byte iv/tweak,
@   round key array of key 2, "first" flag (all fetched by the shared
@   prologue)
@ Register roles after the prologue: r4 = bytes remaining, r5 = iv pointer,
@ q4 = current tweak, q15 = tweak mask. When r6 is zero, the tweak is
@ advanced once before the first block is processed.
@ The tweak in q4 is stored back to the iv buffer on exit.
@ NOTE(review): with a byte count of zero the path through .Lxtsenc1x still
@ stores q0 to [r0]; confirm that callers never pass zero.
490 ENTRY(ce_aes_xts_encrypt)
491 push {r4-r6, lr}
492
493 bl ce_aes_xts_init @ run shared prologue
494 prepare_key r2, r3
495 vmov q4, q0
496
497 teq r6, #0 @ start of a block?
498 bne .Lxtsenc4x
499
500 .Lxtsencloop4x:
501 next_tweak q4, q4, q15, q10
502 .Lxtsenc4x:
503 subs r4, r4, #64
504 bmi .Lxtsenc1x @ fewer than 64 bytes left
505 vld1.8 {q0-q1}, [r1]! @ get 4 pt blocks
506 vld1.8 {q2-q3}, [r1]!
507 next_tweak q5, q4, q15, q10
508 veor q0, q0, q4
509 next_tweak q6, q5, q15, q10
510 veor q1, q1, q5
511 next_tweak q7, q6, q15, q10
512 veor q2, q2, q6
513 veor q3, q3, q7
514 bl aes_encrypt_4x
515 veor q0, q0, q4
516 veor q1, q1, q5
517 veor q2, q2, q6
518 veor q3, q3, q7
519 vst1.8 {q0-q1}, [r0]! @ write 4 ct blocks
520 vst1.8 {q2-q3}, [r0]!
521 vmov q4, q7 @ carry the last tweak forward
522 teq r4, #0
523 beq .Lxtsencret @ input fully consumed
524 b .Lxtsencloop4x
525 .Lxtsenc1x:
526 adds r4, r4, #64 @ undo the bias; r4 = bytes remaining
527 beq .Lxtsencout
528 subs r4, r4, #16
529 bmi .LxtsencctsNx @ only a partial block is left
530 .Lxtsencloop:
531 vld1.8 {q0}, [r1]!
532 .Lxtsencctsout:
533 veor q0, q0, q4
534 bl aes_encrypt
535 veor q0, q0, q4
536 teq r4, #0
537 beq .Lxtsencout
538 subs r4, r4, #16
539 next_tweak q4, q4, q15, q6 @ vector ops only: flags of subs stay live
540 bmi .Lxtsenccts @ fewer than 16 bytes left: steal
541 vst1.8 {q0}, [r0]!
542 b .Lxtsencloop
543 .Lxtsencout:
544 vst1.8 {q0}, [r0]
545 .Lxtsencret:
546 vst1.8 {q4}, [r5]
547 pop {r4-r6, pc}
548
549 .LxtsencctsNx:
550 vmov q0, q3 @ steal from the 4th block of the 4x pass
551 sub r0, r0, #16 @ ... which gets rewritten in place
552 .Lxtsenccts:
553 movw ip, :lower16:.Lcts_permute_table
554 movt ip, :upper16:.Lcts_permute_table
555
556 add r1, r1, r4 @ rewind input pointer
557 add r4, r4, #16 @ # bytes in final block
558 add lr, ip, #32
559 add ip, ip, r4
560 sub lr, lr, r4
561 add r4, r0, r4 @ output address of final block
562
563 vld1.8 {q1}, [r1] @ load final partial block
564 vld1.8 {q2}, [ip]
565 vld1.8 {q3}, [lr]
566
567 vtbl.8 d4, {d0-d1}, d4 @ q2 = leading ct bytes, shifted to the tail
568 vtbl.8 d5, {d0-d1}, d5
569 vtbx.8 d0, {d2-d3}, d6 @ head of q0 = final pt bytes, rest unchanged
570 vtbx.8 d1, {d2-d3}, d7
571
572 vst1.8 {q2}, [r4] @ overlapping stores
573 mov r4, #0 @ makes the shared path exit after this block
574 b .Lxtsencctsout
575 ENDPROC(ce_aes_xts_encrypt)
576
577
@ ce_aes_xts_decrypt: XTS decryption with ciphertext stealing for a final
@ partial block.
@   r0 = output pointer, r1 = input pointer
@   r2 = round key array of key 1, r3 = number of rounds
@   stack arguments, in order: byte count, address of the 16-byte iv/tweak,
@   round key array of key 2, "first" flag (all fetched by the shared
@   prologue)
@ Register roles after the prologue: r4 = bytes remaining, r5 = iv pointer,
@ q4 = current tweak, q15 = tweak mask. When r6 is zero, the tweak is
@ advanced once before the first block is processed.
@ When the byte count is not a multiple of 16, one full block is held back
@ from the regular loops so that it reaches the stealing path together with
@ the partial block. In that path the held-back block is decrypted with the
@ following tweak (q5) and the reassembled block with the current one (q4).
@ The tweak in q4 is stored back to the iv buffer on exit.
578 ENTRY(ce_aes_xts_decrypt)
579 push {r4-r6, lr}
580
581 bl ce_aes_xts_init @ run shared prologue
582 prepare_key r2, r3
583 vmov q4, q0
584
585 @ subtract 16 bytes if we are doing CTS
586 tst r4, #0xf
587 subne r4, r4, #0x10
588
589 teq r6, #0 @ start of a block?
590 bne .Lxtsdec4x
591
592 .Lxtsdecloop4x:
593 next_tweak q4, q4, q15, q10
594 .Lxtsdec4x:
595 subs r4, r4, #64
596 bmi .Lxtsdec1x @ fewer than 64 bytes left
597 vld1.8 {q0-q1}, [r1]! @ get 4 ct blocks
598 vld1.8 {q2-q3}, [r1]!
599 next_tweak q5, q4, q15, q10
600 veor q0, q0, q4
601 next_tweak q6, q5, q15, q10
602 veor q1, q1, q5
603 next_tweak q7, q6, q15, q10
604 veor q2, q2, q6
605 veor q3, q3, q7
606 bl aes_decrypt_4x
607 veor q0, q0, q4
608 veor q1, q1, q5
609 veor q2, q2, q6
610 veor q3, q3, q7
611 vst1.8 {q0-q1}, [r0]! @ write 4 pt blocks
612 vst1.8 {q2-q3}, [r0]!
613 vmov q4, q7 @ carry the last tweak forward
614 teq r4, #0
615 beq .Lxtsdecout @ input fully consumed
616 b .Lxtsdecloop4x
617 .Lxtsdec1x:
618 adds r4, r4, #64 @ undo the bias; r4 = bytes remaining
619 beq .Lxtsdecout
620 subs r4, r4, #16
621 .Lxtsdecloop:
622 vld1.8 {q0}, [r1]! @ the load leaves the flags of subs intact
623 bmi .Lxtsdeccts @ fewer than 16 bytes follow: steal
624 .Lxtsdecctsout:
625 veor q0, q0, q4
626 bl aes_decrypt
627 veor q0, q0, q4
628 vst1.8 {q0}, [r0]!
629 teq r4, #0
630 beq .Lxtsdecout
631 subs r4, r4, #16
632 next_tweak q4, q4, q15, q6 @ vector ops only: flags of subs stay live
633 b .Lxtsdecloop
634 .Lxtsdecout:
635 vst1.8 {q4}, [r5]
636 pop {r4-r6, pc}
637
638 .Lxtsdeccts:
639 movw ip, :lower16:.Lcts_permute_table
640 movt ip, :upper16:.Lcts_permute_table
641
642 add r1, r1, r4 @ rewind input pointer
643 add r4, r4, #16 @ # bytes in final block
644 add lr, ip, #32
645 add ip, ip, r4
646 sub lr, lr, r4
647 add r4, r0, r4 @ output address of final block
648
649 next_tweak q5, q4, q15, q6 @ q5 = tweak after the current one
650
651 vld1.8 {q1}, [r1] @ load final partial block
652 vld1.8 {q2}, [ip] @ ip and lr are consumed before the bl below
653 vld1.8 {q3}, [lr]
654
655 veor q0, q0, q5
656 bl aes_decrypt
657 veor q0, q0, q5
658
659 vtbl.8 d4, {d0-d1}, d4 @ q2 = leading pt bytes, shifted to the tail
660 vtbl.8 d5, {d0-d1}, d5
661 vtbx.8 d0, {d2-d3}, d6 @ head of q0 = final ct bytes, rest unchanged
662 vtbx.8 d1, {d2-d3}, d7
663
664 vst1.8 {q2}, [r4] @ overlapping stores
665 mov r4, #0 @ makes the shared path exit after this block
666 b .Lxtsdecctsout
667 ENDPROC(ce_aes_xts_decrypt)
668
669
670
671
672
673
@ ce_aes_sub: apply the AES S-box to each of the four bytes of the 32-bit
@ word in r0 and return the result in r0.
@ The word is replicated into all four lanes of q1 and run through aese.8
@ against an all-zero state. With four identical columns, the ShiftRows step
@ of aese.8 leaves every column unchanged, so lane 0 holds the plain
@ byte-wise substitution of the input. Clobbers q0 and q1.
674 ENTRY(ce_aes_sub)
675 vdup.32 q1, r0
676 veor q0, q0, q0 @ q0 = 0
677 aese.8 q0, q1
678 vmov r0, s0
679 bx lr
680 ENDPROC(ce_aes_sub)
681
682
683
684
685
@ ce_aes_invert: load the 16 bytes at [r1], apply the AES InvMixColumns
@ transform (aesimc.8) and store the result at [r0]. Clobbers q0.
@ Presumably used to turn an encryption round key into its decryption
@ counterpart; confirm against the callers.
686 ENTRY(ce_aes_invert)
687 vld1.32 {q0}, [r1]
688 aesimc.8 q0, q0
689 vst1.32 {q0}, [r0]
690 bx lr
691 ENDPROC(ce_aes_invert)
692
@ Permute table for the ciphertext stealing paths: 16 bytes of 0xff, the
@ identity indices 0..15, then another 16 bytes of 0xff (48 bytes total).
@ A 16-byte window loaded at offset n, or at offset 32 - n, yields a vtbl /
@ vtbx index vector that shifts a block by n bytes. The 0xff entries are out
@ of range, so vtbl writes zero for them and vtbx leaves the destination
@ byte untouched.
693 .section ".rodata", "a"
694 .align 6
695 .Lcts_permute_table:
696 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
697 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
698 .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
699 .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
700 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
701 .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff