/*
 * aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
 *
 * Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.fpu		crypto-neon-fp-armv8
	.align		3

	.macro		enc_round, state, key
	aese.8		\state, \key
	aesmc.8		\state, \state
	.endm

	.macro		dec_round, state, key
	aesd.8		\state, \key
	aesimc.8	\state, \state
	.endm

	.macro		enc_dround, key1, key2
	enc_round	q0, \key1
	enc_round	q0, \key2
	.endm

	.macro		dec_dround, key1, key2
	dec_round	q0, \key1
	dec_round	q0, \key2
	.endm

	.macro		enc_fround, key1, key2, key3
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
	.endm

	.macro		dec_fround, key1, key2, key3
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
	.endm
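
	@ The final AES round has no MixColumns step, so the fround macros
	@ end with a bare aese/aesd (which folds AddRoundKey, ShiftRows and
	@ SubBytes into a single instruction) followed by a veor with the
	@ last round key, instead of another full enc_round/dec_round pair.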

	.macro		enc_dround_3x, key1, key2
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	enc_round	q0, \key2
	enc_round	q1, \key2
	enc_round	q2, \key2
	.endm

	.macro		dec_dround_3x, key1, key2
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	dec_round	q0, \key2
	dec_round	q1, \key2
	dec_round	q2, \key2
	.endm

	.macro		enc_fround_3x, key1, key2, key3
	enc_round	q0, \key1
	enc_round	q1, \key1
	enc_round	q2, \key1
	aese.8		q0, \key2
	aese.8		q1, \key2
	aese.8		q2, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	.endm

	.macro		dec_fround_3x, key1, key2, key3
	dec_round	q0, \key1
	dec_round	q1, \key1
	dec_round	q2, \key1
	aesd.8		q0, \key2
	aesd.8		q1, \key2
	aesd.8		q2, \key2
	veor		q0, q0, \key3
	veor		q1, q1, \key3
	veor		q2, q2, \key3
	.endm

	.macro		do_block, dround, fround
	cmp		r3, #12			@ which key size?
	vld1.8		{q10-q11}, [ip]!
	\dround		q8, q9
	vld1.8		{q12-q13}, [ip]!
	\dround		q10, q11
	vld1.8		{q10-q11}, [ip]!
	\dround		q12, q13
	vld1.8		{q12-q13}, [ip]!
	\dround		q10, q11
	blo		0f			@ AES-128: 10 rounds
	vld1.8		{q10-q11}, [ip]!
	beq		1f			@ AES-192: 12 rounds
	\dround		q12, q13
	vld1.8		{q12-q13}, [ip]
	\dround		q10, q11
0:	\fround		q12, q13, q14
	bx		lr

1:	\dround		q12, q13
	\fround		q10, q11, q14
	bx		lr
	.endm
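
	@ do_block dispatches on the round count in r3: blo takes the
	@ 10 round AES-128 path, beq the 12 round AES-192 path, and the
	@ fall-through handles the 14 rounds of AES-256. The round keys
	@ are expected to be laid out consecutively in memory, 16 bytes
	@ each, with the final round key at byte offset 16 * rounds.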

	/*
	 * Internal, non-AAPCS compliant functions that implement the core AES
	 * transforms. These should preserve all registers except q0 - q2 and ip.
	 * Arguments:
	 *   q0        : first in/output block
	 *   q1        : second in/output block (_3x version only)
	 *   q2        : third in/output block (_3x version only)
	 *   q8        : first round key
	 *   q9        : second round key
	 *   ip        : address of 3rd round key
	 *   q14       : final round key
	 *   r3        : number of rounds
	 */
	.align		6
aes_encrypt:
	add		ip, r2, #32		@ 3rd round key
.Laes_encrypt_tweak:
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)

	.align		6
aes_decrypt:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)

	.align		6
aes_encrypt_3x:
	add		ip, r2, #32		@ 3rd round key
	do_block	enc_dround_3x, enc_fround_3x
ENDPROC(aes_encrypt_3x)

	.align		6
aes_decrypt_3x:
	add		ip, r2, #32		@ 3rd round key
	do_block	dec_dround_3x, dec_fround_3x
ENDPROC(aes_decrypt_3x)

	.macro		prepare_key, rk, rounds
	add		ip, \rk, \rounds, lsl #4
	vld1.8		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.8		{q14}, [ip]		@ load last round key
	.endm
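
	@ prepare_key relies on the key schedule layout described above:
	@ the lsl #4 computes \rounds * 16, the byte offset of the final
	@ round key, while ip is repointed at the 3rd round key by each
	@ entry point so do_block can stream in the remaining keys two at
	@ a time.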

	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks)
	 */
ENTRY(ce_aes_ecb_encrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbencloop3x:
	subs		r4, r4, #3
	bmi		.Lecbenc1x
	vld1.8		{q0-q1}, [r1, :64]!
	vld1.8		{q2}, [r1, :64]!
	bl		aes_encrypt_3x
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	b		.Lecbencloop3x
.Lecbenc1x:
	adds		r4, r4, #3
	beq		.Lecbencout
.Lecbencloop:
	vld1.8		{q0}, [r1, :64]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lecbencloop
.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)

ENTRY(ce_aes_ecb_decrypt)
	push		{r4, lr}
	ldr		r4, [sp, #8]
	prepare_key	r2, r3
.Lecbdecloop3x:
	subs		r4, r4, #3
	bmi		.Lecbdec1x
	vld1.8		{q0-q1}, [r1, :64]!
	vld1.8		{q2}, [r1, :64]!
	bl		aes_decrypt_3x
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	b		.Lecbdecloop3x
.Lecbdec1x:
	adds		r4, r4, #3
	beq		.Lecbdecout
.Lecbdecloop:
	vld1.8		{q0}, [r1, :64]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)

	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[])
	 */
ENTRY(ce_aes_cbc_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q0}, [r5]
	prepare_key	r2, r3
.Lcbcencloop:
	vld1.8		{q1}, [r1, :64]!	@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)

ENTRY(ce_aes_cbc_decrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q6}, [r5]		@ keep iv in q6
	prepare_key	r2, r3
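
	@ CBC encryption above is inherently serial, since each block must
	@ be xor'ed with the previous ciphertext before it can be
	@ encrypted. Decryption has no such dependency, so three blocks are
	@ handled per iteration here, keeping copies of the ciphertext in
	@ q3-q5 so it can be xor'ed back in after the decryption; q6
	@ carries the IV or previous ciphertext block between iterations.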
.Lcbcdecloop3x:
	subs		r4, r4, #3
	bmi		.Lcbcdec1x
	vld1.8		{q0-q1}, [r1, :64]!
	vld1.8		{q2}, [r1, :64]!
	vmov		q3, q0
	vmov		q4, q1
	vmov		q5, q2
	bl		aes_decrypt_3x
	veor		q0, q0, q6
	veor		q1, q1, q3
	veor		q2, q2, q4
	vmov		q6, q5
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	b		.Lcbcdecloop3x
.Lcbcdec1x:
	adds		r4, r4, #3
	beq		.Lcbcdecout
	vmov		q15, q14		@ preserve last round key
.Lcbcdecloop:
	vld1.8		{q0}, [r1, :64]!	@ get next ct block
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q6, q0
	bl		aes_decrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	vst1.8		{q6}, [r5]		@ write back iv
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)

	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 ctr[])
	 */
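	@ ctr[] holds the counter as a 16 byte big-endian integer; the
	@ incremented counter is written back on return (at .Lctrout) so
	@ that consecutive calls produce one continuous keystream.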
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]
	vld1.8		{q6}, [r5]		@ load ctr
	prepare_key	r2, r3
	vmov		r6, s27			@ keep swabbed ctr in r6
	rev		r6, r6
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop
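	@ Fast path: only the least significant counter word (kept
	@ byte-swabbed in r6) is incremented per block. The cmn above
	@ checks whether that word would wrap while processing r4 blocks;
	@ if so, the code falls back to .Lctrloop, which ripples the carry
	@ into the upper words via .Lctrcarry.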
.Lctrloop3x:
	subs		r4, r4, #3
	bmi		.Lctr1x
	add		r6, r6, #1
	vmov		q0, q6
	vmov		q1, q6
	rev		ip, r6
	add		r6, r6, #1
	vmov		q2, q6
	vmov		s7, ip
	rev		ip, r6
	add		r6, r6, #1
	vmov		s11, ip
	vld1.8		{q3-q4}, [r1, :64]!
	vld1.8		{q5}, [r1, :64]!
	bl		aes_encrypt_3x
	veor		q0, q0, q3
	veor		q1, q1, q4
	veor		q2, q2, q5
	rev		ip, r6
	vst1.8		{q0-q1}, [r0, :64]!
	vst1.8		{q2}, [r0, :64]!
	vmov		s27, ip
	b		.Lctrloop3x
.Lctr1x:
	adds		r4, r4, #3
	beq		.Lctrout
.Lctrloop:
	vmov		q0, q6
	bl		aes_encrypt
	subs		r4, r4, #1
	bmi		.Lctrhalfblock		@ blocks < 0 means 1/2 block
	vld1.8		{q3}, [r1, :64]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0, :64]!

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6
	vmov		s27, ip
	bcs		.Lctrcarry
	teq		r4, #0
	bne		.Lctrloop
.Lctrout:
	vst1.8		{q6}, [r5]
	pop		{r4-r6, pc}

.Lctrhalfblock:
	vld1.8		{d1}, [r1, :64]
	veor		d0, d0, d1
	vst1.8		{d0}, [r0, :64]
	pop		{r4-r6, pc}

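	@ Carry out of the low counter word: s26, s25 and s24 hold the
	@ successively more significant words of the big-endian counter.
	@ Each is byte-reversed, incremented and stored back, stopping as
	@ soon as an increment does not wrap around to zero.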
.Lctrcarry:
	.irp		sreg, s26, s25, s24
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		0f
	.endr
0:	teq		r4, #0
	beq		.Lctrout
	b		.Lctrloop
ENDPROC(ce_aes_ctr_encrypt)

	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int blocks, u8 iv[], u8 const rk2[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int blocks, u8 iv[], u8 const rk2[], int first)
	 */

	.macro		next_tweak, out, in, const, tmp
	vshr.s64	\tmp, \in, #63
	vand		\tmp, \tmp, \const
	vadd.u64	\out, \in, \in
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
	.endm
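
	@ next_tweak computes \out = \in * x in GF(2^128) with the XTS
	@ modulus x^128 + x^7 + x^2 + x + 1: the vadd doubles both 64-bit
	@ halves, and the shift/vand/vext sequence turns the top bit of
	@ each half into the value that must be xor'ed into the opposite
	@ half (0x87 for the reduction, 1 for the inter-half carry), using
	@ the constant pair in .Lxts_mul_x below, expected in \const.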

	.align		3
.Lxts_mul_x:
	.quad		1, 0x87

ce_aes_xts_init:
	vldr		d14, .Lxts_mul_x
	vldr		d15, .Lxts_mul_x + 8

	ldrd		r4, r5, [sp, #16]	@ load args
	ldr		r6, [sp, #28]
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)

ENTRY(ce_aes_xts_encrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q3, q0

	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc3x

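	@ Throughout the loop, q3 holds the current tweak (seeded by
	@ ce_aes_xts_init), q7 holds the .Lxts_mul_x constant and q6 is
	@ scratch for next_tweak; q4 and q5 receive the tweaks for the
	@ second and third block of each triplet.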
.Lxtsencloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsenc3x:
	subs		r4, r4, #3
	bmi		.Lxtsenc1x
	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 pt blocks
	vld1.8		{q2}, [r1, :64]!
	next_tweak	q4, q3, q7, q6
	veor		q0, q0, q3
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_encrypt_3x
	veor		q0, q0, q3
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 ct blocks
	vst1.8		{q2}, [r0, :64]!
	vmov		q3, q5
	teq		r4, #0
	beq		.Lxtsencout
	b		.Lxtsencloop3x
.Lxtsenc1x:
	adds		r4, r4, #3
	beq		.Lxtsencout
.Lxtsencloop:
	vld1.8		{q0}, [r1, :64]!
	veor		q0, q0, q3
	bl		aes_encrypt
	veor		q0, q0, q3
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	beq		.Lxtsencout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsencloop
.Lxtsencout:
	vst1.8		{q3}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_xts_encrypt)

ENTRY(ce_aes_xts_decrypt)
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	prepare_key	r2, r3
	vmov		q3, q0

	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec3x

.Lxtsdecloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsdec3x:
	subs		r4, r4, #3
	bmi		.Lxtsdec1x
	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 ct blocks
	vld1.8		{q2}, [r1, :64]!
	next_tweak	q4, q3, q7, q6
	veor		q0, q0, q3
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_decrypt_3x
	veor		q0, q0, q3
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 pt blocks
	vst1.8		{q2}, [r0, :64]!
	vmov		q3, q5
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop3x
.Lxtsdec1x:
	adds		r4, r4, #3
	beq		.Lxtsdecout
.Lxtsdecloop:
	vld1.8		{q0}, [r1, :64]!
	veor		q0, q0, q3
	add		ip, r2, #32		@ 3rd round key
	bl		aes_decrypt
	veor		q0, q0, q3
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	beq		.Lxtsdecout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsdecloop
.Lxtsdecout:
	vst1.8		{q3}, [r5]
	pop		{r4-r6, pc}
ENDPROC(ce_aes_xts_decrypt)

	/*
	 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
	 *                             AES sbox substitution on each byte in
	 *                             'input'
	 */
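	@ With a zeroed state, aese.8 computes SubBytes(ShiftRows(input)),
	@ and since vdup.32 replicates the input word across all four
	@ columns, ShiftRows only permutes identical bytes: the word read
	@ back from s0 is simply the AES sbox applied to each byte of r0.
	@ This is intended for use by the key expansion code.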
ENTRY(ce_aes_sub)
	vdup.32		q1, r0
	veor		q0, q0, q0
	aese.8		q0, q1
	vmov		r0, s0
	bx		lr
ENDPROC(ce_aes_sub)

	/*
	 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
	 *                                        operation on round key *src
	 */
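	@ Applying InvMixColumns to an encryption round key produces the
	@ corresponding round key of the equivalent inverse cipher, which
	@ is what the aesd/aesimc based decryption rounds above expect.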
ENTRY(ce_aes_invert)
	vld1.8		{q0}, [r1]
	aesimc.8	q0, q0
	vst1.8		{q0}, [r0]
	bx		lr
ENDPROC(ce_aes_invert)