/*
 * Fast AES implementation for SPE instruction set (PPC)
 *
 * This code makes use of the SPE SIMD instruction set as defined in
 * http://cache.freescale.com/files/32bit/doc/ref_manual/SPEPIM.pdf
 * Implementation is based on optimization guide notes from
 * http://cache.freescale.com/files/32bit/doc/app_note/AN2665.pdf
 *
 * Copyright (c) 2015 Markus Stockhausen <stockhausen@collogia.de>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 */

#include <asm/ppc_asm.h>
#include "aes-spe-regs.h"

/*
 * Table addressing helpers.
 *
 * EAD and DAD use rlwimi to insert one byte of "in" into a table pointer
 * register (rT0 resp. rT1). rlwimi only replaces the masked bits, all other
 * bits of the pointer register are kept, so the caller has to load the table
 * base into rT0/rT1 before any of these macros is used.
 *
 * bpos selects the byte of "in", counted from the least significant byte:
 * bpos 0 = bits 7..0, bpos 1 = bits 15..8, bpos 2 = bits 23..16,
 * bpos 3 = bits 31..24.
 */

/*
 * EAD: insert byte bpos of "in" into mask bits 20-27 of rT0, i.e. the byte
 * value scaled by 16. Every index thus addresses a 16 byte wide table row;
 * the "off" argument of LWH/LWL/LBZ picks the position inside that row.
 * NOTE(review): presumably the table behind rT0 is 4 KiB aligned - confirm
 * against the table definition.
 */
#define	EAD(in, bpos) \
	rlwimi		rT0,in,28-((bpos+3)%4)*8,20,27;

/*
 * DAD: insert byte bpos of "in" unscaled into mask bits 24-31 of rT1, so
 * rT1 addresses single bytes.
 * NOTE(review): presumably rT1 points to a 256 byte aligned byte table -
 * confirm against the caller.
 */
#define DAD(in, bpos) \
	rlwimi		rT1,in,24-((bpos+3)%4)*8,24,31;

/* LWH: load the word at off(rT0) with evlwwsplat (see SPE PIM) */
#define LWH(out, off) \
	evlwwsplat	out,off(rT0);	/* load word high		*/

/* LWL: load the word at off(rT0) with a plain 32 bit lwz */
#define LWL(out, off) \
	lwz		out,off(rT0);	/* load word low		*/

/* LBZ: load the single byte at off(tab) */
#define LBZ(out, tab, off) \
	lbz		out,off(tab);	/* load byte			*/

/* LAH/LAL: EAD followed by LWH resp. LWL on the freshly built address */
#define LAH(out, in, bpos, off) \
	EAD(in, bpos)			/* calc addr + load word high	*/ \
	LWH(out, off)

#define LAL(out, in, bpos, off) \
	EAD(in, bpos)			/* calc addr + load word low	*/ \
	LWL(out, off)

/* LAE: EAD followed by a byte load from offset 8 of the addressed row */
#define LAE(out, in, bpos) \
	EAD(in, bpos)			/* calc addr + load enc byte	*/ \
	LBZ(out, rT0, 8)

/* LBE: byte load from offset 8 of the row rT0 currently points to */
#define LBE(out) \
	LBZ(out, rT0, 8)		/* load enc byte		*/

/* LAD: DAD followed by a byte load from 0(rT1) */
#define LAD(out, in, bpos) \
	DAD(in, bpos)			/* calc addr + load dec byte	*/ \
	LBZ(out, rT1, 0)

/* LBD: byte load from the address rT1 currently holds */
#define LBD(out) \
	LBZ(out, rT1, 0)		/* load dec byte		*/

/*
 * ppc_encrypt_block: The central encryption function for a single 16 bytes
 * block. It does no stack handling or register saving to support fast calls
 * via bl/blr. It expects that caller has pre-xored input data with first
 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
 * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
 * and rW0-rW3 and caller must execute a final xor on the output registers.
 * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
 *
 * Code layout:
 *  - three table loads are issued ahead of the loop; each round inside the
 *    loop ends with the same three loads on behalf of the following round
 *  - the loop body holds two rounds per CTR iteration, taking key material
 *    from 16(rKP)..47(rKP), then advances rKP by 32
 *  - one more table based round follows the loop (key at 16(rKP)/24(rKP))
 *  - the last round uses single byte loads (offset 8 of the table row) that
 *    are merged into rW0-rW3 with rlwimi, interleaved with loading
 *    32(rKP)..44(rKP) into rD0-rD3
 *
 * Besides the working registers the index bits (20-27) of rT0 are rewritten
 * by every EAD and rKP is left advanced by 32 bytes per loop iteration.
 */
_GLOBAL(ppc_encrypt_block)
	/* table loads that the first round expects to be already in flight */
	LAH(rW4, rD1, 2, 4)
	LAH(rW6, rD0, 3, 0)
	LAH(rW3, rD0, 1, 8)
ppc_encrypt_block_loop:
	/* first round of this iteration: table lookups */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD3, 2, 4)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	/* xor lookups with key at 16(rKP)/24(rKP), remaining loads interleaved */
	evldw		rD1,16(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	/* from here on the three leading loads of the next round are issued */
	EAD(rD1, 2)
	evxor		rW3,rW3,rW5
	LWH(rW4, 4)
	evxor		rW3,rW3,rW7
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW1
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	/* second round of this iteration: same sequence, key at 32(rKP)/40(rKP) */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD3, 2, 4)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw		rD1,32(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,40(rKP)
	evmergehi	rD0,rD0,rD1
	EAD(rD1, 2)
	evxor		rW3,rW3,rW5
	LWH(rW4, 4)
	evxor		rW3,rW3,rW7
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW1
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	/* step the key pointer over the 32 key bytes just consumed */
	addi		rKP,rKP,32
	bdnz		ppc_encrypt_block_loop
	/* last table based round, key at 16(rKP)/24(rKP) */
	LAH(rW0, rD3, 0, 12)
	LAL(rW0, rD0, 0, 12)
	LAH(rW1, rD1, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAL(rW3, rD1, 1, 8)
	LAL(rW4, rD2, 2, 4)
	LAH(rW5, rD3, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAL(rW5, rD0, 2, 4)
	LAH(rW7, rD2, 3, 0)
	evldw		rD1,16(rKP)
	EAD(rD3, 3)
	evxor		rW2,rW2,rW4
	LWL(rW7, 0)
	evxor		rW2,rW2,rW6
	EAD(rD2, 0)
	evxor		rD1,rD1,rW2
	LWL(rW1, 12)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	/* the tail now starts the byte loads of the final round instead */
	EAD(rD1, 0)
	evxor		rW3,rW3,rW5
	LBE(rW2)
	evxor		rW3,rW3,rW7
	EAD(rD0, 1)
	evxor		rD3,rD3,rW3
	LBE(rW6)
	evxor		rD3,rD3,rW1
	EAD(rD0, 0)
	evmergehi	rD2,rD2,rD3
	LBE(rW1)
	/*
	 * final round: gather one table byte per state byte and merge them
	 * into rW0-rW3 (rlwimi shift 8 -> bits 16-23, 16 -> bits 8-15,
	 * 24 -> bits 0-7); rD0-rD3 receive the key words at 32(rKP)..44(rKP)
	 */
	LAE(rW0, rD3, 0)
	LAE(rW1, rD0, 0)
	LAE(rW4, rD2, 1)
	LAE(rW5, rD3, 1)
	LAE(rW3, rD2, 0)
	LAE(rW7, rD1, 1)
	rlwimi		rW0,rW4,8,16,23
	rlwimi		rW1,rW5,8,16,23
	LAE(rW4, rD1, 2)
	LAE(rW5, rD2, 2)
	rlwimi		rW2,rW6,8,16,23
	rlwimi		rW3,rW7,8,16,23
	LAE(rW6, rD3, 2)
	LAE(rW7, rD0, 2)
	rlwimi		rW0,rW4,16,8,15
	rlwimi		rW1,rW5,16,8,15
	LAE(rW4, rD0, 3)
	LAE(rW5, rD1, 3)
	rlwimi		rW2,rW6,16,8,15
	lwz		rD0,32(rKP)
	rlwimi		rW3,rW7,16,8,15
	lwz		rD1,36(rKP)
	LAE(rW6, rD2, 3)
	LAE(rW7, rD3, 3)
	rlwimi		rW0,rW4,24,0,7
	lwz		rD2,40(rKP)
	rlwimi		rW1,rW5,24,0,7
	lwz		rD3,44(rKP)
	rlwimi		rW2,rW6,24,0,7
	rlwimi		rW3,rW7,24,0,7
	/* caller still has to xor rW0-rW3 with rD0-rD3 */
	blr

/*
 * ppc_decrypt_block: The central decryption function for a single 16 bytes
 * block. It does no stack handling or register saving to support fast calls
 * via bl/blr. It expects that caller has pre-xored input data with first
 * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
 * have also been set up before (rT0, rT1, rKP, CTR). Output is stored in
 * rD0-rD3 and rW0-rW3 and caller must execute a final xor on the output
 * registers. All working registers rD0-rD3 & rW0-rW7 are overwritten during
 * processing.
 *
 * Code layout:
 *  - three table loads are issued ahead of the loop; each round inside the
 *    loop ends with the same three loads on behalf of the following round
 *  - the loop body holds two rounds per CTR iteration, taking key material
 *    from 16(rKP)..47(rKP), then advances rKP by 32
 *  - one more table based round follows the loop (key at 16(rKP)/24(rKP))
 *  - the last round uses single byte loads through rT1 (DAD/LAD/LBD) that
 *    are merged into rW0-rW3 with rlwimi, interleaved with loading
 *    32(rKP)..44(rKP) into rD0-rD3
 *
 * Besides the working registers the index bits of rT0 (20-27) and rT1
 * (24-31) are rewritten by EAD/DAD and rKP is left advanced by 32 bytes per
 * loop iteration.
 */
_GLOBAL(ppc_decrypt_block)
	/* table loads that the first round expects to be already in flight */
	LAH(rW0, rD1, 0, 12)
	LAH(rW6, rD0, 3, 0)
	LAH(rW3, rD0, 1, 8)
ppc_decrypt_block_loop:
	/* first round of this iteration: table lookups */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	/* xor lookups with key at 16(rKP)/24(rKP), remaining loads interleaved */
	evldw		rD1,16(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	/* from here on the three leading loads of the next round are issued */
	EAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LWH(rW0, 12)
	evxor		rW3,rW3,rW1
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW5
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	/* second round of this iteration: same sequence, key at 32(rKP)/40(rKP) */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw		rD1,32(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,40(rKP)
	evmergehi	rD0,rD0,rD1
	EAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LWH(rW0, 12)
	evxor		rW3,rW3,rW1
	EAD(rD0, 3)
	evxor		rD3,rD3,rW3
	LWH(rW6, 0)
	evxor		rD3,rD3,rW5
	EAD(rD0, 1)
	evmergehi	rD2,rD2,rD3
	LWH(rW3, 8)
	/* step the key pointer over the 32 key bytes just consumed */
	addi		rKP,rKP,32
	bdnz		ppc_decrypt_block_loop
	/* last table based round, key at 16(rKP)/24(rKP) */
	LAH(rW1, rD3, 0, 12)
	LAL(rW0, rD2, 0, 12)
	LAH(rW2, rD2, 1, 8)
	LAL(rW2, rD3, 1, 8)
	LAH(rW4, rD3, 2, 4)
	LAL(rW4, rD0, 2, 4)
	LAL(rW6, rD1, 3, 0)
	LAH(rW5, rD1, 2, 4)
	LAH(rW7, rD2, 3, 0)
	LAL(rW7, rD3, 3, 0)
	LAL(rW3, rD1, 1, 8)
	evldw		rD1,16(rKP)
	EAD(rD0, 0)
	evxor		rW4,rW4,rW6
	LWL(rW1, 12)
	evxor		rW0,rW0,rW4
	EAD(rD2, 2)
	evxor		rW0,rW0,rW2
	LWL(rW5, 4)
	evxor		rD1,rD1,rW0
	evldw		rD3,24(rKP)
	evmergehi	rD0,rD0,rD1
	/* the tail now starts the byte loads of the final round via rT1 */
	DAD(rD1, 0)
	evxor		rW3,rW3,rW7
	LBD(rW0)
	evxor		rW3,rW3,rW1
	DAD(rD0, 1)
	evxor		rD3,rD3,rW3
	LBD(rW6)
	evxor		rD3,rD3,rW5
	DAD(rD0, 0)
	evmergehi	rD2,rD2,rD3
	LBD(rW3)
	/*
	 * final round: gather one table byte per state byte and merge them
	 * into rW0-rW3 (rlwimi shift 8 -> bits 16-23, 16 -> bits 8-15,
	 * 24 -> bits 0-7); rD0-rD3 receive the key words at 32(rKP)..44(rKP)
	 */
	LAD(rW2, rD3, 0)
	LAD(rW1, rD2, 0)
	LAD(rW4, rD2, 1)
	LAD(rW5, rD3, 1)
	LAD(rW7, rD1, 1)
	rlwimi		rW0,rW4,8,16,23
	rlwimi		rW1,rW5,8,16,23
	LAD(rW4, rD3, 2)
	LAD(rW5, rD0, 2)
	rlwimi		rW2,rW6,8,16,23
	rlwimi		rW3,rW7,8,16,23
	LAD(rW6, rD1, 2)
	LAD(rW7, rD2, 2)
	rlwimi		rW0,rW4,16,8,15
	rlwimi		rW1,rW5,16,8,15
	LAD(rW4, rD0, 3)
	LAD(rW5, rD1, 3)
	rlwimi		rW2,rW6,16,8,15
	lwz		rD0,32(rKP)
	rlwimi		rW3,rW7,16,8,15
	lwz		rD1,36(rKP)
	LAD(rW6, rD2, 3)
	LAD(rW7, rD3, 3)
	rlwimi		rW0,rW4,24,0,7
	lwz		rD2,40(rKP)
	rlwimi		rW1,rW5,24,0,7
	lwz		rD3,44(rKP)
	rlwimi		rW2,rW6,24,0,7
	rlwimi		rW3,rW7,24,0,7
	/* caller still has to xor rW0-rW3 with rD0-rD3 */
	blr
