1/*
2 * Blowfish Cipher Algorithm (x86_64)
3 *
4 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
19 * USA
20 *
21 */
22
23#include <linux/linkage.h>
24
25.file "blowfish-x86_64-asm.S"
26.text
27
/*
 * Byte offsets into the crypto context addressed through CTX:
 * 16 + 2 32-bit subkey words at p, followed by four tables (s0..s3)
 * of 256 32-bit entries each.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	(s0 + (1 * 256 * 4))
#define s2	(s0 + (2 * 256 * 4))
#define s3	(s0 + (3 * 256 * 4))
34
/*
 * Register assignments.
 *
 * RX0..RX3 each carry one 64-bit block.  They map to rax/rbx/rcx/rdx;
 * F() and F4() read the lowest (bl) and second-lowest (bh) byte of
 * these registers to form table indices.  The "d" names are the 32-bit
 * views of the same registers.
 */
#define CTX	%rdi	/* crypto context base */
#define RIO	%rsi	/* address of the block being read or written */

#define RX0	%rax
#define RX0d	%eax
#define RX0bl	%al
#define RX0bh	%ah

#define RX1	%rbx
#define RX1d	%ebx
#define RX1bl	%bl
#define RX1bh	%bh

#define RX2	%rcx
#define RX2d	%ecx
#define RX2bl	%cl
#define RX2bh	%ch

#define RX3	%rdx
#define RX3d	%edx
#define RX3bl	%dl
#define RX3bh	%dh

/*
 * Scratch registers for the round function.  RT1 is the same physical
 * register as RIO (%rsi), so RIO does not survive F()/F4() and has to
 * be reloaded before the result is stored.
 */
#define RT0	%rbp
#define RT0d	%ebp
#define RT1	%rsi
#define RT1d	%esi
#define RT2	%r8
#define RT2d	%r8d
#define RT3	%r9
#define RT3d	%r9d

/* round key preloaded by the 4-way macros */
#define RKEY	%r10
70
71/***********************************************************************
72 * 1-way blowfish
73 ***********************************************************************/
/*
 * F(): one Feistel step on the block held in RX0.
 *
 * With a, b, c, d the bytes of the low 32 bits of RX0 (a most
 * significant), compute in 32-bit arithmetic
 *	f = ((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 * then swap the two 32-bit halves of RX0 and XOR f into the new low half.
 *
 * Clobbers RT0, RT1 (the same register as RIO), RT2 and the flags.
 */
#define F() \
	rorq	$16, RX0;		/* bring a:b into RX0bh:RX0bl */ \
	movzbl	RX0bh, RT0d;		/* RT0 = a */ \
	movzbl	RX0bl, RT1d;		/* RT1 = b */ \
	rolq	$16, RX0;		/* undo the rotate */ \
	movl	s0(CTX,RT0,4), RT0d;	/* f  = s0[a] */ \
	addl	s1(CTX,RT1,4), RT0d;	/* f += s1[b] */ \
	movzbl	RX0bh, RT1d;		/* RT1 = c */ \
	movzbl	RX0bl, RT2d;		/* RT2 = d */ \
	rolq	$32, RX0;		/* swap the 32-bit halves */ \
	xorl	s2(CTX,RT1,4), RT0d;	/* f ^= s2[c] */ \
	addl	s3(CTX,RT2,4), RT0d;	/* f += s3[d] */ \
	xorq	RT0, RX0;		/* RT0 bits 63:32 are zero: low half only */
87
/*
 * XOR the 8 bytes at p + 4*n (subkey words n and n+1) into RX0 with a
 * single 64-bit operation: word n hits the low half, word n+1 the high.
 */
#define add_roundkey_enc(n) \
	xorq	p+4*(n)(CTX), RX0;
90
/* Two encryption rounds: mix in subkey words n and n+1, then F() twice. */
#define round_enc(n) \
	add_roundkey_enc(n); \
	F(); F();
96
/*
 * Decryption counterpart of add_roundkey_enc(): load subkey words n-1
 * and n as one 64-bit value, swap them so that word n lines up with the
 * low half of RX0, and XOR the pair into RX0.
 *
 * The argument is parenthesized (as preload_roundkey_dec() already does)
 * so that an expression argument cannot regroup with the "- 1".
 *
 * Clobbers RT0 and the flags.
 */
#define add_roundkey_dec(n) \
	movq	p+4*((n)-1)(CTX), RT0; \
	rorq	$32, RT0; \
	xorq	RT0, RX0;
101
/*
 * Two decryption rounds: mix in subkey words n and n-1, then F() twice.
 *
 * The last line deliberately carries no continuation backslash, so the
 * macro ends here regardless of what follows it in the file.
 */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();
107
/*
 * Load the 8 bytes at (RIO) into RX0.  After the rotate and byte swap,
 * each 32-bit word has been converted from big-endian and the word that
 * comes first in memory sits in the low half of RX0.
 */
#define read_block() \
	movq	(RIO), RX0; \
	rorq	$32, RX0; \
	bswapq	RX0;
112
/*
 * Store RX0 to the 8 bytes at (RIO): the high half goes out first, and
 * both 32-bit words are written big-endian.  RX0 is left byte-swapped.
 */
#define write_block() \
	bswapq	RX0; \
	movq	RX0, (RIO);
116
/*
 * Same byte layout as write_block(), but the result is XORed into the
 * 8 bytes already at (RIO) instead of overwriting them.
 */
#define xor_block() \
	bswapq	RX0; \
	xorq	RX0, (RIO);
120
ENTRY(__blowfish_enc_blk)
	/*
	 * Encrypt one 8-byte block.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 *
	 * Only the low byte of %rcx is tested.  When it is non-zero the
	 * result is XORed into dst, otherwise dst is overwritten.
	 *
	 * Clobbers %rax, %rsi, %r8, %r10, %r11 and the flags.  %rbp is
	 * used as scratch (RT0) and restored from %r11 before returning.
	 * NOTE(review): while %rbp is scratch it is not a valid frame
	 * pointer -- confirm this is acceptable for frame-pointer based
	 * stack unwinding.
	 */
	movq	%rbp, %r11		/* RT0 is %rbp: park the caller value */

	movq	%rsi, %r10		/* F() clobbers %rsi: keep dst aside */
	movq	%rdx, RIO

	read_block();

	/* sixteen rounds, two per macro, then the final subkey pair */
	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	movq	%r11, %rbp

	movq	%r10, RIO		/* point RIO at dst for the store */
	testb	%cl, %cl
	jnz	.L__enc_xor

	write_block();
	ret
.L__enc_xor:
	xor_block();
	ret
ENDPROC(__blowfish_enc_blk)
157
ENTRY(blowfish_dec_blk)
	/*
	 * Decrypt one 8-byte block; dst is always overwritten.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Clobbers %rax, %rsi, %r8, %r10, %r11 and the flags.  %rbp is
	 * used as scratch (RT0) and restored from %r11 before returning.
	 * NOTE(review): while %rbp is scratch it is not a valid frame
	 * pointer -- confirm this is acceptable for frame-pointer based
	 * stack unwinding.
	 */
	movq	%rbp, %r11		/* RT0 is %rbp: park the caller value */

	movq	%rsi, %r10		/* F() clobbers %rsi: keep dst aside */
	movq	%rdx, RIO

	read_block();

	/* subkeys are consumed from the top of the array downwards */
	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq	%r10, RIO		/* point RIO at dst for the store */
	write_block();

	movq	%r11, %rbp

	ret
ENDPROC(blowfish_dec_blk)
188
/***********************************************************************
 * 4-way blowfish, four blocks in parallel
 ***********************************************************************/
192
/*
 * F4(blk): the same Feistel step as F(), applied to the block register
 * named by blk (one of RX0..RX3).
 *
 * All four index bytes are extracted before any table is read.  This
 * form is slower than F() when used alone/1-way, but faster when four
 * blocks are processed in parallel (tested on AMD Phenom II & Intel
 * Xeon E7330).
 *
 * Clobbers RT0, RT1 (the same register as RIO), RT2, RT3 and the flags.
 */
#define F4(blk) \
	movzbl	blk ## bh, RT1d;	/* RT1 = byte 1 (index into s2) */ \
	movzbl	blk ## bl, RT3d;	/* RT3 = byte 0 (index into s3) */ \
	rorq	$16, blk; \
	movzbl	blk ## bh, RT0d;	/* RT0 = byte 3 (index into s0) */ \
	movzbl	blk ## bl, RT2d;	/* RT2 = byte 2 (index into s1) */ \
	rorq	$16, blk;		/* 2 x 16 bits: halves are now swapped */ \
	movl	s0(CTX,RT0,4), RT0d; \
	addl	s1(CTX,RT2,4), RT0d; \
	xorl	s2(CTX,RT1,4), RT0d; \
	addl	s3(CTX,RT3,4), RT0d; \
	xorq	RT0, blk;		/* RT0 bits 63:32 are zero: low half only */
208
/* XOR the subkey pair currently held in RKEY into all four blocks. */
#define add_preloaded_roundkey4() \
	xorq	RKEY, RX0; \
	xorq	RKEY, RX1; \
	xorq	RKEY, RX2; \
	xorq	RKEY, RX3;
214
/* Fetch subkey words n and n+1 (8 bytes at p + 4*n) into RKEY. */
#define preload_roundkey_enc(n) \
	movq	p+4*(n)(CTX), RKEY;
217
/*
 * Apply the subkey pair already in RKEY to the four blocks, then fetch
 * the pair the next call will need (words n+2 and n+3).
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc(n + 2);
221
/*
 * Two encryption rounds on four blocks.  The F4() steps of the
 * independent blocks are issued back to back, one block after another.
 */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); F4(RX1); F4(RX2); F4(RX3); \
	\
	F4(RX0); F4(RX1); F4(RX2); F4(RX3);
234
/*
 * Fetch subkey words n-1 and n into RKEY and swap the two 32-bit halves,
 * so that word n ends up in the low half.
 */
#define preload_roundkey_dec(n) \
	movq	p+4*((n)-1)(CTX), RKEY; \
	rorq	$32, RKEY;
238
/*
 * Apply the subkey pair already in RKEY to the four blocks, then fetch
 * the pair the next call will need (words n-2 and n-3, swapped).
 */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec(n - 2);
242
/*
 * Two decryption rounds on four blocks.  The F4() steps of the
 * independent blocks are issued back to back, one block after another.
 */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); F4(RX1); F4(RX2); F4(RX3); \
	\
	F4(RX0); F4(RX1); F4(RX2); F4(RX3);
255
/*
 * Load four consecutive 8-byte blocks from (RIO) into RX0..RX3, using
 * the same rotate-and-byte-swap conversion per block as read_block().
 */
#define read_block4() \
	movq	(RIO), RX0;   rorq $32, RX0; bswapq RX0; \
	movq	8(RIO), RX1;  rorq $32, RX1; bswapq RX1; \
	movq	16(RIO), RX2; rorq $32, RX2; bswapq RX2; \
	movq	24(RIO), RX3; rorq $32, RX3; bswapq RX3;
272
/*
 * Byte-swap RX0..RX3 and store them to four consecutive 8-byte slots
 * at (RIO), overwriting what was there.
 */
#define write_block4() \
	bswapq	RX0; movq RX0, (RIO); \
	bswapq	RX1; movq RX1, 8(RIO); \
	bswapq	RX2; movq RX2, 16(RIO); \
	bswapq	RX3; movq RX3, 24(RIO);
285
/*
 * Byte-swap RX0..RX3 and XOR them into the four consecutive 8-byte
 * slots at (RIO).
 */
#define xor_block4() \
	bswapq	RX0; xorq RX0, (RIO); \
	bswapq	RX1; xorq RX1, 8(RIO); \
	bswapq	RX2; xorq RX2, 16(RIO); \
	bswapq	RX3; xorq RX3, 24(RIO);
298
ENTRY(__blowfish_enc_blk_4way)
	/*
	 * Encrypt four consecutive 8-byte blocks (32 bytes).
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 *
	 * Only the low byte of the bool is tested.  When it is non-zero
	 * the results are XORed into dst, otherwise dst is overwritten.
	 *
	 * %rbx and %rbp are saved on the stack and restored.  Clobbers
	 * %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags.
	 * NOTE(review): while %rbp is scratch it is not a valid frame
	 * pointer -- confirm this is acceptable for frame-pointer based
	 * stack unwinding.
	 */
	pushq	%rbp
	pushq	%rbx
	pushq	%rcx			/* xor flag: %rcx is about to become RX2 */

	preload_roundkey_enc(0);

	movq	%rsi, %r11		/* F4() clobbers %rsi: keep dst aside */
	movq	%rdx, RIO

	read_block4();

	/* each round_enc4() applies RKEY and preloads the next pair */
	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	popq	%rbp			/* the xor flag pushed from %rcx */
	movq	%r11, RIO		/* point RIO at dst for the store */

	testb	%bpl, %bpl
	jnz	.L__enc_xor4

	write_block4();

	popq	%rbx
	popq	%rbp
	ret

.L__enc_xor4:
	xor_block4();

	popq	%rbx
	popq	%rbp
	ret
ENDPROC(__blowfish_enc_blk_4way)
346
ENTRY(blowfish_dec_blk_4way)
	/*
	 * Decrypt four consecutive 8-byte blocks (32 bytes); dst is
	 * always overwritten.
	 *
	 * input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * %rbx and %rbp are saved on the stack and restored.  Clobbers
	 * %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags.
	 * NOTE(review): while %rbp is scratch it is not a valid frame
	 * pointer -- confirm this is acceptable for frame-pointer based
	 * stack unwinding.
	 */
	pushq	%rbp
	pushq	%rbx
	preload_roundkey_dec(17);

	movq	%rsi, %r11		/* F4() clobbers %rsi: keep dst aside */
	movq	%rdx, RIO

	read_block4();

	/* each round_dec4() applies RKEY and preloads the next pair */
	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	movq	%r11, RIO		/* point RIO at dst for the store */
	write_block4();

	popq	%rbx
	popq	%rbp

	ret
ENDPROC(blowfish_dec_blk_4way)
380