1
2
3
4
5
6
7
8 #include <linux/linkage.h>
9
10 .file "blowfish-x86_64-asm.S"
11 .text
12
13
/*
 * Byte offsets into the cipher context addressed through CTX.
 *
 * The round-key array starts at offset 0 and occupies (16 + 2) 32-bit
 * words.  It is followed by four consecutive lookup tables, s0..s3, of
 * 256 32-bit words each.
 */
#define p	0
#define s0	(4 * (16 + 2))
#define s1	(s0 + 1 * (4 * 256))
#define s2	(s0 + 2 * (4 * 256))
#define s3	(s0 + 3 * (4 * 256))
19
20
/*
 * Register aliases.
 *
 * CTX  - base of the cipher context for all table and round-key accesses
 * RIO  - pointer used by the block load/store macros; this is the same
 *        register as RT1, so it does not survive F()/F4()
 * RKEY - preloaded round-key pair for the 4-way code
 */
#define CTX	%r12
#define RIO	%rsi
#define RKEY	%r10

/*
 * Block registers: 64-bit name, 32-bit view (d), low byte (bl) and
 * second byte (bh).
 */
#define RX0	%rax
#define RX0d	%eax
#define RX0bl	%al
#define RX0bh	%ah

#define RX1	%rbx
#define RX1d	%ebx
#define RX1bl	%bl
#define RX1bh	%bh

#define RX2	%rcx
#define RX2d	%ecx
#define RX2bl	%cl
#define RX2bh	%ch

#define RX3	%rdx
#define RX3d	%edx
#define RX3bl	%dl
#define RX3bh	%dh

/* Temporaries: 64-bit name and 32-bit view (d). */
#define RT0	%rdi
#define RT0d	%edi

#define RT1	%rsi
#define RT1d	%esi

#define RT2	%r8
#define RT2d	%r8d

#define RT3	%r9
#define RT3d	%r9d
55
56
57
58
/*
 * F(): one round-function step on RX0 (1-way code).
 *
 * The low 32 bits of RX0 are split into four bytes: the most significant
 * byte indexes s0, the next one s1, then s2, and the least significant
 * byte indexes s3.  The 32-bit value ((s0[a] + s1[b]) ^ s2[c]) + s3[d]
 * is accumulated in RT0d.
 *
 * The three rotates add up to a rotation by 32 bits, so the two halves
 * of RX0 are exchanged; the zero-extended result is then XORed into the
 * new low half.
 *
 * Clobbers RT0, RT1 (the same register as RIO) and RT2.
 */
#define F() \
rorq $16, RX0; \
movzbl RX0bh, RT0d; \
movzbl RX0bl, RT1d; \
rolq $16, RX0; \
movl s0(CTX,RT0,4), RT0d; \
addl s1(CTX,RT1,4), RT0d; \
movzbl RX0bh, RT1d; \
movzbl RX0bl, RT2d; \
rolq $32, RX0; \
xorl s2(CTX,RT1,4), RT0d; \
addl s3(CTX,RT2,4), RT0d; \
xorq RT0, RX0;
72
/*
 * XOR the 64-bit value at round-key offset 4*n into RX0.  One 64-bit
 * access covers the two adjacent 32-bit entries n and n+1.
 */
#define add_roundkey_enc(n) \
	xorq	p+4*(n)(CTX), RX0;

/* Key addition for entries n/n+1, then two applications of F(). */
#define round_enc(n) \
	add_roundkey_enc(n); \
	F(); F();
81
/*
 * Load the 64-bit value at round-key offset 4*(n-1), i.e. the adjacent
 * 32-bit entries n-1 and n, exchange its halves so that entry n sits in
 * the low 32 bits, and XOR it into RX0.
 *
 * The argument is parenthesised, matching preload_roundkey_dec().
 * Clobbers RT0.
 */
#define add_roundkey_dec(n) \
	movq	p+4*((n)-1)(CTX), RT0; \
	rorq	$32, RT0; \
	xorq	RT0, RX0;

/*
 * Key addition for entries n/n-1, then two applications of F().
 *
 * The definition ends on the last F() line; there is deliberately no
 * trailing line continuation, so the line that follows can never be
 * spliced into this macro.
 */
#define round_dec(n) \
	add_roundkey_dec(n); \
	F(); \
	F();
92
/*
 * Load the 8 bytes at (RIO) into RX0.  After the half swap and the byte
 * reversal, the first four bytes, read as a big-endian word, sit in the
 * low half of RX0 and the following four in the high half.
 */
#define read_block() \
	movq	(RIO), RX0; \
	rorq	$32, RX0; \
	bswapq	RX0;

/* Byte-reverse RX0 and store all 8 bytes to (RIO). */
#define write_block() \
	bswapq	RX0; \
	movq	RX0, (RIO);

/* Byte-reverse RX0 and XOR all 8 bytes into the memory at (RIO). */
#define xor_block() \
	bswapq	RX0; \
	xorq	RX0, (RIO);
105
ENTRY(__blowfish_enc_blk)
	/*
	 * input:
	 *	%rdi: cipher context base (copied to CTX)
	 *	%rsi: destination block
	 *	%rdx: source block
	 *	%cl:  non-zero => XOR the result into the destination
	 *	      instead of storing it
	 *
	 * %rdi and %rsi serve as temporaries inside F(), so the context
	 * base lives in CTX and the destination pointer is parked in %r10.
	 * The caller's %r12 is kept in %r11 and restored before returning.
	 */
	movq	%rsi, %r10;
	movq	%rdx, RIO;
	movq	%r12, %r11;
	movq	%rdi, CTX;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	/* Rounds are done: recover the destination and the caller's %r12. */
	movq	%r10, RIO;
	movq	%r11, %r12;

	testb	%cl, %cl;
	jnz	.L__enc_xor;

	write_block();
	ret;

.L__enc_xor:
	xor_block();
	ret;
ENDPROC(__blowfish_enc_blk)
143
ENTRY(blowfish_dec_blk)
	/*
	 * input:
	 *	%rdi: cipher context base (copied to CTX)
	 *	%rsi: destination block
	 *	%rdx: source block
	 *
	 * %rdi and %rsi serve as temporaries inside F(), so the context
	 * base lives in CTX and the destination pointer is parked in %r10.
	 * The caller's %r12 is kept in %r11 and restored before returning.
	 */
	movq	%rsi, %r10;
	movq	%rdx, RIO;
	movq	%r12, %r11;
	movq	%rdi, CTX;

	read_block();

	/* Round keys are consumed from the top of the array downwards. */
	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq	%r11, %r12;
	movq	%r10, RIO;

	write_block();
	ret;
ENDPROC(blowfish_dec_blk)
175
176
177
178
179
180
181
182
/*
 * F4(x): round-function step on block register x (4-way code).
 *
 * x must be one of RX0..RX3; the byte views are formed by token pasting
 * (x ## bh, x ## bl).  The low 32 bits of x are split into four bytes:
 * the most significant byte indexes s0, the next one s1, then s2, and
 * the least significant byte indexes s3.  The 32-bit value
 * ((s0[a] + s1[b]) ^ s2[c]) + s3[d] is accumulated in RT0d.
 *
 * The two 16-bit rotates leave x rotated by 32 bits, so its halves are
 * exchanged; the zero-extended result is then XORed into the new low
 * half.
 *
 * Clobbers RT0, RT1 (the same register as RIO), RT2 and RT3.
 */
#define F4(x) \
movzbl x ## bh, RT1d; \
movzbl x ## bl, RT3d; \
rorq $16, x; \
movzbl x ## bh, RT0d; \
movzbl x ## bl, RT2d; \
rorq $16, x; \
movl s0(CTX,RT0,4), RT0d; \
addl s1(CTX,RT2,4), RT0d; \
xorl s2(CTX,RT1,4), RT0d; \
addl s3(CTX,RT3,4), RT0d; \
xorq RT0, x;
195
/* XOR the round-key pair held in RKEY into all four block registers. */
#define add_preloaded_roundkey4() \
	xorq	RKEY, RX0; \
	xorq	RKEY, RX1; \
	xorq	RKEY, RX2; \
	xorq	RKEY, RX3;

/*
 * Apply F4() to each of the four block registers, then to each of them
 * once more.  Shared by round_enc4() and round_dec4().
 */
#define F4_twice_4way() \
	F4(RX0); F4(RX1); F4(RX2); F4(RX3); \
	F4(RX0); F4(RX1); F4(RX2); F4(RX3);

/* Load the 64-bit value at round-key offset 4*n (entries n and n+1). */
#define preload_roundkey_enc(n) \
	movq	p+4*(n)(CTX), RKEY;

/*
 * Consume the key pair already sitting in RKEY, then fetch the pair for
 * the next round (n + 2).  The pair for round n itself must have been
 * preloaded beforehand.
 */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc(n + 2);

/* One 4-way encryption double round. */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	F4_twice_4way();

/*
 * Load the 64-bit value at round-key offset 4*(n-1) and exchange its
 * halves, so that entry n ends up in the low 32 bits of RKEY.
 */
#define preload_roundkey_dec(n) \
	movq	p+4*((n)-1)(CTX), RKEY; \
	rorq	$32, RKEY;

/*
 * Consume the key pair already sitting in RKEY, then fetch the pair for
 * the next round (n - 2).  The pair for round n itself must have been
 * preloaded beforehand.
 */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec(n - 2);

/* One 4-way decryption double round. */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	F4_twice_4way();
242
/*
 * Per-block building blocks for the 4-way load/store macros.  `mem` is
 * a complete memory operand, `x` one of the block registers.
 */

/* Load 8 bytes, exchange the 32-bit halves, then byte-reverse. */
#define bf_load_block(mem, x) \
	movq	mem, x; \
	rorq	$32, x; \
	bswapq	x;

/* Byte-reverse x and store all 8 bytes. */
#define bf_store_block(x, mem) \
	bswapq	x; \
	movq	x, mem;

/* Byte-reverse x and XOR all 8 bytes into memory. */
#define bf_xor_block(x, mem) \
	bswapq	x; \
	xorq	x, mem;

/* Load four consecutive 8-byte blocks from (RIO) into RX0..RX3. */
#define read_block4() \
	bf_load_block((RIO), RX0) \
	bf_load_block(8(RIO), RX1) \
	bf_load_block(16(RIO), RX2) \
	bf_load_block(24(RIO), RX3)

/* Store RX0..RX3 as four consecutive 8-byte blocks at (RIO). */
#define write_block4() \
	bf_store_block(RX0, (RIO)) \
	bf_store_block(RX1, 8(RIO)) \
	bf_store_block(RX2, 16(RIO)) \
	bf_store_block(RX3, 24(RIO))

/* XOR RX0..RX3 into the four consecutive 8-byte blocks at (RIO). */
#define xor_block4() \
	bf_xor_block(RX0, (RIO)) \
	bf_xor_block(RX1, 8(RIO)) \
	bf_xor_block(RX2, 16(RIO)) \
	bf_xor_block(RX3, 24(RIO))
285
ENTRY(__blowfish_enc_blk_4way)
	/*
	 * input:
	 *	%rdi: cipher context base (copied to CTX)
	 *	%rsi: destination, four consecutive 8-byte blocks
	 *	%rdx: source, four consecutive 8-byte blocks
	 *	%cl:  non-zero => XOR the results into the destination
	 *	      instead of storing them
	 *
	 * %rbx and %r12 are saved on the stack.  %rcx doubles as RX2, so
	 * the flag is pushed as well and picked up again after the rounds.
	 * The destination pointer is parked in %r11 because %rsi (RIO) is
	 * also the temporary RT1.
	 */
	pushq	%r12;
	pushq	%rbx;
	pushq	%rcx;

	movq	%rsi, %r11;
	movq	%rdx, RIO;
	movq	%rdi, CTX;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	/* CTX is no longer needed: reuse %r12 for the saved xor flag. */
	movq	%r11, RIO;
	popq	%r12;

	testb	%r12b, %r12b;
	jnz	.L__enc_xor4;

	write_block4();

	popq	%rbx;
	popq	%r12;
	ret;

.L__enc_xor4:
	xor_block4();

	popq	%rbx;
	popq	%r12;
	ret;
ENDPROC(__blowfish_enc_blk_4way)
334
ENTRY(blowfish_dec_blk_4way)
	/*
	 * input:
	 *	%rdi: cipher context base (copied to CTX)
	 *	%rsi: destination, four consecutive 8-byte blocks
	 *	%rdx: source, four consecutive 8-byte blocks
	 *
	 * %rbx and %r12 are saved on the stack.  The destination pointer
	 * is parked in %r11 because %rsi (RIO) is also the temporary RT1.
	 */
	pushq	%r12;
	pushq	%rbx;

	movq	%rsi, %r11;
	movq	%rdx, RIO;
	movq	%rdi, CTX;

	preload_roundkey_dec(17);
	read_block4();

	/* Round keys are consumed from the top of the array downwards. */
	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	movq	%r11, RIO;
	write_block4();

	popq	%rbx;
	popq	%r12;

	ret;
ENDPROC(blowfish_dec_blk_4way)