/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 */
#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/* structure of crypto context */
#define km 0
#define kr (12*4*4)

/* s-boxes */
#define s1 cast_s1
#define s2 cast_s2
#define s3 cast_s3
#define s4 cast_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
#define CTX %r15

#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX %xmm8

#define RKM  %xmm9
#define RKR  %xmm10
#define RKRF %xmm11
#define RKRR %xmm12
#define R32  %xmm13
#define R1ST %xmm14

#define RTMP %xmm15

#define RID1  %rdi
#define RID1d %edi
#define RID2  %rsi
#define RID2d %esi

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d

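/*
 * lookup_32bit() does the four CAST s-box lookups for one 32-bit word held in
 * a general-purpose register: the word's bytes index s1..s4 and the results
 * are combined into dst with op1/op2/op3 (add/sub/xor, chosen by round type).
 * interleave_op(il_reg) lets the caller schedule an extra operation (e.g. a
 * further shift of the source register) between the two lookup pairs.
 */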
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl src ## bh, RID1d; \
	movzbl src ## bl, RID2d; \
	shrq $16, src; \
	movl s1(, RID1, 4), dst ## d; \
	op1 s2(, RID2, 4), dst ## d; \
	movzbl src ## bh, RID1d; \
	movzbl src ## bl, RID2d; \
	interleave_op(il_reg); \
	op2 s3(, RID1, 4), dst ## d; \
	op3 s4(, RID2, 4), dst ## d;

#define dummy(d)

#define shr_next(reg) \
	shrq $16, reg;

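/*
 * F_head(): first half of the CAST6 round function on four blocks held in an
 * XMM register: combine the data with the broadcast masking key RKM (op0 is
 * add, xor or sub), rotate left by the current rotation key (RKRF/RKRR hold
 * the left and complementary right shift counts), then move the two 64-bit
 * halves into general-purpose registers gi1/gi2 for the s-box lookups.
 */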
#define F_head(a, x, gi1, gi2, op0) \
	op0 a, RKM, x; \
	vpslld RKRF, x, RTMP; \
	vpsrld RKRR, x, x; \
	vpor RTMP, x, x; \
	\
	vmovq x, gi1; \
	vpextrq $1, x, gi2;

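/*
 * F_tail(): second half of the round function: run lookup_32bit() on each of
 * the four 32-bit words sitting in gi1/gi2 and reassemble the four results
 * into the XMM register x.
 */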
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
	shlq $32, RFS2; \
	orq RFS1, RFS2; \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
	shlq $32, RFS1; \
	orq RFS1, RFS3; \
	\
	vmovq RFS2, x; \
	vpinsrq $1, RFS3, x, x;

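/*
 * F_2(): apply the round function to two groups of four blocks at once
 * (b1 -> a1 and b2 -> a2), xor'ing the results into the destination words.
 */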
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0); \
	F_head(b2, RX, RGI3, RGI4, op0); \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
	\
	vpxor a1, RX, a1; \
	vpxor a2, RTMP, a2;

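/*
 * The three CAST6 round function types differ only in which operation keys
 * the data (add/xor/sub with Km) and in how the s-box outputs are combined.
 */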
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

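/*
 * get_round_keys(): broadcast the 32-bit masking key km[nn] into RKM and
 * derive the shift counts for the rotation key: RKRF = low 5 bits of the
 * next byte of RKR (left-shift count), RKRR = 32 - RKRF (right-shift count).
 * RKR is then advanced by one byte to the following rotation key.
 */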
#define get_round_keys(nn) \
	vbroadcastss (km+(4*(nn)))(CTX), RKM; \
	vpand R1ST, RKR, RKRF; \
	vpsubq RKRF, R32, RKRR; \
	vpsrldq $1, RKR, RKR;

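/*
 * Q(): one forward quad-round of CAST6: four Feistel rounds that update
 * C, B, A and D in turn, using round function types 1, 2, 3, 1 and round
 * keys 4*n+0 .. 4*n+3.
 */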
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1); \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	\
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

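/*
 * QBAR(): the inverse quad-round: the same four rounds as Q() applied in
 * reverse order, as used in the second half of encryption and in decryption.
 */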
#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1); \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	\
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);

#define shuffle(mask) \
	vpshufb mask, RKR, RKR;

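/*
 * preload_rkr(): load the 16 one-byte rotation keys for the next four
 * quad-rounds into RKR, optionally reordering them with a byte shuffle so
 * that Q()/QBAR() consume them in the order required for encryption or
 * decryption.
 */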
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss .L16_mask, RKR; \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor (kr+n*16)(CTX), RKR, RKR; \
	do_mask(mask);

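/*
 * transpose_4x4(): transpose a 4x4 matrix of 32-bit words spread over four
 * XMM registers, converting between "one whole block per register" and
 * "one 32-bit word of each block per register" (the form the round macros
 * expect).
 */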
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

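/*
 * inpack_blocks(): byte-swap each 32-bit word (CAST6 words are big-endian)
 * and transpose four blocks into word-sliced form on input;
 * outunpack_blocks() performs the inverse on output.
 */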
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3;

.section .rodata.cst16, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.section .rodata.cst4.L16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16

.section .rodata.cst4.L32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0

.section .rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

.align 8
__cast6_enc_blk8:
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask, RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;
ENDPROC(__cast6_enc_blk8)

.align 8
__cast6_dec_blk8:
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */

	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask, RKM;
	vmovd .Lfirst_mask, R1ST;
	vmovd .L32_mask, R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;
	popq %r15;

	vmovdqa .Lbswap_mask, RKM;
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	ret;
ENDPROC(__cast6_dec_blk8)

ENTRY(cast6_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	ret;
ENDPROC(cast6_ecb_enc_8way)

ENTRY(cast6_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	ret;
ENDPROC(cast6_ecb_dec_8way)

ENTRY(cast6_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	ret;
ENDPROC(cast6_cbc_dec_8way)

ENTRY(cast6_ctr_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RX, RKR, RKM);

	call __cast6_enc_blk8;

	store_ctr_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	ret;
ENDPROC(cast6_ctr_8way)

ENTRY(cast6_xts_enc_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (XTS tweak for this sector)
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_enc_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	ret;
ENDPROC(cast6_xts_enc_8way)

ENTRY(cast6_xts_dec_8way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (XTS tweak for this sector)
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	movq %rsi, %r11;

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RX, RKR, RKM, .Lxts_gf128mul_and_shl1_mask);

	call __cast6_dec_blk8;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	ret;
ENDPROC(cast6_xts_dec_8way)