/*
 * x86_64/AVX2/AES-NI assembler implementation of Camellia
 *
 * 32-way parallel ECB/CBC/CTR/XTS helpers for the x86_64 Camellia glue code.
 */
#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi
#define RIO %r8

/* helper macros */
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

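/*
 * ymmN_x names the low 128-bit (xmm) half of the corresponding ymm register;
 * roundsm32 reaches it through ##_x token pasting for the vaesenclast,
 * vextracti128 and vinserti128 steps that operate on one lane at a time.
 */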
#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

/*
 * roundsm32: one Camellia round (S-function via AES-NI plus P-function)
 * on 32 blocks.
 *
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer to the byte-sliced CD state in memory
 *   key:    address of the 64-bit round subkey
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vbroadcasti128 .Linv_shift_row, t4; \
	vpbroadcastd .L0f0f0f0f, t7; \
	vbroadcasti128 .Lpre_tf_lo_s1, t5; \
	vbroadcasti128 .Lpre_tf_hi_s1, t6; \
	vbroadcasti128 .Lpre_tf_lo_s4, t2; \
	vbroadcasti128 .Lpre_tf_hi_s4, t3; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	/* prefilter sbox 4 */ \
	filter_8bit(x0, t5, t6, t7, t4); \
	filter_8bit(x7, t5, t6, t7, t4); \
	vextracti128 $1, x0, t0##_x; \
	vextracti128 $1, x7, t1##_x; \
	filter_8bit(x3, t2, t3, t7, t4); \
	filter_8bit(x6, t2, t3, t7, t4); \
	vextracti128 $1, x3, t3##_x; \
	vextracti128 $1, x6, t2##_x; \
	filter_8bit(x2, t5, t6, t7, t4); \
	filter_8bit(x5, t5, t6, t7, t4); \
	filter_8bit(x1, t5, t6, t7, t4); \
	filter_8bit(x4, t5, t6, t7, t4); \
	\
	vpxor t4##_x, t4##_x, t4##_x; \
	\
	/* AES subbytes + AES shift rows */ \
	vextracti128 $1, x2, t6##_x; \
	vextracti128 $1, x5, t5##_x; \
	vaesenclast t4##_x, x0##_x, x0##_x; \
	vaesenclast t4##_x, t0##_x, t0##_x; \
	vinserti128 $1, t0##_x, x0, x0; \
	vaesenclast t4##_x, x7##_x, x7##_x; \
	vaesenclast t4##_x, t1##_x, t1##_x; \
	vinserti128 $1, t1##_x, x7, x7; \
	vaesenclast t4##_x, x3##_x, x3##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x3, x3; \
	vaesenclast t4##_x, x6##_x, x6##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x6, x6; \
	vextracti128 $1, x1, t3##_x; \
	vextracti128 $1, x4, t2##_x; \
	vbroadcasti128 .Lpost_tf_lo_s1, t0; \
	vbroadcasti128 .Lpost_tf_hi_s1, t1; \
	vaesenclast t4##_x, x2##_x, x2##_x; \
	vaesenclast t4##_x, t6##_x, t6##_x; \
	vinserti128 $1, t6##_x, x2, x2; \
	vaesenclast t4##_x, x5##_x, x5##_x; \
	vaesenclast t4##_x, t5##_x, t5##_x; \
	vinserti128 $1, t5##_x, x5, x5; \
	vaesenclast t4##_x, x1##_x, x1##_x; \
	vaesenclast t4##_x, t3##_x, t3##_x; \
	vinserti128 $1, t3##_x, x1, x1; \
	vaesenclast t4##_x, x4##_x, x4##_x; \
	vaesenclast t4##_x, t2##_x, t2##_x; \
	vinserti128 $1, t2##_x, x4, x4; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vbroadcasti128 .Lpost_tf_lo_s3, t2; \
	vbroadcasti128 .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vbroadcasti128 .Lpost_tf_lo_s2, t4; \
	vbroadcasti128 .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpbroadcastq key, t0; \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	vpxor t7, t7, t7; \
	\
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpshufb t7, t1, t1; \
	vpsrldq $3, t0, t3; \
	\
	/* P-function */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpshufb t7, t2, t2; \
	vpsrldq $4, t0, t4; \
	vpshufb t7, t3, t3; \
	vpsrldq $5, t0, t5; \
	vpshufb t7, t4, t4; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpsrldq $6, t0, t6; \
	vpshufb t7, t5, t5; \
	vpshufb t7, t6, t6; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; \
	\
	/* Add key material and result to CD (x becomes new CD) */ \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 32(mem_cd), x1, x1; \
	\
	vpsrldq $7, t0, t6; \
	vpshufb t7, t0, t0; \
	vpshufb t7, t6, t7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 32(mem_cd), x0, x0; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 32(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 32(mem_cd), x3, x3; \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 32(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 32(mem_cd), x5, x5; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 32(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 32(mem_cd), x7, x7;
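/*
 * The two register-assignment variants of roundsm32 below are emitted once
 * as local functions and reached via call from two_roundsm32, which keeps
 * the unrolled round sequences considerably smaller.
 */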
.align 8
roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
	roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		  %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
		  %rcx, (%r9));
	ret;
ENDPROC(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
	roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3,
		  %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11,
		  %rax, (%r9));
	ret;
ENDPROC(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x0, 4 * 32(mem_cd); \
	vmovdqu x1, 5 * 32(mem_cd); \
	vmovdqu x2, 6 * 32(mem_cd); \
	vmovdqu x3, 7 * 32(mem_cd); \
	vmovdqu x4, 0 * 32(mem_cd); \
	vmovdqu x5, 1 * 32(mem_cd); \
	vmovdqu x6, 2 * 32(mem_cd); \
	vmovdqu x7, 3 * 32(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab);

#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
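
/*
 * Each enc_rounds32/dec_rounds32 invocation runs three two_roundsm32 pairs,
 * i.e. six Camellia rounds; dec_rounds32 walks the subkey table backwards.
 */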

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;

/*
 * fls32: Camellia FL/FL^-1 layer for 32 blocks.
 *
 * IN:
 *  l0..l7: byte-sliced left half in registers (also stored back to (l))
 *  r:      pointer to the byte-sliced right half in memory
 *  kll, klr, krl, krr: 32-bit words of the FL/FL^-1 subkeys
 */
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpbroadcastd kll, t0; \
	vpxor tt0, tt0, tt0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vpbroadcastd krr, t0; \
	vmovdqu l4, 4 * 32(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 32(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 32(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 32(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 32(r), t0, t0; \
	vpor 5 * 32(r), t1, t1; \
	vpor 6 * 32(r), t2, t2; \
	vpor 7 * 32(r), t3, t3; \
	\
	vpxor 0 * 32(r), t0, t0; \
	vpxor 1 * 32(r), t1, t1; \
	vpxor 2 * 32(r), t2, t2; \
	vpxor 3 * 32(r), t3, t3; \
	vmovdqu t0, 0 * 32(r); \
	vpbroadcastd krl, t0; \
	vmovdqu t1, 1 * 32(r); \
	vmovdqu t2, 2 * 32(r); \
	vmovdqu t3, 3 * 32(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 32(r), t0, t0; \
	vpand 1 * 32(r), t1, t1; \
	vpand 2 * 32(r), t2, t2; \
	vpand 3 * 32(r), t3, t3; \
	\
	rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 32(r), t0, t0; \
	vpxor 5 * 32(r), t1, t1; \
	vpxor 6 * 32(r), t2, t2; \
	vpxor 7 * 32(r), t3, t3; \
	vmovdqu t0, 4 * 32(r); \
	vpbroadcastd klr, t0; \
	vmovdqu t1, 5 * 32(r); \
	vmovdqu t2, 6 * 32(r); \
	vmovdqu t3, 7 * 32(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 32(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 32(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 32(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 32(l);

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;
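
/*
 * byteslice_16x16b_fast transposes a 16x16 byte matrix held in sixteen ymm
 * registers (independently in each 128-bit lane), spilling registers to the
 * in-memory temporaries st0/st1 to make room for the shuffle constant.
 */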
#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
			      a3, b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vbroadcasti128 .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 32(rio), x0, y7; \
	vpxor 1 * 32(rio), x0, y6; \
	vpxor 2 * 32(rio), x0, y5; \
	vpxor 3 * 32(rio), x0, y4; \
	vpxor 4 * 32(rio), x0, y3; \
	vpxor 5 * 32(rio), x0, y2; \
	vpxor 6 * 32(rio), x0, y1; \
	vpxor 7 * 32(rio), x0, y0; \
	vpxor 8 * 32(rio), x0, x7; \
	vpxor 9 * 32(rio), x0, x6; \
	vpxor 10 * 32(rio), x0, x5; \
	vpxor 11 * 32(rio), x0, x4; \
	vpxor 12 * 32(rio), x0, x3; \
	vpxor 13 * 32(rio), x0, x2; \
	vpxor 14 * 32(rio), x0, x1; \
	vpxor 15 * 32(rio), x0, x0;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 32(mem_ab); \
	vmovdqu x1, 1 * 32(mem_ab); \
	vmovdqu x2, 2 * 32(mem_ab); \
	vmovdqu x3, 3 * 32(mem_ab); \
	vmovdqu x4, 4 * 32(mem_ab); \
	vmovdqu x5, 5 * 32(mem_ab); \
	vmovdqu x6, 6 * 32(mem_ab); \
	vmovdqu x7, 7 * 32(mem_ab); \
	vmovdqu y0, 0 * 32(mem_cd); \
	vmovdqu y1, 1 * 32(mem_cd); \
	vmovdqu y2, 2 * 32(mem_cd); \
	vmovdqu y3, 3 * 32(mem_cd); \
	vmovdqu y4, 4 * 32(mem_cd); \
	vmovdqu y5, 5 * 32(mem_cd); \
	vmovdqu y6, 6 * 32(mem_cd); \
	vmovdqu y7, 7 * 32(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vpbroadcastq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 32(rio); \
	vmovdqu x1, 1 * 32(rio); \
	vmovdqu x2, 2 * 32(rio); \
	vmovdqu x3, 3 * 32(rio); \
	vmovdqu x4, 4 * 32(rio); \
	vmovdqu x5, 5 * 32(rio); \
	vmovdqu x6, 6 * 32(rio); \
	vmovdqu x7, 7 * 32(rio); \
	vmovdqu y0, 8 * 32(rio); \
	vmovdqu y1, 9 * 32(rio); \
	vmovdqu y2, 10 * 32(rio); \
	vmovdqu y3, 11 * 32(rio); \
	vmovdqu y4, 12 * 32(rio); \
	vmovdqu y5, 13 * 32(rio); \
	vmovdqu y6, 14 * 32(rio); \
	vmovdqu y7, 15 * 32(rio);
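
/*
 * Note on outunpack32 above: x0 is parked in stack_tmp0 while its register
 * is reused to broadcast the post-whitening key; the final vpxor both
 * restores and whitens that block.
 */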

.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section .rodata.cst32.pack_bswap, "aM", @progbits, 32
.align 32
.Lpack_bswap:
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080
	.long 0x00010203, 0x04050607, 0x80808080, 0x80808080

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section .rodata.cst16, "aM", @progbits, 16
.align 16

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode */
.Lxts_gf128mul_and_shl1_mask_0:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

/*
 * pre-SubByte transform: maps the Camellia s-box input into the AES field
 * representation so that the s-box core can be computed with vaesenclast.
 *
 * pre-lookup for sbox1, sbox2 and sbox3
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/* pre-lookup for sbox4 (the <<< 1 input rotation of s4 is folded in) */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform: maps the AES SubBytes output back to the Camellia
 * representation and applies the output affine part of the Camellia s-box.
 *
 * post-lookup for sbox1 and sbox4
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/* post-lookup for sbox2 (the <<< 1 output rotation of s2 is folded in) */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/* post-lookup for sbox3 (the >>> 1 output rotation of s3 is folded in) */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4

/* 4-bit mask */
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

.align 8
__camellia_enc_blk32:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%ymm0..%ymm15: 32 plaintext blocks
	 * output:
	 *	%ymm0..%ymm15: 32 encrypted blocks (output order swapped; see
	 *	the write_output calls in the callers)
	 */
	FRAME_BEGIN

	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));

	FRAME_END
	ret;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	jmp .Lenc_done;
ENDPROC(__camellia_enc_blk32)

.align 8
__camellia_dec_blk32:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 512 bytes
	 *	%r8d: 24 for a 16-byte key, 32 for a larger key
	 *	%ymm0..%ymm15: 32 encrypted blocks
	 * output:
	 *	%ymm0..%ymm15: 32 plaintext blocks (output order swapped, as in
	 *	__camellia_enc_blk32)
	 */
	FRAME_BEGIN

	leaq 8 * 32(%rax), %rcx;

	inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		      %ymm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 16);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 8);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 32(%rcx), %ymm8;
	vmovdqu 1 * 32(%rcx), %ymm9;
	vmovdqu 2 * 32(%rcx), %ymm10;
	vmovdqu 3 * 32(%rcx), %ymm11;
	vmovdqu 4 * 32(%rcx), %ymm12;
	vmovdqu 5 * 32(%rcx), %ymm13;
	vmovdqu 6 * 32(%rcx), %ymm14;
	vmovdqu 7 * 32(%rcx), %ymm15;

	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));

	FRAME_END
	ret;

.align 8
.Ldec_max32:
	dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rax, %rcx, 24);

	fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
	      %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
	      %ymm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
ENDPROC(__camellia_dec_blk32)

ENTRY(camellia_ecb_enc_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_enc_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(camellia_ecb_enc_32way)

ENTRY(camellia_ecb_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d;

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_dec_blk32;

	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(camellia_ecb_dec_32way)

ENTRY(camellia_cbc_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 */
	FRAME_BEGIN

	vzeroupper;

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d;

	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
		     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
		     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

	movq %rsp, %r10;
	cmpq %rsi, %rdx;
	je .Lcbc_dec_use_stack;

	/* dst can be used as temporary storage, src is not overwritten. */
	movq %rsi, %rax;
	jmp .Lcbc_dec_continue;

.Lcbc_dec_use_stack:
	/*
	 * dst still in-use (because dst == src), so use stack for temporary
	 * storage.
	 */
	subq $(16 * 32), %rsp;
	movq %rsp, %rax;

.Lcbc_dec_continue:
	call __camellia_dec_blk32;

	vmovdqu %ymm7, (%rax);
	vpxor %ymm7, %ymm7, %ymm7;
	vinserti128 $1, (%rdx), %ymm7, %ymm7;
	vpxor (%rax), %ymm7, %ymm7;
	movq %r10, %rsp;
	vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
	vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
	vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
	vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
	vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
	vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
	vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
	vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
	vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
	vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
	vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
	vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
	vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
	vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
	vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(camellia_cbc_dec_32way)
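
/*
 * inc_le128/add2_le128: add 1 (resp. 2) to a 128-bit little-endian counter
 * held in a vector register. The vpcmpeqq compares detect an imminent
 * low-qword wrap and the shifted mask propagates the carry into the high
 * qword.
 */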
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
	vpcmpeqq minus_one, x, tmp1; \
	vpcmpeqq minus_two, x, tmp2; \
	vpsubq minus_two, x, x; \
	vpor tmp2, tmp1, tmp1; \
	vpslldq $8, tmp1, tmp1; \
	vpsubq tmp1, x, x;

ENTRY(camellia_ctr_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	vzeroupper;

	movq %rsp, %r10;
	cmpq %rsi, %rdx;
	je .Lctr_use_stack;

	/* dst can be used as temporary storage, src is not overwritten. */
	movq %rsi, %rax;
	jmp .Lctr_continue;

.Lctr_use_stack:
	subq $(16 * 32), %rsp;
	movq %rsp, %rax;

.Lctr_continue:
	vpcmpeqd %ymm15, %ymm15, %ymm15;
	vpsrldq $8, %ymm15, %ymm15;
	vpaddq %ymm15, %ymm15, %ymm12;

	/* load IV and byteswap */
	vmovdqu (%rcx), %xmm0;
	vmovdqa %xmm0, %xmm1;
	inc_le128(%xmm0, %xmm15, %xmm14);
	vbroadcasti128 .Lbswap128_mask, %ymm14;
	vinserti128 $1, %xmm0, %ymm1, %ymm0;
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 15 * 32(%rax);

	/* construct IVs */
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 14 * 32(%rax);
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 13 * 32(%rax);
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 12 * 32(%rax);
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm13;
	vmovdqu %ymm13, 11 * 32(%rax);
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm10;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm9;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm8;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm7;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm6;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm5;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm4;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm3;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm2;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vpshufb %ymm14, %ymm0, %ymm1;
	add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
	vextracti128 $1, %ymm0, %xmm13;
	vpshufb %ymm14, %ymm0, %ymm0;
	inc_le128(%xmm13, %xmm15, %xmm14);
	vmovdqu %xmm13, (%rcx);

	/* inpack32_pre: */
	vpbroadcastq (key_table)(CTX), %ymm15;
	vpshufb .Lpack_bswap, %ymm15, %ymm15;
	vpxor %ymm0, %ymm15, %ymm0;
	vpxor %ymm1, %ymm15, %ymm1;
	vpxor %ymm2, %ymm15, %ymm2;
	vpxor %ymm3, %ymm15, %ymm3;
	vpxor %ymm4, %ymm15, %ymm4;
	vpxor %ymm5, %ymm15, %ymm5;
	vpxor %ymm6, %ymm15, %ymm6;
	vpxor %ymm7, %ymm15, %ymm7;
	vpxor %ymm8, %ymm15, %ymm8;
	vpxor %ymm9, %ymm15, %ymm9;
	vpxor %ymm10, %ymm15, %ymm10;
	vpxor 11 * 32(%rax), %ymm15, %ymm11;
	vpxor 12 * 32(%rax), %ymm15, %ymm12;
	vpxor 13 * 32(%rax), %ymm15, %ymm13;
	vpxor 14 * 32(%rax), %ymm15, %ymm14;
	vpxor 15 * 32(%rax), %ymm15, %ymm15;

	call __camellia_enc_blk32;

	movq %r10, %rsp;

	vpxor 0 * 32(%rdx), %ymm7, %ymm7;
	vpxor 1 * 32(%rdx), %ymm6, %ymm6;
	vpxor 2 * 32(%rdx), %ymm5, %ymm5;
	vpxor 3 * 32(%rdx), %ymm4, %ymm4;
	vpxor 4 * 32(%rdx), %ymm3, %ymm3;
	vpxor 5 * 32(%rdx), %ymm2, %ymm2;
	vpxor 6 * 32(%rdx), %ymm1, %ymm1;
	vpxor 7 * 32(%rdx), %ymm0, %ymm0;
	vpxor 8 * 32(%rdx), %ymm15, %ymm15;
	vpxor 9 * 32(%rdx), %ymm14, %ymm14;
	vpxor 10 * 32(%rdx), %ymm13, %ymm13;
	vpxor 11 * 32(%rdx), %ymm12, %ymm12;
	vpxor 12 * 32(%rdx), %ymm11, %ymm11;
	vpxor 13 * 32(%rdx), %ymm10, %ymm10;
	vpxor 14 * 32(%rdx), %ymm9, %ymm9;
	vpxor 15 * 32(%rdx), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(camellia_ctr_32way)
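
/*
 * gf128mul_x_ble()/gf128mul_x2_ble() multiply the XTS tweak by x (resp. x^2)
 * in GF(2^128) using the little-endian block convention; the mask constants
 * carry the 0x87 reduction term for the lanes that overflow.
 */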
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
	vpsrad $31, iv, tmp0; \
	vpaddq iv, iv, tmp1; \
	vpsllq $2, iv, iv; \
	vpshufd $0x13, tmp0, tmp0; \
	vpsrad $31, tmp1, tmp1; \
	vpand mask2, tmp0, tmp0; \
	vpshufd $0x13, tmp1, tmp1; \
	vpxor tmp0, iv, iv; \
	vpand mask1, tmp1, tmp1; \
	vpxor tmp1, iv, iv;

.align 8
camellia_xts_crypt_32way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *	%rcx: iv (XTS tweak for the first block)
	 *	%r8: index of the input whitening key
	 *	%r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
	 */
	FRAME_BEGIN

	vzeroupper;

	subq $(16 * 32), %rsp;
	movq %rsp, %rax;

	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;

	/* load IV and construct the second block's tweak */
	vmovdqu (%rcx), %xmm0;
	vmovdqa %xmm0, %xmm15;
	gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
	vinserti128 $1, %xmm0, %ymm15, %ymm0;
	vpxor 0 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 15 * 32(%rax);
	vmovdqu %ymm0, 0 * 32(%rsi);

	/* construct IVs */
	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 1 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 14 * 32(%rax);
	vmovdqu %ymm0, 1 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 2 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 13 * 32(%rax);
	vmovdqu %ymm0, 2 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 3 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 12 * 32(%rax);
	vmovdqu %ymm0, 3 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 4 * 32(%rdx), %ymm0, %ymm11;
	vmovdqu %ymm0, 4 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 5 * 32(%rdx), %ymm0, %ymm10;
	vmovdqu %ymm0, 5 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 6 * 32(%rdx), %ymm0, %ymm9;
	vmovdqu %ymm0, 6 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 7 * 32(%rdx), %ymm0, %ymm8;
	vmovdqu %ymm0, 7 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 8 * 32(%rdx), %ymm0, %ymm7;
	vmovdqu %ymm0, 8 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 9 * 32(%rdx), %ymm0, %ymm6;
	vmovdqu %ymm0, 9 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 10 * 32(%rdx), %ymm0, %ymm5;
	vmovdqu %ymm0, 10 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 11 * 32(%rdx), %ymm0, %ymm4;
	vmovdqu %ymm0, 11 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 12 * 32(%rdx), %ymm0, %ymm3;
	vmovdqu %ymm0, 12 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 13 * 32(%rdx), %ymm0, %ymm2;
	vmovdqu %ymm0, 13 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 14 * 32(%rdx), %ymm0, %ymm1;
	vmovdqu %ymm0, 14 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 15 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 0 * 32(%rax);
	vmovdqu %ymm0, 15 * 32(%rsi);

	vextracti128 $1, %ymm0, %xmm0;
	gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
	vmovdqu %xmm0, (%rcx);

	/* inpack32_pre: */
	vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
	vpshufb .Lpack_bswap, %ymm15, %ymm15;
	vpxor 0 * 32(%rax), %ymm15, %ymm0;
	vpxor %ymm1, %ymm15, %ymm1;
	vpxor %ymm2, %ymm15, %ymm2;
	vpxor %ymm3, %ymm15, %ymm3;
	vpxor %ymm4, %ymm15, %ymm4;
	vpxor %ymm5, %ymm15, %ymm5;
	vpxor %ymm6, %ymm15, %ymm6;
	vpxor %ymm7, %ymm15, %ymm7;
	vpxor %ymm8, %ymm15, %ymm8;
	vpxor %ymm9, %ymm15, %ymm9;
	vpxor %ymm10, %ymm15, %ymm10;
	vpxor %ymm11, %ymm15, %ymm11;
	vpxor 12 * 32(%rax), %ymm15, %ymm12;
	vpxor 13 * 32(%rax), %ymm15, %ymm13;
	vpxor 14 * 32(%rax), %ymm15, %ymm14;
	vpxor 15 * 32(%rax), %ymm15, %ymm15;

	CALL_NOSPEC %r9;

	addq $(16 * 32), %rsp;

	vpxor 0 * 32(%rsi), %ymm7, %ymm7;
	vpxor 1 * 32(%rsi), %ymm6, %ymm6;
	vpxor 2 * 32(%rsi), %ymm5, %ymm5;
	vpxor 3 * 32(%rsi), %ymm4, %ymm4;
	vpxor 4 * 32(%rsi), %ymm3, %ymm3;
	vpxor 5 * 32(%rsi), %ymm2, %ymm2;
	vpxor 6 * 32(%rsi), %ymm1, %ymm1;
	vpxor 7 * 32(%rsi), %ymm0, %ymm0;
	vpxor 8 * 32(%rsi), %ymm15, %ymm15;
	vpxor 9 * 32(%rsi), %ymm14, %ymm14;
	vpxor 10 * 32(%rsi), %ymm13, %ymm13;
	vpxor 11 * 32(%rsi), %ymm12, %ymm12;
	vpxor 12 * 32(%rsi), %ymm11, %ymm11;
	vpxor 13 * 32(%rsi), %ymm10, %ymm10;
	vpxor 14 * 32(%rsi), %ymm9, %ymm9;
	vpxor 15 * 32(%rsi), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	ret;
ENDPROC(camellia_xts_crypt_32way)

ENTRY(camellia_xts_enc_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *	%rcx: iv (XTS tweak for the first block)
	 */
	xorl %r8d, %r8d; /* input whitening key, 0 for enc */

	leaq __camellia_enc_blk32, %r9;

	jmp camellia_xts_crypt_32way;
ENDPROC(camellia_xts_enc_32way)

ENTRY(camellia_xts_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *	%rcx: iv (XTS tweak for the first block)
	 */
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* input whitening key, last for dec */

	leaq __camellia_dec_blk32, %r9;

	jmp camellia_xts_crypt_32way;
ENDPROC(camellia_xts_dec_32way)