/*
 * x86_64/AVX/AES-NI assembler implementation of Camellia
 *
 * 16-way parallel implementation: 16 blocks are processed at a time using
 * byte-sliced state in the XMM registers, with AES-NI (AESENCLAST) used to
 * compute the Camellia s-boxes.
 */
#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi

/**********************************************************************
  16-way camellia
 **********************************************************************/
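/*
 * filter_8bit: apply an 8-bit affine transform to every byte of x via two
 * 16-entry nibble lookup tables (lo_t for the low nibble, hi_t for the high
 * nibble); mask4bit must hold 0x0f in every byte, tmp0 is clobbered.
 */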
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer storing CD state
 *   key: index for key material
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
		  t7, mem_cd, key) \
	/* \
	 * S-function with AES subbytes \
	 */ \
	vmovdqa .Linv_shift_row, t4; \
	vbroadcastss .L0f0f0f0f, t7; \
	vmovdqa .Lpre_tf_lo_s1, t0; \
	vmovdqa .Lpre_tf_hi_s1, t1; \
	\
	/* AES inverse shift rows */ \
	vpshufb t4, x0, x0; \
	vpshufb t4, x7, x7; \
	vpshufb t4, x1, x1; \
	vpshufb t4, x4, x4; \
	vpshufb t4, x2, x2; \
	vpshufb t4, x5, x5; \
	vpshufb t4, x3, x3; \
	vpshufb t4, x6, x6; \
	\
	/* prefilter sboxes 1, 2 and 3 */ \
	vmovdqa .Lpre_tf_lo_s4, t2; \
	vmovdqa .Lpre_tf_hi_s4, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x1, t0, t1, t7, t6); \
	filter_8bit(x4, t0, t1, t7, t6); \
	filter_8bit(x2, t0, t1, t7, t6); \
	filter_8bit(x5, t0, t1, t7, t6); \
	\
	/* prefilter sbox 4 */ \
	vpxor t4, t4, t4; \
	filter_8bit(x3, t2, t3, t7, t6); \
	filter_8bit(x6, t2, t3, t7, t6); \
	\
	/* AES subbytes + AES shift rows */ \
	vmovdqa .Lpost_tf_lo_s1, t0; \
	vmovdqa .Lpost_tf_hi_s1, t1; \
	vaesenclast t4, x0, x0; \
	vaesenclast t4, x7, x7; \
	vaesenclast t4, x1, x1; \
	vaesenclast t4, x4, x4; \
	vaesenclast t4, x2, x2; \
	vaesenclast t4, x5, x5; \
	vaesenclast t4, x3, x3; \
	vaesenclast t4, x6, x6; \
	\
	/* postfilter sboxes 1 and 4 */ \
	vmovdqa .Lpost_tf_lo_s3, t2; \
	vmovdqa .Lpost_tf_hi_s3, t3; \
	filter_8bit(x0, t0, t1, t7, t6); \
	filter_8bit(x7, t0, t1, t7, t6); \
	filter_8bit(x3, t0, t1, t7, t6); \
	filter_8bit(x6, t0, t1, t7, t6); \
	\
	/* postfilter sbox 3 */ \
	vmovdqa .Lpost_tf_lo_s2, t4; \
	vmovdqa .Lpost_tf_hi_s2, t5; \
	filter_8bit(x2, t2, t3, t7, t6); \
	filter_8bit(x5, t2, t3, t7, t6); \
	\
	vpxor t6, t6, t6; \
	vmovq key, t0; \
	\
	/* postfilter sbox 2 */ \
	filter_8bit(x1, t4, t5, t7, t2); \
	filter_8bit(x4, t4, t5, t7, t2); \
	\
	/* broadcast the individual subkey bytes (t6 is zero) */ \
	vpsrldq $5, t0, t5; \
	vpsrldq $1, t0, t1; \
	vpsrldq $2, t0, t2; \
	vpsrldq $3, t0, t3; \
	vpsrldq $4, t0, t4; \
	vpshufb t6, t0, t0; \
	vpshufb t6, t1, t1; \
	vpshufb t6, t2, t2; \
	vpshufb t6, t3, t3; \
	vpshufb t6, t4, t4; \
	vpsrldq $2, t5, t7; \
	vpshufb t6, t7, t7; \
	\
	/* \
	 * P-function \
	 */ \
	vpxor x5, x0, x0; \
	vpxor x6, x1, x1; \
	vpxor x7, x2, x2; \
	vpxor x4, x3, x3; \
	\
	vpxor x2, x4, x4; \
	vpxor x3, x5, x5; \
	vpxor x0, x6, x6; \
	vpxor x1, x7, x7; \
	\
	vpxor x7, x0, x0; \
	vpxor x4, x1, x1; \
	vpxor x5, x2, x2; \
	vpxor x6, x3, x3; \
	\
	vpxor x3, x4, x4; \
	vpxor x0, x5, x5; \
	vpxor x1, x6, x6; \
	vpxor x2, x7, x7; \
	\
	/* \
	 * Add key material and result to CD (x becomes new CD) \
	 */ \
	\
	vpxor t3, x4, x4; \
	vpxor 0 * 16(mem_cd), x4, x4; \
	\
	vpxor t2, x5, x5; \
	vpxor 1 * 16(mem_cd), x5, x5; \
	\
	vpsrldq $1, t5, t3; \
	vpshufb t6, t5, t5; \
	vpshufb t6, t3, t6; \
	\
	vpxor t1, x6, x6; \
	vpxor 2 * 16(mem_cd), x6, x6; \
	\
	vpxor t0, x7, x7; \
	vpxor 3 * 16(mem_cd), x7, x7; \
	\
	vpxor t7, x0, x0; \
	vpxor 4 * 16(mem_cd), x0, x0; \
	\
	vpxor t6, x1, x1; \
	vpxor 5 * 16(mem_cd), x1, x1; \
	\
	vpxor t5, x2, x2; \
	vpxor 6 * 16(mem_cd), x2, x2; \
	\
	vpxor t4, x3, x3; \
	vpxor 7 * 16(mem_cd), x3, x3;

/*
 * Size optimization: roundsm16 is wrapped in the callable thunks below;
 * inlining it at every call site would make the binary several times larger
 * for only a marginal speed gain.
 */
.align 8
roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd:
	roundsm16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		  %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		  %rcx, (%r9));
	ret;
ENDPROC(roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd)

.align 8
roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab:
	roundsm16(%xmm4, %xmm5, %xmm6, %xmm7, %xmm0, %xmm1, %xmm2, %xmm3,
		  %xmm12, %xmm13, %xmm14, %xmm15, %xmm8, %xmm9, %xmm10, %xmm11,
		  %rax, (%r9));
	ret;
ENDPROC(roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab)

/*
 * IN/OUT:
 *  x0..x7: byte-sliced AB state preloaded
 *  mem_ab: byte-sliced AB state in memory
 *  mem_cd: byte-sliced CD state in memory
 */
#define two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, i, dir, store_ab) \
	leaq (key_table + (i) * 8)(CTX), %r9; \
	call roundsm16_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \
	\
	vmovdqu x4, 0 * 16(mem_cd); \
	vmovdqu x5, 1 * 16(mem_cd); \
	vmovdqu x6, 2 * 16(mem_cd); \
	vmovdqu x7, 3 * 16(mem_cd); \
	vmovdqu x0, 4 * 16(mem_cd); \
	vmovdqu x1, 5 * 16(mem_cd); \
	vmovdqu x2, 6 * 16(mem_cd); \
	vmovdqu x3, 7 * 16(mem_cd); \
	\
	leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \
	call roundsm16_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \
	\
	store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab);

#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */

#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
	/* Store new AB state */ \
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab);

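/*
 * enc_rounds16: six Camellia rounds (three two_roundsm16 pairs), using
 * subkeys (i)+2 .. (i)+7 in increasing order.
 */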
#define enc_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

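/*
 * dec_rounds16: six Camellia rounds in reverse order, using subkeys
 * (i)+7 .. (i)+2 in decreasing order.
 */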
#define dec_rounds16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, mem_ab, mem_cd, i) \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
	two_roundsm16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

/*
 * IN:
 *  v0..3: byte-sliced 32-bit integers
 * OUT:
 *  v0..3: (IN <<< 1)
 */
#define rol32_1_16(v0, v1, v2, v3, t0, t1, t2, zero) \
	vpcmpgtb v0, zero, t0; \
	vpaddb v0, v0, v0; \
	vpabsb t0, t0; \
	\
	vpcmpgtb v1, zero, t1; \
	vpaddb v1, v1, v1; \
	vpabsb t1, t1; \
	\
	vpcmpgtb v2, zero, t2; \
	vpaddb v2, v2, v2; \
	vpabsb t2, t2; \
	\
	vpor t0, v1, v1; \
	\
	vpcmpgtb v3, zero, t0; \
	vpaddb v3, v3, v3; \
	vpabsb t0, t0; \
	\
	vpor t1, v2, v2; \
	vpor t2, v3, v3; \
	vpor t0, v0, v0;

/*
 * IN:
 *   l, r: byte-sliced AB and CD states in memory
 *   kll, klr, krl, krr: key rotation
 * OUT:
 *   l, r: new byte-sliced AB and CD states
 */
#define fls16(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
	      tt1, tt2, tt3, kll, klr, krl, krr) \
	/* \
	 * t0 = kll; \
	 * t0 &= ll; \
	 * lr ^= rol32(t0, 1); \
	 */ \
	vpxor tt0, tt0, tt0; \
	vmovd kll, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand l0, t0, t0; \
	vpand l1, t1, t1; \
	vpand l2, t2, t2; \
	vpand l3, t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor l4, t0, l4; \
	vmovdqu l4, 4 * 16(l); \
	vpxor l5, t1, l5; \
	vmovdqu l5, 5 * 16(l); \
	vpxor l6, t2, l6; \
	vmovdqu l6, 6 * 16(l); \
	vpxor l7, t3, l7; \
	vmovdqu l7, 7 * 16(l); \
	\
	/* \
	 * t2 = krr; \
	 * t2 |= rr; \
	 * rl ^= t2; \
	 */ \
	\
	vmovd krr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor 4 * 16(r), t0, t0; \
	vpor 5 * 16(r), t1, t1; \
	vpor 6 * 16(r), t2, t2; \
	vpor 7 * 16(r), t3, t3; \
	\
	vpxor 0 * 16(r), t0, t0; \
	vpxor 1 * 16(r), t1, t1; \
	vpxor 2 * 16(r), t2, t2; \
	vpxor 3 * 16(r), t3, t3; \
	vmovdqu t0, 0 * 16(r); \
	vmovdqu t1, 1 * 16(r); \
	vmovdqu t2, 2 * 16(r); \
	vmovdqu t3, 3 * 16(r); \
	\
	/* \
	 * t2 = krl; \
	 * t2 &= rl; \
	 * rr ^= rol32(t2, 1); \
	 */ \
	vmovd krl, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpand 0 * 16(r), t0, t0; \
	vpand 1 * 16(r), t1, t1; \
	vpand 2 * 16(r), t2, t2; \
	vpand 3 * 16(r), t3, t3; \
	\
	rol32_1_16(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
	\
	vpxor 4 * 16(r), t0, t0; \
	vpxor 5 * 16(r), t1, t1; \
	vpxor 6 * 16(r), t2, t2; \
	vpxor 7 * 16(r), t3, t3; \
	vmovdqu t0, 4 * 16(r); \
	vmovdqu t1, 5 * 16(r); \
	vmovdqu t2, 6 * 16(r); \
	vmovdqu t3, 7 * 16(r); \
	\
	/* \
	 * t0 = klr; \
	 * t0 |= lr; \
	 * ll ^= t0; \
	 */ \
	\
	vmovd klr, t0; \
	vpshufb tt0, t0, t3; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t2; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t1; \
	vpsrldq $1, t0, t0; \
	vpshufb tt0, t0, t0; \
	\
	vpor l4, t0, t0; \
	vpor l5, t1, t1; \
	vpor l6, t2, t2; \
	vpor l7, t3, t3; \
	\
	vpxor l0, t0, l0; \
	vmovdqu l0, 0 * 16(l); \
	vpxor l1, t1, l1; \
	vmovdqu l1, 1 * 16(l); \
	vpxor l2, t2, l2; \
	vmovdqu l2, 2 * 16(l); \
	vpxor l3, t3, l3; \
	vmovdqu l3, 3 * 16(l);

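/*
 * transpose_4x4: transpose the 4x4 matrix of 32-bit words held in x0..x3;
 * t1 and t2 are clobbered as scratch registers.
 */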
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

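/*
 * byteslice_16x16b: regroup the bytes of the 16 blocks held in a0..d3 into
 * the byte-sliced layout used by the round function, using st0/st1 as
 * temporary memory operands; see the note at the end of the macro about the
 * byte order inside each vector.
 */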
#define byteslice_16x16b(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, \
			 b3, c3, d3, st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio, key) \
	vmovq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor 0 * 16(rio), x0, y7; \
	vpxor 1 * 16(rio), x0, y6; \
	vpxor 2 * 16(rio), x0, y5; \
	vpxor 3 * 16(rio), x0, y4; \
	vpxor 4 * 16(rio), x0, y3; \
	vpxor 5 * 16(rio), x0, y2; \
	vpxor 6 * 16(rio), x0, y1; \
	vpxor 7 * 16(rio), x0, y0; \
	vpxor 8 * 16(rio), x0, x7; \
	vpxor 9 * 16(rio), x0, x6; \
	vpxor 10 * 16(rio), x0, x5; \
	vpxor 11 * 16(rio), x0, x4; \
	vpxor 12 * 16(rio), x0, x3; \
	vpxor 13 * 16(rio), x0, x2; \
	vpxor 14 * 16(rio), x0, x1; \
	vpxor 15 * 16(rio), x0, x0;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		      y6, y7, mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
			 y5, y6, y7, (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

/* de-byteslice, apply post-whitening and store blocks */
#define outunpack16(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
		    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
	byteslice_16x16b(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, y3, \
			 y7, x3, x7, stack_tmp0, stack_tmp1); \
	\
	vmovdqu x0, stack_tmp0; \
	\
	vmovq key, x0; \
	vpshufb .Lpack_bswap, x0, x0; \
	\
	vpxor x0, y7, y7; \
	vpxor x0, y6, y6; \
	vpxor x0, y5, y5; \
	vpxor x0, y4, y4; \
	vpxor x0, y3, y3; \
	vpxor x0, y2, y2; \
	vpxor x0, y1, y1; \
	vpxor x0, y0, y0; \
	vpxor x0, x7, x7; \
	vpxor x0, x6, x6; \
	vpxor x0, x5, x5; \
	vpxor x0, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x0, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor stack_tmp0, x0, x0;

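/* store the 16 output blocks to rio */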
#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
		     y6, y7, rio) \
	vmovdqu x0, 0 * 16(rio); \
	vmovdqu x1, 1 * 16(rio); \
	vmovdqu x2, 2 * 16(rio); \
	vmovdqu x3, 3 * 16(rio); \
	vmovdqu x4, 4 * 16(rio); \
	vmovdqu x5, 5 * 16(rio); \
	vmovdqu x6, 6 * 16(rio); \
	vmovdqu x7, 7 * 16(rio); \
	vmovdqu y0, 8 * 16(rio); \
	vmovdqu y1, 9 * 16(rio); \
	vmovdqu y2, 10 * 16(rio); \
	vmovdqu y3, 11 * 16(rio); \
	vmovdqu y4, 12 * 16(rio); \
	vmovdqu y5, 13 * 16(rio); \
	vmovdqu y6, 14 * 16(rio); \
	vmovdqu y7, 15 * 16(rio);


/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

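/*
 * .Lshufb_16x16b: vpshufb mask that gathers byte n of each 32-bit word next
 * to each other (a 4x4 byte transpose within the 128-bit register).
 */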
.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);

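/*
 * .Lpack_bswap: vpshufb mask for the 64-bit whitening key loaded with vmovq:
 * byte-swap each 32-bit half of the low quadword and zero the upper quadword
 * (selector bytes with the top bit set produce zero).
 */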
.Lpack_bswap:
	.long 0x00010203
	.long 0x04050607
	.long 0x80808080
	.long 0x80808080

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode IV generation */
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

/*
 * pre-SubByte transform: map the Camellia s-box input into the AES field so
 * that SubBytes (via AESENCLAST) can be reused; pre-lookup for sbox1, sbox2
 * and sbox3, which differ from sbox1 only by output rotations.
 */
.Lpre_tf_lo_s1:
	.byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
	.byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
	.byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
	.byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform, pre-lookup for sbox4: as above, but with the input
 * rotated left by one bit (sbox4(x) = sbox1(rol8(x, 1))).
 */
.Lpre_tf_lo_s4:
	.byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
	.byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
	.byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
	.byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform: map the AES SubBytes output back into the Camellia
 * s-box output domain; post-lookup for sbox1 and sbox4.
 */
.Lpost_tf_lo_s1:
	.byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
	.byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
	.byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
	.byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform, post-lookup for sbox2: output rotated left by one
 * bit (sbox2(x) = rol8(sbox1(x), 1)).
 */
.Lpost_tf_lo_s2:
	.byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
	.byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
	.byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
	.byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform, post-lookup for sbox3: output rotated right by one
 * bit (sbox3(x) = ror8(sbox1(x), 1)).
 */
.Lpost_tf_lo_s3:
	.byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
	.byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
	.byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
	.byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06

/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

/* 4-bit mask */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

.align 8
__camellia_enc_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%xmm0..%xmm15: 16 plaintext blocks
	 * output:
	 *	%xmm0..%xmm15: 16 encrypted blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX),
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX),
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	movl $24, %r8d;
	cmpl $16, key_length(CTX);
	jne .Lenc_max32;

.Lenc_done:
	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 16(%rax));

	FRAME_END
	ret;

.align 8
.Lenc_max32:
	movl $32, %r8d;

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX),
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX));

	enc_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	jmp .Lenc_done;
ENDPROC(__camellia_enc_blk16)

.align 8
__camellia_dec_blk16:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rax: temporary storage, 256 bytes
	 *	%r8d: 24 for 16 byte key, 32 for larger
	 *	%xmm0..%xmm15: 16 encrypted blocks
	 * output:
	 *	%xmm0..%xmm15: 16 plaintext blocks, order swapped:
	 *	 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
	 */
	FRAME_BEGIN

	leaq 8 * 16(%rax), %rcx;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %rcx);

	cmpl $32, %r8d;
	je .Ldec_max32;

.Ldec_max24:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 16);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (16) * 8) + 8)(CTX),
	      ((key_table + (16) * 8) + 12)(CTX),
	      ((key_table + (16) * 8) + 0)(CTX),
	      ((key_table + (16) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 8);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (8) * 8) + 8)(CTX),
	      ((key_table + (8) * 8) + 12)(CTX),
	      ((key_table + (8) * 8) + 0)(CTX),
	      ((key_table + (8) * 8) + 4)(CTX));

	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 0);

	/* load CD for output */
	vmovdqu 0 * 16(%rcx), %xmm8;
	vmovdqu 1 * 16(%rcx), %xmm9;
	vmovdqu 2 * 16(%rcx), %xmm10;
	vmovdqu 3 * 16(%rcx), %xmm11;
	vmovdqu 4 * 16(%rcx), %xmm12;
	vmovdqu 5 * 16(%rcx), %xmm13;
	vmovdqu 6 * 16(%rcx), %xmm14;
	vmovdqu 7 * 16(%rcx), %xmm15;

	outunpack16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		    %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		    %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax));

	FRAME_END
	ret;

.align 8
.Ldec_max32:
	dec_rounds16(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %rcx, 24);

	fls16(%rax, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
	      %rcx, %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
	      %xmm15,
	      ((key_table + (24) * 8) + 8)(CTX),
	      ((key_table + (24) * 8) + 12)(CTX),
	      ((key_table + (24) * 8) + 0)(CTX),
	      ((key_table + (24) * 8) + 4)(CTX));

	jmp .Ldec_max24;
ENDPROC(__camellia_dec_blk16)

ENTRY(camellia_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_enc_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
ENDPROC(camellia_ecb_enc_16way)

ENTRY(camellia_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* %r8d = 24 if key_length == 16, else 32 */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/* now dst can be used as temporary buffer (even in src == dst case) */
	movq %rsi, %rax;

	call __camellia_dec_blk16;

	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
ENDPROC(camellia_ecb_dec_16way)

ENTRY(camellia_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 */
	FRAME_BEGIN

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* %r8d = 24 if key_length == 16, else 32 */

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));

	/*
	 * dst might still be in-use (in case dst == src), so use stack for
	 * temporary storage.
	 */
	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	call __camellia_dec_blk16;

	addq $(16 * 16), %rsp;

	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
	vpxor (1 * 16)(%rdx), %xmm5, %xmm5;
	vpxor (2 * 16)(%rdx), %xmm4, %xmm4;
	vpxor (3 * 16)(%rdx), %xmm3, %xmm3;
	vpxor (4 * 16)(%rdx), %xmm2, %xmm2;
	vpxor (5 * 16)(%rdx), %xmm1, %xmm1;
	vpxor (6 * 16)(%rdx), %xmm0, %xmm0;
	vpxor (7 * 16)(%rdx), %xmm15, %xmm15;
	vpxor (8 * 16)(%rdx), %xmm14, %xmm14;
	vpxor (9 * 16)(%rdx), %xmm13, %xmm13;
	vpxor (10 * 16)(%rdx), %xmm12, %xmm12;
	vpxor (11 * 16)(%rdx), %xmm11, %xmm11;
	vpxor (12 * 16)(%rdx), %xmm10, %xmm10;
	vpxor (13 * 16)(%rdx), %xmm9, %xmm9;
	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
ENDPROC(camellia_cbc_dec_16way)

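/*
 * inc_le128: increment the 128-bit little-endian counter in x by one,
 * propagating the carry from the low into the high quadword; minus_one must
 * hold { low: -1, high: 0 } and tmp is clobbered.
 */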
#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

ENTRY(camellia_ctr_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	vmovdqa .Lbswap128_mask, %xmm14;

	/* load IV and byteswap */
	vmovdqu (%rcx), %xmm0;
	vpshufb %xmm14, %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);

	vpcmpeqd %xmm15, %xmm15, %xmm15;
	vpsrldq $8, %xmm15, %xmm15; /* low: -1, high: 0 */

	/* construct IVs */
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 14 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm13;
	vmovdqu %xmm13, 13 * 16(%rax);
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm12;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm11;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm10;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm9;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm8;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm7;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm6;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm5;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm4;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm3;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm2;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vpshufb %xmm14, %xmm0, %xmm1;
	inc_le128(%xmm0, %xmm15, %xmm13);
	vmovdqa %xmm0, %xmm13;
	vpshufb %xmm14, %xmm0, %xmm0;
	inc_le128(%xmm13, %xmm15, %xmm14);
	vmovdqu %xmm13, (%rcx);

	/* inpack16_pre: */
	vmovq (key_table)(CTX), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor %xmm0, %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor 13 * 16(%rax), %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	call __camellia_enc_blk16;

	addq $(16 * 16), %rsp;

	/* XOR the encrypted counter blocks with the source */
	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
	vpxor 3 * 16(%rdx), %xmm4, %xmm4;
	vpxor 4 * 16(%rdx), %xmm3, %xmm3;
	vpxor 5 * 16(%rdx), %xmm2, %xmm2;
	vpxor 6 * 16(%rdx), %xmm1, %xmm1;
	vpxor 7 * 16(%rdx), %xmm0, %xmm0;
	vpxor 8 * 16(%rdx), %xmm15, %xmm15;
	vpxor 9 * 16(%rdx), %xmm14, %xmm14;
	vpxor 10 * 16(%rdx), %xmm13, %xmm13;
	vpxor 11 * 16(%rdx), %xmm12, %xmm12;
	vpxor 12 * 16(%rdx), %xmm11, %xmm11;
	vpxor 13 * 16(%rdx), %xmm10, %xmm10;
	vpxor 14 * 16(%rdx), %xmm9, %xmm9;
	vpxor 15 * 16(%rdx), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
ENDPROC(camellia_ctr_16way)

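/*
 * gf128mul_x_ble: multiply the XTS tweak in iv by x (i.e. double it) in
 * GF(2^128) using the little-endian block convention; mask must hold
 * .Lxts_gf128mul_and_shl1_mask and tmp is clobbered.
 */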
#define gf128mul_x_ble(iv, mask, tmp) \
	vpsrad $31, iv, tmp; \
	vpaddq iv, iv, iv; \
	vpshufd $0x13, tmp, tmp; \
	vpand mask, tmp, tmp; \
	vpxor tmp, iv, iv;

.align 8
camellia_xts_crypt_16way:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t XOR alpha^n, GF(2^128))
	 *	%r8: index for input whitening key
	 *	%r9: pointer to __camellia_enc_blk16 or __camellia_dec_blk16
	 */
	FRAME_BEGIN

	subq $(16 * 16), %rsp;
	movq %rsp, %rax;

	vmovdqa .Lxts_gf128mul_and_shl1_mask, %xmm14;

	/* load IV */
	vmovdqu (%rcx), %xmm0;
	vpxor 0 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 15 * 16(%rax);
	vmovdqu %xmm0, 0 * 16(%rsi);

	/* construct IVs */
	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 1 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 14 * 16(%rax);
	vmovdqu %xmm0, 1 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 2 * 16(%rdx), %xmm0, %xmm13;
	vmovdqu %xmm0, 2 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 3 * 16(%rdx), %xmm0, %xmm12;
	vmovdqu %xmm0, 3 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 4 * 16(%rdx), %xmm0, %xmm11;
	vmovdqu %xmm0, 4 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 5 * 16(%rdx), %xmm0, %xmm10;
	vmovdqu %xmm0, 5 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 6 * 16(%rdx), %xmm0, %xmm9;
	vmovdqu %xmm0, 6 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 7 * 16(%rdx), %xmm0, %xmm8;
	vmovdqu %xmm0, 7 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 8 * 16(%rdx), %xmm0, %xmm7;
	vmovdqu %xmm0, 8 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 9 * 16(%rdx), %xmm0, %xmm6;
	vmovdqu %xmm0, 9 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 10 * 16(%rdx), %xmm0, %xmm5;
	vmovdqu %xmm0, 10 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 11 * 16(%rdx), %xmm0, %xmm4;
	vmovdqu %xmm0, 11 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 12 * 16(%rdx), %xmm0, %xmm3;
	vmovdqu %xmm0, 12 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 13 * 16(%rdx), %xmm0, %xmm2;
	vmovdqu %xmm0, 13 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 14 * 16(%rdx), %xmm0, %xmm1;
	vmovdqu %xmm0, 14 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vpxor 15 * 16(%rdx), %xmm0, %xmm15;
	vmovdqu %xmm15, 0 * 16(%rax);
	vmovdqu %xmm0, 15 * 16(%rsi);

	gf128mul_x_ble(%xmm0, %xmm14, %xmm15);
	vmovdqu %xmm0, (%rcx);

	/* inpack16_pre: */
	vmovq (key_table)(CTX, %r8, 8), %xmm15;
	vpshufb .Lpack_bswap, %xmm15, %xmm15;
	vpxor 0 * 16(%rax), %xmm15, %xmm0;
	vpxor %xmm1, %xmm15, %xmm1;
	vpxor %xmm2, %xmm15, %xmm2;
	vpxor %xmm3, %xmm15, %xmm3;
	vpxor %xmm4, %xmm15, %xmm4;
	vpxor %xmm5, %xmm15, %xmm5;
	vpxor %xmm6, %xmm15, %xmm6;
	vpxor %xmm7, %xmm15, %xmm7;
	vpxor %xmm8, %xmm15, %xmm8;
	vpxor %xmm9, %xmm15, %xmm9;
	vpxor %xmm10, %xmm15, %xmm10;
	vpxor %xmm11, %xmm15, %xmm11;
	vpxor %xmm12, %xmm15, %xmm12;
	vpxor %xmm13, %xmm15, %xmm13;
	vpxor 14 * 16(%rax), %xmm15, %xmm14;
	vpxor 15 * 16(%rax), %xmm15, %xmm15;

	CALL_NOSPEC %r9;

	addq $(16 * 16), %rsp;

	/* XOR the output blocks with the per-block tweaks stored in dst */
	vpxor 0 * 16(%rsi), %xmm7, %xmm7;
	vpxor 1 * 16(%rsi), %xmm6, %xmm6;
	vpxor 2 * 16(%rsi), %xmm5, %xmm5;
	vpxor 3 * 16(%rsi), %xmm4, %xmm4;
	vpxor 4 * 16(%rsi), %xmm3, %xmm3;
	vpxor 5 * 16(%rsi), %xmm2, %xmm2;
	vpxor 6 * 16(%rsi), %xmm1, %xmm1;
	vpxor 7 * 16(%rsi), %xmm0, %xmm0;
	vpxor 8 * 16(%rsi), %xmm15, %xmm15;
	vpxor 9 * 16(%rsi), %xmm14, %xmm14;
	vpxor 10 * 16(%rsi), %xmm13, %xmm13;
	vpxor 11 * 16(%rsi), %xmm12, %xmm12;
	vpxor 12 * 16(%rsi), %xmm11, %xmm11;
	vpxor 13 * 16(%rsi), %xmm10, %xmm10;
	vpxor 14 * 16(%rsi), %xmm9, %xmm9;
	vpxor 15 * 16(%rsi), %xmm8, %xmm8;
	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
		     %xmm8, %rsi);

	FRAME_END
	ret;
ENDPROC(camellia_xts_crypt_16way)

ENTRY(camellia_xts_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t XOR alpha^n, GF(2^128))
	 */
	xorl %r8d, %r8d; /* input whitening key, 0 for enc */

	leaq __camellia_enc_blk16, %r9;

	jmp camellia_xts_crypt_16way;
ENDPROC(camellia_xts_enc_16way)

ENTRY(camellia_xts_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t XOR alpha^n, GF(2^128))
	 */
	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* input whitening key, last for dec */

	leaq __camellia_dec_blk16, %r9;

	jmp camellia_xts_crypt_16way;
ENDPROC(camellia_xts_dec_16way)