1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65 #include <linux/linkage.h>
66 #include <asm/inst.h>
67
/* Unaligned 128-bit move; used for every access to the in/out data buffers. */
#define VMOVDQ		vmovdqu

/* xmm0-xmm7: the (up to eight) blocks that are processed in parallel. */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7

/* Running counter (kept byte-reversed) and the vpshufb mask that reverses it. */
#define xcounter	%xmm8
#define xbyteswap	%xmm9

/*
 * Round keys that stay resident between do_aes invocations.  Which key
 * schedule slot each one holds depends on the key length (see the preload
 * in do_aes_ctrmain), so the numeric suffix is only literal for 192/256.
 */
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13

/* Short-lived round keys; also reused as scratch for the input blocks. */
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/* Argument registers, in the order the entry points receive them. */
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8

/* Scratch register: residual block count in do_aes_ctrmain. */
#define tmp		%r10

/* Selectors understood by the 'club' macro. */
#define DDQ_DATA	0
#define XDATA		1

/* Key-length selectors passed to do_aes / do_aes_ctrmain. */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3
99
.section .rodata
.balign 16

/* vpshufb mask that reverses the order of the 16 bytes in a register. */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F

/* All-ones in the low quadword: used with vptest to detect low-half == 0. */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF

/* 1 in the high quadword: propagates a carry out of the low quadword. */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000

/* ddq_add_N: the value N in the low quadword (counter increments 1..8). */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text
127
128
129
/*
 * setddq n
 * Make the assembler symbol var_ddq_add refer to the constant ddq_add_<n>.
 */
.macro setddq n
	.set var_ddq_add, ddq_add_\n
.endm
133
134
/*
 * setxdata n
 * Make the assembler symbol var_xdata refer to register %xmm<n>.
 */
.macro setxdata n
	.set var_xdata, %xmm\n
.endm
138
139
140
/*
 * club name, id
 * Evaluate the expression 'id' to a number and bind it to the symbol family
 * selected by 'name':
 *   XDATA     -> var_xdata   = %xmm<id>
 *   DDQ_DATA  -> var_ddq_add = ddq_add_<id>
 * Any other 'name' does nothing.  .altmacro is enabled only for the duration
 * of the macro so that "%expr" is passed on as its numeric value.
 */
.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.elseif \name == DDQ_DATA
		setddq %\id
	.endif
.noaltmacro
.endm
150
151
152
153
154
/*
 * __aes_round_all op, rkey
 * Private helper for do_aes: emit "\op \rkey, %xmmN, %xmmN" for
 * N = 0 .. by-1, i.e. apply one AES round instruction to every block in
 * flight.  Reads the assembler symbol 'by' set by do_aes and leaves i == by.
 */
.macro __aes_round_all op, rkey
	.set i, 0
	.rept by
		club XDATA, i
		\op	\rkey, var_xdata, var_xdata
		.set i, (i + 1)
	.endr
.endm

/*
 * do_aes b, k, key_len
 *
 * Encrypt 'b' consecutive counter values and XOR the result with 'b' input
 * blocks, writing 'b' output blocks.
 *   b        number of 16-byte blocks handled at once; they live in
 *            %xmm0 .. %xmm(b-1)
 *   k        non-zero: load the resident round keys xkey0/xkey4/xkey8/xkey12
 *            from p_keys as they are needed; zero: they must already hold
 *            the right values
 *   key_len  KEY_128, KEY_192 or KEY_256
 *
 * Side effects: p_in is advanced by 16*b and xcounter by b.  p_out is NOT
 * advanced (the caller does that).  xkeyA, xkeyB and the flags are clobbered.
 * Round-key loads are interleaved with the rounds that use the previous key.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	/* Block 0 is the current counter, swapped back to memory byte order. */
	vpshufb	xbyteswap, xcounter, xdata0

	/*
	 * Blocks 1 .. by-1 are counter + i.  vpaddq does not carry between
	 * quadwords, so when the low quadword of counter + i lands exactly on
	 * zero the carry is added to the high quadword by hand, both in this
	 * block and in xcounter itself, so that the following blocks inherit
	 * the already-carried high half.
	 */
	.set i, 1
	.rept (by - 1)
		club DDQ_DATA, i
		club XDATA, i
		vpaddq	var_ddq_add(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz	1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i + 1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	/* Round key 0 (plain XOR) for block 0. */
	vpxor	xkey0, xdata0, xdata0

	/* Step the running counter past this batch, same carry rule as above. */
	club DDQ_DATA, by
	vpaddq	var_ddq_add(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
1:

	/* Round key 0 for the remaining blocks. */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i + 1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	__aes_round_all vaesenc, xkeyA			/* round key 1 */

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	__aes_round_all vaesenc, xkeyB			/* round key 2 */

	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	/* round key 3 */
	.if (klen == KEY_128)
		__aes_round_all vaesenc, xkey4
	.else
		__aes_round_all vaesenc, xkeyA
	.endif

	vmovdqa	5*16(p_keys), xkeyA

	/* round key 4 */
	.if (klen == KEY_128)
		__aes_round_all vaesenc, xkeyB
	.else
		__aes_round_all vaesenc, xkey4
	.endif

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	__aes_round_all vaesenc, xkeyA			/* round key 5 */

	vmovdqa	7*16(p_keys), xkeyA

	/* round key 6 */
	.if (klen == KEY_128)
		__aes_round_all vaesenc, xkey8
	.else
		__aes_round_all vaesenc, xkeyB
	.endif

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	__aes_round_all vaesenc, xkeyA			/* round key 7 */

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	/* round key 8 */
	.if (klen == KEY_128)
		__aes_round_all vaesenc, xkeyB
	.else
		__aes_round_all vaesenc, xkey8
	.endif

	vmovdqa	10*16(p_keys), xkeyB

	/* round key 9 */
	.if (klen == KEY_128)
		__aes_round_all vaesenc, xkey12
	.else
		__aes_round_all vaesenc, xkeyA
	.endif

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	/* round key 10: this is the last round for KEY_128 */
	.if (klen == KEY_128)
		__aes_round_all vaesenclast, xkeyB
	.else
		__aes_round_all vaesenc, xkeyB
	.endif

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		__aes_round_all vaesenc, xkeyA		/* round key 11 */

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		/* round key 12: this is the last round for KEY_192 */
		.if (klen == KEY_256)
			__aes_round_all vaesenc, xkey12
		.else
			__aes_round_all vaesenclast, xkey12
		.endif

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			__aes_round_all vaesenc, xkeyA		/* round key 13 */
			__aes_round_all vaesenclast, xkeyB	/* round key 14 */
		.endif
	.endif

	/*
	 * XOR the encrypted counters with the input, two blocks per step.
	 * p_in was already advanced, hence the negative offsets.  xkeyA and
	 * xkeyB are free again and serve as load temporaries.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i + 1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i + 2)
	.endr

	/* Odd block count: one block is left over. */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	/* Store the results; every input block has been read by now. */
	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i + 1)
	.endr
.endm
425
/*
 * do_aes_load val, key_len
 * do_aes on 'val' blocks, loading the resident round keys along the way.
 */
.macro do_aes_load val, key_len
	do_aes	\val, 1, \key_len
.endm
429
/*
 * do_aes_noload val, key_len
 * do_aes on 'val' blocks, relying on xkey0/xkey4/xkey8/xkey12 having been
 * loaded beforehand.
 */
.macro do_aes_noload val, key_len
	do_aes	\val, 0, \key_len
.endm
433
434
435
/*
 * __ctr_tail n, key_len
 * Private helper for do_aes_ctrmain: process the 'n' (1..7) blocks that do
 * not fill a group of eight, then either return or enter the 8-block loop.
 * do_aes_load also leaves the resident round keys loaded for that loop.
 */
.macro __ctr_tail n, key_len
	do_aes_load	\n, \key_len
	add	$(\n*16), p_out
	and	$(~7*16), num_bytes		/* keep whole 8-block groups only */
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len
.endm

/*
 * do_aes_ctrmain key_len
 *
 * Body shared by the three entry points.  key_len is KEY_128, KEY_192 or
 * KEY_256.
 *
 * Inputs:  p_in (%rdi), p_iv (%rsi), p_keys (%rdx), p_out (%rcx),
 *          num_bytes (%r8).
 * Effect:  processes num_bytes / 16 blocks.  First (num_bytes / 16) % 8
 *          blocks in one do_aes call, then groups of eight.  The counter at
 *          p_iv is read once on entry and the advanced value is written back
 *          on exit.  Bytes beyond the last whole 16-byte block are ignored.
 *          If num_bytes < 16 nothing is read or written, including the IV.
 * Clobbers: %xmm0-%xmm15, tmp (%r10), p_in, p_out, num_bytes, flags.
 *
 * p_keys is accessed with vmovdqa and therefore must be 16-byte aligned.
 * NOTE(review): num_bytes is used as a full 64-bit register; if the C
 * prototype declares a 32-bit length, confirm the upper half is clean.
 */
.macro do_aes_ctrmain key_len
	cmp	$16, num_bytes
	/*
	 * Less than one block: leave without touching the IV.  The write-back
	 * at .Ldo_return2 must not be reached from here because xcounter and
	 * xbyteswap have not been loaded yet.
	 */
	jb	.Ldo_exit\key_len

	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	mov	num_bytes, tmp
	and	$(7*16), tmp			/* tmp = 16 * (block count mod 8) */
	jz	.Lmult_of_8_blks\key_len

	/* 16 <= tmp <= 7*16: dispatch on the number of leading blocks. */
	cmp	$(4*16), tmp
	ja	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	ja	.Leq3\key_len
	je	.Leq2\key_len

.Leq1\key_len:
	__ctr_tail 1, \key_len

.Leq2\key_len:
	__ctr_tail 2, \key_len

.Leq3\key_len:
	__ctr_tail 3, \key_len

.Leq4\key_len:
	__ctr_tail 4, \key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	ja	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	__ctr_tail 5, \key_len

.Leq6\key_len:
	__ctr_tail 6, \key_len

.Leq7\key_len:
	__ctr_tail 7, \key_len

.Lmult_of_8_blks\key_len:
	/*
	 * Drop any sub-block residue so the loop counter below reaches
	 * exactly zero.  num_bytes >= 16 with bits 4..6 clear means it is
	 * >= 128 here, so at least one iteration remains after masking.
	 */
	and	$(~7*16), num_bytes

	/* No do_aes_load ran on this path: preload the resident round keys. */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* Invariant: num_bytes is a non-zero multiple of 8*16. */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* Hand the advanced counter back in its original byte order. */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
.Ldo_exit\key_len:
	ret
.endm
539
540
541
542
543
544
545
546
/*
 * aes_ctr_enc_128_avx_by8(in, iv, keys, out, num_bytes)
 *
 * AES-CTR with a 128-bit key (10 rounds), up to eight blocks at a time.
 * Arguments arrive in %rdi, %rsi, %rdx, %rcx, %r8 respectively.
 * The counter block at 'iv' is updated in place.
 * All of %xmm0-%xmm15 are overwritten and not restored here.
 * NOTE(review): num_bytes is presumably a multiple of 16; confirm with callers.
 */
ENTRY(aes_ctr_enc_128_avx_by8)
	do_aes_ctrmain KEY_128
ENDPROC(aes_ctr_enc_128_avx_by8)
552
553
554
555
556
557
558
559
/*
 * aes_ctr_enc_192_avx_by8(in, iv, keys, out, num_bytes)
 *
 * AES-CTR with a 192-bit key (12 rounds), up to eight blocks at a time.
 * Arguments arrive in %rdi, %rsi, %rdx, %rcx, %r8 respectively.
 * The counter block at 'iv' is updated in place.
 * All of %xmm0-%xmm15 are overwritten and not restored here.
 * NOTE(review): num_bytes is presumably a multiple of 16; confirm with callers.
 */
ENTRY(aes_ctr_enc_192_avx_by8)
	do_aes_ctrmain KEY_192
ENDPROC(aes_ctr_enc_192_avx_by8)
565
566
567
568
569
570
571
572
/*
 * aes_ctr_enc_256_avx_by8(in, iv, keys, out, num_bytes)
 *
 * AES-CTR with a 256-bit key (14 rounds), up to eight blocks at a time.
 * Arguments arrive in %rdi, %rsi, %rdx, %rcx, %r8 respectively.
 * The counter block at 'iv' is updated in place.
 * All of %xmm0-%xmm15 are overwritten and not restored here.
 * NOTE(review): num_bytes is presumably a multiple of 16; confirm with callers.
 */
ENTRY(aes_ctr_enc_256_avx_by8)
	do_aes_ctrmain KEY_256
ENDPROC(aes_ctr_enc_256_avx_by8)