1 ########################################################################
# Copyright (c) 2013, Intel Corporation
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 ########################################################################
39 ##
40 ## Authors:
41 ## Erdinc Ozturk <erdinc.ozturk@intel.com>
42 ## Vinodh Gopal <vinodh.gopal@intel.com>
43 ## James Guilford <james.guilford@intel.com>
44 ## Tim Chen <tim.c.chen@linux.intel.com>
45 ##
46 ## References:
## This code was derived and highly optimized from the code described in the paper:
## Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
## on Intel Architecture Processors. August, 2010
## The details of the implementation are explained in:
## Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
## on Intel Architecture Processors. October, 2012.
53 ##
54 ## Assumptions:
55 ##
56 ##
57 ##
58 ## iv:
59 ## 0 1 2 3
60 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
61 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
62 ## | Salt (From the SA) |
63 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
64 ## | Initialization Vector |
65 ## | (This is the sequence number from IPSec header) |
66 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
67 ## | 0x1 |
68 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
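##
## A rough C view of this 16-byte pre-counter block (illustrative only; the
## structure and field names below are not part of this file -- the caller
## assembles the block):
##
##      struct gcm_j0 {
##              u8     salt[4];     /* from the Security Association      */
##              u8     iv[8];       /* per-packet IV / sequence number    */
##              __be32 ctr;         /* initial block counter, fixed at 1  */
##      };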
69 ##
70 ##
71 ##
72 ## AAD:
73 ## AAD padded to 128 bits with 0
74 ## for example, assume AAD is a u32 vector
75 ##
76 ## if AAD is 8 bytes:
77 ## AAD[3] = {A0, A1}#
78 ## padded AAD in xmm register = {A1 A0 0 0}
79 ##
80 ## 0 1 2 3
81 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
82 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
83 ## | SPI (A1) |
84 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
85 ## | 32-bit Sequence Number (A0) |
86 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
87 ## | 0x0 |
88 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
89 ##
90 ## AAD Format with 32-bit Sequence Number
91 ##
92 ## if AAD is 12 bytes:
93 ## AAD[3] = {A0, A1, A2}#
94 ## padded AAD in xmm register = {A2 A1 A0 0}
95 ##
96 ## 0 1 2 3
97 ## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
98 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
99 ## | SPI (A2) |
100 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
101 ## | 64-bit Extended Sequence Number {A1,A0} |
102 ## | |
103 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
104 ## | 0x0 |
105 ## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
106 ##
107 ## AAD Format with 64-bit Extended Sequence Number
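##
## Example: for the 12-byte AAD layout above, the three 32-bit words are
## loaded into the low 12 bytes of an xmm register with the top 4 bytes
## zeroed ({A2 A1 A0 0}); the register is then byte-reflected with SHUF_MASK
## before it enters the GHASH computation (see CALC_AAD_HASH below).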
108 ##
109 ##
110 ## aadLen:
111 ## from the definition of the spec, aadLen can only be 8 or 12 bytes.
112 ## The code additionally supports aadLen of length 16 bytes.
113 ##
114 ## TLen:
115 ## from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
116 ##
117 ## poly = x^128 + x^127 + x^126 + x^121 + 1
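##
## For reference, the hash itself is the standard GHASH recurrence over
## GF(2^128) (a sketch, not code in this file):
##
##      X_0 = 0
##      X_i = (X_{i-1} xor block_i) * H      mod poly
##
## where H is the hash subkey supplied by the caller and '*' is a carry-less
## multiplication reduced modulo the polynomial above.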
## Throughout the code, one-tab and two-tab indentations are used: one tab is
## for the GHASH part, two tabs are for the AES part.
120 ##
121
122 #include <linux/linkage.h>
123 #include <asm/inst.h>
124
125
126 .section .rodata.cst16.POLY, "aM", @progbits, 16
127 .align 16
128 POLY: .octa 0xC2000000000000000000000000000001
129
130 .section .rodata.cst16.POLY2, "aM", @progbits, 16
131 .align 16
132 POLY2: .octa 0xC20000000000000000000001C2000000
133
134 .section .rodata.cst16.TWOONE, "aM", @progbits, 16
135 .align 16
136 TWOONE: .octa 0x00000001000000000000000000000001
137
138 .section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
139 .align 16
140 SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
141
142 .section .rodata.cst16.ONE, "aM", @progbits, 16
143 .align 16
144 ONE: .octa 0x00000000000000000000000000000001
145
146 .section .rodata.cst16.ONEf, "aM", @progbits, 16
147 .align 16
148 ONEf: .octa 0x01000000000000000000000000000000
149
150
151
152 .section .rodata, "a", @progbits
153 .align 16
154 SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
155 ALL_F: .octa 0xffffffffffffffffffffffffffffffff
156 .octa 0x00000000000000000000000000000000
157
158 .section .rodata
159 .align 16
160 .type aad_shift_arr, @object
161 .size aad_shift_arr, 272
162 aad_shift_arr:
163 .octa 0xffffffffffffffffffffffffffffffff
164 .octa 0xffffffffffffffffffffffffffffff0C
165 .octa 0xffffffffffffffffffffffffffff0D0C
166 .octa 0xffffffffffffffffffffffffff0E0D0C
167 .octa 0xffffffffffffffffffffffff0F0E0D0C
168 .octa 0xffffffffffffffffffffff0C0B0A0908
169 .octa 0xffffffffffffffffffff0D0C0B0A0908
170 .octa 0xffffffffffffffffff0E0D0C0B0A0908
171 .octa 0xffffffffffffffff0F0E0D0C0B0A0908
172 .octa 0xffffffffffffff0C0B0A090807060504
173 .octa 0xffffffffffff0D0C0B0A090807060504
174 .octa 0xffffffffff0E0D0C0B0A090807060504
175 .octa 0xffffffff0F0E0D0C0B0A090807060504
176 .octa 0xffffff0C0B0A09080706050403020100
177 .octa 0xffff0D0C0B0A09080706050403020100
178 .octa 0xff0E0D0C0B0A09080706050403020100
179 .octa 0x0F0E0D0C0B0A09080706050403020100
180
181
182 .text
183
184
185 #define AadHash 16*0
186 #define AadLen 16*1
187 #define InLen (16*1)+8
188 #define PBlockEncKey 16*2
189 #define OrigIV 16*3
190 #define CurCount 16*4
191 #define PBlockLen 16*5
192
193 HashKey = 16*6 # store HashKey <<1 mod poly here
194 HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
195 HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
196 HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
197 HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
198 HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
199 HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
200 HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
201 HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
202 HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
203 HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
204 HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
205 HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
206 HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
207 HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
208 HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
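# The offsets above assume a context layout roughly equivalent to the C
# structure below (a sketch only -- field names are illustrative and the
# authoritative definition lives in the C glue code):
#
#       struct gcm_context_data {
#               u8      aad_hash[16];              /* AadHash               */
#               u64     aad_length;                /* AadLen                */
#               u64     in_length;                 /* InLen                 */
#               u8      partial_block_enc_key[16]; /* PBlockEncKey          */
#               u8      orig_iv[16];               /* OrigIV                */
#               u8      current_counter[16];       /* CurCount              */
#               u64     partial_block_len;         /* PBlockLen             */
#               u64     unused;                    /* pad to the next 16B   */
#               u8      hash_keys[16 * 16];        /* HashKey..HashKey_8_k  */
#       };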
209
210 #define arg1 %rdi
211 #define arg2 %rsi
212 #define arg3 %rdx
213 #define arg4 %rcx
214 #define arg5 %r8
215 #define arg6 %r9
216 #define arg7 STACK_OFFSET+8*1(%r14)
217 #define arg8 STACK_OFFSET+8*2(%r14)
218 #define arg9 STACK_OFFSET+8*3(%r14)
219 #define arg10 STACK_OFFSET+8*4(%r14)
220 #define keysize 2*15*16(arg1)
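# keysize reads the key length (in bytes) stored after the expanded round
# keys: 15 blocks * 16 bytes of encryption round keys plus the same again for
# decryption, i.e. an offset of 2*15*16 = 480 into a key schedule shaped
# roughly like the sketch below (illustrative, not defined in this file):
#
#       struct aes_key_schedule {
#               u8      key_enc[15 * 16];   /* expanded encryption round keys */
#               u8      key_dec[15 * 16];   /* expanded decryption round keys */
#               u32     key_length;         /* 16, 24 or 32                   */
#       };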
221
222 i = 0
223 j = 0
224
225 out_order = 0
226 in_order = 1
227 DEC = 0
228 ENC = 1
229
230 .macro define_reg r n
231 reg_\r = %xmm\n
232 .endm
233
234 .macro setreg
235 .altmacro
236 define_reg i %i
237 define_reg j %j
238 .noaltmacro
239 .endm
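# Example: with i = 3, "setreg" evaluates "define_reg i %i" under .altmacro,
# so %i expands to the literal 3 and define_reg assigns reg_i = %xmm3.  Every
# later use of reg_i (or reg_j) in the macros below therefore names the xmm
# register selected by the value of i (or j) at the last setreg.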
240
241
242 STACK_OFFSET = 8*4
243
244 TMP1 = 16*0 # Temporary storage for AAD
245 TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
246 TMP3 = 16*2 # Temporary storage for AES State 3
247 TMP4 = 16*3 # Temporary storage for AES State 4
248 TMP5 = 16*4 # Temporary storage for AES State 5
249 TMP6 = 16*5 # Temporary storage for AES State 6
250 TMP7 = 16*6 # Temporary storage for AES State 7
251 TMP8 = 16*7 # Temporary storage for AES State 8
252
253 VARIABLE_OFFSET = 16*8
254
255 ################################
256
257 ################################
258
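# FUNC_SAVE pushes the four callee-saved registers used below (matching
# STACK_OFFSET = 8*4) and keeps the incoming %rsp in %r14 so the stack
# arguments (arg7..arg10) stay addressable and FUNC_RESTORE can undo the
# alignment; it then carves out VARIABLE_OFFSET bytes of scratch space
# (TMP1..TMP8) and aligns %rsp to 64 bytes.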
259 .macro FUNC_SAVE
260
261 push %r12
262 push %r13
263 push %r14
264 push %r15
265
266 mov %rsp, %r14
267
268
269
270 sub $VARIABLE_OFFSET, %rsp
271 and $~63, %rsp # align rsp to 64 bytes
272 .endm
273
274 .macro FUNC_RESTORE
275 mov %r14, %rsp
276
277 pop %r15
278 pop %r14
279 pop %r13
280 pop %r12
281 .endm
282
283
284 .macro ENCRYPT_SINGLE_BLOCK REP XMM0
285 vpxor (arg1), \XMM0, \XMM0
286 i = 1
287 setreg
288 .rep \REP
289 vaesenc 16*i(arg1), \XMM0, \XMM0
290 i = (i+1)
291 setreg
292 .endr
293 vaesenclast 16*i(arg1), \XMM0, \XMM0
294 .endm
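# Example: for AES-128 \REP is 9, so the block is whitened with the round-0
# key, passed through vaesenc with round keys 1..9 and finished with
# vaesenclast on round key 10.  AES-192 and AES-256 use \REP = 11 and 13
# (see the keysize checks in the ENTRY functions below).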
295
296
297
298
299 .macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
300 vmovdqu AadHash(arg2), %xmm8
301 vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
302 add arg5, InLen(arg2)
303
304
305 xor %r11d, %r11d
306
307 PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
308 sub %r11, arg5
309
310 mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
311 and $-16, %r13 # r13 = r13 - (r13 mod 16)
312
313 mov %r13, %r12
314 shr $4, %r12
315 and $7, %r12
316 jz _initial_num_blocks_is_0\@
317
318 cmp $7, %r12
319 je _initial_num_blocks_is_7\@
320 cmp $6, %r12
321 je _initial_num_blocks_is_6\@
322 cmp $5, %r12
323 je _initial_num_blocks_is_5\@
324 cmp $4, %r12
325 je _initial_num_blocks_is_4\@
326 cmp $3, %r12
327 je _initial_num_blocks_is_3\@
328 cmp $2, %r12
329 je _initial_num_blocks_is_2\@
330
331 jmp _initial_num_blocks_is_1\@
332
333 _initial_num_blocks_is_7\@:
334 \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
335 sub $16*7, %r13
336 jmp _initial_blocks_encrypted\@
337
338 _initial_num_blocks_is_6\@:
339 \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
340 sub $16*6, %r13
341 jmp _initial_blocks_encrypted\@
342
343 _initial_num_blocks_is_5\@:
344 \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
345 sub $16*5, %r13
346 jmp _initial_blocks_encrypted\@
347
348 _initial_num_blocks_is_4\@:
349 \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
350 sub $16*4, %r13
351 jmp _initial_blocks_encrypted\@
352
353 _initial_num_blocks_is_3\@:
354 \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
355 sub $16*3, %r13
356 jmp _initial_blocks_encrypted\@
357
358 _initial_num_blocks_is_2\@:
359 \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
360 sub $16*2, %r13
361 jmp _initial_blocks_encrypted\@
362
363 _initial_num_blocks_is_1\@:
364 \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
365 sub $16*1, %r13
366 jmp _initial_blocks_encrypted\@
367
368 _initial_num_blocks_is_0\@:
369 \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
370
371
372 _initial_blocks_encrypted\@:
373 cmp $0, %r13
374 je _zero_cipher_left\@
375
376 sub $128, %r13
377 je _eight_cipher_left\@
378
379
380
381
382 vmovd %xmm9, %r15d
383 and $255, %r15d
384 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
385
386
387 _encrypt_by_8_new\@:
388 cmp $(255-8), %r15d
389 jg _encrypt_by_8\@
390
391
392
393 add $8, %r15b
394 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
395 add $128, %r11
396 sub $128, %r13
397 jne _encrypt_by_8_new\@
398
399 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
400 jmp _eight_cipher_left\@
401
402 _encrypt_by_8\@:
403 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
404 add $8, %r15b
405 \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
406 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
407 add $128, %r11
408 sub $128, %r13
409 jne _encrypt_by_8_new\@
410
411 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
412
413
414
415
416 _eight_cipher_left\@:
417 \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
418
419
420 _zero_cipher_left\@:
421 vmovdqu %xmm14, AadHash(arg2)
422 vmovdqu %xmm9, CurCount(arg2)
423
424
425 mov arg5, %r13
426 and $15, %r13 # r13 = (arg5 mod 16)
427
428 je _multiple_of_16_bytes\@
429
430
431
432 mov %r13, PBlockLen(arg2)
433
434 vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
435 vmovdqu %xmm9, CurCount(arg2)
436 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
437
438 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
439 vmovdqu %xmm9, PBlockEncKey(arg2)
440
441 cmp $16, arg5
442 jge _large_enough_update\@
443
444 lea (arg4,%r11,1), %r10
445 mov %r13, %r12
446
447 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
448
449 lea SHIFT_MASK+16(%rip), %r12
sub %r13, %r12                        # adjust the shuffle mask pointer to be
                                      # able to shift 16-r13 bytes (r13 is the
                                      # number of bytes in plaintext mod 16)
453
454 jmp _final_ghash_mul\@
455
456 _large_enough_update\@:
457 sub $16, %r11
458 add %r13, %r11
459
460
461 vmovdqu (arg4, %r11, 1), %xmm1
462
463 sub %r13, %r11
464 add $16, %r11
465
466 lea SHIFT_MASK+16(%rip), %r12
467
468 # (r13 is the number of bytes in plaintext mod 16)
469 sub %r13, %r12
470
471 vmovdqu (%r12), %xmm2
472
473 vpshufb %xmm2, %xmm1, %xmm1
474
475 _final_ghash_mul\@:
476 .if \ENC_DEC == DEC
477 vmovdqa %xmm1, %xmm2
478 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
                                      # mask out top 16-r13 bytes of xmm9
481 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
482 vpand %xmm1, %xmm2, %xmm2
483 vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
484 vpxor %xmm2, %xmm14, %xmm14
485
486 vmovdqu %xmm14, AadHash(arg2)
487 .else
488 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
                                      # mask out top 16-r13 bytes of xmm9
491 vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
492 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
493 vpxor %xmm9, %xmm14, %xmm14
494
495 vmovdqu %xmm14, AadHash(arg2)
496 vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
497 .endif
498
499
500 #############################
501
502 vmovq %xmm9, %rax
503 cmp $8, %r13
504 jle _less_than_8_bytes_left\@
505
506 mov %rax, (arg3 , %r11)
507 add $8, %r11
508 vpsrldq $8, %xmm9, %xmm9
509 vmovq %xmm9, %rax
510 sub $8, %r13
511
512 _less_than_8_bytes_left\@:
513 movb %al, (arg3 , %r11)
514 add $1, %r11
515 shr $8, %rax
516 sub $1, %r13
517 jne _less_than_8_bytes_left\@
518 #############################
519
520 _multiple_of_16_bytes\@:
521 .endm
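# Overall flow of GCM_ENC_DEC (an illustrative sketch, not executable code):
#
#       PARTIAL_BLOCK                      # finish a leftover sub-16B block
#       n = (bytes / 16) mod 8
#       INITIAL_BLOCKS(n)                  # single blocks until 8-aligned
#       while (full blocks remain)
#               GHASH_8_ENCRYPT_8_PARALLEL # 8 CTR blocks + 8 GHASH updates
#       GHASH_LAST_8                       # fold the last 8 hash products
#       handle the final sub-16-byte tail, saving PBlockLen/PBlockEncKey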
522
523
524
525
526
527 .macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
528 vmovdqu AadHash(arg2), %xmm14
529 vmovdqu HashKey(arg2), %xmm13
530
531 mov PBlockLen(arg2), %r12
532 cmp $0, %r12
533 je _partial_done\@
534
535
536 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
537
538 _partial_done\@:
539 mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
540 shl $3, %r12 # convert into number of bits
541 vmovd %r12d, %xmm15 # len(A) in xmm15
542
543 mov InLen(arg2), %r12
shl $3, %r12 # len(C) in bits (multiply by 8)
545 vmovq %r12, %xmm1
546 vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
547 vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
548
549 vpxor %xmm15, %xmm14, %xmm14
550 \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
551 vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
552
553 vmovdqu OrigIV(arg2), %xmm9
554
555 ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
556
557 vpxor %xmm14, %xmm9, %xmm9
558
559
560
561 _return_T\@:
562 mov \AUTH_TAG, %r10 # r10 = authTag
563 mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
564
565 cmp $16, %r11
566 je _T_16\@
567
568 cmp $8, %r11
569 jl _T_4\@
570
571 _T_8\@:
572 vmovq %xmm9, %rax
573 mov %rax, (%r10)
574 add $8, %r10
575 sub $8, %r11
576 vpsrldq $8, %xmm9, %xmm9
577 cmp $0, %r11
578 je _return_T_done\@
579 _T_4\@:
580 vmovd %xmm9, %eax
581 mov %eax, (%r10)
582 add $4, %r10
583 sub $4, %r11
584 vpsrldq $4, %xmm9, %xmm9
585 cmp $0, %r11
586 je _return_T_done\@
587 _T_123\@:
588 vmovd %xmm9, %eax
589 cmp $2, %r11
590 jl _T_1\@
591 mov %ax, (%r10)
592 cmp $2, %r11
593 je _return_T_done\@
594 add $2, %r10
595 sar $16, %eax
596 _T_1\@:
597 mov %al, (%r10)
598 jmp _return_T_done\@
599
600 _T_16\@:
601 vmovdqu %xmm9, (%r10)
602
603 _return_T_done\@:
604 .endm
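# Tag construction performed by GCM_COMPLETE (illustrative):
#
#       S = GHASH(H, AAD, C, len(A) || len(C))
#       T = E(K, J0) xor S, truncated to auth_tag_len (16, 12 or 8 bytes)
#
# where J0 is the original counter block saved in OrigIV by INIT.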
605
606 .macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
607
608 mov \AAD, %r10 # r10 = AAD
609 mov \AADLEN, %r12 # r12 = aadLen
610
611
612 mov %r12, %r11
613
614 vpxor \T8, \T8, \T8
615 vpxor \T7, \T7, \T7
616 cmp $16, %r11
617 jl _get_AAD_rest8\@
618 _get_AAD_blocks\@:
619 vmovdqu (%r10), \T7
620 vpshufb SHUF_MASK(%rip), \T7, \T7
621 vpxor \T7, \T8, \T8
622 \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
623 add $16, %r10
624 sub $16, %r12
625 sub $16, %r11
626 cmp $16, %r11
627 jge _get_AAD_blocks\@
628 vmovdqu \T8, \T7
629 cmp $0, %r11
630 je _get_AAD_done\@
631
632 vpxor \T7, \T7, \T7
633
634
635
636
637 _get_AAD_rest8\@:
638 cmp $4, %r11
639 jle _get_AAD_rest4\@
640 movq (%r10), \T1
641 add $8, %r10
642 sub $8, %r11
643 vpslldq $8, \T1, \T1
644 vpsrldq $8, \T7, \T7
645 vpxor \T1, \T7, \T7
646 jmp _get_AAD_rest8\@
647 _get_AAD_rest4\@:
648 cmp $0, %r11
649 jle _get_AAD_rest0\@
650 mov (%r10), %eax
651 movq %rax, \T1
652 add $4, %r10
653 sub $4, %r11
654 vpslldq $12, \T1, \T1
655 vpsrldq $4, \T7, \T7
656 vpxor \T1, \T7, \T7
657 _get_AAD_rest0\@:
658
659
660
661 movq %r12, %r11
662 salq $4, %r11
663 vmovdqu aad_shift_arr(%r11), \T1
664 vpshufb \T1, \T7, \T7
665 _get_AAD_rest_final\@:
666 vpshufb SHUF_MASK(%rip), \T7, \T7
667 vpxor \T8, \T7, \T7
668 \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
669
670 _get_AAD_done\@:
671 vmovdqu \T7, AadHash(arg2)
672 .endm
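# CALC_AAD_HASH hashes the AAD into AadHash: full 16-byte chunks are
# byte-reflected and multiplied in directly, while a sub-16-byte tail is
# gathered 8 and then 4 bytes at a time and realigned with a mask from
# aad_shift_arr before the final GHASH_MUL.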
673
674 .macro INIT GHASH_MUL PRECOMPUTE
675 mov arg6, %r11
676 mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
677 xor %r11d, %r11d
678 mov %r11, InLen(arg2) # ctx_data.in_length = 0
679
680 mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
681 mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
682 mov arg3, %rax
683 movdqu (%rax), %xmm0
684 movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
685
686 vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
687 movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
688
689 vmovdqu (arg4), %xmm6 # xmm6 = HashKey
690
691 vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
692 ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
693 vmovdqa %xmm6, %xmm2
694 vpsllq $1, %xmm6, %xmm6
695 vpsrlq $63, %xmm2, %xmm2
696 vmovdqa %xmm2, %xmm1
697 vpslldq $8, %xmm2, %xmm2
698 vpsrldq $8, %xmm1, %xmm1
699 vpor %xmm2, %xmm6, %xmm6
700
701 vpshufd $0b00100100, %xmm1, %xmm2
702 vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
703 vpand POLY(%rip), %xmm2, %xmm2
704 vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
705 #######################################################################
706 vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
707
708 CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
709
710 \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
711 .endm
712
713
714
715
716
717 .macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
718 vpxor \XMMDst, \XMMDst, \XMMDst
719
720 cmp $8, \DLEN
721 jl _read_lt8_\@
722 mov (\DPTR), %rax
723 vpinsrq $0, %rax, \XMMDst, \XMMDst
724 sub $8, \DLEN
725 jz _done_read_partial_block_\@
726 xor %eax, %eax
727 _read_next_byte_\@:
728 shl $8, %rax
729 mov 7(\DPTR, \DLEN, 1), %al
730 dec \DLEN
731 jnz _read_next_byte_\@
732 vpinsrq $1, %rax, \XMMDst, \XMMDst
733 jmp _done_read_partial_block_\@
734 _read_lt8_\@:
735 xor %eax, %eax
736 _read_next_byte_lt8_\@:
737 shl $8, %rax
738 mov -1(\DPTR, \DLEN, 1), %al
739 dec \DLEN
740 jnz _read_next_byte_lt8_\@
741 vpinsrq $0, %rax, \XMMDst, \XMMDst
742 _done_read_partial_block_\@:
743 .endm
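# Example: for \DLEN = 3 with bytes b0 b1 b2 at \DPTR, the byte loop builds
# %rax = 0x0000000000b2b1b0 and inserts it into the low quadword, so \XMMDst
# ends up holding the partial block in memory (little-endian) order with the
# remaining bytes zeroed.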
744
745
746
747
748
749
750 .macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
751 AAD_HASH ENC_DEC
752 mov PBlockLen(arg2), %r13
753 cmp $0, %r13
754 je _partial_block_done_\@ # Leave Macro if no partial blocks
755
756 cmp $16, \PLAIN_CYPH_LEN
757 jl _fewer_than_16_bytes_\@
758 vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
759 jmp _data_read_\@
760
761 _fewer_than_16_bytes_\@:
762 lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
763 mov \PLAIN_CYPH_LEN, %r12
764 READ_PARTIAL_BLOCK %r10 %r12 %xmm1
765
766 mov PBlockLen(arg2), %r13
767
768 _data_read_\@: # Finished reading in data
769
770 vmovdqu PBlockEncKey(arg2), %xmm9
771 vmovdqu HashKey(arg2), %xmm13
772
773 lea SHIFT_MASK(%rip), %r12
774
775
776
777 add %r13, %r12
778 vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
779 vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
780
781 .if \ENC_DEC == DEC
782 vmovdqa %xmm1, %xmm3
783 pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
784
785 mov \PLAIN_CYPH_LEN, %r10
786 add %r13, %r10
787
788 sub $16, %r10
789
790
791 jge _no_extra_mask_1_\@
792 sub %r10, %r12
793 _no_extra_mask_1_\@:
794
795 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
796
797 vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
798
799 vpand %xmm1, %xmm3, %xmm3
800 vmovdqa SHUF_MASK(%rip), %xmm10
801 vpshufb %xmm10, %xmm3, %xmm3
802 vpshufb %xmm2, %xmm3, %xmm3
803 vpxor %xmm3, \AAD_HASH, \AAD_HASH
804
805 cmp $0, %r10
806 jl _partial_incomplete_1_\@
807
808
809 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
810 xor %eax,%eax
811
812 mov %rax, PBlockLen(arg2)
813 jmp _dec_done_\@
814 _partial_incomplete_1_\@:
815 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
816 _dec_done_\@:
817 vmovdqu \AAD_HASH, AadHash(arg2)
818 .else
819 vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
820
821 mov \PLAIN_CYPH_LEN, %r10
822 add %r13, %r10
823
824 sub $16, %r10
825
826
827 jge _no_extra_mask_2_\@
828 sub %r10, %r12
829 _no_extra_mask_2_\@:
830
831 vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
832
833 vpand %xmm1, %xmm9, %xmm9
834
835 vmovdqa SHUF_MASK(%rip), %xmm1
836 vpshufb %xmm1, %xmm9, %xmm9
837 vpshufb %xmm2, %xmm9, %xmm9
838 vpxor %xmm9, \AAD_HASH, \AAD_HASH
839
840 cmp $0, %r10
841 jl _partial_incomplete_2_\@
842
843
844 \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
845 xor %eax,%eax
846
847 mov %rax, PBlockLen(arg2)
848 jmp _encode_done_\@
849 _partial_incomplete_2_\@:
850 add \PLAIN_CYPH_LEN, PBlockLen(arg2)
851 _encode_done_\@:
852 vmovdqu \AAD_HASH, AadHash(arg2)
853
854 vmovdqa SHUF_MASK(%rip), %xmm10
855
856 vpshufb %xmm10, %xmm9, %xmm9
857 vpshufb %xmm2, %xmm9, %xmm9
858 .endif
859
860 cmp $0, %r10
861 jl _partial_fill_\@
862 mov %r13, %r12
863 mov $16, %r13
864
865 sub %r12, %r13
866 jmp _count_set_\@
867 _partial_fill_\@:
868 mov \PLAIN_CYPH_LEN, %r13
869 _count_set_\@:
870 vmovdqa %xmm9, %xmm0
871 vmovq %xmm0, %rax
872 cmp $8, %r13
873 jle _less_than_8_bytes_left_\@
874
875 mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
876 add $8, \DATA_OFFSET
877 psrldq $8, %xmm0
878 vmovq %xmm0, %rax
879 sub $8, %r13
880 _less_than_8_bytes_left_\@:
881 movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
882 add $1, \DATA_OFFSET
883 shr $8, %rax
884 sub $1, %r13
885 jne _less_than_8_bytes_left_\@
886 _partial_block_done_\@:
887 .endm # PARTIAL_BLOCK
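# PARTIAL_BLOCK stitches a new update onto the sub-16-byte block left over by
# a previous call: it reloads the saved keystream block E(K, Yn) from
# PBlockEncKey, shifts it to line up with the new bytes, xors the data in, and
# only multiplies the block into the hash once a full 16 bytes have
# accumulated; otherwise it just grows PBlockLen.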
888
889 #ifdef CONFIG_AS_AVX
890 ###############################################################################
# GHASH_MUL macro: carry-less multiply of the two 128-bit inputs GH and HK,
# reduced modulo the GCM polynomial x^128 + x^127 + x^126 + x^121 + 1.
# HK is expected to be HashKey<<1 mod poly (as stored by INIT/PRECOMPUTE),
# making the result GH * HashKey mod poly.
# T1..T5 are clobbered as temporaries.
896 ###############################################################################
897 .macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5
898
899 vpshufd $0b01001110, \GH, \T2
900 vpshufd $0b01001110, \HK, \T3
901 vpxor \GH , \T2, \T2 # T2 = (a1+a0)
902 vpxor \HK , \T3, \T3 # T3 = (b1+b0)
903
904 vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1
905 vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0
906 vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0)
907 vpxor \GH, \T2,\T2
908 vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0
909
910 vpslldq $8, \T2,\T3 # shift-L T3 2 DWs
911 vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs
912 vpxor \T3, \GH, \GH
913 vpxor \T2, \T1, \T1 # <T1:GH> = GH x HK
914
915
vpslld $31, \GH, \T2 # packed left shift << 31
vpslld $30, \GH, \T3 # packed left shift << 30
vpslld $25, \GH, \T4 # packed left shift << 25
919
920 vpxor \T3, \T2, \T2 # xor the shifted versions
921 vpxor \T4, \T2, \T2
922
923 vpsrldq $4, \T2, \T5 # shift-R T5 1 DW
924
925 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
926 vpxor \T2, \GH, \GH # first phase of the reduction complete
927
928
929
vpsrld $1,\GH, \T2 # packed right shift >> 1
vpsrld $2,\GH, \T3 # packed right shift >> 2
vpsrld $7,\GH, \T4 # packed right shift >> 7
933 vpxor \T3, \T2, \T2 # xor the shifted versions
934 vpxor \T4, \T2, \T2
935
936 vpxor \T5, \T2, \T2
937 vpxor \T2, \GH, \GH
938 vpxor \T1, \GH, \GH # the result is in GH
939
940
941 .endm
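# Karatsuba recap for the multiply above: with GH = a1:a0 and HK = b1:b0,
# only three 64x64 carry-less products are needed instead of four:
#
#       H = a1*b1,  L = a0*b0,  M = (a1 xor a0)*(b1 xor b0)
#       middle = M xor H xor L            ( = a1*b0 xor a0*b1 )
#
# and the 256-bit product H:middle:L is then folded back with the two-phase
# shift/xor reduction modulo the GCM polynomial.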
942
943 .macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6
944
945
946 vmovdqa \HK, \T5
947
948 vpshufd $0b01001110, \T5, \T1
949 vpxor \T5, \T1, \T1
950 vmovdqu \T1, HashKey_k(arg2)
951
952 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
953 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
954 vpshufd $0b01001110, \T5, \T1
955 vpxor \T5, \T1, \T1
956 vmovdqu \T1, HashKey_2_k(arg2)
957
958 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
959 vmovdqu \T5, HashKey_3(arg2)
960 vpshufd $0b01001110, \T5, \T1
961 vpxor \T5, \T1, \T1
962 vmovdqu \T1, HashKey_3_k(arg2)
963
964 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
965 vmovdqu \T5, HashKey_4(arg2)
966 vpshufd $0b01001110, \T5, \T1
967 vpxor \T5, \T1, \T1
968 vmovdqu \T1, HashKey_4_k(arg2)
969
970 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
971 vmovdqu \T5, HashKey_5(arg2)
972 vpshufd $0b01001110, \T5, \T1
973 vpxor \T5, \T1, \T1
974 vmovdqu \T1, HashKey_5_k(arg2)
975
976 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
977 vmovdqu \T5, HashKey_6(arg2)
978 vpshufd $0b01001110, \T5, \T1
979 vpxor \T5, \T1, \T1
980 vmovdqu \T1, HashKey_6_k(arg2)
981
982 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
983 vmovdqu \T5, HashKey_7(arg2)
984 vpshufd $0b01001110, \T5, \T1
985 vpxor \T5, \T1, \T1
986 vmovdqu \T1, HashKey_7_k(arg2)
987
988 GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
989 vmovdqu \T5, HashKey_8(arg2)
990 vpshufd $0b01001110, \T5, \T1
991 vpxor \T5, \T1, \T1
992 vmovdqu \T1, HashKey_8_k(arg2)
993
994 .endm
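# With H^1..H^8 (and their Karatsuba halves) precomputed, eight ciphertext
# blocks can be folded into the hash with independent multiplications:
#
#       X_new = C1*H^8 xor C2*H^7 xor ... xor C8*H^1
#
# (the running hash having been xor-ed into C1 first), which is what
# GHASH_8_ENCRYPT_8_PARALLEL_AVX and GHASH_LAST_8_AVX implement below.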
995
## if a = number of total plaintext bytes
##    b = floor(a/16)
##    num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as pointers only, not modified
1002
1003 .macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
1004 i = (8-\num_initial_blocks)
1005 setreg
1006 vmovdqu AadHash(arg2), reg_i
1007
1008
1009 vmovdqu CurCount(arg2), \CTR
1010
1011 i = (9-\num_initial_blocks)
1012 setreg
1013 .rep \num_initial_blocks
1014 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1015 vmovdqa \CTR, reg_i
1016 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1017 i = (i+1)
1018 setreg
1019 .endr
1020
1021 vmovdqa (arg1), \T_key
1022 i = (9-\num_initial_blocks)
1023 setreg
1024 .rep \num_initial_blocks
1025 vpxor \T_key, reg_i, reg_i
1026 i = (i+1)
1027 setreg
1028 .endr
1029
1030 j = 1
1031 setreg
1032 .rep \REP
1033 vmovdqa 16*j(arg1), \T_key
1034 i = (9-\num_initial_blocks)
1035 setreg
1036 .rep \num_initial_blocks
1037 vaesenc \T_key, reg_i, reg_i
1038 i = (i+1)
1039 setreg
1040 .endr
1041
1042 j = (j+1)
1043 setreg
1044 .endr
1045
1046 vmovdqa 16*j(arg1), \T_key
1047 i = (9-\num_initial_blocks)
1048 setreg
1049 .rep \num_initial_blocks
1050 vaesenclast \T_key, reg_i, reg_i
1051 i = (i+1)
1052 setreg
1053 .endr
1054
1055 i = (9-\num_initial_blocks)
1056 setreg
1057 .rep \num_initial_blocks
1058 vmovdqu (arg4, %r11), \T1
1059 vpxor \T1, reg_i, reg_i
1060 vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
1061 add $16, %r11
1062 .if \ENC_DEC == DEC
1063 vmovdqa \T1, reg_i
1064 .endif
1065 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
1066 i = (i+1)
1067 setreg
1068 .endr
1069
1070
1071 i = (8-\num_initial_blocks)
1072 j = (9-\num_initial_blocks)
1073 setreg
1074
1075 .rep \num_initial_blocks
1076 vpxor reg_i, reg_j, reg_j
1077 GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
1078 i = (i+1)
1079 j = (j+1)
1080 setreg
1081 .endr
1082
1083
1084 vmovdqa \XMM8, TMP1(%rsp)
1085 vmovdqa \XMM8, \T3
1086
1087 cmp $128, %r13
1088 jl _initial_blocks_done\@ # no need for precomputed constants
1089
1090 ###############################################################################
1091
1092 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1093 vmovdqa \CTR, \XMM1
1094 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1095
1096 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1097 vmovdqa \CTR, \XMM2
1098 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1099
1100 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1101 vmovdqa \CTR, \XMM3
1102 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1103
1104 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1105 vmovdqa \CTR, \XMM4
1106 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1107
1108 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1109 vmovdqa \CTR, \XMM5
1110 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1111
1112 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1113 vmovdqa \CTR, \XMM6
1114 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1115
1116 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1117 vmovdqa \CTR, \XMM7
1118 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1119
1120 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1121 vmovdqa \CTR, \XMM8
1122 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1123
1124 vmovdqa (arg1), \T_key
1125 vpxor \T_key, \XMM1, \XMM1
1126 vpxor \T_key, \XMM2, \XMM2
1127 vpxor \T_key, \XMM3, \XMM3
1128 vpxor \T_key, \XMM4, \XMM4
1129 vpxor \T_key, \XMM5, \XMM5
1130 vpxor \T_key, \XMM6, \XMM6
1131 vpxor \T_key, \XMM7, \XMM7
1132 vpxor \T_key, \XMM8, \XMM8
1133
1134 i = 1
1135 setreg
1136 .rep \REP # do REP rounds
1137 vmovdqa 16*i(arg1), \T_key
1138 vaesenc \T_key, \XMM1, \XMM1
1139 vaesenc \T_key, \XMM2, \XMM2
1140 vaesenc \T_key, \XMM3, \XMM3
1141 vaesenc \T_key, \XMM4, \XMM4
1142 vaesenc \T_key, \XMM5, \XMM5
1143 vaesenc \T_key, \XMM6, \XMM6
1144 vaesenc \T_key, \XMM7, \XMM7
1145 vaesenc \T_key, \XMM8, \XMM8
1146 i = (i+1)
1147 setreg
1148 .endr
1149
1150 vmovdqa 16*i(arg1), \T_key
1151 vaesenclast \T_key, \XMM1, \XMM1
1152 vaesenclast \T_key, \XMM2, \XMM2
1153 vaesenclast \T_key, \XMM3, \XMM3
1154 vaesenclast \T_key, \XMM4, \XMM4
1155 vaesenclast \T_key, \XMM5, \XMM5
1156 vaesenclast \T_key, \XMM6, \XMM6
1157 vaesenclast \T_key, \XMM7, \XMM7
1158 vaesenclast \T_key, \XMM8, \XMM8
1159
1160 vmovdqu (arg4, %r11), \T1
1161 vpxor \T1, \XMM1, \XMM1
1162 vmovdqu \XMM1, (arg3 , %r11)
1163 .if \ENC_DEC == DEC
1164 vmovdqa \T1, \XMM1
1165 .endif
1166
1167 vmovdqu 16*1(arg4, %r11), \T1
1168 vpxor \T1, \XMM2, \XMM2
1169 vmovdqu \XMM2, 16*1(arg3 , %r11)
1170 .if \ENC_DEC == DEC
1171 vmovdqa \T1, \XMM2
1172 .endif
1173
1174 vmovdqu 16*2(arg4, %r11), \T1
1175 vpxor \T1, \XMM3, \XMM3
1176 vmovdqu \XMM3, 16*2(arg3 , %r11)
1177 .if \ENC_DEC == DEC
1178 vmovdqa \T1, \XMM3
1179 .endif
1180
1181 vmovdqu 16*3(arg4, %r11), \T1
1182 vpxor \T1, \XMM4, \XMM4
1183 vmovdqu \XMM4, 16*3(arg3 , %r11)
1184 .if \ENC_DEC == DEC
1185 vmovdqa \T1, \XMM4
1186 .endif
1187
1188 vmovdqu 16*4(arg4, %r11), \T1
1189 vpxor \T1, \XMM5, \XMM5
1190 vmovdqu \XMM5, 16*4(arg3 , %r11)
1191 .if \ENC_DEC == DEC
1192 vmovdqa \T1, \XMM5
1193 .endif
1194
1195 vmovdqu 16*5(arg4, %r11), \T1
1196 vpxor \T1, \XMM6, \XMM6
1197 vmovdqu \XMM6, 16*5(arg3 , %r11)
1198 .if \ENC_DEC == DEC
1199 vmovdqa \T1, \XMM6
1200 .endif
1201
1202 vmovdqu 16*6(arg4, %r11), \T1
1203 vpxor \T1, \XMM7, \XMM7
1204 vmovdqu \XMM7, 16*6(arg3 , %r11)
1205 .if \ENC_DEC == DEC
1206 vmovdqa \T1, \XMM7
1207 .endif
1208
1209 vmovdqu 16*7(arg4, %r11), \T1
1210 vpxor \T1, \XMM8, \XMM8
1211 vmovdqu \XMM8, 16*7(arg3 , %r11)
1212 .if \ENC_DEC == DEC
1213 vmovdqa \T1, \XMM8
1214 .endif
1215
1216 add $128, %r11
1217
1218 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1219 vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext
1220 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1221 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1222 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1223 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1224 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1225 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1226 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1227
1228 ###############################################################################
1229
1230 _initial_blocks_done\@:
1231
1232 .endm
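# INITIAL_BLOCKS_AVX encrypts the first (blocks mod 8) counter blocks one at a
# time, hashing each ciphertext block into the running AadHash, and -- if at
# least 128 bytes remain -- immediately runs a full 8-block CTR pass so the
# main loop always enters with eight ciphertext blocks pending GHASH.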
1233
1234
1235
1236
1237
1238 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
1239
1240 vmovdqa \XMM1, \T2
1241 vmovdqa \XMM2, TMP2(%rsp)
1242 vmovdqa \XMM3, TMP3(%rsp)
1243 vmovdqa \XMM4, TMP4(%rsp)
1244 vmovdqa \XMM5, TMP5(%rsp)
1245 vmovdqa \XMM6, TMP6(%rsp)
1246 vmovdqa \XMM7, TMP7(%rsp)
1247 vmovdqa \XMM8, TMP8(%rsp)
1248
1249 .if \loop_idx == in_order
1250 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
1251 vpaddd ONE(%rip), \XMM1, \XMM2
1252 vpaddd ONE(%rip), \XMM2, \XMM3
1253 vpaddd ONE(%rip), \XMM3, \XMM4
1254 vpaddd ONE(%rip), \XMM4, \XMM5
1255 vpaddd ONE(%rip), \XMM5, \XMM6
1256 vpaddd ONE(%rip), \XMM6, \XMM7
1257 vpaddd ONE(%rip), \XMM7, \XMM8
1258 vmovdqa \XMM8, \CTR
1259
1260 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1261 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1262 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1263 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1264 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1265 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1266 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1267 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1268 .else
1269 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
1270 vpaddd ONEf(%rip), \XMM1, \XMM2
1271 vpaddd ONEf(%rip), \XMM2, \XMM3
1272 vpaddd ONEf(%rip), \XMM3, \XMM4
1273 vpaddd ONEf(%rip), \XMM4, \XMM5
1274 vpaddd ONEf(%rip), \XMM5, \XMM6
1275 vpaddd ONEf(%rip), \XMM6, \XMM7
1276 vpaddd ONEf(%rip), \XMM7, \XMM8
1277 vmovdqa \XMM8, \CTR
1278 .endif
1279
1280
1281 #######################################################################
1282
1283 vmovdqu (arg1), \T1
1284 vpxor \T1, \XMM1, \XMM1
1285 vpxor \T1, \XMM2, \XMM2
1286 vpxor \T1, \XMM3, \XMM3
1287 vpxor \T1, \XMM4, \XMM4
1288 vpxor \T1, \XMM5, \XMM5
1289 vpxor \T1, \XMM6, \XMM6
1290 vpxor \T1, \XMM7, \XMM7
1291 vpxor \T1, \XMM8, \XMM8
1292
1293 #######################################################################
1294
1295
1296
1297
1298
1299 vmovdqu 16*1(arg1), \T1
1300 vaesenc \T1, \XMM1, \XMM1
1301 vaesenc \T1, \XMM2, \XMM2
1302 vaesenc \T1, \XMM3, \XMM3
1303 vaesenc \T1, \XMM4, \XMM4
1304 vaesenc \T1, \XMM5, \XMM5
1305 vaesenc \T1, \XMM6, \XMM6
1306 vaesenc \T1, \XMM7, \XMM7
1307 vaesenc \T1, \XMM8, \XMM8
1308
1309 vmovdqu 16*2(arg1), \T1
1310 vaesenc \T1, \XMM1, \XMM1
1311 vaesenc \T1, \XMM2, \XMM2
1312 vaesenc \T1, \XMM3, \XMM3
1313 vaesenc \T1, \XMM4, \XMM4
1314 vaesenc \T1, \XMM5, \XMM5
1315 vaesenc \T1, \XMM6, \XMM6
1316 vaesenc \T1, \XMM7, \XMM7
1317 vaesenc \T1, \XMM8, \XMM8
1318
1319
1320 #######################################################################
1321
1322 vmovdqu HashKey_8(arg2), \T5
1323 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
1324 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
1325
1326 vpshufd $0b01001110, \T2, \T6
1327 vpxor \T2, \T6, \T6
1328
1329 vmovdqu HashKey_8_k(arg2), \T5
1330 vpclmulqdq $0x00, \T5, \T6, \T6
1331
1332 vmovdqu 16*3(arg1), \T1
1333 vaesenc \T1, \XMM1, \XMM1
1334 vaesenc \T1, \XMM2, \XMM2
1335 vaesenc \T1, \XMM3, \XMM3
1336 vaesenc \T1, \XMM4, \XMM4
1337 vaesenc \T1, \XMM5, \XMM5
1338 vaesenc \T1, \XMM6, \XMM6
1339 vaesenc \T1, \XMM7, \XMM7
1340 vaesenc \T1, \XMM8, \XMM8
1341
1342 vmovdqa TMP2(%rsp), \T1
1343 vmovdqu HashKey_7(arg2), \T5
1344 vpclmulqdq $0x11, \T5, \T1, \T3
1345 vpxor \T3, \T4, \T4
1346 vpclmulqdq $0x00, \T5, \T1, \T3
1347 vpxor \T3, \T7, \T7
1348
1349 vpshufd $0b01001110, \T1, \T3
1350 vpxor \T1, \T3, \T3
1351 vmovdqu HashKey_7_k(arg2), \T5
1352 vpclmulqdq $0x10, \T5, \T3, \T3
1353 vpxor \T3, \T6, \T6
1354
1355 vmovdqu 16*4(arg1), \T1
1356 vaesenc \T1, \XMM1, \XMM1
1357 vaesenc \T1, \XMM2, \XMM2
1358 vaesenc \T1, \XMM3, \XMM3
1359 vaesenc \T1, \XMM4, \XMM4
1360 vaesenc \T1, \XMM5, \XMM5
1361 vaesenc \T1, \XMM6, \XMM6
1362 vaesenc \T1, \XMM7, \XMM7
1363 vaesenc \T1, \XMM8, \XMM8
1364
1365 #######################################################################
1366
1367 vmovdqa TMP3(%rsp), \T1
1368 vmovdqu HashKey_6(arg2), \T5
1369 vpclmulqdq $0x11, \T5, \T1, \T3
1370 vpxor \T3, \T4, \T4
1371 vpclmulqdq $0x00, \T5, \T1, \T3
1372 vpxor \T3, \T7, \T7
1373
1374 vpshufd $0b01001110, \T1, \T3
1375 vpxor \T1, \T3, \T3
1376 vmovdqu HashKey_6_k(arg2), \T5
1377 vpclmulqdq $0x10, \T5, \T3, \T3
1378 vpxor \T3, \T6, \T6
1379
1380 vmovdqu 16*5(arg1), \T1
1381 vaesenc \T1, \XMM1, \XMM1
1382 vaesenc \T1, \XMM2, \XMM2
1383 vaesenc \T1, \XMM3, \XMM3
1384 vaesenc \T1, \XMM4, \XMM4
1385 vaesenc \T1, \XMM5, \XMM5
1386 vaesenc \T1, \XMM6, \XMM6
1387 vaesenc \T1, \XMM7, \XMM7
1388 vaesenc \T1, \XMM8, \XMM8
1389
1390 vmovdqa TMP4(%rsp), \T1
1391 vmovdqu HashKey_5(arg2), \T5
1392 vpclmulqdq $0x11, \T5, \T1, \T3
1393 vpxor \T3, \T4, \T4
1394 vpclmulqdq $0x00, \T5, \T1, \T3
1395 vpxor \T3, \T7, \T7
1396
1397 vpshufd $0b01001110, \T1, \T3
1398 vpxor \T1, \T3, \T3
1399 vmovdqu HashKey_5_k(arg2), \T5
1400 vpclmulqdq $0x10, \T5, \T3, \T3
1401 vpxor \T3, \T6, \T6
1402
1403 vmovdqu 16*6(arg1), \T1
1404 vaesenc \T1, \XMM1, \XMM1
1405 vaesenc \T1, \XMM2, \XMM2
1406 vaesenc \T1, \XMM3, \XMM3
1407 vaesenc \T1, \XMM4, \XMM4
1408 vaesenc \T1, \XMM5, \XMM5
1409 vaesenc \T1, \XMM6, \XMM6
1410 vaesenc \T1, \XMM7, \XMM7
1411 vaesenc \T1, \XMM8, \XMM8
1412
1413
1414 vmovdqa TMP5(%rsp), \T1
1415 vmovdqu HashKey_4(arg2), \T5
1416 vpclmulqdq $0x11, \T5, \T1, \T3
1417 vpxor \T3, \T4, \T4
1418 vpclmulqdq $0x00, \T5, \T1, \T3
1419 vpxor \T3, \T7, \T7
1420
1421 vpshufd $0b01001110, \T1, \T3
1422 vpxor \T1, \T3, \T3
1423 vmovdqu HashKey_4_k(arg2), \T5
1424 vpclmulqdq $0x10, \T5, \T3, \T3
1425 vpxor \T3, \T6, \T6
1426
1427 vmovdqu 16*7(arg1), \T1
1428 vaesenc \T1, \XMM1, \XMM1
1429 vaesenc \T1, \XMM2, \XMM2
1430 vaesenc \T1, \XMM3, \XMM3
1431 vaesenc \T1, \XMM4, \XMM4
1432 vaesenc \T1, \XMM5, \XMM5
1433 vaesenc \T1, \XMM6, \XMM6
1434 vaesenc \T1, \XMM7, \XMM7
1435 vaesenc \T1, \XMM8, \XMM8
1436
1437 vmovdqa TMP6(%rsp), \T1
1438 vmovdqu HashKey_3(arg2), \T5
1439 vpclmulqdq $0x11, \T5, \T1, \T3
1440 vpxor \T3, \T4, \T4
1441 vpclmulqdq $0x00, \T5, \T1, \T3
1442 vpxor \T3, \T7, \T7
1443
1444 vpshufd $0b01001110, \T1, \T3
1445 vpxor \T1, \T3, \T3
1446 vmovdqu HashKey_3_k(arg2), \T5
1447 vpclmulqdq $0x10, \T5, \T3, \T3
1448 vpxor \T3, \T6, \T6
1449
1450
1451 vmovdqu 16*8(arg1), \T1
1452 vaesenc \T1, \XMM1, \XMM1
1453 vaesenc \T1, \XMM2, \XMM2
1454 vaesenc \T1, \XMM3, \XMM3
1455 vaesenc \T1, \XMM4, \XMM4
1456 vaesenc \T1, \XMM5, \XMM5
1457 vaesenc \T1, \XMM6, \XMM6
1458 vaesenc \T1, \XMM7, \XMM7
1459 vaesenc \T1, \XMM8, \XMM8
1460
1461 vmovdqa TMP7(%rsp), \T1
1462 vmovdqu HashKey_2(arg2), \T5
1463 vpclmulqdq $0x11, \T5, \T1, \T3
1464 vpxor \T3, \T4, \T4
1465 vpclmulqdq $0x00, \T5, \T1, \T3
1466 vpxor \T3, \T7, \T7
1467
1468 vpshufd $0b01001110, \T1, \T3
1469 vpxor \T1, \T3, \T3
1470 vmovdqu HashKey_2_k(arg2), \T5
1471 vpclmulqdq $0x10, \T5, \T3, \T3
1472 vpxor \T3, \T6, \T6
1473
1474 #######################################################################
1475
1476 vmovdqu 16*9(arg1), \T5
1477 vaesenc \T5, \XMM1, \XMM1
1478 vaesenc \T5, \XMM2, \XMM2
1479 vaesenc \T5, \XMM3, \XMM3
1480 vaesenc \T5, \XMM4, \XMM4
1481 vaesenc \T5, \XMM5, \XMM5
1482 vaesenc \T5, \XMM6, \XMM6
1483 vaesenc \T5, \XMM7, \XMM7
1484 vaesenc \T5, \XMM8, \XMM8
1485
1486 vmovdqa TMP8(%rsp), \T1
1487 vmovdqu HashKey(arg2), \T5
1488 vpclmulqdq $0x11, \T5, \T1, \T3
1489 vpxor \T3, \T4, \T4
1490 vpclmulqdq $0x00, \T5, \T1, \T3
1491 vpxor \T3, \T7, \T7
1492
1493 vpshufd $0b01001110, \T1, \T3
1494 vpxor \T1, \T3, \T3
1495 vmovdqu HashKey_k(arg2), \T5
1496 vpclmulqdq $0x10, \T5, \T3, \T3
1497 vpxor \T3, \T6, \T6
1498
1499 vpxor \T4, \T6, \T6
1500 vpxor \T7, \T6, \T6
1501
1502 vmovdqu 16*10(arg1), \T5
1503
1504 i = 11
1505 setreg
1506 .rep (\REP-9)
1507
1508 vaesenc \T5, \XMM1, \XMM1
1509 vaesenc \T5, \XMM2, \XMM2
1510 vaesenc \T5, \XMM3, \XMM3
1511 vaesenc \T5, \XMM4, \XMM4
1512 vaesenc \T5, \XMM5, \XMM5
1513 vaesenc \T5, \XMM6, \XMM6
1514 vaesenc \T5, \XMM7, \XMM7
1515 vaesenc \T5, \XMM8, \XMM8
1516
1517 vmovdqu 16*i(arg1), \T5
1518 i = i + 1
1519 setreg
1520 .endr
1521
1522 i = 0
1523 j = 1
1524 setreg
1525 .rep 8
1526 vpxor 16*i(arg4, %r11), \T5, \T2
1527 .if \ENC_DEC == ENC
1528 vaesenclast \T2, reg_j, reg_j
1529 .else
1530 vaesenclast \T2, reg_j, \T3
1531 vmovdqu 16*i(arg4, %r11), reg_j
1532 vmovdqu \T3, 16*i(arg3, %r11)
1533 .endif
1534 i = (i+1)
1535 j = (j+1)
1536 setreg
1537 .endr
1538 #######################################################################
1539
1540
1541 vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
vpsrldq $8, \T6, \T6 # shift-R T6 2 DWs
1543 vpxor \T3, \T7, \T7
1544 vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
1545
1546
1547
1548 #######################################################################
1549
1550 #######################################################################
vpslld $31, \T7, \T2 # packed left shift << 31
vpslld $30, \T7, \T3 # packed left shift << 30
vpslld $25, \T7, \T4 # packed left shift << 25
1554
1555 vpxor \T3, \T2, \T2 # xor the shifted versions
1556 vpxor \T4, \T2, \T2
1557
1558 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1559
1560 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1561 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1562 #######################################################################
1563 .if \ENC_DEC == ENC
1564 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
1565 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
1566 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
1567 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
1568 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
1569 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
1570 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
1571 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
1572 .endif
1573
1574 #######################################################################
1575
vpsrld $1, \T7, \T2 # packed right shift >> 1
vpsrld $2, \T7, \T3 # packed right shift >> 2
vpsrld $7, \T7, \T4 # packed right shift >> 7
1579 vpxor \T3, \T2, \T2 # xor the shifted versions
1580 vpxor \T4, \T2, \T2
1581
1582 vpxor \T1, \T2, \T2
1583 vpxor \T2, \T7, \T7
1584 vpxor \T7, \T6, \T6 # the result is in T6
1585 #######################################################################
1586
1587 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
1588 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
1589 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
1590 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
1591 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
1592 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
1593 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
1594 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
1595
1596
1597 vpxor \T6, \XMM1, \XMM1
1598
1599
1600
1601 .endm
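# GHASH_8_ENCRYPT_8_PARALLEL_AVX interleaves the AES rounds of eight new
# counter blocks with the GHASH multiplications of the eight previous
# ciphertext blocks saved in TMP1..TMP8, so the aesenc and pclmulqdq units
# are kept busy within the same loop iteration.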
1602
1603
1604
1605 .macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
1606
1607 ## Karatsuba Method
1608
1609
1610 vpshufd $0b01001110, \XMM1, \T2
1611 vpxor \XMM1, \T2, \T2
1612 vmovdqu HashKey_8(arg2), \T5
1613 vpclmulqdq $0x11, \T5, \XMM1, \T6
1614 vpclmulqdq $0x00, \T5, \XMM1, \T7
1615
1616 vmovdqu HashKey_8_k(arg2), \T3
1617 vpclmulqdq $0x00, \T3, \T2, \XMM1
1618
1619 ######################
1620
1621 vpshufd $0b01001110, \XMM2, \T2
1622 vpxor \XMM2, \T2, \T2
1623 vmovdqu HashKey_7(arg2), \T5
1624 vpclmulqdq $0x11, \T5, \XMM2, \T4
1625 vpxor \T4, \T6, \T6
1626
1627 vpclmulqdq $0x00, \T5, \XMM2, \T4
1628 vpxor \T4, \T7, \T7
1629
1630 vmovdqu HashKey_7_k(arg2), \T3
1631 vpclmulqdq $0x00, \T3, \T2, \T2
1632 vpxor \T2, \XMM1, \XMM1
1633
1634 ######################
1635
1636 vpshufd $0b01001110, \XMM3, \T2
1637 vpxor \XMM3, \T2, \T2
1638 vmovdqu HashKey_6(arg2), \T5
1639 vpclmulqdq $0x11, \T5, \XMM3, \T4
1640 vpxor \T4, \T6, \T6
1641
1642 vpclmulqdq $0x00, \T5, \XMM3, \T4
1643 vpxor \T4, \T7, \T7
1644
1645 vmovdqu HashKey_6_k(arg2), \T3
1646 vpclmulqdq $0x00, \T3, \T2, \T2
1647 vpxor \T2, \XMM1, \XMM1
1648
1649 ######################
1650
1651 vpshufd $0b01001110, \XMM4, \T2
1652 vpxor \XMM4, \T2, \T2
1653 vmovdqu HashKey_5(arg2), \T5
1654 vpclmulqdq $0x11, \T5, \XMM4, \T4
1655 vpxor \T4, \T6, \T6
1656
1657 vpclmulqdq $0x00, \T5, \XMM4, \T4
1658 vpxor \T4, \T7, \T7
1659
1660 vmovdqu HashKey_5_k(arg2), \T3
1661 vpclmulqdq $0x00, \T3, \T2, \T2
1662 vpxor \T2, \XMM1, \XMM1
1663
1664 ######################
1665
1666 vpshufd $0b01001110, \XMM5, \T2
1667 vpxor \XMM5, \T2, \T2
1668 vmovdqu HashKey_4(arg2), \T5
1669 vpclmulqdq $0x11, \T5, \XMM5, \T4
1670 vpxor \T4, \T6, \T6
1671
1672 vpclmulqdq $0x00, \T5, \XMM5, \T4
1673 vpxor \T4, \T7, \T7
1674
1675 vmovdqu HashKey_4_k(arg2), \T3
1676 vpclmulqdq $0x00, \T3, \T2, \T2
1677 vpxor \T2, \XMM1, \XMM1
1678
1679 ######################
1680
1681 vpshufd $0b01001110, \XMM6, \T2
1682 vpxor \XMM6, \T2, \T2
1683 vmovdqu HashKey_3(arg2), \T5
1684 vpclmulqdq $0x11, \T5, \XMM6, \T4
1685 vpxor \T4, \T6, \T6
1686
1687 vpclmulqdq $0x00, \T5, \XMM6, \T4
1688 vpxor \T4, \T7, \T7
1689
1690 vmovdqu HashKey_3_k(arg2), \T3
1691 vpclmulqdq $0x00, \T3, \T2, \T2
1692 vpxor \T2, \XMM1, \XMM1
1693
1694 ######################
1695
1696 vpshufd $0b01001110, \XMM7, \T2
1697 vpxor \XMM7, \T2, \T2
1698 vmovdqu HashKey_2(arg2), \T5
1699 vpclmulqdq $0x11, \T5, \XMM7, \T4
1700 vpxor \T4, \T6, \T6
1701
1702 vpclmulqdq $0x00, \T5, \XMM7, \T4
1703 vpxor \T4, \T7, \T7
1704
1705 vmovdqu HashKey_2_k(arg2), \T3
1706 vpclmulqdq $0x00, \T3, \T2, \T2
1707 vpxor \T2, \XMM1, \XMM1
1708
1709 ######################
1710
1711 vpshufd $0b01001110, \XMM8, \T2
1712 vpxor \XMM8, \T2, \T2
1713 vmovdqu HashKey(arg2), \T5
1714 vpclmulqdq $0x11, \T5, \XMM8, \T4
1715 vpxor \T4, \T6, \T6
1716
1717 vpclmulqdq $0x00, \T5, \XMM8, \T4
1718 vpxor \T4, \T7, \T7
1719
1720 vmovdqu HashKey_k(arg2), \T3
1721 vpclmulqdq $0x00, \T3, \T2, \T2
1722
1723 vpxor \T2, \XMM1, \XMM1
1724 vpxor \T6, \XMM1, \XMM1
1725 vpxor \T7, \XMM1, \T2
1726
1727
1728
1729
1730 vpslldq $8, \T2, \T4
1731 vpsrldq $8, \T2, \T2
1732
1733 vpxor \T4, \T7, \T7
vpxor \T2, \T6, \T6                   # <T6:T7> holds the result of the
                                      # accumulated carry-less multiplications
1736
1737 #######################################################################
1738
vpslld $31, \T7, \T2 # packed left shift << 31
vpslld $30, \T7, \T3 # packed left shift << 30
vpslld $25, \T7, \T4 # packed left shift << 25
1742
1743 vpxor \T3, \T2, \T2 # xor the shifted versions
1744 vpxor \T4, \T2, \T2
1745
1746 vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
1747
1748 vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
1749 vpxor \T2, \T7, \T7 # first phase of the reduction complete
1750 #######################################################################
1751
1752
1753
vpsrld $1, \T7, \T2 # packed right shift >> 1
vpsrld $2, \T7, \T3 # packed right shift >> 2
vpsrld $7, \T7, \T4 # packed right shift >> 7
1757 vpxor \T3, \T2, \T2 # xor the shifted versions
1758 vpxor \T4, \T2, \T2
1759
1760 vpxor \T1, \T2, \T2
1761 vpxor \T2, \T7, \T7
1762 vpxor \T7, \T6, \T6 # the result is in T6
1763
1764 .endm
1765
1766 #############################################################
#void   aesni_gcm_init_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#                        (from Security Association) concatenated with 8 byte
#                        Initialisation Vector (could be IV) concatenated with
#                        0x00000001. */
#        u8      *hash_subkey, /* H, the Hash sub key input. */
#        u8      *aad, /* Additional Authentication Data (AAD) */
#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
1777 #############################################################
1778 ENTRY(aesni_gcm_init_avx_gen2)
1779 FUNC_SAVE
1780 INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
1781 FUNC_RESTORE
1782 ret
1783 ENDPROC(aesni_gcm_init_avx_gen2)
1784
1785 ###############################################################################
#void   aesni_gcm_enc_update_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const u8 *in, /* Plaintext input */
#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
1792 ###############################################################################
1793 ENTRY(aesni_gcm_enc_update_avx_gen2)
1794 FUNC_SAVE
1795 mov keysize, %eax
1796 cmp $32, %eax
1797 je key_256_enc_update
1798 cmp $16, %eax
1799 je key_128_enc_update
1800
1801 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
1802 FUNC_RESTORE
1803 ret
1804 key_128_enc_update:
1805 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
1806 FUNC_RESTORE
1807 ret
1808 key_256_enc_update:
1809 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
1810 FUNC_RESTORE
1811 ret
1812 ENDPROC(aesni_gcm_enc_update_avx_gen2)
1813
1814 ###############################################################################
#void   aesni_gcm_dec_update_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
1821 ###############################################################################
1822 ENTRY(aesni_gcm_dec_update_avx_gen2)
1823 FUNC_SAVE
1824 mov keysize,%eax
1825 cmp $32, %eax
1826 je key_256_dec_update
1827 cmp $16, %eax
1828 je key_128_dec_update
1829
1830 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
1831 FUNC_RESTORE
1832 ret
1833 key_128_dec_update:
1834 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
1835 FUNC_RESTORE
1836 ret
1837 key_256_dec_update:
1838 GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
1839 FUNC_RESTORE
1840 ret
1841 ENDPROC(aesni_gcm_dec_update_avx_gen2)
1842
1843 ###############################################################################
#void   aesni_gcm_finalize_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
1850 ###############################################################################
1851 ENTRY(aesni_gcm_finalize_avx_gen2)
1852 FUNC_SAVE
1853 mov keysize,%eax
1854 cmp $32, %eax
1855 je key_256_finalize
1856 cmp $16, %eax
1857 je key_128_finalize
1858
1859 GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
1860 FUNC_RESTORE
1861 ret
1862 key_128_finalize:
1863 GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
1864 FUNC_RESTORE
1865 ret
1866 key_256_finalize:
1867 GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
1868 FUNC_RESTORE
1869 ret
1870 ENDPROC(aesni_gcm_finalize_avx_gen2)
1871
1872 #endif
1873
1874 #ifdef CONFIG_AS_AVX2
1875 ###############################################################################
# GHASH_MUL macro (AVX2 variant): carry-less multiply of the two 128-bit
# inputs GH and HK, reduced modulo the GCM polynomial
# x^128 + x^127 + x^126 + x^121 + 1.  HK is expected to be HashKey<<1 mod poly
# (as stored by INIT/PRECOMPUTE), making the result GH * HashKey mod poly.
# T1..T5 are clobbered as temporaries.
1881 ###############################################################################
1882 .macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5
1883
1884 vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1
1885 vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0
1886 vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0
1887 vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1
1888 vpxor \T3, \GH, \GH
1889
1890
1891 vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs
1892 vpslldq $8 , \GH, \GH # shift-L GH 2 DWs
1893
1894 vpxor \T3, \T1, \T1
1895 vpxor \T2, \GH, \GH
1896
1897 #######################################################################
1898
1899 vmovdqa POLY2(%rip), \T3
1900
1901 vpclmulqdq $0x01, \GH, \T3, \T2
1902 vpslldq $8, \T2, \T2 # shift-L T2 2 DWs
1903
1904 vpxor \T2, \GH, \GH # first phase of the reduction complete
1905 #######################################################################
1906
1907 vpclmulqdq $0x00, \GH, \T3, \T2
1908 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
1909
1910 vpclmulqdq $0x10, \GH, \T3, \GH
1911 vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)
1912
1913 vpxor \T2, \GH, \GH # second phase of the reduction complete
1914 #######################################################################
1915 vpxor \T1, \GH, \GH # the result is in GH
1916
1917
1918 .endm
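# Note: unlike GHASH_MUL_AVX above, this variant performs the reduction with
# two carry-less multiplications against the precomputed POLY2 constant
# instead of the shift/xor sequences, trading vector shifts for pclmulqdq.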
1919
1920 .macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6
1921
1922
1923 vmovdqa \HK, \T5
1924 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
1925 vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
1926
1927 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
1928 vmovdqu \T5, HashKey_3(arg2)
1929
1930 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
1931 vmovdqu \T5, HashKey_4(arg2)
1932
1933 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
1934 vmovdqu \T5, HashKey_5(arg2)
1935
1936 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
1937 vmovdqu \T5, HashKey_6(arg2)
1938
1939 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
1940 vmovdqu \T5, HashKey_7(arg2)
1941
1942 GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
1943 vmovdqu \T5, HashKey_8(arg2)
1944
1945 .endm
1946
## if a = number of total plaintext bytes
##    b = floor(a/16)
##    num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as pointers only, not modified
1953
1954 .macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1955 i = (8-\num_initial_blocks)
1956 setreg
1957 vmovdqu AadHash(arg2), reg_i
1958
1959
1960 vmovdqu CurCount(arg2), \CTR
1961
1962 i = (9-\num_initial_blocks)
1963 setreg
1964 .rep \num_initial_blocks
1965 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
1966 vmovdqa \CTR, reg_i
1967 vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap
1968 i = (i+1)
1969 setreg
1970 .endr
1971
1972 vmovdqa (arg1), \T_key
1973 i = (9-\num_initial_blocks)
1974 setreg
1975 .rep \num_initial_blocks
1976 vpxor \T_key, reg_i, reg_i
1977 i = (i+1)
1978 setreg
1979 .endr
1980
1981 j = 1
1982 setreg
1983 .rep \REP
1984 vmovdqa 16*j(arg1), \T_key
1985 i = (9-\num_initial_blocks)
1986 setreg
1987 .rep \num_initial_blocks
1988 vaesenc \T_key, reg_i, reg_i
1989 i = (i+1)
1990 setreg
1991 .endr
1992
1993 j = (j+1)
1994 setreg
1995 .endr
1996
1997
1998 vmovdqa 16*j(arg1), \T_key
1999 i = (9-\num_initial_blocks)
2000 setreg
2001 .rep \num_initial_blocks
2002 vaesenclast \T_key, reg_i, reg_i
2003 i = (i+1)
2004 setreg
2005 .endr
2006
2007 i = (9-\num_initial_blocks)
2008 setreg
2009 .rep \num_initial_blocks
2010 vmovdqu (arg4, %r11), \T1
2011 vpxor \T1, reg_i, reg_i
vmovdqu reg_i, (arg3 , %r11)          # write back ciphertext for
                                      # num_initial_blocks blocks
2014 add $16, %r11
2015 .if \ENC_DEC == DEC
2016 vmovdqa \T1, reg_i
2017 .endif
2018 vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations
2019 i = (i+1)
2020 setreg
2021 .endr
2022
2023
2024 i = (8-\num_initial_blocks)
2025 j = (9-\num_initial_blocks)
2026 setreg
2027
2028 .rep \num_initial_blocks
2029 vpxor reg_i, reg_j, reg_j
2030 GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
2031 i = (i+1)
2032 j = (j+1)
2033 setreg
2034 .endr
2035
2036
2037 vmovdqa \XMM8, TMP1(%rsp)
2038 vmovdqa \XMM8, \T3
2039
2040 cmp $128, %r13
2041 jl _initial_blocks_done\@ # no need for precomputed constants
2042
2043 ###############################################################################
2044
2045 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2046 vmovdqa \CTR, \XMM1
2047 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2048
2049 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2050 vmovdqa \CTR, \XMM2
2051 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2052
2053 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2054 vmovdqa \CTR, \XMM3
2055 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2056
2057 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2058 vmovdqa \CTR, \XMM4
2059 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2060
2061 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2062 vmovdqa \CTR, \XMM5
2063 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2064
2065 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2066 vmovdqa \CTR, \XMM6
2067 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2068
2069 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2070 vmovdqa \CTR, \XMM7
2071 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2072
2073 vpaddd ONE(%rip), \CTR, \CTR # INCR Y0
2074 vmovdqa \CTR, \XMM8
2075 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2076
2077 vmovdqa (arg1), \T_key
2078 vpxor \T_key, \XMM1, \XMM1
2079 vpxor \T_key, \XMM2, \XMM2
2080 vpxor \T_key, \XMM3, \XMM3
2081 vpxor \T_key, \XMM4, \XMM4
2082 vpxor \T_key, \XMM5, \XMM5
2083 vpxor \T_key, \XMM6, \XMM6
2084 vpxor \T_key, \XMM7, \XMM7
2085 vpxor \T_key, \XMM8, \XMM8
2086
2087 i = 1
2088 setreg
2089 .rep \REP # do REP rounds
2090 vmovdqa 16*i(arg1), \T_key
2091 vaesenc \T_key, \XMM1, \XMM1
2092 vaesenc \T_key, \XMM2, \XMM2
2093 vaesenc \T_key, \XMM3, \XMM3
2094 vaesenc \T_key, \XMM4, \XMM4
2095 vaesenc \T_key, \XMM5, \XMM5
2096 vaesenc \T_key, \XMM6, \XMM6
2097 vaesenc \T_key, \XMM7, \XMM7
2098 vaesenc \T_key, \XMM8, \XMM8
2099 i = (i+1)
2100 setreg
2101 .endr
2102
2103
2104 vmovdqa 16*i(arg1), \T_key
2105 vaesenclast \T_key, \XMM1, \XMM1
2106 vaesenclast \T_key, \XMM2, \XMM2
2107 vaesenclast \T_key, \XMM3, \XMM3
2108 vaesenclast \T_key, \XMM4, \XMM4
2109 vaesenclast \T_key, \XMM5, \XMM5
2110 vaesenclast \T_key, \XMM6, \XMM6
2111 vaesenclast \T_key, \XMM7, \XMM7
2112 vaesenclast \T_key, \XMM8, \XMM8
2113
2114 vmovdqu (arg4, %r11), \T1
2115 vpxor \T1, \XMM1, \XMM1
2116 vmovdqu \XMM1, (arg3 , %r11)
2117 .if \ENC_DEC == DEC
2118 vmovdqa \T1, \XMM1
2119 .endif
2120
2121 vmovdqu 16*1(arg4, %r11), \T1
2122 vpxor \T1, \XMM2, \XMM2
2123 vmovdqu \XMM2, 16*1(arg3 , %r11)
2124 .if \ENC_DEC == DEC
2125 vmovdqa \T1, \XMM2
2126 .endif
2127
2128 vmovdqu 16*2(arg4, %r11), \T1
2129 vpxor \T1, \XMM3, \XMM3
2130 vmovdqu \XMM3, 16*2(arg3 , %r11)
2131 .if \ENC_DEC == DEC
2132 vmovdqa \T1, \XMM3
2133 .endif
2134
2135 vmovdqu 16*3(arg4, %r11), \T1
2136 vpxor \T1, \XMM4, \XMM4
2137 vmovdqu \XMM4, 16*3(arg3 , %r11)
2138 .if \ENC_DEC == DEC
2139 vmovdqa \T1, \XMM4
2140 .endif
2141
2142 vmovdqu 16*4(arg4, %r11), \T1
2143 vpxor \T1, \XMM5, \XMM5
2144 vmovdqu \XMM5, 16*4(arg3 , %r11)
2145 .if \ENC_DEC == DEC
2146 vmovdqa \T1, \XMM5
2147 .endif
2148
2149 vmovdqu 16*5(arg4, %r11), \T1
2150 vpxor \T1, \XMM6, \XMM6
2151 vmovdqu \XMM6, 16*5(arg3 , %r11)
2152 .if \ENC_DEC == DEC
2153 vmovdqa \T1, \XMM6
2154 .endif
2155
2156 vmovdqu 16*6(arg4, %r11), \T1
2157 vpxor \T1, \XMM7, \XMM7
2158 vmovdqu \XMM7, 16*6(arg3 , %r11)
2159 .if \ENC_DEC == DEC
2160 vmovdqa \T1, \XMM7
2161 .endif
2162
2163 vmovdqu 16*7(arg4, %r11), \T1
2164 vpxor \T1, \XMM8, \XMM8
2165 vmovdqu \XMM8, 16*7(arg3 , %r11)
2166 .if \ENC_DEC == DEC
2167 vmovdqa \T1, \XMM8
2168 .endif
2169
2170 add $128, %r11
2171
2172 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
vpxor TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
2174
2175 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2176 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2177 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2178 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2179 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2180 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2181 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2182
2183 ###############################################################################
2184
2185 _initial_blocks_done\@:
2186
2187
2188 .endm
2189
2190
2191
# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
# arg1, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
2196 .macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
2197
2198 vmovdqa \XMM1, \T2
2199 vmovdqa \XMM2, TMP2(%rsp)
2200 vmovdqa \XMM3, TMP3(%rsp)
2201 vmovdqa \XMM4, TMP4(%rsp)
2202 vmovdqa \XMM5, TMP5(%rsp)
2203 vmovdqa \XMM6, TMP6(%rsp)
2204 vmovdqa \XMM7, TMP7(%rsp)
2205 vmovdqa \XMM8, TMP8(%rsp)
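# the 8 ciphertext blocks produced by the previous pass are now staged in \T2 and
# TMP2..TMP8; their GHASH is interleaved with the AES rounds of the next 8 counter
# blocks below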
2206
2207 .if \loop_idx == in_order
2208 vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT
2209 vpaddd ONE(%rip), \XMM1, \XMM2
2210 vpaddd ONE(%rip), \XMM2, \XMM3
2211 vpaddd ONE(%rip), \XMM3, \XMM4
2212 vpaddd ONE(%rip), \XMM4, \XMM5
2213 vpaddd ONE(%rip), \XMM5, \XMM6
2214 vpaddd ONE(%rip), \XMM6, \XMM7
2215 vpaddd ONE(%rip), \XMM7, \XMM8
2216 vmovdqa \XMM8, \CTR
2217
2218 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2219 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2220 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2221 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2222 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2223 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2224 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2225 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2226 .else
2227 vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT
2228 vpaddd ONEf(%rip), \XMM1, \XMM2
2229 vpaddd ONEf(%rip), \XMM2, \XMM3
2230 vpaddd ONEf(%rip), \XMM3, \XMM4
2231 vpaddd ONEf(%rip), \XMM4, \XMM5
2232 vpaddd ONEf(%rip), \XMM5, \XMM6
2233 vpaddd ONEf(%rip), \XMM6, \XMM7
2234 vpaddd ONEf(%rip), \XMM7, \XMM8
2235 vmovdqa \XMM8, \CTR
2236 .endif
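# in_order: the counters are incremented in their native form and each block is
# byte-swapped for AES; otherwise ONEf adds the increment directly in the already
# byte-swapped layout, saving the eight vpshufb instructions per pass (the caller
# is expected to take the in_order path when the low counter byte is close to wrapping)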
2237
2238
2239 #######################################################################
2240
2241 vmovdqu (arg1), \T1
2242 vpxor \T1, \XMM1, \XMM1
2243 vpxor \T1, \XMM2, \XMM2
2244 vpxor \T1, \XMM3, \XMM3
2245 vpxor \T1, \XMM4, \XMM4
2246 vpxor \T1, \XMM5, \XMM5
2247 vpxor \T1, \XMM6, \XMM6
2248 vpxor \T1, \XMM7, \XMM7
2249 vpxor \T1, \XMM8, \XMM8
2250
2251 #######################################################################
2252
2253
2254
2255
2256
2257 vmovdqu 16*1(arg1), \T1
2258 vaesenc \T1, \XMM1, \XMM1
2259 vaesenc \T1, \XMM2, \XMM2
2260 vaesenc \T1, \XMM3, \XMM3
2261 vaesenc \T1, \XMM4, \XMM4
2262 vaesenc \T1, \XMM5, \XMM5
2263 vaesenc \T1, \XMM6, \XMM6
2264 vaesenc \T1, \XMM7, \XMM7
2265 vaesenc \T1, \XMM8, \XMM8
2266
2267 vmovdqu 16*2(arg1), \T1
2268 vaesenc \T1, \XMM1, \XMM1
2269 vaesenc \T1, \XMM2, \XMM2
2270 vaesenc \T1, \XMM3, \XMM3
2271 vaesenc \T1, \XMM4, \XMM4
2272 vaesenc \T1, \XMM5, \XMM5
2273 vaesenc \T1, \XMM6, \XMM6
2274 vaesenc \T1, \XMM7, \XMM7
2275 vaesenc \T1, \XMM8, \XMM8
2276
2277
2278 #######################################################################
2279
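# fold the 8 saved ciphertext blocks into the hash, interleaved with the AES rounds.
# GHASH is a polynomial in H evaluated Horner-style, so the aggregated update is
#   X1*H^8 + X2*H^7 + ... + X8*H      (all products carry-less, in GF(2^128))
# with X1 (in \T2) the oldest block and X8 (in TMP8) the newest. Each 128x128
# product is built from four 64x64 vpclmulqdq results: a1*b1 (high, into T4),
# a0*b0 (low, into T7) and a1*b0 + a0*b1 (middle, into T6)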
2280 vmovdqu HashKey_8(arg2), \T5
2281 vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
2282 vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
2283 vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
2284 vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1
2285 vpxor \T5, \T6, \T6
2286
2287 vmovdqu 16*3(arg1), \T1
2288 vaesenc \T1, \XMM1, \XMM1
2289 vaesenc \T1, \XMM2, \XMM2
2290 vaesenc \T1, \XMM3, \XMM3
2291 vaesenc \T1, \XMM4, \XMM4
2292 vaesenc \T1, \XMM5, \XMM5
2293 vaesenc \T1, \XMM6, \XMM6
2294 vaesenc \T1, \XMM7, \XMM7
2295 vaesenc \T1, \XMM8, \XMM8
2296
2297 vmovdqa TMP2(%rsp), \T1
2298 vmovdqu HashKey_7(arg2), \T5
2299 vpclmulqdq $0x11, \T5, \T1, \T3
2300 vpxor \T3, \T4, \T4
2301
2302 vpclmulqdq $0x00, \T5, \T1, \T3
2303 vpxor \T3, \T7, \T7
2304
2305 vpclmulqdq $0x01, \T5, \T1, \T3
2306 vpxor \T3, \T6, \T6
2307
2308 vpclmulqdq $0x10, \T5, \T1, \T3
2309 vpxor \T3, \T6, \T6
2310
2311 vmovdqu 16*4(arg1), \T1
2312 vaesenc \T1, \XMM1, \XMM1
2313 vaesenc \T1, \XMM2, \XMM2
2314 vaesenc \T1, \XMM3, \XMM3
2315 vaesenc \T1, \XMM4, \XMM4
2316 vaesenc \T1, \XMM5, \XMM5
2317 vaesenc \T1, \XMM6, \XMM6
2318 vaesenc \T1, \XMM7, \XMM7
2319 vaesenc \T1, \XMM8, \XMM8
2320
2321 #######################################################################
2322
2323 vmovdqa TMP3(%rsp), \T1
2324 vmovdqu HashKey_6(arg2), \T5
2325 vpclmulqdq $0x11, \T5, \T1, \T3
2326 vpxor \T3, \T4, \T4
2327
2328 vpclmulqdq $0x00, \T5, \T1, \T3
2329 vpxor \T3, \T7, \T7
2330
2331 vpclmulqdq $0x01, \T5, \T1, \T3
2332 vpxor \T3, \T6, \T6
2333
2334 vpclmulqdq $0x10, \T5, \T1, \T3
2335 vpxor \T3, \T6, \T6
2336
2337 vmovdqu 16*5(arg1), \T1
2338 vaesenc \T1, \XMM1, \XMM1
2339 vaesenc \T1, \XMM2, \XMM2
2340 vaesenc \T1, \XMM3, \XMM3
2341 vaesenc \T1, \XMM4, \XMM4
2342 vaesenc \T1, \XMM5, \XMM5
2343 vaesenc \T1, \XMM6, \XMM6
2344 vaesenc \T1, \XMM7, \XMM7
2345 vaesenc \T1, \XMM8, \XMM8
2346
2347 vmovdqa TMP4(%rsp), \T1
2348 vmovdqu HashKey_5(arg2), \T5
2349 vpclmulqdq $0x11, \T5, \T1, \T3
2350 vpxor \T3, \T4, \T4
2351
2352 vpclmulqdq $0x00, \T5, \T1, \T3
2353 vpxor \T3, \T7, \T7
2354
2355 vpclmulqdq $0x01, \T5, \T1, \T3
2356 vpxor \T3, \T6, \T6
2357
2358 vpclmulqdq $0x10, \T5, \T1, \T3
2359 vpxor \T3, \T6, \T6
2360
2361 vmovdqu 16*6(arg1), \T1
2362 vaesenc \T1, \XMM1, \XMM1
2363 vaesenc \T1, \XMM2, \XMM2
2364 vaesenc \T1, \XMM3, \XMM3
2365 vaesenc \T1, \XMM4, \XMM4
2366 vaesenc \T1, \XMM5, \XMM5
2367 vaesenc \T1, \XMM6, \XMM6
2368 vaesenc \T1, \XMM7, \XMM7
2369 vaesenc \T1, \XMM8, \XMM8
2370
2371
2372 vmovdqa TMP5(%rsp), \T1
2373 vmovdqu HashKey_4(arg2), \T5
2374 vpclmulqdq $0x11, \T5, \T1, \T3
2375 vpxor \T3, \T4, \T4
2376
2377 vpclmulqdq $0x00, \T5, \T1, \T3
2378 vpxor \T3, \T7, \T7
2379
2380 vpclmulqdq $0x01, \T5, \T1, \T3
2381 vpxor \T3, \T6, \T6
2382
2383 vpclmulqdq $0x10, \T5, \T1, \T3
2384 vpxor \T3, \T6, \T6
2385
2386 vmovdqu 16*7(arg1), \T1
2387 vaesenc \T1, \XMM1, \XMM1
2388 vaesenc \T1, \XMM2, \XMM2
2389 vaesenc \T1, \XMM3, \XMM3
2390 vaesenc \T1, \XMM4, \XMM4
2391 vaesenc \T1, \XMM5, \XMM5
2392 vaesenc \T1, \XMM6, \XMM6
2393 vaesenc \T1, \XMM7, \XMM7
2394 vaesenc \T1, \XMM8, \XMM8
2395
2396 vmovdqa TMP6(%rsp), \T1
2397 vmovdqu HashKey_3(arg2), \T5
2398 vpclmulqdq $0x11, \T5, \T1, \T3
2399 vpxor \T3, \T4, \T4
2400
2401 vpclmulqdq $0x00, \T5, \T1, \T3
2402 vpxor \T3, \T7, \T7
2403
2404 vpclmulqdq $0x01, \T5, \T1, \T3
2405 vpxor \T3, \T6, \T6
2406
2407 vpclmulqdq $0x10, \T5, \T1, \T3
2408 vpxor \T3, \T6, \T6
2409
2410 vmovdqu 16*8(arg1), \T1
2411 vaesenc \T1, \XMM1, \XMM1
2412 vaesenc \T1, \XMM2, \XMM2
2413 vaesenc \T1, \XMM3, \XMM3
2414 vaesenc \T1, \XMM4, \XMM4
2415 vaesenc \T1, \XMM5, \XMM5
2416 vaesenc \T1, \XMM6, \XMM6
2417 vaesenc \T1, \XMM7, \XMM7
2418 vaesenc \T1, \XMM8, \XMM8
2419
2420 vmovdqa TMP7(%rsp), \T1
2421 vmovdqu HashKey_2(arg2), \T5
2422 vpclmulqdq $0x11, \T5, \T1, \T3
2423 vpxor \T3, \T4, \T4
2424
2425 vpclmulqdq $0x00, \T5, \T1, \T3
2426 vpxor \T3, \T7, \T7
2427
2428 vpclmulqdq $0x01, \T5, \T1, \T3
2429 vpxor \T3, \T6, \T6
2430
2431 vpclmulqdq $0x10, \T5, \T1, \T3
2432 vpxor \T3, \T6, \T6
2433
2434
2435 #######################################################################
2436
2437 vmovdqu 16*9(arg1), \T5
2438 vaesenc \T5, \XMM1, \XMM1
2439 vaesenc \T5, \XMM2, \XMM2
2440 vaesenc \T5, \XMM3, \XMM3
2441 vaesenc \T5, \XMM4, \XMM4
2442 vaesenc \T5, \XMM5, \XMM5
2443 vaesenc \T5, \XMM6, \XMM6
2444 vaesenc \T5, \XMM7, \XMM7
2445 vaesenc \T5, \XMM8, \XMM8
2446
2447 vmovdqa TMP8(%rsp), \T1
2448 vmovdqu HashKey(arg2), \T5
2449
2450 vpclmulqdq $0x00, \T5, \T1, \T3
2451 vpxor \T3, \T7, \T7
2452
2453 vpclmulqdq $0x01, \T5, \T1, \T3
2454 vpxor \T3, \T6, \T6
2455
2456 vpclmulqdq $0x10, \T5, \T1, \T3
2457 vpxor \T3, \T6, \T6
2458
2459 vpclmulqdq $0x11, \T5, \T1, \T3
2460 vpxor \T3, \T4, \T1
2461
2462
2463 vmovdqu 16*10(arg1), \T5
2464
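# this loop issues the extra middle rounds needed for AES-192/256
# (\REP = 9/11/13 for AES-128/192/256); the last round key ends up
# in T5 and feeds the fused vaesenclast step below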
2465 i = 11
2466 setreg
2467 .rep (\REP-9)
2468 vaesenc \T5, \XMM1, \XMM1
2469 vaesenc \T5, \XMM2, \XMM2
2470 vaesenc \T5, \XMM3, \XMM3
2471 vaesenc \T5, \XMM4, \XMM4
2472 vaesenc \T5, \XMM5, \XMM5
2473 vaesenc \T5, \XMM6, \XMM6
2474 vaesenc \T5, \XMM7, \XMM7
2475 vaesenc \T5, \XMM8, \XMM8
2476
2477 vmovdqu 16*i(arg1), \T5
2478 i = i + 1
2479 setreg
2480 .endr
2481
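# final AES round fused with the CTR xor: T5 holds the last round key, so each
# input block is xored with T5 first and the result is used as the vaesenclast
# key operand, which yields AES(counter) xor input (i.e. the CTR output) directly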
2482 i = 0
2483 j = 1
2484 setreg
2485 .rep 8
2486 vpxor 16*i(arg4, %r11), \T5, \T2
2487 .if \ENC_DEC == ENC
2488 vaesenclast \T2, reg_j, reg_j
2489 .else
2490 vaesenclast \T2, reg_j, \T3
2491 vmovdqu 16*i(arg4, %r11), reg_j
2492 vmovdqu \T3, 16*i(arg3, %r11)
2493 .endif
2494 i = (i+1)
2495 j = (j+1)
2496 setreg
2497 .endr
2498 #######################################################################
2499
2500
vpslldq $8, \T6, \T3                            # shift-L T3 2 DWs
vpsrldq $8, \T6, \T6                            # shift-R T6 2 DWs
2503 vpxor \T3, \T7, \T7
2504 vpxor \T6, \T1, \T1 # accumulate the results in T1:T7
2505
2506
2507
2508 #######################################################################
2509
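# reduce the 256-bit carry-less product held in <T1:T7> modulo the GHASH
# polynomial: two vpclmulqdq phases against the precomputed POLY2 constant fold
# T7 into T1, leaving the 128-bit remainder in T1 (xored into XMM1 at the end)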
2510 vmovdqa POLY2(%rip), \T3
2511
2512 vpclmulqdq $0x01, \T7, \T3, \T2
vpslldq $8, \T2, \T2                            # shift-L T2 2 DWs
2514
2515 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2516 #######################################################################
2517 .if \ENC_DEC == ENC
2518 vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
2519 vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
2520 vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
2521 vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
2522 vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
2523 vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
2524 vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
2525 vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
2526 .endif
2527
2528 #######################################################################
2529
2530 vpclmulqdq $0x00, \T7, \T3, \T2
vpsrldq $4, \T2, \T2                            # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2532
2533 vpclmulqdq $0x10, \T7, \T3, \T4
vpslldq $4, \T4, \T4                            # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2535
2536 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2537 #######################################################################
2538 vpxor \T4, \T1, \T1 # the result is in T1
2539
2540 vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
2541 vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
2542 vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
2543 vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
2544 vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
2545 vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
2546 vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
2547 vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
2548
2549
2550 vpxor \T1, \XMM1, \XMM1
2551
2552
2553
2554 .endm
2555
2556
2557
2558 .macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
2559
2560 ## Karatsuba Method
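##
## each block is multiplied by the matching power of the HashKey with only three
## vpclmulqdq per block: for X = x1:x0 and H = h1:h0 (64-bit halves),
##   X*H = x1*h1*2^128 + (x1*h0 + x0*h1)*2^64 + x0*h0        (carry-less)
## and Karatsuba rewrites the middle term as
##   x1*h0 + x0*h1 = (x1+x0)*(h1+h0) + x1*h1 + x0*h0         ('+' is xor)
## T6 accumulates the high products, T7 the low products, and XMM1 the
## (x1+x0)*(h1+h0) products; the three sums are recombined before the reduction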
2561
2562 vmovdqu HashKey_8(arg2), \T5
2563
2564 vpshufd $0b01001110, \XMM1, \T2
2565 vpshufd $0b01001110, \T5, \T3
2566 vpxor \XMM1, \T2, \T2
2567 vpxor \T5, \T3, \T3
2568
2569 vpclmulqdq $0x11, \T5, \XMM1, \T6
2570 vpclmulqdq $0x00, \T5, \XMM1, \T7
2571
2572 vpclmulqdq $0x00, \T3, \T2, \XMM1
2573
2574 ######################
2575
2576 vmovdqu HashKey_7(arg2), \T5
2577 vpshufd $0b01001110, \XMM2, \T2
2578 vpshufd $0b01001110, \T5, \T3
2579 vpxor \XMM2, \T2, \T2
2580 vpxor \T5, \T3, \T3
2581
2582 vpclmulqdq $0x11, \T5, \XMM2, \T4
2583 vpxor \T4, \T6, \T6
2584
2585 vpclmulqdq $0x00, \T5, \XMM2, \T4
2586 vpxor \T4, \T7, \T7
2587
2588 vpclmulqdq $0x00, \T3, \T2, \T2
2589
2590 vpxor \T2, \XMM1, \XMM1
2591
2592 ######################
2593
2594 vmovdqu HashKey_6(arg2), \T5
2595 vpshufd $0b01001110, \XMM3, \T2
2596 vpshufd $0b01001110, \T5, \T3
2597 vpxor \XMM3, \T2, \T2
2598 vpxor \T5, \T3, \T3
2599
2600 vpclmulqdq $0x11, \T5, \XMM3, \T4
2601 vpxor \T4, \T6, \T6
2602
2603 vpclmulqdq $0x00, \T5, \XMM3, \T4
2604 vpxor \T4, \T7, \T7
2605
2606 vpclmulqdq $0x00, \T3, \T2, \T2
2607
2608 vpxor \T2, \XMM1, \XMM1
2609
2610 ######################
2611
2612 vmovdqu HashKey_5(arg2), \T5
2613 vpshufd $0b01001110, \XMM4, \T2
2614 vpshufd $0b01001110, \T5, \T3
2615 vpxor \XMM4, \T2, \T2
2616 vpxor \T5, \T3, \T3
2617
2618 vpclmulqdq $0x11, \T5, \XMM4, \T4
2619 vpxor \T4, \T6, \T6
2620
2621 vpclmulqdq $0x00, \T5, \XMM4, \T4
2622 vpxor \T4, \T7, \T7
2623
2624 vpclmulqdq $0x00, \T3, \T2, \T2
2625
2626 vpxor \T2, \XMM1, \XMM1
2627
2628 ######################
2629
2630 vmovdqu HashKey_4(arg2), \T5
2631 vpshufd $0b01001110, \XMM5, \T2
2632 vpshufd $0b01001110, \T5, \T3
2633 vpxor \XMM5, \T2, \T2
2634 vpxor \T5, \T3, \T3
2635
2636 vpclmulqdq $0x11, \T5, \XMM5, \T4
2637 vpxor \T4, \T6, \T6
2638
2639 vpclmulqdq $0x00, \T5, \XMM5, \T4
2640 vpxor \T4, \T7, \T7
2641
2642 vpclmulqdq $0x00, \T3, \T2, \T2
2643
2644 vpxor \T2, \XMM1, \XMM1
2645
2646 ######################
2647
2648 vmovdqu HashKey_3(arg2), \T5
2649 vpshufd $0b01001110, \XMM6, \T2
2650 vpshufd $0b01001110, \T5, \T3
2651 vpxor \XMM6, \T2, \T2
2652 vpxor \T5, \T3, \T3
2653
2654 vpclmulqdq $0x11, \T5, \XMM6, \T4
2655 vpxor \T4, \T6, \T6
2656
2657 vpclmulqdq $0x00, \T5, \XMM6, \T4
2658 vpxor \T4, \T7, \T7
2659
2660 vpclmulqdq $0x00, \T3, \T2, \T2
2661
2662 vpxor \T2, \XMM1, \XMM1
2663
2664 ######################
2665
2666 vmovdqu HashKey_2(arg2), \T5
2667 vpshufd $0b01001110, \XMM7, \T2
2668 vpshufd $0b01001110, \T5, \T3
2669 vpxor \XMM7, \T2, \T2
2670 vpxor \T5, \T3, \T3
2671
2672 vpclmulqdq $0x11, \T5, \XMM7, \T4
2673 vpxor \T4, \T6, \T6
2674
2675 vpclmulqdq $0x00, \T5, \XMM7, \T4
2676 vpxor \T4, \T7, \T7
2677
2678 vpclmulqdq $0x00, \T3, \T2, \T2
2679
2680 vpxor \T2, \XMM1, \XMM1
2681
2682 ######################
2683
2684 vmovdqu HashKey(arg2), \T5
2685 vpshufd $0b01001110, \XMM8, \T2
2686 vpshufd $0b01001110, \T5, \T3
2687 vpxor \XMM8, \T2, \T2
2688 vpxor \T5, \T3, \T3
2689
2690 vpclmulqdq $0x11, \T5, \XMM8, \T4
2691 vpxor \T4, \T6, \T6
2692
2693 vpclmulqdq $0x00, \T5, \XMM8, \T4
2694 vpxor \T4, \T7, \T7
2695
2696 vpclmulqdq $0x00, \T3, \T2, \T2
2697
2698 vpxor \T2, \XMM1, \XMM1
2699 vpxor \T6, \XMM1, \XMM1
2700 vpxor \T7, \XMM1, \T2
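
# Karatsuba recombination: T2 now holds the true middle 128 bits of the 256-bit
# product (the sum of the (x1+x0)*(h1+h0) terms xored with the high and low sums);
# split it and fold its halves into <T6:T7> before the reduction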
2701
2702
2703
2704
2705 vpslldq $8, \T2, \T4
2706 vpsrldq $8, \T2, \T2
2707
2708 vpxor \T4, \T7, \T7
vpxor \T2, \T6, \T6                     # <T6:T7> holds the result of the accumulated carry-less multiplications
2710
2711
2712 #######################################################################
2713
2714 vmovdqa POLY2(%rip), \T3
2715
2716 vpclmulqdq $0x01, \T7, \T3, \T2
vpslldq $8, \T2, \T2                            # shift-L T2 2 DWs
2718
2719 vpxor \T2, \T7, \T7 # first phase of the reduction complete
2720 #######################################################################
2721
2722
2723
2724 vpclmulqdq $0x00, \T7, \T3, \T2
2725 vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
2726
2727 vpclmulqdq $0x10, \T7, \T3, \T4
2728 vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
2729
2730 vpxor \T2, \T4, \T4 # second phase of the reduction complete
2731 #######################################################################
2732 vpxor \T4, \T6, \T6 # the result is in T6
2733 .endm
2734
2735
2736
2737 #############################################################
2738
#void   aesni_gcm_init_avx_gen4
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#         u8      *iv, /* Pre-counter block j0: 4 byte salt
#                         (from Security Association) concatenated with 8 byte
#                         Initialisation Vector (from IPSec ESP Payload)
#                         concatenated with 0x00000001. 16-byte aligned pointer. */
#         u8      *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#         const   u8 *aad, /* Additional Authentication Data (AAD) */
#         u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
2748 #############################################################
2749 ENTRY(aesni_gcm_init_avx_gen4)
2750 FUNC_SAVE
2751 INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
2752 FUNC_RESTORE
2753 ret
2754 ENDPROC(aesni_gcm_init_avx_gen4)
2755
2756 ###############################################################################
#void   aesni_gcm_enc_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
2763 ###############################################################################
2764 ENTRY(aesni_gcm_enc_update_avx_gen4)
2765 FUNC_SAVE
2766 mov keysize,%eax
2767 cmp $32, %eax
2768 je key_256_enc_update4
2769 cmp $16, %eax
2770 je key_128_enc_update4
2771
2772 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
2773 FUNC_RESTORE
2774 ret
2775 key_128_enc_update4:
2776 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
2777 FUNC_RESTORE
2778 ret
2779 key_256_enc_update4:
2780 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
2781 FUNC_RESTORE
2782 ret
2783 ENDPROC(aesni_gcm_enc_update_avx_gen4)
2784
2785 ###############################################################################
#void   aesni_gcm_dec_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
2792 ###############################################################################
2793 ENTRY(aesni_gcm_dec_update_avx_gen4)
2794 FUNC_SAVE
2795 mov keysize,%eax
2796 cmp $32, %eax
2797 je key_256_dec_update4
2798 cmp $16, %eax
2799 je key_128_dec_update4
2800
2801 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
2802 FUNC_RESTORE
2803 ret
2804 key_128_dec_update4:
2805 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
2806 FUNC_RESTORE
2807 ret
2808 key_256_dec_update4:
2809 GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
2810 FUNC_RESTORE
2811 ret
2812 ENDPROC(aesni_gcm_dec_update_avx_gen4)
2813
2814 ###############################################################################
#void   aesni_gcm_finalize_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
2821 ###############################################################################
2822 ENTRY(aesni_gcm_finalize_avx_gen4)
2823 FUNC_SAVE
2824 mov keysize,%eax
2825 cmp $32, %eax
2826 je key_256_finalize4
2827 cmp $16, %eax
2828 je key_128_finalize4
2829
2830 GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
2831 FUNC_RESTORE
2832 ret
2833 key_128_finalize4:
2834 GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
2835 FUNC_RESTORE
2836 ret
2837 key_256_finalize4:
2838 GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
2839 FUNC_RESTORE
2840 ret
2841 ENDPROC(aesni_gcm_finalize_avx_gen4)
2842
2843 #endif