/*
 * Poly1305 authenticator algorithm, RFC 7539, x64 AVX2 functions
 */

#include <linux/linkage.h>

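# ANMASK selects the low 26 bits of each message dword; ORMASK sets bit 24,
# which is the 2^128 pad bit of a 16-byte block in 26-bit limb form.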
.section .rodata.cst32.ANMASK, "aM", @progbits, 32
.align 32
ANMASK:	.octa 0x0000000003ffffff0000000003ffffff
	.octa 0x0000000003ffffff0000000003ffffff

.section .rodata.cst32.ORMASK, "aM", @progbits, 32
.align 32
ORMASK:	.octa 0x00000000010000000000000001000000
	.octa 0x00000000010000000000000001000000

.text

# Poly1305 accumulator limbs h[5] (via %rdi)
#define h0 0x00(%rdi)
#define h1 0x04(%rdi)
#define h2 0x08(%rdi)
#define h3 0x0c(%rdi)
#define h4 0x10(%rdi)
# key limbs r[5] (via %rdx)
#define r0 0x00(%rdx)
#define r1 0x04(%rdx)
#define r2 0x08(%rdx)
#define r3 0x0c(%rdx)
#define r4 0x10(%rdx)
# derived key-power limbs u = r^2, w = r^3, y = r^4 (via %r8)
#define u0 0x00(%r8)
#define u1 0x04(%r8)
#define u2 0x08(%r8)
#define u3 0x0c(%r8)
#define u4 0x10(%r8)
#define w0 0x14(%r8)
#define w1 0x18(%r8)
#define w2 0x1c(%r8)
#define w3 0x20(%r8)
#define w4 0x24(%r8)
#define y0 0x28(%r8)
#define y1 0x2c(%r8)
#define y2 0x30(%r8)
#define y3 0x34(%r8)
#define y4 0x38(%r8)
# message pointer
#define m %rsi
# message-block limb vectors, one per limb index (h is added into lane 0)
#define hc0 %ymm0
#define hc1 %ymm1
#define hc2 %ymm2
#define hc3 %ymm3
#define hc4 %ymm4
#define hc0x %xmm0
#define hc1x %xmm1
#define hc2x %xmm2
#define hc3x %xmm3
#define hc4x %xmm4
# temporaries
#define t1 %ymm5
#define t2 %ymm6
#define t1x %xmm5
#define t2x %xmm6
# combined key-power limb vectors, lanes low to high: y, w, u, r
#define ruwy0 %ymm7
#define ruwy1 %ymm8
#define ruwy2 %ymm9
#define ruwy3 %ymm10
#define ruwy4 %ymm11
#define ruwy0x %xmm7
#define ruwy1x %xmm8
#define ruwy2x %xmm9
#define ruwy3x %xmm10
#define ruwy4x %xmm11
# 5x multiples of the key-power limbs: s=5r, v=5u, x=5w, z=5y
#define svxz1 %ymm12
#define svxz2 %ymm13
#define svxz3 %ymm14
#define svxz4 %ymm15
# 64-bit accumulators for the unreduced result limbs
#define d0 %r9
#define d1 %r10
#define d2 %r11
#define d3 %r12
#define d4 %r13

ENTRY(poly1305_4block_avx2)
	# %rdi: Accumulator h[5]
	# %rsi: 64 byte input block m
	# %rdx: Poly1305 key r[5]
	# %rcx: Quadblock count
	# %r8:  Poly1305 derived key powers r^2 u[5], r^3 w[5], r^4 y[5]

	# This four-block variant processes four message blocks per loop
	# iteration. It requires the four key powers r, r^2, r^3 and r^4:
	# h = (h + m) * r  =>  h = (h + m1) * r^4 + m2 * r^3 + m3 * r^2 + m4 * r

	vzeroupper
	push	%rbx
	push	%r12
	push	%r13

	# combine r0,u0,w0,y0
	vmovd	y0,ruwy0x
	vmovd	w0,t1x
	vpunpcklqdq	t1,ruwy0,ruwy0
	vmovd	u0,t1x
	vmovd	r0,t2x
	vpunpcklqdq	t2,t1,t1
	vperm2i128	$0x20,t1,ruwy0,ruwy0

	# combine r1,u1,w1,y1 and s1=r1*5,v1=u1*5,x1=w1*5,z1=y1*5
	vmovd	y1,ruwy1x
	vmovd	w1,t1x
	vpunpcklqdq	t1,ruwy1,ruwy1
	vmovd	u1,t1x
	vmovd	r1,t2x
	vpunpcklqdq	t2,t1,t1
	vperm2i128	$0x20,t1,ruwy1,ruwy1
	vpslld	$2,ruwy1,svxz1
	vpaddd	ruwy1,svxz1,svxz1

	# combine r2,u2,w2,y2 and s2=r2*5,v2=u2*5,x2=w2*5,z2=y2*5
	vmovd	y2,ruwy2x
	vmovd	w2,t1x
	vpunpcklqdq	t1,ruwy2,ruwy2
	vmovd	u2,t1x
	vmovd	r2,t2x
	vpunpcklqdq	t2,t1,t1
	vperm2i128	$0x20,t1,ruwy2,ruwy2
	vpslld	$2,ruwy2,svxz2
	vpaddd	ruwy2,svxz2,svxz2

	# combine r3,u3,w3,y3 and s3=r3*5,v3=u3*5,x3=w3*5,z3=y3*5
	vmovd	y3,ruwy3x
	vmovd	w3,t1x
	vpunpcklqdq	t1,ruwy3,ruwy3
	vmovd	u3,t1x
	vmovd	r3,t2x
	vpunpcklqdq	t2,t1,t1
	vperm2i128	$0x20,t1,ruwy3,ruwy3
	vpslld	$2,ruwy3,svxz3
	vpaddd	ruwy3,svxz3,svxz3

	# combine r4,u4,w4,y4 and s4=r4*5,v4=u4*5,x4=w4*5,z4=y4*5
	vmovd	y4,ruwy4x
	vmovd	w4,t1x
	vpunpcklqdq	t1,ruwy4,ruwy4
	vmovd	u4,t1x
	vmovd	r4,t2x
	vpunpcklqdq	t2,t1,t1
	vperm2i128	$0x20,t1,ruwy4,ruwy4
	vpslld	$2,ruwy4,svxz4
	vpaddd	ruwy4,svxz4,svxz4

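	# Each .Ldoblock4 iteration consumes 64 bytes: the four 16-byte blocks
	# are split into 26-bit limbs (one vector per limb index, one block per
	# lane), h is added into lane 0, the result is multiplied by the key
	# powers, and a partial reduction brings h[] back to five ~26-bit limbs.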
.Ldoblock4:

	# hc0 = [m[48-51] & 0x3ffffff, m[32-35] & 0x3ffffff,
	#	 m[16-19] & 0x3ffffff, m[ 0- 3] & 0x3ffffff + h0]
	vmovd	0x00(m),hc0x
	vmovd	0x10(m),t1x
	vpunpcklqdq	t1,hc0,hc0
	vmovd	0x20(m),t1x
	vmovd	0x30(m),t2x
	vpunpcklqdq	t2,t1,t1
	vperm2i128	$0x20,t1,hc0,hc0
	vpand	ANMASK(%rip),hc0,hc0
	vmovd	h0,t1x
	vpaddd	t1,hc0,hc0

	# hc1 = [(m[51-54] >> 2) & 0x3ffffff, (m[35-38] >> 2) & 0x3ffffff,
	#	 (m[19-22] >> 2) & 0x3ffffff, (m[ 3- 6] >> 2) & 0x3ffffff + h1]
	vmovd	0x03(m),hc1x
	vmovd	0x13(m),t1x
	vpunpcklqdq	t1,hc1,hc1
	vmovd	0x23(m),t1x
	vmovd	0x33(m),t2x
	vpunpcklqdq	t2,t1,t1
	vperm2i128	$0x20,t1,hc1,hc1
	vpsrld	$2,hc1,hc1
	vpand	ANMASK(%rip),hc1,hc1
	vmovd	h1,t1x
	vpaddd	t1,hc1,hc1

	# hc2 = [(m[54-57] >> 4) & 0x3ffffff, (m[38-41] >> 4) & 0x3ffffff,
	#	 (m[22-25] >> 4) & 0x3ffffff, (m[ 6- 9] >> 4) & 0x3ffffff + h2]
	vmovd	0x06(m),hc2x
	vmovd	0x16(m),t1x
	vpunpcklqdq	t1,hc2,hc2
	vmovd	0x26(m),t1x
	vmovd	0x36(m),t2x
	vpunpcklqdq	t2,t1,t1
	vperm2i128	$0x20,t1,hc2,hc2
	vpsrld	$4,hc2,hc2
	vpand	ANMASK(%rip),hc2,hc2
	vmovd	h2,t1x
	vpaddd	t1,hc2,hc2

	# hc3 = [(m[57-60] >> 6) & 0x3ffffff, (m[41-44] >> 6) & 0x3ffffff,
	#	 (m[25-28] >> 6) & 0x3ffffff, (m[ 9-12] >> 6) & 0x3ffffff + h3]
	vmovd	0x09(m),hc3x
	vmovd	0x19(m),t1x
	vpunpcklqdq	t1,hc3,hc3
	vmovd	0x29(m),t1x
	vmovd	0x39(m),t2x
	vpunpcklqdq	t2,t1,t1
	vperm2i128	$0x20,t1,hc3,hc3
	vpsrld	$6,hc3,hc3
	vpand	ANMASK(%rip),hc3,hc3
	vmovd	h3,t1x
	vpaddd	t1,hc3,hc3

	# hc4 = [(m[60-63] >> 8) | (1<<24), (m[44-47] >> 8) | (1<<24),
	#	 (m[28-31] >> 8) | (1<<24), (m[12-15] >> 8) | (1<<24) + h4]
	vmovd	0x0c(m),hc4x
	vmovd	0x1c(m),t1x
	vpunpcklqdq	t1,hc4,hc4
	vmovd	0x2c(m),t1x
	vmovd	0x3c(m),t2x
	vpunpcklqdq	t2,t1,t1
	vperm2i128	$0x20,t1,hc4,hc4
	vpsrld	$8,hc4,hc4
	vpor	ORMASK(%rip),hc4,hc4
	vmovd	h4,t1x
	vpaddd	t1,hc4,hc4

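	# Multiply-accumulate: for each of the five result limbs, multiply the
	# message-limb vectors by the matching key-power limbs and sum.  Limb
	# products that would spill past limb 4 wrap around modulo 2^130 - 5,
	# which is why the precomputed 5x multiples (svxz*) stand in for the
	# wrapped key limbs.  Each vector of four 64-bit lane products is then
	# summed horizontally (vpermq + vpsrldq) into one scalar d register.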

	# t1 = [ hc0[3] * r0, hc0[2] * u0, hc0[1] * w0, hc0[0] * y0 ]
	vpmuludq	hc0,ruwy0,t1
	# t1 += [ hc1[3] * s4, hc1[2] * v4, hc1[1] * x4, hc1[0] * z4 ]
	vpmuludq	hc1,svxz4,t2
	vpaddq	t2,t1,t1
	# t1 += [ hc2[3] * s3, hc2[2] * v3, hc2[1] * x3, hc2[0] * z3 ]
	vpmuludq	hc2,svxz3,t2
	vpaddq	t2,t1,t1
	# t1 += [ hc3[3] * s2, hc3[2] * v2, hc3[1] * x2, hc3[0] * z2 ]
	vpmuludq	hc3,svxz2,t2
	vpaddq	t2,t1,t1
	# t1 += [ hc4[3] * s1, hc4[2] * v1, hc4[1] * x1, hc4[0] * z1 ]
	vpmuludq	hc4,svxz1,t2
	vpaddq	t2,t1,t1
	# d0 = t1[0] + t1[1] + t1[2] + t1[3]
	vpermq	$0xee,t1,t2
	vpaddq	t2,t1,t1
	vpsrldq	$8,t1,t2
	vpaddq	t2,t1,t1
	vmovq	t1x,d0

	# t1 = [ hc0[3] * r1, hc0[2] * u1, hc0[1] * w1, hc0[0] * y1 ]
	vpmuludq	hc0,ruwy1,t1
	# t1 += [ hc1[3] * r0, hc1[2] * u0, hc1[1] * w0, hc1[0] * y0 ]
	vpmuludq	hc1,ruwy0,t2
	vpaddq	t2,t1,t1
	# t1 += [ hc2[3] * s4, hc2[2] * v4, hc2[1] * x4, hc2[0] * z4 ]
	vpmuludq	hc2,svxz4,t2
	vpaddq	t2,t1,t1
	# t1 += [ hc3[3] * s3, hc3[2] * v3, hc3[1] * x3, hc3[0] * z3 ]
	vpmuludq	hc3,svxz3,t2
	vpaddq	t2,t1,t1
	# t1 += [ hc4[3] * s2, hc4[2] * v2, hc4[1] * x2, hc4[0] * z2 ]
	vpmuludq	hc4,svxz2,t2
	vpaddq	t2,t1,t1
	# d1 = t1[0] + t1[1] + t1[2] + t1[3]
	vpermq	$0xee,t1,t2
	vpaddq	t2,t1,t1
	vpsrldq	$8,t1,t2
	vpaddq	t2,t1,t1
	vmovq	t1x,d1

	# t1 = [ hc0[3] * r2, hc0[2] * u2, hc0[1] * w2, hc0[0] * y2 ]
	vpmuludq	hc0,ruwy2,t1
	# t1 += [ hc1[3] * r1, hc1[2] * u1, hc1[1] * w1, hc1[0] * y1 ]
	vpmuludq	hc1,ruwy1,t2
	vpaddq	t2,t1,t1
	# t1 += [ hc2[3] * r0, hc2[2] * u0, hc2[1] * w0, hc2[0] * y0 ]
	vpmuludq	hc2,ruwy0,t2
	vpaddq	t2,t1,t1
	# t1 += [ hc3[3] * s4, hc3[2] * v4, hc3[1] * x4, hc3[0] * z4 ]
	vpmuludq	hc3,svxz4,t2
	vpaddq	t2,t1,t1
	# t1 += [ hc4[3] * s3, hc4[2] * v3, hc4[1] * x3, hc4[0] * z3 ]
	vpmuludq	hc4,svxz3,t2
	vpaddq	t2,t1,t1
	# d2 = t1[0] + t1[1] + t1[2] + t1[3]
	vpermq	$0xee,t1,t2
	vpaddq	t2,t1,t1
	vpsrldq	$8,t1,t2
	vpaddq	t2,t1,t1
	vmovq	t1x,d2

	# t1 = [ hc0[3] * r3, hc0[2] * u3, hc0[1] * w3, hc0[0] * y3 ]
	vpmuludq	hc0,ruwy3,t1
	# t1 += [ hc1[3] * r2, hc1[2] * u2, hc1[1] * w2, hc1[0] * y2 ]
	vpmuludq	hc1,ruwy2,t2
	vpaddq	t2,t1,t1
	# t1 += [ hc2[3] * r1, hc2[2] * u1, hc2[1] * w1, hc2[0] * y1 ]
	vpmuludq	hc2,ruwy1,t2
	vpaddq	t2,t1,t1
	# t1 += [ hc3[3] * r0, hc3[2] * u0, hc3[1] * w0, hc3[0] * y0 ]
	vpmuludq	hc3,ruwy0,t2
	vpaddq	t2,t1,t1
	# t1 += [ hc4[3] * s4, hc4[2] * v4, hc4[1] * x4, hc4[0] * z4 ]
	vpmuludq	hc4,svxz4,t2
	vpaddq	t2,t1,t1
	# d3 = t1[0] + t1[1] + t1[2] + t1[3]
	vpermq	$0xee,t1,t2
	vpaddq	t2,t1,t1
	vpsrldq	$8,t1,t2
	vpaddq	t2,t1,t1
	vmovq	t1x,d3

	# t1 = [ hc0[3] * r4, hc0[2] * u4, hc0[1] * w4, hc0[0] * y4 ]
	vpmuludq	hc0,ruwy4,t1
	# t1 += [ hc1[3] * r3, hc1[2] * u3, hc1[1] * w3, hc1[0] * y3 ]
	vpmuludq	hc1,ruwy3,t2
	vpaddq	t2,t1,t1
	# t1 += [ hc2[3] * r2, hc2[2] * u2, hc2[1] * w2, hc2[0] * y2 ]
	vpmuludq	hc2,ruwy2,t2
	vpaddq	t2,t1,t1
	# t1 += [ hc3[3] * r1, hc3[2] * u1, hc3[1] * w1, hc3[0] * y1 ]
	vpmuludq	hc3,ruwy1,t2
	vpaddq	t2,t1,t1
	# t1 += [ hc4[3] * r0, hc4[2] * u0, hc4[1] * w0, hc4[0] * y0 ]
	vpmuludq	hc4,ruwy0,t2
	vpaddq	t2,t1,t1
	# d4 = t1[0] + t1[1] + t1[2] + t1[3]
	vpermq	$0xee,t1,t2
	vpaddq	t2,t1,t1
	vpsrldq	$8,t1,t2
	vpaddq	t2,t1,t1
	vmovq	t1x,d4

	# Now do a partial reduction mod 2^130 - 5, carrying d0 -> d1 -> d2 ->
	# d3 -> d4 -> h0 -> h1.  The unreduced limb sums can exceed 32 bits, so
	# it is not guaranteed that
	# 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit
	# registers; the carry chain is therefore computed with 64-bit
	# arithmetic, and only the final masked 26-bit limbs are stored back.

	# d1 += d0 >> 26
	mov	d0,%rax
	shr	$26,%rax
	add	%rax,d1
	# h0 = d0 & 0x3ffffff (kept in %rbx until the d4 fold-back below)
	mov	d0,%rbx
	and	$0x3ffffff,%ebx

	# d2 += d1 >> 26
	mov	d1,%rax
	shr	$26,%rax
	add	%rax,d2
	# h1 = d1 & 0x3ffffff
	mov	d1,%rax
	and	$0x3ffffff,%eax
	mov	%eax,h1

	# d3 += d2 >> 26
	mov	d2,%rax
	shr	$26,%rax
	add	%rax,d3
	# h2 = d2 & 0x3ffffff
	mov	d2,%rax
	and	$0x3ffffff,%eax
	mov	%eax,h2

	# d4 += d3 >> 26
	mov	d3,%rax
	shr	$26,%rax
	add	%rax,d4
	# h3 = d3 & 0x3ffffff
	mov	d3,%rax
	and	$0x3ffffff,%eax
	mov	%eax,h3

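	# Fold the bits at and above 2^130 back in, using 2^130 == 5
	# (mod 2^130 - 5): the carry d4 >> 26 re-enters limb 0 multiplied by 5,
	# computed below as (d4 >> 26) + 4 * (d4 >> 26) via lea.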
	# h0 += (d4 >> 26) * 5
	mov	d4,%rax
	shr	$26,%rax
	lea	(%rax,%rax,4),%rax
	add	%rax,%rbx
	# h4 = d4 & 0x3ffffff
	mov	d4,%rax
	and	$0x3ffffff,%eax
	mov	%eax,h4

	# h1 += h0 >> 26
	mov	%rbx,%rax
	shr	$26,%rax
	add	%eax,h1
	# h0 = h0 & 0x3ffffff
	andl	$0x3ffffff,%ebx
	mov	%ebx,h0

	# advance m to the next four blocks and loop while quadblocks remain
	add	$0x40,m
	dec	%rcx
	jnz	.Ldoblock4
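
	# h[] now holds a partially reduced accumulator (limbs of roughly 26
	# bits); the final reduction mod 2^130 - 5 and the addition of the
	# second key half are expected to happen in the caller when the tag
	# is emitted.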

	vzeroupper
	pop	%r13
	pop	%r12
	pop	%rbx
	ret
ENDPROC(poly1305_4block_avx2)