1
2
3
4
5
6
7
8 #include <linux/linkage.h>
9 # ANMASK: 0x3ffffff (26 low bits set) in the low dword of each 64-bit lane; ANDed onto message words to cut them to 26-bit limbs
10 .section .rodata.cst16.ANMASK, "aM", @progbits, 16
11 .align 16
12 ANMASK: .octa 0x0000000003ffffff0000000003ffffff
13 # ORMASK: 1 << 24 in the low dword of each 64-bit lane; ORed into the top limb (m[12-15] >> 8), i.e. bit 128 of the block
14 .section .rodata.cst16.ORMASK, "aM", @progbits, 16
15 .align 16
16 ORMASK: .octa 0x00000000010000000000000001000000
17
18 .text
19 # Operand aliases: h0-h4 = 32-bit accumulator words at (%rdi), r0-r4 = 32-bit key words at (%rdx), s1-s4 = 32-bit stack scratch at (%rsp), m = input pointer, h01/h23/h44 and t1-t4 = XMM work registers, mask = XMM register loaded from ANMASK, d0-d4 = 64-bit product sums
20 #define h0 0x00(%rdi)
21 #define h1 0x04(%rdi)
22 #define h2 0x08(%rdi)
23 #define h3 0x0c(%rdi)
24 #define h4 0x10(%rdi)
25 #define r0 0x00(%rdx)
26 #define r1 0x04(%rdx)
27 #define r2 0x08(%rdx)
28 #define r3 0x0c(%rdx)
29 #define r4 0x10(%rdx)
30 #define s1 0x00(%rsp)
31 #define s2 0x04(%rsp)
32 #define s3 0x08(%rsp)
33 #define s4 0x0c(%rsp)
34 #define m %rsi
35 #define h01 %xmm0
36 #define h23 %xmm1
37 #define h44 %xmm2
38 #define t1 %xmm3
39 #define t2 %xmm4
40 #define t3 %xmm5
41 #define t4 %xmm6
42 #define mask %xmm7
43 #define d0 %r8
44 #define d1 %r9
45 #define d2 %r10
46 #define d3 %r11
47 #define d4 %r12
48 # poly1305_block_sse2: absorb %rcx consecutive 16-byte blocks into h, one block per loop pass.
49 ENTRY(poly1305_block_sse2)
50 # %rdi: Accumulator h[5]
51 # %rsi: 16 byte input block m
52 # %rdx: Poly1305 key r[5]
53 # %rcx: Block count
54 #
55 # Each pass computes h = (h + m + 2^128) * r followed by a partial carry
56 # reduction. Two 32x32->64 bit products are formed per pmuludq, one in each
57 # 64-bit XMM lane. Clobbers %rax, %rcx, %rsi, %r8-%r11 and %xmm0-%xmm7;
58 # %rbx and %r12 are saved and restored. %rcx must be nonzero on entry.
59 push %rbx
60 push %r12
61 sub $0x10,%rsp
62
63 # s1..s4 = r1..r4 * 5, kept in the 16 bytes of stack just reserved
64 mov r1,%eax
65 lea (%eax,%eax,4),%eax
66 mov %eax,s1
67 mov r2,%eax
68 lea (%eax,%eax,4),%eax
69 mov %eax,s2
70 mov r3,%eax
71 lea (%eax,%eax,4),%eax
72 mov %eax,s3
73 mov r4,%eax
74 lea (%eax,%eax,4),%eax
75 mov %eax,s4
76 # mask = [ 0, 0x3ffffff, 0, 0x3ffffff ]
77 movdqa ANMASK(%rip),mask
78 # Notation: four-element lists are dwords 3..0, two-element lists are 64-bit lanes [ high, low ]
79 .Ldoblock:
80 # h01 = [0, h1, 0, h0]
81 # h23 = [0, h3, 0, h2]
82 # h44 = [0, h4, 0, h4]
83 movd h0,h01
84 movd h1,t1
85 movd h2,h23
86 movd h3,t2
87 movd h4,h44
88 punpcklqdq t1,h01
89 punpcklqdq t2,h23
90 punpcklqdq h44,h44
91 # m[a-b] is the 32-bit word loaded from message bytes a..b
92 # h01 += [ (m[3-6] >> 2) & 0x3ffffff, m[0-3] & 0x3ffffff ]
93 movd 0x00(m),t1
94 movd 0x03(m),t2
95 psrld $2,t2
96 punpcklqdq t2,t1
97 pand mask,t1
98 paddd t1,h01
99 # h23 += [ (m[9-12] >> 6) & 0x3ffffff, (m[6-9] >> 4) & 0x3ffffff ]
100 movd 0x06(m),t1
101 movd 0x09(m),t2
102 psrld $4,t1
103 psrld $6,t2
104 punpcklqdq t2,t1
105 pand mask,t1
106 paddd t1,h23
107 # h44 += [ (m[12-15] >> 8) | (1 << 24), (m[12-15] >> 8) | (1 << 24) ]
108 mov 0x0c(m),%eax
109 shr $8,%eax
110 or $0x01000000,%eax
111 movd %eax,t1
112 pshufd $0xc4,t1,t1
113 paddd t1,h44
114 # Below, h0..h4 are the limbs of h + m; tN[0] / tN[1] are the low / high 64-bit lanes
115 # t1[0] = h0 * r0 + h2 * s3
116 # t1[1] = h1 * s4 + h3 * s2
117 movd r0,t1
118 movd s4,t2
119 punpcklqdq t2,t1
120 pmuludq h01,t1
121 movd s3,t2
122 movd s2,t3
123 punpcklqdq t3,t2
124 pmuludq h23,t2
125 paddq t2,t1
126 # t2[0] = h0 * r1 + h2 * s4
127 # t2[1] = h1 * r0 + h3 * s3
128 movd r1,t2
129 movd r0,t3
130 punpcklqdq t3,t2
131 pmuludq h01,t2
132 movd s4,t3
133 movd s3,t4
134 punpcklqdq t4,t3
135 pmuludq h23,t3
136 paddq t3,t2
137 # t3[0] = h4 * s1
138 # t3[1] = h4 * s2
139 movd s1,t3
140 movd s2,t4
141 punpcklqdq t4,t3
142 pmuludq h44,t3
143 # d0 = t1[0] + t1[1] + t3[0]
144 # d1 = t2[0] + t2[1] + t3[1]
145 movdqa t1,t4
146 punpcklqdq t2,t4
147 punpckhqdq t2,t1
148 paddq t4,t1
149 paddq t3,t1
150 movq t1,d0
151 psrldq $8,t1
152 movq t1,d1
153
154 # t1[0] = h0 * r2 + h2 * r0
155 # t1[1] = h1 * r1 + h3 * s4
156 movd r2,t1
157 movd r1,t2
158 punpcklqdq t2,t1
159 pmuludq h01,t1
160 movd r0,t2
161 movd s4,t3
162 punpcklqdq t3,t2
163 pmuludq h23,t2
164 paddq t2,t1
165 # t2[0] = h0 * r3 + h2 * r1
166 # t2[1] = h1 * r2 + h3 * r0
167 movd r3,t2
168 movd r2,t3
169 punpcklqdq t3,t2
170 pmuludq h01,t2
171 movd r1,t3
172 movd r0,t4
173 punpcklqdq t4,t3
174 pmuludq h23,t3
175 paddq t3,t2
176 # t3[0] = h4 * s3
177 # t3[1] = h4 * s4
178 movd s3,t3
179 movd s4,t4
180 punpcklqdq t4,t3
181 pmuludq h44,t3
182 # d2 = t1[0] + t1[1] + t3[0]
183 # d3 = t2[0] + t2[1] + t3[1]
184 movdqa t1,t4
185 punpcklqdq t2,t4
186 punpckhqdq t2,t1
187 paddq t4,t1
188 paddq t3,t1
189 movq t1,d2
190 psrldq $8,t1
191 movq t1,d3
192
193 # t1[0] = h0 * r4 + h2 * r2
194 # t1[1] = h1 * r3 + h3 * r1
195 movd r4,t1
196 movd r3,t2
197 punpcklqdq t2,t1
198 pmuludq h01,t1
199 movd r2,t2
200 movd r1,t3
201 punpcklqdq t3,t2
202 pmuludq h23,t2
203 paddq t2,t1
204 # t3[0] = h4 * r0
205 movd r0,t3
206 pmuludq h44,t3
207 # d4 = t1[0] + t1[1] + t3[0]
208 movdqa t1,t4
209 psrldq $8,t4
210 paddq t4,t1
211 paddq t3,t1
212 movq t1,d4
213 # Partial carry reduction d0 -> d1 -> d2 -> d3 -> d4 -> h0 -> h1, 26 bits per limb
214 # d1 += d0 >> 26
215 mov d0,%rax
216 shr $26,%rax
217 add %rax,d1
218 # h0 = d0 & 0x3ffffff (held in %rbx until the final carry below)
219 mov d0,%rbx
220 and $0x3ffffff,%ebx
221
222 # d2 += d1 >> 26
223 mov d1,%rax
224 shr $26,%rax
225 add %rax,d2
226 # h1 = d1 & 0x3ffffff
227 mov d1,%rax
228 and $0x3ffffff,%eax
229 mov %eax,h1
230
231 # d3 += d2 >> 26
232 mov d2,%rax
233 shr $26,%rax
234 add %rax,d3
235 # h2 = d2 & 0x3ffffff
236 mov d2,%rax
237 and $0x3ffffff,%eax
238 mov %eax,h2
239
240 # d4 += d3 >> 26
241 mov d3,%rax
242 shr $26,%rax
243 add %rax,d4
244 # h3 = d3 & 0x3ffffff
245 mov d3,%rax
246 and $0x3ffffff,%eax
247 mov %eax,h3
248
249 # h0 += (d4 >> 26) * 5
250 mov d4,%rax
251 shr $26,%rax
252 lea (%rax,%rax,4),%rax
253 add %rax,%rbx
254 # h4 = d4 & 0x3ffffff
255 mov d4,%rax
256 and $0x3ffffff,%eax
257 mov %eax,h4
258
259 # h1 += h0 >> 26
260 mov %rbx,%rax
261 shr $26,%rax
262 add %eax,h1
263 # h0 = h0 & 0x3ffffff
264 andl $0x3ffffff,%ebx
265 mov %ebx,h0
266 # Advance m to the next 16-byte block
267 add $0x10,m
268 dec %rcx
269 jnz .Ldoblock
270 # Zeroing of key material: the loop exits with %rcx == 0, and storing it
271 # twice overwrites s1..s4 on the stack
272 mov %rcx,0x00(%rsp)
273 mov %rcx,0x08(%rsp)
274
275 add $0x10,%rsp
276 pop %r12
277 pop %rbx
278 ret
279 ENDPROC(poly1305_block_sse2)
280 # Operand aliases for the two-block routine: u0-u4 = 32-bit words of the derived key at (%r8), hc0-hc4 = XMM limb pairs (%xmm3/%xmm4 stay t1/t2), ruN = XMM pair of rN and uN, svN = ruN * 5
281 # d0 is moved from %r8 to %r13 because %r8 now carries the u pointer
282 #define u0 0x00(%r8)
283 #define u1 0x04(%r8)
284 #define u2 0x08(%r8)
285 #define u3 0x0c(%r8)
286 #define u4 0x10(%r8)
287 #define hc0 %xmm0
288 #define hc1 %xmm1
289 #define hc2 %xmm2
290 #define hc3 %xmm5
291 #define hc4 %xmm6
292 #define ru0 %xmm7
293 #define ru1 %xmm8
294 #define ru2 %xmm9
295 #define ru3 %xmm10
296 #define ru4 %xmm11
297 #define sv1 %xmm12
298 #define sv2 %xmm13
299 #define sv3 %xmm14
300 #define sv4 %xmm15
301 #undef d0
302 #define d0 %r13
303 # poly1305_2block_sse2: absorb %rcx pairs of 16-byte blocks into h, two blocks per loop pass.
304 ENTRY(poly1305_2block_sse2)
305 # %rdi: Accumulator h[5]
306 # %rsi: 16 byte input block m
307 # %rdx: Poly1305 key r[5]
308 # %rcx: Doubleblock count
309 # %r8: Poly1305 derived key r^2 u[5]
310 #
311 # Each pass computes h = (h + m1 + 2^128) * u + (m2 + 2^128) * r, then a
312 # partial carry reduction. The low 64-bit lane of every XMM operand handles
313 # block 1 (with u), the high lane handles block 2 (with r); lanes are summed.
314 # Clobbers %rax, %rcx, %rsi, %r9-%r11 and %xmm0-%xmm15; %rbx, %r12 and %r13
315 # are saved and restored. %rcx must be nonzero on entry.
316 push %rbx
317 push %r12
318 push %r13
319
320 # ru0 = [ r0, u0 ]
321 movd u0,ru0
322 movd r0,t1
323 punpcklqdq t1,ru0
324
325 # ru1 = [ r1, u1 ]; sv1 = ru1 * 5 = [ s1, v1 ], computed as (x << 2) + x
326 movd u1,ru1
327 movd r1,t1
328 punpcklqdq t1,ru1
329 movdqa ru1,sv1
330 pslld $2,sv1
331 paddd ru1,sv1
332
333 # ru2 = [ r2, u2 ]; sv2 = ru2 * 5 = [ s2, v2 ]
334 movd u2,ru2
335 movd r2,t1
336 punpcklqdq t1,ru2
337 movdqa ru2,sv2
338 pslld $2,sv2
339 paddd ru2,sv2
340
341 # ru3 = [ r3, u3 ]; sv3 = ru3 * 5 = [ s3, v3 ]
342 movd u3,ru3
343 movd r3,t1
344 punpcklqdq t1,ru3
345 movdqa ru3,sv3
346 pslld $2,sv3
347 paddd ru3,sv3
348
349 # ru4 = [ r4, u4 ]; sv4 = ru4 * 5 = [ s4, v4 ]
350 movd u4,ru4
351 movd r4,t1
352 punpcklqdq t1,ru4
353 movdqa ru4,sv4
354 pslld $2,sv4
355 paddd ru4,sv4
356 # Notation: lists are 64-bit lanes [ high, low ], x[1] / x[0] are the high / low lane, m[a-b] is the 32-bit word loaded from message bytes a..b
357 .Ldoblock2:
358 # hc0 = [ m[16-19] & 0x3ffffff, h0 + (m[0-3] & 0x3ffffff) ]
359 movd 0x00(m),hc0
360 movd 0x10(m),t1
361 punpcklqdq t1,hc0
362 pand ANMASK(%rip),hc0
363 movd h0,t1
364 paddd t1,hc0
365 # hc1 = [ (m[19-22] >> 2) & 0x3ffffff, h1 + ((m[3-6] >> 2) & 0x3ffffff) ]
366 movd 0x03(m),hc1
367 movd 0x13(m),t1
368 punpcklqdq t1,hc1
369 psrld $2,hc1
370 pand ANMASK(%rip),hc1
371 movd h1,t1
372 paddd t1,hc1
373 # hc2 = [ (m[22-25] >> 4) & 0x3ffffff, h2 + ((m[6-9] >> 4) & 0x3ffffff) ]
374 movd 0x06(m),hc2
375 movd 0x16(m),t1
376 punpcklqdq t1,hc2
377 psrld $4,hc2
378 pand ANMASK(%rip),hc2
379 movd h2,t1
380 paddd t1,hc2
381 # hc3 = [ (m[25-28] >> 6) & 0x3ffffff, h3 + ((m[9-12] >> 6) & 0x3ffffff) ]
382 movd 0x09(m),hc3
383 movd 0x19(m),t1
384 punpcklqdq t1,hc3
385 psrld $6,hc3
386 pand ANMASK(%rip),hc3
387 movd h3,t1
388 paddd t1,hc3
389 # hc4 = [ (m[28-31] >> 8) | (1 << 24), h4 + ((m[12-15] >> 8) | (1 << 24)) ]
390 movd 0x0c(m),hc4
391 movd 0x1c(m),t1
392 punpcklqdq t1,hc4
393 psrld $8,hc4
394 por ORMASK(%rip),hc4
395 movd h4,t1
396 paddd t1,hc4
397
398 # t1 = [ hc0[1] * r0, hc0[0] * u0 ]
399 movdqa ru0,t1
400 pmuludq hc0,t1
401 # t1 += [ hc1[1] * s4, hc1[0] * v4 ]
402 movdqa sv4,t2
403 pmuludq hc1,t2
404 paddq t2,t1
405 # t1 += [ hc2[1] * s3, hc2[0] * v3 ]
406 movdqa sv3,t2
407 pmuludq hc2,t2
408 paddq t2,t1
409 # t1 += [ hc3[1] * s2, hc3[0] * v2 ]
410 movdqa sv2,t2
411 pmuludq hc3,t2
412 paddq t2,t1
413 # t1 += [ hc4[1] * s1, hc4[0] * v1 ]
414 movdqa sv1,t2
415 pmuludq hc4,t2
416 paddq t2,t1
417 # d0 = t1[0] + t1[1]
418 movdqa t1,t2
419 psrldq $8,t2
420 paddq t2,t1
421 movq t1,d0
422
423 # t1 = [ hc0[1] * r1, hc0[0] * u1 ]
424 movdqa ru1,t1
425 pmuludq hc0,t1
426 # t1 += [ hc1[1] * r0, hc1[0] * u0 ]
427 movdqa ru0,t2
428 pmuludq hc1,t2
429 paddq t2,t1
430 # t1 += [ hc2[1] * s4, hc2[0] * v4 ]
431 movdqa sv4,t2
432 pmuludq hc2,t2
433 paddq t2,t1
434 # t1 += [ hc3[1] * s3, hc3[0] * v3 ]
435 movdqa sv3,t2
436 pmuludq hc3,t2
437 paddq t2,t1
438 # t1 += [ hc4[1] * s2, hc4[0] * v2 ]
439 movdqa sv2,t2
440 pmuludq hc4,t2
441 paddq t2,t1
442 # d1 = t1[0] + t1[1]
443 movdqa t1,t2
444 psrldq $8,t2
445 paddq t2,t1
446 movq t1,d1
447
448 # t1 = [ hc0[1] * r2, hc0[0] * u2 ]
449 movdqa ru2,t1
450 pmuludq hc0,t1
451 # t1 += [ hc1[1] * r1, hc1[0] * u1 ]
452 movdqa ru1,t2
453 pmuludq hc1,t2
454 paddq t2,t1
455 # t1 += [ hc2[1] * r0, hc2[0] * u0 ]
456 movdqa ru0,t2
457 pmuludq hc2,t2
458 paddq t2,t1
459 # t1 += [ hc3[1] * s4, hc3[0] * v4 ]
460 movdqa sv4,t2
461 pmuludq hc3,t2
462 paddq t2,t1
463 # t1 += [ hc4[1] * s3, hc4[0] * v3 ]
464 movdqa sv3,t2
465 pmuludq hc4,t2
466 paddq t2,t1
467 # d2 = t1[0] + t1[1]
468 movdqa t1,t2
469 psrldq $8,t2
470 paddq t2,t1
471 movq t1,d2
472
473 # t1 = [ hc0[1] * r3, hc0[0] * u3 ]
474 movdqa ru3,t1
475 pmuludq hc0,t1
476 # t1 += [ hc1[1] * r2, hc1[0] * u2 ]
477 movdqa ru2,t2
478 pmuludq hc1,t2
479 paddq t2,t1
480 # t1 += [ hc2[1] * r1, hc2[0] * u1 ]
481 movdqa ru1,t2
482 pmuludq hc2,t2
483 paddq t2,t1
484 # t1 += [ hc3[1] * r0, hc3[0] * u0 ]
485 movdqa ru0,t2
486 pmuludq hc3,t2
487 paddq t2,t1
488 # t1 += [ hc4[1] * s4, hc4[0] * v4 ]
489 movdqa sv4,t2
490 pmuludq hc4,t2
491 paddq t2,t1
492 # d3 = t1[0] + t1[1]
493 movdqa t1,t2
494 psrldq $8,t2
495 paddq t2,t1
496 movq t1,d3
497
498 # t1 = [ hc0[1] * r4, hc0[0] * u4 ]
499 movdqa ru4,t1
500 pmuludq hc0,t1
501 # t1 += [ hc1[1] * r3, hc1[0] * u3 ]
502 movdqa ru3,t2
503 pmuludq hc1,t2
504 paddq t2,t1
505 # t1 += [ hc2[1] * r2, hc2[0] * u2 ]
506 movdqa ru2,t2
507 pmuludq hc2,t2
508 paddq t2,t1
509 # t1 += [ hc3[1] * r1, hc3[0] * u1 ]
510 movdqa ru1,t2
511 pmuludq hc3,t2
512 paddq t2,t1
513 # t1 += [ hc4[1] * r0, hc4[0] * u0 ]
514 movdqa ru0,t2
515 pmuludq hc4,t2
516 paddq t2,t1
517 # d4 = t1[0] + t1[1]
518 movdqa t1,t2
519 psrldq $8,t2
520 paddq t2,t1
521 movq t1,d4
522
523 # Partial reduction mod 2^130 - 5, carrying d0 -> d1 -> d2 -> d3 -> d4 -> h0
524 # -> h1, which leaves h0, h2, h3, h4 below 2^26 and h1 at most slightly
525 # above 2^26. The carries (d0 >> 26), (d1 >> 26), (d2 >> 26), (d3 >> 26)
526 # and ((d4 >> 26) * 5) are not assumed to fit in 32-bit integers here, so
527 # they are computed and added in full 64-bit registers.
528
529 # d1 += d0 >> 26
530 mov d0,%rax
531 shr $26,%rax
532 add %rax,d1
533 # h0 = d0 & 0x3ffffff (held in %rbx until the final carry below)
534 mov d0,%rbx
535 and $0x3ffffff,%ebx
536
537 # d2 += d1 >> 26
538 mov d1,%rax
539 shr $26,%rax
540 add %rax,d2
541 # h1 = d1 & 0x3ffffff
542 mov d1,%rax
543 and $0x3ffffff,%eax
544 mov %eax,h1
545
546 # d3 += d2 >> 26
547 mov d2,%rax
548 shr $26,%rax
549 add %rax,d3
550 # h2 = d2 & 0x3ffffff
551 mov d2,%rax
552 and $0x3ffffff,%eax
553 mov %eax,h2
554
555 # d4 += d3 >> 26
556 mov d3,%rax
557 shr $26,%rax
558 add %rax,d4
559 # h3 = d3 & 0x3ffffff
560 mov d3,%rax
561 and $0x3ffffff,%eax
562 mov %eax,h3
563
564 # h0 += (d4 >> 26) * 5
565 mov d4,%rax
566 shr $26,%rax
567 lea (%rax,%rax,4),%rax
568 add %rax,%rbx
569 # h4 = d4 & 0x3ffffff
570 mov d4,%rax
571 and $0x3ffffff,%eax
572 mov %eax,h4
573
574 # h1 += h0 >> 26
575 mov %rbx,%rax
576 shr $26,%rax
577 add %eax,h1
578 # h0 = h0 & 0x3ffffff
579 andl $0x3ffffff,%ebx
580 mov %ebx,h0
581 # Advance m past the two blocks just consumed
582 add $0x20,m
583 dec %rcx
584 jnz .Ldoblock2
585
586 pop %r13
587 pop %r12
588 pop %rbx
589 ret
590 ENDPROC(poly1305_2block_sse2)