//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Reference paper titled "Fast CRC Computation for Generic Polynomials Using
// PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.cpu		generic+crypto

	// Arguments and loop state are kept in callee-saved registers so that
	// they survive the conditional NEON yield below (frame_push/frame_pop
	// preserve x19-x22).
	init_crc	.req	w19
	buf		.req	x20
	len		.req	x21
	fold_consts_ptr	.req	x22

	fold_consts	.req	v10

	ad		.req	v14

	k00_16		.req	v15
	k32_48		.req	v16

	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22
	t9		.req	v23

	perm1		.req	v24
	perm2		.req	v25
	perm3		.req	v26
	perm4		.req	v27

	bd1		.req	v28
	bd2		.req	v29
	bd3		.req	v30
	bd4		.req	v31

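// The algorithm below is instantiated twice: 'p64' for CPUs that implement
// the Crypto Extensions 64x64 -> 128 bit PMULL instruction, and 'p8' for
// CPUs that only have the baseline NEON 8x8 -> 16 bit polynomial multiply,
// from which the wide multiplication must be synthesized.  The
// __pmull_init_* and __pmull_pre_* hooks keep the shared folding macros
// agnostic of the variant in use; for p64 they are no-ops.
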
	.macro		__pmull_init_p64
	.endm

	.macro		__pmull_pre_p64, bd
	.endm

	.macro		__pmull_init_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors, which rotate the bytes of each
	// 64-bit half of the operand by 1, 2 and 3 positions
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		perm4.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, perm4.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		perm4.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		perm4.2d, perm1.2d, #40
	.endm

	.macro		__pmull_pre_p8, bd
	tbl		bd1.16b, {\bd\().16b}, perm1.16b
	tbl		bd2.16b, {\bd\().16b}, perm2.16b
	tbl		bd3.16b, {\bd\().16b}, perm3.16b
	tbl		bd4.16b, {\bd\().16b}, perm4.16b
	.endm
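
// Subroutine performing the bulk of a 64x64 -> 128 bit polynomial multiply
// out of 8x8 -> 16 bit PMULLs, for CPUs without the 64-bit PMULL instruction,
// following the technique described by Câmara et al. in "Fast Software
// Polynomial Multiplication on ARM Processors Using the NEON Engine": partial
// products are taken at eight different byte offsets, then masked, recombined
// and shifted into place before being XORed together.  On entry, 'ad' holds
// one operand and bd1-bd4 hold byte-rotated copies of the other (prepared by
// __pmull_pre_p8); on return, t4 and t6 hold partial results that the
// __pmull_p8 macro folds into the final product.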

__pmull_p8_core:
.L__pmull_p8_core:
	ext		t4.8b, ad.8b, ad.8b, #1			// A1
	ext		t5.8b, ad.8b, ad.8b, #2			// A2
	ext		t6.8b, ad.8b, ad.8b, #3			// A3

	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1*B
	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2*B
	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3*B
	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
	b		0f

.L__pmull_p8_core2:
	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
	tbl		t6.16b, {ad.16b}, perm3.16b		// A3

	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1*B
	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2*B
	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3*B
	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4

0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
	eor		t5.16b, t5.16b, t7.16b			// M = G + H
	eor		t6.16b, t6.16b, t9.16b			// N = I + J

	uzp1		t8.2d, t4.2d, t5.2d
	uzp2		t4.2d, t4.2d, t5.2d
	uzp1		t7.2d, t6.2d, t3.2d
	uzp2		t6.2d, t6.2d, t3.2d

	// t4 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t8.16b, t8.16b, t4.16b
	and		t4.16b, t4.16b, k32_48.16b

	// t6 = (N) (P4 + P5) << 24
	// t7 = (K) (P6 + P7) << 32
	eor		t7.16b, t7.16b, t6.16b
	and		t6.16b, t6.16b, k00_16.16b

	eor		t8.16b, t8.16b, t4.16b
	eor		t7.16b, t7.16b, t6.16b

	zip2		t5.2d, t8.2d, t4.2d
	zip1		t4.2d, t8.2d, t4.2d
	zip2		t3.2d, t7.2d, t6.2d
	zip1		t6.2d, t7.2d, t6.2d

	ext		t4.16b, t4.16b, t4.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t6.16b, t6.16b, t6.16b, #13
	ext		t3.16b, t3.16b, t3.16b, #12

	eor		t4.16b, t4.16b, t5.16b
	eor		t6.16b, t6.16b, t3.16b
	ret
ENDPROC(__pmull_p8_core)

	.macro		__pmull_p8, rq, ad, bd, i
	.ifnc		\bd, fold_consts
	.err
	.endif
	mov		ad.16b, \ad\().16b
	.ifb		\i
	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A*B
	.else
	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A*B
	.endif

	bl		.L__pmull_p8_core\i

	eor		\rq\().16b, \rq\().16b, t4.16b
	eor		\rq\().16b, \rq\().16b, t6.16b
	.endm
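
// Folding an accumulator forward over N data bytes amounts to multiplying it
// by x^(8*N) mod G(x) in GF(2)[x].  Each 16-byte fold constant vector holds
// two such residues: one applied to the low 64 bits of the accumulator and
// one to the high 64 bits (which sit 8 bytes further along, hence the extra
// x^64), so one pair of PMULLs plus XORs into the new data advances the
// accumulator by the chosen distance.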

	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
	// into reg1, reg2.
	.macro		fold_32_bytes, p, reg1, reg2
	ldp		q11, q12, [buf], #0x20

	__pmull_\p	v8, \reg1, fold_consts, 2
	__pmull_\p	\reg1, \reg1, fold_consts

CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	__pmull_\p	v9, \reg2, fold_consts, 2
	__pmull_\p	\reg2, \reg2, fold_consts

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm

	// Fold src_reg into dst_reg, optionally loading the next fold constants
	.macro		fold_16_bytes, p, src_reg, dst_reg, load_next_consts
	__pmull_\p	v8, \src_reg, fold_consts
	__pmull_\p	\src_reg, \src_reg, fold_consts, 2
	.ifnb		\load_next_consts
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts
	.endif
	eor		\dst_reg\().16b, \dst_reg\().16b, v8.16b
	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
	.endm

	.macro		__pmull_p64, rd, rn, rm, n
	.ifb		\n
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.else
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endif
	.endm
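
// For p64, a 64x64 -> 128 bit carryless multiply maps directly onto a single
// PMULL (low halves) or PMULL2 (high halves) instruction, which is why the
// __pmull_init_p64 and __pmull_pre_p64 hooks above are empty.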

	.macro		crc_t10dif_pmull, p
	frame_push	4, 128

	mov		init_crc, w0
	mov		buf, x1
	mov		len, x2

	__pmull_init_\p

	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp		len, #256
	b.lt		.Lless_than_256_bytes_\@

	adr_l		fold_consts_ptr, .Lfold_across_128_bytes_consts

	// Load the first 128 data bytes.  Byte swapping is necessary to make
	// the bit order match the polynomial coefficient order.
	ldp		q0, q1, [buf]
	ldp		q2, q3, [buf, #0x20]
	ldp		q4, q5, [buf, #0x40]
	ldp		q6, q7, [buf, #0x60]
	add		buf, buf, #0x80
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	rev64		v2.16b, v2.16b			)
CPU_LE(	rev64		v3.16b, v3.16b			)
CPU_LE(	rev64		v4.16b, v4.16b			)
CPU_LE(	rev64		v5.16b, v5.16b			)
CPU_LE(	rev64		v6.16b, v6.16b			)
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v8.16b, #0
	mov		v8.h[7], init_crc
	eor		v0.16b, v0.16b, v8.16b
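
	// Injecting the initial CRC here works because CRC folding is linear
	// over GF(2): XORing init_crc into the first 16 message bits is
	// equivalent to seeding the CRC state, so no separate running state
	// needs to be carried through the folds.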

	// Load the constants for folding across 128 bytes.
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_pre_\p	fold_consts

	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
	// 128 to simplify the termination condition of the following loop.
	sub		len, len, #256

	// While >= 128 data bytes remain (not counting v0-v7), fold the 128
	// bytes v0-v7 into them, storing the result back into v0-v7.
.Lfold_128_bytes_loop_\@:
	fold_32_bytes	\p, v0, v1
	fold_32_bytes	\p, v2, v3
	fold_32_bytes	\p, v4, v5
	fold_32_bytes	\p, v6, v7

	subs		len, len, #128
	b.lt		.Lfold_128_bytes_loop_done_\@

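	// If the scheduler has work pending, briefly drop kernel-mode NEON:
	// the live fold state in v0-v7 is spilled to the 128 bytes of stack
	// reserved by frame_push, and everything derived from fold_consts is
	// recomputed afterwards, since yielding may have let another context
	// clobber the NEON register file.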
	if_will_cond_yield_neon
	stp		q0, q1, [sp, #.Lframe_local_offset]
	stp		q2, q3, [sp, #.Lframe_local_offset + 32]
	stp		q4, q5, [sp, #.Lframe_local_offset + 64]
	stp		q6, q7, [sp, #.Lframe_local_offset + 96]
	do_cond_yield_neon
	ldp		q0, q1, [sp, #.Lframe_local_offset]
	ldp		q2, q3, [sp, #.Lframe_local_offset + 32]
	ldp		q4, q5, [sp, #.Lframe_local_offset + 64]
	ldp		q6, q7, [sp, #.Lframe_local_offset + 96]
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_init_\p
	__pmull_pre_\p	fold_consts
	endif_yield_neon

	b		.Lfold_128_bytes_loop_\@

.Lfold_128_bytes_loop_done_\@:

	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.

	// Fold across 64 bytes.
	add		fold_consts_ptr, fold_consts_ptr, #16
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts
	fold_16_bytes	\p, v0, v4
	fold_16_bytes	\p, v1, v5
	fold_16_bytes	\p, v2, v6
	fold_16_bytes	\p, v3, v7, 1
	// Fold across 32 bytes.
	fold_16_bytes	\p, v4, v6
	fold_16_bytes	\p, v5, v7, 1
	// Fold across 16 bytes.
	fold_16_bytes	\p, v6, v7

	// Add 128 to get the correct number of data bytes remaining in 0...127
	// (not counting v7), following the previous extra subtraction by 128.
	// Then subtract 16 to simplify the termination condition of the
	// following loop.
	adds		len, len, #(128-16)

	// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
	// into them, storing the result back into v7.
	b.lt		.Lfold_16_bytes_loop_done_\@
.Lfold_16_bytes_loop_\@:
	__pmull_\p	v8, v7, fold_consts
	__pmull_\p	v7, v7, fold_consts, 2
	eor		v7.16b, v7.16b, v8.16b
	ldr		q0, [buf], #16
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b
	subs		len, len, #16
	b.ge		.Lfold_16_bytes_loop_\@

.Lfold_16_bytes_loop_done_\@:

	// Add 16 to get the correct number of data bytes remaining in 0...15
	// (not counting v7), following the previous extra subtraction by 16.
	adds		len, len, #16
	b.eq		.Lreduce_final_16_bytes_\@

.Lhandle_partial_segment_\@:
	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
	// 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
	// do this without needing a fold constant for each possible 'len',
	// redivide the bytes into a first chunk of 'len' bytes and a second
	// chunk of 16 bytes, then fold the first chunk into the second.

	// v0 = last 16 original data bytes
	add		buf, buf, len
	ldr		q0, [buf, #-16]
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)

	// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
	adr_l		x4, .Lbyteshift_table + 16
	sub		x4, x4, len
	ld1		{v2.16b}, [x4]
	tbl		v1.16b, {v7.16b}, v2.16b

	// v3 = first chunk: v7 right-shifted by '16-len' bytes.
	movi		v3.16b, #0x80
	eor		v2.16b, v2.16b, v3.16b
	tbl		v3.16b, {v7.16b}, v2.16b

	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
	sshr		v2.16b, v2.16b, #7

	// v2 = second chunk: 'len' bytes from v0 (low-order bytes), then
	// '16-len' bytes from v1 (high-order bytes).
	bsl		v2.16b, v1.16b, v0.16b

	// Fold the first chunk into the second chunk, storing the result in v7.
	__pmull_\p	v0, v3, fold_consts
	__pmull_\p	v7, v3, fold_consts, 2
	eor		v7.16b, v7.16b, v0.16b
	eor		v7.16b, v7.16b, v2.16b

.Lreduce_final_16_bytes_\@:
	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.

	movi		v2.16b, #0		// init zero register

	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts

	// Fold the high 64 bits into the low 64 bits, while also multiplying by
	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	// whose low 48 bits are 0.
	ext		v0.16b, v2.16b, v7.16b, #8
	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
	eor		v0.16b, v0.16b, v7.16b	// + low bits * x^64

	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
	mov		v0.s[3], v2.s[0]	// zero high 32 bits
	__pmull_\p	v1, v1, fold_consts	// high 32 bits * x^48 * (x^48 mod G(x))
	eor		v0.16b, v0.16b, v1.16b	// + low bits

	// Load G(x) and floor(x^48 / G(x)).
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_pre_\p	fold_consts
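
	// At this point v0 holds x^48 * U(x), where U(x) is at most 48 bits
	// wide and U(x) ≡ x^16 * M(x) (mod G(x)), so the final CRC is
	// U(x) mod G(x).  Barrett reduction computes this as
	//	U mod G = U + G * floor(U / G)		(addition is XOR)
	// where floor(U / G) is derived from the high bits of U and the
	// precomputed constant floor(x^48 / G(x)), so that only carryless
	// multiplies and shifts are needed.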

	// Use Barrett reduction to compute the final CRC value.
	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
	ushr		v1.2d, v1.2d, #32	// /= x^32
	__pmull_\p	v1, v1, fold_consts	// *= G(x)
	ushr		v0.2d, v0.2d, #48
	eor		v0.16b, v0.16b, v1.16b	// + low 16 nonzero bits
	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.

	umov		w0, v0.h[0]
	frame_pop
	ret

.Lless_than_256_bytes_\@:
	// Checksumming a buffer of length 16...255 bytes

	adr_l		fold_consts_ptr, .Lfold_across_16_bytes_consts

	// Load the first 16 data bytes.
	ldr		q7, [buf], #0x10
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v0.16b, #0
	mov		v0.h[7], init_crc
	eor		v7.16b, v7.16b, v0.16b

	// Load the fold-across-16-bytes constants.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts

	cmp		len, #16
	b.eq		.Lreduce_final_16_bytes_\@	// len == 16
	subs		len, len, #32
	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
	add		len, len, #16
	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
	.endm

//
// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
ENTRY(crc_t10dif_pmull_p8)
	crc_t10dif_pmull	p8
ENDPROC(crc_t10dif_pmull_p8)

	.align		5
//
// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
ENTRY(crc_t10dif_pmull_p64)
	crc_t10dif_pmull	p64
ENDPROC(crc_t10dif_pmull_p64)

	.section	".rodata", "a"
	.align		4

// Fold constants precomputed from the polynomial 0x18bb7:
// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
.Lfold_across_128_bytes_consts:
	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
// .Lfold_across_64_bytes_consts:
	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
// .Lfold_across_32_bytes_consts:
	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
.Lfold_across_16_bytes_consts:
	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
// .Lfinal_fold_consts:
	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
// .Lbarrett_reduction_consts:
	.quad		0x0000000000018bb7	// G(x)
	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))
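
// The x^N mod G(x) constants above can be regenerated with a few lines of C;
// a minimal sketch (not part of the kernel, helper name hypothetical):
//
//	static u64 xpow_mod(unsigned int n)
//	{
//		u64 r = 1;			// r = x^0
//
//		while (n--) {
//			r <<= 1;		// r *= x
//			if (r & (1ULL << 16))
//				r ^= 0x18bb7;	// subtract G(x) (XOR in GF(2))
//		}
//		return r;			// x^n mod G(x)
//	}
//
// e.g. xpow_mod(8*128) == 0x6123 and xpow_mod(8*128+64) == 0x2295, matching
// the first two fold constants.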

// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
.Lbyteshift_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0, 0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8, 0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0
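
// Worked example: for len == 2 the vector loaded is .Lbyteshift_table[14..29]
// = { 0x8e, 0x8f, 0x0, 0x1, ..., 0xd }.  Used directly as TBL indices, the
// two 0x8x bytes select zero, so the result is v7 shifted left by 2 bytes;
// after XOR with 0x80 it becomes { 0xe, 0xf, 0x80, 0x81, ... }, which yields
// v7 shifted right by 14 (= 16 - len) bytes instead.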