1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
/*
 * Shared definitions for the vpmsum CRC core below.
 *
 * The including file is expected to define CRC_FUNCTION_NAME and,
 * for bit-reflected CRCs, REFLECT, plus the constant tables referenced
 * below (.constants, .short_constants, .barrett_constants and, when
 * byte swapping is needed, .byteswap_constant).
 */
#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

/*
 * Largest number of bytes folded per outer-loop iteration; this bounds
 * the size of the .constants table (one 16-byte constant per 128-byte
 * block -- see the table-offset computation in the function body).
 */
#define MAX_SIZE	32768

	.text

/*
 * Swap input bytes only when the memory byte order disagrees with the
 * bit ordering of the computation: big-endian data with a reflected
 * CRC, or little-endian data with a non-reflected CRC.
 */
#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif

/*
 * Byte offsets 16..112 kept in non-volatile GPRs so they can serve as
 * index registers for the lvx/stvx pairs throughout the function.
 */
#define off16		r25
#define off32		r26
#define off48		r27
#define off64		r28
#define off80		r29
#define off96		r30
#define off112		r31

/* Fold constants for the current iteration (loaded from .constants). */
#define const1		v24
#define const2		v25

#define byteswap	v26	/* lane permute pattern for VPERM below */
#define mask_32bit	v27	/* low 32 bits set, rest clear */
#define mask_64bit	v28	/* low 64 bits set, rest clear */
#define zeroes		v29	/* always zero */

/* VPERM collapses to nothing when no byte swapping is required. */
#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm	A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif
64
/*
 * Compute a CRC over a buffer using the POWER8 vector polynomial
 * multiply-sum instructions (vpmsumd/vpmsumw), folding 128 bytes per
 * inner-loop iteration across eight independent accumulator streams.
 *
 * In:	r3 = initial CRC value
 *	r4 = data pointer
 *	r5 = length in bytes
 * Out:	r3 = resulting CRC
 *
 * NOTE(review): data is fetched with lvx, which ignores the low four
 * address bits, and the tail paths only consume whole 16-byte vectors --
 * so this presumably requires r4 16-byte aligned and r5 a multiple of
 * 16; confirm against the callers/wrapper.
 */
FUNC_START(CRC_FUNCTION_NAME)
	/*
	 * Save the non-volatile GPRs we use (the off* aliases) below the
	 * stack pointer -- relies on the ABI-guaranteed area under r1
	 * (no frame is established); TODO confirm vs. the target ABI.
	 */
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	/* Constant byte offsets used as lvx/stvx index registers. */
	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	li	r0,0		/* r0 = 0: first pass through the outer loop */

	/*
	 * Save the non-volatile VMX registers v20-v29 in the 10*16 bytes
	 * directly below the GPR save slots (r7 = r6 + 128).
	 */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	mr	r10,r3		/* keep the initial CRC for the len == 0 case */

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1		/* v0 = all ones */

	vsldoi	mask_32bit,zeroes,v0,4	/* select the low 32 bits */
	vsldoi	mask_64bit,zeroes,v0,8	/* select the low 64 bits */

	/*
	 * Move the initial CRC into v8 and shift it into the lane where
	 * it will line up with the first data vector (position differs
	 * between the reflected and non-reflected algorithms).
	 */
	vxor	v8,v8,v8
	MTVRD(v8, R3)
#ifdef REFLECT
	vsldoi	v8,zeroes,v8,8
#else
	vsldoi	v8,v8,zeroes,4
#endif

#ifdef BYTESWAP_DATA
	/* Load the vperm pattern used to byte swap every data vector. */
	addis	r3,r2,.byteswap_constant@toc@ha
	addi	r3,r3,.byteswap_constant@toc@l

	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	/* Buffers under 256 bytes take the simple 16-bytes-at-a-time path. */
	cmpdi	r5,256
	blt	.Lshort

	/* r6 = length rounded down to a multiple of 128 (clear low 7 bits). */
	rldicr	r6,r5,0,56

	/*
	 * Outer loop: process at most MAX_SIZE bytes per iteration, since
	 * the .constants table only covers folds of up to MAX_SIZE bytes.
	 * r7 = bytes in this chunk, r6 = bytes remaining afterwards.
	 */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7		/* r9 = MAX_SIZE, kept for the offset calc */
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6
2:	subf	r6,r7,r6

	/* r7 = number of 128-byte blocks in this chunk. */
	srdi	r7,r7,7

	/*
	 * The .constants table holds one 16-byte fold constant per
	 * 128-byte block, laid out for a MAX_SIZE-byte fold; start
	 * r8 = MAX_SIZE/8 - blocks*16 bytes into it so that the final
	 * block of this chunk uses the last constant.
	 */
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9

	/* ctr counts the remaining 128-byte blocks (first one is warm-up). */
	addi	r7,r7,-1
	mtctr	r7

	addis	r3,r2,.constants@toc@ha
	addi	r3,r3,.constants@toc@l

	/* r3 = position in the constants table for this chunk size. */
	add	r3,r3,r8

	/* Eight independent fold accumulators, zeroed per chunk. */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * On the second and later outer iterations the next 128 bytes are
	 * already sitting in v16-v23 (xor'd in at the bottom of the loop),
	 * so skip the initial load.
	 */
	cmpdi	r0,1
	beq	2f

	/* Warm-up: load the first 128 bytes into v16-v23. */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* Fold the initial CRC into the first data vector. */
	vxor	v16,v16,v8

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/*
	 * Second warm-up stage of the software pipeline: multiply the
	 * current data by const1 into v8-v15 while loading the next 128
	 * bytes.  The "ori r2,r2,0" no-ops appear to pad instruction
	 * dispatch groups for scheduling -- NOTE(review): POWER8 grouping
	 * assumption, confirm before removing.
	 */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down

	/*
	 * Main loop, fully pipelined: accumulate the previous multiply
	 * results (v8-v15) into v0-v7, issue the next multiplies, and
	 * load the following 128 bytes, alternating const1/const2 so the
	 * next constant load overlaps the multiplies.
	 */
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx	const2,0,r3
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/* First pipeline drain stage: one more multiply round, no loads. */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Final drain: fold the last multiply results into v0-v7. */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

#ifdef REFLECT
	/*
	 * In the reflected case the vpmsumd results sit 32 bits away from
	 * where the next fold expects them; shift each accumulator by one
	 * word to realign -- NOTE(review): alignment rationale inferred
	 * from the vsldoi pattern, confirm against the constants layout.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4
#endif

	/* xor the next 128 bytes of data into the accumulators ... */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	/* ... leaving the combined state in v16-v23 for the next pass. */
	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	/*
	 * Loop back while full chunks remain.  The 128 bytes just merged
	 * into v16-v23 were consumed ahead of the count, so add them back
	 * to r6 (when the loop exits they are handled by the tail below).
	 * r0 = 1 makes the next pass skip its initial load.
	 */
	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b

	/* Tail: r5 = bytes beyond the last full 128-byte block (< 128). */
	andi.	r5,r5,127

	/*
	 * Index into the final-fold constants so the remaining vectors
	 * line up with the end of the table.
	 */
	subfic	r6,r5,128
	add	r3,r3,r6

	/* ctr = number of extra 16-byte pieces after the 128 in v16-v23. */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Fold the 128 bytes held in v16-v23 down with the per-vector
	 * final constants.
	 */
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Handle up to seven remaining 16-byte vectors of input. */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Reduce the eight partial results down to a single vector, v0. */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4

.Lbarrett_reduction:
	/* Barrett reduction of the folded value in v0 to the final CRC. */
	addis	r3,r2,.barrett_constants@toc@ha
	addi	r3,r3,.barrett_constants@toc@l

	lvx	const1,0,r3
	lvx	const2,off16,r3

	/* Fold the two 64-bit halves of v0 together. */
	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1

#ifdef REFLECT
	/*
	 * Shift left one bit -- NOTE(review): presumably compensates for
	 * the reflected-domain bit position of the vpmsum result; confirm
	 * against the Barrett constants' derivation.
	 */
	vspltisb v1,1
	vsl	v0,v0,v1
#endif

	vand	v0,v0,mask_64bit
#ifndef REFLECT
	/*
	 * Classic Barrett step: q = floor(v0 / p) approximated via
	 * multiplication by const1 (the precomputed reciprocal), then
	 * subtract q * p (const2) to leave the remainder.
	 */
	VPMSUMD(v1,v0,const1)
	vsldoi	v1,zeroes,v1,8
	VPMSUMD(v1,v1,const2)
	vxor	v0,v0,v1

	/* Move the 32-bit remainder into the position MFVRD reads. */
	vsldoi	v0,v0,zeroes,8
#else
	/*
	 * Reflected Barrett step: operate on the low 32 bits at each
	 * stage (mask_32bit) with the reflected reciprocal/polynomial.
	 */
	vand	v1,v0,mask_32bit
	VPMSUMD(v1,v1,const1)
	vand	v1,v1,mask_32bit
	VPMSUMD(v1,v1,const2)
	vxor	v0,v0,v1

	/* Position the 32-bit result for the move to r3. */
	vsldoi	v0,v0,zeroes,4
#endif

	/* Return the CRC in r3. */
	MFVRD(R3, v0)

.Lout:
	/* Restore the non-volatile VMX and GPR registers saved on entry. */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	/*
	 * The chunk was exactly one warm-up deep (ctr hit zero right after
	 * the first load): do a single multiply round and skip straight to
	 * the final drain.
	 */
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

.Lshort:
	/* Short path for 0 < len < 256: one constant per 16-byte vector. */
	cmpdi	r5,0
	beq	.Lzero

	addis	r3,r2,.short_constants@toc@ha
	addi	r3,r3,.short_constants@toc@l

	/* Start (256 - len) bytes into the table so the end lines up. */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* ctr = number of 16-byte vectors to process. */
	srdi	r7,r5,4
	mtctr	r7

	/* Two alternating accumulators to break the xor dependency chain. */
	vxor	v19,v19,v19
	vxor	v20,v20,v20

	/* First vector also absorbs the initial CRC (v8). */
	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	/* Second group of up to eight vectors (bytes 128..255). */
	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)

	/*
	 * xor ladder: the bdz above jumps in at .LvN, accumulating only
	 * the vectors that were actually processed (fall-through sums all).
	 */
.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	/* Combine the two accumulators and reduce. */
	vxor	v0,v19,v20

	b	.Lbarrett_reduction

.Lzero:
	/* Zero-length input: return the initial CRC unchanged. */
	mr	r3,r10
	b	.Lout

FUNC_END(CRC_FUNCTION_NAME)