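/*
 * Optimized 64-bit memcmp() for PowerPC, with an optional VMX fast path
 * on CPUs that set CPU_FTR_ARCH_207S.
 */
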
#include <asm/ppc_asm.h>
#include <asm/export.h>
#include <asm/ppc-opcode.h>

#define off8	r6
#define off16	r7
#define off24	r8

#define rA	r9
#define rB	r10
#define rC	r11
#define rD	r27
#define rE	r28
#define rF	r29
#define rG	r30
#define rH	r31

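/*
 * On little-endian, use the byte-reversed load forms (lhbrx/lwbrx/ldbrx)
 * so that loaded words compare in memory-byte order, and swap the vperm
 * source operands to match.
 */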
#ifdef __LITTLE_ENDIAN__
#define LH	lhbrx
#define LW	lwbrx
#define LD	ldbrx
#define LVS	lvsr
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRB,_VRA,_VRC
#else
#define LH	lhzx
#define LW	lwzx
#define LD	ldx
#define LVS	lvsl
#define VPERM(_VRT,_VRA,_VRB,_VRC) \
	vperm _VRT,_VRA,_VRB,_VRC
#endif

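/*
 * Length threshold (bytes) above which the VMX path is attempted.
 * ENTER_VMX_OPS/EXIT_VMX_OPS spill r3-r5 and the link register, open a
 * stack frame and call enter_vmx_ops/exit_vmx_ops; enter_vmx_ops leaves
 * its result in cr1 (r3 == 0 means VMX is unusable, fall back to scalar).
 */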
#define VMX_THRESH 4096
#define ENTER_VMX_OPS	\
	mflr	r0;	\
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	enter_vmx_ops; \
	cmpwi	cr1,r3,0; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0

#define EXIT_VMX_OPS \
	mflr	r0; \
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1); \
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1); \
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1); \
	std	r0,16(r1); \
	stdu	r1,-STACKFRAMESIZE(r1); \
	bl	exit_vmx_ops; \
	ld	r0,STACKFRAMESIZE+16(r1); \
	ld	r3,STK_REG(R31)(r1); \
	ld	r4,STK_REG(R30)(r1); \
	ld	r5,STK_REG(R29)(r1); \
	addi	r1,r1,STACKFRAMESIZE; \
	mtlr	r0
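
/*
 * LD_VSR_CROSS16B: load the second aligned quadword for _vaddr, which is
 * not 16-byte aligned, and permute it together with the first quadword to
 * recover the 16 bytes that start at _vaddr.
 *
 * _vmask   is the permute mask generated by LVS for _vaddr.
 * _v1st_qw is the first aligned quadword, already loaded by the caller.
 * _v2nd_qw receives the second aligned quadword, loaded here.
 * _v_res   receives the permuted result.
 */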
#define LD_VSR_CROSS16B(_vaddr,_vmask,_v1st_qw,_v2nd_qw,_v_res) \
	lvx	_v2nd_qw,_vaddr,off16; \
	VPERM(_v_res,_v1st_qw,_v2nd_qw,_vmask)

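/*
 * memcmp() handles two categories:
 * 1) src/dst have the same offset from an 8-byte boundary; these are the
 *    .Lsameoffset_xxxx handlers.
 * 2) src/dst have different offsets from an 8-byte boundary; these are
 *    the .Ldiffoffset_xxxx handlers.
 */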
_GLOBAL_TOC(memcmp)
	cmpdi	cr1,r5,0
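
	/* use the short loop if the src/dst addresses are not at the
	 * same offset from an 8-byte boundary
	 */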
	xor	r6,r3,r4
	andi.	r6,r6,7
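
	/* fall back to the short loop when comparing fewer than 8 bytes
	 * at same-offset addresses
	 */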
	cmpdi	cr6,r5,7

	beq	cr1,.Lzero
	bgt	cr6,.Lno_short

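/* compare bytes one at a time, unrolled 4x; CTR holds the byte count */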
.Lshort:
	mtctr	r5
1:	lbz	rA,0(r3)
	lbz	rB,0(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,1(r3)
	lbz	rB,1(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,2(r3)
	lbz	rB,2(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero
	bdz	.Lzero

	lbz	rA,3(r3)
	lbz	rB,3(r4)
	subf.	rC,rB,rA
	bne	.Lnon_zero

	addi	r3,r3,4
	addi	r4,r4,4

	bdnz	1b

.Lzero:
	li	r3,0
	blr

.Lno_short:
	dcbt	0,r3	/* prefetch both streams */
	dcbt	0,r4
	bne	.Ldiffoffset_8bytes_make_align_start

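/* Same-offset path: first bring s1/s2 up to an 8-byte boundary. */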
.Lsameoffset_8bytes_make_align_start:
	/* attempt to compare up to the next double-word boundary */
	andi.	r6,r3,7		/* sets cr0 for the beq below */
	rlwinm	r6,r3,3,26,28	/* r6 = (r3 & 7) * 8, a bit offset */
	beq	.Lsameoffset_8bytes_aligned
	clrrdi	r3,r3,3
	clrrdi	r4,r4,3
	LD	rA,0,r3
	LD	rB,0,r4
	/* shift out the bytes that precede the buffers */
	sld	rA,rA,r6
	sld	rB,rB,r6
	cmpld	cr0,rA,rB
	srwi	r6,r6,3		/* back to a byte offset */
	bne	cr0,.LcmpAB_lightweight
	subfic	r6,r6,8		/* r6 = bytes just compared */
	subf.	r5,r6,r5
	addi	r3,r3,8
	addi	r4,r4,8
	beq	.Lzero

.Lsameoffset_8bytes_aligned:
	/* now we are aligned to 8 bytes.
	 * Use the .Llong loop if 32 or more bytes remain.
	 */
	cmpdi	cr6,r5,31
	bgt	cr6,.Llong

.Lcmp_lt32bytes:
	/* compare 1 ~ 31 bytes; at least the s1 address is 8-byte aligned */
	cmpdi	cr5,r5,7
	srdi	r0,r5,3
	ble	cr5,.Lcmp_rest_lt8bytes

	/* handle 8 ~ 31 bytes: one double word per iteration */
	clrldi	r5,r5,61	/* r5 = r5 % 8 */
	mtctr	r0
2:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	bdnz	2b

	cmpwi	r5,0
	beq	.Lzero

.Lcmp_rest_lt8bytes:
	/*
	 * Here we have less than 8 bytes to compare. At least s1 is aligned
	 * to 8 bytes, but s2 may not be. We must make sure s2 + 7 doesn't
	 * cross a page boundary, otherwise we might read past the end of the
	 * buffer and trigger a page fault. We use 4K as the conservative
	 * minimum page size; if we detect that case we go to the byte-by-byte
	 * loop.
	 *
	 * Otherwise the next double word is loaded from s1 and s2, and
	 * shifted right to compare the appropriate bits.
	 */
	clrldi	r6,r4,(64-12)	/* r6 = r4 & 0xfff */
	cmpdi	r6,0xff8
	bgt	.Lshort

	subfic	r6,r5,8
	slwi	r6,r6,3		/* r6 = (8 - r5) * 8, bits to discard */
	LD	rA,0,r3
	LD	rB,0,r4
	srd	rA,rA,r6
	srd	rB,rB,r6
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero

.Lnon_zero:
	mr	r3,rC	/* return the byte difference from the short loop */
	blr

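/* Compare 32 bytes per iteration, software pipelined: the compares for
 * one group of loads are spread over cr0/cr1/cr6/cr7 while the next
 * group is being loaded.
 */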
.Llong:
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* try the VMX loop if the length is at least VMX_THRESH */
	cmpldi	cr6,r5,VMX_THRESH
	bge	cr6,.Lsameoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Llong_novmx_cmp:
#endif

	/* at least the s1 address is 8-byte aligned by now */
	li	off8,8
	li	off16,16
	li	off24,24

	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)

	srdi	r0,r5,5		/* 32 bytes per iteration */
	mtctr	r0
	andi.	r5,r5,31	/* r5 = remainder */

	LD	rA,0,r3
	LD	rB,0,r4

	LD	rC,off8,r3
	LD	rD,off8,r4

	LD	rE,off16,r3
	LD	rF,off16,r4

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lfirst32

	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdz	.Lsecond32

	.balign	16

1:	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	LD	rC,off8,r3
	LD	rD,off8,r4
	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	LD	rE,off16,r3
	LD	rF,off16,r4
	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	LD	rG,off24,r3
	LD	rH,off24,r4
	cmpld	cr0,rA,rB
	bne	cr1,.LcmpCD

	addi	r3,r3,32
	addi	r4,r4,32

	bdnz	1b

.Lsecond32:
	cmpld	cr1,rC,rD
	bne	cr6,.LcmpEF

	cmpld	cr6,rE,rF
	bne	cr7,.LcmpGH

	cmpld	cr7,rG,rH
	bne	cr0,.LcmpAB

	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

.Ltail:
	/* restore the non-volatile GPRs and finish the remainder bytes */
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)

	cmpdi	r5,0
	beq	.Lzero
	b	.Lshort

.Lfirst32:
	/* CTR expired on the very first 32 bytes: finish their compares */
	cmpld	cr1,rC,rD
	cmpld	cr6,rE,rF
	cmpld	cr7,rG,rH

	bne	cr0,.LcmpAB
	bne	cr1,.LcmpCD
	bne	cr6,.LcmpEF
	bne	cr7,.LcmpGH

	b	.Ltail

.LcmpAB:
	li	r3,1
	bgt	cr0,.Lout
	li	r3,-1
	b	.Lout

.LcmpCD:
	li	r3,1
	bgt	cr1,.Lout
	li	r3,-1
	b	.Lout

.LcmpEF:
	li	r3,1
	bgt	cr6,.Lout
	li	r3,-1
	b	.Lout

.LcmpGH:
	li	r3,1
	bgt	cr7,.Lout
	li	r3,-1

.Lout:
	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	blr

.LcmpAB_lightweight:	/* skip restoring the non-volatile GPRs */
	li	r3,1
	bgtlr
	li	r3,-1
	blr

#ifdef CONFIG_ALTIVEC
.Lsameoffset_vmx_cmp:
	/* Entered with the src/dst addresses at the same offset from an
	 * 8-byte boundary.
	 *
	 * Optimization: memcmp() tends to fail within the first 32 bytes.
	 * Before issuing VMX instructions, which incur the 32x128-bit VMX
	 * register save/restore penalty, compare the first 32 bytes with
	 * scalar loads so the common early-mismatch case never pays it.
	 */
	li	r0,4
	mtctr	r0
.Lsameoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Lsameoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Llong_novmx_cmp	/* enter_vmx_ops refused VMX */

3:
	/* check whether r4 is at the same offset as r3 from a 16-byte
	 * boundary
	 */
	xor	r0,r3,r4
	andi.	r0,r0,0xf
	bne	.Ldiffoffset_vmx_cmp_start

	/* align r3 (and therefore r4) up to a 16-byte boundary */
	andi.	rA,r3,8
	LD	rA,0,r3
	beq	4f
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	addi	r5,r5,-8

	beq	cr0,4f

	/* save and restore cr0 across EXIT_VMX_OPS */
	mfocrf	r5,128
	EXIT_VMX_OPS
	mtocrf	128,r5
	b	.LcmpAB_lightweight

4:
	/* compare 32 bytes per iteration */
	srdi	r0,r5,5
	mtctr	r0
	clrldi	r5,r5,59	/* r5 = r5 % 32 */
	li	off16,16

	.balign	16
5:
	lvx	v0,0,r3
	lvx	v1,0,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,7f		/* mismatch in the first 16 bytes */
	lvx	v0,off16,r3
	lvx	v1,off16,r4
	VCMPEQUD_RC(v0,v0,v1)
	bnl	cr6,6f		/* mismatch in the second 16 bytes */
	addi	r3,r3,32
	addi	r4,r4,32
	bdnz	5b

	EXIT_VMX_OPS
	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

6:
	addi	r3,r3,16
	addi	r4,r4,16

7:
	/* redo the differing 16 bytes with scalar loads to build the
	 * return value
	 */
	EXIT_VMX_OPS
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	li	off8,8
	bne	cr0,.LcmpAB_lightweight

	LD	rA,off8,r3
	LD	rB,off8,r4
	cmpld	cr0,rA,rB
	bne	cr0,.LcmpAB_lightweight
	b	.Lzero
#endif

.Ldiffoffset_8bytes_make_align_start:
	/* now try to align s1 to 8 bytes; the record form sets cr0 for
	 * the beq below (r6 = (r3 & 7) * 8, a bit offset)
	 */
	rlwinm.	r6,r3,3,26,28
	beq	.Ldiffoffset_align_s1_8bytes

	clrrdi	r3,r3,3
	LD	rA,0,r3
	LD	rB,0,r4		/* unaligned load */
	sld	rA,rA,r6
	srd	rA,rA,r6	/* clear the bytes that precede the s1 buffer */
	srd	rB,rB,r6	/* drop the bytes past the s1 boundary */
	cmpld	cr0,rA,rB
	srwi	r6,r6,3
	bne	cr0,.LcmpAB_lightweight

	subfic	r6,r6,8		/* r6 = bytes just compared */
	subf.	r5,r6,r5
	addi	r3,r3,8
	add	r4,r4,r6

	beq	.Lzero

.Ldiffoffset_align_s1_8bytes:
	/* now s1 is aligned to 8 bytes */
#ifdef CONFIG_ALTIVEC
BEGIN_FTR_SECTION
	/* only use VMX when the length is at least VMX_THRESH */
	cmpdi	cr5,r5,VMX_THRESH
	bge	cr5,.Ldiffoffset_vmx_cmp
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)

.Ldiffoffset_novmx_cmp:
#endif

	cmpdi	cr5,r5,31
	ble	cr5,.Lcmp_lt32bytes

#ifdef CONFIG_ALTIVEC
	b	.Llong_novmx_cmp
#else
	b	.Llong
#endif

#ifdef CONFIG_ALTIVEC
.Ldiffoffset_vmx_cmp:
	/* perform a 32-byte pre-check before enabling VMX operations, as
	 * in the same-offset case
	 */
	li	r0,4
	mtctr	r0
.Ldiffoffset_prechk_32B_loop:
	LD	rA,0,r3
	LD	rB,0,r4
	cmpld	cr0,rA,rB
	addi	r3,r3,8
	addi	r4,r4,8
	bne	cr0,.LcmpAB_lightweight
	addi	r5,r5,-8
	bdnz	.Ldiffoffset_prechk_32B_loop

	ENTER_VMX_OPS
	beq	cr1,.Ldiffoffset_novmx_cmp	/* enter_vmx_ops refused VMX */

.Ldiffoffset_vmx_cmp_start:
	/* first try to align r3 to 16 bytes */
	andi.	r6,r3,0xf
	li	off16,16
	beq	.Ldiffoffset_vmx_s1_16bytes_align

	LVS	v3,0,r3
	LVS	v4,0,r4

	lvx	v5,0,r3
	lvx	v6,0,r4
	LD_VSR_CROSS16B(r3,v3,v5,v7,v9)
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)

	VCMPEQUB_RC(v7,v9,v10)
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	subfic	r6,r6,16	/* advance to the 16-byte boundary of r3 */
	subf	r5,r6,r5
	add	r3,r3,r6
	add	r4,r4,r6

.Ldiffoffset_vmx_s1_16bytes_align:
	/* now s1 is aligned to 16 bytes */
	lvx	v6,0,r4		/* first aligned qw of the r4 stream */
	LVS	v4,0,r4		/* permute mask for the unaligned r4 stream */
	srdi	r6,r5,5		/* loop for 32 bytes each time */
	clrldi	r5,r5,59	/* r5 = r5 % 32 */
	mtctr	r6

	.balign	16
.Ldiffoffset_vmx_32bytesloop:
	/* the first qw of r4 was saved in v6 */
	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8	/* carry the loaded qw into the next step */
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	lvx	v9,0,r3
	LD_VSR_CROSS16B(r4,v4,v6,v8,v10)
	VCMPEQUB_RC(v7,v9,v10)
	vor	v6,v8,v8
	bnl	cr6,.Ldiffoffset_vmx_diff_found

	addi	r3,r3,16
	addi	r4,r4,16

	bdnz	.Ldiffoffset_vmx_32bytesloop

	EXIT_VMX_OPS

	cmpdi	r5,0
	beq	.Lzero
	b	.Lcmp_lt32bytes

.Ldiffoffset_vmx_diff_found:
	EXIT_VMX_OPS
	/* the difference lies within the next 16 bytes; let the scalar
	 * tail locate it and compute the return value
	 */
	li	r5,16
	b	.Lcmp_lt32bytes

#endif
EXPORT_SYMBOL(memcmp)