tools/testing/selftests/powerpc/copyloops/memcpy_power7.S

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  *
   4  * Copyright (C) IBM Corporation, 2012
   5  *
   6  * Author: Anton Blanchard <anton@au.ibm.com>
   7  */
   8 #include <asm/ppc_asm.h>
   9 
  10 #ifndef SELFTEST_CASE
  11 /* 0 == don't use VMX, 1 == use VMX */
  12 #define SELFTEST_CASE   0
  13 #endif
  14 
  15 #ifdef __BIG_ENDIAN__
  16 #define LVS(VRT,RA,RB)          lvsl    VRT,RA,RB
  17 #define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRA,VRB,VRC
  18 #else
  19 #define LVS(VRT,RA,RB)          lvsr    VRT,RA,RB
  20 #define VPERM(VRT,VRA,VRB,VRC)  vperm   VRT,VRB,VRA,VRC
  21 #endif
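      /*
       * On little-endian we use lvsr and swap the vperm source operands so
       * that the same VPERM(dst, prev, next, ctrl) sequence merges two
       * adjacent source quadwords correctly on both endiannesses.
       */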
  22 
  23 _GLOBAL(memcpy_power7)
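              /*
               * cr0 = (len < 16) selects the short tail copy, cr1 = (len > 4096)
               * gates the VMX path, and the destination is saved below the
               * stack pointer so it can be reloaded as the return value.
               */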
  24         cmpldi  r5,16
  25         cmpldi  cr1,r5,4096
  26         std     r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
  27         blt     .Lshort_copy
  28 
  29 #ifdef CONFIG_ALTIVEC
  30 test_feature = SELFTEST_CASE
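      /*
       * test_feature lets the selftest build decide whether the feature
       * section below is assembled, so each SELFTEST_CASE value exercises
       * one of the two copy paths.
       */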
  31 BEGIN_FTR_SECTION
  32         bgt     cr1, .Lvmx_copy
  33 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
  34 #endif
  35 
  36 .Lnonvmx_copy:
  37         /* Get the source 8B aligned */
  38         neg     r6,r4
  39         mtocrf  0x01,r6
  40         clrldi  r6,r6,(64-3)
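              /*
               * r6 = (-src) & 7 is the byte count needed to reach 8B
               * alignment; its low bits are also in cr7, so the bf tests
               * below (+3/+2/+1) pick the 1, 2 and 4 byte alignment copies
               * before r6 is subtracted from the length at 3:.
               */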
  41 
  42         bf      cr7*4+3,1f
  43         lbz     r0,0(r4)
  44         addi    r4,r4,1
  45         stb     r0,0(r3)
  46         addi    r3,r3,1
  47 
  48 1:      bf      cr7*4+2,2f
  49         lhz     r0,0(r4)
  50         addi    r4,r4,2
  51         sth     r0,0(r3)
  52         addi    r3,r3,2
  53 
  54 2:      bf      cr7*4+1,3f
  55         lwz     r0,0(r4)
  56         addi    r4,r4,4
  57         stw     r0,0(r3)
  58         addi    r3,r3,4
  59 
  60 3:      sub     r5,r5,r6
  61         cmpldi  r5,128
  62         blt     5f
  63 
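              /*
               * Set up a stack frame and save LR and the non-volatile GPRs
               * before the unrolled 128B-per-iteration loop below.
               */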
  64         mflr    r0
  65         stdu    r1,-STACKFRAMESIZE(r1)
  66         std     r14,STK_REG(R14)(r1)
  67         std     r15,STK_REG(R15)(r1)
  68         std     r16,STK_REG(R16)(r1)
  69         std     r17,STK_REG(R17)(r1)
  70         std     r18,STK_REG(R18)(r1)
  71         std     r19,STK_REG(R19)(r1)
  72         std     r20,STK_REG(R20)(r1)
  73         std     r21,STK_REG(R21)(r1)
  74         std     r22,STK_REG(R22)(r1)
  75         std     r0,STACKFRAMESIZE+16(r1)
  76 
  77         srdi    r6,r5,7
  78         mtctr   r6
  79 
  80         /* Now do cacheline (128B) sized loads and stores. */
  81         .align  5
  82 4:
  83         ld      r0,0(r4)
  84         ld      r6,8(r4)
  85         ld      r7,16(r4)
  86         ld      r8,24(r4)
  87         ld      r9,32(r4)
  88         ld      r10,40(r4)
  89         ld      r11,48(r4)
  90         ld      r12,56(r4)
  91         ld      r14,64(r4)
  92         ld      r15,72(r4)
  93         ld      r16,80(r4)
  94         ld      r17,88(r4)
  95         ld      r18,96(r4)
  96         ld      r19,104(r4)
  97         ld      r20,112(r4)
  98         ld      r21,120(r4)
  99         addi    r4,r4,128
 100         std     r0,0(r3)
 101         std     r6,8(r3)
 102         std     r7,16(r3)
 103         std     r8,24(r3)
 104         std     r9,32(r3)
 105         std     r10,40(r3)
 106         std     r11,48(r3)
 107         std     r12,56(r3)
 108         std     r14,64(r3)
 109         std     r15,72(r3)
 110         std     r16,80(r3)
 111         std     r17,88(r3)
 112         std     r18,96(r3)
 113         std     r19,104(r3)
 114         std     r20,112(r3)
 115         std     r21,120(r3)
 116         addi    r3,r3,128
 117         bdnz    4b
 118 
 119         clrldi  r5,r5,(64-7)
 120 
 121         ld      r14,STK_REG(R14)(r1)
 122         ld      r15,STK_REG(R15)(r1)
 123         ld      r16,STK_REG(R16)(r1)
 124         ld      r17,STK_REG(R17)(r1)
 125         ld      r18,STK_REG(R18)(r1)
 126         ld      r19,STK_REG(R19)(r1)
 127         ld      r20,STK_REG(R20)(r1)
 128         ld      r21,STK_REG(R21)(r1)
 129         ld      r22,STK_REG(R22)(r1)
 130         addi    r1,r1,STACKFRAMESIZE
 131 
 132         /* Up to 127B to go */
 133 5:      srdi    r6,r5,4
 134         mtocrf  0x01,r6
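              /*
               * cr7 now holds the remaining length divided by 16: the +1,
               * +2 and +3 tests below select 64B, 32B and 16B copies.
               */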
 135 
 136 6:      bf      cr7*4+1,7f
 137         ld      r0,0(r4)
 138         ld      r6,8(r4)
 139         ld      r7,16(r4)
 140         ld      r8,24(r4)
 141         ld      r9,32(r4)
 142         ld      r10,40(r4)
 143         ld      r11,48(r4)
 144         ld      r12,56(r4)
 145         addi    r4,r4,64
 146         std     r0,0(r3)
 147         std     r6,8(r3)
 148         std     r7,16(r3)
 149         std     r8,24(r3)
 150         std     r9,32(r3)
 151         std     r10,40(r3)
 152         std     r11,48(r3)
 153         std     r12,56(r3)
 154         addi    r3,r3,64
 155 
 156         /* Up to 63B to go */
 157 7:      bf      cr7*4+2,8f
 158         ld      r0,0(r4)
 159         ld      r6,8(r4)
 160         ld      r7,16(r4)
 161         ld      r8,24(r4)
 162         addi    r4,r4,32
 163         std     r0,0(r3)
 164         std     r6,8(r3)
 165         std     r7,16(r3)
 166         std     r8,24(r3)
 167         addi    r3,r3,32
 168 
 169         /* Up to 31B to go */
 170 8:      bf      cr7*4+3,9f
 171         ld      r0,0(r4)
 172         ld      r6,8(r4)
 173         addi    r4,r4,16
 174         std     r0,0(r3)
 175         std     r6,8(r3)
 176         addi    r3,r3,16
 177 
 178 9:      clrldi  r5,r5,(64-4)
 179 
 180         /* Up to 15B to go */
 181 .Lshort_copy:
 182         mtocrf  0x01,r5
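              /*
               * cr7 holds the low four bits of the remaining length:
               * +0 selects an 8B copy, +1 4B, +2 2B and +3 the final byte.
               */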
 183         bf      cr7*4+0,12f
 184         lwz     r0,0(r4)        /* Less chance of a reject with word ops */
 185         lwz     r6,4(r4)
 186         addi    r4,r4,8
 187         stw     r0,0(r3)
 188         stw     r6,4(r3)
 189         addi    r3,r3,8
 190 
 191 12:     bf      cr7*4+1,13f
 192         lwz     r0,0(r4)
 193         addi    r4,r4,4
 194         stw     r0,0(r3)
 195         addi    r3,r3,4
 196 
 197 13:     bf      cr7*4+2,14f
 198         lhz     r0,0(r4)
 199         addi    r4,r4,2
 200         sth     r0,0(r3)
 201         addi    r3,r3,2
 202 
 203 14:     bf      cr7*4+3,15f
 204         lbz     r0,0(r4)
 205         stb     r0,0(r3)
 206 
 207 15:     ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
 208         blr
 209 
 210 .Lunwind_stack_nonvmx_copy:
 211         addi    r1,r1,STACKFRAMESIZE
 212         b       .Lnonvmx_copy
 213 
 214 .Lvmx_copy:
 215 #ifdef CONFIG_ALTIVEC
 216         mflr    r0
 217         std     r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
 218         std     r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
 219         std     r0,16(r1)
 220         stdu    r1,-STACKFRAMESIZE(r1)
 221         bl      enter_vmx_ops
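              /*
               * enter_vmx_ops returns 0 when VMX cannot be used here; cr1
               * remembers that so we can fall back to the integer copy once
               * the prefetch streams have been started.
               */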
 222         cmpwi   cr1,r3,0
 223         ld      r0,STACKFRAMESIZE+16(r1)
 224         ld      r3,STK_REG(R31)(r1)
 225         ld      r4,STK_REG(R30)(r1)
 226         ld      r5,STK_REG(R29)(r1)
 227         mtlr    r0
 228 
 229         /*
 230          * We prefetch both the source and destination using enhanced touch
 231          * instructions. We use a stream ID of 0 for the load side and
 232          * 1 for the store side.
 233          */
 234         clrrdi  r6,r4,7
 235         clrrdi  r9,r3,7
 236         ori     r9,r9,1         /* stream=1 */
 237 
 238         srdi    r7,r5,7         /* length in cachelines, capped at 0x3FF */
 239         cmpldi  r7,0x3FF
 240         ble     1f
 241         li      r7,0x3FF
 242 1:      lis     r0,0x0E00       /* depth=7 */
 243         sldi    r7,r7,7
 244         or      r7,r7,r0
 245         ori     r10,r7,1        /* stream=1 */
 246 
 247         lis     r8,0x8000       /* GO=1 */
 248         clrldi  r8,r8,32
 249 
 250         dcbt    0,r6,0b01000
 251         dcbt    0,r7,0b01010
 252         dcbtst  0,r9,0b01000
 253         dcbtst  0,r10,0b01010
 254         eieio
 255         dcbt    0,r8,0b01010    /* GO */
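              /*
               * The touches above follow the enhanced data stream encoding:
               * TH=0b01000 supplies a stream's start address and ID,
               * TH=0b01010 its length and depth, and the final dcbt with the
               * GO bit set in r8 starts the configured streams.
               */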
 256 
 257         beq     cr1,.Lunwind_stack_nonvmx_copy
 258 
 259         /*
 260          * If source and destination are not relatively aligned we use a
 261          * slower permute loop.
 262          */
 263         xor     r6,r4,r3
 264         rldicl. r6,r6,0,(64-4)
 265         bne     .Lvmx_unaligned_copy
 266 
 267         /* Get the destination 16B aligned */
 268         neg     r6,r3
 269         mtocrf  0x01,r6
 270         clrldi  r6,r6,(64-4)
 271 
 272         bf      cr7*4+3,1f
 273         lbz     r0,0(r4)
 274         addi    r4,r4,1
 275         stb     r0,0(r3)
 276         addi    r3,r3,1
 277 
 278 1:      bf      cr7*4+2,2f
 279         lhz     r0,0(r4)
 280         addi    r4,r4,2
 281         sth     r0,0(r3)
 282         addi    r3,r3,2
 283 
 284 2:      bf      cr7*4+1,3f
 285         lwz     r0,0(r4)
 286         addi    r4,r4,4
 287         stw     r0,0(r3)
 288         addi    r3,r3,4
 289 
 290 3:      bf      cr7*4+0,4f
 291         ld      r0,0(r4)
 292         addi    r4,r4,8
 293         std     r0,0(r3)
 294         addi    r3,r3,8
 295 
 296 4:      sub     r5,r5,r6
 297 
  298         /* Get the destination 128B aligned */
 299         neg     r6,r3
 300         srdi    r7,r6,4
 301         mtocrf  0x01,r7
 302         clrldi  r6,r6,(64-7)
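              /*
               * r6 = (-dst) & 127 is the byte count needed to reach 128B
               * alignment; cr7 holds r6 >> 4, so the +3/+2/+1 tests below
               * copy 16, 32 and 64 bytes with aligned lvx/stvx.
               */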
 303 
 304         li      r9,16
 305         li      r10,32
 306         li      r11,48
 307 
 308         bf      cr7*4+3,5f
 309         lvx     v1,0,r4
 310         addi    r4,r4,16
 311         stvx    v1,0,r3
 312         addi    r3,r3,16
 313 
 314 5:      bf      cr7*4+2,6f
 315         lvx     v1,0,r4
 316         lvx     v0,r4,r9
 317         addi    r4,r4,32
 318         stvx    v1,0,r3
 319         stvx    v0,r3,r9
 320         addi    r3,r3,32
 321 
 322 6:      bf      cr7*4+1,7f
 323         lvx     v3,0,r4
 324         lvx     v2,r4,r9
 325         lvx     v1,r4,r10
 326         lvx     v0,r4,r11
 327         addi    r4,r4,64
 328         stvx    v3,0,r3
 329         stvx    v2,r3,r9
 330         stvx    v1,r3,r10
 331         stvx    v0,r3,r11
 332         addi    r3,r3,64
 333 
 334 7:      sub     r5,r5,r6
 335         srdi    r6,r5,7
 336 
 337         std     r14,STK_REG(R14)(r1)
 338         std     r15,STK_REG(R15)(r1)
 339         std     r16,STK_REG(R16)(r1)
 340 
 341         li      r12,64
 342         li      r14,80
 343         li      r15,96
 344         li      r16,112
 345 
 346         mtctr   r6
 347 
 348         /*
 349          * Now do cacheline sized loads and stores. By this stage the
 350          * cacheline stores are also cacheline aligned.
 351          */
 352         .align  5
 353 8:
 354         lvx     v7,0,r4
 355         lvx     v6,r4,r9
 356         lvx     v5,r4,r10
 357         lvx     v4,r4,r11
 358         lvx     v3,r4,r12
 359         lvx     v2,r4,r14
 360         lvx     v1,r4,r15
 361         lvx     v0,r4,r16
 362         addi    r4,r4,128
 363         stvx    v7,0,r3
 364         stvx    v6,r3,r9
 365         stvx    v5,r3,r10
 366         stvx    v4,r3,r11
 367         stvx    v3,r3,r12
 368         stvx    v2,r3,r14
 369         stvx    v1,r3,r15
 370         stvx    v0,r3,r16
 371         addi    r3,r3,128
 372         bdnz    8b
 373 
 374         ld      r14,STK_REG(R14)(r1)
 375         ld      r15,STK_REG(R15)(r1)
 376         ld      r16,STK_REG(R16)(r1)
 377 
 378         /* Up to 127B to go */
 379         clrldi  r5,r5,(64-7)
 380         srdi    r6,r5,4
 381         mtocrf  0x01,r6
 382 
 383         bf      cr7*4+1,9f
 384         lvx     v3,0,r4
 385         lvx     v2,r4,r9
 386         lvx     v1,r4,r10
 387         lvx     v0,r4,r11
 388         addi    r4,r4,64
 389         stvx    v3,0,r3
 390         stvx    v2,r3,r9
 391         stvx    v1,r3,r10
 392         stvx    v0,r3,r11
 393         addi    r3,r3,64
 394 
 395 9:      bf      cr7*4+2,10f
 396         lvx     v1,0,r4
 397         lvx     v0,r4,r9
 398         addi    r4,r4,32
 399         stvx    v1,0,r3
 400         stvx    v0,r3,r9
 401         addi    r3,r3,32
 402 
 403 10:     bf      cr7*4+3,11f
 404         lvx     v1,0,r4
 405         addi    r4,r4,16
 406         stvx    v1,0,r3
 407         addi    r3,r3,16
 408 
 409         /* Up to 15B to go */
 410 11:     clrldi  r5,r5,(64-4)
 411         mtocrf  0x01,r5
 412         bf      cr7*4+0,12f
 413         ld      r0,0(r4)
 414         addi    r4,r4,8
 415         std     r0,0(r3)
 416         addi    r3,r3,8
 417 
 418 12:     bf      cr7*4+1,13f
 419         lwz     r0,0(r4)
 420         addi    r4,r4,4
 421         stw     r0,0(r3)
 422         addi    r3,r3,4
 423 
 424 13:     bf      cr7*4+2,14f
 425         lhz     r0,0(r4)
 426         addi    r4,r4,2
 427         sth     r0,0(r3)
 428         addi    r3,r3,2
 429 
 430 14:     bf      cr7*4+3,15f
 431         lbz     r0,0(r4)
 432         stb     r0,0(r3)
 433 
 434 15:     addi    r1,r1,STACKFRAMESIZE
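              /*
               * Reload the original destination so memcpy's return value is
               * passed through the exit_vmx_ops tail call.
               */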
 435         ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
 436         b       exit_vmx_ops            /* tail call optimise */
 437 
 438 .Lvmx_unaligned_copy:
 439         /* Get the destination 16B aligned */
 440         neg     r6,r3
 441         mtocrf  0x01,r6
 442         clrldi  r6,r6,(64-4)
 443 
 444         bf      cr7*4+3,1f
 445         lbz     r0,0(r4)
 446         addi    r4,r4,1
 447         stb     r0,0(r3)
 448         addi    r3,r3,1
 449 
 450 1:      bf      cr7*4+2,2f
 451         lhz     r0,0(r4)
 452         addi    r4,r4,2
 453         sth     r0,0(r3)
 454         addi    r3,r3,2
 455 
 456 2:      bf      cr7*4+1,3f
 457         lwz     r0,0(r4)
 458         addi    r4,r4,4
 459         stw     r0,0(r3)
 460         addi    r3,r3,4
 461 
 462 3:      bf      cr7*4+0,4f
 463         lwz     r0,0(r4)        /* Less chance of a reject with word ops */
 464         lwz     r7,4(r4)
 465         addi    r4,r4,8
 466         stw     r0,0(r3)
 467         stw     r7,4(r3)
 468         addi    r3,r3,8
 469 
 470 4:      sub     r5,r5,r6
 471 
  472         /* Get the destination 128B aligned */
 473         neg     r6,r3
 474         srdi    r7,r6,4
 475         mtocrf  0x01,r7
 476         clrldi  r6,r6,(64-7)
 477 
 478         li      r9,16
 479         li      r10,32
 480         li      r11,48
 481 
 482         LVS(v16,0,r4)           /* Setup permute control vector */
 483         lvx     v0,0,r4
 484         addi    r4,r4,16
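              /*
               * v0 always holds the previously loaded source quadword; each
               * step loads the next aligned quadword and VPERM merges the
               * pair into 16 correctly shifted destination bytes (the vor
               * below is a vector move that keeps the newest quadword in
               * v0). The source pointer therefore runs 16B ahead, which
               * label 11 unwinds.
               */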
 485 
 486         bf      cr7*4+3,5f
 487         lvx     v1,0,r4
 488         VPERM(v8,v0,v1,v16)
 489         addi    r4,r4,16
 490         stvx    v8,0,r3
 491         addi    r3,r3,16
 492         vor     v0,v1,v1
 493 
 494 5:      bf      cr7*4+2,6f
 495         lvx     v1,0,r4
 496         VPERM(v8,v0,v1,v16)
 497         lvx     v0,r4,r9
 498         VPERM(v9,v1,v0,v16)
 499         addi    r4,r4,32
 500         stvx    v8,0,r3
 501         stvx    v9,r3,r9
 502         addi    r3,r3,32
 503 
 504 6:      bf      cr7*4+1,7f
 505         lvx     v3,0,r4
 506         VPERM(v8,v0,v3,v16)
 507         lvx     v2,r4,r9
 508         VPERM(v9,v3,v2,v16)
 509         lvx     v1,r4,r10
 510         VPERM(v10,v2,v1,v16)
 511         lvx     v0,r4,r11
 512         VPERM(v11,v1,v0,v16)
 513         addi    r4,r4,64
 514         stvx    v8,0,r3
 515         stvx    v9,r3,r9
 516         stvx    v10,r3,r10
 517         stvx    v11,r3,r11
 518         addi    r3,r3,64
 519 
 520 7:      sub     r5,r5,r6
 521         srdi    r6,r5,7
 522 
 523         std     r14,STK_REG(R14)(r1)
 524         std     r15,STK_REG(R15)(r1)
 525         std     r16,STK_REG(R16)(r1)
 526 
 527         li      r12,64
 528         li      r14,80
 529         li      r15,96
 530         li      r16,112
 531 
 532         mtctr   r6
 533 
 534         /*
 535          * Now do cacheline sized loads and stores. By this stage the
 536          * cacheline stores are also cacheline aligned.
 537          */
 538         .align  5
 539 8:
 540         lvx     v7,0,r4
 541         VPERM(v8,v0,v7,v16)
 542         lvx     v6,r4,r9
 543         VPERM(v9,v7,v6,v16)
 544         lvx     v5,r4,r10
 545         VPERM(v10,v6,v5,v16)
 546         lvx     v4,r4,r11
 547         VPERM(v11,v5,v4,v16)
 548         lvx     v3,r4,r12
 549         VPERM(v12,v4,v3,v16)
 550         lvx     v2,r4,r14
 551         VPERM(v13,v3,v2,v16)
 552         lvx     v1,r4,r15
 553         VPERM(v14,v2,v1,v16)
 554         lvx     v0,r4,r16
 555         VPERM(v15,v1,v0,v16)
 556         addi    r4,r4,128
 557         stvx    v8,0,r3
 558         stvx    v9,r3,r9
 559         stvx    v10,r3,r10
 560         stvx    v11,r3,r11
 561         stvx    v12,r3,r12
 562         stvx    v13,r3,r14
 563         stvx    v14,r3,r15
 564         stvx    v15,r3,r16
 565         addi    r3,r3,128
 566         bdnz    8b
 567 
 568         ld      r14,STK_REG(R14)(r1)
 569         ld      r15,STK_REG(R15)(r1)
 570         ld      r16,STK_REG(R16)(r1)
 571 
 572         /* Up to 127B to go */
 573         clrldi  r5,r5,(64-7)
 574         srdi    r6,r5,4
 575         mtocrf  0x01,r6
 576 
 577         bf      cr7*4+1,9f
 578         lvx     v3,0,r4
 579         VPERM(v8,v0,v3,v16)
 580         lvx     v2,r4,r9
 581         VPERM(v9,v3,v2,v16)
 582         lvx     v1,r4,r10
 583         VPERM(v10,v2,v1,v16)
 584         lvx     v0,r4,r11
 585         VPERM(v11,v1,v0,v16)
 586         addi    r4,r4,64
 587         stvx    v8,0,r3
 588         stvx    v9,r3,r9
 589         stvx    v10,r3,r10
 590         stvx    v11,r3,r11
 591         addi    r3,r3,64
 592 
 593 9:      bf      cr7*4+2,10f
 594         lvx     v1,0,r4
 595         VPERM(v8,v0,v1,v16)
 596         lvx     v0,r4,r9
 597         VPERM(v9,v1,v0,v16)
 598         addi    r4,r4,32
 599         stvx    v8,0,r3
 600         stvx    v9,r3,r9
 601         addi    r3,r3,32
 602 
 603 10:     bf      cr7*4+3,11f
 604         lvx     v1,0,r4
 605         VPERM(v8,v0,v1,v16)
 606         addi    r4,r4,16
 607         stvx    v8,0,r3
 608         addi    r3,r3,16
 609 
 610         /* Up to 15B to go */
 611 11:     clrldi  r5,r5,(64-4)
 612         addi    r4,r4,-16       /* Unwind the +16 load offset */
 613         mtocrf  0x01,r5
 614         bf      cr7*4+0,12f
 615         lwz     r0,0(r4)        /* Less chance of a reject with word ops */
 616         lwz     r6,4(r4)
 617         addi    r4,r4,8
 618         stw     r0,0(r3)
 619         stw     r6,4(r3)
 620         addi    r3,r3,8
 621 
 622 12:     bf      cr7*4+1,13f
 623         lwz     r0,0(r4)
 624         addi    r4,r4,4
 625         stw     r0,0(r3)
 626         addi    r3,r3,4
 627 
 628 13:     bf      cr7*4+2,14f
 629         lhz     r0,0(r4)
 630         addi    r4,r4,2
 631         sth     r0,0(r3)
 632         addi    r3,r3,2
 633 
 634 14:     bf      cr7*4+3,15f
 635         lbz     r0,0(r4)
 636         stb     r0,0(r3)
 637 
 638 15:     addi    r1,r1,STACKFRAMESIZE
 639         ld      r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
 640         b       exit_vmx_ops            /* tail call optimise */
 641 #endif /* CONFIG_ALTIVEC */
