arch/powerpc/crypto/crc32-vpmsum_core.S

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * Core of the accelerated CRC algorithm.
   4  * In your file, define the constants and CRC_FUNCTION_NAME,
   5  * then include this file.
   6  *
   7  * Calculate the checksum of data that is 16 byte aligned and a multiple of
   8  * 16 bytes.
   9  *
  10  * The first step is to reduce it to 1024 bits. We do this in 8 parallel
  11  * chunks in order to mask the latency of the vpmsum instructions. If we
  12  * have more than 32 kB of data to checksum we repeat this step multiple
  13  * times, passing in the previous 1024 bits.
  14  *
  15  * The next step is to reduce the 1024 bits to 64 bits. This step adds
  16  * 32 bits of 0s to the end - this matches what a CRC does. We just
  17  * calculate constants that land the data in these 32 bits.
  18  *
  19  * We then use fixed point Barrett reduction to compute a mod n over GF(2)
  20  * for n = the CRC polynomial, using POWER8 instructions. We use x = 32.
  21  *
  22  * http://en.wikipedia.org/wiki/Barrett_reduction
  23  *
  24  * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
  25  */
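/*
 * Illustrative scalar model (an addition, not part of the original source):
 * everything below is built on the POWER8 vpmsumd instruction, which
 * carry-lessly multiplies corresponding 64 bit doublewords of two vectors
 * and xors the two products into a 128 bit result. A minimal C sketch of a
 * single 64x64 carry-less multiply, assuming only stdint.h:
 *
 *      #include <stdint.h>
 *
 *      static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
 *      {
 *              uint64_t h = 0, l = 0;
 *              int i;
 *
 *              for (i = 0; i < 64; i++) {
 *                      if (b & (1ULL << i)) {
 *                              l ^= a << i;
 *                              if (i)
 *                                      h ^= a >> (64 - i);
 *                      }
 *              }
 *              *hi = h;
 *              *lo = l;
 *      }
 *
 * Folding 128 bytes per iteration relies on the fact that, for the raw
 * remainder over GF(2) (ignoring the initial and final xor),
 * crc(A || B) = crc(A * x^(8*len(B))) xor crc(B), and shifting A forward by
 * len(B) bytes reduces to a carry-less multiply by the precomputed constant
 * (x^(8*len(B)) mod poly).
 */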
  26 
  27 #include <asm/ppc_asm.h>
  28 #include <asm/ppc-opcode.h>
  29 
  30 #define MAX_SIZE        32768
  31 
  32         .text
  33 
  34 #if defined(__BIG_ENDIAN__) && defined(REFLECT)
  35 #define BYTESWAP_DATA
  36 #elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
  37 #define BYTESWAP_DATA
  38 #else
  39 #undef BYTESWAP_DATA
  40 #endif
  41 
  42 #define off16           r25
  43 #define off32           r26
  44 #define off48           r27
  45 #define off64           r28
  46 #define off80           r29
  47 #define off96           r30
  48 #define off112          r31
  49 
  50 #define const1          v24
  51 #define const2          v25
  52 
  53 #define byteswap        v26
  54 #define mask_32bit      v27
  55 #define mask_64bit      v28
  56 #define zeroes          v29
  57 
  58 #ifdef BYTESWAP_DATA
  59 #define VPERM(A, B, C, D) vperm A, B, C, D
  60 #else
  61 #define VPERM(A, B, C, D)
  62 #endif
  63 
  64 /* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
  65 FUNC_START(CRC_FUNCTION_NAME)
  66         std     r31,-8(r1)
  67         std     r30,-16(r1)
  68         std     r29,-24(r1)
  69         std     r28,-32(r1)
  70         std     r27,-40(r1)
  71         std     r26,-48(r1)
  72         std     r25,-56(r1)
  73 
  74         li      off16,16
  75         li      off32,32
  76         li      off48,48
  77         li      off64,64
  78         li      off80,80
  79         li      off96,96
  80         li      off112,112
  81         li      r0,0
  82 
  83         /* Enough room for saving 10 non-volatile VMX registers */
  84         subi    r6,r1,56+10*16
  85         subi    r7,r1,56+2*16
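        /*
         * Stack layout note (derived from the code, not an original comment):
         * the seven GPR saves above occupy the 56 bytes just below r1 and the
         * ten 16 byte VMX save slots sit directly below them; r6 points at
         * the bottom of that area and r7 at the last two slots (v28/v29),
         * presumably because only offset registers up to off112 are defined.
         */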
  86 
  87         stvx    v20,0,r6
  88         stvx    v21,off16,r6
  89         stvx    v22,off32,r6
  90         stvx    v23,off48,r6
  91         stvx    v24,off64,r6
  92         stvx    v25,off80,r6
  93         stvx    v26,off96,r6
  94         stvx    v27,off112,r6
  95         stvx    v28,0,r7
  96         stvx    v29,off16,r7
  97 
  98         mr      r10,r3
  99 
 100         vxor    zeroes,zeroes,zeroes
 101         vspltisw v0,-1
 102 
 103         vsldoi  mask_32bit,zeroes,v0,4
 104         vsldoi  mask_64bit,zeroes,v0,8
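        /*
         * mask_32bit/mask_64bit select the least significant 32/64 bits of a
         * 128 bit vector: vsldoi concatenates zeroes with the all-ones v0 and
         * shifts 4 (or 8) bytes of ones into the bottom of the result.
         */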
 105 
 106         /* Get the initial value into v8 */
 107         vxor    v8,v8,v8
 108         MTVRD(v8, R3)
 109 #ifdef REFLECT
 110         vsldoi  v8,zeroes,v8,8  /* shift into bottom 32 bits */
 111 #else
 112         vsldoi  v8,v8,zeroes,4  /* shift into top 32 bits */
 113 #endif
 114 
 115 #ifdef BYTESWAP_DATA
 116         addis   r3,r2,.byteswap_constant@toc@ha
 117         addi    r3,r3,.byteswap_constant@toc@l
 118 
 119         lvx     byteswap,0,r3
 120         addi    r3,r3,16
 121 #endif
 122 
 123         cmpdi   r5,256
 124         blt     .Lshort
 125 
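        /* Round the length down to a multiple of 128 bytes (clear the low 7 bits) */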
 126         rldicr  r6,r5,0,56
 127 
 128         /* Checksum in blocks of MAX_SIZE */
 129 1:      lis     r7,MAX_SIZE@h
 130         ori     r7,r7,MAX_SIZE@l
 131         mr      r9,r7
 132         cmpd    r6,r7
 133         bgt     2f
 134         mr      r7,r6
 135 2:      subf    r6,r7,r6
 136 
 137         /* our main loop does 128 bytes at a time */
 138         srdi    r7,r7,7
 139 
 140         /*
 141          * Work out the offset into the constants table to start at. Each
 142          * 16 byte constant is used against 128 bytes of input data, so the
 143          * table is one eighth the size of the data it covers (128 / 16 = 8).
 144          */
 145         sldi    r8,r7,4
 146         srdi    r9,r9,3
 147         subf    r8,r8,r9
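        /*
         * Worked example (illustrative): a full 32768 byte block gives
         * r7 = 32768/128 = 256 iterations and consumes 256*16 = 4096 bytes
         * of constants - exactly the 32768/8 = 4096 byte table - so it
         * starts at offset 0. A final 16384 byte block instead starts
         * 4096 - 16384/8 = 2048 bytes into the table.
         */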
 148 
 149         /* We reduce our final 128 bytes in a separate step */
 150         addi    r7,r7,-1
 151         mtctr   r7
 152 
 153         addis   r3,r2,.constants@toc@ha
 154         addi    r3,r3,.constants@toc@l
 155 
 156         /* Find the start of our constants */
 157         add     r3,r3,r8
 158 
 159         /* zero v0-v7 which will contain our checksums */
 160         vxor    v0,v0,v0
 161         vxor    v1,v1,v1
 162         vxor    v2,v2,v2
 163         vxor    v3,v3,v3
 164         vxor    v4,v4,v4
 165         vxor    v5,v5,v5
 166         vxor    v6,v6,v6
 167         vxor    v7,v7,v7
 168 
 169         lvx     const1,0,r3
 170 
 171         /*
 172          * If we are looping back to consume more data we use the values
 173          * already in v16-v23.
 174          */
 175         cmpdi   r0,1
 176         beq     2f
 177 
 178         /* First warm up pass */
 179         lvx     v16,0,r4
 180         lvx     v17,off16,r4
 181         VPERM(v16,v16,v16,byteswap)
 182         VPERM(v17,v17,v17,byteswap)
 183         lvx     v18,off32,r4
 184         lvx     v19,off48,r4
 185         VPERM(v18,v18,v18,byteswap)
 186         VPERM(v19,v19,v19,byteswap)
 187         lvx     v20,off64,r4
 188         lvx     v21,off80,r4
 189         VPERM(v20,v20,v20,byteswap)
 190         VPERM(v21,v21,v21,byteswap)
 191         lvx     v22,off96,r4
 192         lvx     v23,off112,r4
 193         VPERM(v22,v22,v22,byteswap)
 194         VPERM(v23,v23,v23,byteswap)
 195         addi    r4,r4,8*16
 196 
 197         /* xor in initial value */
 198         vxor    v16,v16,v8
 199 
 200 2:      bdz     .Lfirst_warm_up_done
 201 
 202         addi    r3,r3,16
 203         lvx     const2,0,r3
 204 
 205         /* Second warm up pass */
 206         VPMSUMD(v8,v16,const1)
 207         lvx     v16,0,r4
 208         VPERM(v16,v16,v16,byteswap)
 209         ori     r2,r2,0
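        /*
         * Note: the "ori r2,r2,0" instructions scattered through these loops
         * are architectural no-ops; they appear to be there purely to pace
         * instruction grouping/dispatch (an inference - the original has no
         * comment on them).
         */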
 210 
 211         VPMSUMD(v9,v17,const1)
 212         lvx     v17,off16,r4
 213         VPERM(v17,v17,v17,byteswap)
 214         ori     r2,r2,0
 215 
 216         VPMSUMD(v10,v18,const1)
 217         lvx     v18,off32,r4
 218         VPERM(v18,v18,v18,byteswap)
 219         ori     r2,r2,0
 220 
 221         VPMSUMD(v11,v19,const1)
 222         lvx     v19,off48,r4
 223         VPERM(v19,v19,v19,byteswap)
 224         ori     r2,r2,0
 225 
 226         VPMSUMD(v12,v20,const1)
 227         lvx     v20,off64,r4
 228         VPERM(v20,v20,v20,byteswap)
 229         ori     r2,r2,0
 230 
 231         VPMSUMD(v13,v21,const1)
 232         lvx     v21,off80,r4
 233         VPERM(v21,v21,v21,byteswap)
 234         ori     r2,r2,0
 235 
 236         VPMSUMD(v14,v22,const1)
 237         lvx     v22,off96,r4
 238         VPERM(v22,v22,v22,byteswap)
 239         ori     r2,r2,0
 240 
 241         VPMSUMD(v15,v23,const1)
 242         lvx     v23,off112,r4
 243         VPERM(v23,v23,v23,byteswap)
 244 
 245         addi    r4,r4,8*16
 246 
 247         bdz     .Lfirst_cool_down
 248 
 249         /*
 250          * Main loop. We modulo schedule it such that each block of data
 251          * takes three iterations to complete - first iteration load, second
 252          * iteration vpmsum, third iteration xor.
 253          */
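        /*
         * Schedule sketch (inferred from the code, not an original comment):
         * block N is loaded in iteration N, multiplied by vpmsumd in
         * iteration N+1 and xor-accumulated into v0-v7 in iteration N+2,
         * which is why the loop is bracketed by two warm up passes above
         * and two cool down passes below.
         */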
 254         .balign 16
 255 4:      lvx     const1,0,r3
 256         addi    r3,r3,16
 257         ori     r2,r2,0
 258 
 259         vxor    v0,v0,v8
 260         VPMSUMD(v8,v16,const2)
 261         lvx     v16,0,r4
 262         VPERM(v16,v16,v16,byteswap)
 263         ori     r2,r2,0
 264 
 265         vxor    v1,v1,v9
 266         VPMSUMD(v9,v17,const2)
 267         lvx     v17,off16,r4
 268         VPERM(v17,v17,v17,byteswap)
 269         ori     r2,r2,0
 270 
 271         vxor    v2,v2,v10
 272         VPMSUMD(v10,v18,const2)
 273         lvx     v18,off32,r4
 274         VPERM(v18,v18,v18,byteswap)
 275         ori     r2,r2,0
 276 
 277         vxor    v3,v3,v11
 278         VPMSUMD(v11,v19,const2)
 279         lvx     v19,off48,r4
 280         VPERM(v19,v19,v19,byteswap)
 281         lvx     const2,0,r3
 282         ori     r2,r2,0
 283 
 284         vxor    v4,v4,v12
 285         VPMSUMD(v12,v20,const1)
 286         lvx     v20,off64,r4
 287         VPERM(v20,v20,v20,byteswap)
 288         ori     r2,r2,0
 289 
 290         vxor    v5,v5,v13
 291         VPMSUMD(v13,v21,const1)
 292         lvx     v21,off80,r4
 293         VPERM(v21,v21,v21,byteswap)
 294         ori     r2,r2,0
 295 
 296         vxor    v6,v6,v14
 297         VPMSUMD(v14,v22,const1)
 298         lvx     v22,off96,r4
 299         VPERM(v22,v22,v22,byteswap)
 300         ori     r2,r2,0
 301 
 302         vxor    v7,v7,v15
 303         VPMSUMD(v15,v23,const1)
 304         lvx     v23,off112,r4
 305         VPERM(v23,v23,v23,byteswap)
 306 
 307         addi    r4,r4,8*16
 308 
 309         bdnz    4b
 310 
 311 .Lfirst_cool_down:
 312         /* First cool down pass */
 313         lvx     const1,0,r3
 314         addi    r3,r3,16
 315 
 316         vxor    v0,v0,v8
 317         VPMSUMD(v8,v16,const1)
 318         ori     r2,r2,0
 319 
 320         vxor    v1,v1,v9
 321         VPMSUMD(v9,v17,const1)
 322         ori     r2,r2,0
 323 
 324         vxor    v2,v2,v10
 325         VPMSUMD(v10,v18,const1)
 326         ori     r2,r2,0
 327 
 328         vxor    v3,v3,v11
 329         VPMSUMD(v11,v19,const1)
 330         ori     r2,r2,0
 331 
 332         vxor    v4,v4,v12
 333         VPMSUMD(v12,v20,const1)
 334         ori     r2,r2,0
 335 
 336         vxor    v5,v5,v13
 337         VPMSUMD(v13,v21,const1)
 338         ori     r2,r2,0
 339 
 340         vxor    v6,v6,v14
 341         VPMSUMD(v14,v22,const1)
 342         ori     r2,r2,0
 343 
 344         vxor    v7,v7,v15
 345         VPMSUMD(v15,v23,const1)
 346         ori     r2,r2,0
 347 
 348 .Lsecond_cool_down:
 349         /* Second cool down pass */
 350         vxor    v0,v0,v8
 351         vxor    v1,v1,v9
 352         vxor    v2,v2,v10
 353         vxor    v3,v3,v11
 354         vxor    v4,v4,v12
 355         vxor    v5,v5,v13
 356         vxor    v6,v6,v14
 357         vxor    v7,v7,v15
 358 
 359 #ifdef REFLECT
 360         /*
 361          * vpmsumd produces a 96 bit result in the least significant bits
 362          * of the register. Since we are bit reflected we have to shift it
 363          * left 32 bits so it occupies the least significant bits in the
 364          * bit reflected domain.
 365          */
 366         vsldoi  v0,v0,zeroes,4
 367         vsldoi  v1,v1,zeroes,4
 368         vsldoi  v2,v2,zeroes,4
 369         vsldoi  v3,v3,zeroes,4
 370         vsldoi  v4,v4,zeroes,4
 371         vsldoi  v5,v5,zeroes,4
 372         vsldoi  v6,v6,zeroes,4
 373         vsldoi  v7,v7,zeroes,4
 374 #endif
 375 
 376         /* xor with last 1024 bits */
 377         lvx     v8,0,r4
 378         lvx     v9,off16,r4
 379         VPERM(v8,v8,v8,byteswap)
 380         VPERM(v9,v9,v9,byteswap)
 381         lvx     v10,off32,r4
 382         lvx     v11,off48,r4
 383         VPERM(v10,v10,v10,byteswap)
 384         VPERM(v11,v11,v11,byteswap)
 385         lvx     v12,off64,r4
 386         lvx     v13,off80,r4
 387         VPERM(v12,v12,v12,byteswap)
 388         VPERM(v13,v13,v13,byteswap)
 389         lvx     v14,off96,r4
 390         lvx     v15,off112,r4
 391         VPERM(v14,v14,v14,byteswap)
 392         VPERM(v15,v15,v15,byteswap)
 393 
 394         addi    r4,r4,8*16
 395 
 396         vxor    v16,v0,v8
 397         vxor    v17,v1,v9
 398         vxor    v18,v2,v10
 399         vxor    v19,v3,v11
 400         vxor    v20,v4,v12
 401         vxor    v21,v5,v13
 402         vxor    v22,v6,v14
 403         vxor    v23,v7,v15
 404 
 405         li      r0,1
 406         cmpdi   r6,0
 407         addi    r6,r6,128
 408         bne     1b
 409 
 410         /* Work out how many bytes we have left */
 411         andi.   r5,r5,127
 412 
 413         /* Calculate where in the constant table we need to start */
 414         subfic  r6,r5,128
 415         add     r3,r3,r6
 416 
 417         /* How many 16 byte chunks are in the tail */
 418         srdi    r7,r5,4
 419         mtctr   r7
 420 
 421         /*
 422          * Reduce the previously calculated 1024 bits to 64 bits, shifting
 423          * 32 bits to include the trailing 32 bits of zeros
 424          */
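        /*
         * Each of the eight accumulators gets its own constant here: the
         * constant folds that lane forward by its distance from the end of
         * the block, including the appended 32 bits of zeroes, so the eight
         * results can simply be xored together below (a descriptive note
         * inferred from the algorithm outline at the top of the file).
         */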
 425         lvx     v0,0,r3
 426         lvx     v1,off16,r3
 427         lvx     v2,off32,r3
 428         lvx     v3,off48,r3
 429         lvx     v4,off64,r3
 430         lvx     v5,off80,r3
 431         lvx     v6,off96,r3
 432         lvx     v7,off112,r3
 433         addi    r3,r3,8*16
 434 
 435         VPMSUMW(v0,v16,v0)
 436         VPMSUMW(v1,v17,v1)
 437         VPMSUMW(v2,v18,v2)
 438         VPMSUMW(v3,v19,v3)
 439         VPMSUMW(v4,v20,v4)
 440         VPMSUMW(v5,v21,v5)
 441         VPMSUMW(v6,v22,v6)
 442         VPMSUMW(v7,v23,v7)
 443 
 444         /* Now reduce the tail (0 - 112 bytes) */
 445         cmpdi   r7,0
 446         beq     1f
 447 
 448         lvx     v16,0,r4
 449         lvx     v17,0,r3
 450         VPERM(v16,v16,v16,byteswap)
 451         VPMSUMW(v16,v16,v17)
 452         vxor    v0,v0,v16
 453         bdz     1f
 454 
 455         lvx     v16,off16,r4
 456         lvx     v17,off16,r3
 457         VPERM(v16,v16,v16,byteswap)
 458         VPMSUMW(v16,v16,v17)
 459         vxor    v0,v0,v16
 460         bdz     1f
 461 
 462         lvx     v16,off32,r4
 463         lvx     v17,off32,r3
 464         VPERM(v16,v16,v16,byteswap)
 465         VPMSUMW(v16,v16,v17)
 466         vxor    v0,v0,v16
 467         bdz     1f
 468 
 469         lvx     v16,off48,r4
 470         lvx     v17,off48,r3
 471         VPERM(v16,v16,v16,byteswap)
 472         VPMSUMW(v16,v16,v17)
 473         vxor    v0,v0,v16
 474         bdz     1f
 475 
 476         lvx     v16,off64,r4
 477         lvx     v17,off64,r3
 478         VPERM(v16,v16,v16,byteswap)
 479         VPMSUMW(v16,v16,v17)
 480         vxor    v0,v0,v16
 481         bdz     1f
 482 
 483         lvx     v16,off80,r4
 484         lvx     v17,off80,r3
 485         VPERM(v16,v16,v16,byteswap)
 486         VPMSUMW(v16,v16,v17)
 487         vxor    v0,v0,v16
 488         bdz     1f
 489 
 490         lvx     v16,off96,r4
 491         lvx     v17,off96,r3
 492         VPERM(v16,v16,v16,byteswap)
 493         VPMSUMW(v16,v16,v17)
 494         vxor    v0,v0,v16
 495 
 496         /* Now xor all the parallel chunks together */
 497 1:      vxor    v0,v0,v1
 498         vxor    v2,v2,v3
 499         vxor    v4,v4,v5
 500         vxor    v6,v6,v7
 501 
 502         vxor    v0,v0,v2
 503         vxor    v4,v4,v6
 504 
 505         vxor    v0,v0,v4
 506 
 507 .Lbarrett_reduction:
 508         /* Barrett constants */
 509         addis   r3,r2,.barrett_constants@toc@ha
 510         addi    r3,r3,.barrett_constants@toc@l
 511 
 512         lvx     const1,0,r3
 513         lvx     const2,off16,r3
 514 
 515         vsldoi  v1,v0,v0,8
 516         vxor    v0,v0,v1                /* xor two 64 bit results together */
 517 
 518 #ifdef REFLECT
 519         /* shift left one bit */
 520         vspltisb v1,1
 521         vsl     v0,v0,v1
 522 #endif
 523 
 524         vand    v0,v0,mask_64bit
 525 #ifndef REFLECT
 526         /*
 527          * Now for the Barrett reduction algorithm. The idea is to calculate q,
 528          * the multiple of our polynomial that we need to subtract. By
 529          * doing the computation 2x bits higher (ie 64 bits) and shifting the
 530          * result back down 2x bits, we round down to the nearest multiple.
 531          */
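        /*
         * As scalar pseudocode (an illustration, not the original authors'
         * notation - "m" approximates floor(x^64 / n), "n" is the CRC
         * polynomial, both held in the Barrett constants loaded above, and
         * clmul_hi/clmul_lo stand for the high and low 64 bits of a 64x64
         * carry-less product):
         *
         *      q = clmul_hi(a, m);        // q = floor(a*m / 2^64)
         *      r = a ^ clmul_lo(q, n);    // a - q*n; xor is subtraction in GF(2)
         *                                 // the CRC is the low 32 bits of r
         */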
 532         VPMSUMD(v1,v0,const1)   /* ma */
 533         vsldoi  v1,zeroes,v1,8  /* q = floor(ma/(2^64)) */
 534         VPMSUMD(v1,v1,const2)   /* qn */
 535         vxor    v0,v0,v1        /* a - qn, subtraction is xor in GF(2) */
 536 
 537         /*
 538          * Get the result into r3. We need to shift it left 8 bytes:
 539          * V0 [ 0 1 2 X ]
 540          * V0 [ 0 X 2 3 ]
 541          */
 542         vsldoi  v0,v0,zeroes,8  /* shift result into top 64 bits */
 543 #else
 544         /*
 545          * The reflected version of Barrett reduction. Instead of bit
 546          * reflecting our data (which is expensive to do), we bit reflect our
 547          * constants and our algorithm, which means the intermediate data in
 548          * our vector registers goes from 0-63 instead of 63-0. We can reflect
 549          * the algorithm because we don't carry in mod 2 arithmetic.
 550          */
 551         vand    v1,v0,mask_32bit        /* bottom 32 bits of a */
 552         VPMSUMD(v1,v1,const1)           /* ma */
 553         vand    v1,v1,mask_32bit        /* bottom 32 bits of ma */
 554         VPMSUMD(v1,v1,const2)           /* qn */
 555         vxor    v0,v0,v1                /* a - qn, subtraction is xor in GF(2) */
 556 
 557         /*
 558          * Since we are bit reflected, the result (ie the low 32 bits) is in
 559          * the high 32 bits. We just need to shift it left 4 bytes
 560          * V0 [ 0 1 X 3 ]
 561          * V0 [ 0 X 2 3 ]
 562          */
 563         vsldoi  v0,v0,zeroes,4          /* shift result into top 64 bits */
 564 #endif
 565 
 566         /* Get it into r3 */
 567         MFVRD(R3, v0)
 568 
 569 .Lout:
 570         subi    r6,r1,56+10*16
 571         subi    r7,r1,56+2*16
 572 
 573         lvx     v20,0,r6
 574         lvx     v21,off16,r6
 575         lvx     v22,off32,r6
 576         lvx     v23,off48,r6
 577         lvx     v24,off64,r6
 578         lvx     v25,off80,r6
 579         lvx     v26,off96,r6
 580         lvx     v27,off112,r6
 581         lvx     v28,0,r7
 582         lvx     v29,off16,r7
 583 
 584         ld      r31,-8(r1)
 585         ld      r30,-16(r1)
 586         ld      r29,-24(r1)
 587         ld      r28,-32(r1)
 588         ld      r27,-40(r1)
 589         ld      r26,-48(r1)
 590         ld      r25,-56(r1)
 591 
 592         blr
 593 
 594 .Lfirst_warm_up_done:
 595         lvx     const1,0,r3
 596         addi    r3,r3,16
 597 
 598         VPMSUMD(v8,v16,const1)
 599         VPMSUMD(v9,v17,const1)
 600         VPMSUMD(v10,v18,const1)
 601         VPMSUMD(v11,v19,const1)
 602         VPMSUMD(v12,v20,const1)
 603         VPMSUMD(v13,v21,const1)
 604         VPMSUMD(v14,v22,const1)
 605         VPMSUMD(v15,v23,const1)
 606 
 607         b       .Lsecond_cool_down
 608 
 609 .Lshort:
 610         cmpdi   r5,0
 611         beq     .Lzero
 612 
 613         addis   r3,r2,.short_constants@toc@ha
 614         addi    r3,r3,.short_constants@toc@l
 615 
 616         /* Calculate where in the constant table we need to start */
 617         subfic  r6,r5,256
 618         add     r3,r3,r6
 619 
 620         /* How many 16 byte chunks? */
 621         srdi    r7,r5,4
 622         mtctr   r7
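        /*
         * e.g. (illustrative) a 112 byte buffer starts 256 - 112 = 144 bytes
         * into the short constants table and walks the chain below for
         * 112/16 = 7 chunks, dropping out at the matching .Lv label.
         */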
 623 
 624         vxor    v19,v19,v19
 625         vxor    v20,v20,v20
 626 
 627         lvx     v0,0,r4
 628         lvx     v16,0,r3
 629         VPERM(v0,v0,v16,byteswap)
 630         vxor    v0,v0,v8        /* xor in initial value */
 631         VPMSUMW(v0,v0,v16)
 632         bdz     .Lv0
 633 
 634         lvx     v1,off16,r4
 635         lvx     v17,off16,r3
 636         VPERM(v1,v1,v17,byteswap)
 637         VPMSUMW(v1,v1,v17)
 638         bdz     .Lv1
 639 
 640         lvx     v2,off32,r4
 641         lvx     v16,off32,r3
 642         VPERM(v2,v2,v16,byteswap)
 643         VPMSUMW(v2,v2,v16)
 644         bdz     .Lv2
 645 
 646         lvx     v3,off48,r4
 647         lvx     v17,off48,r3
 648         VPERM(v3,v3,v17,byteswap)
 649         VPMSUMW(v3,v3,v17)
 650         bdz     .Lv3
 651 
 652         lvx     v4,off64,r4
 653         lvx     v16,off64,r3
 654         VPERM(v4,v4,v16,byteswap)
 655         VPMSUMW(v4,v4,v16)
 656         bdz     .Lv4
 657 
 658         lvx     v5,off80,r4
 659         lvx     v17,off80,r3
 660         VPERM(v5,v5,v17,byteswap)
 661         VPMSUMW(v5,v5,v17)
 662         bdz     .Lv5
 663 
 664         lvx     v6,off96,r4
 665         lvx     v16,off96,r3
 666         VPERM(v6,v6,v16,byteswap)
 667         VPMSUMW(v6,v6,v16)
 668         bdz     .Lv6
 669 
 670         lvx     v7,off112,r4
 671         lvx     v17,off112,r3
 672         VPERM(v7,v7,v17,byteswap)
 673         VPMSUMW(v7,v7,v17)
 674         bdz     .Lv7
 675 
 676         addi    r3,r3,128
 677         addi    r4,r4,128
 678 
 679         lvx     v8,0,r4
 680         lvx     v16,0,r3
 681         VPERM(v8,v8,v16,byteswap)
 682         VPMSUMW(v8,v8,v16)
 683         bdz     .Lv8
 684 
 685         lvx     v9,off16,r4
 686         lvx     v17,off16,r3
 687         VPERM(v9,v9,v17,byteswap)
 688         VPMSUMW(v9,v9,v17)
 689         bdz     .Lv9
 690 
 691         lvx     v10,off32,r4
 692         lvx     v16,off32,r3
 693         VPERM(v10,v10,v16,byteswap)
 694         VPMSUMW(v10,v10,v16)
 695         bdz     .Lv10
 696 
 697         lvx     v11,off48,r4
 698         lvx     v17,off48,r3
 699         VPERM(v11,v11,v17,byteswap)
 700         VPMSUMW(v11,v11,v17)
 701         bdz     .Lv11
 702 
 703         lvx     v12,off64,r4
 704         lvx     v16,off64,r3
 705         VPERM(v12,v12,v16,byteswap)
 706         VPMSUMW(v12,v12,v16)
 707         bdz     .Lv12
 708 
 709         lvx     v13,off80,r4
 710         lvx     v17,off80,r3
 711         VPERM(v13,v13,v17,byteswap)
 712         VPMSUMW(v13,v13,v17)
 713         bdz     .Lv13
 714 
 715         lvx     v14,off96,r4
 716         lvx     v16,off96,r3
 717         VPERM(v14,v14,v16,byteswap)
 718         VPMSUMW(v14,v14,v16)
 719         bdz     .Lv14
 720 
 721         lvx     v15,off112,r4
 722         lvx     v17,off112,r3
 723         VPERM(v15,v15,v17,byteswap)
 724         VPMSUMW(v15,v15,v17)
 725 
 726 .Lv15:  vxor    v19,v19,v15
 727 .Lv14:  vxor    v20,v20,v14
 728 .Lv13:  vxor    v19,v19,v13
 729 .Lv12:  vxor    v20,v20,v12
 730 .Lv11:  vxor    v19,v19,v11
 731 .Lv10:  vxor    v20,v20,v10
 732 .Lv9:   vxor    v19,v19,v9
 733 .Lv8:   vxor    v20,v20,v8
 734 .Lv7:   vxor    v19,v19,v7
 735 .Lv6:   vxor    v20,v20,v6
 736 .Lv5:   vxor    v19,v19,v5
 737 .Lv4:   vxor    v20,v20,v4
 738 .Lv3:   vxor    v19,v19,v3
 739 .Lv2:   vxor    v20,v20,v2
 740 .Lv1:   vxor    v19,v19,v1
 741 .Lv0:   vxor    v20,v20,v0
 742 
 743         vxor    v0,v19,v20
 744 
 745         b       .Lbarrett_reduction
 746 
 747 .Lzero:
 748         mr      r3,r10
 749         b       .Lout
 750 
 751 FUNC_END(CRC_FUNCTION_NAME)
