root/arch/x86/crypto/crc32c-pcl-intel-asm_64.S

/* [<][>][^][v][top][bottom][index][help] */
   1 /*
   2  * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
   3  *
   4  * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
   5  * downloaded from:
   6  * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
   7  * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
   8  *
   9  * Copyright (C) 2012 Intel Corporation.
  10  *
  11  * Authors:
  12  *      Wajdi Feghali <wajdi.k.feghali@intel.com>
  13  *      James Guilford <james.guilford@intel.com>
  14  *      David Cote <david.m.cote@intel.com>
  15  *      Tim Chen <tim.c.chen@linux.intel.com>
  16  *
  17  * This software is available to you under a choice of one of two
  18  * licenses.  You may choose to be licensed under the terms of the GNU
  19  * General Public License (GPL) Version 2, available from the file
  20  * COPYING in the main directory of this source tree, or the
  21  * OpenIB.org BSD license below:
  22  *
  23  *     Redistribution and use in source and binary forms, with or
  24  *     without modification, are permitted provided that the following
  25  *     conditions are met:
  26  *
  27  *      - Redistributions of source code must retain the above
  28  *        copyright notice, this list of conditions and the following
  29  *        disclaimer.
  30  *
  31  *      - Redistributions in binary form must reproduce the above
  32  *        copyright notice, this list of conditions and the following
  33  *        disclaimer in the documentation and/or other materials
  34  *        provided with the distribution.
  35  *
  36  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  37  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  38  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  39  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  40  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  41  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  42  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  43  * SOFTWARE.
  44  */
  45 
  46 #include <asm/inst.h>
  47 #include <linux/linkage.h>
  48 #include <asm/nospec-branch.h>
  49 
   50 ## iSCSI CRC32C implementation using the crc32 and pclmulqdq instructions
  51 
  52 .macro LABEL prefix n
  53 \prefix\n\():
  54 .endm
  55 
  56 .macro JMPTBL_ENTRY i
  57 .word crc_\i - crc_array
  58 .endm
  59 
  60 .macro JNC_LESS_THAN j
  61         jnc less_than_\j
  62 .endm
  63 
  64 # Define threshold where buffers are considered "small" and routed to more
  65 # efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so
  66 # SMALL_SIZE can be no larger than 255.
  67 
  68 #define SMALL_SIZE 200
  69 
  70 .if (SMALL_SIZE > 255)
  71 .error "SMALL_ SIZE must be < 256"
  72 .endif
  73 
  74 # unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init);
  75 
  76 .text
  77 ENTRY(crc_pcl)
  78 #define    bufp         %rdi
  79 #define    bufp_dw      %edi
  80 #define    bufp_w       %di
  81 #define    bufp_b       %dil
  82 #define    bufptmp      %rcx
  83 #define    block_0      %rcx
  84 #define    block_1      %rdx
  85 #define    block_2      %r11
  86 #define    len          %rsi
  87 #define    len_dw       %esi
  88 #define    len_w        %si
  89 #define    len_b        %sil
  90 #define    crc_init_arg %rdx
  91 #define    tmp          %rbx
  92 #define    crc_init     %r8
  93 #define    crc_init_dw  %r8d
  94 #define    crc1         %r9
  95 #define    crc2         %r10
  96 
  97         pushq   %rbx
  98         pushq   %rdi
  99         pushq   %rsi
 100 
 101         ## Move crc_init for Linux to a different
 102         mov     crc_init_arg, crc_init
 103 
 104         ################################################################
 105         ## 1) ALIGN:
 106         ################################################################
 107 
 108         mov     bufp, bufptmp           # rdi = *buf
 109         neg     bufp
 110         and     $7, bufp                # calculate the unalignment amount of
 111                                         # the address
 112         je      proc_block              # Skip if aligned
 113 
 114         ## If len is less than 8 and we're unaligned, we need to jump
 115         ## to special code to avoid reading beyond the end of the buffer
 116         cmp     $8, len
 117         jae     do_align
 118         # less_than_8 expects length in upper 3 bits of len_dw
 119         # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
 120         shl     $32-3+1, len_dw
 121         jmp     less_than_8_post_shl1
 122 
 123 do_align:
 124         #### Calculate CRC of unaligned bytes of the buffer (if any)
 125         movq    (bufptmp), tmp          # load a quadward from the buffer
 126         add     bufp, bufptmp           # align buffer pointer for quadword
 127                                         # processing
 128         sub     bufp, len               # update buffer length
 129 align_loop:
 130         crc32b  %bl, crc_init_dw        # compute crc32 of 1-byte
 131         shr     $8, tmp                 # get next byte
 132         dec     bufp
 133         jne     align_loop
 134 
 135 proc_block:
 136 
 137         ################################################################
 138         ## 2) PROCESS  BLOCKS:
 139         ################################################################
 140 
 141         ## compute num of bytes to be processed
 142         movq    len, tmp                # save num bytes in tmp
 143 
 144         cmpq    $128*24, len
 145         jae     full_block
 146 
 147 continue_block:
 148         cmpq    $SMALL_SIZE, len
 149         jb      small
 150 
 151         ## len < 128*24
 152         movq    $2731, %rax             # 2731 = ceil(2^16 / 24)
 153         mul     len_dw
 154         shrq    $16, %rax
 155 
 156         ## eax contains floor(bytes / 24) = num 24-byte chunks to do
 157 
 158         ## process rax 24-byte chunks (128 >= rax >= 0)
 159 
 160         ## compute end address of each block
 161         ## block 0 (base addr + RAX * 8)
 162         ## block 1 (base addr + RAX * 16)
 163         ## block 2 (base addr + RAX * 24)
 164         lea     (bufptmp, %rax, 8), block_0
 165         lea     (block_0, %rax, 8), block_1
 166         lea     (block_1, %rax, 8), block_2
 167 
 168         xor     crc1, crc1
 169         xor     crc2, crc2
 170 
 171         ## branch into array
 172         lea     jump_table(%rip), bufp
 173         movzxw  (bufp, %rax, 2), len
 174         lea     crc_array(%rip), bufp
 175         lea     (bufp, len, 1), bufp
 176         JMP_NOSPEC bufp
 177 
 178         ################################################################
 179         ## 2a) PROCESS FULL BLOCKS:
 180         ################################################################
 181 full_block:
 182         movl    $128,%eax
 183         lea     128*8*2(block_0), block_1
 184         lea     128*8*3(block_0), block_2
 185         add     $128*8*1, block_0
 186 
 187         xor     crc1,crc1
 188         xor     crc2,crc2
 189 
 190         # Fall thruogh into top of crc array (crc_128)
 191 
 192         ################################################################
 193         ## 3) CRC Array:
 194         ################################################################
 195 
 196 crc_array:
 197         i=128
 198 .rept 128-1
 199 .altmacro
 200 LABEL crc_ %i
 201 .noaltmacro
 202         crc32q   -i*8(block_0), crc_init
 203         crc32q   -i*8(block_1), crc1
 204         crc32q   -i*8(block_2), crc2
 205         i=(i-1)
 206 .endr
 207 
 208 .altmacro
 209 LABEL crc_ %i
 210 .noaltmacro
 211         crc32q   -i*8(block_0), crc_init
 212         crc32q   -i*8(block_1), crc1
 213 # SKIP  crc32  -i*8(block_2), crc2 ; Don't do this one yet
 214 
 215         mov     block_2, block_0
 216 
 217         ################################################################
 218         ## 4) Combine three results:
 219         ################################################################
 220 
 221         lea     (K_table-8)(%rip), bufp         # first entry is for idx 1
 222         shlq    $3, %rax                        # rax *= 8
 223         pmovzxdq (bufp,%rax), %xmm0             # 2 consts: K1:K2
 224         leal    (%eax,%eax,2), %eax             # rax *= 3 (total *24)
 225         subq    %rax, tmp                       # tmp -= rax*24
 226 
 227         movq    crc_init, %xmm1                 # CRC for block 1
 228         PCLMULQDQ 0x00,%xmm0,%xmm1              # Multiply by K2
 229 
 230         movq    crc1, %xmm2                     # CRC for block 2
 231         PCLMULQDQ 0x10, %xmm0, %xmm2            # Multiply by K1
 232 
 233         pxor    %xmm2,%xmm1
 234         movq    %xmm1, %rax
 235         xor     -i*8(block_2), %rax
 236         mov     crc2, crc_init
 237         crc32   %rax, crc_init
 238 
 239         ################################################################
 240         ## 5) Check for end:
 241         ################################################################
 242 
 243 LABEL crc_ 0
 244         mov     tmp, len
 245         cmp     $128*24, tmp
 246         jae     full_block
 247         cmp     $24, tmp
 248         jae     continue_block
 249 
 250 less_than_24:
 251         shl     $32-4, len_dw                   # less_than_16 expects length
 252                                                 # in upper 4 bits of len_dw
 253         jnc     less_than_16
 254         crc32q  (bufptmp), crc_init
 255         crc32q  8(bufptmp), crc_init
 256         jz      do_return
 257         add     $16, bufptmp
 258         # len is less than 8 if we got here
 259         # less_than_8 expects length in upper 3 bits of len_dw
 260         # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30]
 261         shl     $2, len_dw
 262         jmp     less_than_8_post_shl1
 263 
 264         #######################################################################
 265         ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full)
 266         #######################################################################
 267 small:
 268         shl $32-8, len_dw               # Prepare len_dw for less_than_256
 269         j=256
 270 .rept 5                                 # j = {256, 128, 64, 32, 16}
 271 .altmacro
 272 LABEL less_than_ %j                     # less_than_j: Length should be in
 273                                         # upper lg(j) bits of len_dw
 274         j=(j/2)
 275         shl     $1, len_dw              # Get next MSB
 276         JNC_LESS_THAN %j
 277 .noaltmacro
 278         i=0
 279 .rept (j/8)
 280         crc32q  i(bufptmp), crc_init    # Compute crc32 of 8-byte data
 281         i=i+8
 282 .endr
 283         jz      do_return               # Return if remaining length is zero
 284         add     $j, bufptmp             # Advance buf
 285 .endr
 286 
 287 less_than_8:                            # Length should be stored in
 288                                         # upper 3 bits of len_dw
 289         shl     $1, len_dw
 290 less_than_8_post_shl1:
 291         jnc     less_than_4
 292         crc32l  (bufptmp), crc_init_dw  # CRC of 4 bytes
 293         jz      do_return               # return if remaining data is zero
 294         add     $4, bufptmp
 295 less_than_4:                            # Length should be stored in
 296                                         # upper 2 bits of len_dw
 297         shl     $1, len_dw
 298         jnc     less_than_2
 299         crc32w  (bufptmp), crc_init_dw  # CRC of 2 bytes
 300         jz      do_return               # return if remaining data is zero
 301         add     $2, bufptmp
 302 less_than_2:                            # Length should be stored in the MSB
 303                                         # of len_dw
 304         shl     $1, len_dw
 305         jnc     less_than_1
 306         crc32b  (bufptmp), crc_init_dw  # CRC of 1 byte
 307 less_than_1:                            # Length should be zero
 308 do_return:
 309         movq    crc_init, %rax
 310         popq    %rsi
 311         popq    %rdi
 312         popq    %rbx
 313         ret
 314 ENDPROC(crc_pcl)
 315 
 316 .section        .rodata, "a", @progbits
 317         ################################################################
 318         ## jump table        Table is 129 entries x 2 bytes each
 319         ################################################################
 320 .align 4
 321 jump_table:
 322         i=0
 323 .rept 129
 324 .altmacro
 325 JMPTBL_ENTRY %i
 326 .noaltmacro
 327         i=i+1
 328 .endr
 329 
 330 
  331         ################################################################
  332         ## PCLMULQDQ tables
  333         ## Table is 128 entries x 2 words (8 bytes) each
      ## Indexed by the number of 24-byte chunks just processed: the
      ## combine step loads the entry at K_table - 8 + chunks * 8, so the
      ## first entry belongs to a chunk count of 1.  Both .long values of
      ## an entry are loaded together with pmovzxdq and used as the two
      ## multipliers (called K1 and K2 in the combine step comments).
  334         ################################################################
  335 .align 8
  336 K_table:
  337         .long 0x493c7d27, 0x00000001
  338         .long 0xba4fc28e, 0x493c7d27
  339         .long 0xddc0152b, 0xf20c0dfe
  340         .long 0x9e4addf8, 0xba4fc28e
  341         .long 0x39d3b296, 0x3da6d0cb
  342         .long 0x0715ce53, 0xddc0152b
  343         .long 0x47db8317, 0x1c291d04
  344         .long 0x0d3b6092, 0x9e4addf8
  345         .long 0xc96cfdc0, 0x740eef02
  346         .long 0x878a92a7, 0x39d3b296
  347         .long 0xdaece73e, 0x083a6eec
  348         .long 0xab7aff2a, 0x0715ce53
  349         .long 0x2162d385, 0xc49f4f67
  350         .long 0x83348832, 0x47db8317
  351         .long 0x299847d5, 0x2ad91c30
  352         .long 0xb9e02b86, 0x0d3b6092
  353         .long 0x18b33a4e, 0x6992cea2
  354         .long 0xb6dd949b, 0xc96cfdc0
  355         .long 0x78d9ccb7, 0x7e908048
  356         .long 0xbac2fd7b, 0x878a92a7
  357         .long 0xa60ce07b, 0x1b3d8f29
  358         .long 0xce7f39f4, 0xdaece73e
  359         .long 0x61d82e56, 0xf1d0f55e
  360         .long 0xd270f1a2, 0xab7aff2a
  361         .long 0xc619809d, 0xa87ab8a8
  362         .long 0x2b3cac5d, 0x2162d385
  363         .long 0x65863b64, 0x8462d800
  364         .long 0x1b03397f, 0x83348832
  365         .long 0xebb883bd, 0x71d111a8
  366         .long 0xb3e32c28, 0x299847d5
  367         .long 0x064f7f26, 0xffd852c6
  368         .long 0xdd7e3b0c, 0xb9e02b86
  369         .long 0xf285651c, 0xdcb17aa4
  370         .long 0x10746f3c, 0x18b33a4e
  371         .long 0xc7a68855, 0xf37c5aee
  372         .long 0x271d9844, 0xb6dd949b
  373         .long 0x8e766a0c, 0x6051d5a2
  374         .long 0x93a5f730, 0x78d9ccb7
  375         .long 0x6cb08e5c, 0x18b0d4ff
  376         .long 0x6b749fb2, 0xbac2fd7b
  377         .long 0x1393e203, 0x21f3d99c
  378         .long 0xcec3662e, 0xa60ce07b
  379         .long 0x96c515bb, 0x8f158014
  380         .long 0xe6fc4e6a, 0xce7f39f4
  381         .long 0x8227bb8a, 0xa00457f7
  382         .long 0xb0cd4768, 0x61d82e56
  383         .long 0x39c7ff35, 0x8d6d2c43
  384         .long 0xd7a4825c, 0xd270f1a2
  385         .long 0x0ab3844b, 0x00ac29cf
  386         .long 0x0167d312, 0xc619809d
  387         .long 0xf6076544, 0xe9adf796
  388         .long 0x26f6a60a, 0x2b3cac5d
  389         .long 0xa741c1bf, 0x96638b34
  390         .long 0x98d8d9cb, 0x65863b64
  391         .long 0x49c3cc9c, 0xe0e9f351
  392         .long 0x68bce87a, 0x1b03397f
  393         .long 0x57a3d037, 0x9af01f2d
  394         .long 0x6956fc3b, 0xebb883bd
  395         .long 0x42d98888, 0x2cff42cf
  396         .long 0x3771e98f, 0xb3e32c28
  397         .long 0xb42ae3d9, 0x88f25a3a
  398         .long 0x2178513a, 0x064f7f26
  399         .long 0xe0ac139e, 0x4e36f0b0
  400         .long 0x170076fa, 0xdd7e3b0c
  401         .long 0x444dd413, 0xbd6f81f8
  402         .long 0x6f345e45, 0xf285651c
  403         .long 0x41d17b64, 0x91c9bd4b
  404         .long 0xff0dba97, 0x10746f3c
  405         .long 0xa2b73df1, 0x885f087b
  406         .long 0xf872e54c, 0xc7a68855
  407         .long 0x1e41e9fc, 0x4c144932
  408         .long 0x86d8e4d2, 0x271d9844
  409         .long 0x651bd98b, 0x52148f02
  410         .long 0x5bb8f1bc, 0x8e766a0c
  411         .long 0xa90fd27a, 0xa3c6f37a
  412         .long 0xb3af077a, 0x93a5f730
  413         .long 0x4984d782, 0xd7c0557f
  414         .long 0xca6ef3ac, 0x6cb08e5c
  415         .long 0x234e0b26, 0x63ded06a
  416         .long 0xdd66cbbb, 0x6b749fb2
  417         .long 0x4597456a, 0x4d56973c
  418         .long 0xe9e28eb4, 0x1393e203
  419         .long 0x7b3ff57a, 0x9669c9df
  420         .long 0xc9c8b782, 0xcec3662e
  421         .long 0x3f70cc6f, 0xe417f38a
  422         .long 0x93e106a4, 0x96c515bb
  423         .long 0x62ec6c6d, 0x4b9e0f71
  424         .long 0xd813b325, 0xe6fc4e6a
  425         .long 0x0df04680, 0xd104b8fc
  426         .long 0x2342001e, 0x8227bb8a
  427         .long 0x0a2a8d7e, 0x5b397730
  428         .long 0x6d9a4957, 0xb0cd4768
  429         .long 0xe8b6368b, 0xe78eb416
  430         .long 0xd2c3ed1a, 0x39c7ff35
  431         .long 0x995a5724, 0x61ff0e01
  432         .long 0x9ef68d35, 0xd7a4825c
  433         .long 0x0c139b31, 0x8d96551c
  434         .long 0xf2271e60, 0x0ab3844b
  435         .long 0x0b0bf8ca, 0x0bf80dd2
  436         .long 0x2664fd8b, 0x0167d312
  437         .long 0xed64812d, 0x8821abed
  438         .long 0x02ee03b2, 0xf6076544
  439         .long 0x8604ae0f, 0x6a45d2b2
  440         .long 0x363bd6b3, 0x26f6a60a
  441         .long 0x135c83fd, 0xd8d26619
  442         .long 0x5fabe670, 0xa741c1bf
  443         .long 0x35ec3279, 0xde87806c
  444         .long 0x00bcf5f6, 0x98d8d9cb
  445         .long 0x8ae00689, 0x14338754
  446         .long 0x17f27698, 0x49c3cc9c
  447         .long 0x58ca5f00, 0x5bd2011f
  448         .long 0xaa7c7ad5, 0x68bce87a
  449         .long 0xb5cfca28, 0xdd07448e
  450         .long 0xded288f8, 0x57a3d037
  451         .long 0x59f229bc, 0xdde8f5b9
  452         .long 0x6d390dec, 0x6956fc3b
  453         .long 0x37170390, 0xa3e3e02c
  454         .long 0x6353c1cc, 0x42d98888
  455         .long 0xc4584f5c, 0xd73c7bea
  456         .long 0xf48642e9, 0x3771e98f
  457         .long 0x531377e2, 0x80ff0093
  458         .long 0xdd35bc8d, 0xb42ae3d9
  459         .long 0xb25b29f2, 0x8fe4c34d
  460         .long 0x9a5ede41, 0x2178513a
  461         .long 0xa563905d, 0xdf99fc11
  462         .long 0x45cddf4e, 0xe0ac139e
  463         .long 0xacfa3103, 0x6c23e841
  464         .long 0xa51b6135, 0x170076fa

/* [<][>][^][v][top][bottom][index][help] */