root/arch/x86/crypto/aes_ctrby8_avx-x86_64.S

/* [<][>][^][v][top][bottom][index][help] */
   1 /*
   2  *      Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
   3  *
   4  * This is AES128/192/256 CTR mode optimization implementation. It requires
   5  * the support of Intel(R) AESNI and AVX instructions.
   6  *
   7  * This work was inspired by the AES CTR mode optimization published
   8  * in Intel Optimized IPSEC Cryptographic library.
   9  * Additional information on it can be found at:
  10  *    http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
  11  *
  12  * This file is provided under a dual BSD/GPLv2 license.  When using or
  13  * redistributing this file, you may do so under either license.
  14  *
  15  * GPL LICENSE SUMMARY
  16  *
  17  * Copyright(c) 2014 Intel Corporation.
  18  *
  19  * This program is free software; you can redistribute it and/or modify
  20  * it under the terms of version 2 of the GNU General Public License as
  21  * published by the Free Software Foundation.
  22  *
  23  * This program is distributed in the hope that it will be useful, but
  24  * WITHOUT ANY WARRANTY; without even the implied warranty of
  25  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  26  * General Public License for more details.
  27  *
  28  * Contact Information:
  29  * James Guilford <james.guilford@intel.com>
  30  * Sean Gulley <sean.m.gulley@intel.com>
  31  * Chandramouli Narayanan <mouli@linux.intel.com>
  32  *
  33  * BSD LICENSE
  34  *
  35  * Copyright(c) 2014 Intel Corporation.
  36  *
  37  * Redistribution and use in source and binary forms, with or without
  38  * modification, are permitted provided that the following conditions
  39  * are met:
  40  *
  41  * Redistributions of source code must retain the above copyright
  42  * notice, this list of conditions and the following disclaimer.
  43  * Redistributions in binary form must reproduce the above copyright
  44  * notice, this list of conditions and the following disclaimer in
  45  * the documentation and/or other materials provided with the
  46  * distribution.
  47  * Neither the name of Intel Corporation nor the names of its
  48  * contributors may be used to endorse or promote products derived
  49  * from this software without specific prior written permission.
  50  *
  51  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  52  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  53  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
  54  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  55  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  56  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  57  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  58  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  59  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  60  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  61  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  62  *
  63  */
  64 
  65 #include <linux/linkage.h>
  66 #include <asm/inst.h>
  67 
  68 #define VMOVDQ          vmovdqu         /* move used for the p_in loads and p_out stores */
  69 
     /* xdata0..xdata7: the (up to eight) blocks that do_aes works on in parallel */
  70 #define xdata0          %xmm0
  71 #define xdata1          %xmm1
  72 #define xdata2          %xmm2
  73 #define xdata3          %xmm3
  74 #define xdata4          %xmm4
  75 #define xdata5          %xmm5
  76 #define xdata6          %xmm6
  77 #define xdata7          %xmm7
     /* running counter; do_aes_ctrmain keeps it byte-reversed relative to *p_iv */
  78 #define xcounter        %xmm8
     /* vpshufb control mask, loaded from byteswap_const */
  79 #define xbyteswap       %xmm9
     /* round keys that stay resident; loaded when do_aes is asked to load keys */
  80 #define xkey0           %xmm10
  81 #define xkey4           %xmm11
  82 #define xkey8           %xmm12
  83 #define xkey12          %xmm13
     /* scratch: the remaining round keys are streamed through these, then input data */
  84 #define xkeyA           %xmm14
  85 #define xkeyB           %xmm15
  86 
     /* function arguments, in the order of the C prototypes: in, iv, keys, out, num_bytes */
  87 #define p_in            %rdi
  88 #define p_iv            %rsi
  89 #define p_keys          %rdx
  90 #define p_out           %rcx
  91 #define num_bytes       %r8
  92 
     /* general purpose scratch register used by do_aes_ctrmain */
  93 #define tmp             %r10
     /* selectors for the first argument of the club macro */
  94 #define DDQ_DATA        0
  95 #define XDATA           1
     /* key_len values accepted by do_aes and do_aes_ctrmain */
  96 #define KEY_128         1
  97 #define KEY_192         2
  98 #define KEY_256         3
  99 
  100 .section .rodata
  101 .align 16
  102 
      /* vpshufb control mask that reverses the order of the 16 bytes of an xmm register */
  103 byteswap_const:
  104         .octa 0x000102030405060708090A0B0C0D0E0F
      /* all-ones in the low 64 bits; vptest against it tells whether a low qword is zero */
  105 ddq_low_msk:
  106         .octa 0x0000000000000000FFFFFFFFFFFFFFFF
      /* 1 in the high 64 bits; added to carry out of the low qword of the counter */
  107 ddq_high_add_1:
  108         .octa 0x00000000000000010000000000000000
      /* ddq_add_N: the value N in the low 64 bits, used to step the counter by N */
  109 ddq_add_1:
  110         .octa 0x00000000000000000000000000000001
  111 ddq_add_2:
  112         .octa 0x00000000000000000000000000000002
  113 ddq_add_3:
  114         .octa 0x00000000000000000000000000000003
  115 ddq_add_4:
  116         .octa 0x00000000000000000000000000000004
  117 ddq_add_5:
  118         .octa 0x00000000000000000000000000000005
  119 ddq_add_6:
  120         .octa 0x00000000000000000000000000000006
  121 ddq_add_7:
  122         .octa 0x00000000000000000000000000000007
  123 ddq_add_8:
  124         .octa 0x00000000000000000000000000000008
  125 
  126 .text
 127 
  128 /*
       * setddq n - bind the assembler symbol var_ddq_add to the constant
       * ddq_add_<n>, so that the code following it can address that constant
       * through one fixed name.
       */
  129 
  130 .macro setddq n
  131         var_ddq_add = ddq_add_\n
  132 .endm
 133 
  134 /*
       * setxdata n - bind the assembler symbol var_xdata to register %xmm<n>,
       * so that the code following it can name that register as var_xdata.
       */
  135 .macro setxdata n
  136         var_xdata = %xmm\n
  137 .endm
 138 
  139 /*
       * club name, id - club the numeric 'id' to the symbol 'name':
       *   name == DDQ_DATA  ->  var_ddq_add becomes ddq_add_<id>
       *   name == XDATA     ->  var_xdata   becomes %xmm<id>
       *
       * 'id' is usually an assembler variable such as the loop index i.
       * .altmacro is switched on for the body only so that "%\id" is evaluated
       * as an expression and its numeric value, not the variable's name, is
       * pasted into the symbol/register name by setddq/setxdata.
       */
  140 
  141 .macro club name, id
  142 .altmacro
  143         .if \name == DDQ_DATA
  144                 setddq %\id
  145         .elseif \name == XDATA
  146                 setxdata %\id
  147         .endif
  148 .noaltmacro
  149 .endm
 150 
  151 /*
  152  * do_aes num_in_par load_keys key_len
       *
       *   b        number of 16-byte blocks handled by this expansion; the
       *            callers in this file pass 1..8
       *   k        non-zero: also (re)load the resident round keys xkey0,
       *            xkey4, xkey8 and xkey12 from p_keys
       *   key_len  KEY_128, KEY_192 or KEY_256 (last round uses key 10, 12
       *            or 14 respectively)
       *
       * Builds b counter blocks in xdata0..xdata<b-1>, advances xcounter by b,
       * runs the AES rounds on all blocks, XORs the result with the b input
       * blocks found at p_in and stores it at p_out.
       *
       * xkeyA/xkeyB are scratch: they stream the round keys that are not kept
       * resident and afterwards hold the input blocks.  Flags are clobbered.
       *
  153  * This increments p_in, but not p_out
  154  */
  155 .macro do_aes b, k, key_len
  156         .set by, \b
  157         .set load_keys, \k
  158         .set klen, \key_len
  159 
  160         .if (load_keys)
  161                 vmovdqa 0*16(p_keys), xkey0
  162         .endif
  163 
              /* block 0 is the current counter, shuffled back with the byte-reversal mask */
  164         vpshufb xbyteswap, xcounter, xdata0
  165 
              /*
               * xdata<i> = xcounter + i for the remaining blocks.  If the low
               * qword of the sum is zero the addition wrapped, so 1 is added to
               * the high qword of this block and of xcounter; later blocks are
               * derived from xcounter and thereby inherit the carry.
               */
  166         .set i, 1
  167         .rept (by - 1)
  168                 club DDQ_DATA, i
  169                 club XDATA, i
  170                 vpaddq  var_ddq_add(%rip), xcounter, var_xdata
  171                 vptest  ddq_low_msk(%rip), var_xdata
  172                 jnz 1f
  173                 vpaddq  ddq_high_add_1(%rip), var_xdata, var_xdata
  174                 vpaddq  ddq_high_add_1(%rip), xcounter, xcounter
  175                 1:
  176                 vpshufb xbyteswap, var_xdata, var_xdata
  177                 .set i, (i +1)
  178         .endr
  179 
  180         vmovdqa 1*16(p_keys), xkeyA
  181 
              /* XOR block 0 with round key 0, then step xcounter by 'by' (same carry rule) */
  182         vpxor   xkey0, xdata0, xdata0
  183         club DDQ_DATA, by
  184         vpaddq  var_ddq_add(%rip), xcounter, xcounter
  185         vptest  ddq_low_msk(%rip), xcounter
  186         jnz     1f
  187         vpaddq  ddq_high_add_1(%rip), xcounter, xcounter
  188         1:
  189 
              /* XOR the remaining blocks with round key 0 */
  190         .set i, 1
  191         .rept (by - 1)
  192                 club XDATA, i
  193                 vpxor   xkey0, var_xdata, var_xdata
  194                 .set i, (i +1)
  195         .endr
  196 
              /*
               * From here on each round is applied to all 'by' blocks while the
               * key for a following round is loaded into whichever of
               * xkeyA/xkeyB is free (or into a resident xkeyN when load_keys).
               */
  197         vmovdqa 2*16(p_keys), xkeyB
  198 
  199         .set i, 0
  200         .rept by
  201                 club XDATA, i
  202                 vaesenc xkeyA, var_xdata, var_xdata             /* key 1 */
  203                 .set i, (i +1)
  204         .endr
  205 
  206         .if (klen == KEY_128)
  207                 .if (load_keys)
  208                         vmovdqa 3*16(p_keys), xkey4
  209                 .endif
  210         .else
  211                 vmovdqa 3*16(p_keys), xkeyA
  212         .endif
  213 
  214         .set i, 0
  215         .rept by
  216                 club XDATA, i
  217                 vaesenc xkeyB, var_xdata, var_xdata             /* key 2 */
  218                 .set i, (i +1)
  219         .endr
  220 
              /* advance the input pointer now; the input loads further down use negative offsets */
  221         add     $(16*by), p_in
  222 
  223         .if (klen == KEY_128)
  224                 vmovdqa 4*16(p_keys), xkeyB
  225         .else
  226                 .if (load_keys)
  227                         vmovdqa 4*16(p_keys), xkey4
  228                 .endif
  229         .endif
  230 
  231         .set i, 0
  232         .rept by
  233                 club XDATA, i
  234                 /* key 3 */
  235                 .if (klen == KEY_128)
  236                         vaesenc xkey4, var_xdata, var_xdata
  237                 .else
  238                         vaesenc xkeyA, var_xdata, var_xdata
  239                 .endif
  240                 .set i, (i +1)
  241         .endr
  242 
  243         vmovdqa 5*16(p_keys), xkeyA
  244 
  245         .set i, 0
  246         .rept by
  247                 club XDATA, i
  248                 /* key 4 */
  249                 .if (klen == KEY_128)
  250                         vaesenc xkeyB, var_xdata, var_xdata
  251                 .else
  252                         vaesenc xkey4, var_xdata, var_xdata
  253                 .endif
  254                 .set i, (i +1)
  255         .endr
  256 
  257         .if (klen == KEY_128)
  258                 .if (load_keys)
  259                         vmovdqa 6*16(p_keys), xkey8
  260                 .endif
  261         .else
  262                 vmovdqa 6*16(p_keys), xkeyB
  263         .endif
  264 
  265         .set i, 0
  266         .rept by
  267                 club XDATA, i
  268                 vaesenc xkeyA, var_xdata, var_xdata             /* key 5 */
  269                 .set i, (i +1)
  270         .endr
  271 
  272         vmovdqa 7*16(p_keys), xkeyA
  273 
  274         .set i, 0
  275         .rept by
  276                 club XDATA, i
  277                 /* key 6 */
  278                 .if (klen == KEY_128)
  279                         vaesenc xkey8, var_xdata, var_xdata
  280                 .else
  281                         vaesenc xkeyB, var_xdata, var_xdata
  282                 .endif
  283                 .set i, (i +1)
  284         .endr
  285 
  286         .if (klen == KEY_128)
  287                 vmovdqa 8*16(p_keys), xkeyB
  288         .else
  289                 .if (load_keys)
  290                         vmovdqa 8*16(p_keys), xkey8
  291                 .endif
  292         .endif
  293 
  294         .set i, 0
  295         .rept by
  296                 club XDATA, i
  297                 vaesenc xkeyA, var_xdata, var_xdata             /* key 7 */
  298                 .set i, (i +1)
  299         .endr
  300 
  301         .if (klen == KEY_128)
  302                 .if (load_keys)
  303                         vmovdqa 9*16(p_keys), xkey12
  304                 .endif
  305         .else
  306                 vmovdqa 9*16(p_keys), xkeyA
  307         .endif
  308 
  309         .set i, 0
  310         .rept by
  311                 club XDATA, i
  312                 /* key 8 */
  313                 .if (klen == KEY_128)
  314                         vaesenc xkeyB, var_xdata, var_xdata
  315                 .else
  316                         vaesenc xkey8, var_xdata, var_xdata
  317                 .endif
  318                 .set i, (i +1)
  319         .endr
  320 
  321         vmovdqa 10*16(p_keys), xkeyB
  322 
  323         .set i, 0
  324         .rept by
  325                 club XDATA, i
  326                 /* key 9 */
  327                 .if (klen == KEY_128)
  328                         vaesenc xkey12, var_xdata, var_xdata
  329                 .else
  330                         vaesenc xkeyA, var_xdata, var_xdata
  331                 .endif
  332                 .set i, (i +1)
  333         .endr
  334 
  335         .if (klen != KEY_128)
  336                 vmovdqa 11*16(p_keys), xkeyA
  337         .endif
  338 
              /* key 10 is the final round for KEY_128 and an ordinary round otherwise */
  339         .set i, 0
  340         .rept by
  341                 club XDATA, i
  342                 /* key 10 */
  343                 .if (klen == KEY_128)
  344                         vaesenclast     xkeyB, var_xdata, var_xdata
  345                 .else
  346                         vaesenc xkeyB, var_xdata, var_xdata
  347                 .endif
  348                 .set i, (i +1)
  349         .endr
  350 
  351         .if (klen != KEY_128)
  352                 .if (load_keys)
  353                         vmovdqa 12*16(p_keys), xkey12
  354                 .endif
  355 
  356                 .set i, 0
  357                 .rept by
  358                         club XDATA, i
  359                         vaesenc xkeyA, var_xdata, var_xdata     /* key 11 */
  360                         .set i, (i +1)
  361                 .endr
  362 
  363                 .if (klen == KEY_256)
  364                         vmovdqa 13*16(p_keys), xkeyA
  365                 .endif
  366 
                      /* key 12 is the final round for KEY_192 and an ordinary round for KEY_256 */
  367                 .set i, 0
  368                 .rept by
  369                         club XDATA, i
  370                         .if (klen == KEY_256)
  371                                 /* key 12 */
  372                                 vaesenc xkey12, var_xdata, var_xdata
  373                         .else
  374                                 vaesenclast xkey12, var_xdata, var_xdata
  375                         .endif
  376                         .set i, (i +1)
  377                 .endr
  378 
  379                 .if (klen == KEY_256)
  380                         vmovdqa 14*16(p_keys), xkeyB
  381 
  382                         .set i, 0
  383                         .rept by
  384                                 club XDATA, i
  385                                 /* key 13 */
  386                                 vaesenc xkeyA, var_xdata, var_xdata
  387                                 .set i, (i +1)
  388                         .endr
  389 
  390                         .set i, 0
  391                         .rept by
  392                                 club XDATA, i
  393                                 /* key 14 */
  394                                 vaesenclast     xkeyB, var_xdata, var_xdata
  395                                 .set i, (i +1)
  396                         .endr
  397                 .endif
  398         .endif
  399 
              /*
               * XOR the encrypted counters with the input, two blocks per step.
               * xkeyA/xkeyB are free now and serve as load targets; p_in has
               * already been advanced, so block i sits at (i*16 - 16*by)(p_in).
               */
  400         .set i, 0
  401         .rept (by / 2)
  402                 .set j, (i+1)
  403                 VMOVDQ  (i*16 - 16*by)(p_in), xkeyA
  404                 VMOVDQ  (j*16 - 16*by)(p_in), xkeyB
  405                 club XDATA, i
  406                 vpxor   xkeyA, var_xdata, var_xdata
  407                 club XDATA, j
  408                 vpxor   xkeyB, var_xdata, var_xdata
  409                 .set i, (i+2)
  410         .endr
  411 
              /* odd block count: one block is left over after the pairwise loop */
  412         .if (i < by)
  413                 VMOVDQ  (i*16 - 16*by)(p_in), xkeyA
  414                 club XDATA, i
  415                 vpxor   xkeyA, var_xdata, var_xdata
  416         .endif
  417 
              /* store the results; p_out is left unchanged for the caller to advance */
  418         .set i, 0
  419         .rept by
  420                 club XDATA, i
  421                 VMOVDQ  var_xdata, i*16(p_out)
  422                 .set i, (i+1)
  423         .endr
  424 .endm
  425 
      /* do_aes_load val, key_len - do_aes on val blocks, loading the resident round keys */
  426 .macro do_aes_load val, key_len
  427         do_aes \val, 1, \key_len
  428 .endm
  429 
      /*
       * do_aes_noload val, key_len - do_aes on val blocks without loading
       * xkey0/xkey4/xkey8/xkey12; they must already hold the round keys.
       */
  430 .macro do_aes_noload val, key_len
  431         do_aes \val, 0, \key_len
  432 .endm
 433 
 434 /* main body of aes ctr load */
 435 
 436 .macro do_aes_ctrmain key_len
 437         cmp     $16, num_bytes
 438         jb      .Ldo_return2\key_len
 439 
 440         vmovdqa byteswap_const(%rip), xbyteswap
 441         vmovdqu (p_iv), xcounter
 442         vpshufb xbyteswap, xcounter, xcounter
 443 
 444         mov     num_bytes, tmp
 445         and     $(7*16), tmp
 446         jz      .Lmult_of_8_blks\key_len
 447 
 448         /* 1 <= tmp <= 7 */
 449         cmp     $(4*16), tmp
 450         jg      .Lgt4\key_len
 451         je      .Leq4\key_len
 452 
 453 .Llt4\key_len:
 454         cmp     $(2*16), tmp
 455         jg      .Leq3\key_len
 456         je      .Leq2\key_len
 457 
 458 .Leq1\key_len:
 459         do_aes_load     1, \key_len
 460         add     $(1*16), p_out
 461         and     $(~7*16), num_bytes
 462         jz      .Ldo_return2\key_len
 463         jmp     .Lmain_loop2\key_len
 464 
 465 .Leq2\key_len:
 466         do_aes_load     2, \key_len
 467         add     $(2*16), p_out
 468         and     $(~7*16), num_bytes
 469         jz      .Ldo_return2\key_len
 470         jmp     .Lmain_loop2\key_len
 471 
 472 
 473 .Leq3\key_len:
 474         do_aes_load     3, \key_len
 475         add     $(3*16), p_out
 476         and     $(~7*16), num_bytes
 477         jz      .Ldo_return2\key_len
 478         jmp     .Lmain_loop2\key_len
 479 
 480 .Leq4\key_len:
 481         do_aes_load     4, \key_len
 482         add     $(4*16), p_out
 483         and     $(~7*16), num_bytes
 484         jz      .Ldo_return2\key_len
 485         jmp     .Lmain_loop2\key_len
 486 
 487 .Lgt4\key_len:
 488         cmp     $(6*16), tmp
 489         jg      .Leq7\key_len
 490         je      .Leq6\key_len
 491 
 492 .Leq5\key_len:
 493         do_aes_load     5, \key_len
 494         add     $(5*16), p_out
 495         and     $(~7*16), num_bytes
 496         jz      .Ldo_return2\key_len
 497         jmp     .Lmain_loop2\key_len
 498 
 499 .Leq6\key_len:
 500         do_aes_load     6, \key_len
 501         add     $(6*16), p_out
 502         and     $(~7*16), num_bytes
 503         jz      .Ldo_return2\key_len
 504         jmp     .Lmain_loop2\key_len
 505 
 506 .Leq7\key_len:
 507         do_aes_load     7, \key_len
 508         add     $(7*16), p_out
 509         and     $(~7*16), num_bytes
 510         jz      .Ldo_return2\key_len
 511         jmp     .Lmain_loop2\key_len
 512 
 513 .Lmult_of_8_blks\key_len:
 514         .if (\key_len != KEY_128)
 515                 vmovdqa 0*16(p_keys), xkey0
 516                 vmovdqa 4*16(p_keys), xkey4
 517                 vmovdqa 8*16(p_keys), xkey8
 518                 vmovdqa 12*16(p_keys), xkey12
 519         .else
 520                 vmovdqa 0*16(p_keys), xkey0
 521                 vmovdqa 3*16(p_keys), xkey4
 522                 vmovdqa 6*16(p_keys), xkey8
 523                 vmovdqa 9*16(p_keys), xkey12
 524         .endif
 525 .align 16
 526 .Lmain_loop2\key_len:
 527         /* num_bytes is a multiple of 8 and >0 */
 528         do_aes_noload   8, \key_len
 529         add     $(8*16), p_out
 530         sub     $(8*16), num_bytes
 531         jne     .Lmain_loop2\key_len
 532 
 533 .Ldo_return2\key_len:
 534         /* return updated IV */
 535         vpshufb xbyteswap, xcounter, xcounter
 536         vmovdqu xcounter, (p_iv)
 537         ret
 538 .endm
 539 
  540 /*
  541  * routine to do AES128 CTR enc/decrypt "by8"
  542  * XMM registers are clobbered.
  543  * Saving/restoring must be done at a higher level
  544  * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
  545  *                      unsigned int num_bytes)
       *
       * Register use (see the p_* defines): in = %rdi, iv = %rsi, keys = %rdx,
       * out = %rcx, num_bytes = %r8; %r10 is used as scratch.
       * keys is read with vmovdqa and therefore has to be 16-byte aligned.
       * The data is processed in 16-byte blocks: each output block is the
       * input block XORed with the AES encryption of a counter block.
       * When at least one block is processed, the advanced counter is stored
       * back to *iv.
       * NOTE(review): num_bytes is presumably always a multiple of 16 -
       * confirm against the callers.
       * The function's ret is part of the do_aes_ctrmain expansion.
  546  */
  547 ENTRY(aes_ctr_enc_128_avx_by8)
  548         /* call the aes main loop */
  549         do_aes_ctrmain KEY_128
  550 
  551 ENDPROC(aes_ctr_enc_128_avx_by8)
 552 
  553 /*
  554  * routine to do AES192 CTR enc/decrypt "by8"
  555  * XMM registers are clobbered.
  556  * Saving/restoring must be done at a higher level
  557  * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
  558  *                      unsigned int num_bytes)
       *
       * Register use (see the p_* defines): in = %rdi, iv = %rsi, keys = %rdx,
       * out = %rcx, num_bytes = %r8; %r10 is used as scratch.
       * keys is read with vmovdqa and therefore has to be 16-byte aligned.
       * The data is processed in 16-byte blocks: each output block is the
       * input block XORed with the AES encryption of a counter block.
       * When at least one block is processed, the advanced counter is stored
       * back to *iv.
       * NOTE(review): num_bytes is presumably always a multiple of 16 -
       * confirm against the callers.
       * The function's ret is part of the do_aes_ctrmain expansion.
  559  */
  560 ENTRY(aes_ctr_enc_192_avx_by8)
  561         /* call the aes main loop */
  562         do_aes_ctrmain KEY_192
  563 
  564 ENDPROC(aes_ctr_enc_192_avx_by8)
 565 
  566 /*
  567  * routine to do AES256 CTR enc/decrypt "by8"
  568  * XMM registers are clobbered.
  569  * Saving/restoring must be done at a higher level
  570  * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
  571  *                      unsigned int num_bytes)
       *
       * Register use (see the p_* defines): in = %rdi, iv = %rsi, keys = %rdx,
       * out = %rcx, num_bytes = %r8; %r10 is used as scratch.
       * keys is read with vmovdqa and therefore has to be 16-byte aligned.
       * The data is processed in 16-byte blocks: each output block is the
       * input block XORed with the AES encryption of a counter block.
       * When at least one block is processed, the advanced counter is stored
       * back to *iv.
       * NOTE(review): num_bytes is presumably always a multiple of 16 -
       * confirm against the callers.
       * The function's ret is part of the do_aes_ctrmain expansion.
  572  */
  573 ENTRY(aes_ctr_enc_256_avx_by8)
  574         /* call the aes main loop */
  575         do_aes_ctrmain KEY_256
  576 
  577 ENDPROC(aes_ctr_enc_256_avx_by8)

/* [<][>][^][v][top][bottom][index][help] */