/*
 * Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is AES128/192/256 CTR mode optimization implementation. It requires
 * the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 * http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license. When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
47 * Neither the name of Intel Corporation nor the names of its 48 * contributors may be used to endorse or promote products derived 49 * from this software without specific prior written permission. 50 * 51 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 52 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 53 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 54 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 55 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 56 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 57 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 58 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 59 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 60 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 61 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
62 * 63 */ 64 65#include <linux/linkage.h> 66#include <asm/inst.h> 67 68#define CONCAT(a,b) a##b 69#define VMOVDQ vmovdqu 70 71#define xdata0 %xmm0 72#define xdata1 %xmm1 73#define xdata2 %xmm2 74#define xdata3 %xmm3 75#define xdata4 %xmm4 76#define xdata5 %xmm5 77#define xdata6 %xmm6 78#define xdata7 %xmm7 79#define xcounter %xmm8 80#define xbyteswap %xmm9 81#define xkey0 %xmm10 82#define xkey4 %xmm11 83#define xkey8 %xmm12 84#define xkey12 %xmm13 85#define xkeyA %xmm14 86#define xkeyB %xmm15 87 88#define p_in %rdi 89#define p_iv %rsi 90#define p_keys %rdx 91#define p_out %rcx 92#define num_bytes %r8 93 94#define tmp %r10 95#define DDQ(i) CONCAT(ddq_add_,i) 96#define XMM(i) CONCAT(%xmm, i) 97#define DDQ_DATA 0 98#define XDATA 1 99#define KEY_128 1 100#define KEY_192 2 101#define KEY_256 3 102 103.section .rodata 104.align 16 105 106byteswap_const: 107 .octa 0x000102030405060708090A0B0C0D0E0F 108ddq_low_msk: 109 .octa 0x0000000000000000FFFFFFFFFFFFFFFF 110ddq_high_add_1: 111 .octa 0x00000000000000010000000000000000 112ddq_add_1: 113 .octa 0x00000000000000000000000000000001 114ddq_add_2: 115 .octa 0x00000000000000000000000000000002 116ddq_add_3: 117 .octa 0x00000000000000000000000000000003 118ddq_add_4: 119 .octa 0x00000000000000000000000000000004 120ddq_add_5: 121 .octa 0x00000000000000000000000000000005 122ddq_add_6: 123 .octa 0x00000000000000000000000000000006 124ddq_add_7: 125 .octa 0x00000000000000000000000000000007 126ddq_add_8: 127 .octa 0x00000000000000000000000000000008 128 129.text 130 131/* generate a unique variable for ddq_add_x */ 132 133.macro setddq n 134 var_ddq_add = DDQ(\n) 135.endm 136 137/* generate a unique variable for xmm register */ 138.macro setxdata n 139 var_xdata = XMM(\n) 140.endm 141 142/* club the numeric 'id' to the symbol 'name' */ 143 144.macro club name, id 145.altmacro 146 .if \name == DDQ_DATA 147 setddq %\id 148 .elseif \name == XDATA 149 setxdata %\id 150 .endif 151.noaltmacro 152.endm 153 154/* 155 * do_aes num_in_par 
load_keys key_len 156 * This increments p_in, but not p_out 157 */ 158.macro do_aes b, k, key_len 159 .set by, \b 160 .set load_keys, \k 161 .set klen, \key_len 162 163 .if (load_keys) 164 vmovdqa 0*16(p_keys), xkey0 165 .endif 166 167 vpshufb xbyteswap, xcounter, xdata0 168 169 .set i, 1 170 .rept (by - 1) 171 club DDQ_DATA, i 172 club XDATA, i 173 vpaddq var_ddq_add(%rip), xcounter, var_xdata 174 vptest ddq_low_msk(%rip), var_xdata 175 jnz 1f 176 vpaddq ddq_high_add_1(%rip), var_xdata, var_xdata 177 vpaddq ddq_high_add_1(%rip), xcounter, xcounter 178 1: 179 vpshufb xbyteswap, var_xdata, var_xdata 180 .set i, (i +1) 181 .endr 182 183 vmovdqa 1*16(p_keys), xkeyA 184 185 vpxor xkey0, xdata0, xdata0 186 club DDQ_DATA, by 187 vpaddq var_ddq_add(%rip), xcounter, xcounter 188 vptest ddq_low_msk(%rip), xcounter 189 jnz 1f 190 vpaddq ddq_high_add_1(%rip), xcounter, xcounter 191 1: 192 193 .set i, 1 194 .rept (by - 1) 195 club XDATA, i 196 vpxor xkey0, var_xdata, var_xdata 197 .set i, (i +1) 198 .endr 199 200 vmovdqa 2*16(p_keys), xkeyB 201 202 .set i, 0 203 .rept by 204 club XDATA, i 205 vaesenc xkeyA, var_xdata, var_xdata /* key 1 */ 206 .set i, (i +1) 207 .endr 208 209 .if (klen == KEY_128) 210 .if (load_keys) 211 vmovdqa 3*16(p_keys), xkey4 212 .endif 213 .else 214 vmovdqa 3*16(p_keys), xkeyA 215 .endif 216 217 .set i, 0 218 .rept by 219 club XDATA, i 220 vaesenc xkeyB, var_xdata, var_xdata /* key 2 */ 221 .set i, (i +1) 222 .endr 223 224 add $(16*by), p_in 225 226 .if (klen == KEY_128) 227 vmovdqa 4*16(p_keys), xkeyB 228 .else 229 .if (load_keys) 230 vmovdqa 4*16(p_keys), xkey4 231 .endif 232 .endif 233 234 .set i, 0 235 .rept by 236 club XDATA, i 237 /* key 3 */ 238 .if (klen == KEY_128) 239 vaesenc xkey4, var_xdata, var_xdata 240 .else 241 vaesenc xkeyA, var_xdata, var_xdata 242 .endif 243 .set i, (i +1) 244 .endr 245 246 vmovdqa 5*16(p_keys), xkeyA 247 248 .set i, 0 249 .rept by 250 club XDATA, i 251 /* key 4 */ 252 .if (klen == KEY_128) 253 vaesenc xkeyB, 
var_xdata, var_xdata 254 .else 255 vaesenc xkey4, var_xdata, var_xdata 256 .endif 257 .set i, (i +1) 258 .endr 259 260 .if (klen == KEY_128) 261 .if (load_keys) 262 vmovdqa 6*16(p_keys), xkey8 263 .endif 264 .else 265 vmovdqa 6*16(p_keys), xkeyB 266 .endif 267 268 .set i, 0 269 .rept by 270 club XDATA, i 271 vaesenc xkeyA, var_xdata, var_xdata /* key 5 */ 272 .set i, (i +1) 273 .endr 274 275 vmovdqa 7*16(p_keys), xkeyA 276 277 .set i, 0 278 .rept by 279 club XDATA, i 280 /* key 6 */ 281 .if (klen == KEY_128) 282 vaesenc xkey8, var_xdata, var_xdata 283 .else 284 vaesenc xkeyB, var_xdata, var_xdata 285 .endif 286 .set i, (i +1) 287 .endr 288 289 .if (klen == KEY_128) 290 vmovdqa 8*16(p_keys), xkeyB 291 .else 292 .if (load_keys) 293 vmovdqa 8*16(p_keys), xkey8 294 .endif 295 .endif 296 297 .set i, 0 298 .rept by 299 club XDATA, i 300 vaesenc xkeyA, var_xdata, var_xdata /* key 7 */ 301 .set i, (i +1) 302 .endr 303 304 .if (klen == KEY_128) 305 .if (load_keys) 306 vmovdqa 9*16(p_keys), xkey12 307 .endif 308 .else 309 vmovdqa 9*16(p_keys), xkeyA 310 .endif 311 312 .set i, 0 313 .rept by 314 club XDATA, i 315 /* key 8 */ 316 .if (klen == KEY_128) 317 vaesenc xkeyB, var_xdata, var_xdata 318 .else 319 vaesenc xkey8, var_xdata, var_xdata 320 .endif 321 .set i, (i +1) 322 .endr 323 324 vmovdqa 10*16(p_keys), xkeyB 325 326 .set i, 0 327 .rept by 328 club XDATA, i 329 /* key 9 */ 330 .if (klen == KEY_128) 331 vaesenc xkey12, var_xdata, var_xdata 332 .else 333 vaesenc xkeyA, var_xdata, var_xdata 334 .endif 335 .set i, (i +1) 336 .endr 337 338 .if (klen != KEY_128) 339 vmovdqa 11*16(p_keys), xkeyA 340 .endif 341 342 .set i, 0 343 .rept by 344 club XDATA, i 345 /* key 10 */ 346 .if (klen == KEY_128) 347 vaesenclast xkeyB, var_xdata, var_xdata 348 .else 349 vaesenc xkeyB, var_xdata, var_xdata 350 .endif 351 .set i, (i +1) 352 .endr 353 354 .if (klen != KEY_128) 355 .if (load_keys) 356 vmovdqa 12*16(p_keys), xkey12 357 .endif 358 359 .set i, 0 360 .rept by 361 club XDATA, i 362 
vaesenc xkeyA, var_xdata, var_xdata /* key 11 */ 363 .set i, (i +1) 364 .endr 365 366 .if (klen == KEY_256) 367 vmovdqa 13*16(p_keys), xkeyA 368 .endif 369 370 .set i, 0 371 .rept by 372 club XDATA, i 373 .if (klen == KEY_256) 374 /* key 12 */ 375 vaesenc xkey12, var_xdata, var_xdata 376 .else 377 vaesenclast xkey12, var_xdata, var_xdata 378 .endif 379 .set i, (i +1) 380 .endr 381 382 .if (klen == KEY_256) 383 vmovdqa 14*16(p_keys), xkeyB 384 385 .set i, 0 386 .rept by 387 club XDATA, i 388 /* key 13 */ 389 vaesenc xkeyA, var_xdata, var_xdata 390 .set i, (i +1) 391 .endr 392 393 .set i, 0 394 .rept by 395 club XDATA, i 396 /* key 14 */ 397 vaesenclast xkeyB, var_xdata, var_xdata 398 .set i, (i +1) 399 .endr 400 .endif 401 .endif 402 403 .set i, 0 404 .rept (by / 2) 405 .set j, (i+1) 406 VMOVDQ (i*16 - 16*by)(p_in), xkeyA 407 VMOVDQ (j*16 - 16*by)(p_in), xkeyB 408 club XDATA, i 409 vpxor xkeyA, var_xdata, var_xdata 410 club XDATA, j 411 vpxor xkeyB, var_xdata, var_xdata 412 .set i, (i+2) 413 .endr 414 415 .if (i < by) 416 VMOVDQ (i*16 - 16*by)(p_in), xkeyA 417 club XDATA, i 418 vpxor xkeyA, var_xdata, var_xdata 419 .endif 420 421 .set i, 0 422 .rept by 423 club XDATA, i 424 VMOVDQ var_xdata, i*16(p_out) 425 .set i, (i+1) 426 .endr 427.endm 428 429.macro do_aes_load val, key_len 430 do_aes \val, 1, \key_len 431.endm 432 433.macro do_aes_noload val, key_len 434 do_aes \val, 0, \key_len 435.endm 436 437/* main body of aes ctr load */ 438 439.macro do_aes_ctrmain key_len 440 cmp $16, num_bytes 441 jb .Ldo_return2\key_len 442 443 vmovdqa byteswap_const(%rip), xbyteswap 444 vmovdqu (p_iv), xcounter 445 vpshufb xbyteswap, xcounter, xcounter 446 447 mov num_bytes, tmp 448 and $(7*16), tmp 449 jz .Lmult_of_8_blks\key_len 450 451 /* 1 <= tmp <= 7 */ 452 cmp $(4*16), tmp 453 jg .Lgt4\key_len 454 je .Leq4\key_len 455 456.Llt4\key_len: 457 cmp $(2*16), tmp 458 jg .Leq3\key_len 459 je .Leq2\key_len 460 461.Leq1\key_len: 462 do_aes_load 1, \key_len 463 add $(1*16), p_out 464 and 
$(~7*16), num_bytes 465 jz .Ldo_return2\key_len 466 jmp .Lmain_loop2\key_len 467 468.Leq2\key_len: 469 do_aes_load 2, \key_len 470 add $(2*16), p_out 471 and $(~7*16), num_bytes 472 jz .Ldo_return2\key_len 473 jmp .Lmain_loop2\key_len 474 475 476.Leq3\key_len: 477 do_aes_load 3, \key_len 478 add $(3*16), p_out 479 and $(~7*16), num_bytes 480 jz .Ldo_return2\key_len 481 jmp .Lmain_loop2\key_len 482 483.Leq4\key_len: 484 do_aes_load 4, \key_len 485 add $(4*16), p_out 486 and $(~7*16), num_bytes 487 jz .Ldo_return2\key_len 488 jmp .Lmain_loop2\key_len 489 490.Lgt4\key_len: 491 cmp $(6*16), tmp 492 jg .Leq7\key_len 493 je .Leq6\key_len 494 495.Leq5\key_len: 496 do_aes_load 5, \key_len 497 add $(5*16), p_out 498 and $(~7*16), num_bytes 499 jz .Ldo_return2\key_len 500 jmp .Lmain_loop2\key_len 501 502.Leq6\key_len: 503 do_aes_load 6, \key_len 504 add $(6*16), p_out 505 and $(~7*16), num_bytes 506 jz .Ldo_return2\key_len 507 jmp .Lmain_loop2\key_len 508 509.Leq7\key_len: 510 do_aes_load 7, \key_len 511 add $(7*16), p_out 512 and $(~7*16), num_bytes 513 jz .Ldo_return2\key_len 514 jmp .Lmain_loop2\key_len 515 516.Lmult_of_8_blks\key_len: 517 .if (\key_len != KEY_128) 518 vmovdqa 0*16(p_keys), xkey0 519 vmovdqa 4*16(p_keys), xkey4 520 vmovdqa 8*16(p_keys), xkey8 521 vmovdqa 12*16(p_keys), xkey12 522 .else 523 vmovdqa 0*16(p_keys), xkey0 524 vmovdqa 3*16(p_keys), xkey4 525 vmovdqa 6*16(p_keys), xkey8 526 vmovdqa 9*16(p_keys), xkey12 527 .endif 528.align 16 529.Lmain_loop2\key_len: 530 /* num_bytes is a multiple of 8 and >0 */ 531 do_aes_noload 8, \key_len 532 add $(8*16), p_out 533 sub $(8*16), num_bytes 534 jne .Lmain_loop2\key_len 535 536.Ldo_return2\key_len: 537 /* return updated IV */ 538 vpshufb xbyteswap, xcounter, xcounter 539 vmovdqu xcounter, (p_iv) 540 ret 541.endm 542 543/* 544 * routine to do AES128 CTR enc/decrypt "by8" 545 * XMM registers are clobbered. 
546 * Saving/restoring must be done at a higher level 547 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out, 548 * unsigned int num_bytes) 549 */ 550ENTRY(aes_ctr_enc_128_avx_by8) 551 /* call the aes main loop */ 552 do_aes_ctrmain KEY_128 553 554ENDPROC(aes_ctr_enc_128_avx_by8) 555 556/* 557 * routine to do AES192 CTR enc/decrypt "by8" 558 * XMM registers are clobbered. 559 * Saving/restoring must be done at a higher level 560 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out, 561 * unsigned int num_bytes) 562 */ 563ENTRY(aes_ctr_enc_192_avx_by8) 564 /* call the aes main loop */ 565 do_aes_ctrmain KEY_192 566 567ENDPROC(aes_ctr_enc_192_avx_by8) 568 569/* 570 * routine to do AES256 CTR enc/decrypt "by8" 571 * XMM registers are clobbered. 572 * Saving/restoring must be done at a higher level 573 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out, 574 * unsigned int num_bytes) 575 */ 576ENTRY(aes_ctr_enc_256_avx_by8) 577 /* call the aes main loop */ 578 do_aes_ctrmain KEY_256 579 580ENDPROC(aes_ctr_enc_256_avx_by8) 581