arch/x86/crypto/sha1_ssse3_asm.S

/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 instruction set extensions introduced in Intel Core Microarchitecture
 * processors. CPUs supporting Intel(R) AVX extensions will get an additional
 * boost.
 *
 * This work was inspired by the vectorized implementation of Dean Gaudet.
 * Additional information on it can be found at:
 *    http://www.arctic.org/~dean/crypto/sha1.html
 *
 * It was improved upon with more efficient vectorization of the message
 * scheduling. This implementation has also been optimized for all current and
 * several future generations of Intel CPUs.
 *
 * See this article for more information about the implementation details:
 *   http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/
 *
 * Copyright (C) 2010, Intel Corp.
 *   Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com>
 *            Ronen Zohar <ronen.zohar@intel.com>
 *
 * Converted to AT&T syntax and adapted for inclusion in the Linux kernel:
 *   Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>

#define CTX     %rdi    // arg1
#define BUF     %rsi    // arg2
#define CNT     %rdx    // arg3

#define REG_A   %ecx
#define REG_B   %esi
#define REG_C   %edi
#define REG_D   %r12d
#define REG_E   %edx

#define REG_T1  %eax
#define REG_T2  %ebx

#define K_BASE          %r8
#define HASH_PTR        %r9
#define BUFFER_PTR      %r10
#define BUFFER_END      %r11

#define W_TMP1  %xmm0
#define W_TMP2  %xmm9

#define W0      %xmm1
#define W4      %xmm2
#define W8      %xmm3
#define W12     %xmm4
#define W16     %xmm5
#define W20     %xmm6
#define W24     %xmm7
#define W28     %xmm8

#define XMM_SHUFB_BSWAP %xmm10

/* we keep a 16-entry (64-byte) window of pre-calculated w[i]+K values in a circular buffer */
#define WK(t)   (((t) & 15) * 4)(%rsp)
#define W_PRECALC_AHEAD 16
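
/*
 * For illustration only (not part of the build): a minimal C sketch of the
 * indexing behind WK(t). The names wk_slot and workspace are hypothetical;
 * the assembly keeps the 16 slots in the 64-byte aligned area addressed
 * through %rsp. Pre-calculation runs W_PRECALC_AHEAD (16) rounds ahead, so a
 * slot is overwritten only after the round that needs it has loaded it.
 *
 *      #include <stdint.h>
 *
 *      // round t reads slot (t mod 16), which W_PRECALC refills for
 *      // round t + 16 right after the load
 *      static inline uint32_t *wk_slot(uint32_t workspace[16], unsigned int t)
 *      {
 *              return &workspace[t & 15];      // matches WK(t) above
 *      }
 */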

/*
 * This macro implements the body of a SHA-1 transform function, processing
 * the input one 64-byte block at a time
 * param: the function's name
 */
.macro SHA1_VECTOR_ASM  name
        ENTRY(\name)

        push    %rbx
        push    %r12
        push    %rbp
        mov     %rsp, %rbp

        sub     $64, %rsp               # allocate workspace
        and     $~15, %rsp              # align stack

        mov     CTX, HASH_PTR
        mov     BUF, BUFFER_PTR

        shl     $6, CNT                 # multiply by 64
        add     BUF, CNT
        mov     CNT, BUFFER_END

        lea     K_XMM_AR(%rip), K_BASE
        xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP

        SHA1_PIPELINED_MAIN_BODY

        # cleanup workspace
        mov     $8, %ecx
        mov     %rsp, %rdi
        xor     %eax, %eax
        rep stosq

        mov     %rbp, %rsp              # deallocate workspace
        pop     %rbp
        pop     %r12
        pop     %rbx
        ret

        ENDPROC(\name)
.endm

/*
 * This macro implements the 80 SHA-1 rounds for one 64-byte block and loops
 * until the end of the input buffer (BUFFER_END) is reached
 */
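/*
 * For orientation only (not compiled): a rough C sketch of the flow the
 * macro below implements. sha1_f() and wk() stand for the round function
 * (F1/F2/F3/F4) and the pre-computed w[i]+K window (WK(t)); rol32() is a
 * 32-bit left rotate.
 *
 *      void sha1_blocks(uint32_t state[5], const unsigned char *data,
 *                       size_t blocks)
 *      {
 *              while (blocks--) {
 *                      uint32_t a = state[0], b = state[1], c = state[2],
 *                               d = state[3], e = state[4];
 *
 *                      for (int t = 0; t < 80; t++) {
 *                              uint32_t tmp = rol32(a, 5) + sha1_f(t, b, c, d)
 *                                             + e + wk(t);
 *                              e = d; d = c; c = rol32(b, 30); b = a; a = tmp;
 *                      }
 *
 *                      state[0] += a; state[1] += b;   // UPDATE_HASH
 *                      state[2] += c; state[3] += d; state[4] += e;
 *                      data += 64;
 *              }
 *      }
 *
 * The assembly avoids the per-round value shuffle by renaming the registers
 * between rounds instead (see RR and SWAP_REG_NAMES).
 */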
.macro SHA1_PIPELINED_MAIN_BODY
        INIT_REGALLOC

        mov       (HASH_PTR), A
        mov      4(HASH_PTR), B
        mov      8(HASH_PTR), C
        mov     12(HASH_PTR), D
        mov     16(HASH_PTR), E

  .set i, 0
  .rept W_PRECALC_AHEAD
        W_PRECALC i
    .set i, (i+1)
  .endr

.align 4
1:
        RR F1,A,B,C,D,E,0
        RR F1,D,E,A,B,C,2
        RR F1,B,C,D,E,A,4
        RR F1,E,A,B,C,D,6
        RR F1,C,D,E,A,B,8

        RR F1,A,B,C,D,E,10
        RR F1,D,E,A,B,C,12
        RR F1,B,C,D,E,A,14
        RR F1,E,A,B,C,D,16
        RR F1,C,D,E,A,B,18

        RR F2,A,B,C,D,E,20
        RR F2,D,E,A,B,C,22
        RR F2,B,C,D,E,A,24
        RR F2,E,A,B,C,D,26
        RR F2,C,D,E,A,B,28

        RR F2,A,B,C,D,E,30
        RR F2,D,E,A,B,C,32
        RR F2,B,C,D,E,A,34
        RR F2,E,A,B,C,D,36
        RR F2,C,D,E,A,B,38

        RR F3,A,B,C,D,E,40
        RR F3,D,E,A,B,C,42
        RR F3,B,C,D,E,A,44
        RR F3,E,A,B,C,D,46
        RR F3,C,D,E,A,B,48

        RR F3,A,B,C,D,E,50
        RR F3,D,E,A,B,C,52
        RR F3,B,C,D,E,A,54
        RR F3,E,A,B,C,D,56
        RR F3,C,D,E,A,B,58

        add     $64, BUFFER_PTR         # move to the next 64-byte block
        cmp     BUFFER_END, BUFFER_PTR  # if the current is the last one use
        cmovae  K_BASE, BUFFER_PTR      # dummy source to avoid buffer overrun

        RR F4,A,B,C,D,E,60
        RR F4,D,E,A,B,C,62
        RR F4,B,C,D,E,A,64
        RR F4,E,A,B,C,D,66
        RR F4,C,D,E,A,B,68

        RR F4,A,B,C,D,E,70
        RR F4,D,E,A,B,C,72
        RR F4,B,C,D,E,A,74
        RR F4,E,A,B,C,D,76
        RR F4,C,D,E,A,B,78

        UPDATE_HASH   (HASH_PTR), A
        UPDATE_HASH  4(HASH_PTR), B
        UPDATE_HASH  8(HASH_PTR), C
        UPDATE_HASH 12(HASH_PTR), D
        UPDATE_HASH 16(HASH_PTR), E

        RESTORE_RENAMED_REGS
        cmp     K_BASE, BUFFER_PTR      # K_BASE means we reached the end
        jne     1b
.endm

.macro INIT_REGALLOC
  .set A, REG_A
  .set B, REG_B
  .set C, REG_C
  .set D, REG_D
  .set E, REG_E
  .set T1, REG_T1
  .set T2, REG_T2
.endm

.macro RESTORE_RENAMED_REGS
        # order is important (REG_C is where it should be)
        mov     B, REG_B
        mov     D, REG_D
        mov     A, REG_A
        mov     E, REG_E
.endm

.macro SWAP_REG_NAMES  a, b
  .set _T, \a
  .set \a, \b
  .set \b, _T
.endm

.macro F1  b, c, d
        mov     \c, T1
        SWAP_REG_NAMES \c, T1
        xor     \d, T1
        and     \b, T1
        xor     \d, T1
.endm

.macro F2  b, c, d
        mov     \d, T1
        SWAP_REG_NAMES \d, T1
        xor     \c, T1
        xor     \b, T1
.endm

.macro F3  b, c, d
        mov     \c, T1
        SWAP_REG_NAMES \c, T1
        mov     \b, T2
        or      \b, T1
        and     \c, T2
        and     \d, T1
        or      T2, T1
.endm

.macro F4  b, c, d
        F2 \b, \c, \d
.endm

.macro UPDATE_HASH  hash, val
        add     \hash, \val
        mov     \val, \hash
.endm

/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 * (the shifts below denote rotates):
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d  += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
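/*
 * For illustration only (not part of the build): the boolean functions that
 * the F1/F2/F3/F4 macros compute, in the same algebraic form the assembly
 * uses, plus a single textbook round for comparison with the two-round RR
 * macro below. Names are hypothetical.
 *
 *      #include <stdint.h>
 *
 *      static inline uint32_t rol32(uint32_t x, int n)
 *      {
 *              return (x << n) | (x >> (32 - n));
 *      }
 *
 *      // F1: Ch(b,c,d)  = d ^ (b & (c ^ d))          rounds  0-19
 *      // F2: Par(b,c,d) = b ^ c ^ d                  rounds 20-39 (F4: 60-79)
 *      // F3: Maj(b,c,d) = (b & c) | ((b | c) & d)    rounds 40-59
 *
 *      // one round; RR unrolls two of these and renames registers instead
 *      // of moving values (a<=d, b<=e, c<=a, d<=b, e<=c after two rounds)
 *      static void sha1_round(uint32_t s[5], uint32_t f, uint32_t wk)
 *      {
 *              uint32_t tmp = rol32(s[0], 5) + f + s[4] + wk;
 *
 *              s[4] = s[3];
 *              s[3] = s[2];
 *              s[2] = rol32(s[1], 30);
 *              s[1] = s[0];
 *              s[0] = tmp;
 *      }
 */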
.macro RR  F, a, b, c, d, e, round
        add     WK(\round), \e
        \F   \b, \c, \d         # t1 = F(b, c, d);
        W_PRECALC (\round + W_PRECALC_AHEAD)
        rol     $30, \b
        add     T1, \e
        add     WK(\round + 1), \d

        \F   \a, \b, \c
        W_PRECALC (\round + W_PRECALC_AHEAD + 1)
        rol     $5, \a
        add     \a, \e
        add     T1, \d
        ror     $7, \a          # (a <<r 5) >>r 7 => a <<r 30

        mov     \e, T1
        SWAP_REG_NAMES \e, T1

        rol     $5, T1
        add     T1, \d

        # write:  \a, \b
        # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
.endm

.macro W_PRECALC  r
  .set i, \r

  .if (i < 20)
    .set K_XMM, 0
  .elseif (i < 40)
    .set K_XMM, 16
  .elseif (i < 60)
    .set K_XMM, 32
  .elseif (i < 80)
    .set K_XMM, 48
  .endif

  .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD))))
    .set i, ((\r) % 80)     # pre-compute for the next iteration
    .if (i == 0)
        W_PRECALC_RESET
    .endif
        W_PRECALC_00_15
  .elseif (i < 32)
        W_PRECALC_16_31
  .elseif (i < 80)   // rounds 32-79
        W_PRECALC_32_79
  .endif
.endm

.macro W_PRECALC_RESET
  .set W,          W0
  .set W_minus_04, W4
  .set W_minus_08, W8
  .set W_minus_12, W12
  .set W_minus_16, W16
  .set W_minus_20, W20
  .set W_minus_24, W24
  .set W_minus_28, W28
  .set W_minus_32, W
.endm

.macro W_PRECALC_ROTATE
  .set W_minus_32, W_minus_28
  .set W_minus_28, W_minus_24
  .set W_minus_24, W_minus_20
  .set W_minus_20, W_minus_16
  .set W_minus_16, W_minus_12
  .set W_minus_12, W_minus_08
  .set W_minus_08, W_minus_04
  .set W_minus_04, W
  .set W,          W_minus_32
.endm

.macro W_PRECALC_SSSE3

.macro W_PRECALC_00_15
        W_PRECALC_00_15_SSSE3
.endm
.macro W_PRECALC_16_31
        W_PRECALC_16_31_SSSE3
.endm
.macro W_PRECALC_32_79
        W_PRECALC_32_79_SSSE3
.endm

/* message scheduling pre-compute for rounds 0-15 */
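/*
 * For illustration only: in C, the rounds 0-15 pre-compute is a byte swap of
 * the big-endian message words plus the addition of the round constant K1;
 * the pshufb/paddd pair below does the same four words at a time. Variable
 * names are hypothetical.
 *
 *      for (int t = 0; t < 16; t++) {
 *              const unsigned char *p = block + 4 * t;
 *              uint32_t w = ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
 *                           ((uint32_t)p[2] <<  8) |  (uint32_t)p[3];
 *
 *              wk[t & 15] = w + 0x5a827999;    // K1, see K_XMM_AR below
 *      }
 */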
.macro W_PRECALC_00_15_SSSE3
  .if ((i & 3) == 0)
        movdqu  (i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
        pshufb  XMM_SHUFB_BSWAP, W_TMP1
        movdqa  W_TMP1, W
  .elseif ((i & 3) == 2)
        paddd   (K_BASE), W_TMP1
  .elseif ((i & 3) == 3)
        movdqa  W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 16-31
 *
 * - the last 32 w[i] values are kept in 8 XMM registers
 * - the w[i]+K values are pre-calculated and stored to memory, to be loaded
 *   later by a scalar ALU add instruction
 *
 * the vectorization needs some "heavy lifting" for rounds 16-31 because of
 * the w[i] -> w[i-3] dependency, but it pays off for rounds 32-79
 */
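/*
 * For illustration only: the scalar recurrence being vectorized here is
 *
 *      for (int t = 16; t < 32; t++)
 *              w[t] = rol32(w[t-3] ^ w[t-8] ^ w[t-14] ^ w[t-16], 1);
 *
 * Four w[t] are produced per vector iteration, but the top lane's w[t-3] is
 * the very word being computed, so it is treated as zero first and its
 * contribution is folded back in afterwards via a rotate-by-2 of the lane-0
 * partial result (the pslldq $12 / psrld $30 / pslld $2 sequence below).
 */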
.macro W_PRECALC_16_31_SSSE3
  # blended scheduling of vector and scalar instruction streams, one 4-wide
  # vector iteration / 4 scalar rounds
  .if ((i & 3) == 0)
        movdqa  W_minus_12, W
        palignr $8, W_minus_16, W       # w[i-14]
        movdqa  W_minus_04, W_TMP1
        psrldq  $4, W_TMP1              # w[i-3]
        pxor    W_minus_08, W
  .elseif ((i & 3) == 1)
        pxor    W_minus_16, W_TMP1
        pxor    W_TMP1, W
        movdqa  W, W_TMP2
        movdqa  W, W_TMP1
        pslldq  $12, W_TMP2
  .elseif ((i & 3) == 2)
        psrld   $31, W
        pslld   $1, W_TMP1
        por     W, W_TMP1
        movdqa  W_TMP2, W
        psrld   $30, W_TMP2
        pslld   $2, W
  .elseif ((i & 3) == 3)
        pxor    W, W_TMP1
        pxor    W_TMP2, W_TMP1
        movdqa  W_TMP1, W
        paddd   K_XMM(K_BASE), W_TMP1
        movdqa  W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

/* message scheduling pre-compute for rounds 32-79
 *
 * the SHA-1 specification defines:
 *   w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
 * we use the equivalent form (valid for i >= 32):
 *   w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * which allows more efficient vectorization since the w[i] -> w[i-3]
 * dependency is broken
 */
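/*
 * The identity follows from expanding each term of the rol-1 recurrence once
 * more; the duplicated terms cancel under xor. It can be checked with a small
 * self-contained C program (illustrative, not part of the kernel build):
 *
 *      #include <assert.h>
 *      #include <stdint.h>
 *      #include <stdlib.h>
 *
 *      static uint32_t rol32(uint32_t x, int n)
 *      {
 *              return (x << n) | (x >> (32 - n));
 *      }
 *
 *      int main(void)
 *      {
 *              uint32_t a[80], b[80];
 *
 *              for (int t = 0; t < 16; t++)
 *                      a[t] = b[t] = (uint32_t)rand();
 *              for (int t = 16; t < 80; t++)   // specification form
 *                      a[t] = rol32(a[t-3] ^ a[t-8] ^ a[t-14] ^ a[t-16], 1);
 *              for (int t = 16; t < 32; t++)   // what this file computes
 *                      b[t] = rol32(b[t-3] ^ b[t-8] ^ b[t-14] ^ b[t-16], 1);
 *              for (int t = 32; t < 80; t++)
 *                      b[t] = rol32(b[t-6] ^ b[t-16] ^ b[t-28] ^ b[t-32], 2);
 *              for (int t = 0; t < 80; t++)
 *                      assert(a[t] == b[t]);
 *              return 0;
 *      }
 */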
.macro W_PRECALC_32_79_SSSE3
  .if ((i & 3) == 0)
        movdqa  W_minus_04, W_TMP1
        pxor    W_minus_28, W           # W is W_minus_32 before xor
        palignr $8, W_minus_08, W_TMP1
  .elseif ((i & 3) == 1)
        pxor    W_minus_16, W
        pxor    W_TMP1, W
        movdqa  W, W_TMP1
  .elseif ((i & 3) == 2)
        psrld   $30, W
        pslld   $2, W_TMP1
        por     W, W_TMP1
  .elseif ((i & 3) == 3)
        movdqa  W_TMP1, W
        paddd   K_XMM(K_BASE), W_TMP1
        movdqa  W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.endm           // W_PRECALC_SSSE3


#define K1      0x5a827999
#define K2      0x6ed9eba1
#define K3      0x8f1bbcdc
#define K4      0xca62c1d6

.section .rodata
.align 16

K_XMM_AR:
        .long K1, K1, K1, K1
        .long K2, K2, K2, K2
        .long K3, K3, K3, K3
        .long K4, K4, K4, K4

BSWAP_SHUFB_CTL:
        .long 0x00010203
        .long 0x04050607
        .long 0x08090a0b
        .long 0x0c0d0e0f


.section .text

W_PRECALC_SSSE3
.macro xmm_mov a, b
        movdqu  \a,\b
.endm

/* SSSE3 optimized implementation:
 *  extern "C" void sha1_transform_ssse3(u32 *digest, const char *data,
 *                                       unsigned int rounds);
 */
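/*
 * A hedged sketch of how a caller might invoke this routine; the real
 * wrapper lives in arch/x86/crypto/sha1_ssse3_glue.c and the helper name
 * below is hypothetical (header locations vary by kernel version). The third
 * argument is the number of 64-byte blocks, and the XMM registers must be
 * claimed with kernel_fpu_begin()/kernel_fpu_end() around the call.
 *
 *      #include <asm/fpu/api.h>
 *      #include <linux/linkage.h>
 *      #include <linux/types.h>
 *
 *      asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data,
 *                                           unsigned int rounds);
 *
 *      static void sha1_ssse3_do_blocks(u32 digest[5], const u8 *data,
 *                                       unsigned int blocks)
 *      {
 *              kernel_fpu_begin();
 *              sha1_transform_ssse3(digest, (const char *)data, blocks);
 *              kernel_fpu_end();
 *      }
 */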
SHA1_VECTOR_ASM     sha1_transform_ssse3

#ifdef CONFIG_AS_AVX

.macro W_PRECALC_AVX

.purgem W_PRECALC_00_15
.macro  W_PRECALC_00_15
    W_PRECALC_00_15_AVX
.endm
.purgem W_PRECALC_16_31
.macro  W_PRECALC_16_31
    W_PRECALC_16_31_AVX
.endm
.purgem W_PRECALC_32_79
.macro  W_PRECALC_32_79
    W_PRECALC_32_79_AVX
.endm

.macro W_PRECALC_00_15_AVX
  .if ((i & 3) == 0)
        vmovdqu (i*4)(BUFFER_PTR), W_TMP1
  .elseif ((i & 3) == 1)
        vpshufb XMM_SHUFB_BSWAP, W_TMP1, W
  .elseif ((i & 3) == 2)
        vpaddd  (K_BASE), W, W_TMP1
  .elseif ((i & 3) == 3)
        vmovdqa W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_16_31_AVX
  .if ((i & 3) == 0)
        vpalignr $8, W_minus_16, W_minus_12, W  # w[i-14]
        vpsrldq $4, W_minus_04, W_TMP1          # w[i-3]
        vpxor   W_minus_08, W, W
        vpxor   W_minus_16, W_TMP1, W_TMP1
  .elseif ((i & 3) == 1)
        vpxor   W_TMP1, W, W
        vpslldq $12, W, W_TMP2
        vpslld  $1, W, W_TMP1
  .elseif ((i & 3) == 2)
        vpsrld  $31, W, W
        vpor    W, W_TMP1, W_TMP1
        vpslld  $2, W_TMP2, W
        vpsrld  $30, W_TMP2, W_TMP2
  .elseif ((i & 3) == 3)
        vpxor   W, W_TMP1, W_TMP1
        vpxor   W_TMP2, W_TMP1, W
        vpaddd  K_XMM(K_BASE), W, W_TMP1
        vmovdqu W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.macro W_PRECALC_32_79_AVX
  .if ((i & 3) == 0)
        vpalignr $8, W_minus_08, W_minus_04, W_TMP1
        vpxor   W_minus_28, W, W                # W is W_minus_32 before xor
  .elseif ((i & 3) == 1)
        vpxor   W_minus_16, W_TMP1, W_TMP1
        vpxor   W_TMP1, W, W
  .elseif ((i & 3) == 2)
        vpslld  $2, W, W_TMP1
        vpsrld  $30, W, W
        vpor    W, W_TMP1, W
  .elseif ((i & 3) == 3)
        vpaddd  K_XMM(K_BASE), W, W_TMP1
        vmovdqu W_TMP1, WK(i&~3)
        W_PRECALC_ROTATE
  .endif
.endm

.endm    // W_PRECALC_AVX

W_PRECALC_AVX
.purgem xmm_mov
.macro xmm_mov a, b
        vmovdqu \a,\b
.endm


/* AVX optimized implementation:
 *  extern "C" void sha1_transform_avx(u32 *digest, const char *data,
 *                                     unsigned int rounds);
 */
SHA1_VECTOR_ASM     sha1_transform_avx

#endif
