/*
 * CAST5 cipher, 16-way parallel algorithm (AVX/x86_64)
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.file "cast5-avx-x86_64-asm_64.S"

.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/* key context layout: 16 masking keys (km), 16 rotation keys (kr), rr flag */
#define km      0
#define kr      (16*4)
#define rr      ((16*4)+16)

/* s-boxes (shared CAST s-box tables) */
#define s1      cast_s1
#define s2      cast_s2
#define s3      cast_s3
#define s4      cast_s4

/**********************************************************************
  16-way AVX cast5
 **********************************************************************/
#define CTX %r15

#define RL1 %xmm0
#define RR1 %xmm1
#define RL2 %xmm2
#define RR2 %xmm3
#define RL3 %xmm4
#define RR3 %xmm5
#define RL4 %xmm6
#define RR4 %xmm7

#define RX %xmm8

#define RKM  %xmm9
#define RKR  %xmm10
#define RKRF %xmm11
#define RKRR %xmm12

#define R32  %xmm13
#define R1ST %xmm14

#define RTMP %xmm15

#define RID1  %rdi
#define RID1d %edi
#define RID2  %rsi
#define RID2d %esi

#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx
#define RGI4bl %bl
#define RGI4bh %bh

#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d

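/*
 * lookup_32bit: four CAST s-box lookups for one 32-bit word held in the
 * low bits of 'src', combined into 'dst' with op1/op2/op3.  interleave_op
 * lets the caller overlap the next 16-bit shift of 'src' with the lookups.
 */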
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
        movzbl          src ## bh,     RID1d;    \
        movzbl          src ## bl,     RID2d;    \
        shrq $16,       src;                     \
        movl            s1(, RID1, 4), dst ## d; \
        op1             s2(, RID2, 4), dst ## d; \
        movzbl          src ## bh,     RID1d;    \
        movzbl          src ## bl,     RID2d;    \
        interleave_op(il_reg);                   \
        op2             s3(, RID1, 4), dst ## d; \
        op3             s4(, RID2, 4), dst ## d;

#define dummy(d) /* do nothing */

#define shr_next(reg) \
        shrq $16,       reg;

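/*
 * F_head: combine the input half 'a' with the round's masking key RKM
 * using op0 (add/xor/sub), rotate each 32-bit lane left by the current
 * rotation key (RKRF is the rotate count, RKRR = 32 - RKRF), and move
 * the two 64-bit halves of the result into gi1/gi2 for the s-box lookups.
 */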
#define F_head(a, x, gi1, gi2, op0) \
        op0     a,      RKM,  x;                 \
        vpslld  RKRF,   x,    RTMP;              \
        vpsrld  RKRR,   x,    x;                 \
        vpor    RTMP,   x,    x;                 \
        \
        vmovq           x,    gi1;               \
        vpextrq $1,     x,    gi2;

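/*
 * F_tail: run the s-box lookups for both 32-bit words in each of gi1/gi2
 * and repack the four 32-bit results into the xmm register x.
 */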
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
        lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
        lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
        \
        lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none);     \
        shlq $32,       RFS2;                                      \
        orq             RFS1, RFS2;                                \
        lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none);     \
        shlq $32,       RFS1;                                      \
        orq             RFS1, RFS3;                                \
        \
        vmovq           RFS2, x;                                   \
        vpinsrq $1,     RFS3, x, x;

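/*
 * F_2: one CAST5 f-function evaluation on two register pairs at once:
 * a1 ^= f(b1) and a2 ^= f(b2), with op0/op1/op2/op3 selecting the F1,
 * F2 or F3 variant.
 */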
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
        F_head(b1, RX, RGI1, RGI2, op0);              \
        F_head(b2, RX, RGI3, RGI4, op0);              \
        \
        F_tail(b1, RX, RGI1, RGI2, op1, op2, op3);    \
        F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3);  \
        \
        vpxor           a1, RX,   a1;                 \
        vpxor           a2, RTMP, a2;

#define F1_2(a1, b1, a2, b2) \
        F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
        F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
        F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

#define subround(a1, b1, a2, b2, f) \
        F ## f ## _2(a1, b1, a2, b2);

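/*
 * round: broadcast the n'th masking key into RKM, extract the current
 * 5-bit rotation key from the low byte of RKR (RKRF = rotate-left count,
 * RKRR = 32 - RKRF), shift RKR down one byte to expose the next rotation
 * key, and apply f-function 'f' to all four register pairs (16 blocks).
 */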
#define round(l, r, n, f) \
        vbroadcastss    (km+(4*n))(CTX), RKM;        \
        vpand           R1ST,            RKR,  RKRF; \
        vpsubq          RKRF,            R32,  RKRR; \
        vpsrldq $1,     RKR,             RKR;        \
        subround(l ## 1, r ## 1, l ## 2, r ## 2, f); \
        subround(l ## 3, r ## 3, l ## 4, r ## 4, f);

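/*
 * enc/dec_preload_rkr: cache all 16 rotation keys in RKR, one byte per
 * round, consumed from the low byte by round().  The xor with 16 adds 16
 * (mod 32) to every rotation so that the rotated value lands in the byte
 * order expected by lookup_32bit.  The decryption variant additionally
 * byte-reverses RKR so the keys come out in reverse round order.
 */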
#define enc_preload_rkr() \
        vbroadcastss    .L16_mask,                RKR;      \
        /* add 16-bit rotation to key rotations (mod 32) */ \
        vpxor           kr(CTX),                  RKR, RKR;

#define dec_preload_rkr() \
        vbroadcastss    .L16_mask,                RKR;      \
        /* add 16-bit rotation to key rotations (mod 32) */ \
        vpxor           kr(CTX),                  RKR, RKR; \
        vpshufb         .Lbswap128_mask,          RKR, RKR;

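/*
 * transpose_2x4: 2x4 32-bit transpose.  On input x0/x1 hold four 64-bit
 * blocks (two per register); on output x0 holds the first 32-bit word of
 * each block and x1 the second.
 */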
#define transpose_2x4(x0, x1, t0, t1) \
        vpunpckldq              x1, x0, t0; \
        vpunpckhdq              x1, x0, t1; \
        \
        vpunpcklqdq             t1, t0, x0; \
        vpunpckhqdq             t1, t0, x1;

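/*
 * inpack_blocks: byte-swap each 32-bit word to host order and split the
 * four blocks in x0/x1 into left-half (x0) and right-half (x1) vectors.
 */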
#define inpack_blocks(x0, x1, t0, t1, rmask) \
        vpshufb rmask,  x0,     x0; \
        vpshufb rmask,  x1,     x1; \
        \
        transpose_2x4(x0, x1, t0, t1)

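/*
 * outunpack_blocks: inverse of inpack_blocks; re-interleave the halves
 * into whole blocks and byte-swap them back to big-endian order.
 */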
#define outunpack_blocks(x0, x1, t0, t1, rmask) \
        transpose_2x4(x0, x1, t0, t1) \
        \
        vpshufb rmask,  x0, x0;           \
        vpshufb rmask,  x1, x1;

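/* byte-shuffle masks and small scalar constants used above */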
.section        .rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
.Lbswap_mask:
        .byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.section        .rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.section        .rodata.cst16.bswap_iv_mask, "aM", @progbits, 16
.align 16
.Lbswap_iv_mask:
        .byte 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2, 1, 0

.section        .rodata.cst4.16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
        .byte 16, 16, 16, 16
.section        .rodata.cst4.32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
        .byte 32, 0, 0, 0
.section        .rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
        .byte 0x1f, 0, 0, 0

.text

.align 16
__cast5_enc_blk16:
        /* input:
         *      %rdi: ctx
         *      RL1: blocks 1 and 2
         *      RR1: blocks 3 and 4
         *      RL2: blocks 5 and 6
         *      RR2: blocks 7 and 8
         *      RL3: blocks 9 and 10
         *      RR3: blocks 11 and 12
         *      RL4: blocks 13 and 14
         *      RR4: blocks 15 and 16
         * output (note the swapped halves):
         *      RR1: encrypted blocks 1 and 2
         *      RL1: encrypted blocks 3 and 4
         *      RR2: encrypted blocks 5 and 6
         *      RL2: encrypted blocks 7 and 8
         *      RR3: encrypted blocks 9 and 10
         *      RL3: encrypted blocks 11 and 12
         *      RR4: encrypted blocks 13 and 14
         *      RL4: encrypted blocks 15 and 16
         */
        pushq %r15;
        pushq %rbx;

        movq %rdi, CTX;

        vmovdqa .Lbswap_mask, RKM;
        vmovd .Lfirst_mask, R1ST;
        vmovd .L32_mask, R32;
        enc_preload_rkr();

        inpack_blocks(RL1, RR1, RTMP, RX, RKM);
        inpack_blocks(RL2, RR2, RTMP, RX, RKM);
        inpack_blocks(RL3, RR3, RTMP, RX, RKM);
        inpack_blocks(RL4, RR4, RTMP, RX, RKM);

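        /* rounds 0-15; the f-function type cycles F1, F2, F3 */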
        round(RL, RR, 0, 1);
        round(RR, RL, 1, 2);
        round(RL, RR, 2, 3);
        round(RR, RL, 3, 1);
        round(RL, RR, 4, 2);
        round(RR, RL, 5, 3);
        round(RL, RR, 6, 1);
        round(RR, RL, 7, 2);
        round(RL, RR, 8, 3);
        round(RR, RL, 9, 1);
        round(RL, RR, 10, 2);
        round(RR, RL, 11, 3);

        /* rr is set for reduced-round (<= 80-bit) keys: skip rounds 12-15 */
        movzbl rr(CTX), %eax;
        testl %eax, %eax;
        jnz .L__skip_enc;

        round(RL, RR, 12, 1);
        round(RR, RL, 13, 2);
        round(RL, RR, 14, 3);
        round(RR, RL, 15, 1);

.L__skip_enc:
        popq %rbx;
        popq %r15;

        vmovdqa .Lbswap_mask, RKM;

        outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
        outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
        outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
        outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

        ret;
ENDPROC(__cast5_enc_blk16)

.align 16
__cast5_dec_blk16:
        /* input:
         *      %rdi: ctx
         *      RL1: encrypted blocks 1 and 2
         *      RR1: encrypted blocks 3 and 4
         *      RL2: encrypted blocks 5 and 6
         *      RR2: encrypted blocks 7 and 8
         *      RL3: encrypted blocks 9 and 10
         *      RR3: encrypted blocks 11 and 12
         *      RL4: encrypted blocks 13 and 14
         *      RR4: encrypted blocks 15 and 16
         * output (note the swapped halves):
         *      RR1: decrypted blocks 1 and 2
         *      RL1: decrypted blocks 3 and 4
         *      RR2: decrypted blocks 5 and 6
         *      RL2: decrypted blocks 7 and 8
         *      RR3: decrypted blocks 9 and 10
         *      RL3: decrypted blocks 11 and 12
         *      RR4: decrypted blocks 13 and 14
         *      RL4: decrypted blocks 15 and 16
         */
        pushq %r15;
        pushq %rbx;

        movq %rdi, CTX;

        vmovdqa .Lbswap_mask, RKM;
        vmovd .Lfirst_mask, R1ST;
        vmovd .L32_mask, R32;
        dec_preload_rkr();

        inpack_blocks(RL1, RR1, RTMP, RX, RKM);
        inpack_blocks(RL2, RR2, RTMP, RX, RKM);
        inpack_blocks(RL3, RR3, RTMP, RX, RKM);
        inpack_blocks(RL4, RR4, RTMP, RX, RKM);

        /* rr is set for reduced-round (<= 80-bit) keys: skip rounds 15-12 */
        movzbl rr(CTX), %eax;
        testl %eax, %eax;
        jnz .L__skip_dec;

        round(RL, RR, 15, 1);
        round(RR, RL, 14, 3);
        round(RL, RR, 13, 2);
        round(RR, RL, 12, 1);

.L__dec_tail:
        round(RL, RR, 11, 3);
        round(RR, RL, 10, 2);
        round(RL, RR, 9, 1);
        round(RR, RL, 8, 3);
        round(RL, RR, 7, 2);
        round(RR, RL, 6, 1);
        round(RL, RR, 5, 3);
        round(RR, RL, 4, 2);
        round(RL, RR, 3, 1);
        round(RR, RL, 2, 3);
        round(RL, RR, 1, 2);
        round(RR, RL, 0, 1);

        vmovdqa .Lbswap_mask, RKM;
        popq %rbx;
        popq %r15;

        outunpack_blocks(RR1, RL1, RTMP, RX, RKM);
        outunpack_blocks(RR2, RL2, RTMP, RX, RKM);
        outunpack_blocks(RR3, RL3, RTMP, RX, RKM);
        outunpack_blocks(RR4, RL4, RTMP, RX, RKM);

        ret;

.L__skip_dec:
        /* 12-round keys: drop the rotation keys of the four skipped rounds */
        vpsrldq $4, RKR, RKR;
        jmp .L__dec_tail;
ENDPROC(__cast5_dec_blk16)

ENTRY(cast5_ecb_enc_16way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN
        pushq %r15;

        movq %rdi, CTX;
        movq %rsi, %r11;

        vmovdqu (0*4*4)(%rdx), RL1;
        vmovdqu (1*4*4)(%rdx), RR1;
        vmovdqu (2*4*4)(%rdx), RL2;
        vmovdqu (3*4*4)(%rdx), RR2;
        vmovdqu (4*4*4)(%rdx), RL3;
        vmovdqu (5*4*4)(%rdx), RR3;
        vmovdqu (6*4*4)(%rdx), RL4;
        vmovdqu (7*4*4)(%rdx), RR4;

        call __cast5_enc_blk16;

        vmovdqu RR1, (0*4*4)(%r11);
        vmovdqu RL1, (1*4*4)(%r11);
        vmovdqu RR2, (2*4*4)(%r11);
        vmovdqu RL2, (3*4*4)(%r11);
        vmovdqu RR3, (4*4*4)(%r11);
        vmovdqu RL3, (5*4*4)(%r11);
        vmovdqu RR4, (6*4*4)(%r11);
        vmovdqu RL4, (7*4*4)(%r11);

        popq %r15;
        FRAME_END
        ret;
ENDPROC(cast5_ecb_enc_16way)

ENTRY(cast5_ecb_dec_16way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN
        pushq %r15;

        movq %rdi, CTX;
        movq %rsi, %r11;

        vmovdqu (0*4*4)(%rdx), RL1;
        vmovdqu (1*4*4)(%rdx), RR1;
        vmovdqu (2*4*4)(%rdx), RL2;
        vmovdqu (3*4*4)(%rdx), RR2;
        vmovdqu (4*4*4)(%rdx), RL3;
        vmovdqu (5*4*4)(%rdx), RR3;
        vmovdqu (6*4*4)(%rdx), RL4;
        vmovdqu (7*4*4)(%rdx), RR4;

        call __cast5_dec_blk16;

        vmovdqu RR1, (0*4*4)(%r11);
        vmovdqu RL1, (1*4*4)(%r11);
        vmovdqu RR2, (2*4*4)(%r11);
        vmovdqu RL2, (3*4*4)(%r11);
        vmovdqu RR3, (4*4*4)(%r11);
        vmovdqu RL3, (5*4*4)(%r11);
        vmovdqu RR4, (6*4*4)(%r11);
        vmovdqu RL4, (7*4*4)(%r11);

        popq %r15;
        FRAME_END
        ret;
ENDPROC(cast5_ecb_dec_16way)

ENTRY(cast5_cbc_dec_16way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         */
        FRAME_BEGIN
        pushq %r12;
        pushq %r15;

        movq %rdi, CTX;
        movq %rsi, %r11;
        movq %rdx, %r12;

        vmovdqu (0*16)(%rdx), RL1;
        vmovdqu (1*16)(%rdx), RR1;
        vmovdqu (2*16)(%rdx), RL2;
        vmovdqu (3*16)(%rdx), RR2;
        vmovdqu (4*16)(%rdx), RL3;
        vmovdqu (5*16)(%rdx), RR3;
        vmovdqu (6*16)(%rdx), RL4;
        vmovdqu (7*16)(%rdx), RR4;

        call __cast5_dec_blk16;

        /* xor with the previous ciphertext blocks; block 1's IV xor is left to the caller */
        vmovq (%r12), RX;
        vpshufd $0x4f, RX, RX;
        vpxor RX, RR1, RR1;
        vpxor 0*16+8(%r12), RL1, RL1;
        vpxor 1*16+8(%r12), RR2, RR2;
        vpxor 2*16+8(%r12), RL2, RL2;
        vpxor 3*16+8(%r12), RR3, RR3;
        vpxor 4*16+8(%r12), RL3, RL3;
        vpxor 5*16+8(%r12), RR4, RR4;
        vpxor 6*16+8(%r12), RL4, RL4;

        vmovdqu RR1, (0*16)(%r11);
        vmovdqu RL1, (1*16)(%r11);
        vmovdqu RR2, (2*16)(%r11);
        vmovdqu RL2, (3*16)(%r11);
        vmovdqu RR3, (4*16)(%r11);
        vmovdqu RL3, (5*16)(%r11);
        vmovdqu RR4, (6*16)(%r11);
        vmovdqu RL4, (7*16)(%r11);

        popq %r15;
        popq %r12;
        FRAME_END
        ret;
ENDPROC(cast5_cbc_dec_16way)

ENTRY(cast5_ctr_16way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: iv (big endian, 64-bit counter)
         */
        FRAME_BEGIN
        pushq %r12;
        pushq %r15;

        movq %rdi, CTX;
        movq %rsi, %r11;
        movq %rdx, %r12;

        vpcmpeqd RTMP, RTMP, RTMP;
        vpsrldq $8, RTMP, RTMP; /* low: -1, high: 0 */

        vpcmpeqd RKR, RKR, RKR;
        vpaddq RKR, RKR, RKR; /* low: -2, high: -2 */
        vmovdqa .Lbswap_iv_mask, R1ST;
        vmovdqa .Lbswap128_mask, RKM;

        /* load IV and byteswap */
        vmovq (%rcx), RX;
        vpshufb R1ST, RX, RX; /* le: IV0, IV0 */

        /* construct IVs */
        vpsubq RTMP, RX, RX;  /* le: IV1, IV0 */
        vpshufb RKM, RX, RL1; /* be: IV0, IV1 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RR1; /* be: IV2, IV3 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RL2; /* be: IV4, IV5 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RR2; /* be: IV6, IV7 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RL3; /* be: IV8, IV9 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RR3; /* be: IV10, IV11 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RL4; /* be: IV12, IV13 */
        vpsubq RKR, RX, RX;
        vpshufb RKM, RX, RR4; /* be: IV14, IV15 */

        /* store last IV */
        vpsubq RTMP, RX, RX; /* le: IV16, IV14 */
        vpshufb R1ST, RX, RX; /* be: IV16, IV16 */
        vmovq RX, (%rcx);

        call __cast5_enc_blk16;

        /* dst = src ^ keystream (encrypted counters) */
        vpxor (0*16)(%r12), RR1, RR1;
        vpxor (1*16)(%r12), RL1, RL1;
        vpxor (2*16)(%r12), RR2, RR2;
        vpxor (3*16)(%r12), RL2, RL2;
        vpxor (4*16)(%r12), RR3, RR3;
        vpxor (5*16)(%r12), RL3, RL3;
        vpxor (6*16)(%r12), RR4, RR4;
        vpxor (7*16)(%r12), RL4, RL4;
        vmovdqu RR1, (0*16)(%r11);
        vmovdqu RL1, (1*16)(%r11);
        vmovdqu RR2, (2*16)(%r11);
        vmovdqu RL2, (3*16)(%r11);
        vmovdqu RR3, (4*16)(%r11);
        vmovdqu RL3, (5*16)(%r11);
        vmovdqu RR4, (6*16)(%r11);
        vmovdqu RL4, (7*16)(%r11);

        popq %r15;
        popq %r12;
        FRAME_END
        ret;
ENDPROC(cast5_ctr_16way)