root/arch/x86/crypto/blowfish-x86_64-asm_64.S

/* [<][>][^][v][top][bottom][index][help] */
   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * Blowfish Cipher Algorithm (x86_64)
   4  *
   5  * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
   6  */
   7 
   8 #include <linux/linkage.h>
   9 
  10 .file "blowfish-x86_64-asm.S"
  11 .text
  12 
  13 /* structure of crypto context */
  14 #define p       0
  15 #define s0      ((16 + 2) * 4)
  16 #define s1      ((16 + 2 + (1 * 256)) * 4)
  17 #define s2      ((16 + 2 + (2 * 256)) * 4)
  18 #define s3      ((16 + 2 + (3 * 256)) * 4)
  19 
   20 /*
       * register macros
       *
       * CTX is loaded from the first argument (%rdi) by every entry point.
       * %r12 is callee-saved in the SysV AMD64 ABI, so each entry point keeps
       * the caller's value and restores it before returning.
       *
       * RIO is the pointer used by the read/write/xor block macros.  It is the
       * same register as RT1 (%rsi), so F() and F4() overwrite it; the entry
       * points keep dst in a spare register and reload RIO before storing.
       */
   21 #define CTX %r12
   22 #define RIO %rsi
   23
      /* 64-bit cipher state, one register per block (the 1-way code uses RX0 only) */
   24 #define RX0 %rax
   25 #define RX1 %rbx
   26 #define RX2 %rcx
   27 #define RX3 %rdx
   28
      /* 32-bit views of the state registers */
   29 #define RX0d %eax
   30 #define RX1d %ebx
   31 #define RX2d %ecx
   32 #define RX3d %edx
   33
      /*
       * Bits 7..0 of each state register.  F4() builds these names by pasting
       * "bl" onto its argument, so the RXn + suffix naming must be kept.
       */
   34 #define RX0bl %al
   35 #define RX1bl %bl
   36 #define RX2bl %cl
   37 #define RX3bl %dl
   38
      /*
       * Bits 15..8 of each state register.  Only %rax/%rbx/%rcx/%rdx provide
       * such a sub-register, presumably the reason the state lives in them.
       */
   39 #define RX0bh %ah
   40 #define RX1bh %bh
   41 #define RX2bh %ch
   42 #define RX3bh %dh
   43
      /* temporaries: S-box indices and the F()/F4() accumulator (RT0) */
   44 #define RT0 %rdi
   45 #define RT1 %rsi
   46 #define RT2 %r8
   47 #define RT3 %r9
   48
      /* 32-bit views of the temporaries; writing them zeroes the upper halves */
   49 #define RT0d %edi
   50 #define RT1d %esi
   51 #define RT2d %r8d
   52 #define RT3d %r9d
   53
      /*
       * Round key preloaded by the 4-way code.  The 1-way entry points do not
       * use RKEY and instead use %r10 directly as a spare register for dst.
       */
   54 #define RKEY %r10
  55 
   56 /***********************************************************************
   57  * 1-way blowfish
   58  ***********************************************************************/
      /*
       * F(): one Feistel step on the 64-bit state in RX0.
       *
       * With b3..b0 the bytes of the low 32 bits of RX0 (b3 = bits 31..24):
       *      t = ((s0[b3] + s1[b2]) ^ s2[b1]) + s3[b0]       (32-bit arithmetic)
       * The two 32-bit halves of RX0 are then swapped and t is XORed into the
       * new low half, so the next F() operates on the half just updated.
       *
       * Clobbers RT0, RT1 and RT2.  RT1 is %rsi, i.e. RIO, so RIO is not valid
       * after F() and has to be reloaded before write_block()/xor_block().
       */
   59 #define F() \
   60         rorq $16,               RX0; /* b3 -> %ah, b2 -> %al */ \
   61         movzbl RX0bh,           RT0d; \
   62         movzbl RX0bl,           RT1d; \
   63         rolq $16,               RX0; /* undo: b1 -> %ah, b0 -> %al */ \
   64         movl s0(CTX,RT0,4),     RT0d; \
   65         addl s1(CTX,RT1,4),     RT0d; \
   66         movzbl RX0bh,           RT1d; \
   67         movzbl RX0bl,           RT2d; \
   68         rolq $32,               RX0; /* swap the 32-bit halves */ \
   69         xorl s2(CTX,RT1,4),     RT0d; \
   70         addl s3(CTX,RT2,4),     RT0d; \
   71         xorq RT0,               RX0; /* upper 32 bits of RT0 are zero */
  72 
  73 #define add_roundkey_enc(n) \
  74         xorq p+4*(n)(CTX),      RX0;
  75 
  76 #define round_enc(n) \
  77         add_roundkey_enc(n); \
  78         \
  79         F(); \
  80         F();
  81 
  82 #define add_roundkey_dec(n) \
  83         movq p+4*(n-1)(CTX),    RT0; \
  84         rorq $32,               RT0; \
  85         xorq RT0,               RX0;
  86 
  87 #define round_dec(n) \
  88         add_roundkey_dec(n); \
  89         \
  90         F(); \
  91         F(); \
  92 
   93 #define read_block() /* RX0 = the 8-byte block at (RIO), arranged for F() */ \
   94         movq (RIO),             RX0; \
   95         rorq $32,               RX0; \
   96         bswapq                  RX0;
   97
      /*
       * read_block() above: after the rotate and the 64-bit byte swap, the first
       * big-endian 32-bit word of the block is in the low half of RX0 and the
       * second one in the high half.
       *
       * write_block() below: byte-swaps RX0 and stores it to (RIO).  There is
       * no rotate here, so the high half is written first (as a big-endian
       * word), followed by the low half: the halves come out exchanged
       * relative to how read_block() placed them.  RX0 is left byte-swapped.
       */
   98 #define write_block() \
   99         bswapq                  RX0; \
  100         movq RX0,               (RIO);
  101
      /* Same as write_block(), but XORs into the 8 bytes already at (RIO). */
  102 #define xor_block() \
  103         bswapq                  RX0; \
  104         xorq RX0,               (RIO);
 105 
  106 ENTRY(__blowfish_enc_blk)
  107         /* input:
  108          *      %rdi: ctx
  109          *      %rsi: dst
  110          *      %rdx: src
  111          *      %rcx: bool, if true: xor output
               *
               * Encrypts the 8-byte block at src with the keys in ctx and either
               * stores the result to dst or, if %cl is non-zero, XORs it into
               * the 8 bytes already at dst.
               *
               * %r12 is preserved.  Clobbers %rax, %rdi, %rsi, %r8, %r10, %r11
               * and flags.
  112          */
  113         movq %r12, %r11;        /* CTX is %r12 (callee-saved): park it in %r11 */
  114
  115         movq %rdi, CTX;
  116         movq %rsi, %r10;        /* F() clobbers %rsi (RT1): keep dst aside */
  117         movq %rdx, RIO;
  118
  119         read_block();
  120
              /* each round_enc(n) adds key words n, n+1 and runs two F() steps */
  121         round_enc(0);
  122         round_enc(2);
  123         round_enc(4);
  124         round_enc(6);
  125         round_enc(8);
  126         round_enc(10);
  127         round_enc(12);
  128         round_enc(14);
  129         add_roundkey_enc(16);   /* final key words 16 and 17 */
  130
  131         movq %r11, %r12;        /* CTX no longer needed: restore caller's %r12 */
  132
  133         movq %r10, RIO;         /* output goes to dst */
  134         test %cl, %cl;          /* %rcx is never written by the 1-way code */
  135         jnz .L__enc_xor;
  136
  137         write_block();
  138         ret;
  139 .L__enc_xor:
  140         xor_block();
  141         ret;
  142 ENDPROC(__blowfish_enc_blk)
 143 
  144 ENTRY(blowfish_dec_blk)
  145         /* input:
  146          *      %rdi: ctx
  147          *      %rsi: dst
  148          *      %rdx: src
               *
               * Decrypts the 8-byte block at src with the keys in ctx and stores
               * the result to dst.  Same structure as the encryption routine,
               * but the key words are applied from index 17 down to 0.
               *
               * %r12 is preserved.  Clobbers %rax, %rdi, %rsi, %r8, %r10, %r11
               * and flags.
  149          */
  150         movq %r12, %r11;        /* CTX is %r12 (callee-saved): park it in %r11 */
  151
  152         movq %rdi, CTX;
  153         movq %rsi, %r10;        /* F() clobbers %rsi (RT1): keep dst aside */
  154         movq %rdx, RIO;
  155
  156         read_block();
  157
              /* each round_dec(n) adds key words n, n-1 and runs two F() steps */
  158         round_dec(17);
  159         round_dec(15);
  160         round_dec(13);
  161         round_dec(11);
  162         round_dec(9);
  163         round_dec(7);
  164         round_dec(5);
  165         round_dec(3);
  166         add_roundkey_dec(1);    /* final key words 1 and 0 */
  167
  168         movq %r10, RIO;         /* output goes to dst */
  169         write_block();
  170
  171         movq %r11, %r12;        /* restore caller's %r12 */
  172
  173         ret;
  174 ENDPROC(blowfish_dec_blk)
 175 
 176 /**********************************************************************
  177  * 4-way blowfish, four blocks in parallel
 178  **********************************************************************/
 179 
  180 /* F() for 4-way. Slower when used alone/1-way, but faster when used
  181  * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
       *
       * x must be one of RX0..RX3: the byte registers are formed by pasting
       * "bl"/"bh" onto the macro name.  With b3..b0 the bytes of the low 32
       * bits of x (b3 = bits 31..24), this computes
       *      t = ((s0[b3] + s1[b2]) ^ s2[b1]) + s3[b0]
       * in 32-bit arithmetic, leaves x rotated by 32 bits in total (halves
       * swapped) and XORs t into the new low half.
       *
       * All four index bytes are extracted before any table access.  Clobbers
       * RT0..RT3; RT1 is %rsi, so RIO has to be reloaded afterwards.
  182  */
  183 #define F4(x) \
  184         movzbl x ## bh,         RT1d; \
  185         movzbl x ## bl,         RT3d; \
  186         rorq $16,               x; \
  187         movzbl x ## bh,         RT0d; \
  188         movzbl x ## bl,         RT2d; \
  189         rorq $16,               x; \
  190         movl s0(CTX,RT0,4),     RT0d; \
  191         addl s1(CTX,RT2,4),     RT0d; \
  192         xorl s2(CTX,RT1,4),     RT0d; \
  193         addl s3(CTX,RT3,4),     RT0d; \
  194         xorq RT0,               x;
 195 
 196 #define add_preloaded_roundkey4() \
 197         xorq RKEY,              RX0; \
 198         xorq RKEY,              RX1; \
 199         xorq RKEY,              RX2; \
 200         xorq RKEY,              RX3;
 201 
 202 #define preload_roundkey_enc(n) \
 203         movq p+4*(n)(CTX),      RKEY;
 204 
 205 #define add_roundkey_enc4(n) \
 206         add_preloaded_roundkey4(); \
 207         preload_roundkey_enc(n + 2);
 208 
 209 #define round_enc4(n) \
 210         add_roundkey_enc4(n); \
 211         \
 212         F4(RX0); \
 213         F4(RX1); \
 214         F4(RX2); \
 215         F4(RX3); \
 216         \
 217         F4(RX0); \
 218         F4(RX1); \
 219         F4(RX2); \
 220         F4(RX3);
 221 
 222 #define preload_roundkey_dec(n) \
 223         movq p+4*((n)-1)(CTX),  RKEY; \
 224         rorq $32,               RKEY;
 225 
 226 #define add_roundkey_dec4(n) \
 227         add_preloaded_roundkey4(); \
 228         preload_roundkey_dec(n - 2);
 229 
 230 #define round_dec4(n) \
 231         add_roundkey_dec4(n); \
 232         \
 233         F4(RX0); \
 234         F4(RX1); \
 235         F4(RX2); \
 236         F4(RX3); \
 237         \
 238         F4(RX0); \
 239         F4(RX1); \
 240         F4(RX2); \
 241         F4(RX3);
 242 
 243 #define read_block4() \
 244         movq (RIO),             RX0; \
 245         rorq $32,               RX0; \
 246         bswapq                  RX0; \
 247         \
 248         movq 8(RIO),            RX1; \
 249         rorq $32,               RX1; \
 250         bswapq                  RX1; \
 251         \
 252         movq 16(RIO),           RX2; \
 253         rorq $32,               RX2; \
 254         bswapq                  RX2; \
 255         \
 256         movq 24(RIO),           RX3; \
 257         rorq $32,               RX3; \
 258         bswapq                  RX3;
 259 
 260 #define write_block4() \
 261         bswapq                  RX0; \
 262         movq RX0,               (RIO); \
 263         \
 264         bswapq                  RX1; \
 265         movq RX1,               8(RIO); \
 266         \
 267         bswapq                  RX2; \
 268         movq RX2,               16(RIO); \
 269         \
 270         bswapq                  RX3; \
 271         movq RX3,               24(RIO);
 272 
 273 #define xor_block4() \
 274         bswapq                  RX0; \
 275         xorq RX0,               (RIO); \
 276         \
 277         bswapq                  RX1; \
 278         xorq RX1,               8(RIO); \
 279         \
 280         bswapq                  RX2; \
 281         xorq RX2,               16(RIO); \
 282         \
 283         bswapq                  RX3; \
 284         xorq RX3,               24(RIO);
 285 
 286 ENTRY(__blowfish_enc_blk_4way)
 287         /* input:
 288          *      %rdi: ctx
 289          *      %rsi: dst
 290          *      %rdx: src
 291          *      %rcx: bool, if true: xor output
 292          */
 293         pushq %r12;
 294         pushq %rbx;
 295         pushq %rcx;
 296 
 297         movq %rdi, CTX
 298         movq %rsi, %r11;
 299         movq %rdx, RIO;
 300 
 301         preload_roundkey_enc(0);
 302 
 303         read_block4();
 304 
 305         round_enc4(0);
 306         round_enc4(2);
 307         round_enc4(4);
 308         round_enc4(6);
 309         round_enc4(8);
 310         round_enc4(10);
 311         round_enc4(12);
 312         round_enc4(14);
 313         add_preloaded_roundkey4();
 314 
 315         popq %r12;
 316         movq %r11, RIO;
 317 
 318         test %r12b, %r12b;
 319         jnz .L__enc_xor4;
 320 
 321         write_block4();
 322 
 323         popq %rbx;
 324         popq %r12;
 325         ret;
 326 
 327 .L__enc_xor4:
 328         xor_block4();
 329 
 330         popq %rbx;
 331         popq %r12;
 332         ret;
 333 ENDPROC(__blowfish_enc_blk_4way)
 334 
 335 ENTRY(blowfish_dec_blk_4way)
 336         /* input:
 337          *      %rdi: ctx
 338          *      %rsi: dst
 339          *      %rdx: src
 340          */
 341         pushq %r12;
 342         pushq %rbx;
 343 
 344         movq %rdi, CTX;
 345         movq %rsi, %r11
 346         movq %rdx, RIO;
 347 
 348         preload_roundkey_dec(17);
 349         read_block4();
 350 
 351         round_dec4(17);
 352         round_dec4(15);
 353         round_dec4(13);
 354         round_dec4(11);
 355         round_dec4(9);
 356         round_dec4(7);
 357         round_dec4(5);
 358         round_dec4(3);
 359         add_preloaded_roundkey4();
 360 
 361         movq %r11, RIO;
 362         write_block4();
 363 
 364         popq %rbx;
 365         popq %r12;
 366 
 367         ret;
 368 ENDPROC(blowfish_dec_blk_4way)

/* [<][>][^][v][top][bottom][index][help] */