root/arch/x86/crypto/camellia-x86_64-asm_64.S

/* [<][>][^][v][top][bottom][index][help] */
   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * Camellia Cipher Algorithm (x86_64)
   4  *
   5  * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
   6  */
   7 
   8 #include <linux/linkage.h>
   9 
  10 .file "camellia-x86_64-asm_64.S"
  11 .text
  12 
  13 .extern camellia_sp10011110;
  14 .extern camellia_sp22000222;
  15 .extern camellia_sp03303033;
  16 .extern camellia_sp00444404;
  17 .extern camellia_sp02220222;
  18 .extern camellia_sp30333033;
  19 .extern camellia_sp44044404;
  20 .extern camellia_sp11101110;
  21 
  22 #define sp10011110 camellia_sp10011110
  23 #define sp22000222 camellia_sp22000222
  24 #define sp03303033 camellia_sp03303033
  25 #define sp00444404 camellia_sp00444404
  26 #define sp02220222 camellia_sp02220222
  27 #define sp30333033 camellia_sp30333033
  28 #define sp44044404 camellia_sp44044404
  29 #define sp11101110 camellia_sp11101110
  30 
  31 #define CAMELLIA_TABLE_BYTE_LEN 272 /* byte offset at which key_length follows key_table */
  32 
  33 /* struct camellia_ctx:
      *   byte offsets of the two context fields that the code below reads
      *   through CTX.  key_table is addressed in 8-byte entries; key_length
      *   is compared against 16 to select the short or the long round
      *   schedule.
      */
  34 #define key_table 0
  35 #define key_length CAMELLIA_TABLE_BYTE_LEN
  36 
  37 /* register macros
      *
      * Each role is defined in several widths (no suffix = 64 bit, d = 32 bit,
      * bl = bits 0-7, bh = bits 8-15) because the code macros build the
      * narrower names by token pasting.
      *
      *   CTX          context pointer, never modified.
      *   RIO          pointer used for loading input / storing output.
      *   RAB0, RCD0   the two 64-bit halves of block 0.
      *   RAB1, RCD1   the two 64-bit halves of block 1 (2-way code only).
      *                All four are a/b/c/d registers, which is what makes the
      *                bh names available.
      *   RT0-RT2      temporaries.  RT0 is the same register as RIO (%rsi),
      *                so RIO is destroyed by the round/fls macros and has to
      *                be reloaded before it is used again.  RT1 is %r12, so
      *                the entry points keep the caller's %r12 in RR12.
      *   RXOR         copy of the "xor" argument in the encrypt entry points;
      *                the decrypt entry points reuse it as a scratch register.
      *   RR12         holds the caller's %r12 for the duration of a call.
      *   RDST         copy of the dst argument (the incoming %rsi).
      */
  38 #define CTX %rdi
  39 #define RIO %rsi
  40 #define RIOd %esi
  41 
  42 #define RAB0 %rax
  43 #define RCD0 %rcx
  44 #define RAB1 %rbx
  45 #define RCD1 %rdx
  46 
  47 #define RAB0d %eax
  48 #define RCD0d %ecx
  49 #define RAB1d %ebx
  50 #define RCD1d %edx
  51 
  52 #define RAB0bl %al
  53 #define RCD0bl %cl
  54 #define RAB1bl %bl
  55 #define RCD1bl %dl
  56 
  57 #define RAB0bh %ah
  58 #define RCD0bh %ch
  59 #define RAB1bh %bh
  60 #define RCD1bh %dh
  61 
  62 #define RT0 %rsi
  63 #define RT1 %r12
  64 #define RT2 %r8
  65 
  66 #define RT0d %esi
  67 #define RT1d %r12d
  68 #define RT2d %r8d
  69 
  70 #define RT2bl %r8b
  71 
  72 #define RXOR %r9
  73 #define RR12 %r10
  74 #define RDST %r11
  75 
  76 #define RXORd %r9d
  77 #define RXORbl %r9b
  78 
  79 #define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
             /* Two table lookups driven by the two lowest bytes of ab: */ \
             /*   dst ^= T0[bits 0-7 of ab] ^ T1[bits 8-15 of ab] */ \
             /* Table entries are 8 bytes wide.  ab is left rotated right by */ \
             /* 16 so that the next invocation sees the following two bytes. */ \
             /* tmp1 and tmp2 are clobbered.  ab, tmp1 and tmp2 must be the */ \
             /* register macro names (not raw registers) because the bl/bh/d */ \
             /* variants are formed with ##. */ \
  80         movzbl ab ## bl,                tmp2 ## d; \
  81         movzbl ab ## bh,                tmp1 ## d; \
  82         rorq $16,                       ab; \
  83         xorq T0(, tmp2, 8),             dst; \
  84         xorq T1(, tmp1, 8),             dst;
  85 
  86 /**********************************************************************
  87   1-way camellia
  88  **********************************************************************/
  89 #define roundsm(ab, subkey, cd) \
             /* One round on block 0: */ \
             /*   cd0 ^= key_table[subkey] ^ (eight table lookups, one per byte of ab0) */ \
             /* subkey counts 8-byte entries of key_table.  The lookups are */ \
             /* accumulated alternately in cd0 and in RT2 (which starts out */ \
             /* holding the subkey); the final xorq merges RT2 into cd0. */ \
             /* ab0 is rotated 4 x 16 = 64 bits in total, so it ends up with */ \
             /* its original value.  Clobbers RT0, RT1 and RT2. */ \
  90         movq (key_table + ((subkey) * 2) * 4)(CTX),     RT2; \
  91         \
  92         xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
  93         xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
  94         xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
  95         xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
  96         \
  97         xorq RT2,                                       cd ## 0;
  98 
  99 #define fls(l, r, kl, kr) \
             /* Key-dependent mixing of the two halves of block 0 (l0 and r0) */ \
             /* with key_table entries kl and kr (8-byte entry indices). */ \
             /* lo32/hi32 below mean bits 0-31 / 32-63 of a 64-bit value. */ \
             /* NOTE(review): the four steps have the shape of Camellia's FL */ \
             /* applied to l and FL^-1 applied to r; confirm against the */ \
             /* specification.  Clobbers RT0, RT1 and RT2. */ \
             /* */ \
             /* step 1: hi32(l) ^= rol32(lo32(l) & lo32(key[kl]), 1) */ \
 100         movl (key_table + ((kl) * 2) * 4)(CTX),         RT0d; \
 101         andl l ## 0d,                                   RT0d; \
 102         roll $1,                                        RT0d; \
 103         shlq $32,                                       RT0; \
 104         xorq RT0,                                       l ## 0; \
             /* step 2: lo32(r) ^= hi32(r) | hi32(key[kr]) */ \
 105         movq (key_table + ((kr) * 2) * 4)(CTX),         RT1; \
 106         orq r ## 0,                                     RT1; \
 107         shrq $32,                                       RT1; \
 108         xorq RT1,                                       r ## 0; \
 109         \
             /* step 3: lo32(l) ^= hi32(l) | hi32(key[kl]), with hi32(l) from step 1 */ \
 110         movq (key_table + ((kl) * 2) * 4)(CTX),         RT2; \
 111         orq l ## 0,                                     RT2; \
 112         shrq $32,                                       RT2; \
 113         xorq RT2,                                       l ## 0; \
             /* step 4: hi32(r) ^= rol32(lo32(r) & lo32(key[kr]), 1), with lo32(r) from step 2 */ \
 114         movl (key_table + ((kr) * 2) * 4)(CTX),         RT0d; \
 115         andl r ## 0d,                                   RT0d; \
 116         roll $1,                                        RT0d; \
 117         shlq $32,                                       RT0; \
 118         xorq RT0,                                       r ## 0;
 119 
 120 #define enc_rounds(i) \
             /* Six rounds on block 0 with key_table entries i+2 .. i+7 in */ \
             /* ascending order.  The halves swap roles every round: RAB */ \
             /* drives the lookups that update RCD, then RCD updates RAB. */ \
 121         roundsm(RAB, i + 2, RCD); \
 122         roundsm(RCD, i + 3, RAB); \
 123         roundsm(RAB, i + 4, RCD); \
 124         roundsm(RCD, i + 5, RAB); \
 125         roundsm(RAB, i + 6, RCD); \
 126         roundsm(RCD, i + 7, RAB);
 127 
 128 #define enc_fls(i) \
             /* fls on block 0 for encryption: entry i is paired with RAB, entry i+1 with RCD. */ \
 129         fls(RAB, RCD, i + 0, i + 1);
 130 
 131 #define enc_inpack() \
             /* Load the 16 input bytes at RIO into RAB0 (bytes 0-7) and RCD0 */ \
             /* (bytes 8-15).  bswapq followed by a 32-bit rotate byte-swaps */ \
             /* each 32-bit word in place: lo32 = big-endian value of the */ \
             /* first four bytes, hi32 = big-endian value of the next four. */ \
             /* (rolq $32 and rorq $32 are the same operation.)  Finally */ \
             /* key_table entry 0 is XORed into RAB0; RCD0 is not keyed here. */ \
 132         movq (RIO),                     RAB0; \
 133         bswapq                          RAB0; \
 134         rolq $32,                       RAB0; \
 135         movq 4*2(RIO),                  RCD0; \
 136         bswapq                          RCD0; \
 137         rorq $32,                       RCD0; \
 138         xorq key_table(CTX),            RAB0;
 139 
 140 #define enc_outunpack(op, max) \
             /* Write block 0 to the 16 bytes at RIO.  max is a register */ \
             /* holding the 8-byte entry index of the final key_table entry, */ \
             /* which is XORed into RCD0 first.  The halves are written in */ \
             /* swapped order: RCD0 to bytes 0-7, RAB0 to bytes 8-15, each */ \
             /* converted back with the inverse of the input byte swap. */ \
             /* op is pasted with q: "mov" stores the result, "xor" XORs it */ \
             /* into what is already in memory.  RAB0 and RCD0 are modified. */ \
 141         xorq key_table(CTX, max, 8),    RCD0; \
 142         rorq $32,                       RCD0; \
 143         bswapq                          RCD0; \
 144         op ## q RCD0,                   (RIO); \
 145         rolq $32,                       RAB0; \
 146         bswapq                          RAB0; \
 147         op ## q RAB0,                   4*2(RIO);
 148 
 149 #define dec_rounds(i) \
             /* Six rounds on block 0 with key_table entries i+7 .. i+2, i.e. */ \
             /* the same entries as enc_rounds(i) but in descending order. */ \
 150         roundsm(RAB, i + 7, RCD); \
 151         roundsm(RCD, i + 6, RAB); \
 152         roundsm(RAB, i + 5, RCD); \
 153         roundsm(RCD, i + 4, RAB); \
 154         roundsm(RAB, i + 3, RCD); \
 155         roundsm(RCD, i + 2, RAB);
 156 
 157 #define dec_fls(i) \
             /* fls on block 0 for decryption: the two key entries are swapped */ \
             /* relative to enc_fls (entry i+1 with RAB, entry i with RCD). */ \
 158         fls(RAB, RCD, i + 1, i + 0);
 159 
 160 #define dec_inpack(max) \
             /* Same load and per-32-bit-word byte swap as enc_inpack, but */ \
             /* RAB0 is XORed with key_table entry max (max is a register */ \
             /* holding an 8-byte entry index) instead of entry 0. */ \
 161         movq (RIO),                     RAB0; \
 162         bswapq                          RAB0; \
 163         rolq $32,                       RAB0; \
 164         movq 4*2(RIO),                  RCD0; \
 165         bswapq                          RCD0; \
 166         rorq $32,                       RCD0; \
 167         xorq key_table(CTX, max, 8),    RAB0;
 168 
 169 #define dec_outunpack() \
             /* Store block 0 to the 16 bytes at RIO: key_table entry 0 is */ \
             /* XORed into RCD0, then RCD0 goes to bytes 0-7 and RAB0 to */ \
             /* bytes 8-15.  Unlike enc_outunpack this always stores (movq). */ \
 170         xorq key_table(CTX),            RCD0; \
 171         rorq $32,                       RCD0; \
 172         bswapq                          RCD0; \
 173         movq RCD0,                      (RIO); \
 174         rolq $32,                       RAB0; \
 175         bswapq                          RAB0; \
 176         movq RAB0,                      4*2(RIO);
 177 
 178 ENTRY(__camellia_enc_blk)
 179         /* input:
 180          *      %rdi: ctx, CTX
 181          *      %rsi: dst
 182          *      %rdx: src
 183          *      %rcx: bool xor
              *
              * Encrypts the 16 bytes at src.  If the low byte of xor is
              * non-zero the result is XORed into the 16 bytes at dst,
              * otherwise it is stored there.
              *
              * Modifies %rax, %rcx, %rsi, %r8-%r11 and the flags.  %r12 is
              * used as RT1 and restored from RR12 before returning.
 184          */
 185         movq %r12, RR12;
 186 
             /* The argument registers double as state/temporaries
              * (%rcx is RCD0, %rsi is RT0), so move the arguments away. */
 187         movq %rcx, RXOR;
 188         movq %rsi, RDST;
 189         movq %rdx, RIO;
 190 
 191         enc_inpack();
 192 
 193         enc_rounds(0);
 194         enc_fls(8);
 195         enc_rounds(8);
 196         enc_fls(16);
 197         enc_rounds(16);
             /* RT1 is clobbered by the round macros, so max is only set
              * once they are done. */
 198         movl $24, RT1d; /* max */
 199 
             /* NOTE(review): byte-sized compare; camellia_dec_blk compares
              * the same field with cmpl. Confirm the field width. */
 200         cmpb $16, key_length(CTX);
 201         je .L__enc_done;
 202 
 203         enc_fls(24);
 204         enc_rounds(24);
 205         movl $32, RT1d; /* max */
 206 
 207 .L__enc_done:
             /* movq leaves the flags alone, so jnz still sees the result
              * of testb.  RIO has to be reloaded because RT0 shares %rsi. */
 208         testb RXORbl, RXORbl;
 209         movq RDST, RIO;
 210 
 211         jnz .L__enc_xor;
 212 
 213         enc_outunpack(mov, RT1);
 214 
 215         movq RR12, %r12;
 216         ret;
 217 
 218 .L__enc_xor:
 219         enc_outunpack(xor, RT1);
 220 
 221         movq RR12, %r12;
 222         ret;
 223 ENDPROC(__camellia_enc_blk)
 224 
 225 ENTRY(camellia_dec_blk)
 226         /* input:
 227          *      %rdi: ctx, CTX
 228          *      %rsi: dst
 229          *      %rdx: src
              *
              * Decrypts the 16 bytes at src and stores the result in the
              * 16 bytes at dst.
              *
              * Modifies %rax, %rcx, %rsi, %r8-%r11 and the flags.  %r12 is
              * used as RT1 and restored from RR12 before returning.
 230          */
             /* max = (key_length == 16) ? 24 : 32.  The two movl do not
              * touch the flags, so cmovel still acts on the cmpl result.
              * RXORd is only a scratch register here. */
 231         cmpl $16, key_length(CTX);
 232         movl $32, RT2d;
 233         movl $24, RXORd;
 234         cmovel RXORd, RT2d; /* max */
 235 
 236         movq %r12, RR12;
 237         movq %rsi, RDST;
 238         movq %rdx, RIO;
 239 
 240         dec_inpack(RT2);
 241 
             /* max == 24: skip the additional six rounds and fls layer. */
 242         cmpb $24, RT2bl;
 243         je .L__dec_rounds16;
 244 
 245         dec_rounds(24);
 246         dec_fls(24);
 247 
 248 .L__dec_rounds16:
 249         dec_rounds(16);
 250         dec_fls(16);
 251         dec_rounds(8);
 252         dec_fls(8);
 253         dec_rounds(0);
 254 
             /* RIO was destroyed by the rounds (RT0 shares %rsi). */
 255         movq RDST, RIO;
 256 
 257         dec_outunpack();
 258 
 259         movq RR12, %r12;
 260         ret;
 261 ENDPROC(camellia_dec_blk)
 262 
 263 /**********************************************************************
 264   2-way camellia
 265  **********************************************************************/
 266 #define roundsm2(ab, subkey, cd) \
             /* One round on both blocks with the same subkey: */ \
             /*   cdN ^= key_table[subkey] ^ (eight lookups, one per byte of abN), N = 0, 1 */ \
             /* Block 1 gets the subkey XORed into cd1 up front and all of its */ \
             /* lookups go straight into cd1.  Block 0 alternates between cd0 */ \
             /* and RT2 as in roundsm; RT2 is merged into cd0 after the first */ \
             /* block-1 lookup.  ab0 and ab1 each end up with their original */ \
             /* value (4 x 16 = 64 bits of rotation).  Clobbers RT0, RT1, RT2. */ \
 267         movq (key_table + ((subkey) * 2) * 4)(CTX),     RT2; \
 268         xorq RT2,                                       cd ## 1; \
 269         \
 270         xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
 271         xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
 272         xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
 273         xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
 274         \
 275                 xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
 276                 xorq RT2,                                       cd ## 0; \
 277                 xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
 278                 xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
 279                 xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);
 280 
 281 #define fls2(l, r, kl, kr) \
             /* The four fls steps applied to block 0 (l0, r0) and block 1 */ \
             /* (l1, r1) with the same key_table entries kl and kr.  The */ \
             /* deeper-indented groups are the block-1 steps.  lo32/hi32 mean */ \
             /* bits 0-31 / 32-63.  Clobbers RT0, RT1 and RT2. */ \
             /* */ \
             /* block 0: hi32(l) ^= rol32(lo32(l) & lo32(key[kl]), 1) */ \
 282         movl (key_table + ((kl) * 2) * 4)(CTX),         RT0d; \
 283         andl l ## 0d,                                   RT0d; \
 284         roll $1,                                        RT0d; \
 285         shlq $32,                                       RT0; \
 286         xorq RT0,                                       l ## 0; \
             /* block 0: lo32(r) ^= hi32(r) | hi32(key[kr]) */ \
 287         movq (key_table + ((kr) * 2) * 4)(CTX),         RT1; \
 288         orq r ## 0,                                     RT1; \
 289         shrq $32,                                       RT1; \
 290         xorq RT1,                                       r ## 0; \
 291         \
                     /* block 1: the same two steps on l1 and r1 */ \
 292                 movl (key_table + ((kl) * 2) * 4)(CTX),         RT2d; \
 293                 andl l ## 1d,                                   RT2d; \
 294                 roll $1,                                        RT2d; \
 295                 shlq $32,                                       RT2; \
 296                 xorq RT2,                                       l ## 1; \
 297                 movq (key_table + ((kr) * 2) * 4)(CTX),         RT0; \
 298                 orq r ## 1,                                     RT0; \
 299                 shrq $32,                                       RT0; \
 300                 xorq RT0,                                       r ## 1; \
 301         \
             /* block 0: lo32(l) ^= hi32(l) | hi32(key[kl]) */ \
 302         movq (key_table + ((kl) * 2) * 4)(CTX),         RT1; \
 303         orq l ## 0,                                     RT1; \
 304         shrq $32,                                       RT1; \
 305         xorq RT1,                                       l ## 0; \
             /* block 0: hi32(r) ^= rol32(lo32(r) & lo32(key[kr]), 1) */ \
 306         movl (key_table + ((kr) * 2) * 4)(CTX),         RT2d; \
 307         andl r ## 0d,                                   RT2d; \
 308         roll $1,                                        RT2d; \
 309         shlq $32,                                       RT2; \
 310         xorq RT2,                                       r ## 0; \
 311         \
                     /* block 1: the same two steps on l1 and r1 */ \
 312                 movq (key_table + ((kl) * 2) * 4)(CTX),         RT0; \
 313                 orq l ## 1,                                     RT0; \
 314                 shrq $32,                                       RT0; \
 315                 xorq RT0,                                       l ## 1; \
 316                 movl (key_table + ((kr) * 2) * 4)(CTX),         RT1d; \
 317                 andl r ## 1d,                                   RT1d; \
 318                 roll $1,                                        RT1d; \
 319                 shlq $32,                                       RT1; \
 320                 xorq RT1,                                       r ## 1;
 321 
 322 #define enc_rounds2(i) \
             /* Six two-block rounds with key_table entries i+2 .. i+7 in */ \
             /* ascending order, alternating which half drives the lookups. */ \
 323         roundsm2(RAB, i + 2, RCD); \
 324         roundsm2(RCD, i + 3, RAB); \
 325         roundsm2(RAB, i + 4, RCD); \
 326         roundsm2(RCD, i + 5, RAB); \
 327         roundsm2(RAB, i + 6, RCD); \
 328         roundsm2(RCD, i + 7, RAB);
 329 
 330 #define enc_fls2(i) \
             /* fls2 for encryption: entry i is paired with RAB, entry i+1 with RCD. */ \
 331         fls2(RAB, RCD, i + 0, i + 1);
 332 
 333 #define enc_inpack2() \
             /* Load two consecutive 16-byte blocks from RIO: bytes 0-15 into */ \
             /* RAB0/RCD0 and bytes 16-31 into RAB1/RCD1.  bswapq plus the */ \
             /* 32-bit rotate byte-swaps each 32-bit word in place (rorq $32 */ \
             /* and rolq $32 are the same operation).  key_table entry 0 is */ \
             /* XORed into RAB0 and RAB1. */ \
 334         movq (RIO),                     RAB0; \
 335         bswapq                          RAB0; \
 336         rorq $32,                       RAB0; \
 337         movq 4*2(RIO),                  RCD0; \
 338         bswapq                          RCD0; \
 339         rolq $32,                       RCD0; \
 340         xorq key_table(CTX),            RAB0; \
 341         \
 342                 movq 8*2(RIO),                  RAB1; \
 343                 bswapq                          RAB1; \
 344                 rorq $32,                       RAB1; \
 345                 movq 12*2(RIO),                 RCD1; \
 346                 bswapq                          RCD1; \
 347                 rolq $32,                       RCD1; \
 348                 xorq key_table(CTX),            RAB1;
 349 
 350 #define enc_outunpack2(op, max) \
             /* Write both blocks to the 32 bytes at RIO.  key_table entry max */ \
             /* (max is a register holding an 8-byte entry index) is XORed */ \
             /* into RCD0 and RCD1.  Per block the halves are written in */ \
             /* swapped order: RCDn first, then RABn.  op is pasted with q: */ \
             /* "mov" stores, "xor" XORs into what is already in memory. */ \
 351         xorq key_table(CTX, max, 8),    RCD0; \
 352         rolq $32,                       RCD0; \
 353         bswapq                          RCD0; \
 354         op ## q RCD0,                   (RIO); \
 355         rorq $32,                       RAB0; \
 356         bswapq                          RAB0; \
 357         op ## q RAB0,                   4*2(RIO); \
 358         \
 359                 xorq key_table(CTX, max, 8),    RCD1; \
 360                 rolq $32,                       RCD1; \
 361                 bswapq                          RCD1; \
 362                 op ## q RCD1,                   8*2(RIO); \
 363                 rorq $32,                       RAB1; \
 364                 bswapq                          RAB1; \
 365                 op ## q RAB1,                   12*2(RIO);
 366 
 367 #define dec_rounds2(i) \
             /* Six two-block rounds with key_table entries i+7 .. i+2, i.e. */ \
             /* the entries of enc_rounds2(i) in descending order. */ \
 368         roundsm2(RAB, i + 7, RCD); \
 369         roundsm2(RCD, i + 6, RAB); \
 370         roundsm2(RAB, i + 5, RCD); \
 371         roundsm2(RCD, i + 4, RAB); \
 372         roundsm2(RAB, i + 3, RCD); \
 373         roundsm2(RCD, i + 2, RAB);
 374 
 375 #define dec_fls2(i) \
             /* fls2 for decryption: the key entries are swapped relative to */ \
             /* enc_fls2 (entry i+1 with RAB, entry i with RCD). */ \
 376         fls2(RAB, RCD, i + 1, i + 0);
 377 
 378 #define dec_inpack2(max) \
             /* Same two-block load and byte swap as enc_inpack2, but RAB0 and */ \
             /* RAB1 are XORed with key_table entry max (max is a register */ \
             /* holding an 8-byte entry index) instead of entry 0. */ \
 379         movq (RIO),                     RAB0; \
 380         bswapq                          RAB0; \
 381         rorq $32,                       RAB0; \
 382         movq 4*2(RIO),                  RCD0; \
 383         bswapq                          RCD0; \
 384         rolq $32,                       RCD0; \
 385         xorq key_table(CTX, max, 8),    RAB0; \
 386         \
 387                 movq 8*2(RIO),                  RAB1; \
 388                 bswapq                          RAB1; \
 389                 rorq $32,                       RAB1; \
 390                 movq 12*2(RIO),                 RCD1; \
 391                 bswapq                          RCD1; \
 392                 rolq $32,                       RCD1; \
 393                 xorq key_table(CTX, max, 8),    RAB1;
 394 
 395 #define dec_outunpack2() \
             /* Store both blocks to the 32 bytes at RIO: key_table entry 0 is */ \
             /* XORed into RCD0 and RCD1, then per block RCDn is written */ \
             /* first and RABn second.  Always stores (movq). */ \
 396         xorq key_table(CTX),            RCD0; \
 397         rolq $32,                       RCD0; \
 398         bswapq                          RCD0; \
 399         movq RCD0,                      (RIO); \
 400         rorq $32,                       RAB0; \
 401         bswapq                          RAB0; \
 402         movq RAB0,                      4*2(RIO); \
 403         \
 404                 xorq key_table(CTX),            RCD1; \
 405                 rolq $32,                       RCD1; \
 406                 bswapq                          RCD1; \
 407                 movq RCD1,                      8*2(RIO); \
 408                 rorq $32,                       RAB1; \
 409                 bswapq                          RAB1; \
 410                 movq RAB1,                      12*2(RIO);
 411 
 412 ENTRY(__camellia_enc_blk_2way)
 413         /* input:
 414          *      %rdi: ctx, CTX
 415          *      %rsi: dst
 416          *      %rdx: src
 417          *      %rcx: bool xor
              *
              * Encrypts the two consecutive 16-byte blocks at src.  If the
              * low byte of xor is non-zero the results are XORed into the
              * 32 bytes at dst, otherwise they are stored there.
              *
              * Modifies %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags.
              * %rbx (RAB1) is saved on the stack and %r12 (RT1) in RR12;
              * both are restored before returning.
 418          */
 419         pushq %rbx;
 420 
 421         movq %r12, RR12;
             /* %rcx, %rsi and %rdx double as RCD0, RT0 and RCD1, so the
              * arguments are moved away before the block state is loaded. */
 422         movq %rcx, RXOR;
 423         movq %rsi, RDST;
 424         movq %rdx, RIO;
 425 
 426         enc_inpack2();
 427 
 428         enc_rounds2(0);
 429         enc_fls2(8);
 430         enc_rounds2(8);
 431         enc_fls2(16);
 432         enc_rounds2(16);
             /* RT2 is clobbered by the round macros, so max is only set
              * once they are done. */
 433         movl $24, RT2d; /* max */
 434 
             /* NOTE(review): byte-sized compare; camellia_dec_blk_2way
              * compares the same field with cmpl. Confirm the field width. */
 435         cmpb $16, key_length(CTX);
 436         je .L__enc2_done;
 437 
 438         enc_fls2(24);
 439         enc_rounds2(24);
 440         movl $32, RT2d; /* max */
 441 
 442 .L__enc2_done:
             /* movq leaves the flags alone, so jnz still sees the result of
              * test.  NOTE(review): __camellia_enc_blk spells this testb;
              * the operand size here comes from the register operands. */
 443         test RXORbl, RXORbl;
 444         movq RDST, RIO;
 445         jnz .L__enc2_xor;
 446 
 447         enc_outunpack2(mov, RT2);
 448 
 449         movq RR12, %r12;
 450         popq %rbx;
 451         ret;
 452 
 453 .L__enc2_xor:
 454         enc_outunpack2(xor, RT2);
 455 
 456         movq RR12, %r12;
 457         popq %rbx;
 458         ret;
 459 ENDPROC(__camellia_enc_blk_2way)
 460 
 461 ENTRY(camellia_dec_blk_2way)
 462         /* input:
 463          *      %rdi: ctx, CTX
 464          *      %rsi: dst
 465          *      %rdx: src
              *
              * Decrypts the two consecutive 16-byte blocks at src and
              * stores the results in the 32 bytes at dst.
              *
              * Modifies %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags.
              * %rbx (RAB1) is kept in RXOR and %r12 (RT1) in RR12; both are
              * restored before returning.
 466          */
             /* max = (key_length == 16) ? 24 : 32.  The two movl do not
              * touch the flags, so cmovel still acts on the cmpl result. */
 467         cmpl $16, key_length(CTX);
 468         movl $32, RT2d;
 469         movl $24, RXORd;
 470         cmovel RXORd, RT2d; /* max */
 471 
             /* RXOR is free again after the cmovel; with no xor argument
              * here it serves as the save slot for %rbx. */
 472         movq %rbx, RXOR;
 473         movq %r12, RR12;
 474         movq %rsi, RDST;
 475         movq %rdx, RIO;
 476 
 477         dec_inpack2(RT2);
 478 
             /* max == 24: skip the additional six rounds and fls layer. */
 479         cmpb $24, RT2bl;
 480         je .L__dec2_rounds16;
 481 
 482         dec_rounds2(24);
 483         dec_fls2(24);
 484 
 485 .L__dec2_rounds16:
 486         dec_rounds2(16);
 487         dec_fls2(16);
 488         dec_rounds2(8);
 489         dec_fls2(8);
 490         dec_rounds2(0);
 491 
             /* RIO was destroyed by the rounds (RT0 shares %rsi). */
 492         movq RDST, RIO;
 493 
 494         dec_outunpack2();
 495 
 496         movq RR12, %r12;
 497         movq RXOR, %rbx;
 498         ret;
 499 ENDPROC(camellia_dec_blk_2way)

/* [<][>][^][v][top][bottom][index][help] */