arch/x86/crypto/serpent-avx-x86_64-asm_64.S

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
   4  *
   5  * Copyright (C) 2012 Johannes Goetzfried
   6  *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
   7  *
   8  * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
   9  */
  10 
  11 #include <linux/linkage.h>
  12 #include <asm/frame.h>
  13 #include "glue_helper-asm-avx.S"
  14 
  15 .file "serpent-avx-x86_64-asm_64.S"
  16 
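      /*
       * Constant data: .Lbswap128_mask reverses the byte order of a 128-bit
       * lane (used by the CTR helpers to convert the little-endian counter
       * into big-endian block byte order), and .Lxts_gf128mul_and_shl1_mask
       * provides the constants for multiplying an XTS tweak by α in
       * GF(2^128): shift left by one bit and conditionally XOR in the
       * reduction polynomial 0x87.
       */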
  17 .section        .rodata.cst16.bswap128_mask, "aM", @progbits, 16
  18 .align 16
  19 .Lbswap128_mask:
  20         .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
  21 .section        .rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
  22 .align 16
  23 .Lxts_gf128mul_and_shl1_mask:
  24         .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
  25 
  26 .text
  27 
  28 #define CTX %rdi
  29 
  30 /**********************************************************************
  31   8-way AVX serpent
  32  **********************************************************************/
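      /*
       * Register roles: RA/RB/RC/RD/RE with suffix 1 or 2 form two
       * independent groups of four data registers plus one scratch register
       * each; every group carries four 128-bit blocks and the two groups are
       * interleaved throughout for better instruction-level parallelism.
       * tp is extra scratch for the S-box macros, RNOT is an all-ones mask
       * (set with vpcmpeqd) used to implement bitwise NOT via vpxor, and
       * RK0..RK3 hold the broadcast 32-bit round-key words (they also serve
       * as temporaries for the block transpose).
       */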
  33 #define RA1 %xmm0
  34 #define RB1 %xmm1
  35 #define RC1 %xmm2
  36 #define RD1 %xmm3
  37 #define RE1 %xmm4
  38 
  39 #define tp  %xmm5
  40 
  41 #define RA2 %xmm6
  42 #define RB2 %xmm7
  43 #define RC2 %xmm8
  44 #define RD2 %xmm9
  45 #define RE2 %xmm10
  46 
  47 #define RNOT %xmm11
  48 
  49 #define RK0 %xmm12
  50 #define RK1 %xmm13
  51 #define RK2 %xmm14
  52 #define RK3 %xmm15
  53 
  54 
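      /*
       * Bitsliced Serpent S-boxes: S0..S7 are the forward S-boxes, SI0..SI7
       * their inverses.  After the block transpose, each of the 128 bit
       * positions across the four data registers forms one 4-bit S-box
       * input, so a single pass evaluates the S-box for all four blocks at
       * once using only AND/OR/XOR (with RNOT supplying NOT).  Each S-box is
       * split into an _1/_2 half so the two four-register groups can be
       * interleaved.
       */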
  55 #define S0_1(x0, x1, x2, x3, x4)      \
  56         vpor            x0,   x3, tp; \
  57         vpxor           x3,   x0, x0; \
  58         vpxor           x2,   x3, x4; \
  59         vpxor           RNOT, x4, x4; \
  60         vpxor           x1,   tp, x3; \
  61         vpand           x0,   x1, x1; \
  62         vpxor           x4,   x1, x1; \
  63         vpxor           x0,   x2, x2;
  64 #define S0_2(x0, x1, x2, x3, x4)      \
  65         vpxor           x3,   x0, x0; \
  66         vpor            x0,   x4, x4; \
  67         vpxor           x2,   x0, x0; \
  68         vpand           x1,   x2, x2; \
  69         vpxor           x2,   x3, x3; \
  70         vpxor           RNOT, x1, x1; \
  71         vpxor           x4,   x2, x2; \
  72         vpxor           x2,   x1, x1;
  73 
  74 #define S1_1(x0, x1, x2, x3, x4)      \
  75         vpxor           x0,   x1, tp; \
  76         vpxor           x3,   x0, x0; \
  77         vpxor           RNOT, x3, x3; \
  78         vpand           tp,   x1, x4; \
  79         vpor            tp,   x0, x0; \
  80         vpxor           x2,   x3, x3; \
  81         vpxor           x3,   x0, x0; \
  82         vpxor           x3,   tp, x1;
  83 #define S1_2(x0, x1, x2, x3, x4)      \
  84         vpxor           x4,   x3, x3; \
  85         vpor            x4,   x1, x1; \
  86         vpxor           x2,   x4, x4; \
  87         vpand           x0,   x2, x2; \
  88         vpxor           x1,   x2, x2; \
  89         vpor            x0,   x1, x1; \
  90         vpxor           RNOT, x0, x0; \
  91         vpxor           x2,   x0, x0; \
  92         vpxor           x1,   x4, x4;
  93 
  94 #define S2_1(x0, x1, x2, x3, x4)      \
  95         vpxor           RNOT, x3, x3; \
  96         vpxor           x0,   x1, x1; \
  97         vpand           x2,   x0, tp; \
  98         vpxor           x3,   tp, tp; \
  99         vpor            x0,   x3, x3; \
 100         vpxor           x1,   x2, x2; \
 101         vpxor           x1,   x3, x3; \
 102         vpand           tp,   x1, x1;
 103 #define S2_2(x0, x1, x2, x3, x4)      \
 104         vpxor           x2,   tp, tp; \
 105         vpand           x3,   x2, x2; \
 106         vpor            x1,   x3, x3; \
 107         vpxor           RNOT, tp, tp; \
 108         vpxor           tp,   x3, x3; \
 109         vpxor           tp,   x0, x4; \
 110         vpxor           x2,   tp, x0; \
 111         vpor            x2,   x1, x1;
 112 
 113 #define S3_1(x0, x1, x2, x3, x4)      \
 114         vpxor           x3,   x1, tp; \
 115         vpor            x0,   x3, x3; \
 116         vpand           x0,   x1, x4; \
 117         vpxor           x2,   x0, x0; \
 118         vpxor           tp,   x2, x2; \
 119         vpand           x3,   tp, x1; \
 120         vpxor           x3,   x2, x2; \
 121         vpor            x4,   x0, x0; \
 122         vpxor           x3,   x4, x4;
 123 #define S3_2(x0, x1, x2, x3, x4)      \
 124         vpxor           x0,   x1, x1; \
 125         vpand           x3,   x0, x0; \
 126         vpand           x4,   x3, x3; \
 127         vpxor           x2,   x3, x3; \
 128         vpor            x1,   x4, x4; \
 129         vpand           x1,   x2, x2; \
 130         vpxor           x3,   x4, x4; \
 131         vpxor           x3,   x0, x0; \
 132         vpxor           x2,   x3, x3;
 133 
 134 #define S4_1(x0, x1, x2, x3, x4)      \
 135         vpand           x0,   x3, tp; \
 136         vpxor           x3,   x0, x0; \
 137         vpxor           x2,   tp, tp; \
 138         vpor            x3,   x2, x2; \
 139         vpxor           x1,   x0, x0; \
 140         vpxor           tp,   x3, x4; \
 141         vpor            x0,   x2, x2; \
 142         vpxor           x1,   x2, x2;
 143 #define S4_2(x0, x1, x2, x3, x4)      \
 144         vpand           x0,   x1, x1; \
 145         vpxor           x4,   x1, x1; \
 146         vpand           x2,   x4, x4; \
 147         vpxor           tp,   x2, x2; \
 148         vpxor           x0,   x4, x4; \
 149         vpor            x1,   tp, x3; \
 150         vpxor           RNOT, x1, x1; \
 151         vpxor           x0,   x3, x3;
 152 
 153 #define S5_1(x0, x1, x2, x3, x4)      \
 154         vpor            x0,   x1, tp; \
 155         vpxor           tp,   x2, x2; \
 156         vpxor           RNOT, x3, x3; \
 157         vpxor           x0,   x1, x4; \
 158         vpxor           x2,   x0, x0; \
 159         vpand           x4,   tp, x1; \
 160         vpor            x3,   x4, x4; \
 161         vpxor           x0,   x4, x4;
 162 #define S5_2(x0, x1, x2, x3, x4)      \
 163         vpand           x3,   x0, x0; \
 164         vpxor           x3,   x1, x1; \
 165         vpxor           x2,   x3, x3; \
 166         vpxor           x1,   x0, x0; \
 167         vpand           x4,   x2, x2; \
 168         vpxor           x2,   x1, x1; \
 169         vpand           x0,   x2, x2; \
 170         vpxor           x2,   x3, x3;
 171 
 172 #define S6_1(x0, x1, x2, x3, x4)      \
 173         vpxor           x0,   x3, x3; \
 174         vpxor           x2,   x1, tp; \
 175         vpxor           x0,   x2, x2; \
 176         vpand           x3,   x0, x0; \
 177         vpor            x3,   tp, tp; \
 178         vpxor           RNOT, x1, x4; \
 179         vpxor           tp,   x0, x0; \
 180         vpxor           x2,   tp, x1;
 181 #define S6_2(x0, x1, x2, x3, x4)      \
 182         vpxor           x4,   x3, x3; \
 183         vpxor           x0,   x4, x4; \
 184         vpand           x0,   x2, x2; \
 185         vpxor           x1,   x4, x4; \
 186         vpxor           x3,   x2, x2; \
 187         vpand           x1,   x3, x3; \
 188         vpxor           x0,   x3, x3; \
 189         vpxor           x2,   x1, x1;
 190 
 191 #define S7_1(x0, x1, x2, x3, x4)      \
 192         vpxor           RNOT, x1, tp; \
 193         vpxor           RNOT, x0, x0; \
 194         vpand           x2,   tp, x1; \
 195         vpxor           x3,   x1, x1; \
 196         vpor            tp,   x3, x3; \
 197         vpxor           x2,   tp, x4; \
 198         vpxor           x3,   x2, x2; \
 199         vpxor           x0,   x3, x3; \
 200         vpor            x1,   x0, x0;
 201 #define S7_2(x0, x1, x2, x3, x4)      \
 202         vpand           x0,   x2, x2; \
 203         vpxor           x4,   x0, x0; \
 204         vpxor           x3,   x4, x4; \
 205         vpand           x0,   x3, x3; \
 206         vpxor           x1,   x4, x4; \
 207         vpxor           x4,   x2, x2; \
 208         vpxor           x1,   x3, x3; \
 209         vpor            x0,   x4, x4; \
 210         vpxor           x1,   x4, x4;
 211 
 212 #define SI0_1(x0, x1, x2, x3, x4)     \
 213         vpxor           x0,   x1, x1; \
 214         vpor            x1,   x3, tp; \
 215         vpxor           x1,   x3, x4; \
 216         vpxor           RNOT, x0, x0; \
 217         vpxor           tp,   x2, x2; \
 218         vpxor           x0,   tp, x3; \
 219         vpand           x1,   x0, x0; \
 220         vpxor           x2,   x0, x0;
 221 #define SI0_2(x0, x1, x2, x3, x4)     \
 222         vpand           x3,   x2, x2; \
 223         vpxor           x4,   x3, x3; \
 224         vpxor           x3,   x2, x2; \
 225         vpxor           x3,   x1, x1; \
 226         vpand           x0,   x3, x3; \
 227         vpxor           x0,   x1, x1; \
 228         vpxor           x2,   x0, x0; \
 229         vpxor           x3,   x4, x4;
 230 
 231 #define SI1_1(x0, x1, x2, x3, x4)     \
 232         vpxor           x3,   x1, x1; \
 233         vpxor           x2,   x0, tp; \
 234         vpxor           RNOT, x2, x2; \
 235         vpor            x1,   x0, x4; \
 236         vpxor           x3,   x4, x4; \
 237         vpand           x1,   x3, x3; \
 238         vpxor           x2,   x1, x1; \
 239         vpand           x4,   x2, x2;
 240 #define SI1_2(x0, x1, x2, x3, x4)     \
 241         vpxor           x1,   x4, x4; \
 242         vpor            x3,   x1, x1; \
 243         vpxor           tp,   x3, x3; \
 244         vpxor           tp,   x2, x2; \
 245         vpor            x4,   tp, x0; \
 246         vpxor           x4,   x2, x2; \
 247         vpxor           x0,   x1, x1; \
 248         vpxor           x1,   x4, x4;
 249 
 250 #define SI2_1(x0, x1, x2, x3, x4)     \
 251         vpxor           x1,   x2, x2; \
 252         vpxor           RNOT, x3, tp; \
 253         vpor            x2,   tp, tp; \
 254         vpxor           x3,   x2, x2; \
 255         vpxor           x0,   x3, x4; \
 256         vpxor           x1,   tp, x3; \
 257         vpor            x2,   x1, x1; \
 258         vpxor           x0,   x2, x2;
 259 #define SI2_2(x0, x1, x2, x3, x4)     \
 260         vpxor           x4,   x1, x1; \
 261         vpor            x3,   x4, x4; \
 262         vpxor           x3,   x2, x2; \
 263         vpxor           x2,   x4, x4; \
 264         vpand           x1,   x2, x2; \
 265         vpxor           x3,   x2, x2; \
 266         vpxor           x4,   x3, x3; \
 267         vpxor           x0,   x4, x4;
 268 
 269 #define SI3_1(x0, x1, x2, x3, x4)     \
 270         vpxor           x1,   x2, x2; \
 271         vpand           x2,   x1, tp; \
 272         vpxor           x0,   tp, tp; \
 273         vpor            x1,   x0, x0; \
 274         vpxor           x3,   x1, x4; \
 275         vpxor           x3,   x0, x0; \
 276         vpor            tp,   x3, x3; \
 277         vpxor           x2,   tp, x1;
 278 #define SI3_2(x0, x1, x2, x3, x4)     \
 279         vpxor           x3,   x1, x1; \
 280         vpxor           x2,   x0, x0; \
 281         vpxor           x3,   x2, x2; \
 282         vpand           x1,   x3, x3; \
 283         vpxor           x0,   x1, x1; \
 284         vpand           x2,   x0, x0; \
 285         vpxor           x3,   x4, x4; \
 286         vpxor           x0,   x3, x3; \
 287         vpxor           x1,   x0, x0;
 288 
 289 #define SI4_1(x0, x1, x2, x3, x4)     \
 290         vpxor           x3,   x2, x2; \
 291         vpand           x1,   x0, tp; \
 292         vpxor           x2,   tp, tp; \
 293         vpor            x3,   x2, x2; \
 294         vpxor           RNOT, x0, x4; \
 295         vpxor           tp,   x1, x1; \
 296         vpxor           x2,   tp, x0; \
 297         vpand           x4,   x2, x2;
 298 #define SI4_2(x0, x1, x2, x3, x4)     \
 299         vpxor           x0,   x2, x2; \
 300         vpor            x4,   x0, x0; \
 301         vpxor           x3,   x0, x0; \
 302         vpand           x2,   x3, x3; \
 303         vpxor           x3,   x4, x4; \
 304         vpxor           x1,   x3, x3; \
 305         vpand           x0,   x1, x1; \
 306         vpxor           x1,   x4, x4; \
 307         vpxor           x3,   x0, x0;
 308 
 309 #define SI5_1(x0, x1, x2, x3, x4)     \
 310         vpor            x2,   x1, tp; \
 311         vpxor           x1,   x2, x2; \
 312         vpxor           x3,   tp, tp; \
 313         vpand           x1,   x3, x3; \
 314         vpxor           x3,   x2, x2; \
 315         vpor            x0,   x3, x3; \
 316         vpxor           RNOT, x0, x0; \
 317         vpxor           x2,   x3, x3; \
 318         vpor            x0,   x2, x2;
 319 #define SI5_2(x0, x1, x2, x3, x4)     \
 320         vpxor           tp,   x1, x4; \
 321         vpxor           x4,   x2, x2; \
 322         vpand           x0,   x4, x4; \
 323         vpxor           tp,   x0, x0; \
 324         vpxor           x3,   tp, x1; \
 325         vpand           x2,   x0, x0; \
 326         vpxor           x3,   x2, x2; \
 327         vpxor           x2,   x0, x0; \
 328         vpxor           x4,   x2, x2; \
 329         vpxor           x3,   x4, x4;
 330 
 331 #define SI6_1(x0, x1, x2, x3, x4)     \
 332         vpxor           x2,   x0, x0; \
 333         vpand           x3,   x0, tp; \
 334         vpxor           x3,   x2, x2; \
 335         vpxor           x2,   tp, tp; \
 336         vpxor           x1,   x3, x3; \
 337         vpor            x0,   x2, x2; \
 338         vpxor           x3,   x2, x2; \
 339         vpand           tp,   x3, x3;
 340 #define SI6_2(x0, x1, x2, x3, x4)     \
 341         vpxor           RNOT, tp, tp; \
 342         vpxor           x1,   x3, x3; \
 343         vpand           x2,   x1, x1; \
 344         vpxor           tp,   x0, x4; \
 345         vpxor           x4,   x3, x3; \
 346         vpxor           x2,   x4, x4; \
 347         vpxor           x1,   tp, x0; \
 348         vpxor           x0,   x2, x2;
 349 
 350 #define SI7_1(x0, x1, x2, x3, x4)     \
 351         vpand           x0,   x3, tp; \
 352         vpxor           x2,   x0, x0; \
 353         vpor            x3,   x2, x2; \
 354         vpxor           x1,   x3, x4; \
 355         vpxor           RNOT, x0, x0; \
 356         vpor            tp,   x1, x1; \
 357         vpxor           x0,   x4, x4; \
 358         vpand           x2,   x0, x0; \
 359         vpxor           x1,   x0, x0;
 360 #define SI7_2(x0, x1, x2, x3, x4)     \
 361         vpand           x2,   x1, x1; \
 362         vpxor           x2,   tp, x3; \
 363         vpxor           x3,   x4, x4; \
 364         vpand           x3,   x2, x2; \
 365         vpor            x0,   x3, x3; \
 366         vpxor           x4,   x1, x1; \
 367         vpxor           x4,   x3, x3; \
 368         vpand           x0,   x4, x4; \
 369         vpxor           x2,   x4, x4;
 370 
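      /*
       * get_key() broadcasts the j-th 32-bit word of expanded round key i
       * (stored contiguously at the start of the context, CTX) into all four
       * dwords of an xmm register.  K2() XORs the four words of round key i
       * into both four-register groups; it performs the initial and final
       * key mixing.
       */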
 371 #define get_key(i, j, t) \
 372         vbroadcastss (4*(i)+(j))*4(CTX), t;
 373 
 374 #define K2(x0, x1, x2, x3, x4, i) \
 375         get_key(i, 0, RK0); \
 376         get_key(i, 1, RK1); \
 377         get_key(i, 2, RK2); \
 378         get_key(i, 3, RK3); \
 379         vpxor RK0,      x0 ## 1, x0 ## 1; \
 380         vpxor RK1,      x1 ## 1, x1 ## 1; \
 381         vpxor RK2,      x2 ## 1, x2 ## 1; \
 382         vpxor RK3,      x3 ## 1, x3 ## 1; \
 383                 vpxor RK0,      x0 ## 2, x0 ## 2; \
 384                 vpxor RK1,      x1 ## 2, x1 ## 2; \
 385                 vpxor RK2,      x2 ## 2, x2 ## 2; \
 386                 vpxor RK3,      x3 ## 2, x3 ## 2;
 387 
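      /*
       * LK2(): the Serpent linear transformation (rotations by 13, 3, 1, 7,
       * 5 and 22 plus XOR mixing) applied to both register groups, followed
       * by XOR of round key i.  The get_key() loads are interleaved with the
       * arithmetic to hide their latency.
       */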
 388 #define LK2(x0, x1, x2, x3, x4, i) \
 389         vpslld $13,             x0 ## 1, x4 ## 1;          \
 390         vpsrld $(32 - 13),      x0 ## 1, x0 ## 1;          \
 391         vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
 392         vpxor                   x0 ## 1, x1 ## 1, x1 ## 1; \
 393         vpslld $3,              x2 ## 1, x4 ## 1;          \
 394         vpsrld $(32 - 3),       x2 ## 1, x2 ## 1;          \
 395         vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
 396         vpxor                   x2 ## 1, x1 ## 1, x1 ## 1; \
 397                 vpslld $13,             x0 ## 2, x4 ## 2;          \
 398                 vpsrld $(32 - 13),      x0 ## 2, x0 ## 2;          \
 399                 vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
 400                 vpxor                   x0 ## 2, x1 ## 2, x1 ## 2; \
 401                 vpslld $3,              x2 ## 2, x4 ## 2;          \
 402                 vpsrld $(32 - 3),       x2 ## 2, x2 ## 2;          \
 403                 vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
 404                 vpxor                   x2 ## 2, x1 ## 2, x1 ## 2; \
 405         vpslld $1,              x1 ## 1, x4 ## 1;          \
 406         vpsrld $(32 - 1),       x1 ## 1, x1 ## 1;          \
 407         vpor                    x4 ## 1, x1 ## 1, x1 ## 1; \
 408         vpslld $3,              x0 ## 1, x4 ## 1;          \
 409         vpxor                   x2 ## 1, x3 ## 1, x3 ## 1; \
 410         vpxor                   x4 ## 1, x3 ## 1, x3 ## 1; \
 411         get_key(i, 1, RK1); \
 412                 vpslld $1,              x1 ## 2, x4 ## 2;          \
 413                 vpsrld $(32 - 1),       x1 ## 2, x1 ## 2;          \
 414                 vpor                    x4 ## 2, x1 ## 2, x1 ## 2; \
 415                 vpslld $3,              x0 ## 2, x4 ## 2;          \
 416                 vpxor                   x2 ## 2, x3 ## 2, x3 ## 2; \
 417                 vpxor                   x4 ## 2, x3 ## 2, x3 ## 2; \
 418                 get_key(i, 3, RK3); \
 419         vpslld $7,              x3 ## 1, x4 ## 1;          \
 420         vpsrld $(32 - 7),       x3 ## 1, x3 ## 1;          \
 421         vpor                    x4 ## 1, x3 ## 1, x3 ## 1; \
 422         vpslld $7,              x1 ## 1, x4 ## 1;          \
 423         vpxor                   x1 ## 1, x0 ## 1, x0 ## 1; \
 424         vpxor                   x3 ## 1, x0 ## 1, x0 ## 1; \
 425         vpxor                   x3 ## 1, x2 ## 1, x2 ## 1; \
 426         vpxor                   x4 ## 1, x2 ## 1, x2 ## 1; \
 427         get_key(i, 0, RK0); \
 428                 vpslld $7,              x3 ## 2, x4 ## 2;          \
 429                 vpsrld $(32 - 7),       x3 ## 2, x3 ## 2;          \
 430                 vpor                    x4 ## 2, x3 ## 2, x3 ## 2; \
 431                 vpslld $7,              x1 ## 2, x4 ## 2;          \
 432                 vpxor                   x1 ## 2, x0 ## 2, x0 ## 2; \
 433                 vpxor                   x3 ## 2, x0 ## 2, x0 ## 2; \
 434                 vpxor                   x3 ## 2, x2 ## 2, x2 ## 2; \
 435                 vpxor                   x4 ## 2, x2 ## 2, x2 ## 2; \
 436                 get_key(i, 2, RK2); \
 437         vpxor                   RK1, x1 ## 1, x1 ## 1;     \
 438         vpxor                   RK3, x3 ## 1, x3 ## 1;     \
 439         vpslld $5,              x0 ## 1, x4 ## 1;          \
 440         vpsrld $(32 - 5),       x0 ## 1, x0 ## 1;          \
 441         vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
 442         vpslld $22,             x2 ## 1, x4 ## 1;          \
 443         vpsrld $(32 - 22),      x2 ## 1, x2 ## 1;          \
 444         vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
 445         vpxor                   RK0, x0 ## 1, x0 ## 1;     \
 446         vpxor                   RK2, x2 ## 1, x2 ## 1;     \
 447                 vpxor                   RK1, x1 ## 2, x1 ## 2;     \
 448                 vpxor                   RK3, x3 ## 2, x3 ## 2;     \
 449                 vpslld $5,              x0 ## 2, x4 ## 2;          \
 450                 vpsrld $(32 - 5),       x0 ## 2, x0 ## 2;          \
 451                 vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
 452                 vpslld $22,             x2 ## 2, x4 ## 2;          \
 453                 vpsrld $(32 - 22),      x2 ## 2, x2 ## 2;          \
 454                 vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
 455                 vpxor                   RK0, x0 ## 2, x0 ## 2;     \
 456                 vpxor                   RK2, x2 ## 2, x2 ## 2;
 457 
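      /*
       * KL2(): decryption counterpart of LK2(): XOR round key i out of the
       * state first, then apply the inverse linear transformation (the
       * rotations of LK2 undone in reverse order).
       */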
 458 #define KL2(x0, x1, x2, x3, x4, i) \
 459         vpxor                   RK0, x0 ## 1, x0 ## 1;     \
 460         vpxor                   RK2, x2 ## 1, x2 ## 1;     \
 461         vpsrld $5,              x0 ## 1, x4 ## 1;          \
 462         vpslld $(32 - 5),       x0 ## 1, x0 ## 1;          \
 463         vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
 464         vpxor                   RK3, x3 ## 1, x3 ## 1;     \
 465         vpxor                   RK1, x1 ## 1, x1 ## 1;     \
 466         vpsrld $22,             x2 ## 1, x4 ## 1;          \
 467         vpslld $(32 - 22),      x2 ## 1, x2 ## 1;          \
 468         vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
 469         vpxor                   x3 ## 1, x2 ## 1, x2 ## 1; \
 470                 vpxor                   RK0, x0 ## 2, x0 ## 2;     \
 471                 vpxor                   RK2, x2 ## 2, x2 ## 2;     \
 472                 vpsrld $5,              x0 ## 2, x4 ## 2;          \
 473                 vpslld $(32 - 5),       x0 ## 2, x0 ## 2;          \
 474                 vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
 475                 vpxor                   RK3, x3 ## 2, x3 ## 2;     \
 476                 vpxor                   RK1, x1 ## 2, x1 ## 2;     \
 477                 vpsrld $22,             x2 ## 2, x4 ## 2;          \
 478                 vpslld $(32 - 22),      x2 ## 2, x2 ## 2;          \
 479                 vpor                    x4 ## 2, x2 ## 2, x2 ## 2; \
 480                 vpxor                   x3 ## 2, x2 ## 2, x2 ## 2; \
 481         vpxor                   x3 ## 1, x0 ## 1, x0 ## 1; \
 482         vpslld $7,              x1 ## 1, x4 ## 1;          \
 483         vpxor                   x1 ## 1, x0 ## 1, x0 ## 1; \
 484         vpxor                   x4 ## 1, x2 ## 1, x2 ## 1; \
 485         vpsrld $1,              x1 ## 1, x4 ## 1;          \
 486         vpslld $(32 - 1),       x1 ## 1, x1 ## 1;          \
 487         vpor                    x4 ## 1, x1 ## 1, x1 ## 1; \
 488                 vpxor                   x3 ## 2, x0 ## 2, x0 ## 2; \
 489                 vpslld $7,              x1 ## 2, x4 ## 2;          \
 490                 vpxor                   x1 ## 2, x0 ## 2, x0 ## 2; \
 491                 vpxor                   x4 ## 2, x2 ## 2, x2 ## 2; \
 492                 vpsrld $1,              x1 ## 2, x4 ## 2;          \
 493                 vpslld $(32 - 1),       x1 ## 2, x1 ## 2;          \
 494                 vpor                    x4 ## 2, x1 ## 2, x1 ## 2; \
 495         vpsrld $7,              x3 ## 1, x4 ## 1;          \
 496         vpslld $(32 - 7),       x3 ## 1, x3 ## 1;          \
 497         vpor                    x4 ## 1, x3 ## 1, x3 ## 1; \
 498         vpxor                   x0 ## 1, x1 ## 1, x1 ## 1; \
 499         vpslld $3,              x0 ## 1, x4 ## 1;          \
 500         vpxor                   x4 ## 1, x3 ## 1, x3 ## 1; \
 501                 vpsrld $7,              x3 ## 2, x4 ## 2;          \
 502                 vpslld $(32 - 7),       x3 ## 2, x3 ## 2;          \
 503                 vpor                    x4 ## 2, x3 ## 2, x3 ## 2; \
 504                 vpxor                   x0 ## 2, x1 ## 2, x1 ## 2; \
 505                 vpslld $3,              x0 ## 2, x4 ## 2;          \
 506                 vpxor                   x4 ## 2, x3 ## 2, x3 ## 2; \
 507         vpsrld $13,             x0 ## 1, x4 ## 1;          \
 508         vpslld $(32 - 13),      x0 ## 1, x0 ## 1;          \
 509         vpor                    x4 ## 1, x0 ## 1, x0 ## 1; \
 510         vpxor                   x2 ## 1, x1 ## 1, x1 ## 1; \
 511         vpxor                   x2 ## 1, x3 ## 1, x3 ## 1; \
 512         vpsrld $3,              x2 ## 1, x4 ## 1;          \
 513         vpslld $(32 - 3),       x2 ## 1, x2 ## 1;          \
 514         vpor                    x4 ## 1, x2 ## 1, x2 ## 1; \
 515                 vpsrld $13,             x0 ## 2, x4 ## 2;          \
 516                 vpslld $(32 - 13),      x0 ## 2, x0 ## 2;          \
 517                 vpor                    x4 ## 2, x0 ## 2, x0 ## 2; \
 518                 vpxor                   x2 ## 2, x1 ## 2, x1 ## 2; \
 519                 vpxor                   x2 ## 2, x3 ## 2, x3 ## 2; \
 520                 vpsrld $3,              x2 ## 2, x4 ## 2;          \
 521                 vpslld $(32 - 3),       x2 ## 2, x2 ## 2;          \
 522                 vpor                    x4 ## 2, x2 ## 2, x2 ## 2;
 523 
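      /*
       * S() applies one S-box to both four-register groups.  SP() does the
       * same but interleaves the get_key() loads for round key i with the
       * S-box halves, so the KL2()/K2() that follows on the decryption path
       * can consume RK0..RK3 immediately.
       */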
 524 #define S(SBOX, x0, x1, x2, x3, x4) \
 525         SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
 526         SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
 527         SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
 528         SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
 529 
 530 #define SP(SBOX, x0, x1, x2, x3, x4, i) \
 531         get_key(i, 0, RK0); \
 532         SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
 533         get_key(i, 2, RK2); \
 534         SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
 535         get_key(i, 3, RK3); \
 536         SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
 537         get_key(i, 1, RK1); \
 538         SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
 539 
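      /*
       * 4x4 transpose of 32-bit words: converts four whole blocks held
       * one-per-register into the word-sliced layout used by the round
       * macros (register n holding word n of all four blocks).  The
       * transpose is its own inverse, so read_blocks and write_blocks are
       * both simply this macro.
       */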
 540 #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 541         vpunpckldq              x1, x0, t0; \
 542         vpunpckhdq              x1, x0, t2; \
 543         vpunpckldq              x3, x2, t1; \
 544         vpunpckhdq              x3, x2, x3; \
 545         \
 546         vpunpcklqdq             t1, t0, x0; \
 547         vpunpckhqdq             t1, t0, x1; \
 548         vpunpcklqdq             x3, t2, x2; \
 549         vpunpckhqdq             x3, t2, x3;
 550 
 551 #define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
 552         transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 553 
 554 #define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
 555         transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 556 
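      /*
       * The 32 encryption rounds: the initial K2() mixes in round key 0,
       * each S()/LK2() pair applies the round's S-box followed by the linear
       * transformation and the next round key, and the final round replaces
       * the linear transformation with a plain K2() of round key 32.  The
       * shuffled register-name arguments track how each round permutes the
       * four state words, avoiding explicit register moves.
       */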
 557 .align 8
 558 __serpent_enc_blk8_avx:
 559         /* input:
 560          *      %rdi: ctx, CTX
 561          *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
 562          * output:
 563          *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 564          */
 565 
 566         vpcmpeqd RNOT, RNOT, RNOT;
 567 
 568         read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
 569         read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 570 
 571                                                  K2(RA, RB, RC, RD, RE, 0);
 572         S(S0, RA, RB, RC, RD, RE);              LK2(RC, RB, RD, RA, RE, 1);
 573         S(S1, RC, RB, RD, RA, RE);              LK2(RE, RD, RA, RC, RB, 2);
 574         S(S2, RE, RD, RA, RC, RB);              LK2(RB, RD, RE, RC, RA, 3);
 575         S(S3, RB, RD, RE, RC, RA);              LK2(RC, RA, RD, RB, RE, 4);
 576         S(S4, RC, RA, RD, RB, RE);              LK2(RA, RD, RB, RE, RC, 5);
 577         S(S5, RA, RD, RB, RE, RC);              LK2(RC, RA, RD, RE, RB, 6);
 578         S(S6, RC, RA, RD, RE, RB);              LK2(RD, RB, RA, RE, RC, 7);
 579         S(S7, RD, RB, RA, RE, RC);              LK2(RC, RA, RE, RD, RB, 8);
 580         S(S0, RC, RA, RE, RD, RB);              LK2(RE, RA, RD, RC, RB, 9);
 581         S(S1, RE, RA, RD, RC, RB);              LK2(RB, RD, RC, RE, RA, 10);
 582         S(S2, RB, RD, RC, RE, RA);              LK2(RA, RD, RB, RE, RC, 11);
 583         S(S3, RA, RD, RB, RE, RC);              LK2(RE, RC, RD, RA, RB, 12);
 584         S(S4, RE, RC, RD, RA, RB);              LK2(RC, RD, RA, RB, RE, 13);
 585         S(S5, RC, RD, RA, RB, RE);              LK2(RE, RC, RD, RB, RA, 14);
 586         S(S6, RE, RC, RD, RB, RA);              LK2(RD, RA, RC, RB, RE, 15);
 587         S(S7, RD, RA, RC, RB, RE);              LK2(RE, RC, RB, RD, RA, 16);
 588         S(S0, RE, RC, RB, RD, RA);              LK2(RB, RC, RD, RE, RA, 17);
 589         S(S1, RB, RC, RD, RE, RA);              LK2(RA, RD, RE, RB, RC, 18);
 590         S(S2, RA, RD, RE, RB, RC);              LK2(RC, RD, RA, RB, RE, 19);
 591         S(S3, RC, RD, RA, RB, RE);              LK2(RB, RE, RD, RC, RA, 20);
 592         S(S4, RB, RE, RD, RC, RA);              LK2(RE, RD, RC, RA, RB, 21);
 593         S(S5, RE, RD, RC, RA, RB);              LK2(RB, RE, RD, RA, RC, 22);
 594         S(S6, RB, RE, RD, RA, RC);              LK2(RD, RC, RE, RA, RB, 23);
 595         S(S7, RD, RC, RE, RA, RB);              LK2(RB, RE, RA, RD, RC, 24);
 596         S(S0, RB, RE, RA, RD, RC);              LK2(RA, RE, RD, RB, RC, 25);
 597         S(S1, RA, RE, RD, RB, RC);              LK2(RC, RD, RB, RA, RE, 26);
 598         S(S2, RC, RD, RB, RA, RE);              LK2(RE, RD, RC, RA, RB, 27);
 599         S(S3, RE, RD, RC, RA, RB);              LK2(RA, RB, RD, RE, RC, 28);
 600         S(S4, RA, RB, RD, RE, RC);              LK2(RB, RD, RE, RC, RA, 29);
 601         S(S5, RB, RD, RE, RC, RA);              LK2(RA, RB, RD, RC, RE, 30);
 602         S(S6, RA, RB, RD, RC, RE);              LK2(RD, RE, RB, RC, RA, 31);
 603         S(S7, RD, RE, RB, RC, RA);               K2(RA, RB, RC, RD, RE, 32);
 604 
 605         write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
 606         write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 607 
 608         ret;
 609 ENDPROC(__serpent_enc_blk8_avx)
 610 
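      /*
       * Decryption runs the rounds in reverse using the inverse S-boxes
       * SI0..SI7: K2() strips round key 32, each SP()/KL2() pair undoes one
       * round, and the final K2() removes round key 0.  The inverse round
       * permutations leave the result in the RC/RD/RB/RE registers, which is
       * why write_blocks and the callers use that register order.
       */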
 611 .align 8
 612 __serpent_dec_blk8_avx:
 613         /* input:
 614          *      %rdi: ctx, CTX
 615          *      RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
 616          * output:
 617          *      RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
 618          */
 619 
 620         vpcmpeqd RNOT, RNOT, RNOT;
 621 
 622         read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
 623         read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 624 
 625                                                  K2(RA, RB, RC, RD, RE, 32);
 626         SP(SI7, RA, RB, RC, RD, RE, 31);        KL2(RB, RD, RA, RE, RC, 31);
 627         SP(SI6, RB, RD, RA, RE, RC, 30);        KL2(RA, RC, RE, RB, RD, 30);
 628         SP(SI5, RA, RC, RE, RB, RD, 29);        KL2(RC, RD, RA, RE, RB, 29);
 629         SP(SI4, RC, RD, RA, RE, RB, 28);        KL2(RC, RA, RB, RE, RD, 28);
 630         SP(SI3, RC, RA, RB, RE, RD, 27);        KL2(RB, RC, RD, RE, RA, 27);
 631         SP(SI2, RB, RC, RD, RE, RA, 26);        KL2(RC, RA, RE, RD, RB, 26);
 632         SP(SI1, RC, RA, RE, RD, RB, 25);        KL2(RB, RA, RE, RD, RC, 25);
 633         SP(SI0, RB, RA, RE, RD, RC, 24);        KL2(RE, RC, RA, RB, RD, 24);
 634         SP(SI7, RE, RC, RA, RB, RD, 23);        KL2(RC, RB, RE, RD, RA, 23);
 635         SP(SI6, RC, RB, RE, RD, RA, 22);        KL2(RE, RA, RD, RC, RB, 22);
 636         SP(SI5, RE, RA, RD, RC, RB, 21);        KL2(RA, RB, RE, RD, RC, 21);
 637         SP(SI4, RA, RB, RE, RD, RC, 20);        KL2(RA, RE, RC, RD, RB, 20);
 638         SP(SI3, RA, RE, RC, RD, RB, 19);        KL2(RC, RA, RB, RD, RE, 19);
 639         SP(SI2, RC, RA, RB, RD, RE, 18);        KL2(RA, RE, RD, RB, RC, 18);
 640         SP(SI1, RA, RE, RD, RB, RC, 17);        KL2(RC, RE, RD, RB, RA, 17);
 641         SP(SI0, RC, RE, RD, RB, RA, 16);        KL2(RD, RA, RE, RC, RB, 16);
 642         SP(SI7, RD, RA, RE, RC, RB, 15);        KL2(RA, RC, RD, RB, RE, 15);
 643         SP(SI6, RA, RC, RD, RB, RE, 14);        KL2(RD, RE, RB, RA, RC, 14);
 644         SP(SI5, RD, RE, RB, RA, RC, 13);        KL2(RE, RC, RD, RB, RA, 13);
 645         SP(SI4, RE, RC, RD, RB, RA, 12);        KL2(RE, RD, RA, RB, RC, 12);
 646         SP(SI3, RE, RD, RA, RB, RC, 11);        KL2(RA, RE, RC, RB, RD, 11);
 647         SP(SI2, RA, RE, RC, RB, RD, 10);        KL2(RE, RD, RB, RC, RA, 10);
 648         SP(SI1, RE, RD, RB, RC, RA, 9);         KL2(RA, RD, RB, RC, RE, 9);
 649         SP(SI0, RA, RD, RB, RC, RE, 8);         KL2(RB, RE, RD, RA, RC, 8);
 650         SP(SI7, RB, RE, RD, RA, RC, 7);         KL2(RE, RA, RB, RC, RD, 7);
 651         SP(SI6, RE, RA, RB, RC, RD, 6);         KL2(RB, RD, RC, RE, RA, 6);
 652         SP(SI5, RB, RD, RC, RE, RA, 5);         KL2(RD, RA, RB, RC, RE, 5);
 653         SP(SI4, RD, RA, RB, RC, RE, 4);         KL2(RD, RB, RE, RC, RA, 4);
 654         SP(SI3, RD, RB, RE, RC, RA, 3);         KL2(RE, RD, RA, RC, RB, 3);
 655         SP(SI2, RE, RD, RA, RC, RB, 2);         KL2(RD, RB, RC, RA, RE, 2);
 656         SP(SI1, RD, RB, RC, RA, RE, 1);         KL2(RE, RB, RC, RA, RD, 1);
 657         S(SI0, RE, RB, RC, RA, RD);              K2(RC, RD, RB, RE, RA, 0);
 658 
 659         write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
 660         write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);
 661 
 662         ret;
 663 ENDPROC(__serpent_dec_blk8_avx)
 664 
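      /*
       * The exported entry points below take ctx/dst/src (and an IV where
       * applicable) per the SysV calling convention noted in each header
       * comment.  As a rough sketch only (the exact declarations live in the
       * serpent-avx glue header), the C side calls them approximately as:
       *
       *   asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx,
       *                                            u8 *dst, const u8 *src);
       *   asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx,
       *                                            u8 *dst, const u8 *src);
       *   asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx,
       *                                        u8 *dst, const u8 *src,
       *                                        le128 *iv);
       */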
 665 ENTRY(serpent_ecb_enc_8way_avx)
 666         /* input:
 667          *      %rdi: ctx, CTX
 668          *      %rsi: dst
 669          *      %rdx: src
 670          */
 671         FRAME_BEGIN
 672 
 673         load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 674 
 675         call __serpent_enc_blk8_avx;
 676 
 677         store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 678 
 679         FRAME_END
 680         ret;
 681 ENDPROC(serpent_ecb_enc_8way_avx)
 682 
 683 ENTRY(serpent_ecb_dec_8way_avx)
 684         /* input:
 685          *      %rdi: ctx, CTX
 686          *      %rsi: dst
 687          *      %rdx: src
 688          */
 689         FRAME_BEGIN
 690 
 691         load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 692 
 693         call __serpent_dec_blk8_avx;
 694 
 695         store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
 696 
 697         FRAME_END
 698         ret;
 699 ENDPROC(serpent_ecb_dec_8way_avx)
 700 
 701 ENTRY(serpent_cbc_dec_8way_avx)
 702         /* input:
 703          *      %rdi: ctx, CTX
 704          *      %rsi: dst
 705          *      %rdx: src
 706          */
 707         FRAME_BEGIN
 708 
 709         load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 710 
 711         call __serpent_dec_blk8_avx;
 712 
 713         store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
 714 
 715         FRAME_END
 716         ret;
 717 ENDPROC(serpent_cbc_dec_8way_avx)
 718 
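      /*
       * CTR: load_ctr_8way (from glue_helper-asm-avx.S) expands the
       * little-endian 128-bit counter at %rcx into eight byteswapped counter
       * blocks, updating the counter in place; the encrypted counters are
       * then XORed with the source by store_ctr_8way to produce the output.
       */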
 719 ENTRY(serpent_ctr_8way_avx)
 720         /* input:
 721          *      %rdi: ctx, CTX
 722          *      %rsi: dst
 723          *      %rdx: src
  724          *      %rcx: iv (little endian, 128-bit)

 725          */
 726         FRAME_BEGIN
 727 
 728         load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
 729                       RD2, RK0, RK1, RK2);
 730 
 731         call __serpent_enc_blk8_avx;
 732 
 733         store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 734 
 735         FRAME_END
 736         ret;
 737 ENDPROC(serpent_ctr_8way_avx)
 738 
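      /*
       * XTS: load_xts_8way derives the eight consecutive tweaks from the
       * tweak in %rcx by repeated GF(2^128) multiplication with α (using
       * .Lxts_gf128mul_and_shl1_mask), parks them in dst, and XORs them into
       * the loaded source blocks; store_xts_8way XORs the tweaks back after
       * the block-cipher pass.  The same scheme is used by both the
       * encryption and decryption entry points.
       */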
 739 ENTRY(serpent_xts_enc_8way_avx)
 740         /* input:
 741          *      %rdi: ctx, CTX
 742          *      %rsi: dst
 743          *      %rdx: src
 744          *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
 745          */
 746         FRAME_BEGIN
 747 
 748         /* regs <= src, dst <= IVs, regs <= regs xor IVs */
 749         load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
 750                       RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
 751 
 752         call __serpent_enc_blk8_avx;
 753 
 754         /* dst <= regs xor IVs(in dst) */
 755         store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);
 756 
 757         FRAME_END
 758         ret;
 759 ENDPROC(serpent_xts_enc_8way_avx)
 760 
 761 ENTRY(serpent_xts_dec_8way_avx)
 762         /* input:
 763          *      %rdi: ctx, CTX
 764          *      %rsi: dst
 765          *      %rdx: src
 766          *      %rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
 767          */
 768         FRAME_BEGIN
 769 
 770         /* regs <= src, dst <= IVs, regs <= regs xor IVs */
 771         load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
 772                       RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);
 773 
 774         call __serpent_dec_blk8_avx;
 775 
 776         /* dst <= regs xor IVs(in dst) */
 777         store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);
 778 
 779         FRAME_END
 780         ret;
 781 ENDPROC(serpent_xts_dec_8way_avx)
