root/arch/x86/crypto/serpent-sse2-x86_64-asm_64.S

/* [<][>][^][v][top][bottom][index][help] */
   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * Serpent Cipher 8-way parallel algorithm (x86_64/SSE2)
   4  *
   5  * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
   6  *
   7  * Based on crypto/serpent.c by
   8  *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
   9  *                2003 Herbert Valerio Riedel <hvr@gnu.org>
  10  */
  11 
  12 #include <linux/linkage.h>
  13 
  14 .file "serpent-sse2-x86_64-asm_64.S"
  15 .text
  16 
  /*
   * CTX: base register for round-key loads (see get_key). Both entry
   * points receive the context pointer in %rdi.
   */
  17 #define CTX %rdi
  18 
  19 /**********************************************************************
  20   8-way SSE2 serpent
  21  **********************************************************************/
  /*
   * Two sets of five working registers. The multi-set macros (K2, LK2,
   * KL2, S, SP) take the base names RA..RE and paste the suffix 1 or 2
   * onto them to address the first or second set.
   */
  22 #define RA1 %xmm0
  23 #define RB1 %xmm1
  24 #define RC1 %xmm2
  25 #define RD1 %xmm3
  26 #define RE1 %xmm4
  27 
  28 #define RA2 %xmm5
  29 #define RB2 %xmm6
  30 #define RC2 %xmm7
  31 #define RD2 %xmm8
  32 #define RE2 %xmm9
  33 
  /*
   * RNOT is set to all-ones (pcmpeqd RNOT, RNOT) at the top of both entry
   * points; "pxor RNOT, r" therefore complements r.
   */
  34 #define RNOT %xmm10
  35 
  /*
   * RK0..RK3 receive the four 32-bit words of one round key (loaded and
   * broadcast by get_key). The entry points also pass RK0..RK2 as
   * temporaries to read_blocks/write_blocks/xor_blocks.
   */
  36 #define RK0 %xmm11
  37 #define RK1 %xmm12
  38 #define RK2 %xmm13
  39 #define RK3 %xmm14
  40 
  /*
   * S0_1 / S0_2: first and second half of S-box S0, written as a fixed
   * sequence of movdqa/pand/por/pxor on five registers. S0_2 must be
   * applied after S0_1 to the same registers (see S() and SP()).
   * x4 is overwritten by the movdqa before it is read, so its incoming
   * value does not matter. "pxor RNOT, r" complements r.
   * In the encryption round table the following stage receives
   * (x2, x1, x3, x0) as its x0..x3, with x4 as its scratch register.
   */
  41 #define S0_1(x0, x1, x2, x3, x4) \
  42         movdqa x3,              x4; \
  43         por x0,                 x3; \
  44         pxor x4,                x0; \
  45         pxor x2,                x4; \
  46         pxor RNOT,              x4; \
  47         pxor x1,                x3; \
  48         pand x0,                x1; \
  49         pxor x4,                x1; \
  50         pxor x0,                x2;
  /* Second half of S0; continues from the register state left by S0_1. */
  51 #define S0_2(x0, x1, x2, x3, x4) \
  52         pxor x3,                x0; \
  53         por x0,                 x4; \
  54         pxor x2,                x0; \
  55         pand x1,                x2; \
  56         pxor x2,                x3; \
  57         pxor RNOT,              x1; \
  58         pxor x4,                x2; \
  59         pxor x2,                x1;
  60 
  /*
   * S1_1 / S1_2: first and second half of S-box S1 as a fixed bitwise
   * instruction sequence. S1_2 must follow S1_1 on the same registers.
   * x4 is written (movdqa) before it is read. "pxor RNOT, r" complements r.
   * In the encryption round table the following stage receives
   * (x4, x2, x3, x0) as its x0..x3, with x1 as its scratch register.
   */
  61 #define S1_1(x0, x1, x2, x3, x4) \
  62         movdqa x1,              x4; \
  63         pxor x0,                x1; \
  64         pxor x3,                x0; \
  65         pxor RNOT,              x3; \
  66         pand x1,                x4; \
  67         por x1,                 x0; \
  68         pxor x2,                x3; \
  69         pxor x3,                x0; \
  70         pxor x3,                x1;
  /* Second half of S1; continues from the register state left by S1_1. */
  71 #define S1_2(x0, x1, x2, x3, x4) \
  72         pxor x4,                x3; \
  73         por x4,                 x1; \
  74         pxor x2,                x4; \
  75         pand x0,                x2; \
  76         pxor x1,                x2; \
  77         por x0,                 x1; \
  78         pxor RNOT,              x0; \
  79         pxor x2,                x0; \
  80         pxor x1,                x4;
  81 
  /*
   * S2_1 / S2_2: first and second half of S-box S2 as a fixed bitwise
   * instruction sequence. S2_2 must follow S2_1 on the same registers.
   * x4 is written (movdqa) before it is read. "pxor RNOT, r" complements r.
   * In the encryption round table the following stage receives
   * (x4, x1, x0, x3) as its x0..x3, with x2 as its scratch register.
   */
  82 #define S2_1(x0, x1, x2, x3, x4) \
  83         pxor RNOT,              x3; \
  84         pxor x0,                x1; \
  85         movdqa x0,              x4; \
  86         pand x2,                x0; \
  87         pxor x3,                x0; \
  88         por x4,                 x3; \
  89         pxor x1,                x2; \
  90         pxor x1,                x3; \
  91         pand x0,                x1;
  /* Second half of S2; continues from the register state left by S2_1. */
  92 #define S2_2(x0, x1, x2, x3, x4) \
  93         pxor x2,                x0; \
  94         pand x3,                x2; \
  95         por x1,                 x3; \
  96         pxor RNOT,              x0; \
  97         pxor x0,                x3; \
  98         pxor x0,                x4; \
  99         pxor x2,                x0; \
 100         por x2,                 x1;
 101 
 /*
  * S3_1 / S3_2: first and second half of S-box S3 as a fixed bitwise
  * instruction sequence. S3_2 must follow S3_1 on the same registers.
  * x4 is written (movdqa) before it is read. This S-box does not use RNOT.
  * In the encryption round table the following stage receives
  * (x3, x4, x1, x0) as its x0..x3, with x2 as its scratch register.
  */
 102 #define S3_1(x0, x1, x2, x3, x4) \
 103         movdqa x1,              x4; \
 104         pxor x3,                x1; \
 105         por x0,                 x3; \
 106         pand x0,                x4; \
 107         pxor x2,                x0; \
 108         pxor x1,                x2; \
 109         pand x3,                x1; \
 110         pxor x3,                x2; \
 111         por x4,                 x0; \
 112         pxor x3,                x4;
 /* Second half of S3; continues from the register state left by S3_1. */
 113 #define S3_2(x0, x1, x2, x3, x4) \
 114         pxor x0,                x1; \
 115         pand x3,                x0; \
 116         pand x4,                x3; \
 117         pxor x2,                x3; \
 118         por x1,                 x4; \
 119         pand x1,                x2; \
 120         pxor x3,                x4; \
 121         pxor x3,                x0; \
 122         pxor x2,                x3;
 123 
 /*
  * S4_1 / S4_2: first and second half of S-box S4 as a fixed bitwise
  * instruction sequence. S4_2 must follow S4_1 on the same registers.
  * x4 is written (movdqa) before it is read. "pxor RNOT, r" complements r.
  * In the encryption round table the following stage receives
  * (x1, x2, x3, x4) as its x0..x3, with x0 as its scratch register.
  */
 124 #define S4_1(x0, x1, x2, x3, x4) \
 125         movdqa x3,              x4; \
 126         pand x0,                x3; \
 127         pxor x4,                x0; \
 128         pxor x2,                x3; \
 129         por x4,                 x2; \
 130         pxor x1,                x0; \
 131         pxor x3,                x4; \
 132         por x0,                 x2; \
 133         pxor x1,                x2;
 /* Second half of S4; continues from the register state left by S4_1. */
 134 #define S4_2(x0, x1, x2, x3, x4) \
 135         pand x0,                x1; \
 136         pxor x4,                x1; \
 137         pand x2,                x4; \
 138         pxor x3,                x2; \
 139         pxor x0,                x4; \
 140         por x1,                 x3; \
 141         pxor RNOT,              x1; \
 142         pxor x0,                x3;
 143 
 /*
  * S5_1 / S5_2: first and second half of S-box S5 as a fixed bitwise
  * instruction sequence. S5_2 must follow S5_1 on the same registers.
  * x4 is written (movdqa) before it is read. "pxor RNOT, r" complements r.
  * In the encryption round table the following stage receives
  * (x4, x0, x1, x3) as its x0..x3, with x2 as its scratch register.
  */
 144 #define S5_1(x0, x1, x2, x3, x4) \
 145         movdqa x1,              x4; \
 146         por x0,                 x1; \
 147         pxor x1,                x2; \
 148         pxor RNOT,              x3; \
 149         pxor x0,                x4; \
 150         pxor x2,                x0; \
 151         pand x4,                x1; \
 152         por x3,                 x4; \
 153         pxor x0,                x4;
 /* Second half of S5; continues from the register state left by S5_1. */
 154 #define S5_2(x0, x1, x2, x3, x4) \
 155         pand x3,                x0; \
 156         pxor x3,                x1; \
 157         pxor x2,                x3; \
 158         pxor x1,                x0; \
 159         pand x4,                x2; \
 160         pxor x2,                x1; \
 161         pand x0,                x2; \
 162         pxor x2,                x3;
 163 
 /*
  * S6_1 / S6_2: first and second half of S-box S6 as a fixed bitwise
  * instruction sequence. S6_2 must follow S6_1 on the same registers.
  * x4 is written (movdqa) before it is read. "pxor RNOT, r" complements r.
  * In the encryption round table the following stage receives
  * (x2, x4, x1, x3) as its x0..x3, with x0 as its scratch register.
  */
 164 #define S6_1(x0, x1, x2, x3, x4) \
 165         movdqa x1,              x4; \
 166         pxor x0,                x3; \
 167         pxor x2,                x1; \
 168         pxor x0,                x2; \
 169         pand x3,                x0; \
 170         por x3,                 x1; \
 171         pxor RNOT,              x4; \
 172         pxor x1,                x0; \
 173         pxor x2,                x1;
 /* Second half of S6; continues from the register state left by S6_1. */
 174 #define S6_2(x0, x1, x2, x3, x4) \
 175         pxor x4,                x3; \
 176         pxor x0,                x4; \
 177         pand x0,                x2; \
 178         pxor x1,                x4; \
 179         pxor x3,                x2; \
 180         pand x1,                x3; \
 181         pxor x0,                x3; \
 182         pxor x2,                x1;
 183 
 /*
  * S7_1 / S7_2: first and second half of S-box S7 as a fixed bitwise
  * instruction sequence. S7_2 must follow S7_1 on the same registers.
  * x4 is written (movdqa) before it is read. "pxor RNOT, r" complements r.
  * In the encryption round table the following stage receives
  * (x4, x2, x3, x0) as its x0..x3, with x1 as its scratch register.
  */
 184 #define S7_1(x0, x1, x2, x3, x4) \
 185         pxor RNOT,              x1; \
 186         movdqa x1,              x4; \
 187         pxor RNOT,              x0; \
 188         pand x2,                x1; \
 189         pxor x3,                x1; \
 190         por x4,                 x3; \
 191         pxor x2,                x4; \
 192         pxor x3,                x2; \
 193         pxor x0,                x3; \
 194         por x1,                 x0;
 /* Second half of S7; continues from the register state left by S7_1. */
 195 #define S7_2(x0, x1, x2, x3, x4) \
 196         pand x0,                x2; \
 197         pxor x4,                x0; \
 198         pxor x3,                x4; \
 199         pand x0,                x3; \
 200         pxor x1,                x4; \
 201         pxor x4,                x2; \
 202         pxor x1,                x3; \
 203         por x0,                 x4; \
 204         pxor x1,                x4;
 205 
 /*
  * SI0_1 / SI0_2: first and second half of S-box SI0 (used only by the
  * decryption routine) as a fixed bitwise instruction sequence. SI0_2
  * must follow SI0_1 on the same registers.
  * x4 is written (movdqa) before it is read. "pxor RNOT, r" complements r.
  * In the decryption round table the following stage receives
  * (x2, x4, x1, x0) as its x0..x3, with x3 as its scratch register.
  */
 206 #define SI0_1(x0, x1, x2, x3, x4) \
 207         movdqa x3,              x4; \
 208         pxor x0,                x1; \
 209         por x1,                 x3; \
 210         pxor x1,                x4; \
 211         pxor RNOT,              x0; \
 212         pxor x3,                x2; \
 213         pxor x0,                x3; \
 214         pand x1,                x0; \
 215         pxor x2,                x0;
 /* Second half of SI0; continues from the register state left by SI0_1. */
 216 #define SI0_2(x0, x1, x2, x3, x4) \
 217         pand x3,                x2; \
 218         pxor x4,                x3; \
 219         pxor x3,                x2; \
 220         pxor x3,                x1; \
 221         pand x0,                x3; \
 222         pxor x0,                x1; \
 223         pxor x2,                x0; \
 224         pxor x3,                x4;
 225 
 /*
  * SI1_1 / SI1_2: first and second half of S-box SI1 (used only by the
  * decryption routine) as a fixed bitwise instruction sequence. SI1_2
  * must follow SI1_1 on the same registers.
  * x4 is written (movdqa) before it is read. "pxor RNOT, r" complements r.
  * In the decryption round table the following stage receives
  * (x4, x1, x2, x3) as its x0..x3, with x0 as its scratch register.
  */
 226 #define SI1_1(x0, x1, x2, x3, x4) \
 227         pxor x3,                x1; \
 228         movdqa x0,              x4; \
 229         pxor x2,                x0; \
 230         pxor RNOT,              x2; \
 231         por x1,                 x4; \
 232         pxor x3,                x4; \
 233         pand x1,                x3; \
 234         pxor x2,                x1; \
 235         pand x4,                x2;
 /* Second half of SI1; continues from the register state left by SI1_1. */
 236 #define SI1_2(x0, x1, x2, x3, x4) \
 237         pxor x1,                x4; \
 238         por x3,                 x1; \
 239         pxor x0,                x3; \
 240         pxor x0,                x2; \
 241         por x4,                 x0; \
 242         pxor x4,                x2; \
 243         pxor x0,                x1; \
 244         pxor x1,                x4;
 245 
 /*
  * SI2_1 / SI2_2: first and second half of S-box SI2 (used only by the
  * decryption routine) as a fixed bitwise instruction sequence. SI2_2
  * must follow SI2_1 on the same registers.
  * x4 is written (movdqa) before it is read. "pxor RNOT, r" complements r.
  * In the decryption round table the following stage receives
  * (x1, x4, x3, x2) as its x0..x3, with x0 as its scratch register.
  */
 246 #define SI2_1(x0, x1, x2, x3, x4) \
 247         pxor x1,                x2; \
 248         movdqa x3,              x4; \
 249         pxor RNOT,              x3; \
 250         por x2,                 x3; \
 251         pxor x4,                x2; \
 252         pxor x0,                x4; \
 253         pxor x1,                x3; \
 254         por x2,                 x1; \
 255         pxor x0,                x2;
 /* Second half of SI2; continues from the register state left by SI2_1. */
 256 #define SI2_2(x0, x1, x2, x3, x4) \
 257         pxor x4,                x1; \
 258         por x3,                 x4; \
 259         pxor x3,                x2; \
 260         pxor x2,                x4; \
 261         pand x1,                x2; \
 262         pxor x3,                x2; \
 263         pxor x4,                x3; \
 264         pxor x0,                x4;
 265 
 /*
  * SI3_1 / SI3_2: first and second half of S-box SI3 (used only by the
  * decryption routine) as a fixed bitwise instruction sequence. SI3_2
  * must follow SI3_1 on the same registers.
  * x4 is written (movdqa) before it is read. This S-box does not use RNOT.
  * In the decryption round table the following stage receives
  * (x2, x0, x4, x3) as its x0..x3, with x1 as its scratch register.
  */
 266 #define SI3_1(x0, x1, x2, x3, x4) \
 267         pxor x1,                x2; \
 268         movdqa x1,              x4; \
 269         pand x2,                x1; \
 270         pxor x0,                x1; \
 271         por x4,                 x0; \
 272         pxor x3,                x4; \
 273         pxor x3,                x0; \
 274         por x1,                 x3; \
 275         pxor x2,                x1;
 /* Second half of SI3; continues from the register state left by SI3_1. */
 276 #define SI3_2(x0, x1, x2, x3, x4) \
 277         pxor x3,                x1; \
 278         pxor x2,                x0; \
 279         pxor x3,                x2; \
 280         pand x1,                x3; \
 281         pxor x0,                x1; \
 282         pand x2,                x0; \
 283         pxor x3,                x4; \
 284         pxor x0,                x3; \
 285         pxor x1,                x0;
 286 
 /*
  * SI4_1 / SI4_2: first and second half of S-box SI4 (used only by the
  * decryption routine) as a fixed bitwise instruction sequence. SI4_2
  * must follow SI4_1 on the same registers.
  * x4 is written (movdqa) before it is read. "pxor RNOT, r" complements r.
  * In the decryption round table the following stage receives
  * (x0, x2, x4, x3) as its x0..x3, with x1 as its scratch register.
  */
 287 #define SI4_1(x0, x1, x2, x3, x4) \
 288         pxor x3,                x2; \
 289         movdqa x0,              x4; \
 290         pand x1,                x0; \
 291         pxor x2,                x0; \
 292         por x3,                 x2; \
 293         pxor RNOT,              x4; \
 294         pxor x0,                x1; \
 295         pxor x2,                x0; \
 296         pand x4,                x2;
 /* Second half of SI4; continues from the register state left by SI4_1. */
 297 #define SI4_2(x0, x1, x2, x3, x4) \
 298         pxor x0,                x2; \
 299         por x4,                 x0; \
 300         pxor x3,                x0; \
 301         pand x2,                x3; \
 302         pxor x3,                x4; \
 303         pxor x1,                x3; \
 304         pand x0,                x1; \
 305         pxor x1,                x4; \
 306         pxor x3,                x0;
 307 
 /*
  * SI5_1 / SI5_2: first and second half of S-box SI5 (used only by the
  * decryption routine) as a fixed bitwise instruction sequence. SI5_2
  * must follow SI5_1 on the same registers.
  * x4 is written (movdqa) before it is read. "pxor RNOT, r" complements r.
  * In the decryption round table the following stage receives
  * (x1, x4, x0, x2) as its x0..x3, with x3 as its scratch register.
  */
 308 #define SI5_1(x0, x1, x2, x3, x4) \
 309         movdqa x1,              x4; \
 310         por x2,                 x1; \
 311         pxor x4,                x2; \
 312         pxor x3,                x1; \
 313         pand x4,                x3; \
 314         pxor x3,                x2; \
 315         por x0,                 x3; \
 316         pxor RNOT,              x0; \
 317         pxor x2,                x3; \
 318         por x0,                 x2;
 /* Second half of SI5; continues from the register state left by SI5_1. */
 319 #define SI5_2(x0, x1, x2, x3, x4) \
 320         pxor x1,                x4; \
 321         pxor x4,                x2; \
 322         pand x0,                x4; \
 323         pxor x1,                x0; \
 324         pxor x3,                x1; \
 325         pand x2,                x0; \
 326         pxor x3,                x2; \
 327         pxor x2,                x0; \
 328         pxor x4,                x2; \
 329         pxor x3,                x4;
 330 
 /*
  * SI6_1 / SI6_2: first and second half of S-box SI6 (used only by the
  * decryption routine) as a fixed bitwise instruction sequence. SI6_2
  * must follow SI6_1 on the same registers.
  * x4 is written (movdqa) before it is read. "pxor RNOT, r" complements r.
  * In the decryption round table the following stage receives
  * (x2, x4, x3, x0) as its x0..x3, with x1 as its scratch register.
  */
 331 #define SI6_1(x0, x1, x2, x3, x4) \
 332         pxor x2,                x0; \
 333         movdqa x0,              x4; \
 334         pand x3,                x0; \
 335         pxor x3,                x2; \
 336         pxor x2,                x0; \
 337         pxor x1,                x3; \
 338         por x4,                 x2; \
 339         pxor x3,                x2; \
 340         pand x0,                x3;
 /* Second half of SI6; continues from the register state left by SI6_1. */
 341 #define SI6_2(x0, x1, x2, x3, x4) \
 342         pxor RNOT,              x0; \
 343         pxor x1,                x3; \
 344         pand x2,                x1; \
 345         pxor x0,                x4; \
 346         pxor x4,                x3; \
 347         pxor x2,                x4; \
 348         pxor x1,                x0; \
 349         pxor x0,                x2;
 350 
 /*
  * SI7_1 / SI7_2: first and second half of S-box SI7 (used only by the
  * decryption routine) as a fixed bitwise instruction sequence. SI7_2
  * must follow SI7_1 on the same registers.
  * x4 is written (movdqa) before it is read. "pxor RNOT, r" complements r.
  * In the decryption round table the following stage receives
  * (x1, x3, x0, x4) as its x0..x3, with x2 as its scratch register.
  */
 351 #define SI7_1(x0, x1, x2, x3, x4) \
 352         movdqa x3,              x4; \
 353         pand x0,                x3; \
 354         pxor x2,                x0; \
 355         por x4,                 x2; \
 356         pxor x1,                x4; \
 357         pxor RNOT,              x0; \
 358         por x3,                 x1; \
 359         pxor x0,                x4; \
 360         pand x2,                x0; \
 361         pxor x1,                x0;
 /* Second half of SI7; continues from the register state left by SI7_1. */
 362 #define SI7_2(x0, x1, x2, x3, x4) \
 363         pand x2,                x1; \
 364         pxor x2,                x3; \
 365         pxor x3,                x4; \
 366         pand x3,                x2; \
 367         por x0,                 x3; \
 368         pxor x4,                x1; \
 369         pxor x4,                x3; \
 370         pand x0,                x4; \
 371         pxor x2,                x4;
 372 
 /*
  * get_key(i, j, t): load one 32-bit round-key word into t and replicate
  * it to all four dword lanes.
  *   i - round-key index; each round key occupies four 32-bit words
  *   j - word within that round key (0..3 at every use in this file)
  *   t - destination xmm register
  * The word is read from byte offset (4*i + j)*4 relative to CTX.
  * "pshufd $0" copies dword 0 of t into every lane.
  */
 373 #define get_key(i, j, t) \
 374         movd (4*(i)+(j))*4(CTX), t; \
 375         pshufd $0, t, t;
 376 
 /*
  * K2(x0, x1, x2, x3, x4, i): round-key addition for both register sets.
  * Loads the four words of round key i into RK0..RK3, then XORs RKn into
  * xn of set 1 and of set 2 (the suffix 1/2 is pasted onto each name).
  * x4 is accepted for signature symmetry with LK2/KL2 but is not
  * referenced. Overwrites RK0..RK3.
  */
 377 #define K2(x0, x1, x2, x3, x4, i) \
 378         get_key(i, 0, RK0); \
 379         get_key(i, 1, RK1); \
 380         get_key(i, 2, RK2); \
 381         get_key(i, 3, RK3); \
 382         pxor RK0,               x0 ## 1; \
 383         pxor RK1,               x1 ## 1; \
 384         pxor RK2,               x2 ## 1; \
 385         pxor RK3,               x3 ## 1; \
 386                 pxor RK0,               x0 ## 2; \
 387                 pxor RK1,               x1 ## 2; \
 388                 pxor RK2,               x2 ## 2; \
 389                 pxor RK3,               x3 ## 2;
 390 
 /*
  * LK2(x0, x1, x2, x3, x4, i): linear transformation followed by the
  * addition of round key i, for both register sets. Used between S-boxes
  * in the encryption routine.
  *
  * Per register set (each 32-bit lane independently), in this order:
  *   x0 = rol(x0, 13);  x2 = rol(x2, 3);   x1 ^= x0 ^ x2;
  *   x1 = rol(x1, 1);   x3 ^= x2 ^ (x0 << 3);
  *   x3 = rol(x3, 7);   x0 ^= x1 ^ x3;     x2 ^= x3 ^ (x1 << 7);
  *   x1 ^= RK1;         x3 ^= RK3;
  *   x0 = rol(x0, 5);   x2 = rol(x2, 22);
  *   x0 ^= RK0;         x2 ^= RK2;
  *
  * Each rotate is built from a pslld/psrld pair joined with por, using
  * x4 as the temporary, so x4 of both sets is clobbered.
  *
  * The more deeply indented lines operate on set 2 and are interleaved
  * with the set-1 lines. The four get_key loads for round key i are
  * spread through the body and complete before the first use of
  * RK0..RK3; RK0..RK3 are overwritten.
  */
 391 #define LK2(x0, x1, x2, x3, x4, i) \
 392         movdqa x0 ## 1,         x4 ## 1; \
 393         pslld $13,              x0 ## 1; \
 394         psrld $(32 - 13),       x4 ## 1; \
 395         por x4 ## 1,            x0 ## 1; \
 396         pxor x0 ## 1,           x1 ## 1; \
 397         movdqa x2 ## 1,         x4 ## 1; \
 398         pslld $3,               x2 ## 1; \
 399         psrld $(32 - 3),        x4 ## 1; \
 400         por x4 ## 1,            x2 ## 1; \
 401         pxor x2 ## 1,           x1 ## 1; \
 402                 movdqa x0 ## 2,         x4 ## 2; \
 403                 pslld $13,              x0 ## 2; \
 404                 psrld $(32 - 13),       x4 ## 2; \
 405                 por x4 ## 2,            x0 ## 2; \
 406                 pxor x0 ## 2,           x1 ## 2; \
 407                 movdqa x2 ## 2,         x4 ## 2; \
 408                 pslld $3,               x2 ## 2; \
 409                 psrld $(32 - 3),        x4 ## 2; \
 410                 por x4 ## 2,            x2 ## 2; \
 411                 pxor x2 ## 2,           x1 ## 2; \
 412         movdqa x1 ## 1,         x4 ## 1; \
 413         pslld $1,               x1 ## 1; \
 414         psrld $(32 - 1),        x4 ## 1; \
 415         por x4 ## 1,            x1 ## 1; \
 416         movdqa x0 ## 1,         x4 ## 1; \
 417         pslld $3,               x4 ## 1; \
 418         pxor x2 ## 1,           x3 ## 1; \
 419         pxor x4 ## 1,           x3 ## 1; \
 420         movdqa x3 ## 1,         x4 ## 1; \
 421         get_key(i, 1, RK1); \
 422                 movdqa x1 ## 2,         x4 ## 2; \
 423                 pslld $1,               x1 ## 2; \
 424                 psrld $(32 - 1),        x4 ## 2; \
 425                 por x4 ## 2,            x1 ## 2; \
 426                 movdqa x0 ## 2,         x4 ## 2; \
 427                 pslld $3,               x4 ## 2; \
 428                 pxor x2 ## 2,           x3 ## 2; \
 429                 pxor x4 ## 2,           x3 ## 2; \
 430                 movdqa x3 ## 2,         x4 ## 2; \
 431                 get_key(i, 3, RK3); \
 432         pslld $7,               x3 ## 1; \
 433         psrld $(32 - 7),        x4 ## 1; \
 434         por x4 ## 1,            x3 ## 1; \
 435         movdqa x1 ## 1,         x4 ## 1; \
 436         pslld $7,               x4 ## 1; \
 437         pxor x1 ## 1,           x0 ## 1; \
 438         pxor x3 ## 1,           x0 ## 1; \
 439         pxor x3 ## 1,           x2 ## 1; \
 440         pxor x4 ## 1,           x2 ## 1; \
 441         get_key(i, 0, RK0); \
 442                 pslld $7,               x3 ## 2; \
 443                 psrld $(32 - 7),        x4 ## 2; \
 444                 por x4 ## 2,            x3 ## 2; \
 445                 movdqa x1 ## 2,         x4 ## 2; \
 446                 pslld $7,               x4 ## 2; \
 447                 pxor x1 ## 2,           x0 ## 2; \
 448                 pxor x3 ## 2,           x0 ## 2; \
 449                 pxor x3 ## 2,           x2 ## 2; \
 450                 pxor x4 ## 2,           x2 ## 2; \
 451                 get_key(i, 2, RK2); \
 452         pxor RK1,               x1 ## 1; \
 453         pxor RK3,               x3 ## 1; \
 454         movdqa x0 ## 1,         x4 ## 1; \
 455         pslld $5,               x0 ## 1; \
 456         psrld $(32 - 5),        x4 ## 1; \
 457         por x4 ## 1,            x0 ## 1; \
 458         movdqa x2 ## 1,         x4 ## 1; \
 459         pslld $22,              x2 ## 1; \
 460         psrld $(32 - 22),       x4 ## 1; \
 461         por x4 ## 1,            x2 ## 1; \
 462         pxor RK0,               x0 ## 1; \
 463         pxor RK2,               x2 ## 1; \
 464                 pxor RK1,               x1 ## 2; \
 465                 pxor RK3,               x3 ## 2; \
 466                 movdqa x0 ## 2,         x4 ## 2; \
 467                 pslld $5,               x0 ## 2; \
 468                 psrld $(32 - 5),        x4 ## 2; \
 469                 por x4 ## 2,            x0 ## 2; \
 470                 movdqa x2 ## 2,         x4 ## 2; \
 471                 pslld $22,              x2 ## 2; \
 472                 psrld $(32 - 22),       x4 ## 2; \
 473                 por x4 ## 2,            x2 ## 2; \
 474                 pxor RK0,               x0 ## 2; \
 475                 pxor RK2,               x2 ## 2;
 476 
 /*
  * KL2(x0, x1, x2, x3, x4, i): round-key addition followed by the inverse
  * linear transformation, for both register sets. Used between S-boxes
  * in the decryption routine.
  *
  * This macro does NOT load the key: the i argument is not referenced in
  * the body, and RK0..RK3 must already hold the round key to be added.
  * In this file every KL2(..., i) is preceded by SP(..., i), which
  * performs the loads.
  *
  * Per register set (each 32-bit lane independently), in this order:
  *   x0 ^= RK0;  x2 ^= RK2;  x0 = ror(x0, 5);
  *   x3 ^= RK3;  x1 ^= RK1;  x2 = ror(x2, 22);
  *   x2 ^= x3 ^ (x1 << 7);   x0 ^= x3 ^ x1;
  *   x1 = ror(x1, 1);        x3 = ror(x3, 7);
  *   x1 ^= x0;               x3 ^= (x0 << 3);
  *   x0 = ror(x0, 13);       x1 ^= x2;  x3 ^= x2;
  *   x2 = ror(x2, 3);
  *
  * Each rotate is built from a psrld/pslld pair joined with por, using
  * x4 as the temporary, so x4 of both sets is clobbered. The more deeply
  * indented lines operate on set 2 and are interleaved with set 1.
  */
 477 #define KL2(x0, x1, x2, x3, x4, i) \
 478         pxor RK0,               x0 ## 1; \
 479         pxor RK2,               x2 ## 1; \
 480         movdqa x0 ## 1,         x4 ## 1; \
 481         psrld $5,               x0 ## 1; \
 482         pslld $(32 - 5),        x4 ## 1; \
 483         por x4 ## 1,            x0 ## 1; \
 484         pxor RK3,               x3 ## 1; \
 485         pxor RK1,               x1 ## 1; \
 486         movdqa x2 ## 1,         x4 ## 1; \
 487         psrld $22,              x2 ## 1; \
 488         pslld $(32 - 22),       x4 ## 1; \
 489         por x4 ## 1,            x2 ## 1; \
 490         pxor x3 ## 1,           x2 ## 1; \
 491                 pxor RK0,               x0 ## 2; \
 492                 pxor RK2,               x2 ## 2; \
 493                 movdqa x0 ## 2,         x4 ## 2; \
 494                 psrld $5,               x0 ## 2; \
 495                 pslld $(32 - 5),        x4 ## 2; \
 496                 por x4 ## 2,            x0 ## 2; \
 497                 pxor RK3,               x3 ## 2; \
 498                 pxor RK1,               x1 ## 2; \
 499                 movdqa x2 ## 2,         x4 ## 2; \
 500                 psrld $22,              x2 ## 2; \
 501                 pslld $(32 - 22),       x4 ## 2; \
 502                 por x4 ## 2,            x2 ## 2; \
 503                 pxor x3 ## 2,           x2 ## 2; \
 504         pxor x3 ## 1,           x0 ## 1; \
 505         movdqa x1 ## 1,         x4 ## 1; \
 506         pslld $7,               x4 ## 1; \
 507         pxor x1 ## 1,           x0 ## 1; \
 508         pxor x4 ## 1,           x2 ## 1; \
 509         movdqa x1 ## 1,         x4 ## 1; \
 510         psrld $1,               x1 ## 1; \
 511         pslld $(32 - 1),        x4 ## 1; \
 512         por x4 ## 1,            x1 ## 1; \
 513                 pxor x3 ## 2,           x0 ## 2; \
 514                 movdqa x1 ## 2,         x4 ## 2; \
 515                 pslld $7,               x4 ## 2; \
 516                 pxor x1 ## 2,           x0 ## 2; \
 517                 pxor x4 ## 2,           x2 ## 2; \
 518                 movdqa x1 ## 2,         x4 ## 2; \
 519                 psrld $1,               x1 ## 2; \
 520                 pslld $(32 - 1),        x4 ## 2; \
 521                 por x4 ## 2,            x1 ## 2; \
 522         movdqa x3 ## 1,         x4 ## 1; \
 523         psrld $7,               x3 ## 1; \
 524         pslld $(32 - 7),        x4 ## 1; \
 525         por x4 ## 1,            x3 ## 1; \
 526         pxor x0 ## 1,           x1 ## 1; \
 527         movdqa x0 ## 1,         x4 ## 1; \
 528         pslld $3,               x4 ## 1; \
 529         pxor x4 ## 1,           x3 ## 1; \
 530         movdqa x0 ## 1,         x4 ## 1; \
 531                 movdqa x3 ## 2,         x4 ## 2; \
 532                 psrld $7,               x3 ## 2; \
 533                 pslld $(32 - 7),        x4 ## 2; \
 534                 por x4 ## 2,            x3 ## 2; \
 535                 pxor x0 ## 2,           x1 ## 2; \
 536                 movdqa x0 ## 2,         x4 ## 2; \
 537                 pslld $3,               x4 ## 2; \
 538                 pxor x4 ## 2,           x3 ## 2; \
 539                 movdqa x0 ## 2,         x4 ## 2; \
 540         psrld $13,              x0 ## 1; \
 541         pslld $(32 - 13),       x4 ## 1; \
 542         por x4 ## 1,            x0 ## 1; \
 543         pxor x2 ## 1,           x1 ## 1; \
 544         pxor x2 ## 1,           x3 ## 1; \
 545         movdqa x2 ## 1,         x4 ## 1; \
 546         psrld $3,               x2 ## 1; \
 547         pslld $(32 - 3),        x4 ## 1; \
 548         por x4 ## 1,            x2 ## 1; \
 549                 psrld $13,              x0 ## 2; \
 550                 pslld $(32 - 13),       x4 ## 2; \
 551                 por x4 ## 2,            x0 ## 2; \
 552                 pxor x2 ## 2,           x1 ## 2; \
 553                 pxor x2 ## 2,           x3 ## 2; \
 554                 movdqa x2 ## 2,         x4 ## 2; \
 555                 psrld $3,               x2 ## 2; \
 556                 pslld $(32 - 3),        x4 ## 2; \
 557                 por x4 ## 2,            x2 ## 2;
 558 
 /*
  * S(SBOX, x0, x1, x2, x3, x4): apply one S-box to both register sets.
  * SBOX is a macro-name prefix (S0..S7, SI0..SI7); "_1" and "_2" are
  * pasted onto it to select the two halves. Both halves run on set 1
  * (suffix 1) first, then both halves run on set 2 (suffix 2).
  * No round key is loaded here; compare SP().
  */
 559 #define S(SBOX, x0, x1, x2, x3, x4) \
 560         SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
 561         SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
 562         SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
 563         SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
 564 
 /*
  * SP(SBOX, x0, x1, x2, x3, x4, i): apply one S-box to both register sets
  * while loading round key i into RK0..RK3.
  *   SBOX     - macro-name prefix; "_1"/"_2" are pasted on to select the
  *              first and second half of the S-box
  *   x0..x4   - base register names; suffix 1 / 2 selects the set
  *   i        - round-key index passed to get_key
  * Order of work: half 1 on set 1, half 1 on set 2, half 2 on set 1,
  * half 2 on set 2, with one get_key load placed before each step.
  * Overwrites RK0..RK3. The decryption routine relies on this to supply
  * the key that the following KL2 adds.
  *
  * The last body line carries no line-continuation backslash, so the
  * macro definition ends on that line.
  */
 565 #define SP(SBOX, x0, x1, x2, x3, x4, i) \
 566         get_key(i, 0, RK0); \
 567         SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
 568         get_key(i, 2, RK2); \
 569         SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
 570         get_key(i, 3, RK3); \
 571         SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
 572         get_key(i, 1, RK1); \
 573         SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);
 574 
 /*
  * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): transpose a 4x4 matrix of
  * 32-bit elements held in x0..x3 (one row per register), in place.
  * With rows a, b, c, d on entry, the result is
  *   x0 = {a0, b0, c0, d0}   x1 = {a1, b1, c1, d1}
  *   x2 = {a2, b2, c2, d2}   x3 = {a3, b3, c3, d3}
  * t1 and t2 are clobbered. t0 is not referenced by this macro.
  */
 575 #define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 576         movdqa x0,              t2; \
 577         punpckldq x1,           x0; \
 578         punpckhdq x1,           t2; \
 579         movdqa x2,              t1; \
 580         punpckhdq x3,           x2; \
 581         punpckldq x3,           t1; \
 582         movdqa x0,              x1; \
 583         punpcklqdq t1,          x0; \
 584         punpckhqdq t1,          x1; \
 585         movdqa t2,              x3; \
 586         punpcklqdq x2,          t2; \
 587         punpckhqdq x2,          x3; \
 588         movdqa t2,              x2;
 589 
 /*
  * read_blocks(in, x0, x1, x2, x3, t0, t1, t2): load four consecutive
  * 16-byte vectors from the address in register "in" (offsets 0, 16, 32,
  * 48) with unaligned loads, then transpose them with transpose_4x4.
  * Clobbers t1 and t2 (via transpose_4x4).
  * The expansion does not end with a separator of its own beyond the one
  * inside transpose_4x4; callers follow the invocation with ';'.
  */
 590 #define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
 591         movdqu (0*4*4)(in),     x0; \
 592         movdqu (1*4*4)(in),     x1; \
 593         movdqu (2*4*4)(in),     x2; \
 594         movdqu (3*4*4)(in),     x3; \
 595         \
 596         transpose_4x4(x0, x1, x2, x3, t0, t1, t2)
 597 
 /*
  * write_blocks(out, x0, x1, x2, x3, t0, t1, t2): transpose x0..x3 with
  * transpose_4x4, then store them as four consecutive 16-byte vectors at
  * the address in register "out" (offsets 0, 16, 32, 48) using unaligned
  * stores. Modifies x0..x3 and clobbers t1 and t2.
  */
 598 #define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
 599         transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 600         \
 601         movdqu x0,              (0*4*4)(out); \
 602         movdqu x1,              (1*4*4)(out); \
 603         movdqu x2,              (2*4*4)(out); \
 604         movdqu x3,              (3*4*4)(out);
 605 
 /*
  * xor_blocks(out, x0, x1, x2, x3, t0, t1, t2): like write_blocks, but
  * each 16-byte vector is XORed with the data already stored at the
  * destination before being written back:
  *   out[k] = out[k] ^ xk   for k = 0..3
  * t0 holds the loaded destination data; t1 and t2 are clobbered by
  * transpose_4x4. Uses unaligned loads and stores.
  */
 606 #define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
 607         transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
 608         \
 609         movdqu (0*4*4)(out),    t0; \
 610         pxor t0,                x0; \
 611         movdqu x0,              (0*4*4)(out); \
 612         movdqu (1*4*4)(out),    t0; \
 613         pxor t0,                x1; \
 614         movdqu x1,              (1*4*4)(out); \
 615         movdqu (2*4*4)(out),    t0; \
 616         pxor t0,                x2; \
 617         movdqu x2,              (2*4*4)(out); \
 618         movdqu (3*4*4)(out),    t0; \
 619         pxor t0,                x3; \
 620         movdqu x3,              (3*4*4)(out);
 621 
 /*
  * __serpent_enc_blk_8way: encrypt eight 16-byte blocks.
  *
  * 128 bytes are read from src as two groups of four blocks (src and
  * src + 64). Each group is transposed into one register set, so every
  * 32-bit lane carries one block through the rounds.
  *
  * Structure: K2 with round key 0, then 32 S-box applications (S0..S7
  * repeated four times). The first 31 are each followed by LK2 with
  * round keys 1..31; the last is followed by K2 with round key 32.
  * The register names passed to each macro are permuted from line to
  * line so that no register moves are needed between stages.
  *
  * Output: if the low byte of %rcx is zero the result overwrites dst;
  * otherwise it is XORed into the 128 bytes already at dst.
  *
  * Clobbers: %rax, %xmm0-%xmm14, flags. No stack is used.
  */
 622 ENTRY(__serpent_enc_blk_8way)
 623         /* input:
 624          *      %rdi: ctx, CTX
 625          *      %rsi: dst
 626          *      %rdx: src
 627          *      %rcx: bool, if true: xor output
 628          */
 629 
             /* RNOT = all-ones, used by the S-box macros to complement. */
 630         pcmpeqd RNOT, RNOT;
 631 
             /* %rax = src + 64: start of the second group of four blocks. */
 632         leaq (4*4*4)(%rdx), %rax;
 633         read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
 634         read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 635 
             /* Round table: left column S-box, right column key/linear step. */
 636                                                  K2(RA, RB, RC, RD, RE, 0);
 637         S(S0, RA, RB, RC, RD, RE);              LK2(RC, RB, RD, RA, RE, 1);
 638         S(S1, RC, RB, RD, RA, RE);              LK2(RE, RD, RA, RC, RB, 2);
 639         S(S2, RE, RD, RA, RC, RB);              LK2(RB, RD, RE, RC, RA, 3);
 640         S(S3, RB, RD, RE, RC, RA);              LK2(RC, RA, RD, RB, RE, 4);
 641         S(S4, RC, RA, RD, RB, RE);              LK2(RA, RD, RB, RE, RC, 5);
 642         S(S5, RA, RD, RB, RE, RC);              LK2(RC, RA, RD, RE, RB, 6);
 643         S(S6, RC, RA, RD, RE, RB);              LK2(RD, RB, RA, RE, RC, 7);
 644         S(S7, RD, RB, RA, RE, RC);              LK2(RC, RA, RE, RD, RB, 8);
 645         S(S0, RC, RA, RE, RD, RB);              LK2(RE, RA, RD, RC, RB, 9);
 646         S(S1, RE, RA, RD, RC, RB);              LK2(RB, RD, RC, RE, RA, 10);
 647         S(S2, RB, RD, RC, RE, RA);              LK2(RA, RD, RB, RE, RC, 11);
 648         S(S3, RA, RD, RB, RE, RC);              LK2(RE, RC, RD, RA, RB, 12);
 649         S(S4, RE, RC, RD, RA, RB);              LK2(RC, RD, RA, RB, RE, 13);
 650         S(S5, RC, RD, RA, RB, RE);              LK2(RE, RC, RD, RB, RA, 14);
 651         S(S6, RE, RC, RD, RB, RA);              LK2(RD, RA, RC, RB, RE, 15);
 652         S(S7, RD, RA, RC, RB, RE);              LK2(RE, RC, RB, RD, RA, 16);
 653         S(S0, RE, RC, RB, RD, RA);              LK2(RB, RC, RD, RE, RA, 17);
 654         S(S1, RB, RC, RD, RE, RA);              LK2(RA, RD, RE, RB, RC, 18);
 655         S(S2, RA, RD, RE, RB, RC);              LK2(RC, RD, RA, RB, RE, 19);
 656         S(S3, RC, RD, RA, RB, RE);              LK2(RB, RE, RD, RC, RA, 20);
 657         S(S4, RB, RE, RD, RC, RA);              LK2(RE, RD, RC, RA, RB, 21);
 658         S(S5, RE, RD, RC, RA, RB);              LK2(RB, RE, RD, RA, RC, 22);
 659         S(S6, RB, RE, RD, RA, RC);              LK2(RD, RC, RE, RA, RB, 23);
 660         S(S7, RD, RC, RE, RA, RB);              LK2(RB, RE, RA, RD, RC, 24);
 661         S(S0, RB, RE, RA, RD, RC);              LK2(RA, RE, RD, RB, RC, 25);
 662         S(S1, RA, RE, RD, RB, RC);              LK2(RC, RD, RB, RA, RE, 26);
 663         S(S2, RC, RD, RB, RA, RE);              LK2(RE, RD, RC, RA, RB, 27);
 664         S(S3, RE, RD, RC, RA, RB);              LK2(RA, RB, RD, RE, RC, 28);
 665         S(S4, RA, RB, RD, RE, RC);              LK2(RB, RD, RE, RC, RA, 29);
 666         S(S5, RB, RD, RE, RC, RA);              LK2(RA, RB, RD, RC, RE, 30);
 667         S(S6, RA, RB, RD, RC, RE);              LK2(RD, RE, RB, RC, RA, 31);
 668         S(S7, RD, RE, RB, RC, RA);               K2(RA, RB, RC, RD, RE, 32);
 669 
             /* %rax = dst + 64: destination of the second group. */
 670         leaq (4*4*4)(%rsi), %rax;
 671 
             /* Non-zero low byte of %rcx selects the XOR-into-dst path. */
 672         testb %cl, %cl;
 673         jnz .L__enc_xor8;
 674 
 675         write_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
 676         write_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 677 
 678         ret;
 679 
 680 .L__enc_xor8:
 681         xor_blocks(%rsi, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
 682         xor_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 683 
 684         ret;
 685 ENDPROC(__serpent_enc_blk_8way)
 686 
 /*
  * serpent_dec_blk_8way: decrypt eight 16-byte blocks.
  *
  * 128 bytes are read from src as two groups of four blocks (src and
  * src + 64) and transposed into the two register sets.
  *
  * Structure: K2 with round key 32, then SI7..SI0 repeated four times.
  * Each SP(SIn, ..., i) applies the S-box and loads round key i into
  * RK0..RK3; the KL2 that follows adds that key and applies the inverse
  * linear transformation. Keys run from 31 down to 1. The last S-box
  * uses S() (no key load) and is followed by K2 with round key 0.
  * The register names passed to each macro are permuted from line to
  * line so that no register moves are needed between stages.
  *
  * The result is left in RC, RD, RB, RE (in that order) of each set and
  * is written to dst and dst + 64. Unlike the encryption routine there
  * is no XOR-output variant.
  *
  * Clobbers: %rax, %xmm0-%xmm14. No stack is used.
  */
 687 ENTRY(serpent_dec_blk_8way)
 688         /* input:
 689          *      %rdi: ctx, CTX
 690          *      %rsi: dst
 691          *      %rdx: src
 692          */
 693 
             /* RNOT = all-ones, used by the S-box macros to complement. */
 694         pcmpeqd RNOT, RNOT;
 695 
             /* %rax = src + 64: start of the second group of four blocks. */
 696         leaq (4*4*4)(%rdx), %rax;
 697         read_blocks(%rdx, RA1, RB1, RC1, RD1, RK0, RK1, RK2);
 698         read_blocks(%rax, RA2, RB2, RC2, RD2, RK0, RK1, RK2);
 699 
             /* Round table: left column S-box (+ key load), right column key/linear step. */
 700                                                  K2(RA, RB, RC, RD, RE, 32);
 701         SP(SI7, RA, RB, RC, RD, RE, 31);        KL2(RB, RD, RA, RE, RC, 31);
 702         SP(SI6, RB, RD, RA, RE, RC, 30);        KL2(RA, RC, RE, RB, RD, 30);
 703         SP(SI5, RA, RC, RE, RB, RD, 29);        KL2(RC, RD, RA, RE, RB, 29);
 704         SP(SI4, RC, RD, RA, RE, RB, 28);        KL2(RC, RA, RB, RE, RD, 28);
 705         SP(SI3, RC, RA, RB, RE, RD, 27);        KL2(RB, RC, RD, RE, RA, 27);
 706         SP(SI2, RB, RC, RD, RE, RA, 26);        KL2(RC, RA, RE, RD, RB, 26);
 707         SP(SI1, RC, RA, RE, RD, RB, 25);        KL2(RB, RA, RE, RD, RC, 25);
 708         SP(SI0, RB, RA, RE, RD, RC, 24);        KL2(RE, RC, RA, RB, RD, 24);
 709         SP(SI7, RE, RC, RA, RB, RD, 23);        KL2(RC, RB, RE, RD, RA, 23);
 710         SP(SI6, RC, RB, RE, RD, RA, 22);        KL2(RE, RA, RD, RC, RB, 22);
 711         SP(SI5, RE, RA, RD, RC, RB, 21);        KL2(RA, RB, RE, RD, RC, 21);
 712         SP(SI4, RA, RB, RE, RD, RC, 20);        KL2(RA, RE, RC, RD, RB, 20);
 713         SP(SI3, RA, RE, RC, RD, RB, 19);        KL2(RC, RA, RB, RD, RE, 19);
 714         SP(SI2, RC, RA, RB, RD, RE, 18);        KL2(RA, RE, RD, RB, RC, 18);
 715         SP(SI1, RA, RE, RD, RB, RC, 17);        KL2(RC, RE, RD, RB, RA, 17);
 716         SP(SI0, RC, RE, RD, RB, RA, 16);        KL2(RD, RA, RE, RC, RB, 16);
 717         SP(SI7, RD, RA, RE, RC, RB, 15);        KL2(RA, RC, RD, RB, RE, 15);
 718         SP(SI6, RA, RC, RD, RB, RE, 14);        KL2(RD, RE, RB, RA, RC, 14);
 719         SP(SI5, RD, RE, RB, RA, RC, 13);        KL2(RE, RC, RD, RB, RA, 13);
 720         SP(SI4, RE, RC, RD, RB, RA, 12);        KL2(RE, RD, RA, RB, RC, 12);
 721         SP(SI3, RE, RD, RA, RB, RC, 11);        KL2(RA, RE, RC, RB, RD, 11);
 722         SP(SI2, RA, RE, RC, RB, RD, 10);        KL2(RE, RD, RB, RC, RA, 10);
 723         SP(SI1, RE, RD, RB, RC, RA, 9);         KL2(RA, RD, RB, RC, RE, 9);
 724         SP(SI0, RA, RD, RB, RC, RE, 8);         KL2(RB, RE, RD, RA, RC, 8);
 725         SP(SI7, RB, RE, RD, RA, RC, 7);         KL2(RE, RA, RB, RC, RD, 7);
 726         SP(SI6, RE, RA, RB, RC, RD, 6);         KL2(RB, RD, RC, RE, RA, 6);
 727         SP(SI5, RB, RD, RC, RE, RA, 5);         KL2(RD, RA, RB, RC, RE, 5);
 728         SP(SI4, RD, RA, RB, RC, RE, 4);         KL2(RD, RB, RE, RC, RA, 4);
 729         SP(SI3, RD, RB, RE, RC, RA, 3);         KL2(RE, RD, RA, RC, RB, 3);
 730         SP(SI2, RE, RD, RA, RC, RB, 2);         KL2(RD, RB, RC, RA, RE, 2);
 731         SP(SI1, RD, RB, RC, RA, RE, 1);         KL2(RE, RB, RC, RA, RD, 1);
 732         S(SI0, RE, RB, RC, RA, RD);              K2(RC, RD, RB, RE, RA, 0);
 733 
             /* %rax = dst + 64: destination of the second group. */
 734         leaq (4*4*4)(%rsi), %rax;
 735         write_blocks(%rsi, RC1, RD1, RB1, RE1, RK0, RK1, RK2);
 736         write_blocks(%rax, RC2, RD2, RB2, RE2, RK0, RK1, RK2);
 737 
 738         ret;
 739 ENDPROC(serpent_dec_blk_8way)

/* [<][>][^][v][top][bottom][index][help] */