/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 */

#include <linux/linkage.h>
#include "glue_helper-asm-avx.S"

.file "serpent-avx-x86_64-asm_64.S"

.data
.align 16

.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.text

#define CTX %rdi

/**********************************************************************
  8-way AVX serpent
 **********************************************************************/
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

#define tp %xmm5

#define RA2 %xmm6
#define RB2 %xmm7
#define RC2 %xmm8
#define RD2 %xmm9
#define RE2 %xmm10

#define RNOT %xmm11

#define RK0 %xmm12
#define RK1 %xmm13
#define RK2 %xmm14
#define RK3 %xmm15

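/*
 * The Serpent S-boxes below are bitsliced: each S-box is a fixed,
 * branch-free sequence of vpand/vpor/vpxor on four 32-bit lanes at a
 * time, with RNOT (all ones) providing NOT via XOR and tp/x4 used as
 * scratch.  Every S-box is split into _1/_2 halves so that callers can
 * interleave other work (such as the round-key broadcasts in SP below)
 * between the two halves.
 */
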
#define S0_1(x0, x1, x2, x3, x4) \
	vpor x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x4; \
	vpxor RNOT, x4, x4; \
	vpxor x1, tp, x3; \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpxor x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	vpxor x3, x0, x0; \
	vpor x0, x4, x4; \
	vpxor x2, x0, x0; \
	vpand x1, x2, x2; \
	vpxor x2, x3, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x4, x2, x2; \
	vpxor x2, x1, x1;

#define S1_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, tp; \
	vpxor x3, x0, x0; \
	vpxor RNOT, x3, x3; \
	vpand tp, x1, x4; \
	vpor tp, x0, x0; \
	vpxor x2, x3, x3; \
	vpxor x3, x0, x0; \
	vpxor x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpor x4, x1, x1; \
	vpxor x2, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x2, x2; \
	vpor x0, x1, x1; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x0, x0; \
	vpxor x1, x4, x4;

#define S2_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, tp; \
	vpxor x3, tp, tp; \
	vpor x0, x3, x3; \
	vpxor x1, x2, x2; \
	vpxor x1, x3, x3; \
	vpand tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	vpxor x2, tp, tp; \
	vpand x3, x2, x2; \
	vpor x1, x3, x3; \
	vpxor RNOT, tp, tp; \
	vpxor tp, x3, x3; \
	vpxor tp, x0, x4; \
	vpxor x2, tp, x0; \
	vpor x2, x1, x1;

#define S3_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, tp; \
	vpor x0, x3, x3; \
	vpand x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpxor tp, x2, x2; \
	vpand x3, tp, x1; \
	vpxor x3, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpand x3, x0, x0; \
	vpand x4, x3, x3; \
	vpxor x2, x3, x3; \
	vpor x1, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x4, x4; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x3;

#define S4_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor x1, x0, x0; \
	vpxor tp, x3, x4; \
	vpor x0, x2, x2; \
	vpxor x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpand x2, x4, x4; \
	vpxor tp, x2, x2; \
	vpxor x0, x4, x4; \
	vpor x1, tp, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x0, x3, x3;

#define S5_1(x0, x1, x2, x3, x4) \
	vpor x0, x1, tp; \
	vpxor tp, x2, x2; \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpand x4, tp, x1; \
	vpor x3, x4, x4; \
	vpxor x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	vpand x3, x0, x0; \
	vpxor x3, x1, x1; \
	vpxor x2, x3, x3; \
	vpxor x1, x0, x0; \
	vpand x4, x2, x2; \
	vpxor x2, x1, x1; \
	vpand x0, x2, x2; \
	vpxor x2, x3, x3;

#define S6_1(x0, x1, x2, x3, x4) \
	vpxor x0, x3, x3; \
	vpxor x2, x1, tp; \
	vpxor x0, x2, x2; \
	vpand x3, x0, x0; \
	vpor x3, tp, tp; \
	vpxor RNOT, x1, x4; \
	vpxor tp, x0, x0; \
	vpxor x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x4, x4; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x3, x3; \
	vpxor x2, x1, x1;

#define S7_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x1, tp; \
	vpxor RNOT, x0, x0; \
	vpand x2, tp, x1; \
	vpxor x3, x1, x1; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x4; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x3; \
	vpor x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	vpand x0, x2, x2; \
	vpxor x4, x0, x0; \
	vpxor x3, x4, x4; \
	vpand x0, x3, x3; \
	vpxor x1, x4, x4; \
	vpxor x4, x2, x2; \
	vpxor x1, x3, x3; \
	vpor x0, x4, x4; \
	vpxor x1, x4, x4;

#define SI0_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpor x1, x3, tp; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpxor tp, x2, x2; \
	vpxor x0, tp, x3; \
	vpand x1, x0, x0; \
	vpxor x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	vpand x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x3, x2, x2; \
	vpxor x3, x1, x1; \
	vpand x0, x3, x3; \
	vpxor x0, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x4, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, tp; \
	vpxor RNOT, x2, x2; \
	vpor x1, x0, x4; \
	vpxor x3, x4, x4; \
	vpand x1, x3, x3; \
	vpxor x2, x1, x1; \
	vpand x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	vpxor x1, x4, x4; \
	vpor x3, x1, x1; \
	vpxor tp, x3, x3; \
	vpxor tp, x2, x2; \
	vpor x4, tp, x0; \
	vpxor x4, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor x1, x4, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpxor RNOT, x3, tp; \
	vpor x2, tp, tp; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x4; \
	vpxor x1, tp, x3; \
	vpor x2, x1, x1; \
	vpxor x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	vpxor x4, x1, x1; \
	vpor x3, x4, x4; \
	vpxor x3, x2, x2; \
	vpxor x2, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpand x2, x1, tp; \
	vpxor x0, tp, tp; \
	vpor x1, x0, x0; \
	vpxor x3, x1, x4; \
	vpxor x3, x0, x0; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x1, x0, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
	vpxor x3, x2, x2; \
	vpand x1, x0, tp; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor RNOT, x0, x4; \
	vpxor tp, x1, x1; \
	vpxor x2, tp, x0; \
	vpand x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	vpxor x0, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x0, x0; \
	vpand x2, x3, x3; \
	vpxor x3, x4, x4; \
	vpxor x1, x3, x3; \
	vpand x0, x1, x1; \
	vpxor x1, x4, x4; \
	vpxor x3, x0, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
	vpor x2, x1, tp; \
	vpxor x1, x2, x2; \
	vpxor x3, tp, tp; \
	vpand x1, x3, x3; \
	vpxor x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x3, x3; \
	vpor x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	vpxor tp, x1, x4; \
	vpxor x4, x2, x2; \
	vpand x0, x4, x4; \
	vpxor tp, x0, x0; \
	vpxor x3, tp, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpxor x2, x0, x0; \
	vpxor x4, x2, x2; \
	vpxor x3, x4, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
	vpxor x2, x0, x0; \
	vpand x3, x0, tp; \
	vpxor x3, x2, x2; \
	vpxor x2, tp, tp; \
	vpxor x1, x3, x3; \
	vpor x0, x2, x2; \
	vpxor x3, x2, x2; \
	vpand tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	vpxor RNOT, tp, tp; \
	vpxor x1, x3, x3; \
	vpand x2, x1, x1; \
	vpxor tp, x0, x4; \
	vpxor x4, x3, x3; \
	vpxor x2, x4, x4; \
	vpxor x1, tp, x0; \
	vpxor x0, x2, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x2, x0, x0; \
	vpor x3, x2, x2; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpor tp, x1, x1; \
	vpxor x0, x4, x4; \
	vpand x2, x0, x0; \
	vpxor x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	vpand x2, x1, x1; \
	vpxor x2, tp, x3; \
	vpxor x3, x4, x4; \
	vpand x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor x4, x1, x1; \
	vpxor x4, x3, x3; \
	vpand x0, x4, x4; \
	vpxor x2, x4, x4;

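/*
 * Round-key handling: get_key(i, j, t) broadcasts 32-bit word j of the
 * round-i subkey into all four lanes of t; the byte offset (4*i + j)*4
 * assumes the expanded key starts at offset 0 of the context pointed to
 * by CTX.  K2 XORs round key i into both four-block groups.  LK2 applies
 * Serpent's linear transformation and then mixes in round key i, while
 * KL2 undoes the key mixing for round i and then applies the inverse
 * linear transformation.  AVX has no vector rotate, so each rotation is
 * synthesized from a vpslld/vpsrld/vpor triple.
 */
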
#define get_key(i, j, t) \
	vbroadcastss (4*(i)+(j))*4(CTX), t;

#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2;

#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $13, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1, x1 ## 1, x4 ## 1; \
	vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
	vpslld $1, x1 ## 2, x4 ## 2; \
	vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	get_key(i, 3, RK3); \
	vpslld $7, x3 ## 1, x4 ## 1; \
	vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	vpslld $7, x3 ## 2, x4 ## 2; \
	vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpslld $5, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpslld $5, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $22, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2;

#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpsrld $5, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpsrld $22, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpsrld $5, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpsrld $22, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1, x1 ## 1, x4 ## 1; \
	vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpsrld $1, x1 ## 2, x4 ## 2; \
	vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7, x3 ## 1, x4 ## 1; \
	vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $7, x3 ## 2, x4 ## 2; \
	vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $13, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $3, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2;

#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \

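/*
 * transpose_4x4 treats four xmm registers as a 4x4 matrix of 32-bit
 * words and transposes it; the operation is its own inverse.  read_blocks
 * therefore turns four blocks (one block per register) into the
 * word-sliced layout the round macros expect, and write_blocks converts
 * the result back to one block per register.
 */
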
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

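/*
 * The round macros take bare register-name prefixes (RA, RB, RC, RD, RE)
 * and paste the suffixes 1 and 2 with ##, so a single invocation covers
 * both four-block groups.  vpcmpeqd below sets RNOT to all ones for use
 * as the NOT mask.  The S-boxes leave their results in a permuted set of
 * registers, which is why each round passes a different register order
 * to the next instead of moving data around.
 */
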
.align 8
__serpent_enc_blk8_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);	LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);	LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);	LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);	LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);	LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);	LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);	LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);	LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);	LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);	LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);	LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);	LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);	LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);	LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);	LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);	LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);	LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);	LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);	LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);	LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);	LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);	LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);	LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);	LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);	LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);	LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);	LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);	LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);	LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);	LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);	LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);	K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_enc_blk8_avx)

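/*
 * Decryption runs the rounds in reverse order with the inverse S-boxes.
 * SP applies an inverse S-box while broadcasting round key i into
 * RK0..RK3, which the following KL2 then consumes.
 */
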
.align 8
__serpent_dec_blk8_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;
ENDPROC(__serpent_dec_blk8_avx)

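/*
 * The entry points below use the SysV x86_64 calling convention.  Based
 * on the register comments, the matching C-side declarations are expected
 * to look roughly like the following sketch (types such as
 * struct serpent_ctx and le128 belong to the glue code, not this file):
 *
 *	asmlinkage void serpent_ecb_enc_8way_avx(struct serpent_ctx *ctx,
 *						 u8 *dst, const u8 *src);
 *	asmlinkage void serpent_ecb_dec_8way_avx(struct serpent_ctx *ctx,
 *						 u8 *dst, const u8 *src);
 *	asmlinkage void serpent_cbc_dec_8way_avx(struct serpent_ctx *ctx,
 *						 u8 *dst, const u8 *src);
 *	asmlinkage void serpent_ctr_8way_avx(struct serpent_ctx *ctx,
 *					     u8 *dst, const u8 *src,
 *					     le128 *iv);
 *	asmlinkage void serpent_xts_enc_8way_avx(struct serpent_ctx *ctx,
 *						 u8 *dst, const u8 *src,
 *						 le128 *iv);
 *	asmlinkage void serpent_xts_dec_8way_avx(struct serpent_ctx *ctx,
 *						 u8 *dst, const u8 *src,
 *						 le128 *iv);
 *
 * Each call processes eight 16-byte blocks.  load_8way/store_8way and the
 * CBC/CTR/XTS helpers come from glue_helper-asm-avx.S.
 */
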
ENTRY(serpent_ecb_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk8_avx;

	store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret;
ENDPROC(serpent_ecb_enc_8way_avx)

ENTRY(serpent_ecb_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	ret;
ENDPROC(serpent_ecb_dec_8way_avx)

ENTRY(serpent_cbc_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	ret;
ENDPROC(serpent_cbc_dec_8way_avx)

ENTRY(serpent_ctr_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RK0, RK1, RK2);

	call __serpent_enc_blk8_avx;

	store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret;
ENDPROC(serpent_ctr_8way_avx)

ENTRY(serpent_xts_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_enc_blk8_avx;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret;
ENDPROC(serpent_xts_enc_8way_avx)

ENTRY(serpent_xts_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_dec_blk8_avx;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	ret;
ENDPROC(serpent_xts_dec_8way_avx)