1/*********************************************************************** 2** 3** Implementation of the Skein block functions. 4** 5** Source code author: Doug Whiting, 2008. 6** 7** This algorithm and source code is released to the public domain. 8** 9** Compile-time switches: 10** 11** SKEIN_USE_ASM -- set bits (256/512/1024) to select which 12** versions use ASM code for block processing 13** [default: use C for all block sizes] 14** 15************************************************************************/ 16 17#include <linux/string.h> 18#include "skein_base.h" 19#include "skein_block.h" 20 21#ifndef SKEIN_USE_ASM 22#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */ 23#endif 24 25#ifndef SKEIN_LOOP 26#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */ 27#endif 28 29#define BLK_BITS (WCNT * 64) /* some useful definitions for code here */ 30#define KW_TWK_BASE (0) 31#define KW_KEY_BASE (3) 32#define ks (kw + KW_KEY_BASE) 33#define ts (kw + KW_TWK_BASE) 34 35#ifdef SKEIN_DEBUG 36#define debug_save_tweak(ctx) \ 37{ \ 38 ctx->h.tweak[0] = ts[0]; \ 39 ctx->h.tweak[1] = ts[1]; \ 40} 41#else 42#define debug_save_tweak(ctx) 43#endif 44 45#if !(SKEIN_USE_ASM & 256) 46#undef RCNT 47#define RCNT (SKEIN_256_ROUNDS_TOTAL / 8) 48#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ 49#define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10) 50#else 51#define SKEIN_UNROLL_256 (0) 52#endif 53 54#if SKEIN_UNROLL_256 55#if (RCNT % SKEIN_UNROLL_256) 56#error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */ 57#endif 58#endif 59#define ROUND256(p0, p1, p2, p3, ROT, r_num) \ 60 do { \ 61 X##p0 += X##p1; \ 62 X##p1 = rotl_64(X##p1, ROT##_0); \ 63 X##p1 ^= X##p0; \ 64 X##p2 += X##p3; \ 65 X##p3 = rotl_64(X##p3, ROT##_1); \ 66 X##p3 ^= X##p2; \ 67 } while (0) 68 69#if SKEIN_UNROLL_256 == 0 70#define R256(p0, p1, p2, p3, ROT, r_num) /* fully unrolled */ \ 71 ROUND256(p0, p1, p2, p3, ROT, r_num) 72 73#define I256(R) \ 74 do { \ 75 /* inject the key schedule value */ \ 76 X0 += ks[((R) + 1) % 5]; \ 77 X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3]; \ 78 X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3]; \ 79 X3 += ks[((R) + 4) % 5] + (R) + 1; \ 80 } while (0) 81#else 82/* looping version */ 83#define R256(p0, p1, p2, p3, ROT, r_num) ROUND256(p0, p1, p2, p3, ROT, r_num) 84 85#define I256(R) \ 86 do { \ 87 /* inject the key schedule value */ \ 88 X0 += ks[r + (R) + 0]; \ 89 X1 += ks[r + (R) + 1] + ts[r + (R) + 0];\ 90 X2 += ks[r + (R) + 2] + ts[r + (R) + 1];\ 91 X3 += ks[r + (R) + 3] + r + (R); \ 92 /* rotate key schedule */ \ 93 ks[r + (R) + 4] = ks[r + (R) - 1]; \ 94 ts[r + (R) + 2] = ts[r + (R) - 1]; \ 95 } while (0) 96#endif 97#define R256_8_ROUNDS(R) \ 98 do { \ 99 R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1); \ 100 R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2); \ 101 R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3); \ 102 R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4); \ 103 I256(2 * (R)); \ 104 R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5); \ 105 R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6); \ 106 R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7); \ 107 R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8); \ 108 I256(2 * (R) + 1); \ 109 } while (0) 110 111#define R256_UNROLL_R(NN) \ 112 ((SKEIN_UNROLL_256 == 0 && \ 113 SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \ 114 (SKEIN_UNROLL_256 > (NN))) 115 116#if (SKEIN_UNROLL_256 > 14) 117#error "need more unrolling in skein_256_process_block" 118#endif 119#endif 120 121#if !(SKEIN_USE_ASM & 512) 122#undef RCNT 123#define RCNT (SKEIN_512_ROUNDS_TOTAL/8) 124 125#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ 126#define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10) 127#else 128#define SKEIN_UNROLL_512 (0) 129#endif 130 131#if SKEIN_UNROLL_512 132#if (RCNT % SKEIN_UNROLL_512) 133#error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */ 134#endif 135#endif 136#define ROUND512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, r_num) \ 137 do { \ 138 X##p0 += X##p1; \ 139 X##p1 = rotl_64(X##p1, ROT##_0); \ 140 X##p1 ^= X##p0; \ 141 X##p2 += X##p3; \ 142 X##p3 = rotl_64(X##p3, ROT##_1); \ 143 X##p3 ^= X##p2; \ 144 X##p4 += X##p5; \ 145 X##p5 = rotl_64(X##p5, ROT##_2); \ 146 X##p5 ^= X##p4; \ 147 X##p6 += X##p7; X##p7 = rotl_64(X##p7, ROT##_3);\ 148 X##p7 ^= X##p6; \ 149 } while (0) 150 151#if SKEIN_UNROLL_512 == 0 152#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, r_num) /* unrolled */ \ 153 ROUND512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, r_num) 154 155#define I512(R) \ 156 do { \ 157 /* inject the key schedule value */ \ 158 X0 += ks[((R) + 1) % 9]; \ 159 X1 += ks[((R) + 2) % 9]; \ 160 X2 += ks[((R) + 3) % 9]; \ 161 X3 += ks[((R) + 4) % 9]; \ 162 X4 += ks[((R) + 5) % 9]; \ 163 X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \ 164 X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \ 165 X7 += ks[((R) + 8) % 9] + (R) + 1; \ 166 } while (0) 167 168#else /* looping version */ 169#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, r_num) \ 170 ROUND512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, r_num) \ 171 172#define I512(R) \ 173 do { \ 174 /* inject the key schedule value */ \ 175 X0 += ks[r + (R) + 0]; \ 176 X1 += ks[r + (R) + 1]; \ 177 X2 += ks[r + (R) + 2]; \ 178 X3 += ks[r + (R) + 3]; \ 179 X4 += ks[r + (R) + 4]; \ 180 X5 += ks[r + (R) + 5] + ts[r + (R) + 0]; \ 181 X6 += ks[r + (R) + 6] + ts[r + (R) + 1]; \ 182 X7 += ks[r + (R) + 7] + r + (R); \ 183 /* rotate key schedule */ \ 184 ks[r + (R) + 8] = ks[r + (R) - 1]; \ 185 ts[r + (R) + 2] = ts[r + (R) - 1]; \ 186 } while (0) 187#endif /* end of looped code definitions */ 188#define R512_8_ROUNDS(R) /* do 8 full rounds */ \ 189 do { \ 190 R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \ 191 R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \ 192 R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \ 193 R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \ 194 I512(2 * (R)); \ 195 R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \ 196 R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \ 197 R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \ 198 R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \ 199 I512(2 * (R) + 1); /* and key injection */ \ 200 } while (0) 201#define R512_UNROLL_R(NN) \ 202 ((SKEIN_UNROLL_512 == 0 && \ 203 SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || \ 204 (SKEIN_UNROLL_512 > (NN))) 205 206#if (SKEIN_UNROLL_512 > 14) 207#error "need more unrolling in skein_512_process_block" 208#endif 209#endif 210 211#if !(SKEIN_USE_ASM & 1024) 212#undef RCNT 213#define RCNT (SKEIN_1024_ROUNDS_TOTAL/8) 214#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ 215#define SKEIN_UNROLL_1024 ((SKEIN_LOOP) % 10) 216#else 217#define SKEIN_UNROLL_1024 (0) 218#endif 219 220#if (SKEIN_UNROLL_1024 != 0) 221#if (RCNT % SKEIN_UNROLL_1024) 222#error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */ 223#endif 224#endif 225#define ROUND1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, \ 226 pF, ROT, r_num) \ 227 do { \ 228 X##p0 += X##p1; \ 229 X##p1 = rotl_64(X##p1, ROT##_0); \ 230 X##p1 ^= X##p0; \ 231 X##p2 += X##p3; \ 232 X##p3 = rotl_64(X##p3, ROT##_1); \ 233 X##p3 ^= X##p2; \ 234 X##p4 += X##p5; \ 235 X##p5 = rotl_64(X##p5, ROT##_2); \ 236 X##p5 ^= X##p4; \ 237 X##p6 += X##p7; \ 238 X##p7 = rotl_64(X##p7, ROT##_3); \ 239 X##p7 ^= X##p6; \ 240 X##p8 += X##p9; \ 241 X##p9 = rotl_64(X##p9, ROT##_4); \ 242 X##p9 ^= X##p8; \ 243 X##pA += X##pB; \ 244 X##pB = rotl_64(X##pB, ROT##_5); \ 245 X##pB ^= X##pA; \ 246 X##pC += X##pD; \ 247 X##pD = rotl_64(X##pD, ROT##_6); \ 248 X##pD ^= X##pC; \ 249 X##pE += X##pF; \ 250 X##pF = rotl_64(X##pF, ROT##_7); \ 251 X##pF ^= X##pE; \ 252 } while (0) 253 254#if SKEIN_UNROLL_1024 == 0 255#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, \ 256 ROT, rn) \ 257 ROUND1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, \ 258 pF, ROT, rn) \ 259 260#define I1024(R) \ 261 do { \ 262 /* inject the key schedule value */ \ 263 X00 += ks[((R) + 1) % 17]; \ 264 X01 += ks[((R) + 2) % 17]; \ 265 X02 += ks[((R) + 3) % 17]; \ 266 X03 += ks[((R) + 4) % 17]; \ 267 X04 += ks[((R) + 5) % 17]; \ 268 X05 += ks[((R) + 6) % 17]; \ 269 X06 += ks[((R) + 7) % 17]; \ 270 X07 += ks[((R) + 8) % 17]; \ 271 X08 += ks[((R) + 9) % 17]; \ 272 X09 += ks[((R) + 10) % 17]; \ 273 X10 += ks[((R) + 11) % 17]; \ 274 X11 += ks[((R) + 12) % 17]; \ 275 X12 += ks[((R) + 13) % 17]; \ 276 X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3]; \ 277 X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3]; \ 278 X15 += ks[((R) + 16) % 17] + (R) + 1; \ 279 } while (0) 280#else /* looping version */ 281#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, pF, \ 282 ROT, rn) \ 283 ROUND1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, pE, \ 284 pF, ROT, rn) \ 285 286#define I1024(R) \ 287 do { \ 288 /* inject the key schedule value */ \ 289 X00 += ks[r + (R) + 0]; \ 290 X01 += ks[r + (R) + 1]; \ 291 X02 += ks[r + (R) + 2]; \ 292 X03 += ks[r + (R) + 3]; \ 293 X04 += ks[r + (R) + 4]; \ 294 X05 += ks[r + (R) + 5]; \ 295 X06 += ks[r + (R) + 6]; \ 296 X07 += ks[r + (R) + 7]; \ 297 X08 += ks[r + (R) + 8]; \ 298 X09 += ks[r + (R) + 9]; \ 299 X10 += ks[r + (R) + 10]; \ 300 X11 += ks[r + (R) + 11]; \ 301 X12 += ks[r + (R) + 12]; \ 302 X13 += ks[r + (R) + 13] + ts[r + (R) + 0]; \ 303 X14 += ks[r + (R) + 14] + ts[r + (R) + 1]; \ 304 X15 += ks[r + (R) + 15] + r + (R); \ 305 /* rotate key schedule */ \ 306 ks[r + (R) + 16] = ks[r + (R) - 1]; \ 307 ts[r + (R) + 2] = ts[r + (R) - 1]; \ 308 } while (0) 309 310#endif 311#define R1024_8_ROUNDS(R) \ 312 do { \ 313 R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, \ 314 13, 14, 15, R1024_0, 8*(R) + 1); \ 315 R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, \ 316 05, 08, 01, R1024_1, 8*(R) + 2); \ 317 R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, \ 318 11, 10, 09, R1024_2, 8*(R) + 3); \ 319 R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, \ 320 03, 12, 07, R1024_3, 8*(R) + 4); \ 321 I1024(2*(R)); \ 322 R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, \ 323 13, 14, 15, R1024_4, 8*(R) + 5); \ 324 R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, \ 325 05, 08, 01, R1024_5, 8*(R) + 6); \ 326 R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, \ 327 11, 10, 09, R1024_6, 8*(R) + 7); \ 328 R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, \ 329 03, 12, 07, R1024_7, 8*(R) + 8); \ 330 I1024(2*(R)+1); \ 331 } while (0) 332 333#define R1024_UNROLL_R(NN) \ 334 ((SKEIN_UNROLL_1024 == 0 && \ 335 SKEIN_1024_ROUNDS_TOTAL/8 > (NN)) || \ 336 (SKEIN_UNROLL_1024 > (NN))) 337 338#if (SKEIN_UNROLL_1024 > 14) 339#error "need more unrolling in Skein_1024_Process_Block" 340#endif 341#endif 342 343/***************************** SKEIN_256 ******************************/ 344#if !(SKEIN_USE_ASM & 256) 345void skein_256_process_block(struct skein_256_ctx *ctx, const u8 *blk_ptr, 346 size_t blk_cnt, size_t byte_cnt_add) 347{ /* do it in C */ 348 enum { 349 WCNT = SKEIN_256_STATE_WORDS 350 }; 351 size_t r; 352#if SKEIN_UNROLL_256 353 /* key schedule: chaining vars + tweak + "rot"*/ 354 u64 kw[WCNT+4+RCNT*2]; 355#else 356 /* key schedule words : chaining vars + tweak */ 357 u64 kw[WCNT+4]; 358#endif 359 u64 X0, X1, X2, X3; /* local copy of context vars, for speed */ 360 u64 w[WCNT]; /* local copy of input block */ 361#ifdef SKEIN_DEBUG 362 const u64 *X_ptr[4]; /* use for debugging (help cc put Xn in regs) */ 363 364 X_ptr[0] = &X0; 365 X_ptr[1] = &X1; 366 X_ptr[2] = &X2; 367 X_ptr[3] = &X3; 368#endif 369 skein_assert(blk_cnt != 0); /* never call with blk_cnt == 0! */ 370 ts[0] = ctx->h.tweak[0]; 371 ts[1] = ctx->h.tweak[1]; 372 do { 373 /* 374 * this implementation only supports 2**64 input bytes 375 * (no carry out here) 376 */ 377 ts[0] += byte_cnt_add; /* update processed length */ 378 379 /* precompute the key schedule for this block */ 380 ks[0] = ctx->x[0]; 381 ks[1] = ctx->x[1]; 382 ks[2] = ctx->x[2]; 383 ks[3] = ctx->x[3]; 384 ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY; 385 386 ts[2] = ts[0] ^ ts[1]; 387 388 /* get input block in little-endian format */ 389 skein_get64_lsb_first(w, blk_ptr, WCNT); 390 debug_save_tweak(ctx); 391 392 /* do the first full key injection */ 393 X0 = w[0] + ks[0]; 394 X1 = w[1] + ks[1] + ts[0]; 395 X2 = w[2] + ks[2] + ts[1]; 396 X3 = w[3] + ks[3]; 397 398 blk_ptr += SKEIN_256_BLOCK_BYTES; 399 400 /* run the rounds */ 401 for (r = 1; 402 r < (SKEIN_UNROLL_256 ? 2 * RCNT : 2); 403 r += (SKEIN_UNROLL_256 ? 2 * SKEIN_UNROLL_256 : 1)) { 404 R256_8_ROUNDS(0); 405#if R256_UNROLL_R(1) 406 R256_8_ROUNDS(1); 407#endif 408#if R256_UNROLL_R(2) 409 R256_8_ROUNDS(2); 410#endif 411#if R256_UNROLL_R(3) 412 R256_8_ROUNDS(3); 413#endif 414#if R256_UNROLL_R(4) 415 R256_8_ROUNDS(4); 416#endif 417#if R256_UNROLL_R(5) 418 R256_8_ROUNDS(5); 419#endif 420#if R256_UNROLL_R(6) 421 R256_8_ROUNDS(6); 422#endif 423#if R256_UNROLL_R(7) 424 R256_8_ROUNDS(7); 425#endif 426#if R256_UNROLL_R(8) 427 R256_8_ROUNDS(8); 428#endif 429#if R256_UNROLL_R(9) 430 R256_8_ROUNDS(9); 431#endif 432#if R256_UNROLL_R(10) 433 R256_8_ROUNDS(10); 434#endif 435#if R256_UNROLL_R(11) 436 R256_8_ROUNDS(11); 437#endif 438#if R256_UNROLL_R(12) 439 R256_8_ROUNDS(12); 440#endif 441#if R256_UNROLL_R(13) 442 R256_8_ROUNDS(13); 443#endif 444#if R256_UNROLL_R(14) 445 R256_8_ROUNDS(14); 446#endif 447 } 448 /* do the final "feedforward" xor, update context chaining */ 449 ctx->x[0] = X0 ^ w[0]; 450 ctx->x[1] = X1 ^ w[1]; 451 ctx->x[2] = X2 ^ w[2]; 452 ctx->x[3] = X3 ^ w[3]; 453 454 ts[1] &= ~SKEIN_T1_FLAG_FIRST; 455 } while (--blk_cnt); 456 ctx->h.tweak[0] = ts[0]; 457 ctx->h.tweak[1] = ts[1]; 458} 459 460#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) 461size_t skein_256_process_block_code_size(void) 462{ 463 return ((u8 *) skein_256_process_block_code_size) - 464 ((u8 *) skein_256_process_block); 465} 466unsigned int skein_256_unroll_cnt(void) 467{ 468 return SKEIN_UNROLL_256; 469} 470#endif 471#endif 472 473/***************************** SKEIN_512 ******************************/ 474#if !(SKEIN_USE_ASM & 512) 475void skein_512_process_block(struct skein_512_ctx *ctx, const u8 *blk_ptr, 476 size_t blk_cnt, size_t byte_cnt_add) 477{ /* do it in C */ 478 enum { 479 WCNT = SKEIN_512_STATE_WORDS 480 }; 481 size_t r; 482#if SKEIN_UNROLL_512 483 u64 kw[WCNT+4+RCNT*2]; /* key sched: chaining vars + tweak + "rot"*/ 484#else 485 u64 kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ 486#endif 487 u64 X0, X1, X2, X3, X4, X5, X6, X7; /* local copies, for speed */ 488 u64 w[WCNT]; /* local copy of input block */ 489#ifdef SKEIN_DEBUG 490 const u64 *X_ptr[8]; /* use for debugging (help cc put Xn in regs) */ 491 492 X_ptr[0] = &X0; 493 X_ptr[1] = &X1; 494 X_ptr[2] = &X2; 495 X_ptr[3] = &X3; 496 X_ptr[4] = &X4; 497 X_ptr[5] = &X5; 498 X_ptr[6] = &X6; 499 X_ptr[7] = &X7; 500#endif 501 502 skein_assert(blk_cnt != 0); /* never call with blk_cnt == 0! */ 503 ts[0] = ctx->h.tweak[0]; 504 ts[1] = ctx->h.tweak[1]; 505 do { 506 /* 507 * this implementation only supports 2**64 input bytes 508 * (no carry out here) 509 */ 510 ts[0] += byte_cnt_add; /* update processed length */ 511 512 /* precompute the key schedule for this block */ 513 ks[0] = ctx->x[0]; 514 ks[1] = ctx->x[1]; 515 ks[2] = ctx->x[2]; 516 ks[3] = ctx->x[3]; 517 ks[4] = ctx->x[4]; 518 ks[5] = ctx->x[5]; 519 ks[6] = ctx->x[6]; 520 ks[7] = ctx->x[7]; 521 ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ 522 ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; 523 524 ts[2] = ts[0] ^ ts[1]; 525 526 /* get input block in little-endian format */ 527 skein_get64_lsb_first(w, blk_ptr, WCNT); 528 debug_save_tweak(ctx); 529 530 /* do the first full key injection */ 531 X0 = w[0] + ks[0]; 532 X1 = w[1] + ks[1]; 533 X2 = w[2] + ks[2]; 534 X3 = w[3] + ks[3]; 535 X4 = w[4] + ks[4]; 536 X5 = w[5] + ks[5] + ts[0]; 537 X6 = w[6] + ks[6] + ts[1]; 538 X7 = w[7] + ks[7]; 539 540 blk_ptr += SKEIN_512_BLOCK_BYTES; 541 542 /* run the rounds */ 543 for (r = 1; 544 r < (SKEIN_UNROLL_512 ? 2 * RCNT : 2); 545 r += (SKEIN_UNROLL_512 ? 2 * SKEIN_UNROLL_512 : 1)) { 546 547 R512_8_ROUNDS(0); 548 549#if R512_UNROLL_R(1) 550 R512_8_ROUNDS(1); 551#endif 552#if R512_UNROLL_R(2) 553 R512_8_ROUNDS(2); 554#endif 555#if R512_UNROLL_R(3) 556 R512_8_ROUNDS(3); 557#endif 558#if R512_UNROLL_R(4) 559 R512_8_ROUNDS(4); 560#endif 561#if R512_UNROLL_R(5) 562 R512_8_ROUNDS(5); 563#endif 564#if R512_UNROLL_R(6) 565 R512_8_ROUNDS(6); 566#endif 567#if R512_UNROLL_R(7) 568 R512_8_ROUNDS(7); 569#endif 570#if R512_UNROLL_R(8) 571 R512_8_ROUNDS(8); 572#endif 573#if R512_UNROLL_R(9) 574 R512_8_ROUNDS(9); 575#endif 576#if R512_UNROLL_R(10) 577 R512_8_ROUNDS(10); 578#endif 579#if R512_UNROLL_R(11) 580 R512_8_ROUNDS(11); 581#endif 582#if R512_UNROLL_R(12) 583 R512_8_ROUNDS(12); 584#endif 585#if R512_UNROLL_R(13) 586 R512_8_ROUNDS(13); 587#endif 588#if R512_UNROLL_R(14) 589 R512_8_ROUNDS(14); 590#endif 591 } 592 593 /* do the final "feedforward" xor, update context chaining */ 594 ctx->x[0] = X0 ^ w[0]; 595 ctx->x[1] = X1 ^ w[1]; 596 ctx->x[2] = X2 ^ w[2]; 597 ctx->x[3] = X3 ^ w[3]; 598 ctx->x[4] = X4 ^ w[4]; 599 ctx->x[5] = X5 ^ w[5]; 600 ctx->x[6] = X6 ^ w[6]; 601 ctx->x[7] = X7 ^ w[7]; 602 603 ts[1] &= ~SKEIN_T1_FLAG_FIRST; 604 } while (--blk_cnt); 605 ctx->h.tweak[0] = ts[0]; 606 ctx->h.tweak[1] = ts[1]; 607} 608 609#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) 610size_t skein_512_process_block_code_size(void) 611{ 612 return ((u8 *) skein_512_process_block_code_size) - 613 ((u8 *) skein_512_process_block); 614} 615unsigned int skein_512_unroll_cnt(void) 616{ 617 return SKEIN_UNROLL_512; 618} 619#endif 620#endif 621 622/***************************** SKEIN_1024 ******************************/ 623#if !(SKEIN_USE_ASM & 1024) 624void skein_1024_process_block(struct skein_1024_ctx *ctx, const u8 *blk_ptr, 625 size_t blk_cnt, size_t byte_cnt_add) 626{ /* do it in C, always looping (unrolled is bigger AND slower!) */ 627 enum { 628 WCNT = SKEIN_1024_STATE_WORDS 629 }; 630 size_t r; 631#if (SKEIN_UNROLL_1024 != 0) 632 u64 kw[WCNT+4+RCNT*2]; /* key sched: chaining vars + tweak + "rot" */ 633#else 634 u64 kw[WCNT+4]; /* key schedule words : chaining vars + tweak */ 635#endif 636 637 /* local copy of vars, for speed */ 638 u64 X00, X01, X02, X03, X04, X05, X06, X07, 639 X08, X09, X10, X11, X12, X13, X14, X15; 640 u64 w[WCNT]; /* local copy of input block */ 641 642 skein_assert(blk_cnt != 0); /* never call with blk_cnt == 0! */ 643 ts[0] = ctx->h.tweak[0]; 644 ts[1] = ctx->h.tweak[1]; 645 do { 646 /* 647 * this implementation only supports 2**64 input bytes 648 * (no carry out here) 649 */ 650 ts[0] += byte_cnt_add; /* update processed length */ 651 652 /* precompute the key schedule for this block */ 653 ks[0] = ctx->x[0]; 654 ks[1] = ctx->x[1]; 655 ks[2] = ctx->x[2]; 656 ks[3] = ctx->x[3]; 657 ks[4] = ctx->x[4]; 658 ks[5] = ctx->x[5]; 659 ks[6] = ctx->x[6]; 660 ks[7] = ctx->x[7]; 661 ks[8] = ctx->x[8]; 662 ks[9] = ctx->x[9]; 663 ks[10] = ctx->x[10]; 664 ks[11] = ctx->x[11]; 665 ks[12] = ctx->x[12]; 666 ks[13] = ctx->x[13]; 667 ks[14] = ctx->x[14]; 668 ks[15] = ctx->x[15]; 669 ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ 670 ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ 671 ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^ 672 ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY; 673 674 ts[2] = ts[0] ^ ts[1]; 675 676 /* get input block in little-endian format */ 677 skein_get64_lsb_first(w, blk_ptr, WCNT); 678 debug_save_tweak(ctx); 679 680 /* do the first full key injection */ 681 X00 = w[0] + ks[0]; 682 X01 = w[1] + ks[1]; 683 X02 = w[2] + ks[2]; 684 X03 = w[3] + ks[3]; 685 X04 = w[4] + ks[4]; 686 X05 = w[5] + ks[5]; 687 X06 = w[6] + ks[6]; 688 X07 = w[7] + ks[7]; 689 X08 = w[8] + ks[8]; 690 X09 = w[9] + ks[9]; 691 X10 = w[10] + ks[10]; 692 X11 = w[11] + ks[11]; 693 X12 = w[12] + ks[12]; 694 X13 = w[13] + ks[13] + ts[0]; 695 X14 = w[14] + ks[14] + ts[1]; 696 X15 = w[15] + ks[15]; 697 698 for (r = 1; 699 r < (SKEIN_UNROLL_1024 ? 2 * RCNT : 2); 700 r += (SKEIN_UNROLL_1024 ? 2 * SKEIN_UNROLL_1024 : 1)) { 701 R1024_8_ROUNDS(0); 702#if R1024_UNROLL_R(1) 703 R1024_8_ROUNDS(1); 704#endif 705#if R1024_UNROLL_R(2) 706 R1024_8_ROUNDS(2); 707#endif 708#if R1024_UNROLL_R(3) 709 R1024_8_ROUNDS(3); 710#endif 711#if R1024_UNROLL_R(4) 712 R1024_8_ROUNDS(4); 713#endif 714#if R1024_UNROLL_R(5) 715 R1024_8_ROUNDS(5); 716#endif 717#if R1024_UNROLL_R(6) 718 R1024_8_ROUNDS(6); 719#endif 720#if R1024_UNROLL_R(7) 721 R1024_8_ROUNDS(7); 722#endif 723#if R1024_UNROLL_R(8) 724 R1024_8_ROUNDS(8); 725#endif 726#if R1024_UNROLL_R(9) 727 R1024_8_ROUNDS(9); 728#endif 729#if R1024_UNROLL_R(10) 730 R1024_8_ROUNDS(10); 731#endif 732#if R1024_UNROLL_R(11) 733 R1024_8_ROUNDS(11); 734#endif 735#if R1024_UNROLL_R(12) 736 R1024_8_ROUNDS(12); 737#endif 738#if R1024_UNROLL_R(13) 739 R1024_8_ROUNDS(13); 740#endif 741#if R1024_UNROLL_R(14) 742 R1024_8_ROUNDS(14); 743#endif 744 } 745 /* do the final "feedforward" xor, update context chaining */ 746 747 ctx->x[0] = X00 ^ w[0]; 748 ctx->x[1] = X01 ^ w[1]; 749 ctx->x[2] = X02 ^ w[2]; 750 ctx->x[3] = X03 ^ w[3]; 751 ctx->x[4] = X04 ^ w[4]; 752 ctx->x[5] = X05 ^ w[5]; 753 ctx->x[6] = X06 ^ w[6]; 754 ctx->x[7] = X07 ^ w[7]; 755 ctx->x[8] = X08 ^ w[8]; 756 ctx->x[9] = X09 ^ w[9]; 757 ctx->x[10] = X10 ^ w[10]; 758 ctx->x[11] = X11 ^ w[11]; 759 ctx->x[12] = X12 ^ w[12]; 760 ctx->x[13] = X13 ^ w[13]; 761 ctx->x[14] = X14 ^ w[14]; 762 ctx->x[15] = X15 ^ w[15]; 763 764 ts[1] &= ~SKEIN_T1_FLAG_FIRST; 765 blk_ptr += SKEIN_1024_BLOCK_BYTES; 766 } while (--blk_cnt); 767 ctx->h.tweak[0] = ts[0]; 768 ctx->h.tweak[1] = ts[1]; 769} 770 771#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) 772size_t skein_1024_process_block_code_size(void) 773{ 774 return ((u8 *) skein_1024_process_block_code_size) - 775 ((u8 *) skein_1024_process_block); 776} 777unsigned int skein_1024_unroll_cnt(void) 778{ 779 return SKEIN_UNROLL_1024; 780} 781#endif 782#endif 783