/* sha512-armv7-neon.S - ARM/NEON assembly implementation of SHA-512 transform
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

#include <linux/linkage.h>


.syntax unified
.code 32
.fpu neon

.text

/* structure of SHA512_CONTEXT */
#define hd_a 0
#define hd_b ((hd_a) + 8)
#define hd_c ((hd_b) + 8)
#define hd_d ((hd_c) + 8)
#define hd_e ((hd_d) + 8)
#define hd_f ((hd_e) + 8)
#define hd_g ((hd_f) + 8)

/* register macros */
#define RK %r2

#define RA d0
#define RB d1
#define RC d2
#define RD d3
#define RE d4
#define RF d5
#define RG d6
#define RH d7

#define RT0 d8
#define RT1 d9
#define RT2 d10
#define RT3 d11
#define RT4 d12
#define RT5 d13
#define RT6 d14
#define RT7 d15

#define RT01q q4
#define RT23q q5
#define RT45q q6
#define RT67q q7

#define RW0 d16
#define RW1 d17
#define RW2 d18
#define RW3 d19
#define RW4 d20
#define RW5 d21
#define RW6 d22
#define RW7 d23
#define RW8 d24
#define RW9 d25
#define RW10 d26
#define RW11 d27
#define RW12 d28
#define RW13 d29
#define RW14 d30
#define RW15 d31

#define RW01q q8
#define RW23q q9
#define RW45q q10
#define RW67q q11
#define RW89q q12
#define RW1011q q13
#define RW1213q q14
#define RW1415q q15

/***********************************************************************
 * ARM assembly implementation of sha512 transform
 ***********************************************************************/
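/*
 * The two round macros below each compute two consecutive SHA-512 rounds.
 * As a reading aid only, a C-like sketch of one round (rotr64() here is a
 * plain 64-bit right-rotate, not a symbol used in this file):
 *
 *   t1 = h + Sum1(e) + Ch(e, f, g) + k[t] + w[t];
 *   d += t1;
 *   h  = Sum0(a) + Maj(a, b, c) + t1;
 *   (the second round of the pair repeats this with the working
 *    variables shifted by one position)
 *
 *   Sum0(x) = rotr64(x, 28) ^ rotr64(x, 34) ^ rotr64(x, 39)
 *   Sum1(x) = rotr64(x, 14) ^ rotr64(x, 18) ^ rotr64(x, 41)
 *   Ch(x, y, z)  = (x & y) ^ (~x & z)               -> vbsl
 *   Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z)      -> veor + vbsl
 *
 * NEON has no 64-bit rotate instruction, so each rotate is built from a
 * vshr.u64/vshl.u64 pair combined with veor; the combining xors and the
 * message-schedule sigma functions run on q registers so that two 64-bit
 * lanes are processed at once.  rounds2_0_63 additionally interleaves the
 * schedule update w[t+16] = w[t] + S1(w[t+14]) + w[t+9] + S0(w[t+1]) for
 * two w words with the round computation.
 */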
#define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, \
		     rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \
	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
	vshr.u64 RT2, re, #14; \
	vshl.u64 RT3, re, #64 - 14; \
	interleave_op(arg1); \
	vshr.u64 RT4, re, #18; \
	vshl.u64 RT5, re, #64 - 18; \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, re, #41; \
	vshl.u64 RT5, re, #64 - 41; \
	vadd.u64 RT0, RT0, rw0; \
	veor.64 RT23q, RT23q, RT45q; \
	vmov.64 RT7, re; \
	veor.64 RT1, RT2, RT3; \
	vbsl.64 RT7, rf, rg; \
	\
	vadd.u64 RT1, RT1, rh; \
	vshr.u64 RT2, ra, #28; \
	vshl.u64 RT3, ra, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, ra, #34; \
	vshl.u64 RT5, ra, #64 - 34; \
	vadd.u64 RT1, RT1, RT7; \
	\
	/* h = Sum0 (a) + Maj (a, b, c); */ \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, ra, #39; \
	vshl.u64 RT5, ra, #64 - 39; \
	veor.64 RT0, ra, rb; \
	veor.64 RT23q, RT23q, RT45q; \
	vbsl.64 RT0, rc, rb; \
	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
	veor.64 rh, RT2, RT3; \
	\
	/* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
	vshr.u64 RT2, rd, #14; \
	vshl.u64 RT3, rd, #64 - 14; \
	vadd.u64 rh, rh, RT0; \
	vshr.u64 RT4, rd, #18; \
	vshl.u64 RT5, rd, #64 - 18; \
	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, rd, #41; \
	vshl.u64 RT5, rd, #64 - 41; \
	vadd.u64 RT0, RT0, rw1; \
	veor.64 RT23q, RT23q, RT45q; \
	vmov.64 RT7, rd; \
	veor.64 RT1, RT2, RT3; \
	vbsl.64 RT7, re, rf; \
	\
	vadd.u64 RT1, RT1, rg; \
	vshr.u64 RT2, rh, #28; \
	vshl.u64 RT3, rh, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, rh, #34; \
	vshl.u64 RT5, rh, #64 - 34; \
	vadd.u64 RT1, RT1, RT7; \
	\
	/* g = Sum0 (h) + Maj (h, a, b); */ \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, rh, #39; \
	vshl.u64 RT5, rh, #64 - 39; \
	veor.64 RT0, rh, ra; \
	veor.64 RT23q, RT23q, RT45q; \
	vbsl.64 RT0, rb, ra; \
	vadd.u64 rc, rc, RT1; /* c+=t1; */ \
	veor.64 rg, RT2, RT3; \
	\
	/* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \
	/* w[1] += S1 (w[15]) + w[10] + S0 (w[2]); */ \
	\
	/**** S0(w[1:2]) */ \
	\
	/* w[0:1] += w[9:10] */ \
	/* RT23q = rw1:rw2 */ \
	vext.u64 RT23q, rw01q, rw23q, #1; \
	vadd.u64 rw0, rw9; \
	vadd.u64 rg, rg, RT0; \
	vadd.u64 rw1, rw10; \
	vadd.u64 rg, rg, RT1; /* g+=t1; */ \
	\
	vshr.u64 RT45q, RT23q, #1; \
	vshl.u64 RT67q, RT23q, #64 - 1; \
	vshr.u64 RT01q, RT23q, #8; \
	veor.u64 RT45q, RT45q, RT67q; \
	vshl.u64 RT67q, RT23q, #64 - 8; \
	veor.u64 RT45q, RT45q, RT01q; \
	vshr.u64 RT01q, RT23q, #7; \
	veor.u64 RT45q, RT45q, RT67q; \
	\
	/**** S1(w[14:15]) */ \
	vshr.u64 RT23q, rw1415q, #6; \
	veor.u64 RT01q, RT01q, RT45q; \
	vshr.u64 RT45q, rw1415q, #19; \
	vshl.u64 RT67q, rw1415q, #64 - 19; \
	veor.u64 RT23q, RT23q, RT45q; \
	vshr.u64 RT45q, rw1415q, #61; \
	veor.u64 RT23q, RT23q, RT67q; \
	vshl.u64 RT67q, rw1415q, #64 - 61; \
	veor.u64 RT23q, RT23q, RT45q; \
	vadd.u64 rw01q, RT01q; /* w[0:1] += S(w[1:2]) */ \
	veor.u64 RT01q, RT23q, RT67q;

#define vadd_RT01q(rw01q) \
	/* w[0:1] += S(w[14:15]) */ \
	vadd.u64 rw01q, RT01q;

#define dummy(_) /*_*/

#define rounds2_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, \
		      interleave_op1, arg1, interleave_op2, arg2) \
	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
	vshr.u64 RT2, re, #14; \
	vshl.u64 RT3, re, #64 - 14; \
	interleave_op1(arg1); \
	vshr.u64 RT4, re, #18; \
	vshl.u64 RT5, re, #64 - 18; \
	interleave_op2(arg2); \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, re, #41; \
	vshl.u64 RT5, re, #64 - 41; \
	vadd.u64 RT0, RT0, rw0; \
	veor.64 RT23q, RT23q, RT45q; \
	vmov.64 RT7, re; \
	veor.64 RT1, RT2, RT3; \
	vbsl.64 RT7, rf, rg; \
	\
	vadd.u64 RT1, RT1, rh; \
	vshr.u64 RT2, ra, #28; \
	vshl.u64 RT3, ra, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, ra, #34; \
	vshl.u64 RT5, ra, #64 - 34; \
	vadd.u64 RT1, RT1, RT7; \
	\
	/* h = Sum0 (a) + Maj (a, b, c); */ \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, ra, #39; \
	vshl.u64 RT5, ra, #64 - 39; \
	veor.64 RT0, ra, rb; \
	veor.64 RT23q, RT23q, RT45q; \
	vbsl.64 RT0, rc, rb; \
	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
	veor.64 rh, RT2, RT3; \
	\
	/* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
	vshr.u64 RT2, rd, #14; \
	vshl.u64 RT3, rd, #64 - 14; \
	vadd.u64 rh, rh, RT0; \
	vshr.u64 RT4, rd, #18; \
	vshl.u64 RT5, rd, #64 - 18; \
	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, rd, #41; \
	vshl.u64 RT5, rd, #64 - 41; \
	vadd.u64 RT0, RT0, rw1; \
	veor.64 RT23q, RT23q, RT45q; \
	vmov.64 RT7, rd; \
	veor.64 RT1, RT2, RT3; \
	vbsl.64 RT7, re, rf; \
	\
	vadd.u64 RT1, RT1, rg; \
	vshr.u64 RT2, rh, #28; \
	vshl.u64 RT3, rh, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, rh, #34; \
	vshl.u64 RT5, rh, #64 - 34; \
	vadd.u64 RT1, RT1, RT7; \
	\
	/* g = Sum0 (h) + Maj (h, a, b); */ \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, rh, #39; \
	vshl.u64 RT5, rh, #64 - 39; \
	veor.64 RT0, rh, ra; \
	veor.64 RT23q, RT23q, RT45q; \
	vbsl.64 RT0, rb, ra; \
	vadd.u64 rc, rc, RT1; /* c+=t1; */ \
	veor.64 rg, RT2, RT3;

#define vadd_rg_RT0(rg) \
	vadd.u64 rg, rg, RT0;
#define vadd_rg_RT1(rg) \
	vadd.u64 rg, rg, RT1; /* g+=t1; */
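/*
 * For orientation only: a C prototype consistent with the register
 * assignments documented at the entry point below (the authoritative
 * declaration lives in the C glue code that calls this routine; the
 * parameter names here are illustrative):
 *
 *   void sha512_transform_neon(u64 state[8], const void *data,
 *                              const u64 k[80], unsigned int nblks);
 *
 * k[] is the table of 80 SHA-512 round constants; RK (%r2) walks through
 * it via the vld1.64 post-increments in the round macros and is rewound
 * by 8*80 bytes between blocks.
 */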
.align 3
ENTRY(sha512_transform_neon)
	/* Input:
	 *	%r0: SHA512_CONTEXT
	 *	%r1: data
	 *	%r2: u64 k[] constants
	 *	%r3: nblks
	 */
	push {%lr};

	mov %lr, #0;

	/* Load context to d0-d7 */
	vld1.64 {RA-RD}, [%r0]!;
	vld1.64 {RE-RH}, [%r0];
	sub %r0, #(4*8);

	/* Load input to w[16], d16-d31 */
	/* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
	vld1.64 {RW0-RW3}, [%r1]!;
	vld1.64 {RW4-RW7}, [%r1]!;
	vld1.64 {RW8-RW11}, [%r1]!;
	vld1.64 {RW12-RW15}, [%r1]!;
#ifdef __ARMEL__
	/* byteswap */
	vrev64.8 RW01q, RW01q;
	vrev64.8 RW23q, RW23q;
	vrev64.8 RW45q, RW45q;
	vrev64.8 RW67q, RW67q;
	vrev64.8 RW89q, RW89q;
	vrev64.8 RW1011q, RW1011q;
	vrev64.8 RW1213q, RW1213q;
	vrev64.8 RW1415q, RW1415q;
#endif

	/* EABI says that d8-d15 must be preserved by callee. */
	vpush {RT0-RT7};

.Loop:
	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
		     RW23q, RW1415q, RW9, RW10, dummy, _);
	b .Lenter_rounds;

.Loop_rounds:
	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
		     RW23q, RW1415q, RW9, RW10, vadd_RT01q, RW1415q);
.Lenter_rounds:
	rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, RW23q, RW4,
		     RW45q, RW01q, RW11, RW12, vadd_RT01q, RW01q);
	rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, RW45q, RW6,
		     RW67q, RW23q, RW13, RW14, vadd_RT01q, RW23q);
	rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8,
		     RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q);
	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10,
		     RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q);
	rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12,
		     RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q);
	add %lr, #16;
	rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14,
		     RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q);
	cmp %lr, #64;
	rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0,
		     RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q);
	bne .Loop_rounds;

	subs %r3, #1;
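	/*
	 * Rounds 64-79: the expanded message words for these rounds already
	 * sit in RW0-RW15 (the last pending schedule addition is folded in
	 * via vadd_RT01q in the first macro below), so only the state update
	 * remains.  The subs above decremented the block count and set the
	 * flags tested by beq .Lhandle_tail; on non-final blocks the loads
	 * and byte swaps of the next block are interleaved with these rounds
	 * through the interleave_op arguments.
	 */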
	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1,
		      vadd_RT01q, RW1415q, dummy, _);
	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3,
		      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
	beq .Lhandle_tail;
	vld1.64 {RW0-RW3}, [%r1]!;
	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
#ifdef __ARMEL__
	vrev64.8 RW01q, RW01q;
	vrev64.8 RW23q, RW23q;
#endif
	vld1.64 {RW4-RW7}, [%r1]!;
	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
		      vadd_rg_RT0, RA, vadd_rg_RT1, RA);
	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
		      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
#ifdef __ARMEL__
	vrev64.8 RW45q, RW45q;
	vrev64.8 RW67q, RW67q;
#endif
	vld1.64 {RW8-RW11}, [%r1]!;
	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
#ifdef __ARMEL__
	vrev64.8 RW89q, RW89q;
	vrev64.8 RW1011q, RW1011q;
#endif
	vld1.64 {RW12-RW15}, [%r1]!;
	vadd_rg_RT0(RA);
	vadd_rg_RT1(RA);

	/* Load context */
	vld1.64 {RT0-RT3}, [%r0]!;
	vld1.64 {RT4-RT7}, [%r0];
	sub %r0, #(4*8);

#ifdef __ARMEL__
	vrev64.8 RW1213q, RW1213q;
	vrev64.8 RW1415q, RW1415q;
#endif

	vadd.u64 RA, RT0;
	vadd.u64 RB, RT1;
	vadd.u64 RC, RT2;
	vadd.u64 RD, RT3;
	vadd.u64 RE, RT4;
	vadd.u64 RF, RT5;
	vadd.u64 RG, RT6;
	vadd.u64 RH, RT7;

	/* Store the first half of context */
	vst1.64 {RA-RD}, [%r0]!;
	sub RK, $(8*80);
	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
	mov %lr, #0;
	sub %r0, #(4*8);

	b .Loop;

.Lhandle_tail:
	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
		      vadd_rg_RT0, RA, vadd_rg_RT1, RA);
	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
		      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);

	/* Load context to d16-d23 */
	vld1.64 {RW0-RW3}, [%r0]!;
	vadd_rg_RT0(RA);
	vld1.64 {RW4-RW7}, [%r0];
	vadd_rg_RT1(RA);
	sub %r0, #(4*8);

	vadd.u64 RA, RW0;
	vadd.u64 RB, RW1;
	vadd.u64 RC, RW2;
	vadd.u64 RD, RW3;
	vadd.u64 RE, RW4;
	vadd.u64 RF, RW5;
	vadd.u64 RG, RW6;
	vadd.u64 RH, RW7;

	/* Store the first half of context */
	vst1.64 {RA-RD}, [%r0]!;

	/* Clear used registers */
	/* d16-d31 */
	veor.u64 RW01q, RW01q;
	veor.u64 RW23q, RW23q;
	veor.u64 RW45q, RW45q;
	veor.u64 RW67q, RW67q;
	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
	veor.u64 RW89q, RW89q;
	veor.u64 RW1011q, RW1011q;
	veor.u64 RW1213q, RW1213q;
	veor.u64 RW1415q, RW1415q;
	/* d8-d15 */
	vpop {RT0-RT7};
	/* d0-d7 (q0-q3) */
	veor.u64 %q0, %q0;
	veor.u64 %q1, %q1;
	veor.u64 %q2, %q2;
	veor.u64 %q3, %q3;

	pop {%pc};
ENDPROC(sha512_transform_neon)