/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
 *
 * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

.syntax unified
.code 32
.fpu neon

.text


/* Context structure */

#define state_h0 0
#define state_h1 4
#define state_h2 8
#define state_h3 12
#define state_h4 16


/* Constants */

#define K1  0x5A827999
#define K2  0x6ED9EBA1
#define K3  0x8F1BBCDC
#define K4  0xCA62C1D6
.align 4
.LK_VEC:
.LK1:	.long K1, K1, K1, K1
.LK2:	.long K2, K2, K2, K2
.LK3:	.long K3, K3, K3, K3
.LK4:	.long K4, K4, K4, K4


/* Register macros */

#define RSTATE r0
#define RDATA r1
#define RNBLKS r2
#define ROLDSTACK r3
#define RWK lr

#define _a r4
#define _b r5
#define _c r6
#define _d r7
#define _e r8

#define RT0 r9
#define RT1 r10
#define RT2 r11
#define RT3 r12

#define W0 q0
#define W1 q7
#define W2 q2
#define W3 q3
#define W4 q4
#define W5 q6
#define W6 q5
#define W7 q1

#define tmp0 q8
#define tmp1 q9
#define tmp2 q10
#define tmp3 q11

#define qK1 q12
#define qK2 q13
#define qK3 q14
#define qK4 q15

#ifdef CONFIG_CPU_BIG_ENDIAN
#define ARM_LE(code...)
#else
#define ARM_LE(code...)	code
#endif

/* Round function macros. */

#define WK_offs(i) (((i) & 15) * 4)

#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	bic RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	and RT1, c, b; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add RT0, RT0, RT3; \
	add e, e, RT1; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0;

#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, d, b; \
	add e, e, a, ror #(32 - 5); \
	eor RT0, RT0, c; \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT3; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT0;

#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	ldr RT3, [sp, WK_offs(i)]; \
		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	eor RT0, b, c; \
	and RT1, b, c; \
	add e, e, a, ror #(32 - 5); \
		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	and RT0, RT0, d; \
	add RT1, RT1, RT3; \
	add e, e, RT0; \
	ror b, #(32 - 30); \
		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
	add e, e, RT1;

#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
	   W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	_R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)

#define R(a,b,c,d,e,f,i) \
	_R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
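
/*
 * Reference sketch (not assembled): the round bodies above compute the
 * standard SHA-1 step functions, F1 = Ch, F2 = parity, F3 = Maj (F4
 * reuses F2), with K already folded into the message word by the NEON
 * precalc, which parks W[i]+K in a 16-entry ring buffer on the stack
 * addressed by WK_offs(i).  In C, one round is roughly:
 *
 *	static uint32_t rol32(uint32_t x, int n)
 *	{
 *		return (x << n) | (x >> (32 - n));
 *	}
 *
 *	// f = Ch(b,c,d) in rounds 0-19, b^c^d in 20-39 and 60-79,
 *	// Maj(b,c,d) in 40-59; wk = W[i] + K, read back from the stack.
 *	e += rol32(a, 5) + f + wk;
 *	b = rol32(b, 30);
 */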

#define dummy(...)


/* Input expansion macros. */

/********* Precalc macros for rounds 0-15 *************************************/

#define W_PRECALC_00_15() \
	add RWK, sp, #(WK_offs(0)); \
	\
	vld1.32 {W0, W7}, [RDATA]!; \
 ARM_LE(vrev32.8 W0, W0; )	/* big => little */ \
	vld1.32 {W6, W5}, [RDATA]!; \
	vadd.u32 tmp0, W0, curK; \
 ARM_LE(vrev32.8 W7, W7; )	/* big => little */ \
 ARM_LE(vrev32.8 W6, W6; )	/* big => little */ \
	vadd.u32 tmp1, W7, curK; \
 ARM_LE(vrev32.8 W5, W5; )	/* big => little */ \
	vadd.u32 tmp2, W6, curK; \
	vst1.32 {tmp0, tmp1}, [RWK]!; \
	vadd.u32 tmp3, W5, curK; \
	vst1.32 {tmp2, tmp3}, [RWK]; \

#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32 {W0, W7}, [RDATA]!; \

#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add RWK, sp, #(WK_offs(0)); \

#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8 W0, W0; )	/* big => little */ \

#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vld1.32 {W6, W5}, [RDATA]!; \

#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp0, W0, curK; \

#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8 W7, W7; )	/* big => little */ \

#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8 W6, W6; )	/* big => little */ \

#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp1, W7, curK; \

#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
 ARM_LE(vrev32.8 W5, W5; )	/* big => little */ \

#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp2, W6, curK; \

#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32 {tmp0, tmp1}, [RWK]!; \

#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp3, W5, curK; \

#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32 {tmp2, tmp3}, [RWK];
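
/*
 * Reference sketch (not assembled): for rounds 0..15 the schedule is just
 * the input block itself, so the macros above load 16 words, byte-swap
 * them on little-endian CPUs (the ARM_LE vrev32.8 lines), add K1 and
 * store W[i]+K1 to the stack ring buffer.  In C, with a hypothetical
 * get_be32() helper:
 *
 *	for (i = 0; i < 16; i++)
 *		wk[i] = get_be32(data + 4 * i) + 0x5A827999;  // K1
 */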

/********* Precalc macros for rounds 16-31 ************************************/

#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor tmp0, tmp0; \
	vext.8 W, W_m16, W_m12, #8; \

#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add RWK, sp, #(WK_offs(i)); \
	vext.8 tmp0, W_m04, tmp0, #4; \

#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor tmp0, tmp0, W_m16; \
	veor.32 W, W, W_m08; \

#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor tmp1, tmp1; \
	veor W, W, tmp0; \

#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32 tmp0, W, #1; \

#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8 tmp1, tmp1, W, #(16-12); \
	vshr.u32 W, W, #31; \

#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr tmp0, tmp0, W; \
	vshr.u32 W, tmp1, #30; \

#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32 tmp1, tmp1, #2; \

#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor tmp0, tmp0, W; \

#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, tmp0, tmp1; \

#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp0, W, curK; \

#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32 {tmp0}, [RWK];


/********* Precalc macros for rounds 32-79 ************************************/

#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, W_m28; \

#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vext.8 tmp0, W_m08, W_m04, #8; \

#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, W_m16; \

#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	veor W, tmp0; \

#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	add RWK, sp, #(WK_offs(i&~3)); \

#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshl.u32 tmp1, W, #2; \

#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vshr.u32 tmp0, W, #30; \

#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vorr W, tmp0, tmp1; \

#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vadd.u32 tmp0, W, curK; \

#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
	vst1.32 {tmp0}, [RWK];
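
/*
 * Reference sketch (not assembled): both macro groups above vectorize the
 * SHA-1 message schedule four words at a time.  Rounds 16..31 use the
 * standard recurrence
 *
 *	W[i] = rol32(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1);
 *
 * where the W[i-3] input overlaps the four-word group being built, hence
 * the vext/vshl/vshr juggling.  Rounds 32..79 use the equivalent
 * rotate-by-two form, whose inputs all lie outside the group being
 * computed and so fit a straight 4-lane operation:
 *
 *	W[i] = rol32(W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32], 2);
 */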

/*
 * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
 *
 * unsigned int
 * sha1_transform_neon (void *ctx, const unsigned char *data,
 *                      unsigned int nblks)
 */
.align 3
ENTRY(sha1_transform_neon)
	/* input:
	 *	r0: ctx, CTX
	 *	r1: data (64*nblks bytes)
	 *	r2: nblks
	 */

	cmp RNBLKS, #0;
	beq .Ldo_nothing;

	push {r4-r12, lr};
	/*vpush {q4-q7};*/

	adr RT3, .LK_VEC;

	mov ROLDSTACK, sp;

	/* Align stack. */
	sub RT0, sp, #(16*4);
	and RT0, #(~(16-1));
	mov sp, RT0;

	vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */

	/* Get the values of the chaining variables. */
	ldm RSTATE, {_a-_e};

	vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */

#undef curK
#define curK qK1
	/* Precalc 0-15. */
	W_PRECALC_00_15();
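
	/*
	 * Main loop: the 80 scalar rounds of the current block run on the
	 * integer unit while the pre1/pre2/pre3 hooks of the _R macro keep
	 * the NEON unit computing W[]+K roughly 16 rounds ahead (and, in
	 * rounds 64..79, rounds 0..15 of the next block).
	 */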
.Loop:
	/* Transform 0-15 + Precalc 16-31. */
	_R( _a, _b, _c, _d, _e, F1,  0,
	    WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
	    W4, W5, W6, W7, W0, _, _, _ );
	_R( _e, _a, _b, _c, _d, F1,  1,
	    WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
	    W4, W5, W6, W7, W0, _, _, _ );
	_R( _d, _e, _a, _b, _c, F1,  2,
	    WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
	    W4, W5, W6, W7, W0, _, _, _ );
	_R( _c, _d, _e, _a, _b, F1,  3,
	    WPRECALC_16_31_9, WPRECALC_16_31_10, WPRECALC_16_31_11, 16,
	    W4, W5, W6, W7, W0, _, _, _ );

#undef curK
#define curK qK2
	_R( _b, _c, _d, _e, _a, F1,  4,
	    WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
	    W3, W4, W5, W6, W7, _, _, _ );
	_R( _a, _b, _c, _d, _e, F1,  5,
	    WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
	    W3, W4, W5, W6, W7, _, _, _ );
	_R( _e, _a, _b, _c, _d, F1,  6,
	    WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
	    W3, W4, W5, W6, W7, _, _, _ );
	_R( _d, _e, _a, _b, _c, F1,  7,
	    WPRECALC_16_31_9, WPRECALC_16_31_10, WPRECALC_16_31_11, 20,
	    W3, W4, W5, W6, W7, _, _, _ );

	_R( _c, _d, _e, _a, _b, F1,  8,
	    WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
	    W2, W3, W4, W5, W6, _, _, _ );
	_R( _b, _c, _d, _e, _a, F1,  9,
	    WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
	    W2, W3, W4, W5, W6, _, _, _ );
	_R( _a, _b, _c, _d, _e, F1, 10,
	    WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
	    W2, W3, W4, W5, W6, _, _, _ );
	_R( _e, _a, _b, _c, _d, F1, 11,
	    WPRECALC_16_31_9, WPRECALC_16_31_10, WPRECALC_16_31_11, 24,
	    W2, W3, W4, W5, W6, _, _, _ );

	_R( _d, _e, _a, _b, _c, F1, 12,
	    WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
	    W1, W2, W3, W4, W5, _, _, _ );
	_R( _c, _d, _e, _a, _b, F1, 13,
	    WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
	    W1, W2, W3, W4, W5, _, _, _ );
	_R( _b, _c, _d, _e, _a, F1, 14,
	    WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
	    W1, W2, W3, W4, W5, _, _, _ );
	_R( _a, _b, _c, _d, _e, F1, 15,
	    WPRECALC_16_31_9, WPRECALC_16_31_10, WPRECALC_16_31_11, 28,
	    W1, W2, W3, W4, W5, _, _, _ );

	/* Transform 16-63 + Precalc 32-79. */
	_R( _e, _a, _b, _c, _d, F1, 16,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _d, _e, _a, _b, _c, F1, 17,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _c, _d, _e, _a, _b, F1, 18,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _b, _c, _d, _e, _a, F1, 19,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
	    W0, W1, W2, W3, W4, W5, W6, W7);

	_R( _a, _b, _c, _d, _e, F2, 20,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _e, _a, _b, _c, _d, F2, 21,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _d, _e, _a, _b, _c, F2, 22,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _c, _d, _e, _a, _b, F2, 23,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
	    W7, W0, W1, W2, W3, W4, W5, W6);

#undef curK
#define curK qK3
	_R( _b, _c, _d, _e, _a, F2, 24,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _a, _b, _c, _d, _e, F2, 25,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _e, _a, _b, _c, _d, F2, 26,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _d, _e, _a, _b, _c, F2, 27,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
	    W6, W7, W0, W1, W2, W3, W4, W5);

	_R( _c, _d, _e, _a, _b, F2, 28,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _b, _c, _d, _e, _a, F2, 29,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _a, _b, _c, _d, _e, F2, 30,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _e, _a, _b, _c, _d, F2, 31,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
	    W5, W6, W7, W0, W1, W2, W3, W4);

	_R( _d, _e, _a, _b, _c, F2, 32,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
	    W4, W5, W6, W7, W0, W1, W2, W3);
	_R( _c, _d, _e, _a, _b, F2, 33,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
	    W4, W5, W6, W7, W0, W1, W2, W3);
	_R( _b, _c, _d, _e, _a, F2, 34,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
	    W4, W5, W6, W7, W0, W1, W2, W3);
	_R( _a, _b, _c, _d, _e, F2, 35,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
	    W4, W5, W6, W7, W0, W1, W2, W3);

	_R( _e, _a, _b, _c, _d, F2, 36,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
	    W3, W4, W5, W6, W7, W0, W1, W2);
	_R( _d, _e, _a, _b, _c, F2, 37,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
	    W3, W4, W5, W6, W7, W0, W1, W2);
	_R( _c, _d, _e, _a, _b, F2, 38,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
	    W3, W4, W5, W6, W7, W0, W1, W2);
	_R( _b, _c, _d, _e, _a, F2, 39,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
	    W3, W4, W5, W6, W7, W0, W1, W2);

	_R( _a, _b, _c, _d, _e, F3, 40,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
	    W2, W3, W4, W5, W6, W7, W0, W1);
	_R( _e, _a, _b, _c, _d, F3, 41,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
	    W2, W3, W4, W5, W6, W7, W0, W1);
	_R( _d, _e, _a, _b, _c, F3, 42,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
	    W2, W3, W4, W5, W6, W7, W0, W1);
	_R( _c, _d, _e, _a, _b, F3, 43,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
	    W2, W3, W4, W5, W6, W7, W0, W1);
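
	/* The NEON schedule runs 16 rounds ahead of the scalar rounds, so
	 * curK already switches to K4 here: precalc during rounds 44..63
	 * produces W+K for rounds 60..79. */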
#undef curK
#define curK qK4
	_R( _b, _c, _d, _e, _a, F3, 44,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
	    W1, W2, W3, W4, W5, W6, W7, W0);
	_R( _a, _b, _c, _d, _e, F3, 45,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
	    W1, W2, W3, W4, W5, W6, W7, W0);
	_R( _e, _a, _b, _c, _d, F3, 46,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
	    W1, W2, W3, W4, W5, W6, W7, W0);
	_R( _d, _e, _a, _b, _c, F3, 47,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
	    W1, W2, W3, W4, W5, W6, W7, W0);

	_R( _c, _d, _e, _a, _b, F3, 48,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _b, _c, _d, _e, _a, F3, 49,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _a, _b, _c, _d, _e, F3, 50,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
	    W0, W1, W2, W3, W4, W5, W6, W7);
	_R( _e, _a, _b, _c, _d, F3, 51,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
	    W0, W1, W2, W3, W4, W5, W6, W7);

	_R( _d, _e, _a, _b, _c, F3, 52,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _c, _d, _e, _a, _b, F3, 53,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _b, _c, _d, _e, _a, F3, 54,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
	    W7, W0, W1, W2, W3, W4, W5, W6);
	_R( _a, _b, _c, _d, _e, F3, 55,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
	    W7, W0, W1, W2, W3, W4, W5, W6);

	_R( _e, _a, _b, _c, _d, F3, 56,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _d, _e, _a, _b, _c, F3, 57,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _c, _d, _e, _a, _b, F3, 58,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
	    W6, W7, W0, W1, W2, W3, W4, W5);
	_R( _b, _c, _d, _e, _a, F3, 59,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
	    W6, W7, W0, W1, W2, W3, W4, W5);

	subs RNBLKS, #1;

	_R( _a, _b, _c, _d, _e, F4, 60,
	    WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _e, _a, _b, _c, _d, F4, 61,
	    WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _d, _e, _a, _b, _c, F4, 62,
	    WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
	    W5, W6, W7, W0, W1, W2, W3, W4);
	_R( _c, _d, _e, _a, _b, F4, 63,
	    WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
	    W5, W6, W7, W0, W1, W2, W3, W4);

	beq .Lend;
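
	/*
	 * Reference sketch (not assembled): the "Update the chaining
	 * variables" blocks below are the usual SHA-1 feed-forward, in C:
	 *
	 *	h0 += a; h1 += b; h2 += c; h3 += d; h4 += e;
	 *
	 * h4 goes through a separate ldr because only the four scratch
	 * registers RT0..RT3 are free for the ldm.
	 */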
	/* Transform 64-79 + Precalc 0-15 of next block. */
#undef curK
#define curK qK1
	_R( _b, _c, _d, _e, _a, F4, 64,
	    WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _a, _b, _c, _d, _e, F4, 65,
	    WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _e, _a, _b, _c, _d, F4, 66,
	    WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _d, _e, _a, _b, _c, F4, 67,
	    WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );

	_R( _c, _d, _e, _a, _b, F4, 68,
	    dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _b, _c, _d, _e, _a, F4, 69,
	    dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _a, _b, _c, _d, _e, F4, 70,
	    WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _e, _a, _b, _c, _d, F4, 71,
	    WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );

	_R( _d, _e, _a, _b, _c, F4, 72,
	    dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _c, _d, _e, _a, _b, F4, 73,
	    dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _b, _c, _d, _e, _a, F4, 74,
	    WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _a, _b, _c, _d, _e, F4, 75,
	    WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );

	_R( _e, _a, _b, _c, _d, F4, 76,
	    WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _d, _e, _a, _b, _c, F4, 77,
	    WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _c, _d, _e, _a, _b, F4, 78,
	    WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
	_R( _b, _c, _d, _e, _a, F4, 79,
	    WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );

	/* Update the chaining variables. */
	ldm RSTATE, {RT0-RT3};
	add _a, RT0;
	ldr RT0, [RSTATE, #state_h4];
	add _b, RT1;
	add _c, RT2;
	add _d, RT3;
	add _e, RT0;
	stm RSTATE, {_a-_e};

	b .Loop;

.Lend:
	/* Transform 64-79 */
	R( _b, _c, _d, _e, _a, F4, 64 );
	R( _a, _b, _c, _d, _e, F4, 65 );
	R( _e, _a, _b, _c, _d, F4, 66 );
	R( _d, _e, _a, _b, _c, F4, 67 );
	R( _c, _d, _e, _a, _b, F4, 68 );
	R( _b, _c, _d, _e, _a, F4, 69 );
	R( _a, _b, _c, _d, _e, F4, 70 );
	R( _e, _a, _b, _c, _d, F4, 71 );
	R( _d, _e, _a, _b, _c, F4, 72 );
	R( _c, _d, _e, _a, _b, F4, 73 );
	R( _b, _c, _d, _e, _a, F4, 74 );
	R( _a, _b, _c, _d, _e, F4, 75 );
	R( _e, _a, _b, _c, _d, F4, 76 );
	R( _d, _e, _a, _b, _c, F4, 77 );
	R( _c, _d, _e, _a, _b, F4, 78 );
	R( _b, _c, _d, _e, _a, F4, 79 );

	mov sp, ROLDSTACK;

	/* Update the chaining variables. */
	ldm RSTATE, {RT0-RT3};
	add _a, RT0;
	ldr RT0, [RSTATE, #state_h4];
	add _b, RT1;
	add _c, RT2;
	add _d, RT3;
	/*vpop {q4-q7};*/
	add _e, RT0;
	stm RSTATE, {_a-_e};

	pop {r4-r12, pc};

.Ldo_nothing:
	bx lr
ENDPROC(sha1_transform_neon)
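
/*
 * Usage sketch (reference only, hypothetical caller): glue code holding
 * the five-word state contiguously, as the state_h* offsets above assume,
 * would pass whole 64-byte blocks:
 *
 *	uint32_t state[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE,
 *			      0x10325476, 0xC3D2E1F0 };
 *	sha1_transform_neon(state, data, len / 64);
 */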