########################################################################
# Implement fast SHA-256 with AVX2 instructions. (x86_64)
#
# Copyright (C) 2013 Intel Corporation.
#
# Authors:
#     James Guilford <james.guilford@intel.com>
#     Kirk Yap <kirk.s.yap@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
#     Redistribution and use in source and binary forms, with or
#     without modification, are permitted provided that the following
#     conditions are met:
#
#      - Redistributions of source code must retain the above
#        copyright notice, this list of conditions and the following
#        disclaimer.
#
#      - Redistributions in binary form must reproduce the above
#        copyright notice, this list of conditions and the following
#        disclaimer in the documentation and/or other materials
#        provided with the distribution.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
########################################################################
#
# This code is described in an Intel White-Paper:
# "Fast SHA-256 Implementations on Intel Architecture Processors"
#
# To find it, surf to http://www.intel.com/p/en_US/embedded
# and search for that title.
#
########################################################################
# This code schedules 2 blocks at a time, with 4 lanes per block
########################################################################

#ifdef CONFIG_AS_AVX2
#include <linux/linkage.h>

## assume buffers not aligned
#define	VMOVDQ vmovdqu

################################ Define Macros



.macro addm p1 p2
	add	\p1, \p2
	mov	\p2, \p1
.endm
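
# addm folds a working-variable register back into the digest word kept in
# memory: the memory operand \p1 is added into register \p2 and the sum is
# stored back to \p1.  A minimal C sketch of the same update (the names are
# illustrative only, not symbols from this file):
#
#	static inline void addm(u32 *digest_word, u32 *reg)
#	{
#		*reg += *digest_word;	/* add  \p1, \p2 */
#		*digest_word = *reg;	/* mov  \p2, \p1 */
#	}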

################################

X0 = %ymm4
X1 = %ymm5
X2 = %ymm6
X3 = %ymm7


XWORD0 = %xmm4
XWORD1 = %xmm5
XWORD2 = %xmm6
XWORD3 = %xmm7

XTMP0 = %ymm0
XTMP1 = %ymm1
XTMP2 = %ymm2
XTMP3 = %ymm3
XTMP4 = %ymm8
XFER  = %ymm9
XTMP5 = %ymm11

SHUF_00BA =	%ymm10 # shuffle xBxA -> 00BA
SHUF_DC00 =	%ymm12 # shuffle xDxC -> DC00
BYTE_FLIP_MASK = %ymm13

X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK

NUM_BLKS = %rdx	# 3rd arg
INP	= %rsi	# 2nd arg
CTX	= %rdi	# 1st arg
c	= %ecx
d	= %r8d
e	= %edx	# clobbers NUM_BLKS
y3	= %esi	# clobbers INP

SRND	= CTX	# SRND is same register as CTX

a = %eax
b = %ebx
f = %r9d
g = %r10d
h = %r11d
old_h = %r11d

T1 = %r12d
y0 = %r13d
y1 = %r14d
y2 = %r15d


_XFER_SIZE	= 2*64*4	# 2 blocks, 64 rounds, 4 bytes/round
_XMM_SAVE_SIZE	= 0
_INP_END_SIZE	= 8
_INP_SIZE	= 8
_CTX_SIZE	= 8
_RSP_SIZE	= 8

_XFER		= 0
_XMM_SAVE	= _XFER     + _XFER_SIZE
_INP_END	= _XMM_SAVE + _XMM_SAVE_SIZE
_INP		= _INP_END  + _INP_END_SIZE
_CTX		= _INP      + _INP_SIZE
_RSP		= _CTX      + _CTX_SIZE
STACK_SIZE	= _RSP      + _RSP_SIZE
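
# Stack frame sketch (byte offsets from the 32-byte-aligned %rsp):
#
#	_XFER (0..511)	W[t]+K[t] for both interleaved blocks,
#			8 bytes per round (4 bytes/round for each block)
#	_INP_END	pointer to the last input block
#	_INP		current input pointer
#	_CTX		saved digest pointer
#	_RSP		caller's %rsp, restored on exit
#
# SRND is used as a byte offset into the _XFER area; the round loops below
# advance it by 32 bytes per stored XFER vector (4 rounds of both blocks).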



.macro rotate_Xs
	X_ = X0
	X0 = X1
	X1 = X2
	X2 = X3
	X3 = X_
.endm



.macro ROTATE_ARGS
	old_h = h
	TMP_ = h
	h = g
	g = f
	f = e
	e = d
	d = c
	c = b
	b = a
	a = TMP_
.endm
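
# Reference form of the round update that FOUR_ROUNDS_AND_SCHED and
# DO_4ROUNDS implement with rotating register names.  A minimal C sketch
# for readability only; ror32(), sha256_round() and the locals are
# illustrative names, not symbols used by this file:
#
#	static inline u32 ror32(u32 x, unsigned int n)
#	{
#		return (x >> n) | (x << (32 - n));
#	}
#
#	static void sha256_round(u32 s[8], u32 k_plus_w)
#	{
#		u32 a = s[0], b = s[1], c = s[2], d = s[3];
#		u32 e = s[4], f = s[5], g = s[6], h = s[7];
#
#		u32 S1  = ror32(e, 6) ^ ror32(e, 11) ^ ror32(e, 25);
#		u32 ch  = ((f ^ g) & e) ^ g;		/* == Ch(e,f,g)  */
#		u32 t1  = h + S1 + ch + k_plus_w;
#		u32 S0  = ror32(a, 2) ^ ror32(a, 13) ^ ror32(a, 22);
#		u32 maj = ((a | c) & b) | (a & c);	/* == Maj(a,b,c) */
#
#		s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
#		s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + S0 + maj;
#	}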

.macro FOUR_ROUNDS_AND_SCHED disp
################################### RND N + 0 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B

	addl	\disp(%rsp, SRND), h		# h = k + w + h		# --
	or	c, y3		# y3 = a|c                              # MAJA
	vpalignr $4, X2, X3, XTMP0	# XTMP0 = W[-7]
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH
	vpaddd	X0, XTMP0, XTMP0	# XTMP0 = W[-7] + W[-16]
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1

	and	e, y2		# y2 = (f^g)&e                          # CH
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	h, d		# d = k + w + h + d                     # --

	and	b, y3		# y3 = (a|c)&b                          # MAJA
	vpalignr $4, X0, X1, XTMP1	# XTMP1 = W[-15]
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0

	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	vpsrld	$7, XTMP1, XTMP2
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB

	add	y0, y2		# y2 = S1 + CH                          # --
	vpslld	$(32-7), XTMP1, XTMP3
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	vpor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7

	vpsrld	$18, XTMP1, XTMP2
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 1 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 1*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$3, XTMP1, XTMP4	# XTMP4 = W[-15] >> 3
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --

	vpslld	$(32-18), XTMP1, XTMP1
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0

	vpxor	XTMP1, XTMP3, XTMP3
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP2, XTMP3, XTMP3	# XTMP3 = W[-15] ror 7 ^ W[-15] ror 18
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP4, XTMP3, XTMP1	# XTMP1 = s0
	vpshufd	$0b11111010, X3, XTMP2	# XTMP2 = W[-2] {BBAA}
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	vpaddd	XTMP1, XTMP0, XTMP0	# XTMP0 = W[-16] + W[-7] + s0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	vpsrld	$10, XTMP2, XTMP4	# XTMP4 = W[-2] >> 10 {BBAA}


	ROTATE_ARGS

################################### RND N + 2 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	offset = \disp + 2*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --

	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xBxA}
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	or	c, y3		# y3 = a|c                              # MAJA
	mov	f, y2		# y2 = f                                # CH
	xor	g, y2		# y2 = f^g                              # CH

	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xBxA}
	and	e, y2		# y2 = (f^g)&e                          # CH

	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	vpxor	XTMP3, XTMP2, XTMP2
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	vpxor	XTMP2, XTMP4, XTMP4	# XTMP4 = s1 {xBxA}
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpshufb	SHUF_00BA, XTMP4, XTMP4	# XTMP4 = s1 {00BA}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpaddd	XTMP4, XTMP0, XTMP0	# XTMP0 = {..., ..., W[1], W[0]}

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --
	vpshufd	$0b01010000, XTMP0, XTMP2	# XTMP2 = W[-2] {DDCC}

	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --


	ROTATE_ARGS

################################### RND N + 3 ############################

	mov	a, y3		# y3 = a                                # MAJA
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	offset = \disp + 3*4
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c                              # MAJA


	vpsrld	$10, XTMP2, XTMP5	# XTMP5 = W[-2] >> 10 {DDCC}
	mov	f, y2		# y2 = f                                # CH
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	xor	g, y2		# y2 = f^g                              # CH


	vpsrlq	$19, XTMP2, XTMP3	# XTMP3 = W[-2] ror 19 {xDxC}
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	h, d		# d = k + w + h + d                     # --
	and	b, y3		# y3 = (a|c)&b                          # MAJA

	vpsrlq	$17, XTMP2, XTMP2	# XTMP2 = W[-2] ror 17 {xDxC}
	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH

	vpxor	XTMP3, XTMP2, XTMP2
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	add	y0, y2		# y2 = S1 + CH                          # --

	vpxor	XTMP2, XTMP5, XTMP5	# XTMP5 = s1 {xDxC}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	vpshufb	SHUF_DC00, XTMP5, XTMP5	# XTMP5 = s1 {DC00}

	vpaddd	XTMP0, XTMP5, X0	# X0 = {W[3], W[2], W[1], W[0]}
	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	c, T1		# T1 = a&c                              # MAJB
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ

	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS
	rotate_Xs
.endm
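
# Scalar reference for the message schedule that the vector code above
# computes four words at a time for two blocks in parallel.  A minimal C
# sketch (sigma0/sigma1 and W[] are illustrative names, not symbols from
# this file):
#
#	static inline u32 sigma0(u32 x)	/* s0, applied to W[t-15] */
#	{
#		return ror32(x, 7) ^ ror32(x, 18) ^ (x >> 3);
#	}
#
#	static inline u32 sigma1(u32 x)	/* s1, applied to W[t-2] */
#	{
#		return ror32(x, 17) ^ ror32(x, 19) ^ (x >> 10);
#	}
#
#	for (t = 16; t < 64; t++)
#		W[t] = sigma1(W[t - 2]) + W[t - 7] +
#		       sigma0(W[t - 15]) + W[t - 16];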

.macro DO_4ROUNDS disp
################################### RND N + 0 ###########################

	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	addl	\disp(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --
	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 1 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*1 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 2 ##############################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*2 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --

	ROTATE_ARGS

################################### RND N + 3 ###########################

	add	y2, old_h	# h = k + w + h + S0 + S1 + CH = t1 + S0# --
	mov	f, y2		# y2 = f                                # CH
	rorx	$25, e, y0	# y0 = e >> 25				# S1A
	rorx	$11, e, y1	# y1 = e >> 11				# S1B
	xor	g, y2		# y2 = f^g                              # CH

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11)		# S1
	rorx	$6, e, y1	# y1 = (e >> 6)				# S1
	and	e, y2		# y2 = (f^g)&e                          # CH
	add	y3, old_h	# h = t1 + S0 + MAJ                     # --

	xor	y1, y0		# y0 = (e>>25) ^ (e>>11) ^ (e>>6)	# S1
	rorx	$13, a, T1	# T1 = a >> 13				# S0B
	xor	g, y2		# y2 = CH = ((f^g)&e)^g                 # CH
	rorx	$22, a, y1	# y1 = a >> 22				# S0A
	mov	a, y3		# y3 = a                                # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13)		# S0
	rorx	$2, a, T1	# T1 = (a >> 2)				# S0
	offset = 4*3 + \disp
	addl	offset(%rsp, SRND), h	# h = k + w + h			# --
	or	c, y3		# y3 = a|c                              # MAJA

	xor	T1, y1		# y1 = (a>>22) ^ (a>>13) ^ (a>>2)	# S0
	mov	a, T1		# T1 = a                                # MAJB
	and	b, y3		# y3 = (a|c)&b                          # MAJA
	and	c, T1		# T1 = a&c                              # MAJB
	add	y0, y2		# y2 = S1 + CH                          # --


	add	h, d		# d = k + w + h + d                     # --
	or	T1, y3		# y3 = MAJ = ((a|c)&b)|(a&c)            # MAJ
	add	y1, h		# h = k + w + h + S0                    # --

	add	y2, d		# d = k + w + h + d + S1 + CH = d + t1  # --


	add	y2, h		# h = k + w + h + S0 + S1 + CH = t1 + S0# --

	add	y3, h		# h = t1 + S0 + MAJ                     # --

	ROTATE_ARGS

.endm
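
# DO_4ROUNDS is the same round function as FOUR_ROUNDS_AND_SCHED with the
# message-schedule work removed: it only consumes W+K values already stored
# in the _XFER area.  It is used for rounds 48..63 of the first block
# (loop2) and, with a +16 byte displacement selecting the high-lane copies,
# to replay the entire second block from the saved schedule (loop3).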

########################################################################
## void sha256_transform_rorx(UINT32 digest[8], const void *input_data, UINT64 num_blks)
## arg 1 : pointer to digest
## arg 2 : pointer to input data
## arg 3 : Num blocks
########################################################################
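# Caller's view, as a minimal C sketch (the declaration is shown here only
# for illustration; the real prototype is provided by the C glue code):
#
#	asmlinkage void sha256_transform_rorx(u32 *digest, const void *data,
#					      u64 num_blks);
#
#	/* hash num_blks 64-byte blocks starting at data into digest[0..7] */
#	sha256_transform_rorx(digest, data, num_blks);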
.text
ENTRY(sha256_transform_rorx)
.align 32
	pushq	%rbx
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	mov	%rsp, %rax
	subq	$STACK_SIZE, %rsp
	and	$-32, %rsp	# align rsp to 32 byte boundary
	mov	%rax, _RSP(%rsp)


	shl	$6, NUM_BLKS	# convert to bytes
	jz	done_hash
	lea	-64(INP, NUM_BLKS), NUM_BLKS	# pointer to last block
	mov	NUM_BLKS, _INP_END(%rsp)

	cmp	NUM_BLKS, INP
	je	only_one_block

	## load initial digest
	mov	(CTX), a
	mov	4*1(CTX), b
	mov	4*2(CTX), c
	mov	4*3(CTX), d
	mov	4*4(CTX), e
	mov	4*5(CTX), f
	mov	4*6(CTX), g
	mov	4*7(CTX), h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)

loop0:
	## Load first 16 dwords from two blocks
	VMOVDQ	0*32(INP),XTMP0
	VMOVDQ	1*32(INP),XTMP1
	VMOVDQ	2*32(INP),XTMP2
	VMOVDQ	3*32(INP),XTMP3

	## byte swap data
	vpshufb	BYTE_FLIP_MASK, XTMP0, XTMP0
	vpshufb	BYTE_FLIP_MASK, XTMP1, XTMP1
	vpshufb	BYTE_FLIP_MASK, XTMP2, XTMP2
	vpshufb	BYTE_FLIP_MASK, XTMP3, XTMP3

	## transpose data into high/low halves
	vperm2i128	$0x20, XTMP2, XTMP0, X0
	vperm2i128	$0x31, XTMP2, XTMP0, X1
	vperm2i128	$0x20, XTMP3, XTMP1, X2
	vperm2i128	$0x31, XTMP3, XTMP1, X3
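	## After the transpose each Y register holds four consecutive message
	## words of both blocks: block 0 in the low 128-bit lane and block 1
	## in the high lane, e.g. X0 = { blk1 W3..W0 : blk0 W3..W0 }.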

last_block_enter:
	add	$64, INP
	mov	INP, _INP(%rsp)

	## schedule 48 input dwords, by doing 3 iterations of 16 rounds each
	xor	SRND, SRND

.align 16
loop1:
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X0, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 1*32

	vpaddd	K256+2*32(SRND), X0, XFER
	vmovdqa	XFER, 2*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 2*32

	vpaddd	K256+3*32(SRND), X0, XFER
	vmovdqa	XFER, 3*32+_XFER(%rsp, SRND)
	FOUR_ROUNDS_AND_SCHED	_XFER + 3*32

	add	$4*32, SRND
	cmp	$3*4*32, SRND
	jb	loop1
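
	## At this point SRND = 3*4*32: W+K for rounds 0..47 of both blocks
	## has been stored in _XFER, and X0..X3 hold the freshly scheduled
	## W[48..63], so the last 16 rounds below need no further scheduling.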

loop2:
	## Do last 16 rounds with no scheduling
	vpaddd	K256+0*32(SRND), X0, XFER
	vmovdqa	XFER, 0*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 0*32

	vpaddd	K256+1*32(SRND), X1, XFER
	vmovdqa	XFER, 1*32+_XFER(%rsp, SRND)
	DO_4ROUNDS	_XFER + 1*32
	add	$2*32, SRND

	vmovdqa	X2, X0
	vmovdqa	X3, X1

	cmp	$4*4*32, SRND
	jb	loop2

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	ja	done_hash

	#### Do second block using previously scheduled results
	xor	SRND, SRND
.align 16
loop3:
	DO_4ROUNDS	_XFER + 0*32 + 16
	DO_4ROUNDS	_XFER + 1*32 + 16
	add	$2*32, SRND
	cmp	$4*4*32, SRND
	jb	loop3

	mov	_CTX(%rsp), CTX
	mov	_INP(%rsp), INP
	add	$64, INP

	addm    (4*0)(CTX),a
	addm    (4*1)(CTX),b
	addm    (4*2)(CTX),c
	addm    (4*3)(CTX),d
	addm    (4*4)(CTX),e
	addm    (4*5)(CTX),f
	addm    (4*6)(CTX),g
	addm    (4*7)(CTX),h

	cmp	_INP_END(%rsp), INP
	jb	loop0
	ja	done_hash

do_last_block:
	VMOVDQ	0*16(INP),XWORD0
	VMOVDQ	1*16(INP),XWORD1
	VMOVDQ	2*16(INP),XWORD2
	VMOVDQ	3*16(INP),XWORD3

	vpshufb	X_BYTE_FLIP_MASK, XWORD0, XWORD0
	vpshufb	X_BYTE_FLIP_MASK, XWORD1, XWORD1
	vpshufb	X_BYTE_FLIP_MASK, XWORD2, XWORD2
	vpshufb	X_BYTE_FLIP_MASK, XWORD3, XWORD3

	jmp	last_block_enter

only_one_block:

	## load initial digest
	mov	(4*0)(CTX),a
	mov	(4*1)(CTX),b
	mov	(4*2)(CTX),c
	mov	(4*3)(CTX),d
	mov	(4*4)(CTX),e
	mov	(4*5)(CTX),f
	mov	(4*6)(CTX),g
	mov	(4*7)(CTX),h

	vmovdqa	PSHUFFLE_BYTE_FLIP_MASK(%rip), BYTE_FLIP_MASK
	vmovdqa	_SHUF_00BA(%rip), SHUF_00BA
	vmovdqa	_SHUF_DC00(%rip), SHUF_DC00

	mov	CTX, _CTX(%rsp)
	jmp	do_last_block

done_hash:

	mov	_RSP(%rsp), %rsp

	popq	%r15
	popq	%r14
	popq	%r13
	popq	%r12
	popq	%rbx
	ret
ENDPROC(sha256_transform_rorx)

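# Each group of four round constants below is stored twice so that a single
# 32-byte load in the vpaddd above feeds the same constants to both 128-bit
# lanes, i.e. to both interleaved blocks at once.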
.section	.rodata.cst512.K256, "aM", @progbits, 512
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst32.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 32
.align 32
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203,0x0c0d0e0f08090a0b0405060700010203
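# The byte-flip mask reverses the byte order within each 32-bit word,
# converting the big-endian message words to the host order used by the
# arithmetic; the pattern is duplicated so one vpshufb handles both lanes.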


.section	.rodata.cst32._SHUF_00BA, "aM", @progbits, 32
.align 32
_SHUF_00BA:
	.octa 0xFFFFFFFFFFFFFFFF0b0a090803020100,0xFFFFFFFFFFFFFFFF0b0a090803020100


.section	.rodata.cst32._SHUF_DC00, "aM", @progbits, 32
.align 32
_SHUF_DC00:
	.octa 0x0b0a090803020100FFFFFFFFFFFFFFFF,0x0b0a090803020100FFFFFFFFFFFFFFFF

#endif