#include <linux/linkage.h>

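# salsa20_encrypt_bytes generates Salsa20 keystream from the 16-word state
# pointed to by arg1 and xors it into the arg4-byte message at arg2, writing
# the result to arg3.  A C-level view of the calling convention, with names
# assumed here rather than taken from this file:
#
#	void salsa20_encrypt_bytes(u32 state[16], const u8 *src,
#				   u8 *dst, u32 bytes);
#
# The qhasm-style comments below ("x = arg1", "m = arg2", ...) describe each
# instruction in terms of the SysV AMD64 argument registers rdi, rsi, rdx, rcx.
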
# enter salsa20_encrypt_bytes
ENTRY(salsa20_encrypt_bytes)
	mov	%rsp,%r11
	and	$31,%r11
	add	$256,%r11
	sub	%r11,%rsp
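	# The prologue above reserves at least 256 bytes of scratch space and
	# aligns %rsp down to a 32-byte boundary; %r11 keeps the amount
	# subtracted so it can simply be added back at ._done.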
	# x = arg1
	mov	%rdi,%r8
	# m = arg2
	mov	%rsi,%rsi
	# out = arg3
	mov	%rdx,%rdi
	# bytes = arg4
	mov	%rcx,%rdx
	#               unsigned>? bytes - 0
	cmp	$0,%rdx
	# comment:fp stack unchanged by jump
	# goto done if !unsigned>
	jbe	._done
	# comment:fp stack unchanged by fallthrough
# start:
._start:
	# r11_stack = r11
	movq	%r11,0(%rsp)
	# r12_stack = r12
	movq	%r12,8(%rsp)
	# r13_stack = r13
	movq	%r13,16(%rsp)
	# r14_stack = r14
	movq	%r14,24(%rsp)
	# r15_stack = r15
	movq	%r15,32(%rsp)
	# rbx_stack = rbx
	movq	%rbx,40(%rsp)
	# rbp_stack = rbp
	movq	%rbp,48(%rsp)
	# in0 = *(uint64 *) (x + 0)
	movq	0(%r8),%rcx
	# in2 = *(uint64 *) (x + 8)
	movq	8(%r8),%r9
	# in4 = *(uint64 *) (x + 16)
	movq	16(%r8),%rax
	# in6 = *(uint64 *) (x + 24)
	movq	24(%r8),%r10
	# in8 = *(uint64 *) (x + 32)
	movq	32(%r8),%r11
	# in10 = *(uint64 *) (x + 40)
	movq	40(%r8),%r12
	# in12 = *(uint64 *) (x + 48)
	movq	48(%r8),%r13
	# in14 = *(uint64 *) (x + 56)
	movq	56(%r8),%r14
	# j0 = in0
	movq	%rcx,56(%rsp)
	# j2 = in2
	movq	%r9,64(%rsp)
	# j4 = in4
	movq	%rax,72(%rsp)
	# j6 = in6
	movq	%r10,80(%rsp)
	# j8 = in8
	movq	%r11,88(%rsp)
	# j10 = in10
	movq	%r12,96(%rsp)
	# j12 = in12
	movq	%r13,104(%rsp)
	# j14 = in14
	movq	%r14,112(%rsp)
	# x_backup = x
	movq	%r8,120(%rsp)
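	# Scratch frame layout used below (byte offsets from the aligned %rsp):
	#     0- 48  saved r11, r12, r13, r14, r15, rbx, rbp
	#    56-112  j0,j2,...,j14: input state, two 32-bit words per qword
	#   120      x_backup (state pointer)
	#   128      ctarget (real destination for a short final block)
	#   136-152  out_backup, m_backup, bytes_backup
	#   160-176  spill slots for x5, x10, x15
	#   184      round-counter backup
	#   192-255  tmp: 64-byte bounce buffer for partial blocks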
# bytesatleast1:
._bytesatleast1:
	#                   unsigned<? bytes - 64
	cmp	$64,%rdx
	# comment:fp stack unchanged by jump
	#   goto nocopy if !unsigned<
	jae	._nocopy
	#     ctarget = out
	movq	%rdi,128(%rsp)
	#     out = &tmp
	leaq	192(%rsp),%rdi
	#     i = bytes
	mov	%rdx,%rcx
	#     while (i) { *out++ = *m++; --i }
	rep	movsb
	#     out = &tmp
	leaq	192(%rsp),%rdi
	#     m = &tmp
	leaq	192(%rsp),%rsi
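	#     A block shorter than 64 bytes is bounced through tmp: the input
	#     tail is copied into tmp here, a full 64-byte block is generated
	#     there, and only `bytes` bytes are copied back to the real
	#     destination (saved above in ctarget) before returning.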
	# comment:fp stack unchanged by fallthrough
#   nocopy:
._nocopy:
	#   out_backup = out
	movq	%rdi,136(%rsp)
	#   m_backup = m
	movq	%rsi,144(%rsp)
	#   bytes_backup = bytes
	movq	%rdx,152(%rsp)
	#   x1 = j0
	movq	56(%rsp),%rdi
	#   x0 = x1
	mov	%rdi,%rdx
	#   (uint64) x1 >>= 32
	shr	$32,%rdi
	#   		x3 = j2
	movq	64(%rsp),%rsi
	#   		x2 = x3
	mov	%rsi,%rcx
	#   		(uint64) x3 >>= 32
	shr	$32,%rsi
	#   x5 = j4
	movq	72(%rsp),%r8
	#   x4 = x5
	mov	%r8,%r9
	#   (uint64) x5 >>= 32
	shr	$32,%r8
	#   x5_stack = x5
	movq	%r8,160(%rsp)
	#   		x7 = j6
	movq	80(%rsp),%r8
	#   		x6 = x7
	mov	%r8,%rax
	#   		(uint64) x7 >>= 32
	shr	$32,%r8
	#   x9 = j8
	movq	88(%rsp),%r10
	#   x8 = x9
	mov	%r10,%r11
	#   (uint64) x9 >>= 32
	shr	$32,%r10
	#   		x11 = j10
	movq	96(%rsp),%r12
	#   		x10 = x11
	mov	%r12,%r13
	#   		x10_stack = x10
	movq	%r13,168(%rsp)
	#   		(uint64) x11 >>= 32
	shr	$32,%r12
	#   x13 = j12
	movq	104(%rsp),%r13
	#   x12 = x13
	mov	%r13,%r14
	#   (uint64) x13 >>= 32
	shr	$32,%r13
	#   		x15 = j14
	movq	112(%rsp),%r15
	#   		x14 = x15
	mov	%r15,%rbx
	#   		(uint64) x15 >>= 32
	shr	$32,%r15
	#   		x15_stack = x15
	movq	%r15,176(%rsp)
	#   i = 20
	mov	$20,%r15
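	#   Each pass through the loop below performs four Salsa20 rounds (two
	#   double rounds); i counts down from 20 in steps of 4, so the body
	#   runs five times.  Each lea/rol/xor triple is one step of the usual
	#   Salsa20 quarter-round; a C-style sketch for reference only, where
	#   R() is assumed to be a 32-bit left rotation:
	#
	#     quarterround(a, b, c, d):
	#       b ^= R(a + d,  7);
	#       c ^= R(b + a,  9);
	#       d ^= R(c + b, 13);
	#       a ^= R(d + c, 18);
	#
	#   Only 13 of the 16 state words fit in registers, so x5, x10 and x15
	#   rotate through the x5_stack/x10_stack/x15_stack slots noted above.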
#   mainloop:
._mainloop:
	#   i_backup = i
	movq	%r15,184(%rsp)
	# 		x5 = x5_stack
	movq	160(%rsp),%r15
	# a = x12 + x0
	lea	(%r14,%rdx),%rbp
	# (uint32) a <<<= 7
	rol	$7,%ebp
	# x4 ^= a
	xor	%rbp,%r9
	# 		b = x1 + x5
	lea	(%rdi,%r15),%rbp
	# 		(uint32) b <<<= 7
	rol	$7,%ebp
	# 		x9 ^= b
	xor	%rbp,%r10
	# a = x0 + x4
	lea	(%rdx,%r9),%rbp
	# (uint32) a <<<= 9
	rol	$9,%ebp
	# x8 ^= a
	xor	%rbp,%r11
	# 		b = x5 + x9
	lea	(%r15,%r10),%rbp
	# 		(uint32) b <<<= 9
	rol	$9,%ebp
	# 		x13 ^= b
	xor	%rbp,%r13
	# a = x4 + x8
	lea	(%r9,%r11),%rbp
	# (uint32) a <<<= 13
	rol	$13,%ebp
	# x12 ^= a
	xor	%rbp,%r14
	# 		b = x9 + x13
	lea	(%r10,%r13),%rbp
	# 		(uint32) b <<<= 13
	rol	$13,%ebp
	# 		x1 ^= b
	xor	%rbp,%rdi
	# a = x8 + x12
	lea	(%r11,%r14),%rbp
	# (uint32) a <<<= 18
	rol	$18,%ebp
	# x0 ^= a
	xor	%rbp,%rdx
	# 		b = x13 + x1
	lea	(%r13,%rdi),%rbp
	# 		(uint32) b <<<= 18
	rol	$18,%ebp
	# 		x5 ^= b
	xor	%rbp,%r15
	# 				x10 = x10_stack
	movq	168(%rsp),%rbp
	# 		x5_stack = x5
	movq	%r15,160(%rsp)
	# 				c = x6 + x10
	lea	(%rax,%rbp),%r15
	# 				(uint32) c <<<= 7
	rol	$7,%r15d
	# 				x14 ^= c
	xor	%r15,%rbx
	# 				c = x10 + x14
	lea	(%rbp,%rbx),%r15
	# 				(uint32) c <<<= 9
	rol	$9,%r15d
	# 				x2 ^= c
	xor	%r15,%rcx
	# 				c = x14 + x2
	lea	(%rbx,%rcx),%r15
	# 				(uint32) c <<<= 13
	rol	$13,%r15d
	# 				x6 ^= c
	xor	%r15,%rax
	# 				c = x2 + x6
	lea	(%rcx,%rax),%r15
	# 				(uint32) c <<<= 18
	rol	$18,%r15d
	# 				x10 ^= c
	xor	%r15,%rbp
	# 						x15 = x15_stack
	movq	176(%rsp),%r15
	# 				x10_stack = x10
	movq	%rbp,168(%rsp)
	# 						d = x11 + x15
	lea	(%r12,%r15),%rbp
	# 						(uint32) d <<<= 7
	rol	$7,%ebp
	# 						x3 ^= d
	xor	%rbp,%rsi
	# 						d = x15 + x3
	lea	(%r15,%rsi),%rbp
	# 						(uint32) d <<<= 9
	rol	$9,%ebp
	# 						x7 ^= d
	xor	%rbp,%r8
	# 						d = x3 + x7
	lea	(%rsi,%r8),%rbp
	# 						(uint32) d <<<= 13
	rol	$13,%ebp
	# 						x11 ^= d
	xor	%rbp,%r12
	# 						d = x7 + x11
	lea	(%r8,%r12),%rbp
	# 						(uint32) d <<<= 18
	rol	$18,%ebp
	# 						x15 ^= d
	xor	%rbp,%r15
	# 						x15_stack = x15
	movq	%r15,176(%rsp)
	# 		x5 = x5_stack
	movq	160(%rsp),%r15
	# a = x3 + x0
	lea	(%rsi,%rdx),%rbp
	# (uint32) a <<<= 7
	rol	$7,%ebp
	# x1 ^= a
	xor	%rbp,%rdi
	# 		b = x4 + x5
	lea	(%r9,%r15),%rbp
	# 		(uint32) b <<<= 7
	rol	$7,%ebp
	# 		x6 ^= b
	xor	%rbp,%rax
	# a = x0 + x1
	lea	(%rdx,%rdi),%rbp
	# (uint32) a <<<= 9
	rol	$9,%ebp
	# x2 ^= a
	xor	%rbp,%rcx
	# 		b = x5 + x6
	lea	(%r15,%rax),%rbp
	# 		(uint32) b <<<= 9
	rol	$9,%ebp
	# 		x7 ^= b
	xor	%rbp,%r8
	# a = x1 + x2
	lea	(%rdi,%rcx),%rbp
	# (uint32) a <<<= 13
	rol	$13,%ebp
	# x3 ^= a
	xor	%rbp,%rsi
	# 		b = x6 + x7
	lea	(%rax,%r8),%rbp
	# 		(uint32) b <<<= 13
	rol	$13,%ebp
	# 		x4 ^= b
	xor	%rbp,%r9
	# a = x2 + x3
	lea	(%rcx,%rsi),%rbp
	# (uint32) a <<<= 18
	rol	$18,%ebp
	# x0 ^= a
	xor	%rbp,%rdx
	# 		b = x7 + x4
	lea	(%r8,%r9),%rbp
	# 		(uint32) b <<<= 18
	rol	$18,%ebp
	# 		x5 ^= b
	xor	%rbp,%r15
	# 				x10 = x10_stack
	movq	168(%rsp),%rbp
	# 		x5_stack = x5
	movq	%r15,160(%rsp)
	# 				c = x9 + x10
	lea	(%r10,%rbp),%r15
	# 				(uint32) c <<<= 7
	rol	$7,%r15d
	# 				x11 ^= c
	xor	%r15,%r12
	# 				c = x10 + x11
	lea	(%rbp,%r12),%r15
	# 				(uint32) c <<<= 9
	rol	$9,%r15d
	# 				x8 ^= c
	xor	%r15,%r11
	# 				c = x11 + x8
	lea	(%r12,%r11),%r15
	# 				(uint32) c <<<= 13
	rol	$13,%r15d
	# 				x9 ^= c
	xor	%r15,%r10
	# 				c = x8 + x9
	lea	(%r11,%r10),%r15
	# 				(uint32) c <<<= 18
	rol	$18,%r15d
	# 				x10 ^= c
	xor	%r15,%rbp
	# 						x15 = x15_stack
	movq	176(%rsp),%r15
	# 				x10_stack = x10
	movq	%rbp,168(%rsp)
	# 						d = x14 + x15
	lea	(%rbx,%r15),%rbp
	# 						(uint32) d <<<= 7
	rol	$7,%ebp
	# 						x12 ^= d
	xor	%rbp,%r14
	# 						d = x15 + x12
	lea	(%r15,%r14),%rbp
	# 						(uint32) d <<<= 9
	rol	$9,%ebp
	# 						x13 ^= d
	xor	%rbp,%r13
	# 						d = x12 + x13
	lea	(%r14,%r13),%rbp
	# 						(uint32) d <<<= 13
	rol	$13,%ebp
	# 						x14 ^= d
	xor	%rbp,%rbx
	# 						d = x13 + x14
	lea	(%r13,%rbx),%rbp
	# 						(uint32) d <<<= 18
	rol	$18,%ebp
	# 						x15 ^= d
	xor	%rbp,%r15
	# 						x15_stack = x15
	movq	%r15,176(%rsp)
	# 		x5 = x5_stack
	movq	160(%rsp),%r15
	# a = x12 + x0
	lea	(%r14,%rdx),%rbp
	# (uint32) a <<<= 7
	rol	$7,%ebp
	# x4 ^= a
	xor	%rbp,%r9
	# 		b = x1 + x5
	lea	(%rdi,%r15),%rbp
	# 		(uint32) b <<<= 7
	rol	$7,%ebp
	# 		x9 ^= b
	xor	%rbp,%r10
	# a = x0 + x4
	lea	(%rdx,%r9),%rbp
	# (uint32) a <<<= 9
	rol	$9,%ebp
	# x8 ^= a
	xor	%rbp,%r11
	# 		b = x5 + x9
	lea	(%r15,%r10),%rbp
	# 		(uint32) b <<<= 9
	rol	$9,%ebp
	# 		x13 ^= b
	xor	%rbp,%r13
	# a = x4 + x8
	lea	(%r9,%r11),%rbp
	# (uint32) a <<<= 13
	rol	$13,%ebp
	# x12 ^= a
	xor	%rbp,%r14
	# 		b = x9 + x13
	lea	(%r10,%r13),%rbp
	# 		(uint32) b <<<= 13
	rol	$13,%ebp
	# 		x1 ^= b
	xor	%rbp,%rdi
	# a = x8 + x12
	lea	(%r11,%r14),%rbp
	# (uint32) a <<<= 18
	rol	$18,%ebp
	# x0 ^= a
	xor	%rbp,%rdx
	# 		b = x13 + x1
	lea	(%r13,%rdi),%rbp
	# 		(uint32) b <<<= 18
	rol	$18,%ebp
	# 		x5 ^= b
	xor	%rbp,%r15
	# 				x10 = x10_stack
	movq	168(%rsp),%rbp
	# 		x5_stack = x5
	movq	%r15,160(%rsp)
	# 				c = x6 + x10
	lea	(%rax,%rbp),%r15
	# 				(uint32) c <<<= 7
	rol	$7,%r15d
	# 				x14 ^= c
	xor	%r15,%rbx
	# 				c = x10 + x14
	lea	(%rbp,%rbx),%r15
	# 				(uint32) c <<<= 9
	rol	$9,%r15d
	# 				x2 ^= c
	xor	%r15,%rcx
	# 				c = x14 + x2
	lea	(%rbx,%rcx),%r15
	# 				(uint32) c <<<= 13
	rol	$13,%r15d
	# 				x6 ^= c
	xor	%r15,%rax
	# 				c = x2 + x6
	lea	(%rcx,%rax),%r15
	# 				(uint32) c <<<= 18
	rol	$18,%r15d
	# 				x10 ^= c
	xor	%r15,%rbp
	# 						x15 = x15_stack
	movq	176(%rsp),%r15
	# 				x10_stack = x10
	movq	%rbp,168(%rsp)
	# 						d = x11 + x15
	lea	(%r12,%r15),%rbp
	# 						(uint32) d <<<= 7
	rol	$7,%ebp
	# 						x3 ^= d
	xor	%rbp,%rsi
	# 						d = x15 + x3
	lea	(%r15,%rsi),%rbp
	# 						(uint32) d <<<= 9
	rol	$9,%ebp
	# 						x7 ^= d
	xor	%rbp,%r8
	# 						d = x3 + x7
	lea	(%rsi,%r8),%rbp
	# 						(uint32) d <<<= 13
	rol	$13,%ebp
	# 						x11 ^= d
	xor	%rbp,%r12
	# 						d = x7 + x11
	lea	(%r8,%r12),%rbp
	# 						(uint32) d <<<= 18
	rol	$18,%ebp
	# 						x15 ^= d
	xor	%rbp,%r15
	# 						x15_stack = x15
	movq	%r15,176(%rsp)
	# 		x5 = x5_stack
	movq	160(%rsp),%r15
	# a = x3 + x0
	lea	(%rsi,%rdx),%rbp
	# (uint32) a <<<= 7
	rol	$7,%ebp
	# x1 ^= a
	xor	%rbp,%rdi
	# 		b = x4 + x5
	lea	(%r9,%r15),%rbp
	# 		(uint32) b <<<= 7
	rol	$7,%ebp
	# 		x6 ^= b
	xor	%rbp,%rax
	# a = x0 + x1
	lea	(%rdx,%rdi),%rbp
	# (uint32) a <<<= 9
	rol	$9,%ebp
	# x2 ^= a
	xor	%rbp,%rcx
	# 		b = x5 + x6
	lea	(%r15,%rax),%rbp
	# 		(uint32) b <<<= 9
	rol	$9,%ebp
	# 		x7 ^= b
	xor	%rbp,%r8
	# a = x1 + x2
	lea	(%rdi,%rcx),%rbp
	# (uint32) a <<<= 13
	rol	$13,%ebp
	# x3 ^= a
	xor	%rbp,%rsi
	# 		b = x6 + x7
	lea	(%rax,%r8),%rbp
	# 		(uint32) b <<<= 13
	rol	$13,%ebp
	# 		x4 ^= b
	xor	%rbp,%r9
	# a = x2 + x3
	lea	(%rcx,%rsi),%rbp
	# (uint32) a <<<= 18
	rol	$18,%ebp
	# x0 ^= a
	xor	%rbp,%rdx
	# 		b = x7 + x4
	lea	(%r8,%r9),%rbp
	# 		(uint32) b <<<= 18
	rol	$18,%ebp
	# 		x5 ^= b
	xor	%rbp,%r15
	# 				x10 = x10_stack
	movq	168(%rsp),%rbp
	# 		x5_stack = x5
	movq	%r15,160(%rsp)
	# 				c = x9 + x10
	lea	(%r10,%rbp),%r15
	# 				(uint32) c <<<= 7
	rol	$7,%r15d
	# 				x11 ^= c
	xor	%r15,%r12
	# 				c = x10 + x11
	lea	(%rbp,%r12),%r15
	# 				(uint32) c <<<= 9
	rol	$9,%r15d
	# 				x8 ^= c
	xor	%r15,%r11
	# 				c = x11 + x8
	lea	(%r12,%r11),%r15
	# 				(uint32) c <<<= 13
	rol	$13,%r15d
	# 				x9 ^= c
	xor	%r15,%r10
	# 				c = x8 + x9
	lea	(%r11,%r10),%r15
	# 				(uint32) c <<<= 18
	rol	$18,%r15d
	# 				x10 ^= c
	xor	%r15,%rbp
	# 						x15 = x15_stack
	movq	176(%rsp),%r15
	# 				x10_stack = x10
	movq	%rbp,168(%rsp)
	# 						d = x14 + x15
	lea	(%rbx,%r15),%rbp
	# 						(uint32) d <<<= 7
	rol	$7,%ebp
	# 						x12 ^= d
	xor	%rbp,%r14
	# 						d = x15 + x12
	lea	(%r15,%r14),%rbp
	# 						(uint32) d <<<= 9
	rol	$9,%ebp
	# 						x13 ^= d
	xor	%rbp,%r13
	# 						d = x12 + x13
	lea	(%r14,%r13),%rbp
	# 						(uint32) d <<<= 13
	rol	$13,%ebp
	# 						x14 ^= d
	xor	%rbp,%rbx
	# 						d = x13 + x14
	lea	(%r13,%rbx),%rbp
	# 						(uint32) d <<<= 18
	rol	$18,%ebp
	# 						x15 ^= d
	xor	%rbp,%r15
	# 						x15_stack = x15
	movq	%r15,176(%rsp)
	#   i = i_backup
	movq	184(%rsp),%r15
	#                  unsigned>? i -= 4
	sub	$4,%r15
	# comment:fp stack unchanged by jump
	# goto mainloop if unsigned>
	ja	._mainloop
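	#   End of the 20 rounds: add the original input words (j0..j14) back
	#   into the working state ("x[i] += j[i]").  Each register holds two
	#   32-bit state words, so the low word is added with a 32-bit addl
	#   (which also clears the upper half) and the high word is summed
	#   separately by the shl/addq/shr/shl sequence before the halves are
	#   recombined, keeping carries from crossing the 32-bit boundary.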
	#   (uint32) x2 += j2
	addl	64(%rsp),%ecx
	#   x3 <<= 32
	shl	$32,%rsi
	#   x3 += j2
	addq	64(%rsp),%rsi
	#   (uint64) x3 >>= 32
	shr	$32,%rsi
	#   x3 <<= 32
	shl	$32,%rsi
	#   x2 += x3
	add	%rsi,%rcx
	#   (uint32) x6 += j6
	addl	80(%rsp),%eax
	#   x7 <<= 32
	shl	$32,%r8
	#   x7 += j6
	addq	80(%rsp),%r8
	#   (uint64) x7 >>= 32
	shr	$32,%r8
	#   x7 <<= 32
	shl	$32,%r8
	#   x6 += x7
	add	%r8,%rax
	#   (uint32) x8 += j8
	addl	88(%rsp),%r11d
	#   x9 <<= 32
	shl	$32,%r10
	#   x9 += j8
	addq	88(%rsp),%r10
	#   (uint64) x9 >>= 32
	shr	$32,%r10
	#   x9 <<= 32
	shl	$32,%r10
	#   x8 += x9
	add	%r10,%r11
	#   (uint32) x12 += j12
	addl	104(%rsp),%r14d
	#   x13 <<= 32
	shl	$32,%r13
	#   x13 += j12
	addq	104(%rsp),%r13
	#   (uint64) x13 >>= 32
	shr	$32,%r13
	#   x13 <<= 32
	shl	$32,%r13
	#   x12 += x13
	add	%r13,%r14
	#   (uint32) x0 += j0
	addl	56(%rsp),%edx
	#   x1 <<= 32
	shl	$32,%rdi
	#   x1 += j0
	addq	56(%rsp),%rdi
	#   (uint64) x1 >>= 32
	shr	$32,%rdi
	#   x1 <<= 32
	shl	$32,%rdi
	#   x0 += x1
	add	%rdi,%rdx
	#   x5 = x5_stack
	movq	160(%rsp),%rdi
	#   (uint32) x4 += j4
	addl	72(%rsp),%r9d
	#   x5 <<= 32
	shl	$32,%rdi
	#   x5 += j4
	addq	72(%rsp),%rdi
	#   (uint64) x5 >>= 32
	shr	$32,%rdi
	#   x5 <<= 32
	shl	$32,%rdi
	#   x4 += x5
	add	%rdi,%r9
	#   x10 = x10_stack
	movq	168(%rsp),%r8
	#   (uint32) x10 += j10
	addl	96(%rsp),%r8d
	#   x11 <<= 32
	shl	$32,%r12
	#   x11 += j10
	addq	96(%rsp),%r12
	#   (uint64) x11 >>= 32
	shr	$32,%r12
	#   x11 <<= 32
	shl	$32,%r12
	#   x10 += x11
	add	%r12,%r8
	#   x15 = x15_stack
	movq	176(%rsp),%rdi
	#   (uint32) x14 += j14
	addl	112(%rsp),%ebx
	#   x15 <<= 32
	shl	$32,%rdi
	#   x15 += j14
	addq	112(%rsp),%rdi
	#   (uint64) x15 >>= 32
	shr	$32,%rdi
	#   x15 <<= 32
	shl	$32,%rdi
	#   x14 += x15
	add	%rdi,%rbx
	#   out = out_backup
	movq	136(%rsp),%rdi
	#   m = m_backup
	movq	144(%rsp),%rsi
	#   x0 ^= *(uint64 *) (m + 0)
	xorq	0(%rsi),%rdx
	#   *(uint64 *) (out + 0) = x0
	movq	%rdx,0(%rdi)
	#   x2 ^= *(uint64 *) (m + 8)
	xorq	8(%rsi),%rcx
	#   *(uint64 *) (out + 8) = x2
	movq	%rcx,8(%rdi)
	#   x4 ^= *(uint64 *) (m + 16)
	xorq	16(%rsi),%r9
	#   *(uint64 *) (out + 16) = x4
	movq	%r9,16(%rdi)
	#   x6 ^= *(uint64 *) (m + 24)
	xorq	24(%rsi),%rax
	#   *(uint64 *) (out + 24) = x6
	movq	%rax,24(%rdi)
	#   x8 ^= *(uint64 *) (m + 32)
	xorq	32(%rsi),%r11
	#   *(uint64 *) (out + 32) = x8
	movq	%r11,32(%rdi)
	#   x10 ^= *(uint64 *) (m + 40)
	xorq	40(%rsi),%r8
	#   *(uint64 *) (out + 40) = x10
	movq	%r8,40(%rdi)
	#   x12 ^= *(uint64 *) (m + 48)
	xorq	48(%rsi),%r14
	#   *(uint64 *) (out + 48) = x12
	movq	%r14,48(%rdi)
	#   x14 ^= *(uint64 *) (m + 56)
	xorq	56(%rsi),%rbx
	#   *(uint64 *) (out + 56) = x14
	movq	%rbx,56(%rdi)
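	#   The 64 bytes just written are one keystream block xored with the
	#   64 bytes of input, i.e. out[0..63] = m[0..63] ^ keystream.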
	#   bytes = bytes_backup
	movq	152(%rsp),%rdx
	#   in8 = j8
	movq	88(%rsp),%rcx
	#   in8 += 1
	add	$1,%rcx
	#   j8 = in8
	movq	%rcx,88(%rsp)
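	#   Words 8 and 9 of the state share the qword saved as j8, so this
	#   single 64-bit increment advances the block counter, which Salsa20
	#   keeps as a 64-bit little-endian value in those two words.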
	#                          unsigned>? unsigned<? bytes - 64
	cmp	$64,%rdx
	# comment:fp stack unchanged by jump
	#   goto bytesatleast65 if unsigned>
	ja	._bytesatleast65
	# comment:fp stack unchanged by jump
	#     goto bytesatleast64 if !unsigned<
	jae	._bytesatleast64
	#       m = out
	mov	%rdi,%rsi
	#       out = ctarget
	movq	128(%rsp),%rdi
	#       i = bytes
	mov	%rdx,%rcx
	#       while (i) { *out++ = *m++; --i }
	rep	movsb
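	#       Short final block: the full 64-byte block was generated in tmp,
	#       so only the requested number of bytes is copied out to the real
	#       destination recovered from ctarget.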
	# comment:fp stack unchanged by fallthrough
#     bytesatleast64:
._bytesatleast64:
	#     x = x_backup
	movq	120(%rsp),%rdi
	#     in8 = j8
	movq	88(%rsp),%rsi
	#     *(uint64 *) (x + 32) = in8
	movq	%rsi,32(%rdi)
	#     r11 = r11_stack
	movq	0(%rsp),%r11
	#     r12 = r12_stack
	movq	8(%rsp),%r12
	#     r13 = r13_stack
	movq	16(%rsp),%r13
	#     r14 = r14_stack
	movq	24(%rsp),%r14
	#     r15 = r15_stack
	movq	32(%rsp),%r15
	#     rbx = rbx_stack
	movq	40(%rsp),%rbx
	#     rbp = rbp_stack
	movq	48(%rsp),%rbp
	# comment:fp stack unchanged by fallthrough
#     done:
._done:
	#     leave
	add	%r11,%rsp
	mov	%rdi,%rax
	mov	%rsi,%rdx
	ret
#   bytesatleast65:
._bytesatleast65:
	#   bytes -= 64
	sub	$64,%rdx
	#   out += 64
	add	$64,%rdi
	#   m += 64
	add	$64,%rsi
	# comment:fp stack unchanged by jump
	# goto bytesatleast1
	jmp	._bytesatleast1
ENDPROC(salsa20_encrypt_bytes)

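# salsa20_keysetup loads a 128- or 256-bit key into the state matrix pointed
# to by arg1 and fills in the four diagonal constants.  A C-level sketch of
# the expected prototype (names assumed, not taken from this file):
#
#	void salsa20_keysetup(u32 state[16], const u8 *key, u32 keybits);
#
# The state is addressed in bytes here: the two key halves land at byte
# offsets 4..19 and 44..59, i.e. state words 1..4 and 11..14.
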
# enter salsa20_keysetup
ENTRY(salsa20_keysetup)
	mov	%rsp,%r11
	and	$31,%r11
	add	$256,%r11
	sub	%r11,%rsp
	#   k = arg2
	mov	%rsi,%rsi
	#   kbits = arg3
	mov	%rdx,%rdx
	#   x = arg1
	mov	%rdi,%rdi
	#   in0 = *(uint64 *) (k + 0)
	movq	0(%rsi),%r8
	#   in2 = *(uint64 *) (k + 8)
	movq	8(%rsi),%r9
	#   *(uint64 *) (x + 4) = in0
	movq	%r8,4(%rdi)
	#   *(uint64 *) (x + 12) = in2
	movq	%r9,12(%rdi)
	#                    unsigned<? kbits - 256
	cmp	$256,%rdx
	# comment:fp stack unchanged by jump
	#   goto kbits128 if unsigned<
	jb	._kbits128
#   kbits256:
._kbits256:
	#     in10 = *(uint64 *) (k + 16)
	movq	16(%rsi),%rdx
	#     in12 = *(uint64 *) (k + 24)
	movq	24(%rsi),%rsi
	#     *(uint64 *) (x + 44) = in10
	movq	%rdx,44(%rdi)
	#     *(uint64 *) (x + 52) = in12
	movq	%rsi,52(%rdi)
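	#     The four constants below are the Salsa20 sigma constants: the
	#     ASCII string "expand 32-byte k" read as little-endian words,
	#     0x61707865 0x3320646e 0x79622d32 0x6b206574.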
	#     in0 = 1634760805
	mov	$1634760805,%rsi
	#     in4 = 857760878
	mov	$857760878,%rdx
	#     in10 = 2036477234
	mov	$2036477234,%rcx
	#     in14 = 1797285236
	mov	$1797285236,%r8
	#     *(uint32 *) (x + 0) = in0
	movl	%esi,0(%rdi)
	#     *(uint32 *) (x + 20) = in4
	movl	%edx,20(%rdi)
	#     *(uint32 *) (x + 40) = in10
	movl	%ecx,40(%rdi)
	#     *(uint32 *) (x + 60) = in14
	movl	%r8d,60(%rdi)
	# comment:fp stack unchanged by jump
	#   goto keysetupdone
	jmp	._keysetupdone
#   kbits128:
._kbits128:
	#     in10 = *(uint64 *) (k + 0)
	movq	0(%rsi),%rdx
	#     in12 = *(uint64 *) (k + 8)
	movq	8(%rsi),%rsi
	#     *(uint64 *) (x + 44) = in10
	movq	%rdx,44(%rdi)
	#     *(uint64 *) (x + 52) = in12
	movq	%rsi,52(%rdi)
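	#     For a 128-bit key the same 16 key bytes are used for both halves,
	#     and the constants below are the Salsa20 tau constants: the ASCII
	#     string "expand 16-byte k" as little-endian words,
	#     0x61707865 0x3120646e 0x79622d36 0x6b206574.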
	#     in0 = 1634760805
	mov	$1634760805,%rsi
	#     in4 = 824206446
	mov	$824206446,%rdx
	#     in10 = 2036477238
	mov	$2036477238,%rcx
	#     in14 = 1797285236
	mov	$1797285236,%r8
	#     *(uint32 *) (x + 0) = in0
	movl	%esi,0(%rdi)
	#     *(uint32 *) (x + 20) = in4
	movl	%edx,20(%rdi)
	#     *(uint32 *) (x + 40) = in10
	movl	%ecx,40(%rdi)
	#     *(uint32 *) (x + 60) = in14
	movl	%r8d,60(%rdi)
#   keysetupdone:
._keysetupdone:
	# leave
	add	%r11,%rsp
	mov	%rdi,%rax
	mov	%rsi,%rdx
	ret
ENDPROC(salsa20_keysetup)

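# salsa20_ivsetup stores the 8-byte IV into state words 6..7 (byte offset 24)
# and resets the 64-bit block counter in words 8..9 (byte offset 32) to zero.
# Expected C-level prototype (names assumed, not taken from this file):
#
#	void salsa20_ivsetup(u32 state[16], const u8 *iv);
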
# enter salsa20_ivsetup
ENTRY(salsa20_ivsetup)
	mov	%rsp,%r11
	and	$31,%r11
	add	$256,%r11
	sub	%r11,%rsp
	#   iv = arg2
	mov	%rsi,%rsi
	#   x = arg1
	mov	%rdi,%rdi
	#   in6 = *(uint64 *) (iv + 0)
	movq	0(%rsi),%rsi
	#   in8 = 0
	mov	$0,%r8
	#   *(uint64 *) (x + 24) = in6
	movq	%rsi,24(%rdi)
	#   *(uint64 *) (x + 32) = in8
	movq	%r8,32(%rdi)
	# leave
	add	%r11,%rsp
	mov	%rdi,%rax
	mov	%rsi,%rdx
	ret
ENDPROC(salsa20_ivsetup)
