# salsa20_pm.s version 20051229
# D. J. Bernstein
# Public domain.

#include <linux/linkage.h>

.text

# enter salsa20_encrypt_bytes
# Purpose: XOR a byte stream with the Salsa20 keystream, one 64-byte block
# at a time.
# Arguments are read from the caller stack, 4, 8, 12 and 16 bytes above the
# return address:
#   arg1 x      pointer to the 16 uint32 state words (64 bytes)
#   arg2 m      input bytes
#   arg3 out    output bytes
#   arg4 bytes  byte count; zero returns at once without touching memory
# Per block: out = m ^ (rounds(state) + state), after which the 64-bit
# block counter kept in state words 8 and 9 is incremented.  The updated
# counter is stored back to x + 32 and x + 36 once, after the last block.
# Registers: ebx, esi, edi, ebp are saved in the frame and restored;
# eax, ecx, edx and the flags are clobbered.
# NOTE(review): rep movsb copies upward only when the direction flag is
# clear; that is assumed to hold on entry - confirm against the ABI in use.
# Stack frame, relative to esp after it has been lowered by
# 256 + (esp & 31), which leaves esp 32-byte aligned:
#   tmp           0..63    bounce buffer for a final block under 64 bytes
#   x_backup      64
#   m_backup      68
#   out_backup    72
#   bytes_backup  76
#   eax_stack     80       frame size, needed to restore esp
#   saved regs    84..96   ebx, esi, edi, ebp
#   x0..x15       100..160 working state
#   j0..j15       164..224 copy of the input state
#   ctarget       228      real destination while out points at tmp
ENTRY(salsa20_encrypt_bytes)
	# Build the frame: eax = 256 + (esp & 31).
	mov	%esp,%eax
	and	$31,%eax
	add	$256,%eax
	sub	%eax,%esp
	# eax_stack = eax
	movl	%eax,80(%esp)
	# ebx_stack = ebx
	movl	%ebx,84(%esp)
	# esi_stack = esi
	movl	%esi,88(%esp)
	# edi_stack = edi
	movl	%edi,92(%esp)
	# ebp_stack = ebp
	movl	%ebp,96(%esp)
	# esp + eax is the entry value of esp, so the arguments sit at 4(esp,eax) and up.
	# x = arg1
	movl	4(%esp,%eax),%edx
	# m = arg2
	movl	8(%esp,%eax),%esi
	# out = arg3
	movl	12(%esp,%eax),%edi
	# bytes = arg4
	movl	16(%esp,%eax),%ebx
	# goto done if bytes == 0
	test	%ebx,%ebx
	jz	._done
._start:
	# Copy the 16 input state words from x into j0..j15 in the frame.
	# in0 = *(uint32 *) (x + 0)
	movl	0(%edx),%eax
	# in1 = *(uint32 *) (x + 4)
	movl	4(%edx),%ecx
	# in2 = *(uint32 *) (x + 8)
	movl	8(%edx),%ebp
	# j0 = in0
	movl	%eax,164(%esp)
	# in3 = *(uint32 *) (x + 12)
	movl	12(%edx),%eax
	# j1 = in1
	movl	%ecx,168(%esp)
	# in4 = *(uint32 *) (x + 16)
	movl	16(%edx),%ecx
	# j2 = in2
	movl	%ebp,172(%esp)
	# in5 = *(uint32 *) (x + 20)
	movl	20(%edx),%ebp
	# j3 = in3
	movl	%eax,176(%esp)
	# in6 = *(uint32 *) (x + 24)
	movl	24(%edx),%eax
	# j4 = in4
	movl	%ecx,180(%esp)
	# in7 = *(uint32 *) (x + 28)
	movl	28(%edx),%ecx
	# j5 = in5
	movl	%ebp,184(%esp)
	# in8 = *(uint32 *) (x + 32)
	movl	32(%edx),%ebp
	# j6 = in6
	movl	%eax,188(%esp)
	# in9 = *(uint32 *) (x + 36)
	movl	36(%edx),%eax
	# j7 = in7
	movl	%ecx,192(%esp)
	# in10 = *(uint32 *) (x + 40)
	movl	40(%edx),%ecx
	# j8 = in8
	movl	%ebp,196(%esp)
	# in11 = *(uint32 *) (x + 44)
	movl	44(%edx),%ebp
	# j9 = in9
	movl	%eax,200(%esp)
	# in12 = *(uint32 *) (x + 48)
	movl	48(%edx),%eax
	# j10 = in10
	movl	%ecx,204(%esp)
	# in13 = *(uint32 *) (x + 52)
	movl	52(%edx),%ecx
	# j11 = in11
	movl	%ebp,208(%esp)
	# in14 = *(uint32 *) (x + 56)
	movl	56(%edx),%ebp
	# j12 = in12
	movl	%eax,212(%esp)
	# in15 = *(uint32 *) (x + 60)
	movl	60(%edx),%eax
	# j13 = in13
	movl	%ecx,216(%esp)
	# j14 = in14
	movl	%ebp,220(%esp)
	# j15 = in15
	movl	%eax,224(%esp)
	# x_backup = x
	movl	%edx,64(%esp)
	# Top of the per-block loop: ebx = bytes left (nonzero), esi = m, edi = out.
._bytesatleast1:
	#   bytes - 64
	cmp	$64,%ebx
	#   goto nocopy if unsigned>=
	jae	._nocopy
	# Short final block: stage the input in tmp and remember the real
	# destination, so the full-width loads and stores below stay in the frame.
	#     ctarget = out
	movl	%edi,228(%esp)
	#     out = &tmp
	leal	0(%esp),%edi
	#     i = bytes
	mov	%ebx,%ecx
	#     while (i) { *out++ = *m++; --i }
	rep	movsb
	#     out = &tmp
	leal	0(%esp),%edi
	#     m = &tmp
	leal	0(%esp),%esi
._nocopy:
	# All general registers are needed for the rounds, so park the pointers.
	#   out_backup = out
	movl	%edi,72(%esp)
	#   m_backup = m
	movl	%esi,68(%esp)
	#   bytes_backup = bytes
	movl	%ebx,76(%esp)
	# Working state x0..x15 = j0..j15.
	#   in0 = j0
	movl	164(%esp),%eax
	#   in1 = j1
	movl	168(%esp),%ecx
	#   in2 = j2
	movl	172(%esp),%edx
	#   in3 = j3
	movl	176(%esp),%ebx
	#   x0 = in0
	movl	%eax,100(%esp)
	#   x1 = in1
	movl	%ecx,104(%esp)
	#   x2 = in2
	movl	%edx,108(%esp)
	#   x3 = in3
	movl	%ebx,112(%esp)
	#   in4 = j4
	movl	180(%esp),%eax
	#   in5 = j5
	movl	184(%esp),%ecx
	#   in6 = j6
	movl	188(%esp),%edx
	#   in7 = j7
	movl	192(%esp),%ebx
	#   x4 = in4
	movl	%eax,116(%esp)
	#   x5 = in5
	movl	%ecx,120(%esp)
	#   x6 = in6
	movl	%edx,124(%esp)
	#   x7 = in7
	movl	%ebx,128(%esp)
	#   in8 = j8
	movl	196(%esp),%eax
	#   in9 = j9
	movl	200(%esp),%ecx
	#   in10 = j10
	movl	204(%esp),%edx
	#   in11 = j11
	movl	208(%esp),%ebx
	#   x8 = in8
	movl	%eax,132(%esp)
	#   x9 = in9
	movl	%ecx,136(%esp)
	#   x10 = in10
	movl	%edx,140(%esp)
	#   x11 = in11
	movl	%ebx,144(%esp)
	#   in12 = j12
	movl	212(%esp),%eax
	#   in13 = j13
	movl	216(%esp),%ecx
	#   in14 = j14
	movl	220(%esp),%edx
	#   in15 = j15
	movl	224(%esp),%ebx
	#   x12 = in12
	movl	%eax,148(%esp)
	#   x13 = in13
	movl	%ecx,152(%esp)
	#   x14 = in14
	movl	%edx,156(%esp)
	#   x15 = in15
	movl	%ebx,160(%esp)
	# Round counter: each pass of mainloop performs four rounds and
	# subtracts 4, so the loop body runs five times.
	#   i = 20
	mov	$20,%ebp
	# p, s, t, w (eax, ecx, edx, ebx) carry x0, x5, x10, x15 across passes;
	# r and v (esi, edi) are temporaries.  The indentation of the comments
	# in the loop marks the four interleaved dependency chains.
	# p = x0
	movl	100(%esp),%eax
	# s = x5
	movl	120(%esp),%ecx
	# t = x10
	movl	140(%esp),%edx
	# w = x15
	movl	160(%esp),%ebx
._mainloop:
	# Round 1 of 4 in this pass.
	# x0 = p
	movl	%eax,100(%esp)
	# 				x10 = t
	movl	%edx,140(%esp)
	# p += x12
	addl	148(%esp),%eax
	# 		x5 = s
	movl	%ecx,120(%esp)
	# 				t += x6
	addl	124(%esp),%edx
	# 						x15 = w
	movl	%ebx,160(%esp)
	# 		r = x1
	movl	104(%esp),%esi
	# 		r += s
	add	%ecx,%esi
	# 						v = x11
	movl	144(%esp),%edi
	# 						v += w
	add	%ebx,%edi
	# p <<<= 7
	rol	$7,%eax
	# p ^= x4
	xorl	116(%esp),%eax
	# 				t <<<= 7
	rol	$7,%edx
	# 				t ^= x14
	xorl	156(%esp),%edx
	# 		r <<<= 7
	rol	$7,%esi
	# 		r ^= x9
	xorl	136(%esp),%esi
	# 						v <<<= 7
	rol	$7,%edi
	# 						v ^= x3
	xorl	112(%esp),%edi
	# x4 = p
	movl	%eax,116(%esp)
	# 				x14 = t
	movl	%edx,156(%esp)
	# p += x0
	addl	100(%esp),%eax
	# 		x9 = r
	movl	%esi,136(%esp)
	# 				t += x10
	addl	140(%esp),%edx
	# 						x3 = v
	movl	%edi,112(%esp)
	# p <<<= 9
	rol	$9,%eax
	# p ^= x8
	xorl	132(%esp),%eax
	# 				t <<<= 9
	rol	$9,%edx
	# 				t ^= x2
	xorl	108(%esp),%edx
	# 		s += r
	add	%esi,%ecx
	# 		s <<<= 9
	rol	$9,%ecx
	# 		s ^= x13
	xorl	152(%esp),%ecx
	# 						w += v
	add	%edi,%ebx
	# 						w <<<= 9
	rol	$9,%ebx
	# 						w ^= x7
	xorl	128(%esp),%ebx
	# x8 = p
	movl	%eax,132(%esp)
	# 				x2 = t
	movl	%edx,108(%esp)
	# p += x4
	addl	116(%esp),%eax
	# 		x13 = s
	movl	%ecx,152(%esp)
	# 				t += x14
	addl	156(%esp),%edx
	# 						x7 = w
	movl	%ebx,128(%esp)
	# p <<<= 13
	rol	$13,%eax
	# p ^= x12
	xorl	148(%esp),%eax
	# 				t <<<= 13
	rol	$13,%edx
	# 				t ^= x6
	xorl	124(%esp),%edx
	# 		r += s
	add	%ecx,%esi
	# 		r <<<= 13
	rol	$13,%esi
	# 		r ^= x1
	xorl	104(%esp),%esi
	# 						v += w
	add	%ebx,%edi
	# 						v <<<= 13
	rol	$13,%edi
	# 						v ^= x11
	xorl	144(%esp),%edi
	# x12 = p
	movl	%eax,148(%esp)
	# 				x6 = t
	movl	%edx,124(%esp)
	# p += x8
	addl	132(%esp),%eax
	# 		x1 = r
	movl	%esi,104(%esp)
	# 				t += x2
	addl	108(%esp),%edx
	# 						x11 = v
	movl	%edi,144(%esp)
	# p <<<= 18
	rol	$18,%eax
	# p ^= x0
	xorl	100(%esp),%eax
	# 				t <<<= 18
	rol	$18,%edx
	# 				t ^= x10
	xorl	140(%esp),%edx
	# 		s += r
	add	%esi,%ecx
	# 		s <<<= 18
	rol	$18,%ecx
	# 		s ^= x5
	xorl	120(%esp),%ecx
	# 						w += v
	add	%edi,%ebx
	# 						w <<<= 18
	rol	$18,%ebx
	# 						w ^= x15
	xorl	160(%esp),%ebx
	# Round 2 of 4 in this pass.
	# x0 = p
	movl	%eax,100(%esp)
	# 				x10 = t
	movl	%edx,140(%esp)
	# p += x3
	addl	112(%esp),%eax
	# p <<<= 7
	rol	$7,%eax
	# 		x5 = s
	movl	%ecx,120(%esp)
	# 				t += x9
	addl	136(%esp),%edx
	# 						x15 = w
	movl	%ebx,160(%esp)
	# 		r = x4
	movl	116(%esp),%esi
	# 		r += s
	add	%ecx,%esi
	# 						v = x14
	movl	156(%esp),%edi
	# 						v += w
	add	%ebx,%edi
	# p ^= x1
	xorl	104(%esp),%eax
	# 				t <<<= 7
	rol	$7,%edx
	# 				t ^= x11
	xorl	144(%esp),%edx
	# 		r <<<= 7
	rol	$7,%esi
	# 		r ^= x6
	xorl	124(%esp),%esi
	# 						v <<<= 7
	rol	$7,%edi
	# 						v ^= x12
	xorl	148(%esp),%edi
	# x1 = p
	movl	%eax,104(%esp)
	# 				x11 = t
	movl	%edx,144(%esp)
	# p += x0
	addl	100(%esp),%eax
	# 		x6 = r
	movl	%esi,124(%esp)
	# 				t += x10
	addl	140(%esp),%edx
	# 						x12 = v
	movl	%edi,148(%esp)
	# p <<<= 9
	rol	$9,%eax
	# p ^= x2
	xorl	108(%esp),%eax
	# 				t <<<= 9
	rol	$9,%edx
	# 				t ^= x8
	xorl	132(%esp),%edx
	# 		s += r
	add	%esi,%ecx
	# 		s <<<= 9
	rol	$9,%ecx
	# 		s ^= x7
	xorl	128(%esp),%ecx
	# 						w += v
	add	%edi,%ebx
	# 						w <<<= 9
	rol	$9,%ebx
	# 						w ^= x13
	xorl	152(%esp),%ebx
	# x2 = p
	movl	%eax,108(%esp)
	# 				x8 = t
	movl	%edx,132(%esp)
	# p += x1
	addl	104(%esp),%eax
	# 		x7 = s
	movl	%ecx,128(%esp)
	# 				t += x11
	addl	144(%esp),%edx
	# 						x13 = w
	movl	%ebx,152(%esp)
	# p <<<= 13
	rol	$13,%eax
	# p ^= x3
	xorl	112(%esp),%eax
	# 				t <<<= 13
	rol	$13,%edx
	# 				t ^= x9
	xorl	136(%esp),%edx
	# 		r += s
	add	%ecx,%esi
	# 		r <<<= 13
	rol	$13,%esi
	# 		r ^= x4
	xorl	116(%esp),%esi
	# 						v += w
	add	%ebx,%edi
	# 						v <<<= 13
	rol	$13,%edi
	# 						v ^= x14
	xorl	156(%esp),%edi
	# x3 = p
	movl	%eax,112(%esp)
	# 				x9 = t
	movl	%edx,136(%esp)
	# p += x2
	addl	108(%esp),%eax
	# 		x4 = r
	movl	%esi,116(%esp)
	# 				t += x8
	addl	132(%esp),%edx
	# 						x14 = v
	movl	%edi,156(%esp)
	# p <<<= 18
	rol	$18,%eax
	# p ^= x0
	xorl	100(%esp),%eax
	# 				t <<<= 18
	rol	$18,%edx
	# 				t ^= x10
	xorl	140(%esp),%edx
	# 		s += r
	add	%esi,%ecx
	# 		s <<<= 18
	rol	$18,%ecx
	# 		s ^= x5
	xorl	120(%esp),%ecx
	# 						w += v
	add	%edi,%ebx
	# 						w <<<= 18
	rol	$18,%ebx
	# 						w ^= x15
	xorl	160(%esp),%ebx
	# Round 3 of 4 in this pass (same shape as round 1).
	# x0 = p
	movl	%eax,100(%esp)
	# 				x10 = t
	movl	%edx,140(%esp)
	# p += x12
	addl	148(%esp),%eax
	# 		x5 = s
	movl	%ecx,120(%esp)
	# 				t += x6
	addl	124(%esp),%edx
	# 						x15 = w
	movl	%ebx,160(%esp)
	# 		r = x1
	movl	104(%esp),%esi
	# 		r += s
	add	%ecx,%esi
	# 						v = x11
	movl	144(%esp),%edi
	# 						v += w
	add	%ebx,%edi
	# p <<<= 7
	rol	$7,%eax
	# p ^= x4
	xorl	116(%esp),%eax
	# 				t <<<= 7
	rol	$7,%edx
	# 				t ^= x14
	xorl	156(%esp),%edx
	# 		r <<<= 7
	rol	$7,%esi
	# 		r ^= x9
	xorl	136(%esp),%esi
	# 						v <<<= 7
	rol	$7,%edi
	# 						v ^= x3
	xorl	112(%esp),%edi
	# x4 = p
	movl	%eax,116(%esp)
	# 				x14 = t
	movl	%edx,156(%esp)
	# p += x0
	addl	100(%esp),%eax
	# 		x9 = r
	movl	%esi,136(%esp)
	# 				t += x10
	addl	140(%esp),%edx
	# 						x3 = v
	movl	%edi,112(%esp)
	# p <<<= 9
	rol	$9,%eax
	# p ^= x8
	xorl	132(%esp),%eax
	# 				t <<<= 9
	rol	$9,%edx
	# 				t ^= x2
	xorl	108(%esp),%edx
	# 		s += r
	add	%esi,%ecx
	# 		s <<<= 9
	rol	$9,%ecx
	# 		s ^= x13
	xorl	152(%esp),%ecx
	# 						w += v
	add	%edi,%ebx
	# 						w <<<= 9
	rol	$9,%ebx
	# 						w ^= x7
	xorl	128(%esp),%ebx
	# x8 = p
	movl	%eax,132(%esp)
	# 				x2 = t
	movl	%edx,108(%esp)
	# p += x4
	addl	116(%esp),%eax
	# 		x13 = s
	movl	%ecx,152(%esp)
	# 				t += x14
	addl	156(%esp),%edx
	# 						x7 = w
	movl	%ebx,128(%esp)
	# p <<<= 13
	rol	$13,%eax
	# p ^= x12
	xorl	148(%esp),%eax
	# 				t <<<= 13
	rol	$13,%edx
	# 				t ^= x6
	xorl	124(%esp),%edx
	# 		r += s
	add	%ecx,%esi
	# 		r <<<= 13
	rol	$13,%esi
	# 		r ^= x1
	xorl	104(%esp),%esi
	# 						v += w
	add	%ebx,%edi
	# 						v <<<= 13
	rol	$13,%edi
	# 						v ^= x11
	xorl	144(%esp),%edi
	# x12 = p
	movl	%eax,148(%esp)
	# 				x6 = t
	movl	%edx,124(%esp)
	# p += x8
	addl	132(%esp),%eax
	# 		x1 = r
	movl	%esi,104(%esp)
	# 				t += x2
	addl	108(%esp),%edx
	# 						x11 = v
	movl	%edi,144(%esp)
	# p <<<= 18
	rol	$18,%eax
	# p ^= x0
	xorl	100(%esp),%eax
	# 				t <<<= 18
	rol	$18,%edx
	# 				t ^= x10
	xorl	140(%esp),%edx
	# 		s += r
	add	%esi,%ecx
	# 		s <<<= 18
	rol	$18,%ecx
	# 		s ^= x5
	xorl	120(%esp),%ecx
	# 						w += v
	add	%edi,%ebx
	# 						w <<<= 18
	rol	$18,%ebx
	# 						w ^= x15
	xorl	160(%esp),%ebx
	# Round 4 of 4 in this pass (same shape as round 2).
	# x0 = p
	movl	%eax,100(%esp)
	# 				x10 = t
	movl	%edx,140(%esp)
	# p += x3
	addl	112(%esp),%eax
	# p <<<= 7
	rol	$7,%eax
	# 		x5 = s
	movl	%ecx,120(%esp)
	# 				t += x9
	addl	136(%esp),%edx
	# 						x15 = w
	movl	%ebx,160(%esp)
	# 		r = x4
	movl	116(%esp),%esi
	# 		r += s
	add	%ecx,%esi
	# 						v = x14
	movl	156(%esp),%edi
	# 						v += w
	add	%ebx,%edi
	# p ^= x1
	xorl	104(%esp),%eax
	# 				t <<<= 7
	rol	$7,%edx
	# 				t ^= x11
	xorl	144(%esp),%edx
	# 		r <<<= 7
	rol	$7,%esi
	# 		r ^= x6
	xorl	124(%esp),%esi
	# 						v <<<= 7
	rol	$7,%edi
	# 						v ^= x12
	xorl	148(%esp),%edi
	# x1 = p
	movl	%eax,104(%esp)
	# 				x11 = t
	movl	%edx,144(%esp)
	# p += x0
	addl	100(%esp),%eax
	# 		x6 = r
	movl	%esi,124(%esp)
	# 				t += x10
	addl	140(%esp),%edx
	# 						x12 = v
	movl	%edi,148(%esp)
	# p <<<= 9
	rol	$9,%eax
	# p ^= x2
	xorl	108(%esp),%eax
	# 				t <<<= 9
	rol	$9,%edx
	# 				t ^= x8
	xorl	132(%esp),%edx
	# 		s += r
	add	%esi,%ecx
	# 		s <<<= 9
	rol	$9,%ecx
	# 		s ^= x7
	xorl	128(%esp),%ecx
	# 						w += v
	add	%edi,%ebx
	# 						w <<<= 9
	rol	$9,%ebx
	# 						w ^= x13
	xorl	152(%esp),%ebx
	# x2 = p
	movl	%eax,108(%esp)
	# 				x8 = t
	movl	%edx,132(%esp)
	# p += x1
	addl	104(%esp),%eax
	# 		x7 = s
	movl	%ecx,128(%esp)
	# 				t += x11
	addl	144(%esp),%edx
	# 						x13 = w
	movl	%ebx,152(%esp)
	# p <<<= 13
	rol	$13,%eax
	# p ^= x3
	xorl	112(%esp),%eax
	# 				t <<<= 13
	rol	$13,%edx
	# 				t ^= x9
	xorl	136(%esp),%edx
	# 		r += s
	add	%ecx,%esi
	# 		r <<<= 13
	rol	$13,%esi
	# 		r ^= x4
	xorl	116(%esp),%esi
	# 						v += w
	add	%ebx,%edi
	# 						v <<<= 13
	rol	$13,%edi
	# 						v ^= x14
	xorl	156(%esp),%edi
	# x3 = p
	movl	%eax,112(%esp)
	# 				x9 = t
	movl	%edx,136(%esp)
	# p += x2
	addl	108(%esp),%eax
	# 		x4 = r
	movl	%esi,116(%esp)
	# 				t += x8
	addl	132(%esp),%edx
	# 						x14 = v
	movl	%edi,156(%esp)
	# p <<<= 18
	rol	$18,%eax
	# p ^= x0
	xorl	100(%esp),%eax
	# 				t <<<= 18
	rol	$18,%edx
	# 				t ^= x10
	xorl	140(%esp),%edx
	# 		s += r
	add	%esi,%ecx
	# 		s <<<= 18
	rol	$18,%ecx
	# 		s ^= x5
	xorl	120(%esp),%ecx
	# 						w += v
	add	%edi,%ebx
	# 						w <<<= 18
	rol	$18,%ebx
	# 						w ^= x15
	xorl	160(%esp),%ebx
	# i -= 4
	sub	$4,%ebp
	# goto mainloop if unsigned >
	ja	._mainloop
	# Rounds finished: spill the four words still held in registers.
	# x0 = p
	movl	%eax,100(%esp)
	# x5 = s
	movl	%ecx,120(%esp)
	# x10 = t
	movl	%edx,140(%esp)
	# x15 = w
	movl	%ebx,160(%esp)
	#   out = out_backup
	movl	72(%esp),%edi
	#   m = m_backup
	movl	68(%esp),%esi
	# Output stage, two words at a time: out[k] = (x[k] + j[k]) ^ m[k].
	#   in0 = x0
	movl	100(%esp),%eax
	#   in1 = x1
	movl	104(%esp),%ecx
	#   in0 += j0
	addl	164(%esp),%eax
	#   in1 += j1
	addl	168(%esp),%ecx
	#   in0 ^= *(uint32 *) (m + 0)
	xorl	0(%esi),%eax
	#   in1 ^= *(uint32 *) (m + 4)
	xorl	4(%esi),%ecx
	#   *(uint32 *) (out + 0) = in0
	movl	%eax,0(%edi)
	#   *(uint32 *) (out + 4) = in1
	movl	%ecx,4(%edi)
	#   in2 = x2
	movl	108(%esp),%eax
	#   in3 = x3
	movl	112(%esp),%ecx
	#   in2 += j2
	addl	172(%esp),%eax
	#   in3 += j3
	addl	176(%esp),%ecx
	#   in2 ^= *(uint32 *) (m + 8)
	xorl	8(%esi),%eax
	#   in3 ^= *(uint32 *) (m + 12)
	xorl	12(%esi),%ecx
	#   *(uint32 *) (out + 8) = in2
	movl	%eax,8(%edi)
	#   *(uint32 *) (out + 12) = in3
	movl	%ecx,12(%edi)
	#   in4 = x4
	movl	116(%esp),%eax
	#   in5 = x5
	movl	120(%esp),%ecx
	#   in4 += j4
	addl	180(%esp),%eax
	#   in5 += j5
	addl	184(%esp),%ecx
	#   in4 ^= *(uint32 *) (m + 16)
	xorl	16(%esi),%eax
	#   in5 ^= *(uint32 *) (m + 20)
	xorl	20(%esi),%ecx
	#   *(uint32 *) (out + 16) = in4
	movl	%eax,16(%edi)
	#   *(uint32 *) (out + 20) = in5
	movl	%ecx,20(%edi)
	#   in6 = x6
	movl	124(%esp),%eax
	#   in7 = x7
	movl	128(%esp),%ecx
	#   in6 += j6
	addl	188(%esp),%eax
	#   in7 += j7
	addl	192(%esp),%ecx
	#   in6 ^= *(uint32 *) (m + 24)
	xorl	24(%esi),%eax
	#   in7 ^= *(uint32 *) (m + 28)
	xorl	28(%esi),%ecx
	#   *(uint32 *) (out + 24) = in6
	movl	%eax,24(%edi)
	#   *(uint32 *) (out + 28) = in7
	movl	%ecx,28(%edi)
	#   in8 = x8
	movl	132(%esp),%eax
	#   in9 = x9
	movl	136(%esp),%ecx
	#   in8 += j8
	addl	196(%esp),%eax
	#   in9 += j9
	addl	200(%esp),%ecx
	#   in8 ^= *(uint32 *) (m + 32)
	xorl	32(%esi),%eax
	#   in9 ^= *(uint32 *) (m + 36)
	xorl	36(%esi),%ecx
	#   *(uint32 *) (out + 32) = in8
	movl	%eax,32(%edi)
	#   *(uint32 *) (out + 36) = in9
	movl	%ecx,36(%edi)
	#   in10 = x10
	movl	140(%esp),%eax
	#   in11 = x11
	movl	144(%esp),%ecx
	#   in10 += j10
	addl	204(%esp),%eax
	#   in11 += j11
	addl	208(%esp),%ecx
	#   in10 ^= *(uint32 *) (m + 40)
	xorl	40(%esi),%eax
	#   in11 ^= *(uint32 *) (m + 44)
	xorl	44(%esi),%ecx
	#   *(uint32 *) (out + 40) = in10
	movl	%eax,40(%edi)
	#   *(uint32 *) (out + 44) = in11
	movl	%ecx,44(%edi)
	#   in12 = x12
	movl	148(%esp),%eax
	#   in13 = x13
	movl	152(%esp),%ecx
	#   in12 += j12
	addl	212(%esp),%eax
	#   in13 += j13
	addl	216(%esp),%ecx
	#   in12 ^= *(uint32 *) (m + 48)
	xorl	48(%esi),%eax
	#   in13 ^= *(uint32 *) (m + 52)
	xorl	52(%esi),%ecx
	#   *(uint32 *) (out + 48) = in12
	movl	%eax,48(%edi)
	#   *(uint32 *) (out + 52) = in13
	movl	%ecx,52(%edi)
	#   in14 = x14
	movl	156(%esp),%eax
	#   in15 = x15
	movl	160(%esp),%ecx
	#   in14 += j14
	addl	220(%esp),%eax
	#   in15 += j15
	addl	224(%esp),%ecx
	#   in14 ^= *(uint32 *) (m + 56)
	xorl	56(%esi),%eax
	#   in15 ^= *(uint32 *) (m + 60)
	xorl	60(%esi),%ecx
	#   *(uint32 *) (out + 56) = in14
	movl	%eax,56(%edi)
	#   *(uint32 *) (out + 60) = in15
	movl	%ecx,60(%edi)
	#   bytes = bytes_backup
	movl	76(%esp),%ebx
	# Advance the 64-bit block counter j9:j8 by one; add (not inc) so that
	# the carry reaches the adc.
	#   in8 = j8
	movl	196(%esp),%eax
	#   in9 = j9
	movl	200(%esp),%ecx
	#   in8 += 1
	add	$1,%eax
	#   in9 += 0 + carry
	adc	$0,%ecx
	#   j8 = in8
	movl	%eax,196(%esp)
	#   j9 = in9
	movl	%ecx,200(%esp)
	#   bytes - 64
	cmp	$64,%ebx
	#   goto bytesatleast65 if unsigned>
	ja	._bytesatleast65
	# The flags from the cmp above are still valid here.
	#     goto bytesatleast64 if unsigned>=
	jae	._bytesatleast64
	# Short final block: the result sits in tmp; copy only the requested
	# number of bytes to the real destination.
	#       m = out
	mov	%edi,%esi
	#       out = ctarget
	movl	228(%esp),%edi
	#       i = bytes
	mov	%ebx,%ecx
	#       while (i) { *out++ = *m++; --i }
	rep	movsb
._bytesatleast64:
	# Last block done: publish the advanced counter to the caller state.
	#     x = x_backup
	movl	64(%esp),%eax
	#     in8 = j8
	movl	196(%esp),%ecx
	#     in9 = j9
	movl	200(%esp),%edx
	#     *(uint32 *) (x + 32) = in8
	movl	%ecx,32(%eax)
	#     *(uint32 *) (x + 36) = in9
	movl	%edx,36(%eax)
._done:
	#     eax = eax_stack
	movl	80(%esp),%eax
	#     ebx = ebx_stack
	movl	84(%esp),%ebx
	#     esi = esi_stack
	movl	88(%esp),%esi
	#     edi = edi_stack
	movl	92(%esp),%edi
	#     ebp = ebp_stack
	movl	96(%esp),%ebp
	#     leave
	add	%eax,%esp
	ret
._bytesatleast65:
	# More than one block was left: step past the block just written.
	#   bytes -= 64
	sub	$64,%ebx
	#   out += 64
	add	$64,%edi
	#   m += 64
	add	$64,%esi
	# goto bytesatleast1
	jmp	._bytesatleast1
ENDPROC(salsa20_encrypt_bytes)

# enter salsa20_keysetup
# Purpose: store the key words and the four constant words into the
# 16-word state at x.
# Arguments are read from the caller stack:
#   arg1 x      pointer to the 16 uint32 state words
#   arg2 k      key bytes; 16 bytes are read, or 32 when kbits >= 256
#   arg3 kbits  key size in bits, compared unsigned against 256; any
#               smaller value selects the 16-byte-key layout
# State words written:
#   words 1..4    k + 0 .. k + 12
#   words 11..14  k + 16 .. k + 28, or k + 0 .. k + 12 again for the short key
#   words 0, 5, 10, 15  constants (see the two branches)
# State words 6..9 are not touched here.
# Registers: ebx, esi, edi, ebp are saved in the frame and restored;
# eax, ecx, edx and the flags are clobbered.
# Stack frame: esp is lowered by 256 + (esp & 31); offset 64 holds that
# frame size and offsets 68..80 hold the saved registers.
ENTRY(salsa20_keysetup)
	mov	%esp,%eax
	and	$31,%eax
	add	$256,%eax
	sub	%eax,%esp
	#   eax_stack = eax
	movl	%eax,64(%esp)
	#   ebx_stack = ebx
	movl	%ebx,68(%esp)
	#   esi_stack = esi
	movl	%esi,72(%esp)
	#   edi_stack = edi
	movl	%edi,76(%esp)
	#   ebp_stack = ebp
	movl	%ebp,80(%esp)
	# esp + eax is the entry value of esp; x is fetched last because the
	# load overwrites eax, the index used to reach the arguments.
	#   k = arg2
	movl	8(%esp,%eax),%ecx
	#   kbits = arg3
	movl	12(%esp,%eax),%edx
	#   x = arg1
	movl	4(%esp,%eax),%eax
	# The first 16 key bytes go to words 1..4 for both key sizes.
	#   in1 = *(uint32 *) (k + 0)
	movl	0(%ecx),%ebx
	#   in2 = *(uint32 *) (k + 4)
	movl	4(%ecx),%esi
	#   in3 = *(uint32 *) (k + 8)
	movl	8(%ecx),%edi
	#   in4 = *(uint32 *) (k + 12)
	movl	12(%ecx),%ebp
	#   *(uint32 *) (x + 4) = in1
	movl	%ebx,4(%eax)
	#   *(uint32 *) (x + 8) = in2
	movl	%esi,8(%eax)
	#   *(uint32 *) (x + 12) = in3
	movl	%edi,12(%eax)
	#   *(uint32 *) (x + 16) = in4
	movl	%ebp,16(%eax)
	#   kbits - 256
	cmp	$256,%edx
	#   goto kbits128 if unsigned<
	jb	._kbits128
._kbits256:
	# Long key: words 11..14 come from the second half of the key.
	#     in11 = *(uint32 *) (k + 16)
	movl	16(%ecx),%edx
	#     in12 = *(uint32 *) (k + 20)
	movl	20(%ecx),%ebx
	#     in13 = *(uint32 *) (k + 24)
	movl	24(%ecx),%esi
	#     in14 = *(uint32 *) (k + 28)
	movl	28(%ecx),%ecx
	#     *(uint32 *) (x + 44) = in11
	movl	%edx,44(%eax)
	#     *(uint32 *) (x + 48) = in12
	movl	%ebx,48(%eax)
	#     *(uint32 *) (x + 52) = in13
	movl	%esi,52(%eax)
	#     *(uint32 *) (x + 56) = in14
	movl	%ecx,56(%eax)
	# The four constants below are 0x61707865, 0x3320646e, 0x79622d32 and
	# 0x6b206574; stored little-endian they spell: expand 32-byte k
	#     in0 = 1634760805
	mov	$1634760805,%ecx
	#     in5 = 857760878
	mov	$857760878,%edx
	#     in10 = 2036477234
	mov	$2036477234,%ebx
	#     in15 = 1797285236
	mov	$1797285236,%esi
	#     *(uint32 *) (x + 0) = in0
	movl	%ecx,0(%eax)
	#     *(uint32 *) (x + 20) = in5
	movl	%edx,20(%eax)
	#     *(uint32 *) (x + 40) = in10
	movl	%ebx,40(%eax)
	#     *(uint32 *) (x + 60) = in15
	movl	%esi,60(%eax)
	#   goto keysetupdone
	jmp	._keysetupdone
._kbits128:
	# Short key: the same 16 key bytes are used again for words 11..14.
	#     in11 = *(uint32 *) (k + 0)
	movl	0(%ecx),%edx
	#     in12 = *(uint32 *) (k + 4)
	movl	4(%ecx),%ebx
	#     in13 = *(uint32 *) (k + 8)
	movl	8(%ecx),%esi
	#     in14 = *(uint32 *) (k + 12)
	movl	12(%ecx),%ecx
	#     *(uint32 *) (x + 44) = in11
	movl	%edx,44(%eax)
	#     *(uint32 *) (x + 48) = in12
	movl	%ebx,48(%eax)
	#     *(uint32 *) (x + 52) = in13
	movl	%esi,52(%eax)
	#     *(uint32 *) (x + 56) = in14
	movl	%ecx,56(%eax)
	# The four constants below are 0x61707865, 0x3120646e, 0x79622d36 and
	# 0x6b206574; stored little-endian they spell: expand 16-byte k
	#     in0 = 1634760805
	mov	$1634760805,%ecx
	#     in5 = 824206446
	mov	$824206446,%edx
	#     in10 = 2036477238
	mov	$2036477238,%ebx
	#     in15 = 1797285236
	mov	$1797285236,%esi
	#     *(uint32 *) (x + 0) = in0
	movl	%ecx,0(%eax)
	#     *(uint32 *) (x + 20) = in5
	movl	%edx,20(%eax)
	#     *(uint32 *) (x + 40) = in10
	movl	%ebx,40(%eax)
	#     *(uint32 *) (x + 60) = in15
	movl	%esi,60(%eax)
._keysetupdone:
	#   eax = eax_stack
	movl	64(%esp),%eax
	#   ebx = ebx_stack
	movl	68(%esp),%ebx
	#   esi = esi_stack
	movl	72(%esp),%esi
	#   edi = edi_stack
	movl	76(%esp),%edi
	#   ebp = ebp_stack
	movl	80(%esp),%ebp
	# leave
	add	%eax,%esp
	ret
ENDPROC(salsa20_keysetup)

# enter salsa20_ivsetup
# Purpose: store the 8-byte IV into state words 6 and 7 and reset the
# 64-bit block counter in state words 8 and 9 to zero.
# Arguments are read from the caller stack:
#   arg1 x   pointer to the 16 uint32 state words
#   arg2 iv  8 IV bytes
# Registers: only eax, ecx and edx are written, so nothing needs saving
# and no stack frame is required.  The earlier version built a 256-byte
# aligned frame and saved ebx, esi, edi, ebp just to hold two zeros.
# NOTE(review): the earlier version left its frame size in eax on return
# as a side effect of the epilogue; this one leaves x there.  Callers are
# assumed to treat the routine as returning nothing - confirm against the
# C prototype.
ENTRY(salsa20_ivsetup)
	#   iv = arg2
	movl	8(%esp),%ecx
	#   x = arg1
	movl	4(%esp),%eax
	# Both IV words are loaded before anything is stored to x.
	#   in6 = *(uint32 *) (iv + 0)
	movl	0(%ecx),%edx
	#   in7 = *(uint32 *) (iv + 4)
	movl	4(%ecx),%ecx
	#   *(uint32 *) (x + 24) = in6
	movl	%edx,24(%eax)
	#   *(uint32 *) (x + 28) = in7
	movl	%ecx,28(%eax)
	#   *(uint32 *) (x + 32) = 0
	movl	$0,32(%eax)
	#   *(uint32 *) (x + 36) = 0
	movl	$0,36(%eax)
	ret
ENDPROC(salsa20_ivsetup)
