/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.data
.align 32

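# ROT8 and ROT16 are vpshufb byte-permutation masks: shuffling the bytes of
# each 32-bit lane by these patterns rotates the lane left by 8 and 16 bits
# respectively. CTRINC holds the per-block counter offsets 0..7 that are
# added to the broadcast block counter (state word 12).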
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.text

ENTRY(chacha20_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: 8 data blocks output, o
	# %rdx: 8 data blocks input, i
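	#
	# The C-side prototype is expected to look roughly like the following
	# (an assumption based on the register usage above, not taken from
	# this file):
	#   void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);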

	# This function encrypts eight consecutive ChaCha20 blocks by loading
	# the state matrix in AVX registers eight times. As we need some
	# scratch registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For the final XOR
	# step we transpose the matrix by interleaving 32-, 64- and then
	# 128-bit words, which allows us to do XOR in AVX registers. 8/16-bit
	# word rotation is done with the slightly better performing byte
	# shuffling, 7/12-bit word rotation uses traditional shift+OR.
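	#
	# For reference, a single ChaCha20 quarter-round on words (a, b, c, d)
	# computes (RFC 7539, section 2.1):
	#   a += b; d ^= a; d = rotl32(d, 16);
	#   c += d; b ^= c; b = rotl32(b, 12);
	#   a += b; d ^= a; d = rotl32(d, 8);
	#   c += d; b ^= c; b = rotl32(b, 7);
	# Below, the four column quarter-rounds and then the four diagonal
	# quarter-rounds of each double round are interleaved, so each rotation
	# step is applied to all four quarter-rounds before moving on.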

	vzeroupper
	# 4 * 32 byte stack, 32-byte aligned
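	# %r8 preserves the original stack pointer so it can be restored before
	# returning; the 32-byte alignment is required for the aligned vmovdqa
	# loads/stores of the x0..x3 spill slots below.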
	mov		%rsp, %r8
	and		$~31, %rsp
	sub		$0x80, %rsp

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15
	# x0..3 on stack
	vmovdqa		%ymm0,0x00(%rsp)
	vmovdqa		%ymm1,0x20(%rsp)
	vmovdqa		%ymm2,0x40(%rsp)
	vmovdqa		%ymm3,0x60(%rsp)

	vmovdqa		CTRINC(%rip),%ymm1
	vmovdqa		ROT8(%rip),%ymm2
	vmovdqa		ROT16(%rip),%ymm3

	# x12 += counter values 0-7
	vpaddd		%ymm1,%ymm12,%ymm12

	mov		$10,%ecx
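	# 20 rounds total: 10 iterations of the column + diagonal double round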

.Ldoubleround8:
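	# column round: quarter-rounds on (x0,x4,x8,x12), (x1,x5,x9,x13),
	# (x2,x6,x10,x14), (x3,x7,x11,x15)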
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd		0x00(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm3,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd		0x20(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm3,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd		0x40(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm3,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd		0x60(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm3,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxor		%ymm8,%ymm4,%ymm4
	vpslld		$12,%ymm4,%ymm0
	vpsrld		$20,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxor		%ymm9,%ymm5,%ymm5
	vpslld		$12,%ymm5,%ymm0
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxor		%ymm10,%ymm6,%ymm6
	vpslld		$12,%ymm6,%ymm0
	vpsrld		$20,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxor		%ymm11,%ymm7,%ymm7
	vpslld		$12,%ymm7,%ymm0
	vpsrld		$20,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd		0x00(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm2,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd		0x20(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm2,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd		0x40(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm2,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd		0x60(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm2,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxor		%ymm8,%ymm4,%ymm4
	vpslld		$7,%ymm4,%ymm0
	vpsrld		$25,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxor		%ymm9,%ymm5,%ymm5
	vpslld		$7,%ymm5,%ymm0
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxor		%ymm10,%ymm6,%ymm6
	vpslld		$7,%ymm6,%ymm0
	vpsrld		$25,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxor		%ymm11,%ymm7,%ymm7
	vpslld		$7,%ymm7,%ymm0
	vpsrld		$25,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7

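	# diagonal round: quarter-rounds on (x0,x5,x10,x15), (x1,x6,x11,x12),
	# (x2,x7,x8,x13), (x3,x4,x9,x14)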
	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd		0x00(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm3,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd		0x20(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm3,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd		0x40(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm3,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd		0x60(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm3,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxor		%ymm10,%ymm5,%ymm5
	vpslld		$12,%ymm5,%ymm0
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxor		%ymm11,%ymm6,%ymm6
	vpslld		$12,%ymm6,%ymm0
	vpsrld		$20,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxor		%ymm8,%ymm7,%ymm7
	vpslld		$12,%ymm7,%ymm0
	vpsrld		$20,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxor		%ymm9,%ymm4,%ymm4
	vpslld		$12,%ymm4,%ymm0
	vpsrld		$20,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd		0x00(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm2,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd		0x20(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm2,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd		0x40(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm2,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd		0x60(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm2,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxor		%ymm10,%ymm5,%ymm5
	vpslld		$7,%ymm5,%ymm0
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxor		%ymm11,%ymm6,%ymm6
	vpslld		$7,%ymm6,%ymm0
	vpsrld		$25,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxor		%ymm8,%ymm7,%ymm7
	vpslld		$7,%ymm7,%ymm0
	vpsrld		$25,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxor		%ymm9,%ymm4,%ymm4
	vpslld		$7,%ymm4,%ymm0
	vpsrld		$25,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4

	dec		%ecx
	jnz		.Ldoubleround8

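	# feed-forward: add the original input state s to the working state,
	# as required by the ChaCha20 block function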
	# x0..15[0-7] += s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpaddd		0x00(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpbroadcastd	0x04(%rdi),%ymm0
	vpaddd		0x20(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpbroadcastd	0x08(%rdi),%ymm0
	vpaddd		0x40(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpbroadcastd	0x0c(%rdi),%ymm0
	vpaddd		0x60(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpbroadcastd	0x10(%rdi),%ymm0
	vpaddd		%ymm0,%ymm4,%ymm4
	vpbroadcastd	0x14(%rdi),%ymm0
	vpaddd		%ymm0,%ymm5,%ymm5
	vpbroadcastd	0x18(%rdi),%ymm0
	vpaddd		%ymm0,%ymm6,%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm0
	vpaddd		%ymm0,%ymm7,%ymm7
	vpbroadcastd	0x20(%rdi),%ymm0
	vpaddd		%ymm0,%ymm8,%ymm8
	vpbroadcastd	0x24(%rdi),%ymm0
	vpaddd		%ymm0,%ymm9,%ymm9
	vpbroadcastd	0x28(%rdi),%ymm0
	vpaddd		%ymm0,%ymm10,%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm0
	vpaddd		%ymm0,%ymm11,%ymm11
	vpbroadcastd	0x30(%rdi),%ymm0
	vpaddd		%ymm0,%ymm12,%ymm12
	vpbroadcastd	0x34(%rdi),%ymm0
	vpaddd		%ymm0,%ymm13,%ymm13
	vpbroadcastd	0x38(%rdi),%ymm0
	vpaddd		%ymm0,%ymm14,%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm0
	vpaddd		%ymm0,%ymm15,%ymm15

	# x12 += counter values 0-7 (the broadcast add of s[12] above supplied
	# only the base counter, so the per-block offsets must be re-added)
	vpaddd		%ymm1,%ymm12,%ymm12

	# interleave 32-bit words in state n, n+1
	vmovdqa		0x00(%rsp),%ymm0
	vmovdqa		0x20(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa		%ymm2,0x00(%rsp)
	vmovdqa		%ymm1,0x20(%rsp)
	vmovdqa		0x40(%rsp),%ymm0
	vmovdqa		0x60(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa		%ymm2,0x40(%rsp)
	vmovdqa		%ymm1,0x60(%rsp)
	vmovdqa		%ymm4,%ymm0
	vpunpckldq	%ymm5,%ymm0,%ymm4
	vpunpckhdq	%ymm5,%ymm0,%ymm5
	vmovdqa		%ymm6,%ymm0
	vpunpckldq	%ymm7,%ymm0,%ymm6
	vpunpckhdq	%ymm7,%ymm0,%ymm7
	vmovdqa		%ymm8,%ymm0
	vpunpckldq	%ymm9,%ymm0,%ymm8
	vpunpckhdq	%ymm9,%ymm0,%ymm9
	vmovdqa		%ymm10,%ymm0
	vpunpckldq	%ymm11,%ymm0,%ymm10
	vpunpckhdq	%ymm11,%ymm0,%ymm11
	vmovdqa		%ymm12,%ymm0
	vpunpckldq	%ymm13,%ymm0,%ymm12
	vpunpckhdq	%ymm13,%ymm0,%ymm13
	vmovdqa		%ymm14,%ymm0
	vpunpckldq	%ymm15,%ymm0,%ymm14
	vpunpckhdq	%ymm15,%ymm0,%ymm15

	# interleave 64-bit words in state n, n+2
	vmovdqa		0x00(%rsp),%ymm0
	vmovdqa		0x40(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa		%ymm1,0x00(%rsp)
	vmovdqa		%ymm2,0x40(%rsp)
	vmovdqa		0x20(%rsp),%ymm0
	vmovdqa		0x60(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa		%ymm1,0x20(%rsp)
	vmovdqa		%ymm2,0x60(%rsp)
	vmovdqa		%ymm4,%ymm0
	vpunpcklqdq	%ymm6,%ymm0,%ymm4
	vpunpckhqdq	%ymm6,%ymm0,%ymm6
	vmovdqa		%ymm5,%ymm0
	vpunpcklqdq	%ymm7,%ymm0,%ymm5
	vpunpckhqdq	%ymm7,%ymm0,%ymm7
	vmovdqa		%ymm8,%ymm0
	vpunpcklqdq	%ymm10,%ymm0,%ymm8
	vpunpckhqdq	%ymm10,%ymm0,%ymm10
	vmovdqa		%ymm9,%ymm0
	vpunpcklqdq	%ymm11,%ymm0,%ymm9
	vpunpckhqdq	%ymm11,%ymm0,%ymm11
	vmovdqa		%ymm12,%ymm0
	vpunpcklqdq	%ymm14,%ymm0,%ymm12
	vpunpckhqdq	%ymm14,%ymm0,%ymm14
	vmovdqa		%ymm13,%ymm0
	vpunpcklqdq	%ymm15,%ymm0,%ymm13
	vpunpckhqdq	%ymm15,%ymm0,%ymm15

	# interleave 128-bit words in state n, n+4
	vmovdqa		0x00(%rsp),%ymm0
	vperm2i128	$0x20,%ymm4,%ymm0,%ymm1
	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4
	vmovdqa		%ymm1,0x00(%rsp)
	vmovdqa		0x20(%rsp),%ymm0
	vperm2i128	$0x20,%ymm5,%ymm0,%ymm1
	vperm2i128	$0x31,%ymm5,%ymm0,%ymm5
	vmovdqa		%ymm1,0x20(%rsp)
	vmovdqa		0x40(%rsp),%ymm0
	vperm2i128	$0x20,%ymm6,%ymm0,%ymm1
	vperm2i128	$0x31,%ymm6,%ymm0,%ymm6
	vmovdqa		%ymm1,0x40(%rsp)
	vmovdqa		0x60(%rsp),%ymm0
	vperm2i128	$0x20,%ymm7,%ymm0,%ymm1
	vperm2i128	$0x31,%ymm7,%ymm0,%ymm7
	vmovdqa		%ymm1,0x60(%rsp)
	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
	vmovdqa		%ymm0,%ymm8
	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
	vmovdqa		%ymm0,%ymm9
	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
	vmovdqa		%ymm0,%ymm10
	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
	vmovdqa		%ymm0,%ymm11

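	# After the three interleaving passes, each 32-byte register or stack
	# slot holds one half of a single keystream block: words 0-7 of block n
	# belong at output offset n*0x40, words 8-15 at n*0x40 + 0x20, hence
	# the permuted offsets below.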
	# xor with corresponding input, write to output
	vmovdqa		0x00(%rsp),%ymm0
	vpxor		0x0000(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0000(%rsi)
	vmovdqa		0x20(%rsp),%ymm0
	vpxor		0x0080(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0080(%rsi)
	vmovdqa		0x40(%rsp),%ymm0
	vpxor		0x0040(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0040(%rsi)
	vmovdqa		0x60(%rsp),%ymm0
	vpxor		0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x00c0(%rsi)
	vpxor		0x0100(%rdx),%ymm4,%ymm4
	vmovdqu		%ymm4,0x0100(%rsi)
	vpxor		0x0180(%rdx),%ymm5,%ymm5
	vmovdqu		%ymm5,0x0180(%rsi)
	vpxor		0x0140(%rdx),%ymm6,%ymm6
	vmovdqu		%ymm6,0x0140(%rsi)
	vpxor		0x01c0(%rdx),%ymm7,%ymm7
	vmovdqu		%ymm7,0x01c0(%rsi)
	vpxor		0x0020(%rdx),%ymm8,%ymm8
	vmovdqu		%ymm8,0x0020(%rsi)
	vpxor		0x00a0(%rdx),%ymm9,%ymm9
	vmovdqu		%ymm9,0x00a0(%rsi)
	vpxor		0x0060(%rdx),%ymm10,%ymm10
	vmovdqu		%ymm10,0x0060(%rsi)
	vpxor		0x00e0(%rdx),%ymm11,%ymm11
	vmovdqu		%ymm11,0x00e0(%rsi)
	vpxor		0x0120(%rdx),%ymm12,%ymm12
	vmovdqu		%ymm12,0x0120(%rsi)
	vpxor		0x01a0(%rdx),%ymm13,%ymm13
	vmovdqu		%ymm13,0x01a0(%rsi)
	vpxor		0x0160(%rdx),%ymm14,%ymm14
	vmovdqu		%ymm14,0x0160(%rsi)
	vpxor		0x01e0(%rdx),%ymm15,%ymm15
	vmovdqu		%ymm15,0x01e0(%rsi)

	vzeroupper
	mov		%r8,%rsp
	ret
ENDPROC(chacha20_8block_xor_avx2)