/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>

.data
.align 32

ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.text

ENTRY(chacha20_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: 8 data blocks output, o
	# %rdx: 8 data blocks input, i

	# This function encrypts eight consecutive ChaCha20 blocks by holding
	# eight copies of the state matrix in AVX2 registers, one 32-bit lane
	# per block. As we need some scratch registers, we save the first four
	# registers on the stack. The algorithm performs each operation on the
	# corresponding word of each state matrix, hence requires no word
	# shuffling. For the final XORing step we transpose the matrix by
	# interleaving 32-, 64- and then 128-bit words, which allows us to do
	# XOR in AVX registers. 8/16-bit word rotation is done with the
	# slightly better performing byte shuffling, 7/12-bit word rotation
	# uses traditional shift+OR.

	vzeroupper
	# 4 * 32 byte stack, 32-byte aligned
	mov	%rsp, %r8
	and	$~31, %rsp
	sub	$0x80, %rsp

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15
	# x0..3 on stack
	vmovdqa	%ymm0,0x00(%rsp)
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	%ymm2,0x40(%rsp)
	vmovdqa	%ymm3,0x60(%rsp)

	vmovdqa	CTRINC(%rip),%ymm1
	vmovdqa	ROT8(%rip),%ymm2
	vmovdqa	ROT16(%rip),%ymm3

	# x12 += counter values 0-7
	vpaddd	%ymm1,%ymm12,%ymm12

	mov	$10,%ecx

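	# Each pass of the loop below performs one ChaCha20 double round (a
	# column round followed by a diagonal round) on all eight blocks in
	# parallel; %ecx counts ten double rounds, i.e. 20 rounds total. For
	# reference, one scalar quarter-round as defined in RFC 7539 looks
	# like this C sketch (illustrative only, not part of this file):
	#
	#	a += b; d ^= a; d = rol32(d, 16);
	#	c += d; b ^= c; b = rol32(b, 12);
	#	a += b; d ^= a; d = rol32(d, 8);
	#	c += d; b ^= c; b = rol32(b, 7);
	#
	# The 16- and 8-bit rotations are done with vpshufb and the ROT16/ROT8
	# byte-permutation masks (%ymm3/%ymm2); the 12- and 7-bit rotations
	# use the vpslld/vpsrld/vpor shift+OR sequence, with %ymm0 as scratch.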
.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd	0x00(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm3,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd	0x20(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm3,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd	0x40(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm3,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd	0x60(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm3,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxor	%ymm8,%ymm4,%ymm4
	vpslld	$12,%ymm4,%ymm0
	vpsrld	$20,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxor	%ymm9,%ymm5,%ymm5
	vpslld	$12,%ymm5,%ymm0
	vpsrld	$20,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxor	%ymm10,%ymm6,%ymm6
	vpslld	$12,%ymm6,%ymm0
	vpsrld	$20,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxor	%ymm11,%ymm7,%ymm7
	vpslld	$12,%ymm7,%ymm0
	vpsrld	$20,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd	0x00(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm2,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd	0x20(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm2,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd	0x40(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm2,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd	0x60(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm2,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxor	%ymm8,%ymm4,%ymm4
	vpslld	$7,%ymm4,%ymm0
	vpsrld	$25,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxor	%ymm9,%ymm5,%ymm5
	vpslld	$7,%ymm5,%ymm0
	vpsrld	$25,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxor	%ymm10,%ymm6,%ymm6
	vpslld	$7,%ymm6,%ymm0
	vpsrld	$25,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxor	%ymm11,%ymm7,%ymm7
	vpslld	$7,%ymm7,%ymm0
	vpsrld	$25,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd	0x00(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm3,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd	0x20(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm3,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd	0x40(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm3,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd	0x60(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm3,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxor	%ymm10,%ymm5,%ymm5
	vpslld	$12,%ymm5,%ymm0
	vpsrld	$20,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxor	%ymm11,%ymm6,%ymm6
	vpslld	$12,%ymm6,%ymm0
	vpsrld	$20,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxor	%ymm8,%ymm7,%ymm7
	vpslld	$12,%ymm7,%ymm0
	vpsrld	$20,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxor	%ymm9,%ymm4,%ymm4
	vpslld	$12,%ymm4,%ymm0
	vpsrld	$20,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd	0x00(%rsp),%ymm5,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpxor	%ymm0,%ymm15,%ymm15
	vpshufb	%ymm2,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd	0x20(%rsp),%ymm6,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpxor	%ymm0,%ymm12,%ymm12
	vpshufb	%ymm2,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd	0x40(%rsp),%ymm7,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpxor	%ymm0,%ymm13,%ymm13
	vpshufb	%ymm2,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd	0x60(%rsp),%ymm4,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpxor	%ymm0,%ymm14,%ymm14
	vpshufb	%ymm2,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxor	%ymm10,%ymm5,%ymm5
	vpslld	$7,%ymm5,%ymm0
	vpsrld	$25,%ymm5,%ymm5
	vpor	%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxor	%ymm11,%ymm6,%ymm6
	vpslld	$7,%ymm6,%ymm0
	vpsrld	$25,%ymm6,%ymm6
	vpor	%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxor	%ymm8,%ymm7,%ymm7
	vpslld	$7,%ymm7,%ymm0
	vpsrld	$25,%ymm7,%ymm7
	vpor	%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxor	%ymm9,%ymm4,%ymm4
	vpslld	$7,%ymm4,%ymm0
	vpsrld	$25,%ymm4,%ymm4
	vpor	%ymm0,%ymm4,%ymm4

	dec	%ecx
	jnz	.Ldoubleround8

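	# The rounds are done; the feed-forward below adds the original input
	# state back into the working state (x[i] += s[i]). x12 also gets the
	# CTRINC offsets (0-7) added once more, because each block's counter
	# word is s[12] plus its block index, and that per-block offset has to
	# appear in the output just as it did in the pre-round working state.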
	# x0..15[0-7] += s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpaddd	0x00(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x00(%rsp)
	vpbroadcastd	0x04(%rdi),%ymm0
	vpaddd	0x20(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x20(%rsp)
	vpbroadcastd	0x08(%rdi),%ymm0
	vpaddd	0x40(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x40(%rsp)
	vpbroadcastd	0x0c(%rdi),%ymm0
	vpaddd	0x60(%rsp),%ymm0,%ymm0
	vmovdqa	%ymm0,0x60(%rsp)
	vpbroadcastd	0x10(%rdi),%ymm0
	vpaddd	%ymm0,%ymm4,%ymm4
	vpbroadcastd	0x14(%rdi),%ymm0
	vpaddd	%ymm0,%ymm5,%ymm5
	vpbroadcastd	0x18(%rdi),%ymm0
	vpaddd	%ymm0,%ymm6,%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm0
	vpaddd	%ymm0,%ymm7,%ymm7
	vpbroadcastd	0x20(%rdi),%ymm0
	vpaddd	%ymm0,%ymm8,%ymm8
	vpbroadcastd	0x24(%rdi),%ymm0
	vpaddd	%ymm0,%ymm9,%ymm9
	vpbroadcastd	0x28(%rdi),%ymm0
	vpaddd	%ymm0,%ymm10,%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm0
	vpaddd	%ymm0,%ymm11,%ymm11
	vpbroadcastd	0x30(%rdi),%ymm0
	vpaddd	%ymm0,%ymm12,%ymm12
	vpbroadcastd	0x34(%rdi),%ymm0
	vpaddd	%ymm0,%ymm13,%ymm13
	vpbroadcastd	0x38(%rdi),%ymm0
	vpaddd	%ymm0,%ymm14,%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm0
	vpaddd	%ymm0,%ymm15,%ymm15

	# x12 += counter values 0-7
	vpaddd	%ymm1,%ymm12,%ymm12

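	# At this point each register (or stack slot) holds word n of all
	# eight blocks, one block per 32-bit lane. The three interleave passes
	# below (32-bit and 64-bit via vpunpck*, 128-bit via vperm2i128)
	# transpose that layout so every register ends up holding 32
	# contiguous bytes of a single output block, which is why the XOR and
	# store offsets that follow are not sequential per register.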
	# interleave 32-bit words in state n, n+1
	vmovdqa	0x00(%rsp),%ymm0
	vmovdqa	0x20(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa	%ymm2,0x00(%rsp)
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	0x40(%rsp),%ymm0
	vmovdqa	0x60(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa	%ymm2,0x40(%rsp)
	vmovdqa	%ymm1,0x60(%rsp)
	vmovdqa	%ymm4,%ymm0
	vpunpckldq	%ymm5,%ymm0,%ymm4
	vpunpckhdq	%ymm5,%ymm0,%ymm5
	vmovdqa	%ymm6,%ymm0
	vpunpckldq	%ymm7,%ymm0,%ymm6
	vpunpckhdq	%ymm7,%ymm0,%ymm7
	vmovdqa	%ymm8,%ymm0
	vpunpckldq	%ymm9,%ymm0,%ymm8
	vpunpckhdq	%ymm9,%ymm0,%ymm9
	vmovdqa	%ymm10,%ymm0
	vpunpckldq	%ymm11,%ymm0,%ymm10
	vpunpckhdq	%ymm11,%ymm0,%ymm11
	vmovdqa	%ymm12,%ymm0
	vpunpckldq	%ymm13,%ymm0,%ymm12
	vpunpckhdq	%ymm13,%ymm0,%ymm13
	vmovdqa	%ymm14,%ymm0
	vpunpckldq	%ymm15,%ymm0,%ymm14
	vpunpckhdq	%ymm15,%ymm0,%ymm15

	# interleave 64-bit words in state n, n+2
	vmovdqa	0x00(%rsp),%ymm0
	vmovdqa	0x40(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa	%ymm1,0x00(%rsp)
	vmovdqa	%ymm2,0x40(%rsp)
	vmovdqa	0x20(%rsp),%ymm0
	vmovdqa	0x60(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	%ymm2,0x60(%rsp)
	vmovdqa	%ymm4,%ymm0
	vpunpcklqdq	%ymm6,%ymm0,%ymm4
	vpunpckhqdq	%ymm6,%ymm0,%ymm6
	vmovdqa	%ymm5,%ymm0
	vpunpcklqdq	%ymm7,%ymm0,%ymm5
	vpunpckhqdq	%ymm7,%ymm0,%ymm7
	vmovdqa	%ymm8,%ymm0
	vpunpcklqdq	%ymm10,%ymm0,%ymm8
	vpunpckhqdq	%ymm10,%ymm0,%ymm10
	vmovdqa	%ymm9,%ymm0
	vpunpcklqdq	%ymm11,%ymm0,%ymm9
	vpunpckhqdq	%ymm11,%ymm0,%ymm11
	vmovdqa	%ymm12,%ymm0
	vpunpcklqdq	%ymm14,%ymm0,%ymm12
	vpunpckhqdq	%ymm14,%ymm0,%ymm14
	vmovdqa	%ymm13,%ymm0
	vpunpcklqdq	%ymm15,%ymm0,%ymm13
	vpunpckhqdq	%ymm15,%ymm0,%ymm15

	# interleave 128-bit words in state n, n+4
	vmovdqa	0x00(%rsp),%ymm0
	vperm2i128	$0x20,%ymm4,%ymm0,%ymm1
	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4
	vmovdqa	%ymm1,0x00(%rsp)
	vmovdqa	0x20(%rsp),%ymm0
	vperm2i128	$0x20,%ymm5,%ymm0,%ymm1
	vperm2i128	$0x31,%ymm5,%ymm0,%ymm5
	vmovdqa	%ymm1,0x20(%rsp)
	vmovdqa	0x40(%rsp),%ymm0
	vperm2i128	$0x20,%ymm6,%ymm0,%ymm1
	vperm2i128	$0x31,%ymm6,%ymm0,%ymm6
	vmovdqa	%ymm1,0x40(%rsp)
	vmovdqa	0x60(%rsp),%ymm0
	vperm2i128	$0x20,%ymm7,%ymm0,%ymm1
	vperm2i128	$0x31,%ymm7,%ymm0,%ymm7
	vmovdqa	%ymm1,0x60(%rsp)
	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12
	vmovdqa	%ymm0,%ymm8
	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13
	vmovdqa	%ymm0,%ymm9
	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14
	vmovdqa	%ymm0,%ymm10
	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
	vmovdqa	%ymm0,%ymm11

	# xor with corresponding input, write to output
	vmovdqa	0x00(%rsp),%ymm0
	vpxor	0x0000(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0000(%rsi)
	vmovdqa	0x20(%rsp),%ymm0
	vpxor	0x0080(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0080(%rsi)
	vmovdqa	0x40(%rsp),%ymm0
	vpxor	0x0040(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x0040(%rsi)
	vmovdqa	0x60(%rsp),%ymm0
	vpxor	0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu	%ymm0,0x00c0(%rsi)
	vpxor	0x0100(%rdx),%ymm4,%ymm4
	vmovdqu	%ymm4,0x0100(%rsi)
	vpxor	0x0180(%rdx),%ymm5,%ymm5
	vmovdqu	%ymm5,0x0180(%rsi)
	vpxor	0x0140(%rdx),%ymm6,%ymm6
	vmovdqu	%ymm6,0x0140(%rsi)
	vpxor	0x01c0(%rdx),%ymm7,%ymm7
	vmovdqu	%ymm7,0x01c0(%rsi)
	vpxor	0x0020(%rdx),%ymm8,%ymm8
	vmovdqu	%ymm8,0x0020(%rsi)
	vpxor	0x00a0(%rdx),%ymm9,%ymm9
	vmovdqu	%ymm9,0x00a0(%rsi)
	vpxor	0x0060(%rdx),%ymm10,%ymm10
	vmovdqu	%ymm10,0x0060(%rsi)
	vpxor	0x00e0(%rdx),%ymm11,%ymm11
	vmovdqu	%ymm11,0x00e0(%rsi)
	vpxor	0x0120(%rdx),%ymm12,%ymm12
	vmovdqu	%ymm12,0x0120(%rsi)
	vpxor	0x01a0(%rdx),%ymm13,%ymm13
	vmovdqu	%ymm13,0x01a0(%rsi)
	vpxor	0x0160(%rdx),%ymm14,%ymm14
	vmovdqu	%ymm14,0x0160(%rsi)
	vpxor	0x01e0(%rdx),%ymm15,%ymm15
	vmovdqu	%ymm15,0x01e0(%rsi)

	vzeroupper
	mov	%r8,%rsp
	ret
ENDPROC(chacha20_8block_xor_avx2)
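
# The register usage documented at the top of the function corresponds to a
# C-side declaration along these lines (assumed companion glue code, not part
# of this file):
#
#	asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst,
#						 const u8 *src);
#
# where state points at the 16-word ChaCha20 state and src/dst each cover
# 8 * 64 = 512 bytes.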