/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
 *
 * Copyright (C) 2018 Martin Willi
 */
#include <linux/linkage.h>

.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
CTR8BL:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004
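
# The CTR* constants above add per-block counter offsets: CTR2BL/CTR4BL add
# +0/+1 and +2/+3 to the counter lane of row 3 in the two 128-bit halves of a
# ymm register, and CTR8BL holds the dword offsets 0..7 for the 8-block path.
# A rough C model of the intended effect (illustrative only; nblocks and
# block_state are made-up names, state[12] is the ChaCha block counter):
#
#	for (int n = 0; n < nblocks; n++)
#		block_state[n][12] = state[12] + n;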

SYM_FUNC_START(chacha_2block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice, across the two 128-bit lanes of four AVX registers.
	# It performs matrix operations on four words in each matrix in
	# parallel, but requires shuffling to rearrange the words after
	# each round.
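
	# For reference, a plain C model of the ChaCha quarter round that each
	# vpaddd/vpxord/vprold group below computes on whole rows at once
	# (illustrative sketch only; chacha_qr and rotl32 are made-up helper
	# names, rotl32(v, n) = (v << n) | (v >> (32 - n))):
	#
	#	static inline void chacha_qr(uint32_t *a, uint32_t *b,
	#				     uint32_t *c, uint32_t *d)
	#	{
	#		*a += *b; *d = rotl32(*d ^ *a, 16);
	#		*c += *d; *b = rotl32(*b ^ *c, 12);
	#		*a += *b; *d = rotl32(*d ^ *a,  8);
	#		*c += *d; *b = rotl32(*b ^ *c,  7);
	#	}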

	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3
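
	# x3 holds row 3 of both blocks; give block 0 (low 128-bit lane) the
	# caller's counter and block 1 (high lane) the counter plus one.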
	vpaddd		CTR2BL(%rip),%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3
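
	# The vpshufd rotations above re-index rows 1-3 so that the second set
	# of quarter rounds in each double round operates on the diagonals of
	# the 4x4 state, and the inverse rotations then restore row order.
	# A rough C model (illustrative; rotate_row_left is a made-up helper
	# that rotates the four 32-bit words of a row toward lower indices):
	#
	#	rotate_row_left(x1, 1);		/* MASK(0, 3, 2, 1) */
	#	rotate_row_left(x2, 2);		/* MASK(1, 0, 3, 2) */
	#	rotate_row_left(x3, 3);		/* MASK(2, 1, 0, 3) */
	#	/* ... column quarter rounds now hit the diagonals ... */
	#	rotate_row_left(x1, 3);		/* inverse rotations */
	#	rotate_row_left(x2, 2);
	#	rotate_row_left(x3, 1);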

	# o0 = i0 ^ (x0 + s0)
	vpaddd		%ymm8,%ymm0,%ymm7
	vpxord		0x00(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd		%ymm9,%ymm1,%ymm7
	vpxord		0x10(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd		%ymm10,%ymm2,%ymm7
	vpxord		0x20(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd		%ymm11,%ymm3,%ymm7
	vpxord		0x30(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3

	# xor and write second block
	vpxord		0x40(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x40(%rsi)

	vpxord		0x50(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x50(%rsi)

	vpxord		0x60(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x60(%rsi)

	vpxord		0x70(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x70(%rsi)

	# xor remaining bytes from partial register into output
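	# %k1 is a byte mask covering only the remaining n < 16 bytes, so the
	# masked load below fetches just those input bytes (zeroing the rest)
	# and the masked store writes only them back.  A rough C model of the
	# selection such a mask performs (illustrative; n, dst, src and
	# keystream are made-up names for this sketch):
	#
	#	uint16_t mask = (uint16_t)((1u << n) - 1);	/* one bit per byte */
	#	for (int i = 0; i < 16; i++)
	#		if (mask & (1u << i))
	#			dst[i] = src[i] ^ keystream[i];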
	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord		%xmm7,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

SYM_FUNC_END(chacha_2block_xor_avx512vl)

SYM_FUNC_START(chacha_4block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words of two matrices in parallel, followed by
	# the same operations on the other two matrices. The required word
	# shuffling has a rather high latency, but with two matrix pairs in
	# flight the arithmetic hides most of it, so there is little slowdown.
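
	# Register layout used below: ymm0..ymm3 hold rows 0..3 of the first
	# block pair (block 0 in the low 128-bit lane, block 1 in the high
	# lane), and ymm4..ymm7 hold the same rows for the second pair
	# (blocks 2 and 3).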

	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3
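
	# Give each block pair its own counters: +0/+1 for the first pair
	# (row 3 in ymm3) and +2/+3 for the second pair (row 3 in ymm7).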
	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
	vpaddd		CTR4BL(%rip),%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxord		%ymm4,%ymm7,%ymm7
	vprold		$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxord		%ymm6,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxord		%ymm4,%ymm7,%ymm7
	vprold		$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxord		%ymm6,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	vpshufd		$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3
	vpshufd		$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxord		%ymm4,%ymm7,%ymm7
	vprold		$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxord		%ymm6,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxord		%ymm4,%ymm7,%ymm7
	vprold		$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxord		%ymm6,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	vpshufd		$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3
	vpshufd		$0x39,%ymm7,%ymm7

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd		%ymm11,%ymm0,%ymm10
	vpxord		0x00(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x00(%rsi)
	vextracti128	$1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd		%ymm12,%ymm1,%ymm10
	vpxord		0x10(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd		%ymm13,%ymm2,%ymm10
	vpxord		0x20(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd		%ymm14,%ymm3,%ymm10
	vpxord		0x30(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3

	# xor and write second block
	vpxord		0x40(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x40(%rsi)

	vpxord		0x50(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x50(%rsi)

	vpxord		0x60(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x60(%rsi)

	vpxord		0x70(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd		%ymm11,%ymm4,%ymm10
	vpxord		0x80(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x80(%rsi)
	vextracti128	$1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd		%ymm12,%ymm5,%ymm10
	vpxord		0x90(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd		%ymm13,%ymm6,%ymm10
	vpxord		0xa0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd		%ymm15,%ymm7,%ymm10
	vpxord		0xb0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write fourth block
	vpxord		0xc0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xc0(%rsi)

	vpxord		0xd0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xd0(%rsi)

	vpxord		0xe0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xe0(%rsi)

	vpxord		0xf0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xf0(%rsi)

	# xor remaining bytes from partial register into output
	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord		%xmm10,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

SYM_FUNC_END(chacha_4block_xor_avx512vl)

SYM_FUNC_START(chacha_8block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix in AVX registers eight times. Compared to AVX2, this
	# mostly benefits from the new rotate instructions in VL and the
	# additional registers.
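
	# Register layout used below: each of ymm0..ymm15 holds one state word
	# replicated for all eight blocks, with 32-bit lane n belonging to
	# block n; ymm16..ymm31 keep a copy of the initial state for the final
	# addition.  A rough C model of the load/broadcast step (x and s are
	# illustrative names):
	#
	#	for (int w = 0; w < 16; w++)
	#		for (int n = 0; n < 8; n++)
	#			x[w][n] = s[w];	/* then x[12][n] += n */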

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15

	# x12 += counter values 0-7
	vpaddd		CTR8BL(%rip),%ymm12,%ymm12

	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm1,%ymm17
	vmovdqa64	%ymm2,%ymm18
	vmovdqa64	%ymm3,%ymm19
	vmovdqa64	%ymm4,%ymm20
	vmovdqa64	%ymm5,%ymm21
	vmovdqa64	%ymm6,%ymm22
	vmovdqa64	%ymm7,%ymm23
	vmovdqa64	%ymm8,%ymm24
	vmovdqa64	%ymm9,%ymm25
	vmovdqa64	%ymm10,%ymm26
	vmovdqa64	%ymm11,%ymm27
	vmovdqa64	%ymm12,%ymm28
	vmovdqa64	%ymm13,%ymm29
	vmovdqa64	%ymm14,%ymm30
	vmovdqa64	%ymm15,%ymm31

	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd		%ymm0,%ymm4,%ymm0
	vpxord		%ymm0,%ymm12,%ymm12
	vprold		$16,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd		%ymm1,%ymm5,%ymm1
	vpxord		%ymm1,%ymm13,%ymm13
	vprold		$16,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd		%ymm2,%ymm6,%ymm2
	vpxord		%ymm2,%ymm14,%ymm14
	vprold		$16,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd		%ymm3,%ymm7,%ymm3
	vpxord		%ymm3,%ymm15,%ymm15
	vprold		$16,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxord		%ymm8,%ymm4,%ymm4
	vprold		$12,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxord		%ymm9,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxord		%ymm10,%ymm6,%ymm6
	vprold		$12,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxord		%ymm11,%ymm7,%ymm7
	vprold		$12,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd		%ymm0,%ymm4,%ymm0
	vpxord		%ymm0,%ymm12,%ymm12
	vprold		$8,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd		%ymm1,%ymm5,%ymm1
	vpxord		%ymm1,%ymm13,%ymm13
	vprold		$8,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd		%ymm2,%ymm6,%ymm2
	vpxord		%ymm2,%ymm14,%ymm14
	vprold		$8,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd		%ymm3,%ymm7,%ymm3
	vpxord		%ymm3,%ymm15,%ymm15
	vprold		$8,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxord		%ymm8,%ymm4,%ymm4
	vprold		$7,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxord		%ymm9,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxord		%ymm10,%ymm6,%ymm6
	vprold		$7,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxord		%ymm11,%ymm7,%ymm7
	vprold		$7,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd		%ymm0,%ymm5,%ymm0
	vpxord		%ymm0,%ymm15,%ymm15
	vprold		$16,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd		%ymm1,%ymm6,%ymm1
	vpxord		%ymm1,%ymm12,%ymm12
	vprold		$16,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd		%ymm2,%ymm7,%ymm2
	vpxord		%ymm2,%ymm13,%ymm13
	vprold		$16,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd		%ymm3,%ymm4,%ymm3
	vpxord		%ymm3,%ymm14,%ymm14
	vprold		$16,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxord		%ymm10,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxord		%ymm11,%ymm6,%ymm6
	vprold		$12,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxord		%ymm8,%ymm7,%ymm7
	vprold		$12,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxord		%ymm9,%ymm4,%ymm4
	vprold		$12,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd		%ymm0,%ymm5,%ymm0
	vpxord		%ymm0,%ymm15,%ymm15
	vprold		$8,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd		%ymm1,%ymm6,%ymm1
	vpxord		%ymm1,%ymm12,%ymm12
	vprold		$8,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd		%ymm2,%ymm7,%ymm2
	vpxord		%ymm2,%ymm13,%ymm13
	vprold		$8,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd		%ymm3,%ymm4,%ymm3
	vpxord		%ymm3,%ymm14,%ymm14
	vprold		$8,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxord		%ymm10,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxord		%ymm11,%ymm6,%ymm6
	vprold		$7,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxord		%ymm8,%ymm7,%ymm7
	vprold		$7,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxord		%ymm9,%ymm4,%ymm4
	vprold		$7,%ymm4,%ymm4
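
	# One double round is complete at this point.  For reference, a plain C
	# model of a double round on a single block (illustrative; QR is the
	# quarter-round sketch given near the top of chacha_2block_xor_avx512vl):
	#
	#	QR(x[0], x[4], x[ 8], x[12]); QR(x[1], x[5], x[ 9], x[13]);
	#	QR(x[2], x[6], x[10], x[14]); QR(x[3], x[7], x[11], x[15]);
	#	QR(x[0], x[5], x[10], x[15]); QR(x[1], x[6], x[11], x[12]);
	#	QR(x[2], x[7], x[ 8], x[13]); QR(x[3], x[4], x[ 9], x[14]);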

	# x0..15[0-7] += s[0..15]
	vpaddd		%ymm16,%ymm0,%ymm0
	vpaddd		%ymm17,%ymm1,%ymm1
	vpaddd		%ymm18,%ymm2,%ymm2
	vpaddd		%ymm19,%ymm3,%ymm3
	vpaddd		%ymm20,%ymm4,%ymm4
	vpaddd		%ymm21,%ymm5,%ymm5
	vpaddd		%ymm22,%ymm6,%ymm6
	vpaddd		%ymm23,%ymm7,%ymm7
	vpaddd		%ymm24,%ymm8,%ymm8
	vpaddd		%ymm25,%ymm9,%ymm9
	vpaddd		%ymm26,%ymm10,%ymm10
	vpaddd		%ymm27,%ymm11,%ymm11
	vpaddd		%ymm28,%ymm12,%ymm12
	vpaddd		%ymm29,%ymm13,%ymm13
	vpaddd		%ymm30,%ymm14,%ymm14
	vpaddd		%ymm31,%ymm15,%ymm15

	# interleave 32-bit words in state n, n+1
	vpunpckldq	%ymm1,%ymm0,%ymm16
	vpunpckhdq	%ymm1,%ymm0,%ymm17
	vpunpckldq	%ymm3,%ymm2,%ymm18
	vpunpckhdq	%ymm3,%ymm2,%ymm19
	vpunpckldq	%ymm5,%ymm4,%ymm20
	vpunpckhdq	%ymm5,%ymm4,%ymm21
	vpunpckldq	%ymm7,%ymm6,%ymm22
	vpunpckhdq	%ymm7,%ymm6,%ymm23
	vpunpckldq	%ymm9,%ymm8,%ymm24
	vpunpckhdq	%ymm9,%ymm8,%ymm25
	vpunpckldq	%ymm11,%ymm10,%ymm26
	vpunpckhdq	%ymm11,%ymm10,%ymm27
	vpunpckldq	%ymm13,%ymm12,%ymm28
	vpunpckhdq	%ymm13,%ymm12,%ymm29
	vpunpckldq	%ymm15,%ymm14,%ymm30
	vpunpckhdq	%ymm15,%ymm14,%ymm31

	# interleave 64-bit words in state n, n+2
	vpunpcklqdq	%ymm18,%ymm16,%ymm0
	vpunpcklqdq	%ymm19,%ymm17,%ymm1
	vpunpckhqdq	%ymm18,%ymm16,%ymm2
	vpunpckhqdq	%ymm19,%ymm17,%ymm3
	vpunpcklqdq	%ymm22,%ymm20,%ymm4
	vpunpcklqdq	%ymm23,%ymm21,%ymm5
	vpunpckhqdq	%ymm22,%ymm20,%ymm6
	vpunpckhqdq	%ymm23,%ymm21,%ymm7
	vpunpcklqdq	%ymm26,%ymm24,%ymm8
	vpunpcklqdq	%ymm27,%ymm25,%ymm9
	vpunpckhqdq	%ymm26,%ymm24,%ymm10
	vpunpckhqdq	%ymm27,%ymm25,%ymm11
	vpunpcklqdq	%ymm30,%ymm28,%ymm12
	vpunpcklqdq	%ymm31,%ymm29,%ymm13
	vpunpckhqdq	%ymm30,%ymm28,%ymm14
	vpunpckhqdq	%ymm31,%ymm29,%ymm15
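
	# Together with the 128-bit permutes below, these unpacks transpose the
	# "one state word per register, one block per lane" layout into
	# contiguous 64-byte blocks.  A rough C model of the re-layout
	# (illustrative; x and out are made-up names):
	#
	#	for (int n = 0; n < 8; n++)		/* block */
	#		for (int w = 0; w < 16; w++)	/* word  */
	#			out[n][w] = x[w][n];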

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa64	%ymm0,%ymm16
	vperm2i128	$0x20,%ymm4,%ymm0,%ymm0
	vpxord		0x0000(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0000(%rsi)
	vmovdqa64	%ymm16,%ymm0
	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	vpxord		0x0020(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vperm2i128	$0x20,%ymm6,%ymm2,%ymm0
	vpxord		0x0040(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm2,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	vpxord		0x0060(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	vpxord		0x0080(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	vpxord		0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vperm2i128	$0x20,%ymm7,%ymm3,%ymm0
	vpxord		0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm3,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	vpxord		0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa64	%ymm4,%ymm0
	vpxord		0x0100(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0100(%rsi)

	vmovdqa64	%ymm12,%ymm0
	vpxord		0x0120(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0120(%rsi)

	vmovdqa64	%ymm6,%ymm0
	vpxord		0x0140(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0140(%rsi)

	vmovdqa64	%ymm14,%ymm0
	vpxord		0x0160(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0160(%rsi)

	vmovdqa64	%ymm5,%ymm0
	vpxord		0x0180(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0180(%rsi)

	vmovdqa64	%ymm13,%ymm0
	vpxord		0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01a0(%rsi)

	vmovdqa64	%ymm7,%ymm0
	vpxord		0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01c0(%rsi)

	vmovdqa64	%ymm15,%ymm0
	vpxord		0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01e0(%rsi)

	# xor remaining bytes from partial register into output
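	# As in the 2- and 4-block tails above, %k1 holds a byte mask selecting
	# only the remaining bytes of the final chunk; here it masks a full ymm
	# register, i.e. up to 32 bytes.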
	vmovdqu8	(%rdx,%r9),%ymm1{%k1}{z}
	vpxord		%ymm0,%ymm1,%ymm1
	vmovdqu8	%ymm1,(%rsi,%r9){%k1}

SYM_FUNC_END(chacha_8block_xor_avx512vl)