1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
5 * Copyright (C) 2015 Martin Willi
8 #include <linux/linkage.h>
10 .section .rodata.cst32.ROT8, "aM", @progbits, 32
12 ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
13 .octa 0x0e0d0c0f0a09080b0605040702010003
15 .section .rodata.cst32.ROT16, "aM", @progbits, 32
17 ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
18 .octa 0x0d0c0f0e09080b0a0504070601000302
20 .section .rodata.cst32.CTRINC, "aM", @progbits, 32
22 CTRINC: .octa 0x00000003000000020000000100000000
23 .octa 0x00000007000000060000000500000004
25 .section .rodata.cst32.CTR2BL, "aM", @progbits, 32
27 CTR2BL: .octa 0x00000000000000000000000000000000
28 .octa 0x00000000000000000000000000000001
30 .section .rodata.cst32.CTR4BL, "aM", @progbits, 32
32 CTR4BL: .octa 0x00000000000000000000000000000002
33 .octa 0x00000000000000000000000000000003
37 SYM_FUNC_START(chacha_2block_xor_avx2)
38 # %rdi: Input state matrix, s
39 # %rsi: up to 2 data blocks output, o
40 # %rdx: up to 2 data blocks input, i
41 # %rcx: input/output length in bytes
44 # This function encrypts two ChaCha blocks by loading the state
45 # matrix twice across four AVX registers. It performs matrix operations
46 # on four words in each matrix in parallel, but requires shuffling to
47 # rearrange the words after each round.
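#
# For reference, the per-lane operation that the vector code below
# implements is the standard ChaCha quarter-round. Illustrative C sketch
# (not the kernel's C code; rotl32 is a helper named after the comments
# in this file):
#
#	static inline u32 rotl32(u32 v, int n)
#	{
#		return (v << n) | (v >> (32 - n));
#	}
#
#	/* a, b, c, d: one 32-bit word from rows 0..3 of the state */
#	a += b; d = rotl32(d ^ a, 16);
#	c += d; b = rotl32(b ^ c, 12);
#	a += b; d = rotl32(d ^ a,  8);
#	c += d; b = rotl32(b ^ c,  7);
#
# %ymm0..%ymm3 hold rows 0..3 of both block states, so each vector
# instruction performs one of these steps on eight such lanes at once.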
52 vbroadcasti128 0x00(%rdi),%ymm0
53 vbroadcasti128 0x10(%rdi),%ymm1
54 vbroadcasti128 0x20(%rdi),%ymm2
55 vbroadcasti128 0x30(%rdi),%ymm3
57 vpaddd CTR2BL(%rip),%ymm3,%ymm3
64 vmovdqa ROT8(%rip),%ymm4
65 vmovdqa ROT16(%rip),%ymm5
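#
# Rotations by 16 and 8 are whole-byte moves, so they are done with a
# single vpshufb using the ROT16/ROT8 masks loaded above. Rotations by
# 12 and 7 are not byte-aligned and use the shift+OR pattern instead.
# Illustrative C equivalent of the 12-bit case (a sketch, not kernel code):
#
#	u32 hi = v << 12;	/* vpslld $12 */
#	u32 lo = v >> 20;	/* vpsrld $20 */
#	v = hi | lo;		/* vpor       */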
71 # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
72 vpaddd %ymm1,%ymm0,%ymm0
73 vpxor %ymm0,%ymm3,%ymm3
74 vpshufb %ymm5,%ymm3,%ymm3
76 # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
77 vpaddd %ymm3,%ymm2,%ymm2
78 vpxor %ymm2,%ymm1,%ymm1
80 vpslld $12,%ymm6,%ymm6
81 vpsrld $20,%ymm1,%ymm1
82 vpor %ymm6,%ymm1,%ymm1
84 # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
85 vpaddd %ymm1,%ymm0,%ymm0
86 vpxor %ymm0,%ymm3,%ymm3
87 vpshufb %ymm4,%ymm3,%ymm3
89 # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
90 vpaddd %ymm3,%ymm2,%ymm2
91 vpxor %ymm2,%ymm1,%ymm1
94 vpsrld $25,%ymm1,%ymm1
95 vpor %ymm7,%ymm1,%ymm1
97 # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
98 vpshufd $0x39,%ymm1,%ymm1
99 # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
100 vpshufd $0x4e,%ymm2,%ymm2
101 # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
102 vpshufd $0x93,%ymm3,%ymm3
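#
# The three vpshufd shuffles above rotate rows 1-3 within each 128-bit
# lane so that the next quarter-round works on the diagonals of the
# state matrix rather than its columns. Illustrative C sketch of the
# row-1 case, MASK(0, 3, 2, 1) (an example, not kernel code):
#
#	/* x[4..7] are the four words of state row 1 */
#	u32 t = x[4];
#	x[4] = x[5]; x[5] = x[6]; x[6] = x[7]; x[7] = t;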
104 # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
105 vpaddd %ymm1,%ymm0,%ymm0
106 vpxor %ymm0,%ymm3,%ymm3
107 vpshufb %ymm5,%ymm3,%ymm3
109 # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
110 vpaddd %ymm3,%ymm2,%ymm2
111 vpxor %ymm2,%ymm1,%ymm1
113 vpslld $12,%ymm6,%ymm6
114 vpsrld $20,%ymm1,%ymm1
115 vpor %ymm6,%ymm1,%ymm1
117 # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
118 vpaddd %ymm1,%ymm0,%ymm0
119 vpxor %ymm0,%ymm3,%ymm3
120 vpshufb %ymm4,%ymm3,%ymm3
122 # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
123 vpaddd %ymm3,%ymm2,%ymm2
124 vpxor %ymm2,%ymm1,%ymm1
126 vpslld $7,%ymm7,%ymm7
127 vpsrld $25,%ymm1,%ymm1
128 vpor %ymm7,%ymm1,%ymm1
130 # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
131 vpshufd $0x93,%ymm1,%ymm1
132 # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
133 vpshufd $0x4e,%ymm2,%ymm2
134 # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
135 vpshufd $0x39,%ymm3,%ymm3
140 # o0 = i0 ^ (x0 + s0)
141 vpaddd %ymm8,%ymm0,%ymm7
144 vpxor 0x00(%rdx),%xmm7,%xmm6
145 vmovdqu %xmm6,0x00(%rsi)
146 vextracti128 $1,%ymm7,%xmm0
147 # o1 = i1 ^ (x1 + s1)
148 vpaddd %ymm9,%ymm1,%ymm7
151 vpxor 0x10(%rdx),%xmm7,%xmm6
152 vmovdqu %xmm6,0x10(%rsi)
153 vextracti128 $1,%ymm7,%xmm1
154 # o2 = i2 ^ (x2 + s2)
155 vpaddd %ymm10,%ymm2,%ymm7
158 vpxor 0x20(%rdx),%xmm7,%xmm6
159 vmovdqu %xmm6,0x20(%rsi)
160 vextracti128 $1,%ymm7,%xmm2
161 # o3 = i3 ^ (x3 + s3)
162 vpaddd %ymm11,%ymm3,%ymm7
165 vpxor 0x30(%rdx),%xmm7,%xmm6
166 vmovdqu %xmm6,0x30(%rsi)
167 vextracti128 $1,%ymm7,%xmm3
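#
# The four stores above emit the first output block; the vextracti128
# instructions set aside the high 128-bit halves, which belong to the
# second block. Per 32-bit word the operation is the usual ChaCha
# finalization, sketched in C (illustrative only):
#
#	out[i] = in[i] ^ (x[i] + s[i]);	/* i = 0..15 */
#
# where x[] is the state after the rounds and s[] is the original input
# state (with the per-block counter added).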
169 # xor and write second block
173 vpxor 0x40(%rdx),%xmm7,%xmm6
174 vmovdqu %xmm6,0x40(%rsi)
179 vpxor 0x50(%rdx),%xmm7,%xmm6
180 vmovdqu %xmm6,0x50(%rsi)
185 vpxor 0x60(%rdx),%xmm7,%xmm6
186 vmovdqu %xmm6,0x60(%rsi)
191 vpxor 0x70(%rdx),%xmm7,%xmm6
192 vmovdqu %xmm6,0x70(%rsi)
199 # xor remaining bytes from partial register into output
216 vpxor 0x00(%rsp),%xmm7,%xmm7
217 vmovdqa %xmm7,0x00(%rsp)
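#
# Tail handling (partially elided here): the final sub-16-byte piece of
# the input is bounced through the stack so that full XMM loads and
# stores never touch memory outside the buffer. Rough C sketch of the
# idea (illustrative, not the kernel's code; 'full' and 'tail' are
# placeholder names):
#
#	u8 buf[16];
#	memcpy(buf, in + full, tail);	/* copy the tail onto the stack */
#	/* XOR one 16-byte keystream chunk over buf (vpxor above)       */
#	memcpy(out + full, buf, tail);	/* write back only 'tail' bytes */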
227 SYM_FUNC_END(chacha_2block_xor_avx2)
229 SYM_FUNC_START(chacha_4block_xor_avx2)
230 # %rdi: Input state matrix, s
231 # %rsi: up to 4 data blocks output, o
232 # %rdx: up to 4 data blocks input, i
233 # %rcx: input/output length in bytes
236 # This function encrypts four ChaCha blocks by loading the state
237 # matrix four times across eight AVX registers. It performs matrix
238 # operations on four words in two matrices in parallel, interleaved
239 # with the operations on the four words of the other two matrices. The
240 # required word shuffling has a rather high latency, so interleaving the
241 # arithmetic on the two matrix-pairs hides it without much slowdown.
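#
# Layout sketch: %ymm0-%ymm3 hold two copies of the state (blocks 0-1)
# and %ymm4-%ymm7 two more (blocks 2-3); CTR2BL/CTR4BL add the per-block
# counter offsets. Illustrative C view of the counter setup (a sketch,
# not kernel code):
#
#	for (int blk = 0; blk < 4; blk++)
#		state[blk][12] = s[12] + blk;	/* 32-bit counter word */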
246 vbroadcasti128 0x00(%rdi),%ymm0
247 vbroadcasti128 0x10(%rdi),%ymm1
248 vbroadcasti128 0x20(%rdi),%ymm2
249 vbroadcasti128 0x30(%rdi),%ymm3
256 vpaddd CTR2BL(%rip),%ymm3,%ymm3
257 vpaddd CTR4BL(%rip),%ymm7,%ymm7
265 vmovdqa ROT8(%rip),%ymm8
266 vmovdqa ROT16(%rip),%ymm9
272 # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
273 vpaddd %ymm1,%ymm0,%ymm0
274 vpxor %ymm0,%ymm3,%ymm3
275 vpshufb %ymm9,%ymm3,%ymm3
277 vpaddd %ymm5,%ymm4,%ymm4
278 vpxor %ymm4,%ymm7,%ymm7
279 vpshufb %ymm9,%ymm7,%ymm7
281 # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
282 vpaddd %ymm3,%ymm2,%ymm2
283 vpxor %ymm2,%ymm1,%ymm1
285 vpslld $12,%ymm10,%ymm10
286 vpsrld $20,%ymm1,%ymm1
287 vpor %ymm10,%ymm1,%ymm1
289 vpaddd %ymm7,%ymm6,%ymm6
290 vpxor %ymm6,%ymm5,%ymm5
292 vpslld $12,%ymm10,%ymm10
293 vpsrld $20,%ymm5,%ymm5
294 vpor %ymm10,%ymm5,%ymm5
296 # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
297 vpaddd %ymm1,%ymm0,%ymm0
298 vpxor %ymm0,%ymm3,%ymm3
299 vpshufb %ymm8,%ymm3,%ymm3
301 vpaddd %ymm5,%ymm4,%ymm4
302 vpxor %ymm4,%ymm7,%ymm7
303 vpshufb %ymm8,%ymm7,%ymm7
305 # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
306 vpaddd %ymm3,%ymm2,%ymm2
307 vpxor %ymm2,%ymm1,%ymm1
309 vpslld $7,%ymm10,%ymm10
310 vpsrld $25,%ymm1,%ymm1
311 vpor %ymm10,%ymm1,%ymm1
313 vpaddd %ymm7,%ymm6,%ymm6
314 vpxor %ymm6,%ymm5,%ymm5
316 vpslld $7,%ymm10,%ymm10
317 vpsrld $25,%ymm5,%ymm5
318 vpor %ymm10,%ymm5,%ymm5
320 # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
321 vpshufd $0x39,%ymm1,%ymm1
322 vpshufd $0x39,%ymm5,%ymm5
323 # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
324 vpshufd $0x4e,%ymm2,%ymm2
325 vpshufd $0x4e,%ymm6,%ymm6
326 # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
327 vpshufd $0x93,%ymm3,%ymm3
328 vpshufd $0x93,%ymm7,%ymm7
330 # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
331 vpaddd %ymm1,%ymm0,%ymm0
332 vpxor %ymm0,%ymm3,%ymm3
333 vpshufb %ymm9,%ymm3,%ymm3
335 vpaddd %ymm5,%ymm4,%ymm4
336 vpxor %ymm4,%ymm7,%ymm7
337 vpshufb %ymm9,%ymm7,%ymm7
339 # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
340 vpaddd %ymm3,%ymm2,%ymm2
341 vpxor %ymm2,%ymm1,%ymm1
343 vpslld $12,%ymm10,%ymm10
344 vpsrld $20,%ymm1,%ymm1
345 vpor %ymm10,%ymm1,%ymm1
347 vpaddd %ymm7,%ymm6,%ymm6
348 vpxor %ymm6,%ymm5,%ymm5
350 vpslld $12,%ymm10,%ymm10
351 vpsrld $20,%ymm5,%ymm5
352 vpor %ymm10,%ymm5,%ymm5
354 # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
355 vpaddd %ymm1,%ymm0,%ymm0
356 vpxor %ymm0,%ymm3,%ymm3
357 vpshufb %ymm8,%ymm3,%ymm3
359 vpaddd %ymm5,%ymm4,%ymm4
360 vpxor %ymm4,%ymm7,%ymm7
361 vpshufb %ymm8,%ymm7,%ymm7
363 # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
364 vpaddd %ymm3,%ymm2,%ymm2
365 vpxor %ymm2,%ymm1,%ymm1
367 vpslld $7,%ymm10,%ymm10
368 vpsrld $25,%ymm1,%ymm1
369 vpor %ymm10,%ymm1,%ymm1
371 vpaddd %ymm7,%ymm6,%ymm6
372 vpxor %ymm6,%ymm5,%ymm5
374 vpslld $7,%ymm10,%ymm10
375 vpsrld $25,%ymm5,%ymm5
376 vpor %ymm10,%ymm5,%ymm5
378 # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
379 vpshufd $0x93,%ymm1,%ymm1
380 vpshufd $0x93,%ymm5,%ymm5
381 # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
382 vpshufd $0x4e,%ymm2,%ymm2
383 vpshufd $0x4e,%ymm6,%ymm6
384 # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
385 vpshufd $0x39,%ymm3,%ymm3
386 vpshufd $0x39,%ymm7,%ymm7
391 # o0 = i0 ^ (x0 + s0), first block
392 vpaddd %ymm11,%ymm0,%ymm10
395 vpxor 0x00(%rdx),%xmm10,%xmm9
396 vmovdqu %xmm9,0x00(%rsi)
397 vextracti128 $1,%ymm10,%xmm0
398 # o1 = i1 ^ (x1 + s1), first block
399 vpaddd %ymm12,%ymm1,%ymm10
402 vpxor 0x10(%rdx),%xmm10,%xmm9
403 vmovdqu %xmm9,0x10(%rsi)
404 vextracti128 $1,%ymm10,%xmm1
405 # o2 = i2 ^ (x2 + s2), first block
406 vpaddd %ymm13,%ymm2,%ymm10
409 vpxor 0x20(%rdx),%xmm10,%xmm9
410 vmovdqu %xmm9,0x20(%rsi)
411 vextracti128 $1,%ymm10,%xmm2
412 # o3 = i3 ^ (x3 + s3), first block
413 vpaddd %ymm14,%ymm3,%ymm10
416 vpxor 0x30(%rdx),%xmm10,%xmm9
417 vmovdqu %xmm9,0x30(%rsi)
418 vextracti128 $1,%ymm10,%xmm3
420 # xor and write second block
424 vpxor 0x40(%rdx),%xmm10,%xmm9
425 vmovdqu %xmm9,0x40(%rsi)
430 vpxor 0x50(%rdx),%xmm10,%xmm9
431 vmovdqu %xmm9,0x50(%rsi)
436 vpxor 0x60(%rdx),%xmm10,%xmm9
437 vmovdqu %xmm9,0x60(%rsi)
442 vpxor 0x70(%rdx),%xmm10,%xmm9
443 vmovdqu %xmm9,0x70(%rsi)
445 # o0 = i0 ^ (x0 + s0), third block
446 vpaddd %ymm11,%ymm4,%ymm10
449 vpxor 0x80(%rdx),%xmm10,%xmm9
450 vmovdqu %xmm9,0x80(%rsi)
451 vextracti128 $1,%ymm10,%xmm4
452 # o1 = i1 ^ (x1 + s1), third block
453 vpaddd %ymm12,%ymm5,%ymm10
456 vpxor 0x90(%rdx),%xmm10,%xmm9
457 vmovdqu %xmm9,0x90(%rsi)
458 vextracti128 $1,%ymm10,%xmm5
459 # o2 = i2 ^ (x2 + s2), third block
460 vpaddd %ymm13,%ymm6,%ymm10
463 vpxor 0xa0(%rdx),%xmm10,%xmm9
464 vmovdqu %xmm9,0xa0(%rsi)
465 vextracti128 $1,%ymm10,%xmm6
466 # o3 = i3 ^ (x3 + s3), third block
467 vpaddd %ymm15,%ymm7,%ymm10
470 vpxor 0xb0(%rdx),%xmm10,%xmm9
471 vmovdqu %xmm9,0xb0(%rsi)
472 vextracti128 $1,%ymm10,%xmm7
474 # xor and write fourth block
478 vpxor 0xc0(%rdx),%xmm10,%xmm9
479 vmovdqu %xmm9,0xc0(%rsi)
484 vpxor 0xd0(%rdx),%xmm10,%xmm9
485 vmovdqu %xmm9,0xd0(%rsi)
490 vpxor 0xe0(%rdx),%xmm10,%xmm9
491 vmovdqu %xmm9,0xe0(%rsi)
496 vpxor 0xf0(%rdx),%xmm10,%xmm9
497 vmovdqu %xmm9,0xf0(%rsi)
504 # xor remaining bytes from partial register into output
521 vpxor 0x00(%rsp),%xmm10,%xmm10
522 vmovdqa %xmm10,0x00(%rsp)
532 SYM_FUNC_END(chacha_4block_xor_avx2)
534 SYM_FUNC_START(chacha_8block_xor_avx2)
535 # %rdi: Input state matrix, s
536 # %rsi: up to 8 data blocks output, o
537 # %rdx: up to 8 data blocks input, i
538 # %rcx: input/output length in bytes
541 # This function encrypts eight consecutive ChaCha blocks by loading
542 # the state matrix in AVX registers eight times. As we need some
543 # scratch registers, we save the first four registers on the stack. The
544 # algorithm performs each operation on the corresponding word of each
545 # state matrix, and hence requires no word shuffling. For the final XOR
546 # step we transpose the matrix by interleaving 32-, 64- and then 128-bit
547 # words, which allows us to do the XOR in AVX registers. 8/16-bit word
548 # rotation is done with the slightly better performing byte shuffling,
549 # while 7/12-bit word rotation uses the traditional shift+OR.
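#
# Data layout sketch: vector n (register or stack slot) holds state word
# n of all eight blocks, one block per 32-bit lane, so a quarter-round
# operates on whole registers and never has to shuffle words within a
# register. Illustrative C view (a sketch, not kernel code):
#
#	u32 x[16][8];	/* x[word][block] */
#	for (int b = 0; b < 8; b++)
#		for (int w = 0; w < 16; w++)
#			x[w][b] = s[w];
#	/* one quarter-round step, e.g. "x0 += x4", is then
#	 *	for (b = 0; b < 8; b++) x[0][b] += x[4][b];
#	 * i.e. a single vpaddd on 8 lanes.
#	 */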
552 # 4 * 32 byte stack, 32-byte aligned
558 # x0..15[0-7] = s[0..15]
559 vpbroadcastd 0x00(%rdi),%ymm0
560 vpbroadcastd 0x04(%rdi),%ymm1
561 vpbroadcastd 0x08(%rdi),%ymm2
562 vpbroadcastd 0x0c(%rdi),%ymm3
563 vpbroadcastd 0x10(%rdi),%ymm4
564 vpbroadcastd 0x14(%rdi),%ymm5
565 vpbroadcastd 0x18(%rdi),%ymm6
566 vpbroadcastd 0x1c(%rdi),%ymm7
567 vpbroadcastd 0x20(%rdi),%ymm8
568 vpbroadcastd 0x24(%rdi),%ymm9
569 vpbroadcastd 0x28(%rdi),%ymm10
570 vpbroadcastd 0x2c(%rdi),%ymm11
571 vpbroadcastd 0x30(%rdi),%ymm12
572 vpbroadcastd 0x34(%rdi),%ymm13
573 vpbroadcastd 0x38(%rdi),%ymm14
574 vpbroadcastd 0x3c(%rdi),%ymm15
576 vmovdqa %ymm0,0x00(%rsp)
577 vmovdqa %ymm1,0x20(%rsp)
578 vmovdqa %ymm2,0x40(%rsp)
579 vmovdqa %ymm3,0x60(%rsp)
581 vmovdqa CTRINC(%rip),%ymm1
582 vmovdqa ROT8(%rip),%ymm2
583 vmovdqa ROT16(%rip),%ymm3
585 # x12 += counter values 0-7
586 vpaddd %ymm1,%ymm12,%ymm12
589 # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
590 vpaddd 0x00(%rsp),%ymm4,%ymm0
591 vmovdqa %ymm0,0x00(%rsp)
592 vpxor %ymm0,%ymm12,%ymm12
593 vpshufb %ymm3,%ymm12,%ymm12
594 # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
595 vpaddd 0x20(%rsp),%ymm5,%ymm0
596 vmovdqa %ymm0,0x20(%rsp)
597 vpxor %ymm0,%ymm13,%ymm13
598 vpshufb %ymm3,%ymm13,%ymm13
599 # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
600 vpaddd 0x40(%rsp),%ymm6,%ymm0
601 vmovdqa %ymm0,0x40(%rsp)
602 vpxor %ymm0,%ymm14,%ymm14
603 vpshufb %ymm3,%ymm14,%ymm14
604 # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
605 vpaddd 0x60(%rsp),%ymm7,%ymm0
606 vmovdqa %ymm0,0x60(%rsp)
607 vpxor %ymm0,%ymm15,%ymm15
608 vpshufb %ymm3,%ymm15,%ymm15
610 # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
611 vpaddd %ymm12,%ymm8,%ymm8
612 vpxor %ymm8,%ymm4,%ymm4
613 vpslld $12,%ymm4,%ymm0
614 vpsrld $20,%ymm4,%ymm4
615 vpor %ymm0,%ymm4,%ymm4
616 # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
617 vpaddd %ymm13,%ymm9,%ymm9
618 vpxor %ymm9,%ymm5,%ymm5
619 vpslld $12,%ymm5,%ymm0
620 vpsrld $20,%ymm5,%ymm5
621 vpor %ymm0,%ymm5,%ymm5
622 # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
623 vpaddd %ymm14,%ymm10,%ymm10
624 vpxor %ymm10,%ymm6,%ymm6
625 vpslld $12,%ymm6,%ymm0
626 vpsrld $20,%ymm6,%ymm6
627 vpor %ymm0,%ymm6,%ymm6
628 # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
629 vpaddd %ymm15,%ymm11,%ymm11
630 vpxor %ymm11,%ymm7,%ymm7
631 vpslld $12,%ymm7,%ymm0
632 vpsrld $20,%ymm7,%ymm7
633 vpor %ymm0,%ymm7,%ymm7
635 # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
636 vpaddd 0x00(%rsp),%ymm4,%ymm0
637 vmovdqa %ymm0,0x00(%rsp)
638 vpxor %ymm0,%ymm12,%ymm12
639 vpshufb %ymm2,%ymm12,%ymm12
640 # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
641 vpaddd 0x20(%rsp),%ymm5,%ymm0
642 vmovdqa %ymm0,0x20(%rsp)
643 vpxor %ymm0,%ymm13,%ymm13
644 vpshufb %ymm2,%ymm13,%ymm13
645 # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
646 vpaddd 0x40(%rsp),%ymm6,%ymm0
647 vmovdqa %ymm0,0x40(%rsp)
648 vpxor %ymm0,%ymm14,%ymm14
649 vpshufb %ymm2,%ymm14,%ymm14
650 # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
651 vpaddd 0x60(%rsp),%ymm7,%ymm0
652 vmovdqa %ymm0,0x60(%rsp)
653 vpxor %ymm0,%ymm15,%ymm15
654 vpshufb %ymm2,%ymm15,%ymm15
656 # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
657 vpaddd %ymm12,%ymm8,%ymm8
658 vpxor %ymm8,%ymm4,%ymm4
659 vpslld $7,%ymm4,%ymm0
660 vpsrld $25,%ymm4,%ymm4
661 vpor %ymm0,%ymm4,%ymm4
662 # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
663 vpaddd %ymm13,%ymm9,%ymm9
664 vpxor %ymm9,%ymm5,%ymm5
665 vpslld $7,%ymm5,%ymm0
666 vpsrld $25,%ymm5,%ymm5
667 vpor %ymm0,%ymm5,%ymm5
668 # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
669 vpaddd %ymm14,%ymm10,%ymm10
670 vpxor %ymm10,%ymm6,%ymm6
671 vpslld $7,%ymm6,%ymm0
672 vpsrld $25,%ymm6,%ymm6
673 vpor %ymm0,%ymm6,%ymm6
674 # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
675 vpaddd %ymm15,%ymm11,%ymm11
676 vpxor %ymm11,%ymm7,%ymm7
677 vpslld $7,%ymm7,%ymm0
678 vpsrld $25,%ymm7,%ymm7
679 vpor %ymm0,%ymm7,%ymm7
681 # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
682 vpaddd 0x00(%rsp),%ymm5,%ymm0
683 vmovdqa %ymm0,0x00(%rsp)
684 vpxor %ymm0,%ymm15,%ymm15
685 vpshufb %ymm3,%ymm15,%ymm15
686 # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
687 vpaddd 0x20(%rsp),%ymm6,%ymm0
688 vmovdqa %ymm0,0x20(%rsp)
689 vpxor %ymm0,%ymm12,%ymm12
690 vpshufb %ymm3,%ymm12,%ymm12
691 # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
692 vpaddd 0x40(%rsp),%ymm7,%ymm0
693 vmovdqa %ymm0,0x40(%rsp)
694 vpxor %ymm0,%ymm13,%ymm13
695 vpshufb %ymm3,%ymm13,%ymm13
696 # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
697 vpaddd 0x60(%rsp),%ymm4,%ymm0
698 vmovdqa %ymm0,0x60(%rsp)
699 vpxor %ymm0,%ymm14,%ymm14
700 vpshufb %ymm3,%ymm14,%ymm14
702 # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
703 vpaddd %ymm15,%ymm10,%ymm10
704 vpxor %ymm10,%ymm5,%ymm5
705 vpslld $12,%ymm5,%ymm0
706 vpsrld $20,%ymm5,%ymm5
707 vpor %ymm0,%ymm5,%ymm5
708 # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
709 vpaddd %ymm12,%ymm11,%ymm11
710 vpxor %ymm11,%ymm6,%ymm6
711 vpslld $12,%ymm6,%ymm0
712 vpsrld $20,%ymm6,%ymm6
713 vpor %ymm0,%ymm6,%ymm6
714 # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
715 vpaddd %ymm13,%ymm8,%ymm8
716 vpxor %ymm8,%ymm7,%ymm7
717 vpslld $12,%ymm7,%ymm0
718 vpsrld $20,%ymm7,%ymm7
719 vpor %ymm0,%ymm7,%ymm7
720 # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
721 vpaddd %ymm14,%ymm9,%ymm9
722 vpxor %ymm9,%ymm4,%ymm4
723 vpslld $12,%ymm4,%ymm0
724 vpsrld $20,%ymm4,%ymm4
725 vpor %ymm0,%ymm4,%ymm4
727 # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
728 vpaddd 0x00(%rsp),%ymm5,%ymm0
729 vmovdqa %ymm0,0x00(%rsp)
730 vpxor %ymm0,%ymm15,%ymm15
731 vpshufb %ymm2,%ymm15,%ymm15
732 # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
733 vpaddd 0x20(%rsp),%ymm6,%ymm0
734 vmovdqa %ymm0,0x20(%rsp)
735 vpxor %ymm0,%ymm12,%ymm12
736 vpshufb %ymm2,%ymm12,%ymm12
737 # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
738 vpaddd 0x40(%rsp),%ymm7,%ymm0
739 vmovdqa %ymm0,0x40(%rsp)
740 vpxor %ymm0,%ymm13,%ymm13
741 vpshufb %ymm2,%ymm13,%ymm13
742 # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
743 vpaddd 0x60(%rsp),%ymm4,%ymm0
744 vmovdqa %ymm0,0x60(%rsp)
745 vpxor %ymm0,%ymm14,%ymm14
746 vpshufb %ymm2,%ymm14,%ymm14
748 # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
749 vpaddd %ymm15,%ymm10,%ymm10
750 vpxor %ymm10,%ymm5,%ymm5
751 vpslld $7,%ymm5,%ymm0
752 vpsrld $25,%ymm5,%ymm5
753 vpor %ymm0,%ymm5,%ymm5
754 # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
755 vpaddd %ymm12,%ymm11,%ymm11
756 vpxor %ymm11,%ymm6,%ymm6
757 vpslld $7,%ymm6,%ymm0
758 vpsrld $25,%ymm6,%ymm6
759 vpor %ymm0,%ymm6,%ymm6
760 # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
761 vpaddd %ymm13,%ymm8,%ymm8
762 vpxor %ymm8,%ymm7,%ymm7
763 vpslld $7,%ymm7,%ymm0
764 vpsrld $25,%ymm7,%ymm7
765 vpor %ymm0,%ymm7,%ymm7
766 # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
767 vpaddd %ymm14,%ymm9,%ymm9
768 vpxor %ymm9,%ymm4,%ymm4
769 vpslld $7,%ymm4,%ymm0
770 vpsrld $25,%ymm4,%ymm4
771 vpor %ymm0,%ymm4,%ymm4
776 # x0..15[0-7] += s[0..15]
777 vpbroadcastd 0x00(%rdi),%ymm0
778 vpaddd 0x00(%rsp),%ymm0,%ymm0
779 vmovdqa %ymm0,0x00(%rsp)
780 vpbroadcastd 0x04(%rdi),%ymm0
781 vpaddd 0x20(%rsp),%ymm0,%ymm0
782 vmovdqa %ymm0,0x20(%rsp)
783 vpbroadcastd 0x08(%rdi),%ymm0
784 vpaddd 0x40(%rsp),%ymm0,%ymm0
785 vmovdqa %ymm0,0x40(%rsp)
786 vpbroadcastd 0x0c(%rdi),%ymm0
787 vpaddd 0x60(%rsp),%ymm0,%ymm0
788 vmovdqa %ymm0,0x60(%rsp)
789 vpbroadcastd 0x10(%rdi),%ymm0
790 vpaddd %ymm0,%ymm4,%ymm4
791 vpbroadcastd 0x14(%rdi),%ymm0
792 vpaddd %ymm0,%ymm5,%ymm5
793 vpbroadcastd 0x18(%rdi),%ymm0
794 vpaddd %ymm0,%ymm6,%ymm6
795 vpbroadcastd 0x1c(%rdi),%ymm0
796 vpaddd %ymm0,%ymm7,%ymm7
797 vpbroadcastd 0x20(%rdi),%ymm0
798 vpaddd %ymm0,%ymm8,%ymm8
799 vpbroadcastd 0x24(%rdi),%ymm0
800 vpaddd %ymm0,%ymm9,%ymm9
801 vpbroadcastd 0x28(%rdi),%ymm0
802 vpaddd %ymm0,%ymm10,%ymm10
803 vpbroadcastd 0x2c(%rdi),%ymm0
804 vpaddd %ymm0,%ymm11,%ymm11
805 vpbroadcastd 0x30(%rdi),%ymm0
806 vpaddd %ymm0,%ymm12,%ymm12
807 vpbroadcastd 0x34(%rdi),%ymm0
808 vpaddd %ymm0,%ymm13,%ymm13
809 vpbroadcastd 0x38(%rdi),%ymm0
810 vpaddd %ymm0,%ymm14,%ymm14
811 vpbroadcastd 0x3c(%rdi),%ymm0
812 vpaddd %ymm0,%ymm15,%ymm15
814 # x12 += counter values 0-7
815 vpaddd %ymm1,%ymm12,%ymm12
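#
# At this point each of the 16 vectors holds the finalized keystream
# word for all eight blocks: x[w] + s[w], with the per-block counter
# folded into word 12 again. In C terms (illustrative sketch only,
# 'ks' is a placeholder name):
#
#	for (int b = 0; b < 8; b++)
#		ks[w][b] = x[w][b] + s[w];	/* done above for each word w */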
817 # interleave 32-bit words in state n, n+1
818 vmovdqa 0x00(%rsp),%ymm0
819 vmovdqa 0x20(%rsp),%ymm1
820 vpunpckldq %ymm1,%ymm0,%ymm2
821 vpunpckhdq %ymm1,%ymm0,%ymm1
822 vmovdqa %ymm2,0x00(%rsp)
823 vmovdqa %ymm1,0x20(%rsp)
824 vmovdqa 0x40(%rsp),%ymm0
825 vmovdqa 0x60(%rsp),%ymm1
826 vpunpckldq %ymm1,%ymm0,%ymm2
827 vpunpckhdq %ymm1,%ymm0,%ymm1
828 vmovdqa %ymm2,0x40(%rsp)
829 vmovdqa %ymm1,0x60(%rsp)
831 vpunpckldq %ymm5,%ymm0,%ymm4
832 vpunpckhdq %ymm5,%ymm0,%ymm5
834 vpunpckldq %ymm7,%ymm0,%ymm6
835 vpunpckhdq %ymm7,%ymm0,%ymm7
837 vpunpckldq %ymm9,%ymm0,%ymm8
838 vpunpckhdq %ymm9,%ymm0,%ymm9
840 vpunpckldq %ymm11,%ymm0,%ymm10
841 vpunpckhdq %ymm11,%ymm0,%ymm11
843 vpunpckldq %ymm13,%ymm0,%ymm12
844 vpunpckhdq %ymm13,%ymm0,%ymm13
846 vpunpckldq %ymm15,%ymm0,%ymm14
847 vpunpckhdq %ymm15,%ymm0,%ymm15
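#
# The transpose back to block order runs in three passes: interleave
# 32-bit words of vector pairs (n, n+1) above, then 64-bit words of
# (n, n+2), then 128-bit halves of (n, n+4). Afterwards each YMM
# register holds 32 contiguous keystream bytes in output order, ready
# for a full-width XOR with the input. Rough intrinsics-style sketch of
# one 32-bit step (illustrative, not kernel code):
#
#	__m256i lo = _mm256_unpacklo_epi32(r0, r1);	/* vpunpckldq */
#	__m256i hi = _mm256_unpackhi_epi32(r0, r1);	/* vpunpckhdq */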
849 # interleave 64-bit words in state n, n+2
850 vmovdqa 0x00(%rsp),%ymm0
851 vmovdqa 0x40(%rsp),%ymm2
852 vpunpcklqdq %ymm2,%ymm0,%ymm1
853 vpunpckhqdq %ymm2,%ymm0,%ymm2
854 vmovdqa %ymm1,0x00(%rsp)
855 vmovdqa %ymm2,0x40(%rsp)
856 vmovdqa 0x20(%rsp),%ymm0
857 vmovdqa 0x60(%rsp),%ymm2
858 vpunpcklqdq %ymm2,%ymm0,%ymm1
859 vpunpckhqdq %ymm2,%ymm0,%ymm2
860 vmovdqa %ymm1,0x20(%rsp)
861 vmovdqa %ymm2,0x60(%rsp)
863 vpunpcklqdq %ymm6,%ymm0,%ymm4
864 vpunpckhqdq %ymm6,%ymm0,%ymm6
866 vpunpcklqdq %ymm7,%ymm0,%ymm5
867 vpunpckhqdq %ymm7,%ymm0,%ymm7
869 vpunpcklqdq %ymm10,%ymm0,%ymm8
870 vpunpckhqdq %ymm10,%ymm0,%ymm10
872 vpunpcklqdq %ymm11,%ymm0,%ymm9
873 vpunpckhqdq %ymm11,%ymm0,%ymm11
875 vpunpcklqdq %ymm14,%ymm0,%ymm12
876 vpunpckhqdq %ymm14,%ymm0,%ymm14
878 vpunpcklqdq %ymm15,%ymm0,%ymm13
879 vpunpckhqdq %ymm15,%ymm0,%ymm15
881 # interleave 128-bit words in state n, n+4
882 # xor/write first four blocks
883 vmovdqa 0x00(%rsp),%ymm1
884 vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
887 vpxor 0x0000(%rdx),%ymm0,%ymm0
888 vmovdqu %ymm0,0x0000(%rsi)
889 vperm2i128 $0x31,%ymm4,%ymm1,%ymm4
891 vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
894 vpxor 0x0020(%rdx),%ymm0,%ymm0
895 vmovdqu %ymm0,0x0020(%rsi)
896 vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
898 vmovdqa 0x40(%rsp),%ymm1
899 vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
902 vpxor 0x0040(%rdx),%ymm0,%ymm0
903 vmovdqu %ymm0,0x0040(%rsi)
904 vperm2i128 $0x31,%ymm6,%ymm1,%ymm6
906 vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
909 vpxor 0x0060(%rdx),%ymm0,%ymm0
910 vmovdqu %ymm0,0x0060(%rsi)
911 vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
913 vmovdqa 0x20(%rsp),%ymm1
914 vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
917 vpxor 0x0080(%rdx),%ymm0,%ymm0
918 vmovdqu %ymm0,0x0080(%rsi)
919 vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
921 vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
924 vpxor 0x00a0(%rdx),%ymm0,%ymm0
925 vmovdqu %ymm0,0x00a0(%rsi)
926 vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
928 vmovdqa 0x60(%rsp),%ymm1
929 vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
932 vpxor 0x00c0(%rdx),%ymm0,%ymm0
933 vmovdqu %ymm0,0x00c0(%rsi)
934 vperm2i128 $0x31,%ymm7,%ymm1,%ymm7
936 vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
939 vpxor 0x00e0(%rdx),%ymm0,%ymm0
940 vmovdqu %ymm0,0x00e0(%rsi)
941 vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
943 # xor remaining blocks, write to output
947 vpxor 0x0100(%rdx),%ymm0,%ymm0
948 vmovdqu %ymm0,0x0100(%rsi)
953 vpxor 0x0120(%rdx),%ymm0,%ymm0
954 vmovdqu %ymm0,0x0120(%rsi)
959 vpxor 0x0140(%rdx),%ymm0,%ymm0
960 vmovdqu %ymm0,0x0140(%rsi)
965 vpxor 0x0160(%rdx),%ymm0,%ymm0
966 vmovdqu %ymm0,0x0160(%rsi)
971 vpxor 0x0180(%rdx),%ymm0,%ymm0
972 vmovdqu %ymm0,0x0180(%rsi)
977 vpxor 0x01a0(%rdx),%ymm0,%ymm0
978 vmovdqu %ymm0,0x01a0(%rsi)
983 vpxor 0x01c0(%rdx),%ymm0,%ymm0
984 vmovdqu %ymm0,0x01c0(%rsi)
989 vpxor 0x01e0(%rdx),%ymm0,%ymm0
990 vmovdqu %ymm0,0x01e0(%rsi)
998 # xor remaining bytes from partial register into output
1006 lea (%rdx,%rax),%rsi
1011 vpxor 0x00(%rsp),%ymm0,%ymm0
1012 vmovdqa %ymm0,0x00(%rsp)
1015 lea (%r11,%rax),%rdi
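#
# Tail handling (mostly elided here): the last partial 32-byte chunk is
# copied from the input onto the stack, XORed there with the final
# keystream chunk (the vpxor/vmovdqa above), and only the valid tail
# bytes are copied back to the output. Rough C sketch of the idea
# (illustrative, not the kernel's code; names are placeholders):
#
#	u8 buf[32];
#	memcpy(buf, in + aligned_len, tail_len);
#	/* XOR the 32-byte keystream chunk over buf */
#	memcpy(out + aligned_len, buf, tail_len);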
1021 SYM_FUNC_END(chacha_8block_xor_avx2)