/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>
.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
.section .rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000

.text
/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
 * function performs matrix operations on four words in parallel, but requires
 * shuffling to rearrange the words after each round. 8/16-bit word rotation is
 * done with the slightly better performing SSSE3 byte shuffling, while
 * 7/12-bit word rotation uses traditional shift+OR.
 *
 * The round count is given in %r8d.
 *
 * Clobbers: %r8d, %xmm4-%xmm7
 */
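#
# For reference, a minimal C sketch of the double round this routine
# vectorizes (ROTL32, QR and chacha_doubleround are illustrative names, not
# identifiers used elsewhere in this file). In the asm comments below, x0-x3
# denote the four state rows held in %xmm0-%xmm3, while x[0..15] in the
# sketch are the sixteen individual state words:
#
#	#define ROTL32(v, n)	(((v) << (n)) | ((v) >> (32 - (n))))
#	#define QR(a, b, c, d) do {			\
#		a += b; d = ROTL32(d ^ a, 16);		\
#		c += d; b = ROTL32(b ^ c, 12);		\
#		a += b; d = ROTL32(d ^ a,  8);		\
#		c += d; b = ROTL32(b ^ c,  7);		\
#	} while (0)
#
#	static void chacha_doubleround(uint32_t x[16])
#	{
#		/* column round: one quarter-round per column */
#		QR(x[0], x[4], x[ 8], x[12]);
#		QR(x[1], x[5], x[ 9], x[13]);
#		QR(x[2], x[6], x[10], x[14]);
#		QR(x[3], x[7], x[11], x[15]);
#		/* diagonal round: one quarter-round per diagonal */
#		QR(x[0], x[5], x[10], x[15]);
#		QR(x[1], x[6], x[11], x[12]);
#		QR(x[2], x[7], x[ 8], x[13]);
#		QR(x[3], x[4], x[ 9], x[14]);
#	}
#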
SYM_FUNC_START_LOCAL(chacha_permute)

	movdqa ROT8(%rip),%xmm4
	movdqa ROT16(%rip),%xmm5
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd $0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd $0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd $0x93,%xmm3,%xmm3
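
	# After the column-round half above, the words within rows 1-3 are
	# rotated by one, two and three positions respectively, so that each
	# diagonal of the 4x4 state now sits in a single column; the same
	# per-column code then performs the diagonal round, and the shuffles
	# after the second half below rotate the rows back into place.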
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd $0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd $0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd $0x39,%xmm3,%xmm3

SYM_FUNC_END(chacha_permute)

SYM_FUNC_START(chacha_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 1 data block output, o
	# %rdx: up to 1 data block input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
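	#
	# Viewed from C, this register assignment corresponds to a prototype
	# along the lines of the following (illustrative only; the real
	# declaration lives in the C glue code):
	#
	#	void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
	#				    unsigned int len, int nrounds);
	#
	# i.e. the standard SysV x86-64 argument registers in order.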

	movdqu 0x00(%rdi),%xmm0
	movdqu 0x10(%rdi),%xmm1
	movdqu 0x20(%rdi),%xmm2
	movdqu 0x30(%rdi),%xmm3
	# o0 = i0 ^ (x0 + s0)
	movdqu 0x00(%rdx),%xmm4
	movdqu %xmm0,0x00(%rsi)

	# o1 = i1 ^ (x1 + s1)
	movdqu 0x10(%rdx),%xmm0
	movdqu %xmm0,0x10(%rsi)

	# o2 = i2 ^ (x2 + s2)
	movdqu 0x20(%rdx),%xmm0
	movdqu %xmm0,0x20(%rsi)

	# o3 = i3 ^ (x3 + s3)
	movdqu 0x30(%rdx),%xmm0
	movdqu %xmm0,0x30(%rsi)
	# xor remaining bytes from partial register into output
	pxor 0x00(%rsp),%xmm0
	movdqa %xmm0,0x00(%rsp)
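	# (The last keystream block left in %xmm0 is XORed with the partial
	# input bytes staged in the stack slot and parked back on the stack,
	# from where the remaining length % 16 bytes are copied to the
	# output.)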
SYM_FUNC_END(chacha_block_xor_ssse3)

SYM_FUNC_START(hchacha_block_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: output (8 32-bit words)
	# %edx: nrounds
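	#
	# HChaCha applies the same permutation as a full block but skips the
	# final addition of the input state; only the first and last rows of
	# the permuted state (words 0-3 and 12-15) are written out, which is
	# the subkey XChaCha derives from the key and the first 128 bits of
	# nonce.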

	movdqu 0x00(%rdi),%xmm0
	movdqu 0x10(%rdi),%xmm1
	movdqu 0x20(%rdi),%xmm2
	movdqu 0x30(%rdi),%xmm3

	movdqu %xmm0,0x00(%rsi)
	movdqu %xmm3,0x10(%rsi)

SYM_FUNC_END(hchacha_block_ssse3)

SYM_FUNC_START(chacha_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
	# This function encrypts four consecutive ChaCha blocks by loading
	# the state matrix into SSE registers four times. As we need some
	# scratch registers, we save the first four registers on the stack.
	# The algorithm performs each operation on the corresponding word of
	# each state matrix, hence requires no word shuffling. For the final
	# XORing step we transpose the matrix by interleaving 32- and then
	# 64-bit words, which allows us to do XOR in SSE registers. 8/16-bit
	# word rotation is done with the slightly better performing SSSE3
	# byte shuffling, while 7/12-bit word rotation uses traditional
	# shift+OR.
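	#
	# Conceptually each register below holds one state word taken from
	# all four blocks, so a scalar step such as "x0 += x4" in the
	# comments acts on four blocks at once. A rough C model of that
	# layout (v4, v4_add, v4_xor, v4_rol and qr_step are illustrative
	# names, not identifiers used by this file or the C glue code):
	#
	#	/* one 32-bit word from each of the four blocks */
	#	typedef struct { uint32_t lane[4]; } v4;
	#
	#	static v4 v4_add(v4 a, v4 b)
	#	{
	#		for (int i = 0; i < 4; i++)
	#			a.lane[i] += b.lane[i];
	#		return a;
	#	}
	#
	#	static v4 v4_xor(v4 a, v4 b)
	#	{
	#		for (int i = 0; i < 4; i++)
	#			a.lane[i] ^= b.lane[i];
	#		return a;
	#	}
	#
	#	static v4 v4_rol(v4 a, int n)
	#	{
	#		for (int i = 0; i < 4; i++)
	#			a.lane[i] = (a.lane[i] << n) | (a.lane[i] >> (32 - n));
	#		return a;
	#	}
	#
	#	/* first column-round step for all four blocks at once:
	#	 * "x0 += x4, x12 = rotl32(x12 ^ x0, 16)"
	#	 */
	#	static void qr_step(v4 x[16])
	#	{
	#		x[0]  = v4_add(x[0], x[4]);
	#		x[12] = v4_rol(v4_xor(x[12], x[0]), 16);
	#	}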
	# x0..15[0-3] = s0..3[0..3]
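	# (Each movq below pulls two consecutive 32-bit state words into the
	# low half of a register; pshufd $0x00 then broadcasts the first word
	# into all four lanes and pshufd $0x55 broadcasts the second, giving
	# one register per state word with that word replicated for all four
	# blocks.)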
	movq 0x00(%rdi),%xmm1
	pshufd $0x00,%xmm1,%xmm0
	pshufd $0x55,%xmm1,%xmm1
	movq 0x08(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	movq 0x10(%rdi),%xmm5
	pshufd $0x00,%xmm5,%xmm4
	pshufd $0x55,%xmm5,%xmm5
	movq 0x18(%rdi),%xmm7
	pshufd $0x00,%xmm7,%xmm6
	pshufd $0x55,%xmm7,%xmm7
	movq 0x20(%rdi),%xmm9
	pshufd $0x00,%xmm9,%xmm8
	pshufd $0x55,%xmm9,%xmm9
	movq 0x28(%rdi),%xmm11
	pshufd $0x00,%xmm11,%xmm10
	pshufd $0x55,%xmm11,%xmm11
	movq 0x30(%rdi),%xmm13
	pshufd $0x00,%xmm13,%xmm12
	pshufd $0x55,%xmm13,%xmm13
	movq 0x38(%rdi),%xmm15
	pshufd $0x00,%xmm15,%xmm14
	pshufd $0x55,%xmm15,%xmm15

	movdqa %xmm0,0x00(%rsp)
	movdqa %xmm1,0x10(%rsp)
	movdqa %xmm2,0x20(%rsp)
	movdqa %xmm3,0x30(%rsp)

	movdqa CTRINC(%rip),%xmm1
	movdqa ROT8(%rip),%xmm2
	movdqa ROT16(%rip),%xmm3

	# x12 += counter values 0-3
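	# (CTRINC holds the dwords {0, 1, 2, 3}; adding it to the replicated
	# block counter in x12 gives the four blocks consecutive counter
	# values.)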
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa 0x00(%rsp),%xmm0
	movdqa %xmm0,0x00(%rsp)

	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa 0x10(%rsp),%xmm0
	movdqa %xmm0,0x10(%rsp)

	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa 0x20(%rsp),%xmm0
	movdqa %xmm0,0x20(%rsp)

	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa 0x30(%rsp),%xmm0
	movdqa %xmm0,0x30(%rsp)

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa 0x00(%rsp),%xmm0
	movdqa %xmm0,0x00(%rsp)

	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa 0x10(%rsp),%xmm0
	movdqa %xmm0,0x10(%rsp)

	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa 0x20(%rsp),%xmm0
	movdqa %xmm0,0x20(%rsp)

	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa 0x30(%rsp),%xmm0
	movdqa %xmm0,0x30(%rsp)

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa 0x00(%rsp),%xmm0
	movdqa %xmm0,0x00(%rsp)

	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa 0x10(%rsp),%xmm0
	movdqa %xmm0,0x10(%rsp)

	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa 0x20(%rsp),%xmm0
	movdqa %xmm0,0x20(%rsp)

	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa 0x30(%rsp),%xmm0
	movdqa %xmm0,0x30(%rsp)

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa 0x00(%rsp),%xmm0
	movdqa %xmm0,0x00(%rsp)

	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa 0x10(%rsp),%xmm0
	movdqa %xmm0,0x10(%rsp)

	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa 0x20(%rsp),%xmm0
	movdqa %xmm0,0x20(%rsp)

	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa 0x30(%rsp),%xmm0
	movdqa %xmm0,0x30(%rsp)

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
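
	# (Rounds done: the original state is now added back in. Each state
	# word is re-broadcast from memory with the same movq/pshufd pattern
	# as above and added to the matching x0..x15 vector, with x0..x3
	# living in the stack slots set up earlier.)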
	movq 0x00(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd 0x00(%rsp),%xmm2
	movdqa %xmm2,0x00(%rsp)
	paddd 0x10(%rsp),%xmm3
	movdqa %xmm3,0x10(%rsp)

	movq 0x08(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3
	paddd 0x20(%rsp),%xmm2
	movdqa %xmm2,0x20(%rsp)
	paddd 0x30(%rsp),%xmm3
	movdqa %xmm3,0x30(%rsp)

	movq 0x10(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3

	movq 0x18(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3

	movq 0x20(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3

	movq 0x28(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3

	movq 0x30(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3

	movq 0x38(%rdi),%xmm3
	pshufd $0x00,%xmm3,%xmm2
	pshufd $0x55,%xmm3,%xmm3

	# x12 += counter values 0-3
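
	# The state is still held "one word, four blocks" per register, but
	# the output needs "one block, sixteen consecutive words". The two
	# interleave passes below are a 4x4 32-bit transpose performed on
	# each group of four registers: punpckldq/punpckhdq interleave the
	# 32-bit words of neighbouring registers, then punpcklqdq/punpckhqdq
	# interleave their 64-bit halves, after which every register (or
	# stack slot) holds 16 consecutive bytes of one keystream block.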
	# interleave 32-bit words in state n, n+1
	movdqa 0x00(%rsp),%xmm0
	movdqa 0x10(%rsp),%xmm1
	punpckldq %xmm1,%xmm2
	punpckhdq %xmm1,%xmm0
	movdqa %xmm2,0x00(%rsp)
	movdqa %xmm0,0x10(%rsp)

	movdqa 0x20(%rsp),%xmm0
	movdqa 0x30(%rsp),%xmm1
	punpckldq %xmm1,%xmm2
	punpckhdq %xmm1,%xmm0
	movdqa %xmm2,0x20(%rsp)
	movdqa %xmm0,0x30(%rsp)

	punpckldq %xmm5,%xmm4
	punpckhdq %xmm5,%xmm0

	punpckldq %xmm7,%xmm6
	punpckhdq %xmm7,%xmm0

	punpckldq %xmm9,%xmm8
	punpckhdq %xmm9,%xmm0

	punpckldq %xmm11,%xmm10
	punpckhdq %xmm11,%xmm0

	punpckldq %xmm13,%xmm12
	punpckhdq %xmm13,%xmm0

	punpckldq %xmm15,%xmm14
	punpckhdq %xmm15,%xmm0

	# interleave 64-bit words in state n, n+2
	movdqa 0x00(%rsp),%xmm0
	movdqa 0x20(%rsp),%xmm1
	punpcklqdq %xmm1,%xmm2
	punpckhqdq %xmm1,%xmm0
	movdqa %xmm2,0x00(%rsp)
	movdqa %xmm0,0x20(%rsp)

	movdqa 0x10(%rsp),%xmm0
	movdqa 0x30(%rsp),%xmm1
	punpcklqdq %xmm1,%xmm2
	punpckhqdq %xmm1,%xmm0
	movdqa %xmm2,0x10(%rsp)
	movdqa %xmm0,0x30(%rsp)

	punpcklqdq %xmm6,%xmm4
	punpckhqdq %xmm6,%xmm0

	punpcklqdq %xmm7,%xmm5
	punpckhqdq %xmm7,%xmm0

	punpcklqdq %xmm10,%xmm8
	punpckhqdq %xmm10,%xmm0

	punpcklqdq %xmm11,%xmm9
	punpckhqdq %xmm11,%xmm0

	punpcklqdq %xmm14,%xmm12
	punpckhqdq %xmm14,%xmm0

	punpcklqdq %xmm15,%xmm13
	punpckhqdq %xmm15,%xmm0

	# xor with corresponding input, write to output
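	# (movdqu is used for the data buffers since neither the input nor
	# the output pointer is required to be 16-byte aligned; the stack
	# slots, by contrast, are accessed with aligned movdqa.)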
	movdqa 0x00(%rsp),%xmm0
	movdqu 0x00(%rdx),%xmm1
	movdqu %xmm0,0x00(%rsi)

	movdqu 0x10(%rdx),%xmm1
	movdqu %xmm0,0x10(%rsi)

	movdqu 0x20(%rdx),%xmm1
	movdqu %xmm0,0x20(%rsi)

	movdqu 0x30(%rdx),%xmm1
	movdqu %xmm0,0x30(%rsi)

	movdqa 0x20(%rsp),%xmm0
	movdqu 0x40(%rdx),%xmm1
	movdqu %xmm0,0x40(%rsi)

	movdqu 0x50(%rdx),%xmm1
	movdqu %xmm0,0x50(%rsi)

	movdqu 0x60(%rdx),%xmm1
	movdqu %xmm0,0x60(%rsi)

	movdqu 0x70(%rdx),%xmm1
	movdqu %xmm0,0x70(%rsi)

	movdqa 0x10(%rsp),%xmm0
	movdqu 0x80(%rdx),%xmm1
	movdqu %xmm0,0x80(%rsi)

	movdqu 0x90(%rdx),%xmm1
	movdqu %xmm0,0x90(%rsi)

	movdqu 0xa0(%rdx),%xmm1
	movdqu %xmm0,0xa0(%rsi)

	movdqu 0xb0(%rdx),%xmm1
	movdqu %xmm0,0xb0(%rsi)

	movdqa 0x30(%rsp),%xmm0
	movdqu 0xc0(%rdx),%xmm1
	movdqu %xmm0,0xc0(%rsi)

	movdqu 0xd0(%rdx),%xmm1
	movdqu %xmm0,0xd0(%rsi)

	movdqu 0xe0(%rdx),%xmm1
	movdqu %xmm0,0xe0(%rsi)

	movdqu 0xf0(%rdx),%xmm1
	movdqu %xmm0,0xf0(%rsi)
	# xor remaining bytes from partial register into output
	pxor 0x00(%rsp),%xmm0
	movdqa %xmm0,0x00(%rsp)
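	# (As in the single-block case, the final partial keystream chunk is
	# XORed with the input bytes staged on the stack and written back
	# there, from where the tail is copied to the output byte-wise.)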
SYM_FUNC_END(chacha_4block_xor_ssse3)