2 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
4 * Copyright (C) 2015 Martin Willi
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
12 #include <linux/linkage.h>
# Constant tables, loaded with RIP-relative movdqa by the functions in this
# file. movdqa needs 16-byte aligned memory operands; the alignment
# directive for these tables is not visible in this excerpt - TODO confirm.
#
# ROT8 and ROT16 are byte-index vectors. Read as pshufb controls they
# rotate every 32-bit lane left by 8 bits (ROT8) or by 16 bits (ROT16).
# NOTE(review): the pshufb instructions themselves are not visible here.
17 ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
18 ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
# CTRINC holds the 32-bit lane values 0, 1, 2, 3 (lowest lane first).
19 CTRINC: .octa 0x00000003000000020000000100000000
23 ENTRY(chacha20_block_xor_ssse3)
24 # %rdi: Input state matrix, s
25 # %rsi: 1 data block output, o
26 # %rdx: 1 data block input, i
28 # This function encrypts one ChaCha20 block by loading the state matrix
29 # in four SSE registers. It performs matrix operation on four words in
30 # parallel, but requires shuffling to rearrange the words after each
31 # round. 8/16-bit word rotation is done with the slightly better
32 # performing SSSE3 byte shuffling, 7/12-bit word rotation uses
33 # traditional shift+OR.
# NOTE(review): this excerpt shows only part of the routine. The step
# comments below describe arithmetic (add, xor, rotate) whose instructions
# are not visible here, and neither loop control nor a return is visible.
#
# Load the four 16-byte rows of the state into xmm0..xmm3. These are
# movdqa loads, so s has to be 16-byte aligned.
36 movdqa 0x00(%rdi),%xmm0
37 movdqa 0x10(%rdi),%xmm1
38 movdqa 0x20(%rdi),%xmm2
39 movdqa 0x30(%rdi),%xmm3
# Keep the byte-shuffle masks in registers: xmm4 = ROT8, xmm5 = ROT16.
45 movdqa ROT8(%rip),%xmm4
46 movdqa ROT16(%rip),%xmm5
# First group of four add/xor/rotate steps.
52 # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
57 # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
65 # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
70 # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
# Lane rotation between the two groups: pshufd with 0x39 moves old lane
# (i+1) mod 4 into lane i, 0x4e moves old lane (i+2) mod 4, and 0x93 moves
# old lane (i+3) mod 4.
78 # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
79 pshufd $0x39,%xmm1,%xmm1
80 # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
81 pshufd $0x4e,%xmm2,%xmm2
82 # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
83 pshufd $0x93,%xmm3,%xmm3
# Second group of four add/xor/rotate steps, on the rotated rows.
85 # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
90 # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
98 # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
103 # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
# Inverse lane rotation: x1 and x3 use each other's immediates from the
# shuffles above and x2 repeats its own, so every row returns to its
# original lane order.
111 # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
112 pshufd $0x93,%xmm1,%xmm1
113 # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
114 pshufd $0x4e,%xmm2,%xmm2
115 # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
116 pshufd $0x39,%xmm3,%xmm3
# Output stage. Input and output are accessed with movdqu, so i and o may
# be unaligned. The input rows are loaded into xmm4..xmm7; using xmm4 and
# xmm5 overwrites the ROT8/ROT16 masks loaded earlier. NOTE(review): the
# addition of the original state and the xor with the input named in the
# comments below are not visible in this excerpt.
121 # o0 = i0 ^ (x0 + s0)
122 movdqu 0x00(%rdx),%xmm4
125 movdqu %xmm0,0x00(%rsi)
126 # o1 = i1 ^ (x1 + s1)
127 movdqu 0x10(%rdx),%xmm5
130 movdqu %xmm1,0x10(%rsi)
131 # o2 = i2 ^ (x2 + s2)
132 movdqu 0x20(%rdx),%xmm6
135 movdqu %xmm2,0x20(%rsi)
136 # o3 = i3 ^ (x3 + s3)
137 movdqu 0x30(%rdx),%xmm7
140 movdqu %xmm3,0x30(%rsi)
143 ENDPROC(chacha20_block_xor_ssse3)
145 ENTRY(chacha20_4block_xor_ssse3)
146 # %rdi: Input state matrix, s
147 # %rsi: 4 data blocks output, o
148 # %rdx: 4 data blocks input, i
150 # This function encrypts four consecutive ChaCha20 blocks by loading the
151 # state matrix in SSE registers four times. As we need some scratch
152 # registers, we save the first four registers on the stack. The
153 # algorithm performs each operation on the corresponding word of each
154 # state matrix, hence requires no word shuffling. For final XORing step
155 # we transpose the matrix by interleaving 32- and then 64-bit words,
156 # which allows us to do XOR in SSE registers. 8/16-bit word rotation is
157 # done with the slightly better performing SSSE3 byte shuffling,
158 # 7/12-bit word rotation uses traditional shift+OR.
# NOTE(review): this excerpt shows only part of the routine. Stack
# reservation and alignment, the arithmetic behind the step comments, loop
# control and the return are not visible here.
#
# Layout: xN (a register, or a stack slot for x0..x3) holds state word N
# for all four blocks, one block per 32-bit lane.
#
# Each movq fetches two adjacent 32-bit state words. pshufd with 0x00 then
# broadcasts the lower word to every lane of the even register, and pshufd
# with 0x55 broadcasts the upper word to every lane of the odd register.
162 # x0..15[0-3] = s0..3[0..3]
163 movq 0x00(%rdi),%xmm1
164 pshufd $0x00,%xmm1,%xmm0
165 pshufd $0x55,%xmm1,%xmm1
166 movq 0x08(%rdi),%xmm3
167 pshufd $0x00,%xmm3,%xmm2
168 pshufd $0x55,%xmm3,%xmm3
169 movq 0x10(%rdi),%xmm5
170 pshufd $0x00,%xmm5,%xmm4
171 pshufd $0x55,%xmm5,%xmm5
172 movq 0x18(%rdi),%xmm7
173 pshufd $0x00,%xmm7,%xmm6
174 pshufd $0x55,%xmm7,%xmm7
175 movq 0x20(%rdi),%xmm9
176 pshufd $0x00,%xmm9,%xmm8
177 pshufd $0x55,%xmm9,%xmm9
178 movq 0x28(%rdi),%xmm11
179 pshufd $0x00,%xmm11,%xmm10
180 pshufd $0x55,%xmm11,%xmm11
181 movq 0x30(%rdi),%xmm13
182 pshufd $0x00,%xmm13,%xmm12
183 pshufd $0x55,%xmm13,%xmm13
184 movq 0x38(%rdi),%xmm15
185 pshufd $0x00,%xmm15,%xmm14
186 pshufd $0x55,%xmm15,%xmm15
# Spill x0..x3 to stack slots 0x00..0x30 so that xmm0..xmm3 can serve as
# scratch. These are movdqa stores, so the slots have to be 16-byte
# aligned; the code that reserves and aligns this stack area is not
# visible in this excerpt - TODO confirm.
188 movdqa %xmm0,0x00(%rsp)
189 movdqa %xmm1,0x10(%rsp)
190 movdqa %xmm2,0x20(%rsp)
191 movdqa %xmm3,0x30(%rsp)
# Constants: xmm1 = CTRINC, xmm2 = ROT8, xmm3 = ROT16.
193 movdqa CTRINC(%rip),%xmm1
194 movdqa ROT8(%rip),%xmm2
195 movdqa ROT16(%rip),%xmm3
197 # x12 += counter values 0-3
# In the steps below a stack-resident word (x0..x3) is loaded into xmm0
# and then written back to its slot. NOTE(review): the instructions that
# update it between the load and the store are not visible in this excerpt.
203 # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
204 movdqa 0x00(%rsp),%xmm0
206 movdqa %xmm0,0x00(%rsp)
209 # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
210 movdqa 0x10(%rsp),%xmm0
212 movdqa %xmm0,0x10(%rsp)
215 # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
216 movdqa 0x20(%rsp),%xmm0
218 movdqa %xmm0,0x20(%rsp)
221 # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
222 movdqa 0x30(%rsp),%xmm0
224 movdqa %xmm0,0x30(%rsp)
228 # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
235 # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
242 # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
249 # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
257 # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
258 movdqa 0x00(%rsp),%xmm0
260 movdqa %xmm0,0x00(%rsp)
263 # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
264 movdqa 0x10(%rsp),%xmm0
266 movdqa %xmm0,0x10(%rsp)
269 # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
270 movdqa 0x20(%rsp),%xmm0
272 movdqa %xmm0,0x20(%rsp)
275 # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
276 movdqa 0x30(%rsp),%xmm0
278 movdqa %xmm0,0x30(%rsp)
282 # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
289 # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
296 # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
303 # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
# Second group of steps: same pattern with the word pairing rotated
# (x0 with x5, x1 with x6, x2 with x7, x3 with x4). Because every word
# lives in its own register or slot, no lane shuffle is needed to get here.
311 # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
312 movdqa 0x00(%rsp),%xmm0
314 movdqa %xmm0,0x00(%rsp)
317 # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
318 movdqa 0x10(%rsp),%xmm0
320 movdqa %xmm0,0x10(%rsp)
323 # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
324 movdqa 0x20(%rsp),%xmm0
326 movdqa %xmm0,0x20(%rsp)
329 # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
330 movdqa 0x30(%rsp),%xmm0
332 movdqa %xmm0,0x30(%rsp)
336 # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
343 # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
350 # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
357 # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
365 # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
366 movdqa 0x00(%rsp),%xmm0
368 movdqa %xmm0,0x00(%rsp)
371 # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
372 movdqa 0x10(%rsp),%xmm0
374 movdqa %xmm0,0x10(%rsp)
377 # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
378 movdqa 0x20(%rsp),%xmm0
380 movdqa %xmm0,0x20(%rsp)
383 # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
384 movdqa 0x30(%rsp),%xmm0
386 movdqa %xmm0,0x30(%rsp)
390 # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
397 # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
404 # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
411 # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
# Feed-forward for x0..x3: re-read and broadcast the original state words,
# add the stack-resident working values with paddd from memory, and store
# the sums back to the same slots.
424 movq 0x00(%rdi),%xmm3
425 pshufd $0x00,%xmm3,%xmm2
426 pshufd $0x55,%xmm3,%xmm3
427 paddd 0x00(%rsp),%xmm2
428 movdqa %xmm2,0x00(%rsp)
429 paddd 0x10(%rsp),%xmm3
430 movdqa %xmm3,0x10(%rsp)
433 movq 0x08(%rdi),%xmm3
434 pshufd $0x00,%xmm3,%xmm2
435 pshufd $0x55,%xmm3,%xmm3
436 paddd 0x20(%rsp),%xmm2
437 movdqa %xmm2,0x20(%rsp)
438 paddd 0x30(%rsp),%xmm3
439 movdqa %xmm3,0x30(%rsp)
# Broadcasts of the remaining original state words (offsets 0x10..0x38)
# into xmm2/xmm3. NOTE(review): the additions that consume these
# broadcasts are not visible in this excerpt.
443 movq 0x10(%rdi),%xmm3
444 pshufd $0x00,%xmm3,%xmm2
445 pshufd $0x55,%xmm3,%xmm3
450 movq 0x18(%rdi),%xmm3
451 pshufd $0x00,%xmm3,%xmm2
452 pshufd $0x55,%xmm3,%xmm3
458 movq 0x20(%rdi),%xmm3
459 pshufd $0x00,%xmm3,%xmm2
460 pshufd $0x55,%xmm3,%xmm3
465 movq 0x28(%rdi),%xmm3
466 pshufd $0x00,%xmm3,%xmm2
467 pshufd $0x55,%xmm3,%xmm3
473 movq 0x30(%rdi),%xmm3
474 pshufd $0x00,%xmm3,%xmm2
475 pshufd $0x55,%xmm3,%xmm3
480 movq 0x38(%rdi),%xmm3
481 pshufd $0x00,%xmm3,%xmm2
482 pshufd $0x55,%xmm3,%xmm3
486 # x12 += counter values 0-3
# Transpose, step 1: punpckldq interleaves the two low 32-bit lanes of a
# register pair and punpckhdq the two high lanes. The stack-resident words
# are loaded, interleaved and stored back to their slots. NOTE(review):
# xmm2 and xmm0 are used as destinations without a visible copy of the
# first operand; presumably the register copies and write-backs are among
# the instructions missing from this excerpt - verify.
489 # interleave 32-bit words in state n, n+1
490 movdqa 0x00(%rsp),%xmm0
491 movdqa 0x10(%rsp),%xmm1
493 punpckldq %xmm1,%xmm2
494 punpckhdq %xmm1,%xmm0
495 movdqa %xmm2,0x00(%rsp)
496 movdqa %xmm0,0x10(%rsp)
497 movdqa 0x20(%rsp),%xmm0
498 movdqa 0x30(%rsp),%xmm1
500 punpckldq %xmm1,%xmm2
501 punpckhdq %xmm1,%xmm0
502 movdqa %xmm2,0x20(%rsp)
503 movdqa %xmm0,0x30(%rsp)
505 punpckldq %xmm5,%xmm4
506 punpckhdq %xmm5,%xmm0
509 punpckldq %xmm7,%xmm6
510 punpckhdq %xmm7,%xmm0
513 punpckldq %xmm9,%xmm8
514 punpckhdq %xmm9,%xmm0
517 punpckldq %xmm11,%xmm10
518 punpckhdq %xmm11,%xmm0
521 punpckldq %xmm13,%xmm12
522 punpckhdq %xmm13,%xmm0
525 punpckldq %xmm15,%xmm14
526 punpckhdq %xmm15,%xmm0
# Transpose, step 2: punpcklqdq/punpckhqdq interleave the low/high 64-bit
# halves of registers n and n+2. The same caveat about register copies not
# visible in this excerpt applies.
529 # interleave 64-bit words in state n, n+2
530 movdqa 0x00(%rsp),%xmm0
531 movdqa 0x20(%rsp),%xmm1
533 punpcklqdq %xmm1,%xmm2
534 punpckhqdq %xmm1,%xmm0
535 movdqa %xmm2,0x00(%rsp)
536 movdqa %xmm0,0x20(%rsp)
537 movdqa 0x10(%rsp),%xmm0
538 movdqa 0x30(%rsp),%xmm1
540 punpcklqdq %xmm1,%xmm2
541 punpckhqdq %xmm1,%xmm0
542 movdqa %xmm2,0x10(%rsp)
543 movdqa %xmm0,0x30(%rsp)
545 punpcklqdq %xmm6,%xmm4
546 punpckhqdq %xmm6,%xmm0
549 punpcklqdq %xmm7,%xmm5
550 punpckhqdq %xmm7,%xmm0
553 punpcklqdq %xmm10,%xmm8
554 punpckhqdq %xmm10,%xmm0
557 punpcklqdq %xmm11,%xmm9
558 punpckhqdq %xmm11,%xmm0
561 punpcklqdq %xmm14,%xmm12
562 punpckhqdq %xmm14,%xmm0
565 punpcklqdq %xmm15,%xmm13
566 punpckhqdq %xmm15,%xmm0
# Output stage: each step loads 16 input bytes into xmm1 with movdqu and
# stores a result register to the same offset of o with movdqu, so i and o
# may be unaligned. The offsets cover 0x00..0xf0, that is 256 bytes.
# Stack slots 0x00, 0x10, 0x20 and 0x30 pair with output offsets 0x00,
# 0x80, 0x40 and 0xc0 respectively; the registers follow the same
# interleaved offset order. NOTE(review): the xor between each load and
# store is not visible in this excerpt.
569 # xor with corresponding input, write to output
570 movdqa 0x00(%rsp),%xmm0
571 movdqu 0x00(%rdx),%xmm1
573 movdqu %xmm0,0x00(%rsi)
574 movdqa 0x10(%rsp),%xmm0
575 movdqu 0x80(%rdx),%xmm1
577 movdqu %xmm0,0x80(%rsi)
578 movdqa 0x20(%rsp),%xmm0
579 movdqu 0x40(%rdx),%xmm1
581 movdqu %xmm0,0x40(%rsi)
582 movdqa 0x30(%rsp),%xmm0
583 movdqu 0xc0(%rdx),%xmm1
585 movdqu %xmm0,0xc0(%rsi)
586 movdqu 0x10(%rdx),%xmm1
588 movdqu %xmm4,0x10(%rsi)
589 movdqu 0x90(%rdx),%xmm1
591 movdqu %xmm5,0x90(%rsi)
592 movdqu 0x50(%rdx),%xmm1
594 movdqu %xmm6,0x50(%rsi)
595 movdqu 0xd0(%rdx),%xmm1
597 movdqu %xmm7,0xd0(%rsi)
598 movdqu 0x20(%rdx),%xmm1
600 movdqu %xmm8,0x20(%rsi)
601 movdqu 0xa0(%rdx),%xmm1
603 movdqu %xmm9,0xa0(%rsi)
604 movdqu 0x60(%rdx),%xmm1
606 movdqu %xmm10,0x60(%rsi)
607 movdqu 0xe0(%rdx),%xmm1
609 movdqu %xmm11,0xe0(%rsi)
610 movdqu 0x30(%rdx),%xmm1
612 movdqu %xmm12,0x30(%rsi)
613 movdqu 0xb0(%rdx),%xmm1
615 movdqu %xmm13,0xb0(%rsi)
616 movdqu 0x70(%rdx),%xmm1
618 movdqu %xmm14,0x70(%rsi)
619 movdqu 0xf0(%rdx),%xmm1
621 movdqu %xmm15,0xf0(%rsi)
625 ENDPROC(chacha20_4block_xor_ssse3)