/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>
/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
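/*
 * For reference, the quarter-round applied below to each column (and then
 * each diagonal) of the state is, as a plain C sketch (a hypothetical
 * helper, not part of this file; rol32() as in <linux/bitops.h>):
 *
 *      static inline void chacha_qr(u32 *a, u32 *b, u32 *c, u32 *d)
 *      {
 *              *a += *b; *d = rol32(*d ^ *a, 16);
 *              *c += *d; *b = rol32(*b ^ *c, 12);
 *              *a += *b; *d = rol32(*d ^ *a, 8);
 *              *c += *d; *b = rol32(*b ^ *c, 7);
 *      }
 *
 * Since each of v0-v3 holds one full row of the 4x4 state matrix, every
 * add/eor/rotate sequence below performs the same step of four
 * quarter-rounds at once.
 */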
SYM_FUNC_START_LOCAL(chacha_permute)

        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        add v0.4s, v0.4s, v1.4s
        eor v3.16b, v3.16b, v0.16b

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        add v2.4s, v2.4s, v3.4s
        eor v4.16b, v1.16b, v2.16b

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        add v0.4s, v0.4s, v1.4s
        eor v3.16b, v3.16b, v0.16b
        tbl v3.16b, {v3.16b}, v12.16b

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        add v2.4s, v2.4s, v3.4s
        eor v4.16b, v1.16b, v2.16b

        // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        ext v1.16b, v1.16b, v1.16b, #4
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        ext v2.16b, v2.16b, v2.16b, #8
        // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        ext v3.16b, v3.16b, v3.16b, #12
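        // Rotating the lanes of v1, v2 and v3 by one, two and three word
        // positions lines the diagonals of the state matrix up in the
        // vector lanes, so the second half of the double round can reuse
        // the same column-oriented code.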
        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        add v0.4s, v0.4s, v1.4s
        eor v3.16b, v3.16b, v0.16b

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        add v2.4s, v2.4s, v3.4s
        eor v4.16b, v1.16b, v2.16b

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        add v0.4s, v0.4s, v1.4s
        eor v3.16b, v3.16b, v0.16b
        tbl v3.16b, {v3.16b}, v12.16b

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        add v2.4s, v2.4s, v3.4s
        eor v4.16b, v1.16b, v2.16b

        // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        ext v1.16b, v1.16b, v1.16b, #12
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        ext v2.16b, v2.16b, v2.16b, #8
        // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        ext v3.16b, v3.16b, v3.16b, #4
SYM_FUNC_END(chacha_permute)
SYM_FUNC_START(chacha_block_xor_neon)
        // x0: Input state matrix, s
        // x1: 1 data block output, o
        // x2: 1 data block input, i
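        // As a C sketch of what this routine computes (per RFC 7539,
        // 'permute' standing in for chacha_permute above):
        //
        //      u32 x[16];                      // working copy of state s
        //      memcpy(x, s, 64);
        //      permute(x);                     // the rounds only
        //      for (n = 0; n < 16; n++)
        //              o[n] = i[n] ^ (x[n] + s[n]);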
        stp x29, x30, [sp, #-16]!

        ld1 {v0.4s-v3.4s}, [x0]
        ld1 {v8.4s-v11.4s}, [x0]

        ld1 {v4.16b-v7.16b}, [x2]

        // o0 = i0 ^ (x0 + s0)
        add v0.4s, v0.4s, v8.4s
        eor v0.16b, v0.16b, v4.16b

        // o1 = i1 ^ (x1 + s1)
        add v1.4s, v1.4s, v9.4s
        eor v1.16b, v1.16b, v5.16b

        // o2 = i2 ^ (x2 + s2)
        add v2.4s, v2.4s, v10.4s
        eor v2.16b, v2.16b, v6.16b

        // o3 = i3 ^ (x3 + s3)
        add v3.4s, v3.4s, v11.4s
        eor v3.16b, v3.16b, v7.16b

        st1 {v0.16b-v3.16b}, [x1]

        ldp x29, x30, [sp], #16
SYM_FUNC_END(chacha_block_xor_neon)
SYM_FUNC_START(hchacha_block_neon)
        // x0: Input state matrix, s
        // x1: output (8 32-bit words)
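        // HChaCha is the keyed permutation XChaCha uses to derive a subkey:
        // it runs the same rounds as above but omits the final addition of
        // the input state, and only words 0-3 and 12-15 of the permuted
        // state are written out (hence the 8 output words).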
        stp x29, x30, [sp, #-16]!

        ld1 {v0.4s-v3.4s}, [x0]

        st1 {v0.4s}, [x1], #16

        ldp x29, x30, [sp], #16
SYM_FUNC_END(hchacha_block_neon)
SYM_FUNC_START(chacha_4block_xor_neon)

        // x0: Input state matrix, s
        // x1: 4 data blocks output, o
        // x2: 4 data blocks input, i

        //
        // This function encrypts four consecutive ChaCha blocks by loading
        // the state matrix in NEON registers four times. The algorithm performs
        // each operation on the corresponding word of each state matrix, hence
        // requires no word shuffling. For the final XORing step we transpose the
        // matrix by interleaving 32- and then 64-bit words, which allows us to
        // do the XOR in NEON registers.
        //
        // At the same time, a fifth block is encrypted in parallel using
        // scalar registers.
        //
        adr_l x9, CTRINC                // ... and ROT8
        ld1 {v30.4s-v31.4s}, [x9]

        // x0..15[0-3] = s0..15[0-3]
        ld4r { v0.4s- v3.4s}, [x0]
        ld4r { v4.4s- v7.4s}, [x8], #16
        ld4r { v8.4s-v11.4s}, [x8], #16
        ld4r {v12.4s-v15.4s}, [x8]
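        // After the ld4r loads, vector register vN holds word N of the
        // state replicated into all four lanes, i.e. lane j of vN is word N
        // of block j. This word-sliced layout is what lets every add/eor/
        // rotate below advance the same word of all four blocks at once.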
        // x12 += counter values 1-4
        add v12.4s, v12.4s, v30.4s

        // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        add v0.4s, v0.4s, v4.4s
        add v1.4s, v1.4s, v5.4s
        add v2.4s, v2.4s, v6.4s
        add v3.4s, v3.4s, v7.4s

        eor v12.16b, v12.16b, v0.16b
        eor v13.16b, v13.16b, v1.16b
        eor v14.16b, v14.16b, v2.16b
        eor v15.16b, v15.16b, v3.16b

        // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        add v8.4s, v8.4s, v12.4s
        add v9.4s, v9.4s, v13.4s
        add v10.4s, v10.4s, v14.4s
        add v11.4s, v11.4s, v15.4s

        eor v16.16b, v4.16b, v8.16b
        eor v17.16b, v5.16b, v9.16b
        eor v18.16b, v6.16b, v10.16b
        eor v19.16b, v7.16b, v11.16b

        shl v4.4s, v16.4s, #12
        shl v5.4s, v17.4s, #12
        shl v6.4s, v18.4s, #12
        shl v7.4s, v19.4s, #12

        sri v4.4s, v16.4s, #20
        sri v5.4s, v17.4s, #20
        sri v6.4s, v18.4s, #20
        sri v7.4s, v19.4s, #20
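        // NEON has no 32-bit vector rotate instruction, so rotl32(x, 12) is
        // built in two steps: the eor result is kept in v16-v19, shifted
        // left by 12 into v4-v7 with SHL, and the remaining high bits are
        // merged back in with SRI (shift right by 20 and insert), i.e.
        // x = (t << 12) | (t >> 20).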
        // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        add v0.4s, v0.4s, v4.4s
        add v1.4s, v1.4s, v5.4s
        add v2.4s, v2.4s, v6.4s
        add v3.4s, v3.4s, v7.4s

        eor v12.16b, v12.16b, v0.16b
        eor v13.16b, v13.16b, v1.16b
        eor v14.16b, v14.16b, v2.16b
        eor v15.16b, v15.16b, v3.16b

        tbl v12.16b, {v12.16b}, v31.16b
        tbl v13.16b, {v13.16b}, v31.16b
        tbl v14.16b, {v14.16b}, v31.16b
        tbl v15.16b, {v15.16b}, v31.16b
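        // The rotate by 8 needs only one TBL per vector: v31 holds the ROT8
        // byte index table (see .rodata at the end of this file), which
        // moves each byte of a 32-bit lane up one position and wraps the
        // top byte around, giving rotl32(x, 8) at byte granularity.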
        // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        add v8.4s, v8.4s, v12.4s
        add v9.4s, v9.4s, v13.4s
        add v10.4s, v10.4s, v14.4s
        add v11.4s, v11.4s, v15.4s

        eor v16.16b, v4.16b, v8.16b
        eor v17.16b, v5.16b, v9.16b
        eor v18.16b, v6.16b, v10.16b
        eor v19.16b, v7.16b, v11.16b

        shl v4.4s, v16.4s, #7
        shl v5.4s, v17.4s, #7
        shl v6.4s, v18.4s, #7
        shl v7.4s, v19.4s, #7

        sri v4.4s, v16.4s, #25
        sri v5.4s, v17.4s, #25
        sri v6.4s, v18.4s, #25
        sri v7.4s, v19.4s, #25

        // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        add v0.4s, v0.4s, v5.4s
        add v1.4s, v1.4s, v6.4s
        add v2.4s, v2.4s, v7.4s
        add v3.4s, v3.4s, v4.4s

        eor v15.16b, v15.16b, v0.16b
        eor v12.16b, v12.16b, v1.16b
        eor v13.16b, v13.16b, v2.16b
        eor v14.16b, v14.16b, v3.16b
        // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        add v10.4s, v10.4s, v15.4s
        add v11.4s, v11.4s, v12.4s
        add v8.4s, v8.4s, v13.4s
        add v9.4s, v9.4s, v14.4s

        eor v16.16b, v5.16b, v10.16b
        eor v17.16b, v6.16b, v11.16b
        eor v18.16b, v7.16b, v8.16b
        eor v19.16b, v4.16b, v9.16b

        shl v5.4s, v16.4s, #12
        shl v6.4s, v17.4s, #12
        shl v7.4s, v18.4s, #12
        shl v4.4s, v19.4s, #12

        sri v5.4s, v16.4s, #20
        sri v6.4s, v17.4s, #20
        sri v7.4s, v18.4s, #20
        sri v4.4s, v19.4s, #20

        // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        add v0.4s, v0.4s, v5.4s
        add v1.4s, v1.4s, v6.4s
        add v2.4s, v2.4s, v7.4s
        add v3.4s, v3.4s, v4.4s

        eor v15.16b, v15.16b, v0.16b
        eor v12.16b, v12.16b, v1.16b
        eor v13.16b, v13.16b, v2.16b
        eor v14.16b, v14.16b, v3.16b
        tbl v15.16b, {v15.16b}, v31.16b
        tbl v12.16b, {v12.16b}, v31.16b
        tbl v13.16b, {v13.16b}, v31.16b
        tbl v14.16b, {v14.16b}, v31.16b

        // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        add v10.4s, v10.4s, v15.4s
        add v11.4s, v11.4s, v12.4s
        add v8.4s, v8.4s, v13.4s
        add v9.4s, v9.4s, v14.4s

        eor v16.16b, v5.16b, v10.16b
        eor v17.16b, v6.16b, v11.16b
        eor v18.16b, v7.16b, v8.16b
        eor v19.16b, v4.16b, v9.16b

        shl v5.4s, v16.4s, #7
        shl v6.4s, v17.4s, #7
        shl v7.4s, v18.4s, #7
        shl v4.4s, v19.4s, #7

        sri v5.4s, v16.4s, #25
        sri v6.4s, v17.4s, #25
        sri v7.4s, v18.4s, #25
        sri v4.4s, v19.4s, #25
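        // This ends the diagonal half of the round; one column half plus
        // one diagonal half make up a single ChaCha double round, applied
        // here to all four NEON blocks at once.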
        ld4r {v16.4s-v19.4s}, [x0], #16
        ld4r {v20.4s-v23.4s}, [x0], #16

        // x12 += counter values 1-4
        add v12.4s, v12.4s, v30.4s
        add v0.4s, v0.4s, v16.4s
        add v1.4s, v1.4s, v17.4s
        add v2.4s, v2.4s, v18.4s
        add v3.4s, v3.4s, v19.4s

        ld4r {v24.4s-v27.4s}, [x0], #16
        ld4r {v28.4s-v31.4s}, [x0]

        add v4.4s, v4.4s, v20.4s
        add v5.4s, v5.4s, v21.4s
        add v6.4s, v6.4s, v22.4s
        add v7.4s, v7.4s, v23.4s

        add v8.4s, v8.4s, v24.4s
        add v9.4s, v9.4s, v25.4s
        add v10.4s, v10.4s, v26.4s
        add v11.4s, v11.4s, v27.4s

CPU_BE( rev a10, a10 )
CPU_BE( rev a11, a11 )

        add v12.4s, v12.4s, v28.4s
        add v13.4s, v13.4s, v29.4s
        add v14.4s, v14.4s, v30.4s
        add v15.4s, v15.4s, v31.4s

CPU_BE( rev a12, a12 )
CPU_BE( rev a13, a13 )
CPU_BE( rev a14, a14 )
CPU_BE( rev a15, a15 )
        // interleave 32-bit words in state n, n+1
        ldp w6, w7, [x2], #64
        zip1 v16.4s, v0.4s, v1.4s
        ldp w8, w9, [x2, #-56]
        zip2 v17.4s, v0.4s, v1.4s
        zip1 v18.4s, v2.4s, v3.4s
        zip2 v19.4s, v2.4s, v3.4s

        ldp w6, w7, [x2, #-48]
        zip1 v20.4s, v4.4s, v5.4s
        ldp w8, w9, [x2, #-40]
        zip2 v21.4s, v4.4s, v5.4s
        zip1 v22.4s, v6.4s, v7.4s
        zip2 v23.4s, v6.4s, v7.4s

        ldp w6, w7, [x2, #-32]
        zip1 v24.4s, v8.4s, v9.4s
        ldp w8, w9, [x2, #-24]
        zip2 v25.4s, v8.4s, v9.4s
        zip1 v26.4s, v10.4s, v11.4s
        zip2 v27.4s, v10.4s, v11.4s

        ldp w6, w7, [x2, #-16]
        zip1 v28.4s, v12.4s, v13.4s
        ldp w8, w9, [x2, #-8]
        zip2 v29.4s, v12.4s, v13.4s
        zip1 v30.4s, v14.4s, v15.4s
        zip2 v31.4s, v14.4s, v15.4s
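        // The zip1/zip2 pairs interleave 32-bit words of neighbouring state
        // vectors; together with the 64-bit interleave below they transpose
        // the word-sliced layout back into four contiguous 64-byte blocks,
        // so the keystream can be XORed against the input with plain 128-bit
        // eor instructions. The interleaved scalar ldp loads fetch the first
        // 64 bytes of input for the fifth block.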
        sub x3, x3, #128                // start of last block

        // interleave 64-bit words in state n, n+2
        zip1 v0.2d, v16.2d, v18.2d
        zip2 v4.2d, v16.2d, v18.2d
        stp a0, a1, [x1], #64
        zip1 v8.2d, v17.2d, v19.2d
        zip2 v12.2d, v17.2d, v19.2d
        stp a2, a3, [x1, #-56]

        ld1 {v16.16b-v19.16b}, [x2], #64

        zip1 v1.2d, v20.2d, v22.2d
        zip2 v5.2d, v20.2d, v22.2d
        stp a4, a5, [x1, #-48]
        zip1 v9.2d, v21.2d, v23.2d
        zip2 v13.2d, v21.2d, v23.2d
        stp a6, a7, [x1, #-40]

        ld1 {v20.16b-v23.16b}, [x2], #64

        zip1 v2.2d, v24.2d, v26.2d
        zip2 v6.2d, v24.2d, v26.2d
        stp a8, a9, [x1, #-32]
        zip1 v10.2d, v25.2d, v27.2d
        zip2 v14.2d, v25.2d, v27.2d
        stp a10, a11, [x1, #-24]

        ld1 {v24.16b-v27.16b}, [x2], #64

        zip1 v3.2d, v28.2d, v30.2d
        zip2 v7.2d, v28.2d, v30.2d
        stp a12, a13, [x1, #-16]
        zip1 v11.2d, v29.2d, v31.2d
        zip2 v15.2d, v29.2d, v31.2d
        stp a14, a15, [x1, #-8]
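        // At this point the fifth block, processed in the scalar a0-a15
        // registers, has been written to the first 64 bytes of the output
        // by the stp stores above, and v0-v3, v4-v7, v8-v11 and v12-v15
        // hold the four NEON keystream blocks in output order, ready to be
        // XORed with the remaining input loaded into v16-v31.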
        ld1 {v28.16b-v31.16b}, [x2]

        // xor with corresponding input, write to output
        eor v16.16b, v16.16b, v0.16b
        eor v17.16b, v17.16b, v1.16b
        eor v18.16b, v18.16b, v2.16b
        eor v19.16b, v19.16b, v3.16b

        eor v20.16b, v20.16b, v4.16b
        eor v21.16b, v21.16b, v5.16b
        eor v22.16b, v22.16b, v6.16b
        eor v23.16b, v23.16b, v7.16b

        st1 {v16.16b-v19.16b}, [x1], #64

        eor v24.16b, v24.16b, v8.16b
        eor v25.16b, v25.16b, v9.16b
        eor v26.16b, v26.16b, v10.16b
        eor v27.16b, v27.16b, v11.16b

        st1 {v20.16b-v23.16b}, [x1], #64

        eor v28.16b, v28.16b, v12.16b
        eor v29.16b, v29.16b, v13.16b
        eor v30.16b, v30.16b, v14.16b
        eor v31.16b, v31.16b, v15.16b

        st1 {v24.16b-v27.16b}, [x1], #64
        st1 {v28.16b-v31.16b}, [x1]
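        // The .Lt* tails below handle lengths that are not a multiple of
        // 64 bytes: a byte permutation vector is loaded via x10 and fed to
        // TBL, which slides the final partial keystream block into place so
        // that the closing st1 can be issued as an overlapping store ending
        // exactly at the last output byte.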
732 // fewer than 192 bytes of in/output
733 .Lt192: cbz x5, 1f // exactly 128 bytes?
734 ld1 {v28.16b-v31.16b}, [x10]
736 tbl v28.16b, {v4.16b-v7.16b}, v28.16b
737 tbl v29.16b, {v4.16b-v7.16b}, v29.16b
738 tbl v30.16b, {v4.16b-v7.16b}, v30.16b
739 tbl v31.16b, {v4.16b-v7.16b}, v31.16b
741 0: eor v20.16b, v20.16b, v28.16b
742 eor v21.16b, v21.16b, v29.16b
743 eor v22.16b, v22.16b, v30.16b
744 eor v23.16b, v23.16b, v31.16b
745 st1 {v20.16b-v23.16b}, [x5] // overlapping stores
746 1: st1 {v16.16b-v19.16b}, [x1]
        // fewer than 128 bytes of in/output
.Lt128: ld1 {v28.16b-v31.16b}, [x10]

        tbl v28.16b, {v0.16b-v3.16b}, v28.16b
        tbl v29.16b, {v0.16b-v3.16b}, v29.16b
        tbl v30.16b, {v0.16b-v3.16b}, v30.16b
        tbl v31.16b, {v0.16b-v3.16b}, v31.16b
        ld1 {v16.16b-v19.16b}, [x1]     // reload first output block

        // fewer than 256 bytes of in/output
.Lt256: cbz x6, 2f                      // exactly 192 bytes?
        ld1 {v4.16b-v7.16b}, [x10]

        tbl v0.16b, {v8.16b-v11.16b}, v4.16b
        tbl v1.16b, {v8.16b-v11.16b}, v5.16b
        tbl v2.16b, {v8.16b-v11.16b}, v6.16b
        tbl v3.16b, {v8.16b-v11.16b}, v7.16b

        eor v28.16b, v28.16b, v0.16b
        eor v29.16b, v29.16b, v1.16b
        eor v30.16b, v30.16b, v2.16b
        eor v31.16b, v31.16b, v3.16b
        st1 {v28.16b-v31.16b}, [x6]     // overlapping stores
2:      st1 {v20.16b-v23.16b}, [x1]
        // fewer than 320 bytes of in/output
.Lt320: cbz x7, 3f                      // exactly 256 bytes?
        ld1 {v4.16b-v7.16b}, [x10]

        tbl v0.16b, {v12.16b-v15.16b}, v4.16b
        tbl v1.16b, {v12.16b-v15.16b}, v5.16b
        tbl v2.16b, {v12.16b-v15.16b}, v6.16b
        tbl v3.16b, {v12.16b-v15.16b}, v7.16b

        eor v28.16b, v28.16b, v0.16b
        eor v29.16b, v29.16b, v1.16b
        eor v30.16b, v30.16b, v2.16b
        eor v31.16b, v31.16b, v3.16b
        st1 {v28.16b-v31.16b}, [x7]     // overlapping stores
3:      st1 {v24.16b-v27.16b}, [x1]
SYM_FUNC_END(chacha_4block_xor_neon)
        .section ".rodata", "a", %progbits
        .align L1_CACHE_SHIFT
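        // CTRINC supplies the per-lane block counter increments: the scalar
        // block keeps the unincremented counter while the four NEON blocks
        // use counter + 1..4. ROT8 is the TBL index table used above to
        // rotate every 32-bit lane left by 8 bits (byte 3 moves to byte 0,
        // bytes 0-2 move up one position).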
CTRINC: .word 1, 2, 3, 4
ROT8:   .word 0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f