/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
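//
// For reference, a rough sketch of the scalar ChaCha quarter-round that the
// code below vectorizes, in C-like pseudocode (rol32() is just notation for a
// 32-bit rotate left). Applied to rows v0-v3 it processes all four columns at
// once, and after the word shuffles, all four diagonals at once:
//
//	QR(a, b, c, d):
//		a += b;  d ^= a;  d = rol32(d, 16);
//		c += d;  b ^= c;  b = rol32(b, 12);
//		a += b;  d ^= a;  d = rol32(d,  8);
//		c += d;  b ^= c;  b = rol32(b,  7);
//
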
chacha_permute:

	adr_l		x10, ROT8
	ld1		{v12.4s}, [x10]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		w3, w3, #2
	b.ne		.Ldoubleround

	ret
ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	// x0..3 = s0..3
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]

	bl		chacha_permute

	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds
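	//
	// HChaCha applies the ChaCha permutation only: unlike the block
	// function above, the input state is not added back afterwards, and
	// only the first and last rows of the permuted state (words 0-3 and
	// 12-15) are written out as the 8-word result.
	//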

	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]

	mov		w3, w2
	bl		chacha_permute

	st1		{v0.16b}, [x1], #16
	st1		{v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
ENDPROC(hchacha_block_neon)

ENTRY(chacha_4block_xor_neon)
	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds
	// x4: byte count

	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix into NEON registers four times. The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, hence no word shuffling is required. For the final XOR step,
	// we transpose the matrix by interleaving 32-bit and then 64-bit
	// words, which allows us to do the XOR in NEON registers.
	//
	// At the same time, a fifth block is encrypted in parallel using
	// scalar instructions.
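	//
	// A sketch of that transpose (notation only: bJ.wI denotes word I of
	// block J). After the rounds, register n holds word n of all four
	// blocks, e.g. v0 = { b0.w0, b1.w0, b2.w0, b3.w0 }. Then:
	//
	//	zip1 v16.4s, v0.4s, v1.4s   ->  { b0.w0, b0.w1, b1.w0, b1.w1 }
	//	zip1 v18.4s, v2.4s, v3.4s   ->  { b0.w2, b0.w3, b1.w2, b1.w3 }
	//	zip1 v0.2d, v16.2d, v18.2d  ->  { b0.w0, b0.w1, b0.w2, b0.w3 }
	//
	// i.e. four consecutive words of block 0, ready to be XORed against
	// the first 16 bytes of that block's input.
	//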

	adr_l		x9, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x9]

	// x0..15[0-3] = s0..15[0-3]
	add		x8, x0, #16
	ld4r		{ v0.4s- v3.4s}, [x0]
	ld4r		{ v4.4s- v7.4s}, [x8], #16
	ld4r		{ v8.4s-v11.4s}, [x8], #16
	ld4r		{v12.4s-v15.4s}, [x8]

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	add		v1.4s, v1.4s, v5.4s
	add		v2.4s, v2.4s, v6.4s
	add		v3.4s, v3.4s, v7.4s

	eor		v12.16b, v12.16b, v0.16b
	eor		v13.16b, v13.16b, v1.16b
	eor		v14.16b, v14.16b, v2.16b
	eor		v15.16b, v15.16b, v3.16b

	rev32		v12.8h, v12.8h
	rev32		v13.8h, v13.8h
	rev32		v14.8h, v14.8h
	rev32		v15.8h, v15.8h

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	add		v9.4s, v9.4s, v13.4s
	add		v10.4s, v10.4s, v14.4s
	add		v11.4s, v11.4s, v15.4s

	eor		v16.16b, v4.16b, v8.16b
	eor		v17.16b, v5.16b, v9.16b
	eor		v18.16b, v6.16b, v10.16b
	eor		v19.16b, v7.16b, v11.16b

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12
	shl		v6.4s, v18.4s, #12
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	sri		v5.4s, v17.4s, #20
	sri		v6.4s, v18.4s, #20
	sri		v7.4s, v19.4s, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	add		v1.4s, v1.4s, v5.4s
	add		v2.4s, v2.4s, v6.4s
	add		v3.4s, v3.4s, v7.4s

	eor		v12.16b, v12.16b, v0.16b
	eor		v13.16b, v13.16b, v1.16b
	eor		v14.16b, v14.16b, v2.16b
	eor		v15.16b, v15.16b, v3.16b

	tbl		v12.16b, {v12.16b}, v31.16b
	tbl		v13.16b, {v13.16b}, v31.16b
	tbl		v14.16b, {v14.16b}, v31.16b
	tbl		v15.16b, {v15.16b}, v31.16b

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	add		v9.4s, v9.4s, v13.4s
	add		v10.4s, v10.4s, v14.4s
	add		v11.4s, v11.4s, v15.4s

	eor		v16.16b, v4.16b, v8.16b
	eor		v17.16b, v5.16b, v9.16b
	eor		v18.16b, v6.16b, v10.16b
	eor		v19.16b, v7.16b, v11.16b

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7
	shl		v6.4s, v18.4s, #7
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	sri		v5.4s, v17.4s, #25
	sri		v6.4s, v18.4s, #25
	sri		v7.4s, v19.4s, #25

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	add		v1.4s, v1.4s, v6.4s
	add		v2.4s, v2.4s, v7.4s
	add		v3.4s, v3.4s, v4.4s

	eor		v15.16b, v15.16b, v0.16b
	eor		v12.16b, v12.16b, v1.16b
	eor		v13.16b, v13.16b, v2.16b
	eor		v14.16b, v14.16b, v3.16b

	rev32		v15.8h, v15.8h
	rev32		v12.8h, v12.8h
	rev32		v13.8h, v13.8h
	rev32		v14.8h, v14.8h

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	add		v11.4s, v11.4s, v12.4s
	add		v8.4s, v8.4s, v13.4s
	add		v9.4s, v9.4s, v14.4s

	eor		v16.16b, v5.16b, v10.16b
	eor		v17.16b, v6.16b, v11.16b
	eor		v18.16b, v7.16b, v8.16b
	eor		v19.16b, v4.16b, v9.16b

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12
	shl		v7.4s, v18.4s, #12
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	sri		v6.4s, v17.4s, #20
	sri		v7.4s, v18.4s, #20
	sri		v4.4s, v19.4s, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	add		v1.4s, v1.4s, v6.4s
	add		v2.4s, v2.4s, v7.4s
	add		v3.4s, v3.4s, v4.4s

	eor		v15.16b, v15.16b, v0.16b
	eor		v12.16b, v12.16b, v1.16b
	eor		v13.16b, v13.16b, v2.16b
	eor		v14.16b, v14.16b, v3.16b

	tbl		v15.16b, {v15.16b}, v31.16b
	tbl		v12.16b, {v12.16b}, v31.16b
	tbl		v13.16b, {v13.16b}, v31.16b
	tbl		v14.16b, {v14.16b}, v31.16b

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	add		v11.4s, v11.4s, v12.4s
	add		v8.4s, v8.4s, v13.4s
	add		v9.4s, v9.4s, v14.4s

	eor		v16.16b, v5.16b, v10.16b
	eor		v17.16b, v6.16b, v11.16b
	eor		v18.16b, v7.16b, v8.16b
	eor		v19.16b, v4.16b, v9.16b

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7
	shl		v7.4s, v18.4s, #7
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	sri		v6.4s, v17.4s, #25
	sri		v7.4s, v18.4s, #25
	sri		v4.4s, v19.4s, #25

	subs		w3, w3, #2
	b.ne		.Ldoubleround4

	ld4r		{v16.4s-v19.4s}, [x0], #16
	ld4r		{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s

	add		v0.4s, v0.4s, v16.4s
	add		v1.4s, v1.4s, v17.4s
	add		v2.4s, v2.4s, v18.4s
	add		v3.4s, v3.4s, v19.4s

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

	add		v4.4s, v4.4s, v20.4s
	add		v5.4s, v5.4s, v21.4s
	add		v6.4s, v6.4s, v22.4s
	add		v7.4s, v7.4s, v23.4s

	add		v8.4s, v8.4s, v24.4s
	add		v9.4s, v9.4s, v25.4s
	add		v10.4s, v10.4s, v26.4s
	add		v11.4s, v11.4s, v27.4s

	add		v12.4s, v12.4s, v28.4s
	add		v13.4s, v13.4s, v29.4s
	add		v14.4s, v14.4s, v30.4s
	add		v15.4s, v15.4s, v31.4s

	// interleave 32-bit words in state n, n+1
	ldp		w6, w7, [x2], #64
	zip1		v16.4s, v0.4s, v1.4s
	ldp		w8, w9, [x2, #-56]
	zip2		v17.4s, v0.4s, v1.4s
	zip1		v18.4s, v2.4s, v3.4s
	zip2		v19.4s, v2.4s, v3.4s

	ldp		w6, w7, [x2, #-48]
	zip1		v20.4s, v4.4s, v5.4s
	ldp		w8, w9, [x2, #-40]
	zip2		v21.4s, v4.4s, v5.4s
	zip1		v22.4s, v6.4s, v7.4s
	zip2		v23.4s, v6.4s, v7.4s

	ldp		w6, w7, [x2, #-32]
	zip1		v24.4s, v8.4s, v9.4s
	ldp		w8, w9, [x2, #-24]
	zip2		v25.4s, v8.4s, v9.4s
	zip1		v26.4s, v10.4s, v11.4s
	zip2		v27.4s, v10.4s, v11.4s

	ldp		w6, w7, [x2, #-16]
	zip1		v28.4s, v12.4s, v13.4s
	ldp		w8, w9, [x2, #-8]
	zip2		v29.4s, v12.4s, v13.4s
	zip1		v30.4s, v14.4s, v15.4s
	zip2		v31.4s, v14.4s, v15.4s

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	stp		a0, a1, [x1], #64
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	stp		a2, a3, [x1, #-56]

	ld1		{v16.16b-v19.16b}, [x2], x3

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	stp		a4, a5, [x1, #-48]
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	stp		a6, a7, [x1, #-40]

	ld1		{v20.16b-v23.16b}, [x2], x3

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	stp		a8, a9, [x1, #-32]
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	stp		a10, a11, [x1, #-24]

	ld1		{v24.16b-v27.16b}, [x2], x3

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	stp		a12, a13, [x1, #-16]
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	stp		a14, a15, [x1, #-8]

	ld1		{v28.16b-v31.16b}, [x2]

	// xor with corresponding input, write to output
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b
	st1		{v16.16b-v19.16b}, [x1], #64

	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1], #64

	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b
	st1		{v24.16b-v27.16b}, [x1], #64

	eor		v28.16b, v28.16b, v12.16b
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b
	st1		{v28.16b-v31.16b}, [x1]

	ret
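
	//
	// Tail handling for inputs that are not a whole number of 64-byte
	// blocks: v8/v9 (and v4/v5 below) are assumed to hold byte indices
	// derived from a permutation table and the remaining byte count, with
	// v10/v6 holding the per-chunk step of #16. tbl then gathers the
	// required keystream bytes while tbx merges them over the partial
	// input block, so no byte-at-a-time loop is needed.
	//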

	// fewer than 128 bytes of in/output
0:	ld1		{v8.16b}, [x10]
	ld1		{v16.16b-v19.16b}, [x2]
	tbl		v4.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v5.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v6.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v7.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1]

	// fewer than 192 bytes of in/output
1:	ld1		{v8.16b}, [x10]
	tbl		v0.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v1.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v2.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v3.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v0.16b
	eor		v21.16b, v21.16b, v1.16b
	eor		v22.16b, v22.16b, v2.16b
	eor		v23.16b, v23.16b, v3.16b
	st1		{v20.16b-v23.16b}, [x1]

	// fewer than 256 bytes of in/output
2:	ld1		{v4.16b}, [x10]
	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v24.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v25.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v26.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v27.16b, {v20.16b-v23.16b}, v5.16b

	eor		v24.16b, v24.16b, v0.16b
	eor		v25.16b, v25.16b, v1.16b
	eor		v26.16b, v26.16b, v2.16b
	eor		v27.16b, v27.16b, v3.16b
	st1		{v24.16b-v27.16b}, [x1]

	// fewer than 320 bytes of in/output
3:	ld1		{v4.16b}, [x10]
	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v28.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v29.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v30.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v31.16b, {v24.16b-v27.16b}, v5.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x1]

	ret
ENDPROC(chacha_4block_xor_neon)

	.section	".rodata", "a", %progbits
	.align		L1_CACHE_SHIFT
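	// CTRINC supplies the per-block counter increments added to the
	// broadcast counter word: the NEON blocks use counter + 1..4, with
	// counter + 0 handled by the scalar block. ROT8 is a byte-permutation
	// table for tbl: each word holds the indices {3, 0, 1, 2} in memory
	// order, which rotates every 32-bit lane left by 8 bits.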
CTRINC:	.word		1, 2, 3, 4
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f