/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/cache.h>
/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.
 *
 * Clobbers: w3, x10, v4, v12
 */
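/*
 * For reference, one ChaCha quarter-round, roughly in C (rol32() as in
 * <linux/bitops.h>); the NEON code below applies each of these steps to
 * four 32-bit words at once, i.e. to all four columns and then to all
 * four diagonals of the state matrix:
 *
 *	a += b; d = rol32(d ^ a, 16);
 *	c += d; b = rol32(b ^ c, 12);
 *	a += b; d = rol32(d ^ a, 8);
 *	c += d; b = rol32(b ^ c, 7);
 */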
SYM_FUNC_START_LOCAL(chacha_permute)

	adr_l		x10, ROT8
	ld1		{v12.4s}, [x10]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		w3, w3, #2
	b.ne		.Ldoubleround

	ret
SYM_FUNC_END(chacha_permute)
SYM_FUNC_START(chacha_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i
	// w3: nrounds
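	//
	// Roughly, in C (a sketch; little-endian assumed, x[] is a copy of s[]
	// run through chacha_permute above, crypto_xor_cpy() is from
	// <crypto/algapi.h>):
	//
	//	for (n = 0; n < 16; n++)
	//		x[n] += s[n];			// feed-forward
	//	crypto_xor_cpy(o, i, (u8 *)x, 64);	// o = i ^ keystream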
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]

	bl		chacha_permute

	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(chacha_block_xor_neon)
SYM_FUNC_START(hchacha_block_neon)
	// x0: Input state matrix, s
	// x1: output (8 32-bit words)
	// w2: nrounds
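	//
	// HChaCha sketch, roughly in C (u32 x[16] is a copy of s[] run through
	// the permutation; there is no feed-forward addition of the original
	// state).  Words 0..3 and 12..15 are returned, which XChaCha uses as a
	// derived key:
	//
	//	memcpy(out, x, 16);		// x[0..3]
	//	memcpy(out + 4, x + 12, 16);	// x[12..15]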
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	ld1		{v0.4s-v3.4s}, [x0]

	mov		w3, w2
	bl		chacha_permute

	st1		{v0.4s}, [x1], #16
	st1		{v3.4s}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(hchacha_block_neon)
SYM_FUNC_START(chacha_4block_xor_neon)
	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i
	// w3: nrounds
	// x4: input/output length in bytes
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix in NEON registers four times. The algorithm performs
	// each operation on the corresponding word of each state matrix, hence
	// requires no word shuffling. For final XORing step we transpose the
	// matrix by interleaving 32- and then 64-bit words, which allows us to
	// do XOR in NEON registers.
	//
	// At the same time, a fifth block is encrypted in parallel using
	// scalar instructions.
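	//
	// Rough data-flow sketch (illustration only; the counter handling for
	// the extra scalar block is in the code below):
	//
	//	u32 x[16][4];		// x[i][j] = state word i of NEON block j
	//	for (r = 0; r < nrounds; r += 2)
	//		double_round(x);	// one vector op advances all lanes
	//	x[i][j] += s[i];		// feed-forward (+ counter for i == 12)
	//	transpose(x);			// zip 32-bit, then 64-bit words
	//	o = i ^ x;			// keystream now block-contiguous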
	adr_l		x9, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x9]
	// x0..15[0-3] = s0..15[0-3]
	add		x8, x0, #16
	ld4r		{ v0.4s- v3.4s}, [x0]
	ld4r		{ v4.4s- v7.4s}, [x8], #16
	ld4r		{ v8.4s-v11.4s}, [x8], #16
	ld4r		{v12.4s-v15.4s}, [x8]
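	// ld4r de-interleaves and replicates, so v0 = {s0, s0, s0, s0},
	// v1 = {s1, s1, s1, s1}, ..., v15 = {s15, s15, s15, s15}: each
	// register holds one state word for all four NEON blocks.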
	// x12 += counter values 1-4
	add		v12.4s, v12.4s, v30.4s
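	// CTRINC is { 1, 2, 3, 4 }, so each NEON lane gets a distinct block
	// counter; the unincremented counter is used by the scalar block.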
.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	add		v1.4s, v1.4s, v5.4s
	add		v2.4s, v2.4s, v6.4s
	add		v3.4s, v3.4s, v7.4s

	eor		v12.16b, v12.16b, v0.16b
	eor		v13.16b, v13.16b, v1.16b
	eor		v14.16b, v14.16b, v2.16b
	eor		v15.16b, v15.16b, v3.16b

	rev32		v12.8h, v12.8h
	rev32		v13.8h, v13.8h
	rev32		v14.8h, v14.8h
	rev32		v15.8h, v15.8h

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	add		v9.4s, v9.4s, v13.4s
	add		v10.4s, v10.4s, v14.4s
	add		v11.4s, v11.4s, v15.4s

	eor		v16.16b, v4.16b, v8.16b
	eor		v17.16b, v5.16b, v9.16b
	eor		v18.16b, v6.16b, v10.16b
	eor		v19.16b, v7.16b, v11.16b

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12
	shl		v6.4s, v18.4s, #12
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	sri		v5.4s, v17.4s, #20
	sri		v6.4s, v18.4s, #20
	sri		v7.4s, v19.4s, #20
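	// NEON has no 32-bit rotate instruction, so rotl32(x, 12) is open
	// coded as shl #12 into a fresh register followed by sri #20, which
	// shifts right and inserts the high bits.  The rotations by 16 and
	// by 8 use the cheaper rev32 and tbl byte permutes instead.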
	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	add		v1.4s, v1.4s, v5.4s
	add		v2.4s, v2.4s, v6.4s
	add		v3.4s, v3.4s, v7.4s

	eor		v12.16b, v12.16b, v0.16b
	eor		v13.16b, v13.16b, v1.16b
	eor		v14.16b, v14.16b, v2.16b
	eor		v15.16b, v15.16b, v3.16b

	tbl		v12.16b, {v12.16b}, v31.16b
	tbl		v13.16b, {v13.16b}, v31.16b
	tbl		v14.16b, {v14.16b}, v31.16b
	tbl		v15.16b, {v15.16b}, v31.16b

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	add		v9.4s, v9.4s, v13.4s
	add		v10.4s, v10.4s, v14.4s
	add		v11.4s, v11.4s, v15.4s

	eor		v16.16b, v4.16b, v8.16b
	eor		v17.16b, v5.16b, v9.16b
	eor		v18.16b, v6.16b, v10.16b
	eor		v19.16b, v7.16b, v11.16b

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7
	shl		v6.4s, v18.4s, #7
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	sri		v5.4s, v17.4s, #25
	sri		v6.4s, v18.4s, #25
	sri		v7.4s, v19.4s, #25

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	add		v1.4s, v1.4s, v6.4s
	add		v2.4s, v2.4s, v7.4s
	add		v3.4s, v3.4s, v4.4s

	eor		v15.16b, v15.16b, v0.16b
	eor		v12.16b, v12.16b, v1.16b
	eor		v13.16b, v13.16b, v2.16b
	eor		v14.16b, v14.16b, v3.16b

	rev32		v15.8h, v15.8h
	rev32		v12.8h, v12.8h
	rev32		v13.8h, v13.8h
	rev32		v14.8h, v14.8h

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	add		v11.4s, v11.4s, v12.4s
	add		v8.4s, v8.4s, v13.4s
	add		v9.4s, v9.4s, v14.4s

	eor		v16.16b, v5.16b, v10.16b
	eor		v17.16b, v6.16b, v11.16b
	eor		v18.16b, v7.16b, v8.16b
	eor		v19.16b, v4.16b, v9.16b

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12
	shl		v7.4s, v18.4s, #12
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	sri		v6.4s, v17.4s, #20
	sri		v7.4s, v18.4s, #20
	sri		v4.4s, v19.4s, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	add		v1.4s, v1.4s, v6.4s
	add		v2.4s, v2.4s, v7.4s
	add		v3.4s, v3.4s, v4.4s

	eor		v15.16b, v15.16b, v0.16b
	eor		v12.16b, v12.16b, v1.16b
	eor		v13.16b, v13.16b, v2.16b
	eor		v14.16b, v14.16b, v3.16b

	tbl		v15.16b, {v15.16b}, v31.16b
	tbl		v12.16b, {v12.16b}, v31.16b
	tbl		v13.16b, {v13.16b}, v31.16b
	tbl		v14.16b, {v14.16b}, v31.16b

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	add		v11.4s, v11.4s, v12.4s
	add		v8.4s, v8.4s, v13.4s
	add		v9.4s, v9.4s, v14.4s

	eor		v16.16b, v5.16b, v10.16b
	eor		v17.16b, v6.16b, v11.16b
	eor		v18.16b, v7.16b, v8.16b
	eor		v19.16b, v4.16b, v9.16b

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7
	shl		v7.4s, v18.4s, #7
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	sri		v6.4s, v17.4s, #25
	sri		v7.4s, v18.4s, #25
	sri		v4.4s, v19.4s, #25

	subs		w3, w3, #2
	b.ne		.Ldoubleround4
	ld4r		{v16.4s-v19.4s}, [x0], #16
	ld4r		{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 0-3
	add		v12.4s, v12.4s, v30.4s

	add		v0.4s, v0.4s, v16.4s
	add		v1.4s, v1.4s, v17.4s
	add		v2.4s, v2.4s, v18.4s
	add		v3.4s, v3.4s, v19.4s

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

	add		v4.4s, v4.4s, v20.4s
	add		v5.4s, v5.4s, v21.4s
	add		v6.4s, v6.4s, v22.4s
	add		v7.4s, v7.4s, v23.4s

	add		v8.4s, v8.4s, v24.4s
	add		v9.4s, v9.4s, v25.4s
	add		v10.4s, v10.4s, v26.4s
	add		v11.4s, v11.4s, v27.4s

CPU_BE(	  rev		a10, a10	)
CPU_BE(	  rev		a11, a11	)

	add		v12.4s, v12.4s, v28.4s
	add		v13.4s, v13.4s, v29.4s
	add		v14.4s, v14.4s, v30.4s
	add		v15.4s, v15.4s, v31.4s

CPU_BE(	  rev		a12, a12	)
CPU_BE(	  rev		a13, a13	)
CPU_BE(	  rev		a14, a14	)
CPU_BE(	  rev		a15, a15	)

	// interleave 32-bit words in state n, n+1
	ldp		w6, w7, [x2], #64
	zip1		v16.4s, v0.4s, v1.4s
	ldp		w8, w9, [x2, #-56]
	zip2		v17.4s, v0.4s, v1.4s
	zip1		v18.4s, v2.4s, v3.4s
	zip2		v19.4s, v2.4s, v3.4s

	ldp		w6, w7, [x2, #-48]
	zip1		v20.4s, v4.4s, v5.4s
	ldp		w8, w9, [x2, #-40]
	zip2		v21.4s, v4.4s, v5.4s
	zip1		v22.4s, v6.4s, v7.4s
	zip2		v23.4s, v6.4s, v7.4s

	ldp		w6, w7, [x2, #-32]
	zip1		v24.4s, v8.4s, v9.4s
	ldp		w8, w9, [x2, #-24]
	zip2		v25.4s, v8.4s, v9.4s
	zip1		v26.4s, v10.4s, v11.4s
	zip2		v27.4s, v10.4s, v11.4s

	ldp		w6, w7, [x2, #-16]
	zip1		v28.4s, v12.4s, v13.4s
	ldp		w8, w9, [x2, #-8]
	zip2		v29.4s, v12.4s, v13.4s
	zip1		v30.4s, v14.4s, v15.4s
	zip2		v31.4s, v14.4s, v15.4s
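	// Together with the 64-bit zips below, this transposes the register
	// file from "one state word, four blocks" to "one block, four
	// consecutive words": v0-v3 end up holding the first NEON block,
	// v4-v7 the second, v8-v11 the third and v12-v15 the fourth, ready
	// to be XORed directly against the input loaded into v16-v31.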
	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	stp		a0, a1, [x1], #64
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	stp		a2, a3, [x1, #-56]
	ld1		{v16.16b-v19.16b}, [x2], x3

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	stp		a4, a5, [x1, #-48]
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	stp		a6, a7, [x1, #-40]
	ld1		{v20.16b-v23.16b}, [x2], x3

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	stp		a8, a9, [x1, #-32]
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	stp		a10, a11, [x1, #-24]
	ld1		{v24.16b-v27.16b}, [x2], x3

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	stp		a12, a13, [x1, #-16]
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	stp		a14, a15, [x1, #-8]
	ld1		{v28.16b-v31.16b}, [x2]

	// xor with corresponding input, write to output
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b
	st1		{v16.16b-v19.16b}, [x1], #64

	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1], #64

	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b
	st1		{v24.16b-v27.16b}, [x1], #64

	eor		v28.16b, v28.16b, v12.16b
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b
	st1		{v28.16b-v31.16b}, [x1]
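	// The numbered labels below handle tails shorter than a full set of
	// blocks: an index vector loaded through x10 drives tbl/tbx byte
	// permutes that pick out the keystream and input bytes that are
	// still needed, so a partial final block can be finished with
	// full-width vector operations.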
	// fewer than 128 bytes of in/output
0:	ld1		{v8.16b}, [x10]
	ld1		{v16.16b-v19.16b}, [x2]
	tbl		v4.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v5.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v6.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v7.16b, {v0.16b-v3.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	st1		{v20.16b-v23.16b}, [x1]

	// fewer than 192 bytes of in/output
1:	ld1		{v8.16b}, [x10]
	tbl		v0.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v20.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v1.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v21.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v2.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v22.16b, {v16.16b-v19.16b}, v9.16b
	add		v8.16b, v8.16b, v10.16b
	add		v9.16b, v9.16b, v10.16b
	tbl		v3.16b, {v4.16b-v7.16b}, v8.16b
	tbx		v23.16b, {v16.16b-v19.16b}, v9.16b

	eor		v20.16b, v20.16b, v0.16b
	eor		v21.16b, v21.16b, v1.16b
	eor		v22.16b, v22.16b, v2.16b
	eor		v23.16b, v23.16b, v3.16b
	st1		{v20.16b-v23.16b}, [x1]

	// fewer than 256 bytes of in/output
2:	ld1		{v4.16b}, [x10]
	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v24.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v25.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v26.16b, {v20.16b-v23.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v8.16b-v11.16b}, v4.16b
	tbx		v27.16b, {v20.16b-v23.16b}, v5.16b

	eor		v24.16b, v24.16b, v0.16b
	eor		v25.16b, v25.16b, v1.16b
	eor		v26.16b, v26.16b, v2.16b
	eor		v27.16b, v27.16b, v3.16b
	st1		{v24.16b-v27.16b}, [x1]

	// fewer than 320 bytes of in/output
3:	ld1		{v4.16b}, [x10]
	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v28.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v1.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v29.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v2.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v30.16b, {v24.16b-v27.16b}, v5.16b
	add		v4.16b, v4.16b, v6.16b
	add		v5.16b, v5.16b, v6.16b
	tbl		v3.16b, {v12.16b-v15.16b}, v4.16b
	tbx		v31.16b, {v24.16b-v27.16b}, v5.16b

	eor		v28.16b, v28.16b, v0.16b
	eor		v29.16b, v29.16b, v1.16b
	eor		v30.16b, v30.16b, v2.16b
	eor		v31.16b, v31.16b, v3.16b
	st1		{v28.16b-v31.16b}, [x1]
SYM_FUNC_END(chacha_4block_xor_neon)

	.section	".rodata", "a", %progbits
	.align		L1_CACHE_SHIFT
CTRINC:	.word		1, 2, 3, 4
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
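/*
 * ROT8 is the byte-shuffle mask used with tbl to rotate each 32-bit lane
 * left by 8: for little-endian lane k (bytes 4k..4k+3) the indices are
 * { 4k+3, 4k+0, 4k+1, 4k+2 }, e.g. 0x02010003 selects bytes 3, 0, 1, 2 for
 * lane 0.  CTRINC supplies the per-lane block counter increments that are
 * added to word 12 of the state.
 */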