drm: add modifiers for MediaTek tiled formats
[drm/drm-misc.git] / arch / arm64 / crypto / chacha-neon-core.S
blobb70ac76f2610ce08b0bd1e5e5422cae4164aa5fe
/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Originally based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
21 #include <linux/linkage.h>
22 #include <asm/assembler.h>
23 #include <asm/cache.h>
// Code section; the first function starts on a 64-byte (2^6) boundary.
        .text
        .align          6

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in w3.  It must be even and non-zero: every loop
 * iteration performs a double round (column round + diagonal round), then
 * subtracts 2 and only leaves the loop when the counter is exactly zero.
 *
 * In/out:   v0-v3 (state rows 0-3)
 * Clobbers: w3, x10, v4, v12
 */
SYM_FUNC_START_LOCAL(chacha_permute)

        adr_l           x10, ROT8
        ld1             {v12.4s}, [x10]         // v12 = TBL mask for rotl32(x, 8)

.Ldoubleround:
        // ---- column round ----

        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        add             v0.4s, v0.4s, v1.4s
        eor             v3.16b, v3.16b, v0.16b
        rev32           v3.8h, v3.8h            // swap 16-bit halves == rotate by 16

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        add             v2.4s, v2.4s, v3.4s
        eor             v4.16b, v1.16b, v2.16b
        shl             v1.4s, v4.4s, #12       // shl + sri together form the rotate
        sri             v1.4s, v4.4s, #20

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        add             v0.4s, v0.4s, v1.4s
        eor             v3.16b, v3.16b, v0.16b
        tbl             v3.16b, {v3.16b}, v12.16b

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        add             v2.4s, v2.4s, v3.4s
        eor             v4.16b, v1.16b, v2.16b
        shl             v1.4s, v4.4s, #7
        sri             v1.4s, v4.4s, #25

        // Rotate rows 1-3 so that the diagonals line up as columns.

        // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        ext             v1.16b, v1.16b, v1.16b, #4
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        ext             v2.16b, v2.16b, v2.16b, #8
        // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        ext             v3.16b, v3.16b, v3.16b, #12

        // ---- diagonal round (same operations on the rotated rows) ----

        // x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        add             v0.4s, v0.4s, v1.4s
        eor             v3.16b, v3.16b, v0.16b
        rev32           v3.8h, v3.8h

        // x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        add             v2.4s, v2.4s, v3.4s
        eor             v4.16b, v1.16b, v2.16b
        shl             v1.4s, v4.4s, #12
        sri             v1.4s, v4.4s, #20

        // x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        add             v0.4s, v0.4s, v1.4s
        eor             v3.16b, v3.16b, v0.16b
        tbl             v3.16b, {v3.16b}, v12.16b

        // x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        add             v2.4s, v2.4s, v3.4s
        eor             v4.16b, v1.16b, v2.16b
        shl             v1.4s, v4.4s, #7
        sri             v1.4s, v4.4s, #25

        // Undo the row rotation (inverse of the three ext above).

        // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        ext             v1.16b, v1.16b, v1.16b, #12
        // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        ext             v2.16b, v2.16b, v2.16b, #8
        // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        ext             v3.16b, v3.16b, v3.16b, #4

        subs            w3, w3, #2              // two rounds done per iteration
        b.ne            .Ldoubleround

        ret
SYM_FUNC_END(chacha_permute)
/*
 * chacha_block_xor_neon - XOR one 64-byte block with ChaCha keystream
 *
 * Computes o = i ^ (permute(s) + s) for a single block.  The state at x0 is
 * only read, never written back.
 *
 * Clobbers: w3, x10, v0-v12
 */
SYM_FUNC_START(chacha_block_xor_neon)
        // x0: Input state matrix, s
        // x1: 1 data block output, o
        // x2: 1 data block input, i
        // w3: nrounds

        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        // x0..3 = s0..3
        ld1             {v0.4s-v3.4s}, [x0]
        ld1             {v8.4s-v11.4s}, [x0]    // second copy of s for the final addition

        bl              chacha_permute          // consumes w3 as the round count

        ld1             {v4.16b-v7.16b}, [x2]

        // o0 = i0 ^ (x0 + s0)
        add             v0.4s, v0.4s, v8.4s
        eor             v0.16b, v0.16b, v4.16b

        // o1 = i1 ^ (x1 + s1)
        add             v1.4s, v1.4s, v9.4s
        eor             v1.16b, v1.16b, v5.16b

        // o2 = i2 ^ (x2 + s2)
        add             v2.4s, v2.4s, v10.4s
        eor             v2.16b, v2.16b, v6.16b

        // o3 = i3 ^ (x3 + s3)
        add             v3.4s, v3.4s, v11.4s
        eor             v3.16b, v3.16b, v7.16b

        st1             {v0.16b-v3.16b}, [x1]

        ldp             x29, x30, [sp], #16
        ret
SYM_FUNC_END(chacha_block_xor_neon)
/*
 * hchacha_block_neon - permute the state and emit state words 0-3 and 12-15
 *
 * Unlike chacha_block_xor_neon, the input state is not added back after the
 * permutation; rows 0 and 3 of the permuted matrix are stored as-is.
 *
 * Clobbers: w3, x1 (advanced by 16), x10, v0-v4, v12
 */
SYM_FUNC_START(hchacha_block_neon)
        // x0: Input state matrix, s
        // x1: output (8 32-bit words)
        // w2: nrounds

        stp             x29, x30, [sp, #-16]!
        mov             x29, sp

        ld1             {v0.4s-v3.4s}, [x0]

        mov             w3, w2                  // chacha_permute takes the round count in w3
        bl              chacha_permute

        st1             {v0.4s}, [x1], #16      // output words 0-3 = row 0
        st1             {v3.4s}, [x1]           // output words 4-7 = row 3

        ldp             x29, x30, [sp], #16
        ret
SYM_FUNC_END(hchacha_block_neon)
/*
 * Scalar aliases a0-a15 for the sixteen 32-bit state words of the block that
 * chacha_4block_xor_neon processes in general purpose registers.
 *
 * w18 is skipped (presumably because x18 is the platform register).  a6-a15
 * therefore live in w19-w28, which are callee-saved under AAPCS64 and must be
 * preserved by any function that uses these aliases.
 */
        a0              .req    w12
        a1              .req    w13
        a2              .req    w14
        a3              .req    w15
        a4              .req    w16
        a5              .req    w17
        a6              .req    w19
        a7              .req    w20
        a8              .req    w21
        a9              .req    w22
        a10             .req    w23
        a11             .req    w24
        a12             .req    w25
        a13             .req    w26
        a14             .req    w27
        a15             .req    w28
/*
 * chacha_4block_xor_neon - XOR up to 320 bytes with ChaCha keystream
 *
 * Runs the ChaCha rounds on four blocks held in NEON registers and,
 * interleaved with them instruction by instruction, on one further block held
 * in the scalar registers a0-a15.  The scalar block uses the block counter as
 * found in the state; the four NEON blocks use counter + 1 .. counter + 4
 * (see CTRINC).  The scalar instructions are indented by two extra columns.
 *
 * x0: Input state matrix, s (only read; x0 itself is advanced)
 * x1: output, o
 * x2: input, i
 * w3: nrounds (must be even and non-zero: the loop counts down by 2 to zero)
 * x4: byte count
 *
 * The first 64 bytes of input and output (the scalar block) are always
 * accessed in full.  When the byte count ends inside one of the NEON blocks,
 * the .Lt128/.Lt192/.Lt256/.Lt320 paths handle the final, possibly partial,
 * block: its input is read from the last 64 bytes of i, the keystream is
 * shifted with TBL using indices taken from .Lpermute, and the result is
 * written with a 64-byte store that ends exactly at o + byte count and
 * overlaps the preceding block, which is then stored again on top.
 *
 * NOTE(review): callers are presumably expected to pass more than 64 and at
 * most 320 bytes -- confirm against the C glue code.
 */
        .align          6
SYM_FUNC_START(chacha_4block_xor_neon)
        // NOTE(review): presumably saves x29/x30 plus x19-x28 (a6-a15);
        // confirm against the frame_push definition in asm/assembler.h.
        frame_push      10

        // x0: Input state matrix, s
        // x1: 4 data blocks output, o
        // x2: 4 data blocks input, i
        // w3: nrounds
        // x4: byte count

        // x10 = &.Lpermute[byte count % 64], only used by the tail paths
        adr_l           x10, .Lpermute
        and             x5, x4, #63
        add             x10, x10, x5

        //
        // This function encrypts four consecutive ChaCha blocks by loading
        // the state matrix in NEON registers four times. The algorithm performs
        // each operation on the corresponding word of each state matrix, hence
        // requires no word shuffling. For final XORing step we transpose the
        // matrix by interleaving 32- and then 64-bit words, which allows us to
        // do XOR in NEON registers.
        //
        // At the same time, a fifth block is encrypted in parallel using
        // scalar registers
        //
        adr_l           x9, CTRINC              // ... and ROT8
        ld1             {v30.4s-v31.4s}, [x9]   // v30 = CTRINC, v31 = ROT8

        // x0..15[0-3] = s0..3[0..3]
        add             x8, x0, #16
        ld4r            { v0.4s- v3.4s}, [x0]
        ld4r            { v4.4s- v7.4s}, [x8], #16
        ld4r            { v8.4s-v11.4s}, [x8], #16
        ld4r            {v12.4s-v15.4s}, [x8]

        // a0..a15 = s[0..15] for the scalar block (counter left unchanged)
        mov             a0, v0.s[0]
        mov             a1, v1.s[0]
        mov             a2, v2.s[0]
        mov             a3, v3.s[0]
        mov             a4, v4.s[0]
        mov             a5, v5.s[0]
        mov             a6, v6.s[0]
        mov             a7, v7.s[0]
        mov             a8, v8.s[0]
        mov             a9, v9.s[0]
        mov             a10, v10.s[0]
        mov             a11, v11.s[0]
        mov             a12, v12.s[0]
        mov             a13, v13.s[0]
        mov             a14, v14.s[0]
        mov             a15, v15.s[0]

        // x12 += counter values 1-4
        add             v12.4s, v12.4s, v30.4s

.Ldoubleround4:
        // x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        add             v0.4s, v0.4s, v4.4s
          add           a0, a0, a4
        add             v1.4s, v1.4s, v5.4s
          add           a1, a1, a5
        add             v2.4s, v2.4s, v6.4s
          add           a2, a2, a6
        add             v3.4s, v3.4s, v7.4s
          add           a3, a3, a7

        eor             v12.16b, v12.16b, v0.16b
          eor           a12, a12, a0
        eor             v13.16b, v13.16b, v1.16b
          eor           a13, a13, a1
        eor             v14.16b, v14.16b, v2.16b
          eor           a14, a14, a2
        eor             v15.16b, v15.16b, v3.16b
          eor           a15, a15, a3

        rev32           v12.8h, v12.8h
          ror           a12, a12, #16
        rev32           v13.8h, v13.8h
          ror           a13, a13, #16
        rev32           v14.8h, v14.8h
          ror           a14, a14, #16
        rev32           v15.8h, v15.8h
          ror           a15, a15, #16

        // x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        add             v8.4s, v8.4s, v12.4s
          add           a8, a8, a12
        add             v9.4s, v9.4s, v13.4s
          add           a9, a9, a13
        add             v10.4s, v10.4s, v14.4s
          add           a10, a10, a14
        add             v11.4s, v11.4s, v15.4s
          add           a11, a11, a15

        eor             v16.16b, v4.16b, v8.16b
          eor           a4, a4, a8
        eor             v17.16b, v5.16b, v9.16b
          eor           a5, a5, a9
        eor             v18.16b, v6.16b, v10.16b
          eor           a6, a6, a10
        eor             v19.16b, v7.16b, v11.16b
          eor           a7, a7, a11

        shl             v4.4s, v16.4s, #12
        shl             v5.4s, v17.4s, #12
        shl             v6.4s, v18.4s, #12
        shl             v7.4s, v19.4s, #12

        // scalar side: ror by 20 == rotl32 by 12
        sri             v4.4s, v16.4s, #20
          ror           a4, a4, #20
        sri             v5.4s, v17.4s, #20
          ror           a5, a5, #20
        sri             v6.4s, v18.4s, #20
          ror           a6, a6, #20
        sri             v7.4s, v19.4s, #20
          ror           a7, a7, #20

        // x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        // x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        // x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        // x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        add             v0.4s, v0.4s, v4.4s
          add           a0, a0, a4
        add             v1.4s, v1.4s, v5.4s
          add           a1, a1, a5
        add             v2.4s, v2.4s, v6.4s
          add           a2, a2, a6
        add             v3.4s, v3.4s, v7.4s
          add           a3, a3, a7

        eor             v12.16b, v12.16b, v0.16b
          eor           a12, a12, a0
        eor             v13.16b, v13.16b, v1.16b
          eor           a13, a13, a1
        eor             v14.16b, v14.16b, v2.16b
          eor           a14, a14, a2
        eor             v15.16b, v15.16b, v3.16b
          eor           a15, a15, a3

        // v31 = ROT8 byte permutation; scalar side: ror by 24 == rotl32 by 8
        tbl             v12.16b, {v12.16b}, v31.16b
          ror           a12, a12, #24
        tbl             v13.16b, {v13.16b}, v31.16b
          ror           a13, a13, #24
        tbl             v14.16b, {v14.16b}, v31.16b
          ror           a14, a14, #24
        tbl             v15.16b, {v15.16b}, v31.16b
          ror           a15, a15, #24

        // x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        // x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        // x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        // x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        add             v8.4s, v8.4s, v12.4s
          add           a8, a8, a12
        add             v9.4s, v9.4s, v13.4s
          add           a9, a9, a13
        add             v10.4s, v10.4s, v14.4s
          add           a10, a10, a14
        add             v11.4s, v11.4s, v15.4s
          add           a11, a11, a15

        eor             v16.16b, v4.16b, v8.16b
          eor           a4, a4, a8
        eor             v17.16b, v5.16b, v9.16b
          eor           a5, a5, a9
        eor             v18.16b, v6.16b, v10.16b
          eor           a6, a6, a10
        eor             v19.16b, v7.16b, v11.16b
          eor           a7, a7, a11

        shl             v4.4s, v16.4s, #7
        shl             v5.4s, v17.4s, #7
        shl             v6.4s, v18.4s, #7
        shl             v7.4s, v19.4s, #7

        // scalar side: ror by 25 == rotl32 by 7
        sri             v4.4s, v16.4s, #25
          ror           a4, a4, #25
        sri             v5.4s, v17.4s, #25
          ror           a5, a5, #25
        sri             v6.4s, v18.4s, #25
          ror           a6, a6, #25
        sri             v7.4s, v19.4s, #25
          ror           a7, a7, #25

        // Diagonal round: same operations, with the diagonals selected by
        // register choice instead of by shuffling.

        // x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        add             v0.4s, v0.4s, v5.4s
          add           a0, a0, a5
        add             v1.4s, v1.4s, v6.4s
          add           a1, a1, a6
        add             v2.4s, v2.4s, v7.4s
          add           a2, a2, a7
        add             v3.4s, v3.4s, v4.4s
          add           a3, a3, a4

        eor             v15.16b, v15.16b, v0.16b
          eor           a15, a15, a0
        eor             v12.16b, v12.16b, v1.16b
          eor           a12, a12, a1
        eor             v13.16b, v13.16b, v2.16b
          eor           a13, a13, a2
        eor             v14.16b, v14.16b, v3.16b
          eor           a14, a14, a3

        rev32           v15.8h, v15.8h
          ror           a15, a15, #16
        rev32           v12.8h, v12.8h
          ror           a12, a12, #16
        rev32           v13.8h, v13.8h
          ror           a13, a13, #16
        rev32           v14.8h, v14.8h
          ror           a14, a14, #16

        // x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        add             v10.4s, v10.4s, v15.4s
          add           a10, a10, a15
        add             v11.4s, v11.4s, v12.4s
          add           a11, a11, a12
        add             v8.4s, v8.4s, v13.4s
          add           a8, a8, a13
        add             v9.4s, v9.4s, v14.4s
          add           a9, a9, a14

        eor             v16.16b, v5.16b, v10.16b
          eor           a5, a5, a10
        eor             v17.16b, v6.16b, v11.16b
          eor           a6, a6, a11
        eor             v18.16b, v7.16b, v8.16b
          eor           a7, a7, a8
        eor             v19.16b, v4.16b, v9.16b
          eor           a4, a4, a9

        shl             v5.4s, v16.4s, #12
        shl             v6.4s, v17.4s, #12
        shl             v7.4s, v18.4s, #12
        shl             v4.4s, v19.4s, #12

        sri             v5.4s, v16.4s, #20
          ror           a5, a5, #20
        sri             v6.4s, v17.4s, #20
          ror           a6, a6, #20
        sri             v7.4s, v18.4s, #20
          ror           a7, a7, #20
        sri             v4.4s, v19.4s, #20
          ror           a4, a4, #20

        // x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        // x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        // x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        // x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        add             v0.4s, v0.4s, v5.4s
          add           a0, a0, a5
        add             v1.4s, v1.4s, v6.4s
          add           a1, a1, a6
        add             v2.4s, v2.4s, v7.4s
          add           a2, a2, a7
        add             v3.4s, v3.4s, v4.4s
          add           a3, a3, a4

        eor             v15.16b, v15.16b, v0.16b
          eor           a15, a15, a0
        eor             v12.16b, v12.16b, v1.16b
          eor           a12, a12, a1
        eor             v13.16b, v13.16b, v2.16b
          eor           a13, a13, a2
        eor             v14.16b, v14.16b, v3.16b
          eor           a14, a14, a3

        tbl             v15.16b, {v15.16b}, v31.16b
          ror           a15, a15, #24
        tbl             v12.16b, {v12.16b}, v31.16b
          ror           a12, a12, #24
        tbl             v13.16b, {v13.16b}, v31.16b
          ror           a13, a13, #24
        tbl             v14.16b, {v14.16b}, v31.16b
          ror           a14, a14, #24

        // x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        // x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        // x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        // x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        add             v10.4s, v10.4s, v15.4s
          add           a10, a10, a15
        add             v11.4s, v11.4s, v12.4s
          add           a11, a11, a12
        add             v8.4s, v8.4s, v13.4s
          add           a8, a8, a13
        add             v9.4s, v9.4s, v14.4s
          add           a9, a9, a14

        eor             v16.16b, v5.16b, v10.16b
          eor           a5, a5, a10
        eor             v17.16b, v6.16b, v11.16b
          eor           a6, a6, a11
        eor             v18.16b, v7.16b, v8.16b
          eor           a7, a7, a8
        eor             v19.16b, v4.16b, v9.16b
          eor           a4, a4, a9

        shl             v5.4s, v16.4s, #7
        shl             v6.4s, v17.4s, #7
        shl             v7.4s, v18.4s, #7
        shl             v4.4s, v19.4s, #7

        sri             v5.4s, v16.4s, #25
          ror           a5, a5, #25
        sri             v6.4s, v17.4s, #25
          ror           a6, a6, #25
        sri             v7.4s, v18.4s, #25
          ror           a7, a7, #25
        sri             v4.4s, v19.4s, #25
          ror           a4, a4, #25

        subs            w3, w3, #2              // two rounds done per iteration
        b.ne            .Ldoubleround4

        // Feed-forward: add the original state (reloaded from x0) to the
        // permuted state, for the NEON blocks and the scalar block alike.
        ld4r            {v16.4s-v19.4s}, [x0], #16
        ld4r            {v20.4s-v23.4s}, [x0], #16

        // x12 += counter values 1-4
        // (the same CTRINC offsets that were added before the rounds)
        add             v12.4s, v12.4s, v30.4s

        // x0[0-3] += s0[0]
        // x1[0-3] += s0[1]
        // x2[0-3] += s0[2]
        // x3[0-3] += s0[3]
        add             v0.4s, v0.4s, v16.4s
          mov           w6, v16.s[0]
          mov           w7, v17.s[0]
        add             v1.4s, v1.4s, v17.4s
          mov           w8, v18.s[0]
          mov           w9, v19.s[0]
        add             v2.4s, v2.4s, v18.4s
          add           a0, a0, w6
          add           a1, a1, w7
        add             v3.4s, v3.4s, v19.4s
          add           a2, a2, w8
          add           a3, a3, w9
CPU_BE(   rev           a0, a0          )
CPU_BE(   rev           a1, a1          )
CPU_BE(   rev           a2, a2          )
CPU_BE(   rev           a3, a3          )

        // The second load overwrites v30/v31 (CTRINC/ROT8); both are dead
        // from here on.
        ld4r            {v24.4s-v27.4s}, [x0], #16
        ld4r            {v28.4s-v31.4s}, [x0]

        // x4[0-3] += s1[0]
        // x5[0-3] += s1[1]
        // x6[0-3] += s1[2]
        // x7[0-3] += s1[3]
        add             v4.4s, v4.4s, v20.4s
          mov           w6, v20.s[0]
          mov           w7, v21.s[0]
        add             v5.4s, v5.4s, v21.4s
          mov           w8, v22.s[0]
          mov           w9, v23.s[0]
        add             v6.4s, v6.4s, v22.4s
          add           a4, a4, w6
          add           a5, a5, w7
        add             v7.4s, v7.4s, v23.4s
          add           a6, a6, w8
          add           a7, a7, w9
CPU_BE(   rev           a4, a4          )
CPU_BE(   rev           a5, a5          )
CPU_BE(   rev           a6, a6          )
CPU_BE(   rev           a7, a7          )

        // x8[0-3] += s2[0]
        // x9[0-3] += s2[1]
        // x10[0-3] += s2[2]
        // x11[0-3] += s2[3]
        add             v8.4s, v8.4s, v24.4s
          mov           w6, v24.s[0]
          mov           w7, v25.s[0]
        add             v9.4s, v9.4s, v25.4s
          mov           w8, v26.s[0]
          mov           w9, v27.s[0]
        add             v10.4s, v10.4s, v26.4s
          add           a8, a8, w6
          add           a9, a9, w7
        add             v11.4s, v11.4s, v27.4s
          add           a10, a10, w8
          add           a11, a11, w9
CPU_BE(   rev           a8, a8          )
CPU_BE(   rev           a9, a9          )
CPU_BE(   rev           a10, a10        )
CPU_BE(   rev           a11, a11        )

        // x12[0-3] += s3[0]
        // x13[0-3] += s3[1]
        // x14[0-3] += s3[2]
        // x15[0-3] += s3[3]
        add             v12.4s, v12.4s, v28.4s
          mov           w6, v28.s[0]
          mov           w7, v29.s[0]
        add             v13.4s, v13.4s, v29.4s
          mov           w8, v30.s[0]
          mov           w9, v31.s[0]
        add             v14.4s, v14.4s, v30.4s
          add           a12, a12, w6
          add           a13, a13, w7
        add             v15.4s, v15.4s, v31.4s
          add           a14, a14, w8
          add           a15, a15, w9
CPU_BE(   rev           a12, a12        )
CPU_BE(   rev           a13, a13        )
CPU_BE(   rev           a14, a14        )
CPU_BE(   rev           a15, a15        )

        // interleave 32-bit words in state n, n+1
        // (scalar side: XOR the first 64 input bytes into a0-a15; x2 is
        // advanced by 64 up front, hence the negative offsets)
          ldp           w6, w7, [x2], #64
        zip1            v16.4s, v0.4s, v1.4s
          ldp           w8, w9, [x2, #-56]
          eor           a0, a0, w6
        zip2            v17.4s, v0.4s, v1.4s
          eor           a1, a1, w7
        zip1            v18.4s, v2.4s, v3.4s
          eor           a2, a2, w8
        zip2            v19.4s, v2.4s, v3.4s
          eor           a3, a3, w9
          ldp           w6, w7, [x2, #-48]
        zip1            v20.4s, v4.4s, v5.4s
          ldp           w8, w9, [x2, #-40]
          eor           a4, a4, w6
        zip2            v21.4s, v4.4s, v5.4s
          eor           a5, a5, w7
        zip1            v22.4s, v6.4s, v7.4s
          eor           a6, a6, w8
        zip2            v23.4s, v6.4s, v7.4s
          eor           a7, a7, w9
          ldp           w6, w7, [x2, #-32]
        zip1            v24.4s, v8.4s, v9.4s
          ldp           w8, w9, [x2, #-24]
          eor           a8, a8, w6
        zip2            v25.4s, v8.4s, v9.4s
          eor           a9, a9, w7
        zip1            v26.4s, v10.4s, v11.4s
          eor           a10, a10, w8
        zip2            v27.4s, v10.4s, v11.4s
          eor           a11, a11, w9
          ldp           w6, w7, [x2, #-16]
        zip1            v28.4s, v12.4s, v13.4s
          ldp           w8, w9, [x2, #-8]
          eor           a12, a12, w6
        zip2            v29.4s, v12.4s, v13.4s
          eor           a13, a13, w7
        zip1            v30.4s, v14.4s, v15.4s
          eor           a14, a14, w8
        zip2            v31.4s, v14.4s, v15.4s
          eor           a15, a15, w9

        // x2 already points 64 bytes past i, so x3 = i + byte count - 64
        add             x3, x2, x4
        sub             x3, x3, #128            // start of last block

        // Each subs/csel pair below redirects the next 64-byte input load
        // to the last block (x3) when the byte count ends before the end
        // of that load.  x5-x8 keep (byte count - 128/192/256/320); their
        // sign bits select the tail path further down.
        subs            x5, x4, #128
        csel            x2, x2, x3, ge

        // interleave 64-bit words in state n, n+2
        // (scalar side: store the first 64 output bytes; x1 is advanced
        // by 64 up front, hence the negative offsets)
        zip1            v0.2d, v16.2d, v18.2d
        zip2            v4.2d, v16.2d, v18.2d
          stp           a0, a1, [x1], #64
        zip1            v8.2d, v17.2d, v19.2d
        zip2            v12.2d, v17.2d, v19.2d
          stp           a2, a3, [x1, #-56]

        subs            x6, x4, #192
        ld1             {v16.16b-v19.16b}, [x2], #64
        csel            x2, x2, x3, ge

        zip1            v1.2d, v20.2d, v22.2d
        zip2            v5.2d, v20.2d, v22.2d
          stp           a4, a5, [x1, #-48]
        zip1            v9.2d, v21.2d, v23.2d
        zip2            v13.2d, v21.2d, v23.2d
          stp           a6, a7, [x1, #-40]

        subs            x7, x4, #256
        ld1             {v20.16b-v23.16b}, [x2], #64
        csel            x2, x2, x3, ge

        zip1            v2.2d, v24.2d, v26.2d
        zip2            v6.2d, v24.2d, v26.2d
          stp           a8, a9, [x1, #-32]
        zip1            v10.2d, v25.2d, v27.2d
        zip2            v14.2d, v25.2d, v27.2d
          stp           a10, a11, [x1, #-24]

        subs            x8, x4, #320
        ld1             {v24.16b-v27.16b}, [x2], #64
        csel            x2, x2, x3, ge

        zip1            v3.2d, v28.2d, v30.2d
        zip2            v7.2d, v28.2d, v30.2d
          stp           a12, a13, [x1, #-16]
        zip1            v11.2d, v29.2d, v31.2d
        zip2            v15.2d, v29.2d, v31.2d
          stp           a14, a15, [x1, #-8]

        // After the transpose, keystream for the four NEON blocks is in
        // v0-v3, v4-v7, v8-v11 and v12-v15 respectively.

        tbnz            x5, #63, .Lt128         // byte count < 128
        ld1             {v28.16b-v31.16b}, [x2]

        // xor with corresponding input, write to output
        eor             v16.16b, v16.16b, v0.16b
        eor             v17.16b, v17.16b, v1.16b
        eor             v18.16b, v18.16b, v2.16b
        eor             v19.16b, v19.16b, v3.16b

        tbnz            x6, #63, .Lt192         // byte count < 192

        eor             v20.16b, v20.16b, v4.16b
        eor             v21.16b, v21.16b, v5.16b
        eor             v22.16b, v22.16b, v6.16b
        eor             v23.16b, v23.16b, v7.16b

        st1             {v16.16b-v19.16b}, [x1], #64
        tbnz            x7, #63, .Lt256         // byte count < 256

        eor             v24.16b, v24.16b, v8.16b
        eor             v25.16b, v25.16b, v9.16b
        eor             v26.16b, v26.16b, v10.16b
        eor             v27.16b, v27.16b, v11.16b

        st1             {v20.16b-v23.16b}, [x1], #64
        tbnz            x8, #63, .Lt320         // byte count < 320

        eor             v28.16b, v28.16b, v12.16b
        eor             v29.16b, v29.16b, v13.16b
        eor             v30.16b, v30.16b, v14.16b
        eor             v31.16b, v31.16b, v15.16b

        st1             {v24.16b-v27.16b}, [x1], #64
        st1             {v28.16b-v31.16b}, [x1]

.Lout:  frame_pop
        ret

        // Tail paths.  In each of them the final block is written first, at
        // (o + byte count - 64), and the preceding full block is stored
        // afterwards so that it overwrites the overlapping bytes.

        // fewer than 192 bytes of in/output
.Lt192: cbz             x5, 1f                          // exactly 128 bytes?
        ld1             {v28.16b-v31.16b}, [x10]
        add             x5, x5, x1                      // x5 = o + byte count - 64
        tbl             v28.16b, {v4.16b-v7.16b}, v28.16b
        tbl             v29.16b, {v4.16b-v7.16b}, v29.16b
        tbl             v30.16b, {v4.16b-v7.16b}, v30.16b
        tbl             v31.16b, {v4.16b-v7.16b}, v31.16b

0:      eor             v20.16b, v20.16b, v28.16b
        eor             v21.16b, v21.16b, v29.16b
        eor             v22.16b, v22.16b, v30.16b
        eor             v23.16b, v23.16b, v31.16b
        st1             {v20.16b-v23.16b}, [x5]         // overlapping stores
1:      st1             {v16.16b-v19.16b}, [x1]
        b               .Lout

        // fewer than 128 bytes of in/output
.Lt128: ld1             {v28.16b-v31.16b}, [x10]
        add             x5, x5, x1                      // x5 = o + byte count - 64
        sub             x1, x1, #64                     // back to the scalar block at o
        tbl             v28.16b, {v0.16b-v3.16b}, v28.16b
        tbl             v29.16b, {v0.16b-v3.16b}, v29.16b
        tbl             v30.16b, {v0.16b-v3.16b}, v30.16b
        tbl             v31.16b, {v0.16b-v3.16b}, v31.16b
        ld1             {v16.16b-v19.16b}, [x1]         // reload first output block
        b               0b

        // fewer than 256 bytes of in/output
.Lt256: cbz             x6, 2f                          // exactly 192 bytes?
        ld1             {v4.16b-v7.16b}, [x10]
        add             x6, x6, x1                      // x6 = o + byte count - 64
        tbl             v0.16b, {v8.16b-v11.16b}, v4.16b
        tbl             v1.16b, {v8.16b-v11.16b}, v5.16b
        tbl             v2.16b, {v8.16b-v11.16b}, v6.16b
        tbl             v3.16b, {v8.16b-v11.16b}, v7.16b

        eor             v28.16b, v28.16b, v0.16b
        eor             v29.16b, v29.16b, v1.16b
        eor             v30.16b, v30.16b, v2.16b
        eor             v31.16b, v31.16b, v3.16b
        st1             {v28.16b-v31.16b}, [x6]         // overlapping stores
2:      st1             {v20.16b-v23.16b}, [x1]
        b               .Lout

        // fewer than 320 bytes of in/output
.Lt320: cbz             x7, 3f                          // exactly 256 bytes?
        ld1             {v4.16b-v7.16b}, [x10]
        add             x7, x7, x1                      // x7 = o + byte count - 64
        tbl             v0.16b, {v12.16b-v15.16b}, v4.16b
        tbl             v1.16b, {v12.16b-v15.16b}, v5.16b
        tbl             v2.16b, {v12.16b-v15.16b}, v6.16b
        tbl             v3.16b, {v12.16b-v15.16b}, v7.16b

        eor             v28.16b, v28.16b, v0.16b
        eor             v29.16b, v29.16b, v1.16b
        eor             v30.16b, v30.16b, v2.16b
        eor             v31.16b, v31.16b, v3.16b
        st1             {v28.16b-v31.16b}, [x7]         // overlapping stores
3:      st1             {v24.16b-v27.16b}, [x1]
        b               .Lout
SYM_FUNC_END(chacha_4block_xor_neon)
/*
 * Constant tables.
 *
 * .Lpermute: 128 bytes holding the values -64 .. 63.  chacha_4block_xor_neon
 *            loads 64 consecutive bytes starting at offset (byte count % 64)
 *            and uses them as TBL indices into a 64-byte keystream block.
 *            Index bytes outside 0 .. 63 select no table entry, for which
 *            TBL writes zero, so the lookup shifts the keystream to line up
 *            with the final, overlapping 64-byte store.
 * CTRINC:    block counter offsets for the four NEON lanes.
 * ROT8:      TBL byte indices that rotate every 32-bit word left by 8 bits.
 *
 * CTRINC and ROT8 must stay adjacent and in this order:
 * chacha_4block_xor_neon fetches both with one two-register ld1 from CTRINC.
 */
        .section        ".rodata", "a", %progbits
        .align          L1_CACHE_SHIFT
.Lpermute:
        .set            .Li, 0
        .rept           128
        .byte           (.Li - 64)
        .set            .Li, .Li + 1
        .endr

CTRINC: .word           1, 2, 3, 4
ROT8:   .word           0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f