arch/arm64/kernel/vdso/vgetrandom-chacha.S

// SPDX-License-Identifier: GPL-2.0

#include <linux/linkage.h>
#include <asm/cache.h>
#include <asm/assembler.h>

        .text

#define state0          v0
#define state1          v1
#define state2          v2
#define state3          v3
#define copy0           v4
#define copy0_q         q4
#define copy1           v5
#define copy2           v6
#define copy3           v7
#define copy3_d         d7
#define one_d           d16
#define one_q           q16
#define one_v           v16
#define tmp             v17
#define rot8            v18

/*
 * ARM64 ChaCha20 implementation meant for vDSO.  Produces a given positive
 * number of blocks of output with nonce 0, taking an input key and 8-byte
 * counter.  Importantly does not spill to the stack.
 *
 * This implementation avoids d8-d15 because they are callee-saved in user
 * space.
 *
 * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
 *                                     const uint8_t *key,
 *                                     uint32_t *counter,
 *                                     size_t nblocks)
 *
 *      x0: output bytes
 *      x1: 32-byte key input
 *      x2: 8-byte counter input/output
 *      x3: number of 64-byte blocks to write to output
 */
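
/*
 * For illustration only, a minimal C sketch of a caller (the 'buf', 'key'
 * and 'counter' names here are hypothetical and not part of the kernel
 * sources; the real caller is the generic vDSO getrandom code):
 *
 *      uint8_t buf[2 * 64];                    // room for two 64-byte blocks
 *      uint8_t key[32] = { ... };              // ChaCha20 key
 *      uint32_t counter[2] = { 0, 0 };         // 8-byte block counter
 *
 *      __arch_chacha20_blocks_nostack(buf, key, counter, 2);
 *      // buf now holds 128 bytes of ChaCha20 output; counter[] has been
 *      // advanced by 2 so the next call continues the keystream.
 */
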
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

        /* copy0 = "expand 32-byte k" */
        mov_q           x8, 0x3320646e61707865
        mov_q           x9, 0x6b20657479622d32
        mov             copy0.d[0], x8
        mov             copy0.d[1], x9
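        /*
         * copy0 now holds the four standard ChaCha constant words
         * 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, i.e. the
         * little-endian encoding of "expand 32-byte k".
         */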

        /* copy1,copy2 = key */
        ld1             { copy1.4s, copy2.4s }, [x1]
        /* copy3 = counter || zero nonce */
        ld1             { copy3.2s }, [x2]

        movi            one_v.2s, #1
        uzp1            one_v.4s, one_v.4s, one_v.4s
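        /*
         * one_v is now { 1, 0, 1, 0 } as 32-bit lanes: movi sets the two low
         * lanes to 1 and uzp1 keeps only the even-indexed lanes, so one_d is
         * the 64-bit value 1, ready for incrementing the counter.
         */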

.Lblock:
        /* Copy state to auxiliary vectors for the final add after the permute. */
        mov             state0.16b, copy0.16b
        mov             state1.16b, copy1.16b
        mov             state2.16b, copy2.16b
        mov             state3.16b, copy3.16b
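        /*
         * state0-state3 now hold the 4x4 ChaCha state matrix, one row per
         * register: constants, first key half, second key half, and the
         * 64-bit block counter followed by the all-zero nonce.
         */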

        mov             w4, 20
.Lpermute:
        /*
         * Permute one 64-byte block where the state matrix is stored in the four NEON
         * registers state0-state3.  It performs matrix operations on four words in parallel,
         * but requires shuffling to rearrange the words after each round.
         */
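        /*
         * For reference, one scalar ChaCha quarter round is:
         *
         *      a += b; d ^= a; d = rol32(d, 16);
         *      c += d; b ^= c; b = rol32(b, 12);
         *      a += b; d ^= a; d = rol32(d, 8);
         *      c += d; b ^= c; b = rol32(b, 7);
         *
         * Here a/b/c/d are whole vector registers (state0-state3), so each
         * group of instructions below computes four quarter rounds at once.
         * w4 counts rounds: each .Ldoubleround pass is one column round plus
         * one diagonal round, giving 10 passes for the 20 ChaCha20 rounds.
         */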

.Ldoubleround:
        /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
        add             state0.4s, state0.4s, state1.4s
        eor             state3.16b, state3.16b, state0.16b
        rev32           state3.8h, state3.8h
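        /*
         * NEON has no 32-bit vector rotate: the 16-bit rotation above is a
         * half-word swap (rev32 on .8h lanes), and the 12-, 8- and 7-bit
         * rotations below are built from a 'shl' of the low bits plus an
         * 'sri' that shifts in the high bits.
         */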

        /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
        add             state2.4s, state2.4s, state3.4s
        eor             tmp.16b, state1.16b, state2.16b
        shl             state1.4s, tmp.4s, #12
        sri             state1.4s, tmp.4s, #20

        /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
        add             state0.4s, state0.4s, state1.4s
        eor             tmp.16b, state3.16b, state0.16b
        shl             state3.4s, tmp.4s, #8
        sri             state3.4s, tmp.4s, #24

        /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
        add             state2.4s, state2.4s, state3.4s
        eor             tmp.16b, state1.16b, state2.16b
        shl             state1.4s, tmp.4s, #7
        sri             state1.4s, tmp.4s, #25
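
        /*
         * Rotate the rows so that the next four quarter rounds operate on
         * the diagonals of the state matrix rather than on its columns.
         */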
        /* state1[0,1,2,3] = state1[1,2,3,0] */
        ext             state1.16b, state1.16b, state1.16b, #4
        /* state2[0,1,2,3] = state2[2,3,0,1] */
        ext             state2.16b, state2.16b, state2.16b, #8
        /* state3[0,1,2,3] = state3[3,0,1,2] */
        ext             state3.16b, state3.16b, state3.16b, #12

        /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
        add             state0.4s, state0.4s, state1.4s
        eor             state3.16b, state3.16b, state0.16b
        rev32           state3.8h, state3.8h

        /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
        add             state2.4s, state2.4s, state3.4s
        eor             tmp.16b, state1.16b, state2.16b
        shl             state1.4s, tmp.4s, #12
        sri             state1.4s, tmp.4s, #20

        /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
        add             state0.4s, state0.4s, state1.4s
        eor             tmp.16b, state3.16b, state0.16b
        shl             state3.4s, tmp.4s, #8
        sri             state3.4s, tmp.4s, #24

        /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
        add             state2.4s, state2.4s, state3.4s
        eor             tmp.16b, state1.16b, state2.16b
        shl             state1.4s, tmp.4s, #7
        sri             state1.4s, tmp.4s, #25
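
        /*
         * Undo the diagonalisation: rotate the rows back into column order
         * for the next double round and for the final add.
         */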
        /* state1[0,1,2,3] = state1[3,0,1,2] */
        ext             state1.16b, state1.16b, state1.16b, #12
        /* state2[0,1,2,3] = state2[2,3,0,1] */
        ext             state2.16b, state2.16b, state2.16b, #8
        /* state3[0,1,2,3] = state3[1,2,3,0] */
        ext             state3.16b, state3.16b, state3.16b, #4

        subs            w4, w4, #2
        b.ne            .Ldoubleround

        /* output0 = state0 + copy0 */
        add             state0.4s, state0.4s, copy0.4s
        /* output1 = state1 + copy1 */
        add             state1.4s, state1.4s, copy1.4s
        /* output2 = state2 + copy2 */
        add             state2.4s, state2.4s, copy2.4s
        /* output3 = state3 + copy3 */
        add             state3.4s, state3.4s, copy3.4s
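        /* Store the 64-byte output block. */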
        st1             { state0.16b - state3.16b }, [x0]

        /*
         * ++copy3.counter; the scalar 'add' clears the upper half of the
         * SIMD register, which is the expected behaviour here: that half
         * holds the nonce, which stays zero.
         */
        add             copy3_d, copy3_d, one_d

        /* output += 64, --nblocks */
        add             x0, x0, 64
        subs            x3, x3, #1
        b.ne            .Lblock

        /* counter = copy3.counter */
        st1             { copy3.2s }, [x2]

        /*
         * Zero out the potentially sensitive regs, in case they are read
         * again before being overwritten.
         */
        movi            state0.16b, #0
        movi            state1.16b, #0
        movi            state2.16b, #0
        movi            state3.16b, #0
        movi            copy1.16b, #0
        movi            copy2.16b, #0
        ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)
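
/*
 * Emit the GNU property note (AArch64 feature 1 AND, i.e. the BTI/PAC
 * markings); assembly files have to emit it explicitly so that the linker
 * can mark the final vDSO object accordingly.
 */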
emit_aarch64_feature_1_and