// SPDX-License-Identifier: GPL-2.0

#include <linux/linkage.h>
#include <asm/assembler.h>
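
	.text

/*
 * Register aliases. The only hard constraint (see the comment below) is to
 * stay out of d8-d15, which are callee-saved in userspace; the concrete
 * assignment here is one valid choice under that constraint.
 */
#define state0		v0
#define state1		v1
#define state2		v2
#define state3		v3
#define copy0		v4
#define copy1		v5
#define copy2		v6
#define copy3		v7
#define copy3_d		d7
#define one_d		d16
#define one_v		v16
#define tmp		v17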

/*
 * ARM64 ChaCha20 implementation meant for vDSO. Produces a given positive
 * number of blocks of output with nonce 0, taking an input key and 8-byte
 * counter. Importantly does not spill to the stack.
 *
 * This implementation avoids d8-d15 because they are callee-saved in user
 * space.
 *
 * void __arch_chacha20_blocks_nostack(uint8_t *dst_bytes,
 *				       const uint8_t *key,
 *				       uint32_t *counter,
 *				       size_t nblocks)
 *
 *	x0: output bytes
 *	x1: 32-byte key input
 *	x2: 8-byte counter input/output
 *	x3: number of 64-byte blocks to write to output
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

	/* copy0 = "expand 32-byte k" */
	mov_q		x8, 0x3320646e61707865
	mov_q		x9, 0x6b20657479622d32
	mov		copy0.d[0], x8
	mov		copy0.d[1], x9
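	/*
	 * The two immediates are the ChaCha constant "expand 32-byte k" read
	 * as little-endian 64-bit words ("expand 3" and "2-byte k").
	 */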

	/* copy1,copy2 = key */
	ld1		{ copy1.4s, copy2.4s }, [x1]
	/* copy3 = counter || zero nonce */
	ld1		{ copy3.2s }, [x2]

	movi		one_v.2s, #1
	uzp1		one_v.4s, one_v.4s, one_v.4s
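	/*
	 * one_v is now { 1, 0, 1, 0 }: movi set the two low 32-bit lanes to 1
	 * (clearing the upper half of the register), and uzp1 interleaved the
	 * even lanes, so one_d holds the 64-bit value 1 for the counter
	 * increment at the bottom of the block loop.
	 */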

.Lblock:
	/* copy state to auxiliary vectors for the final add after the permute. */
	mov		state0.16b, copy0.16b
	mov		state1.16b, copy1.16b
	mov		state2.16b, copy2.16b
	mov		state3.16b, copy3.16b

	/* 20 rounds, two per pass of the loop below (any scratch reg works; w4 here). */
	mov		w4, 20
.Lpermute:
	/*
	 * Permute one 64-byte block where the state matrix is stored in the
	 * four NEON registers state0-state3. It performs matrix operations
	 * on four words in parallel, but requires shuffling to rearrange the
	 * words after each round.
	 */

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	add		state0.4s, state0.4s, state1.4s
	eor		state3.16b, state3.16b, state0.16b
	rev32		state3.8h, state3.8h
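	/*
	 * rev32 on 16-bit lanes swaps the two halves of every 32-bit word,
	 * which is exactly rotl32(x, 16) and needs no temporary register.
	 */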

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	add		state2.4s, state2.4s, state3.4s
	eor		tmp.16b, state1.16b, state2.16b
	shl		state1.4s, tmp.4s, #12
	sri		state1.4s, tmp.4s, #20
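	/*
	 * The shl/sri pair builds the rotate from two shifts: shl writes
	 * tmp << 12 with zeroed low bits, then sri (shift right and insert)
	 * merges in tmp >> 20, giving rotl32(tmp, 12). The 8- and 7-bit
	 * rotations below use the same idiom.
	 */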

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	add		state0.4s, state0.4s, state1.4s
	eor		tmp.16b, state3.16b, state0.16b
	shl		state3.4s, tmp.4s, #8
	sri		state3.4s, tmp.4s, #24

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	add		state2.4s, state2.4s, state3.4s
	eor		tmp.16b, state1.16b, state2.16b
	shl		state1.4s, tmp.4s, #7
	sri		state1.4s, tmp.4s, #25

	/* state1[0,1,2,3] = state1[1,2,3,0] */
	ext		state1.16b, state1.16b, state1.16b, #4
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	ext		state2.16b, state2.16b, state2.16b, #8
	/* state3[0,1,2,3] = state3[3,0,1,2] */
	ext		state3.16b, state3.16b, state3.16b, #12
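	/*
	 * ext with the same register as both sources is a lane rotation (it
	 * extracts 16 bytes from the doubled register at the given byte
	 * offset). Rotating rows 1, 2 and 3 by one, two and three lanes lines
	 * the diagonals of the state matrix up in columns, so the next four
	 * quarter-rounds mix diagonals instead of columns.
	 */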

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	add		state0.4s, state0.4s, state1.4s
	eor		state3.16b, state3.16b, state0.16b
	rev32		state3.8h, state3.8h

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	add		state2.4s, state2.4s, state3.4s
	eor		tmp.16b, state1.16b, state2.16b
	shl		state1.4s, tmp.4s, #12
	sri		state1.4s, tmp.4s, #20

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	add		state0.4s, state0.4s, state1.4s
	eor		tmp.16b, state3.16b, state0.16b
	shl		state3.4s, tmp.4s, #8
	sri		state3.4s, tmp.4s, #24

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	add		state2.4s, state2.4s, state3.4s
	eor		tmp.16b, state1.16b, state2.16b
	shl		state1.4s, tmp.4s, #7
	sri		state1.4s, tmp.4s, #25

	/* state1[0,1,2,3] = state1[3,0,1,2] */
	ext		state1.16b, state1.16b, state1.16b, #12
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	ext		state2.16b, state2.16b, state2.16b, #8
	/* state3[0,1,2,3] = state3[1,2,3,0] */
	ext		state3.16b, state3.16b, state3.16b, #4
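
	/* Two rounds per pass; loop until all 20 are done. */
	subs		w4, w4, #2
	b.ne		.Lpermute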

	/* output0 = state0 + copy0 */
	add		state0.4s, state0.4s, copy0.4s
	/* output1 = state1 + copy1 */
	add		state1.4s, state1.4s, copy1.4s
	/* output2 = state2 + copy2 */
	add		state2.4s, state2.4s, copy2.4s
	/* output3 = state3 + copy3 */
	add		state3.4s, state3.4s, copy3.4s
	st1		{ state0.16b - state3.16b }, [x0]

	/*
	 * ++copy3.counter, the 'add' clears the upper half of the SIMD
	 * register which is the expected behaviour here.
	 */
	add		copy3_d, copy3_d, one_d
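	/*
	 * Since one_d reads as the 64-bit value 1, this is a single 64-bit
	 * increment of the counter, and the scalar write to copy3_d re-zeroes
	 * the nonce lanes in the upper half of copy3.
	 */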

	/* output += 64, --nblocks */
	add		x0, x0, 64
	subs		x3, x3, #1
	b.ne		.Lblock

	/* counter = copy3.counter */
	st1		{ copy3.2s }, [x2]

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. copy0 holds only public constants and copy3 only the
	 * counter, which is written back above, so neither needs clearing.
	 */
	movi		state0.16b, #0
	movi		state1.16b, #0
	movi		state2.16b, #0
	movi		state3.16b, #0
	movi		copy1.16b, #0
	movi		copy2.16b, #0
	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)

emit_aarch64_feature_1_and