1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2024 Xi Ruoyao <xry111@xry111.site>. All Rights Reserved.
7 #include <asm/regdef.h>
8 #include <linux/linkage.h>
12 .macro OP_4REG op d0 d1 d2 d3 s0 s1 s2 s3
20 * Very basic LoongArch implementation of ChaCha20. Produces a given positive
21 * number of blocks of output with a nonce of 0, taking an input key and
22 * 8-byte counter. Importantly does not spill to the stack. Its arguments
26 * a1: 32-byte key input
27 * a2: 8-byte counter input/output
28 * a3: number of 64-byte blocks to write to output
30 SYM_FUNC_START(__arch_chacha20_blocks_nostack)
/*
 * Reviewer overview. This view of the function has gaps (register
 * aliases, labels and some loads/stores are not shown), so only what is
 * visible below is described.
 *
 *  - Prologue/epilogue: a frame sized for ten registers is allocated,
 *    s2..s9 are saved into slots 2..9 and reloaded from the same slots
 *    before the frame is released.
 *  - Setup: copy0..copy2 receive the first three constant words and
 *    cnt_lo/cnt_hi are loaded from the two 32-bit halves of the counter.
 *  - Rounds: a column step followed by a diagonal step, each built from
 *    add / xor / rotate groups applied to four lanes at once.
 *  - Output: the constants, the key words and the counter are added back
 *    to the permuted state and the 16 words are stored at output + 0..60.
 *  - Loop tail: the 64-bit counter is incremented, output advances by 64
 *    bytes and nblocks is decremented; the final counter is written back.
 *
 * OP_4REG takes an opcode, four destinations and four sources; its body
 * is not visible here. Presumably it emits "op dN, dN, sN" for N = 0..3
 * (TODO confirm against the macro definition). The lane comments below
 * rely on that reading.
 *
 * NOTE(review): the comment ahead of the epilogue says state[0..9] live
 * in s0-s9 and that only state[11..15] need zeroing, which leaves
 * state[10] unaccounted for. The zeroing code is not visible in this
 * view; confirm that state10 is cleared as well and that the range in
 * that comment should read state[10,...,15].
 */
32 /* We don't need a frame pointer */
62 /* Reuse i as copy3 */
65 /* Packs to be used with OP_4REG */
66 #define line0 state0, state1, state2, state3
67 #define line1 state4, state5, state6, state7
68 #define line2 state8, state9, state10, state11
69 #define line3 state12, state13, state14, state15
71 #define line1_perm state5, state6, state7, state4
72 #define line2_perm state10, state11, state8, state9
73 #define line3_perm state15, state12, state13, state14
75 #define copy copy0, copy1, copy2, copy3
77 #define _16 16, 16, 16, 16
78 #define _20 20, 20, 20, 20
79 #define _24 24, 24, 24, 24
80 #define _25 25, 25, 25, 25
83 * The ABI requires s0-s9 saved, and sp aligned to 16-byte.
84 * This does not violate the stack-less requirement: no sensitive data
85 * is spilled onto the stack.
87 PTR_ADDI sp, sp, (-SZREG * 10) & STACK_ALIGN /* room for ten registers; STACK_ALIGN is presumably the 16-byte alignment mask (defined elsewhere, TODO confirm) */
90 REG_S s2, sp, SZREG * 2 /* slot N holds sN; reloaded from the same slot in the epilogue */
91 REG_S s3, sp, SZREG * 3
92 REG_S s4, sp, SZREG * 4
93 REG_S s5, sp, SZREG * 5
94 REG_S s6, sp, SZREG * 6
95 REG_S s7, sp, SZREG * 7
96 REG_S s8, sp, SZREG * 8
97 REG_S s9, sp, SZREG * 9
99 li.w copy0, 0x61707865 /* bytes 65 78 70 61: "expa" when stored little-endian */
100 li.w copy1, 0x3320646e /* bytes 6e 64 20 33: "nd 3" */
101 li.w copy2, 0x79622d32 /* bytes 32 2d 62 79: "2-by" */
103 ld.w cnt_lo, counter, 0 /* low 32 bits of the 8-byte counter */
104 ld.w cnt_hi, counter, 4 /* high 32 bits of the 8-byte counter */
107 /* state[0,1,2,3] = "expand 32-byte k" */
111 li.w state3, 0x6b206574 /* bytes 74 65 20 6b: "te k", the fourth constant word */
113 /* state[4,5,..,11] = key */
120 ld.w state10, key, 24 /* key word 6 */
121 ld.w state11, key, 28 /* key word 7 */
123 /* state[12,13] = counter */
127 /* state[14,15] = 0 */
/*
 * Column step. With a = line0, b = line1, c = line2, d = line3 each group
 * below is one line of the ChaCha quarter-round, applied to the four
 * columns (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15) in parallel.
 * rotri.w rotates right, so the amounts 16/20/24/25 are left-rotations by
 * 16/12/8/7 on a 32-bit word.
 */
134 OP_4REG add.w line0, line1 /* a += b */
135 OP_4REG xor line3, line0 /* d ^= a */
136 OP_4REG rotri.w line3, _16 /* d = rotl32(d, 16) */
138 OP_4REG add.w line2, line3 /* c += d */
139 OP_4REG xor line1, line2 /* b ^= c */
140 OP_4REG rotri.w line1, _20 /* b = rotl32(b, 12) */
142 OP_4REG add.w line0, line1 /* a += b */
143 OP_4REG xor line3, line0 /* d ^= a */
144 OP_4REG rotri.w line3, _24 /* d = rotl32(d, 8) */
146 OP_4REG add.w line2, line3 /* c += d */
147 OP_4REG xor line1, line2 /* b ^= c */
148 OP_4REG rotri.w line1, _25 /* b = rotl32(b, 7) */
/*
 * Diagonal step. Same sequence, but the *_perm packs rotate line1 by one
 * lane, line2 by two and line3 by three, so the quarter-rounds act on
 * (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14). Only the operand order
 * changes; no register is moved.
 */
151 OP_4REG add.w line0, line1_perm /* a += b */
152 OP_4REG xor line3_perm, line0 /* d ^= a */
153 OP_4REG rotri.w line3_perm, _16 /* d = rotl32(d, 16) */
155 OP_4REG add.w line2_perm, line3_perm /* c += d */
156 OP_4REG xor line1_perm, line2_perm /* b ^= c */
157 OP_4REG rotri.w line1_perm, _20 /* b = rotl32(b, 12) */
159 OP_4REG add.w line0, line1_perm /* a += b */
160 OP_4REG xor line3_perm, line0 /* d ^= a */
161 OP_4REG rotri.w line3_perm, _24 /* d = rotl32(d, 8) */
163 OP_4REG add.w line2_perm, line3_perm /* c += d */
164 OP_4REG xor line1_perm, line2_perm /* b ^= c */
165 OP_4REG rotri.w line1_perm, _25 /* b = rotl32(b, 7) */
171 * copy[3] = "expa", materialize it here because copy[3] shares the
172 * same register with i which just became dead.
174 li.w copy3, 0x6b206574
/*
 * NOTE(review): 0x6b206574 is the bytes "te k" (the same value loaded into
 * state3 during state setup), not "expa" as the comment above states;
 * "expa" is 0x61707865, which is held in copy0.
 */
176 /* output[0,1,2,3] = copy[0,1,2,3] + state[0,1,2,3] */
177 OP_4REG add.w line0, copy
178 st.w state0, output, 0
179 st.w state1, output, 4
180 st.w state2, output, 8
181 st.w state3, output, 12
183 /* from now on state[0,1,2,3] are scratch registers */
185 /* state[0,1,2,3] = lo32(key) */
191 /* output[4,5,6,7] = state[0,1,2,3] + state[4,5,6,7] */
192 OP_4REG add.w line1, line0 /* line0 is expected to hold the reloaded key words here; the reload itself is not visible in this view */
193 st.w state4, output, 16
194 st.w state5, output, 20
195 st.w state6, output, 24
196 st.w state7, output, 28
198 /* state[0,1,2,3] = hi32(key) */
204 /* output[8,9,10,11] = state[0,1,2,3] + state[8,9,10,11] */
205 OP_4REG add.w line2, line0
206 st.w state8, output, 32
207 st.w state9, output, 36
208 st.w state10, output, 40
209 st.w state11, output, 44
211 /* output[12,13,14,15] = state[12,13,14,15] + [cnt_lo, cnt_hi, 0, 0] */
212 add.w state12, state12, cnt_lo
213 add.w state13, state13, cnt_hi
214 st.w state12, output, 48
215 st.w state13, output, 52
216 st.w state14, output, 56 /* stored as is: the matching initial word is 0, so there is nothing to add */
217 st.w state15, output, 60 /* likewise */
220 addi.w cnt_lo, cnt_lo, 1 /* 64-bit counter increment, low half first */
221 sltui state0, cnt_lo, 1 /* state0 (scratch by now) = 1 only if cnt_lo wrapped to 0, i.e. the carry */
222 add.w cnt_hi, cnt_hi, state0 /* propagate the carry into the high half */
225 PTR_ADDI output, output, 64 /* step past the 16 words just stored */
227 PTR_ADDI nblocks, nblocks, -1
228 bnez nblocks, .Lblock /* bottom-tested loop: the count is decremented before the test, hence the documented requirement that nblocks is positive */
230 /* counter = [cnt_lo, cnt_hi] */
231 st.w cnt_lo, counter, 0
232 st.w cnt_hi, counter, 4
235 * Zero out the potentially sensitive regs, in case nothing uses these
236 * again. As at now copy[0,1,2,3] just contains "expand 32-byte k" and
237 * state[0,...,9] are s0-s9 those we'll restore in the epilogue, so we
238 * only need to zero state[11,...,15].
249 REG_L s2, sp, SZREG * 2 /* restoring also overwrites the cipher data held in these registers */
250 REG_L s3, sp, SZREG * 3
251 REG_L s4, sp, SZREG * 4
252 REG_L s5, sp, SZREG * 5
253 REG_L s6, sp, SZREG * 6
254 REG_L s7, sp, SZREG * 7
255 REG_L s8, sp, SZREG * 8
256 REG_L s9, sp, SZREG * 9
257 PTR_ADDI sp, sp, -((-SZREG * 10) & STACK_ALIGN) /* exact negation of the prologue adjustment */
260 SYM_FUNC_END(__arch_chacha20_blocks_nostack)