/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
ENTRY(chacha20_block_xor_neon)
	// x0: Input state matrix, s
	// x1: 1 data block output, o
	// x2: 1 data block input, i

	// This function encrypts one ChaCha20 block by loading the state
	// matrix into four NEON registers. It performs the matrix operations
	// on four words in parallel, but requires shuffling to rearrange the
	// words after each round.

	adr		x3, ROT8
	ld1		{v12.4s}, [x3]
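	// For reference, the RFC7539 quarter-round on 32-bit words is:
	//
	//	a += b; d ^= a; d = rotl32(d, 16);
	//	c += d; b ^= c; b = rotl32(b, 12);
	//	a += b; d ^= a; d = rotl32(d, 8);
	//	c += d; b ^= c; b = rotl32(b, 7);
	//
	// Below, a/b/c/d are whole state rows (v0-v3), so four quarter-rounds
	// run in parallel. Each rotation amount uses the cheapest NEON idiom:
	// rev32 for 16, tbl with the ROT8 table (in v12) for 8, and a shl/sri
	// pair for 12 and 7.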
	// x0..3 = s0..3
	ld1		{v0.4s-v3.4s}, [x0]
	ld1		{v8.4s-v11.4s}, [x0]
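	// v8-v11 keep an untouched copy of the initial state; it is added back
	// into the working state after the rounds, as the ChaCha20 block
	// function requires.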
	mov		x3, #10

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	ext		v1.16b, v1.16b, v1.16b, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	ext		v3.16b, v3.16b, v3.16b, #12
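	// The ext shuffles above rotate rows x1-x3 so that the next four
	// quarter-rounds operate on the diagonals of the state matrix.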
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	rev32		v3.8h, v3.8h

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #12
	sri		v1.4s, v4.4s, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	add		v0.4s, v0.4s, v1.4s
	eor		v3.16b, v3.16b, v0.16b
	tbl		v3.16b, {v3.16b}, v12.16b

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	add		v2.4s, v2.4s, v3.4s
	eor		v4.16b, v1.16b, v2.16b
	shl		v1.4s, v4.4s, #7
	sri		v1.4s, v4.4s, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	ext		v1.16b, v1.16b, v1.16b, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	ext		v2.16b, v2.16b, v2.16b, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	ext		v3.16b, v3.16b, v3.16b, #4

	subs		x3, x3, #1
	b.ne		.Ldoubleround
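	// Ten double rounds (20 ChaCha20 rounds) are now complete. Generate
	// the output block: add the saved initial state, then XOR with the
	// input.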
	ld1		{v4.16b-v7.16b}, [x2]

	// o0 = i0 ^ (x0 + s0)
	add		v0.4s, v0.4s, v8.4s
	eor		v0.16b, v0.16b, v4.16b

	// o1 = i1 ^ (x1 + s1)
	add		v1.4s, v1.4s, v9.4s
	eor		v1.16b, v1.16b, v5.16b

	// o2 = i2 ^ (x2 + s2)
	add		v2.4s, v2.4s, v10.4s
	eor		v2.16b, v2.16b, v6.16b

	// o3 = i3 ^ (x3 + s3)
	add		v3.4s, v3.4s, v11.4s
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b-v3.16b}, [x1]

	ret
ENDPROC(chacha20_block_xor_neon)
ENTRY(chacha20_4block_xor_neon)
	// x0: Input state matrix, s
	// x1: 4 data blocks output, o
	// x2: 4 data blocks input, i

	// This function encrypts four consecutive ChaCha20 blocks by loading
	// the state matrix into NEON registers four times. It performs each
	// operation on the corresponding word of each state matrix in
	// parallel, hence no word shuffling is required. For the final XOR
	// step we transpose the matrix by interleaving 32-bit and then 64-bit
	// words, which allows the XOR to be done in NEON registers.
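	// Register layout during the rounds, one register per state word and
	// one lane per block:
	//
	//	v0.s[n] = word 0 of block n, ..., v15.s[n] = word 15 of block n
	//
	// so the scalar quarter-round sequence maps directly onto vector
	// instructions with no lane shuffling.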
	adr		x3, CTRINC		// ... and ROT8
	ld1		{v30.4s-v31.4s}, [x3]
	// x0..15[0-3] = s0..3[0..3]
	mov		x4, x0
	ld4r		{ v0.4s- v3.4s}, [x4], #16
	ld4r		{ v4.4s- v7.4s}, [x4], #16
	ld4r		{ v8.4s-v11.4s}, [x4], #16
	ld4r		{v12.4s-v15.4s}, [x4]
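	// ld4r loads four consecutive 32-bit words and replicates each one
	// across all lanes of its destination register, so v0-v15 now hold
	// s0-s15 broadcast into every lane.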
	// x12 += counter values 0-3
	add		v12.4s, v12.4s, v30.4s
	mov		x3, #10

.Ldoubleround4:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	add		v0.4s, v0.4s, v4.4s
	add		v1.4s, v1.4s, v5.4s
	add		v2.4s, v2.4s, v6.4s
	add		v3.4s, v3.4s, v7.4s

	eor		v12.16b, v12.16b, v0.16b
	eor		v13.16b, v13.16b, v1.16b
	eor		v14.16b, v14.16b, v2.16b
	eor		v15.16b, v15.16b, v3.16b

	rev32		v12.8h, v12.8h
	rev32		v13.8h, v13.8h
	rev32		v14.8h, v14.8h
	rev32		v15.8h, v15.8h

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	add		v8.4s, v8.4s, v12.4s
	add		v9.4s, v9.4s, v13.4s
	add		v10.4s, v10.4s, v14.4s
	add		v11.4s, v11.4s, v15.4s

	eor		v16.16b, v4.16b, v8.16b
	eor		v17.16b, v5.16b, v9.16b
	eor		v18.16b, v6.16b, v10.16b
	eor		v19.16b, v7.16b, v11.16b

	shl		v4.4s, v16.4s, #12
	shl		v5.4s, v17.4s, #12
	shl		v6.4s, v18.4s, #12
	shl		v7.4s, v19.4s, #12

	sri		v4.4s, v16.4s, #20
	sri		v5.4s, v17.4s, #20
	sri		v6.4s, v18.4s, #20
	sri		v7.4s, v19.4s, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	add		v0.4s, v0.4s, v4.4s
	add		v1.4s, v1.4s, v5.4s
	add		v2.4s, v2.4s, v6.4s
	add		v3.4s, v3.4s, v7.4s

	eor		v12.16b, v12.16b, v0.16b
	eor		v13.16b, v13.16b, v1.16b
	eor		v14.16b, v14.16b, v2.16b
	eor		v15.16b, v15.16b, v3.16b

	tbl		v12.16b, {v12.16b}, v31.16b
	tbl		v13.16b, {v13.16b}, v31.16b
	tbl		v14.16b, {v14.16b}, v31.16b
	tbl		v15.16b, {v15.16b}, v31.16b
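	// v31 still holds the ROT8 byte-permutation table loaded at entry, so
	// each tbl above rotates every 32-bit lane left by 8 bits.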
	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	add		v8.4s, v8.4s, v12.4s
	add		v9.4s, v9.4s, v13.4s
	add		v10.4s, v10.4s, v14.4s
	add		v11.4s, v11.4s, v15.4s

	eor		v16.16b, v4.16b, v8.16b
	eor		v17.16b, v5.16b, v9.16b
	eor		v18.16b, v6.16b, v10.16b
	eor		v19.16b, v7.16b, v11.16b

	shl		v4.4s, v16.4s, #7
	shl		v5.4s, v17.4s, #7
	shl		v6.4s, v18.4s, #7
	shl		v7.4s, v19.4s, #7

	sri		v4.4s, v16.4s, #25
	sri		v5.4s, v17.4s, #25
	sri		v6.4s, v18.4s, #25
	sri		v7.4s, v19.4s, #25
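	// Second half of the double round: the same quarter-round, now applied
	// to the diagonals (x0,x5,x10,x15), (x1,x6,x11,x12), (x2,x7,x8,x13)
	// and (x3,x4,x9,x14).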
	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	add		v0.4s, v0.4s, v5.4s
	add		v1.4s, v1.4s, v6.4s
	add		v2.4s, v2.4s, v7.4s
	add		v3.4s, v3.4s, v4.4s

	eor		v15.16b, v15.16b, v0.16b
	eor		v12.16b, v12.16b, v1.16b
	eor		v13.16b, v13.16b, v2.16b
	eor		v14.16b, v14.16b, v3.16b

	rev32		v15.8h, v15.8h
	rev32		v12.8h, v12.8h
	rev32		v13.8h, v13.8h
	rev32		v14.8h, v14.8h

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	add		v10.4s, v10.4s, v15.4s
	add		v11.4s, v11.4s, v12.4s
	add		v8.4s, v8.4s, v13.4s
	add		v9.4s, v9.4s, v14.4s

	eor		v16.16b, v5.16b, v10.16b
	eor		v17.16b, v6.16b, v11.16b
	eor		v18.16b, v7.16b, v8.16b
	eor		v19.16b, v4.16b, v9.16b

	shl		v5.4s, v16.4s, #12
	shl		v6.4s, v17.4s, #12
	shl		v7.4s, v18.4s, #12
	shl		v4.4s, v19.4s, #12

	sri		v5.4s, v16.4s, #20
	sri		v6.4s, v17.4s, #20
	sri		v7.4s, v18.4s, #20
	sri		v4.4s, v19.4s, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	add		v0.4s, v0.4s, v5.4s
	add		v1.4s, v1.4s, v6.4s
	add		v2.4s, v2.4s, v7.4s
	add		v3.4s, v3.4s, v4.4s

	eor		v15.16b, v15.16b, v0.16b
	eor		v12.16b, v12.16b, v1.16b
	eor		v13.16b, v13.16b, v2.16b
	eor		v14.16b, v14.16b, v3.16b

	tbl		v15.16b, {v15.16b}, v31.16b
	tbl		v12.16b, {v12.16b}, v31.16b
	tbl		v13.16b, {v13.16b}, v31.16b
	tbl		v14.16b, {v14.16b}, v31.16b

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	add		v10.4s, v10.4s, v15.4s
	add		v11.4s, v11.4s, v12.4s
	add		v8.4s, v8.4s, v13.4s
	add		v9.4s, v9.4s, v14.4s

	eor		v16.16b, v5.16b, v10.16b
	eor		v17.16b, v6.16b, v11.16b
	eor		v18.16b, v7.16b, v8.16b
	eor		v19.16b, v4.16b, v9.16b

	shl		v5.4s, v16.4s, #7
	shl		v6.4s, v17.4s, #7
	shl		v7.4s, v18.4s, #7
	shl		v4.4s, v19.4s, #7

	sri		v5.4s, v16.4s, #25
	sri		v6.4s, v17.4s, #25
	sri		v7.4s, v18.4s, #25
	sri		v4.4s, v19.4s, #25

	subs		x3, x3, #1
	b.ne		.Ldoubleround4
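	// The 20 rounds are done. Reload the initial state, again broadcast
	// with ld4r, and add it to the working state before transposing and
	// XORing with the input blocks.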
	ld4r		{v16.4s-v19.4s}, [x0], #16
	ld4r		{v20.4s-v23.4s}, [x0], #16

	// x12 += counter values 0-3
	add		v12.4s, v12.4s, v30.4s
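	// CTRINC is added a second time here because the per-block initial
	// state that must be added back differs only in its counter word:
	// block n started from counter s12 + n.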
	// x0..3[0-3] += s0[0..3]
	add		v0.4s, v0.4s, v16.4s
	add		v1.4s, v1.4s, v17.4s
	add		v2.4s, v2.4s, v18.4s
	add		v3.4s, v3.4s, v19.4s

	ld4r		{v24.4s-v27.4s}, [x0], #16
	ld4r		{v28.4s-v31.4s}, [x0]

	// x4..7[0-3] += s1[0..3]
	add		v4.4s, v4.4s, v20.4s
	add		v5.4s, v5.4s, v21.4s
	add		v6.4s, v6.4s, v22.4s
	add		v7.4s, v7.4s, v23.4s

	// x8..11[0-3] += s2[0..3]
	add		v8.4s, v8.4s, v24.4s
	add		v9.4s, v9.4s, v25.4s
	add		v10.4s, v10.4s, v26.4s
	add		v11.4s, v11.4s, v27.4s

	// x12..15[0-3] += s3[0..3]
	add		v12.4s, v12.4s, v28.4s
	add		v13.4s, v13.4s, v29.4s
	add		v14.4s, v14.4s, v30.4s
	add		v15.4s, v15.4s, v31.4s
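	// The state is currently word-sliced: register n holds word n of all
	// four blocks. The zip1/zip2 sequences below transpose it so that each
	// register ends up holding 16 contiguous bytes of one block, matching
	// the layout of the input and output buffers.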
	// interleave 32-bit words in state n, n+1
	zip1		v16.4s, v0.4s, v1.4s
	zip2		v17.4s, v0.4s, v1.4s
	zip1		v18.4s, v2.4s, v3.4s
	zip2		v19.4s, v2.4s, v3.4s
	zip1		v20.4s, v4.4s, v5.4s
	zip2		v21.4s, v4.4s, v5.4s
	zip1		v22.4s, v6.4s, v7.4s
	zip2		v23.4s, v6.4s, v7.4s
	zip1		v24.4s, v8.4s, v9.4s
	zip2		v25.4s, v8.4s, v9.4s
	zip1		v26.4s, v10.4s, v11.4s
	zip2		v27.4s, v10.4s, v11.4s
	zip1		v28.4s, v12.4s, v13.4s
	zip2		v29.4s, v12.4s, v13.4s
	zip1		v30.4s, v14.4s, v15.4s
	zip2		v31.4s, v14.4s, v15.4s

	// interleave 64-bit words in state n, n+2
	zip1		v0.2d, v16.2d, v18.2d
	zip2		v4.2d, v16.2d, v18.2d
	zip1		v8.2d, v17.2d, v19.2d
	zip2		v12.2d, v17.2d, v19.2d
	ld1		{v16.16b-v19.16b}, [x2], #64

	zip1		v1.2d, v20.2d, v22.2d
	zip2		v5.2d, v20.2d, v22.2d
	zip1		v9.2d, v21.2d, v23.2d
	zip2		v13.2d, v21.2d, v23.2d
	ld1		{v20.16b-v23.16b}, [x2], #64

	zip1		v2.2d, v24.2d, v26.2d
	zip2		v6.2d, v24.2d, v26.2d
	zip1		v10.2d, v25.2d, v27.2d
	zip2		v14.2d, v25.2d, v27.2d
	ld1		{v24.16b-v27.16b}, [x2], #64

	zip1		v3.2d, v28.2d, v30.2d
	zip2		v7.2d, v28.2d, v30.2d
	zip1		v11.2d, v29.2d, v31.2d
	zip2		v15.2d, v29.2d, v31.2d
	ld1		{v28.16b-v31.16b}, [x2]
	// xor with corresponding input, write to output
	eor		v16.16b, v16.16b, v0.16b
	eor		v17.16b, v17.16b, v1.16b
	eor		v18.16b, v18.16b, v2.16b
	eor		v19.16b, v19.16b, v3.16b
	eor		v20.16b, v20.16b, v4.16b
	eor		v21.16b, v21.16b, v5.16b
	st1		{v16.16b-v19.16b}, [x1], #64
	eor		v22.16b, v22.16b, v6.16b
	eor		v23.16b, v23.16b, v7.16b
	eor		v24.16b, v24.16b, v8.16b
	eor		v25.16b, v25.16b, v9.16b
	st1		{v20.16b-v23.16b}, [x1], #64
	eor		v26.16b, v26.16b, v10.16b
	eor		v27.16b, v27.16b, v11.16b
	eor		v28.16b, v28.16b, v12.16b
	st1		{v24.16b-v27.16b}, [x1], #64
	eor		v29.16b, v29.16b, v13.16b
	eor		v30.16b, v30.16b, v14.16b
	eor		v31.16b, v31.16b, v15.16b
	st1		{v28.16b-v31.16b}, [x1]

	ret
ENDPROC(chacha20_4block_xor_neon)
CTRINC:	.word		0, 1, 2, 3
ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
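	// CTRINC supplies the per-block counter offsets 0-3. ROT8 is a tbl
	// index vector: within each 32-bit lane it selects bytes (3, 0, 1, 2),
	// which on little-endian data is a rotate left by 8 bits.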