/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

/*
 * NEON doesn't have a rotate instruction. The alternatives are, more or less:
 *
 * (a)  vshl.u32 + vsri.u32		(needs temporary register)
 * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
 * (c)  vrev32.16			(16-bit rotations only)
 * (d)  vtbl.8 + vtbl.8			(rotations by multiples of 8 bits only,
 *					 needs an index vector)
 *
 * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations,
 * the only choices are (a) and (b). We use (a) since it takes two-thirds the
 * cycles of (b) on both Cortex-A7 and Cortex-A53.
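 *
 * For example, rotating each 32-bit lane of q1 left by 12 with option (a)
 * looks roughly like this (illustrative only; the register allocation used
 * in the rounds below differs):
 *
 *	vshl.u32	q4, q1, #12	// q4 = q1 << 12
 *	vsri.u32	q4, q1, #20	// q4 |= q1 >> 20, i.e. rotl32(q1, 12)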
 *
 * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
 * and doesn't need a temporary register.
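 *
 * That is, a single instruction such as
 *
 *	vrev32.16	q3, q3		// swap the 16-bit halves of each word
 *
 * is exactly rotl32(x, 16) on every 32-bit lane.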
 *
 * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
 * is twice as fast as (a), even when doing (a) on multiple registers
 * simultaneously to eliminate the stall between vshl and vsri. Also, it
 * parallelizes better when temporary registers are scarce.
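 *
 * Roughly, with d10 holding the index vector from .Lrol8_table below, the
 * sequence looks like this (illustrative only):
 *
 *	vtbl.8		d6, {d6}, d10	// rotate both words in d6 left by 8
 *	vtbl.8		d7, {d7}, d10	// rotate both words in d7 left by 8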
 *
 * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
 * (a), so the need to load the rotation table actually makes the vtbl method
 * slightly slower overall on that CPU (~1.3% slower ChaCha20). Still, it
 * seems to be a good compromise to get a more significant speed boost on some
 * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
 */

#include <linux/linkage.h>
#include <asm/cache.h>

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers q0-q3. It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in r3.
 *
 * Clobbers: r3, ip, q4-q5
 */
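
	// For reference, each loop iteration below applies the ChaCha
	// quarter-round
	//
	//	a += b; d = rotl32(d ^ a, 16);
	//	c += d; b = rotl32(b ^ c, 12);
	//	a += b; d = rotl32(d ^ a, 8);
	//	c += d; b = rotl32(b ^ c, 7);
	//
	// to all four columns at once (q0-q3 holding a, b, c, d), then shuffles
	// the words so the same code covers the diagonals.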
	vld1.8		{d10}, [ip, :64]

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vext.8		q1, q1, q1, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vext.8		q3, q3, q3, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vext.8		q1, q1, q1, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vext.8		q3, q3, q3, #4

ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_neon)
	// r0: Input state matrix, s
	// r1: 1 data block output, o
	// r2: 1 data block input, i
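	//
	// In C-like terms (illustrative only), one 64-byte block is produced
	// as
	//
	//	x = s; chacha_permute(x);
	//	for (n = 0; n < 16; n++)
	//		o[n] = i[n] ^ (x[n] + s[n]);
	//
	// with i and o viewed as arrays of 16 little-endian words.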
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

	// o0 = i0 ^ (x0 + s0)
	// o1 = i1 ^ (x1 + s1)
	// o2 = i2 ^ (x2 + s2)
	// o3 = i3 ^ (x3 + s3)

ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
	// r0: Input state matrix, s
	// r1: output (8 32-bit words)
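	//
	// HChaCha runs the same permutation as a full block but omits the
	// final addition of the original state; the eight words written to r1
	// are words 0-3 and 12-15 of the permuted state.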
	vld1.32		{q0-q1}, [r0]!
	vld1.32		{q2-q3}, [r0]

ENDPROC(hchacha_block_neon)

.Lctrinc:	.word	0, 1, 2, 3
.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
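
	// .Lrol8_table above is the vtbl.8 index vector for rotating each
	// 32-bit lane left by 8 bits: destination byte n is taken from source
	// byte table[n], so { 3, 0, 1, 2 } maps bytes (b0 b1 b2 b3) of a
	// little-endian word to (b3 b0 b1 b2), i.e. rotl32(x, 8).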

ENTRY(chacha_4block_xor_neon)
	mov		r4, sp			// preserve the stack pointer
	sub		ip, sp, #0x20		// allocate a 32 byte buffer
	bic		ip, ip, #0x1f		// aligned to 32 bytes

	// r0: Input state matrix, s
	// r1: 4 data blocks output, o
	// r2: 4 data blocks input, i
	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix into NEON registers four times. The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, hence requires no word shuffling. The words are
	// re-interleaved before the final addition of the original state and
	// the XORing step.
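	//
	// Illustrative C-like sketch of that layout (not literal code):
	//
	//	u32 x[16][4];			// x[n][b] = word n of block b
	//	for (n = 0; n < 16; n++)
	//		for (b = 0; b < 4; b++)
	//			x[n][b] = s[n];
	//	for (b = 0; b < 4; b++)
	//		x[12][b] += b;		// per-block counter values 0-3
	//
	// Each x[n][0..3] is kept in one NEON q register (with x8/x9 spilled
	// to the 32-byte stack buffer during the rounds), so every vector
	// instruction advances all four blocks at once.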

	// x0..15[0-3] = s0..15[0-3]
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

	vld1.32		{q4}, [lr, :128]
	vadd.u32	q12, q12, q4		// x12 += counter values 0-3

	vld1.32		{q8-q9}, [sp, :256]

	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32		{q8-q9}, [sp, :256]

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vld1.8		{d16}, [ip, :64]

	vtbl.8		d24, {d24}, d16
	vtbl.8		d25, {d25}, d16
	vtbl.8		d26, {d26}, d16
	vtbl.8		d27, {d27}, d16
	vtbl.8		d28, {d28}, d16
	vtbl.8		d29, {d29}, d16
	vtbl.8		d30, {d30}, d16
	vtbl.8		d31, {d31}, d16

	vld1.32		{q8-q9}, [sp, :256]

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32		{q8-q9}, [sp, :256]

	vld1.32		{q8-q9}, [sp, :256]

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12

	vst1.32		{q8-q9}, [sp, :256]

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vld1.8		{d16}, [ip, :64]

	vtbl.8		d30, {d30}, d16
	vtbl.8		d31, {d31}, d16
	vtbl.8		d24, {d24}, d16
	vtbl.8		d25, {d25}, d16
	vtbl.8		d26, {d26}, d16
	vtbl.8		d27, {d27}, d16
	vtbl.8		d28, {d28}, d16
	vtbl.8		d29, {d29}, d16

	vld1.32		{q8-q9}, [sp, :256]

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12

	vst1.32		{q8-q9}, [sp, :256]

	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
	// x8..9[0-3] are on the stack.

	// Re-interleave the words in the first two rows of each block (x0..7).
	// Also add the counter values 0-3 to x12[0-3].
	vld1.32		{q8}, [lr, :128]	// load counter values 0-3
	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
	vadd.u32	q12, q8			// x12 += counter values 0-3
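	//
	// After each vzip.32 above, the register pair holds two words from
	// two blocks, e.g. q0 = { x0, x1 of block 0, x0, x1 of block 1 } and
	// q1 = { x0, x1 of block 2, x0, x1 of block 3 }, which is what the
	// "(0 1 0 1)" annotations mean.  Re-adding the counter values to x12
	// here makes the final feed-forward addition of s12 below include the
	// per-block counters.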

	vld1.32		{q8-q9}, [r0]!		// load s0..7

	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
	// after XORing the first 32 bytes.

	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)

	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)

	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)

	// XOR first 32 bytes using keystream from first two rows of first block
	vld1.8		{q8-q9}, [r2]!
	vst1.8		{q8-q9}, [r1]!

	// Re-interleave the words in the last two rows of each block (x8..15).
	vld1.32		{q8-q9}, [sp, :256]
	mov		sp, r4			// restore original stack pointer
	ldr		r4, [r4, #8]		// load number of bytes
	vzip.32		q12, q13		// => (12 13 12 13) (12 13 12 13)
	vzip.32		q14, q15		// => (14 15 14 15) (14 15 14 15)
	vzip.32		q8, q9			// => (8 9 8 9) (8 9 8 9)
	vzip.32		q10, q11		// => (10 11 10 11) (10 11 10 11)
	vld1.32		{q0-q1}, [r0]		// load s8..15

	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)

	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
	vadd.u32	q10, q10, q0
	vadd.u32	q11, q11, q0

	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
	vadd.u32	q12, q12, q1
	vadd.u32	q14, q14, q1
	vadd.u32	q13, q13, q1
	vadd.u32	q15, q15, q1

	// XOR the rest of the data with the keystream
	vld1.8		{q0-q1}, [r2]!
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	vst1.8		{q0-q1}, [r1]!

	// Process the final block if processing less than 4 full blocks.
	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
	// previous 32 byte output block that still needs to be written at

	adr		lr, .Lpermute + 32

	vtbl.8		d4, {q4-q5}, d4
	vtbl.8		d5, {q4-q5}, d5
	vtbl.8		d6, {q4-q5}, d6
	vtbl.8		d7, {q4-q5}, d7

	vst1.8		{q6-q7}, [r4]		// overlapping stores

ENDPROC(chacha_4block_xor_neon)

	.align		L1_CACHE_SHIFT
.Lpermute:
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
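
	// .Lpermute holds two consecutive copies of the byte indices
	// 0x00-0x1f, so a 32-byte window loaded at some offset into the table
	// forms a vtbl.8 index vector that byte-rotates a 32-byte source.
	// The partial-block path above uses such a window (see the
	// "adr lr, .Lpermute + 32" and the vtbl.8 lookups on q4-q5) so that a
	// pair of overlapping 32-byte stores can cover the remaining bytes
	// without a byte-at-a-time loop.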