1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
6 #include <linux/linkage.h>
/*
 * ChaCha constant row. Stored little-endian (as on x86), these 16 bytes are
 * the ASCII text: expand 32-byte k (byte 0x65 = e comes first, 0x6b = k last).
 * __arch_chacha20_blocks_nostack loads this symbol with movaps, which needs a
 * 16-byte aligned memory operand.
 * NOTE(review): confirm the alignment directive that applies to this symbol.
 */
11 CONSTANTS: .octa 0x6b20657479622d323320646e61707865
15 * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
16 * of blocks of output with a nonce of 0, taking an input key and 8-byte
17 * counter. Importantly does not spill to the stack. Its arguments are:
20 * rsi: 32-byte key input
21 * rdx: 8-byte counter input/output
22 * rcx: number of 64-byte blocks to write to output
24 SYM_FUNC_START(__arch_chacha20_blocks_nostack)
/*
 * Symbol roles, as used by the instructions and step comments below:
 *   copy0..copy3         - input rows: constant, key bytes 0-15, key bytes
 *                          16-31, and the 8-byte counter followed by a zero nonce
 *   state0..state3       - working rows that are permuted, then stored to output
 *   key, counter, output - pointer arguments used as memory operand bases
 * NOTE(review): output is presumably the first argument (rdi) - confirm
 * against the alias definitions.
 */
31 /* xmm registers are *not* callee-save. */
/* Consequence: they can be overwritten here without being saved first. */
43 /* copy0 = "expand 32-byte k" */
/*
 * RIP-relative load of the constant row. movaps faults on a misaligned memory
 * operand, so this relies on CONSTANTS being 16-byte aligned.
 */
44 movaps CONSTANTS(%rip),copy0
45 /* copy1,copy2 = key */
/* movups is used for both halves, so the key pointer needs no alignment. */
46 movups 0x00(key),copy1
47 movups 0x10(key),copy2
48 /* copy3 = counter || zero nonce */
/*
 * An 8-byte movq load into an xmm register clears the upper 64 bits of that
 * register, which supplies the zero nonce half of the row.
 * NOTE(review): assumes copy3 aliases an xmm register - confirm.
 */
49 movq 0x00(counter),copy3
55 /* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */
63 /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
71 /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
79 /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
87 /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
/*
 * Rotate rows 1, 2 and 3 by 1, 2 and 3 lanes. Each 2-bit field of the pshufd
 * immediate, lowest field first, selects the source lane for one destination
 * lane, e.g. 0x39 = 0b00111001 selects lanes 1,2,3,0. In ChaCha terms this
 * lines the diagonals up as columns, so the four steps that follow the
 * shuffles act on diagonals.
 */
95 /* state1[0,1,2,3] = state1[1,2,3,0] */
96 pshufd $0x39,state1,state1
97 /* state2[0,1,2,3] = state2[2,3,0,1] */
98 pshufd $0x4e,state2,state2
99 /* state3[0,1,2,3] = state3[3,0,1,2] */
100 pshufd $0x93,state3,state3
102 /* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
110 /* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
118 /* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
126 /* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
/*
 * Inverse of the earlier shuffle group: 0x93 undoes 0x39, 0x4e undoes itself,
 * and 0x39 undoes 0x93, so every row returns to its original lane order.
 */
134 /* state1[0,1,2,3] = state1[3,0,1,2] */
135 pshufd $0x93,state1,state1
136 /* state2[0,1,2,3] = state2[2,3,0,1] */
137 pshufd $0x4e,state2,state2
138 /* state3[0,1,2,3] = state3[1,2,3,0] */
139 pshufd $0x39,state3,state3
/*
 * The four movups stores below write one 64-byte block at offsets 0x00..0x30
 * from output. They are unaligned stores, so output needs no alignment.
 */
144 /* output0 = state0 + copy0 */
146 movups state0,0x00(output)
147 /* output1 = state1 + copy1 */
149 movups state1,0x10(output)
150 /* output2 = state2 + copy2 */
152 movups state2,0x20(output)
153 /* output3 = state3 + copy3 */
155 movups state3,0x30(output)
157 /* ++copy3.counter */
160 /* output += 64, --nblocks */
165 /* counter = copy3.counter */
/*
 * Stores the low 8 bytes of copy3 back through the counter pointer. The
 * header documents counter as input/output, so this is how the caller
 * receives the updated counter value.
 */
166 movq copy3,0x00(counter)
168 /* Zero out the potentially sensitive regs, in case nothing uses these again. */
178 SYM_FUNC_END(__arch_chacha20_blocks_nostack)