drm/modes: Fix drm_mode_vrefres() docs
[drm/drm-misc.git] / arch / x86 / entry / vdso / vgetrandom-chacha.S
blobbcba5639b8ee9a33556ba0d81287a4ef953b3446
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
4  */
6 #include <linux/linkage.h>
7 #include <asm/frame.h>
/*
 * The 16 constant bytes "expand 32-byte k" that fill the first row of the
 * ChaCha20 state. Kept 16-byte aligned because the code loads it with movaps.
 */
.section	.rodata, "a"
.balign 16
CONSTANTS:	.ascii "expand 32-byte k"
.text
/*
 * __arch_chacha20_blocks_nostack - minimal SSE2 ChaCha20 block generator
 *
 * Writes a caller-chosen, strictly positive number of 64-byte ChaCha20 blocks
 * computed from the given key, an all-zero nonce and a 64-bit block counter.
 * Every intermediate value is held in a register; nothing is spilled to the
 * stack.
 *
 * Arguments:
 *	rdi: where the output bytes are written (64 bytes per block)
 *	rsi: 32-byte key
 *	rdx: 8-byte block counter, read on entry and written back on exit
 *	rcx: how many 64-byte blocks to produce (must not be zero)
 *
 * Besides advancing rdi and consuming rcx, the routine overwrites rax,
 * xmm0-xmm9 and the flags.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

/* General-purpose register roles. */
.set	dst,		%rdi
.set	key_ptr,	%rsi
.set	ctr_ptr,	%rdx
.set	blocks_left,	%rcx
.set	rounds_left,	%al
/* Vector register roles; xmm registers are *not* callee-save. */
.set	scratch,	%xmm0
.set	row_a,		%xmm1
.set	row_b,		%xmm2
.set	row_c,		%xmm3
.set	row_d,		%xmm4
.set	init_a,		%xmm5
.set	init_b,		%xmm6
.set	init_c,		%xmm7
.set	init_d,		%xmm8
.set	ctr_step,	%xmm9

/* \reg = rotl32(\reg, \bits) in each 32-bit lane; overwrites scratch. */
.macro	chacha_rotl32	bits, reg
	movdqa		\reg,scratch
	pslld		$(\bits),scratch
	psrld		$(32-(\bits)),\reg
	por		scratch,\reg
.endm

/*
 * One ChaCha round: the quarter-round applied to the four (row_a, row_b,
 * row_c, row_d) lane groups at once. Overwrites scratch.
 */
.macro	chacha_round
	/* row_a += row_b, row_d = rotl32(row_d ^ row_a, 16) */
	paddd		row_b,row_a
	pxor		row_a,row_d
	chacha_rotl32	16,row_d

	/* row_c += row_d, row_b = rotl32(row_b ^ row_c, 12) */
	paddd		row_d,row_c
	pxor		row_c,row_b
	chacha_rotl32	12,row_b

	/* row_a += row_b, row_d = rotl32(row_d ^ row_a, 8) */
	paddd		row_b,row_a
	pxor		row_a,row_d
	chacha_rotl32	8,row_d

	/* row_c += row_d, row_b = rotl32(row_b ^ row_c, 7) */
	paddd		row_d,row_c
	pxor		row_c,row_b
	chacha_rotl32	7,row_b
.endm

	/* init_a = "expand 32-byte k" */
	movaps		CONSTANTS(%rip),init_a
	/* init_b, init_c = low and high 16 bytes of the key */
	movups		0x00(key_ptr),init_b
	movups		0x10(key_ptr),init_c
	/* init_d = 64-bit counter in the low half, zero nonce in the high half */
	movq		0x00(ctr_ptr),init_d
	/* ctr_step = 1 in the low 64 bits, 0 in the high 64 bits */
	movq		$1,%rax
	movq		%rax,ctr_step

.Lnext_block:
	/* Begin each block from an untouched copy of the input rows. */
	movdqa		init_a,row_a
	movdqa		init_b,row_b
	movdqa		init_c,row_c
	movdqa		init_d,row_d

	/* 10 passes, each made of two rounds */
	movb		$10,rounds_left
.Ldouble_round:
	chacha_round

	/* row_b[0,1,2,3] = row_b[1,2,3,0] */
	pshufd		$0x39,row_b,row_b
	/* row_c[0,1,2,3] = row_c[2,3,0,1] */
	pshufd		$0x4e,row_c,row_c
	/* row_d[0,1,2,3] = row_d[3,0,1,2] */
	pshufd		$0x93,row_d,row_d

	chacha_round

	/* Put the lanes back: row_b[0,1,2,3] = row_b[3,0,1,2] */
	pshufd		$0x93,row_b,row_b
	/* row_c[0,1,2,3] = row_c[2,3,0,1] */
	pshufd		$0x4e,row_c,row_c
	/* row_d[0,1,2,3] = row_d[1,2,3,0] */
	pshufd		$0x39,row_d,row_d

	decb		rounds_left
	jnz		.Ldouble_round

	/* Output row N = permuted row N + input row N, stored at dst + 16 * N. */
	paddd		init_a,row_a
	movups		row_a,0x00(dst)
	paddd		init_b,row_b
	movups		row_b,0x10(dst)
	paddd		init_c,row_c
	movups		row_c,0x20(dst)
	paddd		init_d,row_d
	movups		row_d,0x30(dst)

	/* Advance the 64-bit block counter held in the low half of init_d. */
	paddq		ctr_step,init_d

	/* Move on to the next 64-byte output slot; loop while blocks remain. */
	addq		$64,dst
	decq		blocks_left
	jnz		.Lnext_block

	/* Hand the advanced counter back to the caller. */
	movq		init_d,0x00(ctr_ptr)

	/* Wipe the registers that held key or keystream material, in case nothing reuses them. */
	pxor		row_a,row_a
	pxor		row_b,row_b
	pxor		row_c,row_c
	pxor		row_d,row_d
	pxor		init_b,init_b
	pxor		init_c,init_c
	pxor		scratch,scratch

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)