// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2022-2024 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 */

#include <linux/array_size.h>
#include <linux/minmax.h>
#include <vdso/datapage.h>
#include <vdso/getrandom.h>
#include <vdso/unaligned.h>
#include <asm/vdso/getrandom.h>
#include <uapi/linux/mman.h>
#include <uapi/linux/random.h>

#define PAGE_SIZE (1UL << CONFIG_PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE - 1))

#define MEMCPY_AND_ZERO_SRC(type, dst, src, len) do {				\
	while (len >= sizeof(type)) {						\
		__put_unaligned_t(type, __get_unaligned_t(type, src), dst);	\
		__put_unaligned_t(type, 0, src);				\
		dst += sizeof(type);						\
		src += sizeof(type);						\
		len -= sizeof(type);						\
	}									\
} while (0)

/*
 * Copy @len bytes from @src to @dst, zeroing @src as it is consumed, using the widest
 * accesses that the configuration allows.
 */
static void memcpy_and_zero_src(void *dst, void *src, size_t len)
{
	if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) {
		if (IS_ENABLED(CONFIG_64BIT))
			MEMCPY_AND_ZERO_SRC(u64, dst, src, len);
		MEMCPY_AND_ZERO_SRC(u32, dst, src, len);
		MEMCPY_AND_ZERO_SRC(u16, dst, src, len);
	}
	MEMCPY_AND_ZERO_SRC(u8, dst, src, len);
}

/**
 * __cvdso_getrandom_data - Generic vDSO implementation of getrandom() syscall.
 * @rng_info: Describes state of kernel RNG, memory shared with kernel.
 * @buffer: Destination buffer to fill with random bytes.
 * @len: Size of @buffer in bytes.
 * @flags: Zero or more GRND_* flags.
 * @opaque_state: Pointer to an opaque state area.
 * @opaque_len: Length of opaque state area.
 *
 * This implements a "fast key erasure" RNG using ChaCha20, in the same way that the kernel's
 * getrandom() syscall does. It periodically reseeds its key from the kernel's RNG, at the same
 * schedule that the kernel's RNG is reseeded. If the kernel's RNG is not ready, then this always
 * calls into the syscall.
 *
 * If @buffer, @len, and @flags are 0, and @opaque_len is ~0UL, then @opaque_state is populated
 * with a struct vgetrandom_opaque_params and the function returns 0; if it does not return 0,
 * this function should not be used.
 *
 * @opaque_state *must* be allocated by calling mmap(2) using the mmap_prot and mmap_flags fields
 * from the struct vgetrandom_opaque_params, and states must not straddle pages. Unless external
 * locking is used, one state must be allocated per thread, as it is not safe to call this function
 * concurrently with the same @opaque_state. However, it is safe to call this using the same
 * @opaque_state that is shared between main code and signal handling code, within the same thread.
 *
 * Returns: The number of random bytes written to @buffer, or a negative value indicating an error.
 */
static __always_inline ssize_t
__cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_t len,
		       unsigned int flags, void *opaque_state, size_t opaque_len)
{
	ssize_t ret = min_t(size_t, INT_MAX & PAGE_MASK /* = MAX_RW_COUNT */, len);
	struct vgetrandom_state *state = opaque_state;
	size_t batch_len, nblocks, orig_len = len;
	bool in_use, have_retried = false;
	void *orig_buffer = buffer;
	u64 current_generation;
	u32 counter[2] = { 0 };

	if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) {
		struct vgetrandom_opaque_params *params = opaque_state;
		params->size_of_opaque_state = sizeof(*state);
		params->mmap_prot = PROT_READ | PROT_WRITE;
		params->mmap_flags = MAP_DROPPABLE | MAP_ANONYMOUS;
		for (size_t i = 0; i < ARRAY_SIZE(params->reserved); ++i)
			params->reserved[i] = 0;
		return 0;
	}

	/* The state must not straddle a page, since pages can be zeroed at any time. */
	if (unlikely(((unsigned long)opaque_state & ~PAGE_MASK) + sizeof(*state) > PAGE_SIZE))
		return -EFAULT;

	/* Handle unexpected flags by falling back to the kernel. */
	if (unlikely(flags & ~(GRND_NONBLOCK | GRND_RANDOM | GRND_INSECURE)))
		goto fallback_syscall;

	/* If the caller passes the wrong size, which might happen due to CRIU, fallback. */
	if (unlikely(opaque_len != sizeof(*state)))
		goto fallback_syscall;

	/*
	 * If the kernel's RNG is not yet ready, then it's not possible to provide random bytes from
	 * userspace, because A) the various @flags require this to block, or not, depending on
	 * various factors unavailable to userspace, and B) the kernel's behavior before the RNG is
	 * ready is to reseed from the entropy pool at every invocation.
	 */
	if (unlikely(!READ_ONCE(rng_info->is_ready)))
		goto fallback_syscall;

	/*
	 * This condition is checked after @rng_info->is_ready, because before the kernel's RNG is
	 * initialized, the @flags parameter may require this to block or return an error, even when
	 * @len is 0.
	 */
	if (unlikely(!len))
		return 0;

	/*
	 * @state->in_use is basic reentrancy protection against this running in a signal handler
	 * with the same @opaque_state, but obviously not atomic wrt multiple CPUs or more than one
	 * level of reentrancy. If a signal interrupts this after reading @state->in_use, but before
	 * writing @state->in_use, there is still no race, because the signal handler will run to
	 * its completion before returning execution.
	 */
	in_use = READ_ONCE(state->in_use);
	if (unlikely(in_use))
		/* The syscall simply fills the buffer and does not touch @state, so fallback. */
		goto fallback_syscall;
	WRITE_ONCE(state->in_use, true);

retry_generation:
	/*
	 * @rng_info->generation must always be read here, as it serializes @state->key with the
	 * kernel's RNG reseeding schedule.
	 */
	current_generation = READ_ONCE(rng_info->generation);

	/*
	 * If @state->generation doesn't match the kernel RNG's generation, then it means the
	 * kernel's RNG has reseeded, and so @state->key is reseeded as well.
	 */
	if (unlikely(state->generation != current_generation)) {
		/*
		 * Write the generation before filling the key, in case of fork. If there is a fork
		 * just after this line, the parent and child will get different random bytes from
		 * the syscall, which is good. However, were this line to occur after the getrandom
		 * syscall, then both child and parent could have the same bytes and the same
		 * generation counter, so the fork would not be detected. Therefore, write
		 * @state->generation before the call to the getrandom syscall.
		 */
		WRITE_ONCE(state->generation, current_generation);

		/*
		 * Prevent the syscall from being reordered wrt current_generation. Pairs with the
		 * smp_store_release(&_vdso_rng_data.generation) in random.c.
		 */
		smp_rmb();

		/* Reseed @state->key using fresh bytes from the kernel. */
		if (getrandom_syscall(state->key, sizeof(state->key), 0) != sizeof(state->key)) {
			/*
			 * If the syscall failed to refresh the key, then @state->key is now
			 * invalid, so invalidate the generation so that it is not used again, and
			 * fallback to using the syscall entirely.
			 */
			WRITE_ONCE(state->generation, 0);

			/*
			 * Set @state->in_use to false only after the last write to @state in the
			 * line above.
			 */
			WRITE_ONCE(state->in_use, false);

			goto fallback_syscall;
		}

		/*
		 * Set @state->pos to beyond the end of the batch, so that the batch is refilled
		 * using the new key.
		 */
		state->pos = sizeof(state->batch);
	}

	/* Set len to the total amount of bytes that this function is allowed to read, ret. */
	len = ret;

more_batch:
	/*
	 * First use bytes out of @state->batch, which may have been filled by the last call to this
	 * function.
	 */
	batch_len = min_t(size_t, sizeof(state->batch) - state->pos, len);
	if (batch_len) {
		/* Zeroing at the same time as memcpying helps preserve forward secrecy. */
		memcpy_and_zero_src(buffer, state->batch + state->pos, batch_len);
		state->pos += batch_len;
		buffer += batch_len;
		len -= batch_len;
	}

	if (!len) {
		/* Prevent the loop from being reordered wrt ->generation. */
		barrier();

		/*
		 * Since @rng_info->generation will never be 0, re-read @state->generation, rather
		 * than using the local current_generation variable, to learn whether a fork
		 * occurred or if @state was zeroed due to memory pressure. Primarily, though, this
		 * indicates whether the kernel's RNG has reseeded, in which case generate a new key
		 * and start over.
		 */
		if (unlikely(READ_ONCE(state->generation) != READ_ONCE(rng_info->generation))) {
			/*
			 * Prevent this from looping forever in case of low memory or racing with a
			 * user force-reseeding the kernel's RNG using the ioctl.
			 */
			if (have_retried) {
				WRITE_ONCE(state->in_use, false);
				goto fallback_syscall;
			}

			have_retried = true;
			buffer = orig_buffer;
			goto retry_generation;
		}

		/*
		 * Set @state->in_use to false only when there will be no more reads or writes of
		 * @state.
		 */
		WRITE_ONCE(state->in_use, false);
		return ret;
	}

	/* Generate blocks of RNG output directly into @buffer while there's enough room left. */
	nblocks = len / CHACHA_BLOCK_SIZE;
	if (nblocks) {
		__arch_chacha20_blocks_nostack(buffer, state->key, counter, nblocks);
		buffer += nblocks * CHACHA_BLOCK_SIZE;
		len -= nblocks * CHACHA_BLOCK_SIZE;
	}

	BUILD_BUG_ON(sizeof(state->batch_key) % CHACHA_BLOCK_SIZE != 0);

	/* Refill the batch and overwrite the key, in order to preserve forward secrecy. */
	__arch_chacha20_blocks_nostack(state->batch_key, state->key, counter,
				       sizeof(state->batch_key) / CHACHA_BLOCK_SIZE);

	/* Since the batch was just refilled, set the position back to 0 to indicate a full batch. */
	state->pos = 0;
	goto more_batch;

fallback_syscall:
	return getrandom_syscall(orig_buffer, orig_len, flags);
}

static __always_inline ssize_t
__cvdso_getrandom(void *buffer, size_t len, unsigned int flags, void *opaque_state, size_t opaque_len)
{
	return __cvdso_getrandom_data(__arch_get_vdso_rng_data(), buffer, len, flags, opaque_state, opaque_len);
}
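
/*
 * Illustrative usage sketch (not part of this file's build, and not authoritative): roughly how a
 * userspace caller might drive the interface documented above for __cvdso_getrandom_data(). The
 * function pointer name "vgetrandom" and the helper alloc_vgetrandom_state() are hypothetical;
 * the real entry point is whichever getrandom symbol the architecture's vDSO exports, resolved
 * however the program locates vDSO symbols (typically via the auxiliary vector, as a libc would).
 * The header providing struct vgetrandom_opaque_params is assumed to be the kernel UAPI
 * <linux/random.h>.
 *
 *	#include <stddef.h>
 *	#include <sys/types.h>
 *	#include <sys/mman.h>
 *	#include <linux/random.h>	// assumed: declares struct vgetrandom_opaque_params
 *
 *	// Assumed prototype, matching __cvdso_getrandom() above.
 *	typedef ssize_t (*vgetrandom_fn)(void *buf, size_t len, unsigned int flags,
 *					 void *opaque_state, size_t opaque_len);
 *
 *	static void *alloc_vgetrandom_state(vgetrandom_fn vgetrandom, size_t *state_len)
 *	{
 *		struct vgetrandom_opaque_params params;
 *		void *state;
 *
 *		// Query allocation parameters: @buffer, @len, and @flags are 0, @opaque_len is ~0UL.
 *		if (vgetrandom(NULL, 0, 0, &params, ~0UL) != 0)
 *			return NULL;
 *
 *		// The state must be mmap'd with exactly the prot/flags the kernel prescribes
 *		// (PROT_READ|PROT_WRITE and MAP_DROPPABLE|MAP_ANONYMOUS above), one state per thread.
 *		state = mmap(NULL, params.size_of_opaque_state, params.mmap_prot,
 *			     params.mmap_flags, -1, 0);
 *		if (state == MAP_FAILED)
 *			return NULL;
 *		*state_len = params.size_of_opaque_state;
 *		return state;
 *	}
 *
 *	// Then, on each thread's own state:
 *	//	ssize_t n = vgetrandom(buf, buf_len, 0, state, state_len);
 */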