// SPDX-License-Identifier: GPL-2.0
/*
 *  Implement mseal() syscall.
 *
 *  Copyright (c) 2023,2024 Google, Inc.
 *
 *  Author: Jeff Xu <jeffxu@chromium.org>
 */

#include <linux/mempolicy.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_context.h>
#include <linux/syscalls.h>
#include <linux/sched.h>
#include "internal.h"
static inline void set_vma_sealed(struct vm_area_struct *vma)
{
	vm_flags_set(vma, VM_SEALED);
}
static bool is_madv_discard(int behavior)
{
	switch (behavior) {
	case MADV_FREE:
	case MADV_DONTNEED:
	case MADV_DONTNEED_LOCKED:
	case MADV_REMOVE:
	case MADV_DONTFORK:
	case MADV_WIPEONFORK:
	case MADV_GUARD_INSTALL:
		return true;
	}

	return false;
}
static bool is_ro_anon(struct vm_area_struct *vma)
{
	/* check anonymous mapping. */
	if (vma->vm_file || vma->vm_flags & VM_SHARED)
		return false;

	/*
	 * check for non-writable:
	 * PROT=RO or PKRU is not writable.
	 */
	if (!(vma->vm_flags & VM_WRITE) ||
		!arch_vma_access_permitted(vma, true, false, false))
		return true;

	return false;
}
/*
 * Check if a vma is allowed to be modified by madvise.
 */
bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
{
	if (!is_madv_discard(behavior))
		return true;

	if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
		return false;

	/* Allow by default. */
	return true;
}
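
/*
 * Illustrative userspace view of the check above (a sketch, not part
 * of the original file): a discard-style madvise() on a sealed,
 * read-only anonymous mapping is refused, since discarding its pages
 * would effectively memset(0) memory the caller cannot write.
 *
 *	void *p = mmap(NULL, len, PROT_READ,
 *		       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 *	mseal(p, len, 0);
 *	madvise(p, len, MADV_DONTNEED);	// fails with EPERM
 */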
static int mseal_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma,
		struct vm_area_struct **prev, unsigned long start,
		unsigned long end, vm_flags_t newflags)
{
	int ret = 0;
	vm_flags_t oldflags = vma->vm_flags;

	if (newflags == oldflags)
		goto out;

	vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags);
	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto out;
	}

	set_vma_sealed(vma);
out:
	*prev = vma;
	return ret;
}
/*
 * Check for do_mseal:
 * 1> start is part of a valid vma.
 * 2> end is part of a valid vma.
 * 3> No gap (unallocated address) between start and end.
 */
static int check_mm_seal(unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	unsigned long nstart = start;

	VMA_ITERATOR(vmi, current->mm, start);

	/* going through each vma to check. */
	for_each_vma_range(vmi, vma, end) {
		if (vma->vm_start > nstart)
			/* unallocated memory found. */
			return -ENOMEM;

		if (vma->vm_end >= end)
			return 0;

		nstart = vma->vm_end;
	}

	return -ENOMEM;
}
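
/*
 * A sketch of what the walk above rejects: vma1 ends below 'end' and
 * the following vma starts above nstart, so the unallocated gap is
 * caught and -ENOMEM returned before anything is sealed.
 *
 *	start                                 end
 *	  |---- vma1 ----|  (gap)  |-- vma2 --|
 */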
/*
 * Apply sealing.
 */
static int apply_mm_seal(unsigned long start, unsigned long end)
{
	unsigned long nstart;
	struct vm_area_struct *vma, *prev;

	VMA_ITERATOR(vmi, current->mm, start);

	vma = vma_iter_load(&vmi);
	/*
	 * Note: check_mm_seal() has already checked the ENOMEM case,
	 * so vma should not be NULL; same for the other ENOMEM cases.
	 */
	prev = vma_prev(&vmi);
	if (start > vma->vm_start)
		prev = vma;

	nstart = start;
	for_each_vma_range(vmi, vma, end) {
		int error;
		unsigned long tmp;
		vm_flags_t newflags;

		newflags = vma->vm_flags | VM_SEALED;
		tmp = vma->vm_end;
		if (tmp > end)
			tmp = end;
		error = mseal_fixup(&vmi, vma, &prev, nstart, tmp, newflags);
		if (error)
			return error;
		nstart = vma_iter_end(&vmi);
	}

	return 0;
}
/*
 * mseal(2) seals the VM's meta data from
 * selected syscalls.
 *
 * addr/len: VM address range.
 *
 *  The address range by addr/len must meet:
 *   start (addr) must be in a valid VMA.
 *   end (addr + len) must be in a valid VMA.
 *   no gap (unallocated memory) between start and end.
 *   start (addr) must be page aligned.
 *
 *  len: len will be page aligned implicitly.
 *
 *  Below VMA operations are blocked after sealing.
 *  1> Unmapping, moving to another location, and shrinking
 *     the size, via munmap() and mremap(); these can leave an empty
 *     space, which could then be replaced with a VMA with a new
 *     set of attributes.
 *  2> Moving or expanding a different vma into the current location,
 *     via mremap().
 *  3> Modifying a VMA via mmap(MAP_FIXED).
 *  4> Size expansion, via mremap(), does not appear to pose any
 *     specific risks to sealed VMAs. It is included anyway because
 *     the use case is unclear. In any case, users can rely on
 *     merging to expand a sealed VMA.
 *  5> mprotect() and pkey_mprotect().
 *  6> Some destructive madvise() behaviors (e.g. MADV_DONTNEED)
 *     for anonymous memory, when users don't have write permission to
 *     the memory. Those behaviors can alter region contents by
 *     discarding pages, effectively a memset(0) for anonymous memory.
 *
 * flags: reserved.
 *
 * return values:
 *  zero: success.
 *  -EINVAL:
 *   invalid input flags.
 *   start address is not page aligned.
 *   Address range (start + len) overflow.
 *  -ENOMEM:
 *   addr is not a valid address (not allocated).
 *   end (start + len) is not a valid address.
 *   a gap (unallocated memory) between start and end.
 *  -EPERM:
 *   On 32-bit architectures, sealing is not supported.
 *
 * Note:
 *  Users can call mseal(2) multiple times; sealing already-sealed
 *  memory is a no-op (no error).
 *
 *  unseal() is not supported.
 */
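/*
 * Illustrative userspace usage, a minimal sketch (not part of this
 * file; it assumes a libc without an mseal() wrapper, hence raw
 * syscall(2) with __NR_mseal):
 *
 *	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		       MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 *	// ... write the data, then drop write permission ...
 *	mprotect(p, len, PROT_READ);
 *	if (syscall(__NR_mseal, p, len, 0))
 *		perror("mseal");
 *	// From here on, munmap(), mremap(), mprotect() and
 *	// mmap(MAP_FIXED) over [p, p + len) fail with EPERM.
 */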
int do_mseal(unsigned long start, size_t len_in, unsigned long flags)
{
	size_t len;
	int ret = 0;
	unsigned long end;
	struct mm_struct *mm = current->mm;

	ret = can_do_mseal(flags);
	if (ret)
		return ret;

	start = untagged_addr(start);
	if (!PAGE_ALIGNED(start))
		return -EINVAL;

	len = PAGE_ALIGN(len_in);
	/* Check to see whether len was rounded up from small -ve to zero. */
	if (len_in && !len)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	if (end == start)
		return 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	/*
	 * First pass: this helps to avoid partial sealing in case of an
	 * error in the input address range, e.g. an ENOMEM error.
	 */
	ret = check_mm_seal(start, end);
	if (ret)
		goto out;

	/*
	 * Second pass: this should succeed, unless there are errors
	 * from vma_modify_flags(), e.g. a merge/split error, or the
	 * process reaching the max supported number of VMAs; however,
	 * those cases shall be rare.
	 */
	ret = apply_mm_seal(start, end);

out:
	mmap_write_unlock(current->mm);
	return ret;
}
SYSCALL_DEFINE3(mseal, unsigned long, start, size_t, len, unsigned long,
		flags)
{
	return do_mseal(start, len, flags);
}