// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright 2007 Andi Kleen, SUSE Labs.
 *
 * This contains most of the x86 vDSO kernel-side code.
 */
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
#include <linux/ptrace.h>
#include <linux/time_namespace.h>

#include <asm/pvclock.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
#include <asm/vvar.h>
#include <asm/page.h>
#include <asm/cpufeature.h>
#include <clocksource/hyperv_timer.h>
#undef _ASM_X86_VVAR_H
#define EMIT_VVAR(name, offset)	\
	const size_t name ## _offset = offset;
#include <asm/vvar.h>

struct vdso_data *arch_get_vdso_data(void *vvar_page)
{
	return (struct vdso_data *)(vvar_page + _vdso_data_offset);
}
#undef EMIT_VVAR
unsigned int vclocks_used __read_mostly;

#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
#endif
void __init init_vdso_image(const struct vdso_image *image)
{
	BUG_ON(image->size % PAGE_SIZE != 0);

	apply_alternatives((struct alt_instr *)(image->data + image->alt),
			   (struct alt_instr *)(image->data + image->alt +
						image->alt_len));
}
static const struct vm_special_mapping vvar_mapping;
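/*
 * Fault handler for the [vdso] mapping: hand back the page of vdso
 * text/data that backs the faulting offset.
 */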
static vm_fault_t vdso_fault(const struct vm_special_mapping *sm,
		      struct vm_area_struct *vma, struct vm_fault *vmf)
{
	const struct vdso_image *image = vma->vm_mm->context.vdso_image;

	if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
		return VM_FAULT_SIGBUS;

	vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
	get_page(vmf->page);

	return 0;
}
static void vdso_fix_landing(const struct vdso_image *image,
		struct vm_area_struct *new_vma)
{
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
	if (in_ia32_syscall() && image == &vdso_image_32) {
		struct pt_regs *regs = current_pt_regs();
		unsigned long vdso_land = image->sym_int80_landing_pad;
		unsigned long old_land_addr = vdso_land +
			(unsigned long)current->mm->context.vdso;

		/* Fixing userspace landing - look at do_fast_syscall_32 */
		if (regs->ip == old_land_addr)
			regs->ip = new_vma->vm_start + vdso_land;
	}
#endif
}
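/*
 * mremap() of the vdso VMA: keep context.vdso pointing at the new
 * location and let vdso_fix_landing() adjust a saved return IP that
 * still points into the old mapping.
 */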
static int vdso_mremap(const struct vm_special_mapping *sm,
		struct vm_area_struct *new_vma)
{
	const struct vdso_image *image = current->mm->context.vdso_image;

	vdso_fix_landing(image, new_vma);
	current->mm->context.vdso = (void __user *)new_vma->vm_start;

	return 0;
}
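/*
 * Tasks in a non-root time namespace get a namespace-specific vvar page
 * holding per-clock offsets; look it up for the faulting task.
 */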
#ifdef CONFIG_TIME_NS
static struct page *find_timens_vvar_page(struct vm_area_struct *vma)
{
	if (likely(vma->vm_mm == current->mm))
		return current->nsproxy->time_ns->vvar_page;

	/*
	 * VM_PFNMAP | VM_IO protect .fault() handler from being called
	 * through interfaces like /proc/$pid/mem or
	 * process_vm_{readv,writev}() as long as there's no .access()
	 * in special_mapping_vmops().
	 * For more details check_vma_flags() and __access_remote_vm()
	 */
	WARN(1, "vvar_page accessed remotely");

	return NULL;
}
/*
 * The vvar page layout depends on whether a task belongs to the root or
 * non-root time namespace. Whenever a task changes its namespace, the VVAR
 * page tables are cleared and then they will be re-faulted with a
 * corresponding layout.
 * See also the comment near timens_setup_vdso_data() for details.
 */
int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
	struct mm_struct *mm = task->mm;
	struct vm_area_struct *vma;

	mmap_read_lock(mm);

	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		unsigned long size = vma->vm_end - vma->vm_start;

		if (vma_is_special_mapping(vma, &vvar_mapping))
			zap_page_range(vma, vma->vm_start, size);
	}

	mmap_read_unlock(mm);

	return 0;
}
#else
static inline struct page *find_timens_vvar_page(struct vm_area_struct *vma)
{
	return NULL;
}
#endif
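/*
 * Fault handler for the [vvar] mapping: depending on which symbol offset
 * the fault hits, insert the vvar, pvclock, hvclock or time-namespace
 * page by PFN.
 */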
static vm_fault_t vvar_fault(const struct vm_special_mapping *sm,
		      struct vm_area_struct *vma, struct vm_fault *vmf)
{
	const struct vdso_image *image = vma->vm_mm->context.vdso_image;
	unsigned long pfn;
	long sym_offset;

	if (!image)
		return VM_FAULT_SIGBUS;

	sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
		image->sym_vvar_start;

	/*
	 * Sanity check: a symbol offset of zero means that the page
	 * does not exist for this vdso image, not that the page is at
	 * offset zero relative to the text mapping.  This should be
	 * impossible here, because sym_offset should only be zero for
	 * the page past the end of the vvar mapping.
	 */
	if (sym_offset == 0)
		return VM_FAULT_SIGBUS;

	if (sym_offset == image->sym_vvar_page) {
		struct page *timens_page = find_timens_vvar_page(vma);

		pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT;

		/*
		 * If a task belongs to a time namespace then a namespace
		 * specific VVAR is mapped with the sym_vvar_page offset and
		 * the real VVAR page is mapped with the sym_timens_page
		 * offset.
		 * See also the comment near timens_setup_vdso_data().
		 */
		if (timens_page) {
			unsigned long addr;
			vm_fault_t err;

			/*
			 * Optimization: inside time namespace pre-fault
			 * VVAR page too. As on timens page there are only
			 * offsets for clocks on VVAR, it'll be faulted
			 * shortly by VDSO code.
			 */
			addr = vmf->address + (image->sym_timens_page - sym_offset);
			err = vmf_insert_pfn(vma, addr, pfn);
			if (unlikely(err & VM_FAULT_ERROR))
				return err;

			pfn = page_to_pfn(timens_page);
		}

		return vmf_insert_pfn(vma, vmf->address, pfn);
	} else if (sym_offset == image->sym_pvclock_page) {
		struct pvclock_vsyscall_time_info *pvti =
			pvclock_get_pvti_cpu0_va();

		if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) {
			return vmf_insert_pfn_prot(vma, vmf->address,
					__pa(pvti) >> PAGE_SHIFT,
					pgprot_decrypted(vma->vm_page_prot));
		}
	} else if (sym_offset == image->sym_hvclock_page) {
		struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();

		if (tsc_pg && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK))
			return vmf_insert_pfn(vma, vmf->address,
					virt_to_phys(tsc_pg) >> PAGE_SHIFT);
	} else if (sym_offset == image->sym_timens_page) {
		struct page *timens_page = find_timens_vvar_page(vma);

		if (!timens_page)
			return VM_FAULT_SIGBUS;

		pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT;
		return vmf_insert_pfn(vma, vmf->address, pfn);
	}

	return VM_FAULT_SIGBUS;
}
static const struct vm_special_mapping vdso_mapping = {
	.name = "[vdso]",
	.fault = vdso_fault,
	.mremap = vdso_mremap,
};
static const struct vm_special_mapping vvar_mapping = {
	.name = "[vvar]",
	.fault = vvar_fault,
};
/*
 * Add vdso and vvar mappings to current process.
 * @image	- blob to map
 * @addr	- request a specific address (zero to map at free addr)
 */
static int map_vdso(const struct vdso_image *image, unsigned long addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long text_start;
	int ret = 0;

	if (mmap_write_lock_killable(mm))
		return -EINTR;

	addr = get_unmapped_area(NULL, addr,
				 image->size - image->sym_vvar_start, 0, 0);
	if (IS_ERR_VALUE(addr)) {
		ret = addr;
		goto up_fail;
	}

	text_start = addr - image->sym_vvar_start;

	/*
	 * MAYWRITE to allow gdb to COW and set breakpoints
	 */
	vma = _install_special_mapping(mm,
				       text_start,
				       image->size,
				       VM_READ|VM_EXEC|
				       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
				       &vdso_mapping);

	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto up_fail;
	}

	vma = _install_special_mapping(mm,
				       addr,
				       -image->sym_vvar_start,
				       VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
				       VM_PFNMAP,
				       &vvar_mapping);

	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		do_munmap(mm, text_start, image->size, NULL);
	} else {
		current->mm->context.vdso = (void __user *)text_start;
		current->mm->context.vdso_image = image;
	}

up_fail:
	mmap_write_unlock(mm);
	return ret;
}
#ifdef CONFIG_X86_64
/*
 * Put the vdso above the (randomized) stack with another randomized
 * offset.  This way there is no hole in the middle of address space.
 * To save memory make sure it is still in the same PTE as the stack
 * top.  This doesn't give that many random bits.
 *
 * Note that this algorithm is imperfect: the distribution of the vdso
 * start address within a PMD is biased toward the end.
 *
 * Only used for the 64-bit and x32 vdsos.
 */
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
	unsigned long addr, end;
	unsigned offset;

	/*
	 * Round up the start address.  It can start out unaligned as a result
	 * of stack start randomization.
	 */
	start = PAGE_ALIGN(start);

	/* Round the lowest possible end address up to a PMD boundary. */
	end = (start + len + PMD_SIZE - 1) & PMD_MASK;
	if (end >= TASK_SIZE_MAX)
		end = TASK_SIZE_MAX;
	end -= len;

	if (end > start) {
		offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
		addr = start + (offset << PAGE_SHIFT);
	} else {
		addr = start;
	}

	/*
	 * Forcibly align the final address in case we have a hardware
	 * issue that requires alignment for performance reasons.
	 */
	addr = align_vdso_addr(addr);

	return addr;
}
static int map_vdso_randomized(const struct vdso_image *image)
{
	unsigned long addr = vdso_addr(current->mm->start_stack,
				       image->size - image->sym_vvar_start);

	return map_vdso(image, addr);
}
#endif
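/*
 * Map the vdso exactly once at a caller-supplied address; fails with
 * -EEXIST if a vdso or vvar mapping is already present.
 */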
int map_vdso_once(const struct vdso_image *image, unsigned long addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	mmap_write_lock(mm);
	/*
	 * Check if we have already mapped vdso blob - fail to prevent
	 * abusing from userspace install_special_mapping, which may
	 * not do accounting and rlimit right.
	 * We could search vma near context.vdso, but it's a slowpath,
	 * so let's explicitly check all VMAs to be completely sure.
	 */
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma_is_special_mapping(vma, &vdso_mapping) ||
				vma_is_special_mapping(vma, &vvar_mapping)) {
			mmap_write_unlock(mm);
			return -EEXIST;
		}
	}
	mmap_write_unlock(mm);

	return map_vdso(image, addr);
}
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static int load_vdso32(void)
{
	if (vdso32_enabled != 1)  /* Other values all mean "disabled" */
		return 0;

	return map_vdso(&vdso_image_32, 0);
}
#endif
#ifdef CONFIG_X86_64
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	if (!vdso64_enabled)
		return 0;

	return map_vdso_randomized(&vdso_image_64);
}
#ifdef CONFIG_COMPAT
int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
				       int uses_interp, bool x32)
{
#ifdef CONFIG_X86_X32_ABI
	if (x32) {
		if (!vdso64_enabled)
			return 0;
		return map_vdso_randomized(&vdso_image_x32);
	}
#endif
#ifdef CONFIG_IA32_EMULATION
	return load_vdso32();
#else
	return 0;
#endif
}
#endif
#else
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	return load_vdso32();
}
#endif
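/*
 * True when the interrupted user IP is one of the 32-bit vdso sigreturn
 * landing pads, i.e. the "syscall" is really the sigreturn trampoline.
 */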
bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs)
{
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
	const struct vdso_image *image = current->mm->context.vdso_image;
	unsigned long vdso = (unsigned long) current->mm->context.vdso;

	if (in_ia32_syscall() && image == &vdso_image_32) {
		if (regs->ip == vdso + image->sym_vdso32_sigreturn_landing_pad ||
		    regs->ip == vdso + image->sym_vdso32_rt_sigreturn_landing_pad)
			return true;
	}
#endif
	return false;
}
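/*
 * Command-line and boot-time setup for the 64-bit vdso.
 */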
#ifdef CONFIG_X86_64
static __init int vdso_setup(char *s)
{
	vdso64_enabled = simple_strtoul(s, NULL, 0);
	return 1;
}
__setup("vdso=", vdso_setup);
static int __init init_vdso(void)
{
	BUILD_BUG_ON(VDSO_CLOCKMODE_MAX >= 32);

	init_vdso_image(&vdso_image_64);

#ifdef CONFIG_X86_X32_ABI
	init_vdso_image(&vdso_image_x32);
#endif

	return 0;
}
subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */