#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/uaccess.h>
#include <linux/cpu.h>

#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt

#include <asm/kaiser.h>
#include <asm/tlbflush.h>	/* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#include <asm/cmdline.h>
#include <asm/vsyscall.h>
#include <asm/sections.h>
int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled);	/* for inlined TLB flush functions */
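/*
 * Per-cpu scratch slot, itself mapped into the user page tables, where
 * the entry assembly can stash a register while it is still running on
 * the user CR3 and the kernel stack is not yet accessible.
 */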
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
/*
 * These can have bit 63 set, so we can not just use a plain "or"
 * instruction to get their value or'd into CR3.  It would take
 * another register.  So, we use a memory reference to these instead.
 *
 * This is also handy because systems that do not support PCIDs
 * just end up or'ing a 0 into their CR3, which does no harm.
 */
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
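/*
 * x86_cr3_pcid_user holds the bits or'd into CR3 on return to userspace:
 * the shadow-PGD offset plus, on PCID-capable hardware, the user PCID and
 * possibly the NOFLUSH bit (bit 63).  kaiser_setup_pcid() initializes it,
 * and kaiser_flush_tlb_on_return_to_user() switches it to the FLUSH
 * encoding when the user TLB must be flushed; see below.
 */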
/*
 * At runtime, the only things we map are some things for CPU
 * hotplug, and stacks for new processes.  No two CPUs will ever
 * be populating the same addresses, so we only need to ensure
 * that we protect between two CPUs trying to allocate and
 * populate the same page table page.
 *
 * Only take this lock when doing a set_p[4um]d(), but it is not
 * needed for doing a set_pte().  We assume that only the *owner*
 * of a given allocation will be doing this for _their_
 * allocation.
 *
 * This ensures that once a system has been running for a while
 * and there have been stacks all over and these page tables
 * are fully populated, there will be no further acquisitions of
 * this lock.
 */
static DEFINE_SPINLOCK(shadow_table_allocation_lock);
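/*
 * kaiser_pagetable_walk() below uses a check-lock-recheck pattern under
 * this lock: a level is tested without the lock, a page is allocated,
 * and the test is repeated with the lock held before installing it, so
 * the lock is only ever contended while the shadow tables are still
 * being filled in.
 */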
/*
 * Look up the physical address backing @vaddr in the kernel page
 * tables.  Returns -1 on error.
 */
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
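	/*
	 * The shadow mapping must point at the same physical pages as the
	 * kernel's own mapping of @vaddr, which is why kaiser_add_user_map()
	 * resolves the physical address here before writing shadow PTEs.
	 */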
	pgd = pgd_offset_k(vaddr);
	/*
	 * We made all the kernel PGDs present in kaiser_init().
	 * We expect them to stay that way.
	 */
	BUG_ON(pgd_none(*pgd));
	/*
	 * PGDs are either 512GB or 128TB on all x86_64
	 * configurations.  We don't handle these.
	 */
	BUG_ON(pgd_large(*pgd));
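	/*
	 * Walk down the remaining levels.  Huge pud/pmd entries map 1GB/2MB
	 * regions directly, so for those the physical address is computed
	 * from the large-page frame plus the offset within it.
	 */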
	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pud_large(*pud))
		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pmd_large(*pmd))
		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
	pte = pte_offset_kernel(pmd, vaddr);
	if (pte_none(*pte)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}
/*
 * This is a relatively normal page table walk, except that it
 * also tries to allocate page table pages along the way.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *kaiser_pagetable_walk(unsigned long address, bool user)
{
	pmd_t *pmd;
	pud_t *pud;
	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	unsigned long prot = _KERNPG_TABLE;
	if (pgd_none(*pgd)) {
		WARN_ONCE(1, "All shadow pgds should have been populated");
		return NULL;
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);
	if (user) {
		/*
		 * The vsyscall page is the only page that will have
		 * _PAGE_USER set.  Catch everything else.
		 */
		BUG_ON(address != VSYSCALL_ADDR);

		set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
		prot = _PAGE_TABLE;
	}
	pud = pud_offset(pgd, address);
	/* The shadow page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pud_none(*pud)) {
			set_pud(pud, __pud(prot | __pa(new_pmd_page)));
			__inc_zone_page_state(virt_to_page((void *)
						new_pmd_page), NR_KAISERTABLE);
		} else
			free_page(new_pmd_page);
		spin_unlock(&shadow_table_allocation_lock);
	}
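	/*
	 * NB: pages allocated for the shadow tables are accounted via the
	 * NR_KAISERTABLE vmstat counter (the __inc_zone_page_state() calls
	 * above and below), so their memory overhead remains visible.
	 */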
	pmd = pmd_offset(pud, address);
	/* The shadow page tables do not use large mappings: */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}
	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pmd_none(*pmd)) {
			set_pmd(pmd, __pmd(prot | __pa(new_pte_page)));
			__inc_zone_page_state(virt_to_page((void *)
						new_pte_page), NR_KAISERTABLE);
		} else
			free_page(new_pte_page);
		spin_unlock(&shadow_table_allocation_lock);
	}
	return pte_offset_kernel(pmd, address);
}
static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
			       unsigned long flags)
{
	int ret = 0;
	pte_t *pte;
	unsigned long start_addr = (unsigned long)__start_addr;
	unsigned long address = start_addr & PAGE_MASK;
	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
	unsigned long target_address;
	/*
	 * It is convenient for callers to pass in __PAGE_KERNEL etc,
	 * and there is no actual harm from setting _PAGE_GLOBAL, so
	 * long as CR4.PGE is not set.  But it is nonetheless troubling
	 * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
	 * requires that not to be #defined to 0): so mask it off here.
	 */
	flags &= ~_PAGE_GLOBAL;
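	/*
	 * Likewise strip _PAGE_NX when the CPU does not support NX: on such
	 * hardware bit 63 is a reserved PTE bit, and leaving it set would
	 * trigger reserved-bit page faults.
	 */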
	if (!(__supported_pte_mask & _PAGE_NX))
		flags &= ~_PAGE_NX;
	for (; address < end_addr; address += PAGE_SIZE) {
		target_address = get_pa_from_mapping(address);
		if (target_address == -1) {
			ret = -EIO;
			break;
		}
		pte = kaiser_pagetable_walk(address, flags & _PAGE_USER);
		if (!pte) {
			ret = -ENOMEM;
			break;
		}
		if (pte_none(*pte)) {
			set_pte(pte, __pte(flags | target_address));
		} else {
			pte_t tmp;
			set_pte(&tmp, __pte(flags | target_address));
			WARN_ON_ONCE(!pte_same(*pte, tmp));
		}
	}
	return ret;
}
static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
	unsigned long size = end - start;

	return kaiser_add_user_map(start, size, flags);
}
/*
 * Ensure that the top level of the (shadow) page tables are
 * entirely populated.  This ensures that all processes that get
 * forked have the same entries.  This way, we do not have to
 * ever go set up new entries in older processes.
 *
 * Note: we never free these, so there are no updates to them
 * after this.
 */
static void __init kaiser_init_all_pgds(void)
{
	pgd_t *pgd;
	int i;
	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
		pgd_t new_pgd;
		pud_t *pud = pud_alloc_one(&init_mm,
					   PAGE_OFFSET + i * PGDIR_SIZE);
		if (!pud) {
			WARN_ON(1);
			break;
		}
		inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
		new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
		/*
		 * Make sure not to stomp on some other pgd entry.
		 */
		if (!pgd_none(pgd[i])) {
			WARN_ON(1);
			continue;
		}
		set_pgd(pgd + i, new_pgd);
	}
}
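/*
 * Boot-time wrappers: a mapping failure this early cannot usefully be
 * handled, so the *_early variants just WARN on a non-zero return.
 */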
#define kaiser_add_user_map_early(start, size, flags) do {	\
	int __ret = kaiser_add_user_map(start, size, flags);	\
	WARN_ON(__ret);						\
} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
	WARN_ON(__ret);							\
} while (0)
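/*
 * Decide at boot time whether KAISER should be enabled.  "pti=on|off|auto"
 * and "nopti" on the command line are honoured, as is cpu_mitigations_off().
 * Xen PV guests are silently disabled, and in the "auto" case AMD CPUs are
 * also disabled, since they are not considered affected by Meltdown.
 */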
void __init kaiser_check_boottime_disable(void)
{
	bool enable = true;
	char arg[5];
	int ret;

	if (boot_cpu_has(X86_FEATURE_XENPV))
		goto silent_disable;

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0) {
		if (!strncmp(arg, "on", 2))
			goto enable;

		if (!strncmp(arg, "off", 3))
			goto disable;

		if (!strncmp(arg, "auto", 4))
			goto skip;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti") ||
	    cpu_mitigations_off())
		goto disable;

skip:
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		goto disable;

enable:
	if (enable)
		setup_force_cpu_cap(X86_FEATURE_KAISER);

	return;

disable:
	pr_info("disabled\n");

silent_disable:
	kaiser_enabled = 0;
	setup_clear_cpu_cap(X86_FEATURE_KAISER);
}
/*
 * If anything in here fails, we will likely die on one of the
 * first kernel->user transitions and init will die.  But, we
 * will have most of the kernel up by then and should be able to
 * get a clean warning out of it.  If we BUG_ON() here, we run
 * the risk of crashing before we have good console output.
 */
void __init kaiser_init(void)
{
	int cpu;

	if (!kaiser_enabled)
		return;

	kaiser_init_all_pgds();
	/*
	 * Note that this sets _PAGE_USER and it needs to happen when the
	 * pagetable hierarchy gets created, i.e., early.  Otherwise
	 * kaiser_pagetable_walk() will encounter initialized PTEs in the
	 * hierarchy and not set the proper permissions, leading to
	 * page-protection faults when trying to read the vsyscall page,
	 * for example.
	 */
	if (vsyscall_enabled())
		kaiser_add_user_map_early((void *)VSYSCALL_ADDR,
					  PAGE_SIZE,
					  vsyscall_pgprot);
	for_each_possible_cpu(cpu) {
		void *percpu_vaddr = __per_cpu_user_mapped_start +
				     per_cpu_offset(cpu);
		unsigned long percpu_sz = __per_cpu_user_mapped_end -
					  __per_cpu_user_mapped_start;
		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
					  __PAGE_KERNEL);
	}
	/*
	 * Map the entry/exit text section, which is needed at
	 * switches from user to and from kernel.
	 */
	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
				       __PAGE_KERNEL_RX);
#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
				       __irqentry_text_end,
				       __PAGE_KERNEL_RX);
#endif
	kaiser_add_user_map_early((void *)idt_descr.address,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
	kaiser_add_user_map_early(&trace_idt_descr,
				  sizeof(trace_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&trace_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);
#endif
	kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&debug_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);

	pr_info("enabled\n");
}
/* Add a mapping to the shadow mapping, and synchronize the mappings */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
	return kaiser_add_user_map((const void *)addr, size, flags);
}
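/*
 * Illustrative (hypothetical) caller: a subsystem that needs one of its
 * per-cpu structures visible to the user page tables could do roughly
 *
 *	ret = kaiser_add_mapping((unsigned long)ptr, sizeof(*ptr),
 *				 __PAGE_KERNEL);
 *
 * and later undo it with kaiser_remove_mapping((unsigned long)ptr,
 * sizeof(*ptr)).  "ptr" here is only an example, not a symbol in this file.
 */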
void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
	extern void unmap_pud_range_nofree(pgd_t *pgd,
					   unsigned long start, unsigned long end);
	unsigned long end = start + size;
	unsigned long addr, next;
	pgd_t *pgd;

	pgd = native_get_shadow_pgd(pgd_offset_k(start));
	for (addr = start; addr < end; pgd++, addr = next) {
		next = pgd_addr_end(addr, end);
		unmap_pud_range_nofree(pgd, addr, next);
	}
}
/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(pgd_t *pgdp)
{
	return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
}
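/*
 * With 4-level paging a pgd page holds 512 eight-byte entries, so the
 * userspace half (indices 0-255) occupies the first 2048 bytes of the
 * page; the "% PAGE_SIZE < PAGE_SIZE / 2" test above checks exactly that.
 */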
pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
	if (!kaiser_enabled)
		return pgd;
	/*
	 * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
	 * skip cases like kexec and EFI which make temporary low mappings.
	 */
	if (pgd.pgd & _PAGE_USER) {
		if (is_userspace_pgd(pgdp)) {
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
			/*
			 * Even if the entry is *mapping* userspace, ensure
			 * that userspace can not use it.  This way, if we
			 * get out to userspace running on the kernel CR3,
			 * userspace will crash instead of running.
			 */
			if (__supported_pte_mask & _PAGE_NX)
				pgd.pgd |= _PAGE_NX;
		}
	} else if (!pgd.pgd) {
		/*
		 * pgd_clear() cannot check _PAGE_USER, and is even used to
		 * clear corrupted pgd entries: so just rely on cases like
		 * kexec and EFI never to be using pgd_clear().
		 */
		if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
		    is_userspace_pgd(pgdp))
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
	}
	return pgd;
}
void kaiser_setup_pcid(void)
{
	unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;

	if (this_cpu_has(X86_FEATURE_PCID))
		user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
	/*
	 * These variables are used by the entry/exit
	 * code to change the PCID and pgd, and for TLB flushing.
	 */
	this_cpu_write(x86_cr3_pcid_user, user_cr3);
}
/*
 * Make a note that this CPU will need to flush the user TLB on return to
 * user.  If the CPU does not have PCID, then the NOFLUSH bit will never
 * have been set.
 */
void kaiser_flush_tlb_on_return_to_user(void)
{
	if (this_cpu_has(X86_FEATURE_PCID))
		this_cpu_write(x86_cr3_pcid_user,
			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);