kaiser: kaiser_flush_tlb_on_return_to_user() check PCID
arch/x86/mm/kaiser.c

#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>

#include <asm/kaiser.h>
#include <asm/tlbflush.h>	/* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#include <asm/cmdline.h>
int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled);	/* for inlined TLB flush functions */

__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

/*
 * These can have bit 63 set, so we can not just use a plain "or"
 * instruction to get their value or'd into CR3.  It would take
 * another register.  So, we use a memory reference to these instead.
 *
 * This is also handy because systems that do not support PCIDs
 * just end up or'ing a 0 into their CR3, which does no harm.
 */
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
/*
 * At runtime, the only things we map are some things for CPU
 * hotplug, and stacks for new processes.  No two CPUs will ever
 * be populating the same addresses, so we only need to ensure
 * that we protect between two CPUs trying to allocate and
 * populate the same page table page.
 *
 * Only take this lock when doing a set_p[4um]d(), but it is not
 * needed for doing a set_pte().  We assume that only the *owner*
 * of a given allocation will be doing this for _their_
 * allocation.
 *
 * This ensures that once a system has been running for a while
 * and there have been stacks all over and these page tables
 * are fully populated, there will be no further acquisitions of
 * this lock.
 */
static DEFINE_SPINLOCK(shadow_table_allocation_lock);
/*
 * Returns -1 on error.
 */
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(vaddr);
	/*
	 * We made all the kernel PGDs present in kaiser_init().
	 * We expect them to stay that way.
	 */
	BUG_ON(pgd_none(*pgd));
	/*
	 * PGDs are either 512GB or 128TB on all x86_64
	 * configurations.  We don't handle these.
	 */
	BUG_ON(pgd_large(*pgd));

	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pud_large(*pud))
		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);

	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pmd_large(*pmd))
		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);

	pte = pte_offset_kernel(pmd, vaddr);
	if (pte_none(*pte)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}
/*
 * This is a relatively normal page table walk, except that it
 * also tries to allocate page table pages along the way.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *kaiser_pagetable_walk(unsigned long address)
{
	pmd_t *pmd;
	pud_t *pud;
	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);

	if (pgd_none(*pgd)) {
		WARN_ONCE(1, "All shadow pgds should have been populated");
		return NULL;
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	pud = pud_offset(pgd, address);
	/* The shadow page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pud_none(*pud)) {
			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
			__inc_zone_page_state(virt_to_page((void *)
						new_pmd_page), NR_KAISERTABLE);
		} else
			free_page(new_pmd_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	pmd = pmd_offset(pud, address);
	/* The shadow page tables do not use large mappings: */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}
	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pmd_none(*pmd)) {
			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
			__inc_zone_page_state(virt_to_page((void *)
						new_pte_page), NR_KAISERTABLE);
		} else
			free_page(new_pte_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	return pte_offset_kernel(pmd, address);
}
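
/*
 * Map the kernel virtual range starting at __start_addr into the shadow
 * (user) page tables, pointing each PTE at the same physical page as the
 * kernel mapping.  Returns 0 on success, -EIO if a kernel mapping could
 * not be resolved, or -ENOMEM if a shadow page table page could not be
 * allocated.
 */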
static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
			       unsigned long flags)
{
	int ret = 0;
	pte_t *pte;
	unsigned long start_addr = (unsigned long)__start_addr;
	unsigned long address = start_addr & PAGE_MASK;
	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
	unsigned long target_address;

	/*
	 * It is convenient for callers to pass in __PAGE_KERNEL etc,
	 * and there is no actual harm from setting _PAGE_GLOBAL, so
	 * long as CR4.PGE is not set.  But it is nonetheless troubling
	 * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
	 * requires that not to be #defined to 0): so mask it off here.
	 */
	flags &= ~_PAGE_GLOBAL;

	for (; address < end_addr; address += PAGE_SIZE) {
		target_address = get_pa_from_mapping(address);
		if (target_address == -1) {
			ret = -EIO;
			break;
		}
		pte = kaiser_pagetable_walk(address);
		if (!pte) {
			ret = -ENOMEM;
			break;
		}
		if (pte_none(*pte)) {
			set_pte(pte, __pte(flags | target_address));
		} else {
			pte_t tmp;
			set_pte(&tmp, __pte(flags | target_address));
			WARN_ON_ONCE(!pte_same(*pte, tmp));
		}
	}
	return ret;
}
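
/* Convenience wrapper taking start/end pointers rather than a size. */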
static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
	unsigned long size = end - start;

	return kaiser_add_user_map(start, size, flags);
}
/*
 * Ensure that the top level of the (shadow) page tables is
 * entirely populated.  This ensures that all processes that get
 * forked have the same entries.  This way, we do not have to
 * ever go set up new entries in older processes.
 *
 * Note: we never free these, so there are no updates to them
 * after this.
 */
static void __init kaiser_init_all_pgds(void)
{
	pgd_t *pgd;
	int i = 0;

	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
		pgd_t new_pgd;
		pud_t *pud = pud_alloc_one(&init_mm,
					   PAGE_OFFSET + i * PGDIR_SIZE);
		if (!pud) {
			WARN_ON(1);
			break;
		}
		inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
		new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
		/*
		 * Make sure not to stomp on some other pgd entry.
		 */
		if (!pgd_none(pgd[i])) {
			WARN_ON(1);
			continue;
		}
		set_pgd(pgd + i, new_pgd);
	}
}
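
/*
 * Boot-time variants of the map helpers: a failure here is only worth
 * a warning, since there is little the early init code could do about it.
 */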
#define kaiser_add_user_map_early(start, size, flags) do {	\
	int __ret = kaiser_add_user_map(start, size, flags);	\
	WARN_ON(__ret);						\
} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {	\
	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
	WARN_ON(__ret);							\
} while (0)
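
/*
 * Parse the "pti=" and "nopti" kernel command line options: "pti=on"
 * forces KAISER on, "pti=off" or "nopti" disables it, and "pti=auto"
 * (also the default) keeps it enabled except on AMD CPUs, which were
 * not considered vulnerable to the Meltdown-style attack this
 * isolation defends against.
 */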
void __init kaiser_check_boottime_disable(void)
{
	bool enable = true;
	char arg[5];
	int ret;

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0) {
		if (!strncmp(arg, "on", 2))
			goto enable;

		if (!strncmp(arg, "off", 3))
			goto disable;

		if (!strncmp(arg, "auto", 4))
			goto skip;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti"))
		goto disable;

skip:
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		goto disable;

enable:
	if (enable)
		setup_force_cpu_cap(X86_FEATURE_KAISER);

	return;

disable:
	pr_info("Kernel/User page tables isolation: disabled\n");
	kaiser_enabled = 0;
	setup_clear_cpu_cap(X86_FEATURE_KAISER);
}
/*
 * If anything in here fails, we will likely die on one of the
 * first kernel->user transitions and init will die.  But, we
 * will have most of the kernel up by then and should be able to
 * get a clean warning out of it.  If we BUG_ON() here, we run
 * the risk of tripping it before we have good console output.
 */
void __init kaiser_init(void)
{
	int cpu;

	kaiser_check_boottime_disable();

	if (!kaiser_enabled)
		return;

	kaiser_init_all_pgds();

	for_each_possible_cpu(cpu) {
		void *percpu_vaddr = __per_cpu_user_mapped_start +
				     per_cpu_offset(cpu);
		unsigned long percpu_sz = __per_cpu_user_mapped_end -
					  __per_cpu_user_mapped_start;
		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
					  __PAGE_KERNEL);
	}

	/*
	 * Map the entry/exit text section, which is needed when
	 * switching from user to kernel and back.
	 */
	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
				       __PAGE_KERNEL_RX);

#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
				       __irqentry_text_end,
				       __PAGE_KERNEL_RX);
#endif
	kaiser_add_user_map_early((void *)idt_descr.address,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
	kaiser_add_user_map_early(&trace_idt_descr,
				  sizeof(trace_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&trace_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);
#endif
	kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&debug_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);
}
/* Add a mapping to the shadow mapping, and synchronize the mappings */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
	if (!kaiser_enabled)
		return 0;
	return kaiser_add_user_map((const void *)addr, size, flags);
}
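
/*
 * Remove the shadow mapping for [start, start + size): walk the shadow
 * pgd entries covering the range and unmap them; as the _nofree name
 * suggests, the shadow page table pages themselves are left in place.
 */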
void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
	extern void unmap_pud_range_nofree(pgd_t *pgd,
				unsigned long start, unsigned long end);
	unsigned long end = start + size;
	unsigned long addr, next;
	pgd_t *pgd;

	if (!kaiser_enabled)
		return;
	pgd = native_get_shadow_pgd(pgd_offset_k(start));
	for (addr = start; addr < end; pgd++, addr = next) {
		next = pgd_addr_end(addr, end);
		unmap_pud_range_nofree(pgd, addr, next);
	}
}
/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(pgd_t *pgdp)
{
	return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
}
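
/*
 * When a pgd entry mapping userspace is installed, mirror it into the
 * shadow pgd and mark the kernel copy NX, so that userspace cannot run
 * while still on the kernel CR3.
 */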
pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
	if (!kaiser_enabled)
		return pgd;
	/*
	 * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
	 * skip cases like kexec and EFI which make temporary low mappings.
	 */
	if (pgd.pgd & _PAGE_USER) {
		if (is_userspace_pgd(pgdp)) {
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
			/*
			 * Even if the entry is *mapping* userspace, ensure
			 * that userspace can not use it.  This way, if we
			 * get out to userspace running on the kernel CR3,
			 * userspace will crash instead of running.
			 */
			pgd.pgd |= _PAGE_NX;
		}
	} else if (!pgd.pgd) {
		/*
		 * pgd_clear() cannot check _PAGE_USER, and is even used to
		 * clear corrupted pgd entries: so just rely on cases like
		 * kexec and EFI never to be using pgd_clear().
		 */
		if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
		    is_userspace_pgd(pgdp))
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
	}
	return pgd;
}
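
/*
 * Set up this CPU's x86_cr3_pcid_user value for the entry/exit code:
 * the shadow pgd offset, plus the user-PCID "no flush" bit when the
 * CPU supports PCIDs.
 */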
void kaiser_setup_pcid(void)
{
	unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;

	if (this_cpu_has(X86_FEATURE_PCID))
		user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
	/*
	 * These variables are used by the entry/exit
	 * code to change PCID and pgd and TLB flushing.
	 */
	this_cpu_write(x86_cr3_pcid_user, user_cr3);
}
/*
 * Make a note that this cpu will need to flush USER tlb on return to user.
 * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
 */
void kaiser_flush_tlb_on_return_to_user(void)
{
	if (this_cpu_has(X86_FEATURE_PCID))
		this_cpu_write(x86_cr3_pcid_user,
			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);