arch/x86/mm/fault.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  *  Copyright (C) 1995  Linus Torvalds
   4  *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
   5  *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
   6  */
#include <linux/sched.h>                /* test_thread_flag(), ...      */
#include <linux/sched/task_stack.h>     /* task_stack_*(), ...          */
#include <linux/kdebug.h>               /* oops_begin/end, ...          */
#include <linux/extable.h>              /* search_exception_tables      */
#include <linux/bootmem.h>              /* max_low_pfn                  */
#include <linux/kprobes.h>              /* NOKPROBE_SYMBOL, ...         */
#include <linux/mmiotrace.h>            /* kmmio_handler, ...           */
#include <linux/perf_event.h>           /* perf_sw_event                */
#include <linux/hugetlb.h>              /* hstate_index_to_shift        */
#include <linux/prefetch.h>             /* prefetchw                    */
#include <linux/context_tracking.h>     /* exception_enter(), ...       */
#include <linux/uaccess.h>              /* faulthandler_disabled()      */
#include <linux/string.h>               /* memset()                     */
  19
  20 #include <asm/cpufeature.h>             /* boot_cpu_has, ...            */
  21 #include <asm/traps.h>                  /* dotraplinkage, ...           */
  22 #include <asm/pgalloc.h>                /* pgd_*(), ...                 */
  23 #include <asm/fixmap.h>                 /* VSYSCALL_ADDR                */
  24 #include <asm/vsyscall.h>               /* emulate_vsyscall             */
  25 #include <asm/vm86.h>                   /* struct vm86                  */
  26 #include <asm/mmu_context.h>            /* vma_pkey()                   */
  27
  28 #define CREATE_TRACE_POINTS
  29 #include <asm/trace/exceptions.h>
  30
  31 /*
  32  * Returns 0 if mmiotrace is disabled, or if the fault is not
  33  * handled by mmiotrace:
  34  */
  35 static nokprobe_inline int
  36 kmmio_fault(struct pt_regs *regs, unsigned long addr)
  37 {
  38         if (unlikely(is_kmmio_active()))
  39                 if (kmmio_handler(regs, addr) == 1)
  40                         return -1;
  41         return 0;
  42 }
  43
  44 static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
  45 {
  46         int ret = 0;
  47
  48         /* kprobe_running() needs smp_processor_id() */
  49         if (kprobes_built_in() && !user_mode(regs)) {
  50                 preempt_disable();
  51                 if (kprobe_running() && kprobe_fault_handler(regs, 14))
  52                         ret = 1;
  53                 preempt_enable();
  54         }
  55
  56         return ret;
  57 }
  58
  59 /*
  60  * Prefetch quirks:
  61  *
  62  * 32-bit mode:
  63  *
  64  *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
  65  *   Check that here and ignore it.
  66  *
  67  * 64-bit mode:
  68  *
  69  *   Sometimes the CPU reports invalid exceptions on prefetch.
  70  *   Check that here and ignore it.
  71  *
  72  * Opcode checker based on code by Richard Brunner.
  73  */
  74 static inline int
  75 check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
  76                       unsigned char opcode, int *prefetch)
  77 {
  78         unsigned char instr_hi = opcode & 0xf0;
  79         unsigned char instr_lo = opcode & 0x0f;
  80
  81         switch (instr_hi) {
  82         case 0x20:
  83         case 0x30:
  84                 /*
  85                  * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
  86                  * In X86_64 long mode, the CPU will signal invalid
  87                  * opcode if some of these prefixes are present so
  88                  * X86_64 will never get here anyway
  89                  */
  90                 return ((instr_lo & 7) == 0x6);
  91 #ifdef CONFIG_X86_64
  92         case 0x40:
  93                 /*
  94                  * In AMD64 long mode 0x40..0x4F are valid REX prefixes
  95                  * Need to figure out under what instruction mode the
  96                  * instruction was issued. Could check the LDT for lm,
  97                  * but for now it's good enough to assume that long
  98                  * mode only uses well known segments or kernel.
  99                  */
 100                 return (!user_mode(regs) || user_64bit_mode(regs));
 101 #endif
 102         case 0x60:
 103                 /* 0x64 thru 0x67 are valid prefixes in all modes. */
 104                 return (instr_lo & 0xC) == 0x4;
 105         case 0xF0:
 106                 /* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
 107                 return !instr_lo || (instr_lo>>1) == 1;
 108         case 0x00:
 109                 /* Prefetch instruction is 0x0F0D or 0x0F18 */
 110                 if (probe_kernel_address(instr, opcode))
 111                         return 0;
 112
 113                 *prefetch = (instr_lo == 0xF) &&
 114                         (opcode == 0x0D || opcode == 0x18);
 115                 return 0;
 116         default:
 117                 return 0;
 118         }
 119 }
 120
 121 static int
 122 is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 123 {
 124         unsigned char *max_instr;
 125         unsigned char *instr;
 126         int prefetch = 0;
 127
 128         /*
 129          * If it was a exec (instruction fetch) fault on NX page, then
 130          * do not ignore the fault:
 131          */
 132         if (error_code & X86_PF_INSTR)
 133                 return 0;
 134
 135         instr = (void *)convert_ip_to_linear(current, regs);
 136         max_instr = instr + 15;
 137
 138         if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
 139                 return 0;
 140
 141         while (instr < max_instr) {
 142                 unsigned char opcode;
 143
 144                 if (probe_kernel_address(instr, opcode))
 145                         break;
 146
 147                 instr++;
 148
 149                 if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
 150                         break;
 151         }
 152         return prefetch;
 153 }
 154
 155 /*
 156  * A protection key fault means that the PKRU value did not allow
 157  * access to some PTE.  Userspace can figure out what PKRU was
 158  * from the XSAVE state, and this function fills out a field in
 159  * siginfo so userspace can discover which protection key was set
 160  * on the PTE.
 161  *
 162  * If we get here, we know that the hardware signaled a X86_PF_PK
 163  * fault and that there was a VMA once we got in the fault
 164  * handler.  It does *not* guarantee that the VMA we find here
 165  * was the one that we faulted on.
 166  *
 167  * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
 168  * 2. T1   : set PKRU to deny access to pkey=4, touches page
 169  * 3. T1   : faults...
 170  * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
 171  * 5. T1   : enters fault handler, takes mmap_sem, etc...
 172  * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
 173  *           faulted on a pte with its pkey=4.
 174  */
 175 static void fill_sig_info_pkey(int si_signo, int si_code, siginfo_t *info,
 176                 u32 *pkey)
 177 {
 178         /* This is effectively an #ifdef */
 179         if (!boot_cpu_has(X86_FEATURE_OSPKE))
 180                 return;
 181
 182         /* Fault not from Protection Keys: nothing to do */
 183         if ((si_code != SEGV_PKUERR) || (si_signo != SIGSEGV))
 184                 return;
 185         /*
 186          * force_sig_info_fault() is called from a number of
 187          * contexts, some of which have a VMA and some of which
 188          * do not.  The X86_PF_PK handing happens after we have a
 189          * valid VMA, so we should never reach this without a
 190          * valid VMA.
 191          */
 192         if (!pkey) {
 193                 WARN_ONCE(1, "PKU fault with no VMA passed in");
 194                 info->si_pkey = 0;
 195                 return;
 196         }
 197         /*
 198          * si_pkey should be thought of as a strong hint, but not
 199          * absolutely guranteed to be 100% accurate because of
 200          * the race explained above.
 201          */
 202         info->si_pkey = *pkey;
 203 }
 204
 205 static void
 206 force_sig_info_fault(int si_signo, int si_code, unsigned long address,
 207                      struct task_struct *tsk, u32 *pkey, int fault)
 208 {
 209         unsigned lsb = 0;
 210         siginfo_t info;
 211
 212         info.si_signo   = si_signo;
 213         info.si_errno   = 0;
 214         info.si_code    = si_code;
 215         info.si_addr    = (void __user *)address;
 216         if (fault & VM_FAULT_HWPOISON_LARGE)
 217                 lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
 218         if (fault & VM_FAULT_HWPOISON)
 219                 lsb = PAGE_SHIFT;
 220         info.si_addr_lsb = lsb;
 221
 222         fill_sig_info_pkey(si_signo, si_code, &info, pkey);
 223
 224         force_sig_info(si_signo, &info, tsk);
 225 }
 226
/* Held by vmalloc_sync_all() while it walks pgd_list. */
DEFINE_SPINLOCK(pgd_lock);
/* List of pgd pages, chained through the pages' lru member. */
LIST_HEAD(pgd_list);
 229
 230 #ifdef CONFIG_X86_32
 231 static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
 232 {
 233         unsigned index = pgd_index(address);
 234         pgd_t *pgd_k;
 235         p4d_t *p4d, *p4d_k;
 236         pud_t *pud, *pud_k;
 237         pmd_t *pmd, *pmd_k;
 238
 239         pgd += index;
 240         pgd_k = init_mm.pgd + index;
 241
 242         if (!pgd_present(*pgd_k))
 243                 return NULL;
 244
 245         /*
 246          * set_pgd(pgd, *pgd_k); here would be useless on PAE
 247          * and redundant with the set_pmd() on non-PAE. As would
 248          * set_p4d/set_pud.
 249          */
 250         p4d = p4d_offset(pgd, address);
 251         p4d_k = p4d_offset(pgd_k, address);
 252         if (!p4d_present(*p4d_k))
 253                 return NULL;
 254
 255         pud = pud_offset(p4d, address);
 256         pud_k = pud_offset(p4d_k, address);
 257         if (!pud_present(*pud_k))
 258                 return NULL;
 259
 260         pmd = pmd_offset(pud, address);
 261         pmd_k = pmd_offset(pud_k, address);
 262         if (!pmd_present(*pmd_k))
 263                 return NULL;
 264
 265         if (!pmd_present(*pmd))
 266                 set_pmd(pmd, *pmd_k);
 267         else
 268                 BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
 269
 270         return pmd_k;
 271 }
 272
 273 void vmalloc_sync_all(void)
 274 {
 275         unsigned long address;
 276
 277         if (SHARED_KERNEL_PMD)
 278                 return;
 279
 280         for (address = VMALLOC_START & PMD_MASK;
 281              address >= TASK_SIZE_MAX && address < FIXADDR_TOP;
 282              address += PMD_SIZE) {
 283                 struct page *page;
 284
 285                 spin_lock(&pgd_lock);
 286                 list_for_each_entry(page, &pgd_list, lru) {
 287                         spinlock_t *pgt_lock;
 288                         pmd_t *ret;
 289
 290                         /* the pgt_lock only for Xen */
 291                         pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
 292
 293                         spin_lock(pgt_lock);
 294                         ret = vmalloc_sync_one(page_address(page), address);
 295                         spin_unlock(pgt_lock);
 296
 297                         if (!ret)
 298                                 break;
 299                 }
 300                 spin_unlock(&pgd_lock);
 301         }
 302 }
 303
 304 /*
 305  * 32-bit:
 306  *
 307  *   Handle a fault on the vmalloc or module mapping area
 308  */
 309 static noinline int vmalloc_fault(unsigned long address)
 310 {
 311         unsigned long pgd_paddr;
 312         pmd_t *pmd_k;
 313         pte_t *pte_k;
 314
 315         /* Make sure we are in vmalloc area: */
 316         if (!(address >= VMALLOC_START && address < VMALLOC_END))
 317                 return -1;
 318
 319         WARN_ON_ONCE(in_nmi());
 320
 321         /*
 322          * Synchronize this task's top level page-table
 323          * with the 'reference' page table.
 324          *
 325          * Do _not_ use "current" here. We might be inside
 326          * an interrupt in the middle of a task switch..
 327          */
 328         pgd_paddr = read_cr3_pa();
 329         pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
 330         if (!pmd_k)
 331                 return -1;
 332
 333         if (pmd_huge(*pmd_k))
 334                 return 0;
 335
 336         pte_k = pte_offset_kernel(pmd_k, address);
 337         if (!pte_present(*pte_k))
 338                 return -1;
 339
 340         return 0;
 341 }
 342 NOKPROBE_SYMBOL(vmalloc_fault);
 343
 344 /*
 345  * Did it hit the DOS screen memory VA from vm86 mode?
 346  */
 347 static inline void
 348 check_v8086_mode(struct pt_regs *regs, unsigned long address,
 349                  struct task_struct *tsk)
 350 {
 351 #ifdef CONFIG_VM86
 352         unsigned long bit;
 353
 354         if (!v8086_mode(regs) || !tsk->thread.vm86)
 355                 return;
 356
 357         bit = (address - 0xA0000) >> PAGE_SHIFT;
 358         if (bit < 32)
 359                 tsk->thread.vm86->screen_bitmap |= 1 << bit;
 360 #endif
 361 }
 362
 363 static bool low_pfn(unsigned long pfn)
 364 {
 365         return pfn < max_low_pfn;
 366 }
 367
 368 static void dump_pagetable(unsigned long address)
 369 {
 370         pgd_t *base = __va(read_cr3_pa());
 371         pgd_t *pgd = &base[pgd_index(address)];
 372         p4d_t *p4d;
 373         pud_t *pud;
 374         pmd_t *pmd;
 375         pte_t *pte;
 376
 377 #ifdef CONFIG_X86_PAE
 378         pr_info("*pdpt = %016Lx ", pgd_val(*pgd));
 379         if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
 380                 goto out;
 381 #define pr_pde pr_cont
 382 #else
 383 #define pr_pde pr_info
 384 #endif
 385         p4d = p4d_offset(pgd, address);
 386         pud = pud_offset(p4d, address);
 387         pmd = pmd_offset(pud, address);
 388         pr_pde("*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
 389 #undef pr_pde
 390
 391         /*
 392          * We must not directly access the pte in the highpte
 393          * case if the page table is located in highmem.
 394          * And let's rather not kmap-atomic the pte, just in case
 395          * it's allocated already:
 396          */
 397         if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
 398                 goto out;
 399
 400         pte = pte_offset_kernel(pmd, address);
 401         pr_cont("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
 402 out:
 403         pr_cont("\n");
 404 }
 405
 406 #else /* CONFIG_X86_64: */
 407
 408 void vmalloc_sync_all(void)
 409 {
 410         sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
 411 }
 412
 413 /*
 414  * 64-bit:
 415  *
 416  *   Handle a fault on the vmalloc area
 417  */
 418 static noinline int vmalloc_fault(unsigned long address)
 419 {
 420         pgd_t *pgd, *pgd_ref;
 421         p4d_t *p4d, *p4d_ref;
 422         pud_t *pud, *pud_ref;
 423         pmd_t *pmd, *pmd_ref;
 424         pte_t *pte, *pte_ref;
 425
 426         /* Make sure we are in vmalloc area: */
 427         if (!(address >= VMALLOC_START && address < VMALLOC_END))
 428                 return -1;
 429
 430         WARN_ON_ONCE(in_nmi());
 431
 432         /*
 433          * Copy kernel mappings over when needed. This can also
 434          * happen within a race in page table update. In the later
 435          * case just flush:
 436          */
 437         pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
 438         pgd_ref = pgd_offset_k(address);
 439         if (pgd_none(*pgd_ref))
 440                 return -1;
 441
 442         if (CONFIG_PGTABLE_LEVELS > 4) {
 443                 if (pgd_none(*pgd)) {
 444                         set_pgd(pgd, *pgd_ref);
 445                         arch_flush_lazy_mmu_mode();
 446                 } else {
 447                         BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
 448                 }
 449         }
 450
 451         /* With 4-level paging, copying happens on the p4d level. */
 452         p4d = p4d_offset(pgd, address);
 453         p4d_ref = p4d_offset(pgd_ref, address);
 454         if (p4d_none(*p4d_ref))
 455                 return -1;
 456
 457         if (p4d_none(*p4d) && CONFIG_PGTABLE_LEVELS == 4) {
 458                 set_p4d(p4d, *p4d_ref);
 459                 arch_flush_lazy_mmu_mode();
 460         } else {
 461                 BUG_ON(p4d_pfn(*p4d) != p4d_pfn(*p4d_ref));
 462         }
 463
 464         /*
 465          * Below here mismatches are bugs because these lower tables
 466          * are shared:
 467          */
 468         BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
 469
 470         pud = pud_offset(p4d, address);
 471         pud_ref = pud_offset(p4d_ref, address);
 472         if (pud_none(*pud_ref))
 473                 return -1;
 474
 475         if (pud_none(*pud) || pud_pfn(*pud) != pud_pfn(*pud_ref))
 476                 BUG();
 477
 478         if (pud_huge(*pud))
 479                 return 0;
 480
 481         pmd = pmd_offset(pud, address);
 482         pmd_ref = pmd_offset(pud_ref, address);
 483         if (pmd_none(*pmd_ref))
 484                 return -1;
 485
 486         if (pmd_none(*pmd) || pmd_pfn(*pmd) != pmd_pfn(*pmd_ref))
 487                 BUG();
 488
 489         if (pmd_huge(*pmd))
 490                 return 0;
 491
 492         pte_ref = pte_offset_kernel(pmd_ref, address);
 493         if (!pte_present(*pte_ref))
 494                 return -1;
 495
 496         pte = pte_offset_kernel(pmd, address);
 497
 498         /*
 499          * Don't use pte_page here, because the mappings can point
 500          * outside mem_map, and the NUMA hash lookup cannot handle
 501          * that:
 502          */
 503         if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
 504                 BUG();
 505
 506         return 0;
 507 }
 508 NOKPROBE_SYMBOL(vmalloc_fault);
 509
#ifdef CONFIG_CPU_SUP_AMD
/*
 * Message emitted (once, via printk_once()) by is_errata93() when it
 * applies the K8 erratum #93 workaround.  Carries its own KERN_ERR level.
 */
static const char errata93_warning[] =
KERN_ERR
"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
"******* Working around it, but it may cause SEGVs or burn power.\n"
"******* Please consider a BIOS update.\n"
"******* Disabling USB legacy in the BIOS may also help.\n";
#endif
 518
 519 /*
 520  * No vm86 mode in 64-bit mode:
 521  */
static inline void
check_v8086_mode(struct pt_regs *regs, unsigned long address,
		 struct task_struct *tsk)
{
	/* Intentionally empty: stub so callers need no #ifdef. */
}
 527
 528 static int bad_address(void *p)
 529 {
 530         unsigned long dummy;
 531
 532         return probe_kernel_address((unsigned long *)p, dummy);
 533 }
 534
 535 static void dump_pagetable(unsigned long address)
 536 {
 537         pgd_t *base = __va(read_cr3_pa());
 538         pgd_t *pgd = base + pgd_index(address);
 539         p4d_t *p4d;
 540         pud_t *pud;
 541         pmd_t *pmd;
 542         pte_t *pte;
 543
 544         if (bad_address(pgd))
 545                 goto bad;
 546
 547         pr_info("PGD %lx ", pgd_val(*pgd));
 548
 549         if (!pgd_present(*pgd))
 550                 goto out;
 551
 552         p4d = p4d_offset(pgd, address);
 553         if (bad_address(p4d))
 554                 goto bad;
 555
 556         pr_cont("P4D %lx ", p4d_val(*p4d));
 557         if (!p4d_present(*p4d) || p4d_large(*p4d))
 558                 goto out;
 559
 560         pud = pud_offset(p4d, address);
 561         if (bad_address(pud))
 562                 goto bad;
 563
 564         pr_cont("PUD %lx ", pud_val(*pud));
 565         if (!pud_present(*pud) || pud_large(*pud))
 566                 goto out;
 567
 568         pmd = pmd_offset(pud, address);
 569         if (bad_address(pmd))
 570                 goto bad;
 571
 572         pr_cont("PMD %lx ", pmd_val(*pmd));
 573         if (!pmd_present(*pmd) || pmd_large(*pmd))
 574                 goto out;
 575
 576         pte = pte_offset_kernel(pmd, address);
 577         if (bad_address(pte))
 578                 goto bad;
 579
 580         pr_cont("PTE %lx", pte_val(*pte));
 581 out:
 582         pr_cont("\n");
 583         return;
 584 bad:
 585         pr_info("BAD\n");
 586 }
 587
 588 #endif /* CONFIG_X86_64 */
 589
 590 /*
 591  * Workaround for K8 erratum #93 & buggy BIOS.
 592  *
 593  * BIOS SMM functions are required to use a specific workaround
 594  * to avoid corruption of the 64bit RIP register on C stepping K8.
 595  *
 596  * A lot of BIOS that didn't get tested properly miss this.
 597  *
 598  * The OS sees this as a page fault with the upper 32bits of RIP cleared.
 599  * Try to work around it here.
 600  *
 601  * Note we only handle faults in kernel here.
 602  * Does nothing on 32-bit.
 603  */
 604 static int is_errata93(struct pt_regs *regs, unsigned long address)
 605 {
 606 #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
 607         if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
 608             || boot_cpu_data.x86 != 0xf)
 609                 return 0;
 610
 611         if (address != regs->ip)
 612                 return 0;
 613
 614         if ((address >> 32) != 0)
 615                 return 0;
 616
 617         address |= 0xffffffffUL << 32;
 618         if ((address >= (u64)_stext && address <= (u64)_etext) ||
 619             (address >= MODULES_VADDR && address <= MODULES_END)) {
 620                 printk_once(errata93_warning);
 621                 regs->ip = address;
 622                 return 1;
 623         }
 624 #endif
 625         return 0;
 626 }
 627
 628 /*
 629  * Work around K8 erratum #100 K8 in compat mode occasionally jumps
 630  * to illegal addresses >4GB.
 631  *
 632  * We catch this in the page fault handler because these addresses
 633  * are not reachable. Just detect this case and return.  Any code
 634  * segment in LDT is compatibility mode.
 635  */
 636 static int is_errata100(struct pt_regs *regs, unsigned long address)
 637 {
 638 #ifdef CONFIG_X86_64
 639         if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
 640                 return 1;
 641 #endif
 642         return 0;
 643 }
 644
 645 static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
 646 {
 647 #ifdef CONFIG_X86_F00F_BUG
 648         unsigned long nr;
 649
 650         /*
 651          * Pentium F0 0F C7 C8 bug workaround:
 652          */
 653         if (boot_cpu_has_bug(X86_BUG_F00F)) {
 654                 nr = (address - idt_descr.address) >> 3;
 655
 656                 if (nr == 6) {
 657                         do_invalid_op(regs, 0);
 658                         return 1;
 659                 }
 660         }
 661 #endif
 662         return 0;
 663 }
 664
/*
 * printk() format strings used by show_fault_oops() for instruction-fetch
 * faults; each takes the current uid as its single %d argument.
 */
static const char nx_warning[] = KERN_CRIT
"kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
static const char smep_warning[] = KERN_CRIT
"unable to execute userspace code (SMEP?) (uid: %d)\n";
 669
 670 static void
 671 show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 672                 unsigned long address)
 673 {
 674         if (!oops_may_print())
 675                 return;
 676
 677         if (error_code & X86_PF_INSTR) {
 678                 unsigned int level;
 679                 pgd_t *pgd;
 680                 pte_t *pte;
 681
 682                 pgd = __va(read_cr3_pa());
 683                 pgd += pgd_index(address);
 684
 685                 pte = lookup_address_in_pgd(pgd, address, &level);
 686
 687                 if (pte && pte_present(*pte) && !pte_exec(*pte))
 688                         printk(nx_warning, from_kuid(&init_user_ns, current_uid()));
 689                 if (pte && pte_present(*pte) && pte_exec(*pte) &&
 690                                 (pgd_flags(*pgd) & _PAGE_USER) &&
 691                                 (__read_cr4() & X86_CR4_SMEP))
 692                         printk(smep_warning, from_kuid(&init_user_ns, current_uid()));
 693         }
 694
 695         printk(KERN_ALERT "BUG: unable to handle kernel ");
 696         if (address < PAGE_SIZE)
 697                 printk(KERN_CONT "NULL pointer dereference");
 698         else
 699                 printk(KERN_CONT "paging request");
 700
 701         printk(KERN_CONT " at %px\n", (void *) address);
 702         printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip);
 703
 704         dump_pagetable(address);
 705 }
 706
 707 static noinline void
 708 pgtable_bad(struct pt_regs *regs, unsigned long error_code,
 709             unsigned long address)
 710 {
 711         struct task_struct *tsk;
 712         unsigned long flags;
 713         int sig;
 714
 715         flags = oops_begin();
 716         tsk = current;
 717         sig = SIGKILL;
 718
 719         printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
 720                tsk->comm, address);
 721         dump_pagetable(address);
 722
 723         tsk->thread.cr2         = address;
 724         tsk->thread.trap_nr     = X86_TRAP_PF;
 725         tsk->thread.error_code  = error_code;
 726
 727         if (__die("Bad pagetable", regs, error_code))
 728                 sig = 0;
 729
 730         oops_end(flags, regs, sig);
 731 }
 732
 733 static noinline void
 734 no_context(struct pt_regs *regs, unsigned long error_code,
 735            unsigned long address, int signal, int si_code)
 736 {
 737         struct task_struct *tsk = current;
 738         unsigned long flags;
 739         int sig;
 740
 741         /* Are we prepared to handle this kernel fault? */
 742         if (fixup_exception(regs, X86_TRAP_PF)) {
 743                 /*
 744                  * Any interrupt that takes a fault gets the fixup. This makes
 745                  * the below recursive fault logic only apply to a faults from
 746                  * task context.
 747                  */
 748                 if (in_interrupt())
 749                         return;
 750
 751                 /*
 752                  * Per the above we're !in_interrupt(), aka. task context.
 753                  *
 754                  * In this case we need to make sure we're not recursively
 755                  * faulting through the emulate_vsyscall() logic.
 756                  */
 757                 if (current->thread.sig_on_uaccess_err && signal) {
 758                         tsk->thread.trap_nr = X86_TRAP_PF;
 759                         tsk->thread.error_code = error_code | X86_PF_USER;
 760                         tsk->thread.cr2 = address;
 761
 762                         /* XXX: hwpoison faults will set the wrong code. */
 763                         force_sig_info_fault(signal, si_code, address,
 764                                              tsk, NULL, 0);
 765                 }
 766
 767                 /*
 768                  * Barring that, we can do the fixup and be happy.
 769                  */
 770                 return;
 771         }
 772
 773 #ifdef CONFIG_VMAP_STACK
 774         /*
 775          * Stack overflow?  During boot, we can fault near the initial
 776          * stack in the direct map, but that's not an overflow -- check
 777          * that we're in vmalloc space to avoid this.
 778          */
 779         if (is_vmalloc_addr((void *)address) &&
 780             (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) ||
 781              address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
 782                 unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *);
 783                 /*
 784                  * We're likely to be running with very little stack space
 785                  * left.  It's plausible that we'd hit this condition but
 786                  * double-fault even before we get this far, in which case
 787                  * we're fine: the double-fault handler will deal with it.
 788                  *
 789                  * We don't want to make it all the way into the oops code
 790                  * and then double-fault, though, because we're likely to
 791                  * break the console driver and lose most of the stack dump.
 792                  */
 793                 asm volatile ("movq %[stack], %%rsp\n\t"
 794                               "call handle_stack_overflow\n\t"
 795                               "1: jmp 1b"
 796                               : ASM_CALL_CONSTRAINT
 797                               : "D" ("kernel stack overflow (page fault)"),
 798                                 "S" (regs), "d" (address),
 799                                 [stack] "rm" (stack));
 800                 unreachable();
 801         }
 802 #endif
 803
 804         /*
 805          * 32-bit:
 806          *
 807          *   Valid to do another page fault here, because if this fault
 808          *   had been triggered by is_prefetch fixup_exception would have
 809          *   handled it.
 810          *
 811          * 64-bit:
 812          *
 813          *   Hall of shame of CPU/BIOS bugs.
 814          */
 815         if (is_prefetch(regs, error_code, address))
 816                 return;
 817
 818         if (is_errata93(regs, address))
 819                 return;
 820
 821         /*
 822          * Oops. The kernel tried to access some bad page. We'll have to
 823          * terminate things with extreme prejudice:
 824          */
 825         flags = oops_begin();
 826
 827         show_fault_oops(regs, error_code, address);
 828
 829         if (task_stack_end_corrupted(tsk))
 830                 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
 831
 832         tsk->thread.cr2         = address;
 833         tsk->thread.trap_nr     = X86_TRAP_PF;
 834         tsk->thread.error_code  = error_code;
 835
 836         sig = SIGKILL;
 837         if (__die("Oops", regs, error_code))
 838                 sig = 0;
 839
 840         /* Executive summary in case the body of the oops scrolled away */
 841         printk(KERN_DEFAULT "CR2: %016lx\n", address);
 842
 843         oops_end(flags, regs, sig);
 844 }
 845
 846 /*
 847  * Print out info about fatal segfaults, if the show_unhandled_signals
 848  * sysctl is set:
 849  */
 850 static inline void
 851 show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 852                 unsigned long address, struct task_struct *tsk)
 853 {
 854         if (!unhandled_signal(tsk, SIGSEGV))
 855                 return;
 856
 857         if (!printk_ratelimit())
 858                 return;
 859
 860         printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
 861                 task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 862                 tsk->comm, task_pid_nr(tsk), address,
 863                 (void *)regs->ip, (void *)regs->sp, error_code);
 864
 865         print_vma_addr(KERN_CONT " in ", regs->ip);
 866
 867         printk(KERN_CONT "\n");
 868 }
 869
 870 static void
 871 __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 872                        unsigned long address, u32 *pkey, int si_code)
 873 {
 874         struct task_struct *tsk = current;
 875
 876         /* User mode accesses just cause a SIGSEGV */
 877         if (error_code & X86_PF_USER) {
 878                 /*
 879                  * It's possible to have interrupts off here:
 880                  */
 881                 local_irq_enable();
 882
 883                 /*
 884                  * Valid to do another page fault here because this one came
 885                  * from user space:
 886                  */
 887                 if (is_prefetch(regs, error_code, address))
 888                         return;
 889
 890                 if (is_errata100(regs, address))
 891                         return;
 892
 893 #ifdef CONFIG_X86_64
 894                 /*
 895                  * Instruction fetch faults in the vsyscall page might need
 896                  * emulation.
 897                  */
 898                 if (unlikely((error_code & X86_PF_INSTR) &&
 899                              ((address & ~0xfff) == VSYSCALL_ADDR))) {
 900                         if (emulate_vsyscall(regs, address))
 901                                 return;
 902                 }
 903 #endif
 904
 905                 /*
 906                  * To avoid leaking information about the kernel page table
 907                  * layout, pretend that user-mode accesses to kernel addresses
 908                  * are always protection faults.
 909                  */
 910                 if (address >= TASK_SIZE_MAX)
 911                         error_code |= X86_PF_PROT;
 912
 913                 if (likely(show_unhandled_signals))
 914                         show_signal_msg(regs, error_code, address, tsk);
 915
 916                 tsk->thread.cr2         = address;
 917                 tsk->thread.error_code  = error_code;
 918                 tsk->thread.trap_nr     = X86_TRAP_PF;
 919
 920                 force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0);
 921
 922                 return;
 923         }
 924
 925         if (is_f00f_bug(regs, address))
 926                 return;
 927
 928         no_context(regs, error_code, address, SIGSEGV, si_code);
 929 }
 930
 931 static noinline void
 932 bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 933                      unsigned long address, u32 *pkey)
 934 {
 935         __bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR);
 936 }
 937
 938 static void
 939 __bad_area(struct pt_regs *regs, unsigned long error_code,
 940            unsigned long address,  struct vm_area_struct *vma, int si_code)
 941 {
 942         struct mm_struct *mm = current->mm;
 943         u32 pkey;
 944
 945         if (vma)
 946                 pkey = vma_pkey(vma);
 947
 948         /*
 949          * Something tried to access memory that isn't in our memory map..
 950          * Fix it, but check if it's kernel or user first..
 951          */
 952         up_read(&mm->mmap_sem);
 953
 954         __bad_area_nosemaphore(regs, error_code, address,
 955                                (vma) ? &pkey : NULL, si_code);
 956 }
 957
 958 static noinline void
 959 bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 960 {
 961         __bad_area(regs, error_code, address, NULL, SEGV_MAPERR);
 962 }
 963
 964 static inline bool bad_area_access_from_pkeys(unsigned long error_code,
 965                 struct vm_area_struct *vma)
 966 {
 967         /* This code is always called on the current mm */
 968         bool foreign = false;
 969
 970         if (!boot_cpu_has(X86_FEATURE_OSPKE))
 971                 return false;
 972         if (error_code & X86_PF_PK)
 973                 return true;
 974         /* this checks permission keys on the VMA: */
 975         if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
 976                                        (error_code & X86_PF_INSTR), foreign))
 977                 return true;
 978         return false;
 979 }
 980
 981 static noinline void
 982 bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
 983                       unsigned long address, struct vm_area_struct *vma)
 984 {
 985         /*
 986          * This OSPKE check is not strictly necessary at runtime.
 987          * But, doing it this way allows compiler optimizations
 988          * if pkeys are compiled out.
 989          */
 990         if (bad_area_access_from_pkeys(error_code, vma))
 991                 __bad_area(regs, error_code, address, vma, SEGV_PKUERR);
 992         else
 993                 __bad_area(regs, error_code, address, vma, SEGV_ACCERR);
 994 }
 995
 996 static void
 997 do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 998           u32 *pkey, unsigned int fault)
 999 {
1000         struct task_struct *tsk = current;
1001         int code = BUS_ADRERR;
1002
1003         /* Kernel mode? Handle exceptions or die: */
1004         if (!(error_code & X86_PF_USER)) {
1005                 no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
1006                 return;
1007         }
1008
1009         /* User-space => ok to do another page fault: */
1010         if (is_prefetch(regs, error_code, address))
1011                 return;
1012
1013         tsk->thread.cr2         = address;
1014         tsk->thread.error_code  = error_code;
1015         tsk->thread.trap_nr     = X86_TRAP_PF;
1016
1017 #ifdef CONFIG_MEMORY_FAILURE
1018         if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
1019                 printk(KERN_ERR
1020         "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
1021                         tsk->comm, tsk->pid, address);
1022                 code = BUS_MCEERR_AR;
1023         }
1024 #endif
1025         force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault);
1026 }
1027
1028 static noinline void
1029 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
1030                unsigned long address, u32 *pkey, unsigned int fault)
1031 {
1032         if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
1033                 no_context(regs, error_code, address, 0, 0);
1034                 return;
1035         }
1036
1037         if (fault & VM_FAULT_OOM) {
1038                 /* Kernel mode? Handle exceptions or die: */
1039                 if (!(error_code & X86_PF_USER)) {
1040                         no_context(regs, error_code, address,
1041                                    SIGSEGV, SEGV_MAPERR);
1042                         return;
1043                 }
1044
1045                 /*
1046                  * We ran out of memory, call the OOM killer, and return the
1047                  * userspace (which will retry the fault, or kill us if we got
1048                  * oom-killed):
1049                  */
1050                 pagefault_out_of_memory();
1051         } else {
1052                 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
1053                              VM_FAULT_HWPOISON_LARGE))
1054                         do_sigbus(regs, error_code, address, pkey, fault);
1055                 else if (fault & VM_FAULT_SIGSEGV)
1056                         bad_area_nosemaphore(regs, error_code, address, pkey);
1057                 else
1058                         BUG();
1059         }
1060 }
1061
1062 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
1063 {
1064         if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
1065                 return 0;
1066
1067         if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
1068                 return 0;
1069         /*
1070          * Note: We do not do lazy flushing on protection key
1071          * changes, so no spurious fault will ever set X86_PF_PK.
1072          */
1073         if ((error_code & X86_PF_PK))
1074                 return 1;
1075
1076         return 1;
1077 }
1078
1079 /*
1080  * Handle a spurious fault caused by a stale TLB entry.
1081  *
1082  * This allows us to lazily refresh the TLB when increasing the
1083  * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
1084  * eagerly is very expensive since that implies doing a full
1085  * cross-processor TLB flush, even if no stale TLB entries exist
1086  * on other processors.
1087  *
1088  * Spurious faults may only occur if the TLB contains an entry with
1089  * fewer permission than the page table entry.  Non-present (P = 0)
1090  * and reserved bit (R = 1) faults are never spurious.
1091  *
1092  * There are no security implications to leaving a stale TLB when
1093  * increasing the permissions on a page.
1094  *
1095  * Returns non-zero if a spurious fault was handled, zero otherwise.
1096  *
1097  * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
1098  * (Optional Invalidation).
1099  */
1100 static noinline int
1101 spurious_fault(unsigned long error_code, unsigned long address)
1102 {
1103         pgd_t *pgd;
1104         p4d_t *p4d;
1105         pud_t *pud;
1106         pmd_t *pmd;
1107         pte_t *pte;
1108         int ret;
1109
1110         /*
1111          * Only writes to RO or instruction fetches from NX may cause
1112          * spurious faults.
1113          *
1114          * These could be from user or supervisor accesses but the TLB
1115          * is only lazily flushed after a kernel mapping protection
1116          * change, so user accesses are not expected to cause spurious
1117          * faults.
1118          */
1119         if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
1120             error_code != (X86_PF_INSTR | X86_PF_PROT))
1121                 return 0;
1122
1123         pgd = init_mm.pgd + pgd_index(address);
1124         if (!pgd_present(*pgd))
1125                 return 0;
1126
1127         p4d = p4d_offset(pgd, address);
1128         if (!p4d_present(*p4d))
1129                 return 0;
1130
1131         if (p4d_large(*p4d))
1132                 return spurious_fault_check(error_code, (pte_t *) p4d);
1133
1134         pud = pud_offset(p4d, address);
1135         if (!pud_present(*pud))
1136                 return 0;
1137
1138         if (pud_large(*pud))
1139                 return spurious_fault_check(error_code, (pte_t *) pud);
1140
1141         pmd = pmd_offset(pud, address);
1142         if (!pmd_present(*pmd))
1143                 return 0;
1144
1145         if (pmd_large(*pmd))
1146                 return spurious_fault_check(error_code, (pte_t *) pmd);
1147
1148         pte = pte_offset_kernel(pmd, address);
1149         if (!pte_present(*pte))
1150                 return 0;
1151
1152         ret = spurious_fault_check(error_code, pte);
1153         if (!ret)
1154                 return 0;
1155
1156         /*
1157          * Make sure we have permissions in PMD.
1158          * If not, then there's a bug in the page tables:
1159          */
1160         ret = spurious_fault_check(error_code, (pte_t *) pmd);
1161         WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
1162
1163         return ret;
1164 }
1165 NOKPROBE_SYMBOL(spurious_fault);
1166
/*
 * When non-zero, __bad_area_nosemaphore() calls show_signal_msg() to log
 * fatal user-space segfaults.  Enabled by default.
 */
int show_unhandled_signals = 1;
1168
1169 static inline int
1170 access_error(unsigned long error_code, struct vm_area_struct *vma)
1171 {
1172         /* This is only called for the current mm, so: */
1173         bool foreign = false;
1174
1175         /*
1176          * Read or write was blocked by protection keys.  This is
1177          * always an unconditional error and can never result in
1178          * a follow-up action to resolve the fault, like a COW.
1179          */
1180         if (error_code & X86_PF_PK)
1181                 return 1;
1182
1183         /*
1184          * Make sure to check the VMA so that we do not perform
1185          * faults just to hit a X86_PF_PK as soon as we fill in a
1186          * page.
1187          */
1188         if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
1189                                        (error_code & X86_PF_INSTR), foreign))
1190                 return 1;
1191
1192         if (error_code & X86_PF_WRITE) {
1193                 /* write, present and write, not present: */
1194                 if (unlikely(!(vma->vm_flags & VM_WRITE)))
1195                         return 1;
1196                 return 0;
1197         }
1198
1199         /* read, present: */
1200         if (unlikely(error_code & X86_PF_PROT))
1201                 return 1;
1202
1203         /* read, not present: */
1204         if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
1205                 return 1;
1206
1207         return 0;
1208 }
1209
1210 static int fault_in_kernel_space(unsigned long address)
1211 {
1212         return address >= TASK_SIZE_MAX;
1213 }
1214
1215 static inline bool smap_violation(int error_code, struct pt_regs *regs)
1216 {
1217         if (!IS_ENABLED(CONFIG_X86_SMAP))
1218                 return false;
1219
1220         if (!static_cpu_has(X86_FEATURE_SMAP))
1221                 return false;
1222
1223         if (error_code & X86_PF_USER)
1224                 return false;
1225
1226         if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
1227                 return false;
1228
1229         return true;
1230 }
1231
/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * @regs:       register state at the time of the fault
 * @error_code: page fault error code (X86_PF_* bits)
 * @address:    faulting address, read from CR2 by the caller
 *
 * Rough outline:
 *  1. give mmiotrace a chance to claim the fault;
 *  2. faults on kernel addresses: vmalloc fault, spurious fault, kprobes,
 *     otherwise bad_area_nosemaphore() - all without taking mmap_sem;
 *  3. faults on user addresses: early rejects (kprobes, reserved bits,
 *     SMAP, faults disabled / no mm), then take mmap_sem, look up the
 *     VMA, check permissions and call handle_mm_fault(), retrying at
 *     most once;
 *  4. error dispatch and major/minor fault accounting.
 */
static noinline void
__do_page_fault(struct pt_regs *regs, unsigned long error_code,
                unsigned long address)
{
        struct vm_area_struct *vma;
        struct task_struct *tsk;
        struct mm_struct *mm;
        int fault, major = 0;
        unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
        u32 pkey;

        tsk = current;
        mm = tsk->mm;

        /*
         * Prefetch mmap_sem: the user-address path below acquires it.
         * Only the address is computed here; mm itself is checked for
         * NULL further down, before it is dereferenced.
         */
        prefetchw(&mm->mmap_sem);

        /* A non-zero return means mmiotrace handled the fault */
        if (unlikely(kmmio_fault(regs, address)))
                return;

        /*
         * We fault-in kernel-space virtual memory on-demand. The
         * 'reference' page table is init_mm.pgd.
         *
         * NOTE! We MUST NOT take any locks for this case. We may
         * be in an interrupt or a critical region, and should
         * only copy the information from the master page table,
         * nothing more.
         *
         * This verifies that the fault happens in kernel space
         * (error_code & 4) == 0, and that the fault was not a
         * protection error (error_code & 9) == 0.
         */
        if (unlikely(fault_in_kernel_space(address))) {
                if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
                        /* A non-negative return means the fault was fixed up */
                        if (vmalloc_fault(address) >= 0)
                                return;
                }

                /* Can handle a stale RO->RW TLB: */
                if (spurious_fault(error_code, address))
                        return;

                /* kprobes don't want to hook the spurious faults: */
                if (kprobes_fault(regs))
                        return;
                /*
                 * Don't take the mm semaphore here. If we fixup a prefetch
                 * fault we could otherwise deadlock:
                 */
                bad_area_nosemaphore(regs, error_code, address, NULL);

                return;
        }

        /* kprobes don't want to hook the spurious faults: */
        if (unlikely(kprobes_fault(regs)))
                return;

        /*
         * Reserved bit set in a page-table entry.
         * NOTE(review): there is no return after this call; presumably
         * pgtable_bad() does not return - confirm.
         */
        if (unlikely(error_code & X86_PF_RSVD))
                pgtable_bad(regs, error_code, address);

        if (unlikely(smap_violation(error_code, regs))) {
                bad_area_nosemaphore(regs, error_code, address, NULL);
                return;
        }

        /*
         * If we're in an interrupt, have no user context or are running
         * in a region with pagefaults disabled then we must not take the fault
         */
        if (unlikely(faulthandler_disabled() || !mm)) {
                bad_area_nosemaphore(regs, error_code, address, NULL);
                return;
        }

        /*
         * It's safe to allow irq's after cr2 has been saved and the
         * vmalloc fault has been handled.
         *
         * User-mode registers count as a user access even for any
         * potential system fault or CPU buglet:
         */
        if (user_mode(regs)) {
                local_irq_enable();
                error_code |= X86_PF_USER;
                flags |= FAULT_FLAG_USER;
        } else {
                /* Only re-enable irqs if the interrupted context had them on */
                if (regs->flags & X86_EFLAGS_IF)
                        local_irq_enable();
        }

        perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);

        /* Translate the hardware error code into FAULT_FLAG_* bits */
        if (error_code & X86_PF_WRITE)
                flags |= FAULT_FLAG_WRITE;
        if (error_code & X86_PF_INSTR)
                flags |= FAULT_FLAG_INSTRUCTION;

        /*
         * When running in the kernel we expect faults to occur only to
         * addresses in user space.  All other faults represent errors in
         * the kernel and should generate an OOPS.  Unfortunately, in the
         * case of an erroneous fault occurring in a code path which already
         * holds mmap_sem we will deadlock attempting to validate the fault
         * against the address space.  Luckily the kernel only validly
         * references user space from well defined areas of code, which are
         * listed in the exceptions table.
         *
         * As the vast majority of faults will be valid we will only perform
         * the source reference check when there is a possibility of a
         * deadlock. Attempt to lock the address space, if we cannot we then
         * validate the source. If this is invalid we can skip the address
         * space check, thus avoiding the deadlock:
         */
        if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
                if (!(error_code & X86_PF_USER) &&
                    !search_exception_tables(regs->ip)) {
                        bad_area_nosemaphore(regs, error_code, address, NULL);
                        return;
                }
                /*
                 * Also the re-entry point after VM_FAULT_RETRY, at which
                 * time mmap_sem is no longer held and must be retaken.
                 */
retry:
                down_read(&mm->mmap_sem);
        } else {
                /*
                 * The above down_read_trylock() might have succeeded in
                 * which case we'll have missed the might_sleep() from
                 * down_read():
                 */
                might_sleep();
        }

        /*
         * From here until up_read() or VM_FAULT_RETRY, mmap_sem is held
         * for read; the bad_area*() helpers used below release it.
         */
        vma = find_vma(mm, address);
        if (unlikely(!vma)) {
                bad_area(regs, error_code, address);
                return;
        }
        if (likely(vma->vm_start <= address))
                goto good_area;
        /* Address lies below the VMA: only acceptable for a growing stack */
        if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
                bad_area(regs, error_code, address);
                return;
        }
        if (error_code & X86_PF_USER) {
                /*
                 * Accessing the stack below %sp is always a bug.
                 * The large cushion allows instructions like enter
                 * and pusha to work. ("enter $65535, $31" pushes
                 * 32 pointers and then decrements %sp by 65535.)
                 */
                if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
                        bad_area(regs, error_code, address);
                        return;
                }
        }
        /* A non-zero return from expand_stack() is treated as failure */
        if (unlikely(expand_stack(vma, address))) {
                bad_area(regs, error_code, address);
                return;
        }

        /*
         * Ok, we have a good vm_area for this memory access, so
         * we can handle it..
         */
good_area:
        if (unlikely(access_error(error_code, vma))) {
                bad_area_access_error(regs, error_code, address, vma);
                return;
        }

        /*
         * If for any reason at all we couldn't handle the fault,
         * make sure we exit gracefully rather than endlessly redo
         * the fault.  Since we never set FAULT_FLAG_RETRY_NOWAIT, if
         * we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
         *
         * Note that handle_userfault() may also release and reacquire mmap_sem
         * (and not return with VM_FAULT_RETRY), when returning to userland to
         * repeat the page fault later with a VM_FAULT_NOPAGE retval
         * (potentially after handling any pending signal during the return to
         * userland). The return to userland is identified whenever
         * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags.
         * Thus we have to be careful about not touching vma after handling the
         * fault, so we read the pkey beforehand.
         */
        pkey = vma_pkey(vma);
        fault = handle_mm_fault(vma, address, flags);
        /* Accumulates across the retry so a major first attempt is remembered */
        major |= fault & VM_FAULT_MAJOR;

        /*
         * If we need to retry the mmap_sem has already been released,
         * and if there is a fatal signal pending there is no guarantee
         * that we made any progress. Handle this case first.
         */
        if (unlikely(fault & VM_FAULT_RETRY)) {
                /* Retry at most once */
                if (flags & FAULT_FLAG_ALLOW_RETRY) {
                        flags &= ~FAULT_FLAG_ALLOW_RETRY;
                        flags |= FAULT_FLAG_TRIED;
                        if (!fatal_signal_pending(tsk))
                                goto retry;
                }

                /* User mode? Just return to handle the fatal exception */
                if (flags & FAULT_FLAG_USER)
                        return;

                /* Not returning to user mode? Handle exceptions or die: */
                no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
                return;
        }

        up_read(&mm->mmap_sem);
        /* mmap_sem is already dropped, so use the saved pkey, not the vma */
        if (unlikely(fault & VM_FAULT_ERROR)) {
                mm_fault_error(regs, error_code, address, &pkey, fault);
                return;
        }

        /*
         * Major/minor page fault accounting. If any of the events
         * returned VM_FAULT_MAJOR, we account it as a major fault.
         */
        if (major) {
                tsk->maj_flt++;
                perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
        } else {
                tsk->min_flt++;
                perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
        }

        check_v8086_mode(regs, address, tsk);
}
NOKPROBE_SYMBOL(__do_page_fault);
1473
1474 static nokprobe_inline void
1475 trace_page_fault_entries(unsigned long address, struct pt_regs *regs,
1476                          unsigned long error_code)
1477 {
1478         if (user_mode(regs))
1479                 trace_page_fault_user(address, regs, error_code);
1480         else
1481                 trace_page_fault_kernel(address, regs, error_code);
1482 }
1483
1484 /*
1485  * We must have this function blacklisted from kprobes, tagged with notrace
1486  * and call read_cr2() before calling anything else. To avoid calling any
1487  * kind of tracing machinery before we've observed the CR2 value.
1488  *
1489  * exception_{enter,exit}() contains all sorts of tracepoints.
1490  */
1491 dotraplinkage void notrace
1492 do_page_fault(struct pt_regs *regs, unsigned long error_code)
1493 {
1494         unsigned long address = read_cr2(); /* Get the faulting address */
1495         enum ctx_state prev_state;
1496
1497         prev_state = exception_enter();
1498         if (trace_pagefault_enabled())
1499                 trace_page_fault_entries(address, regs, error_code);
1500
1501         __do_page_fault(regs, error_code, address);
1502         exception_exit(prev_state);
1503 }
1504 NOKPROBE_SYMBOL(do_page_fault);