// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/fault.c
 *
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 1995-2004 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/acpi.h>
#include <linux/bitfield.h>
#include <linux/extable.h>
#include <linux/kfence.h>
#include <linux/signal.h>
#include <linux/hardirq.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/highmem.h>
#include <linux/perf_event.h>
#include <linux/pkeys.h>
#include <linux/preempt.h>
#include <linux/hugetlb.h>

#include <asm/cmpxchg.h>
#include <asm/cpufeature.h>
#include <asm/exception.h>
#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
#include <asm/kprobes.h>
#include <asm/processor.h>
#include <asm/sysreg.h>
#include <asm/system_misc.h>
#include <asm/tlbflush.h>
#include <asm/traps.h>
struct fault_info {
	int	(*fn)(unsigned long far, unsigned long esr,
		      struct pt_regs *regs);
	int	sig;
	int	code;
	const char *name;
};

static const struct fault_info fault_info[];
static struct fault_info debug_fault_info[];
static inline const struct fault_info *esr_to_fault_info(unsigned long esr)
{
	return fault_info + (esr & ESR_ELx_FSC);
}
static inline const struct fault_info *esr_to_debug_fault_info(unsigned long esr)
{
	return debug_fault_info + DBG_ESR_EVT(esr);
}
static void data_abort_decode(unsigned long esr)
{
	unsigned long iss2 = ESR_ELx_ISS2(esr);

	pr_alert("Data abort info:\n");

	if (esr & ESR_ELx_ISV) {
		pr_alert("  Access size = %u byte(s)\n",
			 1U << ((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT));
		pr_alert("  SSE = %lu, SRT = %lu\n",
			 (esr & ESR_ELx_SSE) >> ESR_ELx_SSE_SHIFT,
			 (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT);
		pr_alert("  SF = %lu, AR = %lu\n",
			 (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT,
			 (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT);
	} else {
		pr_alert("  ISV = 0, ISS = 0x%08lx, ISS2 = 0x%08lx\n",
			 esr & ESR_ELx_ISS_MASK, iss2);
	}

	pr_alert("  CM = %lu, WnR = %lu, TnD = %lu, TagAccess = %lu\n",
		 (esr & ESR_ELx_CM) >> ESR_ELx_CM_SHIFT,
		 (esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT,
		 (iss2 & ESR_ELx_TnD) >> ESR_ELx_TnD_SHIFT,
		 (iss2 & ESR_ELx_TagAccess) >> ESR_ELx_TagAccess_SHIFT);

	pr_alert("  GCS = %ld, Overlay = %lu, DirtyBit = %lu, Xs = %llu\n",
		 (iss2 & ESR_ELx_GCS) >> ESR_ELx_GCS_SHIFT,
		 (iss2 & ESR_ELx_Overlay) >> ESR_ELx_Overlay_SHIFT,
		 (iss2 & ESR_ELx_DirtyBit) >> ESR_ELx_DirtyBit_SHIFT,
		 (iss2 & ESR_ELx_Xs_MASK) >> ESR_ELx_Xs_SHIFT);
}
static void mem_abort_decode(unsigned long esr)
{
	pr_alert("Mem abort info:\n");

	pr_alert("  ESR = 0x%016lx\n", esr);
	pr_alert("  EC = 0x%02lx: %s, IL = %u bits\n",
		 ESR_ELx_EC(esr), esr_get_class_string(esr),
		 (esr & ESR_ELx_IL) ? 32 : 16);
	pr_alert("  SET = %lu, FnV = %lu\n",
		 (esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT,
		 (esr & ESR_ELx_FnV) >> ESR_ELx_FnV_SHIFT);
	pr_alert("  EA = %lu, S1PTW = %lu\n",
		 (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT,
		 (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT);
	pr_alert("  FSC = 0x%02lx: %s\n", (esr & ESR_ELx_FSC),
		 esr_to_fault_info(esr)->name);

	if (esr_is_data_abort(esr))
		data_abort_decode(esr);
}
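/*
 * Illustrative only (not taken from a real crash): for a read data abort
 * with ESR 0x96000007 (DABT from the current EL, level 3 translation
 * fault, ISV clear), the decode above would print roughly:
 *
 *   Mem abort info:
 *     ESR = 0x0000000096000007
 *     EC = 0x25: DABT (current EL), IL = 32 bits
 *     SET = 0, FnV = 0
 *     EA = 0, S1PTW = 0
 *     FSC = 0x07: level 3 translation fault
 *   Data abort info:
 *     ISV = 0, ISS = 0x00000007, ISS2 = 0x00000000
 *     CM = 0, WnR = 0, TnD = 0, TagAccess = 0
 *     GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0
 */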
static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm)
{
	/* Either init_pg_dir or swapper_pg_dir */
	if (mm == &init_mm)
		return __pa_symbol(mm->pgd);

	return (unsigned long)virt_to_phys(mm->pgd);
}
/*
 * Dump out the page tables associated with 'addr' in the currently active mm.
 */
static void show_pte(unsigned long addr)
{
	struct mm_struct *mm;
	pgd_t *pgdp;
	pgd_t pgd;

	if (is_ttbr0_addr(addr)) {
		/* TTBR0 */
		mm = current->active_mm;
		if (mm == &init_mm) {
			pr_alert("[%016lx] user address but active_mm is swapper\n",
				 addr);
			return;
		}
	} else if (is_ttbr1_addr(addr)) {
		/* TTBR1 */
		mm = &init_mm;
	} else {
		pr_alert("[%016lx] address between user and kernel address ranges\n",
			 addr);
		return;
	}

	pr_alert("%s pgtable: %luk pages, %llu-bit VAs, pgdp=%016lx\n",
		 mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
		 vabits_actual, mm_to_pgd_phys(mm));
	pgdp = pgd_offset(mm, addr);
	pgd = READ_ONCE(*pgdp);
	pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));

	do {
		p4d_t *p4dp, p4d;
		pud_t *pudp, pud;
		pmd_t *pmdp, pmd;
		pte_t *ptep, pte;

		if (pgd_none(pgd) || pgd_bad(pgd))
			break;

		p4dp = p4d_offset(pgdp, addr);
		p4d = READ_ONCE(*p4dp);
		pr_cont(", p4d=%016llx", p4d_val(p4d));
		if (p4d_none(p4d) || p4d_bad(p4d))
			break;

		pudp = pud_offset(p4dp, addr);
		pud = READ_ONCE(*pudp);
		pr_cont(", pud=%016llx", pud_val(pud));
		if (pud_none(pud) || pud_bad(pud))
			break;

		pmdp = pmd_offset(pudp, addr);
		pmd = READ_ONCE(*pmdp);
		pr_cont(", pmd=%016llx", pmd_val(pmd));
		if (pmd_none(pmd) || pmd_bad(pmd))
			break;

		ptep = pte_offset_map(pmdp, addr);
		if (!ptep)
			break;

		pte = __ptep_get(ptep);
		pr_cont(", pte=%016llx", pte_val(pte));
		pte_unmap(ptep);
	} while(0);

	pr_cont("\n");
}
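/*
 * Example of the resulting diagnostic for a user address (all values below
 * are made up for illustration):
 *
 *   user pgtable: 4k pages, 48-bit VAs, pgdp=00000001058d2000
 *   [0000aaaadeadbeef] pgd=08000001058d3003, p4d=08000001058d3003,
 *   pud=08000001058d4003, pmd=0000000000000000
 *
 * The walk stops at the first level that is none/bad, the pmd in this case.
 */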
/*
 * This function sets the access flags (dirty, accessed), as well as write
 * permission, and only to a more permissive setting.
 *
 * It needs to cope with hardware update of the accessed/dirty state by other
 * agents in the system and can safely skip the __sync_icache_dcache() call as,
 * like __set_ptes(), the PTE is never changed from no-exec to exec here.
 *
 * Returns whether or not the PTE actually changed.
 */
int __ptep_set_access_flags(struct vm_area_struct *vma,
			    unsigned long address, pte_t *ptep,
			    pte_t entry, int dirty)
{
	pteval_t old_pteval, pteval;
	pte_t pte = __ptep_get(ptep);

	if (pte_same(pte, entry))
		return 0;

	/* only preserve the access flags and write permission */
	pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY;

	/*
	 * Setting the flags must be done atomically to avoid racing with the
	 * hardware update of the access/dirty state. The PTE_RDONLY bit must
	 * be set to the most permissive (lowest value) of *ptep and entry
	 * (calculated as: a & b == ~(~a | ~b)).
	 */
	pte_val(entry) ^= PTE_RDONLY;
	pteval = pte_val(pte);
	do {
		old_pteval = pteval;
		pteval ^= PTE_RDONLY;
		pteval |= pte_val(entry);
		pteval ^= PTE_RDONLY;
		pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
	} while (pteval != old_pteval);

	/* Invalidate a stale read-only entry */
	if (dirty)
		flush_tlb_page(vma, address);
	return 1;
}
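/*
 * Worked example of the PTE_RDONLY handling above (illustrative, not part of
 * the original code flow): pte_val(entry) has already had PTE_RDONLY
 * inverted, so inside the loop
 *
 *	pteval ^= PTE_RDONLY;		// invert RDONLY in the current value
 *	pteval |= pte_val(entry);	// plain OR merges all the other bits
 *	pteval ^= PTE_RDONLY;		// invert back: ~(~a | ~b) == a & b
 *
 * ORs every bit except PTE_RDONLY, which is effectively ANDed instead. The
 * merged PTE is therefore read-only only if both the old PTE and the new
 * entry were read-only, i.e. it is never less permissive than either input.
 */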
static bool is_el1_instruction_abort(unsigned long esr)
{
	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
}
static bool is_el1_data_abort(unsigned long esr)
{
	return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_CUR;
}
static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr,
					   struct pt_regs *regs)
{
	if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr))
		return false;

	if (esr_fsc_is_permission_fault(esr))
		return true;

	if (is_ttbr0_addr(addr) && system_uses_ttbr0_pan())
		return esr_fsc_is_translation_fault(esr) &&
			(regs->pstate & PSR_PAN_BIT);

	return false;
}
static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
							unsigned long esr,
							struct pt_regs *regs)
{
	unsigned long flags;
	u64 par, dfsc;

	if (!is_el1_data_abort(esr) || !esr_fsc_is_translation_fault(esr))
		return false;

	local_irq_save(flags);
	asm volatile("at s1e1r, %0" :: "r" (addr));
	isb();
	par = read_sysreg_par();
	local_irq_restore(flags);

	/*
	 * If we now have a valid translation, treat the translation fault as
	 * spurious.
	 */
	if (!(par & SYS_PAR_EL1_F))
		return true;

	/*
	 * If we got a different type of fault from the AT instruction,
	 * treat the translation fault as spurious.
	 */
	dfsc = FIELD_GET(SYS_PAR_EL1_FST, par);
	return !esr_fsc_is_translation_fault(dfsc);
}
static void die_kernel_fault(const char *msg, unsigned long addr,
			     unsigned long esr, struct pt_regs *regs)
{
	bust_spinlocks(1);

	pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
		 addr);

	kasan_non_canonical_hook(addr);

	mem_abort_decode(esr);

	show_pte(addr);
	die("Oops", regs, esr);
	bust_spinlocks(0);
	make_task_dead(SIGKILL);
}
#ifdef CONFIG_KASAN_HW_TAGS
static void report_tag_fault(unsigned long addr, unsigned long esr,
			     struct pt_regs *regs)
{
	/*
	 * SAS bits aren't set for all faults reported in EL1, so we can't
	 * find out access size.
	 */
	bool is_write = !!(esr & ESR_ELx_WNR);

	kasan_report((void *)addr, 0, is_write, regs->pc);
}
#else
/* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */
static inline void report_tag_fault(unsigned long addr, unsigned long esr,
				    struct pt_regs *regs) { }
#endif
static void do_tag_recovery(unsigned long addr, unsigned long esr,
			    struct pt_regs *regs)
{
	report_tag_fault(addr, esr, regs);

	/*
	 * Disable MTE Tag Checking on the local CPU for the current EL.
	 * It will be done lazily on the other CPUs when they will hit a
	 * tag fault.
	 */
	sysreg_clear_set(sctlr_el1, SCTLR_EL1_TCF_MASK,
			 SYS_FIELD_PREP_ENUM(SCTLR_EL1, TCF, NONE));
	isb();
}
static bool is_el1_mte_sync_tag_check_fault(unsigned long esr)
{
	unsigned long fsc = esr & ESR_ELx_FSC;

	if (!is_el1_data_abort(esr))
		return false;

	if (fsc == ESR_ELx_FSC_MTE)
		return true;

	return false;
}
static void __do_kernel_fault(unsigned long addr, unsigned long esr,
			      struct pt_regs *regs)
{
	const char *msg;

	/*
	 * Are we prepared to handle this kernel fault?
	 * We are almost certainly not prepared to handle instruction faults.
	 */
	if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
		return;

	if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
			   "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
		return;

	if (is_el1_mte_sync_tag_check_fault(esr)) {
		do_tag_recovery(addr, esr, regs);

		return;
	}

	if (is_el1_permission_fault(addr, esr, regs)) {
		if (esr & ESR_ELx_WNR)
			msg = "write to read-only memory";
		else if (is_el1_instruction_abort(esr))
			msg = "execute from non-executable memory";
		else
			msg = "read from unreadable memory";
	} else if (addr < PAGE_SIZE) {
		msg = "NULL pointer dereference";
	} else {
		if (esr_fsc_is_translation_fault(esr) &&
		    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
			return;

		msg = "paging request";
	}

	if (efi_runtime_fixup_exception(regs, msg))
		return;

	die_kernel_fault(msg, addr, esr, regs);
}
static void set_thread_esr(unsigned long address, unsigned long esr)
{
	current->thread.fault_address = address;

	/*
	 * If the faulting address is in the kernel, we must sanitize the ESR.
	 * From userspace's point of view, kernel-only mappings don't exist
	 * at all, so we report them as level 0 translation faults.
	 * (This is not quite the way that "no mapping there at all" behaves:
	 * an alignment fault not caused by the memory type would take
	 * precedence over translation fault for a real access to empty
	 * space. Unfortunately we can't easily distinguish "alignment fault
	 * not caused by memory type" from "alignment fault caused by memory
	 * type", so we ignore this wrinkle and just return the translation
	 * fault.)
	 */
	if (!is_ttbr0_addr(current->thread.fault_address)) {
		switch (ESR_ELx_EC(esr)) {
		case ESR_ELx_EC_DABT_LOW:
			/*
			 * These bits provide only information about the
			 * faulting instruction, which userspace knows already.
			 * We explicitly clear bits which are architecturally
			 * RES0 in case they are given meanings in future.
			 * We always report the ESR as if the fault was taken
			 * to EL1 and so ISV and the bits in ISS[23:14] are
			 * clear. (In fact it always will be a fault to EL1.)
			 */
			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL |
				ESR_ELx_CM | ESR_ELx_WNR;
			esr |= ESR_ELx_FSC_FAULT;
			break;
		case ESR_ELx_EC_IABT_LOW:
			/*
			 * Claim a level 0 translation fault.
			 * All other bits are architecturally RES0 for faults
			 * reported with that DFSC value, so we clear them.
			 */
			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL;
			esr |= ESR_ELx_FSC_FAULT;
			break;
		default:
			/*
			 * This should never happen (entry.S only brings us
			 * into this code for insn and data aborts from a lower
			 * exception level). Fail safe by not providing an ESR
			 * context record at all.
			 */
			WARN(1, "ESR 0x%lx is not DABT or IABT from EL0\n", esr);
			esr = 0;
			break;
		}
	}

	current->thread.fault_code = esr;
}
static void do_bad_area(unsigned long far, unsigned long esr,
			struct pt_regs *regs)
{
	unsigned long addr = untagged_addr(far);

	/*
	 * If we are in kernel mode at this point, we have no context to
	 * handle this fault with.
	 */
	if (user_mode(regs)) {
		const struct fault_info *inf = esr_to_fault_info(esr);

		set_thread_esr(addr, esr);
		arm64_force_sig_fault(inf->sig, inf->code, far, inf->name);
	} else {
		__do_kernel_fault(addr, esr, regs);
	}
}
static bool fault_from_pkey(unsigned long esr, struct vm_area_struct *vma,
			    unsigned int mm_flags)
{
	unsigned long iss2 = ESR_ELx_ISS2(esr);

	if (!system_supports_poe())
		return false;

	if (esr_fsc_is_permission_fault(esr) && (iss2 & ESR_ELx_Overlay))
		return true;

	return !arch_vma_access_permitted(vma,
			mm_flags & FAULT_FLAG_WRITE,
			mm_flags & FAULT_FLAG_INSTRUCTION,
			false);
}
static bool is_el0_instruction_abort(unsigned long esr)
{
	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
}
/*
 * Note: not valid for EL1 DC IVAC, but we never use that in a way that
 * should fault. EL0 cannot issue DC IVAC (undef).
 */
static bool is_write_abort(unsigned long esr)
{
	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
}
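/*
 * Summary of the flow below (descriptive comment, not from the original
 * source): the fault is first handled under the per-VMA lock taken by
 * lock_vma_under_rcu(); only when that path cannot make progress (no VMA,
 * or handle_mm_fault() requests a retry) do we fall back to the mmap read
 * lock via lock_mm_and_find_vma(). Access, pkey and unhandled VM_FAULT_*
 * errors become signals for user mode, or go to __do_kernel_fault() when
 * there is no user context to return to.
 */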
static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
				   struct pt_regs *regs)
{
	const struct fault_info *inf;
	struct mm_struct *mm = current->mm;
	vm_fault_t fault;
	unsigned long vm_flags;
	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
	unsigned long addr = untagged_addr(far);
	struct vm_area_struct *vma;
	int si_code;
	int pkey = -1;

	if (kprobe_page_fault(regs, esr))
		return 0;

	/*
	 * If we're in an interrupt or have no user context, we must not take
	 * the fault.
	 */
	if (faulthandler_disabled() || !mm)
		goto no_context;

	if (user_mode(regs))
		mm_flags |= FAULT_FLAG_USER;

	/*
	 * vm_flags tells us what bits we must have in vma->vm_flags for the
	 * fault to be benign; __do_page_fault() checks vma->vm_flags & vm_flags
	 * and returns an error if the intersection is empty.
	 */
	if (is_el0_instruction_abort(esr)) {
		/* It was exec fault */
		vm_flags = VM_EXEC;
		mm_flags |= FAULT_FLAG_INSTRUCTION;
	} else if (is_write_abort(esr)) {
		/* It was write fault */
		vm_flags = VM_WRITE;
		mm_flags |= FAULT_FLAG_WRITE;
	} else {
		/* It was read fault */
		vm_flags = VM_READ;
		/* Write implies read */
		vm_flags |= VM_WRITE;
		/* If EPAN is absent then exec implies read */
		if (!alternative_has_cap_unlikely(ARM64_HAS_EPAN))
			vm_flags |= VM_EXEC;
	}

	if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
		if (is_el1_instruction_abort(esr))
			die_kernel_fault("execution of user memory",
					 addr, esr, regs);

		if (!search_exception_tables(regs->pc))
			die_kernel_fault("access to user memory outside uaccess routines",
					 addr, esr, regs);
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);

	if (!(mm_flags & FAULT_FLAG_USER))
		goto lock_mmap;

	vma = lock_vma_under_rcu(mm, addr);
	if (!vma)
		goto lock_mmap;

	if (!(vma->vm_flags & vm_flags)) {
		vma_end_read(vma);
		fault = 0;
		si_code = SEGV_ACCERR;
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto bad_area;
	}

	if (fault_from_pkey(esr, vma, mm_flags)) {
		pkey = vma_pkey(vma);
		vma_end_read(vma);
		fault = 0;
		si_code = SEGV_PKUERR;
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto bad_area;
	}

	fault = handle_mm_fault(vma, addr, mm_flags | FAULT_FLAG_VMA_LOCK, regs);
	if (!(fault & (VM_FAULT_RETRY | VM_FAULT_COMPLETED)))
		vma_end_read(vma);

	if (!(fault & VM_FAULT_RETRY)) {
		count_vm_vma_lock_event(VMA_LOCK_SUCCESS);
		goto done;
	}
	count_vm_vma_lock_event(VMA_LOCK_RETRY);
	if (fault & VM_FAULT_MAJOR)
		mm_flags |= FAULT_FLAG_TRIED;

	/* Quick path to respond to signals */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			goto no_context;
		return 0;
	}
lock_mmap:

retry:
	vma = lock_mm_and_find_vma(mm, addr, regs);
	if (unlikely(!vma)) {
		fault = 0;
		si_code = SEGV_MAPERR;
		goto bad_area;
	}

	if (!(vma->vm_flags & vm_flags)) {
		mmap_read_unlock(mm);
		fault = 0;
		si_code = SEGV_ACCERR;
		goto bad_area;
	}

	if (fault_from_pkey(esr, vma, mm_flags)) {
		pkey = vma_pkey(vma);
		mmap_read_unlock(mm);
		fault = 0;
		si_code = SEGV_PKUERR;
		goto bad_area;
	}

	fault = handle_mm_fault(vma, addr, mm_flags, regs);

	/* Quick path to respond to signals */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			goto no_context;
		return 0;
	}

	/* The fault is fully completed (including releasing mmap lock) */
	if (fault & VM_FAULT_COMPLETED)
		goto done;

	if (fault & VM_FAULT_RETRY) {
		mm_flags |= FAULT_FLAG_TRIED;
		goto retry;
	}
	mmap_read_unlock(mm);

done:
	/* Handle the "normal" (no error) case first. */
	if (likely(!(fault & VM_FAULT_ERROR)))
		return 0;

	si_code = SEGV_MAPERR;
bad_area:
	/*
	 * If we are in kernel mode at this point, we have no context to
	 * handle this fault with.
	 */
	if (!user_mode(regs))
		goto no_context;

	if (fault & VM_FAULT_OOM) {
		/*
		 * We ran out of memory, call the OOM killer, and return to
		 * userspace (which will retry the fault, or kill us if we got
		 * oom-killed).
		 */
		pagefault_out_of_memory();
		return 0;
	}

	inf = esr_to_fault_info(esr);
	set_thread_esr(addr, esr);
	if (fault & VM_FAULT_SIGBUS) {
		/*
		 * We had some memory, but were unable to successfully fix up
		 * this page fault.
		 */
		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name);
	} else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
		unsigned int lsb;

		lsb = PAGE_SHIFT;
		if (fault & VM_FAULT_HWPOISON_LARGE)
			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));

		arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
	} else {
		/*
		 * The pkey value that we return to userspace can be different
		 * from the pkey that caused the fault.
		 *
		 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4);
		 * 2. T1   : set POR_EL0 to deny access to pkey=4, touches, page
		 * 3. T1   : faults...
		 * 4. T2   : mprotect_key(foo, PAGE_SIZE, pkey=5);
		 * 5. T1   : enters fault handler, takes mmap_lock, etc...
		 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really
		 *	     faulted on a pte with its pkey=4.
		 */
		/* Something tried to access memory that isn't in our memory map. */
		if (si_code == SEGV_PKUERR)
			arm64_force_sig_fault_pkey(far, inf->name, pkey);
		else
			arm64_force_sig_fault(SIGSEGV, si_code, far, inf->name);
	}

	return 0;

no_context:
	__do_kernel_fault(addr, esr, regs);
	return 0;
}
static int __kprobes do_translation_fault(unsigned long far,
					  unsigned long esr,
					  struct pt_regs *regs)
{
	unsigned long addr = untagged_addr(far);

	if (is_ttbr0_addr(addr))
		return do_page_fault(far, esr, regs);

	do_bad_area(far, esr, regs);
	return 0;
}
static int do_alignment_fault(unsigned long far, unsigned long esr,
			      struct pt_regs *regs)
{
	if (IS_ENABLED(CONFIG_COMPAT_ALIGNMENT_FIXUPS) &&
	    compat_user_mode(regs))
		return do_compat_alignment_fixup(far, regs);
	do_bad_area(far, esr, regs);
	return 0;
}
static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
	return 1; /* "fault" */
}
static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
	const struct fault_info *inf;
	unsigned long siaddr;

	inf = esr_to_fault_info(esr);

	if (user_mode(regs) && apei_claim_sea(regs) == 0) {
		/*
		 * APEI claimed this as a firmware-first notification.
		 * Some processing deferred to task_work before ret_to_user().
		 */
		return 0;
	}

	if (esr & ESR_ELx_FnV) {
		siaddr = 0;
	} else {
		/*
		 * The architecture specifies that the tag bits of FAR_EL1 are
		 * UNKNOWN for synchronous external aborts. Mask them out now
		 * so that userspace doesn't see them.
		 */
		siaddr = untagged_addr(far);
	}
	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);

	return 0;
}
static int do_tag_check_fault(unsigned long far, unsigned long esr,
			      struct pt_regs *regs)
{
	/*
	 * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN
	 * for tag check faults. Set them to corresponding bits in the untagged
	 * address.
	 */
	far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
	do_bad_area(far, esr, regs);
	return 0;
}
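/*
 * This table is indexed by the 6-bit FSC field of the ESR (see
 * esr_to_fault_info() above), so it needs exactly 64 entries, one per
 * possible fault status code.
 */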
static const struct fault_info fault_info[] = {
	{ do_bad, SIGKILL, SI_KERNEL, "ttbr address size fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "level 1 address size fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "level 2 address size fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "level 3 address size fault" },
	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" },
	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" },
	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" },
	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" },
	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 0 access flag fault" },
	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" },
	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" },
	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 access flag fault" },
	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 0 permission fault" },
	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" },
	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" },
	{ do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" },
	{ do_sea, SIGBUS, BUS_OBJERR, "synchronous external abort" },
	{ do_tag_check_fault, SIGSEGV, SEGV_MTESERR, "synchronous tag check fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 18" },
	{ do_sea, SIGKILL, SI_KERNEL, "level -1 (translation table walk)" },
	{ do_sea, SIGKILL, SI_KERNEL, "level 0 (translation table walk)" },
	{ do_sea, SIGKILL, SI_KERNEL, "level 1 (translation table walk)" },
	{ do_sea, SIGKILL, SI_KERNEL, "level 2 (translation table walk)" },
	{ do_sea, SIGKILL, SI_KERNEL, "level 3 (translation table walk)" },
	{ do_sea, SIGBUS, BUS_OBJERR, "synchronous parity or ECC error" },	// Reserved when RAS is implemented
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 25" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 26" },
	{ do_sea, SIGKILL, SI_KERNEL, "level -1 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea, SIGKILL, SI_KERNEL, "level 0 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea, SIGKILL, SI_KERNEL, "level 1 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea, SIGKILL, SI_KERNEL, "level 2 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea, SIGKILL, SI_KERNEL, "level 3 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 32" },
	{ do_alignment_fault, SIGBUS, BUS_ADRALN, "alignment fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 34" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 35" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 36" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 37" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 38" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 39" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 40" },
	{ do_bad, SIGKILL, SI_KERNEL, "level -1 address size fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 42" },
	{ do_translation_fault, SIGSEGV, SEGV_MAPERR, "level -1 translation fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 44" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 45" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 46" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 47" },
	{ do_bad, SIGKILL, SI_KERNEL, "TLB conflict abort" },
	{ do_bad, SIGKILL, SI_KERNEL, "Unsupported atomic hardware update fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 50" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 51" },
	{ do_bad, SIGKILL, SI_KERNEL, "implementation fault (lockdown abort)" },
	{ do_bad, SIGBUS, BUS_OBJERR, "implementation fault (unsupported exclusive)" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 54" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 55" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 56" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 57" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 58" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 59" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 60" },
	{ do_bad, SIGKILL, SI_KERNEL, "section domain fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "page domain fault" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 63" },
};
void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
	const struct fault_info *inf = esr_to_fault_info(esr);
	unsigned long addr = untagged_addr(far);

	if (!inf->fn(far, esr, regs))
		return;

	if (!user_mode(regs))
		die_kernel_fault(inf->name, addr, esr, regs);

	/*
	 * At this point we have an unrecognized fault type whose tag bits may
	 * have been defined as UNKNOWN. Therefore we only expose the untagged
	 * address to the signal handler.
	 */
	arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr);
}
NOKPROBE_SYMBOL(do_mem_abort);
void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs)
{
	arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
			 addr, esr);
}
NOKPROBE_SYMBOL(do_sp_pc_abort);
/*
 * __refdata because early_brk64 is __init, but the reference to it is
 * clobbered at arch_initcall time.
 * See traps.c and debug-monitors.c:debug_traps_init().
 */
static struct fault_info __refdata debug_fault_info[] = {
	{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware breakpoint" },
	{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware single-step" },
	{ do_bad, SIGTRAP, TRAP_HWBKPT, "hardware watchpoint" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 3" },
	{ do_bad, SIGTRAP, TRAP_BRKPT, "aarch32 BKPT" },
	{ do_bad, SIGKILL, SI_KERNEL, "aarch32 vector catch" },
	{ early_brk64, SIGTRAP, TRAP_BRKPT, "aarch64 BRK" },
	{ do_bad, SIGKILL, SI_KERNEL, "unknown 7" },
};
void __init hook_debug_fault_code(int nr,
				  int (*fn)(unsigned long, unsigned long, struct pt_regs *),
				  int sig, int code, const char *name)
{
	BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));

	debug_fault_info[nr].fn		= fn;
	debug_fault_info[nr].sig	= sig;
	debug_fault_info[nr].code	= code;
	debug_fault_info[nr].name	= name;
}
/*
 * In debug exception context, we explicitly disable preemption despite
 * having interrupts disabled.
 * This serves two purposes: it makes it much less likely that we would
 * accidentally schedule in exception context and it will force a warning
 * if we somehow manage to schedule by accident.
 */
static void debug_exception_enter(struct pt_regs *regs)
{
	preempt_disable();

	/* This code is a bit fragile. Test it. */
	RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work");
}
NOKPROBE_SYMBOL(debug_exception_enter);

static void debug_exception_exit(struct pt_regs *regs)
{
	preempt_enable_no_resched();
}
NOKPROBE_SYMBOL(debug_exception_exit);
void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
			struct pt_regs *regs)
{
	const struct fault_info *inf = esr_to_debug_fault_info(esr);
	unsigned long pc = instruction_pointer(regs);

	debug_exception_enter(regs);

	if (user_mode(regs) && !is_ttbr0_addr(pc))
		arm64_apply_bp_hardening();

	if (inf->fn(addr_if_watchpoint, esr, regs)) {
		arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr);
	}

	debug_exception_exit(regs);
}
NOKPROBE_SYMBOL(do_debug_exception);
/*
 * Used during anonymous page fault handling.
 */
struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma,
					     unsigned long vaddr)
{
	gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO;

	/*
	 * If the page is mapped with PROT_MTE, initialise the tags at the
	 * point of allocation and page zeroing as this is usually faster than
	 * separate DC ZVA and STGM.
	 */
	if (vma->vm_flags & VM_MTE)
		flags |= __GFP_ZEROTAGS;

	return vma_alloc_folio(flags, 0, vma, vaddr, false);
}
void tag_clear_highpage(struct page *page)
{
	/* Newly allocated page, shouldn't have been tagged yet */
	WARN_ON_ONCE(!try_page_mte_tagging(page));
	mte_zero_clear_page_tags(page_address(page));
	set_page_mte_tagged(page);
}