// SPDX-License-Identifier: GPL-2.0-only
#include <linux/extable.h>
#include <linux/uaccess.h>
#include <linux/sched/debug.h>
#include <linux/bitfield.h>
#include <xen/xen.h>

#include <asm/fpu/api.h>
#include <asm/fred.h>
#include <asm/traps.h>
#include <asm/kdebug.h>
#include <asm/insn-eval.h>
#include <asm/sgx.h>
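/*
 * Map an extable register number (EX_DATA_REG_*) to the corresponding
 * saved-register slot inside struct pt_regs.  If the metadata is corrupt,
 * fall back to a writable dummy so the fixup stays harmless instead of
 * scribbling over the saved registers.
 */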
static inline unsigned long *pt_regs_nr(struct pt_regs *regs, int nr)
{
	int reg_offset = pt_regs_offset(regs, nr);
	static unsigned long __dummy;

	if (WARN_ON_ONCE(reg_offset < 0))
		return &__dummy;

	return (unsigned long *)((unsigned long)regs + reg_offset);
}
static inline unsigned long
ex_fixup_addr(const struct exception_table_entry *x)
{
	return (unsigned long)&x->fixup + x->fixup;
}
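/*
 * Default fixup: optionally clear %rax and/or %rdx as requested by the
 * entry's flag bits, then resume execution at the fixup address.
 */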
static bool ex_handler_default(const struct exception_table_entry *e,
			       struct pt_regs *regs)
{
	if (e->data & EX_FLAG_CLEAR_AX)
		regs->ax = 0;
	if (e->data & EX_FLAG_CLEAR_DX)
		regs->dx = 0;

	regs->ip = ex_fixup_addr(e);
	return true;
}
/*
 * This is the *very* rare case where we do a "load_unaligned_zeropad()"
 * and it's a page crosser into a non-existent page.
 *
 * This happens when we optimistically load a pathname a word-at-a-time
 * and the name is less than the full word and the next page is not
 * mapped. Typically that only happens for CONFIG_DEBUG_PAGEALLOC.
 *
 * NOTE! The faulting address is always a 'mov mem,reg' type instruction
 * of size 'long', and the exception fixup must always point to right
 * after the instruction.
 */
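/*
 * Worked example (hypothetical addresses): an 8-byte load at 0x...ffd
 * crosses into an unmapped page and faults at 0x...000.  Then
 * offset = 5 and the aligned word sits at 0x...ff8, entirely inside
 * the mapped page.  Re-reading that aligned word and shifting it right
 * by offset*8 = 40 bits leaves the 3 valid bytes at the low end of the
 * register and zero-fills the rest, which is exactly the value that
 * load_unaligned_zeropad() promises.
 */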
static bool ex_handler_zeropad(const struct exception_table_entry *e,
			       struct pt_regs *regs,
			       unsigned long fault_addr)
{
	struct insn insn;
	const unsigned long mask = sizeof(long) - 1;
	unsigned long offset, addr, next_ip, len;
	unsigned long *reg;

	next_ip = ex_fixup_addr(e);
	len = next_ip - regs->ip;
	if (len > MAX_INSN_SIZE)
		return false;

	if (insn_decode(&insn, (void *) regs->ip, len, INSN_MODE_KERN))
		return false;
	if (insn.length != len)
		return false;

	/* Must be a 'mov mem,reg' instruction with operand size 'long' */
	if (insn.opcode.bytes[0] != 0x8b)
		return false;
	if (insn.opnd_bytes != sizeof(long))
		return false;

	addr = (unsigned long) insn_get_addr_ref(&insn, regs);
	offset = addr & mask;
	addr = addr & ~mask;

	/* The fault must be at the page boundary just past the aligned word */
	if (fault_addr != addr + sizeof(long))
		return false;

	reg = insn_get_modrm_reg_ptr(&insn, regs);
	if (!reg)
		return false;

	*reg = *(unsigned long *)addr >> (offset * 8);
	return ex_handler_default(e, regs);
}
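/*
 * Hand the trap number to the fixup code in %eax so the caller can
 * distinguish which exception was taken.
 */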
static bool ex_handler_fault(const struct exception_table_entry *fixup,
			     struct pt_regs *regs, int trapnr)
{
	regs->ax = trapnr;
	return ex_handler_default(fixup, regs);
}
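/*
 * Same idea for faults taken on ENCLS: report the trap number in %eax,
 * with SGX_ENCLS_FAULT_FLAG set so the caller can tell a fault apart
 * from a normal ENCLS return value.
 */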
static bool ex_handler_sgx(const struct exception_table_entry *fixup,
			   struct pt_regs *regs, int trapnr)
{
	regs->ax = trapnr | SGX_ENCLS_FAULT_FLAG;
	return ex_handler_default(fixup, regs);
}
/*
 * Handler for when we fail to restore a task's FPU state.  We should never get
 * here because the FPU state of a task using the FPU (task->thread.fpu.state)
 * should always be valid.  However, past bugs have allowed userspace to set
 * reserved bits in the XSAVE area using PTRACE_SETREGSET or sys_rt_sigreturn().
 * These caused XRSTOR to fail when switching to the task, leaking the FPU
 * registers of the task previously executing on the CPU.  Mitigate this class
 * of vulnerability by restoring from the initial state (essentially, zeroing
 * out all the FPU registers) if we can't restore from the task's FPU state.
 */
static bool ex_handler_fprestore(const struct exception_table_entry *fixup,
				 struct pt_regs *regs)
{
	regs->ip = ex_fixup_addr(fixup);

	WARN_ONCE(1, "Bad FPU state detected at %pB, reinitializing FPU registers.",
		  (void *)instruction_pointer(regs));

	fpu_reset_from_exception_fixup();
	return true;
}
/*
 * On x86-64, we end up being imprecise with 'access_ok()', and allow
 * non-canonical user addresses to make the range comparisons simpler,
 * and to not have to worry about LAM being enabled.
 *
 * In fact, we allow up to one page of "slop" at the sign boundary,
 * which means that we can do access_ok() by just checking the sign
 * of the pointer for the common case of having a small access size.
 */
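/*
 * Example of the "slop": a small copy starting just below the canonical
 * boundary may pass the sign-only access_ok() check even though its tail
 * crosses into non-canonical space.  The resulting #GP then faults at an
 * address within one page above the boundary, which the check below still
 * treats as a legitimate (if failed) user access rather than a kernel bug.
 */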
static bool gp_fault_address_ok(unsigned long fault_address)
{
#ifdef CONFIG_X86_64
	/* Is it in the "user space" part of the non-canonical space? */
	if (valid_user_address(fault_address))
		return true;

	/* .. or just above it? */
	fault_address -= PAGE_SIZE;
	if (valid_user_address(fault_address))
		return true;
#endif
	return false;
}
static bool ex_handler_uaccess(const struct exception_table_entry *fixup,
			       struct pt_regs *regs, int trapnr,
			       unsigned long fault_address)
{
	WARN_ONCE(trapnr == X86_TRAP_GP && !gp_fault_address_ok(fault_address),
		  "General protection fault in user access. Non-canonical address?");
	return ex_handler_default(fixup, regs);
}
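/*
 * Common fixup for RDMSR/WRMSR, covering all four variants: 'wrmsr'
 * selects the direction, and 'safe' distinguishes the *_safe()
 * accessors (which expect an error code in 'reg') from unchecked
 * ones (which warn once per call site).
 */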
static bool ex_handler_msr(const struct exception_table_entry *fixup,
			   struct pt_regs *regs, bool wrmsr, bool safe, int reg)
{
	if (__ONCE_LITE_IF(!safe && wrmsr)) {
		pr_warn("unchecked MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
			(unsigned int)regs->cx, (unsigned int)regs->dx,
			(unsigned int)regs->ax, regs->ip, (void *)regs->ip);
		show_stack_regs(regs);
	}

	if (__ONCE_LITE_IF(!safe && !wrmsr)) {
		pr_warn("unchecked MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
			(unsigned int)regs->cx, regs->ip, (void *)regs->ip);
		show_stack_regs(regs);
	}

	if (!wrmsr) {
		/* Pretend that the read succeeded and returned 0. */
		regs->ax = 0;
		regs->dx = 0;
	}

	if (safe)
		*pt_regs_nr(regs, reg) = -EIO;

	return ex_handler_default(fixup, regs);
}
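/*
 * On CPUs with X86_BUG_NULL_SEG, loading a null %fs selector does not
 * clear the segment base, so load a real selector (__USER_DS) first to
 * force the base to zero before writing the null selector.
 */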
static bool ex_handler_clear_fs(const struct exception_table_entry *fixup,
				struct pt_regs *regs)
{
	if (static_cpu_has(X86_BUG_NULL_SEG))
		asm volatile ("mov %0, %%fs" : : "rm" (__USER_DS));
	asm volatile ("mov %0, %%fs" : : "rm" (0));
	return ex_handler_default(fixup, regs);
}
static bool ex_handler_imm_reg(const struct exception_table_entry *fixup,
			       struct pt_regs *regs, int reg, int imm)
{
	*pt_regs_nr(regs, reg) = (long)imm;
	return ex_handler_default(fixup, regs);
}
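/*
 * User-copy functions report how many bytes were left to copy in %rcx,
 * but some of them count in larger units.  Reconstruct the remaining
 * byte count as imm * %rcx plus the partial-unit remainder held in
 * 'reg' before taking the normal uaccess fixup path.
 */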
static bool ex_handler_ucopy_len(const struct exception_table_entry *fixup,
				 struct pt_regs *regs, int trapnr,
				 unsigned long fault_address,
				 int reg, int imm)
{
	regs->cx = imm * regs->cx + *pt_regs_nr(regs, reg);
	return ex_handler_uaccess(fixup, regs, trapnr, fault_address);
}
#ifdef CONFIG_X86_FRED
static bool ex_handler_eretu(const struct exception_table_entry *fixup,
			     struct pt_regs *regs, unsigned long error_code)
{
	struct pt_regs *uregs = (struct pt_regs *)(regs->sp - offsetof(struct pt_regs, orig_ax));
	unsigned short ss = uregs->ss;
	unsigned short cs = uregs->cs;

	/*
	 * Move the NMI bit from the invalid stack frame, which caused ERETU
	 * to fault, to the fault handler's stack frame, thus to unblock NMI
	 * with the fault handler's ERETS instruction ASAP if NMI is blocked.
	 */
	regs->fred_ss.nmi = uregs->fred_ss.nmi;

	/*
	 * Sync event information to uregs, i.e., the ERETU return frame, but
	 * is it safe to write to the ERETU return frame which is just above
	 * current event stack frame?
	 *
	 * The RSP used by FRED to push a stack frame is not the value in %rsp,
	 * it is calculated from %rsp with the following 2 steps:
	 * 1) RSP = %rsp - (IA32_FRED_CONFIG & 0x1c0)	// Reserve N*64 bytes
	 * 2) RSP = RSP & ~0x3f				// Align to a 64-byte cache line
	 * when an event delivery doesn't trigger a stack level change.
	 *
	 * Here is an example with N*64 (N=1) bytes reserved:
	 *
	 *  64-byte cache line ==> ______________
	 *                        |___Reserved___|
	 *                        |__Event_data__|
	 *                        |_____SS_______|
	 *                        |_____RSP______|
	 *                        |_____FLAGS____|
	 *                        |_____CS_______|
	 *                        |_____IP_______|
	 *  64-byte cache line ==> |__Error_code__| <== ERETU return frame
	 *                        |______________|
	 *                        |______________|
	 *                        |______________|
	 *                        |______________|
	 *                        |______________|
	 *                        |______________|
	 *                        |______________|
	 *  64-byte cache line ==> |______________| <== RSP after step 1) and 2)
	 *                        |___Reserved___|
	 *                        |__Event_data__|
	 *                        |_____SS_______|
	 *                        |_____RSP______|
	 *                        |_____FLAGS____|
	 *                        |_____CS_______|
	 *                        |_____IP_______|
	 *  64-byte cache line ==> |__Error_code__| <== ERETS return frame
	 *
	 * Thus a new FRED stack frame will always be pushed below a previous
	 * FRED stack frame ((N*64) bytes may be reserved between), and it is
	 * safe to write to a previous FRED stack frame as they never overlap.
	 */
	fred_info(uregs)->edata = fred_event_data(regs);
	uregs->ssx = regs->ssx;
	uregs->fred_ss.ss = ss;
	/* The NMI bit was moved away above */
	uregs->fred_ss.nmi = 0;
	uregs->csx = regs->csx;
	uregs->fred_cs.sl = 0;
	uregs->fred_cs.wfe = 0;
	uregs->cs = cs;
	uregs->orig_ax = error_code;

	return ex_handler_default(fixup, regs);
}
#endif
int ex_get_fixup_type(unsigned long ip)
{
	const struct exception_table_entry *e = search_exception_tables(ip);

	return e ? FIELD_GET(EX_DATA_TYPE_MASK, e->data) : EX_TYPE_NONE;
}
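/*
 * Central dispatcher: look up the exception table entry covering the
 * faulting instruction, decode the type/reg/imm fields packed into its
 * data word, and invoke the matching handler.  Returns nonzero if the
 * exception was fixed up, 0 if no entry covers regs->ip.
 */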
int fixup_exception(struct pt_regs *regs, int trapnr, unsigned long error_code,
		    unsigned long fault_addr)
{
	const struct exception_table_entry *e;
	int type, reg, imm;

#ifdef CONFIG_PNPBIOS
	if (unlikely(SEGMENT_IS_PNP_CODE(regs->cs))) {
		extern u32 pnp_bios_fault_eip, pnp_bios_fault_esp;
		extern u32 pnp_bios_is_utter_crap;
		pnp_bios_is_utter_crap = 1;
		printk(KERN_CRIT "PNPBIOS fault.. attempting recovery.\n");
		__asm__ volatile(
			"movl %0, %%esp\n\t"
			"jmp *%1\n\t"
			: : "g" (pnp_bios_fault_esp), "g" (pnp_bios_fault_eip));
		panic("do_trap: can't hit this");
	}
#endif

	e = search_exception_tables(regs->ip);
	if (!e)
		return 0;

	type = FIELD_GET(EX_DATA_TYPE_MASK, e->data);
	reg  = FIELD_GET(EX_DATA_REG_MASK,  e->data);
	imm  = FIELD_GET(EX_DATA_IMM_MASK,  e->data);

	switch (type) {
	case EX_TYPE_DEFAULT:
	case EX_TYPE_DEFAULT_MCE_SAFE:
		return ex_handler_default(e, regs);
	case EX_TYPE_FAULT:
	case EX_TYPE_FAULT_MCE_SAFE:
		return ex_handler_fault(e, regs, trapnr);
	case EX_TYPE_UACCESS:
		return ex_handler_uaccess(e, regs, trapnr, fault_addr);
	case EX_TYPE_CLEAR_FS:
		return ex_handler_clear_fs(e, regs);
	case EX_TYPE_FPU_RESTORE:
		return ex_handler_fprestore(e, regs);
	case EX_TYPE_BPF:
		return ex_handler_bpf(e, regs);
	case EX_TYPE_WRMSR:
		return ex_handler_msr(e, regs, true, false, reg);
	case EX_TYPE_RDMSR:
		return ex_handler_msr(e, regs, false, false, reg);
	case EX_TYPE_WRMSR_SAFE:
		return ex_handler_msr(e, regs, true, true, reg);
	case EX_TYPE_RDMSR_SAFE:
		return ex_handler_msr(e, regs, false, true, reg);
	case EX_TYPE_WRMSR_IN_MCE:
		ex_handler_msr_mce(regs, true);
		break;
	case EX_TYPE_RDMSR_IN_MCE:
		ex_handler_msr_mce(regs, false);
		break;
	case EX_TYPE_POP_REG:
		regs->sp += sizeof(long);
		fallthrough;
	case EX_TYPE_IMM_REG:
		return ex_handler_imm_reg(e, regs, reg, imm);
	case EX_TYPE_FAULT_SGX:
		return ex_handler_sgx(e, regs, trapnr);
	case EX_TYPE_UCOPY_LEN:
		return ex_handler_ucopy_len(e, regs, trapnr, fault_addr, reg, imm);
	case EX_TYPE_ZEROPAD:
		return ex_handler_zeropad(e, regs, fault_addr);
#ifdef CONFIG_X86_FRED
	case EX_TYPE_ERETU:
		return ex_handler_eretu(e, regs, error_code);
#endif
	}
	BUG();
}
extern unsigned int early_recursion_flag;

/* Restricted version used during very early boot */
void __init early_fixup_exception(struct pt_regs *regs, int trapnr)
{
	/* Ignore early NMIs. */
	if (trapnr == X86_TRAP_NMI)
		return;

	if (early_recursion_flag > 2)
		goto halt_loop;

	/*
	 * Old CPUs leave the high bits of CS on the stack
	 * undefined.  I'm not sure which CPUs do this, but at least
	 * the 486 DX works this way.
	 * Xen pv domains are not using the default __KERNEL_CS.
	 */
	if (!xen_pv_domain() && regs->cs != __KERNEL_CS)
		goto fail;

	/*
	 * The full exception fixup machinery is available as soon as
	 * the early IDT is loaded.  This means that it is the
	 * responsibility of extable users to either function correctly
	 * when handlers are invoked early or to simply avoid causing
	 * exceptions before they're ready to handle them.
	 *
	 * This is better than filtering which handlers can be used,
	 * because refusing to call a handler here is guaranteed to
	 * result in a hard-to-debug panic.
	 *
	 * Keep in mind that not all vectors actually get here.  Early
	 * page faults, for example, are special.
	 */
	if (fixup_exception(regs, trapnr, regs->orig_ax, 0))
		return;

	if (trapnr == X86_TRAP_UD) {
		if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN) {
			/* Skip the ud2. */
			regs->ip += LEN_UD2;
			return;
		}

		/*
		 * If this was a BUG and report_bug returns or if this
		 * was just a normal #UD, we want to continue onward and
		 * crash.
		 */
	}

fail:
	early_printk("PANIC: early exception 0x%02x IP %lx:%lx error %lx cr2 0x%lx\n",
		     (unsigned)trapnr, (unsigned long)regs->cs, regs->ip,
		     regs->orig_ax, read_cr2());