/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 */
/*	Copyright (c) 1990, 1991 UNIX System Laboratories, Inc.	*/
/*	Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T		*/
/*	  All Rights Reserved						*/

/*	Copyright (c) 1987, 1988 Microsoft Corporation			*/
/*	  All Rights Reserved						*/
#include <sys/asm_linkage.h>
#include <sys/asm_misc.h>
#include <sys/regset.h>
#include <sys/x86_archext.h>
#include <sys/machbrand.h>
#include <sys/privregs.h>

#include <sys/types.h>
#include <sys/thread.h>
#include <sys/systm.h>

#include <sys/segments.h>

#include <sys/ftrace.h>
#include <sys/traptrace.h>
#include <sys/clock.h>
#include <sys/panic.h>
/*
 * We implement two flavours of system call entry points
 *
 * -	{int,lcall}/iret	(i386)
 * -	sysenter/sysexit	(Pentium II and beyond)
 *
 * The basic pattern used in the handlers is to check to see if we can
 * do a fast (simple) version of the system call; if we can't we use
 * various C routines that handle corner cases and debugging.
 *
 * To reduce the amount of assembler replication, yet keep the system
 * call implementations vaguely comprehensible, the common code in the
 * body of the handlers is broken up into a set of preprocessor
 * definitions below.
 */
/*
 * When we have SYSCALLTRACE defined, we sneak an extra
 * predicate into a couple of tests.
 */
#if defined(SYSCALLTRACE)
#define	ORL_SYSCALLTRACE(r32)	\
	orl	syscalltrace, r32
#else
#define	ORL_SYSCALLTRACE(r32)
#endif
/*
 * This check is false whenever we want to go fast i.e.
 *
 *	if (code >= NSYSCALL ||
 *	    t->t_pre_sys || (t->t_proc_flag & TP_WATCHPT) != 0)
 *		do full version
 *
 * Preconditions:
 * -	t is curthread
 * -	code contains the syscall number
 * Postconditions:
 * -	%ecx and %edi are smashed
 * -	condition code flag ZF is cleared if pre-sys is too complex
 */
#define	CHECK_PRESYS_NE(t, code)		\
	movzbl	T_PRE_SYS(t), %edi;		\
	movzwl	T_PROC_FLAG(t), %ecx;		\
	andl	$TP_WATCHPT, %ecx;		\
	orl	%ecx, %edi;			\
	cmpl	$NSYSCALL, code;		\
	setae	%cl;				\
	movzbl	%cl, %ecx;			\
	orl	%ecx, %edi;			\
	ORL_SYSCALLTRACE(%edi)
/*
 * Check if a brand_mach_ops callback is defined for the specified
 * callback_id type.  If so invoke it with the user's %gs value loaded
 * and the following data on the stack:
 *
 *	   --------------------------------------
 *	   | user's %ss				|
 *	|  | user's %esp			|
 *	|  | EFLAGS register			|
 *	v  | user's %cs				|
 *	   | user's %eip (user return address)	|
 *	   | 'scratch space'			|
 *	   | user's %ebx			|
 *	   | user's %gs selector		|
 *	   | lwp pointer			|
 *	   | callback wrapper return addr	|
 *	   --------------------------------------
 *
 * If the brand code returns, we assume that we are meant to execute the
 * normal system call path.
 *
 * The interface to the brand callbacks on the 32-bit kernel assumes %ebx
 * is available as a scratch register within the callback.  If the
 * callback returns within the kernel then this macro will restore %ebx.
 * If the callback is going to return directly to userland then it
 * should restore %ebx before returning to userland.
 */
#define	BRAND_CALLBACK(callback_id) \
	subl	$4, %esp		/* save some scratch space	*/ ;\
	pushl	%ebx			/* save %ebx to use for scratch	*/ ;\
	pushl	%gs			/* save the user %gs		*/ ;\
	movl	$KGS_SEL, %ebx ;\
	movw	%bx, %gs		/* switch to the kernel's %gs	*/ ;\
	movl	%gs:CPU_THREAD, %ebx	/* load the thread pointer	*/ ;\
	movl	T_LWP(%ebx), %ebx	/* load the lwp pointer		*/ ;\
	pushl	%ebx			/* push the lwp pointer		*/ ;\
	movl	LWP_PROCP(%ebx), %ebx	/* load the proc pointer	*/ ;\
	movl	P_BRAND(%ebx), %ebx	/* load the brand pointer	*/ ;\
	movl	B_MACHOPS(%ebx), %ebx	/* load the machops pointer	*/ ;\
	movl	_CONST(_MUL(callback_id, CPTRSIZE))(%ebx), %ebx ;\
	cmpl	$0, %ebx ;\
	je	1f ;\
	movl	%ebx, 12(%esp)		/* save callback to scratch	*/ ;\
	movl	4(%esp), %ebx		/* grab the user %gs		*/ ;\
	movw	%bx, %gs		/* restore the user %gs		*/ ;\
	call	*12(%esp)		/* call callback in scratch	*/ ;\
1:	movl	4(%esp), %ebx		/* restore user %gs (re-do if	*/ ;\
	movw	%bx, %gs		/* branch due to no callback)	*/ ;\
	movl	8(%esp), %ebx		/* restore user's %ebx		*/ ;\
	addl	$16, %esp		/* restore stack ptr		*/
#define	MSTATE_TRANSITION(from, to)		\
	pushl	$to;				\
	pushl	$from;				\
	call	syscall_mstate;			\
	addl	$0x8, %esp
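/*
 * i.e. the C call syscall_mstate(from, to).  Note this is an ordinary
 * cdecl call: the caller-saved registers %eax, %ecx and %edx do not
 * survive it, which is why the call sites below preserve %eax around
 * the transition.
 */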
/*
 * aka CPU_STATS_ADDQ(CPU, sys.syscall, 1)
 * This must be called with interrupts or preemption disabled.
 */
#define	CPU_STATS_SYS_SYSCALL_INC			\
	addl	$1, %gs:CPU_STATS_SYS_SYSCALL;		\
	adcl	$0, %gs:CPU_STATS_SYS_SYSCALL+4;
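/*
 * The addl/adcl pair is a 64-bit increment composed from 32-bit
 * halves: add 1 to the low word, then fold the carry into the high
 * word.  Migrating to another CPU between the two instructions would
 * split the update across two per-CPU counters, hence the restriction
 * noted above.
 */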
/*
 * ASSERT(lwptoregs(lwp) == rp);
 *
 * this may seem obvious, but very odd things happen if this
 * assertion is false
 *
 * Postconditions (if assertion is true):
 *	%esi and %edi are smashed
 */
#if defined(DEBUG)

__lwptoregs_msg:
	.string	"syscall_asm.s:%d lwptoregs(%p) [%p] != rp [%p]"

#define	ASSERT_LWPTOREGS(t, rp)				\
	movl	T_LWP(t), %esi;				\
	movl	LWP_REGS(%esi), %edi;			\
	cmpl	rp, %edi;				\
	je	7f;					\
	pushl	rp;					\
	pushl	%edi;					\
	pushl	%esi;					\
	pushl	$__LINE__;				\
	pushl	$__lwptoregs_msg;			\
	call	panic;					\
7:
#else
#define	ASSERT_LWPTOREGS(t, rp)
#endif
/*
 * This is an assembler version of this fragment:
 *
 * lwp->lwp_state = LWP_SYS;
 * lwp->lwp_ru.sysc++;
 * lwp->lwp_eosys = NORMALRETURN;
 * lwp->lwp_ap = argp;
 */
#define	SET_LWP(lwp, argp)				\
	movb	$LWP_SYS, LWP_STATE(lwp);		\
	addl	$1, LWP_RU_SYSC(lwp);			\
	adcl	$0, LWP_RU_SYSC+4(lwp);			\
	movb	$NORMALRETURN, LWP_EOSYS(lwp);		\
	movl	argp, LWP_AP(lwp)
/*
 * Set up the thread, lwp, find the handler, and copy
 * in the arguments from userland to the kernel stack.
 *
 * Preconditions:
 * -	%eax contains the syscall number
 * Postconditions:
 * -	%eax contains a pointer to the sysent structure
 * -	%esi, %edi are smashed
 * -	%esp is SYS_DROPped ready for the syscall
 */
#define	SIMPLE_SYSCALL_PRESYS(t, faultlabel)		\
	movl	T_LWP(t), %esi;				\
	movw	%ax, T_SYSNUM(t);			\
	subl	$SYS_DROP, %esp;			\
	shll	$SYSENT_SIZE_SHIFT, %eax;		\
	SET_LWP(%esi, %esp);				\
	leal	sysent(%eax), %eax;			\
	movzbl	SY_NARG(%eax), %ecx;			\
	testl	%ecx, %ecx;				\
	jz	4f;					\
	movl	%esp, %edi;				\
	movl	SYS_DROP + REGOFF_UESP(%esp), %esi;	\
	movl	$faultlabel, T_LOFAULT(t);		\
	addl	$4, %esi;	/* skip the user return address */	\
	rep;						\
	  smovl;	/* copy the args; leaves %ecx == 0 */	\
	movl	%ecx, T_LOFAULT(t);	/* i.e. clear t_lofault */	\
4:
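/*
 * The tail of the macro is, in effect, this fragment executed under
 * the t_lofault protection set up above (a sketch, not the literal
 * C path):
 *
 *	if (callp->sy_narg > 0)
 *		copyin((caddr_t)rp->r_uesp + sizeof (long),
 *		    lwp->lwp_ap, callp->sy_narg * sizeof (long));
 */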
/*
 * Check to see if a simple return is possible i.e.
 *
 *	if ((t->t_post_sys_ast | syscalltrace) != 0)
 *		do full version;
 *
 * Postconditions:
 * -	condition code NE is set if post-sys is too complex
 * -	rtmp is zeroed if it isn't (we rely on this!)
 */
#define	CHECK_POSTSYS_NE(t, rtmp)	\
	xorl	rtmp, rtmp;		\
	ORL_SYSCALLTRACE(rtmp);		\
	orl	T_POST_SYS_AST(t), rtmp
/*
 * Fix up the lwp, thread, and eflags for a successful return
 *
 * Preconditions:
 * -	zwreg contains zero
 * Postconditions:
 * -	%esp has been unSYS_DROPped
 * -	%esi is smashed (points to lwp)
 */
#define	SIMPLE_SYSCALL_POSTSYS(t, zwreg)		\
	movl	T_LWP(t), %esi;				\
	addl	$SYS_DROP, %esp;			\
	movw	zwreg, T_SYSNUM(t);			\
	movb	$LWP_USER, LWP_STATE(%esi);		\
	andb	$_CONST(0xffff - PS_C), REGOFF_EFL(%esp)
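/*
 * Clearing PS_C (carry) in the saved EFLAGS matters because carry is
 * the error indicator seen by the userland syscall stubs, which do
 * roughly:
 *
 *	lcall	$0x27, $0	/ through the call gate
 *	jae	1f		/ carry clear: success
 *	jmp	__cerror	/ carry set: %eax holds an errno
 * 1:
 */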
/*
 * System call handler.  This is the destination of both the call
 * gate (lcall 0x27) _and_ the interrupt gate (int 0x91).  For our
 * purposes, there are two significant differences between an interrupt
 * gate and a call gate:
 *
 * 1) An interrupt gate runs the handler with interrupts disabled, whereas
 * a call gate runs the handler with whatever EFLAGS settings were in
 * effect at the time of the call.
 *
 * 2) An interrupt gate pushes the contents of the EFLAGS register at the
 * time of the interrupt onto the stack, whereas a call gate does not.
 *
 * Because we use the following code sequence to handle system calls made
 * from _both_ a call gate _and_ an interrupt gate, these two differences
 * must be respected.  With regard to difference 1), the handler must
 * ensure that a sane EFLAGS snapshot is stored on the stack so that when
 * the kernel returns to the user via iret (which returns with the EFLAGS
 * value saved on the stack), interrupts are re-enabled.
 *
 * With regard to difference 2), the handler must always put a current
 * snapshot of EFLAGS onto the stack in the appropriate place.  If we came
 * in via an interrupt gate, we will be clobbering the EFLAGS value that
 * was pushed by the interrupt gate.  This is OK, as the only bit that was
 * changed by the hardware was the IE (interrupt enable) bit, which for an
 * interrupt gate is now off.  If we were to do nothing, the stack would
 * contain an EFLAGS with IE off, resulting in us eventually returning to
 * the user with interrupts disabled.  The solution is to turn on the IE
 * bit in the EFLAGS value saved on the stack.
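 *
 * Once the register frame is laid out, that fix is a single instruction
 * against the saved EFLAGS slot (the sysenter path below does the
 * equivalent to the EFLAGS copy it pushes by hand):
 *
 *	orl	$PS_IE, REGOFF_EFL(%esp)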
 *
 * Another subtlety which deserves mention is the difference between the
 * two descriptors.  The call gate descriptor is set to instruct the
 * hardware to copy one parameter from the user stack to the kernel
 * stack, whereas the interrupt gate descriptor doesn't use the parameter
 * passing mechanism at all.  The kernel doesn't actually use the
 * parameter that is copied by the hardware; the only reason it does this
 * is so that there is a space on the stack large enough to hold an
 * EFLAGS register value, which happens to be in the correct place for
 * use by iret when we go back to userland.  How convenient.
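 *
 * (The "copy one parameter" behaviour is the call gate descriptor's
 * count field at work: the CPU copies that many 32-bit words from the
 * user stack to the kernel stack as part of the privilege transition.)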
 *
 * Stack frame description in syscall() and callees.
 *
 *	|------------|
 *	| regs	     | +(8*4)+4		registers
 *	|------------|
 *	| 8 args     | <- %esp		MAXSYSARGS (currently 8) arguments
 *	|------------|
 *
 */
#define	SYS_DROP	_CONST(_MUL(MAXSYSARGS, 4))
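/*
 * i.e. SYS_DROP = MAXSYSARGS * sizeof (long) = 8 * 4 = 32 bytes of
 * argument scratch carved out of the stack by SIMPLE_SYSCALL_PRESYS
 * and handed back by SIMPLE_SYSCALL_POSTSYS.
 */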
#if defined(__lint)

size_t	_allsyscalls_size;

#else	/* __lint */
	ENTRY_NP2(brand_sys_call, _allsyscalls)
	BRAND_CALLBACK(BRAND_CB_SYSCALL)

	ALTENTRY(sys_call)
	/ on entry	eax = system call number

	/ set up the stack to look as in reg.h
	subl	$8, %esp	/ pad the stack with ERRCODE and TRAPNO

	SYSCALL_PUSH

#ifdef TRAPTRACE
	TRACE_PTR(%edi, %ebx, %ebx, %ecx, $TT_SYSCALL)	/ Uses labels "8" and "9"
	TRACE_REGS(%edi, %esp, %ebx, %ecx)		/ Uses label "9"
	pushl	%eax
	TRACE_STAMP(%edi)		/ Clobbers %eax, %edx, uses "9"
	popl	%eax
	movl	%eax, TTR_SYSNUM(%edi)
#endif
	/
	/ Interrupts may be enabled here, so we must make sure this thread
	/ doesn't migrate off the CPU while it updates the CPU stats.
	/
	/ XXX This is only true if we got here via call gate thru the LDT for
	/ old style syscalls. Perhaps this preempt++-- will go away soon?
	movl	%gs:CPU_THREAD, %ebx
	addb	$1, T_PREEMPT(%ebx)
	CPU_STATS_SYS_SYSCALL_INC
	subb	$1, T_PREEMPT(%ebx)
_watch_do_syscall:
	pushl	%eax		/ preserve across mstate call
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	popl	%eax

	movl	%gs:CPU_THREAD, %ebx

	ASSERT_LWPTOREGS(%ebx, %esp)

	CHECK_PRESYS_NE(%ebx, %eax)
	jne	_full_syscall_presys
	SIMPLE_SYSCALL_PRESYS(%ebx, _syscall_fault)

_syslcall_call:
	call	*SY_CALLC(%eax)		/ invoke the handler from sysent

_syslcall_rtt:
	CHECK_POSTSYS_NE(%ebx, %ecx)
	jne	_full_syscall_postsys
	SIMPLE_SYSCALL_POSTSYS(%ebx, %cx)
	movl	%eax, REGOFF_EAX(%esp)	/ rval1
	movl	%edx, REGOFF_EDX(%esp)	/ rval2

	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)
	/
	/ get back via iret
	/
	CLI(%edx)
	jmp	sys_rtt_syscall

_full_syscall_presys:
	movl	T_LWP(%ebx), %esi
	subl	$SYS_DROP, %esp
	movb	$LWP_SYS, LWP_STATE(%esi)
	pushl	%esp			/ argument area
	pushl	%ebx			/ thread
	call	syscall_entry		/ sysent pointer returned in %eax
	addl	$8, %esp
	jmp	_syslcall_call

_full_syscall_postsys:
	addl	$SYS_DROP, %esp
	pushl	%edx			/ rval2
	pushl	%eax			/ rval1
	pushl	%ebx			/ thread
	call	syscall_exit
	addl	$12, %esp

	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/
	/ get back via iret
	/
	CLI(%edx)
	jmp	sys_rtt_syscall

_syscall_fault:
	pushl	$0xe			/ EFAULT
	call	set_errno
	addl	$4, %esp
	xorl	%eax, %eax		/ fake syscall_err()
	xorl	%edx, %edx
	jmp	_syslcall_rtt

	SET_SIZE(sys_call)
	SET_SIZE(brand_sys_call)
/*
 * System call handler via the sysenter instruction
 *
 * Here's how syscall entry usually works (see sys_call for details).
 *
 * There, the caller (lcall or int) in userland has arranged that:
 *
 * -	%eax contains the syscall number
 * -	the user stack contains the args to the syscall
 *
 * Normally the lcall instruction into the call gate causes the processor
 * to push %ss, %esp, <top-of-stack>, %cs, %eip onto the kernel stack.
 * The sys_call handler then leaves space for r_trapno and r_err, and
 * pusha's {%eax, %ecx, %edx, %ebx, %esp, %ebp, %esi, %edi}, followed
 * by %ds, %es, %fs and %gs to capture a 'struct regs' on the stack.
 * Then the kernel sets %ds, %es and %gs to kernel selectors, and finally
 * extracts %efl and puts it into r_efl (which happens to live at the
 * offset that <top-of-stack> was copied into).  Note that the value in
 * r_efl has the IF (interrupt enable) flag turned on.  (The int
 * instruction into the interrupt gate does essentially the same thing,
 * only instead of <top-of-stack> we get eflags - see comment above.)
 *
 * In the sysenter case, things are a lot more primitive.
 *
 * The caller in userland has arranged that:
 *
 * -	%eax contains the syscall number
 * -	%ecx contains the user %esp
 * -	%edx contains the return %eip
 * -	the user stack contains the args to the syscall
 *
 * e.g.
 *	<args on the stack>
 *	mov	$SYS_callnum, %eax
 *	mov	$1f, %edx	/ return %eip
 *	mov	%esp, %ecx	/ return %esp
 *	sysenter
 * 1:
 *
 * Hardware and (privileged) initialization code have arranged that by
 * the time the sysenter instruction completes:
 *
 * -	%eip is pointing to sys_sysenter (below).
 * -	%cs and %ss are set to kernel text and stack (data) selectors.
 * -	%esp is pointing at the lwp's stack
 * -	Interrupts have been disabled.
 *
 * The task for the sysenter handler is:
 *
 * -	recreate the same regs structure on the stack and the same
 *	kernel state as if we'd come in on an lcall
 * -	do the normal work of a syscall
 * -	execute the system call epilogue, use sysexit to return to userland.
 *
 * Note that we are unable to return both "rvals" to userland with this
 * call, as %edx is used by the sysexit instruction.
 *
 * One final complication in this routine is its interaction with
 * single-stepping in a debugger.  For most of the system call mechanisms,
 * the CPU automatically clears the single-step flag before we enter the
 * kernel.  The sysenter mechanism does not clear the flag, so a user
 * single-stepping through a libc routine may suddenly find themselves
 * single-stepping through the kernel.  To detect this, kmdb compares the
 * trap %pc to the [brand_]sys_sysenter addresses on each single-step trap.
 * If it finds that we have single-stepped to a sysenter entry point, it
 * explicitly clears the flag and executes the sys_sysenter routine.
 *
 * One final complication in this final complication is the fact that we
 * have two different entry points for sysenter: brand_sys_sysenter and
 * sys_sysenter.  If we enter at brand_sys_sysenter and start
 * single-stepping through the kernel with kmdb, we will eventually hit
 * the instruction at sys_sysenter.  kmdb cannot distinguish between that
 * valid single-step and the undesirable one mentioned above.  To avoid
 * this situation, we simply add a jump over the instruction at
 * sys_sysenter to make it impossible to single-step to it.
 */
	ENTRY_NP(brand_sys_sysenter)
	pushl	%edx			/ save the return %eip
	BRAND_CALLBACK(BRAND_CB_SYSENTER)
	popl	%edx

	/*
	 * Jump over sys_sysenter to allow single-stepping as described
	 * above.
	 */
	jmp	1f

	ALTENTRY(sys_sysenter)
	nop
1:
	/
	/ do what the call gate would've done to the stack ..
	/
	pushl	$UDS_SEL	/ (really %ss, but it's the same ..)
	pushl	%ecx		/ userland makes this a copy of %esp
	pushfl			/ EFLAGS
	orl	$PS_IE, (%esp)	/ turn interrupts on when we return to user
	pushl	$UCS_SEL	/ user %cs
	pushl	%edx		/ userland makes this a copy of %eip

	/ done.  finish building the stack frame
	subl	$8, %esp	/ leave space for ERR and TRAPNO

	SYSENTER_PUSH

#ifdef TRAPTRACE
	TRACE_PTR(%edi, %ebx, %ebx, %ecx, $TT_SYSENTER)	/ uses labels 8 and 9
	TRACE_REGS(%edi, %esp, %ebx, %ecx)		/ uses label 9
	pushl	%eax
	TRACE_STAMP(%edi)		/ clobbers %eax, %edx, uses label 9
	popl	%eax
	movl	%eax, TTR_SYSNUM(%edi)
#endif
	CPU_STATS_SYS_SYSCALL_INC

	pushl	%eax		/ preserve across mstate call
	MSTATE_TRANSITION(LMS_USER, LMS_SYSTEM)
	popl	%eax

	movl	%gs:CPU_THREAD, %ebx

	ASSERT_LWPTOREGS(%ebx, %esp)

	CHECK_PRESYS_NE(%ebx, %eax)
	jne	_full_syscall_presys
	SIMPLE_SYSCALL_PRESYS(%ebx, _syscall_fault)

	call	*SY_CALLC(%eax)		/ invoke the handler from sysent

	CHECK_POSTSYS_NE(%ebx, %ecx)
	jne	_full_syscall_postsys
	SIMPLE_SYSCALL_POSTSYS(%ebx, %cx)
	/
	/ sysexit uses %edx to restore %eip, so we can't use it
	/ to return a value, sigh.
	/
	movl	%eax, REGOFF_EAX(%esp)
	/ movl	%edx, REGOFF_EDX(%esp)

	/ Interrupts will be turned on by the 'sti' executed just before
	/ sysexit.  The following ensures that restoring the user's EFLAGS
	/ doesn't enable interrupts too soon.
	andl	$_BITNOT(PS_IE), REGOFF_EFL(%esp)
	MSTATE_TRANSITION(LMS_SYSTEM, LMS_USER)

	/
	/ restore the user's registers and return via sysexit
	/
	popl	%gs
	popl	%fs
	popl	%es
	popl	%ds
	popal				/ %eax comes back holding rval1
	addl	$8, %esp		/ skip TRAPNO and ERR

	popl	%edx			/ sysexit: %edx -> %eip
	addl	$4, %esp		/ get CS off the stack
	popfl				/ EFL (with PS_IE still clear)
	popl	%ecx			/ sysexit: %ecx -> %esp

	sti				/ sti's shadow carries us through sysexit
	sysexit
	SET_SIZE(sys_sysenter)
	SET_SIZE(brand_sys_sysenter)
/*
 * Declare a uintptr_t which covers the entire pc range of syscall
 * handlers for the stack walkers that need this.
 */
	.align	CPTRSIZE
	.globl	_allsyscalls_size
	.type	_allsyscalls_size, @object
_allsyscalls_size:
	.NWORD	. - _allsyscalls
	SET_SIZE(_allsyscalls_size)

#endif	/* __lint */
/*
 * These are the thread context handlers for lwps using sysenter/sysexit.
 */

#if defined(__lint)

/*ARGSUSED*/
void
sep_save(void *ksp)
{}

/*ARGSUSED*/
void
sep_restore(void *ksp)
{}

#else	/* __lint */

	/*
	 * setting this value to zero as we switch away causes the
	 * stack-pointer-on-sysenter to be NULL, ensuring that we
	 * don't silently corrupt another (preempted) thread stack
	 * when running an lwp that (somehow) didn't get sep_restore'd
	 */
	ENTRY_NP(sep_save)
	xorl	%edx, %edx
	xorl	%eax, %eax
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_save)
	/*
	 * Update the kernel stack pointer as we resume onto this cpu.
	 */
	ENTRY_NP(sep_restore)
	movl	4(%esp), %eax		/* per-lwp kernel sp */
	xorl	%edx, %edx
	movl	$MSR_INTC_SEP_ESP, %ecx
	wrmsr
	ret
	SET_SIZE(sep_restore)

#endif	/* __lint */
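/*
 * Both routines boil down to (roughly) the C call
 *
 *	wrmsr(MSR_INTC_SEP_ESP, (uint64_t)(uintptr_t)ksp);
 *
 * with ksp == NULL in the sep_save case: %ecx selects the MSR and
 * %edx:%eax supply the 64-bit value to write.
 */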
/*
 * Call syscall().  Called from trap() on watchpoint at lcall 0,7
 */

	ENTRY_NP(watch_syscall)
	movl	%gs:CPU_THREAD, %ebx
	movl	T_STACK(%ebx), %esp	/ switch to the thread stack
	movl	REGOFF_EAX(%esp), %eax	/ recover original syscall#
	jmp	_watch_do_syscall
	SET_SIZE(watch_syscall)