4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
25 /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */
27 /* All Rights Reserved */
29 /* Copyright (c) 1987, 1988 Microsoft Corporation */
30 /* All Rights Reserved */
32 #include <sys/param.h>
33 #include <sys/types.h>
34 #include <sys/sysmacros.h>
35 #include <sys/systm.h>
36 #include <sys/signal.h>
37 #include <sys/errno.h>
38 #include <sys/fault.h>
39 #include <sys/syscall.h>
40 #include <sys/cpuvar.h>
41 #include <sys/sysi86.h>
44 #include <sys/policy.h>
45 #include <sys/thread.h>
46 #include <sys/debug.h>
47 #include <sys/ontrap.h>
48 #include <sys/privregs.h>
49 #include <sys/x86_archext.h>
53 #include <sys/archsystm.h>
57 #include <vm/seg_kmem.h>
58 #include <vm/faultcode.h>
60 #include <sys/cmn_err.h>
61 #include <sys/segments.h>
62 #include <sys/clock.h>
64 static void ldt_alloc(proc_t
*, uint_t
);
65 static void ldt_free(proc_t
*);
66 static void ldt_dup(proc_t
*, proc_t
*);
67 static void ldt_grow(proc_t
*, uint_t
);
75 sysi86(short cmd
, uintptr_t arg1
, uintptr_t arg2
, uintptr_t arg3
)
85 * The SI86V86 subsystem call of the SYSI86 system call
86 * supports only one subcode -- V86SC_IOPL.
89 if (arg1
== V86SC_IOPL
) {
90 struct regs
*rp
= lwptoregs(ttolwp(curthread
));
91 greg_t oldpl
= rp
->r_ps
& PS_IOPL
;
92 greg_t newpl
= arg2
& PS_IOPL
;
95 * Must be privileged to run this system call
96 * if giving more io privilege.
98 if (newpl
> oldpl
&& (error
=
99 secpolicy_sys_config(CRED(), B_FALSE
)) != 0)
100 return (set_errno(error
));
101 rp
->r_ps
^= oldpl
^ newpl
;
107 * Set a segment descriptor
111 * There are considerable problems here manipulating
112 * resources shared by many running lwps. Get everyone
113 * into a safe state before changing the LDT.
115 if (curthread
!= pp
->p_agenttp
&& !holdlwps(SHOLDFORK1
)) {
120 if (get_udatamodel() == DATAMODEL_LP64
) {
125 if (copyin((caddr_t
)arg1
, &ssd
, sizeof (ssd
)) < 0) {
130 error
= setdscr(&ssd
);
132 mutex_enter(&pp
->p_lock
);
133 if (curthread
!= pp
->p_agenttp
)
135 mutex_exit(&pp
->p_lock
);
140 if (suword32((void *)arg1
, c
) == -1)
146 * arg1 is the address of _fp_hw
147 * arg2 is the desired x87 FCW value
148 * arg3 is the desired SSE MXCSR value
149 * a return value of one means SSE hardware, else none.
152 if (suword32((void *)arg1
, c
) == -1) {
156 fpsetcw((uint16_t)arg2
, (uint32_t)arg3
);
157 return ((fp_kind
& __FP_SSE
) ? 1 : 0);
159 /* real time clock management commands */
162 if ((error
= secpolicy_settime(CRED())) == 0) {
164 mutex_enter(&tod_lock
);
167 mutex_exit(&tod_lock
);
171 /* Give some timezone playing room */
172 #define ONEWEEK (7 * 24 * 60 * 60)
176 * Called from 32 bit land, negative values
177 * are not sign extended, so we do that here
178 * by casting it to an int and back. We also
179 * clamp the value to within reason and detect
180 * when a 64 bit call overflows an int.
182 if ((error
= secpolicy_settime(CRED())) == 0) {
183 int newlag
= (int)arg1
;
185 #ifdef _SYSCALL32_IMPL
186 if (get_udatamodel() == DATAMODEL_NATIVE
&&
187 (long)newlag
!= (long)arg1
) {
191 if (newlag
>= -ONEWEEK
&& newlag
<= ONEWEEK
)
199 if (get_udatamodel() == DATAMODEL_NATIVE
) {
200 if (sulword((void *)arg1
, ggmtl()) == -1)
202 #ifdef _SYSCALL32_IMPL
206 if ((gmtl
= ggmtl()) > INT32_MAX
) {
208 * Since gmt_lag can at most be
209 * +/- 12 hours, something is
210 * *seriously* messed up here.
213 } else if (suword32((void *)arg1
, (int32_t)gmtl
) == -1)
220 if ((error
= secpolicy_settime(CRED())) == 0)
224 /* END OF real time clock management commands */
230 return (error
== 0 ? 0 : set_errno(error
));
234 usd_to_ssd(user_desc_t
*usd
, struct ssd
*ssd
, selector_t sel
)
236 ssd
->bo
= USEGD_GETBASE(usd
);
237 ssd
->ls
= USEGD_GETLIMIT(usd
);
241 * set type, dpl and present bits.
243 ssd
->acc1
= usd
->usd_type
;
244 ssd
->acc1
|= usd
->usd_dpl
<< 5;
245 ssd
->acc1
|= usd
->usd_p
<< (5 + 2);
248 * set avl, DB and granularity bits.
250 ssd
->acc2
= usd
->usd_avl
;
253 ssd
->acc2
|= usd
->usd_long
<< 1;
255 ssd
->acc2
|= usd
->usd_reserved
<< 1;
258 ssd
->acc2
|= usd
->usd_def32
<< (1 + 1);
259 ssd
->acc2
|= usd
->usd_gran
<< (1 + 1 + 1);
263 ssd_to_usd(struct ssd
*ssd
, user_desc_t
*usd
)
266 ASSERT(bcmp(usd
, &null_udesc
, sizeof (*usd
)) == 0);
268 USEGD_SETBASE(usd
, ssd
->bo
);
269 USEGD_SETLIMIT(usd
, ssd
->ls
);
272 * set type, dpl and present bits.
274 usd
->usd_type
= ssd
->acc1
;
275 usd
->usd_dpl
= ssd
->acc1
>> 5;
276 usd
->usd_p
= ssd
->acc1
>> (5 + 2);
278 ASSERT(usd
->usd_type
>= SDT_MEMRO
);
279 ASSERT(usd
->usd_dpl
== SEL_UPL
);
282 * 64-bit code selectors are never allowed in the LDT.
283 * Reserved bit is always 0 on 32-bit systems.
288 usd
->usd_reserved
= 0;
292 * set avl, DB and granularity bits.
294 usd
->usd_avl
= ssd
->acc2
;
295 usd
->usd_def32
= ssd
->acc2
>> (1 + 1);
296 usd
->usd_gran
= ssd
->acc2
>> (1 + 1 + 1);
303 ssd_to_sgd(struct ssd
*ssd
, gate_desc_t
*sgd
)
306 ASSERT(bcmp(sgd
, &null_sdesc
, sizeof (*sgd
)) == 0);
308 sgd
->sgd_looffset
= ssd
->bo
;
309 sgd
->sgd_hioffset
= ssd
->bo
>> 16;
311 sgd
->sgd_selector
= ssd
->ls
;
314 * set type, dpl and present bits.
316 sgd
->sgd_type
= ssd
->acc1
;
317 sgd
->sgd_dpl
= ssd
->acc1
>> 5;
318 sgd
->sgd_p
= ssd
->acc1
>> 7;
319 ASSERT(sgd
->sgd_type
== SDT_SYSCGT
);
320 ASSERT(sgd
->sgd_dpl
== SEL_UPL
);
327 * Load LDT register with the current process's LDT.
332 *((system_desc_t
*)&CPU
->cpu_gdt
[GDT_LDT
]) = curproc
->p_ldt_desc
;
337 * Store a NULL selector in the LDTR. All subsequent illegal references to
338 * the LDT will result in a #gp.
343 *((system_desc_t
*)&CPU
->cpu_gdt
[GDT_LDT
]) = null_sdesc
;
349 ldt_savectx(proc_t
*p
)
351 ASSERT(p
->p_ldt
!= NULL
);
352 ASSERT(p
== curproc
);
356 * The 64-bit kernel must be sure to clear any stale ldt
357 * selectors when context switching away from a process that
358 * has a private ldt. Consider the following example:
360 * Wine creates a ldt descriptor and points a segment register
363 * We then context switch away from wine lwp to kernel
364 * thread and hit breakpoint in kernel with kmdb
366 * When we continue and resume from kmdb we will #gp
367 * fault since kmdb will have saved the stale ldt selector
368 * from wine and will try to restore it but we are no longer in
369 * the context of the wine process and do not have our
370 * ldtr register pointing to the private ldt.
376 cpu_fast_syscall_enable(NULL
);
380 ldt_restorectx(proc_t
*p
)
382 ASSERT(p
->p_ldt
!= NULL
);
383 ASSERT(p
== curproc
);
386 cpu_fast_syscall_disable(NULL
);
390 * When a process with a private LDT execs, fast syscalls must be enabled for
391 * the new process image.
395 ldt_freectx(proc_t
*p
, int isexec
)
401 cpu_fast_syscall_enable(NULL
);
406 * ldt_free() will free the memory used by the private LDT, reset the
407 * process's descriptor, and re-program the LDTR.
413 * Install ctx op that ensures syscall/sysenter are disabled.
414 * See comments below.
416 * When a thread with a private LDT forks, the new process
417 * must have the LDT context ops installed.
421 ldt_installctx(proc_t
*p
, proc_t
*cp
)
427 * If this is a fork, operate on the child process.
435 * The process context ops expect the target process as their argument.
437 ASSERT(removepctx(targ
, targ
, ldt_savectx
, ldt_restorectx
,
438 ldt_installctx
, ldt_savectx
, ldt_freectx
) == 0);
440 installpctx(targ
, targ
, ldt_savectx
, ldt_restorectx
,
441 ldt_installctx
, ldt_savectx
, ldt_freectx
);
444 * We've just disabled fast system call and return instructions; take
445 * the slow path out to make sure we don't try to use one to return
446 * back to user. We must set t_post_sys for every thread in the
447 * process to make sure none of them escape out via fast return.
450 mutex_enter(&targ
->p_lock
);
454 } while ((t
= t
->t_forw
) != targ
->p_tlist
);
455 mutex_exit(&targ
->p_lock
);
459 setdscr(struct ssd
*ssd
)
461 ushort_t seli
; /* selector index */
462 user_desc_t
*ldp
; /* descriptor pointer */
463 user_desc_t ndesc
; /* new descriptor */
464 proc_t
*pp
= ttoproc(curthread
);
468 * LDT segments: executable and data at DPL 3 only.
470 if (!SELISLDT(ssd
->sel
) || !SELISUPL(ssd
->sel
))
474 * check the selector index.
476 seli
= SELTOIDX(ssd
->sel
);
477 if (seli
>= MAXNLDT
|| seli
< LDT_UDBASE
)
481 mutex_enter(&pp
->p_ldtlock
);
484 * If this is the first time for this process then setup a
485 * private LDT for it.
487 if (pp
->p_ldt
== NULL
) {
491 * Now that this process has a private LDT, the use of
492 * the syscall/sysret and sysenter/sysexit instructions
493 * is forbidden for this process because they destroy
494 * the contents of %cs and %ss segment registers.
496 * Explicitly disable them here and add a context handler
497 * to the process. Note that disabling
498 * them here means we can't use sysret or sysexit on
499 * the way out of this system call - so we force this
500 * thread to take the slow path (which doesn't make use
501 * of sysenter or sysexit) back out.
504 ldt_installctx(pp
, NULL
);
505 cpu_fast_syscall_disable(NULL
);
506 ASSERT(curthread
->t_post_sys
!= 0);
509 } else if (seli
> pp
->p_ldtlimit
) {
512 * Increase size of ldt to include seli.
517 ASSERT(seli
<= pp
->p_ldtlimit
);
518 ldp
= &pp
->p_ldt
[seli
];
521 * On the 64-bit kernel, this is where things get more subtle.
522 * Recall that in the 64-bit kernel, when we enter the kernel we
523 * deliberately -don't- reload the segment selectors we came in on
524 * for %ds, %es, %fs or %gs. Messing with selectors is expensive,
525 * and the underlying descriptors are essentially ignored by the
526 * hardware in long mode - except for the base that we override with
529 * However, there's one unfortunate issue with this rosy picture --
530 * a descriptor that's not marked as 'present' will still generate
531 * an #np when loading a segment register.
533 * Consider this case. An lwp creates a harmless LDT entry, points
534 * one of its segment registers at it, then tells the kernel (here)
535 * to delete it. In the 32-bit kernel, the #np will happen on the
536 * way back to userland where we reload the segment registers, and be
537 * handled in kern_gpfault(). In the 64-bit kernel, the same thing
538 * will happen in the normal case too. However, if we're trying to
539 * use a debugger that wants to save and restore the segment registers,
540 * and the debugger thinks that we have valid segment registers, we
541 * have the problem that the debugger will try and restore the
542 * segment register that points at the now 'not present' descriptor
543 * and will take a #np right there.
545 * We should obviously fix the debugger to be paranoid about
546 * -not- restoring segment registers that point to bad descriptors;
547 * however we can prevent the problem here if we check to see if any
548 * of the segment registers are still pointing at the thing we're
549 * destroying; if they are, return an error instead. (That also seems
550 * a lot better failure mode than SIGKILL and a core file
551 * from kern_gpfault() too.)
553 if (SI86SSD_PRES(ssd
) == 0) {
558 * Look carefully at the segment registers of every lwp
559 * in the process (they're all stopped by our caller).
560 * If we're about to invalidate a descriptor that's still
561 * being referenced by *any* of them, return an error,
562 * rather than having them #gp on their way out of the kernel.
564 ASSERT(pp
->p_lwprcnt
== 1);
566 mutex_enter(&pp
->p_lock
);
569 klwp_t
*lwp
= ttolwp(t
);
570 struct regs
*rp
= lwp
->lwp_regs
;
572 pcb_t
*pcb
= &lwp
->lwp_pcb
;
575 if (ssd
->sel
== rp
->r_cs
|| ssd
->sel
== rp
->r_ss
) {
581 if (pcb
->pcb_rupdate
== 1) {
582 if (ssd
->sel
== pcb
->pcb_ds
||
583 ssd
->sel
== pcb
->pcb_es
||
584 ssd
->sel
== pcb
->pcb_fs
||
585 ssd
->sel
== pcb
->pcb_gs
) {
592 if (ssd
->sel
== rp
->r_ds
||
593 ssd
->sel
== rp
->r_es
||
594 ssd
->sel
== rp
->r_fs
||
595 ssd
->sel
== rp
->r_gs
) {
601 } while ((t
= t
->t_forw
) != pp
->p_tlist
);
602 mutex_exit(&pp
->p_lock
);
605 mutex_exit(&pp
->p_ldtlock
);
611 * If acc1 is zero, clear the descriptor (including the 'present' bit)
613 if (ssd
->acc1
== 0) {
614 rc
= ldt_update_segd(ldp
, &null_udesc
);
615 mutex_exit(&pp
->p_ldtlock
);
620 * Check segment type, allow segment not present and
623 if (SI86SSD_DPL(ssd
) != SEL_UPL
) {
624 mutex_exit(&pp
->p_ldtlock
);
630 * Do not allow 32-bit applications to create 64-bit mode code
633 if (SI86SSD_ISUSEG(ssd
) && ((SI86SSD_TYPE(ssd
) >> 3) & 1) == 1 &&
634 SI86SSD_ISLONG(ssd
)) {
635 mutex_exit(&pp
->p_ldtlock
);
641 * Set up a code or data user segment descriptor.
643 if (SI86SSD_ISUSEG(ssd
)) {
644 ssd_to_usd(ssd
, &ndesc
);
645 rc
= ldt_update_segd(ldp
, &ndesc
);
646 mutex_exit(&pp
->p_ldtlock
);
652 * Allow a call gate only if the destination is in the LDT
653 * and the system is running in 32-bit legacy mode.
655 * In long mode 32-bit call gates are redefined as 64-bit call
656 * gates and the hw enforces that the target code selector
657 * of the call gate must be 64-bit selector. A #gp fault is
658 * generated if otherwise. Since we do not allow 32-bit processes
659 * to switch themselves to 64-bits we never allow call gates
660 * on a 64-bit system.
662 if (SI86SSD_TYPE(ssd
) == SDT_SYSCGT
&& SELISLDT(ssd
->ls
)) {
665 ssd_to_sgd(ssd
, (gate_desc_t
*)&ndesc
);
666 rc
= ldt_update_segd(ldp
, &ndesc
);
667 mutex_exit(&pp
->p_ldtlock
);
672 mutex_exit(&pp
->p_ldtlock
);
677 * Allocate new LDT for process just large enough to contain seli.
678 * Note we allocate and grow LDT in PAGESIZE chunks. We do this
679 * to simplify the implementation and because on the hypervisor it's
680 * required, since the LDT must live on pages that have PROT_WRITE
681 * removed and which are given to the hypervisor.
684 ldt_alloc(proc_t
*pp
, uint_t seli
)
690 ASSERT(MUTEX_HELD(&pp
->p_ldtlock
));
691 ASSERT(pp
->p_ldt
== NULL
);
692 ASSERT(pp
->p_ldtlimit
== 0);
695 * Allocate new LDT just large enough to contain seli.
697 ldtsz
= P2ROUNDUP((seli
+ 1) * sizeof (user_desc_t
), PAGESIZE
);
698 nsels
= ldtsz
/ sizeof (user_desc_t
);
699 ASSERT(nsels
>= MINNLDT
&& nsels
<= MAXNLDT
);
701 ldt
= kmem_zalloc(ldtsz
, KM_SLEEP
);
702 ASSERT(IS_P2ALIGNED(ldt
, PAGESIZE
));
706 pp
->p_ldtlimit
= nsels
- 1;
707 set_syssegd(&pp
->p_ldt_desc
, ldt
, ldtsz
- 1, SDT_SYSLDT
, SEL_KPL
);
722 ASSERT(pp
->p_ldt
!= NULL
);
724 mutex_enter(&pp
->p_ldtlock
);
726 ldtsz
= (pp
->p_ldtlimit
+ 1) * sizeof (user_desc_t
);
728 ASSERT(IS_P2ALIGNED(ldtsz
, PAGESIZE
));
732 pp
->p_ldt_desc
= null_sdesc
;
733 mutex_exit(&pp
->p_ldtlock
);
742 kmem_free(ldt
, ldtsz
);
746 * On fork copy new ldt for child.
749 ldt_dup(proc_t
*pp
, proc_t
*cp
)
753 ASSERT(pp
->p_ldt
!= NULL
);
754 ASSERT(cp
!= curproc
);
757 * I assume the parent's ldt can't increase since we're in a fork.
759 mutex_enter(&pp
->p_ldtlock
);
760 mutex_enter(&cp
->p_ldtlock
);
762 ldtsz
= (pp
->p_ldtlimit
+ 1) * sizeof (user_desc_t
);
764 ldt_alloc(cp
, pp
->p_ldtlimit
);
767 bcopy(pp
->p_ldt
, cp
->p_ldt
, ldtsz
);
769 mutex_exit(&cp
->p_ldtlock
);
770 mutex_exit(&pp
->p_ldtlock
);
775 ldt_grow(proc_t
*pp
, uint_t seli
)
777 user_desc_t
*oldt
, *nldt
;
779 size_t oldtsz
, nldtsz
;
781 ASSERT(MUTEX_HELD(&pp
->p_ldtlock
));
782 ASSERT(pp
->p_ldt
!= NULL
);
783 ASSERT(pp
->p_ldtlimit
!= 0);
786 * Allocate larger LDT just large enough to contain seli.
788 nldtsz
= P2ROUNDUP((seli
+ 1) * sizeof (user_desc_t
), PAGESIZE
);
789 nsels
= nldtsz
/ sizeof (user_desc_t
);
790 ASSERT(nsels
>= MINNLDT
&& nsels
<= MAXNLDT
);
791 ASSERT(nsels
> pp
->p_ldtlimit
);
794 oldtsz
= (pp
->p_ldtlimit
+ 1) * sizeof (user_desc_t
);
796 nldt
= kmem_zalloc(nldtsz
, KM_SLEEP
);
797 ASSERT(IS_P2ALIGNED(nldt
, PAGESIZE
));
799 bcopy(oldt
, nldt
, oldtsz
);
810 pp
->p_ldtlimit
= nsels
- 1;
813 * write new ldt segment descriptor.
815 set_syssegd(&pp
->p_ldt_desc
, nldt
, nldtsz
- 1, SDT_SYSLDT
, SEL_KPL
);
824 kmem_free(oldt
, oldtsz
);