/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2011 Joyent, Inc. All rights reserved.
 */

/*
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */
#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/segments.h>
#include <sys/cpuvar.h>
#include <sys/bootconf.h>
#include <sys/x86_archext.h>
#include <sys/controlregs.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/cmn_err.h>
#include <sys/reboot.h>
#include <sys/mach_mmu.h>
#include <sys/systm.h>
#include <sys/promif.h>
#include <sys/bootinfo.h>
#include <vm/kboot_mmu.h>
#include <vm/hat_pte.h>
/*
 * cpu0 and default tables and structures.
 */
user_desc_t	*gdt0;
desctbr_t	gdt0_default_r;

gate_desc_t	*idt0;			/* interrupt descriptor table */
desctbr_t	idt0_default_r;		/* describes idt0 in IDTR format */

tss_t		*ktss0;			/* kernel task state structure */

tss_t		*dftss0;		/* #DF double-fault exception */

user_desc_t	zero_udesc;		/* base zero user desc native procs */
user_desc_t	null_udesc;		/* null user descriptor */
system_desc_t	null_sdesc;		/* null system descriptor */

user_desc_t	zero_u32desc;		/* 32-bit compatibility procs */

user_desc_t	ucs_on;
user_desc_t	ucs_off;
user_desc_t	ucs32_on;
user_desc_t	ucs32_off;

#pragma	align	16(dblfault_stack0)
char		dblfault_stack0[DEFAULTSTKSZ];
extern void	fast_null(void);
extern hrtime_t	get_hrtime(void);
extern hrtime_t	gethrvtime(void);
extern hrtime_t	get_hrestime(void);
extern uint64_t	getlgrp(void);

void (*(fasttable[]))(void) = {
	fast_null,			/* T_FNULL routine */
	fast_null,			/* T_FGETFP routine (initially null) */
	fast_null,			/* T_FSETFP routine (initially null) */
	(void (*)())get_hrtime,		/* T_GETHRTIME */
	(void (*)())gethrvtime,		/* T_GETHRVTIME */
	(void (*)())get_hrestime,	/* T_GETHRESTIME */
	(void (*)())getlgrp		/* T_GETLGRP */
};
/*
 * Structure containing pre-computed descriptors to allow us to temporarily
 * interpose on a standard handler.
 */
struct interposing_handler {
	int ih_inum;
	gate_desc_t ih_interp_desc;
	gate_desc_t ih_default_desc;
};

/*
 * The brand infrastructure interposes on two handlers, and we use one as a
 * NULL signpost.
 */
static struct interposing_handler brand_tbl[2];
/*
 * software prototypes for default local descriptor table
 */

/*
 * Routines for loading segment descriptors in format the hardware
 * can understand.
 */

#if defined(__amd64)

/*
 * In long mode we have the new L or long mode attribute bit
 * for code segments. Only the conforming bit in type is used along
 * with descriptor priority and present bits. Default operand size must
 * be zero when in long mode. In 32-bit compatibility mode all fields
 * are treated as in legacy mode. For data segments while in long mode
 * only the present bit is loaded.
 */
void
set_usegd(user_desc_t *dp, uint_t lmode, void *base, size_t size,
    uint_t type, uint_t dpl, uint_t gran, uint_t defopsz)
{
	ASSERT(lmode == SDP_SHORT || lmode == SDP_LONG);

	/*
	 * 64-bit long mode.
	 */
	if (lmode == SDP_LONG)
		dp->usd_def32 = 0;		/* 32-bit operands only */
	else
		/*
		 * 32-bit compatibility mode.
		 */
		dp->usd_def32 = defopsz;	/* 0 = 16, 1 = 32-bit ops */

	dp->usd_long = lmode;	/* 64-bit mode */
	dp->usd_type = type;
	dp->usd_dpl = dpl;
	dp->usd_p = 1;
	dp->usd_gran = gran;	/* 0 = bytes, 1 = pages */

	dp->usd_lobase = (uintptr_t)base;
	dp->usd_midbase = (uintptr_t)base >> 16;
	dp->usd_hibase = (uintptr_t)base >> (16 + 8);

	dp->usd_lolimit = size;
	dp->usd_hilimit = (uintptr_t)size >> 16;
}
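
/*
 * Illustrative example (not from the original source): installing a
 * hypothetical 32-bit compatibility code segment with a page-granular
 * 4G limit would look like
 *
 *	user_desc_t d;
 *	set_usegd(&d, SDP_SHORT, NULL, (size_t)-1, SDT_MEMERA, SEL_UPL,
 *	    SDP_PAGES, SDP_OP32);
 *
 * and a (hypothetical) base of 0xfedcba98 would be scattered across the
 * descriptor as usd_lobase = 0xba98, usd_midbase = 0xdc, usd_hibase = 0xfe.
 */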
#elif defined(__i386)

/*
 * Install user segment descriptor for code and data.
 */
void
set_usegd(user_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl, uint_t gran, uint_t defopsz)
{
	dp->usd_lolimit = size;
	dp->usd_hilimit = (uintptr_t)size >> 16;

	dp->usd_lobase = (uintptr_t)base;
	dp->usd_midbase = (uintptr_t)base >> 16;
	dp->usd_hibase = (uintptr_t)base >> (16 + 8);

	dp->usd_type = type;
	dp->usd_dpl = dpl;
	dp->usd_p = 1;

	dp->usd_def32 = defopsz;	/* 0 = 16, 1 = 32 bit operands */
	dp->usd_gran = gran;		/* 0 = bytes, 1 = pages */
}

#endif	/* __i386 */
/*
 * Install system segment descriptor for LDT and TSS segments.
 */

#if defined(__amd64)

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
	dp->ssd_lolimit = size;
	dp->ssd_hilimit = (uintptr_t)size >> 16;

	dp->ssd_lobase = (uintptr_t)base;
	dp->ssd_midbase = (uintptr_t)base >> 16;
	dp->ssd_hibase = (uintptr_t)base >> (16 + 8);
	dp->ssd_hi64base = (uintptr_t)base >> (16 + 8 + 8);

	dp->ssd_type = type;
	dp->ssd_zero1 = 0;	/* must be zero */
	dp->ssd_zero2 = 0;
	dp->ssd_dpl = dpl;
	dp->ssd_p = 1;
	dp->ssd_gran = 0;	/* force byte units */
}
void *
get_ssd_base(system_desc_t *dp)
{
	uintptr_t	base;

	base = (uintptr_t)dp->ssd_lobase |
	    (uintptr_t)dp->ssd_midbase << 16 |
	    (uintptr_t)dp->ssd_hibase << (16 + 8) |
	    (uintptr_t)dp->ssd_hi64base << (16 + 8 + 8);
	return ((void *)base);
}
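
/*
 * Worked example (illustrative): get_ssd_base() simply inverts the base
 * split done by set_syssegd(). For a hypothetical base 0x123456789a,
 * ssd_lobase holds 0x789a, ssd_midbase 0x56, ssd_hibase 0x34 and
 * ssd_hi64base 0x12; OR-ing the shifted fields back together recovers
 * the original pointer.
 */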
#elif defined(__i386)

void
set_syssegd(system_desc_t *dp, void *base, size_t size, uint_t type,
    uint_t dpl)
{
	dp->ssd_lolimit = size;
	dp->ssd_hilimit = (uintptr_t)size >> 16;

	dp->ssd_lobase = (uintptr_t)base;
	dp->ssd_midbase = (uintptr_t)base >> 16;
	dp->ssd_hibase = (uintptr_t)base >> (16 + 8);

	dp->ssd_type = type;
	dp->ssd_zero = 0;	/* must be zero */
	dp->ssd_dpl = dpl;
	dp->ssd_p = 1;
	dp->ssd_gran = 0;	/* force byte units */
}

void *
get_ssd_base(system_desc_t *dp)
{
	uintptr_t	base;

	base = (uintptr_t)dp->ssd_lobase |
	    (uintptr_t)dp->ssd_midbase << 16 |
	    (uintptr_t)dp->ssd_hibase << (16 + 8);
	return ((void *)base);
}

#endif	/* __i386 */
/*
 * Install gate segment descriptor for interrupt, trap, call and task gates.
 */

#if defined(__amd64)

void
set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
    uint_t type, uint_t dpl, uint_t vector)
{
	dp->sgd_looffset = (uintptr_t)func;
	dp->sgd_hioffset = (uintptr_t)func >> 16;
	dp->sgd_hi64offset = (uintptr_t)func >> (16 + 16);

	dp->sgd_selector = (uint16_t)sel;

	/*
	 * For 64 bit native we use the IST stack mechanism
	 * for double faults. All other traps use the CPL = 0
	 * (tss_rsp0) stack.
	 */
	if (vector == T_DBLFLT)
		dp->sgd_ist = 1;
	else
		dp->sgd_ist = 0;

	dp->sgd_type = type;
	dp->sgd_dpl = dpl;
	dp->sgd_p = 1;
}
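
/*
 * Worked example (illustrative): the handler address is split the same
 * way segment bases are. A hypothetical func at 0xfffffffffb812340
 * yields sgd_looffset = 0x2340, sgd_hioffset = 0xfb81 and
 * sgd_hi64offset = 0xffffffff.
 */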
#elif defined(__i386)

/*ARGSUSED*/
void
set_gatesegd(gate_desc_t *dp, void (*func)(void), selector_t sel,
    uint_t type, uint_t dpl, uint_t unused)
{
	dp->sgd_looffset = (uintptr_t)func;
	dp->sgd_hioffset = (uintptr_t)func >> 16;

	dp->sgd_selector = (uint16_t)sel;
	dp->sgd_stkcpy = 0;	/* always zero bytes */
	dp->sgd_type = type;
	dp->sgd_dpl = dpl;
	dp->sgd_p = 1;
}

#endif	/* __i386 */
/*
 * Updates a single user descriptor in the GDT of the current cpu.
 * Caller is responsible for preventing cpu migration.
 */
void
gdt_update_usegd(uint_t sidx, user_desc_t *udp)
{
	CPU->cpu_gdt[sidx] = *udp;
}
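
/*
 * Sketch of a typical call sequence (hypothetical): since the update only
 * touches the current cpu's GDT, callers pin themselves first, e.g.
 *
 *	kpreempt_disable();
 *	gdt_update_usegd(GDT_LWPFS, &desc);
 *	kpreempt_enable();
 */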
/*
 * Writes single descriptor pointed to by udp into a process's
 * LDT entry pointed to by ldp.
 */
void
ldt_update_segd(user_desc_t *ldp, user_desc_t *udp)
{
	*ldp = *udp;
}
#if defined(__amd64)

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * 64-bit kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 64-bit kernel data segment. The limit attribute is ignored in 64-bit
	 * mode, but we set it here to 0xFFFF so that we can use the SYSRET
	 * instruction to return from system calls back to 32-bit applications.
	 * SYSRET doesn't update the base, limit, or attributes of %ss or %ds
	 * descriptors. We therefore must ensure that the kernel uses something,
	 * though it will be ignored by hardware, that is compatible with 32-bit
	 * apps. For the same reason we must set the default op size of this
	 * descriptor to 32-bit operands.
	 */
	set_usegd(&gdt[GDT_KDATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
	    SEL_KPL, SDP_PAGES, SDP_OP32);
	gdt[GDT_KDATA].usd_def32 = 1;

	/*
	 * 64-bit user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], SDP_LONG, NULL, 0, SDT_MEMERA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * 32-bit user code segment.
	 */
	set_usegd(&gdt[GDT_U32CODE], SDP_SHORT, NULL, -1, SDT_MEMERA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * See gdt_ucode32() and gdt_ucode_native().
	 */
	ucs_on = ucs_off = gdt[GDT_UCODE];
	ucs_off.usd_p = 0;	/* forces #np fault */

	ucs32_on = ucs32_off = gdt[GDT_U32CODE];
	ucs32_off.usd_p = 0;	/* forces #np fault */

	/*
	 * 32 and 64 bit data segments can actually share the same descriptor.
	 * In long mode only the present bit is checked but all other fields
	 * are loaded. But in compatibility mode all fields are interpreted
	 * as in legacy mode so they must be set correctly for a 32-bit data
	 * segment.
	 */
	set_usegd(&gdt[GDT_UDATA], SDP_SHORT, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * The 64-bit kernel has no default LDT. By default, the LDT descriptor
	 * in the GDT is the null descriptor.
	 */

	/*
	 * Kernel TSS.
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

	/*
	 * Initialize fs and gs descriptors for 32 bit processes.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], SDP_SHORT, NULL, -1, SDT_MEMRWA,
	    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt0[i], SDP_SHORT, NULL, -1, SDT_MEMRWA,
		    SEL_UPL, SDP_PAGES, SDP_OP32);

	/*
	 * Initialize convenient zero base user descriptors for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example of how they're used.
	 */
	set_usegd(&zero_udesc, SDP_LONG, 0, 0, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
	set_usegd(&zero_u32desc, SDP_SHORT, 0, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
}
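
/*
 * Illustrative (hypothetical) use of the zero descriptors built above:
 * clearing an lwp's private %fs slot reduces to copying the prebuilt
 * descriptor into the GDT, e.g.
 *
 *	gdt_update_usegd(GDT_LWPFS, &zero_udesc);
 */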
#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */
	ulong_t addr;

	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * tables?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA,
		    SEL_KPL, SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT.
	 * On 64-bit, fixup KCS_SEL to be in ring 3.
	 * See KCS_SEL in segments.h.
	 */
	load_segment_registers((KCS_SEL | SEL_KPL), KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	xen_set_segment_base(SEGBASE_GS_KERNEL, (ulong_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase". So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	addr = 0x200000000ul;
	xen_set_segment_base(SEGBASE_FS, addr);
	xen_set_segment_base(SEGBASE_GS_USER, addr);
	xen_set_segment_base(SEGBASE_GS_USER_SEL, 0);

	return (gdt0);
}

#else	/* __xpv */
static user_desc_t *
init_gdt(void)
{
	desctbr_t	r_bgdt, r_gdt;
	user_desc_t	*bgdt;

	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt.
	 * Entry 0 is the null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];
	gdt0[GDT_B64CODE] = bgdt[GDT_B64CODE];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(KCS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	/*
	 * setup %gs for kernel
	 */
	wrmsr(MSR_AMD_GSBASE, (uint64_t)&cpus[0]);

	/*
	 * XX64 We should never dereference off "other gsbase" or
	 * "fsbase". So, we should arrange to point FSBASE and
	 * KGSBASE somewhere truly awful e.g. point it at the last
	 * valid address below the hole so that any attempts to index
	 * off them cause an exception.
	 *
	 * For now, point it at 8G -- at least it should be unmapped
	 * until some 64-bit processes run.
	 */
	wrmsr(MSR_AMD_FSBASE, 0x200000000ul);
	wrmsr(MSR_AMD_KGSBASE, 0x200000000ul);

	return (gdt0);
}

#endif	/* __xpv */
#elif defined(__i386)

static void
init_gdt_common(user_desc_t *gdt)
{
	int i;

	/*
	 * Text and data for both kernel and user span the entire 32 bit
	 * address space.
	 */

	/*
	 * kernel code segment.
	 */
	set_usegd(&gdt[GDT_KCODE], NULL, -1, SDT_MEMERA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * kernel data segment.
	 */
	set_usegd(&gdt[GDT_KDATA], NULL, -1, SDT_MEMRWA, SEL_KPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user code segment.
	 */
	set_usegd(&gdt[GDT_UCODE], NULL, -1, SDT_MEMERA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * user data segment.
	 */
	set_usegd(&gdt[GDT_UDATA], NULL, -1, SDT_MEMRWA, SEL_UPL, SDP_PAGES,
	    SDP_OP32);

	/*
	 * TSS for T_DBLFLT (double fault) handler
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_DBFLT], dftss0,
	    sizeof (*dftss0) - 1, SDT_SYSTSS, SEL_KPL);

	/*
	 * TSS for kernel
	 */
	set_syssegd((system_desc_t *)&gdt[GDT_KTSS], ktss0,
	    sizeof (*ktss0) - 1, SDT_SYSTSS, SEL_KPL);

	/*
	 * %gs selector for kernel
	 */
	set_usegd(&gdt[GDT_GS], &cpus[0], sizeof (struct cpu) - 1, SDT_MEMRWA,
	    SEL_KPL, SDP_BYTES, SDP_OP32);

	/*
	 * Initialize lwp private descriptors.
	 * Only attributes and limits are initialized, the effective
	 * base address is programmed via fsbase/gsbase.
	 */
	set_usegd(&gdt[GDT_LWPFS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&gdt[GDT_LWPGS], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
	    SDP_PAGES, SDP_OP32);

	/*
	 * Initialize the descriptors set aside for brand usage.
	 * Only attributes and limits are initialized.
	 */
	for (i = GDT_BRANDMIN; i <= GDT_BRANDMAX; i++)
		set_usegd(&gdt0[i], NULL, (size_t)-1, SDT_MEMRWA, SEL_UPL,
		    SDP_PAGES, SDP_OP32);

	/*
	 * Initialize convenient zero base user descriptor for clearing
	 * lwp private %fs and %gs descriptors in GDT. See setregs() for
	 * an example of how they're used.
	 */
	set_usegd(&zero_udesc, NULL, -1, SDT_MEMRWA, SEL_UPL,
	    SDP_BYTES, SDP_OP32);
}
#if defined(__xpv)

static user_desc_t *
init_gdt(void)
{
	uint64_t gdtpa;
	ulong_t ma[1];		/* XXPV should be a memory_t */

	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);
	gdtpa = pfn_to_pa(va_to_pfn(gdt0));

	/*
	 * XXX Since we never invoke kmdb until after the kernel takes
	 * over the descriptor tables why not have it use the kernel's
	 * tables?
	 */
	if (boothowto & RB_DEBUG) {
		set_usegd(&gdt0[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
		set_usegd(&gdt0[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
		    SDP_PAGES, SDP_OP32);
	}

	/*
	 * Clear write permission for page containing the gdt and install it.
	 */
	ma[0] = (ulong_t)(pa_to_ma(gdtpa) >> PAGESHIFT);
	kbm_read_only((uintptr_t)gdt0, gdtpa);
	xen_set_gdt(ma, NGDT);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#else	/* __xpv */
static user_desc_t *
init_gdt(void)
{
	desctbr_t	r_bgdt, r_gdt;
	user_desc_t	*bgdt;

	/*
	 * Our gdt is never larger than a single page.
	 */
	ASSERT((sizeof (*gdt0) * NGDT) <= PAGESIZE);

	/*
	 * XXX this allocation belongs in our caller, not here.
	 */
	gdt0 = (user_desc_t *)BOP_ALLOC(bootops, (caddr_t)GDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(gdt0, PAGESIZE);

	init_gdt_common(gdt0);

	/*
	 * Copy in from boot's gdt to our gdt entries.
	 * Entry 0 is null descriptor by definition.
	 */
	rd_gdtr(&r_bgdt);
	bgdt = (user_desc_t *)r_bgdt.dtr_base;
	if (bgdt == NULL)
		panic("null boot gdt");

	gdt0[GDT_B32DATA] = bgdt[GDT_B32DATA];
	gdt0[GDT_B32CODE] = bgdt[GDT_B32CODE];
	gdt0[GDT_B16CODE] = bgdt[GDT_B16CODE];
	gdt0[GDT_B16DATA] = bgdt[GDT_B16DATA];

	/*
	 * Install our new GDT
	 */
	r_gdt.dtr_limit = (sizeof (*gdt0) * NGDT) - 1;
	r_gdt.dtr_base = (uintptr_t)gdt0;
	wr_gdtr(&r_gdt);

	/*
	 * Reload the segment registers to use the new GDT
	 */
	load_segment_registers(
	    KCS_SEL, KDS_SEL, KDS_SEL, KFS_SEL, KGS_SEL, KDS_SEL);

	return (gdt0);
}

#endif	/* __xpv */
#endif	/* __i386 */
/*
 * Note that for amd64 we pretty much require every gate to be an interrupt
 * gate which blocks interrupts atomically on entry; that's because of our
 * dependency on using 'swapgs' every time we come into the kernel to find
 * the cpu structure. If we get interrupted just before doing that, %cs could
 * be in kernel mode (so that the trap prolog doesn't do a swapgs), but
 * %gsbase is really still pointing at something in userland. Bad things will
 * ensue. We use interrupt gates for i386 as well even though this is not
 * required for some traps.
 *
 * Perhaps they should have invented a trap gate that does an atomic swapgs?
 */
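
/*
 * A sketch of the window a trap gate would leave open (illustrative):
 *
 *	kernel entry point:		interrupts not yet blocked
 *		<-- interrupt arrives here: %cs is already KCS_SEL, so
 *		    the interrupt prolog skips its swapgs, yet %gsbase
 *		    still holds the user value; %gs-relative loads of
 *		    the cpu structure then read userland memory
 *		swapgs
 *		movq	%gs:CPU_THREAD, ...
 */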
static void
init_idt_common(gate_desc_t *idt)
{
	set_gatesegd(&idt[T_ZERODIV], &div0trap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    0);
	set_gatesegd(&idt[T_SGLSTP], &dbgtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    0);
	set_gatesegd(&idt[T_NMIFLT], &nmiint, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    0);
	set_gatesegd(&idt[T_BPTFLT], &brktrap, KCS_SEL, SDT_SYSIGT, TRP_UPL,
	    0);
	set_gatesegd(&idt[T_OVFLW], &ovflotrap, KCS_SEL, SDT_SYSIGT, TRP_UPL,
	    0);
	set_gatesegd(&idt[T_BOUNDFLT], &boundstrap, KCS_SEL, SDT_SYSIGT,
	    TRP_KPL, 0);
	set_gatesegd(&idt[T_ILLINST], &invoptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    0);
	set_gatesegd(&idt[T_NOEXTFLT], &ndptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    0);

	/*
	 * double fault handler.
	 *
	 * Note that on the hypervisor a guest does not receive #df faults.
	 * Instead a failsafe event is injected into the guest if its selectors
	 * and/or stack is in a broken state. See xen_failsafe_callback.
	 */
#if !defined(__xpv)
#if defined(__amd64)

	set_gatesegd(&idt[T_DBLFLT], &syserrtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    T_DBLFLT);

#elif defined(__i386)

	/*
	 * task gate required.
	 */
	set_gatesegd(&idt[T_DBLFLT], NULL, DFTSS_SEL, SDT_SYSTASKGT, TRP_KPL,
	    0);

#endif	/* __i386 */
#endif	/* !__xpv */

	/*
	 * T_EXTOVRFLT coprocessor-segment-overrun not supported.
	 */

	set_gatesegd(&idt[T_TSSFLT], &invtsstrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    0);
	set_gatesegd(&idt[T_SEGFLT], &segnptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    0);
	set_gatesegd(&idt[T_STKFLT], &stktrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
	set_gatesegd(&idt[T_GPFLT], &gptrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
	set_gatesegd(&idt[T_PGFLT], &pftrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
	set_gatesegd(&idt[T_EXTERRFLT], &ndperr, KCS_SEL, SDT_SYSIGT, TRP_KPL,
	    0);
	set_gatesegd(&idt[T_ALIGNMENT], &achktrap, KCS_SEL, SDT_SYSIGT,
	    TRP_KPL, 0);
	set_gatesegd(&idt[T_MCE], &mcetrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
	set_gatesegd(&idt[T_SIMDFPE], &xmtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);

	/*
	 * install fast trap handler at 210.
	 */
	set_gatesegd(&idt[T_FASTTRAP], &fasttrap, KCS_SEL, SDT_SYSIGT, TRP_UPL,
	    0);

	/*
	 * System call handler.
	 */
#if defined(__amd64)
	set_gatesegd(&idt[T_SYSCALLINT], &sys_syscall_int, KCS_SEL, SDT_SYSIGT,
	    TRP_UPL, 0);

#elif defined(__i386)
	set_gatesegd(&idt[T_SYSCALLINT], &sys_call, KCS_SEL, SDT_SYSIGT,
	    TRP_UPL, 0);
#endif	/* __i386 */

	/*
	 * Install the DTrace interrupt handler for the pid provider.
	 */
	set_gatesegd(&idt[T_DTRACE_RET], &dtrace_ret, KCS_SEL,
	    SDT_SYSIGT, TRP_UPL, 0);

	/*
	 * Prepare interposing descriptor for the syscall handler
	 * and cache copy of the default descriptor.
	 */
	brand_tbl[0].ih_inum = T_SYSCALLINT;
	brand_tbl[0].ih_default_desc = idt0[T_SYSCALLINT];

#if defined(__amd64)
	set_gatesegd(&(brand_tbl[0].ih_interp_desc), &brand_sys_syscall_int,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, 0);
#elif defined(__i386)
	set_gatesegd(&(brand_tbl[0].ih_interp_desc), &brand_sys_call,
	    KCS_SEL, SDT_SYSIGT, TRP_UPL, 0);
#endif	/* __i386 */

	brand_tbl[1].ih_inum = 0;
}
void
init_idt(gate_desc_t *idt)
{
	char	ivctname[80];
	void	(*ivctptr)(void);
	int	i;

	/*
	 * Initialize entire table with 'reserved' trap and then overwrite
	 * specific entries. T_EXTOVRFLT (9) is unsupported and reserved
	 * since it can only be generated on a 386 processor. 15 is also
	 * unsupported and reserved.
	 */
	for (i = 0; i < NIDT; i++)
		set_gatesegd(&idt[i], &resvtrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    0);

	/*
	 * 20-31 reserved
	 */
	for (i = 20; i < 32; i++)
		set_gatesegd(&idt[i], &invaltrap, KCS_SEL, SDT_SYSIGT, TRP_KPL,
		    0);

	/*
	 * interrupts 32 - 255
	 */
	for (i = 32; i < 256; i++) {
		(void) snprintf(ivctname, sizeof (ivctname), "ivct%d", i);
		ivctptr = (void (*)(void))kobj_getsymvalue(ivctname, 0);
		if (ivctptr == NULL)
			panic("kobj_getsymvalue(%s) failed", ivctname);
		set_gatesegd(&idt[i], ivctptr, KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);
	}

	/*
	 * Now install the common ones. Note that it will overlay some
	 * entries installed above like T_SYSCALLINT, T_FASTTRAP etc.
	 */
	init_idt_common(idt);
}
/*
 * The kernel does not deal with LDTs unless a user explicitly creates
 * one. Under normal circumstances, the LDTR contains 0. Any process attempting
 * to reference the LDT will therefore cause a #gp. System calls made via the
 * obsolete lcall mechanism are emulated by the #gp fault handler.
 */
#if defined(__amd64)

/*
 * Initialize the kernel TSS.
 */
static void
init_tss(void)
{
	/*
	 * tss_rsp0 is dynamically filled in by resume() on each context switch.
	 * All exceptions but #DF will run on the thread stack.
	 * Set up the double fault stack here.
	 */
	ktss0->tss_ist1 =
	    (uint64_t)&dblfault_stack0[sizeof (dblfault_stack0)];

	/*
	 * Set I/O bit map offset equal to size of TSS segment limit
	 * for no I/O permission map. This will force all user I/O
	 * instructions to generate #gp fault.
	 */
	ktss0->tss_bitmapbase = sizeof (*ktss0);
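
	/*
	 * Worked example (illustrative): any I/O-permission-bitmap lookup
	 * is bounds-checked against the TSS limit of sizeof (*ktss0) - 1
	 * installed in the GDT, so a bitmap base of sizeof (*ktss0) always
	 * points past the limit and every CPL 3 IN/OUT takes the #gp path.
	 */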

	/*
	 * Point %tr to descriptor for ktss0 in gdt.
	 */
	wr_tsr(KTSS_SEL);
}

#elif defined(__i386)
static void
init_tss(void)
{
	/*
	 * ktss0->tss_esp dynamically filled in by resume() on each
	 * context switch.
	 */
	ktss0->tss_ss0	= KDS_SEL;
	ktss0->tss_eip	= (uint32_t)_start;
	ktss0->tss_ds	= ktss0->tss_es = ktss0->tss_ss = KDS_SEL;
	ktss0->tss_cs	= KCS_SEL;
	ktss0->tss_fs	= KFS_SEL;
	ktss0->tss_gs	= KGS_SEL;
	ktss0->tss_ldt	= ULDT_SEL;

	/*
	 * Initialize double fault tss.
	 */
	dftss0->tss_esp0 = (uint32_t)&dblfault_stack0[sizeof (dblfault_stack0)];
	dftss0->tss_ss0	= KDS_SEL;

	/*
	 * tss_cr3 will get initialized in hat_kern_setup() once our page
	 * tables have been set up.
	 */
	dftss0->tss_eip	= (uint32_t)syserrtrap;
	dftss0->tss_esp	= (uint32_t)&dblfault_stack0[sizeof (dblfault_stack0)];
	dftss0->tss_cs	= KCS_SEL;
	dftss0->tss_ds	= KDS_SEL;
	dftss0->tss_es	= KDS_SEL;
	dftss0->tss_ss	= KDS_SEL;
	dftss0->tss_fs	= KFS_SEL;
	dftss0->tss_gs	= KGS_SEL;

	/*
	 * Set I/O bit map offset equal to size of TSS segment limit
	 * for no I/O permission map. This will force all user I/O
	 * instructions to generate #gp fault.
	 */
	ktss0->tss_bitmapbase = sizeof (*ktss0);

	/*
	 * Point %tr to descriptor for ktss0 in gdt.
	 */
	wr_tsr(KTSS_SEL);
}

#endif	/* __i386 */
#if !defined(__xpv)

void
init_desctbls(void)
{
	user_desc_t *gdt;
	desctbr_t idtr;

	/*
	 * Allocate IDT and TSS structures on unique pages for better
	 * performance in virtual machines.
	 */
	ASSERT(NIDT * sizeof (*idt0) <= PAGESIZE);
	idt0 = (gate_desc_t *)BOP_ALLOC(bootops, (caddr_t)IDT_VA,
	    PAGESIZE, PAGESIZE);
	bzero(idt0, PAGESIZE);
	ASSERT(sizeof (*ktss0) <= PAGESIZE);
	ktss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)KTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(ktss0, PAGESIZE);

#if defined(__i386)
	ASSERT(sizeof (*dftss0) <= PAGESIZE);
	dftss0 = (tss_t *)BOP_ALLOC(bootops, (caddr_t)DFTSS_VA,
	    PAGESIZE, PAGESIZE);
	bzero(dftss0, PAGESIZE);
#endif

	/*
	 * Setup and install our GDT.
	 */
	gdt = init_gdt();
	ASSERT(IS_P2ALIGNED((uintptr_t)gdt, PAGESIZE));
	CPU->cpu_gdt = gdt;

	/*
	 * Setup and install our IDT.
	 */
	init_idt(idt0);

	idtr.dtr_base = (uintptr_t)idt0;
	idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
	wr_idtr(&idtr);
	CPU->cpu_idt = idt0;

#if defined(__i386)
	/*
	 * We maintain a description of idt0 in convenient IDTR format
	 * for #pf's on some older pentium processors. See pentium_pftrap().
	 */
	idt0_default_r = idtr;
#endif	/* __i386 */

	init_tss();
	CPU->cpu_tss = ktss0;
}

#endif	/* !__xpv */
/*
 * In the early kernel, we need to set up a simple GDT to run on.
 *
 * XXPV	Can dboot use this too? See dboot_gdt.s
 */
void
init_boot_gdt(user_desc_t *bgdt)
{
#if defined(__amd64)
	set_usegd(&bgdt[GDT_B32DATA], SDP_LONG, NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B64CODE], SDP_LONG, NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#elif defined(__i386)
	set_usegd(&bgdt[GDT_B32DATA], NULL, -1, SDT_MEMRWA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
	set_usegd(&bgdt[GDT_B32CODE], NULL, -1, SDT_MEMERA, SEL_KPL,
	    SDP_PAGES, SDP_OP32);
#endif	/* __i386 */
}
/*
 * Enable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the branded entry points.
 */
void
brand_interpositioning_enable(void)
{
	gate_desc_t	*idt = CPU->cpu_idt;
	int		i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_interp_desc;
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * Currently the hypervisor only supports 64-bit syscalls via
	 * syscall instruction. The 32-bit syscalls are handled by
	 * the interrupt gate above.
	 */
	xen_set_callback(brand_sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		wrmsr(MSR_AMD_LSTAR, (uintptr_t)brand_sys_syscall);
		wrmsr(MSR_AMD_CSTAR, (uintptr_t)brand_sys_syscall32);
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP))
		wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)brand_sys_sysenter);
}
/*
 * Disable interpositioning on the system call path by rewriting the
 * sys{call|enter} MSRs and the syscall-related entries in the IDT to use
 * the standard entry points, which bypass the interpositioning hooks.
 */
void
brand_interpositioning_disable(void)
{
	gate_desc_t	*idt = CPU->cpu_idt;
	int		i;

	ASSERT(curthread->t_preempt != 0 || getpil() >= DISP_LEVEL);

	for (i = 0; brand_tbl[i].ih_inum; i++) {
		idt[brand_tbl[i].ih_inum] = brand_tbl[i].ih_default_desc;
	}

#if defined(__amd64)
#if defined(__xpv)

	/*
	 * See comment above in brand_interpositioning_enable.
	 */
	xen_set_callback(sys_syscall, CALLBACKTYPE_syscall,
	    CALLBACKF_mask_events);

#else

	if (is_x86_feature(x86_featureset, X86FSET_ASYSC)) {
		wrmsr(MSR_AMD_LSTAR, (uintptr_t)sys_syscall);
		wrmsr(MSR_AMD_CSTAR, (uintptr_t)sys_syscall32);
	}

#endif
#endif	/* __amd64 */

	if (is_x86_feature(x86_featureset, X86FSET_SEP))
		wrmsr(MSR_INTC_SEP_EIP, (uintptr_t)sys_sysenter);
}