/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1993, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2012 DEY Storage Systems, Inc. All rights reserved.
 * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2015 Joyent, Inc.
 * Copyright (c) 2015 by Delphix. All rights reserved.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/signal.h>
#include <sys/systm.h>
#include <sys/avintr.h>
#include <sys/autoconf.h>
#include <sys/class.h>
#include <sys/bitmap.h>
#include <sys/privregs.h>
#include <sys/kstat.h>
#include <sys/reboot.h>
#include <sys/vnode.h>
#include <sys/procfs.h>
#include <sys/cmn_err.h>
#include <sys/utsname.h>
#include <sys/debug.h>
#include <sys/dumphdr.h>
#include <sys/bootconf.h>
#include <sys/memlist_plat.h>
#include <sys/varargs.h>
#include <sys/promif.h>
#include <sys/modctl.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/ndi_impldefs.h>
#include <sys/ddidmareq.h>
#include <sys/regset.h>
#include <sys/clock.h>
#include <sys/stack.h>
#include <vm/kboot_mmu.h>
#include <vm/seg_dev.h>
#include <vm/seg_kmem.h>
#include <vm/seg_kpm.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/seg_kp.h>
#include <sys/memnode.h>
#include <vm/vm_dep.h>
#include <sys/thread.h>
#include <sys/sysconf.h>
#include <sys/vm_machparam.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <vm/hat_i86.h>
#include <sys/pmem.h>
#include <sys/smp_impldefs.h>
#include <sys/x86_archext.h>
#include <sys/cpuvar.h>
#include <sys/segments.h>
#include <sys/clconf.h>
#include <sys/kobj.h>
#include <sys/kobj_lex.h>
#include <sys/cpc_impl.h>
#include <sys/cpu_module.h>
#include <sys/smbios.h>
#include <sys/debug_info.h>
#include <sys/bootinfo.h>
#include <sys/ddi_periodic.h>
#include <sys/systeminfo.h>
#include <sys/multiboot.h>
#include <sys/ramdisk.h>
#ifdef	__xpv

#include <sys/hypervisor.h>
#include <sys/xen_mmu.h>
#include <sys/evtchn_impl.h>
#include <sys/gnttab.h>
#include <sys/xpv_panic.h>
#include <xen/sys/xenbus_comms.h>
#include <xen/public/physdev.h>

extern void xen_late_startup(void);

struct xen_evt_data cpu0_evt_data;

#else	/* __xpv */
#include <sys/memlist_impl.h>

extern void mem_config_init(void);
#endif	/* __xpv */
extern void progressbar_init(void);
extern void brand_init(void);
extern void pcf_init(void);
extern void pg_init(void);
extern void ssp_init(void);

extern int size_pse_array(pgcnt_t, int);
#if defined(_SOFT_HOSTID)

static int32_t set_soft_hostid(void);
static char hostid_file[] = "/etc/hostid";

#endif
void *gfx_devinfo_list;

#if defined(__amd64) && !defined(__xpv)
extern void immu_startup(void);
#endif
/*
 * XXX make declaration below "static" when drivers no longer use this
 * interface.
 */
extern caddr_t p0_va;	/* Virtual address for accessing physical page 0 */

extern int segkp_fromheap;

static void kvm_init(void);
static void startup_init(void);
static void startup_memlist(void);
static void startup_kmem(void);
static void startup_modules(void);
static void startup_vm(void);
static void startup_end(void);
static void layout_kernel_va(void);
/*
 * Declare these as initialized data so we can patch them.
 */
#ifdef __i386

/*
 * Due to virtual address space limitations running in 32 bit mode, restrict
 * the amount of physical memory configured to a max of PHYSMEM pages (16g).
 *
 * If the physical max memory size of 64g were allowed to be configured, the
 * size of user virtual address space will be less than 1g. A limited user
 * address space greatly reduces the range of applications that can run.
 *
 * If more physical memory than PHYSMEM is required, users should preferably
 * run in 64 bit mode which has far looser virtual address space limitations.
 *
 * If 64 bit mode is not available (as in IA32) and/or more physical memory
 * than PHYSMEM is required in 32 bit mode, physmem can be set to the desired
 * value or to 0 (to configure all available memory) via eeprom(1M). kernelbase
 * should also be carefully tuned to balance out the need of the user
 * application while minimizing the risk of kernel heap exhaustion due to
 * kernelbase being set too high.
 */
#define	PHYSMEM	0x400000

#else	/* __amd64 */

/*
 * For now we can handle memory with physical addresses up to about
 * 64 Terabytes. This keeps the kernel above the VA hole, leaving roughly
 * half the VA space for seg_kpm. When systems get bigger than 64TB this
 * code will need revisiting. There is an implicit assumption that there
 * are no *huge* holes in the physical address space too.
 */
#define	TERABYTE		(1ul << 40)
#define	PHYSMEM_MAX64		mmu_btop(64 * TERABYTE)
#define	PHYSMEM			PHYSMEM_MAX64
#define	AMD64_VA_HOLE_END	0xFFFF800000000000ul

#endif	/* __amd64 */
pgcnt_t physmem = PHYSMEM;
pgcnt_t obp_pages;	/* Memory used by PROM for its text and data */

int kobj_file_bufsize;	/* set in /etc/system */

/* Global variables for MP support. Used in mp_startup */
caddr_t rm_platter_va = 0;
uint32_t rm_platter_pa;

int auto_lpg_disable = 1;

/*
 * Some CPUs have holes in the middle of the 64-bit virtual address range.
 */
uintptr_t hole_start, hole_end;

static int kpm_desired;
static uintptr_t segkpm_base = (uintptr_t)SEGKPM_BASE;
/*
 * Configuration parameters set at boot time.
 */

caddr_t econtig;		/* end of first block of contiguous kernel */

struct bootops *bootops = 0;	/* passed in from boot */
struct bootops **bootopsp;
struct boot_syscalls *sysp;	/* passed in from boot */

char bootblock_fstype[16];

char kern_bootargs[OBP_MAXPATHLEN];
char kern_bootfile[OBP_MAXPATHLEN];
/*
 * ZFS zio segment. This allows us to exclude large portions of ZFS data that
 * gets cached in kmem caches on the heap. If this is set to zero, we allocate
 * zio buffers from their own segment, otherwise they are allocated from the
 * heap. The optimization of allocating zio buffers from their own segment is
 * only valid on 64-bit kernels.
 */
#if defined(__amd64)
int segzio_fromheap = 0;
#else
int segzio_fromheap = 1;
#endif

/*
 * Give folks an escape hatch for disabling SMAP via kmdb. Doesn't work
 * post-boot.
 */
int disable_smap = 0;
/*
 * new memory fragmentations are possible in startup() due to BOP_ALLOCs. this
 * depends on number of BOP_ALLOC calls made and requested size, memory size
 * combination and whether boot.bin memory needs to be freed.
 */
#define	POSS_NEW_FRAGMENTS	12
long page_hashsz;		/* Size of page hash table (power of two) */
unsigned int page_hashsz_shift;	/* log2(page_hashsz) */
struct page *pp_base;		/* Base of initial system page struct array */
struct page **page_hash;	/* Page hash table */
pad_mutex_t *pse_mutex;		/* Locks protecting pp->p_selock */
size_t pse_table_size;		/* Number of mutexes in pse_mutex[] */
int pse_shift;			/* log2(pse_table_size) */

struct seg ktextseg;		/* Segment used for kernel executable image */
struct seg kvalloc;		/* Segment used for "valloc" mapping */
struct seg kpseg;		/* Segment used for pageable kernel virt mem */
struct seg kmapseg;		/* Segment used for generic kernel mappings */
struct seg kdebugseg;		/* Segment used for the kernel debugger */

struct seg *segkmap = &kmapseg;	/* Kernel generic mapping segment */
static struct seg *segmap = &kmapseg;	/* easier to use name for in here */

struct seg *segkp = &kpseg;	/* Pageable kernel virtual memory segment */
#if defined(__amd64)
struct seg kvseg_core;		/* Segment used for the core heap */
struct seg kpmseg;		/* Segment used for physical mapping */
struct seg *segkpm = &kpmseg;	/* 64bit kernel physical mapping segment */
#else
struct seg *segkpm = NULL;	/* Unused on IA32 */
#endif

caddr_t segkp_base;		/* Base address of segkp */
caddr_t segzio_base;		/* Base address of segzio */
#if defined(__amd64)
pgcnt_t segkpsize = btop(SEGKPDEFSIZE);	/* size of segkp segment in pages */
#else
pgcnt_t segkpsize = 0;
#endif
pgcnt_t segziosize = 0;		/* size of zio segment in pages */
/*
 * A static DR page_t VA map is reserved that can map the page structures
 * for a domain's entire RA space. The pages that back this space are
 * dynamically allocated and need not be physically contiguous. The DR
 * map size is derived from KPM size.
 * This mechanism isn't used by x86 yet, so just stubs here.
 */
int ppvm_enable = 0;		/* Static virtual map for page structs */
page_t *ppvm_base = NULL;	/* Base of page struct map */
pgcnt_t ppvm_size = 0;		/* Size of page struct map */
/*
 * VA range available to the debugger
 */
const caddr_t kdi_segdebugbase = (const caddr_t)SEGDEBUGBASE;
const size_t kdi_segdebugsize = SEGDEBUGSIZE;

struct memseg *memseg_base;
struct vnode unused_pages_vp;

#define	FOURGB	0x100000000LL

struct memlist *memlist;
caddr_t s_text;		/* start of kernel text segment */
caddr_t e_text;		/* end of kernel text segment */
caddr_t s_data;		/* start of kernel data segment */
caddr_t e_data;		/* end of kernel data segment */
caddr_t modtext;	/* start of loadable module text reserved */
caddr_t e_modtext;	/* end of loadable module text reserved */
caddr_t moddata;	/* start of loadable module data reserved */
caddr_t e_moddata;	/* end of loadable module data reserved */

struct memlist *phys_install;	/* Total installed physical memory */
struct memlist *phys_avail;	/* Total available physical memory */
struct memlist *bios_rsvd;	/* Bios reserved memory */
/*
 * kphysm_init returns the number of pages that were processed
 */
static pgcnt_t kphysm_init(page_t *, pgcnt_t);

#define	IO_PROP_SIZE	64	/* device property size */
/*
 * a couple useful roundup macros
 */
#define	ROUND_UP_PAGE(x)	\
	((uintptr_t)P2ROUNDUP((uintptr_t)(x), (uintptr_t)MMU_PAGESIZE))
#define	ROUND_UP_LPAGE(x)	\
	((uintptr_t)P2ROUNDUP((uintptr_t)(x), mmu.level_size[1]))
#define	ROUND_UP_4MEG(x)	\
	((uintptr_t)P2ROUNDUP((uintptr_t)(x), (uintptr_t)FOUR_MEG))
#define	ROUND_UP_TOPLEVEL(x)	\
	((uintptr_t)P2ROUNDUP((uintptr_t)(x), mmu.level_size[mmu.max_level]))
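/*
 * For example, with a 4K base page and a 2MB level 1 page size,
 * ROUND_UP_PAGE(0x1234) yields 0x2000 and ROUND_UP_LPAGE(0x1234) yields
 * 0x200000; P2ROUNDUP() assumes the alignment is a power of two.
 */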
/*
 *		32-bit Kernel's Virtual memory layout.
 *			+-----------------------+
 * 0xFFC00000  -|-----------------------|- ARGSBASE
 * 0xFF800000  -|-----------------------|- SEGDEBUGBASE
 * 0xFEC00000  -|-----------------------|
 * 0xFE800000  -|-----------------------|- KERNEL_TEXT (0xFB400000 on Xen)
 *		|---       GDT       ---|- GDT page (GDT_VA)
 *		|---    debug info   ---|- debug info (DEBUG_INFO_VA)
 *		|   page_t structures	|
 *		|   memsegs, memlists,	|
 *		|   page hash, etc.	|
 * ---	       -|-----------------------|- ekernelheap, valloc_base (floating)
 *		|			|  (segkp is just an arena in the heap)
 * ---	       -|-----------------------|- kernelheap (floating)
 * 0xC3002000  -|-----------------------|- segmap_start (floating)
 * 0xC3000000  -|-----------------------|- kernelbase / userlimit (floating)
 *		|	Shared objects	|	\/
 *		|-----------------------|
 * 0x08048000  -|-----------------------|
 * 0x00000000	+-----------------------+
 *
 *
 *		64-bit Kernel's Virtual memory layout. (assuming 64 bit app)
 *			+-----------------------+
 * 0xFFFFFFFF.FFC00000	|-----------------------|- ARGSBASE
 * 0xFFFFFFFF.FF800000	|-----------------------|- SEGDEBUGBASE
 *			+-----------------------+
 * 0xFFFFFFFF.FBC00000	|-----------------------|
 * 0xFFFFFFFF.FB800000	|-----------------------|- KERNEL_TEXT
 *			|---       GDT       ---|- GDT page (GDT_VA)
 *			|---    debug info   ---|- debug info (DEBUG_INFO_VA)
 *			|      Core heap	| (used for loadable modules)
 * 0xFFFFFFFF.C0000000	|-----------------------|- core_base / ekernelheap
 * 0xFFFFFXXX.XXX00000	|-----------------------|- kernelheap (floating)
 * 0xFFFFFXXX.XXX00000	|-----------------------|- segmap_start (floating)
 *			|    device mappings	|
 * 0xFFFFFXXX.XXX00000	|-----------------------|- toxic_addr (floating)
 * 0xFFFFFXXX.XXX00000	|-----------------------|- segzio_base (floating)
 * ---			|-----------------------|- segkp_base (floating)
 *			|   page_t structures	|  valloc_base + valloc_sz
 *			|   memsegs, memlists,	|
 *			|   page hash, etc.	|
 * 0xFFFFFF00.00000000	|-----------------------|- valloc_base (lower if >256GB)
 * 0xFFFFFE00.00000000	|-----------------------|
 * 0xFFFFFD80.00000000	|-----------------------|- KERNELBASE (lower if >256GB)
 *			|     User stack	|- User space memory
 *			| shared objects, etc	|	(grows downwards)
 * 0xFFFF8000.00000000	|-----------------------|
 *			|   VA Hole / unused	|
 * 0x00008000.00000000	|-----------------------|
 *			|       user heap	|	(grows upwards)
 *			|-----------------------|
 * 0x00000000.04000000	|-----------------------|
 * 0x00000000.00000000	+-----------------------+
 *
 * A 32 bit app on the 64 bit kernel sees the same layout as on the 32 bit
 * kernel, except that userlimit is raised to 0xfe000000
 *
 * Floating values:
 *
 * valloc_base: start of the kernel's memory management/tracking data
 * structures.  This region contains page_t structures for
 * physical memory, memsegs, memlists, and the page hash.
 *
 * core_base: start of the kernel's "core" heap area on 64-bit systems.
 * This area is intended to be used for global data as well as for module
 * text/data that does not fit into the nucleus pages.  The core heap is
 * restricted to a 2GB range, allowing every address within it to be
 * accessed using rip-relative addressing
 *
 * ekernelheap: end of kernelheap and start of segmap.
 *
 * kernelheap: start of kernel heap.  On 32-bit systems, this starts right
 * above a red zone that separates the user's address space from the
 * kernel's.  On 64-bit systems, it sits above segkp and segkpm.
 *
 * segmap_start: start of segmap. The length of segmap can be modified
 * through eeprom. The default length is 16MB on 32-bit systems and 64MB
 * on 64-bit systems.
 *
 * kernelbase: On a 32-bit kernel the default value of 0xd4000000 will be
 * decreased by 2X the size required for page_t.  This allows the kernel
 * heap to grow in size with physical memory.  With sizeof(page_t) == 80
 * bytes, the following shows the values of kernelbase and kernel heap
 * sizes for different memory configurations (assuming default segmap and
 * segkp sizes).
 *
 *	mem	size for	kernelbase	kernel heap
 *	size	page_t's			size
 *	----	---------	----------	-----------
 *	1gb	0x01400000	0xd1800000	684MB
 *	2gb	0x02800000	0xcf000000	704MB
 *	4gb	0x05000000	0xca000000	744MB
 *	6gb	0x07800000	0xc5000000	784MB
 *	8gb	0x0a000000	0xc0000000	824MB
 *	16gb	0x14000000	0xac000000	984MB
 *	32gb	0x28000000	0x84000000	1304MB
 *	64gb	0x50000000	0x34000000	1944MB (*)
 *
 * kernelbase is less than the abi minimum of 0xc0000000 for memory
 * configurations above 8gb.
 *
 * (*) support for memory configurations above 32gb will require manual tuning
 * of kernelbase to balance out the need of user applications.
 */
/* real-time-clock initialization parameters */
extern time_t process_rtc_config_file(void);

uintptr_t kernelbase;
uintptr_t postbootkernelbase;	/* not set till boot loader is gone */
uintptr_t eprom_kernelbase;
uintptr_t segmap_start;

size_t core_size;		/* size of "core" heap */
uintptr_t core_base;		/* base address of "core" heap */
/*
 * List of bootstrap pages. We mark these as allocated in startup.
 * release_bootstrap() will free them when we're completely done with
 * the bootstrap.
 */
static page_t *bootpages;

/*
 * boot time pages that have a vnode from the ramdisk will keep that forever.
 */
static page_t *rd_pages;

static page_t *lower_pages = NULL;
static int lower_pages_count = 0;

struct system_hardware system_hardware;
/*
 * Enable some debugging messages concerning memory usage...
 */
static void
print_memlist(char *title, struct memlist *mp)
{
	prom_printf("MEMLIST: %s:\n", title);
	while (mp != NULL) {
		prom_printf("\tAddress 0x%" PRIx64 ", size 0x%" PRIx64 "\n",
		    mp->ml_address, mp->ml_size);
		mp = mp->ml_next;
	}
}
/*
 * XX64 need a comment here.. are these just default values, surely
 * we read the "cpuid" type information to figure this out.
 */
int	l2cache_sz = 0x80000;
int	l2cache_linesz = 0x40;
int	l2cache_assoc = 1;

static size_t textrepl_min_gb = 10;
/*
 * on 64 bit we use a predefined VA range for mapping devices in the kernel
 * on 32 bit the mappings are intermixed in the heap, so we use a bit map
 */
#if defined(__amd64)
vmem_t		*device_arena;
uintptr_t	toxic_addr = (uintptr_t)NULL;
size_t		toxic_size = 1024 * 1024 * 1024; /* Sparc uses 1 gig too */
#else	/* __i386 */
ulong_t		*toxic_bit_map;	/* one bit for each 4k of VA in heap_arena */
size_t		toxic_bit_map_len = 0;	/* in bits */
#endif	/* __i386 */
/*
 * Simple boot time debug facilities
 */
static char *prm_dbg_str[] = {
	"%s:%d: '%s' is 0x%x\n",
	"%s:%d: '%s' is 0x%llx\n"
};

int prom_debug;

#define	PRM_DEBUG(q)	if (prom_debug)		\
	prom_printf(prm_dbg_str[sizeof (q) >> 3], "startup.c", __LINE__, #q, q);
#define	PRM_POINT(q)	if (prom_debug)		\
	prom_printf("%s:%d: %s\n", "startup.c", __LINE__, q);
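/*
 * PRM_DEBUG() picks its format string by operand size: sizeof (q) >> 3
 * evaluates to 0 for 32-bit quantities (printed with 0x%x) and to 1 for
 * 64-bit quantities (printed with 0x%llx).
 */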
/*
 * This structure is used to keep track of the initial allocations
 * done in startup_memlist(). The value of NUM_ALLOCATIONS needs to
 * be >= the number of ADD_TO_ALLOCATIONS() executed in the code.
 */
#define	NUM_ALLOCATIONS 8
int num_allocations = 0;
struct {
	void **al_ptr;
	size_t al_size;
} allocations[NUM_ALLOCATIONS];
size_t valloc_sz = 0;
uintptr_t valloc_base;
#define	ADD_TO_ALLOCATIONS(ptr, size) {					\
		size = ROUND_UP_PAGE(size);				\
		if (num_allocations == NUM_ALLOCATIONS)			\
			panic("too many ADD_TO_ALLOCATIONS()");		\
		allocations[num_allocations].al_ptr = (void**)&ptr;	\
		allocations[num_allocations].al_size = size;		\
		valloc_sz += size;					\
		++num_allocations;					\
	}
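/*
 * Typical use, as in startup_memlist() below:
 *
 *	memseg_sz = sizeof (struct memseg) * (memblocks + POSS_NEW_FRAGMENTS);
 *	ADD_TO_ALLOCATIONS(memseg_base, memseg_sz);
 *
 * Each call page-aligns the request, remembers where to store the
 * resulting pointer and grows valloc_sz; perform_allocations() later
 * satisfies all of the registered requests from a single BOP_ALLOC().
 */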
/*
 * Allocate all the initial memory needed by the page allocator.
 */
static void
perform_allocations(void)
{
	caddr_t mem;
	int i;
	int valloc_align;

	PRM_DEBUG(valloc_base);
	PRM_DEBUG(valloc_sz);
	valloc_align = mmu.level_size[mmu.max_page_level > 0];
	mem = BOP_ALLOC(bootops, (caddr_t)valloc_base, valloc_sz, valloc_align);
	if (mem != (caddr_t)valloc_base)
		panic("BOP_ALLOC() failed");
	bzero(mem, valloc_sz);
	for (i = 0; i < num_allocations; ++i) {
		*allocations[i].al_ptr = (void *)mem;
		mem += allocations[i].al_size;
	}
}
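/*
 * Note that one boot-time allocation at valloc_base backs every request
 * registered via ADD_TO_ALLOCATIONS(); the loop above just carves it up
 * in registration order, so the relative layout of these structures is
 * fixed once startup_memlist() has run.
 */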
/*
 * Set up and enable SMAP now before we start other CPUs, but after the
 * kernel's VM has been set up so we can use hot_patch_kernel_text().
 *
 * We can only patch 1, 2, or 4 bytes, but not three bytes. So instead, we
 * replace the four byte word at the patch point. See uts/intel/ia32/ml/copy.s
 * for more information on what's going on here.
 */
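/*
 * For illustration: clac encodes as the three bytes 0x0f 0x01 0xca and
 * stac as 0x0f 0x01 0xcb, so the replacement word below keeps whatever
 * fourth byte already sits at the patch point:
 * (instp[3] << 24) | (instr & 0x00ffffff).
 */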
#if defined(__amd64) && !defined(__xpv)
void
startup_smap(void)
{
	int i;
	uint32_t inst;
	uint8_t *instp;
	char sym[128];

	extern int _smap_enable_patch_count;
	extern int _smap_disable_patch_count;

	if (disable_smap != 0)
		remove_x86_feature(x86_featureset, X86FSET_SMAP);

	if (is_x86_feature(x86_featureset, X86FSET_SMAP) == B_FALSE)
		return;

	for (i = 0; i < _smap_enable_patch_count; i++) {
		int sizep;

		VERIFY3U(i, <, _smap_enable_patch_count);
		VERIFY(snprintf(sym, sizeof (sym), "_smap_enable_patch_%d", i) <
		    sizeof (sym));
		instp = (uint8_t *)(void *)kobj_getelfsym(sym, NULL, &sizep);
		VERIFY(instp != 0);
		inst = (instp[3] << 24) | (SMAP_CLAC_INSTR & 0x00ffffff);
		hot_patch_kernel_text((caddr_t)instp, inst, 4);
	}

	for (i = 0; i < _smap_disable_patch_count; i++) {
		int sizep;

		VERIFY(snprintf(sym, sizeof (sym), "_smap_disable_patch_%d",
		    i) < sizeof (sym));
		instp = (uint8_t *)(void *)kobj_getelfsym(sym, NULL, &sizep);
		VERIFY(instp != 0);
		inst = (instp[3] << 24) | (SMAP_STAC_INSTR & 0x00ffffff);
		hot_patch_kernel_text((caddr_t)instp, inst, 4);
	}

	hot_patch_kernel_text((caddr_t)smap_enable, SMAP_CLAC_INSTR, 4);
	hot_patch_kernel_text((caddr_t)smap_disable, SMAP_STAC_INSTR, 4);
	setcr4(getcr4() | CR4_SMAP);
}
#endif	/* __amd64 && !__xpv */
/*
 * Our world looks like this at startup time.
 *
 * In a 32-bit OS, boot loads the kernel text at 0xfe800000 and kernel data
 * at 0xfec00000.  On a 64-bit OS, kernel text and data are loaded at
 * 0xffffffff.fe800000 and 0xffffffff.fec00000 respectively.  Those
 * addresses are fixed in the binary at link time.
 *
 * On the text page:
 * unix/genunix/krtld/module text loads.
 *
 * On the data page:
 * unix/genunix/krtld/module data loads.
 *
 * Machine-dependent startup code
 */
void
startup(void)
{
	extern void startup_pci_bios(void);
	extern cpuset_t cpu_ready_set;

	/*
	 * Make sure that nobody tries to use sekpm until we have
	 * initialized it properly.
	 */
#if defined(__amd64)
	kpm_desired = 1;
#endif
	kpm_enable = 0;
	CPUSET_ONLY(cpu_ready_set, 0);	/* cpu 0 is boot cpu */

#if defined(__xpv)	/* XXPV fix me! */
	{
		extern int segvn_use_regions;
		segvn_use_regions = 0;
	}
#endif
	progressbar_init();
	startup_init();
#if defined(__xpv)
	startup_xen_version();
#endif
	startup_memlist();
	startup_kmem();
	startup_vm();
#if !defined(__xpv)
	/*
	 * Note we need to do this even on fast reboot in order to access
	 * the irq routing table (used for pci labels).
	 */
	startup_pci_bios();
	startup_smap();
#endif
	startup_modules();

	startup_end();
}
static void
startup_init()
{
	PRM_POINT("startup_init() starting...");

	/*
	 * Complete the extraction of cpuid data
	 */
	cpuid_pass2(CPU);

	(void) check_boot_version(BOP_GETVERSION(bootops));

	/*
	 * Check for prom_debug in boot environment
	 */
	if (BOP_GETPROPLEN(bootops, "prom_debug") >= 0) {
		++prom_debug;
		PRM_POINT("prom_debug found in boot environment");
	}

	/*
	 * Collect node, cpu and memory configuration information.
	 */
	get_system_configuration();

	/*
	 * Halt if this is an unsupported processor.
	 */
	if (x86_type == X86_TYPE_486 || x86_type == X86_TYPE_CYRIX_486) {
		printf("\n486 processor (\"%s\") detected.\n",
		    CPU->cpu_brandstr);
		halt("This processor is not supported by this release "
		    "of Solaris.");
	}

	PRM_POINT("startup_init() done");
}
/*
 * Callback for copy_memlist_filter() to filter nucleus, kadb/kmdb, (ie.
 * everything mapped above KERNEL_TEXT) pages from phys_avail. Note it
 * also filters out physical page zero. There is some reliance on the
 * boot loader allocating only a few contiguous physical memory chunks.
 */
static void
avail_filter(uint64_t *addr, uint64_t *size)
{
	uintptr_t va;
	uintptr_t next_va;
	pfn_t pfn;
	uint64_t pfn_addr;
	uint64_t pfn_eaddr;
	uint_t prot;
	size_t len;
	uint_t change;

	if (prom_debug)
		prom_printf("\tFilter: in: a=%" PRIx64 ", s=%" PRIx64 "\n",
		    *addr, *size);

	/*
	 * page zero is required for BIOS.. never make it available
	 */
	if (*addr == 0) {
		*addr += MMU_PAGESIZE;
		*size -= MMU_PAGESIZE;
	}
	/*
	 * First we trim from the front of the range. Since kbm_probe()
	 * walks ranges in virtual order, but addr/size are physical, we need
	 * to re-walk the list until no changes are seen. This deals with the
	 * case where page "p" is mapped at v, page "p + PAGESIZE" is mapped
	 * at w and w > v.
	 */
	do {
		change = 0;
		for (va = KERNEL_TEXT;
		    *size > 0 && kbm_probe(&va, &len, &pfn, &prot) != 0;
		    va = next_va) {

			next_va = va + len;
			pfn_addr = pfn_to_pa(pfn);
			pfn_eaddr = pfn_addr + len;

			if (pfn_addr <= *addr && pfn_eaddr > *addr) {
				change = 1;
				while (*size > 0 && len > 0) {
					*addr += MMU_PAGESIZE;
					*size -= MMU_PAGESIZE;
					len -= MMU_PAGESIZE;
				}
			}
		}
		if (change && prom_debug)
			prom_printf("\t\ttrim: a=%" PRIx64 ", s=%" PRIx64 "\n",
			    *addr, *size);
	} while (change);
	/*
	 * Trim pages from the end of the range.
	 */
	for (va = KERNEL_TEXT;
	    *size > 0 && kbm_probe(&va, &len, &pfn, &prot) != 0;
	    va = next_va) {

		next_va = va + len;
		pfn_addr = pfn_to_pa(pfn);

		if (pfn_addr >= *addr && pfn_addr < *addr + *size)
			*size = pfn_addr - *addr;
	}

	if (prom_debug)
		prom_printf("\tFilter out: a=%" PRIx64 ", s=%" PRIx64 "\n",
		    *addr, *size);
}
static void
kpm_init()
{
	struct segkpm_crargs b;

	/*
	 * These variables were all designed for sfmmu in which segkpm is
	 * mapped using a single pagesize - either 8KB or 4MB.  On x86, we
	 * might use 2+ page sizes on a single machine, so none of these
	 * variables have a single correct value.  They are set up as if we
	 * always use a 4KB pagesize, which should do no harm.  In the long
	 * run, we should get rid of KPM's assumption that only a single
	 * pagesize is used.
	 */
	kpm_pgshft = MMU_PAGESHIFT;
	kpm_pgsz = MMU_PAGESIZE;
	kpm_pgoff = MMU_PAGEOFFSET;
	ASSERT(((uintptr_t)kpm_vbase & (kpm_pgsz - 1)) == 0);

	PRM_POINT("about to create segkpm");
	rw_enter(&kas.a_lock, RW_WRITER);

	if (seg_attach(&kas, kpm_vbase, kpm_size, segkpm) < 0)
		panic("cannot attach segkpm");

	b.prot = PROT_READ | PROT_WRITE;

	if (segkpm_create(segkpm, (caddr_t)&b) != 0)
		panic("segkpm_create segkpm");

	rw_exit(&kas.a_lock);
}
/*
 * The debug info page provides enough information to allow external
 * inspectors (e.g. when running under a hypervisor) to bootstrap
 * themselves into allowing full-blown kernel debugging.
 */
static void
init_debug_info(void)
{
	caddr_t mem;
	debug_info_t *di;

	ASSERT(sizeof (debug_info_t) < MMU_PAGESIZE);

	mem = BOP_ALLOC(bootops, (caddr_t)DEBUG_INFO_VA, MMU_PAGESIZE,
	    MMU_PAGESIZE);

	if (mem != (caddr_t)DEBUG_INFO_VA)
		panic("BOP_ALLOC() failed");
	bzero(mem, MMU_PAGESIZE);

	di = (debug_info_t *)mem;

	di->di_magic = DEBUG_INFO_MAGIC;
	di->di_version = DEBUG_INFO_VERSION;
	di->di_modules = (uintptr_t)&modules;
	di->di_s_text = (uintptr_t)s_text;
	di->di_e_text = (uintptr_t)e_text;
	di->di_s_data = (uintptr_t)s_data;
	di->di_e_data = (uintptr_t)e_data;
	di->di_hat_htable_off = offsetof(hat_t, hat_htable);
	di->di_ht_pfn_off = offsetof(htable_t, ht_pfn);
}
/*
 * Build the memlists and other kernel essential memory system data structures.
 * This is everything at valloc_base.
 */
static void
startup_memlist(void)
{
	size_t memlist_sz;
	size_t rsvdmemlist_sz;
	size_t pagehash_sz;
	size_t pp_sz;
	uintptr_t va;
	size_t len;
	uint_t prot;
	pfn_t pfn;
	int memblocks;
	pfn_t rsvd_high_pfn;
	pgcnt_t rsvd_pgcnt;
	int rsvdmemblocks;
	caddr_t pagecolor_mem;
	size_t pagecolor_memsz;
	caddr_t page_ctrs_mem;
	size_t page_ctrs_size;
	size_t pse_table_alloc_size;
	struct memlist *current;
	extern void startup_build_mem_nodes(struct memlist *);

	/* XX64 fix these - they should be in include files */
	extern size_t page_coloring_init(uint_t, int, int);
	extern void page_coloring_setup(caddr_t);

	PRM_POINT("startup_memlist() starting...");
	/*
	 * Use leftover large page nucleus text/data space for loadable
	 * modules.  Use at most MODTEXT/MODDATA.
	 */
	len = kbm_nucleus_size;
	ASSERT(len > MMU_PAGESIZE);

	moddata = (caddr_t)ROUND_UP_PAGE(e_data);
	e_moddata = (caddr_t)P2ROUNDUP((uintptr_t)e_data, (uintptr_t)len);
	if (e_moddata - moddata > MODDATA)
		e_moddata = moddata + MODDATA;

	modtext = (caddr_t)ROUND_UP_PAGE(e_text);
	e_modtext = (caddr_t)P2ROUNDUP((uintptr_t)e_text, (uintptr_t)len);
	if (e_modtext - modtext > MODTEXT)
		e_modtext = modtext + MODTEXT;

	econtig = e_moddata;

	PRM_DEBUG(e_modtext);
	PRM_DEBUG(e_moddata);
	/*
	 * Examine the boot loader physical memory map to find out:
	 * - total memory in system - physinstalled
	 * - the max physical address - physmax
	 * - the number of discontiguous segments of memory.
	 */
	if (prom_debug)
		print_memlist("boot physinstalled",
		    bootops->boot_mem->physinstalled);
	installed_top_size_ex(bootops->boot_mem->physinstalled, &physmax,
	    &physinstalled, &memblocks);
	PRM_DEBUG(physinstalled);
	PRM_DEBUG(memblocks);
	/*
	 * Compute maximum physical address for memory DR operations.
	 * Memory DR operations are unsupported on xpv or 32bit OSes.
	 */
#ifdef	__amd64
	if (plat_dr_support_memory()) {
		if (plat_dr_physmax == 0) {
			uint_t pabits = UINT_MAX;

			cpuid_get_addrsize(CPU, &pabits, NULL);
			plat_dr_physmax = btop(1ULL << pabits);
		}
		if (plat_dr_physmax > PHYSMEM_MAX64)
			plat_dr_physmax = PHYSMEM_MAX64;
	} else
#endif
		plat_dr_physmax = 0;
	/*
	 * Examine the bios reserved memory to find out:
	 * - the number of discontiguous segments of memory.
	 */
	if (prom_debug)
		print_memlist("boot reserved mem",
		    bootops->boot_mem->rsvdmem);
	installed_top_size_ex(bootops->boot_mem->rsvdmem, &rsvd_high_pfn,
	    &rsvd_pgcnt, &rsvdmemblocks);
	PRM_DEBUG(rsvd_high_pfn);
	PRM_DEBUG(rsvd_pgcnt);
	PRM_DEBUG(rsvdmemblocks);
	/*
	 * Initialize hat's mmu parameters.
	 * Check for enforce-prot-exec in boot environment. It's used to
	 * enable/disable support for the page table entry NX bit.
	 * The default is to enforce PROT_EXEC on processors that support NX.
	 * Boot seems to round up the "len", but 8 seems to be big enough.
	 */
	mmu_init();
#ifdef	__i386
	/*
	 * physmax is lowered if there is more memory than can be
	 * physically addressed in 32 bit (PAE/non-PAE) modes.
	 */
	if (mmu.pae_hat) {
		if (PFN_ABOVE64G(physmax)) {
			physinstalled -= (physmax - (PFN_64G - 1));
			physmax = PFN_64G - 1;
		}
	} else {
		if (PFN_ABOVE4G(physmax)) {
			physinstalled -= (physmax - (PFN_4G - 1));
			physmax = PFN_4G - 1;
		}
	}
#endif	/* __i386 */
->boot_mem
->physinstalled
);
1124 if (BOP_GETPROPLEN(bootops
, "enforce-prot-exec") >= 0) {
1125 int len
= BOP_GETPROPLEN(bootops
, "enforce-prot-exec");
1129 (void) BOP_GETPROP(bootops
, "enforce-prot-exec", value
);
1131 (void) strcpy(value
, "");
1132 if (strcmp(value
, "off") == 0)
1135 PRM_DEBUG(mmu
.pt_nx
);
	/*
	 * We will need page_t's for every page in the system, except for
	 * memory mapped at or above the start of the kernel text segment.
	 *
	 * pages above e_modtext are attributed to kernel debugger (obp_pages)
	 */
	npages = physinstalled - 1; /* avail_filter() skips page 0, so "- 1" */
	obp_pages = 0;
	va = KERNEL_TEXT;
	while (kbm_probe(&va, &len, &pfn, &prot) != 0) {
		npages -= len >> MMU_PAGESHIFT;
		if (va >= (uintptr_t)e_moddata)
			obp_pages += len >> MMU_PAGESHIFT;
		va += len;
	}
	PRM_DEBUG(npages);
	PRM_DEBUG(obp_pages);
	/*
	 * If physmem is patched to be non-zero, use it instead of the computed
	 * value unless it is larger than the actual amount of memory on hand.
	 */
	if (physmem == 0 || physmem > npages) {
		physmem = npages;
	} else if (physmem < npages) {
		orig_npages = npages;
		npages = physmem;
	}
	PRM_DEBUG(npages);
	/*
	 * We now compute the sizes of all the initial allocations for
	 * structures the kernel needs in order to do kmem_alloc(). These
	 * include:
	 *	memsegs
	 *	memlists
	 *	page hash table
	 *	page_t's
	 *	page coloring data structs
	 */
	memseg_sz = sizeof (struct memseg) * (memblocks + POSS_NEW_FRAGMENTS);
	ADD_TO_ALLOCATIONS(memseg_base, memseg_sz);
	PRM_DEBUG(memseg_sz);
	/*
	 * Reserve space for memlists. There's no real good way to know exactly
	 * how much room we'll need, but this should be a good upper bound.
	 */
	memlist_sz = ROUND_UP_PAGE(2 * sizeof (struct memlist) *
	    (memblocks + POSS_NEW_FRAGMENTS));
	ADD_TO_ALLOCATIONS(memlist, memlist_sz);
	PRM_DEBUG(memlist_sz);

	/*
	 * Reserve space for bios reserved memlists.
	 */
	rsvdmemlist_sz = ROUND_UP_PAGE(2 * sizeof (struct memlist) *
	    (rsvdmemblocks + POSS_NEW_FRAGMENTS));
	ADD_TO_ALLOCATIONS(bios_rsvd, rsvdmemlist_sz);
	PRM_DEBUG(rsvdmemlist_sz);
	ASSERT(P2SAMEHIGHBIT((1 << PP_SHIFT), sizeof (struct page)));
	/*
	 * The page structure hash table size is a power of 2
	 * such that the average hash chain length is PAGE_HASHAVELEN.
	 */
	page_hashsz = npages / PAGE_HASHAVELEN;
	page_hashsz_shift = highbit(page_hashsz);
	page_hashsz = 1 << page_hashsz_shift;
	pagehash_sz = sizeof (struct page *) * page_hashsz;
	ADD_TO_ALLOCATIONS(page_hash, pagehash_sz);
	PRM_DEBUG(pagehash_sz);
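	/*
	 * As a worked example: 8GB of 4K pages gives npages of roughly 2M;
	 * with a PAGE_HASHAVELEN of 4 that starts at 512K buckets, which
	 * highbit() then rounds up to the next power of two, 1M buckets
	 * (8MB of hash pointers on a 64-bit kernel).
	 */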
	/*
	 * Set aside room for the page structures themselves.
	 */
	pp_sz = sizeof (struct page) * npages;
	ADD_TO_ALLOCATIONS(pp_base, pp_sz);
	/*
	 * determine l2 cache info and memory size for page coloring
	 */
	(void) getl2cacheinfo(CPU,
	    &l2cache_sz, &l2cache_linesz, &l2cache_assoc);
	pagecolor_memsz =
	    page_coloring_init(l2cache_sz, l2cache_linesz, l2cache_assoc);
	ADD_TO_ALLOCATIONS(pagecolor_mem, pagecolor_memsz);
	PRM_DEBUG(pagecolor_memsz);

	page_ctrs_size = page_ctrs_sz();
	ADD_TO_ALLOCATIONS(page_ctrs_mem, page_ctrs_size);
	PRM_DEBUG(page_ctrs_size);
	/*
	 * Allocate the array that protects pp->p_selock.
	 */
	pse_shift = size_pse_array(physmem, max_ncpus);
	pse_table_size = 1 << pse_shift;
	pse_table_alloc_size = pse_table_size * sizeof (pad_mutex_t);
	ADD_TO_ALLOCATIONS(pse_mutex, pse_table_alloc_size);
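	/*
	 * pse_table_size is kept a power of two so that a page's p_selock
	 * mutex can be located by masking a hash of the page rather than
	 * by a modulo; each pad_mutex_t is padded out so that the mutexes
	 * do not share cache lines between CPUs.
	 */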
#if defined(__amd64)
	valloc_sz = ROUND_UP_LPAGE(valloc_sz);
	valloc_base = VALLOC_BASE;

	/*
	 * The default values of VALLOC_BASE and SEGKPM_BASE should work
	 * for values of physmax up to 256GB (1/4 TB). They need adjusting when
	 * memory is at addresses above 256GB. When adjusted, segkpm_base must
	 * be aligned on KERNEL_REDZONE_SIZE boundary (span of top level pte).
	 *
	 * In the general case (>256GB), we use (4 * physmem) for the
	 * kernel's virtual addresses, which is divided approximately
	 * as follows:
	 *  - 1 * physmem for segkpm
	 *  - 1.5 * physmem for segzio
	 *  - 1.5 * physmem for heap
	 * Total: 4.0 * physmem
	 *
	 * Note that the segzio and heap sizes are more than physmem so that
	 * VA fragmentation does not prevent either of them from being
	 * able to use nearly all of physmem.  The value of 1.5x is determined
	 * experimentally and may need to change if the workload changes.
	 */
	if (physmax + 1 > mmu_btop(TERABYTE / 4) ||
	    plat_dr_physmax > mmu_btop(TERABYTE / 4)) {
		uint64_t kpm_resv_amount = mmu_ptob(physmax + 1);

		if (kpm_resv_amount < mmu_ptob(plat_dr_physmax)) {
			kpm_resv_amount = mmu_ptob(plat_dr_physmax);
		}

		/*
		 * This is what actually controls the KVA : UVA split.
		 * The kernel uses high VA, and this is lowering the
		 * boundary, thus increasing the amount of VA for the kernel.
		 * This gives the kernel 4 * (amount of physical memory) VA.
		 *
		 * The maximum VA is UINT64_MAX and we are using
		 * 64-bit 2's complement math, so e.g. if you have 512GB
		 * of memory, segkpm_base = -(4 * 512GB) == -2TB ==
		 * UINT64_MAX - 2TB (approximately).  So the kernel's
		 * VA is [UINT64_MAX-2TB to UINT64_MAX].
		 */
		segkpm_base = -(P2ROUNDUP((4 * kpm_resv_amount),
		    KERNEL_REDZONE_SIZE));

		/* make sure we leave some space for user apps above hole */
		segkpm_base = MAX(segkpm_base, AMD64_VA_HOLE_END + TERABYTE);
		if (segkpm_base > SEGKPM_BASE)
			segkpm_base = SEGKPM_BASE;
		PRM_DEBUG(segkpm_base);

		valloc_base = segkpm_base + P2ROUNDUP(kpm_resv_amount, ONE_GIG);
		if (valloc_base < segkpm_base)
			panic("not enough kernel VA to support memory size");
		PRM_DEBUG(valloc_base);
	}
#else	/* __i386 */
	valloc_base = (uintptr_t)(MISC_VA_BASE - valloc_sz);
	valloc_base = P2ALIGN(valloc_base, mmu.level_size[1]);
	PRM_DEBUG(valloc_base);
#endif	/* __i386 */
	/*
	 * do all the initial allocations
	 */
	perform_allocations();
	/*
	 * Build phys_install and phys_avail in kernel memspace.
	 * - phys_install should be all memory in the system.
	 * - phys_avail is phys_install minus any memory mapped before this
	 *    point above KERNEL_TEXT.
	 */
	current = phys_install = memlist;
	copy_memlist_filter(bootops->boot_mem->physinstalled, &current, NULL);
	if ((caddr_t)current > (caddr_t)memlist + memlist_sz)
		panic("physinstalled was too big!");
	if (prom_debug)
		print_memlist("phys_install", phys_install);

	phys_avail = current;
	PRM_POINT("Building phys_avail:\n");
	copy_memlist_filter(bootops->boot_mem->physinstalled, &current,
	    avail_filter);
	if ((caddr_t)current > (caddr_t)memlist + memlist_sz)
		panic("physavail was too big!");
	if (prom_debug)
		print_memlist("phys_avail", phys_avail);

	/*
	 * Free unused memlist items, which may be used by memory DR driver
	 * at runtime.
	 */
	if ((caddr_t)current < (caddr_t)memlist + memlist_sz) {
		memlist_free_block((caddr_t)current,
		    (caddr_t)memlist + memlist_sz - (caddr_t)current);
	}
	/*
	 * Build bios reserved memspace
	 */
	current = bios_rsvd;
	copy_memlist_filter(bootops->boot_mem->rsvdmem, &current, NULL);
	if ((caddr_t)current > (caddr_t)bios_rsvd + rsvdmemlist_sz)
		panic("bios_rsvd was too big!");
	if (prom_debug)
		print_memlist("bios_rsvd", bios_rsvd);

	/*
	 * Free unused memlist items, which may be used by memory DR driver
	 * at runtime.
	 */
	if ((caddr_t)current < (caddr_t)bios_rsvd + rsvdmemlist_sz) {
		memlist_free_block((caddr_t)current,
		    (caddr_t)bios_rsvd + rsvdmemlist_sz - (caddr_t)current);
	}
	/*
	 * setup page coloring
	 */
	page_coloring_setup(pagecolor_mem);
	page_lock_init();	/* currently a no-op */

	/*
	 * free page list counters
	 */
	(void) page_ctrs_alloc(page_ctrs_mem);
	/*
	 * Size the pcf array based on the number of cpus in the box at
	 * boot time.
	 */
	pcf_init();

	/*
	 * Initialize the page structures from the memory lists.
	 */
	availrmem_initial = availrmem = freemem = 0;
	PRM_POINT("Calling kphysm_init()...");
	npages = kphysm_init(pp_base, npages);
	PRM_POINT("kphysm_init() done");
	/*
	 * Now that page_t's have been initialized, remove all the
	 * initial allocation pages from the kernel free page lists.
	 */
	boot_mapin((caddr_t)valloc_base, valloc_sz);
	boot_mapin((caddr_t)MISC_VA_BASE, MISC_VA_SIZE);
	PRM_POINT("startup_memlist() done");

	PRM_DEBUG(valloc_sz);
#if defined(__amd64)
	if ((availrmem >> (30 - MMU_PAGESHIFT)) >=
	    textrepl_min_gb && l2cache_sz <= 2 << 20) {
		extern size_t textrepl_size_thresh;

		textrepl_size_thresh = (16 << 20) - 1;
	}
#endif
}
/*
 * Layout the kernel's part of address space and initialize kmem allocator.
 */
static void
startup_kmem(void)
{
	extern void page_set_colorequiv_arr(void);

	PRM_POINT("startup_kmem() starting...");

#if defined(__amd64)
	if (eprom_kernelbase && eprom_kernelbase != KERNELBASE)
		cmn_err(CE_NOTE, "!kernelbase cannot be changed on 64-bit "
		    "systems.");
	kernelbase = segkpm_base - KERNEL_REDZONE_SIZE;
	core_base = (uintptr_t)COREHEAP_BASE;
	core_size = (size_t)MISC_VA_BASE - COREHEAP_BASE;
#else	/* __i386 */
	/*
	 * We configure kernelbase based on:
	 *
	 * 1. user specified kernelbase via eeprom command. Value cannot exceed
	 *    KERNELBASE_MAX. we large page align eprom_kernelbase
	 *
	 * 2. Default to KERNELBASE and adjust to 2X less the size for page_t.
	 *    On large memory systems we must lower kernelbase to allow
	 *    enough room for page_t's for all of memory.
	 *
	 * The value set here, might be changed a little later.
	 */
	if (eprom_kernelbase) {
		kernelbase = eprom_kernelbase & mmu.level_mask[1];
		if (kernelbase > KERNELBASE_MAX)
			kernelbase = KERNELBASE_MAX;
	} else {
		kernelbase = (uintptr_t)KERNELBASE;
		kernelbase -= ROUND_UP_4MEG(2 * valloc_sz);
	}
	ASSERT((kernelbase & mmu.level_offset[1]) == 0);
	core_base = valloc_base;
	core_size = 0;
#endif	/* __i386 */

	PRM_DEBUG(core_base);
	PRM_DEBUG(core_size);
	PRM_DEBUG(kernelbase);
= (char *)core_base
;
1461 PRM_DEBUG(ekernelheap
);
1464 * Now that we know the real value of kernelbase,
1465 * update variables that were initialized with a value of
1466 * KERNELBASE (in common/conf/param.c).
1468 * XXX The problem with this sort of hackery is that the
1469 * compiler just may feel like putting the const declarations
1470 * (in param.c) into the .text section. Perhaps they should
1471 * just be declared as variables there?
1474 *(uintptr_t *)&_kernelbase
= kernelbase
;
1475 *(uintptr_t *)&_userlimit
= kernelbase
;
1476 #if defined(__amd64)
1477 *(uintptr_t *)&_userlimit
-= KERNELBASE
- USERLIMIT
;
1479 *(uintptr_t *)&_userlimit32
= _userlimit
;
1481 PRM_DEBUG(_kernelbase
);
1482 PRM_DEBUG(_userlimit
);
1483 PRM_DEBUG(_userlimit32
);
	layout_kernel_va();

#if defined(__i386)
	/*
	 * If segmap is too large we can push the bottom of the kernel heap
	 * higher than the base.  Or worse, it could exceed the top of the
	 * VA space entirely, causing it to wrap around.
	 */
	if (kernelheap >= ekernelheap || (uintptr_t)kernelheap < kernelbase)
		panic("too little address space available for kernelheap,"
		    " use eeprom for lower kernelbase or smaller segmapsize");
#endif	/* __i386 */

	/*
	 * Initialize the kernel heap. Note 3rd argument must be > 1st.
	 */
	kernelheap_init(kernelheap, ekernelheap,
	    kernelheap + MMU_PAGESIZE,
	    (void *)core_base, (void *)(core_base + core_size));
#if defined(__xpv)
	/*
	 * Link pending events struct into cpu struct
	 */
	CPU->cpu_m.mcpu_evt_pend = &cpu0_evt_data;
#endif
	/*
	 * Initialize kernel memory allocator.
	 */
	kmem_init();

	/*
	 * Factor in colorequiv to check additional 'equivalent' bins
	 */
	page_set_colorequiv_arr();

	/*
	 * print this out early so that we know what's going on
	 */
	print_x86_featureset(x86_featureset);

	/*
	 * Initialize bp_mapin().
	 */
	bp_init(MMU_PAGESIZE, HAT_STORECACHING_OK);
	/*
	 * orig_npages is non-zero if physmem has been configured for less
	 * than the available memory.
	 */
	if (orig_npages) {
		cmn_err(CE_WARN, "!%slimiting physmem to 0x%lx of 0x%lx pages",
		    (npages == PHYSMEM ? "Due to virtual address space " : ""),
		    npages, orig_npages);
	}
#if defined(__i386)
	if (eprom_kernelbase && (eprom_kernelbase != kernelbase))
		cmn_err(CE_WARN, "kernelbase value, User specified 0x%lx, "
		    "System using 0x%lx",
		    (uintptr_t)eprom_kernelbase, (uintptr_t)kernelbase);
#endif

#ifdef	KERNELBASE_ABI_MIN
	if (kernelbase < (uintptr_t)KERNELBASE_ABI_MIN) {
		cmn_err(CE_NOTE, "!kernelbase set to 0x%lx, system is not "
		    "i386 ABI compliant.", (uintptr_t)kernelbase);
	}
#endif
#ifndef __xpv
	if (plat_dr_support_memory()) {
		mem_config_init();
	}
#else	/* __xpv */
	/*
	 * Some of the xen start information has to be relocated up
	 * into the kernel's permanent address space.
	 */
	PRM_POINT("calling xen_relocate_start_info()");
	xen_relocate_start_info();
	PRM_POINT("xen_relocate_start_info() done");

	/*
	 * (Update the vcpu pointer in our cpu structure to point into
	 * the relocated shared info.)
	 */
	CPU->cpu_m.mcpu_vcpu_info =
	    &HYPERVISOR_shared_info->vcpu_info[CPU->cpu_id];
#endif	/* __xpv */

	PRM_POINT("startup_kmem() done");
}
/*
 * If we have detected that we are running in an HVM environment, we need
 * to prepend the PV driver directory to the module search path.
 */
#define	HVM_MOD_DIR "/platform/i86hvm/kernel"
static void
update_default_path()
{
	char *current, *newpath;
	int newlen;

	/*
	 * We are about to resync with krtld.  krtld will reset its
	 * internal module search path iff Solaris has set default_path.
	 * We want to be sure we're prepending this new directory to the
	 * right search path.
	 */
	current = (default_path == NULL) ? kobj_module_path : default_path;

	newlen = strlen(HVM_MOD_DIR) + strlen(current) + 2;
	newpath = kmem_alloc(newlen, KM_SLEEP);
	(void) strcpy(newpath, HVM_MOD_DIR);
	(void) strcat(newpath, " ");
	(void) strcat(newpath, current);

	default_path = newpath;
}
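/*
 * The two extra bytes in newlen above cover the blank that separates the
 * two path components and the terminating NUL.
 */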
static void
startup_modules(void)
{
	int cnt;
	extern void prom_setup(void);
	int32_t v, h;
	char d[11];
	char *cp;
	cmi_hdl_t hdl;

	PRM_POINT("startup_modules() starting...");

	/*
	 * Initialize ten-micro second timer so that drivers will
	 * not get short changed in their init phase. This was
	 * not getting called until clkinit which, on fast cpu's
	 * caused the drv_usecwait to be way too short.
	 */
	microfind();

	if ((get_hwenv() & HW_XEN_HVM) != 0)
		update_default_path();

	/*
	 * Read the GMT lag from /etc/rtc_config.
	 */
	sgmtl(process_rtc_config_file());
	/*
	 * Calculate default settings of system parameters based upon
	 * maxusers, yet allow to be overridden via the /etc/system file.
	 */
	param_calc(0);

	mod_setup();

	/*
	 * Initialize system parameters.
	 */
	param_init();

	/*
	 * Initialize the default brands
	 */
	brand_init();

	/*
	 * maxmem is the amount of physical memory we're playing with.
	 */
	maxmem = physmem;

	/*
	 * Initialize segment management stuff.
	 */
	seg_init();

	if (modload("fs", "specfs") == -1)
		halt("Can't load specfs");

	if (modload("fs", "devfs") == -1)
		halt("Can't load devfs");

	if (modload("fs", "dev") == -1)
		halt("Can't load dev");

	if (modload("fs", "procfs") == -1)
		halt("Can't load procfs");

	(void) modloadonly("sys", "lbl_edition");
	/*
	 * Read cluster configuration data.
	 */
	clconf_init();

#if defined(__xpv)
	(void) xs_early_init();
#endif	/* __xpv */

	/*
	 * Create a kernel device tree. First, create rootnex and
	 * then invoke bus specific code to probe devices.
	 */
	setup_ddi();

#ifdef __xpv
	if (DOMAIN_IS_INITDOMAIN(xen_info))
#endif
	{
		id_t smid;
		smbios_system_t smsys;
		smbios_info_t sminfo;
		char *mfg;
		/*
		 * Load the System Management BIOS into the global ksmbios
		 * handle, if an SMBIOS is present on this system.
		 * Also set "si-hw-provider" property, if not already set.
		 */
		ksmbios = smbios_open(NULL, SMB_VERSION, ksmbios_flags, NULL);
		if (ksmbios != NULL &&
		    ((smid = smbios_info_system(ksmbios, &smsys)) != SMB_ERR) &&
		    (smbios_info_common(ksmbios, smid, &sminfo)) != SMB_ERR) {
			mfg = (char *)sminfo.smbi_manufacturer;
			if (BOP_GETPROPLEN(bootops, "si-hw-provider") < 0) {
				extern char hw_provider[];
				int i;

				for (i = 0; i < SYS_NMLN; i++) {
					if (isprint(mfg[i]))
						hw_provider[i] = mfg[i];
					else {
						hw_provider[i] = '\0';
						break;
					}
				}
				hw_provider[SYS_NMLN - 1] = '\0';
			}
		}
	}
	/*
	 * Originally clconf_init() apparently needed the hostid.  But
	 * this no longer appears to be true - it uses its own nodeid.
	 * By placing the hostid logic here, we are able to make use of
	 * the SMBIOS data gathered just above.
	 */
	if ((h = set_soft_hostid()) == HW_INVALID_HOSTID) {
		cmn_err(CE_WARN, "Unable to set hostid");
	} else {
		for (v = h, cnt = 0; cnt < 10; cnt++) {
			d[cnt] = (char)(v % 10);
			v /= 10;
			if (v == 0)
				break;
		}
		for (cp = hw_serial; cnt >= 0; cnt--)
			*cp++ = d[cnt] + '0';
		*cp = 0;
	}
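	/*
	 * The first loop above peels decimal digits off the hostid least
	 * significant first into d[]; the second loop then emits them in
	 * the correct order, converting each to ASCII, so hw_serial ends
	 * up holding the decimal string form of the hostid.
	 */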
	/*
	 * Set up the CPU module subsystem for the boot cpu in the native
	 * case, and all physical cpu resource in the xpv dom0 case.
	 * Modifies the device tree, so this must be done after
	 * setup_ddi().
	 */
#ifdef __xpv
	/*
	 * If paravirtualized and on dom0 then we initialize all physical
	 * cpu handles now; if paravirtualized on a domU then do not
	 * initialize.
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		xen_mc_lcpu_cookie_t cpi;

		for (cpi = xen_physcpu_next(NULL); cpi != NULL;
		    cpi = xen_physcpu_next(cpi)) {
			if ((hdl = cmi_init(CMI_HDL_SOLARIS_xVM_MCA,
			    xen_physcpu_chipid(cpi), xen_physcpu_coreid(cpi),
			    xen_physcpu_strandid(cpi))) != NULL &&
			    is_x86_feature(x86_featureset, X86FSET_MCA))
				cmi_mca_init(hdl);
		}
	}
#else
	/*
	 * Initialize a handle for the boot cpu - others will initialize
	 * as they startup.  Do not do this if we know we are in an HVM domU.
	 */
	if ((get_hwenv() & HW_XEN_HVM) == 0 &&
	    (hdl = cmi_init(CMI_HDL_NATIVE, cmi_ntv_hwchipid(CPU),
	    cmi_ntv_hwcoreid(CPU), cmi_ntv_hwstrandid(CPU))) != NULL &&
	    is_x86_feature(x86_featureset, X86FSET_MCA)) {
		cmi_mca_init(hdl);
		CPU->cpu_m.mcpu_cmi_hdl = hdl;
	}
#endif	/* __xpv */
	/*
	 * Fake a prom tree such that /dev/openprom continues to work
	 */
	PRM_POINT("startup_modules: calling prom_setup...");
	prom_setup();
	PRM_POINT("startup_modules: done");

	/*
	 * Load all platform specific modules
	 */
	PRM_POINT("startup_modules: calling psm_modload...");
	psm_modload();

	PRM_POINT("startup_modules() done");
}
/*
 * claim a "setaside" boot page for use in the kernel
 */
page_t *
boot_claim_page(pfn_t pfn)
{
	page_t *pp;

	pp = page_numtopp_nolock(pfn);
	ASSERT(pp != NULL);

	if (PP_ISBOOTPAGES(pp)) {
		if (pp->p_next != NULL)
			pp->p_next->p_prev = pp->p_prev;
		if (pp->p_prev == NULL)
			bootpages = pp->p_next;
		else
			pp->p_prev->p_next = pp->p_next;
	} else {
		/*
		 * htable_attach() expects a base pagesize page
		 */
		if (pp->p_szc != 0)
			page_boot_demote(pp);
		pp = page_numtopp(pfn, SE_EXCL);
	}
	return (pp);
}
/*
 * Walk through the pagetables looking for pages mapped in by boot.  If the
 * setaside flag is set the pages are expected to be returned to the
 * kernel later in boot, so we add them to the bootpages list.
 */
static void
protect_boot_range(uintptr_t low, uintptr_t high, int setaside)
{
	uintptr_t va = low;
	size_t len;
	uint_t prot;
	pfn_t pfn;
	page_t *pp;
	pgcnt_t boot_protect_cnt = 0;

	while (kbm_probe(&va, &len, &pfn, &prot) != 0 && va < high) {
		if (va + len >= high)
			panic("0x%lx byte mapping at 0x%p exceeds boot's "
			    "legal range.", len, (void *)va);

		while (len > 0) {
			pp = page_numtopp_alloc(pfn);
			if (pp != NULL) {
				if (setaside == 0)
					panic("Unexpected mapping by boot.  "
					    "addr=%p pfn=%lx\n",
					    (void *)va, pfn);

				pp->p_next = bootpages;
				pp->p_prev = NULL;
				PP_SETBOOTPAGES(pp);
				if (bootpages != NULL) {
					bootpages->p_prev = pp;
				}
				bootpages = pp;
				++boot_protect_cnt;
			}

			++pfn;
			len -= MMU_PAGESIZE;
			va += MMU_PAGESIZE;
		}
	}
	PRM_DEBUG(boot_protect_cnt);
}
static void
layout_kernel_va(void)
{
	PRM_POINT("layout_kernel_va() starting...");
	/*
	 * Establish the final size of the kernel's heap, size of segmap,
	 * segkp, etc.
	 */

#if defined(__amd64)

	kpm_vbase = (caddr_t)segkpm_base;
	if (physmax + 1 < plat_dr_physmax) {
		kpm_size = ROUND_UP_LPAGE(mmu_ptob(plat_dr_physmax));
	} else {
		kpm_size = ROUND_UP_LPAGE(mmu_ptob(physmax + 1));
	}
	if ((uintptr_t)kpm_vbase + kpm_size > (uintptr_t)valloc_base)
		panic("not enough room for kpm!");
	PRM_DEBUG(kpm_size);
	PRM_DEBUG(kpm_vbase);
	/*
	 * By default we create a seg_kp in 64 bit kernels, it's a little
	 * faster to access than embedding it in the heap.
	 */
	segkp_base = (caddr_t)valloc_base + valloc_sz;
	if (!segkp_fromheap) {
		size_t sz = mmu_ptob(segkpsize);

		/*
		 * determine size of segkp
		 */
		if (sz < SEGKPMINSIZE || sz > SEGKPMAXSIZE) {
			sz = SEGKPDEFSIZE;
			cmn_err(CE_WARN, "!Illegal value for segkpsize. "
			    "segkpsize has been reset to %ld pages",
			    mmu_btop(sz));
		}
		sz = MIN(sz, MAX(SEGKPMINSIZE, mmu_ptob(physmem)));

		segkpsize = mmu_btop(ROUND_UP_LPAGE(sz));
	}
	PRM_DEBUG(segkp_base);
	PRM_DEBUG(segkpsize);
	/*
	 * segzio is used for ZFS cached data. It uses a distinct VA
	 * segment (from kernel heap) so that we can easily tell not to
	 * include it in kernel crash dumps on 64 bit kernels. The trick is
	 * to give it lots of VA, but not constrain the kernel heap.
	 * We can use 1.5x physmem for segzio, leaving approximately
	 * another 1.5x physmem for heap.  See also the comment in
	 * startup_memlist().
	 */
	segzio_base = segkp_base + mmu_ptob(segkpsize);
	if (segzio_fromheap) {
		segziosize = 0;
	} else {
		size_t physmem_size = mmu_ptob(physmem);
		size_t size = (segziosize == 0) ?
		    physmem_size * 3 / 2 : mmu_ptob(segziosize);

		if (size < SEGZIOMINSIZE)
			size = SEGZIOMINSIZE;
		segziosize = mmu_btop(ROUND_UP_LPAGE(size));
	}
	PRM_DEBUG(segziosize);
	PRM_DEBUG(segzio_base);
	/*
	 * Put the range of VA for device mappings next, kmdb knows to not
	 * grep in this range of addresses.
	 */
	toxic_addr =
	    ROUND_UP_LPAGE((uintptr_t)segzio_base + mmu_ptob(segziosize));
	PRM_DEBUG(toxic_addr);
	segmap_start = ROUND_UP_LPAGE(toxic_addr + toxic_size);
#else	/* __i386 */
	segmap_start = ROUND_UP_LPAGE(kernelbase);
#endif	/* __i386 */
	PRM_DEBUG(segmap_start);
	/*
	 * Users can change segmapsize through eeprom. If the variable
	 * is tuned through eeprom, there is no upper bound on the
	 * size of segmap.
	 */
	segmapsize = MAX(ROUND_UP_LPAGE(segmapsize), SEGMAPDEFAULT);

#if defined(__i386)
	/*
	 * 32-bit systems don't have segkpm or segkp, so segmap appears at
	 * the bottom of the kernel's address range.  Set aside space for a
	 * small red zone just below the start of segmap.
	 */
	segmap_start += KERNEL_REDZONE_SIZE;
	segmapsize -= KERNEL_REDZONE_SIZE;
#endif

	PRM_DEBUG(segmap_start);
	PRM_DEBUG(segmapsize);
	kernelheap = (caddr_t)ROUND_UP_LPAGE(segmap_start + segmapsize);
	PRM_DEBUG(kernelheap);
	PRM_POINT("layout_kernel_va() done...");
}
/*
 * Finish initializing the VM system, now that we are no longer
 * relying on the boot time memory allocators.
 */
static void
startup_vm(void)
{
	struct segmap_crargs a;

	extern int use_brk_lpg, use_stk_lpg;

	PRM_POINT("startup_vm() starting...");

	/*
	 * Initialize the hat layer.
	 */
	hat_init();

	/*
	 * Do final allocations of HAT data structures that need to
	 * be allocated before quiescing the boot loader.
	 */
	PRM_POINT("Calling hat_kern_alloc()...");
	hat_kern_alloc((caddr_t)segmap_start, segmapsize, ekernelheap);
	PRM_POINT("hat_kern_alloc() done");
	/*
	 * Setup Page Attribute Table
	 */
	pat_sync();

	/*
	 * The next two loops are done in distinct steps in order
	 * to be sure that any page that is doubly mapped (both above
	 * KERNEL_TEXT and below kernelbase) is dealt with correctly.
	 * Note this may never happen, but it might someday.
	 */
	bootpages = NULL;
	PRM_POINT("Protecting boot pages");

	/*
	 * Protect any pages mapped above KERNEL_TEXT that somehow have
	 * page_t's. This can only happen if something weird allocated
	 * in this range (like kadb/kmdb).
	 */
	protect_boot_range(KERNEL_TEXT, (uintptr_t)-1, 0);

	/*
	 * Before we can take over memory allocation/mapping from the boot
	 * loader we must remove from our free page lists any boot allocated
	 * pages that stay mapped until release_bootstrap().
	 */
	protect_boot_range(0, kernelbase, 1);
	/*
	 * Switch to running on regular HAT (not boot_mmu)
	 */
	PRM_POINT("Calling hat_kern_setup()...");
	hat_kern_setup();

	/*
	 * It is no longer safe to call BOP_ALLOC(), so make sure we don't.
	 */
	bop_no_more_mem();

	PRM_POINT("hat_kern_setup() done");

	hat_cpu_online(CPU);

	/*
	 * Initialize VM system
	 */
	PRM_POINT("Calling kvm_init()...");
	kvm_init();
	PRM_POINT("kvm_init() done");

	/*
	 * Tell kmdb that the VM system is now working
	 */
	if (boothowto & RB_DEBUG)
		kdi_dvec_vmready();
#if defined(__xpv)
	/*
	 * Populate the I/O pool on domain 0
	 */
	if (DOMAIN_IS_INITDOMAIN(xen_info)) {
		extern long populate_io_pool(void);
		long init_io_pool_cnt;

		PRM_POINT("Populating reserve I/O page pool");
		init_io_pool_cnt = populate_io_pool();
		PRM_DEBUG(init_io_pool_cnt);
	}
#endif
	/*
	 * Mangle the brand string etc.
	 */
	cpuid_pass3(CPU);
#if defined(__amd64)

	/*
	 * Create the device arena for toxic (to dtrace/kmdb) mappings.
	 */
	device_arena = vmem_create("device", (void *)toxic_addr,
	    toxic_size, MMU_PAGESIZE, NULL, NULL, NULL, 0, VM_SLEEP);

#else	/* __i386 */

	/*
	 * allocate the bit map that tracks toxic pages
	 */
	toxic_bit_map_len = btop((ulong_t)(valloc_base - kernelbase));
	PRM_DEBUG(toxic_bit_map_len);
	toxic_bit_map =
	    kmem_zalloc(BT_SIZEOFMAP(toxic_bit_map_len), KM_NOSLEEP);
	ASSERT(toxic_bit_map != NULL);
	PRM_DEBUG(toxic_bit_map);

#endif	/* __i386 */
	/*
	 * Now that we've got more VA, as well as the ability to allocate from
	 * it, tell the debugger.
	 */
	if (boothowto & RB_DEBUG)
		kdi_dvec_memavail();
	/*
	 * The following code installs a special page fault handler (#pf)
	 * to work around a pentium bug.
	 */
#if !defined(__amd64) && !defined(__xpv)
	if (x86_type == X86_TYPE_P5) {
		desctbr_t idtr;
		gate_desc_t *newidt;

		if ((newidt = kmem_zalloc(MMU_PAGESIZE, KM_NOSLEEP)) == NULL)
			panic("failed to install pentium_pftrap");

		bcopy(idt0, newidt, NIDT * sizeof (*idt0));
		set_gatesegd(&newidt[T_PGFLT], &pentium_pftrap,
		    KCS_SEL, SDT_SYSIGT, TRP_KPL, 0);

		(void) as_setprot(&kas, (caddr_t)newidt, MMU_PAGESIZE,
		    PROT_READ | PROT_EXEC);

		CPU->cpu_idt = newidt;
		idtr.dtr_base = (uintptr_t)CPU->cpu_idt;
		idtr.dtr_limit = (NIDT * sizeof (*idt0)) - 1;
		wr_idtr(&idtr);
	}
#endif	/* !__amd64 */
#if !defined(__xpv)
	/*
	 * Map page pfn=0 for drivers, such as kd, that need to pick up
	 * parameters left there by controllers/BIOS.
	 */
	PRM_POINT("setting up p0_va");
	p0_va = i86devmap(0, 1, PROT_READ);
	PRM_DEBUG(p0_va);
#endif

	cmn_err(CE_CONT, "?mem = %luK (0x%lx)\n",
	    physinstalled << (MMU_PAGESHIFT - 10), ptob(physinstalled));
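	/*
	 * physinstalled is a page count, so shifting it left by
	 * (MMU_PAGESHIFT - 10) converts pages to kilobytes (a shift of 2
	 * for 4K pages).
	 */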
	/*
	 * disable automatic large pages for small memory systems or
	 * when the disable flag is set.
	 *
	 * Do not yet consider page sizes larger than 2m/4m.
	 */
	if (!auto_lpg_disable && mmu.max_page_level > 0) {
		max_uheap_lpsize = LEVEL_SIZE(1);
		max_ustack_lpsize = LEVEL_SIZE(1);
		max_privmap_lpsize = LEVEL_SIZE(1);
		max_uidata_lpsize = LEVEL_SIZE(1);
		max_utext_lpsize = LEVEL_SIZE(1);
		max_shm_lpsize = LEVEL_SIZE(1);
	}
	if (physmem < privm_lpg_min_physmem || mmu.max_page_level == 0 ||
	    auto_lpg_disable) {
		use_brk_lpg = 0;
		use_stk_lpg = 0;
	}

	mcntl0_lpsize = LEVEL_SIZE(mmu.umax_page_level);
	PRM_POINT("Calling hat_init_finish()...");
	hat_init_finish();
	PRM_POINT("hat_init_finish() done");

	/*
	 * Initialize the segkp segment type.
	 */
	rw_enter(&kas.a_lock, RW_WRITER);
	PRM_POINT("Attaching segkp");
	if (segkp_fromheap) {
		segkp->s_as = &kas;
	} else if (seg_attach(&kas, (caddr_t)segkp_base, mmu_ptob(segkpsize),
	    segkp) < 0) {
		panic("startup: cannot attach segkp");
		/*NOTREACHED*/
	}
	PRM_POINT("Doing segkp_create()");
	if (segkp_create(segkp) != 0) {
		panic("startup: segkp_create failed");
		/*NOTREACHED*/
	}
	PRM_DEBUG(segkp);
	rw_exit(&kas.a_lock);
	/*
	 * Now create segmap segment.
	 */
	rw_enter(&kas.a_lock, RW_WRITER);
	if (seg_attach(&kas, (caddr_t)segmap_start, segmapsize, segmap) < 0) {
		panic("cannot attach segmap");
		/*NOTREACHED*/
	}
	PRM_DEBUG(segmap);

	a.prot = PROT_READ | PROT_WRITE;
	a.shmsize = 0;
	a.nfreelist = segmapfreelists;

	if (segmap_create(segmap, (caddr_t)&a) != 0)
		panic("segmap_create segmap");
	rw_exit(&kas.a_lock);

	setup_vaddr_for_ppcopy(CPU);

	segdev_init();
#if defined(__xpv)
	if (DOMAIN_IS_INITDOMAIN(xen_info))
#endif
		pmem_init();

	PRM_POINT("startup_vm() done");
}
/*
 * Load a tod module for the non-standard tod part found on this system.
 */
static void
load_tod_module(char *todmod)
{
	if (modload("tod", todmod) == -1)
		halt("Can't load TOD module");
}
static void
startup_end(void)
{
	int i;
	extern void setx86isalist(void);
	extern void cpu_event_init(void);

	PRM_POINT("startup_end() starting...");

	/*
	 * Perform tasks that get done after most of the VM
	 * initialization has been done but before the clock
	 * and other devices get started.
	 */
	kern_setup1();

	/*
	 * Perform CPC initialization for this CPU.
	 */
	kcpc_hw_init(CPU);

	/*
	 * Initialize cpu event framework.
	 */
	cpu_event_init();

#if defined(OPTERON_WORKAROUND_6323525)
	if (opteron_workaround_6323525)
		patch_workaround_6323525();
#endif

	/*
	 * If needed, load TOD module now so that ddi_get_time(9F) etc. work
	 * (For now, "needed" is defined as set tod_module_name in /etc/system)
	 */
	if (tod_module_name != NULL) {
		PRM_POINT("load_tod_module()");
		load_tod_module(tod_module_name);
	}

#if defined(__xpv)
	/*
	 * Forceload interposing TOD module for the hypervisor.
	 */
	PRM_POINT("load_tod_module()");
	load_tod_module("xpvtod");
#endif

	/*
	 * Configure the system.
	 */
	PRM_POINT("Calling configure()...");
	configure();		/* set up devices */
	PRM_POINT("configure() done");

	/*
	 * We can now setup for XSAVE because fpu_probe is done in configure().
	 */
	if (fp_save_mech == FP_XSAVE) {
		xsave_setup_msr(CPU);
	}

	/*
	 * Set the isa_list string to the defined instruction sets we
	 * support.
	 */
	setx86isalist();
	cpu_intr_alloc(CPU, NINTR_THREADS);

	/*
	 * We're done with bootops.  We don't unmap the bootstrap yet because
	 * we're still using bootsvcs.
	 */
	PRM_POINT("NULLing out bootops");
	*bootopsp = (struct bootops *)NULL;
	bootops = (struct bootops *)NULL;

#if defined(__xpv)
	ec_init_debug_irq();
#endif

#if defined(__amd64) && !defined(__xpv)
	/*
	 * Intel IOMMU has been setup/initialized in ddi_impl.c
	 * Start it up now.
	 */
	immu_startup();
#endif

	PRM_POINT("Enabling interrupts");
	(*picinitf)();
	sti();
#if defined(__xpv)
	ASSERT(CPU->cpu_m.mcpu_vcpu_info->evtchn_upcall_mask == 0);
	xen_late_startup();
#endif

	(void) add_avsoftintr((void *)&softlevel1_hdl, 1, softlevel1,
	    "softlevel1", NULL, NULL); /* XXX to be moved later */
	/*
	 * Register software interrupt handlers for ddi_periodic_add(9F).
	 * Software interrupts up to the level 10 are supported.
	 */
	for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
		(void) add_avsoftintr((void *)&softlevel_hdl[i - 1], i,
		    (avfunc)ddi_periodic_softintr, "ddi_periodic",
		    (caddr_t)(uintptr_t)i, NULL);
	}
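	/*
	 * Note the off-by-one indexing above: softlevel_hdl[0] backs
	 * IPL 1 and softlevel_hdl[9] backs IPL 10, so the handle table
	 * needs exactly DDI_IPL_10 entries.
	 */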
#if !defined(__xpv)
	if (modload("drv", "amd_iommu") < 0) {
		PRM_POINT("No AMD IOMMU present\n");
	} else if (ddi_hold_installed_driver(ddi_name_to_major(
	    "amd_iommu")) == NULL) {
		prom_printf("ERROR: failed to attach AMD IOMMU\n");
	}
#endif
	post_startup_cpu_fixups();

	PRM_POINT("startup_end() done");
}
/*
 * Don't remove the following 2 variables.  They are necessary
 * for reading the hostid from the legacy file (/kernel/misc/sysinit).
 */
char *_hs1107 = hw_serial;
ulong_t _bdhs34;
void
post_startup(void)
{
	extern void cpupm_init(cpu_t *);
	extern void cpu_event_init_cpu(cpu_t *);

	/*
	 * Set the system wide, processor-specific flags to be passed
	 * to userland via the aux vector for performance hints and
	 * instruction set extensions.
	 */
	bind_hwcap();

#ifdef __xpv
	if (DOMAIN_IS_INITDOMAIN(xen_info))
#endif
	{
		/*
		 * Load the System Management BIOS into the global ksmbios
		 * handle, if an SMBIOS is present on this system.
		 */
		ksmbios = smbios_open(NULL, SMB_VERSION, ksmbios_flags, NULL);

#if defined(__xpv)
		xpv_panic_init();
#else
		/*
		 * Startup the memory scrubber.
		 * XXPV	This should be running somewhere ..
		 */
		if ((get_hwenv() & HW_VIRTUAL) == 0)
			memscrub_init();
#endif
	}

	/*
	 * Complete CPU module initialization
	 */
	cmi_post_startup();

	/*
	 * Perform forceloading tasks for /etc/system.
	 */
	(void) mod_sysctl(SYS_FORCELOAD, NULL);

	/*
	 * ON4.0: Force /proc module in until clock interrupt handle fixed
	 * ON4.0: This must be fixed or restated in /etc/systems.
	 */
	(void) modload("fs", "procfs");

	(void) i_ddi_attach_hw_nodes("pit_beep");

#if defined(__i386)
	/*
	 * Check for required functional Floating Point hardware,
	 * unless FP hardware explicitly disabled.
	 */
	if (fpu_exists && (fpu_pentium_fdivbug || fp_kind == FP_NO))
		halt("No working FP hardware found");
#endif

	cpu_event_init_cpu(CPU);
	cpupm_init(CPU);
	(void) mach_cpu_create_device_node(CPU, NULL);

	pg_init();
}
static int
pp_in_range(page_t *pp, uint64_t low_addr, uint64_t high_addr)
{
	return ((pp->p_pagenum >= btop(low_addr)) &&
	    (pp->p_pagenum < btopr(high_addr)));
}
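/*
 * Note the asymmetric rounding above: btop() rounds the low address
 * down to a page boundary while btopr() rounds the high address up,
 * so a range with unaligned endpoints still covers every page it
 * touches.
 */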
static int
pp_in_module(page_t *pp, const rd_existing_t *modranges)
{
	uint_t i;

	for (i = 0; modranges[i].phys != 0; i++) {
		if (pp_in_range(pp, modranges[i].phys,
		    modranges[i].phys + modranges[i].size))
			return (1);
	}

	return (0);
}
void
release_bootstrap(void)
{
	int root_is_ramdisk;
	page_t *pp;
	extern void kobj_boot_unmountroot(void);
	extern dev_t rootdev;
	uint_t i;
	char propname[32];
	rd_existing_t *modranges;
#if !defined(__xpv)
	pfn_t	pfn;
#endif

	/*
	 * Save the bootfs module ranges so that we can reserve them below
	 * for the real bootfs.
	 */
	modranges = kmem_alloc(sizeof (rd_existing_t) * MAX_BOOT_MODULES,
	    KM_SLEEP);
	for (i = 0; ; i++) {
		uint64_t start, size;

		modranges[i].phys = 0;

		(void) snprintf(propname, sizeof (propname),
		    "module-addr-%u", i);
		if (do_bsys_getproplen(NULL, propname) <= 0)
			break;
		(void) do_bsys_getprop(NULL, propname, &start);

		(void) snprintf(propname, sizeof (propname),
		    "module-size-%u", i);
		if (do_bsys_getproplen(NULL, propname) <= 0)
			break;
		(void) do_bsys_getprop(NULL, propname, &size);

		modranges[i].phys = start;
		modranges[i].size = size;
	}
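	/*
	 * modranges is consumed as a zero-terminated array: the
	 * modranges[i].phys = 0 store at the top of each iteration
	 * guarantees a sentinel entry, which is what pp_in_module()
	 * walks until it sees phys == 0.
	 */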
	/* unmount boot ramdisk and release kmem usage */
	kobj_boot_unmountroot();

	/*
	 * We're finished using the boot loader so free its pages.
	 */
	PRM_POINT("Unmapping lower boot pages");

	clear_boot_mappings(0, _userlimit);

	postbootkernelbase = kernelbase;

	/*
	 * If root isn't on ramdisk, destroy the hardcoded
	 * ramdisk node now and release the memory. Else,
	 * ramdisk memory is kept in rd_pages.
	 */
	root_is_ramdisk = (getmajor(rootdev) == ddi_name_to_major("ramdisk"));
	if (!root_is_ramdisk) {
		dev_info_t *dip = ddi_find_devinfo("ramdisk", -1, 0);
		ASSERT(dip && ddi_get_parent(dip) == ddi_root_node());
		ndi_rele_devi(dip);	/* held from ddi_find_devinfo */
		(void) ddi_remove_child(dip, 0);
	}

	PRM_POINT("Releasing boot pages");
	while (bootpages) {
		extern uint64_t ramdisk_start, ramdisk_end;
		pp = bootpages;
		bootpages = pp->p_next;
		/* Keep pages for the lower 64K */
		if (pp_in_range(pp, 0, 0x40000)) {
			pp->p_next = lower_pages;
			lower_pages = pp;
			lower_pages_count++;
			continue;
		}

		if ((root_is_ramdisk && pp_in_range(pp, ramdisk_start,
		    ramdisk_end)) || pp_in_module(pp, modranges)) {
			pp->p_next = rd_pages;
			rd_pages = pp;
			continue;
		}
		pp->p_next = (struct page *)0;
		pp->p_prev = (struct page *)0;
		PP_CLRBOOTPAGES(pp);
		page_free(pp, 1);
	}
	PRM_POINT("Boot pages released");
	kmem_free(modranges, sizeof (rd_existing_t) * MAX_BOOT_MODULES);
#if !defined(__xpv)
/* XXPV -- note this following bunch of code needs to be revisited in Xen 3.0 */

	/*
	 * Find 1 page below 1 MB so that other processors can boot up or
	 * so that any processor can resume.
	 * Make sure it has a kernel VA as well as a 1:1 mapping.
	 * We should have just free'd one up.
	 */

	/*
	 * 0x10 pages is 64K.  Leave the bottom 64K alone
	 * for BIOS.
	 */
	for (pfn = 0x10; pfn < btop(1*1024*1024); pfn++) {
		if (page_numtopp_alloc(pfn) == NULL)
			continue;
		rm_platter_va = i86devmap(pfn, 1,
		    PROT_READ | PROT_WRITE | PROT_EXEC);
		rm_platter_pa = ptob(pfn);
		break;
	}
	if (pfn == btop(1*1024*1024) && use_mp)
		panic("No page below 1M available for starting "
		    "other processors or for resuming from system-suspend");
#endif	/* !__xpv */
}
/*
 * Initialize the platform-specific parts of a page_t.
 */
void
add_physmem_cb(page_t *pp, pfn_t pnum)
{
	pp->p_pagenum = pnum;
	pp->p_mapping = NULL;
	pp->p_embed = 0;
	pp->p_share = 0;
	pp->p_mlentry = 0;
}
/*
 * kphysm_init() initializes physical memory.
 */
static pgcnt_t
kphysm_init(page_t *pp, pgcnt_t npages)
{
	struct memlist	*pmem;
	struct memseg	*cur_memseg;
	pfn_t		base_pfn;
	pfn_t		end_pfn;
	pgcnt_t		num;
	pgcnt_t		pages_done = 0;
	uint64_t	addr;
	uint64_t	size;
	extern pfn_t	ddiphysmin;
	extern int	mnode_xwa;
	int		ms = 0, me = 0;

	ASSERT(page_hash != NULL && page_hashsz != 0);

	cur_memseg = memseg_base;
	for (pmem = phys_avail; pmem && npages; pmem = pmem->ml_next) {
		/*
		 * In a 32 bit kernel can't use higher memory if we're
		 * not booting in PAE mode. This check takes care of that.
		 */
		addr = pmem->ml_address;
		size = pmem->ml_size;
		if (btop(addr) > physmax)
			continue;
		/*
		 * align addr and size - they may not be at page boundaries
		 */
		if ((addr & MMU_PAGEOFFSET) != 0) {
			addr += MMU_PAGEOFFSET;
			addr &= ~(uint64_t)MMU_PAGEOFFSET;
			size -= addr - pmem->ml_address;
		}

		/* only process pages below or equal to physmax */
		if ((btop(addr + size) - 1) > physmax)
			size = ptob(physmax - btop(addr) + 1);
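		/*
		 * Example of the round-up above: with 4K pages, an
		 * unaligned addr of 0x12345 becomes (0x12345 + 0xfff) &
		 * ~0xfff = 0x13000, and size shrinks by the 0xcbb bytes
		 * that were skipped.
		 */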
		num = btop(size);
		if (num == 0)
			continue;

		if (num > npages)
			num = npages;

		npages -= num;
		pages_done += num;
		base_pfn = btop(addr);

		if (prom_debug)
			prom_printf("MEMSEG addr=0x%" PRIx64
			    " pgs=0x%lx pfn 0x%lx-0x%lx\n",
			    addr, num, base_pfn, base_pfn + num);

		/*
		 * Ignore pages below ddiphysmin to simplify ddi memory
		 * allocation with non-zero addr_lo requests.
		 */
		if (base_pfn < ddiphysmin) {
			if (base_pfn + num <= ddiphysmin)
				continue;
			pp += (ddiphysmin - base_pfn);
			num -= (ddiphysmin - base_pfn);
			base_pfn = ddiphysmin;
		}

		/*
		 * mnode_xwa is greater than 1 when large pages regions can
		 * cross memory node boundaries. To prevent the formation
		 * of these large pages, configure the memsegs based on the
		 * memory node ranges which had been made non-contiguous.
		 */
		if (mnode_xwa > 1) {
			end_pfn = base_pfn + num - 1;
			ms = PFN_2_MEM_NODE(base_pfn);
			me = PFN_2_MEM_NODE(end_pfn);

			if (ms != me) {
				/*
				 * current range spans more than 1 memory node.
				 * Set num to only the pfn range in the start
				 * memory node.
				 */
				num = mem_node_config[ms].physmax - base_pfn
				    + 1;
				ASSERT(end_pfn >
				    mem_node_config[ms].physmax);
			}
		}
		for (;;) {
			/*
			 * Build the memsegs entry
			 */
			cur_memseg->pages = pp;
			cur_memseg->epages = pp + num;
			cur_memseg->pages_base = base_pfn;
			cur_memseg->pages_end = base_pfn + num;

			/*
			 * Insert into memseg list in decreasing pfn range
			 * order. Low memory is typically more fragmented such
			 * that this ordering keeps the larger ranges at the
			 * front of the list for code that searches memseg.
			 * This ASSERTS that the memsegs coming in from boot
			 * are in increasing physical address order and not
			 * contiguous.
			 */
			if (memsegs != NULL) {
				ASSERT(cur_memseg->pages_base >=
				    memsegs->pages_end);
				cur_memseg->next = memsegs;
			}
			memsegs = cur_memseg;

			/*
			 * add_physmem() initializes the PSM part of the page
			 * struct by calling the PSM back with add_physmem_cb().
			 * In addition it coalesces pages into larger pages as
			 * it initializes them.
			 */
			add_physmem(pp, num, base_pfn);
			cur_memseg++;
			availrmem_initial += num;
			availrmem += num;

			pp += num;
			if (ms >= me)
				break;

			/* process next memory node range */
			ms++;
			base_pfn = mem_node_config[ms].physbase;
			num = MIN(mem_node_config[ms].physmax,
			    end_pfn) - base_pfn + 1;
		}
	}

	PRM_DEBUG(availrmem_initial);
	PRM_DEBUG(availrmem);

	return (pages_done);
}
/*
 * Kernel VM initialization.
 */
static void
kvm_init(void)
{
	ASSERT((((uintptr_t)s_text) & MMU_PAGEOFFSET) == 0);

	/*
	 * Put the kernel segments in kernel address space.
	 */
	rw_enter(&kas.a_lock, RW_WRITER);

	(void) seg_attach(&kas, s_text, e_moddata - s_text, &ktextseg);
	(void) segkmem_create(&ktextseg);

	(void) seg_attach(&kas, (caddr_t)valloc_base, valloc_sz, &kvalloc);
	(void) segkmem_create(&kvalloc);

	(void) seg_attach(&kas, kernelheap,
	    ekernelheap - kernelheap, &kvseg);
	(void) segkmem_create(&kvseg);

	if (core_size > 0) {
		PRM_POINT("attaching kvseg_core");
		(void) seg_attach(&kas, (caddr_t)core_base, core_size,
		    &kvseg_core);
		(void) segkmem_create(&kvseg_core);
	}

	if (segziosize > 0) {
		PRM_POINT("attaching segzio");
		(void) seg_attach(&kas, segzio_base, mmu_ptob(segziosize),
		    &kzioseg);
		(void) segkmem_zio_create(&kzioseg);

		/* create zio area covering new segment */
		segkmem_zio_init(segzio_base, mmu_ptob(segziosize));
	}

	(void) seg_attach(&kas, kdi_segdebugbase, kdi_segdebugsize, &kdebugseg);
	(void) segkmem_create(&kdebugseg);

	rw_exit(&kas.a_lock);
	/*
	 * Ensure that the red zone at kernelbase is never accessible.
	 */
	PRM_POINT("protecting redzone");
	(void) as_setprot(&kas, (caddr_t)kernelbase, KERNEL_REDZONE_SIZE, 0);

	/*
	 * Make the text writable so that it can be hot patched by DTrace.
	 */
	(void) as_setprot(&kas, s_text, e_modtext - s_text,
	    PROT_READ | PROT_WRITE | PROT_EXEC);

	/*
	 * Make data writable until end.
	 */
	(void) as_setprot(&kas, s_data, e_moddata - s_data,
	    PROT_READ | PROT_WRITE | PROT_EXEC);
}
#if !defined(__xpv)
/*
 * Solaris adds an entry for Write Combining caching to the PAT
 */
static uint64_t pat_attr_reg = PAT_DEFAULT_ATTRIBUTE;

void
pat_sync(void)
{
	ulong_t	cr0, cr0_orig, cr4;

	if (!is_x86_feature(x86_featureset, X86FSET_PAT))
		return;
	cr0_orig = cr0 = getcr0();
	cr4 = getcr4();

	/* disable caching and flush all caches and TLBs */
	cr0 |= CR0_CD;
	cr0 &= ~CR0_NW;
	setcr0(cr0);
	invalidate_cache();
	if (cr4 & CR4_PGE) {
		setcr4(cr4 & ~(ulong_t)CR4_PGE);
		setcr4(cr4);
	} else {
		reload_cr3();
	}

	/* add our entry to the PAT */
	wrmsr(REG_PAT, pat_attr_reg);

	/* flush TLBs and cache again, then reenable cr0 caching */
	if (cr4 & CR4_PGE) {
		setcr4(cr4 & ~(ulong_t)CR4_PGE);
		setcr4(cr4);
	} else {
		reload_cr3();
	}
	invalidate_cache();
	setcr0(cr0_orig);
}
#endif /* !__xpv */
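/*
 * The dance above follows the PAT-update recipe from the Intel SDM:
 * set CR0.CD (and clear CR0.NW) to disable caching, flush the caches,
 * flush the TLBs (toggling CR4.PGE flushes even global entries),
 * program the IA32_PAT MSR, then flush and restore caching again so
 * no stale cache or TLB attribute survives the change.
 */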
#if defined(_SOFT_HOSTID)
/*
 * On platforms that do not have a hardware serial number, attempt
 * to set one based on the contents of /etc/hostid.  If this file does
 * not exist, assume that we are to generate a new hostid and set
 * it in the kernel, for subsequent saving by a userland process
 * once the system is up and the root filesystem is mounted r/w.
 *
 * In order to gracefully support upgrade on OpenSolaris, if
 * /etc/hostid does not exist, we will attempt to get a serial number
 * using the legacy method (/kernel/misc/sysinit).
 *
 * If that isn't present, we attempt to use an SMBIOS UUID, which is
 * a hardware serial number.  Note that we don't automatically trust
 * all SMBIOS UUIDs (some older platforms are defective and ship duplicate
 * UUIDs in violation of the standard), so we check against a blacklist.
 *
 * In an attempt to make the hostid less prone to abuse
 * (for license circumvention, etc), we store it in /etc/hostid
 * in rot47 format.
 */
extern volatile unsigned long tenmicrodata;
static int atoi(char *);

/*
 * Set this to non-zero in /etc/system if you think your SMBIOS returns a
 * UUID that is not unique. (Also report it so that the smbios_uuid_blacklist
 * array can be updated.)
 */
int smbios_broken_uuid = 0;
/*
 * List of known bad UUIDs.  This is just the lower 32-bit values, since
 * that's what we use for the host id.  If your hostid falls here, you need
 * to contact your hardware OEM for a fix for your BIOS.
 */
static unsigned char
smbios_uuid_blacklist[][16] = {

	{	/* Reported bad UUID (Google search) */
		0x00, 0x02, 0x00, 0x03, 0x00, 0x04, 0x00, 0x05,
		0x00, 0x06, 0x00, 0x07, 0x00, 0x08, 0x00, 0x09,
	},
	{	/* Known bad DELL UUID */
		0x4C, 0x4C, 0x45, 0x44, 0x00, 0x00, 0x20, 0x10,
		0x80, 0x20, 0x80, 0xC0, 0x4F, 0x20, 0x20, 0x20,
	},
	{	/* Uninitialized flash */
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	},
	{	/* All zeros */
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
	},
};
static int32_t
uuid_to_hostid(const uint8_t *uuid)
{
	/*
	 * Although the UUIDs are 128-bits, they may not distribute entropy
	 * evenly.  We would like to use SHA or MD5, but those are located
	 * in loadable modules and not available this early in boot.  As we
	 * don't need the values to be cryptographically strong, we just
	 * generate a 32-bit value by xor'ing the various sequences together,
	 * which ensures that the entire UUID contributes to the hostid.
	 */
	uint32_t	id = 0;

	/* first check against the blacklist */
	for (int i = 0; i < (sizeof (smbios_uuid_blacklist) / 16); i++) {
		if (bcmp(smbios_uuid_blacklist[i], uuid, 16) == 0) {
			cmn_err(CE_CONT, "?Broken SMBIOS UUID. "
			    "Contact BIOS manufacturer for repair.\n");
			return ((int32_t)HW_INVALID_HOSTID);
		}
	}

	for (int i = 0; i < 16; i++)
		id ^= ((uuid[i]) << (8 * (i % sizeof (id))));

	/* Make sure return value is positive */
	return (id & 0x7fffffff);
}
static int32_t
set_soft_hostid(void)
{
	struct _buf *file;
	char tokbuf[MAXNAMELEN];
	token_t token;
	int done = 0;
	u_longlong_t tmp;
	int i;
	int32_t hostid = (int32_t)HW_INVALID_HOSTID;
	unsigned char *c;
	hrtime_t tsc;
	smbios_system_t smsys;

	/*
	 * If the /etc/hostid file is not found, we'd like to get a
	 * pseudo-random number to use as the hostid.  A nice way to do
	 * this is to read the real time clock.  To remain xen-compatible,
	 * we can't poke the real hardware, so we use tsc_read() to
	 * read the real time clock.  However, there is an ominous
	 * warning in tsc_read that says it can return zero, so we
	 * deal with that possibility by falling back to using the
	 * (hopefully random enough) value in tenmicrodata.
	 */
	if ((file = kobj_open_file(hostid_file)) == (struct _buf *)-1) {
		/*
		 * hostid file not found - try to load sysinit module
		 * and see if it has a nonzero hostid value...use that
		 * instead of generating a new hostid here if so.
		 */
		if ((i = modload("misc", "sysinit")) != -1) {
			if (strlen(hw_serial) > 0)
				hostid = (int32_t)atoi(hw_serial);
			(void) modunload(i);
		}

		/*
		 * We try to use the SMBIOS UUID. But not if it is blacklisted
		 * in /etc/system.
		 */
		if ((hostid == HW_INVALID_HOSTID) &&
		    (smbios_broken_uuid == 0) &&
		    (ksmbios != NULL) &&
		    (smbios_info_system(ksmbios, &smsys) != SMB_ERR) &&
		    (smsys.smbs_uuidlen >= 16)) {
			hostid = uuid_to_hostid(smsys.smbs_uuid);
		}

		/*
		 * Generate a "random" hostid using the clock.  These
		 * hostids will change on each boot if the value is not
		 * saved to a persistent /etc/hostid file.
		 */
		if (hostid == HW_INVALID_HOSTID) {
			tsc = tsc_read();
			if (tsc == 0)	/* tsc_read can return zero sometimes */
				hostid = (int32_t)tenmicrodata & 0x0CFFFFF;
			else
				hostid = (int32_t)tsc & 0x0CFFFFF;
		}
	} else {
		/* hostid file found */
		while (!done) {
			token = kobj_lex(file, tokbuf, sizeof (tokbuf));

			switch (token) {
			case POUND:
				/*
				 * skip comments
				 */
				kobj_find_eol(file);
				break;
			case STRING:
				/*
				 * un-rot47 - obviously this
				 * nonsense is ascii-specific
				 */
				for (c = (unsigned char *)tokbuf;
				    *c != '\0'; c++) {
					*c += 47;
					if (*c > '~')
						*c -= 94;
					else if (*c < '!')
						*c += 94;
				}
				/*
				 * now we should have a real number
				 */
				if (kobj_getvalue(tokbuf, &tmp) != 0)
					kobj_file_err(CE_WARN, file,
					    "Bad value %s for hostid",
					    tokbuf);
				else
					hostid = (int32_t)tmp;
				break;
			case EOF:
				done = 1;
				/* FALLTHROUGH */
			case NEWLINE:
				kobj_newline(file);
				break;
			default:
				break;
			}
		}
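		/*
		 * rot47 background: the printable ASCII range '!'..'~'
		 * spans 94 characters, so adding 47 and wrapping is its
		 * own inverse - applying the loop above to "12345"
		 * yields "`abcd" and vice versa.  The same loop thus
		 * both encodes and decodes /etc/hostid tokens.
		 */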
		if (hostid == HW_INVALID_HOSTID) /* didn't find a hostid */
			kobj_file_err(CE_WARN, file,
			    "hostid missing or corrupt");

		kobj_close_file(file);
	}
	/*
	 * hostid is now the value read from /etc/hostid, or the new
	 * hostid we generated in this routine, or HW_INVALID_HOSTID if
	 * not set.
	 */
	return (hostid);
}

static int
atoi(char *p)
{
	int i = 0;

	while (ISDIGIT(*p))
		i = 10 * i + (*p++ - '0');

	return (i);
}

#endif	/* _SOFT_HOSTID */
static void
get_system_configuration(void)
{
	char	prop[32];
	u_longlong_t nodes_ll, cpus_pernode_ll, lvalue;

	if (BOP_GETPROPLEN(bootops, "nodes") > sizeof (prop) ||
	    BOP_GETPROP(bootops, "nodes", prop) < 0 ||
	    kobj_getvalue(prop, &nodes_ll) == -1 ||
	    nodes_ll > MAXNODES ||
	    BOP_GETPROPLEN(bootops, "cpus_pernode") > sizeof (prop) ||
	    BOP_GETPROP(bootops, "cpus_pernode", prop) < 0 ||
	    kobj_getvalue(prop, &cpus_pernode_ll) == -1) {
		system_hardware.hd_nodes = 1;
		system_hardware.hd_cpus_per_node = 0;
	} else {
		system_hardware.hd_nodes = (int)nodes_ll;
		system_hardware.hd_cpus_per_node = (int)cpus_pernode_ll;
	}

	if (BOP_GETPROPLEN(bootops, "kernelbase") > sizeof (prop) ||
	    BOP_GETPROP(bootops, "kernelbase", prop) < 0 ||
	    kobj_getvalue(prop, &lvalue) == -1)
		eprom_kernelbase = NULL;
	else
		eprom_kernelbase = (uintptr_t)lvalue;

	if (BOP_GETPROPLEN(bootops, "segmapsize") > sizeof (prop) ||
	    BOP_GETPROP(bootops, "segmapsize", prop) < 0 ||
	    kobj_getvalue(prop, &lvalue) == -1)
		segmapsize = SEGMAPDEFAULT;
	else
		segmapsize = (uintptr_t)lvalue;

	if (BOP_GETPROPLEN(bootops, "segmapfreelists") > sizeof (prop) ||
	    BOP_GETPROP(bootops, "segmapfreelists", prop) < 0 ||
	    kobj_getvalue(prop, &lvalue) == -1)
		segmapfreelists = 0;	/* use segmap driver default */
	else
		segmapfreelists = (int)lvalue;

	/* physmem used to be here, but moved much earlier to fakebop.c */
}
/*
 * Add to a memory list.
 * start = start of new memory segment
 * len = length of new memory segment in bytes
 * new = pointer to a new struct memlist
 * memlistp = memory list to which to add segment.
 */
void
memlist_add(
	uint64_t start,
	uint64_t len,
	struct memlist *new,
	struct memlist **memlistp)
{
	struct memlist *cur;
	uint64_t end = start + len;

	new->ml_address = start;
	new->ml_size = len;

	cur = *memlistp;

	while (cur) {
		if (cur->ml_address >= end) {
			new->ml_next = cur;
			*memlistp = new;
			new->ml_prev = cur->ml_prev;
			cur->ml_prev = new;
			return;
		}
		ASSERT(cur->ml_address + cur->ml_size <= start);
		if (cur->ml_next == NULL) {
			cur->ml_next = new;
			new->ml_prev = cur;
			new->ml_next = NULL;
			return;
		}
		memlistp = &cur->ml_next;
		cur = cur->ml_next;
	}
}
void
kobj_vmem_init(vmem_t **text_arena, vmem_t **data_arena)
{
	size_t tsize = e_modtext - modtext;
	size_t dsize = e_moddata - moddata;

	*text_arena = vmem_create("module_text", tsize ? modtext : NULL, tsize,
	    1, segkmem_alloc, segkmem_free, heaptext_arena, 0, VM_SLEEP);
	*data_arena = vmem_create("module_data", dsize ? moddata : NULL, dsize,
	    1, segkmem_alloc, segkmem_free, heap32_arena, 0, VM_SLEEP);
}
caddr_t
kobj_text_alloc(vmem_t *arena, size_t size)
{
	return (vmem_alloc(arena, size, VM_SLEEP | VM_BESTFIT));
}

/*ARGSUSED*/
caddr_t
kobj_texthole_alloc(caddr_t addr, size_t size)
{
	panic("unexpected call to kobj_texthole_alloc()");
	/*NOTREACHED*/
	return (0);
}

/*ARGSUSED*/
void
kobj_texthole_free(caddr_t addr, size_t size)
{
	panic("unexpected call to kobj_texthole_free()");
}
/*
 * This is called just after configure() in startup().
 *
 * The ISALIST concept is a bit hopeless on Intel, because
 * there's no guarantee of an ever-more-capable processor
 * given that various parts of the instruction set may appear
 * and disappear between different implementations.
 *
 * While it would be possible to correct it and even enhance
 * it somewhat, the explicit hardware capability bitmask allows
 * more flexibility.
 *
 * So, we just leave this alone.
 */
void
setx86isalist(void)
{
	char *tp;
	size_t len;
	extern char *isa_list;

#define	TBUFSIZE	1024

	tp = kmem_alloc(TBUFSIZE, KM_SLEEP);
	*tp = '\0';

#if defined(__amd64)
	(void) strcpy(tp, "amd64 ");
#endif

	switch (x86_vendor) {
	case X86_VENDOR_Intel:
	case X86_VENDOR_AMD:
		if (is_x86_feature(x86_featureset, X86FSET_CMOV)) {
			/*
			 * Pentium Pro or later
			 */
			(void) strcat(tp, "pentium_pro");
			(void) strcat(tp,
			    is_x86_feature(x86_featureset, X86FSET_MMX) ?
			    "+mmx pentium_pro " : " ");
		}
		/*FALLTHROUGH*/
	case X86_VENDOR_Cyrix:
		/*
		 * The Cyrix 6x86 does not have any Pentium features
		 * accessible while not at privilege level 0.
		 */
		if (is_x86_feature(x86_featureset, X86FSET_CPUID)) {
			(void) strcat(tp, "pentium");
			(void) strcat(tp,
			    is_x86_feature(x86_featureset, X86FSET_MMX) ?
			    "+mmx pentium " : " ");
		}
		break;
	default:
		break;
	}
	(void) strcat(tp, "i486 i386 i86");
	len = strlen(tp) + 1;	/* account for NUL at end of string */
	isa_list = strcpy(kmem_alloc(len, KM_SLEEP), tp);
	kmem_free(tp, TBUFSIZE);

#undef TBUFSIZE
}
#ifdef __amd64

void *
device_arena_alloc(size_t size, int vm_flag)
{
	return (vmem_alloc(device_arena, size, vm_flag));
}

void
device_arena_free(void *vaddr, size_t size)
{
	vmem_free(device_arena, vaddr, size);
}

#else /* __i386 */
void *
device_arena_alloc(size_t size, int vm_flag)
{
	caddr_t	vaddr;
	uintptr_t v;
	size_t	start;
	size_t	end;

	vaddr = vmem_alloc(heap_arena, size, vm_flag);
	if (vaddr == NULL)
		return (NULL);

	v = (uintptr_t)vaddr;
	ASSERT(v >= kernelbase);
	ASSERT(v + size <= valloc_base);

	start = btop(v - kernelbase);
	end = btop(v + size - 1 - kernelbase);
	ASSERT(start < toxic_bit_map_len);
	ASSERT(end < toxic_bit_map_len);

	while (start <= end) {
		BT_ATOMIC_SET(toxic_bit_map, start);
		++start;
	}
	return (vaddr);
}
void
device_arena_free(void *vaddr, size_t size)
{
	uintptr_t v = (uintptr_t)vaddr;
	size_t	start;
	size_t	end;

	ASSERT(v >= kernelbase);
	ASSERT(v + size <= valloc_base);

	start = btop(v - kernelbase);
	end = btop(v + size - 1 - kernelbase);
	ASSERT(start < toxic_bit_map_len);
	ASSERT(end < toxic_bit_map_len);

	while (start <= end) {
		ASSERT(BT_TEST(toxic_bit_map, start) != 0);
		BT_ATOMIC_CLEAR(toxic_bit_map, start);
		++start;
	}
	vmem_free(heap_arena, vaddr, size);
}
/*
 * Returns the first address in the range that is in the device arena,
 * or NULL if none is.  If len is not NULL, it is set to the length of
 * the toxic range found.
 */
void *
device_arena_contains(void *vaddr, size_t size, size_t *len)
{
	uintptr_t v = (uintptr_t)vaddr;
	uintptr_t eaddr = v + size;
	size_t start;
	size_t end;

	/*
	 * if called very early by kmdb, just return NULL
	 */
	if (toxic_bit_map == NULL)
		return (NULL);

	/*
	 * First check if we're completely outside the bitmap range.
	 */
	if (v >= valloc_base || eaddr < kernelbase)
		return (NULL);

	/*
	 * Trim ends of search to look at only what the bitmap covers.
	 */
	if (v < kernelbase)
		v = kernelbase;
	start = btop(v - kernelbase);
	end = btop(eaddr - kernelbase);
	if (end >= toxic_bit_map_len)
		end = toxic_bit_map_len;

	if (bt_range(toxic_bit_map, &start, &end, end) == 0)
		return (NULL);

	v = kernelbase + ptob(start);
	if (len != NULL)
		*len = ptob(end - start);
	return ((void *)v);
}

#endif	/* __i386 */
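/*
 * A note on the toxic bit map used above: on i386 there is no separate
 * device arena, so pages handed out for device mappings are tracked in
 * this bitmap (one bit per page between kernelbase and valloc_base).
 * Consumers such as the debugger can then call device_arena_contains()
 * to avoid touching "toxic" device registers when scanning kernel VA.
 */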