4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
26 * Copyright 2013 Joyent, Inc. All rights reserved.
30 #include <sys/types.h>
31 #include <sys/machparam.h>
32 #include <sys/x86_archext.h>
33 #include <sys/systm.h>
34 #include <sys/mach_mmu.h>
35 #include <sys/multiboot.h>
36 #include <sys/multiboot2.h>
37 #include <sys/multiboot2_impl.h>
38 #include <sys/sysmacros.h>
40 #include <util/string.h>
41 #include <util/strtolctype.h>
45 #include <sys/hypervisor.h>
46 uintptr_t xen_virt_start
;
47 pfn_t
*mfn_to_pfn_mapping
;
51 extern multiboot_header_t mb_header
;
52 extern uint32_t mb2_load_addr
;
53 extern int have_cpuid(void);
57 #include <sys/inttypes.h>
58 #include <sys/bootinfo.h>
59 #include <sys/mach_mmu.h>
60 #include <sys/boot_console.h>
62 #include "dboot_asm.h"
63 #include "dboot_printf.h"
64 #include "dboot_xboot.h"
65 #include "dboot_elfload.h"
67 #define SHA1_ASCII_LENGTH (SHA1_DIGEST_LENGTH * 2)
70 * This file contains code that runs to transition us from either a multiboot
71 * compliant loader (32 bit non-paging) or a XPV domain loader to
72 * regular kernel execution. Its task is to setup the kernel memory image
75 * The code executes as:
76 * - 32 bits under GRUB (for 32 or 64 bit Solaris)
77 * - a 32 bit program for the 32-bit PV hypervisor
78 * - a 64 bit program for the 64-bit PV hypervisor (at least for now)
80 * Under the PV hypervisor, we must create mappings for any memory beyond the
81 * initial start of day allocation (such as the kernel itself).
83 * When on the metal, the mapping between maddr_t and paddr_t is 1:1.
84 * Since we are running in real mode, so all such memory is accessible.
88 * Standard bits used in PTE (page level) and PTP (internal levels)
90 x86pte_t ptp_bits
= PT_VALID
| PT_REF
| PT_WRITABLE
| PT_USER
;
91 x86pte_t pte_bits
= PT_VALID
| PT_REF
| PT_WRITABLE
| PT_MOD
| PT_NOCONSIST
;
94 * This is the target addresses (physical) where the kernel text and data
95 * nucleus pages will be unpacked. On the hypervisor this is actually a
99 uint32_t ksize
= 2 * FOUR_MEG
; /* kernel nucleus is 8Meg */
101 static uint64_t target_kernel_text
; /* value to use for KERNEL_TEXT */
104 * The stack is setup in assembler before entering startup_kernel()
106 char stack_space
[STACK_SIZE
];
109 * Used to track physical memory allocation
111 static paddr_t next_avail_addr
= 0;
115 * Additional information needed for hypervisor memory allocation.
116 * Only memory up to scratch_end is mapped by page tables.
117 * mfn_base is the start of the hypervisor virtual image. It's ONE_GIG, so
118 * to derive a pfn from a pointer, you subtract mfn_base.
121 static paddr_t scratch_end
= 0; /* we can't write all of mem here */
122 static paddr_t mfn_base
; /* addr corresponding to mfn_list[0] */
123 start_info_t
*xen_info
;
128 * If on the metal, then we have a multiboot loader.
130 uint32_t mb_magic
; /* magic from boot loader */
131 uint32_t mb_addr
; /* multiboot info package from loader */
132 int multiboot_version
;
133 multiboot_info_t
*mb_info
;
134 multiboot2_info_header_t
*mb2_info
;
135 multiboot_tag_mmap_t
*mb2_mmap_tagp
;
136 int num_entries
; /* mmap entry count */
137 boolean_t num_entries_set
; /* is mmap entry count set */
143 * This contains information passed to the kernel
145 struct xboot_info boot_info
[2]; /* extra space to fix alignement for amd64 */
146 struct xboot_info
*bi
;
149 * Page table and memory stuff.
151 static paddr_t max_mem
; /* maximum memory address */
154 * Information about processor MMU
156 int amd64_support
= 0;
157 int largepage_support
= 0;
163 * Low 32 bits of kernel entry address passed back to assembler.
164 * When running a 64 bit kernel, the high 32 bits are 0xffffffff.
166 uint32_t entry_addr_low
;
169 * Memlists for the kernel. We shouldn't need a lot of these.
171 #define MAX_MEMLIST (50)
172 struct boot_memlist memlists
[MAX_MEMLIST
];
173 uint_t memlists_used
= 0;
174 struct boot_memlist pcimemlists
[MAX_MEMLIST
];
175 uint_t pcimemlists_used
= 0;
176 struct boot_memlist rsvdmemlists
[MAX_MEMLIST
];
177 uint_t rsvdmemlists_used
= 0;
180 * This should match what's in the bootloader. It's arbitrary, but GRUB
181 * in particular has limitations on how much space it can use before it
182 * stops working properly. This should be enough.
184 struct boot_modules modules
[MAX_BOOT_MODULES
];
185 uint_t modules_used
= 0;
189 * Xen strips the size field out of the mb_memory_map_t, see struct e820entry
190 * definition in Xen source.
193 uint32_t base_addr_low
;
194 uint32_t base_addr_high
;
196 uint32_t length_high
;
201 * There is 512KB of scratch area after the boot stack page.
202 * We'll use that for everything except the kernel nucleus pages which are too
203 * big to fit there and are allocated last anyway.
206 static mmap_t map_buffer
[MAXMAPS
];
208 typedef mb_memory_map_t mmap_t
;
214 uint_t prom_debug
= 0;
215 uint_t map_debug
= 0;
217 static char noname
[2] = "-";
220 * Either hypervisor-specific or grub-specific code builds the initial
221 * memlists. This code does the sort/merge/link for final use.
224 sort_physinstall(void)
229 struct boot_memlist tmp
;
232 * Now sort the memlists, in case they weren't in order.
233 * Yeah, this is a bubble sort; small, simple and easy to get right.
235 DBG_MSG("Sorting phys-installed list\n");
236 for (j
= memlists_used
- 1; j
> 0; --j
) {
237 for (i
= 0; i
< j
; ++i
) {
238 if (memlists
[i
].addr
< memlists
[i
+ 1].addr
)
241 memlists
[i
] = memlists
[i
+ 1];
242 memlists
[i
+ 1] = tmp
;
247 * Merge any memlists that don't have holes between them.
249 for (i
= 0; i
<= memlists_used
- 1; ++i
) {
250 if (memlists
[i
].addr
+ memlists
[i
].size
!= memlists
[i
+ 1].addr
)
255 "merging mem segs %" PRIx64
"...%" PRIx64
256 " w/ %" PRIx64
"...%" PRIx64
"\n",
258 memlists
[i
].addr
+ memlists
[i
].size
,
259 memlists
[i
+ 1].addr
,
260 memlists
[i
+ 1].addr
+ memlists
[i
+ 1].size
);
262 memlists
[i
].size
+= memlists
[i
+ 1].size
;
263 for (j
= i
+ 1; j
< memlists_used
- 1; ++j
)
264 memlists
[j
] = memlists
[j
+ 1];
267 --i
; /* after merging we need to reexamine, so do this */
272 dboot_printf("\nFinal memlists:\n");
273 for (i
= 0; i
< memlists_used
; ++i
) {
274 dboot_printf("\t%d: addr=%" PRIx64
" size=%"
275 PRIx64
"\n", i
, memlists
[i
].addr
, memlists
[i
].size
);
280 * link together the memlists with native size pointers
282 memlists
[0].next
= 0;
283 memlists
[0].prev
= 0;
284 for (i
= 1; i
< memlists_used
; ++i
) {
285 memlists
[i
].prev
= (native_ptr_t
)(uintptr_t)(memlists
+ i
- 1);
286 memlists
[i
].next
= 0;
287 memlists
[i
- 1].next
= (native_ptr_t
)(uintptr_t)(memlists
+ i
);
289 bi
->bi_phys_install
= (native_ptr_t
)(uintptr_t)memlists
;
290 DBG(bi
->bi_phys_install
);
294 * build bios reserved memlists
297 build_rsvdmemlists(void)
301 rsvdmemlists
[0].next
= 0;
302 rsvdmemlists
[0].prev
= 0;
303 for (i
= 1; i
< rsvdmemlists_used
; ++i
) {
304 rsvdmemlists
[i
].prev
=
305 (native_ptr_t
)(uintptr_t)(rsvdmemlists
+ i
- 1);
306 rsvdmemlists
[i
].next
= 0;
307 rsvdmemlists
[i
- 1].next
=
308 (native_ptr_t
)(uintptr_t)(rsvdmemlists
+ i
);
310 bi
->bi_rsvdmem
= (native_ptr_t
)(uintptr_t)rsvdmemlists
;
#if defined(__xpv)

/*
 * halt on the hypervisor after a delay to drain console output
 */
void
dboot_halt(void)
{
	uint_t i = 10000;

	/* spend a little time yielding so pending console output drains */
	while (--i)
		(void) HYPERVISOR_yield();
	(void) HYPERVISOR_shutdown(SHUTDOWN_poweroff);
}

#endif /* __xpv */
#if defined(__xpv)

/*
 * From a machine address, find the corresponding pseudo-physical address.
 * Pseudo-physical address are contiguous and run from mfn_base in each VM.
 * Machine addresses are the real underlying hardware addresses.
 * These are needed for page table entries. Note that this routine is
 * poorly protected. A bad value of "ma" will cause a page fault.
 */
paddr_t
ma_to_pa(maddr_t ma)
{
	ulong_t pgoff = ma & MMU_PAGEOFFSET;
	ulong_t pfn = mfn_to_pfn_mapping[mmu_btop(ma)];
	paddr_t pa;

	/* out-of-range mfn: return an impossible address as a sentinel */
	if (pfn >= xen_info->nr_pages)
		return (-(paddr_t)1);
	pa = mfn_base + mmu_ptob((paddr_t)pfn) + pgoff;

	/* cross-check against the inverse translation */
	if (ma != pa_to_ma(pa))
		dboot_printf("ma_to_pa(%" PRIx64 ") got %" PRIx64 ", "
		    "pa_to_ma() says %" PRIx64 "\n", ma, pa, pa_to_ma(pa));

	return (pa);
}

#endif /* __xpv */
#if defined(__xpv)

/*
 * From a pseudo-physical address, find the corresponding machine address.
 */
maddr_t
pa_to_ma(paddr_t pa)
{
	ulong_t pfn;
	ulong_t mfn;

	pfn = mmu_btop(pa - mfn_base);
	if (pa < mfn_base || pfn >= xen_info->nr_pages)
		dboot_panic("pa_to_ma(): illegal address 0x%lx", (ulong_t)pa);
	mfn = ((ulong_t *)xen_info->mfn_list)[pfn];

	/* cross-check against the inverse mapping */
	if (mfn_to_pfn_mapping[mfn] != pfn)
		dboot_printf("pa_to_ma(pfn=%lx) got %lx ma_to_pa() says %lx\n",
		    pfn, mfn, mfn_to_pfn_mapping[mfn]);

	return (mfn_to_ma(mfn) | (pa & MMU_PAGEOFFSET));
}

#endif /* __xpv */
378 get_pteval(paddr_t table
, uint_t index
)
381 return (((x86pte_t
*)(uintptr_t)table
)[index
]);
382 return (((x86pte32_t
*)(uintptr_t)table
)[index
]);
387 set_pteval(paddr_t table
, uint_t index
, uint_t level
, x86pte_t pteval
)
391 maddr_t mtable
= pa_to_ma(table
);
394 t
.ptr
= (mtable
+ index
* pte_size
) | MMU_NORMAL_PT_UPDATE
;
396 if (HYPERVISOR_mmu_update(&t
, 1, &retcnt
, DOMID_SELF
) || retcnt
!= 1)
397 dboot_panic("HYPERVISOR_mmu_update() failed");
399 uintptr_t tab_addr
= (uintptr_t)table
;
402 ((x86pte_t
*)tab_addr
)[index
] = pteval
;
404 ((x86pte32_t
*)tab_addr
)[index
] = (x86pte32_t
)pteval
;
405 if (level
== top_level
&& level
== 2)
411 make_ptable(x86pte_t
*pteval
, uint_t level
)
413 paddr_t new_table
= (paddr_t
)(uintptr_t)mem_alloc(MMU_PAGESIZE
);
415 if (level
== top_level
&& level
== 2)
416 *pteval
= pa_to_ma((uintptr_t)new_table
) | PT_VALID
;
418 *pteval
= pa_to_ma((uintptr_t)new_table
) | ptp_bits
;
421 /* Remove write permission to the new page table. */
422 if (HYPERVISOR_update_va_mapping(new_table
,
423 *pteval
& ~(x86pte_t
)PT_WRITABLE
, UVMF_INVLPG
| UVMF_LOCAL
))
424 dboot_panic("HYP_update_va_mapping error");
428 dboot_printf("new page table lvl=%d paddr=0x%lx ptp=0x%"
429 PRIx64
"\n", level
, (ulong_t
)new_table
, *pteval
);
434 map_pte(paddr_t table
, uint_t index
)
436 return ((x86pte_t
*)(uintptr_t)(table
+ index
* pte_size
));
440 * dump out the contents of page tables...
445 uint_t save_index
[4]; /* for recursion */
446 char *save_table
[4]; /* for recursion */
454 static char *tablist
= "\t\t\t";
455 char *tabs
= tablist
+ 3 - top_level
;
458 #define maddr_t paddr_t
461 dboot_printf("Finished pagetables:\n");
462 table
= (char *)(uintptr_t)top_page_table
;
465 for (index
= 0; index
< ptes_per_table
; ++index
) {
466 pgsize
= 1ull << shift_amt
[l
];
468 pteval
= ((x86pte_t
*)table
)[index
];
470 pteval
= ((x86pte32_t
*)table
)[index
];
474 dboot_printf("%s %p[0x%x] = %" PRIx64
", va=%" PRIx64
,
475 tabs
+ l
, (void *)table
, index
, (uint64_t)pteval
, va
);
476 pa
= ma_to_pa(pteval
& MMU_PAGEMASK
);
477 dboot_printf(" physaddr=%x\n", pa
);
480 * Don't try to walk hypervisor private pagetables
482 if ((l
> 1 || (l
== 1 && (pteval
& PT_PAGESIZE
) == 0))) {
483 save_table
[l
] = table
;
484 save_index
[l
] = index
;
487 table
= (char *)(uintptr_t)
488 ma_to_pa(pteval
& MMU_PAGEMASK
);
493 * shorten dump for consecutive mappings
495 for (i
= 1; index
+ i
< ptes_per_table
; ++i
) {
497 pteval
= ((x86pte_t
*)table
)[index
+ i
];
499 pteval
= ((x86pte32_t
*)table
)[index
+ i
];
502 pa1
= ma_to_pa(pteval
& MMU_PAGEMASK
);
503 if (pa1
!= pa
+ i
* pgsize
)
507 dboot_printf("%s...\n", tabs
+ l
);
508 va
+= pgsize
* (i
- 2);
513 if (l
== 3 && index
== 256) /* VA hole */
514 va
= 0xffff800000000000ull
;
520 index
= save_index
[l
];
521 table
= save_table
[l
];
527 * Add a mapping for the machine page at the given virtual address.
530 map_ma_at_va(maddr_t ma
, native_ptr_t va
, uint_t level
)
535 pteval
= ma
| pte_bits
;
537 pteval
|= PT_PAGESIZE
;
538 if (va
>= target_kernel_text
&& pge_support
)
541 if (map_debug
&& ma
!= va
)
542 dboot_printf("mapping ma=0x%" PRIx64
" va=0x%" PRIx64
543 " pte=0x%" PRIx64
" l=%d\n",
544 (uint64_t)ma
, (uint64_t)va
, pteval
, level
);
548 * see if we can avoid find_pte() on the hypervisor
550 if (HYPERVISOR_update_va_mapping(va
, pteval
,
551 UVMF_INVLPG
| UVMF_LOCAL
) == 0)
556 * Find the pte that will map this address. This creates any
557 * missing intermediate level page tables
559 ptep
= find_pte(va
, NULL
, level
, 0);
562 * When paravirtualized, we must use hypervisor calls to modify the
563 * PTE, since paging is active. On real hardware we just write to
564 * the pagetables which aren't in use yet.
567 ptep
= ptep
; /* shut lint up */
568 if (HYPERVISOR_update_va_mapping(va
, pteval
, UVMF_INVLPG
| UVMF_LOCAL
))
569 dboot_panic("mmu_update failed-map_pa_at_va va=0x%" PRIx64
570 " l=%d ma=0x%" PRIx64
", pte=0x%" PRIx64
"",
571 (uint64_t)va
, level
, (uint64_t)ma
, pteval
);
573 if (va
< 1024 * 1024)
574 pteval
|= PT_NOCACHE
; /* for video RAM */
578 *((x86pte32_t
*)ptep
) = (x86pte32_t
)pteval
;
583 * Add a mapping for the physical page at the given virtual address.
586 map_pa_at_va(paddr_t pa
, native_ptr_t va
, uint_t level
)
588 map_ma_at_va(pa_to_ma(pa
), va
, level
);
592 * This is called to remove start..end from the
593 * possible range of PCI addresses.
595 const uint64_t pci_lo_limit
= 0x00100000ul
;
596 const uint64_t pci_hi_limit
= 0xfff00000ul
;
598 exclude_from_pci(uint64_t start
, uint64_t end
)
602 struct boot_memlist
*ml
;
604 for (i
= 0; i
< pcimemlists_used
; ++i
) {
605 ml
= &pcimemlists
[i
];
607 /* delete the entire range? */
608 if (start
<= ml
->addr
&& ml
->addr
+ ml
->size
<= end
) {
610 for (j
= i
; j
< pcimemlists_used
; ++j
)
611 pcimemlists
[j
] = pcimemlists
[j
+ 1];
612 --i
; /* to revisit the new one at this index */
616 else if (ml
->addr
< start
&& end
< ml
->addr
+ ml
->size
) {
619 if (pcimemlists_used
> MAX_MEMLIST
)
620 dboot_panic("too many pcimemlists");
622 for (j
= pcimemlists_used
- 1; j
> i
; --j
)
623 pcimemlists
[j
] = pcimemlists
[j
- 1];
624 ml
->size
= start
- ml
->addr
;
627 ml
->size
= (ml
->addr
+ ml
->size
) - end
;
629 ++i
; /* skip on to next one */
632 /* cut memory off the start? */
633 else if (ml
->addr
< end
&& end
< ml
->addr
+ ml
->size
) {
634 ml
->size
-= end
- ml
->addr
;
638 /* cut memory off the end? */
639 else if (ml
->addr
<= start
&& start
< ml
->addr
+ ml
->size
) {
640 ml
->size
= start
- ml
->addr
;
646 * During memory allocation, find the highest address not used yet.
649 check_higher(paddr_t a
)
651 if (a
< next_avail_addr
)
653 next_avail_addr
= RNDUP(a
+ 1, MMU_PAGESIZE
);
654 DBG(next_avail_addr
);
658 dboot_loader_mmap_entries(void)
661 if (num_entries_set
== B_TRUE
)
662 return (num_entries
);
664 switch (multiboot_version
) {
667 if (mb_info
->flags
& 0x40) {
668 mb_memory_map_t
*mmap
;
670 DBG(mb_info
->mmap_addr
);
671 DBG(mb_info
->mmap_length
);
672 check_higher(mb_info
->mmap_addr
+ mb_info
->mmap_length
);
674 for (mmap
= (mb_memory_map_t
*)mb_info
->mmap_addr
;
675 (uint32_t)mmap
< mb_info
->mmap_addr
+
676 mb_info
->mmap_length
;
677 mmap
= (mb_memory_map_t
*)((uint32_t)mmap
+
678 mmap
->size
+ sizeof (mmap
->size
)))
681 num_entries_set
= B_TRUE
;
685 num_entries_set
= B_TRUE
;
686 num_entries
= dboot_multiboot2_mmap_nentries(mb2_info
,
690 dboot_panic("Unknown multiboot version: %d\n",
694 return (num_entries
);
701 dboot_loader_mmap_get_type(int index
)
704 mb_memory_map_t
*mp
, *mpend
;
707 switch (multiboot_version
) {
709 mp
= (mb_memory_map_t
*)mb_info
->mmap_addr
;
710 mpend
= (mb_memory_map_t
*)
711 (mb_info
->mmap_addr
+ mb_info
->mmap_length
);
713 for (i
= 0; mp
< mpend
&& i
!= index
; i
++)
714 mp
= (mb_memory_map_t
*)((uint32_t)mp
+ mp
->size
+
717 dboot_panic("dboot_loader_mmap_get_type(): index "
718 "out of bounds: %d\n", index
);
723 return (dboot_multiboot2_mmap_get_type(mb2_info
,
724 mb2_mmap_tagp
, index
));
727 dboot_panic("Unknown multiboot version: %d\n",
733 return (map_buffer
[index
].type
);
738 dboot_loader_mmap_get_base(int index
)
741 mb_memory_map_t
*mp
, *mpend
;
744 switch (multiboot_version
) {
746 mp
= (mb_memory_map_t
*)mb_info
->mmap_addr
;
747 mpend
= (mb_memory_map_t
*)
748 (mb_info
->mmap_addr
+ mb_info
->mmap_length
);
750 for (i
= 0; mp
< mpend
&& i
!= index
; i
++)
751 mp
= (mb_memory_map_t
*)((uint32_t)mp
+ mp
->size
+
754 dboot_panic("dboot_loader_mmap_get_base(): index "
755 "out of bounds: %d\n", index
);
757 return (((uint64_t)mp
->base_addr_high
<< 32) +
758 (uint64_t)mp
->base_addr_low
);
761 return (dboot_multiboot2_mmap_get_base(mb2_info
,
762 mb2_mmap_tagp
, index
));
765 dboot_panic("Unknown multiboot version: %d\n",
771 return (((uint64_t)map_buffer
[index
].base_addr_high
<< 32) +
772 (uint64_t)map_buffer
[index
].base_addr_low
);
777 dboot_loader_mmap_get_length(int index
)
780 mb_memory_map_t
*mp
, *mpend
;
783 switch (multiboot_version
) {
785 mp
= (mb_memory_map_t
*)mb_info
->mmap_addr
;
786 mpend
= (mb_memory_map_t
*)
787 (mb_info
->mmap_addr
+ mb_info
->mmap_length
);
789 for (i
= 0; mp
< mpend
&& i
!= index
; i
++)
790 mp
= (mb_memory_map_t
*)((uint32_t)mp
+ mp
->size
+
793 dboot_panic("dboot_loader_mmap_get_length(): index "
794 "out of bounds: %d\n", index
);
796 return (((uint64_t)mp
->length_high
<< 32) +
797 (uint64_t)mp
->length_low
);
800 return (dboot_multiboot2_mmap_get_length(mb2_info
,
801 mb2_mmap_tagp
, index
));
804 dboot_panic("Unknown multiboot version: %d\n",
810 return (((uint64_t)map_buffer
[index
].length_high
<< 32) +
811 (uint64_t)map_buffer
[index
].length_low
);
816 build_pcimemlists(void)
818 uint64_t page_offset
= MMU_PAGEOFFSET
; /* needs to be 64 bits */
826 pcimemlists
[0].addr
= pci_lo_limit
;
827 pcimemlists
[0].size
= pci_hi_limit
- pci_lo_limit
;
828 pcimemlists_used
= 1;
830 num
= dboot_loader_mmap_entries();
832 * Fill in PCI memlists.
834 for (i
= 0; i
< num
; ++i
) {
835 start
= dboot_loader_mmap_get_base(i
);
836 end
= start
+ dboot_loader_mmap_get_length(i
);
839 dboot_printf("\ttype: %d %" PRIx64
"..%"
840 PRIx64
"\n", dboot_loader_mmap_get_type(i
),
844 * page align start and end
846 start
= (start
+ page_offset
) & ~page_offset
;
851 exclude_from_pci(start
, end
);
855 * Finish off the pcimemlist
858 for (i
= 0; i
< pcimemlists_used
; ++i
) {
859 dboot_printf("pcimemlist entry 0x%" PRIx64
"..0x%"
860 PRIx64
"\n", pcimemlists
[i
].addr
,
861 pcimemlists
[i
].addr
+ pcimemlists
[i
].size
);
864 pcimemlists
[0].next
= 0;
865 pcimemlists
[0].prev
= 0;
866 for (i
= 1; i
< pcimemlists_used
; ++i
) {
867 pcimemlists
[i
].prev
=
868 (native_ptr_t
)(uintptr_t)(pcimemlists
+ i
- 1);
869 pcimemlists
[i
].next
= 0;
870 pcimemlists
[i
- 1].next
=
871 (native_ptr_t
)(uintptr_t)(pcimemlists
+ i
);
873 bi
->bi_pcimem
= (native_ptr_t
)(uintptr_t)pcimemlists
;
879 * Initialize memory allocator stuff from hypervisor-supplied start info.
884 int local
; /* variables needed to find start region */
885 paddr_t scratch_start
;
886 xen_memory_map_t map
;
888 DBG_MSG("Entered init_mem_alloc()\n");
891 * Free memory follows the stack. There's at least 512KB of scratch
892 * space, rounded up to at least 2Mb alignment. That should be enough
893 * for the page tables we'll need to build. The nucleus memory is
894 * allocated last and will be outside the addressible range. We'll
895 * switch to new page tables before we unpack the kernel
897 scratch_start
= RNDUP((paddr_t
)(uintptr_t)&local
, MMU_PAGESIZE
);
899 scratch_end
= RNDUP((paddr_t
)scratch_start
+ 512 * 1024, TWO_MEG
);
903 * For paranoia, leave some space between hypervisor data and ours.
904 * Use 500 instead of 512.
906 next_avail_addr
= scratch_end
- 500 * 1024;
907 DBG(next_avail_addr
);
910 * The domain builder gives us at most 1 module
912 DBG(xen_info
->mod_len
);
913 if (xen_info
->mod_len
> 0) {
914 DBG(xen_info
->mod_start
);
915 modules
[0].bm_addr
= xen_info
->mod_start
;
916 modules
[0].bm_size
= xen_info
->mod_len
;
917 bi
->bi_module_cnt
= 1;
918 bi
->bi_modules
= (native_ptr_t
)modules
;
920 bi
->bi_module_cnt
= 0;
921 bi
->bi_modules
= NULL
;
923 DBG(bi
->bi_module_cnt
);
926 DBG(xen_info
->mfn_list
);
927 DBG(xen_info
->nr_pages
);
928 max_mem
= (paddr_t
)xen_info
->nr_pages
<< MMU_PAGESHIFT
;
932 * Using pseudo-physical addresses, so only 1 memlist element
934 memlists
[0].addr
= 0;
935 DBG(memlists
[0].addr
);
936 memlists
[0].size
= max_mem
;
937 DBG(memlists
[0].size
);
942 * finish building physinstall list
947 * build bios reserved memlists
949 build_rsvdmemlists();
951 if (DOMAIN_IS_INITDOMAIN(xen_info
)) {
953 * build PCI Memory list
955 map
.nr_entries
= MAXMAPS
;
956 /*LINTED: constant in conditional context*/
957 set_xen_guest_handle(map
.buffer
, map_buffer
);
958 if (HYPERVISOR_memory_op(XENMEM_machine_memory_map
, &map
) != 0)
959 dboot_panic("getting XENMEM_machine_memory_map failed");
967 dboot_multiboot1_xboot_consinfo(void)
972 dboot_multiboot2_xboot_consinfo(void)
977 dboot_multiboot_modcount(void)
979 switch (multiboot_version
) {
981 return (mb_info
->mods_count
);
984 return (dboot_multiboot2_modcount(mb2_info
));
987 dboot_panic("Unknown multiboot version: %d\n",
995 dboot_multiboot_modstart(int index
)
997 switch (multiboot_version
) {
999 return (((mb_module_t
*)mb_info
->mods_addr
)[index
].mod_start
);
1002 return (dboot_multiboot2_modstart(mb2_info
, index
));
1005 dboot_panic("Unknown multiboot version: %d\n",
1013 dboot_multiboot_modend(int index
)
1015 switch (multiboot_version
) {
1017 return (((mb_module_t
*)mb_info
->mods_addr
)[index
].mod_end
);
1020 return (dboot_multiboot2_modend(mb2_info
, index
));
1023 dboot_panic("Unknown multiboot version: %d\n",
1031 dboot_multiboot_modcmdline(int index
)
1033 switch (multiboot_version
) {
1035 return ((char *)((mb_module_t
*)
1036 mb_info
->mods_addr
)[index
].mod_name
);
1039 return (dboot_multiboot2_modcmdline(mb2_info
, index
));
1042 dboot_panic("Unknown multiboot version: %d\n",
1050 * Find the environment module for console setup.
1051 * Since we need the console to print early boot messages, the console is set up
1052 * before anything else and therefore we need to pick up the environment module
1055 * Note, we just will search for and if found, will pass the env
1056 * module to console setup, the proper module list processing will happen later.
1059 dboot_find_env(void)
1062 uint32_t mod_start
, mod_end
;
1065 modcount
= dboot_multiboot_modcount();
1067 for (i
= 0; i
< modcount
; ++i
) {
1068 cmdline
= dboot_multiboot_modcmdline(i
);
1069 if (cmdline
== NULL
)
1072 if (strstr(cmdline
, "type=environment") == NULL
)
1075 mod_start
= dboot_multiboot_modstart(i
);
1076 mod_end
= dboot_multiboot_modend(i
);
1077 modules
[0].bm_addr
= mod_start
;
1078 modules
[0].bm_size
= mod_end
- mod_start
;
1079 modules
[0].bm_name
= NULL
;
1080 modules
[0].bm_hash
= NULL
;
1081 modules
[0].bm_type
= BMT_ENV
;
1082 bi
->bi_modules
= (native_ptr_t
)(uintptr_t)modules
;
1083 bi
->bi_module_cnt
= 1;
1089 dboot_multiboot_basicmeminfo(uint32_t *lower
, uint32_t *upper
)
1091 boolean_t rv
= B_FALSE
;
1093 switch (multiboot_version
) {
1095 if (mb_info
->flags
& 0x01) {
1096 *lower
= mb_info
->mem_lower
;
1097 *upper
= mb_info
->mem_upper
;
1103 return (dboot_multiboot2_basicmeminfo(mb2_info
, lower
, upper
));
1106 dboot_panic("Unknown multiboot version: %d\n",
1117 return (v
- 'a' + 0xa);
1119 return (v
- 'A' + 0xa);
1123 dboot_panic("bad ASCII hex character %c\n", v
);
1129 digest_a2h(const char *ascii
, uint8_t *digest
)
1133 for (i
= 0; i
< SHA1_DIGEST_LENGTH
; i
++) {
1134 digest
[i
] = dboot_a2h(ascii
[i
* 2]) << 4;
1135 digest
[i
] |= dboot_a2h(ascii
[i
* 2 + 1]);
1140 * Generate a SHA-1 hash of the first len bytes of image, and compare it with
1141 * the ASCII-format hash found in the 40-byte buffer at ascii. If they
1142 * match, return 0, otherwise -1. This works only for images smaller than
1143 * 4 GB, which should not be a problem.
1146 check_image_hash(uint_t midx
)
1152 uint8_t digest
[SHA1_DIGEST_LENGTH
];
1153 uint8_t baseline
[SHA1_DIGEST_LENGTH
];
1156 ascii
= (const char *)(uintptr_t)modules
[midx
].bm_hash
;
1157 image
= (const void *)(uintptr_t)modules
[midx
].bm_addr
;
1158 len
= (size_t)modules
[midx
].bm_size
;
1160 digest_a2h(ascii
, baseline
);
1163 SHA1Update(&ctx
, image
, len
);
1164 SHA1Final(digest
, &ctx
);
1166 for (i
= 0; i
< SHA1_DIGEST_LENGTH
; i
++) {
1167 if (digest
[i
] != baseline
[i
])
1175 type_to_str(boot_module_type_t type
)
1185 return ("environment");
1195 char displayhash
[SHA1_ASCII_LENGTH
+ 1];
1197 for (i
= 0; i
< modules_used
; i
++) {
1199 dboot_printf("module #%d: name %s type %s "
1200 "addr %lx size %lx\n",
1201 i
, (char *)(uintptr_t)modules
[i
].bm_name
,
1202 type_to_str(modules
[i
].bm_type
),
1203 (ulong_t
)modules
[i
].bm_addr
,
1204 (ulong_t
)modules
[i
].bm_size
);
1207 if (modules
[i
].bm_type
== BMT_HASH
||
1208 modules
[i
].bm_hash
== NULL
) {
1209 DBG_MSG("module has no hash; skipping check\n");
1212 (void) memcpy(displayhash
,
1213 (void *)(uintptr_t)modules
[i
].bm_hash
,
1215 displayhash
[SHA1_ASCII_LENGTH
] = '\0';
1217 dboot_printf("checking expected hash [%s]: ",
1221 if (check_image_hash(i
) != 0)
1222 dboot_panic("hash mismatch!\n");
1229 * Determine the module's starting address, size, name, and type, and fill the
1230 * boot_modules structure. This structure is used by the bop code, except for
1231 * hashes which are checked prior to transferring control to the kernel.
1234 process_module(int midx
)
1236 uint32_t mod_start
= dboot_multiboot_modstart(midx
);
1237 uint32_t mod_end
= dboot_multiboot_modend(midx
);
1238 char *cmdline
= dboot_multiboot_modcmdline(midx
);
1241 check_higher(mod_end
);
1243 dboot_printf("\tmodule #%d: '%s' at 0x%lx, end 0x%lx\n",
1244 midx
, cmdline
, (ulong_t
)mod_start
, (ulong_t
)mod_end
);
1247 if (mod_start
> mod_end
) {
1248 dboot_panic("module #%d: module start address 0x%lx greater "
1249 "than end address 0x%lx", midx
,
1250 (ulong_t
)mod_start
, (ulong_t
)mod_end
);
1254 * A brief note on lengths and sizes: GRUB, for reasons unknown, passes
1255 * the address of the last valid byte in a module plus 1 as mod_end.
1256 * This is of course a bug; the multiboot specification simply states
1257 * that mod_start and mod_end "contain the start and end addresses of
1258 * the boot module itself" which is pretty obviously not what GRUB is
1259 * doing. However, fixing it requires that not only this code be
1260 * changed but also that other code consuming this value and values
1261 * derived from it be fixed, and that the kernel and GRUB must either
1262 * both have the bug or neither. While there are a lot of combinations
1263 * that will work, there are also some that won't, so for simplicity
1264 * we'll just cope with the bug. That means we won't actually hash the
1265 * byte at mod_end, and we will expect that mod_end for the hash file
1266 * itself is one greater than some multiple of 41 (40 bytes of ASCII
1267 * hash plus a newline for each module). We set bm_size to the true
1268 * correct number of bytes in each module, achieving exactly this.
1271 modules
[midx
].bm_addr
= mod_start
;
1272 modules
[midx
].bm_size
= mod_end
- mod_start
;
1273 modules
[midx
].bm_name
= (native_ptr_t
)(uintptr_t)cmdline
;
1274 modules
[midx
].bm_hash
= NULL
;
1275 modules
[midx
].bm_type
= BMT_FILE
;
1277 if (cmdline
== NULL
) {
1278 modules
[midx
].bm_name
= (native_ptr_t
)(uintptr_t)noname
;
1283 modules
[midx
].bm_name
=
1284 (native_ptr_t
)(uintptr_t)strsep(&p
, " \t\f\n\r");
1287 q
= strsep(&p
, " \t\f\n\r");
1288 if (strncmp(q
, "name=", 5) == 0) {
1289 if (q
[5] != '\0' && !isspace(q
[5])) {
1290 modules
[midx
].bm_name
=
1291 (native_ptr_t
)(uintptr_t)(q
+ 5);
1296 if (strncmp(q
, "type=", 5) == 0) {
1297 if (q
[5] == '\0' || isspace(q
[5]))
1300 if (strcmp(q
, "rootfs") == 0) {
1301 modules
[midx
].bm_type
= BMT_ROOTFS
;
1302 } else if (strcmp(q
, "hash") == 0) {
1303 modules
[midx
].bm_type
= BMT_HASH
;
1304 } else if (strcmp(q
, "environment") == 0) {
1305 modules
[midx
].bm_type
= BMT_ENV
;
1306 } else if (strcmp(q
, "file") != 0) {
1307 dboot_printf("\tmodule #%d: unknown module "
1308 "type '%s'; defaulting to 'file'",
1314 if (strncmp(q
, "hash=", 5) == 0) {
1315 if (q
[5] != '\0' && !isspace(q
[5])) {
1316 modules
[midx
].bm_hash
=
1317 (native_ptr_t
)(uintptr_t)(q
+ 5);
1322 dboot_printf("ignoring unknown option '%s'\n", q
);
1327 * Backward compatibility: if there are exactly one or two modules, both
1328 * of type 'file' and neither with an embedded hash value, we have been
1329 * given the legacy style modules. In this case we need to treat the first
1330 * module as a rootfs and the second as a hash referencing that module.
1331 * Otherwise, even if the configuration is invalid, we assume that the
1332 * operator knows what he's doing or at least isn't being bitten by this
1338 if (modules_used
== 0 || modules_used
> 2)
1341 if (modules
[0].bm_type
!= BMT_FILE
||
1342 modules_used
> 1 && modules
[1].bm_type
!= BMT_FILE
) {
1346 if (modules
[0].bm_hash
!= NULL
||
1347 modules_used
> 1 && modules
[1].bm_hash
!= NULL
) {
1351 modules
[0].bm_type
= BMT_ROOTFS
;
1352 if (modules_used
> 1) {
1353 modules
[1].bm_type
= BMT_HASH
;
1354 modules
[1].bm_name
= modules
[0].bm_name
;
1359 * For modules that do not have assigned hashes but have a separate hash module,
1360 * find the assigned hash module and set the primary module's bm_hash to point
1361 * to the hash data from that module. We will then ignore modules of type
1362 * BMT_HASH from this point forward.
1365 assign_module_hashes(void)
1369 for (i
= 0; i
< modules_used
; i
++) {
1370 if (modules
[i
].bm_type
== BMT_HASH
||
1371 modules
[i
].bm_hash
!= NULL
) {
1375 for (j
= 0; j
< modules_used
; j
++) {
1376 if (modules
[j
].bm_type
!= BMT_HASH
||
1377 strcmp((char *)(uintptr_t)modules
[j
].bm_name
,
1378 (char *)(uintptr_t)modules
[i
].bm_name
) != 0) {
1382 if (modules
[j
].bm_size
< SHA1_ASCII_LENGTH
) {
1383 dboot_printf("Short hash module of length "
1384 "0x%lx bytes; ignoring\n",
1385 (ulong_t
)modules
[j
].bm_size
);
1387 modules
[i
].bm_hash
= modules
[j
].bm_addr
;
1395 * Walk through the module information finding the last used address.
1396 * The first available address will become the top level page table.
1399 dboot_process_modules(void)
1404 DBG_MSG("\nFinding Modules\n");
1405 modcount
= dboot_multiboot_modcount();
1406 if (modcount
> MAX_BOOT_MODULES
) {
1407 dboot_panic("Too many modules (%d) -- the maximum is %d.",
1408 modcount
, MAX_BOOT_MODULES
);
1411 * search the modules to find the last used address
1412 * we'll build the module list while we're walking through here
1414 check_higher((paddr_t
)(uintptr_t)&_end
);
1415 for (i
= 0; i
< modcount
; ++i
) {
1419 bi
->bi_modules
= (native_ptr_t
)(uintptr_t)modules
;
1420 DBG(bi
->bi_modules
);
1421 bi
->bi_module_cnt
= modcount
;
1422 DBG(bi
->bi_module_cnt
);
1425 assign_module_hashes();
1430 * We then build the phys_install memlist from the multiboot information.
1433 dboot_process_mmap(void)
1437 uint64_t page_offset
= MMU_PAGEOFFSET
; /* needs to be 64 bits */
1438 uint32_t lower
, upper
;
1439 int i
, mmap_entries
;
1442 * Walk through the memory map from multiboot and build our memlist
1443 * structures. Note these will have native format pointers.
1445 DBG_MSG("\nFinding Memory Map\n");
1447 num_entries_set
= B_FALSE
;
1449 if ((mmap_entries
= dboot_loader_mmap_entries()) > 0) {
1450 for (i
= 0; i
< mmap_entries
; i
++) {
1451 uint32_t type
= dboot_loader_mmap_get_type(i
);
1452 start
= dboot_loader_mmap_get_base(i
);
1453 end
= start
+ dboot_loader_mmap_get_length(i
);
1456 dboot_printf("\ttype: %d %" PRIx64
"..%"
1457 PRIx64
"\n", type
, start
, end
);
1460 * page align start and end
1462 start
= (start
+ page_offset
) & ~page_offset
;
1463 end
&= ~page_offset
;
1468 * only type 1 is usable RAM
1474 memlists
[memlists_used
].addr
= start
;
1475 memlists
[memlists_used
].size
= end
- start
;
1477 if (memlists_used
> MAX_MEMLIST
)
1478 dboot_panic("too many memlists");
1481 rsvdmemlists
[rsvdmemlists_used
].addr
= start
;
1482 rsvdmemlists
[rsvdmemlists_used
].size
=
1484 ++rsvdmemlists_used
;
1485 if (rsvdmemlists_used
> MAX_MEMLIST
)
1486 dboot_panic("too many rsvdmemlists");
1492 build_pcimemlists();
1493 } else if (dboot_multiboot_basicmeminfo(&lower
, &upper
)) {
1495 memlists
[memlists_used
].addr
= 0;
1496 memlists
[memlists_used
].size
= lower
* 1024;
1499 memlists
[memlists_used
].addr
= 1024 * 1024;
1500 memlists
[memlists_used
].size
= upper
* 1024;
1504 * Old platform - assume I/O space at the end of memory.
1506 pcimemlists
[0].addr
= (upper
* 1024) + (1024 * 1024);
1507 pcimemlists
[0].size
= pci_hi_limit
- pcimemlists
[0].addr
;
1508 pcimemlists
[0].next
= 0;
1509 pcimemlists
[0].prev
= 0;
1510 bi
->bi_pcimem
= (native_ptr_t
)(uintptr_t)pcimemlists
;
1513 dboot_panic("No memory info from boot loader!!!");
1517 * finish processing the physinstall list
1522 * build bios reserved mem lists
1524 build_rsvdmemlists();
1528 * The highest address is used as the starting point for dboot's simple
1531 * Finding the highest address in case of Multiboot 1 protocol is
1532 * quite painful in the sense that some information provided by
1533 * the multiboot info structure points to BIOS data, and some to RAM.
1535 * The module list was processed and checked already by dboot_process_modules(),
1536 * so we will check the command line string and the memory map.
1538 * This list of to be checked items is based on our current knowledge of
1539 * allocations made by grub1 and will need to be reviewed if there
1540 * are updates about the information provided by Multiboot 1.
1542 * In the case of the Multiboot 2, our life is much simpler, as the MB2
1543 * information tag list is one contiguous chunk of memory.
1546 dboot_multiboot1_highest_addr(void)
1548 paddr_t addr
= NULL
;
1549 char *cmdl
= (char *)mb_info
->cmdline
;
1551 if (mb_info
->flags
& MB_INFO_CMDLINE
)
1552 addr
= ((paddr_t
)((uintptr_t)cmdl
+ strlen(cmdl
) + 1));
1554 if (mb_info
->flags
& MB_INFO_MEM_MAP
)
1556 ((paddr_t
)(mb_info
->mmap_addr
+ mb_info
->mmap_length
)));
1561 dboot_multiboot_highest_addr(void)
1565 switch (multiboot_version
) {
1567 addr
= dboot_multiboot1_highest_addr();
1572 addr
= dboot_multiboot2_highest_addr(mb2_info
);
1577 dboot_panic("Unknown multiboot version: %d\n",
/*
 * Walk the boot loader provided information and find the highest free address.
 */
static void
init_mem_alloc(void)
{
	DBG_MSG("Entered init_mem_alloc()\n");
	dboot_process_modules();
	dboot_process_mmap();
	dboot_multiboot_highest_addr();
}
1596 dboot_multiboot_get_fwtables(void)
1598 multiboot_tag_new_acpi_t
*nacpitagp
;
1599 multiboot_tag_old_acpi_t
*oacpitagp
;
1601 /* no fw tables from multiboot 1 */
1602 if (multiboot_version
!= 2)
1605 /* only provide SMBIOS pointer in case of UEFI */
1606 bi
->bi_smbios
= NULL
;
1608 nacpitagp
= (multiboot_tag_new_acpi_t
*)
1609 dboot_multiboot2_find_tag(mb2_info
,
1610 MULTIBOOT_TAG_TYPE_ACPI_NEW
);
1611 oacpitagp
= (multiboot_tag_old_acpi_t
*)
1612 dboot_multiboot2_find_tag(mb2_info
,
1613 MULTIBOOT_TAG_TYPE_ACPI_OLD
);
1615 if (nacpitagp
!= NULL
) {
1616 bi
->bi_acpi_rsdp
= (native_ptr_t
)(uintptr_t)
1617 &nacpitagp
->mb_rsdp
[0];
1618 } else if (oacpitagp
!= NULL
) {
1619 bi
->bi_acpi_rsdp
= (native_ptr_t
)(uintptr_t)
1620 &oacpitagp
->mb_rsdp
[0];
1622 bi
->bi_acpi_rsdp
= NULL
;
1628 * Simple memory allocator, allocates aligned physical memory.
1629 * Note that startup_kernel() only allocates memory, never frees.
1630 * Memory usage just grows in an upward direction.
1633 do_mem_alloc(uint32_t size
, uint32_t align
)
1641 * make sure size is a multiple of pagesize
1643 size
= RNDUP(size
, MMU_PAGESIZE
);
1644 next_avail_addr
= RNDUP(next_avail_addr
, align
);
1649 * a really large bootarchive that causes you to run out of memory
1650 * may cause this to blow up
1652 /* LINTED E_UNEXPECTED_UINT_PROMOTION */
1653 best
= (uint64_t)-size
;
1654 for (i
= 0; i
< memlists_used
; ++i
) {
1655 start
= memlists
[i
].addr
;
1659 end
= start
+ memlists
[i
].size
;
1662 * did we find the desired address?
1664 if (start
<= next_avail_addr
&& next_avail_addr
+ size
<= end
) {
1665 best
= next_avail_addr
;
1670 * if not is this address the best so far?
1672 if (start
> next_avail_addr
&& start
< best
&&
1673 RNDUP(start
, align
) + size
<= end
)
1674 best
= RNDUP(start
, align
);
1678 * We didn't find exactly the address we wanted, due to going off the
1679 * end of a memory region. Return the best found memory address.
1682 next_avail_addr
= best
+ size
;
1684 if (next_avail_addr
> scratch_end
)
1685 dboot_panic("Out of mem next_avail: 0x%lx, scratch_end: "
1686 "0x%lx", (ulong_t
)next_avail_addr
,
1687 (ulong_t
)scratch_end
);
1689 (void) memset((void *)(uintptr_t)best
, 0, size
);
1690 return ((void *)(uintptr_t)best
);
1694 mem_alloc(uint32_t size
)
1696 return (do_mem_alloc(size
, MMU_PAGESIZE
));
1701 * Build page tables to map all of memory used so far as well as the kernel.
1704 build_page_tables(void)
1716 * If we're on metal, we need to create the top level pagetable.
1719 top_page_table
= (paddr_t
)(uintptr_t)xen_info
->pt_base
;
1721 top_page_table
= (paddr_t
)(uintptr_t)mem_alloc(MMU_PAGESIZE
);
1723 DBG((uintptr_t)top_page_table
);
1726 * Determine if we'll use large mappings for kernel, then map it.
1728 if (largepage_support
) {
1732 psize
= MMU_PAGESIZE
;
1736 DBG_MSG("Mapping kernel\n");
1738 DBG(target_kernel_text
);
1741 for (off
= 0; off
< ksize
; off
+= psize
)
1742 map_pa_at_va(ktext_phys
+ off
, target_kernel_text
+ off
, level
);
1745 * The kernel will need a 1 page window to work with page tables
1747 bi
->bi_pt_window
= (uintptr_t)mem_alloc(MMU_PAGESIZE
);
1748 DBG(bi
->bi_pt_window
);
1749 bi
->bi_pte_to_pt_window
=
1750 (uintptr_t)find_pte(bi
->bi_pt_window
, NULL
, 0, 0);
1751 DBG(bi
->bi_pte_to_pt_window
);
1754 if (!DOMAIN_IS_INITDOMAIN(xen_info
)) {
1755 /* If this is a domU we're done. */
1756 DBG_MSG("\nPage tables constructed\n");
1762 * We need 1:1 mappings for the lower 1M of memory to access
1763 * BIOS tables used by a couple of drivers during boot.
1765 * The following code works because our simple memory allocator
1766 * only grows usage in an upwards direction.
1768 * Note that by this point in boot some mappings for low memory
1769 * may already exist because we've already accessed device in low
1770 * memory. (Specifically the video frame buffer and keyboard
1771 * status ports.) If we're booting on raw hardware then GRUB
1772 * created these mappings for us. If we're booting under a
1773 * hypervisor then we went ahead and remapped these devices into
1774 * memory allocated within dboot itself.
1777 dboot_printf("1:1 map pa=0..1Meg\n");
1778 for (start
= 0; start
< 1024 * 1024; start
+= MMU_PAGESIZE
) {
1780 map_ma_at_va(start
, start
, 0);
1782 map_pa_at_va(start
, start
, 0);
1787 for (i
= 0; i
< memlists_used
; ++i
) {
1788 start
= memlists
[i
].addr
;
1790 end
= start
+ memlists
[i
].size
;
1793 dboot_printf("1:1 map pa=%" PRIx64
"..%" PRIx64
"\n",
1795 while (start
< end
&& start
< next_avail_addr
) {
1796 map_pa_at_va(start
, start
, 0);
1797 start
+= MMU_PAGESIZE
;
1802 DBG_MSG("\nPage tables constructed\n");
/*
 * Panic message printed when the kernel was booted via the obsolete
 * "multiboot" GRUB stanza; tells the admin how to fix the menu entry.
 */
#define	NO_MULTIBOOT	\
"multiboot is no longer used to boot the Solaris Operating System.\n\
The grub entry should be changed to:\n\
kernel$ /platform/i86pc/kernel/$ISADIR/unix\n\
module$ /platform/i86pc/$ISADIR/boot_archive\n\
See http://illumos.org/msg/SUNOS-8000-AK for details.\n"
1813 dboot_init_xboot_consinfo(void)
1817 * boot info must be 16 byte aligned for 64 bit kernel ABI
1819 addr
= (uintptr_t)boot_info
;
1820 addr
= (addr
+ 0xf) & ~0xf;
1821 bi
= (struct xboot_info
*)addr
;
1824 switch (multiboot_version
) {
1826 dboot_multiboot1_xboot_consinfo();
1829 dboot_multiboot2_xboot_consinfo();
1832 dboot_panic("Unknown multiboot version: %d\n",
1837 * Lookup environment module for the console. Complete module list
1838 * will be built after console setup.
1845 * Set up basic data from the boot loader.
1846 * The load_addr is part of AOUT kludge setup in dboot_grub.s, to support
1847 * 32-bit dboot code setup used to set up and start 64-bit kernel.
1848 * AOUT kludge does allow 32-bit boot loader, such as grub1, to load and
1849 * start 64-bit illumos kernel.
1852 dboot_loader_init(void)
1859 case MB_BOOTLOADER_MAGIC
:
1860 multiboot_version
= 1;
1861 mb_info
= (multiboot_info_t
*)(uintptr_t)mb_addr
;
1862 #if defined(_BOOT_TARGET_amd64)
1863 load_addr
= mb_header
.load_addr
;
1867 case MULTIBOOT2_BOOTLOADER_MAGIC
:
1868 multiboot_version
= 2;
1869 mb2_info
= (multiboot2_info_header_t
*)(uintptr_t)mb_addr
;
1870 mb2_mmap_tagp
= dboot_multiboot2_get_mmap_tagp(mb2_info
);
1871 #if defined(_BOOT_TARGET_amd64)
1872 load_addr
= mb2_load_addr
;
1877 dboot_panic("Unknown bootloader magic: 0x%x\n", mb_magic
);
1880 #endif /* !defined(__xpv) */
1883 /* Extract the kernel command line from [multi]boot information. */
1885 dboot_loader_cmdline(void)
1890 line
= (char *)xen_info
->cmd_line
;
1893 switch (multiboot_version
) {
1895 if (mb_info
->flags
& MB_INFO_CMDLINE
)
1896 line
= (char *)mb_info
->cmdline
;
1900 line
= dboot_multiboot2_cmdline(mb2_info
);
1904 dboot_panic("Unknown multiboot version: %d\n",
1912 * Make sure we have valid pointer so the string operations
1913 * will not crash us.
1922 dboot_loader_name(void)
1927 multiboot_tag_string_t
*tag
;
1929 switch (multiboot_version
) {
1931 return ((char *)mb_info
->boot_loader_name
);
1934 tag
= dboot_multiboot2_find_tag(mb2_info
,
1935 MULTIBOOT_TAG_TYPE_BOOT_LOADER_NAME
);
1936 return (tag
->mb_string
);
1938 dboot_panic("Unknown multiboot version: %d\n",
1947 * startup_kernel has a pretty simple job. It builds pagetables which reflect
1948 * 1:1 mappings for all memory in use. It then also adds mappings for
1949 * the kernel nucleus at virtual address of target_kernel_text using large page
1950 * mappings. The page table pages are also accessible at 1:1 mapped
1951 * virtual addresses.
1955 startup_kernel(void)
1960 physdev_set_iopl_t set_iopl
;
1963 dboot_loader_init();
1965 * At this point we are executing in a 32 bit real mode.
1968 bootloader
= dboot_loader_name();
1969 cmdline
= dboot_loader_cmdline();
1973 * For dom0, before we initialize the console subsystem we'll
1974 * need to enable io operations, so set I/O priveldge level to 1.
1976 if (DOMAIN_IS_INITDOMAIN(xen_info
)) {
1978 (void) HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl
, &set_iopl
);
1982 dboot_init_xboot_consinfo();
1983 bi
->bi_cmdline
= (native_ptr_t
)(uintptr_t)cmdline
;
1986 prom_debug
= (find_boot_prop("prom_debug") != NULL
);
1987 map_debug
= (find_boot_prop("map_debug") != NULL
);
1990 dboot_multiboot_get_fwtables();
1992 DBG_MSG("\n\nillumos prekernel set: ");
1996 if (bootloader
!= NULL
&& prom_debug
) {
1997 dboot_printf("Kernel loaded by: %s\n", bootloader
);
1999 dboot_printf("Using multiboot %d boot protocol.\n",
2004 if (strstr(cmdline
, "multiboot") != NULL
) {
2005 dboot_panic(NO_MULTIBOOT
);
2010 DBG((uintptr_t)mb_info
);
2011 DBG((uintptr_t)mb2_info
);
2012 if (mb2_info
!= NULL
)
2013 DBG(mb2_info
->mbi_total_size
);
2014 DBG(bi
->bi_acpi_rsdp
);
2019 * Need correct target_kernel_text value
2021 #if defined(_BOOT_TARGET_amd64)
2022 target_kernel_text
= KERNEL_TEXT_amd64
;
2023 #elif defined(__xpv)
2024 target_kernel_text
= KERNEL_TEXT_i386_xpv
;
2026 target_kernel_text
= KERNEL_TEXT_i386
;
2028 DBG(target_kernel_text
);
2033 * XXPV Derive this stuff from CPUID / what the hypervisor has enabled
2036 #if defined(_BOOT_TARGET_amd64)
2038 * 64-bit hypervisor.
2043 #else /* _BOOT_TARGET_amd64 */
2046 * See if we are running on a PAE Hypervisor
2049 xen_capabilities_info_t caps
;
2051 if (HYPERVISOR_xen_version(XENVER_capabilities
, &caps
) != 0)
2052 dboot_panic("HYPERVISOR_xen_version(caps) failed");
2053 caps
[sizeof (caps
) - 1] = 0;
2055 dboot_printf("xen capabilities %s\n", caps
);
2056 if (strstr(caps
, "x86_32p") != NULL
)
2060 #endif /* _BOOT_TARGET_amd64 */
2062 xen_platform_parameters_t p
;
2064 if (HYPERVISOR_xen_version(XENVER_platform_parameters
, &p
) != 0)
2065 dboot_panic("HYPERVISOR_xen_version(parms) failed");
2067 mfn_to_pfn_mapping
= (pfn_t
*)(xen_virt_start
= p
.virt_start
);
2071 * The hypervisor loads stuff starting at 1Gig
2077 * enable writable page table mode for the hypervisor
2079 if (HYPERVISOR_vm_assist(VMASST_CMD_enable
,
2080 VMASST_TYPE_writable_pagetables
) < 0)
2081 dboot_panic("HYPERVISOR_vm_assist(writable_pagetables) failed");
2084 * check for NX support
2087 uint32_t eax
= 0x80000000;
2088 uint32_t edx
= get_cpuid_edx(&eax
);
2090 if (eax
>= 0x80000001) {
2092 edx
= get_cpuid_edx(&eax
);
2093 if (edx
& CPUID_AMD_EDX_NX
)
2098 #if !defined(_BOOT_TARGET_amd64)
2101 * The 32-bit hypervisor uses segmentation to protect itself from
2102 * guests. This means when a guest attempts to install a flat 4GB
2103 * code or data descriptor the 32-bit hypervisor will protect itself
2104 * by silently shrinking the segment such that if the guest attempts
2105 * any access where the hypervisor lives a #gp fault is generated.
2106 * The problem is that some applications expect a full 4GB flat
2107 * segment for their current thread pointer and will use negative
2108 * offset segment wrap around to access data. TLS support in linux
2109 * brand is one example of this.
2111 * The 32-bit hypervisor can catch the #gp fault in these cases
2112 * and emulate the access without passing the #gp fault to the guest
2113 * but only if VMASST_TYPE_4gb_segments is explicitly turned on.
2114 * Seems like this should have been the default.
2115 * Either way, we want the hypervisor -- and not Solaris -- to deal
2116 * to deal with emulating these accesses.
2118 if (HYPERVISOR_vm_assist(VMASST_CMD_enable
,
2119 VMASST_TYPE_4gb_segments
) < 0)
2120 dboot_panic("HYPERVISOR_vm_assist(4gb_segments) failed");
2121 #endif /* !_BOOT_TARGET_amd64 */
2126 * use cpuid to enable MMU features
2132 edx
= get_cpuid_edx(&eax
);
2133 if (edx
& CPUID_INTC_EDX_PSE
)
2134 largepage_support
= 1;
2135 if (edx
& CPUID_INTC_EDX_PGE
)
2137 if (edx
& CPUID_INTC_EDX_PAE
)
2141 edx
= get_cpuid_edx(&eax
);
2142 if (eax
>= 0x80000001) {
2144 edx
= get_cpuid_edx(&eax
);
2145 if (edx
& CPUID_AMD_EDX_LM
)
2147 if (edx
& CPUID_AMD_EDX_NX
)
2151 dboot_printf("cpuid not supported\n");
2156 #if defined(_BOOT_TARGET_amd64)
2157 if (amd64_support
== 0)
2158 dboot_panic("long mode not supported, rebooting");
2159 else if (pae_support
== 0)
2160 dboot_panic("long mode, but no PAE; rebooting");
2163 * Allow the command line to over-ride use of PAE for 32 bit.
2165 if (strstr(cmdline
, "disablePAE=true") != NULL
) {
2173 * initialize the simple memory allocator
2177 #if !defined(__xpv) && !defined(_BOOT_TARGET_amd64)
2179 * disable PAE on 32 bit h/w w/o NX and < 4Gig of memory
2181 if (max_mem
< FOUR_GIG
&& NX_support
== 0)
2186 * configure mmu information
2189 shift_amt
= shift_amt_pae
;
2190 ptes_per_table
= 512;
2192 lpagesize
= TWO_MEG
;
2193 #if defined(_BOOT_TARGET_amd64)
2201 shift_amt
= shift_amt_nopae
;
2202 ptes_per_table
= 1024;
2204 lpagesize
= FOUR_MEG
;
2210 DBG(largepage_support
);
2214 DBG(ptes_per_table
);
2218 ktext_phys
= ONE_GIG
; /* from UNIX Mapfile */
2220 ktext_phys
= FOUR_MEG
; /* from UNIX Mapfile */
2223 #if !defined(__xpv) && defined(_BOOT_TARGET_amd64)
2225 * For grub, copy kernel bits from the ELF64 file to final place.
2227 DBG_MSG("\nAllocating nucleus pages.\n");
2228 ktext_phys
= (uintptr_t)do_mem_alloc(ksize
, FOUR_MEG
);
2229 if (ktext_phys
== 0)
2230 dboot_panic("failed to allocate aligned kernel memory");
2232 if (dboot_elfload64(load_addr
) != 0)
2233 dboot_panic("failed to parse kernel ELF image, rebooting");
2239 * Allocate page tables.
2241 build_page_tables();
2244 * return to assembly code to switch to running kernel
2246 entry_addr_low
= (uint32_t)target_kernel_text
;
2247 DBG(entry_addr_low
);
2248 bi
->bi_use_largepage
= largepage_support
;
2249 bi
->bi_use_pae
= pae_support
;
2250 bi
->bi_use_pge
= pge_support
;
2251 bi
->bi_use_nx
= NX_support
;
2255 bi
->bi_next_paddr
= next_avail_addr
- mfn_base
;
2256 DBG(bi
->bi_next_paddr
);
2257 bi
->bi_next_vaddr
= (native_ptr_t
)next_avail_addr
;
2258 DBG(bi
->bi_next_vaddr
);
2261 * unmap unused pages in start area to make them available for DMA
2263 while (next_avail_addr
< scratch_end
) {
2264 (void) HYPERVISOR_update_va_mapping(next_avail_addr
,
2265 0, UVMF_INVLPG
| UVMF_LOCAL
);
2266 next_avail_addr
+= MMU_PAGESIZE
;
2269 bi
->bi_xen_start_info
= (uintptr_t)xen_info
;
2270 DBG((uintptr_t)HYPERVISOR_shared_info
);
2271 bi
->bi_shared_info
= (native_ptr_t
)HYPERVISOR_shared_info
;
2272 bi
->bi_top_page_table
= (uintptr_t)top_page_table
- mfn_base
;
2276 bi
->bi_next_paddr
= next_avail_addr
;
2277 DBG(bi
->bi_next_paddr
);
2278 bi
->bi_next_vaddr
= (uintptr_t)next_avail_addr
;
2279 DBG(bi
->bi_next_vaddr
);
2280 bi
->bi_mb_version
= multiboot_version
;
2282 switch (multiboot_version
) {
2284 bi
->bi_mb_info
= (uintptr_t)mb_info
;
2287 bi
->bi_mb_info
= (uintptr_t)mb2_info
;
2290 dboot_panic("Unknown multiboot version: %d\n",
2294 bi
->bi_top_page_table
= (uintptr_t)top_page_table
;
2298 bi
->bi_kseg_size
= FOUR_MEG
;
2299 DBG(bi
->bi_kseg_size
);
2306 DBG_MSG("\n\n*** DBOOT DONE -- back to asm to jump to kernel\n\n");