/*
 * Machine specific setup for xen
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/memblock.h>
#include <linux/cpuidle.h>
#include <linux/cpufreq.h>

#include <asm/elf.h>
#include <asm/vdso.h>
#include <asm/e820.h>
#include <asm/setup.h>
#include <asm/acpi.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/xen.h>
#include <xen/page.h>
#include <xen/interface/callback.h>
#include <xen/interface/memory.h>
#include <xen/interface/physdev.h>
#include <xen/features.h>

#include "xen-ops.h"
#include "vdso.h"
/* These are code, but not functions.  Defined in entry.S */
extern const char xen_hypervisor_callback[];
extern const char xen_failsafe_callback[];
#ifdef CONFIG_X86_64
extern asmlinkage void nmi(void);
#endif
extern void xen_sysenter_target(void);
extern void xen_syscall_target(void);
extern void xen_syscall32_target(void);
/* Amount of extra memory space we add to the e820 ranges */
struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;

/* Number of pages released from the initial allocation. */
unsigned long xen_released_pages;
/*
 * The maximum amount of extra memory compared to the base size.  The
 * main scaling factor is the size of struct page.  At extreme ratios
 * of base:extra, all the base memory can be filled with page
 * structures for the extra memory, leaving no space for anything
 * else.
 *
 * 10x seems like a reasonable balance between scaling flexibility and
 * leaving a practically usable system.
 */
#define EXTRA_MEM_RATIO		(10)
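
/*
 * Record a region beyond the initial allocation in xen_extra_mem[] so the
 * balloon driver can populate it later, and invalidate any stale P2M
 * entries that still cover it.
 */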
static void __init xen_add_extra_mem(u64 start, u64 size)
{
	unsigned long pfn;
	int i;

	for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) {
		/* Add new region. */
		if (xen_extra_mem[i].size == 0) {
			xen_extra_mem[i].start = start;
			xen_extra_mem[i].size  = size;
			break;
		}
		/* Append to existing region. */
		if (xen_extra_mem[i].start + xen_extra_mem[i].size == start) {
			xen_extra_mem[i].size += size;
			break;
		}
	}
	if (i == XEN_EXTRA_MEM_MAX_REGIONS)
		printk(KERN_WARNING "Warning: not enough extra memory regions\n");

	memblock_reserve(start, size);

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return;

	xen_max_p2m_pfn = PFN_DOWN(start + size);
	for (pfn = PFN_DOWN(start); pfn < xen_max_p2m_pfn; pfn++) {
		unsigned long mfn = pfn_to_mfn(pfn);

		if (WARN_ONCE(mfn == pfn, "Trying to over-write 1-1 mapping (pfn: %lx)\n", pfn))
			continue;
		WARN_ONCE(mfn != INVALID_P2M_ENTRY, "Trying to remove %lx which has %lx mfn!\n",
			  pfn, mfn);

		__set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
	}
}
static unsigned long __init xen_do_chunk(unsigned long start,
					 unsigned long end, bool release)
{
	struct xen_memory_reservation reservation = {
		.address_bits = 0,
		.extent_order = 0,
		.domid        = DOMID_SELF
	};
	unsigned long len = 0;
	int xlated_phys = xen_feature(XENFEAT_auto_translated_physmap);
	unsigned long pfn;
	int ret;

	for (pfn = start; pfn < end; pfn++) {
		unsigned long frame;
		unsigned long mfn = pfn_to_mfn(pfn);

		if (release) {
			/* Make sure pfn exists to start with */
			if (mfn == INVALID_P2M_ENTRY || mfn_to_pfn(mfn) != pfn)
				continue;
			frame = mfn;
		} else {
			if (!xlated_phys && mfn != INVALID_P2M_ENTRY)
				continue;
			frame = pfn;
		}
		set_xen_guest_handle(reservation.extent_start, &frame);
		reservation.nr_extents = 1;

		ret = HYPERVISOR_memory_op(release ? XENMEM_decrease_reservation : XENMEM_populate_physmap,
					   &reservation);
		WARN(ret != 1, "Failed to %s pfn %lx err=%d\n",
		     release ? "release" : "populate", pfn, ret);

		if (ret == 1) {
			if (!early_set_phys_to_machine(pfn, release ? INVALID_P2M_ENTRY : frame)) {
				if (release)
					break;
				set_xen_guest_handle(reservation.extent_start, &frame);
				reservation.nr_extents = 1;
				ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation,
							   &reservation);
				break;
			}
			len++;
		} else
			break;
	}
	if (len)
		printk(KERN_INFO "%s %lx-%lx pfn range: %lu pages %s\n",
		       release ? "Freeing" : "Populating",
		       start, end, len,
		       release ? "freed" : "added");

	return len;
}
static unsigned long __init xen_release_chunk(unsigned long start,
					      unsigned long end)
{
	/*
	 * Xen already ballooned out the E820 non RAM regions for us
	 * and set them up properly in EPT.
	 */
	if (xen_feature(XENFEAT_auto_translated_physmap))
		return end - start;

	return xen_do_chunk(start, end, true);
}
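
/*
 * Hand back (populate) up to credits_left pages into the RAM regions of
 * the E820 map that lie above the initial nr_pages allocation, recording
 * the highest PFN touched in *last_pfn.
 */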
static unsigned long __init xen_populate_chunk(
	const struct e820entry *list, size_t map_size,
	unsigned long max_pfn, unsigned long *last_pfn,
	unsigned long credits_left)
{
	const struct e820entry *entry;
	unsigned int i;
	unsigned long done = 0;
	unsigned long dest_pfn;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		unsigned long s_pfn;
		unsigned long e_pfn;
		unsigned long pfns;
		long capacity;

		if (credits_left <= 0)
			break;

		if (entry->type != E820_RAM)
			continue;

		e_pfn = PFN_DOWN(entry->addr + entry->size);

		/* We only care about E820 after the xen_start_info->nr_pages */
		if (e_pfn <= max_pfn)
			continue;

		s_pfn = PFN_UP(entry->addr);
		/* If the E820 falls within the nr_pages, we want to start
		 * at the nr_pages PFN.
		 * If that would mean going past the E820 entry, skip it
		 */
		if (s_pfn <= max_pfn) {
			capacity = e_pfn - max_pfn;
			dest_pfn = max_pfn;
		} else {
			capacity = e_pfn - s_pfn;
			dest_pfn = s_pfn;
		}

		if (credits_left < capacity)
			capacity = credits_left;

		pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
		done += pfns;
		*last_pfn = (dest_pfn + pfns);
		if (pfns < capacity)
			break;
		credits_left -= pfns;
	}
	return done;
}
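
/*
 * For one chunk of non-RAM address space: clear any existing kernel
 * mappings (except the ISA region), release the frames that fall inside
 * the initial allocation, and mark the whole range identity-mapped in
 * the P2M.
 */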
static void __init xen_set_identity_and_release_chunk(
	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
	unsigned long *released, unsigned long *identity)
{
	unsigned long pfn;

	/*
	 * If the PFNs are currently mapped, clear the mappings
	 * (except for the ISA region which must be 1:1 mapped) to
	 * release the refcounts (in Xen) on the original frames.
	 */

	/*
	 * PVH E820 matches the hypervisor's P2M which means we need to
	 * account for the proper values of *released and *identity.
	 */
	for (pfn = start_pfn; !xen_feature(XENFEAT_auto_translated_physmap) &&
	     pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
		pte_t pte = __pte_ma(0);

		if (pfn < PFN_UP(ISA_END_ADDRESS))
			pte = mfn_pte(pfn, PAGE_KERNEL_IO);

		(void)HYPERVISOR_update_va_mapping(
			(unsigned long)__va(pfn << PAGE_SHIFT), pte, 0);
	}

	if (start_pfn < nr_pages)
		*released += xen_release_chunk(
			start_pfn, min(end_pfn, nr_pages));

	*identity += set_phys_range_identity(start_pfn, end_pfn);
}
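
/*
 * Walk the E820 map, coalescing non-RAM regions and gaps, and hand each
 * resulting chunk to xen_set_identity_and_release_chunk().  Returns the
 * number of pages released back to Xen.
 */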
static unsigned long __init xen_set_identity_and_release(
	const struct e820entry *list, size_t map_size, unsigned long nr_pages)
{
	phys_addr_t start = 0;
	unsigned long released = 0;
	unsigned long identity = 0;
	const struct e820entry *entry;
	int i;

	/*
	 * Combine non-RAM regions and gaps until a RAM region (or the
	 * end of the map) is reached, then set the 1:1 map and
	 * release the pages (if available) in those non-RAM regions.
	 *
	 * The combined non-RAM regions are rounded to a whole number
	 * of pages so any partial pages are accessible via the 1:1
	 * mapping.  This is needed for some BIOSes that put (for
	 * example) the DMI tables in a reserved region that begins on
	 * a non-page boundary.
	 */
	for (i = 0, entry = list; i < map_size; i++, entry++) {
		phys_addr_t end = entry->addr + entry->size;
		if (entry->type == E820_RAM || i == map_size - 1) {
			unsigned long start_pfn = PFN_DOWN(start);
			unsigned long end_pfn = PFN_UP(end);

			if (entry->type == E820_RAM)
				end_pfn = PFN_UP(entry->addr);

			if (start_pfn < end_pfn)
				xen_set_identity_and_release_chunk(
					start_pfn, end_pfn, nr_pages,
					&released, &identity);

			start = end;
		}
	}

	if (released)
		printk(KERN_INFO "Released %lu pages of unused memory\n", released);
	if (identity)
		printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);

	return released;
}
static unsigned long __init xen_get_max_pages(void)
{
	unsigned long max_pages = MAX_DOMAIN_PAGES;
	domid_t domid = DOMID_SELF;
	int ret;

	/*
	 * For the initial domain we use the maximum reservation as
	 * the maximum page.
	 *
	 * For guest domains the current maximum reservation reflects
	 * the current maximum rather than the static maximum. In this
	 * case the e820 map provided to us will cover the static
	 * maximum region.
	 */
	if (xen_initial_domain()) {
		ret = HYPERVISOR_memory_op(XENMEM_maximum_reservation, &domid);
		if (ret > 0)
			max_pages = ret;
	}

	return min(max_pages, MAX_DOMAIN_PAGES);
}
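
/* Trim RAM regions to whole pages before adding them to the kernel e820. */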
static void xen_align_and_add_e820_region(u64 start, u64 size, int type)
{
	u64 end = start + size;

	/* Align RAM regions to page boundaries. */
	if (type == E820_RAM) {
		start = PAGE_ALIGN(start);
		end &= ~((u64)PAGE_SIZE - 1);
	}

	e820_add_region(start, end - start, type);
}
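
/*
 * Dom0 cannot create 1:1 mappings of UNUSABLE regions, so treat them as
 * RAM here; see the comment in xen_memory_setup() below.
 */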
void xen_ignore_unusable(struct e820entry *list, size_t map_size)
{
	struct e820entry *entry;
	unsigned int i;

	for (i = 0, entry = list; i < map_size; i++, entry++) {
		if (entry->type == E820_UNUSABLE)
			entry->type = E820_RAM;
	}
}
/**
 * machine_specific_memory_setup - Hook for machine specific memory setup.
 **/
char * __init xen_memory_setup(void)
{
	static struct e820entry map[E820MAX] __initdata;

	unsigned long max_pfn = xen_start_info->nr_pages;
	unsigned long long mem_end;
	int rc;
	struct xen_memory_map memmap;
	unsigned long max_pages;
	unsigned long last_pfn = 0;
	unsigned long extra_pages = 0;
	unsigned long populated;
	int i;
	int op;

	max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
	mem_end = PFN_PHYS(max_pfn);

	memmap.nr_entries = E820MAX;
	set_xen_guest_handle(memmap.buffer, map);

	op = xen_initial_domain() ?
		XENMEM_machine_memory_map :
		XENMEM_memory_map;
	rc = HYPERVISOR_memory_op(op, &memmap);
	if (rc == -ENOSYS) {
		BUG_ON(xen_initial_domain());
		memmap.nr_entries = 1;
		map[0].addr = 0ULL;
		map[0].size = mem_end;
		/* 8MB slack (to balance backend allocations). */
		map[0].size += 8ULL << 20;
		map[0].type = E820_RAM;
		rc = 0;
	}
	BUG_ON(rc);

	/*
	 * Xen won't allow a 1:1 mapping to be created to UNUSABLE
	 * regions, so if we're using the machine memory map leave the
	 * region as RAM as it is in the pseudo-physical map.
	 *
	 * UNUSABLE regions in domUs are not handled and will need
	 * a patch in the future.
	 */
	if (xen_initial_domain())
		xen_ignore_unusable(map, memmap.nr_entries);

	/* Make sure the Xen-supplied memory map is well-ordered. */
	sanitize_e820_map(map, memmap.nr_entries, &memmap.nr_entries);

	max_pages = xen_get_max_pages();
	if (max_pages > max_pfn)
		extra_pages += max_pages - max_pfn;

	/*
	 * Set P2M for all non-RAM pages and E820 gaps to be identity
	 * type PFNs.  Any RAM pages that would be made inaccessible by
	 * this are first released.
	 */
	xen_released_pages = xen_set_identity_and_release(
		map, memmap.nr_entries, max_pfn);

	/*
	 * Populate back the non-RAM pages and E820 gaps that had been
	 * released.
	 */
	populated = xen_populate_chunk(map, memmap.nr_entries,
			max_pfn, &last_pfn, xen_released_pages);

	xen_released_pages -= populated;
	extra_pages += xen_released_pages;

	if (last_pfn > max_pfn) {
		max_pfn = min(MAX_DOMAIN_PAGES, last_pfn);
		mem_end = PFN_PHYS(max_pfn);
	}

	/*
	 * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
	 * factor of the base size.  On non-highmem systems, the base
	 * size is the full initial memory allocation; on highmem it
	 * is limited to the max size of lowmem, so that it doesn't
	 * get completely filled.
	 *
	 * In principle there could be a problem in lowmem systems if
	 * the initial memory is also very large with respect to
	 * lowmem, but we won't try to deal with that here.
	 */
	extra_pages = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
			  extra_pages);
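
	/*
	 * Walk the memory map: RAM below mem_end is kept, RAM above it is
	 * either turned into extra memory (while extra_pages last) or
	 * marked UNUSABLE before being handed to the e820.
	 */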
	i = 0;
	while (i < memmap.nr_entries) {
		u64 addr = map[i].addr;
		u64 size = map[i].size;
		u32 type = map[i].type;

		if (type == E820_RAM) {
			if (addr < mem_end) {
				size = min(size, mem_end - addr);
			} else if (extra_pages) {
				size = min(size, (u64)extra_pages * PAGE_SIZE);
				extra_pages -= size / PAGE_SIZE;
				xen_add_extra_mem(addr, size);
			} else
				type = E820_UNUSABLE;
		}

		xen_align_and_add_e820_region(addr, size, type);

		map[i].addr += size;
		map[i].size -= size;
		if (map[i].size == 0)
			i++;
	}

	/*
	 * Set the rest as identity mapped, in case PCI BARs are
	 * located here.
	 *
	 * PFNs above MAX_P2M_PFN are considered identity mapped as
	 * well.
	 */
	set_phys_range_identity(map[i-1].addr / PAGE_SIZE, ~0ul);

	/*
	 * In domU, the ISA region is normal, usable memory, but we
	 * reserve ISA memory anyway because too many things poke
	 * about in there.
	 */
	e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
			E820_RESERVED);

	/*
	 * Reserve Xen bits:
	 *  - mfn_list
	 *  - xen_start_info
	 * See comment above "struct start_info" in <xen/interface/xen.h>
	 * We tried to make the memblock_reserve more selective so
	 * that it would be clear what region is reserved. Sadly we ran
	 * into the problem wherein on a 64-bit hypervisor with a 32-bit
	 * initial domain, the pt_base has the cr3 value which is not
	 * necessarily where the pagetable starts! As Jan put it: "
	 * Actually, the adjustment turns out to be correct: The page
	 * tables for a 32-on-64 dom0 get allocated in the order "first L1",
	 * "first L2", "first L3", so the offset to the page table base is
	 * indeed 2. When reading xen/include/public/xen.h's comment
	 * very strictly, this is not a violation (since there nothing is said
	 * that the first thing in the page table space is pointed to by
	 * pt_base; I admit that this seems to be implied though, namely
	 * do I think that it is implied that the page table space is the
	 * range [pt_base, pt_base + nt_pt_frames), whereas that
	 * range here indeed is [pt_base - 2, pt_base - 2 + nt_pt_frames),
	 * which - without a priori knowledge - the kernel would have
	 * difficulty to figure out)." - so let's just fall back to the
	 * easy way and reserve the whole region.
	 */
	memblock_reserve(__pa(xen_start_info->mfn_list),
			 xen_start_info->pt_base - xen_start_info->mfn_list);

	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);

	return "Xen";
}
/*
 * Set the bit indicating "nosegneg" library variants should be used.
 * We only need to bother in pure 32-bit mode; compat 32-bit processes
 * can have un-truncated segments, so wrapping around is allowed.
 */
static void __init fiddle_vdso(void)
{
#ifdef CONFIG_X86_32
	/*
	 * This could be called before selected_vdso32 is initialized, so
	 * just fiddle with both possible images.  vdso_image_32_syscall
	 * can't be selected, since it only exists on 64-bit systems.
	 */
	u32 *mask;
	mask = vdso_image_32_int80.data +
		vdso_image_32_int80.sym_VDSO32_NOTE_MASK;
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
	mask = vdso_image_32_sysenter.data +
		vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK;
	*mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
#endif
}
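
/*
 * Register an entry point with the hypervisor for the given callback type,
 * with events masked on entry.
 */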
static int register_callback(unsigned type, const void *func)
{
	struct callback_register callback = {
		.type = type,
		.address = XEN_CALLBACK(__KERNEL_CS, func),
		.flags = CALLBACKF_mask_events,
	};

	return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
}
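
/*
 * Route the SYSENTER fast system call path through the Xen entry stub; if
 * the callback cannot be registered, clear the CPU feature so the vDSO
 * falls back to int $0x80.
 */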
void xen_enable_sysenter(void)
{
	int ret;
	unsigned sysenter_feature;

#ifdef CONFIG_X86_32
	sysenter_feature = X86_FEATURE_SEP;
#else
	sysenter_feature = X86_FEATURE_SYSENTER32;
#endif

	if (!boot_cpu_has(sysenter_feature))
		return;

	ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
	if (ret != 0)
		setup_clear_cpu_cap(sysenter_feature);
}
void xen_enable_syscall(void)
{
#ifdef CONFIG_X86_64
	int ret;

	ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
	if (ret != 0) {
		printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
		/* Pretty fatal; 64-bit userspace has no other
		   mechanism for syscalls. */
	}

	if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
		ret = register_callback(CALLBACKTYPE_syscall32,
					xen_syscall32_target);
		if (ret != 0)
			setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
	}
#endif /* CONFIG_X86_64 */
}
void xen_enable_nmi(void)
{
#ifdef CONFIG_X86_64
	if (register_callback(CALLBACKTYPE_nmi, (char *)nmi))
		BUG();
#endif
}
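
/*
 * PV-MMU specific setup: enable the vm_assist modes the kernel relies on
 * and register the event and failsafe callbacks.  Not used for
 * auto-translated (PVH) guests.
 */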
void __init xen_pvmmu_arch_setup(void)
{
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
	HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables);

	HYPERVISOR_vm_assist(VMASST_CMD_enable,
			     VMASST_TYPE_pae_extended_cr3);

	if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
	    register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
		BUG();

	xen_enable_sysenter();
	xen_enable_syscall();
	xen_enable_nmi();
}

/* This function is not called for HVM domains */
void __init xen_arch_setup(void)
{
	xen_panic_handler_init();
	if (!xen_feature(XENFEAT_auto_translated_physmap))
		xen_pvmmu_arch_setup();

#ifdef CONFIG_ACPI
	if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
		printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
		disable_acpi();
	}
#endif

	memcpy(boot_command_line, xen_start_info->cmd_line,
	       MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
	       COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);

	/* Set up idle, making sure it calls safe_halt() pvop */
	disable_cpuidle();
	disable_cpufreq();
	WARN_ON(xen_set_default_idle());
	fiddle_vdso();
}