4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
26 #include <sys/modctl.h>
27 #include <sys/types.h>
28 #include <sys/archsystm.h>
29 #include <sys/machsystm.h>
30 #include <sys/sunndi.h>
31 #include <sys/sunddi.h>
32 #include <sys/ddi_subrdefs.h>
33 #include <sys/xpv_support.h>
34 #include <sys/xen_errno.h>
35 #include <sys/hypervisor.h>
36 #include <sys/gnttab.h>
37 #include <sys/xenbus_comms.h>
38 #include <sys/xenbus_impl.h>
39 #include <xen/sys/xendev.h>
40 #include <sys/sysmacros.h>
41 #include <sys/x86_archext.h>
45 #include <sys/devops.h>
46 #include <sys/pc_mmu.h>
47 #include <sys/cmn_err.h>
50 #include <vm/seg_kmem.h>
52 #include <vm/hat_pte.h>
53 #include <vm/hat_i86.h>
56 #define XPV_BUFSIZE 128
58 /* virtual addr for the store_mfn page */
62 static dev_info_t
*xpvd_dip
;
65 int xen_suspend_debug
;
67 #define SUSPEND_DEBUG if (xen_suspend_debug) xen_printf
69 #define SUSPEND_DEBUG(...)
73 * Forward declarations
75 static int xpv_getinfo(dev_info_t
*, ddi_info_cmd_t
, void *, void **);
76 static int xpv_attach(dev_info_t
*, ddi_attach_cmd_t
);
77 static int xpv_detach(dev_info_t
*, ddi_detach_cmd_t
);
78 static int xpv_open(dev_t
*, int, int, cred_t
*);
79 static int xpv_ioctl(dev_t
, int, intptr_t, int, cred_t
*, int *);
81 static struct cb_ops xpv_cb_ops
= {
89 xpv_ioctl
, /* ioctl */
102 static struct dev_ops xpv_dv_ops
= {
106 nulldev
, /* identify */
112 NULL
, /* struct bus_ops */
114 ddi_quiesce_not_supported
, /* devo_quiesce */
117 static struct modldrv modldrv
= {
123 static struct modlinkage modl
= {
127 NULL
/* null termination */
131 static ddi_dma_attr_t xpv_dma_attr
= {
132 DMA_ATTR_V0
, /* version of this structure */
133 0, /* lowest usable address */
134 0xffffffffffffffffULL
, /* highest usable address */
135 0x7fffffff, /* maximum DMAable byte count */
136 MMU_PAGESIZE
, /* alignment in bytes */
137 0x7ff, /* bitmap of burst sizes */
138 1, /* minimum transfer */
139 0xffffffffU
, /* maximum transfer */
140 0x7fffffffULL
, /* maximum segment length */
141 1, /* maximum number of segments */
143 0, /* flags (reserved) */
146 static ddi_device_acc_attr_t xpv_accattr
= {
152 #define MAX_ALLOCATIONS 10
153 static ddi_dma_handle_t xpv_dma_handle
[MAX_ALLOCATIONS
];
154 static ddi_acc_handle_t xpv_dma_acchandle
[MAX_ALLOCATIONS
];
155 static int xen_alloc_cnt
= 0;
158 xen_alloc_pages(pgcnt_t cnt
)
161 int a
= xen_alloc_cnt
++;
164 ASSERT(xen_alloc_cnt
< MAX_ALLOCATIONS
);
165 if (ddi_dma_alloc_handle(xpv_dip
, &xpv_dma_attr
, DDI_DMA_SLEEP
, 0,
166 &xpv_dma_handle
[a
]) != DDI_SUCCESS
)
169 if (ddi_dma_mem_alloc(xpv_dma_handle
[a
], MMU_PAGESIZE
* cnt
,
170 &xpv_accattr
, DDI_DMA_CONSISTENT
, DDI_DMA_SLEEP
, 0,
171 &addr
, &len
, &xpv_dma_acchandle
[a
]) != DDI_SUCCESS
) {
172 ddi_dma_free_handle(&xpv_dma_handle
[a
]);
173 cmn_err(CE_WARN
, "Couldn't allocate memory for xpv devices");
180 * This function is invoked twice, first time with reprogram=0 to set up
181 * the xpvd portion of the device tree. The second time it is ignored.
184 xpv_enumerate(int reprogram
)
191 ndi_devi_alloc_sleep(ddi_root_node(), "xpvd",
192 (pnode_t
)DEVI_SID_NODEID
, &dip
);
194 (void) ndi_devi_bind_driver(dip
, 0);
197 * Too early to enumerate split device drivers in domU
198 * since we need to create taskq thread during enumeration.
199 * So, we only enumerate softdevs and console here.
201 xendev_enum_all(dip
, B_TRUE
);
205 * Translate a hypervisor errcode to a Solaris error code.
208 xen_xlate_errcode(int error
)
210 #define CASE(num) case X_##num: error = num; break
213 CASE(EPERM
); CASE(ENOENT
); CASE(ESRCH
);
214 CASE(EINTR
); CASE(EIO
); CASE(ENXIO
);
215 CASE(E2BIG
); CASE(ENOMEM
); CASE(EACCES
);
216 CASE(EFAULT
); CASE(EBUSY
); CASE(EEXIST
);
217 CASE(ENODEV
); CASE(EISDIR
); CASE(EINVAL
);
218 CASE(ENOSPC
); CASE(ESPIPE
); CASE(EROFS
);
219 CASE(ENOSYS
); CASE(ENOTEMPTY
); CASE(EISCONN
);
222 panic("xen_xlate_errcode: unknown error %d", error
);
230 xen_printf(const char *fmt
, ...)
240 * Stub functions to get the FE drivers to build, and to catch drivers that
241 * misbehave in HVM domains.
245 xen_release_pfn(pfn_t pfn
)
247 panic("xen_release_pfn() is not supported in HVM domains");
252 reassign_pfn(pfn_t pfn
, mfn_t mfn
)
254 panic("reassign_pfn() is not supported in HVM domains");
259 balloon_free_pages(uint_t page_cnt
, mfn_t
*mfns
, caddr_t kva
, pfn_t
*pfns
)
261 panic("balloon_free_pages() is not supported in HVM domains");
267 balloon_drv_added(int64_t delta
)
269 panic("balloon_drv_added() is not supported in HVM domains");
273 * Add a mapping for the machine page at the given virtual address.
276 kbm_map_ma(maddr_t ma
, uintptr_t va
, uint_t level
)
280 hat_devload(kas
.a_hat
, (caddr_t
)va
, MMU_PAGESIZE
,
281 mmu_btop(ma
), PROT_READ
| PROT_WRITE
, HAT_LOAD
);
286 xen_map_gref(uint_t cmd
, gnttab_map_grant_ref_t
*mapop
, uint_t count
,
291 ASSERT(cmd
== GNTTABOP_map_grant_ref
);
292 rc
= HYPERVISOR_grant_table_op(cmd
, mapop
, count
);
297 static struct xenbus_watch shutdown_watch
;
298 taskq_t
*xen_shutdown_tq
;
300 #define SHUTDOWN_INVALID -1
301 #define SHUTDOWN_POWEROFF 0
302 #define SHUTDOWN_REBOOT 1
303 #define SHUTDOWN_SUSPEND 2
304 #define SHUTDOWN_HALT 3
305 #define SHUTDOWN_MAX 4
307 #define SHUTDOWN_TIMEOUT_SECS (60 * 5)
310 xen_suspend_devices(dev_info_t
*dip
)
313 char buf
[XPV_BUFSIZE
];
315 SUSPEND_DEBUG("xen_suspend_devices\n");
317 for (; dip
!= NULL
; dip
= ddi_get_next_sibling(dip
)) {
318 if (xen_suspend_devices(ddi_get_child(dip
)))
320 if (ddi_get_driver(dip
) == NULL
)
322 SUSPEND_DEBUG("Suspending device %s\n", ddi_deviname(dip
, buf
));
323 ASSERT((DEVI(dip
)->devi_cpr_flags
& DCF_CPR_SUSPENDED
) == 0);
326 if (!i_ddi_devi_attached(dip
)) {
329 error
= devi_detach(dip
, DDI_SUSPEND
);
332 if (error
== DDI_SUCCESS
) {
333 DEVI(dip
)->devi_cpr_flags
|= DCF_CPR_SUSPENDED
;
335 SUSPEND_DEBUG("WARNING: Unable to suspend device %s\n",
336 ddi_deviname(dip
, buf
));
337 cmn_err(CE_WARN
, "Unable to suspend device %s.",
338 ddi_deviname(dip
, buf
));
339 cmn_err(CE_WARN
, "Device is busy or does not "
340 "support suspend/resume.");
348 xen_resume_devices(dev_info_t
*start
, int resume_failed
)
350 dev_info_t
*dip
, *next
, *last
= NULL
;
352 int error
= resume_failed
;
353 char buf
[XPV_BUFSIZE
];
355 SUSPEND_DEBUG("xen_resume_devices\n");
357 while (last
!= start
) {
359 next
= ddi_get_next_sibling(dip
);
360 while (next
!= last
) {
362 next
= ddi_get_next_sibling(dip
);
366 * cpr is the only one that uses this field and the device
367 * itself hasn't resumed yet, there is no need to use a
368 * lock, even though kernel threads are active by now.
370 did_suspend
= DEVI(dip
)->devi_cpr_flags
& DCF_CPR_SUSPENDED
;
372 DEVI(dip
)->devi_cpr_flags
&= ~DCF_CPR_SUSPENDED
;
375 * There may be background attaches happening on devices
376 * that were not originally suspended by cpr, so resume
377 * only devices that were suspended by cpr. Also, stop
378 * resuming after the first resume failure, but traverse
379 * the entire tree to clear the suspend flag.
381 if (did_suspend
&& !error
) {
382 SUSPEND_DEBUG("Resuming device %s\n",
383 ddi_deviname(dip
, buf
));
385 * If a device suspended by cpr gets detached during
386 * the resume process (for example, due to hotplugging)
387 * before cpr gets around to issuing it a DDI_RESUME,
388 * we'll have problems.
390 if (!i_ddi_devi_attached(dip
)) {
391 cmn_err(CE_WARN
, "Skipping %s, device "
392 "not ready for resume",
393 ddi_deviname(dip
, buf
));
395 if (devi_attach(dip
, DDI_RESUME
) !=
402 if (error
== ENXIO
) {
403 cmn_err(CE_WARN
, "Unable to resume device %s",
404 ddi_deviname(dip
, buf
));
407 error
= xen_resume_devices(ddi_get_child(dip
), error
);
416 check_xpvd(dev_info_t
*dip
, void *arg
)
420 name
= ddi_node_name(dip
);
421 if (name
== NULL
|| strcmp(name
, "xpvd")) {
422 return (DDI_WALK_CONTINUE
);
425 return (DDI_WALK_TERMINATE
);
430 * Top level routine to direct suspend/resume of a domain.
433 xen_suspend_domain(void)
435 extern void rtcsync(void);
436 extern void ec_resume(void);
437 extern kmutex_t ec_lock
;
438 struct xen_add_to_physmap xatp
;
442 cmn_err(CE_NOTE
, "Domain suspending for save/migrate");
444 SUSPEND_DEBUG("xen_suspend_domain\n");
447 * We only want to suspend the PV devices, since the emulated devices
448 * are suspended by saving the emulated device state. The PV devices
449 * are all children of the xpvd nexus device. So we search the
450 * device tree for the xpvd node to use as the root of the tree to
453 if (xpvd_dip
== NULL
)
454 ddi_walk_devs(ddi_root_node(), check_xpvd
, NULL
);
457 * suspend interrupts and devices
459 if (xpvd_dip
!= NULL
)
460 (void) xen_suspend_devices(ddi_get_child(xpvd_dip
));
462 cmn_err(CE_WARN
, "No PV devices found to suspend");
463 SUSPEND_DEBUG("xenbus_suspend\n");
466 mutex_enter(&cpu_lock
);
471 thread_affinity_set(curthread
, 0);
475 pause_cpus(NULL
, NULL
);
477 * We can grab the ec_lock as it's a spinlock with a high SPL. Hence
478 * any holder would have dropped it to get through pause_cpus().
480 mutex_enter(&ec_lock
);
483 * From here on in, we can't take locks.
486 flags
= intr_clear();
488 SUSPEND_DEBUG("HYPERVISOR_suspend\n");
490 * At this point we suspend and sometime later resume.
491 * Note that this call may return with an indication of a cancelled
492 * suspend; for now, no matter what the return, we do a full resume of all
493 * suspended drivers, etc.
495 (void) HYPERVISOR_shutdown(SHUTDOWN_suspend
);
498 * Point HYPERVISOR_shared_info to the proper place.
500 xatp
.domid
= DOMID_SELF
;
502 xatp
.space
= XENMAPSPACE_shared_info
;
503 xatp
.gpfn
= xen_shared_info_frame
;
504 if ((err
= HYPERVISOR_memory_op(XENMEM_add_to_physmap
, &xatp
)) != 0)
505 panic("Could not set shared_info page. error: %d", err
);
507 SUSPEND_DEBUG("gnttab_resume\n");
510 SUSPEND_DEBUG("ec_resume\n");
518 mutex_exit(&ec_lock
);
519 mutex_exit(&cpu_lock
);
522 * Now we can take locks again.
527 SUSPEND_DEBUG("xenbus_resume\n");
529 SUSPEND_DEBUG("xen_resume_devices\n");
530 if (xpvd_dip
!= NULL
)
531 (void) xen_resume_devices(ddi_get_child(xpvd_dip
), 0);
533 thread_affinity_clear(curthread
);
536 SUSPEND_DEBUG("finished xen_suspend_domain\n");
538 cmn_err(CE_NOTE
, "domain restore/migrate completed");
542 xen_dirty_shutdown(void *arg
)
544 int cmd
= (uintptr_t)arg
;
546 cmn_err(CE_WARN
, "Externally requested shutdown failed or "
547 "timed out.\nShutting down.\n");
551 case SHUTDOWN_POWEROFF
:
552 (void) kadmin(A_SHUTDOWN
, AD_POWEROFF
, NULL
, kcred
);
554 case SHUTDOWN_REBOOT
:
555 (void) kadmin(A_REBOOT
, AD_BOOT
, NULL
, kcred
);
561 xen_shutdown(void *arg
)
563 int cmd
= (uintptr_t)arg
;
566 ASSERT(cmd
> SHUTDOWN_INVALID
&& cmd
< SHUTDOWN_MAX
);
568 if (cmd
== SHUTDOWN_SUSPEND
) {
569 xen_suspend_domain();
574 case SHUTDOWN_POWEROFF
:
575 force_shutdown_method
= AD_POWEROFF
;
578 force_shutdown_method
= AD_HALT
;
580 case SHUTDOWN_REBOOT
:
581 force_shutdown_method
= AD_BOOT
;
587 * If we're still booting and init(1) isn't set up yet, simply halt.
589 mutex_enter(&pidlock
);
590 initpp
= prfind(P_INITPID
);
591 mutex_exit(&pidlock
);
592 if (initpp
== NULL
) {
593 extern void halt(char *);
594 halt("Power off the System"); /* just in case */
598 * else, graceful shutdown with inittab and all getting involved
600 psignal(initpp
, SIGPWR
);
602 (void) timeout(xen_dirty_shutdown
, arg
,
603 SHUTDOWN_TIMEOUT_SECS
* drv_usectohz(MICROSEC
));
608 xen_shutdown_handler(struct xenbus_watch
*watch
, const char **vec
,
612 xenbus_transaction_t xbt
;
613 int err
, shutdown_code
= SHUTDOWN_INVALID
;
617 err
= xenbus_transaction_start(&xbt
);
620 if (xenbus_read(xbt
, "control", "shutdown", (void *)&str
, &slen
)) {
621 (void) xenbus_transaction_end(xbt
, 1);
625 SUSPEND_DEBUG("%d: xen_shutdown_handler: \"%s\"\n", CPU
->cpu_id
, str
);
628 * If this is a watch fired from our write below, check out early to
629 * avoid an infinite loop.
631 if (strcmp(str
, "") == 0) {
632 (void) xenbus_transaction_end(xbt
, 0);
633 kmem_free(str
, slen
);
635 } else if (strcmp(str
, "poweroff") == 0) {
636 shutdown_code
= SHUTDOWN_POWEROFF
;
637 } else if (strcmp(str
, "reboot") == 0) {
638 shutdown_code
= SHUTDOWN_REBOOT
;
639 } else if (strcmp(str
, "suspend") == 0) {
640 shutdown_code
= SHUTDOWN_SUSPEND
;
641 } else if (strcmp(str
, "halt") == 0) {
642 shutdown_code
= SHUTDOWN_HALT
;
644 printf("Ignoring shutdown request: %s\n", str
);
647 (void) xenbus_write(xbt
, "control", "shutdown", "");
648 err
= xenbus_transaction_end(xbt
, 0);
650 SUSPEND_DEBUG("%d: trying again\n", CPU
->cpu_id
);
651 kmem_free(str
, slen
);
655 kmem_free(str
, slen
);
656 if (shutdown_code
!= SHUTDOWN_INVALID
) {
657 (void) taskq_dispatch(xen_shutdown_tq
, xen_shutdown
,
658 (void *)(intptr_t)shutdown_code
, 0);
665 if (xpv_feature(XPVF_HYPERCALLS
) < 0 ||
666 xpv_feature(XPVF_SHARED_INFO
) < 0)
669 /* Set up the grant tables. */
672 /* Set up event channel support */
677 xb_addr
= vmem_alloc(heap_arena
, MMU_PAGESIZE
, VM_SLEEP
);
681 /* Set up for suspend/resume/migrate */
682 xen_shutdown_tq
= taskq_create("shutdown_taskq", 1,
683 maxclsyspri
- 1, 1, 1, TASKQ_PREPOPULATE
);
684 shutdown_watch
.node
= "control/shutdown";
685 shutdown_watch
.callback
= xen_shutdown_handler
;
686 if (register_xenbus_watch(&shutdown_watch
))
687 cmn_err(CE_WARN
, "Failed to set shutdown watcher");
700 xpv_getinfo(dev_info_t
*dip
, ddi_info_cmd_t cmd
, void *arg
, void **result
)
702 if (getminor((dev_t
)arg
) != XPV_MINOR
)
703 return (DDI_FAILURE
);
706 case DDI_INFO_DEVT2DEVINFO
:
709 case DDI_INFO_DEVT2INSTANCE
:
713 return (DDI_FAILURE
);
716 return (DDI_SUCCESS
);
720 xpv_attach(dev_info_t
*dip
, ddi_attach_cmd_t cmd
)
722 if (cmd
!= DDI_ATTACH
)
723 return (DDI_FAILURE
);
725 if (ddi_create_minor_node(dip
, ddi_get_name(dip
), S_IFCHR
,
726 ddi_get_instance(dip
), DDI_PSEUDO
, 0) != DDI_SUCCESS
)
727 return (DDI_FAILURE
);
731 if (xpv_drv_init() != 0)
732 return (DDI_FAILURE
);
737 * If the memscrubber attempts to scrub the pages we hand to Xen,
738 * the domain will panic.
743 * Report our version to dom0.
745 if (xenbus_printf(XBT_NULL
, "guest/xpv", "version", "%d",
747 cmn_err(CE_WARN
, "xpv: couldn't write version\n");
749 return (DDI_SUCCESS
);
753 * Attempts to reload the PV driver plumbing hang on Intel platforms, so
754 * we don't want to unload the framework by accident.
756 int xpv_allow_detach
= 0;
759 xpv_detach(dev_info_t
*dip
, ddi_detach_cmd_t cmd
)
761 if (cmd
!= DDI_DETACH
|| xpv_allow_detach
== 0)
762 return (DDI_FAILURE
);
764 if (xpv_dip
!= NULL
) {
766 ddi_remove_minor_node(dip
, NULL
);
770 return (DDI_SUCCESS
);
775 xpv_open(dev_t
*dev
, int flag
, int otyp
, cred_t
*cr
)
777 return (getminor(*dev
) == XPV_MINOR
? 0 : ENXIO
);
782 xpv_ioctl(dev_t dev
, int cmd
, intptr_t arg
, int mode
, cred_t
*cr
,
793 if ((err
= mod_install(&modl
)) != 0)
796 impl_bus_add_probe(xpv_enumerate
);
805 if ((err
= mod_remove(&modl
)) != 0)
808 impl_bus_delete_probe(xpv_enumerate
);
813 _info(struct modinfo
*modinfop
)
815 return (mod_info(&modl
, modinfop
));