/*
 * Migration support for VFIO devices
 *
 * Copyright NVIDIA, Inc. 2020
 *
 * This work is licensed under the terms of the GNU GPL, version 2. See
 * the COPYING file in the top-level directory.
 */
10 #include "qemu/osdep.h"
11 #include "qemu/main-loop.h"
12 #include "qemu/cutils.h"
13 #include "qemu/units.h"
14 #include "qemu/error-report.h"
15 #include <linux/vfio.h>
16 #include <sys/ioctl.h>
18 #include "sysemu/runstate.h"
19 #include "hw/vfio/vfio-common.h"
20 #include "migration/migration.h"
21 #include "migration/vmstate.h"
22 #include "migration/qemu-file.h"
23 #include "migration/register.h"
24 #include "migration/blocker.h"
25 #include "migration/misc.h"
26 #include "qapi/error.h"
27 #include "exec/ramlist.h"
28 #include "exec/ram_addr.h"
/*
 * Flags to be used as unique delimiters for VFIO devices in the migration
 * stream. These flags are composed as:
 * 0xffffffff => MSB 32-bit all 1s
 * 0xef10     => Magic ID, represents emulated (virtual) function IO
 * 0x0000     => 16-bits reserved for flags
 *
 * The beginning of state information is marked by _DEV_CONFIG_STATE,
 * _DEV_SETUP_STATE, or _DEV_DATA_STATE, respectively. The end of a
 * certain state information is marked by _END_OF_STATE.
 */
#define VFIO_MIG_FLAG_END_OF_STATE      (0xffffffffef100001ULL)
#define VFIO_MIG_FLAG_DEV_CONFIG_STATE  (0xffffffffef100002ULL)
#define VFIO_MIG_FLAG_DEV_SETUP_STATE   (0xffffffffef100003ULL)
#define VFIO_MIG_FLAG_DEV_DATA_STATE    (0xffffffffef100004ULL)

/*
 * This is an arbitrary size based on migration of mlx5 devices, where typically
 * total device migration size is on the order of 100s of MB. Testing with
 * larger values, e.g. 128MB and 1GB, did not show a performance improvement.
 */
#define VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE (1 * MiB)

/* Running total of device-data bytes written to the migration stream. */
static int64_t bytes_transferred;
58 static const char *mig_state_to_str(enum vfio_device_mig_state state
)
61 case VFIO_DEVICE_STATE_ERROR
:
63 case VFIO_DEVICE_STATE_STOP
:
65 case VFIO_DEVICE_STATE_RUNNING
:
67 case VFIO_DEVICE_STATE_STOP_COPY
:
69 case VFIO_DEVICE_STATE_RESUMING
:
72 return "UNKNOWN STATE";
76 static int vfio_migration_set_state(VFIODevice
*vbasedev
,
77 enum vfio_device_mig_state new_state
,
78 enum vfio_device_mig_state recover_state
)
80 VFIOMigration
*migration
= vbasedev
->migration
;
81 uint64_t buf
[DIV_ROUND_UP(sizeof(struct vfio_device_feature
) +
82 sizeof(struct vfio_device_feature_mig_state
),
83 sizeof(uint64_t))] = {};
84 struct vfio_device_feature
*feature
= (struct vfio_device_feature
*)buf
;
85 struct vfio_device_feature_mig_state
*mig_state
=
86 (struct vfio_device_feature_mig_state
*)feature
->data
;
89 feature
->argsz
= sizeof(buf
);
91 VFIO_DEVICE_FEATURE_SET
| VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE
;
92 mig_state
->device_state
= new_state
;
93 if (ioctl(vbasedev
->fd
, VFIO_DEVICE_FEATURE
, feature
)) {
94 /* Try to set the device in some good state */
97 if (recover_state
== VFIO_DEVICE_STATE_ERROR
) {
98 error_report("%s: Failed setting device state to %s, err: %s. "
99 "Recover state is ERROR. Resetting device",
100 vbasedev
->name
, mig_state_to_str(new_state
),
107 "%s: Failed setting device state to %s, err: %s. Setting device in recover state %s",
108 vbasedev
->name
, mig_state_to_str(new_state
),
109 strerror(errno
), mig_state_to_str(recover_state
));
111 mig_state
->device_state
= recover_state
;
112 if (ioctl(vbasedev
->fd
, VFIO_DEVICE_FEATURE
, feature
)) {
115 "%s: Failed setting device in recover state, err: %s. Resetting device",
116 vbasedev
->name
, strerror(errno
));
121 migration
->device_state
= recover_state
;
126 migration
->device_state
= new_state
;
127 if (mig_state
->data_fd
!= -1) {
128 if (migration
->data_fd
!= -1) {
130 * This can happen if the device is asynchronously reset and
131 * terminates a data transfer.
133 error_report("%s: data_fd out of sync", vbasedev
->name
);
134 close(mig_state
->data_fd
);
139 migration
->data_fd
= mig_state
->data_fd
;
142 trace_vfio_migration_set_state(vbasedev
->name
, mig_state_to_str(new_state
));
147 if (ioctl(vbasedev
->fd
, VFIO_DEVICE_RESET
)) {
148 hw_error("%s: Failed resetting device, err: %s", vbasedev
->name
,
152 migration
->device_state
= VFIO_DEVICE_STATE_RUNNING
;
157 static int vfio_load_buffer(QEMUFile
*f
, VFIODevice
*vbasedev
,
160 VFIOMigration
*migration
= vbasedev
->migration
;
163 ret
= qemu_file_get_to_fd(f
, migration
->data_fd
, data_size
);
164 trace_vfio_load_state_device_data(vbasedev
->name
, data_size
, ret
);
169 static int vfio_save_device_config_state(QEMUFile
*f
, void *opaque
)
171 VFIODevice
*vbasedev
= opaque
;
173 qemu_put_be64(f
, VFIO_MIG_FLAG_DEV_CONFIG_STATE
);
175 if (vbasedev
->ops
&& vbasedev
->ops
->vfio_save_config
) {
176 vbasedev
->ops
->vfio_save_config(vbasedev
, f
);
179 qemu_put_be64(f
, VFIO_MIG_FLAG_END_OF_STATE
);
181 trace_vfio_save_device_config_state(vbasedev
->name
);
183 return qemu_file_get_error(f
);
186 static int vfio_load_device_config_state(QEMUFile
*f
, void *opaque
)
188 VFIODevice
*vbasedev
= opaque
;
191 if (vbasedev
->ops
&& vbasedev
->ops
->vfio_load_config
) {
194 ret
= vbasedev
->ops
->vfio_load_config(vbasedev
, f
);
196 error_report("%s: Failed to load device config space",
202 data
= qemu_get_be64(f
);
203 if (data
!= VFIO_MIG_FLAG_END_OF_STATE
) {
204 error_report("%s: Failed loading device config space, "
205 "end flag incorrect 0x%"PRIx64
, vbasedev
->name
, data
);
209 trace_vfio_load_device_config_state(vbasedev
->name
);
210 return qemu_file_get_error(f
);
213 static void vfio_migration_cleanup(VFIODevice
*vbasedev
)
215 VFIOMigration
*migration
= vbasedev
->migration
;
217 close(migration
->data_fd
);
218 migration
->data_fd
= -1;
221 static int vfio_query_stop_copy_size(VFIODevice
*vbasedev
,
222 uint64_t *stop_copy_size
)
224 uint64_t buf
[DIV_ROUND_UP(sizeof(struct vfio_device_feature
) +
225 sizeof(struct vfio_device_feature_mig_data_size
),
226 sizeof(uint64_t))] = {};
227 struct vfio_device_feature
*feature
= (struct vfio_device_feature
*)buf
;
228 struct vfio_device_feature_mig_data_size
*mig_data_size
=
229 (struct vfio_device_feature_mig_data_size
*)feature
->data
;
231 feature
->argsz
= sizeof(buf
);
233 VFIO_DEVICE_FEATURE_GET
| VFIO_DEVICE_FEATURE_MIG_DATA_SIZE
;
235 if (ioctl(vbasedev
->fd
, VFIO_DEVICE_FEATURE
, feature
)) {
239 *stop_copy_size
= mig_data_size
->stop_copy_length
;
244 /* Returns 1 if end-of-stream is reached, 0 if more data and -errno if error */
245 static int vfio_save_block(QEMUFile
*f
, VFIOMigration
*migration
)
249 data_size
= read(migration
->data_fd
, migration
->data_buffer
,
250 migration
->data_buffer_size
);
254 if (data_size
== 0) {
258 qemu_put_be64(f
, VFIO_MIG_FLAG_DEV_DATA_STATE
);
259 qemu_put_be64(f
, data_size
);
260 qemu_put_buffer(f
, migration
->data_buffer
, data_size
);
261 bytes_transferred
+= data_size
;
263 trace_vfio_save_block(migration
->vbasedev
->name
, data_size
);
265 return qemu_file_get_error(f
);
268 /* ---------------------------------------------------------------------- */
270 static int vfio_save_setup(QEMUFile
*f
, void *opaque
)
272 VFIODevice
*vbasedev
= opaque
;
273 VFIOMigration
*migration
= vbasedev
->migration
;
274 uint64_t stop_copy_size
= VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE
;
276 qemu_put_be64(f
, VFIO_MIG_FLAG_DEV_SETUP_STATE
);
278 vfio_query_stop_copy_size(vbasedev
, &stop_copy_size
);
279 migration
->data_buffer_size
= MIN(VFIO_MIG_DEFAULT_DATA_BUFFER_SIZE
,
281 migration
->data_buffer
= g_try_malloc0(migration
->data_buffer_size
);
282 if (!migration
->data_buffer
) {
283 error_report("%s: Failed to allocate migration data buffer",
288 trace_vfio_save_setup(vbasedev
->name
, migration
->data_buffer_size
);
290 qemu_put_be64(f
, VFIO_MIG_FLAG_END_OF_STATE
);
292 return qemu_file_get_error(f
);
295 static void vfio_save_cleanup(void *opaque
)
297 VFIODevice
*vbasedev
= opaque
;
298 VFIOMigration
*migration
= vbasedev
->migration
;
300 g_free(migration
->data_buffer
);
301 migration
->data_buffer
= NULL
;
302 vfio_migration_cleanup(vbasedev
);
303 trace_vfio_save_cleanup(vbasedev
->name
);
307 * Migration size of VFIO devices can be as little as a few KBs or as big as
308 * many GBs. This value should be big enough to cover the worst case.
310 #define VFIO_MIG_STOP_COPY_SIZE (100 * GiB)
313 * Only exact function is implemented and not estimate function. The reason is
314 * that during pre-copy phase of migration the estimate function is called
315 * repeatedly while pending RAM size is over the threshold, thus migration
316 * can't converge and querying the VFIO device pending data size is useless.
318 static void vfio_state_pending_exact(void *opaque
, uint64_t *must_precopy
,
319 uint64_t *can_postcopy
)
321 VFIODevice
*vbasedev
= opaque
;
322 uint64_t stop_copy_size
= VFIO_MIG_STOP_COPY_SIZE
;
325 * If getting pending migration size fails, VFIO_MIG_STOP_COPY_SIZE is
326 * reported so downtime limit won't be violated.
328 vfio_query_stop_copy_size(vbasedev
, &stop_copy_size
);
329 *must_precopy
+= stop_copy_size
;
331 trace_vfio_state_pending_exact(vbasedev
->name
, *must_precopy
, *can_postcopy
,
335 static int vfio_save_complete_precopy(QEMUFile
*f
, void *opaque
)
337 VFIODevice
*vbasedev
= opaque
;
340 /* We reach here with device state STOP only */
341 ret
= vfio_migration_set_state(vbasedev
, VFIO_DEVICE_STATE_STOP_COPY
,
342 VFIO_DEVICE_STATE_STOP
);
348 ret
= vfio_save_block(f
, vbasedev
->migration
);
354 qemu_put_be64(f
, VFIO_MIG_FLAG_END_OF_STATE
);
355 ret
= qemu_file_get_error(f
);
361 * If setting the device in STOP state fails, the device should be reset.
362 * To do so, use ERROR state as a recover state.
364 ret
= vfio_migration_set_state(vbasedev
, VFIO_DEVICE_STATE_STOP
,
365 VFIO_DEVICE_STATE_ERROR
);
366 trace_vfio_save_complete_precopy(vbasedev
->name
, ret
);
371 static void vfio_save_state(QEMUFile
*f
, void *opaque
)
373 VFIODevice
*vbasedev
= opaque
;
376 ret
= vfio_save_device_config_state(f
, opaque
);
378 error_report("%s: Failed to save device config space",
380 qemu_file_set_error(f
, ret
);
384 static int vfio_load_setup(QEMUFile
*f
, void *opaque
)
386 VFIODevice
*vbasedev
= opaque
;
388 return vfio_migration_set_state(vbasedev
, VFIO_DEVICE_STATE_RESUMING
,
389 vbasedev
->migration
->device_state
);
392 static int vfio_load_cleanup(void *opaque
)
394 VFIODevice
*vbasedev
= opaque
;
396 vfio_migration_cleanup(vbasedev
);
397 trace_vfio_load_cleanup(vbasedev
->name
);
402 static int vfio_load_state(QEMUFile
*f
, void *opaque
, int version_id
)
404 VFIODevice
*vbasedev
= opaque
;
408 data
= qemu_get_be64(f
);
409 while (data
!= VFIO_MIG_FLAG_END_OF_STATE
) {
411 trace_vfio_load_state(vbasedev
->name
, data
);
414 case VFIO_MIG_FLAG_DEV_CONFIG_STATE
:
416 return vfio_load_device_config_state(f
, opaque
);
418 case VFIO_MIG_FLAG_DEV_SETUP_STATE
:
420 data
= qemu_get_be64(f
);
421 if (data
== VFIO_MIG_FLAG_END_OF_STATE
) {
424 error_report("%s: SETUP STATE: EOS not found 0x%"PRIx64
,
425 vbasedev
->name
, data
);
430 case VFIO_MIG_FLAG_DEV_DATA_STATE
:
432 uint64_t data_size
= qemu_get_be64(f
);
435 ret
= vfio_load_buffer(f
, vbasedev
, data_size
);
443 error_report("%s: Unknown tag 0x%"PRIx64
, vbasedev
->name
, data
);
447 data
= qemu_get_be64(f
);
448 ret
= qemu_file_get_error(f
);
456 static const SaveVMHandlers savevm_vfio_handlers
= {
457 .save_setup
= vfio_save_setup
,
458 .save_cleanup
= vfio_save_cleanup
,
459 .state_pending_exact
= vfio_state_pending_exact
,
460 .save_live_complete_precopy
= vfio_save_complete_precopy
,
461 .save_state
= vfio_save_state
,
462 .load_setup
= vfio_load_setup
,
463 .load_cleanup
= vfio_load_cleanup
,
464 .load_state
= vfio_load_state
,
467 /* ---------------------------------------------------------------------- */
469 static void vfio_vmstate_change(void *opaque
, bool running
, RunState state
)
471 VFIODevice
*vbasedev
= opaque
;
472 enum vfio_device_mig_state new_state
;
476 new_state
= VFIO_DEVICE_STATE_RUNNING
;
478 new_state
= VFIO_DEVICE_STATE_STOP
;
482 * If setting the device in new_state fails, the device should be reset.
483 * To do so, use ERROR state as a recover state.
485 ret
= vfio_migration_set_state(vbasedev
, new_state
,
486 VFIO_DEVICE_STATE_ERROR
);
489 * Migration should be aborted in this case, but vm_state_notify()
490 * currently does not support reporting failures.
492 if (migrate_get_current()->to_dst_file
) {
493 qemu_file_set_error(migrate_get_current()->to_dst_file
, ret
);
497 trace_vfio_vmstate_change(vbasedev
->name
, running
, RunState_str(state
),
498 mig_state_to_str(new_state
));
501 static void vfio_migration_state_notifier(Notifier
*notifier
, void *data
)
503 MigrationState
*s
= data
;
504 VFIOMigration
*migration
= container_of(notifier
, VFIOMigration
,
506 VFIODevice
*vbasedev
= migration
->vbasedev
;
508 trace_vfio_migration_state_notifier(vbasedev
->name
,
509 MigrationStatus_str(s
->state
));
512 case MIGRATION_STATUS_CANCELLING
:
513 case MIGRATION_STATUS_CANCELLED
:
514 case MIGRATION_STATUS_FAILED
:
515 bytes_transferred
= 0;
517 * If setting the device in RUNNING state fails, the device should
518 * be reset. To do so, use ERROR state as a recover state.
520 vfio_migration_set_state(vbasedev
, VFIO_DEVICE_STATE_RUNNING
,
521 VFIO_DEVICE_STATE_ERROR
);
525 static void vfio_migration_free(VFIODevice
*vbasedev
)
527 g_free(vbasedev
->migration
);
528 vbasedev
->migration
= NULL
;
531 static int vfio_migration_query_flags(VFIODevice
*vbasedev
, uint64_t *mig_flags
)
533 uint64_t buf
[DIV_ROUND_UP(sizeof(struct vfio_device_feature
) +
534 sizeof(struct vfio_device_feature_migration
),
535 sizeof(uint64_t))] = {};
536 struct vfio_device_feature
*feature
= (struct vfio_device_feature
*)buf
;
537 struct vfio_device_feature_migration
*mig
=
538 (struct vfio_device_feature_migration
*)feature
->data
;
540 feature
->argsz
= sizeof(buf
);
541 feature
->flags
= VFIO_DEVICE_FEATURE_GET
| VFIO_DEVICE_FEATURE_MIGRATION
;
542 if (ioctl(vbasedev
->fd
, VFIO_DEVICE_FEATURE
, feature
)) {
543 if (errno
== ENOTTY
) {
544 error_report("%s: VFIO migration is not supported in kernel",
547 error_report("%s: Failed to query VFIO migration support, err: %s",
548 vbasedev
->name
, strerror(errno
));
554 *mig_flags
= mig
->flags
;
559 static bool vfio_dma_logging_supported(VFIODevice
*vbasedev
)
561 uint64_t buf
[DIV_ROUND_UP(sizeof(struct vfio_device_feature
),
562 sizeof(uint64_t))] = {};
563 struct vfio_device_feature
*feature
= (struct vfio_device_feature
*)buf
;
565 feature
->argsz
= sizeof(buf
);
566 feature
->flags
= VFIO_DEVICE_FEATURE_PROBE
|
567 VFIO_DEVICE_FEATURE_DMA_LOGGING_START
;
569 return !ioctl(vbasedev
->fd
, VFIO_DEVICE_FEATURE
, feature
);
572 static int vfio_migration_init(VFIODevice
*vbasedev
)
576 VFIOMigration
*migration
;
578 g_autofree
char *path
= NULL
, *oid
= NULL
;
579 uint64_t mig_flags
= 0;
581 if (!vbasedev
->ops
->vfio_get_object
) {
585 obj
= vbasedev
->ops
->vfio_get_object(vbasedev
);
590 ret
= vfio_migration_query_flags(vbasedev
, &mig_flags
);
595 /* Basic migration functionality must be supported */
596 if (!(mig_flags
& VFIO_MIGRATION_STOP_COPY
)) {
600 vbasedev
->migration
= g_new0(VFIOMigration
, 1);
601 migration
= vbasedev
->migration
;
602 migration
->vbasedev
= vbasedev
;
603 migration
->device_state
= VFIO_DEVICE_STATE_RUNNING
;
604 migration
->data_fd
= -1;
606 vbasedev
->dirty_pages_supported
= vfio_dma_logging_supported(vbasedev
);
608 oid
= vmstate_if_get_id(VMSTATE_IF(DEVICE(obj
)));
610 path
= g_strdup_printf("%s/vfio", oid
);
612 path
= g_strdup("vfio");
614 strpadcpy(id
, sizeof(id
), path
, '\0');
616 register_savevm_live(id
, VMSTATE_INSTANCE_ID_ANY
, 1, &savevm_vfio_handlers
,
619 migration
->vm_state
= qdev_add_vm_change_state_handler(vbasedev
->dev
,
622 migration
->migration_state
.notify
= vfio_migration_state_notifier
;
623 add_migration_state_change_notifier(&migration
->migration_state
);
628 /* ---------------------------------------------------------------------- */
630 int64_t vfio_mig_bytes_transferred(void)
632 return bytes_transferred
;
635 int vfio_migration_realize(VFIODevice
*vbasedev
, Error
**errp
)
639 if (!vbasedev
->enable_migration
) {
643 ret
= vfio_migration_init(vbasedev
);
648 ret
= vfio_block_multiple_devices_migration(errp
);
653 ret
= vfio_block_giommu_migration(errp
);
658 trace_vfio_migration_probe(vbasedev
->name
);
662 error_setg(&vbasedev
->migration_blocker
,
663 "VFIO device doesn't support migration");
665 ret
= migrate_add_blocker(vbasedev
->migration_blocker
, errp
);
667 error_free(vbasedev
->migration_blocker
);
668 vbasedev
->migration_blocker
= NULL
;
673 void vfio_migration_exit(VFIODevice
*vbasedev
)
675 if (vbasedev
->migration
) {
676 VFIOMigration
*migration
= vbasedev
->migration
;
678 remove_migration_state_change_notifier(&migration
->migration_state
);
679 qemu_del_vm_change_state_handler(migration
->vm_state
);
680 unregister_savevm(VMSTATE_IF(vbasedev
->dev
), "vfio", vbasedev
);
681 vfio_migration_free(vbasedev
);
682 vfio_unblock_multiple_devices_migration();
685 if (vbasedev
->migration_blocker
) {
686 migrate_del_blocker(vbasedev
->migration_blocker
);
687 error_free(vbasedev
->migration_blocker
);
688 vbasedev
->migration_blocker
= NULL
;