// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/vfio_pci_core.h>
#include <linux/virtio_pci.h>
#include <linux/virtio_net.h>
#include <linux/virtio_pci_admin.h>
#include <linux/anon_inodes.h>

#include "common.h"

/* Device specification max parts size */
#define MAX_LOAD_SIZE (BIT_ULL(BITS_PER_TYPE \
	(((struct virtio_admin_cmd_dev_parts_metadata_result *)0)->parts_size.size)) - 1)

/* Initial target buffer size */
#define VIRTIOVF_TARGET_INITIAL_BUF_SIZE	SZ_1M

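/*
 * The device state is transferred as a stream of records: each device parts
 * chunk produced by virtiovf_read_device_context_chunk() is preceded by a
 * struct virtiovf_migration_header carrying the record size, flags and tag.
 * The resume side parses these records in virtiovf_resume_write() and pushes
 * each chunk back to the device with a
 * struct virtio_admin_cmd_resource_obj_cmd_hdr prepended.
 */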
static int
virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
				   u32 ctx_size);

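/*
 * Translate a byte offset inside a migration data buffer to its backing page.
 * Accesses are sequential, so the last scatterlist entry that was hit is
 * cached (last_offset_sg/sg_last_entry) to avoid rescanning the table from
 * its head on every call.
 */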
static struct page *
virtiovf_get_migration_page(struct virtiovf_data_buffer *buf,
			    unsigned long offset)
{
	unsigned long cur_offset = 0;
	struct scatterlist *sg;
	unsigned int i;

	/* All accesses are sequential */
	if (offset < buf->last_offset || !buf->last_offset_sg) {
		buf->last_offset = 0;
		buf->last_offset_sg = buf->table.sgt.sgl;
		buf->sg_last_entry = 0;
	}

	cur_offset = buf->last_offset;

	for_each_sg(buf->last_offset_sg, sg,
		    buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
		if (offset < sg->length + cur_offset) {
			buf->last_offset_sg = sg;
			buf->sg_last_entry += i;
			buf->last_offset = cur_offset;
			return nth_page(sg_page(sg),
					(offset - cur_offset) / PAGE_SIZE);
		}
		cur_offset += sg->length;
	}
	return NULL;
}

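/*
 * Grow a migration data buffer by 'npages' pages: allocate pages in bulk and
 * append them to the buffer's scatter-gather append table.
 */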
static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf,
					unsigned int npages)
{
	unsigned int to_alloc = npages;
	struct page **page_list;
	unsigned long filled;
	unsigned int to_fill;
	int ret;
	int i;

	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
	page_list = kvcalloc(to_fill, sizeof(*page_list), GFP_KERNEL_ACCOUNT);
	if (!page_list)
		return -ENOMEM;

	do {
		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
						page_list);
		if (!filled) {
			ret = -ENOMEM;
			goto err;
		}

		to_alloc -= filled;
		ret = sg_alloc_append_table_from_pages(&buf->table, page_list,
			filled, 0, filled << PAGE_SHIFT, UINT_MAX,
			SG_MAX_SINGLE_ALLOC, GFP_KERNEL_ACCOUNT);
		if (ret)
			goto err_append;

		buf->allocated_length += filled * PAGE_SIZE;
		/* clean input for another bulk allocation */
		memset(page_list, 0, filled * sizeof(*page_list));
		to_fill = min_t(unsigned int, to_alloc,
				PAGE_SIZE / sizeof(*page_list));
	} while (to_alloc > 0);

	kvfree(page_list);
	return 0;

err_append:
	for (i = filled - 1; i >= 0; i--)
		__free_page(page_list[i]);
err:
	kvfree(page_list);
	return ret;
}

static void virtiovf_free_data_buffer(struct virtiovf_data_buffer *buf)
{
	struct sg_page_iter sg_iter;

	/* Undo alloc_pages_bulk_array() */
	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
		__free_page(sg_page_iter_page(&sg_iter));
	sg_free_append_table(&buf->table);
	kfree(buf);
}

static struct virtiovf_data_buffer *
virtiovf_alloc_data_buffer(struct virtiovf_migration_file *migf, size_t length)
{
	struct virtiovf_data_buffer *buf;
	int ret;

	buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
	if (!buf)
		return ERR_PTR(-ENOMEM);

	ret = virtiovf_add_migration_pages(buf,
				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
	if (ret)
		goto end;

	buf->migf = migf;
	return buf;

end:
	virtiovf_free_data_buffer(buf);
	return ERR_PTR(ret);
}

static void virtiovf_put_data_buffer(struct virtiovf_data_buffer *buf)
{
	spin_lock_irq(&buf->migf->list_lock);
	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
	spin_unlock_irq(&buf->migf->list_lock);
}

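/*
 * Device parts are accessed through a resource object which is created and
 * destroyed over the virtio admin queue. The returned object id is passed to
 * the subsequent dev parts get/set/metadata admin commands.
 */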
static int
virtiovf_pci_alloc_obj_id(struct virtiovf_pci_core_device *virtvdev, u8 type,
			  u32 *obj_id)
{
	return virtio_pci_admin_obj_create(virtvdev->core_device.pdev,
					   VIRTIO_RESOURCE_OBJ_DEV_PARTS, type, obj_id);
}

static void
virtiovf_pci_free_obj_id(struct virtiovf_pci_core_device *virtvdev, u32 obj_id)
{
	virtio_pci_admin_obj_destroy(virtvdev->core_device.pdev,
				     VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id);
}

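/*
 * Pick a buffer from the available list that is large enough for 'length',
 * or allocate a new one. Buffers that were examined but are too small are
 * moved to a local free list and released outside the spinlock.
 */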
static struct virtiovf_data_buffer *
virtiovf_get_data_buffer(struct virtiovf_migration_file *migf, size_t length)
{
	struct virtiovf_data_buffer *buf, *temp_buf;
	struct list_head free_list;

	INIT_LIST_HEAD(&free_list);

	spin_lock_irq(&migf->list_lock);
	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
		list_del_init(&buf->buf_elm);
		if (buf->allocated_length >= length) {
			spin_unlock_irq(&migf->list_lock);
			goto found;
		}
		/*
		 * Prevent holding redundant buffers. Put in a free
		 * list and call at the end not under the spin lock
		 * (&migf->list_lock) to minimize its scope usage.
		 */
		list_add(&buf->buf_elm, &free_list);
	}

	spin_unlock_irq(&migf->list_lock);
	buf = virtiovf_alloc_data_buffer(migf, length);

found:
	while ((temp_buf = list_first_entry_or_null(&free_list,
				struct virtiovf_data_buffer, buf_elm))) {
		list_del(&temp_buf->buf_elm);
		virtiovf_free_data_buffer(temp_buf);
	}

	return buf;
}

static void virtiovf_clean_migf_resources(struct virtiovf_migration_file *migf)
{
	struct virtiovf_data_buffer *entry;

	if (migf->buf) {
		virtiovf_free_data_buffer(migf->buf);
		migf->buf = NULL;
	}

	if (migf->buf_header) {
		virtiovf_free_data_buffer(migf->buf_header);
		migf->buf_header = NULL;
	}

	list_splice(&migf->avail_list, &migf->buf_list);

	while ((entry = list_first_entry_or_null(&migf->buf_list,
				struct virtiovf_data_buffer, buf_elm))) {
		list_del(&entry->buf_elm);
		virtiovf_free_data_buffer(entry);
	}

	if (migf->has_obj_id)
		virtiovf_pci_free_obj_id(migf->virtvdev, migf->obj_id);
}

static void virtiovf_disable_fd(struct virtiovf_migration_file *migf)
{
	mutex_lock(&migf->lock);
	migf->state = VIRTIOVF_MIGF_STATE_ERROR;
	migf->filp->f_pos = 0;
	mutex_unlock(&migf->lock);
}

static void virtiovf_disable_fds(struct virtiovf_pci_core_device *virtvdev)
{
	if (virtvdev->resuming_migf) {
		virtiovf_disable_fd(virtvdev->resuming_migf);
		virtiovf_clean_migf_resources(virtvdev->resuming_migf);
		fput(virtvdev->resuming_migf->filp);
		virtvdev->resuming_migf = NULL;
	}

	if (virtvdev->saving_migf) {
		virtiovf_disable_fd(virtvdev->saving_migf);
		virtiovf_clean_migf_resources(virtvdev->saving_migf);
		fput(virtvdev->saving_migf->filp);
		virtvdev->saving_migf = NULL;
	}
}

/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset' if one exists.
 */
static void virtiovf_state_mutex_unlock(struct virtiovf_pci_core_device *virtvdev)
{
again:
	spin_lock(&virtvdev->reset_lock);
	if (virtvdev->deferred_reset) {
		virtvdev->deferred_reset = false;
		spin_unlock(&virtvdev->reset_lock);
		virtvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		virtiovf_disable_fds(virtvdev);
		goto again;
	}
	mutex_unlock(&virtvdev->state_mutex);
	spin_unlock(&virtvdev->reset_lock);
}

void virtiovf_migration_reset_done(struct pci_dev *pdev)
{
	struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev);

	if (!virtvdev->migrate_cap)
		return;

	/*
	 * As the higher VFIO layers are holding locks across reset and using
	 * those same locks with the mm_lock we need to prevent ABBA deadlock
	 * with the state_mutex and mm_lock.
	 * In case the state_mutex was taken already we defer the cleanup work
	 * to the unlock flow of the other running context.
	 */
	spin_lock(&virtvdev->reset_lock);
	virtvdev->deferred_reset = true;
	if (!mutex_trylock(&virtvdev->state_mutex)) {
		spin_unlock(&virtvdev->reset_lock);
		return;
	}
	spin_unlock(&virtvdev->reset_lock);
	virtiovf_state_mutex_unlock(virtvdev);
}

static int virtiovf_release_file(struct inode *inode, struct file *filp)
{
	struct virtiovf_migration_file *migf = filp->private_data;

	virtiovf_disable_fd(migf);
	mutex_destroy(&migf->lock);
	kfree(migf);
	return 0;
}

static struct virtiovf_data_buffer *
virtiovf_get_data_buff_from_pos(struct virtiovf_migration_file *migf,
				loff_t pos, bool *end_of_data)
{
	struct virtiovf_data_buffer *buf;
	bool found = false;

	*end_of_data = false;
	spin_lock_irq(&migf->list_lock);
	if (list_empty(&migf->buf_list)) {
		*end_of_data = true;
		goto end;
	}

	buf = list_first_entry(&migf->buf_list, struct virtiovf_data_buffer,
			       buf_elm);
	if (pos >= buf->start_pos &&
	    pos < buf->start_pos + buf->length) {
		found = true;
		goto end;
	}

	/*
	 * As we use a stream based FD we may expect having the data always
	 * on the first chunk
	 */
	migf->state = VIRTIOVF_MIGF_STATE_ERROR;

end:
	spin_unlock_irq(&migf->list_lock);
	return found ? buf : NULL;
}

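/*
 * Copy data from a migration buffer to user space, page by page. Once the
 * buffer has been fully consumed it is moved back to the available list so
 * it can be reused for a following chunk.
 */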
static ssize_t virtiovf_buf_read(struct virtiovf_data_buffer *vhca_buf,
				 char __user **buf, size_t *len, loff_t *pos)
{
	unsigned long offset;
	ssize_t done = 0;
	size_t copy_len;

	copy_len = min_t(size_t,
			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
	while (copy_len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int ret;

		offset = *pos - vhca_buf->start_pos;
		page_offset = offset % PAGE_SIZE;
		offset -= page_offset;
		page = virtiovf_get_migration_page(vhca_buf, offset);
		if (!page)
			return -EINVAL;
		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (ret)
			return -EFAULT;
		*pos += page_len;
		*len -= page_len;
		*buf += page_len;
		done += page_len;
		copy_len -= page_len;
	}

	if (*pos >= vhca_buf->start_pos + vhca_buf->length) {
		spin_lock_irq(&vhca_buf->migf->list_lock);
		list_del_init(&vhca_buf->buf_elm);
		list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
		spin_unlock_irq(&vhca_buf->migf->list_lock);
	}

	return done;
}

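/*
 * read() handler of the save file descriptor. The FD is stream based: while
 * in PRE_COPY a temporary end of stream is reported as -ENOMSG, whereas
 * running out of data in any state other than COMPLETE is an error.
 */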
static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t len,
				  loff_t *pos)
{
	struct virtiovf_migration_file *migf = filp->private_data;
	struct virtiovf_data_buffer *vhca_buf;
	bool first_loop_call = true;
	bool end_of_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	mutex_lock(&migf->lock);
	if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		ssize_t count;

		vhca_buf = virtiovf_get_data_buff_from_pos(migf, *pos, &end_of_data);
		if (first_loop_call) {
			first_loop_call = false;
			/* Temporary end of file as part of PRE_COPY */
			if (end_of_data && migf->state == VIRTIOVF_MIGF_STATE_PRECOPY) {
				done = -ENOMSG;
				goto out_unlock;
			}

			if (end_of_data && migf->state != VIRTIOVF_MIGF_STATE_COMPLETE) {
				done = -EINVAL;
				goto out_unlock;
			}
		}

		if (end_of_data)
			goto out_unlock;

		if (!vhca_buf) {
			done = -EINVAL;
			goto out_unlock;
		}

		count = virtiovf_buf_read(vhca_buf, &buf, &len, pos);
		if (count < 0) {
			done = count;
			goto out_unlock;
		}
		done += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

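/*
 * VFIO_MIG_GET_PRECOPY_INFO: report how many initial and dirty bytes are
 * still pending on the save FD. Querying the device for a fresh context size
 * is rate limited since the state of a running virtio device is not expected
 * to change rapidly.
 */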
static long virtiovf_precopy_ioctl(struct file *filp, unsigned int cmd,
				   unsigned long arg)
{
	struct virtiovf_migration_file *migf = filp->private_data;
	struct virtiovf_pci_core_device *virtvdev = migf->virtvdev;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	bool end_of_data = false;
	unsigned long minsz;
	u32 ctx_size = 0;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&virtvdev->state_mutex);
	if (virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * The virtio specification does not include a PRE_COPY concept.
	 * Since we can expect the data to remain the same for a certain period,
	 * we use a rate limiter mechanism before making a call to the device.
	 */
	if (__ratelimit(&migf->pre_copy_rl_state)) {
		ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
					VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
					VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
					&ctx_size);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	if (migf->pre_copy_initial_bytes > *pos) {
		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
	} else {
		info.dirty_bytes = migf->max_pos - *pos;
		if (!info.dirty_bytes)
			end_of_data = true;
		info.dirty_bytes += ctx_size;
	}

	if (!end_of_data || !ctx_size) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has a
	 * dirty state, read a new state.
	 */
	ret = virtiovf_read_device_context_chunk(migf, ctx_size);
	if (ret)
		/*
		 * The machine is running, and the context size could grow, so there is
		 * no reason to mark the device state as VIRTIOVF_MIGF_STATE_ERROR.
		 */
		goto err_state_unlock;

done:
	virtiovf_state_mutex_unlock(virtvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	virtiovf_state_mutex_unlock(virtvdev);
	return ret;
}

static const struct file_operations virtiovf_save_fops = {
	.owner = THIS_MODULE,
	.read = virtiovf_save_read,
	.unlocked_ioctl = virtiovf_precopy_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.release = virtiovf_release_file,
};

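/*
 * Write the stream record header (size, flags, tag) which precedes a device
 * parts chunk and queue it on the migration file's buffer list.
 */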
static int
virtiovf_add_buf_header(struct virtiovf_data_buffer *header_buf,
			u32 data_size)
{
	struct virtiovf_migration_file *migf = header_buf->migf;
	struct virtiovf_migration_header header = {};
	struct page *page;
	u8 *to_buff;

	header.record_size = cpu_to_le64(data_size);
	header.flags = cpu_to_le32(VIRTIOVF_MIGF_HEADER_FLAGS_TAG_MANDATORY);
	header.tag = cpu_to_le32(VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA);
	page = virtiovf_get_migration_page(header_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	kunmap_local(to_buff);
	header_buf->length = sizeof(header);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irq(&migf->list_lock);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irq(&migf->list_lock);
	return 0;
}

static int
virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf,
				   u32 ctx_size)
{
	struct virtiovf_data_buffer *header_buf;
	struct virtiovf_data_buffer *buf;
	bool unmark_end = false;
	struct scatterlist *sg;
	unsigned int i;
	u32 res_size;
	int nent;
	int ret;

	buf = virtiovf_get_data_buffer(migf, ctx_size);
	if (IS_ERR(buf))
		return PTR_ERR(buf);

	/* Find the total count of SG entries which satisfies the size */
	nent = sg_nents_for_len(buf->table.sgt.sgl, ctx_size);
	if (nent <= 0) {
		ret = -EINVAL;
		goto out;
	}

	/*
	 * Iterate to that SG entry and mark it as last (if it's not already)
	 * to let underlay layers iterate only till that entry.
	 */
	for_each_sg(buf->table.sgt.sgl, sg, nent - 1, i)
		;

	if (!sg_is_last(sg)) {
		unmark_end = true;
		sg_mark_end(sg);
	}

	ret = virtio_pci_admin_dev_parts_get(migf->virtvdev->core_device.pdev,
					     VIRTIO_RESOURCE_OBJ_DEV_PARTS,
					     migf->obj_id,
					     VIRTIO_ADMIN_CMD_DEV_PARTS_GET_TYPE_ALL,
					     buf->table.sgt.sgl, &res_size);
	/* Restore the original SG mark end */
	if (unmark_end)
		sg_unmark_end(sg);
	if (ret)
		goto out;

	buf->length = res_size;
	header_buf = virtiovf_get_data_buffer(migf,
				sizeof(struct virtiovf_migration_header));
	if (IS_ERR(header_buf)) {
		ret = PTR_ERR(header_buf);
		goto out;
	}

	ret = virtiovf_add_buf_header(header_buf, res_size);
	if (ret)
		goto out_header;

	buf->start_pos = buf->migf->max_pos;
	migf->max_pos += buf->length;
	spin_lock_irq(&migf->list_lock);
	list_add_tail(&buf->buf_elm, &migf->buf_list);
	spin_unlock_irq(&migf->list_lock);
	return 0;

out_header:
	virtiovf_put_data_buffer(header_buf);
out:
	virtiovf_put_data_buffer(buf);
	return ret;
}

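/*
 * Read the final device context on the PRE_COPY_P2P -> STOP_COPY transition,
 * once the device is stopped, and mark the save file as complete.
 */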
static int
virtiovf_pci_save_device_final_data(struct virtiovf_pci_core_device *virtvdev)
{
	struct virtiovf_migration_file *migf = virtvdev->saving_migf;
	u32 ctx_size;
	int ret;

	if (migf->state == VIRTIOVF_MIGF_STATE_ERROR)
		return -ENODEV;

	ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
				VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id,
				VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
				&ctx_size);
	if (ret)
		goto err;

	if (!ctx_size) {
		ret = -EINVAL;
		goto err;
	}

	ret = virtiovf_read_device_context_chunk(migf, ctx_size);
	if (ret)
		goto err;

	migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
	return 0;

err:
	migf->state = VIRTIOVF_MIGF_STATE_ERROR;
	return ret;
}

static struct virtiovf_migration_file *
virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev,
			      bool pre_copy)
{
	struct virtiovf_migration_file *migf;
	u32 ctx_size;
	u32 obj_id;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("virtiovf_mig", &virtiovf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		kfree(migf);
		return ERR_PTR(ret);
	}

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	migf->virtvdev = virtvdev;

	lockdep_assert_held(&virtvdev->state_mutex);
	ret = virtiovf_pci_alloc_obj_id(virtvdev, VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET,
					&obj_id);
	if (ret)
		goto out;

	migf->obj_id = obj_id;
	/* Mark as having a valid obj id which can be even 0 */
	migf->has_obj_id = true;
	ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
				VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id,
				VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
				&ctx_size);
	if (ret)
		goto out_clean;

	if (!ctx_size) {
		ret = -EINVAL;
		goto out_clean;
	}

	ret = virtiovf_read_device_context_chunk(migf, ctx_size);
	if (ret)
		goto out_clean;

	if (pre_copy) {
		migf->pre_copy_initial_bytes = migf->max_pos;
		/* Arbitrarily set the pre-copy rate limit to 1-second intervals */
		ratelimit_state_init(&migf->pre_copy_rl_state, 1 * HZ, 1);
		/* Prevent any rate messages upon its usage */
		ratelimit_set_flags(&migf->pre_copy_rl_state,
				    RATELIMIT_MSG_ON_RELEASE);
		migf->state = VIRTIOVF_MIGF_STATE_PRECOPY;
	} else {
		migf->state = VIRTIOVF_MIGF_STATE_COMPLETE;
	}

	return migf;

out_clean:
	virtiovf_clean_migf_resources(migf);
out:
	fput(migf->filp);
	return ERR_PTR(ret);
}

/*
 * Set the required object header at the beginning of the buffer.
 * The actual device parts data will be written right after the header.
 */
static int virtiovf_set_obj_cmd_header(struct virtiovf_data_buffer *vhca_buf)
{
	struct virtio_admin_cmd_resource_obj_cmd_hdr obj_hdr = {};
	struct page *page;
	u8 *to_buff;

	obj_hdr.type = cpu_to_le16(VIRTIO_RESOURCE_OBJ_DEV_PARTS);
	obj_hdr.id = cpu_to_le32(vhca_buf->migf->obj_id);
	page = virtiovf_get_migration_page(vhca_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &obj_hdr, sizeof(obj_hdr));
	kunmap_local(to_buff);

	/* Mark the buffer as including the header object data */
	vhca_buf->include_header_object = 1;
	return 0;
}

static int
virtiovf_append_page_to_mig_buf(struct virtiovf_data_buffer *vhca_buf,
				const char __user **buf, size_t *len,
				loff_t *pos, ssize_t *done)
{
	unsigned long offset;
	size_t page_offset;
	struct page *page;
	size_t page_len;
	u8 *to_buff;
	int ret;

	offset = *pos - vhca_buf->start_pos;

	if (vhca_buf->include_header_object)
		/* The buffer holds the object header, update the offset accordingly */
		offset += sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr);

	page_offset = offset % PAGE_SIZE;

	page = virtiovf_get_migration_page(vhca_buf, offset - page_offset);
	if (!page)
		return -EINVAL;

	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
	kunmap_local(to_buff);
	if (ret)
		return -EFAULT;

	*pos += page_len;
	*done += page_len;
	*buf += page_len;
	*len -= page_len;
	vhca_buf->length += page_len;
	return 0;
}

static int
virtiovf_resume_read_chunk(struct virtiovf_migration_file *migf,
			   struct virtiovf_data_buffer *vhca_buf,
			   size_t chunk_size, const char __user **buf,
			   size_t *len, loff_t *pos, ssize_t *done,
			   bool *has_work)
{
	size_t copy_len, to_copy;
	int ret;

	to_copy = min_t(size_t, *len, chunk_size - vhca_buf->length);
	copy_len = to_copy;
	while (to_copy) {
		ret = virtiovf_append_page_to_mig_buf(vhca_buf, buf, &to_copy,
						      pos, done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == chunk_size) {
		migf->load_state = VIRTIOVF_LOAD_STATE_LOAD_CHUNK;
		migf->max_pos += chunk_size;
		*has_work = true;
	}

	return 0;
}

static int
virtiovf_resume_read_header_data(struct virtiovf_migration_file *migf,
				 struct virtiovf_data_buffer *vhca_buf,
				 const char __user **buf, size_t *len,
				 loff_t *pos, ssize_t *done)
{
	size_t copy_len, to_copy;
	size_t required_data;
	int ret;

	required_data = migf->record_size - vhca_buf->length;
	to_copy = min_t(size_t, *len, required_data);
	copy_len = to_copy;
	while (to_copy) {
		ret = virtiovf_append_page_to_mig_buf(vhca_buf, buf, &to_copy,
						      pos, done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == migf->record_size) {
		switch (migf->record_tag) {
		default:
			/* Optional record data is read and skipped */
			break;
		}

		migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;
		migf->max_pos += migf->record_size;
		vhca_buf->length = 0;
	}

	return 0;
}

static int
virtiovf_resume_read_header(struct virtiovf_migration_file *migf,
			    struct virtiovf_data_buffer *vhca_buf,
			    const char __user **buf,
			    size_t *len, loff_t *pos,
			    ssize_t *done, bool *has_work)
{
	struct page *page;
	size_t copy_len;
	u8 *to_buff;
	int ret;

	copy_len = min_t(size_t, *len,
			 sizeof(struct virtiovf_migration_header) - vhca_buf->length);
	page = virtiovf_get_migration_page(vhca_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
	if (ret) {
		ret = -EFAULT;
		goto end;
	}

	*buf += copy_len;
	*pos += copy_len;
	*done += copy_len;
	*len -= copy_len;
	vhca_buf->length += copy_len;
	if (vhca_buf->length == sizeof(struct virtiovf_migration_header)) {
		u64 record_size;
		u32 flags;

		record_size = le64_to_cpup((__le64 *)to_buff);
		if (record_size > MAX_LOAD_SIZE) {
			ret = -ENOMEM;
			goto end;
		}

		migf->record_size = record_size;
		flags = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct virtiovf_migration_header, flags)));
		migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct virtiovf_migration_header, tag)));
		switch (migf->record_tag) {
		case VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA:
			migf->load_state = VIRTIOVF_LOAD_STATE_PREP_CHUNK;
			break;
		default:
			if (!(flags & VIRTIOVF_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
				ret = -EOPNOTSUPP;
				goto end;
			}
			/* We may read and skip this optional record data */
			migf->load_state = VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA;
		}

		migf->max_pos += vhca_buf->length;
		vhca_buf->length = 0;
		*has_work = true;
	}

end:
	kunmap_local(to_buff);
	return ret;
}

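/*
 * write() handler of the resume file descriptor. A small state machine
 * (READ_HEADER, PREP_HEADER_DATA/READ_HEADER_DATA, PREP_CHUNK, READ_CHUNK,
 * LOAD_CHUNK) parses the stream produced by the save side and loads each
 * device parts chunk into the device over the admin queue.
 */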
static ssize_t virtiovf_resume_write(struct file *filp, const char __user *buf,
				     size_t len, loff_t *pos)
{
	struct virtiovf_migration_file *migf = filp->private_data;
	struct virtiovf_data_buffer *vhca_buf = migf->buf;
	struct virtiovf_data_buffer *vhca_buf_header = migf->buf_header;
	unsigned int orig_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < vhca_buf->start_pos)
		return -EINVAL;

	mutex_lock(&migf->virtvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

	while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case VIRTIOVF_LOAD_STATE_READ_HEADER:
			ret = virtiovf_resume_read_header(migf, vhca_buf_header, &buf,
							  &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA:
			if (vhca_buf_header->allocated_length < migf->record_size) {
				virtiovf_free_data_buffer(vhca_buf_header);

				migf->buf_header = virtiovf_alloc_data_buffer(migf,
						migf->record_size);
				if (IS_ERR(migf->buf_header)) {
					ret = PTR_ERR(migf->buf_header);
					migf->buf_header = NULL;
					goto out_unlock;
				}

				vhca_buf_header = migf->buf_header;
			}

			vhca_buf_header->start_pos = migf->max_pos;
			migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER_DATA;
			break;
		case VIRTIOVF_LOAD_STATE_READ_HEADER_DATA:
			ret = virtiovf_resume_read_header_data(migf, vhca_buf_header,
							&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case VIRTIOVF_LOAD_STATE_PREP_CHUNK:
		{
			u32 cmd_size = migf->record_size +
				sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr);

			/*
			 * The DMA map/unmap is managed in virtio layer, we just need to extend
			 * the SG pages to hold the extra required chunk data.
			 */
			if (vhca_buf->allocated_length < cmd_size) {
				ret = virtiovf_add_migration_pages(vhca_buf,
					DIV_ROUND_UP_ULL(cmd_size - vhca_buf->allocated_length,
							 PAGE_SIZE));
				if (ret)
					goto out_unlock;
			}

			vhca_buf->start_pos = migf->max_pos;
			migf->load_state = VIRTIOVF_LOAD_STATE_READ_CHUNK;
			break;
		}
		case VIRTIOVF_LOAD_STATE_READ_CHUNK:
			ret = virtiovf_resume_read_chunk(migf, vhca_buf, migf->record_size,
							 &buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case VIRTIOVF_LOAD_STATE_LOAD_CHUNK:
			/* Mark the last SG entry and set its length */
			sg_mark_end(vhca_buf->last_offset_sg);
			orig_length = vhca_buf->last_offset_sg->length;
			/* Length should include the resource object command header */
			vhca_buf->last_offset_sg->length = vhca_buf->length +
					sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr) -
					vhca_buf->last_offset;
			ret = virtio_pci_admin_dev_parts_set(migf->virtvdev->core_device.pdev,
							     vhca_buf->table.sgt.sgl);
			/* Restore the original SG data */
			vhca_buf->last_offset_sg->length = orig_length;
			sg_unmark_end(vhca_buf->last_offset_sg);
			if (ret)
				goto out_unlock;
			migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;
			/* be ready for reading the next chunk */
			vhca_buf->length = 0;
			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = VIRTIOVF_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	virtiovf_state_mutex_unlock(migf->virtvdev);
	return ret ? ret : done;
}

static const struct file_operations virtiovf_resume_fops = {
	.owner = THIS_MODULE,
	.write = virtiovf_resume_write,
	.release = virtiovf_release_file,
};

static struct virtiovf_migration_file *
virtiovf_pci_resume_device_data(struct virtiovf_pci_core_device *virtvdev)
{
	struct virtiovf_migration_file *migf;
	struct virtiovf_data_buffer *buf;
	u32 obj_id;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("virtiovf_mig", &virtiovf_resume_fops, migf,
					O_WRONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		kfree(migf);
		return ERR_PTR(ret);
	}

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);

	buf = virtiovf_alloc_data_buffer(migf, VIRTIOVF_TARGET_INITIAL_BUF_SIZE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_free;
	}

	migf->buf = buf;

	buf = virtiovf_alloc_data_buffer(migf,
			sizeof(struct virtiovf_migration_header));
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_clean;
	}

	migf->buf_header = buf;
	migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER;

	migf->virtvdev = virtvdev;
	ret = virtiovf_pci_alloc_obj_id(virtvdev, VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_SET,
					&obj_id);
	if (ret)
		goto out_clean;

	migf->obj_id = obj_id;
	/* Mark as having a valid obj id which can be even 0 */
	migf->has_obj_id = true;
	ret = virtiovf_set_obj_cmd_header(migf->buf);
	if (ret)
		goto out_clean;

	return migf;

out_clean:
	virtiovf_clean_migf_resources(migf);
out_free:
	fput(migf->filp);
	return ERR_PTR(ret);
}

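/*
 * Step the device through a single migration state transition. Stopping and
 * resuming the device itself is done via virtio_pci_admin_mode_set(), while
 * the STOP_COPY/RESUMING arcs create the save/resume file descriptors that
 * carry the device parts stream.
 */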
static struct file *
virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev,
				      u32 new)
{
	u32 cur = virtvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		/* NOP */
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		/* NOP */
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev,
						BIT(VIRTIO_ADMIN_CMD_DEV_MODE_F_STOPPED));
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
		ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev, 0);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct virtiovf_migration_file *migf;

		migf = virtiovf_pci_save_device_data(virtvdev, false);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		virtvdev->saving_migf = migf;
		return migf->filp;
	}

	if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
		virtiovf_disable_fds(virtvdev);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
		struct virtiovf_migration_file *migf;

		migf = virtiovf_pci_resume_device_data(virtvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		virtvdev->resuming_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
		virtiovf_disable_fds(virtvdev);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		struct virtiovf_migration_file *migf;

		migf = virtiovf_pci_save_device_data(virtvdev, true);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		virtvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
		ret = virtiovf_pci_save_device_final_data(virtvdev);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}

static struct file *
virtiovf_pci_set_device_state(struct vfio_device *vdev,
			      enum vfio_device_mig_state new_state)
{
	struct virtiovf_pci_core_device *virtvdev = container_of(
		vdev, struct virtiovf_pci_core_device, core_device.vdev);
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	mutex_lock(&virtvdev->state_mutex);
	while (new_state != virtvdev->mig_state) {
		ret = vfio_mig_get_next_state(vdev, virtvdev->mig_state,
					      new_state, &next_state);
		if (ret) {
			res = ERR_PTR(ret);
			break;
		}
		res = virtiovf_pci_step_device_state_locked(virtvdev, next_state);
		if (IS_ERR(res))
			break;
		virtvdev->mig_state = next_state;
		if (WARN_ON(res && new_state != virtvdev->mig_state)) {
			fput(res);
			res = ERR_PTR(-EINVAL);
			break;
		}
	}
	virtiovf_state_mutex_unlock(virtvdev);
	return res;
}

static int virtiovf_pci_get_device_state(struct vfio_device *vdev,
					 enum vfio_device_mig_state *curr_state)
{
	struct virtiovf_pci_core_device *virtvdev = container_of(
		vdev, struct virtiovf_pci_core_device, core_device.vdev);

	mutex_lock(&virtvdev->state_mutex);
	*curr_state = virtvdev->mig_state;
	virtiovf_state_mutex_unlock(virtvdev);
	return 0;
}

static int virtiovf_pci_get_data_size(struct vfio_device *vdev,
				      unsigned long *stop_copy_length)
{
	struct virtiovf_pci_core_device *virtvdev = container_of(
		vdev, struct virtiovf_pci_core_device, core_device.vdev);
	bool obj_id_exists;
	u32 res_size;
	u32 obj_id;
	int ret;

	mutex_lock(&virtvdev->state_mutex);
	obj_id_exists = virtvdev->saving_migf && virtvdev->saving_migf->has_obj_id;
	if (!obj_id_exists) {
		ret = virtiovf_pci_alloc_obj_id(virtvdev,
						VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET,
						&obj_id);
		if (ret)
			goto end;
	} else {
		obj_id = virtvdev->saving_migf->obj_id;
	}

	ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev,
				VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id,
				VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE,
				&res_size);
	if (!ret)
		*stop_copy_length = res_size;

	/*
	 * We can't leave this obj_id alive if it didn't exist before; otherwise it might
	 * stay alive even without an active migration flow (e.g. migration was cancelled).
	 */
	if (!obj_id_exists)
		virtiovf_pci_free_obj_id(virtvdev, obj_id);
end:
	virtiovf_state_mutex_unlock(virtvdev);
	return ret;
}

static const struct vfio_migration_ops virtvdev_pci_mig_ops = {
	.migration_set_state = virtiovf_pci_set_device_state,
	.migration_get_state = virtiovf_pci_get_device_state,
	.migration_get_data_size = virtiovf_pci_get_data_size,
};

void virtiovf_set_migratable(struct virtiovf_pci_core_device *virtvdev)
{
	virtvdev->migrate_cap = 1;
	mutex_init(&virtvdev->state_mutex);
	spin_lock_init(&virtvdev->reset_lock);
	virtvdev->core_device.vdev.migration_flags =
		VFIO_MIGRATION_STOP_COPY |
		VFIO_MIGRATION_P2P |
		VFIO_MIGRATION_PRE_COPY;
	virtvdev->core_device.vdev.mig_ops = &virtvdev_pci_mig_ops;
}

void virtiovf_open_migration(struct virtiovf_pci_core_device *virtvdev)
{
	if (!virtvdev->migrate_cap)
		return;

	virtvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
}

void virtiovf_close_migration(struct virtiovf_pci_core_device *virtvdev)
{
	if (!virtvdev->migrate_cap)
		return;

	virtiovf_disable_fds(virtvdev);
}