// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

#define MAX_CHUNK_SIZE SZ_8M

static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
        struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

        return container_of(core_device, struct mlx5vf_pci_core_device,
                            core_device);
}

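/*
 * Page lookup below is optimized for the sequential access pattern of the
 * migration stream: the last scatterlist hit is cached in last_offset_sg /
 * sg_last_entry / last_offset, and a smaller offset simply restarts the walk
 * from the first entry.
 */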
static struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
                          unsigned long offset)
{
        unsigned long cur_offset = 0;
        struct scatterlist *sg;
        unsigned int i;

        /* All accesses are sequential */
        if (offset < buf->last_offset || !buf->last_offset_sg) {
                buf->last_offset = 0;
                buf->last_offset_sg = buf->table.sgt.sgl;
                buf->sg_last_entry = 0;
        }

        cur_offset = buf->last_offset;

        for_each_sg(buf->last_offset_sg, sg,
                    buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
                if (offset < sg->length + cur_offset) {
                        buf->last_offset_sg = sg;
                        buf->sg_last_entry += i;
                        buf->last_offset = cur_offset;
                        return nth_page(sg_page(sg),
                                        (offset - cur_offset) / PAGE_SIZE);
                }
                cur_offset += sg->length;
        }
        return NULL;
}

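/*
 * Disabling the FD forces the file into MLX5_MIGF_STATE_ERROR so that any
 * further read()/write() against it fails, and rewinds f_pos.
 */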
static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
        mutex_lock(&migf->lock);
        migf->state = MLX5_MIGF_STATE_ERROR;
        migf->filp->f_pos = 0;
        mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
        struct mlx5_vf_migration_file *migf = filp->private_data;

        mlx5vf_disable_fd(migf);
        mutex_destroy(&migf->lock);
        kfree(migf);
        return 0;
}

static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
                              bool *end_of_data)
{
        struct mlx5_vhca_data_buffer *buf;
        bool found = false;

        *end_of_data = false;
        spin_lock_irq(&migf->list_lock);
        if (list_empty(&migf->buf_list)) {
                *end_of_data = true;
                goto end;
        }

        buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
                               buf_elm);
        if (pos >= buf->start_pos &&
            pos < buf->start_pos + buf->length) {
                found = true;
                goto end;
        }

        /*
         * As we use a stream based FD we may expect having the data always
         * on the first chunk.
         */
        migf->state = MLX5_MIGF_STATE_ERROR;

end:
        spin_unlock_irq(&migf->list_lock);
        return found ? buf : NULL;
}

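/*
 * Called once userspace fully consumed a data buffer. Chunk-mode buffers are
 * parked back into migf->buf[] / migf->buf_header[] for reuse by the next
 * SAVE of the same chunk slot, other buffers are recycled through avail_list.
 * If the device already reported the size of the next chunk, the deferred
 * save work for that chunk is queued from here.
 */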
static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
{
        struct mlx5_vf_migration_file *migf = vhca_buf->migf;

        if (vhca_buf->stop_copy_chunk_num) {
                bool is_header = vhca_buf->dma_dir == DMA_NONE;
                u8 chunk_num = vhca_buf->stop_copy_chunk_num;
                size_t next_required_umem_size = 0;

                if (is_header)
                        migf->buf_header[chunk_num - 1] = vhca_buf;
                else
                        migf->buf[chunk_num - 1] = vhca_buf;

                spin_lock_irq(&migf->list_lock);
                list_del_init(&vhca_buf->buf_elm);
                if (!is_header) {
                        next_required_umem_size =
                                migf->next_required_umem_size;
                        migf->next_required_umem_size = 0;
                        migf->num_ready_chunks--;
                }
                spin_unlock_irq(&migf->list_lock);
                if (next_required_umem_size)
                        mlx5vf_mig_file_set_save_work(migf, chunk_num,
                                                      next_required_umem_size);
                return;
        }

        spin_lock_irq(&migf->list_lock);
        list_del_init(&vhca_buf->buf_elm);
        list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
        spin_unlock_irq(&migf->list_lock);
}

static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
                               char __user **buf, size_t *len, loff_t *pos)
{
        unsigned long offset;
        ssize_t done = 0;
        size_t copy_len;

        copy_len = min_t(size_t,
                         vhca_buf->start_pos + vhca_buf->length - *pos, *len);
        while (copy_len) {
                size_t page_offset;
                struct page *page;
                size_t page_len;
                u8 *from_buff;
                int ret;

                offset = *pos - vhca_buf->start_pos;
                page_offset = offset % PAGE_SIZE;
                offset -= page_offset;
                page = mlx5vf_get_migration_page(vhca_buf, offset);
                if (!page)
                        return -EINVAL;
                page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
                from_buff = kmap_local_page(page);
                ret = copy_to_user(*buf, from_buff + page_offset, page_len);
                kunmap_local(from_buff);
                if (ret)
                        return -EFAULT;
                *pos += page_len;
                *len -= page_len;
                *buf += page_len;
                done += page_len;
                copy_len -= page_len;
        }

        if (*pos >= vhca_buf->start_pos + vhca_buf->length)
                mlx5vf_buf_read_done(vhca_buf);

        return done;
}

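/*
 * read() handler of the SAVE (source side) migration FD. The FD is stream
 * based: data is consumed in order from buf_list, a blocking reader sleeps on
 * poll_wait until data, completion or an error is available, and a temporary
 * end of stream during PRE_COPY is reported to userspace as such.
 */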
static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
                                loff_t *pos)
{
        struct mlx5_vf_migration_file *migf = filp->private_data;
        struct mlx5_vhca_data_buffer *vhca_buf;
        bool first_loop_call = true;
        bool end_of_data;
        ssize_t done = 0;

        if (pos)
                return -ESPIPE;
        pos = &filp->f_pos;

        if (!(filp->f_flags & O_NONBLOCK)) {
                if (wait_event_interruptible(migf->poll_wait,
                                !list_empty(&migf->buf_list) ||
                                migf->state == MLX5_MIGF_STATE_ERROR ||
                                migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
                                migf->state == MLX5_MIGF_STATE_PRE_COPY ||
                                migf->state == MLX5_MIGF_STATE_COMPLETE))
                        return -ERESTARTSYS;
        }

        mutex_lock(&migf->lock);
        if (migf->state == MLX5_MIGF_STATE_ERROR) {
                done = -ENODEV;
                goto out_unlock;
        }

        while (len) {
                ssize_t count;

                vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
                                                         &end_of_data);
                if (first_loop_call) {
                        first_loop_call = false;
                        /* Temporary end of file as part of PRE_COPY */
                        if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
                                migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
                                done = -ENOMSG;
                                goto out_unlock;
                        }

                        if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
                                if (filp->f_flags & O_NONBLOCK) {
                                        done = -EAGAIN;
                                        goto out_unlock;
                                }
                        }
                }

                if (end_of_data)
                        goto out_unlock;

                if (!vhca_buf) {
                        done = -EINVAL;
                        goto out_unlock;
                }

                count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
                if (count < 0) {
                        done = count;
                        goto out_unlock;
                }
                done += count;
        }

out_unlock:
        mutex_unlock(&migf->lock);
        return done;
}

static __poll_t mlx5vf_save_poll(struct file *filp,
                                 struct poll_table_struct *wait)
{
        struct mlx5_vf_migration_file *migf = filp->private_data;
        __poll_t pollflags = 0;

        poll_wait(filp, &migf->poll_wait, wait);

        mutex_lock(&migf->lock);
        if (migf->state == MLX5_MIGF_STATE_ERROR)
                pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
        else if (!list_empty(&migf->buf_list) ||
                 migf->state == MLX5_MIGF_STATE_COMPLETE)
                pollflags = EPOLLIN | EPOLLRDNORM;
        mutex_unlock(&migf->lock);

        return pollflags;
}

/*
 * FD is exposed and user can use it after receiving an error.
 * Mark migf in error, and wake the user.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
        migf->state = MLX5_MIGF_STATE_ERROR;
        wake_up_interruptible(&migf->poll_wait);
}

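/*
 * Queue the asynchronous save work for the given chunk on the device
 * callback workqueue. A reference on the migration file is taken here and is
 * expected to be released by the work handler when it completes.
 */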
void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
                                   u8 chunk_num, size_t next_required_umem_size)
{
        migf->save_data[chunk_num - 1].next_required_umem_size =
                next_required_umem_size;
        migf->save_data[chunk_num - 1].migf = migf;
        get_file(migf->filp);
        queue_work(migf->mvdev->cb_wq,
                   &migf->save_data[chunk_num - 1].work);
}

static struct mlx5_vhca_data_buffer *
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
                                  u8 index, size_t required_length)
{
        struct mlx5_vhca_data_buffer *buf = migf->buf[index];
        u8 chunk_num;

        chunk_num = buf->stop_copy_chunk_num;
        buf->migf->buf[index] = NULL;
        /* Checking whether the pre-allocated buffer can fit */
        if (buf->allocated_length >= required_length)
                return buf;

        mlx5vf_put_data_buffer(buf);
        buf = mlx5vf_get_data_buffer(buf->migf, required_length,
                                     DMA_FROM_DEVICE);
        if (IS_ERR(buf))
                return buf;

        buf->stop_copy_chunk_num = chunk_num;
        return buf;
}

static void mlx5vf_mig_file_save_work(struct work_struct *_work)
{
        struct mlx5vf_save_work_data *save_data = container_of(_work,
                struct mlx5vf_save_work_data, work);
        struct mlx5_vf_migration_file *migf = save_data->migf;
        struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
        struct mlx5_vhca_data_buffer *buf;

        mutex_lock(&mvdev->state_mutex);
        if (migf->state == MLX5_MIGF_STATE_ERROR)
                goto end;

        buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
                                                save_data->chunk_num - 1,
                                                save_data->next_required_umem_size);
        if (IS_ERR(buf))
                goto err;

        if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
                goto err_save;

        goto end;

err_save:
        mlx5vf_put_data_buffer(buf);
err:
        mlx5vf_mark_err(migf);
end:
        mlx5vf_state_mutex_unlock(mvdev);
        fput(migf->filp);
}

static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
                                       bool track)
{
        size_t size = sizeof(struct mlx5_vf_migration_header) +
                sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
        struct mlx5_vf_migration_tag_stop_copy_data data = {};
        struct mlx5_vhca_data_buffer *header_buf = NULL;
        struct mlx5_vf_migration_header header = {};
        unsigned long flags;
        struct page *page;
        u8 *to_buff;
        int ret;

        header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
        if (IS_ERR(header_buf))
                return PTR_ERR(header_buf);

        header.record_size = cpu_to_le64(sizeof(data));
        header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
        header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
        page = mlx5vf_get_migration_page(header_buf, 0);
        if (!page) {
                ret = -EINVAL;
                goto err;
        }
        to_buff = kmap_local_page(page);
        memcpy(to_buff, &header, sizeof(header));
        header_buf->length = sizeof(header);
        data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
        memcpy(to_buff + sizeof(header), &data, sizeof(data));
        header_buf->length += sizeof(data);
        kunmap_local(to_buff);
        header_buf->start_pos = header_buf->migf->max_pos;
        migf->max_pos += header_buf->length;
        spin_lock_irqsave(&migf->list_lock, flags);
        list_add_tail(&header_buf->buf_elm, &migf->buf_list);
        spin_unlock_irqrestore(&migf->list_lock, flags);
        if (track)
                migf->pre_copy_initial_bytes = size;
        return 0;
err:
        mlx5vf_put_data_buffer(header_buf);
        return ret;
}

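/*
 * Pre-allocate the stop_copy data and header buffers. In chunk mode up to
 * MAX_NUM_CHUNKS buffers bounded by MAX_CHUNK_SIZE are prepared; otherwise a
 * single buffer is sized from the queried state size (with headroom when
 * pre-copy tracking is used), capped by the device SAVE size limit.
 */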
static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
                                 struct mlx5_vf_migration_file *migf,
                                 size_t state_size, u64 full_size,
                                 bool track)
{
        struct mlx5_vhca_data_buffer *buf;
        size_t inc_state_size;
        int num_chunks;
        int ret;
        int i;

        if (mvdev->chunk_mode) {
                size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);

                /* from firmware perspective at least 'state_size' buffer should be set */
                inc_state_size = max(state_size, chunk_size);
        } else {
                if (track) {
                        /* let's be ready for stop_copy size that might grow by 10 percent */
                        if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
                                inc_state_size = state_size;
                } else {
                        inc_state_size = state_size;
                }
        }

        /* let's not overflow the device specification max SAVE size */
        inc_state_size = min_t(size_t, inc_state_size,
                (BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));

        num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
        for (i = 0; i < num_chunks; i++) {
                buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
                if (IS_ERR(buf)) {
                        ret = PTR_ERR(buf);
                        goto err;
                }

                migf->buf[i] = buf;
                buf = mlx5vf_get_data_buffer(migf,
                        sizeof(struct mlx5_vf_migration_header), DMA_NONE);
                if (IS_ERR(buf)) {
                        ret = PTR_ERR(buf);
                        goto err;
                }
                migf->buf_header[i] = buf;
                if (mvdev->chunk_mode) {
                        migf->buf[i]->stop_copy_chunk_num = i + 1;
                        migf->buf_header[i]->stop_copy_chunk_num = i + 1;
                        INIT_WORK(&migf->save_data[i].work,
                                  mlx5vf_mig_file_save_work);
                        migf->save_data[i].chunk_num = i + 1;
                }
        }

        ret = mlx5vf_add_stop_copy_header(migf, track);
        if (ret)
                goto err;
        return 0;

err:
        for (i = 0; i < num_chunks; i++) {
                if (migf->buf[i]) {
                        mlx5vf_put_data_buffer(migf->buf[i]);
                        migf->buf[i] = NULL;
                }
                if (migf->buf_header[i]) {
                        mlx5vf_put_data_buffer(migf->buf_header[i]);
                        migf->buf_header[i] = NULL;
                }
        }

        return ret;
}

static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
                                 unsigned long arg)
{
        struct mlx5_vf_migration_file *migf = filp->private_data;
        struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
        struct mlx5_vhca_data_buffer *buf;
        struct vfio_precopy_info info = {};
        loff_t *pos = &filp->f_pos;
        unsigned long minsz;
        size_t inc_length = 0;
        bool end_of_data = false;
        int ret;

        if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
                return -ENOTTY;

        minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

        if (copy_from_user(&info, (void __user *)arg, minsz))
                return -EFAULT;

        if (info.argsz < minsz)
                return -EINVAL;

        mutex_lock(&mvdev->state_mutex);
        if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
            mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
                ret = -EINVAL;
                goto err_state_unlock;
        }

        /*
         * We can't issue a SAVE command when the device is suspended, so as
         * part of VFIO_DEVICE_STATE_PRE_COPY_P2P there is no reason to query
         * for extra bytes that can't be read.
         */
        if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
                /*
                 * Once the query returns it's guaranteed that there is no
                 * active SAVE command.
                 * As such, the code below is safe with the proper locks.
                 */
                ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
                                                            NULL, MLX5VF_QUERY_INC);
                if (ret)
                        goto err_state_unlock;
        }

        mutex_lock(&migf->lock);
        if (migf->state == MLX5_MIGF_STATE_ERROR) {
                ret = -ENODEV;
                goto err_migf_unlock;
        }

        if (migf->pre_copy_initial_bytes > *pos) {
                info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
        } else {
                info.dirty_bytes = migf->max_pos - *pos;
                if (!info.dirty_bytes)
                        end_of_data = true;
                info.dirty_bytes += inc_length;
        }

        if (!end_of_data || !inc_length) {
                mutex_unlock(&migf->lock);
                goto done;
        }

        mutex_unlock(&migf->lock);
        /*
         * We finished transferring the current state and the device has a
         * dirty state; save a new state so it is ready to be read.
         */
        buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
        if (IS_ERR(buf)) {
                ret = PTR_ERR(buf);
                mlx5vf_mark_err(migf);
                goto err_state_unlock;
        }

        ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
        if (ret) {
                mlx5vf_mark_err(migf);
                mlx5vf_put_data_buffer(buf);
                goto err_state_unlock;
        }

done:
        mlx5vf_state_mutex_unlock(mvdev);
        if (copy_to_user((void __user *)arg, &info, minsz))
                return -EFAULT;
        return 0;

err_migf_unlock:
        mutex_unlock(&migf->lock);
err_state_unlock:
        mlx5vf_state_mutex_unlock(mvdev);
        return ret;
}

static const struct file_operations mlx5vf_save_fops = {
        .owner = THIS_MODULE,
        .read = mlx5vf_save_read,
        .poll = mlx5vf_save_poll,
        .unlocked_ioctl = mlx5vf_precopy_ioctl,
        .compat_ioctl = compat_ptr_ioctl,
        .release = mlx5vf_release_file,
};

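/*
 * Illustrative userspace flow against the SAVE FD exposed above (a sketch
 * only, not part of the driver): poll() the FD, read() the stream into the
 * migration channel and, while in PRE_COPY, query the remaining bytes with
 * the VFIO_MIG_GET_PRECOPY_INFO ioctl, e.g.:
 *
 *	struct vfio_precopy_info info = { .argsz = sizeof(info) };
 *
 *	if (!ioctl(data_fd, VFIO_MIG_GET_PRECOPY_INFO, &info))
 *		printf("initial %llu dirty %llu\n",
 *		       (unsigned long long)info.initial_bytes,
 *		       (unsigned long long)info.dirty_bytes);
 *
 * where data_fd is the descriptor returned by the PRE_COPY state transition.
 */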
static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
{
        struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
        struct mlx5_vhca_data_buffer *buf;
        size_t length;
        int ret;

        if (migf->state == MLX5_MIGF_STATE_ERROR)
                return -ENODEV;

        ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
                                MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
        if (ret)
                goto err;

        buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
        if (IS_ERR(buf)) {
                ret = PTR_ERR(buf);
                goto err;
        }

        ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
        if (ret)
                goto err_save;

        return 0;

err_save:
        mlx5vf_put_data_buffer(buf);
err:
        mlx5vf_mark_err(migf);
        return ret;
}

static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
        struct mlx5_vf_migration_file *migf;
        struct mlx5_vhca_data_buffer *buf;
        size_t length;
        u64 full_size;
        int ret;

        migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
        if (!migf)
                return ERR_PTR(-ENOMEM);

        migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
                                        O_RDONLY);
        if (IS_ERR(migf->filp)) {
                ret = PTR_ERR(migf->filp);
                kfree(migf);
                return ERR_PTR(ret);
        }

        migf->mvdev = mvdev;
        stream_open(migf->filp->f_inode, migf->filp);
        mutex_init(&migf->lock);
        init_waitqueue_head(&migf->poll_wait);
        init_completion(&migf->save_comp);
        /*
         * save_comp is being used as a binary semaphore built from
         * a completion. A normal mutex cannot be used because the lock is
         * passed between kernel threads and lockdep can't model this.
         */
        complete(&migf->save_comp);
        mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
        INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
        INIT_LIST_HEAD(&migf->buf_list);
        INIT_LIST_HEAD(&migf->avail_list);
        spin_lock_init(&migf->list_lock);

        ret = mlx5vf_cmd_alloc_pd(migf);
        if (ret)
                goto err_free;

        ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
        if (ret)
                goto err_pd;

        ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
        if (ret)
                goto err_pd;

        if (track) {
                /* leave the allocated buffer ready for the stop-copy phase */
                buf = mlx5vf_alloc_data_buffer(migf,
                        migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
                if (IS_ERR(buf)) {
                        ret = PTR_ERR(buf);
                        goto err_pd;
                }
        } else {
                buf = migf->buf[0];
                migf->buf[0] = NULL;
        }

        ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
        if (ret)
                goto err_save;

        return migf;
err_save:
        mlx5vf_free_data_buffer(buf);
err_pd:
        mlx5fv_cmd_clean_migf_resources(migf);
err_free:
        fput(migf->filp);
        return ERR_PTR(ret);
}

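/*
 * The helpers below implement the RESUMING (destination side) write path:
 * userspace data is staged page by page into the current vhca data buffer
 * before being handed to the device with a LOAD command.
 */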
static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
                              const char __user **buf, size_t *len,
                              loff_t *pos, ssize_t *done)
{
        unsigned long offset;
        size_t page_offset;
        struct page *page;
        size_t page_len;
        u8 *to_buff;
        int ret;

        offset = *pos - vhca_buf->start_pos;
        page_offset = offset % PAGE_SIZE;

        page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
        if (!page)
                return -EINVAL;
        page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
        to_buff = kmap_local_page(page);
        ret = copy_from_user(to_buff + page_offset, *buf, page_len);
        kunmap_local(to_buff);
        if (ret)
                return -EFAULT;

        *pos += page_len;
        *done += page_len;
        *buf += page_len;
        *len -= page_len;
        vhca_buf->length += page_len;
        return 0;
}

static int
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
                         struct mlx5_vhca_data_buffer *vhca_buf,
                         size_t image_size, const char __user **buf,
                         size_t *len, loff_t *pos, ssize_t *done,
                         bool *has_work)
{
        size_t copy_len, to_copy;
        int ret;

        to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
        copy_len = to_copy;
        while (to_copy) {
                ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
                                                    done);
                if (ret)
                        return ret;
        }

        *len -= copy_len;
        if (vhca_buf->length == image_size) {
                migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
                migf->max_pos += image_size;
                *has_work = true;
        }

        return 0;
}

static int
mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
                               struct mlx5_vhca_data_buffer *vhca_buf,
                               const char __user **buf, size_t *len,
                               loff_t *pos, ssize_t *done)
{
        size_t copy_len, to_copy;
        size_t required_data;
        u8 *to_buff;
        int ret;

        required_data = migf->record_size - vhca_buf->length;
        to_copy = min_t(size_t, *len, required_data);
        copy_len = to_copy;
        while (to_copy) {
                ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
                                                    done);
                if (ret)
                        return ret;
        }

        *len -= copy_len;
        if (vhca_buf->length == migf->record_size) {
                switch (migf->record_tag) {
                case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
                {
                        struct page *page;

                        page = mlx5vf_get_migration_page(vhca_buf, 0);
                        if (!page)
                                return -EINVAL;
                        to_buff = kmap_local_page(page);
                        migf->stop_copy_prep_size = min_t(u64,
                                le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
                        kunmap_local(to_buff);
                        break;
                }
                default:
                        /* Optional record whose data is simply skipped */
                        break;
                }

                migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
                migf->max_pos += migf->record_size;
                vhca_buf->length = 0;
        }

        return 0;
}

static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
                          struct mlx5_vhca_data_buffer *vhca_buf,
                          const char __user **buf,
                          size_t *len, loff_t *pos,
                          ssize_t *done, bool *has_work)
{
        struct page *page;
        size_t copy_len;
        u8 *to_buff;
        int ret;

        copy_len = min_t(size_t, *len,
                sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
        page = mlx5vf_get_migration_page(vhca_buf, 0);
        if (!page)
                return -EINVAL;
        to_buff = kmap_local_page(page);
        ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
        if (ret) {
                ret = -EFAULT;
                goto end;
        }

        *buf += copy_len;
        *pos += copy_len;
        *done += copy_len;
        *len -= copy_len;
        vhca_buf->length += copy_len;
        if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
                u64 record_size;
                u32 flags;

                record_size = le64_to_cpup((__le64 *)to_buff);
                if (record_size > MAX_LOAD_SIZE) {
                        ret = -ENOMEM;
                        goto end;
                }

                migf->record_size = record_size;
                flags = le32_to_cpup((__le32 *)(to_buff +
                            offsetof(struct mlx5_vf_migration_header, flags)));
                migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
                            offsetof(struct mlx5_vf_migration_header, tag)));
                switch (migf->record_tag) {
                case MLX5_MIGF_HEADER_TAG_FW_DATA:
                        migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
                        break;
                case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
                        migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
                        break;
                default:
                        if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
                                ret = -EOPNOTSUPP;
                                goto end;
                        }
                        /* We may read and skip this optional record data */
                        migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
                }

                migf->max_pos += vhca_buf->length;
                vhca_buf->length = 0;
                *has_work = true;
        }
end:
        kunmap_local(to_buff);
        return ret;
}

static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
                                   size_t len, loff_t *pos)
{
        struct mlx5_vf_migration_file *migf = filp->private_data;
        struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
        struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
        loff_t requested_length;
        bool has_work = false;
        ssize_t done = 0;
        int ret = 0;

        if (pos)
                return -ESPIPE;
        pos = &filp->f_pos;

        if (*pos < 0 ||
            check_add_overflow((loff_t)len, *pos, &requested_length))
                return -EINVAL;

        mutex_lock(&migf->mvdev->state_mutex);
        mutex_lock(&migf->lock);
        if (migf->state == MLX5_MIGF_STATE_ERROR) {
                ret = -ENODEV;
                goto out_unlock;
        }

        while (len || has_work) {
                has_work = false;
                switch (migf->load_state) {
                case MLX5_VF_LOAD_STATE_READ_HEADER:
                        ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
                                                        &buf, &len, pos,
                                                        &done, &has_work);
                        if (ret)
                                goto out_unlock;
                        break;
                case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
                        if (vhca_buf_header->allocated_length < migf->record_size) {
                                mlx5vf_free_data_buffer(vhca_buf_header);

                                migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
                                                migf->record_size, DMA_NONE);
                                if (IS_ERR(migf->buf_header[0])) {
                                        ret = PTR_ERR(migf->buf_header[0]);
                                        migf->buf_header[0] = NULL;
                                        goto out_unlock;
                                }

                                vhca_buf_header = migf->buf_header[0];
                        }

                        vhca_buf_header->start_pos = migf->max_pos;
                        migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
                        break;
                case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
                        ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
                                                        &buf, &len, pos, &done);
                        if (ret)
                                goto out_unlock;
                        break;
                case MLX5_VF_LOAD_STATE_PREP_IMAGE:
                {
                        u64 size = max(migf->record_size,
                                       migf->stop_copy_prep_size);

                        if (vhca_buf->allocated_length < size) {
                                mlx5vf_free_data_buffer(vhca_buf);

                                migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
                                                        size, DMA_TO_DEVICE);
                                if (IS_ERR(migf->buf[0])) {
                                        ret = PTR_ERR(migf->buf[0]);
                                        migf->buf[0] = NULL;
                                        goto out_unlock;
                                }

                                vhca_buf = migf->buf[0];
                        }

                        vhca_buf->start_pos = migf->max_pos;
                        migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
                        break;
                }
                case MLX5_VF_LOAD_STATE_READ_IMAGE:
                        ret = mlx5vf_resume_read_image(migf, vhca_buf,
                                                migf->record_size,
                                                &buf, &len, pos, &done, &has_work);
                        if (ret)
                                goto out_unlock;
                        break;
                case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
                        ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
                        if (ret)
                                goto out_unlock;
                        migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

                        /* prep header buf for next image */
                        vhca_buf_header->length = 0;
                        /* prep data buf for next image */
                        vhca_buf->length = 0;

                        break;
                }
        }

out_unlock:
        if (ret)
                migf->state = MLX5_MIGF_STATE_ERROR;
        mutex_unlock(&migf->lock);
        mlx5vf_state_mutex_unlock(migf->mvdev);
        return ret ? ret : done;
}

static const struct file_operations mlx5vf_resume_fops = {
        .owner = THIS_MODULE,
        .write = mlx5vf_resume_write,
        .release = mlx5vf_release_file,
};

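/*
 * The resume FD starts in MLX5_VF_LOAD_STATE_READ_HEADER and then cycles
 * through preparing/reading the record header, any optional header data and
 * the image itself, loading each complete image into the device before
 * expecting the next header.
 */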
static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
        struct mlx5_vf_migration_file *migf;
        struct mlx5_vhca_data_buffer *buf;
        int ret;

        migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
        if (!migf)
                return ERR_PTR(-ENOMEM);

        migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
                                        O_WRONLY);
        if (IS_ERR(migf->filp)) {
                ret = PTR_ERR(migf->filp);
                kfree(migf);
                return ERR_PTR(ret);
        }

        stream_open(migf->filp->f_inode, migf->filp);
        mutex_init(&migf->lock);
        INIT_LIST_HEAD(&migf->buf_list);
        INIT_LIST_HEAD(&migf->avail_list);
        spin_lock_init(&migf->list_lock);
        migf->mvdev = mvdev;
        ret = mlx5vf_cmd_alloc_pd(migf);
        if (ret)
                goto err_free;

        buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
        if (IS_ERR(buf)) {
                ret = PTR_ERR(buf);
                goto err_pd;
        }

        migf->buf[0] = buf;
        buf = mlx5vf_alloc_data_buffer(migf,
                sizeof(struct mlx5_vf_migration_header), DMA_NONE);
        if (IS_ERR(buf)) {
                ret = PTR_ERR(buf);
                goto err_buf;
        }

        migf->buf_header[0] = buf;
        migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

        return migf;
err_buf:
        mlx5vf_free_data_buffer(migf->buf[0]);
err_pd:
        mlx5vf_cmd_dealloc_pd(migf);
err_free:
        fput(migf->filp);
        return ERR_PTR(ret);
}

void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
                        enum mlx5_vf_migf_state *last_save_state)
{
        if (mvdev->resuming_migf) {
                mlx5vf_disable_fd(mvdev->resuming_migf);
                mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
                fput(mvdev->resuming_migf->filp);
                mvdev->resuming_migf = NULL;
        }
        if (mvdev->saving_migf) {
                mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
                cancel_work_sync(&mvdev->saving_migf->async_data.work);
                if (last_save_state)
                        *last_save_state = mvdev->saving_migf->state;
                mlx5vf_disable_fd(mvdev->saving_migf);
                wake_up_interruptible(&mvdev->saving_migf->poll_wait);
                mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
                fput(mvdev->saving_migf->filp);
                mvdev->saving_migf = NULL;
        }
}

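/*
 * Each supported migration arc is handled explicitly below; composite state
 * transitions are broken into these arcs by vfio_mig_get_next_state() in
 * mlx5vf_pci_set_device_state().
 */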
static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
                                    u32 new)
{
        u32 cur = mvdev->mig_state;
        int ret;

        if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
                ret = mlx5vf_cmd_suspend_vhca(mvdev,
                        MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
                if (ret)
                        return ERR_PTR(ret);
                return NULL;
        }

        if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
                ret = mlx5vf_cmd_resume_vhca(mvdev,
                        MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
                if (ret)
                        return ERR_PTR(ret);
                return NULL;
        }

        if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
            (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
                ret = mlx5vf_cmd_suspend_vhca(mvdev,
                        MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
                if (ret)
                        return ERR_PTR(ret);
                return NULL;
        }

        if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
            (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
                ret = mlx5vf_cmd_resume_vhca(mvdev,
                        MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
                if (ret)
                        return ERR_PTR(ret);
                return NULL;
        }

        if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
                struct mlx5_vf_migration_file *migf;

                migf = mlx5vf_pci_save_device_data(mvdev, false);
                if (IS_ERR(migf))
                        return ERR_CAST(migf);
                get_file(migf->filp);
                mvdev->saving_migf = migf;
                return migf->filp;
        }

        if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
                mlx5vf_disable_fds(mvdev, NULL);
                return NULL;
        }

        if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
            (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
             new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
                struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
                struct mlx5_vhca_data_buffer *buf;
                enum mlx5_vf_migf_state state;
                size_t size;

                ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL,
                                        MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
                if (ret)
                        return ERR_PTR(ret);
                buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE);
                if (IS_ERR(buf))
                        return ERR_CAST(buf);
                /* pre_copy cleanup */
                ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false);
                if (ret) {
                        mlx5vf_put_data_buffer(buf);
                        return ERR_PTR(ret);
                }
                mlx5vf_disable_fds(mvdev, &state);
                return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(-EIO);
        }

        if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
                struct mlx5_vf_migration_file *migf;

                migf = mlx5vf_pci_resume_device_data(mvdev);
                if (IS_ERR(migf))
                        return ERR_CAST(migf);
                get_file(migf->filp);
                mvdev->resuming_migf = migf;
                return migf->filp;
        }

        if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
                mlx5vf_disable_fds(mvdev, NULL);
                return NULL;
        }

        if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
            (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
             new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
                struct mlx5_vf_migration_file *migf;

                migf = mlx5vf_pci_save_device_data(mvdev, true);
                if (IS_ERR(migf))
                        return ERR_CAST(migf);
                get_file(migf->filp);
                mvdev->saving_migf = migf;
                return migf->filp;
        }

        if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
                ret = mlx5vf_cmd_suspend_vhca(mvdev,
                        MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
                if (ret)
                        return ERR_PTR(ret);
                ret = mlx5vf_pci_save_device_inc_data(mvdev);
                return ret ? ERR_PTR(ret) : NULL;
        }

        /*
         * vfio_mig_get_next_state() does not use arcs other than the above
         */
        WARN_ON(true);
        return ERR_PTR(-EINVAL);
}

/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset' if one exists.
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
        spin_lock(&mvdev->reset_lock);
        if (mvdev->deferred_reset) {
                mvdev->deferred_reset = false;
                spin_unlock(&mvdev->reset_lock);
                mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
                mlx5vf_disable_fds(mvdev, NULL);
                goto again;
        }
        mutex_unlock(&mvdev->state_mutex);
        spin_unlock(&mvdev->reset_lock);
}

static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
                            enum vfio_device_mig_state new_state)
{
        struct mlx5vf_pci_core_device *mvdev = container_of(
                vdev, struct mlx5vf_pci_core_device, core_device.vdev);
        enum vfio_device_mig_state next_state;
        struct file *res = NULL;
        int ret;

        mutex_lock(&mvdev->state_mutex);
        while (new_state != mvdev->mig_state) {
                ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
                                              new_state, &next_state);
                if (ret) {
                        res = ERR_PTR(ret);
                        break;
                }
                res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
                if (IS_ERR(res))
                        break;
                mvdev->mig_state = next_state;
                if (WARN_ON(res && new_state != mvdev->mig_state)) {
                        fput(res);
                        res = ERR_PTR(-EINVAL);
                        break;
                }
        }
        mlx5vf_state_mutex_unlock(mvdev);
        return res;
}

static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
                                    unsigned long *stop_copy_length)
{
        struct mlx5vf_pci_core_device *mvdev = container_of(
                vdev, struct mlx5vf_pci_core_device, core_device.vdev);
        size_t state_size;
        u64 total_size;
        int ret;

        mutex_lock(&mvdev->state_mutex);
        ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
                                                    &total_size, 0);
        if (!ret)
                *stop_copy_length = total_size;
        mlx5vf_state_mutex_unlock(mvdev);
        return ret;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
                                       enum vfio_device_mig_state *curr_state)
{
        struct mlx5vf_pci_core_device *mvdev = container_of(
                vdev, struct mlx5vf_pci_core_device, core_device.vdev);

        mutex_lock(&mvdev->state_mutex);
        *curr_state = mvdev->mig_state;
        mlx5vf_state_mutex_unlock(mvdev);
        return 0;
}

static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
        struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

        if (!mvdev->migrate_cap)
                return;

        /*
         * As the higher VFIO layers are holding locks across reset and using
         * those same locks with the mm_lock we need to prevent ABBA deadlock
         * with the state_mutex and mm_lock.
         * In case the state_mutex was taken already we defer the cleanup work
         * to the unlock flow of the other running context.
         */
        spin_lock(&mvdev->reset_lock);
        mvdev->deferred_reset = true;
        if (!mutex_trylock(&mvdev->state_mutex)) {
                spin_unlock(&mvdev->reset_lock);
                return;
        }
        spin_unlock(&mvdev->reset_lock);
        mlx5vf_state_mutex_unlock(mvdev);
}

static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
        struct mlx5vf_pci_core_device *mvdev = container_of(
                core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
        struct vfio_pci_core_device *vdev = &mvdev->core_device;
        int ret;

        ret = vfio_pci_core_enable(vdev);
        if (ret)
                return ret;

        if (mvdev->migrate_cap)
                mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
        vfio_pci_core_finish_enable(vdev);
        return 0;
}

static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
        struct mlx5vf_pci_core_device *mvdev = container_of(
                core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);

        mlx5vf_cmd_close_migratable(mvdev);
        vfio_pci_core_close_device(core_vdev);
}

static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
        .migration_set_state = mlx5vf_pci_set_device_state,
        .migration_get_state = mlx5vf_pci_get_device_state,
        .migration_get_data_size = mlx5vf_pci_get_data_size,
};

static const struct vfio_log_ops mlx5vf_pci_log_ops = {
        .log_start = mlx5vf_start_page_tracker,
        .log_stop = mlx5vf_stop_page_tracker,
        .log_read_and_clear = mlx5vf_tracker_read_and_clear,
};

static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
{
        struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
                        struct mlx5vf_pci_core_device, core_device.vdev);
        int ret;

        ret = vfio_pci_core_init_dev(core_vdev);
        if (ret)
                return ret;

        mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
                                  &mlx5vf_pci_log_ops);

        return 0;
}

static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
{
        struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
                        struct mlx5vf_pci_core_device, core_device.vdev);

        mlx5vf_cmd_remove_migratable(mvdev);
        vfio_pci_core_release_dev(core_vdev);
}

static const struct vfio_device_ops mlx5vf_pci_ops = {
        .name = "mlx5-vfio-pci",
        .init = mlx5vf_pci_init_dev,
        .release = mlx5vf_pci_release_dev,
        .open_device = mlx5vf_pci_open_device,
        .close_device = mlx5vf_pci_close_device,
        .ioctl = vfio_pci_core_ioctl,
        .device_feature = vfio_pci_core_ioctl_feature,
        .read = vfio_pci_core_read,
        .write = vfio_pci_core_write,
        .mmap = vfio_pci_core_mmap,
        .request = vfio_pci_core_request,
        .match = vfio_pci_core_match,
        .bind_iommufd = vfio_iommufd_physical_bind,
        .unbind_iommufd = vfio_iommufd_physical_unbind,
        .attach_ioas = vfio_iommufd_physical_attach_ioas,
        .detach_ioas = vfio_iommufd_physical_detach_ioas,
};

static int mlx5vf_pci_probe(struct pci_dev *pdev,
                            const struct pci_device_id *id)
{
        struct mlx5vf_pci_core_device *mvdev;
        int ret;

        mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
                                  &pdev->dev, &mlx5vf_pci_ops);
        if (IS_ERR(mvdev))
                return PTR_ERR(mvdev);

        dev_set_drvdata(&pdev->dev, &mvdev->core_device);
        ret = vfio_pci_core_register_device(&mvdev->core_device);
        if (ret)
                goto out_put_vdev;
        return 0;

out_put_vdev:
        vfio_put_device(&mvdev->core_device.vdev);
        return ret;
}

static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
        struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

        vfio_pci_core_unregister_device(&mvdev->core_device);
        vfio_put_device(&mvdev->core_device.vdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
        { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
        {}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
        .reset_done = mlx5vf_pci_aer_reset_done,
        .error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
        .name = KBUILD_MODNAME,
        .id_table = mlx5vf_pci_table,
        .probe = mlx5vf_pci_probe,
        .remove = mlx5vf_pci_remove,
        .err_handler = &mlx5vf_err_handlers,
        .driver_managed_dma = true,
};

module_pci_driver(mlx5vf_pci_driver);

MODULE_IMPORT_NS("IOMMUFD");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
        "MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");