// SPDX-License-Identifier: GPL-2.0
/* Copyright(c) 2023 Advanced Micro Devices, Inc. */

#include <linux/interval_tree.h>
#include <linux/vfio.h>
#include <linux/vmalloc.h>

#include <linux/pds/pds_common.h>
#include <linux/pds/pds_core_if.h>
#include <linux/pds/pds_adminq.h>

/* local driver headers for the device structures and adminq command helpers */
#include "vfio_dev.h"
#include "cmds.h"
#include "dirty.h"

#define READ_SEQ true
#define WRITE_ACK false

bool pds_vfio_dirty_is_enabled(struct pds_vfio_pci_device *pds_vfio)
{
        return pds_vfio->dirty.is_enabled;
}

void pds_vfio_dirty_set_enabled(struct pds_vfio_pci_device *pds_vfio)
{
        pds_vfio->dirty.is_enabled = true;
}

void pds_vfio_dirty_set_disabled(struct pds_vfio_pci_device *pds_vfio)
{
        pds_vfio->dirty.is_enabled = false;
}

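/*
 * Query the device for its dirty region layout and log each region's
 * DMA base, page count, and page size for debugging. The region info
 * buffer is DMA mapped so the device can fill it in.
 */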
static void
pds_vfio_print_guest_region_info(struct pds_vfio_pci_device *pds_vfio,
                                 u8 max_regions)
{
        int len = max_regions * sizeof(struct pds_lm_dirty_region_info);
        struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
        struct device *pdsc_dev = &pci_physfn(pdev)->dev;
        struct pds_lm_dirty_region_info *region_info;
        dma_addr_t regions_dma;
        u8 num_regions;
        int err;

        region_info = kcalloc(max_regions,
                              sizeof(struct pds_lm_dirty_region_info),
                              GFP_KERNEL);
        if (!region_info)
                return;

        regions_dma =
                dma_map_single(pdsc_dev, region_info, len, DMA_FROM_DEVICE);
        if (dma_mapping_error(pdsc_dev, regions_dma))
                goto out_free_region_info;

        err = pds_vfio_dirty_status_cmd(pds_vfio, regions_dma, &max_regions,
                                        &num_regions);
        dma_unmap_single(pdsc_dev, regions_dma, len, DMA_FROM_DEVICE);
        if (err)
                goto out_free_region_info;

        for (unsigned int i = 0; i < num_regions; i++)
                dev_dbg(&pdev->dev,
                        "region_info[%d]: dma_base 0x%llx page_count %u page_size_log2 %u\n",
                        i, le64_to_cpu(region_info[i].dma_base),
                        le32_to_cpu(region_info[i].page_count),
                        region_info[i].page_size_log2);

out_free_region_info:
        kfree(region_info);
}

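/*
 * Allocate the host-side "seq" and "ack" bitmaps for a region. The seq
 * bitmap holds the latest dirty state read from the device, and the ack
 * bitmap holds the state the host has already acknowledged back.
 */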
static int pds_vfio_dirty_alloc_bitmaps(struct pds_vfio_region *region,
                                        unsigned long bytes)
{
        unsigned long *host_seq_bmp, *host_ack_bmp;

        host_seq_bmp = vzalloc(bytes);
        if (!host_seq_bmp)
                return -ENOMEM;

        host_ack_bmp = vzalloc(bytes);
        if (!host_ack_bmp) {
                /* vzalloc'd memory must be released with vfree */
                vfree(host_seq_bmp);
                return -ENOMEM;
        }

        region->host_seq = host_seq_bmp;
        region->host_ack = host_ack_bmp;
        region->bmp_bytes = bytes;

        return 0;
}

static void pds_vfio_dirty_free_bitmaps(struct pds_vfio_dirty *dirty)
{
        if (!dirty->regions)
                return;

        for (int i = 0; i < dirty->num_regions; i++) {
                struct pds_vfio_region *region = &dirty->regions[i];

                vfree(region->host_seq);
                vfree(region->host_ack);
                region->host_seq = NULL;
                region->host_ack = NULL;
                region->bmp_bytes = 0;
        }
}

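/* Unmap and free one region's scatter-gather list used for seq/ack transfers */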
static void __pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio,
                                      struct pds_vfio_region *region)
{
        struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
        struct device *pdsc_dev = &pci_physfn(pdev)->dev;

        dma_unmap_single(pdsc_dev, region->sgl_addr,
                         region->num_sge * sizeof(struct pds_lm_sg_elem),
                         DMA_BIDIRECTIONAL);
        kfree(region->sgl);

        region->num_sge = 0;
        region->sgl = NULL;
        region->sgl_addr = 0;
}

static void pds_vfio_dirty_free_sgl(struct pds_vfio_pci_device *pds_vfio)
{
        struct pds_vfio_dirty *dirty = &pds_vfio->dirty;

        if (!dirty->regions)
                return;

        for (int i = 0; i < dirty->num_regions; i++) {
                struct pds_vfio_region *region = &dirty->regions[i];

                if (region->sgl)
                        __pds_vfio_dirty_free_sgl(pds_vfio, region);
        }
}

static int pds_vfio_dirty_alloc_sgl(struct pds_vfio_pci_device *pds_vfio,
                                    struct pds_vfio_region *region,
                                    u32 page_count)
{
        struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
        struct device *pdsc_dev = &pci_physfn(pdev)->dev;
        struct pds_lm_sg_elem *sgl;
        dma_addr_t sgl_addr;
        size_t sgl_size;
        u32 max_sge;

        /* one SG element can describe up to one page's worth of bitmap bits */
        max_sge = DIV_ROUND_UP(page_count, PAGE_SIZE * 8);
        sgl_size = max_sge * sizeof(struct pds_lm_sg_elem);

        sgl = kzalloc(sgl_size, GFP_KERNEL);
        if (!sgl)
                return -ENOMEM;

        sgl_addr = dma_map_single(pdsc_dev, sgl, sgl_size, DMA_BIDIRECTIONAL);
        if (dma_mapping_error(pdsc_dev, sgl_addr)) {
                kfree(sgl);
                return -EIO;
        }

        region->sgl = sgl;
        region->num_sge = max_sge;
        region->sgl_addr = sgl_addr;

        return 0;
}

static void pds_vfio_dirty_free_regions(struct pds_vfio_dirty *dirty)
{
        vfree(dirty->regions);
        dirty->regions = NULL;
        dirty->num_regions = 0;
}

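/*
 * Build the driver's per-region state from the region info returned by
 * the device: per-region seq/ack bitmaps, a scatter-gather list for
 * transferring them, and each region's offset into the device bitmap.
 */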
static int pds_vfio_dirty_alloc_regions(struct pds_vfio_pci_device *pds_vfio,
                                        struct pds_lm_dirty_region_info *region_info,
                                        u64 region_page_size, u8 num_regions)
{
        struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
        struct pds_vfio_dirty *dirty = &pds_vfio->dirty;
        u32 dev_bmp_offset_byte = 0;
        int err;

        dirty->regions = vcalloc(num_regions, sizeof(struct pds_vfio_region));
        if (!dirty->regions)
                return -ENOMEM;
        dirty->num_regions = num_regions;

        for (int i = 0; i < num_regions; i++) {
                struct pds_lm_dirty_region_info *ri = &region_info[i];
                struct pds_vfio_region *region = &dirty->regions[i];
                u64 region_size, region_start;
                u32 page_count;

                /* page_count might be adjusted by the device */
                page_count = le32_to_cpu(ri->page_count);
                region_start = le64_to_cpu(ri->dma_base);
                region_size = page_count * region_page_size;

                err = pds_vfio_dirty_alloc_bitmaps(region,
                                                   page_count / BITS_PER_BYTE);
                if (err) {
                        dev_err(&pdev->dev, "Failed to alloc dirty bitmaps: %pe\n",
                                ERR_PTR(err));
                        goto out_free_regions;
                }

                err = pds_vfio_dirty_alloc_sgl(pds_vfio, region, page_count);
                if (err) {
                        dev_err(&pdev->dev, "Failed to alloc dirty sg lists: %pe\n",
                                ERR_PTR(err));
                        goto out_free_regions;
                }

                region->size = region_size;
                region->start = region_start;
                region->page_size = region_page_size;
                region->dev_bmp_offset_start_byte = dev_bmp_offset_byte;

                dev_bmp_offset_byte += page_count / BITS_PER_BYTE;
                if (dev_bmp_offset_byte % BITS_PER_BYTE) {
                        dev_err(&pdev->dev, "Device bitmap offset is mis-aligned\n");
                        err = -EINVAL;
                        goto out_free_regions;
                }
        }

        return 0;

out_free_regions:
        pds_vfio_dirty_free_bitmaps(dirty);
        pds_vfio_dirty_free_sgl(pds_vfio);
        pds_vfio_dirty_free_regions(dirty);

        return err;
}

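/*
 * Enable device dirty page tracking: query the device's region limits,
 * collapse the requested IOVA ranges down to what the device supports,
 * send the region layout to the device, and allocate the host-side
 * tracking state for each region.
 */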
static int pds_vfio_dirty_enable(struct pds_vfio_pci_device *pds_vfio,
                                 struct rb_root_cached *ranges, u32 nnodes,
                                 u64 *page_size)
{
        struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
        struct device *pdsc_dev = &pci_physfn(pdev)->dev;
        struct pds_lm_dirty_region_info *region_info;
        struct interval_tree_node *node = NULL;
        u64 region_page_size = *page_size;
        u8 max_regions = 0, num_regions;
        dma_addr_t regions_dma = 0;
        u32 num_ranges = nnodes;
        int err;
        int len;

        dev_dbg(&pdev->dev, "vf%u: Start dirty page tracking\n",
                pds_vfio->vf_id);

        if (pds_vfio_dirty_is_enabled(pds_vfio))
                return -EINVAL;

        /* find if dirty tracking is disabled, i.e. num_regions == 0 */
        err = pds_vfio_dirty_status_cmd(pds_vfio, 0, &max_regions,
                                        &num_regions);
        if (err < 0) {
                dev_err(&pdev->dev, "Failed to get dirty status, err %pe\n",
                        ERR_PTR(err));
                return err;
        } else if (num_regions) {
                dev_err(&pdev->dev,
                        "Dirty tracking already enabled for %d regions\n",
                        num_regions);
                return -EEXIST;
        } else if (!max_regions) {
                dev_err(&pdev->dev,
                        "Device doesn't support dirty tracking, max_regions %d\n",
                        max_regions);
                return -EOPNOTSUPP;
        }

        if (num_ranges > max_regions) {
                vfio_combine_iova_ranges(ranges, nnodes, max_regions);
                num_ranges = max_regions;
        }

        region_info = kcalloc(num_ranges, sizeof(*region_info), GFP_KERNEL);
        if (!region_info)
                return -ENOMEM;
        len = num_ranges * sizeof(*region_info);

        node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
        if (!node) {
                err = -EINVAL;
                goto out_free_region_info;
        }
        for (int i = 0; i < num_ranges; i++) {
                struct pds_lm_dirty_region_info *ri = &region_info[i];
                u64 region_size = node->last - node->start + 1;
                u64 region_start = node->start;
                u32 page_count;

                page_count = DIV_ROUND_UP(region_size, region_page_size);

                ri->dma_base = cpu_to_le64(region_start);
                ri->page_count = cpu_to_le32(page_count);
                ri->page_size_log2 = ilog2(region_page_size);

                dev_dbg(&pdev->dev,
                        "region_info[%d]: region_start 0x%llx region_end 0x%lx region_size 0x%llx page_count %u page_size %llu\n",
                        i, region_start, node->last, region_size, page_count,
                        region_page_size);

                node = interval_tree_iter_next(node, 0, ULONG_MAX);
                if (!node)
                        break;
        }

        regions_dma = dma_map_single(pdsc_dev, (void *)region_info, len,
                                     DMA_BIDIRECTIONAL);
        if (dma_mapping_error(pdsc_dev, regions_dma)) {
                err = -ENOMEM;
                goto out_free_region_info;
        }

        err = pds_vfio_dirty_enable_cmd(pds_vfio, regions_dma, num_ranges);
        dma_unmap_single(pdsc_dev, regions_dma, len, DMA_BIDIRECTIONAL);
        if (err)
                goto out_free_region_info;

        err = pds_vfio_dirty_alloc_regions(pds_vfio, region_info,
                                           region_page_size, num_ranges);
        if (err) {
                dev_err(&pdev->dev,
                        "Failed to allocate %d regions for tracking dirty regions: %pe\n",
                        num_regions, ERR_PTR(err));
                goto out_dirty_disable;
        }

        pds_vfio_dirty_set_enabled(pds_vfio);

        pds_vfio_print_guest_region_info(pds_vfio, max_regions);

        kfree(region_info);

        return 0;

out_dirty_disable:
        pds_vfio_dirty_disable_cmd(pds_vfio);
out_free_region_info:
        kfree(region_info);
        return err;
}

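/* Tear down dirty tracking state and, if requested, notify the device */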
void pds_vfio_dirty_disable(struct pds_vfio_pci_device *pds_vfio, bool send_cmd)
{
        if (pds_vfio_dirty_is_enabled(pds_vfio)) {
                pds_vfio_dirty_set_disabled(pds_vfio);
                if (send_cmd)
                        pds_vfio_dirty_disable_cmd(pds_vfio);
                pds_vfio_dirty_free_sgl(pds_vfio);
                pds_vfio_dirty_free_bitmaps(&pds_vfio->dirty);
                pds_vfio_dirty_free_regions(&pds_vfio->dirty);
        }

        if (send_cmd)
                pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_NONE);
}

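/*
 * Transfer one span of a host bitmap to or from the device. The
 * vmalloc'd bitmap is described to the device with a scatter-gather
 * list built from its backing pages; read_seq selects whether this is
 * a "read seq" (device to host) or "write ack" (host to device) pass.
 */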
static int pds_vfio_dirty_seq_ack(struct pds_vfio_pci_device *pds_vfio,
                                  struct pds_vfio_region *region,
                                  unsigned long *seq_ack_bmp, u32 offset,
                                  u32 bmp_bytes, bool read_seq)
{
        const char *bmp_type_str = read_seq ? "read_seq" : "write_ack";
        u8 dma_dir = read_seq ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
        struct pci_dev *pdev = pds_vfio->vfio_coredev.pdev;
        struct device *pdsc_dev = &pci_physfn(pdev)->dev;
        unsigned long long npages;
        struct sg_table sg_table;
        struct scatterlist *sg;
        struct page **pages;
        u32 page_offset;
        const void *bmp;
        size_t size;
        u16 num_sge;
        int err;
        int i;

        bmp = (void *)((u64)seq_ack_bmp + offset);
        page_offset = offset_in_page(bmp);
        bmp -= page_offset;

        /*
         * Start and end of bitmap section to seq/ack might not be page
         * aligned, so use the page_offset to account for that so there
         * will be enough pages to represent the bmp_bytes
         */
        npages = DIV_ROUND_UP_ULL(bmp_bytes + page_offset, PAGE_SIZE);
        pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
        if (!pages)
                return -ENOMEM;

        for (unsigned long long i = 0; i < npages; i++) {
                struct page *page = vmalloc_to_page(bmp);

                if (!page) {
                        err = -EFAULT;
                        goto out_free_pages;
                }

                pages[i] = page;
                bmp += PAGE_SIZE;
        }

        err = sg_alloc_table_from_pages(&sg_table, pages, npages, page_offset,
                                        bmp_bytes, GFP_KERNEL);
        if (err)
                goto out_free_pages;

        err = dma_map_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
        if (err)
                goto out_free_sg_table;

        for_each_sgtable_dma_sg(&sg_table, sg, i) {
                struct pds_lm_sg_elem *sg_elem = &region->sgl[i];

                sg_elem->addr = cpu_to_le64(sg_dma_address(sg));
                sg_elem->len = cpu_to_le32(sg_dma_len(sg));
        }

        num_sge = sg_table.nents;
        size = num_sge * sizeof(struct pds_lm_sg_elem);
        offset += region->dev_bmp_offset_start_byte;
        dma_sync_single_for_device(pdsc_dev, region->sgl_addr, size, dma_dir);
        err = pds_vfio_dirty_seq_ack_cmd(pds_vfio, region->sgl_addr, num_sge,
                                         offset, bmp_bytes, read_seq);
        if (err)
                dev_err(&pdev->dev,
                        "Dirty bitmap %s failed offset %u bmp_bytes %u num_sge %u DMA 0x%llx: %pe\n",
                        bmp_type_str, offset, bmp_bytes,
                        num_sge, region->sgl_addr, ERR_PTR(err));
        dma_sync_single_for_cpu(pdsc_dev, region->sgl_addr, size, dma_dir);

        dma_unmap_sgtable(pdsc_dev, &sg_table, dma_dir, 0);
out_free_sg_table:
        sg_free_table(&sg_table);
out_free_pages:
        kfree(pages);

        return err;
}

static int pds_vfio_dirty_write_ack(struct pds_vfio_pci_device *pds_vfio,
                                    struct pds_vfio_region *region,
                                    u32 offset, u32 len)
{
        return pds_vfio_dirty_seq_ack(pds_vfio, region, region->host_ack,
                                      offset, len, WRITE_ACK);
}

static int pds_vfio_dirty_read_seq(struct pds_vfio_pci_device *pds_vfio,
                                   struct pds_vfio_region *region,
                                   u32 offset, u32 len)
{
        return pds_vfio_dirty_seq_ack(pds_vfio, region, region->host_seq,
                                      offset, len, READ_SEQ);
}

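/*
 * Pages whose seq and ack bits differ have been dirtied since the last
 * sync; report each such page's IOVA to the caller's iova_bitmap and
 * copy seq into ack so the next pass only sees new changes.
 */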
static int pds_vfio_dirty_process_bitmaps(struct pds_vfio_pci_device *pds_vfio,
                                          struct pds_vfio_region *region,
                                          struct iova_bitmap *dirty_bitmap,
                                          u32 bmp_offset, u32 len_bytes)
{
        u64 page_size = region->page_size;
        u64 region_start = region->start;
        u32 bmp_offset_bit;
        __le64 *seq, *ack;
        int dword_count;

        dword_count = len_bytes / sizeof(u64);
        seq = (__le64 *)((u64)region->host_seq + bmp_offset);
        ack = (__le64 *)((u64)region->host_ack + bmp_offset);
        bmp_offset_bit = bmp_offset * 8;

        for (int i = 0; i < dword_count; i++) {
                u64 xor = le64_to_cpu(seq[i]) ^ le64_to_cpu(ack[i]);

                /* prepare for next write_ack call */
                ack[i] = seq[i];

                for (u8 bit_i = 0; bit_i < BITS_PER_TYPE(u64); ++bit_i) {
                        if (xor & BIT(bit_i)) {
                                u64 abs_bit_i = bmp_offset_bit +
                                                i * BITS_PER_TYPE(u64) + bit_i;
                                u64 addr = abs_bit_i * page_size + region_start;

                                iova_bitmap_set(dirty_bitmap, addr, page_size);
                        }
                }
        }

        return 0;
}

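/* Find the tracked region containing the given IOVA, if any */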
static struct pds_vfio_region *
pds_vfio_get_region(struct pds_vfio_pci_device *pds_vfio, unsigned long iova)
{
        struct pds_vfio_dirty *dirty = &pds_vfio->dirty;

        for (int i = 0; i < dirty->num_regions; i++) {
                struct pds_vfio_region *region = &dirty->regions[i];

                if (iova >= region->start &&
                    iova < (region->start + region->size))
                        return region;
        }

        return NULL;
}

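/*
 * Sync one IOVA span: read the device's seq bitmap for the span,
 * translate seq/ack differences into the caller's dirty bitmap, then
 * write back the ack bitmap.
 */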
static int pds_vfio_dirty_sync(struct pds_vfio_pci_device *pds_vfio,
                               struct iova_bitmap *dirty_bitmap,
                               unsigned long iova, unsigned long length)
{
        struct device *dev = &pds_vfio->vfio_coredev.pdev->dev;
        struct pds_vfio_region *region;
        u64 bmp_offset, bmp_bytes;
        u64 bitmap_size, pages;
        int err;

        dev_dbg(dev, "vf%u: Get dirty page bitmap\n", pds_vfio->vf_id);

        if (!pds_vfio_dirty_is_enabled(pds_vfio)) {
                dev_err(dev, "vf%u: Sync failed, dirty tracking is disabled\n",
                        pds_vfio->vf_id);
                return -EINVAL;
        }

        region = pds_vfio_get_region(pds_vfio, iova);
        if (!region) {
                dev_err(dev, "vf%u: Failed to find region that contains iova 0x%lx length 0x%lx\n",
                        pds_vfio->vf_id, iova, length);
                return -EINVAL;
        }

        pages = DIV_ROUND_UP(length, region->page_size);
        bitmap_size =
                round_up(pages, sizeof(u64) * BITS_PER_BYTE) / BITS_PER_BYTE;

        dev_dbg(dev,
                "vf%u: iova 0x%lx length %lu page_size %llu pages %llu bitmap_size %llu\n",
                pds_vfio->vf_id, iova, length, region->page_size,
                pages, bitmap_size);

        if (!length || ((iova - region->start + length) > region->size)) {
                dev_err(dev, "Invalid iova 0x%lx and/or length 0x%lx to sync\n",
                        iova, length);
                return -EINVAL;
        }

        /* bitmap is modified in 64 bit chunks */
        bmp_bytes = ALIGN(DIV_ROUND_UP(length / region->page_size,
                                       sizeof(u64)), sizeof(u64));
        if (bmp_bytes != bitmap_size) {
                dev_err(dev,
                        "Calculated bitmap bytes %llu not equal to bitmap size %llu\n",
                        bmp_bytes, bitmap_size);
                return -EINVAL;
        }

        if (bmp_bytes > region->bmp_bytes) {
                dev_err(dev,
                        "Calculated bitmap bytes %llu larger than region's cached bmp_bytes %llu\n",
                        bmp_bytes, region->bmp_bytes);
                return -EINVAL;
        }

        bmp_offset = DIV_ROUND_UP((iova - region->start) /
                                  region->page_size, sizeof(u64));

        dev_dbg(dev,
                "Syncing dirty bitmap, iova 0x%lx length 0x%lx, bmp_offset %llu bmp_bytes %llu\n",
                iova, length, bmp_offset, bmp_bytes);

        err = pds_vfio_dirty_read_seq(pds_vfio, region, bmp_offset, bmp_bytes);
        if (err)
                return err;

        err = pds_vfio_dirty_process_bitmaps(pds_vfio, region, dirty_bitmap,
                                             bmp_offset, bmp_bytes);
        if (err)
                return err;

        err = pds_vfio_dirty_write_ack(pds_vfio, region, bmp_offset, bmp_bytes);
        if (err)
                return err;

        return 0;
}

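/*
 * DMA logging entry points called by the vfio core; each takes
 * state_mutex to serialize against migration state changes.
 */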
int pds_vfio_dma_logging_report(struct vfio_device *vdev, unsigned long iova,
                                unsigned long length, struct iova_bitmap *dirty)
{
        struct pds_vfio_pci_device *pds_vfio =
                container_of(vdev, struct pds_vfio_pci_device,
                             vfio_coredev.vdev);
        int err;

        mutex_lock(&pds_vfio->state_mutex);
        err = pds_vfio_dirty_sync(pds_vfio, dirty, iova, length);
        mutex_unlock(&pds_vfio->state_mutex);

        return err;
}

int pds_vfio_dma_logging_start(struct vfio_device *vdev,
                               struct rb_root_cached *ranges, u32 nnodes,
                               u64 *page_size)
{
        struct pds_vfio_pci_device *pds_vfio =
                container_of(vdev, struct pds_vfio_pci_device,
                             vfio_coredev.vdev);
        int err;

        mutex_lock(&pds_vfio->state_mutex);
        pds_vfio_send_host_vf_lm_status_cmd(pds_vfio, PDS_LM_STA_IN_PROGRESS);
        err = pds_vfio_dirty_enable(pds_vfio, ranges, nnodes, page_size);
        mutex_unlock(&pds_vfio->state_mutex);

        return err;
}

int pds_vfio_dma_logging_stop(struct vfio_device *vdev)
{
        struct pds_vfio_pci_device *pds_vfio =
                container_of(vdev, struct pds_vfio_pci_device,
                             vfio_coredev.vdev);

        mutex_lock(&pds_vfio->state_mutex);
        pds_vfio_dirty_disable(pds_vfio, true);
        mutex_unlock(&pds_vfio->state_mutex);

        return 0;
}