/*
 * KVMGT - the implementation of Intel mediated pass-through framework for KVM
 *
 * Copyright(c) 2014-2016 Intel Corporation. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Kevin Tian <kevin.tian@intel.com>
 *    Jike Song <jike.song@intel.com>
 *    Xiaoguang Chen <xiaoguang.chen@intel.com>
 */
#include <linux/init.h>
#include <linux/device.h>
#include <linux/mmu_context.h>
#include <linux/sched/mm.h>
#include <linux/types.h>
#include <linux/list.h>
#include <linux/rbtree.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/uuid.h>
#include <linux/kvm_host.h>
#include <linux/vfio.h>
#include <linux/mdev.h>
#include <linux/debugfs.h>

#include <linux/nospec.h>
static const struct intel_gvt_ops *intel_gvt_ops;
/* helper macros copied from vfio-pci */
#define VFIO_PCI_OFFSET_SHIFT   40
#define VFIO_PCI_OFFSET_TO_INDEX(off)   (off >> VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_INDEX_TO_OFFSET(index) ((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
#define VFIO_PCI_OFFSET_MASK    (((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)

#define EDID_BLOB_OFFSET (PAGE_SIZE/2)

#define OPREGION_SIGNATURE "IntelGraphicsMem"
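
/*
 * These helpers pack a VFIO region index into the upper bits of the file
 * offset. For example, with VFIO_PCI_OFFSET_SHIFT == 40, an access to
 * region index 2 at in-region offset 0x100 uses *ppos == (2ULL << 40) | 0x100;
 * VFIO_PCI_OFFSET_TO_INDEX() then recovers 2, and (*ppos & VFIO_PCI_OFFSET_MASK)
 * recovers 0x100.
 */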
struct intel_vgpu_regops {
	size_t (*rw)(struct intel_vgpu *vgpu, char *buf,
			size_t count, loff_t *ppos, bool iswrite);
	void (*release)(struct intel_vgpu *vgpu,
			struct vfio_region *region);
};

struct vfio_region {
	/* ... */
	const struct intel_vgpu_regops	*ops;
	/* ... */
};
struct vfio_edid_region {
	struct vfio_region_gfx_edid vfio_edid_regs;
	void *edid_blob;
};

struct kvmgt_pgfn {
	gfn_t gfn;
	struct hlist_node hnode;
};

struct kvmgt_guest_info {
	struct kvm *kvm;
	struct intel_vgpu *vgpu;
	struct kvm_page_track_notifier_node track_node;
#define NR_BKT (1 << 18)
	struct hlist_head ptable[NR_BKT];
#undef NR_BKT
	struct dentry *debugfs_cache_entries;
};

struct gvt_dma {
	struct intel_vgpu *vgpu;
	struct rb_node gfn_node;
	struct rb_node dma_addr_node;
	gfn_t gfn;
	dma_addr_t dma_addr;
	unsigned long size;
	struct kref ref;
};
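
/*
 * Each pinned guest page is tracked by one struct gvt_dma that is linked
 * into two rb-trees at once: gfn_cache, keyed by guest frame number, and
 * dma_addr_cache, keyed by the DMA address returned by dma_map_page().
 * This lets kvmgt_dma_map_guest_page() (gfn -> dma_addr) and
 * kvmgt_dma_unmap_guest_page() (dma_addr -> entry) both find the entry in
 * O(log n), while the kref keeps it alive across repeated mappings of the
 * same gfn.
 */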
static inline bool handle_valid(unsigned long handle)
{
	return !!(handle & ~0xff);
}
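
/*
 * Before kvmgt_guest_init() runs, vgpu->handle only holds a small magic
 * value, so handle_valid() treats anything that fits entirely in the low
 * byte as "no guest attached". Once the guest is initialized, the handle
 * stores the kvmgt_guest_info pointer, which is always larger.
 */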
static int kvmgt_guest_init(struct mdev_device *mdev);
static void intel_vgpu_release_work(struct work_struct *work);
static bool kvmgt_guest_exit(struct kvmgt_guest_info *info);
static void gvt_unpin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
		unsigned long size)
{
	int total_pages;
	int npage;
	int ret;

	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;

	for (npage = 0; npage < total_pages; npage++) {
		unsigned long cur_gfn = gfn + npage;

		ret = vfio_unpin_pages(mdev_dev(vgpu->vdev.mdev), &cur_gfn, 1);
		WARN_ON(ret != 1);
	}
}
/* Pin a normal or compound guest page for dma. */
static int gvt_pin_guest_page(struct intel_vgpu *vgpu, unsigned long gfn,
		unsigned long size, struct page **page)
{
	unsigned long base_pfn = 0;
	int total_pages;
	int npage;
	int ret;

	total_pages = roundup(size, PAGE_SIZE) / PAGE_SIZE;
	/*
	 * We pin the pages one-by-one to avoid allocating a big array
	 * on stack to hold pfns.
	 */
	for (npage = 0; npage < total_pages; npage++) {
		unsigned long cur_gfn = gfn + npage;
		unsigned long pfn;

		ret = vfio_pin_pages(mdev_dev(vgpu->vdev.mdev), &cur_gfn, 1,
				     IOMMU_READ | IOMMU_WRITE, &pfn);
		if (ret != 1) {
			gvt_vgpu_err("vfio_pin_pages failed for gfn 0x%lx, ret %d\n",
				     cur_gfn, ret);
			goto err;
		}

		if (!pfn_valid(pfn)) {
			gvt_vgpu_err("pfn 0x%lx is not mem backed\n", pfn);
			npage++;
			ret = -EFAULT;
			goto err;
		}

		if (npage == 0)
			base_pfn = pfn;
		else if (base_pfn + npage != pfn) {
			gvt_vgpu_err("The pages are not continuous\n");
			ret = -EINVAL;
			npage++;
			goto err;
		}
	}

	*page = pfn_to_page(base_pfn);
	return 0;
err:
	gvt_unpin_guest_page(vgpu, gfn, npage * PAGE_SIZE);
	return ret;
}
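
/*
 * The loop above both pins each 4K guest page through VFIO and checks that
 * the resulting host pfns are contiguous, since the caller maps the whole
 * range with a single dma_map_page() on the first page. A guest huge page
 * whose backing host pages are scattered is therefore rejected rather than
 * silently mis-mapped.
 */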
static int gvt_dma_map_page(struct intel_vgpu *vgpu, unsigned long gfn,
		dma_addr_t *dma_addr, unsigned long size)
{
	struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;
	struct page *page = NULL;
	int ret;

	ret = gvt_pin_guest_page(vgpu, gfn, size, &page);
	if (ret)
		return ret;

	/* Setup DMA mapping. */
	*dma_addr = dma_map_page(dev, page, 0, size, PCI_DMA_BIDIRECTIONAL);
	if (dma_mapping_error(dev, *dma_addr)) {
		gvt_vgpu_err("DMA mapping failed for pfn 0x%lx, ret %d\n",
			     page_to_pfn(page), ret);
		gvt_unpin_guest_page(vgpu, gfn, size);
		return -ENOMEM;
	}

	return 0;
}
static void gvt_dma_unmap_page(struct intel_vgpu *vgpu, unsigned long gfn,
		dma_addr_t dma_addr, unsigned long size)
{
	struct device *dev = &vgpu->gvt->dev_priv->drm.pdev->dev;

	dma_unmap_page(dev, dma_addr, size, PCI_DMA_BIDIRECTIONAL);
	gvt_unpin_guest_page(vgpu, gfn, size);
}
static struct gvt_dma *__gvt_cache_find_dma_addr(struct intel_vgpu *vgpu,
		dma_addr_t dma_addr)
{
	struct rb_node *node = vgpu->vdev.dma_addr_cache.rb_node;
	struct gvt_dma *itr;

	while (node) {
		itr = rb_entry(node, struct gvt_dma, dma_addr_node);

		if (dma_addr < itr->dma_addr)
			node = node->rb_left;
		else if (dma_addr > itr->dma_addr)
			node = node->rb_right;
		else
			return itr;
	}
	return NULL;
}
static struct gvt_dma *__gvt_cache_find_gfn(struct intel_vgpu *vgpu, gfn_t gfn)
{
	struct rb_node *node = vgpu->vdev.gfn_cache.rb_node;
	struct gvt_dma *itr;

	while (node) {
		itr = rb_entry(node, struct gvt_dma, gfn_node);

		if (gfn < itr->gfn)
			node = node->rb_left;
		else if (gfn > itr->gfn)
			node = node->rb_right;
		else
			return itr;
	}
	return NULL;
}
static int __gvt_cache_add(struct intel_vgpu *vgpu, gfn_t gfn,
		dma_addr_t dma_addr, unsigned long size)
{
	struct gvt_dma *new, *itr;
	struct rb_node **link, *parent = NULL;

	new = kzalloc(sizeof(struct gvt_dma), GFP_KERNEL);
	if (!new)
		return -ENOMEM;

	new->vgpu = vgpu;
	new->gfn = gfn;
	new->dma_addr = dma_addr;
	new->size = size;
	kref_init(&new->ref);

	/* gfn_cache maps gfn to struct gvt_dma. */
	link = &vgpu->vdev.gfn_cache.rb_node;
	while (*link) {
		parent = *link;
		itr = rb_entry(parent, struct gvt_dma, gfn_node);

		if (gfn < itr->gfn)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	rb_link_node(&new->gfn_node, parent, link);
	rb_insert_color(&new->gfn_node, &vgpu->vdev.gfn_cache);

	/* dma_addr_cache maps dma addr to struct gvt_dma. */
	parent = NULL;
	link = &vgpu->vdev.dma_addr_cache.rb_node;
	while (*link) {
		parent = *link;
		itr = rb_entry(parent, struct gvt_dma, dma_addr_node);

		if (dma_addr < itr->dma_addr)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}
	rb_link_node(&new->dma_addr_node, parent, link);
	rb_insert_color(&new->dma_addr_node, &vgpu->vdev.dma_addr_cache);

	vgpu->vdev.nr_cache_entries++;
	return 0;
}
static void __gvt_cache_remove_entry(struct intel_vgpu *vgpu,
		struct gvt_dma *entry)
{
	rb_erase(&entry->gfn_node, &vgpu->vdev.gfn_cache);
	rb_erase(&entry->dma_addr_node, &vgpu->vdev.dma_addr_cache);
	kfree(entry);
	vgpu->vdev.nr_cache_entries--;
}
static void gvt_cache_destroy(struct intel_vgpu *vgpu)
{
	struct gvt_dma *dma;
	struct rb_node *node = NULL;

	for (;;) {
		mutex_lock(&vgpu->vdev.cache_lock);
		node = rb_first(&vgpu->vdev.gfn_cache);
		if (!node) {
			mutex_unlock(&vgpu->vdev.cache_lock);
			break;
		}
		dma = rb_entry(node, struct gvt_dma, gfn_node);
		gvt_dma_unmap_page(vgpu, dma->gfn, dma->dma_addr, dma->size);
		__gvt_cache_remove_entry(vgpu, dma);
		mutex_unlock(&vgpu->vdev.cache_lock);
	}
}
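
/*
 * The destroy path repeatedly takes cache_lock, removes the first entry of
 * gfn_cache and releases the lock, until the tree is empty; each entry's
 * pages are unpinned and its DMA mapping torn down via gvt_dma_unmap_page()
 * before the entry is erased from both trees.
 */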
static void gvt_cache_init(struct intel_vgpu *vgpu)
{
	vgpu->vdev.gfn_cache = RB_ROOT;
	vgpu->vdev.dma_addr_cache = RB_ROOT;
	vgpu->vdev.nr_cache_entries = 0;
	mutex_init(&vgpu->vdev.cache_lock);
}

static void kvmgt_protect_table_init(struct kvmgt_guest_info *info)
{
	hash_init(info->ptable);
}
static void kvmgt_protect_table_destroy(struct kvmgt_guest_info *info)
{
	struct kvmgt_pgfn *p;
	struct hlist_node *tmp;
	int i;

	hash_for_each_safe(info->ptable, i, tmp, p, hnode) {
		hash_del(&p->hnode);
		kfree(p);
	}
}

static struct kvmgt_pgfn *
__kvmgt_protect_table_find(struct kvmgt_guest_info *info, gfn_t gfn)
{
	struct kvmgt_pgfn *p, *res = NULL;

	hash_for_each_possible(info->ptable, p, hnode, gfn) {
		if (gfn == p->gfn) {
			res = p;
			break;
		}
	}

	return res;
}

static bool kvmgt_gfn_is_write_protected(struct kvmgt_guest_info *info,
		gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	p = __kvmgt_protect_table_find(info, gfn);
	return !!p;
}

static void kvmgt_protect_table_add(struct kvmgt_guest_info *info, gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	if (kvmgt_gfn_is_write_protected(info, gfn))
		return;

	p = kzalloc(sizeof(struct kvmgt_pgfn), GFP_ATOMIC);
	if (WARN(!p, "gfn: 0x%llx\n", gfn))
		return;

	p->gfn = gfn;
	hash_add(info->ptable, &p->hnode, gfn);
}

static void kvmgt_protect_table_del(struct kvmgt_guest_info *info,
		gfn_t gfn)
{
	struct kvmgt_pgfn *p;

	p = __kvmgt_protect_table_find(info, gfn);
	if (p) {
		hash_del(&p->hnode);
		kfree(p);
	}
}
static size_t intel_vgpu_reg_rw_opregion(struct intel_vgpu *vgpu, char *buf,
		size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
			VFIO_PCI_NUM_REGIONS;
	void *base = vgpu->vdev.region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;

	if (pos >= vgpu->vdev.region[i].size || iswrite) {
		gvt_vgpu_err("invalid op or offset for Intel vgpu OpRegion\n");
		return -EINVAL;
	}
	count = min(count, (size_t)(vgpu->vdev.region[i].size - pos));
	memcpy(buf, base + pos, count);

	return count;
}

static void intel_vgpu_reg_release_opregion(struct intel_vgpu *vgpu,
		struct vfio_region *region)
{
}

static const struct intel_vgpu_regops intel_vgpu_regops_opregion = {
	.rw = intel_vgpu_reg_rw_opregion,
	.release = intel_vgpu_reg_release_opregion,
};
static int handle_edid_regs(struct intel_vgpu *vgpu,
			struct vfio_edid_region *region, char *buf,
			size_t count, u16 offset, bool is_write)
{
	struct vfio_region_gfx_edid *regs = &region->vfio_edid_regs;
	unsigned int data;

	if (offset + count > sizeof(*regs))
		return -EINVAL;

	if (is_write) {
		data = *((unsigned int *)buf);
		switch (offset) {
		case offsetof(struct vfio_region_gfx_edid, link_state):
			if (data == VFIO_DEVICE_GFX_LINK_STATE_UP) {
				if (!drm_edid_block_valid(
					(u8 *)region->edid_blob,
					0, true, NULL)) {
					gvt_vgpu_err("invalid EDID blob\n");
					return -EINVAL;
				}
				intel_gvt_ops->emulate_hotplug(vgpu, true);
			} else if (data == VFIO_DEVICE_GFX_LINK_STATE_DOWN)
				intel_gvt_ops->emulate_hotplug(vgpu, false);
			else {
				gvt_vgpu_err("invalid EDID link state %d\n",
					regs->link_state);
				return -EINVAL;
			}
			regs->link_state = data;
			break;
		case offsetof(struct vfio_region_gfx_edid, edid_size):
			if (data > regs->edid_max_size) {
				gvt_vgpu_err("EDID size is bigger than %d!\n",
					regs->edid_max_size);
				return -EINVAL;
			}
			regs->edid_size = data;
			break;
		default:
			gvt_vgpu_err("write read-only EDID region at offset %d\n",
				offset);
			return -EPERM;
		}
	} else {
		memcpy(buf, (char *)regs + offset, count);
	}

	return count;
}
static int handle_edid_blob(struct vfio_edid_region *region, char *buf,
			size_t count, u16 offset, bool is_write)
{
	if (offset + count > region->vfio_edid_regs.edid_size)
		return -EINVAL;

	if (is_write)
		memcpy(region->edid_blob + offset, buf, count);
	else
		memcpy(buf, region->edid_blob + offset, count);

	return count;
}
static size_t intel_vgpu_reg_rw_edid(struct intel_vgpu *vgpu, char *buf,
		size_t count, loff_t *ppos, bool iswrite)
{
	int ret;
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) -
			VFIO_PCI_NUM_REGIONS;
	struct vfio_edid_region *region =
		(struct vfio_edid_region *)vgpu->vdev.region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;

	if (pos < region->vfio_edid_regs.edid_offset) {
		ret = handle_edid_regs(vgpu, region, buf, count, pos, iswrite);
	} else {
		pos -= EDID_BLOB_OFFSET;
		ret = handle_edid_blob(region, buf, count, pos, iswrite);
	}

	if (ret < 0)
		gvt_vgpu_err("failed to access EDID region\n");

	return ret;
}

static void intel_vgpu_reg_release_edid(struct intel_vgpu *vgpu,
		struct vfio_region *region)
{
	kfree(region->data);
}

static const struct intel_vgpu_regops intel_vgpu_regops_edid = {
	.rw = intel_vgpu_reg_rw_edid,
	.release = intel_vgpu_reg_release_edid,
};
static int intel_vgpu_register_reg(struct intel_vgpu *vgpu,
		unsigned int type, unsigned int subtype,
		const struct intel_vgpu_regops *ops,
		size_t size, u32 flags, void *data)
{
	struct vfio_region *region;

	region = krealloc(vgpu->vdev.region,
			(vgpu->vdev.num_regions + 1) * sizeof(*region),
			GFP_KERNEL);
	if (!region)
		return -ENOMEM;

	vgpu->vdev.region = region;
	vgpu->vdev.region[vgpu->vdev.num_regions].type = type;
	vgpu->vdev.region[vgpu->vdev.num_regions].subtype = subtype;
	vgpu->vdev.region[vgpu->vdev.num_regions].ops = ops;
	vgpu->vdev.region[vgpu->vdev.num_regions].size = size;
	vgpu->vdev.region[vgpu->vdev.num_regions].flags = flags;
	vgpu->vdev.region[vgpu->vdev.num_regions].data = data;
	vgpu->vdev.num_regions++;
	return 0;
}
static int kvmgt_get_vfio_device(void *p_vgpu)
{
	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;

	vgpu->vdev.vfio_device = vfio_device_get_from_dev(
		mdev_dev(vgpu->vdev.mdev));
	if (!vgpu->vdev.vfio_device) {
		gvt_vgpu_err("failed to get vfio device\n");
		return -ENODEV;
	}
	return 0;
}

static int kvmgt_set_opregion(void *p_vgpu)
{
	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
	void *base;
	int ret;

	/* Each vgpu has its own opregion, although VFIO would create another
	 * one later. This one is only used to expose the opregion to VFIO;
	 * the one created by VFIO later is what the guest actually uses.
	 */
	base = vgpu_opregion(vgpu)->va;
	if (!base)
		return -ENOMEM;

	if (memcmp(base, OPREGION_SIGNATURE, 16))
		return -EINVAL;

	ret = intel_vgpu_register_reg(vgpu,
			PCI_VENDOR_ID_INTEL | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
			VFIO_REGION_SUBTYPE_INTEL_IGD_OPREGION,
			&intel_vgpu_regops_opregion, OPREGION_SIZE,
			VFIO_REGION_INFO_FLAG_READ, base);

	return ret;
}
static int kvmgt_set_edid(void *p_vgpu, int port_num)
{
	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;
	struct intel_vgpu_port *port = intel_vgpu_port(vgpu, port_num);
	struct vfio_edid_region *base;
	int ret;

	base = kzalloc(sizeof(*base), GFP_KERNEL);
	if (!base)
		return -ENOMEM;

	/* TODO: Add multi-port and EDID extension block support */
	base->vfio_edid_regs.edid_offset = EDID_BLOB_OFFSET;
	base->vfio_edid_regs.edid_max_size = EDID_SIZE;
	base->vfio_edid_regs.edid_size = EDID_SIZE;
	base->vfio_edid_regs.max_xres = vgpu_edid_xres(port->id);
	base->vfio_edid_regs.max_yres = vgpu_edid_yres(port->id);
	base->edid_blob = port->edid->edid_block;

	ret = intel_vgpu_register_reg(vgpu,
			VFIO_REGION_TYPE_GFX,
			VFIO_REGION_SUBTYPE_GFX_EDID,
			&intel_vgpu_regops_edid, EDID_SIZE,
			VFIO_REGION_INFO_FLAG_READ |
			VFIO_REGION_INFO_FLAG_WRITE |
			VFIO_REGION_INFO_FLAG_CAPS, base);

	return ret;
}

static void kvmgt_put_vfio_device(void *vgpu)
{
	if (WARN_ON(!((struct intel_vgpu *)vgpu)->vdev.vfio_device))
		return;

	vfio_device_put(((struct intel_vgpu *)vgpu)->vdev.vfio_device);
}
static int intel_vgpu_create(struct kobject *kobj, struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = NULL;
	struct intel_vgpu_type *type;
	struct device *pdev;
	void *gvt;
	int ret;

	pdev = mdev_parent_dev(mdev);
	gvt = kdev_to_i915(pdev)->gvt;

	type = intel_gvt_ops->gvt_find_vgpu_type(gvt, kobject_name(kobj));
	if (!type) {
		gvt_vgpu_err("failed to find type %s to create\n",
				kobject_name(kobj));
		ret = -EINVAL;
		goto out;
	}

	vgpu = intel_gvt_ops->vgpu_create(gvt, type);
	if (IS_ERR_OR_NULL(vgpu)) {
		ret = vgpu == NULL ? -EFAULT : PTR_ERR(vgpu);
		gvt_err("failed to create intel vgpu: %d\n", ret);
		goto out;
	}

	INIT_WORK(&vgpu->vdev.release_work, intel_vgpu_release_work);

	vgpu->vdev.mdev = mdev;
	mdev_set_drvdata(mdev, vgpu);

	gvt_dbg_core("intel_vgpu_create succeeded for mdev: %s\n",
		     dev_name(mdev_dev(mdev)));
	ret = 0;

out:
	return ret;
}
static int intel_vgpu_remove(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

	if (handle_valid(vgpu->handle))
		return -EBUSY;

	intel_gvt_ops->vgpu_destroy(vgpu);
	return 0;
}
static int intel_vgpu_iommu_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct intel_vgpu *vgpu = container_of(nb,
					struct intel_vgpu,
					vdev.iommu_notifier);

	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
		struct vfio_iommu_type1_dma_unmap *unmap = data;
		struct gvt_dma *entry;
		unsigned long iov_pfn, end_iov_pfn;

		iov_pfn = unmap->iova >> PAGE_SHIFT;
		end_iov_pfn = iov_pfn + unmap->size / PAGE_SIZE;

		mutex_lock(&vgpu->vdev.cache_lock);
		for (; iov_pfn < end_iov_pfn; iov_pfn++) {
			entry = __gvt_cache_find_gfn(vgpu, iov_pfn);
			if (!entry)
				continue;

			gvt_dma_unmap_page(vgpu, entry->gfn, entry->dma_addr,
					   entry->size);
			__gvt_cache_remove_entry(vgpu, entry);
		}
		mutex_unlock(&vgpu->vdev.cache_lock);
	}

	return NOTIFY_OK;
}
static int intel_vgpu_group_notifier(struct notifier_block *nb,
				     unsigned long action, void *data)
{
	struct intel_vgpu *vgpu = container_of(nb,
					struct intel_vgpu,
					vdev.group_notifier);

	/* the only action we care about */
	if (action == VFIO_GROUP_NOTIFY_SET_KVM) {
		vgpu->vdev.kvm = data;

		if (!data)
			schedule_work(&vgpu->vdev.release_work);
	}

	return NOTIFY_OK;
}
static int intel_vgpu_open(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned long events;
	int ret;

	vgpu->vdev.iommu_notifier.notifier_call = intel_vgpu_iommu_notifier;
	vgpu->vdev.group_notifier.notifier_call = intel_vgpu_group_notifier;

	events = VFIO_IOMMU_NOTIFY_DMA_UNMAP;
	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY, &events,
				&vgpu->vdev.iommu_notifier);
	if (ret != 0) {
		gvt_vgpu_err("vfio_register_notifier for iommu failed: %d\n",
			ret);
		goto out;
	}

	events = VFIO_GROUP_NOTIFY_SET_KVM;
	ret = vfio_register_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY, &events,
				&vgpu->vdev.group_notifier);
	if (ret != 0) {
		gvt_vgpu_err("vfio_register_notifier for group failed: %d\n",
			ret);
		goto undo_iommu;
	}

	/* Take a module reference as mdev core doesn't take
	 * a reference for vendor driver.
	 */
	if (!try_module_get(THIS_MODULE))
		goto undo_group;

	ret = kvmgt_guest_init(mdev);
	if (ret)
		goto undo_group;

	intel_gvt_ops->vgpu_activate(vgpu);

	atomic_set(&vgpu->vdev.released, 0);
	return ret;

undo_group:
	vfio_unregister_notifier(mdev_dev(mdev), VFIO_GROUP_NOTIFY,
				&vgpu->vdev.group_notifier);

undo_iommu:
	vfio_unregister_notifier(mdev_dev(mdev), VFIO_IOMMU_NOTIFY,
				&vgpu->vdev.iommu_notifier);
out:
	return ret;
}
static void intel_vgpu_release_msi_eventfd_ctx(struct intel_vgpu *vgpu)
{
	struct eventfd_ctx *trigger;

	trigger = vgpu->vdev.msi_trigger;
	if (trigger) {
		eventfd_ctx_put(trigger);
		vgpu->vdev.msi_trigger = NULL;
	}
}
static void __intel_vgpu_release(struct intel_vgpu *vgpu)
{
	struct kvmgt_guest_info *info;
	int ret;

	if (!handle_valid(vgpu->handle))
		return;

	if (atomic_cmpxchg(&vgpu->vdev.released, 0, 1))
		return;

	intel_gvt_ops->vgpu_release(vgpu);

	ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_IOMMU_NOTIFY,
					&vgpu->vdev.iommu_notifier);
	WARN(ret, "vfio_unregister_notifier for iommu failed: %d\n", ret);

	ret = vfio_unregister_notifier(mdev_dev(vgpu->vdev.mdev), VFIO_GROUP_NOTIFY,
					&vgpu->vdev.group_notifier);
	WARN(ret, "vfio_unregister_notifier for group failed: %d\n", ret);

	/* dereference module reference taken at open */
	module_put(THIS_MODULE);

	info = (struct kvmgt_guest_info *)vgpu->handle;
	kvmgt_guest_exit(info);

	intel_vgpu_release_msi_eventfd_ctx(vgpu);

	vgpu->vdev.kvm = NULL;
	vgpu->handle = 0;
}

static void intel_vgpu_release(struct mdev_device *mdev)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);

	__intel_vgpu_release(vgpu);
}

static void intel_vgpu_release_work(struct work_struct *work)
{
	struct intel_vgpu *vgpu = container_of(work, struct intel_vgpu,
					vdev.release_work);

	__intel_vgpu_release(vgpu);
}
static u64 intel_vgpu_get_bar_addr(struct intel_vgpu *vgpu, int bar)
{
	u32 start_lo, start_hi;
	u32 mem_type;

	start_lo = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
			PCI_BASE_ADDRESS_MEM_MASK;
	mem_type = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space + bar)) &
			PCI_BASE_ADDRESS_MEM_TYPE_MASK;

	switch (mem_type) {
	case PCI_BASE_ADDRESS_MEM_TYPE_64:
		start_hi = (*(u32 *)(vgpu->cfg_space.virtual_cfg_space
						+ bar + 4));
		break;
	case PCI_BASE_ADDRESS_MEM_TYPE_32:
	case PCI_BASE_ADDRESS_MEM_TYPE_1M:
		/* 1M mem BAR treated as 32-bit BAR */
	default:
		/* mem unknown type treated as 32-bit BAR */
		start_hi = 0;
		break;
	}

	return ((u64)start_hi << 32) | start_lo;
}
static int intel_vgpu_bar_rw(struct intel_vgpu *vgpu, int bar, u64 off,
			void *buf, unsigned int count, bool is_write)
{
	u64 bar_start = intel_vgpu_get_bar_addr(vgpu, bar);
	int ret;

	if (is_write)
		ret = intel_gvt_ops->emulate_mmio_write(vgpu,
					bar_start + off, buf, count);
	else
		ret = intel_gvt_ops->emulate_mmio_read(vgpu,
					bar_start + off, buf, count);
	return ret;
}

static inline bool intel_vgpu_in_aperture(struct intel_vgpu *vgpu, u64 off)
{
	return off >= vgpu_aperture_offset(vgpu) &&
	       off < vgpu_aperture_offset(vgpu) + vgpu_aperture_sz(vgpu);
}
static int intel_vgpu_aperture_rw(struct intel_vgpu *vgpu, u64 off,
		void *buf, unsigned long count, bool is_write)
{
	void __iomem *aperture_va;

	if (!intel_vgpu_in_aperture(vgpu, off) ||
	    !intel_vgpu_in_aperture(vgpu, off + count)) {
		gvt_vgpu_err("Invalid aperture offset %llu\n", off);
		return -EINVAL;
	}

	aperture_va = io_mapping_map_wc(&vgpu->gvt->dev_priv->ggtt.iomap,
					ALIGN_DOWN(off, PAGE_SIZE),
					count + offset_in_page(off));
	if (!aperture_va)
		return -EIO;

	if (is_write)
		memcpy_toio(aperture_va + offset_in_page(off), buf, count);
	else
		memcpy_fromio(buf, aperture_va + offset_in_page(off), count);

	io_mapping_unmap(aperture_va);

	return 0;
}
static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf,
			size_t count, loff_t *ppos, bool is_write)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	u64 pos = *ppos & VFIO_PCI_OFFSET_MASK;
	int ret = -EINVAL;

	if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions) {
		gvt_vgpu_err("invalid index: %u\n", index);
		return -EINVAL;
	}

	switch (index) {
	case VFIO_PCI_CONFIG_REGION_INDEX:
		if (is_write)
			ret = intel_gvt_ops->emulate_cfg_write(vgpu, pos,
						buf, count);
		else
			ret = intel_gvt_ops->emulate_cfg_read(vgpu, pos,
						buf, count);
		break;
	case VFIO_PCI_BAR0_REGION_INDEX:
		ret = intel_vgpu_bar_rw(vgpu, PCI_BASE_ADDRESS_0, pos,
					buf, count, is_write);
		break;
	case VFIO_PCI_BAR2_REGION_INDEX:
		ret = intel_vgpu_aperture_rw(vgpu, pos, buf, count, is_write);
		break;
	case VFIO_PCI_BAR1_REGION_INDEX:
	case VFIO_PCI_BAR3_REGION_INDEX:
	case VFIO_PCI_BAR4_REGION_INDEX:
	case VFIO_PCI_BAR5_REGION_INDEX:
	case VFIO_PCI_VGA_REGION_INDEX:
	case VFIO_PCI_ROM_REGION_INDEX:
		break;
	default:
		if (index >= VFIO_PCI_NUM_REGIONS + vgpu->vdev.num_regions)
			return -EINVAL;

		index -= VFIO_PCI_NUM_REGIONS;
		return vgpu->vdev.region[index].ops->rw(vgpu, buf, count,
				ppos, is_write);
	}

	return ret == 0 ? count : ret;
}
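
/*
 * intel_vgpu_rw() is the single dispatch point for all region accesses
 * coming from QEMU through VFIO. The fixed VFIO PCI region indexes
 * (config space, BAR0 MMIO, BAR2 aperture) are emulated in-line, while
 * any index beyond VFIO_PCI_NUM_REGIONS is routed to the device-specific
 * regions (OpRegion, EDID) registered through intel_vgpu_register_reg();
 * on success the full byte count is returned.
 */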
static bool gtt_entry(struct mdev_device *mdev, loff_t *ppos)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
	struct intel_gvt *gvt = vgpu->gvt;
	int offset;

	/* Only allow MMIO GGTT entry access */
	if (index != PCI_BASE_ADDRESS_0)
		return false;

	offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) -
		intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0);

	return (offset >= gvt->device_info.gtt_start_offset &&
		offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt)) ?
			true : false;
}
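
/*
 * gtt_entry() detects accesses that land inside the GGTT range of BAR0.
 * The read/write handlers below use it so that guest GGTT page-table
 * entries, which are 64 bits wide, are forwarded to the emulation path as
 * a single 8-byte access instead of being split into two 4-byte accesses.
 */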
static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf,
			size_t count, loff_t *ppos)
{
	unsigned int done = 0;
	int ret;

	while (count) {
		size_t filled;

		/* Only support GGTT entry 8 bytes read */
		if (count >= 8 && !(*ppos % 8) &&
			gtt_entry(mdev, ppos)) {
			u64 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 8;
		} else if (count >= 4 && !(*ppos % 4)) {
			u32 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 4;
		} else if (count >= 2 && !(*ppos % 2)) {
			u16 val;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 2;
		} else {
			u8 val;

			ret = intel_vgpu_rw(mdev, &val, sizeof(val), ppos,
					false);
			if (ret <= 0)
				goto read_err;

			if (copy_to_user(buf, &val, sizeof(val)))
				goto read_err;

			filled = 1;
		}

		count -= filled;
		done += filled;
		*ppos += filled;
		buf += filled;
	}

	return done;

read_err:
	return -EFAULT;
}
static ssize_t intel_vgpu_write(struct mdev_device *mdev,
				const char __user *buf,
				size_t count, loff_t *ppos)
{
	unsigned int done = 0;
	int ret;

	while (count) {
		size_t filled;

		/* Only support GGTT entry 8 bytes write */
		if (count >= 8 && !(*ppos % 8) &&
			gtt_entry(mdev, ppos)) {
			u64 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 8;
		} else if (count >= 4 && !(*ppos % 4)) {
			u32 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 4;
		} else if (count >= 2 && !(*ppos % 2)) {
			u16 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, (char *)&val,
					sizeof(val), ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 2;
		} else {
			u8 val;

			if (copy_from_user(&val, buf, sizeof(val)))
				goto write_err;

			ret = intel_vgpu_rw(mdev, &val, sizeof(val),
					ppos, true);
			if (ret <= 0)
				goto write_err;

			filled = 1;
		}

		count -= filled;
		done += filled;
		*ppos += filled;
		buf += filled;
	}

	return done;
write_err:
	return -EFAULT;
}
*mdev
, struct vm_area_struct
*vma
)
1150 unsigned long req_size
, pgoff
, req_start
;
1152 struct intel_vgpu
*vgpu
= mdev_get_drvdata(mdev
);
1154 index
= vma
->vm_pgoff
>> (VFIO_PCI_OFFSET_SHIFT
- PAGE_SHIFT
);
1155 if (index
>= VFIO_PCI_ROM_REGION_INDEX
)
1158 if (vma
->vm_end
< vma
->vm_start
)
1160 if ((vma
->vm_flags
& VM_SHARED
) == 0)
1162 if (index
!= VFIO_PCI_BAR2_REGION_INDEX
)
1165 pg_prot
= vma
->vm_page_prot
;
1166 virtaddr
= vma
->vm_start
;
1167 req_size
= vma
->vm_end
- vma
->vm_start
;
1168 pgoff
= vma
->vm_pgoff
&
1169 ((1U << (VFIO_PCI_OFFSET_SHIFT
- PAGE_SHIFT
)) - 1);
1170 req_start
= pgoff
<< PAGE_SHIFT
;
1172 if (!intel_vgpu_in_aperture(vgpu
, req_start
))
1174 if (req_start
+ req_size
>
1175 vgpu_aperture_offset(vgpu
) + vgpu_aperture_sz(vgpu
))
1178 pgoff
= (gvt_aperture_pa_base(vgpu
->gvt
) >> PAGE_SHIFT
) + pgoff
;
1180 return remap_pfn_range(vma
, virtaddr
, pgoff
, req_size
, pg_prot
);
static int intel_vgpu_get_irq_count(struct intel_vgpu *vgpu, int type)
{
	if (type == VFIO_PCI_INTX_IRQ_INDEX || type == VFIO_PCI_MSI_IRQ_INDEX)
		return 1;

	return 0;
}

static int intel_vgpu_set_intx_mask(struct intel_vgpu *vgpu,
			unsigned int index, unsigned int start,
			unsigned int count, u32 flags,
			void *data)
{
	return 0;
}

static int intel_vgpu_set_intx_unmask(struct intel_vgpu *vgpu,
			unsigned int index, unsigned int start,
			unsigned int count, u32 flags, void *data)
{
	return 0;
}

static int intel_vgpu_set_intx_trigger(struct intel_vgpu *vgpu,
		unsigned int index, unsigned int start, unsigned int count,
		u32 flags, void *data)
{
	return 0;
}

static int intel_vgpu_set_msi_trigger(struct intel_vgpu *vgpu,
		unsigned int index, unsigned int start, unsigned int count,
		u32 flags, void *data)
{
	struct eventfd_ctx *trigger;

	if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
		int fd = *(int *)data;

		trigger = eventfd_ctx_fdget(fd);
		if (IS_ERR(trigger)) {
			gvt_vgpu_err("eventfd_ctx_fdget failed\n");
			return PTR_ERR(trigger);
		}
		vgpu->vdev.msi_trigger = trigger;
	} else if ((flags & VFIO_IRQ_SET_DATA_NONE) && !count)
		intel_vgpu_release_msi_eventfd_ctx(vgpu);

	return 0;
}
static int intel_vgpu_set_irqs(struct intel_vgpu *vgpu, u32 flags,
		unsigned int index, unsigned int start, unsigned int count,
		void *data)
{
	int (*func)(struct intel_vgpu *vgpu, unsigned int index,
			unsigned int start, unsigned int count, u32 flags,
			void *data) = NULL;

	switch (index) {
	case VFIO_PCI_INTX_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_MASK:
			func = intel_vgpu_set_intx_mask;
			break;
		case VFIO_IRQ_SET_ACTION_UNMASK:
			func = intel_vgpu_set_intx_unmask;
			break;
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = intel_vgpu_set_intx_trigger;
			break;
		}
		break;
	case VFIO_PCI_MSI_IRQ_INDEX:
		switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
		case VFIO_IRQ_SET_ACTION_MASK:
		case VFIO_IRQ_SET_ACTION_UNMASK:
			/* XXX Need masking support exported */
			break;
		case VFIO_IRQ_SET_ACTION_TRIGGER:
			func = intel_vgpu_set_msi_trigger;
			break;
		}
		break;
	}

	if (!func)
		return -ENOTTY;

	return func(vgpu, index, start, count, flags, data);
}
static long intel_vgpu_ioctl(struct mdev_device *mdev, unsigned int cmd,
			     unsigned long arg)
{
	struct intel_vgpu *vgpu = mdev_get_drvdata(mdev);
	unsigned long minsz;

	gvt_dbg_core("vgpu%d ioctl, cmd: %d\n", vgpu->id, cmd);

	if (cmd == VFIO_DEVICE_GET_INFO) {
		struct vfio_device_info info;

		minsz = offsetofend(struct vfio_device_info, num_irqs);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		info.flags = VFIO_DEVICE_FLAGS_PCI;
		info.flags |= VFIO_DEVICE_FLAGS_RESET;
		info.num_regions = VFIO_PCI_NUM_REGIONS +
				vgpu->vdev.num_regions;
		info.num_irqs = VFIO_PCI_NUM_IRQS;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;

	} else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
		struct vfio_region_info info;
		struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
		unsigned int i;
		int ret;
		struct vfio_region_info_cap_sparse_mmap *sparse = NULL;
		int nr_areas = 1;
		int cap_type_id;

		minsz = offsetofend(struct vfio_region_info, offset);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_CONFIG_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vgpu->gvt->device_info.cfg_space_size;
			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR0_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = vgpu->cfg_space.bar[info.index].size;
			if (!info.size) {
				info.flags = 0;
				break;
			}

			info.flags = VFIO_REGION_INFO_FLAG_READ |
				     VFIO_REGION_INFO_FLAG_WRITE;
			break;
		case VFIO_PCI_BAR1_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;
			break;
		case VFIO_PCI_BAR2_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.flags = VFIO_REGION_INFO_FLAG_CAPS |
					VFIO_REGION_INFO_FLAG_MMAP |
					VFIO_REGION_INFO_FLAG_READ |
					VFIO_REGION_INFO_FLAG_WRITE;
			info.size = gvt_aperture_sz(vgpu->gvt);

			sparse = kzalloc(struct_size(sparse, areas, nr_areas),
					 GFP_KERNEL);
			if (!sparse)
				return -ENOMEM;

			sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
			sparse->header.version = 1;
			sparse->nr_areas = nr_areas;
			cap_type_id = VFIO_REGION_INFO_CAP_SPARSE_MMAP;
			sparse->areas[0].offset =
					PAGE_ALIGN(vgpu_aperture_offset(vgpu));
			sparse->areas[0].size = vgpu_aperture_sz(vgpu);
			break;

		case VFIO_PCI_BAR3_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;

			gvt_dbg_core("get region info bar:%d\n", info.index);
			break;

		case VFIO_PCI_ROM_REGION_INDEX:
		case VFIO_PCI_VGA_REGION_INDEX:
			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
			info.size = 0;
			info.flags = 0;

			gvt_dbg_core("get region info index:%d\n", info.index);
			break;
		default:
			{
				struct vfio_region_info_cap_type cap_type = {
					.header.id = VFIO_REGION_INFO_CAP_TYPE,
					.header.version = 1 };

				if (info.index >= VFIO_PCI_NUM_REGIONS +
						vgpu->vdev.num_regions)
					return -EINVAL;
				info.index =
					array_index_nospec(info.index,
							VFIO_PCI_NUM_REGIONS +
							vgpu->vdev.num_regions);

				i = info.index - VFIO_PCI_NUM_REGIONS;

				info.offset =
					VFIO_PCI_INDEX_TO_OFFSET(info.index);
				info.size = vgpu->vdev.region[i].size;
				info.flags = vgpu->vdev.region[i].flags;

				cap_type.type = vgpu->vdev.region[i].type;
				cap_type.subtype = vgpu->vdev.region[i].subtype;

				ret = vfio_info_add_capability(&caps,
							&cap_type.header,
							sizeof(cap_type));
				if (ret)
					return ret;
			}
		}

		if ((info.flags & VFIO_REGION_INFO_FLAG_CAPS) && sparse) {
			switch (cap_type_id) {
			case VFIO_REGION_INFO_CAP_SPARSE_MMAP:
				ret = vfio_info_add_capability(&caps,
					&sparse->header,
					struct_size(sparse, areas,
						    sparse->nr_areas));
				if (ret) {
					kfree(sparse);
					return ret;
				}
				break;
			default:
				kfree(sparse);
				return -EINVAL;
			}
		}

		if (caps.size) {
			info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
			if (info.argsz < sizeof(info) + caps.size) {
				info.argsz = sizeof(info) + caps.size;
				info.cap_offset = 0;
			} else {
				vfio_info_cap_shift(&caps, sizeof(info));
				if (copy_to_user((void __user *)arg +
						  sizeof(info), caps.buf,
						  caps.size)) {
					kfree(caps.buf);
					kfree(sparse);
					return -EFAULT;
				}
				info.cap_offset = sizeof(info);
			}

			kfree(caps.buf);
		}

		kfree(sparse);
		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
		struct vfio_irq_info info;

		minsz = offsetofend(struct vfio_irq_info, count);

		if (copy_from_user(&info, (void __user *)arg, minsz))
			return -EFAULT;

		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
			return -EINVAL;

		switch (info.index) {
		case VFIO_PCI_INTX_IRQ_INDEX:
		case VFIO_PCI_MSI_IRQ_INDEX:
			break;
		default:
			return -EINVAL;
		}

		info.flags = VFIO_IRQ_INFO_EVENTFD;

		info.count = intel_vgpu_get_irq_count(vgpu, info.index);

		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
				       VFIO_IRQ_INFO_AUTOMASKED);
		else
			info.flags |= VFIO_IRQ_INFO_NORESIZE;

		return copy_to_user((void __user *)arg, &info, minsz) ?
			-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_SET_IRQS) {
		struct vfio_irq_set hdr;
		u8 *data = NULL;
		int ret = 0;
		size_t data_size = 0;

		minsz = offsetofend(struct vfio_irq_set, count);

		if (copy_from_user(&hdr, (void __user *)arg, minsz))
			return -EFAULT;

		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
			int max = intel_vgpu_get_irq_count(vgpu, hdr.index);

			ret = vfio_set_irqs_validate_and_prepare(&hdr, max,
						VFIO_PCI_NUM_IRQS, &data_size);
			if (ret) {
				gvt_vgpu_err("intel:vfio_set_irqs_validate_and_prepare failed\n");
				return -EINVAL;
			}
			if (data_size) {
				data = memdup_user((void __user *)(arg + minsz),
						   data_size);
				if (IS_ERR(data))
					return PTR_ERR(data);
			}
		}

		ret = intel_vgpu_set_irqs(vgpu, hdr.flags, hdr.index,
					hdr.start, hdr.count, data);
		kfree(data);

		return ret;
	} else if (cmd == VFIO_DEVICE_RESET) {
		intel_gvt_ops->vgpu_reset(vgpu);
		return 0;
	} else if (cmd == VFIO_DEVICE_QUERY_GFX_PLANE) {
		struct vfio_device_gfx_plane_info dmabuf;
		int ret = 0;

		minsz = offsetofend(struct vfio_device_gfx_plane_info,
				    dmabuf_id);
		if (copy_from_user(&dmabuf, (void __user *)arg, minsz))
			return -EFAULT;
		if (dmabuf.argsz < minsz)
			return -EINVAL;

		ret = intel_gvt_ops->vgpu_query_plane(vgpu, &dmabuf);
		if (ret != 0)
			return ret;

		return copy_to_user((void __user *)arg, &dmabuf, minsz) ?
								-EFAULT : 0;
	} else if (cmd == VFIO_DEVICE_GET_GFX_DMABUF) {
		__u32 dmabuf_id;
		__s32 dmabuf_fd;

		if (get_user(dmabuf_id, (__u32 __user *)arg))
			return -EFAULT;

		dmabuf_fd = intel_gvt_ops->vgpu_get_dmabuf(vgpu, dmabuf_id);
		return dmabuf_fd;
	}

	return -ENOTTY;
}
static ssize_t
vgpu_id_show(struct device *dev, struct device_attribute *attr,
	     char *buf)
{
	struct mdev_device *mdev = mdev_from_dev(dev);

	if (mdev) {
		struct intel_vgpu *vgpu = (struct intel_vgpu *)
			mdev_get_drvdata(mdev);
		return sprintf(buf, "%d\n", vgpu->id);
	}
	return sprintf(buf, "\n");
}

static DEVICE_ATTR_RO(vgpu_id);
static struct attribute *intel_vgpu_attrs[] = {
	&dev_attr_vgpu_id.attr,
	NULL
};

static const struct attribute_group intel_vgpu_group = {
	.name  = "intel_vgpu",
	.attrs = intel_vgpu_attrs,
};

static const struct attribute_group *intel_vgpu_groups[] = {
	&intel_vgpu_group,
	NULL,
};

static struct mdev_parent_ops intel_vgpu_ops = {
	.mdev_attr_groups	= intel_vgpu_groups,
	.create			= intel_vgpu_create,
	.remove			= intel_vgpu_remove,

	.open			= intel_vgpu_open,
	.release		= intel_vgpu_release,

	.read			= intel_vgpu_read,
	.write			= intel_vgpu_write,
	.mmap			= intel_vgpu_mmap,
	.ioctl			= intel_vgpu_ioctl,
};
static int kvmgt_host_init(struct device *dev, void *gvt, const void *ops)
{
	struct attribute **kvm_type_attrs;
	struct attribute_group **kvm_vgpu_type_groups;

	intel_gvt_ops = ops;
	if (!intel_gvt_ops->get_gvt_attrs(&kvm_type_attrs,
			&kvm_vgpu_type_groups))
		return -EFAULT;
	intel_vgpu_ops.supported_type_groups = kvm_vgpu_type_groups;

	return mdev_register_device(dev, &intel_vgpu_ops);
}

static void kvmgt_host_exit(struct device *dev)
{
	mdev_unregister_device(dev);
}
static int kvmgt_page_track_add(unsigned long handle, u64 gfn)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	struct kvm_memory_slot *slot;
	int idx;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	idx = srcu_read_lock(&kvm->srcu);
	slot = gfn_to_memslot(kvm, gfn);
	if (!slot) {
		srcu_read_unlock(&kvm->srcu, idx);
		return -EINVAL;
	}

	spin_lock(&kvm->mmu_lock);

	if (kvmgt_gfn_is_write_protected(info, gfn))
		goto out;

	kvm_slot_page_track_add_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
	kvmgt_protect_table_add(info, gfn);

out:
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
	return 0;
}
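
/*
 * Pages that GVT-g needs to trap writes to are registered with KVM's
 * page-track framework here; the local hash table mirrors which gfns are
 * currently tracked, so that kvmgt_page_track_write() can decide whether a
 * guest write must be forwarded to intel_gvt_ops->write_protect_handler().
 */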
static int kvmgt_page_track_remove(unsigned long handle, u64 gfn)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	struct kvm_memory_slot *slot;
	int idx;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	idx = srcu_read_lock(&kvm->srcu);
	slot = gfn_to_memslot(kvm, gfn);
	if (!slot) {
		srcu_read_unlock(&kvm->srcu, idx);
		return -EINVAL;
	}

	spin_lock(&kvm->mmu_lock);

	if (!kvmgt_gfn_is_write_protected(info, gfn))
		goto out;

	kvm_slot_page_track_remove_page(kvm, slot, gfn, KVM_PAGE_TRACK_WRITE);
	kvmgt_protect_table_del(info, gfn);

out:
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
	return 0;
}
static void kvmgt_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa,
		const u8 *val, int len,
		struct kvm_page_track_notifier_node *node)
{
	struct kvmgt_guest_info *info = container_of(node,
					struct kvmgt_guest_info, track_node);

	if (kvmgt_gfn_is_write_protected(info, gpa_to_gfn(gpa)))
		intel_gvt_ops->write_protect_handler(info->vgpu, gpa,
						     (void *)val, len);
}

static void kvmgt_page_track_flush_slot(struct kvm *kvm,
		struct kvm_memory_slot *slot,
		struct kvm_page_track_notifier_node *node)
{
	int i;
	gfn_t gfn;
	struct kvmgt_guest_info *info = container_of(node,
					struct kvmgt_guest_info, track_node);

	spin_lock(&kvm->mmu_lock);
	for (i = 0; i < slot->npages; i++) {
		gfn = slot->base_gfn + i;
		if (kvmgt_gfn_is_write_protected(info, gfn)) {
			kvm_slot_page_track_remove_page(kvm, slot, gfn,
						KVM_PAGE_TRACK_WRITE);
			kvmgt_protect_table_del(info, gfn);
		}
	}
	spin_unlock(&kvm->mmu_lock);
}
static bool __kvmgt_vgpu_exist(struct intel_vgpu *vgpu, struct kvm *kvm)
{
	struct intel_vgpu *itr;
	struct kvmgt_guest_info *info;
	int id;
	bool ret = false;

	mutex_lock(&vgpu->gvt->lock);
	for_each_active_vgpu(vgpu->gvt, itr, id) {
		if (!handle_valid(itr->handle))
			continue;

		info = (struct kvmgt_guest_info *)itr->handle;
		if (kvm && kvm == info->kvm) {
			ret = true;
			goto out;
		}
	}
out:
	mutex_unlock(&vgpu->gvt->lock);
	return ret;
}
static int kvmgt_guest_init(struct mdev_device *mdev)
{
	struct kvmgt_guest_info *info;
	struct intel_vgpu *vgpu;
	struct kvm *kvm;

	vgpu = mdev_get_drvdata(mdev);
	if (handle_valid(vgpu->handle))
		return -EEXIST;

	kvm = vgpu->vdev.kvm;
	if (!kvm || kvm->mm != current->mm) {
		gvt_vgpu_err("KVM is required to use Intel vGPU\n");
		return -ESRCH;
	}

	if (__kvmgt_vgpu_exist(vgpu, kvm))
		return -EEXIST;

	info = vzalloc(sizeof(struct kvmgt_guest_info));
	if (!info)
		return -ENOMEM;

	vgpu->handle = (unsigned long)info;
	info->vgpu = vgpu;
	info->kvm = kvm;
	kvm_get_kvm(info->kvm);

	kvmgt_protect_table_init(info);
	gvt_cache_init(vgpu);

	init_completion(&vgpu->vblank_done);

	info->track_node.track_write = kvmgt_page_track_write;
	info->track_node.track_flush_slot = kvmgt_page_track_flush_slot;
	kvm_page_track_register_notifier(kvm, &info->track_node);

	info->debugfs_cache_entries = debugfs_create_ulong(
				"kvmgt_nr_cache_entries",
				0444, vgpu->debugfs,
				&vgpu->vdev.nr_cache_entries);
	return 0;
}
static bool kvmgt_guest_exit(struct kvmgt_guest_info *info)
{
	debugfs_remove(info->debugfs_cache_entries);

	kvm_page_track_unregister_notifier(info->kvm, &info->track_node);
	kvm_put_kvm(info->kvm);
	kvmgt_protect_table_destroy(info);
	gvt_cache_destroy(info->vgpu);
	vfree(info);

	return true;
}

static int kvmgt_attach_vgpu(void *vgpu, unsigned long *handle)
{
	/* nothing to do here */
	return 0;
}
static void kvmgt_detach_vgpu(void *p_vgpu)
{
	int i;
	struct intel_vgpu *vgpu = (struct intel_vgpu *)p_vgpu;

	if (!vgpu->vdev.region)
		return;

	for (i = 0; i < vgpu->vdev.num_regions; i++)
		if (vgpu->vdev.region[i].ops->release)
			vgpu->vdev.region[i].ops->release(vgpu,
					&vgpu->vdev.region[i]);
	vgpu->vdev.num_regions = 0;
	kfree(vgpu->vdev.region);
	vgpu->vdev.region = NULL;
}
static int kvmgt_inject_msi(unsigned long handle, u32 addr, u16 data)
{
	struct kvmgt_guest_info *info;
	struct intel_vgpu *vgpu;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	vgpu = info->vgpu;

	/*
	 * When the guest powers off, msi_trigger is set to NULL, but the
	 * vgpu's config and MMIO registers are not restored to their
	 * defaults. If this vgpu is reused by the next VM, its pipes may
	 * still be enabled, so it can receive a vblank interrupt request
	 * before the new guest has enabled MSI. msi_trigger stays NULL
	 * until MSI is enabled by the guest, so in that case return
	 * success without injecting an interrupt.
	 */
	if (vgpu->vdev.msi_trigger == NULL)
		return 0;

	if (eventfd_signal(vgpu->vdev.msi_trigger, 1) == 1)
		return 0;

	return -EFAULT;
}
static unsigned long kvmgt_gfn_to_pfn(unsigned long handle, unsigned long gfn)
{
	struct kvmgt_guest_info *info;
	kvm_pfn_t pfn;

	if (!handle_valid(handle))
		return INTEL_GVT_INVALID_ADDR;

	info = (struct kvmgt_guest_info *)handle;

	pfn = gfn_to_pfn(info->kvm, gfn);
	if (is_error_noslot_pfn(pfn))
		return INTEL_GVT_INVALID_ADDR;

	return pfn;
}
static int kvmgt_dma_map_guest_page(unsigned long handle, unsigned long gfn,
		unsigned long size, dma_addr_t *dma_addr)
{
	struct kvmgt_guest_info *info;
	struct intel_vgpu *vgpu;
	struct gvt_dma *entry;
	int ret;

	if (!handle_valid(handle))
		return -EINVAL;

	info = (struct kvmgt_guest_info *)handle;
	vgpu = info->vgpu;

	mutex_lock(&info->vgpu->vdev.cache_lock);

	entry = __gvt_cache_find_gfn(info->vgpu, gfn);
	if (!entry) {
		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
		if (ret)
			goto err_unlock;

		ret = __gvt_cache_add(info->vgpu, gfn, *dma_addr, size);
		if (ret)
			goto err_unmap;
	} else if (entry->size != size) {
		/* the same gfn with different size: unmap and re-map */
		gvt_dma_unmap_page(vgpu, gfn, entry->dma_addr, entry->size);
		__gvt_cache_remove_entry(vgpu, entry);

		ret = gvt_dma_map_page(vgpu, gfn, dma_addr, size);
		if (ret)
			goto err_unlock;

		ret = __gvt_cache_add(info->vgpu, gfn, *dma_addr, size);
		if (ret)
			goto err_unmap;
	} else {
		kref_get(&entry->ref);
		*dma_addr = entry->dma_addr;
	}

	mutex_unlock(&info->vgpu->vdev.cache_lock);
	return 0;

err_unmap:
	gvt_dma_unmap_page(vgpu, gfn, *dma_addr, size);
err_unlock:
	mutex_unlock(&info->vgpu->vdev.cache_lock);
	return ret;
}
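
/*
 * The cache makes repeated mappings of the same gfn cheap: a hit simply
 * bumps the kref and returns the cached DMA address, a miss pins and maps
 * the page and inserts a new entry, and a size mismatch (e.g. a gfn
 * previously mapped as a 4K page now used as part of a larger mapping) is
 * handled by unmapping and re-mapping at the new size. The entry is only
 * torn down once the last kref is dropped in kvmgt_dma_unmap_guest_page().
 */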
static int kvmgt_dma_pin_guest_page(unsigned long handle, dma_addr_t dma_addr)
{
	struct kvmgt_guest_info *info;
	struct gvt_dma *entry;
	int ret = 0;

	if (!handle_valid(handle))
		return -ENODEV;

	info = (struct kvmgt_guest_info *)handle;

	mutex_lock(&info->vgpu->vdev.cache_lock);
	entry = __gvt_cache_find_dma_addr(info->vgpu, dma_addr);
	if (entry)
		kref_get(&entry->ref);
	else
		ret = -ENOMEM;
	mutex_unlock(&info->vgpu->vdev.cache_lock);

	return ret;
}

static void __gvt_dma_release(struct kref *ref)
{
	struct gvt_dma *entry = container_of(ref, typeof(*entry), ref);

	gvt_dma_unmap_page(entry->vgpu, entry->gfn, entry->dma_addr,
			   entry->size);
	__gvt_cache_remove_entry(entry->vgpu, entry);
}
static void kvmgt_dma_unmap_guest_page(unsigned long handle, dma_addr_t dma_addr)
{
	struct kvmgt_guest_info *info;
	struct gvt_dma *entry;

	if (!handle_valid(handle))
		return;

	info = (struct kvmgt_guest_info *)handle;

	mutex_lock(&info->vgpu->vdev.cache_lock);
	entry = __gvt_cache_find_dma_addr(info->vgpu, dma_addr);
	if (entry)
		kref_put(&entry->ref, __gvt_dma_release);
	mutex_unlock(&info->vgpu->vdev.cache_lock);
}
static int kvmgt_rw_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len, bool write)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	int idx, ret;
	bool kthread = current->mm == NULL;

	if (!handle_valid(handle))
		return -ESRCH;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	if (kthread) {
		if (!mmget_not_zero(kvm->mm))
			return -EFAULT;
		use_mm(kvm->mm);
	}

	idx = srcu_read_lock(&kvm->srcu);
	ret = write ? kvm_write_guest(kvm, gpa, buf, len) :
		      kvm_read_guest(kvm, gpa, buf, len);
	srcu_read_unlock(&kvm->srcu, idx);

	if (kthread) {
		unuse_mm(kvm->mm);
		mmput(kvm->mm);
	}

	return ret;
}
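
/*
 * When this runs from a GVT kernel thread (current->mm == NULL), the
 * guest's mm has to be pinned and temporarily adopted before
 * kvm_read_guest()/kvm_write_guest() can resolve guest memory; callers in
 * the vCPU/QEMU process context already have the right mm and skip that
 * step.
 */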
static int kvmgt_read_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len)
{
	return kvmgt_rw_gpa(handle, gpa, buf, len, false);
}

static int kvmgt_write_gpa(unsigned long handle, unsigned long gpa,
			void *buf, unsigned long len)
{
	return kvmgt_rw_gpa(handle, gpa, buf, len, true);
}
static unsigned long kvmgt_virt_to_pfn(void *addr)
{
	return PFN_DOWN(__pa(addr));
}

static bool kvmgt_is_valid_gfn(unsigned long handle, unsigned long gfn)
{
	struct kvmgt_guest_info *info;
	struct kvm *kvm;
	int idx;
	bool ret;

	if (!handle_valid(handle))
		return false;

	info = (struct kvmgt_guest_info *)handle;
	kvm = info->kvm;

	idx = srcu_read_lock(&kvm->srcu);
	ret = kvm_is_visible_gfn(kvm, gfn);
	srcu_read_unlock(&kvm->srcu, idx);

	return ret;
}
static struct intel_gvt_mpt kvmgt_mpt = {
	.type = INTEL_GVT_HYPERVISOR_KVM,
	.host_init = kvmgt_host_init,
	.host_exit = kvmgt_host_exit,
	.attach_vgpu = kvmgt_attach_vgpu,
	.detach_vgpu = kvmgt_detach_vgpu,
	.inject_msi = kvmgt_inject_msi,
	.from_virt_to_mfn = kvmgt_virt_to_pfn,
	.enable_page_track = kvmgt_page_track_add,
	.disable_page_track = kvmgt_page_track_remove,
	.read_gpa = kvmgt_read_gpa,
	.write_gpa = kvmgt_write_gpa,
	.gfn_to_mfn = kvmgt_gfn_to_pfn,
	.dma_map_guest_page = kvmgt_dma_map_guest_page,
	.dma_unmap_guest_page = kvmgt_dma_unmap_guest_page,
	.dma_pin_guest_page = kvmgt_dma_pin_guest_page,
	.set_opregion = kvmgt_set_opregion,
	.set_edid = kvmgt_set_edid,
	.get_vfio_device = kvmgt_get_vfio_device,
	.put_vfio_device = kvmgt_put_vfio_device,
	.is_valid_gfn = kvmgt_is_valid_gfn,
};
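
/*
 * kvmgt_mpt is the mediated pass-through (MPT) hook table that the GVT-g
 * core calls through for everything that is hypervisor specific: guest
 * memory access, page write-protection, MSI injection, DMA mapping and
 * the OpRegion/EDID regions. This file therefore forms the complete
 * KVM/VFIO backend for GVT-g.
 */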
static int __init kvmgt_init(void)
{
	if (intel_gvt_register_hypervisor(&kvmgt_mpt) < 0)
		return -ENODEV;
	return 0;
}

static void __exit kvmgt_exit(void)
{
	intel_gvt_unregister_hypervisor();
}

module_init(kvmgt_init);
module_exit(kvmgt_exit);

MODULE_LICENSE("GPL and additional rights");
MODULE_AUTHOR("Intel Corporation");