1 // SPDX-License-Identifier: GPL-2.0-only
3 * VFIO PCI NVIDIA Witherspoon GPU support a.k.a. NVLink2.
5 * Copyright (C) 2018 IBM Corp. All rights reserved.
6 * Author: Alexey Kardashevskiy <aik@ozlabs.ru>
8 * Register an on-GPU RAM region for cacheable access.
10 * Derived from original vfio_pci_igd.c:
11 * Copyright (C) 2016 Red Hat, Inc. All rights reserved.
12 * Author: Alex Williamson <alex.williamson@redhat.com>
16 #include <linux/pci.h>
17 #include <linux/uaccess.h>
18 #include <linux/vfio.h>
19 #include <linux/sched/mm.h>
20 #include <linux/mmu_context.h>
21 #include <asm/kvm_ppc.h>
22 #include "vfio_pci_private.h"
/*
 * Tracepoint setup: defines CREATE_TRACE_POINTS and exports, through
 * EXPORT_TRACEPOINT_SYMBOL_GPL, the three tracepoints fired further down
 * in this file (nvgpu mmap fault, nvgpu mmap, npu2 mmap).
 * NOTE(review): the header that consumes CREATE_TRACE_POINTS is not
 * visible in this listing - confirm it is included right after the define.
 */
24 #define CREATE_TRACE_POINTS
27 EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault
);
28 EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap
);
29 EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap
);
/*
 * Private state of the GPU RAM region. It is allocated in
 * vfio_pci_nvdia_v100_nvlink2_init() and read back by the nvgpu region
 * callbacks through region->data.
 * NOTE(review): the callbacks in this file also access data->mm, but no
 * "mm" member is visible in this listing - confirm the field declaration
 * was only dropped by the extraction.
 */
31 struct vfio_pci_nvgpu_data
{
32 unsigned long gpu_hpa
; /* GPU RAM physical address */
33 unsigned long gpu_tgt
; /* TGT address of corresponding GPU RAM */
34 unsigned long useraddr
; /* GPU RAM userspace address */
35 unsigned long size
; /* Size of the GPU RAM window (usually 128GB) */
37 struct mm_iommu_table_group_mem_t
*mem
; /* Pre-registered RAM descr. */
/* The GPU's PCI device; set from vdev->pdev at init time. */
38 struct pci_dev
*gpdev
;
/* VFIO group notifier; its callback is vfio_pci_nvgpu_group_notifier(). */
39 struct notifier_block group_notifier
;
/*
 * Emulated read/write access to the GPU RAM region.
 *
 * @vdev:    VFIO PCI device owning the region
 * @buf:     userspace buffer to copy from (write) or to (read)
 * @count:   requested byte count; clamped to what remains of the region
 * @ppos:    file position encoding both the region index and the offset
 * @iswrite: direction flag (its use is on lines not visible here;
 *           presumably selects copy_from_user vs copy_to_user)
 *
 * The offset is split into a page-aligned base and an in-page remainder so
 * that only the pages covering [pos, pos + count) are mapped with
 * ioremap_cache() for the duration of the copy.
 *
 * NOTE(review): the declared return type is size_t (unsigned) and the
 * return statements are not visible in this listing. If any of them
 * returns a negative errno, confirm the callers cope with the conversion.
 */
42 static size_t vfio_pci_nvgpu_rw(struct vfio_pci_device
*vdev
,
43 char __user
*buf
, size_t count
, loff_t
*ppos
, bool iswrite
)
/* Index into vdev->region[]: device-specific regions follow the fixed ones. */
45 unsigned int i
= VFIO_PCI_OFFSET_TO_INDEX(*ppos
) - VFIO_PCI_NUM_REGIONS
;
46 struct vfio_pci_nvgpu_data
*data
= vdev
->region
[i
].data
;
/* Offset within the region, with the region-index bits masked off. */
47 loff_t pos
= *ppos
& VFIO_PCI_OFFSET_MASK
;
48 loff_t posaligned
= pos
& PAGE_MASK
, posoff
= pos
& ~PAGE_MASK
;
/* Offsets at or beyond the region size are handled on a line not shown. */
52 if (pos
>= vdev
->region
[i
].size
)
/* Never copy past the end of the region. */
55 count
= min(count
, (size_t)(vdev
->region
[i
].size
- pos
));
58 * We map only a bit of GPU RAM for a short time instead of mapping it
59 * for the guest lifetime as:
61 * 1) we do not know GPU RAM size, only aperture which is 4-8 times
62 * bigger than actual RAM size (16/32GB RAM vs. 128GB aperture);
63 * 2) mapping GPU RAM allows CPU to prefetch and if this happens
64 * before NVLink bridge is reset (which fences GPU RAM),
65 * hardware management interrupts (HMI) might happen, this
66 * will freeze NVLink bridge.
68 * This is not fast path anyway.
70 sizealigned
= _ALIGN_UP(posoff
+ count
, PAGE_SIZE
);
71 ptr
= ioremap_cache(data
->gpu_hpa
+ posaligned
, sizealigned
);
76 if (copy_from_user(ptr
+ posoff
, buf
, count
))
81 if (copy_to_user(buf
, ptr
+ posoff
, count
))
/*
 * Release callback of the GPU RAM region.
 *
 * @vdev:   VFIO PCI device owning the region
 * @region: region being released; region->data is the nvgpu private state
 *
 * Visible steps: drop the pre-registered memory descriptor with
 * mm_iommu_put(), unregister the VFIO group notifier, and undo the LPAR
 * mapping of the GPU with pnv_npu2_unmap_lpar_dev().
 * NOTE(review): the condition guarding mm_iommu_put() and any freeing of
 * @data are on lines not visible in this listing.
 */
92 static void vfio_pci_nvgpu_release(struct vfio_pci_device
*vdev
,
93 struct vfio_pci_region
*region
)
95 struct vfio_pci_nvgpu_data
*data
= region
->data
;
98 /* If there were any mappings at all... */
101 ret
= mm_iommu_put(data
->mm
, data
->mem
);
108 vfio_unregister_notifier(&data
->gpdev
->dev
, VFIO_GROUP_NOTIFY
,
109 &data
->group_notifier
);
111 pnv_npu2_unmap_lpar_dev(data
->gpdev
);
/*
 * Page fault handler for userspace mappings of the GPU RAM region.
 *
 * @vmf: fault descriptor; vmf->vma->vm_private_data holds the region
 *
 * Computes the target page frame as the sum of:
 *   - the GPU RAM base frame (gpu_hpa >> PAGE_SHIFT),
 *   - the mapping's page offset inside the region (vma->vm_pgoff with the
 *     bits at and above VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT masked off),
 *   - the page index of the faulting address inside the VMA,
 * then installs it with vmf_insert_pfn() and emits a tracepoint.
 * The return statement is not visible in this listing.
 */
116 static vm_fault_t
vfio_pci_nvgpu_mmap_fault(struct vm_fault
*vmf
)
119 struct vm_area_struct
*vma
= vmf
->vma
;
120 struct vfio_pci_region
*region
= vma
->vm_private_data
;
121 struct vfio_pci_nvgpu_data
*data
= region
->data
;
/* Page index of the faulting address relative to the VMA start. */
122 unsigned long vmf_off
= (vmf
->address
- vma
->vm_start
) >> PAGE_SHIFT
;
123 unsigned long nv2pg
= data
->gpu_hpa
>> PAGE_SHIFT
;
/* Strip the region-index bits, keeping only the in-region page offset. */
124 unsigned long vm_pgoff
= vma
->vm_pgoff
&
125 ((1U << (VFIO_PCI_OFFSET_SHIFT
- PAGE_SHIFT
)) - 1);
126 unsigned long pfn
= nv2pg
+ vm_pgoff
+ vmf_off
;
128 ret
= vmf_insert_pfn(vma
, vmf
->address
, pfn
);
129 trace_vfio_pci_nvgpu_mmap_fault(data
->gpdev
, pfn
<< PAGE_SHIFT
,
/*
 * VM operations installed on GPU RAM mappings by vfio_pci_nvgpu_mmap();
 * pages are populated on demand by the fault handler.
 */
135 static const struct vm_operations_struct vfio_pci_nvgpu_mmap_vmops
= {
136 .fault
= vfio_pci_nvgpu_mmap_fault
,
/*
 * mmap callback of the GPU RAM region.
 *
 * @vdev:   VFIO PCI device owning the region
 * @region: region being mapped; region->data is the nvgpu private state
 * @vma:    userspace mapping being set up
 *
 * Compares the requested length against data->size (the action taken when
 * it is larger is on a line not shown), marks the VMA as VM_PFNMAP with
 * fault-driven population, remembers the mapping's start address and the
 * current mm, and registers the range with mm_iommu_newdev(). The result
 * is reported through the vfio_pci_nvgpu_mmap tracepoint.
 * The return statement is not visible in this listing.
 */
139 static int vfio_pci_nvgpu_mmap(struct vfio_pci_device
*vdev
,
140 struct vfio_pci_region
*region
, struct vm_area_struct
*vma
)
143 struct vfio_pci_nvgpu_data
*data
= region
->data
;
148 if (vma
->vm_end
- vma
->vm_start
> data
->size
)
/* The fault handler retrieves the region through vm_private_data. */
151 vma
->vm_private_data
= region
;
152 vma
->vm_flags
|= VM_PFNMAP
;
153 vma
->vm_ops
= &vfio_pci_nvgpu_mmap_vmops
;
156 * Calling mm_iommu_newdev() here once as the region is not
157 * registered yet and therefore right initialization will happen now.
158 * Other places will use mm_iommu_find() which returns
159 * registered @mem and does not go gup().
161 data
->useraddr
= vma
->vm_start
;
162 data
->mm
= current
->mm
;
165 ret
= (int) mm_iommu_newdev(data
->mm
, data
->useraddr
,
166 vma_pages(vma
), data
->gpu_hpa
, &data
->mem
);
168 trace_vfio_pci_nvgpu_mmap(vdev
->pdev
, data
->gpu_hpa
, data
->useraddr
,
169 vma
->vm_end
- vma
->vm_start
, ret
);
/*
 * add_capability callback of the GPU RAM region.
 *
 * @vdev:   VFIO PCI device owning the region
 * @region: region being described; region->data is the nvgpu private state
 * @caps:   capability chain to append to
 *
 * Builds a VFIO_REGION_INFO_CAP_NVLINK2_SSATGT capability and appends it
 * with vfio_info_add_capability(), whose result is returned.
 * NOTE(review): the remaining initializers of @cap are not visible in this
 * listing; presumably it carries data->gpu_tgt - confirm.
 */
174 static int vfio_pci_nvgpu_add_capability(struct vfio_pci_device
*vdev
,
175 struct vfio_pci_region
*region
, struct vfio_info_cap
*caps
)
177 struct vfio_pci_nvgpu_data
*data
= region
->data
;
178 struct vfio_region_info_cap_nvlink2_ssatgt cap
= {
179 .header
.id
= VFIO_REGION_INFO_CAP_NVLINK2_SSATGT
,
184 return vfio_info_add_capability(caps
, &cap
.header
, sizeof(cap
));
/*
 * Region operations of the GPU RAM region; passed to
 * vfio_pci_register_dev_region() by vfio_pci_nvdia_v100_nvlink2_init().
 */
187 static const struct vfio_pci_regops vfio_pci_nvgpu_regops
= {
188 .rw
= vfio_pci_nvgpu_rw
,
189 .release
= vfio_pci_nvgpu_release
,
190 .mmap
= vfio_pci_nvgpu_mmap
,
191 .add_capability
= vfio_pci_nvgpu_add_capability
,
/*
 * VFIO group notifier callback.
 *
 * @nb:     notifier block embedded in struct vfio_pci_nvgpu_data
 * @action: notification type
 * @opaque: for VFIO_GROUP_NOTIFY_SET_KVM, the struct kvm pointer
 *
 * When a non-NULL KVM is being set, maps the GPU to that KVM's LPID via
 * pnv_npu2_map_lpar_dev() with MSR_DR | MSR_PR. What is returned when that
 * call reports a non-zero result, and otherwise, is on lines not visible
 * in this listing.
 */
194 static int vfio_pci_nvgpu_group_notifier(struct notifier_block
*nb
,
195 unsigned long action
, void *opaque
)
197 struct kvm
*kvm
= opaque
;
/* Recover the private state from the embedded notifier block. */
198 struct vfio_pci_nvgpu_data
*data
= container_of(nb
,
199 struct vfio_pci_nvgpu_data
,
202 if (action
== VFIO_GROUP_NOTIFY_SET_KVM
&& kvm
&&
203 pnv_npu2_map_lpar_dev(data
->gpdev
,
204 kvm
->arch
.lpid
, MSR_DR
| MSR_PR
))
/*
 * Set up the GPU RAM region for an NVLink2-attached GPU.
 *
 * @vdev: VFIO PCI device wrapping the GPU
 *
 * Visible steps:
 *   1. find the NPU device associated with the GPU and its OF node;
 *   2. follow the NPU node's "memory-region" phandle and read the "reg"
 *      property of the referenced node (reg[0] becomes the GPU RAM
 *      physical address);
 *   3. read "ibm,device-tgt-addr" from the NPU node, warning if absent;
 *   4. allocate the private state and fill it in;
 *   5. register the VFIO group notifier for SET_KVM events and then
 *      unregister it again (see the in-line rationale);
 *   6. register the device region with READ | WRITE | MMAP flags.
 *
 * Error handling and the return statements are on lines not visible in
 * this listing.
 */
210 int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device
*vdev
)
215 struct device_node
*npu_node
, *mem_node
;
216 struct pci_dev
*npu_dev
;
217 struct vfio_pci_nvgpu_data
*data
;
218 uint32_t mem_phandle
= 0;
219 unsigned long events
= VFIO_GROUP_NOTIFY_SET_KVM
;
222 * PCI config space does not tell us about NVLink presense but
223 * platform does, use this.
225 npu_dev
= pnv_pci_get_npu_dev(vdev
->pdev
, 0);
229 npu_node
= pci_device_to_OF_node(npu_dev
);
233 if (of_property_read_u32(npu_node
, "memory-region", &mem_phandle
))
236 mem_node
= of_find_node_by_phandle(mem_phandle
);
240 if (of_property_read_variable_u64_array(mem_node
, "reg", reg
,
241 ARRAY_SIZE(reg
), ARRAY_SIZE(reg
)) !=
245 if (of_property_read_u64(npu_node
, "ibm,device-tgt-addr", &tgt
)) {
246 dev_warn(&vdev
->pdev
->dev
, "No ibm,device-tgt-addr found\n");
250 data
= kzalloc(sizeof(*data
), GFP_KERNEL
);
254 data
->gpu_hpa
= reg
[0];
258 dev_dbg(&vdev
->pdev
->dev
, "%lx..%lx\n", data
->gpu_hpa
,
259 data
->gpu_hpa
+ data
->size
- 1);
261 data
->gpdev
= vdev
->pdev
;
262 data
->group_notifier
.notifier_call
= vfio_pci_nvgpu_group_notifier
;
264 ret
= vfio_register_notifier(&data
->gpdev
->dev
, VFIO_GROUP_NOTIFY
,
265 &events
, &data
->group_notifier
);
270 * We have just set KVM, we do not need the listener anymore.
271 * Also, keeping it registered means that if more than one GPU is
272 * assigned, we will get several similar notifiers notifying about
273 * the same device again which does not help with anything.
275 vfio_unregister_notifier(&data
->gpdev
->dev
, VFIO_GROUP_NOTIFY
,
276 &data
->group_notifier
);
278 ret
= vfio_pci_register_dev_region(vdev
,
279 PCI_VENDOR_ID_NVIDIA
| VFIO_REGION_TYPE_PCI_VENDOR_TYPE
,
280 VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM
,
281 &vfio_pci_nvgpu_regops
,
283 VFIO_REGION_INFO_FLAG_READ
|
284 VFIO_REGION_INFO_FLAG_WRITE
|
285 VFIO_REGION_INFO_FLAG_MMAP
,
/*
 * Private state of the NPU2 ATSD region. It is allocated in
 * vfio_pci_ibm_npu2_init() and read back by the npu2 region callbacks
 * through region->data.
 */
300 struct vfio_pci_npu2_data
{
301 void *base
; /* ATSD register virtual address, for emulated access */
302 unsigned long mmio_atsd
; /* ATSD physical address */
303 unsigned long gpu_tgt
; /* TGT address of corresponding GPU RAM */
304 unsigned int link_speed
; /* The link speed from DT's ibm,nvlink-speed */
/*
 * Emulated read/write access to the ATSD register region.
 *
 * @vdev:    VFIO PCI device owning the region
 * @buf:     userspace buffer to copy from (write) or to (read)
 * @count:   requested byte count; clamped to what remains of the region
 * @ppos:    file position encoding both the region index and the offset
 * @iswrite: direction flag (its use is on lines not visible here;
 *           presumably selects copy_from_user vs copy_to_user)
 *
 * Copies directly between @buf and the mapping held in data->base.
 *
 * NOTE(review): the declared return type is size_t (unsigned) and the
 * return statements are not visible in this listing. If any of them
 * returns a negative errno, confirm the callers cope with the conversion.
 */
307 static size_t vfio_pci_npu2_rw(struct vfio_pci_device
*vdev
,
308 char __user
*buf
, size_t count
, loff_t
*ppos
, bool iswrite
)
/* Index into vdev->region[]: device-specific regions follow the fixed ones. */
310 unsigned int i
= VFIO_PCI_OFFSET_TO_INDEX(*ppos
) - VFIO_PCI_NUM_REGIONS
;
311 struct vfio_pci_npu2_data
*data
= vdev
->region
[i
].data
;
/* Offset within the region, with the region-index bits masked off. */
312 loff_t pos
= *ppos
& VFIO_PCI_OFFSET_MASK
;
/* Offsets at or beyond the region size are handled on a line not shown. */
314 if (pos
>= vdev
->region
[i
].size
)
/* Never copy past the end of the region. */
317 count
= min(count
, (size_t)(vdev
->region
[i
].size
- pos
));
320 if (copy_from_user(data
->base
+ pos
, buf
, count
))
323 if (copy_to_user(buf
, data
->base
+ pos
, count
))
/*
 * mmap callback of the ATSD register region.
 *
 * @vdev:   VFIO PCI device owning the region
 * @region: region being mapped; region->data is the npu2 private state
 * @vma:    userspace mapping being set up
 *
 * Checks that the requested length is exactly PAGE_SIZE (the action taken
 * otherwise is on a line not shown), marks the VMA as VM_PFNMAP with
 * non-cached page protection and maps the ATSD physical page with
 * remap_pfn_range(). The result is reported through the
 * vfio_pci_npu2_mmap tracepoint.
 * The return statement is not visible in this listing.
 */
331 static int vfio_pci_npu2_mmap(struct vfio_pci_device
*vdev
,
332 struct vfio_pci_region
*region
, struct vm_area_struct
*vma
)
335 struct vfio_pci_npu2_data
*data
= region
->data
;
336 unsigned long req_len
= vma
->vm_end
- vma
->vm_start
;
338 if (req_len
!= PAGE_SIZE
)
341 vma
->vm_flags
|= VM_PFNMAP
;
342 vma
->vm_page_prot
= pgprot_noncached(vma
->vm_page_prot
);
344 ret
= remap_pfn_range(vma
, vma
->vm_start
, data
->mmio_atsd
>> PAGE_SHIFT
,
345 req_len
, vma
->vm_page_prot
);
346 trace_vfio_pci_npu2_mmap(vdev
->pdev
, data
->mmio_atsd
, vma
->vm_start
,
347 vma
->vm_end
- vma
->vm_start
, ret
);
/*
 * Release callback of the ATSD register region.
 *
 * @vdev:   VFIO PCI device owning the region
 * @region: region being released; region->data is the npu2 private state
 *
 * Unmaps the ATSD register mapping created with memremap() at init time.
 * NOTE(review): freeing of @data, if any, is on a line not visible in this
 * listing.
 */
352 static void vfio_pci_npu2_release(struct vfio_pci_device
*vdev
,
353 struct vfio_pci_region
*region
)
355 struct vfio_pci_npu2_data
*data
= region
->data
;
357 memunmap(data
->base
);
/*
 * add_capability callback of the ATSD register region.
 *
 * @vdev:   VFIO PCI device owning the region
 * @region: region being described; region->data is the npu2 private state
 * @caps:   capability chain to append to
 *
 * Appends two capabilities with vfio_info_add_capability(): first
 * VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, then
 * VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD carrying data->link_speed. The
 * result of the second call is returned.
 * NOTE(review): the check of the first call's result and the remaining
 * initializers of both structures are not visible in this listing.
 */
361 static int vfio_pci_npu2_add_capability(struct vfio_pci_device
*vdev
,
362 struct vfio_pci_region
*region
, struct vfio_info_cap
*caps
)
364 struct vfio_pci_npu2_data
*data
= region
->data
;
365 struct vfio_region_info_cap_nvlink2_ssatgt captgt
= {
366 .header
.id
= VFIO_REGION_INFO_CAP_NVLINK2_SSATGT
,
370 struct vfio_region_info_cap_nvlink2_lnkspd capspd
= {
371 .header
.id
= VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD
,
373 .link_speed
= data
->link_speed
377 ret
= vfio_info_add_capability(caps
, &captgt
.header
, sizeof(captgt
));
381 return vfio_info_add_capability(caps
, &capspd
.header
, sizeof(capspd
));
/*
 * Region operations of the ATSD register region; passed to
 * vfio_pci_register_dev_region() by vfio_pci_ibm_npu2_init().
 */
384 static const struct vfio_pci_regops vfio_pci_npu2_regops
= {
385 .rw
= vfio_pci_npu2_rw
,
386 .mmap
= vfio_pci_npu2_mmap
,
387 .release
= vfio_pci_npu2_release
,
388 .add_capability
= vfio_pci_npu2_add_capability
,
/*
 * Set up the ATSD register region for an NVLink2 bridge (NPU2) device.
 *
 * @vdev: VFIO PCI device wrapping the NVLink bridge
 *
 * Visible steps:
 *   1. check that a GPU is associated with the bridge;
 *   2. read the link index ("ibm,npu-link-index") from the node referenced
 *      by the "ibm,nvlink" phandle;
 *   3. pick the ATSD register for that index from the host bridge's
 *      "ibm,mmio-atsd" property, falling back to entry 0 with a warning;
 *   4. read "ibm,device-tgt-addr" and "ibm,nvlink-speed" from the NPU
 *      node, warning when either is absent (link_speed starts at 0xff);
 *   5. allocate the private state and, if an ATSD address was found, map
 *      SZ_64K of it with memremap(MEMREMAP_WT);
 *   6. register the device region, sized PAGE_SIZE when an ATSD register
 *      is available and 0 otherwise, with READ | WRITE | MMAP flags.
 *
 * The trailing memunmap() presumably belongs to an error-unwind path;
 * error handling and the return statements are on lines not visible in
 * this listing.
 */
391 int vfio_pci_ibm_npu2_init(struct vfio_pci_device
*vdev
)
394 struct vfio_pci_npu2_data
*data
;
395 struct device_node
*nvlink_dn
;
396 u32 nvlink_index
= 0;
397 struct pci_dev
*npdev
= vdev
->pdev
;
398 struct device_node
*npu_node
= pci_device_to_OF_node(npdev
);
399 struct pci_controller
*hose
= pci_bus_to_host(npdev
->bus
);
402 u32 link_speed
= 0xff;
405 * PCI config space does not tell us about NVLink presense but
406 * platform does, use this.
408 if (!pnv_pci_get_gpu_dev(vdev
->pdev
))
412 * NPU2 normally has 8 ATSD registers (for concurrency) and 6 links
413 * so we can allocate one register per link, using nvlink index as
415 * There is always at least one ATSD register so as long as at least
416 * NVLink bridge #0 is passed to the guest, ATSD will be available.
418 nvlink_dn
= of_parse_phandle(npdev
->dev
.of_node
, "ibm,nvlink", 0);
419 if (WARN_ON(of_property_read_u32(nvlink_dn
, "ibm,npu-link-index",
423 if (of_property_read_u64_index(hose
->dn
, "ibm,mmio-atsd", nvlink_index
,
425 if (of_property_read_u64_index(hose
->dn
, "ibm,mmio-atsd", 0,
427 dev_warn(&vdev
->pdev
->dev
, "No available ATSD found\n");
430 dev_warn(&vdev
->pdev
->dev
,
431 "Using fallback ibm,mmio-atsd[0] for ATSD.\n");
435 if (of_property_read_u64(npu_node
, "ibm,device-tgt-addr", &tgt
)) {
436 dev_warn(&vdev
->pdev
->dev
, "No ibm,device-tgt-addr found\n");
440 if (of_property_read_u32(npu_node
, "ibm,nvlink-speed", &link_speed
)) {
441 dev_warn(&vdev
->pdev
->dev
, "No ibm,nvlink-speed found\n");
445 data
= kzalloc(sizeof(*data
), GFP_KERNEL
);
449 data
->mmio_atsd
= mmio_atsd
;
451 data
->link_speed
= link_speed
;
452 if (data
->mmio_atsd
) {
453 data
->base
= memremap(data
->mmio_atsd
, SZ_64K
, MEMREMAP_WT
);
461 * We want to expose the capability even if this specific NVLink
462 * did not get its own ATSD register because capabilities
463 * belong to VFIO regions and normally there will be ATSD register
464 * assigned to the NVLink bridge.
466 ret
= vfio_pci_register_dev_region(vdev
,
468 VFIO_REGION_TYPE_PCI_VENDOR_TYPE
,
469 VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD
,
470 &vfio_pci_npu2_regops
,
471 data
->mmio_atsd
? PAGE_SIZE
: 0,
472 VFIO_REGION_INFO_FLAG_READ
|
473 VFIO_REGION_INFO_FLAG_WRITE
|
474 VFIO_REGION_INFO_FLAG_MMAP
,
483 memunmap(data
->base
);