// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO PCI NVIDIA Witherspoon GPU support a.k.a. NVLink2.
 *
 * Copyright (C) 2018 IBM Corp.  All rights reserved.
 *	Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * Register an on-GPU RAM region for cacheable access.
 *
 * Derived from original vfio_pci_igd.c:
 * Copyright (C) 2016 Red Hat, Inc.  All rights reserved.
 *	Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/io.h>
#include <linux/pci.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/mmu_context.h>
#include <asm/kvm_ppc.h>
#include "vfio_pci_private.h"

#define CREATE_TRACE_POINTS
#include "trace.h"

EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap);
EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap);
struct vfio_pci_nvgpu_data {
	unsigned long gpu_hpa;	/* GPU RAM physical address */
	unsigned long gpu_tgt;	/* TGT address of corresponding GPU RAM */
	unsigned long useraddr;	/* GPU RAM userspace address */
	unsigned long size;	/* Size of the GPU RAM window (usually 128GB) */
	struct mm_struct *mm;
	struct mm_iommu_table_group_mem_t *mem;	/* Pre-registered RAM descr. */
	struct pci_dev *gpdev;
	struct notifier_block group_notifier;
};
static size_t vfio_pci_nvgpu_rw(struct vfio_pci_device *vdev,
		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
	struct vfio_pci_nvgpu_data *data = vdev->region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	loff_t posaligned = pos & PAGE_MASK, posoff = pos & ~PAGE_MASK;
	size_t sizealigned;
	void __iomem *ptr;

	if (pos >= vdev->region[i].size)
		return -EINVAL;

	count = min(count, (size_t)(vdev->region[i].size - pos));

	/*
	 * We map only a bit of GPU RAM for a short time instead of mapping it
	 * for the guest lifetime as:
	 *
	 * 1) we do not know the GPU RAM size, only the aperture which is 4-8
	 *    times bigger than the actual RAM size (16/32GB RAM vs. 128GB
	 *    aperture);
	 * 2) mapping GPU RAM allows the CPU to prefetch and if this happens
	 *    before the NVLink bridge is reset (which fences GPU RAM),
	 *    hardware management interrupts (HMI) might happen which
	 *    will freeze the NVLink bridge.
	 *
	 * This is not a fast path anyway.
	 */
	sizealigned = _ALIGN_UP(posoff + count, PAGE_SIZE);
	ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned);
	if (!ptr)
		return -EFAULT;

	if (iswrite) {
		if (copy_from_user(ptr + posoff, buf, count))
			count = -EFAULT;
		else
			*ppos += count;
	} else {
		if (copy_to_user(buf, ptr + posoff, count))
			count = -EFAULT;
		else
			*ppos += count;
	}

	iounmap(ptr);

	return count;
}
static void vfio_pci_nvgpu_release(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region)
{
	struct vfio_pci_nvgpu_data *data = region->data;
	long ret;

	/* If there were any mappings at all... */
	if (data->mm) {
		ret = mm_iommu_put(data->mm, data->mem);
		WARN_ON(ret);

		mmdrop(data->mm);
	}

	vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
			&data->group_notifier);

	pnv_npu2_unmap_lpar_dev(data->gpdev);

	kfree(data);
}
static vm_fault_t vfio_pci_nvgpu_mmap_fault(struct vm_fault *vmf)
{
	vm_fault_t ret;
	struct vm_area_struct *vma = vmf->vma;
	struct vfio_pci_region *region = vma->vm_private_data;
	struct vfio_pci_nvgpu_data *data = region->data;
	unsigned long vmf_off = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
	unsigned long nv2pg = data->gpu_hpa >> PAGE_SHIFT;
	unsigned long vm_pgoff = vma->vm_pgoff &
		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	unsigned long pfn = nv2pg + vm_pgoff + vmf_off;

	ret = vmf_insert_pfn(vma, vmf->address, pfn);
	trace_vfio_pci_nvgpu_mmap_fault(data->gpdev, pfn << PAGE_SHIFT,
			vmf->address, ret);

	return ret;
}

static const struct vm_operations_struct vfio_pci_nvgpu_mmap_vmops = {
	.fault = vfio_pci_nvgpu_mmap_fault,
};
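
/*
 * Handle userspace mmap() of the GPU RAM window: the VMA becomes a
 * VM_PFNMAP mapping backed by the fault handler above, and the window is
 * also pre-registered with the mm_iommu code so the TCE/KVM paths can
 * later look it up via mm_iommu_find() instead of pinning it again.
 */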
static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vm_area_struct *vma)
{
	int ret;
	struct vfio_pci_nvgpu_data *data = region->data;

	if (data->useraddr)
		return -EPERM;

	if (vma->vm_end - vma->vm_start > data->size)
		return -EINVAL;

	vma->vm_private_data = region;
	vma->vm_flags |= VM_PFNMAP;
	vma->vm_ops = &vfio_pci_nvgpu_mmap_vmops;

	/*
	 * Calling mm_iommu_newdev() here once as the region is not
	 * registered yet and therefore right initialization will happen now.
	 * Other places will use mm_iommu_find() which returns
	 * registered @mem and does not go gup().
	 */
	data->useraddr = vma->vm_start;
	data->mm = current->mm;

	atomic_inc(&data->mm->mm_count);
	ret = (int) mm_iommu_newdev(data->mm, data->useraddr,
			vma_pages(vma), data->gpu_hpa, &data->mem);

	trace_vfio_pci_nvgpu_mmap(vdev->pdev, data->gpu_hpa, data->useraddr,
			vma->vm_end - vma->vm_start, ret);

	return ret;
}
static int vfio_pci_nvgpu_add_capability(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vfio_info_cap *caps)
{
	struct vfio_pci_nvgpu_data *data = region->data;
	struct vfio_region_info_cap_nvlink2_ssatgt cap = {
		.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT,
		.header.version = 1,
		.tgt = data->gpu_tgt
	};

	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
}
static const struct vfio_pci_regops vfio_pci_nvgpu_regops = {
	.rw = vfio_pci_nvgpu_rw,
	.release = vfio_pci_nvgpu_release,
	.mmap = vfio_pci_nvgpu_mmap,
	.add_capability = vfio_pci_nvgpu_add_capability,
};
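
/*
 * Usage sketch (illustrative only, not part of this driver): a VFIO
 * userspace client is expected to discover this region through
 * VFIO_DEVICE_GET_REGION_INFO, match the NVIDIA vendor type and the
 * NVLINK2_RAM subtype reported in the region capability chain, and then
 * mmap() the GPU RAM window, roughly:
 *
 *	struct vfio_region_info info = {
 *		.argsz = sizeof(info),
 *		.index = idx,		(a device-specific region index)
 *	};
 *
 *	ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info);
 *	gpu_ram = mmap(NULL, info.size, PROT_READ | PROT_WRITE, MAP_SHARED,
 *			device_fd, info.offset);
 *
 * The SSATGT capability added by vfio_pci_nvgpu_add_capability() above
 * carries the GPU RAM TGT address the client needs in order to set up the
 * guest NVLink2 mappings.
 */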
static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb,
		unsigned long action, void *opaque)
{
	struct kvm *kvm = opaque;
	struct vfio_pci_nvgpu_data *data = container_of(nb,
			struct vfio_pci_nvgpu_data,
			group_notifier);

	if (action == VFIO_GROUP_NOTIFY_SET_KVM && kvm &&
			pnv_npu2_map_lpar_dev(data->gpdev,
				kvm->arch.lpid, MSR_DR | MSR_PR))
		return NOTIFY_BAD;

	return NOTIFY_OK;
}
int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev)
{
	int ret;
	u64 reg[2];
	u64 tgt = 0;
	struct device_node *npu_node, *mem_node;
	struct pci_dev *npu_dev;
	struct vfio_pci_nvgpu_data *data;
	uint32_t mem_phandle = 0;
	unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;

	/*
	 * PCI config space does not tell us about NVLink presence but the
	 * platform does, use this.
	 */
	npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0);
	if (!npu_dev)
		return -ENODEV;

	npu_node = pci_device_to_OF_node(npu_dev);
	if (!npu_node)
		return -EINVAL;

	if (of_property_read_u32(npu_node, "memory-region", &mem_phandle))
		return -EINVAL;

	mem_node = of_find_node_by_phandle(mem_phandle);
	if (!mem_node)
		return -EINVAL;

	if (of_property_read_variable_u64_array(mem_node, "reg", reg,
				ARRAY_SIZE(reg), ARRAY_SIZE(reg)) !=
			ARRAY_SIZE(reg))
		return -EINVAL;

	if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
		dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
		return -EFAULT;
	}

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	data->gpu_hpa = reg[0];
	data->gpu_tgt = tgt;
	data->size = reg[1];

	dev_dbg(&vdev->pdev->dev, "%lx..%lx\n", data->gpu_hpa,
			data->gpu_hpa + data->size - 1);

	data->gpdev = vdev->pdev;
	data->group_notifier.notifier_call = vfio_pci_nvgpu_group_notifier;

	ret = vfio_register_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
			&events, &data->group_notifier);
	if (ret)
		goto free_exit;

	/*
	 * We have just set KVM, we do not need the listener anymore.
	 * Also, keeping it registered means that if more than one GPU is
	 * assigned, we will get several similar notifiers notifying about
	 * the same device again which does not help with anything.
	 */
	vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
			&data->group_notifier);

	ret = vfio_pci_register_dev_region(vdev,
			PCI_VENDOR_ID_NVIDIA | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
			VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
			&vfio_pci_nvgpu_regops,
			data->size,
			VFIO_REGION_INFO_FLAG_READ |
			VFIO_REGION_INFO_FLAG_WRITE |
			VFIO_REGION_INFO_FLAG_MMAP,
			data);
	if (ret)
		goto free_exit;

	return 0;

free_exit:
	kfree(data);

	return ret;
}
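
/*
 * IBM NPU2 bridge: the rest of this file exposes one page of the bridge's
 * ATSD (Address Translation Shootdown) MMIO as another device-specific
 * region so the guest driver can issue GPU TLB invalidations directly.
 */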
struct vfio_pci_npu2_data {
	void *base; /* ATSD register virtual address, for emulated access */
	unsigned long mmio_atsd; /* ATSD physical address */
	unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */
	unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */
};
static size_t vfio_pci_npu2_rw(struct vfio_pci_device *vdev,
		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
	struct vfio_pci_npu2_data *data = vdev->region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;

	if (pos >= vdev->region[i].size)
		return -EINVAL;

	count = min(count, (size_t)(vdev->region[i].size - pos));

	if (iswrite) {
		if (copy_from_user(data->base + pos, buf, count))
			return -EFAULT;
	} else {
		if (copy_to_user(buf, data->base + pos, count))
			return -EFAULT;
	}
	*ppos += count;

	return count;
}
static int vfio_pci_npu2_mmap(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vm_area_struct *vma)
{
	int ret;
	struct vfio_pci_npu2_data *data = region->data;
	unsigned long req_len = vma->vm_end - vma->vm_start;

	if (req_len != PAGE_SIZE)
		return -EINVAL;

	vma->vm_flags |= VM_PFNMAP;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

	ret = remap_pfn_range(vma, vma->vm_start, data->mmio_atsd >> PAGE_SHIFT,
			req_len, vma->vm_page_prot);
	trace_vfio_pci_npu2_mmap(vdev->pdev, data->mmio_atsd, vma->vm_start,
			vma->vm_end - vma->vm_start, ret);

	return ret;
}
static void vfio_pci_npu2_release(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region)
{
	struct vfio_pci_npu2_data *data = region->data;

	memunmap(data->base);
	kfree(data);
}
static int vfio_pci_npu2_add_capability(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vfio_info_cap *caps)
{
	struct vfio_pci_npu2_data *data = region->data;
	struct vfio_region_info_cap_nvlink2_ssatgt captgt = {
		.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT,
		.header.version = 1,
		.tgt = data->gpu_tgt
	};
	struct vfio_region_info_cap_nvlink2_lnkspd capspd = {
		.header.id = VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD,
		.header.version = 1,
		.link_speed = data->link_speed
	};
	int ret;

	ret = vfio_info_add_capability(caps, &captgt.header, sizeof(captgt));
	if (ret)
		return ret;

	return vfio_info_add_capability(caps, &capspd.header, sizeof(capspd));
}
static const struct vfio_pci_regops vfio_pci_npu2_regops = {
	.rw = vfio_pci_npu2_rw,
	.mmap = vfio_pci_npu2_mmap,
	.release = vfio_pci_npu2_release,
	.add_capability = vfio_pci_npu2_add_capability,
};
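
/*
 * Usage sketch (illustrative only, not part of this driver): the TGT
 * address and link speed are reported via the region info capability
 * chain, so a userspace client that fetched region info with
 * VFIO_REGION_INFO_FLAG_CAPS set would walk it roughly like this:
 *
 *	struct vfio_info_cap_header *hdr;
 *	__u32 off = info->cap_offset;
 *
 *	while (off) {
 *		hdr = (void *)info + off;
 *		if (hdr->id == VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD)
 *			speed = ((struct vfio_region_info_cap_nvlink2_lnkspd *)
 *					hdr)->link_speed;
 *		off = hdr->next;
 *	}
 */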
int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
{
	int ret;
	struct vfio_pci_npu2_data *data;
	struct device_node *nvlink_dn;
	u32 nvlink_index = 0;
	struct pci_dev *npdev = vdev->pdev;
	struct device_node *npu_node = pci_device_to_OF_node(npdev);
	struct pci_controller *hose = pci_bus_to_host(npdev->bus);
	u64 mmio_atsd = 0;
	u64 tgt = 0;
	u32 link_speed = 0xff;

	/*
	 * PCI config space does not tell us about NVLink presence but the
	 * platform does, use this.
	 */
	if (!pnv_pci_get_gpu_dev(vdev->pdev))
		return -ENODEV;

	/*
	 * NPU2 normally has 8 ATSD registers (for concurrency) and 6 links
	 * so we can allocate one register per link, using nvlink index as
	 * a key.
	 * There is always at least one ATSD register so as long as at least
	 * NVLink bridge #0 is passed to the guest, ATSD will be available.
	 */
	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
			&nvlink_index)))
		return -ENODEV;

	if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index,
			&mmio_atsd)) {
		dev_warn(&vdev->pdev->dev, "No available ATSD found\n");
		mmio_atsd = 0;
	}

	if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
		dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
		return -EFAULT;
	}

	if (of_property_read_u32(npu_node, "ibm,nvlink-speed", &link_speed)) {
		dev_warn(&vdev->pdev->dev, "No ibm,nvlink-speed found\n");
		return -EFAULT;
	}

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	data->mmio_atsd = mmio_atsd;
	data->gpu_tgt = tgt;
	data->link_speed = link_speed;
	if (data->mmio_atsd) {
		data->base = memremap(data->mmio_atsd, SZ_64K, MEMREMAP_WT);
		if (!data->base) {
			ret = -ENOMEM;
			goto free_exit;
		}
	}

	/*
	 * We want to expose the capability even if this specific NVLink
	 * did not get its own ATSD register because capabilities
	 * belong to VFIO regions and normally there will be an ATSD register
	 * assigned to the NVLink bridge.
	 */
	ret = vfio_pci_register_dev_region(vdev,
			PCI_VENDOR_ID_IBM |
			VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
			VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
			&vfio_pci_npu2_regops,
			data->mmio_atsd ? PAGE_SIZE : 0,
			VFIO_REGION_INFO_FLAG_READ |
			VFIO_REGION_INFO_FLAG_WRITE |
			VFIO_REGION_INFO_FLAG_MMAP,
			data);
	if (ret)
		goto free_exit;

	return 0;

free_exit:
	if (data->base)
		memunmap(data->base);
	kfree(data);

	return ret;
}