// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO PCI NVIDIA Witherspoon GPU support a.k.a. NVLink2.
 *
 * Copyright (C) 2018 IBM Corp.  All rights reserved.
 *     Author: Alexey Kardashevskiy <aik@ozlabs.ru>
 *
 * Register an on-GPU RAM region for cacheable access.
 *
 * Derived from original vfio_pci_igd.c:
 * Copyright (C) 2016 Red Hat, Inc.  All rights reserved.
 *	Author: Alex Williamson <alex.williamson@redhat.com>
 */

#include <linux/io.h>
#include <linux/pci.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/mmu_context.h>
#include <asm/kvm_ppc.h>
#include "vfio_pci_private.h"

#define CREATE_TRACE_POINTS
#include "trace.h"

EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap);
EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap);
struct vfio_pci_nvgpu_data {
	unsigned long gpu_hpa;		/* GPU RAM physical address */
	unsigned long gpu_tgt;		/* TGT address of corresponding GPU RAM */
	unsigned long useraddr;		/* GPU RAM userspace address */
	unsigned long size;		/* Size of the GPU RAM window (usually 128GB) */
	struct mm_struct *mm;
	struct mm_iommu_table_group_mem_t *mem;	/* Pre-registered RAM descr. */
	struct pci_dev *gpdev;
	struct notifier_block group_notifier;
};
static size_t vfio_pci_nvgpu_rw(struct vfio_pci_device *vdev,
		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
	struct vfio_pci_nvgpu_data *data = vdev->region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
	loff_t posaligned = pos & PAGE_MASK, posoff = pos & ~PAGE_MASK;
	size_t sizealigned;
	void __iomem *ptr;

	if (pos >= vdev->region[i].size)
		return -EINVAL;

	count = min(count, (size_t)(vdev->region[i].size - pos));
	/*
	 * We map only a bit of GPU RAM for a short time instead of mapping it
	 * for the guest lifetime as:
	 *
	 * 1) we do not know GPU RAM size, only aperture which is 4-8 times
	 *    bigger than actual RAM size (16/32GB RAM vs. 128GB aperture);
	 * 2) mapping GPU RAM allows CPU to prefetch and if this happens
	 *    before NVLink bridge is reset (which fences GPU RAM),
	 *    hardware management interrupts (HMI) might happen, this
	 *    will freeze NVLink bridge.
	 *
	 * This is not fast path anyway.
	 */
	sizealigned = _ALIGN_UP(posoff + count, PAGE_SIZE);
	ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned);
	if (!ptr)
		return -EFAULT;

	if (iswrite) {
		if (copy_from_user(ptr + posoff, buf, count))
			count = -EFAULT;
		else
			*ppos += count;
	} else {
		if (copy_to_user(buf, ptr + posoff, count))
			count = -EFAULT;
		else
			*ppos += count;
	}

	iounmap(ptr);

	return count;
}
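
/*
 * Illustrative userspace sketch (assumption, not part of this driver): the
 * region above is accessed through the VFIO device fd at the offset reported
 * by VFIO_DEVICE_GET_REGION_INFO for the NVLink2 RAM region index, e.g.:
 *
 *	struct vfio_region_info info = { .argsz = sizeof(info), .index = idx };
 *	ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info);
 *	pread(device_fd, buf, len, info.offset + pos);
 *
 * "idx", "buf", "len" and "pos" are hypothetical names; such reads and writes
 * land in vfio_pci_nvgpu_rw() above.
 */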
static void vfio_pci_nvgpu_release(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region)
{
	struct vfio_pci_nvgpu_data *data = region->data;
	long ret;

	/* If there were any mappings at all... */
	if (data->mm) {
		ret = mm_iommu_put(data->mm, data->mem);
		WARN_ON(ret);

		mmdrop(data->mm);
	}

	vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
			&data->group_notifier);

	pnv_npu2_unmap_lpar_dev(data->gpdev);

	kfree(data);
}
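
/*
 * Fault handler for the mmap'ed GPU RAM window: it resolves the faulting
 * userspace address to a GPU RAM pfn (based at gpu_hpa) and inserts it into
 * the VMA one page at a time.
 */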
static vm_fault_t vfio_pci_nvgpu_mmap_fault(struct vm_fault *vmf)
{
	vm_fault_t ret;
	struct vm_area_struct *vma = vmf->vma;
	struct vfio_pci_region *region = vma->vm_private_data;
	struct vfio_pci_nvgpu_data *data = region->data;
	unsigned long vmf_off = (vmf->address - vma->vm_start) >> PAGE_SHIFT;
	unsigned long nv2pg = data->gpu_hpa >> PAGE_SHIFT;
	unsigned long vm_pgoff = vma->vm_pgoff &
			((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
	unsigned long pfn = nv2pg + vm_pgoff + vmf_off;

	ret = vmf_insert_pfn(vma, vmf->address, pfn);
	trace_vfio_pci_nvgpu_mmap_fault(data->gpdev, pfn << PAGE_SHIFT,
			vmf->address, ret);

	return ret;
}
static const struct vm_operations_struct vfio_pci_nvgpu_mmap_vmops = {
	.fault = vfio_pci_nvgpu_mmap_fault,
};
static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vm_area_struct *vma)
{
	int ret;
	struct vfio_pci_nvgpu_data *data = region->data;

	if (data->useraddr)
		return -EPERM;

	if (vma->vm_end - vma->vm_start > data->size)
		return -EINVAL;

	vma->vm_private_data = region;
	vma->vm_flags |= VM_PFNMAP;
	vma->vm_ops = &vfio_pci_nvgpu_mmap_vmops;
	/*
	 * Calling mm_iommu_newdev() here once as the region is not
	 * registered yet and therefore right initialization will happen now.
	 * Other places will use mm_iommu_find() which returns
	 * registered @mem and does not go gup().
	 */
	data->useraddr = vma->vm_start;
	data->mm = current->mm;

	atomic_inc(&data->mm->mm_count);
	ret = (int) mm_iommu_newdev(data->mm, data->useraddr,
			(vma->vm_end - vma->vm_start) >> PAGE_SHIFT,
			data->gpu_hpa, &data->mem);

	trace_vfio_pci_nvgpu_mmap(vdev->pdev, data->gpu_hpa, data->useraddr,
			vma->vm_end - vma->vm_start, ret);

	return ret;
}
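
/*
 * Illustrative userspace sketch (assumption, not part of this driver): the
 * GPU RAM region is typically mapped with mmap() on the device fd using the
 * offset from VFIO_DEVICE_GET_REGION_INFO, which ends up in
 * vfio_pci_nvgpu_mmap() above and pre-registers the memory for IOMMU use:
 *
 *	void *gpu_ram = mmap(NULL, info.size, PROT_READ | PROT_WRITE,
 *			     MAP_SHARED, device_fd, info.offset);
 *
 * "info" here is the hypothetical vfio_region_info fetched by userspace.
 */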
static int vfio_pci_nvgpu_add_capability(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vfio_info_cap *caps)
{
	struct vfio_pci_nvgpu_data *data = region->data;
	struct vfio_region_info_cap_nvlink2_ssatgt cap = {
		.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT,
		.header.version = 1,
		.tgt = data->gpu_tgt
	};

	return vfio_info_add_capability(caps, &cap.header, sizeof(cap));
}
static const struct vfio_pci_regops vfio_pci_nvgpu_regops = {
	.rw = vfio_pci_nvgpu_rw,
	.release = vfio_pci_nvgpu_release,
	.mmap = vfio_pci_nvgpu_mmap,
	.add_capability = vfio_pci_nvgpu_add_capability,
};
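
/*
 * Group notifier: called when userspace associates the VFIO group with a KVM
 * instance (VFIO_GROUP_NOTIFY_SET_KVM), at which point the GPU RAM window is
 * mapped into the guest's LPAR via pnv_npu2_map_lpar_dev().
 */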
static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb,
		unsigned long action, void *opaque)
{
	struct kvm *kvm = opaque;
	struct vfio_pci_nvgpu_data *data = container_of(nb,
			struct vfio_pci_nvgpu_data,
			group_notifier);

	if (action == VFIO_GROUP_NOTIFY_SET_KVM && kvm &&
			pnv_npu2_map_lpar_dev(data->gpdev,
				kvm->arch.lpid, MSR_DR | MSR_PR))
		return NOTIFY_BAD;

	return NOTIFY_OK;
}
int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev)
{
	int ret;
	u64 reg[2];
	u64 tgt = 0;
	struct device_node *npu_node, *mem_node;
	struct pci_dev *npu_dev;
	struct vfio_pci_nvgpu_data *data;
	uint32_t mem_phandle = 0;
	unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM;
	/*
	 * PCI config space does not tell us about NVLink presence but
	 * the platform does, use this.
	 */
	npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0);
	if (!npu_dev)
		return -ENODEV;

	npu_node = pci_device_to_OF_node(npu_dev);
	if (!npu_node)
		return -EINVAL;
	if (of_property_read_u32(npu_node, "memory-region", &mem_phandle))
		return -ENODEV;

	mem_node = of_find_node_by_phandle(mem_phandle);
	if (!mem_node)
		return -EINVAL;

	if (of_property_read_variable_u64_array(mem_node, "reg", reg,
				ARRAY_SIZE(reg), ARRAY_SIZE(reg)) !=
			ARRAY_SIZE(reg))
		return -EINVAL;

	if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
		dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
		return -EFAULT;
	}
	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	data->gpu_hpa = reg[0];
	data->gpu_tgt = tgt;
	data->size = reg[1];

	dev_dbg(&vdev->pdev->dev, "%lx..%lx\n", data->gpu_hpa,
			data->gpu_hpa + data->size - 1);

	data->gpdev = vdev->pdev;
	data->group_notifier.notifier_call = vfio_pci_nvgpu_group_notifier;

	ret = vfio_register_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
			&events, &data->group_notifier);
	if (ret)
		goto free_exit;

	/*
	 * We have just set KVM, we do not need the listener anymore.
	 * Also, keeping it registered means that if more than one GPU is
	 * assigned, we will get several similar notifiers notifying about
	 * the same device again which does not help with anything.
	 */
	vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY,
			&data->group_notifier);
	ret = vfio_pci_register_dev_region(vdev,
			PCI_VENDOR_ID_NVIDIA | VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
			VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM,
			&vfio_pci_nvgpu_regops,
			data->size,
			VFIO_REGION_INFO_FLAG_READ |
			VFIO_REGION_INFO_FLAG_WRITE |
			VFIO_REGION_INFO_FLAG_MMAP,
			data);
	if (ret)
		goto free_exit;

	return 0;

free_exit:
	kfree(data);

	return ret;
}
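
/*
 * IBM NPU2 (NVLink host bridge) support: expose the ATSD (address translation
 * shootdown) register assigned to the bridge as a small MMIO region.
 */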
struct vfio_pci_npu2_data {
	void *base; /* ATSD register virtual address, for emulated access */
	unsigned long mmio_atsd; /* ATSD physical address */
	unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */
	unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */
};
static size_t vfio_pci_npu2_rw(struct vfio_pci_device *vdev,
		char __user *buf, size_t count, loff_t *ppos, bool iswrite)
{
	unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS;
	struct vfio_pci_npu2_data *data = vdev->region[i].data;
	loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK;

	if (pos >= vdev->region[i].size)
		return -EINVAL;

	count = min(count, (size_t)(vdev->region[i].size - pos));

	if (iswrite) {
		if (copy_from_user(data->base + pos, buf, count))
			return -EFAULT;
	} else {
		if (copy_to_user(buf, data->base + pos, count))
			return -EFAULT;
	}
	*ppos += count;

	return count;
}
static int vfio_pci_npu2_mmap(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vm_area_struct *vma)
{
	int ret;
	struct vfio_pci_npu2_data *data = region->data;
	unsigned long req_len = vma->vm_end - vma->vm_start;

	if (req_len != PAGE_SIZE)
		return -EINVAL;

	vma->vm_flags |= VM_PFNMAP;
	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);

	ret = remap_pfn_range(vma, vma->vm_start, data->mmio_atsd >> PAGE_SHIFT,
			req_len, vma->vm_page_prot);
	trace_vfio_pci_npu2_mmap(vdev->pdev, data->mmio_atsd, vma->vm_start,
			vma->vm_end - vma->vm_start, ret);

	return ret;
}
static void vfio_pci_npu2_release(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region)
{
	struct vfio_pci_npu2_data *data = region->data;

	memunmap(data->base);
	kfree(data);
}
static int vfio_pci_npu2_add_capability(struct vfio_pci_device *vdev,
		struct vfio_pci_region *region, struct vfio_info_cap *caps)
{
	struct vfio_pci_npu2_data *data = region->data;
	struct vfio_region_info_cap_nvlink2_ssatgt captgt = {
		.header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT,
		.header.version = 1,
		.tgt = data->gpu_tgt
	};
	struct vfio_region_info_cap_nvlink2_lnkspd capspd = {
		.header.id = VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD,
		.header.version = 1,
		.link_speed = data->link_speed
	};
	int ret;

	ret = vfio_info_add_capability(caps, &captgt.header, sizeof(captgt));
	if (ret)
		return ret;

	return vfio_info_add_capability(caps, &capspd.header, sizeof(capspd));
}
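
/*
 * Illustrative userspace sketch (assumption, not part of this driver): the
 * SSATGT and LNKSPD capabilities above are discovered by walking the region
 * info capability chain, e.g.:
 *
 *	struct vfio_region_info *info = ...; // fetched with a large enough argsz
 *	struct vfio_info_cap_header *hdr;
 *	__u32 off = info->cap_offset;
 *
 *	while (off) {
 *		hdr = (void *)info + off;
 *		// hdr->id is VFIO_REGION_INFO_CAP_NVLINK2_SSATGT or _LNKSPD
 *		off = hdr->next;
 *	}
 */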
static const struct vfio_pci_regops vfio_pci_npu2_regops = {
	.rw = vfio_pci_npu2_rw,
	.mmap = vfio_pci_npu2_mmap,
	.release = vfio_pci_npu2_release,
	.add_capability = vfio_pci_npu2_add_capability,
};
int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev)
{
	int ret;
	struct vfio_pci_npu2_data *data;
	struct device_node *nvlink_dn;
	u32 nvlink_index = 0;
	struct pci_dev *npdev = vdev->pdev;
	struct device_node *npu_node = pci_device_to_OF_node(npdev);
	struct pci_controller *hose = pci_bus_to_host(npdev->bus);
	u64 mmio_atsd = 0;
	u64 tgt = 0;
	u32 link_speed = 0xff;

	/*
	 * PCI config space does not tell us about NVLink presence but
	 * the platform does, use this.
	 */
	if (!pnv_pci_get_gpu_dev(vdev->pdev))
		return -ENODEV;
	/*
	 * NPU2 normally has 8 ATSD registers (for concurrency) and 6 links
	 * so we can allocate one register per link, using nvlink index as
	 * a key.
	 * There is always at least one ATSD register so as long as at least
	 * NVLink bridge #0 is passed to the guest, ATSD will be available.
	 */
	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
			&nvlink_index)))
		return -ENODEV;

	if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index,
			&mmio_atsd)) {
		dev_warn(&vdev->pdev->dev, "No available ATSD found\n");
		mmio_atsd = 0;
	}
	if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) {
		dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n");
		return -EFAULT;
	}

	if (of_property_read_u32(npu_node, "ibm,nvlink-speed", &link_speed)) {
		dev_warn(&vdev->pdev->dev, "No ibm,nvlink-speed found\n");
		return -EFAULT;
	}
	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return -ENOMEM;

	data->mmio_atsd = mmio_atsd;
	data->gpu_tgt = tgt;
	data->link_speed = link_speed;
	if (data->mmio_atsd) {
		data->base = memremap(data->mmio_atsd, SZ_64K, MEMREMAP_WT);
		if (!data->base) {
			ret = -ENOMEM;
			goto free_exit;
		}
	}
	/*
	 * We want to expose the capability even if this specific NVLink
	 * did not get its own ATSD register because capabilities
	 * belong to VFIO regions and normally there will be ATSD register
	 * assigned to the NVLink bridge.
	 */
	ret = vfio_pci_register_dev_region(vdev,
			PCI_VENDOR_ID_IBM |
			VFIO_REGION_TYPE_PCI_VENDOR_TYPE,
			VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD,
			&vfio_pci_npu2_regops,
			data->mmio_atsd ? PAGE_SIZE : 0,
			VFIO_REGION_INFO_FLAG_READ |
			VFIO_REGION_INFO_FLAG_WRITE |
			VFIO_REGION_INFO_FLAG_MMAP,
			data);
	if (ret)
		goto free_exit;

	return 0;

free_exit:
	if (data->base)
		memunmap(data->base);
	kfree(data);

	return ret;
}
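
/*
 * Note (assumption about the callers, which live outside this file): both
 * init helpers above are expected to be invoked from the generic vfio-pci
 * enable path when the bound device is a V100 GPU with NVLink2 or an IBM
 * NPU2 NVLink bridge; a failure here is typically only warned about by the
 * caller rather than failing device open.
 */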