// SPDX-License-Identifier: GPL-2.0-only
/*
 * Intel MIC Platform Software Stack (MPSS)
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * Intel SCIF driver.
 */
#include "scif_main.h"

/*
 * struct scif_vma_info - Information about a remote memory mapping
 *			  created via scif_mmap(..)
 * @vma: VM area struct
 * @list: link to list of active vmas
 */
struct scif_vma_info {
	struct vm_area_struct *vma;
	struct list_head list;
};
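
/*
 * scif_recv_munmap - Handle a SCIF_MUNMAP message from the peer node.
 *
 * The peer has unmapped a window it had mmap'ed from this endpoint's
 * registered list, so drop the references it held and, once the window
 * reference count reaches zero, hand the window to the RMA cleanup thread.
 */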
void scif_recv_munmap(struct scif_dev *scifdev, struct scifmsg *msg)
{
	struct scif_rma_req req;
	struct scif_window *window = NULL;
	struct scif_window *recv_window =
		(struct scif_window *)msg->payload[0];
	struct scif_endpt *ep;

	ep = (struct scif_endpt *)recv_window->ep;
	req.out_window = &window;
	req.offset = recv_window->offset;
	req.prot = recv_window->prot;
	req.nr_bytes = recv_window->nr_pages << PAGE_SHIFT;
	req.type = SCIF_WINDOW_FULL;
	req.head = &ep->rma_info.reg_list;
	msg->payload[0] = ep->remote_ep;

	mutex_lock(&ep->rma_info.rma_lock);
	/* Does a valid window exist? */
	if (scif_query_window(&req)) {
		dev_err(&scifdev->sdev->dev,
			"%s %d -ENXIO\n", __func__, __LINE__);
		msg->uop = SCIF_UNREGISTER_ACK;
		goto error;
	}

	scif_put_window(window, window->nr_pages);

	if (!window->ref_count) {
		atomic_inc(&ep->rma_info.tw_refcount);
		ep->rma_info.async_list_del = 1;
		list_del_init(&window->list);
		scif_free_window_offset(ep, window, window->offset);
	}
error:
	mutex_unlock(&ep->rma_info.rma_lock);
	if (window && !window->ref_count)
		scif_queue_for_cleanup(window, &scif_info.rma);
}

/*
 * Remove valid remote memory mappings created via scif_mmap(..) from the
 * process address space since the remote node is lost
 */
static void __scif_zap_mmaps(struct scif_endpt *ep)
{
	struct list_head *item;
	struct scif_vma_info *info;
	struct vm_area_struct *vma;
	unsigned long size;

	spin_lock(&ep->lock);
	list_for_each(item, &ep->rma_info.vma_list) {
		info = list_entry(item, struct scif_vma_info, list);
		vma = info->vma;
		size = vma->vm_end - vma->vm_start;
		zap_vma_ptes(vma, vma->vm_start, size);
		dev_dbg(scif_info.mdev.this_device,
			"%s ep %p zap vma %p size 0x%lx\n",
			__func__, ep, info->vma, size);
	}
	spin_unlock(&ep->lock);
}

/*
 * Traverse the list of endpoints for a particular remote node and
 * zap valid remote memory mappings since the remote node is lost
 */
static void _scif_zap_mmaps(int node, struct list_head *head)
{
	struct scif_endpt *ep;
	struct list_head *item;

	mutex_lock(&scif_info.connlock);
	list_for_each(item, head) {
		ep = list_entry(item, struct scif_endpt, list);
		if (ep->remote_dev->node == node)
			__scif_zap_mmaps(ep);
	}
	mutex_unlock(&scif_info.connlock);
}

/*
 * Wrapper for removing remote memory mappings for a particular node. This API
 * is called by peer nodes as part of handling a lost node.
 */
void scif_zap_mmaps(int node)
{
	_scif_zap_mmaps(node, &scif_info.connected);
	_scif_zap_mmaps(node, &scif_info.disconnected);
}

/*
 * This API is only called while handling a lost node:
 * a) Remote node is dead.
 * b) Remote memory mappings have been zapped
 * So we can traverse the remote_reg_list without any locks. Since
 * the window has not yet been unregistered we can drop the ref count
 * and queue it to the cleanup thread.
 */
static void __scif_cleanup_rma_for_zombies(struct scif_endpt *ep)
{
	struct list_head *pos, *tmp;
	struct scif_window *window;

	list_for_each_safe(pos, tmp, &ep->rma_info.remote_reg_list) {
		window = list_entry(pos, struct scif_window, list);
		if (window->ref_count)
			scif_put_window(window, window->nr_pages);
		else
			dev_err(scif_info.mdev.this_device,
				"%s %d unexpected\n",
				__func__, __LINE__);
		if (!window->ref_count) {
			atomic_inc(&ep->rma_info.tw_refcount);
			list_del_init(&window->list);
			scif_queue_for_cleanup(window, &scif_info.rma);
		}
	}
}

/* Cleanup remote registration lists for zombie endpoints */
void scif_cleanup_rma_for_zombies(int node)
{
	struct scif_endpt *ep;
	struct list_head *item;

	mutex_lock(&scif_info.eplock);
	list_for_each(item, &scif_info.zombie) {
		ep = list_entry(item, struct scif_endpt, list);
		if (ep->remote_dev && ep->remote_dev->node == node)
			__scif_cleanup_rma_for_zombies(ep);
	}
	mutex_unlock(&scif_info.eplock);
	flush_work(&scif_info.misc_work);
}

/* Insert the VMA into the per endpoint VMA list */
static int scif_insert_vma(struct scif_endpt *ep, struct vm_area_struct *vma)
{
	struct scif_vma_info *info;
	int err = 0;

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info) {
		err = -ENOMEM;
		goto done;
	}
	info->vma = vma;
	spin_lock(&ep->lock);
	list_add_tail(&info->list, &ep->rma_info.vma_list);
	spin_unlock(&ep->lock);
done:
	return err;
}

/* Delete the VMA from the per endpoint VMA list */
static void scif_delete_vma(struct scif_endpt *ep, struct vm_area_struct *vma)
{
	struct list_head *item;
	struct scif_vma_info *info;

	spin_lock(&ep->lock);
	list_for_each(item, &ep->rma_info.vma_list) {
		info = list_entry(item, struct scif_vma_info, list);
		if (info->vma == vma) {
			list_del(&info->list);
			kfree(info);
			break;
		}
	}
	spin_unlock(&ep->lock);
}
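
/*
 * Translate a DMA address within a remote window into the physical address
 * that can be inserted into the local page tables; card relative DMA
 * addresses need the peer's aperture base added in.
 */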
static phys_addr_t scif_get_phys(phys_addr_t phys, struct scif_endpt *ep)
{
	struct scif_dev *scifdev = (struct scif_dev *)ep->remote_dev;
	struct scif_hw_dev *sdev = scifdev->sdev;
	phys_addr_t out_phys, apt_base = 0;

	/*
	 * If the DMA address is card relative then we need to add the
	 * aperture base for mmap to work correctly
	 */
	if (!scifdev_self(scifdev) && sdev->aper && sdev->card_rel_da)
		apt_base = sdev->aper->pa;
	out_phys = apt_base + phys;
	return out_phys;
}
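
/*
 * scif_get_pages - Return the physical (and, on the management node, kernel
 * virtual) addresses backing a page aligned range of a remote registered
 * window. A reference is taken on the window; the caller must release it
 * via scif_put_pages().
 */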
int scif_get_pages(scif_epd_t epd, off_t offset, size_t len,
		   struct scif_range **pages)
{
	struct scif_endpt *ep = (struct scif_endpt *)epd;
	struct scif_rma_req req;
	struct scif_window *window = NULL;
	int nr_pages, err, i;

	dev_dbg(scif_info.mdev.this_device,
		"SCIFAPI get_pinned_pages: ep %p offset 0x%lx len 0x%lx\n",
		ep, offset, len);
	err = scif_verify_epd(ep);
	if (err)
		return err;

	if (!len || (offset < 0) ||
	    (offset + len < offset) ||
	    (ALIGN(offset, PAGE_SIZE) != offset) ||
	    (ALIGN(len, PAGE_SIZE) != len))
		return -EINVAL;

	nr_pages = len >> PAGE_SHIFT;

	req.out_window = &window;
	req.offset = offset;
	req.nr_bytes = len;
	req.prot = 0;
	req.type = SCIF_WINDOW_SINGLE;
	req.head = &ep->rma_info.remote_reg_list;

	mutex_lock(&ep->rma_info.rma_lock);
	/* Does a valid window exist? */
	err = scif_query_window(&req);
	if (err) {
		dev_err(&ep->remote_dev->sdev->dev,
			"%s %d err %d\n", __func__, __LINE__, err);
		goto error;
	}

	/* Allocate scif_range */
	*pages = kzalloc(sizeof(**pages), GFP_KERNEL);
	if (!*pages) {
		err = -ENOMEM;
		goto error;
	}

	/* Allocate phys addr array */
	(*pages)->phys_addr = scif_zalloc(nr_pages * sizeof(dma_addr_t));
	if (!((*pages)->phys_addr)) {
		err = -ENOMEM;
		goto error;
	}

	if (scif_is_mgmt_node() && !scifdev_self(ep->remote_dev)) {
		/* Allocate virtual address array */
		((*pages)->va = scif_zalloc(nr_pages * sizeof(void *)));
		if (!(*pages)->va) {
			err = -ENOMEM;
			goto error;
		}
	}
	/* Populate the values */
	(*pages)->cookie = window;
	(*pages)->nr_pages = nr_pages;
	(*pages)->prot_flags = window->prot;

	for (i = 0; i < nr_pages; i++) {
		(*pages)->phys_addr[i] =
			__scif_off_to_dma_addr(window, offset +
					       (i << PAGE_SHIFT));
		(*pages)->phys_addr[i] = scif_get_phys((*pages)->phys_addr[i],
						       ep);
		if (scif_is_mgmt_node() && !scifdev_self(ep->remote_dev))
			(*pages)->va[i] =
				ep->remote_dev->sdev->aper->va +
				(*pages)->phys_addr[i] -
				ep->remote_dev->sdev->aper->pa;
	}

	scif_get_window(window, nr_pages);
error:
	mutex_unlock(&ep->rma_info.rma_lock);
	if (err) {
		if (*pages) {
			scif_free((*pages)->phys_addr,
				  nr_pages * sizeof(dma_addr_t));
			scif_free((*pages)->va,
				  nr_pages * sizeof(void *));
			kfree(*pages);
			*pages = NULL;
		}
		dev_err(&ep->remote_dev->sdev->dev,
			"%s %d err %d\n", __func__, __LINE__, err);
	}
	return err;
}
EXPORT_SYMBOL_GPL(scif_get_pages);
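
/*
 * scif_put_pages - Release the window references taken by scif_get_pages()
 * and free the scif_range. If the window reference count drops to zero the
 * peer is notified via SCIF_MUNMAP and the remote window is destroyed.
 */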
int scif_put_pages(struct scif_range *pages)
{
	struct scif_endpt *ep;
	struct scif_window *window;
	struct scifmsg msg;

	if (!pages || !pages->cookie)
		return -EINVAL;

	window = pages->cookie;

	if (!window || window->magic != SCIFEP_MAGIC)
		return -EINVAL;

	ep = (struct scif_endpt *)window->ep;
	/*
	 * If the state is SCIFEP_CONNECTED or SCIFEP_DISCONNECTED then the
	 * callee should be allowed to release references to the pages,
	 * else the endpoint was not connected in the first place,
	 * hence the ENOTCONN.
	 */
	if (ep->state != SCIFEP_CONNECTED && ep->state != SCIFEP_DISCONNECTED)
		return -ENOTCONN;

	mutex_lock(&ep->rma_info.rma_lock);

	scif_put_window(window, pages->nr_pages);

	/* Initiate window destruction if ref count is zero */
	if (!window->ref_count) {
		list_del(&window->list);
		mutex_unlock(&ep->rma_info.rma_lock);
		scif_drain_dma_intr(ep->remote_dev->sdev,
				    ep->rma_info.dma_chan);
		/* Inform the peer about this window being destroyed. */
		msg.uop = SCIF_MUNMAP;
		msg.src = ep->port;
		msg.payload[0] = window->peer_window;
		/* No error handling for notification messages */
		scif_nodeqp_send(ep->remote_dev, &msg);
		/* Destroy this window from the peer's registered AS */
		scif_destroy_remote_window(window);
	} else {
		mutex_unlock(&ep->rma_info.rma_lock);
	}

	scif_free(pages->phys_addr, pages->nr_pages * sizeof(dma_addr_t));
	scif_free(pages->va, pages->nr_pages * sizeof(void *));
	kfree(pages);
	return 0;
}
EXPORT_SYMBOL_GPL(scif_put_pages);
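
/*
 * Illustrative usage sketch (not part of the driver): a kernel user of the
 * SCIF kernel API would typically pair scif_get_pages() and scif_put_pages()
 * roughly as below. The endpoint epd and the offset/length values are
 * assumptions for the example only.
 *
 *	struct scif_range *pages;
 *	int err;
 *
 *	err = scif_get_pages(epd, offset, 4 * PAGE_SIZE, &pages);
 *	if (!err) {
 *		// pages->phys_addr[0..nr_pages-1] may now be remapped or
 *		// handed to a DMA engine by the caller.
 *		err = scif_put_pages(pages);
 *	}
 */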

/*
 * scif_rma_list_mmap:
 *
 * Traverse the remote registration list starting from start_window:
 * 1) Create VtoP mappings via remap_pfn_range(..)
 * 2) Once step 1) and 2) complete successfully then traverse the range of
 *    windows again and bump the reference count.
 * RMA lock must be held.
 */
static int scif_rma_list_mmap(struct scif_window *start_window, s64 offset,
			      int nr_pages, struct vm_area_struct *vma)
{
	s64 end_offset, loop_offset = offset;
	struct scif_window *window = start_window;
	int loop_nr_pages, nr_pages_left = nr_pages;
	struct scif_endpt *ep = (struct scif_endpt *)start_window->ep;
	struct list_head *head = &ep->rma_info.remote_reg_list;
	int i, err = 0;
	dma_addr_t phys_addr;
	struct scif_window_iter src_win_iter;
	size_t contig_bytes = 0;

	list_for_each_entry_from(window, head, list) {
		end_offset = window->offset +
			(window->nr_pages << PAGE_SHIFT);
		loop_nr_pages = min_t(int,
				      (end_offset - loop_offset) >> PAGE_SHIFT,
				      nr_pages_left);
		scif_init_window_iter(window, &src_win_iter);
		for (i = 0; i < loop_nr_pages; i++) {
			phys_addr = scif_off_to_dma_addr(window, loop_offset,
							 &contig_bytes,
							 &src_win_iter);
			phys_addr = scif_get_phys(phys_addr, ep);
			err = remap_pfn_range(vma,
					      vma->vm_start +
					      loop_offset - offset,
					      phys_addr >> PAGE_SHIFT,
					      PAGE_SIZE,
					      vma->vm_page_prot);
			if (err)
				goto error;
			loop_offset += PAGE_SIZE;
		}
		nr_pages_left -= loop_nr_pages;
		if (!nr_pages_left)
			break;
	}
	/*
	 * No more failures expected. Bump up the ref count for all
	 * the windows. Another traversal from start_window required
	 * for handling errors encountered across windows during
	 * remap_pfn_range(..).
	 */
	loop_offset = offset;
	nr_pages_left = nr_pages;
	window = start_window;
	head = &ep->rma_info.remote_reg_list;
	list_for_each_entry_from(window, head, list) {
		end_offset = window->offset +
			(window->nr_pages << PAGE_SHIFT);
		loop_nr_pages = min_t(int,
				      (end_offset - loop_offset) >> PAGE_SHIFT,
				      nr_pages_left);
		scif_get_window(window, loop_nr_pages);
		nr_pages_left -= loop_nr_pages;
		loop_offset += (loop_nr_pages << PAGE_SHIFT);
		if (!nr_pages_left)
			break;
	}
error:
	if (err)
		dev_err(scif_info.mdev.this_device,
			"%s %d err %d\n", __func__, __LINE__, err);
	return err;
}

/*
 * scif_rma_list_munmap:
 *
 * Traverse the remote registration list starting from window:
 * 1) Decrement ref count.
 * 2) If the ref count drops to zero then send a SCIF_MUNMAP message to peer.
 * RMA lock must be held.
 */
static void scif_rma_list_munmap(struct scif_window *start_window,
				 s64 offset, int nr_pages)
{
	struct scifmsg msg;
	s64 loop_offset = offset, end_offset;
	int loop_nr_pages, nr_pages_left = nr_pages;
	struct scif_endpt *ep = (struct scif_endpt *)start_window->ep;
	struct list_head *head = &ep->rma_info.remote_reg_list;
	struct scif_window *window = start_window, *_window;

	msg.uop = SCIF_MUNMAP;
	msg.src = ep->port;
	loop_offset = offset;
	nr_pages_left = nr_pages;
	list_for_each_entry_safe_from(window, _window, head, list) {
		end_offset = window->offset +
			(window->nr_pages << PAGE_SHIFT);
		loop_nr_pages = min_t(int,
				      (end_offset - loop_offset) >> PAGE_SHIFT,
				      nr_pages_left);
		scif_put_window(window, loop_nr_pages);
		if (!window->ref_count) {
			struct scif_dev *rdev = ep->remote_dev;

			scif_drain_dma_intr(rdev->sdev,
					    ep->rma_info.dma_chan);
			/* Inform the peer about this munmap */
			msg.payload[0] = window->peer_window;
			/* No error handling for Notification messages. */
			scif_nodeqp_send(ep->remote_dev, &msg);
			list_del(&window->list);
			/* Destroy this window from the peer's registered AS */
			scif_destroy_remote_window(window);
		}
		nr_pages_left -= loop_nr_pages;
		loop_offset += (loop_nr_pages << PAGE_SHIFT);
		if (!nr_pages_left)
			break;
	}
}

/*
 * The private data field of each VMA used to mmap a remote window
 * points to an instance of struct vma_pvt
 */
struct vma_pvt {
	struct scif_endpt *ep;	/* End point for remote window */
	s64 offset;		/* offset within remote window */
	bool valid_offset;	/* offset is valid only if the original
				 * mmap request was for a single page
				 * else the offset within the vma is
				 * the correct offset
				 */
	struct kref ref;
};
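
/* kref release callback: frees the vma_pvt once the last VMA reference drops */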
static void vma_pvt_release(struct kref *ref)
{
	struct vma_pvt *vmapvt = container_of(ref, struct vma_pvt, ref);

	kfree(vmapvt);
}

/**
 * scif_vma_open - VMA open driver callback
 * @vma: VMM memory area.
 * The open method is called by the kernel to allow the subsystem implementing
 * the VMA to initialize the area. This method is invoked any time a new
 * reference to the VMA is made (when a process forks, for example).
 * The one exception happens when the VMA is first created by mmap;
 * in this case, the driver's mmap method is called instead.
 * This function is also invoked when an existing VMA is split by the kernel
 * due to a call to munmap on a subset of the VMA resulting in two VMAs.
 * The kernel invokes this function only on one of the two VMAs.
 */
static void scif_vma_open(struct vm_area_struct *vma)
{
	struct vma_pvt *vmapvt = vma->vm_private_data;

	dev_dbg(scif_info.mdev.this_device,
		"SCIFAPI vma open: vma_start 0x%lx vma_end 0x%lx\n",
		vma->vm_start, vma->vm_end);
	scif_insert_vma(vmapvt->ep, vma);
	kref_get(&vmapvt->ref);
}

/**
 * scif_munmap - VMA close driver callback.
 * @vma: VMM memory area.
 * When an area is destroyed, the kernel calls its close operation.
 * Note that there's no usage count associated with VMA's; the area
 * is opened and closed exactly once by each process that uses it.
 */
static void scif_munmap(struct vm_area_struct *vma)
{
	struct scif_endpt *ep;
	struct vma_pvt *vmapvt = vma->vm_private_data;
	int nr_pages = vma_pages(vma);
	s64 offset;
	struct scif_rma_req req;
	struct scif_window *window = NULL;
	int err;

	might_sleep();
	dev_dbg(scif_info.mdev.this_device,
		"SCIFAPI munmap: vma_start 0x%lx vma_end 0x%lx\n",
		vma->vm_start, vma->vm_end);
	ep = vmapvt->ep;
	offset = vmapvt->valid_offset ? vmapvt->offset :
		(vma->vm_pgoff) << PAGE_SHIFT;
	dev_dbg(scif_info.mdev.this_device,
		"SCIFAPI munmap: ep %p nr_pages 0x%x offset 0x%llx\n",
		ep, nr_pages, offset);
	req.out_window = &window;
	req.offset = offset;
	req.nr_bytes = vma->vm_end - vma->vm_start;
	req.prot = vma->vm_flags & (VM_READ | VM_WRITE);
	req.type = SCIF_WINDOW_PARTIAL;
	req.head = &ep->rma_info.remote_reg_list;

	mutex_lock(&ep->rma_info.rma_lock);

	err = scif_query_window(&req);
	if (err)
		dev_err(scif_info.mdev.this_device,
			"%s %d err %d\n", __func__, __LINE__, err);
	else
		scif_rma_list_munmap(window, offset, nr_pages);

	mutex_unlock(&ep->rma_info.rma_lock);
	/*
	 * The kernel probably zeroes these out but we still want
	 * to clean up our own mess just in case.
	 */
	vma->vm_ops = NULL;
	vma->vm_private_data = NULL;
	kref_put(&vmapvt->ref, vma_pvt_release);
	scif_delete_vma(ep, vma);
}
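
/*
 * scif_vm_ops ties the VMA lifetime to the remote window references:
 * scif_vma_open() adds a reference when a VMA is duplicated or split and
 * scif_munmap() drops the window references when a VMA goes away.
 */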
static const struct vm_operations_struct scif_vm_ops = {
	.open = scif_vma_open,
	.close = scif_munmap,
};

/**
 * scif_mmap - Map pages in virtual address space to a remote window.
 * @vma: VMM memory area.
 * @epd: endpoint descriptor
 *
 * Return: Upon successful completion, scif_mmap() returns zero
 * else an apt error is returned as documented in scif.h
 */
int scif_mmap(struct vm_area_struct *vma, scif_epd_t epd)
{
	struct scif_rma_req req;
	struct scif_window *window = NULL;
	struct scif_endpt *ep = (struct scif_endpt *)epd;
	s64 start_offset = vma->vm_pgoff << PAGE_SHIFT;
	int nr_pages = vma_pages(vma);
	int err;
	struct vma_pvt *vmapvt;

	dev_dbg(scif_info.mdev.this_device,
		"SCIFAPI mmap: ep %p start_offset 0x%llx nr_pages 0x%x\n",
		ep, start_offset, nr_pages);
	err = scif_verify_epd(ep);
	if (err)
		return err;

	might_sleep();

	err = scif_insert_vma(ep, vma);
	if (err)
		return err;

	vmapvt = kzalloc(sizeof(*vmapvt), GFP_KERNEL);
	if (!vmapvt) {
		scif_delete_vma(ep, vma);
		return -ENOMEM;
	}

	vmapvt->ep = ep;
	kref_init(&vmapvt->ref);

	req.out_window = &window;
	req.offset = start_offset;
	req.nr_bytes = vma->vm_end - vma->vm_start;
	req.prot = vma->vm_flags & (VM_READ | VM_WRITE);
	req.type = SCIF_WINDOW_PARTIAL;
	req.head = &ep->rma_info.remote_reg_list;

	mutex_lock(&ep->rma_info.rma_lock);
	/* Does a valid window exist? */
	err = scif_query_window(&req);
	if (err) {
		dev_err(&ep->remote_dev->sdev->dev,
			"%s %d err %d\n", __func__, __LINE__, err);
		goto error_unlock;
	}

	/* Default prot for loopback */
	if (!scifdev_self(ep->remote_dev))
		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);

	/*
	 * VM_DONTCOPY - Do not copy this vma on fork
	 * VM_DONTEXPAND - Cannot expand with mremap()
	 * VM_RESERVED - Count as reserved_vm like IO
	 * VM_PFNMAP - Page-ranges managed without "struct page"
	 * VM_IO - Memory mapped I/O or similar
	 *
	 * We do not want to copy this VMA automatically on a fork(),
	 * expand this VMA due to mremap() or swap out these pages since
	 * the VMA is actually backed by physical pages in the remote
	 * node's physical memory and not via a struct page.
	 */
	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;

	if (!scifdev_self(ep->remote_dev))
		vma->vm_flags |= VM_IO | VM_PFNMAP;

	/* Map this range of windows */
	err = scif_rma_list_mmap(window, start_offset, nr_pages, vma);
	if (err) {
		dev_err(&ep->remote_dev->sdev->dev,
			"%s %d err %d\n", __func__, __LINE__, err);
		goto error_unlock;
	}
	/* Set up the driver call back */
	vma->vm_ops = &scif_vm_ops;
	vma->vm_private_data = vmapvt;
error_unlock:
	mutex_unlock(&ep->rma_info.rma_lock);
	if (err) {
		kfree(vmapvt);
		dev_err(&ep->remote_dev->sdev->dev,
			"%s %d err %d\n", __func__, __LINE__, err);
		scif_delete_vma(ep, vma);
	}
	return err;
}
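
/*
 * Illustrative sketch (not part of this file): the SCIF character device's
 * f_op->mmap handler is expected to hand the VMA to scif_mmap() along with
 * the endpoint stored in the file's private data, roughly as below. The
 * names f and priv are assumptions for the example only.
 *
 *	static int scif_fdmmap_example(struct file *f, struct vm_area_struct *vma)
 *	{
 *		struct scif_endpt *priv = f->private_data;
 *
 *		return scif_mmap(vma, priv);
 *	}
 */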