/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/ib_umem.h>
#include <rdma/ib_umem_odp.h>
#include <linux/kernel.h>

#include "mlx5_ib.h"
#include "cmd.h"
#include "qp.h"

#include <linux/mlx5/eq.h>

/* Contains the details of a pagefault. */
struct mlx5_pagefault {
	u32			bytes_committed;
	u32			token;
	u8			event_subtype;
	u8			type;
	union {
		/* Initiator or send message responder pagefault details. */
		struct {
			/* Received packet size, only valid for responders. */
			u32	packet_size;
			/*
			 * Number of resource holding WQE, depends on type.
			 */
			u32	wq_num;
			/*
			 * WQE index. Refers to either the send queue or
			 * receive queue, according to event_subtype.
			 */
			u16	wqe_index;
		} wqe;
		/* RDMA responder pagefault details */
		struct {
			u32	r_key;
			/*
			 * Received packet size, minimal size page fault
			 * resolution required for forward progress.
			 */
			u32	packet_size;
			u32	rdma_op_len;
			u64	rdma_va;
		} rdma;
	};

	struct mlx5_ib_pf_eq	*eq;
	struct work_struct	work;
};

#define MAX_PREFETCH_LEN (4*1024*1024U)

/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000

#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))

#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT

static u64 mlx5_imr_ksm_entries;
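
/*
 * Worked example of the sizes above: assuming a 4KiB PAGE_SIZE
 * (PAGE_SHIFT == 12), MLX5_IMR_MTT_BITS is 18, so each implicit child MTT
 * mkey covers MLX5_IMR_MTT_SIZE = 1GiB of virtual address space using
 * MLX5_IMR_MTT_ENTRIES = 262144 page-sized entries. The parent KSM of an
 * implicit ODP MR then needs one descriptor per 1GiB chunk;
 * mlx5_imr_ksm_entries is sized from TASK_SIZE in mlx5_ib_odp_init().
 */
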
static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
			 struct mlx5_ib_mr *imr, int flags)
{
	struct mlx5_klm *end = pklm + nentries;

	if (flags & MLX5_IB_UPD_XLT_ZAP) {
		for (; pklm != end; pklm++, idx++) {
			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
			pklm->key = cpu_to_be32(mr_to_mdev(imr)->null_mkey);
			pklm->va = 0;
		}
		return;
	}

	/*
	 * The locking here is pretty subtle. Ideally the implicit_children
	 * xarray would be protected by the umem_mutex, however that is not
	 * possible. Instead this uses a weaker update-then-lock pattern:
	 *
	 *    xa_store()
	 *    mutex_lock(umem_mutex)
	 *     mlx5_ib_update_xlt()
	 *    mutex_unlock(umem_mutex)
	 *    destroy lkey
	 *
	 * i.e. any change to the xarray must be followed by the locked
	 * update_xlt before destroying.
	 *
	 * The umem_mutex provides the acquire/release semantic needed to make
	 * the xa_store() visible to a racing thread. While SRCU is not
	 * technically required, using it gives consistent use of the SRCU
	 * locking around the xarray.
	 */
	lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);
	lockdep_assert_held(&mr_to_mdev(imr)->odp_srcu);

	for (; pklm != end; pklm++, idx++) {
		struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);

		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
		if (mtt) {
			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
			pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
		} else {
			pklm->key = cpu_to_be32(mr_to_mdev(imr)->null_mkey);
			pklm->va = 0;
		}
	}
}

static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
{
	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;

	if (umem_dma & ODP_READ_ALLOWED_BIT)
		mtt_entry |= MLX5_IB_MTT_READ;
	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
		mtt_entry |= MLX5_IB_MTT_WRITE;

	return mtt_entry;
}
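
/*
 * For example, a dma_list entry that has both ODP_READ_ALLOWED_BIT and
 * ODP_WRITE_ALLOWED_BIT set is translated into an MTT entry carrying the
 * masked DMA address plus MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE, which is the
 * form populate_mtt() below writes into the XLT.
 */
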
static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
			 struct mlx5_ib_mr *mr, int flags)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	dma_addr_t pa;
	size_t i;

	if (flags & MLX5_IB_UPD_XLT_ZAP)
		return;

	for (i = 0; i < nentries; i++) {
		pa = odp->dma_list[idx + i];
		pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
	}
}

void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
			   struct mlx5_ib_mr *mr, int flags)
{
	if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
		populate_klm(xlt, idx, nentries, mr, flags);
	} else {
		populate_mtt(xlt, idx, nentries, mr, flags);
	}
}

static void dma_fence_odp_mr(struct mlx5_ib_mr *mr)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);

	/* Ensure mlx5_ib_invalidate_range() will not touch the MR any more */
	mutex_lock(&odp->umem_mutex);
	if (odp->npages) {
		mlx5_mr_cache_invalidate(mr);
		ib_umem_odp_unmap_dma_pages(odp, ib_umem_start(odp),
					    ib_umem_end(odp));
		WARN_ON(odp->npages);
	}
	odp->private = NULL;
	mutex_unlock(&odp->umem_mutex);

	if (!mr->cache_ent) {
		mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev, &mr->mmkey);
		WARN_ON(mr->descs);
	}
}

/*
 * This must be called after the mr has been removed from implicit_children
 * and the SRCU synchronized. NOTE: The MR does not necessarily have to be
 * empty here, parallel page faults could have raced with the free process and
 * added pages to it.
 */
static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
{
	struct mlx5_ib_mr *imr = mr->parent;
	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
	int srcu_key;

	/* implicit_child_mr's are not allowed to have deferred work */
	WARN_ON(atomic_read(&mr->num_deferred_work));

	if (need_imr_xlt) {
		srcu_key = srcu_read_lock(&mr_to_mdev(mr)->odp_srcu);
		mutex_lock(&odp_imr->umem_mutex);
		mlx5_ib_update_xlt(mr->parent, idx, 1, 0,
				   MLX5_IB_UPD_XLT_INDIRECT |
				   MLX5_IB_UPD_XLT_ATOMIC);
		mutex_unlock(&odp_imr->umem_mutex);
		srcu_read_unlock(&mr_to_mdev(mr)->odp_srcu, srcu_key);
	}

	dma_fence_odp_mr(mr);

	mr->parent = NULL;
	mlx5_mr_cache_free(mr_to_mdev(mr), mr);
	ib_umem_odp_release(odp);
	if (atomic_dec_and_test(&imr->num_deferred_work))
		wake_up(&imr->q_deferred_work);
}

static void free_implicit_child_mr_work(struct work_struct *work)
{
	struct mlx5_ib_mr *mr =
		container_of(work, struct mlx5_ib_mr, odp_destroy.work);

	free_implicit_child_mr(mr, true);
}

static void free_implicit_child_mr_rcu(struct rcu_head *head)
{
	struct mlx5_ib_mr *mr =
		container_of(head, struct mlx5_ib_mr, odp_destroy.rcu);

	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
	queue_work(system_unbound_wq, &mr->odp_destroy.work);
}

static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
	struct mlx5_ib_mr *imr = mr->parent;

	xa_lock(&imr->implicit_children);
	/*
	 * This can race with mlx5_ib_free_implicit_mr(), the first one to
	 * reach the xa lock wins the race and destroys the MR.
	 */
	if (__xa_cmpxchg(&imr->implicit_children, idx, mr, NULL, GFP_ATOMIC) !=
	    mr)
		goto out_unlock;

	atomic_inc(&imr->num_deferred_work);
	call_srcu(&mr_to_mdev(mr)->odp_srcu, &mr->odp_destroy.rcu,
		  free_implicit_child_mr_rcu);

out_unlock:
	xa_unlock(&imr->implicit_children);
}

static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
				     const struct mmu_notifier_range *range,
				     unsigned long cur_seq)
{
	struct ib_umem_odp *umem_odp =
		container_of(mni, struct ib_umem_odp, notifier);
	struct mlx5_ib_mr *mr;
	const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT /
				    sizeof(struct mlx5_mtt)) - 1;
	u64 idx = 0, blk_start_idx = 0;
	u64 invalidations = 0;
	unsigned long start;
	unsigned long end;
	int in_block = 0;
	u64 addr;

	if (!mmu_notifier_range_blockable(range))
		return false;

	mutex_lock(&umem_odp->umem_mutex);
	mmu_interval_set_seq(mni, cur_seq);
	/*
	 * If npages is zero then umem_odp->private may not be set up yet. This
	 * does not complete until after the first page is mapped for DMA.
	 */
	if (!umem_odp->npages)
		goto out;
	mr = umem_odp->private;

	start = max_t(u64, ib_umem_start(umem_odp), range->start);
	end = min_t(u64, ib_umem_end(umem_odp), range->end);

	/*
	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
	 * while we are doing the invalidation, no page fault will attempt to
	 * overwrite the same MTTs. Concurrent invalidations might race us,
	 * but they will write 0s as well, so no difference in the end result.
	 */
	for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
		/*
		 * Strive to write the MTTs in chunks, but avoid overwriting
		 * non-existing MTTs. The heuristic here can be improved to
		 * estimate the cost of another UMR vs. the cost of bigger
		 * UMR.
		 */
		if (umem_odp->dma_list[idx] &
		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
			}

			/* Count page invalidations */
			invalidations += idx - blk_start_idx + 1;
		} else {
			u64 umr_offset = idx & umr_block_mask;

			if (in_block && umr_offset == 0) {
				mlx5_ib_update_xlt(mr, blk_start_idx,
						   idx - blk_start_idx, 0,
						   MLX5_IB_UPD_XLT_ZAP |
						   MLX5_IB_UPD_XLT_ATOMIC);
				in_block = 0;
			}
		}
	}
	if (in_block)
		mlx5_ib_update_xlt(mr, blk_start_idx,
				   idx - blk_start_idx + 1, 0,
				   MLX5_IB_UPD_XLT_ZAP |
				   MLX5_IB_UPD_XLT_ATOMIC);

	mlx5_update_odp_stats(mr, invalidations, invalidations);

	/*
	 * We are now sure that the device will not access the
	 * memory. We can safely unmap it, and mark it as dirty if
	 * needed.
	 */

	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);

	if (unlikely(!umem_odp->npages && mr->parent))
		destroy_unused_implicit_child_mr(mr);
out:
	mutex_unlock(&umem_odp->umem_mutex);
	return true;
}

const struct mmu_interval_notifier_ops mlx5_mn_ops = {
	.invalidate = mlx5_ib_invalidate_range,
};

void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
	struct ib_odp_caps *caps = &dev->odp_caps;

	memset(caps, 0, sizeof(*caps));

	if (!MLX5_CAP_GEN(dev->mdev, pg) ||
	    !mlx5_ib_can_load_pas_with_umr(dev, 0))
		return;

	caps->general_caps = IB_ODP_SUPPORT;

	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		dev->odp_max_size = U64_MAX;
	else
		dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.send))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, ud_odp_caps.srq_receive))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.send))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.write))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.read))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.atomic))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_ODP(dev->mdev, rc_odp_caps.srq_receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.send))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.receive))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.write))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.read))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.atomic))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_ODP(dev->mdev, xrc_odp_caps.srq_receive))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
	    !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
}

static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
				      struct mlx5_pagefault *pfault,
				      int error)
{
	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
		     pfault->wqe.wq_num : pfault->token;
	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {};
	int err;

	MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);
	MLX5_SET(page_fault_resume_in, in, page_fault_type, pfault->type);
	MLX5_SET(page_fault_resume_in, in, token, pfault->token);
	MLX5_SET(page_fault_resume_in, in, wq_number, wq_num);
	MLX5_SET(page_fault_resume_in, in, error, !!error);

	err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in);
	if (err)
		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
			    wq_num, err);
}

static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
						unsigned long idx)
{
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;
	struct mlx5_ib_mr *ret;
	int err;

	odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
				      idx * MLX5_IMR_MTT_SIZE,
				      MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
	if (IS_ERR(odp))
		return ERR_CAST(odp);

	ret = mr = mlx5_mr_cache_alloc(
		mr_to_mdev(imr), MLX5_IMR_MTT_CACHE_ENTRY, imr->access_flags);
	if (IS_ERR(mr))
		goto out_umem;

	mr->ibmr.pd = imr->ibmr.pd;
	mr->ibmr.device = &mr_to_mdev(imr)->ib_dev;
	mr->umem = &odp->umem;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->mmkey.iova = idx * MLX5_IMR_MTT_SIZE;
	mr->parent = imr;
	odp->private = mr;

	err = mlx5_ib_update_xlt(mr, 0,
				 MLX5_IMR_MTT_ENTRIES,
				 PAGE_SHIFT,
				 MLX5_IB_UPD_XLT_ZAP |
				 MLX5_IB_UPD_XLT_ENABLE);
	if (err) {
		ret = ERR_PTR(err);
		goto out_mr;
	}

	/*
	 * Once the store to either xarray completes any error unwind has to
	 * use synchronize_srcu(). Avoid this with xa_reserve()
	 */
	ret = xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
			 GFP_KERNEL);
	if (unlikely(ret)) {
		if (xa_is_err(ret)) {
			ret = ERR_PTR(xa_err(ret));
			goto out_mr;
		}
		/*
		 * Another thread beat us to creating the child mr, use
		 * theirs.
		 */
		goto out_mr;
	}

	mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
	return mr;

out_mr:
	mlx5_mr_cache_free(mr_to_mdev(imr), mr);
out_umem:
	ib_umem_odp_release(odp);
	return ret;
}

struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
					     struct ib_udata *udata,
					     int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
	struct ib_umem_odp *umem_odp;
	struct mlx5_ib_mr *imr;
	int err;

	if (!mlx5_ib_can_load_pas_with_umr(dev,
					   MLX5_IMR_MTT_ENTRIES * PAGE_SIZE))
		return ERR_PTR(-EOPNOTSUPP);

	umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
	if (IS_ERR(umem_odp))
		return ERR_CAST(umem_odp);

	imr = mlx5_mr_cache_alloc(dev, MLX5_IMR_KSM_CACHE_ENTRY, access_flags);
	if (IS_ERR(imr)) {
		err = PTR_ERR(imr);
		goto out_umem;
	}

	imr->ibmr.pd = &pd->ibpd;
	imr->mmkey.iova = 0;
	imr->umem = &umem_odp->umem;
	imr->ibmr.lkey = imr->mmkey.key;
	imr->ibmr.rkey = imr->mmkey.key;
	imr->ibmr.device = &dev->ib_dev;
	imr->umem = &umem_odp->umem;
	imr->is_odp_implicit = true;
	atomic_set(&imr->num_deferred_work, 0);
	init_waitqueue_head(&imr->q_deferred_work);
	xa_init(&imr->implicit_children);

	err = mlx5_ib_update_xlt(imr, 0,
				 mlx5_imr_ksm_entries,
				 MLX5_KSM_PAGE_SHIFT,
				 MLX5_IB_UPD_XLT_INDIRECT |
				 MLX5_IB_UPD_XLT_ZAP |
				 MLX5_IB_UPD_XLT_ENABLE);
	if (err)
		goto out_mr;

	err = xa_err(xa_store(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key),
			      &imr->mmkey, GFP_KERNEL));
	if (err)
		goto out_mr;

	mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr);
	return imr;
out_mr:
	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
	mlx5_mr_cache_free(dev, imr);
out_umem:
	ib_umem_odp_release(umem_odp);
	return ERR_PTR(err);
}

void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
{
	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
	struct mlx5_ib_dev *dev = mr_to_mdev(imr);
	struct list_head destroy_list;
	struct mlx5_ib_mr *mtt;
	struct mlx5_ib_mr *tmp;
	unsigned long idx;

	INIT_LIST_HEAD(&destroy_list);

	xa_erase(&dev->odp_mkeys, mlx5_base_mkey(imr->mmkey.key));
	/*
	 * This stops the SRCU protected page fault path from touching either
	 * the imr or any children. The page fault path can only reach the
	 * children xarray via the imr.
	 */
	synchronize_srcu(&dev->odp_srcu);

	/*
	 * All work on the prefetch list must be completed, xa_erase() prevented
	 * new work from being created.
	 */
	wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));

	/*
	 * At this point it is forbidden for any other thread to enter
	 * pagefault_mr() on this imr. It is already forbidden to call
	 * pagefault_mr() on an implicit child. Due to this additions to
	 * implicit_children are prevented.
	 */

	/*
	 * Block destroy_unused_implicit_child_mr() from incrementing
	 * num_deferred_work.
	 */
	xa_lock(&imr->implicit_children);
	xa_for_each (&imr->implicit_children, idx, mtt) {
		__xa_erase(&imr->implicit_children, idx);
		list_add(&mtt->odp_destroy.elm, &destroy_list);
	}
	xa_unlock(&imr->implicit_children);

	/*
	 * Wait for any concurrent destroy_unused_implicit_child_mr() to
	 * complete.
	 */
	wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));

	/*
	 * Fence the imr before we destroy the children. This allows us to
	 * skip updating the XLT of the imr during destroy of the child mkey
	 * the child is using.
	 */
	mlx5_mr_cache_invalidate(imr);

	list_for_each_entry_safe (mtt, tmp, &destroy_list, odp_destroy.elm)
		free_implicit_child_mr(mtt, false);

	mlx5_mr_cache_free(dev, imr);
	ib_umem_odp_release(odp_imr);
}

/**
 * mlx5_ib_fence_odp_mr - Stop all access to the ODP MR
 * @mr: to fence
 *
 * On return no parallel threads will be touching this MR and no DMA will be
 * active.
 */
void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
{
	/* Prevent new page faults and prefetch requests from succeeding */
	xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key));

	/* Wait for all running page-fault handlers to finish. */
	synchronize_srcu(&mr_to_mdev(mr)->odp_srcu);

	wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work));

	dma_fence_odp_mr(mr);
}

#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
#define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
#define MLX5_PF_FLAGS_ENABLE BIT(3)
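/*
 * How these flags are used (see mlx5_ib_advise_mr_prefetch() and
 * mlx5_ib_init_odp_mr()): a plain prefetch advice downgrades the request to
 * read-only access (DOWNGRADE), a no-fault prefetch only maps pages that are
 * already present (SNAPSHOT), and ENABLE is used on the first fault of a
 * freshly created mkey to set MLX5_IB_UPD_XLT_ENABLE.
 */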
static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
			     u64 user_va, size_t bcnt, u32 *bytes_mapped,
			     u32 flags)
{
	int page_shift, ret, np;
	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
	u64 access_mask;
	u64 start_idx;
	bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
	u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;

	if (flags & MLX5_PF_FLAGS_ENABLE)
		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;

	page_shift = odp->page_shift;
	start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
	access_mask = ODP_READ_ALLOWED_BIT;

	if (odp->umem.writable && !downgrade)
		access_mask |= ODP_WRITE_ALLOWED_BIT;

	np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
	if (np < 0)
		return np;

	/*
	 * No need to check whether the MTTs really belong to this MR, since
	 * ib_umem_odp_map_dma_and_lock already checks this.
	 */
	ret = mlx5_ib_update_xlt(mr, start_idx, np, page_shift, xlt_flags);
	mutex_unlock(&odp->umem_mutex);

	if (ret < 0) {
		if (ret != -EAGAIN)
			mlx5_ib_err(mr_to_mdev(mr),
				    "Failed to update mkey page tables\n");
		goto out;
	}

	if (bytes_mapped) {
		u32 new_mappings = (np << page_shift) -
			(user_va - round_down(user_va, 1 << page_shift));

		*bytes_mapped += min_t(u32, new_mappings, bcnt);
	}

	return np << (page_shift - PAGE_SHIFT);

out:
	return ret;
}
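
/*
 * Example of the bytes_mapped arithmetic above: with page_shift == 12,
 * np == 3 mapped pages and a user_va that starts 0x100 bytes into its first
 * page, new_mappings is 3 * 4096 - 0x100, i.e. only the bytes from user_va to
 * the end of the third page count towards *bytes_mapped (capped at bcnt).
 * The return value converts np from umem-sized pages to PAGE_SIZE units.
 */
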
static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
				 struct ib_umem_odp *odp_imr, u64 user_va,
				 size_t bcnt, u32 *bytes_mapped, u32 flags)
{
	unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
	unsigned long upd_start_idx = end_idx + 1;
	unsigned long upd_len = 0;
	unsigned long npages = 0;
	int err;
	int ret;

	if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
		     mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
		return -EFAULT;

	/* Fault each child mr that intersects with our interval. */
	while (bcnt) {
		unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
		struct ib_umem_odp *umem_odp;
		struct mlx5_ib_mr *mtt;
		u64 len;

		mtt = xa_load(&imr->implicit_children, idx);
		if (unlikely(!mtt)) {
			mtt = implicit_get_child_mr(imr, idx);
			if (IS_ERR(mtt)) {
				ret = PTR_ERR(mtt);
				goto out;
			}
			upd_start_idx = min(upd_start_idx, idx);
			upd_len = idx - upd_start_idx + 1;
		}

		umem_odp = to_ib_umem_odp(mtt->umem);
		len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) -
		      user_va;

		ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
					bytes_mapped, flags);
		if (ret < 0)
			goto out;
		user_va += len;
		bcnt -= len;
		npages += ret;
	}

	ret = npages;

	/*
	 * Any time the implicit_children are changed we must perform an
	 * update of the xlt before exiting to ensure the HW and the
	 * implicit_children remains synchronized.
	 */
out:
	if (likely(!upd_len))
		return ret;

	/*
	 * Notice this is not strictly ordered right, the KSM is updated after
	 * the implicit_children is updated, so a parallel page fault could
	 * see a MR that is not yet visible in the KSM. This is similar to a
	 * parallel page fault seeing a MR that is being concurrently removed
	 * from the KSM. Both of these improbable situations are resolved
	 * safely by resuming the HW and then taking another page fault. The
	 * next pagefault handler will see the new information.
	 */
	mutex_lock(&odp_imr->umem_mutex);
	err = mlx5_ib_update_xlt(imr, upd_start_idx, upd_len, 0,
				 MLX5_IB_UPD_XLT_INDIRECT |
				 MLX5_IB_UPD_XLT_ATOMIC);
	mutex_unlock(&odp_imr->umem_mutex);
	if (err) {
		mlx5_ib_err(mr_to_mdev(imr), "Failed to update PAS\n");
		return err;
	}
	return ret;
}

/*
 * Returns:
 *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
 *           not accessible, or the MR is no longer valid.
 *  -EAGAIN/-ENOMEM: The operation should be retried
 *
 *  -EINVAL/others: General internal malfunction
 *  >0: Number of pages mapped
 */
static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
			u32 *bytes_mapped, u32 flags)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);

	lockdep_assert_held(&mr_to_mdev(mr)->odp_srcu);
	if (unlikely(io_virt < mr->mmkey.iova))
		return -EFAULT;

	if (!odp->is_implicit_odp) {
		u64 user_va;

		if (check_add_overflow(io_virt - mr->mmkey.iova,
				       (u64)odp->umem.address, &user_va))
			return -EFAULT;
		if (unlikely(user_va >= ib_umem_end(odp) ||
			     ib_umem_end(odp) - user_va < bcnt))
			return -EFAULT;
		return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
					 flags);
	}
	return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
				     flags);
}

int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr)
{
	int ret;

	ret = pagefault_real_mr(mr, to_ib_umem_odp(mr->umem), mr->umem->address,
				mr->umem->length, NULL,
				MLX5_PF_FLAGS_SNAPSHOT | MLX5_PF_FLAGS_ENABLE);
	return ret >= 0 ? 0 : ret;
}

struct pf_frame {
	struct pf_frame *next;
	u32 key;
	u64 io_virt;
	size_t bcnt;
	int depth;
};

static bool mkey_is_eq(struct mlx5_core_mkey *mmkey, u32 key)
{
	if (!mmkey)
		return false;
	if (mmkey->type == MLX5_MKEY_MW)
		return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key);
	return mmkey->key == key;
}

static int get_indirect_num_descs(struct mlx5_core_mkey *mmkey)
{
	struct mlx5_ib_mw *mw;
	struct mlx5_ib_devx_mr *devx_mr;

	if (mmkey->type == MLX5_MKEY_MW) {
		mw = container_of(mmkey, struct mlx5_ib_mw, mmkey);
		return mw->ndescs;
	}

	devx_mr = container_of(mmkey, struct mlx5_ib_devx_mr,
			       mmkey);
	return devx_mr->ndescs;
}

/*
 * Handle a single data segment in a page-fault WQE or RDMA region.
 *
 * Returns number of OS pages retrieved on success. The caller may continue to
 * the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling.
 */
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
					 struct ib_pd *pd, u32 key,
					 u64 io_virt, size_t bcnt,
					 u32 *bytes_committed,
					 u32 *bytes_mapped)
{
	int npages = 0, srcu_key, ret, i, outlen, cur_outlen = 0, depth = 0;
	struct pf_frame *head = NULL, *frame;
	struct mlx5_core_mkey *mmkey;
	struct mlx5_ib_mr *mr;
	struct mlx5_klm *pklm;
	u32 *out = NULL;
	size_t offset;
	int ndescs;

	srcu_key = srcu_read_lock(&dev->odp_srcu);

	io_virt += *bytes_committed;
	bcnt -= *bytes_committed;

next_mr:
	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
	if (!mmkey) {
		mlx5_ib_dbg(
			dev,
			"skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
			key);
		if (bytes_mapped)
			*bytes_mapped += bcnt;
		/*
		 * The user could specify a SGL with multiple lkeys and only
		 * some of them are ODP. Treat the non-ODP ones as fully
		 * faulted.
		 */
		ret = 0;
		goto srcu_unlock;
	}
	if (!mkey_is_eq(mmkey, key)) {
		mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
		ret = -EFAULT;
		goto srcu_unlock;
	}

	switch (mmkey->type) {
	case MLX5_MKEY_MR:
		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);

		ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0);
		if (ret < 0)
			goto srcu_unlock;

		mlx5_update_odp_stats(mr, faults, ret);

		npages += ret;
		ret = 0;
		break;

	case MLX5_MKEY_MW:
	case MLX5_MKEY_INDIRECT_DEVX:
		ndescs = get_indirect_num_descs(mmkey);

		if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
			mlx5_ib_dbg(dev, "indirection level exceeded\n");
			ret = -EFAULT;
			goto srcu_unlock;
		}

		outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
			sizeof(*pklm) * (ndescs - 2);

		if (outlen > cur_outlen) {
			kfree(out);
			out = kzalloc(outlen, GFP_KERNEL);
			if (!out) {
				ret = -ENOMEM;
				goto srcu_unlock;
			}
			cur_outlen = outlen;
		}

		pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
						       bsf0_klm0_pas_mtt0_1);

		ret = mlx5_core_query_mkey(dev->mdev, mmkey, out, outlen);
		if (ret)
			goto srcu_unlock;

		offset = io_virt - MLX5_GET64(query_mkey_out, out,
					      memory_key_mkey_entry.start_addr);

		for (i = 0; bcnt && i < ndescs; i++, pklm++) {
			if (offset >= be32_to_cpu(pklm->bcount)) {
				offset -= be32_to_cpu(pklm->bcount);
				continue;
			}

			frame = kzalloc(sizeof(*frame), GFP_KERNEL);
			if (!frame) {
				ret = -ENOMEM;
				goto srcu_unlock;
			}

			frame->key = be32_to_cpu(pklm->key);
			frame->io_virt = be64_to_cpu(pklm->va) + offset;
			frame->bcnt = min_t(size_t, bcnt,
					    be32_to_cpu(pklm->bcount) - offset);
			frame->depth = depth + 1;
			frame->next = head;
			head = frame;

			bcnt -= frame->bcnt;
			offset = 0;
		}
		break;

	default:
		mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
		ret = -EFAULT;
		goto srcu_unlock;
	}

	if (head) {
		frame = head;
		head = frame->next;

		key = frame->key;
		io_virt = frame->io_virt;
		bcnt = frame->bcnt;
		depth = frame->depth;
		kfree(frame);

		goto next_mr;
	}

srcu_unlock:
	while (head) {
		frame = head;
		head = frame->next;
		kfree(frame);
	}
	kfree(out);

	srcu_read_unlock(&dev->odp_srcu, srcu_key);
	*bytes_committed = 0;
	return ret ? ret : npages;
}

/**
 * Parse a series of data segments for page fault handling.
 *
 * @pfault contains page fault information.
 * @wqe points at the first data segment in the WQE.
 * @wqe_end points after the end of the WQE.
 * @bytes_mapped receives the number of bytes that the function was able to
 *               map. This allows the caller to decide intelligently whether
 *               enough memory was mapped to resolve the page fault
 *               successfully (e.g. enough for the next MTU, or the entire
 *               WQE).
 * @total_wqe_bytes receives the total data size of this WQE in bytes (minus
 *                  the committed bytes).
 *
 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 * negative error code.
 */
static int pagefault_data_segments(struct mlx5_ib_dev *dev,
				   struct mlx5_pagefault *pfault,
				   void *wqe,
				   void *wqe_end, u32 *bytes_mapped,
				   u32 *total_wqe_bytes, bool receive_queue)
{
	int ret = 0, npages = 0;
	u64 io_virt;
	u32 key;
	u32 byte_count;
	size_t bcnt;
	int inline_segment;

	if (bytes_mapped)
		*bytes_mapped = 0;
	if (total_wqe_bytes)
		*total_wqe_bytes = 0;

	while (wqe < wqe_end) {
		struct mlx5_wqe_data_seg *dseg = wqe;

		io_virt = be64_to_cpu(dseg->addr);
		key = be32_to_cpu(dseg->lkey);
		byte_count = be32_to_cpu(dseg->byte_count);
		inline_segment = !!(byte_count & MLX5_INLINE_SEG);
		bcnt = byte_count & ~MLX5_INLINE_SEG;

		if (inline_segment) {
			bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
			wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
				     16);
		} else {
			wqe += sizeof(*dseg);
		}

		/* receive WQE end of sg list. */
		if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
		    io_virt == 0)
			break;

		if (!inline_segment && total_wqe_bytes) {
			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
					pfault->bytes_committed);
		}

		/* A zero length data segment designates a length of 2GB. */
		if (bcnt == 0)
			bcnt = 1U << 31;

		if (inline_segment || bcnt <= pfault->bytes_committed) {
			pfault->bytes_committed -=
				min_t(size_t, bcnt,
				      pfault->bytes_committed);
			continue;
		}

		ret = pagefault_single_data_segment(dev, NULL, key,
						    io_virt, bcnt,
						    &pfault->bytes_committed,
						    bytes_mapped);
		if (ret < 0)
			break;
		npages += ret;
	}

	return ret < 0 ? ret : npages;
}

/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and set wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_initiator_pfault_handler(
	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
	u16 wqe_index = pfault->wqe.wqe_index;
	struct mlx5_base_av *av;
	unsigned ds, opcode;
	u32 qpn = qp->trans_qp.base.mqp.qpn;

	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
	if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
		mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
			    ds, wqe_length);
		return -EFAULT;
	}

	if (ds == 0) {
		mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
			    wqe_index, qpn);
		return -EFAULT;
	}

	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
	*wqe += sizeof(*ctrl);

	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
		 MLX5_WQE_CTRL_OPCODE_MASK;

	if (qp->ibqp.qp_type == IB_QPT_XRC_INI)
		*wqe += sizeof(struct mlx5_wqe_xrc_seg);

	if (qp->type == IB_QPT_UD || qp->type == MLX5_IB_QPT_DCI) {
		av = *wqe;
		if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
			*wqe += sizeof(struct mlx5_av);
		else
			*wqe += sizeof(struct mlx5_base_av);
	}

	switch (opcode) {
	case MLX5_OPCODE_RDMA_WRITE:
	case MLX5_OPCODE_RDMA_WRITE_IMM:
	case MLX5_OPCODE_RDMA_READ:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		break;
	case MLX5_OPCODE_ATOMIC_CS:
	case MLX5_OPCODE_ATOMIC_FA:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		*wqe += sizeof(struct mlx5_wqe_atomic_seg);
		break;
	}

	return 0;
}

/*
 * Parse responder WQE and set wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev,
						   struct mlx5_ib_srq *srq,
						   void **wqe, void **wqe_end,
						   int wqe_length)
{
	int wqe_size = 1 << srq->msrq.wqe_shift;

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	*wqe_end = *wqe + wqe_size;
	*wqe += sizeof(struct mlx5_wqe_srq_next_seg);

	return 0;
}

static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev,
						  struct mlx5_ib_qp *qp,
						  void *wqe, void **wqe_end,
						  int wqe_length)
{
	struct mlx5_ib_wq *wq = &qp->rq;
	int wqe_size = 1 << wq->wqe_shift;

	if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) {
		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
		return -EFAULT;
	}

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	*wqe_end = wqe + wqe_size;

	return 0;
}

static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev,
						       u32 wq_num, int pf_type)
{
	struct mlx5_core_rsc_common *common = NULL;
	struct mlx5_core_srq *srq;

	switch (pf_type) {
	case MLX5_WQE_PF_TYPE_RMP:
		srq = mlx5_cmd_get_srq(dev, wq_num);
		if (srq)
			common = &srq->common;
		break;
	case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
	case MLX5_WQE_PF_TYPE_RESP:
	case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
		common = mlx5_core_res_hold(dev, wq_num, MLX5_RES_QP);
		break;
	default:
		break;
	}

	return common;
}

static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res)
{
	struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res;

	return to_mibqp(mqp);
}

static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res)
{
	struct mlx5_core_srq *msrq =
		container_of(res, struct mlx5_core_srq, common);

	return to_mibsrq(msrq);
}

static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
					  struct mlx5_pagefault *pfault)
{
	bool sq = pfault->type & MLX5_PFAULT_REQUESTOR;
	u16 wqe_index = pfault->wqe.wqe_index;
	void *wqe, *wqe_start = NULL, *wqe_end = NULL;
	u32 bytes_mapped, total_wqe_bytes;
	struct mlx5_core_rsc_common *res;
	int resume_with_error = 1;
	struct mlx5_ib_qp *qp;
	size_t bytes_copied;
	int ret = 0;

	res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
	if (!res) {
		mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num);
		return;
	}

	if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ &&
	    res->res != MLX5_RES_XSRQ) {
		mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n",
			    pfault->type);
		goto resolve_page_fault;
	}

	wqe_start = (void *)__get_free_page(GFP_KERNEL);
	if (!wqe_start) {
		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
		goto resolve_page_fault;
	}

	wqe = wqe_start;
	qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
	if (qp && sq) {
		ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
					  &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_initiator_pfault_handler(
			dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
	} else if (qp && !sq) {
		ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
					  &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_responder_pfault_handler_rq(
			dev, qp, wqe, &wqe_end, bytes_copied);
	} else if (!qp) {
		struct mlx5_ib_srq *srq = res_to_srq(res);

		ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
					   &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_responder_pfault_handler_srq(
			dev, srq, &wqe, &wqe_end, bytes_copied);
	}

	if (ret < 0 || wqe >= wqe_end)
		goto resolve_page_fault;

	ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped,
				      &total_wqe_bytes, !sq);
	if (ret == -EAGAIN)
		goto out;

	if (ret < 0 || total_wqe_bytes > bytes_mapped)
		goto resolve_page_fault;

out:
	ret = 0;
	resume_with_error = 0;

read_user:
	if (ret)
		mlx5_ib_err(
			dev,
			"Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n",
			ret, wqe_index, pfault->token);

resolve_page_fault:
	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
		    pfault->wqe.wq_num, resume_with_error,
		    pfault->type);
	mlx5_core_res_put(res);
	free_page((unsigned long)wqe_start);
}

static int pages_in_range(u64 address, u32 length)
{
	return (ALIGN(address + length, PAGE_SIZE) -
		(address & PAGE_MASK)) >> PAGE_SHIFT;
}
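
/*
 * Example: address = 0xfff0 and length = 0x20 with 4KiB pages touches two
 * pages, since ALIGN(0x10010, 0x1000) - (0xfff0 & PAGE_MASK) = 0x2000.
 */
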
static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
					   struct mlx5_pagefault *pfault)
{
	u64 address;
	u32 length;
	u32 prefetch_len = pfault->bytes_committed;
	int prefetch_activated = 0;
	u32 rkey = pfault->rdma.r_key;
	int ret;

	/* The RDMA responder handler handles the page fault in two parts.
	 * First it brings the necessary pages for the current packet
	 * (and uses the pfault context), and then (after resuming the QP)
	 * prefetches more pages. The second operation cannot use the pfault
	 * context and therefore uses the dummy_pfault context allocated on
	 * the stack */
	pfault->rdma.rdma_va += pfault->bytes_committed;
	pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
					pfault->rdma.rdma_op_len);
	pfault->bytes_committed = 0;

	address = pfault->rdma.rdma_va;
	length  = pfault->rdma.rdma_op_len;

	/* For some operations, the hardware cannot tell the exact message
	 * length, and in those cases it reports zero. Use prefetch
	 * logics. */
	if (length == 0) {
		prefetch_activated = 1;
		length = pfault->rdma.packet_size;
		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
	}

	ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
					    &pfault->bytes_committed, NULL);
	if (ret == -EAGAIN) {
		/* We're racing with an invalidation, don't prefetch */
		prefetch_activated = 0;
	} else if (ret < 0 || pages_in_range(address, length) > ret) {
		mlx5_ib_page_fault_resume(dev, pfault, 1);
		if (ret != -ENOENT)
			mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%x, type: 0x%x\n",
				    ret, pfault->token, pfault->type);
		return;
	}

	mlx5_ib_page_fault_resume(dev, pfault, 0);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x, type: 0x%x, prefetch_activated: %d\n",
		    pfault->token, pfault->type,
		    prefetch_activated);

	/* At this point, there might be a new pagefault already arriving in
	 * the eq, switch to the dummy pagefault for the rest of the
	 * processing. We're still OK with the objects being alive as the
	 * work-queue is being fenced. */

	if (prefetch_activated) {
		u32 bytes_committed = 0;

		ret = pagefault_single_data_segment(dev, NULL, rkey, address,
						    prefetch_len,
						    &bytes_committed, NULL);
		if (ret < 0 && ret != -EAGAIN) {
			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%x, address: 0x%.16llx, length = 0x%.16x\n",
				    ret, pfault->token, address, prefetch_len);
		}
	}
}

static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
{
	u8 event_subtype = pfault->event_subtype;

	switch (event_subtype) {
	case MLX5_PFAULT_SUBTYPE_WQE:
		mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
		break;
	case MLX5_PFAULT_SUBTYPE_RDMA:
		mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
		break;
	default:
		mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
			    event_subtype);
		mlx5_ib_page_fault_resume(dev, pfault, 1);
	}
}

static void mlx5_ib_eqe_pf_action(struct work_struct *work)
{
	struct mlx5_pagefault *pfault = container_of(work,
						     struct mlx5_pagefault,
						     work);
	struct mlx5_ib_pf_eq *eq = pfault->eq;

	mlx5_ib_pfault(eq->dev, pfault);
	mempool_free(pfault, eq->pool);
}

static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
{
	struct mlx5_eqe_page_fault *pf_eqe;
	struct mlx5_pagefault *pfault;
	struct mlx5_eqe *eqe;
	int cc = 0;

	while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
		pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
		if (!pfault) {
			schedule_work(&eq->work);
			break;
		}

		pf_eqe = &eqe->data.page_fault;
		pfault->event_subtype = eqe->sub_type;
		pfault->bytes_committed = be32_to_cpu(pf_eqe->bytes_committed);

		mlx5_ib_dbg(eq->dev,
			    "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x\n",
			    eqe->sub_type, pfault->bytes_committed);

		switch (eqe->sub_type) {
		case MLX5_PFAULT_SUBTYPE_RDMA:
			/* RDMA based event */
			pfault->type =
				be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
			pfault->token =
				be32_to_cpu(pf_eqe->rdma.pftype_token) &
				MLX5_24BIT_MASK;
			pfault->rdma.r_key =
				be32_to_cpu(pf_eqe->rdma.r_key);
			pfault->rdma.packet_size =
				be16_to_cpu(pf_eqe->rdma.packet_length);
			pfault->rdma.rdma_op_len =
				be32_to_cpu(pf_eqe->rdma.rdma_op_len);
			pfault->rdma.rdma_va =
				be64_to_cpu(pf_eqe->rdma.rdma_va);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: type:0x%x, token: 0x%06x, r_key: 0x%08x\n",
				    pfault->type, pfault->token,
				    pfault->rdma.r_key);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
				    pfault->rdma.rdma_op_len,
				    pfault->rdma.rdma_va);
			break;

		case MLX5_PFAULT_SUBTYPE_WQE:
			/* WQE based event */
			pfault->type =
				(be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
			pfault->token =
				be32_to_cpu(pf_eqe->wqe.token);
			pfault->wqe.wq_num =
				be32_to_cpu(pf_eqe->wqe.pftype_wq) &
				MLX5_24BIT_MASK;
			pfault->wqe.wqe_index =
				be16_to_cpu(pf_eqe->wqe.wqe_index);
			pfault->wqe.packet_size =
				be16_to_cpu(pf_eqe->wqe.packet_length);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: type:0x%x, token: 0x%06x, wq_num: 0x%06x, wqe_index: 0x%04x\n",
				    pfault->type, pfault->token,
				    pfault->wqe.wq_num,
				    pfault->wqe.wqe_index);
			break;

		default:
			mlx5_ib_warn(eq->dev,
				     "Unsupported page fault event sub-type: 0x%02hhx\n",
				     eqe->sub_type);
			/* Unsupported page faults should still be
			 * resolved by the page fault handler
			 */
		}

		pfault->eq = eq;
		INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
		queue_work(eq->wq, &pfault->work);

		cc = mlx5_eq_update_cc(eq->core, ++cc);
	}

	mlx5_eq_update_ci(eq->core, cc, 1);
}

static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type,
			     void *data)
{
	struct mlx5_ib_pf_eq *eq =
		container_of(nb, struct mlx5_ib_pf_eq, irq_nb);
	unsigned long flags;

	if (spin_trylock_irqsave(&eq->lock, flags)) {
		mlx5_ib_eq_pf_process(eq);
		spin_unlock_irqrestore(&eq->lock, flags);
	} else {
		schedule_work(&eq->work);
	}

	return IRQ_HANDLED;
}

/* mempool_refill() was proposed but unfortunately wasn't accepted
 * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
 * Cheap workaround below.
 */
static void mempool_refill(mempool_t *pool)
{
	while (pool->curr_nr < pool->min_nr)
		mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
}

static void mlx5_ib_eq_pf_action(struct work_struct *work)
{
	struct mlx5_ib_pf_eq *eq =
		container_of(work, struct mlx5_ib_pf_eq, work);

	mempool_refill(eq->pool);

	spin_lock_irq(&eq->lock);
	mlx5_ib_eq_pf_process(eq);
	spin_unlock_irq(&eq->lock);
}

enum {
	MLX5_IB_NUM_PF_EQE	= 0x1000,
	MLX5_IB_NUM_PF_DRAIN	= 64,
};

static int
mlx5_ib_create_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
{
	struct mlx5_eq_param param = {};
	int err;

	INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
	spin_lock_init(&eq->lock);
	eq->dev = dev;

	eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
					       sizeof(struct mlx5_pagefault));
	if (!eq->pool)
		return -ENOMEM;

	eq->wq = alloc_workqueue("mlx5_ib_page_fault",
				 WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
				 MLX5_NUM_CMD_EQE);
	if (!eq->wq) {
		err = -ENOMEM;
		goto err_mempool;
	}

	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
	param = (struct mlx5_eq_param) {
		.irq_index = 0,
		.nent = MLX5_IB_NUM_PF_EQE,
	};
	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
	if (IS_ERR(eq->core)) {
		err = PTR_ERR(eq->core);
		goto err_wq;
	}
	err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb);
	if (err) {
		mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err);
		goto err_eq;
	}

	return 0;
err_eq:
	mlx5_eq_destroy_generic(dev->mdev, eq->core);
err_wq:
	destroy_workqueue(eq->wq);
err_mempool:
	mempool_destroy(eq->pool);
	return err;
}

static int
mlx5_ib_destroy_pf_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
{
	int err;

	mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb);
	err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
	cancel_work_sync(&eq->work);
	destroy_workqueue(eq->wq);
	mempool_destroy(eq->pool);

	return err;
}

void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent)
{
	if (!(ent->dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
		return;

	switch (ent->order - 2) {
	case MLX5_IMR_MTT_CACHE_ENTRY:
		ent->page = PAGE_SHIFT;
		ent->xlt = MLX5_IMR_MTT_ENTRIES *
			   sizeof(struct mlx5_mtt) /
			   MLX5_IB_UMR_OCTOWORD;
		ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
		ent->limit = 0;
		break;

	case MLX5_IMR_KSM_CACHE_ENTRY:
		ent->page = MLX5_KSM_PAGE_SHIFT;
		ent->xlt = mlx5_imr_ksm_entries *
			   sizeof(struct mlx5_klm) /
			   MLX5_IB_UMR_OCTOWORD;
		ent->access_mode = MLX5_MKC_ACCESS_MODE_KSM;
		ent->limit = 0;
		break;
	}
}

= {
1684 .advise_mr
= mlx5_ib_advise_mr
,
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{
	int ret = 0;

	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
		return ret;

	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);

	if (dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT) {
		ret = mlx5_cmd_null_mkey(dev->mdev, &dev->null_mkey);
		if (ret) {
			mlx5_ib_err(dev, "Error getting null_mkey %d\n", ret);
			return ret;
		}
	}

	ret = mlx5_ib_create_pf_eq(dev, &dev->odp_pf_eq);

	return ret;
}

*dev
)
1711 if (!(dev
->odp_caps
.general_caps
& IB_ODP_SUPPORT
))
1714 mlx5_ib_destroy_pf_eq(dev
, &dev
->odp_pf_eq
);
int mlx5_ib_odp_init(void)
{
	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
				       MLX5_IMR_MTT_BITS);

	return 0;
}
struct prefetch_mr_work {
	struct work_struct work;
	u32 pf_flags;
	u32 num_sge;
	struct {
		u64 io_virt;
		struct mlx5_ib_mr *mr;
		size_t length;
	} frags[];
};

static void destroy_prefetch_work(struct prefetch_mr_work *work)
{
	u32 i;

	for (i = 0; i < work->num_sge; ++i)
		if (atomic_dec_and_test(&work->frags[i].mr->num_deferred_work))
			wake_up(&work->frags[i].mr->q_deferred_work);
	kvfree(work);
}

static struct mlx5_ib_mr *
get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
		    u32 lkey)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_core_mkey *mmkey;
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;

	lockdep_assert_held(&dev->odp_srcu);

	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
	if (!mmkey || mmkey->key != lkey || mmkey->type != MLX5_MKEY_MR)
		return NULL;

	mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);

	if (mr->ibmr.pd != pd)
		return NULL;

	odp = to_ib_umem_odp(mr->umem);

	/* prefetch with write-access must be supported by the MR */
	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
	    !odp->umem.writable)
		return NULL;

	return mr;
}

*w
)
1778 struct prefetch_mr_work
*work
=
1779 container_of(w
, struct prefetch_mr_work
, work
);
1780 struct mlx5_ib_dev
*dev
;
1781 u32 bytes_mapped
= 0;
1786 /* We rely on IB/core that work is executed if we have num_sge != 0 only. */
1787 WARN_ON(!work
->num_sge
);
1788 dev
= mr_to_mdev(work
->frags
[0].mr
);
1789 /* SRCU should be held when calling to mlx5_odp_populate_xlt() */
1790 srcu_key
= srcu_read_lock(&dev
->odp_srcu
);
1791 for (i
= 0; i
< work
->num_sge
; ++i
) {
1792 ret
= pagefault_mr(work
->frags
[i
].mr
, work
->frags
[i
].io_virt
,
1793 work
->frags
[i
].length
, &bytes_mapped
,
1797 mlx5_update_odp_stats(work
->frags
[i
].mr
, prefetch
, ret
);
1799 srcu_read_unlock(&dev
->odp_srcu
, srcu_key
);
1801 destroy_prefetch_work(work
);
static bool init_prefetch_work(struct ib_pd *pd,
			       enum ib_uverbs_advise_mr_advice advice,
			       u32 pf_flags, struct prefetch_mr_work *work,
			       struct ib_sge *sg_list, u32 num_sge)
{
	u32 i;

	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
	work->pf_flags = pf_flags;

	for (i = 0; i < num_sge; ++i) {
		work->frags[i].io_virt = sg_list[i].addr;
		work->frags[i].length = sg_list[i].length;
		work->frags[i].mr =
			get_prefetchable_mr(pd, advice, sg_list[i].lkey);
		if (!work->frags[i].mr) {
			work->num_sge = i;
			return false;
		}

		/* Keep the MR pointer valid outside the SRCU */
		atomic_inc(&work->frags[i].mr->num_deferred_work);
	}
	work->num_sge = num_sge;
	return true;
}

static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
				    enum ib_uverbs_advise_mr_advice advice,
				    u32 pf_flags, struct ib_sge *sg_list,
				    u32 num_sge)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 bytes_mapped = 0;
	int srcu_key;
	int ret = 0;
	u32 i;

	srcu_key = srcu_read_lock(&dev->odp_srcu);
	for (i = 0; i < num_sge; ++i) {
		struct mlx5_ib_mr *mr;

		mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
		if (!mr) {
			ret = -ENOENT;
			goto out;
		}
		ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
				   &bytes_mapped, pf_flags);
		if (ret < 0)
			goto out;
		mlx5_update_odp_stats(mr, prefetch, ret);
	}
	ret = 0;

out:
	srcu_read_unlock(&dev->odp_srcu, srcu_key);
	return ret;
}
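
/*
 * mlx5_ib_advise_mr_prefetch() backs the .advise_mr verb registered in
 * mlx5_ib_dev_odp_ops above. An illustrative sketch of how userspace
 * typically reaches it through rdma-core (exact usage may differ):
 *
 *	struct ibv_sge sge = {
 *		.addr = (uintptr_t)buf, .length = len, .lkey = mr->lkey,
 *	};
 *	ibv_advise_mr(pd, IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE,
 *		      IBV_ADVISE_MR_FLAG_FLUSH, &sge, 1);
 *
 * With the FLUSH flag the prefetch is performed synchronously via
 * mlx5_ib_prefetch_sg_list(); otherwise it is queued to a workqueue below.
 */
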
int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
			       enum ib_uverbs_advise_mr_advice advice,
			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 pf_flags = 0;
	struct prefetch_mr_work *work;
	int srcu_key;

	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;

	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
		pf_flags |= MLX5_PF_FLAGS_SNAPSHOT;

	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
		return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list,
						num_sge);

	work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
	if (!work)
		return -ENOMEM;

	srcu_key = srcu_read_lock(&dev->odp_srcu);
	if (!init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge)) {
		srcu_read_unlock(&dev->odp_srcu, srcu_key);
		destroy_prefetch_work(work);
		return -EINVAL;
	}
	queue_work(system_unbound_wq, &work->work);
	srcu_read_unlock(&dev->odp_srcu, srcu_key);
	return 0;
}