/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <rdma/ib_umem_odp.h>
#include <linux/kernel.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>

#include <linux/mlx5/eq.h>
/* Contains the details of a pagefault. */
struct mlx5_pagefault {
	u32			bytes_committed;
	u64			token;
	u8			event_subtype;
	u8			type;
	union {
		/* Initiator or send message responder pagefault details. */
		struct {
			/* Received packet size, only valid for responders. */
			u32	packet_size;
			/*
			 * Number of resource holding WQE, depends on type.
			 */
			u32	wq_num;
			/*
			 * WQE index. Refers to either the send queue or
			 * receive queue, according to event_subtype.
			 */
			u16	wqe_index;
		} wqe;
		/* RDMA responder pagefault details */
		struct {
			u32	r_key;
			/*
			 * Received packet size, minimal size page fault
			 * resolution required for forward progress.
			 */
			u32	packet_size;
			u32	rdma_op_len;
			u64	rdma_va;
		} rdma;
		struct {
			u64	va;
			u32	mkey;
			u32	fault_byte_count;
			u32	prefetch_before_byte_count;
			u32	prefetch_after_byte_count;
			u8	flags;
		} memory;
	};

	struct mlx5_ib_pf_eq	*eq;
	struct work_struct	work;
};
#define MAX_PREFETCH_LEN (4*1024*1024U)

/* Timeout in ms to wait for an active mmu notifier to complete when handling
 * a pagefault. */
#define MMU_NOTIFIER_TIMEOUT 1000

#define MLX5_IMR_MTT_BITS (30 - PAGE_SHIFT)
#define MLX5_IMR_MTT_SHIFT (MLX5_IMR_MTT_BITS + PAGE_SHIFT)
#define MLX5_IMR_MTT_ENTRIES BIT_ULL(MLX5_IMR_MTT_BITS)
#define MLX5_IMR_MTT_SIZE BIT_ULL(MLX5_IMR_MTT_SHIFT)
#define MLX5_IMR_MTT_MASK (~(MLX5_IMR_MTT_SIZE - 1))

#define MLX5_KSM_PAGE_SHIFT MLX5_IMR_MTT_SHIFT

static u64 mlx5_imr_ksm_entries;
static void populate_klm(struct mlx5_klm *pklm, size_t idx, size_t nentries,
			 struct mlx5_ib_mr *imr, int flags)
{
	struct mlx5_core_dev *dev = mr_to_mdev(imr)->mdev;
	struct mlx5_klm *end = pklm + nentries;
	int step = MLX5_CAP_ODP(dev, mem_page_fault) ? MLX5_IMR_MTT_SIZE : 0;
	__be32 key = MLX5_CAP_ODP(dev, mem_page_fault) ?
			     cpu_to_be32(imr->null_mmkey.key) :
			     mr_to_mdev(imr)->mkeys.null_mkey;
	u64 va =
		MLX5_CAP_ODP(dev, mem_page_fault) ? idx * MLX5_IMR_MTT_SIZE : 0;

	if (flags & MLX5_IB_UPD_XLT_ZAP) {
		for (; pklm != end; pklm++, idx++, va += step) {
			pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
			pklm->key = key;
			pklm->va = cpu_to_be64(va);
		}
		return;
	}
	/*
	 * The locking here is pretty subtle. Ideally the implicit_children
	 * xarray would be protected by the umem_mutex, however that is not
	 * possible. Instead this uses a weaker update-then-lock pattern:
	 *
	 *    xa_store()
	 *    mutex_lock(umem_mutex)
	 *     mlx5r_umr_update_xlt()
	 *    mutex_unlock(umem_mutex)
	 *
	 * ie any change to the xarray must be followed by the locked
	 * update_xlt before exiting.
	 *
	 * The umem_mutex provides the acquire/release semantic needed to make
	 * the xa_store() visible to a racing thread.
	 */
	lockdep_assert_held(&to_ib_umem_odp(imr->umem)->umem_mutex);

	for (; pklm != end; pklm++, idx++, va += step) {
		struct mlx5_ib_mr *mtt = xa_load(&imr->implicit_children, idx);

		pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE);
		if (mtt) {
			pklm->key = cpu_to_be32(mtt->ibmr.lkey);
			pklm->va = cpu_to_be64(idx * MLX5_IMR_MTT_SIZE);
		} else {
			pklm->key = key;
			pklm->va = cpu_to_be64(va);
		}
	}
}
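
/*
 * Convert an ODP umem DMA entry, which carries the ODP_READ/WRITE_ALLOWED
 * permission bits alongside the DMA address, into the MTT format the device
 * expects (address plus MLX5_IB_MTT_READ/WRITE flags).
 */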
static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
{
	u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;

	if (umem_dma & ODP_READ_ALLOWED_BIT)
		mtt_entry |= MLX5_IB_MTT_READ;
	if (umem_dma & ODP_WRITE_ALLOWED_BIT)
		mtt_entry |= MLX5_IB_MTT_WRITE;

	return mtt_entry;
}
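
/*
 * Fill a run of MTT entries for a non-implicit ODP MR from the umem's
 * dma_list. For MLX5_IB_UPD_XLT_ZAP nothing needs to be copied and the
 * function returns early.
 */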
static void populate_mtt(__be64 *pas, size_t idx, size_t nentries,
			 struct mlx5_ib_mr *mr, int flags)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	dma_addr_t pa;
	size_t i;

	if (flags & MLX5_IB_UPD_XLT_ZAP)
		return;

	for (i = 0; i < nentries; i++) {
		pa = odp->dma_list[idx + i];
		pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
	}
}
void mlx5_odp_populate_xlt(void *xlt, size_t idx, size_t nentries,
			   struct mlx5_ib_mr *mr, int flags)
{
	if (flags & MLX5_IB_UPD_XLT_INDIRECT) {
		populate_klm(xlt, idx, nentries, mr, flags);
	} else {
		populate_mtt(xlt, idx, nentries, mr, flags);
	}
}
/*
 * This must be called after the mr has been removed from implicit_children.
 * NOTE: The MR does not necessarily have to be
 * empty here, parallel page faults could have raced with the free process and
 * left behind child mkeys. We must undo it here.
 */
static void free_implicit_child_mr_work(struct work_struct *work)
{
	struct mlx5_ib_mr *mr =
		container_of(work, struct mlx5_ib_mr, odp_destroy.work);
	struct mlx5_ib_mr *imr = mr->parent;
	struct ib_umem_odp *odp_imr = to_ib_umem_odp(imr->umem);
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);

	mlx5r_deref_wait_odp_mkey(&mr->mmkey);

	mutex_lock(&odp_imr->umem_mutex);
	mlx5r_umr_update_xlt(mr->parent,
			     ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT, 1, 0,
			     MLX5_IB_UPD_XLT_INDIRECT | MLX5_IB_UPD_XLT_ATOMIC);
	mutex_unlock(&odp_imr->umem_mutex);
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);

	mlx5r_deref_odp_mkey(&imr->mmkey);
}
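
/*
 * Called from the invalidation path when a child MR has no remaining mapped
 * pages. The child is unhooked from the parent's implicit_children xarray
 * (and from odp_mkeys for the memory page fault scheme) and its destruction
 * is deferred to a workqueue, since freeing an MR may sleep.
 */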
static void destroy_unused_implicit_child_mr(struct mlx5_ib_mr *mr)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);
	unsigned long idx = ib_umem_start(odp) >> MLX5_IMR_MTT_SHIFT;
	struct mlx5_ib_mr *imr = mr->parent;

	if (!refcount_inc_not_zero(&imr->mmkey.usecount))
		return;

	xa_erase(&imr->implicit_children, idx);
	if (MLX5_CAP_ODP(mr_to_mdev(mr)->mdev, mem_page_fault))
		xa_erase(&mr_to_mdev(mr)->odp_mkeys,
			 mlx5_base_mkey(mr->mmkey.key));

	/* Freeing a MR is a sleeping operation, so bounce to a work queue */
	INIT_WORK(&mr->odp_destroy.work, free_implicit_child_mr_work);
	queue_work(system_unbound_wq, &mr->odp_destroy.work);
}
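
/*
 * mmu_interval_notifier callback: zap the HW translation entries covering the
 * invalidated range in as few UMR operations as practical, then unmap the DMA
 * pages. Returns false only when a non-blockable invalidation is requested.
 */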
static bool mlx5_ib_invalidate_range(struct mmu_interval_notifier *mni,
				     const struct mmu_notifier_range *range,
				     unsigned long cur_seq)
{
	struct ib_umem_odp *umem_odp =
		container_of(mni, struct ib_umem_odp, notifier);
	struct mlx5_ib_mr *mr;
	const u64 umr_block_mask = MLX5_UMR_MTT_NUM_ENTRIES_ALIGNMENT - 1;
	u64 idx = 0, blk_start_idx = 0;
	u64 invalidations = 0;
	unsigned long start;
	unsigned long end;
	int in_block = 0;
	u64 addr;

	if (!mmu_notifier_range_blockable(range))
		return false;

	mutex_lock(&umem_odp->umem_mutex);
	mmu_interval_set_seq(mni, cur_seq);
	/*
	 * If npages is zero then umem_odp->private may not be setup yet. This
	 * does not complete until after the first page is mapped for DMA.
	 */
	if (!umem_odp->npages)
		goto out;
	mr = umem_odp->private;

	start = max_t(u64, ib_umem_start(umem_odp), range->start);
	end = min_t(u64, ib_umem_end(umem_odp), range->end);

	/*
	 * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
	 * while we are doing the invalidation, no page fault will attempt to
	 * overwrite the same MTTs. Concurrent invalidations might race us,
	 * but they will write 0s as well, so no difference in the end result.
	 */
	for (addr = start; addr < end; addr += BIT(umem_odp->page_shift)) {
		idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
		/*
		 * Strive to write the MTTs in chunks, but avoid overwriting
		 * non-existing MTTs. The heuristic here can be improved to
		 * estimate the cost of another UMR vs. the cost of bigger
		 * UMR.
		 */
		if (umem_odp->dma_list[idx] &
		    (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
			if (!in_block) {
				blk_start_idx = idx;
				in_block = 1;
			}

			/* Count page invalidations */
			invalidations += idx - blk_start_idx + 1;
		} else {
			u64 umr_offset = idx & umr_block_mask;

			if (in_block && umr_offset == 0) {
				mlx5r_umr_update_xlt(mr, blk_start_idx,
						     idx - blk_start_idx, 0,
						     MLX5_IB_UPD_XLT_ZAP |
						     MLX5_IB_UPD_XLT_ATOMIC);
				in_block = 0;
			}
		}
	}
	if (in_block)
		mlx5r_umr_update_xlt(mr, blk_start_idx,
				     idx - blk_start_idx + 1, 0,
				     MLX5_IB_UPD_XLT_ZAP |
				     MLX5_IB_UPD_XLT_ATOMIC);

	mlx5_update_odp_stats(mr, invalidations, invalidations);

	/*
	 * We are now sure that the device will not access the
	 * memory. We can safely unmap it, and mark it as dirty if
	 * needed.
	 */
	ib_umem_odp_unmap_dma_pages(umem_odp, start, end);

	if (unlikely(!umem_odp->npages && mr->parent))
		destroy_unused_implicit_child_mr(mr);
out:
	mutex_unlock(&umem_odp->umem_mutex);
	return true;
}
const struct mmu_interval_notifier_ops mlx5_mn_ops = {
	.invalidate = mlx5_ib_invalidate_range,
};
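
/*
 * Derive the ODP capabilities advertised to users from the device
 * capabilities; everything is left zeroed when the device cannot page fault
 * or UMR cannot be used to load PAS.
 */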
static void internal_fill_odp_caps(struct mlx5_ib_dev *dev)
{
	struct ib_odp_caps *caps = &dev->odp_caps;

	memset(caps, 0, sizeof(*caps));

	if (!MLX5_CAP_GEN(dev->mdev, pg) || !mlx5r_umr_can_load_pas(dev, 0))
		return;

	caps->general_caps = IB_ODP_SUPPORT;

	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		dev->odp_max_size = U64_MAX;
	else
		dev->odp_max_size = BIT_ULL(MLX5_MAX_UMR_SHIFT + PAGE_SHIFT);

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.send))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, ud_odp_caps.srq_receive))
		caps->per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.send))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.write))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.read))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.atomic))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, rc_odp_caps.srq_receive))
		caps->per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.send))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SEND;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.receive))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_RECV;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.write))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_WRITE;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.read))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_READ;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.atomic))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_ATOMIC;

	if (MLX5_CAP_ODP_SCHEME(dev->mdev, xrc_odp_caps.srq_receive))
		caps->per_transport_caps.xrc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;

	if (MLX5_CAP_GEN(dev->mdev, fixed_buffer_size) &&
	    MLX5_CAP_GEN(dev->mdev, null_mkey) &&
	    MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset) &&
	    !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled))
		caps->general_caps |= IB_ODP_SUPPORT_IMPLICIT;
}
static void mlx5_ib_page_fault_resume(struct mlx5_ib_dev *dev,
				      struct mlx5_pagefault *pfault,
				      int error)
{
	int wq_num = pfault->event_subtype == MLX5_PFAULT_SUBTYPE_WQE ?
			     pfault->wqe.wq_num : pfault->token;
	u32 in[MLX5_ST_SZ_DW(page_fault_resume_in)] = {};
	void *info;
	int err;

	MLX5_SET(page_fault_resume_in, in, opcode, MLX5_CMD_OP_PAGE_FAULT_RESUME);

	if (pfault->event_subtype == MLX5_PFAULT_SUBTYPE_MEMORY) {
		info = MLX5_ADDR_OF(page_fault_resume_in, in,
				    page_fault_info.mem_page_fault_info);
		MLX5_SET(mem_page_fault_info, info, fault_token_31_0,
			 pfault->token & 0xffffffff);
		MLX5_SET(mem_page_fault_info, info, fault_token_47_32,
			 (pfault->token >> 32) & 0xffff);
		MLX5_SET(mem_page_fault_info, info, error, !!error);
	} else {
		info = MLX5_ADDR_OF(page_fault_resume_in, in,
				    page_fault_info.trans_page_fault_info);
		MLX5_SET(trans_page_fault_info, info, page_fault_type,
			 pfault->type);
		MLX5_SET(trans_page_fault_info, info, fault_token,
			 pfault->token);
		MLX5_SET(trans_page_fault_info, info, wq_number, wq_num);
		MLX5_SET(trans_page_fault_info, info, error, !!error);
	}

	err = mlx5_cmd_exec_in(dev->mdev, page_fault_resume, in);
	if (err)
		mlx5_ib_err(dev, "Failed to resolve the page fault on WQ 0x%x err %d\n",
			    wq_num, err);
}
static struct mlx5_ib_mr *implicit_get_child_mr(struct mlx5_ib_mr *imr,
						unsigned long idx)
{
	struct mlx5_ib_dev *dev = mr_to_mdev(imr);
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;
	struct mlx5_ib_mr *ret;
	int err;

	odp = ib_umem_odp_alloc_child(to_ib_umem_odp(imr->umem),
				      idx * MLX5_IMR_MTT_SIZE,
				      MLX5_IMR_MTT_SIZE, &mlx5_mn_ops);
	if (IS_ERR(odp))
		return ERR_CAST(odp);

	mr = mlx5_mr_cache_alloc(dev, imr->access_flags,
				 MLX5_MKC_ACCESS_MODE_MTT,
				 MLX5_IMR_MTT_ENTRIES);
	if (IS_ERR(mr)) {
		ib_umem_odp_release(odp);
		return mr;
	}

	mr->access_flags = imr->access_flags;
	mr->ibmr.pd = imr->ibmr.pd;
	mr->ibmr.device = &mr_to_mdev(imr)->ib_dev;
	mr->umem = &odp->umem;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->ibmr.iova = idx * MLX5_IMR_MTT_SIZE;
	mr->parent = imr;
	odp->private = mr;

	/*
	 * First refcount is owned by the xarray and second refcount
	 * is returned to the caller.
	 */
	refcount_set(&mr->mmkey.usecount, 2);

	err = mlx5r_umr_update_xlt(mr, 0,
				   MLX5_IMR_MTT_ENTRIES,
				   PAGE_SHIFT,
				   MLX5_IB_UPD_XLT_ZAP |
				   MLX5_IB_UPD_XLT_ENABLE);
	if (err) {
		ret = ERR_PTR(err);
		goto out_mr;
	}

	xa_lock(&imr->implicit_children);
	ret = __xa_cmpxchg(&imr->implicit_children, idx, NULL, mr,
			   GFP_KERNEL);
	if (unlikely(ret)) {
		if (xa_is_err(ret)) {
			ret = ERR_PTR(xa_err(ret));
			goto out_lock;
		}
		/*
		 * Another thread beat us to creating the child mr, use
		 * theirs.
		 */
		refcount_inc(&ret->mmkey.usecount);
		goto out_lock;
	}
	xa_unlock(&imr->implicit_children);

	if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) {
		ret = xa_store(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key),
			       &mr->mmkey, GFP_KERNEL);
		if (xa_is_err(ret)) {
			ret = ERR_PTR(xa_err(ret));
			xa_erase(&imr->implicit_children, idx);
			goto out_mr;
		}
		mr->mmkey.type = MLX5_MKEY_IMPLICIT_CHILD;
	}

	mlx5_ib_dbg(mr_to_mdev(imr), "key %x mr %p\n", mr->mmkey.key, mr);
	return mr;

out_lock:
	xa_unlock(&imr->implicit_children);
out_mr:
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
	return ret;
}
/*
 * When using memory scheme ODP, implicit MRs can't use the reserved null mkey
 * and each implicit MR needs to assign a private null mkey to get the page
 * faults on.
 * The null mkey is created with the properties to enable getting the page
 * fault for every time it is accessed and having all relevant access flags.
 */
static int alloc_implicit_mr_null_mkey(struct mlx5_ib_dev *dev,
				       struct mlx5_ib_mr *imr,
				       struct mlx5_ib_pd *pd)
{
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in) + 64;
	void *mkc;
	u32 *in;
	int err;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	MLX5_SET(create_mkey_in, in, translations_octword_actual_size, 4);
	MLX5_SET(create_mkey_in, in, pg_access, 1);

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, a, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, free, 0);
	MLX5_SET(mkc, mkc, umr_en, 0);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);

	MLX5_SET(mkc, mkc, translations_octword_size, 4);
	MLX5_SET(mkc, mkc, log_page_size, 61);
	MLX5_SET(mkc, mkc, length64, 1);
	MLX5_SET(mkc, mkc, pd, pd->pdn);
	MLX5_SET64(mkc, mkc, start_addr, 0);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	err = mlx5_core_create_mkey(dev->mdev, &imr->null_mmkey.key, in, inlen);
	if (err)
		goto free_in;

	imr->null_mmkey.type = MLX5_MKEY_NULL;

free_in:
	kfree(in);
	return err;
}
struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
					     int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->ibpd.device);
	struct ib_umem_odp *umem_odp;
	struct mlx5_ib_mr *imr;
	int err;

	if (!mlx5r_umr_can_load_pas(dev, MLX5_IMR_MTT_ENTRIES * PAGE_SIZE))
		return ERR_PTR(-EOPNOTSUPP);

	umem_odp = ib_umem_odp_alloc_implicit(&dev->ib_dev, access_flags);
	if (IS_ERR(umem_odp))
		return ERR_CAST(umem_odp);

	imr = mlx5_mr_cache_alloc(dev, access_flags, MLX5_MKC_ACCESS_MODE_KSM,
				  mlx5_imr_ksm_entries);
	if (IS_ERR(imr)) {
		ib_umem_odp_release(umem_odp);
		return imr;
	}

	imr->access_flags = access_flags;
	imr->ibmr.pd = &pd->ibpd;
	imr->umem = &umem_odp->umem;
	imr->ibmr.lkey = imr->mmkey.key;
	imr->ibmr.rkey = imr->mmkey.key;
	imr->ibmr.device = &dev->ib_dev;
	imr->is_odp_implicit = true;
	xa_init(&imr->implicit_children);

	if (MLX5_CAP_ODP(dev->mdev, mem_page_fault)) {
		err = alloc_implicit_mr_null_mkey(dev, imr, pd);
		if (err)
			goto out_mr;

		err = mlx5r_store_odp_mkey(dev, &imr->null_mmkey);
		if (err)
			goto out_mr;
	}

	err = mlx5r_umr_update_xlt(imr, 0,
				   mlx5_imr_ksm_entries,
				   MLX5_KSM_PAGE_SHIFT,
				   MLX5_IB_UPD_XLT_INDIRECT |
				   MLX5_IB_UPD_XLT_ZAP |
				   MLX5_IB_UPD_XLT_ENABLE);
	if (err)
		goto out_mr;

	err = mlx5r_store_odp_mkey(dev, &imr->mmkey);
	if (err)
		goto out_mr;

	mlx5_ib_dbg(dev, "key %x mr %p\n", imr->mmkey.key, imr);
	return imr;
out_mr:
	mlx5_ib_err(dev, "Failed to register MKEY %d\n", err);
	mlx5_ib_dereg_mr(&imr->ibmr, NULL);
	return ERR_PTR(err);
}
void mlx5_ib_free_odp_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_mr *mtt;
	unsigned long idx;

	/*
	 * If this is an implicit MR it is already invalidated so we can just
	 * delete the children mkeys.
	 */
	xa_for_each(&mr->implicit_children, idx, mtt) {
		xa_erase(&mr->implicit_children, idx);
		mlx5_ib_dereg_mr(&mtt->ibmr, NULL);
	}

	if (mr->null_mmkey.key) {
		xa_erase(&mr_to_mdev(mr)->odp_mkeys,
			 mlx5_base_mkey(mr->null_mmkey.key));

		mlx5_core_destroy_mkey(mr_to_mdev(mr)->mdev,
				       mr->null_mmkey.key);
	}
}

#define MLX5_PF_FLAGS_DOWNGRADE BIT(1)
#define MLX5_PF_FLAGS_SNAPSHOT BIT(2)
#define MLX5_PF_FLAGS_ENABLE BIT(3)
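
/*
 * Resolve a page fault on a directly mapped (non-implicit) ODP MR: pin and
 * map the user pages with ib_umem_odp_map_dma_and_lock() and push the
 * resulting MTTs to the device with a UMR. Returns the number of device
 * pages mapped, or a negative errno.
 */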
static int pagefault_real_mr(struct mlx5_ib_mr *mr, struct ib_umem_odp *odp,
			     u64 user_va, size_t bcnt, u32 *bytes_mapped,
			     u32 flags)
{
	int page_shift, ret, np;
	bool downgrade = flags & MLX5_PF_FLAGS_DOWNGRADE;
	u64 access_mask;
	u64 start_idx;
	bool fault = !(flags & MLX5_PF_FLAGS_SNAPSHOT);
	u32 xlt_flags = MLX5_IB_UPD_XLT_ATOMIC;

	if (flags & MLX5_PF_FLAGS_ENABLE)
		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;

	page_shift = odp->page_shift;
	start_idx = (user_va - ib_umem_start(odp)) >> page_shift;
	access_mask = ODP_READ_ALLOWED_BIT;

	if (odp->umem.writable && !downgrade)
		access_mask |= ODP_WRITE_ALLOWED_BIT;

	np = ib_umem_odp_map_dma_and_lock(odp, user_va, bcnt, access_mask, fault);
	if (np < 0)
		return np;

	/*
	 * No need to check whether the MTTs really belong to this MR, since
	 * ib_umem_odp_map_dma_and_lock already checks this.
	 */
	ret = mlx5r_umr_update_xlt(mr, start_idx, np, page_shift, xlt_flags);
	mutex_unlock(&odp->umem_mutex);

	if (ret < 0) {
		if (ret != -EAGAIN)
			mlx5_ib_err(mr_to_mdev(mr),
				    "Failed to update mkey page tables\n");
		return ret;
	}

	if (bytes_mapped) {
		u32 new_mappings = (np << page_shift) -
			(user_va - round_down(user_va, 1 << page_shift));

		*bytes_mapped += min_t(u32, new_mappings, bcnt);
	}

	return np << (page_shift - PAGE_SHIFT);
}
static int pagefault_implicit_mr(struct mlx5_ib_mr *imr,
				 struct ib_umem_odp *odp_imr, u64 user_va,
				 size_t bcnt, u32 *bytes_mapped, u32 flags)
{
	unsigned long end_idx = (user_va + bcnt - 1) >> MLX5_IMR_MTT_SHIFT;
	unsigned long upd_start_idx = end_idx + 1;
	unsigned long upd_len = 0;
	unsigned long npages = 0;
	int err;
	int ret;

	if (unlikely(user_va >= mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE ||
		     mlx5_imr_ksm_entries * MLX5_IMR_MTT_SIZE - user_va < bcnt))
		return -EFAULT;

	/* Fault each child mr that intersects with our interval. */
	while (bcnt) {
		unsigned long idx = user_va >> MLX5_IMR_MTT_SHIFT;
		struct ib_umem_odp *umem_odp;
		struct mlx5_ib_mr *mtt;
		u64 len;

		xa_lock(&imr->implicit_children);
		mtt = xa_load(&imr->implicit_children, idx);
		if (unlikely(!mtt)) {
			xa_unlock(&imr->implicit_children);
			mtt = implicit_get_child_mr(imr, idx);
			if (IS_ERR(mtt)) {
				ret = PTR_ERR(mtt);
				goto out;
			}
			upd_start_idx = min(upd_start_idx, idx);
			upd_len = idx - upd_start_idx + 1;
		} else {
			refcount_inc(&mtt->mmkey.usecount);
			xa_unlock(&imr->implicit_children);
		}

		umem_odp = to_ib_umem_odp(mtt->umem);
		len = min_t(u64, user_va + bcnt, ib_umem_end(umem_odp)) -
		      user_va;

		ret = pagefault_real_mr(mtt, umem_odp, user_va, len,
					bytes_mapped, flags);

		mlx5r_deref_odp_mkey(&mtt->mmkey);

		if (ret < 0)
			goto out;
		user_va += len;
		bcnt -= len;
		npages += ret;
	}

	ret = npages;

	/*
	 * Any time the implicit_children are changed we must perform an
	 * update of the xlt before exiting to ensure the HW and the
	 * implicit_children remains synchronized.
	 */
out:
	if (likely(!upd_len))
		return ret;

	/*
	 * Notice this is not strictly ordered right, the KSM is updated after
	 * the implicit_children is updated, so a parallel page fault could
	 * see a MR that is not yet visible in the KSM. This is similar to a
	 * parallel page fault seeing a MR that is being concurrently removed
	 * from the KSM. Both of these improbable situations are resolved
	 * safely by resuming the HW and then taking another page fault. The
	 * next pagefault handler will see the new information.
	 */
	mutex_lock(&odp_imr->umem_mutex);
	err = mlx5r_umr_update_xlt(imr, upd_start_idx, upd_len, 0,
				   MLX5_IB_UPD_XLT_INDIRECT |
				   MLX5_IB_UPD_XLT_ATOMIC);
	mutex_unlock(&odp_imr->umem_mutex);
	if (err) {
		mlx5_ib_err(mr_to_mdev(imr), "Failed to update PAS\n");
		return err;
	}
	return ret;
}
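
/*
 * Page faults on a dmabuf MR map the entire umem under the dma-buf
 * reservation lock rather than faulting individual ranges.
 */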
static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt,
			       u32 *bytes_mapped, u32 flags)
{
	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
	u32 xlt_flags = 0;
	int err;
	unsigned long page_size;

	if (flags & MLX5_PF_FLAGS_ENABLE)
		xlt_flags |= MLX5_IB_UPD_XLT_ENABLE;

	dma_resv_lock(umem_dmabuf->attach->dmabuf->resv, NULL);
	err = ib_umem_dmabuf_map_pages(umem_dmabuf);
	if (err) {
		dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);
		return err;
	}

	page_size = mlx5_umem_dmabuf_find_best_pgsz(umem_dmabuf);
	if (!page_size) {
		ib_umem_dmabuf_unmap_pages(umem_dmabuf);
		err = -EINVAL;
	} else {
		if (mr->data_direct)
			err = mlx5r_umr_update_data_direct_ksm_pas(mr, xlt_flags);
		else
			err = mlx5r_umr_update_mr_pas(mr, xlt_flags);
	}
	dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv);

	if (err)
		return err;

	if (bytes_mapped)
		*bytes_mapped += bcnt;

	return ib_umem_num_pages(mr->umem);
}
/*
 * Returns:
 *  -EFAULT: The io_virt->bcnt is not within the MR, it covers pages that are
 *           not accessible, or the MR is no longer valid.
 *  -EAGAIN/-ENOMEM: The operation should be retried
 *
 *  -EINVAL/others: General internal malfunction
 *  >0: Number of pages mapped
 */
static int pagefault_mr(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt,
			u32 *bytes_mapped, u32 flags, bool permissive_fault)
{
	struct ib_umem_odp *odp = to_ib_umem_odp(mr->umem);

	if (unlikely(io_virt < mr->ibmr.iova) && !permissive_fault)
		return -EFAULT;

	if (mr->umem->is_dmabuf)
		return pagefault_dmabuf_mr(mr, bcnt, bytes_mapped, flags);

	if (!odp->is_implicit_odp) {
		u64 offset = io_virt < mr->ibmr.iova ? 0 : io_virt - mr->ibmr.iova;
		u64 user_va;

		if (check_add_overflow(offset, (u64)odp->umem.address,
				       &user_va))
			return -EFAULT;

		if (permissive_fault) {
			if (user_va < ib_umem_start(odp))
				user_va = ib_umem_start(odp);
			if ((user_va + bcnt) > ib_umem_end(odp))
				bcnt = ib_umem_end(odp) - user_va;
		} else if (unlikely(user_va >= ib_umem_end(odp) ||
				    ib_umem_end(odp) - user_va < bcnt))
			return -EFAULT;
		return pagefault_real_mr(mr, odp, user_va, bcnt, bytes_mapped,
					 flags);
	}
	return pagefault_implicit_mr(mr, odp, io_virt, bcnt, bytes_mapped,
				     flags);
}
int mlx5_ib_init_odp_mr(struct mlx5_ib_mr *mr)
{
	int ret;

	ret = pagefault_real_mr(mr, to_ib_umem_odp(mr->umem), mr->umem->address,
				mr->umem->length, NULL,
				MLX5_PF_FLAGS_SNAPSHOT | MLX5_PF_FLAGS_ENABLE);

	return ret >= 0 ? 0 : ret;
}
int mlx5_ib_init_dmabuf_mr(struct mlx5_ib_mr *mr)
{
	int ret;

	ret = pagefault_dmabuf_mr(mr, mr->umem->length, NULL,
				  MLX5_PF_FLAGS_ENABLE);

	return ret >= 0 ? 0 : ret;
}
struct pf_frame {
	struct pf_frame *next;
	u32 key;
	u64 io_virt;
	size_t bcnt;
	int depth;
};
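
/*
 * Compare a faulting key against a cached mkey: MWs and indirect DEVX mkeys
 * are matched on the base mkey index only, everything else must match
 * exactly.
 */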
static bool mkey_is_eq(struct mlx5_ib_mkey *mmkey, u32 key)
{
	if (!mmkey)
		return false;
	if (mmkey->type == MLX5_MKEY_MW ||
	    mmkey->type == MLX5_MKEY_INDIRECT_DEVX)
		return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key);
	return mmkey->key == key;
}
static struct mlx5_ib_mkey *find_odp_mkey(struct mlx5_ib_dev *dev, u32 key)
{
	struct mlx5_ib_mkey *mmkey;

	xa_lock(&dev->odp_mkeys);
	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(key));
	if (!mmkey) {
		mmkey = ERR_PTR(-ENOENT);
		goto out;
	}
	if (!mkey_is_eq(mmkey, key)) {
		mmkey = ERR_PTR(-EFAULT);
		goto out;
	}
	refcount_inc(&mmkey->usecount);
out:
	xa_unlock(&dev->odp_mkeys);

	return mmkey;
}
/*
 * Handle a single data segment in a page-fault WQE or RDMA region.
 *
 * Returns number of OS pages retrieved on success. The caller may continue to
 * the next data segment.
 * Can return the following error codes:
 * -EAGAIN to designate a temporary error. The caller will abort handling the
 *  page fault and resolve it.
 * -EFAULT when there's an error mapping the requested pages. The caller will
 *  abort the page fault handling.
 */
static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
					 struct ib_pd *pd, u32 key,
					 u64 io_virt, size_t bcnt,
					 u32 *bytes_committed,
					 u32 *bytes_mapped)
{
	int npages = 0, ret, i, outlen, cur_outlen = 0, depth = 0;
	struct pf_frame *head = NULL, *frame;
	struct mlx5_ib_mkey *mmkey;
	struct mlx5_ib_mr *mr;
	struct mlx5_klm *pklm;
	u32 *out = NULL;
	size_t offset;

	io_virt += *bytes_committed;
	bcnt -= *bytes_committed;
next_mr:
	mmkey = find_odp_mkey(dev, key);
	if (IS_ERR(mmkey)) {
		ret = PTR_ERR(mmkey);
		if (ret == -ENOENT) {
			mlx5_ib_dbg(
				dev,
				"skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
				key);
			if (bytes_mapped)
				*bytes_mapped += bcnt;
			/*
			 * The user could specify a SGL with multiple lkeys and
			 * only some of them are ODP. Treat the non-ODP ones as
			 * fully faulted.
			 */
			ret = 0;
		}
		goto end;
	}

	switch (mmkey->type) {
	case MLX5_MKEY_MR:
		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);

		ret = pagefault_mr(mr, io_virt, bcnt, bytes_mapped, 0, false);
		if (ret < 0)
			goto end;

		mlx5_update_odp_stats(mr, faults, ret);

		npages += ret;
		ret = 0;
		break;

	case MLX5_MKEY_MW:
	case MLX5_MKEY_INDIRECT_DEVX:
		if (depth >= MLX5_CAP_GEN(dev->mdev, max_indirection)) {
			mlx5_ib_dbg(dev, "indirection level exceeded\n");
			ret = -EFAULT;
			goto end;
		}

		outlen = MLX5_ST_SZ_BYTES(query_mkey_out) +
			sizeof(*pklm) * (mmkey->ndescs - 2);

		if (outlen > cur_outlen) {
			kfree(out);
			out = kzalloc(outlen, GFP_KERNEL);
			if (!out) {
				ret = -ENOMEM;
				goto end;
			}
			cur_outlen = outlen;
		}

		pklm = (struct mlx5_klm *)MLX5_ADDR_OF(query_mkey_out, out,
						       bsf0_klm0_pas_mtt0_1);

		ret = mlx5_core_query_mkey(dev->mdev, mmkey->key, out, outlen);
		if (ret)
			goto end;

		offset = io_virt - MLX5_GET64(query_mkey_out, out,
					      memory_key_mkey_entry.start_addr);

		for (i = 0; bcnt && i < mmkey->ndescs; i++, pklm++) {
			if (offset >= be32_to_cpu(pklm->bcount)) {
				offset -= be32_to_cpu(pklm->bcount);
				continue;
			}

			frame = kzalloc(sizeof(*frame), GFP_KERNEL);
			if (!frame) {
				ret = -ENOMEM;
				goto end;
			}

			frame->key = be32_to_cpu(pklm->key);
			frame->io_virt = be64_to_cpu(pklm->va) + offset;
			frame->bcnt = min_t(size_t, bcnt,
					    be32_to_cpu(pklm->bcount) - offset);
			frame->depth = depth + 1;
			frame->next = head;
			head = frame;

			bcnt -= frame->bcnt;
			offset = 0;
		}
		break;

	default:
		mlx5_ib_dbg(dev, "wrong mkey type %d\n", mmkey->type);
		ret = -EFAULT;
		goto end;
	}

	if (head) {
		frame = head;
		head = frame->next;

		key = frame->key;
		io_virt = frame->io_virt;
		bcnt = frame->bcnt;
		depth = frame->depth;
		kfree(frame);

		mlx5r_deref_odp_mkey(mmkey);
		goto next_mr;
	}

end:
	if (!IS_ERR(mmkey))
		mlx5r_deref_odp_mkey(mmkey);
	while (head) {
		frame = head;
		head = frame->next;
		kfree(frame);
	}
	kfree(out);

	*bytes_committed = 0;
	return ret ? ret : npages;
}
/*
 * Parse a series of data segments for page fault handling.
 *
 * @dev:  Pointer to mlx5 IB device
 * @pfault: contains page fault information.
 * @wqe: points at the first data segment in the WQE.
 * @wqe_end: points after the end of the WQE.
 * @bytes_mapped: receives the number of bytes that the function was able to
 *                map. This allows the caller to decide intelligently whether
 *                enough memory was mapped to resolve the page fault
 *                successfully (e.g. enough for the next MTU, or the entire
 *                WQE).
 * @total_wqe_bytes: receives the total data size of this WQE in bytes (minus
 *                   the committed bytes).
 * @receive_queue: receive WQE end of sg list
 *
 * Returns the number of pages loaded if positive, zero for an empty WQE, or a
 * negative error code.
 */
static int pagefault_data_segments(struct mlx5_ib_dev *dev,
				   struct mlx5_pagefault *pfault,
				   void *wqe,
				   void *wqe_end, u32 *bytes_mapped,
				   u32 *total_wqe_bytes, bool receive_queue)
{
	int ret = 0, npages = 0;
	u64 io_virt;
	__be32 key;
	u32 byte_count;
	size_t bcnt;
	int inline_segment;

	if (bytes_mapped)
		*bytes_mapped = 0;
	if (total_wqe_bytes)
		*total_wqe_bytes = 0;

	while (wqe < wqe_end) {
		struct mlx5_wqe_data_seg *dseg = wqe;

		io_virt = be64_to_cpu(dseg->addr);
		key = dseg->lkey;
		byte_count = be32_to_cpu(dseg->byte_count);
		inline_segment = !!(byte_count & MLX5_INLINE_SEG);
		bcnt = byte_count & ~MLX5_INLINE_SEG;

		if (inline_segment) {
			bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
			wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
				     16);
		} else {
			wqe += sizeof(*dseg);
		}

		/* receive WQE end of sg list. */
		if (receive_queue && bcnt == 0 &&
		    key == dev->mkeys.terminate_scatter_list_mkey &&
		    io_virt == 0)
			break;

		if (!inline_segment && total_wqe_bytes) {
			*total_wqe_bytes += bcnt - min_t(size_t, bcnt,
					pfault->bytes_committed);
		}

		/* A zero length data segment designates a length of 2GB. */
		if (bcnt == 0)
			bcnt = 1U << 31;

		if (inline_segment || bcnt <= pfault->bytes_committed) {
			pfault->bytes_committed -=
				min_t(size_t, bcnt,
				      pfault->bytes_committed);
			continue;
		}

		ret = pagefault_single_data_segment(dev, NULL, be32_to_cpu(key),
						    io_virt, bcnt,
						    &pfault->bytes_committed,
						    bytes_mapped);
		if (ret < 0)
			break;
		npages += ret;
	}

	return ret < 0 ? ret : npages;
}
/*
 * Parse initiator WQE. Advances the wqe pointer to point at the
 * scatter-gather list, and set wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_initiator_pfault_handler(
	struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault,
	struct mlx5_ib_qp *qp, void **wqe, void **wqe_end, int wqe_length)
{
	struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
	u16 wqe_index = pfault->wqe.wqe_index;
	struct mlx5_base_av *av;
	unsigned ds, opcode;
	u32 qpn = qp->trans_qp.base.mqp.qpn;

	ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
	if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
		mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
			    ds, wqe_length);
		return -EFAULT;
	}

	if (ds == 0) {
		mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
			    wqe_index, qpn);
		return -EFAULT;
	}

	*wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
	*wqe += sizeof(*ctrl);

	opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
		 MLX5_WQE_CTRL_OPCODE_MASK;

	if (qp->type == IB_QPT_XRC_INI)
		*wqe += sizeof(struct mlx5_wqe_xrc_seg);

	if (qp->type == IB_QPT_UD || qp->type == MLX5_IB_QPT_DCI) {
		av = *wqe;
		if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV))
			*wqe += sizeof(struct mlx5_av);
		else
			*wqe += sizeof(struct mlx5_base_av);
	}

	switch (opcode) {
	case MLX5_OPCODE_RDMA_WRITE:
	case MLX5_OPCODE_RDMA_WRITE_IMM:
	case MLX5_OPCODE_RDMA_READ:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		break;
	case MLX5_OPCODE_ATOMIC_CS:
	case MLX5_OPCODE_ATOMIC_FA:
		*wqe += sizeof(struct mlx5_wqe_raddr_seg);
		*wqe += sizeof(struct mlx5_wqe_atomic_seg);
		break;
	}

	return 0;
}
/*
 * Parse responder WQE and set wqe_end to the end of the WQE.
 */
static int mlx5_ib_mr_responder_pfault_handler_srq(struct mlx5_ib_dev *dev,
						   struct mlx5_ib_srq *srq,
						   void **wqe, void **wqe_end,
						   int wqe_length)
{
	int wqe_size = 1 << srq->msrq.wqe_shift;

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	*wqe_end = *wqe + wqe_size;
	*wqe += sizeof(struct mlx5_wqe_srq_next_seg);

	return 0;
}
static int mlx5_ib_mr_responder_pfault_handler_rq(struct mlx5_ib_dev *dev,
						  struct mlx5_ib_qp *qp,
						  void *wqe, void **wqe_end,
						  int wqe_length)
{
	struct mlx5_ib_wq *wq = &qp->rq;
	int wqe_size = 1 << wq->wqe_shift;

	if (qp->flags_en & MLX5_QP_FLAG_SIGNATURE) {
		mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
		return -EFAULT;
	}

	if (wqe_size > wqe_length) {
		mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
		return -EFAULT;
	}

	*wqe_end = wqe + wqe_size;

	return 0;
}
static inline struct mlx5_core_rsc_common *odp_get_rsc(struct mlx5_ib_dev *dev,
							u32 wq_num, int pf_type)
{
	struct mlx5_core_rsc_common *common = NULL;
	struct mlx5_core_srq *srq;

	switch (pf_type) {
	case MLX5_WQE_PF_TYPE_RMP:
		srq = mlx5_cmd_get_srq(dev, wq_num);
		if (srq)
			common = &srq->common;
		break;
	case MLX5_WQE_PF_TYPE_REQ_SEND_OR_WRITE:
	case MLX5_WQE_PF_TYPE_RESP:
	case MLX5_WQE_PF_TYPE_REQ_READ_OR_ATOMIC:
		common = mlx5_core_res_hold(dev, wq_num, MLX5_RES_QP);
		break;
	default:
		break;
	}

	return common;
}

static inline struct mlx5_ib_qp *res_to_qp(struct mlx5_core_rsc_common *res)
{
	struct mlx5_core_qp *mqp = (struct mlx5_core_qp *)res;

	return to_mibqp(mqp);
}

static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res)
{
	struct mlx5_core_srq *msrq =
		container_of(res, struct mlx5_core_srq, common);

	return to_mibsrq(msrq);
}
static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
					  struct mlx5_pagefault *pfault)
{
	bool sq = pfault->type & MLX5_PFAULT_REQUESTOR;
	u16 wqe_index = pfault->wqe.wqe_index;
	void *wqe, *wqe_start = NULL, *wqe_end = NULL;
	u32 bytes_mapped, total_wqe_bytes;
	struct mlx5_core_rsc_common *res;
	int resume_with_error = 1;
	struct mlx5_ib_qp *qp;
	size_t bytes_copied;
	int ret = 0;

	res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
	if (!res) {
		mlx5_ib_dbg(dev, "wqe page fault for missing resource %d\n", pfault->wqe.wq_num);
		return;
	}

	if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ &&
	    res->res != MLX5_RES_XSRQ) {
		mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n",
			    pfault->type);
		goto resolve_page_fault;
	}

	wqe_start = (void *)__get_free_page(GFP_KERNEL);
	if (!wqe_start) {
		mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
		goto resolve_page_fault;
	}

	wqe = wqe_start;
	qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
	if (qp && sq) {
		ret = mlx5_ib_read_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
					  &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_initiator_pfault_handler(
			dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
	} else if (qp && !sq) {
		ret = mlx5_ib_read_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
					  &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_responder_pfault_handler_rq(
			dev, qp, wqe, &wqe_end, bytes_copied);
	} else if (!qp) {
		struct mlx5_ib_srq *srq = res_to_srq(res);

		ret = mlx5_ib_read_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
					   &bytes_copied);
		if (ret)
			goto read_user;
		ret = mlx5_ib_mr_responder_pfault_handler_srq(
			dev, srq, &wqe, &wqe_end, bytes_copied);
	}

	if (ret < 0 || wqe >= wqe_end)
		goto resolve_page_fault;

	ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped,
				      &total_wqe_bytes, !sq);
	if (ret == -EAGAIN)
		goto out;

	if (ret < 0 || total_wqe_bytes > bytes_mapped)
		goto resolve_page_fault;

out:
	ret = 0;
	resume_with_error = 0;

read_user:
	if (ret)
		mlx5_ib_err(
			dev,
			"Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %llx\n",
			ret, wqe_index, pfault->token);

resolve_page_fault:
	mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
		    pfault->wqe.wq_num, resume_with_error,
		    pfault->type);
	mlx5_core_res_put(res);
	free_page((unsigned long)wqe_start);
}
static int pages_in_range(u64 address, u32 length)
{
	return (ALIGN(address + length, PAGE_SIZE) -
		(address & PAGE_MASK)) >> PAGE_SHIFT;
}
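
/*
 * Handle an RDMA responder page fault (MLX5_PFAULT_SUBTYPE_RDMA): resolve the
 * pages needed by the faulting packet, resume the QP, and then, when the
 * exact message length is unknown, prefetch up to MAX_PREFETCH_LEN further
 * bytes outside the pfault context.
 */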
static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_dev *dev,
					   struct mlx5_pagefault *pfault)
{
	u64 address;
	u32 length;
	u32 prefetch_len = pfault->bytes_committed;
	int prefetch_activated = 0;
	u32 rkey = pfault->rdma.r_key;
	int ret;

	/* The RDMA responder handler handles the page fault in two parts.
	 * First it brings the necessary pages for the current packet
	 * (and uses the pfault context), and then (after resuming the QP)
	 * prefetches more pages. The second operation cannot use the pfault
	 * context and therefore uses the dummy_pfault context allocated on
	 * the stack.
	 */
	pfault->rdma.rdma_va += pfault->bytes_committed;
	pfault->rdma.rdma_op_len -= min(pfault->bytes_committed,
					 pfault->rdma.rdma_op_len);
	pfault->bytes_committed = 0;

	address = pfault->rdma.rdma_va;
	length  = pfault->rdma.rdma_op_len;

	/* For some operations, the hardware cannot tell the exact message
	 * length, and in those cases it reports zero. Use prefetch
	 * logic.
	 */
	if (length == 0) {
		prefetch_activated = 1;
		length = pfault->rdma.packet_size;
		prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
	}

	ret = pagefault_single_data_segment(dev, NULL, rkey, address, length,
					    &pfault->bytes_committed, NULL);
	if (ret == -EAGAIN) {
		/* We're racing with an invalidation, don't prefetch */
		prefetch_activated = 0;
	} else if (ret < 0 || pages_in_range(address, length) > ret) {
		mlx5_ib_page_fault_resume(dev, pfault, 1);
		if (ret != -ENOENT)
			mlx5_ib_dbg(dev, "PAGE FAULT error %d. QP 0x%llx, type: 0x%x\n",
				    ret, pfault->token, pfault->type);
		return;
	}

	mlx5_ib_page_fault_resume(dev, pfault, 0);
	mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%llx, type: 0x%x, prefetch_activated: %d\n",
		    pfault->token, pfault->type,
		    prefetch_activated);

	/* At this point, there might be a new pagefault already arriving in
	 * the eq, switch to the dummy pagefault for the rest of the
	 * processing. We're still OK with the objects being alive as the
	 * work-queue is being fenced. */

	if (prefetch_activated) {
		u32 bytes_committed = 0;

		ret = pagefault_single_data_segment(dev, NULL, rkey, address,
						    prefetch_len,
						    &bytes_committed, NULL);
		if (ret < 0 && ret != -EAGAIN) {
			mlx5_ib_dbg(dev, "Prefetch failed. ret: %d, QP 0x%llx, address: 0x%.16llx, length = 0x%.16x\n",
				    ret, pfault->token, address, prefetch_len);
		}
	}
}
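
/*
 * Handle a memory scheme page fault (MLX5_PFAULT_SUBTYPE_MEMORY): the event
 * carries the faulting mkey and VA directly. Prefetch hints around the
 * demanded range are attempted first and, if that fails, only the demanded
 * bytes are faulted in.
 */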
#define MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST BIT(7)
static void mlx5_ib_mr_memory_pfault_handler(struct mlx5_ib_dev *dev,
					     struct mlx5_pagefault *pfault)
{
	u64 prefetch_va =
		pfault->memory.va - pfault->memory.prefetch_before_byte_count;
	size_t prefetch_size = pfault->memory.prefetch_before_byte_count +
			       pfault->memory.fault_byte_count +
			       pfault->memory.prefetch_after_byte_count;
	struct mlx5_ib_mkey *mmkey;
	struct mlx5_ib_mr *mr, *child_mr;
	int ret = 0;

	mmkey = find_odp_mkey(dev, pfault->memory.mkey);
	if (IS_ERR(mmkey))
		goto err;

	switch (mmkey->type) {
	case MLX5_MKEY_IMPLICIT_CHILD:
		child_mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
		mr = child_mr->parent;
		break;
	case MLX5_MKEY_NULL:
		mr = container_of(mmkey, struct mlx5_ib_mr, null_mmkey);
		break;
	default:
		mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);
		break;
	}

	/* If prefetch fails, handle only demanded page fault */
	ret = pagefault_mr(mr, prefetch_va, prefetch_size, NULL, 0, true);
	if (ret < 0) {
		ret = pagefault_mr(mr, pfault->memory.va,
				   pfault->memory.fault_byte_count, NULL, 0,
				   true);
		if (ret < 0)
			goto err;
	}

	mlx5_update_odp_stats(mr, faults, ret);
	mlx5r_deref_odp_mkey(mmkey);

	if (pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST)
		mlx5_ib_page_fault_resume(dev, pfault, 0);

	mlx5_ib_dbg(
		dev,
		"PAGE FAULT completed %s. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x\n",
		pfault->memory.flags & MLX5_MEMORY_PAGE_FAULT_FLAGS_LAST ?
			"with resume cmd" :
			"without resume cmd",
		pfault->token, pfault->memory.mkey, pfault->memory.va,
		pfault->memory.fault_byte_count);

	return;

err:
	ret = IS_ERR(mmkey) ? PTR_ERR(mmkey) : ret;
	if (!IS_ERR(mmkey))
		mlx5r_deref_odp_mkey(mmkey);
	mlx5_ib_page_fault_resume(dev, pfault, 1);
	mlx5_ib_dbg(
		dev,
		"PAGE FAULT error. token 0x%llx, mkey: 0x%x, va: 0x%llx, byte_count: 0x%x, err: %d\n",
		pfault->token, pfault->memory.mkey, pfault->memory.va,
		pfault->memory.fault_byte_count, ret);
}
static void mlx5_ib_pfault(struct mlx5_ib_dev *dev, struct mlx5_pagefault *pfault)
{
	u8 event_subtype = pfault->event_subtype;

	switch (event_subtype) {
	case MLX5_PFAULT_SUBTYPE_WQE:
		mlx5_ib_mr_wqe_pfault_handler(dev, pfault);
		break;
	case MLX5_PFAULT_SUBTYPE_RDMA:
		mlx5_ib_mr_rdma_pfault_handler(dev, pfault);
		break;
	case MLX5_PFAULT_SUBTYPE_MEMORY:
		mlx5_ib_mr_memory_pfault_handler(dev, pfault);
		break;
	default:
		mlx5_ib_err(dev, "Invalid page fault event subtype: 0x%x\n",
			    event_subtype);
		mlx5_ib_page_fault_resume(dev, pfault, 1);
	}
}
*work
)
1583 struct mlx5_pagefault
*pfault
= container_of(work
,
1584 struct mlx5_pagefault
,
1586 struct mlx5_ib_pf_eq
*eq
= pfault
->eq
;
1588 mlx5_ib_pfault(eq
->dev
, pfault
);
1589 mempool_free(pfault
, eq
->pool
);
#define MEMORY_SCHEME_PAGE_FAULT_GRANULARITY 4096
static void mlx5_ib_eq_pf_process(struct mlx5_ib_pf_eq *eq)
{
	struct mlx5_eqe_page_fault *pf_eqe;
	struct mlx5_pagefault *pfault;
	struct mlx5_eqe *eqe;
	int cc = 0;

	while ((eqe = mlx5_eq_get_eqe(eq->core, cc))) {
		pfault = mempool_alloc(eq->pool, GFP_ATOMIC);
		if (!pfault) {
			schedule_work(&eq->work);
			break;
		}

		pf_eqe = &eqe->data.page_fault;
		pfault->event_subtype = eqe->sub_type;

		switch (eqe->sub_type) {
		case MLX5_PFAULT_SUBTYPE_RDMA:
			/* RDMA based event */
			pfault->bytes_committed =
				be32_to_cpu(pf_eqe->rdma.bytes_committed);
			pfault->type =
				be32_to_cpu(pf_eqe->rdma.pftype_token) >> 24;
			pfault->token =
				be32_to_cpu(pf_eqe->rdma.pftype_token) &
				MLX5_24BIT_MASK;
			pfault->rdma.r_key =
				be32_to_cpu(pf_eqe->rdma.r_key);
			pfault->rdma.packet_size =
				be16_to_cpu(pf_eqe->rdma.packet_length);
			pfault->rdma.rdma_op_len =
				be32_to_cpu(pf_eqe->rdma.rdma_op_len);
			pfault->rdma.rdma_va =
				be64_to_cpu(pf_eqe->rdma.rdma_va);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, r_key: 0x%08x\n",
				    eqe->sub_type, pfault->bytes_committed,
				    pfault->type, pfault->token,
				    pfault->rdma.r_key);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: rdma_op_len: 0x%08x, rdma_va: 0x%016llx\n",
				    pfault->rdma.rdma_op_len,
				    pfault->rdma.rdma_va);
			break;

		case MLX5_PFAULT_SUBTYPE_WQE:
			/* WQE based event */
			pfault->bytes_committed =
				be32_to_cpu(pf_eqe->wqe.bytes_committed);
			pfault->type =
				(be32_to_cpu(pf_eqe->wqe.pftype_wq) >> 24) & 0x7;
			pfault->token =
				be32_to_cpu(pf_eqe->wqe.token);
			pfault->wqe.wq_num =
				be32_to_cpu(pf_eqe->wqe.pftype_wq) &
				MLX5_24BIT_MASK;
			pfault->wqe.wqe_index =
				be16_to_cpu(pf_eqe->wqe.wqe_index);
			pfault->wqe.packet_size =
				be16_to_cpu(pf_eqe->wqe.packet_length);
			mlx5_ib_dbg(eq->dev,
				    "PAGE_FAULT: subtype: 0x%02x, bytes_committed: 0x%06x, type:0x%x, token: 0x%06llx, wq_num: 0x%06x, wqe_index: 0x%04x\n",
				    eqe->sub_type, pfault->bytes_committed,
				    pfault->type, pfault->token, pfault->wqe.wq_num,
				    pfault->wqe.wqe_index);
			break;

		case MLX5_PFAULT_SUBTYPE_MEMORY:
			/* Memory based event */
			pfault->bytes_committed = 0;
			pfault->token =
				be32_to_cpu(pf_eqe->memory.token31_0) |
				((u64)be16_to_cpu(pf_eqe->memory.token47_32)
				 << 32);
			pfault->memory.va = be64_to_cpu(pf_eqe->memory.va);
			pfault->memory.mkey = be32_to_cpu(pf_eqe->memory.mkey);
			pfault->memory.fault_byte_count = (be32_to_cpu(
				pf_eqe->memory.demand_fault_pages) >> 12) *
				MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;
			pfault->memory.prefetch_before_byte_count =
				be16_to_cpu(
					pf_eqe->memory.pre_demand_fault_pages) *
				MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;
			pfault->memory.prefetch_after_byte_count =
				be16_to_cpu(
					pf_eqe->memory.post_demand_fault_pages) *
				MEMORY_SCHEME_PAGE_FAULT_GRANULARITY;
			pfault->memory.flags = pf_eqe->memory.flags;
			mlx5_ib_dbg(
				eq->dev,
				"PAGE_FAULT: subtype: 0x%02x, token: 0x%06llx, mkey: 0x%06x, fault_byte_count: 0x%06x, va: 0x%016llx, flags: 0x%02x\n",
				eqe->sub_type, pfault->token,
				pfault->memory.mkey,
				pfault->memory.fault_byte_count,
				pfault->memory.va, pfault->memory.flags);
			mlx5_ib_dbg(
				eq->dev,
				"PAGE_FAULT: prefetch size: before: 0x%06x, after 0x%06x\n",
				pfault->memory.prefetch_before_byte_count,
				pfault->memory.prefetch_after_byte_count);
			break;

		default:
			mlx5_ib_warn(eq->dev,
				     "Unsupported page fault event sub-type: 0x%02hhx\n",
				     eqe->sub_type);
			/* Unsupported page faults should still be
			 * resolved by the page fault handler
			 */
		}

		pfault->eq = eq;
		INIT_WORK(&pfault->work, mlx5_ib_eqe_pf_action);
		queue_work(eq->wq, &pfault->work);

		cc = mlx5_eq_update_cc(eq->core, ++cc);
	}

	mlx5_eq_update_ci(eq->core, cc, 1);
}
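
/*
 * EQ interrupt notifier: process completions directly if the EQ lock can be
 * taken without spinning, otherwise defer to the EQ work item, which also
 * refills the pagefault mempool.
 */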
static int mlx5_ib_eq_pf_int(struct notifier_block *nb, unsigned long type,
			     void *data)
{
	struct mlx5_ib_pf_eq *eq =
		container_of(nb, struct mlx5_ib_pf_eq, irq_nb);
	unsigned long flags;

	if (spin_trylock_irqsave(&eq->lock, flags)) {
		mlx5_ib_eq_pf_process(eq);
		spin_unlock_irqrestore(&eq->lock, flags);
	} else {
		schedule_work(&eq->work);
	}

	return IRQ_HANDLED;
}
/* mempool_refill() was proposed but unfortunately wasn't accepted
 * http://lkml.iu.edu/hypermail/linux/kernel/1512.1/05073.html
 */
static void mempool_refill(mempool_t *pool)
{
	while (pool->curr_nr < pool->min_nr)
		mempool_free(mempool_alloc(pool, GFP_KERNEL), pool);
}

static void mlx5_ib_eq_pf_action(struct work_struct *work)
{
	struct mlx5_ib_pf_eq *eq =
		container_of(work, struct mlx5_ib_pf_eq, work);

	mempool_refill(eq->pool);

	spin_lock_irq(&eq->lock);
	mlx5_ib_eq_pf_process(eq);
	spin_unlock_irq(&eq->lock);
}
enum {
	MLX5_IB_NUM_PF_EQE	= 0x1000,
	MLX5_IB_NUM_PF_DRAIN	= 64,
};

int mlx5r_odp_create_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
{
	struct mlx5_eq_param param = {};
	int err = 0;

	mutex_lock(&dev->odp_eq_mutex);
	if (eq->core)
		goto unlock;
	INIT_WORK(&eq->work, mlx5_ib_eq_pf_action);
	spin_lock_init(&eq->lock);
	eq->dev = dev;

	eq->pool = mempool_create_kmalloc_pool(MLX5_IB_NUM_PF_DRAIN,
					       sizeof(struct mlx5_pagefault));
	if (!eq->pool) {
		err = -ENOMEM;
		goto unlock;
	}

	eq->wq = alloc_workqueue("mlx5_ib_page_fault",
				 WQ_HIGHPRI | WQ_UNBOUND | WQ_MEM_RECLAIM,
				 MLX5_NUM_CMD_EQE);
	if (!eq->wq) {
		err = -ENOMEM;
		goto err_mempool;
	}

	eq->irq_nb.notifier_call = mlx5_ib_eq_pf_int;
	param = (struct mlx5_eq_param) {
		.nent = MLX5_IB_NUM_PF_EQE,
	};
	param.mask[0] = 1ull << MLX5_EVENT_TYPE_PAGE_FAULT;
	eq->core = mlx5_eq_create_generic(dev->mdev, &param);
	if (IS_ERR(eq->core)) {
		err = PTR_ERR(eq->core);
		goto err_wq;
	}
	err = mlx5_eq_enable(dev->mdev, eq->core, &eq->irq_nb);
	if (err) {
		mlx5_ib_err(dev, "failed to enable odp EQ %d\n", err);
		goto err_eq;
	}

	mutex_unlock(&dev->odp_eq_mutex);
	return 0;
err_eq:
	mlx5_eq_destroy_generic(dev->mdev, eq->core);
err_wq:
	eq->core = NULL;
	destroy_workqueue(eq->wq);
err_mempool:
	mempool_destroy(eq->pool);
unlock:
	mutex_unlock(&dev->odp_eq_mutex);
	return err;
}
static int
mlx5_ib_odp_destroy_eq(struct mlx5_ib_dev *dev, struct mlx5_ib_pf_eq *eq)
{
	int err;

	if (!eq->core)
		return 0;
	mlx5_eq_disable(dev->mdev, eq->core, &eq->irq_nb);
	err = mlx5_eq_destroy_generic(dev->mdev, eq->core);
	cancel_work_sync(&eq->work);
	destroy_workqueue(eq->wq);
	mempool_destroy(eq->pool);

	return err;
}
int mlx5_odp_init_mkey_cache(struct mlx5_ib_dev *dev)
{
	struct mlx5r_cache_rb_key rb_key = {
		.access_mode = MLX5_MKC_ACCESS_MODE_KSM,
		.ndescs = mlx5_imr_ksm_entries,
	};
	struct mlx5_cache_ent *ent;

	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
		return 0;

	ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
	if (IS_ERR(ent))
		return PTR_ERR(ent);

	return 0;
}
static const struct ib_device_ops mlx5_ib_dev_odp_ops = {
	.advise_mr = mlx5_ib_advise_mr,
};
int mlx5_ib_odp_init_one(struct mlx5_ib_dev *dev)
{
	internal_fill_odp_caps(dev);

	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
		return 0;

	ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_odp_ops);

	mutex_init(&dev->odp_eq_mutex);
	return 0;
}

void mlx5_ib_odp_cleanup_one(struct mlx5_ib_dev *dev)
{
	if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT))
		return;

	mlx5_ib_odp_destroy_eq(dev, &dev->odp_pf_eq);
}
int mlx5_ib_odp_init(void)
{
	mlx5_imr_ksm_entries = BIT_ULL(get_order(TASK_SIZE) -
				       MLX5_IMR_MTT_BITS);

	return 0;
}
struct prefetch_mr_work {
	struct work_struct work;
	u32 pf_flags;
	u32 num_sge;
	struct {
		u64 io_virt;
		struct mlx5_ib_mr *mr;
		size_t length;
	} frags[];
};
static void destroy_prefetch_work(struct prefetch_mr_work *work)
{
	u32 i;

	for (i = 0; i < work->num_sge; ++i)
		mlx5r_deref_odp_mkey(&work->frags[i].mr->mmkey);

	kvfree(work);
}
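
/*
 * Look up an lkey for prefetch and take a usecount reference on it. Only
 * plain MRs belonging to the given PD are eligible, and prefetch-for-write
 * requires a writable umem.
 */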
static struct mlx5_ib_mr *
get_prefetchable_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice,
		    u32 lkey)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	struct mlx5_ib_mkey *mmkey;

	xa_lock(&dev->odp_mkeys);
	mmkey = xa_load(&dev->odp_mkeys, mlx5_base_mkey(lkey));
	if (!mmkey || mmkey->key != lkey) {
		mr = ERR_PTR(-ENOENT);
		goto end;
	}
	if (mmkey->type != MLX5_MKEY_MR) {
		mr = ERR_PTR(-EINVAL);
		goto end;
	}

	mr = container_of(mmkey, struct mlx5_ib_mr, mmkey);

	if (mr->ibmr.pd != pd) {
		mr = ERR_PTR(-EPERM);
		goto end;
	}

	/* prefetch with write-access must be supported by the MR */
	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
	    !mr->umem->writable) {
		mr = ERR_PTR(-EPERM);
		goto end;
	}

	refcount_inc(&mmkey->usecount);
end:
	xa_unlock(&dev->odp_mkeys);
	return mr;
}
static void mlx5_ib_prefetch_mr_work(struct work_struct *w)
{
	struct prefetch_mr_work *work =
		container_of(w, struct prefetch_mr_work, work);
	u32 bytes_mapped = 0;
	int ret;
	u32 i;

	/* We rely on IB/core that work is executed if we have num_sge != 0 only. */
	WARN_ON(!work->num_sge);
	for (i = 0; i < work->num_sge; ++i) {
		ret = pagefault_mr(work->frags[i].mr, work->frags[i].io_virt,
				   work->frags[i].length, &bytes_mapped,
				   work->pf_flags, false);
		if (ret <= 0)
			continue;
		mlx5_update_odp_stats(work->frags[i].mr, prefetch, ret);
	}

	destroy_prefetch_work(work);
}
static int init_prefetch_work(struct ib_pd *pd,
			      enum ib_uverbs_advise_mr_advice advice,
			      u32 pf_flags, struct prefetch_mr_work *work,
			      struct ib_sge *sg_list, u32 num_sge)
{
	u32 i;

	INIT_WORK(&work->work, mlx5_ib_prefetch_mr_work);
	work->pf_flags = pf_flags;

	for (i = 0; i < num_sge; ++i) {
		struct mlx5_ib_mr *mr;

		mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
		if (IS_ERR(mr)) {
			work->num_sge = i;
			return PTR_ERR(mr);
		}
		work->frags[i].io_virt = sg_list[i].addr;
		work->frags[i].length = sg_list[i].length;
		work->frags[i].mr = mr;
	}
	work->num_sge = num_sge;
	return 0;
}
static int mlx5_ib_prefetch_sg_list(struct ib_pd *pd,
				    enum ib_uverbs_advise_mr_advice advice,
				    u32 pf_flags, struct ib_sge *sg_list,
				    u32 num_sge)
{
	u32 bytes_mapped = 0;
	int ret = 0;
	u32 i;

	for (i = 0; i < num_sge; ++i) {
		struct mlx5_ib_mr *mr;

		mr = get_prefetchable_mr(pd, advice, sg_list[i].lkey);
		if (IS_ERR(mr))
			return PTR_ERR(mr);
		ret = pagefault_mr(mr, sg_list[i].addr, sg_list[i].length,
				   &bytes_mapped, pf_flags, false);
		if (ret < 0) {
			mlx5r_deref_odp_mkey(&mr->mmkey);
			return ret;
		}
		mlx5_update_odp_stats(mr, prefetch, ret);
		mlx5r_deref_odp_mkey(&mr->mmkey);
	}

	return 0;
}
int mlx5_ib_advise_mr_prefetch(struct ib_pd *pd,
			       enum ib_uverbs_advise_mr_advice advice,
			       u32 flags, struct ib_sge *sg_list, u32 num_sge)
{
	u32 pf_flags = 0;
	struct prefetch_mr_work *work;
	int rc;

	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH)
		pf_flags |= MLX5_PF_FLAGS_DOWNGRADE;

	if (advice == IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
		pf_flags |= MLX5_PF_FLAGS_SNAPSHOT;

	if (flags & IB_UVERBS_ADVISE_MR_FLAG_FLUSH)
		return mlx5_ib_prefetch_sg_list(pd, advice, pf_flags, sg_list,
						num_sge);

	work = kvzalloc(struct_size(work, frags, num_sge), GFP_KERNEL);
	if (!work)
		return -ENOMEM;

	rc = init_prefetch_work(pd, advice, pf_flags, work, sg_list, num_sge);
	if (rc) {
		destroy_prefetch_work(work);
		return rc;
	}
	queue_work(system_unbound_wq, &work->work);
	return 0;
}