/*
 * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
 * Copyright (c) 2020, Intel Corporation. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/kref.h>
#include <linux/random.h>
#include <linux/debugfs.h>
#include <linux/export.h>
#include <linux/delay.h>
#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <rdma/ib_umem_odp.h>
#include "data_direct.h"

enum {
	MAX_PENDING_REG_MR = 8,
};

#define MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS 4
#define MLX5_UMR_ALIGN 2048
static void create_mkey_callback(int status, struct mlx5_async_work *context);
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned int page_size, bool populate,
				     int access_mode);
static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr);
static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr,
					  struct ib_pd *pd)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);

	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
	MLX5_SET(mkc, mkc, lr, 1);

	if (acc & IB_ACCESS_RELAXED_ORDERING) {
		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write))
			MLX5_SET(mkc, mkc, relaxed_ordering_write, 1);

		if (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
		    (MLX5_CAP_GEN(dev->mdev,
				  relaxed_ordering_read_pci_enabled) &&
		     pcie_relaxed_ordering_enabled(dev->mdev->pdev)))
			MLX5_SET(mkc, mkc, relaxed_ordering_read, 1);
	}

	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET64(mkc, mkc, start_addr, start_addr);
}
static void assign_mkey_variant(struct mlx5_ib_dev *dev, u32 *mkey, u32 *in)
{
	u8 key = atomic_inc_return(&dev->mkey_var);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, mkey_7_0, key);
	*mkey = key;
}

static int mlx5_ib_create_mkey(struct mlx5_ib_dev *dev,
			       struct mlx5_ib_mkey *mkey, u32 *in, int inlen)
{
	int ret;

	assign_mkey_variant(dev, &mkey->key, in);
	ret = mlx5_core_create_mkey(dev->mdev, &mkey->key, in, inlen);
	if (!ret)
		init_waitqueue_head(&mkey->wait);

	return ret;
}
static int mlx5_ib_create_mkey_cb(struct mlx5r_async_create_mkey *async_create)
{
	struct mlx5_ib_dev *dev = async_create->ent->dev;
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	size_t outlen = MLX5_ST_SZ_BYTES(create_mkey_out);

	MLX5_SET(create_mkey_in, async_create->in, opcode,
		 MLX5_CMD_OP_CREATE_MKEY);
	assign_mkey_variant(dev, &async_create->mkey, async_create->in);
	return mlx5_cmd_exec_cb(&dev->async_ctx, async_create->in, inlen,
				async_create->out, outlen, create_mkey_callback,
				&async_create->cb_work);
}

static int mkey_cache_max_order(struct mlx5_ib_dev *dev);
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent);

static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
{
	WARN_ON(xa_load(&dev->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)));

	return mlx5_core_destroy_mkey(dev->mdev, mr->mmkey.key);
}
static void create_mkey_warn(struct mlx5_ib_dev *dev, int status, void *out)
{
	if (status == -ENXIO) /* core driver is not available */
		return;

	mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
	if (status != -EREMOTEIO) /* driver specific failure */
		return;

	/* Failed in FW, print cmd out failure details */
	mlx5_cmd_out_err(dev->mdev, MLX5_CMD_OP_CREATE_MKEY, 0, out);
}
static int push_mkey_locked(struct mlx5_cache_ent *ent, u32 mkey)
{
	unsigned long tmp = ent->mkeys_queue.ci % NUM_MKEYS_PER_PAGE;
	struct mlx5_mkeys_page *page;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	if (ent->mkeys_queue.ci >=
	    ent->mkeys_queue.num_pages * NUM_MKEYS_PER_PAGE) {
		page = kzalloc(sizeof(*page), GFP_ATOMIC);
		if (!page)
			return -ENOMEM;
		ent->mkeys_queue.num_pages++;
		list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
	} else {
		page = list_last_entry(&ent->mkeys_queue.pages_list,
				       struct mlx5_mkeys_page, list);
	}

	page->mkeys[tmp] = mkey;
	ent->mkeys_queue.ci++;
	return 0;
}
static int pop_mkey_locked(struct mlx5_cache_ent *ent)
{
	unsigned long tmp = (ent->mkeys_queue.ci - 1) % NUM_MKEYS_PER_PAGE;
	struct mlx5_mkeys_page *last_page;
	u32 mkey;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	last_page = list_last_entry(&ent->mkeys_queue.pages_list,
				    struct mlx5_mkeys_page, list);
	mkey = last_page->mkeys[tmp];
	last_page->mkeys[tmp] = 0;
	ent->mkeys_queue.ci--;
	if (ent->mkeys_queue.num_pages > 1 && !tmp) {
		list_del(&last_page->list);
		ent->mkeys_queue.num_pages--;
		kfree(last_page);
	}
	return mkey;
}
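/*
 * Illustrative sketch, not part of the driver: push_mkey_locked() and
 * pop_mkey_locked() above treat mkeys_queue as a paged LIFO of mkey values. A
 * new page is appended only when the last page is full, and the last page is
 * freed once it empties. The standalone userspace model below uses invented
 * toy_* names, a 4-slot page, a simple prev pointer instead of the kernel
 * list, plain calloc()/free(), no locking, and assumes the queue starts with
 * one page (as mlx5r_mkeys_init() arranges).
 *
 *	#include <stdint.h>
 *	#include <stdlib.h>
 *
 *	#define TOY_MKEYS_PER_PAGE 4
 *
 *	struct toy_page {
 *		uint32_t mkeys[TOY_MKEYS_PER_PAGE];
 *		struct toy_page *prev;
 *	};
 *
 *	struct toy_queue {
 *		struct toy_page *last;	// most recently added page
 *		unsigned long ci;	// number of stored mkeys
 *		unsigned long num_pages;
 *	};
 *
 *	static int toy_push(struct toy_queue *q, uint32_t mkey)
 *	{
 *		unsigned long slot = q->ci % TOY_MKEYS_PER_PAGE;
 *
 *		if (q->ci >= q->num_pages * TOY_MKEYS_PER_PAGE) {
 *			struct toy_page *page = calloc(1, sizeof(*page));
 *
 *			if (!page)
 *				return -1;
 *			page->prev = q->last;
 *			q->last = page;
 *			q->num_pages++;
 *		}
 *		q->last->mkeys[slot] = mkey;
 *		q->ci++;
 *		return 0;
 *	}
 *
 *	static uint32_t toy_pop(struct toy_queue *q)
 *	{
 *		unsigned long slot = (q->ci - 1) % TOY_MKEYS_PER_PAGE;
 *		uint32_t mkey = q->last->mkeys[slot];
 *
 *		q->ci--;
 *		if (q->num_pages > 1 && !slot) {  // last page just emptied
 *			struct toy_page *page = q->last;
 *
 *			q->last = page->prev;
 *			q->num_pages--;
 *			free(page);
 *		}
 *		return mkey;
 *	}
 */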
static void create_mkey_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5r_async_create_mkey *mkey_out =
		container_of(context, struct mlx5r_async_create_mkey, cb_work);
	struct mlx5_cache_ent *ent = mkey_out->ent;
	struct mlx5_ib_dev *dev = ent->dev;
	unsigned long flags;

	if (status) {
		create_mkey_warn(dev, status, mkey_out->out);
		kfree(mkey_out);
		spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
		ent->pending--;
		WRITE_ONCE(dev->fill_delay, 1);
		spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
		mod_timer(&dev->delay_timer, jiffies + HZ);
		return;
	}

	mkey_out->mkey |= mlx5_idx_to_mkey(
		MLX5_GET(create_mkey_out, mkey_out->out, mkey_index));
	WRITE_ONCE(dev->cache.last_add, jiffies);

	spin_lock_irqsave(&ent->mkeys_queue.lock, flags);
	push_mkey_locked(ent, mkey_out->mkey);
	ent->pending--;
	/* If we are doing fill_to_high_water then keep going. */
	queue_adjust_cache_locked(ent);
	spin_unlock_irqrestore(&ent->mkeys_queue.lock, flags);
	kfree(mkey_out);
}
static int get_mkc_octo_size(unsigned int access_mode, unsigned int ndescs)
{
	int ret = 0;

	switch (access_mode) {
	case MLX5_MKC_ACCESS_MODE_MTT:
		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
						   sizeof(struct mlx5_mtt));
		break;
	case MLX5_MKC_ACCESS_MODE_KSM:
		ret = DIV_ROUND_UP(ndescs, MLX5_IB_UMR_OCTOWORD /
						   sizeof(struct mlx5_klm));
		break;
	default:
		WARN_ON(1);
	}
	return ret;
}

static void set_cache_mkc(struct mlx5_cache_ent *ent, void *mkc)
{
	set_mkc_access_pd_addr_fields(mkc, ent->rb_key.access_flags, 0,
				      ent->dev->umrc.pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, ent->rb_key.access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2,
		 (ent->rb_key.access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, ma_translation_mode, !!ent->rb_key.ats);

	MLX5_SET(mkc, mkc, translations_octword_size,
		 get_mkc_octo_size(ent->rb_key.access_mode,
				   ent->rb_key.ndescs));
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
}
/* Asynchronously schedule new MRs to be populated in the cache. */
static int add_keys(struct mlx5_cache_ent *ent, unsigned int num)
{
	struct mlx5r_async_create_mkey *async_create;
	void *mkc;
	int err = 0;
	int i;

	for (i = 0; i < num; i++) {
		async_create = kzalloc(sizeof(struct mlx5r_async_create_mkey),
				       GFP_KERNEL);
		if (!async_create)
			return -ENOMEM;
		mkc = MLX5_ADDR_OF(create_mkey_in, async_create->in,
				   memory_key_mkey_entry);
		set_cache_mkc(ent, mkc);
		async_create->ent = ent;

		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->pending >= MAX_PENDING_REG_MR) {
			err = -EAGAIN;
			goto free_async_create;
		}
		ent->pending++;
		spin_unlock_irq(&ent->mkeys_queue.lock);

		err = mlx5_ib_create_mkey_cb(async_create);
		if (err) {
			mlx5_ib_warn(ent->dev, "create mkey failed %d\n", err);
			goto err_create_mkey;
		}
	}

	return 0;

err_create_mkey:
	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->pending--;
free_async_create:
	spin_unlock_irq(&ent->mkeys_queue.lock);
	kfree(async_create);
	return err;
}

/* Synchronously create a MR in the cache */
static int create_cache_mkey(struct mlx5_cache_ent *ent, u32 *mkey)
{
	size_t inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	void *mkc;
	u32 *in;
	int err;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_cache_mkc(ent, mkc);

	err = mlx5_core_create_mkey(ent->dev->mdev, mkey, in, inlen);
	if (err)
		goto free_in;

	WRITE_ONCE(ent->dev->cache.last_add, jiffies);
free_in:
	kfree(in);
	return err;
}
static void remove_cache_mr_locked(struct mlx5_cache_ent *ent)
{
	u32 mkey;

	lockdep_assert_held(&ent->mkeys_queue.lock);
	if (!ent->mkeys_queue.ci)
		return;
	mkey = pop_mkey_locked(ent);
	spin_unlock_irq(&ent->mkeys_queue.lock);
	mlx5_core_destroy_mkey(ent->dev->mdev, mkey);
	spin_lock_irq(&ent->mkeys_queue.lock);
}

static int resize_available_mrs(struct mlx5_cache_ent *ent, unsigned int target,
				bool limit_fill)
	__acquires(&ent->mkeys_queue.lock) __releases(&ent->mkeys_queue.lock)
{
	int err;

	lockdep_assert_held(&ent->mkeys_queue.lock);

	while (true) {
		if (limit_fill)
			target = ent->limit * 2;
		if (target == ent->pending + ent->mkeys_queue.ci)
			return 0;
		if (target > ent->pending + ent->mkeys_queue.ci) {
			u32 todo = target - (ent->pending + ent->mkeys_queue.ci);

			spin_unlock_irq(&ent->mkeys_queue.lock);
			err = add_keys(ent, todo);
			if (err == -EAGAIN)
				usleep_range(3000, 5000);
			spin_lock_irq(&ent->mkeys_queue.lock);
			if (err) {
				if (err != -EAGAIN)
					return err;
			} else
				return 0;
		} else
			remove_cache_mr_locked(ent);
	}
}
static ssize_t size_write(struct file *filp, const char __user *buf,
			  size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 target;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &target);
	if (err)
		return err;

	/*
	 * Target is the new value of total_mrs the user requests, however we
	 * cannot free MRs that are in use. Compute the target value for stored
	 * mkeys.
	 */
	spin_lock_irq(&ent->mkeys_queue.lock);
	if (target < ent->in_use) {
		err = -EINVAL;
		goto err_unlock;
	}
	target = target - ent->in_use;
	if (target < ent->limit || target > ent->limit * 2) {
		err = -EINVAL;
		goto err_unlock;
	}
	err = resize_available_mrs(ent, target, false);
	if (err)
		goto err_unlock;
	spin_unlock_irq(&ent->mkeys_queue.lock);

	return count;

err_unlock:
	spin_unlock_irq(&ent->mkeys_queue.lock);
	return err;
}

static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
			 loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%ld\n",
		       ent->mkeys_queue.ci + ent->in_use);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations size_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= size_write,
	.read	= size_read,
};
static ssize_t limit_write(struct file *filp, const char __user *buf,
			   size_t count, loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	u32 var;
	int err;

	err = kstrtou32_from_user(buf, count, 0, &var);
	if (err)
		return err;

	/*
	 * Upon set we immediately fill the cache to high water mark implied by
	 * the limit.
	 */
	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->limit = var;
	err = resize_available_mrs(ent, 0, true);
	spin_unlock_irq(&ent->mkeys_queue.lock);
	if (err)
		return err;
	return count;
}

static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
			  loff_t *pos)
{
	struct mlx5_cache_ent *ent = filp->private_data;
	char lbuf[20];
	int err;

	err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
	if (err < 0)
		return err;

	return simple_read_from_buffer(buf, count, pos, lbuf, err);
}

static const struct file_operations limit_fops = {
	.owner	= THIS_MODULE,
	.open	= simple_open,
	.write	= limit_write,
	.read	= limit_read,
};
static bool someone_adding(struct mlx5_mkey_cache *cache)
{
	struct mlx5_cache_ent *ent;
	struct rb_node *node;
	bool ret;

	mutex_lock(&cache->rb_lock);
	for (node = rb_first(&cache->rb_root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		ret = ent->mkeys_queue.ci < ent->limit;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		if (ret) {
			mutex_unlock(&cache->rb_lock);
			return true;
		}
	}
	mutex_unlock(&cache->rb_lock);
	return false;
}
/*
 * Check if the bucket is outside the high/low water mark and schedule an async
 * update. The cache refill has hysteresis, once the low water mark is hit it is
 * refilled up to the high mark.
 */
static void queue_adjust_cache_locked(struct mlx5_cache_ent *ent)
{
	lockdep_assert_held(&ent->mkeys_queue.lock);

	if (ent->disabled || READ_ONCE(ent->dev->fill_delay) || ent->is_tmp)
		return;
	if (ent->mkeys_queue.ci < ent->limit) {
		ent->fill_to_high_water = true;
		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	} else if (ent->fill_to_high_water &&
		   ent->mkeys_queue.ci + ent->pending < 2 * ent->limit) {
		/*
		 * Once we start populating due to hitting a low water mark
		 * continue until we pass the high water mark.
		 */
		mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	} else if (ent->mkeys_queue.ci == 2 * ent->limit) {
		ent->fill_to_high_water = false;
	} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
		/* Queue deletion of excess entries */
		ent->fill_to_high_water = false;
		if (ent->pending)
			queue_delayed_work(ent->dev->cache.wq, &ent->dwork,
					   msecs_to_jiffies(1000));
		else
			mod_delayed_work(ent->dev->cache.wq, &ent->dwork, 0);
	}
}
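/*
 * Illustrative sketch, not part of the driver: the hysteresis implemented by
 * queue_adjust_cache_locked() keeps an entry between 'limit' (low water mark)
 * and '2 * limit' (high water mark). The standalone model below collapses the
 * work-queue scheduling into a returned action; the toy_* names and the enum
 * are invented, and the pending-dependent delay used when trimming is ignored.
 *
 *	enum toy_action { TOY_NONE, TOY_FILL_NOW, TOY_TRIM_LATER };
 *
 *	struct toy_ent {
 *		unsigned long avail;	// mkeys_queue.ci
 *		unsigned long pending;	// outstanding async creations
 *		unsigned long limit;	// low water mark
 *		int fill_to_high_water;
 *	};
 *
 *	static enum toy_action toy_adjust(struct toy_ent *e)
 *	{
 *		if (e->avail < e->limit) {	// below low water mark
 *			e->fill_to_high_water = 1;
 *			return TOY_FILL_NOW;
 *		}
 *		if (e->fill_to_high_water &&
 *		    e->avail + e->pending < 2 * e->limit)
 *			return TOY_FILL_NOW;	// keep going until 2 * limit
 *		if (e->avail >= 2 * e->limit) {
 *			e->fill_to_high_water = 0;
 *			if (e->avail > 2 * e->limit)	// shrink the excess
 *				return TOY_TRIM_LATER;
 *		}
 *		return TOY_NONE;
 *	}
 */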
static void clean_keys(struct mlx5_ib_dev *dev, struct mlx5_cache_ent *ent)
{
	u32 mkey;

	spin_lock_irq(&ent->mkeys_queue.lock);
	while (ent->mkeys_queue.ci) {
		mkey = pop_mkey_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
		mlx5_core_destroy_mkey(dev->mdev, mkey);
		spin_lock_irq(&ent->mkeys_queue.lock);
	}
	ent->tmp_cleanup_scheduled = false;
	spin_unlock_irq(&ent->mkeys_queue.lock);
}
static void __cache_work_func(struct mlx5_cache_ent *ent)
{
	struct mlx5_ib_dev *dev = ent->dev;
	struct mlx5_mkey_cache *cache = &dev->cache;
	int err;

	spin_lock_irq(&ent->mkeys_queue.lock);
	if (ent->disabled)
		goto out;

	if (ent->fill_to_high_water &&
	    ent->mkeys_queue.ci + ent->pending < 2 * ent->limit &&
	    !READ_ONCE(dev->fill_delay)) {
		spin_unlock_irq(&ent->mkeys_queue.lock);
		err = add_keys(ent, 1);
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->disabled)
			goto out;
		if (err) {
			/*
			 * EAGAIN only happens if there are pending MRs, so we
			 * will be rescheduled when storing them. The only
			 * failure path here is ENOMEM.
			 */
			if (err != -EAGAIN) {
				mlx5_ib_warn(
					dev,
					"add keys command failed, err %d\n",
					err);
				queue_delayed_work(cache->wq, &ent->dwork,
						   msecs_to_jiffies(1000));
			}
		}
	} else if (ent->mkeys_queue.ci > 2 * ent->limit) {
		bool need_delay;

		/*
		 * The remove_cache_mr() logic is performed as garbage
		 * collection task. Such task is intended to be run when no
		 * other active processes are running.
		 *
		 * The need_resched() will return TRUE if there are user tasks
		 * to be activated in near future.
		 *
		 * In such case, we don't execute remove_cache_mr() and postpone
		 * the garbage collection work to try to run in next cycle, in
		 * order to free CPU resources to other tasks.
		 */
		spin_unlock_irq(&ent->mkeys_queue.lock);
		need_delay = need_resched() || someone_adding(cache) ||
			     !time_after(jiffies,
					 READ_ONCE(cache->last_add) + 300 * HZ);
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->disabled)
			goto out;
		if (need_delay) {
			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
			goto out;
		}
		remove_cache_mr_locked(ent);
		queue_adjust_cache_locked(ent);
	}
out:
	spin_unlock_irq(&ent->mkeys_queue.lock);
}
static void delayed_cache_work_func(struct work_struct *work)
{
	struct mlx5_cache_ent *ent;

	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
	/* temp entries are never filled, only cleaned */
	if (ent->is_tmp)
		clean_keys(ent->dev, ent);
	else
		__cache_work_func(ent);
}
static int cache_ent_key_cmp(struct mlx5r_cache_rb_key key1,
			     struct mlx5r_cache_rb_key key2)
{
	int res;

	res = key1.ats - key2.ats;
	if (res)
		return res;

	res = key1.access_mode - key2.access_mode;
	if (res)
		return res;

	res = key1.access_flags - key2.access_flags;
	if (res)
		return res;

	/*
	 * keep ndescs the last in the compare table since the find function
	 * searches for an exact match on all properties and only closest
	 * match in case of ndescs
	 */
	return key1.ndescs - key2.ndescs;
}
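/*
 * Illustrative sketch, not part of the driver: because ndescs is compared
 * last, rb-tree entries that differ only in size sort next to each other, so
 * the lookup below can fall back to the next-larger entry when there is no
 * exact size match. Standalone model with invented toy_* names:
 *
 *	struct toy_rb_key {
 *		int ats, access_mode, access_flags, ndescs;
 *	};
 *
 *	static int toy_key_cmp(struct toy_rb_key a, struct toy_rb_key b)
 *	{
 *		int res;
 *
 *		res = a.ats - b.ats;
 *		if (res)
 *			return res;
 *		res = a.access_mode - b.access_mode;
 *		if (res)
 *			return res;
 *		res = a.access_flags - b.access_flags;
 *		if (res)
 *			return res;
 *		return a.ndescs - b.ndescs;	// size decides only at the end
 *	}
 *
 *	// Example: a request for 5 descriptors compares "smaller" than an
 *	// otherwise identical 8-descriptor entry, so an rb-tree walk can
 *	// remember that entry as the smallest candidate able to hold it.
 */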
static int mlx5_cache_ent_insert(struct mlx5_mkey_cache *cache,
				 struct mlx5_cache_ent *ent)
{
	struct rb_node **new = &cache->rb_root.rb_node, *parent = NULL;
	struct mlx5_cache_ent *cur;
	int cmp;

	/* Figure out where to put new node */
	while (*new) {
		cur = rb_entry(*new, struct mlx5_cache_ent, node);
		parent = *new;
		cmp = cache_ent_key_cmp(cur->rb_key, ent->rb_key);
		if (cmp > 0)
			new = &((*new)->rb_left);
		if (cmp < 0)
			new = &((*new)->rb_right);
		if (cmp == 0)
			return -EEXIST;
	}

	/* Add new node and rebalance tree. */
	rb_link_node(&ent->node, parent, new);
	rb_insert_color(&ent->node, &cache->rb_root);

	return 0;
}

static struct mlx5_cache_ent *
mkey_cache_ent_from_rb_key(struct mlx5_ib_dev *dev,
			   struct mlx5r_cache_rb_key rb_key)
{
	struct rb_node *node = dev->cache.rb_root.rb_node;
	struct mlx5_cache_ent *cur, *smallest = NULL;
	u64 ndescs_limit;
	int cmp;

	/*
	 * Find the smallest ent with order >= requested_order.
	 */
	while (node) {
		cur = rb_entry(node, struct mlx5_cache_ent, node);
		cmp = cache_ent_key_cmp(cur->rb_key, rb_key);
		if (cmp > 0) {
			smallest = cur;
			node = node->rb_left;
		}
		if (cmp < 0)
			node = node->rb_right;
		if (cmp == 0)
			return cur;
	}

	/*
	 * Limit the usage of mkeys larger than twice the required size while
	 * also allowing the usage of smallest cache entry for small MRs.
	 */
	ndescs_limit = max_t(u64, rb_key.ndescs * 2,
			     MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS);

	return (smallest &&
		smallest->rb_key.access_mode == rb_key.access_mode &&
		smallest->rb_key.access_flags == rb_key.access_flags &&
		smallest->rb_key.ats == rb_key.ats &&
		smallest->rb_key.ndescs <= ndescs_limit) ?
		       smallest :
		       NULL;
}
static struct mlx5_ib_mr *_mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
					       struct mlx5_cache_ent *ent,
					       int access_flags)
{
	struct mlx5_ib_mr *mr;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	spin_lock_irq(&ent->mkeys_queue.lock);
	ent->in_use++;

	if (!ent->mkeys_queue.ci) {
		queue_adjust_cache_locked(ent);
		ent->miss++;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		err = create_cache_mkey(ent, &mr->mmkey.key);
		if (err) {
			spin_lock_irq(&ent->mkeys_queue.lock);
			ent->in_use--;
			spin_unlock_irq(&ent->mkeys_queue.lock);
			kfree(mr);
			return ERR_PTR(err);
		}
	} else {
		mr->mmkey.key = pop_mkey_locked(ent);
		queue_adjust_cache_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}
	mr->mmkey.cache_ent = ent;
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->mmkey.rb_key = ent->rb_key;
	mr->mmkey.cacheable = true;
	init_waitqueue_head(&mr->mmkey.wait);
	return mr;
}
static int get_unchangeable_access_flags(struct mlx5_ib_dev *dev,
					 int access_flags)
{
	int ret = 0;

	if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
	    MLX5_CAP_GEN(dev->mdev, atomic) &&
	    MLX5_CAP_GEN(dev->mdev, umr_modify_atomic_disabled))
		ret |= IB_ACCESS_REMOTE_ATOMIC;

	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
	    MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write) &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_write_umr))
		ret |= IB_ACCESS_RELAXED_ORDERING;

	if ((access_flags & IB_ACCESS_RELAXED_ORDERING) &&
	    (MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read) ||
	     MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_pci_enabled)) &&
	    !MLX5_CAP_GEN(dev->mdev, relaxed_ordering_read_umr))
		ret |= IB_ACCESS_RELAXED_ORDERING;

	return ret;
}

struct mlx5_ib_mr *mlx5_mr_cache_alloc(struct mlx5_ib_dev *dev,
				       int access_flags, int access_mode,
				       int ndescs)
{
	struct mlx5r_cache_rb_key rb_key = {
		.ndescs = ndescs,
		.access_mode = access_mode,
		.access_flags = get_unchangeable_access_flags(dev, access_flags)
	};
	struct mlx5_cache_ent *ent = mkey_cache_ent_from_rb_key(dev, rb_key);

	if (!ent)
		return ERR_PTR(-EOPNOTSUPP);

	return _mlx5_mr_cache_alloc(dev, ent, access_flags);
}
static void mlx5_mkey_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
{
	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	debugfs_remove_recursive(dev->cache.fs_root);
	dev->cache.fs_root = NULL;
}

static void mlx5_mkey_cache_debugfs_add_ent(struct mlx5_ib_dev *dev,
					    struct mlx5_cache_ent *ent)
{
	int order = order_base_2(ent->rb_key.ndescs);
	struct dentry *dir;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	if (ent->rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
		order = MLX5_IMR_KSM_CACHE_ENTRY + 2;

	sprintf(ent->name, "%d", order);
	dir = debugfs_create_dir(ent->name, dev->cache.fs_root);
	debugfs_create_file("size", 0600, dir, ent, &size_fops);
	debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
	debugfs_create_ulong("cur", 0400, dir, &ent->mkeys_queue.ci);
	debugfs_create_u32("miss", 0600, dir, &ent->miss);
}

static void mlx5_mkey_cache_debugfs_init(struct mlx5_ib_dev *dev)
{
	struct dentry *dbg_root = mlx5_debugfs_get_dev_root(dev->mdev);
	struct mlx5_mkey_cache *cache = &dev->cache;

	if (!mlx5_debugfs_root || dev->is_rep)
		return;

	cache->fs_root = debugfs_create_dir("mr_cache", dbg_root);
}

static void delay_time_func(struct timer_list *t)
{
	struct mlx5_ib_dev *dev = from_timer(dev, t, delay_timer);

	WRITE_ONCE(dev->fill_delay, 0);
}
static int mlx5r_mkeys_init(struct mlx5_cache_ent *ent)
{
	struct mlx5_mkeys_page *page;

	page = kzalloc(sizeof(*page), GFP_KERNEL);
	if (!page)
		return -ENOMEM;
	INIT_LIST_HEAD(&ent->mkeys_queue.pages_list);
	spin_lock_init(&ent->mkeys_queue.lock);
	list_add_tail(&page->list, &ent->mkeys_queue.pages_list);
	ent->mkeys_queue.num_pages++;
	return 0;
}

static void mlx5r_mkeys_uninit(struct mlx5_cache_ent *ent)
{
	struct mlx5_mkeys_page *page;

	WARN_ON(ent->mkeys_queue.ci || ent->mkeys_queue.num_pages > 1);
	page = list_last_entry(&ent->mkeys_queue.pages_list,
			       struct mlx5_mkeys_page, list);
	list_del(&page->list);
	kfree(page);
}
struct mlx5_cache_ent *
mlx5r_cache_create_ent_locked(struct mlx5_ib_dev *dev,
			      struct mlx5r_cache_rb_key rb_key,
			      bool persistent_entry)
{
	struct mlx5_cache_ent *ent;
	int order;
	int ret;

	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
	if (!ent)
		return ERR_PTR(-ENOMEM);

	ret = mlx5r_mkeys_init(ent);
	if (ret)
		goto mkeys_err;
	ent->rb_key = rb_key;
	ent->dev = dev;
	ent->is_tmp = !persistent_entry;

	INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);

	ret = mlx5_cache_ent_insert(&dev->cache, ent);
	if (ret)
		goto ent_insert_err;

	if (persistent_entry) {
		if (rb_key.access_mode == MLX5_MKC_ACCESS_MODE_KSM)
			order = MLX5_IMR_KSM_CACHE_ENTRY;
		else
			order = order_base_2(rb_key.ndescs) - 2;

		if ((dev->mdev->profile.mask & MLX5_PROF_MASK_MR_CACHE) &&
		    !dev->is_rep && mlx5_core_is_pf(dev->mdev) &&
		    mlx5r_umr_can_load_pas(dev, 0))
			ent->limit = dev->mdev->profile.mr_cache[order].limit;
		else
			ent->limit = 0;

		mlx5_mkey_cache_debugfs_add_ent(dev, ent);
	}

	return ent;
ent_insert_err:
	mlx5r_mkeys_uninit(ent);
mkeys_err:
	kfree(ent);
	return ERR_PTR(ret);
}
int mlx5_mkey_cache_init(struct mlx5_ib_dev *dev)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5r_cache_rb_key rb_key = {
		.access_mode = MLX5_MKC_ACCESS_MODE_MTT,
	};
	struct mlx5_cache_ent *ent;
	struct rb_node *node;
	int ret;
	int i;

	mutex_init(&dev->slow_path_mutex);
	mutex_init(&dev->cache.rb_lock);
	dev->cache.rb_root = RB_ROOT;
	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
	if (!cache->wq) {
		mlx5_ib_warn(dev, "failed to create work queue\n");
		return -ENOMEM;
	}

	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
	timer_setup(&dev->delay_timer, delay_time_func, 0);
	mlx5_mkey_cache_debugfs_init(dev);
	mutex_lock(&cache->rb_lock);
	for (i = 0; i <= mkey_cache_max_order(dev); i++) {
		rb_key.ndescs = MLX5_MR_CACHE_PERSISTENT_ENTRY_MIN_DESCS << i;
		ent = mlx5r_cache_create_ent_locked(dev, rb_key, true);
		if (IS_ERR(ent)) {
			ret = PTR_ERR(ent);
			goto err;
		}
	}

	ret = mlx5_odp_init_mkey_cache(dev);
	if (ret)
		goto err;

	mutex_unlock(&cache->rb_lock);
	for (node = rb_first(root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		queue_adjust_cache_locked(ent);
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}

	return 0;

err:
	mutex_unlock(&cache->rb_lock);
	mlx5_mkey_cache_debugfs_cleanup(dev);
	mlx5_ib_warn(dev, "failed to create mkey cache entry\n");
	return ret;
}
void mlx5_mkey_cache_cleanup(struct mlx5_ib_dev *dev)
{
	struct rb_root *root = &dev->cache.rb_root;
	struct mlx5_cache_ent *ent;
	struct rb_node *node;

	if (!dev->cache.wq)
		return;

	mutex_lock(&dev->cache.rb_lock);
	for (node = rb_first(root); node; node = rb_next(node)) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		spin_lock_irq(&ent->mkeys_queue.lock);
		ent->disabled = true;
		spin_unlock_irq(&ent->mkeys_queue.lock);
		cancel_delayed_work(&ent->dwork);
	}
	mutex_unlock(&dev->cache.rb_lock);

	/*
	 * After all entries are disabled and will not reschedule on WQ,
	 * flush it and all async commands.
	 */
	flush_workqueue(dev->cache.wq);

	mlx5_mkey_cache_debugfs_cleanup(dev);
	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);

	/* At this point all entries are disabled and have no concurrent work. */
	mutex_lock(&dev->cache.rb_lock);
	node = rb_first(root);
	while (node) {
		ent = rb_entry(node, struct mlx5_cache_ent, node);
		node = rb_next(node);
		clean_keys(dev, ent);
		rb_erase(&ent->node, root);
		mlx5r_mkeys_uninit(ent);
		kfree(ent);
	}
	mutex_unlock(&dev->cache.rb_lock);

	destroy_workqueue(dev->cache.wq);
	del_timer_sync(&dev->delay_timer);
}
struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA);
	MLX5_SET(mkc, mkc, length64, 1);
	set_mkc_access_pd_addr_fields(mkc, acc | IB_ACCESS_RELAXED_ORDERING, 0,
				      pd);
	MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}
static int get_octo_len(u64 addr, u64 len, int page_shift)
{
	u64 page_size = 1ULL << page_shift;
	u64 offset;
	int npages;

	offset = addr & (page_size - 1);
	npages = ALIGN(len + offset, page_size) >> page_shift;
	return (npages + 1) / 2;
}
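/*
 * Worked example, not part of the driver: get_octo_len() returns the number of
 * 16-byte translation octowords needed for a region, at two 8-byte MTT entries
 * per octoword. Standalone recomputation with an invented toy_* name:
 *
 *	#include <stdio.h>
 *
 *	static int toy_get_octo_len(unsigned long long addr,
 *				    unsigned long long len, int page_shift)
 *	{
 *		unsigned long long page_size = 1ULL << page_shift;
 *		unsigned long long offset = addr & (page_size - 1);
 *		unsigned long long npages =
 *			(len + offset + page_size - 1) >> page_shift;
 *
 *		return (npages + 1) / 2;
 *	}
 *
 *	int main(void)
 *	{
 *		// 1 MiB starting 0x100 bytes into a 4 KiB page spans 257
 *		// pages, which needs 129 octowords.
 *		printf("%d\n", toy_get_octo_len(0x100, 1 << 20, 12));
 *		return 0;
 *	}
 */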
static int mkey_cache_max_order(struct mlx5_ib_dev *dev)
{
	if (MLX5_CAP_GEN(dev->mdev, umr_extended_translation_offset))
		return MKEY_CACHE_LAST_STD_ENTRY;
	return MLX5_MAX_UMR_SHIFT;
}

static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
			  u64 length, int access_flags, u64 iova)
{
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;
	mr->ibmr.length = length;
	mr->ibmr.device = &dev->ib_dev;
	mr->ibmr.iova = iova;
	mr->access_flags = access_flags;
}

static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem,
						  u64 iova)
{
	/*
	 * The alignment of iova has already been checked upon entering
	 * UVERBS_METHOD_REG_DMABUF_MR
	 */
	umem->iova = iova;
	return PAGE_SIZE;
}
static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd,
					     struct ib_umem *umem, u64 iova,
					     int access_flags, int access_mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5r_cache_rb_key rb_key = {};
	struct mlx5_cache_ent *ent;
	struct mlx5_ib_mr *mr;
	unsigned int page_size;

	if (umem->is_dmabuf)
		page_size = mlx5_umem_dmabuf_default_pgsz(umem, iova);
	else
		page_size = mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);
	if (WARN_ON(!page_size))
		return ERR_PTR(-EINVAL);

	rb_key.access_mode = access_mode;
	rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size);
	rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags);
	rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags);
	ent = mkey_cache_ent_from_rb_key(dev, rb_key);
	/*
	 * If the MR can't come from the cache then synchronously create an uncached
	 * one.
	 */
	if (!ent) {
		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode);
		mutex_unlock(&dev->slow_path_mutex);
		if (IS_ERR(mr))
			return mr;
		mr->mmkey.rb_key = rb_key;
		mr->mmkey.cacheable = true;
		return mr;
	}

	mr = _mlx5_mr_cache_alloc(dev, ent, access_flags);
	if (IS_ERR(mr))
		return mr;

	mr->ibmr.pd = pd;
	mr->umem = umem;
	mr->page_shift = order_base_2(page_size);
	set_mr_fields(dev, mr, umem->length, access_flags, iova);

	return mr;
}
static struct ib_mr *
reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags,
			    u32 crossed_lkey)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING;
	struct mlx5_ib_mr *mr;
	void *mkc;
	int inlen;
	u32 *in;
	int err;

	if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey))
		return ERR_PTR(-EOPNOTSUPP);

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, crossing_target_vhca_id,
		 MLX5_CAP_GEN(dev->mdev, vhca_id));
	MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);

	/* for this crossing mkey IOVA should be 0 and len should be IOVA + len */
	set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd);
	MLX5_SET64(mkc, mkc, len, iova + length);

	MLX5_SET(mkc, mkc, free, 0);
	MLX5_SET(mkc, mkc, umr_en, 0);
	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_2;

	mr->mmkey.type = MLX5_MKEY_MR;
	set_mr_fields(dev, mr, length, access_flags, iova);
	mr->ibmr.pd = pd;
	kvfree(in);
	mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key);

	return &mr->ibmr;
err_2:
	kvfree(in);
err_1:
	kfree(mr);
	return ERR_PTR(err);
}
/*
 * If ibmr is NULL it will be allocated by reg_create.
 * Else, the given ibmr will be used.
 */
static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem,
				     u64 iova, int access_flags,
				     unsigned int page_size, bool populate,
				     int access_mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr;
	__be64 *pas;
	void *mkc;
	int inlen;
	u32 *in;
	int err;
	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) &&
		(access_mode == MLX5_MKC_ACCESS_MODE_MTT);
	bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);

	if (!page_size)
		return ERR_PTR(-EINVAL);
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.pd = pd;
	mr->access_flags = access_flags;
	mr->page_shift = order_base_2(page_size);

	inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	if (populate)
		inlen += sizeof(*pas) *
			 roundup(ib_umem_num_dma_blocks(umem, page_size), 2);
	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_1;
	}
	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
	if (populate) {
		if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) {
			err = -EINVAL;
			goto err_2;
		}
		mlx5_ib_populate_pas(umem, 1UL << mr->page_shift, pas,
				     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
	}

	/* The pg_access bit allows setting the access flags
	 * in the page list submitted with the command.
	 */
	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	set_mkc_access_pd_addr_fields(mkc, access_flags, iova,
				      populate ? pd : dev->umrc.pd);
	/* In case a data direct flow, overwrite the pdn field by its internal kernel PD */
	if (umem->is_dmabuf && ksm_mode)
		MLX5_SET(mkc, mkc, pd, dev->ddr.pdn);

	MLX5_SET(mkc, mkc, free, !populate);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode);
	MLX5_SET(mkc, mkc, umr_en, 1);

	MLX5_SET64(mkc, mkc, len, umem->length);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	if (ksm_mode)
		MLX5_SET(mkc, mkc, translations_octword_size,
			 get_octo_len(iova, umem->length, mr->page_shift) * 2);
	else
		MLX5_SET(mkc, mkc, translations_octword_size,
			 get_octo_len(iova, umem->length, mr->page_shift));
	MLX5_SET(mkc, mkc, log_page_size, mr->page_shift);
	if (mlx5_umem_needs_ats(dev, umem, access_flags))
		MLX5_SET(mkc, mkc, ma_translation_mode, 1);
	if (populate) {
		MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
			 get_octo_len(iova, umem->length, mr->page_shift));
	}

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err) {
		mlx5_ib_warn(dev, "create mkey failed\n");
		goto err_2;
	}
	mr->mmkey.type = MLX5_MKEY_MR;
	mr->mmkey.ndescs = get_octo_len(iova, umem->length, mr->page_shift);
	mr->umem = umem;
	set_mr_fields(dev, mr, umem->length, access_flags, iova);
	kvfree(in);

	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);

	return mr;

err_2:
	kvfree(in);
err_1:
	kfree(mr);
	return ERR_PTR(err);
}
static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
				       u64 length, int acc, int mode)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mr *mr;
	void *mkc;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
	MLX5_SET64(mkc, mkc, len, length);
	set_mkc_access_pd_addr_fields(mkc, acc, start_addr, pd);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_in;

	kfree(in);

	set_mr_fields(dev, mr, length, acc, start_addr);

	return &mr->ibmr;

err_in:
	kfree(in);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

int mlx5_ib_advise_mr(struct ib_pd *pd,
		      enum ib_uverbs_advise_mr_advice advice,
		      u32 flags,
		      struct ib_sge *sg_list,
		      u32 num_sge,
		      struct uverbs_attr_bundle *attrs)
{
	if (advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE &&
	    advice != IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT)
		return -EOPNOTSUPP;

	return mlx5_ib_advise_mr_prefetch(pd, advice, flags,
					  sg_list, num_sge);
}
struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
				struct ib_dm_mr_attr *attr,
				struct uverbs_attr_bundle *attrs)
{
	struct mlx5_ib_dm *mdm = to_mdm(dm);
	struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
	u64 start_addr = mdm->dev_addr + attr->offset;
	int mode;

	switch (mdm->type) {
	case MLX5_IB_UAPI_DM_TYPE_MEMIC:
		if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_MEMIC;
		start_addr -= pci_resource_start(dev->pdev, 0);
		break;
	case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_PATTERN_SW_ICM:
	case MLX5_IB_UAPI_DM_TYPE_ENCAP_SW_ICM:
		if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
			return ERR_PTR(-EINVAL);

		mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
		break;
	default:
		return ERR_PTR(-EINVAL);
	}

	return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
				 attr->access_flags, mode);
}
static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem,
				    u64 iova, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	bool xlt_with_umr;
	int err;

	xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length);
	if (xlt_with_umr) {
		mr = alloc_cacheable_mr(pd, umem, iova, access_flags,
					MLX5_MKC_ACCESS_MODE_MTT);
	} else {
		unsigned int page_size =
			mlx5_umem_mkc_find_best_pgsz(dev, umem, iova);

		mutex_lock(&dev->slow_path_mutex);
		mr = reg_create(pd, umem, iova, access_flags, page_size,
				true, MLX5_MKC_ACCESS_MODE_MTT);
		mutex_unlock(&dev->slow_path_mutex);
	}
	if (IS_ERR(mr)) {
		ib_umem_release(umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

	if (xlt_with_umr) {
		/*
		 * If the MR was created with reg_create then it will be
		 * configured properly but left disabled. It is safe to go ahead
		 * and configure it again via UMR while enabling it.
		 */
		err = mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ENABLE);
		if (err) {
			mlx5_ib_dereg_mr(&mr->ibmr, NULL);
			return ERR_PTR(err);
		}
	}
	return &mr->ibmr;
}
static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length,
					u64 iova, int access_flags,
					struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem_odp *odp;
	struct mlx5_ib_mr *mr;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	err = mlx5r_odp_create_eq(dev, &dev->odp_pf_eq);
	if (err)
		return ERR_PTR(err);
	if (!start && length == U64_MAX) {
		if (iova != 0)
			return ERR_PTR(-EINVAL);
		if (!(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
			return ERR_PTR(-EINVAL);

		mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
		if (IS_ERR(mr))
			return ERR_CAST(mr);
		return &mr->ibmr;
	}

	/* ODP requires xlt update via umr to work. */
	if (!mlx5r_umr_can_load_pas(dev, length))
		return ERR_PTR(-EINVAL);

	odp = ib_umem_odp_get(&dev->ib_dev, start, length, access_flags,
			      &mlx5_mn_ops);
	if (IS_ERR(odp))
		return ERR_CAST(odp);

	mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags,
				MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr)) {
		ib_umem_release(&odp->umem);
		return ERR_CAST(mr);
	}
	xa_init(&mr->implicit_children);

	odp->private = mr;
	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
	if (err)
		goto err_dereg_mr;

	err = mlx5_ib_init_odp_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	mlx5_ib_dereg_mr(&mr->ibmr, NULL);
	return ERR_PTR(err);
}
struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 iova, int access_flags,
				  struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct ib_umem *umem;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM))
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(dev, "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		    start, iova, length, access_flags);

	err = mlx5r_umr_resource_init(dev);
	if (err)
		return ERR_PTR(err);

	if (access_flags & IB_ACCESS_ON_DEMAND)
		return create_user_odp_mr(pd, start, length, iova, access_flags,
					  udata);
	umem = ib_umem_get(&dev->ib_dev, start, length, access_flags);
	if (IS_ERR(umem))
		return ERR_CAST(umem);
	return create_real_mr(pd, umem, iova, access_flags);
}

static void mlx5_ib_dmabuf_invalidate_cb(struct dma_buf_attachment *attach)
{
	struct ib_umem_dmabuf *umem_dmabuf = attach->importer_priv;
	struct mlx5_ib_mr *mr = umem_dmabuf->private;

	dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv);

	if (!umem_dmabuf->sgt)
		return;

	mlx5r_umr_update_mr_pas(mr, MLX5_IB_UPD_XLT_ZAP);
	ib_umem_dmabuf_unmap_pages(umem_dmabuf);
}

static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = {
	.allow_peer2peer = 1,
	.move_notify = mlx5_ib_dmabuf_invalidate_cb,
};
static struct ib_mr *
reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device,
		   u64 offset, u64 length, u64 virt_addr,
		   int fd, int access_flags, int access_mode)
{
	bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM);
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_ib_mr *mr = NULL;
	struct ib_umem_dmabuf *umem_dmabuf;
	int err;

	err = mlx5r_umr_resource_init(dev);
	if (err)
		return ERR_PTR(err);

	if (!pinned_mode)
		umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev,
						 offset, length, fd,
						 access_flags,
						 &mlx5_ib_dmabuf_attach_ops);
	else
		umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev,
				dma_device, offset, length,
				fd, access_flags);

	if (IS_ERR(umem_dmabuf)) {
		mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n",
			    PTR_ERR(umem_dmabuf));
		return ERR_CAST(umem_dmabuf);
	}

	mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr,
				access_flags, access_mode);
	if (IS_ERR(mr)) {
		ib_umem_release(&umem_dmabuf->umem);
		return ERR_CAST(mr);
	}

	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);

	atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages);
	umem_dmabuf->private = mr;

	err = mlx5r_store_odp_mkey(dev, &mr->mmkey);
	if (err)
		goto err_dereg_mr;

	if (pinned_mode)
		mr->data_direct = true;

	err = mlx5_ib_init_dmabuf_mr(mr);
	if (err)
		goto err_dereg_mr;
	return &mr->ibmr;

err_dereg_mr:
	__mlx5_ib_dereg_mr(&mr->ibmr);
	return ERR_PTR(err);
}
static struct ib_mr *
reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset,
				  u64 length, u64 virt_addr,
				  int fd, int access_flags)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	struct mlx5_data_direct_dev *data_direct_dev;
	struct ib_mr *crossing_mr;
	struct ib_mr *crossed_mr;
	int ret = 0;

	/* As of HW behaviour the IOVA must be page aligned in KSM mode */
	if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND))
		return ERR_PTR(-EOPNOTSUPP);

	mutex_lock(&dev->data_direct_lock);
	data_direct_dev = dev->data_direct_dev;
	if (!data_direct_dev) {
		ret = -EINVAL;
		goto end;
	}

	/* The device's 'data direct mkey' was created without RO flags to
	 * simplify things and allow for a single mkey per device.
	 * Since RO is not a must, mask it out accordingly.
	 */
	access_flags &= ~IB_ACCESS_RELAXED_ORDERING;
	crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev,
					offset, length, virt_addr, fd,
					access_flags, MLX5_MKC_ACCESS_MODE_KSM);
	if (IS_ERR(crossed_mr)) {
		ret = PTR_ERR(crossed_mr);
		goto end;
	}

	mutex_lock(&dev->slow_path_mutex);
	crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags,
						  crossed_mr->lkey);
	mutex_unlock(&dev->slow_path_mutex);
	if (IS_ERR(crossing_mr)) {
		__mlx5_ib_dereg_mr(crossed_mr);
		ret = PTR_ERR(crossing_mr);
		goto end;
	}

	list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list);
	to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr);
	to_mmr(crossing_mr)->data_direct = true;
end:
	mutex_unlock(&dev->data_direct_lock);
	return ret ? ERR_PTR(ret) : crossing_mr;
}
struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset,
					 u64 length, u64 virt_addr,
					 int fd, int access_flags,
					 struct uverbs_attr_bundle *attrs)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int mlx5_access_flags = 0;
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) ||
	    !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
		return ERR_PTR(-EOPNOTSUPP);

	if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) {
		err = uverbs_get_flags32(&mlx5_access_flags, attrs,
					 MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS,
					 MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT);
		if (err)
			return ERR_PTR(err);
	}

	mlx5_ib_dbg(dev,
		    "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n",
		    offset, virt_addr, length, fd, access_flags, mlx5_access_flags);

	/* dmabuf requires xlt update via umr to work. */
	if (!mlx5r_umr_can_load_pas(dev, length))
		return ERR_PTR(-EINVAL);

	if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT)
		return reg_user_mr_dmabuf_by_data_direct(pd, offset, length,
							 virt_addr, fd,
							 access_flags);

	return reg_user_mr_dmabuf(pd, pd->device->dma_device,
				  offset, length, virt_addr,
				  fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT);
}
/*
 * True if the change in access flags can be done via UMR, only some access
 * flags can be updated.
 */
static bool can_use_umr_rereg_access(struct mlx5_ib_dev *dev,
				     unsigned int current_access_flags,
				     unsigned int target_access_flags)
{
	unsigned int diffs = current_access_flags ^ target_access_flags;

	if (diffs & ~(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
		      IB_ACCESS_REMOTE_READ | IB_ACCESS_RELAXED_ORDERING |
		      IB_ACCESS_REMOTE_ATOMIC))
		return false;
	return mlx5r_umr_can_reconfig(dev, current_access_flags,
				      target_access_flags);
}
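/*
 * Illustrative sketch, not part of the driver: only differences in the
 * UMR-updatable access bits allow the rereg fast path above. Standalone model
 * with invented TOY_* flag values standing in for the IB_ACCESS_* bits:
 *
 *	#include <stdbool.h>
 *
 *	#define TOY_LOCAL_WRITE		(1 << 0)
 *	#define TOY_REMOTE_WRITE	(1 << 1)
 *	#define TOY_REMOTE_READ		(1 << 2)
 *	#define TOY_REMOTE_ATOMIC	(1 << 3)
 *	#define TOY_RELAXED_ORDERING	(1 << 4)
 *	#define TOY_ON_DEMAND		(1 << 5)	// not UMR-updatable
 *
 *	static bool toy_can_rereg_access(unsigned int cur, unsigned int tgt)
 *	{
 *		unsigned int diffs = cur ^ tgt;
 *		unsigned int updatable = TOY_LOCAL_WRITE | TOY_REMOTE_WRITE |
 *					 TOY_REMOTE_READ | TOY_REMOTE_ATOMIC |
 *					 TOY_RELAXED_ORDERING;
 *
 *		// only the bits that change matter; all must be updatable
 *		return !(diffs & ~updatable);
 *	}
 *
 *	// toy_can_rereg_access(TOY_LOCAL_WRITE,
 *	//			TOY_LOCAL_WRITE | TOY_REMOTE_READ) is true,
 *	// toy_can_rereg_access(TOY_LOCAL_WRITE, TOY_ON_DEMAND) is false.
 */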
static bool can_use_umr_rereg_pas(struct mlx5_ib_mr *mr,
				  struct ib_umem *new_umem,
				  int new_access_flags, u64 iova,
				  unsigned long *page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);

	/* We only track the allocated sizes of MRs from the cache */
	if (!mr->mmkey.cache_ent)
		return false;
	if (!mlx5r_umr_can_load_pas(dev, new_umem->length))
		return false;

	*page_size = mlx5_umem_mkc_find_best_pgsz(dev, new_umem, iova);
	if (WARN_ON(!*page_size))
		return false;
	return (mr->mmkey.cache_ent->rb_key.ndescs) >=
	       ib_umem_num_dma_blocks(new_umem, *page_size);
}
static int umr_rereg_pas(struct mlx5_ib_mr *mr, struct ib_pd *pd,
			 int access_flags, int flags, struct ib_umem *new_umem,
			 u64 iova, unsigned long page_size)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	int upd_flags = MLX5_IB_UPD_XLT_ADDR | MLX5_IB_UPD_XLT_ENABLE;
	struct ib_umem *old_umem = mr->umem;
	int err;

	/*
	 * To keep everything simple the MR is revoked before we start to mess
	 * with it. This ensure the change is atomic relative to any use of the
	 * MR.
	 */
	err = mlx5r_umr_revoke_mr(mr);
	if (err)
		return err;

	if (flags & IB_MR_REREG_PD) {
		mr->ibmr.pd = pd;
		upd_flags |= MLX5_IB_UPD_XLT_PD;
	}
	if (flags & IB_MR_REREG_ACCESS) {
		mr->access_flags = access_flags;
		upd_flags |= MLX5_IB_UPD_XLT_ACCESS;
	}

	mr->ibmr.iova = iova;
	mr->ibmr.length = new_umem->length;
	mr->page_shift = order_base_2(page_size);
	mr->umem = new_umem;
	err = mlx5r_umr_update_mr_pas(mr, upd_flags);
	if (err) {
		/*
		 * The MR is revoked at this point so there is no issue to free
		 * new_umem.
		 */
		mr->umem = old_umem;
		return err;
	}

	atomic_sub(ib_umem_num_pages(old_umem), &dev->mdev->priv.reg_pages);
	ib_umem_release(old_umem);
	atomic_add(ib_umem_num_pages(new_umem), &dev->mdev->priv.reg_pages);
	return 0;
}
struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
				    u64 length, u64 iova, int new_access_flags,
				    struct ib_pd *new_pd,
				    struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
	int err;

	if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct)
		return ERR_PTR(-EOPNOTSUPP);

	mlx5_ib_dbg(dev,
		    "start 0x%llx, iova 0x%llx, length 0x%llx, access_flags 0x%x\n",
		    start, iova, length, new_access_flags);

	if (flags & ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS))
		return ERR_PTR(-EOPNOTSUPP);

	if (!(flags & IB_MR_REREG_ACCESS))
		new_access_flags = mr->access_flags;
	if (!(flags & IB_MR_REREG_PD))
		new_pd = ib_mr->pd;

	if (!(flags & IB_MR_REREG_TRANS)) {
		struct ib_umem *umem;

		/* Fast path for PD/access change */
		if (can_use_umr_rereg_access(dev, mr->access_flags,
					     new_access_flags)) {
			err = mlx5r_umr_rereg_pd_access(mr, new_pd,
							new_access_flags);
			if (err)
				return ERR_PTR(err);
			return NULL;
		}
		/* DM or ODP MR's don't have a normal umem so we can't re-use it */
		if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
			goto recreate;

		/*
		 * Only one active MR can refer to a umem at one time, revoke
		 * the old MR before assigning the umem to the new one.
		 */
		err = mlx5r_umr_revoke_mr(mr);
		if (err)
			return ERR_PTR(err);
		umem = mr->umem;
		mr->umem = NULL;
		atomic_sub(ib_umem_num_pages(umem), &dev->mdev->priv.reg_pages);

		return create_real_mr(new_pd, umem, mr->ibmr.iova,
				      new_access_flags);
	}

	/*
	 * DM doesn't have a PAS list so we can't re-use it, odp/dmabuf does
	 * but the logic around releasing the umem is different
	 */
	if (!mr->umem || is_odp_mr(mr) || is_dmabuf_mr(mr))
		goto recreate;

	if (!(new_access_flags & IB_ACCESS_ON_DEMAND) &&
	    can_use_umr_rereg_access(dev, mr->access_flags, new_access_flags)) {
		struct ib_umem *new_umem;
		unsigned long page_size;

		new_umem = ib_umem_get(&dev->ib_dev, start, length,
				       new_access_flags);
		if (IS_ERR(new_umem))
			return ERR_CAST(new_umem);

		/* Fast path for PAS change */
		if (can_use_umr_rereg_pas(mr, new_umem, new_access_flags, iova,
					  &page_size)) {
			err = umr_rereg_pas(mr, new_pd, new_access_flags, flags,
					    new_umem, iova, page_size);
			if (err) {
				ib_umem_release(new_umem);
				return ERR_PTR(err);
			}
			return NULL;
		}
		return create_real_mr(new_pd, new_umem, iova, new_access_flags);
	}

	/*
	 * Everything else has no state we can preserve, just create a new MR
	 * from scratch
	 */
recreate:
	return mlx5_ib_reg_user_mr(new_pd, start, length, iova,
				   new_access_flags, udata);
}
static int
mlx5_alloc_priv_descs(struct ib_device *device,
		      struct mlx5_ib_mr *mr,
		      int ndescs,
		      int desc_size)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	struct device *ddev = &dev->mdev->pdev->dev;
	int size = ndescs * desc_size;
	int add_size;
	int ret;

	add_size = max_t(int, MLX5_UMR_ALIGN - ARCH_KMALLOC_MINALIGN, 0);
	if (is_power_of_2(MLX5_UMR_ALIGN) && add_size) {
		int end = max_t(int, MLX5_UMR_ALIGN, roundup_pow_of_two(size));

		add_size = min_t(int, end - size, add_size);
	}

	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
	if (!mr->descs_alloc)
		return -ENOMEM;

	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);

	mr->desc_map = dma_map_single(ddev, mr->descs, size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, mr->desc_map)) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;
err:
	kfree(mr->descs_alloc);

	return ret;
}

static void
mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
{
	if (!mr->umem && !mr->data_direct && mr->descs) {
		struct ib_device *device = mr->ibmr.device;
		int size = mr->max_descs * mr->desc_size;
		struct mlx5_ib_dev *dev = to_mdev(device);

		dma_unmap_single(&dev->mdev->pdev->dev, mr->desc_map, size,
				 DMA_TO_DEVICE);
		kfree(mr->descs_alloc);
		mr->descs = NULL;
	}
}
static int cache_ent_find_and_store(struct mlx5_ib_dev *dev,
				    struct mlx5_ib_mr *mr)
{
	struct mlx5_mkey_cache *cache = &dev->cache;
	struct mlx5_cache_ent *ent;
	int ret;

	if (mr->mmkey.cache_ent) {
		spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
		mr->mmkey.cache_ent->in_use--;
		goto end;
	}

	mutex_lock(&cache->rb_lock);
	ent = mkey_cache_ent_from_rb_key(dev, mr->mmkey.rb_key);
	if (ent) {
		if (ent->rb_key.ndescs == mr->mmkey.rb_key.ndescs) {
			if (ent->disabled) {
				mutex_unlock(&cache->rb_lock);
				return -EOPNOTSUPP;
			}
			mr->mmkey.cache_ent = ent;
			spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
			mutex_unlock(&cache->rb_lock);
			goto end;
		}
	}

	ent = mlx5r_cache_create_ent_locked(dev, mr->mmkey.rb_key, false);
	mutex_unlock(&cache->rb_lock);
	if (IS_ERR(ent))
		return PTR_ERR(ent);

	mr->mmkey.cache_ent = ent;
	spin_lock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);

end:
	ret = push_mkey_locked(mr->mmkey.cache_ent, mr->mmkey.key);
	spin_unlock_irq(&mr->mmkey.cache_ent->mkeys_queue.lock);
	return ret;
}
static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem);
	int err;

	lockdep_assert_held(&dev->data_direct_lock);
	mr->revoked = true;
	err = mlx5r_umr_revoke_mr(mr);
	if (err)
		return err;

	ib_umem_dmabuf_revoke(umem_dmabuf);
	return 0;
}

void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev)
{
	struct mlx5_ib_mr *mr, *next;

	lockdep_assert_held(&dev->data_direct_lock);

	list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) {
		list_del(&mr->dd_node);
		mlx5_ib_revoke_data_direct_mr(mr);
	}
}
static int mlx5_revoke_mr(struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
	struct mlx5_cache_ent *ent = mr->mmkey.cache_ent;

	if (mr->mmkey.cacheable && !mlx5r_umr_revoke_mr(mr) && !cache_ent_find_and_store(dev, mr)) {
		ent = mr->mmkey.cache_ent;
		/* upon storing to a clean temp entry - schedule its cleanup */
		spin_lock_irq(&ent->mkeys_queue.lock);
		if (ent->is_tmp && !ent->tmp_cleanup_scheduled) {
			mod_delayed_work(ent->dev->cache.wq, &ent->dwork,
					 msecs_to_jiffies(30 * 1000));
			ent->tmp_cleanup_scheduled = true;
		}
		spin_unlock_irq(&ent->mkeys_queue.lock);
		return 0;
	}

	if (ent) {
		spin_lock_irq(&ent->mkeys_queue.lock);
		ent->in_use--;
		mr->mmkey.cache_ent = NULL;
		spin_unlock_irq(&ent->mkeys_queue.lock);
	}
	return destroy_mkey(dev, mr);
}
static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
	int rc;

	/*
	 * Any async use of the mr must hold the refcount, once the refcount
	 * goes to zero no other thread, such as ODP page faults, prefetch, any
	 * UMR activity, etc can touch the mkey. Thus it is safe to destroy it.
	 */
	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    refcount_read(&mr->mmkey.usecount) != 0 &&
	    xa_erase(&mr_to_mdev(mr)->odp_mkeys, mlx5_base_mkey(mr->mmkey.key)))
		mlx5r_deref_wait_odp_mkey(&mr->mmkey);

	if (ibmr->type == IB_MR_TYPE_INTEGRITY) {
		xa_cmpxchg(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			   mr->sig, NULL, GFP_KERNEL);

		if (mr->mtt_mr) {
			rc = mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->mtt_mr = NULL;
		}
		if (mr->klm_mr) {
			rc = mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
			if (rc)
				return rc;
			mr->klm_mr = NULL;
		}

		if (mlx5_core_destroy_psv(dev->mdev,
					  mr->sig->psv_memory.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
				     mr->sig->psv_memory.psv_idx);
		if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
				     mr->sig->psv_wire.psv_idx);
		kfree(mr->sig);
		mr->sig = NULL;
	}

	/* Stop DMA */
	rc = mlx5_revoke_mr(mr);
	if (rc)
		return rc;

	if (mr->umem) {
		bool is_odp = is_odp_mr(mr);

		if (!is_odp)
			atomic_sub(ib_umem_num_pages(mr->umem),
				   &dev->mdev->priv.reg_pages);
		ib_umem_release(mr->umem);
		if (is_odp)
			mlx5_ib_free_odp_mr(mr);
	}

	if (!mr->mmkey.cache_ent)
		mlx5_free_priv_descs(mr);

	kfree(mr);
	return 0;
}
static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev,
					 struct mlx5_ib_mr *mr)
{
	struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr;
	int ret;

	ret = __mlx5_ib_dereg_mr(&mr->ibmr);
	if (ret)
		return ret;

	mutex_lock(&dev->data_direct_lock);
	if (!dd_crossed_mr->revoked)
		list_del(&dd_crossed_mr->dd_node);

	ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr);
	mutex_unlock(&dev->data_direct_lock);
	return ret;
}

int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);

	if (mr->data_direct)
		return dereg_crossing_data_direct_mr(dev, mr);

	return __mlx5_ib_dereg_mr(ibmr);
}
static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs,
				   int access_mode, int page_shift)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	void *mkc;

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	/* This is only used from the kernel, so setting the PD is OK. */
	set_mkc_access_pd_addr_fields(mkc, IB_ACCESS_RELAXED_ORDERING, 0, pd);
	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3);
	MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, log_page_size, page_shift);
	if (access_mode == MLX5_MKC_ACCESS_MODE_PA ||
	    access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		MLX5_SET(mkc, mkc, ma_translation_mode, MLX5_CAP_GEN(dev->mdev, ats));
}
static int _mlx5_alloc_mkey_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				  int ndescs, int desc_size, int page_shift,
				  int access_mode, u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int err;

	mr->access_mode = access_mode;
	mr->desc_size = desc_size;
	mr->max_descs = ndescs;

	err = mlx5_alloc_priv_descs(pd->device, mr, ndescs, desc_size);
	if (err)
		return err;

	mlx5_set_umr_free_mkey(pd, in, ndescs, access_mode, page_shift);

	err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen);
	if (err)
		goto err_free_descs;

	mr->mmkey.type = MLX5_MKEY_MR;
	mr->ibmr.lkey = mr->mmkey.key;
	mr->ibmr.rkey = mr->mmkey.key;

	return 0;

err_free_descs:
	mlx5_free_priv_descs(mr);
	return err;
}
static struct mlx5_ib_mr *mlx5_ib_alloc_pi_mr(struct ib_pd *pd,
				u32 max_num_sg, u32 max_num_meta_sg,
				int desc_size, int access_mode)
{
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg + max_num_meta_sg, 4);
	int page_shift = 0;
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	mr->ibmr.device = pd->device;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	if (access_mode == MLX5_MKC_ACCESS_MODE_MTT)
		page_shift = PAGE_SHIFT;

	err = _mlx5_alloc_mkey_descs(pd, mr, ndescs, desc_size, page_shift,
				     access_mode, in, inlen);
	if (err)
		goto err_free_in;

	kfree(in);
	return mr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}
static int mlx5_alloc_mem_reg_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_mtt),
				      PAGE_SHIFT, MLX5_MKC_ACCESS_MODE_MTT, in,
				      inlen);
}
static int mlx5_alloc_sg_gaps_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				    int ndescs, u32 *in, int inlen)
{
	return _mlx5_alloc_mkey_descs(pd, mr, ndescs, sizeof(struct mlx5_klm),
				      0, MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
}
static int mlx5_alloc_integrity_descs(struct ib_pd *pd, struct mlx5_ib_mr *mr,
				      int max_num_sg, int max_num_meta_sg,
				      u32 *in, int inlen)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	u32 psv_index[2];
	void *mkc;
	int err;

	mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
	if (!mr->sig)
		return -ENOMEM;

	/* create mem & wire PSVs */
	err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn, 2, psv_index);
	if (err)
		goto err_free_sig;
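	/*
	 * One PSV is consumed per signature domain, hence the count of 2
	 * above: psv_index[0] backs the memory (local buffer) domain and
	 * psv_index[1] backs the wire domain, as assigned below.
	 */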
	mr->sig->psv_memory.psv_idx = psv_index[0];
	mr->sig->psv_wire.psv_idx = psv_index[1];

	mr->sig->sig_status_checked = true;
	mr->sig->sig_err_exists = false;
	/* Next UMR, Arm SIGERR */
	++mr->sig->sigerr_count;
	mr->klm_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_klm),
					 MLX5_MKC_ACCESS_MODE_KLMS);
	if (IS_ERR(mr->klm_mr)) {
		err = PTR_ERR(mr->klm_mr);
		goto err_destroy_psv;
	}
	mr->mtt_mr = mlx5_ib_alloc_pi_mr(pd, max_num_sg, max_num_meta_sg,
					 sizeof(struct mlx5_mtt),
					 MLX5_MKC_ACCESS_MODE_MTT);
	if (IS_ERR(mr->mtt_mr)) {
		err = PTR_ERR(mr->mtt_mr);
		goto err_free_klm_mr;
	}

	/* Set bsf descriptors for mkey */
	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, bsf_en, 1);
	MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);

	err = _mlx5_alloc_mkey_descs(pd, mr, 4, sizeof(struct mlx5_klm), 0,
				     MLX5_MKC_ACCESS_MODE_KLMS, in, inlen);
	if (err)
		goto err_free_mtt_mr;

	err = xa_err(xa_store(&dev->sig_mrs, mlx5_base_mkey(mr->mmkey.key),
			      mr->sig, GFP_KERNEL));
	if (err)
		goto err_free_descs;
	return 0;

err_free_descs:
	destroy_mkey(dev, mr);
	mlx5_free_priv_descs(mr);
err_free_mtt_mr:
	mlx5_ib_dereg_mr(&mr->mtt_mr->ibmr, NULL);
err_free_klm_mr:
	mlx5_ib_dereg_mr(&mr->klm_mr->ibmr, NULL);
err_destroy_psv:
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_memory.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
			     mr->sig->psv_memory.psv_idx);
	if (mlx5_core_destroy_psv(dev->mdev, mr->sig->psv_wire.psv_idx))
		mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
			     mr->sig->psv_wire.psv_idx);
err_free_sig:
	kfree(mr->sig);

	return err;
}
static struct ib_mr *__mlx5_ib_alloc_mr(struct ib_pd *pd,
					enum ib_mr_type mr_type, u32 max_num_sg,
					u32 max_num_meta_sg)
{
	struct mlx5_ib_dev *dev = to_mdev(pd->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	int ndescs = ALIGN(max_num_sg, 4);
	struct mlx5_ib_mr *mr;
	u32 *in;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto err_free;
	}

	mr->ibmr.device = pd->device;

	switch (mr_type) {
	case IB_MR_TYPE_MEM_REG:
		err = mlx5_alloc_mem_reg_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_SG_GAPS:
		err = mlx5_alloc_sg_gaps_descs(pd, mr, ndescs, in, inlen);
		break;
	case IB_MR_TYPE_INTEGRITY:
		err = mlx5_alloc_integrity_descs(pd, mr, max_num_sg,
						 max_num_meta_sg, in, inlen);
		break;
	default:
		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
		err = -EINVAL;
	}

	if (err)
		goto err_free_in;

	kfree(in);
	return &mr->ibmr;

err_free_in:
	kfree(in);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}
struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			       u32 max_num_sg)
{
	return __mlx5_ib_alloc_mr(pd, mr_type, max_num_sg, 0);
}
struct ib_mr *mlx5_ib_alloc_mr_integrity(struct ib_pd *pd,
					 u32 max_num_sg, u32 max_num_meta_sg)
{
	return __mlx5_ib_alloc_mr(pd, IB_MR_TYPE_INTEGRITY, max_num_sg,
				  max_num_meta_sg);
}
int mlx5_ib_alloc_mw(struct ib_mw *ibmw, struct ib_udata *udata)
{
	struct mlx5_ib_dev *dev = to_mdev(ibmw->device);
	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
	struct mlx5_ib_mw *mw = to_mmw(ibmw);
	unsigned int ndescs;
	u32 *in = NULL;
	void *mkc;
	int err;
	struct mlx5_ib_alloc_mw req = {};
	struct {
		__u32	comp_mask;
		__u32	response_length;
	} resp = {};

	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
	if (err)
		return err;

	if (req.comp_mask || req.reserved1 || req.reserved2)
		return -EOPNOTSUPP;

	if (udata->inlen > sizeof(req) &&
	    !ib_is_udata_cleared(udata, sizeof(req),
				 udata->inlen - sizeof(req)))
		return -EOPNOTSUPP;

	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
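	/*
	 * ndescs is rounded up to a multiple of 4 KLM entries (e.g.
	 * num_klms = 5 yields 8, and num_klms = 0 still reserves 4); the same
	 * value is programmed as translations_octword_size below, since each
	 * struct mlx5_klm occupies one 16-byte octword.
	 */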
	in = kzalloc(inlen, GFP_KERNEL);
	if (!in) {
		err = -ENOMEM;
		goto free;
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);

	MLX5_SET(mkc, mkc, free, 1);
	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
	MLX5_SET(mkc, mkc, pd, to_mpd(ibmw->pd)->pdn);
	MLX5_SET(mkc, mkc, umr_en, 1);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_KLMS);
	MLX5_SET(mkc, mkc, en_rinval, !!((ibmw->type == IB_MW_TYPE_2)));
	MLX5_SET(mkc, mkc, qpn, 0xffffff);

	err = mlx5_ib_create_mkey(dev, &mw->mmkey, in, inlen);
	if (err)
		goto free;

	mw->mmkey.type = MLX5_MKEY_MW;
	ibmw->rkey = mw->mmkey.key;
	mw->mmkey.ndescs = ndescs;

	resp.response_length =
		min(offsetofend(typeof(resp), response_length), udata->outlen);
	if (resp.response_length) {
		err = ib_copy_to_udata(udata, &resp, resp.response_length);
		if (err)
			goto free_mkey;
	}

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
		err = mlx5r_store_odp_mkey(dev, &mw->mmkey);
		if (err)
			goto free_mkey;
	}

	kfree(in);
	return 0;

free_mkey:
	mlx5_core_destroy_mkey(dev->mdev, mw->mmkey.key);
free:
	kfree(in);
	return err;
}
int mlx5_ib_dealloc_mw(struct ib_mw *mw)
{
	struct mlx5_ib_dev *dev = to_mdev(mw->device);
	struct mlx5_ib_mw *mmw = to_mmw(mw);

	if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) &&
	    xa_erase(&dev->odp_mkeys, mlx5_base_mkey(mmw->mmkey.key)))
		/*
		 * pagefault_single_data_segment() may be accessing mmw
		 * if the user bound an ODP MR to this MW.
		 */
		mlx5r_deref_wait_odp_mkey(&mmw->mmkey);

	return mlx5_core_destroy_mkey(dev->mdev, mmw->mmkey.key);
}
int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
			    struct ib_mr_status *mr_status)
{
	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
	int ret = 0;

	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
		pr_err("Invalid status check mask\n");
		ret = -EINVAL;
		goto done;
	}

	mr_status->fail_status = 0;
	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
		if (!mmr->sig) {
			ret = -EINVAL;
			pr_err("signature status check requested on a non-signature enabled MR\n");
			goto done;
		}

		mmr->sig->sig_status_checked = true;
		if (!mmr->sig->sig_err_exists)
			goto done;

		if (ibmr->lkey == mmr->sig->err_item.key)
			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
			       sizeof(mr_status->sig_err));
		else {
			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
			mr_status->sig_err.sig_err_offset = 0;
			mr_status->sig_err.key = mmr->sig->err_item.key;
		}

		mmr->sig->sig_err_exists = false;
		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
	}

done:
	return ret;
}
static int
mlx5_ib_map_pa_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			int data_sg_nents, unsigned int *data_sg_offset,
			struct scatterlist *meta_sg, int meta_sg_nents,
			unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	unsigned int sg_offset = 0;
	int n = 0;

	mr->meta_length = 0;
	if (data_sg_nents == 1) {
		n++;
		mr->mmkey.ndescs = 1;
		if (data_sg_offset)
			sg_offset = *data_sg_offset;
		mr->data_length = sg_dma_len(data_sg) - sg_offset;
		mr->data_iova = sg_dma_address(data_sg) + sg_offset;
		if (meta_sg_nents == 1) {
			n++;
			mr->meta_ndescs = 1;
			if (meta_sg_offset)
				sg_offset = *meta_sg_offset;
			else
				sg_offset = 0;
			mr->meta_length = sg_dma_len(meta_sg) - sg_offset;
			mr->pi_iova = sg_dma_address(meta_sg) + sg_offset;
		}
		ibmr->length = mr->data_length + mr->meta_length;
	}

	return n;
}
static int
mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
		   struct scatterlist *sgl,
		   unsigned short sg_nents,
		   unsigned int *sg_offset_p,
		   struct scatterlist *meta_sgl,
		   unsigned short meta_sg_nents,
		   unsigned int *meta_sg_offset_p)
{
	struct scatterlist *sg = sgl;
	struct mlx5_klm *klms = mr->descs;
	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
	u32 lkey = mr->ibmr.pd->local_dma_lkey;
	int i, j = 0;

	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
	mr->ibmr.length = 0;

	for_each_sg(sgl, sg, sg_nents, i) {
		if (unlikely(i >= mr->max_descs))
			break;
		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
		klms[i].key = cpu_to_be32(lkey);
		mr->ibmr.length += sg_dma_len(sg) - sg_offset;

		sg_offset = 0;
	}

	if (sg_offset_p)
		*sg_offset_p = sg_offset;

	mr->mmkey.ndescs = i;
	mr->data_length = mr->ibmr.length;

	if (meta_sg_nents) {
		sg = meta_sgl;
		sg_offset = meta_sg_offset_p ? *meta_sg_offset_p : 0;
		for_each_sg(meta_sgl, sg, meta_sg_nents, j) {
			if (unlikely(i + j >= mr->max_descs))
				break;
			klms[i + j].va = cpu_to_be64(sg_dma_address(sg) +
						     sg_offset);
			klms[i + j].bcount = cpu_to_be32(sg_dma_len(sg) -
							 sg_offset);
			klms[i + j].key = cpu_to_be32(lkey);
			mr->ibmr.length += sg_dma_len(sg) - sg_offset;

			sg_offset = 0;
		}
		if (meta_sg_offset_p)
			*meta_sg_offset_p = sg_offset;

		mr->meta_ndescs = j;
		mr->meta_length = mr->ibmr.length - mr->data_length;
	}

	return i + j;
}
static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}
static int mlx5_set_page_pi(struct ib_mr *ibmr, u64 addr)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	__be64 *descs;

	if (unlikely(mr->mmkey.ndescs + mr->meta_ndescs == mr->max_descs))
		return -ENOMEM;

	descs = mr->descs;
	descs[mr->mmkey.ndescs + mr->meta_ndescs++] =
		cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);

	return 0;
}
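/*
 * For the integrity path the data and metadata pages share one descriptor
 * array: mlx5_set_page() fills slots [0, mmkey.ndescs) with data pages, while
 * mlx5_set_page_pi() appends metadata pages starting at index mmkey.ndescs.
 */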
static int
mlx5_ib_map_mtt_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->mtt_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	pi_mr->ibmr.page_size = ibmr->page_size;
	n = ib_sg_to_pages(&pi_mr->ibmr, data_sg, data_sg_nents, data_sg_offset,
			   mlx5_set_page);
	if (n != data_sg_nents)
		return n;

	pi_mr->data_iova = pi_mr->ibmr.iova;
	pi_mr->data_length = pi_mr->ibmr.length;
	pi_mr->ibmr.length = pi_mr->data_length;
	ibmr->length = pi_mr->data_length;

	if (meta_sg_nents) {
		u64 page_mask = ~((u64)ibmr->page_size - 1);
		u64 iova = pi_mr->data_iova;

		n += ib_sg_to_pages(&pi_mr->ibmr, meta_sg, meta_sg_nents,
				    meta_sg_offset, mlx5_set_page_pi);

		pi_mr->meta_length = pi_mr->ibmr.length;
		/*
		 * PI address for the HW is the offset of the metadata address
		 * relative to the first data page address.
		 * It equals to first data page address + size of data pages +
		 * metadata offset at the first metadata page
		 */
		pi_mr->pi_iova = (iova & page_mask) +
				 pi_mr->mmkey.ndescs * ibmr->page_size +
				 (pi_mr->ibmr.iova & ~page_mask);
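		/*
		 * Worked example (illustrative values only): with a 4096-byte
		 * page size, data_iova = 0x10000, three data pages mapped
		 * (mmkey.ndescs = 3) and the metadata starting at offset 0x200
		 * within its first page, pi_iova = 0x10000 + 3 * 4096 + 0x200
		 * = 0x13200.
		 */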
		/*
		 * In order to use one MTT MR for data and metadata, we register
		 * also the gaps between the end of the data and the start of
		 * the metadata (the sig MR will verify that the HW will access
		 * to right addresses). This mapping is safe because we use
		 * internal mkey for the registration.
		 */
		pi_mr->ibmr.length = pi_mr->pi_iova + pi_mr->meta_length - iova;
		pi_mr->ibmr.iova = iova;
		ibmr->length += pi_mr->meta_length;
	}

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	return n;
}
static int
mlx5_ib_map_klm_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = mr->klm_mr;
	int n;

	pi_mr->mmkey.ndescs = 0;
	pi_mr->meta_ndescs = 0;
	pi_mr->meta_length = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, pi_mr->desc_map,
				   pi_mr->desc_size * pi_mr->max_descs,
				   DMA_TO_DEVICE);

	n = mlx5_ib_sg_to_klms(pi_mr, data_sg, data_sg_nents, data_sg_offset,
			       meta_sg, meta_sg_nents, meta_sg_offset);

	ib_dma_sync_single_for_device(ibmr->device, pi_mr->desc_map,
				      pi_mr->desc_size * pi_mr->max_descs,
				      DMA_TO_DEVICE);

	/* This is zero-based memory region */
	pi_mr->data_iova = 0;
	pi_mr->ibmr.iova = 0;
	pi_mr->pi_iova = pi_mr->data_length;
	ibmr->length = pi_mr->ibmr.length;

	return n;
}
int mlx5_ib_map_mr_sg_pi(struct ib_mr *ibmr, struct scatterlist *data_sg,
			 int data_sg_nents, unsigned int *data_sg_offset,
			 struct scatterlist *meta_sg, int meta_sg_nents,
			 unsigned int *meta_sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	struct mlx5_ib_mr *pi_mr = NULL;
	int n;

	WARN_ON(ibmr->type != IB_MR_TYPE_INTEGRITY);

	mr->mmkey.ndescs = 0;
	mr->data_length = 0;
	mr->meta_ndescs = 0;

	/*
	 * As a performance optimization, if possible, there is no need to
	 * perform UMR operation to register the data/metadata buffers.
	 * First try to map the sg lists to PA descriptors with local_dma_lkey.
	 * Fallback to UMR only in case of a failure.
	 */
	n = mlx5_ib_map_pa_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				    data_sg_offset, meta_sg, meta_sg_nents,
				    meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;
	/*
	 * As a performance optimization, if possible, there is no need to map
	 * the sg lists to KLM descriptors. First try to map the sg lists to MTT
	 * descriptors and fallback to KLM only in case of a failure.
	 * It's more efficient for the HW to work with MTT descriptors
	 * (especially in high load).
	 * Use KLM (indirect access) only if it's mandatory.
	 */
	pi_mr = mr->mtt_mr;
	n = mlx5_ib_map_mtt_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (n == data_sg_nents + meta_sg_nents)
		goto out;

	pi_mr = mr->klm_mr;
	n = mlx5_ib_map_klm_mr_sg_pi(ibmr, data_sg, data_sg_nents,
				     data_sg_offset, meta_sg, meta_sg_nents,
				     meta_sg_offset);
	if (unlikely(n != data_sg_nents + meta_sg_nents))
		return -ENOMEM;

out:
	/* This is zero-based memory region */
	ibmr->iova = 0;
	mr->pi_mr = pi_mr;
	if (pi_mr)
		ibmr->sig_attrs->meta_length = pi_mr->meta_length;
	else
		ibmr->sig_attrs->meta_length = mr->meta_length;

	return 0;
}
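/*
 * mlx5_ib_map_mr_sg_pi() above is the mlx5 hook behind ib_map_mr_sg_pi();
 * callers of the kernel integrity API typically map the data/metadata SG
 * lists with it and then post a registration work request for the MR before
 * issuing I/O that is verified against the MR's signature attributes.
 */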
int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
		      unsigned int *sg_offset)
{
	struct mlx5_ib_mr *mr = to_mmr(ibmr);
	int n;

	mr->mmkey.ndescs = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
				   mr->desc_size * mr->max_descs,
				   DMA_TO_DEVICE);

	if (mr->access_mode == MLX5_MKC_ACCESS_MODE_KLMS)
		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset, NULL, 0,
				       NULL);
	else
		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
				   mlx5_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
				      mr->desc_size * mr->max_descs,