/*
 * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
 * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/slab.h>
#include <rdma/ib_user_verbs.h>

#include "mlx4_ib.h"

static u32 convert_access(int acc)
{
	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC       : 0) |
	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX4_PERM_REMOTE_WRITE : 0) |
	       (acc & IB_ACCESS_REMOTE_READ   ? MLX4_PERM_REMOTE_READ  : 0) |
	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX4_PERM_LOCAL_WRITE  : 0) |
	       (acc & IB_ACCESS_MW_BIND       ? MLX4_PERM_BIND_MW      : 0) |
	       MLX4_PERM_LOCAL_READ;
}

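/*
 * Illustrative note (not part of the original source): for example,
 * convert_access(IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ) yields
 * MLX4_PERM_LOCAL_WRITE | MLX4_PERM_REMOTE_READ | MLX4_PERM_LOCAL_READ;
 * local read permission is always granted, whatever flags are passed in.
 */
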
static enum mlx4_mw_type to_mlx4_type(enum ib_mw_type type)
{
	switch (type) {
	case IB_MW_TYPE_1:	return MLX4_MW_TYPE_1;
	case IB_MW_TYPE_2:	return MLX4_MW_TYPE_2;
	default:		return -1;
	}
}

struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
{
	struct mlx4_ib_mr *mr;
	int err;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0,
			    ~0ull, convert_access(acc), 0, 0, &mr->mmr);
	if (err)
		goto err_free;

	err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr);
	if (err)
		goto err_mr;

	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_mr:
	(void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

enum {
	MLX4_MAX_MTT_SHIFT = 31
};

static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev,
					struct mlx4_mtt *mtt,
					u64 mtt_size, u64 mtt_shift, u64 len,
					u64 cur_start_addr, u64 *pages,
					int *start_index, int *npages)
{
	u64 cur_end_addr = cur_start_addr + len;
	u64 cur_end_addr_aligned = 0;
	u64 mtt_entries;
	int err = 0;
	int k;

	len += (cur_start_addr & (mtt_size - 1ULL));
	cur_end_addr_aligned = round_up(cur_end_addr, mtt_size);
	len += (cur_end_addr_aligned - cur_end_addr);
	if (len & (mtt_size - 1ULL)) {
		pr_warn("write_block: len %llx is not aligned to mtt_size %llx\n",
			len, mtt_size);
		return -EINVAL;
	}

	mtt_entries = (len >> mtt_shift);

	/*
	 * Align the MTT start address to the mtt_size.
	 * Required to handle cases when the MR starts in the middle of an MTT
	 * record. Was not required in old code since the physical addresses
	 * provided by the dma subsystem were page aligned, which was also the
	 * MTT size.
	 */
	cur_start_addr = round_down(cur_start_addr, mtt_size);
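
	/*
	 * Illustrative note (not part of the original source): with a 64 KB
	 * mtt_size, a block starting at 0x1000 with len 0x3000 is widened to
	 * the aligned range [0x0, 0x10000), i.e. len becomes 0x10000 and a
	 * single MTT entry covers it.
	 */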
	/* A new block is started ... */
	for (k = 0; k < mtt_entries; ++k) {
		pages[*npages] = cur_start_addr + (mtt_size * k);
		(*npages)++;
		/*
		 * Be friendly to mlx4_write_mtt() and pass it chunks of
		 * appropriate size.
		 */
		if (*npages == PAGE_SIZE / sizeof(u64)) {
			err = mlx4_write_mtt(dev->dev, mtt, *start_index,
					     *npages, pages);
			if (err)
				return err;

			(*start_index) += *npages;
			*npages = 0;
		}
	}

	return 0;
}

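/*
 * Illustrative note (not part of the original source): the scratch buffer
 * passed in as 'pages' is one page, so on a 4 KB-page system it holds
 * PAGE_SIZE / sizeof(u64) = 512 MTT entries; mlx4_write_mtt() is therefore
 * called once per 512 accumulated entries, plus once more by the caller for
 * any remainder.
 */
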
static inline u64 alignment_of(u64 ptr)
{
	return ilog2(ptr & (~(ptr - 1)));
}

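/*
 * Illustrative note (not part of the original source): ptr & ~(ptr - 1)
 * isolates the lowest set bit, so alignment_of(0x3000) == 12 and
 * alignment_of(0x10000) == 16, i.e. the number of trailing zero bits in ptr.
 */
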
static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start,
				       u64 current_block_end,
				       u64 block_shift)
{
	/* Check whether the alignment of the new block is aligned as well as
	 * the previous block.
	 * The block address must start with zeros up to the size of
	 * entity_size.
	 */
	if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0)
		/*
		 * It is not as well aligned as the previous block - reduce
		 * the mtt size accordingly. Here we take the last right bit
		 * which is 1.
		 */
		block_shift = alignment_of(next_block_start);

	/*
	 * Check whether the alignment of the end of the previous block is as
	 * good as the alignment of the start of the block.
	 */
	if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0)
		/*
		 * It is not as well aligned as the start of the block -
		 * reduce the mtt size accordingly.
		 */
		block_shift = alignment_of(current_block_end);

	return block_shift;
}

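/*
 * Illustrative note (not part of the original source): if the running
 * block_shift is 20 (1 MB chunks) but the next block starts at an address
 * that is only 4 KB aligned, the checks above collapse block_shift to 12,
 * since a larger MTT entry could no longer describe the discontinuity.
 */
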
int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
			   struct ib_umem *umem)
{
	u64 *pages;
	u64 len = 0;
	int err = 0;
	u64 mtt_size;
	u64 cur_start_addr = 0;
	u64 mtt_shift;
	int start_index = 0;
	int npages = 0;
	struct scatterlist *sg;
	int i;

	pages = (u64 *) __get_free_page(GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	mtt_shift = mtt->page_shift;
	mtt_size = 1ULL << mtt_shift;

	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
		if (cur_start_addr + len == sg_dma_address(sg)) {
			/* still the same block */
			len += sg_dma_len(sg);
			continue;
		}
		/*
		 * A new block is started ...
		 * If len is malaligned, write an extra mtt entry to cover the
		 * misaligned area (round up the division)
		 */
		err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size,
						   mtt_shift, len,
						   cur_start_addr,
						   pages, &start_index,
						   &npages);
		if (err)
			goto out;

		cur_start_addr = sg_dma_address(sg);
		len = sg_dma_len(sg);
	}

	/* Handle the last block */
	if (len > 0) {
		/*
		 * If len is malaligned, write an extra mtt entry to cover
		 * the misaligned area (round up the division)
		 */
		err = mlx4_ib_umem_write_mtt_block(dev, mtt, mtt_size,
						   mtt_shift, len,
						   cur_start_addr, pages,
						   &start_index, &npages);
		if (err)
			goto out;
	}

	if (npages)
		err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages);

out:
	free_page((unsigned long) pages);
	return err;
}

/*
 * Calculate optimal mtt size based on contiguous pages.
 * The function also returns the number of pages that are not aligned to the
 * calculated mtt_size, to be added to the total number of pages. For that we
 * should check the first chunk length and the last chunk length; if either is
 * not aligned to mtt_size, we should increment the non_aligned_pages number.
 * All chunks in the middle are already handled as part of the mtt shift
 * calculation for both their start and end addresses.
 */
int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem, u64 start_va,
				       int *num_of_mtts)
{
	u64 block_shift = MLX4_MAX_MTT_SHIFT;
	u64 min_shift = umem->page_shift;
	u64 last_block_aligned_end = 0;
	u64 current_block_start = 0;
	u64 first_block_start = 0;
	u64 current_block_len = 0;
	u64 last_block_end = 0;
	struct scatterlist *sg;
	u64 current_block_end;
	u64 misalignment_bits;
	u64 next_block_start;
	u64 total_len = 0;
	int i;

	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
		/*
		 * Initialization - save the first chunk start as the
		 * current_block_start - block means contiguous pages.
		 */
		if (current_block_len == 0 && current_block_start == 0) {
			current_block_start = sg_dma_address(sg);
			first_block_start = current_block_start;
			/*
			 * Find the bits that are different between the
			 * physical address and the virtual address for the
			 * start of the MR.
			 * umem_get aligned the start_va to a page boundary.
			 * Therefore, we need to align the start va to the same
			 * boundary.
			 * misalignment_bits is needed to handle the case of a
			 * single memory region. In this case, the rest of the
			 * logic will not reduce the block size. If we use a
			 * block size which is bigger than the alignment of the
			 * misalignment bits, we might use the virtual page
			 * number instead of the physical page number,
			 * resulting in access to the wrong data.
			 */
			misalignment_bits =
			(start_va & (~(((u64)(BIT(umem->page_shift))) - 1ULL)))
			^ current_block_start;
			block_shift = min(alignment_of(misalignment_bits),
					  block_shift);
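
			/*
			 * Illustrative note (not part of the original
			 * source): if the page-aligned start_va and the first
			 * physical block address first differ at bit 21,
			 * block_shift is capped at 21 here; if they already
			 * differed at bit 12, it drops to the page shift.
			 */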
		}

		/*
		 * Go over the scatter entries and check if they continue the
		 * previous scatter entry.
		 */
		next_block_start = sg_dma_address(sg);
		current_block_end = current_block_start + current_block_len;
		/* If we have a split (non-contig.) between two blocks */
		if (current_block_end != next_block_start) {
			block_shift = mlx4_ib_umem_calc_block_mtt
					(next_block_start,
					 current_block_end,
					 block_shift);

			/*
			 * If we reached the minimum shift for a 4k page, we
			 * stop the loop.
			 */
			if (block_shift <= min_shift)
				goto end;

			/*
			 * If not saved yet, we are in the first block - save
			 * the length of the first block to calculate the
			 * non_aligned_pages number at the end.
			 */
			total_len += current_block_len;

			/* Start a new block */
			current_block_start = next_block_start;
			current_block_len = sg_dma_len(sg);
			continue;
		}
		/* The scatter entry is another part of the current block,
		 * increase the block size.
		 * An entry in the scatter list can be larger than 4k (a page)
		 * because the dma mapping may merge some blocks together.
		 */
		current_block_len += sg_dma_len(sg);
	}

	/* Account for the last block in the total len */
	total_len += current_block_len;
	/* Add to the first block the misalignment that it suffers from. */
	total_len += (first_block_start & ((1ULL << block_shift) - 1ULL));
	last_block_end = current_block_start + current_block_len;
	last_block_aligned_end = round_up(last_block_end, 1 << block_shift);
	total_len += (last_block_aligned_end - last_block_end);

	if (total_len & ((1ULL << block_shift) - 1ULL))
		pr_warn("misaligned total length detected (%llu, %llu)!",
			total_len, block_shift);

	*num_of_mtts = total_len >> block_shift;
end:
	if (block_shift < min_shift) {
		/*
		 * If the shift is less than the minimum, set a warning and
		 * return the minimum shift.
		 */
		pr_warn("umem_calc_optimal_mtt_size - unexpected shift %lld\n",
			block_shift);

		block_shift = min_shift;
	}
	return block_shift;
}

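/*
 * Illustrative note (not part of the original source): for a physically
 * contiguous buffer whose physical start agrees with the page-aligned virtual
 * start in its low 21 bits, block_shift never drops below 21, so a 2 MB
 * registration collapses into a single large MTT entry instead of 512
 * separate 4 KB entries.
 */
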
struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
				  u64 virt_addr, int access_flags,
				  struct ib_udata *udata)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_mr *mr;
	int shift;
	int err;
	int n;

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	/* Force registering the memory as writable. */
	/* Used for memory re-registration. The HCA protects the access. */
	mr->umem = ib_umem_get(pd->uobject->context, start, length,
			       access_flags | IB_ACCESS_LOCAL_WRITE, 0);
	if (IS_ERR(mr->umem)) {
		err = PTR_ERR(mr->umem);
		goto err_free;
	}

	n = ib_umem_page_count(mr->umem);
	shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start, &n);

	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
			    convert_access(access_flags), n, shift, &mr->mmr);
	if (err)
		goto err_umem;

	err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
	if (err)
		goto err_mr;

	err = mlx4_mr_enable(dev->dev, &mr->mmr);
	if (err)
		goto err_mr;

	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;

	return &mr->ibmr;

err_mr:
	(void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);

err_umem:
	ib_umem_release(mr->umem);

err_free:
	kfree(mr);

	return ERR_PTR(err);
}

int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
			  u64 start, u64 length, u64 virt_addr,
			  int mr_access_flags, struct ib_pd *pd,
			  struct ib_udata *udata)
{
	struct mlx4_ib_dev *dev = to_mdev(mr->device);
	struct mlx4_ib_mr *mmr = to_mmr(mr);
	struct mlx4_mpt_entry *mpt_entry;
	struct mlx4_mpt_entry **pmpt_entry = &mpt_entry;
	int err;

	/* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs,
	 * we assume that the calls can't run concurrently. Otherwise, a
	 * race exists.
	 */
	err = mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry);
	if (err)
		return err;

	if (flags & IB_MR_REREG_PD) {
		err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry,
					   to_mpd(pd)->pdn);
		if (err)
			goto release_mpt_entry;
	}

	if (flags & IB_MR_REREG_ACCESS) {
		err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry,
					       convert_access(mr_access_flags));
		if (err)
			goto release_mpt_entry;
	}

	if (flags & IB_MR_REREG_TRANS) {
		int shift;
		int n;

		mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
		ib_umem_release(mmr->umem);
		mmr->umem = ib_umem_get(mr->uobject->context, start, length,
					mr_access_flags |
					IB_ACCESS_LOCAL_WRITE,
					0);
		if (IS_ERR(mmr->umem)) {
			err = PTR_ERR(mmr->umem);
			/* Prevent mlx4_ib_dereg_mr from free'ing invalid pointer */
			mmr->umem = NULL;
			goto release_mpt_entry;
		}
		n = ib_umem_page_count(mmr->umem);
		shift = mmr->umem->page_shift;

		err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr,
					      virt_addr, length, n, shift,
					      *pmpt_entry);
		if (err) {
			ib_umem_release(mmr->umem);
			goto release_mpt_entry;
		}
		mmr->mmr.iova = virt_addr;
		mmr->mmr.size = length;

		err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem);
		if (err) {
			mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
			ib_umem_release(mmr->umem);
			goto release_mpt_entry;
		}
	}

	/* If we couldn't transfer the MR to the HCA, just remember to
	 * return a failure. But dereg_mr will free the resources.
	 */
	err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry);
	if (!err && flags & IB_MR_REREG_ACCESS)
		mmr->mmr.access = mr_access_flags;

release_mpt_entry:
	mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry);

	return err;
}

static int
mlx4_alloc_priv_pages(struct ib_device *device,
		      struct mlx4_ib_mr *mr,
		      int max_pages)
{
	int ret;

	/* Ensure that size is aligned to DMA cacheline
	 * requirements.
	 * max_pages is limited to MLX4_MAX_FAST_REG_PAGES
	 * so page_map_size will never cross PAGE_SIZE.
	 */
	mr->page_map_size = roundup(max_pages * sizeof(u64),
				    MLX4_MR_PAGES_ALIGN);
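
	/*
	 * Illustrative note (not part of the original source): with
	 * MLX4_MAX_FAST_REG_PAGES pages of 8 bytes each (511 * 8 = 4088 bytes
	 * at the time of writing), the rounded-up page_map_size stays within
	 * one 4 KB page, which the single get_zeroed_page() below relies on.
	 */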
	/* Prevent cross page boundary allocation. */
	mr->pages = (__be64 *)get_zeroed_page(GFP_KERNEL);
	if (!mr->pages)
		return -ENOMEM;

	mr->page_map = dma_map_single(device->dev.parent, mr->pages,
				      mr->page_map_size, DMA_TO_DEVICE);

	if (dma_mapping_error(device->dev.parent, mr->page_map)) {
		ret = -ENOMEM;
		goto err;
	}

	return 0;

err:
	free_page((unsigned long)mr->pages);
	return ret;
}

static void
mlx4_free_priv_pages(struct mlx4_ib_mr *mr)
{
	if (mr->pages) {
		struct ib_device *device = mr->ibmr.device;

		dma_unmap_single(device->dev.parent, mr->page_map,
				 mr->page_map_size, DMA_TO_DEVICE);
		free_page((unsigned long)mr->pages);
		mr->pages = NULL;
	}
}

int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
{
	struct mlx4_ib_mr *mr = to_mmr(ibmr);
	int ret;

	mlx4_free_priv_pages(mr);

	ret = mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
	if (ret)
		return ret;
	if (mr->umem)
		ib_umem_release(mr->umem);
	kfree(mr);

	return 0;
}

struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
			       struct ib_udata *udata)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_mw *mw;
	int err;

	mw = kmalloc(sizeof(*mw), GFP_KERNEL);
	if (!mw)
		return ERR_PTR(-ENOMEM);

	err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn,
			    to_mlx4_type(type), &mw->mmw);
	if (err)
		goto err_free;

	err = mlx4_mw_enable(dev->dev, &mw->mmw);
	if (err)
		goto err_mw;

	mw->ibmw.rkey = mw->mmw.key;

	return &mw->ibmw;

err_mw:
	mlx4_mw_free(dev->dev, &mw->mmw);

err_free:
	kfree(mw);

	return ERR_PTR(err);
}

int mlx4_ib_dealloc_mw(struct ib_mw *ibmw)
{
	struct mlx4_ib_mw *mw = to_mmw(ibmw);

	mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw);
	kfree(mw);

	return 0;
}

struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
			       enum ib_mr_type mr_type,
			       u32 max_num_sg)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_mr *mr;
	int err;

	if (mr_type != IB_MR_TYPE_MEM_REG ||
	    max_num_sg > MLX4_MAX_FAST_REG_PAGES)
		return ERR_PTR(-EINVAL);

	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr)
		return ERR_PTR(-ENOMEM);

	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0,
			    max_num_sg, 0, &mr->mmr);
	if (err)
		goto err_free;

	err = mlx4_alloc_priv_pages(pd->device, mr, max_num_sg);
	if (err)
		goto err_free_mr;

	mr->max_pages = max_num_sg;
	err = mlx4_mr_enable(dev->dev, &mr->mmr);
	if (err)
		goto err_free_pl;

	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
	mr->umem = NULL;

	return &mr->ibmr;

err_free_pl:
	mr->ibmr.device = pd->device;
	mlx4_free_priv_pages(mr);
err_free_mr:
	(void) mlx4_mr_free(dev->dev, &mr->mmr);
err_free:
	kfree(mr);
	return ERR_PTR(err);
}

struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc,
				 struct ib_fmr_attr *fmr_attr)
{
	struct mlx4_ib_dev *dev = to_mdev(pd->device);
	struct mlx4_ib_fmr *fmr;
	int err;

	fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
	if (!fmr)
		return ERR_PTR(-ENOMEM);

	err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc),
			     fmr_attr->max_pages, fmr_attr->max_maps,
			     fmr_attr->page_shift, &fmr->mfmr);
	if (err)
		goto err_free;

	err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr);
	if (err)
		goto err_mr;

	fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key;

	return &fmr->ibfmr;

err_mr:
	(void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr);

err_free:
	kfree(fmr);

	return ERR_PTR(err);
}

int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
			 int npages, u64 iova)
{
	struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
	struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device);

	return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova,
				 &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
}

int mlx4_ib_unmap_fmr(struct list_head *fmr_list)
{
	struct ib_fmr *ibfmr;
	int err;
	struct mlx4_dev *mdev = NULL;

	list_for_each_entry(ibfmr, fmr_list, list) {
		if (mdev && to_mdev(ibfmr->device)->dev != mdev)
			return -EINVAL;
		mdev = to_mdev(ibfmr->device)->dev;
	}

	if (!mdev)
		return 0;

	list_for_each_entry(ibfmr, fmr_list, list) {
		struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);

		mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
	}

	/*
	 * Make sure all MPT status updates are visible before issuing
	 * SYNC_TPT firmware command.
	 */
	wmb();

	err = mlx4_SYNC_TPT(mdev);
	if (err)
		pr_warn("SYNC_TPT error %d when unmapping FMRs\n", err);

	return 0;
}

int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr)
{
	struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
	struct mlx4_ib_dev *dev = to_mdev(ibfmr->device);
	int err;

	err = mlx4_fmr_free(dev->dev, &ifmr->mfmr);
	if (!err)
		kfree(ifmr);

	return err;
}

static int mlx4_set_page(struct ib_mr *ibmr, u64 addr)
{
	struct mlx4_ib_mr *mr = to_mmr(ibmr);

	if (unlikely(mr->npages == mr->max_pages))
		return -ENOMEM;

	mr->pages[mr->npages++] = cpu_to_be64(addr | MLX4_MTT_FLAG_PRESENT);

	return 0;
}

int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
		      unsigned int *sg_offset)
{
	struct mlx4_ib_mr *mr = to_mmr(ibmr);
	int rc;

	mr->npages = 0;

	ib_dma_sync_single_for_cpu(ibmr->device, mr->page_map,
				   mr->page_map_size, DMA_TO_DEVICE);

	rc = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, mlx4_set_page);

	ib_dma_sync_single_for_device(ibmr->device, mr->page_map,
				      mr->page_map_size, DMA_TO_DEVICE);

	return rc;
}