/*
 * Copyright (c) 2005 Topspin Communications. All rights reserved.
 * Copyright (c) 2005 Cisco Systems. All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/count_zeros.h>
#include <rdma/ib_umem_odp.h>

#include "uverbs.h"
/* Unpin and release the pages backing a umem, DMA unmapping them first if
 * they were mapped to the device.
 */
static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
        struct sg_page_iter sg_iter;
        struct page *page;

        if (umem->nmap > 0)
                ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
                                DMA_BIDIRECTIONAL);

        for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
                page = sg_page_iter_page(&sg_iter);
                unpin_user_pages_dirty_lock(&page, 1, umem->writable && dirty);
        }

        sg_free_table(&umem->sg_head);
}
/**
 * ib_umem_find_best_pgsz - Find best HW page size to use for this MR
 *
 * @umem: umem struct
 * @pgsz_bitmap: bitmap of HW supported page sizes
 * @virt: IOVA
 *
 * This helper is intended for HW that supports multiple page
 * sizes but can do only a single page size in an MR.
 *
 * Returns 0 if the umem requires page sizes not supported by
 * the driver to be mapped. Drivers always supporting PAGE_SIZE
 * or smaller will never see a 0 result.
 */
unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
                                     unsigned long pgsz_bitmap,
                                     unsigned long virt)
{
        struct scatterlist *sg;
        unsigned long va, pgoff;
        dma_addr_t mask;
        int i;

        if (umem->is_odp) {
                unsigned int page_size = BIT(to_ib_umem_odp(umem)->page_shift);

                /* ODP must always be self consistent. */
                if (!(pgsz_bitmap & page_size))
                        return 0;
                return page_size;
        }

        /* rdma_for_each_block() has a bug if the page size is smaller than the
         * page size used to build the umem. For now prevent smaller page sizes
         * from being returned.
         */
        pgsz_bitmap &= GENMASK(BITS_PER_LONG - 1, PAGE_SHIFT);

        /* At minimum, drivers must support PAGE_SIZE or smaller */
        if (WARN_ON(!(pgsz_bitmap & GENMASK(PAGE_SHIFT, 0))))
                return 0;

        umem->iova = va = virt;
        /* The best result is the smallest page size that results in the minimum
         * number of required pages. Compute the largest page size that could
         * work based on VA address bits that don't change.
         */
        mask = pgsz_bitmap &
               GENMASK(BITS_PER_LONG - 1,
                       bits_per((umem->length - 1 + virt) ^ virt));
        /* offset into first SGL */
        pgoff = umem->address & ~PAGE_MASK;

        for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
                /* Walk SGL and reduce max page size if VA/PA bits differ
                 * for any address.
                 */
                mask |= (sg_dma_address(sg) + pgoff) ^ va;
                va += sg_dma_len(sg) - pgoff;
                /* Except for the last entry, the ending iova alignment sets
                 * the maximum possible page size as the low bits of the iova
                 * must be zero when starting the next chunk.
                 */
                if (i != (umem->nmap - 1))
                        mask |= va;
                pgoff = 0;
        }

        /* The mask accumulates 1's in each position where the VA and physical
         * address differ, thus the length of trailing 0 is the largest page
         * size that can pass the VA through to the physical.
         */
        if (mask)
                pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0);
        return rounddown_pow_of_two(pgsz_bitmap);
}
EXPORT_SYMBOL(ib_umem_find_best_pgsz);
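
/*
 * Usage sketch (illustrative only, not part of the upstream file): a driver
 * whose HW can back an MR with 4K, 2M or 1G pages might choose the page size
 * for a freshly mapped umem like this. The SZ_* bitmap, "iova" and
 * "mr->page_shift" below are assumptions made purely for the example.
 *
 *      unsigned long pgsz;
 *
 *      pgsz = ib_umem_find_best_pgsz(umem, SZ_4K | SZ_2M | SZ_1G, iova);
 *      if (!pgsz)
 *              return -EINVAL;
 *      mr->page_shift = order_base_2(pgsz);
 *
 * A zero result means the buffer layout cannot be expressed with any of the
 * offered page sizes; as documented above, bitmaps that include PAGE_SIZE or
 * smaller never hit this case.
 */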
/**
 * ib_umem_get - Pin and DMA map userspace memory.
 *
 * @device: IB device to connect UMEM
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 */
struct ib_umem *ib_umem_get(struct ib_device *device, unsigned long addr,
                            size_t size, int access)
{
        struct ib_umem *umem;
        struct page **page_list;
        unsigned long lock_limit;
        unsigned long new_pinned;
        unsigned long cur_base;
        unsigned long dma_attr = 0;
        struct mm_struct *mm;
        unsigned long npages;
        int ret;
        struct scatterlist *sg = NULL;
        unsigned int gup_flags = FOLL_WRITE;

        /*
         * If the combination of the addr and size requested for this memory
         * region causes an integer overflow, return error.
         */
        if (((addr + size) < addr) ||
            PAGE_ALIGN(addr + size) < (addr + size))
                return ERR_PTR(-EINVAL);

        if (!can_do_mlock())
                return ERR_PTR(-EPERM);

        if (access & IB_ACCESS_ON_DEMAND)
                return ERR_PTR(-EOPNOTSUPP);

        umem = kzalloc(sizeof(*umem), GFP_KERNEL);
        if (!umem)
                return ERR_PTR(-ENOMEM);
        umem->ibdev    = device;
        umem->length   = size;
        umem->address  = addr;
        /*
         * Drivers should call ib_umem_find_best_pgsz() to set the iova
         * correctly.
         */
        umem->iova     = addr;
        umem->writable = ib_access_writable(access);
        umem->owning_mm = mm = current->mm;
        mmgrab(mm);

        page_list = (struct page **) __get_free_page(GFP_KERNEL);
        if (!page_list) {
                ret = -ENOMEM;
                goto umem_kfree;
        }

        npages = ib_umem_num_pages(umem);
        if (npages == 0 || npages > UINT_MAX) {
                ret = -EINVAL;
                goto out;
        }

        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;

        new_pinned = atomic64_add_return(npages, &mm->pinned_vm);
        if (new_pinned > lock_limit && !capable(CAP_IPC_LOCK)) {
                atomic64_sub(npages, &mm->pinned_vm);
                ret = -ENOMEM;
                goto out;
        }

        cur_base = addr & PAGE_MASK;

        if (!umem->writable)
                gup_flags |= FOLL_FORCE;

        while (npages) {
                cond_resched();
                ret = pin_user_pages_fast(cur_base,
                                          min_t(unsigned long, npages,
                                                PAGE_SIZE /
                                                sizeof(struct page *)),
                                          gup_flags | FOLL_LONGTERM, page_list);
                if (ret < 0)
                        goto umem_release;

                cur_base += ret * PAGE_SIZE;
                npages -= ret;
                sg = __sg_alloc_table_from_pages(&umem->sg_head, page_list, ret,
                                0, ret << PAGE_SHIFT,
                                ib_dma_max_seg_size(device), sg, npages,
                                GFP_KERNEL);
                umem->sg_nents = umem->sg_head.nents;
                if (IS_ERR(sg)) {
                        unpin_user_pages_dirty_lock(page_list, ret, 0);
                        ret = PTR_ERR(sg);
                        goto umem_release;
                }
        }

        if (access & IB_ACCESS_RELAXED_ORDERING)
                dma_attr |= DMA_ATTR_WEAK_ORDERING;

        umem->nmap =
                ib_dma_map_sg_attrs(device, umem->sg_head.sgl, umem->sg_nents,
                                    DMA_BIDIRECTIONAL, dma_attr);

        if (!umem->nmap) {
                ret = -ENOMEM;
                goto umem_release;
        }

        ret = 0;
        goto out;

umem_release:
        __ib_umem_release(device, umem, 0);
        atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
out:
        free_page((unsigned long) page_list);
umem_kfree:
        if (ret) {
                mmdrop(umem->owning_mm);
                kfree(umem);
        }
        return ret ? ERR_PTR(ret) : umem;
}
EXPORT_SYMBOL(ib_umem_get);
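
/*
 * Usage sketch (illustrative only, not part of the upstream file): the usual
 * pattern in a driver's reg_user_mr path is to pin the buffer first and
 * release it on any later failure. "pd", "start", "length" and "access_flags"
 * follow the reg_user_mr() verb arguments; the error label and "ret" are
 * assumptions for the example.
 *
 *      struct ib_umem *umem;
 *
 *      umem = ib_umem_get(pd->device, start, length, access_flags);
 *      if (IS_ERR(umem))
 *              return ERR_CAST(umem);
 *      ...
 * err_release:
 *      ib_umem_release(umem);
 *      return ERR_PTR(ret);
 */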
/**
 * ib_umem_release - release memory pinned with ib_umem_get
 * @umem: umem struct to release
 */
void ib_umem_release(struct ib_umem *umem)
{
        if (!umem)
                return;
        if (umem->is_odp)
                return ib_umem_odp_release(to_ib_umem_odp(umem));

        __ib_umem_release(umem->ibdev, umem, 1);

        atomic64_sub(ib_umem_num_pages(umem), &umem->owning_mm->pinned_vm);
        mmdrop(umem->owning_mm);
        kfree(umem);
}
EXPORT_SYMBOL(ib_umem_release);
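
/*
 * Usage sketch (illustrative only, not part of the upstream file):
 * ib_umem_release() is the teardown pair of ib_umem_get() and accepts a NULL
 * umem, so a driver's dereg path can call it unconditionally once the HW can
 * no longer access the pages. "mr" is an assumption for the example.
 *
 *      ib_umem_release(mr->umem);
 *      mr->umem = NULL;
 */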
/*
 * Copy from the given ib_umem's pages to the given buffer.
 *
 * umem - the umem to copy from
 * offset - offset to start copying from
 * dst - destination buffer
 * length - buffer length
 *
 * Returns 0 on success, or an error code.
 */
int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
                      size_t length)
{
        size_t end = offset + length;
        int ret;

        if (offset > umem->length || length > umem->length - offset) {
                pr_err("ib_umem_copy_from not in range. offset: %zd umem length: %zd end: %zd\n",
                       offset, umem->length, end);
                return -EINVAL;
        }

        ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->sg_nents, dst, length,
                                 offset + ib_umem_offset(umem));

        if (ret < 0)
                return ret;
        else if (ret != length)
                return -EINVAL;
        else
                return 0;
}
EXPORT_SYMBOL(ib_umem_copy_from);
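
/*
 * Usage sketch (illustrative only, not part of the upstream file): pulling a
 * small, driver-defined header out of a pinned region, e.g. to validate
 * user-built metadata before programming HW. The 64-byte header size is an
 * assumption for the example.
 *
 *      u8 hdr[64];
 *      int ret;
 *
 *      if (umem->length < sizeof(hdr))
 *              return -EINVAL;
 *      ret = ib_umem_copy_from(hdr, umem, 0, sizeof(hdr));
 *      if (ret)
 *              return ret;
 */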