net/rds/rdma.c

   1 /*
   2  * Copyright (c) 2007, 2020 Oracle and/or its affiliates.
   3  *
   4  * This software is available to you under a choice of one of two
   5  * licenses.  You may choose to be licensed under the terms of the GNU
   6  * General Public License (GPL) Version 2, available from the file
   7  * COPYING in the main directory of this source tree, or the
   8  * OpenIB.org BSD license below:
   9  *
  10  *     Redistribution and use in source and binary forms, with or
  11  *     without modification, are permitted provided that the following
  12  *     conditions are met:
  13  *
  14  *      - Redistributions of source code must retain the above
  15  *        copyright notice, this list of conditions and the following
  16  *        disclaimer.
  17  *
  18  *      - Redistributions in binary form must reproduce the above
  19  *        copyright notice, this list of conditions and the following
  20  *        disclaimer in the documentation and/or other materials
  21  *        provided with the distribution.
  22  *
  23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
  27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
  28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  30  * SOFTWARE.
  31  *
  32  */
  33 #include <linux/pagemap.h>
  34 #include <linux/slab.h>
  35 #include <linux/rbtree.h>
  36 #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */
  37
  38 #include "rds.h"
  39
  40 /*
  41  * XXX
  42  *  - build with sparse
  43  *  - should we detect duplicate keys on a socket?  hmm.
  44  *  - an rdma is an mlock, apply rlimit?
  45  */
  46
  47 /*
  48  * get the number of pages by looking at the page indices that the start and
  49  * end addresses fall in.
  50  *
  51  * Returns 0 if the vec is invalid.  It is invalid if the number of bytes
  52  * causes the address to wrap or overflows an unsigned int.  This comes
  53  * from being stored in the 'length' member of 'struct scatterlist'.
  54  */
  55 static unsigned int rds_pages_in_vec(struct rds_iovec *vec)
  56 {
  57         if ((vec->addr + vec->bytes <= vec->addr) ||
  58             (vec->bytes > (u64)UINT_MAX))
  59                 return 0;
  60
  61         return ((vec->addr + vec->bytes + PAGE_SIZE - 1) >> PAGE_SHIFT) -
  62                 (vec->addr >> PAGE_SHIFT);
  63 }
  64
  65 static struct rds_mr *rds_mr_tree_walk(struct rb_root *root, u64 key,
  66                                        struct rds_mr *insert)
  67 {
  68         struct rb_node **p = &root->rb_node;
  69         struct rb_node *parent = NULL;
  70         struct rds_mr *mr;
  71
  72         while (*p) {
  73                 parent = *p;
  74                 mr = rb_entry(parent, struct rds_mr, r_rb_node);
  75
  76                 if (key < mr->r_key)
  77                         p = &(*p)->rb_left;
  78                 else if (key > mr->r_key)
  79                         p = &(*p)->rb_right;
  80                 else
  81                         return mr;
  82         }
  83
  84         if (insert) {
  85                 rb_link_node(&insert->r_rb_node, parent, p);
  86                 rb_insert_color(&insert->r_rb_node, root);
  87                 kref_get(&insert->r_kref);
  88         }
  89         return NULL;
  90 }
  91
  92 /*
  93  * Destroy the transport-specific part of a MR.
  94  */
  95 static void rds_destroy_mr(struct rds_mr *mr)
  96 {
  97         struct rds_sock *rs = mr->r_sock;
  98         void *trans_private = NULL;
  99         unsigned long flags;
 100
 101         rdsdebug("RDS: destroy mr key is %x refcnt %u\n",
 102                  mr->r_key, kref_read(&mr->r_kref));
 103
 104         spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 105         if (!RB_EMPTY_NODE(&mr->r_rb_node))
 106                 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
 107         trans_private = mr->r_trans_private;
 108         mr->r_trans_private = NULL;
 109         spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 110
 111         if (trans_private)
 112                 mr->r_trans->free_mr(trans_private, mr->r_invalidate);
 113 }
 114
 115 void __rds_put_mr_final(struct kref *kref)
 116 {
 117         struct rds_mr *mr = container_of(kref, struct rds_mr, r_kref);
 118
 119         rds_destroy_mr(mr);
 120         kfree(mr);
 121 }
 122
 123 /*
 124  * By the time this is called we can't have any more ioctls called on
 125  * the socket so we don't need to worry about racing with others.
 126  */
 127 void rds_rdma_drop_keys(struct rds_sock *rs)
 128 {
 129         struct rds_mr *mr;
 130         struct rb_node *node;
 131         unsigned long flags;
 132
 133         /* Release any MRs associated with this socket */
 134         spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 135         while ((node = rb_first(&rs->rs_rdma_keys))) {
 136                 mr = rb_entry(node, struct rds_mr, r_rb_node);
 137                 if (mr->r_trans == rs->rs_transport)
 138                         mr->r_invalidate = 0;
 139                 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
 140                 RB_CLEAR_NODE(&mr->r_rb_node);
 141                 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 142                 kref_put(&mr->r_kref, __rds_put_mr_final);
 143                 spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 144         }
 145         spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 146
 147         if (rs->rs_transport && rs->rs_transport->flush_mrs)
 148                 rs->rs_transport->flush_mrs();
 149 }
 150
 151 /*
 152  * Helper function to pin user pages.
 153  */
 154 static int rds_pin_pages(unsigned long user_addr, unsigned int nr_pages,
 155                         struct page **pages, int write)
 156 {
 157         unsigned int gup_flags = FOLL_LONGTERM;
 158         int ret;
 159
 160         if (write)
 161                 gup_flags |= FOLL_WRITE;
 162
 163         ret = pin_user_pages_fast(user_addr, nr_pages, gup_flags, pages);
 164         if (ret >= 0 && ret < nr_pages) {
 165                 unpin_user_pages(pages, ret);
 166                 ret = -EFAULT;
 167         }
 168
 169         return ret;
 170 }
 171
 172 static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args,
 173                           u64 *cookie_ret, struct rds_mr **mr_ret,
 174                           struct rds_conn_path *cp)
 175 {
 176         struct rds_mr *mr = NULL, *found;
 177         struct scatterlist *sg = NULL;
 178         unsigned int nr_pages;
 179         struct page **pages = NULL;
 180         void *trans_private;
 181         unsigned long flags;
 182         rds_rdma_cookie_t cookie;
 183         unsigned int nents = 0;
 184         int need_odp = 0;
 185         long i;
 186         int ret;
 187
 188         if (ipv6_addr_any(&rs->rs_bound_addr) || !rs->rs_transport) {
 189                 ret = -ENOTCONN; /* XXX not a great errno */
 190                 goto out;
 191         }
 192
 193         if (!rs->rs_transport->get_mr) {
 194                 ret = -EOPNOTSUPP;
 195                 goto out;
 196         }
 197
 198         /* If the combination of the addr and size requested for this memory
 199          * region causes an integer overflow, return error.
 200          */
 201         if (((args->vec.addr + args->vec.bytes) < args->vec.addr) ||
 202             PAGE_ALIGN(args->vec.addr + args->vec.bytes) <
 203                     (args->vec.addr + args->vec.bytes)) {
 204                 ret = -EINVAL;
 205                 goto out;
 206         }
 207
 208         if (!can_do_mlock()) {
 209                 ret = -EPERM;
 210                 goto out;
 211         }
 212
 213         nr_pages = rds_pages_in_vec(&args->vec);
 214         if (nr_pages == 0) {
 215                 ret = -EINVAL;
 216                 goto out;
 217         }
 218
 219         /* Restrict the size of mr irrespective of underlying transport
 220          * To account for unaligned mr regions, subtract one from nr_pages
 221          */
 222         if ((nr_pages - 1) > (RDS_MAX_MSG_SIZE >> PAGE_SHIFT)) {
 223                 ret = -EMSGSIZE;
 224                 goto out;
 225         }
 226
 227         rdsdebug("RDS: get_mr addr %llx len %llu nr_pages %u\n",
 228                 args->vec.addr, args->vec.bytes, nr_pages);
 229
 230         /* XXX clamp nr_pages to limit the size of this alloc? */
 231         pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
 232         if (!pages) {
 233                 ret = -ENOMEM;
 234                 goto out;
 235         }
 236
 237         mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL);
 238         if (!mr) {
 239                 ret = -ENOMEM;
 240                 goto out;
 241         }
 242
 243         kref_init(&mr->r_kref);
 244         RB_CLEAR_NODE(&mr->r_rb_node);
 245         mr->r_trans = rs->rs_transport;
 246         mr->r_sock = rs;
 247
 248         if (args->flags & RDS_RDMA_USE_ONCE)
 249                 mr->r_use_once = 1;
 250         if (args->flags & RDS_RDMA_INVALIDATE)
 251                 mr->r_invalidate = 1;
 252         if (args->flags & RDS_RDMA_READWRITE)
 253                 mr->r_write = 1;
 254
 255         /*
 256          * Pin the pages that make up the user buffer and transfer the page
 257          * pointers to the mr's sg array.  We check to see if we've mapped
 258          * the whole region after transferring the partial page references
 259          * to the sg array so that we can have one page ref cleanup path.
 260          *
 261          * For now we have no flag that tells us whether the mapping is
 262          * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to
 263          * the zero page.
 264          */
 265         ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1);
 266         if (ret == -EOPNOTSUPP) {
 267                 need_odp = 1;
 268         } else if (ret <= 0) {
 269                 goto out;
 270         } else {
 271                 nents = ret;
 272                 sg = kmalloc_array(nents, sizeof(*sg), GFP_KERNEL);
 273                 if (!sg) {
 274                         ret = -ENOMEM;
 275                         goto out;
 276                 }
 277                 WARN_ON(!nents);
 278                 sg_init_table(sg, nents);
 279
 280                 /* Stick all pages into the scatterlist */
 281                 for (i = 0 ; i < nents; i++)
 282                         sg_set_page(&sg[i], pages[i], PAGE_SIZE, 0);
 283
 284                 rdsdebug("RDS: trans_private nents is %u\n", nents);
 285         }
 286         /* Obtain a transport specific MR. If this succeeds, the
 287          * s/g list is now owned by the MR.
 288          * Note that dma_map() implies that pending writes are
 289          * flushed to RAM, so no dma_sync is needed here. */
 290         trans_private = rs->rs_transport->get_mr(
 291                 sg, nents, rs, &mr->r_key, cp ? cp->cp_conn : NULL,
 292                 args->vec.addr, args->vec.bytes,
 293                 need_odp ? ODP_ZEROBASED : ODP_NOT_NEEDED);
 294
 295         if (IS_ERR(trans_private)) {
 296                 /* In ODP case, we don't GUP pages, so don't need
 297                  * to release anything.
 298                  */
 299                 if (!need_odp) {
 300                         unpin_user_pages(pages, nr_pages);
 301                         kfree(sg);
 302                 }
 303                 ret = PTR_ERR(trans_private);
 304                 /* Trigger connection so that its ready for the next retry */
 305                 if (ret == -ENODEV && cp)
 306                         rds_conn_connect_if_down(cp->cp_conn);
 307                 goto out;
 308         }
 309
 310         mr->r_trans_private = trans_private;
 311
 312         rdsdebug("RDS: get_mr put_user key is %x cookie_addr %p\n",
 313                mr->r_key, (void *)(unsigned long) args->cookie_addr);
 314
 315         /* The user may pass us an unaligned address, but we can only
 316          * map page aligned regions. So we keep the offset, and build
 317          * a 64bit cookie containing <R_Key, offset> and pass that
 318          * around. */
 319         if (need_odp)
 320                 cookie = rds_rdma_make_cookie(mr->r_key, 0);
 321         else
 322                 cookie = rds_rdma_make_cookie(mr->r_key,
 323                                               args->vec.addr & ~PAGE_MASK);
 324         if (cookie_ret)
 325                 *cookie_ret = cookie;
 326
 327         if (args->cookie_addr &&
 328             put_user(cookie, (u64 __user *)(unsigned long)args->cookie_addr)) {
 329                 if (!need_odp) {
 330                         unpin_user_pages(pages, nr_pages);
 331                         kfree(sg);
 332                 }
 333                 ret = -EFAULT;
 334                 goto out;
 335         }
 336
 337         /* Inserting the new MR into the rbtree bumps its
 338          * reference count. */
 339         spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 340         found = rds_mr_tree_walk(&rs->rs_rdma_keys, mr->r_key, mr);
 341         spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 342
 343         BUG_ON(found && found != mr);
 344
 345         rdsdebug("RDS: get_mr key is %x\n", mr->r_key);
 346         if (mr_ret) {
 347                 kref_get(&mr->r_kref);
 348                 *mr_ret = mr;
 349         }
 350
 351         ret = 0;
 352 out:
 353         kfree(pages);
 354         if (mr)
 355                 kref_put(&mr->r_kref, __rds_put_mr_final);
 356         return ret;
 357 }
 358
 359 int rds_get_mr(struct rds_sock *rs, sockptr_t optval, int optlen)
 360 {
 361         struct rds_get_mr_args args;
 362
 363         if (optlen != sizeof(struct rds_get_mr_args))
 364                 return -EINVAL;
 365
 366         if (copy_from_sockptr(&args, optval, sizeof(struct rds_get_mr_args)))
 367                 return -EFAULT;
 368
 369         return __rds_rdma_map(rs, &args, NULL, NULL, NULL);
 370 }
 371
 372 int rds_get_mr_for_dest(struct rds_sock *rs, sockptr_t optval, int optlen)
 373 {
 374         struct rds_get_mr_for_dest_args args;
 375         struct rds_get_mr_args new_args;
 376
 377         if (optlen != sizeof(struct rds_get_mr_for_dest_args))
 378                 return -EINVAL;
 379
 380         if (copy_from_sockptr(&args, optval,
 381                            sizeof(struct rds_get_mr_for_dest_args)))
 382                 return -EFAULT;
 383
 384         /*
 385          * Initially, just behave like get_mr().
 386          * TODO: Implement get_mr as wrapper around this
 387          *       and deprecate it.
 388          */
 389         new_args.vec = args.vec;
 390         new_args.cookie_addr = args.cookie_addr;
 391         new_args.flags = args.flags;
 392
 393         return __rds_rdma_map(rs, &new_args, NULL, NULL, NULL);
 394 }
 395
 396 /*
 397  * Free the MR indicated by the given R_Key
 398  */
 399 int rds_free_mr(struct rds_sock *rs, sockptr_t optval, int optlen)
 400 {
 401         struct rds_free_mr_args args;
 402         struct rds_mr *mr;
 403         unsigned long flags;
 404
 405         if (optlen != sizeof(struct rds_free_mr_args))
 406                 return -EINVAL;
 407
 408         if (copy_from_sockptr(&args, optval, sizeof(struct rds_free_mr_args)))
 409                 return -EFAULT;
 410
 411         /* Special case - a null cookie means flush all unused MRs */
 412         if (args.cookie == 0) {
 413                 if (!rs->rs_transport || !rs->rs_transport->flush_mrs)
 414                         return -EINVAL;
 415                 rs->rs_transport->flush_mrs();
 416                 return 0;
 417         }
 418
 419         /* Look up the MR given its R_key and remove it from the rbtree
 420          * so nobody else finds it.
 421          * This should also prevent races with rds_rdma_unuse.
 422          */
 423         spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 424         mr = rds_mr_tree_walk(&rs->rs_rdma_keys, rds_rdma_cookie_key(args.cookie), NULL);
 425         if (mr) {
 426                 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
 427                 RB_CLEAR_NODE(&mr->r_rb_node);
 428                 if (args.flags & RDS_RDMA_INVALIDATE)
 429                         mr->r_invalidate = 1;
 430         }
 431         spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 432
 433         if (!mr)
 434                 return -EINVAL;
 435
 436         kref_put(&mr->r_kref, __rds_put_mr_final);
 437         return 0;
 438 }
 439
 440 /*
 441  * This is called when we receive an extension header that
 442  * tells us this MR was used. It allows us to implement
 443  * use_once semantics
 444  */
 445 void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force)
 446 {
 447         struct rds_mr *mr;
 448         unsigned long flags;
 449         int zot_me = 0;
 450
 451         spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 452         mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
 453         if (!mr) {
 454                 pr_debug("rds: trying to unuse MR with unknown r_key %u!\n",
 455                          r_key);
 456                 spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 457                 return;
 458         }
 459
 460         /* Get a reference so that the MR won't go away before calling
 461          * sync_mr() below.
 462          */
 463         kref_get(&mr->r_kref);
 464
 465         /* If it is going to be freed, remove it from the tree now so
 466          * that no other thread can find it and free it.
 467          */
 468         if (mr->r_use_once || force) {
 469                 rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys);
 470                 RB_CLEAR_NODE(&mr->r_rb_node);
 471                 zot_me = 1;
 472         }
 473         spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 474
 475         /* May have to issue a dma_sync on this memory region.
 476          * Note we could avoid this if the operation was a RDMA READ,
 477          * but at this point we can't tell. */
 478         if (mr->r_trans->sync_mr)
 479                 mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE);
 480
 481         /* Release the reference held above. */
 482         kref_put(&mr->r_kref, __rds_put_mr_final);
 483
 484         /* If the MR was marked as invalidate, this will
 485          * trigger an async flush. */
 486         if (zot_me)
 487                 kref_put(&mr->r_kref, __rds_put_mr_final);
 488 }
 489
 490 void rds_rdma_free_op(struct rm_rdma_op *ro)
 491 {
 492         unsigned int i;
 493
 494         if (ro->op_odp_mr) {
 495                 kref_put(&ro->op_odp_mr->r_kref, __rds_put_mr_final);
 496         } else {
 497                 for (i = 0; i < ro->op_nents; i++) {
 498                         struct page *page = sg_page(&ro->op_sg[i]);
 499
 500                         /* Mark page dirty if it was possibly modified, which
 501                          * is the case for a RDMA_READ which copies from remote
 502                          * to local memory
 503                          */
 504                         unpin_user_pages_dirty_lock(&page, 1, !ro->op_write);
 505                 }
 506         }
 507
 508         kfree(ro->op_notifier);
 509         ro->op_notifier = NULL;
 510         ro->op_active = 0;
 511         ro->op_odp_mr = NULL;
 512 }
 513
 514 void rds_atomic_free_op(struct rm_atomic_op *ao)
 515 {
 516         struct page *page = sg_page(ao->op_sg);
 517
 518         /* Mark page dirty if it was possibly modified, which
 519          * is the case for a RDMA_READ which copies from remote
 520          * to local memory */
 521         unpin_user_pages_dirty_lock(&page, 1, true);
 522
 523         kfree(ao->op_notifier);
 524         ao->op_notifier = NULL;
 525         ao->op_active = 0;
 526 }
 527
 528
 529 /*
 530  * Count the number of pages needed to describe an incoming iovec array.
 531  */
 532 static int rds_rdma_pages(struct rds_iovec iov[], int nr_iovecs)
 533 {
 534         int tot_pages = 0;
 535         unsigned int nr_pages;
 536         unsigned int i;
 537
 538         /* figure out the number of pages in the vector */
 539         for (i = 0; i < nr_iovecs; i++) {
 540                 nr_pages = rds_pages_in_vec(&iov[i]);
 541                 if (nr_pages == 0)
 542                         return -EINVAL;
 543
 544                 tot_pages += nr_pages;
 545
 546                 /*
 547                  * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
 548                  * so tot_pages cannot overflow without first going negative.
 549                  */
 550                 if (tot_pages < 0)
 551                         return -EINVAL;
 552         }
 553
 554         return tot_pages;
 555 }
 556
 557 int rds_rdma_extra_size(struct rds_rdma_args *args,
 558                         struct rds_iov_vector *iov)
 559 {
 560         struct rds_iovec *vec;
 561         struct rds_iovec __user *local_vec;
 562         int tot_pages = 0;
 563         unsigned int nr_pages;
 564         unsigned int i;
 565
 566         local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr;
 567
 568         if (args->nr_local == 0)
 569                 return -EINVAL;
 570
 571         if (args->nr_local > UIO_MAXIOV)
 572                 return -EMSGSIZE;
 573
 574         iov->iov = kcalloc(args->nr_local,
 575                            sizeof(struct rds_iovec),
 576                            GFP_KERNEL);
 577         if (!iov->iov)
 578                 return -ENOMEM;
 579
 580         vec = &iov->iov[0];
 581
 582         if (copy_from_user(vec, local_vec, args->nr_local *
 583                            sizeof(struct rds_iovec)))
 584                 return -EFAULT;
 585         iov->len = args->nr_local;
 586
 587         /* figure out the number of pages in the vector */
 588         for (i = 0; i < args->nr_local; i++, vec++) {
 589
 590                 nr_pages = rds_pages_in_vec(vec);
 591                 if (nr_pages == 0)
 592                         return -EINVAL;
 593
 594                 tot_pages += nr_pages;
 595
 596                 /*
 597                  * nr_pages for one entry is limited to (UINT_MAX>>PAGE_SHIFT)+1,
 598                  * so tot_pages cannot overflow without first going negative.
 599                  */
 600                 if (tot_pages < 0)
 601                         return -EINVAL;
 602         }
 603
 604         return tot_pages * sizeof(struct scatterlist);
 605 }
 606
 607 /*
 608  * The application asks for a RDMA transfer.
 609  * Extract all arguments and set up the rdma_op
 610  */
 611 int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm,
 612                        struct cmsghdr *cmsg,
 613                        struct rds_iov_vector *vec)
 614 {
 615         struct rds_rdma_args *args;
 616         struct rm_rdma_op *op = &rm->rdma;
 617         int nr_pages;
 618         unsigned int nr_bytes;
 619         struct page **pages = NULL;
 620         struct rds_iovec *iovs;
 621         unsigned int i, j;
 622         int ret = 0;
 623         bool odp_supported = true;
 624
 625         if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args))
 626             || rm->rdma.op_active)
 627                 return -EINVAL;
 628
 629         args = CMSG_DATA(cmsg);
 630
 631         if (ipv6_addr_any(&rs->rs_bound_addr)) {
 632                 ret = -ENOTCONN; /* XXX not a great errno */
 633                 goto out_ret;
 634         }
 635
 636         if (args->nr_local > UIO_MAXIOV) {
 637                 ret = -EMSGSIZE;
 638                 goto out_ret;
 639         }
 640
 641         if (vec->len != args->nr_local) {
 642                 ret = -EINVAL;
 643                 goto out_ret;
 644         }
 645         /* odp-mr is not supported for multiple requests within one message */
 646         if (args->nr_local != 1)
 647                 odp_supported = false;
 648
 649         iovs = vec->iov;
 650
 651         nr_pages = rds_rdma_pages(iovs, args->nr_local);
 652         if (nr_pages < 0) {
 653                 ret = -EINVAL;
 654                 goto out_ret;
 655         }
 656
 657         pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
 658         if (!pages) {
 659                 ret = -ENOMEM;
 660                 goto out_ret;
 661         }
 662
 663         op->op_write = !!(args->flags & RDS_RDMA_READWRITE);
 664         op->op_fence = !!(args->flags & RDS_RDMA_FENCE);
 665         op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
 666         op->op_silent = !!(args->flags & RDS_RDMA_SILENT);
 667         op->op_active = 1;
 668         op->op_recverr = rs->rs_recverr;
 669         op->op_odp_mr = NULL;
 670
 671         WARN_ON(!nr_pages);
 672         op->op_sg = rds_message_alloc_sgs(rm, nr_pages);
 673         if (IS_ERR(op->op_sg)) {
 674                 ret = PTR_ERR(op->op_sg);
 675                 goto out_pages;
 676         }
 677
 678         if (op->op_notify || op->op_recverr) {
 679                 /* We allocate an uninitialized notifier here, because
 680                  * we don't want to do that in the completion handler. We
 681                  * would have to use GFP_ATOMIC there, and don't want to deal
 682                  * with failed allocations.
 683                  */
 684                 op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL);
 685                 if (!op->op_notifier) {
 686                         ret = -ENOMEM;
 687                         goto out_pages;
 688                 }
 689                 op->op_notifier->n_user_token = args->user_token;
 690                 op->op_notifier->n_status = RDS_RDMA_SUCCESS;
 691         }
 692
 693         /* The cookie contains the R_Key of the remote memory region, and
 694          * optionally an offset into it. This is how we implement RDMA into
 695          * unaligned memory.
 696          * When setting up the RDMA, we need to add that offset to the
 697          * destination address (which is really an offset into the MR)
 698          * FIXME: We may want to move this into ib_rdma.c
 699          */
 700         op->op_rkey = rds_rdma_cookie_key(args->cookie);
 701         op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie);
 702
 703         nr_bytes = 0;
 704
 705         rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n",
 706                (unsigned long long)args->nr_local,
 707                (unsigned long long)args->remote_vec.addr,
 708                op->op_rkey);
 709
 710         for (i = 0; i < args->nr_local; i++) {
 711                 struct rds_iovec *iov = &iovs[i];
 712                 /* don't need to check, rds_rdma_pages() verified nr will be +nonzero */
 713                 unsigned int nr = rds_pages_in_vec(iov);
 714
 715                 rs->rs_user_addr = iov->addr;
 716                 rs->rs_user_bytes = iov->bytes;
 717
 718                 /* If it's a WRITE operation, we want to pin the pages for reading.
 719                  * If it's a READ operation, we need to pin the pages for writing.
 720                  */
 721                 ret = rds_pin_pages(iov->addr, nr, pages, !op->op_write);
 722                 if ((!odp_supported && ret <= 0) ||
 723                     (odp_supported && ret <= 0 && ret != -EOPNOTSUPP))
 724                         goto out_pages;
 725
 726                 if (ret == -EOPNOTSUPP) {
 727                         struct rds_mr *local_odp_mr;
 728
 729                         if (!rs->rs_transport->get_mr) {
 730                                 ret = -EOPNOTSUPP;
 731                                 goto out_pages;
 732                         }
 733                         local_odp_mr =
 734                                 kzalloc(sizeof(*local_odp_mr), GFP_KERNEL);
 735                         if (!local_odp_mr) {
 736                                 ret = -ENOMEM;
 737                                 goto out_pages;
 738                         }
 739                         RB_CLEAR_NODE(&local_odp_mr->r_rb_node);
 740                         kref_init(&local_odp_mr->r_kref);
 741                         local_odp_mr->r_trans = rs->rs_transport;
 742                         local_odp_mr->r_sock = rs;
 743                         local_odp_mr->r_trans_private =
 744                                 rs->rs_transport->get_mr(
 745                                         NULL, 0, rs, &local_odp_mr->r_key, NULL,
 746                                         iov->addr, iov->bytes, ODP_VIRTUAL);
 747                         if (IS_ERR(local_odp_mr->r_trans_private)) {
 748                                 ret = PTR_ERR(local_odp_mr->r_trans_private);
 749                                 rdsdebug("get_mr ret %d %p\"", ret,
 750                                          local_odp_mr->r_trans_private);
 751                                 kfree(local_odp_mr);
 752                                 ret = -EOPNOTSUPP;
 753                                 goto out_pages;
 754                         }
 755                         rdsdebug("Need odp; local_odp_mr %p trans_private %p\n",
 756                                  local_odp_mr, local_odp_mr->r_trans_private);
 757                         op->op_odp_mr = local_odp_mr;
 758                         op->op_odp_addr = iov->addr;
 759                 }
 760
 761                 rdsdebug("RDS: nr_bytes %u nr %u iov->bytes %llu iov->addr %llx\n",
 762                          nr_bytes, nr, iov->bytes, iov->addr);
 763
 764                 nr_bytes += iov->bytes;
 765
 766                 for (j = 0; j < nr; j++) {
 767                         unsigned int offset = iov->addr & ~PAGE_MASK;
 768                         struct scatterlist *sg;
 769
 770                         sg = &op->op_sg[op->op_nents + j];
 771                         sg_set_page(sg, pages[j],
 772                                         min_t(unsigned int, iov->bytes, PAGE_SIZE - offset),
 773                                         offset);
 774
 775                         sg_dma_len(sg) = sg->length;
 776                         rdsdebug("RDS: sg->offset %x sg->len %x iov->addr %llx iov->bytes %llu\n",
 777                                sg->offset, sg->length, iov->addr, iov->bytes);
 778
 779                         iov->addr += sg->length;
 780                         iov->bytes -= sg->length;
 781                 }
 782
 783                 op->op_nents += nr;
 784         }
 785
 786         if (nr_bytes > args->remote_vec.bytes) {
 787                 rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n",
 788                                 nr_bytes,
 789                                 (unsigned int) args->remote_vec.bytes);
 790                 ret = -EINVAL;
 791                 goto out_pages;
 792         }
 793         op->op_bytes = nr_bytes;
 794         ret = 0;
 795
 796 out_pages:
 797         kfree(pages);
 798 out_ret:
 799         if (ret)
 800                 rds_rdma_free_op(op);
 801         else
 802                 rds_stats_inc(s_send_rdma);
 803
 804         return ret;
 805 }
 806
 807 /*
 808  * The application wants us to pass an RDMA destination (aka MR)
 809  * to the remote
 810  */
 811 int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm,
 812                           struct cmsghdr *cmsg)
 813 {
 814         unsigned long flags;
 815         struct rds_mr *mr;
 816         u32 r_key;
 817         int err = 0;
 818
 819         if (cmsg->cmsg_len < CMSG_LEN(sizeof(rds_rdma_cookie_t)) ||
 820             rm->m_rdma_cookie != 0)
 821                 return -EINVAL;
 822
 823         memcpy(&rm->m_rdma_cookie, CMSG_DATA(cmsg), sizeof(rm->m_rdma_cookie));
 824
 825         /* We are reusing a previously mapped MR here. Most likely, the
 826          * application has written to the buffer, so we need to explicitly
 827          * flush those writes to RAM. Otherwise the HCA may not see them
 828          * when doing a DMA from that buffer.
 829          */
 830         r_key = rds_rdma_cookie_key(rm->m_rdma_cookie);
 831
 832         spin_lock_irqsave(&rs->rs_rdma_lock, flags);
 833         mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL);
 834         if (!mr)
 835                 err = -EINVAL;  /* invalid r_key */
 836         else
 837                 kref_get(&mr->r_kref);
 838         spin_unlock_irqrestore(&rs->rs_rdma_lock, flags);
 839
 840         if (mr) {
 841                 mr->r_trans->sync_mr(mr->r_trans_private,
 842                                      DMA_TO_DEVICE);
 843                 rm->rdma.op_rdma_mr = mr;
 844         }
 845         return err;
 846 }
 847
 848 /*
 849  * The application passes us an address range it wants to enable RDMA
 850  * to/from. We map the area, and save the <R_Key,offset> pair
 851  * in rm->m_rdma_cookie. This causes it to be sent along to the peer
 852  * in an extension header.
 853  */
 854 int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm,
 855                           struct cmsghdr *cmsg)
 856 {
 857         if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_get_mr_args)) ||
 858             rm->m_rdma_cookie != 0)
 859                 return -EINVAL;
 860
 861         return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie,
 862                               &rm->rdma.op_rdma_mr, rm->m_conn_path);
 863 }
 864
 865 /*
 866  * Fill in rds_message for an atomic request.
 867  */
 868 int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm,
 869                     struct cmsghdr *cmsg)
 870 {
 871         struct page *page = NULL;
 872         struct rds_atomic_args *args;
 873         int ret = 0;
 874
 875         if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args))
 876          || rm->atomic.op_active)
 877                 return -EINVAL;
 878
 879         args = CMSG_DATA(cmsg);
 880
 881         /* Nonmasked & masked cmsg ops converted to masked hw ops */
 882         switch (cmsg->cmsg_type) {
 883         case RDS_CMSG_ATOMIC_FADD:
 884                 rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
 885                 rm->atomic.op_m_fadd.add = args->fadd.add;
 886                 rm->atomic.op_m_fadd.nocarry_mask = 0;
 887                 break;
 888         case RDS_CMSG_MASKED_ATOMIC_FADD:
 889                 rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD;
 890                 rm->atomic.op_m_fadd.add = args->m_fadd.add;
 891                 rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask;
 892                 break;
 893         case RDS_CMSG_ATOMIC_CSWP:
 894                 rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
 895                 rm->atomic.op_m_cswp.compare = args->cswp.compare;
 896                 rm->atomic.op_m_cswp.swap = args->cswp.swap;
 897                 rm->atomic.op_m_cswp.compare_mask = ~0;
 898                 rm->atomic.op_m_cswp.swap_mask = ~0;
 899                 break;
 900         case RDS_CMSG_MASKED_ATOMIC_CSWP:
 901                 rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP;
 902                 rm->atomic.op_m_cswp.compare = args->m_cswp.compare;
 903                 rm->atomic.op_m_cswp.swap = args->m_cswp.swap;
 904                 rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask;
 905                 rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask;
 906                 break;
 907         default:
 908                 BUG(); /* should never happen */
 909         }
 910
 911         rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME);
 912         rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT);
 913         rm->atomic.op_active = 1;
 914         rm->atomic.op_recverr = rs->rs_recverr;
 915         rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1);
 916         if (IS_ERR(rm->atomic.op_sg)) {
 917                 ret = PTR_ERR(rm->atomic.op_sg);
 918                 goto err;
 919         }
 920
 921         /* verify 8 byte-aligned */
 922         if (args->local_addr & 0x7) {
 923                 ret = -EFAULT;
 924                 goto err;
 925         }
 926
 927         ret = rds_pin_pages(args->local_addr, 1, &page, 1);
 928         if (ret != 1)
 929                 goto err;
 930         ret = 0;
 931
 932         sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr));
 933
 934         if (rm->atomic.op_notify || rm->atomic.op_recverr) {
 935                 /* We allocate an uninitialized notifier here, because
 936                  * we don't want to do that in the completion handler. We
 937                  * would have to use GFP_ATOMIC there, and don't want to deal
 938                  * with failed allocations.
 939                  */
 940                 rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL);
 941                 if (!rm->atomic.op_notifier) {
 942                         ret = -ENOMEM;
 943                         goto err;
 944                 }
 945
 946                 rm->atomic.op_notifier->n_user_token = args->user_token;
 947                 rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS;
 948         }
 949
 950         rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie);
 951         rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie);
 952
 953         return ret;
 954 err:
 955         if (page)
 956                 unpin_user_page(page);
 957         rm->atomic.op_active = 0;
 958         kfree(rm->atomic.op_notifier);
 959
 960         return ret;
 961 }