drivers/nvme/target/rdma.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * NVMe over Fabrics RDMA target.
   4  * Copyright (c) 2015-2016 HGST, a Western Digital Company.
   5  */
   6 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
   7 #include <linux/atomic.h>
   8 #include <linux/ctype.h>
   9 #include <linux/delay.h>
  10 #include <linux/err.h>
  11 #include <linux/init.h>
  12 #include <linux/module.h>
  13 #include <linux/nvme.h>
  14 #include <linux/slab.h>
  15 #include <linux/string.h>
  16 #include <linux/wait.h>
  17 #include <linux/inet.h>
  18 #include <asm/unaligned.h>
  19
  20 #include <rdma/ib_verbs.h>
  21 #include <rdma/rdma_cm.h>
  22 #include <rdma/rw.h>
  23
  24 #include <linux/nvme-rdma.h>
  25 #include "nvmet.h"
  26
  27 /*
  28  * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
  29  */
  30 #define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE     PAGE_SIZE
  31 #define NVMET_RDMA_MAX_INLINE_SGE               4
  32 #define NVMET_RDMA_MAX_INLINE_DATA_SIZE         max_t(int, SZ_16K, PAGE_SIZE)
  33
  34 /* Assume mpsmin == device_page_size == 4KB */
  35 #define NVMET_RDMA_MAX_MDTS                     8
  36
  37 struct nvmet_rdma_cmd {
  38         struct ib_sge           sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
  39         struct ib_cqe           cqe;
  40         struct ib_recv_wr       wr;
  41         struct scatterlist      inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
  42         struct nvme_command     *nvme_cmd;
  43         struct nvmet_rdma_queue *queue;
  44 };
  45
  46 enum {
  47         NVMET_RDMA_REQ_INLINE_DATA      = (1 << 0),
  48         NVMET_RDMA_REQ_INVALIDATE_RKEY  = (1 << 1),
  49 };
  50
  51 struct nvmet_rdma_rsp {
  52         struct ib_sge           send_sge;
  53         struct ib_cqe           send_cqe;
  54         struct ib_send_wr       send_wr;
  55
  56         struct nvmet_rdma_cmd   *cmd;
  57         struct nvmet_rdma_queue *queue;
  58
  59         struct ib_cqe           read_cqe;
  60         struct rdma_rw_ctx      rw;
  61
  62         struct nvmet_req        req;
  63
  64         bool                    allocated;
  65         u8                      n_rdma;
  66         u32                     flags;
  67         u32                     invalidate_rkey;
  68
  69         struct list_head        wait_list;
  70         struct list_head        free_list;
  71 };
  72
  73 enum nvmet_rdma_queue_state {
  74         NVMET_RDMA_Q_CONNECTING,
  75         NVMET_RDMA_Q_LIVE,
  76         NVMET_RDMA_Q_DISCONNECTING,
  77 };
  78
  79 struct nvmet_rdma_queue {
  80         struct rdma_cm_id       *cm_id;
  81         struct ib_qp            *qp;
  82         struct nvmet_port       *port;
  83         struct ib_cq            *cq;
  84         atomic_t                sq_wr_avail;
  85         struct nvmet_rdma_device *dev;
  86         spinlock_t              state_lock;
  87         enum nvmet_rdma_queue_state state;
  88         struct nvmet_cq         nvme_cq;
  89         struct nvmet_sq         nvme_sq;
  90
  91         struct nvmet_rdma_rsp   *rsps;
  92         struct list_head        free_rsps;
  93         spinlock_t              rsps_lock;
  94         struct nvmet_rdma_cmd   *cmds;
  95
  96         struct work_struct      release_work;
  97         struct list_head        rsp_wait_list;
  98         struct list_head        rsp_wr_wait_list;
  99         spinlock_t              rsp_wr_wait_lock;
 100
 101         int                     idx;
 102         int                     host_qid;
 103         int                     recv_queue_size;
 104         int                     send_queue_size;
 105
 106         struct list_head        queue_list;
 107 };
 108
 109 struct nvmet_rdma_port {
 110         struct nvmet_port       *nport;
 111         struct sockaddr_storage addr;
 112         struct rdma_cm_id       *cm_id;
 113         struct delayed_work     repair_work;
 114 };
 115
 116 struct nvmet_rdma_device {
 117         struct ib_device        *device;
 118         struct ib_pd            *pd;
 119         struct ib_srq           *srq;
 120         struct nvmet_rdma_cmd   *srq_cmds;
 121         size_t                  srq_size;
 122         struct kref             ref;
 123         struct list_head        entry;
 124         int                     inline_data_size;
 125         int                     inline_page_count;
 126 };
 127
 128 static bool nvmet_rdma_use_srq;
 129 module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
 130 MODULE_PARM_DESC(use_srq, "Use shared receive queue.");
 131
 132 static DEFINE_IDA(nvmet_rdma_queue_ida);
 133 static LIST_HEAD(nvmet_rdma_queue_list);
 134 static DEFINE_MUTEX(nvmet_rdma_queue_mutex);
 135
 136 static LIST_HEAD(device_list);
 137 static DEFINE_MUTEX(device_list_mutex);
 138
 139 static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
 140 static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
 141 static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
 142 static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
 143 static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
 144 static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
 145 static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
 146                                 struct nvmet_rdma_rsp *r);
 147 static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
 148                                 struct nvmet_rdma_rsp *r);
 149
 150 static const struct nvmet_fabrics_ops nvmet_rdma_ops;
 151
 152 static int num_pages(int len)
 153 {
 154         return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
 155 }
 156
 157 static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
 158 {
 159         return nvme_is_write(rsp->req.cmd) &&
 160                 rsp->req.transfer_len &&
 161                 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
 162 }
 163
 164 static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
 165 {
 166         return !nvme_is_write(rsp->req.cmd) &&
 167                 rsp->req.transfer_len &&
 168                 !rsp->req.cqe->status &&
 169                 !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
 170 }
 171
 172 static inline struct nvmet_rdma_rsp *
 173 nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
 174 {
 175         struct nvmet_rdma_rsp *rsp;
 176         unsigned long flags;
 177
 178         spin_lock_irqsave(&queue->rsps_lock, flags);
 179         rsp = list_first_entry_or_null(&queue->free_rsps,
 180                                 struct nvmet_rdma_rsp, free_list);
 181         if (likely(rsp))
 182                 list_del(&rsp->free_list);
 183         spin_unlock_irqrestore(&queue->rsps_lock, flags);
 184
 185         if (unlikely(!rsp)) {
 186                 int ret;
 187
 188                 rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
 189                 if (unlikely(!rsp))
 190                         return NULL;
 191                 ret = nvmet_rdma_alloc_rsp(queue->dev, rsp);
 192                 if (unlikely(ret)) {
 193                         kfree(rsp);
 194                         return NULL;
 195                 }
 196
 197                 rsp->allocated = true;
 198         }
 199
 200         return rsp;
 201 }
 202
 203 static inline void
 204 nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
 205 {
 206         unsigned long flags;
 207
 208         if (unlikely(rsp->allocated)) {
 209                 nvmet_rdma_free_rsp(rsp->queue->dev, rsp);
 210                 kfree(rsp);
 211                 return;
 212         }
 213
 214         spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
 215         list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
 216         spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
 217 }
 218
 219 static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
 220                                 struct nvmet_rdma_cmd *c)
 221 {
 222         struct scatterlist *sg;
 223         struct ib_sge *sge;
 224         int i;
 225
 226         if (!ndev->inline_data_size)
 227                 return;
 228
 229         sg = c->inline_sg;
 230         sge = &c->sge[1];
 231
 232         for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
 233                 if (sge->length)
 234                         ib_dma_unmap_page(ndev->device, sge->addr,
 235                                         sge->length, DMA_FROM_DEVICE);
 236                 if (sg_page(sg))
 237                         __free_page(sg_page(sg));
 238         }
 239 }
 240
 241 static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
 242                                 struct nvmet_rdma_cmd *c)
 243 {
 244         struct scatterlist *sg;
 245         struct ib_sge *sge;
 246         struct page *pg;
 247         int len;
 248         int i;
 249
 250         if (!ndev->inline_data_size)
 251                 return 0;
 252
 253         sg = c->inline_sg;
 254         sg_init_table(sg, ndev->inline_page_count);
 255         sge = &c->sge[1];
 256         len = ndev->inline_data_size;
 257
 258         for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
 259                 pg = alloc_page(GFP_KERNEL);
 260                 if (!pg)
 261                         goto out_err;
 262                 sg_assign_page(sg, pg);
 263                 sge->addr = ib_dma_map_page(ndev->device,
 264                         pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
 265                 if (ib_dma_mapping_error(ndev->device, sge->addr))
 266                         goto out_err;
 267                 sge->length = min_t(int, len, PAGE_SIZE);
 268                 sge->lkey = ndev->pd->local_dma_lkey;
 269                 len -= sge->length;
 270         }
 271
 272         return 0;
 273 out_err:
 274         for (; i >= 0; i--, sg--, sge--) {
 275                 if (sge->length)
 276                         ib_dma_unmap_page(ndev->device, sge->addr,
 277                                         sge->length, DMA_FROM_DEVICE);
 278                 if (sg_page(sg))
 279                         __free_page(sg_page(sg));
 280         }
 281         return -ENOMEM;
 282 }
 283
 284 static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
 285                         struct nvmet_rdma_cmd *c, bool admin)
 286 {
 287         /* NVMe command / RDMA RECV */
 288         c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
 289         if (!c->nvme_cmd)
 290                 goto out;
 291
 292         c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
 293                         sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
 294         if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
 295                 goto out_free_cmd;
 296
 297         c->sge[0].length = sizeof(*c->nvme_cmd);
 298         c->sge[0].lkey = ndev->pd->local_dma_lkey;
 299
 300         if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
 301                 goto out_unmap_cmd;
 302
 303         c->cqe.done = nvmet_rdma_recv_done;
 304
 305         c->wr.wr_cqe = &c->cqe;
 306         c->wr.sg_list = c->sge;
 307         c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;
 308
 309         return 0;
 310
 311 out_unmap_cmd:
 312         ib_dma_unmap_single(ndev->device, c->sge[0].addr,
 313                         sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
 314 out_free_cmd:
 315         kfree(c->nvme_cmd);
 316
 317 out:
 318         return -ENOMEM;
 319 }
 320
 321 static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
 322                 struct nvmet_rdma_cmd *c, bool admin)
 323 {
 324         if (!admin)
 325                 nvmet_rdma_free_inline_pages(ndev, c);
 326         ib_dma_unmap_single(ndev->device, c->sge[0].addr,
 327                                 sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
 328         kfree(c->nvme_cmd);
 329 }
 330
 331 static struct nvmet_rdma_cmd *
 332 nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
 333                 int nr_cmds, bool admin)
 334 {
 335         struct nvmet_rdma_cmd *cmds;
 336         int ret = -EINVAL, i;
 337
 338         cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
 339         if (!cmds)
 340                 goto out;
 341
 342         for (i = 0; i < nr_cmds; i++) {
 343                 ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
 344                 if (ret)
 345                         goto out_free;
 346         }
 347
 348         return cmds;
 349
 350 out_free:
 351         while (--i >= 0)
 352                 nvmet_rdma_free_cmd(ndev, cmds + i, admin);
 353         kfree(cmds);
 354 out:
 355         return ERR_PTR(ret);
 356 }
 357
 358 static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
 359                 struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
 360 {
 361         int i;
 362
 363         for (i = 0; i < nr_cmds; i++)
 364                 nvmet_rdma_free_cmd(ndev, cmds + i, admin);
 365         kfree(cmds);
 366 }
 367
 368 static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
 369                 struct nvmet_rdma_rsp *r)
 370 {
 371         /* NVMe CQE / RDMA SEND */
 372         r->req.cqe = kmalloc(sizeof(*r->req.cqe), GFP_KERNEL);
 373         if (!r->req.cqe)
 374                 goto out;
 375
 376         r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.cqe,
 377                         sizeof(*r->req.cqe), DMA_TO_DEVICE);
 378         if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
 379                 goto out_free_rsp;
 380
 381         r->req.p2p_client = &ndev->device->dev;
 382         r->send_sge.length = sizeof(*r->req.cqe);
 383         r->send_sge.lkey = ndev->pd->local_dma_lkey;
 384
 385         r->send_cqe.done = nvmet_rdma_send_done;
 386
 387         r->send_wr.wr_cqe = &r->send_cqe;
 388         r->send_wr.sg_list = &r->send_sge;
 389         r->send_wr.num_sge = 1;
 390         r->send_wr.send_flags = IB_SEND_SIGNALED;
 391
 392         /* Data In / RDMA READ */
 393         r->read_cqe.done = nvmet_rdma_read_data_done;
 394         return 0;
 395
 396 out_free_rsp:
 397         kfree(r->req.cqe);
 398 out:
 399         return -ENOMEM;
 400 }
 401
 402 static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
 403                 struct nvmet_rdma_rsp *r)
 404 {
 405         ib_dma_unmap_single(ndev->device, r->send_sge.addr,
 406                                 sizeof(*r->req.cqe), DMA_TO_DEVICE);
 407         kfree(r->req.cqe);
 408 }
 409
 410 static int
 411 nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
 412 {
 413         struct nvmet_rdma_device *ndev = queue->dev;
 414         int nr_rsps = queue->recv_queue_size * 2;
 415         int ret = -EINVAL, i;
 416
 417         queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
 418                         GFP_KERNEL);
 419         if (!queue->rsps)
 420                 goto out;
 421
 422         for (i = 0; i < nr_rsps; i++) {
 423                 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
 424
 425                 ret = nvmet_rdma_alloc_rsp(ndev, rsp);
 426                 if (ret)
 427                         goto out_free;
 428
 429                 list_add_tail(&rsp->free_list, &queue->free_rsps);
 430         }
 431
 432         return 0;
 433
 434 out_free:
 435         while (--i >= 0) {
 436                 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
 437
 438                 list_del(&rsp->free_list);
 439                 nvmet_rdma_free_rsp(ndev, rsp);
 440         }
 441         kfree(queue->rsps);
 442 out:
 443         return ret;
 444 }
 445
 446 static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
 447 {
 448         struct nvmet_rdma_device *ndev = queue->dev;
 449         int i, nr_rsps = queue->recv_queue_size * 2;
 450
 451         for (i = 0; i < nr_rsps; i++) {
 452                 struct nvmet_rdma_rsp *rsp = &queue->rsps[i];
 453
 454                 list_del(&rsp->free_list);
 455                 nvmet_rdma_free_rsp(ndev, rsp);
 456         }
 457         kfree(queue->rsps);
 458 }
 459
 460 static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
 461                 struct nvmet_rdma_cmd *cmd)
 462 {
 463         int ret;
 464
 465         ib_dma_sync_single_for_device(ndev->device,
 466                 cmd->sge[0].addr, cmd->sge[0].length,
 467                 DMA_FROM_DEVICE);
 468
 469         if (ndev->srq)
 470                 ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
 471         else
 472                 ret = ib_post_recv(cmd->queue->qp, &cmd->wr, NULL);
 473
 474         if (unlikely(ret))
 475                 pr_err("post_recv cmd failed\n");
 476
 477         return ret;
 478 }
 479
 480 static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
 481 {
 482         spin_lock(&queue->rsp_wr_wait_lock);
 483         while (!list_empty(&queue->rsp_wr_wait_list)) {
 484                 struct nvmet_rdma_rsp *rsp;
 485                 bool ret;
 486
 487                 rsp = list_entry(queue->rsp_wr_wait_list.next,
 488                                 struct nvmet_rdma_rsp, wait_list);
 489                 list_del(&rsp->wait_list);
 490
 491                 spin_unlock(&queue->rsp_wr_wait_lock);
 492                 ret = nvmet_rdma_execute_command(rsp);
 493                 spin_lock(&queue->rsp_wr_wait_lock);
 494
 495                 if (!ret) {
 496                         list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
 497                         break;
 498                 }
 499         }
 500         spin_unlock(&queue->rsp_wr_wait_lock);
 501 }
 502
 503
 504 static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
 505 {
 506         struct nvmet_rdma_queue *queue = rsp->queue;
 507
 508         atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
 509
 510         if (rsp->n_rdma) {
 511                 rdma_rw_ctx_destroy(&rsp->rw, queue->qp,
 512                                 queue->cm_id->port_num, rsp->req.sg,
 513                                 rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
 514         }
 515
 516         if (rsp->req.sg != rsp->cmd->inline_sg)
 517                 nvmet_req_free_sgl(&rsp->req);
 518
 519         if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
 520                 nvmet_rdma_process_wr_wait_list(queue);
 521
 522         nvmet_rdma_put_rsp(rsp);
 523 }
 524
 525 static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
 526 {
 527         if (queue->nvme_sq.ctrl) {
 528                 nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
 529         } else {
 530                 /*
 531                  * we didn't setup the controller yet in case
 532                  * of admin connect error, just disconnect and
 533                  * cleanup the queue
 534                  */
 535                 nvmet_rdma_queue_disconnect(queue);
 536         }
 537 }
 538
 539 static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
 540 {
 541         struct nvmet_rdma_rsp *rsp =
 542                 container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
 543         struct nvmet_rdma_queue *queue = cq->cq_context;
 544
 545         nvmet_rdma_release_rsp(rsp);
 546
 547         if (unlikely(wc->status != IB_WC_SUCCESS &&
 548                      wc->status != IB_WC_WR_FLUSH_ERR)) {
 549                 pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
 550                         wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
 551                 nvmet_rdma_error_comp(queue);
 552         }
 553 }
 554
 555 static void nvmet_rdma_queue_response(struct nvmet_req *req)
 556 {
 557         struct nvmet_rdma_rsp *rsp =
 558                 container_of(req, struct nvmet_rdma_rsp, req);
 559         struct rdma_cm_id *cm_id = rsp->queue->cm_id;
 560         struct ib_send_wr *first_wr;
 561
 562         if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
 563                 rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
 564                 rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
 565         } else {
 566                 rsp->send_wr.opcode = IB_WR_SEND;
 567         }
 568
 569         if (nvmet_rdma_need_data_out(rsp))
 570                 first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
 571                                 cm_id->port_num, NULL, &rsp->send_wr);
 572         else
 573                 first_wr = &rsp->send_wr;
 574
 575         nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);
 576
 577         ib_dma_sync_single_for_device(rsp->queue->dev->device,
 578                 rsp->send_sge.addr, rsp->send_sge.length,
 579                 DMA_TO_DEVICE);
 580
 581         if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) {
 582                 pr_err("sending cmd response failed\n");
 583                 nvmet_rdma_release_rsp(rsp);
 584         }
 585 }
 586
 587 static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
 588 {
 589         struct nvmet_rdma_rsp *rsp =
 590                 container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
 591         struct nvmet_rdma_queue *queue = cq->cq_context;
 592
 593         WARN_ON(rsp->n_rdma <= 0);
 594         atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
 595         rdma_rw_ctx_destroy(&rsp->rw, queue->qp,
 596                         queue->cm_id->port_num, rsp->req.sg,
 597                         rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
 598         rsp->n_rdma = 0;
 599
 600         if (unlikely(wc->status != IB_WC_SUCCESS)) {
 601                 nvmet_req_uninit(&rsp->req);
 602                 nvmet_rdma_release_rsp(rsp);
 603                 if (wc->status != IB_WC_WR_FLUSH_ERR) {
 604                         pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
 605                                 wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
 606                         nvmet_rdma_error_comp(queue);
 607                 }
 608                 return;
 609         }
 610
 611         rsp->req.execute(&rsp->req);
 612 }
 613
 614 static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
 615                 u64 off)
 616 {
 617         int sg_count = num_pages(len);
 618         struct scatterlist *sg;
 619         int i;
 620
 621         sg = rsp->cmd->inline_sg;
 622         for (i = 0; i < sg_count; i++, sg++) {
 623                 if (i < sg_count - 1)
 624                         sg_unmark_end(sg);
 625                 else
 626                         sg_mark_end(sg);
 627                 sg->offset = off;
 628                 sg->length = min_t(int, len, PAGE_SIZE - off);
 629                 len -= sg->length;
 630                 if (!i)
 631                         off = 0;
 632         }
 633
 634         rsp->req.sg = rsp->cmd->inline_sg;
 635         rsp->req.sg_cnt = sg_count;
 636 }
 637
 638 static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
 639 {
 640         struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
 641         u64 off = le64_to_cpu(sgl->addr);
 642         u32 len = le32_to_cpu(sgl->length);
 643
 644         if (!nvme_is_write(rsp->req.cmd)) {
 645                 rsp->req.error_loc =
 646                         offsetof(struct nvme_common_command, opcode);
 647                 return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 648         }
 649
 650         if (off + len > rsp->queue->dev->inline_data_size) {
 651                 pr_err("invalid inline data offset!\n");
 652                 return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
 653         }
 654
 655         /* no data command? */
 656         if (!len)
 657                 return 0;
 658
 659         nvmet_rdma_use_inline_sg(rsp, len, off);
 660         rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
 661         rsp->req.transfer_len += len;
 662         return 0;
 663 }
 664
 665 static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
 666                 struct nvme_keyed_sgl_desc *sgl, bool invalidate)
 667 {
 668         struct rdma_cm_id *cm_id = rsp->queue->cm_id;
 669         u64 addr = le64_to_cpu(sgl->addr);
 670         u32 key = get_unaligned_le32(sgl->key);
 671         int ret;
 672
 673         rsp->req.transfer_len = get_unaligned_le24(sgl->length);
 674
 675         /* no data command? */
 676         if (!rsp->req.transfer_len)
 677                 return 0;
 678
 679         ret = nvmet_req_alloc_sgl(&rsp->req);
 680         if (unlikely(ret < 0))
 681                 goto error_out;
 682
 683         ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
 684                         rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
 685                         nvmet_data_dir(&rsp->req));
 686         if (unlikely(ret < 0))
 687                 goto error_out;
 688         rsp->n_rdma += ret;
 689
 690         if (invalidate) {
 691                 rsp->invalidate_rkey = key;
 692                 rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
 693         }
 694
 695         return 0;
 696
 697 error_out:
 698         rsp->req.transfer_len = 0;
 699         return NVME_SC_INTERNAL;
 700 }
 701
 702 static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
 703 {
 704         struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;
 705
 706         switch (sgl->type >> 4) {
 707         case NVME_SGL_FMT_DATA_DESC:
 708                 switch (sgl->type & 0xf) {
 709                 case NVME_SGL_FMT_OFFSET:
 710                         return nvmet_rdma_map_sgl_inline(rsp);
 711                 default:
 712                         pr_err("invalid SGL subtype: %#x\n", sgl->type);
 713                         rsp->req.error_loc =
 714                                 offsetof(struct nvme_common_command, dptr);
 715                         return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 716                 }
 717         case NVME_KEY_SGL_FMT_DATA_DESC:
 718                 switch (sgl->type & 0xf) {
 719                 case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
 720                         return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
 721                 case NVME_SGL_FMT_ADDRESS:
 722                         return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
 723                 default:
 724                         pr_err("invalid SGL subtype: %#x\n", sgl->type);
 725                         rsp->req.error_loc =
 726                                 offsetof(struct nvme_common_command, dptr);
 727                         return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
 728                 }
 729         default:
 730                 pr_err("invalid SGL type: %#x\n", sgl->type);
 731                 rsp->req.error_loc = offsetof(struct nvme_common_command, dptr);
 732                 return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
 733         }
 734 }
 735
 736 static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
 737 {
 738         struct nvmet_rdma_queue *queue = rsp->queue;
 739
 740         if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
 741                         &queue->sq_wr_avail) < 0)) {
 742                 pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
 743                                 1 + rsp->n_rdma, queue->idx,
 744                                 queue->nvme_sq.ctrl->cntlid);
 745                 atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
 746                 return false;
 747         }
 748
 749         if (nvmet_rdma_need_data_in(rsp)) {
 750                 if (rdma_rw_ctx_post(&rsp->rw, queue->qp,
 751                                 queue->cm_id->port_num, &rsp->read_cqe, NULL))
 752                         nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
 753         } else {
 754                 rsp->req.execute(&rsp->req);
 755         }
 756
 757         return true;
 758 }
 759
 760 static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
 761                 struct nvmet_rdma_rsp *cmd)
 762 {
 763         u16 status;
 764
 765         ib_dma_sync_single_for_cpu(queue->dev->device,
 766                 cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
 767                 DMA_FROM_DEVICE);
 768         ib_dma_sync_single_for_cpu(queue->dev->device,
 769                 cmd->send_sge.addr, cmd->send_sge.length,
 770                 DMA_TO_DEVICE);
 771
 772         if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
 773                         &queue->nvme_sq, &nvmet_rdma_ops))
 774                 return;
 775
 776         status = nvmet_rdma_map_sgl(cmd);
 777         if (status)
 778                 goto out_err;
 779
 780         if (unlikely(!nvmet_rdma_execute_command(cmd))) {
 781                 spin_lock(&queue->rsp_wr_wait_lock);
 782                 list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
 783                 spin_unlock(&queue->rsp_wr_wait_lock);
 784         }
 785
 786         return;
 787
 788 out_err:
 789         nvmet_req_complete(&cmd->req, status);
 790 }
 791
 792 static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 793 {
 794         struct nvmet_rdma_cmd *cmd =
 795                 container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
 796         struct nvmet_rdma_queue *queue = cq->cq_context;
 797         struct nvmet_rdma_rsp *rsp;
 798
 799         if (unlikely(wc->status != IB_WC_SUCCESS)) {
 800                 if (wc->status != IB_WC_WR_FLUSH_ERR) {
 801                         pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
 802                                 wc->wr_cqe, ib_wc_status_msg(wc->status),
 803                                 wc->status);
 804                         nvmet_rdma_error_comp(queue);
 805                 }
 806                 return;
 807         }
 808
 809         if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
 810                 pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
 811                 nvmet_rdma_error_comp(queue);
 812                 return;
 813         }
 814
 815         cmd->queue = queue;
 816         rsp = nvmet_rdma_get_rsp(queue);
 817         if (unlikely(!rsp)) {
 818                 /*
 819                  * we get here only under memory pressure,
 820                  * silently drop and have the host retry
 821                  * as we can't even fail it.
 822                  */
 823                 nvmet_rdma_post_recv(queue->dev, cmd);
 824                 return;
 825         }
 826         rsp->queue = queue;
 827         rsp->cmd = cmd;
 828         rsp->flags = 0;
 829         rsp->req.cmd = cmd->nvme_cmd;
 830         rsp->req.port = queue->port;
 831         rsp->n_rdma = 0;
 832
 833         if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
 834                 unsigned long flags;
 835
 836                 spin_lock_irqsave(&queue->state_lock, flags);
 837                 if (queue->state == NVMET_RDMA_Q_CONNECTING)
 838                         list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
 839                 else
 840                         nvmet_rdma_put_rsp(rsp);
 841                 spin_unlock_irqrestore(&queue->state_lock, flags);
 842                 return;
 843         }
 844
 845         nvmet_rdma_handle_command(queue, rsp);
 846 }
 847
 848 static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
 849 {
 850         if (!ndev->srq)
 851                 return;
 852
 853         nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
 854         ib_destroy_srq(ndev->srq);
 855 }
 856
 857 static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
 858 {
 859         struct ib_srq_init_attr srq_attr = { NULL, };
 860         struct ib_srq *srq;
 861         size_t srq_size;
 862         int ret, i;
 863
 864         srq_size = 4095;        /* XXX: tune */
 865
 866         srq_attr.attr.max_wr = srq_size;
 867         srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
 868         srq_attr.attr.srq_limit = 0;
 869         srq_attr.srq_type = IB_SRQT_BASIC;
 870         srq = ib_create_srq(ndev->pd, &srq_attr);
 871         if (IS_ERR(srq)) {
 872                 /*
 873                  * If SRQs aren't supported we just go ahead and use normal
 874                  * non-shared receive queues.
 875                  */
 876                 pr_info("SRQ requested but not supported.\n");
 877                 return 0;
 878         }
 879
 880         ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
 881         if (IS_ERR(ndev->srq_cmds)) {
 882                 ret = PTR_ERR(ndev->srq_cmds);
 883                 goto out_destroy_srq;
 884         }
 885
 886         ndev->srq = srq;
 887         ndev->srq_size = srq_size;
 888
 889         for (i = 0; i < srq_size; i++) {
 890                 ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
 891                 if (ret)
 892                         goto out_free_cmds;
 893         }
 894
 895         return 0;
 896
 897 out_free_cmds:
 898         nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
 899 out_destroy_srq:
 900         ib_destroy_srq(srq);
 901         return ret;
 902 }
 903
 904 static void nvmet_rdma_free_dev(struct kref *ref)
 905 {
 906         struct nvmet_rdma_device *ndev =
 907                 container_of(ref, struct nvmet_rdma_device, ref);
 908
 909         mutex_lock(&device_list_mutex);
 910         list_del(&ndev->entry);
 911         mutex_unlock(&device_list_mutex);
 912
 913         nvmet_rdma_destroy_srq(ndev);
 914         ib_dealloc_pd(ndev->pd);
 915
 916         kfree(ndev);
 917 }
 918
 919 static struct nvmet_rdma_device *
 920 nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
 921 {
 922         struct nvmet_rdma_port *port = cm_id->context;
 923         struct nvmet_port *nport = port->nport;
 924         struct nvmet_rdma_device *ndev;
 925         int inline_page_count;
 926         int inline_sge_count;
 927         int ret;
 928
 929         mutex_lock(&device_list_mutex);
 930         list_for_each_entry(ndev, &device_list, entry) {
 931                 if (ndev->device->node_guid == cm_id->device->node_guid &&
 932                     kref_get_unless_zero(&ndev->ref))
 933                         goto out_unlock;
 934         }
 935
 936         ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
 937         if (!ndev)
 938                 goto out_err;
 939
 940         inline_page_count = num_pages(nport->inline_data_size);
 941         inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
 942                                 cm_id->device->attrs.max_recv_sge) - 1;
 943         if (inline_page_count > inline_sge_count) {
 944                 pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
 945                         nport->inline_data_size, cm_id->device->name,
 946                         inline_sge_count * PAGE_SIZE);
 947                 nport->inline_data_size = inline_sge_count * PAGE_SIZE;
 948                 inline_page_count = inline_sge_count;
 949         }
 950         ndev->inline_data_size = nport->inline_data_size;
 951         ndev->inline_page_count = inline_page_count;
 952         ndev->device = cm_id->device;
 953         kref_init(&ndev->ref);
 954
 955         ndev->pd = ib_alloc_pd(ndev->device, 0);
 956         if (IS_ERR(ndev->pd))
 957                 goto out_free_dev;
 958
 959         if (nvmet_rdma_use_srq) {
 960                 ret = nvmet_rdma_init_srq(ndev);
 961                 if (ret)
 962                         goto out_free_pd;
 963         }
 964
 965         list_add(&ndev->entry, &device_list);
 966 out_unlock:
 967         mutex_unlock(&device_list_mutex);
 968         pr_debug("added %s.\n", ndev->device->name);
 969         return ndev;
 970
 971 out_free_pd:
 972         ib_dealloc_pd(ndev->pd);
 973 out_free_dev:
 974         kfree(ndev);
 975 out_err:
 976         mutex_unlock(&device_list_mutex);
 977         return NULL;
 978 }
 979
 980 static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
 981 {
 982         struct ib_qp_init_attr qp_attr;
 983         struct nvmet_rdma_device *ndev = queue->dev;
 984         int comp_vector, nr_cqe, ret, i, factor;
 985
 986         /*
 987          * Spread the io queues across completion vectors,
 988          * but still keep all admin queues on vector 0.
 989          */
 990         comp_vector = !queue->host_qid ? 0 :
 991                 queue->idx % ndev->device->num_comp_vectors;
 992
 993         /*
 994          * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
 995          */
 996         nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;
 997
 998         queue->cq = ib_alloc_cq(ndev->device, queue,
 999                         nr_cqe + 1, comp_vector,
1000                         IB_POLL_WORKQUEUE);
1001         if (IS_ERR(queue->cq)) {
1002                 ret = PTR_ERR(queue->cq);
1003                 pr_err("failed to create CQ cqe= %d ret= %d\n",
1004                        nr_cqe + 1, ret);
1005                 goto out;
1006         }
1007
1008         memset(&qp_attr, 0, sizeof(qp_attr));
1009         qp_attr.qp_context = queue;
1010         qp_attr.event_handler = nvmet_rdma_qp_event;
1011         qp_attr.send_cq = queue->cq;
1012         qp_attr.recv_cq = queue->cq;
1013         qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
1014         qp_attr.qp_type = IB_QPT_RC;
1015         /* +1 for drain */
1016         qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
1017         factor = rdma_rw_mr_factor(ndev->device, queue->cm_id->port_num,
1018                                    1 << NVMET_RDMA_MAX_MDTS);
1019         qp_attr.cap.max_rdma_ctxs = queue->send_queue_size * factor;
1020         qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
1021                                         ndev->device->attrs.max_send_sge);
1022
1023         if (ndev->srq) {
1024                 qp_attr.srq = ndev->srq;
1025         } else {
1026                 /* +1 for drain */
1027                 qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
1028                 qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
1029         }
1030
1031         ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
1032         if (ret) {
1033                 pr_err("failed to create_qp ret= %d\n", ret);
1034                 goto err_destroy_cq;
1035         }
1036         queue->qp = queue->cm_id->qp;
1037
1038         atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);
1039
1040         pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
1041                  __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
1042                  qp_attr.cap.max_send_wr, queue->cm_id);
1043
1044         if (!ndev->srq) {
1045                 for (i = 0; i < queue->recv_queue_size; i++) {
1046                         queue->cmds[i].queue = queue;
1047                         ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
1048                         if (ret)
1049                                 goto err_destroy_qp;
1050                 }
1051         }
1052
1053 out:
1054         return ret;
1055
1056 err_destroy_qp:
1057         rdma_destroy_qp(queue->cm_id);
1058 err_destroy_cq:
1059         ib_free_cq(queue->cq);
1060         goto out;
1061 }
1062
1063 static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
1064 {
1065         ib_drain_qp(queue->qp);
1066         if (queue->cm_id)
1067                 rdma_destroy_id(queue->cm_id);
1068         ib_destroy_qp(queue->qp);
1069         ib_free_cq(queue->cq);
1070 }
1071
1072 static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
1073 {
1074         pr_debug("freeing queue %d\n", queue->idx);
1075
1076         nvmet_sq_destroy(&queue->nvme_sq);
1077
1078         nvmet_rdma_destroy_queue_ib(queue);
1079         if (!queue->dev->srq) {
1080                 nvmet_rdma_free_cmds(queue->dev, queue->cmds,
1081                                 queue->recv_queue_size,
1082                                 !queue->host_qid);
1083         }
1084         nvmet_rdma_free_rsps(queue);
1085         ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
1086         kfree(queue);
1087 }
1088
1089 static void nvmet_rdma_release_queue_work(struct work_struct *w)
1090 {
1091         struct nvmet_rdma_queue *queue =
1092                 container_of(w, struct nvmet_rdma_queue, release_work);
1093         struct nvmet_rdma_device *dev = queue->dev;
1094
1095         nvmet_rdma_free_queue(queue);
1096
1097         kref_put(&dev->ref, nvmet_rdma_free_dev);
1098 }
1099
1100 static int
1101 nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
1102                                 struct nvmet_rdma_queue *queue)
1103 {
1104         struct nvme_rdma_cm_req *req;
1105
1106         req = (struct nvme_rdma_cm_req *)conn->private_data;
1107         if (!req || conn->private_data_len == 0)
1108                 return NVME_RDMA_CM_INVALID_LEN;
1109
1110         if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
1111                 return NVME_RDMA_CM_INVALID_RECFMT;
1112
1113         queue->host_qid = le16_to_cpu(req->qid);
1114
1115         /*
1116          * req->hsqsize corresponds to our recv queue size plus 1
1117          * req->hrqsize corresponds to our send queue size
1118          */
1119         queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
1120         queue->send_queue_size = le16_to_cpu(req->hrqsize);
1121
1122         if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
1123                 return NVME_RDMA_CM_INVALID_HSQSIZE;
1124
1125         /* XXX: Should we enforce some kind of max for IO queues? */
1126
1127         return 0;
1128 }
1129
1130 static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
1131                                 enum nvme_rdma_cm_status status)
1132 {
1133         struct nvme_rdma_cm_rej rej;
1134
1135         pr_debug("rejecting connect request: status %d (%s)\n",
1136                  status, nvme_rdma_cm_msg(status));
1137
1138         rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1139         rej.sts = cpu_to_le16(status);
1140
1141         return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
1142 }
1143
1144 static struct nvmet_rdma_queue *
1145 nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
1146                 struct rdma_cm_id *cm_id,
1147                 struct rdma_cm_event *event)
1148 {
1149         struct nvmet_rdma_queue *queue;
1150         int ret;
1151
1152         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
1153         if (!queue) {
1154                 ret = NVME_RDMA_CM_NO_RSC;
1155                 goto out_reject;
1156         }
1157
1158         ret = nvmet_sq_init(&queue->nvme_sq);
1159         if (ret) {
1160                 ret = NVME_RDMA_CM_NO_RSC;
1161                 goto out_free_queue;
1162         }
1163
1164         ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
1165         if (ret)
1166                 goto out_destroy_sq;
1167
1168         /*
1169          * Schedules the actual release because calling rdma_destroy_id from
1170          * inside a CM callback would trigger a deadlock. (great API design..)
1171          */
1172         INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
1173         queue->dev = ndev;
1174         queue->cm_id = cm_id;
1175
1176         spin_lock_init(&queue->state_lock);
1177         queue->state = NVMET_RDMA_Q_CONNECTING;
1178         INIT_LIST_HEAD(&queue->rsp_wait_list);
1179         INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
1180         spin_lock_init(&queue->rsp_wr_wait_lock);
1181         INIT_LIST_HEAD(&queue->free_rsps);
1182         spin_lock_init(&queue->rsps_lock);
1183         INIT_LIST_HEAD(&queue->queue_list);
1184
1185         queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
1186         if (queue->idx < 0) {
1187                 ret = NVME_RDMA_CM_NO_RSC;
1188                 goto out_destroy_sq;
1189         }
1190
1191         ret = nvmet_rdma_alloc_rsps(queue);
1192         if (ret) {
1193                 ret = NVME_RDMA_CM_NO_RSC;
1194                 goto out_ida_remove;
1195         }
1196
1197         if (!ndev->srq) {
1198                 queue->cmds = nvmet_rdma_alloc_cmds(ndev,
1199                                 queue->recv_queue_size,
1200                                 !queue->host_qid);
1201                 if (IS_ERR(queue->cmds)) {
1202                         ret = NVME_RDMA_CM_NO_RSC;
1203                         goto out_free_responses;
1204                 }
1205         }
1206
1207         ret = nvmet_rdma_create_queue_ib(queue);
1208         if (ret) {
1209                 pr_err("%s: creating RDMA queue failed (%d).\n",
1210                         __func__, ret);
1211                 ret = NVME_RDMA_CM_NO_RSC;
1212                 goto out_free_cmds;
1213         }
1214
1215         return queue;
1216
1217 out_free_cmds:
1218         if (!ndev->srq) {
1219                 nvmet_rdma_free_cmds(queue->dev, queue->cmds,
1220                                 queue->recv_queue_size,
1221                                 !queue->host_qid);
1222         }
1223 out_free_responses:
1224         nvmet_rdma_free_rsps(queue);
1225 out_ida_remove:
1226         ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
1227 out_destroy_sq:
1228         nvmet_sq_destroy(&queue->nvme_sq);
1229 out_free_queue:
1230         kfree(queue);
1231 out_reject:
1232         nvmet_rdma_cm_reject(cm_id, ret);
1233         return NULL;
1234 }
1235
1236 static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
1237 {
1238         struct nvmet_rdma_queue *queue = priv;
1239
1240         switch (event->event) {
1241         case IB_EVENT_COMM_EST:
1242                 rdma_notify(queue->cm_id, event->event);
1243                 break;
1244         default:
1245                 pr_err("received IB QP event: %s (%d)\n",
1246                        ib_event_msg(event->event), event->event);
1247                 break;
1248         }
1249 }
1250
1251 static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
1252                 struct nvmet_rdma_queue *queue,
1253                 struct rdma_conn_param *p)
1254 {
1255         struct rdma_conn_param  param = { };
1256         struct nvme_rdma_cm_rep priv = { };
1257         int ret = -ENOMEM;
1258
1259         param.rnr_retry_count = 7;
1260         param.flow_control = 1;
1261         param.initiator_depth = min_t(u8, p->initiator_depth,
1262                 queue->dev->device->attrs.max_qp_init_rd_atom);
1263         param.private_data = &priv;
1264         param.private_data_len = sizeof(priv);
1265         priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
1266         priv.crqsize = cpu_to_le16(queue->recv_queue_size);
1267
1268         ret = rdma_accept(cm_id, &param);
1269         if (ret)
1270                 pr_err("rdma_accept failed (error code = %d)\n", ret);
1271
1272         return ret;
1273 }
1274
/*
 * Handle a CM connect request: take a reference on the matching device,
 * allocate the queue, accept the connection and add the queue to
 * nvmet_rdma_queue_list.
 *
 * Returns 0 on success.  A non-zero return is handed back to the CM core
 * by the event handler; see the comment at the accept failure below.
 */
static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
                struct rdma_cm_event *event)
{
        struct nvmet_rdma_port *port = cm_id->context;
        struct nvmet_rdma_device *ndev;
        struct nvmet_rdma_queue *queue;
        int ret;

        ndev = nvmet_rdma_find_get_device(cm_id);
        if (!ndev) {
                nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
                return -ECONNREFUSED;
        }

        /* nvmet_rdma_alloc_queue() rejects the request itself on failure. */
        queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
        if (!queue) {
                kref_put(&ndev->ref, nvmet_rdma_free_dev);
                return -ENOMEM;
        }
        queue->port = port->nport;

        /* Admin queue: let inflight controller teardown complete. */
        if (queue->host_qid == 0)
                flush_scheduled_work();

        ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
        if (ret) {
                /*
                 * Don't destroy the cm_id in free path, as we implicitly
                 * destroy the cm_id here with non-zero ret code.
                 */
                queue->cm_id = NULL;
                nvmet_rdma_free_queue(queue);
                kref_put(&ndev->ref, nvmet_rdma_free_dev);
                return ret;
        }

        mutex_lock(&nvmet_rdma_queue_mutex);
        list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
        mutex_unlock(&nvmet_rdma_queue_mutex);

        return 0;
}
1324
1325 static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
1326 {
1327         unsigned long flags;
1328
1329         spin_lock_irqsave(&queue->state_lock, flags);
1330         if (queue->state != NVMET_RDMA_Q_CONNECTING) {
1331                 pr_warn("trying to establish a connected queue\n");
1332                 goto out_unlock;
1333         }
1334         queue->state = NVMET_RDMA_Q_LIVE;
1335
1336         while (!list_empty(&queue->rsp_wait_list)) {
1337                 struct nvmet_rdma_rsp *cmd;
1338
1339                 cmd = list_first_entry(&queue->rsp_wait_list,
1340                                         struct nvmet_rdma_rsp, wait_list);
1341                 list_del(&cmd->wait_list);
1342
1343                 spin_unlock_irqrestore(&queue->state_lock, flags);
1344                 nvmet_rdma_handle_command(queue, cmd);
1345                 spin_lock_irqsave(&queue->state_lock, flags);
1346         }
1347
1348 out_unlock:
1349         spin_unlock_irqrestore(&queue->state_lock, flags);
1350 }
1351
1352 static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
1353 {
1354         bool disconnect = false;
1355         unsigned long flags;
1356
1357         pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);
1358
1359         spin_lock_irqsave(&queue->state_lock, flags);
1360         switch (queue->state) {
1361         case NVMET_RDMA_Q_CONNECTING:
1362         case NVMET_RDMA_Q_LIVE:
1363                 queue->state = NVMET_RDMA_Q_DISCONNECTING;
1364                 disconnect = true;
1365                 break;
1366         case NVMET_RDMA_Q_DISCONNECTING:
1367                 break;
1368         }
1369         spin_unlock_irqrestore(&queue->state_lock, flags);
1370
1371         if (disconnect) {
1372                 rdma_disconnect(queue->cm_id);
1373                 schedule_work(&queue->release_work);
1374         }
1375 }
1376
1377 static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
1378 {
1379         bool disconnect = false;
1380
1381         mutex_lock(&nvmet_rdma_queue_mutex);
1382         if (!list_empty(&queue->queue_list)) {
1383                 list_del_init(&queue->queue_list);
1384                 disconnect = true;
1385         }
1386         mutex_unlock(&nvmet_rdma_queue_mutex);
1387
1388         if (disconnect)
1389                 __nvmet_rdma_queue_disconnect(queue);
1390 }
1391
1392 static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
1393                 struct nvmet_rdma_queue *queue)
1394 {
1395         WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);
1396
1397         mutex_lock(&nvmet_rdma_queue_mutex);
1398         if (!list_empty(&queue->queue_list))
1399                 list_del_init(&queue->queue_list);
1400         mutex_unlock(&nvmet_rdma_queue_mutex);
1401
1402         pr_err("failed to connect queue %d\n", queue->idx);
1403         schedule_work(&queue->release_work);
1404 }
1405
1406 /**
1407  * nvme_rdma_device_removal() - Handle RDMA device removal
1408  * @cm_id:      rdma_cm id, used for nvmet port
1409  * @queue:      nvmet rdma queue (cm id qp_context)
1410  *
1411  * DEVICE_REMOVAL event notifies us that the RDMA device is about
1412  * to unplug. Note that this event can be generated on a normal
1413  * queue cm_id and/or a device bound listener cm_id (where in this
1414  * case queue will be null).
1415  *
1416  * We registered an ib_client to handle device removal for queues,
1417  * so we only need to handle the listening port cm_ids. In this case
1418  * we nullify the priv to prevent double cm_id destruction and destroying
1419  * the cm_id implicitely by returning a non-zero rc to the callout.
1420  */
1421 static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
1422                 struct nvmet_rdma_queue *queue)
1423 {
1424         struct nvmet_rdma_port *port;
1425
1426         if (queue) {
1427                 /*
1428                  * This is a queue cm_id. we have registered
1429                  * an ib_client to handle queues removal
1430                  * so don't interfear and just return.
1431                  */
1432                 return 0;
1433         }
1434
1435         port = cm_id->context;
1436
1437         /*
1438          * This is a listener cm_id. Make sure that
1439          * future remove_port won't invoke a double
1440          * cm_id destroy. use atomic xchg to make sure
1441          * we don't compete with remove_port.
1442          */
1443         if (xchg(&port->cm_id, NULL) != cm_id)
1444                 return 0;
1445
1446         /*
1447          * We need to return 1 so that the core will destroy
1448          * it's own ID.  What a great API design..
1449          */
1450         return 1;
1451 }
1452
1453 static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
1454                 struct rdma_cm_event *event)
1455 {
1456         struct nvmet_rdma_queue *queue = NULL;
1457         int ret = 0;
1458
1459         if (cm_id->qp)
1460                 queue = cm_id->qp->qp_context;
1461
1462         pr_debug("%s (%d): status %d id %p\n",
1463                 rdma_event_msg(event->event), event->event,
1464                 event->status, cm_id);
1465
1466         switch (event->event) {
1467         case RDMA_CM_EVENT_CONNECT_REQUEST:
1468                 ret = nvmet_rdma_queue_connect(cm_id, event);
1469                 break;
1470         case RDMA_CM_EVENT_ESTABLISHED:
1471                 nvmet_rdma_queue_established(queue);
1472                 break;
1473         case RDMA_CM_EVENT_ADDR_CHANGE:
1474                 if (!queue) {
1475                         struct nvmet_rdma_port *port = cm_id->context;
1476
1477                         schedule_delayed_work(&port->repair_work, 0);
1478                         break;
1479                 }
1480                 /* FALLTHROUGH */
1481         case RDMA_CM_EVENT_DISCONNECTED:
1482         case RDMA_CM_EVENT_TIMEWAIT_EXIT:
1483                 nvmet_rdma_queue_disconnect(queue);
1484                 break;
1485         case RDMA_CM_EVENT_DEVICE_REMOVAL:
1486                 ret = nvmet_rdma_device_removal(cm_id, queue);
1487                 break;
1488         case RDMA_CM_EVENT_REJECTED:
1489                 pr_debug("Connection rejected: %s\n",
1490                          rdma_reject_msg(cm_id, event->status));
1491                 /* FALLTHROUGH */
1492         case RDMA_CM_EVENT_UNREACHABLE:
1493         case RDMA_CM_EVENT_CONNECT_ERROR:
1494                 nvmet_rdma_queue_connect_fail(cm_id, queue);
1495                 break;
1496         default:
1497                 pr_err("received unrecognized RDMA CM event %d\n",
1498                         event->event);
1499                 break;
1500         }
1501
1502         return ret;
1503 }
1504
1505 static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
1506 {
1507         struct nvmet_rdma_queue *queue;
1508
1509 restart:
1510         mutex_lock(&nvmet_rdma_queue_mutex);
1511         list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
1512                 if (queue->nvme_sq.ctrl == ctrl) {
1513                         list_del_init(&queue->queue_list);
1514                         mutex_unlock(&nvmet_rdma_queue_mutex);
1515
1516                         __nvmet_rdma_queue_disconnect(queue);
1517                         goto restart;
1518                 }
1519         }
1520         mutex_unlock(&nvmet_rdma_queue_mutex);
1521 }
1522
1523 static void nvmet_rdma_disable_port(struct nvmet_rdma_port *port)
1524 {
1525         struct rdma_cm_id *cm_id = xchg(&port->cm_id, NULL);
1526
1527         if (cm_id)
1528                 rdma_destroy_id(cm_id);
1529 }
1530
1531 static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port)
1532 {
1533         struct sockaddr *addr = (struct sockaddr *)&port->addr;
1534         struct rdma_cm_id *cm_id;
1535         int ret;
1536
1537         cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
1538                         RDMA_PS_TCP, IB_QPT_RC);
1539         if (IS_ERR(cm_id)) {
1540                 pr_err("CM ID creation failed\n");
1541                 return PTR_ERR(cm_id);
1542         }
1543
1544         /*
1545          * Allow both IPv4 and IPv6 sockets to bind a single port
1546          * at the same time.
1547          */
1548         ret = rdma_set_afonly(cm_id, 1);
1549         if (ret) {
1550                 pr_err("rdma_set_afonly failed (%d)\n", ret);
1551                 goto out_destroy_id;
1552         }
1553
1554         ret = rdma_bind_addr(cm_id, addr);
1555         if (ret) {
1556                 pr_err("binding CM ID to %pISpcs failed (%d)\n", addr, ret);
1557                 goto out_destroy_id;
1558         }
1559
1560         ret = rdma_listen(cm_id, 128);
1561         if (ret) {
1562                 pr_err("listening to %pISpcs failed (%d)\n", addr, ret);
1563                 goto out_destroy_id;
1564         }
1565
1566         port->cm_id = cm_id;
1567         return 0;
1568
1569 out_destroy_id:
1570         rdma_destroy_id(cm_id);
1571         return ret;
1572 }
1573
1574 static void nvmet_rdma_repair_port_work(struct work_struct *w)
1575 {
1576         struct nvmet_rdma_port *port = container_of(to_delayed_work(w),
1577                         struct nvmet_rdma_port, repair_work);
1578         int ret;
1579
1580         nvmet_rdma_disable_port(port);
1581         ret = nvmet_rdma_enable_port(port);
1582         if (ret)
1583                 schedule_delayed_work(&port->repair_work, 5 * HZ);
1584 }
1585
1586 static int nvmet_rdma_add_port(struct nvmet_port *nport)
1587 {
1588         struct nvmet_rdma_port *port;
1589         __kernel_sa_family_t af;
1590         int ret;
1591
1592         port = kzalloc(sizeof(*port), GFP_KERNEL);
1593         if (!port)
1594                 return -ENOMEM;
1595
1596         nport->priv = port;
1597         port->nport = nport;
1598         INIT_DELAYED_WORK(&port->repair_work, nvmet_rdma_repair_port_work);
1599
1600         switch (nport->disc_addr.adrfam) {
1601         case NVMF_ADDR_FAMILY_IP4:
1602                 af = AF_INET;
1603                 break;
1604         case NVMF_ADDR_FAMILY_IP6:
1605                 af = AF_INET6;
1606                 break;
1607         default:
1608                 pr_err("address family %d not supported\n",
1609                         nport->disc_addr.adrfam);
1610                 ret = -EINVAL;
1611                 goto out_free_port;
1612         }
1613
1614         if (nport->inline_data_size < 0) {
1615                 nport->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
1616         } else if (nport->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
1617                 pr_warn("inline_data_size %u is too large, reducing to %u\n",
1618                         nport->inline_data_size,
1619                         NVMET_RDMA_MAX_INLINE_DATA_SIZE);
1620                 nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
1621         }
1622
1623         ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
1624                         nport->disc_addr.trsvcid, &port->addr);
1625         if (ret) {
1626                 pr_err("malformed ip/port passed: %s:%s\n",
1627                         nport->disc_addr.traddr, nport->disc_addr.trsvcid);
1628                 goto out_free_port;
1629         }
1630
1631         ret = nvmet_rdma_enable_port(port);
1632         if (ret)
1633                 goto out_free_port;
1634
1635         pr_info("enabling port %d (%pISpcs)\n",
1636                 le16_to_cpu(nport->disc_addr.portid),
1637                 (struct sockaddr *)&port->addr);
1638
1639         return 0;
1640
1641 out_free_port:
1642         kfree(port);
1643         return ret;
1644 }
1645
1646 static void nvmet_rdma_remove_port(struct nvmet_port *nport)
1647 {
1648         struct nvmet_rdma_port *port = nport->priv;
1649
1650         cancel_delayed_work_sync(&port->repair_work);
1651         nvmet_rdma_disable_port(port);
1652         kfree(port);
1653 }
1654
1655 static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
1656                 struct nvmet_port *nport, char *traddr)
1657 {
1658         struct nvmet_rdma_port *port = nport->priv;
1659         struct rdma_cm_id *cm_id = port->cm_id;
1660
1661         if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
1662                 struct nvmet_rdma_rsp *rsp =
1663                         container_of(req, struct nvmet_rdma_rsp, req);
1664                 struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
1665                 struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;
1666
1667                 sprintf(traddr, "%pISc", addr);
1668         } else {
1669                 memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
1670         }
1671 }
1672
/*
 * Transport ->get_mdts callback.  Reports the fixed NVMET_RDMA_MAX_MDTS
 * value for every controller; @ctrl is not consulted.
 */
static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl)
{
        return NVMET_RDMA_MAX_MDTS;
}
1677
/*
 * Operations table of the RDMA transport.  It is registered with
 * nvmet_register_transport() in nvmet_rdma_init() and unregistered in
 * nvmet_rdma_exit().
 */
static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
        .owner                  = THIS_MODULE,
        .type                   = NVMF_TRTYPE_RDMA,
        .msdbd                  = 1,
        .has_keyed_sgls         = 1,
        .add_port               = nvmet_rdma_add_port,
        .remove_port            = nvmet_rdma_remove_port,
        .queue_response         = nvmet_rdma_queue_response,
        .delete_ctrl            = nvmet_rdma_delete_ctrl,
        .disc_traddr            = nvmet_rdma_disc_port_addr,
        .get_mdts               = nvmet_rdma_get_mdts,
};
1690
1691 static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
1692 {
1693         struct nvmet_rdma_queue *queue, *tmp;
1694         struct nvmet_rdma_device *ndev;
1695         bool found = false;
1696
1697         mutex_lock(&device_list_mutex);
1698         list_for_each_entry(ndev, &device_list, entry) {
1699                 if (ndev->device == ib_device) {
1700                         found = true;
1701                         break;
1702                 }
1703         }
1704         mutex_unlock(&device_list_mutex);
1705
1706         if (!found)
1707                 return;
1708
1709         /*
1710          * IB Device that is used by nvmet controllers is being removed,
1711          * delete all queues using this device.
1712          */
1713         mutex_lock(&nvmet_rdma_queue_mutex);
1714         list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
1715                                  queue_list) {
1716                 if (queue->dev->device != ib_device)
1717                         continue;
1718
1719                 pr_info("Removing queue %d\n", queue->idx);
1720                 list_del_init(&queue->queue_list);
1721                 __nvmet_rdma_queue_disconnect(queue);
1722         }
1723         mutex_unlock(&nvmet_rdma_queue_mutex);
1724
1725         flush_scheduled_work();
1726 }
1727
/*
 * IB client registered in nvmet_rdma_init().  Only a ->remove callback is
 * provided, which tears down the queues of a device that is being removed.
 */
static struct ib_client nvmet_rdma_ib_client = {
        .name   = "nvmet_rdma",
        .remove = nvmet_rdma_remove_one
};
1732
1733 static int __init nvmet_rdma_init(void)
1734 {
1735         int ret;
1736
1737         ret = ib_register_client(&nvmet_rdma_ib_client);
1738         if (ret)
1739                 return ret;
1740
1741         ret = nvmet_register_transport(&nvmet_rdma_ops);
1742         if (ret)
1743                 goto err_ib_client;
1744
1745         return 0;
1746
1747 err_ib_client:
1748         ib_unregister_client(&nvmet_rdma_ib_client);
1749         return ret;
1750 }
1751
/*
 * Module exit: undo nvmet_rdma_init() in reverse order, then release the
 * queue index allocator.
 */
static void __exit nvmet_rdma_exit(void)
{
        nvmet_unregister_transport(&nvmet_rdma_ops);
        ib_unregister_client(&nvmet_rdma_ib_client);
        /* Complain (once) if any queue is still on the list at this point. */
        WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
        ida_destroy(&nvmet_rdma_queue_ida);
}
1759
/* Module entry and exit points. */
module_init(nvmet_rdma_init);
module_exit(nvmet_rdma_exit);

/* Module metadata; the alias names the transport by its type number. */
MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */