drivers/nvme/target/rdma.c
// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe over Fabrics RDMA target.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/nvme.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/wait.h>
#include <linux/inet.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/rw.h>

#include <linux/nvme-rdma.h>
#include "nvmet.h"

/*
 * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
 */
#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE	PAGE_SIZE
#define NVMET_RDMA_MAX_INLINE_SGE		4
#define NVMET_RDMA_MAX_INLINE_DATA_SIZE		max_t(int, SZ_16K, PAGE_SIZE)

/* Assume mpsmin == device_page_size == 4KB */
#define NVMET_RDMA_MAX_MDTS			8

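/*
 * Each command capsule is received into sge[0]; sge[1..] map the optional
 * inline data pages described by inline_sg below.
 */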
struct nvmet_rdma_cmd {
        struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
        struct ib_cqe cqe;
        struct ib_recv_wr wr;
        struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
        struct nvme_command *nvme_cmd;
        struct nvmet_rdma_queue *queue;
};

enum {
        NVMET_RDMA_REQ_INLINE_DATA      = (1 << 0),
        NVMET_RDMA_REQ_INVALIDATE_RKEY  = (1 << 1),
};

struct nvmet_rdma_rsp {
        struct ib_sge send_sge;
        struct ib_cqe send_cqe;
        struct ib_send_wr send_wr;

        struct nvmet_rdma_cmd *cmd;
        struct nvmet_rdma_queue *queue;

        struct ib_cqe read_cqe;
        struct rdma_rw_ctx rw;

        struct nvmet_req req;

        bool allocated;
        u8 n_rdma;
        u32 flags;
        u32 invalidate_rkey;

        struct list_head wait_list;
        struct list_head free_list;
};

enum nvmet_rdma_queue_state {
        NVMET_RDMA_Q_CONNECTING,
        NVMET_RDMA_Q_LIVE,
        NVMET_RDMA_Q_DISCONNECTING,
};

struct nvmet_rdma_queue {
        struct rdma_cm_id *cm_id;
        struct ib_qp *qp;
        struct nvmet_port *port;
        struct ib_cq *cq;
        atomic_t sq_wr_avail;
        struct nvmet_rdma_device *dev;
        spinlock_t state_lock;
        enum nvmet_rdma_queue_state state;
        struct nvmet_cq nvme_cq;
        struct nvmet_sq nvme_sq;

        struct nvmet_rdma_rsp *rsps;
        struct list_head free_rsps;
        spinlock_t rsps_lock;
        struct nvmet_rdma_cmd *cmds;

        struct work_struct release_work;
        struct list_head rsp_wait_list;
        struct list_head rsp_wr_wait_list;
        spinlock_t rsp_wr_wait_lock;

        int idx;
        int host_qid;
        int recv_queue_size;
        int send_queue_size;

        struct list_head queue_list;
};

struct nvmet_rdma_port {
        struct nvmet_port *nport;
        struct sockaddr_storage addr;
        struct rdma_cm_id *cm_id;
        struct delayed_work repair_work;
};

struct nvmet_rdma_device {
        struct ib_device *device;
        struct ib_pd *pd;
        struct ib_srq *srq;
        struct nvmet_rdma_cmd *srq_cmds;
        size_t srq_size;
        struct kref ref;
        struct list_head entry;
        int inline_data_size;
        int inline_page_count;
};

static bool nvmet_rdma_use_srq;
module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
MODULE_PARM_DESC(use_srq, "Use shared receive queue.");

static DEFINE_IDA(nvmet_rdma_queue_ida);
static LIST_HEAD(nvmet_rdma_queue_list);
static DEFINE_MUTEX(nvmet_rdma_queue_mutex);

static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_list_mutex);

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_rsp *r);
static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_rsp *r);

static const struct nvmet_fabrics_ops nvmet_rdma_ops;

static int num_pages(int len)
{
        return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
}

static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
{
        return nvme_is_write(rsp->req.cmd) &&
                rsp->req.transfer_len &&
                !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
{
        return !nvme_is_write(rsp->req.cmd) &&
                rsp->req.transfer_len &&
                !rsp->req.cqe->status &&
                !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

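/*
 * Responses are normally taken from the queue's pre-allocated free_rsps pool;
 * under memory pressure we fall back to a dynamic allocation and mark it with
 * ->allocated so nvmet_rdma_put_rsp() knows to free it instead of returning
 * it to the pool.
 */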
static inline struct nvmet_rdma_rsp *
nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
{
        struct nvmet_rdma_rsp *rsp;
        unsigned long flags;

        spin_lock_irqsave(&queue->rsps_lock, flags);
        rsp = list_first_entry_or_null(&queue->free_rsps,
                        struct nvmet_rdma_rsp, free_list);
        if (likely(rsp))
                list_del(&rsp->free_list);
        spin_unlock_irqrestore(&queue->rsps_lock, flags);

        if (unlikely(!rsp)) {
                int ret;

                rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
                if (unlikely(!rsp))
                        return NULL;
                ret = nvmet_rdma_alloc_rsp(queue->dev, rsp);
                if (unlikely(ret)) {
                        kfree(rsp);
                        return NULL;
                }

                rsp->allocated = true;
        }

        return rsp;
}

static inline void
nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
{
        unsigned long flags;

        if (unlikely(rsp->allocated)) {
                nvmet_rdma_free_rsp(rsp->queue->dev, rsp);
                kfree(rsp);
                return;
        }

        spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
        list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
        spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
}

static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_cmd *c)
{
        struct scatterlist *sg;
        struct ib_sge *sge;
        int i;

        if (!ndev->inline_data_size)
                return;

        sg = c->inline_sg;
        sge = &c->sge[1];

        for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
                if (sge->length)
                        ib_dma_unmap_page(ndev->device, sge->addr,
                                        sge->length, DMA_FROM_DEVICE);
                if (sg_page(sg))
                        __free_page(sg_page(sg));
        }
}

static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_cmd *c)
{
        struct scatterlist *sg;
        struct ib_sge *sge;
        struct page *pg;
        int len;
        int i;

        if (!ndev->inline_data_size)
                return 0;

        sg = c->inline_sg;
        sg_init_table(sg, ndev->inline_page_count);
        sge = &c->sge[1];
        len = ndev->inline_data_size;

        for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
                pg = alloc_page(GFP_KERNEL);
                if (!pg)
                        goto out_err;
                sg_assign_page(sg, pg);
                sge->addr = ib_dma_map_page(ndev->device,
                        pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
                if (ib_dma_mapping_error(ndev->device, sge->addr))
                        goto out_err;
                sge->length = min_t(int, len, PAGE_SIZE);
                sge->lkey = ndev->pd->local_dma_lkey;
                len -= sge->length;
        }

        return 0;
out_err:
        for (; i >= 0; i--, sg--, sge--) {
                if (sge->length)
                        ib_dma_unmap_page(ndev->device, sge->addr,
                                        sge->length, DMA_FROM_DEVICE);
                if (sg_page(sg))
                        __free_page(sg_page(sg));
        }
        return -ENOMEM;
}

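/*
 * A command buffer maps the 64-byte NVMe command capsule into sge[0] and,
 * for I/O queues, a set of inline data pages behind it; admin queues post
 * the command SGE only.
 */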
static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_cmd *c, bool admin)
{
        /* NVMe command / RDMA RECV */
        c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
        if (!c->nvme_cmd)
                goto out;

        c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
                        sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
        if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
                goto out_free_cmd;

        c->sge[0].length = sizeof(*c->nvme_cmd);
        c->sge[0].lkey = ndev->pd->local_dma_lkey;

        if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
                goto out_unmap_cmd;

        c->cqe.done = nvmet_rdma_recv_done;

        c->wr.wr_cqe = &c->cqe;
        c->wr.sg_list = c->sge;
        c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;

        return 0;

out_unmap_cmd:
        ib_dma_unmap_single(ndev->device, c->sge[0].addr,
                        sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
out_free_cmd:
        kfree(c->nvme_cmd);

out:
        return -ENOMEM;
}

static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_cmd *c, bool admin)
{
        if (!admin)
                nvmet_rdma_free_inline_pages(ndev, c);
        ib_dma_unmap_single(ndev->device, c->sge[0].addr,
                        sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
        kfree(c->nvme_cmd);
}

static struct nvmet_rdma_cmd *
nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
                int nr_cmds, bool admin)
{
        struct nvmet_rdma_cmd *cmds;
        int ret = -EINVAL, i;

        cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
        if (!cmds)
                goto out;

        for (i = 0; i < nr_cmds; i++) {
                ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
                if (ret)
                        goto out_free;
        }

        return cmds;

out_free:
        while (--i >= 0)
                nvmet_rdma_free_cmd(ndev, cmds + i, admin);
        kfree(cmds);
out:
        return ERR_PTR(ret);
}

static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
{
        int i;

        for (i = 0; i < nr_cmds; i++)
                nvmet_rdma_free_cmd(ndev, cmds + i, admin);
        kfree(cmds);
}

static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_rsp *r)
{
        /* NVMe CQE / RDMA SEND */
        r->req.cqe = kmalloc(sizeof(*r->req.cqe), GFP_KERNEL);
        if (!r->req.cqe)
                goto out;

        r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.cqe,
                        sizeof(*r->req.cqe), DMA_TO_DEVICE);
        if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
                goto out_free_rsp;

        r->req.p2p_client = &ndev->device->dev;
        r->send_sge.length = sizeof(*r->req.cqe);
        r->send_sge.lkey = ndev->pd->local_dma_lkey;

        r->send_cqe.done = nvmet_rdma_send_done;

        r->send_wr.wr_cqe = &r->send_cqe;
        r->send_wr.sg_list = &r->send_sge;
        r->send_wr.num_sge = 1;
        r->send_wr.send_flags = IB_SEND_SIGNALED;

        /* Data In / RDMA READ */
        r->read_cqe.done = nvmet_rdma_read_data_done;
        return 0;

out_free_rsp:
        kfree(r->req.cqe);
out:
        return -ENOMEM;
}

static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_rsp *r)
{
        ib_dma_unmap_single(ndev->device, r->send_sge.addr,
                        sizeof(*r->req.cqe), DMA_TO_DEVICE);
        kfree(r->req.cqe);
}

static int
nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
{
        struct nvmet_rdma_device *ndev = queue->dev;
        int nr_rsps = queue->recv_queue_size * 2;
        int ret = -EINVAL, i;

        queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
                        GFP_KERNEL);
        if (!queue->rsps)
                goto out;

        for (i = 0; i < nr_rsps; i++) {
                struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

                ret = nvmet_rdma_alloc_rsp(ndev, rsp);
                if (ret)
                        goto out_free;

                list_add_tail(&rsp->free_list, &queue->free_rsps);
        }

        return 0;

out_free:
        while (--i >= 0) {
                struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

                list_del(&rsp->free_list);
                nvmet_rdma_free_rsp(ndev, rsp);
        }
        kfree(queue->rsps);
out:
        return ret;
}

static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
{
        struct nvmet_rdma_device *ndev = queue->dev;
        int i, nr_rsps = queue->recv_queue_size * 2;

        for (i = 0; i < nr_rsps; i++) {
                struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

                list_del(&rsp->free_list);
                nvmet_rdma_free_rsp(ndev, rsp);
        }
        kfree(queue->rsps);
}

static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
                struct nvmet_rdma_cmd *cmd)
{
        int ret;

        ib_dma_sync_single_for_device(ndev->device,
                cmd->sge[0].addr, cmd->sge[0].length,
                DMA_FROM_DEVICE);

        if (ndev->srq)
                ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
        else
                ret = ib_post_recv(cmd->queue->qp, &cmd->wr, NULL);

        if (unlikely(ret))
                pr_err("post_recv cmd failed\n");

        return ret;
}

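/*
 * Commands that could not reserve enough send queue slots in
 * nvmet_rdma_execute_command() sit on rsp_wr_wait_list; retry them in order
 * as completions return slots to sq_wr_avail, stopping at the first command
 * that still does not fit.
 */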
static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
{
        spin_lock(&queue->rsp_wr_wait_lock);
        while (!list_empty(&queue->rsp_wr_wait_list)) {
                struct nvmet_rdma_rsp *rsp;
                bool ret;

                rsp = list_entry(queue->rsp_wr_wait_list.next,
                                struct nvmet_rdma_rsp, wait_list);
                list_del(&rsp->wait_list);

                spin_unlock(&queue->rsp_wr_wait_lock);
                ret = nvmet_rdma_execute_command(rsp);
                spin_lock(&queue->rsp_wr_wait_lock);

                if (!ret) {
                        list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
                        break;
                }
        }
        spin_unlock(&queue->rsp_wr_wait_lock);
}

static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
{
        struct nvmet_rdma_queue *queue = rsp->queue;

        atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);

        if (rsp->n_rdma) {
                rdma_rw_ctx_destroy(&rsp->rw, queue->qp,
                                queue->cm_id->port_num, rsp->req.sg,
                                rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
        }

        if (rsp->req.sg != rsp->cmd->inline_sg)
                nvmet_req_free_sgl(&rsp->req);

        if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
                nvmet_rdma_process_wr_wait_list(queue);

        nvmet_rdma_put_rsp(rsp);
}

static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
{
        if (queue->nvme_sq.ctrl) {
                nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
        } else {
                /*
                 * The controller was not set up yet (admin connect error),
                 * so just disconnect and clean up the queue.
                 */
                nvmet_rdma_queue_disconnect(queue);
        }
}

static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
        struct nvmet_rdma_rsp *rsp =
                container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
        struct nvmet_rdma_queue *queue = cq->cq_context;

        nvmet_rdma_release_rsp(rsp);

        if (unlikely(wc->status != IB_WC_SUCCESS &&
                     wc->status != IB_WC_WR_FLUSH_ERR)) {
                pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
                        wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
                nvmet_rdma_error_comp(queue);
        }
}

static void nvmet_rdma_queue_response(struct nvmet_req *req)
{
        struct nvmet_rdma_rsp *rsp =
                container_of(req, struct nvmet_rdma_rsp, req);
        struct rdma_cm_id *cm_id = rsp->queue->cm_id;
        struct ib_send_wr *first_wr;

        if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
                rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
                rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
        } else {
                rsp->send_wr.opcode = IB_WR_SEND;
        }

        if (nvmet_rdma_need_data_out(rsp))
                first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
                                cm_id->port_num, NULL, &rsp->send_wr);
        else
                first_wr = &rsp->send_wr;

        nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);

        ib_dma_sync_single_for_device(rsp->queue->dev->device,
                rsp->send_sge.addr, rsp->send_sge.length,
                DMA_TO_DEVICE);

        if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) {
                pr_err("sending cmd response failed\n");
                nvmet_rdma_release_rsp(rsp);
        }
}

static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
{
        struct nvmet_rdma_rsp *rsp =
                container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
        struct nvmet_rdma_queue *queue = cq->cq_context;

        WARN_ON(rsp->n_rdma <= 0);
        atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
        rdma_rw_ctx_destroy(&rsp->rw, queue->qp,
                        queue->cm_id->port_num, rsp->req.sg,
                        rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
        rsp->n_rdma = 0;

        if (unlikely(wc->status != IB_WC_SUCCESS)) {
                nvmet_req_uninit(&rsp->req);
                nvmet_rdma_release_rsp(rsp);
                if (wc->status != IB_WC_WR_FLUSH_ERR) {
                        pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
                                wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
                        nvmet_rdma_error_comp(queue);
                }
                return;
        }

        rsp->req.execute(&rsp->req);
}

static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
                u64 off)
{
        int sg_count = num_pages(len);
        struct scatterlist *sg;
        int i;

        sg = rsp->cmd->inline_sg;
        for (i = 0; i < sg_count; i++, sg++) {
                if (i < sg_count - 1)
                        sg_unmark_end(sg);
                else
                        sg_mark_end(sg);
                sg->offset = off;
                sg->length = min_t(int, len, PAGE_SIZE - off);
                len -= sg->length;
                if (!i)
                        off = 0;
        }

        rsp->req.sg = rsp->cmd->inline_sg;
        rsp->req.sg_cnt = sg_count;
}

static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
{
        struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
        u64 off = le64_to_cpu(sgl->addr);
        u32 len = le32_to_cpu(sgl->length);

        if (!nvme_is_write(rsp->req.cmd)) {
                rsp->req.error_loc =
                        offsetof(struct nvme_common_command, opcode);
                return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
        }

        if (off + len > rsp->queue->dev->inline_data_size) {
                pr_err("invalid inline data offset!\n");
                return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
        }

        /* no data command? */
        if (!len)
                return 0;

        nvmet_rdma_use_inline_sg(rsp, len, off);
        rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
        rsp->req.transfer_len += len;
        return 0;
}

static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
                struct nvme_keyed_sgl_desc *sgl, bool invalidate)
{
        struct rdma_cm_id *cm_id = rsp->queue->cm_id;
        u64 addr = le64_to_cpu(sgl->addr);
        u32 key = get_unaligned_le32(sgl->key);
        int ret;

        rsp->req.transfer_len = get_unaligned_le24(sgl->length);

        /* no data command? */
        if (!rsp->req.transfer_len)
                return 0;

        ret = nvmet_req_alloc_sgl(&rsp->req);
        if (unlikely(ret < 0))
                goto error_out;

        ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
                        rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
                        nvmet_data_dir(&rsp->req));
        if (unlikely(ret < 0))
                goto error_out;
        rsp->n_rdma += ret;

        if (invalidate) {
                rsp->invalidate_rkey = key;
                rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
        }

        return 0;

error_out:
        rsp->req.transfer_len = 0;
        return NVME_SC_INTERNAL;
}

static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
{
        struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;

        switch (sgl->type >> 4) {
        case NVME_SGL_FMT_DATA_DESC:
                switch (sgl->type & 0xf) {
                case NVME_SGL_FMT_OFFSET:
                        return nvmet_rdma_map_sgl_inline(rsp);
                default:
                        pr_err("invalid SGL subtype: %#x\n", sgl->type);
                        rsp->req.error_loc =
                                offsetof(struct nvme_common_command, dptr);
                        return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
                }
        case NVME_KEY_SGL_FMT_DATA_DESC:
                switch (sgl->type & 0xf) {
                case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
                        return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
                case NVME_SGL_FMT_ADDRESS:
                        return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
                default:
                        pr_err("invalid SGL subtype: %#x\n", sgl->type);
                        rsp->req.error_loc =
                                offsetof(struct nvme_common_command, dptr);
                        return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
                }
        default:
                pr_err("invalid SGL type: %#x\n", sgl->type);
                rsp->req.error_loc = offsetof(struct nvme_common_command, dptr);
                return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
        }
}

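/*
 * Executing a command consumes one SEND work request plus n_rdma RDMA
 * READ/WRITE work requests from sq_wr_avail. If the send queue cannot
 * cover that, the caller parks the command on rsp_wr_wait_list until
 * completions free up slots.
 */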
static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
{
        struct nvmet_rdma_queue *queue = rsp->queue;

        if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
                        &queue->sq_wr_avail) < 0)) {
                pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
                                1 + rsp->n_rdma, queue->idx,
                                queue->nvme_sq.ctrl->cntlid);
                atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
                return false;
        }

        if (nvmet_rdma_need_data_in(rsp)) {
                if (rdma_rw_ctx_post(&rsp->rw, queue->qp,
                                queue->cm_id->port_num, &rsp->read_cqe, NULL))
                        nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
        } else {
                rsp->req.execute(&rsp->req);
        }

        return true;
}

static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
                struct nvmet_rdma_rsp *cmd)
{
        u16 status;

        ib_dma_sync_single_for_cpu(queue->dev->device,
                cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
                DMA_FROM_DEVICE);
        ib_dma_sync_single_for_cpu(queue->dev->device,
                cmd->send_sge.addr, cmd->send_sge.length,
                DMA_TO_DEVICE);

        if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
                        &queue->nvme_sq, &nvmet_rdma_ops))
                return;

        status = nvmet_rdma_map_sgl(cmd);
        if (status)
                goto out_err;

        if (unlikely(!nvmet_rdma_execute_command(cmd))) {
                spin_lock(&queue->rsp_wr_wait_lock);
                list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
                spin_unlock(&queue->rsp_wr_wait_lock);
        }

        return;

out_err:
        nvmet_req_complete(&cmd->req, status);
}

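/*
 * RECV completion: entry point for a new command capsule from the host.
 * Commands that arrive while the queue is still connecting are parked on
 * rsp_wait_list and replayed once the queue goes live.
 */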
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
        struct nvmet_rdma_cmd *cmd =
                container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
        struct nvmet_rdma_queue *queue = cq->cq_context;
        struct nvmet_rdma_rsp *rsp;

        if (unlikely(wc->status != IB_WC_SUCCESS)) {
                if (wc->status != IB_WC_WR_FLUSH_ERR) {
                        pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
                                wc->wr_cqe, ib_wc_status_msg(wc->status),
                                wc->status);
                        nvmet_rdma_error_comp(queue);
                }
                return;
        }

        if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
                pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
                nvmet_rdma_error_comp(queue);
                return;
        }

        cmd->queue = queue;
        rsp = nvmet_rdma_get_rsp(queue);
        if (unlikely(!rsp)) {
                /*
                 * We get here only under memory pressure: silently drop the
                 * command and have the host retry, as we can't even fail it.
                 */
                nvmet_rdma_post_recv(queue->dev, cmd);
                return;
        }
        rsp->queue = queue;
        rsp->cmd = cmd;
        rsp->flags = 0;
        rsp->req.cmd = cmd->nvme_cmd;
        rsp->req.port = queue->port;
        rsp->n_rdma = 0;

        if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
                unsigned long flags;

                spin_lock_irqsave(&queue->state_lock, flags);
                if (queue->state == NVMET_RDMA_Q_CONNECTING)
                        list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
                else
                        nvmet_rdma_put_rsp(rsp);
                spin_unlock_irqrestore(&queue->state_lock, flags);
                return;
        }

        nvmet_rdma_handle_command(queue, rsp);
}

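/*
 * With the use_srq module parameter, all queues on a device share a single
 * SRQ (sized at 4095 receives here) instead of posting per-queue receive
 * buffers; if the device cannot create an SRQ we silently fall back to
 * per-queue receive queues.
 */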
static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
{
        if (!ndev->srq)
                return;

        nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
        ib_destroy_srq(ndev->srq);
}

static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
{
        struct ib_srq_init_attr srq_attr = { NULL, };
        struct ib_srq *srq;
        size_t srq_size;
        int ret, i;

        srq_size = 4095;        /* XXX: tune */

        srq_attr.attr.max_wr = srq_size;
        srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
        srq_attr.attr.srq_limit = 0;
        srq_attr.srq_type = IB_SRQT_BASIC;
        srq = ib_create_srq(ndev->pd, &srq_attr);
        if (IS_ERR(srq)) {
                /*
                 * If SRQs aren't supported we just go ahead and use normal
                 * non-shared receive queues.
                 */
                pr_info("SRQ requested but not supported.\n");
                return 0;
        }

        ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
        if (IS_ERR(ndev->srq_cmds)) {
                ret = PTR_ERR(ndev->srq_cmds);
                goto out_destroy_srq;
        }

        ndev->srq = srq;
        ndev->srq_size = srq_size;

        for (i = 0; i < srq_size; i++) {
                ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
                if (ret)
                        goto out_free_cmds;
        }

        return 0;

out_free_cmds:
        nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
out_destroy_srq:
        ib_destroy_srq(srq);
        return ret;
}

static void nvmet_rdma_free_dev(struct kref *ref)
{
        struct nvmet_rdma_device *ndev =
                container_of(ref, struct nvmet_rdma_device, ref);

        mutex_lock(&device_list_mutex);
        list_del(&ndev->entry);
        mutex_unlock(&device_list_mutex);

        nvmet_rdma_destroy_srq(ndev);
        ib_dealloc_pd(ndev->pd);

        kfree(ndev);
}

static struct nvmet_rdma_device *
nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
{
        struct nvmet_rdma_port *port = cm_id->context;
        struct nvmet_port *nport = port->nport;
        struct nvmet_rdma_device *ndev;
        int inline_page_count;
        int inline_sge_count;
        int ret;

        mutex_lock(&device_list_mutex);
        list_for_each_entry(ndev, &device_list, entry) {
                if (ndev->device->node_guid == cm_id->device->node_guid &&
                    kref_get_unless_zero(&ndev->ref))
                        goto out_unlock;
        }

        ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
        if (!ndev)
                goto out_err;

        inline_page_count = num_pages(nport->inline_data_size);
        inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
                                cm_id->device->attrs.max_recv_sge) - 1;
        if (inline_page_count > inline_sge_count) {
                pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
                        nport->inline_data_size, cm_id->device->name,
                        inline_sge_count * PAGE_SIZE);
                nport->inline_data_size = inline_sge_count * PAGE_SIZE;
                inline_page_count = inline_sge_count;
        }
        ndev->inline_data_size = nport->inline_data_size;
        ndev->inline_page_count = inline_page_count;
        ndev->device = cm_id->device;
        kref_init(&ndev->ref);

        ndev->pd = ib_alloc_pd(ndev->device, 0);
        if (IS_ERR(ndev->pd))
                goto out_free_dev;

        if (nvmet_rdma_use_srq) {
                ret = nvmet_rdma_init_srq(ndev);
                if (ret)
                        goto out_free_pd;
        }

        list_add(&ndev->entry, &device_list);
out_unlock:
        mutex_unlock(&device_list_mutex);
        pr_debug("added %s.\n", ndev->device->name);
        return ndev;

out_free_pd:
        ib_dealloc_pd(ndev->pd);
out_free_dev:
        kfree(ndev);
out_err:
        mutex_unlock(&device_list_mutex);
        return NULL;
}

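/*
 * Per-queue CQ/QP sizing: nr_cqe reserves one completion per RECV plus two
 * per send slot (RDMA READ/WRITE and the response SEND), and max_rdma_ctxs
 * is scaled by rdma_rw_mr_factor() so a full send queue of maximum-MDTS
 * transfers can be in flight.
 */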
static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
{
        struct ib_qp_init_attr qp_attr;
        struct nvmet_rdma_device *ndev = queue->dev;
        int comp_vector, nr_cqe, ret, i, factor;

        /*
         * Spread the io queues across completion vectors,
         * but still keep all admin queues on vector 0.
         */
        comp_vector = !queue->host_qid ? 0 :
                queue->idx % ndev->device->num_comp_vectors;

        /*
         * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
         */
        nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;

        queue->cq = ib_alloc_cq(ndev->device, queue,
                        nr_cqe + 1, comp_vector,
                        IB_POLL_WORKQUEUE);
        if (IS_ERR(queue->cq)) {
                ret = PTR_ERR(queue->cq);
                pr_err("failed to create CQ cqe= %d ret= %d\n",
                       nr_cqe + 1, ret);
                goto out;
        }

        memset(&qp_attr, 0, sizeof(qp_attr));
        qp_attr.qp_context = queue;
        qp_attr.event_handler = nvmet_rdma_qp_event;
        qp_attr.send_cq = queue->cq;
        qp_attr.recv_cq = queue->cq;
        qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
        qp_attr.qp_type = IB_QPT_RC;
        /* +1 for drain */
        qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
        factor = rdma_rw_mr_factor(ndev->device, queue->cm_id->port_num,
                                   1 << NVMET_RDMA_MAX_MDTS);
        qp_attr.cap.max_rdma_ctxs = queue->send_queue_size * factor;
        qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
                                        ndev->device->attrs.max_send_sge);

        if (ndev->srq) {
                qp_attr.srq = ndev->srq;
        } else {
                /* +1 for drain */
                qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
                qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
        }

        ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
        if (ret) {
                pr_err("failed to create_qp ret= %d\n", ret);
                goto err_destroy_cq;
        }
        queue->qp = queue->cm_id->qp;

        atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);

        pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
                 __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
                 qp_attr.cap.max_send_wr, queue->cm_id);

        if (!ndev->srq) {
                for (i = 0; i < queue->recv_queue_size; i++) {
                        queue->cmds[i].queue = queue;
                        ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
                        if (ret)
                                goto err_destroy_qp;
                }
        }

out:
        return ret;

err_destroy_qp:
        rdma_destroy_qp(queue->cm_id);
err_destroy_cq:
        ib_free_cq(queue->cq);
        goto out;
}

static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
{
        ib_drain_qp(queue->qp);
        if (queue->cm_id)
                rdma_destroy_id(queue->cm_id);
        ib_destroy_qp(queue->qp);
        ib_free_cq(queue->cq);
}

static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
{
        pr_debug("freeing queue %d\n", queue->idx);

        nvmet_sq_destroy(&queue->nvme_sq);

        nvmet_rdma_destroy_queue_ib(queue);
        if (!queue->dev->srq) {
                nvmet_rdma_free_cmds(queue->dev, queue->cmds,
                                queue->recv_queue_size,
                                !queue->host_qid);
        }
        nvmet_rdma_free_rsps(queue);
        ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
        kfree(queue);
}

static void nvmet_rdma_release_queue_work(struct work_struct *w)
{
        struct nvmet_rdma_queue *queue =
                container_of(w, struct nvmet_rdma_queue, release_work);
        struct nvmet_rdma_device *dev = queue->dev;

        nvmet_rdma_free_queue(queue);

        kref_put(&dev->ref, nvmet_rdma_free_dev);
}

static int
nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
                                struct nvmet_rdma_queue *queue)
{
        struct nvme_rdma_cm_req *req;

        req = (struct nvme_rdma_cm_req *)conn->private_data;
        if (!req || conn->private_data_len == 0)
                return NVME_RDMA_CM_INVALID_LEN;

        if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
                return NVME_RDMA_CM_INVALID_RECFMT;

        queue->host_qid = le16_to_cpu(req->qid);

        /*
         * req->hsqsize corresponds to our recv queue size plus 1
         * req->hrqsize corresponds to our send queue size
         */
        queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
        queue->send_queue_size = le16_to_cpu(req->hrqsize);

        if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
                return NVME_RDMA_CM_INVALID_HSQSIZE;

        /* XXX: Should we enforce some kind of max for IO queues? */

        return 0;
}

static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
                                enum nvme_rdma_cm_status status)
{
        struct nvme_rdma_cm_rej rej;

        pr_debug("rejecting connect request: status %d (%s)\n",
                 status, nvme_rdma_cm_msg(status));

        rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
        rej.sts = cpu_to_le16(status);

        return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
}

static struct nvmet_rdma_queue *
nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
                struct rdma_cm_id *cm_id,
                struct rdma_cm_event *event)
{
        struct nvmet_rdma_queue *queue;
        int ret;

        queue = kzalloc(sizeof(*queue), GFP_KERNEL);
        if (!queue) {
                ret = NVME_RDMA_CM_NO_RSC;
                goto out_reject;
        }

        ret = nvmet_sq_init(&queue->nvme_sq);
        if (ret) {
                ret = NVME_RDMA_CM_NO_RSC;
                goto out_free_queue;
        }

        ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
        if (ret)
                goto out_destroy_sq;

        /*
         * Schedules the actual release because calling rdma_destroy_id from
         * inside a CM callback would trigger a deadlock. (great API design..)
         */
        INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
        queue->dev = ndev;
        queue->cm_id = cm_id;

        spin_lock_init(&queue->state_lock);
        queue->state = NVMET_RDMA_Q_CONNECTING;
        INIT_LIST_HEAD(&queue->rsp_wait_list);
        INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
        spin_lock_init(&queue->rsp_wr_wait_lock);
        INIT_LIST_HEAD(&queue->free_rsps);
        spin_lock_init(&queue->rsps_lock);
        INIT_LIST_HEAD(&queue->queue_list);

        queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
        if (queue->idx < 0) {
                ret = NVME_RDMA_CM_NO_RSC;
                goto out_destroy_sq;
        }

        ret = nvmet_rdma_alloc_rsps(queue);
        if (ret) {
                ret = NVME_RDMA_CM_NO_RSC;
                goto out_ida_remove;
        }

        if (!ndev->srq) {
                queue->cmds = nvmet_rdma_alloc_cmds(ndev,
                                queue->recv_queue_size,
                                !queue->host_qid);
                if (IS_ERR(queue->cmds)) {
                        ret = NVME_RDMA_CM_NO_RSC;
                        goto out_free_responses;
                }
        }

        ret = nvmet_rdma_create_queue_ib(queue);
        if (ret) {
                pr_err("%s: creating RDMA queue failed (%d).\n",
                        __func__, ret);
                ret = NVME_RDMA_CM_NO_RSC;
                goto out_free_cmds;
        }

        return queue;

out_free_cmds:
        if (!ndev->srq) {
                nvmet_rdma_free_cmds(queue->dev, queue->cmds,
                                queue->recv_queue_size,
                                !queue->host_qid);
        }
out_free_responses:
        nvmet_rdma_free_rsps(queue);
out_ida_remove:
        ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
out_destroy_sq:
        nvmet_sq_destroy(&queue->nvme_sq);
out_free_queue:
        kfree(queue);
out_reject:
        nvmet_rdma_cm_reject(cm_id, ret);
        return NULL;
}

static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
{
        struct nvmet_rdma_queue *queue = priv;

        switch (event->event) {
        case IB_EVENT_COMM_EST:
                rdma_notify(queue->cm_id, event->event);
                break;
        default:
                pr_err("received IB QP event: %s (%d)\n",
                       ib_event_msg(event->event), event->event);
                break;
        }
}

static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
                struct nvmet_rdma_queue *queue,
                struct rdma_conn_param *p)
{
        struct rdma_conn_param param = { };
        struct nvme_rdma_cm_rep priv = { };
        int ret = -ENOMEM;

        param.rnr_retry_count = 7;
        param.flow_control = 1;
        param.initiator_depth = min_t(u8, p->initiator_depth,
                queue->dev->device->attrs.max_qp_init_rd_atom);
        param.private_data = &priv;
        param.private_data_len = sizeof(priv);
        priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
        priv.crqsize = cpu_to_le16(queue->recv_queue_size);

        ret = rdma_accept(cm_id, &param);
        if (ret)
                pr_err("rdma_accept failed (error code = %d)\n", ret);

        return ret;
}

static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
                struct rdma_cm_event *event)
{
        struct nvmet_rdma_port *port = cm_id->context;
        struct nvmet_rdma_device *ndev;
        struct nvmet_rdma_queue *queue;
        int ret = -EINVAL;

        ndev = nvmet_rdma_find_get_device(cm_id);
        if (!ndev) {
                nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
                return -ECONNREFUSED;
        }

        queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
        if (!queue) {
                ret = -ENOMEM;
                goto put_device;
        }
        queue->port = port->nport;

        if (queue->host_qid == 0) {
                /* Let inflight controller teardown complete */
                flush_scheduled_work();
        }

        ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
        if (ret) {
                /*
                 * Don't destroy the cm_id in free path, as we implicitly
                 * destroy the cm_id here with non-zero ret code.
                 */
                queue->cm_id = NULL;
                goto free_queue;
        }

        mutex_lock(&nvmet_rdma_queue_mutex);
        list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
        mutex_unlock(&nvmet_rdma_queue_mutex);

        return 0;

free_queue:
        nvmet_rdma_free_queue(queue);
put_device:
        kref_put(&ndev->ref, nvmet_rdma_free_dev);

        return ret;
}

static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
{
        unsigned long flags;

        spin_lock_irqsave(&queue->state_lock, flags);
        if (queue->state != NVMET_RDMA_Q_CONNECTING) {
                pr_warn("trying to establish a connected queue\n");
                goto out_unlock;
        }
        queue->state = NVMET_RDMA_Q_LIVE;

        while (!list_empty(&queue->rsp_wait_list)) {
                struct nvmet_rdma_rsp *cmd;

                cmd = list_first_entry(&queue->rsp_wait_list,
                                        struct nvmet_rdma_rsp, wait_list);
                list_del(&cmd->wait_list);

                spin_unlock_irqrestore(&queue->state_lock, flags);
                nvmet_rdma_handle_command(queue, cmd);
                spin_lock_irqsave(&queue->state_lock, flags);
        }

out_unlock:
        spin_unlock_irqrestore(&queue->state_lock, flags);
}

static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
        bool disconnect = false;
        unsigned long flags;

        pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);

        spin_lock_irqsave(&queue->state_lock, flags);
        switch (queue->state) {
        case NVMET_RDMA_Q_CONNECTING:
        case NVMET_RDMA_Q_LIVE:
                queue->state = NVMET_RDMA_Q_DISCONNECTING;
                disconnect = true;
                break;
        case NVMET_RDMA_Q_DISCONNECTING:
                break;
        }
        spin_unlock_irqrestore(&queue->state_lock, flags);

        if (disconnect) {
                rdma_disconnect(queue->cm_id);
                schedule_work(&queue->release_work);
        }
}

static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
        bool disconnect = false;

        mutex_lock(&nvmet_rdma_queue_mutex);
        if (!list_empty(&queue->queue_list)) {
                list_del_init(&queue->queue_list);
                disconnect = true;
        }
        mutex_unlock(&nvmet_rdma_queue_mutex);

        if (disconnect)
                __nvmet_rdma_queue_disconnect(queue);
}

static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
                struct nvmet_rdma_queue *queue)
{
        WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);

        mutex_lock(&nvmet_rdma_queue_mutex);
        if (!list_empty(&queue->queue_list))
                list_del_init(&queue->queue_list);
        mutex_unlock(&nvmet_rdma_queue_mutex);

        pr_err("failed to connect queue %d\n", queue->idx);
        schedule_work(&queue->release_work);
}

/**
 * nvme_rdma_device_removal() - Handle RDMA device removal
 * @cm_id:	rdma_cm id, used for nvmet port
 * @queue:	nvmet rdma queue (cm id qp_context)
 *
 * DEVICE_REMOVAL event notifies us that the RDMA device is about
 * to unplug. Note that this event can be generated on a normal
 * queue cm_id and/or a device bound listener cm_id (in which case
 * queue will be null).
 *
 * We registered an ib_client to handle device removal for queues,
 * so we only need to handle the listening port cm_ids. In this case
 * we nullify the priv to prevent double cm_id destruction and destroying
 * the cm_id implicitly by returning a non-zero rc to the callout.
 */
static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
                struct nvmet_rdma_queue *queue)
{
        struct nvmet_rdma_port *port;

        if (queue) {
                /*
                 * This is a queue cm_id. We have registered an ib_client to
                 * handle queue removal, so don't interfere and just return.
                 */
                return 0;
        }

        port = cm_id->context;

        /*
         * This is a listener cm_id. Make sure that future remove_port won't
         * invoke a double cm_id destroy. Use atomic xchg to make sure we
         * don't compete with remove_port.
         */
        if (xchg(&port->cm_id, NULL) != cm_id)
                return 0;

        /*
         * We need to return 1 so that the core will destroy its own ID.
         * What a great API design..
         */
        return 1;
}

static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
                struct rdma_cm_event *event)
{
        struct nvmet_rdma_queue *queue = NULL;
        int ret = 0;

        if (cm_id->qp)
                queue = cm_id->qp->qp_context;

        pr_debug("%s (%d): status %d id %p\n",
                rdma_event_msg(event->event), event->event,
                event->status, cm_id);

        switch (event->event) {
        case RDMA_CM_EVENT_CONNECT_REQUEST:
                ret = nvmet_rdma_queue_connect(cm_id, event);
                break;
        case RDMA_CM_EVENT_ESTABLISHED:
                nvmet_rdma_queue_established(queue);
                break;
        case RDMA_CM_EVENT_ADDR_CHANGE:
                if (!queue) {
                        struct nvmet_rdma_port *port = cm_id->context;

                        schedule_delayed_work(&port->repair_work, 0);
                        break;
                }
                /* FALLTHROUGH */
        case RDMA_CM_EVENT_DISCONNECTED:
        case RDMA_CM_EVENT_TIMEWAIT_EXIT:
                nvmet_rdma_queue_disconnect(queue);
                break;
        case RDMA_CM_EVENT_DEVICE_REMOVAL:
                ret = nvmet_rdma_device_removal(cm_id, queue);
                break;
        case RDMA_CM_EVENT_REJECTED:
                pr_debug("Connection rejected: %s\n",
                         rdma_reject_msg(cm_id, event->status));
                /* FALLTHROUGH */
        case RDMA_CM_EVENT_UNREACHABLE:
        case RDMA_CM_EVENT_CONNECT_ERROR:
                nvmet_rdma_queue_connect_fail(cm_id, queue);
                break;
        default:
                pr_err("received unrecognized RDMA CM event %d\n",
                        event->event);
                break;
        }

        return ret;
}

static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
{
        struct nvmet_rdma_queue *queue;

restart:
        mutex_lock(&nvmet_rdma_queue_mutex);
        list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
                if (queue->nvme_sq.ctrl == ctrl) {
                        list_del_init(&queue->queue_list);
                        mutex_unlock(&nvmet_rdma_queue_mutex);

                        __nvmet_rdma_queue_disconnect(queue);
                        goto restart;
                }
        }
        mutex_unlock(&nvmet_rdma_queue_mutex);
}

static void nvmet_rdma_disable_port(struct nvmet_rdma_port *port)
{
        struct rdma_cm_id *cm_id = xchg(&port->cm_id, NULL);

        if (cm_id)
                rdma_destroy_id(cm_id);
}

static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port)
{
        struct sockaddr *addr = (struct sockaddr *)&port->addr;
        struct rdma_cm_id *cm_id;
        int ret;

        cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
                        RDMA_PS_TCP, IB_QPT_RC);
        if (IS_ERR(cm_id)) {
                pr_err("CM ID creation failed\n");
                return PTR_ERR(cm_id);
        }

        /*
         * Allow both IPv4 and IPv6 sockets to bind a single port
         * at the same time.
         */
        ret = rdma_set_afonly(cm_id, 1);
        if (ret) {
                pr_err("rdma_set_afonly failed (%d)\n", ret);
                goto out_destroy_id;
        }

        ret = rdma_bind_addr(cm_id, addr);
        if (ret) {
                pr_err("binding CM ID to %pISpcs failed (%d)\n", addr, ret);
                goto out_destroy_id;
        }

        ret = rdma_listen(cm_id, 128);
        if (ret) {
                pr_err("listening to %pISpcs failed (%d)\n", addr, ret);
                goto out_destroy_id;
        }

        port->cm_id = cm_id;
        return 0;

out_destroy_id:
        rdma_destroy_id(cm_id);
        return ret;
}

static void nvmet_rdma_repair_port_work(struct work_struct *w)
{
        struct nvmet_rdma_port *port = container_of(to_delayed_work(w),
                        struct nvmet_rdma_port, repair_work);
        int ret;

        nvmet_rdma_disable_port(port);
        ret = nvmet_rdma_enable_port(port);
        if (ret)
                schedule_delayed_work(&port->repair_work, 5 * HZ);
}

static int nvmet_rdma_add_port(struct nvmet_port *nport)
{
        struct nvmet_rdma_port *port;
        __kernel_sa_family_t af;
        int ret;

        port = kzalloc(sizeof(*port), GFP_KERNEL);
        if (!port)
                return -ENOMEM;

        nport->priv = port;
        port->nport = nport;
        INIT_DELAYED_WORK(&port->repair_work, nvmet_rdma_repair_port_work);

        switch (nport->disc_addr.adrfam) {
        case NVMF_ADDR_FAMILY_IP4:
                af = AF_INET;
                break;
        case NVMF_ADDR_FAMILY_IP6:
                af = AF_INET6;
                break;
        default:
                pr_err("address family %d not supported\n",
                        nport->disc_addr.adrfam);
                ret = -EINVAL;
                goto out_free_port;
        }

        if (nport->inline_data_size < 0) {
                nport->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
        } else if (nport->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
                pr_warn("inline_data_size %u is too large, reducing to %u\n",
                        nport->inline_data_size,
                        NVMET_RDMA_MAX_INLINE_DATA_SIZE);
                nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
        }

        ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
                        nport->disc_addr.trsvcid, &port->addr);
        if (ret) {
                pr_err("malformed ip/port passed: %s:%s\n",
                        nport->disc_addr.traddr, nport->disc_addr.trsvcid);
                goto out_free_port;
        }

        ret = nvmet_rdma_enable_port(port);
        if (ret)
                goto out_free_port;

        pr_info("enabling port %d (%pISpcs)\n",
                le16_to_cpu(nport->disc_addr.portid),
                (struct sockaddr *)&port->addr);

        return 0;

out_free_port:
        kfree(port);
        return ret;
}

static void nvmet_rdma_remove_port(struct nvmet_port *nport)
{
        struct nvmet_rdma_port *port = nport->priv;

        cancel_delayed_work_sync(&port->repair_work);
        nvmet_rdma_disable_port(port);
        kfree(port);
}

static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
                struct nvmet_port *nport, char *traddr)
{
        struct nvmet_rdma_port *port = nport->priv;
        struct rdma_cm_id *cm_id = port->cm_id;

        if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
                struct nvmet_rdma_rsp *rsp =
                        container_of(req, struct nvmet_rdma_rsp, req);
                struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
                struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;

                sprintf(traddr, "%pISc", addr);
        } else {
                memcpy(traddr, nport->disc_addr.traddr, NVMF_TRADDR_SIZE);
        }
}

static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl)
{
        return NVMET_RDMA_MAX_MDTS;
}

static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
        .owner                  = THIS_MODULE,
        .type                   = NVMF_TRTYPE_RDMA,
        .msdbd                  = 1,
        .has_keyed_sgls         = 1,
        .add_port               = nvmet_rdma_add_port,
        .remove_port            = nvmet_rdma_remove_port,
        .queue_response         = nvmet_rdma_queue_response,
        .delete_ctrl            = nvmet_rdma_delete_ctrl,
        .disc_traddr            = nvmet_rdma_disc_port_addr,
        .get_mdts               = nvmet_rdma_get_mdts,
};

static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
        struct nvmet_rdma_queue *queue, *tmp;
        struct nvmet_rdma_device *ndev;
        bool found = false;

        mutex_lock(&device_list_mutex);
        list_for_each_entry(ndev, &device_list, entry) {
                if (ndev->device == ib_device) {
                        found = true;
                        break;
                }
        }
        mutex_unlock(&device_list_mutex);

        if (!found)
                return;

        /*
         * IB Device that is used by nvmet controllers is being removed,
         * delete all queues using this device.
         */
        mutex_lock(&nvmet_rdma_queue_mutex);
        list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
                                 queue_list) {
                if (queue->dev->device != ib_device)
                        continue;

                pr_info("Removing queue %d\n", queue->idx);
                list_del_init(&queue->queue_list);
                __nvmet_rdma_queue_disconnect(queue);
        }
        mutex_unlock(&nvmet_rdma_queue_mutex);

        flush_scheduled_work();
}

static struct ib_client nvmet_rdma_ib_client = {
        .name   = "nvmet_rdma",
        .remove = nvmet_rdma_remove_one
};

static int __init nvmet_rdma_init(void)
{
        int ret;

        ret = ib_register_client(&nvmet_rdma_ib_client);
        if (ret)
                return ret;

        ret = nvmet_register_transport(&nvmet_rdma_ops);
        if (ret)
                goto err_ib_client;

        return 0;

err_ib_client:
        ib_unregister_client(&nvmet_rdma_ib_client);
        return ret;
}

static void __exit nvmet_rdma_exit(void)
{
        nvmet_unregister_transport(&nvmet_rdma_ops);
        ib_unregister_client(&nvmet_rdma_ib_client);
        WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
        ida_destroy(&nvmet_rdma_queue_ida);
}

module_init(nvmet_rdma_init);
module_exit(nvmet_rdma_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */