kvm tools, setup: Create private directory
[linux-2.6/next.git] / net / sunrpc / xprtrdma / verbs.c
blob80f8da344df538c5b6ef054e3caddc617a9a49ec
1 /*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 * verbs.c
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
50 #include <linux/pci.h> /* for Tavor hack below */
51 #include <linux/slab.h>
53 #include "xprt_rdma.h"
56 * Globals/Macros
59 #ifdef RPC_DEBUG
60 # define RPCDBG_FACILITY RPCDBG_TRANS
61 #endif
64 * internal functions
68 * handle replies in tasklet context, using a single, global list
69 * rdma tasklet function -- just turn around and call the func
70 * for all replies on the list
73 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
74 static LIST_HEAD(rpcrdma_tasklets_g);
76 static void
77 rpcrdma_run_tasklet(unsigned long data)
79 struct rpcrdma_rep *rep;
80 void (*func)(struct rpcrdma_rep *);
81 unsigned long flags;
83 data = data;
84 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
85 while (!list_empty(&rpcrdma_tasklets_g)) {
86 rep = list_entry(rpcrdma_tasklets_g.next,
87 struct rpcrdma_rep, rr_list);
88 list_del(&rep->rr_list);
89 func = rep->rr_func;
90 rep->rr_func = NULL;
91 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
93 if (func)
94 func(rep);
95 else
96 rpcrdma_recv_buffer_put(rep);
98 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
100 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
103 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
105 static inline void
106 rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
108 unsigned long flags;
110 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
111 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
112 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
113 tasklet_schedule(&rpcrdma_tasklet_g);
116 static void
117 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
119 struct rpcrdma_ep *ep = context;
121 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
122 __func__, event->event, event->device->name, context);
123 if (ep->rep_connected == 1) {
124 ep->rep_connected = -EIO;
125 ep->rep_func(ep);
126 wake_up_all(&ep->rep_connect_wait);
130 static void
131 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
133 struct rpcrdma_ep *ep = context;
135 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
136 __func__, event->event, event->device->name, context);
137 if (ep->rep_connected == 1) {
138 ep->rep_connected = -EIO;
139 ep->rep_func(ep);
140 wake_up_all(&ep->rep_connect_wait);
144 static inline
145 void rpcrdma_event_process(struct ib_wc *wc)
147 struct rpcrdma_mw *frmr;
148 struct rpcrdma_rep *rep =
149 (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
151 dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
152 __func__, rep, wc->status, wc->opcode, wc->byte_len);
154 if (!rep) /* send or bind completion that we don't care about */
155 return;
157 if (IB_WC_SUCCESS != wc->status) {
158 dprintk("RPC: %s: WC opcode %d status %X, connection lost\n",
159 __func__, wc->opcode, wc->status);
160 rep->rr_len = ~0U;
161 if (wc->opcode != IB_WC_FAST_REG_MR && wc->opcode != IB_WC_LOCAL_INV)
162 rpcrdma_schedule_tasklet(rep);
163 return;
166 switch (wc->opcode) {
167 case IB_WC_FAST_REG_MR:
168 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
169 frmr->r.frmr.state = FRMR_IS_VALID;
170 break;
171 case IB_WC_LOCAL_INV:
172 frmr = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
173 frmr->r.frmr.state = FRMR_IS_INVALID;
174 break;
175 case IB_WC_RECV:
176 rep->rr_len = wc->byte_len;
177 ib_dma_sync_single_for_cpu(
178 rdmab_to_ia(rep->rr_buffer)->ri_id->device,
179 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
180 /* Keep (only) the most recent credits, after check validity */
181 if (rep->rr_len >= 16) {
182 struct rpcrdma_msg *p =
183 (struct rpcrdma_msg *) rep->rr_base;
184 unsigned int credits = ntohl(p->rm_credit);
185 if (credits == 0) {
186 dprintk("RPC: %s: server"
187 " dropped credits to 0!\n", __func__);
188 /* don't deadlock */
189 credits = 1;
190 } else if (credits > rep->rr_buffer->rb_max_requests) {
191 dprintk("RPC: %s: server"
192 " over-crediting: %d (%d)\n",
193 __func__, credits,
194 rep->rr_buffer->rb_max_requests);
195 credits = rep->rr_buffer->rb_max_requests;
197 atomic_set(&rep->rr_buffer->rb_credits, credits);
199 /* fall through */
200 case IB_WC_BIND_MW:
201 rpcrdma_schedule_tasklet(rep);
202 break;
203 default:
204 dprintk("RPC: %s: unexpected WC event %X\n",
205 __func__, wc->opcode);
206 break;
210 static inline int
211 rpcrdma_cq_poll(struct ib_cq *cq)
213 struct ib_wc wc;
214 int rc;
216 for (;;) {
217 rc = ib_poll_cq(cq, 1, &wc);
218 if (rc < 0) {
219 dprintk("RPC: %s: ib_poll_cq failed %i\n",
220 __func__, rc);
221 return rc;
223 if (rc == 0)
224 break;
226 rpcrdma_event_process(&wc);
229 return 0;
233 * rpcrdma_cq_event_upcall
235 * This upcall handles recv, send, bind and unbind events.
236 * It is reentrant but processes single events in order to maintain
237 * ordering of receives to keep server credits.
239 * It is the responsibility of the scheduled tasklet to return
240 * recv buffers to the pool. NOTE: this affects synchronization of
241 * connection shutdown. That is, the structures required for
242 * the completion of the reply handler must remain intact until
243 * all memory has been reclaimed.
245 * Note that send events are suppressed and do not result in an upcall.
247 static void
248 rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
250 int rc;
252 rc = rpcrdma_cq_poll(cq);
253 if (rc)
254 return;
256 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
257 if (rc) {
258 dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
259 __func__, rc);
260 return;
263 rpcrdma_cq_poll(cq);
266 #ifdef RPC_DEBUG
267 static const char * const conn[] = {
268 "address resolved",
269 "address error",
270 "route resolved",
271 "route error",
272 "connect request",
273 "connect response",
274 "connect error",
275 "unreachable",
276 "rejected",
277 "established",
278 "disconnected",
279 "device removal"
281 #endif
/*
 * RDMA connection manager event handler.
 *
 * @id:    CM id whose context is the owning struct rpcrdma_xprt
 * @event: CM event being reported
 *
 * Address/route resolution events record their outcome in
 * ia->ri_async_rc and complete ia->ri_done.  Connection state events
 * store the new state in ep->rep_connected (1 when established, a
 * negative errno otherwise), reset the credit count to 1, invoke
 * ep->rep_func and wake anyone sleeping on ep->rep_connect_wait.
 *
 * Always returns 0.
 */
static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#ifdef RPC_DEBUG
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
#endif
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;
	int rc;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		/*
		 * Zero the attributes first so that a failed query cannot
		 * leave stack garbage to be reported below.
		 */
		memset(&attr, 0, sizeof(attr));
		rc = ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		if (rc)
			dprintk("RPC: %s: ib_query_qp failed %i\n",
				__func__, rc);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		/* Index the name table only when the event is within it. */
		dprintk("RPC: %s: %s: %pI4:%u (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event < ARRAY_SIZE(conn)) ? conn[event->event] :
				"unknown connection error",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ep, event->event);
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		dprintk("RPC: %s: unexpected CM event %d\n",
			__func__, event->event);
		break;
	}

#ifdef RPC_DEBUG
	if (connstate == 1) {
		int ird = attr.max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u "
			"on %s, memreg %d slots %d ird %d%s\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			ia->ri_id->device->name,
			ia->ri_memreg_strategy,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		printk(KERN_INFO "rpcrdma: connection to %pI4:%u closed (%d)\n",
			&addr->sin_addr.s_addr,
			ntohs(addr->sin_port),
			connstate);
	}
#endif

	return 0;
}
381 static struct rdma_cm_id *
382 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
383 struct rpcrdma_ia *ia, struct sockaddr *addr)
385 struct rdma_cm_id *id;
386 int rc;
388 init_completion(&ia->ri_done);
390 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
391 if (IS_ERR(id)) {
392 rc = PTR_ERR(id);
393 dprintk("RPC: %s: rdma_create_id() failed %i\n",
394 __func__, rc);
395 return id;
398 ia->ri_async_rc = -ETIMEDOUT;
399 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
400 if (rc) {
401 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
402 __func__, rc);
403 goto out;
405 wait_for_completion_interruptible_timeout(&ia->ri_done,
406 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
407 rc = ia->ri_async_rc;
408 if (rc)
409 goto out;
411 ia->ri_async_rc = -ETIMEDOUT;
412 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
413 if (rc) {
414 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
415 __func__, rc);
416 goto out;
418 wait_for_completion_interruptible_timeout(&ia->ri_done,
419 msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
420 rc = ia->ri_async_rc;
421 if (rc)
422 goto out;
424 return id;
426 out:
427 rdma_destroy_id(id);
428 return ERR_PTR(rc);
432 * Drain any cq, prior to teardown.
434 static void
435 rpcrdma_clean_cq(struct ib_cq *cq)
437 struct ib_wc wc;
438 int count = 0;
440 while (1 == ib_poll_cq(cq, 1, &wc))
441 ++count;
443 if (count)
444 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
445 __func__, count, wc.opcode);
449 * Exported functions.
453 * Open and initialize an Interface Adapter.
454 * o initializes fields of struct rpcrdma_ia, including
455 * interface and provider attributes and protection zone.
458 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
460 int rc, mem_priv;
461 struct ib_device_attr devattr;
462 struct rpcrdma_ia *ia = &xprt->rx_ia;
464 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
465 if (IS_ERR(ia->ri_id)) {
466 rc = PTR_ERR(ia->ri_id);
467 goto out1;
470 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
471 if (IS_ERR(ia->ri_pd)) {
472 rc = PTR_ERR(ia->ri_pd);
473 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
474 __func__, rc);
475 goto out2;
479 * Query the device to determine if the requested memory
480 * registration strategy is supported. If it isn't, set the
481 * strategy to a globally supported model.
483 rc = ib_query_device(ia->ri_id->device, &devattr);
484 if (rc) {
485 dprintk("RPC: %s: ib_query_device failed %d\n",
486 __func__, rc);
487 goto out2;
490 if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
491 ia->ri_have_dma_lkey = 1;
492 ia->ri_dma_lkey = ia->ri_id->device->local_dma_lkey;
495 switch (memreg) {
496 case RPCRDMA_MEMWINDOWS:
497 case RPCRDMA_MEMWINDOWS_ASYNC:
498 if (!(devattr.device_cap_flags & IB_DEVICE_MEM_WINDOW)) {
499 dprintk("RPC: %s: MEMWINDOWS registration "
500 "specified but not supported by adapter, "
501 "using slower RPCRDMA_REGISTER\n",
502 __func__);
503 memreg = RPCRDMA_REGISTER;
505 break;
506 case RPCRDMA_MTHCAFMR:
507 if (!ia->ri_id->device->alloc_fmr) {
508 #if RPCRDMA_PERSISTENT_REGISTRATION
509 dprintk("RPC: %s: MTHCAFMR registration "
510 "specified but not supported by adapter, "
511 "using riskier RPCRDMA_ALLPHYSICAL\n",
512 __func__);
513 memreg = RPCRDMA_ALLPHYSICAL;
514 #else
515 dprintk("RPC: %s: MTHCAFMR registration "
516 "specified but not supported by adapter, "
517 "using slower RPCRDMA_REGISTER\n",
518 __func__);
519 memreg = RPCRDMA_REGISTER;
520 #endif
522 break;
523 case RPCRDMA_FRMR:
524 /* Requires both frmr reg and local dma lkey */
525 if ((devattr.device_cap_flags &
526 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
527 (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) {
528 #if RPCRDMA_PERSISTENT_REGISTRATION
529 dprintk("RPC: %s: FRMR registration "
530 "specified but not supported by adapter, "
531 "using riskier RPCRDMA_ALLPHYSICAL\n",
532 __func__);
533 memreg = RPCRDMA_ALLPHYSICAL;
534 #else
535 dprintk("RPC: %s: FRMR registration "
536 "specified but not supported by adapter, "
537 "using slower RPCRDMA_REGISTER\n",
538 __func__);
539 memreg = RPCRDMA_REGISTER;
540 #endif
542 break;
546 * Optionally obtain an underlying physical identity mapping in
547 * order to do a memory window-based bind. This base registration
548 * is protected from remote access - that is enabled only by binding
549 * for the specific bytes targeted during each RPC operation, and
550 * revoked after the corresponding completion similar to a storage
551 * adapter.
553 switch (memreg) {
554 case RPCRDMA_BOUNCEBUFFERS:
555 case RPCRDMA_REGISTER:
556 case RPCRDMA_FRMR:
557 break;
558 #if RPCRDMA_PERSISTENT_REGISTRATION
559 case RPCRDMA_ALLPHYSICAL:
560 mem_priv = IB_ACCESS_LOCAL_WRITE |
561 IB_ACCESS_REMOTE_WRITE |
562 IB_ACCESS_REMOTE_READ;
563 goto register_setup;
564 #endif
565 case RPCRDMA_MEMWINDOWS_ASYNC:
566 case RPCRDMA_MEMWINDOWS:
567 mem_priv = IB_ACCESS_LOCAL_WRITE |
568 IB_ACCESS_MW_BIND;
569 goto register_setup;
570 case RPCRDMA_MTHCAFMR:
571 if (ia->ri_have_dma_lkey)
572 break;
573 mem_priv = IB_ACCESS_LOCAL_WRITE;
574 register_setup:
575 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
576 if (IS_ERR(ia->ri_bind_mem)) {
577 printk(KERN_ALERT "%s: ib_get_dma_mr for "
578 "phys register failed with %lX\n\t"
579 "Will continue with degraded performance\n",
580 __func__, PTR_ERR(ia->ri_bind_mem));
581 memreg = RPCRDMA_REGISTER;
582 ia->ri_bind_mem = NULL;
584 break;
585 default:
586 printk(KERN_ERR "%s: invalid memory registration mode %d\n",
587 __func__, memreg);
588 rc = -EINVAL;
589 goto out2;
591 dprintk("RPC: %s: memory registration strategy is %d\n",
592 __func__, memreg);
594 /* Else will do memory reg/dereg for each chunk */
595 ia->ri_memreg_strategy = memreg;
597 return 0;
598 out2:
599 rdma_destroy_id(ia->ri_id);
600 ia->ri_id = NULL;
601 out1:
602 return rc;
606 * Clean up/close an IA.
607 * o if event handles and PD have been initialized, free them.
608 * o close the IA
610 void
611 rpcrdma_ia_close(struct rpcrdma_ia *ia)
613 int rc;
615 dprintk("RPC: %s: entering\n", __func__);
616 if (ia->ri_bind_mem != NULL) {
617 rc = ib_dereg_mr(ia->ri_bind_mem);
618 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
619 __func__, rc);
621 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
622 if (ia->ri_id->qp)
623 rdma_destroy_qp(ia->ri_id);
624 rdma_destroy_id(ia->ri_id);
625 ia->ri_id = NULL;
627 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
628 rc = ib_dealloc_pd(ia->ri_pd);
629 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
630 __func__, rc);
635 * Create unconnected endpoint.
638 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
639 struct rpcrdma_create_data_internal *cdata)
641 struct ib_device_attr devattr;
642 int rc, err;
644 rc = ib_query_device(ia->ri_id->device, &devattr);
645 if (rc) {
646 dprintk("RPC: %s: ib_query_device failed %d\n",
647 __func__, rc);
648 return rc;
651 /* check provider's send/recv wr limits */
652 if (cdata->max_requests > devattr.max_qp_wr)
653 cdata->max_requests = devattr.max_qp_wr;
655 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
656 ep->rep_attr.qp_context = ep;
657 /* send_cq and recv_cq initialized below */
658 ep->rep_attr.srq = NULL;
659 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
660 switch (ia->ri_memreg_strategy) {
661 case RPCRDMA_FRMR:
662 /* Add room for frmr register and invalidate WRs.
663 * 1. FRMR reg WR for head
664 * 2. FRMR invalidate WR for head
665 * 3. FRMR reg WR for pagelist
666 * 4. FRMR invalidate WR for pagelist
667 * 5. FRMR reg WR for tail
668 * 6. FRMR invalidate WR for tail
669 * 7. The RDMA_SEND WR
671 ep->rep_attr.cap.max_send_wr *= 7;
672 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr) {
673 cdata->max_requests = devattr.max_qp_wr / 7;
674 if (!cdata->max_requests)
675 return -EINVAL;
676 ep->rep_attr.cap.max_send_wr = cdata->max_requests * 7;
678 break;
679 case RPCRDMA_MEMWINDOWS_ASYNC:
680 case RPCRDMA_MEMWINDOWS:
681 /* Add room for mw_binds+unbinds - overkill! */
682 ep->rep_attr.cap.max_send_wr++;
683 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
684 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
685 return -EINVAL;
686 break;
687 default:
688 break;
690 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
691 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
692 ep->rep_attr.cap.max_recv_sge = 1;
693 ep->rep_attr.cap.max_inline_data = 0;
694 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
695 ep->rep_attr.qp_type = IB_QPT_RC;
696 ep->rep_attr.port_num = ~0;
698 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
699 "iovs: send %d recv %d\n",
700 __func__,
701 ep->rep_attr.cap.max_send_wr,
702 ep->rep_attr.cap.max_recv_wr,
703 ep->rep_attr.cap.max_send_sge,
704 ep->rep_attr.cap.max_recv_sge);
706 /* set trigger for requesting send completion */
707 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
708 switch (ia->ri_memreg_strategy) {
709 case RPCRDMA_MEMWINDOWS_ASYNC:
710 case RPCRDMA_MEMWINDOWS:
711 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
712 break;
713 default:
714 break;
716 if (ep->rep_cqinit <= 2)
717 ep->rep_cqinit = 0;
718 INIT_CQCOUNT(ep);
719 ep->rep_ia = ia;
720 init_waitqueue_head(&ep->rep_connect_wait);
723 * Create a single cq for receive dto and mw_bind (only ever
724 * care about unbind, really). Send completions are suppressed.
725 * Use single threaded tasklet upcalls to maintain ordering.
727 ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
728 rpcrdma_cq_async_error_upcall, NULL,
729 ep->rep_attr.cap.max_recv_wr +
730 ep->rep_attr.cap.max_send_wr + 1, 0);
731 if (IS_ERR(ep->rep_cq)) {
732 rc = PTR_ERR(ep->rep_cq);
733 dprintk("RPC: %s: ib_create_cq failed: %i\n",
734 __func__, rc);
735 goto out1;
738 rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
739 if (rc) {
740 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
741 __func__, rc);
742 goto out2;
745 ep->rep_attr.send_cq = ep->rep_cq;
746 ep->rep_attr.recv_cq = ep->rep_cq;
748 /* Initialize cma parameters */
750 /* RPC/RDMA does not use private data */
751 ep->rep_remote_cma.private_data = NULL;
752 ep->rep_remote_cma.private_data_len = 0;
754 /* Client offers RDMA Read but does not initiate */
755 ep->rep_remote_cma.initiator_depth = 0;
756 if (ia->ri_memreg_strategy == RPCRDMA_BOUNCEBUFFERS)
757 ep->rep_remote_cma.responder_resources = 0;
758 else if (devattr.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
759 ep->rep_remote_cma.responder_resources = 32;
760 else
761 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
763 ep->rep_remote_cma.retry_count = 7;
764 ep->rep_remote_cma.flow_control = 0;
765 ep->rep_remote_cma.rnr_retry_count = 0;
767 return 0;
769 out2:
770 err = ib_destroy_cq(ep->rep_cq);
771 if (err)
772 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
773 __func__, err);
774 out1:
775 return rc;
779 * rpcrdma_ep_destroy
781 * Disconnect and destroy endpoint. After this, the only
782 * valid operations on the ep are to free it (if dynamically
783 * allocated) or re-create it.
785 * The caller's error handling must be sure to not leak the endpoint
786 * if this function fails.
789 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
791 int rc;
793 dprintk("RPC: %s: entering, connected is %d\n",
794 __func__, ep->rep_connected);
796 if (ia->ri_id->qp) {
797 rc = rpcrdma_ep_disconnect(ep, ia);
798 if (rc)
799 dprintk("RPC: %s: rpcrdma_ep_disconnect"
800 " returned %i\n", __func__, rc);
801 rdma_destroy_qp(ia->ri_id);
802 ia->ri_id->qp = NULL;
805 /* padding - could be done in rpcrdma_buffer_destroy... */
806 if (ep->rep_pad_mr) {
807 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
808 ep->rep_pad_mr = NULL;
811 rpcrdma_clean_cq(ep->rep_cq);
812 rc = ib_destroy_cq(ep->rep_cq);
813 if (rc)
814 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
815 __func__, rc);
817 return rc;
821 * Connect unconnected endpoint.
824 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
826 struct rdma_cm_id *id;
827 int rc = 0;
828 int retry_count = 0;
830 if (ep->rep_connected != 0) {
831 struct rpcrdma_xprt *xprt;
832 retry:
833 rc = rpcrdma_ep_disconnect(ep, ia);
834 if (rc && rc != -ENOTCONN)
835 dprintk("RPC: %s: rpcrdma_ep_disconnect"
836 " status %i\n", __func__, rc);
837 rpcrdma_clean_cq(ep->rep_cq);
839 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
840 id = rpcrdma_create_id(xprt, ia,
841 (struct sockaddr *)&xprt->rx_data.addr);
842 if (IS_ERR(id)) {
843 rc = PTR_ERR(id);
844 goto out;
846 /* TEMP TEMP TEMP - fail if new device:
847 * Deregister/remarshal *all* requests!
848 * Close and recreate adapter, pd, etc!
849 * Re-determine all attributes still sane!
850 * More stuff I haven't thought of!
851 * Rrrgh!
853 if (ia->ri_id->device != id->device) {
854 printk("RPC: %s: can't reconnect on "
855 "different device!\n", __func__);
856 rdma_destroy_id(id);
857 rc = -ENETDOWN;
858 goto out;
860 /* END TEMP */
861 rdma_destroy_qp(ia->ri_id);
862 rdma_destroy_id(ia->ri_id);
863 ia->ri_id = id;
866 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
867 if (rc) {
868 dprintk("RPC: %s: rdma_create_qp failed %i\n",
869 __func__, rc);
870 goto out;
873 /* XXX Tavor device performs badly with 2K MTU! */
874 if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
875 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
876 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
877 (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
878 pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
879 struct ib_qp_attr attr = {
880 .path_mtu = IB_MTU_1024
882 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
886 ep->rep_connected = 0;
888 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
889 if (rc) {
890 dprintk("RPC: %s: rdma_connect() failed with %i\n",
891 __func__, rc);
892 goto out;
895 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
898 * Check state. A non-peer reject indicates no listener
899 * (ECONNREFUSED), which may be a transient state. All
900 * others indicate a transport condition which has already
901 * undergone a best-effort.
903 if (ep->rep_connected == -ECONNREFUSED &&
904 ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
905 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
906 goto retry;
908 if (ep->rep_connected <= 0) {
909 /* Sometimes, the only way to reliably connect to remote
910 * CMs is to use same nonzero values for ORD and IRD. */
911 if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
912 (ep->rep_remote_cma.responder_resources == 0 ||
913 ep->rep_remote_cma.initiator_depth !=
914 ep->rep_remote_cma.responder_resources)) {
915 if (ep->rep_remote_cma.responder_resources == 0)
916 ep->rep_remote_cma.responder_resources = 1;
917 ep->rep_remote_cma.initiator_depth =
918 ep->rep_remote_cma.responder_resources;
919 goto retry;
921 rc = ep->rep_connected;
922 } else {
923 dprintk("RPC: %s: connected\n", __func__);
926 out:
927 if (rc)
928 ep->rep_connected = rc;
929 return rc;
933 * rpcrdma_ep_disconnect
935 * This is separate from destroy to facilitate the ability
936 * to reconnect without recreating the endpoint.
938 * This call is not reentrant, and must not be made in parallel
939 * on the same endpoint.
942 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
944 int rc;
946 rpcrdma_clean_cq(ep->rep_cq);
947 rc = rdma_disconnect(ia->ri_id);
948 if (!rc) {
949 /* returns without wait if not connected */
950 wait_event_interruptible(ep->rep_connect_wait,
951 ep->rep_connected != 1);
952 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
953 (ep->rep_connected == 1) ? "still " : "dis");
954 } else {
955 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
956 ep->rep_connected = rc;
958 return rc;
962 * Initialize buffer memory
965 rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
966 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
968 char *p;
969 size_t len;
970 int i, rc;
971 struct rpcrdma_mw *r;
973 buf->rb_max_requests = cdata->max_requests;
974 spin_lock_init(&buf->rb_lock);
975 atomic_set(&buf->rb_credits, 1);
977 /* Need to allocate:
978 * 1. arrays for send and recv pointers
979 * 2. arrays of struct rpcrdma_req to fill in pointers
980 * 3. array of struct rpcrdma_rep for replies
981 * 4. padding, if any
982 * 5. mw's, fmr's or frmr's, if any
983 * Send/recv buffers in req/rep need to be registered
986 len = buf->rb_max_requests *
987 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
988 len += cdata->padding;
989 switch (ia->ri_memreg_strategy) {
990 case RPCRDMA_FRMR:
991 len += buf->rb_max_requests * RPCRDMA_MAX_SEGS *
992 sizeof(struct rpcrdma_mw);
993 break;
994 case RPCRDMA_MTHCAFMR:
995 /* TBD we are perhaps overallocating here */
996 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
997 sizeof(struct rpcrdma_mw);
998 break;
999 case RPCRDMA_MEMWINDOWS_ASYNC:
1000 case RPCRDMA_MEMWINDOWS:
1001 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
1002 sizeof(struct rpcrdma_mw);
1003 break;
1004 default:
1005 break;
1008 /* allocate 1, 4 and 5 in one shot */
1009 p = kzalloc(len, GFP_KERNEL);
1010 if (p == NULL) {
1011 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
1012 __func__, len);
1013 rc = -ENOMEM;
1014 goto out;
1016 buf->rb_pool = p; /* for freeing it later */
1018 buf->rb_send_bufs = (struct rpcrdma_req **) p;
1019 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
1020 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
1021 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
1024 * Register the zeroed pad buffer, if any.
1026 if (cdata->padding) {
1027 rc = rpcrdma_register_internal(ia, p, cdata->padding,
1028 &ep->rep_pad_mr, &ep->rep_pad);
1029 if (rc)
1030 goto out;
1032 p += cdata->padding;
1035 * Allocate the fmr's, or mw's for mw_bind chunk registration.
1036 * We "cycle" the mw's in order to minimize rkey reuse,
1037 * and also reduce unbind-to-bind collision.
1039 INIT_LIST_HEAD(&buf->rb_mws);
1040 r = (struct rpcrdma_mw *)p;
1041 switch (ia->ri_memreg_strategy) {
1042 case RPCRDMA_FRMR:
1043 for (i = buf->rb_max_requests * RPCRDMA_MAX_SEGS; i; i--) {
1044 r->r.frmr.fr_mr = ib_alloc_fast_reg_mr(ia->ri_pd,
1045 RPCRDMA_MAX_SEGS);
1046 if (IS_ERR(r->r.frmr.fr_mr)) {
1047 rc = PTR_ERR(r->r.frmr.fr_mr);
1048 dprintk("RPC: %s: ib_alloc_fast_reg_mr"
1049 " failed %i\n", __func__, rc);
1050 goto out;
1052 r->r.frmr.fr_pgl =
1053 ib_alloc_fast_reg_page_list(ia->ri_id->device,
1054 RPCRDMA_MAX_SEGS);
1055 if (IS_ERR(r->r.frmr.fr_pgl)) {
1056 rc = PTR_ERR(r->r.frmr.fr_pgl);
1057 dprintk("RPC: %s: "
1058 "ib_alloc_fast_reg_page_list "
1059 "failed %i\n", __func__, rc);
1060 goto out;
1062 list_add(&r->mw_list, &buf->rb_mws);
1063 ++r;
1065 break;
1066 case RPCRDMA_MTHCAFMR:
1067 /* TBD we are perhaps overallocating here */
1068 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1069 static struct ib_fmr_attr fa =
1070 { RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT };
1071 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
1072 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
1073 &fa);
1074 if (IS_ERR(r->r.fmr)) {
1075 rc = PTR_ERR(r->r.fmr);
1076 dprintk("RPC: %s: ib_alloc_fmr"
1077 " failed %i\n", __func__, rc);
1078 goto out;
1080 list_add(&r->mw_list, &buf->rb_mws);
1081 ++r;
1083 break;
1084 case RPCRDMA_MEMWINDOWS_ASYNC:
1085 case RPCRDMA_MEMWINDOWS:
1086 /* Allocate one extra request's worth, for full cycling */
1087 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
1088 r->r.mw = ib_alloc_mw(ia->ri_pd);
1089 if (IS_ERR(r->r.mw)) {
1090 rc = PTR_ERR(r->r.mw);
1091 dprintk("RPC: %s: ib_alloc_mw"
1092 " failed %i\n", __func__, rc);
1093 goto out;
1095 list_add(&r->mw_list, &buf->rb_mws);
1096 ++r;
1098 break;
1099 default:
1100 break;
1104 * Allocate/init the request/reply buffers. Doing this
1105 * using kmalloc for now -- one for each buf.
1107 for (i = 0; i < buf->rb_max_requests; i++) {
1108 struct rpcrdma_req *req;
1109 struct rpcrdma_rep *rep;
1111 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
1112 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
1113 /* Typical ~2400b, so rounding up saves work later */
1114 if (len < 4096)
1115 len = 4096;
1116 req = kmalloc(len, GFP_KERNEL);
1117 if (req == NULL) {
1118 dprintk("RPC: %s: request buffer %d alloc"
1119 " failed\n", __func__, i);
1120 rc = -ENOMEM;
1121 goto out;
1123 memset(req, 0, sizeof(struct rpcrdma_req));
1124 buf->rb_send_bufs[i] = req;
1125 buf->rb_send_bufs[i]->rl_buffer = buf;
1127 rc = rpcrdma_register_internal(ia, req->rl_base,
1128 len - offsetof(struct rpcrdma_req, rl_base),
1129 &buf->rb_send_bufs[i]->rl_handle,
1130 &buf->rb_send_bufs[i]->rl_iov);
1131 if (rc)
1132 goto out;
1134 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1136 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1137 rep = kmalloc(len, GFP_KERNEL);
1138 if (rep == NULL) {
1139 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1140 __func__, i);
1141 rc = -ENOMEM;
1142 goto out;
1144 memset(rep, 0, sizeof(struct rpcrdma_rep));
1145 buf->rb_recv_bufs[i] = rep;
1146 buf->rb_recv_bufs[i]->rr_buffer = buf;
1147 init_waitqueue_head(&rep->rr_unbind);
1149 rc = rpcrdma_register_internal(ia, rep->rr_base,
1150 len - offsetof(struct rpcrdma_rep, rr_base),
1151 &buf->rb_recv_bufs[i]->rr_handle,
1152 &buf->rb_recv_bufs[i]->rr_iov);
1153 if (rc)
1154 goto out;
1157 dprintk("RPC: %s: max_requests %d\n",
1158 __func__, buf->rb_max_requests);
1159 /* done */
1160 return 0;
1161 out:
1162 rpcrdma_buffer_destroy(buf);
1163 return rc;
1167 * Unregister and destroy buffer memory. Need to deal with
1168 * partial initialization, so it's callable from failed create.
1169 * Must be called before destroying endpoint, as registrations
1170 * reference it.
1172 void
1173 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1175 int rc, i;
1176 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1177 struct rpcrdma_mw *r;
1179 /* clean up in reverse order from create
1180 * 1. recv mr memory (mr free, then kfree)
1181 * 1a. bind mw memory
1182 * 2. send mr memory (mr free, then kfree)
1183 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1184 * 4. arrays
1186 dprintk("RPC: %s: entering\n", __func__);
1188 for (i = 0; i < buf->rb_max_requests; i++) {
1189 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1190 rpcrdma_deregister_internal(ia,
1191 buf->rb_recv_bufs[i]->rr_handle,
1192 &buf->rb_recv_bufs[i]->rr_iov);
1193 kfree(buf->rb_recv_bufs[i]);
1195 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1196 while (!list_empty(&buf->rb_mws)) {
1197 r = list_entry(buf->rb_mws.next,
1198 struct rpcrdma_mw, mw_list);
1199 list_del(&r->mw_list);
1200 switch (ia->ri_memreg_strategy) {
1201 case RPCRDMA_FRMR:
1202 rc = ib_dereg_mr(r->r.frmr.fr_mr);
1203 if (rc)
1204 dprintk("RPC: %s:"
1205 " ib_dereg_mr"
1206 " failed %i\n",
1207 __func__, rc);
1208 ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
1209 break;
1210 case RPCRDMA_MTHCAFMR:
1211 rc = ib_dealloc_fmr(r->r.fmr);
1212 if (rc)
1213 dprintk("RPC: %s:"
1214 " ib_dealloc_fmr"
1215 " failed %i\n",
1216 __func__, rc);
1217 break;
1218 case RPCRDMA_MEMWINDOWS_ASYNC:
1219 case RPCRDMA_MEMWINDOWS:
1220 rc = ib_dealloc_mw(r->r.mw);
1221 if (rc)
1222 dprintk("RPC: %s:"
1223 " ib_dealloc_mw"
1224 " failed %i\n",
1225 __func__, rc);
1226 break;
1227 default:
1228 break;
1231 rpcrdma_deregister_internal(ia,
1232 buf->rb_send_bufs[i]->rl_handle,
1233 &buf->rb_send_bufs[i]->rl_iov);
1234 kfree(buf->rb_send_bufs[i]);
1238 kfree(buf->rb_pool);
1242 * Get a set of request/reply buffers.
1244 * Reply buffer (if needed) is attached to send buffer upon return.
1245 * Rule:
1246 * rb_send_index and rb_recv_index MUST always be pointing to the
1247 * *next* available buffer (non-NULL). They are incremented after
1248 * removing buffers, and decremented *before* returning them.
1250 struct rpcrdma_req *
1251 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1253 struct rpcrdma_req *req;
1254 unsigned long flags;
1255 int i;
1256 struct rpcrdma_mw *r;
1258 spin_lock_irqsave(&buffers->rb_lock, flags);
1259 if (buffers->rb_send_index == buffers->rb_max_requests) {
1260 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1261 dprintk("RPC: %s: out of request buffers\n", __func__);
1262 return ((struct rpcrdma_req *)NULL);
1265 req = buffers->rb_send_bufs[buffers->rb_send_index];
1266 if (buffers->rb_send_index < buffers->rb_recv_index) {
1267 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1268 __func__,
1269 buffers->rb_recv_index - buffers->rb_send_index);
1270 req->rl_reply = NULL;
1271 } else {
1272 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1273 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1275 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1276 if (!list_empty(&buffers->rb_mws)) {
1277 i = RPCRDMA_MAX_SEGS - 1;
1278 do {
1279 r = list_entry(buffers->rb_mws.next,
1280 struct rpcrdma_mw, mw_list);
1281 list_del(&r->mw_list);
1282 req->rl_segments[i].mr_chunk.rl_mw = r;
1283 } while (--i >= 0);
1285 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1286 return req;
1290 * Put request/reply buffers back into pool.
1291 * Pre-decrement counter/array index.
1293 void
1294 rpcrdma_buffer_put(struct rpcrdma_req *req)
1296 struct rpcrdma_buffer *buffers = req->rl_buffer;
1297 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1298 int i;
1299 unsigned long flags;
1301 BUG_ON(req->rl_nchunks != 0);
1302 spin_lock_irqsave(&buffers->rb_lock, flags);
1303 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1304 req->rl_niovs = 0;
1305 if (req->rl_reply) {
1306 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1307 init_waitqueue_head(&req->rl_reply->rr_unbind);
1308 req->rl_reply->rr_func = NULL;
1309 req->rl_reply = NULL;
1311 switch (ia->ri_memreg_strategy) {
1312 case RPCRDMA_FRMR:
1313 case RPCRDMA_MTHCAFMR:
1314 case RPCRDMA_MEMWINDOWS_ASYNC:
1315 case RPCRDMA_MEMWINDOWS:
1317 * Cycle mw's back in reverse order, and "spin" them.
1318 * This delays and scrambles reuse as much as possible.
1320 i = 1;
1321 do {
1322 struct rpcrdma_mw **mw;
1323 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1324 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1325 *mw = NULL;
1326 } while (++i < RPCRDMA_MAX_SEGS);
1327 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1328 &buffers->rb_mws);
1329 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1330 break;
1331 default:
1332 break;
1334 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1338 * Recover reply buffers from pool.
1339 * This happens when recovering from error conditions.
1340 * Post-increment counter/array index.
1342 void
1343 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1345 struct rpcrdma_buffer *buffers = req->rl_buffer;
1346 unsigned long flags;
1348 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1349 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1350 spin_lock_irqsave(&buffers->rb_lock, flags);
1351 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1352 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1353 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1355 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1359 * Put reply buffers back into pool when not attached to
1360 * request. This happens in error conditions, and when
1361 * aborting unbinds. Pre-decrement counter/array index.
1363 void
1364 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1366 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1367 unsigned long flags;
1369 rep->rr_func = NULL;
1370 spin_lock_irqsave(&buffers->rb_lock, flags);
1371 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1372 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1376 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1380 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1381 struct ib_mr **mrp, struct ib_sge *iov)
1383 struct ib_phys_buf ipb;
1384 struct ib_mr *mr;
1385 int rc;
1388 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1390 iov->addr = ib_dma_map_single(ia->ri_id->device,
1391 va, len, DMA_BIDIRECTIONAL);
1392 iov->length = len;
1394 if (ia->ri_have_dma_lkey) {
1395 *mrp = NULL;
1396 iov->lkey = ia->ri_dma_lkey;
1397 return 0;
1398 } else if (ia->ri_bind_mem != NULL) {
1399 *mrp = NULL;
1400 iov->lkey = ia->ri_bind_mem->lkey;
1401 return 0;
1404 ipb.addr = iov->addr;
1405 ipb.size = iov->length;
1406 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1407 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1409 dprintk("RPC: %s: phys convert: 0x%llx "
1410 "registered 0x%llx length %d\n",
1411 __func__, (unsigned long long)ipb.addr,
1412 (unsigned long long)iov->addr, len);
1414 if (IS_ERR(mr)) {
1415 *mrp = NULL;
1416 rc = PTR_ERR(mr);
1417 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1418 } else {
1419 *mrp = mr;
1420 iov->lkey = mr->lkey;
1421 rc = 0;
1424 return rc;
1428 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1429 struct ib_mr *mr, struct ib_sge *iov)
1431 int rc;
1433 ib_dma_unmap_single(ia->ri_id->device,
1434 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1436 if (NULL == mr)
1437 return 0;
1439 rc = ib_dereg_mr(mr);
1440 if (rc)
1441 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1442 return rc;
1446 * Wrappers for chunk registration, shared by read/write chunk code.
1449 static void
1450 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1452 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1453 seg->mr_dmalen = seg->mr_len;
1454 if (seg->mr_page)
1455 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1456 seg->mr_page, offset_in_page(seg->mr_offset),
1457 seg->mr_dmalen, seg->mr_dir);
1458 else
1459 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1460 seg->mr_offset,
1461 seg->mr_dmalen, seg->mr_dir);
1462 if (ib_dma_mapping_error(ia->ri_id->device, seg->mr_dma)) {
1463 dprintk("RPC: %s: mr_dma %llx mr_offset %p mr_dma_len %zu\n",
1464 __func__,
1465 (unsigned long long)seg->mr_dma,
1466 seg->mr_offset, seg->mr_dmalen);
1470 static void
1471 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1473 if (seg->mr_page)
1474 ib_dma_unmap_page(ia->ri_id->device,
1475 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1476 else
1477 ib_dma_unmap_single(ia->ri_id->device,
1478 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1481 static int
1482 rpcrdma_register_frmr_external(struct rpcrdma_mr_seg *seg,
1483 int *nsegs, int writing, struct rpcrdma_ia *ia,
1484 struct rpcrdma_xprt *r_xprt)
1486 struct rpcrdma_mr_seg *seg1 = seg;
1487 struct ib_send_wr invalidate_wr, frmr_wr, *bad_wr, *post_wr;
1489 u8 key;
1490 int len, pageoff;
1491 int i, rc;
1493 pageoff = offset_in_page(seg1->mr_offset);
1494 seg1->mr_offset -= pageoff; /* start of page */
1495 seg1->mr_len += pageoff;
1496 len = -pageoff;
1497 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1498 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1499 for (i = 0; i < *nsegs;) {
1500 rpcrdma_map_one(ia, seg, writing);
1501 seg1->mr_chunk.rl_mw->r.frmr.fr_pgl->page_list[i] = seg->mr_dma;
1502 len += seg->mr_len;
1503 BUG_ON(seg->mr_len > PAGE_SIZE);
1504 ++seg;
1505 ++i;
1506 /* Check for holes */
1507 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1508 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1509 break;
1511 dprintk("RPC: %s: Using frmr %p to map %d segments\n",
1512 __func__, seg1->mr_chunk.rl_mw, i);
1514 if (unlikely(seg1->mr_chunk.rl_mw->r.frmr.state == FRMR_IS_VALID)) {
1515 dprintk("RPC: %s: frmr %x left valid, posting invalidate.\n",
1516 __func__,
1517 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey);
1518 /* Invalidate before using. */
1519 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1520 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1521 invalidate_wr.next = &frmr_wr;
1522 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1523 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1524 invalidate_wr.ex.invalidate_rkey =
1525 seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1526 DECR_CQCOUNT(&r_xprt->rx_ep);
1527 post_wr = &invalidate_wr;
1528 } else
1529 post_wr = &frmr_wr;
1531 /* Bump the key */
1532 key = (u8)(seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey & 0x000000FF);
1533 ib_update_fast_reg_key(seg1->mr_chunk.rl_mw->r.frmr.fr_mr, ++key);
1535 /* Prepare FRMR WR */
1536 memset(&frmr_wr, 0, sizeof frmr_wr);
1537 frmr_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1538 frmr_wr.opcode = IB_WR_FAST_REG_MR;
1539 frmr_wr.send_flags = IB_SEND_SIGNALED;
1540 frmr_wr.wr.fast_reg.iova_start = seg1->mr_dma;
1541 frmr_wr.wr.fast_reg.page_list = seg1->mr_chunk.rl_mw->r.frmr.fr_pgl;
1542 frmr_wr.wr.fast_reg.page_list_len = i;
1543 frmr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
1544 frmr_wr.wr.fast_reg.length = i << PAGE_SHIFT;
1545 BUG_ON(frmr_wr.wr.fast_reg.length < len);
1546 frmr_wr.wr.fast_reg.access_flags = (writing ?
1547 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
1548 IB_ACCESS_REMOTE_READ);
1549 frmr_wr.wr.fast_reg.rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1550 DECR_CQCOUNT(&r_xprt->rx_ep);
1552 rc = ib_post_send(ia->ri_id->qp, post_wr, &bad_wr);
1554 if (rc) {
1555 dprintk("RPC: %s: failed ib_post_send for register,"
1556 " status %i\n", __func__, rc);
1557 while (i--)
1558 rpcrdma_unmap_one(ia, --seg);
1559 } else {
1560 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1561 seg1->mr_base = seg1->mr_dma + pageoff;
1562 seg1->mr_nsegs = i;
1563 seg1->mr_len = len;
1565 *nsegs = i;
1566 return rc;
1569 static int
1570 rpcrdma_deregister_frmr_external(struct rpcrdma_mr_seg *seg,
1571 struct rpcrdma_ia *ia, struct rpcrdma_xprt *r_xprt)
1573 struct rpcrdma_mr_seg *seg1 = seg;
1574 struct ib_send_wr invalidate_wr, *bad_wr;
1575 int rc;
1577 while (seg1->mr_nsegs--)
1578 rpcrdma_unmap_one(ia, seg++);
1580 memset(&invalidate_wr, 0, sizeof invalidate_wr);
1581 invalidate_wr.wr_id = (unsigned long)(void *)seg1->mr_chunk.rl_mw;
1582 invalidate_wr.opcode = IB_WR_LOCAL_INV;
1583 invalidate_wr.send_flags = IB_SEND_SIGNALED;
1584 invalidate_wr.ex.invalidate_rkey = seg1->mr_chunk.rl_mw->r.frmr.fr_mr->rkey;
1585 DECR_CQCOUNT(&r_xprt->rx_ep);
1587 rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
1588 if (rc)
1589 dprintk("RPC: %s: failed ib_post_send for invalidate,"
1590 " status %i\n", __func__, rc);
1591 return rc;
1594 static int
1595 rpcrdma_register_fmr_external(struct rpcrdma_mr_seg *seg,
1596 int *nsegs, int writing, struct rpcrdma_ia *ia)
1598 struct rpcrdma_mr_seg *seg1 = seg;
1599 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1600 int len, pageoff, i, rc;
1602 pageoff = offset_in_page(seg1->mr_offset);
1603 seg1->mr_offset -= pageoff; /* start of page */
1604 seg1->mr_len += pageoff;
1605 len = -pageoff;
1606 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1607 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1608 for (i = 0; i < *nsegs;) {
1609 rpcrdma_map_one(ia, seg, writing);
1610 physaddrs[i] = seg->mr_dma;
1611 len += seg->mr_len;
1612 ++seg;
1613 ++i;
1614 /* Check for holes */
1615 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1616 offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
1617 break;
1619 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1620 physaddrs, i, seg1->mr_dma);
1621 if (rc) {
1622 dprintk("RPC: %s: failed ib_map_phys_fmr "
1623 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1624 len, (unsigned long long)seg1->mr_dma,
1625 pageoff, i, rc);
1626 while (i--)
1627 rpcrdma_unmap_one(ia, --seg);
1628 } else {
1629 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1630 seg1->mr_base = seg1->mr_dma + pageoff;
1631 seg1->mr_nsegs = i;
1632 seg1->mr_len = len;
1634 *nsegs = i;
1635 return rc;
1638 static int
1639 rpcrdma_deregister_fmr_external(struct rpcrdma_mr_seg *seg,
1640 struct rpcrdma_ia *ia)
1642 struct rpcrdma_mr_seg *seg1 = seg;
1643 LIST_HEAD(l);
1644 int rc;
1646 list_add(&seg1->mr_chunk.rl_mw->r.fmr->list, &l);
1647 rc = ib_unmap_fmr(&l);
1648 while (seg1->mr_nsegs--)
1649 rpcrdma_unmap_one(ia, seg++);
1650 if (rc)
1651 dprintk("RPC: %s: failed ib_unmap_fmr,"
1652 " status %i\n", __func__, rc);
1653 return rc;
1656 static int
1657 rpcrdma_register_memwin_external(struct rpcrdma_mr_seg *seg,
1658 int *nsegs, int writing, struct rpcrdma_ia *ia,
1659 struct rpcrdma_xprt *r_xprt)
1661 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1662 IB_ACCESS_REMOTE_READ);
1663 struct ib_mw_bind param;
1664 int rc;
1666 *nsegs = 1;
1667 rpcrdma_map_one(ia, seg, writing);
1668 param.mr = ia->ri_bind_mem;
1669 param.wr_id = 0ULL; /* no send cookie */
1670 param.addr = seg->mr_dma;
1671 param.length = seg->mr_len;
1672 param.send_flags = 0;
1673 param.mw_access_flags = mem_priv;
1675 DECR_CQCOUNT(&r_xprt->rx_ep);
1676 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1677 if (rc) {
1678 dprintk("RPC: %s: failed ib_bind_mw "
1679 "%u@0x%llx status %i\n",
1680 __func__, seg->mr_len,
1681 (unsigned long long)seg->mr_dma, rc);
1682 rpcrdma_unmap_one(ia, seg);
1683 } else {
1684 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1685 seg->mr_base = param.addr;
1686 seg->mr_nsegs = 1;
1688 return rc;
1691 static int
1692 rpcrdma_deregister_memwin_external(struct rpcrdma_mr_seg *seg,
1693 struct rpcrdma_ia *ia,
1694 struct rpcrdma_xprt *r_xprt, void **r)
1696 struct ib_mw_bind param;
1697 LIST_HEAD(l);
1698 int rc;
1700 BUG_ON(seg->mr_nsegs != 1);
1701 param.mr = ia->ri_bind_mem;
1702 param.addr = 0ULL; /* unbind */
1703 param.length = 0;
1704 param.mw_access_flags = 0;
1705 if (*r) {
1706 param.wr_id = (u64) (unsigned long) *r;
1707 param.send_flags = IB_SEND_SIGNALED;
1708 INIT_CQCOUNT(&r_xprt->rx_ep);
1709 } else {
1710 param.wr_id = 0ULL;
1711 param.send_flags = 0;
1712 DECR_CQCOUNT(&r_xprt->rx_ep);
1714 rc = ib_bind_mw(ia->ri_id->qp, seg->mr_chunk.rl_mw->r.mw, &param);
1715 rpcrdma_unmap_one(ia, seg);
1716 if (rc)
1717 dprintk("RPC: %s: failed ib_(un)bind_mw,"
1718 " status %i\n", __func__, rc);
1719 else
1720 *r = NULL; /* will upcall on completion */
1721 return rc;
1724 static int
1725 rpcrdma_register_default_external(struct rpcrdma_mr_seg *seg,
1726 int *nsegs, int writing, struct rpcrdma_ia *ia)
1728 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1729 IB_ACCESS_REMOTE_READ);
1730 struct rpcrdma_mr_seg *seg1 = seg;
1731 struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1732 int len, i, rc = 0;
1734 if (*nsegs > RPCRDMA_MAX_DATA_SEGS)
1735 *nsegs = RPCRDMA_MAX_DATA_SEGS;
1736 for (len = 0, i = 0; i < *nsegs;) {
1737 rpcrdma_map_one(ia, seg, writing);
1738 ipb[i].addr = seg->mr_dma;
1739 ipb[i].size = seg->mr_len;
1740 len += seg->mr_len;
1741 ++seg;
1742 ++i;
1743 /* Check for holes */
1744 if ((i < *nsegs && offset_in_page(seg->mr_offset)) ||
1745 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1746 break;
1748 seg1->mr_base = seg1->mr_dma;
1749 seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1750 ipb, i, mem_priv, &seg1->mr_base);
1751 if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1752 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1753 dprintk("RPC: %s: failed ib_reg_phys_mr "
1754 "%u@0x%llx (%d)... status %i\n",
1755 __func__, len,
1756 (unsigned long long)seg1->mr_dma, i, rc);
1757 while (i--)
1758 rpcrdma_unmap_one(ia, --seg);
1759 } else {
1760 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1761 seg1->mr_nsegs = i;
1762 seg1->mr_len = len;
1764 *nsegs = i;
1765 return rc;
1768 static int
1769 rpcrdma_deregister_default_external(struct rpcrdma_mr_seg *seg,
1770 struct rpcrdma_ia *ia)
1772 struct rpcrdma_mr_seg *seg1 = seg;
1773 int rc;
1775 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1776 seg1->mr_chunk.rl_mr = NULL;
1777 while (seg1->mr_nsegs--)
1778 rpcrdma_unmap_one(ia, seg++);
1779 if (rc)
1780 dprintk("RPC: %s: failed ib_dereg_mr,"
1781 " status %i\n", __func__, rc);
1782 return rc;
1786 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1787 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1789 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1790 int rc = 0;
1792 switch (ia->ri_memreg_strategy) {
1794 #if RPCRDMA_PERSISTENT_REGISTRATION
1795 case RPCRDMA_ALLPHYSICAL:
1796 rpcrdma_map_one(ia, seg, writing);
1797 seg->mr_rkey = ia->ri_bind_mem->rkey;
1798 seg->mr_base = seg->mr_dma;
1799 seg->mr_nsegs = 1;
1800 nsegs = 1;
1801 break;
1802 #endif
1804 /* Registration using frmr registration */
1805 case RPCRDMA_FRMR:
1806 rc = rpcrdma_register_frmr_external(seg, &nsegs, writing, ia, r_xprt);
1807 break;
1809 /* Registration using fmr memory registration */
1810 case RPCRDMA_MTHCAFMR:
1811 rc = rpcrdma_register_fmr_external(seg, &nsegs, writing, ia);
1812 break;
1814 /* Registration using memory windows */
1815 case RPCRDMA_MEMWINDOWS_ASYNC:
1816 case RPCRDMA_MEMWINDOWS:
1817 rc = rpcrdma_register_memwin_external(seg, &nsegs, writing, ia, r_xprt);
1818 break;
1820 /* Default registration each time */
1821 default:
1822 rc = rpcrdma_register_default_external(seg, &nsegs, writing, ia);
1823 break;
1825 if (rc)
1826 return -1;
1828 return nsegs;
1832 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1833 struct rpcrdma_xprt *r_xprt, void *r)
1835 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1836 int nsegs = seg->mr_nsegs, rc;
1838 switch (ia->ri_memreg_strategy) {
1840 #if RPCRDMA_PERSISTENT_REGISTRATION
1841 case RPCRDMA_ALLPHYSICAL:
1842 BUG_ON(nsegs != 1);
1843 rpcrdma_unmap_one(ia, seg);
1844 rc = 0;
1845 break;
1846 #endif
1848 case RPCRDMA_FRMR:
1849 rc = rpcrdma_deregister_frmr_external(seg, ia, r_xprt);
1850 break;
1852 case RPCRDMA_MTHCAFMR:
1853 rc = rpcrdma_deregister_fmr_external(seg, ia);
1854 break;
1856 case RPCRDMA_MEMWINDOWS_ASYNC:
1857 case RPCRDMA_MEMWINDOWS:
1858 rc = rpcrdma_deregister_memwin_external(seg, ia, r_xprt, &r);
1859 break;
1861 default:
1862 rc = rpcrdma_deregister_default_external(seg, ia);
1863 break;
1865 if (r) {
1866 struct rpcrdma_rep *rep = r;
1867 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1868 rep->rr_func = NULL;
1869 func(rep); /* dereg done, callback now */
1871 return nsegs;
1875 * Prepost any receive buffer, then post send.
1877 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1880 rpcrdma_ep_post(struct rpcrdma_ia *ia,
1881 struct rpcrdma_ep *ep,
1882 struct rpcrdma_req *req)
1884 struct ib_send_wr send_wr, *send_wr_fail;
1885 struct rpcrdma_rep *rep = req->rl_reply;
1886 int rc;
1888 if (rep) {
1889 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1890 if (rc)
1891 goto out;
1892 req->rl_reply = NULL;
1895 send_wr.next = NULL;
1896 send_wr.wr_id = 0ULL; /* no send cookie */
1897 send_wr.sg_list = req->rl_send_iov;
1898 send_wr.num_sge = req->rl_niovs;
1899 send_wr.opcode = IB_WR_SEND;
1900 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1901 ib_dma_sync_single_for_device(ia->ri_id->device,
1902 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1903 DMA_TO_DEVICE);
1904 ib_dma_sync_single_for_device(ia->ri_id->device,
1905 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1906 DMA_TO_DEVICE);
1907 ib_dma_sync_single_for_device(ia->ri_id->device,
1908 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1909 DMA_TO_DEVICE);
1911 if (DECR_CQCOUNT(ep) > 0)
1912 send_wr.send_flags = 0;
1913 else { /* Provider must take a send completion every now and then */
1914 INIT_CQCOUNT(ep);
1915 send_wr.send_flags = IB_SEND_SIGNALED;
1918 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1919 if (rc)
1920 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1921 rc);
1922 out:
1923 return rc;
1927 * (Re)post a receive buffer.
1930 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1931 struct rpcrdma_ep *ep,
1932 struct rpcrdma_rep *rep)
1934 struct ib_recv_wr recv_wr, *recv_wr_fail;
1935 int rc;
1937 recv_wr.next = NULL;
1938 recv_wr.wr_id = (u64) (unsigned long) rep;
1939 recv_wr.sg_list = &rep->rr_iov;
1940 recv_wr.num_sge = 1;
1942 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1943 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1945 DECR_CQCOUNT(ep);
1946 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1948 if (rc)
1949 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1950 rc);
1951 return rc;