net/sunrpc/xprtrdma/verbs.c
1 /*
2 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the BSD-type
8 * license below:
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
14 * Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
17 * Redistributions in binary form must reproduce the above
18 * copyright notice, this list of conditions and the following
19 * disclaimer in the documentation and/or other materials provided
20 * with the distribution.
22 * Neither the name of the Network Appliance, Inc. nor the names of
23 * its contributors may be used to endorse or promote products
24 * derived from this software without specific prior written
25 * permission.
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 * verbs.c
43 * Encapsulates the major functions managing:
44 * o adapters
45 * o endpoints
46 * o connections
47 * o buffer memory
50 #include <linux/pci.h> /* for Tavor hack below */
52 #include "xprt_rdma.h"
55 * Globals/Macros
58 #ifdef RPC_DEBUG
59 # define RPCDBG_FACILITY RPCDBG_TRANS
60 #endif
63 * internal functions
67 * handle replies in tasklet context, using a single, global list
68 * rdma tasklet function -- just turn around and call the func
69 * for all replies on the list
72 static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
73 static LIST_HEAD(rpcrdma_tasklets_g);
75 static void
76 rpcrdma_run_tasklet(unsigned long data)
78 struct rpcrdma_rep *rep;
79 void (*func)(struct rpcrdma_rep *);
80 unsigned long flags;
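	/* The tasklet data argument is required by the prototype but not
	 * used here; the self-assignment below merely references it. */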
82 data = data;
83 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
84 while (!list_empty(&rpcrdma_tasklets_g)) {
85 rep = list_entry(rpcrdma_tasklets_g.next,
86 struct rpcrdma_rep, rr_list);
87 list_del(&rep->rr_list);
88 func = rep->rr_func;
89 rep->rr_func = NULL;
90 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
92 if (func)
93 func(rep);
94 else
95 rpcrdma_recv_buffer_put(rep);
97 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
99 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
102 static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
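/*
 * Queue a reply on the global tasklet list and kick the tasklet.
 * Called from the completion upcall; the heavy lifting is deferred
 * to rpcrdma_run_tasklet() above.
 */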
104 static inline void
105 rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
107 unsigned long flags;
109 spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
110 list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
111 spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
112 tasklet_schedule(&rpcrdma_tasklet_g);
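/*
 * QP asynchronous error upcall. If the endpoint was connected, mark
 * it failed (-EIO), notify the transport through ep->rep_func, and
 * wake any thread waiting on ep->rep_connect_wait.
 */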
115 static void
116 rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
118 struct rpcrdma_ep *ep = context;
120 dprintk("RPC: %s: QP error %X on device %s ep %p\n",
121 __func__, event->event, event->device->name, context);
122 if (ep->rep_connected == 1) {
123 ep->rep_connected = -EIO;
124 ep->rep_func(ep);
125 wake_up_all(&ep->rep_connect_wait);
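/*
 * CQ asynchronous error upcall; recovery action is the same as for
 * QP errors above.
 */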
129 static void
130 rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
132 struct rpcrdma_ep *ep = context;
134 dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
135 __func__, event->event, event->device->name, context);
136 if (ep->rep_connected == 1) {
137 ep->rep_connected = -EIO;
138 ep->rep_func(ep);
139 wake_up_all(&ep->rep_connect_wait);
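/*
 * Process one work completion. Completions without a wr_id (sends and
 * binds we do not track) are ignored. A failed completion marks the
 * reply with rr_len = ~0U and schedules the reply tasklet. A good
 * receive is synced for the CPU, its credit field is validated and
 * latched into rb_credits, and the reply is handed to the tasklet.
 */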
143 static inline
144 void rpcrdma_event_process(struct ib_wc *wc)
146 struct rpcrdma_rep *rep =
147 (struct rpcrdma_rep *)(unsigned long) wc->wr_id;
149 dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
150 __func__, rep, wc->status, wc->opcode, wc->byte_len);
152 if (!rep) /* send or bind completion that we don't care about */
153 return;
155 if (IB_WC_SUCCESS != wc->status) {
156 dprintk("RPC: %s: %s WC status %X, connection lost\n",
157 __func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
158 wc->status);
159 rep->rr_len = ~0U;
160 rpcrdma_schedule_tasklet(rep);
161 return;
164 switch (wc->opcode) {
165 case IB_WC_RECV:
166 rep->rr_len = wc->byte_len;
167 ib_dma_sync_single_for_cpu(
168 rdmab_to_ia(rep->rr_buffer)->ri_id->device,
169 rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
170 		/* Keep (only) the most recent credits, after checking validity */
171 if (rep->rr_len >= 16) {
172 struct rpcrdma_msg *p =
173 (struct rpcrdma_msg *) rep->rr_base;
174 unsigned int credits = ntohl(p->rm_credit);
175 if (credits == 0) {
176 dprintk("RPC: %s: server"
177 " dropped credits to 0!\n", __func__);
178 /* don't deadlock */
179 credits = 1;
180 } else if (credits > rep->rr_buffer->rb_max_requests) {
181 dprintk("RPC: %s: server"
182 " over-crediting: %d (%d)\n",
183 __func__, credits,
184 rep->rr_buffer->rb_max_requests);
185 credits = rep->rr_buffer->rb_max_requests;
187 atomic_set(&rep->rr_buffer->rb_credits, credits);
189 /* fall through */
190 case IB_WC_BIND_MW:
191 rpcrdma_schedule_tasklet(rep);
192 break;
193 default:
194 dprintk("RPC: %s: unexpected WC event %X\n",
195 __func__, wc->opcode);
196 break;
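/*
 * Poll the CQ until it is empty, dispatching each completion through
 * rpcrdma_event_process(). Returns 0, or the ib_poll_cq() error.
 */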
200 static inline int
201 rpcrdma_cq_poll(struct ib_cq *cq)
203 struct ib_wc wc;
204 int rc;
206 for (;;) {
207 rc = ib_poll_cq(cq, 1, &wc);
208 if (rc < 0) {
209 dprintk("RPC: %s: ib_poll_cq failed %i\n",
210 __func__, rc);
211 return rc;
213 if (rc == 0)
214 break;
216 rpcrdma_event_process(&wc);
219 return 0;
223 * rpcrdma_cq_event_upcall
225 * This upcall handles recv, send, bind and unbind events.
226  * It is reentrant but processes single events in order to maintain
227  * the ordering of receives, and with it the server credit accounting.
229 * It is the responsibility of the scheduled tasklet to return
230 * recv buffers to the pool. NOTE: this affects synchronization of
231 * connection shutdown. That is, the structures required for
232 * the completion of the reply handler must remain intact until
233 * all memory has been reclaimed.
235 * Note that send events are suppressed and do not result in an upcall.
237 static void
238 rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
240 int rc;
242 rc = rpcrdma_cq_poll(cq);
243 if (rc)
244 return;
246 rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
247 if (rc) {
248 dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
249 __func__, rc);
250 return;
253 rpcrdma_cq_poll(cq);
256 #ifdef RPC_DEBUG
257 static const char * const conn[] = {
258 "address resolved",
259 "address error",
260 "route resolved",
261 "route error",
262 "connect request",
263 "connect response",
264 "connect error",
265 "unreachable",
266 "rejected",
267 "established",
268 "disconnected",
269 "device removal"
271 #endif
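/*
 * Connection manager event upcall. Address and route resolution
 * results record any error in ia->ri_async_rc and complete
 * ia->ri_done, on which rpcrdma_create_id() is waiting. Connection
 * events set ep->rep_connected (1, or a negative errno), reset the
 * credit count, notify the transport via ep->rep_func, and wake
 * waiters on ep->rep_connect_wait.
 */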
273 static int
274 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
276 struct rpcrdma_xprt *xprt = id->context;
277 struct rpcrdma_ia *ia = &xprt->rx_ia;
278 struct rpcrdma_ep *ep = &xprt->rx_ep;
279 struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
280 struct ib_qp_attr attr;
281 struct ib_qp_init_attr iattr;
282 int connstate = 0;
284 switch (event->event) {
285 case RDMA_CM_EVENT_ADDR_RESOLVED:
286 case RDMA_CM_EVENT_ROUTE_RESOLVED:
287 complete(&ia->ri_done);
288 break;
289 case RDMA_CM_EVENT_ADDR_ERROR:
290 ia->ri_async_rc = -EHOSTUNREACH;
291 dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
292 __func__, ep);
293 complete(&ia->ri_done);
294 break;
295 case RDMA_CM_EVENT_ROUTE_ERROR:
296 ia->ri_async_rc = -ENETUNREACH;
297 dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
298 __func__, ep);
299 complete(&ia->ri_done);
300 break;
301 case RDMA_CM_EVENT_ESTABLISHED:
302 connstate = 1;
303 ib_query_qp(ia->ri_id->qp, &attr,
304 IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
305 &iattr);
306 dprintk("RPC: %s: %d responder resources"
307 " (%d initiator)\n",
308 __func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
309 goto connected;
310 case RDMA_CM_EVENT_CONNECT_ERROR:
311 connstate = -ENOTCONN;
312 goto connected;
313 case RDMA_CM_EVENT_UNREACHABLE:
314 connstate = -ENETDOWN;
315 goto connected;
316 case RDMA_CM_EVENT_REJECTED:
317 connstate = -ECONNREFUSED;
318 goto connected;
319 case RDMA_CM_EVENT_DISCONNECTED:
320 connstate = -ECONNABORTED;
321 goto connected;
322 case RDMA_CM_EVENT_DEVICE_REMOVAL:
323 connstate = -ENODEV;
324 connected:
325 dprintk("RPC: %s: %s: %u.%u.%u.%u:%u"
326 " (ep 0x%p event 0x%x)\n",
327 __func__,
328 (event->event <= 11) ? conn[event->event] :
329 "unknown connection error",
330 NIPQUAD(addr->sin_addr.s_addr),
331 ntohs(addr->sin_port),
332 ep, event->event);
333 atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
334 dprintk("RPC: %s: %sconnected\n",
335 __func__, connstate > 0 ? "" : "dis");
336 ep->rep_connected = connstate;
337 ep->rep_func(ep);
338 wake_up_all(&ep->rep_connect_wait);
339 break;
340 default:
341 ia->ri_async_rc = -EINVAL;
342 dprintk("RPC: %s: unexpected CM event %X\n",
343 __func__, event->event);
344 complete(&ia->ri_done);
345 break;
348 return 0;
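/*
 * Create an RDMA CM ID and synchronously resolve the peer's address
 * and route. Failures reported by the CM upcall above are returned
 * via ia->ri_async_rc after each wait on ia->ri_done. Returns the
 * new ID, or an ERR_PTR on failure.
 */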
351 static struct rdma_cm_id *
352 rpcrdma_create_id(struct rpcrdma_xprt *xprt,
353 struct rpcrdma_ia *ia, struct sockaddr *addr)
355 struct rdma_cm_id *id;
356 int rc;
358 id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
359 if (IS_ERR(id)) {
360 rc = PTR_ERR(id);
361 dprintk("RPC: %s: rdma_create_id() failed %i\n",
362 __func__, rc);
363 return id;
366 ia->ri_async_rc = 0;
367 rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
368 if (rc) {
369 dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
370 __func__, rc);
371 goto out;
373 wait_for_completion(&ia->ri_done);
374 rc = ia->ri_async_rc;
375 if (rc)
376 goto out;
378 ia->ri_async_rc = 0;
379 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
380 if (rc) {
381 dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
382 __func__, rc);
383 goto out;
385 wait_for_completion(&ia->ri_done);
386 rc = ia->ri_async_rc;
387 if (rc)
388 goto out;
390 return id;
392 out:
393 rdma_destroy_id(id);
394 return ERR_PTR(rc);
398 * Drain any cq, prior to teardown.
400 static void
401 rpcrdma_clean_cq(struct ib_cq *cq)
403 struct ib_wc wc;
404 int count = 0;
406 while (1 == ib_poll_cq(cq, 1, &wc))
407 ++count;
409 if (count)
410 dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
411 __func__, count, wc.opcode);
415 * Exported functions.
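/*
 * Editorial sketch (not part of the original source): the RPC/RDMA
 * transport is expected to drive these exported routines roughly in
 * this order, with field names as used elsewhere in this file:
 *
 *	rpcrdma_ia_open(xprt, addr, memreg);
 *	rpcrdma_ep_create(&xprt->rx_ep, &xprt->rx_ia, &xprt->rx_data);
 *	rpcrdma_buffer_create(&xprt->rx_buf, &xprt->rx_ep, &xprt->rx_ia,
 *			      &xprt->rx_data);
 *	rpcrdma_ep_connect(&xprt->rx_ep, &xprt->rx_ia);
 *	...
 *	req = rpcrdma_buffer_get(&xprt->rx_buf);
 *	rpcrdma_ep_post(&xprt->rx_ia, &xprt->rx_ep, req);
 *
 * The actual call sites live in the transport code, not here.
 */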
419 * Open and initialize an Interface Adapter.
420 * o initializes fields of struct rpcrdma_ia, including
421 * interface and provider attributes and protection zone.
424 rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
426 int rc;
427 struct rpcrdma_ia *ia = &xprt->rx_ia;
429 init_completion(&ia->ri_done);
431 ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
432 if (IS_ERR(ia->ri_id)) {
433 rc = PTR_ERR(ia->ri_id);
434 goto out1;
437 ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
438 if (IS_ERR(ia->ri_pd)) {
439 rc = PTR_ERR(ia->ri_pd);
440 dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
441 __func__, rc);
442 goto out2;
446 * Optionally obtain an underlying physical identity mapping in
447 * order to do a memory window-based bind. This base registration
448 * is protected from remote access - that is enabled only by binding
449 * for the specific bytes targeted during each RPC operation, and
450 * revoked after the corresponding completion similar to a storage
451 * adapter.
453 if (memreg > RPCRDMA_REGISTER) {
454 int mem_priv = IB_ACCESS_LOCAL_WRITE;
455 switch (memreg) {
456 #if RPCRDMA_PERSISTENT_REGISTRATION
457 case RPCRDMA_ALLPHYSICAL:
458 mem_priv |= IB_ACCESS_REMOTE_WRITE;
459 mem_priv |= IB_ACCESS_REMOTE_READ;
460 break;
461 #endif
462 case RPCRDMA_MEMWINDOWS_ASYNC:
463 case RPCRDMA_MEMWINDOWS:
464 mem_priv |= IB_ACCESS_MW_BIND;
465 break;
466 default:
467 break;
469 ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
470 if (IS_ERR(ia->ri_bind_mem)) {
471 printk(KERN_ALERT "%s: ib_get_dma_mr for "
472 "phys register failed with %lX\n\t"
473 "Will continue with degraded performance\n",
474 __func__, PTR_ERR(ia->ri_bind_mem));
475 memreg = RPCRDMA_REGISTER;
476 ia->ri_bind_mem = NULL;
480 /* Else will do memory reg/dereg for each chunk */
481 ia->ri_memreg_strategy = memreg;
483 return 0;
484 out2:
485 rdma_destroy_id(ia->ri_id);
486 out1:
487 return rc;
491 * Clean up/close an IA.
492 * o if event handles and PD have been initialized, free them.
493 * o close the IA
495 void
496 rpcrdma_ia_close(struct rpcrdma_ia *ia)
498 int rc;
500 dprintk("RPC: %s: entering\n", __func__);
501 if (ia->ri_bind_mem != NULL) {
502 rc = ib_dereg_mr(ia->ri_bind_mem);
503 dprintk("RPC: %s: ib_dereg_mr returned %i\n",
504 __func__, rc);
506 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp)
507 rdma_destroy_qp(ia->ri_id);
508 if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
509 rc = ib_dealloc_pd(ia->ri_pd);
510 dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
511 __func__, rc);
513 if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
514 rdma_destroy_id(ia->ri_id);
518 * Create unconnected endpoint.
521 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
522 struct rpcrdma_create_data_internal *cdata)
524 struct ib_device_attr devattr;
525 int rc;
527 rc = ib_query_device(ia->ri_id->device, &devattr);
528 if (rc) {
529 dprintk("RPC: %s: ib_query_device failed %d\n",
530 __func__, rc);
531 return rc;
534 /* check provider's send/recv wr limits */
535 if (cdata->max_requests > devattr.max_qp_wr)
536 cdata->max_requests = devattr.max_qp_wr;
538 ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
539 ep->rep_attr.qp_context = ep;
540 /* send_cq and recv_cq initialized below */
541 ep->rep_attr.srq = NULL;
542 ep->rep_attr.cap.max_send_wr = cdata->max_requests;
543 switch (ia->ri_memreg_strategy) {
544 case RPCRDMA_MEMWINDOWS_ASYNC:
545 case RPCRDMA_MEMWINDOWS:
546 /* Add room for mw_binds+unbinds - overkill! */
547 ep->rep_attr.cap.max_send_wr++;
548 ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
549 if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
550 return -EINVAL;
551 break;
552 default:
553 break;
555 ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
556 ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
557 ep->rep_attr.cap.max_recv_sge = 1;
558 ep->rep_attr.cap.max_inline_data = 0;
559 ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
560 ep->rep_attr.qp_type = IB_QPT_RC;
561 ep->rep_attr.port_num = ~0;
563 dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
564 "iovs: send %d recv %d\n",
565 __func__,
566 ep->rep_attr.cap.max_send_wr,
567 ep->rep_attr.cap.max_recv_wr,
568 ep->rep_attr.cap.max_send_sge,
569 ep->rep_attr.cap.max_recv_sge);
571 /* set trigger for requesting send completion */
572 ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
573 switch (ia->ri_memreg_strategy) {
574 case RPCRDMA_MEMWINDOWS_ASYNC:
575 case RPCRDMA_MEMWINDOWS:
576 ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
577 break;
578 default:
579 break;
581 if (ep->rep_cqinit <= 2)
582 ep->rep_cqinit = 0;
583 INIT_CQCOUNT(ep);
584 ep->rep_ia = ia;
585 init_waitqueue_head(&ep->rep_connect_wait);
588 * Create a single cq for receive dto and mw_bind (only ever
589 * care about unbind, really). Send completions are suppressed.
590 * Use single threaded tasklet upcalls to maintain ordering.
592 ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
593 rpcrdma_cq_async_error_upcall, NULL,
594 ep->rep_attr.cap.max_recv_wr +
595 ep->rep_attr.cap.max_send_wr + 1, 0);
596 if (IS_ERR(ep->rep_cq)) {
597 rc = PTR_ERR(ep->rep_cq);
598 dprintk("RPC: %s: ib_create_cq failed: %i\n",
599 __func__, rc);
600 goto out1;
603 rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
604 if (rc) {
605 dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
606 __func__, rc);
607 goto out2;
610 ep->rep_attr.send_cq = ep->rep_cq;
611 ep->rep_attr.recv_cq = ep->rep_cq;
613 /* Initialize cma parameters */
615 /* RPC/RDMA does not use private data */
616 ep->rep_remote_cma.private_data = NULL;
617 ep->rep_remote_cma.private_data_len = 0;
619 /* Client offers RDMA Read but does not initiate */
620 switch (ia->ri_memreg_strategy) {
621 case RPCRDMA_BOUNCEBUFFERS:
622 ep->rep_remote_cma.responder_resources = 0;
623 break;
624 case RPCRDMA_MTHCAFMR:
625 case RPCRDMA_REGISTER:
626 ep->rep_remote_cma.responder_resources = cdata->max_requests *
627 (RPCRDMA_MAX_DATA_SEGS / 8);
628 break;
629 case RPCRDMA_MEMWINDOWS:
630 case RPCRDMA_MEMWINDOWS_ASYNC:
631 #if RPCRDMA_PERSISTENT_REGISTRATION
632 case RPCRDMA_ALLPHYSICAL:
633 #endif
634 ep->rep_remote_cma.responder_resources = cdata->max_requests *
635 (RPCRDMA_MAX_DATA_SEGS / 2);
636 break;
637 default:
638 break;
640 if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom)
641 ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
642 ep->rep_remote_cma.initiator_depth = 0;
644 ep->rep_remote_cma.retry_count = 7;
645 ep->rep_remote_cma.flow_control = 0;
646 ep->rep_remote_cma.rnr_retry_count = 0;
648 return 0;
650 out2:
651 if (ib_destroy_cq(ep->rep_cq))
653 out1:
654 return rc;
658 * rpcrdma_ep_destroy
660 * Disconnect and destroy endpoint. After this, the only
661 * valid operations on the ep are to free it (if dynamically
662 * allocated) or re-create it.
664 * The caller's error handling must be sure to not leak the endpoint
665 * if this function fails.
668 rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
670 int rc;
672 dprintk("RPC: %s: entering, connected is %d\n",
673 __func__, ep->rep_connected);
675 if (ia->ri_id->qp) {
676 rc = rpcrdma_ep_disconnect(ep, ia);
677 if (rc)
678 dprintk("RPC: %s: rpcrdma_ep_disconnect"
679 " returned %i\n", __func__, rc);
682 ep->rep_func = NULL;
684 /* padding - could be done in rpcrdma_buffer_destroy... */
685 if (ep->rep_pad_mr) {
686 rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
687 ep->rep_pad_mr = NULL;
690 if (ia->ri_id->qp) {
691 rdma_destroy_qp(ia->ri_id);
692 ia->ri_id->qp = NULL;
695 rpcrdma_clean_cq(ep->rep_cq);
696 rc = ib_destroy_cq(ep->rep_cq);
697 if (rc)
698 dprintk("RPC: %s: ib_destroy_cq returned %i\n",
699 __func__, rc);
701 return rc;
705 * Connect unconnected endpoint.
708 rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
710 struct rdma_cm_id *id;
711 int rc = 0;
712 int retry_count = 0;
713 int reconnect = (ep->rep_connected != 0);
715 if (reconnect) {
716 struct rpcrdma_xprt *xprt;
717 retry:
718 rc = rpcrdma_ep_disconnect(ep, ia);
719 if (rc && rc != -ENOTCONN)
720 dprintk("RPC: %s: rpcrdma_ep_disconnect"
721 " status %i\n", __func__, rc);
722 rpcrdma_clean_cq(ep->rep_cq);
724 xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
725 id = rpcrdma_create_id(xprt, ia,
726 (struct sockaddr *)&xprt->rx_data.addr);
727 if (IS_ERR(id)) {
728 rc = PTR_ERR(id);
729 goto out;
731 /* TEMP TEMP TEMP - fail if new device:
732 * Deregister/remarshal *all* requests!
733 * Close and recreate adapter, pd, etc!
734 * Re-determine all attributes still sane!
735 * More stuff I haven't thought of!
736 * Rrrgh!
738 if (ia->ri_id->device != id->device) {
739 printk("RPC: %s: can't reconnect on "
740 "different device!\n", __func__);
741 rdma_destroy_id(id);
742 rc = -ENETDOWN;
743 goto out;
745 /* END TEMP */
746 rdma_destroy_id(ia->ri_id);
747 ia->ri_id = id;
750 rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
751 if (rc) {
752 dprintk("RPC: %s: rdma_create_qp failed %i\n",
753 __func__, rc);
754 goto out;
757 /* XXX Tavor device performs badly with 2K MTU! */
758 if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
759 struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
760 if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
761 (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
762 pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
763 struct ib_qp_attr attr = {
764 .path_mtu = IB_MTU_1024
766 rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
770 	/* Theoretically a client initiator_depth > 0 is not needed,
771 	 * but many peers fail to complete the connection unless
772 	 * initiator_depth == responder_resources! */
773 if (ep->rep_remote_cma.initiator_depth !=
774 ep->rep_remote_cma.responder_resources)
775 ep->rep_remote_cma.initiator_depth =
776 ep->rep_remote_cma.responder_resources;
778 ep->rep_connected = 0;
780 rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
781 if (rc) {
782 dprintk("RPC: %s: rdma_connect() failed with %i\n",
783 __func__, rc);
784 goto out;
787 if (reconnect)
788 return 0;
790 wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
793 	 * Check state. A non-peer reject indicates no listener
794 	 * (ECONNREFUSED), which may be a transient state. All
795 	 * others indicate a transport condition that has already
796 	 * been handled on a best-effort basis.
798 if (ep->rep_connected == -ECONNREFUSED
799 && ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
800 dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
801 goto retry;
803 if (ep->rep_connected <= 0) {
804 		/* Sometimes, the only way to reliably connect to remote
805 		 * CMs is to use the same nonzero values for ORD and IRD. */
806 ep->rep_remote_cma.initiator_depth =
807 ep->rep_remote_cma.responder_resources;
808 if (ep->rep_remote_cma.initiator_depth == 0)
809 ++ep->rep_remote_cma.initiator_depth;
810 if (ep->rep_remote_cma.responder_resources == 0)
811 ++ep->rep_remote_cma.responder_resources;
812 if (retry_count++ == 0)
813 goto retry;
814 rc = ep->rep_connected;
815 } else {
816 dprintk("RPC: %s: connected\n", __func__);
819 out:
820 if (rc)
821 ep->rep_connected = rc;
822 return rc;
826 * rpcrdma_ep_disconnect
828 * This is separate from destroy to facilitate the ability
829 * to reconnect without recreating the endpoint.
831 * This call is not reentrant, and must not be made in parallel
832 * on the same endpoint.
835 rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
837 int rc;
839 rpcrdma_clean_cq(ep->rep_cq);
840 rc = rdma_disconnect(ia->ri_id);
841 if (!rc) {
842 /* returns without wait if not connected */
843 wait_event_interruptible(ep->rep_connect_wait,
844 ep->rep_connected != 1);
845 dprintk("RPC: %s: after wait, %sconnected\n", __func__,
846 (ep->rep_connected == 1) ? "still " : "dis");
847 } else {
848 dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
849 ep->rep_connected = rc;
851 return rc;
855 * Initialize buffer memory
858 rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
859 struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
861 char *p;
862 size_t len;
863 int i, rc;
865 buf->rb_max_requests = cdata->max_requests;
866 spin_lock_init(&buf->rb_lock);
867 atomic_set(&buf->rb_credits, 1);
869 /* Need to allocate:
870 * 1. arrays for send and recv pointers
871 * 2. arrays of struct rpcrdma_req to fill in pointers
872 * 3. array of struct rpcrdma_rep for replies
873 * 4. padding, if any
874 * 5. mw's, if any
875 * Send/recv buffers in req/rep need to be registered
878 len = buf->rb_max_requests *
879 (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
880 len += cdata->padding;
881 switch (ia->ri_memreg_strategy) {
882 case RPCRDMA_MTHCAFMR:
883 /* TBD we are perhaps overallocating here */
884 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
885 sizeof(struct rpcrdma_mw);
886 break;
887 case RPCRDMA_MEMWINDOWS_ASYNC:
888 case RPCRDMA_MEMWINDOWS:
889 len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
890 sizeof(struct rpcrdma_mw);
891 break;
892 default:
893 break;
896 /* allocate 1, 4 and 5 in one shot */
897 p = kzalloc(len, GFP_KERNEL);
898 if (p == NULL) {
899 dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
900 __func__, len);
901 rc = -ENOMEM;
902 goto out;
904 buf->rb_pool = p; /* for freeing it later */
906 buf->rb_send_bufs = (struct rpcrdma_req **) p;
907 p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
908 buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
909 p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
912 * Register the zeroed pad buffer, if any.
914 if (cdata->padding) {
915 rc = rpcrdma_register_internal(ia, p, cdata->padding,
916 &ep->rep_pad_mr, &ep->rep_pad);
917 if (rc)
918 goto out;
920 p += cdata->padding;
923 * Allocate the fmr's, or mw's for mw_bind chunk registration.
924 * We "cycle" the mw's in order to minimize rkey reuse,
925 * and also reduce unbind-to-bind collision.
927 INIT_LIST_HEAD(&buf->rb_mws);
928 switch (ia->ri_memreg_strategy) {
929 case RPCRDMA_MTHCAFMR:
931 struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
932 struct ib_fmr_attr fa = {
933 RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT
935 /* TBD we are perhaps overallocating here */
936 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
937 r->r.fmr = ib_alloc_fmr(ia->ri_pd,
938 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
939 &fa);
940 if (IS_ERR(r->r.fmr)) {
941 rc = PTR_ERR(r->r.fmr);
942 dprintk("RPC: %s: ib_alloc_fmr"
943 " failed %i\n", __func__, rc);
944 goto out;
946 list_add(&r->mw_list, &buf->rb_mws);
947 ++r;
950 break;
951 case RPCRDMA_MEMWINDOWS_ASYNC:
952 case RPCRDMA_MEMWINDOWS:
954 struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
955 /* Allocate one extra request's worth, for full cycling */
956 for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
957 r->r.mw = ib_alloc_mw(ia->ri_pd);
958 if (IS_ERR(r->r.mw)) {
959 rc = PTR_ERR(r->r.mw);
960 dprintk("RPC: %s: ib_alloc_mw"
961 " failed %i\n", __func__, rc);
962 goto out;
964 list_add(&r->mw_list, &buf->rb_mws);
965 ++r;
968 break;
969 default:
970 break;
974 * Allocate/init the request/reply buffers. Doing this
975 * using kmalloc for now -- one for each buf.
977 for (i = 0; i < buf->rb_max_requests; i++) {
978 struct rpcrdma_req *req;
979 struct rpcrdma_rep *rep;
981 len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
982 /* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
983 /* Typical ~2400b, so rounding up saves work later */
984 if (len < 4096)
985 len = 4096;
986 req = kmalloc(len, GFP_KERNEL);
987 if (req == NULL) {
988 dprintk("RPC: %s: request buffer %d alloc"
989 " failed\n", __func__, i);
990 rc = -ENOMEM;
991 goto out;
993 memset(req, 0, sizeof(struct rpcrdma_req));
994 buf->rb_send_bufs[i] = req;
995 buf->rb_send_bufs[i]->rl_buffer = buf;
997 rc = rpcrdma_register_internal(ia, req->rl_base,
998 len - offsetof(struct rpcrdma_req, rl_base),
999 &buf->rb_send_bufs[i]->rl_handle,
1000 &buf->rb_send_bufs[i]->rl_iov);
1001 if (rc)
1002 goto out;
1004 buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);
1006 len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
1007 rep = kmalloc(len, GFP_KERNEL);
1008 if (rep == NULL) {
1009 dprintk("RPC: %s: reply buffer %d alloc failed\n",
1010 __func__, i);
1011 rc = -ENOMEM;
1012 goto out;
1014 memset(rep, 0, sizeof(struct rpcrdma_rep));
1015 buf->rb_recv_bufs[i] = rep;
1016 buf->rb_recv_bufs[i]->rr_buffer = buf;
1017 init_waitqueue_head(&rep->rr_unbind);
1019 rc = rpcrdma_register_internal(ia, rep->rr_base,
1020 len - offsetof(struct rpcrdma_rep, rr_base),
1021 &buf->rb_recv_bufs[i]->rr_handle,
1022 &buf->rb_recv_bufs[i]->rr_iov);
1023 if (rc)
1024 goto out;
1027 dprintk("RPC: %s: max_requests %d\n",
1028 __func__, buf->rb_max_requests);
1029 /* done */
1030 return 0;
1031 out:
1032 rpcrdma_buffer_destroy(buf);
1033 return rc;
1037 * Unregister and destroy buffer memory. Need to deal with
1038 * partial initialization, so it's callable from failed create.
1039 * Must be called before destroying endpoint, as registrations
1040 * reference it.
1042 void
1043 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
1045 int rc, i;
1046 struct rpcrdma_ia *ia = rdmab_to_ia(buf);
1048 /* clean up in reverse order from create
1049 * 1. recv mr memory (mr free, then kfree)
1050 * 1a. bind mw memory
1051 * 2. send mr memory (mr free, then kfree)
1052 * 3. padding (if any) [moved to rpcrdma_ep_destroy]
1053 * 4. arrays
1055 dprintk("RPC: %s: entering\n", __func__);
1057 for (i = 0; i < buf->rb_max_requests; i++) {
1058 if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
1059 rpcrdma_deregister_internal(ia,
1060 buf->rb_recv_bufs[i]->rr_handle,
1061 &buf->rb_recv_bufs[i]->rr_iov);
1062 kfree(buf->rb_recv_bufs[i]);
1064 if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
1065 while (!list_empty(&buf->rb_mws)) {
1066 struct rpcrdma_mw *r;
1067 r = list_entry(buf->rb_mws.next,
1068 struct rpcrdma_mw, mw_list);
1069 list_del(&r->mw_list);
1070 switch (ia->ri_memreg_strategy) {
1071 case RPCRDMA_MTHCAFMR:
1072 rc = ib_dealloc_fmr(r->r.fmr);
1073 if (rc)
1074 dprintk("RPC: %s:"
1075 " ib_dealloc_fmr"
1076 " failed %i\n",
1077 __func__, rc);
1078 break;
1079 case RPCRDMA_MEMWINDOWS_ASYNC:
1080 case RPCRDMA_MEMWINDOWS:
1081 rc = ib_dealloc_mw(r->r.mw);
1082 if (rc)
1083 dprintk("RPC: %s:"
1084 " ib_dealloc_mw"
1085 " failed %i\n",
1086 __func__, rc);
1087 break;
1088 default:
1089 break;
1092 rpcrdma_deregister_internal(ia,
1093 buf->rb_send_bufs[i]->rl_handle,
1094 &buf->rb_send_bufs[i]->rl_iov);
1095 kfree(buf->rb_send_bufs[i]);
1099 kfree(buf->rb_pool);
1103 * Get a set of request/reply buffers.
1105 * Reply buffer (if needed) is attached to send buffer upon return.
1106 * Rule:
1107 * rb_send_index and rb_recv_index MUST always be pointing to the
1108 * *next* available buffer (non-NULL). They are incremented after
1109 * removing buffers, and decremented *before* returning them.
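/*
 * Editorial usage sketch (not part of the original source): a typical
 * sender pairs these as
 *
 *	req = rpcrdma_buffer_get(&r_xprt->rx_buf);
 *	if (req == NULL)
 *		... back off and retry later ...
 *	... marshal, register chunks, rpcrdma_ep_post() ...
 *	rpcrdma_buffer_put(req);	... on completion or error ...
 */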
1111 struct rpcrdma_req *
1112 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
1114 struct rpcrdma_req *req;
1115 unsigned long flags;
1117 spin_lock_irqsave(&buffers->rb_lock, flags);
1118 if (buffers->rb_send_index == buffers->rb_max_requests) {
1119 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1120 dprintk("RPC: %s: out of request buffers\n", __func__);
1121 return ((struct rpcrdma_req *)NULL);
1124 req = buffers->rb_send_bufs[buffers->rb_send_index];
1125 if (buffers->rb_send_index < buffers->rb_recv_index) {
1126 dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
1127 __func__,
1128 buffers->rb_recv_index - buffers->rb_send_index);
1129 req->rl_reply = NULL;
1130 } else {
1131 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1132 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1134 buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
1135 if (!list_empty(&buffers->rb_mws)) {
1136 int i = RPCRDMA_MAX_SEGS - 1;
1137 do {
1138 struct rpcrdma_mw *r;
1139 r = list_entry(buffers->rb_mws.next,
1140 struct rpcrdma_mw, mw_list);
1141 list_del(&r->mw_list);
1142 req->rl_segments[i].mr_chunk.rl_mw = r;
1143 } while (--i >= 0);
1145 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1146 return req;
1150 * Put request/reply buffers back into pool.
1151 * Pre-decrement counter/array index.
1153 void
1154 rpcrdma_buffer_put(struct rpcrdma_req *req)
1156 struct rpcrdma_buffer *buffers = req->rl_buffer;
1157 struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
1158 int i;
1159 unsigned long flags;
1161 BUG_ON(req->rl_nchunks != 0);
1162 spin_lock_irqsave(&buffers->rb_lock, flags);
1163 buffers->rb_send_bufs[--buffers->rb_send_index] = req;
1164 req->rl_niovs = 0;
1165 if (req->rl_reply) {
1166 buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
1167 init_waitqueue_head(&req->rl_reply->rr_unbind);
1168 req->rl_reply->rr_func = NULL;
1169 req->rl_reply = NULL;
1171 switch (ia->ri_memreg_strategy) {
1172 case RPCRDMA_MTHCAFMR:
1173 case RPCRDMA_MEMWINDOWS_ASYNC:
1174 case RPCRDMA_MEMWINDOWS:
1176 * Cycle mw's back in reverse order, and "spin" them.
1177 * This delays and scrambles reuse as much as possible.
1179 i = 1;
1180 do {
1181 struct rpcrdma_mw **mw;
1182 mw = &req->rl_segments[i].mr_chunk.rl_mw;
1183 list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
1184 *mw = NULL;
1185 } while (++i < RPCRDMA_MAX_SEGS);
1186 list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
1187 &buffers->rb_mws);
1188 req->rl_segments[0].mr_chunk.rl_mw = NULL;
1189 break;
1190 default:
1191 break;
1193 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1197 * Recover reply buffers from pool.
1198 * This happens when recovering from error conditions.
1199 * Post-increment counter/array index.
1201 void
1202 rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
1204 struct rpcrdma_buffer *buffers = req->rl_buffer;
1205 unsigned long flags;
1207 if (req->rl_iov.length == 0) /* special case xprt_rdma_allocate() */
1208 buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
1209 spin_lock_irqsave(&buffers->rb_lock, flags);
1210 if (buffers->rb_recv_index < buffers->rb_max_requests) {
1211 req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
1212 buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
1214 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1218 * Put reply buffers back into pool when not attached to
1219 * request. This happens in error conditions, and when
1220 * aborting unbinds. Pre-decrement counter/array index.
1222 void
1223 rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
1225 struct rpcrdma_buffer *buffers = rep->rr_buffer;
1226 unsigned long flags;
1228 rep->rr_func = NULL;
1229 spin_lock_irqsave(&buffers->rb_lock, flags);
1230 buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
1231 spin_unlock_irqrestore(&buffers->rb_lock, flags);
1235 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
1239 rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
1240 struct ib_mr **mrp, struct ib_sge *iov)
1242 struct ib_phys_buf ipb;
1243 struct ib_mr *mr;
1244 int rc;
1247 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
1249 iov->addr = ib_dma_map_single(ia->ri_id->device,
1250 va, len, DMA_BIDIRECTIONAL);
1251 iov->length = len;
1253 if (ia->ri_bind_mem != NULL) {
1254 *mrp = NULL;
1255 iov->lkey = ia->ri_bind_mem->lkey;
1256 return 0;
1259 ipb.addr = iov->addr;
1260 ipb.size = iov->length;
1261 mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
1262 IB_ACCESS_LOCAL_WRITE, &iov->addr);
1264 dprintk("RPC: %s: phys convert: 0x%llx "
1265 "registered 0x%llx length %d\n",
1266 __func__, (unsigned long long)ipb.addr,
1267 (unsigned long long)iov->addr, len);
1269 if (IS_ERR(mr)) {
1270 *mrp = NULL;
1271 rc = PTR_ERR(mr);
1272 dprintk("RPC: %s: failed with %i\n", __func__, rc);
1273 } else {
1274 *mrp = mr;
1275 iov->lkey = mr->lkey;
1276 rc = 0;
1279 return rc;
1283 rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
1284 struct ib_mr *mr, struct ib_sge *iov)
1286 int rc;
1288 ib_dma_unmap_single(ia->ri_id->device,
1289 iov->addr, iov->length, DMA_BIDIRECTIONAL);
1291 if (NULL == mr)
1292 return 0;
1294 rc = ib_dereg_mr(mr);
1295 if (rc)
1296 dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
1297 return rc;
1301 * Wrappers for chunk registration, shared by read/write chunk code.
1304 static void
1305 rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
1307 seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
1308 seg->mr_dmalen = seg->mr_len;
1309 if (seg->mr_page)
1310 seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
1311 seg->mr_page, offset_in_page(seg->mr_offset),
1312 seg->mr_dmalen, seg->mr_dir);
1313 else
1314 seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
1315 seg->mr_offset,
1316 seg->mr_dmalen, seg->mr_dir);
1319 static void
1320 rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
1322 if (seg->mr_page)
1323 ib_dma_unmap_page(ia->ri_id->device,
1324 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
1325 else
1326 ib_dma_unmap_single(ia->ri_id->device,
1327 seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
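/*
 * Register a chunk of caller memory for remote access, using whichever
 * strategy was chosen at IA open time: persistent all-physical mapping,
 * FMR, a memory-window bind, or a fresh ib_reg_phys_mr() per chunk.
 * Returns the number of segments registered, or -1 on failure.
 */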
1331 rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
1332 int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
1334 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1335 int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
1336 IB_ACCESS_REMOTE_READ);
1337 struct rpcrdma_mr_seg *seg1 = seg;
1338 int i;
1339 int rc = 0;
1341 switch (ia->ri_memreg_strategy) {
1343 #if RPCRDMA_PERSISTENT_REGISTRATION
1344 case RPCRDMA_ALLPHYSICAL:
1345 rpcrdma_map_one(ia, seg, writing);
1346 seg->mr_rkey = ia->ri_bind_mem->rkey;
1347 seg->mr_base = seg->mr_dma;
1348 seg->mr_nsegs = 1;
1349 nsegs = 1;
1350 break;
1351 #endif
1353 /* Registration using fast memory registration */
1354 case RPCRDMA_MTHCAFMR:
1356 u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
1357 int len, pageoff = offset_in_page(seg->mr_offset);
1358 seg1->mr_offset -= pageoff; /* start of page */
1359 seg1->mr_len += pageoff;
1360 len = -pageoff;
1361 if (nsegs > RPCRDMA_MAX_DATA_SEGS)
1362 nsegs = RPCRDMA_MAX_DATA_SEGS;
1363 for (i = 0; i < nsegs;) {
1364 rpcrdma_map_one(ia, seg, writing);
1365 physaddrs[i] = seg->mr_dma;
1366 len += seg->mr_len;
1367 ++seg;
1368 ++i;
1369 /* Check for holes */
1370 if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
1371 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1372 break;
1374 nsegs = i;
1375 rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
1376 physaddrs, nsegs, seg1->mr_dma);
1377 if (rc) {
1378 dprintk("RPC: %s: failed ib_map_phys_fmr "
1379 "%u@0x%llx+%i (%d)... status %i\n", __func__,
1380 len, (unsigned long long)seg1->mr_dma,
1381 pageoff, nsegs, rc);
1382 while (nsegs--)
1383 rpcrdma_unmap_one(ia, --seg);
1384 } else {
1385 seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
1386 seg1->mr_base = seg1->mr_dma + pageoff;
1387 seg1->mr_nsegs = nsegs;
1388 seg1->mr_len = len;
1391 break;
1393 /* Registration using memory windows */
1394 case RPCRDMA_MEMWINDOWS_ASYNC:
1395 case RPCRDMA_MEMWINDOWS:
1397 struct ib_mw_bind param;
1398 rpcrdma_map_one(ia, seg, writing);
1399 param.mr = ia->ri_bind_mem;
1400 param.wr_id = 0ULL; /* no send cookie */
1401 param.addr = seg->mr_dma;
1402 param.length = seg->mr_len;
1403 param.send_flags = 0;
1404 param.mw_access_flags = mem_priv;
1406 DECR_CQCOUNT(&r_xprt->rx_ep);
1407 rc = ib_bind_mw(ia->ri_id->qp,
1408 seg->mr_chunk.rl_mw->r.mw, &param);
1409 if (rc) {
1410 dprintk("RPC: %s: failed ib_bind_mw "
1411 "%u@0x%llx status %i\n",
1412 __func__, seg->mr_len,
1413 (unsigned long long)seg->mr_dma, rc);
1414 rpcrdma_unmap_one(ia, seg);
1415 } else {
1416 seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
1417 seg->mr_base = param.addr;
1418 seg->mr_nsegs = 1;
1419 nsegs = 1;
1422 break;
1424 /* Default registration each time */
1425 default:
1427 struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
1428 int len = 0;
1429 if (nsegs > RPCRDMA_MAX_DATA_SEGS)
1430 nsegs = RPCRDMA_MAX_DATA_SEGS;
1431 for (i = 0; i < nsegs;) {
1432 rpcrdma_map_one(ia, seg, writing);
1433 ipb[i].addr = seg->mr_dma;
1434 ipb[i].size = seg->mr_len;
1435 len += seg->mr_len;
1436 ++seg;
1437 ++i;
1438 /* Check for holes */
1439 if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
1440 offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
1441 break;
1443 nsegs = i;
1444 seg1->mr_base = seg1->mr_dma;
1445 seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
1446 ipb, nsegs, mem_priv, &seg1->mr_base);
1447 if (IS_ERR(seg1->mr_chunk.rl_mr)) {
1448 rc = PTR_ERR(seg1->mr_chunk.rl_mr);
1449 dprintk("RPC: %s: failed ib_reg_phys_mr "
1450 "%u@0x%llx (%d)... status %i\n",
1451 __func__, len,
1452 (unsigned long long)seg1->mr_dma, nsegs, rc);
1453 while (nsegs--)
1454 rpcrdma_unmap_one(ia, --seg);
1455 } else {
1456 seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
1457 seg1->mr_nsegs = nsegs;
1458 seg1->mr_len = len;
1461 break;
1463 if (rc)
1464 return -1;
1466 return nsegs;
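/*
 * Undo rpcrdma_register_external(). For memory windows an unbind is
 * posted: if "r" (a reply awaiting unbind) is supplied the unbind is
 * signaled and the reply's callback runs later from the completion
 * upcall; otherwise the callback is invoked here once deregistration
 * is complete. Returns the original segment count.
 */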
1470 rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
1471 struct rpcrdma_xprt *r_xprt, void *r)
1473 struct rpcrdma_ia *ia = &r_xprt->rx_ia;
1474 struct rpcrdma_mr_seg *seg1 = seg;
1475 int nsegs = seg->mr_nsegs, rc;
1477 switch (ia->ri_memreg_strategy) {
1479 #if RPCRDMA_PERSISTENT_REGISTRATION
1480 case RPCRDMA_ALLPHYSICAL:
1481 BUG_ON(nsegs != 1);
1482 rpcrdma_unmap_one(ia, seg);
1483 rc = 0;
1484 break;
1485 #endif
1487 case RPCRDMA_MTHCAFMR:
1489 LIST_HEAD(l);
1490 list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l);
1491 rc = ib_unmap_fmr(&l);
1492 while (seg1->mr_nsegs--)
1493 rpcrdma_unmap_one(ia, seg++);
1495 if (rc)
1496 dprintk("RPC: %s: failed ib_unmap_fmr,"
1497 " status %i\n", __func__, rc);
1498 break;
1500 case RPCRDMA_MEMWINDOWS_ASYNC:
1501 case RPCRDMA_MEMWINDOWS:
1503 struct ib_mw_bind param;
1504 BUG_ON(nsegs != 1);
1505 param.mr = ia->ri_bind_mem;
1506 param.addr = 0ULL; /* unbind */
1507 param.length = 0;
1508 param.mw_access_flags = 0;
1509 if (r) {
1510 param.wr_id = (u64) (unsigned long) r;
1511 param.send_flags = IB_SEND_SIGNALED;
1512 INIT_CQCOUNT(&r_xprt->rx_ep);
1513 } else {
1514 param.wr_id = 0ULL;
1515 param.send_flags = 0;
1516 DECR_CQCOUNT(&r_xprt->rx_ep);
1518 rc = ib_bind_mw(ia->ri_id->qp,
1519 seg->mr_chunk.rl_mw->r.mw, &param);
1520 rpcrdma_unmap_one(ia, seg);
1522 if (rc)
1523 dprintk("RPC: %s: failed ib_(un)bind_mw,"
1524 " status %i\n", __func__, rc);
1525 else
1526 r = NULL; /* will upcall on completion */
1527 break;
1529 default:
1530 rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
1531 seg1->mr_chunk.rl_mr = NULL;
1532 while (seg1->mr_nsegs--)
1533 rpcrdma_unmap_one(ia, seg++);
1534 if (rc)
1535 dprintk("RPC: %s: failed ib_dereg_mr,"
1536 " status %i\n", __func__, rc);
1537 break;
1539 if (r) {
1540 struct rpcrdma_rep *rep = r;
1541 void (*func)(struct rpcrdma_rep *) = rep->rr_func;
1542 rep->rr_func = NULL;
1543 func(rep); /* dereg done, callback now */
1545 return nsegs;
1549 * Prepost any receive buffer, then post send.
1551 * Receive buffer is donated to hardware, reclaimed upon recv completion.
1554 rpcrdma_ep_post(struct rpcrdma_ia *ia,
1555 struct rpcrdma_ep *ep,
1556 struct rpcrdma_req *req)
1558 struct ib_send_wr send_wr, *send_wr_fail;
1559 struct rpcrdma_rep *rep = req->rl_reply;
1560 int rc;
1562 if (rep) {
1563 rc = rpcrdma_ep_post_recv(ia, ep, rep);
1564 if (rc)
1565 goto out;
1566 req->rl_reply = NULL;
1569 send_wr.next = NULL;
1570 send_wr.wr_id = 0ULL; /* no send cookie */
1571 send_wr.sg_list = req->rl_send_iov;
1572 send_wr.num_sge = req->rl_niovs;
1573 send_wr.opcode = IB_WR_SEND;
1574 send_wr.imm_data = 0;
1575 if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
1576 ib_dma_sync_single_for_device(ia->ri_id->device,
1577 req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
1578 DMA_TO_DEVICE);
1579 ib_dma_sync_single_for_device(ia->ri_id->device,
1580 req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
1581 DMA_TO_DEVICE);
1582 ib_dma_sync_single_for_device(ia->ri_id->device,
1583 req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
1584 DMA_TO_DEVICE);
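	/* Send completions are normally left unsignaled; DECR_CQCOUNT /
	 * INIT_CQCOUNT arrange for roughly one signaled send every
	 * rep_cqinit posts, which is enough to let the provider reap
	 * its send queue. */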
1586 if (DECR_CQCOUNT(ep) > 0)
1587 send_wr.send_flags = 0;
1588 else { /* Provider must take a send completion every now and then */
1589 INIT_CQCOUNT(ep);
1590 send_wr.send_flags = IB_SEND_SIGNALED;
1593 rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
1594 if (rc)
1595 dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
1596 rc);
1597 out:
1598 return rc;
1602 * (Re)post a receive buffer.
1605 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
1606 struct rpcrdma_ep *ep,
1607 struct rpcrdma_rep *rep)
1609 struct ib_recv_wr recv_wr, *recv_wr_fail;
1610 int rc;
1612 recv_wr.next = NULL;
1613 recv_wr.wr_id = (u64) (unsigned long) rep;
1614 recv_wr.sg_list = &rep->rr_iov;
1615 recv_wr.num_sge = 1;
1617 ib_dma_sync_single_for_cpu(ia->ri_id->device,
1618 rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);
1620 DECR_CQCOUNT(ep);
1621 rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
1623 if (rc)
1624 dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
1625 rc);
1626 return rc;