net/sunrpc/xprtrdma/verbs.c (Linux 2.6.26-rc5)
/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 *      Redistribution and use in source and binary forms, with or without
 *      modification, are permitted provided that the following conditions
 *      are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * verbs.c
 *
 * Encapsulates the major functions managing:
 *  o adapters
 *  o endpoints
 *  o connections
 *  o buffer memory
 */

#include <linux/pci.h>	/* for Tavor hack below */

#include "xprt_rdma.h"

/*
 * Globals/Macros
 */

#ifdef RPC_DEBUG
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

/*
 * internal functions
 */

/*
 * handle replies in tasklet context, using a single, global list
 * rdma tasklet function -- just turn around and call the func
 * for all replies on the list
 */

static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
static LIST_HEAD(rpcrdma_tasklets_g);

static void
rpcrdma_run_tasklet(unsigned long data)
{
	struct rpcrdma_rep *rep;
	void (*func)(struct rpcrdma_rep *);
	unsigned long flags;

	data = data;
	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	while (!list_empty(&rpcrdma_tasklets_g)) {
		rep = list_entry(rpcrdma_tasklets_g.next,
				 struct rpcrdma_rep, rr_list);
		list_del(&rep->rr_list);
		func = rep->rr_func;
		rep->rr_func = NULL;
		spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);

		if (func)
			func(rep);
		else
			rpcrdma_recv_buffer_put(rep);

		spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	}
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
}

static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);

static inline void
rpcrdma_schedule_tasklet(struct rpcrdma_rep *rep)
{
	unsigned long flags;

	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	list_add_tail(&rep->rr_list, &rpcrdma_tasklets_g);
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
	tasklet_schedule(&rpcrdma_tasklet_g);
}
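/*
 * Note: a receive completion only queues the reply here; the reply's
 * rr_func (or rpcrdma_recv_buffer_put() when no handler is set) runs
 * later from rpcrdma_run_tasklet() in softirq context, not in the
 * interrupt-level completion upcall itself.
 */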

static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: QP error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static void
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	dprintk("RPC: %s: CQ error %X on device %s ep %p\n",
		__func__, event->event, event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static inline
void rpcrdma_event_process(struct ib_wc *wc)
{
	struct rpcrdma_rep *rep =
			(struct rpcrdma_rep *)(unsigned long) wc->wr_id;

	dprintk("RPC: %s: event rep %p status %X opcode %X length %u\n",
		__func__, rep, wc->status, wc->opcode, wc->byte_len);

	if (!rep) /* send or bind completion that we don't care about */
		return;

	if (IB_WC_SUCCESS != wc->status) {
		dprintk("RPC: %s: %s WC status %X, connection lost\n",
			__func__, (wc->opcode & IB_WC_RECV) ? "recv" : "send",
			wc->status);
		rep->rr_len = ~0U;
		rpcrdma_schedule_tasklet(rep);
		return;
	}

	switch (wc->opcode) {
	case IB_WC_RECV:
		rep->rr_len = wc->byte_len;
		ib_dma_sync_single_for_cpu(
			rdmab_to_ia(rep->rr_buffer)->ri_id->device,
			rep->rr_iov.addr, rep->rr_len, DMA_FROM_DEVICE);
		/* Keep (only) the most recent credits, after checking validity */
		if (rep->rr_len >= 16) {
			struct rpcrdma_msg *p =
					(struct rpcrdma_msg *) rep->rr_base;
			unsigned int credits = ntohl(p->rm_credit);
			if (credits == 0) {
				dprintk("RPC: %s: server"
					" dropped credits to 0!\n", __func__);
				/* don't deadlock */
				credits = 1;
			} else if (credits > rep->rr_buffer->rb_max_requests) {
				dprintk("RPC: %s: server"
					" over-crediting: %d (%d)\n",
					__func__, credits,
					rep->rr_buffer->rb_max_requests);
				credits = rep->rr_buffer->rb_max_requests;
			}
			atomic_set(&rep->rr_buffer->rb_credits, credits);
		}
		/* fall through */
	case IB_WC_BIND_MW:
		rpcrdma_schedule_tasklet(rep);
		break;
	default:
		dprintk("RPC: %s: unexpected WC event %X\n",
			__func__, wc->opcode);
		break;
	}
}
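/*
 * Flow-control note: each RPC/RDMA reply carries the server's current
 * credit grant in rm_credit. The value from the most recent reply
 * replaces rb_credits wholesale, clamped above to [1, rb_max_requests],
 * and the rest of the transport uses that count to bound the number of
 * in-flight requests.
 */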

static inline int
rpcrdma_cq_poll(struct ib_cq *cq)
{
	struct ib_wc wc;
	int rc;

	for (;;) {
		rc = ib_poll_cq(cq, 1, &wc);
		if (rc < 0) {
			dprintk("RPC: %s: ib_poll_cq failed %i\n",
				__func__, rc);
			return rc;
		}
		if (rc == 0)
			break;

		rpcrdma_event_process(&wc);
	}

	return 0;
}

/*
 * rpcrdma_cq_event_upcall
 *
 * This upcall handles recv, send, bind and unbind events.
 * It is reentrant but processes single events in order to maintain
 * ordering of receives to keep server credits.
 *
 * It is the responsibility of the scheduled tasklet to return
 * recv buffers to the pool. NOTE: this affects synchronization of
 * connection shutdown. That is, the structures required for
 * the completion of the reply handler must remain intact until
 * all memory has been reclaimed.
 *
 * Note that send events are suppressed and do not result in an upcall.
 */
static void
rpcrdma_cq_event_upcall(struct ib_cq *cq, void *context)
{
	int rc;

	rc = rpcrdma_cq_poll(cq);
	if (rc)
		return;

	rc = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed %i\n",
			__func__, rc);
		return;
	}

	rpcrdma_cq_poll(cq);
}
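/*
 * The second rpcrdma_cq_poll() call above is not redundant: completions
 * can arrive between the first poll draining the CQ and the
 * ib_req_notify_cq() that re-arms it, and those would otherwise wait
 * for the next interrupt. Polling once more after re-arming closes that
 * race window.
 */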

#ifdef RPC_DEBUG
static const char * const conn[] = {
	"address resolved",
	"address error",
	"route resolved",
	"route error",
	"connect request",
	"connect response",
	"connect error",
	"unreachable",
	"rejected",
	"established",
	"disconnected",
	"device removal"
};
#endif

static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
	struct sockaddr_in *addr = (struct sockaddr_in *) &ep->rep_remote_addr;
	struct ib_qp_attr attr;
	struct ib_qp_init_attr iattr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		ib_query_qp(ia->ri_id->qp, &attr,
			IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			&iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr.max_dest_rd_atomic, attr.max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %s: %u.%u.%u.%u:%u"
			" (ep 0x%p event 0x%x)\n",
			__func__,
			(event->event <= 11) ? conn[event->event] :
						"unknown connection error",
			NIPQUAD(addr->sin_addr.s_addr),
			ntohs(addr->sin_port),
			ep, event->event);
		atomic_set(&rpcx_to_rdmax(ep->rep_xprt)->rx_buf.rb_credits, 1);
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		ep->rep_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		break;
	default:
		ia->ri_async_rc = -EINVAL;
		dprintk("RPC: %s: unexpected CM event %X\n",
			__func__, event->event);
		complete(&ia->ri_done);
		break;
	}

	return 0;
}

static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
			struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = 0;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion(&ia->ri_done);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = 0;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion(&ia->ri_done);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Drain any cq, prior to teardown.
 */
static void
rpcrdma_clean_cq(struct ib_cq *cq)
{
	struct ib_wc wc;
	int count = 0;

	while (1 == ib_poll_cq(cq, 1, &wc))
		++count;

	if (count)
		dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
			__func__, count, wc.opcode);
}

/*
 * Exported functions.
 */

/*
 * Open and initialize an Interface Adapter.
 *  o initializes fields of struct rpcrdma_ia, including
 *    interface and provider attributes and protection zone.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	int rc;
	struct rpcrdma_ia *ia = &xprt->rx_ia;

	init_completion(&ia->ri_done);

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}

	ia->ri_pd = ib_alloc_pd(ia->ri_id->device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
			__func__, rc);
		goto out2;
	}

	/*
	 * Optionally obtain an underlying physical identity mapping in
	 * order to do a memory window-based bind. This base registration
	 * is protected from remote access - that is enabled only by binding
	 * for the specific bytes targeted during each RPC operation, and
	 * revoked after the corresponding completion similar to a storage
	 * adapter.
	 */
	if (memreg > RPCRDMA_REGISTER) {
		int mem_priv = IB_ACCESS_LOCAL_WRITE;
		switch (memreg) {
#if RPCRDMA_PERSISTENT_REGISTRATION
		case RPCRDMA_ALLPHYSICAL:
			mem_priv |= IB_ACCESS_REMOTE_WRITE;
			mem_priv |= IB_ACCESS_REMOTE_READ;
			break;
#endif
		case RPCRDMA_MEMWINDOWS_ASYNC:
		case RPCRDMA_MEMWINDOWS:
			mem_priv |= IB_ACCESS_MW_BIND;
			break;
		default:
			break;
		}
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
		if (IS_ERR(ia->ri_bind_mem)) {
			printk(KERN_ALERT "%s: ib_get_dma_mr for "
				"phys register failed with %lX\n\t"
				"Will continue with degraded performance\n",
				__func__, PTR_ERR(ia->ri_bind_mem));
			memreg = RPCRDMA_REGISTER;
			ia->ri_bind_mem = NULL;
		}
	}

	/* Else will do memory reg/dereg for each chunk */
	ia->ri_memreg_strategy = memreg;

	return 0;
out2:
	rdma_destroy_id(ia->ri_id);
out1:
	return rc;
}

/*
 * Clean up/close an IA.
 *   o if event handles and PD have been initialized, free them.
 *   o close the IA
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_bind_mem != NULL) {
		rc = ib_dereg_mr(ia->ri_bind_mem);
		dprintk("RPC: %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id) && ia->ri_id->qp)
		rdma_destroy_qp(ia->ri_id);
	if (ia->ri_pd != NULL && !IS_ERR(ia->ri_pd)) {
		rc = ib_dealloc_pd(ia->ri_pd);
		dprintk("RPC: %s: ib_dealloc_pd returned %i\n",
			__func__, rc);
	}
	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id))
		rdma_destroy_id(ia->ri_id);
}

/*
 * Create unconnected endpoint.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
				struct rpcrdma_create_data_internal *cdata)
{
	struct ib_device_attr devattr;
	int rc, err;

	rc = ib_query_device(ia->ri_id->device, &devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		return rc;
	}

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > devattr.max_qp_wr)
		cdata->max_requests = devattr.max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	/* send_cq and recv_cq initialized below */
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/* Add room for mw_binds+unbinds - overkill! */
		ep->rep_attr.cap.max_send_wr++;
		ep->rep_attr.cap.max_send_wr *= (2 * RPCRDMA_MAX_SEGS);
		if (ep->rep_attr.cap.max_send_wr > devattr.max_qp_wr)
			return -EINVAL;
		break;
	default:
		break;
	}
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 /* - 1*/;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		ep->rep_cqinit -= RPCRDMA_MAX_SEGS;
		break;
	default:
		break;
	}
	if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;
	INIT_CQCOUNT(ep);
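	/*
	 * The countdown initialized above is decremented once per posted
	 * work request (DECR_CQCOUNT in xprt_rdma.h); when it runs out,
	 * rpcrdma_ep_post() marks the next send IB_SEND_SIGNALED and
	 * resets the count, so only an occasional send generates a
	 * completion. The memory-window strategies reserve extra headroom
	 * for their bind/unbind work requests.
	 */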
	ep->rep_ia = ia;
	init_waitqueue_head(&ep->rep_connect_wait);

	/*
	 * Create a single cq for receive dto and mw_bind (only ever
	 * care about unbind, really). Send completions are suppressed.
	 * Use single threaded tasklet upcalls to maintain ordering.
	 */
	ep->rep_cq = ib_create_cq(ia->ri_id->device, rpcrdma_cq_event_upcall,
				  rpcrdma_cq_async_error_upcall, NULL,
				  ep->rep_attr.cap.max_recv_wr +
				  ep->rep_attr.cap.max_send_wr + 1, 0);
	if (IS_ERR(ep->rep_cq)) {
		rc = PTR_ERR(ep->rep_cq);
		dprintk("RPC: %s: ib_create_cq failed: %i\n",
			__func__, rc);
		goto out1;
	}

	rc = ib_req_notify_cq(ep->rep_cq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		goto out2;
	}

	ep->rep_attr.send_cq = ep->rep_cq;
	ep->rep_attr.recv_cq = ep->rep_cq;

	/* Initialize cma parameters */

	/* RPC/RDMA does not use private data */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client offers RDMA Read but does not initiate */
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_BOUNCEBUFFERS:
		ep->rep_remote_cma.responder_resources = 0;
		break;
	case RPCRDMA_MTHCAFMR:
	case RPCRDMA_REGISTER:
		ep->rep_remote_cma.responder_resources = cdata->max_requests *
				(RPCRDMA_MAX_DATA_SEGS / 8);
		break;
	case RPCRDMA_MEMWINDOWS:
	case RPCRDMA_MEMWINDOWS_ASYNC:
#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
#endif
		ep->rep_remote_cma.responder_resources = cdata->max_requests *
				(RPCRDMA_MAX_DATA_SEGS / 2);
		break;
	default:
		break;
	}
	if (ep->rep_remote_cma.responder_resources > devattr.max_qp_rd_atom)
		ep->rep_remote_cma.responder_resources = devattr.max_qp_rd_atom;
	ep->rep_remote_cma.initiator_depth = 0;

	ep->rep_remote_cma.retry_count = 7;
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	err = ib_destroy_cq(ep->rep_cq);
	if (err)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, err);
out1:
	return rc;
}

/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 *
 * The caller's error handling must be sure to not leak the endpoint
 * if this function fails.
 */
int
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	if (ia->ri_id->qp) {
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" returned %i\n", __func__, rc);
	}

	ep->rep_func = NULL;

	/* padding - could be done in rpcrdma_buffer_destroy... */
	if (ep->rep_pad_mr) {
		rpcrdma_deregister_internal(ia, ep->rep_pad_mr, &ep->rep_pad);
		ep->rep_pad_mr = NULL;
	}

	if (ia->ri_id->qp) {
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	rpcrdma_clean_cq(ep->rep_cq);
	rc = ib_destroy_cq(ep->rep_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	return rc;
}

/*
 * Connect unconnected endpoint.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id;
	int rc = 0;
	int retry_count = 0;
	int reconnect = (ep->rep_connected != 0);

	if (reconnect) {
		struct rpcrdma_xprt *xprt;
retry:
		rc = rpcrdma_ep_disconnect(ep, ia);
		if (rc && rc != -ENOTCONN)
			dprintk("RPC: %s: rpcrdma_ep_disconnect"
				" status %i\n", __func__, rc);
		rpcrdma_clean_cq(ep->rep_cq);

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = PTR_ERR(id);
			goto out;
		}
		/* TEMP TEMP TEMP - fail if new device:
		 * Deregister/remarshal *all* requests!
		 * Close and recreate adapter, pd, etc!
		 * Re-determine all attributes still sane!
		 * More stuff I haven't thought of!
		 * Rrrgh!
		 */
		if (ia->ri_id->device != id->device) {
			printk("RPC: %s: can't reconnect on "
				"different device!\n", __func__);
			rdma_destroy_id(id);
			rc = -ENETDOWN;
			goto out;
		}
		/* END TEMP */
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = id;
	}

	rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
	if (rc) {
		dprintk("RPC: %s: rdma_create_qp failed %i\n",
			__func__, rc);
		goto out;
	}

	/* XXX Tavor device performs badly with 2K MTU! */
	if (strnicmp(ia->ri_id->device->dma_device->bus->name, "pci", 3) == 0) {
		struct pci_dev *pcid = to_pci_dev(ia->ri_id->device->dma_device);
		if (pcid->device == PCI_DEVICE_ID_MELLANOX_TAVOR &&
		    (pcid->vendor == PCI_VENDOR_ID_MELLANOX ||
		     pcid->vendor == PCI_VENDOR_ID_TOPSPIN)) {
			struct ib_qp_attr attr = {
				.path_mtu = IB_MTU_1024
			};
			rc = ib_modify_qp(ia->ri_id->qp, &attr, IB_QP_PATH_MTU);
		}
	}

	/* Theoretically a client initiator_depth > 0 is not needed,
	 * but many peers fail to complete the connection unless they
	 * == responder_resources! */
	if (ep->rep_remote_cma.initiator_depth !=
				ep->rep_remote_cma.responder_resources)
		ep->rep_remote_cma.initiator_depth =
			ep->rep_remote_cma.responder_resources;

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
			__func__, rc);
		goto out;
	}

	if (reconnect)
		return 0;

	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/*
	 * Check state. A non-peer reject indicates no listener
	 * (ECONNREFUSED), which may be a transient state. All
	 * others indicate a transport condition which has already
	 * undergone a best-effort attempt.
	 */
	if (ep->rep_connected == -ECONNREFUSED
	    && ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Sometimes, the only way to reliably connect to remote
		 * CMs is to use same nonzero values for ORD and IRD. */
		ep->rep_remote_cma.initiator_depth =
			ep->rep_remote_cma.responder_resources;
		if (ep->rep_remote_cma.initiator_depth == 0)
			++ep->rep_remote_cma.initiator_depth;
		if (ep->rep_remote_cma.responder_resources == 0)
			++ep->rep_remote_cma.responder_resources;
		if (retry_count++ == 0)
			goto retry;
		rc = ep->rep_connected;
	} else {
		dprintk("RPC: %s: connected\n", __func__);
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}

/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
int
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	rpcrdma_clean_cq(ep->rep_cq);
	rc = rdma_disconnect(ia->ri_id);
	if (!rc) {
		/* returns without wait if not connected */
		wait_event_interruptible(ep->rep_connect_wait,
						ep->rep_connected != 1);
		dprintk("RPC: %s: after wait, %sconnected\n", __func__,
			(ep->rep_connected == 1) ? "still " : "dis");
	} else {
		dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
		ep->rep_connected = rc;
	}
	return rc;
}

/*
 * Initialize buffer memory
 */
int
rpcrdma_buffer_create(struct rpcrdma_buffer *buf, struct rpcrdma_ep *ep,
	struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata)
{
	char *p;
	size_t len;
	int i, rc;

	buf->rb_max_requests = cdata->max_requests;
	spin_lock_init(&buf->rb_lock);
	atomic_set(&buf->rb_credits, 1);

	/* Need to allocate:
	 *   1.  arrays for send and recv pointers
	 *   2.  arrays of struct rpcrdma_req to fill in pointers
	 *   3.  array of struct rpcrdma_rep for replies
	 *   4.  padding, if any
	 *   5.  mw's, if any
	 * Send/recv buffers in req/rep need to be registered
	 */

	len = buf->rb_max_requests *
		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
	len += cdata->padding;
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MTHCAFMR:
		/* TBD we are perhaps overallocating here */
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		len += (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS *
				sizeof(struct rpcrdma_mw);
		break;
	default:
		break;
	}

	/* allocate 1, 4 and 5 in one shot */
	p = kzalloc(len, GFP_KERNEL);
	if (p == NULL) {
		dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
			__func__, len);
		rc = -ENOMEM;
		goto out;
	}
	buf->rb_pool = p;	/* for freeing it later */

	buf->rb_send_bufs = (struct rpcrdma_req **) p;
	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];

	/*
	 * Register the zeroed pad buffer, if any.
	 */
	if (cdata->padding) {
		rc = rpcrdma_register_internal(ia, p, cdata->padding,
					    &ep->rep_pad_mr, &ep->rep_pad);
		if (rc)
			goto out;
	}
	p += cdata->padding;

	/*
	 * Allocate the fmr's, or mw's for mw_bind chunk registration.
	 * We "cycle" the mw's in order to minimize rkey reuse,
	 * and also reduce unbind-to-bind collision.
	 */
	INIT_LIST_HEAD(&buf->rb_mws);
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MTHCAFMR:
		{
		struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
		struct ib_fmr_attr fa = {
			RPCRDMA_MAX_DATA_SEGS, 1, PAGE_SHIFT
		};
		/* TBD we are perhaps overallocating here */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.fmr = ib_alloc_fmr(ia->ri_pd,
				IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ,
				&fa);
			if (IS_ERR(r->r.fmr)) {
				rc = PTR_ERR(r->r.fmr);
				dprintk("RPC: %s: ib_alloc_fmr"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		}
		break;
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		{
		struct rpcrdma_mw *r = (struct rpcrdma_mw *)p;
		/* Allocate one extra request's worth, for full cycling */
		for (i = (buf->rb_max_requests+1) * RPCRDMA_MAX_SEGS; i; i--) {
			r->r.mw = ib_alloc_mw(ia->ri_pd);
			if (IS_ERR(r->r.mw)) {
				rc = PTR_ERR(r->r.mw);
				dprintk("RPC: %s: ib_alloc_mw"
					" failed %i\n", __func__, rc);
				goto out;
			}
			list_add(&r->mw_list, &buf->rb_mws);
			++r;
		}
		}
		break;
	default:
		break;
	}

	/*
	 * Allocate/init the request/reply buffers. Doing this
	 * using kmalloc for now -- one for each buf.
	 */
	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;
		struct rpcrdma_rep *rep;

		len = cdata->inline_wsize + sizeof(struct rpcrdma_req);
		/* RPC layer requests *double* size + 1K RPC_SLACK_SPACE! */
		/* Typical ~2400b, so rounding up saves work later */
		if (len < 4096)
			len = 4096;
		req = kmalloc(len, GFP_KERNEL);
		if (req == NULL) {
			dprintk("RPC: %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(req, 0, sizeof(struct rpcrdma_req));
		buf->rb_send_bufs[i] = req;
		buf->rb_send_bufs[i]->rl_buffer = buf;

		rc = rpcrdma_register_internal(ia, req->rl_base,
				len - offsetof(struct rpcrdma_req, rl_base),
				&buf->rb_send_bufs[i]->rl_handle,
				&buf->rb_send_bufs[i]->rl_iov);
		if (rc)
			goto out;

		buf->rb_send_bufs[i]->rl_size = len-sizeof(struct rpcrdma_req);

		len = cdata->inline_rsize + sizeof(struct rpcrdma_rep);
		rep = kmalloc(len, GFP_KERNEL);
		if (rep == NULL) {
			dprintk("RPC: %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = -ENOMEM;
			goto out;
		}
		memset(rep, 0, sizeof(struct rpcrdma_rep));
		buf->rb_recv_bufs[i] = rep;
		buf->rb_recv_bufs[i]->rr_buffer = buf;
		init_waitqueue_head(&rep->rr_unbind);

		rc = rpcrdma_register_internal(ia, rep->rr_base,
				len - offsetof(struct rpcrdma_rep, rr_base),
				&buf->rb_recv_bufs[i]->rr_handle,
				&buf->rb_recv_bufs[i]->rr_iov);
		if (rc)
			goto out;

	}
	dprintk("RPC: %s: max_requests %d\n",
		__func__, buf->rb_max_requests);
	/* done */
	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}

/*
 * Unregister and destroy buffer memory. Need to deal with
 * partial initialization, so it's callable from failed create.
 * Must be called before destroying endpoint, as registrations
 * reference it.
 */
void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	int rc, i;
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);

	/* clean up in reverse order from create
	 *   1.  recv mr memory (mr free, then kfree)
	 *   1a. bind mw memory
	 *   2.  send mr memory (mr free, then kfree)
	 *   3.  padding (if any) [moved to rpcrdma_ep_destroy]
	 *   4.  arrays
	 */
	dprintk("RPC: %s: entering\n", __func__);

	for (i = 0; i < buf->rb_max_requests; i++) {
		if (buf->rb_recv_bufs && buf->rb_recv_bufs[i]) {
			rpcrdma_deregister_internal(ia,
					buf->rb_recv_bufs[i]->rr_handle,
					&buf->rb_recv_bufs[i]->rr_iov);
			kfree(buf->rb_recv_bufs[i]);
		}
		if (buf->rb_send_bufs && buf->rb_send_bufs[i]) {
			while (!list_empty(&buf->rb_mws)) {
				struct rpcrdma_mw *r;
				r = list_entry(buf->rb_mws.next,
					struct rpcrdma_mw, mw_list);
				list_del(&r->mw_list);
				switch (ia->ri_memreg_strategy) {
				case RPCRDMA_MTHCAFMR:
					rc = ib_dealloc_fmr(r->r.fmr);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dealloc_fmr"
							" failed %i\n",
							__func__, rc);
					break;
				case RPCRDMA_MEMWINDOWS_ASYNC:
				case RPCRDMA_MEMWINDOWS:
					rc = ib_dealloc_mw(r->r.mw);
					if (rc)
						dprintk("RPC: %s:"
							" ib_dealloc_mw"
							" failed %i\n",
							__func__, rc);
					break;
				default:
					break;
				}
			}
			rpcrdma_deregister_internal(ia,
					buf->rb_send_bufs[i]->rl_handle,
					&buf->rb_send_bufs[i]->rl_iov);
			kfree(buf->rb_send_bufs[i]);
		}
	}

	kfree(buf->rb_pool);
}

/*
 * Get a set of request/reply buffers.
 *
 * Reply buffer (if needed) is attached to send buffer upon return.
 * Rule:
 *    rb_send_index and rb_recv_index MUST always be pointing to the
 *    *next* available buffer (non-NULL). They are incremented after
 *    removing buffers, and decremented *before* returning them.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;
	unsigned long flags;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_send_index == buffers->rb_max_requests) {
		spin_unlock_irqrestore(&buffers->rb_lock, flags);
		dprintk("RPC: %s: out of request buffers\n", __func__);
		return ((struct rpcrdma_req *)NULL);
	}

	req = buffers->rb_send_bufs[buffers->rb_send_index];
	if (buffers->rb_send_index < buffers->rb_recv_index) {
		dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
			__func__,
			buffers->rb_recv_index - buffers->rb_send_index);
		req->rl_reply = NULL;
	} else {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
	if (!list_empty(&buffers->rb_mws)) {
		int i = RPCRDMA_MAX_SEGS - 1;
		do {
			struct rpcrdma_mw *r;
			r = list_entry(buffers->rb_mws.next,
					struct rpcrdma_mw, mw_list);
			list_del(&r->mw_list);
			req->rl_segments[i].mr_chunk.rl_mw = r;
		} while (--i >= 0);
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
	return req;
}

/*
 * Put request/reply buffers back into pool.
 * Pre-decrement counter/array index.
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	struct rpcrdma_ia *ia = rdmab_to_ia(buffers);
	int i;
	unsigned long flags;

	BUG_ON(req->rl_nchunks != 0);
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_send_bufs[--buffers->rb_send_index] = req;
	req->rl_niovs = 0;
	if (req->rl_reply) {
		buffers->rb_recv_bufs[--buffers->rb_recv_index] = req->rl_reply;
		init_waitqueue_head(&req->rl_reply->rr_unbind);
		req->rl_reply->rr_func = NULL;
		req->rl_reply = NULL;
	}
	switch (ia->ri_memreg_strategy) {
	case RPCRDMA_MTHCAFMR:
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		/*
		 * Cycle mw's back in reverse order, and "spin" them.
		 * This delays and scrambles reuse as much as possible.
		 */
		i = 1;
		do {
			struct rpcrdma_mw **mw;
			mw = &req->rl_segments[i].mr_chunk.rl_mw;
			list_add_tail(&(*mw)->mw_list, &buffers->rb_mws);
			*mw = NULL;
		} while (++i < RPCRDMA_MAX_SEGS);
		list_add_tail(&req->rl_segments[0].mr_chunk.rl_mw->mw_list,
					&buffers->rb_mws);
		req->rl_segments[0].mr_chunk.rl_mw = NULL;
		break;
	default:
		break;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Recover reply buffers from pool.
 * This happens when recovering from error conditions.
 * Post-increment counter/array index.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	unsigned long flags;

	if (req->rl_iov.length == 0)	/* special case xprt_rdma_allocate() */
		buffers = ((struct rpcrdma_req *) buffers)->rl_buffer;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_recv_index < buffers->rb_max_requests) {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions, and when
 * aborting unbinds. Pre-decrement counter/array index.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *buffers = rep->rr_buffer;
	unsigned long flags;

	rep->rr_func = NULL;
	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
 */

int
rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
				struct ib_mr **mrp, struct ib_sge *iov)
{
	struct ib_phys_buf ipb;
	struct ib_mr *mr;
	int rc;

	/*
	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
	 */
	iov->addr = ib_dma_map_single(ia->ri_id->device,
			va, len, DMA_BIDIRECTIONAL);
	iov->length = len;

	if (ia->ri_bind_mem != NULL) {
		*mrp = NULL;
		iov->lkey = ia->ri_bind_mem->lkey;
		return 0;
	}

	ipb.addr = iov->addr;
	ipb.size = iov->length;
	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
			IB_ACCESS_LOCAL_WRITE, &iov->addr);

	dprintk("RPC: %s: phys convert: 0x%llx "
			"registered 0x%llx length %d\n",
			__func__, (unsigned long long)ipb.addr,
			(unsigned long long)iov->addr, len);

	if (IS_ERR(mr)) {
		*mrp = NULL;
		rc = PTR_ERR(mr);
		dprintk("RPC: %s: failed with %i\n", __func__, rc);
	} else {
		*mrp = mr;
		iov->lkey = mr->lkey;
		rc = 0;
	}

	return rc;
}

int
rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
				struct ib_mr *mr, struct ib_sge *iov)
{
	int rc;

	ib_dma_unmap_single(ia->ri_id->device,
			iov->addr, iov->length, DMA_BIDIRECTIONAL);

	if (NULL == mr)
		return 0;

	rc = ib_dereg_mr(mr);
	if (rc)
		dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
	return rc;
}

/*
 * Wrappers for chunk registration, shared by read/write chunk code.
 */

static void
rpcrdma_map_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg, int writing)
{
	seg->mr_dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
	seg->mr_dmalen = seg->mr_len;
	if (seg->mr_page)
		seg->mr_dma = ib_dma_map_page(ia->ri_id->device,
				seg->mr_page, offset_in_page(seg->mr_offset),
				seg->mr_dmalen, seg->mr_dir);
	else
		seg->mr_dma = ib_dma_map_single(ia->ri_id->device,
				seg->mr_offset,
				seg->mr_dmalen, seg->mr_dir);
}

static void
rpcrdma_unmap_one(struct rpcrdma_ia *ia, struct rpcrdma_mr_seg *seg)
{
	if (seg->mr_page)
		ib_dma_unmap_page(ia->ri_id->device,
				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
	else
		ib_dma_unmap_single(ia->ri_id->device,
				seg->mr_dma, seg->mr_dmalen, seg->mr_dir);
}

int
rpcrdma_register_external(struct rpcrdma_mr_seg *seg,
			int nsegs, int writing, struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	int mem_priv = (writing ? IB_ACCESS_REMOTE_WRITE :
				  IB_ACCESS_REMOTE_READ);
	struct rpcrdma_mr_seg *seg1 = seg;
	int i;
	int rc = 0;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		rpcrdma_map_one(ia, seg, writing);
		seg->mr_rkey = ia->ri_bind_mem->rkey;
		seg->mr_base = seg->mr_dma;
		seg->mr_nsegs = 1;
		nsegs = 1;
		break;
#endif

	/* Registration using fast memory registration */
	case RPCRDMA_MTHCAFMR:
		{
		u64 physaddrs[RPCRDMA_MAX_DATA_SEGS];
		int len, pageoff = offset_in_page(seg->mr_offset);
		seg1->mr_offset -= pageoff;	/* start of page */
		seg1->mr_len += pageoff;
		len = -pageoff;
		if (nsegs > RPCRDMA_MAX_DATA_SEGS)
			nsegs = RPCRDMA_MAX_DATA_SEGS;
		for (i = 0; i < nsegs;) {
			rpcrdma_map_one(ia, seg, writing);
			physaddrs[i] = seg->mr_dma;
			len += seg->mr_len;
			++seg;
			++i;
			/* Check for holes */
			if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
			    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
				break;
		}
		nsegs = i;
		rc = ib_map_phys_fmr(seg1->mr_chunk.rl_mw->r.fmr,
					physaddrs, nsegs, seg1->mr_dma);
		if (rc) {
			dprintk("RPC: %s: failed ib_map_phys_fmr "
				"%u@0x%llx+%i (%d)... status %i\n", __func__,
				len, (unsigned long long)seg1->mr_dma,
				pageoff, nsegs, rc);
			while (nsegs--)
				rpcrdma_unmap_one(ia, --seg);
		} else {
			seg1->mr_rkey = seg1->mr_chunk.rl_mw->r.fmr->rkey;
			seg1->mr_base = seg1->mr_dma + pageoff;
			seg1->mr_nsegs = nsegs;
			seg1->mr_len = len;
		}
		}
		break;

	/* Registration using memory windows */
	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		{
		struct ib_mw_bind param;
		rpcrdma_map_one(ia, seg, writing);
		param.mr = ia->ri_bind_mem;
		param.wr_id = 0ULL;	/* no send cookie */
		param.addr = seg->mr_dma;
		param.length = seg->mr_len;
		param.send_flags = 0;
		param.mw_access_flags = mem_priv;

		DECR_CQCOUNT(&r_xprt->rx_ep);
		rc = ib_bind_mw(ia->ri_id->qp,
					seg->mr_chunk.rl_mw->r.mw, &param);
		if (rc) {
			dprintk("RPC: %s: failed ib_bind_mw "
				"%u@0x%llx status %i\n",
				__func__, seg->mr_len,
				(unsigned long long)seg->mr_dma, rc);
			rpcrdma_unmap_one(ia, seg);
		} else {
			seg->mr_rkey = seg->mr_chunk.rl_mw->r.mw->rkey;
			seg->mr_base = param.addr;
			seg->mr_nsegs = 1;
			nsegs = 1;
		}
		}
		break;

	/* Default registration each time */
	default:
		{
		struct ib_phys_buf ipb[RPCRDMA_MAX_DATA_SEGS];
		int len = 0;
		if (nsegs > RPCRDMA_MAX_DATA_SEGS)
			nsegs = RPCRDMA_MAX_DATA_SEGS;
		for (i = 0; i < nsegs;) {
			rpcrdma_map_one(ia, seg, writing);
			ipb[i].addr = seg->mr_dma;
			ipb[i].size = seg->mr_len;
			len += seg->mr_len;
			++seg;
			++i;
			/* Check for holes */
			if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
			    offset_in_page((seg-1)->mr_offset+(seg-1)->mr_len))
				break;
		}
		nsegs = i;
		seg1->mr_base = seg1->mr_dma;
		seg1->mr_chunk.rl_mr = ib_reg_phys_mr(ia->ri_pd,
					ipb, nsegs, mem_priv, &seg1->mr_base);
		if (IS_ERR(seg1->mr_chunk.rl_mr)) {
			rc = PTR_ERR(seg1->mr_chunk.rl_mr);
			dprintk("RPC: %s: failed ib_reg_phys_mr "
				"%u@0x%llx (%d)... status %i\n",
				__func__, len,
				(unsigned long long)seg1->mr_dma, nsegs, rc);
			while (nsegs--)
				rpcrdma_unmap_one(ia, --seg);
		} else {
			seg1->mr_rkey = seg1->mr_chunk.rl_mr->rkey;
			seg1->mr_nsegs = nsegs;
			seg1->mr_len = len;
		}
		}
		break;
	}
	if (rc)
		return -1;

	return nsegs;
}
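/*
 * Return convention above: on success the number of segments actually
 * covered by the registration is returned (contiguous segments are
 * coalesced until a page-alignment "hole" is hit); on any verb failure
 * the mappings are undone and -1 is returned. The chunk-building code
 * in rpc_rdma.c is expected to advance by the returned segment count.
 */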

int
rpcrdma_deregister_external(struct rpcrdma_mr_seg *seg,
		struct rpcrdma_xprt *r_xprt, void *r)
{
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	struct rpcrdma_mr_seg *seg1 = seg;
	int nsegs = seg->mr_nsegs, rc;

	switch (ia->ri_memreg_strategy) {

#if RPCRDMA_PERSISTENT_REGISTRATION
	case RPCRDMA_ALLPHYSICAL:
		BUG_ON(nsegs != 1);
		rpcrdma_unmap_one(ia, seg);
		rc = 0;
		break;
#endif

	case RPCRDMA_MTHCAFMR:
		{
		LIST_HEAD(l);
		list_add(&seg->mr_chunk.rl_mw->r.fmr->list, &l);
		rc = ib_unmap_fmr(&l);
		while (seg1->mr_nsegs--)
			rpcrdma_unmap_one(ia, seg++);
		}
		if (rc)
			dprintk("RPC: %s: failed ib_unmap_fmr,"
				" status %i\n", __func__, rc);
		break;

	case RPCRDMA_MEMWINDOWS_ASYNC:
	case RPCRDMA_MEMWINDOWS:
		{
		struct ib_mw_bind param;
		BUG_ON(nsegs != 1);
		param.mr = ia->ri_bind_mem;
		param.addr = 0ULL;	/* unbind */
		param.length = 0;
		param.mw_access_flags = 0;
		if (r) {
			param.wr_id = (u64) (unsigned long) r;
			param.send_flags = IB_SEND_SIGNALED;
			INIT_CQCOUNT(&r_xprt->rx_ep);
		} else {
			param.wr_id = 0ULL;
			param.send_flags = 0;
			DECR_CQCOUNT(&r_xprt->rx_ep);
		}
		rc = ib_bind_mw(ia->ri_id->qp,
				seg->mr_chunk.rl_mw->r.mw, &param);
		rpcrdma_unmap_one(ia, seg);
		}
		if (rc)
			dprintk("RPC: %s: failed ib_(un)bind_mw,"
				" status %i\n", __func__, rc);
		else
			r = NULL;	/* will upcall on completion */
		break;

	default:
		rc = ib_dereg_mr(seg1->mr_chunk.rl_mr);
		seg1->mr_chunk.rl_mr = NULL;
		while (seg1->mr_nsegs--)
			rpcrdma_unmap_one(ia, seg++);
		if (rc)
			dprintk("RPC: %s: failed ib_dereg_mr,"
				" status %i\n", __func__, rc);
		break;
	}
	if (r) {
		struct rpcrdma_rep *rep = r;
		void (*func)(struct rpcrdma_rep *) = rep->rr_func;
		rep->rr_func = NULL;
		func(rep);	/* dereg done, callback now */
	}
	return nsegs;
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	int rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;
	}

	send_wr.next = NULL;
	send_wr.wr_id = 0ULL;	/* no send cookie */
	send_wr.sg_list = req->rl_send_iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;
	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
		ib_dma_sync_single_for_device(ia->ri_id->device,
			req->rl_send_iov[3].addr, req->rl_send_iov[3].length,
			DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[1].addr, req->rl_send_iov[1].length,
		DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_id->device,
		req->rl_send_iov[0].addr, req->rl_send_iov[0].length,
		DMA_TO_DEVICE);

	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else { /* Provider must take a send completion every now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}
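	/*
	 * Unsignaled sends still occupy send-queue slots until a later
	 * signaled completion lets the provider retire them, so a signaled
	 * send is forced here whenever the CQ countdown is exhausted (see
	 * the rep_cqinit setup in rpcrdma_ep_create).
	 */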
	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}

/*
 * (Re)post a receive buffer.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
		     struct rpcrdma_ep *ep,
		     struct rpcrdma_rep *rep)
{
	struct ib_recv_wr recv_wr, *recv_wr_fail;
	int rc;

	recv_wr.next = NULL;
	recv_wr.wr_id = (u64) (unsigned long) rep;
	recv_wr.sg_list = &rep->rr_iov;
	recv_wr.num_sge = 1;

	ib_dma_sync_single_for_cpu(ia->ri_id->device,
		rep->rr_iov.addr, rep->rr_iov.length, DMA_BIDIRECTIONAL);

	DECR_CQCOUNT(ep);
	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);

	if (rc)
		dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
			rc);
	return rc;
}