2 * Copyright (c) 2016 Oracle. All rights reserved.
3 * Copyright (c) 2014 Open Grid Computing, Inc. All rights reserved.
4 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
6 * This software is available to you under a choice of one of two
7 * licenses. You may choose to be licensed under the terms of the GNU
8 * General Public License (GPL) Version 2, available from the file
9 * COPYING in the main directory of this source tree, or the BSD-type
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
16 * Redistributions of source code must retain the above copyright
17 * notice, this list of conditions and the following disclaimer.
19 * Redistributions in binary form must reproduce the above
20 * copyright notice, this list of conditions and the following
21 * disclaimer in the documentation and/or other materials provided
22 * with the distribution.
24 * Neither the name of the Network Appliance, Inc. nor the names of
25 * its contributors may be used to endorse or promote products
26 * derived from this software without specific prior written
29 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
30 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
31 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
32 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
33 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
34 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
35 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
36 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
37 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
38 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
39 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 * Author: Tom Tucker <tom@opengridcomputing.com>
46 * The main entry point is svc_rdma_sendto. This is called by the
47 * RPC server when an RPC Reply is ready to be transmitted to a client.
49 * The passed-in svc_rqst contains a struct xdr_buf which holds an
50 * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA
51 * transport header, post all Write WRs needed for this Reply, then post
52 * a Send WR conveying the transport header and the RPC message itself to
55 * svc_rdma_sendto must fully transmit the Reply before returning, as
56 * the svc_rqst will be recycled as soon as sendto returns. Remaining
57 * resources referred to by the svc_rqst are also recycled at that time.
58 * Therefore any resources that must remain longer must be detached
59 * from the svc_rqst and released later.
63 * The I/O that performs Reply transmission is asynchronous, and may
64 * complete well after sendto returns. Thus pages under I/O must be
65 * removed from the svc_rqst before sendto returns.
67 * The logic here depends on Send Queue and completion ordering. Since
68 * the Send WR is always posted last, it will always complete last. Thus
69 * when it completes, it is guaranteed that all previous Write WRs have
72 * Write WRs are constructed and posted. Each Write segment gets its own
73 * svc_rdma_rw_ctxt, allowing the Write completion handler to find and
74 * DMA-unmap the pages under I/O for that Write segment. The Write
75 * completion handler does not release any pages.
77 * When the Send WR is constructed, it also gets its own svc_rdma_op_ctxt.
78 * The ownership of all of the Reply's pages is transferred into that
79 * ctxt, the Send WR is posted, and sendto returns.
81 * The svc_rdma_op_ctxt is presented when the Send WR completes. The
82 * Send completion handler finally releases the Reply's pages.
84 * This mechanism also assumes that completions on the transport's Send
85 * Completion Queue do not run in parallel. Otherwise a Write completion
86 * and Send completion running at the same time could release pages that
87 * are still DMA-mapped.
91 * - If the Send WR is posted successfully, it will either complete
92 * successfully, or get flushed. Either way, the Send completion
93 * handler releases the Reply's pages.
94 * - If the Send WR cannot be posted, the forward path releases
97 * This handles the case, without the use of page reference counting,
98 * where two different Write segments send portions of the same page.
101 #include <linux/sunrpc/debug.h>
102 #include <linux/sunrpc/rpc_rdma.h>
103 #include <linux/spinlock.h>
104 #include <asm/unaligned.h>
105 #include <rdma/ib_verbs.h>
106 #include <rdma/rdma_cm.h>
107 #include <linux/sunrpc/svc_rdma.h>
109 #define RPCDBG_FACILITY RPCDBG_SVCXPRT
111 static u32
xdr_padsize(u32 len
)
113 return (len
& 3) ? (4 - (len
& 3)) : 0;
116 /* Returns length of transport header, in bytes.
118 static unsigned int svc_rdma_reply_hdr_len(__be32
*rdma_resp
)
125 /* RPC-over-RDMA V1 replies never have a Read list. */
126 p
+= rpcrdma_fixed_maxsz
+ 1;
128 /* Skip Write list. */
129 while (*p
++ != xdr_zero
) {
130 nsegs
= be32_to_cpup(p
++);
131 p
+= nsegs
* rpcrdma_segment_maxsz
;
134 /* Skip Reply chunk. */
135 if (*p
++ != xdr_zero
) {
136 nsegs
= be32_to_cpup(p
++);
137 p
+= nsegs
* rpcrdma_segment_maxsz
;
140 return (unsigned long)p
- (unsigned long)rdma_resp
;
143 /* One Write chunk is copied from Call transport header to Reply
144 * transport header. Each segment's length field is updated to
145 * reflect number of bytes consumed in the segment.
147 * Returns number of segments in this chunk.
149 static unsigned int xdr_encode_write_chunk(__be32
*dst
, __be32
*src
,
150 unsigned int remaining
)
152 unsigned int i
, nsegs
;
155 /* Write list discriminator */
158 /* number of segments in this chunk */
159 nsegs
= be32_to_cpup(src
);
162 for (i
= nsegs
; i
; i
--) {
163 /* segment's RDMA handle */
166 /* bytes returned in this segment */
167 seg_len
= be32_to_cpu(*src
);
168 if (remaining
>= seg_len
) {
169 /* entire segment was consumed */
171 remaining
-= seg_len
;
173 /* segment only partly filled */
174 *dst
= cpu_to_be32(remaining
);
179 /* segment's RDMA offset */
187 /* The client provided a Write list in the Call message. Fill in
188 * the segments in the first Write chunk in the Reply's transport
189 * header with the number of bytes consumed in each segment.
190 * Remaining chunks are returned unused.
193 * - Client has provided only one Write chunk
195 static void svc_rdma_xdr_encode_write_list(__be32
*rdma_resp
, __be32
*wr_ch
,
196 unsigned int consumed
)
201 /* RPC-over-RDMA V1 replies never have a Read list. */
202 p
= rdma_resp
+ rpcrdma_fixed_maxsz
+ 1;
205 while (*q
!= xdr_zero
) {
206 nsegs
= xdr_encode_write_chunk(p
, q
, consumed
);
207 q
+= 2 + nsegs
* rpcrdma_segment_maxsz
;
208 p
+= 2 + nsegs
* rpcrdma_segment_maxsz
;
212 /* Terminate Write list */
215 /* Reply chunk discriminator; may be replaced later */
219 /* The client provided a Reply chunk in the Call message. Fill in
220 * the segments in the Reply chunk in the Reply message with the
221 * number of bytes consumed in each segment.
224 * - Reply can always fit in the provided Reply chunk
226 static void svc_rdma_xdr_encode_reply_chunk(__be32
*rdma_resp
, __be32
*rp_ch
,
227 unsigned int consumed
)
231 /* Find the Reply chunk in the Reply's xprt header.
232 * RPC-over-RDMA V1 replies never have a Read list.
234 p
= rdma_resp
+ rpcrdma_fixed_maxsz
+ 1;
236 /* Skip past Write list */
237 while (*p
++ != xdr_zero
)
238 p
+= 1 + be32_to_cpup(p
) * rpcrdma_segment_maxsz
;
240 xdr_encode_write_chunk(p
, rp_ch
, consumed
);
243 /* Parse the RPC Call's transport header.
245 static void svc_rdma_get_write_arrays(__be32
*rdma_argp
,
246 __be32
**write
, __be32
**reply
)
250 p
= rdma_argp
+ rpcrdma_fixed_maxsz
;
253 while (*p
++ != xdr_zero
)
257 if (*p
!= xdr_zero
) {
259 while (*p
++ != xdr_zero
)
260 p
+= 1 + be32_to_cpu(*p
) * 4;
273 /* RPC-over-RDMA Version One private extension: Remote Invalidation.
274 * Responder's choice: requester signals it can handle Send With
275 * Invalidate, and responder chooses one rkey to invalidate.
277 * Find a candidate rkey to invalidate when sending a reply. Picks the
278 * first R_key it finds in the chunk lists.
280 * Returns zero if RPC's chunk lists are empty.
282 static u32
svc_rdma_get_inv_rkey(__be32
*rdma_argp
,
283 __be32
*wr_lst
, __be32
*rp_ch
)
287 p
= rdma_argp
+ rpcrdma_fixed_maxsz
;
290 else if (wr_lst
&& be32_to_cpup(wr_lst
+ 1))
292 else if (rp_ch
&& be32_to_cpup(rp_ch
+ 1))
296 return be32_to_cpup(p
);
299 /* ib_dma_map_page() is used here because svc_rdma_dma_unmap()
300 * is used during completion to DMA-unmap this memory, and
301 * it uses ib_dma_unmap_page() exclusively.
303 static int svc_rdma_dma_map_buf(struct svcxprt_rdma
*rdma
,
304 struct svc_rdma_op_ctxt
*ctxt
,
309 unsigned long offset
= (unsigned long)base
& ~PAGE_MASK
;
310 struct ib_device
*dev
= rdma
->sc_cm_id
->device
;
313 dma_addr
= ib_dma_map_page(dev
, virt_to_page(base
),
314 offset
, len
, DMA_TO_DEVICE
);
315 if (ib_dma_mapping_error(dev
, dma_addr
))
318 ctxt
->sge
[sge_no
].addr
= dma_addr
;
319 ctxt
->sge
[sge_no
].length
= len
;
320 ctxt
->sge
[sge_no
].lkey
= rdma
->sc_pd
->local_dma_lkey
;
321 svc_rdma_count_mappings(rdma
, ctxt
);
325 static int svc_rdma_dma_map_page(struct svcxprt_rdma
*rdma
,
326 struct svc_rdma_op_ctxt
*ctxt
,
332 struct ib_device
*dev
= rdma
->sc_cm_id
->device
;
335 dma_addr
= ib_dma_map_page(dev
, page
, offset
, len
, DMA_TO_DEVICE
);
336 if (ib_dma_mapping_error(dev
, dma_addr
))
339 ctxt
->sge
[sge_no
].addr
= dma_addr
;
340 ctxt
->sge
[sge_no
].length
= len
;
341 ctxt
->sge
[sge_no
].lkey
= rdma
->sc_pd
->local_dma_lkey
;
342 svc_rdma_count_mappings(rdma
, ctxt
);
347 * svc_rdma_map_reply_hdr - DMA map the transport header buffer
348 * @rdma: controlling transport
349 * @ctxt: op_ctxt for the Send WR
350 * @rdma_resp: buffer containing transport header
351 * @len: length of transport header
354 * %0 if the header is DMA mapped,
355 * %-EIO if DMA mapping failed.
357 int svc_rdma_map_reply_hdr(struct svcxprt_rdma
*rdma
,
358 struct svc_rdma_op_ctxt
*ctxt
,
362 ctxt
->direction
= DMA_TO_DEVICE
;
363 ctxt
->pages
[0] = virt_to_page(rdma_resp
);
365 return svc_rdma_dma_map_page(rdma
, ctxt
, 0, ctxt
->pages
[0], 0, len
);
368 /* Load the xdr_buf into the ctxt's sge array, and DMA map each
369 * element as it is added.
371 * Returns the number of sge elements loaded on success, or
372 * a negative errno on failure.
374 static int svc_rdma_map_reply_msg(struct svcxprt_rdma
*rdma
,
375 struct svc_rdma_op_ctxt
*ctxt
,
376 struct xdr_buf
*xdr
, __be32
*wr_lst
)
378 unsigned int len
, sge_no
, remaining
, page_off
;
379 struct page
**ppages
;
386 ret
= svc_rdma_dma_map_buf(rdma
, ctxt
, sge_no
++,
387 xdr
->head
[0].iov_base
,
388 xdr
->head
[0].iov_len
);
392 /* If a Write chunk is present, the xdr_buf's page list
393 * is not included inline. However the Upper Layer may
394 * have added XDR padding in the tail buffer, and that
395 * should not be included inline.
398 base
= xdr
->tail
[0].iov_base
;
399 len
= xdr
->tail
[0].iov_len
;
400 xdr_pad
= xdr_padsize(xdr
->page_len
);
402 if (len
&& xdr_pad
) {
410 ppages
= xdr
->pages
+ (xdr
->page_base
>> PAGE_SHIFT
);
411 page_off
= xdr
->page_base
& ~PAGE_MASK
;
412 remaining
= xdr
->page_len
;
414 len
= min_t(u32
, PAGE_SIZE
- page_off
, remaining
);
416 ret
= svc_rdma_dma_map_page(rdma
, ctxt
, sge_no
++,
417 *ppages
++, page_off
, len
);
425 base
= xdr
->tail
[0].iov_base
;
426 len
= xdr
->tail
[0].iov_len
;
429 ret
= svc_rdma_dma_map_buf(rdma
, ctxt
, sge_no
++, base
, len
);
437 /* The svc_rqst and all resources it owns are released as soon as
438 * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt
439 * so they are released by the Send completion handler.
441 static void svc_rdma_save_io_pages(struct svc_rqst
*rqstp
,
442 struct svc_rdma_op_ctxt
*ctxt
)
444 int i
, pages
= rqstp
->rq_next_page
- rqstp
->rq_respages
;
446 ctxt
->count
+= pages
;
447 for (i
= 0; i
< pages
; i
++) {
448 ctxt
->pages
[i
+ 1] = rqstp
->rq_respages
[i
];
449 rqstp
->rq_respages
[i
] = NULL
;
451 rqstp
->rq_next_page
= rqstp
->rq_respages
+ 1;
455 * svc_rdma_post_send_wr - Set up and post one Send Work Request
456 * @rdma: controlling transport
457 * @ctxt: op_ctxt for transmitting the Send WR
458 * @num_sge: number of SGEs to send
459 * @inv_rkey: R_key argument to Send With Invalidate, or zero
462 * %0 if the Send* was posted successfully,
463 * %-ENOTCONN if the connection was lost or dropped,
464 * %-EINVAL if there was a problem with the Send we built,
465 * %-ENOMEM if ib_post_send failed.
467 int svc_rdma_post_send_wr(struct svcxprt_rdma
*rdma
,
468 struct svc_rdma_op_ctxt
*ctxt
, int num_sge
,
471 struct ib_send_wr
*send_wr
= &ctxt
->send_wr
;
473 dprintk("svcrdma: posting Send WR with %u sge(s)\n", num_sge
);
475 send_wr
->next
= NULL
;
476 ctxt
->cqe
.done
= svc_rdma_wc_send
;
477 send_wr
->wr_cqe
= &ctxt
->cqe
;
478 send_wr
->sg_list
= ctxt
->sge
;
479 send_wr
->num_sge
= num_sge
;
480 send_wr
->send_flags
= IB_SEND_SIGNALED
;
482 send_wr
->opcode
= IB_WR_SEND_WITH_INV
;
483 send_wr
->ex
.invalidate_rkey
= inv_rkey
;
485 send_wr
->opcode
= IB_WR_SEND
;
488 return svc_rdma_send(rdma
, send_wr
);
491 /* Prepare the portion of the RPC Reply that will be transmitted
492 * via RDMA Send. The RPC-over-RDMA transport header is prepared
493 * in sge[0], and the RPC xdr_buf is prepared in following sges.
495 * Depending on whether a Write list or Reply chunk is present,
496 * the server may send all, a portion of, or none of the xdr_buf.
497 * In the latter case, only the transport header (sge[0]) is
500 * RDMA Send is the last step of transmitting an RPC reply. Pages
501 * involved in the earlier RDMA Writes are here transferred out
502 * of the rqstp and into the ctxt's page array. These pages are
503 * DMA unmapped by each Write completion, but the subsequent Send
504 * completion finally releases these pages.
507 * - The Reply's transport header will never be larger than a page.
509 static int svc_rdma_send_reply_msg(struct svcxprt_rdma
*rdma
,
510 __be32
*rdma_argp
, __be32
*rdma_resp
,
511 struct svc_rqst
*rqstp
,
512 __be32
*wr_lst
, __be32
*rp_ch
)
514 struct svc_rdma_op_ctxt
*ctxt
;
518 dprintk("svcrdma: sending %s reply: head=%zu, pagelen=%u, tail=%zu\n",
519 (rp_ch
? "RDMA_NOMSG" : "RDMA_MSG"),
520 rqstp
->rq_res
.head
[0].iov_len
,
521 rqstp
->rq_res
.page_len
,
522 rqstp
->rq_res
.tail
[0].iov_len
);
524 ctxt
= svc_rdma_get_context(rdma
);
526 ret
= svc_rdma_map_reply_hdr(rdma
, ctxt
, rdma_resp
,
527 svc_rdma_reply_hdr_len(rdma_resp
));
532 ret
= svc_rdma_map_reply_msg(rdma
, ctxt
,
533 &rqstp
->rq_res
, wr_lst
);
538 svc_rdma_save_io_pages(rqstp
, ctxt
);
541 if (rdma
->sc_snd_w_inv
)
542 inv_rkey
= svc_rdma_get_inv_rkey(rdma_argp
, wr_lst
, rp_ch
);
543 ret
= svc_rdma_post_send_wr(rdma
, ctxt
, 1 + ret
, inv_rkey
);
550 pr_err("svcrdma: failed to post Send WR (%d)\n", ret
);
551 svc_rdma_unmap_dma(ctxt
);
552 svc_rdma_put_context(ctxt
, 1);
556 /* Given the client-provided Write and Reply chunks, the server was not
557 * able to form a complete reply. Return an RDMA_ERROR message so the
558 * client can retire this RPC transaction. As above, the Send completion
559 * routine releases payload pages that were part of a previous RDMA Write.
561 * Remote Invalidation is skipped for simplicity.
563 static int svc_rdma_send_error_msg(struct svcxprt_rdma
*rdma
,
564 __be32
*rdma_resp
, struct svc_rqst
*rqstp
)
566 struct svc_rdma_op_ctxt
*ctxt
;
570 ctxt
= svc_rdma_get_context(rdma
);
572 /* Replace the original transport header with an
573 * RDMA_ERROR response. XID etc are preserved.
579 ret
= svc_rdma_map_reply_hdr(rdma
, ctxt
, rdma_resp
, 20);
583 svc_rdma_save_io_pages(rqstp
, ctxt
);
585 ret
= svc_rdma_post_send_wr(rdma
, ctxt
, 1 + ret
, 0);
592 pr_err("svcrdma: failed to post Send WR (%d)\n", ret
);
593 svc_rdma_unmap_dma(ctxt
);
594 svc_rdma_put_context(ctxt
, 1);
598 void svc_rdma_prep_reply_hdr(struct svc_rqst
*rqstp
)
603 * svc_rdma_sendto - Transmit an RPC reply
604 * @rqstp: processed RPC request, reply XDR already in ::rq_res
606 * Any resources still associated with @rqstp are released upon return.
607 * If no reply message was possible, the connection is closed.
610 * %0 if an RPC reply has been successfully posted,
611 * %-ENOMEM if a resource shortage occurred (connection is lost),
612 * %-ENOTCONN if posting failed (connection is lost).
614 int svc_rdma_sendto(struct svc_rqst
*rqstp
)
616 struct svc_xprt
*xprt
= rqstp
->rq_xprt
;
617 struct svcxprt_rdma
*rdma
=
618 container_of(xprt
, struct svcxprt_rdma
, sc_xprt
);
619 __be32
*p
, *rdma_argp
, *rdma_resp
, *wr_lst
, *rp_ch
;
620 struct xdr_buf
*xdr
= &rqstp
->rq_res
;
621 struct page
*res_page
;
624 /* Find the call's chunk lists to decide how to send the reply.
625 * Receive places the Call's xprt header at the start of page 0.
627 rdma_argp
= page_address(rqstp
->rq_pages
[0]);
628 svc_rdma_get_write_arrays(rdma_argp
, &wr_lst
, &rp_ch
);
630 dprintk("svcrdma: preparing response for XID 0x%08x\n",
631 be32_to_cpup(rdma_argp
));
633 /* Create the RDMA response header. xprt->xpt_mutex,
634 * acquired in svc_send(), serializes RPC replies. The
635 * code path below that inserts the credit grant value
636 * into each transport header runs only inside this
640 res_page
= alloc_page(GFP_KERNEL
);
643 rdma_resp
= page_address(res_page
);
647 *p
++ = *(rdma_argp
+ 1);
648 *p
++ = rdma
->sc_fc_credits
;
649 *p
++ = rp_ch
? rdma_nomsg
: rdma_msg
;
651 /* Start with empty chunks */
657 /* XXX: Presume the client sent only one Write chunk */
658 ret
= svc_rdma_send_write_chunk(rdma
, wr_lst
, xdr
);
661 svc_rdma_xdr_encode_write_list(rdma_resp
, wr_lst
, ret
);
664 ret
= svc_rdma_send_reply_chunk(rdma
, rp_ch
, wr_lst
, xdr
);
667 svc_rdma_xdr_encode_reply_chunk(rdma_resp
, rp_ch
, ret
);
670 ret
= svc_rdma_post_recv(rdma
, GFP_KERNEL
);
673 ret
= svc_rdma_send_reply_msg(rdma
, rdma_argp
, rdma_resp
, rqstp
,
683 ret
= svc_rdma_post_recv(rdma
, GFP_KERNEL
);
686 ret
= svc_rdma_send_error_msg(rdma
, rdma_resp
, rqstp
);
694 pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n",
696 set_bit(XPT_CLOSE
, &xprt
->xpt_flags
);