// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>
#include <net/tcp.h>

#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>

#include "siw.h"
#include "siw_verbs.h"
#include "siw_mem.h"

#define MAX_HDR_INLINE					\
	(((uint32_t)(sizeof(struct siw_rreq_pkt) -	\
		     sizeof(struct iwarp_send))) & 0xF8)
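/*
 * Note: MAX_HDR_INLINE is the payload room left in the pre-built packet
 * header buffer behind a SEND header (size of a READ request sized
 * packet minus the SEND header), rounded down to a multiple of 8 bytes.
 * Payloads up to this size are copied right behind the header and sent
 * as one complete short FPDU (see siw_try_1seg() below).
 */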
static struct page *siw_get_pblpage(struct siw_mem *mem, u64 addr, int *idx)
{
	struct siw_pbl *pbl = mem->pbl;
	u64 offset = addr - mem->va;
	dma_addr_t paddr = siw_pbl_get_buffer(pbl, offset, NULL, idx);

	if (paddr)
		return virt_to_page(paddr);

	return NULL;
}
/*
 * Copy short payload at provided destination payload address
 */
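/*
 * Returns the number of bytes copied, a negative errno on a failed
 * page lookup or user copy, or MAX_HDR_INLINE + 1 if the payload does
 * not qualify for inlining (more than one SGE, or larger than
 * MAX_HDR_INLINE).
 */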
static int siw_try_1seg(struct siw_iwarp_tx *c_tx, void *paddr)
{
	struct siw_wqe *wqe = &c_tx->wqe_active;
	struct siw_sge *sge = &wqe->sqe.sge[0];
	u32 bytes = sge->length;

	if (bytes > MAX_HDR_INLINE || wqe->sqe.num_sge != 1)
		return MAX_HDR_INLINE + 1;

	if (!bytes)
		return 0;

	if (tx_flags(wqe) & SIW_WQE_INLINE) {
		memcpy(paddr, &wqe->sqe.sge[1], bytes);
	} else {
		struct siw_mem *mem = wqe->mem[0];

		if (!mem->mem_obj) {
			/* Kernel client using kva */
			memcpy(paddr,
			       (const void *)(uintptr_t)sge->laddr, bytes);
		} else if (c_tx->in_syscall) {
			if (copy_from_user(paddr, u64_to_user_ptr(sge->laddr),
					   bytes))
				return -EFAULT;
		} else {
			unsigned int off = sge->laddr & ~PAGE_MASK;
			struct page *p;
			char *buffer;
			int pbl_idx = 0;

			if (!mem->is_pbl)
				p = siw_get_upage(mem->umem, sge->laddr);
			else
				p = siw_get_pblpage(mem, sge->laddr, &pbl_idx);

			if (unlikely(!p))
				return -EFAULT;

			buffer = kmap(p);

			if (likely(PAGE_SIZE - off >= bytes)) {
				memcpy(paddr, buffer + off, bytes);
			} else {
				unsigned long part = bytes - (PAGE_SIZE - off);

				memcpy(paddr, buffer + off, part);
				kunmap(p);

				if (!mem->is_pbl)
					p = siw_get_upage(mem->umem,
							  sge->laddr + part);
				else
					p = siw_get_pblpage(mem,
							    sge->laddr + part,
							    &pbl_idx);
				if (unlikely(!p))
					return -EFAULT;

				buffer = kmap(p);
				memcpy(paddr + part, buffer, bytes - part);
			}
			kunmap(p);
		}
	}
	return (int)bytes;
}
#define PKT_FRAGMENTED 1
#define PKT_COMPLETE 0
/*
 * siw_qp_prepare_tx()
 *
 * Prepare tx state for sending out one fpdu. Builds complete pkt
 * if no user data or only immediate data are present.
 *
 * returns PKT_COMPLETE if complete pkt built, PKT_FRAGMENTED otherwise.
 */
static int siw_qp_prepare_tx(struct siw_iwarp_tx *c_tx)
{
	struct siw_wqe *wqe = &c_tx->wqe_active;
	char *crc = NULL;
	int data = 0;

	switch (tx_type(wqe)) {
	case SIW_OP_READ:
	case SIW_OP_READ_LOCAL_INV:
		memcpy(&c_tx->pkt.ctrl,
		       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
		       sizeof(struct iwarp_ctrl));

		c_tx->pkt.rreq.rsvd = 0;
		c_tx->pkt.rreq.ddp_qn = htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
		c_tx->pkt.rreq.ddp_msn =
			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]);
		c_tx->pkt.rreq.ddp_mo = 0;
		c_tx->pkt.rreq.sink_stag = htonl(wqe->sqe.sge[0].lkey);
		c_tx->pkt.rreq.sink_to =
			cpu_to_be64(wqe->sqe.sge[0].laddr);
		c_tx->pkt.rreq.source_stag = htonl(wqe->sqe.rkey);
		c_tx->pkt.rreq.source_to = cpu_to_be64(wqe->sqe.raddr);
		c_tx->pkt.rreq.read_size = htonl(wqe->sqe.sge[0].length);

		c_tx->ctrl_len = sizeof(struct iwarp_rdma_rreq);
		crc = (char *)&c_tx->pkt.rreq_pkt.crc;
		break;

	case SIW_OP_SEND:
		if (tx_flags(wqe) & SIW_WQE_SOLICITED)
			memcpy(&c_tx->pkt.ctrl,
			       &iwarp_pktinfo[RDMAP_SEND_SE].ctrl,
			       sizeof(struct iwarp_ctrl));
		else
			memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_SEND].ctrl,
			       sizeof(struct iwarp_ctrl));

		c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
		c_tx->pkt.send.ddp_msn =
			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
		c_tx->pkt.send.ddp_mo = 0;

		c_tx->pkt.send_inv.inval_stag = 0;

		c_tx->ctrl_len = sizeof(struct iwarp_send);

		crc = (char *)&c_tx->pkt.send_pkt.crc;
		data = siw_try_1seg(c_tx, crc);
		break;

	case SIW_OP_SEND_REMOTE_INV:
		if (tx_flags(wqe) & SIW_WQE_SOLICITED)
			memcpy(&c_tx->pkt.ctrl,
			       &iwarp_pktinfo[RDMAP_SEND_SE_INVAL].ctrl,
			       sizeof(struct iwarp_ctrl));
		else
			memcpy(&c_tx->pkt.ctrl,
			       &iwarp_pktinfo[RDMAP_SEND_INVAL].ctrl,
			       sizeof(struct iwarp_ctrl));

		c_tx->pkt.send.ddp_qn = RDMAP_UNTAGGED_QN_SEND;
		c_tx->pkt.send.ddp_msn =
			htonl(++c_tx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
		c_tx->pkt.send.ddp_mo = 0;

		c_tx->pkt.send_inv.inval_stag = cpu_to_be32(wqe->sqe.rkey);

		c_tx->ctrl_len = sizeof(struct iwarp_send_inv);

		crc = (char *)&c_tx->pkt.send_pkt.crc;
		data = siw_try_1seg(c_tx, crc);
		break;

	case SIW_OP_WRITE:
		memcpy(&c_tx->pkt.ctrl, &iwarp_pktinfo[RDMAP_RDMA_WRITE].ctrl,
		       sizeof(struct iwarp_ctrl));

		c_tx->pkt.rwrite.sink_stag = htonl(wqe->sqe.rkey);
		c_tx->pkt.rwrite.sink_to = cpu_to_be64(wqe->sqe.raddr);
		c_tx->ctrl_len = sizeof(struct iwarp_rdma_write);

		crc = (char *)&c_tx->pkt.write_pkt.crc;
		data = siw_try_1seg(c_tx, crc);
		break;

	case SIW_OP_READ_RESPONSE:
		memcpy(&c_tx->pkt.ctrl,
		       &iwarp_pktinfo[RDMAP_RDMA_READ_RESP].ctrl,
		       sizeof(struct iwarp_ctrl));

		c_tx->pkt.rresp.sink_stag = cpu_to_be32(wqe->sqe.rkey);
		c_tx->pkt.rresp.sink_to = cpu_to_be64(wqe->sqe.raddr);

		c_tx->ctrl_len = sizeof(struct iwarp_rdma_rresp);

		crc = (char *)&c_tx->pkt.write_pkt.crc;
		data = siw_try_1seg(c_tx, crc);
		break;

	default:
		siw_dbg_qp(tx_qp(c_tx), "stale wqe type %d\n", tx_type(wqe));
		return -EOPNOTSUPP;
	}
	if (unlikely(data < 0))
		return data;

	c_tx->ctrl_sent = 0;

	if (data <= MAX_HDR_INLINE) {
		if (data > 0) {
			wqe->processed = data;

			c_tx->pkt.ctrl.mpa_len =
				htons(c_tx->ctrl_len + data - MPA_HDR_SIZE);

			/* Add pad, if needed */
			data += -(int)data & 0x3;
			/* advance CRC location after payload */
			crc += data;
			c_tx->ctrl_len += data;

			if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED))
				c_tx->pkt.c_untagged.ddp_mo = 0;
			else
				c_tx->pkt.c_tagged.ddp_to =
					cpu_to_be64(wqe->sqe.raddr);
		}

		*(u32 *)crc = 0;
		/*
		 * Do complete CRC if enabled and short packet
		 */
		if (c_tx->mpa_crc_hd) {
			crypto_shash_init(c_tx->mpa_crc_hd);
			if (crypto_shash_update(c_tx->mpa_crc_hd,
						(u8 *)&c_tx->pkt,
						c_tx->ctrl_len))
				return -EINVAL;
			crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)crc);
		}
		c_tx->ctrl_len += MPA_CRC_SIZE;

		return PKT_COMPLETE;
	}
	c_tx->ctrl_len += MPA_CRC_SIZE;
	c_tx->sge_idx = 0;
	c_tx->sge_off = 0;
	c_tx->pbl_idx = 0;

	/*
	 * Allow direct sending out of user buffer if WR is non signalled
	 * and payload is over threshold.
	 * Per RDMA verbs, the application should not change the send buffer
	 * until the work completed. In iWarp, work completion is only
	 * local delivery to TCP. TCP may reuse the buffer for
	 * retransmission. Changing unsent data also breaks the CRC,
	 * if applied.
	 */
	if (c_tx->zcopy_tx && wqe->bytes >= SENDPAGE_THRESH &&
	    !(tx_flags(wqe) & SIW_WQE_SIGNALLED))
		c_tx->use_sendpage = 1;
	else
		c_tx->use_sendpage = 0;

	return PKT_FRAGMENTED;
}
/*
 * Send out one complete control type FPDU, or header of FPDU carrying
 * data. Used for fixed sized packets like Read.Requests or zero length
 * SENDs, WRITEs, READ.Responses, or header only.
 */
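/*
 * Returns 0 once the complete control part has been pushed, -EAGAIN if
 * only part of it could be sent; c_tx->ctrl_sent records the progress
 * so a later call resumes at the right offset.
 */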
static int siw_tx_ctrl(struct siw_iwarp_tx *c_tx, struct socket *s,
		       int flags)
{
	struct msghdr msg = { .msg_flags = flags };
	struct kvec iov = { .iov_base =
				    (char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent,
			    .iov_len = c_tx->ctrl_len - c_tx->ctrl_sent };

	int rv = kernel_sendmsg(s, &msg, &iov, 1,
				c_tx->ctrl_len - c_tx->ctrl_sent);

	if (rv >= 0) {
		c_tx->ctrl_sent += rv;

		if (c_tx->ctrl_sent == c_tx->ctrl_len)
			rv = 0;
		else
			rv = -EAGAIN;
	}
	return rv;
}
/*
 * 0copy TCP transmit interface: Use do_tcp_sendpages.
 *
 * Using sendpage to push page by page appears to be less efficient
 * than using sendmsg, even if data are copied.
 *
 * A general performance limitation might be the extra four bytes
 * trailer checksum segment to be pushed after user data.
 */
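/*
 * Note: MSG_SENDPAGE_NOTLAST stays set while further pages follow; once
 * the remaining data fits into the current page, only MSG_MORE and
 * MSG_DONTWAIT are used, since the FPDU trailer still gets pushed via
 * the regular sendmsg path afterwards.
 */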
static int siw_tcp_sendpages(struct socket *s, struct page **page, int offset,
			     size_t size)
{
	struct sock *sk = s->sk;
	int i = 0, rv = 0, sent = 0,
	    flags = MSG_MORE | MSG_DONTWAIT | MSG_SENDPAGE_NOTLAST;

	while (size) {
		size_t bytes = min_t(size_t, PAGE_SIZE - offset, size);

		if (size + offset <= PAGE_SIZE)
			flags = MSG_MORE | MSG_DONTWAIT;

		tcp_rate_check_app_limited(sk);
try_page_again:
		lock_sock(sk);
		rv = do_tcp_sendpages(sk, page[i], offset, bytes, flags);
		release_sock(sk);

		if (rv > 0) {
			size -= rv;
			sent += rv;
			if (rv != bytes) {
				offset += rv;
				bytes -= rv;
				goto try_page_again;
			}
			offset = 0;
		} else {
			if (rv == -EAGAIN || rv == 0)
				break;
			return rv;
		}
		i++;
	}
	return sent;
}
/*
 * siw_0copy_tx()
 *
 * Pushes list of pages to TCP socket. If pages from multiple
 * SGE's, all referenced pages of each SGE are pushed in one
 * shot.
 */
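/*
 * Returns the total number of bytes pushed to the socket, or a negative
 * errno if the underlying sendpage call failed.
 */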
static int siw_0copy_tx(struct socket *s, struct page **page,
			struct siw_sge *sge, unsigned int offset,
			unsigned int size)
{
	int i = 0, sent = 0, rv;
	int sge_bytes = min(sge->length - offset, size);

	offset = (sge->laddr + offset) & ~PAGE_MASK;

	while (sent != size) {
		rv = siw_tcp_sendpages(s, &page[i], offset, sge_bytes);

		if (rv >= 0) {
			sent += rv;
			if (size == sent || sge_bytes > rv)
				break;

			i += PAGE_ALIGN(sge_bytes + offset) >> PAGE_SHIFT;
			sge++;
			sge_bytes = min(sge->length, size - sent);
			offset = sge->laddr & ~PAGE_MASK;
		} else {
			sent = rv;
			break;
		}
	}
	return sent;
}
#define MAX_TRAILER (MPA_CRC_SIZE + 4)
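/*
 * kunmap() all pages that were mapped for building the send iovec:
 * bit i of @kmap_mask set means page_array[i] was mapped with kmap()
 * and still needs to be unmapped.
 */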
static void siw_unmap_pages(struct page **pp, unsigned long kmap_mask)
{
	while (kmap_mask) {
		if (kmap_mask & BIT(0))
			kunmap(*pp);
		pp++;
		kmap_mask >>= 1;
	}
}
/*
 * siw_tx_hdt() tries to push a complete packet to TCP where all
 * packet fragments are referenced by the elements of one iovec.
 * For the data portion, each involved page must be referenced by
 * one extra element. All sge's data can be non-aligned to page
 * boundaries. Two more elements are referencing iWARP header
 * and trailer:
 * MAX_ARRAY = 64KB/PAGE_SIZE + 1 + (2 * (SIW_MAX_SGE - 1) + HDR + TRL
 */
#define MAX_ARRAY ((0xffff / PAGE_SIZE) + 1 + (2 * (SIW_MAX_SGE - 1) + 2))
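/*
 * Example (assuming 4 KiB pages and SIW_MAX_SGE == 6): a maximum sized
 * 64 KiB payload may touch 16 pages, the up to five remaining SGE
 * boundaries account for the extra fragments of the formula above, and
 * one element each is reserved for header and trailer, which yields
 * 28 kvec/page slots.
 */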
/*
 * Write out iov referencing hdr, data and trailer of current FPDU.
 * Update transmit state dependent on write return status
 */
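/*
 * Returns 0 if the complete FPDU (header, data and trailer) was pushed,
 * -EAGAIN if TCP accepted only part of it (transmit state is saved for
 * a later resume), or another negative errno on failure.
 */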
static int siw_tx_hdt(struct siw_iwarp_tx *c_tx, struct socket *s)
{
	struct siw_wqe *wqe = &c_tx->wqe_active;
	struct siw_sge *sge = &wqe->sqe.sge[c_tx->sge_idx];
	struct kvec iov[MAX_ARRAY];
	struct page *page_array[MAX_ARRAY];
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };

	int seg = 0, do_crc = c_tx->do_crc, is_kva = 0, rv;
	unsigned int data_len = c_tx->bytes_unsent, hdr_len = 0, trl_len = 0,
		     sge_off = c_tx->sge_off, sge_idx = c_tx->sge_idx,
		     pbl_idx = c_tx->pbl_idx;
	unsigned long kmap_mask = 0L;

	if (c_tx->state == SIW_SEND_HDR) {
		if (c_tx->use_sendpage) {
			rv = siw_tx_ctrl(c_tx, s, MSG_DONTWAIT | MSG_MORE);
			if (rv)
				goto done;

			c_tx->state = SIW_SEND_DATA;
		} else {
			iov[0].iov_base =
				(char *)&c_tx->pkt.ctrl + c_tx->ctrl_sent;
			iov[0].iov_len = hdr_len =
				c_tx->ctrl_len - c_tx->ctrl_sent;
			seg = 1;
		}
	}

	wqe->processed += data_len;

	while (data_len) { /* walk the list of SGE's */
		unsigned int sge_len = min(sge->length - sge_off, data_len);
		unsigned int fp_off = (sge->laddr + sge_off) & ~PAGE_MASK;
		struct siw_mem *mem;

		if (!(tx_flags(wqe) & SIW_WQE_INLINE)) {
			mem = wqe->mem[sge_idx];
			is_kva = mem->mem_obj == NULL ? 1 : 0;
		} else {
			is_kva = 1;
		}
		if (is_kva && !c_tx->use_sendpage) {
			/*
			 * tx from kernel virtual address: either inline data
			 * or memory region with assigned kernel buffer
			 */
			iov[seg].iov_base =
				(void *)(uintptr_t)(sge->laddr + sge_off);
			iov[seg].iov_len = sge_len;

			if (do_crc)
				crypto_shash_update(c_tx->mpa_crc_hd,
						    iov[seg].iov_base,
						    sge_len);
			sge_off += sge_len;
			data_len -= sge_len;
			seg++;
			goto sge_done;
		}

		while (sge_len) {
			size_t plen = min((int)PAGE_SIZE - fp_off, sge_len);

			if (!is_kva) {
				struct page *p;

				if (mem->is_pbl)
					p = siw_get_pblpage(
						mem, sge->laddr + sge_off,
						&pbl_idx);
				else
					p = siw_get_upage(mem->umem,
							  sge->laddr + sge_off);
				if (unlikely(!p)) {
					siw_unmap_pages(page_array, kmap_mask);
					wqe->processed -= c_tx->bytes_unsent;
					rv = -EFAULT;
					goto done_crc;
				}

				page_array[seg] = p;

				if (!c_tx->use_sendpage) {
					iov[seg].iov_base = kmap(p) + fp_off;
					iov[seg].iov_len = plen;

					/* Remember for later kunmap() */
					kmap_mask |= BIT(seg);

					if (do_crc)
						crypto_shash_update(
							c_tx->mpa_crc_hd,
							iov[seg].iov_base,
							plen);
				} else if (do_crc) {
					crypto_shash_update(c_tx->mpa_crc_hd,
							    kmap(p) + fp_off,
							    plen);
					kunmap(p);
				}
			} else {
				u64 va = sge->laddr + sge_off;

				page_array[seg] = virt_to_page(va & PAGE_MASK);
				if (do_crc)
					crypto_shash_update(
						c_tx->mpa_crc_hd,
						(void *)(uintptr_t)va,
						plen);
			}

			sge_len -= plen;
			sge_off += plen;
			data_len -= plen;
			fp_off = 0;

			if (++seg > (int)MAX_ARRAY) {
				siw_dbg_qp(tx_qp(c_tx), "to many fragments\n");
				siw_unmap_pages(page_array, kmap_mask);
				wqe->processed -= c_tx->bytes_unsent;
				rv = -EMSGSIZE;
				goto done_crc;
			}
		}
sge_done:
		/* Update SGE variables at end of SGE */
		if (sge_off == sge->length &&
		    (data_len != 0 || wqe->processed < wqe->bytes)) {
			sge_idx++;
			sge++;
			sge_off = 0;
		}
	}
	/* trailer */
	if (likely(c_tx->state != SIW_SEND_TRAILER)) {
		iov[seg].iov_base = &c_tx->trailer.pad[4 - c_tx->pad];
		iov[seg].iov_len = trl_len = MAX_TRAILER - (4 - c_tx->pad);
	} else {
		iov[seg].iov_base = &c_tx->trailer.pad[c_tx->ctrl_sent];
		iov[seg].iov_len = trl_len = MAX_TRAILER - c_tx->ctrl_sent;
	}

	if (c_tx->pad) {
		*(u32 *)c_tx->trailer.pad = 0;
		if (do_crc)
			crypto_shash_update(c_tx->mpa_crc_hd,
					    (u8 *)&c_tx->trailer.crc - c_tx->pad,
					    c_tx->pad);
	}
	if (!c_tx->mpa_crc_hd)
		c_tx->trailer.crc = 0;
	else if (do_crc)
		crypto_shash_final(c_tx->mpa_crc_hd, (u8 *)&c_tx->trailer.crc);

	data_len = c_tx->bytes_unsent;

	if (c_tx->use_sendpage) {
		rv = siw_0copy_tx(s, page_array, &wqe->sqe.sge[c_tx->sge_idx],
				  c_tx->sge_off, data_len);
		if (rv == data_len) {
			rv = kernel_sendmsg(s, &msg, &iov[seg], 1, trl_len);
			if (rv > 0)
				rv += data_len;
			else
				rv = data_len;
		}
	} else {
		rv = kernel_sendmsg(s, &msg, iov, seg + 1,
				    hdr_len + data_len + trl_len);
		siw_unmap_pages(page_array, kmap_mask);
	}
	if (rv < (int)hdr_len) {
		/* Not even complete hdr pushed or negative rv */
		wqe->processed -= data_len;
		if (rv >= 0) {
			c_tx->ctrl_sent += rv;
			rv = -EAGAIN;
		}
		goto done_crc;
	}
	rv -= hdr_len;

	if (rv >= (int)data_len) {
		/* all user data pushed to TCP or no data to push */
		if (data_len > 0 && wqe->processed < wqe->bytes) {
			/* Save the current state for next tx */
			c_tx->sge_idx = sge_idx;
			c_tx->sge_off = sge_off;
			c_tx->pbl_idx = pbl_idx;
		}
		rv -= data_len;

		if (rv == trl_len) /* all pushed */
			rv = 0;
		else {
			c_tx->state = SIW_SEND_TRAILER;
			c_tx->ctrl_len = MAX_TRAILER;
			c_tx->ctrl_sent = rv + 4 - c_tx->pad;
			c_tx->bytes_unsent = 0;
			rv = -EAGAIN;
		}

	} else if (data_len > 0) {
		/* Maybe some user data pushed to TCP */
		c_tx->state = SIW_SEND_DATA;
		wqe->processed -= data_len - rv;

		if (rv) {
			/*
			 * Some bytes out. Recompute tx state based
			 * on old state and bytes pushed
			 */
			unsigned int sge_unsent;

			c_tx->bytes_unsent -= rv;
			sge = &wqe->sqe.sge[c_tx->sge_idx];
			sge_unsent = sge->length - c_tx->sge_off;

			while (sge_unsent <= rv) {
				rv -= sge_unsent;
				c_tx->sge_idx++;
				c_tx->sge_off = 0;
				sge++;
				sge_unsent = sge->length;
			}
			c_tx->sge_off += rv;
		}
		rv = -EAGAIN;
	}
done_crc:
	c_tx->do_crc = 0;
done:
	return rv;
}
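/*
 * Update c_tx->tcp_seglen from the current TCP socket state: use the
 * GSO aggregate (mss_cache * gso_segs, optionally capped by
 * gso_seg_limit) if available, else a single MSS. The result is
 * rounded down to a multiple of 8 bytes.
 */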
static void siw_update_tcpseg(struct siw_iwarp_tx *c_tx,
			      struct socket *s)
{
	struct tcp_sock *tp = tcp_sk(s->sk);

	if (tp->gso_segs) {
		if (c_tx->gso_seg_limit == 0)
			c_tx->tcp_seglen = tp->mss_cache * tp->gso_segs;
		else
			c_tx->tcp_seglen =
				tp->mss_cache *
				min_t(u16, c_tx->gso_seg_limit, tp->gso_segs);
	} else {
		c_tx->tcp_seglen = tp->mss_cache;
	}
	/* Loopback may give odd numbers */
	c_tx->tcp_seglen &= 0xfffffff8;
}
/*
 * siw_prepare_fpdu()
 *
 * Prepares transmit context to send out one FPDU if FPDU will contain
 * user data and user data are not immediate data.
 * Computes maximum FPDU length to fill up TCP MSS if possible.
 *
 * @qp:		QP from which to transmit
 * @wqe:	Current WQE causing transmission
 *
 * TODO: Take into account real available sendspace on socket
 *       to avoid header misalignment due to send pausing within
 *       fpdu transmission
 */
static void siw_prepare_fpdu(struct siw_qp *qp, struct siw_wqe *wqe)
{
	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
	int data_len;

	c_tx->ctrl_len =
		iwarp_pktinfo[__rdmap_get_opcode(&c_tx->pkt.ctrl)].hdr_len;
	c_tx->ctrl_sent = 0;

	/*
	 * Update target buffer offset if any
	 */
	if (!(c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED))
		/* Untagged message */
		c_tx->pkt.c_untagged.ddp_mo = cpu_to_be32(wqe->processed);
	else /* Tagged message */
		c_tx->pkt.c_tagged.ddp_to =
			cpu_to_be64(wqe->sqe.raddr + wqe->processed);

	data_len = wqe->bytes - wqe->processed;
	if (data_len + c_tx->ctrl_len + MPA_CRC_SIZE > c_tx->tcp_seglen) {
		/* Trim DDP payload to fit into current TCP segment */
		data_len = c_tx->tcp_seglen - (c_tx->ctrl_len + MPA_CRC_SIZE);
		c_tx->pkt.ctrl.ddp_rdmap_ctrl &= ~DDP_FLAG_LAST;
		c_tx->pad = 0;
	} else {
		c_tx->pkt.ctrl.ddp_rdmap_ctrl |= DDP_FLAG_LAST;
		c_tx->pad = -data_len & 0x3;
	}
	c_tx->bytes_unsent = data_len;

	c_tx->pkt.ctrl.mpa_len =
		htons(c_tx->ctrl_len + data_len - MPA_HDR_SIZE);

	/*
	 * Init MPA CRC computation
	 */
	if (c_tx->mpa_crc_hd) {
		crypto_shash_init(c_tx->mpa_crc_hd);
		crypto_shash_update(c_tx->mpa_crc_hd, (u8 *)&c_tx->pkt,
				    c_tx->ctrl_len);
		c_tx->do_crc = 1;
	}
}
/*
 * siw_check_sgl_tx()
 *
 * Check permissions for a list of SGE's (SGL).
 * A successful check will have all memory referenced
 * for transmission resolved and assigned to the WQE.
 *
 * @pd:		Protection Domain SGL should belong to
 * @wqe:	WQE to be checked
 * @perms:	requested access permissions
 */
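/*
 * Returns the total byte length of the SGL on success, or a negative
 * error code if any SGE fails the access check.
 */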
static int siw_check_sgl_tx(struct ib_pd *pd, struct siw_wqe *wqe,
			    enum ib_access_flags perms)
{
	struct siw_sge *sge = &wqe->sqe.sge[0];
	int i, len, num_sge = wqe->sqe.num_sge;

	if (unlikely(num_sge > SIW_MAX_SGE))
		return -EINVAL;

	for (i = 0, len = 0; num_sge; num_sge--, i++, sge++) {
		/*
		 * rdma verbs: do not check stag for a zero length sge
		 */
		if (sge->length) {
			int rv = siw_check_sge(pd, sge, &wqe->mem[i], perms, 0,
					       sge->length);

			if (unlikely(rv != E_ACCESS_OK))
				return rv;
		}
		len += sge->length;
	}
	return len;
}
/*
 * siw_qp_sq_proc_tx()
 *
 * Process one WQE which needs transmission on the wire.
 */
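/*
 * Returns 0 once the WQE has been fully transmitted, -EAGAIN if TCP ran
 * out of send space (transmission is typically resumed when new send
 * space is reported), -EINPROGRESS when the burst credit is used up and
 * the SQ gets rescheduled, or another negative errno on failure.
 */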
static int siw_qp_sq_proc_tx(struct siw_qp *qp, struct siw_wqe *wqe)
{
	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
	struct socket *s = qp->attrs.sk;
	int rv = 0, burst_len = qp->tx_ctx.burst;
	enum rdmap_ecode ecode = RDMAP_ECODE_CATASTROPHIC_STREAM;

	if (unlikely(wqe->wr_status == SIW_WR_IDLE))
		return 0;

	if (!burst_len)
		burst_len = SQ_USER_MAXBURST;

	if (wqe->wr_status == SIW_WR_QUEUED) {
		if (!(wqe->sqe.flags & SIW_WQE_INLINE)) {
			if (tx_type(wqe) == SIW_OP_READ_RESPONSE)
				wqe->sqe.num_sge = 1;

			if (tx_type(wqe) != SIW_OP_READ &&
			    tx_type(wqe) != SIW_OP_READ_LOCAL_INV) {
				/*
				 * Reference memory to be tx'd w/o checking
				 * access for LOCAL_READ permission, since
				 * not defined in RDMA core.
				 */
				rv = siw_check_sgl_tx(qp->pd, wqe, 0);
				if (rv < 0) {
					if (tx_type(wqe) ==
					    SIW_OP_READ_RESPONSE)
						ecode = siw_rdmap_error(-rv);
					rv = -EINVAL;
					goto tx_error;
				}
				wqe->bytes = rv;
			} else {
				wqe->bytes = 0;
			}
		} else {
			wqe->bytes = wqe->sqe.sge[0].length;
			if (!rdma_is_kernel_res(&qp->base_qp.res)) {
				if (wqe->bytes > SIW_MAX_INLINE) {
					rv = -EINVAL;
					goto tx_error;
				}
				wqe->sqe.sge[0].laddr =
					(u64)(uintptr_t)&wqe->sqe.sge[1];
			}
		}
		wqe->wr_status = SIW_WR_INPROGRESS;
		wqe->processed = 0;

		siw_update_tcpseg(c_tx, s);

		rv = siw_qp_prepare_tx(c_tx);
		if (rv == PKT_FRAGMENTED) {
			c_tx->state = SIW_SEND_HDR;
			siw_prepare_fpdu(qp, wqe);
		} else if (rv == PKT_COMPLETE) {
			c_tx->state = SIW_SEND_SHORT_FPDU;
		} else {
			goto tx_error;
		}
	}

next_segment:
	siw_dbg_qp(qp, "wr type %d, state %d, data %u, sent %u, id %llx\n",
		   tx_type(wqe), wqe->wr_status, wqe->bytes, wqe->processed,
		   wqe->sqe.id);

	if (--burst_len == 0) {
		rv = -EINPROGRESS;
		goto tx_done;
	}
	if (c_tx->state == SIW_SEND_SHORT_FPDU) {
		enum siw_opcode tx_type = tx_type(wqe);
		unsigned int msg_flags;

		if (siw_sq_empty(qp) || !siw_tcp_nagle || burst_len == 1)
			/*
			 * End current TCP segment, if SQ runs empty,
			 * or siw_tcp_nagle is not set, or we bail out
			 * soon due to no burst credit left.
			 */
			msg_flags = MSG_DONTWAIT;
		else
			msg_flags = MSG_DONTWAIT | MSG_MORE;

		rv = siw_tx_ctrl(c_tx, s, msg_flags);

		if (!rv && tx_type != SIW_OP_READ &&
		    tx_type != SIW_OP_READ_LOCAL_INV)
			wqe->processed = wqe->bytes;

		goto tx_done;

	} else {
		rv = siw_tx_hdt(c_tx, s);
	}
	if (!rv) {
		/*
		 * One segment sent. Processing completed if last
		 * segment, Do next segment otherwise.
		 */
		if (unlikely(c_tx->tx_suspend)) {
			/*
			 * Verbs, 6.4.: Try stopping sending after a full
			 * DDP segment if the connection goes down
			 * (== peer halfclose)
			 */
			rv = -ECONNABORTED;
			goto tx_done;
		}
		if (c_tx->pkt.ctrl.ddp_rdmap_ctrl & DDP_FLAG_LAST) {
			siw_dbg_qp(qp, "WQE completed\n");
			goto tx_done;
		}
		c_tx->state = SIW_SEND_HDR;

		siw_update_tcpseg(c_tx, s);

		siw_prepare_fpdu(qp, wqe);
		goto next_segment;
	}
tx_done:
	qp->tx_ctx.burst = burst_len;
	return rv;

tx_error:
	if (ecode != RDMAP_ECODE_CATASTROPHIC_STREAM)
		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
				   RDMAP_ETYPE_REMOTE_PROTECTION, ecode, 1);
	else
		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
				   RDMAP_ETYPE_CATASTROPHIC,
				   RDMAP_ECODE_UNSPECIFIED, 1);
	return rv;
}
static int siw_fastreg_mr(struct ib_pd *pd, struct siw_sqe *sqe)
{
	struct ib_mr *base_mr = (struct ib_mr *)(uintptr_t)sqe->base_mr;
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mem *mem;
	int rv = 0;

	siw_dbg_pd(pd, "STag 0x%08x\n", sqe->rkey);

	if (unlikely(!base_mr)) {
		pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey);
		return -EINVAL;
	}

	if (unlikely(base_mr->rkey >> 8 != sqe->rkey >> 8)) {
		pr_warn("siw: fastreg: STag 0x%08x: bad MR\n", sqe->rkey);
		return -EINVAL;
	}

	mem = siw_mem_id2obj(sdev, sqe->rkey >> 8);
	if (unlikely(!mem)) {
		pr_warn("siw: fastreg: STag 0x%08x unknown\n", sqe->rkey);
		return -EINVAL;
	}

	if (unlikely(mem->pd != pd)) {
		pr_warn("siw: fastreg: PD mismatch\n");
		rv = -EINVAL;
		goto out;
	}
	if (unlikely(mem->stag_valid)) {
		pr_warn("siw: fastreg: STag 0x%08x already valid\n",
			sqe->rkey);
		rv = -EINVAL;
		goto out;
	}
	/* Refresh STag since user may have changed key part */
	mem->stag = sqe->rkey;
	mem->perms = sqe->access;

	siw_dbg_mem(mem, "STag 0x%08x now valid\n", sqe->rkey);
	mem->va = base_mr->iova;
	mem->stag_valid = 1;
out:
	siw_mem_put(mem);
	return rv;
}
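/*
 * Process a local (non wire-visible) work request: fast memory
 * registration or STag invalidation. Such WQEs complete without
 * sending anything on the TCP stream.
 */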
static int siw_qp_sq_proc_local(struct siw_qp *qp, struct siw_wqe *wqe)
{
	int rv;

	switch (tx_type(wqe)) {
	case SIW_OP_REG_MR:
		rv = siw_fastreg_mr(qp->pd, &wqe->sqe);
		break;

	case SIW_OP_INVAL_STAG:
		rv = siw_invalidate_stag(qp->pd, wqe->sqe.rkey);
		break;

	default:
		rv = -EINVAL;
	}
	return rv;
}
/*
 * siw_qp_sq_process()
 *
 * Core TX path routine for RDMAP/DDP/MPA using a TCP kernel socket.
 * Sends RDMAP payload for the current SQ WR @wqe of @qp in one or more
 * MPA FPDUs, each containing a DDP segment.
 *
 * SQ processing may occur in user context as a result of posting
 * new WQE's or from siw_sq_work_handler() context. Processing in
 * user context is limited to non-kernel verbs users.
 *
 * SQ processing may get paused anytime, possibly in the middle of a WR
 * or FPDU, if insufficient send space is available. SQ processing
 * gets resumed from siw_sq_work_handler(), if send space becomes
 * available again.
 *
 * Must be called with the QP state read-locked.
 *
 * Note:
 * An outbound RREQ can be satisfied by the corresponding RRESP
 * _before_ it gets assigned to the ORQ. This happens regularly
 * in RDMA READ via loopback case. Since both outbound RREQ and
 * inbound RRESP can be handled by the same CPU, locking the ORQ
 * is dead-lock prone and thus not an option. With that, the
 * RREQ gets assigned to the ORQ _before_ being sent - see
 * siw_activate_tx() - and pulled back in case of send failure.
 */
int siw_qp_sq_process(struct siw_qp *qp)
{
	struct siw_wqe *wqe = tx_wqe(qp);
	enum siw_opcode tx_type;
	unsigned long flags;
	int rv = 0;

	siw_dbg_qp(qp, "enter for type %d\n", tx_type(wqe));

next_wqe:
	/*
	 * Stop QP processing if SQ state changed
	 */
	if (unlikely(qp->tx_ctx.tx_suspend)) {
		siw_dbg_qp(qp, "tx suspended\n");
		goto done;
	}
	tx_type = tx_type(wqe);

	if (tx_type <= SIW_OP_READ_RESPONSE)
		rv = siw_qp_sq_proc_tx(qp, wqe);
	else
		rv = siw_qp_sq_proc_local(qp, wqe);

	if (!rv) {
		/*
		 * WQE processing done
		 */
		switch (tx_type) {
		case SIW_OP_SEND:
		case SIW_OP_SEND_REMOTE_INV:
		case SIW_OP_WRITE:
			siw_wqe_put_mem(wqe, tx_type);
			/* Fall through */

		case SIW_OP_INVAL_STAG:
		case SIW_OP_REG_MR:
			if (tx_flags(wqe) & SIW_WQE_SIGNALLED)
				siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
						 SIW_WC_SUCCESS);
			break;

		case SIW_OP_READ:
		case SIW_OP_READ_LOCAL_INV:
			/*
			 * already enqueued to ORQ queue
			 */
			break;

		case SIW_OP_READ_RESPONSE:
			siw_wqe_put_mem(wqe, tx_type);
			break;

		default:
			WARN(1, "undefined WQE type %d\n", tx_type);
			rv = -EINVAL;
			goto done;
		}

		spin_lock_irqsave(&qp->sq_lock, flags);
		wqe->wr_status = SIW_WR_IDLE;
		rv = siw_activate_tx(qp);
		spin_unlock_irqrestore(&qp->sq_lock, flags);

		if (rv <= 0)
			goto done;

		goto next_wqe;

	} else if (rv == -EAGAIN) {
		siw_dbg_qp(qp, "sq paused: hd/tr %d of %d, data %d\n",
			   qp->tx_ctx.ctrl_sent, qp->tx_ctx.ctrl_len,
			   qp->tx_ctx.bytes_unsent);
		rv = 0;
		goto done;
	} else if (rv == -EINPROGRESS) {
		rv = siw_sq_start(qp);
		goto done;
	} else {
		/*
		 * WQE processing failed.
		 * Verbs 8.3.2:
		 * o It turns any WQE into a signalled WQE.
		 * o Local catastrophic error must be surfaced
		 * o QP must be moved into Terminate state: done by code
		 *   doing socket state change processing
		 *
		 * o TODO: Termination message must be sent.
		 * o TODO: Implement more precise work completion errors,
		 *         see enum ib_wc_status in ib_verbs.h
		 */
		siw_dbg_qp(qp, "wqe type %d processing failed: %d\n",
			   tx_type(wqe), rv);

		spin_lock_irqsave(&qp->sq_lock, flags);
		/*
		 * RREQ may have already been completed by inbound RRESP!
		 */
		if (tx_type == SIW_OP_READ ||
		    tx_type == SIW_OP_READ_LOCAL_INV) {
			/* Cleanup pending entry in ORQ */
			qp->orq_put--;
			qp->orq[qp->orq_put % qp->attrs.orq_size].flags = 0;
		}
		spin_unlock_irqrestore(&qp->sq_lock, flags);
		/*
		 * immediately suspends further TX processing
		 */
		if (!qp->tx_ctx.tx_suspend)
			siw_qp_cm_drop(qp, 0);

		switch (tx_type) {
		case SIW_OP_SEND:
		case SIW_OP_SEND_REMOTE_INV:
		case SIW_OP_SEND_WITH_IMM:
		case SIW_OP_WRITE:
		case SIW_OP_READ:
		case SIW_OP_READ_LOCAL_INV:
			siw_wqe_put_mem(wqe, tx_type);
			/* Fall through */

		case SIW_OP_INVAL_STAG:
		case SIW_OP_REG_MR:
			siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
					 SIW_WC_LOC_QP_OP_ERR);

			siw_qp_event(qp, IB_EVENT_QP_FATAL);

			break;

		case SIW_OP_READ_RESPONSE:
			siw_dbg_qp(qp, "proc. read.response failed: %d\n", rv);

			siw_qp_event(qp, IB_EVENT_QP_REQ_ERR);

			siw_wqe_put_mem(wqe, SIW_OP_READ_RESPONSE);

			break;

		default:
			WARN(1, "undefined WQE type %d\n", tx_type);
			rv = -EINVAL;
		}
		wqe->wr_status = SIW_WR_IDLE;
	}
done:
	return rv;
}
static void siw_sq_resume(struct siw_qp *qp)
{
	if (down_read_trylock(&qp->state_lock)) {
		if (likely(qp->attrs.state == SIW_QP_STATE_RTS &&
			   !qp->tx_ctx.tx_suspend)) {
			int rv = siw_qp_sq_process(qp);

			up_read(&qp->state_lock);

			if (unlikely(rv < 0)) {
				siw_dbg_qp(qp, "SQ task failed: err %d\n", rv);

				if (!qp->tx_ctx.tx_suspend)
					siw_qp_cm_drop(qp, 0);
			}
		} else {
			up_read(&qp->state_lock);
		}
	} else {
		siw_dbg_qp(qp, "Resume SQ while QP locked\n");
	}
	siw_qp_put(qp);
}
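/*
 * Per-CPU SQ processing infrastructure: each TX kthread owns a lockless
 * list of QPs with pending SQ work plus a wait queue to wake it up.
 * siw_sq_start() enqueues a QP here, siw_run_sq() drains the list and
 * calls siw_sq_resume() for each QP.
 */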
struct tx_task_t {
	struct llist_head active;
	wait_queue_head_t waiting;
};

static DEFINE_PER_CPU(struct tx_task_t, siw_tx_task_g);

void siw_stop_tx_thread(int nr_cpu)
{
	kthread_stop(siw_tx_thread[nr_cpu]);
	wake_up(&per_cpu(siw_tx_task_g, nr_cpu).waiting);
}
int siw_run_sq(void *data)
{
	const int nr_cpu = (unsigned int)(long)data;
	struct llist_node *active;
	struct siw_qp *qp;
	struct tx_task_t *tx_task = &per_cpu(siw_tx_task_g, nr_cpu);

	init_llist_head(&tx_task->active);
	init_waitqueue_head(&tx_task->waiting);

	while (1) {
		struct llist_node *fifo_list = NULL;

		wait_event_interruptible(tx_task->waiting,
					 !llist_empty(&tx_task->active) ||
						 kthread_should_stop());

		if (kthread_should_stop())
			break;

		active = llist_del_all(&tx_task->active);
		/*
		 * llist_del_all returns a list with newest entry first.
		 * Re-order list for fairness among QP's.
		 */
		while (active) {
			struct llist_node *tmp = active;

			active = llist_next(active);
			tmp->next = fifo_list;
			fifo_list = tmp;
		}
		while (fifo_list) {
			qp = container_of(fifo_list, struct siw_qp, tx_list);
			fifo_list = llist_next(fifo_list);
			qp->tx_list.next = NULL;

			siw_sq_resume(qp);
		}
	}
	active = llist_del_all(&tx_task->active);
	if (active) {
		llist_for_each_entry(qp, active, tx_list) {
			qp->tx_list.next = NULL;
			siw_sq_resume(qp);
		}
	}
	return 0;
}
int siw_sq_start(struct siw_qp *qp)
{
	if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
		return 0;

	if (unlikely(!cpu_online(qp->tx_cpu))) {
		siw_put_tx_cpu(qp->tx_cpu);
		qp->tx_cpu = siw_get_tx_cpu(qp->sdev);
		if (qp->tx_cpu < 0) {
			pr_warn("siw: no tx cpu available\n");

			return -EIO;
		}
	}
	siw_qp_get(qp);

	llist_add(&qp->tx_list, &per_cpu(siw_tx_task_g, qp->tx_cpu).active);

	wake_up(&per_cpu(siw_tx_task_g, qp->tx_cpu).waiting);

	return 0;
}