1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4 /* Copyright (c) 2008-2019, IBM Corporation */
6 #include <linux/errno.h>
7 #include <linux/types.h>
9 #include <linux/scatterlist.h>
10 #include <linux/highmem.h>
12 #include <rdma/iw_cm.h>
13 #include <rdma/ib_verbs.h>
16 #include "siw_verbs.h"
22 * Receive data of @len into target referenced by @dest_addr.
24 * @srx: Receive Context
25 * @umem: siw representation of target memory
26 * @dest_addr: user virtual address
27 * @len: number of bytes to place
29 static int siw_rx_umem(struct siw_rx_stream
*srx
, struct siw_umem
*umem
,
30 u64 dest_addr
, int len
)
36 int pg_off
, bytes
, rv
;
39 p
= siw_get_upage(umem
, dest_addr
);
41 pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
42 __func__
, qp_id(rx_qp(srx
)),
43 (void *)(uintptr_t)dest_addr
,
44 (void *)(uintptr_t)umem
->fp_addr
);
45 /* siw internal error */
46 srx
->skb_copied
+= copied
;
47 srx
->skb_new
-= copied
;
51 pg_off
= dest_addr
& ~PAGE_MASK
;
52 bytes
= min(len
, (int)PAGE_SIZE
- pg_off
);
54 siw_dbg_qp(rx_qp(srx
), "page %pK, bytes=%u\n", p
, bytes
);
56 dest
= kmap_atomic(p
);
57 rv
= skb_copy_bits(srx
->skb
, srx
->skb_offset
, dest
+ pg_off
,
62 srx
->skb_copied
+= copied
;
63 srx
->skb_new
-= copied
;
65 pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
66 qp_id(rx_qp(srx
)), __func__
, len
, p
, rv
);
70 if (srx
->mpa_crc_hd
) {
71 if (rdma_is_kernel_res(&rx_qp(srx
)->base_qp
.res
)) {
72 crypto_shash_update(srx
->mpa_crc_hd
,
73 (u8
*)(dest
+ pg_off
), bytes
);
78 * Do CRC on original, not target buffer.
79 * Some user land applications may
80 * concurrently write the target buffer,
81 * which would yield a broken CRC.
82 * Walking the skb twice is very inefficient.
83 * Folding the CRC into skb_copy_bits()
84 * would be much better, but is currently
87 siw_crc_skb(srx
, bytes
);
92 srx
->skb_offset
+= bytes
;
98 srx
->skb_copied
+= copied
;
99 srx
->skb_new
-= copied
;
104 static int siw_rx_kva(struct siw_rx_stream
*srx
, void *kva
, int len
)
108 siw_dbg_qp(rx_qp(srx
), "kva: 0x%pK, len: %u\n", kva
, len
);
110 rv
= skb_copy_bits(srx
->skb
, srx
->skb_offset
, kva
, len
);
112 pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
113 qp_id(rx_qp(srx
)), __func__
, len
, kva
, rv
);
118 crypto_shash_update(srx
->mpa_crc_hd
, (u8
*)kva
, len
);
120 srx
->skb_offset
+= len
;
121 srx
->skb_copied
+= len
;
127 static int siw_rx_pbl(struct siw_rx_stream
*srx
, int *pbl_idx
,
128 struct siw_mem
*mem
, u64 addr
, int len
)
130 struct siw_pbl
*pbl
= mem
->pbl
;
131 u64 offset
= addr
- mem
->va
;
136 dma_addr_t buf_addr
=
137 siw_pbl_get_buffer(pbl
, offset
, &bytes
, pbl_idx
);
141 bytes
= min(bytes
, len
);
142 if (siw_rx_kva(srx
, (void *)(uintptr_t)buf_addr
, bytes
) ==
155 * siw_rresp_check_ntoh()
157 * Check incoming RRESP fragment header against expected
158 * header values and update expected values for potential next
161 * NOTE: This function must be called only if a RRESP DDP segment
162 * starts but not for fragmented consecutive pieces of an
163 * already started DDP segment.
165 static int siw_rresp_check_ntoh(struct siw_rx_stream
*srx
,
166 struct siw_rx_fpdu
*frx
)
168 struct iwarp_rdma_rresp
*rresp
= &srx
->hdr
.rresp
;
169 struct siw_wqe
*wqe
= &frx
->wqe_active
;
170 enum ddp_ecode ecode
;
172 u32 sink_stag
= be32_to_cpu(rresp
->sink_stag
);
173 u64 sink_to
= be64_to_cpu(rresp
->sink_to
);
175 if (frx
->first_ddp_seg
) {
176 srx
->ddp_stag
= wqe
->sqe
.sge
[0].lkey
;
177 srx
->ddp_to
= wqe
->sqe
.sge
[0].laddr
;
180 /* Below checks extend beyond the semantics of DDP, and
182 * We check if the read response matches exactly the
183 * read request which was sent to the remote peer to
184 * trigger this read response. RFC5040/5041 do not
185 * always have a proper error code for the detected
186 * error cases. We choose 'base or bounds error' for
187 * cases where the inbound STag is valid, but offset
188 * or length do not match our response receive state.
190 if (unlikely(srx
->ddp_stag
!= sink_stag
)) {
191 pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
192 qp_id(rx_qp(srx
)), sink_stag
, srx
->ddp_stag
);
193 ecode
= DDP_ECODE_T_INVALID_STAG
;
196 if (unlikely(srx
->ddp_to
!= sink_to
)) {
197 pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
198 qp_id(rx_qp(srx
)), (unsigned long long)sink_to
,
199 (unsigned long long)srx
->ddp_to
);
200 ecode
= DDP_ECODE_T_BASE_BOUNDS
;
203 if (unlikely(!frx
->more_ddp_segs
&&
204 (wqe
->processed
+ srx
->fpdu_part_rem
!= wqe
->bytes
))) {
205 pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
207 wqe
->processed
+ srx
->fpdu_part_rem
, wqe
->bytes
);
208 ecode
= DDP_ECODE_T_BASE_BOUNDS
;
213 siw_init_terminate(rx_qp(srx
), TERM_ERROR_LAYER_DDP
,
214 DDP_ETYPE_TAGGED_BUF
, ecode
, 0);
219 * siw_write_check_ntoh()
221 * Check incoming WRITE fragment header against expected
222 * header values and update expected values for potential next
225 * NOTE: This function must be called only if a WRITE DDP segment
226 * starts but not for fragmented consecutive pieces of an
227 * already started DDP segment.
229 static int siw_write_check_ntoh(struct siw_rx_stream
*srx
,
230 struct siw_rx_fpdu
*frx
)
232 struct iwarp_rdma_write
*write
= &srx
->hdr
.rwrite
;
233 enum ddp_ecode ecode
;
235 u32 sink_stag
= be32_to_cpu(write
->sink_stag
);
236 u64 sink_to
= be64_to_cpu(write
->sink_to
);
238 if (frx
->first_ddp_seg
) {
239 srx
->ddp_stag
= sink_stag
;
240 srx
->ddp_to
= sink_to
;
243 if (unlikely(srx
->ddp_stag
!= sink_stag
)) {
244 pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
245 qp_id(rx_qp(srx
)), sink_stag
,
247 ecode
= DDP_ECODE_T_INVALID_STAG
;
250 if (unlikely(srx
->ddp_to
!= sink_to
)) {
251 pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
253 (unsigned long long)sink_to
,
254 (unsigned long long)srx
->ddp_to
);
255 ecode
= DDP_ECODE_T_BASE_BOUNDS
;
261 siw_init_terminate(rx_qp(srx
), TERM_ERROR_LAYER_DDP
,
262 DDP_ETYPE_TAGGED_BUF
, ecode
, 0);
267 * siw_send_check_ntoh()
269 * Check incoming SEND fragment header against expected
270 * header values and update expected MSN if no next
273 * NOTE: This function must be called only if a SEND DDP segment
274 * starts but not for fragmented consecutive pieces of an
275 * already started DDP segment.
277 static int siw_send_check_ntoh(struct siw_rx_stream
*srx
,
278 struct siw_rx_fpdu
*frx
)
280 struct iwarp_send_inv
*send
= &srx
->hdr
.send_inv
;
281 struct siw_wqe
*wqe
= &frx
->wqe_active
;
282 enum ddp_ecode ecode
;
284 u32 ddp_msn
= be32_to_cpu(send
->ddp_msn
);
285 u32 ddp_mo
= be32_to_cpu(send
->ddp_mo
);
286 u32 ddp_qn
= be32_to_cpu(send
->ddp_qn
);
288 if (unlikely(ddp_qn
!= RDMAP_UNTAGGED_QN_SEND
)) {
289 pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
290 qp_id(rx_qp(srx
)), ddp_qn
);
291 ecode
= DDP_ECODE_UT_INVALID_QN
;
294 if (unlikely(ddp_msn
!= srx
->ddp_msn
[RDMAP_UNTAGGED_QN_SEND
])) {
295 pr_warn("siw: [QP %u]: send msn: %u != %u\n",
296 qp_id(rx_qp(srx
)), ddp_msn
,
297 srx
->ddp_msn
[RDMAP_UNTAGGED_QN_SEND
]);
298 ecode
= DDP_ECODE_UT_INVALID_MSN_RANGE
;
301 if (unlikely(ddp_mo
!= wqe
->processed
)) {
302 pr_warn("siw: [QP %u], send mo: %u != %u\n",
303 qp_id(rx_qp(srx
)), ddp_mo
, wqe
->processed
);
304 ecode
= DDP_ECODE_UT_INVALID_MO
;
307 if (frx
->first_ddp_seg
) {
308 /* initialize user memory write position */
313 /* only valid for SEND_INV and SEND_SE_INV operations */
314 srx
->inval_stag
= be32_to_cpu(send
->inval_stag
);
316 if (unlikely(wqe
->bytes
< wqe
->processed
+ srx
->fpdu_part_rem
)) {
317 siw_dbg_qp(rx_qp(srx
), "receive space short: %d - %d < %d\n",
318 wqe
->bytes
, wqe
->processed
, srx
->fpdu_part_rem
);
319 wqe
->wc_status
= SIW_WC_LOC_LEN_ERR
;
320 ecode
= DDP_ECODE_UT_INVALID_MSN_NOBUF
;
325 siw_init_terminate(rx_qp(srx
), TERM_ERROR_LAYER_DDP
,
326 DDP_ETYPE_UNTAGGED_BUF
, ecode
, 0);
330 static struct siw_wqe
*siw_rqe_get(struct siw_qp
*qp
)
334 struct siw_wqe
*wqe
= NULL
;
335 bool srq_event
= false;
340 spin_lock_irqsave(&srq
->lock
, flags
);
341 if (unlikely(!srq
->num_rqe
))
344 rqe
= &srq
->recvq
[srq
->rq_get
% srq
->num_rqe
];
346 if (unlikely(!qp
->recvq
))
349 rqe
= &qp
->recvq
[qp
->rq_get
% qp
->attrs
.rq_size
];
351 if (likely(rqe
->flags
== SIW_WQE_VALID
)) {
352 int num_sge
= rqe
->num_sge
;
354 if (likely(num_sge
<= SIW_MAX_SGE
)) {
357 wqe
= rx_wqe(&qp
->rx_untagged
);
358 rx_type(wqe
) = SIW_OP_RECEIVE
;
359 wqe
->wr_status
= SIW_WR_INPROGRESS
;
363 wqe
->rqe
.id
= rqe
->id
;
364 wqe
->rqe
.num_sge
= num_sge
;
366 while (i
< num_sge
) {
367 wqe
->rqe
.sge
[i
].laddr
= rqe
->sge
[i
].laddr
;
368 wqe
->rqe
.sge
[i
].lkey
= rqe
->sge
[i
].lkey
;
369 wqe
->rqe
.sge
[i
].length
= rqe
->sge
[i
].length
;
370 wqe
->bytes
+= wqe
->rqe
.sge
[i
].length
;
374 /* can be re-used by appl */
375 smp_store_mb(rqe
->flags
, 0);
377 siw_dbg_qp(qp
, "too many sge's: %d\n", rqe
->num_sge
);
379 spin_unlock_irqrestore(&srq
->lock
, flags
);
387 u32 off
= (srq
->rq_get
+ srq
->limit
) %
389 struct siw_rqe
*rqe2
= &srq
->recvq
[off
];
391 if (!(rqe2
->flags
& SIW_WQE_VALID
)) {
401 spin_unlock_irqrestore(&srq
->lock
, flags
);
403 siw_srq_event(srq
, IB_EVENT_SRQ_LIMIT_REACHED
);
411 * Process one incoming SEND and place data into memory referenced by
414 * Function supports partially received sends (suspending/resuming
415 * current receive wqe processing)
418 * 0: reached the end of a DDP segment
419 * -EAGAIN: to be called again to finish the DDP segment
421 int siw_proc_send(struct siw_qp
*qp
)
423 struct siw_rx_stream
*srx
= &qp
->rx_stream
;
424 struct siw_rx_fpdu
*frx
= &qp
->rx_untagged
;
426 u32 data_bytes
; /* all data bytes available */
427 u32 rcvd_bytes
; /* sum of data bytes rcvd */
430 if (frx
->first_ddp_seg
) {
431 wqe
= siw_rqe_get(qp
);
432 if (unlikely(!wqe
)) {
433 siw_init_terminate(qp
, TERM_ERROR_LAYER_DDP
,
434 DDP_ETYPE_UNTAGGED_BUF
,
435 DDP_ECODE_UT_INVALID_MSN_NOBUF
, 0);
441 if (srx
->state
== SIW_GET_DATA_START
) {
442 rv
= siw_send_check_ntoh(srx
, frx
);
444 siw_qp_event(qp
, IB_EVENT_QP_FATAL
);
447 if (!srx
->fpdu_part_rem
) /* zero length SEND */
450 data_bytes
= min(srx
->fpdu_part_rem
, srx
->skb_new
);
453 /* A zero length SEND will skip below loop */
456 struct siw_mem
**mem
, *mem_p
;
458 u32 sge_bytes
; /* data bytes avail for SGE */
460 sge
= &wqe
->rqe
.sge
[frx
->sge_idx
];
463 /* just skip empty sge's */
469 sge_bytes
= min(data_bytes
, sge
->length
- frx
->sge_off
);
470 mem
= &wqe
->mem
[frx
->sge_idx
];
473 * check with QP's PD if no SRQ present, SRQ's PD otherwise
475 pd
= qp
->srq
== NULL
? qp
->pd
: qp
->srq
->base_srq
.pd
;
477 rv
= siw_check_sge(pd
, sge
, mem
, IB_ACCESS_LOCAL_WRITE
,
478 frx
->sge_off
, sge_bytes
);
480 siw_init_terminate(qp
, TERM_ERROR_LAYER_DDP
,
481 DDP_ETYPE_CATASTROPHIC
,
482 DDP_ECODE_CATASTROPHIC
, 0);
484 siw_qp_event(qp
, IB_EVENT_QP_ACCESS_ERR
);
488 if (mem_p
->mem_obj
== NULL
)
490 (void *)(uintptr_t)(sge
->laddr
+ frx
->sge_off
),
492 else if (!mem_p
->is_pbl
)
493 rv
= siw_rx_umem(srx
, mem_p
->umem
,
494 sge
->laddr
+ frx
->sge_off
, sge_bytes
);
496 rv
= siw_rx_pbl(srx
, &frx
->pbl_idx
, mem_p
,
497 sge
->laddr
+ frx
->sge_off
, sge_bytes
);
499 if (unlikely(rv
!= sge_bytes
)) {
500 wqe
->processed
+= rcvd_bytes
;
502 siw_init_terminate(qp
, TERM_ERROR_LAYER_DDP
,
503 DDP_ETYPE_CATASTROPHIC
,
504 DDP_ECODE_CATASTROPHIC
, 0);
509 if (frx
->sge_off
== sge
->length
) {
517 srx
->fpdu_part_rem
-= rv
;
518 srx
->fpdu_part_rcvd
+= rv
;
520 wqe
->processed
+= rcvd_bytes
;
522 if (!srx
->fpdu_part_rem
)
525 return (rv
< 0) ? rv
: -EAGAIN
;
531 * Place incoming WRITE after referencing and checking target buffer
533 * Function supports partially received WRITEs (suspending/resuming
534 * current receive processing)
537 * 0: reached the end of a DDP segment
538 * -EAGAIN: to be called again to finish the DDP segment
540 int siw_proc_write(struct siw_qp
*qp
)
542 struct siw_rx_stream
*srx
= &qp
->rx_stream
;
543 struct siw_rx_fpdu
*frx
= &qp
->rx_tagged
;
547 if (srx
->state
== SIW_GET_DATA_START
) {
548 if (!srx
->fpdu_part_rem
) /* zero length WRITE */
551 rv
= siw_write_check_ntoh(srx
, frx
);
553 siw_qp_event(qp
, IB_EVENT_QP_FATAL
);
557 bytes
= min(srx
->fpdu_part_rem
, srx
->skb_new
);
559 if (frx
->first_ddp_seg
) {
560 struct siw_wqe
*wqe
= rx_wqe(frx
);
562 rx_mem(frx
) = siw_mem_id2obj(qp
->sdev
, srx
->ddp_stag
>> 8);
563 if (unlikely(!rx_mem(frx
))) {
565 "sink stag not found/invalid, stag 0x%08x\n",
568 siw_init_terminate(qp
, TERM_ERROR_LAYER_DDP
,
569 DDP_ETYPE_TAGGED_BUF
,
570 DDP_ECODE_T_INVALID_STAG
, 0);
573 wqe
->rqe
.num_sge
= 1;
574 rx_type(wqe
) = SIW_OP_WRITE
;
575 wqe
->wr_status
= SIW_WR_INPROGRESS
;
580 * Check if application re-registered memory with different
583 if (unlikely(mem
->stag
!= srx
->ddp_stag
)) {
584 siw_init_terminate(qp
, TERM_ERROR_LAYER_DDP
,
585 DDP_ETYPE_TAGGED_BUF
,
586 DDP_ECODE_T_INVALID_STAG
, 0);
589 rv
= siw_check_mem(qp
->pd
, mem
, srx
->ddp_to
+ srx
->fpdu_part_rcvd
,
590 IB_ACCESS_REMOTE_WRITE
, bytes
);
592 siw_init_terminate(qp
, TERM_ERROR_LAYER_DDP
,
593 DDP_ETYPE_TAGGED_BUF
, siw_tagged_error(-rv
),
596 siw_qp_event(qp
, IB_EVENT_QP_ACCESS_ERR
);
601 if (mem
->mem_obj
== NULL
)
603 (void *)(uintptr_t)(srx
->ddp_to
+ srx
->fpdu_part_rcvd
),
605 else if (!mem
->is_pbl
)
606 rv
= siw_rx_umem(srx
, mem
->umem
,
607 srx
->ddp_to
+ srx
->fpdu_part_rcvd
, bytes
);
609 rv
= siw_rx_pbl(srx
, &frx
->pbl_idx
, mem
,
610 srx
->ddp_to
+ srx
->fpdu_part_rcvd
, bytes
);
612 if (unlikely(rv
!= bytes
)) {
613 siw_init_terminate(qp
, TERM_ERROR_LAYER_DDP
,
614 DDP_ETYPE_CATASTROPHIC
,
615 DDP_ECODE_CATASTROPHIC
, 0);
618 srx
->fpdu_part_rem
-= rv
;
619 srx
->fpdu_part_rcvd
+= rv
;
621 if (!srx
->fpdu_part_rem
) {
622 srx
->ddp_to
+= srx
->fpdu_part_rcvd
;
629 * Inbound RREQ's cannot carry user data.
631 int siw_proc_rreq(struct siw_qp
*qp
)
633 struct siw_rx_stream
*srx
= &qp
->rx_stream
;
635 if (!srx
->fpdu_part_rem
)
638 pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp
),
639 be16_to_cpu(srx
->hdr
.ctrl
.mpa_len
));
647 * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
648 * Put it at the tail of the IRQ, if there is another WQE currently in
649 * transmit processing. If not, make it the current WQE to be processed
650 * and schedule transmit processing.
652 * Can be called from softirq context and from process
653 * context (RREAD socket loopback case!)
657 * failure code otherwise
660 static int siw_init_rresp(struct siw_qp
*qp
, struct siw_rx_stream
*srx
)
662 struct siw_wqe
*tx_work
= tx_wqe(qp
);
663 struct siw_sqe
*resp
;
665 uint64_t raddr
= be64_to_cpu(srx
->hdr
.rreq
.sink_to
),
666 laddr
= be64_to_cpu(srx
->hdr
.rreq
.source_to
);
667 uint32_t length
= be32_to_cpu(srx
->hdr
.rreq
.read_size
),
668 lkey
= be32_to_cpu(srx
->hdr
.rreq
.source_stag
),
669 rkey
= be32_to_cpu(srx
->hdr
.rreq
.sink_stag
),
670 msn
= be32_to_cpu(srx
->hdr
.rreq
.ddp_msn
);
672 int run_sq
= 1, rv
= 0;
675 if (unlikely(msn
!= srx
->ddp_msn
[RDMAP_UNTAGGED_QN_RDMA_READ
])) {
676 siw_init_terminate(qp
, TERM_ERROR_LAYER_DDP
,
677 DDP_ETYPE_UNTAGGED_BUF
,
678 DDP_ECODE_UT_INVALID_MSN_RANGE
, 0);
681 spin_lock_irqsave(&qp
->sq_lock
, flags
);
683 if (tx_work
->wr_status
== SIW_WR_IDLE
) {
685 * immediately schedule READ response w/o
686 * consuming IRQ entry: IRQ must be empty.
688 tx_work
->processed
= 0;
689 tx_work
->mem
[0] = NULL
;
690 tx_work
->wr_status
= SIW_WR_QUEUED
;
691 resp
= &tx_work
->sqe
;
693 resp
= irq_alloc_free(qp
);
697 resp
->opcode
= SIW_OP_READ_RESPONSE
;
699 resp
->sge
[0].length
= length
;
700 resp
->sge
[0].laddr
= laddr
;
701 resp
->sge
[0].lkey
= lkey
;
703 /* Keep aside message sequence number for potential
704 * error reporting during Read Response generation.
706 resp
->sge
[1].length
= msn
;
710 resp
->num_sge
= length
? 1 : 0;
712 /* RRESP now valid as current TX wqe or placed into IRQ */
713 smp_store_mb(resp
->flags
, SIW_WQE_VALID
);
715 pr_warn("siw: [QP %u]: irq %d exceeded %d\n", qp_id(qp
),
716 qp
->irq_put
% qp
->attrs
.irq_size
, qp
->attrs
.irq_size
);
718 siw_init_terminate(qp
, TERM_ERROR_LAYER_RDMAP
,
719 RDMAP_ETYPE_REMOTE_OPERATION
,
720 RDMAP_ECODE_CATASTROPHIC_STREAM
, 0);
724 spin_unlock_irqrestore(&qp
->sq_lock
, flags
);
727 rv
= siw_sq_start(qp
);
733 * Only called at start of Read.Response processing.
734 * Transfer pending Read from tip of ORQ into current rx wqe,
735 * but keep ORQ entry valid until Read.Response processing done.
736 * No Queue locking needed.
738 static int siw_orqe_start_rx(struct siw_qp
*qp
)
740 struct siw_sqe
*orqe
;
741 struct siw_wqe
*wqe
= NULL
;
743 /* make sure ORQ indices are current */
746 orqe
= orq_get_current(qp
);
747 if (READ_ONCE(orqe
->flags
) & SIW_WQE_VALID
) {
748 /* RRESP is a TAGGED RDMAP operation */
749 wqe
= rx_wqe(&qp
->rx_tagged
);
750 wqe
->sqe
.id
= orqe
->id
;
751 wqe
->sqe
.opcode
= orqe
->opcode
;
752 wqe
->sqe
.sge
[0].laddr
= orqe
->sge
[0].laddr
;
753 wqe
->sqe
.sge
[0].lkey
= orqe
->sge
[0].lkey
;
754 wqe
->sqe
.sge
[0].length
= orqe
->sge
[0].length
;
755 wqe
->sqe
.flags
= orqe
->flags
;
756 wqe
->sqe
.num_sge
= 1;
757 wqe
->bytes
= orqe
->sge
[0].length
;
760 /* make sure WQE is completely written before valid */
762 wqe
->wr_status
= SIW_WR_INPROGRESS
;
772 * Place incoming RRESP data into memory referenced by RREQ WQE
773 * which is at the tip of the ORQ
775 * Function supports partially received RRESP's (suspending/resuming
776 * current receive processing)
778 int siw_proc_rresp(struct siw_qp
*qp
)
780 struct siw_rx_stream
*srx
= &qp
->rx_stream
;
781 struct siw_rx_fpdu
*frx
= &qp
->rx_tagged
;
782 struct siw_wqe
*wqe
= rx_wqe(frx
);
783 struct siw_mem
**mem
, *mem_p
;
787 if (frx
->first_ddp_seg
) {
788 if (unlikely(wqe
->wr_status
!= SIW_WR_IDLE
)) {
789 pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
790 qp_id(qp
), wqe
->wr_status
, wqe
->sqe
.opcode
);
795 * fetch pending RREQ from orq
797 rv
= siw_orqe_start_rx(qp
);
799 pr_warn("siw: [QP %u]: ORQ empty at idx %d\n",
800 qp_id(qp
), qp
->orq_get
% qp
->attrs
.orq_size
);
803 rv
= siw_rresp_check_ntoh(srx
, frx
);
805 siw_qp_event(qp
, IB_EVENT_QP_FATAL
);
809 if (unlikely(wqe
->wr_status
!= SIW_WR_INPROGRESS
)) {
810 pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
811 qp_id(qp
), wqe
->wr_status
);
816 if (!srx
->fpdu_part_rem
) /* zero length RRESPONSE */
819 sge
= wqe
->sqe
.sge
; /* there is only one */
824 * check target memory which resolves memory on first fragment
826 rv
= siw_check_sge(qp
->pd
, sge
, mem
, IB_ACCESS_LOCAL_WRITE
, 0,
829 siw_dbg_qp(qp
, "target mem check: %d\n", rv
);
830 wqe
->wc_status
= SIW_WC_LOC_PROT_ERR
;
832 siw_init_terminate(qp
, TERM_ERROR_LAYER_DDP
,
833 DDP_ETYPE_TAGGED_BUF
,
834 siw_tagged_error(-rv
), 0);
836 siw_qp_event(qp
, IB_EVENT_QP_ACCESS_ERR
);
843 bytes
= min(srx
->fpdu_part_rem
, srx
->skb_new
);
845 if (mem_p
->mem_obj
== NULL
)
847 (void *)(uintptr_t)(sge
->laddr
+ wqe
->processed
),
849 else if (!mem_p
->is_pbl
)
850 rv
= siw_rx_umem(srx
, mem_p
->umem
, sge
->laddr
+ wqe
->processed
,
853 rv
= siw_rx_pbl(srx
, &frx
->pbl_idx
, mem_p
,
854 sge
->laddr
+ wqe
->processed
, bytes
);
856 wqe
->wc_status
= SIW_WC_GENERAL_ERR
;
860 srx
->fpdu_part_rem
-= rv
;
861 srx
->fpdu_part_rcvd
+= rv
;
862 wqe
->processed
+= rv
;
864 if (!srx
->fpdu_part_rem
) {
865 srx
->ddp_to
+= srx
->fpdu_part_rcvd
;
871 siw_init_terminate(qp
, TERM_ERROR_LAYER_DDP
, DDP_ETYPE_CATASTROPHIC
,
872 DDP_ECODE_CATASTROPHIC
, 0);
876 int siw_proc_terminate(struct siw_qp
*qp
)
878 struct siw_rx_stream
*srx
= &qp
->rx_stream
;
879 struct sk_buff
*skb
= srx
->skb
;
880 struct iwarp_terminate
*term
= &srx
->hdr
.terminate
;
881 union iwarp_hdr term_info
;
882 u8
*infop
= (u8
*)&term_info
;
884 u16 to_copy
= sizeof(struct iwarp_ctrl
);
886 pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
887 __rdmap_term_layer(term
), __rdmap_term_etype(term
),
888 __rdmap_term_ecode(term
));
890 if (be32_to_cpu(term
->ddp_qn
) != RDMAP_UNTAGGED_QN_TERMINATE
||
891 be32_to_cpu(term
->ddp_msn
) !=
892 qp
->rx_stream
.ddp_msn
[RDMAP_UNTAGGED_QN_TERMINATE
] ||
893 be32_to_cpu(term
->ddp_mo
) != 0) {
894 pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
895 be32_to_cpu(term
->ddp_qn
), be32_to_cpu(term
->ddp_msn
),
896 be32_to_cpu(term
->ddp_mo
));
900 * Receive remaining pieces of TERM if indicated
905 /* Do not take the effort to reassemble a network fragmented
908 if (srx
->skb_new
< sizeof(struct iwarp_ctrl_tagged
))
911 memset(infop
, 0, sizeof(term_info
));
913 skb_copy_bits(skb
, srx
->skb_offset
, infop
, to_copy
);
915 op
= __rdmap_get_opcode(&term_info
.ctrl
);
916 if (op
>= RDMAP_TERMINATE
)
920 srx
->skb_offset
+= to_copy
;
921 srx
->skb_new
-= to_copy
;
922 srx
->skb_copied
+= to_copy
;
923 srx
->fpdu_part_rcvd
+= to_copy
;
924 srx
->fpdu_part_rem
-= to_copy
;
926 to_copy
= iwarp_pktinfo
[op
].hdr_len
- to_copy
;
928 /* Again, no network fragmented TERM's */
929 if (to_copy
+ MPA_CRC_SIZE
> srx
->skb_new
)
932 skb_copy_bits(skb
, srx
->skb_offset
, infop
, to_copy
);
935 siw_dbg_qp(qp
, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
936 op
, be16_to_cpu(term_info
.ctrl
.mpa_len
),
937 term
->flag_m
? "valid" : "invalid");
938 } else if (term
->flag_d
) {
939 siw_dbg_qp(qp
, "TERM reports DDP hdr type %u, len %u (%s)\n",
940 op
, be16_to_cpu(term_info
.ctrl
.mpa_len
),
941 term
->flag_m
? "valid" : "invalid");
944 srx
->skb_new
-= to_copy
;
945 srx
->skb_offset
+= to_copy
;
946 srx
->skb_copied
+= to_copy
;
947 srx
->fpdu_part_rcvd
+= to_copy
;
948 srx
->fpdu_part_rem
-= to_copy
;
953 static int siw_get_trailer(struct siw_qp
*qp
, struct siw_rx_stream
*srx
)
955 struct sk_buff
*skb
= srx
->skb
;
956 u8
*tbuf
= (u8
*)&srx
->trailer
.crc
- srx
->pad
;
957 __wsum crc_in
, crc_own
= 0;
959 siw_dbg_qp(qp
, "expected %d, available %d, pad %u\n",
960 srx
->fpdu_part_rem
, srx
->skb_new
, srx
->pad
);
962 if (srx
->skb_new
< srx
->fpdu_part_rem
)
965 skb_copy_bits(skb
, srx
->skb_offset
, tbuf
, srx
->fpdu_part_rem
);
967 if (srx
->mpa_crc_hd
&& srx
->pad
)
968 crypto_shash_update(srx
->mpa_crc_hd
, tbuf
, srx
->pad
);
970 srx
->skb_new
-= srx
->fpdu_part_rem
;
971 srx
->skb_offset
+= srx
->fpdu_part_rem
;
972 srx
->skb_copied
+= srx
->fpdu_part_rem
;
974 if (!srx
->mpa_crc_hd
)
978 * CRC32 is computed, transmitted and received directly in NBO,
979 * so there's never a reason to convert byte order.
981 crypto_shash_final(srx
->mpa_crc_hd
, (u8
*)&crc_own
);
982 crc_in
= (__force __wsum
)srx
->trailer
.crc
;
984 if (unlikely(crc_in
!= crc_own
)) {
985 pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
986 crc_in
, crc_own
, qp
->rx_stream
.rdmap_op
);
988 siw_init_terminate(qp
, TERM_ERROR_LAYER_LLP
,
990 LLP_ECODE_RECEIVED_CRC
, 0);
996 #define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)
998 static int siw_get_hdr(struct siw_rx_stream
*srx
)
1000 struct sk_buff
*skb
= srx
->skb
;
1001 struct siw_qp
*qp
= rx_qp(srx
);
1002 struct iwarp_ctrl
*c_hdr
= &srx
->hdr
.ctrl
;
1003 struct siw_rx_fpdu
*frx
;
1007 if (srx
->fpdu_part_rcvd
< MIN_DDP_HDR
) {
1009 * copy a minimum sized (tagged) DDP frame control part
1011 bytes
= min_t(int, srx
->skb_new
,
1012 MIN_DDP_HDR
- srx
->fpdu_part_rcvd
);
1014 skb_copy_bits(skb
, srx
->skb_offset
,
1015 (char *)c_hdr
+ srx
->fpdu_part_rcvd
, bytes
);
1017 srx
->fpdu_part_rcvd
+= bytes
;
1019 srx
->skb_new
-= bytes
;
1020 srx
->skb_offset
+= bytes
;
1021 srx
->skb_copied
+= bytes
;
1023 if (srx
->fpdu_part_rcvd
< MIN_DDP_HDR
)
1026 if (unlikely(__ddp_get_version(c_hdr
) != DDP_VERSION
)) {
1027 enum ddp_etype etype
;
1028 enum ddp_ecode ecode
;
1030 pr_warn("siw: received ddp version unsupported %d\n",
1031 __ddp_get_version(c_hdr
));
1033 if (c_hdr
->ddp_rdmap_ctrl
& DDP_FLAG_TAGGED
) {
1034 etype
= DDP_ETYPE_TAGGED_BUF
;
1035 ecode
= DDP_ECODE_T_VERSION
;
1037 etype
= DDP_ETYPE_UNTAGGED_BUF
;
1038 ecode
= DDP_ECODE_UT_VERSION
;
1040 siw_init_terminate(rx_qp(srx
), TERM_ERROR_LAYER_DDP
,
1044 if (unlikely(__rdmap_get_version(c_hdr
) != RDMAP_VERSION
)) {
1045 pr_warn("siw: received rdmap version unsupported %d\n",
1046 __rdmap_get_version(c_hdr
));
1048 siw_init_terminate(rx_qp(srx
), TERM_ERROR_LAYER_RDMAP
,
1049 RDMAP_ETYPE_REMOTE_OPERATION
,
1050 RDMAP_ECODE_VERSION
, 0);
1053 opcode
= __rdmap_get_opcode(c_hdr
);
1055 if (opcode
> RDMAP_TERMINATE
) {
1056 pr_warn("siw: received unknown packet type %u\n",
1059 siw_init_terminate(rx_qp(srx
), TERM_ERROR_LAYER_RDMAP
,
1060 RDMAP_ETYPE_REMOTE_OPERATION
,
1061 RDMAP_ECODE_OPCODE
, 0);
1064 siw_dbg_qp(rx_qp(srx
), "new header, opcode %u\n", opcode
);
1066 opcode
= __rdmap_get_opcode(c_hdr
);
1068 set_rx_fpdu_context(qp
, opcode
);
1072 * Figure out len of current hdr: variable length of
1073 * iwarp hdr may force us to copy hdr information in
1074 * two steps. Only tagged DDP messages are already
1075 * completely received.
1077 if (iwarp_pktinfo
[opcode
].hdr_len
> sizeof(struct iwarp_ctrl_tagged
)) {
1078 bytes
= iwarp_pktinfo
[opcode
].hdr_len
- MIN_DDP_HDR
;
1080 if (srx
->skb_new
< bytes
)
1083 skb_copy_bits(skb
, srx
->skb_offset
,
1084 (char *)c_hdr
+ srx
->fpdu_part_rcvd
, bytes
);
1086 srx
->fpdu_part_rcvd
+= bytes
;
1088 srx
->skb_new
-= bytes
;
1089 srx
->skb_offset
+= bytes
;
1090 srx
->skb_copied
+= bytes
;
1094 * DDP/RDMAP header receive completed. Check if the current
1095 * DDP segment starts a new RDMAP message or continues a previously
1096 * started RDMAP message.
1098 * Alternating reception of DDP segments (or FPDUs) from incomplete
1099 * tagged and untagged RDMAP messages is supported, as long as
1100 * the current tagged or untagged message gets eventually completed
1101 * w/o intersection from another message of the same type
1102 * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
1103 * but not by a READ RESPONSE etc.
1105 if (srx
->mpa_crc_hd
) {
1107 * Restart CRC computation
1109 crypto_shash_init(srx
->mpa_crc_hd
);
1110 crypto_shash_update(srx
->mpa_crc_hd
, (u8
*)c_hdr
,
1111 srx
->fpdu_part_rcvd
);
1113 if (frx
->more_ddp_segs
) {
1114 frx
->first_ddp_seg
= 0;
1115 if (frx
->prev_rdmap_op
!= opcode
) {
1116 pr_warn("siw: packet intersection: %u : %u\n",
1117 frx
->prev_rdmap_op
, opcode
);
1119 * The last inbound RDMA operation of same type
1120 * (tagged or untagged) is left unfinished.
1121 * To complete it in error, make it the current
1122 * operation again, even with the header already
1123 * overwritten. For error handling, only the opcode
1124 * and current rx context are relevant.
1126 set_rx_fpdu_context(qp
, frx
->prev_rdmap_op
);
1127 __rdmap_set_opcode(c_hdr
, frx
->prev_rdmap_op
);
1131 frx
->prev_rdmap_op
= opcode
;
1132 frx
->first_ddp_seg
= 1;
1134 frx
->more_ddp_segs
= c_hdr
->ddp_rdmap_ctrl
& DDP_FLAG_LAST
? 0 : 1;
1139 static int siw_check_tx_fence(struct siw_qp
*qp
)
1141 struct siw_wqe
*tx_waiting
= tx_wqe(qp
);
1142 struct siw_sqe
*rreq
;
1143 int resume_tx
= 0, rv
= 0;
1144 unsigned long flags
;
1146 spin_lock_irqsave(&qp
->orq_lock
, flags
);
1148 rreq
= orq_get_current(qp
);
1150 /* free current orq entry */
1151 WRITE_ONCE(rreq
->flags
, 0);
1153 if (qp
->tx_ctx
.orq_fence
) {
1154 if (unlikely(tx_waiting
->wr_status
!= SIW_WR_QUEUED
)) {
1155 pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
1156 qp_id(qp
), tx_waiting
->wr_status
);
1160 /* resume SQ processing */
1161 if (tx_waiting
->sqe
.opcode
== SIW_OP_READ
||
1162 tx_waiting
->sqe
.opcode
== SIW_OP_READ_LOCAL_INV
) {
1163 rreq
= orq_get_tail(qp
);
1164 if (unlikely(!rreq
)) {
1165 pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp
));
1169 siw_read_to_orq(rreq
, &tx_waiting
->sqe
);
1172 qp
->tx_ctx
.orq_fence
= 0;
1175 } else if (siw_orq_empty(qp
)) {
1176 qp
->tx_ctx
.orq_fence
= 0;
1179 pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n",
1180 qp_id(qp
), qp
->orq_get
, qp
->orq_put
);
1186 spin_unlock_irqrestore(&qp
->orq_lock
, flags
);
1189 rv
= siw_sq_start(qp
);
1195 * siw_rdmap_complete()
1197 * Complete processing of an RDMA message after receiving all
1198 * DDP segments or abort processing after encountering error case.
1200 * o SENDs + RRESPs will need for completion,
1201 * o RREQs need for READ RESPONSE initialization
1202 * o WRITEs need memory dereferencing
1204 * TODO: Failed WRITEs need local error to be surfaced.
1206 static int siw_rdmap_complete(struct siw_qp
*qp
, int error
)
1208 struct siw_rx_stream
*srx
= &qp
->rx_stream
;
1209 struct siw_wqe
*wqe
= rx_wqe(qp
->rx_fpdu
);
1210 enum siw_wc_status wc_status
= wqe
->wc_status
;
1211 u8 opcode
= __rdmap_get_opcode(&srx
->hdr
.ctrl
);
1216 case RDMAP_SEND_SE_INVAL
:
1217 wqe
->rqe
.flags
|= SIW_WQE_SOLICITED
;
1221 case RDMAP_SEND_INVAL
:
1222 if (wqe
->wr_status
== SIW_WR_IDLE
)
1225 srx
->ddp_msn
[RDMAP_UNTAGGED_QN_SEND
]++;
1227 if (error
!= 0 && wc_status
== SIW_WC_SUCCESS
)
1228 wc_status
= SIW_WC_GENERAL_ERR
;
1230 * Handle STag invalidation request
1232 if (wc_status
== SIW_WC_SUCCESS
&&
1233 (opcode
== RDMAP_SEND_INVAL
||
1234 opcode
== RDMAP_SEND_SE_INVAL
)) {
1235 rv
= siw_invalidate_stag(qp
->pd
, srx
->inval_stag
);
1238 qp
, TERM_ERROR_LAYER_RDMAP
,
1240 RDMAP_ETYPE_REMOTE_PROTECTION
:
1241 RDMAP_ETYPE_REMOTE_OPERATION
,
1242 RDMAP_ECODE_CANNOT_INVALIDATE
, 0);
1244 wc_status
= SIW_WC_REM_INV_REQ_ERR
;
1246 rv
= siw_rqe_complete(qp
, &wqe
->rqe
, wqe
->processed
,
1247 rv
? 0 : srx
->inval_stag
,
1250 rv
= siw_rqe_complete(qp
, &wqe
->rqe
, wqe
->processed
,
1253 siw_wqe_put_mem(wqe
, SIW_OP_RECEIVE
);
1256 case RDMAP_RDMA_READ_RESP
:
1257 if (wqe
->wr_status
== SIW_WR_IDLE
)
1261 if ((srx
->state
== SIW_GET_HDR
&&
1262 qp
->rx_fpdu
->first_ddp_seg
) || error
== -ENODATA
)
1263 /* possible RREQ in ORQ left untouched */
1266 if (wc_status
== SIW_WC_SUCCESS
)
1267 wc_status
= SIW_WC_GENERAL_ERR
;
1268 } else if (rdma_is_kernel_res(&qp
->base_qp
.res
) &&
1269 rx_type(wqe
) == SIW_OP_READ_LOCAL_INV
) {
1271 * Handle any STag invalidation request
1273 rv
= siw_invalidate_stag(qp
->pd
, wqe
->sqe
.sge
[0].lkey
);
1275 siw_init_terminate(qp
, TERM_ERROR_LAYER_RDMAP
,
1276 RDMAP_ETYPE_CATASTROPHIC
,
1277 RDMAP_ECODE_UNSPECIFIED
, 0);
1279 if (wc_status
== SIW_WC_SUCCESS
) {
1280 wc_status
= SIW_WC_GENERAL_ERR
;
1286 * All errors turn the wqe into signalled.
1288 if ((wqe
->sqe
.flags
& SIW_WQE_SIGNALLED
) || error
!= 0)
1289 rv
= siw_sqe_complete(qp
, &wqe
->sqe
, wqe
->processed
,
1291 siw_wqe_put_mem(wqe
, SIW_OP_READ
);
1294 rv
= siw_check_tx_fence(qp
);
1296 /* Disable current ORQ element */
1297 WRITE_ONCE(orq_get_current(qp
)->flags
, 0);
1300 case RDMAP_RDMA_READ_REQ
:
1302 rv
= siw_init_rresp(qp
, srx
);
1303 srx
->ddp_msn
[RDMAP_UNTAGGED_QN_RDMA_READ
]++;
1307 case RDMAP_RDMA_WRITE
:
1308 if (wqe
->wr_status
== SIW_WR_IDLE
)
1312 * Free References from memory object if
1313 * attached to receive context (inbound WRITE).
1314 * While a zero-length WRITE is allowed,
1315 * no memory reference got created.
1317 if (rx_mem(&qp
->rx_tagged
)) {
1318 siw_mem_put(rx_mem(&qp
->rx_tagged
));
1319 rx_mem(&qp
->rx_tagged
) = NULL
;
1326 wqe
->wr_status
= SIW_WR_IDLE
;
1334 * Main routine to consume inbound TCP payload
1336 * @rd_desc: read descriptor
1337 * @skb: socket buffer
1338 * @off: offset in skb
1339 * @len: skb->len - offset : payload in skb
1341 int siw_tcp_rx_data(read_descriptor_t
*rd_desc
, struct sk_buff
*skb
,
1342 unsigned int off
, size_t len
)
1344 struct siw_qp
*qp
= rd_desc
->arg
.data
;
1345 struct siw_rx_stream
*srx
= &qp
->rx_stream
;
1349 srx
->skb_new
= skb
->len
- off
;
1350 srx
->skb_offset
= off
;
1351 srx
->skb_copied
= 0;
1353 siw_dbg_qp(qp
, "new data, len %d\n", srx
->skb_new
);
1355 while (srx
->skb_new
) {
1356 int run_completion
= 1;
1358 if (unlikely(srx
->rx_suspend
)) {
1359 /* Do not process any more data */
1360 srx
->skb_copied
+= srx
->skb_new
;
1363 switch (srx
->state
) {
1365 rv
= siw_get_hdr(srx
);
1367 srx
->fpdu_part_rem
=
1368 be16_to_cpu(srx
->hdr
.ctrl
.mpa_len
) -
1369 srx
->fpdu_part_rcvd
+ MPA_HDR_SIZE
;
1371 if (srx
->fpdu_part_rem
)
1372 srx
->pad
= -srx
->fpdu_part_rem
& 0x3;
1376 srx
->state
= SIW_GET_DATA_START
;
1377 srx
->fpdu_part_rcvd
= 0;
1381 case SIW_GET_DATA_MORE
:
1383 * Another data fragment of the same DDP segment.
1384 * Setting first_ddp_seg = 0 avoids repeating
1385 * initializations that shall occur only once per
1388 qp
->rx_fpdu
->first_ddp_seg
= 0;
1391 case SIW_GET_DATA_START
:
1393 * Headers will be checked by the opcode-specific
1394 * data receive function below.
1396 rv
= iwarp_pktinfo
[qp
->rx_stream
.rdmap_op
].rx_data(qp
);
1399 be16_to_cpu(srx
->hdr
.ctrl
.mpa_len
)
1402 srx
->fpdu_part_rem
= (-mpa_len
& 0x3)
1404 srx
->fpdu_part_rcvd
= 0;
1405 srx
->state
= SIW_GET_TRAILER
;
1407 if (unlikely(rv
== -ECONNRESET
))
1410 srx
->state
= SIW_GET_DATA_MORE
;
1414 case SIW_GET_TRAILER
:
1416 * read CRC + any padding
1418 rv
= siw_get_trailer(qp
, srx
);
1422 * complete RDMAP message if last fragment
1424 srx
->state
= SIW_GET_HDR
;
1425 srx
->fpdu_part_rcvd
= 0;
1427 if (!(srx
->hdr
.ctrl
.ddp_rdmap_ctrl
&
1432 rv
= siw_rdmap_complete(qp
, 0);
1438 pr_warn("QP[%u]: RX out of state\n", qp_id(qp
));
1442 if (unlikely(rv
!= 0 && rv
!= -EAGAIN
)) {
1443 if ((srx
->state
> SIW_GET_HDR
||
1444 qp
->rx_fpdu
->more_ddp_segs
) && run_completion
)
1445 siw_rdmap_complete(qp
, rv
);
1447 siw_dbg_qp(qp
, "rx error %d, rx state %d\n", rv
,
1450 siw_qp_cm_drop(qp
, 1);
1455 siw_dbg_qp(qp
, "fpdu fragment, state %d, missing %d\n",
1456 srx
->state
, srx
->fpdu_part_rem
);
1460 return srx
->skb_copied
;