// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
#include <linux/scatterlist.h>
#include <linux/highmem.h>

#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>

#include "siw.h"
#include "siw_verbs.h"
#include "siw_mem.h"

/*
 * siw_rx_umem()
 *
 * Receive data of @len into target referenced by @dest_addr.
 *
 * @srx:	Receive Context
 * @umem:	siw representation of target memory
 * @dest_addr:	user virtual address
 * @len:	number of bytes to place
 */
static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
		       u64 dest_addr, int len)
{
	int copied = 0;

	while (len) {
		struct page *p;
		int pg_off, bytes, rv;
		void *dest;

		p = siw_get_upage(umem, dest_addr);
		if (unlikely(!p)) {
			pr_warn("siw: %s: [QP %u]: bogus addr: %pK, %pK\n",
				__func__, qp_id(rx_qp(srx)),
				(void *)(uintptr_t)dest_addr,
				(void *)(uintptr_t)umem->fp_addr);
			/* siw internal error */
			srx->skb_copied += copied;
			srx->skb_new -= copied;

			return -EFAULT;
		}
		pg_off = dest_addr & ~PAGE_MASK;
		bytes = min(len, (int)PAGE_SIZE - pg_off);

		siw_dbg_qp(rx_qp(srx), "page %pK, bytes=%u\n", p, bytes);

		dest = kmap_atomic(p);
		rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
				   bytes);
		if (unlikely(rv)) {
			kunmap_atomic(dest);
			srx->skb_copied += copied;
			srx->skb_new -= copied;

			pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
				qp_id(rx_qp(srx)), __func__, len, p, rv);

			return -EFAULT;
		}
		if (srx->mpa_crc_hd) {
			if (rdma_is_kernel_res(&rx_qp(srx)->base_qp.res)) {
				crypto_shash_update(srx->mpa_crc_hd,
						    (u8 *)(dest + pg_off),
						    bytes);
				kunmap_atomic(dest);
			} else {
				kunmap_atomic(dest);
				/*
				 * Do CRC on original, not target buffer.
				 * Some user land applications may
				 * concurrently write the target buffer,
				 * which would yield a broken CRC.
				 * Walking the skb twice is very inefficient.
				 * Folding the CRC into skb_copy_bits()
				 * would be much better, but is currently
				 * not supported.
				 */
				siw_crc_skb(srx, bytes);
			}
		} else {
			kunmap_atomic(dest);
		}
		srx->skb_offset += bytes;
		copied += bytes;
		len -= bytes;
		dest_addr += bytes;
		pg_off = 0;
	}
	srx->skb_copied += copied;
	srx->skb_new -= copied;

	return copied;
}
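
/*
 * Editorial note (not part of the original driver): siw_rx_umem()
 * returns the number of bytes placed or a negative errno. Callers in
 * this file compare the result against the requested length and treat
 * any mismatch as a catastrophic DDP error, e.g.
 *
 *	rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + frx->sge_off,
 *			 sge_bytes);
 *	if (unlikely(rv != sge_bytes))
 *		... report DDP_ECODE_CATASTROPHIC and stop ...
 */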

static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
{
	int rv;

	siw_dbg_qp(rx_qp(srx), "kva: 0x%pK, len: %u\n", kva, len);

	rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
	if (unlikely(rv)) {
		pr_warn("siw: [QP %u]: %s, len %d, kva 0x%pK, rv %d\n",
			qp_id(rx_qp(srx)), __func__, len, kva, rv);

		return rv;
	}
	if (srx->mpa_crc_hd)
		crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);

	srx->skb_offset += len;
	srx->skb_copied += len;
	srx->skb_new -= len;

	return len;
}

static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
		      struct siw_mem *mem, u64 addr, int len)
{
	struct siw_pbl *pbl = mem->pbl;
	u64 offset = addr - mem->va;
	int copied = 0;

	while (len) {
		int bytes;
		dma_addr_t buf_addr =
			siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx);
		if (!buf_addr)
			break;

		bytes = min(bytes, len);
		if (siw_rx_kva(srx, (void *)buf_addr, bytes) == bytes) {
			copied += bytes;
			offset += bytes;
			len -= bytes;
		} else {
			break;
		}
	}
	return copied;
}
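
/*
 * Editorial note (not part of the original driver): the three helpers
 * above cover the three flavours of target memory used by the
 * SEND/WRITE/RRESP receive paths below. The dispatch repeated there
 * looks roughly like:
 *
 *	if (mem->mem_obj == NULL)	kernel virtual address
 *		rv = siw_rx_kva(...);
 *	else if (!mem->is_pbl)		user memory (umem)
 *		rv = siw_rx_umem(...);
 *	else				physical buffer list
 *		rv = siw_rx_pbl(...);
 */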

/*
 * siw_rresp_check_ntoh()
 *
 * Check incoming RRESP fragment header against expected
 * header values and update expected values for potential next
 * fragment.
 *
 * NOTE: This function must be called only if a RRESP DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
 */
static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
				struct siw_rx_fpdu *frx)
{
	struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
	struct siw_wqe *wqe = &frx->wqe_active;
	enum ddp_ecode ecode;

	u32 sink_stag = be32_to_cpu(rresp->sink_stag);
	u64 sink_to = be64_to_cpu(rresp->sink_to);

	if (frx->first_ddp_seg) {
		srx->ddp_stag = wqe->sqe.sge[0].lkey;
		srx->ddp_to = wqe->sqe.sge[0].laddr;
		frx->pbl_idx = 0;
	}
	/* Below checks extend beyond the semantics of DDP, and
	 * into RDMAP:
	 * We check if the read response matches exactly the
	 * read request which was sent to the remote peer to
	 * trigger this read response. RFC5040/5041 do not
	 * always have a proper error code for the detected
	 * error cases. We choose 'base or bounds error' for
	 * cases where the inbound STag is valid, but offset
	 * or length do not match our response receive state.
	 */
	if (unlikely(srx->ddp_stag != sink_stag)) {
		pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
			qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
		ecode = DDP_ECODE_T_INVALID_STAG;
		goto error;
	}
	if (unlikely(srx->ddp_to != sink_to)) {
		pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
			qp_id(rx_qp(srx)), (unsigned long long)sink_to,
			(unsigned long long)srx->ddp_to);
		ecode = DDP_ECODE_T_BASE_BOUNDS;
		goto error;
	}
	if (unlikely(!frx->more_ddp_segs &&
		     (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
		pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
			qp_id(rx_qp(srx)),
			wqe->processed + srx->fpdu_part_rem, wqe->bytes);
		ecode = DDP_ECODE_T_BASE_BOUNDS;
		goto error;
	}
	return 0;
error:
	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
	return -EINVAL;
}

/*
 * siw_write_check_ntoh()
 *
 * Check incoming WRITE fragment header against expected
 * header values and update expected values for potential next
 * fragment.
 *
 * NOTE: This function must be called only if a WRITE DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
 */
static int siw_write_check_ntoh(struct siw_rx_stream *srx,
				struct siw_rx_fpdu *frx)
{
	struct iwarp_rdma_write *write = &srx->hdr.rwrite;
	enum ddp_ecode ecode;

	u32 sink_stag = be32_to_cpu(write->sink_stag);
	u64 sink_to = be64_to_cpu(write->sink_to);

	if (frx->first_ddp_seg) {
		srx->ddp_stag = sink_stag;
		srx->ddp_to = sink_to;
		frx->pbl_idx = 0;
	} else {
		if (unlikely(srx->ddp_stag != sink_stag)) {
			pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
				qp_id(rx_qp(srx)), sink_stag,
				srx->ddp_stag);
			ecode = DDP_ECODE_T_INVALID_STAG;
			goto error;
		}
		if (unlikely(srx->ddp_to != sink_to)) {
			pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
				qp_id(rx_qp(srx)),
				(unsigned long long)sink_to,
				(unsigned long long)srx->ddp_to);
			ecode = DDP_ECODE_T_BASE_BOUNDS;
			goto error;
		}
	}
	return 0;
error:
	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
	return -EINVAL;
}

/*
 * siw_send_check_ntoh()
 *
 * Check incoming SEND fragment header against expected
 * header values and update expected MSN if no next
 * fragment expected.
 *
 * NOTE: This function must be called only if a SEND DDP segment
 *       starts but not for fragmented consecutive pieces of an
 *       already started DDP segment.
 */
static int siw_send_check_ntoh(struct siw_rx_stream *srx,
			       struct siw_rx_fpdu *frx)
{
	struct iwarp_send_inv *send = &srx->hdr.send_inv;
	struct siw_wqe *wqe = &frx->wqe_active;
	enum ddp_ecode ecode;

	u32 ddp_msn = be32_to_cpu(send->ddp_msn);
	u32 ddp_mo = be32_to_cpu(send->ddp_mo);
	u32 ddp_qn = be32_to_cpu(send->ddp_qn);

	if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
		pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
			qp_id(rx_qp(srx)), ddp_qn);
		ecode = DDP_ECODE_UT_INVALID_QN;
		goto error;
	}
	if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
		pr_warn("siw: [QP %u]: send msn: %u != %u\n",
			qp_id(rx_qp(srx)), ddp_msn,
			srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
		ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
		goto error;
	}
	if (unlikely(ddp_mo != wqe->processed)) {
		pr_warn("siw: [QP %u], send mo: %u != %u\n",
			qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
		ecode = DDP_ECODE_UT_INVALID_MO;
		goto error;
	}
	if (frx->first_ddp_seg) {
		/* initialize user memory write position */
		frx->sge_idx = 0;
		frx->sge_off = 0;
		frx->pbl_idx = 0;

		/* only valid for SEND_INV and SEND_SE_INV operations */
		srx->inval_stag = be32_to_cpu(send->inval_stag);
	}
	if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
		siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
			   wqe->bytes, wqe->processed, srx->fpdu_part_rem);
		wqe->wc_status = SIW_WC_LOC_LEN_ERR;
		ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
		goto error;
	}
	return 0;
error:
	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
			   DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
	return -EINVAL;
}
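
/*
 * Editorial note (not part of the original driver): an untagged SEND
 * carries a DDP queue number (QN), message sequence number (MSN) and
 * message offset (MO). siw_send_check_ntoh() above validates them in
 * that order against local receive state, roughly:
 *
 *	ddp_qn  == RDMAP_UNTAGGED_QN_SEND
 *	ddp_msn == srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]
 *	ddp_mo  == wqe->processed
 *
 * so a resumed SEND must continue exactly where the previous DDP
 * segment left off.
 */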

static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
{
	struct siw_rqe *rqe;
	struct siw_srq *srq;
	struct siw_wqe *wqe = NULL;
	bool srq_event = false;
	unsigned long flags;

	srq = qp->srq;
	if (srq) {
		spin_lock_irqsave(&srq->lock, flags);
		if (unlikely(!srq->num_rqe))
			goto out;

		rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
	} else {
		if (unlikely(!qp->recvq))
			goto out;

		rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
	}
	if (likely(rqe->flags == SIW_WQE_VALID)) {
		int num_sge = rqe->num_sge;

		if (likely(num_sge <= SIW_MAX_SGE)) {
			int i = 0;

			wqe = rx_wqe(&qp->rx_untagged);
			rx_type(wqe) = SIW_OP_RECEIVE;
			wqe->wr_status = SIW_WR_INPROGRESS;
			wqe->bytes = 0;
			wqe->processed = 0;

			wqe->rqe.id = rqe->id;
			wqe->rqe.num_sge = num_sge;

			while (i < num_sge) {
				wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
				wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
				wqe->rqe.sge[i].length = rqe->sge[i].length;
				wqe->bytes += wqe->rqe.sge[i].length;
				wqe->mem[i] = NULL;
				i++;
			}
			/* can be re-used by appl */
			smp_store_mb(rqe->flags, 0);
		} else {
			siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
			if (srq)
				spin_unlock_irqrestore(&srq->lock, flags);
			return NULL;
		}
		if (!srq) {
			qp->rq_get++;
		} else {
			if (srq->armed) {
				/* Test SRQ limit */
				u32 off = (srq->rq_get + srq->limit) %
					  srq->num_rqe;
				struct siw_rqe *rqe2 = &srq->recvq[off];

				if (!(rqe2->flags & SIW_WQE_VALID)) {
					srq->armed = false;
					srq_event = true;
				}
			}
			srq->rq_get++;
		}
	}
out:
	if (srq) {
		spin_unlock_irqrestore(&srq->lock, flags);

		if (srq_event)
			siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
	}
	return wqe;
}
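
/*
 * Editorial note (not part of the original driver): when an SRQ is
 * armed with a limit, siw_rqe_get() above peeks at the RQE 'limit'
 * slots ahead of the current rq_get position. If that slot is no
 * longer valid, fewer than 'limit' receives remain posted, the SRQ is
 * disarmed and IB_EVENT_SRQ_LIMIT_REACHED is reported once, outside
 * the SRQ lock.
 */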

/*
 * siw_proc_send:
 *
 * Process one incoming SEND and place data into memory referenced by
 * receive wqe.
 *
 * Function supports partially received sends (suspending/resuming
 * current receive wqe processing)
 *
 * return value:
 *	0:       reached the end of a DDP segment
 *	-EAGAIN: to be called again to finish the DDP segment
 */
int siw_proc_send(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_untagged;
	struct siw_wqe *wqe;
	u32 data_bytes; /* all data bytes available */
	u32 rcvd_bytes; /* sum of data bytes rcvd */
	int rv = 0;

	if (frx->first_ddp_seg) {
		wqe = siw_rqe_get(qp);
		if (unlikely(!wqe)) {
			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_UNTAGGED_BUF,
					   DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
			return -ENOENT;
		}
	} else {
		wqe = rx_wqe(frx);
	}
	if (srx->state == SIW_GET_DATA_START) {
		rv = siw_send_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
		if (!srx->fpdu_part_rem) /* zero length SEND */
			return 0;
	}
	data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
	rcvd_bytes = 0;

	/* A zero length SEND will skip below loop */
	while (data_bytes) {
		struct ib_pd *pd;
		struct siw_mem **mem, *mem_p;
		struct siw_sge *sge;
		u32 sge_bytes; /* data bytes avail for SGE */

		sge = &wqe->rqe.sge[frx->sge_idx];

		if (!sge->length) {
			/* just skip empty sge's */
			frx->sge_idx++;
			frx->sge_off = 0;
			frx->pbl_idx = 0;
			continue;
		}
		sge_bytes = min(data_bytes, sge->length - frx->sge_off);
		mem = &wqe->mem[frx->sge_idx];

		/*
		 * check with QP's PD if no SRQ present, SRQ's PD otherwise
		 */
		pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;

		rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
				   frx->sge_off, sge_bytes);
		if (unlikely(rv)) {
			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_CATASTROPHIC,
					   DDP_ECODE_CATASTROPHIC, 0);

			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);

			break;
		}
		mem_p = *mem;
		if (mem_p->mem_obj == NULL)
			rv = siw_rx_kva(srx,
				(void *)(uintptr_t)(sge->laddr + frx->sge_off),
				sge_bytes);
		else if (!mem_p->is_pbl)
			rv = siw_rx_umem(srx, mem_p->umem,
					 sge->laddr + frx->sge_off, sge_bytes);
		else
			rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
					sge->laddr + frx->sge_off, sge_bytes);

		if (unlikely(rv != sge_bytes)) {
			wqe->processed += rcvd_bytes;

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_CATASTROPHIC,
					   DDP_ECODE_CATASTROPHIC, 0);
			return -EINVAL;
		}
		frx->sge_off += rv;

		if (frx->sge_off == sge->length) {
			frx->sge_idx++;
			frx->sge_off = 0;
			frx->pbl_idx = 0;
		}
		data_bytes -= rv;
		rcvd_bytes += rv;

		srx->fpdu_part_rem -= rv;
		srx->fpdu_part_rcvd += rv;
	}
	wqe->processed += rcvd_bytes;

	if (!srx->fpdu_part_rem)
		return 0;

	return (rv < 0) ? rv : -EAGAIN;
}

/*
 * siw_proc_write:
 *
 * Place incoming WRITE after referencing and checking target buffer
 *
 * Function supports partially received WRITEs (suspending/resuming
 * current receive processing)
 *
 * return value:
 *	0:       reached the end of a DDP segment
 *	-EAGAIN: to be called again to finish the DDP segment
 */
int siw_proc_write(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_tagged;
	struct siw_mem *mem;
	int bytes, rv;

	if (srx->state == SIW_GET_DATA_START) {
		if (!srx->fpdu_part_rem) /* zero length WRITE */
			return 0;

		rv = siw_write_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
	}
	bytes = min(srx->fpdu_part_rem, srx->skb_new);

	if (frx->first_ddp_seg) {
		struct siw_wqe *wqe = rx_wqe(frx);

		rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
		if (unlikely(!rx_mem(frx))) {
			siw_dbg_qp(qp,
				   "sink stag not found/invalid, stag 0x%08x\n",
				   srx->ddp_stag);

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_TAGGED_BUF,
					   DDP_ECODE_T_INVALID_STAG, 0);
			return -EINVAL;
		}
		wqe->rqe.num_sge = 1;
		rx_type(wqe) = SIW_OP_WRITE;
		wqe->wr_status = SIW_WR_INPROGRESS;
	}
	mem = rx_mem(frx);

	/*
	 * Check if application re-registered memory with different
	 * key field of STag.
	 */
	if (unlikely(mem->stag != srx->ddp_stag)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_TAGGED_BUF,
				   DDP_ECODE_T_INVALID_STAG, 0);
		return -EINVAL;
	}
	rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
			   IB_ACCESS_REMOTE_WRITE, bytes);
	if (unlikely(rv)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
				   0);

		siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);

		return -EINVAL;
	}

	if (mem->mem_obj == NULL)
		rv = siw_rx_kva(srx,
			(void *)(uintptr_t)(srx->ddp_to + srx->fpdu_part_rcvd),
			bytes);
	else if (!mem->is_pbl)
		rv = siw_rx_umem(srx, mem->umem,
				 srx->ddp_to + srx->fpdu_part_rcvd, bytes);
	else
		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
				srx->ddp_to + srx->fpdu_part_rcvd, bytes);

	if (unlikely(rv != bytes)) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_CATASTROPHIC,
				   DDP_ECODE_CATASTROPHIC, 0);
		return -EINVAL;
	}
	srx->fpdu_part_rem -= rv;
	srx->fpdu_part_rcvd += rv;

	if (!srx->fpdu_part_rem) {
		srx->ddp_to += srx->fpdu_part_rcvd;
		return 0;
	}
	return -EAGAIN;
}

/*
 * Inbound RREQ's cannot carry user data.
 */
int siw_proc_rreq(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;

	if (!srx->fpdu_part_rem)
		return 0;

	pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
		be16_to_cpu(srx->hdr.ctrl.mpa_len));

	return -EPROTO;
}

/*
 * siw_init_rresp:
 *
 * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
 * Put it at the tail of the IRQ, if there is another WQE currently in
 * transmit processing. If not, make it the current WQE to be processed
 * and schedule transmit processing.
 *
 * Can be called from softirq context and from process
 * context (RREAD socket loopback case!)
 *
 * return value:
 *	0:      success,
 *		failure code otherwise
 */
static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
{
	struct siw_wqe *tx_work = tx_wqe(qp);
	struct siw_sqe *resp;

	uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
		 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
	uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
		 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
		 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
		 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);

	int run_sq = 1, rv = 0;
	unsigned long flags;

	if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
				   DDP_ETYPE_UNTAGGED_BUF,
				   DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
		return -EPROTO;
	}
	spin_lock_irqsave(&qp->sq_lock, flags);

	if (tx_work->wr_status == SIW_WR_IDLE) {
		/*
		 * immediately schedule READ response w/o
		 * consuming IRQ entry: IRQ must be empty.
		 */
		tx_work->processed = 0;
		tx_work->mem[0] = NULL;
		tx_work->wr_status = SIW_WR_QUEUED;
		resp = &tx_work->sqe;
	} else {
		resp = irq_alloc_free(qp);
		run_sq = 0;
	}
	if (likely(resp)) {
		resp->opcode = SIW_OP_READ_RESPONSE;

		resp->sge[0].length = length;
		resp->sge[0].laddr = laddr;
		resp->sge[0].lkey = lkey;

		/* Keep aside message sequence number for potential
		 * error reporting during Read Response generation.
		 */
		resp->sge[1].length = msn;

		resp->raddr = raddr;
		resp->rkey = rkey;
		resp->num_sge = length ? 1 : 0;

		/* RRESP now valid as current TX wqe or placed into IRQ */
		smp_store_mb(resp->flags, SIW_WQE_VALID);
	} else {
		pr_warn("siw: [QP %u]: irq %d exceeded %d\n", qp_id(qp),
			qp->irq_put % qp->attrs.irq_size, qp->attrs.irq_size);

		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
				   RDMAP_ETYPE_REMOTE_OPERATION,
				   RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
		rv = -EPROTO;
	}
	spin_unlock_irqrestore(&qp->sq_lock, flags);

	if (run_sq)
		rv = siw_sq_start(qp);

	return rv;
}
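
/*
 * Editorial note (not part of the original driver): siw_init_rresp()
 * above answers an inbound READ REQ by building a pseudo SQE of type
 * SIW_OP_READ_RESPONSE. If the transmitter is idle, that SQE becomes
 * the current TX WQE and siw_sq_start() is called right away;
 * otherwise it is parked in the inbound request queue (IRQ) and sent
 * once SQ processing drains to it.
 */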

/*
 * Only called at start of Read.Response processing.
 * Transfer pending Read from tip of ORQ into current rx wqe,
 * but keep ORQ entry valid until Read.Response processing done.
 * No Queue locking needed.
 */
static int siw_orqe_start_rx(struct siw_qp *qp)
{
	struct siw_sqe *orqe;
	struct siw_wqe *wqe = NULL;

	/* make sure ORQ indices are current */
	smp_mb();

	orqe = orq_get_current(qp);
	if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
		/* RRESP is a TAGGED RDMAP operation */
		wqe = rx_wqe(&qp->rx_tagged);
		wqe->sqe.id = orqe->id;
		wqe->sqe.opcode = orqe->opcode;
		wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
		wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
		wqe->sqe.sge[0].length = orqe->sge[0].length;
		wqe->sqe.flags = orqe->flags;
		wqe->sqe.num_sge = 1;
		wqe->bytes = orqe->sge[0].length;
		wqe->processed = 0;
		wqe->mem[0] = NULL;
		/* make sure WQE is completely written before valid */
		smp_wmb();
		wqe->wr_status = SIW_WR_INPROGRESS;

		return 0;
	}
	return -EPROTO;
}

/*
 * siw_proc_rresp:
 *
 * Place incoming RRESP data into memory referenced by RREQ WQE
 * which is at the tip of the ORQ
 *
 * Function supports partially received RRESP's (suspending/resuming
 * current receive processing)
 */
int siw_proc_rresp(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_rx_fpdu *frx = &qp->rx_tagged;
	struct siw_wqe *wqe = rx_wqe(frx);
	struct siw_mem **mem, *mem_p;
	struct siw_sge *sge;
	int bytes, rv;

	if (frx->first_ddp_seg) {
		if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
			pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
				qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
			rv = -EPROTO;
			goto error_term;
		}
		/*
		 * fetch pending RREQ from orq
		 */
		rv = siw_orqe_start_rx(qp);
		if (rv) {
			pr_warn("siw: [QP %u]: ORQ empty at idx %d\n",
				qp_id(qp), qp->orq_get % qp->attrs.orq_size);
			goto error_term;
		}
		rv = siw_rresp_check_ntoh(srx, frx);
		if (unlikely(rv)) {
			siw_qp_event(qp, IB_EVENT_QP_FATAL);
			return rv;
		}
	} else {
		if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
			pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
				qp_id(qp), wqe->wr_status);
			rv = -EPROTO;
			goto error_term;
		}
	}
	if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
		return 0;

	sge = wqe->sqe.sge; /* there is only one */
	mem = &wqe->mem[0];

	if (!(*mem)) {
		/*
		 * check target memory which resolves memory on first fragment
		 */
		rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
				   wqe->bytes);
		if (unlikely(rv)) {
			siw_dbg_qp(qp, "target mem check: %d\n", rv);
			wqe->wc_status = SIW_WC_LOC_PROT_ERR;

			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
					   DDP_ETYPE_TAGGED_BUF,
					   siw_tagged_error(-rv), 0);

			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);

			return -EINVAL;
		}
	}
	mem_p = *mem;

	bytes = min(srx->fpdu_part_rem, srx->skb_new);

	if (mem_p->mem_obj == NULL)
		rv = siw_rx_kva(srx,
			(void *)(uintptr_t)(sge->laddr + wqe->processed),
			bytes);
	else if (!mem_p->is_pbl)
		rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
				 bytes);
	else
		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
				sge->laddr + wqe->processed, bytes);
	if (rv != bytes) {
		wqe->wc_status = SIW_WC_GENERAL_ERR;
		rv = -EINVAL;
		goto error_term;
	}
	srx->fpdu_part_rem -= rv;
	srx->fpdu_part_rcvd += rv;
	wqe->processed += rv;

	if (!srx->fpdu_part_rem) {
		srx->ddp_to += srx->fpdu_part_rcvd;
		return 0;
	}
	return -EAGAIN;

error_term:
	siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
			   DDP_ECODE_CATASTROPHIC, 0);
	return rv;
}

int siw_proc_terminate(struct siw_qp *qp)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct sk_buff *skb = srx->skb;
	struct iwarp_terminate *term = &srx->hdr.terminate;
	union iwarp_hdr term_info;
	u8 *infop = (u8 *)&term_info;
	enum rdma_opcode op;
	u16 to_copy = sizeof(struct iwarp_ctrl);

	pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
		__rdmap_term_layer(term), __rdmap_term_etype(term),
		__rdmap_term_ecode(term));

	if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
	    be32_to_cpu(term->ddp_msn) !=
		    qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
	    be32_to_cpu(term->ddp_mo) != 0) {
		pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
			be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
			be32_to_cpu(term->ddp_mo));
		return -ECONNRESET;
	}
	/*
	 * Receive remaining pieces of TERM if indicated
	 */
	if (!term->flag_m)
		return -ECONNRESET;

	/* Do not take the effort to reassemble a network fragmented
	 * TERM message
	 */
	if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
		return -ECONNRESET;

	memset(infop, 0, sizeof(term_info));

	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);

	op = __rdmap_get_opcode(&term_info.ctrl);
	if (op >= RDMAP_TERMINATE)
		goto out;

	infop += to_copy;
	srx->skb_offset += to_copy;
	srx->skb_new -= to_copy;
	srx->skb_copied += to_copy;
	srx->fpdu_part_rcvd += to_copy;
	srx->fpdu_part_rem -= to_copy;

	to_copy = iwarp_pktinfo[op].hdr_len - to_copy;

	/* Again, no network fragmented TERM's */
	if (to_copy + MPA_CRC_SIZE > srx->skb_new)
		return -ECONNRESET;

	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);

	if (term->flag_r) {
		siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
			   op, be16_to_cpu(term_info.ctrl.mpa_len),
			   term->flag_m ? "valid" : "invalid");
	} else if (term->flag_d) {
		siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
			   op, be16_to_cpu(term_info.ctrl.mpa_len),
			   term->flag_m ? "valid" : "invalid");
	}
out:
	srx->skb_new -= to_copy;
	srx->skb_offset += to_copy;
	srx->skb_copied += to_copy;
	srx->fpdu_part_rcvd += to_copy;
	srx->fpdu_part_rem -= to_copy;

	return -ECONNRESET;
}

static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
{
	struct sk_buff *skb = srx->skb;
	u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
	__wsum crc_in, crc_own = 0;

	siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
		   srx->fpdu_part_rem, srx->skb_new, srx->pad);

	if (srx->skb_new < srx->fpdu_part_rem)
		return -EAGAIN;

	skb_copy_bits(skb, srx->skb_offset, tbuf, srx->fpdu_part_rem);

	if (srx->mpa_crc_hd && srx->pad)
		crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);

	srx->skb_new -= srx->fpdu_part_rem;
	srx->skb_offset += srx->fpdu_part_rem;
	srx->skb_copied += srx->fpdu_part_rem;

	if (!srx->mpa_crc_hd)
		return 0;

	/*
	 * CRC32 is computed, transmitted and received directly in NBO,
	 * so there's never a reason to convert byte order.
	 */
	crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
	crc_in = (__force __wsum)srx->trailer.crc;

	if (unlikely(crc_in != crc_own)) {
		pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
			crc_in, crc_own, qp->rx_stream.rdmap_op);

		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
				   LLP_ETYPE_MPA,
				   LLP_ECODE_RECEIVED_CRC, 0);
		return -EINVAL;
	}
	return 0;
}
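
/*
 * Editorial note (not part of the original driver): the trailer read
 * by siw_get_trailer() consists of 0-3 pad bytes followed by the
 * 4-byte MPA CRC, i.e. fpdu_part_rem == srx->pad + MPA_CRC_SIZE.
 * That is why tbuf points 'pad' bytes before trailer.crc and only the
 * pad bytes are fed into the running CRC before it is finalized.
 */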

#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)

static int siw_get_hdr(struct siw_rx_stream *srx)
{
	struct sk_buff *skb = srx->skb;
	struct siw_qp *qp = rx_qp(srx);
	struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
	struct siw_rx_fpdu *frx;
	u8 opcode;
	int bytes;

	if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
		/*
		 * copy a minimum sized (tagged) DDP frame control part
		 */
		bytes = min_t(int, srx->skb_new,
			      MIN_DDP_HDR - srx->fpdu_part_rcvd);

		skb_copy_bits(skb, srx->skb_offset,
			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);

		srx->fpdu_part_rcvd += bytes;

		srx->skb_new -= bytes;
		srx->skb_offset += bytes;
		srx->skb_copied += bytes;

		if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
			return -EAGAIN;

		if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
			enum ddp_etype etype;
			enum ddp_ecode ecode;

			pr_warn("siw: received ddp version unsupported %d\n",
				__ddp_get_version(c_hdr));

			if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
				etype = DDP_ETYPE_TAGGED_BUF;
				ecode = DDP_ECODE_T_VERSION;
			} else {
				etype = DDP_ETYPE_UNTAGGED_BUF;
				ecode = DDP_ECODE_UT_VERSION;
			}
			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
					   etype, ecode, 0);
			return -EINVAL;
		}
		if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
			pr_warn("siw: received rdmap version unsupported %d\n",
				__rdmap_get_version(c_hdr));

			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
					   RDMAP_ETYPE_REMOTE_OPERATION,
					   RDMAP_ECODE_VERSION, 0);
			return -EINVAL;
		}
		opcode = __rdmap_get_opcode(c_hdr);

		if (opcode > RDMAP_TERMINATE) {
			pr_warn("siw: received unknown packet type %u\n",
				opcode);

			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
					   RDMAP_ETYPE_REMOTE_OPERATION,
					   RDMAP_ECODE_OPCODE, 0);
			return -EINVAL;
		}
		siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
	} else {
		opcode = __rdmap_get_opcode(c_hdr);
	}
	set_rx_fpdu_context(qp, opcode);
	frx = qp->rx_fpdu;

	/*
	 * Figure out len of current hdr: variable length of
	 * iwarp hdr may force us to copy hdr information in
	 * two steps. Only tagged DDP messages are already
	 * completely received.
	 */
	if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
		bytes = iwarp_pktinfo[opcode].hdr_len - MIN_DDP_HDR;

		if (srx->skb_new < bytes)
			return -EAGAIN;

		skb_copy_bits(skb, srx->skb_offset,
			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);

		srx->fpdu_part_rcvd += bytes;

		srx->skb_new -= bytes;
		srx->skb_offset += bytes;
		srx->skb_copied += bytes;
	}

	/*
	 * DDP/RDMAP header receive completed. Check if the current
	 * DDP segment starts a new RDMAP message or continues a previously
	 * started RDMAP message.
	 *
	 * Alternating reception of DDP segments (or FPDUs) from incomplete
	 * tagged and untagged RDMAP messages is supported, as long as
	 * the current tagged or untagged message gets eventually completed
	 * w/o intersection from another message of the same type
	 * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
	 * but not by a READ RESPONSE etc.
	 */
	if (srx->mpa_crc_hd) {
		/*
		 * Restart CRC computation
		 */
		crypto_shash_init(srx->mpa_crc_hd);
		crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
				    srx->fpdu_part_rcvd);
	}
	if (frx->more_ddp_segs) {
		frx->first_ddp_seg = 0;
		if (frx->prev_rdmap_op != opcode) {
			pr_warn("siw: packet intersection: %u : %u\n",
				frx->prev_rdmap_op, opcode);
			/*
			 * The last inbound RDMA operation of same type
			 * (tagged or untagged) is left unfinished.
			 * To complete it in error, make it the current
			 * operation again, even with the header already
			 * overwritten. For error handling, only the opcode
			 * and current rx context are relevant.
			 */
			set_rx_fpdu_context(qp, frx->prev_rdmap_op);
			__rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
			return -EPROTO;
		}
	} else {
		frx->prev_rdmap_op = opcode;
		frx->first_ddp_seg = 1;
	}
	frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;

	return 0;
}

static int siw_check_tx_fence(struct siw_qp *qp)
{
	struct siw_wqe *tx_waiting = tx_wqe(qp);
	struct siw_sqe *rreq;
	int resume_tx = 0, rv = 0;
	unsigned long flags;

	spin_lock_irqsave(&qp->orq_lock, flags);

	rreq = orq_get_current(qp);

	/* free current orq entry */
	WRITE_ONCE(rreq->flags, 0);

	if (qp->tx_ctx.orq_fence) {
		if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
			pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
				qp_id(qp), tx_waiting->wr_status);
			rv = -EPROTO;
			goto out;
		}
		/* resume SQ processing */
		if (tx_waiting->sqe.opcode == SIW_OP_READ ||
		    tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
			rreq = orq_get_tail(qp);
			if (unlikely(!rreq)) {
				pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
				rv = -EPROTO;
				goto out;
			}
			siw_read_to_orq(rreq, &tx_waiting->sqe);

			qp->orq_put++;
			qp->tx_ctx.orq_fence = 0;
			resume_tx = 1;

		} else if (siw_orq_empty(qp)) {
			qp->tx_ctx.orq_fence = 0;
			resume_tx = 1;
		} else {
			pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n",
				qp_id(qp), qp->orq_get, qp->orq_put);
			rv = -EPROTO;
		}
	}
	qp->orq_get++;
out:
	spin_unlock_irqrestore(&qp->orq_lock, flags);

	if (resume_tx)
		rv = siw_sq_start(qp);

	return rv;
}

/*
 * siw_rdmap_complete()
 *
 * Complete processing of an RDMA message after receiving all
 * DDP segments or ABort processing after encountering error case.
 *
 * o SENDs + RRESPs need completion,
 * o RREQs need READ RESPONSE initialization
 * o WRITEs need memory dereferencing
 *
 * TODO: Failed WRITEs need local error to be surfaced.
 */
static int siw_rdmap_complete(struct siw_qp *qp, int error)
{
	struct siw_rx_stream *srx = &qp->rx_stream;
	struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
	enum siw_wc_status wc_status = wqe->wc_status;
	u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
	int rv = 0;

	switch (opcode) {
	case RDMAP_SEND_SE:
	case RDMAP_SEND_SE_INVAL:
		wqe->rqe.flags |= SIW_WQE_SOLICITED;
		fallthrough;

	case RDMAP_SEND:
	case RDMAP_SEND_INVAL:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;

		if (error != 0 && wc_status == SIW_WC_SUCCESS)
			wc_status = SIW_WC_GENERAL_ERR;
		/*
		 * Handle STag invalidation request
		 */
		if (wc_status == SIW_WC_SUCCESS &&
		    (opcode == RDMAP_SEND_INVAL ||
		     opcode == RDMAP_SEND_SE_INVAL)) {
			rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
			if (rv) {
				siw_init_terminate(
					qp, TERM_ERROR_LAYER_RDMAP,
					rv == -EACCES ?
						RDMAP_ETYPE_REMOTE_PROTECTION :
						RDMAP_ETYPE_REMOTE_OPERATION,
					RDMAP_ECODE_CANNOT_INVALIDATE, 0);

				wc_status = SIW_WC_REM_INV_REQ_ERR;
			}
			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
					      rv ? 0 : srx->inval_stag,
					      wc_status);
		} else {
			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
					      0, wc_status);
		}
		siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
		break;

	case RDMAP_RDMA_READ_RESP:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		if (error != 0) {
			if ((srx->state == SIW_GET_HDR &&
			     qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
				/* possible RREQ in ORQ left untouched */
				break;

			if (wc_status == SIW_WC_SUCCESS)
				wc_status = SIW_WC_GENERAL_ERR;
		} else if (rdma_is_kernel_res(&qp->base_qp.res) &&
			   rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
			/*
			 * Handle any STag invalidation request
			 */
			rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
			if (rv) {
				siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
						   RDMAP_ETYPE_CATASTROPHIC,
						   RDMAP_ECODE_UNSPECIFIED, 0);

				if (wc_status == SIW_WC_SUCCESS) {
					wc_status = SIW_WC_GENERAL_ERR;
					error = rv;
				}
			}
		}
		/*
		 * All errors turn the wqe into signalled.
		 */
		if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
			rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
					      wc_status);
		siw_wqe_put_mem(wqe, SIW_OP_READ);

		if (!error)
			rv = siw_check_tx_fence(qp);
		else
			/* Disable current ORQ element */
			WRITE_ONCE(orq_get_current(qp)->flags, 0);
		break;

	case RDMAP_RDMA_READ_REQ:
		if (!error) {
			rv = siw_init_rresp(qp, srx);
			srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
		}
		break;

	case RDMAP_RDMA_WRITE:
		if (wqe->wr_status == SIW_WR_IDLE)
			break;

		/*
		 * Free References from memory object if
		 * attached to receive context (inbound WRITE).
		 * While a zero-length WRITE is allowed,
		 * no memory reference got created.
		 */
		if (rx_mem(&qp->rx_tagged)) {
			siw_mem_put(rx_mem(&qp->rx_tagged));
			rx_mem(&qp->rx_tagged) = NULL;
		}
		break;

	default:
		break;
	}
	wqe->wr_status = SIW_WR_IDLE;

	return rv;
}

/*
 * siw_tcp_rx_data()
 *
 * Main routine to consume inbound TCP payload
 *
 * @rd_desc:	read descriptor
 * @skb:	socket buffer
 * @off:	offset in skb
 * @len:	skb->len - offset : payload in skb
 */
int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
		    unsigned int off, size_t len)
{
	struct siw_qp *qp = rd_desc->arg.data;
	struct siw_rx_stream *srx = &qp->rx_stream;
	int rv;

	srx->skb = skb;
	srx->skb_new = skb->len - off;
	srx->skb_offset = off;
	srx->skb_copied = 0;

	siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);

	while (srx->skb_new) {
		int run_completion = 1;

		if (unlikely(srx->rx_suspend)) {
			/* Do not process any more data */
			srx->skb_copied += srx->skb_new;
			break;
		}
		switch (srx->state) {
		case SIW_GET_HDR:
			rv = siw_get_hdr(srx);
			if (!rv) {
				srx->fpdu_part_rem =
					be16_to_cpu(srx->hdr.ctrl.mpa_len) -
					srx->fpdu_part_rcvd + MPA_HDR_SIZE;

				if (srx->fpdu_part_rem)
					srx->pad = -srx->fpdu_part_rem & 0x3;
				else
					srx->pad = 0;

				srx->state = SIW_GET_DATA_START;
				srx->fpdu_part_rcvd = 0;
			}
			break;

		case SIW_GET_DATA_MORE:
			/*
			 * Another data fragment of the same DDP segment.
			 * Setting first_ddp_seg = 0 avoids repeating
			 * initializations that shall occur only once per
			 * DDP segment.
			 */
			qp->rx_fpdu->first_ddp_seg = 0;
			fallthrough;

		case SIW_GET_DATA_START:
			/*
			 * Headers will be checked by the opcode-specific
			 * data receive function below.
			 */
			rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
			if (!rv) {
				int mpa_len =
					be16_to_cpu(srx->hdr.ctrl.mpa_len)
					+ MPA_HDR_SIZE;

				srx->fpdu_part_rem = (-mpa_len & 0x3)
						     + MPA_CRC_SIZE;
				srx->fpdu_part_rcvd = 0;
				srx->state = SIW_GET_TRAILER;
			} else {
				if (unlikely(rv == -ECONNRESET))
					run_completion = 0;
				else
					srx->state = SIW_GET_DATA_MORE;
			}
			break;

		case SIW_GET_TRAILER:
			/*
			 * read CRC + any padding
			 */
			rv = siw_get_trailer(qp, srx);
			if (likely(!rv)) {
				/*
				 * FPDU completed.
				 * complete RDMAP message if last fragment
				 */
				srx->state = SIW_GET_HDR;
				srx->fpdu_part_rcvd = 0;

				if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
				      DDP_FLAG_LAST))
					/* more frags */
					break;

				rv = siw_rdmap_complete(qp, 0);
				run_completion = 0;
			}
			break;

		default:
			pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
			rv = -EPROTO;
			run_completion = 0;
		}
		if (unlikely(rv != 0 && rv != -EAGAIN)) {
			if ((srx->state > SIW_GET_HDR ||
			     qp->rx_fpdu->more_ddp_segs) && run_completion)
				siw_rdmap_complete(qp, rv);

			siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
				   srx->state);

			siw_qp_cm_drop(qp, 1);

			break;
		}
		if (rv) {
			siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
				   srx->state, srx->fpdu_part_rem);
			break;
		}
	}
	return srx->skb_copied;
}
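
/*
 * Editorial note (not part of the original driver): the receive loop
 * in siw_tcp_rx_data() implements a small per-FPDU state machine:
 *
 *	SIW_GET_HDR -> SIW_GET_DATA_START -> SIW_GET_TRAILER -> SIW_GET_HDR
 *	                     |        ^
 *	                     v        |
 *	               SIW_GET_DATA_MORE  (payload split across skbs)
 *
 * -EAGAIN from any stage means "wait for more TCP payload"; any other
 * error aborts the current message and drops the connection via
 * siw_qp_cm_drop().
 */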