// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/net.h>
#include <linux/scatterlist.h>
#include <linux/llist.h>
#include <asm/barrier.h>
#include <net/tcp.h>

#include "siw.h"
#include "siw_verbs.h"
#include "siw_mem.h"

static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = {
	[SIW_QP_STATE_IDLE] = "IDLE",
	[SIW_QP_STATE_RTR] = "RTR",
	[SIW_QP_STATE_RTS] = "RTS",
	[SIW_QP_STATE_CLOSING] = "CLOSING",
	[SIW_QP_STATE_TERMINATE] = "TERMINATE",
	[SIW_QP_STATE_ERROR] = "ERROR"
};
/*
 * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a
 * per-RDMAP message basis. Please keep the order of the initializer. All MPA
 * lengths are initialized to the minimum packet size.
 */
struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = {
	{ /* RDMAP_RDMA_WRITE */
	  .hdr_len = sizeof(struct iwarp_rdma_write),
	  .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2),
	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
				 cpu_to_be16(DDP_VERSION << 8) |
				 cpu_to_be16(RDMAP_VERSION << 6) |
				 cpu_to_be16(RDMAP_RDMA_WRITE),
	  .rx_data = siw_proc_write },
	{ /* RDMAP_RDMA_READ_REQ */
	  .hdr_len = sizeof(struct iwarp_rdma_rreq),
	  .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2),
	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
				 cpu_to_be16(RDMAP_VERSION << 6) |
				 cpu_to_be16(RDMAP_RDMA_READ_REQ),
	  .rx_data = siw_proc_rreq },
	{ /* RDMAP_RDMA_READ_RESP */
	  .hdr_len = sizeof(struct iwarp_rdma_rresp),
	  .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2),
	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
				 cpu_to_be16(DDP_VERSION << 8) |
				 cpu_to_be16(RDMAP_VERSION << 6) |
				 cpu_to_be16(RDMAP_RDMA_READ_RESP),
	  .rx_data = siw_proc_rresp },
	{ /* RDMAP_SEND */
	  .hdr_len = sizeof(struct iwarp_send),
	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
				 cpu_to_be16(RDMAP_VERSION << 6) |
				 cpu_to_be16(RDMAP_SEND),
	  .rx_data = siw_proc_send },
	{ /* RDMAP_SEND_INVAL */
	  .hdr_len = sizeof(struct iwarp_send_inv),
	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
				 cpu_to_be16(RDMAP_VERSION << 6) |
				 cpu_to_be16(RDMAP_SEND_INVAL),
	  .rx_data = siw_proc_send },
	{ /* RDMAP_SEND_SE */
	  .hdr_len = sizeof(struct iwarp_send),
	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
				 cpu_to_be16(RDMAP_VERSION << 6) |
				 cpu_to_be16(RDMAP_SEND_SE),
	  .rx_data = siw_proc_send },
	{ /* RDMAP_SEND_SE_INVAL */
	  .hdr_len = sizeof(struct iwarp_send_inv),
	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
				 cpu_to_be16(RDMAP_VERSION << 6) |
				 cpu_to_be16(RDMAP_SEND_SE_INVAL),
	  .rx_data = siw_proc_send },
	{ /* RDMAP_TERMINATE */
	  .hdr_len = sizeof(struct iwarp_terminate),
	  .ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2),
	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
				 cpu_to_be16(RDMAP_VERSION << 6) |
				 cpu_to_be16(RDMAP_TERMINATE),
	  .rx_data = siw_proc_terminate }
};
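
/*
 * Each iwarp_pktinfo entry above provides a ready-made MPA/DDP/RDMAP control
 * word and the fixed header length for its RDMAP opcode. The receive path
 * dispatches inbound frames through .rx_data, while transmit code (e.g.
 * siw_send_terminate() below) copies .ctrl as a header template.
 */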
void siw_qp_llp_data_ready(struct sock *sk)
{
	struct siw_qp *qp;

	read_lock(&sk->sk_callback_lock);

	if (unlikely(!sk->sk_user_data || !sk_to_qp(sk)))
		goto done;

	qp = sk_to_qp(sk);

	if (likely(!qp->rx_stream.rx_suspend &&
		   down_read_trylock(&qp->state_lock))) {
		read_descriptor_t rd_desc = { .arg.data = qp, .count = 1 };

		if (likely(qp->attrs.state == SIW_QP_STATE_RTS))
			/*
			 * Implements data receive operation during
			 * socket callback. TCP gracefully catches
			 * the case where there is nothing to receive
			 * (not calling siw_tcp_rx_data() then).
			 */
			tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);

		up_read(&qp->state_lock);
	} else {
		siw_dbg_qp(qp, "unable to process RX, suspend: %d\n",
			   qp->rx_stream.rx_suspend);
	}
done:
	read_unlock(&sk->sk_callback_lock);
}
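
/*
 * siw_qp_llp_close() is called when the lower layer protocol (TCP)
 * connection goes away: RX and TX processing get suspended, the QP is
 * moved to its final state, both work queues are flushed and the
 * reference to the connection endpoint is dropped.
 */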
void siw_qp_llp_close(struct siw_qp *qp)
{
	siw_dbg_qp(qp, "enter llp close, state = %s\n",
		   siw_qp_state_to_string[qp->attrs.state]);

	down_write(&qp->state_lock);

	qp->rx_stream.rx_suspend = 1;
	qp->tx_ctx.tx_suspend = 1;
	qp->attrs.sk = NULL;

	switch (qp->attrs.state) {
	case SIW_QP_STATE_RTS:
	case SIW_QP_STATE_RTR:
	case SIW_QP_STATE_IDLE:
	case SIW_QP_STATE_TERMINATE:
		qp->attrs.state = SIW_QP_STATE_ERROR;
		break;
	/*
	 * SIW_QP_STATE_CLOSING:
	 *
	 * This is a forced close. Shall the QP be moved to
	 * ERROR or IDLE?
	 */
	case SIW_QP_STATE_CLOSING:
		if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
			qp->attrs.state = SIW_QP_STATE_ERROR;
		else
			qp->attrs.state = SIW_QP_STATE_IDLE;
		break;

	default:
		siw_dbg_qp(qp, "llp close: no state transition needed: %s\n",
			   siw_qp_state_to_string[qp->attrs.state]);
		break;
	}
	siw_sq_flush(qp);
	siw_rq_flush(qp);

	/*
	 * Dereference closing CEP
	 */
	if (qp->cep) {
		siw_cep_put(qp->cep);
		qp->cep = NULL;
	}

	up_write(&qp->state_lock);

	siw_dbg_qp(qp, "llp close exit: state %s\n",
		   siw_qp_state_to_string[qp->attrs.state]);
}
/*
 * Socket callback routine informing about newly available send space.
 * Function schedules SQ work for processing SQ items.
 */
void siw_qp_llp_write_space(struct sock *sk)
{
	struct siw_cep *cep;

	read_lock(&sk->sk_callback_lock);

	cep = sk_to_cep(sk);
	if (cep) {
		cep->sk_write_space(sk);

		if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
			(void)siw_sq_start(cep->qp);
	}

	read_unlock(&sk->sk_callback_lock);
}
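
/*
 * Allocate the inbound (IRQ) and outbound (ORQ) READ request queues.
 * Both are plain arrays of struct siw_sqe, rounded up to a power of
 * two and used as rings indexed by get/put counters modulo their size.
 */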
static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size)
{
	irq_size = roundup_pow_of_two(irq_size);
	orq_size = roundup_pow_of_two(orq_size);

	qp->attrs.irq_size = irq_size;
	qp->attrs.orq_size = orq_size;

	qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe));
	if (!qp->irq) {
		siw_dbg_qp(qp, "irq malloc for %d failed\n", irq_size);
		qp->attrs.irq_size = 0;
		return -ENOMEM;
	}
	qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe));
	if (!qp->orq) {
		siw_dbg_qp(qp, "orq malloc for %d failed\n", orq_size);
		qp->attrs.orq_size = 0;
		qp->attrs.irq_size = 0;
		vfree(qp->irq);
		return -ENOMEM;
	}
	siw_dbg_qp(qp, "ORD %d, IRD %d\n", orq_size, irq_size);
	return 0;
}
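
/*
 * Set up MPA CRC computation for both directions. RX and TX each get a
 * private shash descriptor backed by the module-global siw_crypto_shash
 * transform; on failure both descriptors are released and left NULL.
 */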
static int siw_qp_enable_crc(struct siw_qp *qp)
{
	struct siw_rx_stream *c_rx = &qp->rx_stream;
	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
	int size;

	if (siw_crypto_shash == NULL)
		return -ENOENT;

	size = crypto_shash_descsize(siw_crypto_shash) +
		sizeof(struct shash_desc);

	c_tx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
	c_rx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
	if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) {
		kfree(c_tx->mpa_crc_hd);
		kfree(c_rx->mpa_crc_hd);
		c_tx->mpa_crc_hd = NULL;
		c_rx->mpa_crc_hd = NULL;
		return -ENOMEM;
	}
	c_tx->mpa_crc_hd->tfm = siw_crypto_shash;
	c_rx->mpa_crc_hd->tfm = siw_crypto_shash;

	return 0;
}
/*
 * Send a non-signalled READ or WRITE to the peer side as negotiated
 * with the MPAv2 P2P setup protocol. The work request is only created
 * as a current active WR and does not consume Send Queue space.
 *
 * Caller must hold QP state lock.
 */
int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl)
{
	struct siw_wqe *wqe = tx_wqe(qp);
	unsigned long flags;
	int rv = 0;

	spin_lock_irqsave(&qp->sq_lock, flags);

	if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
		spin_unlock_irqrestore(&qp->sq_lock, flags);
		return -EIO;
	}
	memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);

	wqe->wr_status = SIW_WR_QUEUED;
	wqe->sqe.flags = 0;
	wqe->sqe.num_sge = 1;
	wqe->sqe.sge[0].length = 0;
	wqe->sqe.sge[0].laddr = 0;
	wqe->sqe.sge[0].lkey = 0;
	/*
	 * While it must not be checked for inbound zero length
	 * READ/WRITE, some HW may treat STag 0 special.
	 */
	wqe->sqe.rkey = 1;
	wqe->sqe.raddr = 0;
	wqe->processed = 0;

	if (ctrl & MPA_V2_RDMA_WRITE_RTR)
		wqe->sqe.opcode = SIW_OP_WRITE;
	else if (ctrl & MPA_V2_RDMA_READ_RTR) {
		struct siw_sqe *rreq;

		wqe->sqe.opcode = SIW_OP_READ;

		spin_lock(&qp->orq_lock);

		rreq = orq_get_free(qp);
		if (rreq) {
			siw_read_to_orq(rreq, &wqe->sqe);
			qp->orq_put++;
		} else
			rv = -EIO;

		spin_unlock(&qp->orq_lock);
	} else
		rv = -EINVAL;

	if (rv)
		wqe->wr_status = SIW_WR_IDLE;

	spin_unlock_irqrestore(&qp->sq_lock, flags);

	if (!rv)
		rv = siw_sq_start(qp);

	return rv;
}
/*
 * Map memory access error to DDP tagged error
 */
enum ddp_ecode siw_tagged_error(enum siw_access_state state)
{
	switch (state) {
	case E_STAG_INVALID:
		return DDP_ECODE_T_INVALID_STAG;
	case E_BASE_BOUNDS:
		return DDP_ECODE_T_BASE_BOUNDS;
	case E_PD_MISMATCH:
		return DDP_ECODE_T_STAG_NOT_ASSOC;
	case E_ACCESS_PERM:
		/*
		 * RFC 5041 (DDP) lacks an ecode for insufficient access
		 * permissions. 'Invalid STag' seems to be the closest
		 * match.
		 */
		return DDP_ECODE_T_INVALID_STAG;
	default:
		return DDP_ECODE_T_INVALID_STAG;
	}
}

/*
 * Map memory access error to RDMAP protection error
 */
enum rdmap_ecode siw_rdmap_error(enum siw_access_state state)
{
	switch (state) {
	case E_STAG_INVALID:
		return RDMAP_ECODE_INVALID_STAG;
	case E_BASE_BOUNDS:
		return RDMAP_ECODE_BASE_BOUNDS;
	case E_PD_MISMATCH:
		return RDMAP_ECODE_STAG_NOT_ASSOC;
	case E_ACCESS_PERM:
		return RDMAP_ECODE_ACCESS_RIGHTS;
	default:
		return RDMAP_ECODE_UNSPECIFIED;
	}
}
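
/*
 * Record termination information for deferred transmission by
 * siw_send_terminate(). Only the first error is kept: once
 * term_info.valid is set, subsequent calls only emit the debug trace.
 */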
void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer, u8 etype,
			u8 ecode, int in_tx)
{
	if (!qp->term_info.valid) {
		memset(&qp->term_info, 0, sizeof(qp->term_info));
		qp->term_info.layer = layer;
		qp->term_info.etype = etype;
		qp->term_info.ecode = ecode;
		qp->term_info.in_tx = in_tx;
		qp->term_info.valid = 1;
	}
	siw_dbg_qp(qp, "init TERM: layer %d, type %d, code %d, in tx %s\n",
		   layer, etype, ecode, in_tx ? "yes" : "no");
}
/*
 * Send a TERMINATE message, as defined in RFC's 5040/5041/5044/6581.
 * Sending TERMINATE messages is best effort - such messages
 * can only be sent if the QP is still connected and it does
 * not have another outbound message in-progress, i.e. the
 * TERMINATE message must not interfere with an incomplete current
 * transmit operation.
 */
void siw_send_terminate(struct siw_qp *qp)
{
	struct kvec iov[3];
	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
	struct iwarp_terminate *term = NULL;
	union iwarp_hdr *err_hdr = NULL;
	struct socket *s = qp->attrs.sk;
	struct siw_rx_stream *srx = &qp->rx_stream;
	union iwarp_hdr *rx_hdr = &srx->hdr;
	u32 crc = 0;
	int num_frags, len_terminate, rv;

	if (!qp->term_info.valid)
		return;

	qp->term_info.valid = 0;

	if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) {
		siw_dbg_qp(qp, "cannot send TERMINATE: op %d in progress\n",
			   tx_type(tx_wqe(qp)));
		return;
	}
	if (!s && qp->cep)
		/* QP not yet in RTS. Take socket from connection end point */
		s = qp->cep->sock;

	if (!s) {
		siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n");
		return;
	}

	term = kzalloc(sizeof(*term), GFP_KERNEL);
	if (!term)
		return;

	term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE);
	term->ddp_mo = 0;
	term->ddp_msn = cpu_to_be32(1);

	iov[0].iov_base = term;
	iov[0].iov_len = sizeof(*term);

	if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) ||
	    ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) &&
	     (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) {
		err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL);
		if (!err_hdr) {
			kfree(term);
			return;
		}
	}
	memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl,
	       sizeof(struct iwarp_ctrl));

	__rdmap_term_set_layer(term, qp->term_info.layer);
	__rdmap_term_set_etype(term, qp->term_info.etype);
	__rdmap_term_set_ecode(term, qp->term_info.ecode);

	switch (qp->term_info.layer) {
	case TERM_ERROR_LAYER_RDMAP:
		if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC)
			/* No additional DDP/RDMAP header to be included */
			break;

		if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION) {
			/*
			 * Complete RDMAP frame will get attached, and
			 * DDP segment length is valid
			 */
			term->flag_m = 1;
			term->flag_d = 1;
			term->flag_r = 1;

			if (qp->term_info.in_tx) {
				struct iwarp_rdma_rreq *rreq;
				struct siw_wqe *wqe = tx_wqe(qp);

				/* Inbound RREQ error, detected during
				 * RRESP creation. Take state from
				 * current TX work queue element to
				 * reconstruct peers RREQ.
				 */
				rreq = (struct iwarp_rdma_rreq *)err_hdr;

				memcpy(&rreq->ctrl,
				       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
				       sizeof(struct iwarp_ctrl));

				rreq->rsvd = 0;
				rreq->ddp_qn =
					htonl(RDMAP_UNTAGGED_QN_RDMA_READ);

				/* Provide RREQ's MSN as kept aside */
				rreq->ddp_msn = htonl(wqe->sqe.sge[0].length);

				rreq->ddp_mo = htonl(wqe->processed);
				rreq->sink_stag = htonl(wqe->sqe.rkey);
				rreq->sink_to = cpu_to_be64(wqe->sqe.raddr);
				rreq->read_size = htonl(wqe->sqe.sge[0].length);
				rreq->source_stag = htonl(wqe->sqe.sge[0].lkey);
				rreq->source_to =
					cpu_to_be64(wqe->sqe.sge[0].laddr);

				iov[1].iov_base = rreq;
				iov[1].iov_len = sizeof(*rreq);

				rx_hdr = (union iwarp_hdr *)rreq;
			} else {
				/* Take RDMAP/DDP information from
				 * current (failed) inbound frame.
				 */
				iov[1].iov_base = rx_hdr;

				if (__rdmap_get_opcode(&rx_hdr->ctrl) ==
				    RDMAP_RDMA_READ_REQ)
					iov[1].iov_len =
						sizeof(struct iwarp_rdma_rreq);
				else /* SEND type */
					iov[1].iov_len =
						sizeof(struct iwarp_send);
			}
		} else {
			/* Do not report DDP hdr information if packet
			 * layout is unknown
			 */
			if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) ||
			    (qp->term_info.ecode == RDMAP_ECODE_OPCODE))
				break;

			iov[1].iov_base = rx_hdr;

			/* Only DDP frame will get attached */
			if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
				iov[1].iov_len =
					sizeof(struct iwarp_rdma_write);
			else
				iov[1].iov_len = sizeof(struct iwarp_send);

			term->flag_m = 1;
			term->flag_d = 1;
		}
		term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len);
		break;

	case TERM_ERROR_LAYER_DDP:
		/* Report error encountered while DDP processing.
		 * This can only happen as a result of inbound
		 * DDP processing
		 */

		/* Do not report DDP hdr information if packet
		 * layout is unknown
		 */
		if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) &&
		     (qp->term_info.ecode == DDP_ECODE_T_VERSION)) ||
		    ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) &&
		     (qp->term_info.ecode == DDP_ECODE_UT_VERSION)))
			break;

		iov[1].iov_base = rx_hdr;

		/* Only DDP frame will get attached */
		if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
			iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged);
		else
			iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged);

		term->flag_m = 1;
		term->flag_d = 1;

		term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len);
		break;
	}
	if (term->flag_m || term->flag_d || term->flag_r) {
		iov[2].iov_base = &crc;
		iov[2].iov_len = sizeof(crc);
		len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE;
		num_frags = 3;
	} else {
		iov[1].iov_base = &crc;
		iov[1].iov_len = sizeof(crc);
		len_terminate = sizeof(*term) + MPA_CRC_SIZE;
		num_frags = 2;
	}

	/* Adjust DDP Segment Length parameter, if valid */
	if (term->flag_m) {
		u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len);
		enum rdma_opcode op = __rdmap_get_opcode(&rx_hdr->ctrl);

		real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE;
		rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len);
	}

	term->ctrl.mpa_len =
		cpu_to_be16(len_terminate - (MPA_HDR_SIZE + MPA_CRC_SIZE));
	if (qp->tx_ctx.mpa_crc_hd) {
		crypto_shash_init(qp->tx_ctx.mpa_crc_hd);
		if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
					(u8 *)iov[0].iov_base,
					iov[0].iov_len))
			goto out;

		if (num_frags == 3) {
			if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
						(u8 *)iov[1].iov_base,
						iov[1].iov_len))
				goto out;
		}
		crypto_shash_final(qp->tx_ctx.mpa_crc_hd, (u8 *)&crc);
	}

	rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate);
	siw_dbg_qp(qp, "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n",
		   rv == len_terminate ? "success" : "failure",
		   __rdmap_term_layer(term), __rdmap_term_etype(term),
		   __rdmap_term_ecode(term), rv);
out:
	kfree(term);
	kfree(err_hdr);
}
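
/*
 * Note on the TERMINATE frame assembled above: iov[0] carries the terminate
 * header itself, iov[1] optionally carries a copy of the DDP/RDMAP header
 * the error relates to, and the last fragment is the MPA CRC placeholder
 * (left zero if CRC is not enabled for this connection).
 */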
/*
 * Handle all attrs other than state
 */
static void siw_qp_modify_nonstate(struct siw_qp *qp,
				   struct siw_qp_attrs *attrs,
				   enum siw_qp_attr_mask mask)
{
	if (mask & SIW_QP_ATTR_ACCESS_FLAGS) {
		if (attrs->flags & SIW_RDMA_BIND_ENABLED)
			qp->attrs.flags |= SIW_RDMA_BIND_ENABLED;
		else
			qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED;

		if (attrs->flags & SIW_RDMA_WRITE_ENABLED)
			qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED;
		else
			qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED;

		if (attrs->flags & SIW_RDMA_READ_ENABLED)
			qp->attrs.flags |= SIW_RDMA_READ_ENABLED;
		else
			qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED;
	}
}
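
/*
 * QP state transition out of IDLE/RTR, as requested by the verbs layer.
 * Moving to RTS requires a connected socket and MPA parameters, sets up
 * optional CRC, resets the DDP message sequence numbers for both
 * directions and allocates the IRD/ORD read queues.
 */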
static int siw_qp_nextstate_from_idle(struct siw_qp *qp,
				      struct siw_qp_attrs *attrs,
				      enum siw_qp_attr_mask mask)
{
	int rv = 0;

	switch (attrs->state) {
	case SIW_QP_STATE_RTS:
		if (attrs->flags & SIW_MPA_CRC) {
			rv = siw_qp_enable_crc(qp);
			if (rv)
				break;
		}
		if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) {
			siw_dbg_qp(qp, "no socket\n");
			rv = -EINVAL;
			break;
		}
		if (!(mask & SIW_QP_ATTR_MPA)) {
			siw_dbg_qp(qp, "no MPA\n");
			rv = -EINVAL;
			break;
		}
		/*
		 * Initialize iWARP TX state
		 */
		qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0;
		qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0;
		qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0;

		/*
		 * Initialize iWARP RX state
		 */
		qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
		qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
		qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;

		/*
		 * init IRD free queue, caller has already checked
		 * limits.
		 */
		rv = siw_qp_readq_init(qp, attrs->irq_size,
				       attrs->orq_size);
		if (rv)
			break;

		qp->attrs.sk = attrs->sk;
		qp->attrs.state = SIW_QP_STATE_RTS;

		siw_dbg_qp(qp, "enter RTS: crc=%s, ord=%u, ird=%u\n",
			   attrs->flags & SIW_MPA_CRC ? "y" : "n",
			   qp->attrs.orq_size, qp->attrs.irq_size);
		break;

	case SIW_QP_STATE_ERROR:
		siw_rq_flush(qp);
		qp->attrs.state = SIW_QP_STATE_ERROR;
		if (qp->cep) {
			siw_cep_put(qp->cep);
			qp->cep = NULL;
		}
		break;

	default:
		break;
	}
	return rv;
}
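
/*
 * QP state transition out of RTS. Returns nonzero if the caller
 * (siw_qp_modify()) must drop the LLP connection for this QP.
 */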
static int siw_qp_nextstate_from_rts(struct siw_qp *qp,
				     struct siw_qp_attrs *attrs)
{
	int drop_conn = 0;

	switch (attrs->state) {
	case SIW_QP_STATE_CLOSING:
		/*
		 * Verbs: move to IDLE if SQ and ORQ are empty.
		 * Move to ERROR otherwise. But first of all we must
		 * close the connection. So we keep CLOSING or ERROR
		 * as a transient state, schedule connection drop work
		 * and wait for the socket state change upcall to
		 * come back closed.
		 */
		if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) {
			qp->attrs.state = SIW_QP_STATE_CLOSING;
		} else {
			qp->attrs.state = SIW_QP_STATE_ERROR;
			siw_sq_flush(qp);
		}
		siw_rq_flush(qp);

		drop_conn = 1;
		break;

	case SIW_QP_STATE_TERMINATE:
		qp->attrs.state = SIW_QP_STATE_TERMINATE;

		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
				   RDMAP_ETYPE_CATASTROPHIC,
				   RDMAP_ECODE_UNSPECIFIED, 1);
		drop_conn = 1;
		break;

	case SIW_QP_STATE_ERROR:
		/*
		 * This is an emergency close.
		 *
		 * Any in progress transmit operation will get
		 * cancelled.
		 * This will likely result in a protocol failure,
		 * if a TX operation is in transit. The caller
		 * could unconditionally wait to give the current
		 * operation a chance to complete.
		 * Esp., how to handle the non-empty IRQ case?
		 * The peer was asking for data transfer at a valid
		 * point in time.
		 */
		siw_sq_flush(qp);
		siw_rq_flush(qp);
		qp->attrs.state = SIW_QP_STATE_ERROR;
		drop_conn = 1;
		break;

	default:
		break;
	}
	return drop_conn;
}
static void siw_qp_nextstate_from_term(struct siw_qp *qp,
				       struct siw_qp_attrs *attrs)
{
	switch (attrs->state) {
	case SIW_QP_STATE_ERROR:
		siw_rq_flush(qp);
		qp->attrs.state = SIW_QP_STATE_ERROR;

		if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
			siw_sq_flush(qp);
		break;

	default:
		break;
	}
}
static int siw_qp_nextstate_from_close(struct siw_qp *qp,
				       struct siw_qp_attrs *attrs)
{
	int rv = 0;

	switch (attrs->state) {
	case SIW_QP_STATE_IDLE:
		WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE);
		qp->attrs.state = SIW_QP_STATE_IDLE;
		break;

	case SIW_QP_STATE_CLOSING:
		/*
		 * The LLP may have already moved the QP to CLOSING
		 * due to a graceful peer close init
		 */
		break;

	case SIW_QP_STATE_ERROR:
		/*
		 * QP was moved to CLOSING by LLP event
		 * not yet seen by user.
		 */
		qp->attrs.state = SIW_QP_STATE_ERROR;

		if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
			siw_sq_flush(qp);

		siw_rq_flush(qp);
		break;

	default:
		siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
			   siw_qp_state_to_string[qp->attrs.state],
			   siw_qp_state_to_string[attrs->state]);

		rv = -ECONNABORTED;
	}
	return rv;
}
/*
 * Caller must hold qp->state_lock
 */
int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
		  enum siw_qp_attr_mask mask)
{
	int drop_conn = 0, rv = 0;

	if (!mask)
		return 0;

	siw_dbg_qp(qp, "state: %s => %s\n",
		   siw_qp_state_to_string[qp->attrs.state],
		   siw_qp_state_to_string[attrs->state]);

	if (mask != SIW_QP_ATTR_STATE)
		siw_qp_modify_nonstate(qp, attrs, mask);

	if (!(mask & SIW_QP_ATTR_STATE))
		return 0;

	switch (qp->attrs.state) {
	case SIW_QP_STATE_IDLE:
	case SIW_QP_STATE_RTR:
		rv = siw_qp_nextstate_from_idle(qp, attrs, mask);
		break;

	case SIW_QP_STATE_RTS:
		drop_conn = siw_qp_nextstate_from_rts(qp, attrs);
		break;

	case SIW_QP_STATE_TERMINATE:
		siw_qp_nextstate_from_term(qp, attrs);
		break;

	case SIW_QP_STATE_CLOSING:
		siw_qp_nextstate_from_close(qp, attrs);
		break;

	default:
		break;
	}
	if (drop_conn)
		siw_qp_cm_drop(qp, 0);

	return rv;
}
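
/*
 * Typical (illustrative only) use of siw_qp_modify(): drive the QP into
 * ERROR under the state lock, e.g. after a fatal connection event:
 *
 *	struct siw_qp_attrs qp_attrs;
 *
 *	memset(&qp_attrs, 0, sizeof(qp_attrs));
 *	qp_attrs.state = SIW_QP_STATE_ERROR;
 *	down_write(&qp->state_lock);
 *	siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);
 *	up_write(&qp->state_lock);
 */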
void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe)
{
	rreq->id = sqe->id;
	rreq->opcode = sqe->opcode;
	rreq->sge[0].laddr = sqe->sge[0].laddr;
	rreq->sge[0].length = sqe->sge[0].length;
	rreq->sge[0].lkey = sqe->sge[0].lkey;
	rreq->sge[1].lkey = sqe->sge[1].lkey;
	rreq->flags = sqe->flags | SIW_WQE_VALID;
	rreq->num_sge = 1;
}
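
/*
 * The ORQ copy made above is what an inbound READ RESPONSE is later
 * matched and validated against, including for loopback transfers.
 */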
/*
 * Must be called with SQ locked.
 * To avoid complete SQ starvation by constant inbound READ requests,
 * the active IRQ will not be served after qp->irq_burst, if the
 * SQ has pending work.
 */
int siw_activate_tx(struct siw_qp *qp)
{
	struct siw_sqe *irqe, *sqe;
	struct siw_wqe *wqe = tx_wqe(qp);
	int rv = 1;

	irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];

	if (irqe->flags & SIW_WQE_VALID) {
		sqe = sq_get_next(qp);

		/*
		 * Avoid local WQE processing starvation in case
		 * of constant inbound READ request stream
		 */
		if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
			qp->irq_burst = 0;
			goto skip_irq;
		}
		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
		wqe->wr_status = SIW_WR_QUEUED;

		/* start READ RESPONSE */
		wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
		wqe->sqe.flags = 0;
		if (irqe->num_sge) {
			wqe->sqe.num_sge = 1;
			wqe->sqe.sge[0].length = irqe->sge[0].length;
			wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
			wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
		} else {
			wqe->sqe.num_sge = 0;
		}

		/* Retain original RREQ's message sequence number for
		 * potential error reporting cases.
		 */
		wqe->sqe.sge[1].length = irqe->sge[1].length;

		wqe->sqe.rkey = irqe->rkey;
		wqe->sqe.raddr = irqe->raddr;

		wqe->processed = 0;
		qp->irq_get++;

		/* mark current IRQ entry free */
		smp_store_mb(irqe->flags, 0);

		goto out;
	}
	sqe = sq_get_next(qp);
	if (sqe) {
skip_irq:
		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
		wqe->wr_status = SIW_WR_QUEUED;

		/* First copy SQE to kernel private memory */
		memcpy(&wqe->sqe, sqe, sizeof(*sqe));

		if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
			rv = -EINVAL;
			goto out;
		}
		if (wqe->sqe.flags & SIW_WQE_INLINE) {
			if (wqe->sqe.opcode != SIW_OP_SEND &&
			    wqe->sqe.opcode != SIW_OP_WRITE) {
				rv = -EINVAL;
				goto out;
			}
			if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
				rv = -EINVAL;
				goto out;
			}
			wqe->sqe.sge[0].laddr = (uintptr_t)&wqe->sqe.sge[1];
			wqe->sqe.sge[0].lkey = 0;
			wqe->sqe.num_sge = 1;
		}
		if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
			/* A READ cannot be fenced */
			if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
				     wqe->sqe.opcode ==
					     SIW_OP_READ_LOCAL_INV)) {
				siw_dbg_qp(qp, "cannot fence read\n");
				rv = -EINVAL;
				goto out;
			}
			spin_lock(&qp->orq_lock);

			if (!siw_orq_empty(qp)) {
				qp->tx_ctx.orq_fence = 1;
				rv = 0;
			}
			spin_unlock(&qp->orq_lock);

		} else if (wqe->sqe.opcode == SIW_OP_READ ||
			   wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
			struct siw_sqe *rreq;

			wqe->sqe.num_sge = 1;

			spin_lock(&qp->orq_lock);

			rreq = orq_get_free(qp);
			if (rreq) {
				/*
				 * Make an immediate copy in ORQ to be ready
				 * to process loopback READ reply
				 */
				siw_read_to_orq(rreq, &wqe->sqe);
				qp->orq_put++;
			} else {
				qp->tx_ctx.orq_fence = 1;
				rv = 0;
			}
			spin_unlock(&qp->orq_lock);
		}

		/* Clear SQE, can be re-used by application */
		smp_store_mb(sqe->flags, 0);
		qp->sq_get++;
	} else {
		rv = 0;
	}
out:
	if (unlikely(rv < 0)) {
		siw_dbg_qp(qp, "error %d\n", rv);
		wqe->wr_status = SIW_WR_IDLE;
	}
	return rv;
}
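
/*
 * siw_activate_tx() returns 1 with tx_wqe(qp) populated if there is new
 * work to transmit, 0 if both the IRQ and the SQ are empty or fenced,
 * and a negative errno if a fetched SQE is malformed.
 */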
/*
 * Check if current CQ state qualifies for calling CQ completion
 * handler. Must be called with CQ lock held.
 */
static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags)
{
	u32 cq_notify;

	if (!cq->base_cq.comp_handler)
		return false;

	/* Read application shared notification state */
	cq_notify = READ_ONCE(cq->notify->flags);

	if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) ||
	    ((cq_notify & SIW_NOTIFY_SOLICITED) &&
	     (flags & SIW_WQE_SOLICITED))) {
		/*
		 * CQ notification is one-shot: Since the
		 * current CQE causes user notification,
		 * the CQ gets dis-armed and must be re-armed
		 * by the user for a new notification.
		 */
		WRITE_ONCE(cq->notify->flags, SIW_NOTIFY_NOT);

		return true;
	}
	return false;
}
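
/*
 * Post a work completion for a Send Queue (or ORQ) element to the send
 * CQ. If the CQ is full, -ENOMEM is returned and IB_EVENT_CQ_ERR is
 * signalled; without an attached CQ the SQE is silently recycled.
 */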
int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
		     enum siw_wc_status status)
{
	struct siw_cq *cq = qp->scq;
	int rv = 0;

	if (cq) {
		u32 sqe_flags = sqe->flags;
		struct siw_cqe *cqe;
		u32 idx;
		unsigned long flags;

		spin_lock_irqsave(&cq->lock, flags);

		idx = cq->cq_put % cq->num_cqe;
		cqe = &cq->queue[idx];

		if (!READ_ONCE(cqe->flags)) {
			bool notify;

			cqe->id = sqe->id;
			cqe->opcode = sqe->opcode;
			cqe->status = status;
			cqe->imm_data = 0;
			cqe->bytes = bytes;

			if (rdma_is_kernel_res(&cq->base_cq.res))
				cqe->base_qp = &qp->base_qp;
			else
				cqe->qp_id = qp_id(qp);

			/* mark CQE valid for application */
			WRITE_ONCE(cqe->flags, SIW_WQE_VALID);
			/* recycle SQE */
			smp_store_mb(sqe->flags, 0);

			cq->cq_put++;
			notify = siw_cq_notify_now(cq, sqe_flags);

			spin_unlock_irqrestore(&cq->lock, flags);

			if (notify) {
				siw_dbg_cq(cq, "Call completion handler\n");
				cq->base_cq.comp_handler(&cq->base_cq,
						cq->base_cq.cq_context);
			}
		} else {
			spin_unlock_irqrestore(&cq->lock, flags);
			rv = -ENOMEM;
			siw_cq_event(cq, IB_EVENT_CQ_ERR);
		}
	} else {
		/* recycle SQE */
		smp_store_mb(sqe->flags, 0);
	}
	return rv;
}
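
/*
 * Post a work completion for a Receive Queue element to the receive CQ,
 * optionally flagging a remotely invalidated STag. Error handling
 * mirrors siw_sqe_complete().
 */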
int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes,
		     u32 inval_stag, enum siw_wc_status status)
{
	struct siw_cq *cq = qp->rcq;
	int rv = 0;

	if (cq) {
		struct siw_cqe *cqe;
		u32 idx;
		unsigned long flags;

		spin_lock_irqsave(&cq->lock, flags);

		idx = cq->cq_put % cq->num_cqe;
		cqe = &cq->queue[idx];

		if (!READ_ONCE(cqe->flags)) {
			bool notify;
			u8 cqe_flags = SIW_WQE_VALID;

			cqe->id = rqe->id;
			cqe->opcode = SIW_OP_RECEIVE;
			cqe->status = status;
			cqe->imm_data = 0;
			cqe->bytes = bytes;

			if (rdma_is_kernel_res(&cq->base_cq.res)) {
				cqe->base_qp = &qp->base_qp;
				if (inval_stag) {
					cqe_flags |= SIW_WQE_REM_INVAL;
					cqe->inval_stag = inval_stag;
				}
			} else {
				cqe->qp_id = qp_id(qp);
			}
			/* mark CQE valid for application */
			WRITE_ONCE(cqe->flags, cqe_flags);
			/* recycle RQE */
			smp_store_mb(rqe->flags, 0);

			cq->cq_put++;
			notify = siw_cq_notify_now(cq, SIW_WQE_SIGNALLED);

			spin_unlock_irqrestore(&cq->lock, flags);

			if (notify) {
				siw_dbg_cq(cq, "Call completion handler\n");
				cq->base_cq.comp_handler(&cq->base_cq,
						cq->base_cq.cq_context);
			}
		} else {
			spin_unlock_irqrestore(&cq->lock, flags);
			rv = -ENOMEM;
			siw_cq_event(cq, IB_EVENT_CQ_ERR);
		}
	} else {
		/* recycle RQE */
		smp_store_mb(rqe->flags, 0);
	}
	return rv;
}
/*
 * Flush SQ and ORQ entries to CQ.
 *
 * Must be called with QP state write lock held.
 * Therefore, SQ and ORQ lock must not be taken.
 */
void siw_sq_flush(struct siw_qp *qp)
{
	struct siw_sqe *sqe;
	struct siw_wqe *wqe = tx_wqe(qp);
	int async_event = 0;

	/*
	 * Start with completing any work currently on the ORQ
	 */
	while (qp->attrs.orq_size) {
		sqe = &qp->orq[qp->orq_get % qp->attrs.orq_size];
		if (!READ_ONCE(sqe->flags))
			break;

		if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
			break;

		WRITE_ONCE(sqe->flags, 0);
		qp->orq_get++;
	}
	/*
	 * Flush an in-progress WQE if present
	 */
	if (wqe->wr_status != SIW_WR_IDLE) {
		siw_dbg_qp(qp, "flush current SQE, type %d, status %d\n",
			   tx_type(wqe), wqe->wr_status);

		siw_wqe_put_mem(wqe, tx_type(wqe));

		if (tx_type(wqe) != SIW_OP_READ_RESPONSE &&
		    ((tx_type(wqe) != SIW_OP_READ &&
		      tx_type(wqe) != SIW_OP_READ_LOCAL_INV) ||
		     wqe->wr_status == SIW_WR_QUEUED))
			/*
			 * An in-progress Read Request is already in
			 * the ORQ
			 */
			siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
					 SIW_WC_WR_FLUSH_ERR);

		wqe->wr_status = SIW_WR_IDLE;
	}
	/*
	 * Flush the Send Queue
	 */
	while (qp->attrs.sq_size) {
		sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
		if (!READ_ONCE(sqe->flags))
			break;

		async_event = 1;
		if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
			/*
			 * Shall IB_EVENT_SQ_DRAINED be suppressed if work
			 * completion fails?
			 */
			break;

		WRITE_ONCE(sqe->flags, 0);
		qp->sq_get++;
	}
	if (async_event)
		siw_qp_event(qp, IB_EVENT_SQ_DRAINED);
}
/*
 * Flush recv queue entries to CQ. Also
 * takes care of pending active tagged and untagged
 * inbound transfers, which have target memory
 * referenced.
 *
 * Must be called with QP state write lock held.
 * Therefore, RQ lock must not be taken.
 */
void siw_rq_flush(struct siw_qp *qp)
{
	struct siw_wqe *wqe = &qp->rx_untagged.wqe_active;

	/*
	 * Flush an in-progress untagged operation if present
	 */
	if (wqe->wr_status != SIW_WR_IDLE) {
		siw_dbg_qp(qp, "flush current rqe, type %d, status %d\n",
			   rx_type(wqe), wqe->wr_status);

		siw_wqe_put_mem(wqe, rx_type(wqe));

		if (rx_type(wqe) == SIW_OP_RECEIVE) {
			siw_rqe_complete(qp, &wqe->rqe, wqe->bytes,
					 0, SIW_WC_WR_FLUSH_ERR);
		} else if (rx_type(wqe) != SIW_OP_READ &&
			   rx_type(wqe) != SIW_OP_READ_RESPONSE &&
			   rx_type(wqe) != SIW_OP_WRITE) {
			siw_sqe_complete(qp, &wqe->sqe, 0, SIW_WC_WR_FLUSH_ERR);
		}
		wqe->wr_status = SIW_WR_IDLE;
	}
	wqe = &qp->rx_tagged.wqe_active;

	if (wqe->wr_status != SIW_WR_IDLE) {
		siw_wqe_put_mem(wqe, rx_type(wqe));
		wqe->wr_status = SIW_WR_IDLE;
	}
	/*
	 * Flush the Receive Queue
	 */
	while (qp->attrs.rq_size) {
		struct siw_rqe *rqe =
			&qp->recvq[qp->rq_get % qp->attrs.rq_size];

		if (!READ_ONCE(rqe->flags))
			break;

		if (siw_rqe_complete(qp, rqe, 0, 0, SIW_WC_WR_FLUSH_ERR) != 0)
			break;

		WRITE_ONCE(rqe->flags, 0);
		qp->rq_get++;
	}
}
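
/*
 * Insert the QP into the device's XArray, which also assigns the QP
 * number, and take the initial reference.
 */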
int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp)
{
	int rv = xa_alloc(&sdev->qp_xa, &qp->base_qp.qp_num, qp, xa_limit_32b,
			  GFP_KERNEL);

	if (!rv) {
		kref_init(&qp->ref);
		qp->sdev = sdev;
		siw_dbg_qp(qp, "new QP\n");
	}
	return rv;
}
void siw_free_qp(struct kref *ref)
{
	struct siw_qp *found, *qp = container_of(ref, struct siw_qp, ref);
	struct siw_device *sdev = qp->sdev;
	unsigned long flags;

	if (qp->cep)
		siw_cep_put(qp->cep);

	found = xa_erase(&sdev->qp_xa, qp_id(qp));
	WARN_ON(found != qp);
	spin_lock_irqsave(&sdev->lock, flags);
	list_del(&qp->devq);
	spin_unlock_irqrestore(&sdev->lock, flags);

	vfree(qp->sendq);
	vfree(qp->recvq);
	vfree(qp->irq);
	vfree(qp->orq);

	siw_put_tx_cpu(qp->tx_cpu);

	atomic_dec(&sdev->num_qp);
	siw_dbg_qp(qp, "free QP\n");
	kfree_rcu(qp, rcu);
}