/*
 * Copyright(c) 2015 - 2018 Intel Corporation.
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * BSD LICENSE
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *  - Neither the name of Intel Corporation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */
#include <rdma/rdma_vt.h>
#include <rdma/rdmavt_qp.h>

#include "verbs_txreq.h"
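
/*
 * find_prev_entry - walk the s_ack_queue backwards from r_head_ack_queue
 * looking for the entry whose PSN range covers @psn, optionally reporting
 * the previous index, the previous ACK index, and whether that entry is
 * still scheduled to be sent.
 */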
struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
				      u8 *prev_ack, bool *scheduled)
	__must_hold(&qp->s_lock)
{
	struct rvt_ack_entry *e = NULL;
	u8 i, p;
	bool s = true;

	for (i = qp->r_head_ack_queue; ; i = p) {
		if (i == qp->s_tail_ack_queue)
			s = false;
		if (i)
			p = i - 1;
		else
			p = rvt_size_atomic(ib_to_rvt(qp->ibqp.device));
		if (p == qp->r_head_ack_queue) {
			e = NULL;
			break;
		}
		e = &qp->s_ack_queue[p];
		if (!e->opcode) {
			e = NULL;
			break;
		}
		if (cmp_psn(psn, e->psn) >= 0) {
			if (p == qp->s_tail_ack_queue &&
			    cmp_psn(psn, e->lpsn) <= 0)
				s = false;
			break;
		}
	}
	if (prev)
		*prev = p;
	if (prev_ack)
		*prev_ack = i;
	if (scheduled)
		*scheduled = s;
	return e;
}
/**
 * make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 * @dev: the device for this QP
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @ps: the xmit packet state
 *
 * Return 1 if constructed; otherwise, return 0.
 * Note that we are in the responder's side of the QP context.
 * Note the QP s_lock must be held.
 */
static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
		       struct ib_other_headers *ohdr,
		       struct hfi1_pkt_state *ps)
{
	struct rvt_ack_entry *e;
	u32 hwords, hdrlen;
	u32 len = 0;
	u32 bth0 = 0, bth2 = 0;
	u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
	int middle = 0;
	u32 pmtu = qp->pmtu;
	struct hfi1_qp_priv *qpriv = qp->priv;
	bool last_pkt;
	u32 delta;
	u8 next = qp->s_tail_ack_queue;
	struct tid_rdma_request *req;

	trace_hfi1_rsp_make_rc_ack(qp, 0);
	lockdep_assert_held(&qp->s_lock);
	/* Don't send an ACK if we aren't supposed to. */
	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
		goto bail;

	if (qpriv->hdr_type == HFI1_PKT_TYPE_9B)
		/* header size in 32-bit words LRH+BTH = (8+12)/4. */
		hwords = 5;
	else
		/* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */
		hwords = 7;

	switch (qp->s_ack_state) {
	case OP(RDMA_READ_RESPONSE_LAST):
	case OP(RDMA_READ_RESPONSE_ONLY):
		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		release_rdma_sge_mr(e);
		/* FALLTHROUGH */
	case OP(ATOMIC_ACKNOWLEDGE):
		/*
		 * We can increment the tail pointer now that the last
		 * response has been sent instead of only being
		 * constructed.
		 */
		if (++next > rvt_size_atomic(&dev->rdi))
			next = 0;
		/*
		 * Only advance the s_acked_ack_queue pointer if there
		 * have been no TID RDMA requests.
		 */
		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->opcode != TID_OP(WRITE_REQ) &&
		    qp->s_acked_ack_queue == qp->s_tail_ack_queue)
			qp->s_acked_ack_queue = next;
		qp->s_tail_ack_queue = next;
		trace_hfi1_rsp_make_rc_ack(qp, e->psn);
		/* FALLTHROUGH */
	case OP(ACKNOWLEDGE):
		/* Check for no next entry in the queue. */
		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
			if (qp->s_flags & RVT_S_ACK_PENDING)
				goto normal;
			goto bail;
		}

		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		/* Check for tid write fence */
		if ((qpriv->s_flags & HFI1_R_TID_WAIT_INTERLCK) ||
		    hfi1_tid_rdma_ack_interlock(qp, e)) {
			iowait_set_flag(&qpriv->s_iowait, IOWAIT_PENDING_IB);
			goto bail;
		}
		if (e->opcode == OP(RDMA_READ_REQUEST)) {
			/*
			 * If a RDMA read response is being resent and
			 * we haven't seen the duplicate request yet,
			 * then stop sending the remaining responses the
			 * responder has seen until the requester re-sends it.
			 */
			len = e->rdma_sge.sge_length;
			if (len && !e->rdma_sge.mr) {
				if (qp->s_acked_ack_queue ==
				    qp->s_tail_ack_queue)
					qp->s_acked_ack_queue =
						qp->r_head_ack_queue;
				qp->s_tail_ack_queue = qp->r_head_ack_queue;
				goto bail;
			}
			/* Copy SGE state in case we need to resend */
			ps->s_txreq->mr = e->rdma_sge.mr;
			if (ps->s_txreq->mr)
				rvt_get_mr(ps->s_txreq->mr);
			qp->s_ack_rdma_sge.sge = e->rdma_sge;
			qp->s_ack_rdma_sge.num_sge = 1;
			ps->s_txreq->ss = &qp->s_ack_rdma_sge;
			if (len > pmtu) {
				len = pmtu;
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
			} else {
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
				e->sent = 1;
			}
			ohdr->u.aeth = rvt_compute_aeth(qp);
			hwords++;
			qp->s_ack_rdma_psn = e->psn;
			bth2 = mask_psn(qp->s_ack_rdma_psn++);
		} else if (e->opcode == TID_OP(WRITE_REQ)) {
			/*
			 * If a TID RDMA WRITE RESP is being resent, we have to
			 * wait for the actual request. All requests that are to
			 * be resent will have their state set to
			 * TID_REQUEST_RESEND. When the new request arrives, the
			 * state will be changed to TID_REQUEST_RESEND_ACTIVE.
			 */
			req = ack_to_tid_req(e);
			if (req->state == TID_REQUEST_RESEND ||
			    req->state == TID_REQUEST_INIT_RESEND)
				goto bail;
			qp->s_ack_state = TID_OP(WRITE_RESP);
			qp->s_ack_rdma_psn = mask_psn(e->psn + req->cur_seg);
			goto write_resp;
		} else if (e->opcode == TID_OP(READ_REQ)) {
			/*
			 * If a TID RDMA read response is being resent and
			 * we haven't seen the duplicate request yet,
			 * then stop sending the remaining responses the
			 * responder has seen until the requester re-sends it.
			 */
			len = e->rdma_sge.sge_length;
			if (len && !e->rdma_sge.mr) {
				if (qp->s_acked_ack_queue ==
				    qp->s_tail_ack_queue)
					qp->s_acked_ack_queue =
						qp->r_head_ack_queue;
				qp->s_tail_ack_queue = qp->r_head_ack_queue;
				goto bail;
			}
			/* Copy SGE state in case we need to resend */
			ps->s_txreq->mr = e->rdma_sge.mr;
			if (ps->s_txreq->mr)
				rvt_get_mr(ps->s_txreq->mr);
			qp->s_ack_rdma_sge.sge = e->rdma_sge;
			qp->s_ack_rdma_sge.num_sge = 1;
			qp->s_ack_state = TID_OP(READ_RESP);
			goto read_resp;
		} else {
			/* COMPARE_SWAP or FETCH_ADD */
			ps->s_txreq->ss = NULL;
			len = 0;
			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
			ohdr->u.at.aeth = rvt_compute_aeth(qp);
			ib_u64_put(e->atomic_data, &ohdr->u.at.atomic_ack_eth);
			hwords += sizeof(ohdr->u.at) / sizeof(u32);
			bth2 = mask_psn(e->psn);
			e->sent = 1;
		}
		trace_hfi1_tid_write_rsp_make_rc_ack(qp);
		bth0 = qp->s_ack_state << 24;
		break;
	case OP(RDMA_READ_RESPONSE_FIRST):
		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_READ_RESPONSE_MIDDLE):
		ps->s_txreq->ss = &qp->s_ack_rdma_sge;
		ps->s_txreq->mr = qp->s_ack_rdma_sge.sge.mr;
		if (ps->s_txreq->mr)
			rvt_get_mr(ps->s_txreq->mr);
		len = qp->s_ack_rdma_sge.sge.sge_length;
		if (len > pmtu) {
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
		} else {
			ohdr->u.aeth = rvt_compute_aeth(qp);
			hwords++;
			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
			e = &qp->s_ack_queue[qp->s_tail_ack_queue];
			e->sent = 1;
		}
		bth0 = qp->s_ack_state << 24;
		bth2 = mask_psn(qp->s_ack_rdma_psn++);
		break;
	case TID_OP(WRITE_RESP):
write_resp:
		/*
		 * 1. Check if RVT_S_ACK_PENDING is set. If yes,
		 *    goto normal.
		 * 2. Attempt to allocate TID resources.
		 * 3. Remove RVT_S_RESP_PENDING flags from s_flags
		 * 4. If resources not available:
		 *    4.1 Set RVT_S_WAIT_TID_SPACE
		 *    4.2 Queue QP on RCD TID queue
		 *    4.3 Put QP on iowait list.
		 *    4.4 Build IB RNR NAK with appropriate timeout value
		 *    4.5 Return indication progress made.
		 * 5. If resources are available:
		 *    5.1 Program HW flow CSRs
		 *    5.2 Build TID RDMA WRITE RESP packet
		 *    5.3 If more resources needed, do 2.1 - 2.3.
		 *    5.4 Wake up next QP on RCD TID queue.
		 *    5.5 Return indication progress made.
		 */
		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		req = ack_to_tid_req(e);

		/*
		 * Send scheduled RNR NAK's. RNR NAK's need to be sent at
		 * segment boundaries, not at request boundaries. Don't change
		 * s_ack_state because we are still in the middle of a request
		 */
		if (qpriv->rnr_nak_state == TID_RNR_NAK_SEND &&
		    qp->s_tail_ack_queue == qpriv->r_tid_alloc &&
		    req->cur_seg == req->alloc_seg) {
			qpriv->rnr_nak_state = TID_RNR_NAK_SENT;
			goto normal_no_state;
		}

		bth2 = mask_psn(qp->s_ack_rdma_psn);
		hdrlen = hfi1_build_tid_rdma_write_resp(qp, e, ohdr, &bth1,
							bth2, &len,
							&ps->s_txreq->ss);
		if (!hdrlen)
			return 0;

		hwords += hdrlen;
		bth0 = qp->s_ack_state << 24;
		qp->s_ack_rdma_psn++;
		trace_hfi1_tid_req_make_rc_ack_write(qp, 0, e->opcode, e->psn,
						     e->lpsn, req);
		if (req->cur_seg != req->total_segs)
			break;

		/* Do not free e->rdma_sge until all data are received */
		qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
		break;
	case TID_OP(READ_RESP):
read_resp:
		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		ps->s_txreq->ss = &qp->s_ack_rdma_sge;
		delta = hfi1_build_tid_rdma_read_resp(qp, e, ohdr, &bth0,
						      &bth1, &bth2, &len,
						      &last_pkt);
		if (delta == 0)
			goto error_qp;
		hwords += delta;
		if (last_pkt) {
			e->sent = 1;
			/*
			 * Increment qp->s_tail_ack_queue through s_ack_state
			 * manipulation.
			 */
			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
		}
		break;
	case TID_OP(READ_REQ):
		goto bail;

	default:
normal:
		/*
		 * Send a regular ACK.
		 * Set the s_ack_state so we wait until after sending
		 * the ACK before setting s_ack_state to ACKNOWLEDGE
		 * (see above).
		 */
		qp->s_ack_state = OP(SEND_ONLY);
normal_no_state:
		if (qp->s_nak_state)
			ohdr->u.aeth =
				cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
					    (qp->s_nak_state <<
					     IB_AETH_CREDIT_SHIFT));
		else
			ohdr->u.aeth = rvt_compute_aeth(qp);
		hwords++;
		len = 0;
		bth0 = OP(ACKNOWLEDGE) << 24;
		bth2 = mask_psn(qp->s_ack_psn);
		qp->s_flags &= ~RVT_S_ACK_PENDING;
		ps->s_txreq->txreq.flags |= SDMA_TXREQ_F_VIP;
		ps->s_txreq->ss = NULL;
	}
	qp->s_rdma_ack_cnt++;
	ps->s_txreq->sde = qpriv->s_sde;
	ps->s_txreq->s_cur_size = len;
	ps->s_txreq->hdr_dwords = hwords;
	hfi1_make_ruc_header(qp, ohdr, bth0, bth1, bth2, middle, ps);
	return 1;
error_qp:
	spin_unlock_irqrestore(&qp->s_lock, ps->flags);
	spin_lock_irqsave(&qp->r_lock, ps->flags);
	spin_lock(&qp->s_lock);
	rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
	spin_unlock(&qp->s_lock);
	spin_unlock_irqrestore(&qp->r_lock, ps->flags);
	spin_lock_irqsave(&qp->s_lock, ps->flags);
bail:
	qp->s_ack_state = OP(ACKNOWLEDGE);
	/*
	 * Ensure s_rdma_ack_cnt changes are committed prior to resetting
	 * RVT_S_RESP_PENDING
	 */
	smp_wmb();
	qp->s_flags &= ~(RVT_S_RESP_PENDING
				| RVT_S_ACK_PENDING
				| HFI1_S_AHG_VALID);
	return 0;
}
/**
 * hfi1_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
 * @qp: a pointer to the QP
 * @ps: the current packet state
 *
 * Assumes s_lock is held.
 *
 * Return 1 if constructed; otherwise, return 0.
 */
int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps)
{
	struct hfi1_qp_priv *priv = qp->priv;
	struct hfi1_ibdev *dev = to_idev(qp->ibqp.device);
	struct ib_other_headers *ohdr;
	struct rvt_sge_state *ss = NULL;
	struct rvt_swqe *wqe;
	struct hfi1_swqe_priv *wpriv;
	struct tid_rdma_request *req = NULL;
	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	u32 hwords = 5;
	u32 len = 0;
	u32 bth0 = 0, bth2 = 0;
	u32 bth1 = qp->remote_qpn | (HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT);
	u32 pmtu = qp->pmtu;
	char newreq;
	int middle = 0;
	int delta;
	struct tid_rdma_flow *flow = NULL;
	struct tid_rdma_params *remote;

	trace_hfi1_sender_make_rc_req(qp);
	lockdep_assert_held(&qp->s_lock);
	ps->s_txreq = get_txreq(ps->dev, qp);
	if (!ps->s_txreq)
		goto bail_no_tx;
	if (priv->hdr_type == HFI1_PKT_TYPE_9B) {
		/* header size in 32-bit words LRH+BTH = (8+12)/4. */
		hwords = 5;
		if (rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)
			ohdr = &ps->s_txreq->phdr.hdr.ibh.u.l.oth;
		else
			ohdr = &ps->s_txreq->phdr.hdr.ibh.u.oth;
	} else {
		/* header size in 32-bit words 16B LRH+BTH = (16+12)/4. */
		hwords = 7;
		if ((rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) &&
		    (hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))))
			ohdr = &ps->s_txreq->phdr.hdr.opah.u.l.oth;
		else
			ohdr = &ps->s_txreq->phdr.hdr.opah.u.oth;
	}
	/* Sending responses has higher priority over sending requests. */
	if ((qp->s_flags & RVT_S_RESP_PENDING) &&
	    make_rc_ack(dev, qp, ohdr, ps))
		return 1;

	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_SEND_OK)) {
		if (!(ib_rvt_state_ops[qp->state] & RVT_FLUSH_SEND))
			goto bail;
		/* We are in the error state, flush the work request. */
		if (qp->s_last == READ_ONCE(qp->s_head))
			goto bail;
		/* If DMAs are in progress, we can't flush immediately. */
		if (iowait_sdma_pending(&priv->s_iowait)) {
			qp->s_flags |= RVT_S_WAIT_DMA;
			goto bail;
		}
		clear_ahg(qp);
		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
		hfi1_trdma_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
					 IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
		/* will get called again */
		goto done_free_tx;
	}

	if (qp->s_flags & (RVT_S_WAIT_RNR | RVT_S_WAIT_ACK | HFI1_S_WAIT_HALT))
		goto bail;

	if (cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) {
		if (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
			qp->s_flags |= RVT_S_WAIT_PSN;
			goto bail;
		}
		qp->s_sending_psn = qp->s_psn;
		qp->s_sending_hpsn = qp->s_psn - 1;
	}
	/* Send a request. */
	wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
check_s_state:
	switch (qp->s_state) {
	default:
		if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_NEXT_SEND_OK))
			goto bail;
		/*
		 * Resend an old request or start a new one.
		 *
		 * We keep track of the current SWQE so that
		 * we don't reset the "furthest progress" state
		 * if we need to back up.
		 */
		newreq = 0;
		if (qp->s_cur == qp->s_tail) {
			/* Check if send work queue is empty. */
			if (qp->s_tail == READ_ONCE(qp->s_head)) {
				clear_ahg(qp);
				goto bail;
			}
			/*
			 * If a fence is requested, wait for previous
			 * RDMA read and atomic operations to finish.
			 * However, there is no need to guard against
			 * TID RDMA READ after TID RDMA READ.
			 */
			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
			    qp->s_num_rd_atomic &&
			    (wqe->wr.opcode != IB_WR_TID_RDMA_READ ||
			     priv->pending_tid_r_segs < qp->s_num_rd_atomic)) {
				qp->s_flags |= RVT_S_WAIT_FENCE;
				goto bail;
			}
			/*
			 * Local operations are processed immediately
			 * after all prior requests have completed
			 */
			if (wqe->wr.opcode == IB_WR_REG_MR ||
			    wqe->wr.opcode == IB_WR_LOCAL_INV) {
				int local_ops = 0;
				int err = 0;

				if (qp->s_last != qp->s_cur)
					goto bail;
				if (++qp->s_cur == qp->s_size)
					qp->s_cur = 0;
				if (++qp->s_tail == qp->s_size)
					qp->s_tail = 0;
				if (!(wqe->wr.send_flags &
				      RVT_SEND_COMPLETION_ONLY)) {
					err = rvt_invalidate_rkey(
						qp,
						wqe->wr.ex.invalidate_rkey);
					local_ops = 1;
				}
				rvt_send_complete(qp, wqe,
						  err ? IB_WC_LOC_PROT_ERR
						      : IB_WC_SUCCESS);
				if (local_ops)
					atomic_dec(&qp->local_ops_pending);
				goto done_free_tx;
			}

			newreq = 1;
			qp->s_psn = wqe->psn;
		}
		/*
		 * Note that we have to be careful not to modify the
		 * original work request since we may need to resend
		 * it.
		 */
		len = wqe->length;
		ss = &qp->s_sge;
		bth2 = mask_psn(qp->s_psn);
		/*
		 * Interlock between various IB requests and TID RDMA
		 * if necessary.
		 */
		if ((priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) ||
		    hfi1_tid_rdma_wqe_interlock(qp, wqe))
			goto bail;

		switch (wqe->wr.opcode) {
		case IB_WR_SEND:
		case IB_WR_SEND_WITH_IMM:
		case IB_WR_SEND_WITH_INV:
			/* If no credit, return. */
			if (!rvt_rc_credit_avail(qp, wqe))
				goto bail;
			if (len > pmtu) {
				qp->s_state = OP(SEND_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_SEND) {
				qp->s_state = OP(SEND_ONLY);
			} else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after the BTH */
				ohdr->u.imm_data = wqe->wr.ex.imm_data;
				hwords += 1;
			} else {
				qp->s_state = OP(SEND_ONLY_WITH_INVALIDATE);
				/* Invalidate rkey comes after the BTH */
				ohdr->u.ieth = cpu_to_be32(
						wqe->wr.ex.invalidate_rkey);
				hwords += 1;
			}
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= IB_BTH_SOLICITED;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;
		case IB_WR_RDMA_WRITE:
			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
				qp->s_lsn++;
			goto no_flow_control;
		case IB_WR_RDMA_WRITE_WITH_IMM:
			/* If no credit, return. */
			if (!rvt_rc_credit_avail(qp, wqe))
				goto bail;
no_flow_control:
			put_ib_reth_vaddr(
				wqe->rdma_wr.remote_addr,
				&ohdr->u.rc.reth);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->rdma_wr.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			hwords += sizeof(struct ib_reth) / sizeof(u32);
			if (len > pmtu) {
				qp->s_state = OP(RDMA_WRITE_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
				qp->s_state = OP(RDMA_WRITE_ONLY);
			} else {
				qp->s_state =
					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after RETH */
				ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
				hwords += 1;
				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
					bth0 |= IB_BTH_SOLICITED;
			}
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;
		case IB_WR_TID_RDMA_WRITE:
			if (newreq) {
				/*
				 * Limit the number of TID RDMA WRITE requests.
				 */
				if (atomic_read(&priv->n_tid_requests) >=
				    HFI1_TID_RDMA_WRITE_CNT)
					goto bail;

				if (!(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
					qp->s_lsn++;
			}

			hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr,
								&bth1, &bth2,
								&len);
			if (priv->s_tid_cur == HFI1_QP_WQE_INVALID) {
				priv->s_tid_cur = qp->s_cur;
				if (priv->s_tid_tail == HFI1_QP_WQE_INVALID) {
					priv->s_tid_tail = qp->s_cur;
					priv->s_state = TID_OP(WRITE_RESP);
				}
			} else if (priv->s_tid_cur == priv->s_tid_head) {
				struct rvt_swqe *__w;
				struct tid_rdma_request *__r;

				__w = rvt_get_swqe_ptr(qp, priv->s_tid_cur);
				__r = wqe_to_tid_req(__w);

				/*
				 * The s_tid_cur pointer is advanced to s_cur if
				 * any of the following conditions about the WQE
				 * to which s_ti_cur currently points to are
				 * satisfied:
				 *   1. The request is not a TID RDMA WRITE
				 *      request,
				 *   2. The request is in the INACTIVE or
				 *      COMPLETE states (TID RDMA READ requests
				 *      stay at INACTIVE and TID RDMA WRITE
				 *      transition to COMPLETE when done),
				 *   3. The request is in the ACTIVE or SYNC
				 *      state and the number of completed
				 *      segments is equal to the total segment
				 *      count.
				 *      (If ACTIVE, the request is waiting for
				 *       ACKs. If SYNC, the request has not
				 *       received any responses because it's
				 *       waiting on a sync point.)
				 */
				if (__w->wr.opcode != IB_WR_TID_RDMA_WRITE ||
				    __r->state == TID_REQUEST_INACTIVE ||
				    __r->state == TID_REQUEST_COMPLETE ||
				    ((__r->state == TID_REQUEST_ACTIVE ||
				      __r->state == TID_REQUEST_SYNC) &&
				     __r->comp_seg == __r->total_segs)) {
					if (priv->s_tid_tail ==
					    priv->s_tid_cur &&
					    priv->s_state ==
					    TID_OP(WRITE_DATA_LAST)) {
						priv->s_tid_tail = qp->s_cur;
						priv->s_state =
							TID_OP(WRITE_RESP);
					}
					priv->s_tid_cur = qp->s_cur;
				}
				/*
				 * A corner case: when the last TID RDMA WRITE
				 * request was completed, s_tid_head,
				 * s_tid_cur, and s_tid_tail all point to the
				 * same location. Other requests are posted and
				 * s_cur wraps around to the same location,
				 * where a new TID RDMA WRITE is posted. In
				 * this case, none of the indices need to be
				 * updated. However, the priv->s_state should.
				 */
				if (priv->s_tid_tail == qp->s_cur &&
				    priv->s_state == TID_OP(WRITE_DATA_LAST))
					priv->s_state = TID_OP(WRITE_RESP);
			}
			req = wqe_to_tid_req(wqe);
			if (newreq) {
				priv->s_tid_head = qp->s_cur;
				priv->pending_tid_w_resp += req->total_segs;
				atomic_inc(&priv->n_tid_requests);
				atomic_dec(&priv->n_requests);
			} else {
				req->state = TID_REQUEST_RESEND;
				req->comp_seg = delta_psn(bth2, wqe->psn);
				/*
				 * Pull back any segments since we are going
				 * to re-receive them.
				 */
				req->setup_head = req->clear_tail;
				priv->pending_tid_w_resp +=
					delta_psn(wqe->lpsn, bth2) + 1;
			}

			trace_hfi1_tid_write_sender_make_req(qp, newreq);
			trace_hfi1_tid_req_make_req_write(qp, newreq,
							  wqe->wr.opcode,
							  wqe->psn, wqe->lpsn,
							  req);
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;
		case IB_WR_RDMA_READ:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (qp->s_num_rd_atomic >=
			    qp->s_max_rd_atomic) {
				qp->s_flags |= RVT_S_WAIT_RDMAR;
				goto bail;
			}
			qp->s_num_rd_atomic++;
			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
				qp->s_lsn++;
			put_ib_reth_vaddr(
				wqe->rdma_wr.remote_addr,
				&ohdr->u.rc.reth);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->rdma_wr.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			qp->s_state = OP(RDMA_READ_REQUEST);
			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
			ss = NULL;
			len = 0;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;
		case IB_WR_TID_RDMA_READ:
			trace_hfi1_tid_read_sender_make_req(qp, newreq);
			wpriv = wqe->priv;
			req = wqe_to_tid_req(wqe);
			trace_hfi1_tid_req_make_req_read(qp, newreq,
							 wqe->wr.opcode,
							 wqe->psn, wqe->lpsn,
							 req);
			delta = cmp_psn(qp->s_psn, wqe->psn);

			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow. We could get here under
			 * three conditions; (1) It's a new request; (2) We are
			 * sending the second or later segment of a request,
			 * but the qp->s_state is set to OP(RDMA_READ_REQUEST)
			 * when the last segment of a previous request is
			 * received just before this; (3) We are re-sending a
			 * request.
			 */
			if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
				qp->s_flags |= RVT_S_WAIT_RDMAR;
				goto bail;
			}
			if (newreq) {
				struct tid_rdma_flow *flow =
					&req->flows[req->setup_head];

				/*
				 * Set up s_sge as it is needed for TID
				 * allocation. However, if the pages have been
				 * walked and mapped, skip it. An earlier try
				 * has failed to allocate the TID entries.
				 */
				if (!flow->npagesets) {
					qp->s_sge.sge = wqe->sg_list[0];
					qp->s_sge.sg_list = wqe->sg_list + 1;
					qp->s_sge.num_sge = wqe->wr.num_sge;
					qp->s_sge.total_len = wqe->length;
					qp->s_len = wqe->length;
					req->clear_tail = req->setup_head;
					req->flow_idx = req->setup_head;
					req->state = TID_REQUEST_ACTIVE;
				}
			} else if (delta == 0) {
				/* Re-send a request */
				req->ack_pending = 0;
				req->flow_idx = req->clear_tail;
				req->state = TID_REQUEST_RESEND;
			}
			req->s_next_psn = qp->s_psn;
			/* Read one segment at a time */
			len = min_t(u32, req->seg_len,
				    wqe->length - req->seg_len * req->cur_seg);
			delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr,
							     &bth1, &bth2,
							     &len);
			if (delta <= 0) {
				/* Wait for TID space */
				goto bail;
			}
			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
				qp->s_lsn++;
			hwords += delta;
			ss = &wpriv->ss;
			/* Check if this is the last segment */
			if (req->cur_seg >= req->total_segs &&
			    ++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;
		case IB_WR_ATOMIC_CMP_AND_SWP:
		case IB_WR_ATOMIC_FETCH_AND_ADD:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (qp->s_num_rd_atomic >=
			    qp->s_max_rd_atomic) {
				qp->s_flags |= RVT_S_WAIT_RDMAR;
				goto bail;
			}
			qp->s_num_rd_atomic++;

			if (newreq && !(qp->s_flags & RVT_S_UNLIMITED_CREDIT))
				qp->s_lsn++;
			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
			    wqe->wr.opcode == IB_WR_OPFN) {
				qp->s_state = OP(COMPARE_SWAP);
				put_ib_ateth_swap(wqe->atomic_wr.swap,
						  &ohdr->u.atomic_eth);
				put_ib_ateth_compare(wqe->atomic_wr.compare_add,
						     &ohdr->u.atomic_eth);
			} else {
				qp->s_state = OP(FETCH_ADD);
				put_ib_ateth_swap(wqe->atomic_wr.compare_add,
						  &ohdr->u.atomic_eth);
				put_ib_ateth_compare(0, &ohdr->u.atomic_eth);
			}
			put_ib_ateth_vaddr(wqe->atomic_wr.remote_addr,
					   &ohdr->u.atomic_eth);
			ohdr->u.atomic_eth.rkey = cpu_to_be32(
				wqe->atomic_wr.rkey);
			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
			ss = NULL;
			len = 0;
			bth2 |= IB_BTH_REQ_ACK;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;
		}
		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ) {
			qp->s_sge.sge = wqe->sg_list[0];
			qp->s_sge.sg_list = wqe->sg_list + 1;
			qp->s_sge.num_sge = wqe->wr.num_sge;
			qp->s_sge.total_len = wqe->length;
			qp->s_len = wqe->length;
		}
		if (newreq) {
			qp->s_tail++;
			if (qp->s_tail >= qp->s_size)
				qp->s_tail = 0;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
		    wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
			qp->s_psn = wqe->lpsn + 1;
		else if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
			qp->s_psn = req->s_next_psn;
		else
			qp->s_psn++;
		break;
	case OP(RDMA_READ_RESPONSE_FIRST):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
		 * thread to indicate a SEND needs to be restarted from an
		 * earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
		/* FALLTHROUGH */
	case OP(SEND_FIRST):
		qp->s_state = OP(SEND_MIDDLE);
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
		bth2 = mask_psn(qp->s_psn++);
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
			break;
		}
		if (wqe->wr.opcode == IB_WR_SEND) {
			qp->s_state = OP(SEND_LAST);
		} else if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.ex.imm_data;
			hwords += 1;
		} else {
			qp->s_state = OP(SEND_LAST_WITH_INVALIDATE);
			/* invalidate data comes after the BTH */
			ohdr->u.ieth = cpu_to_be32(wqe->wr.ex.invalidate_rkey);
			hwords += 1;
		}
		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
			bth0 |= IB_BTH_SOLICITED;
		bth2 |= IB_BTH_REQ_ACK;
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;
	case OP(RDMA_READ_RESPONSE_LAST):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_LAST is used by the ACK processing
		 * thread to indicate a RDMA write needs to be restarted from
		 * an earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_FIRST):
		qp->s_state = OP(RDMA_WRITE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_MIDDLE):
		bth2 = mask_psn(qp->s_psn++);
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			middle = HFI1_CAP_IS_KSET(SDMA_AHG);
			break;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_WRITE) {
			qp->s_state = OP(RDMA_WRITE_LAST);
		} else {
			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.ex.imm_data;
			hwords += 1;
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= IB_BTH_SOLICITED;
		}
		bth2 |= IB_BTH_REQ_ACK;
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;
	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/*
		 * qp->s_state is normally set to the opcode of the
		 * last packet constructed for new requests and therefore
		 * is never set to RDMA read response.
		 * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
		 * thread to indicate a RDMA read needs to be restarted from
		 * an earlier PSN without interfering with the sending thread.
		 * See restart_rc().
		 */
		len = (delta_psn(qp->s_psn, wqe->psn)) * pmtu;
		put_ib_reth_vaddr(
			wqe->rdma_wr.remote_addr + len,
			&ohdr->u.rc.reth);
		ohdr->u.rc.reth.rkey =
			cpu_to_be32(wqe->rdma_wr.rkey);
		ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
		qp->s_state = OP(RDMA_READ_REQUEST);
		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
		bth2 = mask_psn(qp->s_psn) | IB_BTH_REQ_ACK;
		qp->s_psn = wqe->lpsn + 1;
		ss = NULL;
		len = 0;
		qp->s_cur++;
		if (qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		break;
	case TID_OP(WRITE_RESP):
		/*
		 * This value for s_state is used for restarting a TID RDMA
		 * WRITE request. See comment in OP(RDMA_READ_RESPONSE_MIDDLE
		 * above.
		 */
		req = wqe_to_tid_req(wqe);
		req->state = TID_REQUEST_RESEND;
		rcu_read_lock();
		remote = rcu_dereference(priv->tid_rdma.remote);
		req->comp_seg = delta_psn(qp->s_psn, wqe->psn);
		len = wqe->length - (req->comp_seg * remote->max_len);
		rcu_read_unlock();

		bth2 = mask_psn(qp->s_psn);
		hwords += hfi1_build_tid_rdma_write_req(qp, wqe, ohdr, &bth1,
							&bth2, &len);
		qp->s_psn = wqe->lpsn + 1;
		ss = NULL;
		qp->s_state = TID_OP(WRITE_REQ);
		priv->pending_tid_w_resp += delta_psn(wqe->lpsn, bth2) + 1;
		priv->s_tid_cur = qp->s_cur;
		if (++qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		trace_hfi1_tid_req_make_req_write(qp, 0, wqe->wr.opcode,
						  wqe->psn, wqe->lpsn, req);
		break;
	case TID_OP(READ_RESP):
		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ)
			goto bail;
		/* This is used to restart a TID read request */
		req = wqe_to_tid_req(wqe);
		wpriv = wqe->priv;
		/*
		 * Back down. The field qp->s_psn has been set to the psn with
		 * which the request should be restart. It's OK to use division
		 * as this is on the retry path.
		 */
		req->cur_seg = delta_psn(qp->s_psn, wqe->psn) / priv->pkts_ps;

		/*
		 * The following function need to be redefined to return the
		 * status to make sure that we find the flow. At the same
		 * time, we can use the req->state change to check if the
		 * call succeeds or not.
		 */
		req->state = TID_REQUEST_RESEND;
		hfi1_tid_rdma_restart_req(qp, wqe, &bth2);
		if (req->state != TID_REQUEST_ACTIVE) {
			/*
			 * Failed to find the flow. Release all allocated tid
			 * resources.
			 */
			hfi1_kern_exp_rcv_clear_all(req);
			hfi1_kern_clear_hw_flow(priv->rcd, qp);

			hfi1_trdma_send_complete(qp, wqe, IB_WC_LOC_QP_OP_ERR);
			goto bail;
		}
		req->state = TID_REQUEST_RESEND;
		len = min_t(u32, req->seg_len,
			    wqe->length - req->seg_len * req->cur_seg);
		flow = &req->flows[req->flow_idx];
		req->s_next_psn = flow->flow_state.ib_lpsn + 1;
		delta = hfi1_build_tid_rdma_read_packet(wqe, ohdr, &bth1,
							&bth2, &len);
		if (delta <= 0) {
			/* Wait for TID space */
			goto bail;
		}
		hwords += delta;
		ss = &wpriv->ss;
		/* Check if this is the last segment */
		if (req->cur_seg >= req->total_segs &&
		    ++qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		qp->s_psn = req->s_next_psn;
		trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
						 wqe->psn, wqe->lpsn, req);
		break;
	case TID_OP(READ_REQ):
		req = wqe_to_tid_req(wqe);
		delta = cmp_psn(qp->s_psn, wqe->psn);
		/*
		 * If the current WR is not TID RDMA READ, or this is the start
		 * of a new request, we need to change the qp->s_state so that
		 * the request can be set up properly.
		 */
		if (wqe->wr.opcode != IB_WR_TID_RDMA_READ || delta == 0 ||
		    qp->s_cur == qp->s_tail) {
			qp->s_state = OP(RDMA_READ_REQUEST);
			if (delta == 0 || qp->s_cur == qp->s_tail)
				goto check_s_state;
			else
				goto bail;
		}

		if (qp->s_num_rd_atomic >= qp->s_max_rd_atomic) {
			qp->s_flags |= RVT_S_WAIT_RDMAR;
			goto bail;
		}

		wpriv = wqe->priv;
		/* Read one segment at a time */
		len = min_t(u32, req->seg_len,
			    wqe->length - req->seg_len * req->cur_seg);
		delta = hfi1_build_tid_rdma_read_req(qp, wqe, ohdr, &bth1,
						     &bth2, &len);
		if (delta <= 0) {
			/* Wait for TID space */
			goto bail;
		}
		hwords += delta;
		ss = &wpriv->ss;
		/* Check if this is the last segment */
		if (req->cur_seg >= req->total_segs &&
		    ++qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		qp->s_psn = req->s_next_psn;
		trace_hfi1_tid_req_make_req_read(qp, 0, wqe->wr.opcode,
						 wqe->psn, wqe->lpsn, req);
		break;
	}
	qp->s_sending_hpsn = bth2;
	delta = delta_psn(bth2, wqe->psn);
	if (delta && delta % HFI1_PSN_CREDIT == 0 &&
	    wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
		bth2 |= IB_BTH_REQ_ACK;
	if (qp->s_flags & RVT_S_SEND_ONE) {
		qp->s_flags &= ~RVT_S_SEND_ONE;
		qp->s_flags |= RVT_S_WAIT_ACK;
		bth2 |= IB_BTH_REQ_ACK;
	}
	qp->s_len -= len;
	ps->s_txreq->hdr_dwords = hwords;
	ps->s_txreq->sde = priv->s_sde;
	ps->s_txreq->ss = ss;
	ps->s_txreq->s_cur_size = len;
	hfi1_make_ruc_header(
		qp,
		ohdr,
		bth0 | (qp->s_state << 24),
		bth1,
		bth2,
		middle,
		ps);
	return 1;

done_free_tx:
	hfi1_put_txreq(ps->s_txreq);
	ps->s_txreq = NULL;
	return 1;

bail:
	hfi1_put_txreq(ps->s_txreq);
	ps->s_txreq = NULL;

bail_no_tx:
	qp->s_flags &= ~RVT_S_BUSY;
	/*
	 * If we didn't get a txreq, the QP will be woken up later to try
	 * again. Set the flags to indicate which work item to wake
	 * up.
	 */
	iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB);
	return 0;
}
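
/*
 * hfi1_make_bth_aeth - fill in the AETH and BTH words 0-2 of an ACK.
 * A NAK AETH is built from r_nak_state/r_msn when a NAK is pending;
 * otherwise the AETH carries the credit/MSN computed by rvt_compute_aeth().
 */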
static inline void hfi1_make_bth_aeth(struct rvt_qp *qp,
				      struct ib_other_headers *ohdr,
				      u32 bth0, u32 bth1)
{
	if (qp->r_nak_state)
		ohdr->u.aeth = cpu_to_be32((qp->r_msn & IB_MSN_MASK) |
					   (qp->r_nak_state <<
					    IB_AETH_CREDIT_SHIFT));
	else
		ohdr->u.aeth = rvt_compute_aeth(qp);

	ohdr->bth[0] = cpu_to_be32(bth0);
	ohdr->bth[1] = cpu_to_be32(bth1 | qp->remote_qpn);
	ohdr->bth[2] = cpu_to_be32(mask_psn(qp->r_ack_psn));
}
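
/*
 * hfi1_queue_rc_ack - hand the ACK off to the send engine by marking the
 * QP with RVT_S_ACK_PENDING | RVT_S_RESP_PENDING and scheduling a send,
 * instead of sending an inline ACK directly.
 */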
static inline void hfi1_queue_rc_ack(struct hfi1_packet *packet, bool is_fecn)
{
	struct rvt_qp *qp = packet->qp;
	struct hfi1_ibport *ibp;
	unsigned long flags;

	spin_lock_irqsave(&qp->s_lock, flags);
	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK))
		goto unlock;
	ibp = rcd_to_iport(packet->rcd);
	this_cpu_inc(*ibp->rvp.rc_qacks);
	qp->s_flags |= RVT_S_ACK_PENDING | RVT_S_RESP_PENDING;
	qp->s_nak_state = qp->r_nak_state;
	qp->s_ack_psn = qp->r_ack_psn;
	if (is_fecn)
		qp->s_flags |= RVT_S_ECN;

	/* Schedule the send tasklet. */
	hfi1_schedule_send(qp);
unlock:
	spin_unlock_irqrestore(&qp->s_lock, flags);
}
static inline void hfi1_make_rc_ack_9B(struct hfi1_packet *packet,
				       struct hfi1_opa_header *opa_hdr,
				       u8 sc5, bool is_fecn,
				       u64 *pbc_flags, u32 *hwords,
				       u32 *nwords)
{
	struct rvt_qp *qp = packet->qp;
	struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
	struct ib_header *hdr = &opa_hdr->ibh;
	struct ib_other_headers *ohdr;
	u16 lrh0 = HFI1_LRH_BTH;
	u16 pkey;
	u32 bth0, bth1;

	opa_hdr->hdr_type = HFI1_PKT_TYPE_9B;
	ohdr = &hdr->u.oth;
	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4 */
	*hwords = 6;

	if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH)) {
		*hwords += hfi1_make_grh(ibp, &hdr->u.l.grh,
					 rdma_ah_read_grh(&qp->remote_ah_attr),
					 *hwords - 2, SIZE_OF_CRC);
		ohdr = &hdr->u.l.oth;
		lrh0 = HFI1_LRH_GRH;
	}
	/* set PBC_DC_INFO bit (aka SC[4]) in pbc_flags */
	*pbc_flags |= ((!!(sc5 & 0x10)) << PBC_DC_INFO_SHIFT);

	/* read pkey_index w/o lock (its atomic) */
	pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);

	lrh0 |= (sc5 & IB_SC_MASK) << IB_SC_SHIFT |
		(rdma_ah_get_sl(&qp->remote_ah_attr) & IB_SL_MASK) <<
			IB_SL_SHIFT;

	hfi1_make_ib_hdr(hdr, lrh0, *hwords + SIZE_OF_CRC,
			 opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr), 9B),
			 ppd->lid | rdma_ah_get_path_bits(&qp->remote_ah_attr));

	bth0 = pkey | (OP(ACKNOWLEDGE) << 24);
	if (qp->s_mig_state == IB_MIG_MIGRATED)
		bth0 |= IB_BTH_MIG_REQ;
	bth1 = (!!is_fecn) << IB_BECN_SHIFT;
	/*
	 * Inline ACKs go out without the use of the Verbs send engine, so
	 * we need to set the STL Verbs Extended bit here
	 */
	bth1 |= HFI1_CAP_IS_KSET(OPFN) << IB_BTHE_E_SHIFT;
	hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
}
static inline void hfi1_make_rc_ack_16B(struct hfi1_packet *packet,
					struct hfi1_opa_header *opa_hdr,
					u8 sc5, bool is_fecn,
					u64 *pbc_flags, u32 *hwords,
					u32 *nwords)
{
	struct rvt_qp *qp = packet->qp;
	struct hfi1_ibport *ibp = rcd_to_iport(packet->rcd);
	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
	struct hfi1_16b_header *hdr = &opa_hdr->opah;
	struct ib_other_headers *ohdr;
	u32 bth0, bth1 = 0;
	u16 len, pkey;
	bool becn = is_fecn;
	u8 l4 = OPA_16B_L4_IB_LOCAL;
	u8 extra_bytes;

	opa_hdr->hdr_type = HFI1_PKT_TYPE_16B;
	ohdr = &hdr->u.oth;
	/* header size in 32-bit words 16B LRH+BTH+AETH = (16+12+4)/4 */
	*hwords = 8;
	extra_bytes = hfi1_get_16b_padding(*hwords << 2, 0);
	*nwords = SIZE_OF_CRC + ((extra_bytes + SIZE_OF_LT) >> 2);

	if (unlikely(rdma_ah_get_ah_flags(&qp->remote_ah_attr) & IB_AH_GRH) &&
	    hfi1_check_mcast(rdma_ah_get_dlid(&qp->remote_ah_attr))) {
		*hwords += hfi1_make_grh(ibp, &hdr->u.l.grh,
					 rdma_ah_read_grh(&qp->remote_ah_attr),
					 *hwords - 4, *nwords);
		ohdr = &hdr->u.l.oth;
		l4 = OPA_16B_L4_IB_GLOBAL;
	}
	*pbc_flags |= PBC_PACKET_BYPASS | PBC_INSERT_BYPASS_ICRC;

	/* read pkey_index w/o lock (its atomic) */
	pkey = hfi1_get_pkey(ibp, qp->s_pkey_index);

	/* Convert dwords to flits */
	len = (*hwords + *nwords) >> 1;

	hfi1_make_16b_hdr(hdr, ppd->lid |
			  (rdma_ah_get_path_bits(&qp->remote_ah_attr) &
			   ((1 << ppd->lmc) - 1)),
			  opa_get_lid(rdma_ah_get_dlid(&qp->remote_ah_attr),
				      16B), len, pkey, becn, 0, l4, sc5);

	bth0 = pkey | (OP(ACKNOWLEDGE) << 24);
	bth0 |= extra_bytes << 20;
	if (qp->s_mig_state == IB_MIG_MIGRATED)
		bth1 = OPA_BTH_MIG_REQ;
	hfi1_make_bth_aeth(qp, ohdr, bth0, bth1);
}
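
/*
 * The ACK builder is dispatched through the table below, indexed by the
 * QP's header type (9B IB or 16B OPA); see hfi1_send_rc_ack().
 */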
typedef void (*hfi1_make_rc_ack)(struct hfi1_packet *packet,
				 struct hfi1_opa_header *opa_hdr,
				 u8 sc5, bool is_fecn,
				 u64 *pbc_flags, u32 *hwords,
				 u32 *nwords);

/* We support only two types - 9B and 16B for now */
static const hfi1_make_rc_ack hfi1_make_rc_ack_tbl[2] = {
	[HFI1_PKT_TYPE_9B] = &hfi1_make_rc_ack_9B,
	[HFI1_PKT_TYPE_16B] = &hfi1_make_rc_ack_16B
};
/**
 * hfi1_send_rc_ack - Construct an ACK packet and send it
 * @qp: a pointer to the QP
 *
 * This is called from hfi1_rc_rcv() and handle_receive_interrupt().
 * Note that RDMA reads and atomics are handled in the
 * send side QP state and send engine.
 */
void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn)
{
	struct hfi1_ctxtdata *rcd = packet->rcd;
	struct rvt_qp *qp = packet->qp;
	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
	struct hfi1_qp_priv *priv = qp->priv;
	struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);
	u8 sc5 = ibp->sl_to_sc[rdma_ah_get_sl(&qp->remote_ah_attr)];
	u64 pbc, pbc_flags = 0;
	u32 hwords = 0;
	u32 nwords = 0;
	u32 plen;
	struct pio_buf *pbuf;
	struct hfi1_opa_header opa_hdr;

	/* clear the defer count */
	qp->r_adefered = 0;

	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
	if (qp->s_flags & RVT_S_RESP_PENDING) {
		hfi1_queue_rc_ack(packet, is_fecn);
		return;
	}

	/* Ensure s_rdma_ack_cnt changes are committed */
	if (qp->s_rdma_ack_cnt) {
		hfi1_queue_rc_ack(packet, is_fecn);
		return;
	}

	/* Don't try to send ACKs if the link isn't ACTIVE */
	if (driver_lstate(ppd) != IB_PORT_ACTIVE)
		return;

	/* Make the appropriate header */
	hfi1_make_rc_ack_tbl[priv->hdr_type](packet, &opa_hdr, sc5, is_fecn,
					     &pbc_flags, &hwords, &nwords);

	plen = 2 /* PBC */ + hwords + nwords;
	pbc = create_pbc(ppd, pbc_flags, qp->srate_mbps,
			 sc_to_vlt(ppd->dd, sc5), plen);
	pbuf = sc_buffer_alloc(rcd->sc, plen, NULL, NULL);
	if (IS_ERR_OR_NULL(pbuf)) {
		/*
		 * We have no room to send at the moment. Pass
		 * responsibility for sending the ACK to the send engine
		 * so that when enough buffer space becomes available,
		 * the ACK is sent ahead of other outgoing packets.
		 */
		hfi1_queue_rc_ack(packet, is_fecn);
		return;
	}
	trace_ack_output_ibhdr(dd_from_ibdev(qp->ibqp.device),
			       &opa_hdr, ib_is_sc5(sc5));

	/* write the pbc and data */
	ppd->dd->pio_inline_send(ppd->dd, pbuf, pbc,
				 (priv->hdr_type == HFI1_PKT_TYPE_9B ?
				  (void *)&opa_hdr.ibh :
				  (void *)&opa_hdr.opah), hwords);
}
/**
 * update_num_rd_atomic - update the qp->s_num_rd_atomic
 * @qp: the QP
 * @psn: the packet sequence number to restart at
 * @wqe: the wqe
 *
 * This is called from reset_psn() to update qp->s_num_rd_atomic
 * for the current wqe.
 * Called at interrupt level with the QP s_lock held.
 */
static void update_num_rd_atomic(struct rvt_qp *qp, u32 psn,
				 struct rvt_swqe *wqe)
{
	u32 opcode = wqe->wr.opcode;

	if (opcode == IB_WR_RDMA_READ ||
	    opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
	    opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
		qp->s_num_rd_atomic++;
	} else if (opcode == IB_WR_TID_RDMA_READ) {
		struct tid_rdma_request *req = wqe_to_tid_req(wqe);
		struct hfi1_qp_priv *priv = qp->priv;

		if (cmp_psn(psn, wqe->lpsn) <= 0) {
			u32 cur_seg;

			cur_seg = (psn - wqe->psn) / priv->pkts_ps;
			req->ack_pending = cur_seg - req->comp_seg;
			priv->pending_tid_r_segs += req->ack_pending;
			qp->s_num_rd_atomic += req->ack_pending;
			trace_hfi1_tid_req_update_num_rd_atomic(qp, 0,
								wqe->wr.opcode,
								wqe->psn,
								wqe->lpsn,
								req);
		} else {
			priv->pending_tid_r_segs += req->total_segs;
			qp->s_num_rd_atomic += req->total_segs;
		}
	}
}
/**
 * reset_psn - reset the QP state to send starting from PSN
 * @qp: the QP
 * @psn: the packet sequence number to restart at
 *
 * This is called from hfi1_rc_rcv() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held.
 */
static void reset_psn(struct rvt_qp *qp, u32 psn)
{
	u32 n = qp->s_acked;
	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, n);
	u32 opcode;
	struct hfi1_qp_priv *priv = qp->priv;

	lockdep_assert_held(&qp->s_lock);
	qp->s_cur = n;
	priv->pending_tid_r_segs = 0;
	priv->pending_tid_w_resp = 0;
	qp->s_num_rd_atomic = 0;

	/*
	 * If we are starting the request from the beginning,
	 * let the normal send code handle initialization.
	 */
	if (cmp_psn(psn, wqe->psn) <= 0) {
		qp->s_state = OP(SEND_LAST);
		goto done;
	}
	update_num_rd_atomic(qp, psn, wqe);

	/* Find the work request opcode corresponding to the given PSN. */
	for (;;) {
		int diff;

		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail)
			break;
		wqe = rvt_get_swqe_ptr(qp, n);
		diff = cmp_psn(psn, wqe->psn);
		if (diff < 0) {
			/* Point wqe back to the previous one*/
			wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
			break;
		}
		qp->s_cur = n;
		/*
		 * If we are starting the request from the beginning,
		 * let the normal send code handle initialization.
		 */
		if (diff == 0) {
			qp->s_state = OP(SEND_LAST);
			goto done;
		}

		update_num_rd_atomic(qp, psn, wqe);
	}
	opcode = wqe->wr.opcode;

	/*
	 * Set the state to restart in the middle of a request.
	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
	 * See hfi1_make_rc_req().
	 */
	switch (opcode) {
	case IB_WR_SEND:
	case IB_WR_SEND_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
		break;

	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
		break;

	case IB_WR_TID_RDMA_WRITE:
		qp->s_state = TID_OP(WRITE_RESP);
		break;

	case IB_WR_RDMA_READ:
		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		break;

	case IB_WR_TID_RDMA_READ:
		qp->s_state = TID_OP(READ_RESP);
		break;

	default:
		/*
		 * This case shouldn't happen since its only
		 * ...
		 */
		qp->s_state = OP(SEND_LAST);
	}
done:
	priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
	qp->s_psn = psn;
	/*
	 * Set RVT_S_WAIT_PSN as rc_complete() may start the timer
	 * asynchronously before the send engine can get scheduled.
	 * Doing it in hfi1_make_rc_req() is too late.
	 */
	if ((cmp_psn(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
	    (cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
		qp->s_flags |= RVT_S_WAIT_PSN;
	qp->s_flags &= ~HFI1_S_AHG_VALID;
	trace_hfi1_sender_reset_psn(qp);
}
/*
 * Back up requester to resend the last un-ACKed request.
 * The QP r_lock and s_lock should be held and interrupts disabled.
 */
void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait)
{
	struct hfi1_qp_priv *priv = qp->priv;
	struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
	struct hfi1_ibport *ibp;

	lockdep_assert_held(&qp->r_lock);
	lockdep_assert_held(&qp->s_lock);
	trace_hfi1_sender_restart_rc(qp);
	if (qp->s_retry == 0) {
		if (qp->s_mig_state == IB_MIG_ARMED) {
			hfi1_migrate_qp(qp);
			qp->s_retry = qp->s_retry_cnt;
		} else if (qp->s_last == qp->s_acked) {
			/*
			 * We need special handling for the OPFN request WQEs as
			 * they are not allowed to generate real user errors
			 */
			if (wqe->wr.opcode == IB_WR_OPFN) {
				struct hfi1_ibport *ibp =
					to_iport(qp->ibqp.device, qp->port_num);
				/*
				 * Call opfn_conn_reply() with capcode and
				 * remaining data as 0 to close out the
				 * current request
				 */
				opfn_conn_reply(qp, priv->opfn.curr);
				wqe = do_rc_completion(qp, wqe, ibp);
				qp->s_flags &= ~RVT_S_WAIT_ACK;
			} else {
				trace_hfi1_tid_write_sender_restart_rc(qp, 0);
				if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
					struct tid_rdma_request *req;

					req = wqe_to_tid_req(wqe);
					hfi1_kern_exp_rcv_clear_all(req);
					hfi1_kern_clear_hw_flow(priv->rcd, qp);
				}

				hfi1_trdma_send_complete(qp, wqe,
							 IB_WC_RETRY_EXC_ERR);
				rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
			}
			return;
		} else { /* need to handle delayed completion */
			return;
		}
	} else {
		qp->s_retry--;
	}

	ibp = to_iport(qp->ibqp.device, qp->port_num);
	if (wqe->wr.opcode == IB_WR_RDMA_READ ||
	    wqe->wr.opcode == IB_WR_TID_RDMA_READ)
		ibp->rvp.n_rc_resends++;
	else
		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);

	qp->s_flags &= ~(RVT_S_WAIT_FENCE | RVT_S_WAIT_RDMAR |
			 RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_PSN |
			 RVT_S_WAIT_ACK | HFI1_S_WAIT_TID_RESP);
	if (wait)
		qp->s_flags |= RVT_S_SEND_ONE;
	reset_psn(qp, psn);
}
/*
 * Set qp->s_sending_psn to the next PSN after the given one.
 * This would be psn+1 except when RDMA reads or TID RDMA ops
 * are complete.
 */
static void reset_sending_psn(struct rvt_qp *qp, u32 psn)
{
	struct rvt_swqe *wqe;
	u32 n = qp->s_last;

	lockdep_assert_held(&qp->s_lock);
	/* Find the work request corresponding to the given PSN. */
	for (;;) {
		wqe = rvt_get_swqe_ptr(qp, n);
		if (cmp_psn(psn, wqe->lpsn) <= 0) {
			if (wqe->wr.opcode == IB_WR_RDMA_READ ||
			    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
			    wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
				qp->s_sending_psn = wqe->lpsn + 1;
			else
				qp->s_sending_psn = psn + 1;
			break;
		}
		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail)
			break;
	}
}
/**
 * hfi1_rc_verbs_aborted - handle abort status
 * @qp: the QP
 * @opah: the opa header
 *
 * This code modifies both ACK bit in BTH[2]
 * and the s_flags to go into send one mode.
 *
 * This serves to throttle the send engine to only
 * send a single packet in the likely case the
 * a link has gone down.
 */
void hfi1_rc_verbs_aborted(struct rvt_qp *qp, struct hfi1_opa_header *opah)
{
	struct ib_other_headers *ohdr = hfi1_get_rc_ohdr(opah);
	u8 opcode = ib_bth_get_opcode(ohdr);
	u32 psn;

	/* ignore responses */
	if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	     opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
	    opcode == TID_OP(READ_RESP) ||
	    opcode == TID_OP(WRITE_RESP))
		return;

	psn = ib_bth_get_psn(ohdr) | IB_BTH_REQ_ACK;
	ohdr->bth[2] = cpu_to_be32(psn);
	qp->s_flags |= RVT_S_SEND_ONE;
}
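
/*
 * hfi1_rc_send_complete - handle a packet that has finished on the send
 * engine: reset the sending PSN, arm the retry/TID-retry timers when an
 * ACK was requested, and complete SWQEs whose last PSN has been sent.
 */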
/*
 * This should be called with the QP s_lock held and interrupts disabled.
 */
void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
{
	struct ib_other_headers *ohdr;
	struct hfi1_qp_priv *priv = qp->priv;
	struct rvt_swqe *wqe;
	u32 opcode, head, tail;
	u32 psn;
	struct tid_rdma_request *req;

	lockdep_assert_held(&qp->s_lock);
	if (!(ib_rvt_state_ops[qp->state] & RVT_SEND_OR_FLUSH_OR_RECV_OK))
		return;

	ohdr = hfi1_get_rc_ohdr(opah);
	opcode = ib_bth_get_opcode(ohdr);
	if ((opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	     opcode <= OP(ATOMIC_ACKNOWLEDGE)) ||
	    opcode == TID_OP(READ_RESP) ||
	    opcode == TID_OP(WRITE_RESP)) {
		WARN_ON(!qp->s_rdma_ack_cnt);
		qp->s_rdma_ack_cnt--;
		return;
	}

	psn = ib_bth_get_psn(ohdr);
	/*
	 * Don't attempt to reset the sending PSN for packets in the
	 * KDETH PSN space since the PSN does not match anything.
	 */
	if (opcode != TID_OP(WRITE_DATA) &&
	    opcode != TID_OP(WRITE_DATA_LAST) &&
	    opcode != TID_OP(ACK) && opcode != TID_OP(RESYNC))
		reset_sending_psn(qp, psn);

	/* Handle TID RDMA WRITE packets differently */
	if (opcode >= TID_OP(WRITE_REQ) &&
	    opcode <= TID_OP(WRITE_DATA_LAST)) {
		head = priv->s_tid_head;
		tail = priv->s_tid_cur;
		/*
		 * s_tid_cur is set to s_tid_head in the case, where
		 * a new TID RDMA request is being started and all
		 * previous ones have been completed.
		 * Therefore, we need to do a secondary check in order
		 * to properly determine whether we should start the
		 * TID timer.
		 */
		wqe = rvt_get_swqe_ptr(qp, tail);
		req = wqe_to_tid_req(wqe);
		if (head == tail && req->comp_seg < req->total_segs) {
			if (tail == 0)
				tail = qp->s_size - 1;
			else
				tail -= 1;
		}
	} else {
		head = qp->s_tail;
		tail = qp->s_acked;
	}

	/*
	 * Start timer after a packet requesting an ACK has been sent and
	 * there are still requests that haven't been acked.
	 */
	if ((psn & IB_BTH_REQ_ACK) && tail != head &&
	    opcode != TID_OP(WRITE_DATA) && opcode != TID_OP(WRITE_DATA_LAST) &&
	    opcode != TID_OP(RESYNC) &&
	    !(qp->s_flags &
	      (RVT_S_TIMER | RVT_S_WAIT_RNR | RVT_S_WAIT_PSN)) &&
	    (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
		if (opcode == TID_OP(READ_REQ))
			rvt_add_retry_timer_ext(qp, priv->timeout_shift);
		else
			rvt_add_retry_timer(qp);
	}

	/* Start TID RDMA ACK timer */
	if ((opcode == TID_OP(WRITE_DATA) ||
	     opcode == TID_OP(WRITE_DATA_LAST) ||
	     opcode == TID_OP(RESYNC)) &&
	    (psn & IB_BTH_REQ_ACK) &&
	    !(priv->s_flags & HFI1_S_TID_RETRY_TIMER) &&
	    (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
		/*
		 * The TID RDMA ACK packet could be received before this
		 * function is called. Therefore, add the timer only if TID
		 * RDMA ACK packets are actually pending.
		 */
		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
		req = wqe_to_tid_req(wqe);
		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
		    req->ack_seg < req->cur_seg)
			hfi1_add_tid_retry_timer(qp);
	}

	while (qp->s_last != qp->s_acked) {
		wqe = rvt_get_swqe_ptr(qp, qp->s_last);
		if (cmp_psn(wqe->lpsn, qp->s_sending_psn) >= 0 &&
		    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
			break;
		trdma_clean_swqe(qp, wqe);
		trace_hfi1_qp_send_completion(qp, wqe, qp->s_last);
		rvt_qp_complete_swqe(qp,
				     wqe,
				     ib_hfi1_wc_opcode[wqe->wr.opcode],
				     IB_WC_SUCCESS);
	}
	/*
	 * If we were waiting for sends to complete before re-sending,
	 * and they are now complete, restart sending.
	 */
	trace_hfi1_sendcomplete(qp, psn);
	if (qp->s_flags & RVT_S_WAIT_PSN &&
	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
		qp->s_flags &= ~RVT_S_WAIT_PSN;
		qp->s_sending_psn = qp->s_psn;
		qp->s_sending_hpsn = qp->s_psn - 1;
		hfi1_schedule_send(qp);
	}
}
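
/* Record the last (highest) PSN that has been ACKed so far. */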
static inline void update_last_psn(struct rvt_qp *qp, u32 psn)
{
	qp->s_last_psn = psn;
}
/*
 * Generate a SWQE completion.
 * This is similar to hfi1_send_complete but has to check to be sure
 * that the SGEs are not being referenced if the SWQE is being resent.
 */
struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
				  struct rvt_swqe *wqe,
				  struct hfi1_ibport *ibp)
{
	struct hfi1_qp_priv *priv = qp->priv;

	lockdep_assert_held(&qp->s_lock);
	/*
	 * Don't decrement refcount and don't generate a
	 * completion if the SWQE is being resent until the send
	 * is finished.
	 */
	trace_hfi1_rc_completion(qp, wqe->lpsn);
	if (cmp_psn(wqe->lpsn, qp->s_sending_psn) < 0 ||
	    cmp_psn(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
		trdma_clean_swqe(qp, wqe);
		trace_hfi1_qp_send_completion(qp, wqe, qp->s_last);
		rvt_qp_complete_swqe(qp,
				     wqe,
				     ib_hfi1_wc_opcode[wqe->wr.opcode],
				     IB_WC_SUCCESS);
	} else {
		struct hfi1_pportdata *ppd = ppd_from_ibp(ibp);

		this_cpu_inc(*ibp->rvp.rc_delayed_comp);
		/*
		 * If send progress not running attempt to progress
		 * SDMA queue.
		 */
		if (ppd->dd->flags & HFI1_HAS_SEND_DMA) {
			struct sdma_engine *engine;
			u8 sl = rdma_ah_get_sl(&qp->remote_ah_attr);
			u8 sc5;

			/* For now use sc to find engine */
			sc5 = ibp->sl_to_sc[sl];
			engine = qp_to_sdma_engine(qp, sc5);
			sdma_engine_progress_schedule(engine);
		}
	}

	qp->s_retry = qp->s_retry_cnt;
	/*
	 * Don't update the last PSN if the request being completed is
	 * a TID RDMA WRITE request.
	 * Completion of the TID RDMA WRITE requests are done by the
	 * TID RDMA ACKs and as such could be for a request that has
	 * already been ACKed as far as the IB state machine is
	 * concerned.
	 */
	if (wqe->wr.opcode != IB_WR_TID_RDMA_WRITE)
		update_last_psn(qp, wqe->lpsn);

	/*
	 * If we are completing a request which is in the process of
	 * being resent, we can stop re-sending it since we know the
	 * responder has already seen it.
	 */
	if (qp->s_acked == qp->s_cur) {
		if (++qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		qp->s_acked = qp->s_cur;
		wqe = rvt_get_swqe_ptr(qp, qp->s_cur);
		if (qp->s_acked != qp->s_tail) {
			qp->s_state = OP(SEND_LAST);
			qp->s_psn = wqe->psn;
		}
	} else {
		if (++qp->s_acked >= qp->s_size)
			qp->s_acked = 0;
		if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
			qp->s_draining = 0;
		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
	}
	if (priv->s_flags & HFI1_S_TID_WAIT_INTERLCK) {
		priv->s_flags &= ~HFI1_S_TID_WAIT_INTERLCK;
		hfi1_schedule_send(qp);
	}
	return wqe;
}
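
/*
 * set_restart_qp - queue the QP to retry from s_last_psn + 1 and put it on
 * the receive context's wait list so the restart is processed later.
 */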
static void set_restart_qp(struct rvt_qp *qp, struct hfi1_ctxtdata *rcd)
{
	/* Retry this request. */
	if (!(qp->r_flags & RVT_R_RDMAR_SEQ)) {
		qp->r_flags |= RVT_R_RDMAR_SEQ;
		hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
		if (list_empty(&qp->rspwait)) {
			qp->r_flags |= RVT_R_RSP_SEND;
			rvt_get_qp(qp);
			list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
		}
	}
}
/**
 * update_qp_retry_state - Update qp retry state.
 * @qp: the QP
 * @psn: the packet sequence number of the TID RDMA WRITE RESP.
 * @spsn: The start psn for the given TID RDMA WRITE swqe.
 * @lpsn: The last psn for the given TID RDMA WRITE swqe.
 *
 * This function is called to update the qp retry state upon
 * receiving a TID WRITE RESP after the qp is scheduled to retry
 * a request.
 */
static void update_qp_retry_state(struct rvt_qp *qp, u32 psn, u32 spsn,
				  u32 lpsn)
{
	struct hfi1_qp_priv *qpriv = qp->priv;

	qp->s_psn = psn + 1;
	/*
	 * If this is the first TID RDMA WRITE RESP packet for the current
	 * request, change the s_state so that the retry will be processed
	 * correctly. Similarly, if this is the last TID RDMA WRITE RESP
	 * packet, change the s_state and advance the s_cur.
	 */
	if (cmp_psn(psn, lpsn) >= 0) {
		qp->s_cur = qpriv->s_tid_cur + 1;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		qp->s_state = TID_OP(WRITE_REQ);
	} else if (!cmp_psn(psn, spsn)) {
		qp->s_cur = qpriv->s_tid_cur;
		qp->s_state = TID_OP(WRITE_RESP);
	}
}
1996 * do_rc_ack - process an incoming RC ACK
1997 * @qp: the QP the ACK came in on
1998 * @psn: the packet sequence number of the ACK
1999 * @opcode: the opcode of the request that resulted in the ACK
2001 * This is called from rc_rcv_resp() to process an incoming RC ACK
2003 * May be called at interrupt level, with the QP s_lock held.
2004 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode,
	      u64 val, struct hfi1_ctxtdata *rcd)
{
	struct hfi1_ibport *ibp;
	enum ib_wc_status status;
	struct hfi1_qp_priv *qpriv = qp->priv;
	struct rvt_swqe *wqe;
	int ret = 0;
	u32 ack_psn;
	int diff;
	struct rvt_dev_info *rdi;

	lockdep_assert_held(&qp->s_lock);
	/*
	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
	 * requests and implicitly NAK RDMA read and atomic requests issued
	 * before the NAK'ed request.  The MSN won't include the NAK'ed
	 * request but will include an ACK'ed request(s).
	 */
	ack_psn = psn;
	if (aeth >> IB_AETH_NAK_SHIFT)
		ack_psn--;
	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
	ibp = rcd_to_iport(rcd);

	/*
	 * The MSN might be for a later WQE than the PSN indicates so
	 * only complete WQEs that the PSN finishes.
	 */
	while ((diff = delta_psn(ack_psn, wqe->lpsn)) >= 0) {
		/*
		 * RDMA_READ_RESPONSE_ONLY is a special case since
		 * we want to generate completion events for everything
		 * before the RDMA read, copy the data, then generate
		 * the completion for the read.
		 */
		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
		    diff == 0) {
			ret = 1;
			goto bail_stop;
		}
		/*
		 * If this request is a RDMA read or atomic, and the ACK is
		 * for a later operation, this ACK NAKs the RDMA read or
		 * atomic. In other words, only a RDMA_READ_LAST or ONLY
		 * can ACK a RDMA read and likewise for atomic ops. Note
		 * that the NAK case can only happen if relaxed ordering is
		 * used and requests are sent after an RDMA read or atomic
		 * is sent but before the response is received.
		 */
		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
		    (wqe->wr.opcode == IB_WR_TID_RDMA_READ &&
		     (opcode != TID_OP(READ_RESP) || diff != 0)) ||
		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0)) ||
		    (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
		     (delta_psn(psn, qp->s_last_psn) != 1))) {
			set_restart_qp(qp, rcd);
			/*
			 * No need to process the ACK/NAK since we are
			 * restarting an earlier request.
			 */
			goto bail_stop;
		}
		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
			u64 *vaddr = wqe->sg_list[0].vaddr;
			*vaddr = val;
		}
		if (wqe->wr.opcode == IB_WR_OPFN)
			opfn_conn_reply(qp, val);

		if (qp->s_num_rd_atomic &&
		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
			qp->s_num_rd_atomic--;
			/* Restart sending task if fence is complete */
			if ((qp->s_flags & RVT_S_WAIT_FENCE) &&
			    !qp->s_num_rd_atomic) {
				qp->s_flags &= ~(RVT_S_WAIT_FENCE |
						 RVT_S_WAIT_ACK);
				hfi1_schedule_send(qp);
			} else if (qp->s_flags & RVT_S_WAIT_RDMAR) {
				qp->s_flags &= ~(RVT_S_WAIT_RDMAR |
						 RVT_S_WAIT_ACK);
				hfi1_schedule_send(qp);
			}
		}

		/*
		 * TID RDMA WRITE requests will be completed by the TID RDMA
		 * ACK packet handler (see tid_rdma.c).
		 */
		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE)
			break;

		wqe = do_rc_completion(qp, wqe, ibp);
		if (qp->s_acked == qp->s_tail)
			break;
	}

	trace_hfi1_rc_ack_do(qp, aeth, psn, wqe);
	trace_hfi1_sender_do_rc_ack(qp);
	switch (aeth >> IB_AETH_NAK_SHIFT) {
	case 0:		/* ACK */
		this_cpu_inc(*ibp->rvp.rc_acks);
		if (wqe->wr.opcode == IB_WR_TID_RDMA_READ) {
			if (wqe_to_tid_req(wqe)->ack_pending)
				rvt_mod_retry_timer_ext(qp,
							qpriv->timeout_shift);
			else
				rvt_stop_rc_timers(qp);
		} else if (qp->s_acked != qp->s_tail) {
			struct rvt_swqe *__w = NULL;

			if (qpriv->s_tid_cur != HFI1_QP_WQE_INVALID)
				__w = rvt_get_swqe_ptr(qp, qpriv->s_tid_cur);

			/*
			 * Stop timers if we've received all of the TID RDMA
			 * WRITE responses.
			 */
			if (__w && __w->wr.opcode == IB_WR_TID_RDMA_WRITE &&
			    opcode == TID_OP(WRITE_RESP)) {
				/*
				 * Normally, the loop above would correctly
				 * process all WQEs from s_acked onward and
				 * either complete them or check for correct
				 * PSN sequencing.
				 * However, for TID RDMA, due to pipelining,
				 * the response may not be for the request at
				 * s_acked so the above loop would just be
				 * skipped. This does not allow for checking
				 * the PSN sequencing. It has to be done
				 * separately.
				 */
				if (cmp_psn(psn, qp->s_last_psn + 1)) {
					set_restart_qp(qp, rcd);
					goto bail_stop;
				}
				/*
				 * If the psn is being resent, stop the
				 * resending.
				 */
				if (qp->s_cur != qp->s_tail &&
				    cmp_psn(qp->s_psn, psn) <= 0)
					update_qp_retry_state(qp, psn,
							      __w->psn,
							      __w->lpsn);
				else if (--qpriv->pending_tid_w_resp)
					rvt_mod_retry_timer(qp);
				else
					rvt_stop_rc_timers(qp);
			} else {
				/*
				 * We are expecting more ACKs so
				 * mod the retry timer.
				 */
				rvt_mod_retry_timer(qp);
				/*
				 * We can stop re-sending the earlier packets
				 * and continue with the next packet the
				 * receiver wants.
				 */
				if (cmp_psn(qp->s_psn, psn) <= 0)
					reset_psn(qp, psn + 1);
			}
		} else {
			/* No more acks - kill all timers */
			rvt_stop_rc_timers(qp);
			if (cmp_psn(qp->s_psn, psn) <= 0) {
				qp->s_state = OP(SEND_LAST);
				qp->s_psn = psn + 1;
			}
		}
		if (qp->s_flags & RVT_S_WAIT_ACK) {
			qp->s_flags &= ~RVT_S_WAIT_ACK;
			hfi1_schedule_send(qp);
		}
		rvt_get_credit(qp, aeth);
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		qp->s_retry = qp->s_retry_cnt;
		/*
		 * If the current request is a TID RDMA WRITE request and the
		 * response is not a TID RDMA WRITE RESP packet, s_last_psn
		 * can't be advanced.
		 */
		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE &&
		    opcode != TID_OP(WRITE_RESP) &&
		    cmp_psn(psn, wqe->psn) >= 0)
			return 1;
		update_last_psn(qp, psn);
		return 1;

	case 1:		/* RNR NAK */
		ibp->rvp.n_rnr_naks++;
		if (qp->s_acked == qp->s_tail)
			goto bail_stop;
		if (qp->s_flags & RVT_S_WAIT_RNR)
			goto bail_stop;
		rdi = ib_to_rvt(qp->ibqp.device);
		if (!(rdi->post_parms[wqe->wr.opcode].flags &
		      RVT_OPERATION_IGN_RNR_CNT)) {
			if (qp->s_rnr_retry == 0) {
				status = IB_WC_RNR_RETRY_EXC_ERR;
				goto class_b;
			}
			if (qp->s_rnr_retry_cnt < 7 && qp->s_rnr_retry_cnt > 0)
				qp->s_rnr_retry--;
		}

		/*
		 * The last valid PSN is the previous PSN. For TID RDMA WRITE
		 * request, s_last_psn should be incremented only when a TID
		 * RDMA WRITE RESP is received to avoid skipping lost TID RDMA
		 * WRITE RESP packets.
		 */
		if (wqe->wr.opcode == IB_WR_TID_RDMA_WRITE) {
			reset_psn(qp, qp->s_last_psn + 1);
		} else {
			update_last_psn(qp, psn - 1);
			reset_psn(qp, psn);
		}

		ibp->rvp.n_rc_resends += delta_psn(qp->s_psn, psn);
		qp->s_flags &= ~(RVT_S_WAIT_SSN_CREDIT | RVT_S_WAIT_ACK);
		rvt_stop_rc_timers(qp);
		rvt_add_rnr_timer(qp, aeth);
		return 0;

	case 3:		/* NAK */
		if (qp->s_acked == qp->s_tail)
			goto bail_stop;
		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);
		switch ((aeth >> IB_AETH_CREDIT_SHIFT) &
			IB_AETH_CREDIT_MASK) {
		case 0: /* PSN sequence error */
			ibp->rvp.n_seq_naks++;
			/*
			 * Back up to the responder's expected PSN.
			 * Note that we might get a NAK in the middle of an
			 * RDMA READ response which terminates the RDMA
			 * READ.
			 */
			hfi1_restart_rc(qp, psn, 0);
			hfi1_schedule_send(qp);
			break;

		case 1: /* Invalid Request */
			status = IB_WC_REM_INV_REQ_ERR;
			ibp->rvp.n_other_naks++;
			goto class_b;

		case 2: /* Remote Access Error */
			status = IB_WC_REM_ACCESS_ERR;
			ibp->rvp.n_other_naks++;
			goto class_b;

		case 3: /* Remote Operation Error */
			status = IB_WC_REM_OP_ERR;
			ibp->rvp.n_other_naks++;
class_b:
			if (qp->s_last == qp->s_acked) {
				if (wqe->wr.opcode == IB_WR_TID_RDMA_READ)
					hfi1_kern_read_tid_flow_free(qp);

				hfi1_trdma_send_complete(qp, wqe, status);
				rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
			}
			break;

		default:
			/* Ignore other reserved NAK error codes */
			goto reserved;
		}
		qp->s_retry = qp->s_retry_cnt;
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		goto bail_stop;

	default:		/* 2: reserved */
reserved:
		/* Ignore reserved NAK codes. */
		goto bail_stop;
	}
	/* cannot be reached */
bail_stop:
	rvt_stop_rc_timers(qp);
	return ret;
}
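/*
 * For reference when reading do_rc_ack() above: the top bits of the AETH
 * (aeth >> IB_AETH_NAK_SHIFT) select the response class handled by the
 * switch statement: 0 is a plain ACK, 1 is an RNR NAK, 3 is a NAK and 2 is
 * reserved.  For NAKs, the low syndrome bits
 * ((aeth >> IB_AETH_CREDIT_SHIFT) & IB_AETH_CREDIT_MASK) carry the NAK code
 * (PSN sequence error, invalid request, remote access error or remote
 * operation error); for ACKs the same field is the credit information
 * consumed by rvt_get_credit().
 */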
/*
 * We have seen an out of sequence RDMA read middle or last packet.
 * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
 */
static void rdma_seq_err(struct rvt_qp *qp, struct hfi1_ibport *ibp, u32 psn,
			 struct hfi1_ctxtdata *rcd)
{
	struct rvt_swqe *wqe;

	lockdep_assert_held(&qp->s_lock);
	/* Remove QP from retry timer */
	rvt_stop_rc_timers(qp);

	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);

	while (cmp_psn(psn, wqe->lpsn) > 0) {
		if (wqe->wr.opcode == IB_WR_RDMA_READ ||
		    wqe->wr.opcode == IB_WR_TID_RDMA_READ ||
		    wqe->wr.opcode == IB_WR_TID_RDMA_WRITE ||
		    wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
			break;
		wqe = do_rc_completion(qp, wqe, ibp);
	}

	ibp->rvp.n_rdma_seq++;
	qp->r_flags |= RVT_R_RDMAR_SEQ;
	hfi1_restart_rc(qp, qp->s_last_psn + 1, 0);
	if (list_empty(&qp->rspwait)) {
		qp->r_flags |= RVT_R_RSP_SEND;
		rvt_get_qp(qp);
		list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
	}
}
/**
 * rc_rcv_resp - process an incoming RC response packet
 * @packet: data packet information
 *
 * This is called from hfi1_rc_rcv() to process an incoming RC response
 * packet for the given QP.
 * Called at interrupt level.
 */
static void rc_rcv_resp(struct hfi1_packet *packet)
{
	struct hfi1_ctxtdata *rcd = packet->rcd;
	void *data = packet->payload;
	u32 tlen = packet->tlen;
	struct rvt_qp *qp = packet->qp;
	struct hfi1_ibport *ibp;
	struct ib_other_headers *ohdr = packet->ohdr;
	struct rvt_swqe *wqe;
	enum ib_wc_status status;
	unsigned long flags;
	int diff;
	u64 val;
	u32 aeth;
	u32 psn = ib_bth_get_psn(packet->ohdr);
	u32 pmtu = qp->pmtu;
	u16 hdrsize = packet->hlen;
	u8 opcode = packet->opcode;
	u8 pad = packet->pad;
	u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);

	spin_lock_irqsave(&qp->s_lock, flags);
	trace_hfi1_ack(qp, psn);

	/* Ignore invalid responses. */
	if (cmp_psn(psn, READ_ONCE(qp->s_next_psn)) >= 0)
		goto ack_done;

	/* Ignore duplicate responses. */
	diff = cmp_psn(psn, qp->s_last_psn);
	if (unlikely(diff <= 0)) {
		/* Update credits for "ghost" ACKs */
		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
			aeth = be32_to_cpu(ohdr->u.aeth);
			if ((aeth >> IB_AETH_NAK_SHIFT) == 0)
				rvt_get_credit(qp, aeth);
		}
		goto ack_done;
	}

	/*
	 * Skip everything other than the PSN we expect, if we are waiting
	 * for a reply to a restarted RDMA read or atomic op.
	 */
	if (qp->r_flags & RVT_R_RDMAR_SEQ) {
		if (cmp_psn(psn, qp->s_last_psn + 1) != 0)
			goto ack_done;
		qp->r_flags &= ~RVT_R_RDMAR_SEQ;
	}

	if (unlikely(qp->s_acked == qp->s_tail))
		goto ack_done;
	wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
	status = IB_WC_SUCCESS;

	switch (opcode) {
	case OP(ACKNOWLEDGE):
	case OP(ATOMIC_ACKNOWLEDGE):
	case OP(RDMA_READ_RESPONSE_FIRST):
		aeth = be32_to_cpu(ohdr->u.aeth);
		if (opcode == OP(ATOMIC_ACKNOWLEDGE))
			val = ib_u64_get(&ohdr->u.at.atomic_ack_eth);
		else
			val = 0;
		if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
			goto ack_done;
		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_middle;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/* no AETH, no ACK */
		if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
			goto ack_seq_err;
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
read_middle:
		if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
			goto ack_len_err;
		if (unlikely(pmtu >= qp->s_rdma_read_len))
			goto ack_len_err;

		/*
		 * We got a response so update the timeout.
		 * 4.096 usec. * (1 << qp->timeout)
		 */
		rvt_mod_retry_timer(qp);
		if (qp->s_flags & RVT_S_WAIT_ACK) {
			qp->s_flags &= ~RVT_S_WAIT_ACK;
			hfi1_schedule_send(qp);
		}

		if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
			qp->s_retry = qp->s_retry_cnt;

		/*
		 * Update the RDMA receive state but do the copy w/o
		 * holding the locks and blocking interrupts.
		 */
		qp->s_rdma_read_len -= pmtu;
		update_last_psn(qp, psn);
		spin_unlock_irqrestore(&qp->s_lock, flags);
		rvt_copy_sge(qp, &qp->s_rdma_read_sge,
			     data, pmtu, false, false);
		goto bail;

	case OP(RDMA_READ_RESPONSE_ONLY):
		aeth = be32_to_cpu(ohdr->u.aeth);
		if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
			goto ack_done;
		/*
		 * Check that the data size is >= 0 && <= pmtu.
		 * Remember to account for ICRC (4).
		 */
		if (unlikely(tlen < (hdrsize + extra_bytes)))
			goto ack_len_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		wqe = rvt_get_swqe_ptr(qp, qp->s_acked);
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_last;

	case OP(RDMA_READ_RESPONSE_LAST):
		/* ACKs READ req. */
		if (unlikely(cmp_psn(psn, qp->s_last_psn + 1)))
			goto ack_seq_err;
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/*
		 * Check that the data size is >= 1 && <= pmtu.
		 * Remember to account for ICRC (4).
		 */
		if (unlikely(tlen <= (hdrsize + extra_bytes)))
			goto ack_len_err;
read_last:
		tlen -= hdrsize + extra_bytes;
		if (unlikely(tlen != qp->s_rdma_read_len))
			goto ack_len_err;
		aeth = be32_to_cpu(ohdr->u.aeth);
		rvt_copy_sge(qp, &qp->s_rdma_read_sge,
			     data, tlen, false, false);
		WARN_ON(qp->s_rdma_read_sge.num_sge);
		(void)do_rc_ack(qp, aeth, psn,
				OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
		goto ack_done;
	}

ack_op_err:
	status = IB_WC_LOC_QP_OP_ERR;
	goto ack_err;

ack_seq_err:
	ibp = rcd_to_iport(rcd);
	rdma_seq_err(qp, ibp, psn, rcd);
	goto ack_done;

ack_len_err:
	status = IB_WC_LOC_LEN_ERR;
ack_err:
	if (qp->s_last == qp->s_acked) {
		rvt_send_complete(qp, wqe, status);
		rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR);
	}
ack_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
bail:
	return;
}
static inline void rc_cancel_ack(struct rvt_qp *qp)
{
	qp->r_adefered = 0;
	if (list_empty(&qp->rspwait))
		return;
	list_del_init(&qp->rspwait);
	qp->r_flags &= ~RVT_R_RSP_NAK;
	rvt_put_qp(qp);
}
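/*
 * rc_cancel_ack() is used when an explicit ACK is about to be sent anyway:
 * it clears the deferred-ACK count and removes the QP from the deferred
 * response list so a previously queued response is not also sent.
 */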
/**
 * rc_rcv_error - process an incoming duplicate or error RC packet
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @diff: the difference between the PSN and the expected PSN
 * @rcd: the receive context
 *
 * This is called from hfi1_rc_rcv() to process an unexpected
 * incoming RC packet for the given QP.
 * Called at interrupt level.
 * Return 1 if no more processing is needed; otherwise return 0 to
 * schedule a response to be sent.
 */
static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
				 struct rvt_qp *qp, u32 opcode, u32 psn,
				 int diff, struct hfi1_ctxtdata *rcd)
{
	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
	struct rvt_ack_entry *e;
	unsigned long flags;
	u8 prev;
	u8 mra; /* most recent ACK */
	bool old_req;

	trace_hfi1_rcv_error(qp, psn);
	if (diff > 0) {
		/*
		 * Packet sequence error.
		 * A NAK will ACK earlier sends and RDMA writes.
		 * Don't queue the NAK if we already sent one.
		 */
		if (!qp->r_nak_state) {
			ibp->rvp.n_rc_seqnak++;
			qp->r_nak_state = IB_NAK_PSN_ERROR;
			/* Use the expected PSN. */
			qp->r_ack_psn = qp->r_psn;
			/*
			 * Wait to send the sequence NAK until all packets
			 * in the receive queue have been processed.
			 * Otherwise, we end up propagating congestion.
			 */
			rc_defered_ack(rcd, qp);
		}
		goto done;
	}

	/*
	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
	 * write or atomic op.  Don't NAK errors, just silently drop
	 * the duplicate request.  Note that r_sge, r_len, and
	 * r_rcv_len may be in use so don't modify them.
	 *
	 * We are supposed to ACK the earliest duplicate PSN but we
	 * can coalesce an outstanding duplicate ACK.  We have to
	 * send the earliest so that RDMA reads can be restarted at
	 * the requester's expected PSN.
	 *
	 * First, find where this duplicate PSN falls within the
	 * ACKs previously sent.
	 * old_req is true if there is an older response that is scheduled
	 * to be sent before sending this one.
	 */
	ibp->rvp.n_rc_dupreq++;

	spin_lock_irqsave(&qp->s_lock, flags);

	e = find_prev_entry(qp, psn, &prev, &mra, &old_req);

	switch (opcode) {
	case OP(RDMA_READ_REQUEST): {
		struct ib_reth *reth;
		u32 offset;
		u32 len;

		/*
		 * If we didn't find the RDMA read request in the ack queue,
		 * we can ignore this request.
		 */
		if (!e || e->opcode != OP(RDMA_READ_REQUEST))
			goto unlock_done;
		/* RETH comes after BTH */
		reth = &ohdr->u.rc.reth;
		/*
		 * Address range must be a subset of the original
		 * request and start on pmtu boundaries.
		 * We reuse the old ack_queue slot since the requester
		 * should not back up and request an earlier PSN for the
		 * same request.
		 */
		offset = delta_psn(psn, e->psn) * qp->pmtu;
		len = be32_to_cpu(reth->length);
		if (unlikely(offset + len != e->rdma_sge.sge_length))
			goto unlock_done;
		release_rdma_sge_mr(e);
		if (len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = get_ib_reth_vaddr(reth);
			int ok;

			ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
					 IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto unlock_done;
		} else {
			e->rdma_sge.vaddr = NULL;
			e->rdma_sge.length = 0;
			e->rdma_sge.sge_length = 0;
		}
		e->psn = psn;
		if (old_req)
			goto unlock_done;
		if (qp->s_acked_ack_queue == qp->s_tail_ack_queue)
			qp->s_acked_ack_queue = prev;
		qp->s_tail_ack_queue = prev;
		break;
	}

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		/*
		 * If we didn't find the atomic request in the ack queue
		 * or the send engine is already backed up to send an
		 * earlier entry, we can ignore this request.
		 */
		if (!e || e->opcode != (u8)opcode || old_req)
			goto unlock_done;
		if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
			qp->s_acked_ack_queue = prev;
		qp->s_tail_ack_queue = prev;
		break;
	}

	default:
		/*
		 * Ignore this operation if it doesn't request an ACK
		 * or an earlier RDMA read or atomic is going to be resent.
		 */
		if (!(psn & IB_BTH_REQ_ACK) || old_req)
			goto unlock_done;
		/*
		 * Resend the most recent ACK if this request is
		 * after all the previous RDMA reads and atomics.
		 */
		if (mra == qp->r_head_ack_queue) {
			spin_unlock_irqrestore(&qp->s_lock, flags);
			qp->r_nak_state = 0;
			qp->r_ack_psn = qp->r_psn - 1;
			goto send_ack;
		}

		/*
		 * Resend the RDMA read or atomic op which
		 * ACKs this duplicate request.
		 */
		if (qp->s_tail_ack_queue == qp->s_acked_ack_queue)
			qp->s_acked_ack_queue = mra;
		qp->s_tail_ack_queue = mra;
		break;
	}
	qp->s_ack_state = OP(ACKNOWLEDGE);
	qp->s_flags |= RVT_S_RESP_PENDING;
	qp->r_nak_state = 0;
	hfi1_schedule_send(qp);

unlock_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
done:
	return 1;

send_ack:
	return 0;
}
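/*
 * For duplicate requests, find_prev_entry() locates the ack queue entry
 * that already covers the duplicate PSN (returned as e, at index prev) and
 * reports the most recent ACK slot via mra; rc_rcv_error() then either
 * replays that entry for RDMA read/atomic duplicates or re-sends the most
 * recent ACK, while old_req suppresses the replay when an older response is
 * still scheduled to go out first.
 */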
static void log_cca_event(struct hfi1_pportdata *ppd, u8 sl, u32 rlid,
			  u32 lqpn, u32 rqpn, u8 svc_type)
{
	struct opa_hfi1_cong_log_event_internal *cc_event;
	unsigned long flags;

	if (sl >= OPA_MAX_SLS)
		return;

	spin_lock_irqsave(&ppd->cc_log_lock, flags);

	ppd->threshold_cong_event_map[sl / 8] |= 1 << (sl % 8);
	ppd->threshold_event_counter++;

	cc_event = &ppd->cc_events[ppd->cc_log_idx++];
	if (ppd->cc_log_idx == OPA_CONG_LOG_ELEMS)
		ppd->cc_log_idx = 0;
	cc_event->lqpn = lqpn & RVT_QPN_MASK;
	cc_event->rqpn = rqpn & RVT_QPN_MASK;
	cc_event->sl = sl;
	cc_event->svc_type = svc_type;
	cc_event->rlid = rlid;
	/* keep timestamp in units of 1.024 usec */
	cc_event->timestamp = ktime_get_ns() / 1024;

	spin_unlock_irqrestore(&ppd->cc_log_lock, flags);
}
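/*
 * cc_events[] is a small circular log of OPA_CONG_LOG_ELEMS entries; the
 * timestamp is kept in 1.024 usec ticks (ktime_get_ns() / 1024), the same
 * unit used for the other OPA congestion-control time values such as
 * ccti_timer below.
 */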
void process_becn(struct hfi1_pportdata *ppd, u8 sl, u32 rlid, u32 lqpn,
		  u32 rqpn, u8 svc_type)
{
	struct cca_timer *cca_timer;
	u16 ccti, ccti_incr, ccti_timer, ccti_limit;
	u8 trigger_threshold;
	struct cc_state *cc_state;
	unsigned long flags;

	if (sl >= OPA_MAX_SLS)
		return;

	cc_state = get_cc_state(ppd);
	if (!cc_state)
		return;

	/*
	 * 1) increase CCTI (for this SL)
	 * 2) select IPG (i.e., call set_link_ipg())
	 * 3) start timer
	 */
	ccti_limit = cc_state->cct.ccti_limit;
	ccti_incr = cc_state->cong_setting.entries[sl].ccti_increase;
	ccti_timer = cc_state->cong_setting.entries[sl].ccti_timer;
	trigger_threshold =
		cc_state->cong_setting.entries[sl].trigger_threshold;

	spin_lock_irqsave(&ppd->cca_timer_lock, flags);

	cca_timer = &ppd->cca_timer[sl];
	if (cca_timer->ccti < ccti_limit) {
		if (cca_timer->ccti + ccti_incr <= ccti_limit)
			cca_timer->ccti += ccti_incr;
		else
			cca_timer->ccti = ccti_limit;
		set_link_ipg(ppd);
	}

	ccti = cca_timer->ccti;

	if (!hrtimer_active(&cca_timer->hrtimer)) {
		/* ccti_timer is in units of 1.024 usec */
		unsigned long nsec = 1024 * ccti_timer;

		hrtimer_start(&cca_timer->hrtimer, ns_to_ktime(nsec),
			      HRTIMER_MODE_REL_PINNED);
	}

	spin_unlock_irqrestore(&ppd->cca_timer_lock, flags);

	if ((trigger_threshold != 0) && (ccti >= trigger_threshold))
		log_cca_event(ppd, sl, rlid, lqpn, rqpn, svc_type);
}
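/*
 * Worked example for the CCTI update above (illustrative values): with
 * ccti_increase = 4 and ccti_limit = 100, each BECN raises cca_timer->ccti
 * by 4 until it saturates at 100.  The CCA hrtimer is armed for
 * ccti_timer * 1.024 usec, and once ccti reaches the SL's trigger_threshold
 * the event is also recorded via log_cca_event().
 */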
/**
 * hfi1_rc_rcv - process an incoming RC packet
 * @packet: data packet information
 *
 * This is called from qp_rcv() to process an incoming RC packet
 * for the given QP.
 * May be called at interrupt level.
 */
void hfi1_rc_rcv(struct hfi1_packet *packet)
{
	struct hfi1_ctxtdata *rcd = packet->rcd;
	void *data = packet->payload;
	u32 tlen = packet->tlen;
	struct rvt_qp *qp = packet->qp;
	struct hfi1_qp_priv *qpriv = qp->priv;
	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
	struct ib_other_headers *ohdr = packet->ohdr;
	u32 opcode = packet->opcode;
	u32 hdrsize = packet->hlen;
	u32 psn = ib_bth_get_psn(packet->ohdr);
	u32 pad = packet->pad;
	struct ib_wc wc;
	u32 pmtu = qp->pmtu;
	int diff;
	struct ib_reth *reth;
	unsigned long flags;
	int ret;
	bool copy_last = false, fecn;
	u32 rkey;
	u8 extra_bytes = pad + packet->extra_byte + (SIZE_OF_CRC << 2);

	lockdep_assert_held(&qp->r_lock);

	if (hfi1_ruc_check_hdr(ibp, packet))
		return;

	fecn = process_ecn(qp, packet);
	opfn_trigger_conn_request(qp, be32_to_cpu(ohdr->bth[1]));

	/*
	 * Process responses (ACKs) before anything else.  Note that the
	 * packet sequence number will be for something in the send work
	 * queue rather than the expected receive packet sequence number.
	 * In other words, this QP is the requester.
	 */
	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
		rc_rcv_resp(packet);
		return;
	}

	/* Compute 24 bits worth of difference. */
	diff = delta_psn(psn, qp->r_psn);
	if (unlikely(diff)) {
		if (rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
			return;
		goto send_ack;
	}

	/* Check for opcode sequence errors. */
	switch (qp->r_state) {
	case OP(SEND_FIRST):
	case OP(SEND_MIDDLE):
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
		    opcode == OP(SEND_LAST_WITH_INVALIDATE))
			break;
		goto nack_inv;

	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_MIDDLE):
		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			break;
		goto nack_inv;

	default:
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
		    opcode == OP(SEND_LAST_WITH_INVALIDATE) ||
		    opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			goto nack_inv;
		/*
		 * Note that it is up to the requester to not send a new
		 * RDMA read or atomic operation before receiving an ACK
		 * for the previous operation.
		 */
		break;
	}

	if (qp->state == IB_QPS_RTR && !(qp->r_flags & RVT_R_COMM_EST))
		rvt_comm_est(qp);

	/* OK, process the packet. */
	switch (opcode) {
	case OP(SEND_FIRST):
		ret = rvt_get_rwqe(qp, false);
		if (ret < 0)
			goto nack_op_err;
		if (!ret)
			goto rnr_nak;
		qp->r_rcv_len = 0;
		fallthrough;
	case OP(SEND_MIDDLE):
	case OP(RDMA_WRITE_MIDDLE):
send_middle:
		/* Check for invalid length PMTU or posted rwqe len. */
		/*
		 * There will be no padding for 9B packet but 16B packets
		 * will come in with some padding since we always add
		 * CRC and LT bytes which will need to be flit aligned
		 */
		if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
			goto nack_inv;
		qp->r_rcv_len += pmtu;
		if (unlikely(qp->r_rcv_len > qp->r_len))
			goto nack_inv;
		rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false);
		break;

	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
		/* consume RWQE */
		ret = rvt_get_rwqe(qp, true);
		if (ret < 0)
			goto nack_op_err;
		if (!ret)
			goto rnr_nak;
		goto send_last_imm;

	case OP(SEND_ONLY):
	case OP(SEND_ONLY_WITH_IMMEDIATE):
	case OP(SEND_ONLY_WITH_INVALIDATE):
		ret = rvt_get_rwqe(qp, false);
		if (ret < 0)
			goto nack_op_err;
		if (!ret)
			goto rnr_nak;
		qp->r_rcv_len = 0;
		if (opcode == OP(SEND_ONLY))
			goto no_immediate_data;
		if (opcode == OP(SEND_ONLY_WITH_INVALIDATE))
			goto send_last_inv;
		fallthrough;	/* for SEND_ONLY_WITH_IMMEDIATE */
	case OP(SEND_LAST_WITH_IMMEDIATE):
send_last_imm:
		wc.ex.imm_data = ohdr->u.imm_data;
		wc.wc_flags = IB_WC_WITH_IMM;
		goto send_last;
	case OP(SEND_LAST_WITH_INVALIDATE):
send_last_inv:
		rkey = be32_to_cpu(ohdr->u.ieth);
		if (rvt_invalidate_rkey(qp, rkey))
			goto no_immediate_data;
		wc.ex.invalidate_rkey = rkey;
		wc.wc_flags = IB_WC_WITH_INVALIDATE;
		goto send_last;
	case OP(RDMA_WRITE_LAST):
		copy_last = rvt_is_user_qp(qp);
		fallthrough;
	case OP(SEND_LAST):
no_immediate_data:
		wc.wc_flags = 0;
		wc.ex.imm_data = 0;
send_last:
		/* Check for invalid length. */
		/* LAST len should be >= 1 */
		if (unlikely(tlen < (hdrsize + extra_bytes)))
			goto nack_inv;
		/* Don't count the CRC (and padding and LT byte for 16B). */
		tlen -= (hdrsize + extra_bytes);
		wc.byte_len = tlen + qp->r_rcv_len;
		if (unlikely(wc.byte_len > qp->r_len))
			goto nack_inv;
		rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, copy_last);
		rvt_put_ss(&qp->r_sge);
		qp->r_msn++;
		if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
			break;
		wc.wr_id = qp->r_wr_id;
		wc.status = IB_WC_SUCCESS;
		if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
		    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
			wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
		else
			wc.opcode = IB_WC_RECV;
		wc.qp = &qp->ibqp;
		wc.src_qp = qp->remote_qpn;
		wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX;
		/*
		 * It seems that IB mandates the presence of an SL in a
		 * work completion only for the UD transport (see section
		 * 11.4.2 of IBTA Vol. 1).
		 *
		 * However, the way the SL is chosen below is consistent
		 * with the way that IB/qib works and is trying to avoid
		 * introducing incompatibilities.
		 *
		 * See also OPA Vol. 1, section 9.7.6, and table 9-17.
		 */
		wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr);
		/* zero fields that are N/A */
		wc.vendor_err = 0;
		wc.pkey_index = 0;
		wc.dlid_path_bits = 0;
		wc.port_num = 0;
		/* Signal completion event if the solicited bit is set. */
		rvt_recv_cq(qp, &wc, ib_bth_is_solicited(ohdr));
		break;

	case OP(RDMA_WRITE_ONLY):
		copy_last = rvt_is_user_qp(qp);
		fallthrough;
	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
			goto nack_inv;
		/* consume RWQE */
		reth = &ohdr->u.rc.reth;
		qp->r_len = be32_to_cpu(reth->length);
		qp->r_rcv_len = 0;
		qp->r_sge.sg_list = NULL;
		if (qp->r_len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = get_ib_reth_vaddr(reth);
			int ok;

			/* Check rkey & NAK */
			ok = rvt_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
					 rkey, IB_ACCESS_REMOTE_WRITE);
			if (unlikely(!ok))
				goto nack_acc;
			qp->r_sge.num_sge = 1;
		} else {
			qp->r_sge.num_sge = 0;
			qp->r_sge.sge.mr = NULL;
			qp->r_sge.sge.vaddr = NULL;
			qp->r_sge.sge.length = 0;
			qp->r_sge.sge.sge_length = 0;
		}
		if (opcode == OP(RDMA_WRITE_FIRST))
			goto send_middle;
		else if (opcode == OP(RDMA_WRITE_ONLY))
			goto no_immediate_data;
		ret = rvt_get_rwqe(qp, true);
		if (ret < 0)
			goto nack_op_err;
		if (!ret) {
			/* peer will send again */
			rvt_put_ss(&qp->r_sge);
			goto rnr_nak;
		}
		wc.ex.imm_data = ohdr->u.rc.imm_data;
		wc.wc_flags = IB_WC_WITH_IMM;
		goto send_last;

	case OP(RDMA_READ_REQUEST): {
		struct rvt_ack_entry *e;
		u32 len;
		u8 next;

		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
			goto nack_inv;
		next = qp->r_head_ack_queue + 1;
		/* s_ack_queue is size rvt_size_atomic()+1 so use > not >= */
		if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
			next = 0;
		spin_lock_irqsave(&qp->s_lock, flags);
		if (unlikely(next == qp->s_acked_ack_queue)) {
			if (!qp->s_ack_queue[next].sent)
				goto nack_inv_unlck;
			update_ack_queue(qp, next);
		}
		e = &qp->s_ack_queue[qp->r_head_ack_queue];
		release_rdma_sge_mr(e);
		reth = &ohdr->u.rc.reth;
		len = be32_to_cpu(reth->length);
		if (len) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = get_ib_reth_vaddr(reth);
			int ok;

			/* Check rkey & NAK */
			ok = rvt_rkey_ok(qp, &e->rdma_sge, len, vaddr,
					 rkey, IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto nack_acc_unlck;
			/*
			 * Update the next expected PSN.  We add 1 later
			 * below, so only add the remainder here.
			 */
			qp->r_psn += rvt_div_mtu(qp, len - 1);
		} else {
			e->rdma_sge.mr = NULL;
			e->rdma_sge.vaddr = NULL;
			e->rdma_sge.length = 0;
			e->rdma_sge.sge_length = 0;
		}
		e->opcode = opcode;
		e->sent = 0;
		e->psn = psn;
		e->lpsn = qp->r_psn;
		/*
		 * We need to increment the MSN here instead of when we
		 * finish sending the result since a duplicate request would
		 * increment it more than once.
		 */
		qp->r_msn++;
		qp->r_psn++;
		qp->r_state = opcode;
		qp->r_nak_state = 0;
		qp->r_head_ack_queue = next;
		qpriv->r_tid_alloc = qp->r_head_ack_queue;

		/* Schedule the send engine. */
		qp->s_flags |= RVT_S_RESP_PENDING;
		if (fecn)
			qp->s_flags |= RVT_S_ECN;
		hfi1_schedule_send(qp);

		spin_unlock_irqrestore(&qp->s_lock, flags);
		return;
	}

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		struct ib_atomic_eth *ateth = &ohdr->u.atomic_eth;
		u64 vaddr = get_ib_ateth_vaddr(ateth);
		bool opfn = opcode == OP(COMPARE_SWAP) &&
			vaddr == HFI1_VERBS_E_ATOMIC_VADDR;
		struct rvt_ack_entry *e;
		atomic64_t *maddr;
		u64 sdata;
		u32 rkey;
		u8 next;

		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
			     !opfn))
			goto nack_inv;
		next = qp->r_head_ack_queue + 1;
		if (next > rvt_size_atomic(ib_to_rvt(qp->ibqp.device)))
			next = 0;
		spin_lock_irqsave(&qp->s_lock, flags);
		if (unlikely(next == qp->s_acked_ack_queue)) {
			if (!qp->s_ack_queue[next].sent)
				goto nack_inv_unlck;
			update_ack_queue(qp, next);
		}
		e = &qp->s_ack_queue[qp->r_head_ack_queue];
		release_rdma_sge_mr(e);
		/* Process OPFN special virtual address */
		if (opfn) {
			opfn_conn_response(qp, e, ateth);
			goto ack;
		}
		if (unlikely(vaddr & (sizeof(u64) - 1)))
			goto nack_inv_unlck;
		rkey = be32_to_cpu(ateth->rkey);
		/* Check rkey & NAK */
		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
					  vaddr, rkey,
					  IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_acc_unlck;
		/* Perform atomic OP and save result. */
		maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
		sdata = get_ib_ateth_swap(ateth);
		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
			(u64)atomic64_add_return(sdata, maddr) - sdata :
			(u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
				     get_ib_ateth_compare(ateth),
				     sdata);
		rvt_put_mr(qp->r_sge.sge.mr);
		qp->r_sge.num_sge = 0;
ack:
		e->opcode = opcode;
		e->sent = 0;
		e->psn = psn;
		e->lpsn = psn;
		qp->r_msn++;
		qp->r_psn++;
		qp->r_state = opcode;
		qp->r_nak_state = 0;
		qp->r_head_ack_queue = next;
		qpriv->r_tid_alloc = qp->r_head_ack_queue;

		/* Schedule the send engine. */
		qp->s_flags |= RVT_S_RESP_PENDING;
		if (fecn)
			qp->s_flags |= RVT_S_ECN;
		hfi1_schedule_send(qp);

		spin_unlock_irqrestore(&qp->s_lock, flags);
		return;
	}

	default:
		/* NAK unknown opcodes. */
		goto nack_inv;
	}
	qp->r_psn++;
	qp->r_state = opcode;
	qp->r_ack_psn = psn;
	qp->r_nak_state = 0;
	/* Send an ACK if requested or required. */
	if (psn & IB_BTH_REQ_ACK || fecn) {
		if (packet->numpkt == 0 || fecn ||
		    qp->r_adefered >= HFI1_PSN_CREDIT) {
			rc_cancel_ack(qp);
			goto send_ack;
		}
		qp->r_adefered++;
		rc_defered_ack(rcd, qp);
	}
	return;

rnr_nak:
	qp->r_nak_state = qp->r_min_rnr_timer | IB_RNR_NAK;
	qp->r_ack_psn = qp->r_psn;
	/* Queue RNR NAK for later */
	rc_defered_ack(rcd, qp);
	return;

nack_op_err:
	rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
	qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
	qp->r_ack_psn = qp->r_psn;
	/* Queue NAK for later */
	rc_defered_ack(rcd, qp);
	return;

nack_inv_unlck:
	spin_unlock_irqrestore(&qp->s_lock, flags);
nack_inv:
	rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
	qp->r_nak_state = IB_NAK_INVALID_REQUEST;
	qp->r_ack_psn = qp->r_psn;
	/* Queue NAK for later */
	rc_defered_ack(rcd, qp);
	return;

nack_acc_unlck:
	spin_unlock_irqrestore(&qp->s_lock, flags);
nack_acc:
	rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
	qp->r_ack_psn = qp->r_psn;
send_ack:
	hfi1_send_rc_ack(packet, fecn);
}
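/*
 * ACK coalescing in hfi1_rc_rcv(): when a request asks for an ACK, the ACK
 * is normally deferred and r_adefered counts how many are pending; it is
 * only sent immediately when packet->numpkt is 0, a FECN was seen, or
 * HFI1_PSN_CREDIT deferred ACKs have accumulated.  Otherwise
 * rc_defered_ack() queues the response for later.
 */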
void hfi1_rc_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_packet *packet,
		    struct rvt_qp *qp)
{
	struct hfi1_ibport *ibp = rcd_to_iport(rcd);
	int diff;
	u32 opcode;
	u32 psn;

	if (hfi1_ruc_check_hdr(ibp, packet))
		return;

	psn = ib_bth_get_psn(packet->ohdr);
	opcode = ib_bth_get_opcode(packet->ohdr);

	/* Only deal with RDMA Writes for now */
	if (opcode < IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
		diff = delta_psn(psn, qp->r_psn);
		if (!qp->r_nak_state && diff >= 0) {
			ibp->rvp.n_rc_seqnak++;
			qp->r_nak_state = IB_NAK_PSN_ERROR;
			/* Use the expected PSN. */
			qp->r_ack_psn = qp->r_psn;
			/*
			 * Wait to send the sequence NAK until all packets
			 * in the receive queue have been processed.
			 * Otherwise, we end up propagating congestion.
			 */
			rc_defered_ack(rcd, qp);
		} /* Out of sequence NAK */
	} /* QP Request NAKs */
}