/*
 * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
 * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "ipath_verbs.h"
#include "ipath_kernel.h"

/* cut down ridiculously long IB macro names */
#define OP(x) IB_OPCODE_RC_##x
static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe,
		       u32 psn, u32 pmtu)
{
	u32 len;

	len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
	ss->sge = wqe->sg_list[0];
	ss->sg_list = wqe->sg_list + 1;
	ss->num_sge = wqe->wr.num_sge;
	ipath_skip_sge(ss, len);
	return wqe->length - len;
}
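/*
 * Worked example for restart_sge() above: if the restart PSN is three
 * packets past the WQE's first PSN and the path MTU is 2048 bytes,
 * ((psn - wqe->psn) & IPATH_PSN_MASK) is 3, so 6144 bytes are skipped
 * and the remaining length of the request is returned to the caller.
 */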
/**
 * ipath_init_restart - initialize the qp->s_sge after a restart
 * @qp: the QP whose SGE we're restarting
 * @wqe: the work queue entry to initialize the QP's SGE from
 *
 * The QP s_lock should be held and interrupts disabled.
 */
static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
{
	struct ipath_ibdev *dev;

	qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn,
				ib_mtu_enum_to_int(qp->path_mtu));
	dev = to_idev(qp->ibqp.device);
	spin_lock(&dev->pending_lock);
	if (list_empty(&qp->timerwait))
		list_add_tail(&qp->timerwait,
			      &dev->pending[dev->pending_index]);
	spin_unlock(&dev->pending_lock);
}
/**
 * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 *
 * Return 1 if constructed; otherwise, return 0.
 * Note that we are in the responder's side of the QP context.
 * Note the QP s_lock must be held.
 */
static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
			     struct ipath_other_headers *ohdr, u32 pmtu)
{
	struct ipath_ack_entry *e;
	u32 hwords;
	u32 len;
	u32 bth0;
	u32 bth2;

	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	hwords = 5;

	switch (qp->s_ack_state) {
	case OP(RDMA_READ_RESPONSE_LAST):
	case OP(RDMA_READ_RESPONSE_ONLY):
	case OP(ATOMIC_ACKNOWLEDGE):
		/*
		 * We can increment the tail pointer now that the last
		 * response has been sent instead of only being
		 * constructed.
		 */
		if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
			qp->s_tail_ack_queue = 0;
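		/*
		 * Note: the wrap above means the responder ACK queue indices
		 * run 0..IPATH_MAX_RDMA_ATOMIC inclusive, so s_ack_queue
		 * behaves as a ring of IPATH_MAX_RDMA_ATOMIC + 1 entries.
		 */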
		/* FALLTHROUGH */
	case OP(ACKNOWLEDGE):
		/* Check for no next entry in the queue. */
		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
			if (qp->s_flags & IPATH_S_ACK_PENDING)
				goto normal;
			qp->s_ack_state = OP(ACKNOWLEDGE);
			goto bail;
		}

		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->opcode == OP(RDMA_READ_REQUEST)) {
			/* Copy SGE state in case we need to resend */
			qp->s_ack_rdma_sge = e->rdma_sge;
			qp->s_cur_sge = &qp->s_ack_rdma_sge;
			len = e->rdma_sge.sge.sge_length;
			if (len > pmtu) {
				len = pmtu;
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
			} else
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
			ohdr->u.aeth = ipath_compute_aeth(qp);
			hwords++;
			qp->s_ack_rdma_psn = e->psn;
			bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
		} else {
			/* COMPARE_SWAP or FETCH_ADD */
			qp->s_cur_sge = NULL;
			len = 0;
			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
			ohdr->u.at.aeth = ipath_compute_aeth(qp);
			ohdr->u.at.atomic_ack_eth[0] =
				cpu_to_be32(e->atomic_data >> 32);
			ohdr->u.at.atomic_ack_eth[1] =
				cpu_to_be32(e->atomic_data);
			hwords += sizeof(ohdr->u.at) / sizeof(u32);
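			/*
			 * Note on the lines above: the 64-bit atomic result
			 * saved in e->atomic_data is returned in the
			 * AtomicAckETH as two big-endian 32-bit words, high
			 * half first, which is why the value is split with a
			 * 32-bit shift before the cpu_to_be32() conversions.
			 */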
			bth2 = e->psn;
		}
		bth0 = qp->s_ack_state << 24;
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_READ_RESPONSE_MIDDLE):
		len = qp->s_ack_rdma_sge.sge.sge_length;
		if (len > pmtu)
			len = pmtu;
		else {
			ohdr->u.aeth = ipath_compute_aeth(qp);
			hwords++;
			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
			qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1;
		}
		bth0 = qp->s_ack_state << 24;
		bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
		break;
	default:
	normal:
		/*
		 * Send a regular ACK.
		 * Set the s_ack_state so we wait until after sending
		 * the ACK before setting s_ack_state to ACKNOWLEDGE
		 * (see above).
		 */
		qp->s_ack_state = OP(SEND_ONLY);
		qp->s_flags &= ~IPATH_S_ACK_PENDING;
		qp->s_cur_sge = NULL;
		if (qp->s_nak_state)
			ohdr->u.aeth =
				cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
					    (qp->s_nak_state <<
					     IPATH_AETH_CREDIT_SHIFT));
		else
			ohdr->u.aeth = ipath_compute_aeth(qp);
		hwords++;
		len = 0;
		bth0 = OP(ACKNOWLEDGE) << 24;
		bth2 = qp->s_ack_psn & IPATH_PSN_MASK;
	}
	qp->s_hdrwords = hwords;
	qp->s_cur_size = len;
	ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2);
	return 1;

bail:
	return 0;
}
/**
 * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
 * @qp: a pointer to the QP
 *
 * Return 1 if constructed; otherwise, return 0.
 */
int ipath_make_rc_req(struct ipath_qp *qp)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	struct ipath_other_headers *ohdr;
	struct ipath_sge_state *ss;
	struct ipath_swqe *wqe;
	u32 hwords;
	u32 len;
	u32 bth0;
	u32 bth2;
	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
	char newreq;
	unsigned long flags;
	int ret = 0;

	ohdr = &qp->s_hdr.u.oth;
	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
		ohdr = &qp->s_hdr.u.l.oth;

	/*
	 * The lock is needed to synchronize between the sending tasklet,
	 * the receive interrupt handler, and timeout resends.
	 */
	spin_lock_irqsave(&qp->s_lock, flags);

	/* Sending responses has higher priority than sending requests. */
	if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||
	     (qp->s_flags & IPATH_S_ACK_PENDING) ||
	     qp->s_ack_state != OP(ACKNOWLEDGE)) &&
	    ipath_make_rc_ack(dev, qp, ohdr, pmtu))
		goto done;

	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) ||
	    qp->s_rnr_timeout || qp->s_wait_credit)
		goto bail;

	/* Limit the number of packets sent without an ACK. */
	if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT) > 0) {
		qp->s_wait_credit = 1;
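		/*
		 * IPATH_PSN_CREDIT bounds how many packets may be outstanding
		 * beyond the last acknowledged PSN; once that window is used
		 * up, s_wait_credit stalls the sender until update_last_psn()
		 * sees an ACK and reschedules the send tasklet.
		 */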
		goto bail;
	}

	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	hwords = 5;
	bth0 = 1 << 22; /* Set M bit */

	/* Send a request. */
	wqe = get_swqe_ptr(qp, qp->s_cur);
	switch (qp->s_state) {
	default:
		/*
		 * Resend an old request or start a new one.
		 *
		 * We keep track of the current SWQE so that
		 * we don't reset the "furthest progress" state
		 * if we need to back up.
		 */
		newreq = 0;
		if (qp->s_cur == qp->s_tail) {
			/* Check if send work queue is empty. */
			if (qp->s_tail == qp->s_head)
				goto bail;
			/*
			 * If a fence is requested, wait for previous
			 * RDMA read and atomic operations to finish.
			 */
			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
			    qp->s_num_rd_atomic) {
				qp->s_flags |= IPATH_S_FENCE_PENDING;
				goto bail;
			}
			wqe->psn = qp->s_next_psn;
			newreq = 1;
		}
		/*
		 * Note that we have to be careful not to modify the
		 * original work request since we may need to resend
		 * it.
		 */
		len = wqe->length;
		ss = &qp->s_sge;
		bth2 = 0;
		switch (wqe->wr.opcode) {
		case IB_WR_SEND:
		case IB_WR_SEND_WITH_IMM:
			/* If no credit, return. */
			if (qp->s_lsn != (u32) -1 &&
			    ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
				goto bail;
			wqe->lpsn = wqe->psn;
			if (len > pmtu) {
				wqe->lpsn += (len - 1) / pmtu;
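				/*
				 * Example of the arithmetic above: a
				 * 5000-byte SEND with a 2048-byte path MTU
				 * needs three packets, so the last PSN of the
				 * request is wqe->psn + (5000 - 1) / 2048 =
				 * wqe->psn + 2.
				 */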
				qp->s_state = OP(SEND_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_SEND)
				qp->s_state = OP(SEND_ONLY);
			else {
				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after the BTH */
				ohdr->u.imm_data = wqe->wr.imm_data;
				hwords += 1;
			}
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= 1 << 23;
			bth2 = 1 << 31;	/* Request ACK. */
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_WRITE:
			if (newreq && qp->s_lsn != (u32) -1)
				qp->s_lsn++;
			/* FALLTHROUGH */
		case IB_WR_RDMA_WRITE_WITH_IMM:
			/* If no credit, return. */
			if (qp->s_lsn != (u32) -1 &&
			    ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
				goto bail;
			ohdr->u.rc.reth.vaddr =
				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->wr.wr.rdma.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			hwords += sizeof(struct ib_reth) / sizeof(u32);
			wqe->lpsn = wqe->psn;
			if (len > pmtu) {
				wqe->lpsn += (len - 1) / pmtu;
				qp->s_state = OP(RDMA_WRITE_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
				qp->s_state = OP(RDMA_WRITE_ONLY);
			else {
				qp->s_state =
					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after RETH */
				ohdr->u.rc.imm_data = wqe->wr.imm_data;
				hwords += 1;
				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
					bth0 |= 1 << 23;
			}
			bth2 = 1 << 31;	/* Request ACK. */
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_READ:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= IPATH_S_RDMAR_PENDING;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (qp->s_lsn != (u32) -1)
					qp->s_lsn++;
				/*
				 * Adjust s_next_psn to count the
				 * expected number of responses.
				 */
				if (len > pmtu)
					qp->s_next_psn += (len - 1) / pmtu;
				wqe->lpsn = qp->s_next_psn++;
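				/*
				 * Example: a 10000-byte RDMA READ with a
				 * 2048-byte path MTU returns five response
				 * packets, so s_next_psn advances by four
				 * here plus one more for the post-increment,
				 * and lpsn records the PSN of the last
				 * expected response.
				 */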
			}
			ohdr->u.rc.reth.vaddr =
				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->wr.wr.rdma.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			qp->s_state = OP(RDMA_READ_REQUEST);
			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
			ss = NULL;
			len = 0;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_ATOMIC_CMP_AND_SWP:
		case IB_WR_ATOMIC_FETCH_AND_ADD:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= IPATH_S_RDMAR_PENDING;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (qp->s_lsn != (u32) -1)
					qp->s_lsn++;
				wqe->lpsn = wqe->psn;
			}
			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
				qp->s_state = OP(COMPARE_SWAP);
				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
					wqe->wr.wr.atomic.swap);
				ohdr->u.atomic_eth.compare_data = cpu_to_be64(
					wqe->wr.wr.atomic.compare_add);
			} else {
				qp->s_state = OP(FETCH_ADD);
				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
					wqe->wr.wr.atomic.compare_add);
				ohdr->u.atomic_eth.compare_data = 0;
			}
			ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
				wqe->wr.wr.atomic.remote_addr >> 32);
			ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
				wqe->wr.wr.atomic.remote_addr);
			ohdr->u.atomic_eth.rkey = cpu_to_be32(
				wqe->wr.wr.atomic.rkey);
			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
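			/*
			 * The AtomicETH laid out above is 28 bytes (8-byte
			 * virtual address, 4-byte rkey, and two 8-byte
			 * operands), so this adds seven 32-bit words to the
			 * header.
			 */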
			ss = NULL;
			len = 0;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		default:
			goto bail;
		}
		qp->s_sge.sge = wqe->sg_list[0];
		qp->s_sge.sg_list = wqe->sg_list + 1;
		qp->s_sge.num_sge = wqe->wr.num_sge;
		qp->s_len = wqe->length;
		if (newreq) {
			qp->s_tail++;
			if (qp->s_tail >= qp->s_size)
				qp->s_tail = 0;
		}
		bth2 |= qp->s_psn & IPATH_PSN_MASK;
		if (wqe->wr.opcode == IB_WR_RDMA_READ)
			qp->s_psn = wqe->lpsn + 1;
		else {
			qp->s_psn++;
			if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
				qp->s_next_psn = qp->s_psn;
		}
		/*
		 * Put the QP on the pending list so lost ACKs will cause
		 * a retry.  More than one request can be pending so the
		 * QP may already be on the dev->pending list.
		 */
		spin_lock(&dev->pending_lock);
		if (list_empty(&qp->timerwait))
			list_add_tail(&qp->timerwait,
				      &dev->pending[dev->pending_index]);
		spin_unlock(&dev->pending_lock);
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		/*
		 * This case can only happen if a send is restarted.
		 * See ipath_restart_rc().
		 */
		ipath_init_restart(qp, wqe);
		/* FALLTHROUGH */
	case OP(SEND_FIRST):
		qp->s_state = OP(SEND_MIDDLE);
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
		if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			break;
		}
		if (wqe->wr.opcode == IB_WR_SEND)
			qp->s_state = OP(SEND_LAST);
		else {
			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.imm_data;
			hwords += 1;
		}
		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
			bth0 |= 1 << 23;
		bth2 |= 1 << 31;	/* Request ACK. */
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_LAST):
		/*
		 * This case can only happen if a RDMA write is restarted.
		 * See ipath_restart_rc().
		 */
		ipath_init_restart(qp, wqe);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_FIRST):
		qp->s_state = OP(RDMA_WRITE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_MIDDLE):
		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
		if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			break;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
			qp->s_state = OP(RDMA_WRITE_LAST);
		else {
			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.imm_data;
			hwords += 1;
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= 1 << 23;
		}
		bth2 |= 1 << 31;	/* Request ACK. */
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/*
		 * This case can only happen if a RDMA read is restarted.
		 * See ipath_restart_rc().
		 */
		ipath_init_restart(qp, wqe);
		len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
		ohdr->u.rc.reth.vaddr =
			cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
		ohdr->u.rc.reth.rkey =
			cpu_to_be32(wqe->wr.wr.rdma.rkey);
		ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
		qp->s_state = OP(RDMA_READ_REQUEST);
		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
		if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = NULL;
		len = 0;
		qp->s_cur++;
		if (qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		break;
	}
	if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
		bth2 |= 1 << 31;	/* Request ACK. */
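	/*
	 * Bit 31 of bth2 is the BTH AckReq bit; setting it here asks the
	 * responder to acknowledge promptly once the sender is close to
	 * exhausting its IPATH_PSN_CREDIT window, so the window can reopen
	 * without waiting for a retransmit timeout.
	 */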

	qp->s_hdrwords = hwords;
	qp->s_cur_sge = ss;
	qp->s_cur_size = len;
	ipath_make_ruc_header(dev, qp, ohdr,
			      bth0 | (qp->s_state << 24), bth2);
done:
	ret = 1;
bail:
	spin_unlock_irqrestore(&qp->s_lock, flags);
	return ret;
}
/**
 * send_rc_ack - construct an ACK packet and send it
 * @qp: a pointer to the QP
 *
 * This is called from ipath_rc_rcv() and only uses the receive
 * side QP state.
 * Note that RDMA reads and atomics are handled in the
 * send side QP state and tasklet.
 */
static void send_rc_ack(struct ipath_qp *qp)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	u16 lrh0;
	u32 bth0;
	u32 hwords;
	struct ipath_ib_header hdr;
	struct ipath_other_headers *ohdr;
	unsigned long flags;

	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
	if (qp->r_head_ack_queue != qp->s_tail_ack_queue ||
	    (qp->s_flags & IPATH_S_ACK_PENDING) ||
	    qp->s_ack_state != OP(ACKNOWLEDGE))
		goto queue_ack;

	/* Construct the header. */
	ohdr = &hdr.u.oth;
	lrh0 = IPATH_LRH_BTH;
	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
	hwords = 6;
	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
		hwords += ipath_make_grh(dev, &hdr.u.l.grh,
					 &qp->remote_ah_attr.grh,
					 hwords, 0);
		ohdr = &hdr.u.l.oth;
		lrh0 = IPATH_LRH_GRH;
	}
	/* read pkey_index w/o lock (it's atomic) */
	bth0 = ipath_get_pkey(dev->dd, qp->s_pkey_index) |
		(OP(ACKNOWLEDGE) << 24) | (1 << 22);
	if (qp->r_nak_state)
		ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
					   (qp->r_nak_state <<
					    IPATH_AETH_CREDIT_SHIFT));
	else
		ohdr->u.aeth = ipath_compute_aeth(qp);
	lrh0 |= qp->remote_ah_attr.sl << 4;
	hdr.lrh[0] = cpu_to_be16(lrh0);
	hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
	hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid);
	ohdr->bth[0] = cpu_to_be32(bth0);
	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
	ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);

	/*
	 * If we can send the ACK, clear the ACK state.
	 */
	if (ipath_verbs_send(qp, &hdr, hwords, NULL, 0) == 0) {
		dev->n_unicast_xmit++;
		goto done;
	}

	/*
	 * We are out of PIO buffers at the moment.
	 * Pass responsibility for sending the ACK to the
	 * send tasklet so that when a PIO buffer becomes
	 * available, the ACK is sent ahead of other outgoing
	 * packets.
	 */
queue_ack:
	spin_lock_irqsave(&qp->s_lock, flags);
	qp->s_flags |= IPATH_S_ACK_PENDING;
	qp->s_nak_state = qp->r_nak_state;
	qp->s_ack_psn = qp->r_ack_psn;
	spin_unlock_irqrestore(&qp->s_lock, flags);

	/* Call ipath_do_rc_send() in another thread. */
	tasklet_hi_schedule(&qp->s_task);

done:
	return;
}
/**
 * reset_psn - reset the QP state to send starting from PSN
 * @qp: the QP
 * @psn: the packet sequence number to restart at
 *
 * This is called from ipath_rc_rcv() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held.
 */
static void reset_psn(struct ipath_qp *qp, u32 psn)
{
	u32 n = qp->s_last;
	struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
	u32 opcode;

	qp->s_cur = n;

	/*
	 * If we are starting the request from the beginning,
	 * let the normal send code handle initialization.
	 */
	if (ipath_cmp24(psn, wqe->psn) <= 0) {
		qp->s_state = OP(SEND_LAST);
		goto done;
	}

	/* Find the work request opcode corresponding to the given PSN. */
	opcode = wqe->wr.opcode;
	for (;;) {
		int diff;

		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail)
			break;
		wqe = get_swqe_ptr(qp, n);
		diff = ipath_cmp24(psn, wqe->psn);
		if (diff < 0)
			break;
		qp->s_cur = n;
		/*
		 * If we are starting the request from the beginning,
		 * let the normal send code handle initialization.
		 */
		if (diff == 0) {
			qp->s_state = OP(SEND_LAST);
			goto done;
		}
		opcode = wqe->wr.opcode;
	}

	/*
	 * Set the state to restart in the middle of a request.
	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
	 * See ipath_do_rc_send().
	 */
	switch (opcode) {
	case IB_WR_SEND:
	case IB_WR_SEND_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
		break;

	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
		break;

	case IB_WR_RDMA_READ:
		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		break;

	default:
		/*
		 * This case shouldn't happen since it's only
		 * one packet in the request.
		 */
		qp->s_state = OP(SEND_LAST);
	}
done:
	qp->s_psn = psn;
}
/**
 * ipath_restart_rc - back up requester to resend the last un-ACKed request
 * @qp: the QP to restart
 * @psn: packet sequence number for the request
 * @wc: the work completion request
 *
 * The QP s_lock should be held and interrupts disabled.
 */
void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc)
{
	struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
	struct ipath_ibdev *dev;

	if (qp->s_retry == 0) {
		wc->wr_id = wqe->wr.wr_id;
		wc->status = IB_WC_RETRY_EXC_ERR;
		wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
		wc->vendor_err = 0;
		wc->byte_len = 0;
		wc->imm_data = 0;
		wc->qp = &qp->ibqp;
		wc->src_qp = qp->remote_qpn;
		wc->wc_flags = 0;
		wc->pkey_index = 0;
		wc->slid = qp->remote_ah_attr.dlid;
		wc->sl = qp->remote_ah_attr.sl;
		wc->dlid_path_bits = 0;
		wc->port_num = 0;
		ipath_sqerror_qp(qp, wc);
		goto bail;
	}

	/*
	 * Remove the QP from the timeout queue.
	 * Note: it may already have been removed by ipath_ib_timer().
	 */
	dev = to_idev(qp->ibqp.device);
	spin_lock(&dev->pending_lock);
	if (!list_empty(&qp->timerwait))
		list_del_init(&qp->timerwait);
	spin_unlock(&dev->pending_lock);

	if (wqe->wr.opcode == IB_WR_RDMA_READ)
		dev->n_rc_resends++;
	else
		dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK;

	reset_psn(qp, psn);
	tasklet_hi_schedule(&qp->s_task);

bail:
	return;
}
static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
{
	if (qp->s_last_psn != psn) {
		qp->s_last_psn = psn;
		if (qp->s_wait_credit) {
			qp->s_wait_credit = 0;
			tasklet_hi_schedule(&qp->s_task);
		}
	}
}
/**
 * do_rc_ack - process an incoming RC ACK
 * @qp: the QP the ACK came in on
 * @psn: the packet sequence number of the ACK
 * @opcode: the opcode of the request that resulted in the ACK
 *
 * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held and interrupts disabled.
 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
 */
static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
		     u64 val)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	struct ib_wc wc;
	struct ipath_swqe *wqe;
	int ret = 0;
	u32 ack_psn;
	int diff;

	/*
	 * Remove the QP from the timeout queue (or RNR timeout queue).
	 * If ipath_ib_timer() has already removed it,
	 * it's OK since we hold the QP s_lock and ipath_restart_rc()
	 * just won't find anything to restart if we ACK everything.
	 */
	spin_lock(&dev->pending_lock);
	if (!list_empty(&qp->timerwait))
		list_del_init(&qp->timerwait);
	spin_unlock(&dev->pending_lock);

	/*
	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
	 * requests and implicitly NAK RDMA read and atomic requests issued
	 * before the NAK'ed request.  The MSN won't include the NAK'ed
	 * request but will include an ACK'ed request(s).
	 */
	ack_psn = psn;
	if (aeth >> 29)
		ack_psn--;
	wqe = get_swqe_ptr(qp, qp->s_last);

	/*
	 * The MSN might be for a later WQE than the PSN indicates so
	 * only complete WQEs that the PSN finishes.
	 */
	while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) {
		/*
		 * RDMA_READ_RESPONSE_ONLY is a special case since
		 * we want to generate completion events for everything
		 * before the RDMA read, copy the data, then generate
		 * the completion for the read.
		 */
		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
		    diff == 0) {
			ret = 1;
			goto bail;
		}
		/*
		 * If this request is a RDMA read or atomic, and the ACK is
		 * for a later operation, this ACK NAKs the RDMA read or
		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
		 * can ACK a RDMA read and likewise for atomic ops.  Note
		 * that the NAK case can only happen if relaxed ordering is
		 * used and requests are sent after an RDMA read or atomic
		 * is sent but before the response is received.
		 */
		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
			/*
			 * The last valid PSN seen is the previous
			 * request's.
			 */
			update_last_psn(qp, wqe->psn - 1);
			/* Retry this request. */
			ipath_restart_rc(qp, wqe->psn, &wc);
			/*
			 * No need to process the ACK/NAK since we are
			 * restarting an earlier request.
			 */
			goto bail;
		}
		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
			*(u64 *) wqe->sg_list[0].vaddr = val;
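		/*
		 * For a completed atomic operation the 64-bit value returned
		 * by the responder (passed in as "val") is stored into the
		 * buffer described by the request's first SGE, which is where
		 * the original atomic work request asked for the result.
		 */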
		if (qp->s_num_rd_atomic &&
		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
			qp->s_num_rd_atomic--;
			/* Restart sending task if fence is complete */
			if ((qp->s_flags & IPATH_S_FENCE_PENDING) &&
			    !qp->s_num_rd_atomic) {
				qp->s_flags &= ~IPATH_S_FENCE_PENDING;
				tasklet_hi_schedule(&qp->s_task);
			} else if (qp->s_flags & IPATH_S_RDMAR_PENDING) {
				qp->s_flags &= ~IPATH_S_RDMAR_PENDING;
				tasklet_hi_schedule(&qp->s_task);
			}
		}
		/* Post a send completion queue entry if requested. */
		if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
			wc.wr_id = wqe->wr.wr_id;
			wc.status = IB_WC_SUCCESS;
			wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
			wc.vendor_err = 0;
			wc.byte_len = wqe->length;
			wc.imm_data = 0;
			wc.qp = &qp->ibqp;
			wc.src_qp = qp->remote_qpn;
			wc.wc_flags = 0;
			wc.pkey_index = 0;
			wc.slid = qp->remote_ah_attr.dlid;
			wc.sl = qp->remote_ah_attr.sl;
			wc.dlid_path_bits = 0;
			wc.port_num = 0;
			ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
		}
		qp->s_retry = qp->s_retry_cnt;
		/*
		 * If we are completing a request which is in the process of
		 * being resent, we can stop resending it since we know the
		 * responder has already seen it.
		 */
		if (qp->s_last == qp->s_cur) {
			if (++qp->s_cur >= qp->s_size)
				qp->s_cur = 0;
			qp->s_last = qp->s_cur;
			if (qp->s_last == qp->s_tail)
				break;
			wqe = get_swqe_ptr(qp, qp->s_cur);
			qp->s_state = OP(SEND_LAST);
			qp->s_psn = wqe->psn;
		} else {
			if (++qp->s_last >= qp->s_size)
				qp->s_last = 0;
			if (qp->s_last == qp->s_tail)
				break;
			wqe = get_swqe_ptr(qp, qp->s_last);
		}
	}
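	/*
	 * The AETH in the ACK encodes its type in the top three bits of the
	 * syndrome (0 = ACK, 1 = RNR NAK, 3 = NAK, 2 = reserved), which is
	 * what the switch below decodes; the low 24 bits hold the MSN and
	 * the remaining syndrome bits carry the credit count, RNR timer, or
	 * NAK code.
	 */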
	switch (aeth >> 29) {
	case 0:		/* ACK */
		dev->n_rc_acks++;
		/* If this is a partial ACK, reset the retransmit timer. */
		if (qp->s_last != qp->s_tail) {
			spin_lock(&dev->pending_lock);
			if (list_empty(&qp->timerwait))
				list_add_tail(&qp->timerwait,
					      &dev->pending[dev->pending_index]);
			spin_unlock(&dev->pending_lock);
			/*
			 * If we get a partial ACK for a resent operation,
			 * we can stop resending the earlier packets and
			 * continue with the next packet the receiver wants.
			 */
			if (ipath_cmp24(qp->s_psn, psn) <= 0) {
				reset_psn(qp, psn + 1);
				tasklet_hi_schedule(&qp->s_task);
			}
		} else if (ipath_cmp24(qp->s_psn, psn) <= 0) {
			qp->s_state = OP(SEND_LAST);
			qp->s_psn = psn + 1;
		}
		ipath_get_credit(qp, aeth);
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		qp->s_retry = qp->s_retry_cnt;
		update_last_psn(qp, psn);
		ret = 1;
		goto bail;

	case 1:		/* RNR NAK */
		dev->n_rnr_naks++;
		if (qp->s_last == qp->s_tail)
			goto bail;
		if (qp->s_rnr_retry == 0) {
			wc.status = IB_WC_RNR_RETRY_EXC_ERR;
			goto class_b;
		}
		if (qp->s_rnr_retry_cnt < 7)
			qp->s_rnr_retry--;

		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);

		if (wqe->wr.opcode == IB_WR_RDMA_READ)
			dev->n_rc_resends++;
		else
			dev->n_rc_resends +=
				(qp->s_psn - psn) & IPATH_PSN_MASK;

		reset_psn(qp, psn);

		qp->s_rnr_timeout =
			ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &
					   IPATH_AETH_CREDIT_MASK];
		ipath_insert_rnr_queue(qp);
		goto bail;

	case 3:		/* NAK */
		if (qp->s_last == qp->s_tail)
			goto bail;
		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);
		switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
			IPATH_AETH_CREDIT_MASK) {
		case 0:	/* PSN sequence error */
			dev->n_seq_naks++;
			/*
			 * Back up to the responder's expected PSN.
			 * Note that we might get a NAK in the middle of an
			 * RDMA READ response which terminates the RDMA
			 * READ.
			 */
			ipath_restart_rc(qp, psn, &wc);
			break;

		case 1:	/* Invalid Request */
			wc.status = IB_WC_REM_INV_REQ_ERR;
			dev->n_other_naks++;
			goto class_b;

		case 2:	/* Remote Access Error */
			wc.status = IB_WC_REM_ACCESS_ERR;
			dev->n_other_naks++;
			goto class_b;

		case 3:	/* Remote Operation Error */
			wc.status = IB_WC_REM_OP_ERR;
			dev->n_other_naks++;
		class_b:
			wc.wr_id = wqe->wr.wr_id;
			wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
			wc.vendor_err = 0;
			wc.byte_len = 0;
			wc.imm_data = 0;
			wc.qp = &qp->ibqp;
			wc.src_qp = qp->remote_qpn;
			wc.wc_flags = 0;
			wc.pkey_index = 0;
			wc.slid = qp->remote_ah_attr.dlid;
			wc.sl = qp->remote_ah_attr.sl;
			wc.dlid_path_bits = 0;
			wc.port_num = 0;
			ipath_sqerror_qp(qp, &wc);
			break;

		default:
			/* Ignore other reserved NAK error codes */
			goto reserved;
		}
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		goto bail;

	default:		/* 2: reserved */
	reserved:
		/* Ignore reserved NAK codes. */
		goto bail;
	}

bail:
	return ret;
}
/**
 * ipath_rc_rcv_resp - process an incoming RC response packet
 * @dev: the device this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @hdrsize: the header length
 * @pmtu: the path MTU
 * @header_in_data: true if part of the header data is in the data buffer
 *
 * This is called from ipath_rc_rcv() to process an incoming RC response
 * packet for the given QP.
 * Called at interrupt level.
 */
static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
				     struct ipath_other_headers *ohdr,
				     void *data, u32 tlen,
				     struct ipath_qp *qp,
				     int opcode,
				     u32 psn, u32 hdrsize, u32 pmtu,
				     int header_in_data)
{
	struct ipath_swqe *wqe;
	unsigned long flags;
	struct ib_wc wc;
	int diff;
	u32 pad;
	u32 aeth;
	u64 val;

	spin_lock_irqsave(&qp->s_lock, flags);

	/* Ignore invalid responses. */
	if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
		goto ack_done;

	/* Ignore duplicate responses. */
	diff = ipath_cmp24(psn, qp->s_last_psn);
	if (unlikely(diff <= 0)) {
		/* Update credits for "ghost" ACKs */
		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
			if (!header_in_data)
				aeth = be32_to_cpu(ohdr->u.aeth);
			else {
				aeth = be32_to_cpu(((__be32 *) data)[0]);
				data += sizeof(__be32);
			}
			if ((aeth >> 29) == 0)
				ipath_get_credit(qp, aeth);
		}
		goto ack_done;
	}

	if (unlikely(qp->s_last == qp->s_tail))
		goto ack_done;
	wqe = get_swqe_ptr(qp, qp->s_last);

	switch (opcode) {
	case OP(ACKNOWLEDGE):
	case OP(ATOMIC_ACKNOWLEDGE):
	case OP(RDMA_READ_RESPONSE_FIRST):
		if (!header_in_data)
			aeth = be32_to_cpu(ohdr->u.aeth);
		else {
			aeth = be32_to_cpu(((__be32 *) data)[0]);
			data += sizeof(__be32);
		}
		if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
			if (!header_in_data) {
				__be32 *p = ohdr->u.at.atomic_ack_eth;

				val = ((u64) be32_to_cpu(p[0]) << 32) |
					be32_to_cpu(p[1]);
			} else
				val = be64_to_cpu(((__be64 *) data)[0]);
		} else
			val = 0;
		if (!do_rc_ack(qp, aeth, psn, opcode, val) ||
		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
			goto ack_done;
		hdrsize += 4;
		wqe = get_swqe_ptr(qp, qp->s_last);
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_middle;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/* no AETH, no ACK */
		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
			dev->n_rdma_seq++;
			ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
			goto ack_done;
		}
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
	read_middle:
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto ack_len_err;
		if (unlikely(pmtu >= qp->s_rdma_read_len))
			goto ack_len_err;

		/* We got a response so update the timeout. */
		spin_lock(&dev->pending_lock);
		if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
			list_move_tail(&qp->timerwait,
				       &dev->pending[dev->pending_index]);
		spin_unlock(&dev->pending_lock);
		/*
		 * Update the RDMA receive state but do the copy w/o
		 * holding the locks and blocking interrupts.
		 */
		qp->s_rdma_read_len -= pmtu;
		update_last_psn(qp, psn);
		spin_unlock_irqrestore(&qp->s_lock, flags);
		ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu);
		goto bail;

	case OP(RDMA_READ_RESPONSE_ONLY):
		if (!header_in_data)
			aeth = be32_to_cpu(ohdr->u.aeth);
		else
			aeth = be32_to_cpu(((__be32 *) data)[0]);
		if (!do_rc_ack(qp, aeth, psn, opcode, 0))
			goto ack_done;
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/*
		 * Check that the data size is >= 0 && <= pmtu.
		 * Remember to account for the AETH header (4) and
		 * ICRC (4).
		 */
		if (unlikely(tlen < (hdrsize + pad + 8)))
			goto ack_len_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		wqe = get_swqe_ptr(qp, qp->s_last);
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_last;

	case OP(RDMA_READ_RESPONSE_LAST):
		/* ACKs READ req. */
		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
			dev->n_rdma_seq++;
			ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
			goto ack_done;
		}
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/*
		 * Check that the data size is >= 1 && <= pmtu.
		 * Remember to account for the AETH header (4) and
		 * ICRC (4).
		 */
		if (unlikely(tlen <= (hdrsize + pad + 8)))
			goto ack_len_err;
	read_last:
		tlen -= hdrsize + pad + 8;
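		/*
		 * The length adjustment above strips the fixed overhead: the
		 * pad count comes from bits 20-21 of the first BTH word, and
		 * the extra 8 bytes are the 4-byte AETH plus the 4-byte ICRC
		 * that trail the payload.
		 */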
		if (unlikely(tlen != qp->s_rdma_read_len))
			goto ack_len_err;
		if (!header_in_data)
			aeth = be32_to_cpu(ohdr->u.aeth);
		else {
			aeth = be32_to_cpu(((__be32 *) data)[0]);
			data += sizeof(__be32);
		}
		ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);
		(void) do_rc_ack(qp, aeth, psn,
				 OP(RDMA_READ_RESPONSE_LAST), 0);
		goto ack_done;
	}

ack_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
	goto bail;

ack_op_err:
	wc.status = IB_WC_LOC_QP_OP_ERR;
	goto ack_err;

ack_len_err:
	wc.status = IB_WC_LOC_LEN_ERR;
ack_err:
	wc.wr_id = wqe->wr.wr_id;
	wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
	wc.vendor_err = 0;
	wc.byte_len = 0;
	wc.imm_data = 0;
	wc.qp = &qp->ibqp;
	wc.src_qp = qp->remote_qpn;
	wc.wc_flags = 0;
	wc.pkey_index = 0;
	wc.slid = qp->remote_ah_attr.dlid;
	wc.sl = qp->remote_ah_attr.sl;
	wc.dlid_path_bits = 0;
	wc.port_num = 0;
	ipath_sqerror_qp(qp, &wc);
	spin_unlock_irqrestore(&qp->s_lock, flags);
bail:
	return;
}
/**
 * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
 * @dev: the device this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @diff: the difference between the PSN and the expected PSN
 * @header_in_data: true if part of the header data is in the data buffer
 *
 * This is called from ipath_rc_rcv() to process an unexpected
 * incoming RC packet for the given QP.
 * Called at interrupt level.
 * Return 1 if no more processing is needed; otherwise return 0 to
 * schedule a response to be sent.
 */
static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
				     struct ipath_other_headers *ohdr,
				     void *data,
				     struct ipath_qp *qp,
				     u32 opcode,
				     u32 psn,
				     int diff,
				     int header_in_data)
{
	struct ipath_ack_entry *e;
	u8 i, prev;
	int old_req;
	unsigned long flags;

	if (diff > 0) {
		/*
		 * Packet sequence error.
		 * A NAK will ACK earlier sends and RDMA writes.
		 * Don't queue the NAK if we already sent one.
		 */
		if (!qp->r_nak_state) {
			qp->r_nak_state = IB_NAK_PSN_ERROR;
			/* Use the expected PSN. */
			qp->r_ack_psn = qp->r_psn;
			goto send_ack;
		}
		goto done;
	}

	/*
	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
	 * write or atomic op.  Don't NAK errors, just silently drop
	 * the duplicate request.  Note that r_sge, r_len, and
	 * r_rcv_len may be in use so don't modify them.
	 *
	 * We are supposed to ACK the earliest duplicate PSN but we
	 * can coalesce an outstanding duplicate ACK.  We have to
	 * send the earliest so that RDMA reads can be restarted at
	 * the requester's expected PSN.
	 *
	 * First, find where this duplicate PSN falls within the
	 * ACKs previously sent.
	 */
	psn &= IPATH_PSN_MASK;
	e = NULL;
	old_req = 1;
	spin_lock_irqsave(&qp->s_lock, flags);
	for (i = qp->r_head_ack_queue; ; i = prev) {
		if (i == qp->s_tail_ack_queue)
			old_req = 0;
		if (i)
			prev = i - 1;
		else
			prev = IPATH_MAX_RDMA_ATOMIC;
		if (prev == qp->r_head_ack_queue) {
			e = NULL;
			break;
		}
		e = &qp->s_ack_queue[prev];
		if (!e->opcode) {
			e = NULL;
			break;
		}
		if (ipath_cmp24(psn, e->psn) >= 0) {
			if (prev == qp->s_tail_ack_queue)
				old_req = 0;
			break;
		}
	}
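	/*
	 * At this point the loop above has walked the previously sent
	 * responses backwards from r_head_ack_queue toward s_tail_ack_queue
	 * looking for the entry that covers the duplicate PSN; old_req is
	 * left non-zero when an older response is still scheduled to go out
	 * ahead of this one, in which case the duplicate can simply be
	 * ignored below.
	 */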
	switch (opcode) {
	case OP(RDMA_READ_REQUEST): {
		struct ib_reth *reth;
		u32 offset;
		u32 len;

		/*
		 * If we didn't find the RDMA read request in the ack queue,
		 * or the send tasklet is already backed up to send an
		 * earlier entry, we can ignore this request.
		 */
		if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req)
			goto unlock_done;
		/* RETH comes after BTH */
		if (!header_in_data)
			reth = &ohdr->u.rc.reth;
		else {
			reth = (struct ib_reth *)data;
			data += sizeof(*reth);
		}
		/*
		 * Address range must be a subset of the original
		 * request and start on pmtu boundaries.
		 * We reuse the old ack_queue slot since the requester
		 * should not back up and request an earlier PSN for the
		 * same request.
		 */
		offset = ((psn - e->psn) & IPATH_PSN_MASK) *
			ib_mtu_enum_to_int(qp->path_mtu);
		len = be32_to_cpu(reth->length);
		if (unlikely(offset + len > e->rdma_sge.sge.sge_length))
			goto unlock_done;
		if (len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			ok = ipath_rkey_ok(qp, &e->rdma_sge,
					   len, vaddr, rkey,
					   IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto unlock_done;
		} else {
			e->rdma_sge.sg_list = NULL;
			e->rdma_sge.num_sge = 0;
			e->rdma_sge.sge.mr = NULL;
			e->rdma_sge.sge.vaddr = NULL;
			e->rdma_sge.sge.length = 0;
			e->rdma_sge.sge.sge_length = 0;
		}
		e->psn = psn;
		qp->s_ack_state = OP(ACKNOWLEDGE);
		qp->s_tail_ack_queue = prev;
		break;
	}

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		/*
		 * If we didn't find the atomic request in the ack queue
		 * or the send tasklet is already backed up to send an
		 * earlier entry, we can ignore this request.
		 */
		if (!e || e->opcode != (u8) opcode || old_req)
			goto unlock_done;
		qp->s_ack_state = OP(ACKNOWLEDGE);
		qp->s_tail_ack_queue = prev;
		break;
	}

	default:
		if (old_req)
			goto unlock_done;
		/*
		 * Resend the most recent ACK if this request is
		 * after all the previous RDMA reads and atomics.
		 */
		if (i == qp->r_head_ack_queue) {
			spin_unlock_irqrestore(&qp->s_lock, flags);
			qp->r_nak_state = 0;
			qp->r_ack_psn = qp->r_psn - 1;
			goto send_ack;
		}
		/*
		 * Try to send a simple ACK to work around a Mellanox bug
		 * which doesn't accept a RDMA read response or atomic
		 * response as an ACK for earlier SENDs or RDMA writes.
		 */
		if (qp->r_head_ack_queue == qp->s_tail_ack_queue &&
		    !(qp->s_flags & IPATH_S_ACK_PENDING) &&
		    qp->s_ack_state == OP(ACKNOWLEDGE)) {
			spin_unlock_irqrestore(&qp->s_lock, flags);
			qp->r_nak_state = 0;
			qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
			goto send_ack;
		}
		/*
		 * Resend the RDMA read or atomic op which
		 * ACKs this duplicate request.
		 */
		qp->s_ack_state = OP(ACKNOWLEDGE);
		qp->s_tail_ack_queue = i;
		break;
	}
	qp->r_nak_state = 0;
	tasklet_hi_schedule(&qp->s_task);

unlock_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
done:
	return 1;

send_ack:
	return 0;
}
static void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
{
	unsigned long flags;
	int lastwqe;

	spin_lock_irqsave(&qp->s_lock, flags);
	qp->state = IB_QPS_ERR;
	lastwqe = ipath_error_qp(qp, err);
	spin_unlock_irqrestore(&qp->s_lock, flags);

	if (lastwqe) {
		struct ib_event ev;

		ev.device = qp->ibqp.device;
		ev.element.qp = &qp->ibqp;
		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
	}
}
static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)
{
	unsigned long flags;
	unsigned next;

	next = n + 1;
	if (next > IPATH_MAX_RDMA_ATOMIC)
		next = 0;
	spin_lock_irqsave(&qp->s_lock, flags);
	if (n == qp->s_tail_ack_queue) {
		qp->s_tail_ack_queue = next;
		qp->s_ack_state = OP(ACKNOWLEDGE);
	}
	spin_unlock_irqrestore(&qp->s_lock, flags);
}
/**
 * ipath_rc_rcv - process an incoming RC packet
 * @dev: the device this packet came in on
 * @hdr: the header of this packet
 * @has_grh: true if the header has a GRH
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 *
 * This is called from ipath_qp_rcv() to process an incoming RC packet
 * for the given QP.
 * Called at interrupt level.
 */
void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
{
	struct ipath_other_headers *ohdr;
	u32 opcode;
	u32 hdrsize;
	u32 psn;
	u32 pad;
	struct ib_wc wc;
	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
	int diff;
	struct ib_reth *reth;
	int header_in_data;

	/* Validate the SLID. See Ch. 9.6.1.5 */
	if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
		goto done;

	/* Check for GRH */
	if (!has_grh) {
		ohdr = &hdr->u.oth;
		hdrsize = 8 + 12;	/* LRH + BTH */
		psn = be32_to_cpu(ohdr->bth[2]);
		header_in_data = 0;
	} else {
		ohdr = &hdr->u.l.oth;
		hdrsize = 8 + 40 + 12;	/* LRH + GRH + BTH */
		/*
		 * The header with GRH is 60 bytes and the core driver sets
		 * the eager header buffer size to 56 bytes so the last 4
		 * bytes of the BTH header (PSN) is in the data buffer.
		 */
		header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
		if (header_in_data) {
			psn = be32_to_cpu(((__be32 *) data)[0]);
			data += sizeof(__be32);
		} else
			psn = be32_to_cpu(ohdr->bth[2]);
	}

	/*
	 * Process responses (ACKs) before anything else.  Note that the
	 * packet sequence number will be for something in the send work
	 * queue rather than the expected receive packet sequence number.
	 * In other words, this QP is the requester.
	 */
	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
		ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
				  hdrsize, pmtu, header_in_data);
		goto done;
	}

	/* Compute 24 bits worth of difference. */
	diff = ipath_cmp24(psn, qp->r_psn);
	if (unlikely(diff)) {
		if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
				       psn, diff, header_in_data))
			goto done;
		goto send_ack;
	}

	/* Check for opcode sequence errors. */
	switch (qp->r_state) {
	case OP(SEND_FIRST):
	case OP(SEND_MIDDLE):
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
			break;
	nack_inv:
		ipath_rc_error(qp, IB_WC_REM_INV_REQ_ERR);
		qp->r_nak_state = IB_NAK_INVALID_REQUEST;
		qp->r_ack_psn = qp->r_psn;
		goto send_ack;

	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_MIDDLE):
		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			break;
		goto nack_inv;

	default:
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
		    opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			goto nack_inv;
		/*
		 * Note that it is up to the requester to not send a new
		 * RDMA read or atomic operation before receiving an ACK
		 * for the previous operation.
		 */
		break;
	}

	/* OK, process the packet. */
	switch (opcode) {
	case OP(SEND_FIRST):
		if (!ipath_get_rwqe(qp, 0)) {
		rnr_nak:
			qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
			qp->r_ack_psn = qp->r_psn;
			goto send_ack;
		}
		qp->r_rcv_len = 0;
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
	case OP(RDMA_WRITE_MIDDLE):
	send_middle:
		/* Check for invalid length PMTU or posted rwqe len. */
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto nack_inv;
		qp->r_rcv_len += pmtu;
		if (unlikely(qp->r_rcv_len > qp->r_len))
			goto nack_inv;
		ipath_copy_sge(&qp->r_sge, data, pmtu);
		break;

	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
		/* consume RWQE */
		if (!ipath_get_rwqe(qp, 1))
			goto rnr_nak;
		goto send_last_imm;

	case OP(SEND_ONLY):
	case OP(SEND_ONLY_WITH_IMMEDIATE):
		if (!ipath_get_rwqe(qp, 0))
			goto rnr_nak;
		qp->r_rcv_len = 0;
		if (opcode == OP(SEND_ONLY))
			goto send_last;
		/* FALLTHROUGH */
	case OP(SEND_LAST_WITH_IMMEDIATE):
	send_last_imm:
		if (header_in_data) {
			wc.imm_data = *(__be32 *) data;
			data += sizeof(__be32);
		} else {
			/* Immediate data comes after BTH */
			wc.imm_data = ohdr->u.imm_data;
		}
		hdrsize += 4;
		wc.wc_flags = IB_WC_WITH_IMM;
		/* FALLTHROUGH */
	case OP(SEND_LAST):
	case OP(RDMA_WRITE_LAST):
	send_last:
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/* Check for invalid length. */
		/* XXX LAST len should be >= 1 */
		if (unlikely(tlen < (hdrsize + pad + 4)))
			goto nack_inv;
		/* Don't count the CRC. */
		tlen -= (hdrsize + pad + 4);
		wc.byte_len = tlen + qp->r_rcv_len;
		if (unlikely(wc.byte_len > qp->r_len))
			goto nack_inv;
		ipath_copy_sge(&qp->r_sge, data, tlen);
		qp->r_msn++;
		if (!qp->r_wrid_valid)
			break;
		qp->r_wrid_valid = 0;
		wc.wr_id = qp->r_wr_id;
		wc.status = IB_WC_SUCCESS;
		wc.opcode = IB_WC_RECV;
		wc.vendor_err = 0;
		wc.qp = &qp->ibqp;
		wc.src_qp = qp->remote_qpn;
		wc.pkey_index = 0;
		wc.slid = qp->remote_ah_attr.dlid;
		wc.sl = qp->remote_ah_attr.sl;
		wc.dlid_path_bits = 0;
		wc.port_num = 0;
		/* Signal completion event if the solicited bit is set. */
		ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
			       (ohdr->bth[0] &
				__constant_cpu_to_be32(1 << 23)) != 0);
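		/*
		 * Bit 23 of the first BTH word is the Solicited Event bit;
		 * the test above passes it to ipath_cq_enter() so a
		 * completion event is only signalled when the sender
		 * requested solicited notification.
		 */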
		break;

	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_ONLY):
	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
		if (unlikely(!(qp->qp_access_flags &
			       IB_ACCESS_REMOTE_WRITE)))
			goto nack_acc;
		/* RETH comes after BTH */
		if (!header_in_data)
			reth = &ohdr->u.rc.reth;
		else {
			reth = (struct ib_reth *)data;
			data += sizeof(*reth);
		}
		hdrsize += sizeof(*reth);
		qp->r_len = be32_to_cpu(reth->length);
		qp->r_rcv_len = 0;
		if (qp->r_len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/* Check rkey & NAK */
			ok = ipath_rkey_ok(qp, &qp->r_sge,
					   qp->r_len, vaddr, rkey,
					   IB_ACCESS_REMOTE_WRITE);
			if (unlikely(!ok))
				goto nack_acc;
		} else {
			qp->r_sge.sg_list = NULL;
			qp->r_sge.sge.mr = NULL;
			qp->r_sge.sge.vaddr = NULL;
			qp->r_sge.sge.length = 0;
			qp->r_sge.sge.sge_length = 0;
		}
		if (opcode == OP(RDMA_WRITE_FIRST))
			goto send_middle;
		else if (opcode == OP(RDMA_WRITE_ONLY))
			goto send_last;
		if (!ipath_get_rwqe(qp, 1))
			goto rnr_nak;
		goto send_last_imm;

	case OP(RDMA_READ_REQUEST): {
		struct ipath_ack_entry *e;
		u32 len;
		u8 next;

		if (unlikely(!(qp->qp_access_flags &
			       IB_ACCESS_REMOTE_READ)))
			goto nack_acc;
		next = qp->r_head_ack_queue + 1;
		if (next > IPATH_MAX_RDMA_ATOMIC)
			next = 0;
		if (unlikely(next == qp->s_tail_ack_queue)) {
			if (!qp->s_ack_queue[next].sent)
				goto nack_inv;
			ipath_update_ack_queue(qp, next);
		}
		e = &qp->s_ack_queue[qp->r_head_ack_queue];
		/* RETH comes after BTH */
		if (!header_in_data)
			reth = &ohdr->u.rc.reth;
		else {
			reth = (struct ib_reth *)data;
			data += sizeof(*reth);
		}
		len = be32_to_cpu(reth->length);
		if (len) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/* Check rkey & NAK */
			ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr,
					   rkey, IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto nack_acc;
			/*
			 * Update the next expected PSN.  We add 1 later
			 * below, so only add the remainder here.
			 */
			if (len > pmtu)
				qp->r_psn += (len - 1) / pmtu;
		} else {
			e->rdma_sge.sg_list = NULL;
			e->rdma_sge.num_sge = 0;
			e->rdma_sge.sge.mr = NULL;
			e->rdma_sge.sge.vaddr = NULL;
			e->rdma_sge.sge.length = 0;
			e->rdma_sge.sge.sge_length = 0;
		}
		e->opcode = (u8) opcode;
		e->sent = 0;
		e->psn = psn;
		/*
		 * We need to increment the MSN here instead of when we
		 * finish sending the result since a duplicate request would
		 * increment it more than once.
		 */
		qp->r_msn++;
		qp->r_psn++;
		qp->r_state = opcode;
		qp->r_nak_state = 0;
		qp->r_head_ack_queue = next;

		/* Call ipath_do_rc_send() in another thread. */
		tasklet_hi_schedule(&qp->s_task);

		goto done;
	}

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		struct ib_atomic_eth *ateth;
		struct ipath_ack_entry *e;
		u64 vaddr;
		atomic64_t *maddr;
		u64 sdata;
		u32 rkey;
		u8 next;

		if (unlikely(!(qp->qp_access_flags &
			       IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_acc;
		next = qp->r_head_ack_queue + 1;
		if (next > IPATH_MAX_RDMA_ATOMIC)
			next = 0;
		if (unlikely(next == qp->s_tail_ack_queue)) {
			if (!qp->s_ack_queue[next].sent)
				goto nack_inv;
			ipath_update_ack_queue(qp, next);
		}
		if (!header_in_data)
			ateth = &ohdr->u.atomic_eth;
		else
			ateth = (struct ib_atomic_eth *)data;
		vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
			be32_to_cpu(ateth->vaddr[1]);
		if (unlikely(vaddr & (sizeof(u64) - 1)))
			goto nack_inv;
		rkey = be32_to_cpu(ateth->rkey);
		/* Check rkey & NAK */
		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
					    sizeof(u64), vaddr, rkey,
					    IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_acc;
		/* Perform atomic OP and save result. */
		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
		sdata = be64_to_cpu(ateth->swap_data);
		e = &qp->s_ack_queue[qp->r_head_ack_queue];
		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
			(u64) atomic64_add_return(sdata, maddr) - sdata :
			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
				      be64_to_cpu(ateth->compare_data),
				      sdata);
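		/*
		 * Either way e->atomic_data ends up holding the value the
		 * target location contained before the operation: for
		 * FETCH_ADD the post-add result minus the addend, and for
		 * COMPARE_SWAP the value returned by cmpxchg(), which is the
		 * prior contents whether or not the swap took place.
		 */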
		e->opcode = (u8) opcode;
		e->sent = 0;
		e->psn = psn & IPATH_PSN_MASK;
		qp->r_msn++;
		qp->r_psn++;
		qp->r_state = opcode;
		qp->r_nak_state = 0;
		qp->r_head_ack_queue = next;

		/* Call ipath_do_rc_send() in another thread. */
		tasklet_hi_schedule(&qp->s_task);

		goto done;
	}

	default:
		/* NAK unknown opcodes. */
		goto nack_inv;
	}
	qp->r_psn++;
	qp->r_state = opcode;
	qp->r_ack_psn = psn;
	qp->r_nak_state = 0;
	/* Send an ACK if requested or required. */
	if (psn & (1 << 31))
		goto send_ack;
	goto done;

nack_acc:
	ipath_rc_error(qp, IB_WC_REM_ACCESS_ERR);
	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
	qp->r_ack_psn = qp->r_psn;