2 * Copyright (c) 2006 QLogic, Inc. All rights reserved.
3 * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
5 * This software is available to you under a choice of one of two
6 * licenses. You may choose to be licensed under the terms of the GNU
7 * General Public License (GPL) Version 2, available from the file
8 * COPYING in the main directory of this source tree, or the
9 * OpenIB.org BSD license below:
11 * Redistribution and use in source and binary forms, with or
12 * without modification, are permitted provided that the following
15 * - Redistributions of source code must retain the above
16 * copyright notice, this list of conditions and the following
19 * - Redistributions in binary form must reproduce the above
20 * copyright notice, this list of conditions and the following
21 * disclaimer in the documentation and/or other materials
22 * provided with the distribution.
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
34 #include "ipath_verbs.h"
35 #include "ipath_kernel.h"
37 /* cut down ridiculously long IB macro names */
38 #define OP(x) IB_OPCODE_RC_##x
40 static u32
restart_sge(struct ipath_sge_state
*ss
, struct ipath_swqe
*wqe
,
45 len
= ((psn
- wqe
->psn
) & IPATH_PSN_MASK
) * pmtu
;
46 ss
->sge
= wqe
->sg_list
[0];
47 ss
->sg_list
= wqe
->sg_list
+ 1;
48 ss
->num_sge
= wqe
->wr
.num_sge
;
49 ipath_skip_sge(ss
, len
);
50 return wqe
->length
- len
;
54 * ipath_init_restart - initialize the qp->s_sge after a restart
55 * @qp: the QP whose SGE we're restarting
56 * @wqe: the send work queue entry to initialize the QP's SGE from
58 * The QP s_lock should be held and interrupts disabled.
60 static void ipath_init_restart(struct ipath_qp
*qp
, struct ipath_swqe
*wqe
)
62 struct ipath_ibdev
*dev
;
64 qp
->s_len
= restart_sge(&qp
->s_sge
, wqe
, qp
->s_psn
,
65 ib_mtu_enum_to_int(qp
->path_mtu
));
66 dev
= to_idev(qp
->ibqp
.device
);
67 spin_lock(&dev
->pending_lock
);
68 if (list_empty(&qp
->timerwait
))
69 list_add_tail(&qp
->timerwait
,
70 &dev
->pending
[dev
->pending_index
]);
71 spin_unlock(&dev
->pending_lock
);
75 * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
76 * @qp: a pointer to the QP
77 * @ohdr: a pointer to the IB header being constructed
80 * Return 1 if constructed; otherwise, return 0.
81 * Note that we are in the responder's side of the QP context.
82 * Note the QP s_lock must be held.
84 static int ipath_make_rc_ack(struct ipath_qp
*qp
,
85 struct ipath_other_headers
*ohdr
,
86 u32 pmtu
, u32
*bth0p
, u32
*bth2p
)
88 struct ipath_ack_entry
*e
;
94 /* header size in 32-bit words LRH+BTH = (8+12)/4. */
97 switch (qp
->s_ack_state
) {
98 case OP(RDMA_READ_RESPONSE_LAST
):
99 case OP(RDMA_READ_RESPONSE_ONLY
):
100 case OP(ATOMIC_ACKNOWLEDGE
):
102 * We can increment the tail pointer now that the last
103 * response has been sent instead of only being
106 if (++qp
->s_tail_ack_queue
> IPATH_MAX_RDMA_ATOMIC
)
107 qp
->s_tail_ack_queue
= 0;
110 case OP(ACKNOWLEDGE
):
111 /* Check for no next entry in the queue. */
112 if (qp
->r_head_ack_queue
== qp
->s_tail_ack_queue
) {
113 if (qp
->s_flags
& IPATH_S_ACK_PENDING
)
115 qp
->s_ack_state
= OP(ACKNOWLEDGE
);
119 e
= &qp
->s_ack_queue
[qp
->s_tail_ack_queue
];
120 if (e
->opcode
== OP(RDMA_READ_REQUEST
)) {
121 /* Copy SGE state in case we need to resend */
122 qp
->s_ack_rdma_sge
= e
->rdma_sge
;
123 qp
->s_cur_sge
= &qp
->s_ack_rdma_sge
;
124 len
= e
->rdma_sge
.sge
.sge_length
;
127 qp
->s_ack_state
= OP(RDMA_READ_RESPONSE_FIRST
);
129 qp
->s_ack_state
= OP(RDMA_READ_RESPONSE_ONLY
);
130 ohdr
->u
.aeth
= ipath_compute_aeth(qp
);
132 qp
->s_ack_rdma_psn
= e
->psn
;
133 bth2
= qp
->s_ack_rdma_psn
++ & IPATH_PSN_MASK
;
135 /* COMPARE_SWAP or FETCH_ADD */
136 qp
->s_cur_sge
= NULL
;
138 qp
->s_ack_state
= OP(ATOMIC_ACKNOWLEDGE
);
139 ohdr
->u
.at
.aeth
= ipath_compute_aeth(qp
);
140 ohdr
->u
.at
.atomic_ack_eth
[0] =
141 cpu_to_be32(e
->atomic_data
>> 32);
142 ohdr
->u
.at
.atomic_ack_eth
[1] =
143 cpu_to_be32(e
->atomic_data
);
144 hwords
+= sizeof(ohdr
->u
.at
) / sizeof(u32
);
147 bth0
= qp
->s_ack_state
<< 24;
150 case OP(RDMA_READ_RESPONSE_FIRST
):
151 qp
->s_ack_state
= OP(RDMA_READ_RESPONSE_MIDDLE
);
153 case OP(RDMA_READ_RESPONSE_MIDDLE
):
154 len
= qp
->s_ack_rdma_sge
.sge
.sge_length
;
158 ohdr
->u
.aeth
= ipath_compute_aeth(qp
);
160 qp
->s_ack_state
= OP(RDMA_READ_RESPONSE_LAST
);
162 bth0
= qp
->s_ack_state
<< 24;
163 bth2
= qp
->s_ack_rdma_psn
++ & IPATH_PSN_MASK
;
169 * Send a regular ACK.
170 * Set the s_ack_state so we wait until after sending
171 * the ACK before setting s_ack_state to ACKNOWLEDGE
174 qp
->s_ack_state
= OP(SEND_ONLY
);
175 qp
->s_flags
&= ~IPATH_S_ACK_PENDING
;
176 qp
->s_cur_sge
= NULL
;
179 cpu_to_be32((qp
->r_msn
& IPATH_MSN_MASK
) |
181 IPATH_AETH_CREDIT_SHIFT
));
183 ohdr
->u
.aeth
= ipath_compute_aeth(qp
);
186 bth0
= OP(ACKNOWLEDGE
) << 24;
187 bth2
= qp
->s_ack_psn
& IPATH_PSN_MASK
;
189 qp
->s_hdrwords
= hwords
;
190 qp
->s_cur_size
= len
;
200 * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
201 * @qp: a pointer to the QP
202 * @ohdr: a pointer to the IB header being constructed
203 * @pmtu: the path MTU
204 * @bth0p: pointer to the BTH opcode word
205 * @bth2p: pointer to the BTH PSN word
207 * Return 1 if constructed; otherwise, return 0.
208 * Note the QP s_lock must be held and interrupts disabled.
210 int ipath_make_rc_req(struct ipath_qp
*qp
,
211 struct ipath_other_headers
*ohdr
,
212 u32 pmtu
, u32
*bth0p
, u32
*bth2p
)
214 struct ipath_ibdev
*dev
= to_idev(qp
->ibqp
.device
);
215 struct ipath_sge_state
*ss
;
216 struct ipath_swqe
*wqe
;
223 /* Sending responses has higher priority over sending requests. */
224 if ((qp
->r_head_ack_queue
!= qp
->s_tail_ack_queue
||
225 (qp
->s_flags
& IPATH_S_ACK_PENDING
) ||
226 qp
->s_ack_state
!= OP(ACKNOWLEDGE
)) &&
227 ipath_make_rc_ack(qp
, ohdr
, pmtu
, bth0p
, bth2p
))
230 if (!(ib_ipath_state_ops
[qp
->state
] & IPATH_PROCESS_SEND_OK
) ||
231 qp
->s_rnr_timeout
|| qp
->s_wait_credit
)
234 /* Limit the number of packets sent without an ACK. */
235 if (ipath_cmp24(qp
->s_psn
, qp
->s_last_psn
+ IPATH_PSN_CREDIT
) > 0) {
236 qp
->s_wait_credit
= 1;
241 /* header size in 32-bit words LRH+BTH = (8+12)/4. */
245 /* Send a request. */
246 wqe
= get_swqe_ptr(qp
, qp
->s_cur
);
247 switch (qp
->s_state
) {
250 * Resend an old request or start a new one.
252 * We keep track of the current SWQE so that
253 * we don't reset the "furthest progress" state
254 * if we need to back up.
257 if (qp
->s_cur
== qp
->s_tail
) {
258 /* Check if send work queue is empty. */
259 if (qp
->s_tail
== qp
->s_head
)
262 * If a fence is requested, wait for previous
263 * RDMA read and atomic operations to finish.
265 if ((wqe
->wr
.send_flags
& IB_SEND_FENCE
) &&
266 qp
->s_num_rd_atomic
) {
267 qp
->s_flags
|= IPATH_S_FENCE_PENDING
;
270 wqe
->psn
= qp
->s_next_psn
;
274 * Note that we have to be careful not to modify the
275 * original work request since we may need to resend
281 switch (wqe
->wr
.opcode
) {
283 case IB_WR_SEND_WITH_IMM
:
284 /* If no credit, return. */
285 if (qp
->s_lsn
!= (u32
) -1 &&
286 ipath_cmp24(wqe
->ssn
, qp
->s_lsn
+ 1) > 0)
288 wqe
->lpsn
= wqe
->psn
;
290 wqe
->lpsn
+= (len
- 1) / pmtu
;
291 qp
->s_state
= OP(SEND_FIRST
);
295 if (wqe
->wr
.opcode
== IB_WR_SEND
)
296 qp
->s_state
= OP(SEND_ONLY
);
298 qp
->s_state
= OP(SEND_ONLY_WITH_IMMEDIATE
);
299 /* Immediate data comes after the BTH */
300 ohdr
->u
.imm_data
= wqe
->wr
.imm_data
;
303 if (wqe
->wr
.send_flags
& IB_SEND_SOLICITED
)
305 bth2
= 1 << 31; /* Request ACK. */
306 if (++qp
->s_cur
== qp
->s_size
)
310 case IB_WR_RDMA_WRITE
:
311 if (newreq
&& qp
->s_lsn
!= (u32
) -1)
314 case IB_WR_RDMA_WRITE_WITH_IMM
:
315 /* If no credit, return. */
316 if (qp
->s_lsn
!= (u32
) -1 &&
317 ipath_cmp24(wqe
->ssn
, qp
->s_lsn
+ 1) > 0)
319 ohdr
->u
.rc
.reth
.vaddr
=
320 cpu_to_be64(wqe
->wr
.wr
.rdma
.remote_addr
);
321 ohdr
->u
.rc
.reth
.rkey
=
322 cpu_to_be32(wqe
->wr
.wr
.rdma
.rkey
);
323 ohdr
->u
.rc
.reth
.length
= cpu_to_be32(len
);
324 hwords
+= sizeof(struct ib_reth
) / sizeof(u32
);
325 wqe
->lpsn
= wqe
->psn
;
327 wqe
->lpsn
+= (len
- 1) / pmtu
;
328 qp
->s_state
= OP(RDMA_WRITE_FIRST
);
332 if (wqe
->wr
.opcode
== IB_WR_RDMA_WRITE
)
333 qp
->s_state
= OP(RDMA_WRITE_ONLY
);
336 OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE
);
337 /* Immediate data comes after RETH */
338 ohdr
->u
.rc
.imm_data
= wqe
->wr
.imm_data
;
340 if (wqe
->wr
.send_flags
& IB_SEND_SOLICITED
)
343 bth2
= 1 << 31; /* Request ACK. */
344 if (++qp
->s_cur
== qp
->s_size
)
348 case IB_WR_RDMA_READ
:
350 * Don't allow more operations to be started
351 * than the QP limits allow.
354 if (qp
->s_num_rd_atomic
>=
355 qp
->s_max_rd_atomic
) {
356 qp
->s_flags
|= IPATH_S_RDMAR_PENDING
;
359 qp
->s_num_rd_atomic
++;
360 if (qp
->s_lsn
!= (u32
) -1)
363 * Adjust s_next_psn to count the
364 * expected number of responses.
367 qp
->s_next_psn
+= (len
- 1) / pmtu
;
368 wqe
->lpsn
= qp
->s_next_psn
++;
370 ohdr
->u
.rc
.reth
.vaddr
=
371 cpu_to_be64(wqe
->wr
.wr
.rdma
.remote_addr
);
372 ohdr
->u
.rc
.reth
.rkey
=
373 cpu_to_be32(wqe
->wr
.wr
.rdma
.rkey
);
374 ohdr
->u
.rc
.reth
.length
= cpu_to_be32(len
);
375 qp
->s_state
= OP(RDMA_READ_REQUEST
);
376 hwords
+= sizeof(ohdr
->u
.rc
.reth
) / sizeof(u32
);
379 if (++qp
->s_cur
== qp
->s_size
)
383 case IB_WR_ATOMIC_CMP_AND_SWP
:
384 case IB_WR_ATOMIC_FETCH_AND_ADD
:
386 * Don't allow more operations to be started
387 * than the QP limits allow.
390 if (qp
->s_num_rd_atomic
>=
391 qp
->s_max_rd_atomic
) {
392 qp
->s_flags
|= IPATH_S_RDMAR_PENDING
;
395 qp
->s_num_rd_atomic
++;
396 if (qp
->s_lsn
!= (u32
) -1)
398 wqe
->lpsn
= wqe
->psn
;
400 if (wqe
->wr
.opcode
== IB_WR_ATOMIC_CMP_AND_SWP
) {
401 qp
->s_state
= OP(COMPARE_SWAP
);
402 ohdr
->u
.atomic_eth
.swap_data
= cpu_to_be64(
403 wqe
->wr
.wr
.atomic
.swap
);
404 ohdr
->u
.atomic_eth
.compare_data
= cpu_to_be64(
405 wqe
->wr
.wr
.atomic
.compare_add
);
407 qp
->s_state
= OP(FETCH_ADD
);
408 ohdr
->u
.atomic_eth
.swap_data
= cpu_to_be64(
409 wqe
->wr
.wr
.atomic
.compare_add
);
410 ohdr
->u
.atomic_eth
.compare_data
= 0;
412 ohdr
->u
.atomic_eth
.vaddr
[0] = cpu_to_be32(
413 wqe
->wr
.wr
.atomic
.remote_addr
>> 32);
414 ohdr
->u
.atomic_eth
.vaddr
[1] = cpu_to_be32(
415 wqe
->wr
.wr
.atomic
.remote_addr
);
416 ohdr
->u
.atomic_eth
.rkey
= cpu_to_be32(
417 wqe
->wr
.wr
.atomic
.rkey
);
418 hwords
+= sizeof(struct ib_atomic_eth
) / sizeof(u32
);
421 if (++qp
->s_cur
== qp
->s_size
)
428 qp
->s_sge
.sge
= wqe
->sg_list
[0];
429 qp
->s_sge
.sg_list
= wqe
->sg_list
+ 1;
430 qp
->s_sge
.num_sge
= wqe
->wr
.num_sge
;
431 qp
->s_len
= wqe
->length
;
434 if (qp
->s_tail
>= qp
->s_size
)
437 bth2
|= qp
->s_psn
& IPATH_PSN_MASK
;
438 if (wqe
->wr
.opcode
== IB_WR_RDMA_READ
)
439 qp
->s_psn
= wqe
->lpsn
+ 1;
442 if (ipath_cmp24(qp
->s_psn
, qp
->s_next_psn
) > 0)
443 qp
->s_next_psn
= qp
->s_psn
;
446 * Put the QP on the pending list so lost ACKs will cause
447 * a retry. More than one request can be pending so the
448 * QP may already be on the dev->pending list.
450 spin_lock(&dev
->pending_lock
);
451 if (list_empty(&qp
->timerwait
))
452 list_add_tail(&qp
->timerwait
,
453 &dev
->pending
[dev
->pending_index
]);
454 spin_unlock(&dev
->pending_lock
);
457 case OP(RDMA_READ_RESPONSE_FIRST
):
459 * This case can only happen if a send is restarted.
460 * See ipath_restart_rc().
462 ipath_init_restart(qp
, wqe
);
465 qp
->s_state
= OP(SEND_MIDDLE
);
467 case OP(SEND_MIDDLE
):
468 bth2
= qp
->s_psn
++ & IPATH_PSN_MASK
;
469 if (ipath_cmp24(qp
->s_psn
, qp
->s_next_psn
) > 0)
470 qp
->s_next_psn
= qp
->s_psn
;
477 if (wqe
->wr
.opcode
== IB_WR_SEND
)
478 qp
->s_state
= OP(SEND_LAST
);
480 qp
->s_state
= OP(SEND_LAST_WITH_IMMEDIATE
);
481 /* Immediate data comes after the BTH */
482 ohdr
->u
.imm_data
= wqe
->wr
.imm_data
;
485 if (wqe
->wr
.send_flags
& IB_SEND_SOLICITED
)
487 bth2
|= 1 << 31; /* Request ACK. */
489 if (qp
->s_cur
>= qp
->s_size
)
493 case OP(RDMA_READ_RESPONSE_LAST
):
495 * This case can only happen if a RDMA write is restarted.
496 * See ipath_restart_rc().
498 ipath_init_restart(qp
, wqe
);
500 case OP(RDMA_WRITE_FIRST
):
501 qp
->s_state
= OP(RDMA_WRITE_MIDDLE
);
503 case OP(RDMA_WRITE_MIDDLE
):
504 bth2
= qp
->s_psn
++ & IPATH_PSN_MASK
;
505 if (ipath_cmp24(qp
->s_psn
, qp
->s_next_psn
) > 0)
506 qp
->s_next_psn
= qp
->s_psn
;
513 if (wqe
->wr
.opcode
== IB_WR_RDMA_WRITE
)
514 qp
->s_state
= OP(RDMA_WRITE_LAST
);
516 qp
->s_state
= OP(RDMA_WRITE_LAST_WITH_IMMEDIATE
);
517 /* Immediate data comes after the BTH */
518 ohdr
->u
.imm_data
= wqe
->wr
.imm_data
;
520 if (wqe
->wr
.send_flags
& IB_SEND_SOLICITED
)
523 bth2
|= 1 << 31; /* Request ACK. */
525 if (qp
->s_cur
>= qp
->s_size
)
529 case OP(RDMA_READ_RESPONSE_MIDDLE
):
531 * This case can only happen if a RDMA read is restarted.
532 * See ipath_restart_rc().
534 ipath_init_restart(qp
, wqe
);
535 len
= ((qp
->s_psn
- wqe
->psn
) & IPATH_PSN_MASK
) * pmtu
;
536 ohdr
->u
.rc
.reth
.vaddr
=
537 cpu_to_be64(wqe
->wr
.wr
.rdma
.remote_addr
+ len
);
538 ohdr
->u
.rc
.reth
.rkey
=
539 cpu_to_be32(wqe
->wr
.wr
.rdma
.rkey
);
540 ohdr
->u
.rc
.reth
.length
= cpu_to_be32(qp
->s_len
);
541 qp
->s_state
= OP(RDMA_READ_REQUEST
);
542 hwords
+= sizeof(ohdr
->u
.rc
.reth
) / sizeof(u32
);
543 bth2
= qp
->s_psn
++ & IPATH_PSN_MASK
;
544 if (ipath_cmp24(qp
->s_psn
, qp
->s_next_psn
) > 0)
545 qp
->s_next_psn
= qp
->s_psn
;
549 if (qp
->s_cur
== qp
->s_size
)
553 if (ipath_cmp24(qp
->s_psn
, qp
->s_last_psn
+ IPATH_PSN_CREDIT
- 1) >= 0)
554 bth2
|= 1 << 31; /* Request ACK. */
556 qp
->s_hdrwords
= hwords
;
558 qp
->s_cur_size
= len
;
559 *bth0p
= bth0
| (qp
->s_state
<< 24);
569 * send_rc_ack - Construct an ACK packet and send it
570 * @qp: a pointer to the QP
572 * This is called from ipath_rc_rcv() and only uses the receive
574 * Note that RDMA reads and atomics are handled in the
575 * send side QP state and tasklet.
577 static void send_rc_ack(struct ipath_qp
*qp
)
579 struct ipath_ibdev
*dev
= to_idev(qp
->ibqp
.device
);
583 struct ipath_ib_header hdr
;
584 struct ipath_other_headers
*ohdr
;
587 /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
588 if (qp
->r_head_ack_queue
!= qp
->s_tail_ack_queue
||
589 (qp
->s_flags
& IPATH_S_ACK_PENDING
) ||
590 qp
->s_ack_state
!= OP(ACKNOWLEDGE
))
593 /* Construct the header. */
595 lrh0
= IPATH_LRH_BTH
;
596 /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
598 if (unlikely(qp
->remote_ah_attr
.ah_flags
& IB_AH_GRH
)) {
599 hwords
+= ipath_make_grh(dev
, &hdr
.u
.l
.grh
,
600 &qp
->remote_ah_attr
.grh
,
603 lrh0
= IPATH_LRH_GRH
;
605 /* read pkey_index w/o lock (it's atomic) */
606 bth0
= ipath_get_pkey(dev
->dd
, qp
->s_pkey_index
) |
607 OP(ACKNOWLEDGE
) << 24;
609 ohdr
->u
.aeth
= cpu_to_be32((qp
->r_msn
& IPATH_MSN_MASK
) |
611 IPATH_AETH_CREDIT_SHIFT
));
613 ohdr
->u
.aeth
= ipath_compute_aeth(qp
);
614 lrh0
|= qp
->remote_ah_attr
.sl
<< 4;
615 hdr
.lrh
[0] = cpu_to_be16(lrh0
);
616 hdr
.lrh
[1] = cpu_to_be16(qp
->remote_ah_attr
.dlid
);
617 hdr
.lrh
[2] = cpu_to_be16(hwords
+ SIZE_OF_CRC
);
618 hdr
.lrh
[3] = cpu_to_be16(dev
->dd
->ipath_lid
);
619 ohdr
->bth
[0] = cpu_to_be32(bth0
);
620 ohdr
->bth
[1] = cpu_to_be32(qp
->remote_qpn
);
621 ohdr
->bth
[2] = cpu_to_be32(qp
->r_ack_psn
& IPATH_PSN_MASK
);
624 * If we can send the ACK, clear the ACK state.
626 if (ipath_verbs_send(dev
->dd
, hwords
, (u32
*) &hdr
, 0, NULL
) == 0) {
627 dev
->n_unicast_xmit
++;
632 * We are out of PIO buffers at the moment.
633 * Pass responsibility for sending the ACK to the
634 * send tasklet so that when a PIO buffer becomes
635 * available, the ACK is sent ahead of other outgoing
641 spin_lock_irqsave(&qp
->s_lock
, flags
);
642 qp
->s_flags
|= IPATH_S_ACK_PENDING
;
643 qp
->s_nak_state
= qp
->r_nak_state
;
644 qp
->s_ack_psn
= qp
->r_ack_psn
;
645 spin_unlock_irqrestore(&qp
->s_lock
, flags
);
647 /* Call ipath_do_rc_send() in another thread. */
648 tasklet_hi_schedule(&qp
->s_task
);
655 * reset_psn - reset the QP state to send starting from PSN
657 * @psn: the packet sequence number to restart at
659 * This is called from ipath_rc_rcv() to process an incoming RC ACK
661 * Called at interrupt level with the QP s_lock held.
663 static void reset_psn(struct ipath_qp
*qp
, u32 psn
)
666 struct ipath_swqe
*wqe
= get_swqe_ptr(qp
, n
);
672 * If we are starting the request from the beginning,
673 * let the normal send code handle initialization.
675 if (ipath_cmp24(psn
, wqe
->psn
) <= 0) {
676 qp
->s_state
= OP(SEND_LAST
);
680 /* Find the work request opcode corresponding to the given PSN. */
681 opcode
= wqe
->wr
.opcode
;
685 if (++n
== qp
->s_size
)
689 wqe
= get_swqe_ptr(qp
, n
);
690 diff
= ipath_cmp24(psn
, wqe
->psn
);
695 * If we are starting the request from the beginning,
696 * let the normal send code handle initialization.
699 qp
->s_state
= OP(SEND_LAST
);
702 opcode
= wqe
->wr
.opcode
;
706 * Set the state to restart in the middle of a request.
707 * Don't change the s_sge, s_cur_sge, or s_cur_size.
708 * See ipath_do_rc_send().
712 case IB_WR_SEND_WITH_IMM
:
713 qp
->s_state
= OP(RDMA_READ_RESPONSE_FIRST
);
716 case IB_WR_RDMA_WRITE
:
717 case IB_WR_RDMA_WRITE_WITH_IMM
:
718 qp
->s_state
= OP(RDMA_READ_RESPONSE_LAST
);
721 case IB_WR_RDMA_READ
:
722 qp
->s_state
= OP(RDMA_READ_RESPONSE_MIDDLE
);
727 * This case shouldn't happen since its only
730 qp
->s_state
= OP(SEND_LAST
);
737 * ipath_restart_rc - back up requester to resend the last un-ACKed request
738 * @qp: the QP to restart
739 * @psn: packet sequence number for the request
740 * @wc: the work completion request
742 * The QP s_lock should be held and interrupts disabled.
744 void ipath_restart_rc(struct ipath_qp
*qp
, u32 psn
, struct ib_wc
*wc
)
746 struct ipath_swqe
*wqe
= get_swqe_ptr(qp
, qp
->s_last
);
747 struct ipath_ibdev
*dev
;
749 if (qp
->s_retry
== 0) {
750 wc
->wr_id
= wqe
->wr
.wr_id
;
751 wc
->status
= IB_WC_RETRY_EXC_ERR
;
752 wc
->opcode
= ib_ipath_wc_opcode
[wqe
->wr
.opcode
];
756 wc
->src_qp
= qp
->remote_qpn
;
758 wc
->slid
= qp
->remote_ah_attr
.dlid
;
759 wc
->sl
= qp
->remote_ah_attr
.sl
;
760 wc
->dlid_path_bits
= 0;
762 ipath_sqerror_qp(qp
, wc
);
768 * Remove the QP from the timeout queue.
769 * Note: it may already have been removed by ipath_ib_timer().
771 dev
= to_idev(qp
->ibqp
.device
);
772 spin_lock(&dev
->pending_lock
);
773 if (!list_empty(&qp
->timerwait
))
774 list_del_init(&qp
->timerwait
);
775 spin_unlock(&dev
->pending_lock
);
777 if (wqe
->wr
.opcode
== IB_WR_RDMA_READ
)
780 dev
->n_rc_resends
+= (qp
->s_psn
- psn
) & IPATH_PSN_MASK
;
783 tasklet_hi_schedule(&qp
->s_task
);
789 static inline void update_last_psn(struct ipath_qp
*qp
, u32 psn
)
791 if (qp
->s_wait_credit
) {
792 qp
->s_wait_credit
= 0;
793 tasklet_hi_schedule(&qp
->s_task
);
795 qp
->s_last_psn
= psn
;
799 * do_rc_ack - process an incoming RC ACK
800 * @qp: the QP the ACK came in on
801 * @psn: the packet sequence number of the ACK
802 * @opcode: the opcode of the request that resulted in the ACK
804 * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
806 * Called at interrupt level with the QP s_lock held and interrupts disabled.
807 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
809 static int do_rc_ack(struct ipath_qp
*qp
, u32 aeth
, u32 psn
, int opcode
)
811 struct ipath_ibdev
*dev
= to_idev(qp
->ibqp
.device
);
813 struct ipath_swqe
*wqe
;
818 * Remove the QP from the timeout queue (or RNR timeout queue).
819 * If ipath_ib_timer() has already removed it,
820 * it's OK since we hold the QP s_lock and ipath_restart_rc()
821 * just won't find anything to restart if we ACK everything.
823 spin_lock(&dev
->pending_lock
);
824 if (!list_empty(&qp
->timerwait
))
825 list_del_init(&qp
->timerwait
);
826 spin_unlock(&dev
->pending_lock
);
829 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
830 * requests and implicitly NAK RDMA read and atomic requests issued
831 * before the NAK'ed request. The MSN won't include the NAK'ed
832 * request but will include an ACK'ed request(s).
837 wqe
= get_swqe_ptr(qp
, qp
->s_last
);
840 * The MSN might be for a later WQE than the PSN indicates so
841 * only complete WQEs that the PSN finishes.
843 while (ipath_cmp24(ack_psn
, wqe
->lpsn
) >= 0) {
845 * If this request is a RDMA read or atomic, and the ACK is
846 * for a later operation, this ACK NAKs the RDMA read or
847 * atomic. In other words, only a RDMA_READ_LAST or ONLY
848 * can ACK a RDMA read and likewise for atomic ops. Note
849 * that the NAK case can only happen if relaxed ordering is
850 * used and requests are sent after an RDMA read or atomic
851 * is sent but before the response is received.
853 if ((wqe
->wr
.opcode
== IB_WR_RDMA_READ
&&
854 (opcode
!= OP(RDMA_READ_RESPONSE_LAST
) ||
855 ipath_cmp24(ack_psn
, wqe
->lpsn
) != 0)) ||
856 ((wqe
->wr
.opcode
== IB_WR_ATOMIC_CMP_AND_SWP
||
857 wqe
->wr
.opcode
== IB_WR_ATOMIC_FETCH_AND_ADD
) &&
858 (opcode
!= OP(ATOMIC_ACKNOWLEDGE
) ||
859 ipath_cmp24(wqe
->psn
, psn
) != 0))) {
861 * The last valid PSN seen is the previous
864 update_last_psn(qp
, wqe
->psn
- 1);
865 /* Retry this request. */
866 ipath_restart_rc(qp
, wqe
->psn
, &wc
);
868 * No need to process the ACK/NAK since we are
869 * restarting an earlier request.
873 if (qp
->s_num_rd_atomic
&&
874 (wqe
->wr
.opcode
== IB_WR_RDMA_READ
||
875 wqe
->wr
.opcode
== IB_WR_ATOMIC_CMP_AND_SWP
||
876 wqe
->wr
.opcode
== IB_WR_ATOMIC_FETCH_AND_ADD
)) {
877 qp
->s_num_rd_atomic
--;
878 /* Restart sending task if fence is complete */
879 if ((qp
->s_flags
& IPATH_S_FENCE_PENDING
) &&
880 !qp
->s_num_rd_atomic
) {
881 qp
->s_flags
&= ~IPATH_S_FENCE_PENDING
;
882 tasklet_hi_schedule(&qp
->s_task
);
883 } else if (qp
->s_flags
& IPATH_S_RDMAR_PENDING
) {
884 qp
->s_flags
&= ~IPATH_S_RDMAR_PENDING
;
885 tasklet_hi_schedule(&qp
->s_task
);
888 /* Post a send completion queue entry if requested. */
889 if (!(qp
->s_flags
& IPATH_S_SIGNAL_REQ_WR
) ||
890 (wqe
->wr
.send_flags
& IB_SEND_SIGNALED
)) {
891 wc
.wr_id
= wqe
->wr
.wr_id
;
892 wc
.status
= IB_WC_SUCCESS
;
893 wc
.opcode
= ib_ipath_wc_opcode
[wqe
->wr
.opcode
];
895 wc
.byte_len
= wqe
->length
;
898 wc
.src_qp
= qp
->remote_qpn
;
901 wc
.slid
= qp
->remote_ah_attr
.dlid
;
902 wc
.sl
= qp
->remote_ah_attr
.sl
;
903 wc
.dlid_path_bits
= 0;
905 ipath_cq_enter(to_icq(qp
->ibqp
.send_cq
), &wc
, 0);
907 qp
->s_retry
= qp
->s_retry_cnt
;
909 * If we are completing a request which is in the process of
910 * being resent, we can stop resending it since we know the
911 * responder has already seen it.
913 if (qp
->s_last
== qp
->s_cur
) {
914 if (++qp
->s_cur
>= qp
->s_size
)
916 qp
->s_last
= qp
->s_cur
;
917 if (qp
->s_last
== qp
->s_tail
)
919 wqe
= get_swqe_ptr(qp
, qp
->s_cur
);
920 qp
->s_state
= OP(SEND_LAST
);
921 qp
->s_psn
= wqe
->psn
;
923 if (++qp
->s_last
>= qp
->s_size
)
925 if (qp
->s_last
== qp
->s_tail
)
927 wqe
= get_swqe_ptr(qp
, qp
->s_last
);
931 switch (aeth
>> 29) {
934 /* If this is a partial ACK, reset the retransmit timer. */
935 if (qp
->s_last
!= qp
->s_tail
) {
936 spin_lock(&dev
->pending_lock
);
937 list_add_tail(&qp
->timerwait
,
938 &dev
->pending
[dev
->pending_index
]);
939 spin_unlock(&dev
->pending_lock
);
941 * If we get a partial ACK for a resent operation,
942 * we can stop resending the earlier packets and
943 * continue with the next packet the receiver wants.
945 if (ipath_cmp24(qp
->s_psn
, psn
) <= 0) {
946 reset_psn(qp
, psn
+ 1);
947 tasklet_hi_schedule(&qp
->s_task
);
949 } else if (ipath_cmp24(qp
->s_psn
, psn
) <= 0) {
950 qp
->s_state
= OP(SEND_LAST
);
953 ipath_get_credit(qp
, aeth
);
954 qp
->s_rnr_retry
= qp
->s_rnr_retry_cnt
;
955 qp
->s_retry
= qp
->s_retry_cnt
;
956 update_last_psn(qp
, psn
);
960 case 1: /* RNR NAK */
962 if (qp
->s_last
== qp
->s_tail
)
964 if (qp
->s_rnr_retry
== 0) {
965 wc
.status
= IB_WC_RNR_RETRY_EXC_ERR
;
968 if (qp
->s_rnr_retry_cnt
< 7)
971 /* The last valid PSN is the previous PSN. */
972 update_last_psn(qp
, psn
- 1);
974 if (wqe
->wr
.opcode
== IB_WR_RDMA_READ
)
978 (qp
->s_psn
- psn
) & IPATH_PSN_MASK
;
983 ib_ipath_rnr_table
[(aeth
>> IPATH_AETH_CREDIT_SHIFT
) &
984 IPATH_AETH_CREDIT_MASK
];
985 ipath_insert_rnr_queue(qp
);
989 if (qp
->s_last
== qp
->s_tail
)
991 /* The last valid PSN is the previous PSN. */
992 update_last_psn(qp
, psn
- 1);
993 switch ((aeth
>> IPATH_AETH_CREDIT_SHIFT
) &
994 IPATH_AETH_CREDIT_MASK
) {
995 case 0: /* PSN sequence error */
998 * Back up to the responder's expected PSN.
999 * Note that we might get a NAK in the middle of an
1000 * RDMA READ response which terminates the RDMA
1003 ipath_restart_rc(qp
, psn
, &wc
);
1006 case 1: /* Invalid Request */
1007 wc
.status
= IB_WC_REM_INV_REQ_ERR
;
1008 dev
->n_other_naks
++;
1011 case 2: /* Remote Access Error */
1012 wc
.status
= IB_WC_REM_ACCESS_ERR
;
1013 dev
->n_other_naks
++;
1016 case 3: /* Remote Operation Error */
1017 wc
.status
= IB_WC_REM_OP_ERR
;
1018 dev
->n_other_naks
++;
1020 wc
.wr_id
= wqe
->wr
.wr_id
;
1021 wc
.opcode
= ib_ipath_wc_opcode
[wqe
->wr
.opcode
];
1025 wc
.src_qp
= qp
->remote_qpn
;
1027 wc
.slid
= qp
->remote_ah_attr
.dlid
;
1028 wc
.sl
= qp
->remote_ah_attr
.sl
;
1029 wc
.dlid_path_bits
= 0;
1031 ipath_sqerror_qp(qp
, &wc
);
1035 /* Ignore other reserved NAK error codes */
1038 qp
->s_rnr_retry
= qp
->s_rnr_retry_cnt
;
1041 default: /* 2: reserved */
1043 /* Ignore reserved NAK codes. */
1052 * ipath_rc_rcv_resp - process an incoming RC response packet
1053 * @dev: the device this packet came in on
1054 * @ohdr: the other headers for this packet
1055 * @data: the packet data
1056 * @tlen: the packet length
1057 * @qp: the QP for this packet
1058 * @opcode: the opcode for this packet
1059 * @psn: the packet sequence number for this packet
1060 * @hdrsize: the header length
1061 * @pmtu: the path MTU
1062 * @header_in_data: true if part of the header data is in the data buffer
1064 * This is called from ipath_rc_rcv() to process an incoming RC response
1065 * packet for the given QP.
1066 * Called at interrupt level.
1068 static inline void ipath_rc_rcv_resp(struct ipath_ibdev
*dev
,
1069 struct ipath_other_headers
*ohdr
,
1070 void *data
, u32 tlen
,
1071 struct ipath_qp
*qp
,
1073 u32 psn
, u32 hdrsize
, u32 pmtu
,
1076 struct ipath_swqe
*wqe
;
1077 unsigned long flags
;
1083 spin_lock_irqsave(&qp
->s_lock
, flags
);
1085 /* Ignore invalid responses. */
1086 if (ipath_cmp24(psn
, qp
->s_next_psn
) >= 0)
1089 /* Ignore duplicate responses. */
1090 diff
= ipath_cmp24(psn
, qp
->s_last_psn
);
1091 if (unlikely(diff
<= 0)) {
1092 /* Update credits for "ghost" ACKs */
1093 if (diff
== 0 && opcode
== OP(ACKNOWLEDGE
)) {
1094 if (!header_in_data
)
1095 aeth
= be32_to_cpu(ohdr
->u
.aeth
);
1097 aeth
= be32_to_cpu(((__be32
*) data
)[0]);
1098 data
+= sizeof(__be32
);
1100 if ((aeth
>> 29) == 0)
1101 ipath_get_credit(qp
, aeth
);
1106 if (unlikely(qp
->s_last
== qp
->s_tail
))
1108 wqe
= get_swqe_ptr(qp
, qp
->s_last
);
1111 case OP(ACKNOWLEDGE
):
1112 case OP(ATOMIC_ACKNOWLEDGE
):
1113 case OP(RDMA_READ_RESPONSE_FIRST
):
1114 if (!header_in_data
)
1115 aeth
= be32_to_cpu(ohdr
->u
.aeth
);
1117 aeth
= be32_to_cpu(((__be32
*) data
)[0]);
1118 data
+= sizeof(__be32
);
1120 if (opcode
== OP(ATOMIC_ACKNOWLEDGE
)) {
1123 if (!header_in_data
) {
1124 __be32
*p
= ohdr
->u
.at
.atomic_ack_eth
;
1126 val
= ((u64
) be32_to_cpu(p
[0]) << 32) |
1129 val
= be64_to_cpu(((__be64
*) data
)[0]);
1130 *(u64
*) wqe
->sg_list
[0].vaddr
= val
;
1132 if (!do_rc_ack(qp
, aeth
, psn
, opcode
) ||
1133 opcode
!= OP(RDMA_READ_RESPONSE_FIRST
))
1136 if (unlikely(wqe
->wr
.opcode
!= IB_WR_RDMA_READ
))
1139 * If this is a response to a resent RDMA read, we
1140 * have to be careful to copy the data to the right
1143 qp
->s_rdma_read_len
= restart_sge(&qp
->s_rdma_read_sge
,
1147 case OP(RDMA_READ_RESPONSE_MIDDLE
):
1148 /* no AETH, no ACK */
1149 if (unlikely(ipath_cmp24(psn
, qp
->s_last_psn
+ 1))) {
1151 ipath_restart_rc(qp
, qp
->s_last_psn
+ 1, &wc
);
1154 if (unlikely(wqe
->wr
.opcode
!= IB_WR_RDMA_READ
))
1157 if (unlikely(tlen
!= (hdrsize
+ pmtu
+ 4)))
1159 if (unlikely(pmtu
>= qp
->s_rdma_read_len
))
1162 /* We got a response so update the timeout. */
1163 spin_lock(&dev
->pending_lock
);
1164 if (qp
->s_rnr_timeout
== 0 && !list_empty(&qp
->timerwait
))
1165 list_move_tail(&qp
->timerwait
,
1166 &dev
->pending
[dev
->pending_index
]);
1167 spin_unlock(&dev
->pending_lock
);
1169 * Update the RDMA receive state but do the copy w/o
1170 * holding the locks and blocking interrupts.
1172 qp
->s_rdma_read_len
-= pmtu
;
1173 update_last_psn(qp
, psn
);
1174 spin_unlock_irqrestore(&qp
->s_lock
, flags
);
1175 ipath_copy_sge(&qp
->s_rdma_read_sge
, data
, pmtu
);
1178 case OP(RDMA_READ_RESPONSE_ONLY
):
1179 if (unlikely(ipath_cmp24(psn
, qp
->s_last_psn
+ 1))) {
1181 ipath_restart_rc(qp
, qp
->s_last_psn
+ 1, &wc
);
1184 if (unlikely(wqe
->wr
.opcode
!= IB_WR_RDMA_READ
))
1186 /* Get the number of bytes the message was padded by. */
1187 pad
= (be32_to_cpu(ohdr
->bth
[0]) >> 20) & 3;
1189 * Check that the data size is >= 0 && <= pmtu.
1190 * Remember to account for the AETH header (4) and
1193 if (unlikely(tlen
< (hdrsize
+ pad
+ 8)))
1196 * If this is a response to a resent RDMA read, we
1197 * have to be careful to copy the data to the right
1200 qp
->s_rdma_read_len
= restart_sge(&qp
->s_rdma_read_sge
,
1204 case OP(RDMA_READ_RESPONSE_LAST
):
1205 /* ACKs READ req. */
1206 if (unlikely(ipath_cmp24(psn
, qp
->s_last_psn
+ 1))) {
1208 ipath_restart_rc(qp
, qp
->s_last_psn
+ 1, &wc
);
1211 if (unlikely(wqe
->wr
.opcode
!= IB_WR_RDMA_READ
))
1213 /* Get the number of bytes the message was padded by. */
1214 pad
= (be32_to_cpu(ohdr
->bth
[0]) >> 20) & 3;
1216 * Check that the data size is >= 1 && <= pmtu.
1217 * Remember to account for the AETH header (4) and
1220 if (unlikely(tlen
<= (hdrsize
+ pad
+ 8)))
1223 tlen
-= hdrsize
+ pad
+ 8;
1224 if (unlikely(tlen
!= qp
->s_rdma_read_len
))
1226 if (!header_in_data
)
1227 aeth
= be32_to_cpu(ohdr
->u
.aeth
);
1229 aeth
= be32_to_cpu(((__be32
*) data
)[0]);
1230 data
+= sizeof(__be32
);
1232 ipath_copy_sge(&qp
->s_rdma_read_sge
, data
, tlen
);
1233 (void) do_rc_ack(qp
, aeth
, psn
, OP(RDMA_READ_RESPONSE_LAST
));
1238 spin_unlock_irqrestore(&qp
->s_lock
, flags
);
1242 wc
.status
= IB_WC_LOC_QP_OP_ERR
;
1246 wc
.status
= IB_WC_LOC_LEN_ERR
;
1248 wc
.wr_id
= wqe
->wr
.wr_id
;
1249 wc
.opcode
= ib_ipath_wc_opcode
[wqe
->wr
.opcode
];
1254 wc
.src_qp
= qp
->remote_qpn
;
1257 wc
.slid
= qp
->remote_ah_attr
.dlid
;
1258 wc
.sl
= qp
->remote_ah_attr
.sl
;
1259 wc
.dlid_path_bits
= 0;
1261 ipath_sqerror_qp(qp
, &wc
);
1262 spin_unlock_irqrestore(&qp
->s_lock
, flags
);
1268 * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
1269 * @dev: the device this packet came in on
1270 * @ohdr: the other headers for this packet
1271 * @data: the packet data
1272 * @qp: the QP for this packet
1273 * @opcode: the opcode for this packet
1274 * @psn: the packet sequence number for this packet
1275 * @diff: the difference between the PSN and the expected PSN
1276 * @header_in_data: true if part of the header data is in the data buffer
1278 * This is called from ipath_rc_rcv() to process an unexpected
1279 * incoming RC packet for the given QP.
1280 * Called at interrupt level.
1281 * Return 1 if no more processing is needed; otherwise return 0 to
1282 * schedule a response to be sent.
/*
 * ipath_rc_rcv_error: handle an RC packet whose PSN is not the one the
 * receiver expected (either a sequence error or a duplicate request).
 *
 * Returns 1 when no further processing is needed and 0 when a response
 * should be scheduled.
 *
 * NOTE(review): this listing is a partial extract.  The embedded source
 * line numbers skip, so several parameters, local declarations, braces
 * and the goto/return statements that connect the fragments below are
 * not visible here.  Consult the full file before relying on control
 * flow inferred from this listing.
 */
1284 static inline int ipath_rc_rcv_error(struct ipath_ibdev
*dev
,
1285 struct ipath_other_headers
*ohdr
,
1287 struct ipath_qp
*qp
,
1293 struct ipath_ack_entry
*e
;
1296 unsigned long flags
;
1300 * Packet sequence error.
1301 * A NAK will ACK earlier sends and RDMA writes.
1302 * Don't queue the NAK if we already sent one.
/*
 * r_nak_state doubles as the "NAK already recorded" flag: only when it
 * is zero is the PSN-error NAK code stored, with r_ack_psn set to the
 * expected PSN (qp->r_psn).
 */
1304 if (!qp
->r_nak_state
) {
1305 qp
->r_nak_state
= IB_NAK_PSN_ERROR
;
1306 /* Use the expected PSN. */
1307 qp
->r_ack_psn
= qp
->r_psn
;
1314 * Handle a duplicate request. Don't re-execute SEND, RDMA
1315 * write or atomic op. Don't NAK errors, just silently drop
1316 * the duplicate request. Note that r_sge, r_len, and
1317 * r_rcv_len may be in use so don't modify them.
1319 * We are supposed to ACK the earliest duplicate PSN but we
1320 * can coalesce an outstanding duplicate ACK. We have to
1321 * send the earliest so that RDMA reads can be restarted at
1322 * the requester's expected PSN.
1324 * First, find where this duplicate PSN falls within the
1325 * ACKs previously sent.
/* Keep only the PSN bits selected by IPATH_PSN_MASK. */
1327 psn
&= IPATH_PSN_MASK
;
/* The ack queue scan and the s_ack_state/s_tail_ack_queue updates run under qp->s_lock. */
1330 spin_lock_irqsave(&qp
->s_lock
, flags
);
/*
 * Scan ack queue indices starting at r_head_ack_queue; each pass
 * continues from 'prev'.  The lines that compute 'prev' and terminate
 * the loop are only partly visible in this extract.
 */
1331 for (i
= qp
->r_head_ack_queue
; ; i
= prev
) {
1332 if (i
== qp
->s_tail_ack_queue
)
1337 prev
= IPATH_MAX_RDMA_ATOMIC
;
1338 if (prev
== qp
->r_head_ack_queue
) {
1342 e
= &qp
->s_ack_queue
[prev
];
/* 24-bit PSN comparison of the duplicate against this entry's PSN. */
1347 if (ipath_cmp24(psn
, e
->psn
) >= 0)
1351 case OP(RDMA_READ_REQUEST
): {
1352 struct ib_reth
*reth
;
1357 * If we didn't find the RDMA read request in the ack queue,
1358 * or the send tasklet is already backed up to send an
1359 * earlier entry, we can ignore this request.
1361 if (!e
|| e
->opcode
!= OP(RDMA_READ_REQUEST
) || old_req
)
1363 /* RETH comes after BTH */
1364 if (!header_in_data
)
1365 reth
= &ohdr
->u
.rc
.reth
;
1367 reth
= (struct ib_reth
*)data
;
1368 data
+= sizeof(*reth
);
1371 * Address range must be a subset of the original
1372 * request and start on pmtu boundaries.
1373 * We reuse the old ack_queue slot since the requester
1374 * should not back up and request an earlier PSN for the
/*
 * Byte offset of the restarted read: the masked PSN distance from the
 * entry's PSN multiplied by the path MTU in bytes.
 */
1377 offset
= ((psn
- e
->psn
) & IPATH_PSN_MASK
) *
1378 ib_mtu_enum_to_int(qp
->path_mtu
);
1379 len
= be32_to_cpu(reth
->length
);
/*
 * Range check against e->rdma_sge.sge.sge_length; the action taken on
 * failure is not visible in this extract.
 */
1380 if (unlikely(offset
+ len
> e
->rdma_sge
.sge
.sge_length
))
1383 u32 rkey
= be32_to_cpu(reth
->rkey
);
1384 u64 vaddr
= be64_to_cpu(reth
->vaddr
);
1387 ok
= ipath_rkey_ok(qp
, &e
->rdma_sge
,
1389 IB_ACCESS_REMOTE_READ
);
/* Clear the entry's SGE state (the guarding condition is not visible here). */
1393 e
->rdma_sge
.sg_list
= NULL
;
1394 e
->rdma_sge
.num_sge
= 0;
1395 e
->rdma_sge
.sge
.mr
= NULL
;
1396 e
->rdma_sge
.sge
.vaddr
= NULL
;
1397 e
->rdma_sge
.sge
.length
= 0;
1398 e
->rdma_sge
.sge
.sge_length
= 0;
/* Set s_ack_state to ACKNOWLEDGE and s_tail_ack_queue to the matching entry (prev). */
1401 qp
->s_ack_state
= OP(ACKNOWLEDGE
);
1402 qp
->s_tail_ack_queue
= prev
;
1406 case OP(COMPARE_SWAP
):
1407 case OP(FETCH_ADD
): {
1409 * If we didn't find the atomic request in the ack queue
1410 * or the send tasklet is already backed up to send an
1411 * earlier entry, we can ignore this request.
1413 if (!e
|| e
->opcode
!= (u8
) opcode
|| old_req
)
1415 qp
->s_ack_state
= OP(ACKNOWLEDGE
);
1416 qp
->s_tail_ack_queue
= prev
;
1424 * Resend the most recent ACK if this request is
1425 * after all the previous RDMA reads and atomics.
/*
 * i == r_head_ack_queue: the lock is dropped, the NAK state is cleared
 * and r_ack_psn is set to the PSN just before the expected one.
 */
1427 if (i
== qp
->r_head_ack_queue
) {
1428 spin_unlock_irqrestore(&qp
->s_lock
, flags
);
1429 qp
->r_nak_state
= 0;
1430 qp
->r_ack_psn
= qp
->r_psn
- 1;
1434 * Resend the RDMA read or atomic op which
1435 * ACKs this duplicate request.
1437 qp
->s_ack_state
= OP(ACKNOWLEDGE
);
1438 qp
->s_tail_ack_queue
= i
;
/* Clear any pending NAK code and schedule the QP's send tasklet (qp->s_task). */
1441 qp
->r_nak_state
= 0;
1442 tasklet_hi_schedule(&qp
->s_task
);
1445 spin_unlock_irqrestore(&qp
->s_lock
, flags
);
1453 static void ipath_rc_error(struct ipath_qp
*qp
, enum ib_wc_status err
)
1455 unsigned long flags
;
1457 spin_lock_irqsave(&qp
->s_lock
, flags
);
1458 qp
->state
= IB_QPS_ERR
;
1459 ipath_error_qp(qp
, err
);
1460 spin_unlock_irqrestore(&qp
->s_lock
, flags
);
/**
 * ipath_rc_rcv - process an incoming RC packet
 * @dev: the device this packet came in on
 * @hdr: the header of this packet
 * @has_grh: true if the header has a GRH
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 *
 * This is called from ipath_qp_rcv() to process an incoming RC packet.
 * Called at interrupt level.
 */
1476 void ipath_rc_rcv(struct ipath_ibdev
*dev
, struct ipath_ib_header
*hdr
,
1477 int has_grh
, void *data
, u32 tlen
, struct ipath_qp
*qp
)
1479 struct ipath_other_headers
*ohdr
;
1485 u32 pmtu
= ib_mtu_enum_to_int(qp
->path_mtu
);
1487 struct ib_reth
*reth
;
1490 /* Validate the SLID. See Ch. 9.6.1.5 */
1491 if (unlikely(be16_to_cpu(hdr
->lrh
[3]) != qp
->remote_ah_attr
.dlid
))
1497 hdrsize
= 8 + 12; /* LRH + BTH */
1498 psn
= be32_to_cpu(ohdr
->bth
[2]);
1501 ohdr
= &hdr
->u
.l
.oth
;
1502 hdrsize
= 8 + 40 + 12; /* LRH + GRH + BTH */
1504 * The header with GRH is 60 bytes and the core driver sets
1505 * the eager header buffer size to 56 bytes so the last 4
1506 * bytes of the BTH header (PSN) is in the data buffer.
1508 header_in_data
= dev
->dd
->ipath_rcvhdrentsize
== 16;
1509 if (header_in_data
) {
1510 psn
= be32_to_cpu(((__be32
*) data
)[0]);
1511 data
+= sizeof(__be32
);
1513 psn
= be32_to_cpu(ohdr
->bth
[2]);
1517 * Process responses (ACKs) before anything else. Note that the
1518 * packet sequence number will be for something in the send work
1519 * queue rather than the expected receive packet sequence number.
1520 * In other words, this QP is the requester.
1522 opcode
= be32_to_cpu(ohdr
->bth
[0]) >> 24;
1523 if (opcode
>= OP(RDMA_READ_RESPONSE_FIRST
) &&
1524 opcode
<= OP(ATOMIC_ACKNOWLEDGE
)) {
1525 ipath_rc_rcv_resp(dev
, ohdr
, data
, tlen
, qp
, opcode
, psn
,
1526 hdrsize
, pmtu
, header_in_data
);
1530 /* Compute 24 bits worth of difference. */
1531 diff
= ipath_cmp24(psn
, qp
->r_psn
);
1532 if (unlikely(diff
)) {
1533 if (ipath_rc_rcv_error(dev
, ohdr
, data
, qp
, opcode
,
1534 psn
, diff
, header_in_data
))
1539 /* Check for opcode sequence errors. */
1540 switch (qp
->r_state
) {
1541 case OP(SEND_FIRST
):
1542 case OP(SEND_MIDDLE
):
1543 if (opcode
== OP(SEND_MIDDLE
) ||
1544 opcode
== OP(SEND_LAST
) ||
1545 opcode
== OP(SEND_LAST_WITH_IMMEDIATE
))
1548 ipath_rc_error(qp
, IB_WC_REM_INV_REQ_ERR
);
1549 qp
->r_nak_state
= IB_NAK_INVALID_REQUEST
;
1550 qp
->r_ack_psn
= qp
->r_psn
;
1553 case OP(RDMA_WRITE_FIRST
):
1554 case OP(RDMA_WRITE_MIDDLE
):
1555 if (opcode
== OP(RDMA_WRITE_MIDDLE
) ||
1556 opcode
== OP(RDMA_WRITE_LAST
) ||
1557 opcode
== OP(RDMA_WRITE_LAST_WITH_IMMEDIATE
))
1562 if (opcode
== OP(SEND_MIDDLE
) ||
1563 opcode
== OP(SEND_LAST
) ||
1564 opcode
== OP(SEND_LAST_WITH_IMMEDIATE
) ||
1565 opcode
== OP(RDMA_WRITE_MIDDLE
) ||
1566 opcode
== OP(RDMA_WRITE_LAST
) ||
1567 opcode
== OP(RDMA_WRITE_LAST_WITH_IMMEDIATE
))
1570 * Note that it is up to the requester to not send a new
1571 * RDMA read or atomic operation before receiving an ACK
1572 * for the previous operation.
1580 /* OK, process the packet. */
1582 case OP(SEND_FIRST
):
1583 if (!ipath_get_rwqe(qp
, 0)) {
1586 * A RNR NAK will ACK earlier sends and RDMA writes.
1587 * Don't queue the NAK if a RDMA read or atomic
1588 * is pending though.
1590 if (qp
->r_nak_state
)
1592 qp
->r_nak_state
= IB_RNR_NAK
| qp
->r_min_rnr_timer
;
1593 qp
->r_ack_psn
= qp
->r_psn
;
1598 case OP(SEND_MIDDLE
):
1599 case OP(RDMA_WRITE_MIDDLE
):
1601 /* Check for invalid length PMTU or posted rwqe len. */
1602 if (unlikely(tlen
!= (hdrsize
+ pmtu
+ 4)))
1604 qp
->r_rcv_len
+= pmtu
;
1605 if (unlikely(qp
->r_rcv_len
> qp
->r_len
))
1607 ipath_copy_sge(&qp
->r_sge
, data
, pmtu
);
1610 case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE
):
1612 if (!ipath_get_rwqe(qp
, 1))
1617 case OP(SEND_ONLY_WITH_IMMEDIATE
):
1618 if (!ipath_get_rwqe(qp
, 0))
1621 if (opcode
== OP(SEND_ONLY
))
1624 case OP(SEND_LAST_WITH_IMMEDIATE
):
1626 if (header_in_data
) {
1627 wc
.imm_data
= *(__be32
*) data
;
1628 data
+= sizeof(__be32
);
1630 /* Immediate data comes after BTH */
1631 wc
.imm_data
= ohdr
->u
.imm_data
;
1634 wc
.wc_flags
= IB_WC_WITH_IMM
;
1637 case OP(RDMA_WRITE_LAST
):
1639 /* Get the number of bytes the message was padded by. */
1640 pad
= (be32_to_cpu(ohdr
->bth
[0]) >> 20) & 3;
1641 /* Check for invalid length. */
1642 /* XXX LAST len should be >= 1 */
1643 if (unlikely(tlen
< (hdrsize
+ pad
+ 4)))
1645 /* Don't count the CRC. */
1646 tlen
-= (hdrsize
+ pad
+ 4);
1647 wc
.byte_len
= tlen
+ qp
->r_rcv_len
;
1648 if (unlikely(wc
.byte_len
> qp
->r_len
))
1650 ipath_copy_sge(&qp
->r_sge
, data
, tlen
);
1652 if (!qp
->r_wrid_valid
)
1654 qp
->r_wrid_valid
= 0;
1655 wc
.wr_id
= qp
->r_wr_id
;
1656 wc
.status
= IB_WC_SUCCESS
;
1657 wc
.opcode
= IB_WC_RECV
;
1660 wc
.src_qp
= qp
->remote_qpn
;
1662 wc
.slid
= qp
->remote_ah_attr
.dlid
;
1663 wc
.sl
= qp
->remote_ah_attr
.sl
;
1664 wc
.dlid_path_bits
= 0;
1666 /* Signal completion event if the solicited bit is set. */
1667 ipath_cq_enter(to_icq(qp
->ibqp
.recv_cq
), &wc
,
1669 __constant_cpu_to_be32(1 << 23)) != 0);
1672 case OP(RDMA_WRITE_FIRST
):
1673 case OP(RDMA_WRITE_ONLY
):
1674 case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE
):
1676 /* RETH comes after BTH */
1677 if (!header_in_data
)
1678 reth
= &ohdr
->u
.rc
.reth
;
1680 reth
= (struct ib_reth
*)data
;
1681 data
+= sizeof(*reth
);
1683 hdrsize
+= sizeof(*reth
);
1684 qp
->r_len
= be32_to_cpu(reth
->length
);
1686 if (qp
->r_len
!= 0) {
1687 u32 rkey
= be32_to_cpu(reth
->rkey
);
1688 u64 vaddr
= be64_to_cpu(reth
->vaddr
);
1691 /* Check rkey & NAK */
1692 ok
= ipath_rkey_ok(qp
, &qp
->r_sge
,
1693 qp
->r_len
, vaddr
, rkey
,
1694 IB_ACCESS_REMOTE_WRITE
);
1698 qp
->r_sge
.sg_list
= NULL
;
1699 qp
->r_sge
.sge
.mr
= NULL
;
1700 qp
->r_sge
.sge
.vaddr
= NULL
;
1701 qp
->r_sge
.sge
.length
= 0;
1702 qp
->r_sge
.sge
.sge_length
= 0;
1704 if (unlikely(!(qp
->qp_access_flags
&
1705 IB_ACCESS_REMOTE_WRITE
)))
1707 if (opcode
== OP(RDMA_WRITE_FIRST
))
1709 else if (opcode
== OP(RDMA_WRITE_ONLY
))
1711 if (!ipath_get_rwqe(qp
, 1))
1715 case OP(RDMA_READ_REQUEST
): {
1716 struct ipath_ack_entry
*e
;
1720 if (unlikely(!(qp
->qp_access_flags
& IB_ACCESS_REMOTE_READ
)))
1722 next
= qp
->r_head_ack_queue
+ 1;
1723 if (next
> IPATH_MAX_RDMA_ATOMIC
)
1725 if (unlikely(next
== qp
->s_tail_ack_queue
))
1727 e
= &qp
->s_ack_queue
[qp
->r_head_ack_queue
];
1728 /* RETH comes after BTH */
1729 if (!header_in_data
)
1730 reth
= &ohdr
->u
.rc
.reth
;
1732 reth
= (struct ib_reth
*)data
;
1733 data
+= sizeof(*reth
);
1735 len
= be32_to_cpu(reth
->length
);
1737 u32 rkey
= be32_to_cpu(reth
->rkey
);
1738 u64 vaddr
= be64_to_cpu(reth
->vaddr
);
1741 /* Check rkey & NAK */
1742 ok
= ipath_rkey_ok(qp
, &e
->rdma_sge
, len
, vaddr
,
1743 rkey
, IB_ACCESS_REMOTE_READ
);
1747 * Update the next expected PSN. We add 1 later
1748 * below, so only add the remainder here.
1751 qp
->r_psn
+= (len
- 1) / pmtu
;
1753 e
->rdma_sge
.sg_list
= NULL
;
1754 e
->rdma_sge
.num_sge
= 0;
1755 e
->rdma_sge
.sge
.mr
= NULL
;
1756 e
->rdma_sge
.sge
.vaddr
= NULL
;
1757 e
->rdma_sge
.sge
.length
= 0;
1758 e
->rdma_sge
.sge
.sge_length
= 0;
1763 * We need to increment the MSN here instead of when we
1764 * finish sending the result since a duplicate request would
1765 * increment it more than once.
1769 qp
->r_state
= opcode
;
1770 qp
->r_nak_state
= 0;
1772 qp
->r_head_ack_queue
= next
;
1774 /* Call ipath_do_rc_send() in another thread. */
1775 tasklet_hi_schedule(&qp
->s_task
);
1780 case OP(COMPARE_SWAP
):
1781 case OP(FETCH_ADD
): {
1782 struct ib_atomic_eth
*ateth
;
1783 struct ipath_ack_entry
*e
;
1790 if (unlikely(!(qp
->qp_access_flags
&
1791 IB_ACCESS_REMOTE_ATOMIC
)))
1793 next
= qp
->r_head_ack_queue
+ 1;
1794 if (next
> IPATH_MAX_RDMA_ATOMIC
)
1796 if (unlikely(next
== qp
->s_tail_ack_queue
))
1798 if (!header_in_data
)
1799 ateth
= &ohdr
->u
.atomic_eth
;
1801 ateth
= (struct ib_atomic_eth
*)data
;
1802 vaddr
= ((u64
) be32_to_cpu(ateth
->vaddr
[0]) << 32) |
1803 be32_to_cpu(ateth
->vaddr
[1]);
1804 if (unlikely(vaddr
& (sizeof(u64
) - 1)))
1806 rkey
= be32_to_cpu(ateth
->rkey
);
1807 /* Check rkey & NAK */
1808 if (unlikely(!ipath_rkey_ok(qp
, &qp
->r_sge
,
1809 sizeof(u64
), vaddr
, rkey
,
1810 IB_ACCESS_REMOTE_ATOMIC
)))
1812 /* Perform atomic OP and save result. */
1813 maddr
= (atomic64_t
*) qp
->r_sge
.sge
.vaddr
;
1814 sdata
= be64_to_cpu(ateth
->swap_data
);
1815 e
= &qp
->s_ack_queue
[qp
->r_head_ack_queue
];
1816 e
->atomic_data
= (opcode
== OP(FETCH_ADD
)) ?
1817 (u64
) atomic64_add_return(sdata
, maddr
) - sdata
:
1818 (u64
) cmpxchg((u64
*) qp
->r_sge
.sge
.vaddr
,
1819 be64_to_cpu(ateth
->compare_data
),
1822 e
->psn
= psn
& IPATH_PSN_MASK
;
1825 qp
->r_state
= opcode
;
1826 qp
->r_nak_state
= 0;
1828 qp
->r_head_ack_queue
= next
;
1830 /* Call ipath_do_rc_send() in another thread. */
1831 tasklet_hi_schedule(&qp
->s_task
);
1837 /* NAK unknown opcodes. */
1841 qp
->r_state
= opcode
;
1842 qp
->r_ack_psn
= psn
;
1843 qp
->r_nak_state
= 0;
1844 /* Send an ACK if requested or required. */
1845 if (psn
& (1 << 31))
1850 ipath_rc_error(qp
, IB_WC_REM_ACCESS_ERR
);
1851 qp
->r_nak_state
= IB_NAK_REMOTE_ACCESS_ERROR
;
1852 qp
->r_ack_psn
= qp
->r_psn
;