/*
 * Copyright (c) 2006 QLogic, Inc. All rights reserved.
 * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "ipath_verbs.h"
#include "ipath_kernel.h"

/* cut down ridiculously long IB macro names */
#define OP(x) IB_OPCODE_RC_##x
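
/*
 * For example, OP(SEND_FIRST) expands to IB_OPCODE_RC_SEND_FIRST,
 * the opcode value carried in the top byte of BTH word 0.
 */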

static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe,
		       u32 psn, u32 pmtu)
{
	u32 len;

	len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
	ss->sge = wqe->sg_list[0];
	ss->sg_list = wqe->sg_list + 1;
	ss->num_sge = wqe->wr.num_sge;
	ipath_skip_sge(ss, len);
	return wqe->length - len;
}
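
/*
 * Example: with a 2048-byte path MTU, restarting at psn == wqe->psn + 3
 * means three packets were already sent, so restart_sge() skips
 * 3 * 2048 bytes of the SGE state and returns wqe->length - 6144 as
 * the number of bytes still to (re)send.
 */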

/**
 * ipath_init_restart - initialize the qp->s_sge after a restart
 * @qp: the QP whose SGE we're restarting
 * @wqe: the work queue entry to initialize the QP's SGE from
 *
 * The QP s_lock should be held and interrupts disabled.
 */
static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
{
	struct ipath_ibdev *dev;

	qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn,
				ib_mtu_enum_to_int(qp->path_mtu));
	dev = to_idev(qp->ibqp.device);
	spin_lock(&dev->pending_lock);
	if (list_empty(&qp->timerwait))
		list_add_tail(&qp->timerwait,
			      &dev->pending[dev->pending_index]);
	spin_unlock(&dev->pending_lock);
}

/**
 * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @pmtu: the path MTU
 * @bth0p: pointer to the BTH opcode word
 * @bth2p: pointer to the BTH PSN word
 *
 * Return 1 if constructed; otherwise, return 0.
 * Note that we are in the responder's side of the QP context.
 * Note the QP s_lock must be held.
 */
static int ipath_make_rc_ack(struct ipath_qp *qp,
			     struct ipath_other_headers *ohdr,
			     u32 pmtu, u32 *bth0p, u32 *bth2p)
{
	struct ipath_ack_entry *e;
	u32 hwords;
	u32 len;
	u32 bth0;
	u32 bth2;

	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	hwords = 5;

	switch (qp->s_ack_state) {
	case OP(RDMA_READ_RESPONSE_LAST):
	case OP(RDMA_READ_RESPONSE_ONLY):
	case OP(ATOMIC_ACKNOWLEDGE):
		qp->s_ack_state = OP(ACKNOWLEDGE);
		/* FALLTHROUGH */
	case OP(ACKNOWLEDGE):
		/* Check for no next entry in the queue. */
		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
			if (qp->s_flags & IPATH_S_ACK_PENDING)
				goto normal;
			goto bail;
		}

		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->opcode == OP(RDMA_READ_REQUEST)) {
			/* Copy SGE state in case we need to resend */
			qp->s_ack_rdma_sge = e->rdma_sge;
			qp->s_cur_sge = &qp->s_ack_rdma_sge;
			len = e->rdma_sge.sge.sge_length;
			if (len > pmtu)
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
			else {
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
				if (++qp->s_tail_ack_queue >
				    IPATH_MAX_RDMA_ATOMIC)
					qp->s_tail_ack_queue = 0;
			}
			ohdr->u.aeth = ipath_compute_aeth(qp);
			hwords++;
			qp->s_ack_rdma_psn = e->psn;
			bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
		} else {
			/* COMPARE_SWAP or FETCH_ADD */
			qp->s_cur_sge = NULL;
			len = 0;
			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
			ohdr->u.at.aeth = ipath_compute_aeth(qp);
			ohdr->u.at.atomic_ack_eth[0] =
				cpu_to_be32(e->atomic_data >> 32);
			ohdr->u.at.atomic_ack_eth[1] =
				cpu_to_be32(e->atomic_data);
			hwords += sizeof(ohdr->u.at) / sizeof(u32);
			bth2 = e->psn;
			if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
				qp->s_tail_ack_queue = 0;
		}
		bth0 = qp->s_ack_state << 24;
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_READ_RESPONSE_MIDDLE):
		len = qp->s_ack_rdma_sge.sge.sge_length;
		if (len > pmtu)
			len = pmtu;
		else {
			ohdr->u.aeth = ipath_compute_aeth(qp);
			hwords++;
			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
			if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
				qp->s_tail_ack_queue = 0;
		}
		bth0 = qp->s_ack_state << 24;
		bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
		break;

	default:
	normal:
		/*
		 * Send a regular ACK.
		 * Set the s_ack_state so we wait until after sending
		 * the ACK before setting s_ack_state to ACKNOWLEDGE
		 * (see above).
		 */
		qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
		qp->s_flags &= ~IPATH_S_ACK_PENDING;
		qp->s_cur_sge = NULL;
		if (qp->s_nak_state)
			ohdr->u.aeth =
				cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
					    (qp->s_nak_state <<
					     IPATH_AETH_CREDIT_SHIFT));
		else
			ohdr->u.aeth = ipath_compute_aeth(qp);
		hwords++;
		len = 0;
		bth0 = OP(ACKNOWLEDGE) << 24;
		bth2 = qp->s_ack_psn & IPATH_PSN_MASK;
	}
	qp->s_hdrwords = hwords;
	qp->s_cur_size = len;
	*bth0p = bth0;
	*bth2p = bth2;
	return 1;

bail:
	return 0;
}

/**
 * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @pmtu: the path MTU
 * @bth0p: pointer to the BTH opcode word
 * @bth2p: pointer to the BTH PSN word
 *
 * Return 1 if constructed; otherwise, return 0.
 * Note the QP s_lock must be held and interrupts disabled.
 */
int ipath_make_rc_req(struct ipath_qp *qp,
		      struct ipath_other_headers *ohdr,
		      u32 pmtu, u32 *bth0p, u32 *bth2p)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	struct ipath_sge_state *ss;
	struct ipath_swqe *wqe;
	u32 hwords;
	u32 len;
	u32 bth0;
	u32 bth2;
	char newreq;

	/* Sending responses takes priority over sending requests. */
	if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||
	     (qp->s_flags & IPATH_S_ACK_PENDING) ||
	     qp->s_ack_state != IB_OPCODE_RC_ACKNOWLEDGE) &&
	    ipath_make_rc_ack(qp, ohdr, pmtu, bth0p, bth2p))
		goto done;

	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) ||
	    qp->s_rnr_timeout || qp->s_wait_credit)
		goto bail;

	/* Limit the number of packets sent without an ACK. */
	if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT) > 0) {
		qp->s_wait_credit = 1;
		spin_lock(&dev->pending_lock);
		if (list_empty(&qp->timerwait))
			list_add_tail(&qp->timerwait,
				      &dev->pending[dev->pending_index]);
		spin_unlock(&dev->pending_lock);
		goto bail;
	}
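
	/*
	 * IPATH_PSN_CREDIT thus bounds how far s_psn may run ahead of
	 * s_last_psn (compared modulo 2^24 by ipath_cmp24()); the QP sits
	 * on the timeout list until an ACK advances s_last_psn and
	 * update_last_psn() clears s_wait_credit.
	 */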

	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	hwords = 5;
	bth0 = 0;

	/* Send a request. */
	wqe = get_swqe_ptr(qp, qp->s_cur);
	switch (qp->s_state) {
	default:
		/*
		 * Resend an old request or start a new one.
		 *
		 * We keep track of the current SWQE so that
		 * we don't reset the "furthest progress" state
		 * if we need to back up.
		 */
		newreq = 0;
		if (qp->s_cur == qp->s_tail) {
			/* Check if send work queue is empty. */
			if (qp->s_tail == qp->s_head)
				goto bail;
			/*
			 * If a fence is requested, wait for previous
			 * RDMA read and atomic operations to finish.
			 */
			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
			    qp->s_num_rd_atomic) {
				qp->s_flags |= IPATH_S_FENCE_PENDING;
				goto bail;
			}
			wqe->psn = qp->s_next_psn;
			newreq = 1;
		}
		/*
		 * Note that we have to be careful not to modify the
		 * original work request since we may need to resend
		 * it.
		 */
		len = wqe->length;
		ss = &qp->s_sge;
		bth2 = 0;
		switch (wqe->wr.opcode) {
		case IB_WR_SEND:
		case IB_WR_SEND_WITH_IMM:
			/* If no credit, return. */
			if (qp->s_lsn != (u32) -1 &&
			    ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
				goto bail;
			wqe->lpsn = wqe->psn;
			if (len > pmtu) {
				wqe->lpsn += (len - 1) / pmtu;
				qp->s_state = OP(SEND_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_SEND)
				qp->s_state = OP(SEND_ONLY);
			else {
				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after the BTH */
				ohdr->u.imm_data = wqe->wr.imm_data;
				hwords += 1;
			}
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= 1 << 23;
			bth2 = 1 << 31;	/* Request ACK. */
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_WRITE:
			if (newreq && qp->s_lsn != (u32) -1)
				qp->s_lsn++;
			/* FALLTHROUGH */
		case IB_WR_RDMA_WRITE_WITH_IMM:
			/* If no credit, return. */
			if (qp->s_lsn != (u32) -1 &&
			    ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
				goto bail;
			ohdr->u.rc.reth.vaddr =
				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->wr.wr.rdma.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			hwords += sizeof(struct ib_reth) / sizeof(u32);
			wqe->lpsn = wqe->psn;
			if (len > pmtu) {
				wqe->lpsn += (len - 1) / pmtu;
				qp->s_state = OP(RDMA_WRITE_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
				qp->s_state = OP(RDMA_WRITE_ONLY);
			else {
				qp->s_state =
					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after RETH */
				ohdr->u.rc.imm_data = wqe->wr.imm_data;
				hwords += 1;
				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
					bth0 |= 1 << 23;
			}
			bth2 = 1 << 31;	/* Request ACK. */
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_READ:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= IPATH_S_RDMAR_PENDING;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (qp->s_lsn != (u32) -1)
					qp->s_lsn++;
				/*
				 * Adjust s_next_psn to count the
				 * expected number of responses.
				 */
				if (len > pmtu)
					qp->s_next_psn += (len - 1) / pmtu;
				wqe->lpsn = qp->s_next_psn++;
			}
			ohdr->u.rc.reth.vaddr =
				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->wr.wr.rdma.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			qp->s_state = OP(RDMA_READ_REQUEST);
			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
			ss = NULL;
			len = 0;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_ATOMIC_CMP_AND_SWP:
		case IB_WR_ATOMIC_FETCH_AND_ADD:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= IPATH_S_RDMAR_PENDING;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (qp->s_lsn != (u32) -1)
					qp->s_lsn++;
				wqe->lpsn = wqe->psn;
			}
			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
				qp->s_state = OP(COMPARE_SWAP);
				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
					wqe->wr.wr.atomic.swap);
				ohdr->u.atomic_eth.compare_data = cpu_to_be64(
					wqe->wr.wr.atomic.compare_add);
			} else {
				qp->s_state = OP(FETCH_ADD);
				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
					wqe->wr.wr.atomic.compare_add);
				ohdr->u.atomic_eth.compare_data = 0;
			}
			ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
				wqe->wr.wr.atomic.remote_addr >> 32);
			ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
				wqe->wr.wr.atomic.remote_addr);
			ohdr->u.atomic_eth.rkey = cpu_to_be32(
				wqe->wr.wr.atomic.rkey);
			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
			ss = NULL;
			len = 0;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		default:
			goto bail;
		}
		qp->s_sge.sge = wqe->sg_list[0];
		qp->s_sge.sg_list = wqe->sg_list + 1;
		qp->s_sge.num_sge = wqe->wr.num_sge;
		qp->s_len = wqe->length;
		if (newreq) {
			qp->s_tail++;
			if (qp->s_tail >= qp->s_size)
				qp->s_tail = 0;
		}
		bth2 |= qp->s_psn & IPATH_PSN_MASK;
		if (wqe->wr.opcode == IB_WR_RDMA_READ)
			qp->s_psn = wqe->lpsn + 1;
		else {
			qp->s_psn++;
			if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
				qp->s_next_psn = qp->s_psn;
		}
		/*
		 * Put the QP on the pending list so lost ACKs will cause
		 * a retry.  More than one request can be pending so the
		 * QP may already be on the dev->pending list.
		 */
		spin_lock(&dev->pending_lock);
		if (list_empty(&qp->timerwait))
			list_add_tail(&qp->timerwait,
				      &dev->pending[dev->pending_index]);
		spin_unlock(&dev->pending_lock);
		break;
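
	/*
	 * PSN accounting example for the cases above: an 8192-byte SEND
	 * with a 2048-byte path MTU takes four packets, so
	 * wqe->lpsn = wqe->psn + (8192 - 1) / 2048 = wqe->psn + 3 and the
	 * request only completes when the ACK covers lpsn.
	 */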
	case OP(RDMA_READ_RESPONSE_FIRST):
		/*
		 * This case can only happen if a send is restarted.
		 * See ipath_restart_rc().
		 */
		ipath_init_restart(qp, wqe);
		/* FALLTHROUGH */
	case OP(SEND_FIRST):
		qp->s_state = OP(SEND_MIDDLE);
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
		if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			break;
		}
		if (wqe->wr.opcode == IB_WR_SEND)
			qp->s_state = OP(SEND_LAST);
		else {
			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.imm_data;
			hwords += 1;
		}
		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
			bth0 |= 1 << 23;
		bth2 |= 1 << 31;	/* Request ACK. */
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_LAST):
		/*
		 * This case can only happen if a RDMA write is restarted.
		 * See ipath_restart_rc().
		 */
		ipath_init_restart(qp, wqe);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_FIRST):
		qp->s_state = OP(RDMA_WRITE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_MIDDLE):
		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
		if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			break;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
			qp->s_state = OP(RDMA_WRITE_LAST);
		else {
			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.imm_data;
			hwords += 1;
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= 1 << 23;
		}
		bth2 |= 1 << 31;	/* Request ACK. */
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/*
		 * This case can only happen if a RDMA read is restarted.
		 * See ipath_restart_rc().
		 */
		ipath_init_restart(qp, wqe);
		len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
		ohdr->u.rc.reth.vaddr =
			cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
		ohdr->u.rc.reth.rkey =
			cpu_to_be32(wqe->wr.wr.rdma.rkey);
		ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
		qp->s_state = OP(RDMA_READ_REQUEST);
		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
		if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = NULL;
		len = 0;
		qp->s_cur++;
		if (qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		break;
	}
	if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
		bth2 |= 1 << 31;	/* Request ACK. */
	qp->s_len -= len;
	qp->s_hdrwords = hwords;
	qp->s_cur_sge = ss;
	qp->s_cur_size = len;
	*bth0p = bth0 | (qp->s_state << 24);
	*bth2p = bth2;
done:
	return 1;

bail:
	return 0;
}
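
/*
 * In *bth0p, bits 31:24 carry the opcode (qp->s_state), bit 23 is the
 * solicited-event bit set for IB_SEND_SOLICITED above, and the low 16
 * bits are reserved for the P_Key (see send_rc_ack() below, which fills
 * them in directly).  *bth2p is the 24-bit PSN plus the "request ACK"
 * bit in bit 31.
 */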

/**
 * send_rc_ack - construct an ACK packet and send it
 * @qp: a pointer to the QP
 *
 * This is called from ipath_rc_rcv() and only uses the receive
 * side QP state.
 * Note that RDMA reads and atomics are handled in the
 * send side QP state and tasklet.
 */
static void send_rc_ack(struct ipath_qp *qp)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	u16 lrh0;
	u32 bth0;
	u32 hwords;
	struct ipath_ib_header hdr;
	struct ipath_other_headers *ohdr;

	/* Don't send ACK or NAK if a RDMA read or atomic is pending. */
	if (qp->r_head_ack_queue != qp->s_tail_ack_queue)
		goto queue_ack;

	/* Construct the header. */
	ohdr = &hdr.u.oth;
	lrh0 = IPATH_LRH_BTH;
	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
	hwords = 6;
	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
		hwords += ipath_make_grh(dev, &hdr.u.l.grh,
					 &qp->remote_ah_attr.grh,
					 hwords, 0);
		ohdr = &hdr.u.l.oth;
		lrh0 = IPATH_LRH_GRH;
	}
	/* read pkey_index w/o lock (it's atomic) */
	bth0 = ipath_get_pkey(dev->dd, qp->s_pkey_index) |
		OP(ACKNOWLEDGE) << 24;
	if (qp->r_nak_state)
		ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
					   (qp->r_nak_state <<
					    IPATH_AETH_CREDIT_SHIFT));
	else
		ohdr->u.aeth = ipath_compute_aeth(qp);
	lrh0 |= qp->remote_ah_attr.sl << 4;
	hdr.lrh[0] = cpu_to_be16(lrh0);
	hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
	hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid);
	ohdr->bth[0] = cpu_to_be32(bth0);
	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
	ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);

	/*
	 * If we can send the ACK, clear the ACK state.
	 */
	if (ipath_verbs_send(dev->dd, hwords, (u32 *) &hdr, 0, NULL) == 0) {
		dev->n_unicast_xmit++;
		goto done;
	}

	/*
	 * We are out of PIO buffers at the moment.
	 * Pass responsibility for sending the ACK to the
	 * send tasklet so that when a PIO buffer becomes
	 * available, the ACK is sent ahead of other outgoing
	 * packets.
	 */
queue_ack:
	spin_lock_irq(&qp->s_lock);
	qp->s_flags |= IPATH_S_ACK_PENDING;
	qp->s_nak_state = qp->r_nak_state;
	qp->s_ack_psn = qp->r_ack_psn;
	spin_unlock_irq(&qp->s_lock);

	/* Call ipath_do_rc_send() in another thread. */
	tasklet_hi_schedule(&qp->s_task);

done:
	return;
}

/**
 * reset_psn - reset the QP state to send starting from PSN
 * @qp: the QP
 * @psn: the packet sequence number to restart at
 *
 * This is called from ipath_rc_rcv() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held.
 */
static void reset_psn(struct ipath_qp *qp, u32 psn)
{
	u32 n = qp->s_last;
	struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
	u32 opcode;

	qp->s_cur = n;

	/*
	 * If we are starting the request from the beginning,
	 * let the normal send code handle initialization.
	 */
	if (ipath_cmp24(psn, wqe->psn) <= 0) {
		qp->s_state = OP(SEND_LAST);
		goto done;
	}

	/* Find the work request opcode corresponding to the given PSN. */
	opcode = wqe->wr.opcode;
	for (;;) {
		int diff;

		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail)
			break;
		wqe = get_swqe_ptr(qp, n);
		diff = ipath_cmp24(psn, wqe->psn);
		if (diff < 0)
			break;
		qp->s_cur = n;
		/*
		 * If we are starting the request from the beginning,
		 * let the normal send code handle initialization.
		 */
		if (diff == 0) {
			qp->s_state = OP(SEND_LAST);
			goto done;
		}
		opcode = wqe->wr.opcode;
	}

	/*
	 * Set the state to restart in the middle of a request.
	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
	 * See ipath_do_rc_send().
	 */
	switch (opcode) {
	case IB_WR_SEND:
	case IB_WR_SEND_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
		break;

	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
		break;

	case IB_WR_RDMA_READ:
		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		break;

	default:
		/*
		 * This case shouldn't happen since it's only
		 * one PSN per req.
		 */
		qp->s_state = OP(SEND_LAST);
	}
done:
	qp->s_psn = psn;
}
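
/*
 * The RDMA_READ_RESPONSE_* opcodes are never valid s_state values for an
 * outgoing request, so reset_psn() borrows them as markers meaning
 * "restart in the middle of a SEND / RDMA write / RDMA read";
 * ipath_make_rc_req() decodes them in its switch and calls
 * ipath_init_restart().
 */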

/**
 * ipath_restart_rc - back up requester to resend the last un-ACKed request
 * @qp: the QP to restart
 * @psn: packet sequence number for the request
 * @wc: the work completion request
 *
 * The QP s_lock should be held and interrupts disabled.
 */
void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc)
{
	struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
	struct ipath_ibdev *dev;

	if (qp->s_retry == 0) {
		wc->wr_id = wqe->wr.wr_id;
		wc->status = IB_WC_RETRY_EXC_ERR;
		wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
		wc->src_qp = qp->remote_qpn;
		wc->slid = qp->remote_ah_attr.dlid;
		wc->sl = qp->remote_ah_attr.sl;
		wc->dlid_path_bits = 0;
		ipath_sqerror_qp(qp, wc);
		goto bail;
	}
	qp->s_retry--;

	/*
	 * Remove the QP from the timeout queue.
	 * Note: it may already have been removed by ipath_ib_timer().
	 */
	dev = to_idev(qp->ibqp.device);
	spin_lock(&dev->pending_lock);
	if (!list_empty(&qp->timerwait))
		list_del_init(&qp->timerwait);
	spin_unlock(&dev->pending_lock);

	if (wqe->wr.opcode == IB_WR_RDMA_READ)
		dev->n_rc_resends++;
	else
		dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK;

	reset_psn(qp, psn);
	tasklet_hi_schedule(&qp->s_task);

bail:
	return;
}

static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
{
	if (qp->s_wait_credit) {
		qp->s_wait_credit = 0;
		tasklet_hi_schedule(&qp->s_task);
	}
	qp->s_last_psn = psn;
}

/**
 * do_rc_ack - process an incoming RC ACK
 * @qp: the QP the ACK came in on
 * @psn: the packet sequence number of the ACK
 * @opcode: the opcode of the request that resulted in the ACK
 *
 * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held and interrupts disabled.
 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
 */
static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	struct ib_wc wc;
	struct ipath_swqe *wqe;
	int ret = 0;
	u32 ack_psn;

	/*
	 * Remove the QP from the timeout queue (or RNR timeout queue).
	 * If ipath_ib_timer() has already removed it,
	 * it's OK since we hold the QP s_lock and ipath_restart_rc()
	 * just won't find anything to restart if we ACK everything.
	 */
	spin_lock(&dev->pending_lock);
	if (!list_empty(&qp->timerwait))
		list_del_init(&qp->timerwait);
	spin_unlock(&dev->pending_lock);

	/*
	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
	 * requests and implicitly NAK RDMA read and atomic requests issued
	 * before the NAK'ed request.  The MSN won't include the NAK'ed
	 * request but will include an ACK'ed request(s).
	 */
	ack_psn = psn;
	if (aeth >> 29)
		ack_psn--;
	wqe = get_swqe_ptr(qp, qp->s_last);

	/*
	 * The MSN might be for a later WQE than the PSN indicates so
	 * only complete WQEs that the PSN finishes.
	 */
	while (ipath_cmp24(ack_psn, wqe->lpsn) >= 0) {
		/*
		 * If this request is a RDMA read or atomic, and the ACK is
		 * for a later operation, this ACK NAKs the RDMA read or
		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
		 * can ACK a RDMA read and likewise for atomic ops.  Note
		 * that the NAK case can only happen if relaxed ordering is
		 * used and requests are sent after an RDMA read or atomic
		 * is sent but before the response is received.
		 */
		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
		     (opcode != OP(RDMA_READ_RESPONSE_LAST) ||
		      ipath_cmp24(ack_psn, wqe->lpsn) != 0)) ||
		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
		     (opcode != OP(ATOMIC_ACKNOWLEDGE) ||
		      ipath_cmp24(wqe->psn, psn) != 0))) {
			/*
			 * The last valid PSN seen is the previous
			 * request's.
			 */
			update_last_psn(qp, wqe->psn - 1);
			/* Retry this request. */
			ipath_restart_rc(qp, wqe->psn, &wc);
			/*
			 * No need to process the ACK/NAK since we are
			 * restarting an earlier request.
			 */
			goto bail;
		}
		if (qp->s_num_rd_atomic &&
		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
			qp->s_num_rd_atomic--;
			/* Restart sending task if fence is complete */
			if ((qp->s_flags & IPATH_S_FENCE_PENDING) &&
			    !qp->s_num_rd_atomic) {
				qp->s_flags &= ~IPATH_S_FENCE_PENDING;
				tasklet_hi_schedule(&qp->s_task);
			} else if (qp->s_flags & IPATH_S_RDMAR_PENDING) {
				qp->s_flags &= ~IPATH_S_RDMAR_PENDING;
				tasklet_hi_schedule(&qp->s_task);
			}
		}
		/* Post a send completion queue entry if requested. */
		if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
			wc.wr_id = wqe->wr.wr_id;
			wc.status = IB_WC_SUCCESS;
			wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
			wc.byte_len = wqe->length;
			wc.src_qp = qp->remote_qpn;
			wc.slid = qp->remote_ah_attr.dlid;
			wc.sl = qp->remote_ah_attr.sl;
			wc.dlid_path_bits = 0;
			ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
		}
		qp->s_retry = qp->s_retry_cnt;
		/*
		 * If we are completing a request which is in the process of
		 * being resent, we can stop resending it since we know the
		 * responder has already seen it.
		 */
		if (qp->s_last == qp->s_cur) {
			if (++qp->s_cur >= qp->s_size)
				qp->s_cur = 0;
			qp->s_last = qp->s_cur;
			if (qp->s_last == qp->s_tail)
				break;
			wqe = get_swqe_ptr(qp, qp->s_cur);
			qp->s_state = OP(SEND_LAST);
			qp->s_psn = wqe->psn;
		} else {
			if (++qp->s_last >= qp->s_size)
				qp->s_last = 0;
			if (qp->s_last == qp->s_tail)
				break;
			wqe = get_swqe_ptr(qp, qp->s_last);
		}
	}

	switch (aeth >> 29) {
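	/*
	 * The top three bits of the AETH encode the response type:
	 * 0 is an ACK (the low bits carry flow-control credits), 1 is an
	 * RNR NAK (the low bits select an entry in ib_ipath_rnr_table),
	 * 3 is a NAK with the error code in the low bits, and 2 is
	 * reserved.
	 */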
	case 0:		/* ACK */
		/* If this is a partial ACK, reset the retransmit timer. */
		if (qp->s_last != qp->s_tail) {
			spin_lock(&dev->pending_lock);
			list_add_tail(&qp->timerwait,
				      &dev->pending[dev->pending_index]);
			spin_unlock(&dev->pending_lock);
			/*
			 * If we get a partial ACK for a resent operation,
			 * we can stop resending the earlier packets and
			 * continue with the next packet the receiver wants.
			 */
			if (ipath_cmp24(qp->s_psn, psn) <= 0) {
				reset_psn(qp, psn + 1);
				tasklet_hi_schedule(&qp->s_task);
			}
		} else if (ipath_cmp24(qp->s_psn, psn) <= 0) {
			qp->s_state = OP(SEND_LAST);
			qp->s_psn = psn + 1;
		}
		ipath_get_credit(qp, aeth);
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		qp->s_retry = qp->s_retry_cnt;
		update_last_psn(qp, psn);
		ret = 1;
		goto bail;

	case 1:		/* RNR NAK */
		if (qp->s_last == qp->s_tail)
			goto bail;
		if (qp->s_rnr_retry == 0) {
			wc.status = IB_WC_RNR_RETRY_EXC_ERR;
			goto class_b;
		}
		if (qp->s_rnr_retry_cnt < 7)
			qp->s_rnr_retry--;

		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);

		if (wqe->wr.opcode == IB_WR_RDMA_READ)
			dev->n_rc_resends++;
		else
			dev->n_rc_resends +=
				(qp->s_psn - psn) & IPATH_PSN_MASK;

		reset_psn(qp, psn);

		qp->s_rnr_timeout =
			ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &
					   IPATH_AETH_CREDIT_MASK];
		ipath_insert_rnr_queue(qp);
		goto bail;

	case 3:		/* NAK */
		if (qp->s_last == qp->s_tail)
			goto bail;
		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);
		switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
			IPATH_AETH_CREDIT_MASK) {
		case 0:	/* PSN sequence error */
			/*
			 * Back up to the responder's expected PSN.
			 * Note that we might get a NAK in the middle of an
			 * RDMA READ response which terminates the RDMA
			 * READ.
			 */
			ipath_restart_rc(qp, psn, &wc);
			break;

		case 1:	/* Invalid Request */
			wc.status = IB_WC_REM_INV_REQ_ERR;
			dev->n_other_naks++;
			goto class_b;

		case 2:	/* Remote Access Error */
			wc.status = IB_WC_REM_ACCESS_ERR;
			dev->n_other_naks++;
			goto class_b;

		case 3:	/* Remote Operation Error */
			wc.status = IB_WC_REM_OP_ERR;
			dev->n_other_naks++;
		class_b:
			wc.wr_id = wqe->wr.wr_id;
			wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
			wc.src_qp = qp->remote_qpn;
			wc.slid = qp->remote_ah_attr.dlid;
			wc.sl = qp->remote_ah_attr.sl;
			wc.dlid_path_bits = 0;
			ipath_sqerror_qp(qp, &wc);
			break;

		default:
			/* Ignore other reserved NAK error codes */
			goto reserved;
		}
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		goto bail;

	default:		/* 2: reserved */
	reserved:
		/* Ignore reserved NAK codes. */
		goto bail;
	}

bail:
	return ret;
}

/**
 * ipath_rc_rcv_resp - process an incoming RC response packet
 * @dev: the device this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @hdrsize: the header length
 * @pmtu: the path MTU
 * @header_in_data: true if part of the header data is in the data buffer
 *
 * This is called from ipath_rc_rcv() to process an incoming RC response
 * packet for the given QP.
 * Called at interrupt level.
 */
static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
				     struct ipath_other_headers *ohdr,
				     void *data, u32 tlen,
				     struct ipath_qp *qp,
				     u32 opcode,
				     u32 psn, u32 hdrsize, u32 pmtu,
				     int header_in_data)
{
	struct ipath_swqe *wqe;
	unsigned long flags;
	struct ib_wc wc;
	int diff;
	u32 pad;
	u32 aeth;

	spin_lock_irqsave(&qp->s_lock, flags);

	/* Ignore invalid responses. */
	if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
		goto ack_done;

	/* Ignore duplicate responses. */
	diff = ipath_cmp24(psn, qp->s_last_psn);
	if (unlikely(diff <= 0)) {
		/* Update credits for "ghost" ACKs */
		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
			if (!header_in_data)
				aeth = be32_to_cpu(ohdr->u.aeth);
			else {
				aeth = be32_to_cpu(((__be32 *) data)[0]);
				data += sizeof(__be32);
			}
			if ((aeth >> 29) == 0)
				ipath_get_credit(qp, aeth);
		}
		goto ack_done;
	}

	if (unlikely(qp->s_last == qp->s_tail))
		goto ack_done;
	wqe = get_swqe_ptr(qp, qp->s_last);

	switch (opcode) {
	case OP(ACKNOWLEDGE):
	case OP(ATOMIC_ACKNOWLEDGE):
	case OP(RDMA_READ_RESPONSE_FIRST):
		if (!header_in_data)
			aeth = be32_to_cpu(ohdr->u.aeth);
		else {
			aeth = be32_to_cpu(((__be32 *) data)[0]);
			data += sizeof(__be32);
		}
		if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
			u64 val;

			if (!header_in_data) {
				__be32 *p = ohdr->u.at.atomic_ack_eth;

				val = ((u64) be32_to_cpu(p[0]) << 32) |
					be32_to_cpu(p[1]);
			} else
				val = be64_to_cpu(((__be64 *) data)[0]);
			*(u64 *) wqe->sg_list[0].vaddr = val;
		}
		if (!do_rc_ack(qp, aeth, psn, opcode) ||
		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
			goto ack_done;
		hdrsize += 4;
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_middle;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/* no AETH, no ACK */
		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
			ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
			goto ack_done;
		}
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
	read_middle:
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto ack_len_err;
		if (unlikely(pmtu >= qp->s_rdma_read_len))
			goto ack_len_err;

		/* We got a response so update the timeout. */
		spin_lock(&dev->pending_lock);
		if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
			list_move_tail(&qp->timerwait,
				       &dev->pending[dev->pending_index]);
		spin_unlock(&dev->pending_lock);
		/*
		 * Update the RDMA receive state but do the copy w/o
		 * holding the locks and blocking interrupts.
		 */
		qp->s_rdma_read_len -= pmtu;
		update_last_psn(qp, psn);
		spin_unlock_irqrestore(&qp->s_lock, flags);
		ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu);
		goto bail;

	case OP(RDMA_READ_RESPONSE_ONLY):
		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
			ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
			goto ack_done;
		}
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/*
		 * Check that the data size is >= 0 && <= pmtu.
		 * Remember to account for the AETH header (4) and
		 * ICRC (4).
		 */
		if (unlikely(tlen < (hdrsize + pad + 8)))
			goto ack_len_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_last;

	case OP(RDMA_READ_RESPONSE_LAST):
		/* ACKs READ req. */
		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
			ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
			goto ack_done;
		}
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/*
		 * Check that the data size is >= 1 && <= pmtu.
		 * Remember to account for the AETH header (4) and
		 * ICRC (4).
		 */
		if (unlikely(tlen <= (hdrsize + pad + 8)))
			goto ack_len_err;
	read_last:
		tlen -= hdrsize + pad + 8;
		if (unlikely(tlen != qp->s_rdma_read_len))
			goto ack_len_err;
		if (!header_in_data)
			aeth = be32_to_cpu(ohdr->u.aeth);
		else {
			aeth = be32_to_cpu(((__be32 *) data)[0]);
			data += sizeof(__be32);
		}
		ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);
		(void) do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST));
		goto ack_done;
	}

ack_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
bail:
	return;

ack_op_err:
	wc.status = IB_WC_LOC_QP_OP_ERR;
	goto ack_err;

ack_len_err:
	wc.status = IB_WC_LOC_LEN_ERR;
ack_err:
	wc.wr_id = wqe->wr.wr_id;
	wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
	wc.src_qp = qp->remote_qpn;
	wc.slid = qp->remote_ah_attr.dlid;
	wc.sl = qp->remote_ah_attr.sl;
	wc.dlid_path_bits = 0;
	ipath_sqerror_qp(qp, &wc);
	goto ack_done;
}

/**
 * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
 * @dev: the device this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @diff: the difference between the PSN and the expected PSN
 * @header_in_data: true if part of the header data is in the data buffer
 *
 * This is called from ipath_rc_rcv() to process an unexpected
 * incoming RC packet for the given QP.
 * Called at interrupt level.
 * Return 1 if no more processing is needed; otherwise return 0 to
 * schedule a response to be sent.
 */
static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
				     struct ipath_other_headers *ohdr,
				     void *data,
				     struct ipath_qp *qp,
				     u32 opcode,
				     u32 psn,
				     int diff,
				     int header_in_data)
{
	struct ipath_ack_entry *e;
	u8 i, prev;
	int old_req;

	if (diff > 0) {
		/*
		 * Packet sequence error.
		 * A NAK will ACK earlier sends and RDMA writes.
		 * Don't queue the NAK if we already sent one.
		 */
		if (!qp->r_nak_state) {
			qp->r_nak_state = IB_NAK_PSN_ERROR;
			/* Use the expected PSN. */
			qp->r_ack_psn = qp->r_psn;
			goto send_ack;
		}
		goto done;
	}

	/*
	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
	 * write or atomic op.  Don't NAK errors, just silently drop
	 * the duplicate request.  Note that r_sge, r_len, and
	 * r_rcv_len may be in use so don't modify them.
	 *
	 * We are supposed to ACK the earliest duplicate PSN but we
	 * can coalesce an outstanding duplicate ACK.  We have to
	 * send the earliest so that RDMA reads can be restarted at
	 * the requester's expected PSN.
	 *
	 * First, find where this duplicate PSN falls within the
	 * ACKs previously sent.
	 */
	psn &= IPATH_PSN_MASK;
	e = NULL;
	old_req = 1;
	spin_lock_irq(&qp->s_lock);
	for (i = qp->r_head_ack_queue; ; i = prev) {
		if (i == qp->s_tail_ack_queue)
			old_req = 0;
		if (i)
			prev = i - 1;
		else
			prev = IPATH_MAX_RDMA_ATOMIC;
		if (prev == qp->r_head_ack_queue) {
			e = NULL;
			break;
		}
		e = &qp->s_ack_queue[prev];
		if (!e->opcode) {
			e = NULL;
			break;
		}
		if (ipath_cmp24(psn, e->psn) >= 0)
			break;
	}
	switch (opcode) {
	case OP(RDMA_READ_REQUEST): {
		struct ib_reth *reth;
		u32 offset;
		u32 len;

		/*
		 * If we didn't find the RDMA read request in the ack queue,
		 * or the send tasklet is already backed up to send an
		 * earlier entry, we can ignore this request.
		 */
		if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req)
			goto unlock_done;
		/* RETH comes after BTH */
		if (!header_in_data)
			reth = &ohdr->u.rc.reth;
		else {
			reth = (struct ib_reth *)data;
			data += sizeof(*reth);
		}
		/*
		 * Address range must be a subset of the original
		 * request and start on pmtu boundaries.
		 * We reuse the old ack_queue slot since the requester
		 * should not back up and request an earlier PSN for the
		 * same request.
		 */
		offset = ((psn - e->psn) & IPATH_PSN_MASK) *
			ib_mtu_enum_to_int(qp->path_mtu);
		len = be32_to_cpu(reth->length);
		if (unlikely(offset + len > e->rdma_sge.sge.sge_length))
			goto unlock_done;
		if (len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			ok = ipath_rkey_ok(qp, &e->rdma_sge,
					   len, vaddr, rkey,
					   IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto unlock_done;
		} else {
			e->rdma_sge.sg_list = NULL;
			e->rdma_sge.num_sge = 0;
			e->rdma_sge.sge.mr = NULL;
			e->rdma_sge.sge.vaddr = NULL;
			e->rdma_sge.sge.length = 0;
			e->rdma_sge.sge.sge_length = 0;
		}
		e->psn = psn;
		qp->s_ack_state = OP(ACKNOWLEDGE);
		qp->s_tail_ack_queue = prev;
		break;
	}

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		/*
		 * If we didn't find the atomic request in the ack queue
		 * or the send tasklet is already backed up to send an
		 * earlier entry, we can ignore this request.
		 */
		if (!e || e->opcode != (u8) opcode || old_req)
			goto unlock_done;
		qp->s_ack_state = OP(ACKNOWLEDGE);
		qp->s_tail_ack_queue = prev;
		break;
	}

	default:
		if (old_req)
			goto unlock_done;
		/*
		 * Resend the most recent ACK if this request is
		 * after all the previous RDMA reads and atomics.
		 */
		if (i == qp->r_head_ack_queue) {
			spin_unlock_irq(&qp->s_lock);
			qp->r_nak_state = 0;
			qp->r_ack_psn = qp->r_psn - 1;
			goto send_ack;
		}
		/*
		 * Resend the RDMA read or atomic op which
		 * ACKs this duplicate request.
		 */
		qp->s_ack_state = OP(ACKNOWLEDGE);
		qp->s_tail_ack_queue = i;
		break;
	}
	qp->r_nak_state = 0;
	spin_unlock_irq(&qp->s_lock);
	tasklet_hi_schedule(&qp->s_task);
	goto done;

unlock_done:
	spin_unlock_irq(&qp->s_lock);
done:
	return 1;

send_ack:
	return 0;
}

static void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
{
	spin_lock_irq(&qp->s_lock);
	qp->state = IB_QPS_ERR;
	ipath_error_qp(qp, err);
	spin_unlock_irq(&qp->s_lock);
}

/**
 * ipath_rc_rcv - process an incoming RC packet
 * @dev: the device this packet came in on
 * @hdr: the header of this packet
 * @has_grh: true if the header has a GRH
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 *
 * This is called from ipath_qp_rcv() to process an incoming RC packet
 * for the given QP.
 * Called at interrupt level.
 */
void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
{
	struct ipath_other_headers *ohdr;
	u32 opcode;
	u32 hdrsize;
	u32 psn;
	u32 pad;
	struct ib_wc wc;
	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
	int diff;
	struct ib_reth *reth;
	int header_in_data;

	/* Validate the SLID. See Ch. 9.6.1.5 */
	if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
		goto done;

	/* Check for GRH */
	if (!has_grh) {
		ohdr = &hdr->u.oth;
		hdrsize = 8 + 12;	/* LRH + BTH */
		psn = be32_to_cpu(ohdr->bth[2]);
		header_in_data = 0;
	} else {
		ohdr = &hdr->u.l.oth;
		hdrsize = 8 + 40 + 12;	/* LRH + GRH + BTH */
		/*
		 * The header with GRH is 60 bytes and the core driver sets
		 * the eager header buffer size to 56 bytes so the last 4
		 * bytes of the BTH header (PSN) is in the data buffer.
		 */
		header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
		if (header_in_data) {
			psn = be32_to_cpu(((__be32 *) data)[0]);
			data += sizeof(__be32);
		} else
			psn = be32_to_cpu(ohdr->bth[2]);
	}

	/*
	 * Process responses (ACKs) before anything else.  Note that the
	 * packet sequence number will be for something in the send work
	 * queue rather than the expected receive packet sequence number.
	 * In other words, this QP is the requester.
	 */
	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
		ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
				  hdrsize, pmtu, header_in_data);
		goto done;
	}

	/* Compute 24 bits worth of difference. */
	diff = ipath_cmp24(psn, qp->r_psn);
	if (unlikely(diff)) {
		if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
				       psn, diff, header_in_data))
			goto done;
		goto send_ack;
	}

	/* Check for opcode sequence errors. */
	switch (qp->r_state) {
	case OP(SEND_FIRST):
	case OP(SEND_MIDDLE):
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
			break;
	nack_inv:
		ipath_rc_error(qp, IB_WC_REM_INV_REQ_ERR);
		qp->r_nak_state = IB_NAK_INVALID_REQUEST;
		qp->r_ack_psn = qp->r_psn;
		goto send_ack;

	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_MIDDLE):
		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			break;
		goto nack_inv;

	default:
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
		    opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			goto nack_inv;
		/*
		 * Note that it is up to the requester to not send a new
		 * RDMA read or atomic operation before receiving an ACK
		 * for the previous operation.
		 */
		break;
	}

	wc.imm_data = 0;
	wc.wc_flags = 0;

	/* OK, process the packet. */
	switch (opcode) {
	case OP(SEND_FIRST):
		if (!ipath_get_rwqe(qp, 0)) {
		rnr_nak:
			/*
			 * A RNR NAK will ACK earlier sends and RDMA writes.
			 * Don't queue the NAK if a RDMA read or atomic
			 * is pending though.
			 */
			if (qp->r_nak_state)
				goto done;
			qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
			qp->r_ack_psn = qp->r_psn;
			goto send_ack;
		}
		qp->r_rcv_len = 0;
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
	case OP(RDMA_WRITE_MIDDLE):
	send_middle:
		/* Check for invalid length PMTU or posted rwqe len. */
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto nack_inv;
		qp->r_rcv_len += pmtu;
		if (unlikely(qp->r_rcv_len > qp->r_len))
			goto nack_inv;
		ipath_copy_sge(&qp->r_sge, data, pmtu);
		break;

	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
		/* consume RWQE */
		if (!ipath_get_rwqe(qp, 1))
			goto rnr_nak;
		goto send_last_imm;

	case OP(SEND_ONLY):
	case OP(SEND_ONLY_WITH_IMMEDIATE):
		if (!ipath_get_rwqe(qp, 0))
			goto rnr_nak;
		qp->r_rcv_len = 0;
		if (opcode == OP(SEND_ONLY))
			goto send_last;
		/* FALLTHROUGH */
	case OP(SEND_LAST_WITH_IMMEDIATE):
	send_last_imm:
		if (header_in_data) {
			wc.imm_data = *(__be32 *) data;
			data += sizeof(__be32);
		} else {
			/* Immediate data comes after BTH */
			wc.imm_data = ohdr->u.imm_data;
		}
		hdrsize += 4;
		wc.wc_flags = IB_WC_WITH_IMM;
		/* FALLTHROUGH */
	case OP(SEND_LAST):
	case OP(RDMA_WRITE_LAST):
	send_last:
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/* Check for invalid length. */
		/* XXX LAST len should be >= 1 */
		if (unlikely(tlen < (hdrsize + pad + 4)))
			goto nack_inv;
		/* Don't count the CRC. */
		tlen -= (hdrsize + pad + 4);
		wc.byte_len = tlen + qp->r_rcv_len;
		if (unlikely(wc.byte_len > qp->r_len))
			goto nack_inv;
		ipath_copy_sge(&qp->r_sge, data, tlen);
		qp->r_msn++;
		if (!qp->r_wrid_valid)
			break;
		qp->r_wrid_valid = 0;
		wc.wr_id = qp->r_wr_id;
		wc.status = IB_WC_SUCCESS;
		wc.opcode = IB_WC_RECV;
		wc.src_qp = qp->remote_qpn;
		wc.slid = qp->remote_ah_attr.dlid;
		wc.sl = qp->remote_ah_attr.sl;
		wc.dlid_path_bits = 0;
		/* Signal completion event if the solicited bit is set. */
		ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
			       (ohdr->bth[0] &
				__constant_cpu_to_be32(1 << 23)) != 0);
		break;
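
	/*
	 * The __constant_cpu_to_be32(1 << 23) test above picks out the
	 * solicited-event bit in BTH word 0, so a completion event is only
	 * signaled when the sender set IB_SEND_SOLICITED.
	 */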
	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_ONLY):
	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
		/* RETH comes after BTH */
		if (!header_in_data)
			reth = &ohdr->u.rc.reth;
		else {
			reth = (struct ib_reth *)data;
			data += sizeof(*reth);
		}
		hdrsize += sizeof(*reth);
		qp->r_len = be32_to_cpu(reth->length);
		qp->r_rcv_len = 0;
		if (qp->r_len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/* Check rkey & NAK */
			ok = ipath_rkey_ok(qp, &qp->r_sge,
					   qp->r_len, vaddr, rkey,
					   IB_ACCESS_REMOTE_WRITE);
			if (unlikely(!ok))
				goto nack_acc;
		} else {
			qp->r_sge.sg_list = NULL;
			qp->r_sge.sge.mr = NULL;
			qp->r_sge.sge.vaddr = NULL;
			qp->r_sge.sge.length = 0;
			qp->r_sge.sge.sge_length = 0;
		}
		if (unlikely(!(qp->qp_access_flags &
			       IB_ACCESS_REMOTE_WRITE)))
			goto nack_acc;
		if (opcode == OP(RDMA_WRITE_FIRST))
			goto send_middle;
		else if (opcode == OP(RDMA_WRITE_ONLY))
			goto send_last;
		if (!ipath_get_rwqe(qp, 1))
			goto rnr_nak;
		goto send_last_imm;
): {
1715 struct ipath_ack_entry
*e
;
1719 if (unlikely(!(qp
->qp_access_flags
& IB_ACCESS_REMOTE_READ
)))
1721 next
= qp
->r_head_ack_queue
+ 1;
1722 if (next
> IPATH_MAX_RDMA_ATOMIC
)
1724 if (unlikely(next
== qp
->s_tail_ack_queue
))
1726 e
= &qp
->s_ack_queue
[qp
->r_head_ack_queue
];
1727 /* RETH comes after BTH */
1728 if (!header_in_data
)
1729 reth
= &ohdr
->u
.rc
.reth
;
1731 reth
= (struct ib_reth
*)data
;
1732 data
+= sizeof(*reth
);
1734 len
= be32_to_cpu(reth
->length
);
1736 u32 rkey
= be32_to_cpu(reth
->rkey
);
1737 u64 vaddr
= be64_to_cpu(reth
->vaddr
);
1740 /* Check rkey & NAK */
1741 ok
= ipath_rkey_ok(qp
, &e
->rdma_sge
, len
, vaddr
,
1742 rkey
, IB_ACCESS_REMOTE_READ
);
1746 * Update the next expected PSN. We add 1 later
1747 * below, so only add the remainder here.
1750 qp
->r_psn
+= (len
- 1) / pmtu
;
1752 e
->rdma_sge
.sg_list
= NULL
;
1753 e
->rdma_sge
.num_sge
= 0;
1754 e
->rdma_sge
.sge
.mr
= NULL
;
1755 e
->rdma_sge
.sge
.vaddr
= NULL
;
1756 e
->rdma_sge
.sge
.length
= 0;
1757 e
->rdma_sge
.sge
.sge_length
= 0;
1762 * We need to increment the MSN here instead of when we
1763 * finish sending the result since a duplicate request would
1764 * increment it more than once.
1768 qp
->r_state
= opcode
;
1769 qp
->r_nak_state
= 0;
1771 qp
->r_head_ack_queue
= next
;
1773 /* Call ipath_do_rc_send() in another thread. */
1774 tasklet_hi_schedule(&qp
->s_task
);

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		struct ib_atomic_eth *ateth;
		struct ipath_ack_entry *e;
		u64 vaddr;
		atomic64_t *maddr;
		u64 sdata;
		u32 rkey;
		u8 next;

		if (unlikely(!(qp->qp_access_flags &
			       IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_acc;
		next = qp->r_head_ack_queue + 1;
		if (next > IPATH_MAX_RDMA_ATOMIC)
			next = 0;
		if (unlikely(next == qp->s_tail_ack_queue))
			goto nack_inv;
		if (!header_in_data)
			ateth = &ohdr->u.atomic_eth;
		else
			ateth = (struct ib_atomic_eth *)data;
		vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
			be32_to_cpu(ateth->vaddr[1]);
		if (unlikely(vaddr & (sizeof(u64) - 1)))
			goto nack_inv;
		rkey = be32_to_cpu(ateth->rkey);
		/* Check rkey & NAK */
		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
					    sizeof(u64), vaddr, rkey,
					    IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_acc;
		/* Perform atomic OP and save result. */
		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
		sdata = be64_to_cpu(ateth->swap_data);
		e = &qp->s_ack_queue[qp->r_head_ack_queue];
		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
			(u64) atomic64_add_return(sdata, maddr) - sdata :
			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
				      be64_to_cpu(ateth->compare_data),
				      sdata);
		e->opcode = opcode;
		e->psn = psn & IPATH_PSN_MASK;
		qp->r_msn++;
		qp->r_psn++;
		qp->r_state = opcode;
		qp->r_nak_state = 0;
		qp->r_head_ack_queue = next;

		/* Call ipath_do_rc_send() in another thread. */
		tasklet_hi_schedule(&qp->s_task);

		goto done;
	}
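
	/*
	 * Note that e->atomic_data above holds the *prior* contents of
	 * the target: atomic64_add_return() returns the post-add value,
	 * so sdata is subtracted back out, while cmpxchg() returns the
	 * old value directly.
	 */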

	default:
		/* NAK unknown opcodes. */
		goto nack_inv;
	}
	qp->r_psn++;
	qp->r_state = opcode;
	qp->r_ack_psn = psn;
	qp->r_nak_state = 0;
	/* Send an ACK if requested or required. */
	if (psn & (1 << 31))
		goto send_ack;
	goto done;

nack_acc:
	ipath_rc_error(qp, IB_WC_REM_ACCESS_ERR);
	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
	qp->r_ack_psn = qp->r_psn;

send_ack:
	send_rc_ack(qp);

done:
	return;
}