/*
 * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
 * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ipath_verbs.h"
#include "ipath_kernel.h"
/* cut down ridiculously long IB macro names */
#define OP(x) IB_OPCODE_RC_##x
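/* e.g. OP(SEND_FIRST) expands to IB_OPCODE_RC_SEND_FIRST */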

static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe,
		       u32 psn, u32 pmtu)
{
	u32 len;

	len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
	ss->sge = wqe->sg_list[0];
	ss->sg_list = wqe->sg_list + 1;
	ss->num_sge = wqe->wr.num_sge;
	ipath_skip_sge(ss, len);
	return wqe->length - len;
}
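/*
 * Illustrative example (numbers are hypothetical): restarting three
 * packets into a request with a 2048-byte PMTU gives len = 3 * 2048,
 * so ipath_skip_sge() advances the SGE by 6144 bytes and the return
 * value is the number of bytes still left to (re)send.
 */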

/**
 * ipath_init_restart - initialize the qp->s_sge after a restart
 * @qp: the QP whose SGE we're restarting
 * @wqe: the work queue to initialize the QP's SGE from
 *
 * The QP s_lock should be held and interrupts disabled.
 */
static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
{
	struct ipath_ibdev *dev;

	qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn,
				ib_mtu_enum_to_int(qp->path_mtu));
	dev = to_idev(qp->ibqp.device);
	spin_lock(&dev->pending_lock);
	if (list_empty(&qp->timerwait))
		list_add_tail(&qp->timerwait,
			      &dev->pending[dev->pending_index]);
	spin_unlock(&dev->pending_lock);
}

/**
 * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @pmtu: the path MTU
 *
 * Return 1 if constructed; otherwise, return 0.
 * Note that we are in the responder's side of the QP context.
 * Note the QP s_lock must be held.
 */
static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
			     struct ipath_other_headers *ohdr, u32 pmtu)
{
	struct ipath_ack_entry *e;
	u32 hwords;
	u32 len;
	u32 bth0;
	u32 bth2;

	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	hwords = 5;

	switch (qp->s_ack_state) {
	case OP(RDMA_READ_RESPONSE_LAST):
	case OP(RDMA_READ_RESPONSE_ONLY):
	case OP(ATOMIC_ACKNOWLEDGE):
		/*
		 * We can increment the tail pointer now that the last
		 * response has been sent instead of only being
		 * constructed.
		 */
		if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
			qp->s_tail_ack_queue = 0;
		/* FALLTHROUGH */
	case OP(ACKNOWLEDGE):
		/* Check for no next entry in the queue. */
		if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
			if (qp->s_flags & IPATH_S_ACK_PENDING)
				goto normal;
			qp->s_ack_state = OP(ACKNOWLEDGE);
			goto bail;
		}

		e = &qp->s_ack_queue[qp->s_tail_ack_queue];
		if (e->opcode == OP(RDMA_READ_REQUEST)) {
			/* Copy SGE state in case we need to resend */
			qp->s_ack_rdma_sge = e->rdma_sge;
			qp->s_cur_sge = &qp->s_ack_rdma_sge;
			len = e->rdma_sge.sge.sge_length;
			if (len > pmtu) {
				len = pmtu;
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
			} else
				qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
			ohdr->u.aeth = ipath_compute_aeth(qp);
			hwords++;
			qp->s_ack_rdma_psn = e->psn;
			bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
		} else {
			/* COMPARE_SWAP or FETCH_ADD */
			qp->s_cur_sge = NULL;
			len = 0;
			qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
			ohdr->u.at.aeth = ipath_compute_aeth(qp);
			ohdr->u.at.atomic_ack_eth[0] =
				cpu_to_be32(e->atomic_data >> 32);
			ohdr->u.at.atomic_ack_eth[1] =
				cpu_to_be32(e->atomic_data);
			hwords += sizeof(ohdr->u.at) / sizeof(u32);
			bth2 = e->psn;
		}
		bth0 = qp->s_ack_state << 24;
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_READ_RESPONSE_MIDDLE):
		len = qp->s_ack_rdma_sge.sge.sge_length;
		if (len > pmtu)
			len = pmtu;
		else {
			ohdr->u.aeth = ipath_compute_aeth(qp);
			hwords++;
			qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
			qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1;
		}
		bth0 = qp->s_ack_state << 24;
		bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
		break;

	default:
	normal:
		/*
		 * Send a regular ACK.
		 * Set the s_ack_state so we wait until after sending
		 * the ACK before setting s_ack_state to ACKNOWLEDGE
		 * (see above).
		 */
		qp->s_ack_state = OP(SEND_ONLY);
		qp->s_flags &= ~IPATH_S_ACK_PENDING;
		qp->s_cur_sge = NULL;
		if (qp->s_nak_state)
			ohdr->u.aeth =
				cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
					    (qp->s_nak_state <<
					     IPATH_AETH_CREDIT_SHIFT));
		else
			ohdr->u.aeth = ipath_compute_aeth(qp);
		hwords++;
		len = 0;
		bth0 = OP(ACKNOWLEDGE) << 24;
		bth2 = qp->s_ack_psn & IPATH_PSN_MASK;
	}
	qp->s_hdrwords = hwords;
	qp->s_cur_size = len;
	ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2);
	return 1;

bail:
	return 0;
}
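/*
 * The responder's ack queue is a small ring of IPATH_MAX_RDMA_ATOMIC + 1
 * entries indexed by r_head_ack_queue (producer) and s_tail_ack_queue
 * (consumer), which is why the index increments above wrap back to 0
 * once they exceed IPATH_MAX_RDMA_ATOMIC.
 */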

/**
 * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
 * @qp: a pointer to the QP
 *
 * Return 1 if constructed; otherwise, return 0.
 */
int ipath_make_rc_req(struct ipath_qp *qp)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	struct ipath_other_headers *ohdr;
	struct ipath_sge_state *ss;
	struct ipath_swqe *wqe;
	u32 hwords;
	u32 len;
	u32 bth0;
	u32 bth2;
	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
	char newreq;
	unsigned long flags;
	int ret = 0;

	ohdr = &qp->s_hdr.u.oth;
	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
		ohdr = &qp->s_hdr.u.l.oth;

	/*
	 * The lock is needed to synchronize between the sending tasklet,
	 * the receive interrupt handler, and timeout resends.
	 */
	spin_lock_irqsave(&qp->s_lock, flags);

	/* Sending responses takes priority over sending requests. */
	if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||
	     (qp->s_flags & IPATH_S_ACK_PENDING) ||
	     qp->s_ack_state != OP(ACKNOWLEDGE)) &&
	    ipath_make_rc_ack(dev, qp, ohdr, pmtu))
		goto done;

	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) ||
	    qp->s_rnr_timeout || qp->s_wait_credit)
		goto bail;

	/* Limit the number of packets sent without an ACK. */
	if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT) > 0) {
		qp->s_wait_credit = 1;
		goto bail;
	}

	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
	hwords = 5;
	bth0 = 1 << 22;	/* Set M bit */

	/* Send a request. */
	wqe = get_swqe_ptr(qp, qp->s_cur);
	switch (qp->s_state) {
	default:
		/*
		 * Resend an old request or start a new one.
		 *
		 * We keep track of the current SWQE so that
		 * we don't reset the "furthest progress" state
		 * if we need to back up.
		 */
		newreq = 0;
		if (qp->s_cur == qp->s_tail) {
			/* Check if send work queue is empty. */
			if (qp->s_tail == qp->s_head)
				goto bail;
			/*
			 * If a fence is requested, wait for previous
			 * RDMA read and atomic operations to finish.
			 */
			if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
			    qp->s_num_rd_atomic) {
				qp->s_flags |= IPATH_S_FENCE_PENDING;
				goto bail;
			}
			wqe->psn = qp->s_next_psn;
			newreq = 1;
		}
		/*
		 * Note that we have to be careful not to modify the
		 * original work request since we may need to resend
		 * it.
		 */
		len = wqe->length;
		ss = &qp->s_sge;
		bth2 = 0;
		switch (wqe->wr.opcode) {
		case IB_WR_SEND:
		case IB_WR_SEND_WITH_IMM:
			/* If no credit, return. */
			if (qp->s_lsn != (u32) -1 &&
			    ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
				goto bail;
			wqe->lpsn = wqe->psn;
			if (len > pmtu) {
				wqe->lpsn += (len - 1) / pmtu;
				qp->s_state = OP(SEND_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_SEND)
				qp->s_state = OP(SEND_ONLY);
			else {
				qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after the BTH */
				ohdr->u.imm_data = wqe->wr.imm_data;
				hwords += 1;
			}
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= 1 << 23;
			bth2 = 1 << 31;	/* Request ACK. */
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_WRITE:
			if (newreq && qp->s_lsn != (u32) -1)
				qp->s_lsn++;
			/* FALLTHROUGH */
		case IB_WR_RDMA_WRITE_WITH_IMM:
			/* If no credit, return. */
			if (qp->s_lsn != (u32) -1 &&
			    ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0)
				goto bail;
			ohdr->u.rc.reth.vaddr =
				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->wr.wr.rdma.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			hwords += sizeof(struct ib_reth) / sizeof(u32);
			wqe->lpsn = wqe->psn;
			if (len > pmtu) {
				wqe->lpsn += (len - 1) / pmtu;
				qp->s_state = OP(RDMA_WRITE_FIRST);
				len = pmtu;
				break;
			}
			if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
				qp->s_state = OP(RDMA_WRITE_ONLY);
			else {
				qp->s_state =
					OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
				/* Immediate data comes after RETH */
				ohdr->u.rc.imm_data = wqe->wr.imm_data;
				hwords += 1;
				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
					bth0 |= 1 << 23;
			}
			bth2 = 1 << 31;	/* Request ACK. */
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_RDMA_READ:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= IPATH_S_RDMAR_PENDING;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (qp->s_lsn != (u32) -1)
					qp->s_lsn++;
				/*
				 * Adjust s_next_psn to count the
				 * expected number of responses.
				 */
				if (len > pmtu)
					qp->s_next_psn += (len - 1) / pmtu;
				wqe->lpsn = qp->s_next_psn++;
			}
			ohdr->u.rc.reth.vaddr =
				cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
			ohdr->u.rc.reth.rkey =
				cpu_to_be32(wqe->wr.wr.rdma.rkey);
			ohdr->u.rc.reth.length = cpu_to_be32(len);
			qp->s_state = OP(RDMA_READ_REQUEST);
			hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
			ss = NULL;
			len = 0;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		case IB_WR_ATOMIC_CMP_AND_SWP:
		case IB_WR_ATOMIC_FETCH_AND_ADD:
			/*
			 * Don't allow more operations to be started
			 * than the QP limits allow.
			 */
			if (newreq) {
				if (qp->s_num_rd_atomic >=
				    qp->s_max_rd_atomic) {
					qp->s_flags |= IPATH_S_RDMAR_PENDING;
					goto bail;
				}
				qp->s_num_rd_atomic++;
				if (qp->s_lsn != (u32) -1)
					qp->s_lsn++;
				wqe->lpsn = wqe->psn;
			}
			if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
				qp->s_state = OP(COMPARE_SWAP);
				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
					wqe->wr.wr.atomic.swap);
				ohdr->u.atomic_eth.compare_data = cpu_to_be64(
					wqe->wr.wr.atomic.compare_add);
			} else {
				qp->s_state = OP(FETCH_ADD);
				ohdr->u.atomic_eth.swap_data = cpu_to_be64(
					wqe->wr.wr.atomic.compare_add);
				ohdr->u.atomic_eth.compare_data = 0;
			}
			ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
				wqe->wr.wr.atomic.remote_addr >> 32);
			ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
				wqe->wr.wr.atomic.remote_addr);
			ohdr->u.atomic_eth.rkey = cpu_to_be32(
				wqe->wr.wr.atomic.rkey);
			hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
			ss = NULL;
			len = 0;
			if (++qp->s_cur == qp->s_size)
				qp->s_cur = 0;
			break;

		default:
			goto bail;
		}
		qp->s_sge.sge = wqe->sg_list[0];
		qp->s_sge.sg_list = wqe->sg_list + 1;
		qp->s_sge.num_sge = wqe->wr.num_sge;
		qp->s_len = wqe->length;
		if (newreq) {
			qp->s_tail++;
			if (qp->s_tail >= qp->s_size)
				qp->s_tail = 0;
		}
		bth2 |= qp->s_psn & IPATH_PSN_MASK;
		if (wqe->wr.opcode == IB_WR_RDMA_READ)
			qp->s_psn = wqe->lpsn + 1;
		else {
			qp->s_psn++;
			if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
				qp->s_next_psn = qp->s_psn;
		}
		/*
		 * Put the QP on the pending list so lost ACKs will cause
		 * a retry.  More than one request can be pending so the
		 * QP may already be on the dev->pending list.
		 */
		spin_lock(&dev->pending_lock);
		if (list_empty(&qp->timerwait))
			list_add_tail(&qp->timerwait,
				      &dev->pending[dev->pending_index]);
		spin_unlock(&dev->pending_lock);
		break;

	case OP(RDMA_READ_RESPONSE_FIRST):
		/*
		 * This case can only happen if a send is restarted.
		 * See ipath_restart_rc().
		 */
		ipath_init_restart(qp, wqe);
		/* FALLTHROUGH */
	case OP(SEND_FIRST):
		qp->s_state = OP(SEND_MIDDLE);
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
		if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			break;
		}
		if (wqe->wr.opcode == IB_WR_SEND)
			qp->s_state = OP(SEND_LAST);
		else {
			qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.imm_data;
			hwords += 1;
		}
		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
			bth0 |= 1 << 23;
		bth2 |= 1 << 31;	/* Request ACK. */
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_LAST):
		/*
		 * This case can only happen if an RDMA write is restarted.
		 * See ipath_restart_rc().
		 */
		ipath_init_restart(qp, wqe);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_FIRST):
		qp->s_state = OP(RDMA_WRITE_MIDDLE);
		/* FALLTHROUGH */
	case OP(RDMA_WRITE_MIDDLE):
		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
		if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = &qp->s_sge;
		len = qp->s_len;
		if (len > pmtu) {
			len = pmtu;
			break;
		}
		if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
			qp->s_state = OP(RDMA_WRITE_LAST);
		else {
			qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
			/* Immediate data comes after the BTH */
			ohdr->u.imm_data = wqe->wr.imm_data;
			hwords += 1;
			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
				bth0 |= 1 << 23;
		}
		bth2 |= 1 << 31;	/* Request ACK. */
		qp->s_cur++;
		if (qp->s_cur >= qp->s_size)
			qp->s_cur = 0;
		break;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/*
		 * This case can only happen if an RDMA read is restarted.
		 * See ipath_restart_rc().
		 */
		ipath_init_restart(qp, wqe);
		len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
		ohdr->u.rc.reth.vaddr =
			cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
		ohdr->u.rc.reth.rkey =
			cpu_to_be32(wqe->wr.wr.rdma.rkey);
		ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
		qp->s_state = OP(RDMA_READ_REQUEST);
		hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
		bth2 = qp->s_psn++ & IPATH_PSN_MASK;
		if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
			qp->s_next_psn = qp->s_psn;
		ss = NULL;
		len = 0;
		qp->s_cur++;
		if (qp->s_cur == qp->s_size)
			qp->s_cur = 0;
		break;
	}
	if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
		bth2 |= 1 << 31;	/* Request ACK. */
	qp->s_len -= len;
	qp->s_hdrwords = hwords;
	qp->s_cur_sge = ss;
	qp->s_cur_size = len;
	ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2);
done:
	ret = 1;
bail:
	spin_unlock_irqrestore(&qp->s_lock, flags);
	return ret;
}
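/*
 * Note on bth2: the low 24 bits carry the PSN (masked with
 * IPATH_PSN_MASK) and bit 31 is the "request ACK" bit, so the
 * "bth2 |= 1 << 31" assignments above ask the responder to acknowledge.
 */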

/**
 * send_rc_ack - Construct an ACK packet and send it
 * @qp: a pointer to the QP
 *
 * This is called from ipath_rc_rcv() and only uses the receive
 * side QP state.
 * Note that RDMA reads and atomics are handled in the
 * send side QP state and tasklet.
 */
static void send_rc_ack(struct ipath_qp *qp)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	u16 lrh0;
	u32 bth0;
	u32 hwords;
	struct ipath_ib_header hdr;
	struct ipath_other_headers *ohdr;
	unsigned long flags;

	/* Don't send ACK or NAK if an RDMA read or atomic is pending. */
	if (qp->r_head_ack_queue != qp->s_tail_ack_queue ||
	    (qp->s_flags & IPATH_S_ACK_PENDING) ||
	    qp->s_ack_state != OP(ACKNOWLEDGE))
		goto queue_ack;

	/* Construct the header. */
	ohdr = &hdr.u.oth;
	lrh0 = IPATH_LRH_BTH;
	/* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
	hwords = 6;
	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
		hwords += ipath_make_grh(dev, &hdr.u.l.grh,
					 &qp->remote_ah_attr.grh,
					 hwords, 0);
		ohdr = &hdr.u.l.oth;
		lrh0 = IPATH_LRH_GRH;
	}
	/* read pkey_index w/o lock (it's atomic) */
	bth0 = ipath_get_pkey(dev->dd, qp->s_pkey_index) |
		(OP(ACKNOWLEDGE) << 24) | (1 << 22);
	if (qp->r_nak_state)
		ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
					   (qp->r_nak_state <<
					    IPATH_AETH_CREDIT_SHIFT));
	else
		ohdr->u.aeth = ipath_compute_aeth(qp);
	lrh0 |= qp->remote_ah_attr.sl << 4;
	hdr.lrh[0] = cpu_to_be16(lrh0);
	hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
	hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
	hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid);
	ohdr->bth[0] = cpu_to_be32(bth0);
	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
	ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);

	/*
	 * If we can send the ACK, clear the ACK state.
	 */
	if (ipath_verbs_send(qp, &hdr, hwords, NULL, 0) == 0) {
		dev->n_unicast_xmit++;
		goto done;
	}

	/*
	 * We are out of PIO buffers at the moment.
	 * Pass responsibility for sending the ACK to the
	 * send tasklet so that when a PIO buffer becomes
	 * available, the ACK is sent ahead of other outgoing
	 * packets.
	 */

queue_ack:
	spin_lock_irqsave(&qp->s_lock, flags);
	qp->s_flags |= IPATH_S_ACK_PENDING;
	qp->s_nak_state = qp->r_nak_state;
	qp->s_ack_psn = qp->r_ack_psn;
	spin_unlock_irqrestore(&qp->s_lock, flags);

	/* Call ipath_do_rc_send() in another thread. */
	tasklet_hi_schedule(&qp->s_task);

done:
	return;
}

/**
 * reset_psn - reset the QP state to send starting from PSN
 * @qp: the QP
 * @psn: the packet sequence number to restart at
 *
 * This is called from ipath_rc_rcv() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held.
 */
static void reset_psn(struct ipath_qp *qp, u32 psn)
{
	u32 n = qp->s_last;
	struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
	u32 opcode;

	qp->s_cur = n;

	/*
	 * If we are starting the request from the beginning,
	 * let the normal send code handle initialization.
	 */
	if (ipath_cmp24(psn, wqe->psn) <= 0) {
		qp->s_state = OP(SEND_LAST);
		goto done;
	}

	/* Find the work request opcode corresponding to the given PSN. */
	opcode = wqe->wr.opcode;
	for (;;) {
		int diff;

		if (++n == qp->s_size)
			n = 0;
		if (n == qp->s_tail)
			break;
		wqe = get_swqe_ptr(qp, n);
		diff = ipath_cmp24(psn, wqe->psn);
		if (diff < 0)
			break;
		qp->s_cur = n;
		/*
		 * If we are starting the request from the beginning,
		 * let the normal send code handle initialization.
		 */
		if (diff == 0) {
			qp->s_state = OP(SEND_LAST);
			goto done;
		}
		opcode = wqe->wr.opcode;
	}

	/*
	 * Set the state to restart in the middle of a request.
	 * Don't change the s_sge, s_cur_sge, or s_cur_size.
	 * See ipath_do_rc_send().
	 */
	switch (opcode) {
	case IB_WR_SEND:
	case IB_WR_SEND_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
		break;

	case IB_WR_RDMA_WRITE:
	case IB_WR_RDMA_WRITE_WITH_IMM:
		qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
		break;

	case IB_WR_RDMA_READ:
		qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
		break;

	default:
		/*
		 * This case shouldn't happen since it's only
		 * one PSN per request.
		 */
		qp->s_state = OP(SEND_LAST);
	}
done:
	qp->s_psn = psn;
}
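/*
 * PSNs live in a 24-bit circular space; ipath_cmp24() is used here like
 * a signed comparison of that space (negative, zero, or positive), so
 * "ipath_cmp24(psn, wqe->psn) <= 0" above means the restart PSN is at
 * or before the start of the request.
 */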

/**
 * ipath_restart_rc - back up requester to resend the last un-ACKed request
 * @qp: the QP to restart
 * @psn: packet sequence number for the request
 * @wc: the work completion request
 *
 * The QP s_lock should be held and interrupts disabled.
 */
void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc)
{
	struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
	struct ipath_ibdev *dev;

	if (qp->s_retry == 0) {
		wc->wr_id = wqe->wr.wr_id;
		wc->status = IB_WC_RETRY_EXC_ERR;
		wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
		wc->src_qp = qp->remote_qpn;
		wc->slid = qp->remote_ah_attr.dlid;
		wc->sl = qp->remote_ah_attr.sl;
		wc->dlid_path_bits = 0;
		ipath_sqerror_qp(qp, wc);
		goto bail;
	}
	qp->s_retry--;

	/*
	 * Remove the QP from the timeout queue.
	 * Note: it may already have been removed by ipath_ib_timer().
	 */
	dev = to_idev(qp->ibqp.device);
	spin_lock(&dev->pending_lock);
	if (!list_empty(&qp->timerwait))
		list_del_init(&qp->timerwait);
	spin_unlock(&dev->pending_lock);

	if (wqe->wr.opcode == IB_WR_RDMA_READ)
		dev->n_rc_resends++;
	else
		dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK;

	reset_psn(qp, psn);
	tasklet_hi_schedule(&qp->s_task);

bail:
	return;
}

static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
{
	if (qp->s_wait_credit) {
		qp->s_wait_credit = 0;
		tasklet_hi_schedule(&qp->s_task);
	}
	qp->s_last_psn = psn;
}

/**
 * do_rc_ack - process an incoming RC ACK
 * @qp: the QP the ACK came in on
 * @psn: the packet sequence number of the ACK
 * @opcode: the opcode of the request that resulted in the ACK
 *
 * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held and interrupts disabled.
 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
 */
static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
		     u64 val)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	struct ib_wc wc;
	struct ipath_swqe *wqe;
	int ret = 0;
	u32 ack_psn;
	int diff;

	/*
	 * Remove the QP from the timeout queue (or RNR timeout queue).
	 * If ipath_ib_timer() has already removed it,
	 * it's OK since we hold the QP s_lock and ipath_restart_rc()
	 * just won't find anything to restart if we ACK everything.
	 */
	spin_lock(&dev->pending_lock);
	if (!list_empty(&qp->timerwait))
		list_del_init(&qp->timerwait);
	spin_unlock(&dev->pending_lock);

	/*
	 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
	 * requests and implicitly NAK RDMA read and atomic requests issued
	 * before the NAK'ed request.  The MSN won't include the NAK'ed
	 * request but will include the ACK'ed request(s).
	 */
	ack_psn = psn;
	if (aeth >> 29)
		ack_psn--;
	wqe = get_swqe_ptr(qp, qp->s_last);

	/*
	 * The MSN might be for a later WQE than the PSN indicates so
	 * only complete WQEs that the PSN finishes.
	 */
	while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) {
		/*
		 * RDMA_READ_RESPONSE_ONLY is a special case since
		 * we want to generate completion events for everything
		 * before the RDMA read, copy the data, then generate
		 * the completion for the read.
		 */
		if (wqe->wr.opcode == IB_WR_RDMA_READ &&
		    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
		    diff == 0) {
			ret = 1;
			goto bail;
		}
		/*
		 * If this request is an RDMA read or atomic, and the ACK is
		 * for a later operation, this ACK NAKs the RDMA read or
		 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
		 * can ACK an RDMA read and likewise for atomic ops.  Note
		 * that the NAK case can only happen if relaxed ordering is
		 * used and requests are sent after an RDMA read or atomic
		 * is sent but before the response is received.
		 */
		if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
		     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
		    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
		     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
			/*
			 * The last valid PSN seen is the previous
			 * request's.
			 */
			update_last_psn(qp, wqe->psn - 1);
			/* Retry this request. */
			ipath_restart_rc(qp, wqe->psn, &wc);
			/*
			 * No need to process the ACK/NAK since we are
			 * restarting an earlier request.
			 */
			goto bail;
		}
		if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
			*(u64 *) wqe->sg_list[0].vaddr = val;
		if (qp->s_num_rd_atomic &&
		    (wqe->wr.opcode == IB_WR_RDMA_READ ||
		     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
		     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
			qp->s_num_rd_atomic--;
			/* Restart sending task if fence is complete */
			if ((qp->s_flags & IPATH_S_FENCE_PENDING) &&
			    !qp->s_num_rd_atomic) {
				qp->s_flags &= ~IPATH_S_FENCE_PENDING;
				tasklet_hi_schedule(&qp->s_task);
			} else if (qp->s_flags & IPATH_S_RDMAR_PENDING) {
				qp->s_flags &= ~IPATH_S_RDMAR_PENDING;
				tasklet_hi_schedule(&qp->s_task);
			}
		}
		/* Post a send completion queue entry if requested. */
		if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
		    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
			wc.wr_id = wqe->wr.wr_id;
			wc.status = IB_WC_SUCCESS;
			wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
			wc.byte_len = wqe->length;
			wc.src_qp = qp->remote_qpn;
			wc.slid = qp->remote_ah_attr.dlid;
			wc.sl = qp->remote_ah_attr.sl;
			wc.dlid_path_bits = 0;
			ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
		}
		qp->s_retry = qp->s_retry_cnt;
		/*
		 * If we are completing a request which is in the process of
		 * being resent, we can stop resending it since we know the
		 * responder has already seen it.
		 */
		if (qp->s_last == qp->s_cur) {
			if (++qp->s_cur >= qp->s_size)
				qp->s_cur = 0;
			qp->s_last = qp->s_cur;
			if (qp->s_last == qp->s_tail)
				break;
			wqe = get_swqe_ptr(qp, qp->s_cur);
			qp->s_state = OP(SEND_LAST);
			qp->s_psn = wqe->psn;
		} else {
			if (++qp->s_last >= qp->s_size)
				qp->s_last = 0;
			if (qp->s_last == qp->s_tail)
				break;
			wqe = get_swqe_ptr(qp, qp->s_last);
		}
	}

	switch (aeth >> 29) {
	case 0:		/* ACK */
		/* If this is a partial ACK, reset the retransmit timer. */
		if (qp->s_last != qp->s_tail) {
			spin_lock(&dev->pending_lock);
			list_add_tail(&qp->timerwait,
				      &dev->pending[dev->pending_index]);
			spin_unlock(&dev->pending_lock);
			/*
			 * If we get a partial ACK for a resent operation,
			 * we can stop resending the earlier packets and
			 * continue with the next packet the receiver wants.
			 */
			if (ipath_cmp24(qp->s_psn, psn) <= 0) {
				reset_psn(qp, psn + 1);
				tasklet_hi_schedule(&qp->s_task);
			}
		} else if (ipath_cmp24(qp->s_psn, psn) <= 0) {
			qp->s_state = OP(SEND_LAST);
			qp->s_psn = psn + 1;
		}
		ipath_get_credit(qp, aeth);
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		qp->s_retry = qp->s_retry_cnt;
		update_last_psn(qp, psn);
		ret = 1;
		goto bail;

	case 1:		/* RNR NAK */
		if (qp->s_last == qp->s_tail)
			goto bail;
		if (qp->s_rnr_retry == 0) {
			wc.status = IB_WC_RNR_RETRY_EXC_ERR;
			goto class_b;
		}
		if (qp->s_rnr_retry_cnt < 7)
			qp->s_rnr_retry--;

		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);

		if (wqe->wr.opcode == IB_WR_RDMA_READ)
			dev->n_rc_resends++;
		else
			dev->n_rc_resends +=
				(qp->s_psn - psn) & IPATH_PSN_MASK;

		reset_psn(qp, psn);

		qp->s_rnr_timeout =
			ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &
					   IPATH_AETH_CREDIT_MASK];
		ipath_insert_rnr_queue(qp);
		goto bail;

	case 3:		/* NAK */
		if (qp->s_last == qp->s_tail)
			goto bail;
		/* The last valid PSN is the previous PSN. */
		update_last_psn(qp, psn - 1);
		switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
			IPATH_AETH_CREDIT_MASK) {
		case 0:	/* PSN sequence error */
			/*
			 * Back up to the responder's expected PSN.
			 * Note that we might get a NAK in the middle of an
			 * RDMA READ response which terminates the RDMA
			 * READ.
			 */
			ipath_restart_rc(qp, psn, &wc);
			break;

		case 1:	/* Invalid Request */
			wc.status = IB_WC_REM_INV_REQ_ERR;
			dev->n_other_naks++;
			goto class_b;

		case 2:	/* Remote Access Error */
			wc.status = IB_WC_REM_ACCESS_ERR;
			dev->n_other_naks++;
			goto class_b;

		case 3:	/* Remote Operation Error */
			wc.status = IB_WC_REM_OP_ERR;
			dev->n_other_naks++;
		class_b:
			wc.wr_id = wqe->wr.wr_id;
			wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
			wc.src_qp = qp->remote_qpn;
			wc.slid = qp->remote_ah_attr.dlid;
			wc.sl = qp->remote_ah_attr.sl;
			wc.dlid_path_bits = 0;
			ipath_sqerror_qp(qp, &wc);
			break;

		default:
			/* Ignore other reserved NAK error codes */
			goto reserved;
		}
		qp->s_rnr_retry = qp->s_rnr_retry_cnt;
		goto bail;

	default:		/* 2: reserved */
	reserved:
		/* Ignore reserved NAK codes. */
		goto bail;
	}

bail:
	return ret;
}
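/*
 * AETH decoding as used above: bits 31:29 select ACK (0), RNR NAK (1),
 * reserved (2), or NAK (3); the field extracted with
 * IPATH_AETH_CREDIT_SHIFT and IPATH_AETH_CREDIT_MASK holds the credit
 * count (for ACKs) or the RNR/NAK code.
 */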

/**
 * ipath_rc_rcv_resp - process an incoming RC response packet
 * @dev: the device this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @hdrsize: the header length
 * @pmtu: the path MTU
 * @header_in_data: true if part of the header data is in the data buffer
 *
 * This is called from ipath_rc_rcv() to process an incoming RC response
 * packet for the given QP.
 * Called at interrupt level.
 */
static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
				     struct ipath_other_headers *ohdr,
				     void *data, u32 tlen,
				     struct ipath_qp *qp,
				     u32 opcode,
				     u32 psn, u32 hdrsize, u32 pmtu,
				     int header_in_data)
{
	struct ipath_swqe *wqe;
	unsigned long flags;
	struct ib_wc wc;
	int diff;
	u32 pad;
	u32 aeth;
	u64 val;

	spin_lock_irqsave(&qp->s_lock, flags);

	/* Ignore invalid responses. */
	if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
		goto ack_done;

	/* Ignore duplicate responses. */
	diff = ipath_cmp24(psn, qp->s_last_psn);
	if (unlikely(diff <= 0)) {
		/* Update credits for "ghost" ACKs */
		if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
			if (!header_in_data)
				aeth = be32_to_cpu(ohdr->u.aeth);
			else {
				aeth = be32_to_cpu(((__be32 *) data)[0]);
				data += sizeof(__be32);
			}
			if ((aeth >> 29) == 0)
				ipath_get_credit(qp, aeth);
		}
		goto ack_done;
	}

	if (unlikely(qp->s_last == qp->s_tail))
		goto ack_done;
	wqe = get_swqe_ptr(qp, qp->s_last);

	switch (opcode) {
	case OP(ACKNOWLEDGE):
	case OP(ATOMIC_ACKNOWLEDGE):
	case OP(RDMA_READ_RESPONSE_FIRST):
		if (!header_in_data)
			aeth = be32_to_cpu(ohdr->u.aeth);
		else {
			aeth = be32_to_cpu(((__be32 *) data)[0]);
			data += sizeof(__be32);
		}
		if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
			if (!header_in_data) {
				__be32 *p = ohdr->u.at.atomic_ack_eth;

				val = ((u64) be32_to_cpu(p[0]) << 32) |
					be32_to_cpu(p[1]);
			} else
				val = be64_to_cpu(((__be64 *) data)[0]);
		} else
			val = 0;
		if (!do_rc_ack(qp, aeth, psn, opcode, val) ||
		    opcode != OP(RDMA_READ_RESPONSE_FIRST))
			goto ack_done;
		hdrsize += 4;
		wqe = get_swqe_ptr(qp, qp->s_last);
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_middle;

	case OP(RDMA_READ_RESPONSE_MIDDLE):
		/* no AETH, no ACK */
		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
			ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
			goto ack_done;
		}
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
	read_middle:
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto ack_len_err;
		if (unlikely(pmtu >= qp->s_rdma_read_len))
			goto ack_len_err;

		/* We got a response so update the timeout. */
		spin_lock(&dev->pending_lock);
		if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
			list_move_tail(&qp->timerwait,
				       &dev->pending[dev->pending_index]);
		spin_unlock(&dev->pending_lock);
		/*
		 * Update the RDMA receive state but do the copy w/o
		 * holding the locks and blocking interrupts.
		 */
		qp->s_rdma_read_len -= pmtu;
		update_last_psn(qp, psn);
		spin_unlock_irqrestore(&qp->s_lock, flags);
		ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu);
		goto bail;

	case OP(RDMA_READ_RESPONSE_ONLY):
		if (!header_in_data)
			aeth = be32_to_cpu(ohdr->u.aeth);
		else
			aeth = be32_to_cpu(((__be32 *) data)[0]);
		if (!do_rc_ack(qp, aeth, psn, opcode, 0))
			goto ack_done;
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/*
		 * Check that the data size is >= 0 && <= pmtu.
		 * Remember to account for the AETH header (4) and
		 * ICRC (4).
		 */
		if (unlikely(tlen < (hdrsize + pad + 8)))
			goto ack_len_err;
		/*
		 * If this is a response to a resent RDMA read, we
		 * have to be careful to copy the data to the right
		 * location.
		 */
		wqe = get_swqe_ptr(qp, qp->s_last);
		qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
						  wqe, psn, pmtu);
		goto read_last;

	case OP(RDMA_READ_RESPONSE_LAST):
		/* ACKs READ req. */
		if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
			ipath_restart_rc(qp, qp->s_last_psn + 1, &wc);
			goto ack_done;
		}
		if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
			goto ack_op_err;
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/*
		 * Check that the data size is >= 1 && <= pmtu.
		 * Remember to account for the AETH header (4) and
		 * ICRC (4).
		 */
		if (unlikely(tlen <= (hdrsize + pad + 8)))
			goto ack_len_err;
	read_last:
		tlen -= hdrsize + pad + 8;
		if (unlikely(tlen != qp->s_rdma_read_len))
			goto ack_len_err;
		if (!header_in_data)
			aeth = be32_to_cpu(ohdr->u.aeth);
		else {
			aeth = be32_to_cpu(((__be32 *) data)[0]);
			data += sizeof(__be32);
		}
		ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);
		(void) do_rc_ack(qp, aeth, psn,
				 OP(RDMA_READ_RESPONSE_LAST), 0);
		goto ack_done;
	}

ack_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
	goto bail;

ack_op_err:
	wc.status = IB_WC_LOC_QP_OP_ERR;
	goto ack_err;

ack_len_err:
	wc.status = IB_WC_LOC_LEN_ERR;
ack_err:
	wc.wr_id = wqe->wr.wr_id;
	wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
	wc.src_qp = qp->remote_qpn;
	wc.slid = qp->remote_ah_attr.dlid;
	wc.sl = qp->remote_ah_attr.sl;
	wc.dlid_path_bits = 0;
	ipath_sqerror_qp(qp, &wc);
	spin_unlock_irqrestore(&qp->s_lock, flags);
bail:
	return;
}

/**
 * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
 * @dev: the device this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @diff: the difference between the PSN and the expected PSN
 * @header_in_data: true if part of the header data is in the data buffer
 *
 * This is called from ipath_rc_rcv() to process an unexpected
 * incoming RC packet for the given QP.
 * Called at interrupt level.
 * Return 1 if no more processing is needed; otherwise return 0 to
 * schedule a response to be sent.
 */
static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
				     struct ipath_other_headers *ohdr,
				     void *data,
				     struct ipath_qp *qp,
				     u32 opcode,
				     u32 psn,
				     int diff,
				     int header_in_data)
{
	struct ipath_ack_entry *e;
	u8 i, prev;
	int old_req;
	unsigned long flags;

	if (diff > 0) {
		/*
		 * Packet sequence error.
		 * A NAK will ACK earlier sends and RDMA writes.
		 * Don't queue the NAK if we already sent one.
		 */
		if (!qp->r_nak_state) {
			qp->r_nak_state = IB_NAK_PSN_ERROR;
			/* Use the expected PSN. */
			qp->r_ack_psn = qp->r_psn;
			goto send_ack;
		}
		goto done;
	}

	/*
	 * Handle a duplicate request.  Don't re-execute SEND, RDMA
	 * write or atomic op.  Don't NAK errors, just silently drop
	 * the duplicate request.  Note that r_sge, r_len, and
	 * r_rcv_len may be in use so don't modify them.
	 *
	 * We are supposed to ACK the earliest duplicate PSN but we
	 * can coalesce an outstanding duplicate ACK.  We have to
	 * send the earliest so that RDMA reads can be restarted at
	 * the requester's expected PSN.
	 *
	 * First, find where this duplicate PSN falls within the
	 * ACKs previously sent.
	 */
	psn &= IPATH_PSN_MASK;
	e = NULL;
	old_req = 1;
	spin_lock_irqsave(&qp->s_lock, flags);
	for (i = qp->r_head_ack_queue; ; i = prev) {
		if (i == qp->s_tail_ack_queue)
			old_req = 0;
		if (i)
			prev = i - 1;
		else
			prev = IPATH_MAX_RDMA_ATOMIC;
		if (prev == qp->r_head_ack_queue) {
			e = NULL;
			break;
		}
		e = &qp->s_ack_queue[prev];
		if (!e->opcode) {
			e = NULL;
			break;
		}
		if (ipath_cmp24(psn, e->psn) >= 0) {
			if (prev == qp->s_tail_ack_queue)
				old_req = 0;
			break;
		}
	}
	switch (opcode) {
	case OP(RDMA_READ_REQUEST): {
		struct ib_reth *reth;
		u32 offset;
		u32 len;

		/*
		 * If we didn't find the RDMA read request in the ack queue,
		 * or the send tasklet is already backed up to send an
		 * earlier entry, we can ignore this request.
		 */
		if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req)
			goto unlock_done;
		/* RETH comes after BTH */
		if (!header_in_data)
			reth = &ohdr->u.rc.reth;
		else {
			reth = (struct ib_reth *)data;
			data += sizeof(*reth);
		}
		/*
		 * Address range must be a subset of the original
		 * request and start on pmtu boundaries.
		 * We reuse the old ack_queue slot since the requester
		 * should not back up and request an earlier PSN for the
		 * same request.
		 */
		offset = ((psn - e->psn) & IPATH_PSN_MASK) *
			ib_mtu_enum_to_int(qp->path_mtu);
		len = be32_to_cpu(reth->length);
		if (unlikely(offset + len > e->rdma_sge.sge.sge_length))
			goto unlock_done;
		if (len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			ok = ipath_rkey_ok(qp, &e->rdma_sge,
					   len, vaddr, rkey,
					   IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto unlock_done;
		} else {
			e->rdma_sge.sg_list = NULL;
			e->rdma_sge.num_sge = 0;
			e->rdma_sge.sge.mr = NULL;
			e->rdma_sge.sge.vaddr = NULL;
			e->rdma_sge.sge.length = 0;
			e->rdma_sge.sge.sge_length = 0;
		}
		e->psn = psn;
		qp->s_ack_state = OP(ACKNOWLEDGE);
		qp->s_tail_ack_queue = prev;
		break;
	}

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		/*
		 * If we didn't find the atomic request in the ack queue
		 * or the send tasklet is already backed up to send an
		 * earlier entry, we can ignore this request.
		 */
		if (!e || e->opcode != (u8) opcode || old_req)
			goto unlock_done;
		qp->s_ack_state = OP(ACKNOWLEDGE);
		qp->s_tail_ack_queue = prev;
		break;
	}

	default:
		if (old_req)
			goto unlock_done;
		/*
		 * Resend the most recent ACK if this request is
		 * after all the previous RDMA reads and atomics.
		 */
		if (i == qp->r_head_ack_queue) {
			spin_unlock_irqrestore(&qp->s_lock, flags);
			qp->r_nak_state = 0;
			qp->r_ack_psn = qp->r_psn - 1;
			goto send_ack;
		}
		/*
		 * Try to send a simple ACK to work around a Mellanox bug
		 * which doesn't accept an RDMA read response or atomic
		 * response as an ACK for earlier SENDs or RDMA writes.
		 */
		if (qp->r_head_ack_queue == qp->s_tail_ack_queue &&
		    !(qp->s_flags & IPATH_S_ACK_PENDING) &&
		    qp->s_ack_state == OP(ACKNOWLEDGE)) {
			spin_unlock_irqrestore(&qp->s_lock, flags);
			qp->r_nak_state = 0;
			qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
			goto send_ack;
		}
		/*
		 * Resend the RDMA read or atomic op which
		 * ACKs this duplicate request.
		 */
		qp->s_ack_state = OP(ACKNOWLEDGE);
		qp->s_tail_ack_queue = i;
		break;
	}
	qp->r_nak_state = 0;
	tasklet_hi_schedule(&qp->s_task);

unlock_done:
	spin_unlock_irqrestore(&qp->s_lock, flags);
done:
	return 1;

send_ack:
	return 0;
}
static void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
{
	unsigned long flags;
	int lastwqe;

	spin_lock_irqsave(&qp->s_lock, flags);
	qp->state = IB_QPS_ERR;
	lastwqe = ipath_error_qp(qp, err);
	spin_unlock_irqrestore(&qp->s_lock, flags);

	if (lastwqe) {
		struct ib_event ev;

		ev.device = qp->ibqp.device;
		ev.element.qp = &qp->ibqp;
		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
	}
}

static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)
{
	unsigned long flags;
	unsigned next;

	next = n + 1;
	if (next > IPATH_MAX_RDMA_ATOMIC)
		next = 0;
	spin_lock_irqsave(&qp->s_lock, flags);
	if (n == qp->s_tail_ack_queue) {
		qp->s_tail_ack_queue = next;
		qp->s_ack_state = OP(ACKNOWLEDGE);
	}
	spin_unlock_irqrestore(&qp->s_lock, flags);
}

/**
 * ipath_rc_rcv - process an incoming RC packet
 * @dev: the device this packet came in on
 * @hdr: the header of this packet
 * @has_grh: true if the header has a GRH
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 *
 * This is called from ipath_qp_rcv() to process an incoming RC packet
 * for the given QP.
 * Called at interrupt level.
 */
void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
{
	struct ipath_other_headers *ohdr;
	u32 opcode;
	u32 hdrsize;
	u32 psn;
	u32 pad;
	struct ib_wc wc;
	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
	int diff;
	struct ib_reth *reth;
	int header_in_data;

	/* Validate the SLID. See Ch. 9.6.1.5 */
	if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
		goto done;

	/* Check for GRH */
	if (!has_grh) {
		ohdr = &hdr->u.oth;
		hdrsize = 8 + 12;	/* LRH + BTH */
		psn = be32_to_cpu(ohdr->bth[2]);
		header_in_data = 0;
	} else {
		ohdr = &hdr->u.l.oth;
		hdrsize = 8 + 40 + 12;	/* LRH + GRH + BTH */
		/*
		 * The header with GRH is 60 bytes and the core driver sets
		 * the eager header buffer size to 56 bytes so the last 4
		 * bytes of the BTH header (PSN) is in the data buffer.
		 */
		header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
		if (header_in_data) {
			psn = be32_to_cpu(((__be32 *) data)[0]);
			data += sizeof(__be32);
		} else
			psn = be32_to_cpu(ohdr->bth[2]);
	}

	/*
	 * Process responses (ACKs) before anything else.  Note that the
	 * packet sequence number will be for something in the send work
	 * queue rather than the expected receive packet sequence number.
	 * In other words, this QP is the requester.
	 */
	opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
	if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
	    opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
		ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
				  hdrsize, pmtu, header_in_data);
		goto done;
	}

	/* Compute 24 bits worth of difference. */
	diff = ipath_cmp24(psn, qp->r_psn);
	if (unlikely(diff)) {
		if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
				       psn, diff, header_in_data))
			goto done;
		goto send_ack;
	}

	/* Check for opcode sequence errors. */
	switch (qp->r_state) {
	case OP(SEND_FIRST):
	case OP(SEND_MIDDLE):
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
			break;
	nack_inv:
		ipath_rc_error(qp, IB_WC_REM_INV_REQ_ERR);
		qp->r_nak_state = IB_NAK_INVALID_REQUEST;
		qp->r_ack_psn = qp->r_psn;
		goto send_ack;

	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_MIDDLE):
		if (opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			break;
		goto nack_inv;

	default:
		if (opcode == OP(SEND_MIDDLE) ||
		    opcode == OP(SEND_LAST) ||
		    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
		    opcode == OP(RDMA_WRITE_MIDDLE) ||
		    opcode == OP(RDMA_WRITE_LAST) ||
		    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
			goto nack_inv;
		/*
		 * Note that it is up to the requester to not send a new
		 * RDMA read or atomic operation before receiving an ACK
		 * for the previous operation.
		 */
		break;
	}

	wc.imm_data = 0;
	wc.wc_flags = 0;

	/* OK, process the packet. */
	switch (opcode) {
	case OP(SEND_FIRST):
		if (!ipath_get_rwqe(qp, 0)) {
		rnr_nak:
			/*
			 * An RNR NAK will ACK earlier sends and RDMA writes.
			 * Don't queue the NAK if an RDMA read or atomic
			 * is pending though.
			 */
			if (qp->r_nak_state)
				goto done;
			qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
			qp->r_ack_psn = qp->r_psn;
			goto send_ack;
		}
		qp->r_rcv_len = 0;
		/* FALLTHROUGH */
	case OP(SEND_MIDDLE):
	case OP(RDMA_WRITE_MIDDLE):
	send_middle:
		/* Check for invalid length PMTU or posted rwqe len. */
		if (unlikely(tlen != (hdrsize + pmtu + 4)))
			goto nack_inv;
		qp->r_rcv_len += pmtu;
		if (unlikely(qp->r_rcv_len > qp->r_len))
			goto nack_inv;
		ipath_copy_sge(&qp->r_sge, data, pmtu);
		break;

	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
		/* consume RWQE */
		if (!ipath_get_rwqe(qp, 1))
			goto rnr_nak;
		goto send_last_imm;

	case OP(SEND_ONLY):
	case OP(SEND_ONLY_WITH_IMMEDIATE):
		if (!ipath_get_rwqe(qp, 0))
			goto rnr_nak;
		qp->r_rcv_len = 0;
		if (opcode == OP(SEND_ONLY))
			goto send_last;
		/* FALLTHROUGH */
	case OP(SEND_LAST_WITH_IMMEDIATE):
	send_last_imm:
		if (header_in_data) {
			wc.imm_data = *(__be32 *) data;
			data += sizeof(__be32);
		} else {
			/* Immediate data comes after BTH */
			wc.imm_data = ohdr->u.imm_data;
		}
		hdrsize += 4;
		wc.wc_flags = IB_WC_WITH_IMM;
		/* FALLTHROUGH */
	case OP(SEND_LAST):
	case OP(RDMA_WRITE_LAST):
	send_last:
		/* Get the number of bytes the message was padded by. */
		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
		/* Check for invalid length. */
		/* XXX LAST len should be >= 1 */
		if (unlikely(tlen < (hdrsize + pad + 4)))
			goto nack_inv;
		/* Don't count the CRC. */
		tlen -= (hdrsize + pad + 4);
		wc.byte_len = tlen + qp->r_rcv_len;
		if (unlikely(wc.byte_len > qp->r_len))
			goto nack_inv;
		ipath_copy_sge(&qp->r_sge, data, tlen);
		qp->r_msn++;
		if (!qp->r_wrid_valid)
			break;
		qp->r_wrid_valid = 0;
		wc.wr_id = qp->r_wr_id;
		wc.status = IB_WC_SUCCESS;
		wc.opcode = IB_WC_RECV;
		wc.src_qp = qp->remote_qpn;
		wc.slid = qp->remote_ah_attr.dlid;
		wc.sl = qp->remote_ah_attr.sl;
		wc.dlid_path_bits = 0;
		/* Signal completion event if the solicited bit is set. */
		ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
			       (ohdr->bth[0] &
				__constant_cpu_to_be32(1 << 23)) != 0);
		break;

	case OP(RDMA_WRITE_FIRST):
	case OP(RDMA_WRITE_ONLY):
	case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
		if (unlikely(!(qp->qp_access_flags &
			       IB_ACCESS_REMOTE_WRITE)))
			goto nack_acc;
		/* RETH comes after BTH */
		if (!header_in_data)
			reth = &ohdr->u.rc.reth;
		else {
			reth = (struct ib_reth *)data;
			data += sizeof(*reth);
		}
		hdrsize += sizeof(*reth);
		qp->r_len = be32_to_cpu(reth->length);
		qp->r_rcv_len = 0;
		if (qp->r_len != 0) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/* Check rkey & NAK */
			ok = ipath_rkey_ok(qp, &qp->r_sge,
					   qp->r_len, vaddr, rkey,
					   IB_ACCESS_REMOTE_WRITE);
			if (unlikely(!ok))
				goto nack_acc;
		} else {
			qp->r_sge.sg_list = NULL;
			qp->r_sge.sge.mr = NULL;
			qp->r_sge.sge.vaddr = NULL;
			qp->r_sge.sge.length = 0;
			qp->r_sge.sge.sge_length = 0;
		}
		if (opcode == OP(RDMA_WRITE_FIRST))
			goto send_middle;
		else if (opcode == OP(RDMA_WRITE_ONLY))
			goto send_last;
		if (!ipath_get_rwqe(qp, 1))
			goto rnr_nak;
		goto send_last_imm;

	case OP(RDMA_READ_REQUEST): {
		struct ipath_ack_entry *e;
		u32 len;
		u8 next;

		if (unlikely(!(qp->qp_access_flags &
			       IB_ACCESS_REMOTE_READ)))
			goto nack_acc;
		next = qp->r_head_ack_queue + 1;
		if (next > IPATH_MAX_RDMA_ATOMIC)
			next = 0;
		if (unlikely(next == qp->s_tail_ack_queue)) {
			if (!qp->s_ack_queue[next].sent)
				goto nack_inv;
			ipath_update_ack_queue(qp, next);
		}
		e = &qp->s_ack_queue[qp->r_head_ack_queue];
		/* RETH comes after BTH */
		if (!header_in_data)
			reth = &ohdr->u.rc.reth;
		else {
			reth = (struct ib_reth *)data;
			data += sizeof(*reth);
		}
		len = be32_to_cpu(reth->length);
		if (len) {
			u32 rkey = be32_to_cpu(reth->rkey);
			u64 vaddr = be64_to_cpu(reth->vaddr);
			int ok;

			/* Check rkey & NAK */
			ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr,
					   rkey, IB_ACCESS_REMOTE_READ);
			if (unlikely(!ok))
				goto nack_acc;
			/*
			 * Update the next expected PSN.  We add 1 later
			 * below, so only add the remainder here.
			 */
			if (len > pmtu)
				qp->r_psn += (len - 1) / pmtu;
		} else {
			e->rdma_sge.sg_list = NULL;
			e->rdma_sge.num_sge = 0;
			e->rdma_sge.sge.mr = NULL;
			e->rdma_sge.sge.vaddr = NULL;
			e->rdma_sge.sge.length = 0;
			e->rdma_sge.sge.sge_length = 0;
		}
		e->opcode = opcode;
		e->psn = psn;
		/*
		 * We need to increment the MSN here instead of when we
		 * finish sending the result since a duplicate request would
		 * increment it more than once.
		 */
		qp->r_msn++;
		qp->r_psn++;
		qp->r_state = opcode;
		qp->r_nak_state = 0;
		qp->r_head_ack_queue = next;

		/* Call ipath_do_rc_send() in another thread. */
		tasklet_hi_schedule(&qp->s_task);

		goto done;
	}

	case OP(COMPARE_SWAP):
	case OP(FETCH_ADD): {
		struct ib_atomic_eth *ateth;
		struct ipath_ack_entry *e;
		u64 vaddr;
		atomic64_t *maddr;
		u64 sdata;
		u32 rkey;
		u8 next;

		if (unlikely(!(qp->qp_access_flags &
			       IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_acc;
		next = qp->r_head_ack_queue + 1;
		if (next > IPATH_MAX_RDMA_ATOMIC)
			next = 0;
		if (unlikely(next == qp->s_tail_ack_queue)) {
			if (!qp->s_ack_queue[next].sent)
				goto nack_inv;
			ipath_update_ack_queue(qp, next);
		}
		if (!header_in_data)
			ateth = &ohdr->u.atomic_eth;
		else
			ateth = (struct ib_atomic_eth *)data;
		vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
			be32_to_cpu(ateth->vaddr[1]);
		if (unlikely(vaddr & (sizeof(u64) - 1)))
			goto nack_inv;
		rkey = be32_to_cpu(ateth->rkey);
		/* Check rkey & NAK */
		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
					    sizeof(u64), vaddr, rkey,
					    IB_ACCESS_REMOTE_ATOMIC)))
			goto nack_acc;
		/* Perform atomic OP and save result. */
		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
		sdata = be64_to_cpu(ateth->swap_data);
		e = &qp->s_ack_queue[qp->r_head_ack_queue];
		e->atomic_data = (opcode == OP(FETCH_ADD)) ?
			(u64) atomic64_add_return(sdata, maddr) - sdata :
			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
				      be64_to_cpu(ateth->compare_data),
				      sdata);
		e->opcode = opcode;
		e->psn = psn & IPATH_PSN_MASK;
		qp->r_msn++;
		qp->r_psn++;
		qp->r_state = opcode;
		qp->r_nak_state = 0;
		qp->r_head_ack_queue = next;

		/* Call ipath_do_rc_send() in another thread. */
		tasklet_hi_schedule(&qp->s_task);

		goto done;
	}

	default:
		/* NAK unknown opcodes. */
		goto nack_inv;
	}
	qp->r_psn++;
	qp->r_state = opcode;
	qp->r_ack_psn = psn;
	qp->r_nak_state = 0;
	/* Send an ACK if requested or required. */
	if (psn & (1 << 31))
		goto send_ack;
	goto done;

nack_acc:
	ipath_rc_error(qp, IB_WC_REM_ACCESS_ERR);
	qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
	qp->r_ack_psn = qp->r_psn;

send_ack:
	send_rc_ack(qp);

done:
	return;
}