/*
 * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
 * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "ipath_verbs.h"
#include "ipath_kernel.h"

/* cut down ridiculously long IB macro names */
#define OP(x) IB_OPCODE_RC_##x

static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe,
                       u32 psn, u32 pmtu)
{
        u32 len;

        len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
        ss->sge = wqe->sg_list[0];
        ss->sg_list = wqe->sg_list + 1;
        ss->num_sge = wqe->wr.num_sge;
        ipath_skip_sge(ss, len);
        return wqe->length - len;
}
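
/*
 * restart_sge() arithmetic, by way of example: with a 2048-byte path MTU
 * and a restart PSN three packets past wqe->psn, the SGE state is advanced
 * by (3 & IPATH_PSN_MASK) * 2048 = 6144 bytes and the returned value is the
 * number of bytes of the request still left to (re)send.
 */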

/**
 * ipath_init_restart - initialize the qp->s_sge after a restart
 * @qp: the QP whose SGE we're restarting
 * @wqe: the work queue entry to initialize the QP's SGE from
 *
 * The QP s_lock should be held and interrupts disabled.
 */
static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
{
        struct ipath_ibdev *dev;

        qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn,
                                ib_mtu_enum_to_int(qp->path_mtu));
        dev = to_idev(qp->ibqp.device);
        spin_lock(&dev->pending_lock);
        if (list_empty(&qp->timerwait))
                list_add_tail(&qp->timerwait,
                              &dev->pending[dev->pending_index]);
        spin_unlock(&dev->pending_lock);
}
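
/*
 * A note on the responder ack state used below: qp->s_ack_queue[] is a
 * small ring of IPATH_MAX_RDMA_ATOMIC + 1 entries.  The receive side queues
 * RDMA read and atomic responses at r_head_ack_queue and the send tasklet
 * drains them from s_tail_ack_queue; equal indexes mean nothing is queued.
 * The wrap tests therefore compare against IPATH_MAX_RDMA_ATOMIC rather
 * than using a power-of-two mask.
 */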

/**
 * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
 * @dev: the device for this QP
 * @qp: a pointer to the QP
 * @ohdr: a pointer to the IB header being constructed
 * @pmtu: the path MTU
 *
 * Return 1 if constructed; otherwise, return 0.
 * Note that we are in the responder's side of the QP context.
 * Note the QP s_lock must be held.
 */
static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
                             struct ipath_other_headers *ohdr, u32 pmtu)
{
        struct ipath_ack_entry *e;
        u32 hwords;
        u32 len;
        u32 bth0;
        u32 bth2;

        /* Don't send an ACK if we aren't supposed to. */
        if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
                goto bail;

        /* header size in 32-bit words LRH+BTH = (8+12)/4. */
        hwords = 5;

        switch (qp->s_ack_state) {
        case OP(RDMA_READ_RESPONSE_LAST):
        case OP(RDMA_READ_RESPONSE_ONLY):
        case OP(ATOMIC_ACKNOWLEDGE):
                /*
                 * We can increment the tail pointer now that the last
                 * response has been sent instead of only being
                 * constructed.
                 */
                if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
                        qp->s_tail_ack_queue = 0;
                /* FALLTHROUGH */
        case OP(SEND_ONLY):
        case OP(ACKNOWLEDGE):
                /* Check for no next entry in the queue. */
                if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
                        if (qp->s_flags & IPATH_S_ACK_PENDING)
                                goto normal;
                        qp->s_ack_state = OP(ACKNOWLEDGE);
                        goto bail;
                }

                e = &qp->s_ack_queue[qp->s_tail_ack_queue];
                if (e->opcode == OP(RDMA_READ_REQUEST)) {
                        /* Copy SGE state in case we need to resend */
                        qp->s_ack_rdma_sge = e->rdma_sge;
                        qp->s_cur_sge = &qp->s_ack_rdma_sge;
                        len = e->rdma_sge.sge.sge_length;
                        if (len > pmtu) {
                                len = pmtu;
                                qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
                        } else {
                                qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
                                e->sent = 1;
                        }
                        ohdr->u.aeth = ipath_compute_aeth(qp);
                        hwords++;
                        qp->s_ack_rdma_psn = e->psn;
                        bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
                } else {
                        /* COMPARE_SWAP or FETCH_ADD */
                        qp->s_cur_sge = NULL;
                        len = 0;
                        qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
                        ohdr->u.at.aeth = ipath_compute_aeth(qp);
                        ohdr->u.at.atomic_ack_eth[0] =
                                cpu_to_be32(e->atomic_data >> 32);
                        ohdr->u.at.atomic_ack_eth[1] =
                                cpu_to_be32(e->atomic_data);
                        hwords += sizeof(ohdr->u.at) / sizeof(u32);
                        bth2 = e->psn;
                        e->sent = 1;
                }
                bth0 = qp->s_ack_state << 24;
                break;

        case OP(RDMA_READ_RESPONSE_FIRST):
                qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
                /* FALLTHROUGH */
        case OP(RDMA_READ_RESPONSE_MIDDLE):
                len = qp->s_ack_rdma_sge.sge.sge_length;
                if (len > pmtu)
                        len = pmtu;
                else {
                        ohdr->u.aeth = ipath_compute_aeth(qp);
                        hwords++;
                        qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
                        qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1;
                }
                bth0 = qp->s_ack_state << 24;
                bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
                break;

        default:
        normal:
                /*
                 * Send a regular ACK.
                 * Set the s_ack_state so we wait until after sending
                 * the ACK before setting s_ack_state to ACKNOWLEDGE
                 * (see above).
                 */
                qp->s_ack_state = OP(SEND_ONLY);
                qp->s_flags &= ~IPATH_S_ACK_PENDING;
                qp->s_cur_sge = NULL;
                if (qp->s_nak_state)
                        ohdr->u.aeth =
                                cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
                                            (qp->s_nak_state <<
                                             IPATH_AETH_CREDIT_SHIFT));
                else
                        ohdr->u.aeth = ipath_compute_aeth(qp);
                hwords++;
                len = 0;
                bth0 = OP(ACKNOWLEDGE) << 24;
                bth2 = qp->s_ack_psn & IPATH_PSN_MASK;
        }
        qp->s_hdrwords = hwords;
        qp->s_cur_size = len;
        ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2);
        return 1;

bail:
        return 0;
}
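
/*
 * BTH bit usage in this file: bits 31:24 of bth0 carry the opcode, bit 23
 * the Solicited Event flag and bit 22 the "M" (MigReq) bit; bth2 carries
 * the 24-bit PSN with bit 31 doubling as the Ack Request flag.
 */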

/**
 * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
 * @qp: a pointer to the QP
 *
 * Return 1 if constructed; otherwise, return 0.
 */
int ipath_make_rc_req(struct ipath_qp *qp)
{
        struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
        struct ipath_other_headers *ohdr;
        struct ipath_sge_state *ss;
        struct ipath_swqe *wqe;
        u32 hwords;
        u32 len;
        u32 bth0;
        u32 bth2;
        u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
        char newreq;
        unsigned long flags;
        int ret = 0;

        ohdr = &qp->s_hdr.u.oth;
        if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
                ohdr = &qp->s_hdr.u.l.oth;

        /*
         * The lock is needed to synchronize between the sending tasklet,
         * the receive interrupt handler, and timeout resends.
         */
        spin_lock_irqsave(&qp->s_lock, flags);

        /* Sending responses has higher priority over sending requests. */
        if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||
             (qp->s_flags & IPATH_S_ACK_PENDING) ||
             qp->s_ack_state != OP(ACKNOWLEDGE)) &&
            ipath_make_rc_ack(dev, qp, ohdr, pmtu))
                goto done;

        if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
                if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
                        goto bail;
                /* We are in the error state, flush the work request. */
                if (qp->s_last == qp->s_head)
                        goto bail;
                /* If DMAs are in progress, we can't flush immediately. */
                if (atomic_read(&qp->s_dma_busy)) {
                        qp->s_flags |= IPATH_S_WAIT_DMA;
                        goto bail;
                }
                wqe = get_swqe_ptr(qp, qp->s_last);
                ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
                goto done;
        }

        /* Leave BUSY set until RNR timeout. */
        if (qp->s_rnr_timeout) {
                qp->s_flags |= IPATH_S_WAITING;
                goto bail;
        }

        /* header size in 32-bit words LRH+BTH = (8+12)/4. */
        hwords = 5;
        bth0 = 1 << 22; /* Set M bit */

        /* Send a request. */
        wqe = get_swqe_ptr(qp, qp->s_cur);
        switch (qp->s_state) {
        default:
                if (!(ib_ipath_state_ops[qp->state] &
                      IPATH_PROCESS_NEXT_SEND_OK))
                        goto bail;
                /*
                 * Resend an old request or start a new one.
                 *
                 * We keep track of the current SWQE so that
                 * we don't reset the "furthest progress" state
                 * if we need to back up.
                 */
                newreq = 0;
                if (qp->s_cur == qp->s_tail) {
                        /* Check if send work queue is empty. */
                        if (qp->s_tail == qp->s_head)
                                goto bail;
                        /*
                         * If a fence is requested, wait for previous
                         * RDMA read and atomic operations to finish.
                         */
                        if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
                            qp->s_num_rd_atomic) {
                                qp->s_flags |= IPATH_S_FENCE_PENDING;
                                goto bail;
                        }
                        wqe->psn = qp->s_next_psn;
                        newreq = 1;
                }
                /*
                 * Note that we have to be careful not to modify the
                 * original work request since we may need to resend
                 * it.
                 */
                len = wqe->length;
                ss = &qp->s_sge;
                bth2 = 0;
                switch (wqe->wr.opcode) {
                case IB_WR_SEND:
                case IB_WR_SEND_WITH_IMM:
                        /* If no credit, return. */
                        if (qp->s_lsn != (u32) -1 &&
                            ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
                                qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
                                goto bail;
                        }
                        wqe->lpsn = wqe->psn;
                        if (len > pmtu) {
                                wqe->lpsn += (len - 1) / pmtu;
                                qp->s_state = OP(SEND_FIRST);
                                len = pmtu;
                                break;
                        }
                        if (wqe->wr.opcode == IB_WR_SEND)
                                qp->s_state = OP(SEND_ONLY);
                        else {
                                qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
                                /* Immediate data comes after the BTH */
                                ohdr->u.imm_data = wqe->wr.ex.imm_data;
                                hwords += 1;
                        }
                        if (wqe->wr.send_flags & IB_SEND_SOLICITED)
                                bth0 |= 1 << 23;
                        bth2 = 1 << 31; /* Request ACK. */
                        if (++qp->s_cur == qp->s_size)
                                qp->s_cur = 0;
                        break;

                case IB_WR_RDMA_WRITE:
                        if (newreq && qp->s_lsn != (u32) -1)
                                qp->s_lsn++;
                        /* FALLTHROUGH */
                case IB_WR_RDMA_WRITE_WITH_IMM:
                        /* If no credit, return. */
                        if (qp->s_lsn != (u32) -1 &&
                            ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
                                qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
                                goto bail;
                        }
                        ohdr->u.rc.reth.vaddr =
                                cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
                        ohdr->u.rc.reth.rkey =
                                cpu_to_be32(wqe->wr.wr.rdma.rkey);
                        ohdr->u.rc.reth.length = cpu_to_be32(len);
                        hwords += sizeof(struct ib_reth) / sizeof(u32);
                        wqe->lpsn = wqe->psn;
                        if (len > pmtu) {
                                wqe->lpsn += (len - 1) / pmtu;
                                qp->s_state = OP(RDMA_WRITE_FIRST);
                                len = pmtu;
                                break;
                        }
                        if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
                                qp->s_state = OP(RDMA_WRITE_ONLY);
                        else {
                                qp->s_state =
                                        OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
                                /* Immediate data comes after RETH */
                                ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
                                hwords += 1;
                                if (wqe->wr.send_flags & IB_SEND_SOLICITED)
                                        bth0 |= 1 << 23;
                        }
                        bth2 = 1 << 31; /* Request ACK. */
                        if (++qp->s_cur == qp->s_size)
                                qp->s_cur = 0;
                        break;

                case IB_WR_RDMA_READ:
                        /*
                         * Don't allow more operations to be started
                         * than the QP limits allow.
                         */
                        if (newreq) {
                                if (qp->s_num_rd_atomic >=
                                    qp->s_max_rd_atomic) {
                                        qp->s_flags |= IPATH_S_RDMAR_PENDING;
                                        goto bail;
                                }
                                qp->s_num_rd_atomic++;
                                if (qp->s_lsn != (u32) -1)
                                        qp->s_lsn++;
                                /*
                                 * Adjust s_next_psn to count the
                                 * expected number of responses.
                                 */
                                if (len > pmtu)
                                        qp->s_next_psn += (len - 1) / pmtu;
                                wqe->lpsn = qp->s_next_psn++;
                        }
                        ohdr->u.rc.reth.vaddr =
                                cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
                        ohdr->u.rc.reth.rkey =
                                cpu_to_be32(wqe->wr.wr.rdma.rkey);
                        ohdr->u.rc.reth.length = cpu_to_be32(len);
                        qp->s_state = OP(RDMA_READ_REQUEST);
                        hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
                        ss = NULL;
                        len = 0;
                        if (++qp->s_cur == qp->s_size)
                                qp->s_cur = 0;
                        break;

                case IB_WR_ATOMIC_CMP_AND_SWP:
                case IB_WR_ATOMIC_FETCH_AND_ADD:
                        /*
                         * Don't allow more operations to be started
                         * than the QP limits allow.
                         */
                        if (newreq) {
                                if (qp->s_num_rd_atomic >=
                                    qp->s_max_rd_atomic) {
                                        qp->s_flags |= IPATH_S_RDMAR_PENDING;
                                        goto bail;
                                }
                                qp->s_num_rd_atomic++;
                                if (qp->s_lsn != (u32) -1)
                                        qp->s_lsn++;
                                wqe->lpsn = wqe->psn;
                        }
                        if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
                                qp->s_state = OP(COMPARE_SWAP);
                                ohdr->u.atomic_eth.swap_data = cpu_to_be64(
                                        wqe->wr.wr.atomic.swap);
                                ohdr->u.atomic_eth.compare_data = cpu_to_be64(
                                        wqe->wr.wr.atomic.compare_add);
                        } else {
                                qp->s_state = OP(FETCH_ADD);
                                ohdr->u.atomic_eth.swap_data = cpu_to_be64(
                                        wqe->wr.wr.atomic.compare_add);
                                ohdr->u.atomic_eth.compare_data = 0;
                        }
                        ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
                                wqe->wr.wr.atomic.remote_addr >> 32);
                        ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
                                wqe->wr.wr.atomic.remote_addr);
                        ohdr->u.atomic_eth.rkey = cpu_to_be32(
                                wqe->wr.wr.atomic.rkey);
                        hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
                        ss = NULL;
                        len = 0;
                        if (++qp->s_cur == qp->s_size)
                                qp->s_cur = 0;
                        break;

                default:
                        goto bail;
                }
                qp->s_sge.sge = wqe->sg_list[0];
                qp->s_sge.sg_list = wqe->sg_list + 1;
                qp->s_sge.num_sge = wqe->wr.num_sge;
                qp->s_len = wqe->length;
                if (newreq) {
                        qp->s_tail++;
                        if (qp->s_tail >= qp->s_size)
                                qp->s_tail = 0;
                }
                bth2 |= qp->s_psn & IPATH_PSN_MASK;
                if (wqe->wr.opcode == IB_WR_RDMA_READ)
                        qp->s_psn = wqe->lpsn + 1;
                else {
                        qp->s_psn++;
                        if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
                                qp->s_next_psn = qp->s_psn;
                }
                /*
                 * Put the QP on the pending list so lost ACKs will cause
                 * a retry.  More than one request can be pending so the
                 * QP may already be on the dev->pending list.
                 */
                spin_lock(&dev->pending_lock);
                if (list_empty(&qp->timerwait))
                        list_add_tail(&qp->timerwait,
                                      &dev->pending[dev->pending_index]);
                spin_unlock(&dev->pending_lock);
                break;

        case OP(RDMA_READ_RESPONSE_FIRST):
                /*
                 * This case can only happen if a send is restarted.
                 * See ipath_restart_rc().
                 */
                ipath_init_restart(qp, wqe);
                /* FALLTHROUGH */
        case OP(SEND_FIRST):
                qp->s_state = OP(SEND_MIDDLE);
                /* FALLTHROUGH */
        case OP(SEND_MIDDLE):
                bth2 = qp->s_psn++ & IPATH_PSN_MASK;
                if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
                        qp->s_next_psn = qp->s_psn;
                ss = &qp->s_sge;
                len = qp->s_len;
                if (len > pmtu) {
                        len = pmtu;
                        break;
                }
                if (wqe->wr.opcode == IB_WR_SEND)
                        qp->s_state = OP(SEND_LAST);
                else {
                        qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
                        /* Immediate data comes after the BTH */
                        ohdr->u.imm_data = wqe->wr.ex.imm_data;
                        hwords += 1;
                }
                if (wqe->wr.send_flags & IB_SEND_SOLICITED)
                        bth0 |= 1 << 23;
                bth2 |= 1 << 31; /* Request ACK. */
                qp->s_cur++;
                if (qp->s_cur >= qp->s_size)
                        qp->s_cur = 0;
                break;

        case OP(RDMA_READ_RESPONSE_LAST):
                /*
                 * This case can only happen if a RDMA write is restarted.
                 * See ipath_restart_rc().
                 */
                ipath_init_restart(qp, wqe);
                /* FALLTHROUGH */
        case OP(RDMA_WRITE_FIRST):
                qp->s_state = OP(RDMA_WRITE_MIDDLE);
                /* FALLTHROUGH */
        case OP(RDMA_WRITE_MIDDLE):
                bth2 = qp->s_psn++ & IPATH_PSN_MASK;
                if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
                        qp->s_next_psn = qp->s_psn;
                ss = &qp->s_sge;
                len = qp->s_len;
                if (len > pmtu) {
                        len = pmtu;
                        break;
                }
                if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
                        qp->s_state = OP(RDMA_WRITE_LAST);
                else {
                        qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
                        /* Immediate data comes after the BTH */
                        ohdr->u.imm_data = wqe->wr.ex.imm_data;
                        hwords += 1;
                        if (wqe->wr.send_flags & IB_SEND_SOLICITED)
                                bth0 |= 1 << 23;
                }
                bth2 |= 1 << 31; /* Request ACK. */
                qp->s_cur++;
                if (qp->s_cur >= qp->s_size)
                        qp->s_cur = 0;
                break;

        case OP(RDMA_READ_RESPONSE_MIDDLE):
                /*
                 * This case can only happen if a RDMA read is restarted.
                 * See ipath_restart_rc().
                 */
                ipath_init_restart(qp, wqe);
                len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
                ohdr->u.rc.reth.vaddr =
                        cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
                ohdr->u.rc.reth.rkey =
                        cpu_to_be32(wqe->wr.wr.rdma.rkey);
                ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
                qp->s_state = OP(RDMA_READ_REQUEST);
                hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
                bth2 = qp->s_psn & IPATH_PSN_MASK;
                qp->s_psn = wqe->lpsn + 1;
                ss = NULL;
                len = 0;
                qp->s_cur++;
                if (qp->s_cur == qp->s_size)
                        qp->s_cur = 0;
                break;
        }
        if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
                bth2 |= 1 << 31; /* Request ACK. */
        qp->s_len -= len;
        qp->s_hdrwords = hwords;
        qp->s_cur_sge = ss;
        qp->s_cur_size = len;
        ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2);
done:
        ret = 1;
        goto unlock;

bail:
        qp->s_flags &= ~IPATH_S_BUSY;
unlock:
        spin_unlock_irqrestore(&qp->s_lock, flags);
        return ret;
}
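
/*
 * Credit check sketch: s_lsn is the limit sequence number granted by the
 * peer's AETH credits, with (u32) -1 meaning credits are not in use.  A WQE
 * whose SSN is at most s_lsn + 1 may be sent; e.g. with s_lsn = 10, ssn = 11
 * goes out while ssn = 12 sets IPATH_S_WAIT_SSN_CREDIT and waits for more
 * credit to arrive in a later ACK.
 */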

/**
 * send_rc_ack - Construct an ACK packet and send it
 * @qp: a pointer to the QP
 *
 * This is called from ipath_rc_rcv() and only uses the receive
 * side QP state.
 * Note that RDMA reads and atomics are handled in the
 * send side QP state and tasklet.
 */
static void send_rc_ack(struct ipath_qp *qp)
{
        struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
        struct ipath_devdata *dd;
        u16 lrh0;
        u32 bth0;
        u32 hwords;
        u32 __iomem *piobuf;
        struct ipath_ib_header hdr;
        struct ipath_other_headers *ohdr;
        unsigned long flags;

        spin_lock_irqsave(&qp->s_lock, flags);

        /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
        if (qp->r_head_ack_queue != qp->s_tail_ack_queue ||
            (qp->s_flags & IPATH_S_ACK_PENDING) ||
            qp->s_ack_state != OP(ACKNOWLEDGE))
                goto queue_ack;

        spin_unlock_irqrestore(&qp->s_lock, flags);

        /* Don't try to send ACKs if the link isn't ACTIVE */
        dd = dev->dd;
        if (!(dd->ipath_flags & IPATH_LINKACTIVE))
                goto done;

        piobuf = ipath_getpiobuf(dd, 0, NULL);
        if (!piobuf) {
                /*
                 * We are out of PIO buffers at the moment.
                 * Pass responsibility for sending the ACK to the
                 * send tasklet so that when a PIO buffer becomes
                 * available, the ACK is sent ahead of other outgoing
                 * packets.
                 */
                spin_lock_irqsave(&qp->s_lock, flags);
                goto queue_ack;
        }

        /* Construct the header. */
        ohdr = &hdr.u.oth;
        lrh0 = IPATH_LRH_BTH;
        /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
        hwords = 6;
        if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
                hwords += ipath_make_grh(dev, &hdr.u.l.grh,
                                         &qp->remote_ah_attr.grh,
                                         hwords, 0);
                ohdr = &hdr.u.l.oth;
                lrh0 = IPATH_LRH_GRH;
        }
        /* read pkey_index w/o lock (its atomic) */
        bth0 = ipath_get_pkey(dd, qp->s_pkey_index) |
                (OP(ACKNOWLEDGE) << 24) | (1 << 22);
        if (qp->r_nak_state)
                ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
                                           (qp->r_nak_state <<
                                            IPATH_AETH_CREDIT_SHIFT));
        else
                ohdr->u.aeth = ipath_compute_aeth(qp);
        lrh0 |= qp->remote_ah_attr.sl << 4;
        hdr.lrh[0] = cpu_to_be16(lrh0);
        hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
        hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
        hdr.lrh[3] = cpu_to_be16(dd->ipath_lid |
                                 qp->remote_ah_attr.src_path_bits);
        ohdr->bth[0] = cpu_to_be32(bth0);
        ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
        ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);

        writeq(hwords + 1, piobuf);

        if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
                u32 *hdrp = (u32 *) &hdr;

                ipath_flush_wc();
                __iowrite32_copy(piobuf + 2, hdrp, hwords - 1);
                ipath_flush_wc();
                __raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
        } else
                __iowrite32_copy(piobuf + 2, (u32 *) &hdr, hwords);

        ipath_flush_wc();

        dev->n_unicast_xmit++;
        goto done;

queue_ack:
        if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK) {
                dev->n_rc_qacks++;
                qp->s_flags |= IPATH_S_ACK_PENDING;
                qp->s_nak_state = qp->r_nak_state;
                qp->s_ack_psn = qp->r_ack_psn;

                /* Schedule the send tasklet. */
                ipath_schedule_send(qp);
        }
        spin_unlock_irqrestore(&qp->s_lock, flags);
done:
        return;
}
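
/*
 * The split copy above (all but the last header word, then a final
 * __raw_writel) is presumably so that, on chips flagged with
 * IPATH_PIO_FLUSH_WC, the write-combining buffers can be flushed before the
 * last word is written and the chip only starts transmitting a fully
 * assembled packet.
 */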

/**
 * reset_psn - reset the QP state to send starting from PSN
 * @qp: the QP
 * @psn: the packet sequence number to restart at
 *
 * This is called from ipath_rc_rcv() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held.
 */
static void reset_psn(struct ipath_qp *qp, u32 psn)
{
        u32 n = qp->s_last;
        struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
        u32 opcode;

        qp->s_cur = n;

        /*
         * If we are starting the request from the beginning,
         * let the normal send code handle initialization.
         */
        if (ipath_cmp24(psn, wqe->psn) <= 0) {
                qp->s_state = OP(SEND_LAST);
                goto done;
        }

        /* Find the work request opcode corresponding to the given PSN. */
        opcode = wqe->wr.opcode;
        for (;;) {
                int diff;

                if (++n == qp->s_size)
                        n = 0;
                if (n == qp->s_tail)
                        break;
                wqe = get_swqe_ptr(qp, n);
                diff = ipath_cmp24(psn, wqe->psn);
                if (diff < 0)
                        break;
                qp->s_cur = n;
                /*
                 * If we are starting the request from the beginning,
                 * let the normal send code handle initialization.
                 */
                if (diff == 0) {
                        qp->s_state = OP(SEND_LAST);
                        goto done;
                }
                opcode = wqe->wr.opcode;
        }

        /*
         * Set the state to restart in the middle of a request.
         * Don't change the s_sge, s_cur_sge, or s_cur_size.
         * See ipath_make_rc_req().
         */
        switch (opcode) {
        case IB_WR_SEND:
        case IB_WR_SEND_WITH_IMM:
                qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
                break;

        case IB_WR_RDMA_WRITE:
        case IB_WR_RDMA_WRITE_WITH_IMM:
                qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
                break;

        case IB_WR_RDMA_READ:
                qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
                break;

        default:
                /*
                 * This case shouldn't happen since it's only
                 * one possible state.
                 */
                qp->s_state = OP(SEND_LAST);
        }
done:
        qp->s_psn = psn;
}
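
/*
 * The RDMA_READ_RESPONSE_* values assigned above are not opcodes that get
 * sent; they are reused as private "restart in the middle of this request"
 * markers which ipath_make_rc_req() recognizes and turns back into the
 * appropriate SEND, RDMA WRITE, or RDMA READ packets.
 */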

/**
 * ipath_restart_rc - back up requester to resend the last un-ACKed request
 * @qp: the QP to restart
 * @psn: packet sequence number for the request
 *
 * The QP s_lock should be held and interrupts disabled.
 */
void ipath_restart_rc(struct ipath_qp *qp, u32 psn)
{
        struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
        struct ipath_ibdev *dev;

        if (qp->s_retry == 0) {
                ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
                ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
                goto bail;
        }
        qp->s_retry--;

        /*
         * Remove the QP from the timeout queue.
         * Note: it may already have been removed by ipath_ib_timer().
         */
        dev = to_idev(qp->ibqp.device);
        spin_lock(&dev->pending_lock);
        if (!list_empty(&qp->timerwait))
                list_del_init(&qp->timerwait);
        if (!list_empty(&qp->piowait))
                list_del_init(&qp->piowait);
        spin_unlock(&dev->pending_lock);

        if (wqe->wr.opcode == IB_WR_RDMA_READ)
                dev->n_rc_resends++;
        else
                dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK;

        reset_psn(qp, psn);
        ipath_schedule_send(qp);

bail:
        return;
}

static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
{
        qp->s_last_psn = psn;
}
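
/*
 * AETH decoding as used below (see ipath_compute_aeth()): the top three
 * bits (aeth >> 29) distinguish ACK (0), RNR NAK (1) and NAK (3); the field
 * at IPATH_AETH_CREDIT_SHIFT holds the credit count for ACKs or the RNR/NAK
 * syndrome code; the low bits carry the responder's MSN (IPATH_MSN_MASK).
 */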

/**
 * do_rc_ack - process an incoming RC ACK
 * @qp: the QP the ACK came in on
 * @psn: the packet sequence number of the ACK
 * @opcode: the opcode of the request that resulted in the ACK
 *
 * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
 * for the given QP.
 * Called at interrupt level with the QP s_lock held and interrupts disabled.
 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
 */
static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
                     u64 val)
{
        struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
        struct ib_wc wc;
        enum ib_wc_status status;
        struct ipath_swqe *wqe;
        int ret = 0;
        u32 ack_psn;
        int diff;

        /*
         * Remove the QP from the timeout queue (or RNR timeout queue).
         * If ipath_ib_timer() has already removed it,
         * it's OK since we hold the QP s_lock and ipath_restart_rc()
         * just won't find anything to restart if we ACK everything.
         */
        spin_lock(&dev->pending_lock);
        if (!list_empty(&qp->timerwait))
                list_del_init(&qp->timerwait);
        spin_unlock(&dev->pending_lock);

        /*
         * Note that NAKs implicitly ACK outstanding SEND and RDMA write
         * requests and implicitly NAK RDMA read and atomic requests issued
         * before the NAK'ed request.  The MSN won't include the NAK'ed
         * request but will include any ACK'ed requests.
         */
        ack_psn = psn;
        if (aeth >> 29)
                ack_psn--;
        wqe = get_swqe_ptr(qp, qp->s_last);

        /*
         * The MSN might be for a later WQE than the PSN indicates so
         * only complete WQEs that the PSN finishes.
         */
        while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) {
                /*
                 * RDMA_READ_RESPONSE_ONLY is a special case since
                 * we want to generate completion events for everything
                 * before the RDMA read, copy the data, then generate
                 * the completion for the read.
                 */
                if (wqe->wr.opcode == IB_WR_RDMA_READ &&
                    opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
                    diff == 0) {
                        ret = 1;
                        goto bail;
                }
                /*
                 * If this request is a RDMA read or atomic, and the ACK is
                 * for a later operation, this ACK NAKs the RDMA read or
                 * atomic.  In other words, only a RDMA_READ_LAST or ONLY
                 * can ACK a RDMA read and likewise for atomic ops.  Note
                 * that the NAK case can only happen if relaxed ordering is
                 * used and requests are sent after an RDMA read or atomic
                 * is sent but before the response is received.
                 */
                if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
                     (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
                    ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
                      wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
                     (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
                        /*
                         * The last valid PSN seen is the previous
                         * request's.
                         */
                        update_last_psn(qp, wqe->psn - 1);
                        /* Retry this request. */
                        ipath_restart_rc(qp, wqe->psn);
                        /*
                         * No need to process the ACK/NAK since we are
                         * restarting an earlier request.
                         */
                        goto bail;
                }
                if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
                    wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
                        *(u64 *) wqe->sg_list[0].vaddr = val;
                if (qp->s_num_rd_atomic &&
                    (wqe->wr.opcode == IB_WR_RDMA_READ ||
                     wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
                     wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
                        qp->s_num_rd_atomic--;
                        /* Restart sending task if fence is complete */
                        if (((qp->s_flags & IPATH_S_FENCE_PENDING) &&
                             !qp->s_num_rd_atomic) ||
                            qp->s_flags & IPATH_S_RDMAR_PENDING)
                                ipath_schedule_send(qp);
                }
                /* Post a send completion queue entry if requested. */
                if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
                    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
                        memset(&wc, 0, sizeof wc);
                        wc.wr_id = wqe->wr.wr_id;
                        wc.status = IB_WC_SUCCESS;
                        wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
                        wc.byte_len = wqe->length;
                        wc.qp = &qp->ibqp;
                        wc.src_qp = qp->remote_qpn;
                        wc.slid = qp->remote_ah_attr.dlid;
                        wc.sl = qp->remote_ah_attr.sl;
                        ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
                }
                qp->s_retry = qp->s_retry_cnt;
                /*
                 * If we are completing a request which is in the process of
                 * being resent, we can stop resending it since we know the
                 * responder has already seen it.
                 */
                if (qp->s_last == qp->s_cur) {
                        if (++qp->s_cur >= qp->s_size)
                                qp->s_cur = 0;
                        qp->s_last = qp->s_cur;
                        if (qp->s_last == qp->s_tail)
                                break;
                        wqe = get_swqe_ptr(qp, qp->s_cur);
                        qp->s_state = OP(SEND_LAST);
                        qp->s_psn = wqe->psn;
                } else {
                        if (++qp->s_last >= qp->s_size)
                                qp->s_last = 0;
                        if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur)
                                qp->s_draining = 0;
                        if (qp->s_last == qp->s_tail)
                                break;
                        wqe = get_swqe_ptr(qp, qp->s_last);
                }
        }

        switch (aeth >> 29) {
        case 0:         /* ACK */
                dev->n_rc_acks++;
                /* If this is a partial ACK, reset the retransmit timer. */
                if (qp->s_last != qp->s_tail) {
                        spin_lock(&dev->pending_lock);
                        if (list_empty(&qp->timerwait))
                                list_add_tail(&qp->timerwait,
                                              &dev->pending[dev->pending_index]);
                        spin_unlock(&dev->pending_lock);
                        /*
                         * If we get a partial ACK for a resent operation,
                         * we can stop resending the earlier packets and
                         * continue with the next packet the receiver wants.
                         */
                        if (ipath_cmp24(qp->s_psn, psn) <= 0) {
                                reset_psn(qp, psn + 1);
                                ipath_schedule_send(qp);
                        }
                } else if (ipath_cmp24(qp->s_psn, psn) <= 0) {
                        qp->s_state = OP(SEND_LAST);
                        qp->s_psn = psn + 1;
                }
                ipath_get_credit(qp, aeth);
                qp->s_rnr_retry = qp->s_rnr_retry_cnt;
                qp->s_retry = qp->s_retry_cnt;
                update_last_psn(qp, psn);
                ret = 1;
                goto bail;

        case 1:         /* RNR NAK */
                dev->n_rnr_naks++;
                if (qp->s_last == qp->s_tail)
                        goto bail;
                if (qp->s_rnr_retry == 0) {
                        status = IB_WC_RNR_RETRY_EXC_ERR;
                        goto class_b;
                }
                if (qp->s_rnr_retry_cnt < 7)
                        qp->s_rnr_retry--;

                /* The last valid PSN is the previous PSN. */
                update_last_psn(qp, psn - 1);

                if (wqe->wr.opcode == IB_WR_RDMA_READ)
                        dev->n_rc_resends++;
                else
                        dev->n_rc_resends +=
                                (qp->s_psn - psn) & IPATH_PSN_MASK;

                reset_psn(qp, psn);

                qp->s_rnr_timeout =
                        ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &
                                           IPATH_AETH_CREDIT_MASK];
                ipath_insert_rnr_queue(qp);
                ipath_schedule_send(qp);
                goto bail;

        case 3:         /* NAK */
                if (qp->s_last == qp->s_tail)
                        goto bail;
                /* The last valid PSN is the previous PSN. */
                update_last_psn(qp, psn - 1);
                switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
                        IPATH_AETH_CREDIT_MASK) {
                case 0: /* PSN sequence error */
                        dev->n_seq_naks++;
                        /*
                         * Back up to the responder's expected PSN.
                         * Note that we might get a NAK in the middle of an
                         * RDMA READ response which terminates the RDMA
                         * READ.
                         */
                        ipath_restart_rc(qp, psn);
                        break;

                case 1: /* Invalid Request */
                        status = IB_WC_REM_INV_REQ_ERR;
                        dev->n_other_naks++;
                        goto class_b;

                case 2: /* Remote Access Error */
                        status = IB_WC_REM_ACCESS_ERR;
                        dev->n_other_naks++;
                        goto class_b;

                case 3: /* Remote Operation Error */
                        status = IB_WC_REM_OP_ERR;
                        dev->n_other_naks++;
                class_b:
                        ipath_send_complete(qp, wqe, status);
                        ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
                        break;

                default:
                        /* Ignore other reserved NAK error codes */
                        goto reserved;
                }
                qp->s_rnr_retry = qp->s_rnr_retry_cnt;
                goto bail;

        default:                /* 2: reserved */
        reserved:
                /* Ignore reserved NAK codes. */
                goto bail;
        }

bail:
        return ret;
}
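
/*
 * PSN comparisons in this file go through ipath_cmp24(), which presumably
 * returns the sign-extended 24-bit difference of two PSNs, so that e.g.
 * comparing PSN 0x000002 with 0xfffffe yields a small positive value
 * ("newer") rather than a huge negative one across the wrap point.
 */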

/**
 * ipath_rc_rcv_resp - process an incoming RC response packet
 * @dev: the device this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @hdrsize: the header length
 * @pmtu: the path MTU
 * @header_in_data: true if part of the header data is in the data buffer
 *
 * This is called from ipath_rc_rcv() to process an incoming RC response
 * packet for the given QP.
 * Called at interrupt level.
 */
static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
                                     struct ipath_other_headers *ohdr,
                                     void *data, u32 tlen,
                                     struct ipath_qp *qp,
                                     int opcode,
                                     u32 psn, u32 hdrsize, u32 pmtu,
                                     int header_in_data)
{
        struct ipath_swqe *wqe;
        enum ib_wc_status status;
        unsigned long flags;
        int diff;
        u32 pad;
        u32 aeth;
        u64 val;

        spin_lock_irqsave(&qp->s_lock, flags);

        /* Double check we can process this now that we hold the s_lock. */
        if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
                goto ack_done;

        /* Ignore invalid responses. */
        if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
                goto ack_done;

        /* Ignore duplicate responses. */
        diff = ipath_cmp24(psn, qp->s_last_psn);
        if (unlikely(diff <= 0)) {
                /* Update credits for "ghost" ACKs */
                if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
                        if (!header_in_data)
                                aeth = be32_to_cpu(ohdr->u.aeth);
                        else {
                                aeth = be32_to_cpu(((__be32 *) data)[0]);
                                data += sizeof(__be32);
                        }
                        if ((aeth >> 29) == 0)
                                ipath_get_credit(qp, aeth);
                }
                goto ack_done;
        }

        if (unlikely(qp->s_last == qp->s_tail))
                goto ack_done;
        wqe = get_swqe_ptr(qp, qp->s_last);
        status = IB_WC_SUCCESS;

        switch (opcode) {
        case OP(ACKNOWLEDGE):
        case OP(ATOMIC_ACKNOWLEDGE):
        case OP(RDMA_READ_RESPONSE_FIRST):
                if (!header_in_data)
                        aeth = be32_to_cpu(ohdr->u.aeth);
                else {
                        aeth = be32_to_cpu(((__be32 *) data)[0]);
                        data += sizeof(__be32);
                }
                if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
                        if (!header_in_data) {
                                __be32 *p = ohdr->u.at.atomic_ack_eth;

                                val = ((u64) be32_to_cpu(p[0]) << 32) |
                                        be32_to_cpu(p[1]);
                        } else
                                val = be64_to_cpu(((__be64 *) data)[0]);
                } else
                        val = 0;
                if (!do_rc_ack(qp, aeth, psn, opcode, val) ||
                    opcode != OP(RDMA_READ_RESPONSE_FIRST))
                        goto ack_done;
                hdrsize += 4;
                wqe = get_swqe_ptr(qp, qp->s_last);
                if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
                        goto ack_op_err;
                qp->r_flags &= ~IPATH_R_RDMAR_SEQ;
                /*
                 * If this is a response to a resent RDMA read, we
                 * have to be careful to copy the data to the right
                 * location.
                 */
                qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
                                                  wqe, psn, pmtu);
                goto read_middle;

        case OP(RDMA_READ_RESPONSE_MIDDLE):
                /* no AETH, no ACK */
                if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
                        dev->n_rdma_seq++;
                        if (qp->r_flags & IPATH_R_RDMAR_SEQ)
                                goto ack_done;
                        qp->r_flags |= IPATH_R_RDMAR_SEQ;
                        ipath_restart_rc(qp, qp->s_last_psn + 1);
                        goto ack_done;
                }
                if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
                        goto ack_op_err;
        read_middle:
                if (unlikely(tlen != (hdrsize + pmtu + 4)))
                        goto ack_len_err;
                if (unlikely(pmtu >= qp->s_rdma_read_len))
                        goto ack_len_err;

                /* We got a response so update the timeout. */
                spin_lock(&dev->pending_lock);
                if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
                        list_move_tail(&qp->timerwait,
                                       &dev->pending[dev->pending_index]);
                spin_unlock(&dev->pending_lock);

                if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
                        qp->s_retry = qp->s_retry_cnt;

                /*
                 * Update the RDMA receive state but do the copy w/o
                 * holding the locks and blocking interrupts.
                 */
                qp->s_rdma_read_len -= pmtu;
                update_last_psn(qp, psn);
                spin_unlock_irqrestore(&qp->s_lock, flags);
                ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu);
                goto bail;

        case OP(RDMA_READ_RESPONSE_ONLY):
                if (!header_in_data)
                        aeth = be32_to_cpu(ohdr->u.aeth);
                else
                        aeth = be32_to_cpu(((__be32 *) data)[0]);
                if (!do_rc_ack(qp, aeth, psn, opcode, 0))
                        goto ack_done;
                /* Get the number of bytes the message was padded by. */
                pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
                /*
                 * Check that the data size is >= 0 && <= pmtu.
                 * Remember to account for the AETH header (4) and
                 * ICRC (4).
                 */
                if (unlikely(tlen < (hdrsize + pad + 8)))
                        goto ack_len_err;
                /*
                 * If this is a response to a resent RDMA read, we
                 * have to be careful to copy the data to the right
                 * location.
                 */
                wqe = get_swqe_ptr(qp, qp->s_last);
                qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
                                                  wqe, psn, pmtu);
                goto read_last;

        case OP(RDMA_READ_RESPONSE_LAST):
                /* ACKs READ req. */
                if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
                        dev->n_rdma_seq++;
                        if (qp->r_flags & IPATH_R_RDMAR_SEQ)
                                goto ack_done;
                        qp->r_flags |= IPATH_R_RDMAR_SEQ;
                        ipath_restart_rc(qp, qp->s_last_psn + 1);
                        goto ack_done;
                }
                if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
                        goto ack_op_err;
                /* Get the number of bytes the message was padded by. */
                pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
                /*
                 * Check that the data size is >= 1 && <= pmtu.
                 * Remember to account for the AETH header (4) and
                 * ICRC (4).
                 */
                if (unlikely(tlen <= (hdrsize + pad + 8)))
                        goto ack_len_err;
        read_last:
                tlen -= hdrsize + pad + 8;
                if (unlikely(tlen != qp->s_rdma_read_len))
                        goto ack_len_err;
                if (!header_in_data)
                        aeth = be32_to_cpu(ohdr->u.aeth);
                else {
                        aeth = be32_to_cpu(((__be32 *) data)[0]);
                        data += sizeof(__be32);
                }
                ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);
                (void) do_rc_ack(qp, aeth, psn,
                                 OP(RDMA_READ_RESPONSE_LAST), 0);
                goto ack_done;
        }

ack_op_err:
        status = IB_WC_LOC_QP_OP_ERR;
        goto ack_err;

ack_len_err:
        status = IB_WC_LOC_LEN_ERR;
ack_err:
        ipath_send_complete(qp, wqe, status);
        ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
ack_done:
        spin_unlock_irqrestore(&qp->s_lock, flags);
bail:
        return;
}
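
/*
 * The "hdrsize + pad + 8" length checks above account for the 4-byte AETH
 * that follows the BTH in read response packets plus the 4-byte ICRC at the
 * end of the packet, i.e. the "AETH header (4) and ICRC (4)" called out in
 * the comments.
 */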

/**
 * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
 * @dev: the device this packet came in on
 * @ohdr: the other headers for this packet
 * @data: the packet data
 * @qp: the QP for this packet
 * @opcode: the opcode for this packet
 * @psn: the packet sequence number for this packet
 * @diff: the difference between the PSN and the expected PSN
 * @header_in_data: true if part of the header data is in the data buffer
 *
 * This is called from ipath_rc_rcv() to process an unexpected
 * incoming RC packet for the given QP.
 * Called at interrupt level.
 * Return 1 if no more processing is needed; otherwise return 0 to
 * schedule a response to be sent.
 */
static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
                                     struct ipath_other_headers *ohdr,
                                     void *data,
                                     struct ipath_qp *qp,
                                     u32 opcode,
                                     u32 psn,
                                     int diff,
                                     int header_in_data)
{
        struct ipath_ack_entry *e;
        u8 i, prev;
        int old_req;
        unsigned long flags;

        if (diff > 0) {
                /*
                 * Packet sequence error.
                 * A NAK will ACK earlier sends and RDMA writes.
                 * Don't queue the NAK if we already sent one.
                 */
                if (!qp->r_nak_state) {
                        qp->r_nak_state = IB_NAK_PSN_ERROR;
                        /* Use the expected PSN. */
                        qp->r_ack_psn = qp->r_psn;
                        goto send_ack;
                }
                goto done;
        }

        /*
         * Handle a duplicate request.  Don't re-execute SEND, RDMA
         * write or atomic op.  Don't NAK errors, just silently drop
         * the duplicate request.  Note that r_sge, r_len, and
         * r_rcv_len may be in use so don't modify them.
         *
         * We are supposed to ACK the earliest duplicate PSN but we
         * can coalesce an outstanding duplicate ACK.  We have to
         * send the earliest so that RDMA reads can be restarted at
         * the requester's expected PSN.
         *
         * First, find where this duplicate PSN falls within the
         * ACKs previously sent.
         */
        psn &= IPATH_PSN_MASK;
        e = NULL;
        old_req = 1;

        spin_lock_irqsave(&qp->s_lock, flags);
        /* Double check we can process this now that we hold the s_lock. */
        if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
                goto unlock_done;

        for (i = qp->r_head_ack_queue; ; i = prev) {
                if (i == qp->s_tail_ack_queue)
                        old_req = 0;
                if (i)
                        prev = i - 1;
                else
                        prev = IPATH_MAX_RDMA_ATOMIC;
                if (prev == qp->r_head_ack_queue) {
                        e = NULL;
                        break;
                }
                e = &qp->s_ack_queue[prev];
                if (!e->opcode) {
                        e = NULL;
                        break;
                }
                if (ipath_cmp24(psn, e->psn) >= 0) {
                        if (prev == qp->s_tail_ack_queue)
                                old_req = 0;
                        break;
                }
        }
        switch (opcode) {
        case OP(RDMA_READ_REQUEST): {
                struct ib_reth *reth;
                u32 offset;
                u32 len;

                /*
                 * If we didn't find the RDMA read request in the ack queue,
                 * or the send tasklet is already backed up to send an
                 * earlier entry, we can ignore this request.
                 */
                if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req)
                        goto unlock_done;
                /* RETH comes after BTH */
                if (!header_in_data)
                        reth = &ohdr->u.rc.reth;
                else {
                        reth = (struct ib_reth *)data;
                        data += sizeof(*reth);
                }
                /*
                 * Address range must be a subset of the original
                 * request and start on pmtu boundaries.
                 * We reuse the old ack_queue slot since the requester
                 * should not back up and request an earlier PSN for the
                 * same request.
                 */
                offset = ((psn - e->psn) & IPATH_PSN_MASK) *
                        ib_mtu_enum_to_int(qp->path_mtu);
                len = be32_to_cpu(reth->length);
                if (unlikely(offset + len > e->rdma_sge.sge.sge_length))
                        goto unlock_done;
                if (len != 0) {
                        u32 rkey = be32_to_cpu(reth->rkey);
                        u64 vaddr = be64_to_cpu(reth->vaddr);
                        int ok;

                        ok = ipath_rkey_ok(qp, &e->rdma_sge,
                                           len, vaddr, rkey,
                                           IB_ACCESS_REMOTE_READ);
                        if (unlikely(!ok))
                                goto unlock_done;
                } else {
                        e->rdma_sge.sg_list = NULL;
                        e->rdma_sge.num_sge = 0;
                        e->rdma_sge.sge.mr = NULL;
                        e->rdma_sge.sge.vaddr = NULL;
                        e->rdma_sge.sge.length = 0;
                        e->rdma_sge.sge.sge_length = 0;
                }
                e->psn = psn;
                qp->s_ack_state = OP(ACKNOWLEDGE);
                qp->s_tail_ack_queue = prev;
                break;
        }

        case OP(COMPARE_SWAP):
        case OP(FETCH_ADD): {
                /*
                 * If we didn't find the atomic request in the ack queue
                 * or the send tasklet is already backed up to send an
                 * earlier entry, we can ignore this request.
                 */
                if (!e || e->opcode != (u8) opcode || old_req)
                        goto unlock_done;
                qp->s_ack_state = OP(ACKNOWLEDGE);
                qp->s_tail_ack_queue = prev;
                break;
        }

        default:
                if (old_req)
                        goto unlock_done;
                /*
                 * Resend the most recent ACK if this request is
                 * after all the previous RDMA reads and atomics.
                 */
                if (i == qp->r_head_ack_queue) {
                        spin_unlock_irqrestore(&qp->s_lock, flags);
                        qp->r_nak_state = 0;
                        qp->r_ack_psn = qp->r_psn - 1;
                        goto send_ack;
                }
                /*
                 * Try to send a simple ACK to work around a Mellanox bug
                 * which doesn't accept a RDMA read response or atomic
                 * response as an ACK for earlier SENDs or RDMA writes.
                 */
                if (qp->r_head_ack_queue == qp->s_tail_ack_queue &&
                    !(qp->s_flags & IPATH_S_ACK_PENDING) &&
                    qp->s_ack_state == OP(ACKNOWLEDGE)) {
                        spin_unlock_irqrestore(&qp->s_lock, flags);
                        qp->r_nak_state = 0;
                        qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
                        goto send_ack;
                }
                /*
                 * Resend the RDMA read or atomic op which
                 * ACKs this duplicate request.
                 */
                qp->s_ack_state = OP(ACKNOWLEDGE);
                qp->s_tail_ack_queue = i;
                break;
        }
        qp->r_nak_state = 0;
        ipath_schedule_send(qp);

unlock_done:
        spin_unlock_irqrestore(&qp->s_lock, flags);
done:
        return 1;

send_ack:
        return 0;
}

void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
{
        unsigned long flags;
        int lastwqe;

        spin_lock_irqsave(&qp->s_lock, flags);
        lastwqe = ipath_error_qp(qp, err);
        spin_unlock_irqrestore(&qp->s_lock, flags);

        if (lastwqe) {
                struct ib_event ev;

                ev.device = qp->ibqp.device;
                ev.element.qp = &qp->ibqp;
                ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
                qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
        }
}

static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)
{
        unsigned next;

        next = n + 1;
        if (next > IPATH_MAX_RDMA_ATOMIC)
                next = 0;
        if (n == qp->s_tail_ack_queue) {
                qp->s_tail_ack_queue = next;
                qp->s_ack_state = OP(ACKNOWLEDGE);
        }
}
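
/*
 * ipath_update_ack_queue() recycles the oldest ack queue slot: when a new
 * RDMA read or atomic request finds the ring full (next == s_tail_ack_queue)
 * but the tail entry has already been sent, the tail is advanced so the new
 * request can be queued instead of being NAKed.
 */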

/**
 * ipath_rc_rcv - process an incoming RC packet
 * @dev: the device this packet came in on
 * @hdr: the header of this packet
 * @has_grh: true if the header has a GRH
 * @data: the packet data
 * @tlen: the packet length
 * @qp: the QP for this packet
 *
 * This is called from ipath_qp_rcv() to process an incoming RC packet
 * for the given QP.
 * Called at interrupt level.
 */
void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
                  int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
{
        struct ipath_other_headers *ohdr;
        u32 opcode;
        u32 hdrsize;
        u32 psn;
        u32 pad;
        struct ib_wc wc;
        u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
        int diff;
        struct ib_reth *reth;
        int header_in_data;
        unsigned long flags;

        /* Validate the SLID. See Ch. 9.6.1.5 */
        if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
                goto done;

        /* Check for GRH */
        if (!has_grh) {
                ohdr = &hdr->u.oth;
                hdrsize = 8 + 12;       /* LRH + BTH */
                psn = be32_to_cpu(ohdr->bth[2]);
                header_in_data = 0;
        } else {
                ohdr = &hdr->u.l.oth;
                hdrsize = 8 + 40 + 12;  /* LRH + GRH + BTH */
                /*
                 * The header with GRH is 60 bytes and the core driver sets
                 * the eager header buffer size to 56 bytes so the last 4
                 * bytes of the BTH header (PSN) are in the data buffer.
                 */
                header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
                if (header_in_data) {
                        psn = be32_to_cpu(((__be32 *) data)[0]);
                        data += sizeof(__be32);
                } else
                        psn = be32_to_cpu(ohdr->bth[2]);
        }

        /*
         * Process responses (ACKs) before anything else.  Note that the
         * packet sequence number will be for something in the send work
         * queue rather than the expected receive packet sequence number.
         * In other words, this QP is the requester.
         */
        opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
        if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
            opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
                ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
                                  hdrsize, pmtu, header_in_data);
                goto done;
        }

        /* Compute 24 bits worth of difference. */
        diff = ipath_cmp24(psn, qp->r_psn);
        if (unlikely(diff)) {
                if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
                                       psn, diff, header_in_data))
                        goto done;
                goto send_ack;
        }

        /* Check for opcode sequence errors. */
        switch (qp->r_state) {
        case OP(SEND_FIRST):
        case OP(SEND_MIDDLE):
                if (opcode == OP(SEND_MIDDLE) ||
                    opcode == OP(SEND_LAST) ||
                    opcode == OP(SEND_LAST_WITH_IMMEDIATE))
                        break;
                goto nack_inv;

        case OP(RDMA_WRITE_FIRST):
        case OP(RDMA_WRITE_MIDDLE):
                if (opcode == OP(RDMA_WRITE_MIDDLE) ||
                    opcode == OP(RDMA_WRITE_LAST) ||
                    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
                        break;
                goto nack_inv;

        default:
                if (opcode == OP(SEND_MIDDLE) ||
                    opcode == OP(SEND_LAST) ||
                    opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
                    opcode == OP(RDMA_WRITE_MIDDLE) ||
                    opcode == OP(RDMA_WRITE_LAST) ||
                    opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
                        goto nack_inv;
                /*
                 * Note that it is up to the requester to not send a new
                 * RDMA read or atomic operation before receiving an ACK
                 * for the previous operation.
                 */
                break;
        }

        memset(&wc, 0, sizeof wc);

        /* OK, process the packet. */
        switch (opcode) {
        case OP(SEND_FIRST):
                if (!ipath_get_rwqe(qp, 0))
                        goto rnr_nak;
                qp->r_rcv_len = 0;
                /* FALLTHROUGH */
        case OP(SEND_MIDDLE):
        case OP(RDMA_WRITE_MIDDLE):
        send_middle:
                /* Check for invalid length PMTU or posted rwqe len. */
                if (unlikely(tlen != (hdrsize + pmtu + 4)))
                        goto nack_inv;
                qp->r_rcv_len += pmtu;
                if (unlikely(qp->r_rcv_len > qp->r_len))
                        goto nack_inv;
                ipath_copy_sge(&qp->r_sge, data, pmtu);
                break;

        case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
                /* consume RWQE */
                if (!ipath_get_rwqe(qp, 1))
                        goto rnr_nak;
                goto send_last_imm;

        case OP(SEND_ONLY):
        case OP(SEND_ONLY_WITH_IMMEDIATE):
                if (!ipath_get_rwqe(qp, 0))
                        goto rnr_nak;
                qp->r_rcv_len = 0;
                if (opcode == OP(SEND_ONLY))
                        goto send_last;
                /* FALLTHROUGH */
        case OP(SEND_LAST_WITH_IMMEDIATE):
        send_last_imm:
                if (header_in_data) {
                        wc.ex.imm_data = *(__be32 *) data;
                        data += sizeof(__be32);
                } else {
                        /* Immediate data comes after BTH */
                        wc.ex.imm_data = ohdr->u.imm_data;
                }
                hdrsize += 4;
                wc.wc_flags = IB_WC_WITH_IMM;
                /* FALLTHROUGH */
        case OP(SEND_LAST):
        case OP(RDMA_WRITE_LAST):
        send_last:
                /* Get the number of bytes the message was padded by. */
                pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
                /* Check for invalid length. */
                /* XXX LAST len should be >= 1 */
                if (unlikely(tlen < (hdrsize + pad + 4)))
                        goto nack_inv;
                /* Don't count the CRC. */
                tlen -= (hdrsize + pad + 4);
                wc.byte_len = tlen + qp->r_rcv_len;
                if (unlikely(wc.byte_len > qp->r_len))
                        goto nack_inv;
                ipath_copy_sge(&qp->r_sge, data, tlen);
                qp->r_msn++;
                if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
                        break;
                wc.wr_id = qp->r_wr_id;
                wc.status = IB_WC_SUCCESS;
                if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
                    opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
                        wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
                else
                        wc.opcode = IB_WC_RECV;
                wc.qp = &qp->ibqp;
                wc.src_qp = qp->remote_qpn;
                wc.slid = qp->remote_ah_attr.dlid;
                wc.sl = qp->remote_ah_attr.sl;
                /* Signal completion event if the solicited bit is set. */
                ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
                               (ohdr->bth[0] &
                                cpu_to_be32(1 << 23)) != 0);
                break;

        case OP(RDMA_WRITE_FIRST):
        case OP(RDMA_WRITE_ONLY):
        case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
                if (unlikely(!(qp->qp_access_flags &
                               IB_ACCESS_REMOTE_WRITE)))
                        goto nack_inv;
                /* RETH comes after BTH */
                if (!header_in_data)
                        reth = &ohdr->u.rc.reth;
                else {
                        reth = (struct ib_reth *)data;
                        data += sizeof(*reth);
                }
                hdrsize += sizeof(*reth);
                qp->r_len = be32_to_cpu(reth->length);
                qp->r_rcv_len = 0;
                if (qp->r_len != 0) {
                        u32 rkey = be32_to_cpu(reth->rkey);
                        u64 vaddr = be64_to_cpu(reth->vaddr);
                        int ok;

                        /* Check rkey & NAK */
                        ok = ipath_rkey_ok(qp, &qp->r_sge,
                                           qp->r_len, vaddr, rkey,
                                           IB_ACCESS_REMOTE_WRITE);
                        if (unlikely(!ok))
                                goto nack_acc;
                } else {
                        qp->r_sge.sg_list = NULL;
                        qp->r_sge.sge.mr = NULL;
                        qp->r_sge.sge.vaddr = NULL;
                        qp->r_sge.sge.length = 0;
                        qp->r_sge.sge.sge_length = 0;
                }
                if (opcode == OP(RDMA_WRITE_FIRST))
                        goto send_middle;
                else if (opcode == OP(RDMA_WRITE_ONLY))
                        goto send_last;
                if (!ipath_get_rwqe(qp, 1))
                        goto rnr_nak;
                goto send_last_imm;

        case OP(RDMA_READ_REQUEST): {
                struct ipath_ack_entry *e;
                u32 len;
                u8 next;

                if (unlikely(!(qp->qp_access_flags &
                               IB_ACCESS_REMOTE_READ)))
                        goto nack_inv;
                next = qp->r_head_ack_queue + 1;
                if (next > IPATH_MAX_RDMA_ATOMIC)
                        next = 0;
                spin_lock_irqsave(&qp->s_lock, flags);
                /* Double check we can process this while holding the s_lock. */
                if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
                        goto unlock;
                if (unlikely(next == qp->s_tail_ack_queue)) {
                        if (!qp->s_ack_queue[next].sent)
                                goto nack_inv_unlck;
                        ipath_update_ack_queue(qp, next);
                }
                e = &qp->s_ack_queue[qp->r_head_ack_queue];
                /* RETH comes after BTH */
                if (!header_in_data)
                        reth = &ohdr->u.rc.reth;
                else {
                        reth = (struct ib_reth *)data;
                        data += sizeof(*reth);
                }
                len = be32_to_cpu(reth->length);
                if (len) {
                        u32 rkey = be32_to_cpu(reth->rkey);
                        u64 vaddr = be64_to_cpu(reth->vaddr);
                        int ok;

                        /* Check rkey & NAK */
                        ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr,
                                           rkey, IB_ACCESS_REMOTE_READ);
                        if (unlikely(!ok))
                                goto nack_acc_unlck;
                        /*
                         * Update the next expected PSN.  We add 1 later
                         * below, so only add the remainder here.
                         */
                        if (len > pmtu)
                                qp->r_psn += (len - 1) / pmtu;
                } else {
                        e->rdma_sge.sg_list = NULL;
                        e->rdma_sge.num_sge = 0;
                        e->rdma_sge.sge.mr = NULL;
                        e->rdma_sge.sge.vaddr = NULL;
                        e->rdma_sge.sge.length = 0;
                        e->rdma_sge.sge.sge_length = 0;
                }
                e->opcode = opcode;
                e->sent = 0;
                e->psn = psn;
                /*
                 * We need to increment the MSN here instead of when we
                 * finish sending the result since a duplicate request would
                 * increment it more than once.
                 */
                qp->r_msn++;
                qp->r_psn++;
                qp->r_state = opcode;
                qp->r_nak_state = 0;
                qp->r_head_ack_queue = next;

                /* Schedule the send tasklet. */
                ipath_schedule_send(qp);

                goto unlock;
        }
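
        /*
         * For the atomic opcodes below, the operation is performed here on
         * the responder (atomic64_add_return() for FETCH_ADD, cmpxchg() for
         * COMPARE_SWAP) and the prior value is stashed in the ack queue
         * entry, so a duplicate request can be answered from s_ack_queue
         * without touching memory again (see ipath_rc_rcv_error()).
         */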
        case OP(COMPARE_SWAP):
        case OP(FETCH_ADD): {
                struct ib_atomic_eth *ateth;
                struct ipath_ack_entry *e;
                u64 vaddr;
                atomic64_t *maddr;
                u64 sdata;
                u32 rkey;
                u8 next;

                if (unlikely(!(qp->qp_access_flags &
                               IB_ACCESS_REMOTE_ATOMIC)))
                        goto nack_inv;
                next = qp->r_head_ack_queue + 1;
                if (next > IPATH_MAX_RDMA_ATOMIC)
                        next = 0;
                spin_lock_irqsave(&qp->s_lock, flags);
                /* Double check we can process this while holding the s_lock. */
                if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
                        goto unlock;
                if (unlikely(next == qp->s_tail_ack_queue)) {
                        if (!qp->s_ack_queue[next].sent)
                                goto nack_inv_unlck;
                        ipath_update_ack_queue(qp, next);
                }
                if (!header_in_data)
                        ateth = &ohdr->u.atomic_eth;
                else
                        ateth = (struct ib_atomic_eth *)data;
                vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
                        be32_to_cpu(ateth->vaddr[1]);
                if (unlikely(vaddr & (sizeof(u64) - 1)))
                        goto nack_inv_unlck;
                rkey = be32_to_cpu(ateth->rkey);
                /* Check rkey & NAK */
                if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
                                            sizeof(u64), vaddr, rkey,
                                            IB_ACCESS_REMOTE_ATOMIC)))
                        goto nack_acc_unlck;
                /* Perform atomic OP and save result. */
                maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
                sdata = be64_to_cpu(ateth->swap_data);
                e = &qp->s_ack_queue[qp->r_head_ack_queue];
                e->atomic_data = (opcode == OP(FETCH_ADD)) ?
                        (u64) atomic64_add_return(sdata, maddr) - sdata :
                        (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
                                      be64_to_cpu(ateth->compare_data),
                                      sdata);
                e->opcode = opcode;
                e->sent = 0;
                e->psn = psn & IPATH_PSN_MASK;
                qp->r_msn++;
                qp->r_psn++;
                qp->r_state = opcode;
                qp->r_nak_state = 0;
                qp->r_head_ack_queue = next;

                /* Schedule the send tasklet. */
                ipath_schedule_send(qp);

                goto unlock;
        }

        default:
                /* NAK unknown opcodes. */
                goto nack_inv;
        }
        qp->r_psn++;
        qp->r_state = opcode;
        qp->r_ack_psn = psn;
        qp->r_nak_state = 0;
        /* Send an ACK if requested or required. */
        if (psn & (1 << 31))
                goto send_ack;
        goto done;

rnr_nak:
        qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
        qp->r_ack_psn = qp->r_psn;
        goto send_ack;

nack_inv_unlck:
        spin_unlock_irqrestore(&qp->s_lock, flags);
nack_inv:
        ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
        qp->r_nak_state = IB_NAK_INVALID_REQUEST;
        qp->r_ack_psn = qp->r_psn;
        goto send_ack;

nack_acc_unlck:
        spin_unlock_irqrestore(&qp->s_lock, flags);
nack_acc:
        ipath_rc_error(qp, IB_WC_LOC_PROT_ERR);
        qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
        qp->r_ack_psn = qp->r_psn;
send_ack:
        send_rc_ack(qp);
        goto done;

unlock:
        spin_unlock_irqrestore(&qp->s_lock, flags);
done:
        return;
}