2 * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
33 #include "ipath_verbs.h"
34 #include "ips_common.h"
36 /* cut down ridiculously long IB macro names */
37 #define OP(x) IB_OPCODE_RC_##x
40 * ipath_init_restart- initialize the qp->s_sge after a restart
41 * @qp: the QP who's SGE we're restarting
42 * @wqe: the work queue to initialize the QP's SGE from
44 * The QP s_lock should be held.
46 static void ipath_init_restart(struct ipath_qp
*qp
, struct ipath_swqe
*wqe
)
48 struct ipath_ibdev
*dev
;
51 len
= ((qp
->s_psn
- wqe
->psn
) & IPS_PSN_MASK
) *
52 ib_mtu_enum_to_int(qp
->path_mtu
);
53 qp
->s_sge
.sge
= wqe
->sg_list
[0];
54 qp
->s_sge
.sg_list
= wqe
->sg_list
+ 1;
55 qp
->s_sge
.num_sge
= wqe
->wr
.num_sge
;
56 ipath_skip_sge(&qp
->s_sge
, len
);
57 qp
->s_len
= wqe
->length
- len
;
58 dev
= to_idev(qp
->ibqp
.device
);
59 spin_lock(&dev
->pending_lock
);
60 if (list_empty(&qp
->timerwait
))
61 list_add_tail(&qp
->timerwait
,
62 &dev
->pending
[dev
->pending_index
]);
63 spin_unlock(&dev
->pending_lock
);
67 * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
68 * @qp: a pointer to the QP
69 * @ohdr: a pointer to the IB header being constructed
72 * Return bth0 if constructed; otherwise, return 0.
73 * Note the QP s_lock must be held.
75 static inline u32
ipath_make_rc_ack(struct ipath_qp
*qp
,
76 struct ipath_other_headers
*ohdr
,
79 struct ipath_sge_state
*ss
;
84 /* header size in 32-bit words LRH+BTH = (8+12)/4. */
88 * Send a response. Note that we are in the responder's
89 * side of the QP context.
91 switch (qp
->s_ack_state
) {
92 case OP(RDMA_READ_REQUEST
):
97 qp
->s_ack_state
= OP(RDMA_READ_RESPONSE_FIRST
);
100 qp
->s_ack_state
= OP(RDMA_READ_RESPONSE_ONLY
);
101 qp
->s_rdma_len
-= len
;
102 bth0
= qp
->s_ack_state
<< 24;
103 ohdr
->u
.aeth
= ipath_compute_aeth(qp
);
107 case OP(RDMA_READ_RESPONSE_FIRST
):
108 qp
->s_ack_state
= OP(RDMA_READ_RESPONSE_MIDDLE
);
110 case OP(RDMA_READ_RESPONSE_MIDDLE
):
111 ss
= &qp
->s_rdma_sge
;
112 len
= qp
->s_rdma_len
;
116 ohdr
->u
.aeth
= ipath_compute_aeth(qp
);
118 qp
->s_ack_state
= OP(RDMA_READ_RESPONSE_LAST
);
120 qp
->s_rdma_len
-= len
;
121 bth0
= qp
->s_ack_state
<< 24;
124 case OP(RDMA_READ_RESPONSE_LAST
):
125 case OP(RDMA_READ_RESPONSE_ONLY
):
127 * We have to prevent new requests from changing
128 * the r_sge state while an ipath_verbs_send()
130 * Changing r_state allows the receiver
131 * to continue processing new packets.
132 * We do it here now instead of above so
133 * that we are sure the packet was sent before
134 * changing the state.
136 qp
->r_state
= OP(RDMA_READ_RESPONSE_LAST
);
137 qp
->s_ack_state
= OP(ACKNOWLEDGE
);
140 case OP(COMPARE_SWAP
):
144 qp
->r_state
= OP(SEND_LAST
);
145 qp
->s_ack_state
= OP(ACKNOWLEDGE
);
146 bth0
= IB_OPCODE_ATOMIC_ACKNOWLEDGE
<< 24;
147 ohdr
->u
.at
.aeth
= ipath_compute_aeth(qp
);
148 ohdr
->u
.at
.atomic_ack_eth
= cpu_to_be64(qp
->s_ack_atomic
);
149 hwords
+= sizeof(ohdr
->u
.at
) / 4;
153 /* Send a regular ACK. */
156 qp
->s_ack_state
= OP(ACKNOWLEDGE
);
157 bth0
= qp
->s_ack_state
<< 24;
158 ohdr
->u
.aeth
= ipath_compute_aeth(qp
);
161 qp
->s_hdrwords
= hwords
;
163 qp
->s_cur_size
= len
;
169 * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
170 * @qp: a pointer to the QP
171 * @ohdr: a pointer to the IB header being constructed
172 * @pmtu: the path MTU
173 * @bth0p: pointer to the BTH opcode word
174 * @bth2p: pointer to the BTH PSN word
176 * Return 1 if constructed; otherwise, return 0.
177 * Note the QP s_lock must be held.
179 static inline int ipath_make_rc_req(struct ipath_qp
*qp
,
180 struct ipath_other_headers
*ohdr
,
181 u32 pmtu
, u32
*bth0p
, u32
*bth2p
)
183 struct ipath_ibdev
*dev
= to_idev(qp
->ibqp
.device
);
184 struct ipath_sge_state
*ss
;
185 struct ipath_swqe
*wqe
;
192 if (!(ib_ipath_state_ops
[qp
->state
] & IPATH_PROCESS_SEND_OK
) ||
196 /* header size in 32-bit words LRH+BTH = (8+12)/4. */
200 /* Send a request. */
201 wqe
= get_swqe_ptr(qp
, qp
->s_cur
);
202 switch (qp
->s_state
) {
205 * Resend an old request or start a new one.
207 * We keep track of the current SWQE so that
208 * we don't reset the "furthest progress" state
209 * if we need to back up.
212 if (qp
->s_cur
== qp
->s_tail
) {
213 /* Check if send work queue is empty. */
214 if (qp
->s_tail
== qp
->s_head
)
216 qp
->s_psn
= wqe
->psn
= qp
->s_next_psn
;
220 * Note that we have to be careful not to modify the
221 * original work request since we may need to resend
224 qp
->s_sge
.sge
= wqe
->sg_list
[0];
225 qp
->s_sge
.sg_list
= wqe
->sg_list
+ 1;
226 qp
->s_sge
.num_sge
= wqe
->wr
.num_sge
;
227 qp
->s_len
= len
= wqe
->length
;
230 switch (wqe
->wr
.opcode
) {
232 case IB_WR_SEND_WITH_IMM
:
233 /* If no credit, return. */
234 if (qp
->s_lsn
!= (u32
) -1 &&
235 ipath_cmp24(wqe
->ssn
, qp
->s_lsn
+ 1) > 0)
237 wqe
->lpsn
= wqe
->psn
;
239 wqe
->lpsn
+= (len
- 1) / pmtu
;
240 qp
->s_state
= OP(SEND_FIRST
);
244 if (wqe
->wr
.opcode
== IB_WR_SEND
)
245 qp
->s_state
= OP(SEND_ONLY
);
247 qp
->s_state
= OP(SEND_ONLY_WITH_IMMEDIATE
);
248 /* Immediate data comes after the BTH */
249 ohdr
->u
.imm_data
= wqe
->wr
.imm_data
;
252 if (wqe
->wr
.send_flags
& IB_SEND_SOLICITED
)
254 bth2
= 1 << 31; /* Request ACK. */
255 if (++qp
->s_cur
== qp
->s_size
)
259 case IB_WR_RDMA_WRITE
:
263 case IB_WR_RDMA_WRITE_WITH_IMM
:
264 /* If no credit, return. */
265 if (qp
->s_lsn
!= (u32
) -1 &&
266 ipath_cmp24(wqe
->ssn
, qp
->s_lsn
+ 1) > 0)
268 ohdr
->u
.rc
.reth
.vaddr
=
269 cpu_to_be64(wqe
->wr
.wr
.rdma
.remote_addr
);
270 ohdr
->u
.rc
.reth
.rkey
=
271 cpu_to_be32(wqe
->wr
.wr
.rdma
.rkey
);
272 ohdr
->u
.rc
.reth
.length
= cpu_to_be32(len
);
273 hwords
+= sizeof(struct ib_reth
) / 4;
274 wqe
->lpsn
= wqe
->psn
;
276 wqe
->lpsn
+= (len
- 1) / pmtu
;
277 qp
->s_state
= OP(RDMA_WRITE_FIRST
);
281 if (wqe
->wr
.opcode
== IB_WR_RDMA_WRITE
)
282 qp
->s_state
= OP(RDMA_WRITE_ONLY
);
285 OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE
);
286 /* Immediate data comes
288 ohdr
->u
.rc
.imm_data
= wqe
->wr
.imm_data
;
290 if (wqe
->wr
.send_flags
& IB_SEND_SOLICITED
)
293 bth2
= 1 << 31; /* Request ACK. */
294 if (++qp
->s_cur
== qp
->s_size
)
298 case IB_WR_RDMA_READ
:
299 ohdr
->u
.rc
.reth
.vaddr
=
300 cpu_to_be64(wqe
->wr
.wr
.rdma
.remote_addr
);
301 ohdr
->u
.rc
.reth
.rkey
=
302 cpu_to_be32(wqe
->wr
.wr
.rdma
.rkey
);
303 ohdr
->u
.rc
.reth
.length
= cpu_to_be32(len
);
304 qp
->s_state
= OP(RDMA_READ_REQUEST
);
305 hwords
+= sizeof(ohdr
->u
.rc
.reth
) / 4;
309 * Adjust s_next_psn to count the
310 * expected number of responses.
313 qp
->s_next_psn
+= (len
- 1) / pmtu
;
314 wqe
->lpsn
= qp
->s_next_psn
++;
318 if (++qp
->s_cur
== qp
->s_size
)
322 case IB_WR_ATOMIC_CMP_AND_SWP
:
323 case IB_WR_ATOMIC_FETCH_AND_ADD
:
324 if (wqe
->wr
.opcode
== IB_WR_ATOMIC_CMP_AND_SWP
)
325 qp
->s_state
= OP(COMPARE_SWAP
);
327 qp
->s_state
= OP(FETCH_ADD
);
328 ohdr
->u
.atomic_eth
.vaddr
= cpu_to_be64(
329 wqe
->wr
.wr
.atomic
.remote_addr
);
330 ohdr
->u
.atomic_eth
.rkey
= cpu_to_be32(
331 wqe
->wr
.wr
.atomic
.rkey
);
332 ohdr
->u
.atomic_eth
.swap_data
= cpu_to_be64(
333 wqe
->wr
.wr
.atomic
.swap
);
334 ohdr
->u
.atomic_eth
.compare_data
= cpu_to_be64(
335 wqe
->wr
.wr
.atomic
.compare_add
);
336 hwords
+= sizeof(struct ib_atomic_eth
) / 4;
339 wqe
->lpsn
= wqe
->psn
;
341 if (++qp
->s_cur
== qp
->s_size
)
352 if (qp
->s_tail
>= qp
->s_size
)
355 bth2
|= qp
->s_psn
++ & IPS_PSN_MASK
;
356 if ((int)(qp
->s_psn
- qp
->s_next_psn
) > 0)
357 qp
->s_next_psn
= qp
->s_psn
;
358 spin_lock(&dev
->pending_lock
);
359 if (list_empty(&qp
->timerwait
))
360 list_add_tail(&qp
->timerwait
,
361 &dev
->pending
[dev
->pending_index
]);
362 spin_unlock(&dev
->pending_lock
);
365 case OP(RDMA_READ_RESPONSE_FIRST
):
367 * This case can only happen if a send is restarted. See
368 * ipath_restart_rc().
370 ipath_init_restart(qp
, wqe
);
373 qp
->s_state
= OP(SEND_MIDDLE
);
375 case OP(SEND_MIDDLE
):
376 bth2
= qp
->s_psn
++ & IPS_PSN_MASK
;
377 if ((int)(qp
->s_psn
- qp
->s_next_psn
) > 0)
378 qp
->s_next_psn
= qp
->s_psn
;
383 * Request an ACK every 1/2 MB to avoid retransmit
386 if (((wqe
->length
- len
) % (512 * 1024)) == 0)
391 if (wqe
->wr
.opcode
== IB_WR_SEND
)
392 qp
->s_state
= OP(SEND_LAST
);
394 qp
->s_state
= OP(SEND_LAST_WITH_IMMEDIATE
);
395 /* Immediate data comes after the BTH */
396 ohdr
->u
.imm_data
= wqe
->wr
.imm_data
;
399 if (wqe
->wr
.send_flags
& IB_SEND_SOLICITED
)
401 bth2
|= 1 << 31; /* Request ACK. */
403 if (qp
->s_cur
>= qp
->s_size
)
407 case OP(RDMA_READ_RESPONSE_LAST
):
409 * This case can only happen if a RDMA write is restarted.
410 * See ipath_restart_rc().
412 ipath_init_restart(qp
, wqe
);
414 case OP(RDMA_WRITE_FIRST
):
415 qp
->s_state
= OP(RDMA_WRITE_MIDDLE
);
417 case OP(RDMA_WRITE_MIDDLE
):
418 bth2
= qp
->s_psn
++ & IPS_PSN_MASK
;
419 if ((int)(qp
->s_psn
- qp
->s_next_psn
) > 0)
420 qp
->s_next_psn
= qp
->s_psn
;
425 * Request an ACK every 1/2 MB to avoid retransmit
428 if (((wqe
->length
- len
) % (512 * 1024)) == 0)
433 if (wqe
->wr
.opcode
== IB_WR_RDMA_WRITE
)
434 qp
->s_state
= OP(RDMA_WRITE_LAST
);
436 qp
->s_state
= OP(RDMA_WRITE_LAST_WITH_IMMEDIATE
);
437 /* Immediate data comes after the BTH */
438 ohdr
->u
.imm_data
= wqe
->wr
.imm_data
;
440 if (wqe
->wr
.send_flags
& IB_SEND_SOLICITED
)
443 bth2
|= 1 << 31; /* Request ACK. */
445 if (qp
->s_cur
>= qp
->s_size
)
449 case OP(RDMA_READ_RESPONSE_MIDDLE
):
451 * This case can only happen if a RDMA read is restarted.
452 * See ipath_restart_rc().
454 ipath_init_restart(qp
, wqe
);
455 len
= ((qp
->s_psn
- wqe
->psn
) & IPS_PSN_MASK
) * pmtu
;
456 ohdr
->u
.rc
.reth
.vaddr
=
457 cpu_to_be64(wqe
->wr
.wr
.rdma
.remote_addr
+ len
);
458 ohdr
->u
.rc
.reth
.rkey
=
459 cpu_to_be32(wqe
->wr
.wr
.rdma
.rkey
);
460 ohdr
->u
.rc
.reth
.length
= cpu_to_be32(qp
->s_len
);
461 qp
->s_state
= OP(RDMA_READ_REQUEST
);
462 hwords
+= sizeof(ohdr
->u
.rc
.reth
) / 4;
463 bth2
= qp
->s_psn
++ & IPS_PSN_MASK
;
464 if ((int)(qp
->s_psn
- qp
->s_next_psn
) > 0)
465 qp
->s_next_psn
= qp
->s_psn
;
469 if (qp
->s_cur
== qp
->s_size
)
473 case OP(RDMA_READ_REQUEST
):
474 case OP(COMPARE_SWAP
):
477 * We shouldn't start anything new until this request is
478 * finished. The ACK will handle rescheduling us. XXX The
479 * number of outstanding ones is negotiated at connection
480 * setup time (see pg. 258,289)? XXX Also, if we support
481 * multiple outstanding requests, we need to check the WQE
482 * IB_SEND_FENCE flag and not send a new request if a RDMA
483 * read or atomic is pending.
488 qp
->s_hdrwords
= hwords
;
490 qp
->s_cur_size
= len
;
491 *bth0p
= bth0
| (qp
->s_state
<< 24);
499 static inline void ipath_make_rc_grh(struct ipath_qp
*qp
,
500 struct ib_global_route
*grh
,
503 struct ipath_ibdev
*dev
= to_idev(qp
->ibqp
.device
);
505 /* GRH header size in 32-bit words. */
506 qp
->s_hdrwords
+= 10;
507 qp
->s_hdr
.u
.l
.grh
.version_tclass_flow
=
508 cpu_to_be32((6 << 28) |
509 (grh
->traffic_class
<< 20) |
511 qp
->s_hdr
.u
.l
.grh
.paylen
=
512 cpu_to_be16(((qp
->s_hdrwords
- 12) + nwords
+
514 /* next_hdr is defined by C8-7 in ch. 8.4.1 */
515 qp
->s_hdr
.u
.l
.grh
.next_hdr
= 0x1B;
516 qp
->s_hdr
.u
.l
.grh
.hop_limit
= grh
->hop_limit
;
517 /* The SGID is 32-bit aligned. */
518 qp
->s_hdr
.u
.l
.grh
.sgid
.global
.subnet_prefix
= dev
->gid_prefix
;
519 qp
->s_hdr
.u
.l
.grh
.sgid
.global
.interface_id
=
520 ipath_layer_get_guid(dev
->dd
);
521 qp
->s_hdr
.u
.l
.grh
.dgid
= grh
->dgid
;
525 * ipath_do_rc_send - perform a send on an RC QP
526 * @data: contains a pointer to the QP
528 * Process entries in the send work queue until credit or queue is
529 * exhausted. Only allow one CPU to send a packet per QP (tasklet).
530 * Otherwise, after we drop the QP s_lock, two threads could send
531 * packets out of order.
533 void ipath_do_rc_send(unsigned long data
)
535 struct ipath_qp
*qp
= (struct ipath_qp
*)data
;
536 struct ipath_ibdev
*dev
= to_idev(qp
->ibqp
.device
);
543 u32 pmtu
= ib_mtu_enum_to_int(qp
->path_mtu
);
544 struct ipath_other_headers
*ohdr
;
546 if (test_and_set_bit(IPATH_S_BUSY
, &qp
->s_flags
))
549 if (unlikely(qp
->remote_ah_attr
.dlid
==
550 ipath_layer_get_lid(dev
->dd
))) {
554 * Pass in an uninitialized ib_wc to be consistent with
555 * other places where ipath_ruc_loopback() is called.
557 ipath_ruc_loopback(qp
, &wc
);
561 ohdr
= &qp
->s_hdr
.u
.oth
;
562 if (qp
->remote_ah_attr
.ah_flags
& IB_AH_GRH
)
563 ohdr
= &qp
->s_hdr
.u
.l
.oth
;
566 /* Check for a constructed packet to be sent. */
567 if (qp
->s_hdrwords
!= 0) {
569 * If no PIO bufs are available, return. An interrupt will
570 * call ipath_ib_piobufavail() when one is available.
572 _VERBS_INFO("h %u %p\n", qp
->s_hdrwords
, &qp
->s_hdr
);
573 _VERBS_INFO("d %u %p %u %p %u %u %u %u\n", qp
->s_cur_size
,
574 qp
->s_cur_sge
->sg_list
,
575 qp
->s_cur_sge
->num_sge
,
576 qp
->s_cur_sge
->sge
.vaddr
,
577 qp
->s_cur_sge
->sge
.sge_length
,
578 qp
->s_cur_sge
->sge
.length
,
579 qp
->s_cur_sge
->sge
.m
,
580 qp
->s_cur_sge
->sge
.n
);
581 if (ipath_verbs_send(dev
->dd
, qp
->s_hdrwords
,
582 (u32
*) &qp
->s_hdr
, qp
->s_cur_size
,
584 ipath_no_bufs_available(qp
, dev
);
587 dev
->n_unicast_xmit
++;
588 /* Record that we sent the packet and s_hdr is empty. */
593 * The lock is needed to synchronize between setting
594 * qp->s_ack_state, resend timer, and post_send().
596 spin_lock_irqsave(&qp
->s_lock
, flags
);
598 /* Sending responses has higher priority over sending requests. */
599 if (qp
->s_ack_state
!= OP(ACKNOWLEDGE
) &&
600 (bth0
= ipath_make_rc_ack(qp
, ohdr
, pmtu
)) != 0)
601 bth2
= qp
->s_ack_psn
++ & IPS_PSN_MASK
;
602 else if (!ipath_make_rc_req(qp
, ohdr
, pmtu
, &bth0
, &bth2
))
605 spin_unlock_irqrestore(&qp
->s_lock
, flags
);
607 /* Construct the header. */
608 extra_bytes
= (4 - qp
->s_cur_size
) & 3;
609 nwords
= (qp
->s_cur_size
+ extra_bytes
) >> 2;
611 if (unlikely(qp
->remote_ah_attr
.ah_flags
& IB_AH_GRH
)) {
612 ipath_make_rc_grh(qp
, &qp
->remote_ah_attr
.grh
, nwords
);
615 lrh0
|= qp
->remote_ah_attr
.sl
<< 4;
616 qp
->s_hdr
.lrh
[0] = cpu_to_be16(lrh0
);
617 qp
->s_hdr
.lrh
[1] = cpu_to_be16(qp
->remote_ah_attr
.dlid
);
618 qp
->s_hdr
.lrh
[2] = cpu_to_be16(qp
->s_hdrwords
+ nwords
+
620 qp
->s_hdr
.lrh
[3] = cpu_to_be16(ipath_layer_get_lid(dev
->dd
));
621 bth0
|= ipath_layer_get_pkey(dev
->dd
, qp
->s_pkey_index
);
622 bth0
|= extra_bytes
<< 20;
623 ohdr
->bth
[0] = cpu_to_be32(bth0
);
624 ohdr
->bth
[1] = cpu_to_be32(qp
->remote_qpn
);
625 ohdr
->bth
[2] = cpu_to_be32(bth2
);
627 /* Check for more work to do. */
631 spin_unlock_irqrestore(&qp
->s_lock
, flags
);
633 clear_bit(IPATH_S_BUSY
, &qp
->s_flags
);
638 static void send_rc_ack(struct ipath_qp
*qp
)
640 struct ipath_ibdev
*dev
= to_idev(qp
->ibqp
.device
);
643 struct ipath_other_headers
*ohdr
;
645 /* Construct the header. */
646 ohdr
= &qp
->s_hdr
.u
.oth
;
648 /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
650 if (unlikely(qp
->remote_ah_attr
.ah_flags
& IB_AH_GRH
)) {
651 ipath_make_rc_grh(qp
, &qp
->remote_ah_attr
.grh
, 0);
652 ohdr
= &qp
->s_hdr
.u
.l
.oth
;
655 bth0
= ipath_layer_get_pkey(dev
->dd
, qp
->s_pkey_index
);
656 ohdr
->u
.aeth
= ipath_compute_aeth(qp
);
657 if (qp
->s_ack_state
>= OP(COMPARE_SWAP
)) {
658 bth0
|= IB_OPCODE_ATOMIC_ACKNOWLEDGE
<< 24;
659 ohdr
->u
.at
.atomic_ack_eth
= cpu_to_be64(qp
->s_ack_atomic
);
660 qp
->s_hdrwords
+= sizeof(ohdr
->u
.at
.atomic_ack_eth
) / 4;
663 bth0
|= OP(ACKNOWLEDGE
) << 24;
664 lrh0
|= qp
->remote_ah_attr
.sl
<< 4;
665 qp
->s_hdr
.lrh
[0] = cpu_to_be16(lrh0
);
666 qp
->s_hdr
.lrh
[1] = cpu_to_be16(qp
->remote_ah_attr
.dlid
);
667 qp
->s_hdr
.lrh
[2] = cpu_to_be16(qp
->s_hdrwords
+ SIZE_OF_CRC
);
668 qp
->s_hdr
.lrh
[3] = cpu_to_be16(ipath_layer_get_lid(dev
->dd
));
669 ohdr
->bth
[0] = cpu_to_be32(bth0
);
670 ohdr
->bth
[1] = cpu_to_be32(qp
->remote_qpn
);
671 ohdr
->bth
[2] = cpu_to_be32(qp
->s_ack_psn
& IPS_PSN_MASK
);
674 * If we can send the ACK, clear the ACK state.
676 if (ipath_verbs_send(dev
->dd
, qp
->s_hdrwords
, (u32
*) &qp
->s_hdr
,
678 qp
->s_ack_state
= OP(ACKNOWLEDGE
);
680 dev
->n_unicast_xmit
++;
685 * ipath_restart_rc - back up requester to resend the last un-ACKed request
686 * @qp: the QP to restart
687 * @psn: packet sequence number for the request
688 * @wc: the work completion request
690 * The QP s_lock should be held.
692 void ipath_restart_rc(struct ipath_qp
*qp
, u32 psn
, struct ib_wc
*wc
)
694 struct ipath_swqe
*wqe
= get_swqe_ptr(qp
, qp
->s_last
);
695 struct ipath_ibdev
*dev
;
699 * If there are no requests pending, we are done.
701 if (ipath_cmp24(psn
, qp
->s_next_psn
) >= 0 ||
702 qp
->s_last
== qp
->s_tail
)
705 if (qp
->s_retry
== 0) {
706 wc
->wr_id
= wqe
->wr
.wr_id
;
707 wc
->status
= IB_WC_RETRY_EXC_ERR
;
708 wc
->opcode
= ib_ipath_wc_opcode
[wqe
->wr
.opcode
];
711 wc
->qp_num
= qp
->ibqp
.qp_num
;
712 wc
->src_qp
= qp
->remote_qpn
;
714 wc
->slid
= qp
->remote_ah_attr
.dlid
;
715 wc
->sl
= qp
->remote_ah_attr
.sl
;
716 wc
->dlid_path_bits
= 0;
718 ipath_sqerror_qp(qp
, wc
);
724 * Remove the QP from the timeout queue.
725 * Note: it may already have been removed by ipath_ib_timer().
727 dev
= to_idev(qp
->ibqp
.device
);
728 spin_lock(&dev
->pending_lock
);
729 if (!list_empty(&qp
->timerwait
))
730 list_del_init(&qp
->timerwait
);
731 spin_unlock(&dev
->pending_lock
);
733 if (wqe
->wr
.opcode
== IB_WR_RDMA_READ
)
736 dev
->n_rc_resends
+= (int)qp
->s_psn
- (int)psn
;
739 * If we are starting the request from the beginning, let the normal
740 * send code handle initialization.
742 qp
->s_cur
= qp
->s_last
;
743 if (ipath_cmp24(psn
, wqe
->psn
) <= 0) {
744 qp
->s_state
= OP(SEND_LAST
);
745 qp
->s_psn
= wqe
->psn
;
749 if (++n
== qp
->s_size
)
751 if (n
== qp
->s_tail
) {
752 if (ipath_cmp24(psn
, qp
->s_next_psn
) >= 0) {
754 wqe
= get_swqe_ptr(qp
, n
);
758 wqe
= get_swqe_ptr(qp
, n
);
759 if (ipath_cmp24(psn
, wqe
->psn
) < 0)
766 * Reset the state to restart in the middle of a request.
767 * Don't change the s_sge, s_cur_sge, or s_cur_size.
768 * See ipath_do_rc_send().
770 switch (wqe
->wr
.opcode
) {
772 case IB_WR_SEND_WITH_IMM
:
773 qp
->s_state
= OP(RDMA_READ_RESPONSE_FIRST
);
776 case IB_WR_RDMA_WRITE
:
777 case IB_WR_RDMA_WRITE_WITH_IMM
:
778 qp
->s_state
= OP(RDMA_READ_RESPONSE_LAST
);
781 case IB_WR_RDMA_READ
:
783 OP(RDMA_READ_RESPONSE_MIDDLE
);
788 * This case shouldn't happen since it's only
791 qp
->s_state
= OP(SEND_LAST
);
796 tasklet_hi_schedule(&qp
->s_task
);
803 * reset_psn - reset the QP state to send starting from PSN
805 * @psn: the packet sequence number to restart at
807 * This is called from ipath_rc_rcv() to process an incoming RC ACK
809 * Called at interrupt level with the QP s_lock held.
811 static void reset_psn(struct ipath_qp
*qp
, u32 psn
)
813 struct ipath_swqe
*wqe
;
817 wqe
= get_swqe_ptr(qp
, n
);
819 if (++n
== qp
->s_size
)
821 if (n
== qp
->s_tail
) {
822 if (ipath_cmp24(psn
, qp
->s_next_psn
) >= 0) {
824 wqe
= get_swqe_ptr(qp
, n
);
828 wqe
= get_swqe_ptr(qp
, n
);
829 if (ipath_cmp24(psn
, wqe
->psn
) < 0)
836 * Set the state to restart in the middle of a
837 * request. Don't change the s_sge, s_cur_sge, or
838 * s_cur_size. See ipath_do_rc_send().
840 switch (wqe
->wr
.opcode
) {
842 case IB_WR_SEND_WITH_IMM
:
843 qp
->s_state
= OP(RDMA_READ_RESPONSE_FIRST
);
846 case IB_WR_RDMA_WRITE
:
847 case IB_WR_RDMA_WRITE_WITH_IMM
:
848 qp
->s_state
= OP(RDMA_READ_RESPONSE_LAST
);
851 case IB_WR_RDMA_READ
:
852 qp
->s_state
= OP(RDMA_READ_RESPONSE_MIDDLE
);
857 * This case shouldn't happen since it's only
860 qp
->s_state
= OP(SEND_LAST
);
865 * do_rc_ack - process an incoming RC ACK
866 * @qp: the QP the ACK came in on
867 * @psn: the packet sequence number of the ACK
868 * @opcode: the opcode of the request that resulted in the ACK
870 * This is called from ipath_rc_rcv() to process an incoming RC ACK
872 * Called at interrupt level with the QP s_lock held.
873 * Returns 1 if OK, 0 if current operation should be aborted (NAK).
875 static int do_rc_ack(struct ipath_qp
*qp
, u32 aeth
, u32 psn
, int opcode
)
877 struct ipath_ibdev
*dev
= to_idev(qp
->ibqp
.device
);
879 struct ipath_swqe
*wqe
;
883 * Remove the QP from the timeout queue (or RNR timeout queue).
884 * If ipath_ib_timer() has already removed it,
885 * it's OK since we hold the QP s_lock and ipath_restart_rc()
886 * just won't find anything to restart if we ACK everything.
888 spin_lock(&dev
->pending_lock
);
889 if (!list_empty(&qp
->timerwait
))
890 list_del_init(&qp
->timerwait
);
891 spin_unlock(&dev
->pending_lock
);
894 * Note that NAKs implicitly ACK outstanding SEND and RDMA write
895 * requests and implicitly NAK RDMA read and atomic requests issued
896 * before the NAK'ed request. The MSN won't include the NAK'ed
897 * request but will include an ACK'ed request(s).
899 wqe
= get_swqe_ptr(qp
, qp
->s_last
);
901 /* Nothing is pending to ACK/NAK. */
902 if (qp
->s_last
== qp
->s_tail
)
906 * The MSN might be for a later WQE than the PSN indicates so
907 * only complete WQEs that the PSN finishes.
909 while (ipath_cmp24(psn
, wqe
->lpsn
) >= 0) {
910 /* If we are ACKing a WQE, the MSN should be >= the SSN. */
911 if (ipath_cmp24(aeth
, wqe
->ssn
) < 0)
914 * If this request is a RDMA read or atomic, and the ACK is
915 * for a later operation, this ACK NAKs the RDMA read or
916 * atomic. In other words, only a RDMA_READ_LAST or ONLY
917 * can ACK a RDMA read and likewise for atomic ops. Note
918 * that the NAK case can only happen if relaxed ordering is
919 * used and requests are sent after an RDMA read or atomic
920 * is sent but before the response is received.
922 if ((wqe
->wr
.opcode
== IB_WR_RDMA_READ
&&
923 opcode
!= OP(RDMA_READ_RESPONSE_LAST
)) ||
924 ((wqe
->wr
.opcode
== IB_WR_ATOMIC_CMP_AND_SWP
||
925 wqe
->wr
.opcode
== IB_WR_ATOMIC_FETCH_AND_ADD
) &&
926 (opcode
!= OP(ATOMIC_ACKNOWLEDGE
) ||
927 ipath_cmp24(wqe
->psn
, psn
) != 0))) {
929 * The last valid PSN seen is the previous
932 qp
->s_last_psn
= wqe
->psn
- 1;
933 /* Retry this request. */
934 ipath_restart_rc(qp
, wqe
->psn
, &wc
);
936 * No need to process the ACK/NAK since we are
937 * restarting an earlier request.
941 /* Post a send completion queue entry if requested. */
942 if (!test_bit(IPATH_S_SIGNAL_REQ_WR
, &qp
->s_flags
) ||
943 (wqe
->wr
.send_flags
& IB_SEND_SIGNALED
)) {
944 wc
.wr_id
= wqe
->wr
.wr_id
;
945 wc
.status
= IB_WC_SUCCESS
;
946 wc
.opcode
= ib_ipath_wc_opcode
[wqe
->wr
.opcode
];
948 wc
.byte_len
= wqe
->length
;
949 wc
.qp_num
= qp
->ibqp
.qp_num
;
950 wc
.src_qp
= qp
->remote_qpn
;
952 wc
.slid
= qp
->remote_ah_attr
.dlid
;
953 wc
.sl
= qp
->remote_ah_attr
.sl
;
954 wc
.dlid_path_bits
= 0;
956 ipath_cq_enter(to_icq(qp
->ibqp
.send_cq
), &wc
, 0);
958 qp
->s_retry
= qp
->s_retry_cnt
;
960 * If we are completing a request which is in the process of
961 * being resent, we can stop resending it since we know the
962 * responder has already seen it.
964 if (qp
->s_last
== qp
->s_cur
) {
965 if (++qp
->s_cur
>= qp
->s_size
)
967 wqe
= get_swqe_ptr(qp
, qp
->s_cur
);
968 qp
->s_state
= OP(SEND_LAST
);
969 qp
->s_psn
= wqe
->psn
;
971 if (++qp
->s_last
>= qp
->s_size
)
973 wqe
= get_swqe_ptr(qp
, qp
->s_last
);
974 if (qp
->s_last
== qp
->s_tail
)
978 switch (aeth
>> 29) {
981 /* If this is a partial ACK, reset the retransmit timer. */
982 if (qp
->s_last
!= qp
->s_tail
) {
983 spin_lock(&dev
->pending_lock
);
984 list_add_tail(&qp
->timerwait
,
985 &dev
->pending
[dev
->pending_index
]);
986 spin_unlock(&dev
->pending_lock
);
988 ipath_get_credit(qp
, aeth
);
989 qp
->s_rnr_retry
= qp
->s_rnr_retry_cnt
;
990 qp
->s_retry
= qp
->s_retry_cnt
;
991 qp
->s_last_psn
= psn
;
995 case 1: /* RNR NAK */
997 if (qp
->s_rnr_retry
== 0) {
998 if (qp
->s_last
== qp
->s_tail
)
1001 wc
.status
= IB_WC_RNR_RETRY_EXC_ERR
;
1004 if (qp
->s_rnr_retry_cnt
< 7)
1006 if (qp
->s_last
== qp
->s_tail
)
1009 /* The last valid PSN seen is the previous request's. */
1010 qp
->s_last_psn
= wqe
->psn
- 1;
1012 dev
->n_rc_resends
+= (int)qp
->s_psn
- (int)psn
;
1015 * If we are starting the request from the beginning, let
1016 * the normal send code handle initialization.
1018 qp
->s_cur
= qp
->s_last
;
1019 wqe
= get_swqe_ptr(qp
, qp
->s_cur
);
1020 if (ipath_cmp24(psn
, wqe
->psn
) <= 0) {
1021 qp
->s_state
= OP(SEND_LAST
);
1022 qp
->s_psn
= wqe
->psn
;
1027 ib_ipath_rnr_table
[(aeth
>> IPS_AETH_CREDIT_SHIFT
) &
1028 IPS_AETH_CREDIT_MASK
];
1029 ipath_insert_rnr_queue(qp
);
1033 /* The last valid PSN seen is the previous request's. */
1034 if (qp
->s_last
!= qp
->s_tail
)
1035 qp
->s_last_psn
= wqe
->psn
- 1;
1036 switch ((aeth
>> IPS_AETH_CREDIT_SHIFT
) &
1037 IPS_AETH_CREDIT_MASK
) {
1038 case 0: /* PSN sequence error */
1041 * Back up to the responder's expected PSN. XXX
1042 * Note that we might get a NAK in the middle of an
1043 * RDMA READ response which terminates the RDMA
1046 if (qp
->s_last
== qp
->s_tail
)
1049 if (ipath_cmp24(psn
, wqe
->psn
) < 0)
1052 /* Retry the request. */
1053 ipath_restart_rc(qp
, psn
, &wc
);
1056 case 1: /* Invalid Request */
1057 wc
.status
= IB_WC_REM_INV_REQ_ERR
;
1058 dev
->n_other_naks
++;
1061 case 2: /* Remote Access Error */
1062 wc
.status
= IB_WC_REM_ACCESS_ERR
;
1063 dev
->n_other_naks
++;
1066 case 3: /* Remote Operation Error */
1067 wc
.status
= IB_WC_REM_OP_ERR
;
1068 dev
->n_other_naks
++;
1070 wc
.wr_id
= wqe
->wr
.wr_id
;
1071 wc
.opcode
= ib_ipath_wc_opcode
[wqe
->wr
.opcode
];
1074 wc
.qp_num
= qp
->ibqp
.qp_num
;
1075 wc
.src_qp
= qp
->remote_qpn
;
1077 wc
.slid
= qp
->remote_ah_attr
.dlid
;
1078 wc
.sl
= qp
->remote_ah_attr
.sl
;
1079 wc
.dlid_path_bits
= 0;
1081 ipath_sqerror_qp(qp
, &wc
);
1085 /* Ignore other reserved NAK error codes */
1088 qp
->s_rnr_retry
= qp
->s_rnr_retry_cnt
;
1091 default: /* 2: reserved */
1093 /* Ignore reserved NAK codes. */
1102 * ipath_rc_rcv_resp - process an incoming RC response packet
1103 * @dev: the device this packet came in on
1104 * @ohdr: the other headers for this packet
1105 * @data: the packet data
1106 * @tlen: the packet length
1107 * @qp: the QP for this packet
1108 * @opcode: the opcode for this packet
1109 * @psn: the packet sequence number for this packet
1110 * @hdrsize: the header length
1111 * @pmtu: the path MTU
1112 * @header_in_data: true if part of the header data is in the data buffer
1114 * This is called from ipath_rc_rcv() to process an incoming RC response
1115 * packet for the given QP.
1116 * Called at interrupt level.
1118 static inline void ipath_rc_rcv_resp(struct ipath_ibdev
*dev
,
1119 struct ipath_other_headers
*ohdr
,
1120 void *data
, u32 tlen
,
1121 struct ipath_qp
*qp
,
1123 u32 psn
, u32 hdrsize
, u32 pmtu
,
1126 unsigned long flags
;
1132 spin_lock_irqsave(&qp
->s_lock
, flags
);
1134 /* Ignore invalid responses. */
1135 if (ipath_cmp24(psn
, qp
->s_next_psn
) >= 0)
1138 /* Ignore duplicate responses. */
1139 diff
= ipath_cmp24(psn
, qp
->s_last_psn
);
1140 if (unlikely(diff
<= 0)) {
1141 /* Update credits for "ghost" ACKs */
1142 if (diff
== 0 && opcode
== OP(ACKNOWLEDGE
)) {
1143 if (!header_in_data
)
1144 aeth
= be32_to_cpu(ohdr
->u
.aeth
);
1146 aeth
= be32_to_cpu(((__be32
*) data
)[0]);
1147 data
+= sizeof(__be32
);
1149 if ((aeth
>> 29) == 0)
1150 ipath_get_credit(qp
, aeth
);
1156 case OP(ACKNOWLEDGE
):
1157 case OP(ATOMIC_ACKNOWLEDGE
):
1158 case OP(RDMA_READ_RESPONSE_FIRST
):
1159 if (!header_in_data
)
1160 aeth
= be32_to_cpu(ohdr
->u
.aeth
);
1162 aeth
= be32_to_cpu(((__be32
*) data
)[0]);
1163 data
+= sizeof(__be32
);
1165 if (opcode
== OP(ATOMIC_ACKNOWLEDGE
))
1166 *(u64
*) qp
->s_sge
.sge
.vaddr
= *(u64
*) data
;
1167 if (!do_rc_ack(qp
, aeth
, psn
, opcode
) ||
1168 opcode
!= OP(RDMA_READ_RESPONSE_FIRST
))
1172 * do_rc_ack() has already checked the PSN so skip
1173 * the sequence check.
1177 case OP(RDMA_READ_RESPONSE_MIDDLE
):
1178 /* no AETH, no ACK */
1179 if (unlikely(ipath_cmp24(psn
, qp
->s_last_psn
+ 1))) {
1181 ipath_restart_rc(qp
, qp
->s_last_psn
+ 1, &wc
);
1185 if (unlikely(qp
->s_state
!= OP(RDMA_READ_REQUEST
)))
1187 if (unlikely(tlen
!= (hdrsize
+ pmtu
+ 4)))
1189 if (unlikely(pmtu
>= qp
->s_len
))
1191 /* We got a response so update the timeout. */
1192 if (unlikely(qp
->s_last
== qp
->s_tail
||
1193 get_swqe_ptr(qp
, qp
->s_last
)->wr
.opcode
!=
1196 spin_lock(&dev
->pending_lock
);
1197 if (qp
->s_rnr_timeout
== 0 && !list_empty(&qp
->timerwait
))
1198 list_move_tail(&qp
->timerwait
,
1199 &dev
->pending
[dev
->pending_index
]);
1200 spin_unlock(&dev
->pending_lock
);
1202 * Update the RDMA receive state but do the copy w/o holding the
1203 * locks and blocking interrupts. XXX Yet another place that
1204 * affects relaxed RDMA order since we don't want s_sge modified.
1207 qp
->s_last_psn
= psn
;
1208 spin_unlock_irqrestore(&qp
->s_lock
, flags
);
1209 ipath_copy_sge(&qp
->s_sge
, data
, pmtu
);
1212 case OP(RDMA_READ_RESPONSE_LAST
):
1213 /* ACKs READ req. */
1214 if (unlikely(ipath_cmp24(psn
, qp
->s_last_psn
+ 1))) {
1216 ipath_restart_rc(qp
, qp
->s_last_psn
+ 1, &wc
);
1220 case OP(RDMA_READ_RESPONSE_ONLY
):
1221 if (unlikely(qp
->s_state
!= OP(RDMA_READ_REQUEST
)))
1224 * Get the number of bytes the message was padded by.
1226 pad
= (be32_to_cpu(ohdr
->bth
[0]) >> 20) & 3;
1228 * Check that the data size is >= 1 && <= pmtu.
1229 * Remember to account for the AETH header (4) and
1232 if (unlikely(tlen
<= (hdrsize
+ pad
+ 8))) {
1234 * XXX Need to generate an error CQ
1239 tlen
-= hdrsize
+ pad
+ 8;
1240 if (unlikely(tlen
!= qp
->s_len
)) {
1242 * XXX Need to generate an error CQ
1247 if (!header_in_data
)
1248 aeth
= be32_to_cpu(ohdr
->u
.aeth
);
1250 aeth
= be32_to_cpu(((__be32
*) data
)[0]);
1251 data
+= sizeof(__be32
);
1253 ipath_copy_sge(&qp
->s_sge
, data
, tlen
);
1254 if (do_rc_ack(qp
, aeth
, psn
, OP(RDMA_READ_RESPONSE_LAST
))) {
1256 * Change the state so we continue
1257 * processing new requests.
1259 qp
->s_state
= OP(SEND_LAST
);
1265 spin_unlock_irqrestore(&qp
->s_lock
, flags
);
1271 * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
1272 * @dev: the device this packet came in on
1273 * @ohdr: the other headers for this packet
1274 * @data: the packet data
1275 * @qp: the QP for this packet
1276 * @opcode: the opcode for this packet
1277 * @psn: the packet sequence number for this packet
1278 * @diff: the difference between the PSN and the expected PSN
1279 * @header_in_data: true if part of the header data is in the data buffer
1281 * This is called from ipath_rc_rcv() to process an unexpected
1282 * incoming RC packet for the given QP.
1283 * Called at interrupt level.
1284 * Return 1 if no more processing is needed; otherwise return 0 to
1285 * schedule a response to be sent and the s_lock unlocked.
/*
 * ipath_rc_rcv_error() - handle an RC request whose PSN differs from the
 * PSN the receiver expects (the caller invokes it when the 24-bit PSN
 * difference is non-zero).
 *
 * NOTE(review): the embedded original line numbers in this listing skip
 * (for example 1288 -> 1290 -> 1296), so the remaining parameters, the
 * conditions selecting each section, the else/goto/return statements and
 * the closing braces are not visible here. The summary below covers only
 * the statements that are visible; confirm the control flow against the
 * complete file.
 *
 * Packet sequence error section: qp->s_lock is taken. If s_ack_state is at
 * or above OP(RDMA_READ_REQUEST) and is not IB_OPCODE_ACKNOWLEDGE, or if
 * s_nak_state is non-zero, the lock is released without queueing anything.
 * Otherwise a NAK is recorded: s_ack_state = OP(SEND_ONLY),
 * s_nak_state = IB_NAK_PSN_ERROR and s_ack_psn = qp->r_psn (the expected
 * PSN).
 *
 * Duplicate request section: qp->s_lock is taken. If s_ack_state is not
 * IB_OPCODE_ACKNOWLEDGE and ipath_cmp24(psn, s_ack_psn) >= 0, then
 * s_ack_psn is overwritten with psn only when s_ack_state is below
 * IB_OPCODE_RDMA_READ_REQUEST, and the lock is released.
 *
 * OP(RDMA_READ_REQUEST) case: if s_ack_state is not OP(ACKNOWLEDGE) and is
 * at or above IB_OPCODE_RDMA_READ_REQUEST, the lock is released and
 * dev->n_rdma_dup_busy is incremented. Otherwise the RETH is located
 * (ohdr->u.rc.reth when header_in_data is false; the other visible
 * assignment reads it from data and advances data by sizeof(*reth)),
 * s_rdma_len is loaded from reth->length, and for a non-zero length the
 * rkey and vaddr are decoded and checked with ipath_rkey_ok() against
 * IB_ACCESS_REMOTE_READ, filling qp->s_rdma_sge. The visible block that
 * clears the s_rdma_sge fields is presumably the zero-length branch --
 * TODO confirm.
 *
 * OP(COMPARE_SWAP) case: if (psn & IPS_PSN_MASK) differs from
 * qp->r_atomic_psn the lock is released; the visible follow-on statement
 * copies qp->r_atomic_data into qp->s_ack_atomic so the saved result can
 * be resent.
 *
 * Final visible statements: s_ack_state = opcode, s_nak_state = 0 and
 * s_ack_psn = psn.
 */
1287 static inline int ipath_rc_rcv_error(struct ipath_ibdev
*dev
,
1288 struct ipath_other_headers
*ohdr
,
1290 struct ipath_qp
*qp
,
1296 struct ib_reth
*reth
;
1300 * Packet sequence error.
1301 * A NAK will ACK earlier sends and RDMA writes.
1302 * Don't queue the NAK if a RDMA read, atomic, or
1303 * NAK is pending though.
1305 spin_lock(&qp
->s_lock
);
1306 if ((qp
->s_ack_state
>= OP(RDMA_READ_REQUEST
) &&
1307 qp
->s_ack_state
!= IB_OPCODE_ACKNOWLEDGE
) ||
1308 qp
->s_nak_state
!= 0) {
1309 spin_unlock(&qp
->s_lock
);
1312 qp
->s_ack_state
= OP(SEND_ONLY
);
1313 qp
->s_nak_state
= IB_NAK_PSN_ERROR
;
1314 /* Use the expected PSN. */
1315 qp
->s_ack_psn
= qp
->r_psn
;
1320 * Handle a duplicate request. Don't re-execute SEND, RDMA
1321 * write or atomic op. Don't NAK errors, just silently drop
1322 * the duplicate request. Note that r_sge, r_len, and
1323 * r_rcv_len may be in use so don't modify them.
1325 * We are supposed to ACK the earliest duplicate PSN but we
1326 * can coalesce an outstanding duplicate ACK. We have to
1327 * send the earliest so that RDMA reads can be restarted at
1328 * the requester's expected PSN.
1330 spin_lock(&qp
->s_lock
);
1331 if (qp
->s_ack_state
!= IB_OPCODE_ACKNOWLEDGE
&&
1332 ipath_cmp24(psn
, qp
->s_ack_psn
) >= 0) {
1333 if (qp
->s_ack_state
< IB_OPCODE_RDMA_READ_REQUEST
)
1334 qp
->s_ack_psn
= psn
;
1335 spin_unlock(&qp
->s_lock
);
1339 case OP(RDMA_READ_REQUEST
):
1341 * We have to be careful to not change s_rdma_sge
1342 * while ipath_do_rc_send() is using it and not
1343 * holding the s_lock.
1345 if (qp
->s_ack_state
!= OP(ACKNOWLEDGE
) &&
1346 qp
->s_ack_state
>= IB_OPCODE_RDMA_READ_REQUEST
) {
1347 spin_unlock(&qp
->s_lock
);
1348 dev
->n_rdma_dup_busy
++;
1351 /* RETH comes after BTH */
1352 if (!header_in_data
)
1353 reth
= &ohdr
->u
.rc
.reth
;
1355 reth
= (struct ib_reth
*)data
;
1356 data
+= sizeof(*reth
);
1358 qp
->s_rdma_len
= be32_to_cpu(reth
->length
);
1359 if (qp
->s_rdma_len
!= 0) {
1360 u32 rkey
= be32_to_cpu(reth
->rkey
);
1361 u64 vaddr
= be64_to_cpu(reth
->vaddr
);
1365 * Address range must be a subset of the original
1366 * request and start on pmtu boundaries.
1368 ok
= ipath_rkey_ok(dev
, &qp
->s_rdma_sge
,
1369 qp
->s_rdma_len
, vaddr
, rkey
,
1370 IB_ACCESS_REMOTE_READ
);
1374 qp
->s_rdma_sge
.sg_list
= NULL
;
1375 qp
->s_rdma_sge
.num_sge
= 0;
1376 qp
->s_rdma_sge
.sge
.mr
= NULL
;
1377 qp
->s_rdma_sge
.sge
.vaddr
= NULL
;
1378 qp
->s_rdma_sge
.sge
.length
= 0;
1379 qp
->s_rdma_sge
.sge
.sge_length
= 0;
1383 case OP(COMPARE_SWAP
):
1386 * Check for the PSN of the last atomic operations
1387 * performed and resend the result if found.
1389 if ((psn
& IPS_PSN_MASK
) != qp
->r_atomic_psn
) {
1390 spin_unlock(&qp
->s_lock
);
1393 qp
->s_ack_atomic
= qp
->r_atomic_data
;
1396 qp
->s_ack_state
= opcode
;
1397 qp
->s_nak_state
= 0;
1398 qp
->s_ack_psn
= psn
;
1407 * ipath_rc_rcv - process an incoming RC packet
1408 * @dev: the device this packet came in on
1409 * @hdr: the header of this packet
1410 * @has_grh: true if the header has a GRH
1411 * @data: the packet data
1412 * @tlen: the packet length
1413 * @qp: the QP for this packet
1415 * This is called from ipath_qp_rcv() to process an incoming RC packet
1417 * Called at interrupt level.
1419 void ipath_rc_rcv(struct ipath_ibdev
*dev
, struct ipath_ib_header
*hdr
,
1420 int has_grh
, void *data
, u32 tlen
, struct ipath_qp
*qp
)
1422 struct ipath_other_headers
*ohdr
;
1427 unsigned long flags
;
1429 u32 pmtu
= ib_mtu_enum_to_int(qp
->path_mtu
);
1431 struct ib_reth
*reth
;
1437 hdrsize
= 8 + 12; /* LRH + BTH */
1438 psn
= be32_to_cpu(ohdr
->bth
[2]);
1441 ohdr
= &hdr
->u
.l
.oth
;
1442 hdrsize
= 8 + 40 + 12; /* LRH + GRH + BTH */
1444 * The header with GRH is 60 bytes and the core driver sets
1445 * the eager header buffer size to 56 bytes so the last 4
1446 * bytes of the BTH header (PSN) is in the data buffer.
1449 ipath_layer_get_rcvhdrentsize(dev
->dd
) == 16;
1450 if (header_in_data
) {
1451 psn
= be32_to_cpu(((__be32
*) data
)[0]);
1452 data
+= sizeof(__be32
);
1454 psn
= be32_to_cpu(ohdr
->bth
[2]);
1457 * The opcode is in the low byte when it's in network order
1458 * (top byte when in host order).
1460 opcode
= be32_to_cpu(ohdr
->bth
[0]) >> 24;
1463 * Process responses (ACKs) before anything else. Note that the
1464 * packet sequence number will be for something in the send work
1465 * queue rather than the expected receive packet sequence number.
1466 * In other words, this QP is the requester.
1468 if (opcode
>= OP(RDMA_READ_RESPONSE_FIRST
) &&
1469 opcode
<= OP(ATOMIC_ACKNOWLEDGE
)) {
1470 ipath_rc_rcv_resp(dev
, ohdr
, data
, tlen
, qp
, opcode
, psn
,
1471 hdrsize
, pmtu
, header_in_data
);
1475 spin_lock_irqsave(&qp
->r_rq
.lock
, flags
);
1477 /* Compute 24 bits worth of difference. */
1478 diff
= ipath_cmp24(psn
, qp
->r_psn
);
1479 if (unlikely(diff
)) {
1480 if (ipath_rc_rcv_error(dev
, ohdr
, data
, qp
, opcode
,
1481 psn
, diff
, header_in_data
))
1486 /* Check for opcode sequence errors. */
1487 switch (qp
->r_state
) {
1488 case OP(SEND_FIRST
):
1489 case OP(SEND_MIDDLE
):
1490 if (opcode
== OP(SEND_MIDDLE
) ||
1491 opcode
== OP(SEND_LAST
) ||
1492 opcode
== OP(SEND_LAST_WITH_IMMEDIATE
))
1496 * A NAK will ACK earlier sends and RDMA writes. Don't queue the
1497 * NAK if a RDMA read, atomic, or NAK is pending though.
1499 spin_lock(&qp
->s_lock
);
1500 if (qp
->s_ack_state
>= OP(RDMA_READ_REQUEST
) &&
1501 qp
->s_ack_state
!= IB_OPCODE_ACKNOWLEDGE
) {
1502 spin_unlock(&qp
->s_lock
);
1505 /* XXX Flush WQEs */
1506 qp
->state
= IB_QPS_ERR
;
1507 qp
->s_ack_state
= OP(SEND_ONLY
);
1508 qp
->s_nak_state
= IB_NAK_INVALID_REQUEST
;
1509 qp
->s_ack_psn
= qp
->r_psn
;
1512 case OP(RDMA_WRITE_FIRST
):
1513 case OP(RDMA_WRITE_MIDDLE
):
1514 if (opcode
== OP(RDMA_WRITE_MIDDLE
) ||
1515 opcode
== OP(RDMA_WRITE_LAST
) ||
1516 opcode
== OP(RDMA_WRITE_LAST_WITH_IMMEDIATE
))
1520 case OP(RDMA_READ_REQUEST
):
1521 case OP(COMPARE_SWAP
):
1524 * Drop all new requests until a response has been sent. A
1525 * new request then ACKs the RDMA response we sent. Relaxed
1526 * ordering would allow new requests to be processed but we
1527 * would need to keep a queue of rwqe's for all that are in
1528 * progress. Note that we can't RNR NAK this request since
1529 * the RDMA READ or atomic response is already queued to be
1530 * sent (unless we implement a response send queue).
1535 if (opcode
== OP(SEND_MIDDLE
) ||
1536 opcode
== OP(SEND_LAST
) ||
1537 opcode
== OP(SEND_LAST_WITH_IMMEDIATE
) ||
1538 opcode
== OP(RDMA_WRITE_MIDDLE
) ||
1539 opcode
== OP(RDMA_WRITE_LAST
) ||
1540 opcode
== OP(RDMA_WRITE_LAST_WITH_IMMEDIATE
))
1548 /* OK, process the packet. */
1550 case OP(SEND_FIRST
):
1551 if (!ipath_get_rwqe(qp
, 0)) {
1554 * A RNR NAK will ACK earlier sends and RDMA writes.
1555 * Don't queue the NAK if a RDMA read or atomic
1556 * is pending though.
1558 spin_lock(&qp
->s_lock
);
1559 if (qp
->s_ack_state
>=
1560 OP(RDMA_READ_REQUEST
) &&
1561 qp
->s_ack_state
!= IB_OPCODE_ACKNOWLEDGE
) {
1562 spin_unlock(&qp
->s_lock
);
1565 qp
->s_ack_state
= OP(SEND_ONLY
);
1566 qp
->s_nak_state
= IB_RNR_NAK
| qp
->s_min_rnr_timer
;
1567 qp
->s_ack_psn
= qp
->r_psn
;
1572 case OP(SEND_MIDDLE
):
1573 case OP(RDMA_WRITE_MIDDLE
):
1575 /* Check for invalid length PMTU or posted rwqe len. */
1576 if (unlikely(tlen
!= (hdrsize
+ pmtu
+ 4)))
1578 qp
->r_rcv_len
+= pmtu
;
1579 if (unlikely(qp
->r_rcv_len
> qp
->r_len
))
1581 ipath_copy_sge(&qp
->r_sge
, data
, pmtu
);
1584 case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE
):
1586 if (!ipath_get_rwqe(qp
, 1))
1591 case OP(SEND_ONLY_WITH_IMMEDIATE
):
1592 if (!ipath_get_rwqe(qp
, 0))
1595 if (opcode
== OP(SEND_ONLY
))
1598 case OP(SEND_LAST_WITH_IMMEDIATE
):
1600 if (header_in_data
) {
1601 wc
.imm_data
= *(__be32
*) data
;
1602 data
+= sizeof(__be32
);
1604 /* Immediate data comes after BTH */
1605 wc
.imm_data
= ohdr
->u
.imm_data
;
1608 wc
.wc_flags
= IB_WC_WITH_IMM
;
1611 case OP(RDMA_WRITE_LAST
):
1613 /* Get the number of bytes the message was padded by. */
1614 pad
= (be32_to_cpu(ohdr
->bth
[0]) >> 20) & 3;
1615 /* Check for invalid length. */
1616 /* XXX LAST len should be >= 1 */
1617 if (unlikely(tlen
< (hdrsize
+ pad
+ 4)))
1619 /* Don't count the CRC. */
1620 tlen
-= (hdrsize
+ pad
+ 4);
1621 wc
.byte_len
= tlen
+ qp
->r_rcv_len
;
1622 if (unlikely(wc
.byte_len
> qp
->r_len
))
1624 ipath_copy_sge(&qp
->r_sge
, data
, tlen
);
1625 atomic_inc(&qp
->msn
);
1626 if (opcode
== OP(RDMA_WRITE_LAST
) ||
1627 opcode
== OP(RDMA_WRITE_ONLY
))
1629 wc
.wr_id
= qp
->r_wr_id
;
1630 wc
.status
= IB_WC_SUCCESS
;
1631 wc
.opcode
= IB_WC_RECV
;
1633 wc
.qp_num
= qp
->ibqp
.qp_num
;
1634 wc
.src_qp
= qp
->remote_qpn
;
1636 wc
.slid
= qp
->remote_ah_attr
.dlid
;
1637 wc
.sl
= qp
->remote_ah_attr
.sl
;
1638 wc
.dlid_path_bits
= 0;
1640 /* Signal completion event if the solicited bit is set. */
1641 ipath_cq_enter(to_icq(qp
->ibqp
.recv_cq
), &wc
,
1643 __constant_cpu_to_be32(1 << 23)) != 0);
1646 case OP(RDMA_WRITE_FIRST
):
1647 case OP(RDMA_WRITE_ONLY
):
1648 case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE
):
1650 /* RETH comes after BTH */
1651 if (!header_in_data
)
1652 reth
= &ohdr
->u
.rc
.reth
;
1654 reth
= (struct ib_reth
*)data
;
1655 data
+= sizeof(*reth
);
1657 hdrsize
+= sizeof(*reth
);
1658 qp
->r_len
= be32_to_cpu(reth
->length
);
1660 if (qp
->r_len
!= 0) {
1661 u32 rkey
= be32_to_cpu(reth
->rkey
);
1662 u64 vaddr
= be64_to_cpu(reth
->vaddr
);
1665 /* Check rkey & NAK */
1666 ok
= ipath_rkey_ok(dev
, &qp
->r_sge
,
1667 qp
->r_len
, vaddr
, rkey
,
1668 IB_ACCESS_REMOTE_WRITE
);
1669 if (unlikely(!ok
)) {
1672 * A NAK will ACK earlier sends and RDMA
1673 * writes. Don't queue the NAK if a RDMA
1674 * read, atomic, or NAK is pending though.
1676 spin_lock(&qp
->s_lock
);
1677 if (qp
->s_ack_state
>=
1678 OP(RDMA_READ_REQUEST
) &&
1680 IB_OPCODE_ACKNOWLEDGE
) {
1681 spin_unlock(&qp
->s_lock
);
1684 /* XXX Flush WQEs */
1685 qp
->state
= IB_QPS_ERR
;
1686 qp
->s_ack_state
= OP(RDMA_WRITE_ONLY
);
1688 IB_NAK_REMOTE_ACCESS_ERROR
;
1689 qp
->s_ack_psn
= qp
->r_psn
;
1693 qp
->r_sge
.sg_list
= NULL
;
1694 qp
->r_sge
.sge
.mr
= NULL
;
1695 qp
->r_sge
.sge
.vaddr
= NULL
;
1696 qp
->r_sge
.sge
.length
= 0;
1697 qp
->r_sge
.sge
.sge_length
= 0;
1699 if (unlikely(!(qp
->qp_access_flags
&
1700 IB_ACCESS_REMOTE_WRITE
)))
1702 if (opcode
== OP(RDMA_WRITE_FIRST
))
1704 else if (opcode
== OP(RDMA_WRITE_ONLY
))
1706 if (!ipath_get_rwqe(qp
, 1))
1710 case OP(RDMA_READ_REQUEST
):
1711 /* RETH comes after BTH */
1712 if (!header_in_data
)
1713 reth
= &ohdr
->u
.rc
.reth
;
1715 reth
= (struct ib_reth
*)data
;
1716 data
+= sizeof(*reth
);
1718 spin_lock(&qp
->s_lock
);
1719 if (qp
->s_ack_state
!= OP(ACKNOWLEDGE
) &&
1720 qp
->s_ack_state
>= IB_OPCODE_RDMA_READ_REQUEST
) {
1721 spin_unlock(&qp
->s_lock
);
1724 qp
->s_rdma_len
= be32_to_cpu(reth
->length
);
1725 if (qp
->s_rdma_len
!= 0) {
1726 u32 rkey
= be32_to_cpu(reth
->rkey
);
1727 u64 vaddr
= be64_to_cpu(reth
->vaddr
);
1730 /* Check rkey & NAK */
1731 ok
= ipath_rkey_ok(dev
, &qp
->s_rdma_sge
,
1732 qp
->s_rdma_len
, vaddr
, rkey
,
1733 IB_ACCESS_REMOTE_READ
);
1734 if (unlikely(!ok
)) {
1735 spin_unlock(&qp
->s_lock
);
1739 * Update the next expected PSN. We add 1 later
1740 * below, so only add the remainder here.
1742 if (qp
->s_rdma_len
> pmtu
)
1743 qp
->r_psn
+= (qp
->s_rdma_len
- 1) / pmtu
;
1745 qp
->s_rdma_sge
.sg_list
= NULL
;
1746 qp
->s_rdma_sge
.num_sge
= 0;
1747 qp
->s_rdma_sge
.sge
.mr
= NULL
;
1748 qp
->s_rdma_sge
.sge
.vaddr
= NULL
;
1749 qp
->s_rdma_sge
.sge
.length
= 0;
1750 qp
->s_rdma_sge
.sge
.sge_length
= 0;
1752 if (unlikely(!(qp
->qp_access_flags
&
1753 IB_ACCESS_REMOTE_READ
)))
1756 * We need to increment the MSN here instead of when we
1757 * finish sending the result since a duplicate request would
1758 * increment it more than once.
1760 atomic_inc(&qp
->msn
);
1761 qp
->s_ack_state
= opcode
;
1762 qp
->s_nak_state
= 0;
1763 qp
->s_ack_psn
= psn
;
1765 qp
->r_state
= opcode
;
1768 case OP(COMPARE_SWAP
):
1769 case OP(FETCH_ADD
): {
1770 struct ib_atomic_eth
*ateth
;
1775 if (!header_in_data
)
1776 ateth
= &ohdr
->u
.atomic_eth
;
1778 ateth
= (struct ib_atomic_eth
*)data
;
1779 data
+= sizeof(*ateth
);
1781 vaddr
= be64_to_cpu(ateth
->vaddr
);
1782 if (unlikely(vaddr
& (sizeof(u64
) - 1)))
1784 rkey
= be32_to_cpu(ateth
->rkey
);
1785 /* Check rkey & NAK */
1786 if (unlikely(!ipath_rkey_ok(dev
, &qp
->r_sge
,
1787 sizeof(u64
), vaddr
, rkey
,
1788 IB_ACCESS_REMOTE_ATOMIC
)))
1790 if (unlikely(!(qp
->qp_access_flags
&
1791 IB_ACCESS_REMOTE_ATOMIC
)))
1793 /* Perform atomic OP and save result. */
1794 sdata
= be64_to_cpu(ateth
->swap_data
);
1795 spin_lock(&dev
->pending_lock
);
1796 qp
->r_atomic_data
= *(u64
*) qp
->r_sge
.sge
.vaddr
;
1797 if (opcode
== OP(FETCH_ADD
))
1798 *(u64
*) qp
->r_sge
.sge
.vaddr
=
1799 qp
->r_atomic_data
+ sdata
;
1800 else if (qp
->r_atomic_data
==
1801 be64_to_cpu(ateth
->compare_data
))
1802 *(u64
*) qp
->r_sge
.sge
.vaddr
= sdata
;
1803 spin_unlock(&dev
->pending_lock
);
1804 atomic_inc(&qp
->msn
);
1805 qp
->r_atomic_psn
= psn
& IPS_PSN_MASK
;
1811 /* Drop packet for unknown opcodes. */
1815 qp
->r_state
= opcode
;
1816 /* Send an ACK if requested or required. */
1817 if (psn
& (1 << 31)) {
1819 * Coalesce ACKs unless there is a RDMA READ or
1822 spin_lock(&qp
->s_lock
);
1823 if (qp
->s_ack_state
== OP(ACKNOWLEDGE
) ||
1824 qp
->s_ack_state
< IB_OPCODE_RDMA_READ_REQUEST
) {
1825 qp
->s_ack_state
= opcode
;
1826 qp
->s_nak_state
= 0;
1827 qp
->s_ack_psn
= psn
;
1828 qp
->s_ack_atomic
= qp
->r_atomic_data
;
1831 spin_unlock(&qp
->s_lock
);
1834 spin_unlock_irqrestore(&qp
->r_rq
.lock
, flags
);
1839 * Try to send ACK right away but not if ipath_do_rc_send() is
1842 if (qp
->s_hdrwords
== 0 &&
1843 (qp
->s_ack_state
< IB_OPCODE_RDMA_READ_REQUEST
||
1844 qp
->s_ack_state
>= IB_OPCODE_COMPARE_SWAP
))
1848 spin_unlock(&qp
->s_lock
);
1849 spin_unlock_irqrestore(&qp
->r_rq
.lock
, flags
);
1851 /* Call ipath_do_rc_send() in another thread. */
1852 tasklet_hi_schedule(&qp
->s_task
);