/*
 * Copyright (c) 2006 QLogic, Inc. All rights reserved.
 * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "ipath_verbs.h"
#include "ipath_kernel.h"

/*
 * Convert the AETH RNR timeout code into the number of milliseconds.
 */
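/*
 * The 5-bit RNR NAK timer code indexes this table; ipath_ruc_loopback()
 * below uses ib_ipath_rnr_table[qp->r_min_rnr_timer] to arm the requester's
 * RNR timeout when the destination QP has no receive WQE available.
 */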
const u32 ib_ipath_rnr_table[32] = {

/**
 * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device
 * @qp: the QP
 *
 * XXX Use a simple list for now.  We might need a priority
 * queue if we have lots of QPs waiting for RNR timeouts
 * but that should be rare.
 */
void ipath_insert_rnr_queue(struct ipath_qp *qp)
{
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	unsigned long flags;

	spin_lock_irqsave(&dev->pending_lock, flags);
	if (list_empty(&dev->rnrwait))
		list_add(&qp->timerwait, &dev->rnrwait);
	else {
		struct list_head *l = &dev->rnrwait;
		struct ipath_qp *nqp = list_entry(l->next, struct ipath_qp,
						  timerwait);
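		/*
		 * Entries on dev->rnrwait store their timeout relative to
		 * the entries ahead of them, so only the head of the list
		 * needs to be aged; convert qp's timeout to that form while
		 * walking to the insertion point.
		 */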
		while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) {
			qp->s_rnr_timeout -= nqp->s_rnr_timeout;
			l = l->next;
			if (l->next == &dev->rnrwait)
				break;
			nqp = list_entry(l->next, struct ipath_qp,
					 timerwait);
		}
		list_add(&qp->timerwait, l);
	}
	spin_unlock_irqrestore(&dev->pending_lock, flags);
}

static int init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe)
{
	int user = to_ipd(qp->ibqp.pd)->user;
	int i, j, ret;
	struct ib_wc wc;

	qp->r_len = 0;
	for (i = j = 0; i < wqe->num_sge; i++) {
		if (wqe->sg_list[i].length == 0)
			continue;
		if ((user && wqe->sg_list[i].lkey == 0) ||
		    !ipath_lkey_ok(qp, &qp->r_sg_list[j], &wqe->sg_list[i],
				   IB_ACCESS_LOCAL_WRITE))
			goto bad_lkey;
		qp->r_len += wqe->sg_list[i].length;
		j++;
	}
	qp->r_sge.sge = qp->r_sg_list[0];
	qp->r_sge.sg_list = qp->r_sg_list + 1;
	qp->r_sge.num_sge = j;
	ret = 1;
	goto bail;

bad_lkey:
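	/*
	 * A receive SGE with a bad LKEY still consumes the RWQE: it is
	 * completed in error with IB_WC_LOC_PROT_ERR so the consumer sees
	 * it, and 0 is returned so ipath_get_rwqe() moves on to the next
	 * RWQE.
	 */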
	wc.wr_id = wqe->wr_id;
	wc.status = IB_WC_LOC_PROT_ERR;
	wc.opcode = IB_WC_RECV;
	wc.dlid_path_bits = 0;
	/* Signal solicited completion event. */
	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
	ret = 0;
bail:
	return ret;
}

/**
 * ipath_get_rwqe - copy the next RWQE into the QP's RWQE
 * @qp: the QP
 * @wr_id_only: update wr_id only, not SGEs
 *
 * Return 0 if no RWQE is available, otherwise return 1.
 *
 * Can be called from interrupt level.
 */
int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
{
	unsigned long flags;
	struct ipath_rq *rq;
	struct ipath_rwq *wq;
	struct ipath_srq *srq;
	struct ipath_rwqe *wqe;
	void (*handler)(struct ib_event *, void *);
	u32 tail;
	int ret;

	if (qp->ibqp.srq) {
		srq = to_isrq(qp->ibqp.srq);
		handler = srq->ibsrq.event_handler;
		rq = &srq->rq;
	} else {
		srq = NULL;
		handler = NULL;
		rq = &qp->r_rq;
	}

	spin_lock_irqsave(&rq->lock, flags);
	wq = rq->wq;
	tail = wq->tail;
	/* Validate tail before using it since it is user writable. */
	if (tail >= rq->size)
		tail = 0;
	do {
		if (unlikely(tail == wq->head)) {
			spin_unlock_irqrestore(&rq->lock, flags);
			ret = 0;
			goto bail;
		}
		wqe = get_rwqe_ptr(rq, tail);
		if (++tail >= rq->size)
			tail = 0;
	} while (!wr_id_only && !init_sge(qp, wqe));
	qp->r_wr_id = wqe->wr_id;
	wq->tail = tail;

	ret = 1;
	qp->r_wrid_valid = 1;
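	/*
	 * If this RWQE came from an SRQ with an armed limit, count how
	 * many RWQEs remain and, if we have dropped below the limit,
	 * disarm it and deliver IB_EVENT_SRQ_LIMIT_REACHED.
	 */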
	if (handler) {
		u32 n;

		/*
		 * validate head pointer value and compute
		 * the number of remaining WQEs.
		 */
		n = wq->head;
		if (n >= rq->size)
			n = 0;
		if (n < tail)
			n += rq->size - tail;
		else
			n -= tail;
		if (n < srq->limit) {
			struct ib_event ev;

			srq->limit = 0;
			spin_unlock_irqrestore(&rq->lock, flags);
			ev.device = qp->ibqp.device;
			ev.element.srq = qp->ibqp.srq;
			ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
			handler(&ev, srq->ibsrq.srq_context);
			goto bail;
		}
	}
	spin_unlock_irqrestore(&rq->lock, flags);

bail:
	return ret;
}

/**
 * ipath_ruc_loopback - handle UC and RC loopback requests
 * @sqp: the loopback QP
 *
 * This is called from ipath_do_ruc_send() to
 * forward a WQE addressed to the same HCA.
 * Note that although we are single threaded due to the tasklet, we still
 * have to protect against post_send().  We don't have to worry about
 * receive interrupts since this is a connected protocol and all packets
 * will pass through here.
 */
static void ipath_ruc_loopback(struct ipath_qp *sqp)
{
	struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
	struct ipath_qp *qp;
	struct ipath_swqe *wqe;
	struct ipath_sge *sge;
	unsigned long flags;
	struct ib_wc wc;
	u64 sdata;
	atomic64_t *maddr;

	qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn);
	if (!qp)
		return;

again:
	spin_lock_irqsave(&sqp->s_lock, flags);

	if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_SEND_OK) ||
	    sqp->s_rnr_timeout) {
		spin_unlock_irqrestore(&sqp->s_lock, flags);
		goto done;
	}

	/* Get the next send request. */
	if (sqp->s_last == sqp->s_head) {
		/* Send work queue is empty. */
		spin_unlock_irqrestore(&sqp->s_lock, flags);
		goto done;
	}

	/*
	 * We can rely on the entry not changing without the s_lock
	 * being held until we update s_last.
	 */
	wqe = get_swqe_ptr(sqp, sqp->s_last);
	spin_unlock_irqrestore(&sqp->s_lock, flags);

	wc.wc_flags = 0;
	wc.imm_data = 0;

	sqp->s_sge.sge = wqe->sg_list[0];
	sqp->s_sge.sg_list = wqe->sg_list + 1;
	sqp->s_sge.num_sge = wqe->wr.num_sge;
	sqp->s_len = wqe->length;
	switch (wqe->wr.opcode) {
	case IB_WR_SEND_WITH_IMM:
		wc.wc_flags = IB_WC_WITH_IMM;
		wc.imm_data = wqe->wr.imm_data;
		/* FALLTHROUGH */
	case IB_WR_SEND:
		if (!ipath_get_rwqe(qp, 0)) {
		rnr_nak:
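			/*
			 * No receive WQE is available at the destination,
			 * so act as if an RNR NAK had been received:
			 * decrement the retry count (unless it is 7, i.e.
			 * infinite) and arm the RNR timer before giving up
			 * the send for now.
			 */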
			if (qp->ibqp.qp_type == IB_QPT_UC)
				goto send_comp;
			if (sqp->s_rnr_retry == 0) {
				wc.status = IB_WC_RNR_RETRY_EXC_ERR;
				goto err;
			}
			if (sqp->s_rnr_retry_cnt < 7)
				sqp->s_rnr_retry--;
			sqp->s_rnr_timeout =
				ib_ipath_rnr_table[qp->r_min_rnr_timer];
			ipath_insert_rnr_queue(sqp);
			goto done;
		}
		break;

	case IB_WR_RDMA_WRITE_WITH_IMM:
		wc.wc_flags = IB_WC_WITH_IMM;
		wc.imm_data = wqe->wr.imm_data;
		if (!ipath_get_rwqe(qp, 1))
			goto rnr_nak;
		/* FALLTHROUGH */
	case IB_WR_RDMA_WRITE:
		if (wqe->length == 0)
			break;
		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length,
					    wqe->wr.wr.rdma.remote_addr,
					    wqe->wr.wr.rdma.rkey,
					    IB_ACCESS_REMOTE_WRITE))) {
		acc_err:
			wc.status = IB_WC_REM_ACCESS_ERR;
		err:
			wc.wr_id = wqe->wr.wr_id;
			wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
			wc.src_qp = sqp->remote_qpn;
			wc.slid = sqp->remote_ah_attr.dlid;
			wc.sl = sqp->remote_ah_attr.sl;
			wc.dlid_path_bits = 0;
			spin_lock_irqsave(&sqp->s_lock, flags);
			ipath_sqerror_qp(sqp, &wc);
			spin_unlock_irqrestore(&sqp->s_lock, flags);
			goto done;
		}
		break;

	case IB_WR_RDMA_READ:
		if (unlikely(!(qp->qp_access_flags &
			       IB_ACCESS_REMOTE_READ)))
			goto acc_err;
		if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length,
					    wqe->wr.wr.rdma.remote_addr,
					    wqe->wr.wr.rdma.rkey,
					    IB_ACCESS_REMOTE_READ)))
			goto acc_err;
		qp->r_sge.sge = wqe->sg_list[0];
		qp->r_sge.sg_list = wqe->sg_list + 1;
		qp->r_sge.num_sge = wqe->wr.num_sge;
		break;

	case IB_WR_ATOMIC_CMP_AND_SWP:
	case IB_WR_ATOMIC_FETCH_AND_ADD:
		if (unlikely(!(qp->qp_access_flags &
			       IB_ACCESS_REMOTE_ATOMIC)))
			goto acc_err;
		if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64),
					    wqe->wr.wr.atomic.remote_addr,
					    wqe->wr.wr.atomic.rkey,
					    IB_ACCESS_REMOTE_ATOMIC)))
			goto acc_err;
		/* Perform atomic OP and save result. */
		maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
		sdata = wqe->wr.wr.atomic.compare_add;
		*(u64 *) sqp->s_sge.sge.vaddr =
			(wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
			(u64) atomic64_add_return(sdata, maddr) - sdata :
			(u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
				      sdata, wqe->wr.wr.atomic.swap);
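		/*
		 * The prior value of the target location was stored into
		 * the requester's first SGE above; that is the same data a
		 * remote responder would have returned in its atomic ACK.
		 */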
		goto send_comp;

	default:
		goto done;
	}

	sge = &sqp->s_sge.sge;
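	/*
	 * Copy the payload from the requester's SG list into the
	 * destination QP's r_sge, walking memory-region segments as each
	 * SGE is consumed.
	 */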
	while (sqp->s_len) {
		u32 len = sqp->s_len;

		if (len > sge->length)
			len = sge->length;
		ipath_copy_sge(&qp->r_sge, sge->vaddr, len);
		sge->vaddr += len;
		sge->length -= len;
		sge->sge_length -= len;
		if (sge->sge_length == 0) {
			if (--sqp->s_sge.num_sge)
				*sge = *sqp->s_sge.sg_list++;
		} else if (sge->length == 0 && sge->mr != NULL) {
			if (++sge->n >= IPATH_SEGSZ) {
				if (++sge->m >= sge->mr->mapsz)
					break;
				sge->n = 0;
			}
			sge->vaddr =
				sge->mr->map[sge->m]->segs[sge->n].vaddr;
			sge->length =
				sge->mr->map[sge->m]->segs[sge->n].length;
		}
		sqp->s_len -= len;
	}

	if (wqe->wr.opcode == IB_WR_RDMA_WRITE ||
	    wqe->wr.opcode == IB_WR_RDMA_READ)
		goto send_comp;

	if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
	else
		wc.opcode = IB_WC_RECV;
	wc.wr_id = qp->r_wr_id;
	wc.status = IB_WC_SUCCESS;
	wc.byte_len = wqe->length;
	wc.src_qp = qp->remote_qpn;
	/* XXX do we know which pkey matched? Only needed for GSI. */
	wc.slid = qp->remote_ah_attr.dlid;
	wc.sl = qp->remote_ah_attr.sl;
	wc.dlid_path_bits = 0;
	/* Signal completion event if the solicited bit is set. */
	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
		       wqe->wr.send_flags & IB_SEND_SOLICITED);

send_comp:
	sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;

	if (!(sqp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
	    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
		wc.wr_id = wqe->wr.wr_id;
		wc.status = IB_WC_SUCCESS;
		wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
		wc.byte_len = wqe->length;
		wc.dlid_path_bits = 0;
		ipath_cq_enter(to_icq(sqp->ibqp.send_cq), &wc, 0);
	}

	/* Update s_last now that we are finished with the SWQE */
	spin_lock_irqsave(&sqp->s_lock, flags);
	if (++sqp->s_last >= sqp->s_size)
		sqp->s_last = 0;
	spin_unlock_irqrestore(&sqp->s_lock, flags);
	goto again;

done:
	if (atomic_dec_and_test(&qp->refcount))
		wake_up(&qp->wait);
}
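
/*
 * Ask the hardware to interrupt us when a PIO send buffer becomes
 * available by turning on the buffer-available interrupt bit in the
 * send control register.
 */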
static int want_buffer(struct ipath_devdata *dd)
{
	set_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl);
	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
			 dd->ipath_sendctrl);

	return 0;
}

/**
 * ipath_no_bufs_available - tell the layer driver we need buffers
 * @qp: the QP that caused the problem
 * @dev: the device we ran out of buffers on
 *
 * Called when we run out of PIO buffers.
 */
void ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev)
{
	unsigned long flags;

	spin_lock_irqsave(&dev->pending_lock, flags);
	if (list_empty(&qp->piowait))
		list_add_tail(&qp->piowait, &dev->piowait);
	spin_unlock_irqrestore(&dev->pending_lock, flags);
	/*
	 * Note that as soon as want_buffer() is called and
	 * possibly before it returns, ipath_ib_piobufavail()
	 * could be called.  If we are still in the tasklet function,
	 * tasklet_hi_schedule() will not call us until the next time
	 * tasklet_hi_schedule() is called.
	 * We clear the tasklet flag now since we are committing to return
	 * from the tasklet function.
	 */
	clear_bit(IPATH_S_BUSY, &qp->s_busy);
	tasklet_unlock(&qp->s_task);
	want_buffer(dev->dd);
}

/**
 * ipath_post_ruc_send - post RC and UC sends
 * @qp: the QP to post on
 * @wr: the work request to send
 */
int ipath_post_ruc_send(struct ipath_qp *qp, struct ib_send_wr *wr)
{
	struct ipath_swqe *wqe;
	unsigned long flags;
	u32 next;
	int i, j;
	int acc;
	int ret;

	/*
	 * Don't allow RDMA reads or atomic operations on UC or
	 * undefined operations.
	 * Make sure buffer is large enough to hold the result for atomics.
	 */
	if (qp->ibqp.qp_type == IB_QPT_UC) {
		if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) {
			ret = -EINVAL;
			goto bail;
		}
	} else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) {
		ret = -EINVAL;
		goto bail;
	} else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
		   (wr->num_sge == 0 ||
		    wr->sg_list[0].length < sizeof(u64) ||
		    wr->sg_list[0].addr & (sizeof(u64) - 1))) {
		ret = -EINVAL;
		goto bail;
	} else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) {
		ret = -EINVAL;
		goto bail;
	}
	/* IB spec says that num_sge == 0 is OK. */
	if (wr->num_sge > qp->s_max_sge) {
		ret = -EINVAL;
		goto bail;
	}
	spin_lock_irqsave(&qp->s_lock, flags);
	next = qp->s_head + 1;
	if (next >= qp->s_size)
		next = 0;
	if (next == qp->s_last) {
		spin_unlock_irqrestore(&qp->s_lock, flags);
		ret = -EINVAL;
		goto bail;
	}

	wqe = get_swqe_ptr(qp, qp->s_head);
	wqe->wr = *wr;
	wqe->ssn = qp->s_ssn++;
	wqe->sg_list[0].mr = NULL;
	wqe->sg_list[0].vaddr = NULL;
	wqe->sg_list[0].length = 0;
	wqe->sg_list[0].sge_length = 0;
	wqe->length = 0;
	acc = wr->opcode >= IB_WR_RDMA_READ ? IB_ACCESS_LOCAL_WRITE : 0;
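	/*
	 * RDMA reads and atomics return data into the sender's local
	 * buffers, so those SGEs are validated for local-write access;
	 * sends and RDMA writes only read from local memory.
	 */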
	for (i = 0, j = 0; i < wr->num_sge; i++) {
		if (to_ipd(qp->ibqp.pd)->user && wr->sg_list[i].lkey == 0) {
			spin_unlock_irqrestore(&qp->s_lock, flags);
			ret = -EINVAL;
			goto bail;
		}
		if (wr->sg_list[i].length == 0)
			continue;
		if (!ipath_lkey_ok(qp, &wqe->sg_list[j], &wr->sg_list[i],
				   acc)) {
			spin_unlock_irqrestore(&qp->s_lock, flags);
			ret = -EINVAL;
			goto bail;
		}
		wqe->length += wr->sg_list[i].length;
		j++;
	}
	wqe->wr.num_sge = j;
	qp->s_head = next;
	spin_unlock_irqrestore(&qp->s_lock, flags);

	ipath_do_ruc_send((unsigned long) qp);

	ret = 0;

bail:
	return ret;
}

/**
 * ipath_make_grh - construct a GRH header
 * @dev: a pointer to the ipath device
 * @hdr: a pointer to the GRH header being constructed
 * @grh: the global route address to send to
 * @hwords: the number of 32 bit words of header being sent
 * @nwords: the number of 32 bit words of data being sent
 *
 * Return the size of the header in 32 bit words.
 */
u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
		   struct ib_global_route *grh, u32 hwords, u32 nwords)
{
	hdr->version_tclass_flow =
		cpu_to_be32((6 << 28) |
			    (grh->traffic_class << 20) |
			    grh->flow_label);
	hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
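	/*
	 * PayLen covers everything after the GRH, in bytes: hwords here
	 * includes the 2-word LRH but not the GRH itself, so the BTH plus
	 * any extension headers (hwords - 2), the padded payload and the
	 * ICRC are summed in 32-bit words and then scaled by 4.
	 */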
	/* next_hdr is defined by C8-7 in ch. 8.4.1 */
	hdr->next_hdr = 0x1B;
	hdr->hop_limit = grh->hop_limit;
	/* The SGID is 32-bit aligned. */
	hdr->sgid.global.subnet_prefix = dev->gid_prefix;
	hdr->sgid.global.interface_id = dev->dd->ipath_guid;
	hdr->dgid = grh->dgid;

	/* GRH header size in 32-bit words. */
	return sizeof(struct ib_grh) / sizeof(u32);
}

/**
 * ipath_do_ruc_send - perform a send on an RC or UC QP
 * @data: contains a pointer to the QP
 *
 * Process entries in the send work queue until credit or queue is
 * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
 * Otherwise, after we drop the QP s_lock, two threads could send
 * packets out of order.
 */
void ipath_do_ruc_send(unsigned long data)
{
	struct ipath_qp *qp = (struct ipath_qp *)data;
	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
	unsigned long flags;
	u16 lrh0;
	u32 nwords;
	u32 extra_bytes;
	u32 bth0;
	u32 bth2;
	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
	struct ipath_other_headers *ohdr;

	if (test_and_set_bit(IPATH_S_BUSY, &qp->s_busy))
		goto bail;

	if (unlikely(qp->remote_ah_attr.dlid == dev->dd->ipath_lid)) {
		ipath_ruc_loopback(qp);
		goto clear;
	}

	ohdr = &qp->s_hdr.u.oth;
	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
		ohdr = &qp->s_hdr.u.l.oth;

again:
	/* Check for a constructed packet to be sent. */
	if (qp->s_hdrwords != 0) {
		/*
		 * If no PIO bufs are available, return.  An interrupt will
		 * call ipath_ib_piobufavail() when one is available.
		 */
		if (ipath_verbs_send(dev->dd, qp->s_hdrwords,
				     (u32 *) &qp->s_hdr, qp->s_cur_size,
				     qp->s_cur_sge)) {
			ipath_no_bufs_available(qp, dev);
			goto bail;
		}
		dev->n_unicast_xmit++;
		/* Record that we sent the packet and s_hdr is empty. */
		qp->s_hdrwords = 0;
	}

	/*
	 * The lock is needed to synchronize between setting
	 * qp->s_ack_state, resend timer, and post_send().
	 */
	spin_lock_irqsave(&qp->s_lock, flags);

	if (!((qp->ibqp.qp_type == IB_QPT_RC) ?
	      ipath_make_rc_req(qp, ohdr, pmtu, &bth0, &bth2) :
	      ipath_make_uc_req(qp, ohdr, pmtu, &bth0, &bth2))) {
		/*
		 * Clear the busy bit before unlocking to avoid races with
		 * adding new work queue items and then failing to process
		 * them.
		 */
		clear_bit(IPATH_S_BUSY, &qp->s_busy);
		spin_unlock_irqrestore(&qp->s_lock, flags);
		goto bail;
	}

	spin_unlock_irqrestore(&qp->s_lock, flags);

	/* Construct the header. */
	extra_bytes = (4 - qp->s_cur_size) & 3;
	nwords = (qp->s_cur_size + extra_bytes) >> 2;
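	/*
	 * The payload is padded to a 4-byte boundary: extra_bytes becomes
	 * the BTH PadCnt field (the "extra_bytes << 20" below) and nwords
	 * is the padded payload length in 32-bit words.
	 */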
	lrh0 = IPATH_LRH_BTH;
	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
		qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
						 &qp->remote_ah_attr.grh,
						 qp->s_hdrwords, nwords);
		lrh0 = IPATH_LRH_GRH;
	}
	lrh0 |= qp->remote_ah_attr.sl << 4;
	qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
	qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
	qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords +
				       SIZE_OF_CRC);
	qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid);
	bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index);
	bth0 |= extra_bytes << 20;
	ohdr->bth[0] = cpu_to_be32(bth0);
	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
	ohdr->bth[2] = cpu_to_be32(bth2);
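	/*
	 * bth[0] carries the opcode, flags, pad count and P_Key; bth[1]
	 * the destination QPN; bth[2] the PSN (its most significant bit
	 * is the ACK-request flag for RC requests).
	 */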
	/* Check for more work to do. */
	goto again;

clear:
	clear_bit(IPATH_S_BUSY, &qp->s_busy);
bail:
	return;
}