2 * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * OpenIB.org BSD license below:
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32 * $Id: ipoib_ib.c 1386 2004-12-27 16:23:17Z roland $
35 #include <linux/delay.h>
36 #include <linux/dma-mapping.h>
42 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
43 static int data_debug_level
;
45 module_param(data_debug_level
, int, 0644);
46 MODULE_PARM_DESC(data_debug_level
,
47 "Enable data path debug tracing if > 0");
50 #define IPOIB_OP_RECV (1ul << 31)
52 static DECLARE_MUTEX(pkey_sem
);
54 struct ipoib_ah
*ipoib_create_ah(struct net_device
*dev
,
55 struct ib_pd
*pd
, struct ib_ah_attr
*attr
)
59 ah
= kmalloc(sizeof *ah
, GFP_KERNEL
);
67 ah
->ah
= ib_create_ah(pd
, attr
);
72 ipoib_dbg(netdev_priv(dev
), "Created ah %p\n", ah
->ah
);
77 void ipoib_free_ah(struct kref
*kref
)
79 struct ipoib_ah
*ah
= container_of(kref
, struct ipoib_ah
, ref
);
80 struct ipoib_dev_priv
*priv
= netdev_priv(ah
->dev
);
84 if (ah
->last_send
<= priv
->tx_tail
) {
85 ipoib_dbg(priv
, "Freeing ah %p\n", ah
->ah
);
86 ib_destroy_ah(ah
->ah
);
89 spin_lock_irqsave(&priv
->lock
, flags
);
90 list_add_tail(&ah
->list
, &priv
->dead_ahs
);
91 spin_unlock_irqrestore(&priv
->lock
, flags
);
95 static inline int ipoib_ib_receive(struct ipoib_dev_priv
*priv
,
99 struct ib_sge list
= {
101 .length
= IPOIB_BUF_SIZE
,
102 .lkey
= priv
->mr
->lkey
,
104 struct ib_recv_wr param
= {
105 .wr_id
= wr_id
| IPOIB_OP_RECV
,
109 struct ib_recv_wr
*bad_wr
;
111 return ib_post_recv(priv
->qp
, ¶m
, &bad_wr
);
114 static int ipoib_ib_post_receive(struct net_device
*dev
, int id
)
116 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
121 skb
= dev_alloc_skb(IPOIB_BUF_SIZE
+ 4);
123 ipoib_warn(priv
, "failed to allocate receive buffer\n");
125 priv
->rx_ring
[id
].skb
= NULL
;
128 skb_reserve(skb
, 4); /* 16 byte align IP header */
129 priv
->rx_ring
[id
].skb
= skb
;
130 addr
= dma_map_single(priv
->ca
->dma_device
,
131 skb
->data
, IPOIB_BUF_SIZE
,
133 pci_unmap_addr_set(&priv
->rx_ring
[id
], mapping
, addr
);
135 ret
= ipoib_ib_receive(priv
, id
, addr
);
137 ipoib_warn(priv
, "ipoib_ib_receive failed for buf %d (%d)\n",
139 dma_unmap_single(priv
->ca
->dma_device
, addr
,
140 IPOIB_BUF_SIZE
, DMA_FROM_DEVICE
);
141 dev_kfree_skb_any(skb
);
142 priv
->rx_ring
[id
].skb
= NULL
;
148 static int ipoib_ib_post_receives(struct net_device
*dev
)
150 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
153 for (i
= 0; i
< IPOIB_RX_RING_SIZE
; ++i
) {
154 if (ipoib_ib_post_receive(dev
, i
)) {
155 ipoib_warn(priv
, "ipoib_ib_post_receive failed for buf %d\n", i
);
163 static void ipoib_ib_handle_wc(struct net_device
*dev
,
166 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
167 unsigned int wr_id
= wc
->wr_id
;
169 ipoib_dbg_data(priv
, "called: id %d, op %d, status: %d\n",
170 wr_id
, wc
->opcode
, wc
->status
);
172 if (wr_id
& IPOIB_OP_RECV
) {
173 wr_id
&= ~IPOIB_OP_RECV
;
175 if (wr_id
< IPOIB_RX_RING_SIZE
) {
176 struct sk_buff
*skb
= priv
->rx_ring
[wr_id
].skb
;
178 priv
->rx_ring
[wr_id
].skb
= NULL
;
180 dma_unmap_single(priv
->ca
->dma_device
,
181 pci_unmap_addr(&priv
->rx_ring
[wr_id
],
186 if (wc
->status
!= IB_WC_SUCCESS
) {
187 if (wc
->status
!= IB_WC_WR_FLUSH_ERR
)
188 ipoib_warn(priv
, "failed recv event "
189 "(status=%d, wrid=%d vend_err %x)\n",
190 wc
->status
, wr_id
, wc
->vendor_err
);
191 dev_kfree_skb_any(skb
);
195 ipoib_dbg_data(priv
, "received %d bytes, SLID 0x%04x\n",
196 wc
->byte_len
, wc
->slid
);
198 skb_put(skb
, wc
->byte_len
);
199 skb_pull(skb
, IB_GRH_BYTES
);
201 if (wc
->slid
!= priv
->local_lid
||
202 wc
->src_qp
!= priv
->qp
->qp_num
) {
203 skb
->protocol
= ((struct ipoib_header
*) skb
->data
)->proto
;
204 skb
->mac
.raw
= skb
->data
;
205 skb_pull(skb
, IPOIB_ENCAP_LEN
);
207 dev
->last_rx
= jiffies
;
208 ++priv
->stats
.rx_packets
;
209 priv
->stats
.rx_bytes
+= skb
->len
;
212 /* XXX get correct PACKET_ type here */
213 skb
->pkt_type
= PACKET_HOST
;
216 ipoib_dbg_data(priv
, "dropping loopback packet\n");
217 dev_kfree_skb_any(skb
);
221 if (ipoib_ib_post_receive(dev
, wr_id
))
222 ipoib_warn(priv
, "ipoib_ib_post_receive failed "
223 "for buf %d\n", wr_id
);
225 ipoib_warn(priv
, "completion event with wrid %d\n",
229 struct ipoib_buf
*tx_req
;
232 if (wr_id
>= IPOIB_TX_RING_SIZE
) {
233 ipoib_warn(priv
, "completion event with wrid %d (> %d)\n",
234 wr_id
, IPOIB_TX_RING_SIZE
);
238 ipoib_dbg_data(priv
, "send complete, wrid %d\n", wr_id
);
240 tx_req
= &priv
->tx_ring
[wr_id
];
242 dma_unmap_single(priv
->ca
->dma_device
,
243 pci_unmap_addr(tx_req
, mapping
),
247 ++priv
->stats
.tx_packets
;
248 priv
->stats
.tx_bytes
+= tx_req
->skb
->len
;
250 dev_kfree_skb_any(tx_req
->skb
);
252 spin_lock_irqsave(&priv
->tx_lock
, flags
);
254 if (netif_queue_stopped(dev
) &&
255 priv
->tx_head
- priv
->tx_tail
<= IPOIB_TX_RING_SIZE
/ 2)
256 netif_wake_queue(dev
);
257 spin_unlock_irqrestore(&priv
->tx_lock
, flags
);
259 if (wc
->status
!= IB_WC_SUCCESS
&&
260 wc
->status
!= IB_WC_WR_FLUSH_ERR
)
261 ipoib_warn(priv
, "failed send event "
262 "(status=%d, wrid=%d vend_err %x)\n",
263 wc
->status
, wr_id
, wc
->vendor_err
);
267 void ipoib_ib_completion(struct ib_cq
*cq
, void *dev_ptr
)
269 struct net_device
*dev
= (struct net_device
*) dev_ptr
;
270 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
273 ib_req_notify_cq(cq
, IB_CQ_NEXT_COMP
);
275 n
= ib_poll_cq(cq
, IPOIB_NUM_WC
, priv
->ibwc
);
276 for (i
= 0; i
< n
; ++i
)
277 ipoib_ib_handle_wc(dev
, priv
->ibwc
+ i
);
278 } while (n
== IPOIB_NUM_WC
);
281 static inline int post_send(struct ipoib_dev_priv
*priv
,
283 struct ib_ah
*address
, u32 qpn
,
284 dma_addr_t addr
, int len
)
286 struct ib_send_wr
*bad_wr
;
288 priv
->tx_sge
.addr
= addr
;
289 priv
->tx_sge
.length
= len
;
291 priv
->tx_wr
.wr_id
= wr_id
;
292 priv
->tx_wr
.wr
.ud
.remote_qpn
= qpn
;
293 priv
->tx_wr
.wr
.ud
.ah
= address
;
295 return ib_post_send(priv
->qp
, &priv
->tx_wr
, &bad_wr
);
298 void ipoib_send(struct net_device
*dev
, struct sk_buff
*skb
,
299 struct ipoib_ah
*address
, u32 qpn
)
301 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
302 struct ipoib_buf
*tx_req
;
305 if (skb
->len
> dev
->mtu
+ INFINIBAND_ALEN
) {
306 ipoib_warn(priv
, "packet len %d (> %d) too long to send, dropping\n",
307 skb
->len
, dev
->mtu
+ INFINIBAND_ALEN
);
308 ++priv
->stats
.tx_dropped
;
309 ++priv
->stats
.tx_errors
;
310 dev_kfree_skb_any(skb
);
314 ipoib_dbg_data(priv
, "sending packet, length=%d address=%p qpn=0x%06x\n",
315 skb
->len
, address
, qpn
);
318 * We put the skb into the tx_ring _before_ we call post_send()
319 * because it's entirely possible that the completion handler will
320 * run before we execute anything after the post_send(). That
321 * means we have to make sure everything is properly recorded and
322 * our state is consistent before we call post_send().
324 tx_req
= &priv
->tx_ring
[priv
->tx_head
& (IPOIB_TX_RING_SIZE
- 1)];
326 addr
= dma_map_single(priv
->ca
->dma_device
, skb
->data
, skb
->len
,
328 pci_unmap_addr_set(tx_req
, mapping
, addr
);
330 if (unlikely(post_send(priv
, priv
->tx_head
& (IPOIB_TX_RING_SIZE
- 1),
331 address
->ah
, qpn
, addr
, skb
->len
))) {
332 ipoib_warn(priv
, "post_send failed\n");
333 ++priv
->stats
.tx_errors
;
334 dma_unmap_single(priv
->ca
->dma_device
, addr
, skb
->len
,
336 dev_kfree_skb_any(skb
);
338 dev
->trans_start
= jiffies
;
340 address
->last_send
= priv
->tx_head
;
343 if (priv
->tx_head
- priv
->tx_tail
== IPOIB_TX_RING_SIZE
) {
344 ipoib_dbg(priv
, "TX ring full, stopping kernel net queue\n");
345 netif_stop_queue(dev
);
350 static void __ipoib_reap_ah(struct net_device
*dev
)
352 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
353 struct ipoib_ah
*ah
, *tah
;
354 LIST_HEAD(remove_list
);
356 spin_lock_irq(&priv
->lock
);
357 list_for_each_entry_safe(ah
, tah
, &priv
->dead_ahs
, list
)
358 if (ah
->last_send
<= priv
->tx_tail
) {
360 list_add_tail(&ah
->list
, &remove_list
);
362 spin_unlock_irq(&priv
->lock
);
364 list_for_each_entry_safe(ah
, tah
, &remove_list
, list
) {
365 ipoib_dbg(priv
, "Reaping ah %p\n", ah
->ah
);
366 ib_destroy_ah(ah
->ah
);
371 void ipoib_reap_ah(void *dev_ptr
)
373 struct net_device
*dev
= dev_ptr
;
374 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
376 __ipoib_reap_ah(dev
);
378 if (!test_bit(IPOIB_STOP_REAPER
, &priv
->flags
))
379 queue_delayed_work(ipoib_workqueue
, &priv
->ah_reap_task
, HZ
);
382 int ipoib_ib_dev_open(struct net_device
*dev
)
384 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
387 ret
= ipoib_qp_create(dev
);
389 ipoib_warn(priv
, "ipoib_qp_create returned %d\n", ret
);
393 ret
= ipoib_ib_post_receives(dev
);
395 ipoib_warn(priv
, "ipoib_ib_post_receives returned %d\n", ret
);
399 clear_bit(IPOIB_STOP_REAPER
, &priv
->flags
);
400 queue_delayed_work(ipoib_workqueue
, &priv
->ah_reap_task
, HZ
);
405 int ipoib_ib_dev_up(struct net_device
*dev
)
407 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
409 set_bit(IPOIB_FLAG_OPER_UP
, &priv
->flags
);
411 return ipoib_mcast_start_thread(dev
);
414 int ipoib_ib_dev_down(struct net_device
*dev
)
416 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
418 ipoib_dbg(priv
, "downing ib_dev\n");
420 clear_bit(IPOIB_FLAG_OPER_UP
, &priv
->flags
);
421 netif_carrier_off(dev
);
423 /* Shutdown the P_Key thread if still active */
424 if (!test_bit(IPOIB_PKEY_ASSIGNED
, &priv
->flags
)) {
426 set_bit(IPOIB_PKEY_STOP
, &priv
->flags
);
427 cancel_delayed_work(&priv
->pkey_task
);
429 flush_workqueue(ipoib_workqueue
);
432 ipoib_mcast_stop_thread(dev
);
435 * Flush the multicast groups first so we stop any multicast joins. The
436 * completion thread may have already died and we may deadlock waiting
437 * for the completion thread to finish some multicast joins.
439 ipoib_mcast_dev_flush(dev
);
441 /* Delete broadcast and local addresses since they will be recreated */
442 ipoib_mcast_dev_down(dev
);
444 ipoib_flush_paths(dev
);
449 static int recvs_pending(struct net_device
*dev
)
451 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
455 for (i
= 0; i
< IPOIB_RX_RING_SIZE
; ++i
)
456 if (priv
->rx_ring
[i
].skb
)
462 int ipoib_ib_dev_stop(struct net_device
*dev
)
464 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
465 struct ib_qp_attr qp_attr
;
468 struct ipoib_buf
*tx_req
;
471 /* Kill the existing QP and allocate a new one */
472 qp_attr
.qp_state
= IB_QPS_ERR
;
473 attr_mask
= IB_QP_STATE
;
474 if (ib_modify_qp(priv
->qp
, &qp_attr
, attr_mask
))
475 ipoib_warn(priv
, "Failed to modify QP to ERROR state\n");
477 /* Wait for all sends and receives to complete */
480 while (priv
->tx_head
!= priv
->tx_tail
|| recvs_pending(dev
)) {
481 if (time_after(jiffies
, begin
+ 5 * HZ
)) {
482 ipoib_warn(priv
, "timing out; %d sends %d receives not completed\n",
483 priv
->tx_head
- priv
->tx_tail
, recvs_pending(dev
));
486 * assume the HW is wedged and just free up
487 * all our pending work requests.
489 while (priv
->tx_tail
< priv
->tx_head
) {
490 tx_req
= &priv
->tx_ring
[priv
->tx_tail
&
491 (IPOIB_TX_RING_SIZE
- 1)];
492 dma_unmap_single(priv
->ca
->dma_device
,
493 pci_unmap_addr(tx_req
, mapping
),
496 dev_kfree_skb_any(tx_req
->skb
);
500 for (i
= 0; i
< IPOIB_RX_RING_SIZE
; ++i
)
501 if (priv
->rx_ring
[i
].skb
) {
502 dma_unmap_single(priv
->ca
->dma_device
,
503 pci_unmap_addr(&priv
->rx_ring
[i
],
507 dev_kfree_skb_any(priv
->rx_ring
[i
].skb
);
508 priv
->rx_ring
[i
].skb
= NULL
;
517 ipoib_dbg(priv
, "All sends and receives done.\n");
520 qp_attr
.qp_state
= IB_QPS_RESET
;
521 attr_mask
= IB_QP_STATE
;
522 if (ib_modify_qp(priv
->qp
, &qp_attr
, attr_mask
))
523 ipoib_warn(priv
, "Failed to modify QP to RESET state\n");
525 /* Wait for all AHs to be reaped */
526 set_bit(IPOIB_STOP_REAPER
, &priv
->flags
);
527 cancel_delayed_work(&priv
->ah_reap_task
);
528 flush_workqueue(ipoib_workqueue
);
532 while (!list_empty(&priv
->dead_ahs
)) {
533 __ipoib_reap_ah(dev
);
535 if (time_after(jiffies
, begin
+ HZ
)) {
536 ipoib_warn(priv
, "timing out; will leak address handles\n");
546 int ipoib_ib_dev_init(struct net_device
*dev
, struct ib_device
*ca
, int port
)
548 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
554 if (ipoib_transport_dev_init(dev
, ca
)) {
555 printk(KERN_WARNING
"%s: ipoib_transport_dev_init failed\n", ca
->name
);
559 if (dev
->flags
& IFF_UP
) {
560 if (ipoib_ib_dev_open(dev
)) {
561 ipoib_transport_dev_cleanup(dev
);
569 void ipoib_ib_dev_flush(void *_dev
)
571 struct net_device
*dev
= (struct net_device
*)_dev
;
572 struct ipoib_dev_priv
*priv
= netdev_priv(dev
), *cpriv
;
574 if (!test_bit(IPOIB_FLAG_ADMIN_UP
, &priv
->flags
))
577 ipoib_dbg(priv
, "flushing\n");
579 ipoib_ib_dev_down(dev
);
582 * The device could have been brought down between the start and when
583 * we get here, don't bring it back up if it's not configured up
585 if (test_bit(IPOIB_FLAG_ADMIN_UP
, &priv
->flags
))
586 ipoib_ib_dev_up(dev
);
588 /* Flush any child interfaces too */
589 list_for_each_entry(cpriv
, &priv
->child_intfs
, list
)
590 ipoib_ib_dev_flush(&cpriv
->dev
);
593 void ipoib_ib_dev_cleanup(struct net_device
*dev
)
595 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
597 ipoib_dbg(priv
, "cleaning up ib_dev\n");
599 ipoib_mcast_stop_thread(dev
);
601 /* Delete the broadcast address and the local address */
602 ipoib_mcast_dev_down(dev
);
604 ipoib_transport_dev_cleanup(dev
);
608 * Delayed P_Key Assignment Interim Support
610 * The following is an initial implementation of a delayed P_Key assignment
611 * mechanism. It uses the same approach implemented for the multicast
612 * group join. The single goal of this implementation is to quickly address
613 * Bug #2507. This implementation will probably be removed when the P_Key
614 * change async notification is available.
616 int ipoib_open(struct net_device
*dev
);
618 static void ipoib_pkey_dev_check_presence(struct net_device
*dev
)
620 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
623 if (ib_find_cached_pkey(priv
->ca
, priv
->port
, priv
->pkey
, &pkey_index
))
624 clear_bit(IPOIB_PKEY_ASSIGNED
, &priv
->flags
);
626 set_bit(IPOIB_PKEY_ASSIGNED
, &priv
->flags
);
629 void ipoib_pkey_poll(void *dev_ptr
)
631 struct net_device
*dev
= dev_ptr
;
632 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
634 ipoib_pkey_dev_check_presence(dev
);
636 if (test_bit(IPOIB_PKEY_ASSIGNED
, &priv
->flags
))
640 if (!test_bit(IPOIB_PKEY_STOP
, &priv
->flags
))
641 queue_delayed_work(ipoib_workqueue
,
648 int ipoib_pkey_dev_delay_open(struct net_device
*dev
)
650 struct ipoib_dev_priv
*priv
= netdev_priv(dev
);
652 /* Look for the interface pkey value in the IB Port P_Key table and */
653 /* set the interface pkey assignment flag */
654 ipoib_pkey_dev_check_presence(dev
);
656 /* P_Key value not assigned yet - start polling */
657 if (!test_bit(IPOIB_PKEY_ASSIGNED
, &priv
->flags
)) {
659 clear_bit(IPOIB_PKEY_STOP
, &priv
->flags
);
660 queue_delayed_work(ipoib_workqueue
,