drivers/infiniband/ulp/ipoib/ipoib_main.c
1 /*
2 * Copyright (c) 2004 Topspin Communications. All rights reserved.
3 * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
4 * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
6 * This software is available to you under a choice of one of two
7 * licenses. You may choose to be licensed under the terms of the GNU
8 * General Public License (GPL) Version 2, available from the file
9 * COPYING in the main directory of this source tree, or the
10 * OpenIB.org BSD license below:
12 * Redistribution and use in source and binary forms, with or
13 * without modification, are permitted provided that the following
14 * conditions are met:
16 * - Redistributions of source code must retain the above
17 * copyright notice, this list of conditions and the following
18 * disclaimer.
20 * - Redistributions in binary form must reproduce the above
21 * copyright notice, this list of conditions and the following
22 * disclaimer in the documentation and/or other materials
23 * provided with the distribution.
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32 * SOFTWARE.
35 #include "ipoib.h"
37 #include <linux/module.h>
39 #include <linux/init.h>
40 #include <linux/slab.h>
41 #include <linux/kernel.h>
42 #include <linux/vmalloc.h>
44 #include <linux/if_arp.h> /* For ARPHRD_xxx */
46 #include <linux/ip.h>
47 #include <linux/in.h>
49 #include <linux/jhash.h>
50 #include <net/arp.h>
51 #include <net/addrconf.h>
52 #include <linux/inetdevice.h>
53 #include <rdma/ib_cache.h>
54 #include <linux/pci.h>
56 #define DRV_VERSION "1.0.0"
58 const char ipoib_driver_version[] = DRV_VERSION;
60 MODULE_AUTHOR("Roland Dreier");
61 MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
62 MODULE_LICENSE("Dual BSD/GPL");
63 MODULE_VERSION(DRV_VERSION);
65 int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
66 int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
68 module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
69 MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
70 module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
71 MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
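/*
 * Example (illustrative values): the ring sizes can be set at module load
 * time, e.g. "modprobe ib_ipoib send_queue_size=128 recv_queue_size=512".
 * Both values are rounded to a power of two and clamped in
 * ipoib_init_module() below.
 */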
73 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
74 int ipoib_debug_level;
76 module_param_named(debug_level, ipoib_debug_level, int, 0644);
77 MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
78 #endif
80 struct ipoib_path_iter {
81 struct net_device *dev;
82 struct ipoib_path path;
85 static const u8 ipv4_bcast_addr[] = {
86 0x00, 0xff, 0xff, 0xff,
87 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
88 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
91 struct workqueue_struct *ipoib_workqueue;
93 struct ib_sa_client ipoib_sa_client;
95 static void ipoib_add_one(struct ib_device *device);
96 static void ipoib_remove_one(struct ib_device *device, void *client_data);
97 static void ipoib_neigh_reclaim(struct rcu_head *rp);
98 static struct net_device *ipoib_get_net_dev_by_params(
99 struct ib_device *dev, u8 port, u16 pkey,
100 const union ib_gid *gid, const struct sockaddr *addr,
101 void *client_data);
102 static int ipoib_set_mac(struct net_device *dev, void *addr);
104 static struct ib_client ipoib_client = {
105 .name = "ipoib",
106 .add = ipoib_add_one,
107 .remove = ipoib_remove_one,
108 .get_net_dev_by_params = ipoib_get_net_dev_by_params,
111 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
112 static int ipoib_netdev_event(struct notifier_block *this,
113 unsigned long event, void *ptr)
115 struct netdev_notifier_info *ni = ptr;
116 struct net_device *dev = ni->dev;
118 if (dev->netdev_ops->ndo_open != ipoib_open)
119 return NOTIFY_DONE;
121 switch (event) {
122 case NETDEV_REGISTER:
123 ipoib_create_debug_files(dev);
124 break;
125 case NETDEV_CHANGENAME:
126 ipoib_delete_debug_files(dev);
127 ipoib_create_debug_files(dev);
128 break;
129 case NETDEV_UNREGISTER:
130 ipoib_delete_debug_files(dev);
131 break;
134 return NOTIFY_DONE;
136 #endif
138 int ipoib_open(struct net_device *dev)
140 struct ipoib_dev_priv *priv = netdev_priv(dev);
142 ipoib_dbg(priv, "bringing up interface\n");
144 netif_carrier_off(dev);
146 set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
148 priv->sm_fullmember_sendonly_support = false;
150 if (ipoib_ib_dev_open(dev)) {
151 if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
152 return 0;
153 goto err_disable;
156 if (ipoib_ib_dev_up(dev))
157 goto err_stop;
159 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
160 struct ipoib_dev_priv *cpriv;
162 /* Bring up any child interfaces too */
163 down_read(&priv->vlan_rwsem);
164 list_for_each_entry(cpriv, &priv->child_intfs, list) {
165 int flags;
167 flags = cpriv->dev->flags;
168 if (flags & IFF_UP)
169 continue;
171 dev_change_flags(cpriv->dev, flags | IFF_UP);
173 up_read(&priv->vlan_rwsem);
176 netif_start_queue(dev);
178 return 0;
180 err_stop:
181 ipoib_ib_dev_stop(dev);
183 err_disable:
184 clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
186 return -EINVAL;
189 static int ipoib_stop(struct net_device *dev)
191 struct ipoib_dev_priv *priv = netdev_priv(dev);
193 ipoib_dbg(priv, "stopping interface\n");
195 clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
197 netif_stop_queue(dev);
199 ipoib_ib_dev_down(dev);
200 ipoib_ib_dev_stop(dev);
202 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
203 struct ipoib_dev_priv *cpriv;
205 /* Bring down any child interfaces too */
206 down_read(&priv->vlan_rwsem);
207 list_for_each_entry(cpriv, &priv->child_intfs, list) {
208 int flags;
210 flags = cpriv->dev->flags;
211 if (!(flags & IFF_UP))
212 continue;
214 dev_change_flags(cpriv->dev, flags & ~IFF_UP);
216 up_read(&priv->vlan_rwsem);
219 return 0;
222 static void ipoib_uninit(struct net_device *dev)
224 ipoib_dev_cleanup(dev);
227 static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features)
229 struct ipoib_dev_priv *priv = netdev_priv(dev);
231 if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
232 features &= ~(NETIF_F_IP_CSUM | NETIF_F_TSO);
234 return features;
237 static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
239 struct ipoib_dev_priv *priv = netdev_priv(dev);
241 /* dev->mtu > 2K ==> connected mode */
242 if (ipoib_cm_admin_enabled(dev)) {
243 if (new_mtu > ipoib_cm_max_mtu(dev))
244 return -EINVAL;
246 if (new_mtu > priv->mcast_mtu)
247 ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
248 priv->mcast_mtu);
250 dev->mtu = new_mtu;
251 return 0;
254 if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
255 return -EINVAL;
257 priv->admin_mtu = new_mtu;
259 dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
261 return 0;
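/*
 * Summary of the MTU policy above: in connected mode the admin may set any
 * MTU up to ipoib_cm_max_mtu() (typically close to 64K), with a warning if
 * it exceeds the multicast MTU; in datagram mode the MTU is capped at
 * IPOIB_UD_MTU(max_ib_mtu) (e.g. 4092 for a 4K IB MTU, assuming the usual
 * 4-byte encapsulation header) and the effective MTU becomes
 * min(mcast_mtu, admin_mtu).
 */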
264 /* Called with an RCU read lock taken */
265 static bool ipoib_is_dev_match_addr_rcu(const struct sockaddr *addr,
266 struct net_device *dev)
268 struct net *net = dev_net(dev);
269 struct in_device *in_dev;
270 struct sockaddr_in *addr_in = (struct sockaddr_in *)addr;
271 struct sockaddr_in6 *addr_in6 = (struct sockaddr_in6 *)addr;
272 __be32 ret_addr;
274 switch (addr->sa_family) {
275 case AF_INET:
276 in_dev = in_dev_get(dev);
277 if (!in_dev)
278 return false;
280 ret_addr = inet_confirm_addr(net, in_dev, 0,
281 addr_in->sin_addr.s_addr,
282 RT_SCOPE_HOST);
283 in_dev_put(in_dev);
284 if (ret_addr)
285 return true;
287 break;
288 case AF_INET6:
289 if (IS_ENABLED(CONFIG_IPV6) &&
290 ipv6_chk_addr(net, &addr_in6->sin6_addr, dev, 1))
291 return true;
293 break;
295 return false;
299 * Find the master net_device on top of the given net_device.
300 * @dev: base IPoIB net_device
302 * Returns the master net_device with a reference held, or the same net_device
303 * if no master exists.
305 static struct net_device *ipoib_get_master_net_dev(struct net_device *dev)
307 struct net_device *master;
309 rcu_read_lock();
310 master = netdev_master_upper_dev_get_rcu(dev);
311 if (master)
312 dev_hold(master);
313 rcu_read_unlock();
315 if (master)
316 return master;
318 dev_hold(dev);
319 return dev;
323 * Find a net_device matching the given address, which is an upper device of
324 * the given net_device.
325 * @addr: IP address to look for.
326 * @dev: base IPoIB net_device
328 * If found, returns the net_device with a reference held. Otherwise return
329 * NULL.
331 static struct net_device *ipoib_get_net_dev_match_addr(
332 const struct sockaddr *addr, struct net_device *dev)
334 struct net_device *upper,
335 *result = NULL;
336 struct list_head *iter;
338 rcu_read_lock();
339 if (ipoib_is_dev_match_addr_rcu(addr, dev)) {
340 dev_hold(dev);
341 result = dev;
342 goto out;
345 netdev_for_each_all_upper_dev_rcu(dev, upper, iter) {
346 if (ipoib_is_dev_match_addr_rcu(addr, upper)) {
347 dev_hold(upper);
348 result = upper;
349 break;
352 out:
353 rcu_read_unlock();
354 return result;
357 /* Returns the number of IPoIB netdevs on top of a given ipoib device matching a
358 * pkey_index and address, if one exists.
360 * @found_net_dev: contains a matching net_device if the return value >= 1,
361 * with a reference held. */
362 static int ipoib_match_gid_pkey_addr(struct ipoib_dev_priv *priv,
363 const union ib_gid *gid,
364 u16 pkey_index,
365 const struct sockaddr *addr,
366 int nesting,
367 struct net_device **found_net_dev)
369 struct ipoib_dev_priv *child_priv;
370 struct net_device *net_dev = NULL;
371 int matches = 0;
373 if (priv->pkey_index == pkey_index &&
374 (!gid || !memcmp(gid, &priv->local_gid, sizeof(*gid)))) {
375 if (!addr) {
376 net_dev = ipoib_get_master_net_dev(priv->dev);
377 } else {
378 /* Verify the net_device matches the IP address, as
379 * IPoIB child devices currently share a GID. */
380 net_dev = ipoib_get_net_dev_match_addr(addr, priv->dev);
382 if (net_dev) {
383 if (!*found_net_dev)
384 *found_net_dev = net_dev;
385 else
386 dev_put(net_dev);
387 ++matches;
391 /* Check child interfaces */
392 down_read_nested(&priv->vlan_rwsem, nesting);
393 list_for_each_entry(child_priv, &priv->child_intfs, list) {
394 matches += ipoib_match_gid_pkey_addr(child_priv, gid,
395 pkey_index, addr,
396 nesting + 1,
397 found_net_dev);
398 if (matches > 1)
399 break;
401 up_read(&priv->vlan_rwsem);
403 return matches;
406 /* Returns the number of matching net_devs found (between 0 and 2). Also
407 * returns the matching net_device in the @net_dev parameter, holding a
408 * reference to the net_device, if the number of matches >= 1 */
409 static int __ipoib_get_net_dev_by_params(struct list_head *dev_list, u8 port,
410 u16 pkey_index,
411 const union ib_gid *gid,
412 const struct sockaddr *addr,
413 struct net_device **net_dev)
415 struct ipoib_dev_priv *priv;
416 int matches = 0;
418 *net_dev = NULL;
420 list_for_each_entry(priv, dev_list, list) {
421 if (priv->port != port)
422 continue;
424 matches += ipoib_match_gid_pkey_addr(priv, gid, pkey_index,
425 addr, 0, net_dev);
426 if (matches > 1)
427 break;
430 return matches;
433 static struct net_device *ipoib_get_net_dev_by_params(
434 struct ib_device *dev, u8 port, u16 pkey,
435 const union ib_gid *gid, const struct sockaddr *addr,
436 void *client_data)
438 struct net_device *net_dev;
439 struct list_head *dev_list = client_data;
440 u16 pkey_index;
441 int matches;
442 int ret;
444 if (!rdma_protocol_ib(dev, port))
445 return NULL;
447 ret = ib_find_cached_pkey(dev, port, pkey, &pkey_index);
448 if (ret)
449 return NULL;
451 if (!dev_list)
452 return NULL;
454 /* See if we can find a unique device matching the L2 parameters */
455 matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
456 gid, NULL, &net_dev);
458 switch (matches) {
459 case 0:
460 return NULL;
461 case 1:
462 return net_dev;
465 dev_put(net_dev);
467 /* Couldn't find a unique device with L2 parameters only. Use L3
468 * address to uniquely match the net device */
469 matches = __ipoib_get_net_dev_by_params(dev_list, port, pkey_index,
470 gid, addr, &net_dev);
471 switch (matches) {
472 case 0:
473 return NULL;
474 default:
475 dev_warn_ratelimited(&dev->dev,
476 "duplicate IP address detected\n");
477 /* Fall through */
478 case 1:
479 return net_dev;
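/*
 * Lookup strategy used above: first try to resolve the request by L2
 * identity alone (port, P_Key index, GID); since child interfaces share
 * the parent's GID, more than one device may match, in which case the
 * lookup is retried with the caller's L3 address to pick the device that
 * owns that IP.
 */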
483 int ipoib_set_mode(struct net_device *dev, const char *buf)
485 struct ipoib_dev_priv *priv = netdev_priv(dev);
487 /* flush paths if we switch modes so that connections are restarted */
488 if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
489 set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
490 ipoib_warn(priv, "enabling connected mode "
491 "will cause multicast packet drops\n");
492 netdev_update_features(dev);
493 dev_set_mtu(dev, ipoib_cm_max_mtu(dev));
494 rtnl_unlock();
495 priv->tx_wr.wr.send_flags &= ~IB_SEND_IP_CSUM;
497 ipoib_flush_paths(dev);
498 return (!rtnl_trylock()) ? -EBUSY : 0;
501 if (!strcmp(buf, "datagram\n")) {
502 clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
503 netdev_update_features(dev);
504 dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
505 rtnl_unlock();
506 ipoib_flush_paths(dev);
507 return (!rtnl_trylock()) ? -EBUSY : 0;
510 return -EINVAL;
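/*
 * ipoib_set_mode() is driven from sysfs (see ipoib_cm_add_mode_attr()), e.g.
 * "echo connected > /sys/class/net/ib0/mode" or "echo datagram > ...";
 * the trailing newline from echo is why the strcmp()s above match "...\n".
 */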
513 struct ipoib_path *__path_find(struct net_device *dev, void *gid)
515 struct ipoib_dev_priv *priv = netdev_priv(dev);
516 struct rb_node *n = priv->path_tree.rb_node;
517 struct ipoib_path *path;
518 int ret;
520 while (n) {
521 path = rb_entry(n, struct ipoib_path, rb_node);
523 ret = memcmp(gid, path->pathrec.dgid.raw,
524 sizeof (union ib_gid));
526 if (ret < 0)
527 n = n->rb_left;
528 else if (ret > 0)
529 n = n->rb_right;
530 else
531 return path;
534 return NULL;
537 static int __path_add(struct net_device *dev, struct ipoib_path *path)
539 struct ipoib_dev_priv *priv = netdev_priv(dev);
540 struct rb_node **n = &priv->path_tree.rb_node;
541 struct rb_node *pn = NULL;
542 struct ipoib_path *tpath;
543 int ret;
545 while (*n) {
546 pn = *n;
547 tpath = rb_entry(pn, struct ipoib_path, rb_node);
549 ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
550 sizeof (union ib_gid));
551 if (ret < 0)
552 n = &pn->rb_left;
553 else if (ret > 0)
554 n = &pn->rb_right;
555 else
556 return -EEXIST;
559 rb_link_node(&path->rb_node, pn, n);
560 rb_insert_color(&path->rb_node, &priv->path_tree);
562 list_add_tail(&path->list, &priv->path_list);
564 return 0;
567 static void path_free(struct net_device *dev, struct ipoib_path *path)
569 struct sk_buff *skb;
571 while ((skb = __skb_dequeue(&path->queue)))
572 dev_kfree_skb_irq(skb);
574 ipoib_dbg(netdev_priv(dev), "path_free\n");
576 /* remove all neigh connected to this path */
577 ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
579 if (path->ah)
580 ipoib_put_ah(path->ah);
582 kfree(path);
585 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
587 struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
589 struct ipoib_path_iter *iter;
591 iter = kmalloc(sizeof *iter, GFP_KERNEL);
592 if (!iter)
593 return NULL;
595 iter->dev = dev;
596 memset(iter->path.pathrec.dgid.raw, 0, 16);
598 if (ipoib_path_iter_next(iter)) {
599 kfree(iter);
600 return NULL;
603 return iter;
606 int ipoib_path_iter_next(struct ipoib_path_iter *iter)
608 struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
609 struct rb_node *n;
610 struct ipoib_path *path;
611 int ret = 1;
613 spin_lock_irq(&priv->lock);
615 n = rb_first(&priv->path_tree);
617 while (n) {
618 path = rb_entry(n, struct ipoib_path, rb_node);
620 if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
621 sizeof (union ib_gid)) < 0) {
622 iter->path = *path;
623 ret = 0;
624 break;
627 n = rb_next(n);
630 spin_unlock_irq(&priv->lock);
632 return ret;
635 void ipoib_path_iter_read(struct ipoib_path_iter *iter,
636 struct ipoib_path *path)
638 *path = iter->path;
641 #endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
643 void ipoib_mark_paths_invalid(struct net_device *dev)
645 struct ipoib_dev_priv *priv = netdev_priv(dev);
646 struct ipoib_path *path, *tp;
648 spin_lock_irq(&priv->lock);
650 list_for_each_entry_safe(path, tp, &priv->path_list, list) {
651 ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n",
652 be16_to_cpu(path->pathrec.dlid),
653 path->pathrec.dgid.raw);
654 path->valid = 0;
657 spin_unlock_irq(&priv->lock);
660 struct classport_info_context {
661 struct ipoib_dev_priv *priv;
662 struct completion done;
663 struct ib_sa_query *sa_query;
666 static void classport_info_query_cb(int status, struct ib_class_port_info *rec,
667 void *context)
669 struct classport_info_context *cb_ctx = context;
670 struct ipoib_dev_priv *priv;
672 WARN_ON(!context);
674 priv = cb_ctx->priv;
676 if (status || !rec) {
677 pr_debug("device: %s failed query classport_info status: %d\n",
678 priv->dev->name, status);
679 /* keeps the default, will try next mcast_restart */
680 priv->sm_fullmember_sendonly_support = false;
681 goto out;
684 if (ib_get_cpi_capmask2(rec) &
685 IB_SA_CAP_MASK2_SENDONLY_FULL_MEM_SUPPORT) {
686 pr_debug("device: %s enabled fullmember-sendonly for sendonly MCG\n",
687 priv->dev->name);
688 priv->sm_fullmember_sendonly_support = true;
689 } else {
690 pr_debug("device: %s disabled fullmember-sendonly for sendonly MCG\n",
691 priv->dev->name);
692 priv->sm_fullmember_sendonly_support = false;
695 out:
696 complete(&cb_ctx->done);
699 int ipoib_check_sm_sendonly_fullmember_support(struct ipoib_dev_priv *priv)
701 struct classport_info_context *callback_context;
702 int ret;
704 callback_context = kmalloc(sizeof(*callback_context), GFP_KERNEL);
705 if (!callback_context)
706 return -ENOMEM;
708 callback_context->priv = priv;
709 init_completion(&callback_context->done);
711 ret = ib_sa_classport_info_rec_query(&ipoib_sa_client,
712 priv->ca, priv->port, 3000,
713 GFP_KERNEL,
714 classport_info_query_cb,
715 callback_context,
716 &callback_context->sa_query);
717 if (ret < 0) {
718 pr_info("%s failed to send ib_sa_classport_info query, ret: %d\n",
719 priv->dev->name, ret);
720 kfree(callback_context);
721 return ret;
724 /* wait for the callback to finish before returning */
725 wait_for_completion(&callback_context->done);
726 kfree(callback_context);
728 return ret;
731 static void push_pseudo_header(struct sk_buff *skb, const char *daddr)
733 struct ipoib_pseudo_header *phdr;
735 phdr = (struct ipoib_pseudo_header *)skb_push(skb, sizeof(*phdr));
736 memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
739 void ipoib_flush_paths(struct net_device *dev)
741 struct ipoib_dev_priv *priv = netdev_priv(dev);
742 struct ipoib_path *path, *tp;
743 LIST_HEAD(remove_list);
744 unsigned long flags;
746 netif_tx_lock_bh(dev);
747 spin_lock_irqsave(&priv->lock, flags);
749 list_splice_init(&priv->path_list, &remove_list);
751 list_for_each_entry(path, &remove_list, list)
752 rb_erase(&path->rb_node, &priv->path_tree);
754 list_for_each_entry_safe(path, tp, &remove_list, list) {
755 if (path->query)
756 ib_sa_cancel_query(path->query_id, path->query);
757 spin_unlock_irqrestore(&priv->lock, flags);
758 netif_tx_unlock_bh(dev);
759 wait_for_completion(&path->done);
760 path_free(dev, path);
761 netif_tx_lock_bh(dev);
762 spin_lock_irqsave(&priv->lock, flags);
765 spin_unlock_irqrestore(&priv->lock, flags);
766 netif_tx_unlock_bh(dev);
769 static void path_rec_completion(int status,
770 struct ib_sa_path_rec *pathrec,
771 void *path_ptr)
773 struct ipoib_path *path = path_ptr;
774 struct net_device *dev = path->dev;
775 struct ipoib_dev_priv *priv = netdev_priv(dev);
776 struct ipoib_ah *ah = NULL;
777 struct ipoib_ah *old_ah = NULL;
778 struct ipoib_neigh *neigh, *tn;
779 struct sk_buff_head skqueue;
780 struct sk_buff *skb;
781 unsigned long flags;
783 if (!status)
784 ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
785 be16_to_cpu(pathrec->dlid), pathrec->dgid.raw);
786 else
787 ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
788 status, path->pathrec.dgid.raw);
790 skb_queue_head_init(&skqueue);
792 if (!status) {
793 struct ib_ah_attr av;
795 if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
796 ah = ipoib_create_ah(dev, priv->pd, &av);
799 spin_lock_irqsave(&priv->lock, flags);
801 if (!IS_ERR_OR_NULL(ah)) {
802 path->pathrec = *pathrec;
804 old_ah = path->ah;
805 path->ah = ah;
807 ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
808 ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
810 while ((skb = __skb_dequeue(&path->queue)))
811 __skb_queue_tail(&skqueue, skb);
813 list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
814 if (neigh->ah) {
815 WARN_ON(neigh->ah != old_ah);
817 * Dropping the ah reference inside
818 * priv->lock is safe here, because we
819 * will hold one more reference from
820 * the original value of path->ah (ie
821 * old_ah).
823 ipoib_put_ah(neigh->ah);
825 kref_get(&path->ah->ref);
826 neigh->ah = path->ah;
828 if (ipoib_cm_enabled(dev, neigh->daddr)) {
829 if (!ipoib_cm_get(neigh))
830 ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
831 path,
832 neigh));
833 if (!ipoib_cm_get(neigh)) {
834 ipoib_neigh_free(neigh);
835 continue;
839 while ((skb = __skb_dequeue(&neigh->queue)))
840 __skb_queue_tail(&skqueue, skb);
842 path->valid = 1;
845 path->query = NULL;
846 complete(&path->done);
848 spin_unlock_irqrestore(&priv->lock, flags);
850 if (IS_ERR_OR_NULL(ah))
851 ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
853 if (old_ah)
854 ipoib_put_ah(old_ah);
856 while ((skb = __skb_dequeue(&skqueue))) {
857 skb->dev = dev;
858 if (dev_queue_xmit(skb))
859 ipoib_warn(priv, "dev_queue_xmit failed "
860 "to requeue packet\n");
864 static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
866 struct ipoib_dev_priv *priv = netdev_priv(dev);
867 struct ipoib_path *path;
869 if (!priv->broadcast)
870 return NULL;
872 path = kzalloc(sizeof *path, GFP_ATOMIC);
873 if (!path)
874 return NULL;
876 path->dev = dev;
878 skb_queue_head_init(&path->queue);
880 INIT_LIST_HEAD(&path->neigh_list);
882 memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
883 path->pathrec.sgid = priv->local_gid;
884 path->pathrec.pkey = cpu_to_be16(priv->pkey);
885 path->pathrec.numb_path = 1;
886 path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
888 return path;
891 static int path_rec_start(struct net_device *dev,
892 struct ipoib_path *path)
894 struct ipoib_dev_priv *priv = netdev_priv(dev);
896 ipoib_dbg(priv, "Start path record lookup for %pI6\n",
897 path->pathrec.dgid.raw);
899 init_completion(&path->done);
901 path->query_id =
902 ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
903 &path->pathrec,
904 IB_SA_PATH_REC_DGID |
905 IB_SA_PATH_REC_SGID |
906 IB_SA_PATH_REC_NUMB_PATH |
907 IB_SA_PATH_REC_TRAFFIC_CLASS |
908 IB_SA_PATH_REC_PKEY,
909 1000, GFP_ATOMIC,
910 path_rec_completion,
911 path, &path->query);
912 if (path->query_id < 0) {
913 ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
914 path->query = NULL;
915 complete(&path->done);
916 return path->query_id;
919 return 0;
922 static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
923 struct net_device *dev)
925 struct ipoib_dev_priv *priv = netdev_priv(dev);
926 struct ipoib_path *path;
927 struct ipoib_neigh *neigh;
928 unsigned long flags;
930 spin_lock_irqsave(&priv->lock, flags);
931 neigh = ipoib_neigh_alloc(daddr, dev);
932 if (!neigh) {
933 spin_unlock_irqrestore(&priv->lock, flags);
934 ++dev->stats.tx_dropped;
935 dev_kfree_skb_any(skb);
936 return;
939 path = __path_find(dev, daddr + 4);
940 if (!path) {
941 path = path_rec_create(dev, daddr + 4);
942 if (!path)
943 goto err_path;
945 __path_add(dev, path);
948 list_add_tail(&neigh->list, &path->neigh_list);
950 if (path->ah) {
951 kref_get(&path->ah->ref);
952 neigh->ah = path->ah;
954 if (ipoib_cm_enabled(dev, neigh->daddr)) {
955 if (!ipoib_cm_get(neigh))
956 ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
957 if (!ipoib_cm_get(neigh)) {
958 ipoib_neigh_free(neigh);
959 goto err_drop;
961 if (skb_queue_len(&neigh->queue) <
962 IPOIB_MAX_PATH_REC_QUEUE) {
963 push_pseudo_header(skb, neigh->daddr);
964 __skb_queue_tail(&neigh->queue, skb);
965 } else {
966 ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
967 skb_queue_len(&neigh->queue));
968 goto err_drop;
970 } else {
971 spin_unlock_irqrestore(&priv->lock, flags);
972 ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr));
973 ipoib_neigh_put(neigh);
974 return;
976 } else {
977 neigh->ah = NULL;
979 if (!path->query && path_rec_start(dev, path))
980 goto err_path;
981 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
982 push_pseudo_header(skb, neigh->daddr);
983 __skb_queue_tail(&neigh->queue, skb);
984 } else {
985 goto err_drop;
989 spin_unlock_irqrestore(&priv->lock, flags);
990 ipoib_neigh_put(neigh);
991 return;
993 err_path:
994 ipoib_neigh_free(neigh);
995 err_drop:
996 ++dev->stats.tx_dropped;
997 dev_kfree_skb_any(skb);
999 spin_unlock_irqrestore(&priv->lock, flags);
1000 ipoib_neigh_put(neigh);
1003 static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
1004 struct ipoib_pseudo_header *phdr)
1006 struct ipoib_dev_priv *priv = netdev_priv(dev);
1007 struct ipoib_path *path;
1008 unsigned long flags;
1010 spin_lock_irqsave(&priv->lock, flags);
1012 path = __path_find(dev, phdr->hwaddr + 4);
1013 if (!path || !path->valid) {
1014 int new_path = 0;
1016 if (!path) {
1017 path = path_rec_create(dev, phdr->hwaddr + 4);
1018 new_path = 1;
1020 if (path) {
1021 if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
1022 push_pseudo_header(skb, phdr->hwaddr);
1023 __skb_queue_tail(&path->queue, skb);
1024 } else {
1025 ++dev->stats.tx_dropped;
1026 dev_kfree_skb_any(skb);
1029 if (!path->query && path_rec_start(dev, path)) {
1030 spin_unlock_irqrestore(&priv->lock, flags);
1031 if (new_path)
1032 path_free(dev, path);
1033 return;
1034 } else
1035 __path_add(dev, path);
1036 } else {
1037 ++dev->stats.tx_dropped;
1038 dev_kfree_skb_any(skb);
1041 spin_unlock_irqrestore(&priv->lock, flags);
1042 return;
1045 if (path->ah) {
1046 ipoib_dbg(priv, "Send unicast ARP to %04x\n",
1047 be16_to_cpu(path->pathrec.dlid));
1049 spin_unlock_irqrestore(&priv->lock, flags);
1050 ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
1051 return;
1052 } else if ((path->query || !path_rec_start(dev, path)) &&
1053 skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
1054 push_pseudo_header(skb, phdr->hwaddr);
1055 __skb_queue_tail(&path->queue, skb);
1056 } else {
1057 ++dev->stats.tx_dropped;
1058 dev_kfree_skb_any(skb);
1061 spin_unlock_irqrestore(&priv->lock, flags);
1064 static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
1066 struct ipoib_dev_priv *priv = netdev_priv(dev);
1067 struct ipoib_neigh *neigh;
1068 struct ipoib_pseudo_header *phdr;
1069 struct ipoib_header *header;
1070 unsigned long flags;
1072 phdr = (struct ipoib_pseudo_header *) skb->data;
1073 skb_pull(skb, sizeof(*phdr));
1074 header = (struct ipoib_header *) skb->data;
1076 if (unlikely(phdr->hwaddr[4] == 0xff)) {
1077 /* multicast, arrange "if" according to probability */
1078 if ((header->proto != htons(ETH_P_IP)) &&
1079 (header->proto != htons(ETH_P_IPV6)) &&
1080 (header->proto != htons(ETH_P_ARP)) &&
1081 (header->proto != htons(ETH_P_RARP)) &&
1082 (header->proto != htons(ETH_P_TIPC))) {
1083 /* ethertype not supported by IPoIB */
1084 ++dev->stats.tx_dropped;
1085 dev_kfree_skb_any(skb);
1086 return NETDEV_TX_OK;
1088 /* Add in the P_Key for multicast */
1089 phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
1090 phdr->hwaddr[9] = priv->pkey & 0xff;
1092 neigh = ipoib_neigh_get(dev, phdr->hwaddr);
1093 if (likely(neigh))
1094 goto send_using_neigh;
1095 ipoib_mcast_send(dev, phdr->hwaddr, skb);
1096 return NETDEV_TX_OK;
1099 /* unicast, arrange "switch" according to probability */
1100 switch (header->proto) {
1101 case htons(ETH_P_IP):
1102 case htons(ETH_P_IPV6):
1103 case htons(ETH_P_TIPC):
1104 neigh = ipoib_neigh_get(dev, phdr->hwaddr);
1105 if (unlikely(!neigh)) {
1106 neigh_add_path(skb, phdr->hwaddr, dev);
1107 return NETDEV_TX_OK;
1109 break;
1110 case htons(ETH_P_ARP):
1111 case htons(ETH_P_RARP):
1112 /* for unicast ARP and RARP, always perform a path record lookup */
1113 unicast_arp_send(skb, dev, phdr);
1114 return NETDEV_TX_OK;
1115 default:
1116 /* ethertype not supported by IPoIB */
1117 ++dev->stats.tx_dropped;
1118 dev_kfree_skb_any(skb);
1119 return NETDEV_TX_OK;
1122 send_using_neigh:
1123 /* note we now hold a ref to neigh */
1124 if (ipoib_cm_get(neigh)) {
1125 if (ipoib_cm_up(neigh)) {
1126 ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
1127 goto unref;
1129 } else if (neigh->ah) {
1130 ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(phdr->hwaddr));
1131 goto unref;
1134 if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
1135 push_pseudo_header(skb, phdr->hwaddr);
1136 spin_lock_irqsave(&priv->lock, flags);
1137 __skb_queue_tail(&neigh->queue, skb);
1138 spin_unlock_irqrestore(&priv->lock, flags);
1139 } else {
1140 ++dev->stats.tx_dropped;
1141 dev_kfree_skb_any(skb);
1144 unref:
1145 ipoib_neigh_put(neigh);
1147 return NETDEV_TX_OK;
1150 static void ipoib_timeout(struct net_device *dev)
1152 struct ipoib_dev_priv *priv = netdev_priv(dev);
1154 ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
1155 jiffies_to_msecs(jiffies - dev_trans_start(dev)));
1156 ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
1157 netif_queue_stopped(dev),
1158 priv->tx_head, priv->tx_tail);
1159 /* XXX reset QP, etc. */
1162 static int ipoib_hard_header(struct sk_buff *skb,
1163 struct net_device *dev,
1164 unsigned short type,
1165 const void *daddr, const void *saddr, unsigned len)
1167 struct ipoib_header *header;
1169 header = (struct ipoib_header *) skb_push(skb, sizeof *header);
1171 header->proto = htons(type);
1172 header->reserved = 0;
1175 * We don't rely on the dst_entry structure; always stuff the
1176 * destination address into the skb hard header so we can figure out
1177 * where to send the packet later.
1179 push_pseudo_header(skb, daddr);
1181 return IPOIB_HARD_LEN;
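/*
 * Header layout produced above (assuming the usual IPoIB definitions):
 * a 4-byte ipoib_header (ethertype + reserved) followed by a 20-byte
 * ipoib_pseudo_header carrying the destination hardware address, i.e.
 * IPOIB_HARD_LEN = 24 bytes.  The pseudo header is consumed and stripped
 * again in ipoib_start_xmit(); only the 4-byte encapsulation header is
 * part of the transmitted payload.
 */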
1184 static void ipoib_set_mcast_list(struct net_device *dev)
1186 struct ipoib_dev_priv *priv = netdev_priv(dev);
1188 if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
1189 ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
1190 return;
1193 queue_work(priv->wq, &priv->restart_task);
1196 static int ipoib_get_iflink(const struct net_device *dev)
1198 struct ipoib_dev_priv *priv = netdev_priv(dev);
1200 /* parent interface */
1201 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
1202 return dev->ifindex;
1204 /* child/vlan interface */
1205 return priv->parent->ifindex;
1208 static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
1211 * Use only the address parts that contribute to spreading.
1212 * The subnet prefix is not used, as one cannot connect to the
1213 * same remote port (GUID) using the same remote QPN via two
1214 * different subnets.
1216 /* qpn octets[1:4) & port GUID octets[12:20) */
1217 u32 *d32 = (u32 *) daddr;
1218 u32 hv;
1220 hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0);
1221 return hv & htbl->mask;
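/*
 * The 20-byte IPoIB hardware address interpreted above (a sketch of the
 * usual layout): bytes 0-3 hold flags plus the 24-bit QPN (hence the
 * IPOIB_QPN_MASK applied to d32[0]), bytes 4-11 the GID subnet prefix,
 * and bytes 12-19 the port GUID (d32[3] and d32[4]).
 */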
1224 struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
1226 struct ipoib_dev_priv *priv = netdev_priv(dev);
1227 struct ipoib_neigh_table *ntbl = &priv->ntbl;
1228 struct ipoib_neigh_hash *htbl;
1229 struct ipoib_neigh *neigh = NULL;
1230 u32 hash_val;
1232 rcu_read_lock_bh();
1234 htbl = rcu_dereference_bh(ntbl->htbl);
1236 if (!htbl)
1237 goto out_unlock;
1239 hash_val = ipoib_addr_hash(htbl, daddr);
1240 for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]);
1241 neigh != NULL;
1242 neigh = rcu_dereference_bh(neigh->hnext)) {
1243 if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
1244 /* found, take one ref on behalf of the caller */
1245 if (!atomic_inc_not_zero(&neigh->refcnt)) {
1246 /* deleted */
1247 neigh = NULL;
1248 goto out_unlock;
1251 if (likely(skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE))
1252 neigh->alive = jiffies;
1253 goto out_unlock;
1257 out_unlock:
1258 rcu_read_unlock_bh();
1259 return neigh;
1262 static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
1264 struct ipoib_neigh_table *ntbl = &priv->ntbl;
1265 struct ipoib_neigh_hash *htbl;
1266 unsigned long neigh_obsolete;
1267 unsigned long dt;
1268 unsigned long flags;
1269 int i;
1270 LIST_HEAD(remove_list);
1272 if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
1273 return;
1275 spin_lock_irqsave(&priv->lock, flags);
1277 htbl = rcu_dereference_protected(ntbl->htbl,
1278 lockdep_is_held(&priv->lock));
1280 if (!htbl)
1281 goto out_unlock;
1283 /* neigh is obsolete if it was idle for two GC periods */
1284 dt = 2 * arp_tbl.gc_interval;
1285 neigh_obsolete = jiffies - dt;
1286 /* handle possible race condition */
1287 if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
1288 goto out_unlock;
1290 for (i = 0; i < htbl->size; i++) {
1291 struct ipoib_neigh *neigh;
1292 struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1294 while ((neigh = rcu_dereference_protected(*np,
1295 lockdep_is_held(&priv->lock))) != NULL) {
1296 /* was the neigh idle for two GC periods */
1297 if (time_after(neigh_obsolete, neigh->alive)) {
1299 ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list);
1301 rcu_assign_pointer(*np,
1302 rcu_dereference_protected(neigh->hnext,
1303 lockdep_is_held(&priv->lock)));
1304 /* remove from path/mc list */
1305 list_del(&neigh->list);
1306 call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1307 } else {
1308 np = &neigh->hnext;
1314 out_unlock:
1315 spin_unlock_irqrestore(&priv->lock, flags);
1316 ipoib_mcast_remove_list(&remove_list);
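/*
 * GC policy in effect here: a neigh is reclaimed once it has been idle for
 * two ARP GC intervals (2 * arp_tbl.gc_interval); ipoib_reap_neigh() below
 * re-queues itself every gc_interval until IPOIB_STOP_NEIGH_GC is set.
 */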
1319 static void ipoib_reap_neigh(struct work_struct *work)
1321 struct ipoib_dev_priv *priv =
1322 container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);
1324 __ipoib_reap_neigh(priv);
1326 if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
1327 queue_delayed_work(priv->wq, &priv->neigh_reap_task,
1328 arp_tbl.gc_interval);
1332 static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
1333 struct net_device *dev)
1335 struct ipoib_neigh *neigh;
1337 neigh = kzalloc(sizeof *neigh, GFP_ATOMIC);
1338 if (!neigh)
1339 return NULL;
1341 neigh->dev = dev;
1342 memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
1343 skb_queue_head_init(&neigh->queue);
1344 INIT_LIST_HEAD(&neigh->list);
1345 ipoib_cm_set(neigh, NULL);
1346 /* one ref on behalf of the caller */
1347 atomic_set(&neigh->refcnt, 1);
1349 return neigh;
1352 struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
1353 struct net_device *dev)
1355 struct ipoib_dev_priv *priv = netdev_priv(dev);
1356 struct ipoib_neigh_table *ntbl = &priv->ntbl;
1357 struct ipoib_neigh_hash *htbl;
1358 struct ipoib_neigh *neigh;
1359 u32 hash_val;
1361 htbl = rcu_dereference_protected(ntbl->htbl,
1362 lockdep_is_held(&priv->lock));
1363 if (!htbl) {
1364 neigh = NULL;
1365 goto out_unlock;
1368 /* We need to add a new neigh, but some other thread may have beaten us to it;
1369 * recalculate the hash (a resize may have happened) and search again
1371 hash_val = ipoib_addr_hash(htbl, daddr);
1372 for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
1373 lockdep_is_held(&priv->lock));
1374 neigh != NULL;
1375 neigh = rcu_dereference_protected(neigh->hnext,
1376 lockdep_is_held(&priv->lock))) {
1377 if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
1378 /* found, take one ref on behalf of the caller */
1379 if (!atomic_inc_not_zero(&neigh->refcnt)) {
1380 /* deleted */
1381 neigh = NULL;
1382 break;
1384 neigh->alive = jiffies;
1385 goto out_unlock;
1389 neigh = ipoib_neigh_ctor(daddr, dev);
1390 if (!neigh)
1391 goto out_unlock;
1393 /* one ref on behalf of the hash table */
1394 atomic_inc(&neigh->refcnt);
1395 neigh->alive = jiffies;
1396 /* put in hash */
1397 rcu_assign_pointer(neigh->hnext,
1398 rcu_dereference_protected(htbl->buckets[hash_val],
1399 lockdep_is_held(&priv->lock)));
1400 rcu_assign_pointer(htbl->buckets[hash_val], neigh);
1401 atomic_inc(&ntbl->entries);
1403 out_unlock:
1405 return neigh;
1408 void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
1410 /* the neigh reference count was dropped to zero */
1411 struct net_device *dev = neigh->dev;
1412 struct ipoib_dev_priv *priv = netdev_priv(dev);
1413 struct sk_buff *skb;
1414 if (neigh->ah)
1415 ipoib_put_ah(neigh->ah);
1416 while ((skb = __skb_dequeue(&neigh->queue))) {
1417 ++dev->stats.tx_dropped;
1418 dev_kfree_skb_any(skb);
1420 if (ipoib_cm_get(neigh))
1421 ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
1422 ipoib_dbg(netdev_priv(dev),
1423 "neigh free for %06x %pI6\n",
1424 IPOIB_QPN(neigh->daddr),
1425 neigh->daddr + 4);
1426 kfree(neigh);
1427 if (atomic_dec_and_test(&priv->ntbl.entries)) {
1428 if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
1429 complete(&priv->ntbl.flushed);
1433 static void ipoib_neigh_reclaim(struct rcu_head *rp)
1435 /* Called as a result of removal from hash table */
1436 struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu);
1437 /* note TX context may hold another ref */
1438 ipoib_neigh_put(neigh);
1441 void ipoib_neigh_free(struct ipoib_neigh *neigh)
1443 struct net_device *dev = neigh->dev;
1444 struct ipoib_dev_priv *priv = netdev_priv(dev);
1445 struct ipoib_neigh_table *ntbl = &priv->ntbl;
1446 struct ipoib_neigh_hash *htbl;
1447 struct ipoib_neigh __rcu **np;
1448 struct ipoib_neigh *n;
1449 u32 hash_val;
1451 htbl = rcu_dereference_protected(ntbl->htbl,
1452 lockdep_is_held(&priv->lock));
1453 if (!htbl)
1454 return;
1456 hash_val = ipoib_addr_hash(htbl, neigh->daddr);
1457 np = &htbl->buckets[hash_val];
1458 for (n = rcu_dereference_protected(*np,
1459 lockdep_is_held(&priv->lock));
1460 n != NULL;
1461 n = rcu_dereference_protected(*np,
1462 lockdep_is_held(&priv->lock))) {
1463 if (n == neigh) {
1464 /* found */
1465 rcu_assign_pointer(*np,
1466 rcu_dereference_protected(neigh->hnext,
1467 lockdep_is_held(&priv->lock)));
1468 /* remove from parent list */
1469 list_del(&neigh->list);
1470 call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1471 return;
1472 } else {
1473 np = &n->hnext;
1478 static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
1480 struct ipoib_neigh_table *ntbl = &priv->ntbl;
1481 struct ipoib_neigh_hash *htbl;
1482 struct ipoib_neigh __rcu **buckets;
1483 u32 size;
1485 clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
1486 ntbl->htbl = NULL;
1487 htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
1488 if (!htbl)
1489 return -ENOMEM;
1490 set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1491 size = roundup_pow_of_two(arp_tbl.gc_thresh3);
1492 buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL);
1493 if (!buckets) {
1494 kfree(htbl);
1495 return -ENOMEM;
1497 htbl->size = size;
1498 htbl->mask = (size - 1);
1499 htbl->buckets = buckets;
1500 RCU_INIT_POINTER(ntbl->htbl, htbl);
1501 htbl->ntbl = ntbl;
1502 atomic_set(&ntbl->entries, 0);
1504 /* start garbage collection */
1505 clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1506 queue_delayed_work(priv->wq, &priv->neigh_reap_task,
1507 arp_tbl.gc_interval);
1509 return 0;
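/*
 * Sizing note: the hash table is dimensioned from the core ARP table
 * threshold, size = roundup_pow_of_two(arp_tbl.gc_thresh3); with the
 * common default gc_thresh3 of 1024 this gives 1024 buckets
 * (illustrative value).
 */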
1512 static void neigh_hash_free_rcu(struct rcu_head *head)
1514 struct ipoib_neigh_hash *htbl = container_of(head,
1515 struct ipoib_neigh_hash,
1516 rcu);
1517 struct ipoib_neigh __rcu **buckets = htbl->buckets;
1518 struct ipoib_neigh_table *ntbl = htbl->ntbl;
1520 kfree(buckets);
1521 kfree(htbl);
1522 complete(&ntbl->deleted);
1525 void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
1527 struct ipoib_dev_priv *priv = netdev_priv(dev);
1528 struct ipoib_neigh_table *ntbl = &priv->ntbl;
1529 struct ipoib_neigh_hash *htbl;
1530 unsigned long flags;
1531 int i;
1533 /* remove all neigh connected to a given path or mcast */
1534 spin_lock_irqsave(&priv->lock, flags);
1536 htbl = rcu_dereference_protected(ntbl->htbl,
1537 lockdep_is_held(&priv->lock));
1539 if (!htbl)
1540 goto out_unlock;
1542 for (i = 0; i < htbl->size; i++) {
1543 struct ipoib_neigh *neigh;
1544 struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1546 while ((neigh = rcu_dereference_protected(*np,
1547 lockdep_is_held(&priv->lock))) != NULL) {
1548 /* delete neighs belonging to this parent */
1549 if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
1550 rcu_assign_pointer(*np,
1551 rcu_dereference_protected(neigh->hnext,
1552 lockdep_is_held(&priv->lock)));
1553 /* remove from parent list */
1554 list_del(&neigh->list);
1555 call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1556 } else {
1557 np = &neigh->hnext;
1562 out_unlock:
1563 spin_unlock_irqrestore(&priv->lock, flags);
1566 static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
1568 struct ipoib_neigh_table *ntbl = &priv->ntbl;
1569 struct ipoib_neigh_hash *htbl;
1570 unsigned long flags;
1571 int i, wait_flushed = 0;
1573 init_completion(&priv->ntbl.flushed);
1575 spin_lock_irqsave(&priv->lock, flags);
1577 htbl = rcu_dereference_protected(ntbl->htbl,
1578 lockdep_is_held(&priv->lock));
1579 if (!htbl)
1580 goto out_unlock;
1582 wait_flushed = atomic_read(&priv->ntbl.entries);
1583 if (!wait_flushed)
1584 goto free_htbl;
1586 for (i = 0; i < htbl->size; i++) {
1587 struct ipoib_neigh *neigh;
1588 struct ipoib_neigh __rcu **np = &htbl->buckets[i];
1590 while ((neigh = rcu_dereference_protected(*np,
1591 lockdep_is_held(&priv->lock))) != NULL) {
1592 rcu_assign_pointer(*np,
1593 rcu_dereference_protected(neigh->hnext,
1594 lockdep_is_held(&priv->lock)));
1595 /* remove from path/mc list */
1596 list_del(&neigh->list);
1597 call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
1601 free_htbl:
1602 rcu_assign_pointer(ntbl->htbl, NULL);
1603 call_rcu(&htbl->rcu, neigh_hash_free_rcu);
1605 out_unlock:
1606 spin_unlock_irqrestore(&priv->lock, flags);
1607 if (wait_flushed)
1608 wait_for_completion(&priv->ntbl.flushed);
1611 static void ipoib_neigh_hash_uninit(struct net_device *dev)
1613 struct ipoib_dev_priv *priv = netdev_priv(dev);
1614 int stopped;
1616 ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
1617 init_completion(&priv->ntbl.deleted);
1618 set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
1620 /* Stop GC; if we were called because init failed, the work must be cancelled */
1621 stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
1622 if (!stopped)
1623 cancel_delayed_work(&priv->neigh_reap_task);
1625 ipoib_flush_neighs(priv);
1627 wait_for_completion(&priv->ntbl.deleted);
1631 int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
1633 struct ipoib_dev_priv *priv = netdev_priv(dev);
1635 /* Allocate RX/TX "rings" to hold queued skbs */
1636 priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
1637 GFP_KERNEL);
1638 if (!priv->rx_ring) {
1639 printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
1640 ca->name, ipoib_recvq_size);
1641 goto out;
1644 priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
1645 if (!priv->tx_ring) {
1646 printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
1647 ca->name, ipoib_sendq_size);
1648 goto out_rx_ring_cleanup;
1651 /* priv->tx_head, tx_tail & tx_outstanding are already 0 */
1653 if (ipoib_ib_dev_init(dev, ca, port))
1654 goto out_tx_ring_cleanup;
1657 * Must be after ipoib_ib_dev_init so we can allocate a per
1658 * device wq there and use it here
1660 if (ipoib_neigh_hash_init(priv) < 0)
1661 goto out_dev_uninit;
1663 return 0;
1665 out_dev_uninit:
1666 ipoib_ib_dev_cleanup(dev);
1668 out_tx_ring_cleanup:
1669 vfree(priv->tx_ring);
1671 out_rx_ring_cleanup:
1672 kfree(priv->rx_ring);
1674 out:
1675 return -ENOMEM;
1678 void ipoib_dev_cleanup(struct net_device *dev)
1680 struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;
1681 LIST_HEAD(head);
1683 ASSERT_RTNL();
1685 /* Delete any child interfaces first */
1686 list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
1687 /* Stop GC on child */
1688 set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags);
1689 cancel_delayed_work(&cpriv->neigh_reap_task);
1690 unregister_netdevice_queue(cpriv->dev, &head);
1692 unregister_netdevice_many(&head);
1695 * Must be before ipoib_ib_dev_cleanup or we delete an in-use
1696 * work queue
1698 ipoib_neigh_hash_uninit(dev);
1700 ipoib_ib_dev_cleanup(dev);
1702 kfree(priv->rx_ring);
1703 vfree(priv->tx_ring);
1705 priv->rx_ring = NULL;
1706 priv->tx_ring = NULL;
1709 static int ipoib_set_vf_link_state(struct net_device *dev, int vf, int link_state)
1711 struct ipoib_dev_priv *priv = netdev_priv(dev);
1713 return ib_set_vf_link_state(priv->ca, vf, priv->port, link_state);
1716 static int ipoib_get_vf_config(struct net_device *dev, int vf,
1717 struct ifla_vf_info *ivf)
1719 struct ipoib_dev_priv *priv = netdev_priv(dev);
1720 int err;
1722 err = ib_get_vf_config(priv->ca, vf, priv->port, ivf);
1723 if (err)
1724 return err;
1726 ivf->vf = vf;
1728 return 0;
1731 static int ipoib_set_vf_guid(struct net_device *dev, int vf, u64 guid, int type)
1733 struct ipoib_dev_priv *priv = netdev_priv(dev);
1735 if (type != IFLA_VF_IB_NODE_GUID && type != IFLA_VF_IB_PORT_GUID)
1736 return -EINVAL;
1738 return ib_set_vf_guid(priv->ca, vf, priv->port, guid, type);
1741 static int ipoib_get_vf_stats(struct net_device *dev, int vf,
1742 struct ifla_vf_stats *vf_stats)
1744 struct ipoib_dev_priv *priv = netdev_priv(dev);
1746 return ib_get_vf_stats(priv->ca, vf, priv->port, vf_stats);
1749 static const struct header_ops ipoib_header_ops = {
1750 .create = ipoib_hard_header,
1753 static const struct net_device_ops ipoib_netdev_ops_pf = {
1754 .ndo_uninit = ipoib_uninit,
1755 .ndo_open = ipoib_open,
1756 .ndo_stop = ipoib_stop,
1757 .ndo_change_mtu = ipoib_change_mtu,
1758 .ndo_fix_features = ipoib_fix_features,
1759 .ndo_start_xmit = ipoib_start_xmit,
1760 .ndo_tx_timeout = ipoib_timeout,
1761 .ndo_set_rx_mode = ipoib_set_mcast_list,
1762 .ndo_get_iflink = ipoib_get_iflink,
1763 .ndo_set_vf_link_state = ipoib_set_vf_link_state,
1764 .ndo_get_vf_config = ipoib_get_vf_config,
1765 .ndo_get_vf_stats = ipoib_get_vf_stats,
1766 .ndo_set_vf_guid = ipoib_set_vf_guid,
1767 .ndo_set_mac_address = ipoib_set_mac,
1770 static const struct net_device_ops ipoib_netdev_ops_vf = {
1771 .ndo_uninit = ipoib_uninit,
1772 .ndo_open = ipoib_open,
1773 .ndo_stop = ipoib_stop,
1774 .ndo_change_mtu = ipoib_change_mtu,
1775 .ndo_fix_features = ipoib_fix_features,
1776 .ndo_start_xmit = ipoib_start_xmit,
1777 .ndo_tx_timeout = ipoib_timeout,
1778 .ndo_set_rx_mode = ipoib_set_mcast_list,
1779 .ndo_get_iflink = ipoib_get_iflink,
1782 void ipoib_setup(struct net_device *dev)
1784 struct ipoib_dev_priv *priv = netdev_priv(dev);
1786 if (priv->hca_caps & IB_DEVICE_VIRTUAL_FUNCTION)
1787 dev->netdev_ops = &ipoib_netdev_ops_vf;
1788 else
1789 dev->netdev_ops = &ipoib_netdev_ops_pf;
1791 dev->header_ops = &ipoib_header_ops;
1793 ipoib_set_ethtool_ops(dev);
1795 netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT);
1797 dev->watchdog_timeo = HZ;
1799 dev->flags |= IFF_BROADCAST | IFF_MULTICAST;
1801 dev->hard_header_len = IPOIB_HARD_LEN;
1802 dev->addr_len = INFINIBAND_ALEN;
1803 dev->type = ARPHRD_INFINIBAND;
1804 dev->tx_queue_len = ipoib_sendq_size * 2;
1805 dev->features = (NETIF_F_VLAN_CHALLENGED |
1806 NETIF_F_HIGHDMA);
1807 netif_keep_dst(dev);
1809 memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
1811 priv->dev = dev;
1813 spin_lock_init(&priv->lock);
1815 init_rwsem(&priv->vlan_rwsem);
1817 INIT_LIST_HEAD(&priv->path_list);
1818 INIT_LIST_HEAD(&priv->child_intfs);
1819 INIT_LIST_HEAD(&priv->dead_ahs);
1820 INIT_LIST_HEAD(&priv->multicast_list);
1822 INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
1823 INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
1824 INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);
1825 INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);
1826 INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
1827 INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
1828 INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
1829 INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
1832 struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
1834 struct net_device *dev;
1836 dev = alloc_netdev((int)sizeof(struct ipoib_dev_priv), name,
1837 NET_NAME_UNKNOWN, ipoib_setup);
1838 if (!dev)
1839 return NULL;
1841 return netdev_priv(dev);
1844 static ssize_t show_pkey(struct device *dev,
1845 struct device_attribute *attr, char *buf)
1847 struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
1849 return sprintf(buf, "0x%04x\n", priv->pkey);
1851 static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
1853 static ssize_t show_umcast(struct device *dev,
1854 struct device_attribute *attr, char *buf)
1856 struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
1858 return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
1861 void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
1863 struct ipoib_dev_priv *priv = netdev_priv(ndev);
1865 if (umcast_val > 0) {
1866 set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
1867 ipoib_warn(priv, "ignoring multicast groups joined directly "
1868 "by userspace\n");
1869 } else
1870 clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
1873 static ssize_t set_umcast(struct device *dev,
1874 struct device_attribute *attr,
1875 const char *buf, size_t count)
1877 unsigned long umcast_val = simple_strtoul(buf, NULL, 0);
1879 ipoib_set_umcast(to_net_dev(dev), umcast_val);
1881 return count;
1883 static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);
1885 int ipoib_add_umcast_attr(struct net_device *dev)
1887 return device_create_file(&dev->dev, &dev_attr_umcast);
1890 static void set_base_guid(struct ipoib_dev_priv *priv, union ib_gid *gid)
1892 struct ipoib_dev_priv *child_priv;
1893 struct net_device *netdev = priv->dev;
1895 netif_addr_lock_bh(netdev);
1897 memcpy(&priv->local_gid.global.interface_id,
1898 &gid->global.interface_id,
1899 sizeof(gid->global.interface_id));
1900 memcpy(netdev->dev_addr + 4, &priv->local_gid, sizeof(priv->local_gid));
1901 clear_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
1903 netif_addr_unlock_bh(netdev);
1905 if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
1906 down_read(&priv->vlan_rwsem);
1907 list_for_each_entry(child_priv, &priv->child_intfs, list)
1908 set_base_guid(child_priv, gid);
1909 up_read(&priv->vlan_rwsem);
1913 static int ipoib_check_lladdr(struct net_device *dev,
1914 struct sockaddr_storage *ss)
1916 union ib_gid *gid = (union ib_gid *)(ss->__data + 4);
1917 int ret = 0;
1919 netif_addr_lock_bh(dev);
1921 /* Make sure the QPN, reserved field and subnet prefix match the current
1922 * lladdr; this also ensures the lladdr is unicast.
1924 if (memcmp(dev->dev_addr, ss->__data,
1925 4 + sizeof(gid->global.subnet_prefix)) ||
1926 gid->global.interface_id == 0)
1927 ret = -EINVAL;
1929 netif_addr_unlock_bh(dev);
1931 return ret;
1934 static int ipoib_set_mac(struct net_device *dev, void *addr)
1936 struct ipoib_dev_priv *priv = netdev_priv(dev);
1937 struct sockaddr_storage *ss = addr;
1938 int ret;
1940 if (!(dev->priv_flags & IFF_LIVE_ADDR_CHANGE) && netif_running(dev))
1941 return -EBUSY;
1943 ret = ipoib_check_lladdr(dev, ss);
1944 if (ret)
1945 return ret;
1947 set_base_guid(priv, (union ib_gid *)(ss->__data + 4));
1949 queue_work(ipoib_workqueue, &priv->flush_light);
1951 return 0;
1954 static ssize_t create_child(struct device *dev,
1955 struct device_attribute *attr,
1956 const char *buf, size_t count)
1958 int pkey;
1959 int ret;
1961 if (sscanf(buf, "%i", &pkey) != 1)
1962 return -EINVAL;
1964 if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000)
1965 return -EINVAL;
1968 * Set the full membership bit, so that we join the right
1969 * broadcast group, etc.
1971 pkey |= 0x8000;
1973 ret = ipoib_vlan_add(to_net_dev(dev), pkey);
1975 return ret ? ret : count;
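/*
 * Example: "echo 0x8001 > /sys/class/net/ib0/create_child" creates the
 * child interface ib0.8001 (a P_Key interface); because the full-membership
 * bit is OR'ed in above, writing 0x0001 requests the same partition.
 */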
1977 static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);
1979 static ssize_t delete_child(struct device *dev,
1980 struct device_attribute *attr,
1981 const char *buf, size_t count)
1983 int pkey;
1984 int ret;
1986 if (sscanf(buf, "%i", &pkey) != 1)
1987 return -EINVAL;
1989 if (pkey < 0 || pkey > 0xffff)
1990 return -EINVAL;
1992 ret = ipoib_vlan_delete(to_net_dev(dev), pkey);
1994 return ret ? ret : count;
1997 static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);
1999 int ipoib_add_pkey_attr(struct net_device *dev)
2001 return device_create_file(&dev->dev, &dev_attr_pkey);
2004 int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
2006 priv->hca_caps = hca->attrs.device_cap_flags;
2008 if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
2009 priv->dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
2011 if (priv->hca_caps & IB_DEVICE_UD_TSO)
2012 priv->dev->hw_features |= NETIF_F_TSO;
2014 priv->dev->features |= priv->dev->hw_features;
2017 return 0;
2020 static struct net_device *ipoib_add_port(const char *format,
2021 struct ib_device *hca, u8 port)
2023 struct ipoib_dev_priv *priv;
2024 struct ib_port_attr attr;
2025 int result = -ENOMEM;
2027 priv = ipoib_intf_alloc(format);
2028 if (!priv)
2029 goto alloc_mem_failed;
2031 SET_NETDEV_DEV(priv->dev, hca->dma_device);
2032 priv->dev->dev_id = port - 1;
2034 result = ib_query_port(hca, port, &attr);
2035 if (!result)
2036 priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
2037 else {
2038 printk(KERN_WARNING "%s: ib_query_port %d failed\n",
2039 hca->name, port);
2040 goto device_init_failed;
2043 /* MTU will be reset when mcast join happens */
2044 priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
2045 priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu;
2047 priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh);
2049 result = ib_query_pkey(hca, port, 0, &priv->pkey);
2050 if (result) {
2051 printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
2052 hca->name, port, result);
2053 goto device_init_failed;
2056 result = ipoib_set_dev_features(priv, hca);
2057 if (result)
2058 goto device_init_failed;
2061 * Set the full membership bit, so that we join the right
2062 * broadcast group, etc.
2064 priv->pkey |= 0x8000;
2066 priv->dev->broadcast[8] = priv->pkey >> 8;
2067 priv->dev->broadcast[9] = priv->pkey & 0xff;
2069 result = ib_query_gid(hca, port, 0, &priv->local_gid, NULL);
2070 if (result) {
2071 printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
2072 hca->name, port, result);
2073 goto device_init_failed;
2074 } else
2075 memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
2076 set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags);
2078 result = ipoib_dev_init(priv->dev, hca, port);
2079 if (result < 0) {
2080 printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
2081 hca->name, port, result);
2082 goto device_init_failed;
2085 INIT_IB_EVENT_HANDLER(&priv->event_handler,
2086 priv->ca, ipoib_event);
2087 result = ib_register_event_handler(&priv->event_handler);
2088 if (result < 0) {
2089 printk(KERN_WARNING "%s: ib_register_event_handler failed for "
2090 "port %d (ret = %d)\n",
2091 hca->name, port, result);
2092 goto event_failed;
2095 result = register_netdev(priv->dev);
2096 if (result) {
2097 printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
2098 hca->name, port, result);
2099 goto register_failed;
2102 if (ipoib_cm_add_mode_attr(priv->dev))
2103 goto sysfs_failed;
2104 if (ipoib_add_pkey_attr(priv->dev))
2105 goto sysfs_failed;
2106 if (ipoib_add_umcast_attr(priv->dev))
2107 goto sysfs_failed;
2108 if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
2109 goto sysfs_failed;
2110 if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))
2111 goto sysfs_failed;
2113 return priv->dev;
2115 sysfs_failed:
2116 unregister_netdev(priv->dev);
2118 register_failed:
2119 ib_unregister_event_handler(&priv->event_handler);
2120 flush_workqueue(ipoib_workqueue);
2121 /* Stop GC if started before flush */
2122 set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
2123 cancel_delayed_work(&priv->neigh_reap_task);
2124 flush_workqueue(priv->wq);
2126 event_failed:
2127 ipoib_dev_cleanup(priv->dev);
2129 device_init_failed:
2130 free_netdev(priv->dev);
2132 alloc_mem_failed:
2133 return ERR_PTR(result);
2136 static void ipoib_add_one(struct ib_device *device)
2138 struct list_head *dev_list;
2139 struct net_device *dev;
2140 struct ipoib_dev_priv *priv;
2141 int p;
2142 int count = 0;
2144 dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
2145 if (!dev_list)
2146 return;
2148 INIT_LIST_HEAD(dev_list);
2150 for (p = rdma_start_port(device); p <= rdma_end_port(device); ++p) {
2151 if (!rdma_protocol_ib(device, p))
2152 continue;
2153 dev = ipoib_add_port("ib%d", device, p);
2154 if (!IS_ERR(dev)) {
2155 priv = netdev_priv(dev);
2156 list_add_tail(&priv->list, dev_list);
2157 count++;
2161 if (!count) {
2162 kfree(dev_list);
2163 return;
2166 ib_set_client_data(device, &ipoib_client, dev_list);
2169 static void ipoib_remove_one(struct ib_device *device, void *client_data)
2171 struct ipoib_dev_priv *priv, *tmp;
2172 struct list_head *dev_list = client_data;
2174 if (!dev_list)
2175 return;
2177 list_for_each_entry_safe(priv, tmp, dev_list, list) {
2178 ib_unregister_event_handler(&priv->event_handler);
2179 flush_workqueue(ipoib_workqueue);
2181 /* mark interface in the middle of destruction */
2182 set_bit(IPOIB_FLAG_GOING_DOWN, &priv->flags);
2184 rtnl_lock();
2185 dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
2186 rtnl_unlock();
2188 /* Stop GC */
2189 set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
2190 cancel_delayed_work(&priv->neigh_reap_task);
2191 flush_workqueue(priv->wq);
2193 unregister_netdev(priv->dev);
2194 free_netdev(priv->dev);
2197 kfree(dev_list);
2200 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
2201 static struct notifier_block ipoib_netdev_notifier = {
2202 .notifier_call = ipoib_netdev_event,
2204 #endif
2206 static int __init ipoib_init_module(void)
2208 int ret;
2210 ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
2211 ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
2212 ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);
2214 ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
2215 ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
2216 ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);
2217 #ifdef CONFIG_INFINIBAND_IPOIB_CM
2218 ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
2219 #endif
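/*
 * Example of the clamping above (illustrative): a requested recv_queue_size
 * of 100 is rounded up to 128 and then bounded to
 * [IPOIB_MIN_QUEUE_SIZE, IPOIB_MAX_QUEUE_SIZE]; send_queue_size is
 * additionally kept at or above 2 * MAX_SEND_CQE.
 */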
2222 * When copying small received packets, we only copy from the
2223 * linear data part of the SKB, so we rely on this condition.
2225 BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
2227 ret = ipoib_register_debugfs();
2228 if (ret)
2229 return ret;
2232 * We create a global workqueue here that is used for all flush
2233 * operations. However, if you attempt to flush a workqueue
2234 * from a task on that same workqueue, it deadlocks the system.
2235 * We want to be able to flush the tasks associated with a
2236 * specific net device, so we also create a workqueue for each
2237 * netdevice. We queue up the tasks for that device only on
2238 * its private workqueue, and we only queue up flush events
2239 * on our global flush workqueue. This avoids the deadlocks.
2241 ipoib_workqueue = alloc_ordered_workqueue("ipoib_flush",
2242 WQ_MEM_RECLAIM);
2243 if (!ipoib_workqueue) {
2244 ret = -ENOMEM;
2245 goto err_fs;
2248 ib_sa_register_client(&ipoib_sa_client);
2250 ret = ib_register_client(&ipoib_client);
2251 if (ret)
2252 goto err_sa;
2254 ret = ipoib_netlink_init();
2255 if (ret)
2256 goto err_client;
2258 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
2259 register_netdevice_notifier(&ipoib_netdev_notifier);
2260 #endif
2261 return 0;
2263 err_client:
2264 ib_unregister_client(&ipoib_client);
2266 err_sa:
2267 ib_sa_unregister_client(&ipoib_sa_client);
2268 destroy_workqueue(ipoib_workqueue);
2270 err_fs:
2271 ipoib_unregister_debugfs();
2273 return ret;
2276 static void __exit ipoib_cleanup_module(void)
2278 #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
2279 unregister_netdevice_notifier(&ipoib_netdev_notifier);
2280 #endif
2281 ipoib_netlink_fini();
2282 ib_unregister_client(&ipoib_client);
2283 ib_sa_unregister_client(&ipoib_sa_client);
2284 ipoib_unregister_debugfs();
2285 destroy_workqueue(ipoib_workqueue);
2288 module_init(ipoib_init_module);
2289 module_exit(ipoib_cleanup_module);