2 * NET3 Protocol independent device support routines.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Derived from the non IP parts of dev.c 1.0.19
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
51 * Rudi Cilibrasi : Pass the right thing to
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
75 #include <asm/uaccess.h>
76 #include <asm/system.h>
77 #include <linux/bitops.h>
78 #include <linux/capability.h>
79 #include <linux/cpu.h>
80 #include <linux/types.h>
81 #include <linux/kernel.h>
82 #include <linux/hash.h>
83 #include <linux/slab.h>
84 #include <linux/sched.h>
85 #include <linux/mutex.h>
86 #include <linux/string.h>
88 #include <linux/socket.h>
89 #include <linux/sockios.h>
90 #include <linux/errno.h>
91 #include <linux/interrupt.h>
92 #include <linux/if_ether.h>
93 #include <linux/netdevice.h>
94 #include <linux/etherdevice.h>
95 #include <linux/ethtool.h>
96 #include <linux/notifier.h>
97 #include <linux/skbuff.h>
98 #include <net/net_namespace.h>
100 #include <linux/rtnetlink.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/stat.h>
105 #include <net/pkt_sched.h>
106 #include <net/checksum.h>
107 #include <net/xfrm.h>
108 #include <linux/highmem.h>
109 #include <linux/init.h>
110 #include <linux/kmod.h>
111 #include <linux/module.h>
112 #include <linux/netpoll.h>
113 #include <linux/rcupdate.h>
114 #include <linux/delay.h>
115 #include <net/wext.h>
116 #include <net/iw_handler.h>
117 #include <asm/current.h>
118 #include <linux/audit.h>
119 #include <linux/dmaengine.h>
120 #include <linux/err.h>
121 #include <linux/ctype.h>
122 #include <linux/if_arp.h>
123 #include <linux/if_vlan.h>
124 #include <linux/ip.h>
126 #include <linux/ipv6.h>
127 #include <linux/in.h>
128 #include <linux/jhash.h>
129 #include <linux/random.h>
130 #include <trace/events/napi.h>
131 #include <trace/events/net.h>
132 #include <trace/events/skb.h>
133 #include <linux/pci.h>
134 #include <linux/inetdevice.h>
135 #include <linux/cpu_rmap.h>
137 #include "net-sysfs.h"
139 /* Instead of increasing this, you should create a hash table. */
140 #define MAX_GRO_SKBS 8
142 /* This should be increased if a protocol with a bigger head is added. */
143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
146 * The list of packet types we will receive (as opposed to discard)
147 * and the routines to invoke.
149 * Why 16. Because with 16 the only overlap we get on a hash of the
150 * low nibble of the protocol value is RARP/SNAP/X.25.
152 * NOTE: That is no longer true with the addition of VLAN tags. Not
153 * sure which should go first, but I bet it won't make much
154 * difference if we are running VLANs. The good news is that
155 * this protocol won't be in the list unless compiled in, so
156 * the average user (w/out VLANs) will not be adversely affected.
173 #define PTYPE_HASH_SIZE (16)
174 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
176 static DEFINE_SPINLOCK(ptype_lock
);
177 static struct list_head ptype_base
[PTYPE_HASH_SIZE
] __read_mostly
;
178 static struct list_head ptype_all __read_mostly
; /* Taps */
181 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
184 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
186 * Writers must hold the rtnl semaphore while they loop through the
187 * dev_base_head list, and hold dev_base_lock for writing when they do the
188 * actual updates. This allows pure readers to access the list even
189 * while a writer is preparing to update it.
191 * To put it another way, dev_base_lock is held for writing only to
192 * protect against pure readers; the rtnl semaphore provides the
193 * protection against other writers.
195 * See, for example usages, register_netdevice() and
196 * unregister_netdevice(), which must be called with the rtnl
199 DEFINE_RWLOCK(dev_base_lock
);
200 EXPORT_SYMBOL(dev_base_lock
);
202 static inline struct hlist_head
*dev_name_hash(struct net
*net
, const char *name
)
204 unsigned hash
= full_name_hash(name
, strnlen(name
, IFNAMSIZ
));
205 return &net
->dev_name_head
[hash_32(hash
, NETDEV_HASHBITS
)];
208 static inline struct hlist_head
*dev_index_hash(struct net
*net
, int ifindex
)
210 return &net
->dev_index_head
[ifindex
& (NETDEV_HASHENTRIES
- 1)];
213 static inline void rps_lock(struct softnet_data
*sd
)
216 spin_lock(&sd
->input_pkt_queue
.lock
);
220 static inline void rps_unlock(struct softnet_data
*sd
)
223 spin_unlock(&sd
->input_pkt_queue
.lock
);
227 /* Device list insertion */
228 static int list_netdevice(struct net_device
*dev
)
230 struct net
*net
= dev_net(dev
);
234 write_lock_bh(&dev_base_lock
);
235 list_add_tail_rcu(&dev
->dev_list
, &net
->dev_base_head
);
236 hlist_add_head_rcu(&dev
->name_hlist
, dev_name_hash(net
, dev
->name
));
237 hlist_add_head_rcu(&dev
->index_hlist
,
238 dev_index_hash(net
, dev
->ifindex
));
239 write_unlock_bh(&dev_base_lock
);
243 /* Device list removal
244 * caller must respect a RCU grace period before freeing/reusing dev
246 static void unlist_netdevice(struct net_device
*dev
)
250 /* Unlink dev from the device chain */
251 write_lock_bh(&dev_base_lock
);
252 list_del_rcu(&dev
->dev_list
);
253 hlist_del_rcu(&dev
->name_hlist
);
254 hlist_del_rcu(&dev
->index_hlist
);
255 write_unlock_bh(&dev_base_lock
);
262 static RAW_NOTIFIER_HEAD(netdev_chain
);
265 * Device drivers call our routines to queue packets here. We empty the
266 * queue in the local softnet handler.
269 DEFINE_PER_CPU_ALIGNED(struct softnet_data
, softnet_data
);
270 EXPORT_PER_CPU_SYMBOL(softnet_data
);
272 #ifdef CONFIG_LOCKDEP
274 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
275 * according to dev->type
277 static const unsigned short netdev_lock_type
[] =
278 {ARPHRD_NETROM
, ARPHRD_ETHER
, ARPHRD_EETHER
, ARPHRD_AX25
,
279 ARPHRD_PRONET
, ARPHRD_CHAOS
, ARPHRD_IEEE802
, ARPHRD_ARCNET
,
280 ARPHRD_APPLETLK
, ARPHRD_DLCI
, ARPHRD_ATM
, ARPHRD_METRICOM
,
281 ARPHRD_IEEE1394
, ARPHRD_EUI64
, ARPHRD_INFINIBAND
, ARPHRD_SLIP
,
282 ARPHRD_CSLIP
, ARPHRD_SLIP6
, ARPHRD_CSLIP6
, ARPHRD_RSRVD
,
283 ARPHRD_ADAPT
, ARPHRD_ROSE
, ARPHRD_X25
, ARPHRD_HWX25
,
284 ARPHRD_PPP
, ARPHRD_CISCO
, ARPHRD_LAPB
, ARPHRD_DDCMP
,
285 ARPHRD_RAWHDLC
, ARPHRD_TUNNEL
, ARPHRD_TUNNEL6
, ARPHRD_FRAD
,
286 ARPHRD_SKIP
, ARPHRD_LOOPBACK
, ARPHRD_LOCALTLK
, ARPHRD_FDDI
,
287 ARPHRD_BIF
, ARPHRD_SIT
, ARPHRD_IPDDP
, ARPHRD_IPGRE
,
288 ARPHRD_PIMREG
, ARPHRD_HIPPI
, ARPHRD_ASH
, ARPHRD_ECONET
,
289 ARPHRD_IRDA
, ARPHRD_FCPP
, ARPHRD_FCAL
, ARPHRD_FCPL
,
290 ARPHRD_FCFABRIC
, ARPHRD_IEEE802_TR
, ARPHRD_IEEE80211
,
291 ARPHRD_IEEE80211_PRISM
, ARPHRD_IEEE80211_RADIOTAP
, ARPHRD_PHONET
,
292 ARPHRD_PHONET_PIPE
, ARPHRD_IEEE802154
,
293 ARPHRD_VOID
, ARPHRD_NONE
};
295 static const char *const netdev_lock_name
[] =
296 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
297 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
298 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
299 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
300 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
301 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
302 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
303 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
304 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
305 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
306 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
307 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
308 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
309 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
310 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
311 "_xmit_VOID", "_xmit_NONE"};
313 static struct lock_class_key netdev_xmit_lock_key
[ARRAY_SIZE(netdev_lock_type
)];
314 static struct lock_class_key netdev_addr_lock_key
[ARRAY_SIZE(netdev_lock_type
)];
316 static inline unsigned short netdev_lock_pos(unsigned short dev_type
)
320 for (i
= 0; i
< ARRAY_SIZE(netdev_lock_type
); i
++)
321 if (netdev_lock_type
[i
] == dev_type
)
323 /* the last key is used by default */
324 return ARRAY_SIZE(netdev_lock_type
) - 1;
327 static inline void netdev_set_xmit_lockdep_class(spinlock_t
*lock
,
328 unsigned short dev_type
)
332 i
= netdev_lock_pos(dev_type
);
333 lockdep_set_class_and_name(lock
, &netdev_xmit_lock_key
[i
],
334 netdev_lock_name
[i
]);
337 static inline void netdev_set_addr_lockdep_class(struct net_device
*dev
)
341 i
= netdev_lock_pos(dev
->type
);
342 lockdep_set_class_and_name(&dev
->addr_list_lock
,
343 &netdev_addr_lock_key
[i
],
344 netdev_lock_name
[i
]);
347 static inline void netdev_set_xmit_lockdep_class(spinlock_t
*lock
,
348 unsigned short dev_type
)
351 static inline void netdev_set_addr_lockdep_class(struct net_device
*dev
)
356 /*******************************************************************************
358 Protocol management and registration routines
360 *******************************************************************************/
363 * Add a protocol ID to the list. Now that the input handler is
364 * smarter we can dispense with all the messy stuff that used to be
367 * BEWARE!!! Protocol handlers, mangling input packets,
368 * MUST BE last in hash buckets and checking protocol handlers
369 * MUST start from promiscuous ptype_all chain in net_bh.
370 * It is true now, do not change it.
371 * Explanation follows: if protocol handler, mangling packet, will
372 * be the first on list, it is not able to sense, that packet
373 * is cloned and should be copied-on-write, so that it will
374 * change it and subsequent readers will get broken packet.
378 static inline struct list_head
*ptype_head(const struct packet_type
*pt
)
380 if (pt
->type
== htons(ETH_P_ALL
))
383 return &ptype_base
[ntohs(pt
->type
) & PTYPE_HASH_MASK
];
387 * dev_add_pack - add packet handler
388 * @pt: packet type declaration
390 * Add a protocol handler to the networking stack. The passed &packet_type
391 * is linked into kernel lists and may not be freed until it has been
392 * removed from the kernel lists.
394 * This call does not sleep therefore it can not
395 * guarantee all CPU's that are in middle of receiving packets
396 * will see the new packet type (until the next received packet).
399 void dev_add_pack(struct packet_type
*pt
)
401 struct list_head
*head
= ptype_head(pt
);
403 spin_lock(&ptype_lock
);
404 list_add_rcu(&pt
->list
, head
);
405 spin_unlock(&ptype_lock
);
407 EXPORT_SYMBOL(dev_add_pack
);
410 * __dev_remove_pack - remove packet handler
411 * @pt: packet type declaration
413 * Remove a protocol handler that was previously added to the kernel
414 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
415 * from the kernel lists and can be freed or reused once this function
418 * The packet type might still be in use by receivers
419 * and must not be freed until after all the CPU's have gone
420 * through a quiescent state.
422 void __dev_remove_pack(struct packet_type
*pt
)
424 struct list_head
*head
= ptype_head(pt
);
425 struct packet_type
*pt1
;
427 spin_lock(&ptype_lock
);
429 list_for_each_entry(pt1
, head
, list
) {
431 list_del_rcu(&pt
->list
);
436 printk(KERN_WARNING
"dev_remove_pack: %p not found.\n", pt
);
438 spin_unlock(&ptype_lock
);
440 EXPORT_SYMBOL(__dev_remove_pack
);
443 * dev_remove_pack - remove packet handler
444 * @pt: packet type declaration
446 * Remove a protocol handler that was previously added to the kernel
447 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
448 * from the kernel lists and can be freed or reused once this function
451 * This call sleeps to guarantee that no CPU is looking at the packet
454 void dev_remove_pack(struct packet_type
*pt
)
456 __dev_remove_pack(pt
);
460 EXPORT_SYMBOL(dev_remove_pack
);
462 /******************************************************************************
464 Device Boot-time Settings Routines
466 *******************************************************************************/
468 /* Boot time configuration table */
469 static struct netdev_boot_setup dev_boot_setup
[NETDEV_BOOT_SETUP_MAX
];
472 * netdev_boot_setup_add - add new setup entry
473 * @name: name of the device
474 * @map: configured settings for the device
476 * Adds new setup entry to the dev_boot_setup list. The function
477 * returns 0 on error and 1 on success. This is a generic routine to
480 static int netdev_boot_setup_add(char *name
, struct ifmap
*map
)
482 struct netdev_boot_setup
*s
;
486 for (i
= 0; i
< NETDEV_BOOT_SETUP_MAX
; i
++) {
487 if (s
[i
].name
[0] == '\0' || s
[i
].name
[0] == ' ') {
488 memset(s
[i
].name
, 0, sizeof(s
[i
].name
));
489 strlcpy(s
[i
].name
, name
, IFNAMSIZ
);
490 memcpy(&s
[i
].map
, map
, sizeof(s
[i
].map
));
495 return i
>= NETDEV_BOOT_SETUP_MAX
? 0 : 1;
499 * netdev_boot_setup_check - check boot time settings
500 * @dev: the netdevice
502 * Check boot time settings for the device.
503 * The found settings are set for the device to be used
504 * later in the device probing.
505 * Returns 0 if no settings found, 1 if they are.
507 int netdev_boot_setup_check(struct net_device
*dev
)
509 struct netdev_boot_setup
*s
= dev_boot_setup
;
512 for (i
= 0; i
< NETDEV_BOOT_SETUP_MAX
; i
++) {
513 if (s
[i
].name
[0] != '\0' && s
[i
].name
[0] != ' ' &&
514 !strcmp(dev
->name
, s
[i
].name
)) {
515 dev
->irq
= s
[i
].map
.irq
;
516 dev
->base_addr
= s
[i
].map
.base_addr
;
517 dev
->mem_start
= s
[i
].map
.mem_start
;
518 dev
->mem_end
= s
[i
].map
.mem_end
;
524 EXPORT_SYMBOL(netdev_boot_setup_check
);
528 * netdev_boot_base - get address from boot time settings
529 * @prefix: prefix for network device
530 * @unit: id for network device
532 * Check boot time settings for the base address of device.
533 * The found settings are set for the device to be used
534 * later in the device probing.
535 * Returns 0 if no settings found.
537 unsigned long netdev_boot_base(const char *prefix
, int unit
)
539 const struct netdev_boot_setup
*s
= dev_boot_setup
;
543 sprintf(name
, "%s%d", prefix
, unit
);
546 * If device already registered then return base of 1
547 * to indicate not to probe for this interface
549 if (__dev_get_by_name(&init_net
, name
))
552 for (i
= 0; i
< NETDEV_BOOT_SETUP_MAX
; i
++)
553 if (!strcmp(name
, s
[i
].name
))
554 return s
[i
].map
.base_addr
;
559 * Saves at boot time configured settings for any netdevice.
561 int __init
netdev_boot_setup(char *str
)
566 str
= get_options(str
, ARRAY_SIZE(ints
), ints
);
571 memset(&map
, 0, sizeof(map
));
575 map
.base_addr
= ints
[2];
577 map
.mem_start
= ints
[3];
579 map
.mem_end
= ints
[4];
581 /* Add new entry to the list */
582 return netdev_boot_setup_add(str
, &map
);
585 __setup("netdev=", netdev_boot_setup
);
587 /*******************************************************************************
589 Device Interface Subroutines
591 *******************************************************************************/
594 * __dev_get_by_name - find a device by its name
595 * @net: the applicable net namespace
596 * @name: name to find
598 * Find an interface by name. Must be called under RTNL semaphore
599 * or @dev_base_lock. If the name is found a pointer to the device
600 * is returned. If the name is not found then %NULL is returned. The
601 * reference counters are not incremented so the caller must be
602 * careful with locks.
605 struct net_device
*__dev_get_by_name(struct net
*net
, const char *name
)
607 struct hlist_node
*p
;
608 struct net_device
*dev
;
609 struct hlist_head
*head
= dev_name_hash(net
, name
);
611 hlist_for_each_entry(dev
, p
, head
, name_hlist
)
612 if (!strncmp(dev
->name
, name
, IFNAMSIZ
))
617 EXPORT_SYMBOL(__dev_get_by_name
);
620 * dev_get_by_name_rcu - find a device by its name
621 * @net: the applicable net namespace
622 * @name: name to find
624 * Find an interface by name.
625 * If the name is found a pointer to the device is returned.
626 * If the name is not found then %NULL is returned.
627 * The reference counters are not incremented so the caller must be
628 * careful with locks. The caller must hold RCU lock.
631 struct net_device
*dev_get_by_name_rcu(struct net
*net
, const char *name
)
633 struct hlist_node
*p
;
634 struct net_device
*dev
;
635 struct hlist_head
*head
= dev_name_hash(net
, name
);
637 hlist_for_each_entry_rcu(dev
, p
, head
, name_hlist
)
638 if (!strncmp(dev
->name
, name
, IFNAMSIZ
))
643 EXPORT_SYMBOL(dev_get_by_name_rcu
);
646 * dev_get_by_name - find a device by its name
647 * @net: the applicable net namespace
648 * @name: name to find
650 * Find an interface by name. This can be called from any
651 * context and does its own locking. The returned handle has
652 * the usage count incremented and the caller must use dev_put() to
653 * release it when it is no longer needed. %NULL is returned if no
654 * matching device is found.
657 struct net_device
*dev_get_by_name(struct net
*net
, const char *name
)
659 struct net_device
*dev
;
662 dev
= dev_get_by_name_rcu(net
, name
);
668 EXPORT_SYMBOL(dev_get_by_name
);
671 * __dev_get_by_index - find a device by its ifindex
672 * @net: the applicable net namespace
673 * @ifindex: index of device
675 * Search for an interface by index. Returns %NULL if the device
676 * is not found or a pointer to the device. The device has not
677 * had its reference counter increased so the caller must be careful
678 * about locking. The caller must hold either the RTNL semaphore
682 struct net_device
*__dev_get_by_index(struct net
*net
, int ifindex
)
684 struct hlist_node
*p
;
685 struct net_device
*dev
;
686 struct hlist_head
*head
= dev_index_hash(net
, ifindex
);
688 hlist_for_each_entry(dev
, p
, head
, index_hlist
)
689 if (dev
->ifindex
== ifindex
)
694 EXPORT_SYMBOL(__dev_get_by_index
);
697 * dev_get_by_index_rcu - find a device by its ifindex
698 * @net: the applicable net namespace
699 * @ifindex: index of device
701 * Search for an interface by index. Returns %NULL if the device
702 * is not found or a pointer to the device. The device has not
703 * had its reference counter increased so the caller must be careful
704 * about locking. The caller must hold RCU lock.
707 struct net_device
*dev_get_by_index_rcu(struct net
*net
, int ifindex
)
709 struct hlist_node
*p
;
710 struct net_device
*dev
;
711 struct hlist_head
*head
= dev_index_hash(net
, ifindex
);
713 hlist_for_each_entry_rcu(dev
, p
, head
, index_hlist
)
714 if (dev
->ifindex
== ifindex
)
719 EXPORT_SYMBOL(dev_get_by_index_rcu
);
723 * dev_get_by_index - find a device by its ifindex
724 * @net: the applicable net namespace
725 * @ifindex: index of device
727 * Search for an interface by index. Returns NULL if the device
728 * is not found or a pointer to the device. The device returned has
729 * had a reference added and the pointer is safe until the user calls
730 * dev_put to indicate they have finished with it.
733 struct net_device
*dev_get_by_index(struct net
*net
, int ifindex
)
735 struct net_device
*dev
;
738 dev
= dev_get_by_index_rcu(net
, ifindex
);
744 EXPORT_SYMBOL(dev_get_by_index
);
747 * dev_getbyhwaddr_rcu - find a device by its hardware address
748 * @net: the applicable net namespace
749 * @type: media type of device
750 * @ha: hardware address
752 * Search for an interface by MAC address. Returns NULL if the device
753 * is not found or a pointer to the device.
754 * The caller must hold RCU or RTNL.
755 * The returned device has not had its ref count increased
756 * and the caller must therefore be careful about locking
760 struct net_device
*dev_getbyhwaddr_rcu(struct net
*net
, unsigned short type
,
763 struct net_device
*dev
;
765 for_each_netdev_rcu(net
, dev
)
766 if (dev
->type
== type
&&
767 !memcmp(dev
->dev_addr
, ha
, dev
->addr_len
))
772 EXPORT_SYMBOL(dev_getbyhwaddr_rcu
);
774 struct net_device
*__dev_getfirstbyhwtype(struct net
*net
, unsigned short type
)
776 struct net_device
*dev
;
779 for_each_netdev(net
, dev
)
780 if (dev
->type
== type
)
785 EXPORT_SYMBOL(__dev_getfirstbyhwtype
);
787 struct net_device
*dev_getfirstbyhwtype(struct net
*net
, unsigned short type
)
789 struct net_device
*dev
, *ret
= NULL
;
792 for_each_netdev_rcu(net
, dev
)
793 if (dev
->type
== type
) {
801 EXPORT_SYMBOL(dev_getfirstbyhwtype
);
804 * dev_get_by_flags_rcu - find any device with given flags
805 * @net: the applicable net namespace
806 * @if_flags: IFF_* values
807 * @mask: bitmask of bits in if_flags to check
809 * Search for any interface with the given flags. Returns NULL if a device
810 * is not found or a pointer to the device. Must be called inside
811 * rcu_read_lock(), and result refcount is unchanged.
814 struct net_device
*dev_get_by_flags_rcu(struct net
*net
, unsigned short if_flags
,
817 struct net_device
*dev
, *ret
;
820 for_each_netdev_rcu(net
, dev
) {
821 if (((dev
->flags
^ if_flags
) & mask
) == 0) {
828 EXPORT_SYMBOL(dev_get_by_flags_rcu
);
831 * dev_valid_name - check if name is okay for network device
834 * Network device names need to be valid file names
835 * to allow sysfs to work. We also disallow any kind of
838 int dev_valid_name(const char *name
)
842 if (strlen(name
) >= IFNAMSIZ
)
844 if (!strcmp(name
, ".") || !strcmp(name
, ".."))
848 if (*name
== '/' || isspace(*name
))
854 EXPORT_SYMBOL(dev_valid_name
);
857 * __dev_alloc_name - allocate a name for a device
858 * @net: network namespace to allocate the device name in
859 * @name: name format string
860 * @buf: scratch buffer and result name string
862 * Passed a format string - eg "lt%d" it will try and find a suitable
863 * id. It scans list of devices to build up a free map, then chooses
864 * the first empty slot. The caller must hold the dev_base or rtnl lock
865 * while allocating the name and adding the device in order to avoid
867 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
868 * Returns the number of the unit assigned or a negative errno code.
871 static int __dev_alloc_name(struct net
*net
, const char *name
, char *buf
)
875 const int max_netdevices
= 8*PAGE_SIZE
;
876 unsigned long *inuse
;
877 struct net_device
*d
;
879 p
= strnchr(name
, IFNAMSIZ
-1, '%');
882 * Verify the string as this thing may have come from
883 * the user. There must be either one "%d" and no other "%"
886 if (p
[1] != 'd' || strchr(p
+ 2, '%'))
889 /* Use one page as a bit array of possible slots */
890 inuse
= (unsigned long *) get_zeroed_page(GFP_ATOMIC
);
894 for_each_netdev(net
, d
) {
895 if (!sscanf(d
->name
, name
, &i
))
897 if (i
< 0 || i
>= max_netdevices
)
900 /* avoid cases where sscanf is not exact inverse of printf */
901 snprintf(buf
, IFNAMSIZ
, name
, i
);
902 if (!strncmp(buf
, d
->name
, IFNAMSIZ
))
906 i
= find_first_zero_bit(inuse
, max_netdevices
);
907 free_page((unsigned long) inuse
);
911 snprintf(buf
, IFNAMSIZ
, name
, i
);
912 if (!__dev_get_by_name(net
, buf
))
915 /* It is possible to run out of possible slots
916 * when the name is long and there isn't enough space left
917 * for the digits, or if all bits are used.
923 * dev_alloc_name - allocate a name for a device
925 * @name: name format string
927 * Passed a format string - eg "lt%d" it will try and find a suitable
928 * id. It scans list of devices to build up a free map, then chooses
929 * the first empty slot. The caller must hold the dev_base or rtnl lock
930 * while allocating the name and adding the device in order to avoid
932 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
933 * Returns the number of the unit assigned or a negative errno code.
936 int dev_alloc_name(struct net_device
*dev
, const char *name
)
942 BUG_ON(!dev_net(dev
));
944 ret
= __dev_alloc_name(net
, name
, buf
);
946 strlcpy(dev
->name
, buf
, IFNAMSIZ
);
949 EXPORT_SYMBOL(dev_alloc_name
);
951 static int dev_get_valid_name(struct net_device
*dev
, const char *name
, bool fmt
)
955 BUG_ON(!dev_net(dev
));
958 if (!dev_valid_name(name
))
961 if (fmt
&& strchr(name
, '%'))
962 return dev_alloc_name(dev
, name
);
963 else if (__dev_get_by_name(net
, name
))
965 else if (dev
->name
!= name
)
966 strlcpy(dev
->name
, name
, IFNAMSIZ
);
972 * dev_change_name - change name of a device
974 * @newname: name (or format string) must be at least IFNAMSIZ
976 * Change name of a device, can pass format strings "eth%d".
979 int dev_change_name(struct net_device
*dev
, const char *newname
)
981 char oldname
[IFNAMSIZ
];
987 BUG_ON(!dev_net(dev
));
990 if (dev
->flags
& IFF_UP
)
993 if (strncmp(newname
, dev
->name
, IFNAMSIZ
) == 0)
996 memcpy(oldname
, dev
->name
, IFNAMSIZ
);
998 err
= dev_get_valid_name(dev
, newname
, 1);
1003 ret
= device_rename(&dev
->dev
, dev
->name
);
1005 memcpy(dev
->name
, oldname
, IFNAMSIZ
);
1009 write_lock_bh(&dev_base_lock
);
1010 hlist_del(&dev
->name_hlist
);
1011 write_unlock_bh(&dev_base_lock
);
1015 write_lock_bh(&dev_base_lock
);
1016 hlist_add_head_rcu(&dev
->name_hlist
, dev_name_hash(net
, dev
->name
));
1017 write_unlock_bh(&dev_base_lock
);
1019 ret
= call_netdevice_notifiers(NETDEV_CHANGENAME
, dev
);
1020 ret
= notifier_to_errno(ret
);
1023 /* err >= 0 after dev_alloc_name() or stores the first errno */
1026 memcpy(dev
->name
, oldname
, IFNAMSIZ
);
1030 "%s: name change rollback failed: %d.\n",
1039 * dev_set_alias - change ifalias of a device
1041 * @alias: name up to IFALIASZ
1042 * @len: limit of bytes to copy from info
1044 * Set ifalias for a device,
1046 int dev_set_alias(struct net_device
*dev
, const char *alias
, size_t len
)
1050 if (len
>= IFALIASZ
)
1055 kfree(dev
->ifalias
);
1056 dev
->ifalias
= NULL
;
1061 dev
->ifalias
= krealloc(dev
->ifalias
, len
+ 1, GFP_KERNEL
);
1065 strlcpy(dev
->ifalias
, alias
, len
+1);
1071 * netdev_features_change - device changes features
1072 * @dev: device to cause notification
1074 * Called to indicate a device has changed features.
1076 void netdev_features_change(struct net_device
*dev
)
1078 call_netdevice_notifiers(NETDEV_FEAT_CHANGE
, dev
);
1080 EXPORT_SYMBOL(netdev_features_change
);
1083 * netdev_state_change - device changes state
1084 * @dev: device to cause notification
1086 * Called to indicate a device has changed state. This function calls
1087 * the notifier chains for netdev_chain and sends a NEWLINK message
1088 * to the routing socket.
1090 void netdev_state_change(struct net_device
*dev
)
1092 if (dev
->flags
& IFF_UP
) {
1093 call_netdevice_notifiers(NETDEV_CHANGE
, dev
);
1094 rtmsg_ifinfo(RTM_NEWLINK
, dev
, 0);
1097 EXPORT_SYMBOL(netdev_state_change
);
/* Forward @event for @dev to the netdevice notifier chain and hand back its result. */
int netdev_bonding_change(struct net_device *dev, unsigned long event)
{
	int verdict = call_netdevice_notifiers(event, dev);

	return verdict;
}
1103 EXPORT_SYMBOL(netdev_bonding_change
);
1106 * dev_load - load a network module
1107 * @net: the applicable net namespace
1108 * @name: name of interface
1110 * If a network interface is not present and the process has suitable
1111 * privileges this function loads the module. If module loading is not
1112 * available in this kernel then it becomes a nop.
1115 void dev_load(struct net
*net
, const char *name
)
1117 struct net_device
*dev
;
1121 dev
= dev_get_by_name_rcu(net
, name
);
1125 if (no_module
&& capable(CAP_NET_ADMIN
))
1126 no_module
= request_module("netdev-%s", name
);
1127 if (no_module
&& capable(CAP_SYS_MODULE
)) {
1128 if (!request_module("%s", name
))
1129 pr_err("Loading kernel module for a network device "
1130 "with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s "
1134 EXPORT_SYMBOL(dev_load
);
1136 static int __dev_open(struct net_device
*dev
)
1138 const struct net_device_ops
*ops
= dev
->netdev_ops
;
1143 if (!netif_device_present(dev
))
1146 ret
= call_netdevice_notifiers(NETDEV_PRE_UP
, dev
);
1147 ret
= notifier_to_errno(ret
);
1151 set_bit(__LINK_STATE_START
, &dev
->state
);
1153 if (ops
->ndo_validate_addr
)
1154 ret
= ops
->ndo_validate_addr(dev
);
1156 if (!ret
&& ops
->ndo_open
)
1157 ret
= ops
->ndo_open(dev
);
1160 clear_bit(__LINK_STATE_START
, &dev
->state
);
1162 dev
->flags
|= IFF_UP
;
1163 net_dmaengine_get();
1164 dev_set_rx_mode(dev
);
1172 * dev_open - prepare an interface for use.
1173 * @dev: device to open
1175 * Takes a device from down to up state. The device's private open
1176 * function is invoked and then the multicast lists are loaded. Finally
1177 * the device is moved into the up state and a %NETDEV_UP message is
1178 * sent to the netdev notifier chain.
1180 * Calling this function on an active interface is a nop. On a failure
1181 * a negative errno code is returned.
1183 int dev_open(struct net_device
*dev
)
1187 if (dev
->flags
& IFF_UP
)
1190 ret
= __dev_open(dev
);
1194 rtmsg_ifinfo(RTM_NEWLINK
, dev
, IFF_UP
|IFF_RUNNING
);
1195 call_netdevice_notifiers(NETDEV_UP
, dev
);
1199 EXPORT_SYMBOL(dev_open
);
1201 static int __dev_close_many(struct list_head
*head
)
1203 struct net_device
*dev
;
1208 list_for_each_entry(dev
, head
, unreg_list
) {
1209 call_netdevice_notifiers(NETDEV_GOING_DOWN
, dev
);
1211 clear_bit(__LINK_STATE_START
, &dev
->state
);
1213 /* Synchronize to scheduled poll. We cannot touch poll list, it
1214 * can be even on different cpu. So just clear netif_running().
1216 * dev->stop() will invoke napi_disable() on all of its
1217 * napi_struct instances on this device.
1219 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1222 dev_deactivate_many(head
);
1224 list_for_each_entry(dev
, head
, unreg_list
) {
1225 const struct net_device_ops
*ops
= dev
->netdev_ops
;
1228 * Call the device specific close. This cannot fail.
1229 * Only if device is UP
1231 * We allow it to be called even after a DETACH hot-plug
1237 dev
->flags
&= ~IFF_UP
;
1238 net_dmaengine_put();
1244 static int __dev_close(struct net_device
*dev
)
1249 list_add(&dev
->unreg_list
, &single
);
1250 retval
= __dev_close_many(&single
);
1255 static int dev_close_many(struct list_head
*head
)
1257 struct net_device
*dev
, *tmp
;
1258 LIST_HEAD(tmp_list
);
1260 list_for_each_entry_safe(dev
, tmp
, head
, unreg_list
)
1261 if (!(dev
->flags
& IFF_UP
))
1262 list_move(&dev
->unreg_list
, &tmp_list
);
1264 __dev_close_many(head
);
1266 list_for_each_entry(dev
, head
, unreg_list
) {
1267 rtmsg_ifinfo(RTM_NEWLINK
, dev
, IFF_UP
|IFF_RUNNING
);
1268 call_netdevice_notifiers(NETDEV_DOWN
, dev
);
1271 /* rollback_registered_many needs the complete original list */
1272 list_splice(&tmp_list
, head
);
1277 * dev_close - shutdown an interface.
1278 * @dev: device to shutdown
1280 * This function moves an active device into down state. A
1281 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1282 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1285 int dev_close(struct net_device
*dev
)
1289 list_add(&dev
->unreg_list
, &single
);
1290 dev_close_many(&single
);
1294 EXPORT_SYMBOL(dev_close
);
1298 * dev_disable_lro - disable Large Receive Offload on a device
1301 * Disable Large Receive Offload (LRO) on a net device. Must be
1302 * called under RTNL. This is needed if received packets may be
1303 * forwarded to another interface.
1305 void dev_disable_lro(struct net_device
*dev
)
1309 if (dev
->ethtool_ops
&& dev
->ethtool_ops
->get_flags
)
1310 flags
= dev
->ethtool_ops
->get_flags(dev
);
1312 flags
= ethtool_op_get_flags(dev
);
1314 if (!(flags
& ETH_FLAG_LRO
))
1317 __ethtool_set_flags(dev
, flags
& ~ETH_FLAG_LRO
);
1318 WARN_ON(dev
->features
& NETIF_F_LRO
);
1320 EXPORT_SYMBOL(dev_disable_lro
);
/*
 * Starts out non-zero.
 * NOTE(review): presumably cleared once net_dev_init() has finished;
 * confirm where it is reset.
 */
static int dev_boot_phase = 1;
1326 * register_netdevice_notifier - register a network notifier block
1329 * Register a notifier to be called when network device events occur.
1330 * The notifier passed is linked into the kernel structures and must
1331 * not be reused until it has been unregistered. A negative errno code
1332 * is returned on a failure.
1334 * When registered all registration and up events are replayed
1335 * to the new notifier to allow device to have a race free
1336 * view of the network device list.
1339 int register_netdevice_notifier(struct notifier_block
*nb
)
1341 struct net_device
*dev
;
1342 struct net_device
*last
;
1347 err
= raw_notifier_chain_register(&netdev_chain
, nb
);
1353 for_each_netdev(net
, dev
) {
1354 err
= nb
->notifier_call(nb
, NETDEV_REGISTER
, dev
);
1355 err
= notifier_to_errno(err
);
1359 if (!(dev
->flags
& IFF_UP
))
1362 nb
->notifier_call(nb
, NETDEV_UP
, dev
);
1373 for_each_netdev(net
, dev
) {
1377 if (dev
->flags
& IFF_UP
) {
1378 nb
->notifier_call(nb
, NETDEV_GOING_DOWN
, dev
);
1379 nb
->notifier_call(nb
, NETDEV_DOWN
, dev
);
1381 nb
->notifier_call(nb
, NETDEV_UNREGISTER
, dev
);
1382 nb
->notifier_call(nb
, NETDEV_UNREGISTER_BATCH
, dev
);
1386 raw_notifier_chain_unregister(&netdev_chain
, nb
);
1389 EXPORT_SYMBOL(register_netdevice_notifier
);
1392 * unregister_netdevice_notifier - unregister a network notifier block
1395 * Unregister a notifier previously registered by
1396 * register_netdevice_notifier(). The notifier is unlinked into the
1397 * kernel structures and may then be reused. A negative errno code
1398 * is returned on a failure.
1401 int unregister_netdevice_notifier(struct notifier_block
*nb
)
1406 err
= raw_notifier_chain_unregister(&netdev_chain
, nb
);
1410 EXPORT_SYMBOL(unregister_netdevice_notifier
);
1413 * call_netdevice_notifiers - call all network notifier blocks
1414 * @val: value passed unmodified to notifier function
1415 * @dev: net_device pointer passed unmodified to notifier function
1417 * Call all network notifier blocks. Parameters and return value
1418 * are as for raw_notifier_call_chain().
1421 int call_netdevice_notifiers(unsigned long val
, struct net_device
*dev
)
1424 return raw_notifier_call_chain(&netdev_chain
, val
, dev
);
1426 EXPORT_SYMBOL(call_netdevice_notifiers
);
1428 /* When > 0 there are consumers of rx skb time stamps */
1429 static atomic_t netstamp_needed
= ATOMIC_INIT(0);
1431 void net_enable_timestamp(void)
1433 atomic_inc(&netstamp_needed
);
1435 EXPORT_SYMBOL(net_enable_timestamp
);
1437 void net_disable_timestamp(void)
1439 atomic_dec(&netstamp_needed
);
1441 EXPORT_SYMBOL(net_disable_timestamp
);
1443 static inline void net_timestamp_set(struct sk_buff
*skb
)
1445 if (atomic_read(&netstamp_needed
))
1446 __net_timestamp(skb
);
1448 skb
->tstamp
.tv64
= 0;
1451 static inline void net_timestamp_check(struct sk_buff
*skb
)
1453 if (!skb
->tstamp
.tv64
&& atomic_read(&netstamp_needed
))
1454 __net_timestamp(skb
);
1458 * dev_forward_skb - loopback an skb to another netif
1460 * @dev: destination network device
1461 * @skb: buffer to forward
1464 * NET_RX_SUCCESS (no congestion)
1465 * NET_RX_DROP (packet was dropped, but freed)
1467 * dev_forward_skb can be used for injecting an skb from the
1468 * start_xmit function of one device into the receive queue
1469 * of another device.
1471 * The receiving device may be in another namespace, so
1472 * we have to clear all information in the skb that could
1473 * impact namespace isolation.
1475 int dev_forward_skb(struct net_device
*dev
, struct sk_buff
*skb
)
1480 if (unlikely(!(dev
->flags
& IFF_UP
) ||
1481 (skb
->len
> (dev
->mtu
+ dev
->hard_header_len
+ VLAN_HLEN
)))) {
1482 atomic_long_inc(&dev
->rx_dropped
);
1486 skb_set_dev(skb
, dev
);
1487 skb
->tstamp
.tv64
= 0;
1488 skb
->pkt_type
= PACKET_HOST
;
1489 skb
->protocol
= eth_type_trans(skb
, dev
);
1490 return netif_rx(skb
);
1492 EXPORT_SYMBOL_GPL(dev_forward_skb
);
1494 static inline int deliver_skb(struct sk_buff
*skb
,
1495 struct packet_type
*pt_prev
,
1496 struct net_device
*orig_dev
)
1498 atomic_inc(&skb
->users
);
1499 return pt_prev
->func(skb
, skb
->dev
, pt_prev
, orig_dev
);
1503 * Support routine. Sends outgoing frames to any network
1504 * taps currently in use.
1507 static void dev_queue_xmit_nit(struct sk_buff
*skb
, struct net_device
*dev
)
1509 struct packet_type
*ptype
;
1510 struct sk_buff
*skb2
= NULL
;
1511 struct packet_type
*pt_prev
= NULL
;
1514 list_for_each_entry_rcu(ptype
, &ptype_all
, list
) {
1515 /* Never send packets back to the socket
1516 * they originated from - MvS (miquels@drinkel.ow.org)
1518 if ((ptype
->dev
== dev
|| !ptype
->dev
) &&
1519 (ptype
->af_packet_priv
== NULL
||
1520 (struct sock
*)ptype
->af_packet_priv
!= skb
->sk
)) {
1522 deliver_skb(skb2
, pt_prev
, skb
->dev
);
1527 skb2
= skb_clone(skb
, GFP_ATOMIC
);
1531 net_timestamp_set(skb2
);
1533 /* skb->nh should be correctly
1534 set by sender, so that the second statement is
1535 just protection against buggy protocols.
1537 skb_reset_mac_header(skb2
);
1539 if (skb_network_header(skb2
) < skb2
->data
||
1540 skb2
->network_header
> skb2
->tail
) {
1541 if (net_ratelimit())
1542 printk(KERN_CRIT
"protocol %04x is "
1544 ntohs(skb2
->protocol
),
1546 skb_reset_network_header(skb2
);
1549 skb2
->transport_header
= skb2
->network_header
;
1550 skb2
->pkt_type
= PACKET_OUTGOING
;
1555 pt_prev
->func(skb2
, skb
->dev
, pt_prev
, skb
->dev
);
1559 /* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1560 * @dev: Network device
1561 * @txq: number of queues available
1563 * If real_num_tx_queues is changed the tc mappings may no longer be
1564 * valid. To resolve this verify the tc mapping remains valid and if
1565 * not NULL the mapping. With no priorities mapping to this
1566 * offset/count pair it will no longer be used. In the worst case TC0
1567 * is invalid nothing can be done so disable priority mappings. If is
1568 * expected that drivers will fix this mapping if they can before
1569 * calling netif_set_real_num_tx_queues.
1571 static void netif_setup_tc(struct net_device
*dev
, unsigned int txq
)
1574 struct netdev_tc_txq
*tc
= &dev
->tc_to_txq
[0];
1576 /* If TC0 is invalidated disable TC mapping */
1577 if (tc
->offset
+ tc
->count
> txq
) {
1578 pr_warning("Number of in use tx queues changed "
1579 "invalidating tc mappings. Priority "
1580 "traffic classification disabled!\n");
1585 /* Invalidated prio to tc mappings set to TC0 */
1586 for (i
= 1; i
< TC_BITMASK
+ 1; i
++) {
1587 int q
= netdev_get_prio_tc_map(dev
, i
);
1589 tc
= &dev
->tc_to_txq
[q
];
1590 if (tc
->offset
+ tc
->count
> txq
) {
1591 pr_warning("Number of in use tx queues "
1592 "changed. Priority %i to tc "
1593 "mapping %i is no longer valid "
1594 "setting map to 0\n",
1596 netdev_set_prio_tc_map(dev
, i
, 0);
1602 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
1603 * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
1605 int netif_set_real_num_tx_queues(struct net_device
*dev
, unsigned int txq
)
1609 if (txq
< 1 || txq
> dev
->num_tx_queues
)
1612 if (dev
->reg_state
== NETREG_REGISTERED
||
1613 dev
->reg_state
== NETREG_UNREGISTERING
) {
1616 rc
= netdev_queue_update_kobjects(dev
, dev
->real_num_tx_queues
,
1622 netif_setup_tc(dev
, txq
);
1624 if (txq
< dev
->real_num_tx_queues
)
1625 qdisc_reset_all_tx_gt(dev
, txq
);
1628 dev
->real_num_tx_queues
= txq
;
1631 EXPORT_SYMBOL(netif_set_real_num_tx_queues
);
#ifdef CONFIG_RPS
/**
 *	netif_set_real_num_rx_queues - set actual number of RX queues used
 *	@dev: Network device
 *	@rxq: Actual number of RX queues
 *
 *	This must be called either with the rtnl_lock held or before
 *	registration of the net device.  Returns 0 on success, or a
 *	negative error code.  If called before registration, it always
 *	succeeds.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	int rc;

	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
		if (rc)
			return rc;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
1666 static inline void __netif_reschedule(struct Qdisc
*q
)
1668 struct softnet_data
*sd
;
1669 unsigned long flags
;
1671 local_irq_save(flags
);
1672 sd
= &__get_cpu_var(softnet_data
);
1673 q
->next_sched
= NULL
;
1674 *sd
->output_queue_tailp
= q
;
1675 sd
->output_queue_tailp
= &q
->next_sched
;
1676 raise_softirq_irqoff(NET_TX_SOFTIRQ
);
1677 local_irq_restore(flags
);
1680 void __netif_schedule(struct Qdisc
*q
)
1682 if (!test_and_set_bit(__QDISC_STATE_SCHED
, &q
->state
))
1683 __netif_reschedule(q
);
1685 EXPORT_SYMBOL(__netif_schedule
);
1687 void dev_kfree_skb_irq(struct sk_buff
*skb
)
1689 if (atomic_dec_and_test(&skb
->users
)) {
1690 struct softnet_data
*sd
;
1691 unsigned long flags
;
1693 local_irq_save(flags
);
1694 sd
= &__get_cpu_var(softnet_data
);
1695 skb
->next
= sd
->completion_queue
;
1696 sd
->completion_queue
= skb
;
1697 raise_softirq_irqoff(NET_TX_SOFTIRQ
);
1698 local_irq_restore(flags
);
1701 EXPORT_SYMBOL(dev_kfree_skb_irq
);
/*
 * Free @skb from any context: defer through dev_kfree_skb_irq() when
 * in hard-irq context or with interrupts disabled, free directly
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled())
		dev_kfree_skb_irq(skb);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1714 * netif_device_detach - mark device as removed
1715 * @dev: network device
1717 * Mark device as removed from system and therefore no longer available.
1719 void netif_device_detach(struct net_device
*dev
)
1721 if (test_and_clear_bit(__LINK_STATE_PRESENT
, &dev
->state
) &&
1722 netif_running(dev
)) {
1723 netif_tx_stop_all_queues(dev
);
1726 EXPORT_SYMBOL(netif_device_detach
);
1729 * netif_device_attach - mark device as attached
1730 * @dev: network device
1732 * Mark device as attached from system and restart if needed.
1734 void netif_device_attach(struct net_device
*dev
)
1736 if (!test_and_set_bit(__LINK_STATE_PRESENT
, &dev
->state
) &&
1737 netif_running(dev
)) {
1738 netif_tx_wake_all_queues(dev
);
1739 __netdev_watchdog_up(dev
);
1742 EXPORT_SYMBOL(netif_device_attach
);
/**
 * skb_set_dev -- assign a new device to a buffer
 * @skb: buffer for the new device
 * @dev: network device
 *
 * If an skb is owned by a device already, we have to reset
 * all data private to the namespace a device belongs to
 * before assigning it a new device.
 */
#ifdef CONFIG_NET_NS
void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
{
	skb_dst_drop(skb);
	if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
		secpath_reset(skb);
		nf_reset(skb);
		skb_init_secmark(skb);
		skb->mark = 0;
		skb->priority = 0;
		skb->nf_trace = 0;
		skb->ipvs_property = 0;
#ifdef CONFIG_NET_SCHED
		skb->tc_index = 0;
#endif
	}
	skb->dev = dev;
}
EXPORT_SYMBOL(skb_set_dev);
#endif /* CONFIG_NET_NS */
1775 * Invalidate hardware checksum when packet is to be mangled, and
1776 * complete checksum manually on outgoing path.
1778 int skb_checksum_help(struct sk_buff
*skb
)
1781 int ret
= 0, offset
;
1783 if (skb
->ip_summed
== CHECKSUM_COMPLETE
)
1784 goto out_set_summed
;
1786 if (unlikely(skb_shinfo(skb
)->gso_size
)) {
1787 /* Let GSO fix up the checksum. */
1788 goto out_set_summed
;
1791 offset
= skb_checksum_start_offset(skb
);
1792 BUG_ON(offset
>= skb_headlen(skb
));
1793 csum
= skb_checksum(skb
, offset
, skb
->len
- offset
, 0);
1795 offset
+= skb
->csum_offset
;
1796 BUG_ON(offset
+ sizeof(__sum16
) > skb_headlen(skb
));
1798 if (skb_cloned(skb
) &&
1799 !skb_clone_writable(skb
, offset
+ sizeof(__sum16
))) {
1800 ret
= pskb_expand_head(skb
, 0, 0, GFP_ATOMIC
);
1805 *(__sum16
*)(skb
->data
+ offset
) = csum_fold(csum
);
1807 skb
->ip_summed
= CHECKSUM_NONE
;
1811 EXPORT_SYMBOL(skb_checksum_help
);
1814 * skb_gso_segment - Perform segmentation on skb.
1815 * @skb: buffer to segment
1816 * @features: features for the output path (see dev->features)
1818 * This function segments the given skb and returns a list of segments.
1820 * It may return NULL if the skb requires no segmentation. This is
1821 * only possible when GSO is used for verifying header integrity.
1823 struct sk_buff
*skb_gso_segment(struct sk_buff
*skb
, u32 features
)
1825 struct sk_buff
*segs
= ERR_PTR(-EPROTONOSUPPORT
);
1826 struct packet_type
*ptype
;
1827 __be16 type
= skb
->protocol
;
1828 int vlan_depth
= ETH_HLEN
;
1831 while (type
== htons(ETH_P_8021Q
)) {
1832 struct vlan_hdr
*vh
;
1834 if (unlikely(!pskb_may_pull(skb
, vlan_depth
+ VLAN_HLEN
)))
1835 return ERR_PTR(-EINVAL
);
1837 vh
= (struct vlan_hdr
*)(skb
->data
+ vlan_depth
);
1838 type
= vh
->h_vlan_encapsulated_proto
;
1839 vlan_depth
+= VLAN_HLEN
;
1842 skb_reset_mac_header(skb
);
1843 skb
->mac_len
= skb
->network_header
- skb
->mac_header
;
1844 __skb_pull(skb
, skb
->mac_len
);
1846 if (unlikely(skb
->ip_summed
!= CHECKSUM_PARTIAL
)) {
1847 struct net_device
*dev
= skb
->dev
;
1848 struct ethtool_drvinfo info
= {};
1850 if (dev
&& dev
->ethtool_ops
&& dev
->ethtool_ops
->get_drvinfo
)
1851 dev
->ethtool_ops
->get_drvinfo(dev
, &info
);
1853 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
1854 info
.driver
, dev
? dev
->features
: 0L,
1855 skb
->sk
? skb
->sk
->sk_route_caps
: 0L,
1856 skb
->len
, skb
->data_len
, skb
->ip_summed
);
1858 if (skb_header_cloned(skb
) &&
1859 (err
= pskb_expand_head(skb
, 0, 0, GFP_ATOMIC
)))
1860 return ERR_PTR(err
);
1864 list_for_each_entry_rcu(ptype
,
1865 &ptype_base
[ntohs(type
) & PTYPE_HASH_MASK
], list
) {
1866 if (ptype
->type
== type
&& !ptype
->dev
&& ptype
->gso_segment
) {
1867 if (unlikely(skb
->ip_summed
!= CHECKSUM_PARTIAL
)) {
1868 err
= ptype
->gso_send_check(skb
);
1869 segs
= ERR_PTR(err
);
1870 if (err
|| skb_gso_ok(skb
, features
))
1872 __skb_push(skb
, (skb
->data
-
1873 skb_network_header(skb
)));
1875 segs
= ptype
->gso_segment(skb
, features
);
1881 __skb_push(skb
, skb
->data
- skb_mac_header(skb
));
1885 EXPORT_SYMBOL(skb_gso_segment
);
/* Take action when hardware reception checksum errors are detected. */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (net_ratelimit()) {
		printk(KERN_ERR "%s: hw csum failure.\n",
			dev ? dev->name : "<unknown>");
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if some page fragment of @skb cannot be reached by DMA
 * from @dev, 0 otherwise (always 0 without CONFIG_HIGHMEM).
 */

static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
			if (PageHighMem(skb_shinfo(skb)->frags[i].page))
				return 1;
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}
/*
 * Per-skb state kept in skb->cb while a GSO segment list hangs off the
 * buffer: the destructor the skb had before it was replaced.
 */
struct dev_gso_cb {
	void (*destructor)(struct sk_buff *skb);
};

#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1936 static void dev_gso_skb_destructor(struct sk_buff
*skb
)
1938 struct dev_gso_cb
*cb
;
1941 struct sk_buff
*nskb
= skb
->next
;
1943 skb
->next
= nskb
->next
;
1946 } while (skb
->next
);
1948 cb
= DEV_GSO_CB(skb
);
1950 cb
->destructor(skb
);
1954 * dev_gso_segment - Perform emulated hardware segmentation on skb.
1955 * @skb: buffer to segment
1956 * @features: device features as applicable to this skb
1958 * This function segments the given skb and stores the list of segments
1961 static int dev_gso_segment(struct sk_buff
*skb
, int features
)
1963 struct sk_buff
*segs
;
1965 segs
= skb_gso_segment(skb
, features
);
1967 /* Verifying header integrity only. */
1972 return PTR_ERR(segs
);
1975 DEV_GSO_CB(skb
)->destructor
= skb
->destructor
;
1976 skb
->destructor
= dev_gso_skb_destructor
;
1982 * Try to orphan skb early, right before transmission by the device.
1983 * We cannot orphan skb if tx timestamp is requested or the sk-reference
1984 * is needed on driver level for other reasons, e.g. see net/can/raw.c
1986 static inline void skb_orphan_try(struct sk_buff
*skb
)
1988 struct sock
*sk
= skb
->sk
;
1990 if (sk
&& !skb_shinfo(skb
)->tx_flags
) {
1991 /* skb_tx_hash() wont be able to get sk.
1992 * We copy sk_hash into skb->rxhash
1995 skb
->rxhash
= sk
->sk_hash
;
2000 static bool can_checksum_protocol(unsigned long features
, __be16 protocol
)
2002 return ((features
& NETIF_F_GEN_CSUM
) ||
2003 ((features
& NETIF_F_V4_CSUM
) &&
2004 protocol
== htons(ETH_P_IP
)) ||
2005 ((features
& NETIF_F_V6_CSUM
) &&
2006 protocol
== htons(ETH_P_IPV6
)) ||
2007 ((features
& NETIF_F_FCOE_CRC
) &&
2008 protocol
== htons(ETH_P_FCOE
)));
2011 static u32
harmonize_features(struct sk_buff
*skb
, __be16 protocol
, u32 features
)
2013 if (!can_checksum_protocol(features
, protocol
)) {
2014 features
&= ~NETIF_F_ALL_CSUM
;
2015 features
&= ~NETIF_F_SG
;
2016 } else if (illegal_highdma(skb
->dev
, skb
)) {
2017 features
&= ~NETIF_F_SG
;
2023 u32
netif_skb_features(struct sk_buff
*skb
)
2025 __be16 protocol
= skb
->protocol
;
2026 u32 features
= skb
->dev
->features
;
2028 if (protocol
== htons(ETH_P_8021Q
)) {
2029 struct vlan_ethhdr
*veh
= (struct vlan_ethhdr
*)skb
->data
;
2030 protocol
= veh
->h_vlan_encapsulated_proto
;
2031 } else if (!vlan_tx_tag_present(skb
)) {
2032 return harmonize_features(skb
, protocol
, features
);
2035 features
&= (skb
->dev
->vlan_features
| NETIF_F_HW_VLAN_TX
);
2037 if (protocol
!= htons(ETH_P_8021Q
)) {
2038 return harmonize_features(skb
, protocol
, features
);
2040 features
&= NETIF_F_SG
| NETIF_F_HIGHDMA
| NETIF_F_FRAGLIST
|
2041 NETIF_F_GEN_CSUM
| NETIF_F_HW_VLAN_TX
;
2042 return harmonize_features(skb
, protocol
, features
);
2045 EXPORT_SYMBOL(netif_skb_features
);
2048 * Returns true if either:
2049 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2050 * 2. skb is fragmented and the device does not support SG, or if
2051 * at least one of fragments is in highmem and device does not
2052 * support DMA from it.
2054 static inline int skb_needs_linearize(struct sk_buff
*skb
,
2057 return skb_is_nonlinear(skb
) &&
2058 ((skb_has_frag_list(skb
) &&
2059 !(features
& NETIF_F_FRAGLIST
)) ||
2060 (skb_shinfo(skb
)->nr_frags
&&
2061 !(features
& NETIF_F_SG
)));
2064 int dev_hard_start_xmit(struct sk_buff
*skb
, struct net_device
*dev
,
2065 struct netdev_queue
*txq
)
2067 const struct net_device_ops
*ops
= dev
->netdev_ops
;
2068 int rc
= NETDEV_TX_OK
;
2070 if (likely(!skb
->next
)) {
2074 * If device doesnt need skb->dst, release it right now while
2075 * its hot in this cpu cache
2077 if (dev
->priv_flags
& IFF_XMIT_DST_RELEASE
)
2080 if (!list_empty(&ptype_all
))
2081 dev_queue_xmit_nit(skb
, dev
);
2083 skb_orphan_try(skb
);
2085 features
= netif_skb_features(skb
);
2087 if (vlan_tx_tag_present(skb
) &&
2088 !(features
& NETIF_F_HW_VLAN_TX
)) {
2089 skb
= __vlan_put_tag(skb
, vlan_tx_tag_get(skb
));
2096 if (netif_needs_gso(skb
, features
)) {
2097 if (unlikely(dev_gso_segment(skb
, features
)))
2102 if (skb_needs_linearize(skb
, features
) &&
2103 __skb_linearize(skb
))
2106 /* If packet is not checksummed and device does not
2107 * support checksumming for this protocol, complete
2108 * checksumming here.
2110 if (skb
->ip_summed
== CHECKSUM_PARTIAL
) {
2111 skb_set_transport_header(skb
,
2112 skb_checksum_start_offset(skb
));
2113 if (!(features
& NETIF_F_ALL_CSUM
) &&
2114 skb_checksum_help(skb
))
2119 rc
= ops
->ndo_start_xmit(skb
, dev
);
2120 trace_net_dev_xmit(skb
, rc
);
2121 if (rc
== NETDEV_TX_OK
)
2122 txq_trans_update(txq
);
2128 struct sk_buff
*nskb
= skb
->next
;
2130 skb
->next
= nskb
->next
;
2134 * If device doesnt need nskb->dst, release it right now while
2135 * its hot in this cpu cache
2137 if (dev
->priv_flags
& IFF_XMIT_DST_RELEASE
)
2140 rc
= ops
->ndo_start_xmit(nskb
, dev
);
2141 trace_net_dev_xmit(nskb
, rc
);
2142 if (unlikely(rc
!= NETDEV_TX_OK
)) {
2143 if (rc
& ~NETDEV_TX_MASK
)
2144 goto out_kfree_gso_skb
;
2145 nskb
->next
= skb
->next
;
2149 txq_trans_update(txq
);
2150 if (unlikely(netif_tx_queue_stopped(txq
) && skb
->next
))
2151 return NETDEV_TX_BUSY
;
2152 } while (skb
->next
);
2155 if (likely(skb
->next
== NULL
))
2156 skb
->destructor
= DEV_GSO_CB(skb
)->destructor
;
2163 static u32 hashrnd __read_mostly
;
2166 * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2167 * to be used as a distribution range.
2169 u16
__skb_tx_hash(const struct net_device
*dev
, const struct sk_buff
*skb
,
2170 unsigned int num_tx_queues
)
2174 u16 qcount
= num_tx_queues
;
2176 if (skb_rx_queue_recorded(skb
)) {
2177 hash
= skb_get_rx_queue(skb
);
2178 while (unlikely(hash
>= num_tx_queues
))
2179 hash
-= num_tx_queues
;
2184 u8 tc
= netdev_get_prio_tc_map(dev
, skb
->priority
);
2185 qoffset
= dev
->tc_to_txq
[tc
].offset
;
2186 qcount
= dev
->tc_to_txq
[tc
].count
;
2189 if (skb
->sk
&& skb
->sk
->sk_hash
)
2190 hash
= skb
->sk
->sk_hash
;
2192 hash
= (__force u16
) skb
->protocol
^ skb
->rxhash
;
2193 hash
= jhash_1word(hash
, hashrnd
);
2195 return (u16
) (((u64
) hash
* qcount
) >> 32) + qoffset
;
2197 EXPORT_SYMBOL(__skb_tx_hash
);
2199 static inline u16
dev_cap_txqueue(struct net_device
*dev
, u16 queue_index
)
2201 if (unlikely(queue_index
>= dev
->real_num_tx_queues
)) {
2202 if (net_ratelimit()) {
2203 pr_warning("%s selects TX queue %d, but "
2204 "real number of TX queues is %d\n",
2205 dev
->name
, queue_index
, dev
->real_num_tx_queues
);
/*
 * Pick a tx queue from the XPS map of the current CPU.
 * Returns the queue index, or -1 when XPS is not configured, no map
 * exists for this CPU, or the mapped queue is out of range.
 */
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();
	dev_maps = rcu_dereference(dev->xps_maps);
	if (dev_maps) {
		map = rcu_dereference(
		    dev_maps->cpu_map[raw_smp_processor_id()]);
		if (map) {
			if (map->len == 1)
				queue_index = map->queues[0];
			else {
				u32 hash;
				if (skb->sk && skb->sk->sk_hash)
					hash = skb->sk->sk_hash;
				else
					hash = (__force u16) skb->protocol ^
					    skb->rxhash;
				hash = jhash_1word(hash, hashrnd);
				/* Scale the hash into [0, map->len). */
				queue_index = map->queues[
				    ((u64)hash * map->len) >> 32];
			}
			if (unlikely(queue_index >= dev->real_num_tx_queues))
				queue_index = -1;
		}
	}
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}
2250 static struct netdev_queue
*dev_pick_tx(struct net_device
*dev
,
2251 struct sk_buff
*skb
)
2254 const struct net_device_ops
*ops
= dev
->netdev_ops
;
2256 if (dev
->real_num_tx_queues
== 1)
2258 else if (ops
->ndo_select_queue
) {
2259 queue_index
= ops
->ndo_select_queue(dev
, skb
);
2260 queue_index
= dev_cap_txqueue(dev
, queue_index
);
2262 struct sock
*sk
= skb
->sk
;
2263 queue_index
= sk_tx_queue_get(sk
);
2265 if (queue_index
< 0 || skb
->ooo_okay
||
2266 queue_index
>= dev
->real_num_tx_queues
) {
2267 int old_index
= queue_index
;
2269 queue_index
= get_xps_queue(dev
, skb
);
2270 if (queue_index
< 0)
2271 queue_index
= skb_tx_hash(dev
, skb
);
2273 if (queue_index
!= old_index
&& sk
) {
2274 struct dst_entry
*dst
=
2275 rcu_dereference_check(sk
->sk_dst_cache
, 1);
2277 if (dst
&& skb_dst(skb
) == dst
)
2278 sk_tx_queue_set(sk
, queue_index
);
2283 skb_set_queue_mapping(skb
, queue_index
);
2284 return netdev_get_tx_queue(dev
, queue_index
);
2287 static inline int __dev_xmit_skb(struct sk_buff
*skb
, struct Qdisc
*q
,
2288 struct net_device
*dev
,
2289 struct netdev_queue
*txq
)
2291 spinlock_t
*root_lock
= qdisc_lock(q
);
2295 qdisc_skb_cb(skb
)->pkt_len
= skb
->len
;
2296 qdisc_calculate_pkt_len(skb
, q
);
2298 * Heuristic to force contended enqueues to serialize on a
2299 * separate lock before trying to get qdisc main lock.
2300 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2301 * and dequeue packets faster.
2303 contended
= qdisc_is_running(q
);
2304 if (unlikely(contended
))
2305 spin_lock(&q
->busylock
);
2307 spin_lock(root_lock
);
2308 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED
, &q
->state
))) {
2311 } else if ((q
->flags
& TCQ_F_CAN_BYPASS
) && !qdisc_qlen(q
) &&
2312 qdisc_run_begin(q
)) {
2314 * This is a work-conserving queue; there are no old skbs
2315 * waiting to be sent out; and the qdisc is not running -
2316 * xmit the skb directly.
2318 if (!(dev
->priv_flags
& IFF_XMIT_DST_RELEASE
))
2321 qdisc_bstats_update(q
, skb
);
2323 if (sch_direct_xmit(skb
, q
, dev
, txq
, root_lock
)) {
2324 if (unlikely(contended
)) {
2325 spin_unlock(&q
->busylock
);
2332 rc
= NET_XMIT_SUCCESS
;
2335 rc
= q
->enqueue(skb
, q
) & NET_XMIT_MASK
;
2336 if (qdisc_run_begin(q
)) {
2337 if (unlikely(contended
)) {
2338 spin_unlock(&q
->busylock
);
2344 spin_unlock(root_lock
);
2345 if (unlikely(contended
))
2346 spin_unlock(&q
->busylock
);
2350 static DEFINE_PER_CPU(int, xmit_recursion
);
2351 #define RECURSION_LIMIT 10
2354 * dev_queue_xmit - transmit a buffer
2355 * @skb: buffer to transmit
2357 * Queue a buffer for transmission to a network device. The caller must
2358 * have set the device and priority and built the buffer before calling
2359 * this function. The function can be called from an interrupt.
2361 * A negative errno code is returned on a failure. A success does not
2362 * guarantee the frame will be transmitted as it may be dropped due
2363 * to congestion or traffic shaping.
2365 * -----------------------------------------------------------------------------------
2366 * I notice this method can also return errors from the queue disciplines,
2367 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2370 * Regardless of the return value, the skb is consumed, so it is currently
2371 * difficult to retry a send to this method. (You can bump the ref count
2372 * before sending to hold a reference for retry if you are careful.)
2374 * When calling this method, interrupts MUST be enabled. This is because
2375 * the BH enable code must have IRQs enabled so that it will not deadlock.
2378 int dev_queue_xmit(struct sk_buff
*skb
)
2380 struct net_device
*dev
= skb
->dev
;
2381 struct netdev_queue
*txq
;
2385 /* Disable soft irqs for various locks below. Also
2386 * stops preemption for RCU.
2390 txq
= dev_pick_tx(dev
, skb
);
2391 q
= rcu_dereference_bh(txq
->qdisc
);
2393 #ifdef CONFIG_NET_CLS_ACT
2394 skb
->tc_verd
= SET_TC_AT(skb
->tc_verd
, AT_EGRESS
);
2396 trace_net_dev_queue(skb
);
2398 rc
= __dev_xmit_skb(skb
, q
, dev
, txq
);
2402 /* The device has no queue. Common case for software devices:
2403 loopback, all the sorts of tunnels...
2405 Really, it is unlikely that netif_tx_lock protection is necessary
2406 here. (f.e. loopback and IP tunnels are clean ignoring statistics
2408 However, it is possible, that they rely on protection
2411 Check this and shot the lock. It is not prone from deadlocks.
2412 Either shot noqueue qdisc, it is even simpler 8)
2414 if (dev
->flags
& IFF_UP
) {
2415 int cpu
= smp_processor_id(); /* ok because BHs are off */
2417 if (txq
->xmit_lock_owner
!= cpu
) {
2419 if (__this_cpu_read(xmit_recursion
) > RECURSION_LIMIT
)
2420 goto recursion_alert
;
2422 HARD_TX_LOCK(dev
, txq
, cpu
);
2424 if (!netif_tx_queue_stopped(txq
)) {
2425 __this_cpu_inc(xmit_recursion
);
2426 rc
= dev_hard_start_xmit(skb
, dev
, txq
);
2427 __this_cpu_dec(xmit_recursion
);
2428 if (dev_xmit_complete(rc
)) {
2429 HARD_TX_UNLOCK(dev
, txq
);
2433 HARD_TX_UNLOCK(dev
, txq
);
2434 if (net_ratelimit())
2435 printk(KERN_CRIT
"Virtual device %s asks to "
2436 "queue packet!\n", dev
->name
);
2438 /* Recursion is detected! It is possible,
2442 if (net_ratelimit())
2443 printk(KERN_CRIT
"Dead loop on virtual device "
2444 "%s, fix it urgently!\n", dev
->name
);
2449 rcu_read_unlock_bh();
2454 rcu_read_unlock_bh();
2457 EXPORT_SYMBOL(dev_queue_xmit
);
2460 /*=======================================================================
2462 =======================================================================*/
2464 int netdev_max_backlog __read_mostly
= 1000;
2465 int netdev_tstamp_prequeue __read_mostly
= 1;
2466 int netdev_budget __read_mostly
= 300;
2467 int weight_p __read_mostly
= 64; /* old backlog weight */
2469 /* Called with irq disabled */
2470 static inline void ____napi_schedule(struct softnet_data
*sd
,
2471 struct napi_struct
*napi
)
2473 list_add_tail(&napi
->poll_list
, &sd
->poll_list
);
2474 __raise_softirq_irqoff(NET_RX_SOFTIRQ
);
2478 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
2479 * and src/dst port numbers. Returns a non-zero hash number on success
2482 __u32
__skb_get_rxhash(struct sk_buff
*skb
)
2484 int nhoff
, hash
= 0, poff
;
2485 struct ipv6hdr
*ip6
;
2488 u32 addr1
, addr2
, ihl
;
2494 nhoff
= skb_network_offset(skb
);
2496 switch (skb
->protocol
) {
2497 case __constant_htons(ETH_P_IP
):
2498 if (!pskb_may_pull(skb
, sizeof(*ip
) + nhoff
))
2501 ip
= (struct iphdr
*) (skb
->data
+ nhoff
);
2502 if (ip
->frag_off
& htons(IP_MF
| IP_OFFSET
))
2505 ip_proto
= ip
->protocol
;
2506 addr1
= (__force u32
) ip
->saddr
;
2507 addr2
= (__force u32
) ip
->daddr
;
2510 case __constant_htons(ETH_P_IPV6
):
2511 if (!pskb_may_pull(skb
, sizeof(*ip6
) + nhoff
))
2514 ip6
= (struct ipv6hdr
*) (skb
->data
+ nhoff
);
2515 ip_proto
= ip6
->nexthdr
;
2516 addr1
= (__force u32
) ip6
->saddr
.s6_addr32
[3];
2517 addr2
= (__force u32
) ip6
->daddr
.s6_addr32
[3];
2525 poff
= proto_ports_offset(ip_proto
);
2527 nhoff
+= ihl
* 4 + poff
;
2528 if (pskb_may_pull(skb
, nhoff
+ 4)) {
2529 ports
.v32
= * (__force u32
*) (skb
->data
+ nhoff
);
2530 if (ports
.v16
[1] < ports
.v16
[0])
2531 swap(ports
.v16
[0], ports
.v16
[1]);
2535 /* get a consistent hash (same value on both flow directions) */
2539 hash
= jhash_3words(addr1
, addr2
, ports
.v32
, hashrnd
);
2546 EXPORT_SYMBOL(__skb_get_rxhash
);
2550 /* One global table that all flow-based protocols share. */
2551 struct rps_sock_flow_table __rcu
*rps_sock_flow_table __read_mostly
;
2552 EXPORT_SYMBOL(rps_sock_flow_table
);
2554 static struct rps_dev_flow
*
2555 set_rps_cpu(struct net_device
*dev
, struct sk_buff
*skb
,
2556 struct rps_dev_flow
*rflow
, u16 next_cpu
)
2560 tcpu
= rflow
->cpu
= next_cpu
;
2561 if (tcpu
!= RPS_NO_CPU
) {
2562 #ifdef CONFIG_RFS_ACCEL
2563 struct netdev_rx_queue
*rxqueue
;
2564 struct rps_dev_flow_table
*flow_table
;
2565 struct rps_dev_flow
*old_rflow
;
2570 /* Should we steer this flow to a different hardware queue? */
2571 if (!skb_rx_queue_recorded(skb
) || !dev
->rx_cpu_rmap
||
2572 !(dev
->features
& NETIF_F_NTUPLE
))
2574 rxq_index
= cpu_rmap_lookup_index(dev
->rx_cpu_rmap
, next_cpu
);
2575 if (rxq_index
== skb_get_rx_queue(skb
))
2578 rxqueue
= dev
->_rx
+ rxq_index
;
2579 flow_table
= rcu_dereference(rxqueue
->rps_flow_table
);
2582 flow_id
= skb
->rxhash
& flow_table
->mask
;
2583 rc
= dev
->netdev_ops
->ndo_rx_flow_steer(dev
, skb
,
2584 rxq_index
, flow_id
);
2588 rflow
= &flow_table
->flows
[flow_id
];
2589 rflow
->cpu
= next_cpu
;
2591 if (old_rflow
->filter
== rflow
->filter
)
2592 old_rflow
->filter
= RPS_NO_FILTER
;
2596 per_cpu(softnet_data
, tcpu
).input_queue_head
;
2603 * get_rps_cpu is called from netif_receive_skb and returns the target
2604 * CPU from the RPS map of the receiving queue for a given skb.
2605 * rcu_read_lock must be held on entry.
2607 static int get_rps_cpu(struct net_device
*dev
, struct sk_buff
*skb
,
2608 struct rps_dev_flow
**rflowp
)
2610 struct netdev_rx_queue
*rxqueue
;
2611 struct rps_map
*map
;
2612 struct rps_dev_flow_table
*flow_table
;
2613 struct rps_sock_flow_table
*sock_flow_table
;
2617 if (skb_rx_queue_recorded(skb
)) {
2618 u16 index
= skb_get_rx_queue(skb
);
2619 if (unlikely(index
>= dev
->real_num_rx_queues
)) {
2620 WARN_ONCE(dev
->real_num_rx_queues
> 1,
2621 "%s received packet on queue %u, but number "
2622 "of RX queues is %u\n",
2623 dev
->name
, index
, dev
->real_num_rx_queues
);
2626 rxqueue
= dev
->_rx
+ index
;
2630 map
= rcu_dereference(rxqueue
->rps_map
);
2632 if (map
->len
== 1 &&
2633 !rcu_dereference_raw(rxqueue
->rps_flow_table
)) {
2634 tcpu
= map
->cpus
[0];
2635 if (cpu_online(tcpu
))
2639 } else if (!rcu_dereference_raw(rxqueue
->rps_flow_table
)) {
2643 skb_reset_network_header(skb
);
2644 if (!skb_get_rxhash(skb
))
2647 flow_table
= rcu_dereference(rxqueue
->rps_flow_table
);
2648 sock_flow_table
= rcu_dereference(rps_sock_flow_table
);
2649 if (flow_table
&& sock_flow_table
) {
2651 struct rps_dev_flow
*rflow
;
2653 rflow
= &flow_table
->flows
[skb
->rxhash
& flow_table
->mask
];
2656 next_cpu
= sock_flow_table
->ents
[skb
->rxhash
&
2657 sock_flow_table
->mask
];
2660 * If the desired CPU (where last recvmsg was done) is
2661 * different from current CPU (one in the rx-queue flow
2662 * table entry), switch if one of the following holds:
2663 * - Current CPU is unset (equal to RPS_NO_CPU).
2664 * - Current CPU is offline.
2665 * - The current CPU's queue tail has advanced beyond the
2666 * last packet that was enqueued using this table entry.
2667 * This guarantees that all previous packets for the flow
2668 * have been dequeued, thus preserving in order delivery.
2670 if (unlikely(tcpu
!= next_cpu
) &&
2671 (tcpu
== RPS_NO_CPU
|| !cpu_online(tcpu
) ||
2672 ((int)(per_cpu(softnet_data
, tcpu
).input_queue_head
-
2673 rflow
->last_qtail
)) >= 0))
2674 rflow
= set_rps_cpu(dev
, skb
, rflow
, next_cpu
);
2676 if (tcpu
!= RPS_NO_CPU
&& cpu_online(tcpu
)) {
2684 tcpu
= map
->cpus
[((u64
) skb
->rxhash
* map
->len
) >> 32];
2686 if (cpu_online(tcpu
)) {
2696 #ifdef CONFIG_RFS_ACCEL
2699 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2700 * @dev: Device on which the filter was set
2701 * @rxq_index: RX queue index
2702 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2703 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2705 * Drivers that implement ndo_rx_flow_steer() should periodically call
2706 * this function for each installed filter and remove the filters for
2707 * which it returns %true.
/*
 * Looks up entry @flow_id in the rps_flow_table of RX queue @rxq_index.
 * The lookup is done only when the table exists and @flow_id does not
 * exceed its mask. The entry's filter is compared with @filter_id, and
 * the signed distance between the owning CPU's input_queue_head and the
 * entry's last_qtail is compared against 10 * mask.
 * NOTE(review): the statements that set the boolean result are not shown
 * here - confirm which outcome of the comparison means "may expire".
 */
2709 bool rps_may_expire_flow(struct net_device
*dev
, u16 rxq_index
,
2710 u32 flow_id
, u16 filter_id
)
2712 struct netdev_rx_queue
*rxqueue
= dev
->_rx
+ rxq_index
;
2713 struct rps_dev_flow_table
*flow_table
;
2714 struct rps_dev_flow
*rflow
;
2719 flow_table
= rcu_dereference(rxqueue
->rps_flow_table
);
2720 if (flow_table
&& flow_id
<= flow_table
->mask
) {
2721 rflow
= &flow_table
->flows
[flow_id
];
/* Read the flow's CPU once so the checks below all use the same value. */
2722 cpu
= ACCESS_ONCE(rflow
->cpu
);
2723 if (rflow
->filter
== filter_id
&& cpu
!= RPS_NO_CPU
&&
2724 ((int)(per_cpu(softnet_data
, cpu
).input_queue_head
-
2725 rflow
->last_qtail
) <
2726 (int)(10 * flow_table
->mask
)))
2732 EXPORT_SYMBOL(rps_may_expire_flow
);
2734 #endif /* CONFIG_RFS_ACCEL */
2736 /* Called from hardirq (IPI) context */
/* @data is used as the struct softnet_data whose own backlog NAPI instance is scheduled. */
2737 static void rps_trigger_softirq(void *data
)
2739 struct softnet_data
*sd
= data
;
2741 ____napi_schedule(sd
, &sd
->backlog
);
2745 #endif /* CONFIG_RPS */
2748 * Check if this softnet_data structure belongs to another cpu.
2749 * If yes, queue it to our IPI list and return 1
/*
 * @sd: softnet_data to be linked onto the executing CPU's rps_ipi_list.
 * NOTE(review): the test that decides whether @sd belongs to another CPU,
 * and the return statements, are not shown here - confirm.
 */
2752 static int rps_ipi_queued(struct softnet_data
*sd
)
/* mysd is the softnet_data of the CPU executing this function. */
2755 struct softnet_data
*mysd
= &__get_cpu_var(softnet_data
);
/* Push sd onto the head of mysd's singly linked rps_ipi_list. */
2758 sd
->rps_ipi_next
= mysd
->rps_ipi_list
;
2759 mysd
->rps_ipi_list
= sd
;
/* Raise NET_RX_SOFTIRQ; presumably interrupts are already off here - TODO confirm against callers. */
2761 __raise_softirq_irqoff(NET_RX_SOFTIRQ
);
2764 #endif /* CONFIG_RPS */
2769 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2770 * queue (may be a remote CPU queue).
/*
 * @skb:   packet to queue
 * @cpu:   CPU whose softnet_data receives the packet
 * @qtail: passed to input_queue_tail_incr_save() when the packet is queued
 *
 * Runs with local interrupts disabled between local_irq_save() and
 * local_irq_restore().
 *  - The packet is accepted only while input_pkt_queue holds at most
 *    netdev_max_backlog packets.
 *  - If the queue is already non-empty, the skb is appended, the tail
 *    position is recorded through @qtail and NET_RX_SUCCESS is returned.
 *  - Otherwise NAPI_STATE_SCHED is set on the backlog NAPI; if the bit was
 *    previously clear, the backlog is scheduled via ____napi_schedule()
 *    unless rps_ipi_queued() returns non-zero.
 *  - On the remaining path interrupts are restored and the device's
 *    rx_dropped counter is incremented.
 * NOTE(review): how the empty-queue path goes on to queue the skb, and the
 * drop path's return value and skb release, are not shown here - confirm.
 */
2772 static int enqueue_to_backlog(struct sk_buff
*skb
, int cpu
,
2773 unsigned int *qtail
)
2775 struct softnet_data
*sd
;
2776 unsigned long flags
;
2778 sd
= &per_cpu(softnet_data
, cpu
);
2780 local_irq_save(flags
);
2783 if (skb_queue_len(&sd
->input_pkt_queue
) <= netdev_max_backlog
) {
2784 if (skb_queue_len(&sd
->input_pkt_queue
)) {
2786 __skb_queue_tail(&sd
->input_pkt_queue
, skb
);
2787 input_queue_tail_incr_save(sd
, qtail
);
2789 local_irq_restore(flags
);
2790 return NET_RX_SUCCESS
;
2793 /* Schedule NAPI for backlog device
2794 * We can use non atomic operation since we own the queue lock
2796 if (!__test_and_set_bit(NAPI_STATE_SCHED
, &sd
->backlog
.state
)) {
2797 if (!rps_ipi_queued(sd
))
2798 ____napi_schedule(sd
, &sd
->backlog
);
2806 local_irq_restore(flags
);
2808 atomic_long_inc(&skb
->dev
->rx_dropped
);
2814 * netif_rx - post buffer to the network code
2815 * @skb: buffer to post
2817 * This function receives a packet from a device driver and queues it for
2818 * the upper (protocol) levels to process. It always succeeds. The buffer
2819 * may be dropped during processing for congestion control or by the
2823 * NET_RX_SUCCESS (no congestion)
2824 * NET_RX_DROP (packet was dropped)
2828 int netif_rx(struct sk_buff
*skb
)
2832 /* if netpoll wants it, pretend we never saw it */
2833 if (netpoll_rx(skb
))
2836 if (netdev_tstamp_prequeue
)
2837 net_timestamp_check(skb
);
2839 trace_netif_rx(skb
);
2842 struct rps_dev_flow voidflow
, *rflow
= &voidflow
;
2848 cpu
= get_rps_cpu(skb
->dev
, skb
, &rflow
);
2850 cpu
= smp_processor_id();
2852 ret
= enqueue_to_backlog(skb
, cpu
, &rflow
->last_qtail
);
2860 ret
= enqueue_to_backlog(skb
, get_cpu(), &qtail
);
2866 EXPORT_SYMBOL(netif_rx
);
/*
 * Hands @skb to netif_rx() and stores its result in err, then checks
 * local_softirq_pending().
 * NOTE(review): the action taken when softirqs are pending and the return
 * statement are not shown here - presumably err is returned; confirm.
 */
2868 int netif_rx_ni(struct sk_buff
*skb
)
2873 err
= netif_rx(skb
);
2874 if (local_softirq_pending())
2880 EXPORT_SYMBOL(netif_rx_ni
);
2882 static void net_tx_action(struct softirq_action
*h
)
2884 struct softnet_data
*sd
= &__get_cpu_var(softnet_data
);
2886 if (sd
->completion_queue
) {
2887 struct sk_buff
*clist
;
2889 local_irq_disable();
2890 clist
= sd
->completion_queue
;
2891 sd
->completion_queue
= NULL
;
2895 struct sk_buff
*skb
= clist
;
2896 clist
= clist
->next
;
2898 WARN_ON(atomic_read(&skb
->users
));
2899 trace_kfree_skb(skb
, net_tx_action
);
2904 if (sd
->output_queue
) {
2907 local_irq_disable();
2908 head
= sd
->output_queue
;
2909 sd
->output_queue
= NULL
;
2910 sd
->output_queue_tailp
= &sd
->output_queue
;
2914 struct Qdisc
*q
= head
;
2915 spinlock_t
*root_lock
;
2917 head
= head
->next_sched
;
2919 root_lock
= qdisc_lock(q
);
2920 if (spin_trylock(root_lock
)) {
2921 smp_mb__before_clear_bit();
2922 clear_bit(__QDISC_STATE_SCHED
,
2925 spin_unlock(root_lock
);
2927 if (!test_bit(__QDISC_STATE_DEACTIVATED
,
2929 __netif_reschedule(q
);
2931 smp_mb__before_clear_bit();
2932 clear_bit(__QDISC_STATE_SCHED
,
2940 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
2941 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
2942 /* This hook is defined here for ATM LANE */
2943 int (*br_fdb_test_addr_hook
)(struct net_device
*dev
,
2944 unsigned char *addr
) __read_mostly
;
2945 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook
);
2948 #ifdef CONFIG_NET_CLS_ACT
2949 /* TODO: Maybe we should just force sch_ingress to be compiled in
2950 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2951 * a compare and 2 stores extra right now if we don't have it on
2952 * but have CONFIG_NET_CLS_ACT
2953 * NOTE: This doesn't stop any functionality; if you don't have
2954 * the ingress scheduler, you just can't add policies on ingress.
2957 static int ing_filter(struct sk_buff
*skb
, struct netdev_queue
*rxq
)
2959 struct net_device
*dev
= skb
->dev
;
2960 u32 ttl
= G_TC_RTTL(skb
->tc_verd
);
2961 int result
= TC_ACT_OK
;
2964 if (unlikely(MAX_RED_LOOP
< ttl
++)) {
2965 if (net_ratelimit())
2966 pr_warning( "Redir loop detected Dropping packet (%d->%d)\n",
2967 skb
->skb_iif
, dev
->ifindex
);
2971 skb
->tc_verd
= SET_TC_RTTL(skb
->tc_verd
, ttl
);
2972 skb
->tc_verd
= SET_TC_AT(skb
->tc_verd
, AT_INGRESS
);
2975 if (q
!= &noop_qdisc
) {
2976 spin_lock(qdisc_lock(q
));
2977 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED
, &q
->state
)))
2978 result
= qdisc_enqueue_root(skb
, q
);
2979 spin_unlock(qdisc_lock(q
));
2985 static inline struct sk_buff
*handle_ing(struct sk_buff
*skb
,
2986 struct packet_type
**pt_prev
,
2987 int *ret
, struct net_device
*orig_dev
)
2989 struct netdev_queue
*rxq
= rcu_dereference(skb
->dev
->ingress_queue
);
2991 if (!rxq
|| rxq
->qdisc
== &noop_qdisc
)
2995 *ret
= deliver_skb(skb
, *pt_prev
, orig_dev
);
2999 switch (ing_filter(skb
, rxq
)) {
3013 * netdev_rx_handler_register - register receive handler
3014 * @dev: device to register a handler for
3015 * @rx_handler: receive handler to register
3016 * @rx_handler_data: data pointer that is used by rx handler
3018 * Register a receive handler for a device. This handler will then be
3019 * called from __netif_receive_skb. A negative errno code is returned
3022 * The caller must hold the rtnl_mutex.
3024 * For a general description of rx_handler, see enum rx_handler_result.
/*
 * Installs @rx_handler and @rx_handler_data on @dev with
 * rcu_assign_pointer().
 * NOTE(review): the return statements are not shown here - confirm the
 * value returned when dev->rx_handler is already set.
 */
3026 int netdev_rx_handler_register(struct net_device
*dev
,
3027 rx_handler_func_t
*rx_handler
,
3028 void *rx_handler_data
)
/* A handler is already installed on this device. */
3032 if (dev
->rx_handler
)
/* Publish the data pointer before the handler that will use it. */
3035 rcu_assign_pointer(dev
->rx_handler_data
, rx_handler_data
);
3036 rcu_assign_pointer(dev
->rx_handler
, rx_handler
);
3040 EXPORT_SYMBOL_GPL(netdev_rx_handler_register
);
3043 * netdev_rx_handler_unregister - unregister receive handler
3044 * @dev: device to unregister a handler from
3046 * Unregister a receive handler from a device.
3048 * The caller must hold the rtnl_mutex.
/*
 * Clears @dev's receive handler and then its handler data, both with
 * rcu_assign_pointer(). The handler is cleared first, the reverse of the
 * order used by netdev_rx_handler_register().
 */
3050 void netdev_rx_handler_unregister(struct net_device
*dev
)
3054 rcu_assign_pointer(dev
->rx_handler
, NULL
);
3055 rcu_assign_pointer(dev
->rx_handler_data
, NULL
);
3057 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister
);
/*
 * Acts only on an ETH_P_ARP frame whose device has IFF_802_1Q_VLAN set and
 * whose real device (vlan_dev_real_dev()) has IFF_BONDING set. In that
 * case the skb is cloned with GFP_ATOMIC and the clone's dev is pointed
 * at the real device.
 * NOTE(review): the handling of a failed clone and what is done with the
 * clone afterwards are not shown here - confirm.
 */
3059 static void vlan_on_bond_hook(struct sk_buff
*skb
)
3062 * Make sure ARP frames received on VLAN interfaces stacked on
3063 * bonding interfaces still make their way to any base bonding
3064 * device that may have registered for a specific ptype.
3066 if (skb
->dev
->priv_flags
& IFF_802_1Q_VLAN
&&
3067 vlan_dev_real_dev(skb
->dev
)->priv_flags
& IFF_BONDING
&&
3068 skb
->protocol
== htons(ETH_P_ARP
)) {
3069 struct sk_buff
*skb2
= skb_clone(skb
, GFP_ATOMIC
);
3073 skb2
->dev
= vlan_dev_real_dev(skb
->dev
);
3078 static int __netif_receive_skb(struct sk_buff
*skb
)
3080 struct packet_type
*ptype
, *pt_prev
;
3081 rx_handler_func_t
*rx_handler
;
3082 struct net_device
*orig_dev
;
3083 struct net_device
*null_or_dev
;
3084 bool deliver_exact
= false;
3085 int ret
= NET_RX_DROP
;
3088 if (!netdev_tstamp_prequeue
)
3089 net_timestamp_check(skb
);
3091 trace_netif_receive_skb(skb
);
3093 /* if we've gotten here through NAPI, check netpoll */
3094 if (netpoll_receive_skb(skb
))
3098 skb
->skb_iif
= skb
->dev
->ifindex
;
3099 orig_dev
= skb
->dev
;
3101 skb_reset_network_header(skb
);
3102 skb_reset_transport_header(skb
);
3103 skb
->mac_len
= skb
->network_header
- skb
->mac_header
;
3111 __this_cpu_inc(softnet_data
.processed
);
3113 #ifdef CONFIG_NET_CLS_ACT
3114 if (skb
->tc_verd
& TC_NCLS
) {
3115 skb
->tc_verd
= CLR_TC_NCLS(skb
->tc_verd
);
3120 list_for_each_entry_rcu(ptype
, &ptype_all
, list
) {
3121 if (!ptype
->dev
|| ptype
->dev
== skb
->dev
) {
3123 ret
= deliver_skb(skb
, pt_prev
, orig_dev
);
3128 #ifdef CONFIG_NET_CLS_ACT
3129 skb
= handle_ing(skb
, &pt_prev
, &ret
, orig_dev
);
3135 rx_handler
= rcu_dereference(skb
->dev
->rx_handler
);
3138 ret
= deliver_skb(skb
, pt_prev
, orig_dev
);
3141 switch (rx_handler(&skb
)) {
3142 case RX_HANDLER_CONSUMED
:
3144 case RX_HANDLER_ANOTHER
:
3146 case RX_HANDLER_EXACT
:
3147 deliver_exact
= true;
3148 case RX_HANDLER_PASS
:
3155 if (vlan_tx_tag_present(skb
)) {
3157 ret
= deliver_skb(skb
, pt_prev
, orig_dev
);
3160 if (vlan_hwaccel_do_receive(&skb
)) {
3161 ret
= __netif_receive_skb(skb
);
3163 } else if (unlikely(!skb
))
3167 vlan_on_bond_hook(skb
);
3169 /* deliver only exact match when indicated */
3170 null_or_dev
= deliver_exact
? skb
->dev
: NULL
;
3172 type
= skb
->protocol
;
3173 list_for_each_entry_rcu(ptype
,
3174 &ptype_base
[ntohs(type
) & PTYPE_HASH_MASK
], list
) {
3175 if (ptype
->type
== type
&&
3176 (ptype
->dev
== null_or_dev
|| ptype
->dev
== skb
->dev
||
3177 ptype
->dev
== orig_dev
)) {
3179 ret
= deliver_skb(skb
, pt_prev
, orig_dev
);
3185 ret
= pt_prev
->func(skb
, skb
->dev
, pt_prev
, orig_dev
);
3187 atomic_long_inc(&skb
->dev
->rx_dropped
);
3189 /* Jamal, now you will not be able to escape explaining
3190 * me how you were going to use this. :-)
3201 * netif_receive_skb - process receive buffer from network
3202 * @skb: buffer to process
3204 * netif_receive_skb() is the main receive data processing function.
3205 * It always succeeds. The buffer may be dropped during processing
3206 * for congestion control or by the protocol layers.
3208 * This function may only be called from softirq context and interrupts
3209 * should be enabled.
3211 * Return values (usually ignored):
3212 * NET_RX_SUCCESS: no congestion
3213 * NET_RX_DROP: packet was dropped
3215 int netif_receive_skb(struct sk_buff
*skb
)
3217 if (netdev_tstamp_prequeue
)
3218 net_timestamp_check(skb
);
3220 if (skb_defer_rx_timestamp(skb
))
3221 return NET_RX_SUCCESS
;
3225 struct rps_dev_flow voidflow
, *rflow
= &voidflow
;
3230 cpu
= get_rps_cpu(skb
->dev
, skb
, &rflow
);
3233 ret
= enqueue_to_backlog(skb
, cpu
, &rflow
->last_qtail
);
3237 ret
= __netif_receive_skb(skb
);
3243 return __netif_receive_skb(skb
);
3246 EXPORT_SYMBOL(netif_receive_skb
);
3248 /* Network device is going away, flush any packets still pending
3249 * Called with irqs disabled.
/*
 * @arg is the struct net_device being flushed.
 * Walks the executing CPU's input_pkt_queue and then its process_queue.
 * Every skb whose dev equals @arg is unlinked from its queue and
 * input_queue_head_incr() is called for it.
 * NOTE(review): the release of each unlinked skb is not shown here - confirm.
 */
3251 static void flush_backlog(void *arg
)
3253 struct net_device
*dev
= arg
;
3254 struct softnet_data
*sd
= &__get_cpu_var(softnet_data
);
3255 struct sk_buff
*skb
, *tmp
;
3258 skb_queue_walk_safe(&sd
->input_pkt_queue
, skb
, tmp
) {
3259 if (skb
->dev
== dev
) {
3260 __skb_unlink(skb
, &sd
->input_pkt_queue
);
3262 input_queue_head_incr(sd
);
3267 skb_queue_walk_safe(&sd
->process_queue
, skb
, tmp
) {
3268 if (skb
->dev
== dev
) {
3269 __skb_unlink(skb
, &sd
->process_queue
);
3271 input_queue_head_incr(sd
);
3276 static int napi_gro_complete(struct sk_buff
*skb
)
3278 struct packet_type
*ptype
;
3279 __be16 type
= skb
->protocol
;
3280 struct list_head
*head
= &ptype_base
[ntohs(type
) & PTYPE_HASH_MASK
];
3283 if (NAPI_GRO_CB(skb
)->count
== 1) {
3284 skb_shinfo(skb
)->gso_size
= 0;
3289 list_for_each_entry_rcu(ptype
, head
, list
) {
3290 if (ptype
->type
!= type
|| ptype
->dev
|| !ptype
->gro_complete
)
3293 err
= ptype
->gro_complete(skb
);
3299 WARN_ON(&ptype
->list
== head
);
3301 return NET_RX_SUCCESS
;
3305 return netif_receive_skb(skb
);
/*
 * Passes every skb held on @napi's gro_list to napi_gro_complete(), then
 * resets gro_count to 0 and gro_list to NULL.
 * NOTE(review): the statements that load `next` inside the loop are not
 * shown here - confirm.
 */
3308 inline void napi_gro_flush(struct napi_struct
*napi
)
3310 struct sk_buff
*skb
, *next
;
3312 for (skb
= napi
->gro_list
; skb
; skb
= next
) {
3315 napi_gro_complete(skb
);
3318 napi
->gro_count
= 0;
3319 napi
->gro_list
= NULL
;
3321 EXPORT_SYMBOL(napi_gro_flush
);
3323 enum gro_result
dev_gro_receive(struct napi_struct
*napi
, struct sk_buff
*skb
)
3325 struct sk_buff
**pp
= NULL
;
3326 struct packet_type
*ptype
;
3327 __be16 type
= skb
->protocol
;
3328 struct list_head
*head
= &ptype_base
[ntohs(type
) & PTYPE_HASH_MASK
];
3331 enum gro_result ret
;
3333 if (!(skb
->dev
->features
& NETIF_F_GRO
) || netpoll_rx_on(skb
))
3336 if (skb_is_gso(skb
) || skb_has_frag_list(skb
))
3340 list_for_each_entry_rcu(ptype
, head
, list
) {
3341 if (ptype
->type
!= type
|| ptype
->dev
|| !ptype
->gro_receive
)
3344 skb_set_network_header(skb
, skb_gro_offset(skb
));
3345 mac_len
= skb
->network_header
- skb
->mac_header
;
3346 skb
->mac_len
= mac_len
;
3347 NAPI_GRO_CB(skb
)->same_flow
= 0;
3348 NAPI_GRO_CB(skb
)->flush
= 0;
3349 NAPI_GRO_CB(skb
)->free
= 0;
3351 pp
= ptype
->gro_receive(&napi
->gro_list
, skb
);
3356 if (&ptype
->list
== head
)
3359 same_flow
= NAPI_GRO_CB(skb
)->same_flow
;
3360 ret
= NAPI_GRO_CB(skb
)->free
? GRO_MERGED_FREE
: GRO_MERGED
;
3363 struct sk_buff
*nskb
= *pp
;
3367 napi_gro_complete(nskb
);
3374 if (NAPI_GRO_CB(skb
)->flush
|| napi
->gro_count
>= MAX_GRO_SKBS
)
3378 NAPI_GRO_CB(skb
)->count
= 1;
3379 skb_shinfo(skb
)->gso_size
= skb_gro_len(skb
);
3380 skb
->next
= napi
->gro_list
;
3381 napi
->gro_list
= skb
;
3385 if (skb_headlen(skb
) < skb_gro_offset(skb
)) {
3386 int grow
= skb_gro_offset(skb
) - skb_headlen(skb
);
3388 BUG_ON(skb
->end
- skb
->tail
< grow
);
3390 memcpy(skb_tail_pointer(skb
), NAPI_GRO_CB(skb
)->frag0
, grow
);
3393 skb
->data_len
-= grow
;
3395 skb_shinfo(skb
)->frags
[0].page_offset
+= grow
;
3396 skb_shinfo(skb
)->frags
[0].size
-= grow
;
3398 if (unlikely(!skb_shinfo(skb
)->frags
[0].size
)) {
3399 put_page(skb_shinfo(skb
)->frags
[0].page
);
3400 memmove(skb_shinfo(skb
)->frags
,
3401 skb_shinfo(skb
)->frags
+ 1,
3402 --skb_shinfo(skb
)->nr_frags
* sizeof(skb_frag_t
));
3413 EXPORT_SYMBOL(dev_gro_receive
);
/*
 * Marks each skb p held on @napi's gro_list before @skb goes to
 * dev_gro_receive(), whose result is returned.
 * p's same_flow is set when p and @skb have the same dev, the same
 * vlan_tci and no difference reported by compare_ether_header().
 * p's flush flag is cleared.
 */
3415 static inline gro_result_t
3416 __napi_gro_receive(struct napi_struct
*napi
, struct sk_buff
*skb
)
3420 for (p
= napi
->gro_list
; p
; p
= p
->next
) {
/* Accumulates differences; stays zero only if every compared field matches. */
3421 unsigned long diffs
;
3423 diffs
= (unsigned long)p
->dev
^ (unsigned long)skb
->dev
;
3424 diffs
|= p
->vlan_tci
^ skb
->vlan_tci
;
3425 diffs
|= compare_ether_header(skb_mac_header(p
),
3426 skb_gro_mac_header(skb
));
3427 NAPI_GRO_CB(p
)->same_flow
= !diffs
;
3428 NAPI_GRO_CB(p
)->flush
= 0;
3431 return dev_gro_receive(napi
, skb
);
3434 gro_result_t
napi_skb_finish(gro_result_t ret
, struct sk_buff
*skb
)
3438 if (netif_receive_skb(skb
))
3443 case GRO_MERGED_FREE
:
3454 EXPORT_SYMBOL(napi_skb_finish
);
/*
 * Resets the GRO control block of @skb: data_offset to 0, frag0 to NULL
 * and frag0_len to 0.
 * If skb->mac_header equals skb->tail and the first page fragment is not
 * in high memory, frag0 is pointed at that fragment's data
 * (page_address() + page_offset) and frag0_len is set to its size.
 */
3456 void skb_gro_reset_offset(struct sk_buff
*skb
)
3458 NAPI_GRO_CB(skb
)->data_offset
= 0;
3459 NAPI_GRO_CB(skb
)->frag0
= NULL
;
3460 NAPI_GRO_CB(skb
)->frag0_len
= 0;
3462 if (skb
->mac_header
== skb
->tail
&&
3463 !PageHighMem(skb_shinfo(skb
)->frags
[0].page
)) {
3464 NAPI_GRO_CB(skb
)->frag0
=
3465 page_address(skb_shinfo(skb
)->frags
[0].page
) +
3466 skb_shinfo(skb
)->frags
[0].page_offset
;
3467 NAPI_GRO_CB(skb
)->frag0_len
= skb_shinfo(skb
)->frags
[0].size
;
3470 EXPORT_SYMBOL(skb_gro_reset_offset
);
/*
 * Resets @skb's GRO offsets, runs it through __napi_gro_receive() and
 * passes that result together with @skb to napi_skb_finish(), whose
 * return value is returned.
 */
3472 gro_result_t
napi_gro_receive(struct napi_struct
*napi
, struct sk_buff
*skb
)
3474 skb_gro_reset_offset(skb
);
3476 return napi_skb_finish(__napi_gro_receive(napi
, skb
), skb
);
3478 EXPORT_SYMBOL(napi_gro_receive
);
3480 static void napi_reuse_skb(struct napi_struct
*napi
, struct sk_buff
*skb
)
3482 __skb_pull(skb
, skb_headlen(skb
));
3483 skb_reserve(skb
, NET_IP_ALIGN
- skb_headroom(skb
));
3485 skb
->dev
= napi
->dev
;
/*
 * Starts from @napi's skb field; an skb of GRO_MAX_HEAD bytes can be
 * allocated for napi->dev with netdev_alloc_skb_ip_align().
 * NOTE(review): the condition guarding the allocation, any store back to
 * napi->skb and the return statement are not shown here - confirm.
 */
3491 struct sk_buff
*napi_get_frags(struct napi_struct
*napi
)
3493 struct sk_buff
*skb
= napi
->skb
;
3496 skb
= netdev_alloc_skb_ip_align(napi
->dev
, GRO_MAX_HEAD
);
3502 EXPORT_SYMBOL(napi_get_frags
);
3504 gro_result_t
napi_frags_finish(struct napi_struct
*napi
, struct sk_buff
*skb
,
3510 skb
->protocol
= eth_type_trans(skb
, skb
->dev
);
3512 if (ret
== GRO_HELD
)
3513 skb_gro_pull(skb
, -ETH_HLEN
);
3514 else if (netif_receive_skb(skb
))
3519 case GRO_MERGED_FREE
:
3520 napi_reuse_skb(napi
, skb
);
3529 EXPORT_SYMBOL(napi_frags_finish
);
3531 struct sk_buff
*napi_frags_skb(struct napi_struct
*napi
)
3533 struct sk_buff
*skb
= napi
->skb
;
3540 skb_reset_mac_header(skb
);
3541 skb_gro_reset_offset(skb
);
3543 off
= skb_gro_offset(skb
);
3544 hlen
= off
+ sizeof(*eth
);
3545 eth
= skb_gro_header_fast(skb
, off
);
3546 if (skb_gro_header_hard(skb
, hlen
)) {
3547 eth
= skb_gro_header_slow(skb
, hlen
, off
);
3548 if (unlikely(!eth
)) {
3549 napi_reuse_skb(napi
, skb
);
3555 skb_gro_pull(skb
, sizeof(*eth
));
3558 * This works because the only protocols we care about don't require
3559 * special handling. We'll fix it up properly at the end.
3561 skb
->protocol
= eth
->h_proto
;
3566 EXPORT_SYMBOL(napi_frags_skb
);
/*
 * Obtains the skb from napi_frags_skb(), runs it through
 * __napi_gro_receive() and returns the result of napi_frags_finish().
 * NOTE(review): any check on the skb returned by napi_frags_skb() is not
 * shown here - confirm.
 */
3568 gro_result_t
napi_gro_frags(struct napi_struct
*napi
)
3570 struct sk_buff
*skb
= napi_frags_skb(napi
);
3575 return napi_frags_finish(napi
, skb
, __napi_gro_receive(napi
, skb
));
3577 EXPORT_SYMBOL(napi_gro_frags
);
3580 * net_rps_action sends any pending IPI's for rps.
3581 * Note: called with local irq disabled, but exits with local irq enabled.
3583 static void net_rps_action_and_irq_enable(struct softnet_data
*sd
)
3586 struct softnet_data
*remsd
= sd
->rps_ipi_list
;
3589 sd
->rps_ipi_list
= NULL
;
3593 /* Send pending IPI's to kick RPS processing on remote cpus. */
3595 struct softnet_data
*next
= remsd
->rps_ipi_next
;
3597 if (cpu_online(remsd
->cpu
))
3598 __smp_call_function_single(remsd
->cpu
,
3607 static int process_backlog(struct napi_struct
*napi
, int quota
)
3610 struct softnet_data
*sd
= container_of(napi
, struct softnet_data
, backlog
);
3613 /* Check if we have pending IPIs; it's better to send them now
3614 * than to wait for the end of net_rx_action().
3616 if (sd
->rps_ipi_list
) {
3617 local_irq_disable();
3618 net_rps_action_and_irq_enable(sd
);
3621 napi
->weight
= weight_p
;
3622 local_irq_disable();
3623 while (work
< quota
) {
3624 struct sk_buff
*skb
;
3627 while ((skb
= __skb_dequeue(&sd
->process_queue
))) {
3629 __netif_receive_skb(skb
);
3630 local_irq_disable();
3631 input_queue_head_incr(sd
);
3632 if (++work
>= quota
) {
3639 qlen
= skb_queue_len(&sd
->input_pkt_queue
);
3641 skb_queue_splice_tail_init(&sd
->input_pkt_queue
,
3642 &sd
->process_queue
);
3644 if (qlen
< quota
- work
) {
3646 * Inline a custom version of __napi_complete().
3647 * only current cpu owns and manipulates this napi,
3648 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
3649 * we can use a plain write instead of clear_bit(),
3650 * and we don't need an smp_mb() memory barrier.
3652 list_del(&napi
->poll_list
);
3655 quota
= work
+ qlen
;
3665 * __napi_schedule - schedule for receive
3666 * @n: entry to schedule
3668 * The entry's receive function will be scheduled to run
/*
 * Schedules @n on the executing CPU's softnet_data via
 * ____napi_schedule(), with local interrupts saved and disabled around
 * the call and restored afterwards.
 */
3670 void __napi_schedule(struct napi_struct
*n
)
3672 unsigned long flags
;
3674 local_irq_save(flags
);
3675 ____napi_schedule(&__get_cpu_var(softnet_data
), n
);
3676 local_irq_restore(flags
);
3678 EXPORT_SYMBOL(__napi_schedule
);
/*
 * Removes @n from its poll list and clears NAPI_STATE_SCHED.
 * It is a BUG if NAPI_STATE_SCHED is not set on entry or if @n still has
 * a non-NULL gro_list.
 */
3680 void __napi_complete(struct napi_struct
*n
)
3682 BUG_ON(!test_bit(NAPI_STATE_SCHED
, &n
->state
));
3683 BUG_ON(n
->gro_list
);
3685 list_del(&n
->poll_list
);
/* Barrier before the bit clear, so the list removal above is ordered first. */
3686 smp_mb__before_clear_bit();
3687 clear_bit(NAPI_STATE_SCHED
, &n
->state
);
3689 EXPORT_SYMBOL(__napi_complete
);
3691 void napi_complete(struct napi_struct
*n
)
3693 unsigned long flags
;
3696 * don't let napi dequeue from the cpu poll list
3697 * just in case its running on a different cpu
3699 if (unlikely(test_bit(NAPI_STATE_NPSVC
, &n
->state
)))
3703 local_irq_save(flags
);
3705 local_irq_restore(flags
);
3707 EXPORT_SYMBOL(napi_complete
);
/*
 * Initialises @napi and links it to @dev:
 *  - poll_list is initialised as an empty list head;
 *  - the GRO state starts empty (gro_count 0, gro_list NULL);
 *  - weight is taken from @weight;
 *  - the instance is added to dev->napi_list through its dev_list member;
 *  - under CONFIG_NETPOLL, poll_lock is initialised and poll_owner is -1;
 *  - NAPI_STATE_SCHED is set in napi->state.
 * NOTE(review): the assignments that store @poll and @dev in @napi are
 * not shown here - confirm.
 */
3709 void netif_napi_add(struct net_device
*dev
, struct napi_struct
*napi
,
3710 int (*poll
)(struct napi_struct
*, int), int weight
)
3712 INIT_LIST_HEAD(&napi
->poll_list
);
3713 napi
->gro_count
= 0;
3714 napi
->gro_list
= NULL
;
3717 napi
->weight
= weight
;
3718 list_add(&napi
->dev_list
, &dev
->napi_list
);
3720 #ifdef CONFIG_NETPOLL
3721 spin_lock_init(&napi
->poll_lock
);
3722 napi
->poll_owner
= -1;
3724 set_bit(NAPI_STATE_SCHED
, &napi
->state
);
3726 EXPORT_SYMBOL(netif_napi_add
);
3728 void netif_napi_del(struct napi_struct
*napi
)
3730 struct sk_buff
*skb
, *next
;
3732 list_del_init(&napi
->dev_list
);
3733 napi_free_frags(napi
);
3735 for (skb
= napi
->gro_list
; skb
; skb
= next
) {
3741 napi
->gro_list
= NULL
;
3742 napi
->gro_count
= 0;
3744 EXPORT_SYMBOL(netif_napi_del
);
3746 static void net_rx_action(struct softirq_action
*h
)
3748 struct softnet_data
*sd
= &__get_cpu_var(softnet_data
);
3749 unsigned long time_limit
= jiffies
+ 2;
3750 int budget
= netdev_budget
;
3753 local_irq_disable();
3755 while (!list_empty(&sd
->poll_list
)) {
3756 struct napi_struct
*n
;
3759 /* If softirq window is exhausted then punt.
3760 * Allow this to run for 2 jiffies, which will allow
3761 * an average latency of 1.5/HZ.
3763 if (unlikely(budget
<= 0 || time_after(jiffies
, time_limit
)))
3768 /* Even though interrupts have been re-enabled, this
3769 * access is safe because interrupts can only add new
3770 * entries to the tail of this list, and only ->poll()
3771 * calls can remove this head entry from the list.
3773 n
= list_first_entry(&sd
->poll_list
, struct napi_struct
, poll_list
);
3775 have
= netpoll_poll_lock(n
);
3779 /* This NAPI_STATE_SCHED test is for avoiding a race
3780 * with netpoll's poll_napi(). Only the entity which
3781 * obtains the lock and sees NAPI_STATE_SCHED set will
3782 * actually make the ->poll() call. Therefore we avoid
3783 * accidentally calling ->poll() when NAPI is not scheduled.
3786 if (test_bit(NAPI_STATE_SCHED
, &n
->state
)) {
3787 work
= n
->poll(n
, weight
);
3791 WARN_ON_ONCE(work
> weight
);
3795 local_irq_disable();
3797 /* Drivers must not modify the NAPI state if they
3798 * consume the entire weight. In such cases this code
3799 * still "owns" the NAPI instance and therefore can
3800 * move the instance around on the list at-will.
3802 if (unlikely(work
== weight
)) {
3803 if (unlikely(napi_disable_pending(n
))) {
3806 local_irq_disable();
3808 list_move_tail(&n
->poll_list
, &sd
->poll_list
);
3811 netpoll_poll_unlock(have
);
3814 net_rps_action_and_irq_enable(sd
);
3816 #ifdef CONFIG_NET_DMA
3818 * There may not be any more sk_buffs coming right now, so push
3819 * any pending DMA copies to hardware
3821 dma_issue_pending_all();
3828 __raise_softirq_irqoff(NET_RX_SOFTIRQ
);
3832 static gifconf_func_t
*gifconf_list
[NPROTO
];
3835 * register_gifconf - register a SIOCGIF handler
3836 * @family: Address family
3837 * @gifconf: Function handler
3839 * Register protocol dependent address dumping routines. The handler
3840 * that is passed must not be freed or reused until it has been replaced
3841 * by another handler.
/*
 * Stores @gifconf in slot @family of gifconf_list. @family is compared
 * against NPROTO, the array size, before the store.
 * NOTE(review): the return statements are not shown here - confirm the
 * values returned for an out-of-range family and on success.
 */
3843 int register_gifconf(unsigned int family
, gifconf_func_t
*gifconf
)
3845 if (family
>= NPROTO
)
3847 gifconf_list
[family
] = gifconf
;
3850 EXPORT_SYMBOL(register_gifconf
);
3854 * Map an interface index to its name (SIOCGIFNAME)
3858 * We need this ioctl for efficient implementation of the
3859 * if_indextoname() function required by the IPv6 API. Without
3860 * it, we would have to search all the interfaces to find a
/*
 * Copies a struct ifreq in from @arg, looks up the device with index
 * ifr_ifindex in @net via dev_get_by_index_rcu(), copies the device name
 * into ifr_name and copies the ifreq back out to @arg.
 * NOTE(review): the error returns, the handling of a failed lookup and
 * the RCU read-side locking around the lookup are not shown here - confirm.
 */
3864 static int dev_ifname(struct net
*net
, struct ifreq __user
*arg
)
3866 struct net_device
*dev
;
3870 * Fetch the caller's info block.
3873 if (copy_from_user(&ifr
, arg
, sizeof(struct ifreq
)))
3877 dev
= dev_get_by_index_rcu(net
, ifr
.ifr_ifindex
);
3883 strcpy(ifr
.ifr_name
, dev
->name
);
3886 if (copy_to_user(arg
, &ifr
, sizeof(struct ifreq
)))
3892 * Perform a SIOCGIFCONF call. This structure will change
3893 * size eventually, and there is nothing I can do about it.
3894 * Thus we will need a 'compatibility mode'.
3897 static int dev_ifconf(struct net
*net
, char __user
*arg
)
3900 struct net_device
*dev
;
3907 * Fetch the caller's info block.
3910 if (copy_from_user(&ifc
, arg
, sizeof(struct ifconf
)))
3917 * Loop over the interfaces, and write an info block for each.
3921 for_each_netdev(net
, dev
) {
3922 for (i
= 0; i
< NPROTO
; i
++) {
3923 if (gifconf_list
[i
]) {
3926 done
= gifconf_list
[i
](dev
, NULL
, 0);
3928 done
= gifconf_list
[i
](dev
, pos
+ total
,
3938 * All done. Write the updated control block back to the caller.
3940 ifc
.ifc_len
= total
;
3943 * Both BSD and Solaris return 0 here, so we do too.
3945 return copy_to_user(arg
, &ifc
, sizeof(struct ifconf
)) ? -EFAULT
: 0;
3948 #ifdef CONFIG_PROC_FS
3950 * This is invoked by the /proc filesystem handler to display a device
3953 void *dev_seq_start(struct seq_file
*seq
, loff_t
*pos
)
3956 struct net
*net
= seq_file_net(seq
);
3958 struct net_device
*dev
;
3962 return SEQ_START_TOKEN
;
3965 for_each_netdev_rcu(net
, dev
)
3972 void *dev_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
3974 struct net_device
*dev
= v
;
3976 if (v
== SEQ_START_TOKEN
)
3977 dev
= first_net_device_rcu(seq_file_net(seq
));
3979 dev
= next_net_device_rcu(dev
);
3985 void dev_seq_stop(struct seq_file
*seq
, void *v
)
/*
 * Prints one statistics line for @dev to @seq: the device name followed
 * by receive counters and then transmit counters, all read from the
 * rtnl_link_stats64 returned by dev_get_stats(). `temp` is the caller
 * side storage handed to dev_get_stats().
 * Three of the printed values are sums of several counters:
 *  - rx_dropped + rx_missed_errors
 *  - rx_length_errors + rx_over_errors + rx_crc_errors + rx_frame_errors
 *  - tx_carrier_errors + tx_aborted_errors + tx_window_errors +
 *    tx_heartbeat_errors
 */
3991 static void dev_seq_printf_stats(struct seq_file
*seq
, struct net_device
*dev
)
3993 struct rtnl_link_stats64 temp
;
3994 const struct rtnl_link_stats64
*stats
= dev_get_stats(dev
, &temp
);
3996 seq_printf(seq
, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
3997 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
3998 dev
->name
, stats
->rx_bytes
, stats
->rx_packets
,
4000 stats
->rx_dropped
+ stats
->rx_missed_errors
,
4001 stats
->rx_fifo_errors
,
4002 stats
->rx_length_errors
+ stats
->rx_over_errors
+
4003 stats
->rx_crc_errors
+ stats
->rx_frame_errors
,
4004 stats
->rx_compressed
, stats
->multicast
,
4005 stats
->tx_bytes
, stats
->tx_packets
,
4006 stats
->tx_errors
, stats
->tx_dropped
,
4007 stats
->tx_fifo_errors
, stats
->collisions
,
4008 stats
->tx_carrier_errors
+
4009 stats
->tx_aborted_errors
+
4010 stats
->tx_window_errors
+
4011 stats
->tx_heartbeat_errors
,
4012 stats
->tx_compressed
);
4016 * Called from the PROCfs module. This now uses the new arbitrary sized
4017 * /proc/net interface to create /proc/net/dev
/*
 * seq_file ->show handler: for SEQ_START_TOKEN the column heading is
 * written with seq_puts(); dev_seq_printf_stats() prints the entry @v.
 * NOTE(review): the branch separating the two calls and the return
 * statement are not shown here - confirm.
 */
4019 static int dev_seq_show(struct seq_file
*seq
, void *v
)
4021 if (v
== SEQ_START_TOKEN
)
4022 seq_puts(seq
, "Inter-| Receive "
4024 " face |bytes packets errs drop fifo frame "
4025 "compressed multicast|bytes packets errs "
4026 "drop fifo colls carrier compressed\n");
4028 dev_seq_printf_stats(seq
, v
);
/*
 * Scans CPU numbers starting at *pos while *pos is below nr_cpu_ids; for
 * an online CPU, sd is set to that CPU's softnet_data. sd starts as NULL.
 * NOTE(review): the loop exit, the advance of *pos past offline CPUs and
 * the return statement are not shown here - confirm.
 */
4032 static struct softnet_data
*softnet_get_online(loff_t
*pos
)
4034 struct softnet_data
*sd
= NULL
;
4036 while (*pos
< nr_cpu_ids
)
4037 if (cpu_online(*pos
)) {
4038 sd
= &per_cpu(softnet_data
, *pos
);
4045 static void *softnet_seq_start(struct seq_file
*seq
, loff_t
*pos
)
4047 return softnet_get_online(pos
);
4050 static void *softnet_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
4053 return softnet_get_online(pos
);
4056 static void softnet_seq_stop(struct seq_file
*seq
, void *v
)
/*
 * Prints one line of ten 8-digit hexadecimal fields for the softnet_data
 * passed as @v: processed, dropped, time_squeeze, five constant zeros,
 * cpu_collision and received_rps.
 */
4060 static int softnet_seq_show(struct seq_file
*seq
, void *v
)
4062 struct softnet_data
*sd
= v
;
4064 seq_printf(seq
, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
4065 sd
->processed
, sd
->dropped
, sd
->time_squeeze
, 0,
4066 0, 0, 0, 0, /* was fastroute */
4067 sd
->cpu_collision
, sd
->received_rps
);
4071 static const struct seq_operations dev_seq_ops
= {
4072 .start
= dev_seq_start
,
4073 .next
= dev_seq_next
,
4074 .stop
= dev_seq_stop
,
4075 .show
= dev_seq_show
,
/*
 * ->open handler: opens the seq_file with dev_seq_ops through
 * seq_open_net(), with private data sized for a struct seq_net_private.
 */
4078 static int dev_seq_open(struct inode
*inode
, struct file
*file
)
4080 return seq_open_net(inode
, file
, &dev_seq_ops
,
4081 sizeof(struct seq_net_private
));
4084 static const struct file_operations dev_seq_fops
= {
4085 .owner
= THIS_MODULE
,
4086 .open
= dev_seq_open
,
4088 .llseek
= seq_lseek
,
4089 .release
= seq_release_net
,
4092 static const struct seq_operations softnet_seq_ops
= {
4093 .start
= softnet_seq_start
,
4094 .next
= softnet_seq_next
,
4095 .stop
= softnet_seq_stop
,
4096 .show
= softnet_seq_show
,
4099 static int softnet_seq_open(struct inode
*inode
, struct file
*file
)
4101 return seq_open(file
, &softnet_seq_ops
);
/*
 * File operations for the "softnet_stat" proc entry (registered in
 * dev_proc_net_init()): opened through softnet_seq_open(), seeks with
 * seq_lseek() and is released with seq_release().
 */
4104 static const struct file_operations softnet_seq_fops
= {
4105 .owner
= THIS_MODULE
,
4106 .open
= softnet_seq_open
,
4108 .llseek
= seq_lseek
,
4109 .release
= seq_release
,
/*
 * Walk the registered packet types with RCU list iteration: first the
 * ptype_all list, then each of the PTYPE_HASH_SIZE buckets of
 * ptype_base[].
 * NOTE(review): how @pos selects the entry to return is not visible
 * here -- presumably the pos-th entry in that order; confirm.
 */
4112 static void *ptype_get_idx(loff_t pos
)
4114 struct packet_type
*pt
= NULL
;
4118 list_for_each_entry_rcu(pt
, &ptype_all
, list
) {
4124 for (t
= 0; t
< PTYPE_HASH_SIZE
; t
++) {
4125 list_for_each_entry_rcu(pt
, &ptype_base
[t
], list
) {
/*
 * seq_file ->start callback for the ptype iterator: a zero *pos yields
 * SEQ_START_TOKEN (the header record); any other position yields the
 * entry ptype_get_idx() finds for index *pos - 1.
 */
4134 static void *ptype_seq_start(struct seq_file
*seq
, loff_t
*pos
)
4138 return *pos
? ptype_get_idx(*pos
- 1) : SEQ_START_TOKEN
;
/*
 * seq_file ->next callback for the ptype iterator.
 * After the start token the first entry is ptype_get_idx(0).  For a
 * real entry the candidate successor is pt->list.next: ETH_P_ALL
 * entries are compared against the ptype_all list head and move on to
 * ptype_base[0]; other entries start from bucket
 * ntohs(pt->type) & PTYPE_HASH_MASK.  Bucket heads are skipped by
 * advancing hash until an entry is found; the result is converted back
 * to a struct packet_type with list_entry().
 * NOTE(review): what is returned once hash reaches PTYPE_HASH_SIZE is
 * not visible here -- presumably NULL to end the walk; confirm.
 */
4141 static void *ptype_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
4143 struct packet_type
*pt
;
4144 struct list_head
*nxt
;
4148 if (v
== SEQ_START_TOKEN
)
4149 return ptype_get_idx(0);
4152 nxt
= pt
->list
.next
;
4153 if (pt
->type
== htons(ETH_P_ALL
)) {
4154 if (nxt
!= &ptype_all
)
4157 nxt
= ptype_base
[0].next
;
4159 hash
= ntohs(pt
->type
) & PTYPE_HASH_MASK
;
4161 while (nxt
== &ptype_base
[hash
]) {
4162 if (++hash
>= PTYPE_HASH_SIZE
)
4164 nxt
= ptype_base
[hash
].next
;
4167 return list_entry(nxt
, struct packet_type
, list
);
/*
 * seq_file ->stop callback for the ptype iterator (wired up in
 * ptype_seq_ops).
 */
4170 static void ptype_seq_stop(struct seq_file
*seq
, void *v
)
/*
 * seq_file ->show callback for the ptype iterator.
 * The start token prints the column header.  A packet type is printed
 * only when it is not bound to a device or its device belongs to the
 * seq_file's network namespace: the type column is "ALL" for ETH_P_ALL
 * or four hex digits of ntohs(pt->type), followed by the device name
 * (empty string when there is no device) and the handler via %pF.
 */
4176 static int ptype_seq_show(struct seq_file
*seq
, void *v
)
4178 struct packet_type
*pt
= v
;
4180 if (v
== SEQ_START_TOKEN
)
4181 seq_puts(seq
, "Type Device Function\n");
4182 else if (pt
->dev
== NULL
|| dev_net(pt
->dev
) == seq_file_net(seq
)) {
4183 if (pt
->type
== htons(ETH_P_ALL
))
4184 seq_puts(seq
, "ALL ");
4186 seq_printf(seq
, "%04x", ntohs(pt
->type
));
4188 seq_printf(seq
, " %-8s %pF\n",
4189 pt
->dev
? pt
->dev
->name
: "", pt
->func
);
4195 static const struct seq_operations ptype_seq_ops
= {
4196 .start
= ptype_seq_start
,
4197 .next
= ptype_seq_next
,
4198 .stop
= ptype_seq_stop
,
4199 .show
= ptype_seq_show
,
4202 static int ptype_seq_open(struct inode
*inode
, struct file
*file
)
4204 return seq_open_net(inode
, file
, &ptype_seq_ops
,
4205 sizeof(struct seq_net_private
));
/*
 * File operations for the "ptype" proc entry (registered in
 * dev_proc_net_init()): opened through ptype_seq_open(), seeks with
 * seq_lseek() and is released with seq_release_net().
 */
4208 static const struct file_operations ptype_seq_fops
= {
4209 .owner
= THIS_MODULE
,
4210 .open
= ptype_seq_open
,
4212 .llseek
= seq_lseek
,
4213 .release
= seq_release_net
,
/*
 * Per-namespace init hook (dev_proc_ops.init): creates the S_IRUGO proc
 * entries "dev", "softnet_stat" and "ptype" for @net and then calls
 * wext_proc_init().
 * The proc_net_remove() calls at the end remove the entries in reverse
 * creation order -- presumably the error-unwind path; confirm.
 */
4217 static int __net_init
dev_proc_net_init(struct net
*net
)
4221 if (!proc_net_fops_create(net
, "dev", S_IRUGO
, &dev_seq_fops
))
4223 if (!proc_net_fops_create(net
, "softnet_stat", S_IRUGO
, &softnet_seq_fops
))
4225 if (!proc_net_fops_create(net
, "ptype", S_IRUGO
, &ptype_seq_fops
))
4228 if (wext_proc_init(net
))
4234 proc_net_remove(net
, "ptype");
4236 proc_net_remove(net
, "softnet_stat");
4238 proc_net_remove(net
, "dev");
4242 static void __net_exit
dev_proc_net_exit(struct net
*net
)
4244 wext_proc_exit(net
);
4246 proc_net_remove(net
, "ptype");
4247 proc_net_remove(net
, "softnet_stat");
4248 proc_net_remove(net
, "dev");
4251 static struct pernet_operations __net_initdata dev_proc_ops
= {
4252 .init
= dev_proc_net_init
,
4253 .exit
= dev_proc_net_exit
,
4256 static int __init
dev_proc_init(void)
4258 return register_pernet_subsys(&dev_proc_ops
);
4261 #define dev_proc_init() 0
4262 #endif /* CONFIG_PROC_FS */
4266 * netdev_set_master - set up master pointer
4267 * @slave: slave device
4268 * @master: new master device
4270 * Changes the master device of the slave. Pass %NULL to break the
4271 * bonding. The caller must hold the RTNL semaphore. On a failure
4272 * a negative errno code is returned. On success the reference counts
4273 * are adjusted and the function returns zero.
/*
 * Remembers the current slave->master in old and installs @master as
 * the new master of @slave.
 * NOTE(review): the reference-count handling mentioned in the
 * kernel-doc is not visible in these lines -- confirm against the
 * full function.
 */
4275 int netdev_set_master(struct net_device
*slave
, struct net_device
*master
)
4277 struct net_device
*old
= slave
->master
;
4287 slave
->master
= master
;
4295 EXPORT_SYMBOL(netdev_set_master
);
4298 * netdev_set_bond_master - set up bonding master/slave pair
4299 * @slave: slave device
4300 * @master: new master device
4302 * Changes the master device of the slave. Pass %NULL to break the
4303 * bonding. The caller must hold the RTNL semaphore. On a failure
4304 * a negative errno code is returned. On success %RTM_NEWLINK is sent
4305 * to the routing socket and the function returns zero.
/*
 * Wraps netdev_set_master() and keeps IFF_SLAVE in slave->flags in
 * step (set on one path, cleared on the other -- presumably keyed on
 * @master being non-NULL; confirm), then announces the change with
 * rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE).
 */
4307 int netdev_set_bond_master(struct net_device
*slave
, struct net_device
*master
)
4313 err
= netdev_set_master(slave
, master
);
4317 slave
->flags
|= IFF_SLAVE
;
4319 slave
->flags
&= ~IFF_SLAVE
;
4321 rtmsg_ifinfo(RTM_NEWLINK
, slave
, IFF_SLAVE
);
4324 EXPORT_SYMBOL(netdev_set_bond_master
);
4326 static void dev_change_rx_flags(struct net_device
*dev
, int flags
)
4328 const struct net_device_ops
*ops
= dev
->netdev_ops
;
4330 if ((dev
->flags
& IFF_UP
) && ops
->ndo_change_rx_flags
)
4331 ops
->ndo_change_rx_flags(dev
, flags
);
/*
 * Adjust dev->promiscuity by @inc and keep IFF_PROMISC in dev->flags.
 * A counter that reads zero after the update is handled specially: the
 * flag is cleared on one path, and on the other the increment is rolled
 * back with a warning (the overflow case named in the in-body comment).
 * When dev->flags ends up different from its value on entry, the
 * transition is logged, an AUDIT_ANOM_PROMISCUOUS record is written if
 * audit_enabled, and the driver is informed through
 * dev_change_rx_flags(dev, IFF_PROMISC).
 */
4334 static int __dev_set_promiscuity(struct net_device
*dev
, int inc
)
4336 unsigned short old_flags
= dev
->flags
;
4342 dev
->flags
|= IFF_PROMISC
;
4343 dev
->promiscuity
+= inc
;
4344 if (dev
->promiscuity
== 0) {
4347 * If inc causes overflow, untouch promisc and return error.
4350 dev
->flags
&= ~IFF_PROMISC
;
4352 dev
->promiscuity
-= inc
;
4353 printk(KERN_WARNING
"%s: promiscuity touches roof, "
4354 "set promiscuity failed, promiscuity feature "
4355 "of device might be broken.\n", dev
->name
);
4359 if (dev
->flags
!= old_flags
) {
4360 printk(KERN_INFO
"device %s %s promiscuous mode\n",
4361 dev
->name
, (dev
->flags
& IFF_PROMISC
) ? "entered" :
4363 if (audit_enabled
) {
4364 current_uid_gid(&uid
, &gid
);
4365 audit_log(current
->audit_context
, GFP_ATOMIC
,
4366 AUDIT_ANOM_PROMISCUOUS
,
4367 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4368 dev
->name
, (dev
->flags
& IFF_PROMISC
),
4369 (old_flags
& IFF_PROMISC
),
4370 audit_get_loginuid(current
),
4372 audit_get_sessionid(current
));
4375 dev_change_rx_flags(dev
, IFF_PROMISC
);
4381 * dev_set_promiscuity - update promiscuity count on a device
4385 * Add or remove promiscuity from a device. While the count in the device
4386 * remains above zero the interface remains promiscuous. Once it hits zero
4387 * the device reverts back to normal filtering operation. A negative inc
4388 * value is used to drop promiscuity on the device.
4389 * Return 0 if successful or a negative errno code on error.
/*
 * Exported wrapper around __dev_set_promiscuity(): after the counter
 * update, dev_set_rx_mode() is called when dev->flags differs from the
 * value sampled on entry.
 */
4391 int dev_set_promiscuity(struct net_device
*dev
, int inc
)
4393 unsigned short old_flags
= dev
->flags
;
4396 err
= __dev_set_promiscuity(dev
, inc
);
4399 if (dev
->flags
!= old_flags
)
4400 dev_set_rx_mode(dev
);
4403 EXPORT_SYMBOL(dev_set_promiscuity
);
4406 * dev_set_allmulti - update allmulti count on a device
4410 * Add or remove reception of all multicast frames to a device. While the
4411 * count in the device remains above zero the interface remains listening
4412 * to all interfaces. Once it hits zero the device reverts back to normal
4413 * filtering operation. A negative @inc value is used to drop the counter
4414 * when releasing a resource needing all multicasts.
4415 * Return 0 if successful or a negative errno code on error.
/*
 * Adjust dev->allmulti by @inc and keep IFF_ALLMULTI in dev->flags,
 * mirroring the promiscuity counter handling: a counter that reads
 * zero after the update either clears the flag or rolls the increment
 * back with a warning (the overflow case named in the in-body comment).
 * If any flag bit changed, the driver is informed through
 * dev_change_rx_flags(dev, IFF_ALLMULTI) and the RX filter is reloaded
 * with dev_set_rx_mode().
 */
4418 int dev_set_allmulti(struct net_device
*dev
, int inc
)
4420 unsigned short old_flags
= dev
->flags
;
4424 dev
->flags
|= IFF_ALLMULTI
;
4425 dev
->allmulti
+= inc
;
4426 if (dev
->allmulti
== 0) {
4429 * If inc causes overflow, untouch allmulti and return error.
4432 dev
->flags
&= ~IFF_ALLMULTI
;
4434 dev
->allmulti
-= inc
;
4435 printk(KERN_WARNING
"%s: allmulti touches roof, "
4436 "set allmulti failed, allmulti feature of "
4437 "device might be broken.\n", dev
->name
);
4441 if (dev
->flags
^ old_flags
) {
4442 dev_change_rx_flags(dev
, IFF_ALLMULTI
);
4443 dev_set_rx_mode(dev
);
4447 EXPORT_SYMBOL(dev_set_allmulti
);
4450 * Upload unicast and multicast address lists to device and
4451 * configure RX filtering. When the device doesn't support unicast
4452 * filtering it is put in promiscuous mode while unicast addresses
/*
 * Lock-free core of dev_set_rx_mode(), which wraps it in
 * netif_addr_lock_bh()/netif_addr_unlock_bh().
 * Tests IFF_UP and netif_device_present() first.  A driver-provided
 * ndo_set_rx_mode is called directly.  The uc_promisc bookkeeping
 * raises promiscuity by one when unicast addresses exist and drops it
 * again once the list is empty, after which ndo_set_multicast_list is
 * called if the driver provides it.
 * NOTE(review): presumably the uc_promisc path is only taken when
 * ndo_set_rx_mode is absent -- confirm.
 */
4455 void __dev_set_rx_mode(struct net_device
*dev
)
4457 const struct net_device_ops
*ops
= dev
->netdev_ops
;
4459 /* dev_open will call this function so the list will stay sane. */
4460 if (!(dev
->flags
&IFF_UP
))
4463 if (!netif_device_present(dev
))
4466 if (ops
->ndo_set_rx_mode
)
4467 ops
->ndo_set_rx_mode(dev
);
4469 /* Unicast addresses changes may only happen under the rtnl,
4470 * therefore calling __dev_set_promiscuity here is safe.
4472 if (!netdev_uc_empty(dev
) && !dev
->uc_promisc
) {
4473 __dev_set_promiscuity(dev
, 1);
4474 dev
->uc_promisc
= 1;
4475 } else if (netdev_uc_empty(dev
) && dev
->uc_promisc
) {
4476 __dev_set_promiscuity(dev
, -1);
4477 dev
->uc_promisc
= 0;
4480 if (ops
->ndo_set_multicast_list
)
4481 ops
->ndo_set_multicast_list(dev
);
/*
 * Reprogram the RX filtering of @dev.
 * Runs __dev_set_rx_mode() with the device address-list lock held
 * (netif_addr_lock_bh() / netif_addr_unlock_bh()).
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
4493 * dev_get_flags - get flags reported to userspace
4496 * Get the combination of flag bits exported through APIs to userspace.
/*
 * Build the flag word reported to userspace: dev->flags with a set of
 * bits masked out (IFF_PROMISC among them) combined with bits taken
 * from dev->gflags.  For a running device (netif_running()),
 * IFF_RUNNING, IFF_LOWER_UP and IFF_DORMANT are added according to
 * netif_oper_up(), netif_carrier_ok() and netif_dormant().
 */
4498 unsigned dev_get_flags(const struct net_device
*dev
)
4502 flags
= (dev
->flags
& ~(IFF_PROMISC
|
4507 (dev
->gflags
& (IFF_PROMISC
|
4510 if (netif_running(dev
)) {
4511 if (netif_oper_up(dev
))
4512 flags
|= IFF_RUNNING
;
4513 if (netif_carrier_ok(dev
))
4514 flags
|= IFF_LOWER_UP
;
4515 if (netif_dormant(dev
))
4516 flags
|= IFF_DORMANT
;
4521 EXPORT_SYMBOL(dev_get_flags
);
/*
 * Apply the userspace-format @flags to @dev without sending
 * notifications (dev_change_flags() does that afterwards).
 * Steps visible here: dev->flags is rebuilt from the user-settable
 * bits of @flags plus the bits preserved from the old dev->flags; an
 * IFF_MULTICAST change is passed to dev_change_rx_flags() and the RX
 * filter is reloaded; an IFF_UP change runs __dev_close() or
 * __dev_open() depending on the old state; IFF_PROMISC and then
 * IFF_ALLMULTI differences against dev->gflags toggle the gflags bit
 * and adjust the matching counter by +1 or -1.
 */
4523 int __dev_change_flags(struct net_device
*dev
, unsigned int flags
)
4525 int old_flags
= dev
->flags
;
4531 * Set the flags on our device.
4534 dev
->flags
= (flags
& (IFF_DEBUG
| IFF_NOTRAILERS
| IFF_NOARP
|
4535 IFF_DYNAMIC
| IFF_MULTICAST
| IFF_PORTSEL
|
4537 (dev
->flags
& (IFF_UP
| IFF_VOLATILE
| IFF_PROMISC
|
4541 * Load in the correct multicast list now the flags have changed.
4544 if ((old_flags
^ flags
) & IFF_MULTICAST
)
4545 dev_change_rx_flags(dev
, IFF_MULTICAST
);
4547 dev_set_rx_mode(dev
);
4550 * Have we downed the interface. We handle IFF_UP ourselves
4551 * according to user attempts to set it, rather than blindly
4556 if ((old_flags
^ flags
) & IFF_UP
) { /* Bit is different ? */
4557 ret
= ((old_flags
& IFF_UP
) ? __dev_close
: __dev_open
)(dev
);
4560 dev_set_rx_mode(dev
);
4563 if ((flags
^ dev
->gflags
) & IFF_PROMISC
) {
4564 int inc
= (flags
& IFF_PROMISC
) ? 1 : -1;
4566 dev
->gflags
^= IFF_PROMISC
;
4567 dev_set_promiscuity(dev
, inc
);
4570 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4571 is important. Some (broken) drivers set IFF_PROMISC, when
4572 IFF_ALLMULTI is requested not asking us and not reporting.
4574 if ((flags
^ dev
->gflags
) & IFF_ALLMULTI
) {
4575 int inc
= (flags
& IFF_ALLMULTI
) ? 1 : -1;
4577 dev
->gflags
^= IFF_ALLMULTI
;
4578 dev_set_allmulti(dev
, inc
);
/*
 * Send netdevice notifications for the difference between dev->flags
 * and @old_flags.  An IFF_UP change produces NETDEV_UP or NETDEV_DOWN
 * according to the new state.  While the device is up, a change in any
 * bit other than IFF_UP, IFF_PROMISC, IFF_ALLMULTI and IFF_VOLATILE
 * produces NETDEV_CHANGE.
 */
4584 void __dev_notify_flags(struct net_device
*dev
, unsigned int old_flags
)
4586 unsigned int changes
= dev
->flags
^ old_flags
;
4588 if (changes
& IFF_UP
) {
4589 if (dev
->flags
& IFF_UP
)
4590 call_netdevice_notifiers(NETDEV_UP
, dev
);
4592 call_netdevice_notifiers(NETDEV_DOWN
, dev
);
4595 if (dev
->flags
& IFF_UP
&&
4596 (changes
& ~(IFF_UP
| IFF_PROMISC
| IFF_ALLMULTI
| IFF_VOLATILE
)))
4597 call_netdevice_notifiers(NETDEV_CHANGE
, dev
);
4601 * dev_change_flags - change device settings
4603 * @flags: device state flags
4605 * Change settings on device based state flags. The flags are
4606 * in the userspace exported format.
/*
 * Change the device flags and report the result: runs
 * __dev_change_flags(), computes which bits of dev->flags changed
 * relative to the value on entry, sends RTM_NEWLINK carrying that
 * change mask via rtmsg_ifinfo(), and lets __dev_notify_flags() raise
 * the netdevice notifier events.
 */
4608 int dev_change_flags(struct net_device
*dev
, unsigned flags
)
4611 int old_flags
= dev
->flags
;
4613 ret
= __dev_change_flags(dev
, flags
);
4617 changes
= old_flags
^ dev
->flags
;
4619 rtmsg_ifinfo(RTM_NEWLINK
, dev
, changes
);
4621 __dev_notify_flags(dev
, old_flags
);
4624 EXPORT_SYMBOL(dev_change_flags
);
4627 * dev_set_mtu - Change maximum transfer unit
4629 * @new_mtu: new transfer unit
4631 * Change the maximum transfer size of the network device.
/*
 * Change the MTU of @dev to @new_mtu.
 * Checks visible here: an unchanged value is detected up front, the
 * in-body comment requires a positive MTU, and the device must be
 * present.  A driver-provided ndo_change_mtu performs the change;
 * when it succeeds and the device is up, NETDEV_CHANGEMTU is sent to
 * the notifier chain.
 * NOTE(review): the path taken when the driver has no ndo_change_mtu
 * is not visible here.
 */
4633 int dev_set_mtu(struct net_device
*dev
, int new_mtu
)
4635 const struct net_device_ops
*ops
= dev
->netdev_ops
;
4638 if (new_mtu
== dev
->mtu
)
4641 /* MTU must be positive. */
4645 if (!netif_device_present(dev
))
4649 if (ops
->ndo_change_mtu
)
4650 err
= ops
->ndo_change_mtu(dev
, new_mtu
);
4654 if (!err
&& dev
->flags
& IFF_UP
)
4655 call_netdevice_notifiers(NETDEV_CHANGEMTU
, dev
);
4658 EXPORT_SYMBOL(dev_set_mtu
);
4661 * dev_set_group - Change group this device belongs to
4663 * @new_group: group this device should belong to
4665 void dev_set_group(struct net_device
*dev
, int new_group
)
4667 dev
->group
= new_group
;
4669 EXPORT_SYMBOL(dev_set_group
);
4672 * dev_set_mac_address - Change Media Access Control Address
4676 * Change the hardware (MAC) address of the device
/*
 * Change the hardware address of @dev to the one carried in @sa.
 * Preconditions checked here: the driver implements
 * ndo_set_mac_address, sa->sa_family matches dev->type, and the device
 * is present.  The driver hook does the work and NETDEV_CHANGEADDR is
 * raised on the notifier chain (presumably only on success -- confirm).
 */
4678 int dev_set_mac_address(struct net_device
*dev
, struct sockaddr
*sa
)
4680 const struct net_device_ops
*ops
= dev
->netdev_ops
;
4683 if (!ops
->ndo_set_mac_address
)
4685 if (sa
->sa_family
!= dev
->type
)
4687 if (!netif_device_present(dev
))
4689 err
= ops
->ndo_set_mac_address(dev
, sa
);
4691 call_netdevice_notifiers(NETDEV_CHANGEADDR
, dev
);
4694 EXPORT_SYMBOL(dev_set_mac_address
);
4697 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
/*
 * Read-only SIOCGIFxxx handlers.  The device is looked up with
 * dev_get_by_name_rcu() from ifr->ifr_name and the requested value is
 * copied into @ifr: flags (dev_get_flags() truncated to short), metric
 * (always 0), MTU, hardware address (zero-filled, or at most
 * sizeof(sa_data) bytes of dev_addr, with sa_family set to dev->type),
 * the ifr_map resource fields, ifindex and tx_queue_len.
 */
4699 static int dev_ifsioc_locked(struct net
*net
, struct ifreq
*ifr
, unsigned int cmd
)
4702 struct net_device
*dev
= dev_get_by_name_rcu(net
, ifr
->ifr_name
);
4708 case SIOCGIFFLAGS
: /* Get interface flags */
4709 ifr
->ifr_flags
= (short) dev_get_flags(dev
);
4712 case SIOCGIFMETRIC
: /* Get the metric on the interface
4713 (currently unused) */
4714 ifr
->ifr_metric
= 0;
4717 case SIOCGIFMTU
: /* Get the MTU of a device */
4718 ifr
->ifr_mtu
= dev
->mtu
;
4723 memset(ifr
->ifr_hwaddr
.sa_data
, 0, sizeof ifr
->ifr_hwaddr
.sa_data
);
4725 memcpy(ifr
->ifr_hwaddr
.sa_data
, dev
->dev_addr
,
4726 min(sizeof ifr
->ifr_hwaddr
.sa_data
, (size_t) dev
->addr_len
));
4727 ifr
->ifr_hwaddr
.sa_family
= dev
->type
;
4735 ifr
->ifr_map
.mem_start
= dev
->mem_start
;
4736 ifr
->ifr_map
.mem_end
= dev
->mem_end
;
4737 ifr
->ifr_map
.base_addr
= dev
->base_addr
;
4738 ifr
->ifr_map
.irq
= dev
->irq
;
4739 ifr
->ifr_map
.dma
= dev
->dma
;
4740 ifr
->ifr_map
.port
= dev
->if_port
;
4744 ifr
->ifr_ifindex
= dev
->ifindex
;
4748 ifr
->ifr_qlen
= dev
->tx_queue_len
;
4752 /* dev_ioctl() should ensure this case
4764 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
/*
 * State-changing SIOCxIFxxx handlers.  The device is looked up with
 * __dev_get_by_name() from ifr->ifr_name and @cmd is dispatched:
 * flags -> dev_change_flags(), MTU -> dev_set_mtu(), hardware address
 * -> dev_set_mac_address(), hardware broadcast address (copied into
 * dev->broadcast after a family check, then NETDEV_CHANGEADDR), map ->
 * ndo_set_config, multicast add/delete -> dev_mc_add_global() /
 * dev_mc_del_global() (these need an RX-mode hook in the driver and
 * AF_UNSPEC), TX queue length (negative values are tested for), and
 * rename -> dev_change_name() with the new name NUL-terminated first.
 * Private, bonding, MII, bridge, hardware-timestamp and WAN commands
 * are forwarded to ndo_do_ioctl when the driver provides it and the
 * device is present.
 */
4766 static int dev_ifsioc(struct net
*net
, struct ifreq
*ifr
, unsigned int cmd
)
4769 struct net_device
*dev
= __dev_get_by_name(net
, ifr
->ifr_name
);
4770 const struct net_device_ops
*ops
;
4775 ops
= dev
->netdev_ops
;
4778 case SIOCSIFFLAGS
: /* Set interface flags */
4779 return dev_change_flags(dev
, ifr
->ifr_flags
);
4781 case SIOCSIFMETRIC
: /* Set the metric on the interface
4782 (currently unused) */
4785 case SIOCSIFMTU
: /* Set the MTU of a device */
4786 return dev_set_mtu(dev
, ifr
->ifr_mtu
);
4789 return dev_set_mac_address(dev
, &ifr
->ifr_hwaddr
);
4791 case SIOCSIFHWBROADCAST
:
4792 if (ifr
->ifr_hwaddr
.sa_family
!= dev
->type
)
4794 memcpy(dev
->broadcast
, ifr
->ifr_hwaddr
.sa_data
,
4795 min(sizeof ifr
->ifr_hwaddr
.sa_data
, (size_t) dev
->addr_len
));
4796 call_netdevice_notifiers(NETDEV_CHANGEADDR
, dev
);
4800 if (ops
->ndo_set_config
) {
4801 if (!netif_device_present(dev
))
4803 return ops
->ndo_set_config(dev
, &ifr
->ifr_map
);
4808 if ((!ops
->ndo_set_multicast_list
&& !ops
->ndo_set_rx_mode
) ||
4809 ifr
->ifr_hwaddr
.sa_family
!= AF_UNSPEC
)
4811 if (!netif_device_present(dev
))
4813 return dev_mc_add_global(dev
, ifr
->ifr_hwaddr
.sa_data
);
4816 if ((!ops
->ndo_set_multicast_list
&& !ops
->ndo_set_rx_mode
) ||
4817 ifr
->ifr_hwaddr
.sa_family
!= AF_UNSPEC
)
4819 if (!netif_device_present(dev
))
4821 return dev_mc_del_global(dev
, ifr
->ifr_hwaddr
.sa_data
);
4824 if (ifr
->ifr_qlen
< 0)
4826 dev
->tx_queue_len
= ifr
->ifr_qlen
;
4830 ifr
->ifr_newname
[IFNAMSIZ
-1] = '\0';
4831 return dev_change_name(dev
, ifr
->ifr_newname
);
4834 * Unknown or private ioctl
4837 if ((cmd
>= SIOCDEVPRIVATE
&&
4838 cmd
<= SIOCDEVPRIVATE
+ 15) ||
4839 cmd
== SIOCBONDENSLAVE
||
4840 cmd
== SIOCBONDRELEASE
||
4841 cmd
== SIOCBONDSETHWADDR
||
4842 cmd
== SIOCBONDSLAVEINFOQUERY
||
4843 cmd
== SIOCBONDINFOQUERY
||
4844 cmd
== SIOCBONDCHANGEACTIVE
||
4845 cmd
== SIOCGMIIPHY
||
4846 cmd
== SIOCGMIIREG
||
4847 cmd
== SIOCSMIIREG
||
4848 cmd
== SIOCBRADDIF
||
4849 cmd
== SIOCBRDELIF
||
4850 cmd
== SIOCSHWTSTAMP
||
4851 cmd
== SIOCWANDEV
) {
4853 if (ops
->ndo_do_ioctl
) {
4854 if (netif_device_present(dev
))
4855 err
= ops
->ndo_do_ioctl(dev
, ifr
, cmd
);
4867 * This function handles all "interface"-type I/O control requests. The actual
4868 * 'doing' part of this is dev_ifsioc above.
4872 * dev_ioctl - network device ioctl
4873 * @net: the applicable net namespace
4874 * @cmd: command to issue
4875 * @arg: pointer to a struct ifreq in user space
4877 * Issue ioctl functions to devices. This is normally called by the
4878 * user space syscall interfaces but can sometimes be useful for
4879 * other purposes. The return value is the return from the syscall if
4880 * positive or a negative errno code on error.
/*
 * Entry point for interface ioctls.
 * SIOCGIFCONF and SIOCGIFNAME are handed to dev_ifconf() / dev_ifname()
 * before anything is copied.  For every other command the struct ifreq
 * is copied in from @arg, ifr_name is NUL-terminated and searched for
 * a ':' (alias suffix), and @cmd selects one of the paths below; each
 * calls dev_load() for the named device first:
 *   - read-only queries      -> dev_ifsioc_locked(), result copied back
 *   - ethtool                -> dev_ethtool(), result copied back
 *   - CAP_NET_ADMIN commands -> dev_ifsioc(), with or without copy-back
 *   - SIOCWANDEV and SIOCDEVPRIVATE..+15 -> dev_ifsioc(), copied back
 *     when it returns 0
 *   - SIOCIWFIRST..SIOCIWLAST -> wext_handle_ioctl()
 */
4883 int dev_ioctl(struct net
*net
, unsigned int cmd
, void __user
*arg
)
4889 /* One special case: SIOCGIFCONF takes ifconf argument
4890 and requires shared lock, because it sleeps writing
4894 if (cmd
== SIOCGIFCONF
) {
4896 ret
= dev_ifconf(net
, (char __user
*) arg
);
4900 if (cmd
== SIOCGIFNAME
)
4901 return dev_ifname(net
, (struct ifreq __user
*)arg
);
4903 if (copy_from_user(&ifr
, arg
, sizeof(struct ifreq
)))
4906 ifr
.ifr_name
[IFNAMSIZ
-1] = 0;
4908 colon
= strchr(ifr
.ifr_name
, ':');
4913 * See which interface the caller is talking about.
4918 * These ioctl calls:
4919 * - can be done by all.
4920 * - atomic and do not require locking.
4931 dev_load(net
, ifr
.ifr_name
);
4933 ret
= dev_ifsioc_locked(net
, &ifr
, cmd
);
4938 if (copy_to_user(arg
, &ifr
,
4939 sizeof(struct ifreq
)))
4945 dev_load(net
, ifr
.ifr_name
);
4947 ret
= dev_ethtool(net
, &ifr
);
4952 if (copy_to_user(arg
, &ifr
,
4953 sizeof(struct ifreq
)))
4959 * These ioctl calls:
4960 * - require superuser power.
4961 * - require strict serialization.
4967 if (!capable(CAP_NET_ADMIN
))
4969 dev_load(net
, ifr
.ifr_name
);
4971 ret
= dev_ifsioc(net
, &ifr
, cmd
);
4976 if (copy_to_user(arg
, &ifr
,
4977 sizeof(struct ifreq
)))
4983 * These ioctl calls:
4984 * - require superuser power.
4985 * - require strict serialization.
4986 * - do not return a value
4996 case SIOCSIFHWBROADCAST
:
4999 case SIOCBONDENSLAVE
:
5000 case SIOCBONDRELEASE
:
5001 case SIOCBONDSETHWADDR
:
5002 case SIOCBONDCHANGEACTIVE
:
5006 if (!capable(CAP_NET_ADMIN
))
5009 case SIOCBONDSLAVEINFOQUERY
:
5010 case SIOCBONDINFOQUERY
:
5011 dev_load(net
, ifr
.ifr_name
);
5013 ret
= dev_ifsioc(net
, &ifr
, cmd
);
5018 /* Get the per device memory space. We can add this but
5019 * currently do not support it */
5021 /* Set the per device memory buffer space.
5022 * Not applicable in our case */
5027 * Unknown or private ioctl.
5030 if (cmd
== SIOCWANDEV
||
5031 (cmd
>= SIOCDEVPRIVATE
&&
5032 cmd
<= SIOCDEVPRIVATE
+ 15)) {
5033 dev_load(net
, ifr
.ifr_name
);
5035 ret
= dev_ifsioc(net
, &ifr
, cmd
);
5037 if (!ret
&& copy_to_user(arg
, &ifr
,
5038 sizeof(struct ifreq
)))
5042 /* Take care of Wireless Extensions */
5043 if (cmd
>= SIOCIWFIRST
&& cmd
<= SIOCIWLAST
)
5044 return wext_handle_ioctl(net
, &ifr
, cmd
, arg
);
5051 * dev_new_index - allocate an ifindex
5052 * @net: the applicable net namespace
5054 * Returns a suitable unique value for a new device interface
5055 * number. The caller must hold the rtnl semaphore or the
5056 * dev_base_lock to be sure it remains unique.
/*
 * Pick an interface index for a new device in @net: a candidate
 * ifindex is accepted when __dev_get_by_index() finds no device using
 * it.
 * NOTE(review): how candidates are generated is not visible here.
 */
5058 static int dev_new_index(struct net
*net
)
5064 if (!__dev_get_by_index(net
, ifindex
))
5069 /* Delayed registration/unregisteration */
5070 static LIST_HEAD(net_todo_list
);
5072 static void net_set_todo(struct net_device
*dev
)
5074 list_add_tail(&dev
->todo_list
, &net_todo_list
);
/*
 * Unregister every device linked on @head through unreg_list.
 * Pass 1 drops devices that were never registered
 * (NETREG_UNINITIALIZED) from the list and insists that the rest are
 * NETREG_REGISTERED.  The remaining devices are closed together with
 * dev_close_many(), unlinked from the device chain and marked
 * NETREG_UNREGISTERING.  Each device then gets NETDEV_UNREGISTER,
 * an RTM_DELLINK message (unless an rtnl_link_ops device is not yet
 * RTNL_LINK_INITIALIZED), the driver's optional ndo_uninit, a warning
 * if it still has a master, and removal of its kobject.  Finally one
 * NETDEV_UNREGISTER_BATCH is sent, using the first device on the list.
 */
5077 static void rollback_registered_many(struct list_head
*head
)
5079 struct net_device
*dev
, *tmp
;
5081 BUG_ON(dev_boot_phase
);
5084 list_for_each_entry_safe(dev
, tmp
, head
, unreg_list
) {
5085 /* Some devices call without registering
5086 * for initialization unwind. Remove those
5087 * devices and proceed with the remaining.
5089 if (dev
->reg_state
== NETREG_UNINITIALIZED
) {
5090 pr_debug("unregister_netdevice: device %s/%p never "
5091 "was registered\n", dev
->name
, dev
);
5094 list_del(&dev
->unreg_list
);
5098 BUG_ON(dev
->reg_state
!= NETREG_REGISTERED
);
5101 /* If device is running, close it first. */
5102 dev_close_many(head
);
5104 list_for_each_entry(dev
, head
, unreg_list
) {
5105 /* And unlink it from device chain. */
5106 unlist_netdevice(dev
);
5108 dev
->reg_state
= NETREG_UNREGISTERING
;
5113 list_for_each_entry(dev
, head
, unreg_list
) {
5114 /* Shutdown queueing discipline. */
5118 /* Notify protocols, that we are about to destroy
5119 this device. They should clean all the things.
5121 call_netdevice_notifiers(NETDEV_UNREGISTER
, dev
);
5123 if (!dev
->rtnl_link_ops
||
5124 dev
->rtnl_link_state
== RTNL_LINK_INITIALIZED
)
5125 rtmsg_ifinfo(RTM_DELLINK
, dev
, ~0U);
5128 * Flush the unicast and multicast chains
5133 if (dev
->netdev_ops
->ndo_uninit
)
5134 dev
->netdev_ops
->ndo_uninit(dev
);
5136 /* Notifier chain MUST detach us from master device. */
5137 WARN_ON(dev
->master
);
5139 /* Remove entries from kobject tree */
5140 netdev_unregister_kobject(dev
);
5143 /* Process any work delayed until the end of the batch */
5144 dev
= list_first_entry(head
, struct net_device
, unreg_list
);
5145 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH
, dev
);
5149 list_for_each_entry(dev
, head
, unreg_list
)
/*
 * Single-device form of rollback_registered_many(): @dev is placed on
 * a list named single through its unreg_list node and that list is
 * processed as a batch of one.
 */
5153 static void rollback_registered(struct net_device
*dev
)
5157 list_add(&dev
->unreg_list
, &single
);
5158 rollback_registered_many(&single
);
/*
 * Remove feature bits from @features whose dependencies are not met,
 * logging each adjustment against @dev:
 *   - NETIF_F_HW_CSUM together with IP/IPv6 checksum bits drops the
 *     IP/IPv6 bits;
 *   - NETIF_F_NO_CSUM together with any other checksum bit drops the
 *     other checksum bits;
 *   - NETIF_F_SG without any NETIF_F_ALL_CSUM bit drops SG;
 *   - NETIF_F_TSO or NETIF_F_GSO without SG drops TSO / GSO;
 *   - NETIF_F_UFO is dropped unless SG is set and either a generic
 *     checksum bit or both IP and IPv6 checksum bits are set.
 */
5162 u32
netdev_fix_features(struct net_device
*dev
, u32 features
)
5164 /* Fix illegal checksum combinations */
5165 if ((features
& NETIF_F_HW_CSUM
) &&
5166 (features
& (NETIF_F_IP_CSUM
|NETIF_F_IPV6_CSUM
))) {
5167 netdev_info(dev
, "mixed HW and IP checksum settings.\n");
5168 features
&= ~(NETIF_F_IP_CSUM
|NETIF_F_IPV6_CSUM
);
5171 if ((features
& NETIF_F_NO_CSUM
) &&
5172 (features
& (NETIF_F_HW_CSUM
|NETIF_F_IP_CSUM
|NETIF_F_IPV6_CSUM
))) {
5173 netdev_info(dev
, "mixed no checksumming and other settings.\n");
5174 features
&= ~(NETIF_F_IP_CSUM
|NETIF_F_IPV6_CSUM
|NETIF_F_HW_CSUM
);
5177 /* Fix illegal SG+CSUM combinations. */
5178 if ((features
& NETIF_F_SG
) &&
5179 !(features
& NETIF_F_ALL_CSUM
)) {
5181 "Dropping NETIF_F_SG since no checksum feature.\n");
5182 features
&= ~NETIF_F_SG
;
5185 /* TSO requires that SG is present as well. */
5186 if ((features
& NETIF_F_TSO
) && !(features
& NETIF_F_SG
)) {
5187 netdev_info(dev
, "Dropping NETIF_F_TSO since no SG feature.\n");
5188 features
&= ~NETIF_F_TSO
;
5191 /* Software GSO depends on SG. */
5192 if ((features
& NETIF_F_GSO
) && !(features
& NETIF_F_SG
)) {
5193 netdev_info(dev
, "Dropping NETIF_F_GSO since no SG feature.\n");
5194 features
&= ~NETIF_F_GSO
;
5197 /* UFO needs SG and checksumming */
5198 if (features
& NETIF_F_UFO
) {
5199 /* maybe split UFO into V4 and V6? */
5200 if (!((features
& NETIF_F_GEN_CSUM
) ||
5201 (features
& (NETIF_F_IP_CSUM
|NETIF_F_IPV6_CSUM
))
5202 == (NETIF_F_IP_CSUM
|NETIF_F_IPV6_CSUM
))) {
5204 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5205 features
&= ~NETIF_F_UFO
;
5208 if (!(features
& NETIF_F_SG
)) {
5210 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5211 features
&= ~NETIF_F_UFO
;
5217 EXPORT_SYMBOL(netdev_fix_features
);
/*
 * Recompute and apply the feature set of @dev.
 * Starts from netdev_get_wanted_features(), lets the driver adjust it
 * through the optional ndo_fix_features, and then applies the generic
 * dependency rules with netdev_fix_features().  If the result differs
 * from dev->features the change is logged and passed to the optional
 * ndo_set_features; dev->features is updated, and a failure from the
 * driver is reported with the error code and both feature masks.
 */
5219 void netdev_update_features(struct net_device
*dev
)
5224 features
= netdev_get_wanted_features(dev
);
5226 if (dev
->netdev_ops
->ndo_fix_features
)
5227 features
= dev
->netdev_ops
->ndo_fix_features(dev
, features
);
5229 /* driver might be less strict about feature dependencies */
5230 features
= netdev_fix_features(dev
, features
);
5232 if (dev
->features
== features
)
5235 netdev_info(dev
, "Features changed: 0x%08x -> 0x%08x\n",
5236 dev
->features
, features
);
5238 if (dev
->netdev_ops
->ndo_set_features
)
5239 err
= dev
->netdev_ops
->ndo_set_features(dev
, features
);
5242 dev
->features
= features
;
5245 "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
5246 err
, features
, dev
->features
);
5248 EXPORT_SYMBOL(netdev_update_features
);
5251 * netif_stacked_transfer_operstate - transfer operstate
5252 * @rootdev: the root or lower level device to transfer state from
5253 * @dev: the device to transfer operstate to
5255 * Transfer operational state from root to device. This is normally
5256 * called when a stacking relationship exists between the root
5257 * device and the device(a leaf device).
/*
 * Mirror the operational state of @rootdev onto @dev.
 * The dormant state follows rootdev->operstate == IF_OPER_DORMANT, and
 * the carrier state follows netif_carrier_ok(rootdev); the carrier of
 * @dev is only switched when it differs from the wanted state.
 */
5259 void netif_stacked_transfer_operstate(const struct net_device
*rootdev
,
5260 struct net_device
*dev
)
5262 if (rootdev
->operstate
== IF_OPER_DORMANT
)
5263 netif_dormant_on(dev
);
5265 netif_dormant_off(dev
);
5267 if (netif_carrier_ok(rootdev
)) {
5268 if (!netif_carrier_ok(dev
))
5269 netif_carrier_on(dev
);
5271 if (netif_carrier_ok(dev
))
5272 netif_carrier_off(dev
);
5275 EXPORT_SYMBOL(netif_stacked_transfer_operstate
);
/*
 * Allocate dev->num_rx_queues zeroed struct netdev_rx_queue entries
 * with kcalloc(GFP_KERNEL); an allocation failure is reported with
 * pr_err().  A loop over all count entries follows.
 * NOTE(review): the per-queue initialisation in that loop is not
 * visible here.
 */
5278 static int netif_alloc_rx_queues(struct net_device
*dev
)
5280 unsigned int i
, count
= dev
->num_rx_queues
;
5281 struct netdev_rx_queue
*rx
;
5285 rx
= kcalloc(count
, sizeof(struct netdev_rx_queue
), GFP_KERNEL
);
5287 pr_err("netdev: Unable to allocate %u rx queues.\n", count
);
5292 for (i
= 0; i
< count
; i
++)
/*
 * Per-queue initialiser passed to netdev_for_each_tx_queue() by
 * netif_alloc_netdev_queues(): sets up the queue's _xmit_lock (with a
 * lockdep class chosen from dev->type), marks the lock as unowned
 * (xmit_lock_owner = -1) and records NUMA_NO_NODE as the queue's node.
 */
5298 static void netdev_init_one_queue(struct net_device
*dev
,
5299 struct netdev_queue
*queue
, void *_unused
)
5301 /* Initialize queue lock */
5302 spin_lock_init(&queue
->_xmit_lock
);
5303 netdev_set_xmit_lockdep_class(&queue
->_xmit_lock
, dev
->type
);
5304 queue
->xmit_lock_owner
= -1;
5305 netdev_queue_numa_node_write(queue
, NUMA_NO_NODE
);
/*
 * Allocate dev->num_tx_queues zeroed struct netdev_queue entries with
 * kcalloc(GFP_KERNEL); an allocation failure is reported with
 * pr_err().  Every TX queue is then initialised through
 * netdev_init_one_queue() and dev->tx_global_lock is set up.
 */
5309 static int netif_alloc_netdev_queues(struct net_device
*dev
)
5311 unsigned int count
= dev
->num_tx_queues
;
5312 struct netdev_queue
*tx
;
5316 tx
= kcalloc(count
, sizeof(struct netdev_queue
), GFP_KERNEL
);
5318 pr_err("netdev: Unable to allocate %u tx queues.\n",
5324 netdev_for_each_tx_queue(dev
, netdev_init_one_queue
, NULL
);
5325 spin_lock_init(&dev
->tx_global_lock
);
5331 * register_netdevice - register a network device
5332 * @dev: device to register
5334 * Take a completed network device structure and add it to the kernel
5335 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5336 * chain. 0 is returned on success. A negative errno code is returned
5337 * on a failure to set up the device, or if the name is a duplicate.
5339 * Callers must hold the rtnl semaphore. You may want
5340 * register_netdev() instead of this.
5343 * The locking appears insufficient to guarantee two parallel registers
5344 * will not get the same name.
/*
 * Register @dev with the networking core.  Steps visible here, in
 * order:
 *   - sanity checks: not during boot phase, reg_state must be
 *     NETREG_UNINITIALIZED;
 *   - address-list lock and its lockdep class are initialised;
 *   - the driver's optional ndo_init runs;
 *   - the name is validated with dev_get_valid_name();
 *   - an ifindex is assigned, and iflink defaults to it when it is -1;
 *   - NETIF_F_SOFT_FEATURES are added to hw_features and features,
 *     wanted_features is derived from both, GSO is dropped when SG is
 *     not wanted, and GRO/HIGHDMA are added to vlan_features;
 *   - NETDEV_POST_INIT is sent and its result converted to an errno;
 *   - the kobject is registered and reg_state becomes
 *     NETREG_REGISTERED;
 *   - features are recomputed, __LINK_STATE_PRESENT is set, the
 *     scheduler is initialised and the device is listed;
 *   - NETDEV_REGISTER is sent; rollback_registered() and
 *     NETREG_UNREGISTERED undo the registration (presumably when a
 *     notifier reports an error -- confirm);
 *   - RTM_NEWLINK is sent unless an rtnl_link_ops device is not yet
 *     RTNL_LINK_INITIALIZED.
 * The trailing ndo_uninit call belongs to the error-unwind path
 * (presumably -- the labels are not visible here).
 */
5347 int register_netdevice(struct net_device
*dev
)
5350 struct net
*net
= dev_net(dev
);
5352 BUG_ON(dev_boot_phase
);
5357 /* When net_device's are persistent, this will be fatal. */
5358 BUG_ON(dev
->reg_state
!= NETREG_UNINITIALIZED
);
5361 spin_lock_init(&dev
->addr_list_lock
);
5362 netdev_set_addr_lockdep_class(dev
);
5366 /* Init, if this function is available */
5367 if (dev
->netdev_ops
->ndo_init
) {
5368 ret
= dev
->netdev_ops
->ndo_init(dev
);
5376 ret
= dev_get_valid_name(dev
, dev
->name
, 0);
5380 dev
->ifindex
= dev_new_index(net
);
5381 if (dev
->iflink
== -1)
5382 dev
->iflink
= dev
->ifindex
;
5384 /* Transfer changeable features to wanted_features and enable
5385 * software offloads (GSO and GRO).
5387 dev
->hw_features
|= NETIF_F_SOFT_FEATURES
;
5388 dev
->features
|= NETIF_F_SOFT_FEATURES
;
5389 dev
->wanted_features
= dev
->features
& dev
->hw_features
;
5391 /* Avoid warning from netdev_fix_features() for GSO without SG */
5392 if (!(dev
->wanted_features
& NETIF_F_SG
)) {
5393 dev
->wanted_features
&= ~NETIF_F_GSO
;
5394 dev
->features
&= ~NETIF_F_GSO
;
5397 /* Enable GRO and NETIF_F_HIGHDMA for vlans by default,
5398 * vlan_dev_init() will do the dev->features check, so these features
5399 * are enabled only if supported by underlying device.
5401 dev
->vlan_features
|= (NETIF_F_GRO
| NETIF_F_HIGHDMA
);
5403 ret
= call_netdevice_notifiers(NETDEV_POST_INIT
, dev
);
5404 ret
= notifier_to_errno(ret
);
5408 ret
= netdev_register_kobject(dev
);
5411 dev
->reg_state
= NETREG_REGISTERED
;
5413 netdev_update_features(dev
);
5416 * Default initial state at registry is that the
5417 * device is present.
5420 set_bit(__LINK_STATE_PRESENT
, &dev
->state
);
5422 dev_init_scheduler(dev
);
5424 list_netdevice(dev
);
5426 /* Notify protocols, that a new device appeared. */
5427 ret
= call_netdevice_notifiers(NETDEV_REGISTER
, dev
);
5428 ret
= notifier_to_errno(ret
);
5430 rollback_registered(dev
);
5431 dev
->reg_state
= NETREG_UNREGISTERED
;
5434 * Prevent userspace races by waiting until the network
5435 * device is fully setup before sending notifications.
5437 if (!dev
->rtnl_link_ops
||
5438 dev
->rtnl_link_state
== RTNL_LINK_INITIALIZED
)
5439 rtmsg_ifinfo(RTM_NEWLINK
, dev
, ~0U);
5445 if (dev
->netdev_ops
->ndo_uninit
)
5446 dev
->netdev_ops
->ndo_uninit(dev
);
5449 EXPORT_SYMBOL(register_netdevice
);
5452 * init_dummy_netdev - init a dummy network device for NAPI
5453 * @dev: device to init
5455 * This takes a network device structure and initialize the minimum
5456 * amount of fields so it can be used to schedule NAPI polls without
5457 * registering a full blown interface. This is to be used by drivers
5458 * that need to tie several hardware interfaces to a single NAPI
5459 * poll scheduler due to HW limitations.
5461 int init_dummy_netdev(struct net_device
*dev
)
5463 /* Clear everything. Note we don't initialize spinlocks
5464 * are they aren't supposed to be taken by any of the
5465 * NAPI code and this dummy netdev is supposed to be
5466 * only ever used for NAPI polls
5468 memset(dev
, 0, sizeof(struct net_device
));
5470 /* make sure we BUG if trying to hit standard
5471 * register/unregister code path
5473 dev
->reg_state
= NETREG_DUMMY
;
5475 /* NAPI wants this */
5476 INIT_LIST_HEAD(&dev
->napi_list
);
5478 /* a dummy interface is started by default */
5479 set_bit(__LINK_STATE_PRESENT
, &dev
->state
);
5480 set_bit(__LINK_STATE_START
, &dev
->state
);
5482 /* Note : We dont allocate pcpu_refcnt for dummy devices,
5483 * because users of this 'device' dont need to change
5489 EXPORT_SYMBOL_GPL(init_dummy_netdev
);
5493 * register_netdev - register a network device
5494 * @dev: device to register
5496 * Take a completed network device structure and add it to the kernel
5497 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5498 * chain. 0 is returned on success. A negative errno code is returned
5499 * on a failure to set up the device, or if the name is a duplicate.
5501 * This is a wrapper around register_netdevice that takes the rtnl semaphore
5502 * and expands the device name if you passed a format string to
/*
 * Wrapper around register_netdevice().  A device name containing '%'
 * is treated as a format string and expanded with dev_alloc_name()
 * before the device is registered.
 */
5505 int register_netdev(struct net_device
*dev
)
5512 * If the name is a format string the caller wants us to do a
5515 if (strchr(dev
->name
, '%')) {
5516 err
= dev_alloc_name(dev
, dev
->name
);
5521 err
= register_netdevice(dev
);
5526 EXPORT_SYMBOL(register_netdev
);
/*
 * Accumulate the per-CPU reference counters of @dev: refcnt gathers
 * *per_cpu_ptr(dev->pcpu_refcnt, i) over every possible CPU.
 */
5528 int netdev_refcnt_read(const struct net_device
*dev
)
5532 for_each_possible_cpu(i
)
5533 refcnt
+= *per_cpu_ptr(dev
->pcpu_refcnt
, i
);
5536 EXPORT_SYMBOL(netdev_refcnt_read
);
5539 * netdev_wait_allrefs - wait until all references are gone.
5541 * This is called when unregistering network devices.
5543 * Any protocol or device that holds a reference should register
5544 * for netdevice notification, and cleanup and put back the
5545 * reference if they receive an UNREGISTER event.
5546 * We can get stuck here if buggy protocols don't correctly
5549 static void netdev_wait_allrefs(struct net_device
*dev
)
5551 unsigned long rebroadcast_time
, warning_time
;
5554 linkwatch_forget_dev(dev
);
5556 rebroadcast_time
= warning_time
= jiffies
;
5557 refcnt
= netdev_refcnt_read(dev
);
5559 while (refcnt
!= 0) {
5560 if (time_after(jiffies
, rebroadcast_time
+ 1 * HZ
)) {
5563 /* Rebroadcast unregister notification */
5564 call_netdevice_notifiers(NETDEV_UNREGISTER
, dev
);
5565 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5566 * should have already handled it the first time */
5568 if (test_bit(__LINK_STATE_LINKWATCH_PENDING
,
5570 /* We must not have linkwatch events
5571 * pending on unregister. If this
5572 * happens, we simply run the queue
5573 * unscheduled, resulting in a noop
5576 linkwatch_run_queue();
5581 rebroadcast_time
= jiffies
;
5586 refcnt
= netdev_refcnt_read(dev
);
5588 if (time_after(jiffies
, warning_time
+ 10 * HZ
)) {
5589 printk(KERN_EMERG
"unregister_netdevice: "
5590 "waiting for %s to become free. Usage "
5593 warning_time
= jiffies
;
5602 * register_netdevice(x1);
5603 * register_netdevice(x2);
5605 * unregister_netdevice(y1);
5606 * unregister_netdevice(y2);
5612 * We are invoked by rtnl_unlock().
5613 * This allows us to deal with problems:
5614 * 1) We can delete sysfs objects which invoke hotplug
5615 * without deadlocking with linkwatch via keventd.
5616 * 2) Since we run with the RTNL semaphore not held, we can sleep
5617 * safely in order to wait for the netdev refcnt to drop to zero.
5619 * We must not return until all unregister events added during
5620 * the interval the lock was held have been completed.
5622 void netdev_run_todo(void)
5624 struct list_head list
;
5626 /* Snapshot list, allow later requests */
5627 list_replace_init(&net_todo_list
, &list
);
5631 while (!list_empty(&list
)) {
5632 struct net_device
*dev
5633 = list_first_entry(&list
, struct net_device
, todo_list
);
5634 list_del(&dev
->todo_list
);
5636 if (unlikely(dev
->reg_state
!= NETREG_UNREGISTERING
)) {
5637 printk(KERN_ERR
"network todo '%s' but state %d\n",
5638 dev
->name
, dev
->reg_state
);
5643 dev
->reg_state
= NETREG_UNREGISTERED
;
5645 on_each_cpu(flush_backlog
, dev
, 1);
5647 netdev_wait_allrefs(dev
);
5650 BUG_ON(netdev_refcnt_read(dev
));
5651 WARN_ON(rcu_dereference_raw(dev
->ip_ptr
));
5652 WARN_ON(rcu_dereference_raw(dev
->ip6_ptr
));
5653 WARN_ON(dev
->dn_ptr
);
5655 if (dev
->destructor
)
5656 dev
->destructor(dev
);
5658 /* Free network device */
5659 kobject_put(&dev
->dev
.kobj
);
5663 /* Convert net_device_stats to rtnl_link_stats64. They have the same
5664 * fields in the same order, with only the type differing.
5666 static void netdev_stats_to_stats64(struct rtnl_link_stats64
*stats64
,
5667 const struct net_device_stats
*netdev_stats
)
5669 #if BITS_PER_LONG == 64
5670 BUILD_BUG_ON(sizeof(*stats64
) != sizeof(*netdev_stats
));
5671 memcpy(stats64
, netdev_stats
, sizeof(*stats64
));
5673 size_t i
, n
= sizeof(*stats64
) / sizeof(u64
);
5674 const unsigned long *src
= (const unsigned long *)netdev_stats
;
5675 u64
*dst
= (u64
*)stats64
;
5677 BUILD_BUG_ON(sizeof(*netdev_stats
) / sizeof(unsigned long) !=
5678 sizeof(*stats64
) / sizeof(u64
));
5679 for (i
= 0; i
< n
; i
++)
5685 * dev_get_stats - get network device statistics
5686 * @dev: device to get statistics from
5687 * @storage: place to store stats
5689 * Get network statistics from device. Return @storage.
5690 * The device driver may provide its own method by setting
5691 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5692 * otherwise the internal statistics structure is used.
5694 struct rtnl_link_stats64
*dev_get_stats(struct net_device
*dev
,
5695 struct rtnl_link_stats64
*storage
)
5697 const struct net_device_ops
*ops
= dev
->netdev_ops
;
5699 if (ops
->ndo_get_stats64
) {
5700 memset(storage
, 0, sizeof(*storage
));
5701 ops
->ndo_get_stats64(dev
, storage
);
5702 } else if (ops
->ndo_get_stats
) {
5703 netdev_stats_to_stats64(storage
, ops
->ndo_get_stats(dev
));
5705 netdev_stats_to_stats64(storage
, &dev
->stats
);
5707 storage
->rx_dropped
+= atomic_long_read(&dev
->rx_dropped
);
5710 EXPORT_SYMBOL(dev_get_stats
);
5712 struct netdev_queue
*dev_ingress_queue_create(struct net_device
*dev
)
5714 struct netdev_queue
*queue
= dev_ingress_queue(dev
);
5716 #ifdef CONFIG_NET_CLS_ACT
5719 queue
= kzalloc(sizeof(*queue
), GFP_KERNEL
);
5722 netdev_init_one_queue(dev
, queue
, NULL
);
5723 queue
->qdisc
= &noop_qdisc
;
5724 queue
->qdisc_sleeping
= &noop_qdisc
;
5725 rcu_assign_pointer(dev
->ingress_queue
, queue
);
5731 * alloc_netdev_mqs - allocate network device
5732 * @sizeof_priv: size of private data to allocate space for
5733 * @name: device name format string
5734 * @setup: callback to initialize device
5735 * @txqs: the number of TX subqueues to allocate
5736 * @rxqs: the number of RX subqueues to allocate
5738 * Allocates a struct net_device with private data area for driver use
5739 * and performs basic initialization. Also allocates subqueue structs
5740 * for each queue on the device.
5742 struct net_device
*alloc_netdev_mqs(int sizeof_priv
, const char *name
,
5743 void (*setup
)(struct net_device
*),
5744 unsigned int txqs
, unsigned int rxqs
)
5746 struct net_device
*dev
;
5748 struct net_device
*p
;
5750 BUG_ON(strlen(name
) >= sizeof(dev
->name
));
5753 pr_err("alloc_netdev: Unable to allocate device "
5754 "with zero queues.\n");
5760 pr_err("alloc_netdev: Unable to allocate device "
5761 "with zero RX queues.\n");
5766 alloc_size
= sizeof(struct net_device
);
5768 /* ensure 32-byte alignment of private area */
5769 alloc_size
= ALIGN(alloc_size
, NETDEV_ALIGN
);
5770 alloc_size
+= sizeof_priv
;
5772 /* ensure 32-byte alignment of whole construct */
5773 alloc_size
+= NETDEV_ALIGN
- 1;
5775 p
= kzalloc(alloc_size
, GFP_KERNEL
);
5777 printk(KERN_ERR
"alloc_netdev: Unable to allocate device.\n");
5781 dev
= PTR_ALIGN(p
, NETDEV_ALIGN
);
5782 dev
->padded
= (char *)dev
- (char *)p
;
5784 dev
->pcpu_refcnt
= alloc_percpu(int);
5785 if (!dev
->pcpu_refcnt
)
5788 if (dev_addr_init(dev
))
5794 dev_net_set(dev
, &init_net
);
5796 dev
->gso_max_size
= GSO_MAX_SIZE
;
5798 INIT_LIST_HEAD(&dev
->ethtool_ntuple_list
.list
);
5799 dev
->ethtool_ntuple_list
.count
= 0;
5800 INIT_LIST_HEAD(&dev
->napi_list
);
5801 INIT_LIST_HEAD(&dev
->unreg_list
);
5802 INIT_LIST_HEAD(&dev
->link_watch_list
);
5803 dev
->priv_flags
= IFF_XMIT_DST_RELEASE
;
5806 dev
->num_tx_queues
= txqs
;
5807 dev
->real_num_tx_queues
= txqs
;
5808 if (netif_alloc_netdev_queues(dev
))
5812 dev
->num_rx_queues
= rxqs
;
5813 dev
->real_num_rx_queues
= rxqs
;
5814 if (netif_alloc_rx_queues(dev
))
5818 strcpy(dev
->name
, name
);
5819 dev
->group
= INIT_NETDEV_GROUP
;
5827 free_percpu(dev
->pcpu_refcnt
);
5837 EXPORT_SYMBOL(alloc_netdev_mqs
);
5840 * free_netdev - free network device
5843 * This function does the last stage of destroying an allocated device
5844 * interface. The reference to the device object is released.
5845 * If this is the last reference then it will be freed.
5847 void free_netdev(struct net_device
*dev
)
5849 struct napi_struct
*p
, *n
;
5851 release_net(dev_net(dev
));
5858 kfree(rcu_dereference_raw(dev
->ingress_queue
));
5860 /* Flush device addresses */
5861 dev_addr_flush(dev
);
5863 /* Clear ethtool n-tuple list */
5864 ethtool_ntuple_flush(dev
);
5866 list_for_each_entry_safe(p
, n
, &dev
->napi_list
, dev_list
)
5869 free_percpu(dev
->pcpu_refcnt
);
5870 dev
->pcpu_refcnt
= NULL
;
5872 /* Compatibility with error handling in drivers */
5873 if (dev
->reg_state
== NETREG_UNINITIALIZED
) {
5874 kfree((char *)dev
- dev
->padded
);
5878 BUG_ON(dev
->reg_state
!= NETREG_UNREGISTERED
);
5879 dev
->reg_state
= NETREG_RELEASED
;
5881 /* will free via device release */
5882 put_device(&dev
->dev
);
5884 EXPORT_SYMBOL(free_netdev
);
5887 * synchronize_net - Synchronize with packet receive processing
5889 * Wait for packets currently being received to be done.
5890 * Does not block later packets from starting.
5892 void synchronize_net(void)
5897 EXPORT_SYMBOL(synchronize_net
);
5900 * unregister_netdevice_queue - remove device from the kernel
5904 * This function shuts down a device interface and removes it
5905 * from the kernel tables.
5906 * If head not NULL, device is queued to be unregistered later.
5908 * Callers must hold the rtnl semaphore. You may want
5909 * unregister_netdev() instead of this.
5912 void unregister_netdevice_queue(struct net_device
*dev
, struct list_head
*head
)
5917 list_move_tail(&dev
->unreg_list
, head
);
5919 rollback_registered(dev
);
5920 /* Finish processing unregister after unlock */
5924 EXPORT_SYMBOL(unregister_netdevice_queue
);
5927 * unregister_netdevice_many - unregister many devices
5928 * @head: list of devices
5930 void unregister_netdevice_many(struct list_head
*head
)
5932 struct net_device
*dev
;
5934 if (!list_empty(head
)) {
5935 rollback_registered_many(head
);
5936 list_for_each_entry(dev
, head
, unreg_list
)
5940 EXPORT_SYMBOL(unregister_netdevice_many
);
5943 * unregister_netdev - remove device from the kernel
5946 * This function shuts down a device interface and removes it
5947 * from the kernel tables.
5949 * This is just a wrapper for unregister_netdevice that takes
5950 * the rtnl semaphore. In general you want to use this and not
5951 * unregister_netdevice.
5953 void unregister_netdev(struct net_device
*dev
)
5956 unregister_netdevice(dev
);
5959 EXPORT_SYMBOL(unregister_netdev
);
5962 * dev_change_net_namespace - move device to a different network namespace
5964 * @net: network namespace
5965 * @pat: If not NULL name pattern to try if the current device name
5966 * is already taken in the destination network namespace.
5968 * This function shuts down a device interface and moves it
5969 * to a new network namespace. On success 0 is returned, on
5970 * a failure a negative errno code is returned.
5972 * Callers must hold the rtnl semaphore.
5975 int dev_change_net_namespace(struct net_device
*dev
, struct net
*net
, const char *pat
)
5981 /* Don't allow namespace local devices to be moved. */
5983 if (dev
->features
& NETIF_F_NETNS_LOCAL
)
5986 /* Ensure the device has been registered */
5988 if (dev
->reg_state
!= NETREG_REGISTERED
)
5991 /* Get out if there is nothing to do */
5993 if (net_eq(dev_net(dev
), net
))
5996 /* Pick the destination device name, and ensure
5997 * we can use it in the destination network namespace.
6000 if (__dev_get_by_name(net
, dev
->name
)) {
6001 /* We get here if we can't use the current device name */
6004 if (dev_get_valid_name(dev
, pat
, 1))
6009 * And now a mini version of register_netdevice unregister_netdevice.
6012 /* If device is running close it first. */
6015 /* And unlink it from device chain */
6017 unlist_netdevice(dev
);
6021 /* Shutdown queueing discipline. */
6024 /* Notify protocols, that we are about to destroy
6025 this device. They should clean all the things.
6027 Note that dev->reg_state stays at NETREG_REGISTERED.
6028 This is wanted because this way 8021q and macvlan know
6029 the device is just moving and can keep their slaves up.
6031 call_netdevice_notifiers(NETDEV_UNREGISTER
, dev
);
6032 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH
, dev
);
6035 * Flush the unicast and multicast chains
6040 /* Actually switch the network namespace */
6041 dev_net_set(dev
, net
);
6043 /* If there is an ifindex conflict assign a new one */
6044 if (__dev_get_by_index(net
, dev
->ifindex
)) {
6045 int iflink
= (dev
->iflink
== dev
->ifindex
);
6046 dev
->ifindex
= dev_new_index(net
);
6048 dev
->iflink
= dev
->ifindex
;
6051 /* Fixup kobjects */
6052 err
= device_rename(&dev
->dev
, dev
->name
);
6055 /* Add the device back in the hashes */
6056 list_netdevice(dev
);
6058 /* Notify protocols, that a new device appeared. */
6059 call_netdevice_notifiers(NETDEV_REGISTER
, dev
);
6062 * Prevent userspace races by waiting until the network
6063 * device is fully setup before sending notifications.
6065 rtmsg_ifinfo(RTM_NEWLINK
, dev
, ~0U);
6072 EXPORT_SYMBOL_GPL(dev_change_net_namespace
);
6074 static int dev_cpu_callback(struct notifier_block
*nfb
,
6075 unsigned long action
,
6078 struct sk_buff
**list_skb
;
6079 struct sk_buff
*skb
;
6080 unsigned int cpu
, oldcpu
= (unsigned long)ocpu
;
6081 struct softnet_data
*sd
, *oldsd
;
6083 if (action
!= CPU_DEAD
&& action
!= CPU_DEAD_FROZEN
)
6086 local_irq_disable();
6087 cpu
= smp_processor_id();
6088 sd
= &per_cpu(softnet_data
, cpu
);
6089 oldsd
= &per_cpu(softnet_data
, oldcpu
);
6091 /* Find end of our completion_queue. */
6092 list_skb
= &sd
->completion_queue
;
6094 list_skb
= &(*list_skb
)->next
;
6095 /* Append completion queue from offline CPU. */
6096 *list_skb
= oldsd
->completion_queue
;
6097 oldsd
->completion_queue
= NULL
;
6099 /* Append output queue from offline CPU. */
6100 if (oldsd
->output_queue
) {
6101 *sd
->output_queue_tailp
= oldsd
->output_queue
;
6102 sd
->output_queue_tailp
= oldsd
->output_queue_tailp
;
6103 oldsd
->output_queue
= NULL
;
6104 oldsd
->output_queue_tailp
= &oldsd
->output_queue
;
6107 raise_softirq_irqoff(NET_TX_SOFTIRQ
);
6110 /* Process offline CPU's input_pkt_queue */
6111 while ((skb
= __skb_dequeue(&oldsd
->process_queue
))) {
6113 input_queue_head_incr(oldsd
);
6115 while ((skb
= __skb_dequeue(&oldsd
->input_pkt_queue
))) {
6117 input_queue_head_incr(oldsd
);
6125 * netdev_increment_features - increment feature set by one
6126 * @all: current feature set
6127 * @one: new feature set
6128 * @mask: mask feature set
6130 * Computes a new feature set after adding a device with feature set
6131 * @one to the master device with current feature set @all. Will not
6132 * enable anything that is off in @mask. Returns the new feature set.
6134 u32
netdev_increment_features(u32 all
, u32 one
, u32 mask
)
6136 /* If device needs checksumming, downgrade to it. */
6137 if (all
& NETIF_F_NO_CSUM
&& !(one
& NETIF_F_NO_CSUM
))
6138 all
^= NETIF_F_NO_CSUM
| (one
& NETIF_F_ALL_CSUM
);
6139 else if (mask
& NETIF_F_ALL_CSUM
) {
6140 /* If one device supports v4/v6 checksumming, set for all. */
6141 if (one
& (NETIF_F_IP_CSUM
| NETIF_F_IPV6_CSUM
) &&
6142 !(all
& NETIF_F_GEN_CSUM
)) {
6143 all
&= ~NETIF_F_ALL_CSUM
;
6144 all
|= one
& (NETIF_F_IP_CSUM
| NETIF_F_IPV6_CSUM
);
6147 /* If one device supports hw checksumming, set for all. */
6148 if (one
& NETIF_F_GEN_CSUM
&& !(all
& NETIF_F_GEN_CSUM
)) {
6149 all
&= ~NETIF_F_ALL_CSUM
;
6150 all
|= NETIF_F_HW_CSUM
;
6154 one
|= NETIF_F_ALL_CSUM
;
6156 one
|= all
& NETIF_F_ONE_FOR_ALL
;
6157 all
&= one
| NETIF_F_LLTX
| NETIF_F_GSO
| NETIF_F_UFO
;
6158 all
|= one
& mask
& NETIF_F_ONE_FOR_ALL
;
6162 EXPORT_SYMBOL(netdev_increment_features
);
6164 static struct hlist_head
*netdev_create_hash(void)
6167 struct hlist_head
*hash
;
6169 hash
= kmalloc(sizeof(*hash
) * NETDEV_HASHENTRIES
, GFP_KERNEL
);
6171 for (i
= 0; i
< NETDEV_HASHENTRIES
; i
++)
6172 INIT_HLIST_HEAD(&hash
[i
]);
6177 /* Initialize per network namespace state */
6178 static int __net_init
netdev_init(struct net
*net
)
6180 INIT_LIST_HEAD(&net
->dev_base_head
);
6182 net
->dev_name_head
= netdev_create_hash();
6183 if (net
->dev_name_head
== NULL
)
6186 net
->dev_index_head
= netdev_create_hash();
6187 if (net
->dev_index_head
== NULL
)
6193 kfree(net
->dev_name_head
);
6199 * netdev_drivername - network driver for the device
6200 * @dev: network device
6201 * @buffer: buffer for resulting name
6202 * @len: size of buffer
6204 * Determine network driver for device.
6206 char *netdev_drivername(const struct net_device
*dev
, char *buffer
, int len
)
6208 const struct device_driver
*driver
;
6209 const struct device
*parent
;
6211 if (len
<= 0 || !buffer
)
6215 parent
= dev
->dev
.parent
;
6220 driver
= parent
->driver
;
6221 if (driver
&& driver
->name
)
6222 strlcpy(buffer
, driver
->name
, len
);
6226 static int __netdev_printk(const char *level
, const struct net_device
*dev
,
6227 struct va_format
*vaf
)
6231 if (dev
&& dev
->dev
.parent
)
6232 r
= dev_printk(level
, dev
->dev
.parent
, "%s: %pV",
6233 netdev_name(dev
), vaf
);
6235 r
= printk("%s%s: %pV", level
, netdev_name(dev
), vaf
);
6237 r
= printk("%s(NULL net_device): %pV", level
, vaf
);
6242 int netdev_printk(const char *level
, const struct net_device
*dev
,
6243 const char *format
, ...)
6245 struct va_format vaf
;
6249 va_start(args
, format
);
6254 r
= __netdev_printk(level
, dev
, &vaf
);
6259 EXPORT_SYMBOL(netdev_printk
);
6261 #define define_netdev_printk_level(func, level) \
6262 int func(const struct net_device *dev, const char *fmt, ...) \
6265 struct va_format vaf; \
6268 va_start(args, fmt); \
6273 r = __netdev_printk(level, dev, &vaf); \
6278 EXPORT_SYMBOL(func);
6280 define_netdev_printk_level(netdev_emerg
, KERN_EMERG
);
6281 define_netdev_printk_level(netdev_alert
, KERN_ALERT
);
6282 define_netdev_printk_level(netdev_crit
, KERN_CRIT
);
6283 define_netdev_printk_level(netdev_err
, KERN_ERR
);
6284 define_netdev_printk_level(netdev_warn
, KERN_WARNING
);
6285 define_netdev_printk_level(netdev_notice
, KERN_NOTICE
);
6286 define_netdev_printk_level(netdev_info
, KERN_INFO
);
6288 static void __net_exit
netdev_exit(struct net
*net
)
6290 kfree(net
->dev_name_head
);
6291 kfree(net
->dev_index_head
);
6294 static struct pernet_operations __net_initdata netdev_net_ops
= {
6295 .init
= netdev_init
,
6296 .exit
= netdev_exit
,
6299 static void __net_exit
default_device_exit(struct net
*net
)
6301 struct net_device
*dev
, *aux
;
6303 * Push all migratable network devices back to the
6304 * initial network namespace
6307 for_each_netdev_safe(net
, dev
, aux
) {
6309 char fb_name
[IFNAMSIZ
];
6311 /* Ignore unmoveable devices (i.e. loopback) */
6312 if (dev
->features
& NETIF_F_NETNS_LOCAL
)
6315 /* Leave virtual devices for the generic cleanup */
6316 if (dev
->rtnl_link_ops
)
6319 /* Push remaining network devices to init_net */
6320 snprintf(fb_name
, IFNAMSIZ
, "dev%d", dev
->ifindex
);
6321 err
= dev_change_net_namespace(dev
, &init_net
, fb_name
);
6323 printk(KERN_EMERG
"%s: failed to move %s to init_net: %d\n",
6324 __func__
, dev
->name
, err
);
6331 static void __net_exit
default_device_exit_batch(struct list_head
*net_list
)
6333 /* At exit all network devices must be removed from a network
6334 * namespace. Do this in the reverse order of registration.
6335 * Do this across as many network namespaces as possible to
6336 * improve batching efficiency.
6338 struct net_device
*dev
;
6340 LIST_HEAD(dev_kill_list
);
6343 list_for_each_entry(net
, net_list
, exit_list
) {
6344 for_each_netdev_reverse(net
, dev
) {
6345 if (dev
->rtnl_link_ops
)
6346 dev
->rtnl_link_ops
->dellink(dev
, &dev_kill_list
);
6348 unregister_netdevice_queue(dev
, &dev_kill_list
);
6351 unregister_netdevice_many(&dev_kill_list
);
6352 list_del(&dev_kill_list
);
6356 static struct pernet_operations __net_initdata default_device_ops
= {
6357 .exit
= default_device_exit
,
6358 .exit_batch
= default_device_exit_batch
,
6362 * Initialize the DEV module. At boot time this walks the device list and
6363 * unhooks any devices that fail to initialise (normally hardware not
6364 * present) and leaves us with a valid list of present and active devices.
6369 * This is called single threaded during boot, so no need
6370 * to take the rtnl semaphore.
6372 static int __init
net_dev_init(void)
6374 int i
, rc
= -ENOMEM
;
6376 BUG_ON(!dev_boot_phase
);
6378 if (dev_proc_init())
6381 if (netdev_kobject_init())
6384 INIT_LIST_HEAD(&ptype_all
);
6385 for (i
= 0; i
< PTYPE_HASH_SIZE
; i
++)
6386 INIT_LIST_HEAD(&ptype_base
[i
]);
6388 if (register_pernet_subsys(&netdev_net_ops
))
6392 * Initialise the packet receive queues.
6395 for_each_possible_cpu(i
) {
6396 struct softnet_data
*sd
= &per_cpu(softnet_data
, i
);
6398 memset(sd
, 0, sizeof(*sd
));
6399 skb_queue_head_init(&sd
->input_pkt_queue
);
6400 skb_queue_head_init(&sd
->process_queue
);
6401 sd
->completion_queue
= NULL
;
6402 INIT_LIST_HEAD(&sd
->poll_list
);
6403 sd
->output_queue
= NULL
;
6404 sd
->output_queue_tailp
= &sd
->output_queue
;
6406 sd
->csd
.func
= rps_trigger_softirq
;
6412 sd
->backlog
.poll
= process_backlog
;
6413 sd
->backlog
.weight
= weight_p
;
6414 sd
->backlog
.gro_list
= NULL
;
6415 sd
->backlog
.gro_count
= 0;
6420 /* The loopback device is special: if any other network device
6421 * is present in a network namespace, the loopback device must
6422 * be present. Since we now dynamically allocate and free the
6423 * loopback device ensure this invariant is maintained by
6424 * keeping the loopback device as the first device on the
6425 * list of network devices. Ensuring the loopback device
6426 * is the first device that appears and the last network device
6429 if (register_pernet_device(&loopback_net_ops
))
6432 if (register_pernet_device(&default_device_ops
))
6435 open_softirq(NET_TX_SOFTIRQ
, net_tx_action
);
6436 open_softirq(NET_RX_SOFTIRQ
, net_rx_action
);
6438 hotcpu_notifier(dev_cpu_callback
, 0);
6446 subsys_initcall(net_dev_init
);
6448 static int __init
initialize_hashrnd(void)
6450 get_random_bytes(&hashrnd
, sizeof(hashrnd
));
6454 late_initcall_sync(initialize_hashrnd
);