/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell :	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;
	else
		return pt->dev ? &pt->dev->ptype_specific :
				 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

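/*
 * Illustrative sketch (not part of the original file): a minimal tap-style
 * handler registered with dev_add_pack() and torn down with
 * dev_remove_pack(), roughly the pattern protocol modules follow.  The
 * handler name, the module hooks and the use of ETH_P_ALL are assumptions
 * made for this example only.
 */
#if 0
static int example_pt_rcv(struct sk_buff *skb, struct net_device *dev,
			  struct packet_type *pt, struct net_device *orig_dev)
{
	/* Inspect and drop; a real handler would hand skb up its stack. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_pt __read_mostly = {
	.type = htons(ETH_P_ALL),	/* tap: see every protocol */
	.func = example_pt_rcv,
};

static int __init example_pt_init(void)
{
	dev_add_pack(&example_pt);
	return 0;
}

static void __exit example_pt_exit(void)
{
	/* dev_remove_pack() sleeps until no CPU can still see the handler */
	dev_remove_pack(&example_pt);
}
#endif
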
/**
 *	__dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);

/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload	 - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

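/*
 * Illustrative sketch (not part of the original file): registering GRO/GSO
 * offload callbacks for a protocol, along the lines of what inet/ipv6 do.
 * The stub names are hypothetical and the callback signatures are
 * assumptions based on this kernel generation's struct offload_callbacks;
 * they differ in other kernel versions.
 */
#if 0
static struct sk_buff *example_gso_segment(struct sk_buff *skb,
					   netdev_features_t features)
{
	return ERR_PTR(-EPROTONOSUPPORT);
}

static struct sk_buff **example_gro_receive(struct sk_buff **head,
					    struct sk_buff *skb)
{
	return NULL;	/* no match: let GRO flush the held packets */
}

static int example_gro_complete(struct sk_buff *skb, int nhoff)
{
	return 0;
}

static struct packet_offload example_offload __read_mostly = {
	.type = htons(ETH_P_IP),
	.callbacks = {
		.gso_segment = example_gso_segment,
		.gro_receive = example_gro_receive,
		.gro_complete = example_gro_complete,
	},
};

static int __init example_offload_init(void)
{
	dev_add_offload(&example_offload);
	return 0;
}
#endif
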
/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);

/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

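/*
 * Usage note (illustrative): the string parsed above follows the
 * long-standing "netdev=" boot parameter format of up to four integers
 * followed by the interface name, matching the assignments in
 * netdev_boot_setup():
 *
 *	netdev=<irq>,<base_addr>,<mem_start>,<mem_end>,<name>
 *
 * e.g. a command line containing "netdev=5,0x300,0,0,eth0" would pre-seed
 * eth0 with IRQ 5 and I/O base 0x300.  The example values are made up.
 */
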
/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	dev_get_iflink	- get 'iflink' value of a interface
 *	@dev: targeted interface
 *
 *	Indicates the ifindex the interface is linked to.
 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 */

int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);

/**
 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 *	@dev: targeted interface
 *	@skb: The packet.
 *
 *	For better visibility of tunnel traffic OVS needs to retrieve
 *	egress tunnel information for a packet. Following API allows
 *	user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);

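/*
 * Illustrative sketch (not part of the original file): the reference-counted
 * lookup above is typically paired with dev_put() once the caller is done,
 * e.g. in a hypothetical helper that reads a device's MTU by name.
 */
#if 0
static int example_mtu_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	int mtu;

	dev = dev_get_by_name(net, name);
	if (!dev)
		return -ENODEV;

	mtu = dev->mtu;
	dev_put(dev);	/* drop the reference taken by dev_get_by_name() */
	return mtu;
}
#endif
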
787 * __dev_get_by_index - find a device by its ifindex
788 * @net: the applicable net namespace
789 * @ifindex: index of device
791 * Search for an interface by index. Returns %NULL if the device
792 * is not found or a pointer to the device. The device has not
793 * had its reference counter increased so the caller must be careful
794 * about locking. The caller must hold either the RTNL semaphore
795 * or @dev_base_lock.
798 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
800 struct net_device *dev;
801 struct hlist_head *head = dev_index_hash(net, ifindex);
803 hlist_for_each_entry(dev, head, index_hlist)
804 if (dev->ifindex == ifindex)
805 return dev;
807 return NULL;
809 EXPORT_SYMBOL(__dev_get_by_index);
812 * dev_get_by_index_rcu - find a device by its ifindex
813 * @net: the applicable net namespace
814 * @ifindex: index of device
816 * Search for an interface by index. Returns %NULL if the device
817 * is not found or a pointer to the device. The device has not
818 * had its reference counter increased so the caller must be careful
819 * about locking. The caller must hold RCU lock.
822 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
824 struct net_device *dev;
825 struct hlist_head *head = dev_index_hash(net, ifindex);
827 hlist_for_each_entry_rcu(dev, head, index_hlist)
828 if (dev->ifindex == ifindex)
829 return dev;
831 return NULL;
833 EXPORT_SYMBOL(dev_get_by_index_rcu);
837 * dev_get_by_index - find a device by its ifindex
838 * @net: the applicable net namespace
839 * @ifindex: index of device
841 * Search for an interface by index. Returns NULL if the device
842 * is not found or a pointer to the device. The device returned has
843 * had a reference added and the pointer is safe until the user calls
844 * dev_put to indicate they have finished with it.
847 struct net_device *dev_get_by_index(struct net *net, int ifindex)
849 struct net_device *dev;
851 rcu_read_lock();
852 dev = dev_get_by_index_rcu(net, ifindex);
853 if (dev)
854 dev_hold(dev);
855 rcu_read_unlock();
856 return dev;
858 EXPORT_SYMBOL(dev_get_by_index);
861 * netdev_get_name - get a netdevice name, knowing its ifindex.
862 * @net: network namespace
863 * @name: a pointer to the buffer where the name will be stored.
864 * @ifindex: the ifindex of the interface to get the name from.
866 * The use of raw_seqcount_begin() and cond_resched() before
867 * retrying is required as we want to give the writers a chance
868 * to complete when CONFIG_PREEMPT is not set.
870 int netdev_get_name(struct net *net, char *name, int ifindex)
872 struct net_device *dev;
873 unsigned int seq;
875 retry:
876 seq = raw_seqcount_begin(&devnet_rename_seq);
877 rcu_read_lock();
878 dev = dev_get_by_index_rcu(net, ifindex);
879 if (!dev) {
880 rcu_read_unlock();
881 return -ENODEV;
884 strcpy(name, dev->name);
885 rcu_read_unlock();
886 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
887 cond_resched();
888 goto retry;
891 return 0;
895 * dev_getbyhwaddr_rcu - find a device by its hardware address
896 * @net: the applicable net namespace
897 * @type: media type of device
898 * @ha: hardware address
900 * Search for an interface by MAC address. Returns NULL if the device
901 * is not found or a pointer to the device.
902 * The caller must hold RCU or RTNL.
903 * The returned device has not had its ref count increased
904 * and the caller must therefore be careful about locking
908 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
909 const char *ha)
911 struct net_device *dev;
913 for_each_netdev_rcu(net, dev)
914 if (dev->type == type &&
915 !memcmp(dev->dev_addr, ha, dev->addr_len))
916 return dev;
918 return NULL;
920 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
922 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
924 struct net_device *dev;
926 ASSERT_RTNL();
927 for_each_netdev(net, dev)
928 if (dev->type == type)
929 return dev;
931 return NULL;
933 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
935 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
937 struct net_device *dev, *ret = NULL;
939 rcu_read_lock();
940 for_each_netdev_rcu(net, dev)
941 if (dev->type == type) {
942 dev_hold(dev);
943 ret = dev;
944 break;
946 rcu_read_unlock();
947 return ret;
949 EXPORT_SYMBOL(dev_getfirstbyhwtype);
952 * __dev_get_by_flags - find any device with given flags
953 * @net: the applicable net namespace
954 * @if_flags: IFF_* values
955 * @mask: bitmask of bits in if_flags to check
957 * Search for any interface with the given flags. Returns NULL if a device
958 * is not found or a pointer to the device. Must be called inside
959 * rtnl_lock(), and result refcount is unchanged.
962 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
963 unsigned short mask)
965 struct net_device *dev, *ret;
967 ASSERT_RTNL();
969 ret = NULL;
970 for_each_netdev(net, dev) {
971 if (((dev->flags ^ if_flags) & mask) == 0) {
972 ret = dev;
973 break;
976 return ret;
978 EXPORT_SYMBOL(__dev_get_by_flags);
981 * dev_valid_name - check if name is okay for network device
982 * @name: name string
984 * Network device names need to be valid file names to
985 * to allow sysfs to work. We also disallow any kind of
986 * whitespace.
988 bool dev_valid_name(const char *name)
990 if (*name == '\0')
991 return false;
992 if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
993 return false;
994 if (!strcmp(name, ".") || !strcmp(name, ".."))
995 return false;
997 while (*name) {
998 if (*name == '/' || *name == ':' || isspace(*name))
999 return false;
1000 name++;
1002 return true;
1004 EXPORT_SYMBOL(dev_valid_name);
1007 * __dev_alloc_name - allocate a name for a device
1008 * @net: network namespace to allocate the device name in
1009 * @name: name format string
1010 * @buf: scratch buffer and result name string
1012 * Passed a format string - eg "lt%d" it will try and find a suitable
1013 * id. It scans list of devices to build up a free map, then chooses
1014 * the first empty slot. The caller must hold the dev_base or rtnl lock
1015 * while allocating the name and adding the device in order to avoid
1016 * duplicates.
1017 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1018 * Returns the number of the unit assigned or a negative errno code.
1021 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1023 int i = 0;
1024 const char *p;
1025 const int max_netdevices = 8*PAGE_SIZE;
1026 unsigned long *inuse;
1027 struct net_device *d;
1029 p = strnchr(name, IFNAMSIZ-1, '%');
1030 if (p) {
1032 * Verify the string as this thing may have come from
1033 * the user. There must be either one "%d" and no other "%"
1034 * characters.
1036 if (p[1] != 'd' || strchr(p + 2, '%'))
1037 return -EINVAL;
1039 /* Use one page as a bit array of possible slots */
1040 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1041 if (!inuse)
1042 return -ENOMEM;
1044 for_each_netdev(net, d) {
1045 if (!sscanf(d->name, name, &i))
1046 continue;
1047 if (i < 0 || i >= max_netdevices)
1048 continue;
1050 /* avoid cases where sscanf is not exact inverse of printf */
1051 snprintf(buf, IFNAMSIZ, name, i);
1052 if (!strncmp(buf, d->name, IFNAMSIZ))
1053 set_bit(i, inuse);
1056 i = find_first_zero_bit(inuse, max_netdevices);
1057 free_page((unsigned long) inuse);
1060 if (buf != name)
1061 snprintf(buf, IFNAMSIZ, name, i);
1062 if (!__dev_get_by_name(net, buf))
1063 return i;
1065 /* It is possible to run out of possible slots
1066 * when the name is long and there isn't enough space left
1067 * for the digits, or if all bits are used.
1069 return -ENFILE;
1073 * dev_alloc_name - allocate a name for a device
1074 * @dev: device
1075 * @name: name format string
1077 * Passed a format string - eg "lt%d" it will try and find a suitable
1078 * id. It scans list of devices to build up a free map, then chooses
1079 * the first empty slot. The caller must hold the dev_base or rtnl lock
1080 * while allocating the name and adding the device in order to avoid
1081 * duplicates.
1082 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1083 * Returns the number of the unit assigned or a negative errno code.
1086 int dev_alloc_name(struct net_device *dev, const char *name)
1088 char buf[IFNAMSIZ];
1089 struct net *net;
1090 int ret;
1092 BUG_ON(!dev_net(dev));
1093 net = dev_net(dev);
1094 ret = __dev_alloc_name(net, name, buf);
1095 if (ret >= 0)
1096 strlcpy(dev->name, buf, IFNAMSIZ);
1097 return ret;
1099 EXPORT_SYMBOL(dev_alloc_name);
1101 static int dev_alloc_name_ns(struct net *net,
1102 struct net_device *dev,
1103 const char *name)
1105 char buf[IFNAMSIZ];
1106 int ret;
1108 ret = __dev_alloc_name(net, name, buf);
1109 if (ret >= 0)
1110 strlcpy(dev->name, buf, IFNAMSIZ);
1111 return ret;
1114 int dev_get_valid_name(struct net *net, struct net_device *dev,
1115 const char *name)
1117 BUG_ON(!net);
1119 if (!dev_valid_name(name))
1120 return -EINVAL;
1122 if (strchr(name, '%'))
1123 return dev_alloc_name_ns(net, dev, name);
1124 else if (__dev_get_by_name(net, name))
1125 return -EEXIST;
1126 else if (dev->name != name)
1127 strlcpy(dev->name, name, IFNAMSIZ);
1129 return 0;
1131 EXPORT_SYMBOL(dev_get_valid_name);
1134 * dev_change_name - change name of a device
1135 * @dev: device
1136 * @newname: name (or format string) must be at least IFNAMSIZ
1138 * Change name of a device, can pass format strings "eth%d".
1139 * for wildcarding.
1141 int dev_change_name(struct net_device *dev, const char *newname)
1143 unsigned char old_assign_type;
1144 char oldname[IFNAMSIZ];
1145 int err = 0;
1146 int ret;
1147 struct net *net;
1149 ASSERT_RTNL();
1150 BUG_ON(!dev_net(dev));
1152 net = dev_net(dev);
1153 if (dev->flags & IFF_UP)
1154 return -EBUSY;
1156 write_seqcount_begin(&devnet_rename_seq);
1158 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1159 write_seqcount_end(&devnet_rename_seq);
1160 return 0;
1163 memcpy(oldname, dev->name, IFNAMSIZ);
1165 err = dev_get_valid_name(net, dev, newname);
1166 if (err < 0) {
1167 write_seqcount_end(&devnet_rename_seq);
1168 return err;
1171 if (oldname[0] && !strchr(oldname, '%'))
1172 netdev_info(dev, "renamed from %s\n", oldname);
1174 old_assign_type = dev->name_assign_type;
1175 dev->name_assign_type = NET_NAME_RENAMED;
1177 rollback:
1178 ret = device_rename(&dev->dev, dev->name);
1179 if (ret) {
1180 memcpy(dev->name, oldname, IFNAMSIZ);
1181 dev->name_assign_type = old_assign_type;
1182 write_seqcount_end(&devnet_rename_seq);
1183 return ret;
1186 write_seqcount_end(&devnet_rename_seq);
1188 netdev_adjacent_rename_links(dev, oldname);
1190 write_lock_bh(&dev_base_lock);
1191 hlist_del_rcu(&dev->name_hlist);
1192 write_unlock_bh(&dev_base_lock);
1194 synchronize_rcu();
1196 write_lock_bh(&dev_base_lock);
1197 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1198 write_unlock_bh(&dev_base_lock);
1200 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1201 ret = notifier_to_errno(ret);
1203 if (ret) {
1204 /* err >= 0 after dev_alloc_name() or stores the first errno */
1205 if (err >= 0) {
1206 err = ret;
1207 write_seqcount_begin(&devnet_rename_seq);
1208 memcpy(dev->name, oldname, IFNAMSIZ);
1209 memcpy(oldname, newname, IFNAMSIZ);
1210 dev->name_assign_type = old_assign_type;
1211 old_assign_type = NET_NAME_RENAMED;
1212 goto rollback;
1213 } else {
1214 pr_err("%s: name change rollback failed: %d\n",
1215 dev->name, ret);
1219 return err;
1223 * dev_set_alias - change ifalias of a device
1224 * @dev: device
1225 * @alias: name up to IFALIASZ
1226 * @len: limit of bytes to copy from info
1228 * Set ifalias for a device,
1230 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1232 char *new_ifalias;
1234 ASSERT_RTNL();
1236 if (len >= IFALIASZ)
1237 return -EINVAL;
1239 if (!len) {
1240 kfree(dev->ifalias);
1241 dev->ifalias = NULL;
1242 return 0;
1245 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1246 if (!new_ifalias)
1247 return -ENOMEM;
1248 dev->ifalias = new_ifalias;
1249 memcpy(dev->ifalias, alias, len);
1250 dev->ifalias[len] = 0;
1252 return len;
1257 * netdev_features_change - device changes features
1258 * @dev: device to cause notification
1260 * Called to indicate a device has changed features.
1262 void netdev_features_change(struct net_device *dev)
1264 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1266 EXPORT_SYMBOL(netdev_features_change);
1269 * netdev_state_change - device changes state
1270 * @dev: device to cause notification
1272 * Called to indicate a device has changed state. This function calls
1273 * the notifier chains for netdev_chain and sends a NEWLINK message
1274 * to the routing socket.
1276 void netdev_state_change(struct net_device *dev)
1278 if (dev->flags & IFF_UP) {
1279 struct netdev_notifier_change_info change_info;
1281 change_info.flags_changed = 0;
1282 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1283 &change_info.info);
1284 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287 EXPORT_SYMBOL(netdev_state_change);
1290 * netdev_notify_peers - notify network peers about existence of @dev
1291 * @dev: network device
1293 * Generate traffic such that interested network peers are aware of
1294 * @dev, such as by generating a gratuitous ARP. This may be used when
1295 * a device wants to inform the rest of the network about some sort of
1296 * reconfiguration such as a failover event or virtual machine
1297 * migration.
1299 void netdev_notify_peers(struct net_device *dev)
1301 rtnl_lock();
1302 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1303 call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1304 rtnl_unlock();
1306 EXPORT_SYMBOL(netdev_notify_peers);
1308 static int __dev_open(struct net_device *dev)
1310 const struct net_device_ops *ops = dev->netdev_ops;
1311 int ret;
1313 ASSERT_RTNL();
1315 if (!netif_device_present(dev))
1316 return -ENODEV;
1318 /* Block netpoll from trying to do any rx path servicing.
1319 * If we don't do this there is a chance ndo_poll_controller
1320 * or ndo_poll may be running while we open the device
1322 netpoll_poll_disable(dev);
1324 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1325 ret = notifier_to_errno(ret);
1326 if (ret)
1327 return ret;
1329 set_bit(__LINK_STATE_START, &dev->state);
1331 if (ops->ndo_validate_addr)
1332 ret = ops->ndo_validate_addr(dev);
1334 if (!ret && ops->ndo_open)
1335 ret = ops->ndo_open(dev);
1337 netpoll_poll_enable(dev);
1339 if (ret)
1340 clear_bit(__LINK_STATE_START, &dev->state);
1341 else {
1342 dev->flags |= IFF_UP;
1343 dev_set_rx_mode(dev);
1344 dev_activate(dev);
1345 add_device_randomness(dev->dev_addr, dev->addr_len);
1348 return ret;
1352 * dev_open - prepare an interface for use.
1353 * @dev: device to open
1355 * Takes a device from down to up state. The device's private open
1356 * function is invoked and then the multicast lists are loaded. Finally
1357 * the device is moved into the up state and a %NETDEV_UP message is
1358 * sent to the netdev notifier chain.
1360 * Calling this function on an active interface is a nop. On a failure
1361 * a negative errno code is returned.
1363 int dev_open(struct net_device *dev)
1365 int ret;
1367 if (dev->flags & IFF_UP)
1368 return 0;
1370 ret = __dev_open(dev);
1371 if (ret < 0)
1372 return ret;
1374 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1375 call_netdevice_notifiers(NETDEV_UP, dev);
1377 return ret;
1379 EXPORT_SYMBOL(dev_open);
1381 static int __dev_close_many(struct list_head *head)
1383 struct net_device *dev;
1385 ASSERT_RTNL();
1386 might_sleep();
1388 list_for_each_entry(dev, head, close_list) {
1389 /* Temporarily disable netpoll until the interface is down */
1390 netpoll_poll_disable(dev);
1392 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394 clear_bit(__LINK_STATE_START, &dev->state);
1396 /* Synchronize to scheduled poll. We cannot touch poll list, it
1397 * can be even on different cpu. So just clear netif_running().
1399 * dev->stop() will invoke napi_disable() on all of it's
1400 * napi_struct instances on this device.
1402 smp_mb__after_atomic(); /* Commit netif_running(). */
1405 dev_deactivate_many(head);
1407 list_for_each_entry(dev, head, close_list) {
1408 const struct net_device_ops *ops = dev->netdev_ops;
1411 * Call the device specific close. This cannot fail.
1412 * Only if device is UP
1414 * We allow it to be called even after a DETACH hot-plug
1415 * event.
1417 if (ops->ndo_stop)
1418 ops->ndo_stop(dev);
1420 dev->flags &= ~IFF_UP;
1421 netpoll_poll_enable(dev);
1424 return 0;
1427 static int __dev_close(struct net_device *dev)
1429 int retval;
1430 LIST_HEAD(single);
1432 list_add(&dev->close_list, &single);
1433 retval = __dev_close_many(&single);
1434 list_del(&single);
1436 return retval;
1439 int dev_close_many(struct list_head *head, bool unlink)
1441 struct net_device *dev, *tmp;
1443 /* Remove the devices that don't need to be closed */
1444 list_for_each_entry_safe(dev, tmp, head, close_list)
1445 if (!(dev->flags & IFF_UP))
1446 list_del_init(&dev->close_list);
1448 __dev_close_many(head);
1450 list_for_each_entry_safe(dev, tmp, head, close_list) {
1451 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1452 call_netdevice_notifiers(NETDEV_DOWN, dev);
1453 if (unlink)
1454 list_del_init(&dev->close_list);
1457 return 0;
1459 EXPORT_SYMBOL(dev_close_many);
1462 * dev_close - shutdown an interface.
1463 * @dev: device to shutdown
1465 * This function moves an active device into down state. A
1466 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1467 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1468 * chain.
1470 int dev_close(struct net_device *dev)
1472 if (dev->flags & IFF_UP) {
1473 LIST_HEAD(single);
1475 list_add(&dev->close_list, &single);
1476 dev_close_many(&single, true);
1477 list_del(&single);
1479 return 0;
1481 EXPORT_SYMBOL(dev_close);
1485 * dev_disable_lro - disable Large Receive Offload on a device
1486 * @dev: device
1488 * Disable Large Receive Offload (LRO) on a net device. Must be
1489 * called under RTNL. This is needed if received packets may be
1490 * forwarded to another interface.
1492 void dev_disable_lro(struct net_device *dev)
1494 struct net_device *lower_dev;
1495 struct list_head *iter;
1497 dev->wanted_features &= ~NETIF_F_LRO;
1498 netdev_update_features(dev);
1500 if (unlikely(dev->features & NETIF_F_LRO))
1501 netdev_WARN(dev, "failed to disable LRO!\n");
1503 netdev_for_each_lower_dev(dev, lower_dev, iter)
1504 dev_disable_lro(lower_dev);
1506 EXPORT_SYMBOL(dev_disable_lro);
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;

/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered all registration and up events are replayed
 *	to the new notifier to allow device to have a race free
 *	view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);

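/*
 * Illustrative sketch (not part of the original file): a minimal notifier
 * block as a consumer of this chain might register it.  The callback and
 * module hook names are hypothetical; netdev_notifier_info_to_dev() is the
 * usual way to recover the net_device from the notifier payload.
 */
#if 0
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_UP:
		netdev_info(dev, "example notifier: device is up\n");
		break;
	case NETDEV_UNREGISTER:
		netdev_info(dev, "example notifier: device is going away\n");
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};

static int __init example_nb_init(void)
{
	return register_netdevice_notifier(&example_netdev_nb);
}
#endif
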
1587 * unregister_netdevice_notifier - unregister a network notifier block
1588 * @nb: notifier
1590 * Unregister a notifier previously registered by
1591 * register_netdevice_notifier(). The notifier is unlinked into the
1592 * kernel structures and may then be reused. A negative errno code
1593 * is returned on a failure.
1595 * After unregistering unregister and down device events are synthesized
1596 * for all devices on the device list to the removed notifier to remove
1597 * the need for special case cleanup code.
1600 int unregister_netdevice_notifier(struct notifier_block *nb)
1602 struct net_device *dev;
1603 struct net *net;
1604 int err;
1606 rtnl_lock();
1607 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1608 if (err)
1609 goto unlock;
1611 for_each_net(net) {
1612 for_each_netdev(net, dev) {
1613 if (dev->flags & IFF_UP) {
1614 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1615 dev);
1616 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618 call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1621 unlock:
1622 rtnl_unlock();
1623 return err;
1625 EXPORT_SYMBOL(unregister_netdevice_notifier);
1628 * call_netdevice_notifiers_info - call all network notifier blocks
1629 * @val: value passed unmodified to notifier function
1630 * @dev: net_device pointer passed unmodified to notifier function
1631 * @info: notifier information data
1633 * Call all network notifier blocks. Parameters and return value
1634 * are as for raw_notifier_call_chain().
1637 static int call_netdevice_notifiers_info(unsigned long val,
1638 struct net_device *dev,
1639 struct netdev_notifier_info *info)
1641 ASSERT_RTNL();
1642 netdev_notifier_info_init(info, dev);
1643 return raw_notifier_call_chain(&netdev_chain, val, info);
1647 * call_netdevice_notifiers - call all network notifier blocks
1648 * @val: value passed unmodified to notifier function
1649 * @dev: net_device pointer passed unmodified to notifier function
1651 * Call all network notifier blocks. Parameters and return value
1652 * are as for raw_notifier_call_chain().
1655 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657 struct netdev_notifier_info info;
1659 return call_netdevice_notifiers_info(val, dev, &info);
1661 EXPORT_SYMBOL(call_netdevice_notifiers);
1664 * call_netdevice_notifiers_mtu - call all network notifier blocks
1665 * @val: value passed unmodified to notifier function
1666 * @dev: net_device pointer passed unmodified to notifier function
1667 * @arg: additional u32 argument passed to the notifier function
1669 * Call all network notifier blocks. Parameters and return value
1670 * are as for raw_notifier_call_chain().
1672 static int call_netdevice_notifiers_mtu(unsigned long val,
1673 struct net_device *dev, u32 arg)
1675 struct netdev_notifier_info_ext info = {
1676 .info.dev = dev,
1677 .ext.mtu = arg,
1680 BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
1682 return call_netdevice_notifiers_info(val, dev, &info.info);
1685 #ifdef CONFIG_NET_INGRESS
1686 static struct static_key ingress_needed __read_mostly;
1688 void net_inc_ingress_queue(void)
1690 static_key_slow_inc(&ingress_needed);
1692 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1694 void net_dec_ingress_queue(void)
1696 static_key_slow_dec(&ingress_needed);
1698 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1699 #endif
1701 static struct static_key netstamp_needed __read_mostly;
1702 #ifdef HAVE_JUMP_LABEL
1703 static atomic_t netstamp_needed_deferred;
1704 static atomic_t netstamp_wanted;
1705 static void netstamp_clear(struct work_struct *work)
1707 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1708 int wanted;
1710 wanted = atomic_add_return(deferred, &netstamp_wanted);
1711 if (wanted > 0)
1712 static_key_enable(&netstamp_needed);
1713 else
1714 static_key_disable(&netstamp_needed);
1716 static DECLARE_WORK(netstamp_work, netstamp_clear);
1717 #endif
1719 void net_enable_timestamp(void)
1721 #ifdef HAVE_JUMP_LABEL
1722 int wanted;
1724 while (1) {
1725 wanted = atomic_read(&netstamp_wanted);
1726 if (wanted <= 0)
1727 break;
1728 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1729 return;
1731 atomic_inc(&netstamp_needed_deferred);
1732 schedule_work(&netstamp_work);
1733 #else
1734 static_key_slow_inc(&netstamp_needed);
1735 #endif
1737 EXPORT_SYMBOL(net_enable_timestamp);
1739 void net_disable_timestamp(void)
1741 #ifdef HAVE_JUMP_LABEL
1742 int wanted;
1744 while (1) {
1745 wanted = atomic_read(&netstamp_wanted);
1746 if (wanted <= 1)
1747 break;
1748 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1749 return;
1751 atomic_dec(&netstamp_needed_deferred);
1752 schedule_work(&netstamp_work);
1753 #else
1754 static_key_slow_dec(&netstamp_needed);
1755 #endif
1757 EXPORT_SYMBOL(net_disable_timestamp);
1759 static inline void net_timestamp_set(struct sk_buff *skb)
1761 skb->tstamp.tv64 = 0;
1762 if (static_key_false(&netstamp_needed))
1763 __net_timestamp(skb);
1766 #define net_timestamp_check(COND, SKB) \
1767 if (static_key_false(&netstamp_needed)) { \
1768 if ((COND) && !(SKB)->tstamp.tv64) \
1769 __net_timestamp(SKB); \
1772 bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
1774 unsigned int len;
1776 if (!(dev->flags & IFF_UP))
1777 return false;
1779 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1780 if (skb->len <= len)
1781 return true;
1783 /* if TSO is enabled, we don't care about the length as the packet
1784 * could be forwarded without being segmented before
1786 if (skb_is_gso(skb))
1787 return true;
1789 return false;
1791 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1793 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1795 if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1796 unlikely(!is_skb_forwardable(dev, skb))) {
1797 atomic_long_inc(&dev->rx_dropped);
1798 kfree_skb(skb);
1799 return NET_RX_DROP;
1802 skb_scrub_packet(skb, true);
1803 skb->priority = 0;
1804 skb->protocol = eth_type_trans(skb, dev);
1805 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1807 return 0;
1809 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1812 * dev_forward_skb - loopback an skb to another netif
1814 * @dev: destination network device
1815 * @skb: buffer to forward
1817 * return values:
1818 * NET_RX_SUCCESS (no congestion)
1819 * NET_RX_DROP (packet was dropped, but freed)
1821 * dev_forward_skb can be used for injecting an skb from the
1822 * start_xmit function of one device into the receive queue
1823 * of another device.
1825 * The receiving device may be in another namespace, so
1826 * we have to clear all information in the skb that could
1827 * impact namespace isolation.
1829 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1831 return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1833 EXPORT_SYMBOL_GPL(dev_forward_skb);
1835 static inline int deliver_skb(struct sk_buff *skb,
1836 struct packet_type *pt_prev,
1837 struct net_device *orig_dev)
1839 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1840 return -ENOMEM;
1841 atomic_inc(&skb->users);
1842 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1845 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1846 struct packet_type **pt,
1847 struct net_device *orig_dev,
1848 __be16 type,
1849 struct list_head *ptype_list)
1851 struct packet_type *ptype, *pt_prev = *pt;
1853 list_for_each_entry_rcu(ptype, ptype_list, list) {
1854 if (ptype->type != type)
1855 continue;
1856 if (pt_prev)
1857 deliver_skb(skb, pt_prev, orig_dev);
1858 pt_prev = ptype;
1860 *pt = pt_prev;
1863 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1865 if (!ptype->af_packet_priv || !skb->sk)
1866 return false;
1868 if (ptype->id_match)
1869 return ptype->id_match(ptype, skb->sk);
1870 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1871 return true;
1873 return false;
1877 * Support routine. Sends outgoing frames to any network
1878 * taps currently in use.
1881 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1883 struct packet_type *ptype;
1884 struct sk_buff *skb2 = NULL;
1885 struct packet_type *pt_prev = NULL;
1886 struct list_head *ptype_list = &ptype_all;
1888 rcu_read_lock();
1889 again:
1890 list_for_each_entry_rcu(ptype, ptype_list, list) {
1891 /* Never send packets back to the socket
1892 * they originated from - MvS (miquels@drinkel.ow.org)
1894 if (skb_loop_sk(ptype, skb))
1895 continue;
1897 if (pt_prev) {
1898 deliver_skb(skb2, pt_prev, skb->dev);
1899 pt_prev = ptype;
1900 continue;
1903 /* need to clone skb, done only once */
1904 skb2 = skb_clone(skb, GFP_ATOMIC);
1905 if (!skb2)
1906 goto out_unlock;
1908 net_timestamp_set(skb2);
1910 /* skb->nh should be correctly
1911 * set by sender, so that the second statement is
1912 * just protection against buggy protocols.
1914 skb_reset_mac_header(skb2);
1916 if (skb_network_header(skb2) < skb2->data ||
1917 skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1918 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1919 ntohs(skb2->protocol),
1920 dev->name);
1921 skb_reset_network_header(skb2);
1924 skb2->transport_header = skb2->network_header;
1925 skb2->pkt_type = PACKET_OUTGOING;
1926 pt_prev = ptype;
1929 if (ptype_list == &ptype_all) {
1930 ptype_list = &dev->ptype_all;
1931 goto again;
1933 out_unlock:
1934 if (pt_prev)
1935 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1936 rcu_read_unlock();
1940 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1941 * @dev: Network device
1942 * @txq: number of queues available
1944 * If real_num_tx_queues is changed the tc mappings may no longer be
1945 * valid. To resolve this verify the tc mapping remains valid and if
1946 * not NULL the mapping. With no priorities mapping to this
1947 * offset/count pair it will no longer be used. In the worst case TC0
1948 * is invalid nothing can be done so disable priority mappings. If is
1949 * expected that drivers will fix this mapping if they can before
1950 * calling netif_set_real_num_tx_queues.
1952 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1954 int i;
1955 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1957 /* If TC0 is invalidated disable TC mapping */
1958 if (tc->offset + tc->count > txq) {
1959 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1960 dev->num_tc = 0;
1961 return;
1964 /* Invalidated prio to tc mappings set to TC0 */
1965 for (i = 1; i < TC_BITMASK + 1; i++) {
1966 int q = netdev_get_prio_tc_map(dev, i);
1968 tc = &dev->tc_to_txq[q];
1969 if (tc->offset + tc->count > txq) {
1970 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1971 i, q);
1972 netdev_set_prio_tc_map(dev, i, 0);
1977 #ifdef CONFIG_XPS
1978 static DEFINE_MUTEX(xps_map_mutex);
1979 #define xmap_dereference(P) \
1980 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1982 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1983 int cpu, u16 index)
1985 struct xps_map *map = NULL;
1986 int pos;
1988 if (dev_maps)
1989 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1991 for (pos = 0; map && pos < map->len; pos++) {
1992 if (map->queues[pos] == index) {
1993 if (map->len > 1) {
1994 map->queues[pos] = map->queues[--map->len];
1995 } else {
1996 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1997 kfree_rcu(map, rcu);
1998 map = NULL;
2000 break;
2004 return map;
2007 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2009 struct xps_dev_maps *dev_maps;
2010 int cpu, i;
2011 bool active = false;
2013 mutex_lock(&xps_map_mutex);
2014 dev_maps = xmap_dereference(dev->xps_maps);
2016 if (!dev_maps)
2017 goto out_no_maps;
2019 for_each_possible_cpu(cpu) {
2020 for (i = index; i < dev->num_tx_queues; i++) {
2021 if (!remove_xps_queue(dev_maps, cpu, i))
2022 break;
2024 if (i == dev->num_tx_queues)
2025 active = true;
2028 if (!active) {
2029 RCU_INIT_POINTER(dev->xps_maps, NULL);
2030 kfree_rcu(dev_maps, rcu);
2033 for (i = index; i < dev->num_tx_queues; i++)
2034 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2035 NUMA_NO_NODE);
2037 out_no_maps:
2038 mutex_unlock(&xps_map_mutex);
2041 static struct xps_map *expand_xps_map(struct xps_map *map,
2042 int cpu, u16 index)
2044 struct xps_map *new_map;
2045 int alloc_len = XPS_MIN_MAP_ALLOC;
2046 int i, pos;
2048 for (pos = 0; map && pos < map->len; pos++) {
2049 if (map->queues[pos] != index)
2050 continue;
2051 return map;
2054 /* Need to add queue to this CPU's existing map */
2055 if (map) {
2056 if (pos < map->alloc_len)
2057 return map;
2059 alloc_len = map->alloc_len * 2;
2062 /* Need to allocate new map to store queue on this CPU's map */
2063 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2064 cpu_to_node(cpu));
2065 if (!new_map)
2066 return NULL;
2068 for (i = 0; i < pos; i++)
2069 new_map->queues[i] = map->queues[i];
2070 new_map->alloc_len = alloc_len;
2071 new_map->len = pos;
2073 return new_map;
2076 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2077 u16 index)
2079 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2080 struct xps_map *map, *new_map;
2081 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2082 int cpu, numa_node_id = -2;
2083 bool active = false;
2085 mutex_lock(&xps_map_mutex);
2087 dev_maps = xmap_dereference(dev->xps_maps);
2089 /* allocate memory for queue storage */
2090 for_each_online_cpu(cpu) {
2091 if (!cpumask_test_cpu(cpu, mask))
2092 continue;
2094 if (!new_dev_maps)
2095 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2096 if (!new_dev_maps) {
2097 mutex_unlock(&xps_map_mutex);
2098 return -ENOMEM;
2101 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2102 NULL;
2104 map = expand_xps_map(map, cpu, index);
2105 if (!map)
2106 goto error;
2108 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2111 if (!new_dev_maps)
2112 goto out_no_new_maps;
2114 for_each_possible_cpu(cpu) {
2115 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2116 /* add queue to CPU maps */
2117 int pos = 0;
2119 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2120 while ((pos < map->len) && (map->queues[pos] != index))
2121 pos++;
2123 if (pos == map->len)
2124 map->queues[map->len++] = index;
2125 #ifdef CONFIG_NUMA
2126 if (numa_node_id == -2)
2127 numa_node_id = cpu_to_node(cpu);
2128 else if (numa_node_id != cpu_to_node(cpu))
2129 numa_node_id = -1;
2130 #endif
2131 } else if (dev_maps) {
2132 /* fill in the new device map from the old device map */
2133 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2134 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2139 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2141 /* Cleanup old maps */
2142 if (dev_maps) {
2143 for_each_possible_cpu(cpu) {
2144 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2145 map = xmap_dereference(dev_maps->cpu_map[cpu]);
2146 if (map && map != new_map)
2147 kfree_rcu(map, rcu);
2150 kfree_rcu(dev_maps, rcu);
2153 dev_maps = new_dev_maps;
2154 active = true;
2156 out_no_new_maps:
2157 /* update Tx queue numa node */
2158 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2159 (numa_node_id >= 0) ? numa_node_id :
2160 NUMA_NO_NODE);
2162 if (!dev_maps)
2163 goto out_no_maps;
2165 /* removes queue from unused CPUs */
2166 for_each_possible_cpu(cpu) {
2167 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2168 continue;
2170 if (remove_xps_queue(dev_maps, cpu, index))
2171 active = true;
2174 /* free map if not active */
2175 if (!active) {
2176 RCU_INIT_POINTER(dev->xps_maps, NULL);
2177 kfree_rcu(dev_maps, rcu);
2180 out_no_maps:
2181 mutex_unlock(&xps_map_mutex);
2183 return 0;
2184 error:
2185 /* remove any maps that we added */
2186 for_each_possible_cpu(cpu) {
2187 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2188 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2189 NULL;
2190 if (new_map && new_map != map)
2191 kfree(new_map);
2194 mutex_unlock(&xps_map_mutex);
2196 kfree(new_dev_maps);
2197 return -ENOMEM;
2199 EXPORT_SYMBOL(netif_set_xps_queue);
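/* Illustrative sketch (hypothetical driver code): a driver would typically
 * install one XPS mask per transmit queue once its queues exist, for
 * example pinning queue i to CPU i:
 *
 *	for (i = 0; i < dev->real_num_tx_queues && i < nr_cpu_ids; i++)
 *		err = netif_set_xps_queue(dev, cpumask_of(i), i);
 *
 * The return value is 0 or -ENOMEM, as seen in the implementation above;
 * the same mapping can also be changed later through the per-queue
 * xps_cpus sysfs attribute.
 */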
2201 #endif
2202 /*
2203 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2204 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2205 */
2206 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2208 bool disabling;
2209 int rc;
2211 disabling = txq < dev->real_num_tx_queues;
2213 if (txq < 1 || txq > dev->num_tx_queues)
2214 return -EINVAL;
2216 if (dev->reg_state == NETREG_REGISTERED ||
2217 dev->reg_state == NETREG_UNREGISTERING) {
2218 ASSERT_RTNL();
2220 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2221 txq);
2222 if (rc)
2223 return rc;
2225 if (dev->num_tc)
2226 netif_setup_tc(dev, txq);
2228 dev->real_num_tx_queues = txq;
2230 if (disabling) {
2231 synchronize_net();
2232 qdisc_reset_all_tx_gt(dev, txq);
2233 #ifdef CONFIG_XPS
2234 netif_reset_xps_queues_gt(dev, txq);
2235 #endif
2237 } else {
2238 dev->real_num_tx_queues = txq;
2241 return 0;
2243 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
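/* Illustrative sketch (hypothetical driver code): after register_netdev()
 * the TX queue count may only be changed under rtnl_lock, e.g. from driver
 * reconfiguration code that does not already hold it:
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_tx_queues(dev, new_count);
 *	rtnl_unlock();
 *
 * new_count is a placeholder and must stay within 1..dev->num_tx_queues,
 * otherwise -EINVAL is returned as above.
 */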
2245 #ifdef CONFIG_SYSFS
2246 /**
2247 * netif_set_real_num_rx_queues - set actual number of RX queues used
2248 * @dev: Network device
2249 * @rxq: Actual number of RX queues
2250 *
2251 * This must be called either with the rtnl_lock held or before
2252 * registration of the net device. Returns 0 on success, or a
2253 * negative error code. If called before registration, it always
2254 * succeeds.
2255 */
2256 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2258 int rc;
2260 if (rxq < 1 || rxq > dev->num_rx_queues)
2261 return -EINVAL;
2263 if (dev->reg_state == NETREG_REGISTERED) {
2264 ASSERT_RTNL();
2266 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2267 rxq);
2268 if (rc)
2269 return rc;
2272 dev->real_num_rx_queues = rxq;
2273 return 0;
2275 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2276 #endif
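/* Illustrative sketch (hypothetical driver code): before registration both
 * real queue counts can be trimmed without holding rtnl, and the RX call
 * always succeeds in that case:
 *
 *	dev = alloc_etherdev_mqs(sizeof(*priv), max_queues, max_queues);
 *	...
 *	netif_set_real_num_tx_queues(dev, queues_in_use);
 *	netif_set_real_num_rx_queues(dev, queues_in_use);
 *	err = register_netdev(dev);
 *
 * priv, max_queues and queues_in_use are placeholders for driver state.
 */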
2278 /**
2279 * netif_get_num_default_rss_queues - default number of RSS queues
2280 *
2281 * This routine should set an upper limit on the number of RSS queues
2282 * used by default by multiqueue devices.
2283 */
2284 int netif_get_num_default_rss_queues(void)
2286 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2288 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
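/* Illustrative sketch (hypothetical driver code): drivers commonly clamp
 * their requested queue count with this helper so small systems do not get
 * more queues than online CPUs:
 *
 *	num_queues = min_t(unsigned int, hw_max_queues,
 *			   netif_get_num_default_rss_queues());
 *
 * hw_max_queues is a placeholder for the device's hardware limit.
 */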
2290 static inline void __netif_reschedule(struct Qdisc *q)
2292 struct softnet_data *sd;
2293 unsigned long flags;
2295 local_irq_save(flags);
2296 sd = this_cpu_ptr(&softnet_data);
2297 q->next_sched = NULL;
2298 *sd->output_queue_tailp = q;
2299 sd->output_queue_tailp = &q->next_sched;
2300 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2301 local_irq_restore(flags);
2304 void __netif_schedule(struct Qdisc *q)
2306 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2307 __netif_reschedule(q);
2309 EXPORT_SYMBOL(__netif_schedule);
2311 struct dev_kfree_skb_cb {
2312 enum skb_free_reason reason;
2315 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2317 return (struct dev_kfree_skb_cb *)skb->cb;
2320 void netif_schedule_queue(struct netdev_queue *txq)
2322 rcu_read_lock();
2323 if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2324 struct Qdisc *q = rcu_dereference(txq->qdisc);
2326 __netif_schedule(q);
2328 rcu_read_unlock();
2330 EXPORT_SYMBOL(netif_schedule_queue);
2333 * netif_wake_subqueue - allow sending packets on subqueue
2334 * @dev: network device
2335 * @queue_index: sub queue index
2337 * Resume individual transmit queue of a device with multiple transmit queues.
2339 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2341 struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2343 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2344 struct Qdisc *q;
2346 rcu_read_lock();
2347 q = rcu_dereference(txq->qdisc);
2348 __netif_schedule(q);
2349 rcu_read_unlock();
2352 EXPORT_SYMBOL(netif_wake_subqueue);
2354 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2356 if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2357 struct Qdisc *q;
2359 rcu_read_lock();
2360 q = rcu_dereference(dev_queue->qdisc);
2361 __netif_schedule(q);
2362 rcu_read_unlock();
2365 EXPORT_SYMBOL(netif_tx_wake_queue);
2367 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2369 unsigned long flags;
2371 if (unlikely(!skb))
2372 return;
2374 if (likely(atomic_read(&skb->users) == 1)) {
2375 smp_rmb();
2376 atomic_set(&skb->users, 0);
2377 } else if (likely(!atomic_dec_and_test(&skb->users))) {
2378 return;
2380 get_kfree_skb_cb(skb)->reason = reason;
2381 local_irq_save(flags);
2382 skb->next = __this_cpu_read(softnet_data.completion_queue);
2383 __this_cpu_write(softnet_data.completion_queue, skb);
2384 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2385 local_irq_restore(flags);
2387 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2389 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2391 if (in_irq() || irqs_disabled())
2392 __dev_kfree_skb_irq(skb, reason);
2393 else
2394 dev_kfree_skb(skb);
2396 EXPORT_SYMBOL(__dev_kfree_skb_any);
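/* Illustrative sketch (hypothetical driver code): TX completion handlers
 * that may run in hard-IRQ, softirq or process context free buffers via
 * dev_kfree_skb_any()/dev_consume_skb_any(), which route through the
 * helpers above when IRQs are off:
 *
 *	while ((skb = my_next_completed_skb(ring)) != NULL)
 *		dev_consume_skb_any(skb);
 *
 * my_next_completed_skb() and ring are made-up driver helpers/state.
 */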
2399 /**
2400 * netif_device_detach - mark device as removed
2401 * @dev: network device
2402 *
2403 * Mark device as removed from system and therefore no longer available.
2404 */
2405 void netif_device_detach(struct net_device *dev)
2407 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2408 netif_running(dev)) {
2409 netif_tx_stop_all_queues(dev);
2412 EXPORT_SYMBOL(netif_device_detach);
2414 /**
2415 * netif_device_attach - mark device as attached
2416 * @dev: network device
2417 *
2418 * Mark device as attached to the system again and restart it if needed.
2419 */
2420 void netif_device_attach(struct net_device *dev)
2422 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2423 netif_running(dev)) {
2424 netif_tx_wake_all_queues(dev);
2425 __netdev_watchdog_up(dev);
2428 EXPORT_SYMBOL(netif_device_attach);
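/* Illustrative sketch (hypothetical driver code): a typical suspend/resume
 * pair brackets the hardware shutdown with detach/attach so the stack stops
 * handing packets to a device that is powered down:
 *
 *	static int my_suspend(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_detach(dev);
 *		my_hw_power_down(dev);		(made-up helper)
 *		return 0;
 *	}
 *
 *	static int my_resume(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		my_hw_power_up(dev);		(made-up helper)
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */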
2430 /*
2431 * Returns a Tx hash based on the given packet descriptor and the number of
2432 * Tx queues to be used as a distribution range.
2433 */
2434 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2435 unsigned int num_tx_queues)
2437 u32 hash;
2438 u16 qoffset = 0;
2439 u16 qcount = num_tx_queues;
2441 if (skb_rx_queue_recorded(skb)) {
2442 hash = skb_get_rx_queue(skb);
2443 while (unlikely(hash >= num_tx_queues))
2444 hash -= num_tx_queues;
2445 return hash;
2448 if (dev->num_tc) {
2449 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2450 qoffset = dev->tc_to_txq[tc].offset;
2451 qcount = dev->tc_to_txq[tc].count;
2454 return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2456 EXPORT_SYMBOL(__skb_tx_hash);
2458 static void skb_warn_bad_offload(const struct sk_buff *skb)
2460 static const netdev_features_t null_features = 0;
2461 struct net_device *dev = skb->dev;
2462 const char *name = "";
2464 if (!net_ratelimit())
2465 return;
2467 if (dev) {
2468 if (dev->dev.parent)
2469 name = dev_driver_string(dev->dev.parent);
2470 else
2471 name = netdev_name(dev);
2473 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2474 "gso_type=%d ip_summed=%d\n",
2475 name, dev ? &dev->features : &null_features,
2476 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2477 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2478 skb_shinfo(skb)->gso_type, skb->ip_summed);
2482 * Invalidate hardware checksum when packet is to be mangled, and
2483 * complete checksum manually on outgoing path.
2485 int skb_checksum_help(struct sk_buff *skb)
2487 __wsum csum;
2488 int ret = 0, offset;
2490 if (skb->ip_summed == CHECKSUM_COMPLETE)
2491 goto out_set_summed;
2493 if (unlikely(skb_shinfo(skb)->gso_size)) {
2494 skb_warn_bad_offload(skb);
2495 return -EINVAL;
2498 /* Before computing a checksum, we should make sure no frag could
2499 * be modified by an external entity : checksum could be wrong.
2501 if (skb_has_shared_frag(skb)) {
2502 ret = __skb_linearize(skb);
2503 if (ret)
2504 goto out;
2507 offset = skb_checksum_start_offset(skb);
2508 BUG_ON(offset >= skb_headlen(skb));
2509 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2511 offset += skb->csum_offset;
2512 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2514 if (skb_cloned(skb) &&
2515 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2516 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2517 if (ret)
2518 goto out;
2521 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2522 out_set_summed:
2523 skb->ip_summed = CHECKSUM_NONE;
2524 out:
2525 return ret;
2527 EXPORT_SYMBOL(skb_checksum_help);
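/* Illustrative sketch (hypothetical driver code): a start_xmit routine for
 * hardware that cannot checksum a given packet can fall back to the helper
 * above instead of dropping the frame:
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !my_hw_can_csum(skb) &&		(made-up capability test)
 *	    skb_checksum_help(skb))
 *		goto drop_and_count_error;
 *
 * The core TX path does the equivalent in validate_xmit_skb() below.
 */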
2529 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2531 __be16 type = skb->protocol;
2533 /* Tunnel gso handlers can set protocol to ethernet. */
2534 if (type == htons(ETH_P_TEB)) {
2535 struct ethhdr *eth;
2537 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2538 return 0;
2540 eth = (struct ethhdr *)skb->data;
2541 type = eth->h_proto;
2544 return __vlan_get_protocol(skb, type, depth);
2548 * skb_mac_gso_segment - mac layer segmentation handler.
2549 * @skb: buffer to segment
2550 * @features: features for the output path (see dev->features)
2552 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2553 netdev_features_t features)
2555 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2556 struct packet_offload *ptype;
2557 int vlan_depth = skb->mac_len;
2558 __be16 type = skb_network_protocol(skb, &vlan_depth);
2560 if (unlikely(!type))
2561 return ERR_PTR(-EINVAL);
2563 __skb_pull(skb, vlan_depth);
2565 rcu_read_lock();
2566 list_for_each_entry_rcu(ptype, &offload_base, list) {
2567 if (ptype->type == type && ptype->callbacks.gso_segment) {
2568 segs = ptype->callbacks.gso_segment(skb, features);
2569 break;
2572 rcu_read_unlock();
2574 __skb_push(skb, skb->data - skb_mac_header(skb));
2576 return segs;
2578 EXPORT_SYMBOL(skb_mac_gso_segment);
2581 /* openvswitch calls this on rx path, so we need a different check.
2583 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2585 if (tx_path)
2586 return skb->ip_summed != CHECKSUM_PARTIAL &&
2587 skb->ip_summed != CHECKSUM_UNNECESSARY;
2589 return skb->ip_summed == CHECKSUM_NONE;
2593 * __skb_gso_segment - Perform segmentation on skb.
2594 * @skb: buffer to segment
2595 * @features: features for the output path (see dev->features)
2596 * @tx_path: whether it is called in TX path
2598 * This function segments the given skb and returns a list of segments.
2600 * It may return NULL if the skb requires no segmentation. This is
2601 * only possible when GSO is used for verifying header integrity.
2603 * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2605 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2606 netdev_features_t features, bool tx_path)
2608 struct sk_buff *segs;
2610 if (unlikely(skb_needs_check(skb, tx_path))) {
2611 int err;
2613 /* We're going to init ->check field in TCP or UDP header */
2614 err = skb_cow_head(skb, 0);
2615 if (err < 0)
2616 return ERR_PTR(err);
2619 BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2620 sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2622 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2623 SKB_GSO_CB(skb)->encap_level = 0;
2625 skb_reset_mac_header(skb);
2626 skb_reset_mac_len(skb);
2628 segs = skb_mac_gso_segment(skb, features);
2630 if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
2631 skb_warn_bad_offload(skb);
2633 return segs;
2635 EXPORT_SYMBOL(__skb_gso_segment);
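/* Illustrative sketch: TX-path callers normally use the skb_gso_segment()
 * wrapper, which is __skb_gso_segment(skb, features, true), and then walk
 * the returned list; validate_xmit_skb() below is the canonical example:
 *
 *	segs = skb_gso_segment(skb, features);
 *	if (IS_ERR(segs))
 *		goto drop;
 *	if (segs) {
 *		consume_skb(skb);
 *		skb = segs;
 *	}
 */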
2637 /* Take action when hardware reception checksum errors are detected. */
2638 #ifdef CONFIG_BUG
2639 void netdev_rx_csum_fault(struct net_device *dev)
2641 if (net_ratelimit()) {
2642 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2643 dump_stack();
2646 EXPORT_SYMBOL(netdev_rx_csum_fault);
2647 #endif
2649 /* Actually, we should eliminate this check as soon as we know, that:
2650 * 1. IOMMU is present and allows to map all the memory.
2651 * 2. No high memory really exists on this machine.
2654 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2656 #ifdef CONFIG_HIGHMEM
2657 int i;
2658 if (!(dev->features & NETIF_F_HIGHDMA)) {
2659 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2660 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2661 if (PageHighMem(skb_frag_page(frag)))
2662 return 1;
2666 if (PCI_DMA_BUS_IS_PHYS) {
2667 struct device *pdev = dev->dev.parent;
2669 if (!pdev)
2670 return 0;
2671 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2672 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2673 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2674 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2675 return 1;
2678 #endif
2679 return 0;
2682 /* If MPLS offload request, verify we are testing hardware MPLS features
2683 * instead of standard features for the netdev.
2685 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2686 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2687 netdev_features_t features,
2688 __be16 type)
2690 if (eth_p_mpls(type))
2691 features &= skb->dev->mpls_features;
2693 return features;
2695 #else
2696 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2697 netdev_features_t features,
2698 __be16 type)
2700 return features;
2702 #endif
2704 static netdev_features_t harmonize_features(struct sk_buff *skb,
2705 netdev_features_t features)
2707 int tmp;
2708 __be16 type;
2710 type = skb_network_protocol(skb, &tmp);
2711 features = net_mpls_features(skb, features, type);
2713 if (skb->ip_summed != CHECKSUM_NONE &&
2714 !can_checksum_protocol(features, type)) {
2715 features &= ~NETIF_F_ALL_CSUM;
2717 if (illegal_highdma(skb->dev, skb))
2718 features &= ~NETIF_F_SG;
2720 return features;
2723 netdev_features_t passthru_features_check(struct sk_buff *skb,
2724 struct net_device *dev,
2725 netdev_features_t features)
2727 return features;
2729 EXPORT_SYMBOL(passthru_features_check);
2731 static netdev_features_t dflt_features_check(struct sk_buff *skb,
2732 struct net_device *dev,
2733 netdev_features_t features)
2735 return vlan_features_check(skb, features);
2738 netdev_features_t netif_skb_features(struct sk_buff *skb)
2740 struct net_device *dev = skb->dev;
2741 netdev_features_t features = dev->features;
2742 u16 gso_segs = skb_shinfo(skb)->gso_segs;
2744 if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
2745 features &= ~NETIF_F_GSO_MASK;
2747 /* If encapsulation offload request, verify we are testing
2748 * hardware encapsulation features instead of standard
2749 * features for the netdev
2751 if (skb->encapsulation)
2752 features &= dev->hw_enc_features;
2754 if (skb_vlan_tagged(skb))
2755 features = netdev_intersect_features(features,
2756 dev->vlan_features |
2757 NETIF_F_HW_VLAN_CTAG_TX |
2758 NETIF_F_HW_VLAN_STAG_TX);
2760 if (dev->netdev_ops->ndo_features_check)
2761 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2762 features);
2763 else
2764 features &= dflt_features_check(skb, dev, features);
2766 return harmonize_features(skb, features);
2768 EXPORT_SYMBOL(netif_skb_features);
2770 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2771 struct netdev_queue *txq, bool more)
2773 unsigned int len;
2774 int rc;
2776 if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2777 dev_queue_xmit_nit(skb, dev);
2779 len = skb->len;
2780 trace_net_dev_start_xmit(skb, dev);
2781 rc = netdev_start_xmit(skb, dev, txq, more);
2782 trace_net_dev_xmit(skb, rc, dev, len);
2784 return rc;
2787 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2788 struct netdev_queue *txq, int *ret)
2790 struct sk_buff *skb = first;
2791 int rc = NETDEV_TX_OK;
2793 while (skb) {
2794 struct sk_buff *next = skb->next;
2796 skb->next = NULL;
2797 rc = xmit_one(skb, dev, txq, next != NULL);
2798 if (unlikely(!dev_xmit_complete(rc))) {
2799 skb->next = next;
2800 goto out;
2803 skb = next;
2804 if (netif_xmit_stopped(txq) && skb) {
2805 rc = NETDEV_TX_BUSY;
2806 break;
2810 out:
2811 *ret = rc;
2812 return skb;
2815 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2816 netdev_features_t features)
2818 if (skb_vlan_tag_present(skb) &&
2819 !vlan_hw_offload_capable(features, skb->vlan_proto))
2820 skb = __vlan_hwaccel_push_inside(skb);
2821 return skb;
2824 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2826 netdev_features_t features;
2828 if (skb->next)
2829 return skb;
2831 features = netif_skb_features(skb);
2832 skb = validate_xmit_vlan(skb, features);
2833 if (unlikely(!skb))
2834 goto out_null;
2836 if (netif_needs_gso(skb, features)) {
2837 struct sk_buff *segs;
2839 segs = skb_gso_segment(skb, features);
2840 if (IS_ERR(segs)) {
2841 goto out_kfree_skb;
2842 } else if (segs) {
2843 consume_skb(skb);
2844 skb = segs;
2846 } else {
2847 if (skb_needs_linearize(skb, features) &&
2848 __skb_linearize(skb))
2849 goto out_kfree_skb;
2851 /* If packet is not checksummed and device does not
2852 * support checksumming for this protocol, complete
2853 * checksumming here.
2855 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2856 if (skb->encapsulation)
2857 skb_set_inner_transport_header(skb,
2858 skb_checksum_start_offset(skb));
2859 else
2860 skb_set_transport_header(skb,
2861 skb_checksum_start_offset(skb));
2862 if (!(features & NETIF_F_ALL_CSUM) &&
2863 skb_checksum_help(skb))
2864 goto out_kfree_skb;
2868 return skb;
2870 out_kfree_skb:
2871 kfree_skb(skb);
2872 out_null:
2873 return NULL;
2876 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2878 struct sk_buff *next, *head = NULL, *tail;
2880 for (; skb != NULL; skb = next) {
2881 next = skb->next;
2882 skb->next = NULL;
2884 /* in case skb won't be segmented, point to itself */
2885 skb->prev = skb;
2887 skb = validate_xmit_skb(skb, dev);
2888 if (!skb)
2889 continue;
2891 if (!head)
2892 head = skb;
2893 else
2894 tail->next = skb;
2895 /* If skb was segmented, skb->prev points to
2896 * the last segment. If not, it still contains skb.
2898 tail = skb->prev;
2900 return head;
2902 EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
2904 static void qdisc_pkt_len_init(struct sk_buff *skb)
2906 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2908 qdisc_skb_cb(skb)->pkt_len = skb->len;
2910 /* To get more precise estimation of bytes sent on wire,
2911 * we add to pkt_len the headers size of all segments
2913 if (shinfo->gso_size) {
2914 unsigned int hdr_len;
2915 u16 gso_segs = shinfo->gso_segs;
2917 /* mac layer + network layer */
2918 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2920 /* + transport layer */
2921 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
2922 const struct tcphdr *th;
2923 struct tcphdr _tcphdr;
2925 th = skb_header_pointer(skb, skb_transport_offset(skb),
2926 sizeof(_tcphdr), &_tcphdr);
2927 if (likely(th))
2928 hdr_len += __tcp_hdrlen(th);
2929 } else {
2930 struct udphdr _udphdr;
2932 if (skb_header_pointer(skb, skb_transport_offset(skb),
2933 sizeof(_udphdr), &_udphdr))
2934 hdr_len += sizeof(struct udphdr);
2937 if (shinfo->gso_type & SKB_GSO_DODGY)
2938 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2939 shinfo->gso_size);
2941 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2945 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2946 struct net_device *dev,
2947 struct netdev_queue *txq)
2949 spinlock_t *root_lock = qdisc_lock(q);
2950 bool contended;
2951 int rc;
2953 qdisc_pkt_len_init(skb);
2954 qdisc_calculate_pkt_len(skb, q);
2956 * Heuristic to force contended enqueues to serialize on a
2957 * separate lock before trying to get qdisc main lock.
2958 * This permits __QDISC___STATE_RUNNING owner to get the lock more
2959 * often and dequeue packets faster.
2961 contended = qdisc_is_running(q);
2962 if (unlikely(contended))
2963 spin_lock(&q->busylock);
2965 spin_lock(root_lock);
2966 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2967 kfree_skb(skb);
2968 rc = NET_XMIT_DROP;
2969 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2970 qdisc_run_begin(q)) {
2972 * This is a work-conserving queue; there are no old skbs
2973 * waiting to be sent out; and the qdisc is not running -
2974 * xmit the skb directly.
2977 qdisc_bstats_update(q, skb);
2979 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2980 if (unlikely(contended)) {
2981 spin_unlock(&q->busylock);
2982 contended = false;
2984 __qdisc_run(q);
2985 } else
2986 qdisc_run_end(q);
2988 rc = NET_XMIT_SUCCESS;
2989 } else {
2990 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2991 if (qdisc_run_begin(q)) {
2992 if (unlikely(contended)) {
2993 spin_unlock(&q->busylock);
2994 contended = false;
2996 __qdisc_run(q);
2999 spin_unlock(root_lock);
3000 if (unlikely(contended))
3001 spin_unlock(&q->busylock);
3002 return rc;
3005 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3006 static void skb_update_prio(struct sk_buff *skb)
3008 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
3010 if (!skb->priority && skb->sk && map) {
3011 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
3013 if (prioidx < map->priomap_len)
3014 skb->priority = map->priomap[prioidx];
3017 #else
3018 #define skb_update_prio(skb)
3019 #endif
3021 DEFINE_PER_CPU(int, xmit_recursion);
3022 EXPORT_SYMBOL(xmit_recursion);
3024 #define RECURSION_LIMIT 10
3027 * dev_loopback_xmit - loop back @skb
3028 * @net: network namespace this loopback is happening in
3029 * @sk: sk needed to be a netfilter okfn
3030 * @skb: buffer to transmit
3032 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3034 skb_reset_mac_header(skb);
3035 __skb_pull(skb, skb_network_offset(skb));
3036 skb->pkt_type = PACKET_LOOPBACK;
3037 skb->ip_summed = CHECKSUM_UNNECESSARY;
3038 WARN_ON(!skb_dst(skb));
3039 skb_dst_force(skb);
3040 netif_rx_ni(skb);
3041 return 0;
3043 EXPORT_SYMBOL(dev_loopback_xmit);
3045 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3047 #ifdef CONFIG_XPS
3048 struct xps_dev_maps *dev_maps;
3049 struct xps_map *map;
3050 int queue_index = -1;
3052 rcu_read_lock();
3053 dev_maps = rcu_dereference(dev->xps_maps);
3054 if (dev_maps) {
3055 map = rcu_dereference(
3056 dev_maps->cpu_map[skb->sender_cpu - 1]);
3057 if (map) {
3058 if (map->len == 1)
3059 queue_index = map->queues[0];
3060 else
3061 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3062 map->len)];
3063 if (unlikely(queue_index >= dev->real_num_tx_queues))
3064 queue_index = -1;
3067 rcu_read_unlock();
3069 return queue_index;
3070 #else
3071 return -1;
3072 #endif
3075 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3077 struct sock *sk = skb->sk;
3078 int queue_index = sk_tx_queue_get(sk);
3080 if (queue_index < 0 || skb->ooo_okay ||
3081 queue_index >= dev->real_num_tx_queues) {
3082 int new_index = get_xps_queue(dev, skb);
3083 if (new_index < 0)
3084 new_index = skb_tx_hash(dev, skb);
3086 if (queue_index != new_index && sk &&
3087 sk_fullsock(sk) &&
3088 rcu_access_pointer(sk->sk_dst_cache))
3089 sk_tx_queue_set(sk, new_index);
3091 queue_index = new_index;
3094 return queue_index;
3097 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3098 struct sk_buff *skb,
3099 void *accel_priv)
3101 int queue_index = 0;
3103 #ifdef CONFIG_XPS
3104 u32 sender_cpu = skb->sender_cpu - 1;
3106 if (sender_cpu >= (u32)NR_CPUS)
3107 skb->sender_cpu = raw_smp_processor_id() + 1;
3108 #endif
3110 if (dev->real_num_tx_queues != 1) {
3111 const struct net_device_ops *ops = dev->netdev_ops;
3112 if (ops->ndo_select_queue)
3113 queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3114 __netdev_pick_tx);
3115 else
3116 queue_index = __netdev_pick_tx(dev, skb);
3118 if (!accel_priv)
3119 queue_index = netdev_cap_txqueue(dev, queue_index);
3122 skb_set_queue_mapping(skb, queue_index);
3123 return netdev_get_tx_queue(dev, queue_index);
3127 * __dev_queue_xmit - transmit a buffer
3128 * @skb: buffer to transmit
3129 * @accel_priv: private data used for L2 forwarding offload
3131 * Queue a buffer for transmission to a network device. The caller must
3132 * have set the device and priority and built the buffer before calling
3133 * this function. The function can be called from an interrupt.
3135 * A negative errno code is returned on a failure. A success does not
3136 * guarantee the frame will be transmitted as it may be dropped due
3137 * to congestion or traffic shaping.
3139 * -----------------------------------------------------------------------------------
3140 * I notice this method can also return errors from the queue disciplines,
3141 * including NET_XMIT_DROP, which is a positive value. So, errors can also
3142 * be positive.
3144 * Regardless of the return value, the skb is consumed, so it is currently
3145 * difficult to retry a send to this method. (You can bump the ref count
3146 * before sending to hold a reference for retry if you are careful.)
3148 * When calling this method, interrupts MUST be enabled. This is because
3149 * the BH enable code must have IRQs enabled so that it will not deadlock.
3150 * --BLG
3152 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3154 struct net_device *dev = skb->dev;
3155 struct netdev_queue *txq;
3156 struct Qdisc *q;
3157 int rc = -ENOMEM;
3159 skb_reset_mac_header(skb);
3161 if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3162 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3164 /* Disable soft irqs for various locks below. Also
3165 * stops preemption for RCU.
3167 rcu_read_lock_bh();
3169 skb_update_prio(skb);
3171 /* If the device/qdisc doesn't need skb->dst, release it right now while
3172 * it's hot in this CPU's cache.
3173 */
3174 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3175 skb_dst_drop(skb);
3176 else
3177 skb_dst_force(skb);
3179 #ifdef CONFIG_NET_SWITCHDEV
3180 /* Don't forward if offload device already forwarded */
3181 if (skb->offload_fwd_mark &&
3182 skb->offload_fwd_mark == dev->offload_fwd_mark) {
3183 consume_skb(skb);
3184 rc = NET_XMIT_SUCCESS;
3185 goto out;
3187 #endif
3189 txq = netdev_pick_tx(dev, skb, accel_priv);
3190 q = rcu_dereference_bh(txq->qdisc);
3192 #ifdef CONFIG_NET_CLS_ACT
3193 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3194 #endif
3195 trace_net_dev_queue(skb);
3196 if (q->enqueue) {
3197 rc = __dev_xmit_skb(skb, q, dev, txq);
3198 goto out;
3201 /* The device has no queue. Common case for software devices:
3202    loopback, all sorts of tunnels...
3204    Really, it is unlikely that netif_tx_lock protection is necessary
3205    here. (f.e. loopback and IP tunnels are clean ignoring statistics
3206    counters.)
3207    However, it is possible that they rely on the protection
3208    made by us here.
3210    Check this and shoot the lock. It is not prone to deadlocks.
3211    Or shoot the noqueue qdisc instead, it is even simpler 8)
3212  */
3213 if (dev->flags & IFF_UP) {
3214 int cpu = smp_processor_id(); /* ok because BHs are off */
3216 if (txq->xmit_lock_owner != cpu) {
3218 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
3219 goto recursion_alert;
3221 skb = validate_xmit_skb(skb, dev);
3222 if (!skb)
3223 goto drop;
3225 HARD_TX_LOCK(dev, txq, cpu);
3227 if (!netif_xmit_stopped(txq)) {
3228 __this_cpu_inc(xmit_recursion);
3229 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3230 __this_cpu_dec(xmit_recursion);
3231 if (dev_xmit_complete(rc)) {
3232 HARD_TX_UNLOCK(dev, txq);
3233 goto out;
3236 HARD_TX_UNLOCK(dev, txq);
3237 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3238 dev->name);
3239 } else {
3240 /* Recursion is detected! It is possible,
3241 * unfortunately
3243 recursion_alert:
3244 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3245 dev->name);
3249 rc = -ENETDOWN;
3250 drop:
3251 rcu_read_unlock_bh();
3253 atomic_long_inc(&dev->tx_dropped);
3254 kfree_skb_list(skb);
3255 return rc;
3256 out:
3257 rcu_read_unlock_bh();
3258 return rc;
3261 int dev_queue_xmit(struct sk_buff *skb)
3263 return __dev_queue_xmit(skb, NULL);
3265 EXPORT_SYMBOL(dev_queue_xmit);
3267 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3269 return __dev_queue_xmit(skb, accel_priv);
3271 EXPORT_SYMBOL(dev_queue_xmit_accel);
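/* Illustrative sketch: protocol code hands a fully built skb to the device
 * layer by setting skb->dev (and usually skb->protocol) and calling
 * dev_queue_xmit(); the skb is consumed whatever the return value is:
 *
 *	skb->dev = dev;
 *	skb->protocol = htons(ETH_P_IP);
 *	rc = dev_queue_xmit(skb);
 *
 * ETH_P_IP is just an example protocol; rc may be a negative errno or a
 * positive NET_XMIT_* code, as the comment above explains.
 */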
3274 /*=======================================================================
3275 Receiver routines
3276 =======================================================================*/
3278 int netdev_max_backlog __read_mostly = 1000;
3279 EXPORT_SYMBOL(netdev_max_backlog);
3281 int netdev_tstamp_prequeue __read_mostly = 1;
3282 int netdev_budget __read_mostly = 300;
3283 int weight_p __read_mostly = 64; /* old backlog weight */
3285 /* Called with irq disabled */
3286 static inline void ____napi_schedule(struct softnet_data *sd,
3287 struct napi_struct *napi)
3289 list_add_tail(&napi->poll_list, &sd->poll_list);
3290 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3293 #ifdef CONFIG_RPS
3295 /* One global table that all flow-based protocols share. */
3296 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3297 EXPORT_SYMBOL(rps_sock_flow_table);
3298 u32 rps_cpu_mask __read_mostly;
3299 EXPORT_SYMBOL(rps_cpu_mask);
3301 struct static_key rps_needed __read_mostly;
3303 static struct rps_dev_flow *
3304 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3305 struct rps_dev_flow *rflow, u16 next_cpu)
3307 if (next_cpu < nr_cpu_ids) {
3308 #ifdef CONFIG_RFS_ACCEL
3309 struct netdev_rx_queue *rxqueue;
3310 struct rps_dev_flow_table *flow_table;
3311 struct rps_dev_flow *old_rflow;
3312 u32 flow_id;
3313 u16 rxq_index;
3314 int rc;
3316 /* Should we steer this flow to a different hardware queue? */
3317 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3318 !(dev->features & NETIF_F_NTUPLE))
3319 goto out;
3320 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3321 if (rxq_index == skb_get_rx_queue(skb))
3322 goto out;
3324 rxqueue = dev->_rx + rxq_index;
3325 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3326 if (!flow_table)
3327 goto out;
3328 flow_id = skb_get_hash(skb) & flow_table->mask;
3329 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3330 rxq_index, flow_id);
3331 if (rc < 0)
3332 goto out;
3333 old_rflow = rflow;
3334 rflow = &flow_table->flows[flow_id];
3335 rflow->filter = rc;
3336 if (old_rflow->filter == rflow->filter)
3337 old_rflow->filter = RPS_NO_FILTER;
3338 out:
3339 #endif
3340 rflow->last_qtail =
3341 per_cpu(softnet_data, next_cpu).input_queue_head;
3344 rflow->cpu = next_cpu;
3345 return rflow;
3349 * get_rps_cpu is called from netif_receive_skb and returns the target
3350 * CPU from the RPS map of the receiving queue for a given skb.
3351 * rcu_read_lock must be held on entry.
3353 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3354 struct rps_dev_flow **rflowp)
3356 const struct rps_sock_flow_table *sock_flow_table;
3357 struct netdev_rx_queue *rxqueue = dev->_rx;
3358 struct rps_dev_flow_table *flow_table;
3359 struct rps_map *map;
3360 int cpu = -1;
3361 u32 tcpu;
3362 u32 hash;
3364 if (skb_rx_queue_recorded(skb)) {
3365 u16 index = skb_get_rx_queue(skb);
3367 if (unlikely(index >= dev->real_num_rx_queues)) {
3368 WARN_ONCE(dev->real_num_rx_queues > 1,
3369 "%s received packet on queue %u, but number "
3370 "of RX queues is %u\n",
3371 dev->name, index, dev->real_num_rx_queues);
3372 goto done;
3374 rxqueue += index;
3377 /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3379 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3380 map = rcu_dereference(rxqueue->rps_map);
3381 if (!flow_table && !map)
3382 goto done;
3384 skb_reset_network_header(skb);
3385 hash = skb_get_hash(skb);
3386 if (!hash)
3387 goto done;
3389 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3390 if (flow_table && sock_flow_table) {
3391 struct rps_dev_flow *rflow;
3392 u32 next_cpu;
3393 u32 ident;
3395 /* First check into global flow table if there is a match */
3396 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3397 if ((ident ^ hash) & ~rps_cpu_mask)
3398 goto try_rps;
3400 next_cpu = ident & rps_cpu_mask;
3402 /* OK, now we know there is a match,
3403 * we can look at the local (per receive queue) flow table
3405 rflow = &flow_table->flows[hash & flow_table->mask];
3406 tcpu = rflow->cpu;
3409 * If the desired CPU (where last recvmsg was done) is
3410 * different from current CPU (one in the rx-queue flow
3411 * table entry), switch if one of the following holds:
3412 * - Current CPU is unset (>= nr_cpu_ids).
3413 * - Current CPU is offline.
3414 * - The current CPU's queue tail has advanced beyond the
3415 * last packet that was enqueued using this table entry.
3416 * This guarantees that all previous packets for the flow
3417 * have been dequeued, thus preserving in order delivery.
3419 if (unlikely(tcpu != next_cpu) &&
3420 (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3421 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3422 rflow->last_qtail)) >= 0)) {
3423 tcpu = next_cpu;
3424 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3427 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3428 *rflowp = rflow;
3429 cpu = tcpu;
3430 goto done;
3434 try_rps:
3436 if (map) {
3437 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3438 if (cpu_online(tcpu)) {
3439 cpu = tcpu;
3440 goto done;
3444 done:
3445 return cpu;
3448 #ifdef CONFIG_RFS_ACCEL
3451 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3452 * @dev: Device on which the filter was set
3453 * @rxq_index: RX queue index
3454 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3455 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3457 * Drivers that implement ndo_rx_flow_steer() should periodically call
3458 * this function for each installed filter and remove the filters for
3459 * which it returns %true.
3461 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3462 u32 flow_id, u16 filter_id)
3464 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3465 struct rps_dev_flow_table *flow_table;
3466 struct rps_dev_flow *rflow;
3467 bool expire = true;
3468 unsigned int cpu;
3470 rcu_read_lock();
3471 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3472 if (flow_table && flow_id <= flow_table->mask) {
3473 rflow = &flow_table->flows[flow_id];
3474 cpu = ACCESS_ONCE(rflow->cpu);
3475 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3476 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3477 rflow->last_qtail) <
3478 (int)(10 * flow_table->mask)))
3479 expire = false;
3481 rcu_read_unlock();
3482 return expire;
3484 EXPORT_SYMBOL(rps_may_expire_flow);
3486 #endif /* CONFIG_RFS_ACCEL */
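/* Illustrative sketch (hypothetical driver code): a driver implementing
 * ndo_rx_flow_steer() usually scans its installed filters from a service
 * task and removes the ones the stack no longer needs:
 *
 *	if (rps_may_expire_flow(netdev, f->rxq_index, f->flow_id, f->id))
 *		my_remove_hw_filter(adapter, f);
 *
 * f, adapter and my_remove_hw_filter() are made-up driver state/helpers.
 */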
3488 /* Called from hardirq (IPI) context */
3489 static void rps_trigger_softirq(void *data)
3491 struct softnet_data *sd = data;
3493 ____napi_schedule(sd, &sd->backlog);
3494 sd->received_rps++;
3497 #endif /* CONFIG_RPS */
3499 /*
3500 * Check if this softnet_data structure belongs to another CPU.
3501 * If yes, queue it to our IPI list and return 1.
3502 * If no, return 0.
3503 */
3504 static int rps_ipi_queued(struct softnet_data *sd)
3506 #ifdef CONFIG_RPS
3507 struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3509 if (sd != mysd) {
3510 sd->rps_ipi_next = mysd->rps_ipi_list;
3511 mysd->rps_ipi_list = sd;
3513 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3514 return 1;
3516 #endif /* CONFIG_RPS */
3517 return 0;
3520 #ifdef CONFIG_NET_FLOW_LIMIT
3521 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3522 #endif
3524 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3526 #ifdef CONFIG_NET_FLOW_LIMIT
3527 struct sd_flow_limit *fl;
3528 struct softnet_data *sd;
3529 unsigned int old_flow, new_flow;
3531 if (qlen < (netdev_max_backlog >> 1))
3532 return false;
3534 sd = this_cpu_ptr(&softnet_data);
3536 rcu_read_lock();
3537 fl = rcu_dereference(sd->flow_limit);
3538 if (fl) {
3539 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3540 old_flow = fl->history[fl->history_head];
3541 fl->history[fl->history_head] = new_flow;
3543 fl->history_head++;
3544 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3546 if (likely(fl->buckets[old_flow]))
3547 fl->buckets[old_flow]--;
3549 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3550 fl->count++;
3551 rcu_read_unlock();
3552 return true;
3555 rcu_read_unlock();
3556 #endif
3557 return false;
3561 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3562 * queue (may be a remote CPU queue).
3564 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3565 unsigned int *qtail)
3567 struct softnet_data *sd;
3568 unsigned long flags;
3569 unsigned int qlen;
3571 sd = &per_cpu(softnet_data, cpu);
3573 local_irq_save(flags);
3575 rps_lock(sd);
3576 if (!netif_running(skb->dev))
3577 goto drop;
3578 qlen = skb_queue_len(&sd->input_pkt_queue);
3579 if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3580 if (qlen) {
3581 enqueue:
3582 __skb_queue_tail(&sd->input_pkt_queue, skb);
3583 input_queue_tail_incr_save(sd, qtail);
3584 rps_unlock(sd);
3585 local_irq_restore(flags);
3586 return NET_RX_SUCCESS;
3589 /* Schedule NAPI for backlog device
3590 * We can use non atomic operation since we own the queue lock
3592 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3593 if (!rps_ipi_queued(sd))
3594 ____napi_schedule(sd, &sd->backlog);
3596 goto enqueue;
3599 drop:
3600 sd->dropped++;
3601 rps_unlock(sd);
3603 local_irq_restore(flags);
3605 atomic_long_inc(&skb->dev->rx_dropped);
3606 kfree_skb(skb);
3607 return NET_RX_DROP;
3610 static int netif_rx_internal(struct sk_buff *skb)
3612 int ret;
3614 net_timestamp_check(netdev_tstamp_prequeue, skb);
3616 trace_netif_rx(skb);
3617 #ifdef CONFIG_RPS
3618 if (static_key_false(&rps_needed)) {
3619 struct rps_dev_flow voidflow, *rflow = &voidflow;
3620 int cpu;
3622 preempt_disable();
3623 rcu_read_lock();
3625 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3626 if (cpu < 0)
3627 cpu = smp_processor_id();
3629 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3631 rcu_read_unlock();
3632 preempt_enable();
3633 } else
3634 #endif
3636 unsigned int qtail;
3637 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3638 put_cpu();
3640 return ret;
3644 * netif_rx - post buffer to the network code
3645 * @skb: buffer to post
3647 * This function receives a packet from a device driver and queues it for
3648 * the upper (protocol) levels to process. It always succeeds. The buffer
3649 * may be dropped during processing for congestion control or by the
3650 * protocol layers.
3652 * return values:
3653 * NET_RX_SUCCESS (no congestion)
3654 * NET_RX_DROP (packet was dropped)
3658 int netif_rx(struct sk_buff *skb)
3660 trace_netif_rx_entry(skb);
3662 return netif_rx_internal(skb);
3664 EXPORT_SYMBOL(netif_rx);
3666 int netif_rx_ni(struct sk_buff *skb)
3668 int err;
3670 trace_netif_rx_ni_entry(skb);
3672 preempt_disable();
3673 err = netif_rx_internal(skb);
3674 if (local_softirq_pending())
3675 do_softirq();
3676 preempt_enable();
3678 return err;
3680 EXPORT_SYMBOL(netif_rx_ni);
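/* Illustrative sketch (hypothetical driver code): non-NAPI drivers feed
 * received frames to the stack with netif_rx() from IRQ context, while
 * code running in process context (a workqueue, for instance) should use
 * netif_rx_ni() so pending softirqs get a chance to run:
 *
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);			(interrupt/softirq context)
 *
 *	netif_rx_ni(skb);		(process context)
 */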
3682 static void net_tx_action(struct softirq_action *h)
3684 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3686 if (sd->completion_queue) {
3687 struct sk_buff *clist;
3689 local_irq_disable();
3690 clist = sd->completion_queue;
3691 sd->completion_queue = NULL;
3692 local_irq_enable();
3694 while (clist) {
3695 struct sk_buff *skb = clist;
3696 clist = clist->next;
3698 WARN_ON(atomic_read(&skb->users));
3699 if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3700 trace_consume_skb(skb);
3701 else
3702 trace_kfree_skb(skb, net_tx_action);
3703 __kfree_skb(skb);
3707 if (sd->output_queue) {
3708 struct Qdisc *head;
3710 local_irq_disable();
3711 head = sd->output_queue;
3712 sd->output_queue = NULL;
3713 sd->output_queue_tailp = &sd->output_queue;
3714 local_irq_enable();
3716 while (head) {
3717 struct Qdisc *q = head;
3718 spinlock_t *root_lock;
3720 head = head->next_sched;
3722 root_lock = qdisc_lock(q);
3723 if (spin_trylock(root_lock)) {
3724 smp_mb__before_atomic();
3725 clear_bit(__QDISC_STATE_SCHED,
3726 &q->state);
3727 qdisc_run(q);
3728 spin_unlock(root_lock);
3729 } else {
3730 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3731 &q->state)) {
3732 __netif_reschedule(q);
3733 } else {
3734 smp_mb__before_atomic();
3735 clear_bit(__QDISC_STATE_SCHED,
3736 &q->state);
3743 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3744 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3745 /* This hook is defined here for ATM LANE */
3746 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3747 unsigned char *addr) __read_mostly;
3748 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3749 #endif
3751 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3752 struct packet_type **pt_prev,
3753 int *ret, struct net_device *orig_dev)
3755 #ifdef CONFIG_NET_CLS_ACT
3756 struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3757 struct tcf_result cl_res;
3759 /* If there's at least one ingress present somewhere (so
3760 * we get here via enabled static key), remaining devices
3761 * that are not configured with an ingress qdisc will bail
3762 * out here.
3764 if (!cl)
3765 return skb;
3766 if (*pt_prev) {
3767 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3768 *pt_prev = NULL;
3771 qdisc_skb_cb(skb)->pkt_len = skb->len;
3772 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3773 qdisc_bstats_cpu_update(cl->q, skb);
3775 switch (tc_classify(skb, cl, &cl_res, false)) {
3776 case TC_ACT_OK:
3777 case TC_ACT_RECLASSIFY:
3778 skb->tc_index = TC_H_MIN(cl_res.classid);
3779 break;
3780 case TC_ACT_SHOT:
3781 qdisc_qstats_cpu_drop(cl->q);
3782 case TC_ACT_STOLEN:
3783 case TC_ACT_QUEUED:
3784 kfree_skb(skb);
3785 return NULL;
3786 case TC_ACT_REDIRECT:
3787 /* skb_mac_header check was done by cls/act_bpf, so
3788 * we can safely push the L2 header back before
3789 * redirecting to another netdev
3791 __skb_push(skb, skb->mac_len);
3792 skb_do_redirect(skb);
3793 return NULL;
3794 default:
3795 break;
3797 #endif /* CONFIG_NET_CLS_ACT */
3798 return skb;
3801 /**
3802 * netdev_is_rx_handler_busy - check if receive handler is registered
3803 * @dev: device to check
3804 *
3805 * Check if a receive handler is already registered for a given device.
3806 * Return true if there is one.
3807 *
3808 * The caller must hold the rtnl_mutex.
3809 */
3810 bool netdev_is_rx_handler_busy(struct net_device *dev)
3812 ASSERT_RTNL();
3813 return dev && rtnl_dereference(dev->rx_handler);
3815 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3818 * netdev_rx_handler_register - register receive handler
3819 * @dev: device to register a handler for
3820 * @rx_handler: receive handler to register
3821 * @rx_handler_data: data pointer that is used by rx handler
3823 * Register a receive handler for a device. This handler will then be
3824 * called from __netif_receive_skb. A negative errno code is returned
3825 * on a failure.
3827 * The caller must hold the rtnl_mutex.
3829 * For a general description of rx_handler, see enum rx_handler_result.
3831 int netdev_rx_handler_register(struct net_device *dev,
3832 rx_handler_func_t *rx_handler,
3833 void *rx_handler_data)
3835 ASSERT_RTNL();
3837 if (dev->rx_handler)
3838 return -EBUSY;
3840 /* Note: rx_handler_data must be set before rx_handler */
3841 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3842 rcu_assign_pointer(dev->rx_handler, rx_handler);
3844 return 0;
3846 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3849 * netdev_rx_handler_unregister - unregister receive handler
3850 * @dev: device to unregister a handler from
3852 * Unregister a receive handler from a device.
3854 * The caller must hold the rtnl_mutex.
3856 void netdev_rx_handler_unregister(struct net_device *dev)
3859 ASSERT_RTNL();
3860 RCU_INIT_POINTER(dev->rx_handler, NULL);
3861 /* a reader seeing a non-NULL rx_handler in a rcu_read_lock()
3862 * section is guaranteed to see a non-NULL rx_handler_data
3863 * as well.
3864 */
3865 synchronize_net();
3866 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3868 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
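/* Illustrative sketch (hypothetical code in the spirit of bridge/macvlan):
 * an upper device claims a port's receive path with a handler that either
 * steals the skb or lets __netif_receive_skb_core() continue with it:
 *
 *	static rx_handler_result_t my_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct my_port *port =
 *			rcu_dereference((*pskb)->dev->rx_handler_data);
 *
 *		if (!my_port_wants(port, *pskb))	(made-up predicate)
 *			return RX_HANDLER_PASS;
 *		my_port_rx(port, *pskb);		(made-up consumer)
 *		return RX_HANDLER_CONSUMED;
 *	}
 *
 * Registration is done under rtnl:
 *	err = netdev_rx_handler_register(port_dev, my_handle_frame, port);
 */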
3871 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3872 * the special handling of PFMEMALLOC skbs.
3874 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3876 switch (skb->protocol) {
3877 case htons(ETH_P_ARP):
3878 case htons(ETH_P_IP):
3879 case htons(ETH_P_IPV6):
3880 case htons(ETH_P_8021Q):
3881 case htons(ETH_P_8021AD):
3882 return true;
3883 default:
3884 return false;
3888 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
3889 int *ret, struct net_device *orig_dev)
3891 #ifdef CONFIG_NETFILTER_INGRESS
3892 if (nf_hook_ingress_active(skb)) {
3893 if (*pt_prev) {
3894 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3895 *pt_prev = NULL;
3898 return nf_hook_ingress(skb);
3900 #endif /* CONFIG_NETFILTER_INGRESS */
3901 return 0;
3904 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3906 struct packet_type *ptype, *pt_prev;
3907 rx_handler_func_t *rx_handler;
3908 struct net_device *orig_dev;
3909 bool deliver_exact = false;
3910 int ret = NET_RX_DROP;
3911 __be16 type;
3913 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3915 trace_netif_receive_skb(skb);
3917 orig_dev = skb->dev;
3919 skb_reset_network_header(skb);
3920 if (!skb_transport_header_was_set(skb))
3921 skb_reset_transport_header(skb);
3922 skb_reset_mac_len(skb);
3924 pt_prev = NULL;
3926 another_round:
3927 skb->skb_iif = skb->dev->ifindex;
3929 __this_cpu_inc(softnet_data.processed);
3931 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3932 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3933 skb = skb_vlan_untag(skb);
3934 if (unlikely(!skb))
3935 goto out;
3938 #ifdef CONFIG_NET_CLS_ACT
3939 if (skb->tc_verd & TC_NCLS) {
3940 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3941 goto ncls;
3943 #endif
3945 if (pfmemalloc)
3946 goto skip_taps;
3948 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3949 if (pt_prev)
3950 ret = deliver_skb(skb, pt_prev, orig_dev);
3951 pt_prev = ptype;
3954 list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3955 if (pt_prev)
3956 ret = deliver_skb(skb, pt_prev, orig_dev);
3957 pt_prev = ptype;
3960 skip_taps:
3961 #ifdef CONFIG_NET_INGRESS
3962 if (static_key_false(&ingress_needed)) {
3963 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3964 if (!skb)
3965 goto out;
3967 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
3968 goto out;
3970 #endif
3971 #ifdef CONFIG_NET_CLS_ACT
3972 skb->tc_verd = 0;
3973 ncls:
3974 #endif
3975 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3976 goto drop;
3978 if (skb_vlan_tag_present(skb)) {
3979 if (pt_prev) {
3980 ret = deliver_skb(skb, pt_prev, orig_dev);
3981 pt_prev = NULL;
3983 if (vlan_do_receive(&skb))
3984 goto another_round;
3985 else if (unlikely(!skb))
3986 goto out;
3989 rx_handler = rcu_dereference(skb->dev->rx_handler);
3990 if (rx_handler) {
3991 if (pt_prev) {
3992 ret = deliver_skb(skb, pt_prev, orig_dev);
3993 pt_prev = NULL;
3995 switch (rx_handler(&skb)) {
3996 case RX_HANDLER_CONSUMED:
3997 ret = NET_RX_SUCCESS;
3998 goto out;
3999 case RX_HANDLER_ANOTHER:
4000 goto another_round;
4001 case RX_HANDLER_EXACT:
4002 deliver_exact = true;
4003 case RX_HANDLER_PASS:
4004 break;
4005 default:
4006 BUG();
4010 if (unlikely(skb_vlan_tag_present(skb))) {
4011 if (skb_vlan_tag_get_id(skb))
4012 skb->pkt_type = PACKET_OTHERHOST;
4013 /* Note: we might in the future use prio bits
4014 * and set skb->priority like in vlan_do_receive()
4015 * For the time being, just ignore Priority Code Point
4017 skb->vlan_tci = 0;
4020 type = skb->protocol;
4022 /* deliver only exact match when indicated */
4023 if (likely(!deliver_exact)) {
4024 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4025 &ptype_base[ntohs(type) &
4026 PTYPE_HASH_MASK]);
4029 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4030 &orig_dev->ptype_specific);
4032 if (unlikely(skb->dev != orig_dev)) {
4033 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4034 &skb->dev->ptype_specific);
4037 if (pt_prev) {
4038 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4039 goto drop;
4040 else
4041 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4042 } else {
4043 drop:
4044 atomic_long_inc(&skb->dev->rx_dropped);
4045 kfree_skb(skb);
4046 /* Jamal, now you will not be able to escape explaining
4047 * to me how you were going to use this. :-)
4048 */
4049 ret = NET_RX_DROP;
4052 out:
4053 return ret;
4056 static int __netif_receive_skb(struct sk_buff *skb)
4058 int ret;
4060 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4061 unsigned long pflags = current->flags;
4064 * PFMEMALLOC skbs are special, they should
4065 * - be delivered to SOCK_MEMALLOC sockets only
4066 * - stay away from userspace
4067 * - have bounded memory usage
4069 * Use PF_MEMALLOC as this saves us from propagating the allocation
4070 * context down to all allocation sites.
4072 current->flags |= PF_MEMALLOC;
4073 ret = __netif_receive_skb_core(skb, true);
4074 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4075 } else
4076 ret = __netif_receive_skb_core(skb, false);
4078 return ret;
4081 static int netif_receive_skb_internal(struct sk_buff *skb)
4083 int ret;
4085 net_timestamp_check(netdev_tstamp_prequeue, skb);
4087 if (skb_defer_rx_timestamp(skb))
4088 return NET_RX_SUCCESS;
4090 rcu_read_lock();
4092 #ifdef CONFIG_RPS
4093 if (static_key_false(&rps_needed)) {
4094 struct rps_dev_flow voidflow, *rflow = &voidflow;
4095 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4097 if (cpu >= 0) {
4098 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4099 rcu_read_unlock();
4100 return ret;
4103 #endif
4104 ret = __netif_receive_skb(skb);
4105 rcu_read_unlock();
4106 return ret;
4110 * netif_receive_skb - process receive buffer from network
4111 * @skb: buffer to process
4113 * netif_receive_skb() is the main receive data processing function.
4114 * It always succeeds. The buffer may be dropped during processing
4115 * for congestion control or by the protocol layers.
4117 * This function may only be called from softirq context and interrupts
4118 * should be enabled.
4120 * Return values (usually ignored):
4121 * NET_RX_SUCCESS: no congestion
4122 * NET_RX_DROP: packet was dropped
4124 int netif_receive_skb(struct sk_buff *skb)
4126 trace_netif_receive_skb_entry(skb);
4128 return netif_receive_skb_internal(skb);
4130 EXPORT_SYMBOL(netif_receive_skb);
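/* Illustrative sketch (hypothetical driver code): NAPI drivers call this
 * from their poll routine, in softirq context with interrupts enabled:
 *
 *	static int my_poll(struct napi_struct *napi, int budget)
 *	{
 *		int done = 0;
 *		struct sk_buff *skb;
 *
 *		while (done < budget && (skb = my_next_rx_skb(napi))) {
 *			skb->protocol = eth_type_trans(skb, napi->dev);
 *			netif_receive_skb(skb);
 *			done++;
 *		}
 *		if (done < budget)
 *			napi_complete(napi);
 *		return done;
 *	}
 *
 * my_next_rx_skb() is a made-up helper; most drivers would actually call
 * napi_gro_receive() here to use the GRO path implemented below.
 */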
4132 /* Network device is going away, flush any packets still pending
4133 * Called with irqs disabled.
4135 static void flush_backlog(void *arg)
4137 struct net_device *dev = arg;
4138 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4139 struct sk_buff *skb, *tmp;
4141 rps_lock(sd);
4142 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4143 if (skb->dev == dev) {
4144 __skb_unlink(skb, &sd->input_pkt_queue);
4145 kfree_skb(skb);
4146 input_queue_head_incr(sd);
4149 rps_unlock(sd);
4151 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4152 if (skb->dev == dev) {
4153 __skb_unlink(skb, &sd->process_queue);
4154 kfree_skb(skb);
4155 input_queue_head_incr(sd);
4160 static int napi_gro_complete(struct sk_buff *skb)
4162 struct packet_offload *ptype;
4163 __be16 type = skb->protocol;
4164 struct list_head *head = &offload_base;
4165 int err = -ENOENT;
4167 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4169 if (NAPI_GRO_CB(skb)->count == 1) {
4170 skb_shinfo(skb)->gso_size = 0;
4171 goto out;
4174 rcu_read_lock();
4175 list_for_each_entry_rcu(ptype, head, list) {
4176 if (ptype->type != type || !ptype->callbacks.gro_complete)
4177 continue;
4179 err = ptype->callbacks.gro_complete(skb, 0);
4180 break;
4182 rcu_read_unlock();
4184 if (err) {
4185 WARN_ON(&ptype->list == head);
4186 kfree_skb(skb);
4187 return NET_RX_SUCCESS;
4190 out:
4191 return netif_receive_skb_internal(skb);
4194 /* napi->gro_list contains packets ordered by age;
4195 * the youngest packets are at the head of it.
4196 * Complete skbs in reverse order to reduce latencies.
4197 */
4198 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4200 struct sk_buff *skb, *prev = NULL;
4202 /* scan list and build reverse chain */
4203 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4204 skb->prev = prev;
4205 prev = skb;
4208 for (skb = prev; skb; skb = prev) {
4209 skb->next = NULL;
4211 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4212 return;
4214 prev = skb->prev;
4215 napi_gro_complete(skb);
4216 napi->gro_count--;
4219 napi->gro_list = NULL;
4221 EXPORT_SYMBOL(napi_gro_flush);
4223 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4225 struct sk_buff *p;
4226 unsigned int maclen = skb->dev->hard_header_len;
4227 u32 hash = skb_get_hash_raw(skb);
4229 for (p = napi->gro_list; p; p = p->next) {
4230 unsigned long diffs;
4232 NAPI_GRO_CB(p)->flush = 0;
4234 if (hash != skb_get_hash_raw(p)) {
4235 NAPI_GRO_CB(p)->same_flow = 0;
4236 continue;
4239 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4240 diffs |= p->vlan_tci ^ skb->vlan_tci;
4241 diffs |= skb_metadata_dst_cmp(p, skb);
4242 if (maclen == ETH_HLEN)
4243 diffs |= compare_ether_header(skb_mac_header(p),
4244 skb_mac_header(skb));
4245 else if (!diffs)
4246 diffs = memcmp(skb_mac_header(p),
4247 skb_mac_header(skb),
4248 maclen);
4249 NAPI_GRO_CB(p)->same_flow = !diffs;
4253 static void skb_gro_reset_offset(struct sk_buff *skb)
4255 const struct skb_shared_info *pinfo = skb_shinfo(skb);
4256 const skb_frag_t *frag0 = &pinfo->frags[0];
4258 NAPI_GRO_CB(skb)->data_offset = 0;
4259 NAPI_GRO_CB(skb)->frag0 = NULL;
4260 NAPI_GRO_CB(skb)->frag0_len = 0;
4262 if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4263 pinfo->nr_frags &&
4264 !PageHighMem(skb_frag_page(frag0))) {
4265 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4266 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
4267 skb_frag_size(frag0),
4268 skb->end - skb->tail);
4272 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4274 struct skb_shared_info *pinfo = skb_shinfo(skb);
4276 BUG_ON(skb->end - skb->tail < grow);
4278 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4280 skb->data_len -= grow;
4281 skb->tail += grow;
4283 pinfo->frags[0].page_offset += grow;
4284 skb_frag_size_sub(&pinfo->frags[0], grow);
4286 if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4287 skb_frag_unref(skb, 0);
4288 memmove(pinfo->frags, pinfo->frags + 1,
4289 --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4293 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4295 struct sk_buff **pp = NULL;
4296 struct packet_offload *ptype;
4297 __be16 type = skb->protocol;
4298 struct list_head *head = &offload_base;
4299 int same_flow;
4300 enum gro_result ret;
4301 int grow;
4303 if (!(skb->dev->features & NETIF_F_GRO))
4304 goto normal;
4306 if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4307 goto normal;
4309 gro_list_prepare(napi, skb);
4311 rcu_read_lock();
4312 list_for_each_entry_rcu(ptype, head, list) {
4313 if (ptype->type != type || !ptype->callbacks.gro_receive)
4314 continue;
4316 skb_set_network_header(skb, skb_gro_offset(skb));
4317 skb_reset_mac_len(skb);
4318 NAPI_GRO_CB(skb)->same_flow = 0;
4319 NAPI_GRO_CB(skb)->flush = 0;
4320 NAPI_GRO_CB(skb)->free = 0;
4321 NAPI_GRO_CB(skb)->encap_mark = 0;
4322 NAPI_GRO_CB(skb)->recursion_counter = 0;
4323 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4325 /* Setup for GRO checksum validation */
4326 switch (skb->ip_summed) {
4327 case CHECKSUM_COMPLETE:
4328 NAPI_GRO_CB(skb)->csum = skb->csum;
4329 NAPI_GRO_CB(skb)->csum_valid = 1;
4330 NAPI_GRO_CB(skb)->csum_cnt = 0;
4331 break;
4332 case CHECKSUM_UNNECESSARY:
4333 NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4334 NAPI_GRO_CB(skb)->csum_valid = 0;
4335 break;
4336 default:
4337 NAPI_GRO_CB(skb)->csum_cnt = 0;
4338 NAPI_GRO_CB(skb)->csum_valid = 0;
4341 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4342 break;
4344 rcu_read_unlock();
4346 if (&ptype->list == head)
4347 goto normal;
4349 same_flow = NAPI_GRO_CB(skb)->same_flow;
4350 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4352 if (pp) {
4353 struct sk_buff *nskb = *pp;
4355 *pp = nskb->next;
4356 nskb->next = NULL;
4357 napi_gro_complete(nskb);
4358 napi->gro_count--;
4361 if (same_flow)
4362 goto ok;
4364 if (NAPI_GRO_CB(skb)->flush)
4365 goto normal;
4367 if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4368 struct sk_buff *nskb = napi->gro_list;
4370 /* locate the end of the list to select the 'oldest' flow */
4371 while (nskb->next) {
4372 pp = &nskb->next;
4373 nskb = *pp;
4375 *pp = NULL;
4376 nskb->next = NULL;
4377 napi_gro_complete(nskb);
4378 } else {
4379 napi->gro_count++;
4381 NAPI_GRO_CB(skb)->count = 1;
4382 NAPI_GRO_CB(skb)->age = jiffies;
4383 NAPI_GRO_CB(skb)->last = skb;
4384 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4385 skb->next = napi->gro_list;
4386 napi->gro_list = skb;
4387 ret = GRO_HELD;
4389 pull:
4390 grow = skb_gro_offset(skb) - skb_headlen(skb);
4391 if (grow > 0)
4392 gro_pull_from_frag0(skb, grow);
4394 return ret;
4396 normal:
4397 ret = GRO_NORMAL;
4398 goto pull;
4401 struct packet_offload *gro_find_receive_by_type(__be16 type)
4403 struct list_head *offload_head = &offload_base;
4404 struct packet_offload *ptype;
4406 list_for_each_entry_rcu(ptype, offload_head, list) {
4407 if (ptype->type != type || !ptype->callbacks.gro_receive)
4408 continue;
4409 return ptype;
4411 return NULL;
4413 EXPORT_SYMBOL(gro_find_receive_by_type);
4415 struct packet_offload *gro_find_complete_by_type(__be16 type)
4417 struct list_head *offload_head = &offload_base;
4418 struct packet_offload *ptype;
4420 list_for_each_entry_rcu(ptype, offload_head, list) {
4421 if (ptype->type != type || !ptype->callbacks.gro_complete)
4422 continue;
4423 return ptype;
4425 return NULL;
4427 EXPORT_SYMBOL(gro_find_complete_by_type);
4429 static void napi_skb_free_stolen_head(struct sk_buff *skb)
4431 skb_dst_drop(skb);
4432 kmem_cache_free(skbuff_head_cache, skb);
4435 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4437 switch (ret) {
4438 case GRO_NORMAL:
4439 if (netif_receive_skb_internal(skb))
4440 ret = GRO_DROP;
4441 break;
4443 case GRO_DROP:
4444 kfree_skb(skb);
4445 break;
4447 case GRO_MERGED_FREE:
4448 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4449 napi_skb_free_stolen_head(skb);
4450 else
4451 __kfree_skb(skb);
4452 break;
4454 case GRO_HELD:
4455 case GRO_MERGED:
4456 break;
4459 return ret;
4462 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4464 trace_napi_gro_receive_entry(skb);
4466 skb_gro_reset_offset(skb);
4468 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4470 EXPORT_SYMBOL(napi_gro_receive);
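/* Illustrative sketch, not part of this file: a GRO-capable driver feeds
 * fully built skbs to napi_gro_receive() from its poll handler instead of
 * calling netif_receive_skb() directly, so that same-flow packets can be
 * merged on napi->gro_list. example_gro_deliver() is a hypothetical helper
 * assumed for this sketch.
 */
static void example_gro_deliver(struct napi_struct *napi, struct sk_buff *skb)
{
	/* protocol must be set before GRO looks up the packet_offload */
	skb->protocol = eth_type_trans(skb, napi->dev);

	/* GRO either merges the skb, holds it on gro_list, or passes it up;
	 * the gro_result_t return value is rarely checked by drivers.
	 */
	napi_gro_receive(napi, skb);
}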
4472 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4474 if (unlikely(skb->pfmemalloc)) {
4475 consume_skb(skb);
4476 return;
4478 __skb_pull(skb, skb_headlen(skb));
4479 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4480 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4481 skb->vlan_tci = 0;
4482 skb->dev = napi->dev;
4483 skb->skb_iif = 0;
4485 /* eth_type_trans() assumes pkt_type is PACKET_HOST */
4486 skb->pkt_type = PACKET_HOST;
4488 skb->encapsulation = 0;
4489 skb_shinfo(skb)->gso_type = 0;
4490 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4492 napi->skb = skb;
4495 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4497 struct sk_buff *skb = napi->skb;
4499 if (!skb) {
4500 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4501 napi->skb = skb;
4503 return skb;
4505 EXPORT_SYMBOL(napi_get_frags);
4507 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4508 struct sk_buff *skb,
4509 gro_result_t ret)
4511 switch (ret) {
4512 case GRO_NORMAL:
4513 case GRO_HELD:
4514 __skb_push(skb, ETH_HLEN);
4515 skb->protocol = eth_type_trans(skb, skb->dev);
4516 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4517 ret = GRO_DROP;
4518 break;
4520 case GRO_DROP:
4521 napi_reuse_skb(napi, skb);
4522 break;
4524 case GRO_MERGED_FREE:
4525 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
4526 napi_skb_free_stolen_head(skb);
4527 else
4528 napi_reuse_skb(napi, skb);
4529 break;
4531 case GRO_MERGED:
4532 break;
4535 return ret;
4538 /* The upper GRO stack assumes the network header starts at gro_offset=0.
4539 * Drivers may call both napi_gro_frags() and napi_gro_receive(),
4540 * so we copy the Ethernet header into skb->data to have a common layout.
4542 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4544 struct sk_buff *skb = napi->skb;
4545 const struct ethhdr *eth;
4546 unsigned int hlen = sizeof(*eth);
4548 napi->skb = NULL;
4550 skb_reset_mac_header(skb);
4551 skb_gro_reset_offset(skb);
4553 eth = skb_gro_header_fast(skb, 0);
4554 if (unlikely(skb_gro_header_hard(skb, hlen))) {
4555 eth = skb_gro_header_slow(skb, hlen, 0);
4556 if (unlikely(!eth)) {
4557 napi_reuse_skb(napi, skb);
4558 return NULL;
4560 } else {
4561 gro_pull_from_frag0(skb, hlen);
4562 NAPI_GRO_CB(skb)->frag0 += hlen;
4563 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4565 __skb_pull(skb, hlen);
4568 * This works because the only protocols we care about don't require
4569 * special handling.
4570 * We'll fix it up properly in napi_frags_finish()
4572 skb->protocol = eth->h_proto;
4574 return skb;
4577 gro_result_t napi_gro_frags(struct napi_struct *napi)
4579 struct sk_buff *skb = napi_frags_skb(napi);
4581 if (!skb)
4582 return GRO_DROP;
4584 trace_napi_gro_frags_entry(skb);
4586 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4588 EXPORT_SYMBOL(napi_gro_frags);
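/* Illustrative sketch, not part of this file: drivers that receive directly
 * into page fragments use the napi_get_frags()/napi_gro_frags() pair instead
 * of building a linear skb themselves; napi_frags_skb() above then pulls the
 * Ethernet header out of frag0. example_rx_page() and its arguments are
 * assumptions made for this sketch.
 */
static void example_rx_page(struct napi_struct *napi, struct page *page,
			    unsigned int offset, unsigned int len,
			    unsigned int truesize)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb)) {
		put_page(page);		/* allocation failed: drop the buffer */
		return;
	}

	/* attach the page as frag 0; this also accounts len and truesize */
	skb_add_rx_frag(skb, 0, page, offset, len, truesize);

	/* hands the frag-only skb to GRO; it consumes or recycles napi->skb */
	napi_gro_frags(napi);
}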
4590 /* Compute the checksum from gro_offset and return the folded value
4591 * after adding in any pseudo checksum.
4593 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4595 __wsum wsum;
4596 __sum16 sum;
4598 wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4600 /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4601 sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4602 if (likely(!sum)) {
4603 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4604 !skb->csum_complete_sw)
4605 netdev_rx_csum_fault(skb->dev);
4608 NAPI_GRO_CB(skb)->csum = wsum;
4609 NAPI_GRO_CB(skb)->csum_valid = 1;
4611 return sum;
4613 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4616 * net_rps_action_and_irq_enable() sends any pending IPIs for RPS.
4617 * Note: called with local irq disabled, but exits with local irq enabled.
4619 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4621 #ifdef CONFIG_RPS
4622 struct softnet_data *remsd = sd->rps_ipi_list;
4624 if (remsd) {
4625 sd->rps_ipi_list = NULL;
4627 local_irq_enable();
4629 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4630 while (remsd) {
4631 struct softnet_data *next = remsd->rps_ipi_next;
4633 if (cpu_online(remsd->cpu))
4634 smp_call_function_single_async(remsd->cpu,
4635 &remsd->csd);
4636 remsd = next;
4638 } else
4639 #endif
4640 local_irq_enable();
4643 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4645 #ifdef CONFIG_RPS
4646 return sd->rps_ipi_list != NULL;
4647 #else
4648 return false;
4649 #endif
4652 static int process_backlog(struct napi_struct *napi, int quota)
4654 int work = 0;
4655 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4657 /* Check if we have pending IPIs; it's better to send them now
4658 * rather than waiting for net_rx_action() to end.
4660 if (sd_has_rps_ipi_waiting(sd)) {
4661 local_irq_disable();
4662 net_rps_action_and_irq_enable(sd);
4665 napi->weight = weight_p;
4666 local_irq_disable();
4667 while (1) {
4668 struct sk_buff *skb;
4670 while ((skb = __skb_dequeue(&sd->process_queue))) {
4671 rcu_read_lock();
4672 local_irq_enable();
4673 __netif_receive_skb(skb);
4674 rcu_read_unlock();
4675 local_irq_disable();
4676 input_queue_head_incr(sd);
4677 if (++work >= quota) {
4678 local_irq_enable();
4679 return work;
4683 rps_lock(sd);
4684 if (skb_queue_empty(&sd->input_pkt_queue)) {
4686 * Inline a custom version of __napi_complete().
4687 * Only the current CPU owns and manipulates this napi,
4688 * and NAPI_STATE_SCHED is the only possible flag set
4689 * on the backlog.
4690 * We can use a plain write instead of clear_bit(),
4691 * and we don't need an smp_mb() memory barrier.
4693 napi->state = 0;
4694 rps_unlock(sd);
4696 break;
4699 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4700 &sd->process_queue);
4701 rps_unlock(sd);
4703 local_irq_enable();
4705 return work;
4709 * __napi_schedule - schedule for receive
4710 * @n: entry to schedule
4712 * The entry's receive function will be scheduled to run.
4713 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4715 void __napi_schedule(struct napi_struct *n)
4717 unsigned long flags;
4719 local_irq_save(flags);
4720 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4721 local_irq_restore(flags);
4723 EXPORT_SYMBOL(__napi_schedule);
4726 * __napi_schedule_irqoff - schedule for receive
4727 * @n: entry to schedule
4729 * Variant of __napi_schedule() assuming hard irqs are masked
4731 void __napi_schedule_irqoff(struct napi_struct *n)
4733 ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4735 EXPORT_SYMBOL(__napi_schedule_irqoff);
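/* Illustrative sketch, not part of this file: the usual producer of NAPI work
 * is the device interrupt handler, which masks further RX interrupts and
 * schedules the poll routine; napi_schedule_irqoff() tests NAPI_STATE_SCHED
 * and ends up in __napi_schedule_irqoff() above. struct example_priv,
 * example_mask_rx_irqs() and example_irq() are assumptions for this sketch.
 */
struct example_priv {
	struct napi_struct napi;
};

static void example_mask_rx_irqs(struct example_priv *priv)
{
	/* a device-specific register write would go here */
}

static irqreturn_t example_irq(int irq, void *dev_id)
{
	struct example_priv *priv = dev_id;

	example_mask_rx_irqs(priv);
	/* hard irqs are already off here, so the _irqoff variant suffices */
	napi_schedule_irqoff(&priv->napi);

	return IRQ_HANDLED;
}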
4737 void __napi_complete(struct napi_struct *n)
4739 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4741 list_del_init(&n->poll_list);
4742 smp_mb__before_atomic();
4743 clear_bit(NAPI_STATE_SCHED, &n->state);
4745 EXPORT_SYMBOL(__napi_complete);
4747 void napi_complete_done(struct napi_struct *n, int work_done)
4749 unsigned long flags;
4752 * Don't let NAPI dequeue from the CPU poll list,
4753 * just in case it's running on a different CPU.
4755 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4756 return;
4758 if (n->gro_list) {
4759 unsigned long timeout = 0;
4761 if (work_done)
4762 timeout = n->dev->gro_flush_timeout;
4764 if (timeout)
4765 hrtimer_start(&n->timer, ns_to_ktime(timeout),
4766 HRTIMER_MODE_REL_PINNED);
4767 else
4768 napi_gro_flush(n, false);
4770 if (likely(list_empty(&n->poll_list))) {
4771 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4772 } else {
4773 /* If n->poll_list is not empty, we need to mask irqs */
4774 local_irq_save(flags);
4775 __napi_complete(n);
4776 local_irq_restore(flags);
4779 EXPORT_SYMBOL(napi_complete_done);
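/* Illustrative sketch, not part of this file: a poll routine cleans up to
 * @budget packets and completes NAPI only when it did less than a full
 * budget's worth of work; consuming the whole budget keeps it scheduled,
 * which napi_poll() below relies on. example_clean_rx() and
 * example_unmask_rx_irqs() stand in for device-specific helpers and, like
 * struct example_priv from the earlier sketch, are assumptions.
 */
static int example_clean_rx(struct example_priv *priv, int budget)
{
	return 0;	/* device-specific RX ring cleanup would go here */
}

static void example_unmask_rx_irqs(struct example_priv *priv)
{
	/* a device-specific register write would go here */
}

static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_priv *priv = container_of(napi, struct example_priv, napi);
	int work_done = example_clean_rx(priv, budget);

	if (work_done < budget) {
		/* reports the work count; may arm dev->gro_flush_timeout */
		napi_complete_done(napi, work_done);
		example_unmask_rx_irqs(priv);
	}

	return work_done;
}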
4781 /* must be called under rcu_read_lock(), as we dont take a reference */
4782 struct napi_struct *napi_by_id(unsigned int napi_id)
4784 unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4785 struct napi_struct *napi;
4787 hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4788 if (napi->napi_id == napi_id)
4789 return napi;
4791 return NULL;
4793 EXPORT_SYMBOL_GPL(napi_by_id);
4795 void napi_hash_add(struct napi_struct *napi)
4797 if (test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
4798 return;
4800 spin_lock(&napi_hash_lock);
4802 /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
4803 do {
4804 if (unlikely(++napi_gen_id < NR_CPUS + 1))
4805 napi_gen_id = NR_CPUS + 1;
4806 } while (napi_by_id(napi_gen_id));
4807 napi->napi_id = napi_gen_id;
4809 hlist_add_head_rcu(&napi->napi_hash_node,
4810 &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4812 spin_unlock(&napi_hash_lock);
4814 EXPORT_SYMBOL_GPL(napi_hash_add);
4816 /* Warning: the caller is responsible for making sure an RCU grace period
4817 * has elapsed before freeing the memory containing @napi.
4819 void napi_hash_del(struct napi_struct *napi)
4821 spin_lock(&napi_hash_lock);
4823 if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
4824 hlist_del_rcu(&napi->napi_hash_node);
4826 spin_unlock(&napi_hash_lock);
4828 EXPORT_SYMBOL_GPL(napi_hash_del);
4830 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4832 struct napi_struct *napi;
4834 napi = container_of(timer, struct napi_struct, timer);
4835 if (napi->gro_list)
4836 napi_schedule(napi);
4838 return HRTIMER_NORESTART;
4841 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4842 int (*poll)(struct napi_struct *, int), int weight)
4844 INIT_LIST_HEAD(&napi->poll_list);
4845 hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4846 napi->timer.function = napi_watchdog;
4847 napi->gro_count = 0;
4848 napi->gro_list = NULL;
4849 napi->skb = NULL;
4850 napi->poll = poll;
4851 if (weight > NAPI_POLL_WEIGHT)
4852 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4853 weight, dev->name);
4854 napi->weight = weight;
4855 list_add(&napi->dev_list, &dev->napi_list);
4856 napi->dev = dev;
4857 #ifdef CONFIG_NETPOLL
4858 spin_lock_init(&napi->poll_lock);
4859 napi->poll_owner = -1;
4860 #endif
4861 set_bit(NAPI_STATE_SCHED, &napi->state);
4863 EXPORT_SYMBOL(netif_napi_add);
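/* Illustrative sketch, not part of this file: drivers register their poll
 * routine once at setup time and enable it before the device starts
 * receiving. This reuses example_priv/example_poll from the sketches above
 * and assumes the netdev was allocated with room for struct example_priv
 * (e.g. alloc_etherdev(sizeof(struct example_priv))).
 */
static void example_setup_napi(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	netif_napi_add(dev, &priv->napi, example_poll, NAPI_POLL_WEIGHT);
	napi_enable(&priv->napi);
}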
4865 void napi_disable(struct napi_struct *n)
4867 might_sleep();
4868 set_bit(NAPI_STATE_DISABLE, &n->state);
4870 while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4871 msleep(1);
4872 while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
4873 msleep(1);
4875 hrtimer_cancel(&n->timer);
4877 clear_bit(NAPI_STATE_DISABLE, &n->state);
4879 EXPORT_SYMBOL(napi_disable);
4881 void netif_napi_del(struct napi_struct *napi)
4883 list_del_init(&napi->dev_list);
4884 napi_free_frags(napi);
4886 kfree_skb_list(napi->gro_list);
4887 napi->gro_list = NULL;
4888 napi->gro_count = 0;
4890 EXPORT_SYMBOL(netif_napi_del);
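/* Illustrative sketch, not part of this file: teardown is the mirror image
 * and ordering matters; napi_disable() waits for any in-flight poll to finish
 * before netif_napi_del() releases the GRO state. example_teardown_napi() is
 * an assumption made for this sketch.
 */
static void example_teardown_napi(struct net_device *dev)
{
	struct example_priv *priv = netdev_priv(dev);

	napi_disable(&priv->napi);	/* blocks until we own NAPI_STATE_SCHED */
	netif_napi_del(&priv->napi);	/* frees gro_list and napi->skb */
}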
4892 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
4894 void *have;
4895 int work, weight;
4897 list_del_init(&n->poll_list);
4899 have = netpoll_poll_lock(n);
4901 weight = n->weight;
4903 /* This NAPI_STATE_SCHED test is for avoiding a race
4904 * with netpoll's poll_napi(). Only the entity which
4905 * obtains the lock and sees NAPI_STATE_SCHED set will
4906 * actually make the ->poll() call. Therefore we avoid
4907 * accidentally calling ->poll() when NAPI is not scheduled.
4909 work = 0;
4910 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4911 work = n->poll(n, weight);
4912 trace_napi_poll(n);
4915 WARN_ON_ONCE(work > weight);
4917 if (likely(work < weight))
4918 goto out_unlock;
4920 /* Drivers must not modify the NAPI state if they
4921 * consume the entire weight. In such cases this code
4922 * still "owns" the NAPI instance and therefore can
4923 * move the instance around on the list at-will.
4925 if (unlikely(napi_disable_pending(n))) {
4926 napi_complete(n);
4927 goto out_unlock;
4930 if (n->gro_list) {
4931 /* Flush packets that are too old.
4932 * If HZ < 1000, flush all packets.
4934 napi_gro_flush(n, HZ >= 1000);
4937 /* Some drivers may have called napi_schedule
4938 * prior to exhausting their budget.
4940 if (unlikely(!list_empty(&n->poll_list))) {
4941 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
4942 n->dev ? n->dev->name : "backlog");
4943 goto out_unlock;
4946 list_add_tail(&n->poll_list, repoll);
4948 out_unlock:
4949 netpoll_poll_unlock(have);
4951 return work;
4954 static void net_rx_action(struct softirq_action *h)
4956 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4957 unsigned long time_limit = jiffies + 2;
4958 int budget = netdev_budget;
4959 LIST_HEAD(list);
4960 LIST_HEAD(repoll);
4962 local_irq_disable();
4963 list_splice_init(&sd->poll_list, &list);
4964 local_irq_enable();
4966 for (;;) {
4967 struct napi_struct *n;
4969 if (list_empty(&list)) {
4970 if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
4971 return;
4972 break;
4975 n = list_first_entry(&list, struct napi_struct, poll_list);
4976 budget -= napi_poll(n, &repoll);
4978 /* If softirq window is exhausted then punt.
4979 * Allow this to run for 2 jiffies, which will allow
4980 * an average latency of 1.5/HZ.
4982 if (unlikely(budget <= 0 ||
4983 time_after_eq(jiffies, time_limit))) {
4984 sd->time_squeeze++;
4985 break;
4989 local_irq_disable();
4991 list_splice_tail_init(&sd->poll_list, &list);
4992 list_splice_tail(&repoll, &list);
4993 list_splice(&list, &sd->poll_list);
4994 if (!list_empty(&sd->poll_list))
4995 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4997 net_rps_action_and_irq_enable(sd);
5000 struct netdev_adjacent {
5001 struct net_device *dev;
5003 /* upper master flag, there can only be one master device per list */
5004 bool master;
5006 /* counter for the number of times this device was added to us */
5007 u16 ref_nr;
5009 /* private field for the users */
5010 void *private;
5012 struct list_head list;
5013 struct rcu_head rcu;
5016 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5017 struct list_head *adj_list)
5019 struct netdev_adjacent *adj;
5021 list_for_each_entry(adj, adj_list, list) {
5022 if (adj->dev == adj_dev)
5023 return adj;
5025 return NULL;
5029 * netdev_has_upper_dev - Check if device is linked to an upper device
5030 * @dev: device
5031 * @upper_dev: upper device to check
5033 * Find out if a device is linked to the specified upper device and return true
5034 * if it is. Note that this checks only the immediate upper device,
5035 * not the complete stack of devices. The caller must hold the RTNL lock.
5037 bool netdev_has_upper_dev(struct net_device *dev,
5038 struct net_device *upper_dev)
5040 ASSERT_RTNL();
5042 return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
5044 EXPORT_SYMBOL(netdev_has_upper_dev);
5047 * netdev_has_any_upper_dev - Check if device is linked to some device
5048 * @dev: device
5050 * Find out if a device is linked to an upper device and return true in case
5051 * it is. The caller must hold the RTNL lock.
5053 static bool netdev_has_any_upper_dev(struct net_device *dev)
5055 ASSERT_RTNL();
5057 return !list_empty(&dev->all_adj_list.upper);
5061 * netdev_master_upper_dev_get - Get master upper device
5062 * @dev: device
5064 * Find a master upper device and return pointer to it or NULL in case
5065 * it's not there. The caller must hold the RTNL lock.
5067 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5069 struct netdev_adjacent *upper;
5071 ASSERT_RTNL();
5073 if (list_empty(&dev->adj_list.upper))
5074 return NULL;
5076 upper = list_first_entry(&dev->adj_list.upper,
5077 struct netdev_adjacent, list);
5078 if (likely(upper->master))
5079 return upper->dev;
5080 return NULL;
5082 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5084 void *netdev_adjacent_get_private(struct list_head *adj_list)
5086 struct netdev_adjacent *adj;
5088 adj = list_entry(adj_list, struct netdev_adjacent, list);
5090 return adj->private;
5092 EXPORT_SYMBOL(netdev_adjacent_get_private);
5095 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5096 * @dev: device
5097 * @iter: list_head ** of the current position
5099 * Gets the next device from the dev's upper list, starting from iter
5100 * position. The caller must hold RCU read lock.
5102 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5103 struct list_head **iter)
5105 struct netdev_adjacent *upper;
5107 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5109 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5111 if (&upper->list == &dev->adj_list.upper)
5112 return NULL;
5114 *iter = &upper->list;
5116 return upper->dev;
5118 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
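/* Illustrative sketch, not part of this file: walking the immediate upper
 * devices with the RCU iterator above. example_count_uppers() is an
 * assumption made for this sketch.
 */
static int example_count_uppers(struct net_device *dev)
{
	struct list_head *iter = &dev->adj_list.upper;
	struct net_device *upper;
	int n = 0;

	rcu_read_lock();
	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)) != NULL)
		n++;
	rcu_read_unlock();

	return n;
}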
5121 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
5122 * @dev: device
5123 * @iter: list_head ** of the current position
5125 * Gets the next device from the dev's upper list, starting from iter
5126 * position. The caller must hold RCU read lock.
5128 struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
5129 struct list_head **iter)
5131 struct netdev_adjacent *upper;
5133 WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5135 upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5137 if (&upper->list == &dev->all_adj_list.upper)
5138 return NULL;
5140 *iter = &upper->list;
5142 return upper->dev;
5144 EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);
5147 * netdev_lower_get_next_private - Get the next ->private from the
5148 * lower neighbour list
5149 * @dev: device
5150 * @iter: list_head ** of the current position
5152 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5153 * list, starting from iter position. The caller must either hold the
5154 * RTNL lock or its own locking that guarantees that the neighbour lower
5155 * list will remain unchanged.
5157 void *netdev_lower_get_next_private(struct net_device *dev,
5158 struct list_head **iter)
5160 struct netdev_adjacent *lower;
5162 lower = list_entry(*iter, struct netdev_adjacent, list);
5164 if (&lower->list == &dev->adj_list.lower)
5165 return NULL;
5167 *iter = lower->list.next;
5169 return lower->private;
5171 EXPORT_SYMBOL(netdev_lower_get_next_private);
5174 * netdev_lower_get_next_private_rcu - Get the next ->private from the
5175 * lower neighbour list, RCU
5176 * variant
5177 * @dev: device
5178 * @iter: list_head ** of the current position
5180 * Gets the next netdev_adjacent->private from the dev's lower neighbour
5181 * list, starting from iter position. The caller must hold RCU read lock.
5183 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5184 struct list_head **iter)
5186 struct netdev_adjacent *lower;
5188 WARN_ON_ONCE(!rcu_read_lock_held());
5190 lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5192 if (&lower->list == &dev->adj_list.lower)
5193 return NULL;
5195 *iter = &lower->list;
5197 return lower->private;
5199 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5202 * netdev_lower_get_next - Get the next device from the lower neighbour
5203 * list
5204 * @dev: device
5205 * @iter: list_head ** of the current position
5207 * Gets the next netdev_adjacent from the dev's lower neighbour
5208 * list, starting from iter position. The caller must hold RTNL lock or
5209 * its own locking that guarantees that the neighbour lower
5210 * list will remain unchanged.
5212 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5214 struct netdev_adjacent *lower;
5216 lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5218 if (&lower->list == &dev->adj_list.lower)
5219 return NULL;
5221 *iter = &lower->list;
5223 return lower->dev;
5225 EXPORT_SYMBOL(netdev_lower_get_next);
5228 * netdev_lower_get_first_private_rcu - Get the first ->private from the
5229 * lower neighbour list, RCU
5230 * variant
5231 * @dev: device
5233 * Gets the first netdev_adjacent->private from the dev's lower neighbour
5234 * list. The caller must hold RCU read lock.
5236 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5238 struct netdev_adjacent *lower;
5240 lower = list_first_or_null_rcu(&dev->adj_list.lower,
5241 struct netdev_adjacent, list);
5242 if (lower)
5243 return lower->private;
5244 return NULL;
5246 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5249 * netdev_master_upper_dev_get_rcu - Get master upper device
5250 * @dev: device
5252 * Find a master upper device and return pointer to it or NULL in case
5253 * it's not there. The caller must hold the RCU read lock.
5255 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5257 struct netdev_adjacent *upper;
5259 upper = list_first_or_null_rcu(&dev->adj_list.upper,
5260 struct netdev_adjacent, list);
5261 if (upper && likely(upper->master))
5262 return upper->dev;
5263 return NULL;
5265 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5267 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5268 struct net_device *adj_dev,
5269 struct list_head *dev_list)
5271 char linkname[IFNAMSIZ+7];
5272 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5273 "upper_%s" : "lower_%s", adj_dev->name);
5274 return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5275 linkname);
5277 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5278 char *name,
5279 struct list_head *dev_list)
5281 char linkname[IFNAMSIZ+7];
5282 sprintf(linkname, dev_list == &dev->adj_list.upper ?
5283 "upper_%s" : "lower_%s", name);
5284 sysfs_remove_link(&(dev->dev.kobj), linkname);
5287 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5288 struct net_device *adj_dev,
5289 struct list_head *dev_list)
5291 return (dev_list == &dev->adj_list.upper ||
5292 dev_list == &dev->adj_list.lower) &&
5293 net_eq(dev_net(dev), dev_net(adj_dev));
5296 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5297 struct net_device *adj_dev,
5298 u16 ref_nr,
5299 struct list_head *dev_list,
5300 void *private, bool master)
5302 struct netdev_adjacent *adj;
5303 int ret;
5305 adj = __netdev_find_adj(adj_dev, dev_list);
5307 if (adj) {
5308 adj->ref_nr += ref_nr;
5309 return 0;
5312 adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5313 if (!adj)
5314 return -ENOMEM;
5316 adj->dev = adj_dev;
5317 adj->master = master;
5318 adj->ref_nr = ref_nr;
5319 adj->private = private;
5320 dev_hold(adj_dev);
5322 pr_debug("dev_hold for %s, because of link added from %s to %s\n",
5323 adj_dev->name, dev->name, adj_dev->name);
5325 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5326 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5327 if (ret)
5328 goto free_adj;
5331 /* Ensure that the master link is always the first item in the list. */
5332 if (master) {
5333 ret = sysfs_create_link(&(dev->dev.kobj),
5334 &(adj_dev->dev.kobj), "master");
5335 if (ret)
5336 goto remove_symlinks;
5338 list_add_rcu(&adj->list, dev_list);
5339 } else {
5340 list_add_tail_rcu(&adj->list, dev_list);
5343 return 0;
5345 remove_symlinks:
5346 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5347 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5348 free_adj:
5349 kfree(adj);
5350 dev_put(adj_dev);
5352 return ret;
5355 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5356 struct net_device *adj_dev,
5357 u16 ref_nr,
5358 struct list_head *dev_list)
5360 struct netdev_adjacent *adj;
5362 adj = __netdev_find_adj(adj_dev, dev_list);
5364 if (!adj) {
5365 pr_err("tried to remove device %s from %s\n",
5366 dev->name, adj_dev->name);
5367 BUG();
5370 if (adj->ref_nr > ref_nr) {
5371 pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
5372 ref_nr, adj->ref_nr-ref_nr);
5373 adj->ref_nr -= ref_nr;
5374 return;
5377 if (adj->master)
5378 sysfs_remove_link(&(dev->dev.kobj), "master");
5380 if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5381 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5383 list_del_rcu(&adj->list);
5384 pr_debug("dev_put for %s, because link removed from %s to %s\n",
5385 adj_dev->name, dev->name, adj_dev->name);
5386 dev_put(adj_dev);
5387 kfree_rcu(adj, rcu);
5390 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5391 struct net_device *upper_dev,
5392 u16 ref_nr,
5393 struct list_head *up_list,
5394 struct list_head *down_list,
5395 void *private, bool master)
5397 int ret;
5399 ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
5400 private, master);
5401 if (ret)
5402 return ret;
5404 ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
5405 private, false);
5406 if (ret) {
5407 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5408 return ret;
5411 return 0;
5414 static int __netdev_adjacent_dev_link(struct net_device *dev,
5415 struct net_device *upper_dev,
5416 u16 ref_nr)
5418 return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
5419 &dev->all_adj_list.upper,
5420 &upper_dev->all_adj_list.lower,
5421 NULL, false);
5424 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5425 struct net_device *upper_dev,
5426 u16 ref_nr,
5427 struct list_head *up_list,
5428 struct list_head *down_list)
5430 __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5431 __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5434 static void __netdev_adjacent_dev_unlink(struct net_device *dev,
5435 struct net_device *upper_dev,
5436 u16 ref_nr)
5438 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
5439 &dev->all_adj_list.upper,
5440 &upper_dev->all_adj_list.lower);
5443 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5444 struct net_device *upper_dev,
5445 void *private, bool master)
5447 int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);
5449 if (ret)
5450 return ret;
5452 ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
5453 &dev->adj_list.upper,
5454 &upper_dev->adj_list.lower,
5455 private, master);
5456 if (ret) {
5457 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5458 return ret;
5461 return 0;
5464 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5465 struct net_device *upper_dev)
5467 __netdev_adjacent_dev_unlink(dev, upper_dev, 1);
5468 __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5469 &dev->adj_list.upper,
5470 &upper_dev->adj_list.lower);
5473 static int __netdev_upper_dev_link(struct net_device *dev,
5474 struct net_device *upper_dev, bool master,
5475 void *private)
5477 struct netdev_notifier_changeupper_info changeupper_info;
5478 struct netdev_adjacent *i, *j, *to_i, *to_j;
5479 int ret = 0;
5481 ASSERT_RTNL();
5483 if (dev == upper_dev)
5484 return -EBUSY;
5486 /* To prevent loops, check if dev is not upper device to upper_dev. */
5487 if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
5488 return -EBUSY;
5490 if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
5491 return -EEXIST;
5493 if (master && netdev_master_upper_dev_get(dev))
5494 return -EBUSY;
5496 changeupper_info.upper_dev = upper_dev;
5497 changeupper_info.master = master;
5498 changeupper_info.linking = true;
5500 ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5501 &changeupper_info.info);
5502 ret = notifier_to_errno(ret);
5503 if (ret)
5504 return ret;
5506 ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
5507 master);
5508 if (ret)
5509 return ret;
5511 /* Now that we have linked these devs, make all of upper_dev's
5512 * all_adj_list.upper visible to every dev's all_adj_list.lower and
5513 * vice versa, and don't forget the devices themselves. All of these
5514 * links are non-neighbours.
5516 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5517 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5518 pr_debug("Interlinking %s with %s, non-neighbour\n",
5519 i->dev->name, j->dev->name);
5520 ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
5521 if (ret)
5522 goto rollback_mesh;
5526 /* add dev to every upper_dev's upper device */
5527 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5528 pr_debug("linking %s's upper device %s with %s\n",
5529 upper_dev->name, i->dev->name, dev->name);
5530 ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
5531 if (ret)
5532 goto rollback_upper_mesh;
5535 /* add upper_dev to every dev's lower device */
5536 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5537 pr_debug("linking %s's lower device %s with %s\n", dev->name,
5538 i->dev->name, upper_dev->name);
5539 ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
5540 if (ret)
5541 goto rollback_lower_mesh;
5544 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5545 &changeupper_info.info);
5546 return 0;
5548 rollback_lower_mesh:
5549 to_i = i;
5550 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5551 if (i == to_i)
5552 break;
5553 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5556 i = NULL;
5558 rollback_upper_mesh:
5559 to_i = i;
5560 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
5561 if (i == to_i)
5562 break;
5563 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5566 i = j = NULL;
5568 rollback_mesh:
5569 to_i = i;
5570 to_j = j;
5571 list_for_each_entry(i, &dev->all_adj_list.lower, list) {
5572 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
5573 if (i == to_i && j == to_j)
5574 break;
5575 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5577 if (i == to_i)
5578 break;
5581 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5583 return ret;
5587 * netdev_upper_dev_link - Add a link to the upper device
5588 * @dev: device
5589 * @upper_dev: new upper device
5591 * Adds a link to device which is upper to this one. The caller must hold
5592 * the RTNL lock. On a failure a negative errno code is returned.
5593 * On success the reference counts are adjusted and the function
5594 * returns zero.
5596 int netdev_upper_dev_link(struct net_device *dev,
5597 struct net_device *upper_dev)
5599 return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
5601 EXPORT_SYMBOL(netdev_upper_dev_link);
5604 * netdev_master_upper_dev_link - Add a master link to the upper device
5605 * @dev: device
5606 * @upper_dev: new upper device
5608 * Adds a link to device which is upper to this one. In this case, only
5609 * one master upper device can be linked, although other non-master devices
5610 * might be linked as well. The caller must hold the RTNL lock.
5611 * On a failure a negative errno code is returned. On success the reference
5612 * counts are adjusted and the function returns zero.
5614 int netdev_master_upper_dev_link(struct net_device *dev,
5615 struct net_device *upper_dev)
5617 return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
5619 EXPORT_SYMBOL(netdev_master_upper_dev_link);
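/* Illustrative sketch, not part of this file: a bonding/team-style driver
 * links a slave under its master with RTNL held and undoes the link with
 * netdev_upper_dev_unlink() when the slave is released. example_enslave() is
 * an assumption made for this sketch.
 */
static int example_enslave(struct net_device *master, struct net_device *slave)
{
	int err;

	ASSERT_RTNL();

	err = netdev_master_upper_dev_link(slave, master);
	if (err)
		return err;	/* -EBUSY, -EEXIST or a notifier veto */

	/* ... device-specific slave setup would follow; on failure it would
	 * call netdev_upper_dev_unlink(slave, master) to roll back.
	 */
	return 0;
}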
5621 int netdev_master_upper_dev_link_private(struct net_device *dev,
5622 struct net_device *upper_dev,
5623 void *private)
5625 return __netdev_upper_dev_link(dev, upper_dev, true, private);
5627 EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
5630 * netdev_upper_dev_unlink - Removes a link to upper device
5631 * @dev: device
5632 * @upper_dev: upper device to be removed
5634 * Removes a link to device which is upper to this one. The caller must hold
5635 * the RTNL lock.
5637 void netdev_upper_dev_unlink(struct net_device *dev,
5638 struct net_device *upper_dev)
5640 struct netdev_notifier_changeupper_info changeupper_info;
5641 struct netdev_adjacent *i, *j;
5642 ASSERT_RTNL();
5644 changeupper_info.upper_dev = upper_dev;
5645 changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5646 changeupper_info.linking = false;
5648 call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5649 &changeupper_info.info);
5651 __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5653 /* Here is the tricky part. We must remove all dev's lower
5654 * devices from all upper_dev's upper devices and vice
5655 * versa, to maintain the graph relationship.
5657 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5658 list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
5659 __netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
5661 /* Also remove the devices themselves from the lower/upper device
5662 * lists
5664 list_for_each_entry(i, &dev->all_adj_list.lower, list)
5665 __netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
5667 list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
5668 __netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
5670 call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5671 &changeupper_info.info);
5673 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5676 * netdev_bonding_info_change - Dispatch event about slave change
5677 * @dev: device
5678 * @bonding_info: info to dispatch
5680 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5681 * The caller must hold the RTNL lock.
5683 void netdev_bonding_info_change(struct net_device *dev,
5684 struct netdev_bonding_info *bonding_info)
5686 struct netdev_notifier_bonding_info info;
5688 memcpy(&info.bonding_info, bonding_info,
5689 sizeof(struct netdev_bonding_info));
5690 call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5691 &info.info);
5693 EXPORT_SYMBOL(netdev_bonding_info_change);
5695 static void netdev_adjacent_add_links(struct net_device *dev)
5697 struct netdev_adjacent *iter;
5699 struct net *net = dev_net(dev);
5701 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5702 if (!net_eq(net,dev_net(iter->dev)))
5703 continue;
5704 netdev_adjacent_sysfs_add(iter->dev, dev,
5705 &iter->dev->adj_list.lower);
5706 netdev_adjacent_sysfs_add(dev, iter->dev,
5707 &dev->adj_list.upper);
5710 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5711 if (!net_eq(net,dev_net(iter->dev)))
5712 continue;
5713 netdev_adjacent_sysfs_add(iter->dev, dev,
5714 &iter->dev->adj_list.upper);
5715 netdev_adjacent_sysfs_add(dev, iter->dev,
5716 &dev->adj_list.lower);
5720 static void netdev_adjacent_del_links(struct net_device *dev)
5722 struct netdev_adjacent *iter;
5724 struct net *net = dev_net(dev);
5726 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5727 if (!net_eq(net,dev_net(iter->dev)))
5728 continue;
5729 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5730 &iter->dev->adj_list.lower);
5731 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5732 &dev->adj_list.upper);
5735 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5736 if (!net_eq(net,dev_net(iter->dev)))
5737 continue;
5738 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5739 &iter->dev->adj_list.upper);
5740 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5741 &dev->adj_list.lower);
5745 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5747 struct netdev_adjacent *iter;
5749 struct net *net = dev_net(dev);
5751 list_for_each_entry(iter, &dev->adj_list.upper, list) {
5752 if (!net_eq(net,dev_net(iter->dev)))
5753 continue;
5754 netdev_adjacent_sysfs_del(iter->dev, oldname,
5755 &iter->dev->adj_list.lower);
5756 netdev_adjacent_sysfs_add(iter->dev, dev,
5757 &iter->dev->adj_list.lower);
5760 list_for_each_entry(iter, &dev->adj_list.lower, list) {
5761 if (!net_eq(net,dev_net(iter->dev)))
5762 continue;
5763 netdev_adjacent_sysfs_del(iter->dev, oldname,
5764 &iter->dev->adj_list.upper);
5765 netdev_adjacent_sysfs_add(iter->dev, dev,
5766 &iter->dev->adj_list.upper);
5770 void *netdev_lower_dev_get_private(struct net_device *dev,
5771 struct net_device *lower_dev)
5773 struct netdev_adjacent *lower;
5775 if (!lower_dev)
5776 return NULL;
5777 lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5778 if (!lower)
5779 return NULL;
5781 return lower->private;
5783 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5786 int dev_get_nest_level(struct net_device *dev,
5787 bool (*type_check)(struct net_device *dev))
5789 struct net_device *lower = NULL;
5790 struct list_head *iter;
5791 int max_nest = -1;
5792 int nest;
5794 ASSERT_RTNL();
5796 netdev_for_each_lower_dev(dev, lower, iter) {
5797 nest = dev_get_nest_level(lower, type_check);
5798 if (max_nest < nest)
5799 max_nest = nest;
5802 if (type_check(dev))
5803 max_nest++;
5805 return max_nest;
5807 EXPORT_SYMBOL(dev_get_nest_level);
5809 static void dev_change_rx_flags(struct net_device *dev, int flags)
5811 const struct net_device_ops *ops = dev->netdev_ops;
5813 if (ops->ndo_change_rx_flags)
5814 ops->ndo_change_rx_flags(dev, flags);
5817 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
5819 unsigned int old_flags = dev->flags;
5820 kuid_t uid;
5821 kgid_t gid;
5823 ASSERT_RTNL();
5825 dev->flags |= IFF_PROMISC;
5826 dev->promiscuity += inc;
5827 if (dev->promiscuity == 0) {
5829 * Avoid overflow.
5830 * If inc causes overflow, untouch promisc and return error.
5832 if (inc < 0)
5833 dev->flags &= ~IFF_PROMISC;
5834 else {
5835 dev->promiscuity -= inc;
5836 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
5837 dev->name);
5838 return -EOVERFLOW;
5841 if (dev->flags != old_flags) {
5842 pr_info("device %s %s promiscuous mode\n",
5843 dev->name,
5844 dev->flags & IFF_PROMISC ? "entered" : "left");
5845 if (audit_enabled) {
5846 current_uid_gid(&uid, &gid);
5847 audit_log(current->audit_context, GFP_ATOMIC,
5848 AUDIT_ANOM_PROMISCUOUS,
5849 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
5850 dev->name, (dev->flags & IFF_PROMISC),
5851 (old_flags & IFF_PROMISC),
5852 from_kuid(&init_user_ns, audit_get_loginuid(current)),
5853 from_kuid(&init_user_ns, uid),
5854 from_kgid(&init_user_ns, gid),
5855 audit_get_sessionid(current));
5858 dev_change_rx_flags(dev, IFF_PROMISC);
5860 if (notify)
5861 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
5862 return 0;
5866 * dev_set_promiscuity - update promiscuity count on a device
5867 * @dev: device
5868 * @inc: modifier
5870 * Add or remove promiscuity from a device. While the count in the device
5871 * remains above zero the interface remains promiscuous. Once it hits zero
5872 * the device reverts to normal filtering operation. A negative @inc
5873 * value is used to drop promiscuity on the device.
5874 * Return 0 if successful or a negative errno code on error.
5876 int dev_set_promiscuity(struct net_device *dev, int inc)
5878 unsigned int old_flags = dev->flags;
5879 int err;
5881 err = __dev_set_promiscuity(dev, inc, true);
5882 if (err < 0)
5883 return err;
5884 if (dev->flags != old_flags)
5885 dev_set_rx_mode(dev);
5886 return err;
5888 EXPORT_SYMBOL(dev_set_promiscuity);
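/* Illustrative sketch, not part of this file: packet-capture style users bump
 * the promiscuity count while listening and drop it again when done, always
 * under RTNL since __dev_set_promiscuity() asserts it. The helper names are
 * assumptions made for this sketch.
 */
static int example_start_sniffing(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);
	rtnl_unlock();

	return err;
}

static void example_stop_sniffing(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);	/* counted, so this pairs with the +1 */
	rtnl_unlock();
}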
5890 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
5892 unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
5894 ASSERT_RTNL();
5896 dev->flags |= IFF_ALLMULTI;
5897 dev->allmulti += inc;
5898 if (dev->allmulti == 0) {
5900 * Avoid overflow.
5901 * If inc would cause an overflow, leave allmulti untouched and return an error.
5903 if (inc < 0)
5904 dev->flags &= ~IFF_ALLMULTI;
5905 else {
5906 dev->allmulti -= inc;
5907 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
5908 dev->name);
5909 return -EOVERFLOW;
5912 if (dev->flags ^ old_flags) {
5913 dev_change_rx_flags(dev, IFF_ALLMULTI);
5914 dev_set_rx_mode(dev);
5915 if (notify)
5916 __dev_notify_flags(dev, old_flags,
5917 dev->gflags ^ old_gflags);
5919 return 0;
5923 * dev_set_allmulti - update allmulti count on a device
5924 * @dev: device
5925 * @inc: modifier
5927 * Add or remove reception of all multicast frames on a device. While the
5928 * count in the device remains above zero the interface keeps listening
5929 * to all multicast frames. Once it hits zero the device reverts to normal
5930 * filtering operation. A negative @inc value is used to drop the counter
5931 * when releasing a resource that needs all multicasts.
5932 * Return 0 if successful or a negative errno code on error.
5935 int dev_set_allmulti(struct net_device *dev, int inc)
5937 return __dev_set_allmulti(dev, inc, true);
5939 EXPORT_SYMBOL(dev_set_allmulti);
5942 * Upload unicast and multicast address lists to device and
5943 * configure RX filtering. When the device doesn't support unicast
5944 * filtering it is put in promiscuous mode while unicast addresses
5945 * are present.
5947 void __dev_set_rx_mode(struct net_device *dev)
5949 const struct net_device_ops *ops = dev->netdev_ops;
5951 /* dev_open will call this function so the list will stay sane. */
5952 if (!(dev->flags&IFF_UP))
5953 return;
5955 if (!netif_device_present(dev))
5956 return;
5958 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
5959 /* Unicast address changes may only happen under the rtnl,
5960 * therefore calling __dev_set_promiscuity() here is safe.
5962 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
5963 __dev_set_promiscuity(dev, 1, false);
5964 dev->uc_promisc = true;
5965 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
5966 __dev_set_promiscuity(dev, -1, false);
5967 dev->uc_promisc = false;
5971 if (ops->ndo_set_rx_mode)
5972 ops->ndo_set_rx_mode(dev);
5975 void dev_set_rx_mode(struct net_device *dev)
5977 netif_addr_lock_bh(dev);
5978 __dev_set_rx_mode(dev);
5979 netif_addr_unlock_bh(dev);
5983 * dev_get_flags - get flags reported to userspace
5984 * @dev: device
5986 * Get the combination of flag bits exported through APIs to userspace.
5988 unsigned int dev_get_flags(const struct net_device *dev)
5990 unsigned int flags;
5992 flags = (dev->flags & ~(IFF_PROMISC |
5993 IFF_ALLMULTI |
5994 IFF_RUNNING |
5995 IFF_LOWER_UP |
5996 IFF_DORMANT)) |
5997 (dev->gflags & (IFF_PROMISC |
5998 IFF_ALLMULTI));
6000 if (netif_running(dev)) {
6001 if (netif_oper_up(dev))
6002 flags |= IFF_RUNNING;
6003 if (netif_carrier_ok(dev))
6004 flags |= IFF_LOWER_UP;
6005 if (netif_dormant(dev))
6006 flags |= IFF_DORMANT;
6009 return flags;
6011 EXPORT_SYMBOL(dev_get_flags);
6013 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6015 unsigned int old_flags = dev->flags;
6016 int ret;
6018 ASSERT_RTNL();
6021 * Set the flags on our device.
6024 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6025 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6026 IFF_AUTOMEDIA)) |
6027 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6028 IFF_ALLMULTI));
6031 * Load in the correct multicast list now that the flags have changed.
6034 if ((old_flags ^ flags) & IFF_MULTICAST)
6035 dev_change_rx_flags(dev, IFF_MULTICAST);
6037 dev_set_rx_mode(dev);
6040 * Have we downed the interface? We handle IFF_UP ourselves
6041 * according to user attempts to set it, rather than blindly
6042 * setting it.
6045 ret = 0;
6046 if ((old_flags ^ flags) & IFF_UP)
6047 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6049 if ((flags ^ dev->gflags) & IFF_PROMISC) {
6050 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6051 unsigned int old_flags = dev->flags;
6053 dev->gflags ^= IFF_PROMISC;
6055 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6056 if (dev->flags != old_flags)
6057 dev_set_rx_mode(dev);
6060 /* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6061 is important. Some (broken) drivers set IFF_PROMISC when
6062 IFF_ALLMULTI is requested, without asking us and without reporting it.
6064 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6065 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6067 dev->gflags ^= IFF_ALLMULTI;
6068 __dev_set_allmulti(dev, inc, false);
6071 return ret;
6074 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6075 unsigned int gchanges)
6077 unsigned int changes = dev->flags ^ old_flags;
6079 if (gchanges)
6080 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6082 if (changes & IFF_UP) {
6083 if (dev->flags & IFF_UP)
6084 call_netdevice_notifiers(NETDEV_UP, dev);
6085 else
6086 call_netdevice_notifiers(NETDEV_DOWN, dev);
6089 if (dev->flags & IFF_UP &&
6090 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6091 struct netdev_notifier_change_info change_info;
6093 change_info.flags_changed = changes;
6094 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6095 &change_info.info);
6100 * dev_change_flags - change device settings
6101 * @dev: device
6102 * @flags: device state flags
6104 * Change settings on a device based on the given state flags. The flags are
6105 * in the userspace-exported format.
6107 int dev_change_flags(struct net_device *dev, unsigned int flags)
6109 int ret;
6110 unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6112 ret = __dev_change_flags(dev, flags);
6113 if (ret < 0)
6114 return ret;
6116 changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6117 __dev_notify_flags(dev, old_flags, changes);
6118 return ret;
6120 EXPORT_SYMBOL(dev_change_flags);
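/* Illustrative sketch, not part of this file: bringing an interface
 * administratively up the same way SIOCSIFFLAGS does, by feeding the
 * userspace-format flags back through dev_change_flags() under RTNL.
 * example_set_up() is an assumption made for this sketch.
 */
static int example_set_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
	rtnl_unlock();

	return err;
}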
6122 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6124 const struct net_device_ops *ops = dev->netdev_ops;
6126 if (ops->ndo_change_mtu)
6127 return ops->ndo_change_mtu(dev, new_mtu);
6129 dev->mtu = new_mtu;
6130 return 0;
6134 * dev_set_mtu - Change maximum transfer unit
6135 * @dev: device
6136 * @new_mtu: new transfer unit
6138 * Change the maximum transfer size of the network device.
6140 int dev_set_mtu(struct net_device *dev, int new_mtu)
6142 int err, orig_mtu;
6144 if (new_mtu == dev->mtu)
6145 return 0;
6147 /* MTU must be positive. */
6148 if (new_mtu < 0)
6149 return -EINVAL;
6151 if (!netif_device_present(dev))
6152 return -ENODEV;
6154 err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6155 err = notifier_to_errno(err);
6156 if (err)
6157 return err;
6159 orig_mtu = dev->mtu;
6160 err = __dev_set_mtu(dev, new_mtu);
6162 if (!err) {
6163 err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
6164 orig_mtu);
6165 err = notifier_to_errno(err);
6166 if (err) {
6167 /* setting mtu back and notifying everyone again,
6168 * so that they have a chance to revert changes.
6170 __dev_set_mtu(dev, orig_mtu);
6171 call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
6172 new_mtu);
6175 return err;
6177 EXPORT_SYMBOL(dev_set_mtu);
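/* Illustrative sketch, not part of this file: changing the MTU from
 * management code; dev_set_mtu() performs the NETDEV_PRECHANGEMTU and
 * NETDEV_CHANGEMTU notifier round trips shown above. example_set_jumbo() and
 * the 9000-byte value are assumptions made for this sketch.
 */
static int example_set_jumbo(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);	/* fails if the driver rejects the size */
	rtnl_unlock();

	return err;
}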
6180 * dev_set_group - Change group this device belongs to
6181 * @dev: device
6182 * @new_group: group this device should belong to
6184 void dev_set_group(struct net_device *dev, int new_group)
6186 dev->group = new_group;
6188 EXPORT_SYMBOL(dev_set_group);
6191 * dev_set_mac_address - Change Media Access Control Address
6192 * @dev: device
6193 * @sa: new address
6195 * Change the hardware (MAC) address of the device
6197 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6199 const struct net_device_ops *ops = dev->netdev_ops;
6200 int err;
6202 if (!ops->ndo_set_mac_address)
6203 return -EOPNOTSUPP;
6204 if (sa->sa_family != dev->type)
6205 return -EINVAL;
6206 if (!netif_device_present(dev))
6207 return -ENODEV;
6208 err = ops->ndo_set_mac_address(dev, sa);
6209 if (err)
6210 return err;
6211 dev->addr_assign_type = NET_ADDR_SET;
6212 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6213 add_device_randomness(dev->dev_addr, dev->addr_len);
6214 return 0;
6216 EXPORT_SYMBOL(dev_set_mac_address);
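/* Illustrative sketch, not part of this file: a new hardware address is
 * passed in a struct sockaddr whose sa_family must match dev->type
 * (ARPHRD_ETHER for Ethernet). example_set_mac() is an assumption made for
 * this sketch.
 */
static int example_set_mac(struct net_device *dev, const u8 *addr)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();

	return err;
}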
6219 * dev_change_carrier - Change device carrier
6220 * @dev: device
6221 * @new_carrier: new value
6223 * Change device carrier
6225 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6227 const struct net_device_ops *ops = dev->netdev_ops;
6229 if (!ops->ndo_change_carrier)
6230 return -EOPNOTSUPP;
6231 if (!netif_device_present(dev))
6232 return -ENODEV;
6233 return ops->ndo_change_carrier(dev, new_carrier);
6235 EXPORT_SYMBOL(dev_change_carrier);
6238 * dev_get_phys_port_id - Get device physical port ID
6239 * @dev: device
6240 * @ppid: port ID
6242 * Get device physical port ID
6244 int dev_get_phys_port_id(struct net_device *dev,
6245 struct netdev_phys_item_id *ppid)
6247 const struct net_device_ops *ops = dev->netdev_ops;
6249 if (!ops->ndo_get_phys_port_id)
6250 return -EOPNOTSUPP;
6251 return ops->ndo_get_phys_port_id(dev, ppid);
6253 EXPORT_SYMBOL(dev_get_phys_port_id);
6256 * dev_get_phys_port_name - Get device physical port name
6257 * @dev: device
6258 * @name: port name
6260 * Get device physical port name
6262 int dev_get_phys_port_name(struct net_device *dev,
6263 char *name, size_t len)
6265 const struct net_device_ops *ops = dev->netdev_ops;
6267 if (!ops->ndo_get_phys_port_name)
6268 return -EOPNOTSUPP;
6269 return ops->ndo_get_phys_port_name(dev, name, len);
6271 EXPORT_SYMBOL(dev_get_phys_port_name);
6274 * dev_change_proto_down - update protocol port state information
6275 * @dev: device
6276 * @proto_down: new value
6278 * This info can be used by switch drivers to set the phys state of the
6279 * port.
6281 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6283 const struct net_device_ops *ops = dev->netdev_ops;
6285 if (!ops->ndo_change_proto_down)
6286 return -EOPNOTSUPP;
6287 if (!netif_device_present(dev))
6288 return -ENODEV;
6289 return ops->ndo_change_proto_down(dev, proto_down);
6291 EXPORT_SYMBOL(dev_change_proto_down);
6294 * dev_new_index - allocate an ifindex
6295 * @net: the applicable net namespace
6297 * Returns a suitable unique value for a new device interface
6298 * number. The caller must hold the rtnl semaphore or the
6299 * dev_base_lock to be sure it remains unique.
6301 static int dev_new_index(struct net *net)
6303 int ifindex = net->ifindex;
6304 for (;;) {
6305 if (++ifindex <= 0)
6306 ifindex = 1;
6307 if (!__dev_get_by_index(net, ifindex))
6308 return net->ifindex = ifindex;
6312 /* Delayed registration/unregistration */
6313 static LIST_HEAD(net_todo_list);
6314 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6316 static void net_set_todo(struct net_device *dev)
6318 list_add_tail(&dev->todo_list, &net_todo_list);
6319 dev_net(dev)->dev_unreg_count++;
6322 static void rollback_registered_many(struct list_head *head)
6324 struct net_device *dev, *tmp;
6325 LIST_HEAD(close_head);
6327 BUG_ON(dev_boot_phase);
6328 ASSERT_RTNL();
6330 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6331 /* Some devices call this without ever registering,
6332 * as part of initialization unwind. Remove those
6333 * devices and proceed with the remaining ones.
6335 if (dev->reg_state == NETREG_UNINITIALIZED) {
6336 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6337 dev->name, dev);
6339 WARN_ON(1);
6340 list_del(&dev->unreg_list);
6341 continue;
6343 dev->dismantle = true;
6344 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6347 /* If device is running, close it first. */
6348 list_for_each_entry(dev, head, unreg_list)
6349 list_add_tail(&dev->close_list, &close_head);
6350 dev_close_many(&close_head, true);
6352 list_for_each_entry(dev, head, unreg_list) {
6353 /* And unlink it from device chain. */
6354 unlist_netdevice(dev);
6356 dev->reg_state = NETREG_UNREGISTERING;
6357 on_each_cpu(flush_backlog, dev, 1);
6360 synchronize_net();
6362 list_for_each_entry(dev, head, unreg_list) {
6363 struct sk_buff *skb = NULL;
6365 /* Shutdown queueing discipline. */
6366 dev_shutdown(dev);
6369 /* Notify protocols that we are about to destroy
6370 this device. They should clean up all of their state.
6372 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6374 if (!dev->rtnl_link_ops ||
6375 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6376 skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6377 GFP_KERNEL);
6380 * Flush the unicast and multicast chains
6382 dev_uc_flush(dev);
6383 dev_mc_flush(dev);
6385 if (dev->netdev_ops->ndo_uninit)
6386 dev->netdev_ops->ndo_uninit(dev);
6388 if (skb)
6389 rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6391 /* Notifier chain MUST detach us from all upper devices. */
6392 WARN_ON(netdev_has_any_upper_dev(dev));
6394 /* Remove entries from kobject tree */
6395 netdev_unregister_kobject(dev);
6396 #ifdef CONFIG_XPS
6397 /* Remove XPS queueing entries */
6398 netif_reset_xps_queues_gt(dev, 0);
6399 #endif
6402 synchronize_net();
6404 list_for_each_entry(dev, head, unreg_list)
6405 dev_put(dev);
6408 static void rollback_registered(struct net_device *dev)
6410 LIST_HEAD(single);
6412 list_add(&dev->unreg_list, &single);
6413 rollback_registered_many(&single);
6414 list_del(&single);
6417 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6418 struct net_device *upper, netdev_features_t features)
6420 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6421 netdev_features_t feature;
6422 int feature_bit;
6424 for_each_netdev_feature(&upper_disables, feature_bit) {
6425 feature = __NETIF_F_BIT(feature_bit);
6426 if (!(upper->wanted_features & feature)
6427 && (features & feature)) {
6428 netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6429 &feature, upper->name);
6430 features &= ~feature;
6434 return features;
6437 static void netdev_sync_lower_features(struct net_device *upper,
6438 struct net_device *lower, netdev_features_t features)
6440 netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6441 netdev_features_t feature;
6442 int feature_bit;
6444 for_each_netdev_feature(&upper_disables, feature_bit) {
6445 feature = __NETIF_F_BIT(feature_bit);
6446 if (!(features & feature) && (lower->features & feature)) {
6447 netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6448 &feature, lower->name);
6449 lower->wanted_features &= ~feature;
6450 netdev_update_features(lower);
6452 if (unlikely(lower->features & feature))
6453 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6454 &feature, lower->name);
6459 static netdev_features_t netdev_fix_features(struct net_device *dev,
6460 netdev_features_t features)
6462 /* Fix illegal checksum combinations */
6463 if ((features & NETIF_F_HW_CSUM) &&
6464 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6465 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6466 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6469 /* TSO requires that SG is present as well. */
6470 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6471 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6472 features &= ~NETIF_F_ALL_TSO;
6475 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6476 !(features & NETIF_F_IP_CSUM)) {
6477 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6478 features &= ~NETIF_F_TSO;
6479 features &= ~NETIF_F_TSO_ECN;
6482 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6483 !(features & NETIF_F_IPV6_CSUM)) {
6484 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6485 features &= ~NETIF_F_TSO6;
6488 /* TSO ECN requires that TSO is present as well. */
6489 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6490 features &= ~NETIF_F_TSO_ECN;
6492 /* Software GSO depends on SG. */
6493 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6494 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6495 features &= ~NETIF_F_GSO;
6498 /* UFO needs SG and checksumming */
6499 if (features & NETIF_F_UFO) {
6500 /* maybe split UFO into V4 and V6? */
6501 if (!((features & NETIF_F_GEN_CSUM) ||
6502 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
6503 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6504 netdev_dbg(dev,
6505 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6506 features &= ~NETIF_F_UFO;
6509 if (!(features & NETIF_F_SG)) {
6510 netdev_dbg(dev,
6511 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6512 features &= ~NETIF_F_UFO;
6516 #ifdef CONFIG_NET_RX_BUSY_POLL
6517 if (dev->netdev_ops->ndo_busy_poll)
6518 features |= NETIF_F_BUSY_POLL;
6519 else
6520 #endif
6521 features &= ~NETIF_F_BUSY_POLL;
6523 return features;
6526 int __netdev_update_features(struct net_device *dev)
6528 struct net_device *upper, *lower;
6529 netdev_features_t features;
6530 struct list_head *iter;
6531 int err = -1;
6533 ASSERT_RTNL();
6535 features = netdev_get_wanted_features(dev);
6537 if (dev->netdev_ops->ndo_fix_features)
6538 features = dev->netdev_ops->ndo_fix_features(dev, features);
6540 /* driver might be less strict about feature dependencies */
6541 features = netdev_fix_features(dev, features);
6543 /* some features can't be enabled if they're off on an upper device */
6544 netdev_for_each_upper_dev_rcu(dev, upper, iter)
6545 features = netdev_sync_upper_features(dev, upper, features);
6547 if (dev->features == features)
6548 goto sync_lower;
6550 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6551 &dev->features, &features);
6553 if (dev->netdev_ops->ndo_set_features)
6554 err = dev->netdev_ops->ndo_set_features(dev, features);
6555 else
6556 err = 0;
6558 if (unlikely(err < 0)) {
6559 netdev_err(dev,
6560 "set_features() failed (%d); wanted %pNF, left %pNF\n",
6561 err, &features, &dev->features);
6562 /* return non-0 since some features might have changed and
6563 * it's better to fire a spurious notification than miss it
6565 return -1;
6568 sync_lower:
6569 /* some features must be disabled on lower devices when disabled
6570 * on an upper device (think: bonding master or bridge)
6572 netdev_for_each_lower_dev(dev, lower, iter)
6573 netdev_sync_lower_features(dev, lower, features);
6575 if (!err)
6576 dev->features = features;
6578 return err < 0 ? 0 : 1;
6582 * netdev_update_features - recalculate device features
6583 * @dev: the device to check
6585 * Recalculate dev->features set and send notifications if it
6586 * has changed. Should be called after driver or hardware dependent
6587 * conditions might have changed that influence the features.
6589 void netdev_update_features(struct net_device *dev)
6591 if (__netdev_update_features(dev))
6592 netdev_features_change(dev);
6594 EXPORT_SYMBOL(netdev_update_features);
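/* Illustrative sketch: a driver that discovers at runtime that an offload is
 * unusable clears it and asks the core to re-run feature negotiation;
 * example_disable_tso() is hypothetical, the RTNL locking matches the
 * ASSERT_RTNL() in __netdev_update_features().
 */
static void example_disable_tso(struct net_device *dev)
{
	rtnl_lock();
	dev->hw_features &= ~NETIF_F_ALL_TSO;
	dev->wanted_features &= ~NETIF_F_ALL_TSO;
	netdev_update_features(dev);	/* recompute dev->features, notify if changed */
	rtnl_unlock();
}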
6597 * netdev_change_features - recalculate device features
6598 * @dev: the device to check
6600 * Recalculate dev->features set and send notifications even
6601 * if they have not changed. Should be called instead of
6602 * netdev_update_features() if also dev->vlan_features might
6603 * have changed to allow the changes to be propagated to stacked
6604 * VLAN devices.
6606 void netdev_change_features(struct net_device *dev)
6608 __netdev_update_features(dev);
6609 netdev_features_change(dev);
6611 EXPORT_SYMBOL(netdev_change_features);
6614 * netif_stacked_transfer_operstate - transfer operstate
6615 * @rootdev: the root or lower level device to transfer state from
6616 * @dev: the device to transfer operstate to
6618 * Transfer operational state from root to device. This is normally
6619 * called when a stacking relationship exists between the root
6620 * device and the device (a leaf device).
6622 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6623 struct net_device *dev)
6625 if (rootdev->operstate == IF_OPER_DORMANT)
6626 netif_dormant_on(dev);
6627 else
6628 netif_dormant_off(dev);
6630 if (netif_carrier_ok(rootdev)) {
6631 if (!netif_carrier_ok(dev))
6632 netif_carrier_on(dev);
6633 } else {
6634 if (netif_carrier_ok(dev))
6635 netif_carrier_off(dev);
6638 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6640 #ifdef CONFIG_SYSFS
6641 static int netif_alloc_rx_queues(struct net_device *dev)
6643 unsigned int i, count = dev->num_rx_queues;
6644 struct netdev_rx_queue *rx;
6645 size_t sz = count * sizeof(*rx);
6647 BUG_ON(count < 1);
6649 rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6650 if (!rx) {
6651 rx = vzalloc(sz);
6652 if (!rx)
6653 return -ENOMEM;
6655 dev->_rx = rx;
6657 for (i = 0; i < count; i++)
6658 rx[i].dev = dev;
6659 return 0;
6661 #endif
6663 static void netdev_init_one_queue(struct net_device *dev,
6664 struct netdev_queue *queue, void *_unused)
6666 /* Initialize queue lock */
6667 spin_lock_init(&queue->_xmit_lock);
6668 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6669 queue->xmit_lock_owner = -1;
6670 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6671 queue->dev = dev;
6672 #ifdef CONFIG_BQL
6673 dql_init(&queue->dql, HZ);
6674 #endif
6677 static void netif_free_tx_queues(struct net_device *dev)
6679 kvfree(dev->_tx);
6682 static int netif_alloc_netdev_queues(struct net_device *dev)
6684 unsigned int count = dev->num_tx_queues;
6685 struct netdev_queue *tx;
6686 size_t sz = count * sizeof(*tx);
6688 if (count < 1 || count > 0xffff)
6689 return -EINVAL;
6691 tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6692 if (!tx) {
6693 tx = vzalloc(sz);
6694 if (!tx)
6695 return -ENOMEM;
6697 dev->_tx = tx;
6699 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6700 spin_lock_init(&dev->tx_global_lock);
6702 return 0;
6705 void netif_tx_stop_all_queues(struct net_device *dev)
6707 unsigned int i;
6709 for (i = 0; i < dev->num_tx_queues; i++) {
6710 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6711 netif_tx_stop_queue(txq);
6714 EXPORT_SYMBOL(netif_tx_stop_all_queues);
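/* Illustrative sketch: drivers typically freeze all transmit queues around a
 * hardware reset; example_reset_device() is hypothetical and the actual
 * device-specific reset work is elided.
 */
static void example_reset_device(struct net_device *dev)
{
	netif_tx_stop_all_queues(dev);

	/* ... reinitialise the hardware here ... */

	netif_tx_wake_all_queues(dev);
}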
6717 * register_netdevice - register a network device
6718 * @dev: device to register
6720 * Take a completed network device structure and add it to the kernel
6721 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6722 * chain. 0 is returned on success. A negative errno code is returned
6723 * on a failure to set up the device, or if the name is a duplicate.
6725 * Callers must hold the rtnl semaphore. You may want
6726 * register_netdev() instead of this.
6728 * BUGS:
6729 * The locking appears insufficient to guarantee two parallel registers
6730 * will not get the same name.
6733 int register_netdevice(struct net_device *dev)
6735 int ret;
6736 struct net *net = dev_net(dev);
6738 BUG_ON(dev_boot_phase);
6739 ASSERT_RTNL();
6741 might_sleep();
6743 /* When net_devices are persistent, this will be fatal. */
6744 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
6745 BUG_ON(!net);
6747 spin_lock_init(&dev->addr_list_lock);
6748 netdev_set_addr_lockdep_class(dev);
6750 ret = dev_get_valid_name(net, dev, dev->name);
6751 if (ret < 0)
6752 goto out;
6754 /* Init, if this function is available */
6755 if (dev->netdev_ops->ndo_init) {
6756 ret = dev->netdev_ops->ndo_init(dev);
6757 if (ret) {
6758 if (ret > 0)
6759 ret = -EIO;
6760 goto out;
6764 if (((dev->hw_features | dev->features) &
6765 NETIF_F_HW_VLAN_CTAG_FILTER) &&
6766 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
6767 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
6768 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
6769 ret = -EINVAL;
6770 goto err_uninit;
6773 ret = -EBUSY;
6774 if (!dev->ifindex)
6775 dev->ifindex = dev_new_index(net);
6776 else if (__dev_get_by_index(net, dev->ifindex))
6777 goto err_uninit;
6779 /* Transfer changeable features to wanted_features and enable
6780 * software offloads (GSO and GRO).
6782 dev->hw_features |= NETIF_F_SOFT_FEATURES;
6783 dev->features |= NETIF_F_SOFT_FEATURES;
6784 dev->wanted_features = dev->features & dev->hw_features;
6786 if (!(dev->flags & IFF_LOOPBACK)) {
6787 dev->hw_features |= NETIF_F_NOCACHE_COPY;
6790 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
6792 dev->vlan_features |= NETIF_F_HIGHDMA;
6794 /* Make NETIF_F_SG inheritable to tunnel devices.
6796 dev->hw_enc_features |= NETIF_F_SG;
6798 /* Make NETIF_F_SG inheritable to MPLS.
6800 dev->mpls_features |= NETIF_F_SG;
6802 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
6803 ret = notifier_to_errno(ret);
6804 if (ret)
6805 goto err_uninit;
6807 ret = netdev_register_kobject(dev);
6808 if (ret)
6809 goto err_uninit;
6810 dev->reg_state = NETREG_REGISTERED;
6812 __netdev_update_features(dev);
6815 * Default initial state at registration is that the
6816 * device is present.
6819 set_bit(__LINK_STATE_PRESENT, &dev->state);
6821 linkwatch_init_dev(dev);
6823 dev_init_scheduler(dev);
6824 dev_hold(dev);
6825 list_netdevice(dev);
6826 add_device_randomness(dev->dev_addr, dev->addr_len);
6828 /* If the device has a permanent device address, the driver should
6829 * set dev_addr, and addr_assign_type should be set to
6830 * NET_ADDR_PERM (the default value).
6832 if (dev->addr_assign_type == NET_ADDR_PERM)
6833 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
6835 /* Notify protocols, that a new device appeared. */
6836 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
6837 ret = notifier_to_errno(ret);
6838 if (ret) {
6839 rollback_registered(dev);
6840 dev->reg_state = NETREG_UNREGISTERED;
6843 * Prevent userspace races by waiting until the network
6844 * device is fully setup before sending notifications.
6846 if (!dev->rtnl_link_ops ||
6847 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6848 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
6850 out:
6851 return ret;
6853 err_uninit:
6854 if (dev->netdev_ops->ndo_uninit)
6855 dev->netdev_ops->ndo_uninit(dev);
6856 goto out;
6858 EXPORT_SYMBOL(register_netdevice);
6861 * init_dummy_netdev - init a dummy network device for NAPI
6862 * @dev: device to init
6864 * This takes a network device structure and initializes the minimum
6865 * amount of fields so it can be used to schedule NAPI polls without
6866 * registering a full blown interface. This is to be used by drivers
6867 * that need to tie several hardware interfaces to a single NAPI
6868 * poll scheduler due to HW limitations.
6870 int init_dummy_netdev(struct net_device *dev)
6872 /* Clear everything. Note we don't initialize spinlocks
6873 * as they aren't supposed to be taken by any of the
6874 * NAPI code and this dummy netdev is supposed to be
6875 * only ever used for NAPI polls
6877 memset(dev, 0, sizeof(struct net_device));
6879 /* make sure we BUG if trying to hit standard
6880 * register/unregister code path
6882 dev->reg_state = NETREG_DUMMY;
6884 /* NAPI wants this */
6885 INIT_LIST_HEAD(&dev->napi_list);
6887 /* a dummy interface is started by default */
6888 set_bit(__LINK_STATE_PRESENT, &dev->state);
6889 set_bit(__LINK_STATE_START, &dev->state);
6891 /* Note: We don't allocate pcpu_refcnt for dummy devices,
6892 * because users of this 'device' don't need to change
6893 * its refcount.
6896 return 0;
6898 EXPORT_SYMBOL_GPL(init_dummy_netdev);
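/* Illustrative sketch: a driver with several hardware channels but no real
 * per-channel net_device can hang its NAPI contexts off a dummy device;
 * struct example_adapter, example_poll() and the weight of 64 are
 * hypothetical choices, not taken from any in-tree driver.
 */
struct example_adapter {
	struct net_device napi_dev;	/* dummy, never registered */
	struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
	/* ... consume up to "budget" received packets here ... */
	napi_complete(napi);
	return 0;
}

static void example_adapter_init(struct example_adapter *ad)
{
	init_dummy_netdev(&ad->napi_dev);
	netif_napi_add(&ad->napi_dev, &ad->napi, example_poll, 64);
	napi_enable(&ad->napi);
}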
6902 * register_netdev - register a network device
6903 * @dev: device to register
6905 * Take a completed network device structure and add it to the kernel
6906 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6907 * chain. 0 is returned on success. A negative errno code is returned
6908 * on a failure to set up the device, or if the name is a duplicate.
6910 * This is a wrapper around register_netdevice that takes the rtnl semaphore
6911 * and expands the device name if you passed a format string to
6912 * alloc_netdev.
6914 int register_netdev(struct net_device *dev)
6916 int err;
6918 rtnl_lock();
6919 err = register_netdevice(dev);
6920 rtnl_unlock();
6921 return err;
6923 EXPORT_SYMBOL(register_netdev);
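/* Illustrative sketch: the usual probe-time pairing of allocation and
 * registration; struct example_priv and example_probe_netdev() are
 * hypothetical, alloc_etherdev() comes from <linux/etherdevice.h>.
 */
struct example_priv {
	int link_up;
};

static struct net_device *example_probe_netdev(void)
{
	struct net_device *dev;

	dev = alloc_etherdev(sizeof(struct example_priv));
	if (!dev)
		return NULL;

	if (register_netdev(dev)) {
		/* registration failed: the allocation is only undone by free_netdev() */
		free_netdev(dev);
		return NULL;
	}
	return dev;
}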
6925 int netdev_refcnt_read(const struct net_device *dev)
6927 int i, refcnt = 0;
6929 for_each_possible_cpu(i)
6930 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
6931 return refcnt;
6933 EXPORT_SYMBOL(netdev_refcnt_read);
6936 * netdev_wait_allrefs - wait until all references are gone.
6937 * @dev: target net_device
6939 * This is called when unregistering network devices.
6941 * Any protocol or device that holds a reference should register
6942 * for netdevice notification, and cleanup and put back the
6943 * reference if they receive an UNREGISTER event.
6944 * We can get stuck here if buggy protocols don't correctly
6945 * call dev_put.
6947 static void netdev_wait_allrefs(struct net_device *dev)
6949 unsigned long rebroadcast_time, warning_time;
6950 int refcnt;
6952 linkwatch_forget_dev(dev);
6954 rebroadcast_time = warning_time = jiffies;
6955 refcnt = netdev_refcnt_read(dev);
6957 while (refcnt != 0) {
6958 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6959 rtnl_lock();
6961 /* Rebroadcast unregister notification */
6962 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6964 __rtnl_unlock();
6965 rcu_barrier();
6966 rtnl_lock();
6968 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6969 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
6970 &dev->state)) {
6971 /* We must not have linkwatch events
6972 * pending on unregister. If this
6973 * happens, we simply run the queue
6974 * unscheduled, resulting in a noop
6975 * for this device.
6977 linkwatch_run_queue();
6980 __rtnl_unlock();
6982 rebroadcast_time = jiffies;
6985 msleep(250);
6987 refcnt = netdev_refcnt_read(dev);
6989 if (time_after(jiffies, warning_time + 10 * HZ)) {
6990 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
6991 dev->name, refcnt);
6992 warning_time = jiffies;
6997 /* The sequence is:
6999 * rtnl_lock();
7000 * ...
7001 * register_netdevice(x1);
7002 * register_netdevice(x2);
7003 * ...
7004 * unregister_netdevice(y1);
7005 * unregister_netdevice(y2);
7006 * ...
7007 * rtnl_unlock();
7008 * free_netdev(y1);
7009 * free_netdev(y2);
7011 * We are invoked by rtnl_unlock().
7012 * This allows us to deal with problems:
7013 * 1) We can delete sysfs objects which invoke hotplug
7014 * without deadlocking with linkwatch via keventd.
7015 * 2) Since we run with the RTNL semaphore not held, we can sleep
7016 * safely in order to wait for the netdev refcnt to drop to zero.
7018 * We must not return until all unregister events added during
7019 * the interval the lock was held have been completed.
7021 void netdev_run_todo(void)
7023 struct list_head list;
7025 /* Snapshot list, allow later requests */
7026 list_replace_init(&net_todo_list, &list);
7028 __rtnl_unlock();
7031 /* Wait for rcu callbacks to finish before next phase */
7032 if (!list_empty(&list))
7033 rcu_barrier();
7035 while (!list_empty(&list)) {
7036 struct net_device *dev
7037 = list_first_entry(&list, struct net_device, todo_list);
7038 list_del(&dev->todo_list);
7040 rtnl_lock();
7041 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7042 __rtnl_unlock();
7044 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7045 pr_err("network todo '%s' but state %d\n",
7046 dev->name, dev->reg_state);
7047 dump_stack();
7048 continue;
7051 dev->reg_state = NETREG_UNREGISTERED;
7053 netdev_wait_allrefs(dev);
7055 /* paranoia */
7056 BUG_ON(netdev_refcnt_read(dev));
7057 BUG_ON(!list_empty(&dev->ptype_all));
7058 BUG_ON(!list_empty(&dev->ptype_specific));
7059 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7060 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7061 WARN_ON(dev->dn_ptr);
7063 if (dev->destructor)
7064 dev->destructor(dev);
7066 /* Report a network device has been unregistered */
7067 rtnl_lock();
7068 dev_net(dev)->dev_unreg_count--;
7069 __rtnl_unlock();
7070 wake_up(&netdev_unregistering_wq);
7072 /* Free network device */
7073 kobject_put(&dev->dev.kobj);
7077 /* Convert net_device_stats to rtnl_link_stats64. They have the same
7078 * fields in the same order, with only the type differing.
7080 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7081 const struct net_device_stats *netdev_stats)
7083 #if BITS_PER_LONG == 64
7084 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
7085 memcpy(stats64, netdev_stats, sizeof(*stats64));
7086 #else
7087 size_t i, n = sizeof(*stats64) / sizeof(u64);
7088 const unsigned long *src = (const unsigned long *)netdev_stats;
7089 u64 *dst = (u64 *)stats64;
7091 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
7092 sizeof(*stats64) / sizeof(u64));
7093 for (i = 0; i < n; i++)
7094 dst[i] = src[i];
7095 #endif
7097 EXPORT_SYMBOL(netdev_stats_to_stats64);
7100 * dev_get_stats - get network device statistics
7101 * @dev: device to get statistics from
7102 * @storage: place to store stats
7104 * Get network statistics from device. Return @storage.
7105 * The device driver may provide its own method by setting
7106 * dev->netdev_ops->ndo_get_stats64 or dev->netdev_ops->ndo_get_stats;
7107 * otherwise the internal statistics structure is used.
7109 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7110 struct rtnl_link_stats64 *storage)
7112 const struct net_device_ops *ops = dev->netdev_ops;
7114 if (ops->ndo_get_stats64) {
7115 memset(storage, 0, sizeof(*storage));
7116 ops->ndo_get_stats64(dev, storage);
7117 } else if (ops->ndo_get_stats) {
7118 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7119 } else {
7120 netdev_stats_to_stats64(storage, &dev->stats);
7122 storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
7123 storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
7124 return storage;
7126 EXPORT_SYMBOL(dev_get_stats);
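/* Illustrative sketch: the caller owns the storage and dev_get_stats() fills
 * it through whichever method the driver provides; example_rx_packets() is a
 * hypothetical helper.
 */
static u64 example_rx_packets(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);
	return stats.rx_packets;
}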
7128 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7130 struct netdev_queue *queue = dev_ingress_queue(dev);
7132 #ifdef CONFIG_NET_CLS_ACT
7133 if (queue)
7134 return queue;
7135 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7136 if (!queue)
7137 return NULL;
7138 netdev_init_one_queue(dev, queue, NULL);
7139 RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7140 queue->qdisc_sleeping = &noop_qdisc;
7141 rcu_assign_pointer(dev->ingress_queue, queue);
7142 #endif
7143 return queue;
7146 static const struct ethtool_ops default_ethtool_ops;
7148 void netdev_set_default_ethtool_ops(struct net_device *dev,
7149 const struct ethtool_ops *ops)
7151 if (dev->ethtool_ops == &default_ethtool_ops)
7152 dev->ethtool_ops = ops;
7154 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7156 void netdev_freemem(struct net_device *dev)
7158 char *addr = (char *)dev - dev->padded;
7160 kvfree(addr);
7164 * alloc_netdev_mqs - allocate network device
7165 * @sizeof_priv: size of private data to allocate space for
7166 * @name: device name format string
7167 * @name_assign_type: origin of device name
7168 * @setup: callback to initialize device
7169 * @txqs: the number of TX subqueues to allocate
7170 * @rxqs: the number of RX subqueues to allocate
7172 * Allocates a struct net_device with private data area for driver use
7173 * and performs basic initialization. Also allocates subqueue structs
7174 * for each queue on the device.
7176 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7177 unsigned char name_assign_type,
7178 void (*setup)(struct net_device *),
7179 unsigned int txqs, unsigned int rxqs)
7181 struct net_device *dev;
7182 size_t alloc_size;
7183 struct net_device *p;
7185 BUG_ON(strlen(name) >= sizeof(dev->name));
7187 if (txqs < 1) {
7188 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7189 return NULL;
7192 #ifdef CONFIG_SYSFS
7193 if (rxqs < 1) {
7194 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7195 return NULL;
7197 #endif
7199 alloc_size = sizeof(struct net_device);
7200 if (sizeof_priv) {
7201 /* ensure 32-byte alignment of private area */
7202 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7203 alloc_size += sizeof_priv;
7205 /* ensure 32-byte alignment of whole construct */
7206 alloc_size += NETDEV_ALIGN - 1;
7208 p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7209 if (!p)
7210 p = vzalloc(alloc_size);
7211 if (!p)
7212 return NULL;
7214 dev = PTR_ALIGN(p, NETDEV_ALIGN);
7215 dev->padded = (char *)dev - (char *)p;
7217 dev->pcpu_refcnt = alloc_percpu(int);
7218 if (!dev->pcpu_refcnt)
7219 goto free_dev;
7221 if (dev_addr_init(dev))
7222 goto free_pcpu;
7224 dev_mc_init(dev);
7225 dev_uc_init(dev);
7227 dev_net_set(dev, &init_net);
7229 dev->gso_max_size = GSO_MAX_SIZE;
7230 dev->gso_max_segs = GSO_MAX_SEGS;
7231 dev->gso_min_segs = 0;
7233 INIT_LIST_HEAD(&dev->napi_list);
7234 INIT_LIST_HEAD(&dev->unreg_list);
7235 INIT_LIST_HEAD(&dev->close_list);
7236 INIT_LIST_HEAD(&dev->link_watch_list);
7237 INIT_LIST_HEAD(&dev->adj_list.upper);
7238 INIT_LIST_HEAD(&dev->adj_list.lower);
7239 INIT_LIST_HEAD(&dev->all_adj_list.upper);
7240 INIT_LIST_HEAD(&dev->all_adj_list.lower);
7241 INIT_LIST_HEAD(&dev->ptype_all);
7242 INIT_LIST_HEAD(&dev->ptype_specific);
7243 dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7244 setup(dev);
7246 if (!dev->tx_queue_len) {
7247 dev->priv_flags |= IFF_NO_QUEUE;
7248 dev->tx_queue_len = 1;
7251 dev->num_tx_queues = txqs;
7252 dev->real_num_tx_queues = txqs;
7253 if (netif_alloc_netdev_queues(dev))
7254 goto free_all;
7256 #ifdef CONFIG_SYSFS
7257 dev->num_rx_queues = rxqs;
7258 dev->real_num_rx_queues = rxqs;
7259 if (netif_alloc_rx_queues(dev))
7260 goto free_all;
7261 #endif
7263 strcpy(dev->name, name);
7264 dev->name_assign_type = name_assign_type;
7265 dev->group = INIT_NETDEV_GROUP;
7266 if (!dev->ethtool_ops)
7267 dev->ethtool_ops = &default_ethtool_ops;
7269 nf_hook_ingress_init(dev);
7271 return dev;
7273 free_all:
7274 free_netdev(dev);
7275 return NULL;
7277 free_pcpu:
7278 free_percpu(dev->pcpu_refcnt);
7279 free_dev:
7280 netdev_freemem(dev);
7281 return NULL;
7283 EXPORT_SYMBOL(alloc_netdev_mqs);
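/* Illustrative sketch: allocating an 8/8-queue device without a private data
 * area; example_setup() and the "example%d" name template are hypothetical,
 * and most Ethernet drivers would use the alloc_etherdev_mq() wrapper rather
 * than calling alloc_netdev_mqs() directly.
 */
static void example_setup(struct net_device *dev)
{
	ether_setup(dev);
	dev->flags |= IFF_NOARP;
}

static struct net_device *example_alloc(void)
{
	return alloc_netdev_mqs(0, "example%d", NET_NAME_UNKNOWN,
				example_setup, 8, 8);
}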
7286 * free_netdev - free network device
7287 * @dev: device
7289 * This function does the last stage of destroying an allocated device
7290 * interface. The reference to the device object is released.
7291 * If this is the last reference then it will be freed.
7293 void free_netdev(struct net_device *dev)
7295 struct napi_struct *p, *n;
7297 netif_free_tx_queues(dev);
7298 #ifdef CONFIG_SYSFS
7299 kvfree(dev->_rx);
7300 #endif
7302 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7304 /* Flush device addresses */
7305 dev_addr_flush(dev);
7307 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7308 netif_napi_del(p);
7310 free_percpu(dev->pcpu_refcnt);
7311 dev->pcpu_refcnt = NULL;
7313 /* Compatibility with error handling in drivers */
7314 if (dev->reg_state == NETREG_UNINITIALIZED) {
7315 netdev_freemem(dev);
7316 return;
7319 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7320 dev->reg_state = NETREG_RELEASED;
7322 /* will free via device release */
7323 put_device(&dev->dev);
7325 EXPORT_SYMBOL(free_netdev);
7328 * synchronize_net - Synchronize with packet receive processing
7330 * Wait for packets currently being received to be done.
7331 * Does not block later packets from starting.
7333 void synchronize_net(void)
7335 might_sleep();
7336 if (rtnl_is_locked())
7337 synchronize_rcu_expedited();
7338 else
7339 synchronize_rcu();
7341 EXPORT_SYMBOL(synchronize_net);
7344 * unregister_netdevice_queue - remove device from the kernel
7345 * @dev: device
7346 * @head: list
7348 * This function shuts down a device interface and removes it
7349 * from the kernel tables.
7350 * If head not NULL, device is queued to be unregistered later.
7352 * Callers must hold the rtnl semaphore. You may want
7353 * unregister_netdev() instead of this.
7356 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7358 ASSERT_RTNL();
7360 if (head) {
7361 list_move_tail(&dev->unreg_list, head);
7362 } else {
7363 rollback_registered(dev);
7364 /* Finish processing unregister after unlock */
7365 net_set_todo(dev);
7368 EXPORT_SYMBOL(unregister_netdevice_queue);
7371 * unregister_netdevice_many - unregister many devices
7372 * @head: list of devices
7374 * Note: As most callers use a stack-allocated list_head,
7375 * we force a list_del() to make sure the stack won't be corrupted later.
7377 void unregister_netdevice_many(struct list_head *head)
7379 struct net_device *dev;
7381 if (!list_empty(head)) {
7382 rollback_registered_many(head);
7383 list_for_each_entry(dev, head, unreg_list)
7384 net_set_todo(dev);
7385 list_del(head);
7388 EXPORT_SYMBOL(unregister_netdevice_many);
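/* Illustrative sketch: queueing several devices and tearing them down in one
 * RTNL section, so rollback_registered_many() synchronizes once for the whole
 * batch; example_destroy_many() and its arguments are hypothetical.
 */
static void example_destroy_many(struct net_device **devs, unsigned int count)
{
	LIST_HEAD(kill_list);
	unsigned int i;

	rtnl_lock();
	for (i = 0; i < count; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}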
7391 * unregister_netdev - remove device from the kernel
7392 * @dev: device
7394 * This function shuts down a device interface and removes it
7395 * from the kernel tables.
7397 * This is just a wrapper for unregister_netdevice that takes
7398 * the rtnl semaphore. In general you want to use this and not
7399 * unregister_netdevice.
7401 void unregister_netdev(struct net_device *dev)
7403 rtnl_lock();
7404 unregister_netdevice(dev);
7405 rtnl_unlock();
7407 EXPORT_SYMBOL(unregister_netdev);
7410 * dev_change_net_namespace - move device to a different network namespace
7411 * @dev: device
7412 * @net: network namespace
7413 * @pat: If not NULL name pattern to try if the current device name
7414 * is already taken in the destination network namespace.
7416 * This function shuts down a device interface and moves it
7417 * to a new network namespace. On success 0 is returned, on
7418 * a failure a negative errno code is returned.
7420 * Callers must hold the rtnl semaphore.
7423 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7425 int err;
7427 ASSERT_RTNL();
7429 /* Don't allow namespace local devices to be moved. */
7430 err = -EINVAL;
7431 if (dev->features & NETIF_F_NETNS_LOCAL)
7432 goto out;
7434 /* Ensure the device has been registered */
7435 if (dev->reg_state != NETREG_REGISTERED)
7436 goto out;
7438 /* Get out if there is nothing to do */
7439 err = 0;
7440 if (net_eq(dev_net(dev), net))
7441 goto out;
7443 /* Pick the destination device name, and ensure
7444 * we can use it in the destination network namespace.
7446 err = -EEXIST;
7447 if (__dev_get_by_name(net, dev->name)) {
7448 /* We get here if we can't use the current device name */
7449 if (!pat)
7450 goto out;
7451 err = dev_get_valid_name(net, dev, pat);
7452 if (err < 0)
7453 goto out;
7457 * And now a mini version of register_netdevice and unregister_netdevice.
7460 /* If device is running close it first. */
7461 dev_close(dev);
7463 /* And unlink it from device chain */
7464 unlist_netdevice(dev);
7466 synchronize_net();
7468 /* Shutdown queueing discipline. */
7469 dev_shutdown(dev);
7471 /* Notify protocols that we are about to destroy
7472 this device. They should clean all the things.
7474 Note that dev->reg_state stays at NETREG_REGISTERED.
7475 This is intentional, so that 8021q and macvlan know
7476 the device is just moving and can keep their slaves up.
7478 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7479 rcu_barrier();
7480 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7481 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7484 * Flush the unicast and multicast chains
7486 dev_uc_flush(dev);
7487 dev_mc_flush(dev);
7489 /* Send a netdev-removed uevent to the old namespace */
7490 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7491 netdev_adjacent_del_links(dev);
7493 /* Actually switch the network namespace */
7494 dev_net_set(dev, net);
7496 /* If there is an ifindex conflict assign a new one */
7497 if (__dev_get_by_index(net, dev->ifindex))
7498 dev->ifindex = dev_new_index(net);
7500 /* Send a netdev-add uevent to the new namespace */
7501 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7502 netdev_adjacent_add_links(dev);
7504 /* Fixup kobjects */
7505 err = device_rename(&dev->dev, dev->name);
7506 WARN_ON(err);
7508 /* Add the device back in the hashes */
7509 list_netdevice(dev);
7511 /* Notify protocols, that a new device appeared. */
7512 call_netdevice_notifiers(NETDEV_REGISTER, dev);
7515 * Prevent userspace races by waiting until the network
7516 * device is fully setup before sending notifications.
7518 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7520 synchronize_net();
7521 err = 0;
7522 out:
7523 return err;
7525 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
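/* Illustrative sketch: moving a device into the network namespace of a given
 * process; example_move_to_pid_netns() is hypothetical, while
 * get_net_ns_by_pid()/put_net() and the RTNL requirement are real.
 */
static int example_move_to_pid_netns(struct net_device *dev, pid_t pid)
{
	struct net *net;
	int err;

	net = get_net_ns_by_pid(pid);
	if (IS_ERR(net))
		return PTR_ERR(net);

	rtnl_lock();
	err = dev_change_net_namespace(dev, net, "eth%d");
	rtnl_unlock();

	put_net(net);
	return err;
}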
7527 static int dev_cpu_callback(struct notifier_block *nfb,
7528 unsigned long action,
7529 void *ocpu)
7531 struct sk_buff **list_skb;
7532 struct sk_buff *skb;
7533 unsigned int cpu, oldcpu = (unsigned long)ocpu;
7534 struct softnet_data *sd, *oldsd;
7536 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7537 return NOTIFY_OK;
7539 local_irq_disable();
7540 cpu = smp_processor_id();
7541 sd = &per_cpu(softnet_data, cpu);
7542 oldsd = &per_cpu(softnet_data, oldcpu);
7544 /* Find end of our completion_queue. */
7545 list_skb = &sd->completion_queue;
7546 while (*list_skb)
7547 list_skb = &(*list_skb)->next;
7548 /* Append completion queue from offline CPU. */
7549 *list_skb = oldsd->completion_queue;
7550 oldsd->completion_queue = NULL;
7552 /* Append output queue from offline CPU. */
7553 if (oldsd->output_queue) {
7554 *sd->output_queue_tailp = oldsd->output_queue;
7555 sd->output_queue_tailp = oldsd->output_queue_tailp;
7556 oldsd->output_queue = NULL;
7557 oldsd->output_queue_tailp = &oldsd->output_queue;
7559 /* Append NAPI poll list from offline CPU, with one exception:
7560 * process_backlog() must be called by cpu owning percpu backlog.
7561 * We properly handle process_queue & input_pkt_queue later.
7563 while (!list_empty(&oldsd->poll_list)) {
7564 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7565 struct napi_struct,
7566 poll_list);
7568 list_del_init(&napi->poll_list);
7569 if (napi->poll == process_backlog)
7570 napi->state = 0;
7571 else
7572 ____napi_schedule(sd, napi);
7575 raise_softirq_irqoff(NET_TX_SOFTIRQ);
7576 local_irq_enable();
7578 /* Process offline CPU's input_pkt_queue */
7579 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7580 netif_rx_ni(skb);
7581 input_queue_head_incr(oldsd);
7583 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7584 netif_rx_ni(skb);
7585 input_queue_head_incr(oldsd);
7588 return NOTIFY_OK;
7593 * netdev_increment_features - increment feature set by one
7594 * @all: current feature set
7595 * @one: new feature set
7596 * @mask: mask feature set
7598 * Computes a new feature set after adding a device with feature set
7599 * @one to the master device with current feature set @all. Will not
7600 * enable anything that is off in @mask. Returns the new feature set.
7602 netdev_features_t netdev_increment_features(netdev_features_t all,
7603 netdev_features_t one, netdev_features_t mask)
7605 if (mask & NETIF_F_GEN_CSUM)
7606 mask |= NETIF_F_ALL_CSUM;
7607 mask |= NETIF_F_VLAN_CHALLENGED;
7609 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
7610 all &= one | ~NETIF_F_ALL_FOR_ALL;
7612 /* If one device supports hw checksumming, set for all. */
7613 if (all & NETIF_F_GEN_CSUM)
7614 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7616 return all;
7618 EXPORT_SYMBOL(netdev_increment_features);
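/* Illustrative sketch: a bonding/bridge style master folding the feature sets
 * of its lower devices together under RTNL; seeding with NETIF_F_ALL_FOR_ALL
 * and masking with the master's hw_features are hypothetical policy choices.
 */
static netdev_features_t example_master_features(struct net_device *master)
{
	netdev_features_t features = NETIF_F_ALL_FOR_ALL;
	struct net_device *lower;
	struct list_head *iter;

	netdev_for_each_lower_dev(master, lower, iter)
		features = netdev_increment_features(features,
						     lower->features,
						     master->hw_features);
	return features;
}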
7620 static struct hlist_head * __net_init netdev_create_hash(void)
7622 int i;
7623 struct hlist_head *hash;
7625 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7626 if (hash != NULL)
7627 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7628 INIT_HLIST_HEAD(&hash[i]);
7630 return hash;
7633 /* Initialize per network namespace state */
7634 static int __net_init netdev_init(struct net *net)
7636 if (net != &init_net)
7637 INIT_LIST_HEAD(&net->dev_base_head);
7639 net->dev_name_head = netdev_create_hash();
7640 if (net->dev_name_head == NULL)
7641 goto err_name;
7643 net->dev_index_head = netdev_create_hash();
7644 if (net->dev_index_head == NULL)
7645 goto err_idx;
7647 return 0;
7649 err_idx:
7650 kfree(net->dev_name_head);
7651 err_name:
7652 return -ENOMEM;
7656 * netdev_drivername - network driver for the device
7657 * @dev: network device
7659 * Determine network driver for device.
7661 const char *netdev_drivername(const struct net_device *dev)
7663 const struct device_driver *driver;
7664 const struct device *parent;
7665 const char *empty = "";
7667 parent = dev->dev.parent;
7668 if (!parent)
7669 return empty;
7671 driver = parent->driver;
7672 if (driver && driver->name)
7673 return driver->name;
7674 return empty;
7677 static void __netdev_printk(const char *level, const struct net_device *dev,
7678 struct va_format *vaf)
7680 if (dev && dev->dev.parent) {
7681 dev_printk_emit(level[1] - '0',
7682 dev->dev.parent,
7683 "%s %s %s%s: %pV",
7684 dev_driver_string(dev->dev.parent),
7685 dev_name(dev->dev.parent),
7686 netdev_name(dev), netdev_reg_state(dev),
7687 vaf);
7688 } else if (dev) {
7689 printk("%s%s%s: %pV",
7690 level, netdev_name(dev), netdev_reg_state(dev), vaf);
7691 } else {
7692 printk("%s(NULL net_device): %pV", level, vaf);
7696 void netdev_printk(const char *level, const struct net_device *dev,
7697 const char *format, ...)
7699 struct va_format vaf;
7700 va_list args;
7702 va_start(args, format);
7704 vaf.fmt = format;
7705 vaf.va = &args;
7707 __netdev_printk(level, dev, &vaf);
7709 va_end(args);
7711 EXPORT_SYMBOL(netdev_printk);
7713 #define define_netdev_printk_level(func, level) \
7714 void func(const struct net_device *dev, const char *fmt, ...) \
7716 struct va_format vaf; \
7717 va_list args; \
7719 va_start(args, fmt); \
7721 vaf.fmt = fmt; \
7722 vaf.va = &args; \
7724 __netdev_printk(level, dev, &vaf); \
7726 va_end(args); \
7728 EXPORT_SYMBOL(func);
7730 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
7731 define_netdev_printk_level(netdev_alert, KERN_ALERT);
7732 define_netdev_printk_level(netdev_crit, KERN_CRIT);
7733 define_netdev_printk_level(netdev_err, KERN_ERR);
7734 define_netdev_printk_level(netdev_warn, KERN_WARNING);
7735 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
7736 define_netdev_printk_level(netdev_info, KERN_INFO);
7738 static void __net_exit netdev_exit(struct net *net)
7740 kfree(net->dev_name_head);
7741 kfree(net->dev_index_head);
7744 static struct pernet_operations __net_initdata netdev_net_ops = {
7745 .init = netdev_init,
7746 .exit = netdev_exit,
7749 static void __net_exit default_device_exit(struct net *net)
7751 struct net_device *dev, *aux;
7753 * Push all migratable network devices back to the
7754 * initial network namespace
7756 rtnl_lock();
7757 for_each_netdev_safe(net, dev, aux) {
7758 int err;
7759 char fb_name[IFNAMSIZ];
7761 /* Ignore unmoveable devices (i.e. loopback) */
7762 if (dev->features & NETIF_F_NETNS_LOCAL)
7763 continue;
7765 /* Leave virtual devices for the generic cleanup */
7766 if (dev->rtnl_link_ops)
7767 continue;
7769 /* Push remaining network devices to init_net */
7770 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
7771 err = dev_change_net_namespace(dev, &init_net, fb_name);
7772 if (err) {
7773 pr_emerg("%s: failed to move %s to init_net: %d\n",
7774 __func__, dev->name, err);
7775 BUG();
7778 rtnl_unlock();
7781 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
7783 /* Return with the rtnl_lock held when there are no network
7784 * devices unregistering in any network namespace in net_list.
7786 struct net *net;
7787 bool unregistering;
7788 DEFINE_WAIT_FUNC(wait, woken_wake_function);
7790 add_wait_queue(&netdev_unregistering_wq, &wait);
7791 for (;;) {
7792 unregistering = false;
7793 rtnl_lock();
7794 list_for_each_entry(net, net_list, exit_list) {
7795 if (net->dev_unreg_count > 0) {
7796 unregistering = true;
7797 break;
7800 if (!unregistering)
7801 break;
7802 __rtnl_unlock();
7804 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
7806 remove_wait_queue(&netdev_unregistering_wq, &wait);
7809 static void __net_exit default_device_exit_batch(struct list_head *net_list)
7811 /* At exit all network devices must be removed from a network
7812 * namespace. Do this in the reverse order of registration.
7813 * Do this across as many network namespaces as possible to
7814 * improve batching efficiency.
7816 struct net_device *dev;
7817 struct net *net;
7818 LIST_HEAD(dev_kill_list);
7820 /* To prevent network device cleanup code from dereferencing
7821 * loopback devices or network devices that have been freed
7822 * wait here for all pending unregistrations to complete,
7823 * before unregistering the loopback device and allowing the
7824 * network namespace to be freed.
7826 * The netdev todo list containing all network device
7827 * unregistrations that happen in default_device_exit_batch
7828 * will run in the rtnl_unlock() at the end of
7829 * default_device_exit_batch.
7831 rtnl_lock_unregistering(net_list);
7832 list_for_each_entry(net, net_list, exit_list) {
7833 for_each_netdev_reverse(net, dev) {
7834 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
7835 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
7836 else
7837 unregister_netdevice_queue(dev, &dev_kill_list);
7840 unregister_netdevice_many(&dev_kill_list);
7841 rtnl_unlock();
7844 static struct pernet_operations __net_initdata default_device_ops = {
7845 .exit = default_device_exit,
7846 .exit_batch = default_device_exit_batch,
7850 * Initialize the DEV module. At boot time this walks the device list and
7851 * unhooks any devices that fail to initialise (normally hardware not
7852 * present) and leaves us with a valid list of present and active devices.
7857 * This is called single threaded during boot, so no need
7858 * to take the rtnl semaphore.
7860 static int __init net_dev_init(void)
7862 int i, rc = -ENOMEM;
7864 BUG_ON(!dev_boot_phase);
7866 if (dev_proc_init())
7867 goto out;
7869 if (netdev_kobject_init())
7870 goto out;
7872 INIT_LIST_HEAD(&ptype_all);
7873 for (i = 0; i < PTYPE_HASH_SIZE; i++)
7874 INIT_LIST_HEAD(&ptype_base[i]);
7876 INIT_LIST_HEAD(&offload_base);
7878 if (register_pernet_subsys(&netdev_net_ops))
7879 goto out;
7882 * Initialise the packet receive queues.
7885 for_each_possible_cpu(i) {
7886 struct softnet_data *sd = &per_cpu(softnet_data, i);
7888 skb_queue_head_init(&sd->input_pkt_queue);
7889 skb_queue_head_init(&sd->process_queue);
7890 INIT_LIST_HEAD(&sd->poll_list);
7891 sd->output_queue_tailp = &sd->output_queue;
7892 #ifdef CONFIG_RPS
7893 sd->csd.func = rps_trigger_softirq;
7894 sd->csd.info = sd;
7895 sd->cpu = i;
7896 #endif
7898 sd->backlog.poll = process_backlog;
7899 sd->backlog.weight = weight_p;
7902 dev_boot_phase = 0;
7904 /* The loopback device is special: if any other network device
7905 * is present in a network namespace, the loopback device must
7906 * be present too. Since we now dynamically allocate and free the
7907 * loopback device, ensure this invariant is maintained by
7908 * keeping the loopback device as the first device on the
7909 * list of network devices, so that the loopback device
7910 * is the first device that appears and the last network device
7911 * that disappears.
7913 if (register_pernet_device(&loopback_net_ops))
7914 goto out;
7916 if (register_pernet_device(&default_device_ops))
7917 goto out;
7919 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
7920 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
7922 hotcpu_notifier(dev_cpu_callback, 0);
7923 dst_subsys_init();
7924 rc = 0;
7925 out:
7926 return rc;
7929 subsys_initcall(net_dev_init);