/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *	    Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *	    Michael Chastain	:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *	Paul Rusty Russell	:	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst_metadata.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>
#include <linux/hashtable.h>
#include <linux/vmalloc.h>
#include <linux/if_macvlan.h>
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>

#include "net-sysfs.h"
/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

static int netif_rx_internal(struct sk_buff *skb);
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info);
/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);

static unsigned int napi_gen_id = NR_CPUS;
static DEFINE_HASHTABLE(napi_hash, 8);

static seqcount_t devnet_rename_seq;
static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
	spin_lock(&sd->input_pkt_queue.lock);
}

static inline void rps_unlock(struct softnet_data *sd)
{
	spin_unlock(&sd->input_pkt_queue.lock);
}
/* Device list insertion */
static void list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);
}
/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);
#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif
/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return pt->dev ? &pt->dev->ptype_all : &ptype_all;

	return pt->dev ? &pt->dev->ptype_specific :
			 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}
/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that CPUs
 *	that are in the middle of receiving packets will see the new packet
 *	type (until the next received packet).
 */
void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);
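
/*
 * Example (illustrative sketch, not part of the original file): a module
 * that wants to tap every received ARP frame could register a handler
 * roughly like this; the my_arp_rcv() and my_arp_ptype names are made up.
 *
 *	static int my_arp_rcv(struct sk_buff *skb, struct net_device *dev,
 *			      struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		consume_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_arp_ptype __read_mostly = {
 *		.type	= cpu_to_be16(ETH_P_ARP),
 *		.func	= my_arp_rcv,
 *	};
 *
 *	dev_add_pack(&my_arp_ptype);
 */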
/**
 *	__dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);
/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);
/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep, therefore it cannot guarantee that CPUs
 *	that are in the middle of receiving packets will see the new offload
 *	handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct packet_offload *elem;

	spin_lock(&offload_lock);
	list_for_each_entry(elem, &offload_base, list) {
		if (po->priority < elem->priority)
			break;
	}
	list_add_rcu(&po->list, elem->list.prev);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);
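
/*
 * Example (illustrative sketch, not part of the original file): a protocol
 * that provides GRO callbacks registers them once at init time; the
 * myproto_* names and ETH_P_MYPROTO are hypothetical.
 *
 *	static struct packet_offload myproto_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_MYPROTO),
 *		.priority = 10,
 *		.callbacks = {
 *			.gro_receive  = myproto_gro_receive,
 *			.gro_complete = myproto_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&myproto_offload);
 */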
/**
 *	__dev_remove_offload	 - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPUs have gone
 *	through a quiescent state.
 */
static void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}

/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);
/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}
/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq	= s[i].map.irq;
			dev->base_addr	= s[i].map.base_addr;
			dev->mem_start	= s[i].map.mem_start;
			dev->mem_end	= s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);
/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}
/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);
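
/*
 * Example (illustrative, assumed format): with the table above, a kernel
 * command line such as
 *
 *	netdev=5,0x340,0xd0000,0xd4000,eth0
 *
 * is parsed by netdev_boot_setup() as <irq>,<io>,<mem_start>,<mem_end>,<name>
 * and later applied by netdev_boot_setup_check() when a driver probes "eth0".
 */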
/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	dev_get_iflink	- get 'iflink' value of an interface
 *	@dev: targeted interface
 *
 *	Indicates the ifindex the interface is linked to.
 *	Physical interfaces have the same 'ifindex' and 'iflink' values.
 */
int dev_get_iflink(const struct net_device *dev)
{
	if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
		return dev->netdev_ops->ndo_get_iflink(dev);

	return dev->ifindex;
}
EXPORT_SYMBOL(dev_get_iflink);
/**
 *	dev_fill_metadata_dst - Retrieve tunnel egress information.
 *	@dev: targeted interface
 *	@skb: the packet
 *
 *	For better visibility of tunnel traffic OVS needs to retrieve
 *	egress tunnel information for a packet. The following API allows
 *	the user to get this info.
 */
int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
{
	struct ip_tunnel_info *info;

	if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
		return -EINVAL;

	info = skb_tunnel_info_unclone(skb);
	if (!info)
		return -ENOMEM;
	if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
		return -EINVAL;

	return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
}
EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */
struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);
/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */
struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);
/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */
struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
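
/*
 * Example (illustrative): dev_get_by_name() returns a held reference, so the
 * caller must pair it with dev_put() once the device is no longer needed.
 *
 *	struct net_device *dev = dev_get_by_name(&init_net, "eth0");
 *
 *	if (dev) {
 *		pr_info("%s: ifindex %d\n", dev->name, dev->ifindex);
 *		dev_put(dev);
 *	}
 */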
/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */
struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */
struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);
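
/*
 * Example (illustrative): the _rcu lookups take no reference, so the result
 * is only valid inside the RCU read-side section unless dev_hold() is used.
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_index_rcu(net, ifindex);
 *	if (dev)
 *		mtu = dev->mtu;
 *	rcu_read_unlock();
 */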
/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */
struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);
/**
 *	netdev_get_name - get a netdevice name, knowing its ifindex.
 *	@net: network namespace
 *	@name: a pointer to the buffer where the name will be stored.
 *	@ifindex: the ifindex of the interface to get the name from.
 *
 *	The use of raw_seqcount_begin() and cond_resched() before
 *	retrying is required as we want to give the writers a chance
 *	to complete when CONFIG_PREEMPT is not set.
 */
int netdev_get_name(struct net *net, char *name, int ifindex)
{
	struct net_device *dev;
	unsigned int seq;

retry:
	seq = raw_seqcount_begin(&devnet_rename_seq);
	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (!dev) {
		rcu_read_unlock();
		return -ENODEV;
	}

	strcpy(name, dev->name);
	rcu_read_unlock();
	if (read_seqcount_retry(&devnet_rename_seq, seq)) {
		cond_resched();
		goto retry;
	}

	return 0;
}
/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking.
 */
struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);
/**
 *	__dev_get_by_flags - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rtnl_lock(), and result refcount is unchanged.
 */
struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
				      unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	for_each_netdev(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(__dev_get_by_flags);
/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || *name == ':' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);
/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */
static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/* avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}
/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */
int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
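
/*
 * Example (illustrative): drivers usually let the core pick the unit number
 * by passing a format string, e.g.
 *
 *	err = dev_alloc_name(dev, "dummy%d");
 *
 * which stores the first free name in dev->name ("dummy0", "dummy1", ...)
 * and returns the unit number chosen, or a negative errno.
 */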
static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

int dev_get_valid_name(struct net *net, struct net_device *dev,
		       const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}
EXPORT_SYMBOL(dev_get_valid_name);
/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d" for
 *	wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	unsigned char old_assign_type;
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

	if (oldname[0] && !strchr(oldname, '%'))
		netdev_info(dev, "renamed from %s\n", oldname);

	old_assign_type = dev->name_assign_type;
	dev->name_assign_type = NET_NAME_RENAMED;

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		dev->name_assign_type = old_assign_type;
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	netdev_adjacent_rename_links(dev, oldname);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			memcpy(oldname, newname, IFNAMSIZ);
			dev->name_assign_type = old_assign_type;
			old_assign_type = NET_NAME_RENAMED;
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}
/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device.
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;
	memcpy(dev->ifalias, alias, len);
	dev->ifalias[len] = 0;

	return 0;
}
/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = 0;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
	}
}
EXPORT_SYMBOL(netdev_state_change);
/**
 * netdev_notify_peers - notify network peers about existence of @dev
 * @dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);
static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	netpoll_poll_disable(dev);

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_poll_enable(dev);

	if (ret) {
		clear_bit(__LINK_STATE_START, &dev->state);
	} else {
		dev->flags |= IFF_UP;
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}
/**
 *	dev_open	- prepare an interface for use.
 *	@dev: device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);
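
/*
 * Example (illustrative): bringing an interface up from kernel code is the
 * equivalent of "ip link set dev eth0 up" and must be done under RTNL:
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */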
static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	list_for_each_entry(dev, head, close_list) {
		/* Temporarily disable netpoll until the interface is down */
		netpoll_poll_disable(dev);

		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of its
		 * napi_struct instances on this device.
		 */
		smp_mb__after_atomic(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, close_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		netpoll_poll_enable(dev);
	}

	return 0;
}

static int __dev_close(struct net_device *dev)
{
	int retval;
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	retval = __dev_close_many(&single);
	list_del(&single);

	return retval;
}
int dev_close_many(struct list_head *head, bool unlink)
{
	struct net_device *dev, *tmp;

	/* Remove the devices that don't need to be closed */
	list_for_each_entry_safe(dev, tmp, head, close_list)
		if (!(dev->flags & IFF_UP))
			list_del_init(&dev->close_list);

	__dev_close_many(head);

	list_for_each_entry_safe(dev, tmp, head, close_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
		if (unlink)
			list_del_init(&dev->close_list);
	}

	return 0;
}
EXPORT_SYMBOL(dev_close_many);

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		list_add(&dev->close_list, &single);
		dev_close_many(&single, true);
		list_del(&single);
	}
	return 0;
}
EXPORT_SYMBOL(dev_close);
/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	struct net_device *lower_dev;
	struct list_head *iter;

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");

	netdev_for_each_lower_dev(dev, lower_dev, iter)
		dev_disable_lro(lower_dev);
}
EXPORT_SYMBOL(dev_disable_lro);
static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
				   struct net_device *dev)
{
	struct netdev_notifier_info info;

	netdev_notifier_info_init(&info, dev);
	return nb->notifier_call(nb, val, &info);
}

static int dev_boot_phase = 1;
/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered all registration and up events are replayed
 *	to the new notifier to allow device to have a race free
 *	view of the network device list.
 */
int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			call_netdevice_notifier(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
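
/*
 * Example (illustrative sketch): a subsystem interested in device events
 * registers a notifier_block; my_netdev_event() is a hypothetical callback.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 */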
/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked from the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	After unregistering unregister and down device events are synthesized
 *	for all devices on the device list to the removed notifier to remove
 *	the need for special case cleanup code.
 */
int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev->flags & IFF_UP) {
				call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
							dev);
				call_netdevice_notifier(nb, NETDEV_DOWN, dev);
			}
			call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
		}
	}
unlock:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);
/**
 *	call_netdevice_notifiers_info - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@info: notifier information data
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */
static int call_netdevice_notifiers_info(unsigned long val,
					 struct net_device *dev,
					 struct netdev_notifier_info *info)
{
	netdev_notifier_info_init(info, dev);
	return raw_notifier_call_chain(&netdev_chain, val, info);
}

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */
int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	struct netdev_notifier_info info;

	return call_netdevice_notifiers_info(val, dev, &info);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

/**
 *	call_netdevice_notifiers_mtu - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *	@arg: additional u32 argument passed to the notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */
static int call_netdevice_notifiers_mtu(unsigned long val,
					struct net_device *dev, u32 arg)
{
	struct netdev_notifier_info_ext info = {
		.info.dev = dev,
		.ext.mtu = arg,
	};

	BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);

	return call_netdevice_notifiers_info(val, dev, &info.info);
}
#ifdef CONFIG_NET_INGRESS
static struct static_key ingress_needed __read_mostly;

void net_inc_ingress_queue(void)
{
	static_key_slow_inc(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_inc_ingress_queue);

void net_dec_ingress_queue(void)
{
	static_key_slow_dec(&ingress_needed);
}
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
static atomic_t netstamp_needed_deferred;
static atomic_t netstamp_wanted;
static void netstamp_clear(struct work_struct *work)
{
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
	int wanted;

	wanted = atomic_add_return(deferred, &netstamp_wanted);
	if (wanted > 0)
		static_key_enable(&netstamp_needed);
	else
		static_key_disable(&netstamp_needed);
}
static DECLARE_WORK(netstamp_work, netstamp_clear);
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int wanted;

	while (1) {
		wanted = atomic_read(&netstamp_wanted);
		if (wanted <= 0)
			break;
		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
			return;
	}
	atomic_inc(&netstamp_needed_deferred);
	schedule_work(&netstamp_work);
#else
	static_key_slow_inc(&netstamp_needed);
#endif
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int wanted;

	while (1) {
		wanted = atomic_read(&netstamp_wanted);
		if (wanted <= 1)
			break;
		if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
			return;
	}
	atomic_dec(&netstamp_needed_deferred);
	schedule_work(&netstamp_work);
#else
	static_key_slow_dec(&netstamp_needed);
#endif
}
EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp.tv64 = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}

#define net_timestamp_check(COND, SKB)				\
	if (static_key_false(&netstamp_needed)) {		\
		if ((COND) && !(SKB)->tstamp.tv64)		\
			__net_timestamp(SKB);			\
	}							\
bool is_skb_forwardable(struct net_device *dev, struct sk_buff *skb)
{
	unsigned int len;

	if (!(dev->flags & IFF_UP))
		return false;

	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
	if (skb->len <= len)
		return true;

	/* if TSO is enabled, we don't care about the length as the packet
	 * could be forwarded without being segmented before
	 */
	if (skb_is_gso(skb))
		return true;

	return false;
}
EXPORT_SYMBOL_GPL(is_skb_forwardable);

int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	if (skb_orphan_frags(skb, GFP_ATOMIC) ||
	    unlikely(!is_skb_forwardable(dev, skb))) {
		atomic_long_inc(&dev->rx_dropped);
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	skb_scrub_packet(skb, true);
	skb->priority = 0;
	skb->protocol = eth_type_trans(skb, dev);
	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);

	return 0;
}
EXPORT_SYMBOL_GPL(__dev_forward_skb);
/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
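
/*
 * Example (illustrative sketch): a virtual device pair can loop a frame from
 * its ndo_start_xmit() into the peer's receive path; struct my_priv and its
 * peer pointer are hypothetical driver state.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct my_priv *priv = netdev_priv(dev);
 *
 *		if (dev_forward_skb(priv->peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */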
static inline int deliver_skb(struct sk_buff *skb,
			      struct packet_type *pt_prev,
			      struct net_device *orig_dev)
{
	if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
		return -ENOMEM;
	atomic_inc(&skb->users);
	return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
}

static inline void deliver_ptype_list_skb(struct sk_buff *skb,
					  struct packet_type **pt,
					  struct net_device *orig_dev,
					  __be16 type,
					  struct list_head *ptype_list)
{
	struct packet_type *ptype, *pt_prev = *pt;

	list_for_each_entry_rcu(ptype, ptype_list, list) {
		if (ptype->type != type)
			continue;
		if (pt_prev)
			deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}
	*pt = pt_prev;
}
static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
{
	if (!ptype->af_packet_priv || !skb->sk)
		return false;

	if (ptype->id_match)
		return ptype->id_match(ptype, skb->sk);
	else if ((struct sock *)ptype->af_packet_priv == skb->sk)
		return true;

	return false;
}
/*
 *	Support routine. Sends outgoing frames to any network
 *	taps currently in use.
 */
static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
{
	struct packet_type *ptype;
	struct sk_buff *skb2 = NULL;
	struct packet_type *pt_prev = NULL;
	struct list_head *ptype_list = &ptype_all;

	rcu_read_lock();
again:
	list_for_each_entry_rcu(ptype, ptype_list, list) {
		/* Never send packets back to the socket
		 * they originated from - MvS (miquels@drinkel.ow.org)
		 */
		if (skb_loop_sk(ptype, skb))
			continue;

		if (pt_prev) {
			deliver_skb(skb2, pt_prev, skb->dev);
			pt_prev = ptype;
			continue;
		}

		/* need to clone skb, done only once */
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2)
			goto out_unlock;

		net_timestamp_set(skb2);

		/* skb->nh should be correctly
		 * set by sender, so that the second statement is
		 * just protection against buggy protocols.
		 */
		skb_reset_mac_header(skb2);

		if (skb_network_header(skb2) < skb2->data ||
		    skb_network_header(skb2) > skb_tail_pointer(skb2)) {
			net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
					     ntohs(skb2->protocol),
					     dev->name);
			skb_reset_network_header(skb2);
		}

		skb2->transport_header = skb2->network_header;
		skb2->pkt_type = PACKET_OUTGOING;
		pt_prev = ptype;
	}

	if (ptype_list == &ptype_all) {
		ptype_list = &dev->ptype_all;
		goto again;
	}
out_unlock:
	if (pt_prev)
		pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
	rcu_read_unlock();
}
/*
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case TC0
 * is invalid nothing can be done so disable priority mappings. It is
 * expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
	int i;
	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];

	/* If TC0 is invalidated disable TC mapping */
	if (tc->offset + tc->count > txq) {
		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
		dev->num_tc = 0;
		return;
	}

	/* Invalidated prio to tc mappings set to TC0 */
	for (i = 1; i < TC_BITMASK + 1; i++) {
		int q = netdev_get_prio_tc_map(dev, i);

		tc = &dev->tc_to_txq[q];
		if (tc->offset + tc->count > txq) {
			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
				i, q);
			netdev_set_prio_tc_map(dev, i, 0);
		}
	}
}
static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P)		\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))

static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
					int cpu, u16 index)
{
	struct xps_map *map = NULL;
	int pos;

	if (dev_maps)
		map = xmap_dereference(dev_maps->cpu_map[cpu]);

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] == index) {
			if (map->len > 1) {
				map->queues[pos] = map->queues[--map->len];
			} else {
				RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
				kfree_rcu(map, rcu);
				map = NULL;
			}
			break;
		}
	}

	return map;
}
static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
{
	struct xps_dev_maps *dev_maps;
	int cpu, i;
	bool active = false;

	mutex_lock(&xps_map_mutex);
	dev_maps = xmap_dereference(dev->xps_maps);

	if (!dev_maps)
		goto out_no_maps;

	for_each_possible_cpu(cpu) {
		for (i = index; i < dev->num_tx_queues; i++) {
			if (!remove_xps_queue(dev_maps, cpu, i))
				break;
		}
		if (i == dev->num_tx_queues)
			active = true;
	}

	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

	for (i = index; i < dev->num_tx_queues; i++)
		netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
					     NUMA_NO_NODE);

out_no_maps:
	mutex_unlock(&xps_map_mutex);
}
static struct xps_map *expand_xps_map(struct xps_map *map,
				      int cpu, u16 index)
{
	struct xps_map *new_map;
	int alloc_len = XPS_MIN_MAP_ALLOC;
	int i, pos;

	for (pos = 0; map && pos < map->len; pos++) {
		if (map->queues[pos] != index)
			continue;
		return map;
	}

	/* Need to add queue to this CPU's existing map */
	if (map) {
		if (pos < map->alloc_len)
			return map;

		alloc_len = map->alloc_len * 2;
	}

	/* Need to allocate new map to store queue on this CPU's map */
	new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
			       cpu_to_node(cpu));
	if (!new_map)
		return NULL;

	for (i = 0; i < pos; i++)
		new_map->queues[i] = map->queues[i];
	new_map->alloc_len = alloc_len;
	new_map->len = pos;

	return new_map;
}
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
			u16 index)
{
	struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
	struct xps_map *map, *new_map;
	int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
	int cpu, numa_node_id = -2;
	bool active = false;

	mutex_lock(&xps_map_mutex);

	dev_maps = xmap_dereference(dev->xps_maps);

	/* allocate memory for queue storage */
	for_each_online_cpu(cpu) {
		if (!cpumask_test_cpu(cpu, mask))
			continue;

		if (!new_dev_maps)
			new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
		if (!new_dev_maps) {
			mutex_unlock(&xps_map_mutex);
			return -ENOMEM;
		}

		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;

		map = expand_xps_map(map, cpu, index);
		if (!map)
			goto error;

		RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
	}

	if (!new_dev_maps)
		goto out_no_new_maps;

	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
			/* add queue to CPU maps */
			int pos = 0;

			map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			while ((pos < map->len) && (map->queues[pos] != index))
				pos++;

			if (pos == map->len)
				map->queues[map->len++] = index;

			if (numa_node_id == -2)
				numa_node_id = cpu_to_node(cpu);
			else if (numa_node_id != cpu_to_node(cpu))
				numa_node_id = -1;
		} else if (dev_maps) {
			/* fill in the new device map from the old device map */
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
		}
	}

	rcu_assign_pointer(dev->xps_maps, new_dev_maps);

	/* Cleanup old maps */
	if (dev_maps) {
		for_each_possible_cpu(cpu) {
			new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
			map = xmap_dereference(dev_maps->cpu_map[cpu]);
			if (map && map != new_map)
				kfree_rcu(map, rcu);
		}

		kfree_rcu(dev_maps, rcu);
	}

	dev_maps = new_dev_maps;
	active = true;

out_no_new_maps:
	/* update Tx queue numa node */
	netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
				     (numa_node_id >= 0) ? numa_node_id :
				     NUMA_NO_NODE);

	if (!dev_maps)
		goto out_no_maps;

	/* removes queue from unused CPUs */
	for_each_possible_cpu(cpu) {
		if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
			continue;

		if (remove_xps_queue(dev_maps, cpu, index))
			active = true;
	}

	/* free map if not active */
	if (!active) {
		RCU_INIT_POINTER(dev->xps_maps, NULL);
		kfree_rcu(dev_maps, rcu);
	}

out_no_maps:
	mutex_unlock(&xps_map_mutex);

	return 0;
error:
	/* remove any maps that we added */
	for_each_possible_cpu(cpu) {
		new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
		map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
				 NULL;
		if (new_map && new_map != map)
			kfree(new_map);
	}

	mutex_unlock(&xps_map_mutex);

	kfree(new_dev_maps);
	return -ENOMEM;
}
EXPORT_SYMBOL(netif_set_xps_queue);
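
/*
 * Example (illustrative): a multiqueue driver can pin transmit queue i to
 * the CPU it expects to feed it, one CPU per queue:
 *
 *	cpumask_var_t mask;
 *
 *	if (zalloc_cpumask_var(&mask, GFP_KERNEL)) {
 *		cpumask_set_cpu(i, mask);
 *		netif_set_xps_queue(dev, mask, i);
 *		free_cpumask_var(mask);
 *	}
 */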
/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues stale skbs on the qdisc must be flushed.
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
	bool disabling;
	int rc;

	disabling = txq < dev->real_num_tx_queues;

	if (txq < 1 || txq > dev->num_tx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED ||
	    dev->reg_state == NETREG_UNREGISTERING) {
		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
						  txq);
		if (rc)
			return rc;

		if (dev->num_tc)
			netif_setup_tc(dev, txq);

		dev->real_num_tx_queues = txq;

		if (disabling) {
			synchronize_net();
			qdisc_reset_all_tx_gt(dev, txq);
			netif_reset_xps_queues_gt(dev, txq);
		}
	} else {
		dev->real_num_tx_queues = txq;
	}

	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
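
/*
 * Example (illustrative): a driver that negotiated fewer hardware rings
 * (e.g. after an ethtool channel change) shrinks its visible queue counts
 * under RTNL:
 *
 *	rtnl_lock();
 *	err = netif_set_real_num_tx_queues(dev, new_tx);
 *	if (!err)
 *		err = netif_set_real_num_rx_queues(dev, new_rx);
 *	rtnl_unlock();
 */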
2247 * netif_set_real_num_rx_queues - set actual number of RX queues used
2248 * @dev: Network device
2249 * @rxq: Actual number of RX queues
2251 * This must be called either with the rtnl_lock held or before
2252 * registration of the net device. Returns 0 on success, or a
2253 * negative error code. If called before registration, it always
2256 int netif_set_real_num_rx_queues(struct net_device
*dev
, unsigned int rxq
)
2260 if (rxq
< 1 || rxq
> dev
->num_rx_queues
)
2263 if (dev
->reg_state
== NETREG_REGISTERED
) {
2266 rc
= net_rx_queue_update_kobjects(dev
, dev
->real_num_rx_queues
,
2272 dev
->real_num_rx_queues
= rxq
;
2275 EXPORT_SYMBOL(netif_set_real_num_rx_queues
);

/**
 * netif_get_num_default_rss_queues - default number of RSS queues
 *
 * This routine should set an upper limit on the number of RSS queues
 * used by default by multiqueue devices.
 */
int netif_get_num_default_rss_queues(void)
{
	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);
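
/*
 * Illustrative use (an assumption, not taken from this file): drivers
 * commonly clamp their ring count to this helper plus their own hardware
 * limit, e.g.:
 *
 *	nqueues = min_t(int, netif_get_num_default_rss_queues(),
 *			FOO_HW_MAX_QUEUES);	// FOO_HW_MAX_QUEUES is hypothetical
 */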

static inline void __netif_reschedule(struct Qdisc *q)
{
	struct softnet_data *sd;
	unsigned long flags;

	local_irq_save(flags);
	sd = this_cpu_ptr(&softnet_data);
	q->next_sched = NULL;
	*sd->output_queue_tailp = q;
	sd->output_queue_tailp = &q->next_sched;
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}

void __netif_schedule(struct Qdisc *q)
{
	if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
		__netif_reschedule(q);
}
EXPORT_SYMBOL(__netif_schedule);

struct dev_kfree_skb_cb {
	enum skb_free_reason reason;
};

static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
{
	return (struct dev_kfree_skb_cb *)skb->cb;
}

void netif_schedule_queue(struct netdev_queue *txq)
{
	rcu_read_lock();
	if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
		struct Qdisc *q = rcu_dereference(txq->qdisc);

		__netif_schedule(q);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(netif_schedule_queue);

/**
 *	netif_wake_subqueue - allow sending packets on subqueue
 *	@dev: network device
 *	@queue_index: sub queue index
 *
 * Resume individual transmit queue of a device with multiple transmit queues.
 */
void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
{
	struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);

	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
		struct Qdisc *q;

		rcu_read_lock();
		q = rcu_dereference(txq->qdisc);
		__netif_schedule(q);
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(netif_wake_subqueue);

void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
	if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
		struct Qdisc *q;

		rcu_read_lock();
		q = rcu_dereference(dev_queue->qdisc);
		__netif_schedule(q);
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL(netif_tx_wake_queue);

void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
{
	unsigned long flags;

	if (likely(atomic_read(&skb->users) == 1)) {
		smp_rmb();
		atomic_set(&skb->users, 0);
	} else if (likely(!atomic_dec_and_test(&skb->users))) {
		return;
	}
	get_kfree_skb_cb(skb)->reason = reason;
	local_irq_save(flags);
	skb->next = __this_cpu_read(softnet_data.completion_queue);
	__this_cpu_write(softnet_data.completion_queue, skb);
	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__dev_kfree_skb_irq);

void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
{
	if (in_irq() || irqs_disabled())
		__dev_kfree_skb_irq(skb, reason);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(__dev_kfree_skb_any);
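
/*
 * Usage sketch (hypothetical driver code, not part of this file): a TX
 * completion handler that may run in hard-irq context frees transmitted
 * skbs with the _irq/_any variants instead of plain kfree_skb():
 *
 *	static void foo_clean_tx_irq(struct foo_ring *ring)
 *	{
 *		struct sk_buff *skb;
 *
 *		while ((skb = foo_next_completed_skb(ring)))	// hypothetical helper
 *			dev_consume_skb_irq(skb);	// thin wrapper around __dev_kfree_skb_irq()
 *	}
 */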

/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached from system and restart if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);
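
/*
 * Usage sketch (illustrative, not part of this file): drivers usually pair
 * these helpers in their suspend/resume callbacks:
 *
 *	static int foo_suspend(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		netif_device_detach(dev);
 *		// ... stop DMA, save state, power down (hardware specific) ...
 *		return 0;
 *	}
 *
 *	static int foo_resume(struct device *d)
 *	{
 *		struct net_device *dev = dev_get_drvdata(d);
 *
 *		// ... power up, restore state (hardware specific) ...
 *		netif_device_attach(dev);
 *		return 0;
 *	}
 */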

/*
 * Returns a Tx hash based on the given packet descriptor and a Tx queue
 * count to be used as a distribution range.
 */
u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
		  unsigned int num_tx_queues)
{
	u32 hash;
	u16 qoffset = 0;
	u16 qcount = num_tx_queues;

	if (skb_rx_queue_recorded(skb)) {
		hash = skb_get_rx_queue(skb);
		while (unlikely(hash >= num_tx_queues))
			hash -= num_tx_queues;
		return hash;
	}

	if (dev->num_tc) {
		u8 tc = netdev_get_prio_tc_map(dev, skb->priority);

		qoffset = dev->tc_to_txq[tc].offset;
		qcount = dev->tc_to_txq[tc].count;
	}

	return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
}
EXPORT_SYMBOL(__skb_tx_hash);

static void skb_warn_bad_offload(const struct sk_buff *skb)
{
	static const netdev_features_t null_features = 0;
	struct net_device *dev = skb->dev;
	const char *name = "";

	if (!net_ratelimit())
		return;

	if (dev) {
		if (dev->dev.parent)
			name = dev_driver_string(dev->dev.parent);
		else
			name = netdev_name(dev);
	}
	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
	     "gso_type=%d ip_summed=%d\n",
	     name, dev ? &dev->features : &null_features,
	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
	     skb_shinfo(skb)->gso_type, skb->ip_summed);
}

/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		skb_warn_bad_offload(skb);
		return -EINVAL;
	}

	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity : checksum could be wrong.
	 */
	if (skb_has_shared_frag(skb)) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}

	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
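
/*
 * Usage sketch (illustrative, not part of this file): a driver whose
 * hardware cannot checksum a particular packet can fall back to a software
 * checksum right before handing the skb to the hardware:
 *
 *	if (skb->ip_summed == CHECKSUM_PARTIAL &&
 *	    !foo_hw_can_csum(skb) &&		// hypothetical capability test
 *	    skb_checksum_help(skb))
 *		goto drop;			// checksum could not be completed
 */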

__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
	__be16 type = skb->protocol;

	/* Tunnel gso handlers can set protocol to ethernet. */
	if (type == htons(ETH_P_TEB)) {
		struct ethhdr *eth;

		if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
			return 0;

		eth = (struct ethhdr *)skb->data;
		type = eth->h_proto;
	}

	return __vlan_get_protocol(skb, type, depth);
}

/**
 *	skb_mac_gso_segment - mac layer segmentation handler.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 */
struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
				    netdev_features_t features)
{
	struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
	struct packet_offload *ptype;
	int vlan_depth = skb->mac_len;
	__be16 type = skb_network_protocol(skb, &vlan_depth);

	if (unlikely(!type))
		return ERR_PTR(-EINVAL);

	__skb_pull(skb, vlan_depth);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, &offload_base, list) {
		if (ptype->type == type && ptype->callbacks.gso_segment) {
			segs = ptype->callbacks.gso_segment(skb, features);
			break;
		}
	}
	rcu_read_unlock();

	__skb_push(skb, skb->data - skb_mac_header(skb));

	return segs;
}
EXPORT_SYMBOL(skb_mac_gso_segment);

/* openvswitch calls this on rx path, so we need a different check.
 */
static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
{
	if (tx_path)
		return skb->ip_summed != CHECKSUM_PARTIAL &&
		       skb->ip_summed != CHECKSUM_UNNECESSARY;

	return skb->ip_summed == CHECKSUM_NONE;
}

/**
 *	__skb_gso_segment - Perform segmentation on skb.
 *	@skb: buffer to segment
 *	@features: features for the output path (see dev->features)
 *	@tx_path: whether it is called in TX path
 *
 *	This function segments the given skb and returns a list of segments.
 *
 *	It may return NULL if the skb requires no segmentation.  This is
 *	only possible when GSO is used for verifying header integrity.
 *
 *	Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
 */
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
				  netdev_features_t features, bool tx_path)
{
	struct sk_buff *segs;

	if (unlikely(skb_needs_check(skb, tx_path))) {
		int err;

		/* We're going to init ->check field in TCP or UDP header */
		err = skb_cow_head(skb, 0);
		if (err < 0)
			return ERR_PTR(err);
	}

	BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
		     sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));

	SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
	SKB_GSO_CB(skb)->encap_level = 0;

	skb_reset_mac_header(skb);
	skb_reset_mac_len(skb);

	segs = skb_mac_gso_segment(skb, features);

	if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
		skb_warn_bad_offload(skb);

	return segs;
}
EXPORT_SYMBOL(__skb_gso_segment);

/* Take action when hardware reception checksum errors are detected. */
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (net_ratelimit()) {
		pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
		dump_stack();
	}
}
EXPORT_SYMBOL(netdev_rx_csum_fault);

/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 */
static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (PCI_DMA_BUS_IS_PHYS) {
		struct device *pdev = dev->dev.parent;

		if (!pdev)
			return 0;
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
			dma_addr_t addr = page_to_phys(skb_frag_page(frag));

			if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
				return 1;
		}
	}
#endif
	return 0;
}

/* If MPLS offload request, verify we are testing hardware MPLS features
 * instead of standard features for the netdev.
 */
#if IS_ENABLED(CONFIG_NET_MPLS_GSO)
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	if (eth_p_mpls(type))
		features &= skb->dev->mpls_features;

	return features;
}
#else
static netdev_features_t net_mpls_features(struct sk_buff *skb,
					   netdev_features_t features,
					   __be16 type)
{
	return features;
}
#endif

static netdev_features_t harmonize_features(struct sk_buff *skb,
					    netdev_features_t features)
{
	int tmp;
	__be16 type;

	type = skb_network_protocol(skb, &tmp);
	features = net_mpls_features(skb, features, type);

	if (skb->ip_summed != CHECKSUM_NONE &&
	    !can_checksum_protocol(features, type)) {
		features &= ~NETIF_F_ALL_CSUM;
	}
	if (illegal_highdma(skb->dev, skb))
		features &= ~NETIF_F_SG;

	return features;
}

netdev_features_t passthru_features_check(struct sk_buff *skb,
					  struct net_device *dev,
					  netdev_features_t features)
{
	return features;
}
EXPORT_SYMBOL(passthru_features_check);

static netdev_features_t dflt_features_check(struct sk_buff *skb,
					     struct net_device *dev,
					     netdev_features_t features)
{
	return vlan_features_check(skb, features);
}

netdev_features_t netif_skb_features(struct sk_buff *skb)
{
	struct net_device *dev = skb->dev;
	netdev_features_t features = dev->features;
	u16 gso_segs = skb_shinfo(skb)->gso_segs;

	if (gso_segs > dev->gso_max_segs || gso_segs < dev->gso_min_segs)
		features &= ~NETIF_F_GSO_MASK;

	/* If encapsulation offload request, verify we are testing
	 * hardware encapsulation features instead of standard
	 * features for the netdev
	 */
	if (skb->encapsulation)
		features &= dev->hw_enc_features;

	if (skb_vlan_tagged(skb))
		features = netdev_intersect_features(features,
						     dev->vlan_features |
						     NETIF_F_HW_VLAN_CTAG_TX |
						     NETIF_F_HW_VLAN_STAG_TX);

	if (dev->netdev_ops->ndo_features_check)
		features &= dev->netdev_ops->ndo_features_check(skb, dev,
								features);
	else
		features &= dflt_features_check(skb, dev, features);

	return harmonize_features(skb, features);
}
EXPORT_SYMBOL(netif_skb_features);

static int xmit_one(struct sk_buff *skb, struct net_device *dev,
		    struct netdev_queue *txq, bool more)
{
	unsigned int len;
	int rc;

	if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
		dev_queue_xmit_nit(skb, dev);

	len = skb->len;
	trace_net_dev_start_xmit(skb, dev);
	rc = netdev_start_xmit(skb, dev, txq, more);
	trace_net_dev_xmit(skb, rc, dev, len);

	return rc;
}

struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
				    struct netdev_queue *txq, int *ret)
{
	struct sk_buff *skb = first;
	int rc = NETDEV_TX_OK;

	while (skb) {
		struct sk_buff *next = skb->next;

		skb->next = NULL;
		rc = xmit_one(skb, dev, txq, next != NULL);
		if (unlikely(!dev_xmit_complete(rc))) {
			skb->next = next;
			goto out;
		}

		skb = next;
		if (netif_xmit_stopped(txq) && skb) {
			rc = NETDEV_TX_BUSY;
			break;
		}
	}

out:
	*ret = rc;
	return skb;
}

static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
					  netdev_features_t features)
{
	if (skb_vlan_tag_present(skb) &&
	    !vlan_hw_offload_capable(features, skb->vlan_proto))
		skb = __vlan_hwaccel_push_inside(skb);
	return skb;
}

static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
{
	netdev_features_t features;

	features = netif_skb_features(skb);
	skb = validate_xmit_vlan(skb, features);
	if (unlikely(!skb))
		goto out_null;

	if (netif_needs_gso(skb, features)) {
		struct sk_buff *segs;

		segs = skb_gso_segment(skb, features);
		if (IS_ERR(segs)) {
			goto out_kfree_skb;
		} else if (segs) {
			consume_skb(skb);
			skb = segs;
		}
	} else {
		if (skb_needs_linearize(skb, features) &&
		    __skb_linearize(skb))
			goto out_kfree_skb;

		/* If packet is not checksummed and device does not
		 * support checksumming for this protocol, complete
		 * checksumming here.
		 */
		if (skb->ip_summed == CHECKSUM_PARTIAL) {
			if (skb->encapsulation)
				skb_set_inner_transport_header(skb,
							       skb_checksum_start_offset(skb));
			else
				skb_set_transport_header(skb,
							 skb_checksum_start_offset(skb));
			if (!(features & NETIF_F_ALL_CSUM) &&
			    skb_checksum_help(skb))
				goto out_kfree_skb;
		}
	}

	return skb;

out_kfree_skb:
	kfree_skb(skb);
out_null:
	return NULL;
}

struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
{
	struct sk_buff *next, *head = NULL, *tail;

	for (; skb != NULL; skb = next) {
		next = skb->next;
		skb->next = NULL;

		/* in case skb wont be segmented, point to itself */
		skb->prev = skb;

		skb = validate_xmit_skb(skb, dev);
		if (!skb)
			continue;

		if (!head)
			head = skb;
		else
			tail->next = skb;
		/* If skb was segmented, skb->prev points to
		 * the last segment. If not, it still contains skb.
		 */
		tail = skb->prev;
	}
	return head;
}
EXPORT_SYMBOL_GPL(validate_xmit_skb_list);

static void qdisc_pkt_len_init(struct sk_buff *skb)
{
	const struct skb_shared_info *shinfo = skb_shinfo(skb);

	qdisc_skb_cb(skb)->pkt_len = skb->len;

	/* To get more precise estimation of bytes sent on wire,
	 * we add to pkt_len the headers size of all segments
	 */
	if (shinfo->gso_size) {
		unsigned int hdr_len;
		u16 gso_segs = shinfo->gso_segs;

		/* mac layer + network layer */
		hdr_len = skb_transport_header(skb) - skb_mac_header(skb);

		/* + transport layer */
		if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
			const struct tcphdr *th;
			struct tcphdr _tcphdr;

			th = skb_header_pointer(skb, skb_transport_offset(skb),
						sizeof(_tcphdr), &_tcphdr);
			if (likely(th))
				hdr_len += __tcp_hdrlen(th);
		} else {
			struct udphdr _udphdr;

			if (skb_header_pointer(skb, skb_transport_offset(skb),
					       sizeof(_udphdr), &_udphdr))
				hdr_len += sizeof(struct udphdr);
		}

		if (shinfo->gso_type & SKB_GSO_DODGY)
			gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
						shinfo->gso_size);

		qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
	}
}

static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
				 struct net_device *dev,
				 struct netdev_queue *txq)
{
	spinlock_t *root_lock = qdisc_lock(q);
	bool contended;
	int rc;

	qdisc_pkt_len_init(skb);
	qdisc_calculate_pkt_len(skb, q);
	/*
	 * Heuristic to force contended enqueues to serialize on a
	 * separate lock before trying to get qdisc main lock.
	 * This permits __QDISC___STATE_RUNNING owner to get the lock more
	 * often and dequeue packets faster.
	 */
	contended = qdisc_is_running(q);
	if (unlikely(contended))
		spin_lock(&q->busylock);

	spin_lock(root_lock);
	if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
		kfree_skb(skb);
		rc = NET_XMIT_DROP;
	} else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
		   qdisc_run_begin(q)) {
		/*
		 * This is a work-conserving queue; there are no old skbs
		 * waiting to be sent out; and the qdisc is not running -
		 * xmit the skb directly.
		 */

		qdisc_bstats_update(q, skb);

		if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		} else
			qdisc_run_end(q);

		rc = NET_XMIT_SUCCESS;
	} else {
		rc = q->enqueue(skb, q) & NET_XMIT_MASK;
		if (qdisc_run_begin(q)) {
			if (unlikely(contended)) {
				spin_unlock(&q->busylock);
				contended = false;
			}
			__qdisc_run(q);
		}
	}
	spin_unlock(root_lock);
	if (unlikely(contended))
		spin_unlock(&q->busylock);
	return rc;
}

#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
static void skb_update_prio(struct sk_buff *skb)
{
	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);

	if (!skb->priority && skb->sk && map) {
		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;

		if (prioidx < map->priomap_len)
			skb->priority = map->priomap[prioidx];
	}
}
#else
#define skb_update_prio(skb)
#endif

DEFINE_PER_CPU(int, xmit_recursion);
EXPORT_SYMBOL(xmit_recursion);

#define RECURSION_LIMIT 10

/**
 * dev_loopback_xmit - loop back @skb
 * @net: network namespace this loopback is happening in
 * @sk:  sk needed to be a netfilter okfn
 * @skb: buffer to transmit
 */
int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb_reset_mac_header(skb);
	__skb_pull(skb, skb_network_offset(skb));
	skb->pkt_type = PACKET_LOOPBACK;
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(skb));
	skb_dst_force(skb);
	netif_rx_ni(skb);
	return 0;
}
EXPORT_SYMBOL(dev_loopback_xmit);

static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
	struct xps_dev_maps *dev_maps;
	struct xps_map *map;
	int queue_index = -1;

	rcu_read_lock();
	dev_maps = rcu_dereference(dev->xps_maps);
	if (dev_maps) {
		map = rcu_dereference(
		    dev_maps->cpu_map[skb->sender_cpu - 1]);
		if (map) {
			if (map->len == 1)
				queue_index = map->queues[0];
			else
				queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
									   map->len)];
			if (unlikely(queue_index >= dev->real_num_tx_queues))
				queue_index = -1;
		}
	}
	rcu_read_unlock();

	return queue_index;
#else
	return -1;
#endif
}

static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	int queue_index = sk_tx_queue_get(sk);

	if (queue_index < 0 || skb->ooo_okay ||
	    queue_index >= dev->real_num_tx_queues) {
		int new_index = get_xps_queue(dev, skb);

		if (new_index < 0)
			new_index = skb_tx_hash(dev, skb);

		if (queue_index != new_index && sk &&
		    sk_fullsock(sk) &&
		    rcu_access_pointer(sk->sk_dst_cache))
			sk_tx_queue_set(sk, new_index);

		queue_index = new_index;
	}

	return queue_index;
}

struct netdev_queue *netdev_pick_tx(struct net_device *dev,
				    struct sk_buff *skb,
				    void *accel_priv)
{
	int queue_index = 0;

#ifdef CONFIG_XPS
	u32 sender_cpu = skb->sender_cpu - 1;

	if (sender_cpu >= (u32)NR_CPUS)
		skb->sender_cpu = raw_smp_processor_id() + 1;
#endif

	if (dev->real_num_tx_queues != 1) {
		const struct net_device_ops *ops = dev->netdev_ops;

		if (ops->ndo_select_queue)
			queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
							    __netdev_pick_tx);
		else
			queue_index = __netdev_pick_tx(dev, skb);

		if (!accel_priv)
			queue_index = netdev_cap_txqueue(dev, queue_index);
	}

	skb_set_queue_mapping(skb, queue_index);
	return netdev_get_tx_queue(dev, queue_index);
}

/**
 *	__dev_queue_xmit - transmit a buffer
 *	@skb: buffer to transmit
 *	@accel_priv: private data used for L2 forwarding offload
 *
 *	Queue a buffer for transmission to a network device. The caller must
 *	have set the device and priority and built the buffer before calling
 *	this function. The function can be called from an interrupt.
 *
 *	A negative errno code is returned on a failure. A success does not
 *	guarantee the frame will be transmitted as it may be dropped due
 *	to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 */
static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
{
	struct net_device *dev = skb->dev;
	struct netdev_queue *txq;
	struct Qdisc *q;
	int rc = -ENOMEM;

	skb_reset_mac_header(skb);

	if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
		__skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);

	/* Disable soft irqs for various locks below. Also
	 * stops preemption for RCU.
	 */
	rcu_read_lock_bh();

	skb_update_prio(skb);

	/* If device/qdisc don't need skb->dst, release it right now while
	 * its hot in this cpu cache.
	 */
	if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
		skb_dst_drop(skb);
	else
		skb_dst_force(skb);

#ifdef CONFIG_NET_SWITCHDEV
	/* Don't forward if offload device already forwarded */
	if (skb->offload_fwd_mark &&
	    skb->offload_fwd_mark == dev->offload_fwd_mark) {
		consume_skb(skb);
		rc = NET_XMIT_SUCCESS;
		goto out;
	}
#endif

	txq = netdev_pick_tx(dev, skb, accel_priv);
	q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
	trace_net_dev_queue(skb);
	if (q->enqueue) {
		rc = __dev_xmit_skb(skb, q, dev, txq);
		goto out;
	}

	/* The device has no queue. Common case for software devices:
	   loopback, all the sorts of tunnels...

	   Really, it is unlikely that netif_tx_lock protection is necessary
	   here.  (f.e. loopback and IP tunnels are clean ignoring statistics
	   counters.)
	   However, it is possible, that they rely on protection
	   made by us here.

	   Check this and shot the lock. It is not prone from deadlocks.
	   Either shot noqueue qdisc, it is even simpler 8)
	 */
	if (dev->flags & IFF_UP) {
		int cpu = smp_processor_id(); /* ok because BHs are off */

		if (txq->xmit_lock_owner != cpu) {

			if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
				goto recursion_alert;

			skb = validate_xmit_skb(skb, dev);
			if (!skb)
				goto out;

			HARD_TX_LOCK(dev, txq, cpu);

			if (!netif_xmit_stopped(txq)) {
				__this_cpu_inc(xmit_recursion);
				skb = dev_hard_start_xmit(skb, dev, txq, &rc);
				__this_cpu_dec(xmit_recursion);
				if (dev_xmit_complete(rc)) {
					HARD_TX_UNLOCK(dev, txq);
					goto out;
				}
			}
			HARD_TX_UNLOCK(dev, txq);
			net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
					     dev->name);
		} else {
			/* Recursion is detected! It is possible,
			 * unfortunately
			 */
recursion_alert:
			net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
					     dev->name);
		}
	}

	rc = -ENETDOWN;
	rcu_read_unlock_bh();

	atomic_long_inc(&dev->tx_dropped);
	kfree_skb_list(skb);
	return rc;
out:
	rcu_read_unlock_bh();
	return rc;
}

int dev_queue_xmit(struct sk_buff *skb)
{
	return __dev_queue_xmit(skb, NULL);
}
EXPORT_SYMBOL(dev_queue_xmit);

int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
{
	return __dev_queue_xmit(skb, accel_priv);
}
EXPORT_SYMBOL(dev_queue_xmit_accel);
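
/*
 * Usage sketch (illustrative, not part of this file): a kernel component
 * that builds its own frames sets skb->dev and the link-layer header and
 * then hands the buffer to the qdisc layer. Interrupts must be enabled;
 * the skb is always consumed, whatever the return value. dest_hw is a
 * hypothetical destination MAC address.
 *
 *	skb->dev = dev;
 *	skb_reset_network_header(skb);
 *	if (dev_hard_header(skb, dev, ETH_P_IP, dest_hw, NULL, skb->len) < 0)
 *		goto free_skb;
 *	dev_queue_xmit(skb);	// may also return positive NET_XMIT_* codes
 */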

/*=======================================================================
			Receiver routines
  =======================================================================*/

int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

int netdev_tstamp_prequeue __read_mostly = 1;
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */

/* Called with irq disabled */
static inline void ____napi_schedule(struct softnet_data *sd,
				     struct napi_struct *napi)
{
	list_add_tail(&napi->poll_list, &sd->poll_list);
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
}

#ifdef CONFIG_RPS

/* One global table that all flow-based protocols share. */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
u32 rps_cpu_mask __read_mostly;
EXPORT_SYMBOL(rps_cpu_mask);

struct static_key rps_needed __read_mostly;

static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
	    struct rps_dev_flow *rflow, u16 next_cpu)
{
	if (next_cpu < nr_cpu_ids) {
#ifdef CONFIG_RFS_ACCEL
		struct netdev_rx_queue *rxqueue;
		struct rps_dev_flow_table *flow_table;
		struct rps_dev_flow *old_rflow;
		u32 flow_id;
		u16 rxq_index;
		int rc;

		/* Should we steer this flow to a different hardware queue? */
		if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
		    !(dev->features & NETIF_F_NTUPLE))
			goto out;
		rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
		if (rxq_index == skb_get_rx_queue(skb))
			goto out;

		rxqueue = dev->_rx + rxq_index;
		flow_table = rcu_dereference(rxqueue->rps_flow_table);
		if (!flow_table)
			goto out;
		flow_id = skb_get_hash(skb) & flow_table->mask;
		rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
							rxq_index, flow_id);
		if (rc < 0)
			goto out;
		old_rflow = rflow;
		rflow = &flow_table->flows[flow_id];
		rflow->filter = rc;
		if (old_rflow->filter == rflow->filter)
			old_rflow->filter = RPS_NO_FILTER;
	out:
#endif
		rflow->last_qtail =
			per_cpu(softnet_data, next_cpu).input_queue_head;
	}

	rflow->cpu = next_cpu;
	return rflow;
}

/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
		       struct rps_dev_flow **rflowp)
{
	const struct rps_sock_flow_table *sock_flow_table;
	struct netdev_rx_queue *rxqueue = dev->_rx;
	struct rps_dev_flow_table *flow_table;
	struct rps_map *map;
	int cpu = -1;
	u32 tcpu;
	u32 hash;

	if (skb_rx_queue_recorded(skb)) {
		u16 index = skb_get_rx_queue(skb);

		if (unlikely(index >= dev->real_num_rx_queues)) {
			WARN_ONCE(dev->real_num_rx_queues > 1,
				  "%s received packet on queue %u, but number "
				  "of RX queues is %u\n",
				  dev->name, index, dev->real_num_rx_queues);
			goto done;
		}
		rxqueue += index;
	}

	/* Avoid computing hash if RFS/RPS is not active for this rxqueue */

	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	map = rcu_dereference(rxqueue->rps_map);
	if (!flow_table && !map)
		goto done;

	skb_reset_network_header(skb);
	hash = skb_get_hash(skb);
	if (!hash)
		goto done;

	sock_flow_table = rcu_dereference(rps_sock_flow_table);
	if (flow_table && sock_flow_table) {
		struct rps_dev_flow *rflow;
		u32 next_cpu;
		u32 ident;

		/* First check into global flow table if there is a match */
		ident = sock_flow_table->ents[hash & sock_flow_table->mask];
		if ((ident ^ hash) & ~rps_cpu_mask)
			goto try_rps;

		next_cpu = ident & rps_cpu_mask;

		/* OK, now we know there is a match,
		 * we can look at the local (per receive queue) flow table
		 */
		rflow = &flow_table->flows[hash & flow_table->mask];
		tcpu = rflow->cpu;

		/*
		 * If the desired CPU (where last recvmsg was done) is
		 * different from current CPU (one in the rx-queue flow
		 * table entry), switch if one of the following holds:
		 *   - Current CPU is unset (>= nr_cpu_ids).
		 *   - Current CPU is offline.
		 *   - The current CPU's queue tail has advanced beyond the
		 *     last packet that was enqueued using this table entry.
		 *     This guarantees that all previous packets for the flow
		 *     have been dequeued, thus preserving in order delivery.
		 */
		if (unlikely(tcpu != next_cpu) &&
		    (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
		     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
		      rflow->last_qtail)) >= 0)) {
			tcpu = next_cpu;
			rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
		}

		if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
			*rflowp = rflow;
			cpu = tcpu;
			goto done;
		}
	}

try_rps:

	if (map) {
		tcpu = map->cpus[reciprocal_scale(hash, map->len)];
		if (cpu_online(tcpu)) {
			cpu = tcpu;
			goto done;
		}
	}

done:
	return cpu;
}

#ifdef CONFIG_RFS_ACCEL

/**
 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
 * @dev: Device on which the filter was set
 * @rxq_index: RX queue index
 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
 *
 * Drivers that implement ndo_rx_flow_steer() should periodically call
 * this function for each installed filter and remove the filters for
 * which it returns %true.
 */
bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
			 u32 flow_id, u16 filter_id)
{
	struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
	struct rps_dev_flow_table *flow_table;
	struct rps_dev_flow *rflow;
	bool expire = true;
	unsigned int cpu;

	rcu_read_lock();
	flow_table = rcu_dereference(rxqueue->rps_flow_table);
	if (flow_table && flow_id <= flow_table->mask) {
		rflow = &flow_table->flows[flow_id];
		cpu = ACCESS_ONCE(rflow->cpu);
		if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
		    ((int)(per_cpu(softnet_data, cpu).input_queue_head -
			   rflow->last_qtail) <
		     (int)(10 * flow_table->mask)))
			expire = false;
	}
	rcu_read_unlock();
	return expire;
}
EXPORT_SYMBOL(rps_may_expire_flow);

#endif /* CONFIG_RFS_ACCEL */

/* Called from hardirq (IPI) context */
static void rps_trigger_softirq(void *data)
{
	struct softnet_data *sd = data;

	____napi_schedule(sd, &sd->backlog);
	sd->received_rps++;
}

#endif /* CONFIG_RPS */

/*
 * Check if this softnet_data structure is another cpu one
 * If yes, queue it to our IPI list and return 1
 * If no, return 0
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *mysd = this_cpu_ptr(&softnet_data);

	if (sd != mysd) {
		sd->rps_ipi_next = mysd->rps_ipi_list;
		mysd->rps_ipi_list = sd;

		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
		return 1;
	}
#endif /* CONFIG_RPS */
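	return 0;
}

/*
 * Usage sketch (illustrative, not part of this file): a driver implementing
 * ndo_rx_flow_steer() would periodically walk its installed filters and ask
 * the stack whether each one may be dropped. The foo_* names and priv layout
 * are hypothetical.
 *
 *	for (i = 0; i < FOO_MAX_FILTERS; i++) {
 *		struct foo_filter *f = &priv->filters[i];
 *
 *		if (f->installed &&
 *		    rps_may_expire_flow(priv->netdev, f->rxq_index,
 *					f->flow_id, i))
 *			foo_remove_hw_filter(priv, i);
 *	}
 */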

#ifdef CONFIG_NET_FLOW_LIMIT
int netdev_flow_limit_table_len __read_mostly = (1 << 12);
#endif

static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
{
#ifdef CONFIG_NET_FLOW_LIMIT
	struct sd_flow_limit *fl;
	struct softnet_data *sd;
	unsigned int old_flow, new_flow;

	if (qlen < (netdev_max_backlog >> 1))
		return false;

	sd = this_cpu_ptr(&softnet_data);

	rcu_read_lock();
	fl = rcu_dereference(sd->flow_limit);
	if (fl) {
		new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
		old_flow = fl->history[fl->history_head];
		fl->history[fl->history_head] = new_flow;

		fl->history_head++;
		fl->history_head &= FLOW_LIMIT_HISTORY - 1;

		if (likely(fl->buckets[old_flow]))
			fl->buckets[old_flow]--;

		if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
			fl->count++;
			rcu_read_unlock();
			return true;
		}
	}
	rcu_read_unlock();
#endif
	return false;
}

/*
 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
 * queue (may be a remote CPU queue).
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	struct softnet_data *sd;
	unsigned long flags;
	unsigned int qlen;

	sd = &per_cpu(softnet_data, cpu);

	local_irq_save(flags);

	rps_lock(sd);
	if (!netif_running(skb->dev))
		goto drop;
	qlen = skb_queue_len(&sd->input_pkt_queue);
	if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
		if (qlen) {
enqueue:
			__skb_queue_tail(&sd->input_pkt_queue, skb);
			input_queue_tail_incr_save(sd, qtail);
			rps_unlock(sd);
			local_irq_restore(flags);
			return NET_RX_SUCCESS;
		}

		/* Schedule NAPI for backlog device
		 * We can use non atomic operation since we own the queue lock
		 */
		if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
			if (!rps_ipi_queued(sd))
				____napi_schedule(sd, &sd->backlog);
		}
		goto enqueue;
	}

drop:
	sd->dropped++;
	rps_unlock(sd);

	local_irq_restore(flags);

	atomic_long_inc(&skb->dev->rx_dropped);
	kfree_skb(skb);
	return NET_RX_DROP;
}

static int netif_rx_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	trace_netif_rx(skb);
#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		preempt_disable();
		rcu_read_lock();

		cpu = get_rps_cpu(skb->dev, skb, &rflow);
		if (cpu < 0)
			cpu = smp_processor_id();

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
		preempt_enable();
	} else
#endif
	{
		unsigned int qtail;

		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}

/**
 *	netif_rx	-	post buffer to the network code
 *	@skb: buffer to post
 *
 *	This function receives a packet from a device driver and queues it for
 *	the upper (protocol) levels to process.  It always succeeds. The buffer
 *	may be dropped during processing for congestion control or by the
 *	protocol layers.
 *
 *	return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped)
 */
int netif_rx(struct sk_buff *skb)
{
	trace_netif_rx_entry(skb);

	return netif_rx_internal(skb);
}
EXPORT_SYMBOL(netif_rx);
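
/*
 * Usage sketch (illustrative, not part of this file): a non-NAPI driver's
 * receive interrupt handler typically builds an skb for each received frame
 * and posts it with netif_rx(). rx_buf and pkt_len are hypothetical.
 *
 *	skb = netdev_alloc_skb_ip_align(dev, pkt_len);
 *	if (!skb)
 *		return;		// frame is dropped, counters updated elsewhere
 *	memcpy(skb_put(skb, pkt_len), rx_buf, pkt_len);
 *	skb->protocol = eth_type_trans(skb, dev);
 *	netif_rx(skb);
 */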

int netif_rx_ni(struct sk_buff *skb)
{
	int err;

	trace_netif_rx_ni_entry(skb);

	preempt_disable();
	err = netif_rx_internal(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return err;
}
EXPORT_SYMBOL(netif_rx_ni);

static void net_tx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);

	if (sd->completion_queue) {
		struct sk_buff *clist;

		local_irq_disable();
		clist = sd->completion_queue;
		sd->completion_queue = NULL;
		local_irq_enable();

		while (clist) {
			struct sk_buff *skb = clist;

			clist = clist->next;

			WARN_ON(atomic_read(&skb->users));
			if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
				trace_consume_skb(skb);
			else
				trace_kfree_skb(skb, net_tx_action);
			__kfree_skb(skb);
		}
	}

	if (sd->output_queue) {
		struct Qdisc *head;

		local_irq_disable();
		head = sd->output_queue;
		sd->output_queue = NULL;
		sd->output_queue_tailp = &sd->output_queue;
		local_irq_enable();

		while (head) {
			struct Qdisc *q = head;
			spinlock_t *root_lock;

			head = head->next_sched;

			root_lock = qdisc_lock(q);
			if (spin_trylock(root_lock)) {
				smp_mb__before_atomic();
				clear_bit(__QDISC_STATE_SCHED,
					  &q->state);
				qdisc_run(q);
				spin_unlock(root_lock);
			} else {
				if (!test_bit(__QDISC_STATE_DEACTIVATED,
					      &q->state)) {
					__netif_reschedule(q);
				} else {
					smp_mb__before_atomic();
					clear_bit(__QDISC_STATE_SCHED,
						  &q->state);
				}
			}
		}
	}
}

#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
			     unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif

static inline struct sk_buff *handle_ing(struct sk_buff *skb,
					 struct packet_type **pt_prev,
					 int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
	struct tcf_result cl_res;

	/* If there's at least one ingress present somewhere (so
	 * we get here via enabled static key), remaining devices
	 * that are not configured with an ingress qdisc will bail
	 * out here.
	 */
	if (!cl)
		return skb;
	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	qdisc_skb_cb(skb)->pkt_len = skb->len;
	skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
	qdisc_bstats_cpu_update(cl->q, skb);

	switch (tc_classify(skb, cl, &cl_res, false)) {
	case TC_ACT_OK:
	case TC_ACT_RECLASSIFY:
		skb->tc_index = TC_H_MIN(cl_res.classid);
		break;
	case TC_ACT_SHOT:
		qdisc_qstats_cpu_drop(cl->q);
	case TC_ACT_STOLEN:
	case TC_ACT_QUEUED:
		kfree_skb(skb);
		return NULL;
	case TC_ACT_REDIRECT:
		/* skb_mac_header check was done by cls/act_bpf, so
		 * we can safely push the L2 header back before
		 * redirecting to another netdev
		 */
		__skb_push(skb, skb->mac_len);
		skb_do_redirect(skb);
		return NULL;
	default:
		break;
	}
#endif /* CONFIG_NET_CLS_ACT */
	return skb;
}

/**
 *	netdev_is_rx_handler_busy - check if receive handler is registered
 *	@dev: device to check
 *
 *	Check if a receive handler is already registered for a given device.
 *	Return true if there is one.
 *
 *	The caller must hold the rtnl_mutex.
 */
bool netdev_is_rx_handler_busy(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev && rtnl_dereference(dev->rx_handler);
}
EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);

/**
 *	netdev_rx_handler_register - register receive handler
 *	@dev: device to register a handler for
 *	@rx_handler: receive handler to register
 *	@rx_handler_data: data pointer that is used by rx handler
 *
 *	Register a receive handler for a device. This handler will then be
 *	called from __netif_receive_skb. A negative errno code is returned
 *	on a failure.
 *
 *	The caller must hold the rtnl_mutex.
 *
 *	For a general description of rx_handler, see enum rx_handler_result.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	ASSERT_RTNL();

	if (dev->rx_handler)
		return -EBUSY;

	/* Note: rx_handler_data must be set before rx_handler */
	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
	rcu_assign_pointer(dev->rx_handler, rx_handler);

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);

/**
 *	netdev_rx_handler_unregister - unregister receive handler
 *	@dev: device to unregister a handler from
 *
 *	Unregister a receive handler from a device.
 *
 *	The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{
	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	/* a reader seeing a non NULL rx_handler in a rcu_read_lock()
	 * section has a guarantee to see a non NULL rx_handler_data
	 * as well.
	 */
	synchronize_net();
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
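
/*
 * Usage sketch (illustrative, not part of this file): stacking drivers such
 * as bridging or bonding claim a lower device's receive path under rtnl_lock.
 * foo_handle_frame and foo_port are hypothetical.
 *
 *	rtnl_lock();
 *	if (!netdev_is_rx_handler_busy(lower_dev))
 *		err = netdev_rx_handler_register(lower_dev,
 *						 foo_handle_frame,
 *						 foo_port);
 *	rtnl_unlock();
 *
 *	// and on teardown:
 *	rtnl_lock();
 *	netdev_rx_handler_unregister(lower_dev);
 *	rtnl_unlock();
 */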

/*
 * Limit the use of PFMEMALLOC reserves to those protocols that implement
 * the special handling of PFMEMALLOC skbs.
 */
static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
{
	switch (skb->protocol) {
	case htons(ETH_P_ARP):
	case htons(ETH_P_IP):
	case htons(ETH_P_IPV6):
	case htons(ETH_P_8021Q):
	case htons(ETH_P_8021AD):
		return true;
	default:
		return false;
	}
}

static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
			     int *ret, struct net_device *orig_dev)
{
#ifdef CONFIG_NETFILTER_INGRESS
	if (nf_hook_ingress_active(skb)) {
		if (*pt_prev) {
			*ret = deliver_skb(skb, *pt_prev, orig_dev);
			*pt_prev = NULL;
		}

		return nf_hook_ingress(skb);
	}
#endif /* CONFIG_NETFILTER_INGRESS */
	return 0;
}

static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
{
	struct packet_type *ptype, *pt_prev;
	rx_handler_func_t *rx_handler;
	struct net_device *orig_dev;
	bool deliver_exact = false;
	int ret = NET_RX_DROP;
	__be16 type;

	net_timestamp_check(!netdev_tstamp_prequeue, skb);

	trace_netif_receive_skb(skb);

	orig_dev = skb->dev;

	skb_reset_network_header(skb);
	if (!skb_transport_header_was_set(skb))
		skb_reset_transport_header(skb);
	skb_reset_mac_len(skb);

	pt_prev = NULL;

another_round:
	skb->skb_iif = skb->dev->ifindex;

	__this_cpu_inc(softnet_data.processed);

	if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
	    skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
		skb = skb_vlan_untag(skb);
		if (unlikely(!skb))
			goto out;
	}

#ifdef CONFIG_NET_CLS_ACT
	if (skb->tc_verd & TC_NCLS) {
		skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
		goto ncls;
	}
#endif

	if (pfmemalloc)
		goto skip_taps;

	list_for_each_entry_rcu(ptype, &ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

	list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
		if (pt_prev)
			ret = deliver_skb(skb, pt_prev, orig_dev);
		pt_prev = ptype;
	}

skip_taps:
#ifdef CONFIG_NET_INGRESS
	if (static_key_false(&ingress_needed)) {
		skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
		if (!skb)
			goto out;

		if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
			goto out;
	}
#endif
#ifdef CONFIG_NET_CLS_ACT
	skb->tc_verd = 0;
ncls:
#endif
	if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
		goto drop;

	if (skb_vlan_tag_present(skb)) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		if (vlan_do_receive(&skb))
			goto another_round;
		else if (unlikely(!skb))
			goto out;
	}

	rx_handler = rcu_dereference(skb->dev->rx_handler);
	if (rx_handler) {
		if (pt_prev) {
			ret = deliver_skb(skb, pt_prev, orig_dev);
			pt_prev = NULL;
		}
		switch (rx_handler(&skb)) {
		case RX_HANDLER_CONSUMED:
			ret = NET_RX_SUCCESS;
			goto out;
		case RX_HANDLER_ANOTHER:
			goto another_round;
		case RX_HANDLER_EXACT:
			deliver_exact = true;
		case RX_HANDLER_PASS:
			break;
		default:
			BUG();
		}
	}

	if (unlikely(skb_vlan_tag_present(skb))) {
		if (skb_vlan_tag_get_id(skb))
			skb->pkt_type = PACKET_OTHERHOST;
		/* Note: we might in the future use prio bits
		 * and set skb->priority like in vlan_do_receive()
		 * For the time being, just ignore Priority Code Point
		 */
		skb->vlan_tci = 0;
	}

	type = skb->protocol;

	/* deliver only exact match when indicated */
	if (likely(!deliver_exact)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &ptype_base[ntohs(type) &
						   PTYPE_HASH_MASK]);
	}

	deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
			       &orig_dev->ptype_specific);

	if (unlikely(skb->dev != orig_dev)) {
		deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
				       &skb->dev->ptype_specific);
	}

	if (pt_prev) {
		if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
			goto drop;
		else
			ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
	} else {
drop:
		atomic_long_inc(&skb->dev->rx_dropped);
		kfree_skb(skb);
		/* Jamal, now you will not able to escape explaining
		 * me how you were going to use this. :-)
		 */
		ret = NET_RX_DROP;
	}

out:
	return ret;
}

static int __netif_receive_skb(struct sk_buff *skb)
{
	int ret;

	if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
		unsigned long pflags = current->flags;

		/*
		 * PFMEMALLOC skbs are special, they should
		 * - be delivered to SOCK_MEMALLOC sockets only
		 * - stay away from userspace
		 * - have bounded memory usage
		 *
		 * Use PF_MEMALLOC as this saves us from propagating the allocation
		 * context down to all allocation sites.
		 */
		current->flags |= PF_MEMALLOC;
		ret = __netif_receive_skb_core(skb, true);
		tsk_restore_flags(current, pflags, PF_MEMALLOC);
	} else
		ret = __netif_receive_skb_core(skb, false);

	return ret;
}

static int netif_receive_skb_internal(struct sk_buff *skb)
{
	int ret;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	if (skb_defer_rx_timestamp(skb))
		return NET_RX_SUCCESS;

	rcu_read_lock();

#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu = get_rps_cpu(skb->dev, skb, &rflow);

		if (cpu >= 0) {
			ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
			rcu_read_unlock();
			return ret;
		}
	}
#endif
	ret = __netif_receive_skb(skb);
	rcu_read_unlock();
	return ret;
}

/**
 *	netif_receive_skb - process receive buffer from network
 *	@skb: buffer to process
 *
 *	netif_receive_skb() is the main receive data processing function.
 *	It always succeeds. The buffer may be dropped during processing
 *	for congestion control or by the protocol layers.
 *
 *	This function may only be called from softirq context and interrupts
 *	should be enabled.
 *
 *	Return values (usually ignored):
 *	NET_RX_SUCCESS: no congestion
 *	NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
	trace_netif_receive_skb_entry(skb);

	return netif_receive_skb_internal(skb);
}
EXPORT_SYMBOL(netif_receive_skb);

/* Network device is going away, flush any packets still pending
 * Called with irqs disabled.
 */
static void flush_backlog(void *arg)
{
	struct net_device *dev = arg;
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	struct sk_buff *skb, *tmp;

	rps_lock(sd);
	skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
		if (skb->dev == dev) {
			__skb_unlink(skb, &sd->input_pkt_queue);
			kfree_skb(skb);
			input_queue_head_incr(sd);
		}
	}
	rps_unlock(sd);

	skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
		if (skb->dev == dev) {
			__skb_unlink(skb, &sd->process_queue);
			kfree_skb(skb);
			input_queue_head_incr(sd);
		}
	}
}

static int napi_gro_complete(struct sk_buff *skb)
{
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int err = -ENOENT;

	BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));

	if (NAPI_GRO_CB(skb)->count == 1) {
		skb_shinfo(skb)->gso_size = 0;
		goto out;
	}

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_complete)
			continue;

		err = ptype->callbacks.gro_complete(skb, 0);
		break;
	}
	rcu_read_unlock();

	if (err) {
		WARN_ON(&ptype->list == head);
		kfree_skb(skb);
		return NET_RX_SUCCESS;
	}

out:
	return netif_receive_skb_internal(skb);
}

/* napi->gro_list contains packets ordered by age.
 * youngest packets at the head of it.
 * Complete skbs in reverse order to reduce latencies.
 */
void napi_gro_flush(struct napi_struct *napi, bool flush_old)
{
	struct sk_buff *skb, *prev = NULL;

	/* scan list and build reverse chain */
	for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
		skb->prev = prev;
		prev = skb;
	}

	for (skb = prev; skb; skb = prev) {
		skb->next = NULL;

		if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
			return;

		prev = skb->prev;
		napi_gro_complete(skb);
		napi->gro_count--;
	}

	napi->gro_list = NULL;
}
EXPORT_SYMBOL(napi_gro_flush);

static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff *p;
	unsigned int maclen = skb->dev->hard_header_len;
	u32 hash = skb_get_hash_raw(skb);

	for (p = napi->gro_list; p; p = p->next) {
		unsigned long diffs;

		NAPI_GRO_CB(p)->flush = 0;

		if (hash != skb_get_hash_raw(p)) {
			NAPI_GRO_CB(p)->same_flow = 0;
			continue;
		}

		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
		diffs |= p->vlan_tci ^ skb->vlan_tci;
		diffs |= skb_metadata_dst_cmp(p, skb);
		if (maclen == ETH_HLEN)
			diffs |= compare_ether_header(skb_mac_header(p),
						      skb_mac_header(skb));
		else if (!diffs)
			diffs = memcmp(skb_mac_header(p),
				       skb_mac_header(skb),
				       maclen);
		NAPI_GRO_CB(p)->same_flow = !diffs;
	}
}

static void skb_gro_reset_offset(struct sk_buff *skb)
{
	const struct skb_shared_info *pinfo = skb_shinfo(skb);
	const skb_frag_t *frag0 = &pinfo->frags[0];

	NAPI_GRO_CB(skb)->data_offset = 0;
	NAPI_GRO_CB(skb)->frag0 = NULL;
	NAPI_GRO_CB(skb)->frag0_len = 0;

	if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
	    pinfo->nr_frags &&
	    !PageHighMem(skb_frag_page(frag0))) {
		NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
		NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
						    skb_frag_size(frag0),
						    skb->end - skb->tail);
	}
}

static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
{
	struct skb_shared_info *pinfo = skb_shinfo(skb);

	BUG_ON(skb->end - skb->tail < grow);

	memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);

	skb->data_len -= grow;
	skb->tail += grow;

	pinfo->frags[0].page_offset += grow;
	skb_frag_size_sub(&pinfo->frags[0], grow);

	if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
		skb_frag_unref(skb, 0);
		memmove(pinfo->frags, pinfo->frags + 1,
			--pinfo->nr_frags * sizeof(pinfo->frags[0]));
	}
}

static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	struct sk_buff **pp = NULL;
	struct packet_offload *ptype;
	__be16 type = skb->protocol;
	struct list_head *head = &offload_base;
	int same_flow;
	enum gro_result ret;
	int grow;

	if (!(skb->dev->features & NETIF_F_GRO))
		goto normal;

	if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
		goto normal;

	gro_list_prepare(napi, skb);

	rcu_read_lock();
	list_for_each_entry_rcu(ptype, head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;

		skb_set_network_header(skb, skb_gro_offset(skb));
		skb_reset_mac_len(skb);
		NAPI_GRO_CB(skb)->same_flow = 0;
		NAPI_GRO_CB(skb)->flush = 0;
		NAPI_GRO_CB(skb)->free = 0;
		NAPI_GRO_CB(skb)->encap_mark = 0;
		NAPI_GRO_CB(skb)->recursion_counter = 0;
		NAPI_GRO_CB(skb)->gro_remcsum_start = 0;

		/* Setup for GRO checksum validation */
		switch (skb->ip_summed) {
		case CHECKSUM_COMPLETE:
			NAPI_GRO_CB(skb)->csum = skb->csum;
			NAPI_GRO_CB(skb)->csum_valid = 1;
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			break;
		case CHECKSUM_UNNECESSARY:
			NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
			NAPI_GRO_CB(skb)->csum_valid = 0;
			break;
		default:
			NAPI_GRO_CB(skb)->csum_cnt = 0;
			NAPI_GRO_CB(skb)->csum_valid = 0;
		}

		pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
		break;
	}
	rcu_read_unlock();

	if (&ptype->list == head)
		goto normal;

	same_flow = NAPI_GRO_CB(skb)->same_flow;
	ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;

	if (pp) {
		struct sk_buff *nskb = *pp;

		*pp = nskb->next;
		nskb->next = NULL;
		napi_gro_complete(nskb);
		napi->gro_count--;
	}

	if (same_flow)
		goto ok;

	if (NAPI_GRO_CB(skb)->flush)
		goto normal;

	if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
		struct sk_buff *nskb = napi->gro_list;

		/* locate the end of the list to select the 'oldest' flow */
		while (nskb->next) {
			pp = &nskb->next;
			nskb = *pp;
		}
		*pp = NULL;
		nskb->next = NULL;
		napi_gro_complete(nskb);
	} else {
		napi->gro_count++;
	}
	NAPI_GRO_CB(skb)->count = 1;
	NAPI_GRO_CB(skb)->age = jiffies;
	NAPI_GRO_CB(skb)->last = skb;
	skb_shinfo(skb)->gso_size = skb_gro_len(skb);
	skb->next = napi->gro_list;
	napi->gro_list = skb;
	ret = GRO_HELD;

pull:
	grow = skb_gro_offset(skb) - skb_headlen(skb);
	if (grow > 0)
		gro_pull_from_frag0(skb, grow);
ok:
	return ret;

normal:
	ret = GRO_NORMAL;
	goto pull;
}

struct packet_offload *gro_find_receive_by_type(__be16 type)
{
	struct list_head *offload_head = &offload_base;
	struct packet_offload *ptype;

	list_for_each_entry_rcu(ptype, offload_head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_receive)
			continue;
		return ptype;
	}
	return NULL;
}
EXPORT_SYMBOL(gro_find_receive_by_type);

struct packet_offload *gro_find_complete_by_type(__be16 type)
{
	struct list_head *offload_head = &offload_base;
	struct packet_offload *ptype;

	list_for_each_entry_rcu(ptype, offload_head, list) {
		if (ptype->type != type || !ptype->callbacks.gro_complete)
			continue;
		return ptype;
	}
	return NULL;
}
EXPORT_SYMBOL(gro_find_complete_by_type);

static void napi_skb_free_stolen_head(struct sk_buff *skb)
{
	skb_dst_drop(skb);
	kmem_cache_free(skbuff_head_cache, skb);
}

static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
{
	switch (ret) {
	case GRO_NORMAL:
		if (netif_receive_skb_internal(skb))
			ret = GRO_DROP;
		break;

	case GRO_DROP:
		kfree_skb(skb);
		break;

	case GRO_MERGED_FREE:
		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
			napi_skb_free_stolen_head(skb);
		else
			__kfree_skb(skb);
		break;

	case GRO_HELD:
	case GRO_MERGED:
		break;
	}

	return ret;
}

gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	trace_napi_gro_receive_entry(skb);

	skb_gro_reset_offset(skb);

	return napi_skb_finish(dev_gro_receive(napi, skb), skb);
}
EXPORT_SYMBOL(napi_gro_receive);

static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
{
	if (unlikely(skb->pfmemalloc)) {
		consume_skb(skb);
		return;
	}
	__skb_pull(skb, skb_headlen(skb));
	/* restore the reserve we had after netdev_alloc_skb_ip_align() */
	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
	skb->vlan_tci = 0;
	skb->dev = napi->dev;
	skb->skb_iif = 0;

	/* eth_type_trans() assumes pkt_type is PACKET_HOST */
	skb->pkt_type = PACKET_HOST;

	skb->encapsulation = 0;
	skb_shinfo(skb)->gso_type = 0;
	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));

	napi->skb = skb;
}

struct sk_buff *napi_get_frags(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;

	if (!skb) {
		skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
		napi->skb = skb;
	}
	return skb;
}
EXPORT_SYMBOL(napi_get_frags);

static gro_result_t napi_frags_finish(struct napi_struct *napi,
				      struct sk_buff *skb,
				      gro_result_t ret)
{
	switch (ret) {
	case GRO_NORMAL:
	case GRO_HELD:
		__skb_push(skb, ETH_HLEN);
		skb->protocol = eth_type_trans(skb, skb->dev);
		if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
			ret = GRO_DROP;
		break;

	case GRO_DROP:
		napi_reuse_skb(napi, skb);
		break;

	case GRO_MERGED_FREE:
		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
			napi_skb_free_stolen_head(skb);
		else
			napi_reuse_skb(napi, skb);
		break;

	case GRO_MERGED:
		break;
	}

	return ret;
}

/* Upper GRO stack assumes network header starts at gro_offset=0
 * Drivers could call both napi_gro_frags() and napi_gro_receive()
 * We copy ethernet header into skb->data to have a common layout.
 */
static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
{
	struct sk_buff *skb = napi->skb;
	const struct ethhdr *eth;
	unsigned int hlen = sizeof(*eth);

	napi->skb = NULL;

	skb_reset_mac_header(skb);
	skb_gro_reset_offset(skb);

	eth = skb_gro_header_fast(skb, 0);
	if (unlikely(skb_gro_header_hard(skb, hlen))) {
		eth = skb_gro_header_slow(skb, hlen, 0);
		if (unlikely(!eth)) {
			napi_reuse_skb(napi, skb);
			return NULL;
		}
	} else {
		gro_pull_from_frag0(skb, hlen);
		NAPI_GRO_CB(skb)->frag0 += hlen;
		NAPI_GRO_CB(skb)->frag0_len -= hlen;
	}
	__skb_pull(skb, hlen);

	/*
	 * This works because the only protocols we care about don't require
	 * special handling.
	 * We'll fix it up properly in napi_frags_finish()
	 */
	skb->protocol = eth->h_proto;

	return skb;
}

gro_result_t napi_gro_frags(struct napi_struct *napi)
{
	struct sk_buff *skb = napi_frags_skb(napi);

	if (!skb)
		return GRO_DROP;

	trace_napi_gro_frags_entry(skb);

	return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
}
EXPORT_SYMBOL(napi_gro_frags);
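
/*
 * Usage sketch (illustrative, not part of this file): drivers that receive
 * directly into pages can use the napi_get_frags()/napi_gro_frags() pair
 * instead of building a linear skb themselves. page/offset/len are
 * hypothetical, and the truesize accounting is only a rough example.
 *
 *	skb = napi_get_frags(napi);
 *	if (!skb)
 *		return;		// out of memory, leave the page in the ring
 *	skb_fill_page_desc(skb, 0, page, offset, len);
 *	skb->len += len;
 *	skb->data_len += len;
 *	skb->truesize += PAGE_SIZE;
 *	napi_gro_frags(napi);
 */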

/* Compute the checksum from gro_offset and return the folded value
 * after adding in any pseudo checksum.
 */
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
{
	__wsum wsum;
	__sum16 sum;

	wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);

	/* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
	sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
	if (likely(!sum)) {
		if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
		    !skb->csum_complete_sw)
			netdev_rx_csum_fault(skb->dev);
	}

	NAPI_GRO_CB(skb)->csum = wsum;
	NAPI_GRO_CB(skb)->csum_valid = 1;

	return sum;
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);

/*
 * net_rps_action_and_irq_enable sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;

	if (remsd) {
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		while (remsd) {
			struct softnet_data *next = remsd->rps_ipi_next;

			if (cpu_online(remsd->cpu))
				smp_call_function_single_async(remsd->cpu,
							       &remsd->csd);
			remsd = next;
		}
	} else
#endif
		local_irq_enable();
}

static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	return sd->rps_ipi_list != NULL;
#else
	return false;
#endif
}

static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

	/* Check if we have pending ipi, its better to send them now,
	 * not waiting net_rx_action() end.
	 */
	if (sd_has_rps_ipi_waiting(sd)) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}

	napi->weight = weight_p;
	local_irq_disable();
	while (1) {
		struct sk_buff *skb;

		while ((skb = __skb_dequeue(&sd->process_queue))) {
			rcu_read_lock();
			local_irq_enable();
			__netif_receive_skb(skb);
			rcu_read_unlock();
			local_irq_disable();
			input_queue_head_incr(sd);
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}

		rps_lock(sd);
		if (skb_queue_empty(&sd->input_pkt_queue)) {
			/*
			 * Inline a custom version of __napi_complete().
			 * only current cpu owns and manipulates this napi,
			 * and NAPI_STATE_SCHED is the only possible flag set
			 * on backlog.
			 * We can use a plain write instead of clear_bit(),
			 * and we dont need an smp_mb() memory barrier.
			 */
			napi->state = 0;
			rps_unlock(sd);

			break;
		}

		skb_queue_splice_tail_init(&sd->input_pkt_queue,
					   &sd->process_queue);
		rps_unlock(sd);
	}
	local_irq_enable();

	return work;
}

/**
 * __napi_schedule - schedule for receive
 * @n: entry to schedule
 *
 * The entry's receive function will be scheduled to run.
 * Consider using __napi_schedule_irqoff() if hard irqs are masked.
 */
void __napi_schedule(struct napi_struct *n)
{
	unsigned long flags;

	local_irq_save(flags);
	____napi_schedule(this_cpu_ptr(&softnet_data), n);
	local_irq_restore(flags);
}
EXPORT_SYMBOL(__napi_schedule);

/**
 * __napi_schedule_irqoff - schedule for receive
 * @n: entry to schedule
 *
 * Variant of __napi_schedule() assuming hard irqs are masked
 */
void __napi_schedule_irqoff(struct napi_struct *n)
{
	____napi_schedule(this_cpu_ptr(&softnet_data), n);
}
EXPORT_SYMBOL(__napi_schedule_irqoff);

void __napi_complete(struct napi_struct *n)
{
	BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));

	list_del_init(&n->poll_list);
	smp_mb__before_atomic();
	clear_bit(NAPI_STATE_SCHED, &n->state);
}
EXPORT_SYMBOL(__napi_complete);

void napi_complete_done(struct napi_struct *n, int work_done)
{
	unsigned long flags;

	/*
	 * don't let napi dequeue from the cpu poll list
	 * just in case its running on a different cpu
	 */
	if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
		return;

	if (n->gro_list) {
		unsigned long timeout = 0;

		if (work_done)
			timeout = n->dev->gro_flush_timeout;

		if (timeout)
			hrtimer_start(&n->timer, ns_to_ktime(timeout),
				      HRTIMER_MODE_REL_PINNED);
		else
			napi_gro_flush(n, false);
	}
	if (likely(list_empty(&n->poll_list))) {
		WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
	} else {
		/* If n->poll_list is not empty, we need to mask irqs */
		local_irq_save(flags);
		__napi_complete(n);
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL(napi_complete_done);
/* must be called under rcu_read_lock(), as we dont take a reference */
struct napi_struct *napi_by_id(unsigned int napi_id)
{
	unsigned int hash = napi_id % HASH_SIZE(napi_hash);
	struct napi_struct *napi;

	hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
		if (napi->napi_id == napi_id)
			return napi;

	return NULL;
}
EXPORT_SYMBOL_GPL(napi_by_id);

void napi_hash_add(struct napi_struct *napi)
{
	if (test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
		return;

	spin_lock(&napi_hash_lock);

	/* 0..NR_CPUS+1 range is reserved for sender_cpu use */
	do {
		if (unlikely(++napi_gen_id < NR_CPUS + 1))
			napi_gen_id = NR_CPUS + 1;
	} while (napi_by_id(napi_gen_id));
	napi->napi_id = napi_gen_id;

	hlist_add_head_rcu(&napi->napi_hash_node,
			   &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);

	spin_unlock(&napi_hash_lock);
}
EXPORT_SYMBOL_GPL(napi_hash_add);

/* Warning : caller is responsible to make sure rcu grace period
 * is respected before freeing memory containing @napi
 */
void napi_hash_del(struct napi_struct *napi)
{
	spin_lock(&napi_hash_lock);

	if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
		hlist_del_rcu(&napi->napi_hash_node);

	spin_unlock(&napi_hash_lock);
}
EXPORT_SYMBOL_GPL(napi_hash_del);

static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
{
	struct napi_struct *napi;

	napi = container_of(timer, struct napi_struct, timer);
	napi_schedule(napi);

	return HRTIMER_NORESTART;
}

void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
		    int (*poll)(struct napi_struct *, int), int weight)
{
	INIT_LIST_HEAD(&napi->poll_list);
	hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
	napi->timer.function = napi_watchdog;
	napi->gro_count = 0;
	napi->gro_list = NULL;
	napi->skb = NULL;
	napi->poll = poll;
	if (weight > NAPI_POLL_WEIGHT)
		pr_err_once("netif_napi_add() called with weight %d on device %s\n",
			    weight, dev->name);
	napi->weight = weight;
	list_add(&napi->dev_list, &dev->napi_list);
	napi->dev = dev;
#ifdef CONFIG_NETPOLL
	spin_lock_init(&napi->poll_lock);
	napi->poll_owner = -1;
#endif
	set_bit(NAPI_STATE_SCHED, &napi->state);
}
EXPORT_SYMBOL(netif_napi_add);
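
/* Example (illustrative sketch, not part of this file): typical driver-side
 * setup in a probe routine, pairing netif_napi_add() with a poll callback
 * such as the foo_poll() sketched earlier. foo_priv is an invented name;
 * NAPI_POLL_WEIGHT is the usual weight choice.
 *
 *	struct foo_priv *priv = netdev_priv(netdev);
 *
 *	netif_napi_add(netdev, &priv->napi, foo_poll, NAPI_POLL_WEIGHT);
 *	...
 *	// later, in the open/up path, once interrupts can fire:
 *	napi_enable(&priv->napi);
 */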
void napi_disable(struct napi_struct *n)
{
	might_sleep();
	set_bit(NAPI_STATE_DISABLE, &n->state);

	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
		msleep(1);
	while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
		msleep(1);

	hrtimer_cancel(&n->timer);

	clear_bit(NAPI_STATE_DISABLE, &n->state);
}
EXPORT_SYMBOL(napi_disable);

void netif_napi_del(struct napi_struct *napi)
{
	list_del_init(&napi->dev_list);
	napi_free_frags(napi);

	kfree_skb_list(napi->gro_list);
	napi->gro_list = NULL;
	napi->gro_count = 0;
}
EXPORT_SYMBOL(netif_napi_del);
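
/* Example (illustrative sketch, not part of this file): the matching
 * teardown for the setup above. napi_disable() may sleep while it waits
 * for an in-flight poll to finish, so it must run in process context
 * before the NAPI instance is deleted. foo_priv is an invented name.
 *
 *	napi_disable(&priv->napi);
 *	netif_napi_del(&priv->napi);
 */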
static int napi_poll(struct napi_struct *n, struct list_head *repoll)
{
	void *have;
	int work, weight;

	list_del_init(&n->poll_list);

	have = netpoll_poll_lock(n);

	weight = n->weight;

	/* This NAPI_STATE_SCHED test is for avoiding a race
	 * with netpoll's poll_napi(). Only the entity which
	 * obtains the lock and sees NAPI_STATE_SCHED set will
	 * actually make the ->poll() call. Therefore we avoid
	 * accidentally calling ->poll() when NAPI is not scheduled.
	 */
	work = 0;
	if (test_bit(NAPI_STATE_SCHED, &n->state))
		work = n->poll(n, weight);

	WARN_ON_ONCE(work > weight);

	if (likely(work < weight))
		goto out_unlock;

	/* Drivers must not modify the NAPI state if they
	 * consume the entire weight. In such cases this code
	 * still "owns" the NAPI instance and therefore can
	 * move the instance around on the list at-will.
	 */
	if (unlikely(napi_disable_pending(n))) {
		napi_complete(n);
		goto out_unlock;
	}

	/* flush too old packets
	 * If HZ < 1000, flush all packets.
	 */
	napi_gro_flush(n, HZ >= 1000);

	/* Some drivers may have called napi_schedule
	 * prior to exhausting their budget.
	 */
	if (unlikely(!list_empty(&n->poll_list))) {
		pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
			     n->dev ? n->dev->name : "backlog");
		goto out_unlock;
	}

	list_add_tail(&n->poll_list, repoll);

out_unlock:
	netpoll_poll_unlock(have);

	return work;
}

static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = this_cpu_ptr(&softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	LIST_HEAD(list);
	LIST_HEAD(repoll);

	local_irq_disable();
	list_splice_init(&sd->poll_list, &list);
	local_irq_enable();

	for (;;) {
		struct napi_struct *n;

		if (list_empty(&list)) {
			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
				return;
			break;
		}

		n = list_first_entry(&list, struct napi_struct, poll_list);
		budget -= napi_poll(n, &repoll);

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies, which allows
		 * an average latency of 1.5/HZ.
		 */
		if (unlikely(budget <= 0 ||
			     time_after_eq(jiffies, time_limit))) {
			sd->time_squeeze++;
			break;
		}
	}

	local_irq_disable();

	list_splice_tail_init(&sd->poll_list, &list);
	list_splice_tail(&repoll, &list);
	list_splice(&list, &sd->poll_list);
	if (!list_empty(&sd->poll_list))
		__raise_softirq_irqoff(NET_RX_SOFTIRQ);

	net_rps_action_and_irq_enable(sd);
}
struct netdev_adjacent {
	struct net_device *dev;

	/* upper master flag, there can only be one master device per list */
	bool master;

	/* counter for the number of times this device was added to us */
	u16 ref_nr;

	/* private field for the users */
	void *private;

	struct list_head list;
	struct rcu_head rcu;
};

static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
						 struct list_head *adj_list)
{
	struct netdev_adjacent *adj;

	list_for_each_entry(adj, adj_list, list) {
		if (adj->dev == adj_dev)
			return adj;
	}
	return NULL;
}

/**
 * netdev_has_upper_dev - Check if device is linked to an upper device
 * @dev: device
 * @upper_dev: upper device to check
 *
 * Find out if a device is linked to specified upper device and return true
 * in case it is. Note that this checks only immediate upper device,
 * not through a complete stack of devices. The caller must hold the RTNL lock.
 */
bool netdev_has_upper_dev(struct net_device *dev,
			  struct net_device *upper_dev)
{
	ASSERT_RTNL();

	return __netdev_find_adj(upper_dev, &dev->all_adj_list.upper);
}
EXPORT_SYMBOL(netdev_has_upper_dev);

/**
 * netdev_has_any_upper_dev - Check if device is linked to some device
 * @dev: device
 *
 * Find out if a device is linked to an upper device and return true in case
 * it is. The caller must hold the RTNL lock.
 */
static bool netdev_has_any_upper_dev(struct net_device *dev)
{
	ASSERT_RTNL();

	return !list_empty(&dev->all_adj_list.upper);
}

/**
 * netdev_master_upper_dev_get - Get master upper device
 * @dev: device
 *
 * Find a master upper device and return pointer to it or NULL in case
 * it's not there. The caller must hold the RTNL lock.
 */
struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
{
	struct netdev_adjacent *upper;

	ASSERT_RTNL();

	if (list_empty(&dev->adj_list.upper))
		return NULL;

	upper = list_first_entry(&dev->adj_list.upper,
				 struct netdev_adjacent, list);
	if (likely(upper->master))
		return upper->dev;
	return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get);

void *netdev_adjacent_get_private(struct list_head *adj_list)
{
	struct netdev_adjacent *adj;

	adj = list_entry(adj_list, struct netdev_adjacent, list);

	return adj->private;
}
EXPORT_SYMBOL(netdev_adjacent_get_private);

/**
 * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next device from the dev's upper list, starting from iter
 * position. The caller must hold RCU read lock.
 */
struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
						 struct list_head **iter)
{
	struct netdev_adjacent *upper;

	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());

	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&upper->list == &dev->adj_list.upper)
		return NULL;

	*iter = &upper->list;

	return upper->dev;
}
EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
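
/* Example (illustrative sketch, not part of this file): walking the upper
 * devices of @dev under RCU with the iterator above; this is essentially
 * what the netdev_for_each_upper_dev_rcu() helper expands to.
 *
 *	struct list_head *iter = &dev->adj_list.upper;
 *	struct net_device *upper;
 *
 *	rcu_read_lock();
 *	while ((upper = netdev_upper_get_next_dev_rcu(dev, &iter)))
 *		pr_debug("%s is an upper device of %s\n",
 *			 upper->name, dev->name);
 *	rcu_read_unlock();
 */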
/**
 * netdev_all_upper_get_next_dev_rcu - Get the next dev from upper list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next device from the dev's upper list, starting from iter
 * position. The caller must hold RCU read lock.
 */
struct net_device *netdev_all_upper_get_next_dev_rcu(struct net_device *dev,
						     struct list_head **iter)
{
	struct netdev_adjacent *upper;

	WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());

	upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&upper->list == &dev->all_adj_list.upper)
		return NULL;

	*iter = &upper->list;

	return upper->dev;
}
EXPORT_SYMBOL(netdev_all_upper_get_next_dev_rcu);

/**
 * netdev_lower_get_next_private - Get the next ->private from the
 *				   lower neighbour list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 * list, starting from iter position. The caller must hold either the
 * RTNL lock or its own locking that guarantees that the neighbour lower
 * list will remain unchanged.
 */
void *netdev_lower_get_next_private(struct net_device *dev,
				    struct list_head **iter)
{
	struct netdev_adjacent *lower;

	lower = list_entry(*iter, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = lower->list.next;

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private);

/**
 * netdev_lower_get_next_private_rcu - Get the next ->private from the
 *				       lower neighbour list, RCU variant
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent->private from the dev's lower neighbour
 * list, starting from iter position. The caller must hold RCU read lock.
 */
void *netdev_lower_get_next_private_rcu(struct net_device *dev,
					struct list_head **iter)
{
	struct netdev_adjacent *lower;

	WARN_ON_ONCE(!rcu_read_lock_held());

	lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = &lower->list;

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);

/**
 * netdev_lower_get_next - Get the next device from the lower neighbour
 *			   list
 * @dev: device
 * @iter: list_head ** of the current position
 *
 * Gets the next netdev_adjacent from the dev's lower neighbour
 * list, starting from iter position. The caller must hold RTNL lock or
 * its own locking that guarantees that the neighbour lower
 * list will remain unchanged.
 */
void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{
	struct netdev_adjacent *lower;

	lower = list_entry((*iter)->next, struct netdev_adjacent, list);

	if (&lower->list == &dev->adj_list.lower)
		return NULL;

	*iter = &lower->list;

	return lower->dev;
}
EXPORT_SYMBOL(netdev_lower_get_next);

/**
 * netdev_lower_get_first_private_rcu - Get the first ->private from the
 *					lower neighbour list, RCU variant
 * @dev: device
 *
 * Gets the first netdev_adjacent->private from the dev's lower neighbour
 * list. The caller must hold RCU read lock.
 */
void *netdev_lower_get_first_private_rcu(struct net_device *dev)
{
	struct netdev_adjacent *lower;

	lower = list_first_or_null_rcu(&dev->adj_list.lower,
				       struct netdev_adjacent, list);
	if (lower)
		return lower->private;
	return NULL;
}
EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);

/**
 * netdev_master_upper_dev_get_rcu - Get master upper device
 * @dev: device
 *
 * Find a master upper device and return pointer to it or NULL in case
 * it's not there. The caller must hold the RCU read lock.
 */
struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
{
	struct netdev_adjacent *upper;

	upper = list_first_or_null_rcu(&dev->adj_list.upper,
				       struct netdev_adjacent, list);
	if (upper && likely(upper->master))
		return upper->dev;
	return NULL;
}
EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
static int netdev_adjacent_sysfs_add(struct net_device *dev,
				     struct net_device *adj_dev,
				     struct list_head *dev_list)
{
	char linkname[IFNAMSIZ + 7];

	sprintf(linkname, dev_list == &dev->adj_list.upper ?
		"upper_%s" : "lower_%s", adj_dev->name);
	return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
				 linkname);
}

static void netdev_adjacent_sysfs_del(struct net_device *dev,
				      char *name,
				      struct list_head *dev_list)
{
	char linkname[IFNAMSIZ + 7];

	sprintf(linkname, dev_list == &dev->adj_list.upper ?
		"upper_%s" : "lower_%s", name);
	sysfs_remove_link(&(dev->dev.kobj), linkname);
}

static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
						 struct net_device *adj_dev,
						 struct list_head *dev_list)
{
	return (dev_list == &dev->adj_list.upper ||
		dev_list == &dev->adj_list.lower) &&
	       net_eq(dev_net(dev), dev_net(adj_dev));
}
static int __netdev_adjacent_dev_insert(struct net_device *dev,
					struct net_device *adj_dev,
					u16 ref_nr,
					struct list_head *dev_list,
					void *private, bool master)
{
	struct netdev_adjacent *adj;
	int ret;

	adj = __netdev_find_adj(adj_dev, dev_list);

	if (adj) {
		adj->ref_nr += ref_nr;
		return 0;
	}

	adj = kmalloc(sizeof(*adj), GFP_KERNEL);
	if (!adj)
		return -ENOMEM;

	adj->dev = adj_dev;
	adj->master = master;
	adj->ref_nr = ref_nr;
	adj->private = private;
	dev_hold(adj_dev);

	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
		 adj_dev->name, dev->name, adj_dev->name);

	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
		if (ret)
			goto free_adj;
	}

	/* Ensure that master link is always the first item in list. */
	if (master) {
		ret = sysfs_create_link(&(dev->dev.kobj),
					&(adj_dev->dev.kobj), "master");
		if (ret)
			goto remove_symlinks;

		list_add_rcu(&adj->list, dev_list);
	} else {
		list_add_tail_rcu(&adj->list, dev_list);
	}

	return 0;

remove_symlinks:
	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
free_adj:
	kfree(adj);
	dev_put(adj_dev);

	return ret;
}

static void __netdev_adjacent_dev_remove(struct net_device *dev,
					 struct net_device *adj_dev,
					 u16 ref_nr,
					 struct list_head *dev_list)
{
	struct netdev_adjacent *adj;

	adj = __netdev_find_adj(adj_dev, dev_list);

	if (!adj) {
		pr_err("tried to remove device %s from %s\n",
		       dev->name, adj_dev->name);
		BUG();
	}

	if (adj->ref_nr > ref_nr) {
		pr_debug("%s to %s ref_nr-%d = %d\n", dev->name, adj_dev->name,
			 ref_nr, adj->ref_nr - ref_nr);
		adj->ref_nr -= ref_nr;
		return;
	}

	if (adj->master)
		sysfs_remove_link(&(dev->dev.kobj), "master");

	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);

	list_del_rcu(&adj->list);
	pr_debug("dev_put for %s, because link removed from %s to %s\n",
		 adj_dev->name, dev->name, adj_dev->name);
	dev_put(adj_dev);
	kfree_rcu(adj, rcu);
}
static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
					    struct net_device *upper_dev,
					    u16 ref_nr,
					    struct list_head *up_list,
					    struct list_head *down_list,
					    void *private, bool master)
{
	int ret;

	ret = __netdev_adjacent_dev_insert(dev, upper_dev, ref_nr, up_list,
					   private, master);
	if (ret)
		return ret;

	ret = __netdev_adjacent_dev_insert(upper_dev, dev, ref_nr, down_list,
					   private, false);
	if (ret) {
		__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
		return ret;
	}

	return 0;
}

static int __netdev_adjacent_dev_link(struct net_device *dev,
				      struct net_device *upper_dev,
				      u16 ref_nr)
{
	return __netdev_adjacent_dev_link_lists(dev, upper_dev, ref_nr,
						&dev->all_adj_list.upper,
						&upper_dev->all_adj_list.lower,
						NULL, false);
}

static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
					       struct net_device *upper_dev,
					       u16 ref_nr,
					       struct list_head *up_list,
					       struct list_head *down_list)
{
	__netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
	__netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
}

static void __netdev_adjacent_dev_unlink(struct net_device *dev,
					 struct net_device *upper_dev,
					 u16 ref_nr)
{
	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, ref_nr,
					   &dev->all_adj_list.upper,
					   &upper_dev->all_adj_list.lower);
}

static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
						struct net_device *upper_dev,
						void *private, bool master)
{
	int ret = __netdev_adjacent_dev_link(dev, upper_dev, 1);

	if (ret)
		return ret;

	ret = __netdev_adjacent_dev_link_lists(dev, upper_dev, 1,
					       &dev->adj_list.upper,
					       &upper_dev->adj_list.lower,
					       private, master);
	if (ret) {
		__netdev_adjacent_dev_unlink(dev, upper_dev, 1);
		return ret;
	}

	return 0;
}

static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
						   struct net_device *upper_dev)
{
	__netdev_adjacent_dev_unlink(dev, upper_dev, 1);
	__netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
					   &dev->adj_list.upper,
					   &upper_dev->adj_list.lower);
}
static int __netdev_upper_dev_link(struct net_device *dev,
				   struct net_device *upper_dev, bool master,
				   void *private)
{
	struct netdev_notifier_changeupper_info changeupper_info;
	struct netdev_adjacent *i, *j, *to_i, *to_j;
	int ret = 0;

	ASSERT_RTNL();

	if (dev == upper_dev)
		return -EBUSY;

	/* To prevent loops, check if dev is not upper device to upper_dev. */
	if (__netdev_find_adj(dev, &upper_dev->all_adj_list.upper))
		return -EBUSY;

	if (__netdev_find_adj(upper_dev, &dev->adj_list.upper))
		return -EEXIST;

	if (master && netdev_master_upper_dev_get(dev))
		return -EBUSY;

	changeupper_info.upper_dev = upper_dev;
	changeupper_info.master = master;
	changeupper_info.linking = true;

	ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
					    &changeupper_info.info);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
						   master);
	if (ret)
		return ret;

	/* Now that we linked these devs, make all the upper_dev's
	 * all_adj_list.upper visible to every dev's all_adj_list.lower and
	 * vice versa, and don't forget the devices itself. All of these
	 * links are non-neighbours.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			pr_debug("Interlinking %s with %s, non-neighbour\n",
				 i->dev->name, j->dev->name);
			ret = __netdev_adjacent_dev_link(i->dev, j->dev, i->ref_nr);
			if (ret)
				goto rollback_mesh;
		}
	}

	/* add dev to every upper_dev's upper device */
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		pr_debug("linking %s's upper device %s with %s\n",
			 upper_dev->name, i->dev->name, dev->name);
		ret = __netdev_adjacent_dev_link(dev, i->dev, i->ref_nr);
		if (ret)
			goto rollback_upper_mesh;
	}

	/* add upper_dev to every dev's lower device */
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		pr_debug("linking %s's lower device %s with %s\n", dev->name,
			 i->dev->name, upper_dev->name);
		ret = __netdev_adjacent_dev_link(i->dev, upper_dev, i->ref_nr);
		if (ret)
			goto rollback_lower_mesh;
	}

	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
				      &changeupper_info.info);
	return 0;

rollback_lower_mesh:
	to_i = i;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);
	}

	i = NULL;

rollback_upper_mesh:
	to_i = i;
	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list) {
		if (i == to_i)
			break;
		__netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);
	}

	i = j = NULL;

rollback_mesh:
	to_i = i;
	to_j = j;
	list_for_each_entry(i, &dev->all_adj_list.lower, list) {
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list) {
			if (i == to_i && j == to_j)
				break;
			__netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);
		}
		if (i == to_i)
			break;
	}

	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	return ret;
}
/**
 * netdev_upper_dev_link - Add a link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Adds a link to device which is upper to this one. The caller must hold
 * the RTNL lock. On a failure a negative errno code is returned.
 * On success the reference counts are adjusted and the function
 * returns zero.
 */
int netdev_upper_dev_link(struct net_device *dev,
			  struct net_device *upper_dev)
{
	return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
}
EXPORT_SYMBOL(netdev_upper_dev_link);

/**
 * netdev_master_upper_dev_link - Add a master link to the upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Adds a link to device which is upper to this one. In this case, only
 * one master upper device can be linked, although other non-master devices
 * might be linked as well. The caller must hold the RTNL lock.
 * On a failure a negative errno code is returned. On success the reference
 * counts are adjusted and the function returns zero.
 */
int netdev_master_upper_dev_link(struct net_device *dev,
				 struct net_device *upper_dev)
{
	return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);

int netdev_master_upper_dev_link_private(struct net_device *dev,
					 struct net_device *upper_dev,
					 void *private)
{
	return __netdev_upper_dev_link(dev, upper_dev, true, private);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link_private);

/**
 * netdev_upper_dev_unlink - Removes a link to upper device
 * @dev: device
 * @upper_dev: new upper device
 *
 * Removes a link to device which is upper to this one. The caller must hold
 * the RTNL lock.
 */
void netdev_upper_dev_unlink(struct net_device *dev,
			     struct net_device *upper_dev)
{
	struct netdev_notifier_changeupper_info changeupper_info;
	struct netdev_adjacent *i, *j;

	ASSERT_RTNL();

	changeupper_info.upper_dev = upper_dev;
	changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
	changeupper_info.linking = false;

	call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
				      &changeupper_info.info);

	__netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);

	/* Here is the tricky part. We must remove all dev's lower
	 * devices from all upper_dev's upper devices and vice
	 * versa, to maintain the graph relationship.
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list)
		list_for_each_entry(j, &upper_dev->all_adj_list.upper, list)
			__netdev_adjacent_dev_unlink(i->dev, j->dev, i->ref_nr);

	/* remove also the devices itself from lower/upper device
	 * list
	 */
	list_for_each_entry(i, &dev->all_adj_list.lower, list)
		__netdev_adjacent_dev_unlink(i->dev, upper_dev, i->ref_nr);

	list_for_each_entry(i, &upper_dev->all_adj_list.upper, list)
		__netdev_adjacent_dev_unlink(dev, i->dev, i->ref_nr);

	call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
				      &changeupper_info.info);
}
EXPORT_SYMBOL(netdev_upper_dev_unlink);
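
/* Example (illustrative sketch, not part of this file): how a bonding- or
 * bridge-like driver would attach and later detach a lower device. The
 * master variant guarantees at most one master upper device per slave.
 * "bond_dev" and "slave_dev" are invented names and error handling is
 * abbreviated.
 *
 *	int err;
 *
 *	ASSERT_RTNL();
 *	err = netdev_master_upper_dev_link(slave_dev, bond_dev);
 *	if (err)
 *		return err;
 *	...
 *	netdev_upper_dev_unlink(slave_dev, bond_dev);
 */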
/**
 * netdev_bonding_info_change - Dispatch event about slave change
 * @dev: device
 * @bonding_info: info to dispatch
 *
 * Send NETDEV_BONDING_INFO to netdev notifiers with info.
 * The caller must hold the RTNL lock.
 */
void netdev_bonding_info_change(struct net_device *dev,
				struct netdev_bonding_info *bonding_info)
{
	struct netdev_notifier_bonding_info info;

	memcpy(&info.bonding_info, bonding_info,
	       sizeof(struct netdev_bonding_info));
	call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
				      &info.info);
}
EXPORT_SYMBOL(netdev_bonding_info_change);
static void netdev_adjacent_add_links(struct net_device *dev)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_add(dev, iter->dev,
					  &dev->adj_list.upper);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_add(dev, iter->dev,
					  &dev->adj_list.lower);
	}
}

static void netdev_adjacent_del_links(struct net_device *dev)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, dev->name,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_del(dev, iter->dev->name,
					  &dev->adj_list.upper);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, dev->name,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_del(dev, iter->dev->name,
					  &dev->adj_list.lower);
	}
}

void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
{
	struct netdev_adjacent *iter;

	struct net *net = dev_net(dev);

	list_for_each_entry(iter, &dev->adj_list.upper, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, oldname,
					  &iter->dev->adj_list.lower);
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.lower);
	}

	list_for_each_entry(iter, &dev->adj_list.lower, list) {
		if (!net_eq(net, dev_net(iter->dev)))
			continue;
		netdev_adjacent_sysfs_del(iter->dev, oldname,
					  &iter->dev->adj_list.upper);
		netdev_adjacent_sysfs_add(iter->dev, dev,
					  &iter->dev->adj_list.upper);
	}
}
void *netdev_lower_dev_get_private(struct net_device *dev,
				   struct net_device *lower_dev)
{
	struct netdev_adjacent *lower;

	if (!lower_dev)
		return NULL;
	lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
	if (!lower)
		return NULL;

	return lower->private;
}
EXPORT_SYMBOL(netdev_lower_dev_get_private);

int dev_get_nest_level(struct net_device *dev,
		       bool (*type_check)(struct net_device *dev))
{
	struct net_device *lower = NULL;
	struct list_head *iter;
	int max_nest = -1;
	int nest;

	ASSERT_RTNL();

	netdev_for_each_lower_dev(dev, lower, iter) {
		nest = dev_get_nest_level(lower, type_check);
		if (max_nest < nest)
			max_nest = nest;
	}

	if (type_check(dev))
		max_nest++;

	return max_nest;
}
EXPORT_SYMBOL(dev_get_nest_level);
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_change_rx_flags)
		ops->ndo_change_rx_flags(dev, flags);
}

static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags;
	kuid_t uid;
	kgid_t gid;

	ASSERT_RTNL();

	dev->flags |= IFF_PROMISC;
	dev->promiscuity += inc;
	if (dev->promiscuity == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch promisc and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_PROMISC;
		else {
			dev->promiscuity -= inc;
			pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
				dev->name);
			return -EOVERFLOW;
		}
	}
	if (dev->flags != old_flags) {
		pr_info("device %s %s promiscuous mode\n",
			dev->name,
			dev->flags & IFF_PROMISC ? "entered" : "left");
		if (audit_enabled) {
			current_uid_gid(&uid, &gid);
			audit_log(current->audit_context, GFP_ATOMIC,
				AUDIT_ANOM_PROMISCUOUS,
				"dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
				dev->name, (dev->flags & IFF_PROMISC),
				(old_flags & IFF_PROMISC),
				from_kuid(&init_user_ns, audit_get_loginuid(current)),
				from_kuid(&init_user_ns, uid),
				from_kgid(&init_user_ns, gid),
				audit_get_sessionid(current));
		}

		dev_change_rx_flags(dev, IFF_PROMISC);
	}
	if (notify)
		__dev_notify_flags(dev, old_flags, IFF_PROMISC);
	return 0;
}

/**
 * dev_set_promiscuity - update promiscuity count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove promiscuity from a device. While the count in the device
 * remains above zero the interface remains promiscuous. Once it hits zero
 * the device reverts back to normal filtering operation. A negative inc
 * value is used to drop promiscuity on the device.
 * Return 0 if successful or a negative errno code on error.
 */
int dev_set_promiscuity(struct net_device *dev, int inc)
{
	unsigned int old_flags = dev->flags;
	int err;

	err = __dev_set_promiscuity(dev, inc, true);
	if (err < 0)
		return err;
	if (dev->flags != old_flags)
		dev_set_rx_mode(dev);
	return err;
}
EXPORT_SYMBOL(dev_set_promiscuity);
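
/* Example (illustrative sketch, not part of this file): a packet-capture
 * style user taking and releasing a promiscuity reference. The count is
 * reference counted, so every +1 must eventually be balanced by a -1.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);	// enter promiscuous mode
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);		// drop our reference
 *	rtnl_unlock();
 */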
static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
{
	unsigned int old_flags = dev->flags, old_gflags = dev->gflags;

	ASSERT_RTNL();

	dev->flags |= IFF_ALLMULTI;
	dev->allmulti += inc;
	if (dev->allmulti == 0) {
		/*
		 * Avoid overflow.
		 * If inc causes overflow, untouch allmulti and return error.
		 */
		if (inc < 0)
			dev->flags &= ~IFF_ALLMULTI;
		else {
			dev->allmulti -= inc;
			pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
				dev->name);
			return -EOVERFLOW;
		}
	}
	if (dev->flags ^ old_flags) {
		dev_change_rx_flags(dev, IFF_ALLMULTI);
		dev_set_rx_mode(dev);
		if (notify)
			__dev_notify_flags(dev, old_flags,
					   dev->gflags ^ old_gflags);
	}
	return 0;
}

/**
 * dev_set_allmulti - update allmulti count on a device
 * @dev: device
 * @inc: modifier
 *
 * Add or remove reception of all multicast frames to a device. While the
 * count in the device remains above zero the interface remains listening
 * to all interfaces. Once it hits zero the device reverts back to normal
 * filtering operation. A negative @inc value is used to drop the counter
 * when releasing a resource needing all multicasts.
 * Return 0 if successful or a negative errno code on error.
 */
int dev_set_allmulti(struct net_device *dev, int inc)
{
	return __dev_set_allmulti(dev, inc, true);
}
EXPORT_SYMBOL(dev_set_allmulti);

/*
 * Upload unicast and multicast address lists to device and
 * configure RX filtering. When the device doesn't support unicast
 * filtering it is put in promiscuous mode while unicast addresses
 * are present.
 */
void __dev_set_rx_mode(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	/* dev_open will call this function so the list will stay sane. */
	if (!(dev->flags & IFF_UP))
		return;

	if (!netif_device_present(dev))
		return;

	if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
		/* Unicast addresses changes may only happen under the rtnl,
		 * therefore calling __dev_set_promiscuity here is safe.
		 */
		if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
			__dev_set_promiscuity(dev, 1, false);
			dev->uc_promisc = true;
		} else if (netdev_uc_empty(dev) && dev->uc_promisc) {
			__dev_set_promiscuity(dev, -1, false);
			dev->uc_promisc = false;
		}
	}

	if (ops->ndo_set_rx_mode)
		ops->ndo_set_rx_mode(dev);
}

void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);
	__dev_set_rx_mode(dev);
	netif_addr_unlock_bh(dev);
}
/**
 * dev_get_flags - get flags reported to userspace
 * @dev: device
 *
 * Get the combination of flag bits exported through APIs to userspace.
 */
unsigned int dev_get_flags(const struct net_device *dev)
{
	unsigned int flags;

	flags = (dev->flags & ~(IFF_PROMISC |
				IFF_ALLMULTI |
				IFF_RUNNING |
				IFF_LOWER_UP |
				IFF_DORMANT)) |
		(dev->gflags & (IFF_PROMISC |
				IFF_ALLMULTI));

	if (netif_running(dev)) {
		if (netif_oper_up(dev))
			flags |= IFF_RUNNING;
		if (netif_carrier_ok(dev))
			flags |= IFF_LOWER_UP;
		if (netif_dormant(dev))
			flags |= IFF_DORMANT;
	}

	return flags;
}
EXPORT_SYMBOL(dev_get_flags);
int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
	unsigned int old_flags = dev->flags;
	int ret;

	ASSERT_RTNL();

	/*
	 * Set the flags on our device.
	 */

	dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
			       IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
			       IFF_AUTOMEDIA)) |
		     (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
				    IFF_ALLMULTI));

	/*
	 * Load in the correct multicast list now the flags have changed.
	 */

	if ((old_flags ^ flags) & IFF_MULTICAST)
		dev_change_rx_flags(dev, IFF_MULTICAST);

	dev_set_rx_mode(dev);

	/*
	 * Have we downed the interface. We handle IFF_UP ourselves
	 * according to user attempts to set it, rather than blindly
	 * setting it.
	 */

	ret = 0;
	if ((old_flags ^ flags) & IFF_UP)
		ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);

	if ((flags ^ dev->gflags) & IFF_PROMISC) {
		int inc = (flags & IFF_PROMISC) ? 1 : -1;
		unsigned int old_flags = dev->flags;

		dev->gflags ^= IFF_PROMISC;

		if (__dev_set_promiscuity(dev, inc, false) >= 0)
			if (dev->flags != old_flags)
				dev_set_rx_mode(dev);
	}

	/* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
	 * is important. Some (broken) drivers set IFF_PROMISC, when
	 * IFF_ALLMULTI is requested not asking us and not reporting.
	 */
	if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
		int inc = (flags & IFF_ALLMULTI) ? 1 : -1;

		dev->gflags ^= IFF_ALLMULTI;
		__dev_set_allmulti(dev, inc, false);
	}

	return ret;
}

void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
			unsigned int gchanges)
{
	unsigned int changes = dev->flags ^ old_flags;

	if (gchanges)
		rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);

	if (changes & IFF_UP) {
		if (dev->flags & IFF_UP)
			call_netdevice_notifiers(NETDEV_UP, dev);
		else
			call_netdevice_notifiers(NETDEV_DOWN, dev);
	}

	if (dev->flags & IFF_UP &&
	    (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
		struct netdev_notifier_change_info change_info;

		change_info.flags_changed = changes;
		call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
					      &change_info.info);
	}
}

/**
 * dev_change_flags - change device settings
 * @dev: device
 * @flags: device state flags
 *
 * Change settings on device based state flags. The flags are
 * in the userspace exported format.
 */
int dev_change_flags(struct net_device *dev, unsigned int flags)
{
	int ret;
	unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;

	ret = __dev_change_flags(dev, flags);
	if (ret < 0)
		return ret;

	changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
	__dev_notify_flags(dev, old_flags, changes);
	return ret;
}
EXPORT_SYMBOL(dev_change_flags);
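
/* Example (illustrative sketch, not part of this file): administratively
 * bringing an interface up from kernel code by setting IFF_UP through
 * dev_change_flags(), which takes care of open/close, notifiers and the
 * rtnetlink event.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_change_flags(dev, dev->flags | IFF_UP);
 *	rtnl_unlock();
 *	if (err)
 *		pr_warn("%s: could not bring device up: %d\n", dev->name, err);
 */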
static int __dev_set_mtu(struct net_device *dev, int new_mtu)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_change_mtu)
		return ops->ndo_change_mtu(dev, new_mtu);

	dev->mtu = new_mtu;
	return 0;
}

/**
 * dev_set_mtu - Change maximum transfer unit
 * @dev: device
 * @new_mtu: new transfer unit
 *
 * Change the maximum transfer size of the network device.
 */
int dev_set_mtu(struct net_device *dev, int new_mtu)
{
	int err, orig_mtu;

	if (new_mtu == dev->mtu)
		return 0;

	/* MTU must be positive. */
	if (new_mtu < 0)
		return -EINVAL;

	if (!netif_device_present(dev))
		return -ENODEV;

	err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
	err = notifier_to_errno(err);
	if (err)
		return err;

	orig_mtu = dev->mtu;
	err = __dev_set_mtu(dev, new_mtu);

	if (!err) {
		err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
						   orig_mtu);
		err = notifier_to_errno(err);
		if (err) {
			/* setting mtu back and notifying everyone again,
			 * so that they have a chance to revert changes.
			 */
			__dev_set_mtu(dev, orig_mtu);
			call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
						     new_mtu);
		}
	}
	return err;
}
EXPORT_SYMBOL(dev_set_mtu);
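
/* Example (illustrative sketch, not part of this file): changing the MTU of
 * a device from kernel code. dev_set_mtu() validates the value, calls the
 * driver's ndo_change_mtu() and notifies NETDEV_CHANGEMTU listeners; it can
 * fail, so the return value must be checked.
 *
 *	int err;
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);
 *	rtnl_unlock();
 *	if (err)
 *		pr_warn("%s: could not set jumbo MTU: %d\n", dev->name, err);
 */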
/**
 * dev_set_group - Change group this device belongs to
 * @dev: device
 * @new_group: group this device should belong to
 */
void dev_set_group(struct net_device *dev, int new_group)
{
	dev->group = new_group;
}
EXPORT_SYMBOL(dev_set_group);

/**
 * dev_set_mac_address - Change Media Access Control Address
 * @dev: device
 * @sa: new address
 *
 * Change the hardware (MAC) address of the device
 */
int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int err;

	if (!ops->ndo_set_mac_address)
		return -EOPNOTSUPP;
	if (sa->sa_family != dev->type)
		return -EINVAL;
	if (!netif_device_present(dev))
		return -ENODEV;
	err = ops->ndo_set_mac_address(dev, sa);
	if (err)
		return err;
	dev->addr_assign_type = NET_ADDR_SET;
	call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
	add_device_randomness(dev->dev_addr, dev->addr_len);
	return 0;
}
EXPORT_SYMBOL(dev_set_mac_address);

/**
 * dev_change_carrier - Change device carrier
 * @dev: device
 * @new_carrier: new value
 *
 * Change device carrier
 */
int dev_change_carrier(struct net_device *dev, bool new_carrier)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_change_carrier)
		return -EOPNOTSUPP;
	if (!netif_device_present(dev))
		return -ENODEV;
	return ops->ndo_change_carrier(dev, new_carrier);
}
EXPORT_SYMBOL(dev_change_carrier);
/**
 * dev_get_phys_port_id - Get device physical port ID
 * @dev: device
 * @ppid: port ID
 *
 * Get device physical port ID
 */
int dev_get_phys_port_id(struct net_device *dev,
			 struct netdev_phys_item_id *ppid)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_get_phys_port_id)
		return -EOPNOTSUPP;
	return ops->ndo_get_phys_port_id(dev, ppid);
}
EXPORT_SYMBOL(dev_get_phys_port_id);

/**
 * dev_get_phys_port_name - Get device physical port name
 * @dev: device
 * @name: port name
 * @len: limit of bytes to copy to name
 *
 * Get device physical port name
 */
int dev_get_phys_port_name(struct net_device *dev,
			   char *name, size_t len)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_get_phys_port_name)
		return -EOPNOTSUPP;
	return ops->ndo_get_phys_port_name(dev, name, len);
}
EXPORT_SYMBOL(dev_get_phys_port_name);

/**
 * dev_change_proto_down - update protocol port state information
 * @dev: device
 * @proto_down: new value
 *
 * This info can be used by switch drivers to set the phys state of the
 * port.
 */
int dev_change_proto_down(struct net_device *dev, bool proto_down)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (!ops->ndo_change_proto_down)
		return -EOPNOTSUPP;
	if (!netif_device_present(dev))
		return -ENODEV;
	return ops->ndo_change_proto_down(dev, proto_down);
}
EXPORT_SYMBOL(dev_change_proto_down);

/**
 * dev_new_index - allocate an ifindex
 * @net: the applicable net namespace
 *
 * Returns a suitable unique value for a new device interface
 * number. The caller must hold the rtnl semaphore or the
 * dev_base_lock to be sure it remains unique.
 */
static int dev_new_index(struct net *net)
{
	int ifindex = net->ifindex;

	for (;;) {
		if (++ifindex <= 0)
			ifindex = 1;
		if (!__dev_get_by_index(net, ifindex))
			return net->ifindex = ifindex;
	}
}
/* Delayed registration/unregisteration */
static LIST_HEAD(net_todo_list);
DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);

static void net_set_todo(struct net_device *dev)
{
	list_add_tail(&dev->todo_list, &net_todo_list);
	dev_net(dev)->dev_unreg_count++;
}
static void rollback_registered_many(struct list_head *head)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(close_head);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	list_for_each_entry_safe(dev, tmp, head, unreg_list) {
		/* Some devices call without registering
		 * for initialization unwind. Remove those
		 * devices and proceed with the remaining.
		 */
		if (dev->reg_state == NETREG_UNINITIALIZED) {
			pr_debug("unregister_netdevice: device %s/%p never was registered\n",
				 dev->name, dev);

			WARN_ON(1);
			list_del(&dev->unreg_list);
			continue;
		}
		dev->dismantle = true;
		BUG_ON(dev->reg_state != NETREG_REGISTERED);
	}

	/* If device is running, close it first. */
	list_for_each_entry(dev, head, unreg_list)
		list_add_tail(&dev->close_list, &close_head);
	dev_close_many(&close_head, true);

	list_for_each_entry(dev, head, unreg_list) {
		/* And unlink it from device chain. */
		unlist_netdevice(dev);

		dev->reg_state = NETREG_UNREGISTERING;
		on_each_cpu(flush_backlog, dev, 1);
	}

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list) {
		struct sk_buff *skb = NULL;

		/* Shutdown queueing discipline. */
		dev_shutdown(dev);

		/* Notify protocols, that we are about to destroy
		 * this device. They should clean all the things.
		 */
		call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

		if (!dev->rtnl_link_ops ||
		    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
			skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
						     GFP_KERNEL);

		/*
		 * Flush the unicast and multicast chains
		 */
		dev_uc_flush(dev);
		dev_mc_flush(dev);

		if (dev->netdev_ops->ndo_uninit)
			dev->netdev_ops->ndo_uninit(dev);

		if (skb)
			rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);

		/* Notifier chain MUST detach us all upper devices. */
		WARN_ON(netdev_has_any_upper_dev(dev));

		/* Remove entries from kobject tree */
		netdev_unregister_kobject(dev);
#ifdef CONFIG_XPS
		/* Remove XPS queueing entries */
		netif_reset_xps_queues_gt(dev, 0);
#endif
	}

	synchronize_net();

	list_for_each_entry(dev, head, unreg_list)
		dev_put(dev);
}

static void rollback_registered(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->unreg_list, &single);
	rollback_registered_many(&single);
	list_del(&single);
}
static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
	struct net_device *upper, netdev_features_t features)
{
	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
	netdev_features_t feature;
	int feature_bit;

	for_each_netdev_feature(&upper_disables, feature_bit) {
		feature = __NETIF_F_BIT(feature_bit);
		if (!(upper->wanted_features & feature)
		    && (features & feature)) {
			netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
				   &feature, upper->name);
			features &= ~feature;
		}
	}

	return features;
}

static void netdev_sync_lower_features(struct net_device *upper,
	struct net_device *lower, netdev_features_t features)
{
	netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
	netdev_features_t feature;
	int feature_bit;

	for_each_netdev_feature(&upper_disables, feature_bit) {
		feature = __NETIF_F_BIT(feature_bit);
		if (!(features & feature) && (lower->features & feature)) {
			netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
				   &feature, lower->name);
			lower->wanted_features &= ~feature;
			netdev_update_features(lower);

			if (unlikely(lower->features & feature))
				netdev_WARN(upper, "failed to disable %pNF on %s!\n",
					    &feature, lower->name);
		}
	}
}

static netdev_features_t netdev_fix_features(struct net_device *dev,
	netdev_features_t features)
{
	/* Fix illegal checksum combinations */
	if ((features & NETIF_F_HW_CSUM) &&
	    (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
		netdev_warn(dev, "mixed HW and IP checksum settings.\n");
		features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
	}

	/* TSO requires that SG is present as well. */
	if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
		features &= ~NETIF_F_ALL_TSO;
	}

	if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
					!(features & NETIF_F_IP_CSUM)) {
		netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO;
		features &= ~NETIF_F_TSO_ECN;
	}

	if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
					 !(features & NETIF_F_IPV6_CSUM)) {
		netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
		features &= ~NETIF_F_TSO6;
	}

	/* TSO ECN requires that TSO is present as well. */
	if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
		features &= ~NETIF_F_TSO_ECN;

	/* Software GSO depends on SG. */
	if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
		netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
		features &= ~NETIF_F_GSO;
	}

	/* UFO needs SG and checksumming */
	if (features & NETIF_F_UFO) {
		/* maybe split UFO into V4 and V6? */
		if (!((features & NETIF_F_GEN_CSUM) ||
		      (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))
			      == (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
			netdev_dbg(dev,
				   "Dropping NETIF_F_UFO since no checksum offload features.\n");
			features &= ~NETIF_F_UFO;
		}

		if (!(features & NETIF_F_SG)) {
			netdev_dbg(dev,
				   "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
			features &= ~NETIF_F_UFO;
		}
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	if (dev->netdev_ops->ndo_busy_poll)
		features |= NETIF_F_BUSY_POLL;
	else
#endif
		features &= ~NETIF_F_BUSY_POLL;

	return features;
}
int __netdev_update_features(struct net_device *dev)
{
	struct net_device *upper, *lower;
	netdev_features_t features;
	struct list_head *iter;
	int err = -1;

	ASSERT_RTNL();

	features = netdev_get_wanted_features(dev);

	if (dev->netdev_ops->ndo_fix_features)
		features = dev->netdev_ops->ndo_fix_features(dev, features);

	/* driver might be less strict about feature dependencies */
	features = netdev_fix_features(dev, features);

	/* some features can't be enabled if they're off on an upper device */
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		features = netdev_sync_upper_features(dev, upper, features);

	if (dev->features == features)
		goto sync_lower;

	netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
		   &dev->features, &features);

	if (dev->netdev_ops->ndo_set_features)
		err = dev->netdev_ops->ndo_set_features(dev, features);
	else
		err = 0;

	if (unlikely(err < 0)) {
		netdev_err(dev,
			   "set_features() failed (%d); wanted %pNF, left %pNF\n",
			   err, &features, &dev->features);
		/* return non-0 since some features might have changed and
		 * it's better to fire a spurious notification than miss it
		 */
		return -1;
	}

sync_lower:
	/* some features must be disabled on lower devices when disabled
	 * on an upper device (think: bonding master or bridge)
	 */
	netdev_for_each_lower_dev(dev, lower, iter)
		netdev_sync_lower_features(dev, lower, features);

	if (!err)
		dev->features = features;

	return err < 0 ? 0 : 1;
}

/**
 * netdev_update_features - recalculate device features
 * @dev: the device to check
 *
 * Recalculate dev->features set and send notifications if it
 * has changed. Should be called after driver or hardware dependent
 * conditions might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	if (__netdev_update_features(dev))
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);

/**
 * netdev_change_features - recalculate device features
 * @dev: the device to check
 *
 * Recalculate dev->features set and send notifications even
 * if they have not changed. Should be called instead of
 * netdev_update_features() if also dev->vlan_features might
 * have changed to allow the changes to be propagated to stacked
 * VLAN devices.
 */
void netdev_change_features(struct net_device *dev)
{
	__netdev_update_features(dev);
	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
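
/* Example (illustrative sketch, not part of this file): how a driver
 * typically takes part in feature negotiation. ndo_fix_features() masks
 * out combinations its hardware cannot do, ndo_set_features() programs the
 * device, and netdev_update_features() is called when external conditions
 * change. All foo-prefixed names (and the MTU/LRO rule) are invented.
 *
 *	static netdev_features_t foo_fix_features(struct net_device *dev,
 *						  netdev_features_t features)
 *	{
 *		if (dev->mtu > 1500)
 *			features &= ~NETIF_F_LRO;	// invented HW limit
 *		return features;
 *	}
 *
 *	static int foo_set_features(struct net_device *dev,
 *				    netdev_features_t features)
 *	{
 *		foo_hw_configure_offloads(dev, features);
 *		return 0;
 *	}
 *
 *	// after something that affects the rule above has changed:
 *	netdev_update_features(dev);
 */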
/**
 * netif_stacked_transfer_operstate - transfer operstate
 * @rootdev: the root or lower level device to transfer state from
 * @dev: the device to transfer operstate to
 *
 * Transfer operational state from root to device. This is normally
 * called when a stacking relationship exists between the root
 * device and the device (a leaf device).
 */
void netif_stacked_transfer_operstate(const struct net_device *rootdev,
				      struct net_device *dev)
{
	if (rootdev->operstate == IF_OPER_DORMANT)
		netif_dormant_on(dev);
	else
		netif_dormant_off(dev);

	if (netif_carrier_ok(rootdev)) {
		if (!netif_carrier_ok(dev))
			netif_carrier_on(dev);
	} else {
		if (netif_carrier_ok(dev))
			netif_carrier_off(dev);
	}
}
EXPORT_SYMBOL(netif_stacked_transfer_operstate);

static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int i, count = dev->num_rx_queues;
	struct netdev_rx_queue *rx;
	size_t sz = count * sizeof(*rx);

	BUG_ON(count < 1);

	rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!rx) {
		rx = vzalloc(sz);
		if (!rx)
			return -ENOMEM;
	}
	dev->_rx = rx;

	for (i = 0; i < count; i++)
		rx[i].dev = dev;
	return 0;
}

static void netdev_init_one_queue(struct net_device *dev,
				  struct netdev_queue *queue, void *_unused)
{
	/* Initialize queue lock */
	spin_lock_init(&queue->_xmit_lock);
	netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
	queue->xmit_lock_owner = -1;
	netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
	queue->dev = dev;
#ifdef CONFIG_BQL
	dql_init(&queue->dql, HZ);
#endif
}

static void netif_free_tx_queues(struct net_device *dev)
{
	kvfree(dev->_tx);
}

static int netif_alloc_netdev_queues(struct net_device *dev)
{
	unsigned int count = dev->num_tx_queues;
	struct netdev_queue *tx;
	size_t sz = count * sizeof(*tx);

	if (count < 1 || count > 0xffff)
		return -EINVAL;

	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!tx) {
		tx = vzalloc(sz);
		if (!tx)
			return -ENOMEM;
	}
	dev->_tx = tx;

	netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
	spin_lock_init(&dev->tx_global_lock);

	return 0;
}
void netif_tx_stop_all_queues(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

		netif_tx_stop_queue(txq);
	}
}
EXPORT_SYMBOL(netif_tx_stop_all_queues);
/**
 * register_netdevice - register a network device
 * @dev: device to register
 *
 * Take a completed network device structure and add it to the kernel
 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 * chain. 0 is returned on success. A negative errno code is returned
 * on a failure to set up the device, or if the name is a duplicate.
 *
 * Callers must hold the rtnl semaphore. You may want
 * register_netdev() instead of this.
 *
 * BUGS:
 * The locking appears insufficient to guarantee two parallel registers
 * will not get the same name.
 */
int register_netdevice(struct net_device *dev)
{
	int ret;
	struct net *net = dev_net(dev);

	BUG_ON(dev_boot_phase);
	ASSERT_RTNL();

	might_sleep();

	/* When net_device's are persistent, this will be fatal. */
	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
	BUG_ON(!net);

	spin_lock_init(&dev->addr_list_lock);
	netdev_set_addr_lockdep_class(dev);

	ret = dev_get_valid_name(net, dev, dev->name);
	if (ret < 0)
		goto out;

	/* Init, if this function is available */
	if (dev->netdev_ops->ndo_init) {
		ret = dev->netdev_ops->ndo_init(dev);
		if (ret) {
			if (ret > 0)
				ret = -EIO;
			goto out;
		}
	}

	if (((dev->hw_features | dev->features) &
	     NETIF_F_HW_VLAN_CTAG_FILTER) &&
	    (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
	     !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
		netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
		ret = -EINVAL;
		goto err_uninit;
	}

	ret = -EBUSY;
	if (!dev->ifindex)
		dev->ifindex = dev_new_index(net);
	else if (__dev_get_by_index(net, dev->ifindex))
		goto err_uninit;

	/* Transfer changeable features to wanted_features and enable
	 * software offloads (GSO and GRO).
	 */
	dev->hw_features |= NETIF_F_SOFT_FEATURES;
	dev->features |= NETIF_F_SOFT_FEATURES;
	dev->wanted_features = dev->features & dev->hw_features;

	if (!(dev->flags & IFF_LOOPBACK)) {
		dev->hw_features |= NETIF_F_NOCACHE_COPY;
	}

	/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
	 */
	dev->vlan_features |= NETIF_F_HIGHDMA;

	/* Make NETIF_F_SG inheritable to tunnel devices.
	 */
	dev->hw_enc_features |= NETIF_F_SG;

	/* Make NETIF_F_SG inheritable to MPLS.
	 */
	dev->mpls_features |= NETIF_F_SG;

	ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		goto err_uninit;

	ret = netdev_register_kobject(dev);
	if (ret)
		goto err_uninit;
	dev->reg_state = NETREG_REGISTERED;

	__netdev_update_features(dev);

	/*
	 * Default initial state at registry is that the
	 * device is present.
	 */

	set_bit(__LINK_STATE_PRESENT, &dev->state);

	linkwatch_init_dev(dev);

	dev_init_scheduler(dev);
	dev_hold(dev);
	list_netdevice(dev);
	add_device_randomness(dev->dev_addr, dev->addr_len);

	/* If the device has permanent device address, driver should
	 * set dev_addr and also addr_assign_type should be set to
	 * NET_ADDR_PERM (default value).
	 */
	if (dev->addr_assign_type == NET_ADDR_PERM)
		memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);

	/* Notify protocols, that a new device appeared. */
	ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
	ret = notifier_to_errno(ret);
	if (ret) {
		rollback_registered(dev);
		dev->reg_state = NETREG_UNREGISTERED;
	}
	/*
	 * Prevent userspace races by waiting until the network
	 * device is fully setup before sending notifications.
	 */
	if (!dev->rtnl_link_ops ||
	    dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
		rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

out:
	return ret;

err_uninit:
	if (dev->netdev_ops->ndo_uninit)
		dev->netdev_ops->ndo_uninit(dev);
	goto out;
}
EXPORT_SYMBOL(register_netdevice);
/**
 * init_dummy_netdev - init a dummy network device for NAPI
 * @dev: device to init
 *
 * This takes a network device structure and initialize the minimum
 * amount of fields so it can be used to schedule NAPI polls without
 * registering a full blown interface. This is to be used by drivers
 * that need to tie several hardware interfaces to a single NAPI
 * poll scheduler due to HW limitations.
 */
int init_dummy_netdev(struct net_device *dev)
{
	/* Clear everything. Note we don't initialize spinlocks
	 * as they aren't supposed to be taken by any of the
	 * NAPI code and this dummy netdev is supposed to be
	 * only ever used for NAPI polls
	 */
	memset(dev, 0, sizeof(struct net_device));

	/* make sure we BUG if trying to hit standard
	 * register/unregister code path
	 */
	dev->reg_state = NETREG_DUMMY;

	/* NAPI wants this */
	INIT_LIST_HEAD(&dev->napi_list);

	/* a dummy interface is started by default */
	set_bit(__LINK_STATE_PRESENT, &dev->state);
	set_bit(__LINK_STATE_START, &dev->state);

	/* Note : We dont allocate pcpu_refcnt for dummy devices,
	 * because users of this 'device' dont need to change
	 * its refcount.
	 */

	return 0;
}
EXPORT_SYMBOL_GPL(init_dummy_netdev);

/**
 * register_netdev - register a network device
 * @dev: device to register
 *
 * Take a completed network device structure and add it to the kernel
 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 * chain. 0 is returned on success. A negative errno code is returned
 * on a failure to set up the device, or if the name is a duplicate.
 *
 * This is a wrapper around register_netdevice that takes the rtnl semaphore
 * and expands the device name if you passed a format string to
 * alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = register_netdevice(dev);
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(register_netdev);
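
/* Example (illustrative sketch, not part of this file): the minimal
 * register/unregister life cycle of an Ethernet device as seen from a
 * hypothetical driver's probe and remove paths. struct foo_priv and
 * foo_netdev_ops are invented names; error handling is abbreviated.
 *
 *	struct net_device *netdev;
 *	int err;
 *
 *	netdev = alloc_etherdev(sizeof(struct foo_priv));
 *	if (!netdev)
 *		return -ENOMEM;
 *
 *	netdev->netdev_ops = &foo_netdev_ops;
 *
 *	err = register_netdev(netdev);	// takes rtnl_lock internally
 *	if (err) {
 *		free_netdev(netdev);
 *		return err;
 *	}
 *	...
 *	unregister_netdev(netdev);
 *	free_netdev(netdev);
 */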
int netdev_refcnt_read(const struct net_device *dev)
{
	int i, refcnt = 0;

	for_each_possible_cpu(i)
		refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
	return refcnt;
}
EXPORT_SYMBOL(netdev_refcnt_read);

/**
 * netdev_wait_allrefs - wait until all references are gone.
 * @dev: target net_device
 *
 * This is called when unregistering network devices.
 *
 * Any protocol or device that holds a reference should register
 * for netdevice notification, and cleanup and put back the
 * reference if they receive an UNREGISTER event.
 * We can get stuck here if buggy protocols don't correctly
 * call dev_put.
 */
static void netdev_wait_allrefs(struct net_device *dev)
{
	unsigned long rebroadcast_time, warning_time;
	int refcnt;

	linkwatch_forget_dev(dev);

	rebroadcast_time = warning_time = jiffies;
	refcnt = netdev_refcnt_read(dev);

	while (refcnt != 0) {
		if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
			rtnl_lock();

			/* Rebroadcast unregister notification */
			call_netdevice_notifiers(NETDEV_UNREGISTER, dev);

			__rtnl_unlock();
			rcu_barrier();
			rtnl_lock();

			call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
			if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
				     &dev->state)) {
				/* We must not have linkwatch events
				 * pending on unregister. If this
				 * happens, we simply run the queue
				 * unscheduled, resulting in a noop
				 * for this device.
				 */
				linkwatch_run_queue();
			}

			__rtnl_unlock();

			rebroadcast_time = jiffies;
		}

		msleep(250);

		refcnt = netdev_refcnt_read(dev);

		if (time_after(jiffies, warning_time + 10 * HZ)) {
			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
				 dev->name, refcnt);
			warning_time = jiffies;
		}
	}
}
/* The sequence is:
 *
 *	rtnl_lock();
 *	...
 *	register_netdevice(x1);
 *	register_netdevice(x2);
 *	...
 *	unregister_netdevice(y1);
 *	unregister_netdevice(y2);
 *	...
 *	rtnl_unlock();
 *	free_netdev(y1);
 *	free_netdev(y2);
 *
 * We are invoked by rtnl_unlock().
 * This allows us to deal with problems:
 * 1) We can delete sysfs objects which invoke hotplug
 *    without deadlocking with linkwatch via keventd.
 * 2) Since we run with the RTNL semaphore not held, we can sleep
 *    safely in order to wait for the netdev refcnt to drop to zero.
 *
 * We must not return until all unregister events added during
 * the interval the lock was held have been completed.
 */
void netdev_run_todo(void)
{
	struct list_head list;

	/* Snapshot list, allow later requests */
	list_replace_init(&net_todo_list, &list);

	__rtnl_unlock();

	/* Wait for rcu callbacks to finish before next phase */
	if (!list_empty(&list))
		rcu_barrier();

	while (!list_empty(&list)) {
		struct net_device *dev
			= list_first_entry(&list, struct net_device, todo_list);
		list_del(&dev->todo_list);

		rtnl_lock();
		call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
		__rtnl_unlock();

		if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
			pr_err("network todo '%s' but state %d\n",
			       dev->name, dev->reg_state);
			dump_stack();
			continue;
		}

		dev->reg_state = NETREG_UNREGISTERED;

		netdev_wait_allrefs(dev);

		/* paranoia */
		BUG_ON(netdev_refcnt_read(dev));
		BUG_ON(!list_empty(&dev->ptype_all));
		BUG_ON(!list_empty(&dev->ptype_specific));
		WARN_ON(rcu_access_pointer(dev->ip_ptr));
		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
		WARN_ON(dev->dn_ptr);

		if (dev->destructor)
			dev->destructor(dev);

		/* Report a network device has been unregistered */
		rtnl_lock();
		dev_net(dev)->dev_unreg_count--;
		__rtnl_unlock();
		wake_up(&netdev_unregistering_wq);

		/* Free network device */
		kobject_put(&dev->dev.kobj);
	}
}
/* Convert net_device_stats to rtnl_link_stats64. They have the same
 * fields in the same order, with only the type differing.
 */
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
			     const struct net_device_stats *netdev_stats)
{
#if BITS_PER_LONG == 64
	BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
	memcpy(stats64, netdev_stats, sizeof(*stats64));
#else
	size_t i, n = sizeof(*stats64) / sizeof(u64);
	const unsigned long *src = (const unsigned long *)netdev_stats;
	u64 *dst = (u64 *)stats64;

	BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
		     sizeof(*stats64) / sizeof(u64));
	for (i = 0; i < n; i++)
		dst[i] = src[i];
#endif
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
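
/* Illustrative sketch (not part of this file): a driver that only maintains
 * the legacy counters in dev->stats (or a private net_device_stats copy)
 * can still report 64-bit statistics by converting them; the callback name
 * below is hypothetical.  Note that dev_get_stats() already performs this
 * conversion automatically when a driver provides no stats callback at all.
 *
 *	static struct rtnl_link_stats64 *
 *	foo_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s)
 *	{
 *		netdev_stats_to_stats64(s, &dev->stats);
 *		return s;
 *	}
 */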

/**
 *	dev_get_stats	- get network device statistics
 *	@dev: device to get statistics from
 *	@storage: place to store stats
 *
 *	Get network statistics from device. Return @storage.
 *	The device driver may provide its own method by setting
 *	dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
 *	otherwise the internal statistics structure is used.
 */
struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
					struct rtnl_link_stats64 *storage)
{
	const struct net_device_ops *ops = dev->netdev_ops;

	if (ops->ndo_get_stats64) {
		memset(storage, 0, sizeof(*storage));
		ops->ndo_get_stats64(dev, storage);
	} else if (ops->ndo_get_stats) {
		netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
	} else {
		netdev_stats_to_stats64(storage, &dev->stats);
	}
	storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
	storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
	return storage;
}
EXPORT_SYMBOL(dev_get_stats);
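
/* Illustrative sketch (not part of this file): reading a device's counters
 * from other kernel code.  dev_get_stats() fills the caller-provided
 * structure, so a stack variable is enough; "dev" here is assumed to be a
 * valid, held net_device:
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_info("%s: %llu rx packets\n", dev->name,
 *		(unsigned long long)stats.rx_packets);
 */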

struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (queue)
		return queue;
	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue)
		return NULL;
	netdev_init_one_queue(dev, queue, NULL);
	RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
	queue->qdisc_sleeping = &noop_qdisc;
	rcu_assign_pointer(dev->ingress_queue, queue);
#endif
	return queue;
}

static const struct ethtool_ops default_ethtool_ops;

void netdev_set_default_ethtool_ops(struct net_device *dev,
				    const struct ethtool_ops *ops)
{
	if (dev->ethtool_ops == &default_ethtool_ops)
		dev->ethtool_ops = ops;
}
EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
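
/* Illustrative sketch (not part of this file): a subsystem that registers
 * devices on behalf of its drivers can install fallback ethtool ops without
 * clobbering a driver that already set its own; the names below are
 * hypothetical.
 *
 *	static const struct ethtool_ops foo_default_ethtool_ops = {
 *		.get_link	= ethtool_op_get_link,
 *	};
 *	...
 *	netdev_set_default_ethtool_ops(dev, &foo_default_ethtool_ops);
 */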

void netdev_freemem(struct net_device *dev)
{
	char *addr = (char *)dev - dev->padded;

	kvfree(addr);
}

/**
 *	alloc_netdev_mqs - allocate network device
 *	@sizeof_priv: size of private data to allocate space for
 *	@name: device name format string
 *	@name_assign_type: origin of device name
 *	@setup: callback to initialize device
 *	@txqs: the number of TX subqueues to allocate
 *	@rxqs: the number of RX subqueues to allocate
 *
 *	Allocates a struct net_device with private data area for driver use
 *	and performs basic initialization.  Also allocates subqueue structs
 *	for each queue on the device.
 */
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
		unsigned char name_assign_type,
		void (*setup)(struct net_device *),
		unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;
	size_t alloc_size;
	struct net_device *p;

	BUG_ON(strlen(name) >= sizeof(dev->name));

	if (txqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
		return NULL;
	}

#ifdef CONFIG_SYSFS
	if (rxqs < 1) {
		pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
		return NULL;
	}
#endif

	alloc_size = sizeof(struct net_device);
	if (sizeof_priv) {
		/* ensure 32-byte alignment of private area */
		alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
		alloc_size += sizeof_priv;
	}
	/* ensure 32-byte alignment of whole construct */
	alloc_size += NETDEV_ALIGN - 1;

	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!p)
		p = vzalloc(alloc_size);
	if (!p)
		return NULL;

	dev = PTR_ALIGN(p, NETDEV_ALIGN);
	dev->padded = (char *)dev - (char *)p;

	dev->pcpu_refcnt = alloc_percpu(int);
	if (!dev->pcpu_refcnt)
		goto free_dev;

	if (dev_addr_init(dev))
		goto free_pcpu;

	dev_mc_init(dev);
	dev_uc_init(dev);

	dev_net_set(dev, &init_net);

	dev->gso_max_size = GSO_MAX_SIZE;
	dev->gso_max_segs = GSO_MAX_SEGS;
	dev->gso_min_segs = 0;

	INIT_LIST_HEAD(&dev->napi_list);
	INIT_LIST_HEAD(&dev->unreg_list);
	INIT_LIST_HEAD(&dev->close_list);
	INIT_LIST_HEAD(&dev->link_watch_list);
	INIT_LIST_HEAD(&dev->adj_list.upper);
	INIT_LIST_HEAD(&dev->adj_list.lower);
	INIT_LIST_HEAD(&dev->all_adj_list.upper);
	INIT_LIST_HEAD(&dev->all_adj_list.lower);
	INIT_LIST_HEAD(&dev->ptype_all);
	INIT_LIST_HEAD(&dev->ptype_specific);
	dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
	setup(dev);

	if (!dev->tx_queue_len) {
		dev->priv_flags |= IFF_NO_QUEUE;
		dev->tx_queue_len = 1;
	}

	dev->num_tx_queues = txqs;
	dev->real_num_tx_queues = txqs;
	if (netif_alloc_netdev_queues(dev))
		goto free_all;

#ifdef CONFIG_SYSFS
	dev->num_rx_queues = rxqs;
	dev->real_num_rx_queues = rxqs;
	if (netif_alloc_rx_queues(dev))
		goto free_all;
#endif

	strcpy(dev->name, name);
	dev->name_assign_type = name_assign_type;
	dev->group = INIT_NETDEV_GROUP;
	if (!dev->ethtool_ops)
		dev->ethtool_ops = &default_ethtool_ops;

	nf_hook_ingress_init(dev);

	return dev;

free_all:
	free_netdev(dev);
	return NULL;

free_pcpu:
	free_percpu(dev->pcpu_refcnt);
free_dev:
	netdev_freemem(dev);
	return NULL;
}
EXPORT_SYMBOL(alloc_netdev_mqs);
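
/* Illustrative sketch (not part of this file): a multiqueue Ethernet driver
 * (struct foo_priv and the queue counts are hypothetical) would typically
 * allocate its device with ether_setup() as the init callback:
 *
 *	struct net_device *dev;
 *
 *	dev = alloc_netdev_mqs(sizeof(struct foo_priv), "foo%d",
 *			       NET_NAME_UNKNOWN, ether_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 *
 * Single-queue drivers usually go through the alloc_netdev() or
 * alloc_etherdev() wrappers instead.
 */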

/**
 *	free_netdev - free network device
 *	@dev: device
 *
 *	This function does the last stage of destroying an allocated device
 *	interface. The reference to the device object is released.
 *	If this is the last reference then it will be freed.
 */
void free_netdev(struct net_device *dev)
{
	struct napi_struct *p, *n;

	netif_free_tx_queues(dev);
#ifdef CONFIG_SYSFS
	kvfree(dev->_rx);
#endif

	kfree(rcu_dereference_protected(dev->ingress_queue, 1));

	/* Flush device addresses */
	dev_addr_flush(dev);

	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
		netif_napi_del(p);

	free_percpu(dev->pcpu_refcnt);
	dev->pcpu_refcnt = NULL;

	/* Compatibility with error handling in drivers */
	if (dev->reg_state == NETREG_UNINITIALIZED) {
		netdev_freemem(dev);
		return;
	}

	BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
	dev->reg_state = NETREG_RELEASED;

	/* will free via device release */
	put_device(&dev->dev);
}
EXPORT_SYMBOL(free_netdev);
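
/* Illustrative sketch (not part of this file): the usual driver error path.
 * A device that failed registration (or was never registered) is released
 * directly with free_netdev():
 *
 *	err = register_netdev(dev);
 *	if (err) {
 *		free_netdev(dev);
 *		return err;
 *	}
 */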

/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 */
void synchronize_net(void)
{
	might_sleep();
	if (rtnl_is_locked())
		synchronize_rcu_expedited();
	else
		synchronize_rcu();
}
EXPORT_SYMBOL(synchronize_net);
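
/* Illustrative sketch (not part of this file): a common pattern is to
 * unpublish an RCU-protected pointer and then call synchronize_net()
 * before freeing the old object, so no receive path can still be walking
 * it ("some_cfg" and "old_cfg" are hypothetical names):
 *
 *	old_cfg = rtnl_dereference(dev->some_cfg);
 *	RCU_INIT_POINTER(dev->some_cfg, NULL);
 *	synchronize_net();
 *	kfree(old_cfg);
 */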

/**
 *	unregister_netdevice_queue - remove device from the kernel
 *	@dev: device
 *	@head: list
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *	If head not NULL, device is queued to be unregistered later.
 *
 *	Callers must hold the rtnl semaphore.  You may want
 *	unregister_netdev() instead of this.
 */
void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
{
	ASSERT_RTNL();

	if (head) {
		list_move_tail(&dev->unreg_list, head);
	} else {
		rollback_registered(dev);
		/* Finish processing unregister after unlock */
		net_set_todo(dev);
	}
}
EXPORT_SYMBOL(unregister_netdevice_queue);

/**
 *	unregister_netdevice_many - unregister many devices
 *	@head: list of devices
 *
 *	Note: As most callers use a stack allocated list_head,
 *	we force a list_del() to make sure the stack won't be corrupted later.
 */
void unregister_netdevice_many(struct list_head *head)
{
	struct net_device *dev;

	if (!list_empty(head)) {
		rollback_registered_many(head);
		list_for_each_entry(dev, head, unreg_list)
			net_set_todo(dev);
		list_del(head);
	}
}
EXPORT_SYMBOL(unregister_netdevice_many);
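
/* Illustrative sketch (not part of this file): batching several deletions
 * in one RTNL section with a stack-allocated list head, which is the
 * pattern the note above refers to:
 *
 *	LIST_HEAD(kill_list);
 *
 *	rtnl_lock();
 *	unregister_netdevice_queue(dev1, &kill_list);
 *	unregister_netdevice_queue(dev2, &kill_list);
 *	unregister_netdevice_many(&kill_list);
 *	rtnl_unlock();
 */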

/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
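
/* Illustrative sketch (not part of this file): typical device removal, e.g.
 * from a driver's (hypothetical) foo_remove() callback.  unregister_netdev()
 * takes the RTNL lock itself, so it must not already be held here:
 *
 *	unregister_netdev(dev);
 *	free_netdev(dev);
 */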

/**
 *	dev_change_net_namespace - move device to a different network namespace
 *	@dev: device
 *	@net: network namespace
 *	@pat: If not NULL name pattern to try if the current device name
 *	      is already taken in the destination network namespace.
 *
 *	This function shuts down a device interface and moves it
 *	to a new network namespace. On success 0 is returned, on
 *	a failure a negative errno code is returned.
 *
 *	Callers must hold the rtnl semaphore.
 */
int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
{
	int err;

	ASSERT_RTNL();

	/* Don't allow namespace local devices to be moved. */
	err = -EINVAL;
	if (dev->features & NETIF_F_NETNS_LOCAL)
		goto out;

	/* Ensure the device has been registered */
	if (dev->reg_state != NETREG_REGISTERED)
		goto out;

	/* Get out if there is nothing to do */
	err = 0;
	if (net_eq(dev_net(dev), net))
		goto out;

	/* Pick the destination device name, and ensure
	 * we can use it in the destination network namespace.
	 */
	err = -EEXIST;
	if (__dev_get_by_name(net, dev->name)) {
		/* We get here if we can't use the current device name */
		if (!pat)
			goto out;
		err = dev_get_valid_name(net, dev, pat);
		if (err < 0)
			goto out;
	}

	/*
	 * And now a mini version of register_netdevice and unregister_netdevice.
	 */

	/* If device is running close it first. */
	dev_close(dev);

	/* And unlink it from device chain */
	err = 0;
	unlist_netdevice(dev);

	synchronize_net();

	/* Shutdown queueing discipline. */
	dev_shutdown(dev);

	/* Notify protocols, that we are about to destroy
	   this device. They should clean all the things.

	   Note that dev->reg_state stays at NETREG_REGISTERED.
	   This is wanted because this way 8021q and macvlan know
	   the device is just moving and can keep their slaves up.
	*/
	call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
	rcu_barrier();
	call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
	rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);

	/*
	 *	Flush the unicast and multicast chains
	 */
	dev_uc_flush(dev);
	dev_mc_flush(dev);

	/* Send a netdev-removed uevent to the old namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
	netdev_adjacent_del_links(dev);

	/* Actually switch the network namespace */
	dev_net_set(dev, net);

	/* If there is an ifindex conflict assign a new one */
	if (__dev_get_by_index(net, dev->ifindex))
		dev->ifindex = dev_new_index(net);

	/* Send a netdev-add uevent to the new namespace */
	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
	netdev_adjacent_add_links(dev);

	/* Fixup kobjects */
	err = device_rename(&dev->dev, dev->name);
	WARN_ON(err);

	/* Add the device back in the hashes */
	list_netdevice(dev);

	/* Notify protocols, that a new device appeared. */
	call_netdevice_notifiers(NETDEV_REGISTER, dev);

	/*
	 *	Prevent userspace races by waiting until the network
	 *	device is fully setup before sending notifications.
	 */
	rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);

	synchronize_net();
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(dev_change_net_namespace);
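
/* Illustrative sketch (not part of this file): moving a device into another
 * namespace under RTNL; "target_net" is a hypothetical struct net the
 * caller already holds.  The "eth%d" pattern is only used if the current
 * name is already taken in the destination namespace:
 *
 *	rtnl_lock();
 *	err = dev_change_net_namespace(dev, target_net, "eth%d");
 *	rtnl_unlock();
 */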

static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	struct sk_buff **list_skb;
	struct sk_buff *skb;
	unsigned int cpu, oldcpu = (unsigned long)ocpu;
	struct softnet_data *sd, *oldsd;

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	local_irq_disable();
	cpu = smp_processor_id();
	sd = &per_cpu(softnet_data, cpu);
	oldsd = &per_cpu(softnet_data, oldcpu);

	/* Find end of our completion_queue. */
	list_skb = &sd->completion_queue;
	while (*list_skb)
		list_skb = &(*list_skb)->next;
	/* Append completion queue from offline CPU. */
	*list_skb = oldsd->completion_queue;
	oldsd->completion_queue = NULL;

	/* Append output queue from offline CPU. */
	if (oldsd->output_queue) {
		*sd->output_queue_tailp = oldsd->output_queue;
		sd->output_queue_tailp = oldsd->output_queue_tailp;
		oldsd->output_queue = NULL;
		oldsd->output_queue_tailp = &oldsd->output_queue;
	}
	/* Append NAPI poll list from offline CPU, with one exception:
	 * process_backlog() must be called by cpu owning percpu backlog.
	 * We properly handle process_queue & input_pkt_queue later.
	 */
	while (!list_empty(&oldsd->poll_list)) {
		struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
							    struct napi_struct,
							    poll_list);

		list_del_init(&napi->poll_list);
		if (napi->poll == process_backlog)
			napi->state = 0;
		else
			____napi_schedule(sd, napi);
	}

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&oldsd->process_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}
	while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
		netif_rx_ni(skb);
		input_queue_head_incr(oldsd);
	}

	return NOTIFY_OK;
}

/**
 *	netdev_increment_features - increment feature set by one
 *	@all: current feature set
 *	@one: new feature set
 *	@mask: mask feature set
 *
 *	Computes a new feature set after adding a device with feature set
 *	@one to the master device with current feature set @all.  Will not
 *	enable anything that is off in @mask. Returns the new feature set.
 */
netdev_features_t netdev_increment_features(netdev_features_t all,
	netdev_features_t one, netdev_features_t mask)
{
	if (mask & NETIF_F_GEN_CSUM)
		mask |= NETIF_F_ALL_CSUM;
	mask |= NETIF_F_VLAN_CHALLENGED;

	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
	all &= one | ~NETIF_F_ALL_FOR_ALL;

	/* If one device supports hw checksumming, set for all. */
	if (all & NETIF_F_GEN_CSUM)
		all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);

	return all;
}
EXPORT_SYMBOL(netdev_increment_features);
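
/* Illustrative sketch (not part of this file): how a master device such as
 * a bridge or bond might fold its slaves' feature sets together (the loop
 * and field names below are hypothetical):
 *
 *	netdev_features_t mask = master->features;
 *	netdev_features_t features = master->features & ~NETIF_F_ONE_FOR_ALL;
 *
 *	list_for_each_entry(slave, &priv->slave_list, list)
 *		features = netdev_increment_features(features,
 *						     slave->dev->features,
 *						     mask);
 */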

static struct hlist_head * __net_init netdev_create_hash(void)
{
	int i;
	struct hlist_head *hash;

	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
	if (hash != NULL)
		for (i = 0; i < NETDEV_HASHENTRIES; i++)
			INIT_HLIST_HEAD(&hash[i]);

	return hash;
}

/* Initialize per network namespace state */
static int __net_init netdev_init(struct net *net)
{
	if (net != &init_net)
		INIT_LIST_HEAD(&net->dev_base_head);

	net->dev_name_head = netdev_create_hash();
	if (net->dev_name_head == NULL)
		goto err_name;

	net->dev_index_head = netdev_create_hash();
	if (net->dev_index_head == NULL)
		goto err_idx;

	return 0;

err_idx:
	kfree(net->dev_name_head);
err_name:
	return -ENOMEM;
}

/**
 *	netdev_drivername - network driver for the device
 *	@dev: network device
 *
 *	Determine network driver for device.
 */
const char *netdev_drivername(const struct net_device *dev)
{
	const struct device_driver *driver;
	const struct device *parent;
	const char *empty = "";

	parent = dev->dev.parent;
	if (!parent)
		return empty;

	driver = parent->driver;
	if (driver && driver->name)
		return driver->name;
	return empty;
}

static void __netdev_printk(const char *level, const struct net_device *dev,
			    struct va_format *vaf)
{
	if (dev && dev->dev.parent) {
		dev_printk_emit(level[1] - '0',
				dev->dev.parent,
				"%s %s %s%s: %pV",
				dev_driver_string(dev->dev.parent),
				dev_name(dev->dev.parent),
				netdev_name(dev), netdev_reg_state(dev),
				vaf);
	} else if (dev) {
		printk("%s%s%s: %pV",
		       level, netdev_name(dev), netdev_reg_state(dev), vaf);
	} else {
		printk("%s(NULL net_device): %pV", level, vaf);
	}
}

void netdev_printk(const char *level, const struct net_device *dev,
		   const char *format, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, format);

	vaf.fmt = format;
	vaf.va = &args;

	__netdev_printk(level, dev, &vaf);

	va_end(args);
}
EXPORT_SYMBOL(netdev_printk);

#define define_netdev_printk_level(func, level)			\
void func(const struct net_device *dev, const char *fmt, ...)	\
{								\
	struct va_format vaf;					\
	va_list args;						\
								\
	va_start(args, fmt);					\
								\
	vaf.fmt = fmt;						\
	vaf.va = &args;						\
								\
	__netdev_printk(level, dev, &vaf);			\
								\
	va_end(args);						\
}								\
EXPORT_SYMBOL(func);

define_netdev_printk_level(netdev_emerg, KERN_EMERG);
define_netdev_printk_level(netdev_alert, KERN_ALERT);
define_netdev_printk_level(netdev_crit, KERN_CRIT);
define_netdev_printk_level(netdev_err, KERN_ERR);
define_netdev_printk_level(netdev_warn, KERN_WARNING);
define_netdev_printk_level(netdev_notice, KERN_NOTICE);
define_netdev_printk_level(netdev_info, KERN_INFO);
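
/* Illustrative sketch (not part of this file): drivers use the generated
 * helpers instead of raw printk() so that messages carry the driver, bus
 * and device names plus the registration state, e.g.:
 *
 *	netdev_info(dev, "link up, %u Mbps\n", speed);
 *	netdev_err(dev, "failed to map DMA buffer\n");
 *
 * ("speed" is a hypothetical local variable.)
 */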

static void __net_exit netdev_exit(struct net *net)
{
	kfree(net->dev_name_head);
	kfree(net->dev_index_head);
}

static struct pernet_operations __net_initdata netdev_net_ops = {
	.init = netdev_init,
	.exit = netdev_exit,
};

static void __net_exit default_device_exit(struct net *net)
{
	struct net_device *dev, *aux;
	/*
	 * Push all migratable network devices back to the
	 * initial network namespace
	 */
	rtnl_lock();
	for_each_netdev_safe(net, dev, aux) {
		int err;
		char fb_name[IFNAMSIZ];

		/* Ignore unmoveable devices (i.e. loopback) */
		if (dev->features & NETIF_F_NETNS_LOCAL)
			continue;

		/* Leave virtual devices for the generic cleanup */
		if (dev->rtnl_link_ops)
			continue;

		/* Push remaining network devices to init_net */
		snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
		err = dev_change_net_namespace(dev, &init_net, fb_name);
		if (err) {
			pr_emerg("%s: failed to move %s to init_net: %d\n",
				 __func__, dev->name, err);
			BUG();
		}
	}
	rtnl_unlock();
}

static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
{
	/* Return with the rtnl_lock held when there are no network
	 * devices unregistering in any network namespace in net_list.
	 */
	struct net *net;
	bool unregistering;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(&netdev_unregistering_wq, &wait);
	for (;;) {
		unregistering = false;
		rtnl_lock();
		list_for_each_entry(net, net_list, exit_list) {
			if (net->dev_unreg_count > 0) {
				unregistering = true;
				break;
			}
		}
		if (!unregistering)
			break;
		__rtnl_unlock();

		wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&netdev_unregistering_wq, &wait);
}

static void __net_exit default_device_exit_batch(struct list_head *net_list)
{
	/* At exit all network devices must be removed from a network
	 * namespace.  Do this in the reverse order of registration.
	 * Do this across as many network namespaces as possible to
	 * improve batching efficiency.
	 */
	struct net_device *dev;
	struct net *net;
	LIST_HEAD(dev_kill_list);

	/* To prevent network device cleanup code from dereferencing
	 * loopback devices or network devices that have been freed,
	 * wait here for all pending unregistrations to complete
	 * before unregistering the loopback device and allowing the
	 * network namespace to be freed.
	 *
	 * The netdev todo list containing all network device
	 * unregistrations that happen in default_device_exit_batch
	 * will run in the rtnl_unlock() at the end of
	 * default_device_exit_batch.
	 */
	rtnl_lock_unregistering(net_list);
	list_for_each_entry(net, net_list, exit_list) {
		for_each_netdev_reverse(net, dev) {
			if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
				dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
			else
				unregister_netdevice_queue(dev, &dev_kill_list);
		}
	}
	unregister_netdevice_many(&dev_kill_list);
	rtnl_unlock();
}

static struct pernet_operations __net_initdata default_device_ops = {
	.exit = default_device_exit,
	.exit_batch = default_device_exit_batch,
};

/*
 *	Initialize the DEV module. At boot time this walks the device list and
 *	unhooks any devices that fail to initialise (normally hardware not
 *	present) and leaves us with a valid list of present and active devices.
 *
 */

/*
 *	This is called single threaded during boot, so no need
 *	to take the rtnl semaphore.
 */
static int __init net_dev_init(void)
{
	int i, rc = -ENOMEM;

	BUG_ON(!dev_boot_phase);

	if (dev_proc_init())
		goto out;

	if (netdev_kobject_init())
		goto out;

	INIT_LIST_HEAD(&ptype_all);
	for (i = 0; i < PTYPE_HASH_SIZE; i++)
		INIT_LIST_HEAD(&ptype_base[i]);

	INIT_LIST_HEAD(&offload_base);

	if (register_pernet_subsys(&netdev_net_ops))
		goto out;

	/*
	 *	Initialise the packet receive queues.
	 */

	for_each_possible_cpu(i) {
		struct softnet_data *sd = &per_cpu(softnet_data, i);

		skb_queue_head_init(&sd->input_pkt_queue);
		skb_queue_head_init(&sd->process_queue);
		INIT_LIST_HEAD(&sd->poll_list);
		sd->output_queue_tailp = &sd->output_queue;
#ifdef CONFIG_RPS
		sd->csd.func = rps_trigger_softirq;
		sd->csd.info = sd;
		sd->cpu = i;
#endif

		sd->backlog.poll = process_backlog;
		sd->backlog.weight = weight_p;
	}

	dev_boot_phase = 0;

	/* The loopback device is special: if any other network device
	 * is present in a network namespace, the loopback device must
	 * be present. Since we now dynamically allocate and free the
	 * loopback device, ensure this invariant is maintained by
	 * keeping the loopback device as the first device on the
	 * list of network devices, so that it is the first device
	 * that appears and the last network device that disappears.
	 */
	if (register_pernet_device(&loopback_net_ops))
		goto out;

	if (register_pernet_device(&default_device_ops))
		goto out;

	open_softirq(NET_TX_SOFTIRQ, net_tx_action);
	open_softirq(NET_RX_SOFTIRQ, net_rx_action);

	hotcpu_notifier(dev_cpu_callback, 0);
	dst_subsys_init();
	rc = 0;
out:
	return rc;
}

subsys_initcall(net_dev_init);